Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * xdf.c - Xen Virtual Block Device Driver
     29  * TODO:
     30  *	- support alternate block size (currently only DEV_BSIZE supported)
     31  *	- revalidate geometry for removable devices
     32  *
     33  * This driver exports Solaris disk device nodes, accepts IO requests from
     34  * those nodes, and services those requests by talking to a backend device
     35  * in another domain.
     36  *
     37  * Communication with the backend device is done via a ringbuffer (which is
     38  * managed via xvdi interfaces) and dma memory (which is managed via ddi
     39  * interfaces).
     40  *
     41  * Communication with the backend device is dependent upon establishing a
     42  * connection to the backend device.  This connection process involves
     43  * reading device configuration information from xenbus and publishing
     44  * some frontend runtime configuration parameters via the xenbus (for
     45  * consumption by the backend).  Once we've published runtime configuration
     46  * information via the xenbus, the backend device can enter the connected
     47  * state and we'll enter the XD_CONNECTED state.  But before we can allow
     48  * random IO to begin, we need to do IO to the backend device to determine
     49  * the device label and if flush operations are supported.  Once this is
     50  * done we enter the XD_READY state and can process any IO operations.
     51  *
     52  * We receive notifications of xenbus state changes for the backend device
     53  * (aka, the "other end") via the xdf_oe_change() callback.  This callback
     54  * is single threaded, meaning that we can't receive new notification of
     55  * other end state changes while we're processing an outstanding
     56  * notification of an other end state change.  Therefore we can't do any
     57  * blocking operations from the xdf_oe_change() callback.  This is why we
     58  * have a separate taskq (xdf_ready_tq) which exists to do the necessary
     59  * IO to get us from the XD_CONNECTED to the XD_READY state.  All IO
     60  * generated by the xdf_ready_tq thread (xdf_ready_tq_thread) will go
     61  * through xdf_lb_rdwr(), which is a synchronous IO interface.  IOs
     62  * generated by the xdf_ready_tq_thread thread have priority over all
     63  * other IO requests.
     64  *
     65  * We also communicate with the backend device via the xenbus "media-req"
     66  * (XBP_MEDIA_REQ) property.  For more information on this see the
     67  * comments in blkif.h.
     68  */
     69 
     70 #include <io/xdf.h>
     71 
     72 #include <sys/conf.h>
     73 #include <sys/dkio.h>
     74 #include <sys/promif.h>
     75 #include <sys/sysmacros.h>
     76 #include <sys/kstat.h>
     77 #include <sys/mach_mmu.h>
     78 #ifdef XPV_HVM_DRIVER
     79 #include <sys/xpv_support.h>
     80 #include <sys/sunndi.h>
     81 #else /* !XPV_HVM_DRIVER */
     82 #include <sys/evtchn_impl.h>
     83 #endif /* !XPV_HVM_DRIVER */
     84 #include <public/io/xenbus.h>
     85 #include <xen/sys/xenbus_impl.h>
     86 #include <sys/scsi/generic/inquiry.h>
     87 #include <xen/io/blkif_impl.h>
     88 #include <sys/fdio.h>
     89 #include <sys/cdio.h>
     90 
/*
 * DEBUG_EVAL can be used to include debug only statements without
 * having to use '#ifdef DEBUG' statements
 *
 * In DEBUG builds the argument is evaluated as written.  In non-DEBUG
 * builds the macro expands to nothing, so the argument (including any
 * side effects it has) is dropped entirely.
 */
#ifdef DEBUG
#define	DEBUG_EVAL(x)	(x)
#else /* !DEBUG */
#define	DEBUG_EVAL(x)
#endif /* !DEBUG */
    100 
/*
 * Drain polling parameters.  Per the annotations, 200 retries of 0.05 sec
 * bound a drain at 10 sec.
 * NOTE(review): despite the MSEC in its name, XDF_DRAIN_MSEC_DELAY
 * (50*1000) only equals 0.05 sec when interpreted as microseconds --
 * confirm the unit expected by its consumer.
 */
#define	XDF_DRAIN_MSEC_DELAY		(50*1000)	/* 00.05 sec */
#define	XDF_DRAIN_RETRY_COUNT		200		/* 10.00 sec */

#define	INVALID_DOMID	((domid_t)-1)

/* values stored in v_req_t.v_flush_diskcache by vreq_setup() */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */

/*
 * Which flush mechanism is in use.  Both require xdf_feature_barrier; a
 * write barrier is selected when xdf_flush_supported is clear and a disk
 * cache flush when it is set.
 */
#define	USE_WRITE_BARRIER(vdp)						\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
#define	USE_FLUSH_DISKCACHE(vdp)					\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)

/*
 * A buf is a write barrier when it is a write, write barriers are in use
 * and its data pointer is the device's xdf_cache_flush_block buffer.
 */
#define	IS_WRITE_BARRIER(vdp, bp)					\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))

/*
 * A buf is a cache flush request when it is a zero-length write and disk
 * cache flushes are in use.
 * NOTE: this macro takes only 'bp' but expands USE_FLUSH_DISKCACHE(vdp),
 * so a variable named 'vdp' must be in scope wherever it is used.
 */
#define	IS_FLUSH_DISKCACHE(bp)						\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))

/*
 * True when the vreq is in the VREQ_DMAWIN_DONE state and either it is a
 * FLUSH_DISKCACHE request or its current DMA window (v_dmaw) is the last
 * of its v_ndmaws windows.
 */
#define	VREQ_DONE(vreq)							\
	VOID2BOOLEAN(((vreq)->v_status == VREQ_DMAWIN_DONE) &&		\
	    (((vreq)->v_flush_diskcache == FLUSH_DISKCACHE) ||		\
	    (((vreq)->v_dmaw + 1) == (vreq)->v_ndmaws)))

/* the v_req_t belonging to a buf is kept in the buf's av_back pointer */
#define	BP_VREQ(bp)		((v_req_t *)((bp)->av_back))
#define	BP_VREQ_SET(bp, vreq)	(((bp)->av_back = (buf_t *)(vreq)))
    125 
extern int		do_polled_io;

/* run-time tunables that we don't want the compiler to optimize away */
volatile int		xdf_debug = 0;
volatile boolean_t	xdf_barrier_flush_disable = B_FALSE;

/*
 * per module globals
 *
 * xdf_vreq_cache backs v_req_t allocations (vreq_get()/vreq_free()) and
 * xdf_gs_cache backs ge_slot_t allocations (gs_get()/gs_free()).
 * xdf_flush_block is the block number check_fbwrite() watches for
 * re-writes of the flush block.
 */
major_t			xdf_major;
static void		*xdf_ssp;
static kmem_cache_t	*xdf_vreq_cache;
static kmem_cache_t	*xdf_gs_cache;
static int		xdf_maxphys = XB_MAXPHYS;
static diskaddr_t	xdf_flush_block = DEFAULT_FLUSH_BLOCK;
static int		xdf_fbrewrites;	/* flush block re-write count */

/* misc public functions (used by xdf_shell.c) */
int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

/*  misc private functions */
static void xdf_io_start(xdf_t *);

/* callbacks from common label (handed to cmlb_attach() in xdf_cmlb_attach) */
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,
	xdf_lb_rdwr,
	xdf_lb_getinfo
};
    154 
/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 *
 * These are the attributes vreq_setup() uses for the transfer DMA handle.
 * A modified copy (larger segment boundary and s/g list length) is used
 * there for the bounce buffer handle of unaligned transfers.
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};

/*
 * Device access attributes passed to ddi_dma_mem_alloc() when
 * vreq_setup() allocates the bounce buffer for an unaligned transfer.
 */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
    179 
    180 static void
    181 xdf_timeout_handler(void *arg)
    182 {
    183 	xdf_t *vdp = arg;
    184 
    185 	mutex_enter(&vdp->xdf_dev_lk);
    186 	vdp->xdf_timeout_id = 0;
    187 	mutex_exit(&vdp->xdf_dev_lk);
    188 
    189 	/* new timeout thread could be re-scheduled */
    190 	xdf_io_start(vdp);
    191 }
    192 
    193 /*
    194  * callback func when DMA/GTE resources is available
    195  *
    196  * Note: we only register one callback function to grant table subsystem
    197  * since we only have one 'struct gnttab_free_callback' in xdf_t.
    198  */
    199 static int
    200 xdf_dmacallback(caddr_t arg)
    201 {
    202 	xdf_t *vdp = (xdf_t *)arg;
    203 	ASSERT(vdp != NULL);
    204 
    205 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
    206 	    vdp->xdf_addr));
    207 
    208 	ddi_trigger_softintr(vdp->xdf_softintr_id);
    209 	return (DDI_DMA_CALLBACK_DONE);
    210 }
    211 
    212 static ge_slot_t *
    213 gs_get(xdf_t *vdp, int isread)
    214 {
    215 	grant_ref_t gh;
    216 	ge_slot_t *gs;
    217 
    218 	/* try to alloc GTEs needed in this slot, first */
    219 	if (gnttab_alloc_grant_references(
    220 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
    221 		if (vdp->xdf_gnt_callback.next == NULL) {
    222 			SETDMACBON(vdp);
    223 			gnttab_request_free_callback(
    224 			    &vdp->xdf_gnt_callback,
    225 			    (void (*)(void *))xdf_dmacallback,
    226 			    (void *)vdp,
    227 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
    228 		}
    229 		return (NULL);
    230 	}
    231 
    232 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
    233 	if (gs == NULL) {
    234 		gnttab_free_grant_references(gh);
    235 		if (vdp->xdf_timeout_id == 0)
    236 			/* restart I/O after one second */
    237 			vdp->xdf_timeout_id =
    238 			    timeout(xdf_timeout_handler, vdp, hz);
    239 		return (NULL);
    240 	}
    241 
    242 	/* init gs_slot */
    243 	gs->gs_oeid = vdp->xdf_peer;
    244 	gs->gs_isread = isread;
    245 	gs->gs_ghead = gh;
    246 	gs->gs_ngrefs = 0;
    247 
    248 	return (gs);
    249 }
    250 
    251 static void
    252 gs_free(ge_slot_t *gs)
    253 {
    254 	int		i;
    255 
    256 	/* release all grant table entry resources used in this slot */
    257 	for (i = 0; i < gs->gs_ngrefs; i++)
    258 		gnttab_end_foreign_access(gs->gs_ge[i], !gs->gs_isread, 0);
    259 	gnttab_free_grant_references(gs->gs_ghead);
    260 	list_remove(&gs->gs_vreq->v_gs, gs);
    261 	kmem_cache_free(xdf_gs_cache, gs);
    262 }
    263 
    264 static grant_ref_t
    265 gs_grant(ge_slot_t *gs, mfn_t mfn)
    266 {
    267 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->gs_ghead);
    268 
    269 	ASSERT(gr != -1);
    270 	ASSERT(gs->gs_ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
    271 	gs->gs_ge[gs->gs_ngrefs++] = gr;
    272 	gnttab_grant_foreign_access_ref(gr, gs->gs_oeid, mfn, !gs->gs_isread);
    273 
    274 	return (gr);
    275 }
    276 
    277 /*
    278  * Alloc a vreq for this bp
    279  * bp->av_back contains the pointer to the vreq upon return
    280  */
    281 static v_req_t *
    282 vreq_get(xdf_t *vdp, buf_t *bp)
    283 {
    284 	v_req_t *vreq = NULL;
    285 
    286 	ASSERT(BP_VREQ(bp) == NULL);
    287 
    288 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
    289 	if (vreq == NULL) {
    290 		if (vdp->xdf_timeout_id == 0)
    291 			/* restart I/O after one second */
    292 			vdp->xdf_timeout_id =
    293 			    timeout(xdf_timeout_handler, vdp, hz);
    294 		return (NULL);
    295 	}
    296 	bzero(vreq, sizeof (v_req_t));
    297 	list_create(&vreq->v_gs, sizeof (ge_slot_t),
    298 	    offsetof(ge_slot_t, gs_vreq_link));
    299 	vreq->v_buf = bp;
    300 	vreq->v_status = VREQ_INIT;
    301 	vreq->v_runq = B_FALSE;
    302 	BP_VREQ_SET(bp, vreq);
    303 	/* init of other fields in vreq is up to the caller */
    304 
    305 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
    306 
    307 	return (vreq);
    308 }
    309 
    310 static void
    311 vreq_free(xdf_t *vdp, v_req_t *vreq)
    312 {
    313 	buf_t	*bp = vreq->v_buf;
    314 
    315 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
    316 	ASSERT(BP_VREQ(bp) == vreq);
    317 
    318 	list_remove(&vdp->xdf_vreq_act, vreq);
    319 
    320 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
    321 		goto done;
    322 
    323 	switch (vreq->v_status) {
    324 	case VREQ_DMAWIN_DONE:
    325 	case VREQ_GS_ALLOCED:
    326 	case VREQ_DMABUF_BOUND:
    327 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
    328 		/*FALLTHRU*/
    329 	case VREQ_DMAMEM_ALLOCED:
    330 		if (!ALIGNED_XFER(bp)) {
    331 			ASSERT(vreq->v_abuf != NULL);
    332 			if (!IS_ERROR(bp) && IS_READ(bp))
    333 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
    334 				    bp->b_bcount);
    335 			ddi_dma_mem_free(&vreq->v_align);
    336 		}
    337 		/*FALLTHRU*/
    338 	case VREQ_MEMDMAHDL_ALLOCED:
    339 		if (!ALIGNED_XFER(bp))
    340 			ddi_dma_free_handle(&vreq->v_memdmahdl);
    341 		/*FALLTHRU*/
    342 	case VREQ_DMAHDL_ALLOCED:
    343 		ddi_dma_free_handle(&vreq->v_dmahdl);
    344 		break;
    345 	default:
    346 		break;
    347 	}
    348 done:
    349 	ASSERT(!vreq->v_runq);
    350 	list_destroy(&vreq->v_gs);
    351 	kmem_cache_free(xdf_vreq_cache, vreq);
    352 }
    353 
    354 /*
    355  * Snarf new data if our flush block was re-written
    356  */
    357 static void
    358 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
    359 {
    360 	int nblks;
    361 	boolean_t mapin;
    362 
    363 	if (IS_WRITE_BARRIER(vdp, bp))
    364 		return; /* write was a flush write */
    365 
    366 	mapin = B_FALSE;
    367 	nblks = bp->b_bcount >> DEV_BSHIFT;
    368 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
    369 		xdf_fbrewrites++;
    370 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
    371 			mapin = B_TRUE;
    372 			bp_mapin(bp);
    373 		}
    374 		bcopy(bp->b_un.b_addr +
    375 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
    376 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
    377 		if (mapin)
    378 			bp_mapout(bp);
    379 	}
    380 }
    381 
    382 /*
    383  * Initalize the DMA and grant table resources for the buf
    384  */
    385 static int
    386 vreq_setup(xdf_t *vdp, v_req_t *vreq)
    387 {
    388 	int rc;
    389 	ddi_dma_attr_t dmaattr;
    390 	uint_t ndcs, ndws;
    391 	ddi_dma_handle_t dh;
    392 	ddi_dma_handle_t mdh;
    393 	ddi_dma_cookie_t dc;
    394 	ddi_acc_handle_t abh;
    395 	caddr_t	aba;
    396 	ge_slot_t *gs;
    397 	size_t bufsz;
    398 	off_t off;
    399 	size_t sz;
    400 	buf_t *bp = vreq->v_buf;
    401 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
    402 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
    403 
    404 	switch (vreq->v_status) {
    405 	case VREQ_INIT:
    406 		if (IS_FLUSH_DISKCACHE(bp)) {
    407 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
    408 				DPRINTF(DMA_DBG, ("xdf@%s: "
    409 				    "get ge_slotfailed\n", vdp->xdf_addr));
    410 				return (DDI_FAILURE);
    411 			}
    412 			vreq->v_blkno = 0;
    413 			vreq->v_nslots = 1;
    414 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
    415 			vreq->v_status = VREQ_GS_ALLOCED;
    416 			gs->gs_vreq = vreq;
    417 			list_insert_head(&vreq->v_gs, gs);
    418 			return (DDI_SUCCESS);
    419 		}
    420 
    421 		if (IS_WRITE_BARRIER(vdp, bp))
    422 			vreq->v_flush_diskcache = WRITE_BARRIER;
    423 		vreq->v_blkno = bp->b_blkno +
    424 		    (diskaddr_t)(uintptr_t)bp->b_private;
    425 		/* See if we wrote new data to our flush block */
    426 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
    427 			check_fbwrite(vdp, bp, vreq->v_blkno);
    428 		vreq->v_status = VREQ_INIT_DONE;
    429 		/*FALLTHRU*/
    430 
    431 	case VREQ_INIT_DONE:
    432 		/*
    433 		 * alloc DMA handle
    434 		 */
    435 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
    436 		    xdf_dmacallback, (caddr_t)vdp, &dh);
    437 		if (rc != DDI_SUCCESS) {
    438 			SETDMACBON(vdp);
    439 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
    440 			    vdp->xdf_addr));
    441 			return (DDI_FAILURE);
    442 		}
    443 
    444 		vreq->v_dmahdl = dh;
    445 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
    446 		/*FALLTHRU*/
    447 
    448 	case VREQ_DMAHDL_ALLOCED:
    449 		/*
    450 		 * alloc dma handle for 512-byte aligned buf
    451 		 */
    452 		if (!ALIGNED_XFER(bp)) {
    453 			/*
    454 			 * XXPV: we need to temporarily enlarge the seg
    455 			 * boundary and s/g length to work round CR6381968
    456 			 */
    457 			dmaattr = xb_dma_attr;
    458 			dmaattr.dma_attr_seg = (uint64_t)-1;
    459 			dmaattr.dma_attr_sgllen = INT_MAX;
    460 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
    461 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
    462 			if (rc != DDI_SUCCESS) {
    463 				SETDMACBON(vdp);
    464 				DPRINTF(DMA_DBG, ("xdf@%s: "
    465 				    "unaligned buf DMAhandle alloc failed\n",
    466 				    vdp->xdf_addr));
    467 				return (DDI_FAILURE);
    468 			}
    469 			vreq->v_memdmahdl = mdh;
    470 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
    471 		}
    472 		/*FALLTHRU*/
    473 
    474 	case VREQ_MEMDMAHDL_ALLOCED:
    475 		/*
    476 		 * alloc 512-byte aligned buf
    477 		 */
    478 		if (!ALIGNED_XFER(bp)) {
    479 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
    480 				bp_mapin(bp);
    481 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
    482 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
    483 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
    484 			    &aba, &bufsz, &abh);
    485 			if (rc != DDI_SUCCESS) {
    486 				SETDMACBON(vdp);
    487 				DPRINTF(DMA_DBG, ("xdf@%s: "
    488 				    "DMA mem allocation failed\n",
    489 				    vdp->xdf_addr));
    490 				return (DDI_FAILURE);
    491 			}
    492 
    493 			vreq->v_abuf = aba;
    494 			vreq->v_align = abh;
    495 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
    496 
    497 			ASSERT(bufsz >= bp->b_bcount);
    498 			if (!IS_READ(bp))
    499 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
    500 				    bp->b_bcount);
    501 		}
    502 		/*FALLTHRU*/
    503 
    504 	case VREQ_DMAMEM_ALLOCED:
    505 		/*
    506 		 * dma bind
    507 		 */
    508 		if (ALIGNED_XFER(bp)) {
    509 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
    510 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
    511 			    &dc, &ndcs);
    512 		} else {
    513 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
    514 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
    515 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
    516 		}
    517 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
    518 			/* get num of dma windows */
    519 			if (rc == DDI_DMA_PARTIAL_MAP) {
    520 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
    521 				ASSERT(rc == DDI_SUCCESS);
    522 			} else {
    523 				ndws = 1;
    524 			}
    525 		} else {
    526 			SETDMACBON(vdp);
    527 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
    528 			    vdp->xdf_addr));
    529 			return (DDI_FAILURE);
    530 		}
    531 
    532 		vreq->v_dmac = dc;
    533 		vreq->v_dmaw = 0;
    534 		vreq->v_ndmacs = ndcs;
    535 		vreq->v_ndmaws = ndws;
    536 		vreq->v_nslots = ndws;
    537 		vreq->v_status = VREQ_DMABUF_BOUND;
    538 		/*FALLTHRU*/
    539 
    540 	case VREQ_DMABUF_BOUND:
    541 		/*
    542 		 * get ge_slot, callback is set upon failure from gs_get(),
    543 		 * if not set previously
    544 		 */
    545 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
    546 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
    547 			    vdp->xdf_addr));
    548 			return (DDI_FAILURE);
    549 		}
    550 
    551 		vreq->v_status = VREQ_GS_ALLOCED;
    552 		gs->gs_vreq = vreq;
    553 		list_insert_head(&vreq->v_gs, gs);
    554 		break;
    555 
    556 	case VREQ_GS_ALLOCED:
    557 		/* nothing need to be done */
    558 		break;
    559 
    560 	case VREQ_DMAWIN_DONE:
    561 		/*
    562 		 * move to the next dma window
    563 		 */
    564 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
    565 
    566 		/* get a ge_slot for this DMA window */
    567 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
    568 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
    569 			    vdp->xdf_addr));
    570 			return (DDI_FAILURE);
    571 		}
    572 
    573 		vreq->v_dmaw++;
    574 		VERIFY(ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
    575 		    &vreq->v_dmac, &vreq->v_ndmacs) == DDI_SUCCESS);
    576 		vreq->v_status = VREQ_GS_ALLOCED;
    577 		gs->gs_vreq = vreq;
    578 		list_insert_head(&vreq->v_gs, gs);
    579 		break;
    580 
    581 	default:
    582 		return (DDI_FAILURE);
    583 	}
    584 
    585 	return (DDI_SUCCESS);
    586 }
    587 
    588 static int
    589 xdf_cmlb_attach(xdf_t *vdp)
    590 {
    591 	dev_info_t	*dip = vdp->xdf_dip;
    592 
    593 	return (cmlb_attach(dip, &xdf_lb_ops,
    594 	    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
    595 	    XD_IS_RM(vdp),
    596 	    B_TRUE,
    597 	    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
    598 #if defined(XPV_HVM_DRIVER)
    599 	    (XD_IS_CD(vdp) ? 0 : CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT) |
    600 	    CMLB_INTERNAL_MINOR_NODES,
    601 #else /* !XPV_HVM_DRIVER */
    602 	    XD_IS_CD(vdp) ? 0 : CMLB_FAKE_LABEL_ONE_PARTITION,
    603 #endif /* !XPV_HVM_DRIVER */
    604 	    vdp->xdf_vd_lbl, NULL));
    605 }
    606 
    607 static void
    608 xdf_io_err(buf_t *bp, int err, size_t resid)
    609 {
    610 	bioerror(bp, err);
    611 	if (resid == 0)
    612 		bp->b_resid = bp->b_bcount;
    613 	biodone(bp);
    614 }
    615 
    616 static void
    617 xdf_kstat_enter(xdf_t *vdp, buf_t *bp)
    618 {
    619 	v_req_t *vreq = BP_VREQ(bp);
    620 
    621 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
    622 
    623 	if (vdp->xdf_xdev_iostat == NULL)
    624 		return;
    625 	if ((vreq != NULL) && vreq->v_runq) {
    626 		kstat_runq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
    627 	} else {
    628 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
    629 	}
    630 }
    631 
    632 static void
    633 xdf_kstat_exit(xdf_t *vdp, buf_t *bp)
    634 {
    635 	v_req_t *vreq = BP_VREQ(bp);
    636 
    637 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
    638 
    639 	if (vdp->xdf_xdev_iostat == NULL)
    640 		return;
    641 	if ((vreq != NULL) && vreq->v_runq) {
    642 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
    643 	} else {
    644 		kstat_waitq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
    645 	}
    646 }
    647 
    648 static void
    649 xdf_kstat_waitq_to_runq(xdf_t *vdp, buf_t *bp)
    650 {
    651 	v_req_t *vreq = BP_VREQ(bp);
    652 
    653 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
    654 	ASSERT(!vreq->v_runq);
    655 
    656 	vreq->v_runq = B_TRUE;
    657 	if (vdp->xdf_xdev_iostat == NULL)
    658 		return;
    659 	kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
    660 }
    661 
    662 static void
    663 xdf_kstat_runq_to_waitq(xdf_t *vdp, buf_t *bp)
    664 {
    665 	v_req_t *vreq = BP_VREQ(bp);
    666 
    667 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
    668 	ASSERT(vreq->v_runq);
    669 
    670 	vreq->v_runq = B_FALSE;
    671 	if (vdp->xdf_xdev_iostat == NULL)
    672 		return;
    673 	kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
    674 }
    675 
    676 int
    677 xdf_kstat_create(dev_info_t *dip, char *ks_module, int instance)
    678 {
    679 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
    680 	kstat_t		*kstat;
    681 	buf_t		*bp;
    682 
    683 	if ((kstat = kstat_create(
    684 	    ks_module, instance, NULL, "disk",
    685 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL)
    686 		return (-1);
    687 
    688 	/* See comment about locking in xdf_kstat_delete(). */
    689 	mutex_enter(&vdp->xdf_iostat_lk);
    690 	mutex_enter(&vdp->xdf_dev_lk);
    691 
    692 	/* only one kstat can exist at a time */
    693 	if (vdp->xdf_xdev_iostat != NULL) {
    694 		mutex_exit(&vdp->xdf_dev_lk);
    695 		mutex_exit(&vdp->xdf_iostat_lk);
    696 		kstat_delete(kstat);
    697 		return (-1);
    698 	}
    699 
    700 	vdp->xdf_xdev_iostat = kstat;
    701 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
    702 	kstat_install(vdp->xdf_xdev_iostat);
    703 
    704 	/*
    705 	 * Now that we've created a kstat, we need to update the waitq and
    706 	 * runq counts for the kstat to reflect our current state.
    707 	 *
    708 	 * For a buf_t structure to be on the runq, it must have a ring
    709 	 * buffer slot associated with it.  To get a ring buffer slot the
    710 	 * buf must first have a v_req_t and a ge_slot_t associated with it.
    711 	 * Then when it is granted a ring buffer slot, v_runq will be set to
    712 	 * true.
    713 	 *
    714 	 * For a buf_t structure to be on the waitq, it must not be on the
    715 	 * runq.  So to find all the buf_t's that should be on waitq, we
    716 	 * walk the active buf list and add any buf_t's which aren't on the
    717 	 * runq to the waitq.
    718 	 */
    719 	bp = vdp->xdf_f_act;
    720 	while (bp != NULL) {
    721 		xdf_kstat_enter(vdp, bp);
    722 		bp = bp->av_forw;
    723 	}
    724 	if (vdp->xdf_ready_tq_bp != NULL)
    725 		xdf_kstat_enter(vdp, vdp->xdf_ready_tq_bp);
    726 
    727 	mutex_exit(&vdp->xdf_dev_lk);
    728 	mutex_exit(&vdp->xdf_iostat_lk);
    729 	return (0);
    730 }
    731 
    732 void
    733 xdf_kstat_delete(dev_info_t *dip)
    734 {
    735 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
    736 	kstat_t		*kstat;
    737 	buf_t		*bp;
    738 
    739 	/*
    740 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
    741 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
    742 	 * and the contents of the our kstat.  xdf_iostat_lk is used
    743 	 * to protect the allocation and freeing of the actual kstat.
    744 	 * xdf_dev_lk can't be used for this purpose because kstat
    745 	 * readers use it to access the contents of the kstat and
    746 	 * hence it can't be held when calling kstat_delete().
    747 	 */
    748 	mutex_enter(&vdp->xdf_iostat_lk);
    749 	mutex_enter(&vdp->xdf_dev_lk);
    750 
    751 	if (vdp->xdf_xdev_iostat == NULL) {
    752 		mutex_exit(&vdp->xdf_dev_lk);
    753 		mutex_exit(&vdp->xdf_iostat_lk);
    754 		return;
    755 	}
    756 
    757 	/*
    758 	 * We're about to destroy the kstat structures, so it isn't really
    759 	 * necessary to update the runq and waitq counts.  But, since this
    760 	 * isn't a hot code path we can afford to be a little pedantic and
    761 	 * go ahead and decrement the runq and waitq kstat counters to zero
    762 	 * before free'ing them.  This helps us ensure that we've gotten all
    763 	 * our accounting correct.
    764 	 *
    765 	 * For an explanation of how we determine which buffers go on the
    766 	 * runq vs which go on the waitq, see the comments in
    767 	 * xdf_kstat_create().
    768 	 */
    769 	bp = vdp->xdf_f_act;
    770 	while (bp != NULL) {
    771 		xdf_kstat_exit(vdp, bp);
    772 		bp = bp->av_forw;
    773 	}
    774 	if (vdp->xdf_ready_tq_bp != NULL)
    775 		xdf_kstat_exit(vdp, vdp->xdf_ready_tq_bp);
    776 
    777 	kstat = vdp->xdf_xdev_iostat;
    778 	vdp->xdf_xdev_iostat = NULL;
    779 	mutex_exit(&vdp->xdf_dev_lk);
    780 	kstat_delete(kstat);
    781 	mutex_exit(&vdp->xdf_iostat_lk);
    782 }
    783 
    784 /*
    785  * Add an IO requests onto the active queue.
    786  *
    787  * We have to detect IOs generated by xdf_ready_tq_thread.  These IOs
    788  * are used to establish a connection to the backend, so they recieve
    789  * priority over all other IOs.  Since xdf_ready_tq_thread only does
    790  * synchronous IO, there can only be one xdf_ready_tq_thread request at any
    791  * given time and we record the buf associated with that request in
    792  * xdf_ready_tq_bp.
    793  */
    794 static void
    795 xdf_bp_push(xdf_t *vdp, buf_t *bp)
    796 {
    797 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
    798 	ASSERT(bp->av_forw == NULL);
    799 
    800 	xdf_kstat_enter(vdp, bp);
    801 
    802 	if (curthread == vdp->xdf_ready_tq_thread) {
    803 		/* new IO requests from the ready thread */
    804 		ASSERT(vdp->xdf_ready_tq_bp == NULL);
    805 		vdp->xdf_ready_tq_bp = bp;
    806 		return;
    807 	}
    808 
    809 	/* this is normal IO request */
    810 	ASSERT(bp != vdp->xdf_ready_tq_bp);
    811 
    812 	if (vdp->xdf_f_act == NULL) {
    813 		/* this is only only IO on the active queue */
    814 		ASSERT(vdp->xdf_l_act == NULL);
    815 		ASSERT(vdp->xdf_i_act == NULL);
    816 		vdp->xdf_f_act = vdp->xdf_l_act = vdp->xdf_i_act = bp;
    817 		return;
    818 	}
    819 
    820 	/* add this IO to the tail of the active queue */
    821 	vdp->xdf_l_act->av_forw = bp;
    822 	vdp->xdf_l_act = bp;
    823 	if (vdp->xdf_i_act == NULL)
    824 		vdp->xdf_i_act = bp;
    825 }
    826 
    827 static void
    828 xdf_bp_pop(xdf_t *vdp, buf_t *bp)
    829 {
    830 	buf_t	*bp_iter;
    831 
    832 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
    833 	ASSERT(VREQ_DONE(BP_VREQ(bp)));
    834 
    835 	if (vdp->xdf_ready_tq_bp == bp) {
    836 		/* we're done with a ready thread IO request */
    837 		ASSERT(bp->av_forw == NULL);
    838 		vdp->xdf_ready_tq_bp = NULL;
    839 		return;
    840 	}
    841 
    842 	/* we're done with a normal IO request */
    843 	ASSERT((bp->av_forw != NULL) || (bp == vdp->xdf_l_act));
    844 	ASSERT((bp->av_forw == NULL) || (bp != vdp->xdf_l_act));
    845 	ASSERT(VREQ_DONE(BP_VREQ(vdp->xdf_f_act)));
    846 	ASSERT(vdp->xdf_f_act != vdp->xdf_i_act);
    847 
    848 	if (bp == vdp->xdf_f_act) {
    849 		/* This IO was at the head of our active queue. */
    850 		vdp->xdf_f_act = bp->av_forw;
    851 		if (bp == vdp->xdf_l_act)
    852 			vdp->xdf_l_act = NULL;
    853 	} else {
    854 		/* There IO finished before some other pending IOs. */
    855 		bp_iter = vdp->xdf_f_act;
    856 		while (bp != bp_iter->av_forw) {
    857 			bp_iter = bp_iter->av_forw;
    858 			ASSERT(VREQ_DONE(BP_VREQ(bp_iter)));
    859 			ASSERT(bp_iter != vdp->xdf_i_act);
    860 		}
    861 		bp_iter->av_forw = bp->av_forw;
    862 		if (bp == vdp->xdf_l_act)
    863 			vdp->xdf_l_act = bp_iter;
    864 	}
    865 	bp->av_forw = NULL;
    866 }
    867 
    868 static buf_t *
    869 xdf_bp_next(xdf_t *vdp)
    870 {
    871 	v_req_t	*vreq;
    872 	buf_t	*bp;
    873 
    874 	if (vdp->xdf_state == XD_CONNECTED) {
    875 		/*
    876 		 * If we're in the XD_CONNECTED state, we only service IOs
    877 		 * from the xdf_ready_tq_thread thread.
    878 		 */
    879 		if ((bp = vdp->xdf_ready_tq_bp) == NULL)
    880 			return (NULL);
    881 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
    882 			return (bp);
    883 		return (NULL);
    884 	}
    885 
    886 	/* if we're not in the XD_CONNECTED or XD_READY state we can't do IO */
    887 	if (vdp->xdf_state != XD_READY)
    888 		return (NULL);
    889 
    890 	ASSERT(vdp->xdf_ready_tq_bp == NULL);
    891 	for (;;) {
    892 		if ((bp = vdp->xdf_i_act) == NULL)
    893 			return (NULL);
    894 		if (((vreq = BP_VREQ(bp)) == NULL) || (!VREQ_DONE(vreq)))
    895 			return (bp);
    896 
    897 		/* advance the active buf index pointer */
    898 		vdp->xdf_i_act = bp->av_forw;
    899 	}
    900 }
    901 
    902 static void
    903 xdf_io_fini(xdf_t *vdp, uint64_t id, int bioerr)
    904 {
    905 	ge_slot_t	*gs = (ge_slot_t *)(uintptr_t)id;
    906 	v_req_t		*vreq = gs->gs_vreq;
    907 	buf_t		*bp = vreq->v_buf;
    908 
    909 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
    910 	ASSERT(BP_VREQ(bp) == vreq);
    911 
    912 	gs_free(gs);
    913 
    914 	if (bioerr != 0)
    915 		bioerror(bp, bioerr);
    916 	ASSERT(vreq->v_nslots > 0);
    917 	if (--vreq->v_nslots > 0)
    918 		return;
    919 
    920 	/* remove this IO from our active queue */
    921 	xdf_bp_pop(vdp, bp);
    922 
    923 	ASSERT(vreq->v_runq);
    924 	xdf_kstat_exit(vdp, bp);
    925 	vreq->v_runq = B_FALSE;
    926 	vreq_free(vdp, vreq);
    927 
    928 	if (IS_ERROR(bp)) {
    929 		xdf_io_err(bp, geterror(bp), 0);
    930 	} else if (bp->b_resid != 0) {
    931 		/* Partial transfers are an error */
    932 		xdf_io_err(bp, EIO, bp->b_resid);
    933 	} else {
    934 		biodone(bp);
    935 	}
    936 }
    937 
    938 /*
    939  * xdf interrupt handler
    940  */
    941 static uint_t
    942 xdf_intr_locked(xdf_t *vdp)
    943 {
    944 	xendev_ring_t *xbr;
    945 	blkif_response_t *resp;
    946 	int bioerr;
    947 	uint64_t id;
    948 	uint8_t op;
    949 	uint16_t status;
    950 	ddi_acc_handle_t acchdl;
    951 
    952 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
    953 
    954 	if ((xbr = vdp->xdf_xb_ring) == NULL)
    955 		return (DDI_INTR_UNCLAIMED);
    956 
    957 	acchdl = vdp->xdf_xb_ring_hdl;
    958 
    959 	/*
    960 	 * complete all requests which have a response
    961 	 */
    962 	while (resp = xvdi_ring_get_response(xbr)) {
    963 		id = ddi_get64(acchdl, &resp->id);
    964 		op = ddi_get8(acchdl, &resp->operation);
    965 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
    966 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
    967 		    op, id, status));
    968 
    969 		if (status != BLKIF_RSP_OKAY) {
    970 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
    971 			    vdp->xdf_addr,
    972 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
    973 			bioerr = EIO;
    974 		} else {
    975 			bioerr = 0;
    976 		}
    977 
    978 		xdf_io_fini(vdp, id, bioerr);
    979 	}
    980 	return (DDI_INTR_CLAIMED);
    981 }
    982 
    983 /*
    984  * xdf_intr runs at PIL 5, so no one else can grab xdf_dev_lk and
    985  * block at a lower pil.
    986  */
    987 static uint_t
    988 xdf_intr(caddr_t arg)
    989 {
    990 	xdf_t *vdp = (xdf_t *)arg;
    991 	int rv;
    992 
    993 	mutex_enter(&vdp->xdf_dev_lk);
    994 	rv = xdf_intr_locked(vdp);
    995 	mutex_exit(&vdp->xdf_dev_lk);
    996 
    997 	if (!do_polled_io)
    998 		xdf_io_start(vdp);
    999 
   1000 	return (rv);
   1001 }
   1002 
   1003 static void
   1004 xdf_ring_push(xdf_t *vdp)
   1005 {
   1006 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
   1007 
   1008 	if (vdp->xdf_xb_ring == NULL)
   1009 		return;
   1010 
   1011 	if (xvdi_ring_push_request(vdp->xdf_xb_ring)) {
   1012 		DPRINTF(IO_DBG, (
   1013 		    "xdf@%s: xdf_ring_push: sent request(s) to backend\n",
   1014 		    vdp->xdf_addr));
   1015 	}
   1016 
   1017 	if (xvdi_get_evtchn(vdp->xdf_dip) != INVALID_EVTCHN)
   1018 		xvdi_notify_oe(vdp->xdf_dip);
   1019 }
   1020 
   1021 static int
   1022 xdf_ring_drain_locked(xdf_t *vdp)
   1023 {
   1024 	int		pollc, rv = 0;
   1025 
   1026 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
   1027 
   1028 	if (xdf_debug & SUSRES_DBG)
   1029 		xen_printf("xdf_ring_drain: start\n");
   1030 
   1031 	for (pollc = 0; pollc < XDF_DRAIN_RETRY_COUNT; pollc++) {
   1032 		if (vdp->xdf_xb_ring == NULL)
   1033 			goto out;
   1034 
   1035 		if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
   1036 			(void) xdf_intr_locked(vdp);
   1037 		if (!xvdi_ring_has_incomp_request(vdp->xdf_xb_ring))
   1038 			goto out;
   1039 		xdf_ring_push(vdp);
   1040 
   1041 		/* file-backed devices can be slow */
   1042 		mutex_exit(&vdp->xdf_dev_lk);
   1043 #ifdef XPV_HVM_DRIVER
   1044 		(void) HYPERVISOR_yield();
   1045 #endif /* XPV_HVM_DRIVER */
   1046 		delay(drv_usectohz(XDF_DRAIN_MSEC_DELAY));
   1047 		mutex_enter(&vdp->xdf_dev_lk);
   1048 	}
   1049 	cmn_err(CE_WARN, "xdf@%s: xdf_ring_drain: timeout", vdp->xdf_addr);
   1050 
   1051 out:
   1052 	if (vdp->xdf_xb_ring != NULL) {
   1053 		if (xvdi_ring_has_incomp_request(vdp->xdf_xb_ring) ||
   1054 		    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring))
   1055 			rv = EIO;
   1056 	}
   1057 	if (xdf_debug & SUSRES_DBG)
   1058 		xen_printf("xdf@%s: xdf_ring_drain: end, err=%d\n",
   1059 		    vdp->xdf_addr, rv);
   1060 	return (rv);
   1061 }
   1062 
   1063 static int
   1064 xdf_ring_drain(xdf_t *vdp)
   1065 {
   1066 	int rv;
   1067 	mutex_enter(&vdp->xdf_dev_lk);
   1068 	rv = xdf_ring_drain_locked(vdp);
   1069 	mutex_exit(&vdp->xdf_dev_lk);
   1070 	return (rv);
   1071 }
   1072 
   1073 /*
   1074  * Destroy all v_req_t, grant table entries, and our ring buffer.
   1075  */
   1076 static void
   1077 xdf_ring_destroy(xdf_t *vdp)
   1078 {
   1079 	v_req_t		*vreq;
   1080 	buf_t		*bp;
   1081 	ge_slot_t	*gs;
   1082 
   1083 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   1084 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
   1085 
   1086 	if ((vdp->xdf_state != XD_INIT) &&
   1087 	    (vdp->xdf_state != XD_CONNECTED) &&
   1088 	    (vdp->xdf_state != XD_READY)) {
   1089 		ASSERT(vdp->xdf_xb_ring == NULL);
   1090 		ASSERT(vdp->xdf_xb_ring_hdl == NULL);
   1091 		ASSERT(vdp->xdf_peer == INVALID_DOMID);
   1092 		ASSERT(vdp->xdf_evtchn == INVALID_EVTCHN);
   1093 		ASSERT(list_is_empty(&vdp->xdf_vreq_act));
   1094 		return;
   1095 	}
   1096 
   1097 	/*
   1098 	 * We don't want to recieve async notifications from the backend
   1099 	 * when it finishes processing ring entries.
   1100 	 */
   1101 #ifdef XPV_HVM_DRIVER
   1102 	ec_unbind_evtchn(vdp->xdf_evtchn);
   1103 #else /* !XPV_HVM_DRIVER */
   1104 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
   1105 #endif /* !XPV_HVM_DRIVER */
   1106 
   1107 	/*
   1108 	 * Drain any requests in the ring.  We need to do this before we
   1109 	 * can free grant table entries, because if active ring entries
   1110 	 * point to grants, then the backend could be trying to access
   1111 	 * those grants.
   1112 	 */
   1113 	(void) xdf_ring_drain_locked(vdp);
   1114 
   1115 	/* We're done talking to the backend so free up our event channel */
   1116 	xvdi_free_evtchn(vdp->xdf_dip);
   1117 	vdp->xdf_evtchn = INVALID_EVTCHN;
   1118 
   1119 	while ((vreq = list_head(&vdp->xdf_vreq_act)) != NULL) {
   1120 		bp = vreq->v_buf;
   1121 		ASSERT(BP_VREQ(bp) == vreq);
   1122 
   1123 		/* Free up any grant table entries associaed with this IO */
   1124 		while ((gs = list_head(&vreq->v_gs)) != NULL)
   1125 			gs_free(gs);
   1126 
   1127 		/* If this IO was on the runq, move it back to the waitq. */
   1128 		if (vreq->v_runq)
   1129 			xdf_kstat_runq_to_waitq(vdp, bp);
   1130 
   1131 		/*
   1132 		 * Reset any buf IO state since we're going to re-issue the
   1133 		 * IO when we reconnect.
   1134 		 */
   1135 		vreq_free(vdp, vreq);
   1136 		BP_VREQ_SET(bp, NULL);
   1137 		bioerror(bp, 0);
   1138 	}
   1139 
   1140 	/* reset the active queue index pointer */
   1141 	vdp->xdf_i_act = vdp->xdf_f_act;
   1142 
   1143 	/* Destroy the ring */
   1144 	xvdi_free_ring(vdp->xdf_xb_ring);
   1145 	vdp->xdf_xb_ring = NULL;
   1146 	vdp->xdf_xb_ring_hdl = NULL;
   1147 	vdp->xdf_peer = INVALID_DOMID;
   1148 }
   1149 
   1150 void
   1151 xdfmin(struct buf *bp)
   1152 {
   1153 	if (bp->b_bcount > xdf_maxphys)
   1154 		bp->b_bcount = xdf_maxphys;
   1155 }
   1156 
   1157 /*
   1158  * Check if we have a pending "eject" media request.
   1159  */
   1160 static int
   1161 xdf_eject_pending(xdf_t *vdp)
   1162 {
   1163 	dev_info_t	*dip = vdp->xdf_dip;
   1164 	char		*xsname, *str;
   1165 
   1166 	if (!vdp->xdf_media_req_supported)
   1167 		return (B_FALSE);
   1168 
   1169 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
   1170 	    (xenbus_read_str(xsname, XBP_MEDIA_REQ, &str) != 0))
   1171 		return (B_FALSE);
   1172 
   1173 	if (strcmp(str, XBV_MEDIA_REQ_EJECT) != 0) {
   1174 		strfree(str);
   1175 		return (B_FALSE);
   1176 	}
   1177 	strfree(str);
   1178 	return (B_TRUE);
   1179 }
   1180 
   1181 /*
   1182  * Generate a media request.
   1183  */
   1184 static int
   1185 xdf_media_req(xdf_t *vdp, char *req, boolean_t media_required)
   1186 {
   1187 	dev_info_t	*dip = vdp->xdf_dip;
   1188 	char		*xsname;
   1189 
   1190 	/*
   1191 	 * we can't be holding xdf_dev_lk because xenbus_printf() can
   1192 	 * block while waiting for a PIL 1 interrupt message.  this
   1193 	 * would cause a deadlock with xdf_intr() which needs to grab
   1194 	 * xdf_dev_lk as well and runs at PIL 5.
   1195 	 */
   1196 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   1197 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
   1198 
   1199 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
   1200 		return (ENXIO);
   1201 
   1202 	/* Check if we support media requests */
   1203 	if (!XD_IS_CD(vdp) || !vdp->xdf_media_req_supported)
   1204 		return (ENOTTY);
   1205 
   1206 	/* If an eject is pending then don't allow any new requests */
   1207 	if (xdf_eject_pending(vdp))
   1208 		return (ENXIO);
   1209 
   1210 	/* Make sure that there is media present */
   1211 	if (media_required && (vdp->xdf_xdev_nblocks == 0))
   1212 		return (ENXIO);
   1213 
   1214 	/* We only allow operations when the device is ready and connected */
   1215 	if (vdp->xdf_state != XD_READY)
   1216 		return (EIO);
   1217 
   1218 	if (xenbus_printf(XBT_NULL, xsname, XBP_MEDIA_REQ, "%s", req) != 0)
   1219 		return (EIO);
   1220 
   1221 	return (0);
   1222 }
   1223 
   1224 /*
   1225  * populate a single blkif_request_t w/ a buf
   1226  */
   1227 static void
   1228 xdf_process_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
   1229 {
   1230 	grant_ref_t	gr;
   1231 	uint8_t		fsect, lsect;
   1232 	size_t		bcnt;
   1233 	paddr_t		dma_addr;
   1234 	off_t		blk_off;
   1235 	dev_info_t	*dip = vdp->xdf_dip;
   1236 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
   1237 	v_req_t		*vreq = BP_VREQ(bp);
   1238 	uint64_t	blkno = vreq->v_blkno;
   1239 	uint_t		ndmacs = vreq->v_ndmacs;
   1240 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
   1241 	int		seg = 0;
   1242 	int		isread = IS_READ(bp);
   1243 	ge_slot_t	*gs = list_head(&vreq->v_gs);
   1244 
   1245 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
   1246 	ASSERT(vreq->v_status == VREQ_GS_ALLOCED);
   1247 
   1248 	if (isread)
   1249 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
   1250 	else {
   1251 		switch (vreq->v_flush_diskcache) {
   1252 		case FLUSH_DISKCACHE:
   1253 			ddi_put8(acchdl, &rreq->operation,
   1254 			    BLKIF_OP_FLUSH_DISKCACHE);
   1255 			ddi_put16(acchdl, &rreq->handle, vdev);
   1256 			ddi_put64(acchdl, &rreq->id,
   1257 			    (uint64_t)(uintptr_t)(gs));
   1258 			ddi_put8(acchdl, &rreq->nr_segments, 0);
   1259 			vreq->v_status = VREQ_DMAWIN_DONE;
   1260 			return;
   1261 		case WRITE_BARRIER:
   1262 			ddi_put8(acchdl, &rreq->operation,
   1263 			    BLKIF_OP_WRITE_BARRIER);
   1264 			break;
   1265 		default:
   1266 			if (!vdp->xdf_wce)
   1267 				ddi_put8(acchdl, &rreq->operation,
   1268 				    BLKIF_OP_WRITE_BARRIER);
   1269 			else
   1270 				ddi_put8(acchdl, &rreq->operation,
   1271 				    BLKIF_OP_WRITE);
   1272 			break;
   1273 		}
   1274 	}
   1275 
   1276 	ddi_put16(acchdl, &rreq->handle, vdev);
   1277 	ddi_put64(acchdl, &rreq->sector_number, blkno);
   1278 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(gs));
   1279 
   1280 	/*
   1281 	 * loop until all segments are populated or no more dma cookie in buf
   1282 	 */
   1283 	for (;;) {
   1284 		/*
   1285 		 * Each segment of a blkif request can transfer up to
   1286 		 * one 4K page of data.
   1287 		 */
   1288 		bcnt = vreq->v_dmac.dmac_size;
   1289 		dma_addr = vreq->v_dmac.dmac_laddress;
   1290 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
   1291 		fsect = blk_off >> XB_BSHIFT;
   1292 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
   1293 
   1294 		ASSERT(bcnt <= PAGESIZE);
   1295 		ASSERT((bcnt % XB_BSIZE) == 0);
   1296 		ASSERT((blk_off & XB_BMASK) == 0);
   1297 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
   1298 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
   1299 
   1300 		gr = gs_grant(gs, PATOMA(dma_addr) >> PAGESHIFT);
   1301 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
   1302 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
   1303 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
   1304 
   1305 		DPRINTF(IO_DBG, (
   1306 		    "xdf@%s: seg%d: dmacS %lu blk_off %ld\n",
   1307 		    vdp->xdf_addr, seg, vreq->v_dmac.dmac_size, blk_off));
   1308 		DPRINTF(IO_DBG, (
   1309 		    "xdf@%s: seg%d: fs %d ls %d gr %d dma 0x%"PRIx64"\n",
   1310 		    vdp->xdf_addr, seg, fsect, lsect, gr, dma_addr));
   1311 
   1312 		blkno += (bcnt >> XB_BSHIFT);
   1313 		seg++;
   1314 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
   1315 		if (--ndmacs) {
   1316 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
   1317 			continue;
   1318 		}
   1319 
   1320 		vreq->v_status = VREQ_DMAWIN_DONE;
   1321 		vreq->v_blkno = blkno;
   1322 		break;
   1323 	}
   1324 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
   1325 	DPRINTF(IO_DBG, (
   1326 	    "xdf@%s: xdf_process_rreq: request id=%"PRIx64" ready\n",
   1327 	    vdp->xdf_addr, rreq->id));
   1328 }
   1329 
   1330 static void
   1331 xdf_io_start(xdf_t *vdp)
   1332 {
   1333 	struct buf	*bp;
   1334 	v_req_t		*vreq;
   1335 	blkif_request_t	*rreq;
   1336 	boolean_t	rreqready = B_FALSE;
   1337 
   1338 	mutex_enter(&vdp->xdf_dev_lk);
   1339 
   1340 	/*
   1341 	 * Populate the ring request(s).  Loop until there is no buf to
   1342 	 * transfer or no free slot available in I/O ring.
   1343 	 */
   1344 	for (;;) {
   1345 		/* don't start any new IO if we're suspending */
   1346 		if (vdp->xdf_suspending)
   1347 			break;
   1348 		if ((bp = xdf_bp_next(vdp)) == NULL)
   1349 			break;
   1350 
   1351 		/* if the buf doesn't already have a vreq, allocate one */
   1352 		if (((vreq = BP_VREQ(bp)) == NULL) &&
   1353 		    ((vreq = vreq_get(vdp, bp)) == NULL))
   1354 			break;
   1355 
   1356 		/* alloc DMA/GTE resources */
   1357 		if (vreq_setup(vdp, vreq) != DDI_SUCCESS)
   1358 			break;
   1359 
   1360 		/* get next blkif_request in the ring */
   1361 		if ((rreq = xvdi_ring_get_request(vdp->xdf_xb_ring)) == NULL)
   1362 			break;
   1363 		bzero(rreq, sizeof (blkif_request_t));
   1364 		rreqready = B_TRUE;
   1365 
   1366 		/* populate blkif_request with this buf */
   1367 		xdf_process_rreq(vdp, bp, rreq);
   1368 
   1369 		/*
   1370 		 * This buffer/vreq pair is has been allocated a ring buffer
   1371 		 * resources, so if it isn't already in our runq, add it.
   1372 		 */
   1373 		if (!vreq->v_runq)
   1374 			xdf_kstat_waitq_to_runq(vdp, bp);
   1375 	}
   1376 
   1377 	/* Send the request(s) to the backend */
   1378 	if (rreqready)
   1379 		xdf_ring_push(vdp);
   1380 
   1381 	mutex_exit(&vdp->xdf_dev_lk);
   1382 }
   1383 
   1384 
   1385 /* check if partition is open, -1 - check all partitions on the disk */
   1386 static boolean_t
   1387 xdf_isopen(xdf_t *vdp, int partition)
   1388 {
   1389 	int i;
   1390 	ulong_t parbit;
   1391 	boolean_t rval = B_FALSE;
   1392 
   1393 	ASSERT((partition == -1) ||
   1394 	    ((partition >= 0) || (partition < XDF_PEXT)));
   1395 
   1396 	if (partition == -1)
   1397 		parbit = (ulong_t)-1;
   1398 	else
   1399 		parbit = 1 << partition;
   1400 
   1401 	for (i = 0; i < OTYPCNT; i++) {
   1402 		if (vdp->xdf_vd_open[i] & parbit)
   1403 			rval = B_TRUE;
   1404 	}
   1405 
   1406 	return (rval);
   1407 }
   1408 
   1409 /*
   1410  * The connection should never be closed as long as someone is holding
   1411  * us open, there is pending IO, or someone is waiting waiting for a
   1412  * connection.
   1413  */
   1414 static boolean_t
   1415 xdf_busy(xdf_t *vdp)
   1416 {
   1417 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
   1418 
   1419 	if ((vdp->xdf_xb_ring != NULL) &&
   1420 	    xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
   1421 		ASSERT(vdp->xdf_state != XD_CLOSED);
   1422 		return (B_TRUE);
   1423 	}
   1424 
   1425 	if (!list_is_empty(&vdp->xdf_vreq_act) || (vdp->xdf_f_act != NULL)) {
   1426 		ASSERT(vdp->xdf_state != XD_CLOSED);
   1427 		return (B_TRUE);
   1428 	}
   1429 
   1430 	if (xdf_isopen(vdp, -1)) {
   1431 		ASSERT(vdp->xdf_state != XD_CLOSED);
   1432 		return (B_TRUE);
   1433 	}
   1434 
   1435 	if (vdp->xdf_connect_req > 0) {
   1436 		ASSERT(vdp->xdf_state != XD_CLOSED);
   1437 		return (B_TRUE);
   1438 	}
   1439 
   1440 	return (B_FALSE);
   1441 }
   1442 
   1443 static void
   1444 xdf_set_state(xdf_t *vdp, xdf_state_t new_state)
   1445 {
   1446 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   1447 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
   1448 	DPRINTF(DDI_DBG, ("xdf@%s: state change %d -> %d\n",
   1449 	    vdp->xdf_addr, vdp->xdf_state, new_state));
   1450 	vdp->xdf_state = new_state;
   1451 	cv_broadcast(&vdp->xdf_dev_cv);
   1452 }
   1453 
   1454 static void
   1455 xdf_disconnect(xdf_t *vdp, xdf_state_t new_state, boolean_t quiet)
   1456 {
   1457 	dev_info_t	*dip = vdp->xdf_dip;
   1458 	boolean_t	busy;
   1459 
   1460 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   1461 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
   1462 	ASSERT((new_state == XD_UNKNOWN) || (new_state == XD_CLOSED));
   1463 
   1464 	/* Check if we're already there. */
   1465 	if (vdp->xdf_state == new_state)
   1466 		return;
   1467 
   1468 	mutex_enter(&vdp->xdf_dev_lk);
   1469 	busy = xdf_busy(vdp);
   1470 
   1471 	/* If we're already closed then there's nothing todo. */
   1472 	if (vdp->xdf_state == XD_CLOSED) {
   1473 		ASSERT(!busy);
   1474 		xdf_set_state(vdp, new_state);
   1475 		mutex_exit(&vdp->xdf_dev_lk);
   1476 		return;
   1477 	}
   1478 
   1479 #ifdef DEBUG
   1480 	/* UhOh.  Warn the user that something bad has happened. */
   1481 	if (!quiet && busy && (vdp->xdf_state == XD_READY) &&
   1482 	    (vdp->xdf_xdev_nblocks != 0)) {
   1483 		cmn_err(CE_WARN, "xdf@%s: disconnected while in use",
   1484 		    vdp->xdf_addr);
   1485 	}
   1486 #endif /* DEBUG */
   1487 
   1488 	xdf_ring_destroy(vdp);
   1489 
   1490 	/* If we're busy then we can only go into the unknown state */
   1491 	xdf_set_state(vdp, (busy) ? XD_UNKNOWN : new_state);
   1492 	mutex_exit(&vdp->xdf_dev_lk);
   1493 
   1494 	/* if we're closed now, let the other end know */
   1495 	if (vdp->xdf_state == XD_CLOSED)
   1496 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
   1497 }
   1498 
   1499 
   1500 /*
   1501  * Kick-off connect process
   1502  * Status should be XD_UNKNOWN or XD_CLOSED
   1503  * On success, status will be changed to XD_INIT
   1504  * On error, it will be changed to XD_UNKNOWN
   1505  */
   1506 static int
   1507 xdf_setstate_init(xdf_t *vdp)
   1508 {
   1509 	dev_info_t		*dip = vdp->xdf_dip;
   1510 	xenbus_transaction_t	xbt;
   1511 	grant_ref_t		gref;
   1512 	char			*xsname, *str;
   1513 	int 			rv;
   1514 
   1515 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   1516 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
   1517 	ASSERT((vdp->xdf_state == XD_UNKNOWN) ||
   1518 	    (vdp->xdf_state == XD_CLOSED));
   1519 
   1520 	DPRINTF(DDI_DBG,
   1521 	    ("xdf@%s: starting connection process\n", vdp->xdf_addr));
   1522 
   1523 	/*
   1524 	 * If an eject is pending then don't allow a new connection.
   1525 	 * (Only the backend can clear media request eject request.)
   1526 	 */
   1527 	if (xdf_eject_pending(vdp))
   1528 		return (DDI_FAILURE);
   1529 
   1530 	if ((xsname = xvdi_get_xsname(dip)) == NULL)
   1531 		goto errout;
   1532 
   1533 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == INVALID_DOMID)
   1534 		goto errout;
   1535 
   1536 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitialising);
   1537 
   1538 	/*
   1539 	 * Sanity check for the existance of the xenbus device-type property.
   1540 	 * This property might not exist if we our xenbus device nodes was
   1541 	 * force destroyed while we were still connected to the backend.
   1542 	 */
   1543 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0)
   1544 		goto errout;
   1545 	strfree(str);
   1546 
   1547 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS)
   1548 		goto errout;
   1549 
   1550 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
   1551 #ifdef XPV_HVM_DRIVER
   1552 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
   1553 #else /* !XPV_HVM_DRIVER */
   1554 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
   1555 	    DDI_SUCCESS) {
   1556 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_init: "
   1557 		    "failed to add intr handler", vdp->xdf_addr);
   1558 		goto errout1;
   1559 	}
   1560 #endif /* !XPV_HVM_DRIVER */
   1561 
   1562 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
   1563 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
   1564 	    DDI_SUCCESS) {
   1565 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
   1566 		    vdp->xdf_addr);
   1567 		goto errout2;
   1568 	}
   1569 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
   1570 
   1571 	/*
   1572 	 * Write into xenstore the info needed by backend
   1573 	 */
   1574 trans_retry:
   1575 	if (xenbus_transaction_start(&xbt)) {
   1576 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
   1577 		    vdp->xdf_addr);
   1578 		xvdi_fatal_error(dip, EIO, "connect transaction init");
   1579 		goto fail_trans;
   1580 	}
   1581 
   1582 	/*
   1583 	 * XBP_PROTOCOL is written by the domain builder in the case of PV
   1584 	 * domains. However, it is not written for HVM domains, so let's
   1585 	 * write it here.
   1586 	 */
   1587 	if (((rv = xenbus_printf(xbt, xsname,
   1588 	    XBP_MEDIA_REQ, "%s", XBV_MEDIA_REQ_NONE)) != 0) ||
   1589 	    ((rv = xenbus_printf(xbt, xsname,
   1590 	    XBP_RING_REF, "%u", gref)) != 0) ||
   1591 	    ((rv = xenbus_printf(xbt, xsname,
   1592 	    XBP_EVENT_CHAN, "%u", vdp->xdf_evtchn)) != 0) ||
   1593 	    ((rv = xenbus_printf(xbt, xsname,
   1594 	    XBP_PROTOCOL, "%s", XEN_IO_PROTO_ABI_NATIVE)) != 0) ||
   1595 	    ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0)) {
   1596 		(void) xenbus_transaction_end(xbt, 1);
   1597 		xvdi_fatal_error(dip, rv, "connect transaction setup");
   1598 		goto fail_trans;
   1599 	}
   1600 
   1601 	/* kick-off connect process */
   1602 	if (rv = xenbus_transaction_end(xbt, 0)) {
   1603 		if (rv == EAGAIN)
   1604 			goto trans_retry;
   1605 		xvdi_fatal_error(dip, rv, "connect transaction commit");
   1606 		goto fail_trans;
   1607 	}
   1608 
   1609 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   1610 	mutex_enter(&vdp->xdf_dev_lk);
   1611 	xdf_set_state(vdp, XD_INIT);
   1612 	mutex_exit(&vdp->xdf_dev_lk);
   1613 
   1614 	return (DDI_SUCCESS);
   1615 
   1616 fail_trans:
   1617 	xvdi_free_ring(vdp->xdf_xb_ring);
   1618 errout2:
   1619 #ifdef XPV_HVM_DRIVER
   1620 	ec_unbind_evtchn(vdp->xdf_evtchn);
   1621 #else /* !XPV_HVM_DRIVER */
   1622 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
   1623 #endif /* !XPV_HVM_DRIVER */
   1624 errout1:
   1625 	xvdi_free_evtchn(dip);
   1626 	vdp->xdf_evtchn = INVALID_EVTCHN;
   1627 errout:
   1628 	xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
   1629 	cmn_err(CE_WARN, "xdf@%s: failed to start connection to backend",
   1630 	    vdp->xdf_addr);
   1631 	return (DDI_FAILURE);
   1632 }
   1633 
   1634 int
   1635 xdf_get_flush_block(xdf_t *vdp)
   1636 {
   1637 	/*
   1638 	 * Get a DEV_BSIZE aligned bufer
   1639 	 */
   1640 	vdp->xdf_flush_mem = kmem_alloc(vdp->xdf_xdev_secsize * 2, KM_SLEEP);
   1641 	vdp->xdf_cache_flush_block =
   1642 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem),
   1643 	    (int)vdp->xdf_xdev_secsize);
   1644 
   1645 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
   1646 	    xdf_flush_block, vdp->xdf_xdev_secsize, NULL) != 0)
   1647 		return (DDI_FAILURE);
   1648 	return (DDI_SUCCESS);
   1649 }
   1650 
   1651 static void
   1652 xdf_setstate_ready(void *arg)
   1653 {
   1654 	xdf_t	*vdp = (xdf_t *)arg;
   1655 
   1656 	vdp->xdf_ready_tq_thread = curthread;
   1657 
   1658 	/*
   1659 	 * We've created all the minor nodes via cmlb_attach() using default
   1660 	 * value in xdf_attach() to make it possible to block in xdf_open(),
   1661 	 * in case there's anyone (say, booting thread) ever trying to open
   1662 	 * it before connected to backend. We will refresh all those minor
   1663 	 * nodes w/ latest info we've got now when we are almost connected.
   1664 	 */
   1665 	mutex_enter(&vdp->xdf_dev_lk);
   1666 	if (vdp->xdf_cmbl_reattach) {
   1667 		vdp->xdf_cmbl_reattach = B_FALSE;
   1668 
   1669 		mutex_exit(&vdp->xdf_dev_lk);
   1670 		if (xdf_cmlb_attach(vdp) != 0) {
   1671 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
   1672 			return;
   1673 		}
   1674 		mutex_enter(&vdp->xdf_dev_lk);
   1675 	}
   1676 
   1677 	/* If we're not still trying to get to the ready state, then bail. */
   1678 	if (vdp->xdf_state != XD_CONNECTED) {
   1679 		mutex_exit(&vdp->xdf_dev_lk);
   1680 		return;
   1681 	}
   1682 	mutex_exit(&vdp->xdf_dev_lk);
   1683 
   1684 	/*
   1685 	 * If backend has feature-barrier, see if it supports disk
   1686 	 * cache flush op.
   1687 	 */
   1688 	vdp->xdf_flush_supported = B_FALSE;
   1689 	if (vdp->xdf_feature_barrier) {
   1690 		/*
   1691 		 * Pretend we already know flush is supported so probe
   1692 		 * will attempt the correct op.
   1693 		 */
   1694 		vdp->xdf_flush_supported = B_TRUE;
   1695 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
   1696 			vdp->xdf_flush_supported = B_TRUE;
   1697 		} else {
   1698 			vdp->xdf_flush_supported = B_FALSE;
   1699 			/*
   1700 			 * If the other end does not support the cache flush op
   1701 			 * then we must use a barrier-write to force disk
   1702 			 * cache flushing.  Barrier writes require that a data
   1703 			 * block actually be written.
   1704 			 * Cache a block to barrier-write when we are
   1705 			 * asked to perform a flush.
   1706 			 * XXX - would it be better to just copy 1 block
   1707 			 * (512 bytes) from whatever write we did last
   1708 			 * and rewrite that block?
   1709 			 */
   1710 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS) {
   1711 				xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
   1712 				return;
   1713 			}
   1714 		}
   1715 	}
   1716 
   1717 	mutex_enter(&vdp->xdf_cb_lk);
   1718 	mutex_enter(&vdp->xdf_dev_lk);
   1719 	if (vdp->xdf_state == XD_CONNECTED)
   1720 		xdf_set_state(vdp, XD_READY);
   1721 	mutex_exit(&vdp->xdf_dev_lk);
   1722 
   1723 	/* Restart any currently queued up io */
   1724 	xdf_io_start(vdp);
   1725 
   1726 	mutex_exit(&vdp->xdf_cb_lk);
   1727 }
   1728 
   1729 /*
   1730  * synthetic geometry
   1731  */
   1732 #define	XDF_NSECTS	256
   1733 #define	XDF_NHEADS	16
   1734 
   1735 static void
   1736 xdf_synthetic_pgeom(dev_info_t *dip, cmlb_geom_t *geomp)
   1737 {
   1738 	xdf_t *vdp;
   1739 	uint_t ncyl;
   1740 
   1741 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
   1742 
   1743 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
   1744 
   1745 	bzero(geomp, sizeof (*geomp));
   1746 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
   1747 	geomp->g_acyl = 0;
   1748 	geomp->g_nhead = XDF_NHEADS;
   1749 	geomp->g_nsect = XDF_NSECTS;
   1750 	geomp->g_secsize = vdp->xdf_xdev_secsize;
   1751 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
   1752 	geomp->g_intrlv = 0;
   1753 	geomp->g_rpm = 7200;
   1754 }
   1755 
   1756 /*
   1757  * Finish other initialization after we've connected to backend
   1758  * Status should be XD_INIT before calling this routine
   1759  * On success, status should be changed to XD_CONNECTED.
   1760  * On error, status should stay XD_INIT
   1761  */
   1762 static int
   1763 xdf_setstate_connected(xdf_t *vdp)
   1764 {
   1765 	dev_info_t	*dip = vdp->xdf_dip;
   1766 	cmlb_geom_t	pgeom;
   1767 	diskaddr_t	nblocks = 0;
   1768 	uint_t		secsize = 0;
   1769 	char		*oename, *xsname, *str;
   1770 	uint_t		dinfo;
   1771 
   1772 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   1773 	ASSERT(MUTEX_NOT_HELD(&vdp->xdf_dev_lk));
   1774 	ASSERT(vdp->xdf_state == XD_INIT);
   1775 
   1776 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
   1777 	    ((oename = xvdi_get_oename(dip)) == NULL))
   1778 		return (DDI_FAILURE);
   1779 
   1780 	/* Make sure the other end is XenbusStateConnected */
   1781 	if (xenbus_read_driver_state(oename) != XenbusStateConnected)
   1782 		return (DDI_FAILURE);
   1783 
   1784 	/* Determine if feature barrier is supported by backend */
   1785 	if (!(vdp->xdf_feature_barrier = xenbus_exists(oename, XBP_FB)))
   1786 		cmn_err(CE_NOTE, "!xdf@%s: feature-barrier not supported",
   1787 		    vdp->xdf_addr);
   1788 
   1789 	/*
   1790 	 * Probe backend.  Read the device size into xdf_xdev_nblocks
   1791 	 * and set the VDISK_READONLY, VDISK_CDROM, and VDISK_REMOVABLE
   1792 	 * flags in xdf_dinfo.  If the emulated device type is "cdrom",
   1793 	 * we always set VDISK_CDROM, regardless of if it's present in
   1794 	 * the xenbus info parameter.
   1795 	 */
   1796 	if (xenbus_gather(XBT_NULL, oename,
   1797 	    XBP_SECTORS, "%"SCNu64, &nblocks,
   1798 	    XBP_SECTOR_SIZE, "%u", &secsize,
   1799 	    XBP_INFO, "%u", &dinfo,
   1800 	    NULL) != 0) {
   1801 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
   1802 		    "cannot read backend info", vdp->xdf_addr);
   1803 		return (DDI_FAILURE);
   1804 	}
   1805 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
   1806 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
   1807 		    vdp->xdf_addr);
   1808 		return (DDI_FAILURE);
   1809 	}
   1810 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
   1811 		dinfo |= VDISK_CDROM;
   1812 	strfree(str);
   1813 
   1814 	if (secsize == 0 || !(ISP2(secsize / DEV_BSIZE)))
   1815 		secsize = DEV_BSIZE;
   1816 	vdp->xdf_xdev_nblocks = nblocks;
   1817 	vdp->xdf_xdev_secsize = secsize;
   1818 #ifdef _ILP32
   1819 	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
   1820 		cmn_err(CE_WARN, "xdf@%s: xdf_setstate_connected: "
   1821 		    "backend disk device too large with %llu blocks for"
   1822 		    " 32-bit kernel", vdp->xdf_addr, vdp->xdf_xdev_nblocks);
   1823 		xvdi_fatal_error(dip, EFBIG, "reading backend info");
   1824 		return (DDI_FAILURE);
   1825 	}
   1826 #endif
   1827 
   1828 	/*
   1829 	 * If the physical geometry for a fixed disk has been explicity
   1830 	 * set then make sure that the specified physical geometry isn't
   1831 	 * larger than the device we connected to.
   1832 	 */
   1833 	if (vdp->xdf_pgeom_fixed &&
   1834 	    (vdp->xdf_pgeom.g_capacity > vdp->xdf_xdev_nblocks)) {
   1835 		cmn_err(CE_WARN,
   1836 		    "xdf@%s: connect failed, fixed geometry too large",
   1837 		    vdp->xdf_addr);
   1838 		return (DDI_FAILURE);
   1839 	}
   1840 
   1841 	vdp->xdf_media_req_supported = xenbus_exists(oename, XBP_MEDIA_REQ_SUP);
   1842 
   1843 	/* mark vbd is ready for I/O */
   1844 	mutex_enter(&vdp->xdf_dev_lk);
   1845 	xdf_set_state(vdp, XD_CONNECTED);
   1846 
   1847 	/* check if the cmlb label should be updated */
   1848 	xdf_synthetic_pgeom(dip, &pgeom);
   1849 	if ((vdp->xdf_dinfo != dinfo) ||
   1850 	    (!vdp->xdf_pgeom_fixed &&
   1851 	    (memcmp(&vdp->xdf_pgeom, &pgeom, sizeof (pgeom)) != 0))) {
   1852 		vdp->xdf_cmbl_reattach = B_TRUE;
   1853 
   1854 		vdp->xdf_dinfo = dinfo;
   1855 		if (!vdp->xdf_pgeom_fixed)
   1856 			vdp->xdf_pgeom = pgeom;
   1857 	}
   1858 
   1859 	if (XD_IS_CD(vdp) || XD_IS_RM(vdp)) {
   1860 		if (vdp->xdf_xdev_nblocks == 0) {
   1861 			vdp->xdf_mstate = DKIO_EJECTED;
   1862 			cv_broadcast(&vdp->xdf_mstate_cv);
   1863 		} else {
   1864 			vdp->xdf_mstate = DKIO_INSERTED;
   1865 			cv_broadcast(&vdp->xdf_mstate_cv);
   1866 		}
   1867 	} else {
   1868 		if (vdp->xdf_mstate != DKIO_NONE) {
   1869 			vdp->xdf_mstate = DKIO_NONE;
   1870 			cv_broadcast(&vdp->xdf_mstate_cv);
   1871 		}
   1872 	}
   1873 
   1874 	mutex_exit(&vdp->xdf_dev_lk);
   1875 
   1876 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", vdp->xdf_addr,
   1877 	    (uint64_t)vdp->xdf_xdev_nblocks);
   1878 
   1879 	/* Restart any currently queued up io */
   1880 	xdf_io_start(vdp);
   1881 
   1882 	/*
   1883 	 * To get to the ready state we have to do IO to the backend device,
   1884 	 * but we can't initiate IO from the other end change callback thread
   1885 	 * (which is the current context we're executing in.)  This is because
   1886 	 * if the other end disconnects while we're doing IO from the callback
   1887 	 * thread, then we can't recieve that disconnect event and we hang
   1888 	 * waiting for an IO that can never complete.
   1889 	 */
   1890 	(void) ddi_taskq_dispatch(vdp->xdf_ready_tq, xdf_setstate_ready, vdp,
   1891 	    DDI_SLEEP);
   1892 
   1893 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
   1894 	return (DDI_SUCCESS);
   1895 }
   1896 
   1897 /*ARGSUSED*/
   1898 static void
   1899 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
   1900 {
   1901 	XenbusState new_state = *(XenbusState *)impl_data;
   1902 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
   1903 
   1904 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
   1905 	    vdp->xdf_addr, new_state));
   1906 
   1907 	mutex_enter(&vdp->xdf_cb_lk);
   1908 
   1909 	/* We assume that this callback is single threaded */
   1910 	ASSERT(vdp->xdf_oe_change_thread == NULL);
   1911 	DEBUG_EVAL(vdp->xdf_oe_change_thread = curthread);
   1912 
   1913 	/* ignore any backend state changes if we're suspending/suspended */
   1914 	if (vdp->xdf_suspending || (vdp->xdf_state == XD_SUSPEND)) {
   1915 		DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
   1916 		mutex_exit(&vdp->xdf_cb_lk);
   1917 		return;
   1918 	}
   1919 
   1920 	switch (new_state) {
   1921 	case XenbusStateUnknown:
   1922 	case XenbusStateInitialising:
   1923 	case XenbusStateInitWait:
   1924 	case XenbusStateInitialised:
   1925 		if (vdp->xdf_state == XD_INIT)
   1926 			break;
   1927 
   1928 		xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
   1929 		if (xdf_setstate_init(vdp) != DDI_SUCCESS)
   1930 			break;
   1931 		ASSERT(vdp->xdf_state == XD_INIT);
   1932 		break;
   1933 
   1934 	case XenbusStateConnected:
   1935 		if ((vdp->xdf_state == XD_CONNECTED) ||
   1936 		    (vdp->xdf_state == XD_READY))
   1937 			break;
   1938 
   1939 		if (vdp->xdf_state != XD_INIT) {
   1940 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
   1941 			if (xdf_setstate_init(vdp) != DDI_SUCCESS)
   1942 				break;
   1943 			ASSERT(vdp->xdf_state == XD_INIT);
   1944 		}
   1945 
   1946 		if (xdf_setstate_connected(vdp) != DDI_SUCCESS) {
   1947 			xdf_disconnect(vdp, XD_UNKNOWN, B_FALSE);
   1948 			break;
   1949 		}
   1950 		ASSERT(vdp->xdf_state == XD_CONNECTED);
   1951 		break;
   1952 
   1953 	case XenbusStateClosing:
   1954 		if (xdf_isopen(vdp, -1)) {
   1955 			cmn_err(CE_NOTE,
   1956 			    "xdf@%s: hot-unplug failed, still in use",
   1957 			    vdp->xdf_addr);
   1958 			break;
   1959 		}
   1960 		/*FALLTHROUGH*/
   1961 	case XenbusStateClosed:
   1962 		xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
   1963 		break;
   1964 	}
   1965 
   1966 	/* notify anybody waiting for oe state change */
   1967 	cv_broadcast(&vdp->xdf_dev_cv);
   1968 	DEBUG_EVAL(vdp->xdf_oe_change_thread = NULL);
   1969 	mutex_exit(&vdp->xdf_cb_lk);
   1970 }
   1971 
   1972 static int
   1973 xdf_connect_locked(xdf_t *vdp, boolean_t wait)
   1974 {
   1975 	int	rv, timeouts = 0, reset = 20;
   1976 
   1977 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   1978 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
   1979 
   1980 	/* we can't connect once we're in the closed state */
   1981 	if (vdp->xdf_state == XD_CLOSED)
   1982 		return (XD_CLOSED);
   1983 
   1984 	vdp->xdf_connect_req++;
   1985 	while (vdp->xdf_state != XD_READY) {
   1986 		mutex_exit(&vdp->xdf_dev_lk);
   1987 
   1988 		/* only one thread at a time can be the connection thread */
   1989 		if (vdp->xdf_connect_thread == NULL)
   1990 			vdp->xdf_connect_thread = curthread;
   1991 
   1992 		if (vdp->xdf_connect_thread == curthread) {
   1993 			if ((timeouts > 0) && ((timeouts % reset) == 0)) {
   1994 				/*
   1995 				 * If we haven't establised a connection
   1996 				 * within the reset time, then disconnect
   1997 				 * so we can try again, and double the reset
   1998 				 * time.  The reset time starts at 2 sec.
   1999 				 */
   2000 				(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
   2001 				reset *= 2;
   2002 			}
   2003 			if (vdp->xdf_state == XD_UNKNOWN)
   2004 				(void) xdf_setstate_init(vdp);
   2005 			if (vdp->xdf_state == XD_INIT)
   2006 				(void) xdf_setstate_connected(vdp);
   2007 		}
   2008 
   2009 		mutex_enter(&vdp->xdf_dev_lk);
   2010 		if (!wait || (vdp->xdf_state == XD_READY))
   2011 			goto out;
   2012 
   2013 		mutex_exit((&vdp->xdf_cb_lk));
   2014 		if (vdp->xdf_connect_thread != curthread) {
   2015 			rv = cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk);
   2016 		} else {
   2017 			/* delay for 0.1 sec */
   2018 			rv = cv_reltimedwait_sig(&vdp->xdf_dev_cv,
   2019 			    &vdp->xdf_dev_lk, drv_usectohz(100*1000),
   2020 			    TR_CLOCK_TICK);
   2021 			if (rv == -1)
   2022 				timeouts++;
   2023 		}
   2024 		mutex_exit((&vdp->xdf_dev_lk));
   2025 		mutex_enter((&vdp->xdf_cb_lk));
   2026 		mutex_enter((&vdp->xdf_dev_lk));
   2027 		if (rv == 0)
   2028 			goto out;
   2029 	}
   2030 
   2031 out:
   2032 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   2033 	ASSERT(MUTEX_HELD(&vdp->xdf_dev_lk));
   2034 
   2035 	if (vdp->xdf_connect_thread == curthread) {
   2036 		/*
   2037 		 * wake up someone else so they can become the connection
   2038 		 * thread.
   2039 		 */
   2040 		cv_signal(&vdp->xdf_dev_cv);
   2041 		vdp->xdf_connect_thread = NULL;
   2042 	}
   2043 
   2044 	/* Try to lock the media */
   2045 	mutex_exit((&vdp->xdf_dev_lk));
   2046 	(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
   2047 	mutex_enter((&vdp->xdf_dev_lk));
   2048 
   2049 	vdp->xdf_connect_req--;
   2050 	return (vdp->xdf_state);
   2051 }
   2052 
   2053 static uint_t
   2054 xdf_iorestart(caddr_t arg)
   2055 {
   2056 	xdf_t *vdp = (xdf_t *)arg;
   2057 
   2058 	ASSERT(vdp != NULL);
   2059 
   2060 	mutex_enter(&vdp->xdf_dev_lk);
   2061 	ASSERT(ISDMACBON(vdp));
   2062 	SETDMACBOFF(vdp);
   2063 	mutex_exit(&vdp->xdf_dev_lk);
   2064 
   2065 	xdf_io_start(vdp);
   2066 
   2067 	return (DDI_INTR_CLAIMED);
   2068 }
   2069 
   2070 #if defined(XPV_HVM_DRIVER)
   2071 
   2072 typedef struct xdf_hvm_entry {
   2073 	list_node_t	xdf_he_list;
   2074 	char		*xdf_he_path;
   2075 	dev_info_t	*xdf_he_dip;
   2076 } xdf_hvm_entry_t;
   2077 
   2078 static list_t xdf_hvm_list;
   2079 static kmutex_t xdf_hvm_list_lock;
   2080 
   2081 static xdf_hvm_entry_t *
   2082 i_xdf_hvm_find(const char *path, dev_info_t *dip)
   2083 {
   2084 	xdf_hvm_entry_t	*i;
   2085 
   2086 	ASSERT((path != NULL) || (dip != NULL));
   2087 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
   2088 
   2089 	i = list_head(&xdf_hvm_list);
   2090 	while (i != NULL) {
   2091 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
   2092 			i = list_next(&xdf_hvm_list, i);
   2093 			continue;
   2094 		}
   2095 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
   2096 			i = list_next(&xdf_hvm_list, i);
   2097 			continue;
   2098 		}
   2099 		break;
   2100 	}
   2101 	return (i);
   2102 }
   2103 
   2104 dev_info_t *
   2105 xdf_hvm_hold(const char *path)
   2106 {
   2107 	xdf_hvm_entry_t	*i;
   2108 	dev_info_t	*dip;
   2109 
   2110 	mutex_enter(&xdf_hvm_list_lock);
   2111 	i = i_xdf_hvm_find(path, NULL);
   2112 	if (i == NULL) {
   2113 		mutex_exit(&xdf_hvm_list_lock);
   2114 		return (B_FALSE);
   2115 	}
   2116 	ndi_hold_devi(dip = i->xdf_he_dip);
   2117 	mutex_exit(&xdf_hvm_list_lock);
   2118 	return (dip);
   2119 }
   2120 
   2121 static void
   2122 xdf_hvm_add(dev_info_t *dip)
   2123 {
   2124 	xdf_hvm_entry_t	*i;
   2125 	char		*path;
   2126 
   2127 	/* figure out the path for the dip */
   2128 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
   2129 	(void) ddi_pathname(dip, path);
   2130 
   2131 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
   2132 	i->xdf_he_dip = dip;
   2133 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
   2134 
   2135 	mutex_enter(&xdf_hvm_list_lock);
   2136 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
   2137 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
   2138 	list_insert_head(&xdf_hvm_list, i);
   2139 	mutex_exit(&xdf_hvm_list_lock);
   2140 
   2141 	kmem_free(path, MAXPATHLEN);
   2142 }
   2143 
   2144 static void
   2145 xdf_hvm_rm(dev_info_t *dip)
   2146 {
   2147 	xdf_hvm_entry_t	*i;
   2148 
   2149 	mutex_enter(&xdf_hvm_list_lock);
   2150 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
   2151 	list_remove(&xdf_hvm_list, i);
   2152 	mutex_exit(&xdf_hvm_list_lock);
   2153 
   2154 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
   2155 	kmem_free(i, sizeof (*i));
   2156 }
   2157 
   2158 static void
   2159 xdf_hvm_init(void)
   2160 {
   2161 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
   2162 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
   2163 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
   2164 }
   2165 
   2166 static void
   2167 xdf_hvm_fini(void)
   2168 {
   2169 	ASSERT(list_head(&xdf_hvm_list) == NULL);
   2170 	list_destroy(&xdf_hvm_list);
   2171 	mutex_destroy(&xdf_hvm_list_lock);
   2172 }
   2173 
   2174 boolean_t
   2175 xdf_hvm_connect(dev_info_t *dip)
   2176 {
   2177 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
   2178 	char	*oename, *str;
   2179 	int	rv;
   2180 
   2181 	mutex_enter(&vdp->xdf_cb_lk);
   2182 
   2183 	/*
   2184 	 * Before try to establish a connection we need to wait for the
   2185 	 * backend hotplug scripts to have run.  Once they are run the
   2186 	 * "<oename>/hotplug-status" property will be set to "connected".
   2187 	 */
   2188 	for (;;) {
   2189 		ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   2190 
   2191 		/*
   2192 		 * Get the xenbus path to the backend device.  Note that
   2193 		 * we can't cache this path (and we look it up on each pass
   2194 		 * through this loop) because it could change during
   2195 		 * suspend, resume, and migration operations.
   2196 		 */
   2197 		if ((oename = xvdi_get_oename(dip)) == NULL) {
   2198 			mutex_exit(&vdp->xdf_cb_lk);
   2199 			return (B_FALSE);
   2200 		}
   2201 
   2202 		str = NULL;
   2203 		if ((xenbus_read_str(oename, XBP_HP_STATUS, &str) == 0) &&
   2204 		    (strcmp(str, XBV_HP_STATUS_CONN) == 0))
   2205 			break;
   2206 
   2207 		if (str != NULL)
   2208 			strfree(str);
   2209 
   2210 		/* wait for an update to "<oename>/hotplug-status" */
   2211 		if (cv_wait_sig(&vdp->xdf_hp_status_cv, &vdp->xdf_cb_lk) == 0) {
   2212 			/* we got interrupted by a signal */
   2213 			mutex_exit(&vdp->xdf_cb_lk);
   2214 			return (B_FALSE);
   2215 		}
   2216 	}
   2217 
   2218 	/* Good news.  The backend hotplug scripts have been run. */
   2219 	ASSERT(MUTEX_HELD(&vdp->xdf_cb_lk));
   2220 	ASSERT(strcmp(str, XBV_HP_STATUS_CONN) == 0);
   2221 	strfree(str);
   2222 
   2223 	/*
   2224 	 * If we're emulating a cd device and if the backend doesn't support
   2225 	 * media request opreations, then we're not going to bother trying
   2226 	 * to establish a connection for a couple reasons.  First off, media
   2227 	 * requests support is required to support operations like eject and
   2228 	 * media locking.  Second, other backend platforms like Linux don't
   2229 	 * support hvm pv cdrom access.  They don't even have a backend pv
   2230 	 * driver for cdrom device nodes, so we don't want to block forever
   2231 	 * waiting for a connection to a backend driver that doesn't exist.
   2232 	 */
   2233 	if (XD_IS_CD(vdp) && !xenbus_exists(oename, XBP_MEDIA_REQ_SUP)) {
   2234 		mutex_exit(&vdp->xdf_cb_lk);
   2235 		return (B_FALSE);
   2236 	}
   2237 
   2238 	mutex_enter(&vdp->xdf_dev_lk);
   2239 	rv = xdf_connect_locked(vdp, B_TRUE);
   2240 	mutex_exit(&vdp->xdf_dev_lk);
   2241 	mutex_exit(&vdp->xdf_cb_lk);
   2242 
   2243 	return ((rv == XD_READY) ? B_TRUE : B_FALSE);
   2244 }
   2245 
   2246 int
   2247 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
   2248 {
   2249 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
   2250 
   2251 	/* sanity check the requested physical geometry */
   2252 	mutex_enter(&vdp->xdf_dev_lk);
   2253 	if ((geomp->g_secsize != XB_BSIZE) ||
   2254 	    (geomp->g_capacity == 0)) {
   2255 		mutex_exit(&vdp->xdf_dev_lk);
   2256 		return (EINVAL);
   2257 	}
   2258 
   2259 	/*
   2260 	 * If we've already connected to the backend device then make sure
   2261 	 * we're not defining a physical geometry larger than our backend
   2262 	 * device.
   2263 	 */
   2264 	if ((vdp->xdf_xdev_nblocks != 0) &&
   2265 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
   2266 		mutex_exit(&vdp->xdf_dev_lk);
   2267 		return (EINVAL);
   2268 	}
   2269 
   2270 	bzero(&vdp->xdf_pgeom, sizeof (vdp->xdf_pgeom));
   2271 	vdp->xdf_pgeom.g_ncyl = geomp->g_ncyl;
   2272 	vdp->xdf_pgeom.g_acyl = geomp->g_acyl;
   2273 	vdp->xdf_pgeom.g_nhead = geomp->g_nhead;
   2274 	vdp->xdf_pgeom.g_nsect = geomp->g_nsect;
   2275 	vdp->xdf_pgeom.g_secsize = geomp->g_secsize;
   2276 	vdp->xdf_pgeom.g_capacity = geomp->g_capacity;
   2277 	vdp->xdf_pgeom.g_intrlv = geomp->g_intrlv;
   2278 	vdp->xdf_pgeom.g_rpm = geomp->g_rpm;
   2279 
   2280 	vdp->xdf_pgeom_fixed = B_TRUE;
   2281 	mutex_exit(&vdp->xdf_dev_lk);
   2282 
   2283 	/* force a re-validation */
   2284 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
   2285 
   2286 	return (0);
   2287 }
   2288 
   2289 boolean_t
   2290 xdf_is_cd(dev_info_t *dip)
   2291 {
   2292 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
   2293 	boolean_t	rv;
   2294 
   2295 	mutex_enter(&vdp->xdf_cb_lk);
   2296 	rv = XD_IS_CD(vdp);
   2297 	mutex_exit(&vdp->xdf_cb_lk);
   2298 	return (rv);
   2299 }
   2300 
   2301 boolean_t
   2302 xdf_is_rm(dev_info_t *dip)
   2303 {
   2304 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
   2305 	boolean_t	rv;
   2306 
   2307 	mutex_enter(&vdp->xdf_cb_lk);
   2308 	rv = XD_IS_RM(vdp);
   2309 	mutex_exit(&vdp->xdf_cb_lk);
   2310 	return (rv);
   2311 }
   2312 
   2313 boolean_t
   2314 xdf_media_req_supported(dev_info_t *dip)
   2315 {
   2316 	xdf_t		*vdp = (xdf_t *)ddi_get_driver_private(dip);
   2317 	boolean_t	rv;
   2318 
   2319 	mutex_enter(&vdp->xdf_cb_lk);
   2320 	rv = vdp->xdf_media_req_supported;
   2321 	mutex_exit(&vdp->xdf_cb_lk);
   2322 	return (rv);
   2323 }
   2324 
   2325 #endif /* XPV_HVM_DRIVER */
   2326 
   2327 static int
   2328 xdf_lb_getcap(dev_info_t *dip, diskaddr_t *capp)
   2329 {
   2330 	xdf_t *vdp;
   2331 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
   2332 
   2333 	if (vdp == NULL)
   2334 		return (ENXIO);
   2335 
   2336 	mutex_enter(&vdp->xdf_dev_lk);
   2337 	*capp = vdp->xdf_pgeom.g_capacity;
   2338 	DPRINTF(LBL_DBG, ("xdf@%s:capacity %llu\n", vdp->xdf_addr, *capp));
   2339 	mutex_exit(&vdp->xdf_dev_lk);
   2340 	return (0);
   2341 }
   2342 
   2343 static int
   2344 xdf_lb_getpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
   2345 {
   2346 	xdf_t *vdp;
   2347 
   2348 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
   2349 		return (ENXIO);
   2350 	*geomp = vdp->xdf_pgeom;
   2351 	return (0);
   2352 }
   2353 
   2354 /*
   2355  * No real HBA, no geometry available from it
   2356  */
   2357 /*ARGSUSED*/
   2358 static int
   2359 xdf_lb_getvgeom(dev_info_t *dip, cmlb_geom_t *geomp)
   2360 {
   2361 	return (EINVAL);
   2362 }
   2363 
   2364 static int
   2365 xdf_lb_getattribute(dev_info_t *dip, tg_attribute_t *tgattributep)
   2366 {
   2367 	xdf_t *vdp;
   2368 
   2369 	if (!(vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))))
   2370 		return (ENXIO);
   2371 
   2372 	if (XD_IS_RO(vdp))
   2373 		tgattributep->media_is_writable = 0;
   2374 	else
   2375 		tgattributep->media_is_writable = 1;
   2376 	return (0);
   2377 }
   2378 
   2379 /* ARGSUSED3 */
   2380 int
   2381 xdf_lb_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
   2382 {
   2383 	int instance;
   2384 	xdf_t   *vdp;
   2385 
   2386 	instance = ddi_get_instance(dip);
   2387 
   2388 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
   2389 		return (ENXIO);
   2390 
   2391 	switch (cmd) {
   2392 	case TG_GETPHYGEOM:
   2393 		return (xdf_lb_getpgeom(dip, (cmlb_geom_t *)arg));
   2394 	case TG_GETVIRTGEOM:
   2395 		return (xdf_lb_getvgeom(dip, (cmlb_geom_t *)arg));
   2396 	case TG_GETCAPACITY:
   2397 		return (xdf_lb_getcap(dip, (diskaddr_t *)arg));
   2398 	case TG_GETBLOCKSIZE:
   2399 		mutex_enter(&vdp->xdf_cb_lk);
   2400 		*(uint32_t *)arg = vdp->xdf_xdev_secsize;
   2401 		mutex_exit(&vdp->xdf_cb_lk);
   2402 		return (0);
   2403 	case TG_GETATTR:
   2404 		return (xdf_lb_getattribute(dip, (tg_attribute_t *)arg));
   2405 	default:
   2406 		return (ENOTTY);
   2407 	}
   2408 }
   2409 
   2410 /* ARGSUSED5 */
   2411 int
   2412 xdf_lb_rdwr(dev_info_t *dip, uchar_t cmd, void *bufp,
   2413     diskaddr_t start, size_t reqlen, void *tg_cookie)
   2414 {
   2415 	xdf_t *vdp;
   2416 	struct buf *bp;
   2417 	int err = 0;
   2418 
   2419 	vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
   2420 
   2421 	/* We don't allow IO from the oe_change callback thread */
   2422 	ASSERT(curthread != vdp->xdf_oe_change_thread);
   2423 
   2424 	if ((start + ((reqlen / (vdp->xdf_xdev_secsize / DEV_BSIZE))
   2425 	    >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
   2426 		return (EINVAL);
   2427 
   2428 	bp = getrbuf(KM_SLEEP);
   2429 	if (cmd == TG_READ)
   2430 		bp->b_flags = B_BUSY | B_READ;
   2431 	else
   2432 		bp->b_flags = B_BUSY | B_WRITE;
   2433 
   2434 	bp->b_un.b_addr = bufp;
   2435 	bp->b_bcount = reqlen;
   2436 	bp->b_blkno = start * (vdp->xdf_xdev_secsize / DEV_BSIZE);
   2437 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
   2438 
   2439 	mutex_enter(&vdp->xdf_dev_lk);
   2440 	xdf_bp_push(vdp, bp);
   2441 	mutex_exit(&vdp->xdf_dev_lk);
   2442 	xdf_io_start(vdp);
   2443 	if (curthread == vdp->xdf_ready_tq_thread)
   2444 		(void) xdf_ring_drain(vdp);
   2445 	err = biowait(bp);
   2446 	ASSERT(bp->b_flags & B_DONE);
   2447 	freerbuf(bp);
   2448 	return (err);
   2449 }
   2450 
   2451 /*
   2452  * Lock the current media.  Set the media state to "lock".
   2453  * (Media locks are only respected by the backend driver.)
   2454  */
   2455 static int
   2456 xdf_ioctl_mlock(xdf_t *vdp)
   2457 {
   2458 	int rv;
   2459 	mutex_enter(&vdp->xdf_cb_lk);
   2460 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
   2461 	mutex_exit(&vdp->xdf_cb_lk);
   2462 	return (rv);
   2463 }
   2464 
   2465 /*
   2466  * Release a media lock.  Set the media state to "none".
   2467  */
   2468 static int
   2469 xdf_ioctl_munlock(xdf_t *vdp)
   2470 {
   2471 	int rv;
   2472 	mutex_enter(&vdp->xdf_cb_lk);
   2473 	rv = xdf_media_req(vdp, XBV_MEDIA_REQ_NONE, B_TRUE);
   2474 	mutex_exit(&vdp->xdf_cb_lk);
   2475 	return (rv);
   2476 }
   2477 
   2478 /*
   2479  * Eject the current media.  Ignores any media locks.  (Media locks
   2480  * are only for benifit of the the backend.)
   2481  */
   2482 static int
   2483 xdf_ioctl_eject(xdf_t *vdp)
   2484 {
   2485 	int rv;
   2486 
   2487 	mutex_enter(&vdp->xdf_cb_lk);
   2488 	if ((rv = xdf_media_req(vdp, XBV_MEDIA_REQ_EJECT, B_FALSE)) != 0) {
   2489 		mutex_exit(&vdp->xdf_cb_lk);
   2490 		return (rv);
   2491 	}
   2492 
   2493 	/*
   2494 	 * We've set the media requests xenbus parameter to eject, so now
   2495 	 * disconnect from the backend, wait for the backend to clear
   2496 	 * the media requets xenbus paramter, and then we can reconnect
   2497 	 * to the backend.
   2498 	 */
   2499 	(void) xdf_disconnect(vdp, XD_UNKNOWN, B_TRUE);
   2500 	mutex_enter(&vdp->xdf_dev_lk);
   2501 	if (xdf_connect_locked(vdp, B_TRUE) != XD_READY) {
   2502 		mutex_exit(&vdp->xdf_dev_lk);
   2503 		mutex_exit(&vdp->xdf_cb_lk);
   2504 		return (EIO);
   2505 	}
   2506 	mutex_exit(&vdp->xdf_dev_lk);
   2507 	mutex_exit(&vdp->xdf_cb_lk);
   2508 	return (0);
   2509 }
   2510 
   2511 /*
   2512  * Watch for media state changes.  This can be an insertion of a device
   2513  * (triggered by a 'xm block-configure' request in another domain) or
   2514  * the ejection of a device (triggered by a local "eject" operation).
   2515  * For a full description of the DKIOCSTATE ioctl behavior see dkio(7I).
   2516  */
   2517 static int
   2518 xdf_dkstate(xdf_t *vdp, enum dkio_state mstate)
   2519 {
   2520 	enum dkio_state		prev_state;
   2521 
   2522 	mutex_enter(&vdp->xdf_cb_lk);
   2523 	prev_state = vdp->xdf_mstate;
   2524 
   2525 	if (vdp->xdf_mstate == mstate) {
   2526 		while (vdp->xdf_mstate == prev_state) {
   2527 			if (cv_wait_sig(&vdp->xdf_mstate_cv,
   2528 			    &vdp->xdf_cb_lk) == 0) {
   2529 				mutex_exit(&vdp->xdf_cb_lk);
   2530 				return (EINTR);
   2531 			}
   2532 		}
   2533 	}
   2534 
   2535 	if ((prev_state != DKIO_INSERTED) &&
   2536 	    (vdp->xdf_mstate == DKIO_INSERTED)) {
   2537 		(void) xdf_media_req(vdp, XBV_MEDIA_REQ_LOCK, B_TRUE);
   2538 		mutex_exit(&vdp->xdf_cb_lk);
   2539 		return (0);
   2540 	}
   2541 
   2542 	mutex_exit(&vdp->xdf_cb_lk);
   2543 	return (0);
   2544 }
   2545 
   2546 /*ARGSUSED*/
   2547 static int
   2548 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
   2549     int *rvalp)
   2550 {
   2551 	minor_t		minor = getminor(dev);
   2552 	int		part = XDF_PART(minor);
   2553 	xdf_t		*vdp;
   2554 	int		rv;
   2555 
   2556 	if (((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL) ||
   2557 	    (!xdf_isopen(vdp, part)))
   2558 		return (ENXIO);
   2559 
   2560 	DPRINTF(IOCTL_DBG, ("xdf@%s:ioctl: cmd %d (0x%x)\n",
   2561 	    vdp->xdf_addr, cmd, cmd));
   2562 
   2563 	switch (cmd) {
   2564 	default:
   2565 		return (ENOTTY);
   2566 	case DKIOCG_PHYGEOM:
   2567 	case DKIOCG_VIRTGEOM:
   2568 	case DKIOCGGEOM:
   2569 	case DKIOCSGEOM:
   2570 	case DKIOCGAPART:
   2571 	case DKIOCSAPART:
   2572 	case DKIOCGVTOC:
   2573 	case DKIOCSVTOC:
   2574 	case DKIOCPARTINFO:
   2575 	case DKIOCGEXTVTOC:
   2576 	case DKIOCSEXTVTOC:
   2577 	case DKIOCEXTPARTINFO:
   2578 	case DKIOCGMBOOT:
   2579 	case DKIOCSMBOOT:
   2580 	case DKIOCGETEFI:
   2581 	case DKIOCSETEFI:
   2582 	case DKIOCSETEXTPART:
   2583 	case DKIOCPARTITION:
   2584 		return (cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
   2585 		    rvalp, NULL));
   2586 	case FDEJECT:
   2587 	case DKIOCEJECT:
   2588 	case CDROMEJECT:
   2589 		return (xdf_ioctl_eject(vdp));
   2590 	case DKIOCLOCK:
   2591 		return (xdf_ioctl_mlock(vdp));
   2592 	case DKIOCUNLOCK:
   2593 		return (xdf_ioctl_munlock(vdp));
   2594 	case CDROMREADOFFSET: {
   2595 		int offset = 0;
   2596 		if (!XD_IS_CD(vdp))
   2597 			return (ENOTTY);
   2598 		if (ddi_copyout(&offset, (void *)arg, sizeof (int), mode))
   2599 			return (EFAULT);
   2600 		return (0);
   2601 	}
   2602 	case DKIOCGMEDIAINFO: {
   2603 		struct dk_minfo media_info;
   2604 
   2605 		media_info.dki_lbsize = vdp->xdf_xdev_secsize;
   2606 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
   2607 		if (XD_IS_CD(vdp))
   2608 			media_info.dki_media_type = DK_CDROM;
   2609 		else
   2610 			media_info.dki_media_type = DK_FIXED_DISK;
   2611 
   2612 		if (ddi_copyout(&media_info, (void *)arg,
   2613 		    sizeof (struct dk_minfo), mode))
   2614 			return (EFAULT);
   2615 		return (0);
   2616 	}
   2617 	case DKIOCINFO: {
   2618 		struct dk_cinfo info;
   2619 
   2620 		/* controller information */
   2621 		if (XD_IS_CD(vdp))
   2622 			info.dki_ctype = DKC_CDROM;
   2623 		else
   2624 			info.dki_ctype = DKC_VBD;
   2625 
   2626 		info.dki_cnum = 0;
   2627 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
   2628 
   2629 		/* unit information */
   2630 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
   2631 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
   2632 		info.dki_flags = DKI_FMTVOL;
   2633 		info.dki_partition = part;
   2634 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
   2635 		info.dki_addr = 0;
   2636 		info.dki_space = 0;
   2637 		info.dki_prio = 0;
   2638 		info.dki_vec = 0;
   2639 
   2640 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
   2641 			return (EFAULT);
   2642 		return (0);
   2643 	}
   2644 	case DKIOCSTATE: {
   2645 		enum dkio_state mstate;
   2646 
   2647 		if (ddi_copyin((void *)arg, &mstate,
   2648 		    sizeof (mstate), mode) != 0)
   2649 			return (EFAULT);
   2650 		if ((rv = xdf_dkstate(vdp, mstate)) != 0)
   2651 			return (rv);
   2652 		mstate = vdp->xdf_mstate;
   2653 		if (ddi_copyout(&mstate, (void *)arg,
   2654 		    sizeof (mstate), mode) != 0)
   2655 			return (EFAULT);
   2656 		return (0);
   2657 	}
   2658 	case DKIOCREMOVABLE: {
   2659 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
   2660 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (i), mode))
   2661 			return (EFAULT);
   2662 		return (0);
   2663 	}
   2664 	case DKIOCGETWCE: {
   2665 		int i = BOOLEAN2VOID(XD_IS_RM(vdp));
   2666 		if (ddi_copyout(&i, (void *)arg, sizeof (i), mode))
   2667 			return (EFAULT);
   2668 		return (0);
   2669 	}
   2670 	case DKIOCSETWCE: {
   2671 		int i;
   2672 		if (ddi_copyin((void *)arg, &i, sizeof (i), mode))
   2673 			return (EFAULT);
   2674 		vdp->xdf_wce = VOID2BOOLEAN(i);
   2675 		return (0);
   2676 	}
   2677 	case DKIOCFLUSHWRITECACHE: {
   2678 		struct dk_callback *dkc = (struct dk_callback *)arg;
   2679 
   2680 		if (vdp->xdf_flush_supported) {
   2681 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
   2682 			    NULL, 0, 0, (void *)dev);
   2683 		} else if (vdp->xdf_feature_barrier &&
   2684 		    !xdf_barrier_flush_disable) {
   2685 			rv = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
   2686 			    vdp->xdf_cache_flush_block, xdf_flush_block,
   2687 			    vdp->xdf_xdev_secsize, (void *)dev);
   2688 		} else {
   2689 			return (ENOTTY);
   2690 		}
   2691 		if ((mode & FKIOCTL) && (dkc != NULL) &&
   2692 		    (dkc->dkc_callback != NULL)) {
   2693 			(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
   2694 			/* need to return 0 after calling callback */
   2695 			rv = 0;
   2696 		}
   2697 		return (rv);
   2698 	}
   2699 	}
   2700 	/*NOTREACHED*/
   2701 }
   2702 
   2703 static int
   2704 xdf_strategy(struct buf *bp)
   2705 {
   2706 	xdf_t	*vdp;
   2707 	minor_t minor;
   2708 	diskaddr_t p_blkct, p_blkst;
   2709 	daddr_t blkno;
   2710 	ulong_t nblks;
   2711 	int part;
   2712 
   2713 	minor = getminor(bp->b_edev);
   2714 	part = XDF_PART(minor);
   2715 	vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor));
   2716 
   2717 	mutex_enter(&vdp->xdf_dev_lk);
   2718 	if (!xdf_isopen(vdp, part)) {
   2719 		mutex_exit(&vdp->xdf_dev_lk);
   2720 		xdf_io_err(bp, ENXIO, 0);
   2721 		return (0);
   2722 	}
   2723 
   2724 	/* We don't allow IO from the oe_change callback thread */
   2725 	ASSERT(curthread != vdp->xdf_oe_change_thread);
   2726 
   2727 	/* Check for writes to a read only device */
   2728 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
   2729 		mutex_exit(&vdp->xdf_dev_lk);
   2730 		xdf_io_err(bp, EROFS, 0);
   2731 		return (0);
   2732 	}
   2733 
   2734 	/* Check if this I/O is accessing a partition or the entire disk */
   2735 	if ((long)bp->b_private == XB_SLICE_NONE) {
   2736 		/* This I/O is using an absolute offset */
   2737 		p_blkct = vdp->xdf_xdev_nblocks;
   2738 		p_blkst = 0;
   2739 	} else {
   2740 		/* This I/O is using a partition relative offset */
   2741 		mutex_exit(&vdp->xdf_dev_lk);
   2742 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
   2743 		    &p_blkst, NULL, NULL, NULL)) {
   2744 			xdf_io_err(bp, ENXIO, 0);
   2745 			return (0);
   2746 		}
   2747 		mutex_enter(&vdp->xdf_dev_lk);
   2748 	}
   2749 
   2750 	/*
   2751 	 * Adjust the real blkno and bcount according to the underline
   2752 	 * physical sector size.
   2753 	 */
   2754 	blkno = bp->b_blkno / (vdp->xdf_xdev_secsize / XB_BSIZE);
   2755 
   2756 	/* check for a starting block beyond the disk or partition limit */
   2757 	if (blkno > p_blkct) {
   2758 		DPRINTF(IO_DBG, ("xdf@%s: block %lld exceeds VBD size %"PRIu64,
   2759 		    vdp->xdf_addr, (longlong_t)blkno, (uint64_t)p_blkct));
   2760 		mutex_exit(&vdp->xdf_dev_lk);
   2761 		xdf_io_err(bp, EINVAL, 0);
   2762 		return (0);
   2763 	}
   2764 
   2765 	/* Legacy: don't set error flag at this case */
   2766 	if (blkno == p_blkct) {
   2767 		mutex_exit(&vdp->xdf_dev_lk);
   2768 		bp->b_resid = bp->b_bcount;
   2769 		biodone(bp);
   2770 		return (0);
   2771 	}
   2772 
   2773 	/* sanitize the input buf */
   2774 	bioerror(bp, 0);
   2775 	bp->b_resid = 0;
   2776 	bp->av_back = bp->av_forw = NULL;
   2777 
   2778 	/* Adjust for partial transfer, this will result in an error later */
   2779 	if (vdp->xdf_xdev_secsize != 0 &&
   2780 	    vdp->xdf_xdev_secsize != XB_BSIZE) {
   2781 		nblks = bp->b_bcount / vdp->xdf_xdev_secsize;
   2782 	} else {
   2783 		nblks = bp->b_bcount >> XB_BSHIFT;
   2784 	}
   2785 
   2786 	if ((blkno + nblks) > p_blkct) {
   2787 		if (vdp->xdf_xdev_secsize != 0 &&
   2788 		    vdp->xdf_xdev_secsize != XB_BSIZE) {
   2789 			bp->b_resid =
   2790 			    ((blkno + nblks) - p_blkct) *
   2791 			    vdp->xdf_xdev_secsize;
   2792 		} else {
   2793 			bp->b_resid =
   2794 			    ((blkno + nblks) - p_blkct) <<
   2795 			    XB_BSHIFT;
   2796 		}
   2797 		bp->b_bcount -= bp->b_resid;
   2798 	}
   2799 
   2800 	DPRINTF(IO_DBG, ("xdf@%s: strategy blk %lld len %lu\n",
   2801 	    vdp->xdf_addr, (longlong_t)blkno, (ulong_t)bp->b_bcount));
   2802 
   2803 	/* Fix up the buf struct */
   2804 	bp->b_flags |= B_BUSY;
   2805 	bp->b_private = (void *)(uintptr_t)p_blkst;
   2806 
   2807 	xdf_bp_push(vdp, bp);
   2808 	mutex_exit(&vdp->xdf_dev_lk);
   2809 	xdf_io_start(vdp);
   2810 	if (do_polled_io)
   2811 		(void) xdf_ring_drain(vdp);
   2812 	return (0);
   2813 }
   2814 
   2815 /*ARGSUSED*/
   2816 static int
   2817 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
   2818 {
   2819 	xdf_t	*vdp;
   2820 	minor_t minor;
   2821 	diskaddr_t p_blkcnt;
   2822 	int part;
   2823 
   2824 	minor = getminor(dev);
   2825 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
   2826 		return (ENXIO);
   2827 
   2828 	DPRINTF(IO_DBG, ("xdf@%s: read offset 0x%"PRIx64"\n",
   2829 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
   2830 
   2831 	part = XDF_PART(minor);
   2832 	if (!xdf_isopen(vdp, part))
   2833 		return (ENXIO);
   2834 
   2835 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
   2836 	    NULL, NULL, NULL, NULL))
   2837 		return (ENXIO);
   2838 
   2839 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
   2840 		return (ENOSPC);
   2841 
   2842 	if (U_INVAL(uiop))
   2843 		return (EINVAL);
   2844 
   2845 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
   2846 }
   2847 
   2848 /*ARGSUSED*/
   2849 static int
   2850 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
   2851 {
   2852 	xdf_t *vdp;
   2853 	minor_t minor;
   2854 	diskaddr_t p_blkcnt;
   2855 	int part;
   2856 
   2857 	minor = getminor(dev);
   2858 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
   2859 		return (ENXIO);
   2860 
   2861 	DPRINTF(IO_DBG, ("xdf@%s: write offset 0x%"PRIx64"\n",
   2862 	    vdp->xdf_addr, (int64_t)uiop->uio_offset));
   2863 
   2864 	part = XDF_PART(minor);
   2865 	if (!xdf_isopen(vdp, part))
   2866 		return (ENXIO);
   2867 
   2868 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
   2869 	    NULL, NULL, NULL, NULL))
   2870 		return (ENXIO);
   2871 
   2872 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
   2873 		return (ENOSPC);
   2874 
   2875 	if (U_INVAL(uiop))
   2876 		return (EINVAL);
   2877 
   2878 	return (physio(xdf_strategy, NULL, dev, B_WRITE, xdfmin, uiop));
   2879 }
   2880 
   2881 /*ARGSUSED*/
   2882 static int
   2883 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
   2884 {
   2885 	xdf_t	*vdp;
   2886 	minor_t minor;
   2887 	struct uio *uiop = aiop->aio_uio;
   2888 	diskaddr_t p_blkcnt;
   2889 	int part;
   2890 
   2891 	minor = getminor(dev);
   2892 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
   2893 		return (ENXIO);
   2894 
   2895 	part = XDF_PART(minor);
   2896 	if (!xdf_isopen(vdp, part))
   2897 		return (ENXIO);
   2898 
   2899 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
   2900 	    NULL, NULL, NULL, NULL))
   2901 		return (ENXIO);
   2902 
   2903 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
   2904 		return (ENOSPC);
   2905 
   2906 	if (U_INVAL(uiop))
   2907 		return (EINVAL);
   2908 
   2909 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, xdfmin, aiop));
   2910 }
   2911 
   2912 /*ARGSUSED*/
   2913 static int
   2914 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
   2915 {
   2916 	xdf_t *vdp;
   2917 	minor_t minor;
   2918 	struct uio *uiop = aiop->aio_uio;
   2919 	diskaddr_t p_blkcnt;
   2920 	int part;
   2921 
   2922 	minor = getminor(dev);
   2923 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
   2924 		return (ENXIO);
   2925 
   2926 	part = XDF_PART(minor);
   2927 	if (!xdf_isopen(vdp, part))
   2928 		return (ENXIO);
   2929 
   2930 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
   2931 	    NULL, NULL, NULL, NULL))
   2932 		return (ENXIO);
   2933 
   2934 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt, vdp))
   2935 		return (ENOSPC);
   2936 
   2937 	if (U_INVAL(uiop))
   2938 		return (EINVAL);
   2939 
   2940 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, xdfmin, aiop));
   2941 }
   2942 
   2943 static int
   2944 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
   2945 {
   2946 	struct buf dumpbuf, *dbp = &dumpbuf;
   2947 	xdf_t	*vdp;
   2948 	minor_t minor;
   2949 	int err = 0;
   2950 	int part;
   2951 	diskaddr_t p_blkcnt, p_blkst;
   2952 
   2953 	minor = getminor(dev);
   2954 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
   2955 		return (ENXIO);
   2956 
   2957 	DPRINTF(IO_DBG, ("xdf@%s: dump addr (0x%p) blk (%ld) nblks (%d)\n",
   2958 	    vdp->xdf_addr, (void *)addr, blkno, nblk));
   2959 
   2960 	/* We don't allow IO from the oe_change callback thread */
   2961 	ASSERT(curthread != vdp->xdf_oe_change_thread);
   2962 
   2963 	part = XDF_PART(minor);
   2964 	if (!xdf_isopen(vdp, part))
   2965 		return (ENXIO);
   2966 
   2967 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
   2968 	    NULL, NULL, NULL))
   2969 		return (ENXIO);
   2970 
   2971 	if ((blkno + nblk) >
   2972 	    (p_blkcnt * (vdp->xdf_xdev_secsize / XB_BSIZE))) {
   2973 		cmn_err(CE_WARN, "xdf@%s: block %ld exceeds VBD size %"PRIu64,
   2974 		    vdp->xdf_addr, (daddr_t)((blkno + nblk) /
   2975 		    (vdp->xdf_xdev_secsize / XB_BSIZE)), (uint64_t)p_blkcnt);
   2976 		return (EINVAL);
   2977 	}
   2978 
   2979 	bioinit(dbp);
   2980 	dbp->b_flags = B_BUSY;
   2981 	dbp->b_un.b_addr = addr;
   2982 	dbp->b_bcount = nblk << DEV_BSHIFT;
   2983 	dbp->b_blkno = blkno;
   2984 	dbp->b_edev = dev;
   2985 	dbp->b_private = (void *)(uintptr_t)p_blkst;
   2986 
   2987 	mutex_enter(&vdp->xdf_dev_lk);
   2988 	xdf_bp_push(vdp, dbp);
   2989 	mutex_exit(&vdp->xdf_dev_lk);
   2990 	xdf_io_start(vdp);
   2991 	err = xdf_ring_drain(vdp);
   2992 	biofini(dbp);
   2993 	return (err);
   2994 }
   2995 
   2996 /*ARGSUSED*/
   2997 static int
   2998 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
   2999 {
   3000 	minor_t	minor;
   3001 	xdf_t	*vdp;
   3002 	int part;
   3003 	ulong_t parbit;
   3004 
   3005 	minor = getminor(dev);
   3006 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
   3007 		return (ENXIO);
   3008 
   3009 	mutex_enter(&vdp->xdf_dev_lk);
   3010 	part = XDF_PART(minor);
   3011 	if (!xdf_isopen(vdp, part)) {
   3012 		mutex_exit(&vdp->xdf_dev_lk);
   3013 		return (ENXIO);
   3014 	}
   3015 	parbit = 1 << part;
   3016 
   3017 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
   3018 	if (otyp == OTYP_LYR) {
   3019 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
   3020 		if (--vdp->xdf_vd_lyropen[part] == 0)
   3021 			vdp->xdf_vd_open[otyp] &= ~parbit;
   3022 	} else {
   3023 		vdp->xdf_vd_open[otyp] &= ~parbit;
   3024 	}
   3025 	vdp->xdf_vd_exclopen &= ~parbit;
   3026 
   3027 	mutex_exit(&vdp->xdf_dev_lk);
   3028 	return (0);
   3029 }
   3030 
   3031 static int
   3032 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
   3033 {
   3034 	minor_t	minor;
   3035 	xdf_t	*vdp;
   3036 	int part;
   3037 	ulong_t parbit;
   3038 	diskaddr_t p_blkct = 0;
   3039 	boolean_t firstopen;
   3040 	boolean_t nodelay;
   3041 
   3042 	minor = getminor(*devp);
   3043 	if ((vdp = ddi_get_soft_state(xdf_ssp, XDF_INST(minor))) == NULL)
   3044 		return (ENXIO);
   3045 
   3046 	nodelay = (flag & (FNDELAY | FNONBLOCK));
   3047 
   3048 	DPRINTF(DDI_DBG, ("xdf@%s: opening\n", vdp->xdf_addr));
   3049 
   3050 	/* do cv_wait until connected or failed */
   3051 	mutex_enter(&vdp->xdf_cb_lk);
   3052 	mutex_enter(&vdp->xdf_dev_lk);
   3053 	if (!nodelay && (xdf_connect_locked(vdp, B_TRUE) != XD_READY)) {
   3054 		mutex_exit(&vdp->xdf_dev_lk);
   3055 		mutex_exit(&vdp->xdf_cb_lk);
   3056 		return (ENXIO);
   3057 	}
   3058 	mutex_exit(&vdp->xdf_cb_lk);
   3059 
   3060 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
   3061 		mutex_exit(&vdp->xdf_dev_lk);
   3062 		return (EROFS);
   3063 	}
   3064 
   3065 	part = XDF_PART(minor);
   3066 	parbit = 1 << part;
   3067 	if ((vdp->xdf_vd_exclopen & parbit) ||
   3068 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
   3069 		mutex_exit(&vdp->xdf_dev_lk);
   3070 		return (EBUSY);
   3071 	}
   3072 
   3073 	/* are we the first one to open this node? */
   3074 	firstopen = !xdf_isopen(vdp, -1);
   3075 
   3076 	if (otyp == OTYP_LYR)
   3077 		vdp->xdf_vd_lyropen[part]++;
   3078 
   3079 	vdp->xdf_vd_open[otyp] |= parbit;
   3080 
   3081 	if (flag & FEXCL)
   3082 		vdp->xdf_vd_exclopen |= parbit;
   3083 
   3084 	mutex_exit(&vdp->xdf_dev_lk);
   3085 
   3086 	/* force a re-validation */
   3087 	if (firstopen)
   3088 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
   3089 
   3090 	/* If this is a non-blocking open then we're done */
   3091 	if (nodelay)
   3092 		return (0);
   3093 
   3094 	/*
   3095 	 * This is a blocking open, so we require:
   3096 	 * - that the disk have a valid label on it
   3097 	 * - that the size of the partition that we're opening is non-zero
   3098 	 */
   3099 	if ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
   3100 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0)) {
   3101 		(void) xdf_close(*devp, flag, otyp, credp);
   3102 		return (ENXIO);
   3103 	}
   3104 
   3105 	return (0);
   3106 }
   3107 
   3108 /*ARGSUSED*/
   3109 static void
   3110 xdf_watch_hp_status_cb(dev_info_t *dip, const char *path, void *arg)
   3111 {
   3112 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
   3113 	cv_broadcast(&vdp->xdf_hp_status_cv);
   3114 }
   3115 
   3116 static int
   3117 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
   3118 	char *name, caddr_t valuep, int *lengthp)
   3119 {
   3120 	xdf_t	*vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip));
   3121 
   3122 	/*
   3123 	 * Sanity check that if a dev_t or dip were specified that they
   3124 	 * correspond to this device driver.  On debug kernels we'll
   3125 	 * panic and on non-debug kernels we'll return failure.
   3126 	 */
   3127 	ASSERT(ddi_driver_major(dip) == xdf_major);
   3128 	ASSERT((dev == DDI_DEV_T_ANY) || (getmajor(dev) == xdf_major));
   3129 	if ((ddi_driver_major(dip) != xdf_major) ||
   3130 	    ((dev != DDI_DEV_T_ANY) && (getmajor(dev) != xdf_major)))
   3131 		return (DDI_PROP_NOT_FOUND);
   3132 
   3133 	if (vdp == NULL)
   3134 		return (ddi_prop_op(dev, dip, prop_op, flags,
   3135 		    name, valuep, lengthp));
   3136 
   3137 	return (cmlb_prop_op(vdp->xdf_vd_lbl,
   3138 	    dev, dip, prop_op, flags, name, valuep, lengthp,
   3139 	    XDF_PART(getminor(dev)), NULL));
   3140 }
   3141 
   3142 /*ARGSUSED*/
   3143 static int
   3144 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
   3145 {
   3146 	int	instance = XDF_INST(getminor((dev_t)arg));
   3147 	xdf_t	*vbdp;
   3148 
   3149 	switch (cmd) {
   3150 	case DDI_INFO_DEVT2DEVINFO:
   3151 		if ((vbdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL) {
   3152 			*rp = NULL;
   3153 			return (DDI_FAILURE);
   3154 		}
   3155 		*rp = vbdp->xdf_dip;
   3156 		return (DDI_SUCCESS);
   3157 
   3158 	case DDI_INFO_DEVT2INSTANCE:
   3159 		*rp = (void *)(uintptr_t)instance;
   3160 		return (DDI_SUCCESS);
   3161 
   3162 	default:
   3163 		return (DDI_FAILURE);
   3164 	}
   3165 }
   3166 
   3167 /*ARGSUSED*/
   3168 static int
   3169 xdf_resume(dev_info_t *dip)
   3170 {
   3171 	xdf_t	*vdp;
   3172 	char	*oename;
   3173 
   3174 	if ((vdp = ddi_get_soft_state(xdf_ssp, ddi_get_instance(dip))) == NULL)
   3175 		goto err;
   3176 
   3177 	if (xdf_debug & SUSRES_DBG)
   3178 		xen_printf("xdf@%s: xdf_resume\n", vdp->xdf_addr);
   3179 
   3180 	mutex_enter(&vdp->xdf_cb_lk);
   3181 
   3182 	if (xvdi_resume(dip) != DDI_SUCCESS) {
   3183 		mutex_exit(&vdp->xdf_cb_lk);
   3184 		goto err;
   3185 	}
   3186 
   3187 	if (((oename = xvdi_get_oename(dip)) == NULL) ||
   3188 	    (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
   3189 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)) {
   3190 		mutex_exit(&vdp->xdf_cb_lk);
   3191 		goto err;
   3192 	}
   3193 
   3194 	mutex_enter(&vdp->xdf_dev_lk);
   3195 	ASSERT(vdp->xdf_state != XD_READY);
   3196 	xdf_set_state(vdp, XD_UNKNOWN);
   3197 	mutex_exit(&vdp->xdf_dev_lk);
   3198 
   3199 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
   3200 		mutex_exit(&vdp->xdf_cb_lk);
   3201 		goto err;
   3202 	}
   3203 
   3204 	mutex_exit(&vdp->xdf_cb_lk);
   3205 
   3206 	if (xdf_debug & SUSRES_DBG)
   3207 		xen_printf("xdf@%s: xdf_resume: done\n", vdp->xdf_addr);
   3208 	return (DDI_SUCCESS);
   3209 err:
   3210 	if (xdf_debug & SUSRES_DBG)
   3211 		xen_printf("xdf@%s: xdf_resume: fail\n", vdp->xdf_addr);
   3212 	return (DDI_FAILURE);
   3213 }
   3214 
   3215 static int
   3216 xdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
   3217 {
   3218 	int			n, instance = ddi_get_instance(dip);
   3219 	ddi_iblock_cookie_t	ibc, softibc;
   3220 	boolean_t		dev_iscd = B_FALSE;
   3221 	xdf_t			*vdp;
   3222 	char			*oename, *xsname, *str;
   3223 
   3224 	if ((n = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_NOTPROM,
   3225 	    "xdf_debug", 0)) != 0)
   3226 		xdf_debug = n;
   3227 
   3228 	switch (cmd) {
   3229 	case DDI_RESUME:
   3230 		return (xdf_resume(dip));
   3231 	case DDI_ATTACH:
   3232 		break;
   3233 	default:
   3234 		return (DDI_FAILURE);
   3235 	}
   3236 	/* DDI_ATTACH */
   3237 
   3238 	if (((xsname = xvdi_get_xsname(dip)) == NULL) ||
   3239 	    ((oename = xvdi_get_oename(dip)) == NULL))
   3240 		return (DDI_FAILURE);
   3241 
   3242 	/*
   3243 	 * Disable auto-detach.  This is necessary so that we don't get
   3244 	 * detached while we're disconnected from the back end.
   3245 	 */
   3246 	if ((ddi_prop_update_int(DDI_DEV_T_NONE, dip,
   3247 	    DDI_NO_AUTODETACH, 1) != DDI_PROP_SUCCESS))
   3248 		return (DDI_FAILURE);
   3249 
   3250 	/* driver handles kernel-issued IOCTLs */
   3251 	if (ddi_prop_create(DDI_DEV_T_NONE, dip,
   3252 	    DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS)
   3253 		return (DDI_FAILURE);
   3254 
   3255 	if (ddi_get_iblock_cookie(dip, 0, &ibc) != DDI_SUCCESS)
   3256 		return (DDI_FAILURE);
   3257 
   3258 	if (ddi_get_soft_iblock_cookie(dip,
   3259 	    DDI_SOFTINT_LOW, &softibc) != DDI_SUCCESS)
   3260 		return (DDI_FAILURE);
   3261 
   3262 	if (xenbus_read_str(xsname, XBP_DEV_TYPE, &str) != 0) {
   3263 		cmn_err(CE_WARN, "xdf@%s: cannot read device-type",
   3264 		    ddi_get_name_addr(dip));
   3265 		return (DDI_FAILURE);
   3266 	}
   3267 	if (strcmp(str, XBV_DEV_TYPE_CD) == 0)
   3268 		dev_iscd = B_TRUE;
   3269 	strfree(str);
   3270 
   3271 	if (ddi_soft_state_zalloc(xdf_ssp, instance) != DDI_SUCCESS)
   3272 		return (DDI_FAILURE);
   3273 
   3274 	DPRINTF(DDI_DBG, ("xdf@%s: attaching\n", ddi_get_name_addr(dip)));
   3275 	vdp = ddi_get_soft_state(xdf_ssp, instance);
   3276 	ddi_set_driver_private(dip, vdp);
   3277 	vdp->xdf_dip = dip;
   3278 	vdp->xdf_addr = ddi_get_name_addr(dip);
   3279 	vdp->xdf_suspending = B_FALSE;
   3280 	vdp->xdf_media_req_supported = B_FALSE;
   3281 	vdp->xdf_peer = INVALID_DOMID;
   3282 	vdp->xdf_evtchn = INVALID_EVTCHN;
   3283 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
   3284 	    offsetof(v_req_t, v_link));
   3285 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
   3286 	cv_init(&vdp->xdf_hp_status_cv, NULL, CV_DEFAULT, NULL);
   3287 	cv_init(&vdp->xdf_mstate_cv, NULL, CV_DEFAULT, NULL);
   3288 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
   3289 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
   3290 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER, (void *)ibc);
   3291 	vdp->xdf_cmbl_reattach = B_TRUE;
   3292 	if (dev_iscd) {
   3293 		vdp->xdf_dinfo |= VDISK_CDROM;
   3294 		vdp->xdf_mstate = DKIO_EJECTED;
   3295 	} else {
   3296 		vdp->xdf_mstate = DKIO_NONE;
   3297 	}
   3298 
   3299 	if ((vdp->xdf_ready_tq = ddi_taskq_create(dip, "xdf_ready_tq",
   3300 	    1, TASKQ_DEFAULTPRI, 0)) == NULL)
   3301 		goto errout0;
   3302 
   3303 	if (xvdi_add_xb_watch_handler(dip, oename, XBP_HP_STATUS,
   3304 	    xdf_watch_hp_status_cb, NULL) != DDI_SUCCESS)
   3305 		goto errout0;
   3306 
   3307 	if (ddi_add_softintr(dip, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
   3308 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
   3309 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
   3310 		    ddi_get_name_addr(dip));
   3311 		goto errout0;
   3312 	}
   3313 
   3314 	/*
   3315 	 * Initialize the physical geometry stucture.  Note that currently
   3316 	 * we don't know the size of the backend device so the number
   3317 	 * of blocks on the device will be initialized to zero.  Once
   3318 	 * we connect to the backend device we'll update the physical
   3319 	 * geometry to reflect the real size of the device.
   3320 	 */
   3321 	xdf_synthetic_pgeom(dip, &vdp->xdf_pgeom);
   3322 	vdp->xdf_pgeom_fixed = B_FALSE;
   3323 
   3324 	/*
   3325 	 * create default device minor nodes: non-removable disk
   3326 	 * we will adjust minor nodes after we are connected w/ backend
   3327 	 */
   3328 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
   3329 	if (xdf_cmlb_attach(vdp) != 0) {
   3330 		cmn_err(CE_WARN,
   3331 		    "xdf@%s: attach failed, cmlb attach failed",
   3332 		    ddi_get_name_addr(dip));
   3333 		goto errout0;
   3334 	}
   3335 
   3336 	/*
   3337 	 * We ship with cache-enabled disks
   3338 	 */
   3339 	vdp->xdf_wce = B_TRUE;
   3340 
   3341 	mutex_enter(&vdp->xdf_cb_lk);
   3342 	/* Watch backend XenbusState change */
   3343 	if (xvdi_add_event_handler(dip,
   3344 	    XS_OE_STATE, xdf_oe_change, NULL) != DDI_SUCCESS) {
   3345 		mutex_exit(&vdp->xdf_cb_lk);
   3346 		goto errout0;
   3347 	}
   3348 
   3349 	if (xdf_setstate_init(vdp) != DDI_SUCCESS) {
   3350 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
   3351 		    ddi_get_name_addr(dip));
   3352 		mutex_exit(&vdp->xdf_cb_lk);
   3353 		goto errout1;
   3354 	}
   3355 	mutex_exit(&vdp->xdf_cb_lk);
   3356 
   3357 #if defined(XPV_HVM_DRIVER)
   3358 
   3359 	xdf_hvm_add(dip);
   3360 
   3361 	/* Report our version to dom0.  */
   3362 	if (xenbus_printf(XBT_NULL, "guest/xdf", "version", "%d",
   3363 	    HVMPV_XDF_VERS))
   3364 		cmn_err(CE_WARN, "xdf: couldn't write version\n");
   3365 
   3366 #else /* !XPV_HVM_DRIVER */
   3367 
   3368 	/* create kstat for iostat(1M) */
   3369 	if (xdf_kstat_create(dip, "xdf", instance) != 0) {
   3370 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
   3371 		    ddi_get_name_addr(dip));
   3372 		goto errout1;
   3373 	}
   3374 
   3375 #endif /* !XPV_HVM_DRIVER */
   3376 
   3377 	ddi_report_dev(dip);
   3378 	DPRINTF(DDI_DBG, ("xdf@%s: attached\n", vdp->xdf_addr));
   3379 	return (DDI_SUCCESS);
   3380 
   3381 errout1:
   3382 	(void) xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed);
   3383 	xvdi_remove_event_handler(dip, XS_OE_STATE);
   3384 errout0:
   3385 	if (vdp->xdf_vd_lbl != NULL) {
   3386 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
   3387 		cmlb_free_handle(&vdp->xdf_vd_lbl);
   3388 		vdp->xdf_vd_lbl = NULL;
   3389 	}
   3390 	if (vdp->xdf_softintr_id != NULL)
   3391 		ddi_remove_softintr(vdp->xdf_softintr_id);
   3392 	xvdi_remove_xb_watch_handlers(dip);
   3393 	if (vdp->xdf_ready_tq != NULL)
   3394 		ddi_taskq_destroy(vdp->xdf_ready_tq);
   3395 	mutex_destroy(&vdp->xdf_cb_lk);
   3396 	mutex_destroy(&vdp->xdf_dev_lk);
   3397 	cv_destroy(&vdp->xdf_dev_cv);
   3398 	cv_destroy(&vdp->xdf_hp_status_cv);
   3399 	ddi_soft_state_free(xdf_ssp, instance);
   3400 	ddi_set_driver_private(dip, NULL);
   3401 	ddi_prop_remove_all(dip);
   3402 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(dip));
   3403 	return (DDI_FAILURE);
   3404 }
   3405 
   3406 static int
   3407 xdf_suspend(dev_info_t *dip)
   3408 {
   3409 	int		instance = ddi_get_instance(dip);
   3410 	xdf_t		*vdp;
   3411 
   3412 	if ((vdp = ddi_get_soft_state(xdf_ssp, instance)) == NULL)
   3413 		return (DDI_FAILURE);
   3414 
   3415 	if (xdf_debug & SUSRES_DBG)
   3416 		xen_printf("xdf@%s: xdf_suspend\n", vdp->xdf_addr);
   3417 
   3418 	xvdi_suspend(dip);
   3419 
   3420 	mutex_enter(&vdp->xdf_cb_lk);
   3421 	mutex_enter(&vdp->xdf_dev_lk);
   3422 
   3423 	vdp->xdf_suspending = B_TRUE;
   3424 	xdf_ring_destroy(vdp);
   3425 	xdf_set_state(vdp, XD_SUSPEND);
   3426 	vdp->xdf_suspending = B_FALSE;
   3427 
   3428 	mutex_exit(&vdp->xdf_dev_lk);
   3429 	mutex_exit(&vdp->xdf_cb_lk);
   3430 
   3431 	if (xdf_debug & SUSRES_DBG)
   3432 		xen_printf("xdf@%s: xdf_suspend: done\n", vdp->xdf_addr);
   3433 
   3434 	return (DDI_SUCCESS);
   3435 }
   3436 
   3437 static int
   3438 xdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
   3439 {
   3440 	xdf_t *vdp;
   3441 	int instance;
   3442 
   3443 	switch (cmd) {
   3444 
   3445 	case DDI_PM_SUSPEND:
   3446 		break;
   3447 
   3448 	case DDI_SUSPEND:
   3449 		return (xdf_suspend(dip));
   3450 
   3451 	case DDI_DETACH:
   3452 		break;
   3453 
   3454 	default:
   3455 		return (DDI_FAILURE);
   3456 	}
   3457 
   3458 	instance = ddi_get_instance(dip);
   3459 	DPRINTF(DDI_DBG, ("xdf@%s: detaching\n", ddi_get_name_addr(dip)));
   3460 	vdp = ddi_get_soft_state(xdf_ssp, instance);
   3461 
   3462 	if (vdp == NULL)
   3463 		return (DDI_FAILURE);
   3464 
   3465 	mutex_enter(&vdp->xdf_cb_lk);
   3466 	xdf_disconnect(vdp, XD_CLOSED, B_FALSE);
   3467 	if (vdp->xdf_state != XD_CLOSED) {
   3468 		mutex_exit(&vdp->xdf_cb_lk);
   3469 		return (DDI_FAILURE);
   3470 	}
   3471 	mutex_exit(&vdp->xdf_cb_lk);
   3472 
   3473 	ASSERT(!ISDMACBON(vdp));
   3474 
   3475 #if defined(XPV_HVM_DRIVER)
   3476 	xdf_hvm_rm(dip);
   3477 #endif /* XPV_HVM_DRIVER */
   3478 
   3479 	if (vdp->xdf_timeout_id != 0)
   3480 		(void) untimeout(vdp->xdf_timeout_id);
   3481 
   3482 	xvdi_remove_event_handler(dip, XS_OE_STATE);
   3483 	ddi_taskq_destroy(vdp->xdf_ready_tq);
   3484 
   3485 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
   3486 	cmlb_free_handle(&vdp->xdf_vd_lbl);
   3487 
   3488 	/* we'll support backend running in domU later */
   3489 #ifdef	DOMU_BACKEND
   3490 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
   3491 #endif
   3492 
   3493 	list_destroy(&vdp->xdf_vreq_act);
   3494 	ddi_prop_remove_all(dip);
   3495 	xdf_kstat_delete(dip);
   3496 	ddi_remove_softintr(vdp->xdf_softintr_id);
   3497 	xvdi_remove_xb_watch_handlers(dip);
   3498 	ddi_set_driver_private(dip, NULL);
   3499 	cv_destroy(&vdp->xdf_dev_cv);
   3500 	mutex_destroy(&vdp->xdf_cb_lk);
   3501 	mutex_destroy(&vdp->xdf_dev_lk);
   3502 	if (vdp->xdf_cache_flush_block != NULL)
   3503 		kmem_free(vdp->xdf_flush_mem, 2 * vdp->xdf_xdev_secsize);
   3504 	ddi_soft_state_free(xdf_ssp, instance);
   3505 	return (DDI_SUCCESS);
   3506 }
   3507 
   3508 /*
   3509  * Driver linkage structures.
   3510  */
   3511 static struct cb_ops xdf_cbops = {
   3512 	xdf_open,
   3513 	xdf_close,
   3514 	xdf_strategy,
   3515 	nodev,
   3516 	xdf_dump,
   3517 	xdf_read,
   3518 	xdf_write,
   3519 	xdf_ioctl,
   3520 	nodev,
   3521 	nodev,
   3522 	nodev,
   3523 	nochpoll,
   3524 	xdf_prop_op,
   3525 	NULL,
   3526 	D_MP | D_NEW | D_64BIT,
   3527 	CB_REV,
   3528 	xdf_aread,
   3529 	xdf_awrite
   3530 };
   3531 
   3532 struct dev_ops xdf_devops = {
   3533 	DEVO_REV,		/* devo_rev */
   3534 	0,			/* devo_refcnt */
   3535 	xdf_getinfo,		/* devo_getinfo */
   3536 	nulldev,		/* devo_identify */
   3537 	nulldev,		/* devo_probe */
   3538 	xdf_attach,		/* devo_attach */
   3539 	xdf_detach,		/* devo_detach */
   3540 	nodev,			/* devo_reset */
   3541 	&xdf_cbops,		/* devo_cb_ops */
   3542 	NULL,			/* devo_bus_ops */
   3543 	NULL,			/* devo_power */
   3544 	ddi_quiesce_not_supported, /* devo_quiesce */
   3545 };
   3546 
   3547 /*
   3548  * Module linkage structures.
   3549  */
   3550 static struct modldrv modldrv = {
   3551 	&mod_driverops,		/* Type of module.  This one is a driver */
   3552 	"virtual block driver",	/* short description */
   3553 	&xdf_devops		/* driver specific ops */
   3554 };
   3555 
   3556 static struct modlinkage xdf_modlinkage = {
   3557 	MODREV_1, (void *)&modldrv, NULL
   3558 };
   3559 
   3560 /*
   3561  * standard module entry points
   3562  */
   3563 int
   3564 _init(void)
   3565 {
   3566 	int rc;
   3567 
   3568 	xdf_major = ddi_name_to_major("xdf");
   3569 	if (xdf_major == (major_t)-1)
   3570 		return (EINVAL);
   3571 
   3572 	if ((rc = ddi_soft_state_init(&xdf_ssp, sizeof (xdf_t), 0)) != 0)
   3573 		return (rc);
   3574 
   3575 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
   3576 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
   3577 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
   3578 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
   3579 
   3580 #if defined(XPV_HVM_DRIVER)
   3581 	xdf_hvm_init();
   3582 #endif /* XPV_HVM_DRIVER */
   3583 
   3584 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
   3585 #if defined(XPV_HVM_DRIVER)
   3586 		xdf_hvm_fini();
   3587 #endif /* XPV_HVM_DRIVER */
   3588 		kmem_cache_destroy(xdf_vreq_cache);
   3589 		kmem_cache_destroy(xdf_gs_cache);
   3590 		ddi_soft_state_fini(&xdf_ssp);
   3591 		return (rc);
   3592 	}
   3593 
   3594 	return (rc);
   3595 }
   3596 
   3597 int
   3598 _fini(void)
   3599 {
   3600 	int err;
   3601 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
   3602 		return (err);
   3603 
   3604 #if defined(XPV_HVM_DRIVER)
   3605 	xdf_hvm_fini();
   3606 #endif /* XPV_HVM_DRIVER */
   3607 
   3608 	kmem_cache_destroy(xdf_vreq_cache);
   3609 	kmem_cache_destroy(xdf_gs_cache);
   3610 	ddi_soft_state_fini(&xdf_ssp);
   3611 
   3612 	return (0);
   3613 }
   3614 
   3615 int
   3616 _info(struct modinfo *modinfop)
   3617 {
   3618 	return (mod_info(&xdf_modlinkage, modinfop));
   3619 }
   3620