Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Virtual disk server
     29  */
     30 
     31 
     32 #include <sys/types.h>
     33 #include <sys/conf.h>
     34 #include <sys/crc32.h>
     35 #include <sys/ddi.h>
     36 #include <sys/dkio.h>
     37 #include <sys/file.h>
     38 #include <sys/fs/hsfs_isospec.h>
     39 #include <sys/mdeg.h>
     40 #include <sys/mhd.h>
     41 #include <sys/modhash.h>
     42 #include <sys/note.h>
     43 #include <sys/pathname.h>
     44 #include <sys/sdt.h>
     45 #include <sys/sunddi.h>
     46 #include <sys/sunldi.h>
     47 #include <sys/sysmacros.h>
     48 #include <sys/vio_common.h>
     49 #include <sys/vio_util.h>
     50 #include <sys/vdsk_mailbox.h>
     51 #include <sys/vdsk_common.h>
     52 #include <sys/vtoc.h>
     53 #include <sys/vfs.h>
     54 #include <sys/stat.h>
     55 #include <sys/scsi/impl/uscsi.h>
     56 #include <sys/ontrap.h>
     57 #include <vm/seg_map.h>
     58 
     59 #define	ONE_MEGABYTE	(1ULL << 20)
     60 #define	ONE_GIGABYTE	(1ULL << 30)
     61 #define	ONE_TERABYTE	(1ULL << 40)
     62 
     63 /* Virtual disk server initialization flags */
     64 #define	VDS_LDI			0x01
     65 #define	VDS_MDEG		0x02
     66 
     67 /* Virtual disk server tunable parameters */
     68 #define	VDS_RETRIES		5
     69 #define	VDS_LDC_DELAY		1000 /* 1 msecs */
     70 #define	VDS_DEV_DELAY		10000000 /* 10 secs */
     71 #define	VDS_NCHAINS		32
     72 
     73 /* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
     74 #define	VDS_NAME		"virtual-disk-server"
     75 
     76 #define	VD_NAME			"vd"
     77 #define	VD_VOLUME_NAME		"vdisk"
     78 #define	VD_ASCIILABEL		"Virtual Disk"
     79 
     80 #define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
     81 #define	VD_ID_PROP		"id"
     82 #define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
     83 #define	VD_BLOCK_DEVICE_OPTS	"vds-block-device-opts"
     84 #define	VD_REG_PROP		"reg"
     85 
     86 /* Virtual disk initialization flags */
     87 #define	VD_DISK_READY		0x01
     88 #define	VD_LOCKING		0x02
     89 #define	VD_LDC			0x04
     90 #define	VD_DRING		0x08
     91 #define	VD_SID			0x10
     92 #define	VD_SEQ_NUM		0x20
     93 #define	VD_SETUP_ERROR		0x40
     94 
     95 /* Number of backup labels */
     96 #define	VD_DSKIMG_NUM_BACKUP	5
     97 
     98 /* Timeout for SCSI I/O */
     99 #define	VD_SCSI_RDWR_TIMEOUT	30	/* 30 secs */
    100 
    101 /*
    102  * Default number of threads for the I/O queue. In many cases, we will not
    103  * receive more than 8 I/O requests at the same time. However there are
    104  * cases (for example during the OS installation) where we can have a lot
    105  * more (up to the limit of the DRing size).
    106  */
    107 #define	VD_IOQ_NTHREADS		8
    108 
    109 /* Maximum number of logical partitions */
    110 #define	VD_MAXPART	(NDKMAP + 1)
    111 
    112 /*
    113  * By Solaris convention, slice/partition 2 represents the entire disk;
    114  * unfortunately, this convention does not appear to be codified.
    115  */
    116 #define	VD_ENTIRE_DISK_SLICE	2
    117 
    118 /* Logical block address for EFI */
    119 #define	VD_EFI_LBA_GPT		1	/* LBA of the GPT */
    120 #define	VD_EFI_LBA_GPE		2	/* LBA of the GPE */
    121 
    122 #define	VD_EFI_DEV_SET(dev, vdsk, ioctl)	\
    123 	VDSK_EFI_DEV_SET(dev, vdsk, ioctl,	\
    124 	    (vdsk)->vdisk_bsize, (vdsk)->vdisk_size)
    125 
/*
 * Flags defining the behavior for flushing asynchronous writes used to
 * perform some write I/O requests.
 *
 * The flag VD_AWFLUSH_IMMEDIATE enables immediate flushing of asynchronous
 * writes. This ensures that data are committed to the backend when the I/O
 * request reply is sent to the guest domain, which prevents any data from
 * being lost in case a service domain unexpectedly crashes.
 *
 * The flag VD_AWFLUSH_DEFER indicates that flushing is deferred to another
 * thread while the request is immediately marked as completed. In that case,
 * a guest domain can receive a reply that its write request is completed
 * while data haven't been flushed to disk yet.
 *
 * The flag VD_AWFLUSH_GROUP indicates that requests are grouped before
 * they are flushed.
 *
 * Flags VD_AWFLUSH_IMMEDIATE and VD_AWFLUSH_DEFER are mutually exclusive.
 */
#define	VD_AWFLUSH_IMMEDIATE	0x01	/* immediate flushing */
#define	VD_AWFLUSH_DEFER	0x02	/* defer flushing */
#define	VD_AWFLUSH_GROUP	0x04	/* group requests before flushing */
    145 
/*
 * Driver types: classification of the kind of device a driver represents.
 * This is the type stored in each vd_driver_type_t entry; the zero value
 * (VD_DRIVER_UNKNOWN) means the driver type is not known.
 */
typedef enum vd_driver {
	VD_DRIVER_UNKNOWN = 0,	/* driver type unknown  */
	VD_DRIVER_DISK,		/* disk driver */
	VD_DRIVER_VOLUME	/* volume driver */
} vd_driver_t;
    152 
    153 #define	VD_DRIVER_NAME_LEN	64
    154 
    155 #define	VDS_NUM_DRIVERS	(sizeof (vds_driver_types) / sizeof (vd_driver_type_t))
    156 
/*
 * Association between a driver name and the type of device it represents.
 * The name is stored inline in a buffer of VD_DRIVER_NAME_LEN bytes.
 * This is the element type of the built-in vds_driver_types[] table and of
 * the extra driver types kept in the vds soft state (vds_t.driver_types).
 */
typedef struct vd_driver_type {
	char name[VD_DRIVER_NAME_LEN];	/* driver name */
	vd_driver_t type;		/* driver type (disk or volume) */
} vd_driver_type_t;
    161 
    162 /*
    163  * There is no reliable way to determine if a device is representing a disk
    164  * or a volume, especially with pseudo devices. So we maintain a list of well
    165  * known drivers and the type of device they represent (either a disk or a
    166  * volume).
    167  *
    168  * The list can be extended by adding a "driver-type-list" entry in vds.conf
    169  * with the following syntax:
    170  *
    171  * 	driver-type-list="<driver>:<type>", ... ,"<driver>:<type>";
    172  *
    173  * Where:
    174  *	<driver> is the name of a driver (limited to 64 characters)
    175  *	<type> is either the string "disk" or "volume"
    176  *
    177  * Invalid entries in "driver-type-list" will be ignored.
    178  *
    179  * For example, the following line in vds.conf:
    180  *
    181  * 	driver-type-list="foo:disk","bar:volume";
    182  *
    183  * defines that "foo" is a disk driver, and driver "bar" is a volume driver.
    184  *
 * When a list is defined in vds.conf, it is checked before the built-in list
 * (vds_driver_types[]) so that any definition from the built-in list can be
 * overridden using vds.conf.
    188  */
    189 vd_driver_type_t vds_driver_types[] = {
    190 	{ "dad",	VD_DRIVER_DISK },	/* Solaris */
    191 	{ "did",	VD_DRIVER_DISK },	/* Sun Cluster */
    192 	{ "dlmfdrv",	VD_DRIVER_DISK },	/* Hitachi HDLM */
    193 	{ "emcp",	VD_DRIVER_DISK },	/* EMC Powerpath */
    194 	{ "lofi",	VD_DRIVER_VOLUME },	/* Solaris */
    195 	{ "md",		VD_DRIVER_VOLUME },	/* Solaris - SVM */
    196 	{ "sd",		VD_DRIVER_DISK },	/* Solaris */
    197 	{ "ssd",	VD_DRIVER_DISK },	/* Solaris */
    198 	{ "vdc",	VD_DRIVER_DISK },	/* Solaris */
    199 	{ "vxdmp",	VD_DRIVER_DISK },	/* Veritas */
    200 	{ "vxio",	VD_DRIVER_VOLUME },	/* Veritas - VxVM */
    201 	{ "zfs",	VD_DRIVER_VOLUME }	/* Solaris */
    202 };
    203 
    204 /* Return a cpp token as a string */
    205 #define	STRINGIZE(token)	#token
    206 
    207 /*
    208  * Print a message prefixed with the current function name to the message log
    209  * (and optionally to the console for verbose boots); these macros use cpp's
    210  * concatenation of string literals and C99 variable-length-argument-list
    211  * macros
    212  */
    213 #define	PRN(...)	_PRN("?%s():  "__VA_ARGS__, "")
    214 #define	_PRN(format, ...)					\
    215 	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
    216 
/*
 * Return a pointer to the "i"th vdisk dring element.
 *
 * The address is computed as vd->dring plus "i" times vd->descriptor_size.
 * NOTE: the macro takes no vd argument; it refers to a variable literally
 * named "vd", which must therefore be in scope wherever the macro is used.
 */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))
    220 
    221 /* Return the virtual disk client's type as a string (for use in messages) */
    222 #define	VD_CLIENT(vd)							\
    223 	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
    224 	    (((vd)->xfer_mode == VIO_DRING_MODE_V1_0) ? "dring client" :    \
    225 		(((vd)->xfer_mode == 0) ? "null client" :		\
    226 		    "unsupported client")))
    227 
    228 /* Read disk label from a disk image */
    229 #define	VD_DSKIMG_LABEL_READ(vd, labelp) \
    230 	vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \
    231 	    0, sizeof (struct dk_label))
    232 
    233 /* Write disk label to a disk image */
    234 #define	VD_DSKIMG_LABEL_WRITE(vd, labelp)	\
    235 	vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \
    236 	    0, sizeof (struct dk_label))
    237 
    238 /* Identify if a backend is a disk image */
    239 #define	VD_DSKIMG(vd)	((vd)->vdisk_type == VD_DISK_TYPE_DISK &&	\
    240 	((vd)->file || (vd)->volume))
    241 
    242 /* Next index in a write queue */
    243 #define	VD_WRITE_INDEX_NEXT(vd, id)		\
    244 	((((id) + 1) >= vd->dring_len)? 0 : (id) + 1)
    245 
    246 /* Message for disk access rights reset failure */
    247 #define	VD_RESET_ACCESS_FAILURE_MSG \
    248 	"Fail to reset disk access rights for disk %s"
    249 
    250 /*
    251  * Specification of an MD node passed to the MDEG to filter any
    252  * 'vport' nodes that do not belong to the specified node. This
    253  * template is copied for each vds instance and filled in with
    254  * the appropriate 'cfg-handle' value before being passed to the MDEG.
    255  */
    256 static mdeg_prop_spec_t	vds_prop_template[] = {
    257 	{ MDET_PROP_STR,	"name",		VDS_NAME },
    258 	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
    259 	{ MDET_LIST_END,	NULL, 		NULL }
    260 };
    261 
/*
 * Store "val" in the ps_val member of element 1 of the property spec array
 * "specp". In vds_prop_template[], element 1 is the "cfg-handle" entry.
 *
 * NOTE(review): the index is hard-coded, so it must stay in sync with the
 * layout of vds_prop_template[]. The expansion also already ends with a
 * semicolon, so "VDS_SET_MDEG_PROP_INST(a, b);" yields an extra empty
 * statement; avoid using it as the sole body of an if that has an else.
 */
#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);
    263 
    264 /*
    265  * Matching criteria passed to the MDEG to register interest
    266  * in changes to 'virtual-device-port' nodes identified by their
    267  * 'id' property.
    268  */
    269 static md_prop_match_t	vd_prop_match[] = {
    270 	{ MDET_PROP_VAL,	VD_ID_PROP },
    271 	{ MDET_LIST_END,	NULL }
    272 };
    273 
    274 static mdeg_node_match_t vd_match = {"virtual-device-port",
    275 				    vd_prop_match};
    276 
    277 /*
    278  * Options for the VD_BLOCK_DEVICE_OPTS property.
    279  */
    280 #define	VD_OPT_RDONLY		0x1	/* read-only  */
    281 #define	VD_OPT_SLICE		0x2	/* single slice */
    282 #define	VD_OPT_EXCLUSIVE	0x4	/* exclusive access */
    283 
    284 #define	VD_OPTION_NLEN	128
    285 
    286 typedef struct vd_option {
    287 	char vdo_name[VD_OPTION_NLEN];
    288 	uint64_t vdo_value;
    289 } vd_option_t;
    290 
    291 vd_option_t vd_bdev_options[] = {
    292 	{ "ro",		VD_OPT_RDONLY },
    293 	{ "slice", 	VD_OPT_SLICE },
    294 	{ "excl",	VD_OPT_EXCLUSIVE }
    295 };
    296 
    297 /* Debugging macros */
    298 #ifdef DEBUG
    299 
    300 static int	vd_msglevel = 0;
    301 
    302 #define	PR0 if (vd_msglevel > 0)	PRN
    303 #define	PR1 if (vd_msglevel > 1)	PRN
    304 #define	PR2 if (vd_msglevel > 2)	PRN
    305 
    306 #define	VD_DUMP_DRING_ELEM(elem)					\
    307 	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
    308 	    elem->hdr.dstate,						\
    309 	    elem->payload.operation,					\
    310 	    elem->payload.status,					\
    311 	    elem->payload.nbytes,					\
    312 	    elem->payload.addr,						\
    313 	    elem->payload.ncookies);
    314 
    315 char *
    316 vd_decode_state(int state)
    317 {
    318 	char *str;
    319 
    320 #define	CASE_STATE(_s)	case _s: str = #_s; break;
    321 
    322 	switch (state) {
    323 	CASE_STATE(VD_STATE_INIT)
    324 	CASE_STATE(VD_STATE_VER)
    325 	CASE_STATE(VD_STATE_ATTR)
    326 	CASE_STATE(VD_STATE_DRING)
    327 	CASE_STATE(VD_STATE_RDX)
    328 	CASE_STATE(VD_STATE_DATA)
    329 	default: str = "unknown"; break;
    330 	}
    331 
    332 #undef CASE_STATE
    333 
    334 	return (str);
    335 }
    336 
    337 void
    338 vd_decode_tag(vio_msg_t *msg)
    339 {
    340 	char *tstr, *sstr, *estr;
    341 
    342 #define	CASE_TYPE(_s)	case _s: tstr = #_s; break;
    343 
    344 	switch (msg->tag.vio_msgtype) {
    345 	CASE_TYPE(VIO_TYPE_CTRL)
    346 	CASE_TYPE(VIO_TYPE_DATA)
    347 	CASE_TYPE(VIO_TYPE_ERR)
    348 	default: tstr = "unknown"; break;
    349 	}
    350 
    351 #undef CASE_TYPE
    352 
    353 #define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;
    354 
    355 	switch (msg->tag.vio_subtype) {
    356 	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
    357 	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
    358 	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
    359 	default: sstr = "unknown"; break;
    360 	}
    361 
    362 #undef CASE_SUBTYPE
    363 
    364 #define	CASE_ENV(_s)	case _s: estr = #_s; break;
    365 
    366 	switch (msg->tag.vio_subtype_env) {
    367 	CASE_ENV(VIO_VER_INFO)
    368 	CASE_ENV(VIO_ATTR_INFO)
    369 	CASE_ENV(VIO_DRING_REG)
    370 	CASE_ENV(VIO_DRING_UNREG)
    371 	CASE_ENV(VIO_RDX)
    372 	CASE_ENV(VIO_PKT_DATA)
    373 	CASE_ENV(VIO_DESC_DATA)
    374 	CASE_ENV(VIO_DRING_DATA)
    375 	default: estr = "unknown"; break;
    376 	}
    377 
    378 #undef CASE_ENV
    379 
    380 	PR1("(%x/%x/%x) message : (%s/%s/%s)",
    381 	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
    382 	    msg->tag.vio_subtype_env, tstr, sstr, estr);
    383 }
    384 
    385 #else	/* !DEBUG */
    386 
    387 #define	PR0(...)
    388 #define	PR1(...)
    389 #define	PR2(...)
    390 
    391 #define	VD_DUMP_DRING_ELEM(elem)
    392 
    393 #define	vd_decode_state(_s)	(NULL)
    394 #define	vd_decode_tag(_s)	(NULL)
    395 
    396 #endif	/* DEBUG */
    397 
    398 
    399 /*
    400  * Soft state structure for a vds instance
    401  */
    402 typedef struct vds {
    403 	uint_t		initialized;	/* driver inst initialization flags */
    404 	dev_info_t	*dip;		/* driver inst devinfo pointer */
    405 	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
    406 	mod_hash_t	*vd_table;	/* table of virtual disks served */
    407 	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
    408 	mdeg_handle_t	mdeg;		/* handle for MDEG operations  */
    409 	vd_driver_type_t *driver_types;	/* extra driver types (from vds.conf) */
    410 	int 		num_drivers;	/* num of extra driver types */
    411 } vds_t;
    412 
    413 /*
    414  * Types of descriptor-processing tasks
    415  */
    416 typedef enum vd_task_type {
    417 	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
    418 	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
    419 } vd_task_type_t;
    420 
    421 /*
    422  * Structure describing the task for processing a descriptor
    423  */
    424 typedef struct vd_task {
    425 	struct vd		*vd;		/* vd instance task is for */
    426 	vd_task_type_t		type;		/* type of descriptor task */
    427 	int			index;		/* dring elem index for task */
    428 	vio_msg_t		*msg;		/* VIO message task is for */
    429 	size_t			msglen;		/* length of message content */
    430 	vd_dring_payload_t	*request;	/* request task will perform */
    431 	struct buf		buf;		/* buf(9s) for I/O request */
    432 	ldc_mem_handle_t	mhdl;		/* task memory handle */
    433 	int			status;		/* status of processing task */
    434 	int	(*completef)(struct vd_task *task); /* completion func ptr */
    435 	uint32_t		write_index;	/* index in the write_queue */
    436 } vd_task_t;
    437 
/*
 * Soft state structure for a virtual disk instance.
 *
 * One structure describes one exported virtual disk: its backend (device
 * path, LDI handles, disk image information, label and geometry), the LDC
 * connection and descriptor ring shared with the client, and the task
 * queues and task structures used to process requests.
 *
 * The "lock" mutex protects the fields declared after it (enabled,
 * reset_state and reset_ldc).
 */
typedef struct vd {
	uint64_t		id;		/* vdisk id */
	uint_t			initialized;	/* vdisk initialization flags */
	uint64_t		operations;	/* bitmask of VD_OPs exported */
	vio_ver_t		version;	/* ver negotiated with client */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
	ddi_taskq_t		*completionq;	/* queue for completion tasks */
	ddi_taskq_t		*ioq;		/* queue for I/O */
	uint32_t		write_index;	/* next write index */
	buf_t			**write_queue;	/* queue for async writes */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char			device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	int			open_flags;	/* open flags */
	uint_t			nslices;	/* number of slices we export */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	size_t			vdisk_bsize;	/* blk size of the vdisk */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
	vd_media_t		vdisk_media;	/* media type of backing dev. */
	boolean_t		is_atapi_dev;	/* Is this an IDE CD-ROM dev? */
	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	size_t			backend_bsize;	/* blk size of backend device */
	int			vio_bshift;	/* shift for blk conversion */
	boolean_t		volume;		/* is vDisk backed by volume */
	boolean_t		zvol;		/* is vDisk backed by a zvol */
	boolean_t		file;		/* is vDisk backed by a file? */
	boolean_t		scsi;		/* is vDisk backed by scsi? */
	vnode_t			*file_vnode;	/* file vnode */
	size_t			dskimg_size;	/* disk image size in bytes */
	ddi_devid_t		dskimg_devid;	/* devid for disk image */
	int			efi_reserved;	/* EFI reserved slice */
	caddr_t			flabel;		/* fake label for slice type */
	uint_t			flabel_size;	/* fake label size */
	uint_t			flabel_limit;	/* limit of the fake label */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct extvtoc		vtoc;		/* synthetic for slice type */
	vd_slice_t		slices[VD_MAXPART]; /* logical partitions */
	boolean_t		ownership;	/* disk ownership status */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size;	/* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	uint8_t			dring_mtype;	/* dring mem map type */
	caddr_t			dring;		/* address of dring */
	caddr_t			vio_msgp;	/* vio msg staging buffer */
	vd_task_t		inband_task;	/* task for inband descriptor */
	vd_task_t		*dring_task;	/* tasks dring elements */

	kmutex_t		lock;		/* protects variables below */
	boolean_t		enabled;	/* is vdisk enabled? */
	boolean_t		reset_state;	/* reset connection state? */
	boolean_t		reset_ldc;	/* reset LDC channel? */
} vd_t;
    503 
    504 /*
    505  * Macros to manipulate the fake label (flabel) for single slice disks.
    506  *
    507  * If we fake a VTOC label then the fake label consists of only one block
    508  * containing the VTOC label (struct dk_label).
    509  *
    510  * If we fake an EFI label then the fake label consists of a blank block
    511  * followed by a GPT (efi_gpt_t) and a GPE (efi_gpe_t).
    512  *
    513  */
    514 #define	VD_LABEL_VTOC_SIZE(lba)					\
    515 	P2ROUNDUP(sizeof (struct dk_label), (lba))
    516 
    517 #define	VD_LABEL_EFI_SIZE(lba)					\
    518 	P2ROUNDUP(2 * (lba) + sizeof (efi_gpe_t) * VD_MAXPART,	\
    519 	    (lba))
    520 
    521 #define	VD_LABEL_VTOC(vd)	\
    522 		((struct dk_label *)(void *)((vd)->flabel))
    523 
    524 #define	VD_LABEL_EFI_GPT(vd, lba)	\
    525 		((efi_gpt_t *)(void *)((vd)->flabel + (lba)))
    526 #define	VD_LABEL_EFI_GPE(vd, lba)	\
    527 		((efi_gpe_t *)(void *)((vd)->flabel + 2 * (lba)))
    528 
    529 
/*
 * Description of one operation: a name, an 8-bit operation code and two
 * functions that each take the task (vd_task_t) being processed and return
 * an int.
 *
 * NOTE(review): presumably "operation" holds a VD_OP_* code, "start" begins
 * processing of a request and "complete" finishes it; confirm against the
 * table of operations and its users.
 */
typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	int	(*complete)(vd_task_t *task);
} vds_operation_t;
    536 
/*
 * Description of a vdisk operation that corresponds to an ioctl command:
 * it pairs the vdisk operation (and the size of its buffer) with the ioctl
 * command, the ioctl argument, and the functions converting between the
 * vdisk buffer and the ioctl argument.
 *
 * NOTE(review): copyin and copyout have the same function types as the
 * VD_IDENTITY_IN and VD_IDENTITY_OUT sentinel values, so they presumably
 * may hold those sentinels instead of a real function; confirm with the
 * code that invokes them.
 */
typedef struct vd_ioctl {
	uint8_t		operation;		/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;			/* size of operation buffer */
	int		cmd;			/* corresponding ioctl cmd */
	const char	*cmd_name;		/* ioctl cmd name */
	void		*arg;			/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	int		(*copyin)(void *vd_buf, size_t, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
	/* write is true if the operation writes any data to the backend */
	boolean_t	write;
} vd_ioctl_t;
    551 
/*
 * Define trivial copyin/copyout conversion function flag.
 *
 * These are not callable functions: each is the value -1 cast to a function
 * pointer type (matching the copyin and copyout members of vd_ioctl_t), to
 * be used as a marker value. They must be compared against, never called.
 */
#define	VD_IDENTITY_IN	((int (*)(void *, size_t, void *))-1)
#define	VD_IDENTITY_OUT	((void (*)(void *, void *))-1)
    555 
    556 
    557 static int	vds_ldc_retries = VDS_RETRIES;
    558 static int	vds_ldc_delay = VDS_LDC_DELAY;
    559 static int	vds_dev_retries = VDS_RETRIES;
    560 static int	vds_dev_delay = VDS_DEV_DELAY;
    561 static void	*vds_state;
    562 
    563 static short	vd_scsi_rdwr_timeout = VD_SCSI_RDWR_TIMEOUT;
    564 static int	vd_scsi_debug = USCSI_SILENT;
    565 
    566 /*
    567  * Number of threads in the taskq handling vdisk I/O. This can be set up to
    568  * the size of the DRing which is the maximum number of I/O we can receive
    569  * in parallel. Note that using a high number of threads can improve performance
    570  * but this is going to consume a lot of resources if there are many vdisks.
    571  */
    572 static int	vd_ioq_nthreads = VD_IOQ_NTHREADS;
    573 
/*
 * Tunable to define the behavior for flushing asynchronous writes used to
 * perform some write I/O requests. The default behavior is to group as
 * many asynchronous writes as possible and to flush them immediately
 * (VD_AWFLUSH_IMMEDIATE | VD_AWFLUSH_GROUP).
 *
 * If the tunable is set to 0 then explicit flushing is disabled. In that
 * case, data will be flushed by traditional mechanisms (like fsflush) but
 * this might not happen immediately.
 *
 */
static int	vd_awflush = VD_AWFLUSH_IMMEDIATE | VD_AWFLUSH_GROUP;
    585 
/*
 * Tunable to define the behavior of the service domain if the vdisk server
 * fails to reset disk exclusive access when an LDC channel is reset. When an
 * LDC channel is reset the vdisk server will try to reset disk exclusive
 * access by releasing any SCSI-2 reservation or resetting the disk. If these
 * actions fail then the default behavior (vd_reset_access_failure = 0) is to
 * print a warning message. This default behavior can be changed by setting
 * the vd_reset_access_failure variable to A_REBOOT (= 0x1), which will
 * cause the service domain to reboot, or to A_DUMP (= 0x5), which will cause
 * the service domain to panic. In both cases, the reset of the service domain
 * should trigger a reset of the SCSI buses and hopefully clear any SCSI-2
 * reservation.
 */
static int 	vd_reset_access_failure = 0;
    599 
    600 /*
    601  * Tunable for backward compatibility. When this variable is set to B_TRUE,
    602  * all disk volumes (ZFS, SVM, VxvM volumes) will be exported as single
    603  * slice disks whether or not they have the "slice" option set. This is
    604  * to provide a simple backward compatibility mechanism when upgrading
    605  * the vds driver and using a domain configuration created before the
    606  * "slice" option was available.
    607  */
    608 static boolean_t vd_volume_force_slice = B_FALSE;
    609 
/*
 * The label of disk images created with some earlier versions of the virtual
 * disk software is not entirely correct and has an incorrect v_sanity field
 * (usually 0) instead of VTOC_SANE. This creates a compatibility problem with
 * these images because we are now validating that the disk label (and the
 * sanity) is correct when a disk image is opened.
 *
 * This tunable is set to false so that the sanity field is not validated,
 * which ensures compatibility. If the tunable is set to true, we will do a
 * strict checking of the sanity but this can create compatibility problems
 * with old disk images.
 */
static boolean_t vd_dskimg_validate_sanity = B_FALSE;
    623 
    624 /*
    625  * Enables the use of LDC_DIRECT_MAP when mapping in imported descriptor rings.
    626  */
    627 static boolean_t vd_direct_mapped_drings = B_TRUE;
    628 
/*
 * When a backend is exported as a single-slice disk then we entirely fake
 * its disk label. So it can be exported either with a VTOC label or with
 * an EFI label. If vd_slice_label is set to VD_DISK_LABEL_VTOC then all
 * single-slice disks will be exported with a VTOC label; and if it is set
 * to VD_DISK_LABEL_EFI then all single-slice disks will be exported with
 * an EFI label.
 *
 * If vd_slice_label is set to VD_DISK_LABEL_UNK and the backend is a disk
 * or volume device then it will be exported with the same type of label as
 * defined on the device. Otherwise if the backend is a file then it will be
 * exported with the disk label type set in the vd_file_slice_label variable.
 *
 * Note that if the backend size is greater than 1TB then it will always be
 * exported with an EFI label no matter what the setting is.
 */
static vd_disk_label_t vd_slice_label = VD_DISK_LABEL_UNK;

static vd_disk_label_t vd_file_slice_label = VD_DISK_LABEL_VTOC;
    648 
    649 /*
    650  * Tunable for backward compatibility. If this variable is set to B_TRUE then
    651  * single-slice disks are exported as disks with only one slice instead of
    652  * faking a complete disk partitioning.
    653  */
    654 static boolean_t vd_slice_single_slice = B_FALSE;
    655 
    656 /*
    657  * Supported protocol version pairs, from highest (newest) to lowest (oldest)
    658  *
    659  * Each supported major version should appear only once, paired with (and only
    660  * with) its highest supported minor version number (as the protocol requires
    661  * supporting all lower minor version numbers as well)
    662  */
    663 static const vio_ver_t	vds_version[] = {{1, 1}};
    664 static const size_t	vds_num_versions =
    665     sizeof (vds_version)/sizeof (vds_version[0]);
    666 
    667 static void vd_free_dring_task(vd_t *vdp);
    668 static int vd_setup_vd(vd_t *vd);
    669 static int vd_setup_single_slice_disk(vd_t *vd);
    670 static int vd_setup_slice_image(vd_t *vd);
    671 static int vd_setup_disk_image(vd_t *vd);
    672 static int vd_backend_check_size(vd_t *vd);
    673 static boolean_t vd_enabled(vd_t *vd);
    674 static ushort_t vd_lbl2cksum(struct dk_label *label);
    675 static int vd_dskimg_validate_geometry(vd_t *vd);
    676 static boolean_t vd_dskimg_is_iso_image(vd_t *vd);
    677 static void vd_set_exported_operations(vd_t *vd);
    678 static void vd_reset_access(vd_t *vd);
    679 static int vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg);
    680 static int vds_efi_alloc_and_read(vd_t *, efi_gpt_t **, efi_gpe_t **);
    681 static void vds_efi_free(vd_t *, efi_gpt_t *, efi_gpe_t *);
    682 static void vds_driver_types_free(vds_t *vds);
    683 static void vd_vtocgeom_to_label(struct extvtoc *vtoc, struct dk_geom *geom,
    684     struct dk_label *label);
    685 static void vd_label_to_vtocgeom(struct dk_label *label, struct extvtoc *vtoc,
    686     struct dk_geom *geom);
    687 static boolean_t vd_slice_geom_isvalid(vd_t *vd, struct dk_geom *geom);
    688 static boolean_t vd_slice_vtoc_isvalid(vd_t *vd, struct extvtoc *vtoc);
    689 
    690 extern int is_pseudo_device(dev_info_t *);
    691 
    692 /*
    693  * Function:
    694  *	vd_get_readable_size
    695  *
    696  * Description:
    697  * 	Convert a given size in bytes to a human readable format in
    698  * 	kilobytes, megabytes, gigabytes or terabytes.
    699  *
    700  * Parameters:
    701  *	full_size	- the size to convert in bytes.
    702  *	size		- the converted size.
    703  *	unit		- the unit of the converted size: 'K' (kilobyte),
    704  *			  'M' (Megabyte), 'G' (Gigabyte), 'T' (Terabyte).
    705  *
    706  * Return Code:
    707  *	none
    708  */
    709 static void
    710 vd_get_readable_size(size_t full_size, size_t *size, char *unit)
    711 {
    712 	if (full_size < (1ULL << 20)) {
    713 		*size = full_size >> 10;
    714 		*unit = 'K'; /* Kilobyte */
    715 	} else if (full_size < (1ULL << 30)) {
    716 		*size = full_size >> 20;
    717 		*unit = 'M'; /* Megabyte */
    718 	} else if (full_size < (1ULL << 40)) {
    719 		*size = full_size >> 30;
    720 		*unit = 'G'; /* Gigabyte */
    721 	} else {
    722 		*size = full_size >> 40;
    723 		*unit = 'T'; /* Terabyte */
    724 	}
    725 }
    726 
    727 /*
    728  * Function:
    729  *	vd_dskimg_io_params
    730  *
    731  * Description:
    732  * 	Convert virtual disk I/O parameters (slice, block, length) to
    733  *	(offset, length) relative to the disk image and according to
    734  *	the virtual disk partitioning.
    735  *
    736  * Parameters:
    737  *	vd		- disk on which the operation is performed.
 *	slice		- slice to which the I/O parameters apply.
 *			  VD_SLICE_NONE indicates that the parameters
 *			  are relative to the entire virtual disk.
    741  *	blkp		- pointer to the starting block relative to the
    742  *			  slice; return the starting block relative to
    743  *			  the disk image.
    744  *	lenp		- pointer to the number of bytes requested; return
    745  *			  the number of bytes that can effectively be used.
    746  *
    747  * Return Code:
    748  *	0		- I/O parameters have been successfully converted;
    749  *			  blkp and lenp point to the converted values.
    750  *	ENODATA		- no data are available for the given I/O parameters;
    751  *			  This occurs if the starting block is past the limit
    752  *			  of the slice.
    753  *	EINVAL		- I/O parameters are invalid.
    754  */
    755 static int
    756 vd_dskimg_io_params(vd_t *vd, int slice, size_t *blkp, size_t *lenp)
    757 {
    758 	size_t blk = *blkp;
    759 	size_t len = *lenp;
    760 	size_t offset, maxlen;
    761 
    762 	ASSERT(vd->file || VD_DSKIMG(vd));
    763 	ASSERT(len > 0);
    764 	ASSERT(vd->vdisk_bsize == DEV_BSIZE);
    765 
    766 	/*
    767 	 * If a file is exported as a slice then we don't care about the vtoc.
    768 	 * In that case, the vtoc is a fake mainly to make newfs happy and we
    769 	 * handle any I/O as a raw disk access so that we can have access to the
    770 	 * entire backend.
    771 	 */
    772 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE || slice == VD_SLICE_NONE) {
    773 		/* raw disk access */
    774 		offset = blk * DEV_BSIZE;
    775 		if (offset >= vd->dskimg_size) {
    776 			/* offset past the end of the disk */
    777 			PR0("offset (0x%lx) >= size (0x%lx)",
    778 			    offset, vd->dskimg_size);
    779 			return (ENODATA);
    780 		}
    781 		maxlen = vd->dskimg_size - offset;
    782 	} else {
    783 		ASSERT(slice >= 0 && slice < V_NUMPAR);
    784 
    785 		/*
    786 		 * v1.0 vDisk clients depended on the server not verifying
    787 		 * the label of a unformatted disk.  This "feature" is
    788 		 * maintained for backward compatibility but all versions
    789 		 * from v1.1 onwards must do the right thing.
    790 		 */
    791 		if (vd->vdisk_label == VD_DISK_LABEL_UNK &&
    792 		    vio_ver_is_supported(vd->version, 1, 1)) {
    793 			(void) vd_dskimg_validate_geometry(vd);
    794 			if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
    795 				PR0("Unknown disk label, can't do I/O "
    796 				    "from slice %d", slice);
    797 				return (EINVAL);
    798 			}
    799 		}
    800 
    801 		if (vd->vdisk_label == VD_DISK_LABEL_VTOC) {
    802 			ASSERT(vd->vtoc.v_sectorsz == DEV_BSIZE);
    803 		} else {
    804 			ASSERT(vd->vdisk_label == VD_DISK_LABEL_EFI);
    805 		}
    806 
    807 		if (blk >= vd->slices[slice].nblocks) {
    808 			/* address past the end of the slice */
    809 			PR0("req_addr (0x%lx) >= psize (0x%lx)",
    810 			    blk, vd->slices[slice].nblocks);
    811 			return (ENODATA);
    812 		}
    813 
    814 		offset = (vd->slices[slice].start + blk) * DEV_BSIZE;
    815 		maxlen = (vd->slices[slice].nblocks - blk) * DEV_BSIZE;
    816 	}
    817 
    818 	/*
    819 	 * If the requested size is greater than the size
    820 	 * of the partition, truncate the read/write.
    821 	 */
    822 	if (len > maxlen) {
    823 		PR0("I/O size truncated to %lu bytes from %lu bytes",
    824 		    maxlen, len);
    825 		len = maxlen;
    826 	}
    827 
    828 	/*
    829 	 * We have to ensure that we are reading/writing into the mmap
    830 	 * range. If we have a partial disk image (e.g. an image of
    831 	 * s0 instead s2) the system can try to access slices that
    832 	 * are not included into the disk image.
    833 	 */
    834 	if ((offset + len) > vd->dskimg_size) {
    835 		PR0("offset + nbytes (0x%lx + 0x%lx) > "
    836 		    "dskimg_size (0x%lx)", offset, len, vd->dskimg_size);
    837 		return (EINVAL);
    838 	}
    839 
    840 	*blkp = offset / DEV_BSIZE;
    841 	*lenp = len;
    842 
    843 	return (0);
    844 }
    845 
    846 /*
    847  * Function:
    848  *	vd_dskimg_rw
    849  *
    850  * Description:
    851  * 	Read or write to a disk image. It handles the case where the disk
    852  *	image is a file or a volume exported as a full disk or a file
    853  *	exported as single-slice disk. Read or write to volumes exported as
    854  *	single slice disks are done by directly using the ldi interface.
    855  *
    856  * Parameters:
    857  *	vd		- disk on which the operation is performed.
    858  *	slice		- slice on which the operation is performed,
    859  *			  VD_SLICE_NONE indicates that the operation
    860  *			  is done using an absolute disk offset.
    861  *	operation	- operation to execute: read (VD_OP_BREAD) or
    862  *			  write (VD_OP_BWRITE).
    863  *	data		- buffer where data are read to or written from.
    864  *	blk		- starting block for the operation.
    865  *	len		- number of bytes to read or write.
    866  *
    867  * Return Code:
    868  *	n >= 0		- success, n indicates the number of bytes read
    869  *			  or written.
    870  *	-1		- error.
    871  */
    872 static ssize_t
    873 vd_dskimg_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t offset,
    874     size_t len)
    875 {
    876 	ssize_t resid;
    877 	struct buf buf;
    878 	int status;
    879 
    880 	ASSERT(vd->file || VD_DSKIMG(vd));
    881 	ASSERT(len > 0);
    882 	ASSERT(vd->vdisk_bsize == DEV_BSIZE);
    883 
    884 	if ((status = vd_dskimg_io_params(vd, slice, &offset, &len)) != 0)
    885 		return ((status == ENODATA)? 0: -1);
    886 
    887 	if (vd->volume) {
    888 
    889 		bioinit(&buf);
    890 		buf.b_flags	= B_BUSY |
    891 		    ((operation == VD_OP_BREAD)? B_READ : B_WRITE);
    892 		buf.b_bcount	= len;
    893 		buf.b_lblkno	= offset;
    894 		buf.b_edev 	= vd->dev[0];
    895 		buf.b_un.b_addr = data;
    896 
    897 		/*
    898 		 * We use ldi_strategy() and not ldi_read()/ldi_write() because
    899 		 * the read/write functions of the underlying driver may try to
    900 		 * lock pages of the data buffer, and this requires the data
    901 		 * buffer to be kmem_alloc'ed (and not allocated on the stack).
    902 		 *
    903 		 * Also using ldi_strategy() ensures that writes are immediatly
    904 		 * commited and not cached as this may be the case with
    905 		 * ldi_write() (for example with a ZFS volume).
    906 		 */
    907 		if (ldi_strategy(vd->ldi_handle[0], &buf) != 0) {
    908 			biofini(&buf);
    909 			return (-1);
    910 		}
    911 
    912 		if (biowait(&buf) != 0) {
    913 			biofini(&buf);
    914 			return (-1);
    915 		}
    916 
    917 		resid = buf.b_resid;
    918 		biofini(&buf);
    919 
    920 		ASSERT(resid <= len);
    921 		return (len - resid);
    922 	}
    923 
    924 	ASSERT(vd->file);
    925 
    926 	status = vn_rdwr((operation == VD_OP_BREAD)? UIO_READ : UIO_WRITE,
    927 	    vd->file_vnode, data, len, offset * DEV_BSIZE, UIO_SYSSPACE, FSYNC,
    928 	    RLIM64_INFINITY, kcred, &resid);
    929 
    930 	if (status != 0)
    931 		return (-1);
    932 
    933 	return (len);
    934 }
    935 
/*
 * Function:
 *	vd_build_default_label
 *
 * Description:
 *	Return a default label for a given disk size. This is used when the disk
 *	does not have a valid VTOC so that the user can get a valid default
 *	configuration. The default label has all slice sizes set to 0 (except
 *	slice 2 which is the entire disk) to force the user to write a valid
 *	label onto the disk image.
 *
 * Parameters:
 *	disk_size	- the disk size in bytes
 *	bsize		- the disk block size in bytes; must be greater than 0
 *	label		- the returned default label; it is entirely
 *			  overwritten (zeroed first, then filled in).
 *
 * Return Code:
 *	none.
 */
static void
vd_build_default_label(size_t disk_size, size_t bsize, struct dk_label *label)
{
	size_t size;
	char unit;

	ASSERT(bsize > 0);

	bzero(label, sizeof (struct dk_label));

	/*
	 * Ideally we would like the cylinder size (nsect * nhead) to be the
	 * same whatever the disk size is. That way the VTOC label could be
	 * easily updated in case the disk size is increased (keeping the
	 * same cylinder size allows to preserve the existing partitioning
	 * when updating the VTOC label). But it is not possible to have
	 * a fixed cylinder size and to cover all disk size.
	 *
	 * So we define different cylinder sizes depending on the disk size.
	 * The cylinder size is chosen so that we don't have too few cylinders
	 * for a small disk image, or so many on a big disk image that you
	 * waste space for backup superblocks or cylinder group structures.
	 * Also we must have a reasonable number of cylinders and sectors so
	 * that newfs can run using default values.
	 *
	 *	+-----------+--------+---------+--------+
	 *	| disk_size |  < 2MB | 2MB-8GB | >= 8GB |
	 *	+-----------+--------+---------+--------+
	 *	| nhead	    |	 1   |	   1   |    96  |
	 *	| nsect	    |  200   |   600   |   768  |
	 *	+-----------+--------+---------+--------+
	 *
	 * Other parameters are computed from these values:
	 *
	 * 	pcyl = disk_size / (nhead * nsect * bsize), with a minimum of 1
	 * 	acyl = (pcyl > 2)? 2 : 0
	 * 	ncyl = pcyl - acyl
	 *
	 * The maximum number of cylinder is 65535 so this allows to define a
	 * geometry for a disk size up to 65535 * 96 * 768 * 512 = 2.24 TB
	 * which is more than enough to cover the maximum size allowed by the
	 * extended VTOC format (2TB).
	 */

	if (disk_size >= 8 * ONE_GIGABYTE) {

		label->dkl_nhead = 96;
		label->dkl_nsect = 768;

	} else if (disk_size >= 2 * ONE_MEGABYTE) {

		label->dkl_nhead = 1;
		label->dkl_nsect = 600;

	} else {

		label->dkl_nhead = 1;
		label->dkl_nsect = 200;
	}

	label->dkl_pcyl = disk_size /
	    (label->dkl_nsect * label->dkl_nhead * bsize);

	/* a disk smaller than one cylinder still gets one cylinder */
	if (label->dkl_pcyl == 0)
		label->dkl_pcyl = 1;

	label->dkl_acyl = 0;

	/* reserve two alternate cylinders when there are enough cylinders */
	if (label->dkl_pcyl > 2)
		label->dkl_acyl = 2;

	label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl;
	label->dkl_write_reinstruct = 0;
	label->dkl_read_reinstruct = 0;
	label->dkl_rpm = 7200;
	label->dkl_apc = 0;
	label->dkl_intrlv = 0;

	/*
	 * Debug traces. Note that the value printed as "ncyl" is dkl_pcyl.
	 * NOTE(review): the pcyl * nhead * nsect part of the "provided disk
	 * size" product is presumably evaluated with int width before being
	 * multiplied by bsize -- confirm it cannot overflow for large disks.
	 */
	PR0("requested disk size: %ld bytes\n", disk_size);
	PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
	    label->dkl_nhead, label->dkl_nsect);
	PR0("provided disk size: %ld bytes\n", (uint64_t)
	    (label->dkl_pcyl * label->dkl_nhead *
	    label->dkl_nsect * bsize));

	/* size and unit are only used to build the ASCII label below */
	vd_get_readable_size(disk_size, &size, &unit);

	/*
	 * We must have a correct label name otherwise format(1m) will
	 * not recognize the disk as labeled.
	 */
	(void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII,
	    "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
	    size, unit,
	    label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead,
	    label->dkl_nsect);

	/* default VTOC */
	label->dkl_vtoc.v_version = V_EXTVERSION;
	label->dkl_vtoc.v_nparts = V_NUMPAR;
	label->dkl_vtoc.v_sanity = VTOC_SANE;
	label->dkl_vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_tag = V_BACKUP;
	label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_cylno = 0;
	/*
	 * The backup slice covers the data cylinders only (ncyl, not pcyl).
	 * NOTE(review): this product is presumably computed with int width;
	 * confirm the result is as intended for disks larger than 1TB.
	 */
	label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_nblk = label->dkl_ncyl *
	    label->dkl_nhead * label->dkl_nsect;
	label->dkl_magic = DKL_MAGIC;
	/* the checksum is computed last, once all other fields are set */
	label->dkl_cksum = vd_lbl2cksum(label);
}
   1063 
   1064 /*
   1065  * Function:
   1066  *	vd_dskimg_set_vtoc
   1067  *
   1068  * Description:
   1069  *	Set the vtoc of a disk image by writing the label and backup
   1070  *	labels into the disk image backend.
   1071  *
   1072  * Parameters:
   1073  *	vd		- disk on which the operation is performed.
   1074  *	label		- the data to be written.
   1075  *
   1076  * Return Code:
   1077  *	0		- success.
   1078  *	n > 0		- error, n indicates the errno code.
   1079  */
   1080 static int
   1081 vd_dskimg_set_vtoc(vd_t *vd, struct dk_label *label)
   1082 {
   1083 	size_t blk, sec, cyl, head, cnt;
   1084 
   1085 	ASSERT(VD_DSKIMG(vd));
   1086 
   1087 	if (VD_DSKIMG_LABEL_WRITE(vd, label) < 0) {
   1088 		PR0("fail to write disk label");
   1089 		return (EIO);
   1090 	}
   1091 
   1092 	/*
   1093 	 * Backup labels are on the last alternate cylinder's
   1094 	 * first five odd sectors.
   1095 	 */
   1096 	if (label->dkl_acyl == 0) {
   1097 		PR0("no alternate cylinder, can not store backup labels");
   1098 		return (0);
   1099 	}
   1100 
   1101 	cyl = label->dkl_ncyl  + label->dkl_acyl - 1;
   1102 	head = label->dkl_nhead - 1;
   1103 
   1104 	blk = (cyl * ((label->dkl_nhead * label->dkl_nsect) - label->dkl_apc)) +
   1105 	    (head * label->dkl_nsect);
   1106 
   1107 	/*
   1108 	 * Write the backup labels. Make sure we don't try to write past
   1109 	 * the last cylinder.
   1110 	 */
   1111 	sec = 1;
   1112 
   1113 	for (cnt = 0; cnt < VD_DSKIMG_NUM_BACKUP; cnt++) {
   1114 
   1115 		if (sec >= label->dkl_nsect) {
   1116 			PR0("not enough sector to store all backup labels");
   1117 			return (0);
   1118 		}
   1119 
   1120 		if (vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
   1121 		    (caddr_t)label, blk + sec, sizeof (struct dk_label)) < 0) {
   1122 			PR0("error writing backup label at block %lu\n",
   1123 			    blk + sec);
   1124 			return (EIO);
   1125 		}
   1126 
   1127 		PR1("wrote backup label at block %lu\n", blk + sec);
   1128 
   1129 		sec += 2;
   1130 	}
   1131 
   1132 	return (0);
   1133 }
   1134 
   1135 /*
   1136  * Function:
   1137  *	vd_dskimg_get_devid_block
   1138  *
   1139  * Description:
   1140  *	Return the block number where the device id is stored.
   1141  *
   1142  * Parameters:
   1143  *	vd		- disk on which the operation is performed.
   1144  *	blkp		- pointer to the block number
   1145  *
   1146  * Return Code:
   1147  *	0		- success
   1148  *	ENOSPC		- disk has no space to store a device id
   1149  */
   1150 static int
   1151 vd_dskimg_get_devid_block(vd_t *vd, size_t *blkp)
   1152 {
   1153 	diskaddr_t spc, head, cyl;
   1154 
   1155 	ASSERT(VD_DSKIMG(vd));
   1156 
   1157 	if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
   1158 		/*
   1159 		 * If no label is defined we don't know where to find
   1160 		 * a device id.
   1161 		 */
   1162 		return (ENOSPC);
   1163 	}
   1164 
   1165 	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
   1166 		/*
   1167 		 * For an EFI disk, the devid is at the beginning of
   1168 		 * the reserved slice
   1169 		 */
   1170 		if (vd->efi_reserved == -1) {
   1171 			PR0("EFI disk has no reserved slice");
   1172 			return (ENOSPC);
   1173 		}
   1174 
   1175 		*blkp = vd->slices[vd->efi_reserved].start;
   1176 		return (0);
   1177 	}
   1178 
   1179 	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
   1180 
   1181 	/* this geometry doesn't allow us to have a devid */
   1182 	if (vd->dk_geom.dkg_acyl < 2) {
   1183 		PR0("not enough alternate cylinder available for devid "
   1184 		    "(acyl=%u)", vd->dk_geom.dkg_acyl);
   1185 		return (ENOSPC);
   1186 	}
   1187 
   1188 	/* the devid is in on the track next to the last cylinder */
   1189 	cyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl - 2;
   1190 	spc = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
   1191 	head = vd->dk_geom.dkg_nhead - 1;
   1192 
   1193 	*blkp = (cyl * (spc - vd->dk_geom.dkg_apc)) +
   1194 	    (head * vd->dk_geom.dkg_nsect) + 1;
   1195 
   1196 	return (0);
   1197 }
   1198 
   1199 /*
   1200  * Return the checksum of a disk block containing an on-disk devid.
   1201  */
   1202 static uint_t
   1203 vd_dkdevid2cksum(struct dk_devid *dkdevid)
   1204 {
   1205 	uint_t chksum, *ip;
   1206 	int i;
   1207 
   1208 	chksum = 0;
   1209 	ip = (void *)dkdevid;
   1210 	for (i = 0; i < ((DEV_BSIZE - sizeof (int)) / sizeof (int)); i++)
   1211 		chksum ^= ip[i];
   1212 
   1213 	return (chksum);
   1214 }
   1215 
   1216 /*
   1217  * Function:
   1218  *	vd_dskimg_read_devid
   1219  *
   1220  * Description:
   1221  *	Read the device id stored on a disk image.
   1222  *
   1223  * Parameters:
   1224  *	vd		- disk on which the operation is performed.
   1225  *	devid		- the return address of the device ID.
   1226  *
   1227  * Return Code:
   1228  *	0		- success
   1229  *	EIO		- I/O error while trying to access the disk image
   1230  *	EINVAL		- no valid device id was found
   1231  *	ENOSPC		- disk has no space to store a device id
   1232  */
   1233 static int
   1234 vd_dskimg_read_devid(vd_t *vd, ddi_devid_t *devid)
   1235 {
   1236 	struct dk_devid *dkdevid;
   1237 	size_t blk;
   1238 	uint_t chksum;
   1239 	int status, sz;
   1240 
   1241 	ASSERT(vd->vdisk_bsize == DEV_BSIZE);
   1242 
   1243 	if ((status = vd_dskimg_get_devid_block(vd, &blk)) != 0)
   1244 		return (status);
   1245 
   1246 	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);
   1247 
   1248 	/* get the devid */
   1249 	if ((vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)dkdevid, blk,
   1250 	    DEV_BSIZE)) < 0) {
   1251 		PR0("error reading devid block at %lu", blk);
   1252 		status = EIO;
   1253 		goto done;
   1254 	}
   1255 
   1256 	/* validate the revision */
   1257 	if ((dkdevid->dkd_rev_hi != DK_DEVID_REV_MSB) ||
   1258 	    (dkdevid->dkd_rev_lo != DK_DEVID_REV_LSB)) {
   1259 		PR0("invalid devid found at block %lu (bad revision)", blk);
   1260 		status = EINVAL;
   1261 		goto done;
   1262 	}
   1263 
   1264 	/* compute checksum */
   1265 	chksum = vd_dkdevid2cksum(dkdevid);
   1266 
   1267 	/* compare the checksums */
   1268 	if (DKD_GETCHKSUM(dkdevid) != chksum) {
   1269 		PR0("invalid devid found at block %lu (bad checksum)", blk);
   1270 		status = EINVAL;
   1271 		goto done;
   1272 	}
   1273 
   1274 	/* validate the device id */
   1275 	if (ddi_devid_valid((ddi_devid_t)&dkdevid->dkd_devid) != DDI_SUCCESS) {
   1276 		PR0("invalid devid found at block %lu", blk);
   1277 		status = EINVAL;
   1278 		goto done;
   1279 	}
   1280 
   1281 	PR1("devid read at block %lu", blk);
   1282 
   1283 	sz = ddi_devid_sizeof((ddi_devid_t)&dkdevid->dkd_devid);
   1284 	*devid = kmem_alloc(sz, KM_SLEEP);
   1285 	bcopy(&dkdevid->dkd_devid, *devid, sz);
   1286 
   1287 done:
   1288 	kmem_free(dkdevid, DEV_BSIZE);
   1289 	return (status);
   1290 
   1291 }
   1292 
   1293 /*
   1294  * Function:
   1295  *	vd_dskimg_write_devid
   1296  *
   1297  * Description:
   1298  *	Write a device id into disk image.
   1299  *
   1300  * Parameters:
   1301  *	vd		- disk on which the operation is performed.
   1302  *	devid		- the device ID to store.
   1303  *
   1304  * Return Code:
   1305  *	0		- success
   1306  *	EIO		- I/O error while trying to access the disk image
   1307  *	ENOSPC		- disk has no space to store a device id
   1308  */
   1309 static int
   1310 vd_dskimg_write_devid(vd_t *vd, ddi_devid_t devid)
   1311 {
   1312 	struct dk_devid *dkdevid;
   1313 	uint_t chksum;
   1314 	size_t blk;
   1315 	int status;
   1316 
   1317 	ASSERT(vd->vdisk_bsize == DEV_BSIZE);
   1318 
   1319 	if (devid == NULL) {
   1320 		/* nothing to write */
   1321 		return (0);
   1322 	}
   1323 
   1324 	if ((status = vd_dskimg_get_devid_block(vd, &blk)) != 0)
   1325 		return (status);
   1326 
   1327 	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);
   1328 
   1329 	/* set revision */
   1330 	dkdevid->dkd_rev_hi = DK_DEVID_REV_MSB;
   1331 	dkdevid->dkd_rev_lo = DK_DEVID_REV_LSB;
   1332 
   1333 	/* copy devid */
   1334 	bcopy(devid, &dkdevid->dkd_devid, ddi_devid_sizeof(devid));
   1335 
   1336 	/* compute checksum */
   1337 	chksum = vd_dkdevid2cksum(dkdevid);
   1338 
   1339 	/* set checksum */
   1340 	DKD_FORMCHKSUM(chksum, dkdevid);
   1341 
   1342 	/* store the devid */
   1343 	if ((status = vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
   1344 	    (caddr_t)dkdevid, blk, DEV_BSIZE)) < 0) {
   1345 		PR0("Error writing devid block at %lu", blk);
   1346 		status = EIO;
   1347 	} else {
   1348 		PR1("devid written at block %lu", blk);
   1349 		status = 0;
   1350 	}
   1351 
   1352 	kmem_free(dkdevid, DEV_BSIZE);
   1353 	return (status);
   1354 }
   1355 
   1356 /*
   1357  * Function:
   1358  *	vd_do_scsi_rdwr
   1359  *
   1360  * Description:
   1361  * 	Read or write to a SCSI disk using an absolute disk offset.
   1362  *
   1363  * Parameters:
   1364  *	vd		- disk on which the operation is performed.
   1365  *	operation	- operation to execute: read (VD_OP_BREAD) or
   1366  *			  write (VD_OP_BWRITE).
   1367  *	data		- buffer where data are read to or written from.
   1368  *	blk		- starting block for the operation.
   1369  *	len		- number of bytes to read or write.
   1370  *
   1371  * Return Code:
   1372  *	0		- success
   1373  *	n != 0		- error.
   1374  */
   1375 static int
   1376 vd_do_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t blk, size_t len)
   1377 {
   1378 	struct uscsi_cmd ucmd;
   1379 	union scsi_cdb cdb;
   1380 	int nsectors, nblk;
   1381 	int max_sectors;
   1382 	int status, rval;
   1383 
   1384 	ASSERT(!vd->file);
   1385 	ASSERT(!vd->volume);
   1386 	ASSERT(vd->vdisk_bsize > 0);
   1387 
   1388 	max_sectors = vd->max_xfer_sz;
   1389 	nblk = (len / vd->vdisk_bsize);
   1390 
   1391 	if (len % vd->vdisk_bsize != 0)
   1392 		return (EINVAL);
   1393 
   1394 	/*
   1395 	 * Build and execute the uscsi ioctl.  We build a group0, group1
   1396 	 * or group4 command as necessary, since some targets
   1397 	 * do not support group1 commands.
   1398 	 */
   1399 	while (nblk) {
   1400 
   1401 		bzero(&ucmd, sizeof (ucmd));
   1402 		bzero(&cdb, sizeof (cdb));
   1403 
   1404 		nsectors = (max_sectors < nblk) ? max_sectors : nblk;
   1405 
   1406 		/*
   1407 		 * Some of the optical drives on sun4v machines are ATAPI
   1408 		 * devices which use Group 1 Read/Write commands so we need
   1409 		 * to explicitly check a flag which is set when a domain
   1410 		 * is bound.
   1411 		 */
   1412 		if (blk < (2 << 20) && nsectors <= 0xff && !vd->is_atapi_dev) {
   1413 			FORMG0ADDR(&cdb, blk);
   1414 			FORMG0COUNT(&cdb, (uchar_t)nsectors);
   1415 			ucmd.uscsi_cdblen = CDB_GROUP0;
   1416 		} else if (blk > 0xffffffff) {
   1417 			FORMG4LONGADDR(&cdb, blk);
   1418 			FORMG4COUNT(&cdb, nsectors);
   1419 			ucmd.uscsi_cdblen = CDB_GROUP4;
   1420 			cdb.scc_cmd |= SCMD_GROUP4;
   1421 		} else {
   1422 			FORMG1ADDR(&cdb, blk);
   1423 			FORMG1COUNT(&cdb, nsectors);
   1424 			ucmd.uscsi_cdblen = CDB_GROUP1;
   1425 			cdb.scc_cmd |= SCMD_GROUP1;
   1426 		}
   1427 		ucmd.uscsi_cdb = (caddr_t)&cdb;
   1428 		ucmd.uscsi_bufaddr = data;
   1429 		ucmd.uscsi_buflen = nsectors * vd->backend_bsize;
   1430 		ucmd.uscsi_timeout = vd_scsi_rdwr_timeout;
   1431 		/*
   1432 		 * Set flags so that the command is isolated from normal
   1433 		 * commands and no error message is printed.
   1434 		 */
   1435 		ucmd.uscsi_flags = USCSI_ISOLATE | USCSI_SILENT;
   1436 
   1437 		if (operation == VD_OP_BREAD) {
   1438 			cdb.scc_cmd |= SCMD_READ;
   1439 			ucmd.uscsi_flags |= USCSI_READ;
   1440 		} else {
   1441 			cdb.scc_cmd |= SCMD_WRITE;
   1442 		}
   1443 
   1444 		status = ldi_ioctl(vd->ldi_handle[VD_ENTIRE_DISK_SLICE],
   1445 		    USCSICMD, (intptr_t)&ucmd, (vd->open_flags | FKIOCTL),
   1446 		    kcred, &rval);
   1447 
   1448 		if (status == 0)
   1449 			status = ucmd.uscsi_status;
   1450 
   1451 		if (status != 0)
   1452 			break;
   1453 
   1454 		/*
   1455 		 * Check if partial DMA breakup is required. If so, reduce
   1456 		 * the request size by half and retry the last request.
   1457 		 */
   1458 		if (ucmd.uscsi_resid == ucmd.uscsi_buflen) {
   1459 			max_sectors >>= 1;
   1460 			if (max_sectors <= 0) {
   1461 				status = EIO;
   1462 				break;
   1463 			}
   1464 			continue;
   1465 		}
   1466 
   1467 		if (ucmd.uscsi_resid != 0) {
   1468 			status = EIO;
   1469 			break;
   1470 		}
   1471 
   1472 		blk += nsectors;
   1473 		nblk -= nsectors;
   1474 		data += nsectors * vd->vdisk_bsize;
   1475 	}
   1476 
   1477 	return (status);
   1478 }
   1479 
   1480 /*
   1481  * Function:
   1482  *	vd_scsi_rdwr
   1483  *
   1484  * Description:
   1485  * 	Wrapper function to read or write to a SCSI disk using an absolute
   1486  *	disk offset. It checks the blocksize of the underlying device and,
   1487  *	if necessary, adjusts the buffers accordingly before calling
   1488  *	vd_do_scsi_rdwr() to do the actual read or write.
   1489  *
   1490  * Parameters:
   1491  *	vd		- disk on which the operation is performed.
   1492  *	operation	- operation to execute: read (VD_OP_BREAD) or
   1493  *			  write (VD_OP_BWRITE).
   1494  *	data		- buffer where data are read to or written from.
   1495  *	blk		- starting block for the operation.
   1496  *	len		- number of bytes to read or write.
   1497  *
   1498  * Return Code:
   1499  *	0		- success
   1500  *	n != 0		- error.
   1501  */
   1502 static int
   1503 vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t vblk, size_t vlen)
   1504 {
   1505 	int	rv;
   1506 
   1507 	size_t	pblk;	/* physical device block number of data on device */
   1508 	size_t	delta;	/* relative offset between pblk and vblk */
   1509 	size_t	pnblk;	/* number of physical blocks to be read from device */
   1510 	size_t	plen;	/* length of data to be read from physical device */
   1511 	char	*buf;	/* buffer area to fit physical device's block size */
   1512 
   1513 	if (vd->backend_bsize == 0) {
   1514 		/*
   1515 		 * The block size was not available during the attach,
   1516 		 * try to update it now.
   1517 		 */
   1518 		if (vd_backend_check_size(vd) != 0)
   1519 			return (EIO);
   1520 	}
   1521 
   1522 	/*
   1523 	 * If the vdisk block size and the block size of the underlying device
   1524 	 * match we can skip straight to vd_do_scsi_rdwr(), otherwise we need
   1525 	 * to create a buffer large enough to handle the device's block size
   1526 	 * and adjust the block to be read from and the amount of data to
   1527 	 * read to correspond with the device's block size.
   1528 	 */
   1529 	if (vd->vdisk_bsize == vd->backend_bsize)
   1530 		return (vd_do_scsi_rdwr(vd, operation, data, vblk, vlen));
   1531 
   1532 	if (vd->vdisk_bsize > vd->backend_bsize)
   1533 		return (EINVAL);
   1534 
   1535 	/*
   1536 	 * Writing of physical block sizes larger than the virtual block size
   1537 	 * is not supported. This would be added if/when support for guests
   1538 	 * writing to DVDs is implemented.
   1539 	 */
   1540 	if (operation == VD_OP_BWRITE)
   1541 		return (ENOTSUP);
   1542 
   1543 	/* BEGIN CSTYLED */
   1544 	/*
   1545 	 * Below is a diagram showing the relationship between the physical
   1546 	 * and virtual blocks. If the virtual blocks marked by 'X' below are
   1547 	 * requested, then the physical blocks denoted by 'Y' are read.
   1548 	 *
   1549 	 *           vblk
   1550 	 *             |      vlen
   1551 	 *             |<--------------->|
   1552 	 *             v                 v
   1553 	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-   virtual disk:
   1554 	 *    |  |  |  |XX|XX|XX|XX|XX|XX|  |  |  |  |  |  } block size is
   1555 	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-   vd->vdisk_bsize
   1556 	 *          :  :                 :  :
   1557 	 *         >:==:< delta          :  :
   1558 	 *          :  :                 :  :
   1559 	 *  --+-----+-----+-----+-----+-----+-----+-----+--   physical disk:
   1560 	 *    |     |YY:YY|YYYYY|YYYYY|YY:YY|     |     |   } block size is
   1561 	 *  --+-----+-----+-----+-----+-----+-----+-----+--   vd->backend_bsize
   1562 	 *          ^                       ^
   1563 	 *          |<--------------------->|
   1564 	 *          |         plen
   1565 	 *         pblk
   1566 	 */
   1567 	/* END CSTYLED */
   1568 	pblk = (vblk * vd->vdisk_bsize) / vd->backend_bsize;
   1569 	delta = (vblk * vd->vdisk_bsize) - (pblk * vd->backend_bsize);
   1570 	pnblk = ((delta + vlen - 1) / vd->backend_bsize) + 1;
   1571 	plen = pnblk * vd->backend_bsize;
   1572 
   1573 	PR2("vblk %lx:pblk %lx: vlen %ld:plen %ld", vblk, pblk, vlen, plen);
   1574 
   1575 	buf = kmem_zalloc(sizeof (caddr_t) * plen, KM_SLEEP);
   1576 	rv = vd_do_scsi_rdwr(vd, operation, (caddr_t)buf, pblk, plen);
   1577 	bcopy(buf + delta, data, vlen);
   1578 
   1579 	kmem_free(buf, sizeof (caddr_t) * plen);
   1580 
   1581 	return (rv);
   1582 }
   1583 
   1584 /*
   1585  * Function:
   1586  *	vd_slice_flabel_read
   1587  *
   1588  * Description:
   1589  *	This function simulates a read operation from the fake label of
   1590  *	a single-slice disk.
   1591  *
   1592  * Parameters:
   1593  *	vd		- single-slice disk to read from
   1594  *	data		- buffer where data should be read to
   1595  *	offset		- offset in byte where the read should start
   1596  *	length		- number of bytes to read
   1597  *
   1598  * Return Code:
   1599  *	n >= 0		- success, n indicates the number of bytes read
   1600  *	-1		- error
   1601  */
   1602 static ssize_t
   1603 vd_slice_flabel_read(vd_t *vd, caddr_t data, size_t offset, size_t length)
   1604 {
   1605 	size_t n = 0;
   1606 	uint_t limit = vd->flabel_limit * vd->vdisk_bsize;
   1607 
   1608 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
   1609 	ASSERT(vd->flabel != NULL);
   1610 
   1611 	/* if offset is past the fake label limit there's nothing to read */
   1612 	if (offset >= limit)
   1613 		return (0);
   1614 
   1615 	/* data with offset 0 to flabel_size are read from flabel */
   1616 	if (offset < vd->flabel_size) {
   1617 
   1618 		if (offset + length <= vd->flabel_size) {
   1619 			bcopy(vd->flabel + offset, data, length);
   1620 			return (length);
   1621 		}
   1622 
   1623 		n = vd->flabel_size - offset;
   1624 		bcopy(vd->flabel + offset, data, n);
   1625 		data += n;
   1626 	}
   1627 
   1628 	/* data with offset from flabel_size to flabel_limit are all zeros */
   1629 	if (offset + length <= limit) {
   1630 		bzero(data, length - n);
   1631 		return (length);
   1632 	}
   1633 
   1634 	bzero(data, limit - offset - n);
   1635 	return (limit - offset);
   1636 }
   1637 
   1638 /*
   1639  * Function:
   1640  *	vd_slice_flabel_write
   1641  *
   1642  * Description:
   1643  *	This function simulates a write operation to the fake label of
   1644  *	a single-slice disk. Write operations are actually faked and return
   1645  *	success although the label is never changed. This is mostly to
   1646  *	simulate a successful label update.
   1647  *
   1648  * Parameters:
   1649  *	vd		- single-slice disk to write to
   1650  *	data		- buffer where data should be written from
   1651  *	offset		- offset in byte where the write should start
   1652  *	length		- number of bytes to written
   1653  *
   1654  * Return Code:
   1655  *	n >= 0		- success, n indicates the number of bytes written
   1656  *	-1		- error
   1657  */
   1658 static ssize_t
   1659 vd_slice_flabel_write(vd_t *vd, caddr_t data, size_t offset, size_t length)
   1660 {
   1661 	uint_t limit = vd->flabel_limit * vd->vdisk_bsize;
   1662 	struct dk_label *label;
   1663 	struct dk_geom geom;
   1664 	struct extvtoc vtoc;
   1665 
   1666 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
   1667 	ASSERT(vd->flabel != NULL);
   1668 
   1669 	if (offset >= limit)
   1670 		return (0);
   1671 
   1672 	/*
   1673 	 * If this is a request to overwrite the VTOC disk label, check that
   1674 	 * the new label is similar to the previous one and return that the
   1675 	 * write was successful, but note that nothing is actually overwritten.
   1676 	 */
   1677 	if (vd->vdisk_label == VD_DISK_LABEL_VTOC &&
   1678 	    offset == 0 && length == vd->vdisk_bsize) {
   1679 		label = (void *)data;
   1680 
   1681 		/* check that this is a valid label */
   1682 		if (label->dkl_magic != DKL_MAGIC ||
   1683 		    label->dkl_cksum != vd_lbl2cksum(label))
   1684 			return (-1);
   1685 
   1686 		/* check the vtoc and geometry */
   1687 		vd_label_to_vtocgeom(label, &vtoc, &geom);
   1688 		if (vd_slice_geom_isvalid(vd, &geom) &&
   1689 		    vd_slice_vtoc_isvalid(vd, &vtoc))
   1690 			return (length);
   1691 	}
   1692 
   1693 	/* fail any other write */
   1694 	return (-1);
   1695 }
   1696 
   1697 /*
   1698  * Function:
   1699  *	vd_slice_fake_rdwr
   1700  *
   1701  * Description:
   1702  *	This function simulates a raw read or write operation to a single-slice
   1703  *	disk. It only handles the faked part of the operation i.e. I/Os to
   1704  *	blocks which have no mapping with the vdisk backend (I/Os to the
   1705  *	beginning and to the end of the vdisk).
   1706  *
   1707  *	The function returns 0 is the operation	is completed and it has been
   1708  *	entirely handled as a fake read or write. In that case, lengthp points
   1709  *	to the number of bytes not read or written. Values returned by datap
   1710  *	and blkp are undefined.
   1711  *
   1712  *	If the fake operation has succeeded but the read or write is not
   1713  *	complete (i.e. the read/write operation extends beyond the blocks
   1714  *	we fake) then the function returns EAGAIN and datap, blkp and lengthp
   1715  *	pointers points to the parameters for completing the operation.
   1716  *
   1717  *	In case of an error, for example if the slice is empty or parameters
   1718  *	are invalid, then the function returns a non-zero value different
   1719  *	from EAGAIN. In that case, the returned values of datap, blkp and
   1720  *	lengthp are undefined.
   1721  *
   1722  * Parameters:
   1723  *	vd		- single-slice disk on which the operation is performed
   1724  *	slice		- slice on which the operation is performed,
   1725  *			  VD_SLICE_NONE indicates that the operation
   1726  *			  is done using an absolute disk offset.
   1727  *	operation	- operation to execute: read (VD_OP_BREAD) or
   1728  *			  write (VD_OP_BWRITE).
   1729  *	datap		- pointer to the buffer where data are read to
   1730  *			  or written from. Return the pointer where remaining
   1731  *			  data have to be read to or written from.
   1732  *	blkp		- pointer to the starting block for the operation.
   1733  *			  Return the starting block relative to the vdisk
   1734  *			  backend for the remaining operation.
   1735  *	lengthp		- pointer to the number of bytes to read or write.
   1736  *			  This should be a multiple of vdisk_bsize. Return the
   1737  *			  remaining number of bytes to read or write.
   1738  *
   1739  * Return Code:
   1740  *	0		- read/write operation is completed
   1741  *	EAGAIN		- read/write operation is not completed
   1742  *	other values	- error
   1743  */
   1744 static int
   1745 vd_slice_fake_rdwr(vd_t *vd, int slice, int operation, caddr_t *datap,
   1746     size_t *blkp, size_t *lengthp)
   1747 {
   1748 	struct dk_label *label;
   1749 	caddr_t data;
   1750 	size_t blk, length, csize;
   1751 	size_t ablk, asize, aoff, alen;
   1752 	ssize_t n;
   1753 	int sec, status;
   1754 	size_t bsize = vd->vdisk_bsize;
   1755 
   1756 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
   1757 	ASSERT(slice != 0);
   1758 
   1759 	data = *datap;
   1760 	blk = *blkp;
   1761 	length = *lengthp;
   1762 
   1763 	/*
   1764 	 * If this is not a raw I/O or an I/O from a full disk slice then
   1765 	 * this is an I/O to/from an empty slice.
   1766 	 */
   1767 	if (slice != VD_SLICE_NONE &&
   1768 	    (slice != VD_ENTIRE_DISK_SLICE ||
   1769 	    vd->vdisk_label != VD_DISK_LABEL_VTOC) &&
   1770 	    (slice != VD_EFI_WD_SLICE ||
   1771 	    vd->vdisk_label != VD_DISK_LABEL_EFI)) {
   1772 		return (EIO);
   1773 	}
   1774 
   1775 	if (length % bsize != 0)
   1776 		return (EINVAL);
   1777 
   1778 	/* handle any I/O with the fake label */
   1779 	if (operation == VD_OP_BWRITE)
   1780 		n = vd_slice_flabel_write(vd, data, blk * bsize, length);
   1781 	else
   1782 		n = vd_slice_flabel_read(vd, data, blk * bsize, length);
   1783 
   1784 	if (n == -1)
   1785 		return (EINVAL);
   1786 
   1787 	ASSERT(n % bsize == 0);
   1788 
   1789 	/* adjust I/O arguments */
   1790 	data += n;
   1791 	blk += n / bsize;
   1792 	length -= n;
   1793 
   1794 	/* check if there's something else to process */
   1795 	if (length == 0) {
   1796 		status = 0;
   1797 		goto done;
   1798 	}
   1799 
   1800 	if (vd->vdisk_label == VD_DISK_LABEL_VTOC &&
   1801 	    slice == VD_ENTIRE_DISK_SLICE) {
   1802 		status = EAGAIN;
   1803 		goto done;
   1804 	}
   1805 
   1806 	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
   1807 		asize = EFI_MIN_RESV_SIZE + (EFI_MIN_ARRAY_SIZE / bsize) + 1;
   1808 		ablk = vd->vdisk_size - asize;
   1809 	} else {
   1810 		ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
   1811 		ASSERT(vd->dk_geom.dkg_apc == 0);
   1812 
   1813 		csize = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
   1814 		ablk = vd->dk_geom.dkg_ncyl * csize;
   1815 		asize = vd->dk_geom.dkg_acyl * csize;
   1816 	}
   1817 
   1818 	alen = length / bsize;
   1819 	aoff = blk;
   1820 
   1821 	/* if we have reached the last block then the I/O is completed */
   1822 	if (aoff == ablk + asize) {
   1823 		status = 0;
   1824 		goto done;
   1825 	}
   1826 
   1827 	/* if we are past the last block then return an error */
   1828 	if (aoff > ablk + asize)
   1829 		return (EIO);
   1830 
   1831 	/* check if there is any I/O to end of the disk */
   1832 	if (aoff + alen < ablk) {
   1833 		status = EAGAIN;
   1834 		goto done;
   1835 	}
   1836 
   1837 	/* we don't allow any write to the end of the disk */
   1838 	if (operation == VD_OP_BWRITE)
   1839 		return (EIO);
   1840 
   1841 	if (aoff < ablk) {
   1842 		alen -= (ablk - aoff);
   1843 		aoff = ablk;
   1844 	}
   1845 
   1846 	if (aoff + alen > ablk + asize) {
   1847 		alen = ablk + asize - aoff;
   1848 	}
   1849 
   1850 	alen *= bsize;
   1851 
   1852 	if (operation == VD_OP_BREAD) {
   1853 		bzero(data + (aoff - blk) * bsize, alen);
   1854 
   1855 		if (vd->vdisk_label == VD_DISK_LABEL_VTOC) {
   1856 			/* check if we read backup labels */
   1857 			label = VD_LABEL_VTOC(vd);
   1858 			ablk += (label->dkl_acyl - 1) * csize +
   1859 			    (label->dkl_nhead - 1) * label->dkl_nsect;
   1860 
   1861 			for (sec = 1; (sec < 5 * 2 + 1); sec += 2) {
   1862 
   1863 				if (ablk + sec >= blk &&
   1864 				    ablk + sec < blk + (length / bsize)) {
   1865 					bcopy(label, data +
   1866 					    (ablk + sec - blk) * bsize,
   1867 					    sizeof (struct dk_label));
   1868 				}
   1869 			}
   1870 		}
   1871 	}
   1872 
   1873 	length -= alen;
   1874 
   1875 	status = (length == 0)? 0: EAGAIN;
   1876 
   1877 done:
   1878 	ASSERT(length == 0 || blk >= vd->flabel_limit);
   1879 
   1880 	/*
   1881 	 * Return the parameters for the remaining I/O. The starting block is
   1882 	 * adjusted so that it is relative to the vdisk backend.
   1883 	 */
   1884 	*datap = data;
   1885 	*blkp = blk - vd->flabel_limit;
   1886 	*lengthp = length;
   1887 
   1888 	return (status);
   1889 }
   1890 
   1891 static int
   1892 vd_flush_write(vd_t *vd)
   1893 {
   1894 	int status, rval;
   1895 
   1896 	if (vd->file) {
   1897 		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL);
   1898 	} else {
   1899 		status = ldi_ioctl(vd->ldi_handle[0], DKIOCFLUSHWRITECACHE,
   1900 		    NULL, vd->open_flags | FKIOCTL, kcred, &rval);
   1901 	}
   1902 
   1903 	return (status);
   1904 }
   1905 
   1906 static void
   1907 vd_bio_task(void *arg)
   1908 {
   1909 	struct buf *buf = (struct buf *)arg;
   1910 	vd_task_t *task = (vd_task_t *)buf->b_private;
   1911 	vd_t *vd = task->vd;
   1912 	ssize_t resid;
   1913 	int status;
   1914 
   1915 	ASSERT(vd->vdisk_bsize == DEV_BSIZE);
   1916 
   1917 	if (vd->zvol) {
   1918 
   1919 		status = ldi_strategy(vd->ldi_handle[0], buf);
   1920 
   1921 	} else {
   1922 
   1923 		ASSERT(vd->file);
   1924 
   1925 		status = vn_rdwr((buf->b_flags & B_READ)? UIO_READ : UIO_WRITE,
   1926 		    vd->file_vnode, buf->b_un.b_addr, buf->b_bcount,
   1927 		    buf->b_lblkno * DEV_BSIZE, UIO_SYSSPACE, 0,
   1928 		    RLIM64_INFINITY, kcred, &resid);
   1929 
   1930 		if (status == 0) {
   1931 			buf->b_resid = resid;
   1932 			biodone(buf);
   1933 			return;
   1934 		}
   1935 	}
   1936 
   1937 	if (status != 0) {
   1938 		bioerror(buf, status);
   1939 		biodone(buf);
   1940 	}
   1941 }
   1942 
   1943 /*
   1944  * We define our own biodone function so that buffers used for
   1945  * asynchronous writes are not released when biodone() is called.
   1946  */
   1947 static int
   1948 vd_biodone(struct buf *bp)
   1949 {
   1950 	ASSERT((bp->b_flags & B_DONE) == 0);
   1951 	ASSERT(SEMA_HELD(&bp->b_sem));
   1952 
   1953 	bp->b_flags |= B_DONE;
   1954 	sema_v(&bp->b_io);
   1955 
   1956 	return (0);
   1957 }
   1958 
   1959 /*
   1960  * Return Values
   1961  *	EINPROGRESS	- operation was successfully started
   1962  *	EIO		- encountered LDC (aka. task error)
   1963  *	0		- operation completed successfully
   1964  *
   1965  * Side Effect
   1966  *     sets request->status = <disk operation status>
   1967  */
   1968 static int
   1969 vd_start_bio(vd_task_t *task)
   1970 {
   1971 	int			rv, status = 0;
   1972 	vd_t			*vd		= task->vd;
   1973 	vd_dring_payload_t	*request	= task->request;
   1974 	struct buf		*buf		= &task->buf;
   1975 	uint8_t			mtype;
   1976 	int 			slice;
   1977 	char			*bufaddr = 0;
   1978 	size_t			buflen;
   1979 	size_t			offset, length, nbytes;
   1980 
   1981 	ASSERT(vd != NULL);
   1982 	ASSERT(request != NULL);
   1983 
   1984 	slice = request->slice;
   1985 
   1986 	ASSERT(slice == VD_SLICE_NONE || slice < vd->nslices);
   1987 	ASSERT((request->operation == VD_OP_BREAD) ||
   1988 	    (request->operation == VD_OP_BWRITE));
   1989 
   1990 	if (request->nbytes == 0) {
   1991 		/* no service for trivial requests */
   1992 		request->status = EINVAL;
   1993 		return (0);
   1994 	}
   1995 
   1996 	PR1("%s %lu bytes at block %lu",
   1997 	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
   1998 	    request->nbytes, request->addr);
   1999 
   2000 	/*
   2001 	 * We have to check the open flags because the functions processing
   2002 	 * the read/write request will not do it.
   2003 	 */
   2004 	if (request->operation == VD_OP_BWRITE && !(vd->open_flags & FWRITE)) {
   2005 		PR0("write fails because backend is opened read-only");
   2006 		request->nbytes = 0;
   2007 		request->status = EROFS;
   2008 		return (0);
   2009 	}
   2010 
   2011 	mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP;
   2012 
   2013 	/* Map memory exported by client */
   2014 	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
   2015 	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
   2016 	    &bufaddr, NULL);
   2017 	if (status != 0) {
   2018 		PR0("ldc_mem_map() returned err %d ", status);
   2019 		return (EIO);
   2020 	}
   2021 
   2022 	/*
   2023 	 * The buffer size has to be 8-byte aligned, so the client should have
   2024 	 * sent a buffer which size is roundup to the next 8-byte aligned value.
   2025 	 */
   2026 	buflen = P2ROUNDUP(request->nbytes, 8);
   2027 
   2028 	status = ldc_mem_acquire(task->mhdl, 0, buflen);
   2029 	if (status != 0) {
   2030 		(void) ldc_mem_unmap(task->mhdl);
   2031 		PR0("ldc_mem_acquire() returned err %d ", status);
   2032 		return (EIO);
   2033 	}
   2034 
   2035 	offset = request->addr;
   2036 	nbytes = request->nbytes;
   2037 	length = nbytes;
   2038 
   2039 	/* default number of byte returned by the I/O */
   2040 	request->nbytes = 0;
   2041 
   2042 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
   2043 
   2044 		if (slice != 0) {
   2045 			/* handle any fake I/O */
   2046 			rv = vd_slice_fake_rdwr(vd, slice, request->operation,
   2047 			    &bufaddr, &offset, &length);
   2048 
   2049 			/* record the number of bytes from the fake I/O */
   2050 			request->nbytes = nbytes - length;
   2051 
   2052 			if (rv == 0) {
   2053 				request->status = 0;
   2054 				goto io_done;
   2055 			}
   2056 
   2057 			if (rv != EAGAIN) {
   2058 				request->nbytes = 0;
   2059 				request->status = EIO;
   2060 				goto io_done;
   2061 			}
   2062 
   2063 			/*
   2064 			 * If we return with EAGAIN then this means that there
   2065 			 * are still data to read or write.
   2066 			 */
   2067 			ASSERT(length != 0);
   2068 
   2069 			/*
   2070 			 * We need to continue the I/O from the slice backend to
   2071 			 * complete the request. The variables bufaddr, offset
   2072 			 * and length have been adjusted to have the right
   2073 			 * information to do the remaining I/O from the backend.
   2074 			 * The backend is entirely mapped to slice 0 so we just
   2075 			 * have to complete the I/O from that slice.
   2076 			 */
   2077 			slice = 0;
   2078 		}
   2079 
   2080 	} else if (vd->volume || vd->file) {
   2081 
   2082 		rv = vd_dskimg_io_params(vd, slice, &offset, &length);
   2083 		if (rv != 0) {
   2084 			request->status = (rv == ENODATA)? 0: EIO;
   2085 			goto io_done;
   2086 		}
   2087 		slice = 0;
   2088 
   2089 	} else if (slice == VD_SLICE_NONE) {
   2090 
   2091 		/*
   2092 		 * This is not a disk image so it is a real disk. We
   2093 		 * assume that the underlying device driver supports
   2094 		 * USCSICMD ioctls. This is the case of all SCSI devices
   2095 		 * (sd, ssd...).
   2096 		 *
   2097 		 * In the future if we have non-SCSI disks we would need
   2098 		 * to invoke the appropriate function to do I/O using an
   2099 		 * absolute disk offset (for example using DIOCTL_RWCMD
   2100 		 * for IDE disks).
   2101 		 */
   2102 		rv = vd_scsi_rdwr(vd, request->operation, bufaddr, offset,
   2103 		    length);
   2104 		if (rv != 0) {
   2105 			request->status = EIO;
   2106 		} else {
   2107 			request->nbytes = length;
   2108 			request->status = 0;
   2109 		}
   2110 		goto io_done;
   2111 	}
   2112 
   2113 	/* Start the block I/O */
   2114 	bioinit(buf);
   2115 	buf->b_flags	= B_BUSY;
   2116 	buf->b_bcount	= length;
   2117 	buf->b_lblkno	= offset;
   2118 	buf->b_bufsize	= buflen;
   2119 	buf->b_edev 	= vd->dev[slice];
   2120 	buf->b_un.b_addr = bufaddr;
   2121 	buf->b_iodone	= vd_biodone;
   2122 
   2123 	if (vd->file || vd->zvol) {
   2124 		/*
   2125 		 * I/O to a file are dispatched to an I/O queue, so that several
   2126 		 * I/Os can be processed in parallel. We also do that for ZFS
   2127 		 * volumes because the ZFS volume strategy() function will only
   2128 		 * return after the I/O is completed (instead of just starting
   2129 		 * the I/O).
   2130 		 */
   2131 
   2132 		if (request->operation == VD_OP_BREAD) {
   2133 			buf->b_flags |= B_READ;
   2134 		} else {
   2135 			/*
   2136 			 * For ZFS volumes and files, we do an asynchronous
   2137 			 * write and we will wait for the completion of the
   2138 			 * write in vd_complete_bio() by flushing the volume
   2139 			 * or file.
   2140 			 *
   2141 			 * This done for performance reasons, so that we can
   2142 			 * group together several write requests into a single
   2143 			 * flush operation.
   2144 			 */
   2145 			buf->b_flags |= B_WRITE | B_ASYNC;
   2146 
   2147 			/*
   2148 			 * We keep track of the write so that we can group
   2149 			 * requests when flushing. The write queue has the
   2150 			 * same number of slots as the dring so this prevents
   2151 			 * the write queue from wrapping and overwriting
   2152 			 * existing entries: if the write queue gets full
   2153 			 * then that means that the dring is full so we stop
   2154 			 * receiving new requests until an existing request
   2155 			 * is processed, removed from the write queue and
   2156 			 * then from the dring.
   2157 			 */
   2158 			task->write_index = vd->write_index;
   2159 			vd->write_queue[task->write_index] = buf;
   2160 			vd->write_index =
   2161 			    VD_WRITE_INDEX_NEXT(vd, vd->write_index);
   2162 		}
   2163 
   2164 		buf->b_private = task;
   2165 
   2166 		ASSERT(vd->ioq != NULL);
   2167 
   2168 		request->status = 0;
   2169 		(void) ddi_taskq_dispatch(task->vd->ioq, vd_bio_task, buf,
   2170 		    DDI_SLEEP);
   2171 
   2172 	} else {
   2173 
   2174 		if (request->operation == VD_OP_BREAD) {
   2175 			buf->b_flags |= B_READ;
   2176 		} else {
   2177 			buf->b_flags |= B_WRITE;
   2178 		}
   2179 
   2180 		/* convert VIO block number to buf block number */
   2181 		buf->b_lblkno = offset << vd->vio_bshift;
   2182 
   2183 		request->status = ldi_strategy(vd->ldi_handle[slice], buf);
   2184 	}
   2185 
   2186 	/*
   2187 	 * This is to indicate to the caller that the request
   2188 	 * needs to be finished by vd_complete_bio() by calling
   2189 	 * biowait() there and waiting for that to return before
   2190 	 * triggering the notification of the vDisk client.
   2191 	 *
   2192 	 * This is necessary when writing to real disks as
   2193 	 * otherwise calls to ldi_strategy() would be serialized
   2194 	 * behind the calls to biowait() and performance would
   2195 	 * suffer.
   2196 	 */
   2197 	if (request->status == 0)
   2198 		return (EINPROGRESS);
   2199 
   2200 	biofini(buf);
   2201 
   2202 io_done:
   2203 	/* Clean up after error or completion */
   2204 	rv = ldc_mem_release(task->mhdl, 0, buflen);
   2205 	if (rv) {
   2206 		PR0("ldc_mem_release() returned err %d ", rv);
   2207 		status = EIO;
   2208 	}
   2209 	rv = ldc_mem_unmap(task->mhdl);
   2210 	if (rv) {
   2211 		PR0("ldc_mem_unmap() returned err %d ", rv);
   2212 		status = EIO;
   2213 	}
   2214 
   2215 	return (status);
   2216 }
   2217 
   2218 /*
   2219  * This function should only be called from vd_notify to ensure that requests
   2220  * are responded to in the order that they are received.
   2221  */
   2222 static int
   2223 send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
   2224 {
   2225 	int	status;
   2226 	size_t	nbytes;
   2227 
   2228 	do {
   2229 		nbytes = msglen;
   2230 		status = ldc_write(ldc_handle, msg, &nbytes);
   2231 		if (status != EWOULDBLOCK)
   2232 			break;
   2233 		drv_usecwait(vds_ldc_delay);
   2234 	} while (status == EWOULDBLOCK);
   2235 
   2236 	if (status != 0) {
   2237 		if (status != ECONNRESET)
   2238 			PR0("ldc_write() returned errno %d", status);
   2239 		return (status);
   2240 	} else if (nbytes != msglen) {
   2241 		PR0("ldc_write() performed only partial write");
   2242 		return (EIO);
   2243 	}
   2244 
   2245 	PR1("SENT %lu bytes", msglen);
   2246 	return (0);
   2247 }
   2248 
   2249 static void
   2250 vd_need_reset(vd_t *vd, boolean_t reset_ldc)
   2251 {
   2252 	mutex_enter(&vd->lock);
   2253 	vd->reset_state	= B_TRUE;
   2254 	vd->reset_ldc	= reset_ldc;
   2255 	mutex_exit(&vd->lock);
   2256 }
   2257 
   2258 /*
   2259  * Reset the state of the connection with a client, if needed; reset the LDC
   2260  * transport as well, if needed.  This function should only be called from the
   2261  * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur.
   2262  */
   2263 static void
   2264 vd_reset_if_needed(vd_t *vd)
   2265 {
   2266 	int	status = 0;
   2267 
   2268 	mutex_enter(&vd->lock);
   2269 	if (!vd->reset_state) {
   2270 		ASSERT(!vd->reset_ldc);
   2271 		mutex_exit(&vd->lock);
   2272 		return;
   2273 	}
   2274 	mutex_exit(&vd->lock);
   2275 
   2276 	PR0("Resetting connection state with %s", VD_CLIENT(vd));
   2277 
   2278 	/*
   2279 	 * Let any asynchronous I/O complete before possibly pulling the rug
   2280 	 * out from under it; defer checking vd->reset_ldc, as one of the
   2281 	 * asynchronous tasks might set it
   2282 	 */
   2283 	if (vd->ioq != NULL)
   2284 		ddi_taskq_wait(vd->ioq);
   2285 	ddi_taskq_wait(vd->completionq);
   2286 
   2287 	status = vd_flush_write(vd);
   2288 	if (status) {
   2289 		PR0("flushwrite returned error %d", status);
   2290 	}
   2291 
   2292 	if ((vd->initialized & VD_DRING) &&
   2293 	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
   2294 		PR0("ldc_mem_dring_unmap() returned errno %d", status);
   2295 
   2296 	vd_free_dring_task(vd);
   2297 
   2298 	/* Free the staging buffer for msgs */
   2299 	if (vd->vio_msgp != NULL) {
   2300 		kmem_free(vd->vio_msgp, vd->max_msglen);
   2301 		vd->vio_msgp = NULL;
   2302 	}
   2303 
   2304 	/* Free the inband message buffer */
   2305 	if (vd->inband_task.msg != NULL) {
   2306 		kmem_free(vd->inband_task.msg, vd->max_msglen);
   2307 		vd->inband_task.msg = NULL;
   2308 	}
   2309 
   2310 	mutex_enter(&vd->lock);
   2311 
   2312 	if (vd->reset_ldc)
   2313 		PR0("taking down LDC channel");
   2314 	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
   2315 		PR0("ldc_down() returned errno %d", status);
   2316 
   2317 	/* Reset exclusive access rights */
   2318 	vd_reset_access(vd);
   2319 
   2320 	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
   2321 	vd->state	= VD_STATE_INIT;
   2322 	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */
   2323 
   2324 	/* Allocate the staging buffer */
   2325 	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);
   2326 
   2327 	PR0("calling ldc_up\n");
   2328 	(void) ldc_up(vd->ldc_handle);
   2329 
   2330 	vd->reset_state	= B_FALSE;
   2331 	vd->reset_ldc	= B_FALSE;
   2332 
   2333 	mutex_exit(&vd->lock);
   2334 }
   2335 
   2336 static void vd_recv_msg(void *arg);
   2337 
   2338 static void
   2339 vd_mark_in_reset(vd_t *vd)
   2340 {
   2341 	int status;
   2342 
   2343 	PR0("vd_mark_in_reset: marking vd in reset\n");
   2344 
   2345 	vd_need_reset(vd, B_FALSE);
   2346 	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
   2347 	if (status == DDI_FAILURE) {
   2348 		PR0("cannot schedule task to recv msg\n");
   2349 		vd_need_reset(vd, B_TRUE);
   2350 		return;
   2351 	}
   2352 }
   2353 
   2354 static int
   2355 vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
   2356 {
   2357 	boolean_t		accepted;
   2358 	int			status;
   2359 	on_trap_data_t		otd;
   2360 	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
   2361 
   2362 	if (vd->reset_state)
   2363 		return (0);
   2364 
   2365 	/* Acquire the element */
   2366 	if ((status = VIO_DRING_ACQUIRE(&otd, vd->dring_mtype,
   2367 	    vd->dring_handle, idx, idx)) != 0) {
   2368 		if (status == ECONNRESET) {
   2369 			vd_mark_in_reset(vd);
   2370 			return (0);
   2371 		} else {
   2372 			return (status);
   2373 		}
   2374 	}
   2375 
   2376 	/* Set the element's status and mark it done */
   2377 	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
   2378 	if (accepted) {
   2379 		elem->payload.nbytes	= elem_nbytes;
   2380 		elem->payload.status	= elem_status;
   2381 		elem->hdr.dstate	= VIO_DESC_DONE;
   2382 	} else {
   2383 		/* Perhaps client timed out waiting for I/O... */
   2384 		PR0("element %u no longer \"accepted\"", idx);
   2385 		VD_DUMP_DRING_ELEM(elem);
   2386 	}
   2387 	/* Release the element */
   2388 	if ((status = VIO_DRING_RELEASE(vd->dring_mtype,
   2389 	    vd->dring_handle, idx, idx)) != 0) {
   2390 		if (status == ECONNRESET) {
   2391 			vd_mark_in_reset(vd);
   2392 			return (0);
   2393 		} else {
   2394 			PR0("VIO_DRING_RELEASE() returned errno %d",
   2395 			    status);
   2396 			return (status);
   2397 		}
   2398 	}
   2399 
   2400 	return (accepted ? 0 : EINVAL);
   2401 }
   2402 
   2403 /*
   2404  * Return Values
   2405  *	0	- operation completed successfully
   2406  *	EIO	- encountered LDC / task error
   2407  *
   2408  * Side Effect
   2409  *	sets request->status = <disk operation status>
   2410  */
   2411 static int
   2412 vd_complete_bio(vd_task_t *task)
   2413 {
   2414 	int			status		= 0;
   2415 	int			rv		= 0;
   2416 	vd_t			*vd		= task->vd;
   2417 	vd_dring_payload_t	*request	= task->request;
   2418 	struct buf		*buf		= &task->buf;
   2419 	int			wid, nwrites;
   2420 
   2421 
   2422 	ASSERT(vd != NULL);
   2423 	ASSERT(request != NULL);
   2424 	ASSERT(task->msg != NULL);
   2425 	ASSERT(task->msglen >= sizeof (*task->msg));
   2426 
   2427 	if (buf->b_flags & B_DONE) {
   2428 		/*
   2429 		 * If the I/O is already done then we don't call biowait()
   2430 		 * because biowait() might already have been called when
   2431 		 * flushing a previous asynchronous write. So we just
   2432 		 * retrieve the status of the request.
   2433 		 */
   2434 		request->status = geterror(buf);
   2435 	} else {
   2436 		/*
   2437 		 * Wait for the I/O. For synchronous I/O, biowait() will return
   2438 		 * when the I/O has completed. For asynchronous write, it will
   2439 		 * return the write has been submitted to the backend, but it
   2440 		 * may not have been committed.
   2441 		 */
   2442 		request->status = biowait(buf);
   2443 	}
   2444 
   2445 	if (buf->b_flags & B_ASYNC) {
   2446 		/*
   2447 		 * Asynchronous writes are used when writing to a file or a
   2448 		 * ZFS volume. In that case the bio notification indicates
   2449 		 * that the write has started. We have to flush the backend
   2450 		 * to ensure that the write has been committed before marking
   2451 		 * the request as completed.
   2452 		 */
   2453 		ASSERT(task->request->operation == VD_OP_BWRITE);
   2454 
   2455 		wid = task->write_index;
   2456 
   2457 		/* check if write has been already flushed */
   2458 		if (vd->write_queue[wid] != NULL) {
   2459 
   2460 			vd->write_queue[wid] = NULL;
   2461 			wid = VD_WRITE_INDEX_NEXT(vd, wid);
   2462 
   2463 			/*
   2464 			 * Because flushing is time consuming, it is worth
   2465 			 * waiting for any other writes so that they can be
   2466 			 * included in this single flush request.
   2467 			 */
   2468 			if (vd_awflush & VD_AWFLUSH_GROUP) {
   2469 				nwrites = 1;
   2470 				while (vd->write_queue[wid] != NULL) {
   2471 					(void) biowait(vd->write_queue[wid]);
   2472 					vd->write_queue[wid] = NULL;
   2473 					wid = VD_WRITE_INDEX_NEXT(vd, wid);
   2474 					nwrites++;
   2475 				}
   2476 				DTRACE_PROBE2(flushgrp, vd_task_t *, task,
   2477 				    int, nwrites);
   2478 			}
   2479 
   2480 			if (vd_awflush & VD_AWFLUSH_IMMEDIATE) {
   2481 				request->status = vd_flush_write(vd);
   2482 			} else if (vd_awflush & VD_AWFLUSH_DEFER) {
   2483 				(void) taskq_dispatch(system_taskq,
   2484 				    (void (*)(void *))vd_flush_write, vd,
   2485 				    DDI_SLEEP);
   2486 				request->status = 0;
   2487 			}
   2488 		}
   2489 	}
   2490 
   2491 	/* Update the number of bytes read/written */
   2492 	request->nbytes += buf->b_bcount - buf->b_resid;
   2493 
   2494 	/* Release the buffer */
   2495 	if (!vd->reset_state)
   2496 		status = ldc_mem_release(task->mhdl, 0, buf->b_bufsize);
   2497 	if (status) {
   2498 		PR0("ldc_mem_release() returned errno %d copying to "
   2499 		    "client", status);
   2500 		if (status == ECONNRESET) {
   2501 			vd_mark_in_reset(vd);
   2502 		}
   2503 		rv = EIO;
   2504 	}
   2505 
   2506 	/* Unmap the memory, even if in reset */
   2507 	status = ldc_mem_unmap(task->mhdl);
   2508 	if (status) {
   2509 		PR0("ldc_mem_unmap() returned errno %d copying to client",
   2510 		    status);
   2511 		if (status == ECONNRESET) {
   2512 			vd_mark_in_reset(vd);
   2513 		}
   2514 		rv = EIO;
   2515 	}
   2516 
   2517 	biofini(buf);
   2518 
   2519 	return (rv);
   2520 }
   2521 
   2522 /*
   2523  * Description:
   2524  *	This function is called by the two functions called by a taskq
   2525  *	[ vd_complete_notify() and vd_serial_notify()) ] to send the
   2526  *	message to the client.
   2527  *
   2528  * Parameters:
   2529  *	arg 	- opaque pointer to structure containing task to be completed
   2530  *
   2531  * Return Values
   2532  *	None
   2533  */
   2534 static void
   2535 vd_notify(vd_task_t *task)
   2536 {
   2537 	int	status;
   2538 
   2539 	ASSERT(task != NULL);
   2540 	ASSERT(task->vd != NULL);
   2541 
   2542 	/*
   2543 	 * Send the "ack" or "nack" back to the client; if sending the message
   2544 	 * via LDC fails, arrange to reset both the connection state and LDC
   2545 	 * itself
   2546 	 */
   2547 	PR2("Sending %s",
   2548 	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
   2549 
   2550 	status = send_msg(task->vd->ldc_handle, task->msg, task->msglen);
   2551 	switch (status) {
   2552 	case 0:
   2553 		break;
   2554 	case ECONNRESET:
   2555 		vd_mark_in_reset(task->vd);
   2556 		break;
   2557 	default:
   2558 		PR0("initiating full reset");
   2559 		vd_need_reset(task->vd, B_TRUE);
   2560 		break;
   2561 	}
   2562 
   2563 	DTRACE_PROBE1(task__end, vd_task_t *, task);
   2564 }
   2565 
   2566 /*
   2567  * Description:
   2568  *	Mark the Dring entry as Done and (if necessary) send an ACK/NACK to
   2569  *	the vDisk client
   2570  *
   2571  * Parameters:
   2572  *	task 		- structure containing the request sent from client
   2573  *
   2574  * Return Values
   2575  *	None
   2576  */
   2577 static void
   2578 vd_complete_notify(vd_task_t *task)
   2579 {
   2580 	int			status		= 0;
   2581 	vd_t			*vd		= task->vd;
   2582 	vd_dring_payload_t	*request	= task->request;
   2583 
   2584 	/* Update the dring element for a dring client */
   2585 	if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE_V1_0)) {
   2586 		status = vd_mark_elem_done(vd, task->index,
   2587 		    request->status, request->nbytes);
   2588 		if (status == ECONNRESET)
   2589 			vd_mark_in_reset(vd);
   2590 		else if (status == EACCES)
   2591 			vd_need_reset(vd, B_TRUE);
   2592 	}
   2593 
   2594 	/*
   2595 	 * If a transport error occurred while marking the element done or
   2596 	 * previously while executing the task, arrange to "nack" the message
   2597 	 * when the final task in the descriptor element range completes
   2598 	 */
   2599 	if ((status != 0) || (task->status != 0))
   2600 		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
   2601 
   2602 	/*
   2603 	 * Only the final task for a range of elements will respond to and
   2604 	 * free the message
   2605 	 */
   2606 	if (task->type == VD_NONFINAL_RANGE_TASK) {
   2607 		return;
   2608 	}
   2609 
   2610 	/*
   2611 	 * We should only send an ACK/NACK here if we are not currently in
   2612 	 * reset as, depending on how we reset, the dring may have been
   2613 	 * blown away and we don't want to ACK/NACK a message that isn't
   2614 	 * there.
   2615 	 */
   2616 	if (!vd->reset_state)
   2617 		vd_notify(task);
   2618 }
   2619 
   2620 /*
   2621  * Description:
   2622  *	This is the basic completion function called to handle inband data
   2623  *	requests and handshake messages. All it needs to do is trigger a
   2624  *	message to the client that the request is completed.
   2625  *
   2626  * Parameters:
   2627  *	arg 	- opaque pointer to structure containing task to be completed
   2628  *
   2629  * Return Values
   2630  *	None
   2631  */
   2632 static void
   2633 vd_serial_notify(void *arg)
   2634 {
   2635 	vd_task_t		*task = (vd_task_t *)arg;
   2636 
   2637 	ASSERT(task != NULL);
   2638 	vd_notify(task);
   2639 }
   2640 
   2641 /* ARGSUSED */
   2642 static int
   2643 vd_geom2dk_geom(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
   2644 {
   2645 	VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg);
   2646 	return (0);
   2647 }
   2648 
   2649 /* ARGSUSED */
   2650 static int
   2651 vd_vtoc2vtoc(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
   2652 {
   2653 	VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct extvtoc *)ioctl_arg);
   2654 	return (0);
   2655 }
   2656 
   2657 static void
   2658 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf)
   2659 {
   2660 	DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf);
   2661 }
   2662 
   2663 static void
   2664 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf)
   2665 {
   2666 	VTOC2VD_VTOC((struct extvtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf);
   2667 }
   2668 
   2669 static int
   2670 vd_get_efi_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
   2671 {
   2672 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
   2673 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
   2674 	size_t data_len;
   2675 
   2676 	data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t));
   2677 	if (vd_efi->length > data_len)
   2678 		return (EINVAL);
   2679 
   2680 	dk_efi->dki_lba = vd_efi->lba;
   2681 	dk_efi->dki_length = vd_efi->length;
   2682 	dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP);
   2683 	return (0);
   2684 }
   2685 
   2686 static void
   2687 vd_get_efi_out(void *ioctl_arg, void *vd_buf)
   2688 {
   2689 	int len;
   2690 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
   2691 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
   2692 
   2693 	len = vd_efi->length;
   2694 	DK_EFI2VD_EFI(dk_efi, vd_efi);
   2695 	kmem_free(dk_efi->dki_data, len);
   2696 }
   2697 
   2698 static int
   2699 vd_set_efi_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
   2700 {
   2701 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
   2702 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
   2703 	size_t data_len;
   2704 
   2705 	data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t));
   2706 	if (vd_efi->length > data_len)
   2707 		return (EINVAL);
   2708 
   2709 	dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP);
   2710 	VD_EFI2DK_EFI(vd_efi, dk_efi);
   2711 	return (0);
   2712 }
   2713 
   2714 static void
   2715 vd_set_efi_out(void *ioctl_arg, void *vd_buf)
   2716 {
   2717 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
   2718 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
   2719 
   2720 	kmem_free(dk_efi->dki_data, vd_efi->length);
   2721 }
   2722 
   2723 static int
   2724 vd_scsicmd_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
   2725 {
   2726 	size_t vd_scsi_len;
   2727 	vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf;
   2728 	struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg;
   2729 
   2730 	/* check buffer size */
   2731 	vd_scsi_len = VD_SCSI_SIZE;
   2732 	vd_scsi_len += P2ROUNDUP(vd_scsi->cdb_len, sizeof (uint64_t));
   2733 	vd_scsi_len += P2ROUNDUP(vd_scsi->sense_len, sizeof (uint64_t));
   2734 	vd_scsi_len += P2ROUNDUP(vd_scsi->datain_len, sizeof (uint64_t));
   2735 	vd_scsi_len += P2ROUNDUP(vd_scsi->dataout_len, sizeof (uint64_t));
   2736 
   2737 	ASSERT(vd_scsi_len % sizeof (uint64_t) == 0);
   2738 
   2739 	if (vd_buf_len < vd_scsi_len)
   2740 		return (EINVAL);
   2741 
   2742 	/* set flags */
   2743 	uscsi->uscsi_flags = vd_scsi_debug;
   2744 
   2745 	if (vd_scsi->options & VD_SCSI_OPT_NORETRY) {
   2746 		uscsi->uscsi_flags |= USCSI_ISOLATE;
   2747 		uscsi->uscsi_flags |= USCSI_DIAGNOSE;
   2748 	}
   2749 
   2750 	/* task attribute */
   2751 	switch (vd_scsi->task_attribute) {
   2752 	case VD_SCSI_TASK_ACA:
   2753 		uscsi->uscsi_flags |= USCSI_HEAD;
   2754 		break;
   2755 	case VD_SCSI_TASK_HQUEUE:
   2756 		uscsi->uscsi_flags |= USCSI_HTAG;
   2757 		break;
   2758 	case VD_SCSI_TASK_ORDERED:
   2759 		uscsi->uscsi_flags |= USCSI_OTAG;
   2760 		break;
   2761 	default:
   2762 		uscsi->uscsi_flags |= USCSI_NOTAG;
   2763 		break;
   2764 	}
   2765 
   2766 	/* timeout */
   2767 	uscsi->uscsi_timeout = vd_scsi->timeout;
   2768 
   2769 	/* cdb data */
   2770 	uscsi->uscsi_cdb = (caddr_t)VD_SCSI_DATA_CDB(vd_scsi);
   2771 	uscsi->uscsi_cdblen = vd_scsi->cdb_len;
   2772 
   2773 	/* sense buffer */
   2774 	if (vd_scsi->sense_len != 0) {
   2775 		uscsi->uscsi_flags |= USCSI_RQENABLE;
   2776 		uscsi->uscsi_rqbuf = (caddr_t)VD_SCSI_DATA_SENSE(vd_scsi);
   2777 		uscsi->uscsi_rqlen = vd_scsi->sense_len;
   2778 	}
   2779 
   2780 	if (vd_scsi->datain_len != 0 && vd_scsi->dataout_len != 0) {
   2781 		/* uscsi does not support read/write request */
   2782 		return (EINVAL);
   2783 	}
   2784 
   2785 	/* request data-in */
   2786 	if (vd_scsi->datain_len != 0) {
   2787 		uscsi->uscsi_flags |= USCSI_READ;
   2788 		uscsi->uscsi_buflen = vd_scsi->datain_len;
   2789 		uscsi->uscsi_bufaddr = (char *)VD_SCSI_DATA_IN(vd_scsi);
   2790 	}
   2791 
   2792 	/* request data-out */
   2793 	if (vd_scsi->dataout_len != 0) {
   2794 		uscsi->uscsi_buflen = vd_scsi->dataout_len;
   2795 		uscsi->uscsi_bufaddr = (char *)VD_SCSI_DATA_OUT(vd_scsi);
   2796 	}
   2797 
   2798 	return (0);
   2799 }
   2800 
   2801 static void
   2802 vd_scsicmd_out(void *ioctl_arg, void *vd_buf)
   2803 {
   2804 	vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf;
   2805 	struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg;
   2806 
   2807 	/* output fields */
   2808 	vd_scsi->cmd_status = uscsi->uscsi_status;
   2809 
   2810 	/* sense data */
   2811 	if ((uscsi->uscsi_flags & USCSI_RQENABLE) &&
   2812 	    (uscsi->uscsi_status == STATUS_CHECK ||
   2813 	    uscsi->uscsi_status == STATUS_TERMINATED)) {
   2814 		vd_scsi->sense_status = uscsi->uscsi_rqstatus;
   2815 		if (uscsi->uscsi_rqstatus == STATUS_GOOD)
   2816 			vd_scsi->sense_len -= uscsi->uscsi_rqresid;
   2817 		else
   2818 			vd_scsi->sense_len = 0;
   2819 	} else {
   2820 		vd_scsi->sense_len = 0;
   2821 	}
   2822 
   2823 	if (uscsi->uscsi_status != STATUS_GOOD) {
   2824 		vd_scsi->dataout_len = 0;
   2825 		vd_scsi->datain_len = 0;
   2826 		return;
   2827 	}
   2828 
   2829 	if (uscsi->uscsi_flags & USCSI_READ) {
   2830 		/* request data (read) */
   2831 		vd_scsi->datain_len -= uscsi->uscsi_resid;
   2832 		vd_scsi->dataout_len = 0;
   2833 	} else {
   2834 		/* request data (write) */
   2835 		vd_scsi->datain_len = 0;
   2836 		vd_scsi->dataout_len -= uscsi->uscsi_resid;
   2837 	}
   2838 }
   2839 
   2840 static ushort_t
   2841 vd_lbl2cksum(struct dk_label *label)
   2842 {
   2843 	int	count;
   2844 	ushort_t sum, *sp;
   2845 
   2846 	count =	(sizeof (struct dk_label)) / (sizeof (short)) - 1;
   2847 	sp = (ushort_t *)label;
   2848 	sum = 0;
   2849 	while (count--) {
   2850 		sum ^= *sp++;
   2851 	}
   2852 
   2853 	return (sum);
   2854 }
   2855 
   2856 /*
   2857  * Copy information from a vtoc and dk_geom structures to a dk_label structure.
   2858  */
   2859 static void
   2860 vd_vtocgeom_to_label(struct extvtoc *vtoc, struct dk_geom *geom,
   2861     struct dk_label *label)
   2862 {
   2863 	int i;
   2864 
   2865 	ASSERT(vtoc->v_nparts == V_NUMPAR);
   2866 	ASSERT(vtoc->v_sanity == VTOC_SANE);
   2867 
   2868 	bzero(label, sizeof (struct dk_label));
   2869 
   2870 	label->dkl_ncyl = geom->dkg_ncyl;
   2871 	label->dkl_acyl = geom->dkg_acyl;
   2872 	label->dkl_pcyl = geom->dkg_pcyl;
   2873 	label->dkl_nhead = geom->dkg_nhead;
   2874 	label->dkl_nsect = geom->dkg_nsect;
   2875 	label->dkl_intrlv = geom->dkg_intrlv;
   2876 	label->dkl_apc = geom->dkg_apc;
   2877 	label->dkl_rpm = geom->dkg_rpm;
   2878 	label->dkl_write_reinstruct = geom->dkg_write_reinstruct;
   2879 	label->dkl_read_reinstruct = geom->dkg_read_reinstruct;
   2880 
   2881 	label->dkl_vtoc.v_nparts = V_NUMPAR;
   2882 	label->dkl_vtoc.v_sanity = VTOC_SANE;
   2883 	label->dkl_vtoc.v_version = vtoc->v_version;
   2884 	for (i = 0; i < V_NUMPAR; i++) {
   2885 		label->dkl_vtoc.v_timestamp[i] = vtoc->timestamp[i];
   2886 		label->dkl_vtoc.v_part[i].p_tag = vtoc->v_part[i].p_tag;
   2887 		label->dkl_vtoc.v_part[i].p_flag = vtoc->v_part[i].p_flag;
   2888 		label->dkl_map[i].dkl_cylno = vtoc->v_part[i].p_start /
   2889 		    (label->dkl_nhead * label->dkl_nsect);
   2890 		label->dkl_map[i].dkl_nblk = vtoc->v_part[i].p_size;
   2891 	}
   2892 
   2893 	/*
   2894 	 * The bootinfo array can not be copied with bcopy() because
   2895 	 * elements are of type long in vtoc (so 64-bit) and of type
   2896 	 * int in dk_vtoc (so 32-bit).
   2897 	 */
   2898 	label->dkl_vtoc.v_bootinfo[0] = vtoc->v_bootinfo[0];
   2899 	label->dkl_vtoc.v_bootinfo[1] = vtoc->v_bootinfo[1];
   2900 	label->dkl_vtoc.v_bootinfo[2] = vtoc->v_bootinfo[2];
   2901 	bcopy(vtoc->v_asciilabel, label->dkl_asciilabel, LEN_DKL_ASCII);
   2902 	bcopy(vtoc->v_volume, label->dkl_vtoc.v_volume, LEN_DKL_VVOL);
   2903 
   2904 	/* re-compute checksum */
   2905 	label->dkl_magic = DKL_MAGIC;
   2906 	label->dkl_cksum = vd_lbl2cksum(label);
   2907 }
   2908 
   2909 /*
   2910  * Copy information from a dk_label structure to a vtoc and dk_geom structures.
   2911  */
   2912 static void
   2913 vd_label_to_vtocgeom(struct dk_label *label, struct extvtoc *vtoc,
   2914     struct dk_geom *geom)
   2915 {
   2916 	int i;
   2917 
   2918 	bzero(vtoc, sizeof (struct extvtoc));
   2919 	bzero(geom, sizeof (struct dk_geom));
   2920 
   2921 	geom->dkg_ncyl = label->dkl_ncyl;
   2922 	geom->dkg_acyl = label->dkl_acyl;
   2923 	geom->dkg_nhead = label->dkl_nhead;
   2924 	geom->dkg_nsect = label->dkl_nsect;
   2925 	geom->dkg_intrlv = label->dkl_intrlv;
   2926 	geom->dkg_apc = label->dkl_apc;
   2927 	geom->dkg_rpm = label->dkl_rpm;
   2928 	geom->dkg_pcyl = label->dkl_pcyl;
   2929 	geom->dkg_write_reinstruct = label->dkl_write_reinstruct;
   2930 	geom->dkg_read_reinstruct = label->dkl_read_reinstruct;
   2931 
   2932 	vtoc->v_sanity = label->dkl_vtoc.v_sanity;
   2933 	vtoc->v_version = label->dkl_vtoc.v_version;
   2934 	vtoc->v_sectorsz = DEV_BSIZE;
   2935 	vtoc->v_nparts = label->dkl_vtoc.v_nparts;
   2936 
   2937 	for (i = 0; i < vtoc->v_nparts; i++) {
   2938 		vtoc->v_part[i].p_tag = label->dkl_vtoc.v_part[i].p_tag;
   2939 		vtoc->v_part[i].p_flag = label->dkl_vtoc.v_part[i].p_flag;
   2940 		vtoc->v_part[i].p_start = label->dkl_map[i].dkl_cylno *
   2941 		    (label->dkl_nhead * label->dkl_nsect);
   2942 		vtoc->v_part[i].p_size = label->dkl_map[i].dkl_nblk;
   2943 		vtoc->timestamp[i] = label->dkl_vtoc.v_timestamp[i];
   2944 	}
   2945 
   2946 	/*
   2947 	 * The bootinfo array can not be copied with bcopy() because
   2948 	 * elements are of type long in vtoc (so 64-bit) and of type
   2949 	 * int in dk_vtoc (so 32-bit).
   2950 	 */
   2951 	vtoc->v_bootinfo[0] = label->dkl_vtoc.v_bootinfo[0];
   2952 	vtoc->v_bootinfo[1] = label->dkl_vtoc.v_bootinfo[1];
   2953 	vtoc->v_bootinfo[2] = label->dkl_vtoc.v_bootinfo[2];
   2954 	bcopy(label->dkl_asciilabel, vtoc->v_asciilabel, LEN_DKL_ASCII);
   2955 	bcopy(label->dkl_vtoc.v_volume, vtoc->v_volume, LEN_DKL_VVOL);
   2956 }
   2957 
   2958 /*
   2959  * Check if a geometry is valid for a single-slice disk. A geometry is
   2960  * considered valid if the main attributes of the geometry match with the
   2961  * attributes of the fake geometry we have created.
   2962  */
   2963 static boolean_t
   2964 vd_slice_geom_isvalid(vd_t *vd, struct dk_geom *geom)
   2965 {
   2966 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
   2967 	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
   2968 
   2969 	if (geom->dkg_ncyl != vd->dk_geom.dkg_ncyl ||
   2970 	    geom->dkg_acyl != vd->dk_geom.dkg_acyl ||
   2971 	    geom->dkg_nsect != vd->dk_geom.dkg_nsect ||
   2972 	    geom->dkg_pcyl != vd->dk_geom.dkg_pcyl)
   2973 		return (B_FALSE);
   2974 
   2975 	return (B_TRUE);
   2976 }
   2977 
   2978 /*
   2979  * Check if a vtoc is valid for a single-slice disk. A vtoc is considered
   2980  * valid if the main attributes of the vtoc match with the attributes of the
   2981  * fake vtoc we have created.
   2982  */
   2983 static boolean_t
   2984 vd_slice_vtoc_isvalid(vd_t *vd, struct extvtoc *vtoc)
   2985 {
   2986 	size_t csize;
   2987 	int i;
   2988 
   2989 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
   2990 	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
   2991 
   2992 	if (vtoc->v_sanity != vd->vtoc.v_sanity ||
   2993 	    vtoc->v_version != vd->vtoc.v_version ||
   2994 	    vtoc->v_nparts != vd->vtoc.v_nparts ||
   2995 	    strcmp(vtoc->v_volume, vd->vtoc.v_volume) != 0 ||
   2996 	    strcmp(vtoc->v_asciilabel, vd->vtoc.v_asciilabel) != 0)
   2997 		return (B_FALSE);
   2998 
   2999 	/* slice 2 should be unchanged */
   3000 	if (vtoc->v_part[VD_ENTIRE_DISK_SLICE].p_start !=
   3001 	    vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_start ||
   3002 	    vtoc->v_part[VD_ENTIRE_DISK_SLICE].p_size !=
   3003 	    vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_size)
   3004 		return (B_FALSE);
   3005 
   3006 	/*
   3007 	 * Slice 0 should be mostly unchanged and cover most of the disk.
   3008 	 * However we allow some flexibility wrt to the start and the size
   3009 	 * of this slice mainly because we can't exactly know how it will
   3010 	 * be defined by the OS installer.
   3011 	 *
   3012 	 * We allow slice 0 to be defined as starting on any of the first
   3013 	 * 4 cylinders.
   3014 	 */
   3015 	csize = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
   3016 
   3017 	if (vtoc->v_part[0].p_start > 4 * csize ||
   3018 	    vtoc->v_part[0].p_size > vtoc->v_part[VD_ENTIRE_DISK_SLICE].p_size)
   3019 			return (B_FALSE);
   3020 
   3021 	if (vd->vtoc.v_part[0].p_size >= 4 * csize &&
   3022 	    vtoc->v_part[0].p_size < vd->vtoc.v_part[0].p_size - 4 *csize)
   3023 			return (B_FALSE);
   3024 
   3025 	/* any other slice should have a size of 0 */
   3026 	for (i = 1; i < vtoc->v_nparts; i++) {
   3027 		if (i != VD_ENTIRE_DISK_SLICE &&
   3028 		    vtoc->v_part[i].p_size != 0)
   3029 			return (B_FALSE);
   3030 	}
   3031 
   3032 	return (B_TRUE);
   3033 }
   3034 
   3035 /*
   3036  * Handle ioctls to a disk slice.
   3037  *
   3038  * Return Values
   3039  *	0	- Indicates that there are no errors in disk operations
   3040  *	ENOTSUP	- Unknown disk label type or unsupported DKIO ioctl
   3041  *	EINVAL	- Not enough room to copy the EFI label
   3042  *
   3043  */
   3044 static int
   3045 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
   3046 {
   3047 	dk_efi_t *dk_ioc;
   3048 	struct extvtoc *vtoc;
   3049 	struct dk_geom *geom;
   3050 	size_t len, lba;
   3051 
   3052 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
   3053 
   3054 	if (cmd == DKIOCFLUSHWRITECACHE)
   3055 		return (vd_flush_write(vd));
   3056 
   3057 	switch (vd->vdisk_label) {
   3058 
   3059 	/* ioctls for a single slice disk with a VTOC label */
   3060 	case VD_DISK_LABEL_VTOC:
   3061 
   3062 		switch (cmd) {
   3063 
   3064 		case DKIOCGGEOM:
   3065 			ASSERT(ioctl_arg != NULL);
   3066 			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
   3067 			return (0);
   3068 
   3069 		case DKIOCGEXTVTOC:
   3070 			ASSERT(ioctl_arg != NULL);
   3071 			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
   3072 			return (0);
   3073 
   3074 		case DKIOCSGEOM:
   3075 			ASSERT(ioctl_arg != NULL);
   3076 			if (vd_slice_single_slice)
   3077 				return (ENOTSUP);
   3078 
   3079 			/* fake success only if new geometry is valid */
   3080 			geom = (struct dk_geom *)ioctl_arg;
   3081 			if (!vd_slice_geom_isvalid(vd, geom))
   3082 				return (EINVAL);
   3083 
   3084 			return (0);
   3085 
   3086 		case DKIOCSEXTVTOC:
   3087 			ASSERT(ioctl_arg != NULL);
   3088 			if (vd_slice_single_slice)
   3089 				return (ENOTSUP);
   3090 
   3091 			/* fake sucess only if the new vtoc is valid */
   3092 			vtoc = (struct extvtoc *)ioctl_arg;
   3093 			if (!vd_slice_vtoc_isvalid(vd, vtoc))
   3094 				return (EINVAL);
   3095 
   3096 			return (0);
   3097 
   3098 		default:
   3099 			return (ENOTSUP);
   3100 		}
   3101 
   3102 	/* ioctls for a single slice disk with an EFI label */
   3103 	case VD_DISK_LABEL_EFI:
   3104 
   3105 		if (cmd != DKIOCGETEFI && cmd != DKIOCSETEFI)
   3106 			return (ENOTSUP);
   3107 
   3108 		ASSERT(ioctl_arg != NULL);
   3109 		dk_ioc = (dk_efi_t *)ioctl_arg;
   3110 
   3111 		len = dk_ioc->dki_length;
   3112 		lba = dk_ioc->dki_lba;
   3113 
   3114 		if ((lba != VD_EFI_LBA_GPT && lba != VD_EFI_LBA_GPE) ||
   3115 		    (lba == VD_EFI_LBA_GPT && len < sizeof (efi_gpt_t)) ||
   3116 		    (lba == VD_EFI_LBA_GPE && len < sizeof (efi_gpe_t)))
   3117 			return (EINVAL);
   3118 
   3119 		switch (cmd) {
   3120 		case DKIOCGETEFI:
   3121 			len = vd_slice_flabel_read(vd,
   3122 			    (caddr_t)dk_ioc->dki_data,
   3123 			    lba * vd->vdisk_bsize, len);
   3124 
   3125 			ASSERT(len > 0);
   3126 
   3127 			return (0);
   3128 
   3129 		case DKIOCSETEFI:
   3130 			if (vd_slice_single_slice)
   3131 				return (ENOTSUP);
   3132 
   3133 			/* we currently don't support writing EFI */
   3134 			return (EIO);
   3135 		}
   3136 
   3137 	default:
   3138 		/* Unknown disk label type */
   3139 		return (ENOTSUP);
   3140 	}
   3141 }
   3142 
   3143 static int
   3144 vds_efi_alloc_and_read(vd_t *vd, efi_gpt_t **gpt, efi_gpe_t **gpe)
   3145 {
   3146 	vd_efi_dev_t edev;
   3147 	int status;
   3148 
   3149 	VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl);
   3150 
   3151 	status = vd_efi_alloc_and_read(&edev, gpt, gpe);
   3152 
   3153 	return (status);
   3154 }
   3155 
   3156 static void
   3157 vds_efi_free(vd_t *vd, efi_gpt_t *gpt, efi_gpe_t *gpe)
   3158 {
   3159 	vd_efi_dev_t edev;
   3160 
   3161 	VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl);
   3162 
   3163 	vd_efi_free(&edev, gpt, gpe);
   3164 }
   3165 
   3166 static int
   3167 vd_dskimg_validate_efi(vd_t *vd)
   3168 {
   3169 	efi_gpt_t *gpt;
   3170 	efi_gpe_t *gpe;
   3171 	int i, nparts, status;
   3172 	struct uuid efi_reserved = EFI_RESERVED;
   3173 
   3174 	if ((status = vds_efi_alloc_and_read(vd, &gpt, &gpe)) != 0)
   3175 		return (status);
   3176 
   3177 	bzero(&vd->vtoc, sizeof (struct extvtoc));
   3178 	bzero(&vd->dk_geom, sizeof (struct dk_geom));
   3179 	bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART);
   3180 
   3181 	vd->efi_reserved = -1;
   3182 
   3183 	nparts = gpt->efi_gpt_NumberOfPartitionEntries;
   3184 
   3185 	for (i = 0; i < nparts && i < VD_MAXPART; i++) {
   3186 
   3187 		if (gpe[i].efi_gpe_StartingLBA == 0 &&
   3188 		    gpe[i].efi_gpe_EndingLBA == 0) {
   3189 			continue;
   3190 		}
   3191 
   3192 		vd->slices[i].start = gpe[i].efi_gpe_StartingLBA;
   3193 		vd->slices[i].nblocks = gpe[i].efi_gpe_EndingLBA -
   3194 		    gpe[i].efi_gpe_StartingLBA + 1;
   3195 
   3196 		if (bcmp(&gpe[i].efi_gpe_PartitionTypeGUID, &efi_reserved,
   3197 		    sizeof (struct uuid)) == 0)
   3198 			vd->efi_reserved = i;
   3199 
   3200 	}
   3201 
   3202 	ASSERT(vd->vdisk_size != 0);
   3203 	vd->slices[VD_EFI_WD_SLICE].start = 0;
   3204 	vd->slices[VD_EFI_WD_SLICE].nblocks = vd->vdisk_size;
   3205 
   3206 	vds_efi_free(vd, gpt, gpe);
   3207 
   3208 	return (status);
   3209 }
   3210 
   3211 /*
   3212  * Function:
   3213  *	vd_dskimg_validate_geometry
   3214  *
   3215  * Description:
   3216  *	Read the label and validate the geometry of a disk image. The driver
   3217  *	label, vtoc and geometry information are updated according to the
   3218  *	label read from the disk image.
   3219  *
   3220  *	If no valid label is found, the label is set to unknown and the
   3221  *	function returns EINVAL, but a default vtoc and geometry are provided
   3222  *	to the driver. If an EFI label is found, ENOTSUP is returned.
   3223  *
   3224  * Parameters:
   3225  *	vd	- disk on which the operation is performed.
   3226  *
   3227  * Return Code:
   3228  *	0	- success.
   3229  *	EIO	- error reading the label from the disk image.
   3230  *	EINVAL	- unknown disk label.
   3231  *	ENOTSUP	- geometry not applicable (EFI label).
   3232  */
   3233 static int
   3234 vd_dskimg_validate_geometry(vd_t *vd)
   3235 {
   3236 	struct dk_label label;
   3237 	struct dk_geom *geom = &vd->dk_geom;
   3238 	struct extvtoc *vtoc = &vd->vtoc;
   3239 	int i;
   3240 	int status = 0;
   3241 
   3242 	ASSERT(VD_DSKIMG(vd));
   3243 
   3244 	if (VD_DSKIMG_LABEL_READ(vd, &label) < 0)
   3245 		return (EIO);
   3246 
   3247 	if (label.dkl_magic != DKL_MAGIC ||
   3248 	    label.dkl_cksum != vd_lbl2cksum(&label) ||
   3249 	    (vd_dskimg_validate_sanity &&
   3250 	    label.dkl_vtoc.v_sanity != VTOC_SANE) ||
   3251 	    label.dkl_vtoc.v_nparts != V_NUMPAR) {
   3252 
   3253 		if (vd_dskimg_validate_efi(vd) == 0) {
   3254 			vd->vdisk_label = VD_DISK_LABEL_EFI;
   3255 			return (ENOTSUP);
   3256 		}
   3257 
   3258 		vd->vdisk_label = VD_DISK_LABEL_UNK;
   3259 		vd_build_default_label(vd->dskimg_size, vd->vdisk_bsize,
   3260 		    &label);
   3261 		status = EINVAL;
   3262 	} else {
   3263 		vd->vdisk_label = VD_DISK_LABEL_VTOC;
   3264 	}
   3265 
   3266 	/* Update the driver geometry and vtoc */
   3267 	vd_label_to_vtocgeom(&label, vtoc, geom);
   3268 
   3269 	/* Update logical partitions */
   3270 	bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART);
   3271 	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
   3272 		for (i = 0; i < vtoc->v_nparts; i++) {
   3273 			vd->slices[i].start = vtoc->v_part[i].p_start;
   3274 			vd->slices[i].nblocks = vtoc->v_part[i].p_size;
   3275 		}
   3276 	}
   3277 
   3278 	return (status);
   3279 }
   3280 
   3281 /*
   3282  * Handle ioctls to a disk image.
   3283  *
   3284  * Return Values
   3285  *	0	- Indicates that there are no errors
   3286  *	!= 0	- Disk operation returned an error
   3287  */
   3288 static int
   3289 vd_do_dskimg_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
   3290 {
   3291 	struct dk_label label;
   3292 	struct dk_geom *geom;
   3293 	struct extvtoc *vtoc;
   3294 	dk_efi_t *efi;
   3295 	int rc;
   3296 
   3297 	ASSERT(VD_DSKIMG(vd));
   3298 
   3299 	switch (cmd) {
   3300 
   3301 	case DKIOCGGEOM:
   3302 		ASSERT(ioctl_arg != NULL);
   3303 		geom = (struct dk_geom *)ioctl_arg;
   3304 
   3305 		rc = vd_dskimg_validate_geometry(vd);
   3306 		if (rc != 0 && rc != EINVAL)
   3307 			return (rc);
   3308 		bcopy(&vd->dk_geom, geom, sizeof (struct dk_geom));
   3309 		return (0);
   3310 
   3311 	case DKIOCGEXTVTOC:
   3312 		ASSERT(ioctl_arg != NULL);
   3313 		vtoc = (struct extvtoc *)ioctl_arg;
   3314 
   3315 		rc = vd_dskimg_validate_geometry(vd);
   3316 		if (rc != 0 && rc != EINVAL)
   3317 			return (rc);
   3318 		bcopy(&vd->vtoc, vtoc, sizeof (struct extvtoc));
   3319 		return (0);
   3320 
   3321 	case DKIOCSGEOM:
   3322 		ASSERT(ioctl_arg != NULL);
   3323 		geom = (struct dk_geom *)ioctl_arg;
   3324 
   3325 		if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0)
   3326 			return (EINVAL);
   3327 
   3328 		/*
   3329 		 * The current device geometry is not updated, just the driver
   3330 		 * "notion" of it. The device geometry will be effectively
   3331 		 * updated when a label is written to the device during a next
   3332 		 * DKIOCSEXTVTOC.
   3333 		 */
   3334 		bcopy(ioctl_arg, &vd->dk_geom, sizeof (vd->dk_geom));
   3335 		return (0);
   3336 
   3337 	case DKIOCSEXTVTOC:
   3338 		ASSERT(ioctl_arg != NULL);
   3339 		ASSERT(vd->dk_geom.dkg_nhead != 0 &&
   3340 		    vd->dk_geom.dkg_nsect != 0);
   3341 		vtoc = (struct extvtoc *)ioctl_arg;
   3342 
   3343 		if (vtoc->v_sanity != VTOC_SANE ||
   3344 		    vtoc->v_sectorsz != DEV_BSIZE ||
   3345 		    vtoc->v_nparts != V_NUMPAR)
   3346 			return (EINVAL);
   3347 
   3348 		vd_vtocgeom_to_label(vtoc, &vd->dk_geom, &label);
   3349 
   3350 		/* write label to the disk image */
   3351 		if ((rc = vd_dskimg_set_vtoc(vd, &label)) != 0)
   3352 			return (rc);
   3353 
   3354 		break;
   3355 
   3356 	case DKIOCFLUSHWRITECACHE:
   3357 		return (vd_flush_write(vd));
   3358 
   3359 	case DKIOCGETEFI:
   3360 		ASSERT(ioctl_arg != NULL);
   3361 		efi = (dk_efi_t *)ioctl_arg;
   3362 
   3363 		if (vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BREAD,
   3364 		    (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0)
   3365 			return (EIO);
   3366 
   3367 		return (0);
   3368 
   3369 	case DKIOCSETEFI:
   3370 		ASSERT(ioctl_arg != NULL);
   3371 		efi = (dk_efi_t *)ioctl_arg;
   3372 
   3373 		if (vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
   3374 		    (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0)
   3375 			return (EIO);
   3376 
   3377 		break;
   3378 
   3379 
   3380 	default:
   3381 		return (ENOTSUP);
   3382 	}
   3383 
   3384 	ASSERT(cmd == DKIOCSEXTVTOC || cmd == DKIOCSETEFI);
   3385 
   3386 	/* label has changed, revalidate the geometry */
   3387 	(void) vd_dskimg_validate_geometry(vd);
   3388 
   3389 	/*
   3390 	 * The disk geometry may have changed, so we need to write
   3391 	 * the devid (if there is one) so that it is stored at the
   3392 	 * right location.
   3393 	 */
   3394 	if (vd_dskimg_write_devid(vd, vd->dskimg_devid) != 0) {
   3395 		PR0("Fail to write devid");
   3396 	}
   3397 
   3398 	return (0);
   3399 }
   3400 
   3401 static int
   3402 vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg)
   3403 {
   3404 	int rval = 0, status;
   3405 	struct vtoc vtoc;
   3406 
   3407 	/*
   3408 	 * Call the appropriate function to execute the ioctl depending
   3409 	 * on the type of vdisk.
   3410 	 */
   3411 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
   3412 
   3413 		/* slice, file or volume exported as a single slice disk */
   3414 		status = vd_do_slice_ioctl(vd, cmd, arg);
   3415 
   3416 	} else if (VD_DSKIMG(vd)) {
   3417 
   3418 		/* file or volume exported as a full disk */
   3419 		status = vd_do_dskimg_ioctl(vd, cmd, arg);
   3420 
   3421 	} else {
   3422 
   3423 		/* disk device exported as a full disk */
   3424 		status = ldi_ioctl(vd->ldi_handle[0], cmd, (intptr_t)arg,
   3425 		    vd->open_flags | FKIOCTL, kcred, &rval);
   3426 
   3427 		/*
   3428 		 * By default VTOC ioctls are done using ioctls for the
   3429 		 * extended VTOC. Some drivers (in particular non-Sun drivers)
   3430 		 * may not support these ioctls. In that case, we fallback to
   3431 		 * the regular VTOC ioctls.
   3432 		 */
   3433 		if (status == ENOTTY) {
   3434 			switch (cmd) {
   3435 
   3436 			case DKIOCGEXTVTOC:
   3437 				cmd = DKIOCGVTOC;
   3438 				status = ldi_ioctl(vd->ldi_handle[0], cmd,
   3439 				    (intptr_t)&vtoc, vd->open_flags | FKIOCTL,
   3440 				    kcred, &rval);
   3441 				vtoctoextvtoc(vtoc,
   3442 				    (*(struct extvtoc *)(void *)arg));
   3443 				break;
   3444 
   3445 			case DKIOCSEXTVTOC:
   3446 				cmd = DKIOCSVTOC;
   3447 				extvtoctovtoc((*(struct extvtoc *)(void *)arg),
   3448 				    vtoc);
   3449 				status = ldi_ioctl(vd->ldi_handle[0], cmd,
   3450 				    (intptr_t)&vtoc, vd->open_flags | FKIOCTL,
   3451 				    kcred, &rval);
   3452 				break;
   3453 			}
   3454 		}
   3455 	}
   3456 
   3457 #ifdef DEBUG
   3458 	if (rval != 0) {
   3459 		PR0("ioctl %x set rval = %d, which is not being returned"
   3460 		    " to caller", cmd, rval);
   3461 	}
   3462 #endif /* DEBUG */
   3463 
   3464 	return (status);
   3465 }
   3466 
   3467 /*
   3468  * Description:
   3469  *	This is the function that processes the ioctl requests (farming it
   3470  *	out to functions that handle slices, files or whole disks)
   3471  *
   3472  * Return Values
   3473  *     0		- ioctl operation completed successfully
   3474  *     != 0		- The LDC error value encountered
   3475  *			  (propagated back up the call stack as a task error)
   3476  *
   3477  * Side Effect
   3478  *     sets request->status to the return value of the ioctl function.
   3479  */
   3480 static int
   3481 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
   3482 {
   3483 	int	status = 0;
   3484 	size_t	nbytes = request->nbytes;	/* modifiable copy */
   3485 
   3486 
   3487 	ASSERT(request->slice < vd->nslices);
   3488 	PR0("Performing %s", ioctl->operation_name);
   3489 
   3490 	/* Get data from client and convert, if necessary */
   3491 	if (ioctl->copyin != NULL)  {
   3492 		ASSERT(nbytes != 0 && buf != NULL);
   3493 		PR1("Getting \"arg\" data from client");
   3494 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
   3495 		    request->cookie, request->ncookies,
   3496 		    LDC_COPY_IN)) != 0) {
   3497 			PR0("ldc_mem_copy() returned errno %d "
   3498 			    "copying from client", status);
   3499 			return (status);
   3500 		}
   3501 
   3502 		/* Convert client's data, if necessary */
   3503 		if (ioctl->copyin == VD_IDENTITY_IN) {
   3504 			/* use client buffer */
   3505 			ioctl->arg = buf;
   3506 		} else {
   3507 			/* convert client vdisk operation data to ioctl data */
   3508 			status = (ioctl->copyin)(buf, nbytes,
   3509 			    (void *)ioctl->arg);
   3510 			if (status != 0) {
   3511 				request->status = status;
   3512 				return (0);
   3513 			}
   3514 		}
   3515 	}
   3516 
   3517 	if (ioctl->operation == VD_OP_SCSICMD) {
   3518 		struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl->arg;
   3519 
   3520 		/* check write permission */
   3521 		if (!(vd->open_flags & FWRITE) &&
   3522 		    !(uscsi->uscsi_flags & USCSI_READ)) {
   3523 			PR0("uscsi fails because backend is opened read-only");
   3524 			request->status = EROFS;
   3525 			return (0);
   3526 		}
   3527 	}
   3528 
   3529 	/*
   3530 	 * Send the ioctl to the disk backend.
   3531 	 */
   3532 	request->status = vd_backend_ioctl(vd, ioctl->cmd, ioctl->arg);
   3533 
   3534 	if (request->status != 0) {
   3535 		PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status);
   3536 		if (ioctl->operation == VD_OP_SCSICMD &&
   3537 		    ((struct uscsi_cmd *)ioctl->arg)->uscsi_status != 0)
   3538 			/*
   3539 			 * USCSICMD has reported an error and the uscsi_status
   3540 			 * field is not zero. This means that the SCSI command
   3541 			 * has completed but it has an error. So we should
   3542 			 * mark the VD operation has succesfully completed
   3543 			 * and clients can check the SCSI status field for
   3544 			 * SCSI errors.
   3545 			 */
   3546 			request->status = 0;
   3547 		else
   3548 			return (0);
   3549 	}
   3550 
   3551 	/* Convert data and send to client, if necessary */
   3552 	if (ioctl->copyout != NULL)  {
   3553 		ASSERT(nbytes != 0 && buf != NULL);
   3554 		PR1("Sending \"arg\" data to client");
   3555 
   3556 		/* Convert ioctl data to vdisk operation data, if necessary */
   3557 		if (ioctl->copyout != VD_IDENTITY_OUT)
   3558 			(ioctl->copyout)((void *)ioctl->arg, buf);
   3559 
   3560 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
   3561 		    request->cookie, request->ncookies,
   3562 		    LDC_COPY_OUT)) != 0) {
   3563 			PR0("ldc_mem_copy() returned errno %d "
   3564 			    "copying to client", status);
   3565 			return (status);
   3566 		}
   3567 	}
   3568 
   3569 	return (status);
   3570 }
   3571 
   3572 #define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
   3573 
   3574 /*
   3575  * Description:
   3576  *	This generic function is called by the task queue to complete
   3577  *	the processing of the tasks. The specific completion function
   3578  *	is passed in as a field in the task pointer.
   3579  *
   3580  * Parameters:
   3581  *	arg 	- opaque pointer to structure containing task to be completed
   3582  *
   3583  * Return Values
   3584  *	None
   3585  */
   3586 static void
   3587 vd_complete(void *arg)
   3588 {
   3589 	vd_task_t	*task = (vd_task_t *)arg;
   3590 
   3591 	ASSERT(task != NULL);
   3592 	ASSERT(task->status == EINPROGRESS);
   3593 	ASSERT(task->completef != NULL);
   3594 
   3595 	task->status = task->completef(task);
   3596 	if (task->status)
   3597 		PR0("%s: Error %d completing task", __func__, task->status);
   3598 
   3599 	/* Now notify the vDisk client */
   3600 	vd_complete_notify(task);
   3601 }
   3602 
   3603 static int
   3604 vd_ioctl(vd_task_t *task)
   3605 {
   3606 	int			i, status;
   3607 	void			*buf = NULL;
   3608 	struct dk_geom		dk_geom = {0};
   3609 	struct extvtoc		vtoc = {0};
   3610 	struct dk_efi		dk_efi = {0};
   3611 	struct uscsi_cmd	uscsi = {0};
   3612 	vd_t			*vd		= task->vd;
   3613 	vd_dring_payload_t	*request	= task->request;
   3614 	vd_ioctl_t		ioctl[] = {
   3615 		/* Command (no-copy) operations */
   3616 		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
   3617 		    DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
   3618 		    NULL, NULL, NULL, B_TRUE},
   3619 
   3620 		/* "Get" (copy-out) operations */
   3621 		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int),
   3622 		    DKIOCGETWCE, STRINGIZE(DKIOCGETWCE),
   3623 		    NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_FALSE},
   3624 		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM),
   3625 		    RNDSIZE(vd_geom_t),
   3626 		    DKIOCGGEOM, STRINGIZE(DKIOCGGEOM),
   3627 		    &dk_geom, NULL, dk_geom2vd_geom, B_FALSE},
   3628 		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t),
   3629 		    DKIOCGEXTVTOC, STRINGIZE(DKIOCGEXTVTOC),
   3630 		    &vtoc, NULL, vtoc2vd_vtoc, B_FALSE},
   3631 		{VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t),
   3632 		    DKIOCGETEFI, STRINGIZE(DKIOCGETEFI),
   3633 		    &dk_efi, vd_get_efi_in, vd_get_efi_out, B_FALSE},
   3634 
   3635 		/* "Set" (copy-in) operations */
   3636 		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int),
   3637 		    DKIOCSETWCE, STRINGIZE(DKIOCSETWCE),
   3638 		    NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_TRUE},
   3639 		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM),
   3640 		    RNDSIZE(vd_geom_t),
   3641 		    DKIOCSGEOM, STRINGIZE(DKIOCSGEOM),
   3642 		    &dk_geom, vd_geom2dk_geom, NULL, B_TRUE},
   3643 		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t),
   3644 		    DKIOCSEXTVTOC, STRINGIZE(DKIOCSEXTVTOC),
   3645 		    &vtoc, vd_vtoc2vtoc, NULL, B_TRUE},
   3646 		{VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t),
   3647 		    DKIOCSETEFI, STRINGIZE(DKIOCSETEFI),
   3648 		    &dk_efi, vd_set_efi_in, vd_set_efi_out, B_TRUE},
   3649 
   3650 		{VD_OP_SCSICMD, STRINGIZE(VD_OP_SCSICMD), RNDSIZE(vd_scsi_t),
   3651 		    USCSICMD, STRINGIZE(USCSICMD),
   3652 		    &uscsi, vd_scsicmd_in, vd_scsicmd_out, B_FALSE},
   3653 	};
   3654 	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));
   3655 
   3656 
   3657 	ASSERT(vd != NULL);
   3658 	ASSERT(request != NULL);
   3659 	ASSERT(request->slice < vd->nslices);
   3660 
   3661 	/*
   3662 	 * Determine ioctl corresponding to caller's "operation" and
   3663 	 * validate caller's "nbytes"
   3664 	 */
   3665 	for (i = 0; i < nioctls; i++) {
   3666 		if (request->operation == ioctl[i].operation) {
   3667 			/* LDC memory operations require 8-byte multiples */
   3668 			ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0);
   3669 
   3670 			if (request->operation == VD_OP_GET_EFI ||
   3671 			    request->operation == VD_OP_SET_EFI ||
   3672 			    request->operation == VD_OP_SCSICMD) {
   3673 				if (request->nbytes >= ioctl[i].nbytes)
   3674 					break;
   3675 				PR0("%s:  Expected at least nbytes = %lu, "
   3676 				    "got %lu", ioctl[i].operation_name,
   3677 				    ioctl[i].nbytes, request->nbytes);
   3678 				return (EINVAL);
   3679 			}
   3680 
   3681 			if (request->nbytes != ioctl[i].nbytes) {
   3682 				PR0("%s:  Expected nbytes = %lu, got %lu",
   3683 				    ioctl[i].operation_name, ioctl[i].nbytes,
   3684 				    request->nbytes);
   3685 				return (EINVAL);
   3686 			}
   3687 
   3688 			break;
   3689 		}
   3690 	}
   3691 	ASSERT(i < nioctls);	/* because "operation" already validated */
   3692 
   3693 	if (!(vd->open_flags & FWRITE) && ioctl[i].write) {
   3694 		PR0("%s fails because backend is opened read-only",
   3695 		    ioctl[i].operation_name);
   3696 		request->status = EROFS;
   3697 		return (0);
   3698 	}
   3699 
   3700 	if (request->nbytes)
   3701 		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
   3702 	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
   3703 	if (request->nbytes)
   3704 		kmem_free(buf, request->nbytes);
   3705 
   3706 	return (status);
   3707 }
   3708 
   3709 static int
   3710 vd_get_devid(vd_task_t *task)
   3711 {
   3712 	vd_t *vd = task->vd;
   3713 	vd_dring_payload_t *request = task->request;
   3714 	vd_devid_t *vd_devid;
   3715 	impl_devid_t *devid;
   3716 	int status, bufid_len, devid_len, len, sz;
   3717 	int bufbytes;
   3718 
   3719 	PR1("Get Device ID, nbytes=%ld", request->nbytes);
   3720 
   3721 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
   3722 		/*
   3723 		 * We don't support devid for single-slice disks because we
   3724 		 * have no space to store a fabricated devid and for physical
   3725 		 * disk slices, we can't use the devid of the disk otherwise
   3726 		 * exporting multiple slices from the same disk will produce
   3727 		 * the same devids.
   3728 		 */
   3729 		PR2("No Device ID for slices");
   3730 		request->status = ENOTSUP;
   3731 		return (0);
   3732 	}
   3733 
   3734 	if (VD_DSKIMG(vd)) {
   3735 		if (vd->dskimg_devid == NULL) {
   3736 			PR2("No Device ID");
   3737 			request->status = ENOENT;
   3738 			return (0);
   3739 		} else {
   3740 			sz = ddi_devid_sizeof(vd->dskimg_devid);
   3741 			devid = kmem_alloc(sz, KM_SLEEP);
   3742 			bcopy(vd->dskimg_devid, devid, sz);
   3743 		}
   3744 	} else {
   3745 		if (ddi_lyr_get_devid(vd->dev[request->slice],
   3746 		    (ddi_devid_t *)&devid) != DDI_SUCCESS) {
   3747 			PR2("No Device ID");
   3748 			request->status = ENOENT;
   3749 			return (0);
   3750 		}
   3751 	}
   3752 
   3753 	bufid_len = request->nbytes - sizeof (vd_devid_t) + 1;
   3754 	devid_len = DEVID_GETLEN(devid);
   3755 
   3756 	/*
   3757 	 * Save the buffer size here for use in deallocation.
   3758 	 * The actual number of bytes copied is returned in
   3759 	 * the 'nbytes' field of the request structure.
   3760 	 */
   3761 	bufbytes = request->nbytes;
   3762 
   3763 	vd_devid = kmem_zalloc(bufbytes, KM_SLEEP);
   3764 	vd_devid->length = devid_len;
   3765 	vd_devid->type = DEVID_GETTYPE(devid);
   3766 
   3767 	len = (devid_len > bufid_len)? bufid_len : devid_len;
   3768 
   3769 	bcopy(devid->did_id, vd_devid->id, len);
   3770 
   3771 	request->status = 0;
   3772 
   3773 	/* LDC memory operations require 8-byte multiples */
   3774 	ASSERT(request->nbytes % sizeof (uint64_t) == 0);
   3775 
   3776 	if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0,
   3777 	    &request->nbytes, request->cookie, request->ncookies,
   3778 	    LDC_COPY_OUT)) != 0) {
   3779 		PR0("ldc_mem_copy() returned errno %d copying to client",
   3780 		    status);
   3781 	}
   3782 	PR1("post mem_copy: nbytes=%ld", request->nbytes);
   3783 
   3784 	kmem_free(vd_devid, bufbytes);
   3785 	ddi_devid_free((ddi_devid_t)devid);
   3786 
   3787 	return (status);
   3788 }
   3789 
   3790 static int
   3791 vd_scsi_reset(vd_t *vd)
   3792 {
   3793 	int rval, status;
   3794 	struct uscsi_cmd uscsi = { 0 };
   3795 
   3796 	uscsi.uscsi_flags = vd_scsi_debug | USCSI_RESET;
   3797 	uscsi.uscsi_timeout = vd_scsi_rdwr_timeout;
   3798 
   3799 	status = ldi_ioctl(vd->ldi_handle[0], USCSICMD, (intptr_t)&uscsi,
   3800 	    (vd->open_flags | FKIOCTL), kcred, &rval);
   3801 
   3802 	return (status);
   3803 }
   3804 
   3805 static int
   3806 vd_reset(vd_task_t *task)
   3807 {
   3808 	vd_t *vd = task->vd;
   3809 	vd_dring_payload_t *request = task->request;
   3810 
   3811 	ASSERT(request->operation == VD_OP_RESET);
   3812 	ASSERT(vd->scsi);
   3813 
   3814 	PR0("Performing VD_OP_RESET");
   3815 
   3816 	if (request->nbytes != 0) {
   3817 		PR0("VD_OP_RESET:  Expected nbytes = 0, got %lu",
   3818 		    request->nbytes);
   3819 		return (EINVAL);
   3820 	}
   3821 
   3822 	request->status = vd_scsi_reset(vd);
   3823 
   3824 	return (0);
   3825 }
   3826 
   3827 static int
   3828 vd_get_capacity(vd_task_t *task)
   3829 {
   3830 	int rv;
   3831 	size_t nbytes;
   3832 	vd_t *vd = task->vd;
   3833 	vd_dring_payload_t *request = task->request;
   3834 	vd_capacity_t vd_cap = { 0 };
   3835 
   3836 	ASSERT(request->operation == VD_OP_GET_CAPACITY);
   3837 
   3838 	PR0("Performing VD_OP_GET_CAPACITY");
   3839 
   3840 	nbytes = request->nbytes;
   3841 
   3842 	if (nbytes != RNDSIZE(vd_capacity_t)) {
   3843 		PR0("VD_OP_GET_CAPACITY:  Expected nbytes = %lu, got %lu",
   3844 		    RNDSIZE(vd_capacity_t), nbytes);
   3845 		return (EINVAL);
   3846 	}
   3847 
   3848 	/*
   3849 	 * Check the backend size in case it has changed. If the check fails
   3850 	 * then we will return the last known size.
   3851 	 */
   3852 
   3853 	(void) vd_backend_check_size(vd);
   3854 	ASSERT(vd->vdisk_size != 0);
   3855 
   3856 	request->status = 0;
   3857 
   3858 	vd_cap.vdisk_block_size = vd->vdisk_bsize;
   3859 	vd_cap.vdisk_size = vd->vdisk_size;
   3860 
   3861 	if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&vd_cap, 0, &nbytes,
   3862 	    request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) {
   3863 		PR0("ldc_mem_copy() returned errno %d copying to client", rv);
   3864 		return (rv);
   3865 	}
   3866 
   3867 	return (0);
   3868 }
   3869 
   3870 static int
   3871 vd_get_access(vd_task_t *task)
   3872 {
   3873 	uint64_t access;
   3874 	int rv, rval = 0;
   3875 	size_t nbytes;
   3876 	vd_t *vd = task->vd;
   3877 	vd_dring_payload_t *request = task->request;
   3878 
   3879 	ASSERT(request->operation == VD_OP_GET_ACCESS);
   3880 	ASSERT(vd->scsi);
   3881 
   3882 	PR0("Performing VD_OP_GET_ACCESS");
   3883 
   3884 	nbytes = request->nbytes;
   3885 
   3886 	if (nbytes != sizeof (uint64_t)) {
   3887 		PR0("VD_OP_GET_ACCESS:  Expected nbytes = %lu, got %lu",
   3888 		    sizeof (uint64_t), nbytes);
   3889 		return (EINVAL);
   3890 	}
   3891 
   3892 	request->status = ldi_ioctl(vd->ldi_handle[request->slice], MHIOCSTATUS,
   3893 	    NULL, (vd->open_flags | FKIOCTL), kcred, &rval);
   3894 
   3895 	if (request->status != 0)
   3896 		return (0);
   3897 
   3898 	access = (rval == 0)? VD_ACCESS_ALLOWED : VD_ACCESS_DENIED;
   3899 
   3900 	if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&access, 0, &nbytes,
   3901 	    request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) {
   3902 		PR0("ldc_mem_copy() returned errno %d copying to client", rv);
   3903 		return (rv);
   3904 	}
   3905 
   3906 	return (0);
   3907 }
   3908 
   3909 static int
   3910 vd_set_access(vd_task_t *task)
   3911 {
   3912 	uint64_t flags;
   3913 	int rv, rval;
   3914 	size_t nbytes;
   3915 	vd_t *vd = task->vd;
   3916 	vd_dring_payload_t *request = task->request;
   3917 
   3918 	ASSERT(request->operation == VD_OP_SET_ACCESS);
   3919 	ASSERT(vd->scsi);
   3920 
   3921 	nbytes = request->nbytes;
   3922 
   3923 	if (nbytes != sizeof (uint64_t)) {
   3924 		PR0("VD_OP_SET_ACCESS:  Expected nbytes = %lu, got %lu",
   3925 		    sizeof (uint64_t), nbytes);
   3926 		return (EINVAL);
   3927 	}
   3928 
   3929 	if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&flags, 0, &nbytes,
   3930 	    request->cookie, request->ncookies, LDC_COPY_IN)) != 0) {
   3931 		PR0("ldc_mem_copy() returned errno %d copying from client", rv);
   3932 		return (rv);
   3933 	}
   3934 
   3935 	if (flags == VD_ACCESS_SET_CLEAR) {
   3936 		PR0("Performing VD_OP_SET_ACCESS (CLEAR)");
   3937 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
   3938 		    MHIOCRELEASE, NULL, (vd->open_flags | FKIOCTL), kcred,
   3939 		    &rval);
   3940 		if (request->status == 0)
   3941 			vd->ownership = B_FALSE;
   3942 		return (0);
   3943 	}
   3944 
   3945 	/*
   3946 	 * As per the VIO spec, the PREEMPT and PRESERVE flags are only valid
   3947 	 * when the EXCLUSIVE flag is set.
   3948 	 */
   3949 	if (!(flags & VD_ACCESS_SET_EXCLUSIVE)) {
   3950 		PR0("Invalid VD_OP_SET_ACCESS flags: 0x%lx", flags);
   3951 		request->status = EINVAL;
   3952 		return (0);
   3953 	}
   3954 
   3955 	switch (flags & (VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE)) {
   3956 
   3957 	case VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE:
   3958 		/*
   3959 		 * Flags EXCLUSIVE and PREEMPT and PRESERVE. We have to
   3960 		 * acquire exclusive access rights, preserve them and we
   3961 		 * can use preemption. So we can use the MHIOCTKNOWN ioctl.
   3962 		 */
   3963 		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT|PRESERVE)");
   3964 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
   3965 		    MHIOCTKOWN, NULL, (vd->open_flags | FKIOCTL), kcred, &rval);
   3966 		break;
   3967 
   3968 	case VD_ACCESS_SET_PRESERVE:
   3969 		/*
   3970 		 * Flags EXCLUSIVE and PRESERVE. We have to acquire exclusive
   3971 		 * access rights and preserve them, but not preempt any other
   3972 		 * host. So we need to use the MHIOCTKOWN ioctl to enable the
   3973 		 * "preserve" feature but we can not called it directly
   3974 		 * because it uses preemption. So before that, we use the
   3975 		 * MHIOCQRESERVE ioctl to ensure we can get exclusive rights
   3976 		 * without preempting anyone.
   3977 		 */
   3978 		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PRESERVE)");
   3979 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
   3980 		    MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred,
   3981 		    &rval);
   3982 		if (request->status != 0)
   3983 			break;
   3984 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
   3985 		    MHIOCTKOWN, NULL, (vd->open_flags | FKIOCTL), kcred, &rval);
   3986 		break;
   3987 
   3988 	case VD_ACCESS_SET_PREEMPT:
   3989 		/*
   3990 		 * Flags EXCLUSIVE and PREEMPT. We have to acquire exclusive
   3991 		 * access rights and we can use preemption. So we try to do
   3992 		 * a SCSI reservation, if it fails we reset the disk to clear
   3993 		 * any reservation and we try to reserve again.
   3994 		 */
   3995 		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT)");
   3996 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
   3997 		    MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred,
   3998 		    &rval);
   3999 		if (request->status == 0)
   4000 			break;
   4001 
   4002 		/* reset the disk */
   4003 		(void) vd_scsi_reset(vd);
   4004 
   4005 		/* try again even if the reset has failed */
   4006 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
   4007 		    MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred,
   4008 		    &rval);
   4009 		break;
   4010 
   4011 	case 0:
   4012 		/* Flag EXCLUSIVE only. Just issue a SCSI reservation */
   4013 		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE)");
   4014 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
   4015 		    MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred,
   4016 		    &rval);
   4017 		break;
   4018 	}
   4019 
   4020 	if (request->status == 0)
   4021 		vd->ownership = B_TRUE;
   4022 	else
   4023 		PR0("VD_OP_SET_ACCESS: error %d", request->status);
   4024 
   4025 	return (0);
   4026 }
   4027 
   4028 static void
   4029 vd_reset_access(vd_t *vd)
   4030 {
   4031 	int status, rval;
   4032 
   4033 	if (vd->file || vd->volume || !vd->ownership)
   4034 		return;
   4035 
   4036 	PR0("Releasing disk ownership");
   4037 	status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, NULL,
   4038 	    (vd->open_flags | FKIOCTL), kcred, &rval);
   4039 
   4040 	/*
   4041 	 * An EACCES failure means that there is a reservation conflict,
   4042 	 * so we are not the owner of the disk anymore.
   4043 	 */
   4044 	if (status == 0 || status == EACCES) {
   4045 		vd->ownership = B_FALSE;
   4046 		return;
   4047 	}
   4048 
   4049 	PR0("Fail to release ownership, error %d", status);
   4050 
   4051 	/*
   4052 	 * We have failed to release the ownership, try to reset the disk
   4053 	 * to release reservations.
   4054 	 */
   4055 	PR0("Resetting disk");
   4056 	status = vd_scsi_reset(vd);
   4057 
   4058 	if (status != 0)
   4059 		PR0("Fail to reset disk, error %d", status);
   4060 
   4061 	/* whatever the result of the reset is, we try the release again */
   4062 	status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, NULL,
   4063 	    (vd->open_flags | FKIOCTL), kcred, &rval);
   4064 
   4065 	if (status == 0 || status == EACCES) {
   4066 		vd->ownership = B_FALSE;
   4067 		return;
   4068 	}
   4069 
   4070 	PR0("Fail to release ownership, error %d", status);
   4071 
   4072 	/*
   4073 	 * At this point we have done our best to try to reset the
   4074 	 * access rights to the disk and we don't know if we still
   4075 	 * own a reservation and if any mechanism to preserve the
   4076 	 * ownership is still in place. The ultimate solution would
   4077 	 * be to reset the system but this is usually not what we
   4078 	 * want to happen.
   4079 	 */
   4080 
   4081 	if (vd_reset_access_failure == A_REBOOT) {
   4082 		cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG
   4083 		    ", rebooting the system", vd->device_path);
   4084 		(void) uadmin(A_SHUTDOWN, AD_BOOT, NULL);
   4085 	} else if (vd_reset_access_failure == A_DUMP) {
   4086 		panic(VD_RESET_ACCESS_FAILURE_MSG, vd->device_path);
   4087 	}
   4088 
   4089 	cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG, vd->device_path);
   4090 }
   4091 
/*
 * Define the supported operations once the functions for performing them have
 * been defined.
 *
 * Each entry holds, in order: the operation's name as a string, its VD_OP_*
 * code, the function that starts the operation and the function that
 * completes it (NULL when the operation has no completion function).
 * vd_do_process_task() searches this table for the requested operation.
 */
static const vds_operation_t	vds_operation[] = {
/* X(op) expands to the stringified name of "op" followed by "op" itself */
#define	X(_s)	#_s, _s
	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_BWRITE),	vd_start_bio,	vd_complete_bio},
	{X(VD_OP_FLUSH),	vd_ioctl,	NULL},
	{X(VD_OP_GET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_SET_WCE),	vd_ioctl,	NULL},
	{X(VD_OP_GET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_SET_VTOC),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_SET_DISKGEOM),	vd_ioctl,	NULL},
	{X(VD_OP_GET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_SET_EFI),	vd_ioctl,	NULL},
	{X(VD_OP_GET_DEVID),	vd_get_devid,	NULL},
	{X(VD_OP_SCSICMD),	vd_ioctl,	NULL},
	{X(VD_OP_RESET),	vd_reset,	NULL},
	{X(VD_OP_GET_CAPACITY),	vd_get_capacity, NULL},
	{X(VD_OP_SET_ACCESS),	vd_set_access,	NULL},
	{X(VD_OP_GET_ACCESS),	vd_get_access,	NULL},
#undef	X
};

/* Number of entries in vds_operation[] */
static const size_t	vds_noperations =
	(sizeof (vds_operation))/(sizeof (vds_operation[0]));
   4120 
   4121 /*
   4122  * Process a task specifying a client I/O request
   4123  *
   4124  * Parameters:
   4125  *	task 		- structure containing the request sent from client
   4126  *
   4127  * Return Value
   4128  *	0	- success
   4129  *	ENOTSUP	- Unknown/Unsupported VD_OP_XXX operation
   4130  *	EINVAL	- Invalid disk slice
   4131  *	!= 0	- some other non-zero return value from start function
   4132  */
   4133 static int
   4134 vd_do_process_task(vd_task_t *task)
   4135 {
   4136 	int			i;
   4137 	vd_t			*vd		= task->vd;
   4138 	vd_dring_payload_t	*request	= task->request;
   4139 
   4140 	ASSERT(vd != NULL);
   4141 	ASSERT(request != NULL);
   4142 
   4143 	/* Find the requested operation */
   4144 	for (i = 0; i < vds_noperations; i++) {
   4145 		if (request->operation == vds_operation[i].operation) {
   4146 			/* all operations should have a start func */
   4147 			ASSERT(vds_operation[i].start != NULL);
   4148 
   4149 			task->completef = vds_operation[i].complete;
   4150 			break;
   4151 		}
   4152 	}
   4153 
   4154 	/*
   4155 	 * We need to check that the requested operation is permitted
   4156 	 * for the particular client that sent it or that the loop above
   4157 	 * did not complete without finding the operation type (indicating
   4158 	 * that the requested operation is unknown/unimplemented)
   4159 	 */
   4160 	if ((VD_OP_SUPPORTED(vd->operations, request->operation) == B_FALSE) ||
   4161 	    (i == vds_noperations)) {
   4162 		PR0("Unsupported operation %u", request->operation);
   4163 		request->status = ENOTSUP;
   4164 		return (0);
   4165 	}
   4166 
   4167 	/* Range-check slice */
   4168 	if (request->slice >= vd->nslices &&
   4169 	    ((vd->vdisk_type != VD_DISK_TYPE_DISK && vd_slice_single_slice) ||
   4170 	    request->slice != VD_SLICE_NONE)) {
   4171 		PR0("Invalid \"slice\" %u (max %u) for virtual disk",
   4172 		    request->slice, (vd->nslices - 1));
   4173 		request->status = EINVAL;
   4174 		return (0);
   4175 	}
   4176 
   4177 	/*
   4178 	 * Call the function pointer that starts the operation.
   4179 	 */
   4180 	return (vds_operation[i].start(task));
   4181 }
   4182 
   4183 /*
   4184  * Description:
   4185  *	This function is called by both the in-band and descriptor ring
   4186  *	message processing functions paths to actually execute the task
   4187  *	requested by the vDisk client. It in turn calls its worker
   4188  *	function, vd_do_process_task(), to carry our the request.
   4189  *
   4190  *	Any transport errors (e.g. LDC errors, vDisk protocol errors) are
   4191  *	saved in the 'status' field of the task and are propagated back
   4192  *	up the call stack to trigger a NACK
   4193  *
   4194  *	Any request errors (e.g. ENOTTY from an ioctl) are saved in
   4195  *	the 'status' field of the request and result in an ACK being sent
   4196  *	by the completion handler.
   4197  *
   4198  * Parameters:
   4199  *	task 		- structure containing the request sent from client
   4200  *
   4201  * Return Value
   4202  *	0		- successful synchronous request.
   4203  *	!= 0		- transport error (e.g. LDC errors, vDisk protocol)
   4204  *	EINPROGRESS	- task will be finished in a completion handler
   4205  */
   4206 static int
   4207 vd_process_task(vd_task_t *task)
   4208 {
   4209 	vd_t	*vd = task->vd;
   4210 	int	status;
   4211 
   4212 	DTRACE_PROBE1(task__start, vd_task_t *, task);
   4213 
   4214 	task->status =  vd_do_process_task(task);
   4215 
   4216 	/*
   4217 	 * If the task processing function returned EINPROGRESS indicating
   4218 	 * that the task needs completing then schedule a taskq entry to
   4219 	 * finish it now.
   4220 	 *
   4221 	 * Otherwise the task processing function returned either zero
   4222 	 * indicating that the task was finished in the start function (and we
   4223 	 * don't need to wait in a completion function) or the start function
   4224 	 * returned an error - in both cases all that needs to happen is the
   4225 	 * notification to the vDisk client higher up the call stack.
   4226 	 * If the task was using a Descriptor Ring, we need to mark it as done
   4227 	 * at this stage.
   4228 	 */
   4229 	if (task->status == EINPROGRESS) {
   4230 		/* Queue a task to complete the operation */
   4231 		(void) ddi_taskq_dispatch(vd->completionq, vd_complete,
   4232 		    task, DDI_SLEEP);
   4233 		return (EINPROGRESS);
   4234 	}
   4235 
   4236 	if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE_V1_0)) {
   4237 		/* Update the dring element if it's a dring client */
   4238 		status = vd_mark_elem_done(vd, task->index,
   4239 		    task->request->status, task->request->nbytes);
   4240 		if (status == ECONNRESET)
   4241 			vd_mark_in_reset(vd);
   4242 		else if (status == EACCES)
   4243 			vd_need_reset(vd, B_TRUE);
   4244 	}
   4245 
   4246 	return (task->status);
   4247 }
   4248 
   4249 /*
   4250  * Return true if the "type", "subtype", and "env" fields of the "tag" first
   4251  * argument match the corresponding remaining arguments; otherwise, return false
   4252  */
   4253 boolean_t
   4254 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
   4255 {
   4256 	return ((tag->vio_msgtype == type) &&
   4257 	    (tag->vio_subtype == subtype) &&
   4258 	    (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE;
   4259 }
   4260 
   4261 /*
   4262  * Check whether the major/minor version specified in "ver_msg" is supported
   4263  * by this server.
   4264  */
   4265 static boolean_t
   4266 vds_supported_version(vio_ver_msg_t *ver_msg)
   4267 {
   4268 	for (int i = 0; i < vds_num_versions; i++) {
   4269 		ASSERT(vds_version[i].major > 0);
   4270 		ASSERT((i == 0) ||
   4271 		    (vds_version[i].major < vds_version[i-1].major));
   4272 
   4273 		/*
   4274 		 * If the major versions match, adjust the minor version, if
   4275 		 * necessary, down to the highest value supported by this
   4276 		 * server and return true so this message will get "ack"ed;
   4277 		 * the client should also support all minor versions lower
   4278 		 * than the value it sent
   4279 		 */
   4280 		if (ver_msg->ver_major == vds_version[i].major) {
   4281 			if (ver_msg->ver_minor > vds_version[i].minor) {
   4282 				PR0("Adjusting minor version from %u to %u",
   4283 				    ver_msg->ver_minor, vds_version[i].minor);
   4284 				ver_msg->ver_minor = vds_version[i].minor;
   4285 			}
   4286 			return (B_TRUE);
   4287 		}
   4288 
   4289 		/*
   4290 		 * If the message contains a higher major version number, set
   4291 		 * the message's major/minor versions to the current values
   4292 		 * and return false, so this message will get "nack"ed with
   4293 		 * these values, and the client will potentially try again
   4294 		 * with the same or a lower version
   4295 		 */
   4296 		if (ver_msg->ver_major > vds_version[i].major) {
   4297 			ver_msg->ver_major = vds_version[i].major;
   4298 			ver_msg->ver_minor = vds_version[i].minor;
   4299 			return (B_FALSE);
   4300 		}
   4301 
   4302 		/*
   4303 		 * Otherwise, the message's major version is less than the
   4304 		 * current major version, so continue the loop to the next
   4305 		 * (lower) supported version
   4306 		 */
   4307 	}
   4308 
   4309 	/*
   4310 	 * No common version was found; "ground" the version pair in the
   4311 	 * message to terminate negotiation
   4312 	 */
   4313 	ver_msg->ver_major = 0;
   4314 	ver_msg->ver_minor = 0;
   4315 	return (B_FALSE);
   4316 }
   4317 
   4318 /*
   4319  * Process a version message from a client.  vds expects to receive version
   4320  * messages from clients seeking service, but never issues version messages
   4321  * itself; therefore, vds can ACK or NACK client version messages, but does
   4322  * not expect to receive version-message ACKs or NACKs (and will treat such
   4323  * messages as invalid).
   4324  */
   4325 static int
   4326 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
   4327 {
   4328 	vio_ver_msg_t	*ver_msg = (vio_ver_msg_t *)msg;
   4329 
   4330 
   4331 	ASSERT(msglen >= sizeof (msg->tag));
   4332 
   4333 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
   4334 	    VIO_VER_INFO)) {
   4335 		return (ENOMSG);	/* not a version message */
   4336 	}
   4337 
   4338 	if (msglen != sizeof (*ver_msg)) {
   4339 		PR0("Expected %lu-byte version message; "
   4340 		    "received %lu bytes", sizeof (*ver_msg), msglen);
   4341 		return (EBADMSG);
   4342 	}
   4343 
   4344 	if (ver_msg->dev_class != VDEV_DISK) {
   4345 		PR0("Expected device class %u (disk); received %u",
   4346 		    VDEV_DISK, ver_msg->dev_class);
   4347 		return (EBADMSG);
   4348 	}
   4349 
   4350 	/*
   4351 	 * We're talking to the expected kind of client; set our device class
   4352 	 * for "ack/nack" back to the client
   4353 	 */
   4354 	ver_msg->dev_class = VDEV_DISK_SERVER;
   4355 
   4356 	/*
   4357 	 * Check whether the (valid) version message specifies a version
   4358 	 * supported by this server.  If the version is not supported, return
   4359 	 * EBADMSG so the message will get "nack"ed; vds_supported_version()
   4360 	 * will have updated the message with a supported version for the
   4361 	 * client to consider
   4362 	 */
   4363 	if (!vds_supported_version(ver_msg))
   4364 		return (EBADMSG);
   4365 
   4366 
   4367 	/*
   4368 	 * A version has been agreed upon; use the client's SID for
   4369 	 * communication on this channel now
   4370 	 */
   4371 	ASSERT(!(vd->initialized & VD_SID));
   4372 	vd->sid = ver_msg->tag.vio_sid;
   4373 	vd->initialized |= VD_SID;
   4374 
   4375 	/*
   4376 	 * Store the negotiated major and minor version values in the "vd" data
   4377 	 * structure so that we can check if certain operations are supported
   4378 	 * by the client.
   4379 	 */
   4380 	vd->version.major = ver_msg->ver_major;
   4381 	vd->version.minor = ver_msg->ver_minor;
   4382 
   4383 	PR0("Using major version %u, minor version %u",
   4384 	    ver_msg->ver_major, ver_msg->ver_minor);
   4385 	return (0);
   4386 }
   4387 
   4388 static void
   4389 vd_set_exported_operations(vd_t *vd)
   4390 {
   4391 	vd->operations = 0;	/* clear field */
   4392 
   4393 	/*
   4394 	 * We need to check from the highest version supported to the
   4395 	 * lowest because versions with a higher minor number implicitly
   4396 	 * support versions with a lower minor number.
   4397 	 */
   4398 	if (vio_ver_is_supported(vd->version, 1, 1)) {
   4399 		ASSERT(vd->open_flags & FREAD);
   4400 		vd->operations |= VD_OP_MASK_READ | (1 << VD_OP_GET_CAPACITY);
   4401 
   4402 		if (vd->open_flags & FWRITE)
   4403 			vd->operations |= VD_OP_MASK_WRITE;
   4404 
   4405 		if (vd->scsi)
   4406 			vd->operations |= VD_OP_MASK_SCSI;
   4407 
   4408 		if (VD_DSKIMG(vd) && vd_dskimg_is_iso_image(vd)) {
   4409 			/*
   4410 			 * can't write to ISO images, make sure that write
   4411 			 * support is not set in case administrator did not
   4412 			 * use "options=ro" when doing an ldm add-vdsdev
   4413 			 */
   4414 			vd->operations &= ~VD_OP_MASK_WRITE;
   4415 		}
   4416 	} else if (vio_ver_is_supported(vd->version, 1, 0)) {
   4417 		vd->operations = VD_OP_MASK_READ | VD_OP_MASK_WRITE;
   4418 	}
   4419 
   4420 	/* we should have already agreed on a version */
   4421 	ASSERT(vd->operations != 0);
   4422 }
   4423 
   4424 static int
   4425 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
   4426 {
   4427 	vd_attr_msg_t	*attr_msg = (vd_attr_msg_t *)msg;
   4428 	int		status, retry = 0;
   4429 
   4430 
   4431 	ASSERT(msglen >= sizeof (msg->tag));
   4432 
   4433 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
   4434 	    VIO_ATTR_INFO)) {
   4435 		PR0("Message is not an attribute message");
   4436 		return (ENOMSG);
   4437 	}
   4438 
   4439 	if (msglen != sizeof (*attr_msg)) {
   4440 		PR0("Expected %lu-byte attribute message; "
   4441 		    "received %lu bytes", sizeof (*attr_msg), msglen);
   4442 		return (EBADMSG);
   4443 	}
   4444 
   4445 	if (attr_msg->max_xfer_sz == 0) {
   4446 		PR0("Received maximum transfer size of 0 from client");
   4447 		return (EBADMSG);
   4448 	}
   4449 
   4450 	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
   4451 	    (attr_msg->xfer_mode != VIO_DRING_MODE_V1_0)) {
   4452 		PR0("Client requested unsupported transfer mode");
   4453 		return (EBADMSG);
   4454 	}
   4455 
   4456 	/*
   4457 	 * check if the underlying disk is ready, if not try accessing
   4458 	 * the device again. Open the vdisk device and extract info
   4459 	 * about it, as this is needed to respond to the attr info msg
   4460 	 */
   4461 	if ((vd->initialized & VD_DISK_READY) == 0) {
   4462 		PR0("Retry setting up disk (%s)", vd->device_path);
   4463 		do {
   4464 			status = vd_setup_vd(vd);
   4465 			if (status != EAGAIN || ++retry > vds_dev_retries)
   4466 				break;
   4467 
   4468 			/* incremental delay */
   4469 			delay(drv_usectohz(vds_dev_delay));
   4470 
   4471 			/* if vdisk is no longer enabled - return error */
   4472 			if (!vd_enabled(vd))
   4473 				return (ENXIO);
   4474 
   4475 		} while (status == EAGAIN);
   4476 
   4477 		if (status)
   4478 			return (ENXIO);
   4479 
   4480 		vd->initialized |= VD_DISK_READY;
   4481 		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
   4482 		PR0("vdisk_type = %s, volume = %s, file = %s, nslices = %u",
   4483 		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
   4484 		    (vd->volume ? "yes" : "no"),
   4485 		    (vd->file ? "yes" : "no"),
   4486 		    vd->nslices);
   4487 	}
   4488 
   4489 	/* Success:  valid message and transfer mode */
   4490 	vd->xfer_mode = attr_msg->xfer_mode;
   4491 
   4492 	if (vd->xfer_mode == VIO_DESC_MODE) {
   4493 
   4494 		/*
   4495 		 * The vd_dring_inband_msg_t contains one cookie; need room
   4496 		 * for up to n-1 more cookies, where "n" is the number of full
   4497 		 * pages plus possibly one partial page required to cover
   4498 		 * "max_xfer_sz".  Add room for one more cookie if
   4499 		 * "max_xfer_sz" isn't an integral multiple of the page size.
   4500 		 * Must first get the maximum transfer size in bytes.
   4501 		 */
   4502 		size_t	max_xfer_bytes = attr_msg->vdisk_block_size ?
   4503 		    attr_msg->vdisk_block_size * attr_msg->max_xfer_sz :
   4504 		    attr_msg->max_xfer_sz;
   4505 		size_t	max_inband_msglen =
   4506 		    sizeof (vd_dring_inband_msg_t) +
   4507 		    ((max_xfer_bytes/PAGESIZE +
   4508 		    ((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
   4509 		    (sizeof (ldc_mem_cookie_t)));
   4510 
   4511 		/*
   4512 		 * Set the maximum expected message length to
   4513 		 * accommodate in-band-descriptor messages with all
   4514 		 * their cookies
   4515 		 */
   4516 		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
   4517 
   4518 		/*
   4519 		 * Initialize the data structure for processing in-band I/O
   4520 		 * request descriptors
   4521 		 */
   4522 		vd->inband_task.vd	= vd;
   4523 		vd->inband_task.msg	= kmem_alloc(vd->max_msglen, KM_SLEEP);
   4524 		vd->inband_task.index	= 0;
   4525 		vd->inband_task.type	= VD_FINAL_RANGE_TASK;	/* range == 1 */
   4526 	}
   4527 
   4528 	/* Return the device's block size and max transfer size to the client */
   4529 	attr_msg->vdisk_block_size	= vd->vdisk_bsize;
   4530 	attr_msg->max_xfer_sz		= vd->max_xfer_sz;
   4531 
   4532 	attr_msg->vdisk_size = vd->vdisk_size;
   4533 	attr_msg->vdisk_type = (vd_slice_single_slice)? vd->vdisk_type :
   4534 	    VD_DISK_TYPE_DISK;
   4535 	attr_msg->vdisk_media = vd->vdisk_media;
   4536 
   4537 	/* Discover and save the list of supported VD_OP_XXX operations */
   4538 	vd_set_exported_operations(vd);
   4539 	attr_msg->operations = vd->operations;
   4540 
   4541 	PR0("%s", VD_CLIENT(vd));
   4542 
   4543 	ASSERT(vd->dring_task == NULL);
   4544 
   4545 	return (0);
   4546 }
   4547 
   4548 static int
   4549 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
   4550 {
   4551 	int			status;
   4552 	size_t			expected;
   4553 	ldc_mem_info_t		dring_minfo;
   4554 	uint8_t			mtype;
   4555 	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;
   4556 
   4557 
   4558 	ASSERT(msglen >= sizeof (msg->tag));
   4559 
   4560 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
   4561 	    VIO_DRING_REG)) {
   4562 		PR0("Message is not a register-dring message");
   4563 		return (ENOMSG);
   4564 	}
   4565 
   4566 	if (msglen < sizeof (*reg_msg)) {
   4567 		PR0("Expected at least %lu-byte register-dring message; "
   4568 		    "received %lu bytes", sizeof (*reg_msg), msglen);
   4569 		return (EBADMSG);
   4570 	}
   4571 
   4572 	expected = sizeof (*reg_msg) +
   4573 	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
   4574 	if (msglen != expected) {
   4575 		PR0("Expected %lu-byte register-dring message; "
   4576 		    "received %lu bytes", expected, msglen);
   4577 		return (EBADMSG);
   4578 	}
   4579 
   4580 	if (vd->initialized & VD_DRING) {
   4581 		PR0("A dring was previously registered; only support one");
   4582 		return (EBADMSG);
   4583 	}
   4584 
   4585 	if (reg_msg->num_descriptors > INT32_MAX) {
   4586 		PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)",
   4587 		    reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX));
   4588 		return (EBADMSG);
   4589 	}
   4590 
   4591 	if (reg_msg->ncookies != 1) {
   4592 		/*
   4593 		 * In addition to fixing the assertion in the success case
   4594 		 * below, supporting drings which require more than one
   4595 		 * "cookie" requires increasing the value of vd->max_msglen
   4596 		 * somewhere in the code path prior to receiving the message
   4597 		 * which results in calling this function.  Note that without
   4598 		 * making this change, the larger message size required to
   4599 		 * accommodate multiple cookies cannot be successfully
   4600 		 * received, so this function will not even get called.
   4601 		 * Gracefully accommodating more dring cookies might
   4602 		 * reasonably demand exchanging an additional attribute or
   4603 		 * making a minor protocol adjustment
   4604 		 */
   4605 		PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
   4606 		return (EBADMSG);
   4607 	}
   4608 
   4609 	if (vd_direct_mapped_drings)
   4610 		mtype = LDC_DIRECT_MAP;
   4611 	else
   4612 		mtype = LDC_SHADOW_MAP;
   4613 
   4614 	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
   4615 	    reg_msg->ncookies, reg_msg->num_descriptors,
   4616 	    reg_msg->descriptor_size, mtype, &vd->dring_handle);
   4617 	if (status != 0) {
   4618 		PR0("ldc_mem_dring_map() returned errno %d", status);
   4619 		return (status);
   4620 	}
   4621 
   4622 	/*
   4623 	 * To remove the need for this assertion, must call
   4624 	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
   4625 	 * successful call to ldc_mem_dring_map()
   4626 	 */
   4627 	ASSERT(reg_msg->ncookies == 1);
   4628 
   4629 	if ((status =
   4630 	    ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
   4631 		PR0("ldc_mem_dring_info() returned errno %d", status);
   4632 		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
   4633 			PR0("ldc_mem_dring_unmap() returned errno %d", status);
   4634 		return (status);
   4635 	}
   4636 
   4637 	if (dring_minfo.vaddr == NULL) {
   4638 		PR0("Descriptor ring virtual address is NULL");
   4639 		return (ENXIO);
   4640 	}
   4641 
   4642 
   4643 	/* Initialize for valid message and mapped dring */
   4644 	vd->initialized |= VD_DRING;
   4645 	vd->dring_ident = 1;	/* "There Can Be Only One" */
   4646 	vd->dring = dring_minfo.vaddr;
   4647 	vd->descriptor_size = reg_msg->descriptor_size;
   4648 	vd->dring_len = reg_msg->num_descriptors;
   4649 	vd->dring_mtype = dring_minfo.mtype;
   4650 	reg_msg->dring_ident = vd->dring_ident;
   4651 	PR1("descriptor size = %u, dring length = %u",
   4652 	    vd->descriptor_size, vd->dring_len);
   4653 
   4654 	/*
   4655 	 * Allocate and initialize a "shadow" array of data structures for
   4656 	 * tasks to process I/O requests in dring elements
   4657 	 */
   4658 	vd->dring_task =
   4659 	    kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP);
   4660 	for (int i = 0; i < vd->dring_len; i++) {
   4661 		vd->dring_task[i].vd		= vd;
   4662 		vd->dring_task[i].index		= i;
   4663 
   4664 		status = ldc_mem_alloc_handle(vd->ldc_handle,
   4665 		    &(vd->dring_task[i].mhdl));
   4666 		if (status) {
   4667 			PR0("ldc_mem_alloc_handle() returned err %d ", status);
   4668 			return (ENXIO);
   4669 		}
   4670 
   4671 		/*
   4672 		 * The descriptor payload varies in length. Calculate its
   4673 		 * size by subtracting the header size from the total
   4674 		 * descriptor size.
   4675 		 */
   4676 		vd->dring_task[i].request = kmem_zalloc((vd->descriptor_size -
   4677 		    sizeof (vio_dring_entry_hdr_t)), KM_SLEEP);
   4678 		vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP);
   4679 	}
   4680 
   4681 	if (vd->file || vd->zvol) {
   4682 		vd->write_queue =
   4683 		    kmem_zalloc(sizeof (buf_t *) * vd->dring_len, KM_SLEEP);
   4684 	}
   4685 
   4686 	return (0);
   4687 }
   4688 
   4689 static int
   4690 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
   4691 {
   4692 	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;
   4693 
   4694 
   4695 	ASSERT(msglen >= sizeof (msg->tag));
   4696 
   4697 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
   4698 	    VIO_DRING_UNREG)) {
   4699 		PR0("Message is not an unregister-dring message");
   4700 		return (ENOMSG);
   4701 	}
   4702 
   4703 	if (msglen != sizeof (*unreg_msg)) {
   4704 		PR0("Expected %lu-byte unregister-dring message; "
   4705 		    "received %lu bytes", sizeof (*unreg_msg), msglen);
   4706 		return (EBADMSG);
   4707 	}
   4708 
   4709 	if (unreg_msg->dring_ident != vd->dring_ident) {
   4710 		PR0("Expected dring ident %lu; received %lu",
   4711 		    vd->dring_ident, unreg_msg->dring_ident);
   4712 		return (EBADMSG);
   4713 	}
   4714 
   4715 	return (0);
   4716 }
   4717 
   4718 static int
   4719 process_rdx_msg(vio_msg_t *msg, size_t msglen)
   4720 {
   4721 	ASSERT(msglen >= sizeof (msg->tag));
   4722 
   4723 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) {
   4724 		PR0("Message is not an RDX message");
   4725 		return (ENOMSG);
   4726 	}
   4727 
   4728 	if (msglen != sizeof (vio_rdx_msg_t)) {
   4729 		PR0("Expected %lu-byte RDX message; received %lu bytes",
   4730 		    sizeof (vio_rdx_msg_t), msglen);
   4731 		return (EBADMSG);
   4732 	}
   4733 
   4734 	PR0("Valid RDX message");
   4735 	return (0);
   4736 }
   4737 
   4738 static int
   4739 vd_check_seq_num(vd_t *vd, uint64_t seq_num)
   4740 {
   4741 	if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
   4742 		PR0("Received seq_num %lu; expected %lu",
   4743 		    seq_num, (vd->seq_num + 1));
   4744 		PR0("initiating soft reset");
   4745 		vd_need_reset(vd, B_FALSE);
   4746 		return (1);
   4747 	}
   4748 
   4749 	vd->seq_num = seq_num;
   4750 	vd->initialized |= VD_SEQ_NUM;	/* superfluous after first time... */
   4751 	return (0);
   4752 }
   4753 
   4754 /*
   4755  * Return the expected size of an inband-descriptor message with all the
   4756  * cookies it claims to include
   4757  */
   4758 static size_t
   4759 expected_inband_size(vd_dring_inband_msg_t *msg)
   4760 {
   4761 	return ((sizeof (*msg)) +
   4762 	    (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0])));
   4763 }
   4764 
   4765 /*
   4766  * Process an in-band descriptor message:  used with clients like OBP, with
   4767  * which vds exchanges descriptors within VIO message payloads, rather than
   4768  * operating on them within a descriptor ring
   4769  */
   4770 static int
   4771 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
   4772 {
   4773 	size_t			expected;
   4774 	vd_dring_inband_msg_t	*desc_msg = (vd_dring_inband_msg_t *)msg;
   4775 
   4776 
   4777 	ASSERT(msglen >= sizeof (msg->tag));
   4778 
   4779 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
   4780 	    VIO_DESC_DATA)) {
   4781 		PR1("Message is not an in-band-descriptor message");
   4782 		return (ENOMSG);
   4783 	}
   4784 
   4785 	if (msglen < sizeof (*desc_msg)) {
   4786 		PR0("Expected at least %lu-byte descriptor message; "
   4787 		    "received %lu bytes", sizeof (*desc_msg), msglen);
   4788 		return (EBADMSG);
   4789 	}
   4790 
   4791 	if (msglen != (expected = expected_inband_size(desc_msg))) {
   4792 		PR0("Expected %lu-byte descriptor message; "
   4793 		    "received %lu bytes", expected, msglen);
   4794 		return (EBADMSG);
   4795 	}
   4796 
   4797 	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0)
   4798 		return (EBADMSG);
   4799 
   4800 	/*
   4801 	 * Valid message:  Set up the in-band descriptor task and process the
   4802 	 * request.  Arrange to acknowledge the client's message, unless an
   4803 	 * error processing the descriptor task results in setting
   4804 	 * VIO_SUBTYPE_NACK
   4805 	 */
   4806 	PR1("Valid in-band-descriptor message");
   4807 	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
   4808 
   4809 	ASSERT(vd->inband_task.msg != NULL);
   4810 
   4811 	bcopy(msg, vd->inband_task.msg, msglen);
   4812 	vd->inband_task.msglen	= msglen;
   4813 
   4814 	/*
   4815 	 * The task request is now the payload of the message
   4816 	 * that was just copied into the body of the task.
   4817 	 */
   4818 	desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg;
   4819 	vd->inband_task.request	= &desc_msg->payload;
   4820 
   4821 	return (vd_process_task(&vd->inband_task));
   4822 }
   4823 
   4824 static int
   4825 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx,
   4826     vio_msg_t *msg, size_t msglen)
   4827 {
   4828 	int			status;
   4829 	boolean_t		ready;
   4830 	on_trap_data_t		otd;
   4831 	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
   4832 
   4833 	/* Accept the updated dring element */
   4834 	if ((status = VIO_DRING_ACQUIRE(&otd, vd->dring_mtype,
   4835 	    vd->dring_handle, idx, idx)) != 0) {
   4836 		return (status);
   4837 	}
   4838 	ready = (elem->hdr.dstate == VIO_DESC_READY);
   4839 	if (ready) {
   4840 		elem->hdr.dstate = VIO_DESC_ACCEPTED;
   4841 		bcopy(&elem->payload, vd->dring_task[idx].request,
   4842 		    (vd->descriptor_size - sizeof (vio_dring_entry_hdr_t)));
   4843 	} else {
   4844 		PR0("descriptor %u not ready", idx);
   4845 		VD_DUMP_DRING_ELEM(elem);
   4846 	}
   4847 	if ((status = VIO_DRING_RELEASE(vd->dring_mtype,
   4848 	    vd->dring_handle, idx, idx)) != 0) {
   4849 		PR0("VIO_DRING_RELEASE() returned errno %d", status);
   4850 		return (status);
   4851 	}
   4852 	if (!ready)
   4853 		return (EBUSY);
   4854 
   4855 
   4856 	/* Initialize a task and process the accepted element */
   4857 	PR1("Processing dring element %u", idx);
   4858 	vd->dring_task[idx].type	= type;
   4859 
   4860 	/* duplicate msg buf for cookies etc. */
   4861 	bcopy(msg, vd->dring_task[idx].msg, msglen);
   4862 
   4863 	vd->dring_task[idx].msglen	= msglen;
   4864 	return (vd_process_task(&vd->dring_task[idx]));
   4865 }
   4866 
   4867 static int
   4868 vd_process_element_range(vd_t *vd, int start, int end,
   4869     vio_msg_t *msg, size_t msglen)
   4870 {
   4871 	int		i, n, nelem, status = 0;
   4872 	boolean_t	inprogress = B_FALSE;
   4873 	vd_task_type_t	type;
   4874 
   4875 
   4876 	ASSERT(start >= 0);
   4877 	ASSERT(end >= 0);
   4878 
   4879 	/*
   4880 	 * Arrange to acknowledge the client's message, unless an error
   4881 	 * processing one of the dring elements results in setting
   4882 	 * VIO_SUBTYPE_NACK
   4883 	 */
   4884 	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
   4885 
   4886 	/*
   4887 	 * Process the dring elements in the range
   4888 	 */
   4889 	nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
   4890 	for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
   4891 		((vio_dring_msg_t *)msg)->end_idx = i;
   4892 		type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
   4893 		status = vd_process_element(vd, type, i, msg, msglen);
   4894 		if (status == EINPROGRESS)
   4895 			inprogress = B_TRUE;
   4896 		else if (status != 0)
   4897 			break;
   4898 	}
   4899 
   4900 	/*
   4901 	 * If some, but not all, operations of a multi-element range are in
   4902 	 * progress, wait for other operations to complete before returning
   4903 	 * (which will result in "ack" or "nack" of the message).  Note that
   4904 	 * all outstanding operations will need to complete, not just the ones
   4905 	 * corresponding to the current range of dring elements; howevever, as
   4906 	 * this situation is an error case, performance is less critical.
   4907 	 */
   4908 	if ((nelem > 1) && (status != EINPROGRESS) && inprogress) {
   4909 		if (vd->ioq != NULL)
   4910 			ddi_taskq_wait(vd->ioq);
   4911 		ddi_taskq_wait(vd->completionq);
   4912 	}
   4913 
   4914 	return (status);
   4915 }
   4916 
   4917 static int
   4918 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
   4919 {
   4920 	vio_dring_msg_t	*dring_msg = (vio_dring_msg_t *)msg;
   4921 
   4922 
   4923 	ASSERT(msglen >= sizeof (msg->tag));
   4924 
   4925 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
   4926 	    VIO_DRING_DATA)) {
   4927 		PR1("Message is not a dring-data message");
   4928 		return (ENOMSG);
   4929 	}
   4930 
   4931 	if (msglen != sizeof (*dring_msg)) {
   4932 		PR0("Expected %lu-byte dring message; received %lu bytes",
   4933 		    sizeof (*dring_msg), msglen);
   4934 		return (EBADMSG);
   4935 	}
   4936 
   4937 	if (vd_check_seq_num(vd, dring_msg->seq_num) != 0)
   4938 		return (EBADMSG);
   4939 
   4940 	if (dring_msg->dring_ident != vd->dring_ident) {
   4941 		PR0("Expected dring ident %lu; received ident %lu",
   4942 		    vd->dring_ident, dring_msg->dring_ident);
   4943 		return (EBADMSG);
   4944 	}
   4945 
   4946 	if (dring_msg->start_idx >= vd->dring_len) {
   4947 		PR0("\"start_idx\" = %u; must be less than %u",
   4948 		    dring_msg->start_idx, vd->dring_len);
   4949 		return (EBADMSG);
   4950 	}
   4951 
   4952 	if ((dring_msg->end_idx < 0) ||
   4953 	    (dring_msg->end_idx >= vd->dring_len)) {
   4954 		PR0("\"end_idx\" = %u; must be >= 0 and less than %u",
   4955 		    dring_msg->end_idx, vd->dring_len);
   4956 		return (EBADMSG);
   4957 	}
   4958 
   4959 	/* Valid message; process range of updated dring elements */
   4960 	PR1("Processing descriptor range, start = %u, end = %u",
   4961 	    dring_msg->start_idx, dring_msg->end_idx);
   4962 	return (vd_process_element_range(vd, dring_msg->start_idx,
   4963 	    dring_msg->end_idx, msg, msglen));
   4964 }
   4965 
   4966 static int
   4967 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
   4968 {
   4969 	int	retry, status;
   4970 	size_t	size = *nbytes;
   4971 
   4972 
   4973 	for (retry = 0, status = ETIMEDOUT;
   4974 	    retry < vds_ldc_retries && status == ETIMEDOUT;
   4975 	    retry++) {
   4976 		PR1("ldc_read() attempt %d", (retry + 1));
   4977 		*nbytes = size;
   4978 		status = ldc_read(ldc_handle, msg, nbytes);
   4979 	}
   4980 
   4981 	if (status) {
   4982 		PR0("ldc_read() returned errno %d", status);
   4983 		if (status != ECONNRESET)
   4984 			return (ENOMSG);
   4985 		return (status);
   4986 	} else if (*nbytes == 0) {
   4987 		PR1("ldc_read() returned 0 and no message read");
   4988 		return (ENOMSG);
   4989 	}
   4990 
   4991 	PR1("RCVD %lu-byte message", *nbytes);
   4992 	return (0);
   4993 }
   4994 
   4995 static int
   4996 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
   4997 {
   4998 	int		status;
   4999 
   5000 
   5001 	PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
   5002 	    msg->tag.vio_subtype, msg->tag.vio_subtype_env);
   5003 #ifdef	DEBUG
   5004 	vd_decode_tag(msg);
   5005 #endif
   5006 
   5007 	/*
   5008 	 * Validate session ID up front, since it applies to all messages
   5009 	 * once set
   5010 	 */
   5011 	if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) {
   5012 		PR0("Expected SID %u, received %u", vd->sid,
   5013 		    msg->tag.vio_sid);
   5014 		return (EBADMSG);
   5015 	}
   5016 
   5017 	PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state));
   5018 
   5019 	/*
   5020 	 * Process the received message based on connection state
   5021 	 */
   5022 	switch (vd->state) {
   5023 	case VD_STATE_INIT:	/* expect version message */
   5024 		if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0)
   5025 			return (status);
   5026 
   5027 		/* Version negotiated, move to that state */
   5028 		vd->state = VD_STATE_VER;
   5029 		return (0);
   5030 
   5031 	case VD_STATE_VER:	/* expect attribute message */
   5032 		if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0)
   5033 			return (status);
   5034 
   5035 		/* Attributes exchanged, move to that state */
   5036 		vd->state = VD_STATE_ATTR;
   5037 		return (0);
   5038 
   5039 	case VD_STATE_ATTR:
   5040 		switch (vd->xfer_mode) {
   5041 		case VIO_DESC_MODE:	/* expect RDX message */
   5042 			if ((status = process_rdx_msg(msg, msglen)) != 0)
   5043 				return (status);
   5044 
   5045 			/* Ready to receive in-band descriptors */
   5046 			vd->state = VD_STATE_DATA;
   5047 			return (0);
   5048 
   5049 		case VIO_DRING_MODE_V1_0:  /* expect register-dring message */
   5050 			if ((status =
   5051 			    vd_process_dring_reg_msg(vd, msg, msglen)) != 0)
   5052 				return (status);
   5053 
   5054 			/* One dring negotiated, move to that state */
   5055 			vd->state = VD_STATE_DRING;
   5056 			return (0);
   5057 
   5058 		default:
   5059 			ASSERT("Unsupported transfer mode");
   5060 			PR0("Unsupported transfer mode");
   5061 			return (ENOTSUP);
   5062 		}
   5063 
   5064 	case VD_STATE_DRING:	/* expect RDX, register-dring, or unreg-dring */
   5065 		if ((status = process_rdx_msg(msg, msglen)) == 0) {
   5066 			/* Ready to receive data */
   5067 			vd->state = VD_STATE_DATA;
   5068 			return (0);
   5069 		} else if (status != ENOMSG) {
   5070 			return (status);
   5071 		}
   5072 
   5073 
   5074 		/*
   5075 		 * If another register-dring message is received, stay in
   5076 		 * dring state in case the client sends RDX; although the
   5077 		 * protocol allows multiple drings, this server does not
   5078 		 * support using more than one
   5079 		 */
   5080 		if ((status =
   5081 		    vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG)
   5082 			return (status);
   5083 
   5084 		/*
   5085 		 * Acknowledge an unregister-dring message, but reset the
   5086 		 * connection anyway:  Although the protocol allows
   5087 		 * unregistering drings, this server cannot serve a vdisk
   5088 		 * without its only dring
   5089 		 */
   5090 		status = vd_process_dring_unreg_msg(vd, msg, msglen);
   5091 		return ((status == 0) ? ENOTSUP : status);
   5092 
   5093 	case VD_STATE_DATA:
   5094 		switch (vd->xfer_mode) {
   5095 		case VIO_DESC_MODE:	/* expect in-band-descriptor message */
   5096 			return (vd_process_desc_msg(vd, msg, msglen));
   5097 
   5098 		case VIO_DRING_MODE_V1_0: /* expect dring-data or unreg-dring */
   5099 			/*
   5100 			 * Typically expect dring-data messages, so handle
   5101 			 * them first
   5102 			 */
   5103 			if ((status = vd_process_dring_msg(vd, msg,
   5104 			    msglen)) != ENOMSG)
   5105 				return (status);
   5106 
   5107 			/*
   5108 			 * Acknowledge an unregister-dring message, but reset
   5109 			 * the connection anyway:  Although the protocol
   5110 			 * allows unregistering drings, this server cannot
   5111 			 * serve a vdisk without its only dring
   5112 			 */
   5113 			status = vd_process_dring_unreg_msg(vd, msg, msglen);
   5114 			return ((status == 0) ? ENOTSUP : status);
   5115 
   5116 		default:
   5117 			ASSERT("Unsupported transfer mode");
   5118 			PR0("Unsupported transfer mode");
   5119 			return (ENOTSUP);
   5120 		}
   5121 
   5122 	default:
   5123 		ASSERT("Invalid client connection state");
   5124 		PR0("Invalid client connection state");
   5125 		return (ENOTSUP);
   5126 	}
   5127 }
   5128 
   5129 static int
   5130 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
   5131 {
   5132 	int		status;
   5133 	boolean_t	reset_ldc = B_FALSE;
   5134 	vd_task_t	task;
   5135 
   5136 	/*
   5137 	 * Check that the message is at least big enough for a "tag", so that
   5138 	 * message processing can proceed based on tag-specified message type
   5139 	 */
   5140 	if (msglen < sizeof (vio_msg_tag_t)) {
   5141 		PR0("Received short (%lu-byte) message", msglen);
   5142 		/* Can't "nack" short message, so drop the big hammer */
   5143 		PR0("initiating full reset");
   5144 		vd_need_reset(vd, B_TRUE);
   5145 		return (EBADMSG);
   5146 	}
   5147 
   5148 	/*
   5149 	 * Process the message
   5150 	 */
   5151 	switch (status = vd_do_process_msg(vd, msg, msglen)) {
   5152 	case 0:
   5153 		/* "ack" valid, successfully-processed messages */
   5154 		msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
   5155 		break;
   5156 
   5157 	case EINPROGRESS:
   5158 		/* The completion handler will "ack" or "nack" the message */
   5159 		return (EINPROGRESS);
   5160 	case ENOMSG:
   5161 		PR0("Received unexpected message");
   5162 		_NOTE(FALLTHROUGH);
   5163 	case EBADMSG:
   5164 	case ENOTSUP:
   5165 		/* "transport" error will cause NACK of invalid messages */
   5166 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
   5167 		break;
   5168 
   5169 	default:
   5170 		/* "transport" error will cause NACK of invalid messages */
   5171 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
   5172 		/* An LDC error probably occurred, so try resetting it */
   5173 		reset_ldc = B_TRUE;
   5174 		break;
   5175 	}
   5176 
   5177 	PR1("\tResulting in state %d (%s)", vd->state,
   5178 	    vd_decode_state(vd->state));
   5179 
   5180 	/* populate the task so we can dispatch it on the taskq */
   5181 	task.vd = vd;
   5182 	task.msg = msg;
   5183 	task.msglen = msglen;
   5184 
   5185 	/*
   5186 	 * Queue a task to send the notification that the operation completed.
   5187 	 * We need to ensure that requests are responded to in the correct
   5188 	 * order and since the taskq is processed serially this ordering
   5189 	 * is maintained.
   5190 	 */
   5191 	(void) ddi_taskq_dispatch(vd->completionq, vd_serial_notify,
   5192 	    &task, DDI_SLEEP);
   5193 
   5194 	/*
   5195 	 * To ensure handshake negotiations do not happen out of order, such
   5196 	 * requests that come through this path should not be done in parallel
   5197 	 * so we need to wait here until the response is sent to the client.
   5198 	 */
   5199 	ddi_taskq_wait(vd->completionq);
   5200 
   5201 	/* Arrange to reset the connection for nack'ed or failed messages */
   5202 	if ((status != 0) || reset_ldc) {
   5203 		PR0("initiating %s reset",
   5204 		    (reset_ldc) ? "full" : "soft");
   5205 		vd_need_reset(vd, reset_ldc);
   5206 	}
   5207 
   5208 	return (status);
   5209 }
   5210 
   5211 static boolean_t
   5212 vd_enabled(vd_t *vd)
   5213 {
   5214 	boolean_t	enabled;
   5215 
   5216 	mutex_enter(&vd->lock);
   5217 	enabled = vd->enabled;
   5218 	mutex_exit(&vd->lock);
   5219 	return (enabled);
   5220 }
   5221 
   5222 static void
   5223 vd_recv_msg(void *arg)
   5224 {
   5225 	vd_t	*vd = (vd_t *)arg;
   5226 	int	rv = 0, status = 0;
   5227 
   5228 	ASSERT(vd != NULL);
   5229 
   5230 	PR2("New task to receive incoming message(s)");
   5231 
   5232 
   5233 	while (vd_enabled(vd) && status == 0) {
   5234 		size_t		msglen, msgsize;
   5235 		ldc_status_t	lstatus;
   5236 
   5237 		/*
   5238 		 * Receive and process a message
   5239 		 */
   5240 		vd_reset_if_needed(vd);	/* can change vd->max_msglen */
   5241 
   5242 		/*
   5243 		 * check if channel is UP - else break out of loop
   5244 		 */
   5245 		status = ldc_status(vd->ldc_handle, &lstatus);
   5246 		if (lstatus != LDC_UP) {
   5247 			PR0("channel not up (status=%d), exiting recv loop\n",
   5248 			    lstatus);
   5249 			break;
   5250 		}
   5251 
   5252 		ASSERT(vd->max_msglen != 0);
   5253 
   5254 		msgsize = vd->max_msglen; /* stable copy for alloc/free */
   5255 		msglen	= msgsize;	  /* actual len after recv_msg() */
   5256 
   5257 		status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen);
   5258 		switch (status) {
   5259 		case 0:
   5260 			rv = vd_process_msg(vd, (void *)vd->vio_msgp, msglen);
   5261 			/* check if max_msglen changed */
   5262 			if (msgsize != vd->max_msglen) {
   5263 				PR0("max_msglen changed 0x%lx to 0x%lx bytes\n",
   5264 				    msgsize, vd->max_msglen);
   5265 				kmem_free(vd->vio_msgp, msgsize);
   5266 				vd->vio_msgp =
   5267 				    kmem_alloc(vd->max_msglen, KM_SLEEP);
   5268 			}
   5269 			if (rv == EINPROGRESS)
   5270 				continue;
   5271 			break;
   5272 
   5273 		case ENOMSG:
   5274 			break;
   5275 
   5276 		case ECONNRESET:
   5277 			PR0("initiating soft reset (ECONNRESET)\n");
   5278 			vd_need_reset(vd, B_FALSE);
   5279 			status = 0;
   5280 			break;
   5281 
   5282 		default:
   5283 			/* Probably an LDC failure; arrange to reset it */
   5284 			PR0("initiating full reset (status=0x%x)", status);
   5285 			vd_need_reset(vd, B_TRUE);
   5286 			break;
   5287 		}
   5288 	}
   5289 
   5290 	PR2("Task finished");
   5291 }
   5292 
   5293 static uint_t
   5294 vd_handle_ldc_events(uint64_t event, caddr_t arg)
   5295 {
   5296 	vd_t	*vd = (vd_t *)(void *)arg;
   5297 	int	status;
   5298 
   5299 	ASSERT(vd != NULL);
   5300 
   5301 	if (!vd_enabled(vd))
   5302 		return (LDC_SUCCESS);
   5303 
   5304 	if (event & LDC_EVT_DOWN) {
   5305 		PR0("LDC_EVT_DOWN: LDC channel went down");
   5306 
   5307 		vd_need_reset(vd, B_TRUE);
   5308 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
   5309 		    DDI_SLEEP);
   5310 		if (status == DDI_FAILURE) {
   5311 			PR0("cannot schedule task to recv msg\n");
   5312 			vd_need_reset(vd, B_TRUE);
   5313 		}
   5314 	}
   5315 
   5316 	if (event & LDC_EVT_RESET) {
   5317 		PR0("LDC_EVT_RESET: LDC channel was reset");
   5318 
   5319 		if (vd->state != VD_STATE_INIT) {
   5320 			PR0("scheduling full reset");
   5321 			vd_need_reset(vd, B_FALSE);
   5322 			status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
   5323 			    vd, DDI_SLEEP);
   5324 			if (status == DDI_FAILURE) {
   5325 				PR0("cannot schedule task to recv msg\n");
   5326 				vd_need_reset(vd, B_TRUE);
   5327 			}
   5328 
   5329 		} else {
   5330 			PR0("channel already reset, ignoring...\n");
   5331 			PR0("doing ldc up...\n");
   5332 			(void) ldc_up(vd->ldc_handle);
   5333 		}
   5334 
   5335 		return (LDC_SUCCESS);
   5336 	}
   5337 
   5338 	if (event & LDC_EVT_UP) {
   5339 		PR0("EVT_UP: LDC is up\nResetting client connection state");
   5340 		PR0("initiating soft reset");
   5341 		vd_need_reset(vd, B_FALSE);
   5342 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
   5343 		    vd, DDI_SLEEP);
   5344 		if (status == DDI_FAILURE) {
   5345 			PR0("cannot schedule task to recv msg\n");
   5346 			vd_need_reset(vd, B_TRUE);
   5347 			return (LDC_SUCCESS);
   5348 		}
   5349 	}
   5350 
   5351 	if (event & LDC_EVT_READ) {
   5352 		int	status;
   5353 
   5354 		PR1("New data available");
   5355 		/* Queue a task to receive the new data */
   5356 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
   5357 		    DDI_SLEEP);
   5358 
   5359 		if (status == DDI_FAILURE) {
   5360 			PR0("cannot schedule task to recv msg\n");
   5361 			vd_need_reset(vd, B_TRUE);
   5362 		}
   5363 	}
   5364 
   5365 	return (LDC_SUCCESS);
   5366 }
   5367 
   5368 static uint_t
   5369 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
   5370 {
   5371 	_NOTE(ARGUNUSED(key, val))
   5372 	(*((uint_t *)arg))++;
   5373 	return (MH_WALK_TERMINATE);
   5374 }
   5375 
   5376 
   5377 static int
   5378 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
   5379 {
   5380 	uint_t	vd_present = 0;
   5381 	minor_t	instance;
   5382 	vds_t	*vds;
   5383 
   5384 
   5385 	switch (cmd) {
   5386 	case DDI_DETACH:
   5387 		/* the real work happens below */
   5388 		break;
   5389 	case DDI_SUSPEND:
   5390 		PR0("No action required for DDI_SUSPEND");
   5391 		return (DDI_SUCCESS);
   5392 	default:
   5393 		PR0("Unrecognized \"cmd\"");
   5394 		return (DDI_FAILURE);
   5395 	}
   5396 
   5397 	ASSERT(cmd == DDI_DETACH);
   5398 	instance = ddi_get_instance(dip);
   5399 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
   5400 		PR0("Could not get state for instance %u", instance);
   5401 		ddi_soft_state_free(vds_state, instance);
   5402 		return (DDI_FAILURE);
   5403 	}
   5404 
   5405 	/* Do no detach when serving any vdisks */
   5406 	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
   5407 	if (vd_present) {
   5408 		PR0("Not detaching because serving vdisks");
   5409 		return (DDI_FAILURE);
   5410 	}
   5411 
   5412 	PR0("Detaching");
   5413 	if (vds->initialized & VDS_MDEG) {
   5414 		(void) mdeg_unregister(vds->mdeg);
   5415 		kmem_free(vds->ispecp->specp, sizeof (vds_prop_template));
   5416 		kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t));
   5417 		vds->ispecp = NULL;
   5418 		vds->mdeg = NULL;
   5419 	}
   5420 
   5421 	vds_driver_types_free(vds);
   5422 
   5423 	if (vds->initialized & VDS_LDI)
   5424 		(void) ldi_ident_release(vds->ldi_ident);
   5425 	mod_hash_destroy_hash(vds->vd_table);
   5426 	ddi_soft_state_free(vds_state, instance);
   5427 	return (DDI_SUCCESS);
   5428 }
   5429 
   5430 /*
   5431  * Description:
   5432  *	This function checks to see if the disk image being used as a
   5433  *	virtual disk is an ISO image. An ISO image is a special case
   5434  *	which can be booted/installed from like a CD/DVD.
   5435  *
   5436  * Parameters:
   5437  *	vd		- disk on which the operation is performed.
   5438  *
   5439  * Return Code:
   5440  *	B_TRUE		- The disk image is an ISO 9660 compliant image
   5441  *	B_FALSE		- just a regular disk image
   5442  */
   5443 static boolean_t
   5444 vd_dskimg_is_iso_image(vd_t *vd)
   5445 {
   5446 	char	iso_buf[ISO_SECTOR_SIZE];
   5447 	int	i, rv;
   5448 	uint_t	sec;
   5449 
   5450 	ASSERT(VD_DSKIMG(vd));
   5451 
   5452 	/*
   5453 	 * If we have already discovered and saved this info we can
   5454 	 * short-circuit the check and avoid reading the disk image.
   5455 	 */
   5456 	if (vd->vdisk_media == VD_MEDIA_DVD || vd->vdisk_media == VD_MEDIA_CD)
   5457 		return (B_TRUE);
   5458 
   5459 	/*
   5460 	 * We wish to read the sector that should contain the 2nd ISO volume
   5461 	 * descriptor. The second field in this descriptor is called the
   5462 	 * Standard Identifier and is set to CD001 for a CD-ROM compliant
   5463 	 * to the ISO 9660 standard.
   5464 	 */
   5465 	sec = (ISO_VOLDESC_SEC * ISO_SECTOR_SIZE) / vd->vdisk_bsize;
   5466 	rv = vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)iso_buf,
   5467 	    sec, ISO_SECTOR_SIZE);
   5468 
   5469 	if (rv < 0)
   5470 		return (B_FALSE);
   5471 
   5472 	for (i = 0; i < ISO_ID_STRLEN; i++) {
   5473 		if (ISO_STD_ID(iso_buf)[i] != ISO_ID_STRING[i])
   5474 			return (B_FALSE);
   5475 	}
   5476 
   5477 	return (B_TRUE);
   5478 }
   5479 
   5480 /*
   5481  * Description:
   5482  *	This function checks to see if the virtual device is an ATAPI
   5483  *	device. ATAPI devices use Group 1 Read/Write commands, so
   5484  *	any USCSI calls vds makes need to take this into account.
   5485  *
   5486  * Parameters:
   5487  *	vd		- disk on which the operation is performed.
   5488  *
   5489  * Return Code:
   5490  *	B_TRUE		- The virtual disk is backed by an ATAPI device
   5491  *	B_FALSE		- not an ATAPI device (presumably SCSI)
   5492  */
   5493 static boolean_t
   5494 vd_is_atapi_device(vd_t *vd)
   5495 {
   5496 	boolean_t	is_atapi = B_FALSE;
   5497 	char		*variantp;
   5498 	int		rv;
   5499 
   5500 	ASSERT(vd->ldi_handle[0] != NULL);
   5501 	ASSERT(!vd->file);
   5502 
   5503 	rv = ldi_prop_lookup_string(vd->ldi_handle[0],
   5504 	    (LDI_DEV_T_ANY | DDI_PROP_DONTPASS), "variant", &variantp);
   5505 	if (rv == DDI_PROP_SUCCESS) {
   5506 		PR0("'variant' property exists for %s", vd->device_path);
   5507 		if (strcmp(variantp, "atapi") == 0)
   5508 			is_atapi = B_TRUE;
   5509 		ddi_prop_free(variantp);
   5510 	}
   5511 
   5512 	rv = ldi_prop_exists(vd->ldi_handle[0], LDI_DEV_T_ANY, "atapi");
   5513 	if (rv) {
   5514 		PR0("'atapi' property exists for %s", vd->device_path);
   5515 		is_atapi = B_TRUE;
   5516 	}
   5517 
   5518 	return (is_atapi);
   5519 }
   5520 
   5521 static int
   5522 vd_setup_full_disk(vd_t *vd)
   5523 {
   5524 	int		status;
   5525 	major_t		major = getmajor(vd->dev[0]);
   5526 	minor_t		minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
   5527 
   5528 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
   5529 
   5530 	/* set the disk size, block size and the media type of the disk */
   5531 	status = vd_backend_check_size(vd);
   5532 
   5533 	if (status != 0) {
   5534 		if (!vd->scsi) {
   5535 			/* unexpected failure */
   5536 			PRN("Check size failed for %s (errno %d)",
   5537 			    vd->device_path, status);
   5538 			return (EIO);
   5539 		}
   5540 
   5541 		/*
   5542 		 * The function can fail for SCSI disks which are present but
   5543 		 * reserved by another system. In that case, we don't know the
   5544 		 * size of the disk and the block size.
   5545 		 */
   5546 		vd->vdisk_size = VD_SIZE_UNKNOWN;
   5547 		vd->vdisk_bsize = 0;
   5548 		vd->backend_bsize = 0;
   5549 		vd->vdisk_media = VD_MEDIA_FIXED;
   5550 	}
   5551 
   5552 	/* Move dev number and LDI handle to entire-disk-slice array elements */
   5553 	vd->dev[VD_ENTIRE_DISK_SLICE]		= vd->dev[0];
   5554 	vd->dev[0]				= 0;
   5555 	vd->ldi_handle[VD_ENTIRE_DISK_SLICE]	= vd->ldi_handle[0];
   5556 	vd->ldi_handle[0]			= NULL;
   5557 
   5558 	/* Initialize device numbers for remaining slices and open them */
   5559 	for (int slice = 0; slice < vd->nslices; slice++) {
   5560 		/*
   5561 		 * Skip the entire-disk slice, as it's already open and its
   5562 		 * device known
   5563 		 */
   5564 		if (slice == VD_ENTIRE_DISK_SLICE)
   5565 			continue;
   5566 		ASSERT(vd->dev[slice] == 0);
   5567 		ASSERT(vd->ldi_handle[slice] == NULL);
   5568 
   5569 		/*
   5570 		 * Construct the device number for the current slice
   5571 		 */
   5572 		vd->dev[slice] = makedevice(major, (minor + slice));
   5573 
   5574 		/*
   5575 		 * Open all slices of the disk to serve them to the client.
   5576 		 * Slices are opened exclusively to prevent other threads or
   5577 		 * processes in the service domain from performing I/O to
   5578 		 * slices being accessed by a client.  Failure to open a slice
   5579 		 * results in vds not serving this disk, as the client could
   5580 		 * attempt (and should be able) to access any slice immediately.
   5581 		 * Any slices successfully opened before a failure will get
   5582 		 * closed by vds_destroy_vd() as a result of the error returned
   5583 		 * by this function.
   5584 		 *
   5585 		 * We need to do the open with FNDELAY so that opening an empty
   5586 		 * slice does not fail.
   5587 		 */
   5588 		PR0("Opening device major %u, minor %u = slice %u",
   5589 		    major, minor, slice);
   5590 
   5591 		/*
   5592 		 * Try to open the device. This can fail for example if we are
   5593 		 * opening an empty slice. So in case of a failure, we try the
   5594 		 * open again but this time with the FNDELAY flag.
   5595 		 */
   5596 		status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
   5597 		    vd->open_flags, kcred, &vd->ldi_handle[slice],
   5598 		    vd->vds->ldi_ident);
   5599 
   5600 		if (status != 0) {
   5601 			status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
   5602 			    vd->open_flags | FNDELAY, kcred,
   5603 			    &vd->ldi_handle[slice], vd->vds->ldi_ident);
   5604 		}
   5605 
   5606 		if (status != 0) {
   5607 			PRN("ldi_open_by_dev() returned errno %d "
   5608 			    "for slice %u", status, slice);
   5609 			/* vds_destroy_vd() will close any open slices */
   5610 			vd->ldi_handle[slice] = NULL;
   5611 			return (status);
   5612 		}
   5613 	}
   5614 
   5615 	return (0);
   5616 }
   5617 
   5618 /*
   5619  * When a slice or a volume is exported as a single-slice disk, we want
   5620  * the disk backend (i.e. the slice or volume) to be entirely mapped as
   5621  * a slice without the addition of any metadata.
   5622  *
   5623  * So when exporting the disk as a VTOC disk, we fake a disk with the following
   5624  * layout:
   5625  *                flabel +--- flabel_limit
   5626  *                 <->   V
   5627  *                 0 1   C                          D  E
   5628  *                 +-+---+--------------------------+--+
   5629  *  virtual disk:  |L|XXX|           slice 0        |AA|
   5630  *                 +-+---+--------------------------+--+
   5631  *                  ^    :                          :
   5632  *                  |    :                          :
   5633  *      VTOC LABEL--+    :                          :
   5634  *                       +--------------------------+
   5635  *  disk backend:        |     slice/volume/file    |
   5636  *                       +--------------------------+
   5637  *                       0                          N
   5638  *
   5639  * N is the number of blocks in the slice/volume/file.
   5640  *
   5641  * We simulate a disk with N+M blocks, where M is the number of blocks
   5642  * simluated at the beginning and at the end of the disk (blocks 0-C
   5643  * and D-E).
   5644  *
   5645  * The first blocks (0 to C-1) are emulated and can not be changed. Blocks C
   5646  * to D defines slice 0 and are mapped to the backend. Finally we emulate 2
   5647  * alternate cylinders at the end of the disk (blocks D-E). In summary we have:
   5648  *
   5649  * - block 0 (L) returns a fake VTOC label
   5650  * - blocks 1 to C-1 (X) are unused and return 0
   5651  * - blocks C to D-1 are mapped to the exported slice or volume
   5652  * - blocks D and E (A) are blocks defining alternate cylinders (2 cylinders)
   5653  *
   5654  * Note: because we define a fake disk geometry, it is possible that the length
   5655  * of the backend is not a multiple of the size of cylinder, in that case the
   5656  * very end of the backend will not map to any block of the virtual disk.
   5657  */
   5658 static int
   5659 vd_setup_partition_vtoc(vd_t *vd)
   5660 {
   5661 	char *device_path = vd->device_path;
   5662 	char unit;
   5663 	size_t size, csize;
   5664 
   5665 	/* Initialize dk_geom structure for single-slice device */
   5666 	if (vd->dk_geom.dkg_nsect == 0) {
   5667 		PRN("%s geometry claims 0 sectors per track", device_path);
   5668 		return (EIO);
   5669 	}
   5670 	if (vd->dk_geom.dkg_nhead == 0) {
   5671 		PRN("%s geometry claims 0 heads", device_path);
   5672 		return (EIO);
   5673 	}
   5674 
   5675 	/* size of a cylinder in block */
   5676 	csize = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
   5677 
   5678 	/*
   5679 	 * Add extra cylinders: we emulate the first cylinder (which contains
   5680 	 * the disk label).
   5681 	 */
   5682 	vd->dk_geom.dkg_ncyl = vd->vdisk_size / csize + 1;
   5683 
   5684 	/* we emulate 2 alternate cylinders */
   5685 	vd->dk_geom.dkg_acyl = 2;
   5686 	vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;
   5687 
   5688 
   5689 	/* Initialize vtoc structure for single-slice device */
   5690 	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
   5691 	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
   5692 	vd->vtoc.v_part[0].p_flag = 0;
   5693 	/*
   5694 	 * Partition 0 starts on cylinder 1 and its size has to be
   5695 	 * a multiple of a number of cylinder.
   5696 	 */
   5697 	vd->vtoc.v_part[0].p_start = csize; /* start on cylinder 1 */
   5698 	vd->vtoc.v_part[0].p_size = (vd->vdisk_size / csize) * csize;
   5699 
   5700 	if (vd_slice_single_slice) {
   5701 		vd->vtoc.v_nparts = 1;
   5702 		bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
   5703 		    MIN(sizeof (VD_ASCIILABEL),
   5704 		    sizeof (vd->vtoc.v_asciilabel)));
   5705 		bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
   5706 		    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
   5707 	} else {
   5708 		/* adjust the number of slices */
   5709 		vd->nslices = V_NUMPAR;
   5710 		vd->vtoc.v_nparts = V_NUMPAR;
   5711 
   5712 		/* define slice 2 representing the entire disk */
   5713 		vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_tag = V_BACKUP;
   5714 		vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_flag = 0;
   5715 		vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_start = 0;
   5716 		vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_size =
   5717 		    vd->dk_geom.dkg_ncyl * csize;
   5718 
   5719 		vd_get_readable_size(vd->vdisk_size * vd->vdisk_bsize,
   5720 		    &size, &unit);
   5721 
   5722 		/*
   5723 		 * Set some attributes of the geometry to what format(1m) uses
   5724 		 * so that writing a default label using format(1m) does not
   5725 		 * produce any error.
   5726 		 */
   5727 		vd->dk_geom.dkg_bcyl = 0;
   5728 		vd->dk_geom.dkg_intrlv = 1;
   5729 		vd->dk_geom.dkg_write_reinstruct = 0;
   5730 		vd->dk_geom.dkg_read_reinstruct = 0;
   5731 
   5732 		/*
   5733 		 * We must have a correct label name otherwise format(1m) will
   5734 		 * not recognized the disk as labeled.
   5735 		 */
   5736 		(void) snprintf(vd->vtoc.v_asciilabel, LEN_DKL_ASCII,
   5737 		    "SUN-DiskSlice-%ld%cB cyl %d alt %d hd %d sec %d",
   5738 		    size, unit,
   5739 		    vd->dk_geom.dkg_ncyl, vd->dk_geom.dkg_acyl,
   5740 		    vd->dk_geom.dkg_nhead, vd->dk_geom.dkg_nsect);
   5741 		bzero(vd->vtoc.v_volume, sizeof (vd->vtoc.v_volume));
   5742 
   5743 		/* create a fake label from the vtoc and geometry */
   5744 		vd->flabel_limit = (uint_t)csize;
   5745 		vd->flabel_size = VD_LABEL_VTOC_SIZE(vd->vdisk_bsize);
   5746 		vd->flabel = kmem_zalloc(vd->flabel_size, KM_SLEEP);
   5747 		vd_vtocgeom_to_label(&vd->vtoc, &vd->dk_geom,
   5748 		    VD_LABEL_VTOC(vd));
   5749 	}
   5750 
   5751 	/* adjust the vdisk_size, we emulate 3 cylinders */
   5752 	vd->vdisk_size += csize * 3;
   5753 
   5754 	return (0);
   5755 }
   5756 
   5757 /*
   5758  * When a slice, volume or file is exported as a single-slice disk, we want
   5759  * the disk backend (i.e. the slice, volume or file) to be entirely mapped
   5760  * as a slice without the addition of any metadata.
   5761  *
   5762  * So when exporting the disk as an EFI disk, we fake a disk with the following
   5763  * layout: (assuming the block size is 512 bytes)
   5764  *
   5765  *                  flabel        +--- flabel_limit
   5766  *                 <------>       v
   5767  *                 0 1 2  L      34                        34+N      P
   5768  *                 +-+-+--+-------+--------------------------+-------+
   5769  *  virtual disk:  |X|T|EE|XXXXXXX|           slice 0        |RRRRRRR|
   5770  *                 +-+-+--+-------+--------------------------+-------+
   5771  *                    ^ ^         :                          :
   5772  *                    | |         :                          :
   5773  *                GPT-+ +-GPE     :                          :
   5774  *                                +--------------------------+
   5775  *  disk backend:                 |     slice/volume/file    |
   5776  *                                +--------------------------+
   5777  *                                0                          N
   5778  *
   5779  * N is the number of blocks in the slice/volume/file.
   5780  *
   5781  * We simulate a disk with N+M blocks, where M is the number of blocks
   5782  * simluated at the beginning and at the end of the disk (blocks 0-34
   5783  * and 34+N-P).
   5784  *
   5785  * The first 34 blocks (0 to 33) are emulated and can not be changed. Blocks 34
   5786  * to 34+N defines slice 0 and are mapped to the exported backend, and we
   5787  * emulate some blocks at the end of the disk (blocks 34+N to P) as a the EFI
   5788  * reserved partition.
   5789  *
   5790  * - block 0 (X) is unused and return 0
   5791  * - block 1 (T) returns a fake EFI GPT (via DKIOCGETEFI)
   5792  * - blocks 2 to L-1 (E) defines a fake EFI GPE (via DKIOCGETEFI)
   5793  * - blocks L to 33 (X) are unused and return 0
   5794  * - blocks 34 to 34+N are mapped to the exported slice, volume or file
   5795  * - blocks 34+N+1 to P define a fake reserved partition and backup label, it
   5796  *   returns 0
   5797  *
   5798  * Note: if the backend size is not a multiple of the vdisk block size then
   5799  * the very end of the backend will not map to any block of the virtual disk.
   5800  */
   5801 static int
   5802 vd_setup_partition_efi(vd_t *vd)
   5803 {
   5804 	efi_gpt_t *gpt;
   5805 	efi_gpe_t *gpe;
   5806 	struct uuid uuid = EFI_USR;
   5807 	struct uuid efi_reserved = EFI_RESERVED;
   5808 	uint32_t crc;
   5809 	uint64_t s0_start, s0_end, first_u_lba;
   5810 	size_t bsize;
   5811 
   5812 	ASSERT(vd->vdisk_bsize > 0);
   5813 
   5814 	bsize = vd->vdisk_bsize;
   5815 	/*
   5816 	 * The minimum size for the label is 16K (EFI_MIN_ARRAY_SIZE)
   5817 	 * for GPEs plus one block for the GPT and one for PMBR.
   5818 	 */
   5819 	first_u_lba = (EFI_MIN_ARRAY_SIZE / bsize) + 2;
   5820 	vd->flabel_limit = (uint_t)first_u_lba;
   5821 	vd->flabel_size = VD_LABEL_EFI_SIZE(bsize);
   5822 	vd->flabel = kmem_zalloc(vd->flabel_size, KM_SLEEP);
   5823 	gpt = VD_LABEL_EFI_GPT(vd, bsize);
   5824 	gpe = VD_LABEL_EFI_GPE(vd, bsize);
   5825 
   5826 	/*
   5827 	 * Adjust the vdisk_size, we emulate the first few blocks
   5828 	 * for the disk label.
   5829 	 */
   5830 	vd->vdisk_size += first_u_lba;
   5831 	s0_start = first_u_lba;
   5832 	s0_end = vd->vdisk_size - 1;
   5833 
   5834 	gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE);
   5835 	gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
   5836 	gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t));
   5837 	gpt->efi_gpt_FirstUsableLBA = LE_64(first_u_lba);
   5838 	gpt->efi_gpt_PartitionEntryLBA = LE_64(2ULL);
   5839 	gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t));
   5840 
   5841 	UUID_LE_CONVERT(gpe[0].efi_gpe_PartitionTypeGUID, uuid);
   5842 	gpe[0].efi_gpe_StartingLBA = LE_64(s0_start);
   5843 	gpe[0].efi_gpe_EndingLBA = LE_64(s0_end);
   5844 
   5845 	if (vd_slice_single_slice) {
   5846 		gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1);
   5847 	} else {
   5848 		/* adjust the number of slices */
   5849 		gpt->efi_gpt_NumberOfPartitionEntries = LE_32(VD_MAXPART);
   5850 		vd->nslices = V_NUMPAR;
   5851 
   5852 		/* define a fake reserved partition */
   5853 		UUID_LE_CONVERT(gpe[VD_MAXPART - 1].efi_gpe_PartitionTypeGUID,
   5854 		    efi_reserved);
   5855 		gpe[VD_MAXPART - 1].efi_gpe_StartingLBA =
   5856 		    LE_64(s0_end + 1);
   5857 		gpe[VD_MAXPART - 1].efi_gpe_EndingLBA =
   5858 		    LE_64(s0_end + EFI_MIN_RESV_SIZE);
   5859 
   5860 		/* adjust the vdisk_size to include the reserved slice */
   5861 		vd->vdisk_size += EFI_MIN_RESV_SIZE;
   5862 	}
   5863 
   5864 	gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1);
   5865 
   5866 	/* adjust the vdisk size for the backup GPT and GPE */
   5867 	vd->vdisk_size += (EFI_MIN_ARRAY_SIZE / bsize) + 1;
   5868 	gpt->efi_gpt_AlternateLBA = LE_64(vd->vdisk_size - 1);
   5869 
   5870 	CRC32(crc, gpe, sizeof (efi_gpe_t) * VD_MAXPART, -1U, crc32_table);
   5871 	gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
   5872 
   5873 	CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table);
   5874 	gpt->efi_gpt_HeaderCRC32 = LE_32(~crc);
   5875 
   5876 	return (0);
   5877 }
   5878 
   5879 /*
   5880  * Setup for a virtual disk whose backend is a file (exported as a single slice
   5881  * or as a full disk). In that case, the backend is accessed using the vnode
   5882  * interface.
   5883  */
   5884 static int
   5885 vd_setup_backend_vnode(vd_t *vd)
   5886 {
   5887 	int 		rval, status;
   5888 	dev_t		dev;
   5889 	char		*file_path = vd->device_path;
   5890 	ldi_handle_t	lhandle;
   5891 	struct dk_cinfo	dk_cinfo;
   5892 
   5893 	ASSERT(!vd->volume);
   5894 
   5895 	if ((status = vn_open(file_path, UIO_SYSSPACE, vd->open_flags | FOFFMAX,
   5896 	    0, &vd->file_vnode, 0, 0)) != 0) {
   5897 		PRN("vn_open(%s) = errno %d", file_path, status);
   5898 		return (status);
   5899 	}
   5900 
   5901 	/*
   5902 	 * We set vd->file now so that vds_destroy_vd will take care of
   5903 	 * closing the file and releasing the vnode in case of an error.
   5904 	 */
   5905 	vd->file = B_TRUE;
   5906 
   5907 	vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */
   5908 
   5909 	/*
   5910 	 * Get max_xfer_sz from the device where the file is.
   5911 	 */
   5912 	dev = vd->file_vnode->v_vfsp->vfs_dev;
   5913 	PR0("underlying device of %s = (%d, %d)\n", file_path,
   5914 	    getmajor(dev), getminor(dev));
   5915 
   5916 	status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, kcred, &lhandle,
   5917 	    vd->vds->ldi_ident);
   5918 
   5919 	if (status != 0) {
   5920 		PR0("ldi_open() returned errno %d for underlying device",
   5921 		    status);
   5922 	} else {
   5923 		if ((status = ldi_ioctl(lhandle, DKIOCINFO,
   5924 		    (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
   5925 		    &rval)) != 0) {
   5926 			PR0("ldi_ioctl(DKIOCINFO) returned errno %d for "
   5927 			    "underlying device", status);
   5928 		} else {
   5929 			/*
   5930 			 * Store the device's max transfer size for
   5931 			 * return to the client
   5932 			 */
   5933 			vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
   5934 		}
   5935 
   5936 		PR0("close the underlying device");
   5937 		(void) ldi_close(lhandle, FREAD, kcred);
   5938 	}
   5939 
   5940 	PR0("using file %s on device (%d, %d), max_xfer = %u blks",
   5941 	    file_path, getmajor(dev), getminor(dev), vd->max_xfer_sz);
   5942 
   5943 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE)
   5944 		status = vd_setup_slice_image(vd);
   5945 	else
   5946 		status = vd_setup_disk_image(vd);
   5947 
   5948 	return (status);
   5949 }
   5950 
   5951 static int
   5952 vd_setup_slice_image(vd_t *vd)
   5953 {
   5954 	struct dk_label label;
   5955 	int status;
   5956 
   5957 	if ((status = vd_backend_check_size(vd)) != 0) {
   5958 		PRN("Check size failed for %s (errno %d)",
   5959 		    vd->device_path, status);
   5960 		return (EIO);
   5961 	}
   5962 
   5963 	vd->vdisk_media = VD_MEDIA_FIXED;
   5964 	vd->vdisk_label = (vd_slice_label == VD_DISK_LABEL_UNK)?
   5965 	    vd_file_slice_label : vd_slice_label;
   5966 
   5967 	if (vd->vdisk_label == VD_DISK_LABEL_EFI ||
   5968 	    vd->dskimg_size >= 2 * ONE_TERABYTE) {
   5969 		status = vd_setup_partition_efi(vd);
   5970 	} else {
   5971 		/*
   5972 		 * We build a default label to get a geometry for
   5973 		 * the vdisk. Then the partition setup function will
   5974 		 * adjust the vtoc so that it defines a single-slice
   5975 		 * disk.
   5976 		 */
   5977 		vd_build_default_label(vd->dskimg_size, vd->vdisk_bsize,
   5978 		    &label);
   5979 		vd_label_to_vtocgeom(&label, &vd->vtoc, &vd->dk_geom);
   5980 		status = vd_setup_partition_vtoc(vd);
   5981 	}
   5982 
   5983 	return (status);
   5984 }
   5985 
   5986 static int
   5987 vd_setup_disk_image(vd_t *vd)
   5988 {
   5989 	int status;
   5990 	char *backend_path = vd->device_path;
   5991 
   5992 	if ((status = vd_backend_check_size(vd)) != 0) {
   5993 		PRN("Check size failed for %s (errno %d)",
   5994 		    backend_path, status);
   5995 		return (EIO);
   5996 	}
   5997 
   5998 	/* size should be at least sizeof(dk_label) */
   5999 	if (vd->dskimg_size < sizeof (struct dk_label)) {
   6000 		PRN("Size of file has to be at least %ld bytes",
   6001 		    sizeof (struct dk_label));
   6002 		return (EIO);
   6003 	}
   6004 
   6005 	/*
   6006 	 * Find and validate the geometry of a disk image.
   6007 	 */
   6008 	status = vd_dskimg_validate_geometry(vd);
   6009 	if (status != 0 && status != EINVAL && status != ENOTSUP) {
   6010 		PRN("Failed to read label from %s", backend_path);
   6011 		return (EIO);
   6012 	}
   6013 
   6014 	if (vd_dskimg_is_iso_image(vd)) {
   6015 		/*
   6016 		 * Indicate whether to call this a CD or DVD from the size
   6017 		 * of the ISO image (images for both drive types are stored
   6018 		 * in the ISO-9600 format). CDs can store up to just under 1Gb
   6019 		 */
   6020 		if ((vd->vdisk_size * vd->vdisk_bsize) > ONE_GIGABYTE)
   6021 			vd->vdisk_media = VD_MEDIA_DVD;
   6022 		else
   6023 			vd->vdisk_media = VD_MEDIA_CD;
   6024 	} else {
   6025 		vd->vdisk_media = VD_MEDIA_FIXED;
   6026 	}
   6027 
   6028 	/* Setup devid for the disk image */
   6029 
   6030 	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
   6031 
   6032 		status = vd_dskimg_read_devid(vd, &vd->dskimg_devid);
   6033 
   6034 		if (status == 0) {
   6035 			/* a valid devid was found */
   6036 			return (0);
   6037 		}
   6038 
   6039 		if (status != EINVAL) {
   6040 			/*
   6041 			 * There was an error while trying to read the devid.
   6042 			 * So this disk image may have a devid but we are
   6043 			 * unable to read it.
   6044 			 */
   6045 			PR0("can not read devid for %s", backend_path);
   6046 			vd->dskimg_devid = NULL;
   6047 			return (0);
   6048 		}
   6049 	}
   6050 
   6051 	/*
   6052 	 * No valid device id was found so we create one. Note that a failure
   6053 	 * to create a device id is not fatal and does not prevent the disk
   6054 	 * image from being attached.
   6055 	 */
   6056 	PR1("creating devid for %s", backend_path);
   6057 
   6058 	if (ddi_devid_init(vd->vds->dip, DEVID_FAB, NULL, 0,
   6059 	    &vd->dskimg_devid) != DDI_SUCCESS) {
   6060 		PR0("fail to create devid for %s", backend_path);
   6061 		vd->dskimg_devid = NULL;
   6062 		return (0);
   6063 	}
   6064 
   6065 	/*
   6066 	 * Write devid to the disk image. The devid is stored into the disk
   6067 	 * image if we have a valid label; otherwise the devid will be stored
   6068 	 * when the user writes a valid label.
   6069 	 */
   6070 	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
   6071 		if (vd_dskimg_write_devid(vd, vd->dskimg_devid) != 0) {
   6072 			PR0("fail to write devid for %s", backend_path);
   6073 			ddi_devid_free(vd->dskimg_devid);
   6074 			vd->dskimg_devid = NULL;
   6075 		}
   6076 	}
   6077 
   6078 	return (0);
   6079 }
   6080 
   6081 
   6082 /*
   6083  * Description:
   6084  *	Open a device using its device path (supplied by ldm(1m))
   6085  *
   6086  * Parameters:
   6087  *	vd 	- pointer to structure containing the vDisk info
   6088  *	flags	- open flags
   6089  *
   6090  * Return Value
   6091  *	0	- success
   6092  *	!= 0	- some other non-zero return value from ldi(9F) functions
   6093  */
   6094 static int
   6095 vd_open_using_ldi_by_name(vd_t *vd, int flags)
   6096 {
   6097 	int		status;
   6098 	char		*device_path = vd->device_path;
   6099 
   6100 	/* Attempt to open device */
   6101 	status = ldi_open_by_name(device_path, flags, kcred,
   6102 	    &vd->ldi_handle[0], vd->vds->ldi_ident);
   6103 
   6104 	/*
   6105 	 * The open can fail for example if we are opening an empty slice.
   6106 	 * In case of a failure, we try the open again but this time with
   6107 	 * the FNDELAY flag.
   6108 	 */
   6109 	if (status != 0)
   6110 		status = ldi_open_by_name(device_path, flags | FNDELAY,
   6111 		    kcred, &vd->ldi_handle[0], vd->vds->ldi_ident);
   6112 
   6113 	if (status != 0) {
   6114 		PR0("ldi_open_by_name(%s) = errno %d", device_path, status);
   6115 		vd->ldi_handle[0] = NULL;
   6116 		return (status);
   6117 	}
   6118 
   6119 	return (0);
   6120 }
   6121 
   6122 /*
   6123  * Setup for a virtual disk which backend is a device (a physical disk,
   6124  * slice or volume device) exported as a full disk or as a slice. In these
   6125  * cases, the backend is accessed using the LDI interface.
   6126  */
   6127 static int
   6128 vd_setup_backend_ldi(vd_t *vd)
   6129 {
   6130 	int		rval, status;
   6131 	struct dk_cinfo	dk_cinfo;
   6132 	char		*device_path = vd->device_path;
   6133 
   6134 	/* device has been opened by vd_identify_dev() */
   6135 	ASSERT(vd->ldi_handle[0] != NULL);
   6136 	ASSERT(vd->dev[0] != NULL);
   6137 
   6138 	vd->file = B_FALSE;
   6139 
   6140 	/* Verify backing device supports dk_cinfo */
   6141 	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
   6142 	    (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
   6143 	    &rval)) != 0) {
   6144 		PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
   6145 		    status, device_path);
   6146 		return (status);
   6147 	}
   6148 	if (dk_cinfo.dki_partition >= V_NUMPAR) {
   6149 		PRN("slice %u >= maximum slice %u for %s",
   6150 		    dk_cinfo.dki_partition, V_NUMPAR, device_path);
   6151 		return (EIO);
   6152 	}
   6153 
   6154 	/*
   6155 	 * The device has been opened read-only by vd_identify_dev(), re-open
   6156 	 * it read-write if the write flag is set and we don't have an optical
   6157 	 * device such as a CD-ROM, which, for now, we do not permit writes to
   6158 	 * and thus should not export write operations to the client.
   6159 	 *
   6160 	 * Future: if/when we implement support for guest domains writing to
   6161 	 * optical devices we will need to do further checking of the media type
   6162 	 * to distinguish between read-only and writable discs.
   6163 	 */
   6164 	if (dk_cinfo.dki_ctype == DKC_CDROM) {
   6165 
   6166 		vd->open_flags &= ~FWRITE;
   6167 
   6168 	} else if (vd->open_flags & FWRITE) {
   6169 
   6170 		(void) ldi_close(vd->ldi_handle[0], vd->open_flags & ~FWRITE,
   6171 		    kcred);
   6172 		status = vd_open_using_ldi_by_name(vd, vd->open_flags);
   6173 		if (status != 0) {
   6174 			PR0("Failed to open (%s) = errno %d",
   6175 			    device_path, status);
   6176 			return (status);
   6177 		}
   6178 	}
   6179 
   6180 	/* Store the device's max transfer size for return to the client */
   6181 	vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
   6182 
   6183 	/*
   6184 	 * We need to work out if it's an ATAPI (IDE CD-ROM) or SCSI device so
   6185 	 * that we can use the correct CDB group when sending USCSI commands.
   6186 	 */
   6187 	vd->is_atapi_dev = vd_is_atapi_device(vd);
   6188 
   6189 	/*
   6190 	 * Export a full disk.
   6191 	 *
   6192 	 * The exported device can be either a volume, a disk or a CD/DVD
   6193 	 * device.  We export a device as a full disk if we have an entire
   6194 	 * disk slice (slice 2) and if this slice is exported as a full disk
   6195 	 * and not as a single slice disk. A CD or DVD device is exported
   6196 	 * as a full disk (even if it isn't s2). A volume is exported as a
   6197 	 * full disk as long as the "slice" option is not specified.
   6198 	 */
   6199 	if (vd->vdisk_type == VD_DISK_TYPE_DISK) {
   6200 
   6201 		if (vd->volume) {
   6202 			/* setup disk image */
   6203 			return (vd_setup_disk_image(vd));
   6204 		}
   6205 
   6206 		if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE ||
   6207 		    dk_cinfo.dki_ctype == DKC_CDROM) {
   6208 			ASSERT(!vd->volume);
   6209 			if (dk_cinfo.dki_ctype == DKC_SCSI_CCS)
   6210 				vd->scsi = B_TRUE;
   6211 			return (vd_setup_full_disk(vd));
   6212 		}
   6213 	}
   6214 
   6215 	/*
   6216 	 * Export a single slice disk.
   6217 	 *
   6218 	 * The exported device can be either a volume device or a disk slice. If
   6219 	 * it is a disk slice different from slice 2 then it is always exported
   6220 	 * as a single slice disk even if the "slice" option is not specified.
   6221 	 * If it is disk slice 2 or a volume device then it is exported as a
   6222 	 * single slice disk only if the "slice" option is specified.
   6223 	 */
   6224 	return (vd_setup_single_slice_disk(vd));
   6225 }
   6226 
   6227 static int
   6228 vd_setup_single_slice_disk(vd_t *vd)
   6229 {
   6230 	int status, rval;
   6231 	struct dk_label label;
   6232 	char *device_path = vd->device_path;
   6233 	struct vtoc vtoc;
   6234 
   6235 	vd->vdisk_media = VD_MEDIA_FIXED;
   6236 
   6237 	if (vd->volume) {
   6238 		ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
   6239 	}
   6240 
   6241 	/*
   6242 	 * We export the slice as a single slice disk even if the "slice"
   6243 	 * option was not specified.
   6244 	 */
   6245 	vd->vdisk_type  = VD_DISK_TYPE_SLICE;
   6246 	vd->nslices	= 1;
   6247 
   6248 	/* Get size of backing device */
   6249 	if ((status = vd_backend_check_size(vd)) != 0) {
   6250 		PRN("Check size failed for %s (errno %d)", device_path, status);
   6251 		return (EIO);
   6252 	}
   6253 
   6254 	/*
   6255 	 * When exporting a slice or a device as a single slice disk, we don't
   6256 	 * care about any partitioning exposed by the backend. The goal is just
   6257 	 * to export the backend as a flat storage. We provide a fake partition
   6258 	 * table (either a VTOC or EFI), which presents only one slice, to
   6259 	 * accommodate tools expecting a disk label. The selection of the label
   6260 	 * type (VTOC or EFI) depends on the value of the vd_slice_label
   6261 	 * variable.
   6262 	 */
   6263 	if (vd_slice_label == VD_DISK_LABEL_EFI ||
   6264 	    vd->vdisk_size >= ONE_TERABYTE / vd->vdisk_bsize) {
   6265 		vd->vdisk_label = VD_DISK_LABEL_EFI;
   6266 	} else {
   6267 		status = ldi_ioctl(vd->ldi_handle[0], DKIOCGEXTVTOC,
   6268 		    (intptr_t)&vd->vtoc, (vd->open_flags | FKIOCTL),
   6269 		    kcred, &rval);
   6270 
   6271 		if (status == ENOTTY) {
   6272 			/* try with the non-extended vtoc ioctl */
   6273 			status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC,
   6274 			    (intptr_t)&vtoc, (vd->open_flags | FKIOCTL),
   6275 			    kcred, &rval);
   6276 			vtoctoextvtoc(vtoc, vd->vtoc);
   6277 		}
   6278 
   6279 		if (status == 0) {
   6280 			status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM,
   6281 			    (intptr_t)&vd->dk_geom, (vd->open_flags | FKIOCTL),
   6282 			    kcred, &rval);
   6283 
   6284 			if (status != 0) {
   6285 				PRN("ldi_ioctl(DKIOCGEOM) returned errno %d "
   6286 				    "for %s", status, device_path);
   6287 				return (status);
   6288 			}
   6289 			vd->vdisk_label = VD_DISK_LABEL_VTOC;
   6290 
   6291 		} else if (vd_slice_label == VD_DISK_LABEL_VTOC) {
   6292 
   6293 			vd->vdisk_label = VD_DISK_LABEL_VTOC;
   6294 			vd_build_default_label(vd->vdisk_size * vd->vdisk_bsize,
   6295 			    vd->vdisk_bsize, &label);
   6296 			vd_label_to_vtocgeom(&label, &vd->vtoc, &vd->dk_geom);
   6297 
   6298 		} else {
   6299 			vd->vdisk_label = VD_DISK_LABEL_EFI;
   6300 		}
   6301 	}
   6302 
   6303 	if (vd->vdisk_label == VD_DISK_LABEL_VTOC) {
   6304 		/* export with a fake VTOC label */
   6305 		status = vd_setup_partition_vtoc(vd);
   6306 
   6307 	} else {
   6308 		/* export with a fake EFI label */
   6309 		status = vd_setup_partition_efi(vd);
   6310 	}
   6311 
   6312 	return (status);
   6313 }
   6314 
   6315 /*
   6316  * This function is invoked when setting up the vdisk backend and to process
   6317  * the VD_OP_GET_CAPACITY operation. It checks the backend size and set the
   6318  * following attributes of the vd structure:
   6319  *
   6320  * - vdisk_bsize: block size for the virtual disk used by the VIO protocol. Its
   6321  *   value is 512 bytes (DEV_BSIZE) when the backend is a file, a volume or a
   6322  *   CD/DVD. When the backend is a disk or a disk slice then it has the value
   6323  *   of the logical block size of that disk (as returned by the DKIOCGMEDIAINFO
   6324  *   ioctl). This block size is expected to be a power of 2 and a multiple of
   6325  *   512.
   6326  *
   6327  * - vdisk_size: size of the virtual disk expressed as a number of vdisk_bsize
   6328  *   blocks.
   6329  *
   6330  * vdisk_size and vdisk_bsize are sent to the vdisk client during the connection
   6331  * handshake and in the result of a VD_OP_GET_CAPACITY operation.
   6332  *
   6333  * - backend_bsize: block size of the backend device. backend_bsize has the same
   6334  *   value as vdisk_bsize except when the backend is a CD/DVD. In that case,
   6335  *   vdisk_bsize is set to 512 (DEV_BSIZE) while backend_bsize is set to the
   6336  *   effective logical block size of the CD/DVD (usually 2048).
   6337  *
   6338  * - dskimg_size: size of the backend when the backend is a disk image. This
   6339  *   attribute is set only when the backend is a file or a volume, otherwise it
   6340  *   is unused.
   6341  *
   6342  * - vio_bshift: number of bit to shift to convert a VIO block number (which
   6343  *   uses a block size of vdisk_bsize) to a buf(9s) block number (which uses a
   6344  *   block size of 512 bytes) i.e. we have vdisk_bsize = 512 x 2 ^ vio_bshift
   6345  *
   6346  * - vdisk_media: media of the virtual disk. This function only sets this
   6347  *   attribute for physical disk and CD/DVD. For other backend types, this
   6348  *   attribute is set in the setup function of the backend.
   6349  */
   6350 static int
   6351 vd_backend_check_size(vd_t *vd)
   6352 {
   6353 	size_t backend_size, backend_bsize, vdisk_bsize;
   6354 	size_t old_size, new_size;
   6355 	struct dk_minfo minfo;
   6356 	vattr_t vattr;
   6357 	int rval, rv, media, nshift = 0;
   6358 	uint32_t n;
   6359 
   6360 	if (vd->file) {
   6361 
   6362 		/* file (slice or full disk) */
   6363 		vattr.va_mask = AT_SIZE;
   6364 		rv = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred, NULL);
   6365 		if (rv != 0) {
   6366 			PR0("VOP_GETATTR(%s) = errno %d", vd->device_path, rv);
   6367 			return (rv);
   6368 		}
   6369 		backend_size = vattr.va_size;
   6370 		backend_bsize = DEV_BSIZE;
   6371 		vdisk_bsize = DEV_BSIZE;
   6372 
   6373 	} else if (vd->volume) {
   6374 
   6375 		/* volume (slice or full disk) */
   6376 		rv = ldi_get_size(vd->ldi_handle[0], &backend_size);
   6377 		if (rv != DDI_SUCCESS) {
   6378 			PR0("ldi_get_size() failed for %s", vd->device_path);
   6379 			return (EIO);
   6380 		}
   6381 		backend_bsize = DEV_BSIZE;
   6382 		vdisk_bsize = DEV_BSIZE;
   6383 
   6384 	} else {
   6385 
   6386 		/* physical disk or slice */
   6387 		rv = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO,
   6388 		    (intptr_t)&minfo, (vd->open_flags | FKIOCTL),
   6389 		    kcred, &rval);
   6390 		if (rv != 0) {
   6391 			PR0("DKIOCGMEDIAINFO failed for %s (err=%d)",
   6392 			    vd->device_path, rv);
   6393 			return (rv);
   6394 		}
   6395 
   6396 		if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
   6397 			rv = ldi_get_size(vd->ldi_handle[0], &backend_size);
   6398 			if (rv != DDI_SUCCESS) {
   6399 				PR0("ldi_get_size() failed for %s",
   6400 				    vd->device_path);
   6401 				return (EIO);
   6402 			}
   6403 		} else {
   6404 			ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
   6405 			backend_size = minfo.dki_capacity * minfo.dki_lbsize;
   6406 		}
   6407 
   6408 		backend_bsize = minfo.dki_lbsize;
   6409 		media = DK_MEDIATYPE2VD_MEDIATYPE(minfo.dki_media_type);
   6410 
   6411 		/*
   6412 		 * If the device is a CD or a DVD then we force the vdisk block
   6413 		 * size to 512 bytes (DEV_BSIZE). In that case, vdisk_bsize can
   6414 		 * be different from backend_size.
   6415 		 */
   6416 		if (media == VD_MEDIA_CD || media == VD_MEDIA_DVD)
   6417 			vdisk_bsize = DEV_BSIZE;
   6418 		else
   6419 			vdisk_bsize = backend_bsize;
   6420 	}
   6421 
   6422 	/* check vdisk block size */
   6423 	if (vdisk_bsize == 0 || vdisk_bsize % DEV_BSIZE != 0)
   6424 		return (EINVAL);
   6425 
   6426 	old_size = vd->vdisk_size;
   6427 	new_size = backend_size / vdisk_bsize;
   6428 
   6429 	/* check if size has changed */
   6430 	if (old_size != VD_SIZE_UNKNOWN && old_size == new_size &&
   6431 	    vd->vdisk_bsize == vdisk_bsize)
   6432 		return (0);
   6433 
   6434 	/* cache info for blk conversion */
   6435 	for (n = vdisk_bsize / DEV_BSIZE; n > 1; n >>= 1) {
   6436 		if ((n & 0x1) != 0) {
   6437 			/* blk_size is not a power of 2 */
   6438 			return (EINVAL);
   6439 		}
   6440 		nshift++;
   6441 	}
   6442 
   6443 	vd->vio_bshift = nshift;
   6444 	vd->vdisk_size = new_size;
   6445 	vd->vdisk_bsize = vdisk_bsize;
   6446 	vd->backend_bsize = backend_bsize;
   6447 
   6448 	if (vd->file || vd->volume)
   6449 		vd->dskimg_size = backend_size;
   6450 
   6451 	/*
   6452 	 * If we are exporting a single-slice disk and the size of the backend
   6453 	 * has changed then we regenerate the partition setup so that the
   6454 	 * partitioning matches with the new disk backend size.
   6455 	 */
   6456 
   6457 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
   6458 		/* slice or file or device exported as a slice */
   6459 		if (vd->vdisk_label == VD_DISK_LABEL_VTOC) {
   6460 			rv = vd_setup_partition_vtoc(vd);
   6461 			if (rv != 0) {
   6462 				PR0("vd_setup_partition_vtoc() failed for %s "
   6463 				    "(err = %d)", vd->device_path, rv);
   6464 				return (rv);
   6465 			}
   6466 		} else {
   6467 			rv = vd_setup_partition_efi(vd);
   6468 			if (rv != 0) {
   6469 				PR0("vd_setup_partition_efi() failed for %s "
   6470 				    "(err = %d)", vd->device_path, rv);
   6471 				return (rv);
   6472 			}
   6473 		}
   6474 
   6475 	} else if (!vd->file && !vd->volume) {
   6476 		/* physical disk */
   6477 		ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
   6478 		vd->vdisk_media = media;
   6479 	}
   6480 
   6481 	return (0);
   6482 }
   6483 
   6484 /*
   6485  * Description:
   6486  *	Open a device using its device path and identify if this is
   6487  *	a disk device or a volume device.
   6488  *
   6489  * Parameters:
   6490  *	vd 	- pointer to structure containing the vDisk info
   6491  *	dtype	- return the driver type of the device
   6492  *
   6493  * Return Value
   6494  *	0	- success
   6495  *	!= 0	- some other non-zero return value from ldi(9F) functions
   6496  */
   6497 static int
   6498 vd_identify_dev(vd_t *vd, int *dtype)
   6499 {
   6500 	int status, i;
   6501 	char *device_path = vd->device_path;
   6502 	char *drv_name;
   6503 	int drv_type;
   6504 	vds_t *vds = vd->vds;
   6505 
   6506 	status = vd_open_using_ldi_by_name(vd, vd->open_flags & ~FWRITE);
   6507 	if (status != 0) {
   6508 		PR0("Failed to open (%s) = errno %d", device_path, status);
   6509 		return (status);
   6510 	}
   6511 
   6512 	/* Get device number of backing device */
   6513 	if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) {
   6514 		PRN("ldi_get_dev() returned errno %d for %s",
   6515 		    status, device_path);
   6516 		return (status);
   6517 	}
   6518 
   6519 	/*
   6520 	 * We start by looking if the driver is in the list from vds.conf
   6521 	 * so that we can override the built-in list using vds.conf.
   6522 	 */
   6523 	drv_name = ddi_major_to_name(getmajor(vd->dev[0]));
   6524 	drv_type = VD_DRIVER_UNKNOWN;
   6525 
   6526 	/* check vds.conf list */
   6527 	for (i = 0; i < vds->num_drivers; i++) {
   6528 		if (vds->driver_types[i].type == VD_DRIVER_UNKNOWN) {
   6529 			/* ignore invalid entries */
   6530 			continue;
   6531 		}
   6532 		if (strcmp(drv_name, vds->driver_types[i].name) == 0) {
   6533 			drv_type = vds->driver_types[i].type;
   6534 			goto done;
   6535 		}
   6536 	}
   6537 
   6538 	/* check built-in list */
   6539 	for (i = 0; i < VDS_NUM_DRIVERS; i++) {
   6540 		if (strcmp(drv_name, vds_driver_types[i].name) == 0) {
   6541 			drv_type = vds_driver_types[i].type;
   6542 			goto done;
   6543 		}
   6544 	}
   6545 
   6546 done:
   6547 	PR0("driver %s identified as %s", drv_name,
   6548 	    (drv_type == VD_DRIVER_DISK)? "DISK" :
   6549 	    (drv_type == VD_DRIVER_VOLUME)? "VOLUME" : "UNKNOWN");
   6550 
   6551 	if (strcmp(drv_name, "zfs") == 0)
   6552 		vd->zvol = B_TRUE;
   6553 
   6554 	*dtype = drv_type;
   6555 
   6556 	return (0);
   6557 }
   6558 
   6559 static int
   6560 vd_setup_vd(vd_t *vd)
   6561 {
   6562 	int		status, drv_type, pseudo;
   6563 	dev_info_t	*dip;
   6564 	vnode_t 	*vnp;
   6565 	char		*path = vd->device_path;
   6566 	char		tq_name[TASKQ_NAMELEN];
   6567 
   6568 	/* make sure the vdisk backend is valid */
   6569 	if ((status = lookupname(path, UIO_SYSSPACE,
   6570 	    FOLLOW, NULLVPP, &vnp)) != 0) {
   6571 		PR0("Cannot lookup %s errno %d", path, status);
   6572 		goto done;
   6573 	}
   6574 
   6575 	switch (vnp->v_type) {
   6576 	case VREG:
   6577 		/*
   6578 		 * Backend is a file so it is exported as a full disk or as a
   6579 		 * single slice disk using the vnode interface.
   6580 		 */
   6581 		VN_RELE(vnp);
   6582 		vd->volume = B_FALSE;
   6583 		status = vd_setup_backend_vnode(vd);
   6584 		break;
   6585 
   6586 	case VBLK:
   6587 	case VCHR:
   6588 		/*
   6589 		 * Backend is a device. In that case, it is exported using the
   6590 		 * LDI interface, and it is exported either as a single-slice
   6591 		 * disk or as a full disk depending on the "slice" option and
   6592 		 * on the type of device.
   6593 		 *
   6594 		 * - A volume device is exported as a single-slice disk if the
   6595 		 *   "slice" is specified, otherwise it is exported as a full
   6596 		 *   disk.
   6597 		 *
   6598 		 * - A disk slice (different from slice 2) is always exported
   6599 		 *   as a single slice disk using the LDI interface.
   6600 		 *
   6601 		 * - The slice 2 of a disk is exported as a single slice disk
   6602 		 *   if the "slice" option is specified, otherwise the entire
   6603 		 *   disk will be exported.
   6604 		 *
   6605 		 * - The slice of a CD or DVD is exported as single slice disk
   6606 		 *   if the "slice" option is specified, otherwise the entire
   6607 		 *   disk will be exported.
   6608 		 */
   6609 
   6610 		/* check if this is a pseudo device */
   6611 		if ((dip = ddi_hold_devi_by_instance(getmajor(vnp->v_rdev),
   6612 		    dev_to_instance(vnp->v_rdev), 0))  == NULL) {
   6613 			PRN("%s is no longer accessible", path);
   6614 			VN_RELE(vnp);
   6615 			status = EIO;
   6616 			break;
   6617 		}
   6618 		pseudo = is_pseudo_device(dip);
   6619 		ddi_release_devi(dip);
   6620 		VN_RELE(vnp);
   6621 
   6622 		if ((status = vd_identify_dev(vd, &drv_type)) != 0) {
   6623 			if (status != ENODEV && status != ENXIO &&
   6624 			    status != ENOENT && status != EROFS) {
   6625 				PRN("%s identification failed with status %d",
   6626 				    path, status);
   6627 				status = EIO;
   6628 			}
   6629 			break;
   6630 		}
   6631 
   6632 		/*
   6633 		 * If the driver hasn't been identified then we consider that
   6634 		 * pseudo devices are volumes and other devices are disks.
   6635 		 */
   6636 		if (drv_type == VD_DRIVER_VOLUME ||
   6637 		    (drv_type == VD_DRIVER_UNKNOWN && pseudo)) {
   6638 			vd->volume = B_TRUE;
   6639 		}
   6640 
   6641 		/*
   6642 		 * If this is a volume device then its usage depends if the
   6643 		 * "slice" option is set or not. If the "slice" option is set
   6644 		 * then the volume device will be exported as a single slice,
   6645 		 * otherwise it will be exported as a full disk.
   6646 		 *
   6647 		 * For backward compatibility, if vd_volume_force_slice is set
   6648 		 * then we always export volume devices as slices.
   6649 		 */
   6650 		if (vd->volume && vd_volume_force_slice) {
   6651 			vd->vdisk_type = VD_DISK_TYPE_SLICE;
   6652 			vd->nslices = 1;
   6653 		}
   6654 
   6655 		status = vd_setup_backend_ldi(vd);
   6656 		break;
   6657 
   6658 	default:
   6659 		PRN("Unsupported vdisk backend %s", path);
   6660 		VN_RELE(vnp);
   6661 		status = EBADF;
   6662 	}
   6663 
   6664 done:
   6665 	if (status != 0) {
   6666 		/*
   6667 		 * If the error is retryable print an error message only
   6668 		 * during the first try.
   6669 		 */
   6670 		if (status == ENXIO || status == ENODEV ||
   6671 		    status == ENOENT || status == EROFS) {
   6672 			if (!(vd->initialized & VD_SETUP_ERROR)) {
   6673 				PRN("%s is currently inaccessible (error %d)",
   6674 				    path, status);
   6675 			}
   6676 			status = EAGAIN;
   6677 		} else {
   6678 			PRN("%s can not be exported as a virtual disk "
   6679 			    "(error %d)", path, status);
   6680 		}
   6681 		vd->initialized |= VD_SETUP_ERROR;
   6682 
   6683 	} else if (vd->initialized & VD_SETUP_ERROR) {
   6684 		/* print a message only if we previously had an error */
   6685 		PRN("%s is now online", path);
   6686 		vd->initialized &= ~VD_SETUP_ERROR;
   6687 	}
   6688 
   6689 	/*
   6690 	 * For file or ZFS volume we also need an I/O queue.
   6691 	 *
   6692 	 * The I/O task queue is initialized here and not in vds_do_init_vd()
   6693 	 * (as the start and completion queues) because vd_setup_vd() will be
   6694 	 * call again if the backend is not available, and we need to know if
   6695 	 * the backend is a ZFS volume or a file.
   6696 	 */
   6697 	if ((vd->file || vd->zvol) && vd->ioq == NULL) {
   6698 		(void) snprintf(tq_name, sizeof (tq_name), "vd_ioq%lu", vd->id);
   6699 
   6700 		if ((vd->ioq = ddi_taskq_create(vd->vds->dip, tq_name,
   6701 		    vd_ioq_nthreads, TASKQ_DEFAULTPRI, 0)) == NULL) {
   6702 			PRN("Could not create io task queue");
   6703 			return (EIO);
   6704 		}
   6705 	}
   6706 
   6707 	return (status);
   6708 }
   6709 
   6710 static int
   6711 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options,
   6712     uint64_t ldc_id, vd_t **vdp)
   6713 {
   6714 	char			tq_name[TASKQ_NAMELEN];
   6715 	int			status;
   6716 	ddi_iblock_cookie_t	iblock = NULL;
   6717 	ldc_attr_t		ldc_attr;
   6718 	vd_t			*vd;
   6719 
   6720 
   6721 	ASSERT(vds != NULL);
   6722 	ASSERT(device_path != NULL);
   6723 	ASSERT(vdp != NULL);
   6724 	PR0("Adding vdisk for %s", device_path);
   6725 
   6726 	if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
   6727 		PRN("No memory for virtual disk");
   6728 		return (EAGAIN);
   6729 	}
   6730 	*vdp = vd;	/* assign here so vds_destroy_vd() can cleanup later */
   6731 	vd->id = id;
   6732 	vd->vds = vds;
   6733 	(void) strncpy(vd->device_path, device_path, MAXPATHLEN);
   6734 
   6735 	/* Setup open flags */
   6736 	vd->open_flags = FREAD;
   6737 
   6738 	if (!(options & VD_OPT_RDONLY))
   6739 		vd->open_flags |= FWRITE;
   6740 
   6741 	if (options & VD_OPT_EXCLUSIVE)
   6742 		vd->open_flags |= FEXCL;
   6743 
   6744 	/* Setup disk type */
   6745 	if (options & VD_OPT_SLICE) {
   6746 		vd->vdisk_type = VD_DISK_TYPE_SLICE;
   6747 		vd->nslices = 1;
   6748 	} else {
   6749 		vd->vdisk_type = VD_DISK_TYPE_DISK;
   6750 		vd->nslices = V_NUMPAR;
   6751 	}
   6752 
   6753 	/* default disk label */
   6754 	vd->vdisk_label = VD_DISK_LABEL_UNK;
   6755 
   6756 	/* Open vdisk and initialize parameters */
   6757 	if ((status = vd_setup_vd(vd)) == 0) {
   6758 		vd->initialized |= VD_DISK_READY;
   6759 
   6760 		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
   6761 		PR0("vdisk_type = %s, volume = %s, file = %s, nslices = %u",
   6762 		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
   6763 		    (vd->volume ? "yes" : "no"), (vd->file ? "yes" : "no"),
   6764 		    vd->nslices);
   6765 	} else {
   6766 		if (status != EAGAIN)
   6767 			return (status);
   6768 	}
   6769 
   6770 	/* Initialize locking */
   6771 	if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
   6772 	    &iblock) != DDI_SUCCESS) {
   6773 		PRN("Could not get iblock cookie.");
   6774 		return (EIO);
   6775 	}
   6776 
   6777 	mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
   6778 	vd->initialized |= VD_LOCKING;
   6779 
   6780 
   6781 	/* Create start and completion task queues for the vdisk */
   6782 	(void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id);
   6783 	PR1("tq_name = %s", tq_name);
   6784 	if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1,
   6785 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
   6786 		PRN("Could not create task queue");
   6787 		return (EIO);
   6788 	}
   6789 	(void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id);
   6790 	PR1("tq_name = %s", tq_name);
   6791 	if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1,
   6792 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
   6793 		PRN("Could not create task queue");
   6794 		return (EIO);
   6795 	}
   6796 
   6797 	/* Allocate the staging buffer */
   6798 	vd->max_msglen = sizeof (vio_msg_t);	/* baseline vio message size */
   6799 	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);
   6800 
   6801 	vd->enabled = 1;	/* before callback can dispatch to startq */
   6802 
   6803 
   6804 	/* Bring up LDC */
   6805 	ldc_attr.devclass	= LDC_DEV_BLK_SVC;
   6806 	ldc_attr.instance	= ddi_get_instance(vds->dip);
   6807 	ldc_attr.mode		= LDC_MODE_UNRELIABLE;
   6808 	ldc_attr.mtu		= VD_LDC_MTU;
   6809 	if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
   6810 		PRN("Could not initialize LDC channel %lx, "
   6811 		    "init failed with error %d", ldc_id, status);
   6812 		return (status);
   6813 	}
   6814 	vd->initialized |= VD_LDC;
   6815 
   6816 	if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
   6817 	    (caddr_t)vd)) != 0) {
   6818 		PRN("Could not initialize LDC channel %lu,"
   6819 		    "reg_callback failed with error %d", ldc_id, status);
   6820 		return (status);
   6821 	}
   6822 
   6823 	if ((status = ldc_open(vd->ldc_handle)) != 0) {
   6824 		PRN("Could not initialize LDC channel %lu,"
   6825 		    "open failed with error %d", ldc_id, status);
   6826 		return (status);
   6827 	}
   6828 
   6829 	if ((status = ldc_up(vd->ldc_handle)) != 0) {
   6830 		PR0("ldc_up() returned errno %d", status);
   6831 	}
   6832 
   6833 	/* Allocate the inband task memory handle */
   6834 	status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl));
   6835 	if (status) {
   6836 		PRN("Could not initialize LDC channel %lu,"
   6837 		    "alloc_handle failed with error %d", ldc_id, status);
   6838 		return (ENXIO);
   6839 	}
   6840 
   6841 	/* Add the successfully-initialized vdisk to the server's table */
   6842 	if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
   6843 		PRN("Error adding vdisk ID %lu to table", id);
   6844 		return (EIO);
   6845 	}
   6846 
   6847 	/* store initial state */
   6848 	vd->state = VD_STATE_INIT;
   6849 
   6850 	return (0);
   6851 }
   6852 
   6853 static void
   6854 vd_free_dring_task(vd_t *vdp)
   6855 {
   6856 	if (vdp->dring_task != NULL) {
   6857 		ASSERT(vdp->dring_len != 0);
   6858 		/* Free all dring_task memory handles */
   6859 		for (int i = 0; i < vdp->dring_len; i++) {
   6860 			(void) ldc_mem_free_handle(vdp->dring_task[i].mhdl);
   6861 			kmem_free(vdp->dring_task[i].request,
   6862 			    (vdp->descriptor_size -
   6863 			    sizeof (vio_dring_entry_hdr_t)));
   6864 			vdp->dring_task[i].request = NULL;
   6865 			kmem_free(vdp->dring_task[i].msg, vdp->max_msglen);
   6866 			vdp->dring_task[i].msg = NULL;
   6867 		}
   6868 		kmem_free(vdp->dring_task,
   6869 		    (sizeof (*vdp->dring_task)) * vdp->dring_len);
   6870 		vdp->dring_task = NULL;
   6871 	}
   6872 
   6873 	if (vdp->write_queue != NULL) {
   6874 		kmem_free(vdp->write_queue, sizeof (buf_t *) * vdp->dring_len);
   6875 		vdp->write_queue = NULL;
   6876 	}
   6877 }
   6878 
   6879 /*
   6880  * Destroy the state associated with a virtual disk
   6881  */
   6882 static void
   6883 vds_destroy_vd(void *arg)
   6884 {
   6885 	vd_t	*vd = (vd_t *)arg;
   6886 	int	retry = 0, rv;
   6887 
   6888 	if (vd == NULL)
   6889 		return;
   6890 
   6891 	PR0("Destroying vdisk state");
   6892 
   6893 	/* Disable queuing requests for the vdisk */
   6894 	if (vd->initialized & VD_LOCKING) {
   6895 		mutex_enter(&vd->lock);
   6896 		vd->enabled = 0;
   6897 		mutex_exit(&vd->lock);
   6898 	}
   6899 
   6900 	/* Drain and destroy start queue (*before* destroying ioq) */
   6901 	if (vd->startq != NULL)
   6902 		ddi_taskq_destroy(vd->startq);	/* waits for queued tasks */
   6903 
   6904 	/* Drain and destroy the I/O queue (*before* destroying completionq) */
   6905 	if (vd->ioq != NULL)
   6906 		ddi_taskq_destroy(vd->ioq);
   6907 
   6908 	/* Drain and destroy completion queue (*before* shutting down LDC) */
   6909 	if (vd->completionq != NULL)
   6910 		ddi_taskq_destroy(vd->completionq);	/* waits for tasks */
   6911 
   6912 	vd_free_dring_task(vd);
   6913 
   6914 	/* Free the inband task memory handle */
   6915 	(void) ldc_mem_free_handle(vd->inband_task.mhdl);
   6916 
   6917 	/* Shut down LDC */
   6918 	if (vd->initialized & VD_LDC) {
   6919 		/* unmap the dring */
   6920 		if (vd->initialized & VD_DRING)
   6921 			(void) ldc_mem_dring_unmap(vd->dring_handle);
   6922 
   6923 		/* close LDC channel - retry on EAGAIN */
   6924 		while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) {
   6925 			if (++retry > vds_ldc_retries) {
   6926 				PR0("Timed out closing channel");
   6927 				break;
   6928 			}
   6929 			drv_usecwait(vds_ldc_delay);
   6930 		}
   6931 		if (rv == 0) {
   6932 			(void) ldc_unreg_callback(vd->ldc_handle);
   6933 			(void) ldc_fini(vd->ldc_handle);
   6934 		} else {
   6935 			/*
   6936 			 * Closing the LDC channel has failed. Ideally we should
   6937 			 * fail here but there is no Zeus level infrastructure
   6938 			 * to handle this. The MD has already been changed and
   6939 			 * we have to do the close. So we try to do as much
   6940 			 * clean up as we can.
   6941 			 */
   6942 			(void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE);
   6943 			while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN)
   6944 				drv_usecwait(vds_ldc_delay);
   6945 		}
   6946 	}
   6947 
   6948 	/* Free the staging buffer for msgs */
   6949 	if (vd->vio_msgp != NULL) {
   6950 		kmem_free(vd->vio_msgp, vd->max_msglen);
   6951 		vd->vio_msgp = NULL;
   6952 	}
   6953 
   6954 	/* Free the inband message buffer */
   6955 	if (vd->inband_task.msg != NULL) {
   6956 		kmem_free(vd->inband_task.msg, vd->max_msglen);
   6957 		vd->inband_task.msg = NULL;
   6958 	}
   6959 
   6960 	if (vd->file) {
   6961 		/* Close file */
   6962 		(void) VOP_CLOSE(vd->file_vnode, vd->open_flags, 1,
   6963 		    0, kcred, NULL);
   6964 		VN_RELE(vd->file_vnode);
   6965 	} else {
   6966 		/* Close any open backing-device slices */
   6967 		for (uint_t slice = 0; slice < V_NUMPAR; slice++) {
   6968 			if (vd->ldi_handle[slice] != NULL) {
   6969 				PR0("Closing slice %u", slice);
   6970 				(void) ldi_close(vd->ldi_handle[slice],
   6971 				    vd->open_flags, kcred);
   6972 			}
   6973 		}
   6974 	}
   6975 
   6976 	/* Free disk image devid */
   6977 	if (vd->dskimg_devid != NULL)
   6978 		ddi_devid_free(vd->dskimg_devid);
   6979 
   6980 	/* Free any fake label */
   6981 	if (vd->flabel) {
   6982 		kmem_free(vd->flabel, vd->flabel_size);
   6983 		vd->flabel = NULL;
   6984 		vd->flabel_size = 0;
   6985 	}
   6986 
   6987 	/* Free lock */
   6988 	if (vd->initialized & VD_LOCKING)
   6989 		mutex_destroy(&vd->lock);
   6990 
   6991 	/* Finally, free the vdisk structure itself */
   6992 	kmem_free(vd, sizeof (*vd));
   6993 }
   6994 
   6995 static int
   6996 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options,
   6997     uint64_t ldc_id)
   6998 {
   6999 	int	status;
   7000 	vd_t	*vd = NULL;
   7001 
   7002 
   7003 	if ((status = vds_do_init_vd(vds, id, device_path, options,
   7004 	    ldc_id, &vd)) != 0)
   7005 		vds_destroy_vd(vd);
   7006 
   7007 	return (status);
   7008 }
   7009 
   7010 static int
   7011 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
   7012     uint64_t *ldc_id)
   7013 {
   7014 	int	num_channels;
   7015 
   7016 
   7017 	/* Look for channel endpoint child(ren) of the vdisk MD node */
   7018 	if ((num_channels = md_scan_dag(md, vd_node,
   7019 	    md_find_name(md, VD_CHANNEL_ENDPOINT),
   7020 	    md_find_name(md, "fwd"), channel)) <= 0) {
   7021 		PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
   7022 		return (-1);
   7023 	}
   7024 
   7025 	/* Get the "id" value for the first channel endpoint node */
   7026 	if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
   7027 		PRN("No \"%s\" property found for \"%s\" of vdisk",
   7028 		    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
   7029 		return (-1);
   7030 	}
   7031 
   7032 	if (num_channels > 1) {
   7033 		PRN("Using ID of first of multiple channels for this vdisk");
   7034 	}
   7035 
   7036 	return (0);
   7037 }
   7038 
   7039 static int
   7040 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
   7041 {
   7042 	int		num_nodes, status;
   7043 	size_t		size;
   7044 	mde_cookie_t	*channel;
   7045 
   7046 
   7047 	if ((num_nodes = md_node_count(md)) <= 0) {
   7048 		PRN("Invalid node count in Machine Description subtree");
   7049 		return (-1);
   7050 	}
   7051 	size = num_nodes*(sizeof (*channel));
   7052 	channel = kmem_zalloc(size, KM_SLEEP);
   7053 	status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
   7054 	kmem_free(channel, size);
   7055 
   7056 	return (status);
   7057 }
   7058 
   7059 /*
   7060  * Function:
   7061  *	vds_get_options
   7062  *
   7063  * Description:
   7064  * 	Parse the options of a vds node. Options are defined as an array
   7065  *	of strings in the vds-block-device-opts property of the vds node
   7066  *	in the machine description. Options are returned as a bitmask. The
   7067  *	mapping between the bitmask options and the options strings from the
   7068  *	machine description is defined in the vd_bdev_options[] array.
   7069  *
   7070  *	The vds-block-device-opts property is optional. If a vds has no such
   7071  *	property then no option is defined.
   7072  *
   7073  * Parameters:
   7074  *	md		- machine description.
   7075  *	vd_node		- vds node in the machine description for which
   7076  *			  options have to be parsed.
   7077  *	options		- the returned options.
   7078  *
   7079  * Return Code:
   7080  *	none.
   7081  */
   7082 static void
   7083 vds_get_options(md_t *md, mde_cookie_t vd_node, uint64_t *options)
   7084 {
   7085 	char	*optstr, *opt;
   7086 	int	len, n, i;
   7087 
   7088 	*options = 0;
   7089 
   7090 	if (md_get_prop_data(md, vd_node, VD_BLOCK_DEVICE_OPTS,
   7091 	    (uint8_t **)&optstr, &len) != 0) {
   7092 		PR0("No options found");
   7093 		return;
   7094 	}
   7095 
   7096 	/* parse options */
   7097 	opt = optstr;
   7098 	n = sizeof (vd_bdev_options) / sizeof (vd_option_t);
   7099 
   7100 	while (opt < optstr + len) {
   7101 		for (i = 0; i < n; i++) {
   7102 			if (strncmp(vd_bdev_options[i].vdo_name,
   7103 			    opt, VD_OPTION_NLEN) == 0) {
   7104 				*options |= vd_bdev_options[i].vdo_value;
   7105 				break;
   7106 			}
   7107 		}
   7108 
   7109 		if (i < n) {
   7110 			PR0("option: %s", opt);
   7111 		} else {
   7112 			PRN("option %s is unknown or unsupported", opt);
   7113 		}
   7114 
   7115 		opt += strlen(opt) + 1;
   7116 	}
   7117 }
   7118 
   7119 static void
   7120 vds_driver_types_free(vds_t *vds)
   7121 {
   7122 	if (vds->driver_types != NULL) {
   7123 		kmem_free(vds->driver_types, sizeof (vd_driver_type_t) *
   7124 		    vds->num_drivers);
   7125 		vds->driver_types = NULL;
   7126 		vds->num_drivers = 0;
   7127 	}
   7128 }
   7129 
   7130 /*
   7131  * Update the driver type list with information from vds.conf.
   7132  */
   7133 static void
   7134 vds_driver_types_update(vds_t *vds)
   7135 {
   7136 	char **list, *s;
   7137 	uint_t i, num, count = 0, len;
   7138 
   7139 	if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY, vds->dip,
   7140 	    DDI_PROP_DONTPASS, "driver-type-list", &list, &num) !=
   7141 	    DDI_PROP_SUCCESS)
   7142 		return;
   7143 
   7144 	/*
   7145 	 * We create a driver_types list with as many as entries as there
   7146 	 * is in the driver-type-list from vds.conf. However only valid
   7147 	 * entries will be populated (i.e. entries from driver-type-list
   7148 	 * with a valid syntax). Invalid entries will be left blank so
   7149 	 * they will have no driver name and the driver type will be
   7150 	 * VD_DRIVER_UNKNOWN (= 0).
   7151 	 */
   7152 	vds->num_drivers = num;
   7153 	vds->driver_types = kmem_zalloc(sizeof (vd_driver_type_t) * num,
   7154 	    KM_SLEEP);
   7155 
   7156 	for (i = 0; i < num; i++) {
   7157 
   7158 		s = strchr(list[i], ':');
   7159 
   7160 		if (s == NULL) {
   7161 			PRN("vds.conf: driver-type-list, entry %d (%s): "
   7162 			    "a colon is expected in the entry",
   7163 			    i, list[i]);
   7164 			continue;
   7165 		}
   7166 
   7167 		len = (uintptr_t)s - (uintptr_t)list[i];
   7168 
   7169 		if (len == 0) {
   7170 			PRN("vds.conf: driver-type-list, entry %d (%s): "
   7171 			    "the driver name is empty",
   7172 			    i, list[i]);
   7173 			continue;
   7174 		}
   7175 
   7176 		if (len >= VD_DRIVER_NAME_LEN) {
   7177 			PRN("vds.conf: driver-type-list, entry %d (%s): "
   7178 			    "the driver name is too long",
   7179 			    i, list[i]);
   7180 			continue;
   7181 		}
   7182 
   7183 		if (strcmp(s + 1, "disk") == 0) {
   7184 
   7185 			vds->driver_types[i].type = VD_DRIVER_DISK;
   7186 
   7187 		} else if (strcmp(s + 1, "volume") == 0) {
   7188 
   7189 			vds->driver_types[i].type = VD_DRIVER_VOLUME;
   7190 
   7191 		} else {
   7192 			PRN("vds.conf: driver-type-list, entry %d (%s): "
   7193 			    "the driver type is invalid",
   7194 			    i, list[i]);
   7195 			continue;
   7196 		}
   7197 
   7198 		(void) strncpy(vds->driver_types[i].name, list[i], len);
   7199 
   7200 		PR0("driver-type-list, entry %d (%s) added",
   7201 		    i, list[i]);
   7202 
   7203 		count++;
   7204 	}
   7205 
   7206 	ddi_prop_free(list);
   7207 
   7208 	if (count == 0) {
   7209 		/* nothing was added, clean up */
   7210 		vds_driver_types_free(vds);
   7211 	}
   7212 }
   7213 
   7214 static void
   7215 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
   7216 {
   7217 	char		*device_path = NULL;
   7218 	uint64_t	id = 0, ldc_id = 0, options = 0;
   7219 
   7220 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
   7221 		PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
   7222 		return;
   7223 	}
   7224 	PR0("Adding vdisk ID %lu", id);
   7225 	if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
   7226 	    &device_path) != 0) {
   7227 		PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
   7228 		return;
   7229 	}
   7230 
   7231 	vds_get_options(md, vd_node, &options);
   7232 
   7233 	if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
   7234 		PRN("Error getting LDC ID for vdisk %lu", id);
   7235 		return;
   7236 	}
   7237 
   7238 	if (vds_init_vd(vds, id, device_path, options, ldc_id) != 0) {
   7239 		PRN("Failed to add vdisk ID %lu", id);
   7240 		if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
   7241 			PRN("No vDisk entry found for vdisk ID %lu", id);
   7242 		return;
   7243 	}
   7244 }
   7245 
   7246 static void
   7247 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
   7248 {
   7249 	uint64_t	id = 0;
   7250 
   7251 
   7252 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
   7253 		PRN("Unable to get \"%s\" property from vdisk's MD node",
   7254 		    VD_ID_PROP);
   7255 		return;
   7256 	}
   7257 	PR0("Removing vdisk ID %lu", id);
   7258 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
   7259 		PRN("No vdisk entry found for vdisk ID %lu", id);
   7260 }
   7261 
   7262 static void
   7263 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
   7264     md_t *curr_md, mde_cookie_t curr_vd_node)
   7265 {
   7266 	char		*curr_dev, *prev_dev;
   7267 	uint64_t	curr_id = 0, curr_ldc_id = 0, curr_options = 0;
   7268 	uint64_t	prev_id = 0, prev_ldc_id = 0, prev_options = 0;
   7269 	size_t		len;
   7270 
   7271 
   7272 	/* Validate that vdisk ID has not changed */
   7273 	if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
   7274 		PRN("Error getting previous vdisk \"%s\" property",
   7275 		    VD_ID_PROP);
   7276 		return;
   7277 	}
   7278 	if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
   7279 		PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
   7280 		return;
   7281 	}
   7282 	if (curr_id != prev_id) {
   7283 		PRN("Not changing vdisk:  ID changed from %lu to %lu",
   7284 		    prev_id, curr_id);
   7285 		return;
   7286 	}
   7287 
   7288 	/* Validate that LDC ID has not changed */
   7289 	if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
   7290 		PRN("Error getting LDC ID for vdisk %lu", prev_id);
   7291 		return;
   7292 	}
   7293 
   7294 	if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
   7295 		PRN("Error getting LDC ID for vdisk %lu", curr_id);
   7296 		return;
   7297 	}
   7298 	if (curr_ldc_id != prev_ldc_id) {
   7299 		_NOTE(NOTREACHED);	/* lint is confused */
   7300 		PRN("Not changing vdisk:  "
   7301 		    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
   7302 		return;
   7303 	}
   7304 
   7305 	/* Determine whether device path has changed */
   7306 	if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
   7307 	    &prev_dev) != 0) {
   7308 		PRN("Error getting previous vdisk \"%s\"",
   7309 		    VD_BLOCK_DEVICE_PROP);
   7310 		return;
   7311 	}
   7312 	if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
   7313 	    &curr_dev) != 0) {
   7314 		PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
   7315 		return;
   7316 	}
   7317 	if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
   7318 	    (strncmp(curr_dev, prev_dev, len) == 0))
   7319 		return;	/* no relevant (supported) change */
   7320 
   7321 	/* Validate that options have not changed */
   7322 	vds_get_options(prev_md, prev_vd_node, &prev_options);
   7323 	vds_get_options(curr_md, curr_vd_node, &curr_options);
   7324 	if (prev_options != curr_options) {
   7325 		PRN("Not changing vdisk:  options changed from %lx to %lx",
   7326 		    prev_options, curr_options);
   7327 		return;
   7328 	}
   7329 
   7330 	PR0("Changing vdisk ID %lu", prev_id);
   7331 
   7332 	/* Remove old state, which will close vdisk and reset */
   7333 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
   7334 		PRN("No entry found for vdisk ID %lu", prev_id);
   7335 
   7336 	/* Re-initialize vdisk with new state */
   7337 	if (vds_init_vd(vds, curr_id, curr_dev, curr_options,
   7338 	    curr_ldc_id) != 0) {
   7339 		PRN("Failed to change vdisk ID %lu", curr_id);
   7340 		return;
   7341 	}
   7342 }
   7343 
   7344 static int
   7345 vds_process_md(void *arg, mdeg_result_t *md)
   7346 {
   7347 	int	i;
   7348 	vds_t	*vds = arg;
   7349 
   7350 
   7351 	if (md == NULL)
   7352 		return (MDEG_FAILURE);
   7353 	ASSERT(vds != NULL);
   7354 
   7355 	for (i = 0; i < md->removed.nelem; i++)
   7356 		vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
   7357 	for (i = 0; i < md->match_curr.nelem; i++)
   7358 		vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
   7359 		    md->match_curr.mdp, md->match_curr.mdep[i]);
   7360 	for (i = 0; i < md->added.nelem; i++)
   7361 		vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);
   7362 
   7363 	return (MDEG_SUCCESS);
   7364 }
   7365 
   7366 
   7367 static int
   7368 vds_do_attach(dev_info_t *dip)
   7369 {
   7370 	int			status, sz;
   7371 	int			cfg_handle;
   7372 	minor_t			instance = ddi_get_instance(dip);
   7373 	vds_t			*vds;
   7374 	mdeg_prop_spec_t	*pspecp;
   7375 	mdeg_node_spec_t	*ispecp;
   7376 
   7377 	/*
   7378 	 * The "cfg-handle" property of a vds node in an MD contains the MD's
   7379 	 * notion of "instance", or unique identifier, for that node; OBP
   7380 	 * stores the value of the "cfg-handle" MD property as the value of
   7381 	 * the "reg" property on the node in the device tree it builds from
   7382 	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
   7383 	 * "reg" property value to uniquely identify this device instance when
   7384 	 * registering with the MD event-generation framework.  If the "reg"
   7385 	 * property cannot be found, the device tree state is presumably so
   7386 	 * broken that there is no point in continuing.
   7387 	 */
   7388 	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
   7389 	    VD_REG_PROP)) {
   7390 		PRN("vds \"%s\" property does not exist", VD_REG_PROP);
   7391 		return (DDI_FAILURE);
   7392 	}
   7393 
   7394 	/* Get the MD instance for later MDEG registration */
   7395 	cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
   7396 	    VD_REG_PROP, -1);
   7397 
   7398 	if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
   7399 		PRN("Could not allocate state for instance %u", instance);
   7400 		return (DDI_FAILURE);
   7401 	}
   7402 
   7403 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
   7404 		PRN("Could not get state for instance %u", instance);
   7405 		ddi_soft_state_free(vds_state, instance);
   7406 		return (DDI_FAILURE);
   7407 	}
   7408 
   7409 	vds->dip	= dip;
   7410 	vds->vd_table	= mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
   7411 	    vds_destroy_vd, sizeof (void *));
   7412 
   7413 	ASSERT(vds->vd_table != NULL);
   7414 
   7415 	if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
   7416 		PRN("ldi_ident_from_dip() returned errno %d", status);
   7417 		return (DDI_FAILURE);
   7418 	}
   7419 	vds->initialized |= VDS_LDI;
   7420 
   7421 	/* Register for MD updates */
   7422 	sz = sizeof (vds_prop_template);
   7423 	pspecp = kmem_alloc(sz, KM_SLEEP);
   7424 	bcopy(vds_prop_template, pspecp, sz);
   7425 
   7426 	VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);
   7427 
   7428 	/* initialize the complete prop spec structure */
   7429 	ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
   7430 	ispecp->namep = "virtual-device";
   7431 	ispecp->specp = pspecp;
   7432 
   7433 	if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
   7434 	    &vds->mdeg) != MDEG_SUCCESS) {
   7435 		PRN("Unable to register for MD updates");
   7436 		kmem_free(ispecp, sizeof (mdeg_node_spec_t));
   7437 		kmem_free(pspecp, sz);
   7438 		return (DDI_FAILURE);
   7439 	}
   7440 
   7441 	vds->ispecp = ispecp;
   7442 	vds->initialized |= VDS_MDEG;
   7443 
   7444 	/* Prevent auto-detaching so driver is available whenever MD changes */
   7445 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
   7446 	    DDI_PROP_SUCCESS) {
   7447 		PRN("failed to set \"%s\" property for instance %u",
   7448 		    DDI_NO_AUTODETACH, instance);
   7449 	}
   7450 
   7451 	/* read any user defined driver types from conf file and update list */
   7452 	vds_driver_types_update(vds);
   7453 
   7454 	ddi_report_dev(dip);
   7455 	return (DDI_SUCCESS);
   7456 }
   7457 
   7458 static int
   7459 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
   7460 {
   7461 	int	status;
   7462 
   7463 	switch (cmd) {
   7464 	case DDI_ATTACH:
   7465 		PR0("Attaching");
   7466 		if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
   7467 			(void) vds_detach(dip, DDI_DETACH);
   7468 		return (status);
   7469 	case DDI_RESUME:
   7470 		PR0("No action required for DDI_RESUME");
   7471 		return (DDI_SUCCESS);
   7472 	default:
   7473 		return (DDI_FAILURE);
   7474 	}
   7475 }
   7476 
   7477 static struct dev_ops vds_ops = {
   7478 	DEVO_REV,	/* devo_rev */
   7479 	0,		/* devo_refcnt */
   7480 	ddi_no_info,	/* devo_getinfo */
   7481 	nulldev,	/* devo_identify */
   7482 	nulldev,	/* devo_probe */
   7483 	vds_attach,	/* devo_attach */
   7484 	vds_detach,	/* devo_detach */
   7485 	nodev,		/* devo_reset */
   7486 	NULL,		/* devo_cb_ops */
   7487 	NULL,		/* devo_bus_ops */
   7488 	nulldev,	/* devo_power */
   7489 	ddi_quiesce_not_needed,	/* devo_quiesce */
   7490 };
   7491 
   7492 static struct modldrv modldrv = {
   7493 	&mod_driverops,
   7494 	"virtual disk server",
   7495 	&vds_ops,
   7496 };
   7497 
   7498 static struct modlinkage modlinkage = {
   7499 	MODREV_1,
   7500 	&modldrv,
   7501 	NULL
   7502 };
   7503 
   7504 
   7505 int
   7506 _init(void)
   7507 {
   7508 	int		status;
   7509 
   7510 	if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
   7511 		return (status);
   7512 
   7513 	if ((status = mod_install(&modlinkage)) != 0) {
   7514 		ddi_soft_state_fini(&vds_state);
   7515 		return (status);
   7516 	}
   7517 
   7518 	return (0);
   7519 }
   7520 
   7521 int
   7522 _info(struct modinfo *modinfop)
   7523 {
   7524 	return (mod_info(&modlinkage, modinfop));
   7525 }
   7526 
   7527 int
   7528 _fini(void)
   7529 {
   7530 	int	status;
   7531 
   7532 	if ((status = mod_remove(&modlinkage)) != 0)
   7533 		return (status);
   7534 	ddi_soft_state_fini(&vds_state);
   7535 	return (0);
   7536 }
   7537