Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
     28  * detailed discussion of the overall mpxio architecture.
     29  *
     30  * Default locking order:
     31  *
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
     39  */
     40 
     41 #include <sys/note.h>
     42 #include <sys/types.h>
     43 #include <sys/varargs.h>
     44 #include <sys/param.h>
     45 #include <sys/errno.h>
     46 #include <sys/uio.h>
     47 #include <sys/buf.h>
     48 #include <sys/modctl.h>
     49 #include <sys/open.h>
     50 #include <sys/kmem.h>
     51 #include <sys/poll.h>
     52 #include <sys/conf.h>
     53 #include <sys/bootconf.h>
     54 #include <sys/cmn_err.h>
     55 #include <sys/stat.h>
     56 #include <sys/ddi.h>
     57 #include <sys/sunddi.h>
     58 #include <sys/ddipropdefs.h>
     59 #include <sys/sunndi.h>
     60 #include <sys/ndi_impldefs.h>
     61 #include <sys/promif.h>
     62 #include <sys/sunmdi.h>
     63 #include <sys/mdi_impldefs.h>
     64 #include <sys/taskq.h>
     65 #include <sys/epm.h>
     66 #include <sys/sunpm.h>
     67 #include <sys/modhash.h>
     68 #include <sys/disp.h>
     69 #include <sys/autoconf.h>
     70 #include <sys/sysmacros.h>
     71 
     72 #ifdef	DEBUG
     73 #include <sys/debug.h>
     74 int	mdi_debug = 1;
     75 int	mdi_debug_logonly = 0;
     76 #define	MDI_DEBUG(dbglevel, pargs) if (mdi_debug >= (dbglevel))	i_mdi_log pargs
     77 #define	MDI_WARN	CE_WARN, __func__
     78 #define	MDI_NOTE	CE_NOTE, __func__
     79 #define	MDI_CONT	CE_CONT, __func__
     80 static void i_mdi_log(int, const char *, dev_info_t *, const char *, ...);
     81 #else	/* !DEBUG */
     82 #define	MDI_DEBUG(dbglevel, pargs)
     83 #endif	/* DEBUG */
     84 int	mdi_debug_consoleonly = 0;
     85 
     86 extern pri_t	minclsyspri;
     87 extern int	modrootloaded;
     88 
     89 /*
     90  * Global mutex:
     91  * Protects vHCI list and structure members.
     92  */
     93 kmutex_t	mdi_mutex;
     94 
     95 /*
     96  * Registered vHCI class driver lists
     97  */
     98 int		mdi_vhci_count;
     99 mdi_vhci_t	*mdi_vhci_head;
    100 mdi_vhci_t	*mdi_vhci_tail;
    101 
    102 /*
    103  * Client Hash Table size
    104  */
    105 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
    106 
    107 /*
    108  * taskq interface definitions
    109  */
    110 #define	MDI_TASKQ_N_THREADS	8
    111 #define	MDI_TASKQ_PRI		minclsyspri
    112 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
    113 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
    114 
    115 taskq_t				*mdi_taskq;
    116 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
    117 
    118 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
    119 
    120 /*
    121  * The data should be "quiet" for this interval (in seconds) before the
    122  * vhci cached data is flushed to the disk.
    123  */
    124 static int mdi_vhcache_flush_delay = 10;
    125 
    126 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
    127 static int mdi_vhcache_flush_daemon_idle_time = 60;
    128 
    129 /*
    130  * MDI falls back to discovery of all paths when a bus_config_one fails.
    131  * The following parameters can be used to tune this operation.
    132  *
    133  * mdi_path_discovery_boot
    134  *	Number of times path discovery will be attempted during early boot.
    135  *	Probably there is no reason to ever set this value to greater than one.
    136  *
    137  * mdi_path_discovery_postboot
    138  *	Number of times path discovery will be attempted after early boot.
    139  *	Set it to a minimum of two to allow for discovery of iscsi paths which
    140  *	may happen very late during booting.
    141  *
    142  * mdi_path_discovery_interval
    143  *	Minimum number of seconds MDI will wait between successive discovery
    144  *	of all paths. Set it to -1 to disable discovery of all paths.
    145  */
    146 static int mdi_path_discovery_boot = 1;
    147 static int mdi_path_discovery_postboot = 2;
    148 static int mdi_path_discovery_interval = 10;
    149 
    150 /*
    151  * number of seconds the asynchronous configuration thread will sleep idle
    152  * before exiting.
    153  */
    154 static int mdi_async_config_idle_time = 600;
    155 
    156 static int mdi_bus_config_cache_hash_size = 256;
    157 
    158 /* turns off multithreaded configuration for certain operations */
    159 static int mdi_mtc_off = 0;
    160 
    161 /*
    162  * The "path" to a pathinfo node is identical to the /devices path to a
    163  * devinfo node had the device been enumerated under a pHCI instead of
    164  * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
    165  * This association persists across create/delete of the pathinfo nodes,
    166  * but not across reboot.
    167  */
    168 static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
    169 static int		mdi_pathmap_hash_size = 256;
    170 static kmutex_t		mdi_pathmap_mutex;
    171 static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
    172 static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
    173 static mod_hash_t	*mdi_pathmap_sbyinstance;	/* inst->shortpath */
    174 
    175 /*
    176  * MDI component property name/value string definitions
    177  */
    178 const char 		*mdi_component_prop = "mpxio-component";
    179 const char		*mdi_component_prop_vhci = "vhci";
    180 const char		*mdi_component_prop_phci = "phci";
    181 const char		*mdi_component_prop_client = "client";
    182 
    183 /*
    184  * MDI client global unique identifier property name
    185  */
    186 const char		*mdi_client_guid_prop = "client-guid";
    187 
    188 /*
    189  * MDI client load balancing property name/value string definitions
    190  */
    191 const char		*mdi_load_balance = "load-balance";
    192 const char		*mdi_load_balance_none = "none";
    193 const char		*mdi_load_balance_rr = "round-robin";
    194 const char		*mdi_load_balance_lba = "logical-block";
    195 
    196 /*
    197  * Obsolete vHCI class definition; to be removed after Leadville update
    198  */
    199 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
    200 
    201 static char vhci_greeting[] =
    202 	"\tThere already exists one vHCI driver for class %s\n"
    203 	"\tOnly one vHCI driver for each class is allowed\n";
    204 
    205 /*
    206  * Static function prototypes
    207  */
    208 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
    209 static int		i_mdi_client_offline(dev_info_t *, uint_t);
    210 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
    211 static void		i_mdi_phci_post_detach(dev_info_t *,
    212 			    ddi_detach_cmd_t, int);
    213 static int		i_mdi_client_pre_detach(dev_info_t *,
    214 			    ddi_detach_cmd_t);
    215 static void		i_mdi_client_post_detach(dev_info_t *,
    216 			    ddi_detach_cmd_t, int);
    217 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
    218 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
    219 static int 		i_mdi_lba_lb(mdi_client_t *ct,
    220 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
    221 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
    222 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
    223 static void		i_mdi_pm_reset_client(mdi_client_t *);
    224 static int		i_mdi_power_all_phci(mdi_client_t *);
    225 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
    226 
    227 
    228 /*
    229  * Internal mdi_pathinfo node functions
    230  */
    231 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
    232 
    233 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
    234 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
    235 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
    236 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
    237 static void		i_mdi_phci_unlock(mdi_phci_t *);
    238 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
    239 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
    240 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
    241 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
    242 			    mdi_client_t *);
    243 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
    244 static void		i_mdi_client_remove_path(mdi_client_t *,
    245 			    mdi_pathinfo_t *);
    246 
    247 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
    248 			    mdi_pathinfo_state_t, int);
    249 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
    250 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
    251 			    char **, int);
    252 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
    253 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
    254 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
    255 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
    256 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
    257 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
    258 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
    259 static void		i_mdi_client_update_state(mdi_client_t *);
    260 static int		i_mdi_client_compute_state(mdi_client_t *,
    261 			    mdi_phci_t *);
    262 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
    263 static void		i_mdi_client_unlock(mdi_client_t *);
    264 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
    265 static mdi_client_t	*i_devi_get_client(dev_info_t *);
    266 /*
    267  * NOTE: this will be removed once the NWS files are changed to use the new
    268  * mdi_{enable,disable}_path interfaces
    269  */
    270 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
    271 				int, int);
    272 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
    273 				mdi_vhci_t *vh, int flags, int op);
    274 /*
    275  * Failover related function prototypes
    276  */
    277 static int		i_mdi_failover(void *);
    278 
    279 /*
    280  * misc internal functions
    281  */
    282 static int		i_mdi_get_hash_key(char *);
    283 static int		i_map_nvlist_error_to_mdi(int);
    284 static void		i_mdi_report_path_state(mdi_client_t *,
    285 			    mdi_pathinfo_t *);
    286 
    287 static void		setup_vhci_cache(mdi_vhci_t *);
    288 static int		destroy_vhci_cache(mdi_vhci_t *);
    289 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
    290 static boolean_t	stop_vhcache_flush_thread(void *, int);
    291 static void		free_string_array(char **, int);
    292 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
    293 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
    294 static void		free_vhcache_client(mdi_vhcache_client_t *);
    295 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
    296 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
    297 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
    298 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
    299 static void		vhcache_pi_add(mdi_vhci_config_t *,
    300 			    struct mdi_pathinfo *);
    301 static void		vhcache_pi_remove(mdi_vhci_config_t *,
    302 			    struct mdi_pathinfo *);
    303 static void		free_phclient_path_list(mdi_phys_path_t *);
    304 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
    305 static int		flush_vhcache(mdi_vhci_config_t *, int);
    306 static void		vhcache_dirty(mdi_vhci_config_t *);
    307 static void		free_async_client_config(mdi_async_client_config_t *);
    308 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
    309 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
    310 static nvlist_t		*read_on_disk_vhci_cache(char *);
    311 extern int		fread_nvlist(char *, nvlist_t **);
    312 extern int		fwrite_nvlist(char *, nvlist_t *);
    313 
    314 /* called once when first vhci registers with mdi */
    315 static void
    316 i_mdi_init()
    317 {
    318 	static int initialized = 0;
    319 
    320 	if (initialized)
    321 		return;
    322 	initialized = 1;
    323 
    324 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
    325 
    326 	/* Create our taskq resources */
    327 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
    328 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
    329 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
    330 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
    331 
    332 	/* Allocate ['path_instance' <-> "path"] maps */
    333 	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
    334 	mdi_pathmap_bypath = mod_hash_create_strhash(
    335 	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
    336 	    mod_hash_null_valdtor);
    337 	mdi_pathmap_byinstance = mod_hash_create_idhash(
    338 	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
    339 	    mod_hash_null_valdtor);
    340 	mdi_pathmap_sbyinstance = mod_hash_create_idhash(
    341 	    "mdi_pathmap_sbyinstance", mdi_pathmap_hash_size,
    342 	    mod_hash_null_valdtor);
    343 }
    344 
    345 /*
    346  * mdi_get_component_type():
    347  *		Return mpxio component type
    348  * Return Values:
    349  *		MDI_COMPONENT_NONE
    350  *		MDI_COMPONENT_VHCI
    351  *		MDI_COMPONENT_PHCI
    352  *		MDI_COMPONENT_CLIENT
    353  * XXX This doesn't work under multi-level MPxIO and should be
    354  *	removed when clients migrate mdi_component_is_*() interfaces.
    355  */
    356 int
    357 mdi_get_component_type(dev_info_t *dip)
    358 {
    359 	return (DEVI(dip)->devi_mdi_component);
    360 }
    361 
    362 /*
    363  * mdi_vhci_register():
    364  *		Register a vHCI module with the mpxio framework
    365  *		mdi_vhci_register() is called by vHCI drivers to register the
    366  *		'class_driver' vHCI driver and its MDI entrypoints with the
    367  *		mpxio framework.  The vHCI driver must call this interface as
    368  *		part of its attach(9e) handler.
    369  *		Competing threads may try to attach mdi_vhci_register() as
    370  *		the vHCI drivers are loaded and attached as a result of pHCI
    371  *		driver instance registration (mdi_phci_register()) with the
    372  *		framework.
    373  * Return Values:
    374  *		MDI_SUCCESS
    375  *		MDI_FAILURE
    376  */
    377 /*ARGSUSED*/
    378 int
    379 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
    380     int flags)
    381 {
    382 	mdi_vhci_t		*vh = NULL;
    383 
    384 	/* Registrant can't be older */
    385 	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);
    386 
    387 #ifdef DEBUG
    388 	/*
    389 	 * IB nexus driver is loaded only when IB hardware is present.
    390 	 * In order to be able to do this there is a need to drive the loading
    391 	 * and attaching of the IB nexus driver (especially when an IB hardware
    392 	 * is dynamically plugged in) when an IB HCA driver (PHCI)
    393 	 * is being attached. Unfortunately this gets into the limitations
    394 	 * of devfs as there seems to be no clean way to drive configuration
    395 	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
    396 	 * for IB.
    397 	 */
    398 	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
    399 		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
    400 #endif
    401 
    402 	i_mdi_init();
    403 
    404 	mutex_enter(&mdi_mutex);
    405 	/*
    406 	 * Scan for already registered vhci
    407 	 */
    408 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
    409 		if (strcmp(vh->vh_class, class) == 0) {
    410 			/*
    411 			 * vHCI has already been created.  Check for valid
    412 			 * vHCI ops registration.  We only support one vHCI
    413 			 * module per class
    414 			 */
    415 			if (vh->vh_ops != NULL) {
    416 				mutex_exit(&mdi_mutex);
    417 				cmn_err(CE_NOTE, vhci_greeting, class);
    418 				return (MDI_FAILURE);
    419 			}
    420 			break;
    421 		}
    422 	}
    423 
    424 	/*
    425 	 * if not yet created, create the vHCI component
    426 	 */
    427 	if (vh == NULL) {
    428 		struct client_hash	*hash = NULL;
    429 		char			*load_balance;
    430 
    431 		/*
    432 		 * Allocate and initialize the mdi extensions
    433 		 */
    434 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
    435 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
    436 		    KM_SLEEP);
    437 		vh->vh_client_table = hash;
    438 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
    439 		(void) strcpy(vh->vh_class, class);
    440 		vh->vh_lb = LOAD_BALANCE_RR;
    441 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
    442 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
    443 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
    444 				vh->vh_lb = LOAD_BALANCE_NONE;
    445 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
    446 				    == 0) {
    447 				vh->vh_lb = LOAD_BALANCE_LBA;
    448 			}
    449 			ddi_prop_free(load_balance);
    450 		}
    451 
    452 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
    453 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
    454 
    455 		/*
    456 		 * Store the vHCI ops vectors
    457 		 */
    458 		vh->vh_dip = vdip;
    459 		vh->vh_ops = vops;
    460 
    461 		setup_vhci_cache(vh);
    462 
    463 		if (mdi_vhci_head == NULL) {
    464 			mdi_vhci_head = vh;
    465 		}
    466 		if (mdi_vhci_tail) {
    467 			mdi_vhci_tail->vh_next = vh;
    468 		}
    469 		mdi_vhci_tail = vh;
    470 		mdi_vhci_count++;
    471 	}
    472 
    473 	/*
    474 	 * Claim the devfs node as a vhci component
    475 	 */
    476 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
    477 
    478 	/*
    479 	 * Initialize our back reference from dev_info node
    480 	 */
    481 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
    482 	mutex_exit(&mdi_mutex);
    483 	return (MDI_SUCCESS);
    484 }
    485 
    486 /*
    487  * mdi_vhci_unregister():
    488  *		Unregister a vHCI module from mpxio framework
    489  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
    490  * 		of a vhci to unregister it from the framework.
    491  * Return Values:
    492  *		MDI_SUCCESS
    493  *		MDI_FAILURE
    494  */
    495 /*ARGSUSED*/
    496 int
    497 mdi_vhci_unregister(dev_info_t *vdip, int flags)
    498 {
    499 	mdi_vhci_t	*found, *vh, *prev = NULL;
    500 
    501 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
    502 
    503 	/*
    504 	 * Check for invalid VHCI
    505 	 */
    506 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
    507 		return (MDI_FAILURE);
    508 
    509 	/*
    510 	 * Scan the list of registered vHCIs for a match
    511 	 */
    512 	mutex_enter(&mdi_mutex);
    513 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
    514 		if (found == vh)
    515 			break;
    516 		prev = found;
    517 	}
    518 
    519 	if (found == NULL) {
    520 		mutex_exit(&mdi_mutex);
    521 		return (MDI_FAILURE);
    522 	}
    523 
    524 	/*
    525 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
    526 	 * should have been unregistered, before a vHCI can be
    527 	 * unregistered.
    528 	 */
    529 	MDI_VHCI_PHCI_LOCK(vh);
    530 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
    531 		MDI_VHCI_PHCI_UNLOCK(vh);
    532 		mutex_exit(&mdi_mutex);
    533 		return (MDI_FAILURE);
    534 	}
    535 	MDI_VHCI_PHCI_UNLOCK(vh);
    536 
    537 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
    538 		mutex_exit(&mdi_mutex);
    539 		return (MDI_FAILURE);
    540 	}
    541 
    542 	/*
    543 	 * Remove the vHCI from the global list
    544 	 */
    545 	if (vh == mdi_vhci_head) {
    546 		mdi_vhci_head = vh->vh_next;
    547 	} else {
    548 		prev->vh_next = vh->vh_next;
    549 	}
    550 	if (vh == mdi_vhci_tail) {
    551 		mdi_vhci_tail = prev;
    552 	}
    553 	mdi_vhci_count--;
    554 	mutex_exit(&mdi_mutex);
    555 
    556 	vh->vh_ops = NULL;
    557 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
    558 	DEVI(vdip)->devi_mdi_xhci = NULL;
    559 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
    560 	kmem_free(vh->vh_client_table,
    561 	    mdi_client_table_size * sizeof (struct client_hash));
    562 	mutex_destroy(&vh->vh_phci_mutex);
    563 	mutex_destroy(&vh->vh_client_mutex);
    564 
    565 	kmem_free(vh, sizeof (mdi_vhci_t));
    566 	return (MDI_SUCCESS);
    567 }
    568 
    569 /*
    570  * i_mdi_vhci_class2vhci():
    571  *		Look for a matching vHCI module given a vHCI class name
    572  * Return Values:
    573  *		Handle to a vHCI component
    574  *		NULL
    575  */
    576 static mdi_vhci_t *
    577 i_mdi_vhci_class2vhci(char *class)
    578 {
    579 	mdi_vhci_t	*vh = NULL;
    580 
    581 	ASSERT(!MUTEX_HELD(&mdi_mutex));
    582 
    583 	mutex_enter(&mdi_mutex);
    584 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
    585 		if (strcmp(vh->vh_class, class) == 0) {
    586 			break;
    587 		}
    588 	}
    589 	mutex_exit(&mdi_mutex);
    590 	return (vh);
    591 }
    592 
    593 /*
    594  * i_devi_get_vhci():
    595  *		Utility function to get the handle to a vHCI component
    596  * Return Values:
    597  *		Handle to a vHCI component
    598  *		NULL
    599  */
    600 mdi_vhci_t *
    601 i_devi_get_vhci(dev_info_t *vdip)
    602 {
    603 	mdi_vhci_t	*vh = NULL;
    604 	if (MDI_VHCI(vdip)) {
    605 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
    606 	}
    607 	return (vh);
    608 }
    609 
    610 /*
    611  * mdi_phci_register():
    612  *		Register a pHCI module with mpxio framework
    613  *		mdi_phci_register() is called by pHCI drivers to register with
    614  *		the mpxio framework and a specific 'class_driver' vHCI.  The
    615  *		pHCI driver must call this interface as part of its attach(9e)
    616  *		handler.
    617  * Return Values:
    618  *		MDI_SUCCESS
    619  *		MDI_FAILURE
    620  */
    621 /*ARGSUSED*/
    622 int
    623 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
    624 {
    625 	mdi_phci_t		*ph;
    626 	mdi_vhci_t		*vh;
    627 	char			*data;
    628 
    629 	/*
    630 	 * Some subsystems, like fcp, perform pHCI registration from a
    631 	 * different thread than the one doing the pHCI attach(9E) - the
    632 	 * driver attach code is waiting for this other thread to complete.
    633 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
    634 	 * (indicating that some thread has done an ndi_devi_enter of parent)
    635 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
    636 	 */
    637 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
    638 
    639 	/*
    640 	 * Check for mpxio-disable property. Enable mpxio if the property is
    641 	 * missing or not set to "yes".
    642 	 * If the property is set to "yes" then emit a brief message.
    643 	 */
    644 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
    645 	    &data) == DDI_SUCCESS)) {
    646 		if (strcmp(data, "yes") == 0) {
    647 			MDI_DEBUG(1, (MDI_CONT, pdip,
    648 			    "?multipath capabilities disabled via %s.conf.",
    649 			    ddi_driver_name(pdip)));
    650 			ddi_prop_free(data);
    651 			return (MDI_FAILURE);
    652 		}
    653 		ddi_prop_free(data);
    654 	}
    655 
    656 	/*
    657 	 * Search for a matching vHCI
    658 	 */
    659 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
    660 	if (vh == NULL) {
    661 		return (MDI_FAILURE);
    662 	}
    663 
    664 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
    665 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
    666 	ph->ph_dip = pdip;
    667 	ph->ph_vhci = vh;
    668 	ph->ph_next = NULL;
    669 	ph->ph_unstable = 0;
    670 	ph->ph_vprivate = 0;
    671 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
    672 
    673 	MDI_PHCI_LOCK(ph);
    674 	MDI_PHCI_SET_POWER_UP(ph);
    675 	MDI_PHCI_UNLOCK(ph);
    676 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
    677 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
    678 
    679 	vhcache_phci_add(vh->vh_config, ph);
    680 
    681 	MDI_VHCI_PHCI_LOCK(vh);
    682 	if (vh->vh_phci_head == NULL) {
    683 		vh->vh_phci_head = ph;
    684 	}
    685 	if (vh->vh_phci_tail) {
    686 		vh->vh_phci_tail->ph_next = ph;
    687 	}
    688 	vh->vh_phci_tail = ph;
    689 	vh->vh_phci_count++;
    690 	MDI_VHCI_PHCI_UNLOCK(vh);
    691 
    692 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
    693 	return (MDI_SUCCESS);
    694 }
    695 
    696 /*
    697  * mdi_phci_unregister():
    698  *		Unregister a pHCI module from mpxio framework
    699  *		mdi_phci_unregister() is called by the pHCI drivers from their
    700  *		detach(9E) handler to unregister their instances from the
    701  *		framework.
    702  * Return Values:
    703  *		MDI_SUCCESS
    704  *		MDI_FAILURE
    705  */
    706 /*ARGSUSED*/
    707 int
    708 mdi_phci_unregister(dev_info_t *pdip, int flags)
    709 {
    710 	mdi_vhci_t		*vh;
    711 	mdi_phci_t		*ph;
    712 	mdi_phci_t		*tmp;
    713 	mdi_phci_t		*prev = NULL;
    714 	mdi_pathinfo_t		*pip;
    715 
    716 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
    717 
    718 	ph = i_devi_get_phci(pdip);
    719 	if (ph == NULL) {
    720 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid pHCI"));
    721 		return (MDI_FAILURE);
    722 	}
    723 
    724 	vh = ph->ph_vhci;
    725 	ASSERT(vh != NULL);
    726 	if (vh == NULL) {
    727 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid vHCI"));
    728 		return (MDI_FAILURE);
    729 	}
    730 
    731 	MDI_VHCI_PHCI_LOCK(vh);
    732 	tmp = vh->vh_phci_head;
    733 	while (tmp) {
    734 		if (tmp == ph) {
    735 			break;
    736 		}
    737 		prev = tmp;
    738 		tmp = tmp->ph_next;
    739 	}
    740 
    741 	if (ph == vh->vh_phci_head) {
    742 		vh->vh_phci_head = ph->ph_next;
    743 	} else {
    744 		prev->ph_next = ph->ph_next;
    745 	}
    746 
    747 	if (ph == vh->vh_phci_tail) {
    748 		vh->vh_phci_tail = prev;
    749 	}
    750 
    751 	vh->vh_phci_count--;
    752 	MDI_VHCI_PHCI_UNLOCK(vh);
    753 
    754 	/* Walk remaining pathinfo nodes and disassociate them from pHCI */
    755 	MDI_PHCI_LOCK(ph);
    756 	for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip;
    757 	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link)
    758 		MDI_PI(pip)->pi_phci = NULL;
    759 	MDI_PHCI_UNLOCK(ph);
    760 
    761 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
    762 	    ESC_DDI_INITIATOR_UNREGISTER);
    763 	vhcache_phci_remove(vh->vh_config, ph);
    764 	cv_destroy(&ph->ph_unstable_cv);
    765 	mutex_destroy(&ph->ph_mutex);
    766 	kmem_free(ph, sizeof (mdi_phci_t));
    767 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
    768 	DEVI(pdip)->devi_mdi_xhci = NULL;
    769 	return (MDI_SUCCESS);
    770 }
    771 
    772 /*
    773  * i_devi_get_phci():
    774  * 		Utility function to return the phci extensions.
    775  */
    776 static mdi_phci_t *
    777 i_devi_get_phci(dev_info_t *pdip)
    778 {
    779 	mdi_phci_t	*ph = NULL;
    780 
    781 	if (MDI_PHCI(pdip)) {
    782 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
    783 	}
    784 	return (ph);
    785 }
    786 
    787 /*
    788  * Single thread mdi entry into devinfo node for modifying its children.
    789  * If necessary we perform an ndi_devi_enter of the vHCI before doing
    790  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
    791  * for the vHCI and one for the pHCI.
    792  */
    793 void
    794 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
    795 {
    796 	dev_info_t	*vdip;
    797 	int		vcircular, pcircular;
    798 
    799 	/* Verify calling context */
    800 	ASSERT(MDI_PHCI(phci_dip));
    801 	vdip = mdi_devi_get_vdip(phci_dip);
    802 	ASSERT(vdip);			/* A pHCI always has a vHCI */
    803 
    804 	/*
    805 	 * If pHCI is detaching then the framework has already entered the
    806 	 * vHCI on a threads that went down the code path leading to
    807 	 * detach_node().  This framework enter of the vHCI during pHCI
    808 	 * detach is done to avoid deadlock with vHCI power management
    809 	 * operations which enter the vHCI and the enter down the path
    810 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
    811 	 * enter of the vHCI on frameworks vHCI enter that has already
    812 	 * occurred - this is OK because we know that the framework thread
    813 	 * doing detach is waiting for our completion.
    814 	 *
    815 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
    816 	 * race with detach - but we can't do that because the framework has
    817 	 * already entered the parent, so we have some complexity instead.
    818 	 */
    819 	for (;;) {
    820 		if (ndi_devi_tryenter(vdip, &vcircular)) {
    821 			ASSERT(vcircular != -1);
    822 			if (DEVI_IS_DETACHING(phci_dip)) {
    823 				ndi_devi_exit(vdip, vcircular);
    824 				vcircular = -1;
    825 			}
    826 			break;
    827 		} else if (DEVI_IS_DETACHING(phci_dip)) {
    828 			vcircular = -1;
    829 			break;
    830 		} else if (servicing_interrupt()) {
    831 			/*
    832 			 * Don't delay an interrupt (and ensure adaptive
    833 			 * mutex inversion support).
    834 			 */
    835 			ndi_devi_enter(vdip, &vcircular);
    836 			break;
    837 		} else {
    838 			delay_random(2);
    839 		}
    840 	}
    841 
    842 	ndi_devi_enter(phci_dip, &pcircular);
    843 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
    844 }
    845 
    846 /*
    847  * Attempt to mdi_devi_enter.
    848  */
    849 int
    850 mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
    851 {
    852 	dev_info_t	*vdip;
    853 	int		vcircular, pcircular;
    854 
    855 	/* Verify calling context */
    856 	ASSERT(MDI_PHCI(phci_dip));
    857 	vdip = mdi_devi_get_vdip(phci_dip);
    858 	ASSERT(vdip);			/* A pHCI always has a vHCI */
    859 
    860 	if (ndi_devi_tryenter(vdip, &vcircular)) {
    861 		if (ndi_devi_tryenter(phci_dip, &pcircular)) {
    862 			*circular = (vcircular << 16) | (pcircular & 0xFFFF);
    863 			return (1);	/* locked */
    864 		}
    865 		ndi_devi_exit(vdip, vcircular);
    866 	}
    867 	return (0);			/* busy */
    868 }
    869 
    870 /*
    871  * Release mdi_devi_enter or successful mdi_devi_tryenter.
    872  */
    873 void
    874 mdi_devi_exit(dev_info_t *phci_dip, int circular)
    875 {
    876 	dev_info_t	*vdip;
    877 	int		vcircular, pcircular;
    878 
    879 	/* Verify calling context */
    880 	ASSERT(MDI_PHCI(phci_dip));
    881 	vdip = mdi_devi_get_vdip(phci_dip);
    882 	ASSERT(vdip);			/* A pHCI always has a vHCI */
    883 
    884 	/* extract two circular recursion values from single int */
    885 	pcircular = (short)(circular & 0xFFFF);
    886 	vcircular = (short)((circular >> 16) & 0xFFFF);
    887 
    888 	ndi_devi_exit(phci_dip, pcircular);
    889 	if (vcircular != -1)
    890 		ndi_devi_exit(vdip, vcircular);
    891 }
    892 
    893 /*
    894  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
    895  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
    896  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
    897  * with vHCI power management code during path online/offline.  Each
    898  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
    899  * occur within the scope of an active mdi_devi_enter that establishes the
    900  * circular value.
    901  */
    902 void
    903 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
    904 {
    905 	int		pcircular;
    906 
    907 	/* Verify calling context */
    908 	ASSERT(MDI_PHCI(phci_dip));
    909 
    910 	/* Keep hold on pHCI until we reenter in mdi_devi_enter_phci */
    911 	ndi_hold_devi(phci_dip);
    912 
    913 	pcircular = (short)(circular & 0xFFFF);
    914 	ndi_devi_exit(phci_dip, pcircular);
    915 }
    916 
    917 void
    918 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
    919 {
    920 	int		pcircular;
    921 
    922 	/* Verify calling context */
    923 	ASSERT(MDI_PHCI(phci_dip));
    924 
    925 	ndi_devi_enter(phci_dip, &pcircular);
    926 
    927 	/* Drop hold from mdi_devi_exit_phci. */
    928 	ndi_rele_devi(phci_dip);
    929 
    930 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
    931 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
    932 }
    933 
    934 /*
    935  * mdi_devi_get_vdip():
    936  *		given a pHCI dip return vHCI dip
    937  */
    938 dev_info_t *
    939 mdi_devi_get_vdip(dev_info_t *pdip)
    940 {
    941 	mdi_phci_t	*ph;
    942 
    943 	ph = i_devi_get_phci(pdip);
    944 	if (ph && ph->ph_vhci)
    945 		return (ph->ph_vhci->vh_dip);
    946 	return (NULL);
    947 }
    948 
    949 /*
    950  * mdi_devi_pdip_entered():
    951  *		Return 1 if we are vHCI and have done an ndi_devi_enter
    952  *		of a pHCI
    953  */
    954 int
    955 mdi_devi_pdip_entered(dev_info_t *vdip)
    956 {
    957 	mdi_vhci_t	*vh;
    958 	mdi_phci_t	*ph;
    959 
    960 	vh = i_devi_get_vhci(vdip);
    961 	if (vh == NULL)
    962 		return (0);
    963 
    964 	MDI_VHCI_PHCI_LOCK(vh);
    965 	ph = vh->vh_phci_head;
    966 	while (ph) {
    967 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
    968 			MDI_VHCI_PHCI_UNLOCK(vh);
    969 			return (1);
    970 		}
    971 		ph = ph->ph_next;
    972 	}
    973 	MDI_VHCI_PHCI_UNLOCK(vh);
    974 	return (0);
    975 }
    976 
    977 /*
    978  * mdi_phci_path2devinfo():
    979  * 		Utility function to search for a valid phci device given
    980  *		the devfs pathname.
    981  */
    982 dev_info_t *
    983 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
    984 {
    985 	char		*temp_pathname;
    986 	mdi_vhci_t	*vh;
    987 	mdi_phci_t	*ph;
    988 	dev_info_t 	*pdip = NULL;
    989 
    990 	vh = i_devi_get_vhci(vdip);
    991 	ASSERT(vh != NULL);
    992 
    993 	if (vh == NULL) {
    994 		/*
    995 		 * Invalid vHCI component, return failure
    996 		 */
    997 		return (NULL);
    998 	}
    999 
   1000 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
   1001 	MDI_VHCI_PHCI_LOCK(vh);
   1002 	ph = vh->vh_phci_head;
   1003 	while (ph != NULL) {
   1004 		pdip = ph->ph_dip;
   1005 		ASSERT(pdip != NULL);
   1006 		*temp_pathname = '\0';
   1007 		(void) ddi_pathname(pdip, temp_pathname);
   1008 		if (strcmp(temp_pathname, pathname) == 0) {
   1009 			break;
   1010 		}
   1011 		ph = ph->ph_next;
   1012 	}
   1013 	if (ph == NULL) {
   1014 		pdip = NULL;
   1015 	}
   1016 	MDI_VHCI_PHCI_UNLOCK(vh);
   1017 	kmem_free(temp_pathname, MAXPATHLEN);
   1018 	return (pdip);
   1019 }
   1020 
   1021 /*
   1022  * mdi_phci_get_path_count():
   1023  * 		get number of path information nodes associated with a given
   1024  *		pHCI device.
   1025  */
   1026 int
   1027 mdi_phci_get_path_count(dev_info_t *pdip)
   1028 {
   1029 	mdi_phci_t	*ph;
   1030 	int		count = 0;
   1031 
   1032 	ph = i_devi_get_phci(pdip);
   1033 	if (ph != NULL) {
   1034 		count = ph->ph_path_count;
   1035 	}
   1036 	return (count);
   1037 }
   1038 
   1039 /*
   1040  * i_mdi_phci_lock():
   1041  *		Lock a pHCI device
   1042  * Return Values:
   1043  *		None
   1044  * Note:
   1045  *		The default locking order is:
   1046  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
   1047  *		But there are number of situations where locks need to be
   1048  *		grabbed in reverse order.  This routine implements try and lock
   1049  *		mechanism depending on the requested parameter option.
   1050  */
   1051 static void
   1052 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
   1053 {
   1054 	if (pip) {
   1055 		/* Reverse locking is requested. */
   1056 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
   1057 			if (servicing_interrupt()) {
   1058 				MDI_PI_HOLD(pip);
   1059 				MDI_PI_UNLOCK(pip);
   1060 				MDI_PHCI_LOCK(ph);
   1061 				MDI_PI_LOCK(pip);
   1062 				MDI_PI_RELE(pip);
   1063 				break;
   1064 			} else {
   1065 				/*
   1066 				 * tryenter failed. Try to grab again
   1067 				 * after a small delay
   1068 				 */
   1069 				MDI_PI_HOLD(pip);
   1070 				MDI_PI_UNLOCK(pip);
   1071 				delay_random(2);
   1072 				MDI_PI_LOCK(pip);
   1073 				MDI_PI_RELE(pip);
   1074 			}
   1075 		}
   1076 	} else {
   1077 		MDI_PHCI_LOCK(ph);
   1078 	}
   1079 }
   1080 
   1081 /*
   1082  * i_mdi_phci_unlock():
   1083  *		Unlock the pHCI component
   1084  */
   1085 static void
   1086 i_mdi_phci_unlock(mdi_phci_t *ph)
   1087 {
   1088 	MDI_PHCI_UNLOCK(ph);
   1089 }
   1090 
   1091 /*
   1092  * i_mdi_devinfo_create():
   1093  *		create client device's devinfo node
   1094  * Return Values:
   1095  *		dev_info
   1096  *		NULL
   1097  * Notes:
   1098  */
   1099 static dev_info_t *
   1100 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
   1101 	char **compatible, int ncompatible)
   1102 {
   1103 	dev_info_t *cdip = NULL;
   1104 
   1105 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
   1106 
   1107 	/* Verify for duplicate entry */
   1108 	cdip = i_mdi_devinfo_find(vh, name, guid);
   1109 	ASSERT(cdip == NULL);
   1110 	if (cdip) {
   1111 		cmn_err(CE_WARN,
   1112 		    "i_mdi_devinfo_create: client %s@%s already exists",
   1113 			name ? name : "", guid ? guid : "");
   1114 	}
   1115 
   1116 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
   1117 	if (cdip == NULL)
   1118 		goto fail;
   1119 
   1120 	/*
   1121 	 * Create component type and Global unique identifier
   1122 	 * properties
   1123 	 */
   1124 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
   1125 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
   1126 		goto fail;
   1127 	}
   1128 
   1129 	/* Decorate the node with compatible property */
   1130 	if (compatible &&
   1131 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
   1132 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
   1133 		goto fail;
   1134 	}
   1135 
   1136 	return (cdip);
   1137 
   1138 fail:
   1139 	if (cdip) {
   1140 		(void) ndi_prop_remove_all(cdip);
   1141 		(void) ndi_devi_free(cdip);
   1142 	}
   1143 	return (NULL);
   1144 }
   1145 
   1146 /*
   1147  * i_mdi_devinfo_find():
   1148  *		Find a matching devinfo node for given client node name
   1149  *		and its guid.
   1150  * Return Values:
   1151  *		Handle to a dev_info node or NULL
   1152  */
   1153 static dev_info_t *
   1154 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
   1155 {
   1156 	char			*data;
   1157 	dev_info_t 		*cdip = NULL;
   1158 	dev_info_t 		*ndip = NULL;
   1159 	int			circular;
   1160 
   1161 	ndi_devi_enter(vh->vh_dip, &circular);
   1162 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
   1163 	while ((cdip = ndip) != NULL) {
   1164 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
   1165 
   1166 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
   1167 			continue;
   1168 		}
   1169 
   1170 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
   1171 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
   1172 		    &data) != DDI_PROP_SUCCESS) {
   1173 			continue;
   1174 		}
   1175 
   1176 		if (strcmp(data, guid) != 0) {
   1177 			ddi_prop_free(data);
   1178 			continue;
   1179 		}
   1180 		ddi_prop_free(data);
   1181 		break;
   1182 	}
   1183 	ndi_devi_exit(vh->vh_dip, circular);
   1184 	return (cdip);
   1185 }
   1186 
   1187 /*
   1188  * i_mdi_devinfo_remove():
   1189  *		Remove a client device node
   1190  */
   1191 static int
   1192 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
   1193 {
   1194 	int	rv = MDI_SUCCESS;
   1195 
   1196 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
   1197 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
   1198 		rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN | NDI_DEVI_REMOVE);
   1199 		if (rv != NDI_SUCCESS) {
   1200 			MDI_DEBUG(1, (MDI_NOTE, cdip,
   1201 			    "!failed: cdip %p", (void *)cdip));
   1202 		}
   1203 		/*
   1204 		 * Convert to MDI error code
   1205 		 */
   1206 		switch (rv) {
   1207 		case NDI_SUCCESS:
   1208 			rv = MDI_SUCCESS;
   1209 			break;
   1210 		case NDI_BUSY:
   1211 			rv = MDI_BUSY;
   1212 			break;
   1213 		default:
   1214 			rv = MDI_FAILURE;
   1215 			break;
   1216 		}
   1217 	}
   1218 	return (rv);
   1219 }
   1220 
   1221 /*
   1222  * i_devi_get_client()
   1223  *		Utility function to get mpxio component extensions
   1224  */
   1225 static mdi_client_t *
   1226 i_devi_get_client(dev_info_t *cdip)
   1227 {
   1228 	mdi_client_t	*ct = NULL;
   1229 
   1230 	if (MDI_CLIENT(cdip)) {
   1231 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
   1232 	}
   1233 	return (ct);
   1234 }
   1235 
   1236 /*
   1237  * i_mdi_is_child_present():
   1238  *		Search for the presence of client device dev_info node
   1239  */
   1240 static int
   1241 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
   1242 {
   1243 	int		rv = MDI_FAILURE;
   1244 	struct dev_info	*dip;
   1245 	int		circular;
   1246 
   1247 	ndi_devi_enter(vdip, &circular);
   1248 	dip = DEVI(vdip)->devi_child;
   1249 	while (dip) {
   1250 		if (dip == DEVI(cdip)) {
   1251 			rv = MDI_SUCCESS;
   1252 			break;
   1253 		}
   1254 		dip = dip->devi_sibling;
   1255 	}
   1256 	ndi_devi_exit(vdip, circular);
   1257 	return (rv);
   1258 }
   1259 
   1260 
   1261 /*
   1262  * i_mdi_client_lock():
   1263  *		Grab client component lock
   1264  * Return Values:
   1265  *		None
   1266  * Note:
   1267  *		The default locking order is:
   1268  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
   1269  *		But there are number of situations where locks need to be
   1270  *		grabbed in reverse order.  This routine implements try and lock
   1271  *		mechanism depending on the requested parameter option.
   1272  */
   1273 static void
   1274 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
   1275 {
   1276 	if (pip) {
   1277 		/*
   1278 		 * Reverse locking is requested.
   1279 		 */
   1280 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
   1281 			if (servicing_interrupt()) {
   1282 				MDI_PI_HOLD(pip);
   1283 				MDI_PI_UNLOCK(pip);
   1284 				MDI_CLIENT_LOCK(ct);
   1285 				MDI_PI_LOCK(pip);
   1286 				MDI_PI_RELE(pip);
   1287 				break;
   1288 			} else {
   1289 				/*
   1290 				 * tryenter failed. Try to grab again
   1291 				 * after a small delay
   1292 				 */
   1293 				MDI_PI_HOLD(pip);
   1294 				MDI_PI_UNLOCK(pip);
   1295 				delay_random(2);
   1296 				MDI_PI_LOCK(pip);
   1297 				MDI_PI_RELE(pip);
   1298 			}
   1299 		}
   1300 	} else {
   1301 		MDI_CLIENT_LOCK(ct);
   1302 	}
   1303 }
   1304 
   1305 /*
   1306  * i_mdi_client_unlock():
   1307  *		Unlock a client component
   1308  */
   1309 static void
   1310 i_mdi_client_unlock(mdi_client_t *ct)
   1311 {
   1312 	MDI_CLIENT_UNLOCK(ct);
   1313 }
   1314 
   1315 /*
   1316  * i_mdi_client_alloc():
   1317  * 		Allocate and initialize a client structure.  Caller should
   1318  *		hold the vhci client lock.
   1319  * Return Values:
   1320  *		Handle to a client component
   1321  */
   1322 /*ARGSUSED*/
   1323 static mdi_client_t *
   1324 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
   1325 {
   1326 	mdi_client_t	*ct;
   1327 
   1328 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
   1329 
   1330 	/*
   1331 	 * Allocate and initialize a component structure.
   1332 	 */
   1333 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
   1334 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
   1335 	ct->ct_hnext = NULL;
   1336 	ct->ct_hprev = NULL;
   1337 	ct->ct_dip = NULL;
   1338 	ct->ct_vhci = vh;
   1339 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
   1340 	(void) strcpy(ct->ct_drvname, name);
   1341 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
   1342 	(void) strcpy(ct->ct_guid, lguid);
   1343 	ct->ct_cprivate = NULL;
   1344 	ct->ct_vprivate = NULL;
   1345 	ct->ct_flags = 0;
   1346 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
   1347 	MDI_CLIENT_LOCK(ct);
   1348 	MDI_CLIENT_SET_OFFLINE(ct);
   1349 	MDI_CLIENT_SET_DETACH(ct);
   1350 	MDI_CLIENT_SET_POWER_UP(ct);
   1351 	MDI_CLIENT_UNLOCK(ct);
   1352 	ct->ct_failover_flags = 0;
   1353 	ct->ct_failover_status = 0;
   1354 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
   1355 	ct->ct_unstable = 0;
   1356 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
   1357 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
   1358 	ct->ct_lb = vh->vh_lb;
   1359 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
   1360 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
   1361 	ct->ct_path_count = 0;
   1362 	ct->ct_path_head = NULL;
   1363 	ct->ct_path_tail = NULL;
   1364 	ct->ct_path_last = NULL;
   1365 
   1366 	/*
   1367 	 * Add this client component to our client hash queue
   1368 	 */
   1369 	i_mdi_client_enlist_table(vh, ct);
   1370 	return (ct);
   1371 }
   1372 
   1373 /*
   1374  * i_mdi_client_enlist_table():
   1375  *		Attach the client device to the client hash table. Caller
   1376  *		should hold the vhci client lock.
   1377  */
   1378 static void
   1379 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
   1380 {
   1381 	int 			index;
   1382 	struct client_hash	*head;
   1383 
   1384 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
   1385 
   1386 	index = i_mdi_get_hash_key(ct->ct_guid);
   1387 	head = &vh->vh_client_table[index];
   1388 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
   1389 	head->ct_hash_head = ct;
   1390 	head->ct_hash_count++;
   1391 	vh->vh_client_count++;
   1392 }
   1393 
   1394 /*
   1395  * i_mdi_client_delist_table():
   1396  *		Attach the client device to the client hash table.
   1397  *		Caller should hold the vhci client lock.
   1398  */
   1399 static void
   1400 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
   1401 {
   1402 	int			index;
   1403 	char			*guid;
   1404 	struct client_hash 	*head;
   1405 	mdi_client_t		*next;
   1406 	mdi_client_t		*last;
   1407 
   1408 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
   1409 
   1410 	guid = ct->ct_guid;
   1411 	index = i_mdi_get_hash_key(guid);
   1412 	head = &vh->vh_client_table[index];
   1413 
   1414 	last = NULL;
   1415 	next = (mdi_client_t *)head->ct_hash_head;
   1416 	while (next != NULL) {
   1417 		if (next == ct) {
   1418 			break;
   1419 		}
   1420 		last = next;
   1421 		next = next->ct_hnext;
   1422 	}
   1423 
   1424 	if (next) {
   1425 		head->ct_hash_count--;
   1426 		if (last == NULL) {
   1427 			head->ct_hash_head = ct->ct_hnext;
   1428 		} else {
   1429 			last->ct_hnext = ct->ct_hnext;
   1430 		}
   1431 		ct->ct_hnext = NULL;
   1432 		vh->vh_client_count--;
   1433 	}
   1434 }
   1435 
   1436 
   1437 /*
   1438  * i_mdi_client_free():
   1439  *		Free a client component
   1440  */
   1441 static int
   1442 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
   1443 {
   1444 	int		rv = MDI_SUCCESS;
   1445 	int		flags = ct->ct_flags;
   1446 	dev_info_t	*cdip;
   1447 	dev_info_t	*vdip;
   1448 
   1449 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
   1450 
   1451 	vdip = vh->vh_dip;
   1452 	cdip = ct->ct_dip;
   1453 
   1454 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
   1455 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
   1456 	DEVI(cdip)->devi_mdi_client = NULL;
   1457 
   1458 	/*
   1459 	 * Clear out back ref. to dev_info_t node
   1460 	 */
   1461 	ct->ct_dip = NULL;
   1462 
   1463 	/*
   1464 	 * Remove this client from our hash queue
   1465 	 */
   1466 	i_mdi_client_delist_table(vh, ct);
   1467 
   1468 	/*
   1469 	 * Uninitialize and free the component
   1470 	 */
   1471 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
   1472 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
   1473 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
   1474 	cv_destroy(&ct->ct_failover_cv);
   1475 	cv_destroy(&ct->ct_unstable_cv);
   1476 	cv_destroy(&ct->ct_powerchange_cv);
   1477 	mutex_destroy(&ct->ct_mutex);
   1478 	kmem_free(ct, sizeof (*ct));
   1479 
   1480 	if (cdip != NULL) {
   1481 		MDI_VHCI_CLIENT_UNLOCK(vh);
   1482 		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
   1483 		MDI_VHCI_CLIENT_LOCK(vh);
   1484 	}
   1485 	return (rv);
   1486 }
   1487 
   1488 /*
   1489  * i_mdi_client_find():
   1490  * 		Find the client structure corresponding to a given guid
   1491  *		Caller should hold the vhci client lock.
   1492  */
   1493 static mdi_client_t *
   1494 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
   1495 {
   1496 	int			index;
   1497 	struct client_hash	*head;
   1498 	mdi_client_t		*ct;
   1499 
   1500 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
   1501 
   1502 	index = i_mdi_get_hash_key(guid);
   1503 	head = &vh->vh_client_table[index];
   1504 
   1505 	ct = head->ct_hash_head;
   1506 	while (ct != NULL) {
   1507 		if (strcmp(ct->ct_guid, guid) == 0 &&
   1508 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
   1509 			break;
   1510 		}
   1511 		ct = ct->ct_hnext;
   1512 	}
   1513 	return (ct);
   1514 }
   1515 
   1516 /*
   1517  * i_mdi_client_update_state():
   1518  *		Compute and update client device state
   1519  * Notes:
   1520  *		A client device can be in any of three possible states:
   1521  *
   1522  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
   1523  *		one online/standby paths. Can tolerate failures.
   1524  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
   1525  *		no alternate paths available as standby. A failure on the online
   1526  *		would result in loss of access to device data.
   1527  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
   1528  *		no paths available to access the device.
   1529  */
   1530 static void
   1531 i_mdi_client_update_state(mdi_client_t *ct)
   1532 {
   1533 	int state;
   1534 
   1535 	ASSERT(MDI_CLIENT_LOCKED(ct));
   1536 	state = i_mdi_client_compute_state(ct, NULL);
   1537 	MDI_CLIENT_SET_STATE(ct, state);
   1538 }
   1539 
   1540 /*
   1541  * i_mdi_client_compute_state():
   1542  *		Compute client device state
   1543  *
   1544  *		mdi_phci_t *	Pointer to pHCI structure which should
   1545  *				while computing the new value.  Used by
   1546  *				i_mdi_phci_offline() to find the new
   1547  *				client state after DR of a pHCI.
   1548  */
   1549 static int
   1550 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
   1551 {
   1552 	int		state;
   1553 	int		online_count = 0;
   1554 	int		standby_count = 0;
   1555 	mdi_pathinfo_t	*pip, *next;
   1556 
   1557 	ASSERT(MDI_CLIENT_LOCKED(ct));
   1558 	pip = ct->ct_path_head;
   1559 	while (pip != NULL) {
   1560 		MDI_PI_LOCK(pip);
   1561 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
   1562 		if (MDI_PI(pip)->pi_phci == ph) {
   1563 			MDI_PI_UNLOCK(pip);
   1564 			pip = next;
   1565 			continue;
   1566 		}
   1567 
   1568 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
   1569 				== MDI_PATHINFO_STATE_ONLINE)
   1570 			online_count++;
   1571 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
   1572 				== MDI_PATHINFO_STATE_STANDBY)
   1573 			standby_count++;
   1574 		MDI_PI_UNLOCK(pip);
   1575 		pip = next;
   1576 	}
   1577 
   1578 	if (online_count == 0) {
   1579 		if (standby_count == 0) {
   1580 			state = MDI_CLIENT_STATE_FAILED;
   1581 			MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
   1582 			    "client state failed: ct = %p", (void *)ct));
   1583 		} else if (standby_count == 1) {
   1584 			state = MDI_CLIENT_STATE_DEGRADED;
   1585 		} else {
   1586 			state = MDI_CLIENT_STATE_OPTIMAL;
   1587 		}
   1588 	} else if (online_count == 1) {
   1589 		if (standby_count == 0) {
   1590 			state = MDI_CLIENT_STATE_DEGRADED;
   1591 		} else {
   1592 			state = MDI_CLIENT_STATE_OPTIMAL;
   1593 		}
   1594 	} else {
   1595 		state = MDI_CLIENT_STATE_OPTIMAL;
   1596 	}
   1597 	return (state);
   1598 }
   1599 
   1600 /*
   1601  * i_mdi_client2devinfo():
   1602  *		Utility function
   1603  */
   1604 dev_info_t *
   1605 i_mdi_client2devinfo(mdi_client_t *ct)
   1606 {
   1607 	return (ct->ct_dip);
   1608 }
   1609 
   1610 /*
   1611  * mdi_client_path2_devinfo():
   1612  * 		Given the parent devinfo and child devfs pathname, search for
   1613  *		a valid devfs node handle.
   1614  */
   1615 dev_info_t *
   1616 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
   1617 {
   1618 	dev_info_t 	*cdip = NULL;
   1619 	dev_info_t 	*ndip = NULL;
   1620 	char		*temp_pathname;
   1621 	int		circular;
   1622 
   1623 	/*
   1624 	 * Allocate temp buffer
   1625 	 */
   1626 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
   1627 
   1628 	/*
   1629 	 * Lock parent against changes
   1630 	 */
   1631 	ndi_devi_enter(vdip, &circular);
   1632 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
   1633 	while ((cdip = ndip) != NULL) {
   1634 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
   1635 
   1636 		*temp_pathname = '\0';
   1637 		(void) ddi_pathname(cdip, temp_pathname);
   1638 		if (strcmp(temp_pathname, pathname) == 0) {
   1639 			break;
   1640 		}
   1641 	}
   1642 	/*
   1643 	 * Release devinfo lock
   1644 	 */
   1645 	ndi_devi_exit(vdip, circular);
   1646 
   1647 	/*
   1648 	 * Free the temp buffer
   1649 	 */
   1650 	kmem_free(temp_pathname, MAXPATHLEN);
   1651 	return (cdip);
   1652 }
   1653 
   1654 /*
   1655  * mdi_client_get_path_count():
   1656  * 		Utility function to get number of path information nodes
   1657  *		associated with a given client device.
   1658  */
   1659 int
   1660 mdi_client_get_path_count(dev_info_t *cdip)
   1661 {
   1662 	mdi_client_t	*ct;
   1663 	int		count = 0;
   1664 
   1665 	ct = i_devi_get_client(cdip);
   1666 	if (ct != NULL) {
   1667 		count = ct->ct_path_count;
   1668 	}
   1669 	return (count);
   1670 }
   1671 
   1672 
   1673 /*
   1674  * i_mdi_get_hash_key():
   1675  * 		Create a hash using strings as keys
   1676  *
   1677  */
   1678 static int
   1679 i_mdi_get_hash_key(char *str)
   1680 {
   1681 	uint32_t	g, hash = 0;
   1682 	char		*p;
   1683 
   1684 	for (p = str; *p != '\0'; p++) {
   1685 		g = *p;
   1686 		hash += g;
   1687 	}
   1688 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
   1689 }
   1690 
   1691 /*
   1692  * mdi_get_lb_policy():
   1693  * 		Get current load balancing policy for a given client device
   1694  */
   1695 client_lb_t
   1696 mdi_get_lb_policy(dev_info_t *cdip)
   1697 {
   1698 	client_lb_t	lb = LOAD_BALANCE_NONE;
   1699 	mdi_client_t	*ct;
   1700 
   1701 	ct = i_devi_get_client(cdip);
   1702 	if (ct != NULL) {
   1703 		lb = ct->ct_lb;
   1704 	}
   1705 	return (lb);
   1706 }
   1707 
   1708 /*
   1709  * mdi_set_lb_region_size():
   1710  * 		Set current region size for the load-balance
   1711  */
   1712 int
   1713 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
   1714 {
   1715 	mdi_client_t	*ct;
   1716 	int		rv = MDI_FAILURE;
   1717 
   1718 	ct = i_devi_get_client(cdip);
   1719 	if (ct != NULL && ct->ct_lb_args != NULL) {
   1720 		ct->ct_lb_args->region_size = region_size;
   1721 		rv = MDI_SUCCESS;
   1722 	}
   1723 	return (rv);
   1724 }
   1725 
   1726 /*
   1727  * mdi_Set_lb_policy():
   1728  * 		Set current load balancing policy for a given client device
   1729  */
   1730 int
   1731 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
   1732 {
   1733 	mdi_client_t	*ct;
   1734 	int		rv = MDI_FAILURE;
   1735 
   1736 	ct = i_devi_get_client(cdip);
   1737 	if (ct != NULL) {
   1738 		ct->ct_lb = lb;
   1739 		rv = MDI_SUCCESS;
   1740 	}
   1741 	return (rv);
   1742 }
   1743 
   1744 /*
   1745  * mdi_failover():
   1746  *		failover function called by the vHCI drivers to initiate
   1747  *		a failover operation.  This is typically due to non-availability
   1748  *		of online paths to route I/O requests.  Failover can be
   1749  *		triggered through user application also.
   1750  *
   1751  *		The vHCI driver calls mdi_failover() to initiate a failover
   1752  *		operation. mdi_failover() calls back into the vHCI driver's
   1753  *		vo_failover() entry point to perform the actual failover
   1754  *		operation.  The reason for requiring the vHCI driver to
   1755  *		initiate failover by calling mdi_failover(), instead of directly
   1756  *		executing vo_failover() itself, is to ensure that the mdi
   1757  *		framework can keep track of the client state properly.
   1758  *		Additionally, mdi_failover() provides as a convenience the
   1759  *		option of performing the failover operation synchronously or
   1760  *		asynchronously
   1761  *
   1762  *		Upon successful completion of the failover operation, the
   1763  *		paths that were previously ONLINE will be in the STANDBY state,
   1764  *		and the newly activated paths will be in the ONLINE state.
   1765  *
   1766  *		The flags modifier determines whether the activation is done
   1767  *		synchronously: MDI_FAILOVER_SYNC
   1768  * Return Values:
   1769  *		MDI_SUCCESS
   1770  *		MDI_FAILURE
   1771  *		MDI_BUSY
   1772  */
   1773 /*ARGSUSED*/
   1774 int
   1775 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
   1776 {
   1777 	int			rv;
   1778 	mdi_client_t		*ct;
   1779 
   1780 	ct = i_devi_get_client(cdip);
   1781 	ASSERT(ct != NULL);
   1782 	if (ct == NULL) {
   1783 		/* cdip is not a valid client device. Nothing more to do. */
   1784 		return (MDI_FAILURE);
   1785 	}
   1786 
   1787 	MDI_CLIENT_LOCK(ct);
   1788 
   1789 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
   1790 		/* A path to the client is being freed */
   1791 		MDI_CLIENT_UNLOCK(ct);
   1792 		return (MDI_BUSY);
   1793 	}
   1794 
   1795 
   1796 	if (MDI_CLIENT_IS_FAILED(ct)) {
   1797 		/*
   1798 		 * Client is in failed state. Nothing more to do.
   1799 		 */
   1800 		MDI_CLIENT_UNLOCK(ct);
   1801 		return (MDI_FAILURE);
   1802 	}
   1803 
   1804 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
   1805 		/*
   1806 		 * Failover is already in progress; return BUSY
   1807 		 */
   1808 		MDI_CLIENT_UNLOCK(ct);
   1809 		return (MDI_BUSY);
   1810 	}
   1811 	/*
   1812 	 * Make sure that mdi_pathinfo node state changes are processed.
   1813 	 * We do not allow failovers to progress while client path state
   1814 	 * changes are in progress
   1815 	 */
   1816 	if (ct->ct_unstable) {
   1817 		if (flags == MDI_FAILOVER_ASYNC) {
   1818 			MDI_CLIENT_UNLOCK(ct);
   1819 			return (MDI_BUSY);
   1820 		} else {
   1821 			while (ct->ct_unstable)
   1822 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
   1823 		}
   1824 	}
   1825 
   1826 	/*
   1827 	 * Client device is in stable state. Before proceeding, perform sanity
   1828 	 * checks again.
   1829 	 */
   1830 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
   1831 	    (!i_ddi_devi_attached(ct->ct_dip))) {
   1832 		/*
   1833 		 * Client is in failed state. Nothing more to do.
   1834 		 */
   1835 		MDI_CLIENT_UNLOCK(ct);
   1836 		return (MDI_FAILURE);
   1837 	}
   1838 
   1839 	/*
   1840 	 * Set the client state as failover in progress.
   1841 	 */
   1842 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
   1843 	ct->ct_failover_flags = flags;
   1844 	MDI_CLIENT_UNLOCK(ct);
   1845 
   1846 	if (flags == MDI_FAILOVER_ASYNC) {
   1847 		/*
   1848 		 * Submit the initiate failover request via CPR safe
   1849 		 * taskq threads.
   1850 		 */
   1851 		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
   1852 		    ct, KM_SLEEP);
   1853 		return (MDI_ACCEPT);
   1854 	} else {
   1855 		/*
   1856 		 * Synchronous failover mode.  Typically invoked from the user
   1857 		 * land.
   1858 		 */
   1859 		rv = i_mdi_failover(ct);
   1860 	}
   1861 	return (rv);
   1862 }
   1863 
   1864 /*
   1865  * i_mdi_failover():
   1866  *		internal failover function. Invokes vHCI drivers failover
   1867  *		callback function and process the failover status
   1868  * Return Values:
   1869  *		None
   1870  *
   1871  * Note: A client device in failover state can not be detached or freed.
   1872  */
   1873 static int
   1874 i_mdi_failover(void *arg)
   1875 {
   1876 	int		rv = MDI_SUCCESS;
   1877 	mdi_client_t	*ct = (mdi_client_t *)arg;
   1878 	mdi_vhci_t	*vh = ct->ct_vhci;
   1879 
   1880 	ASSERT(!MDI_CLIENT_LOCKED(ct));
   1881 
   1882 	if (vh->vh_ops->vo_failover != NULL) {
   1883 		/*
   1884 		 * Call vHCI drivers callback routine
   1885 		 */
   1886 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
   1887 		    ct->ct_failover_flags);
   1888 	}
   1889 
   1890 	MDI_CLIENT_LOCK(ct);
   1891 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
   1892 
   1893 	/*
   1894 	 * Save the failover return status
   1895 	 */
   1896 	ct->ct_failover_status = rv;
   1897 
   1898 	/*
   1899 	 * As a result of failover, client status would have been changed.
   1900 	 * Update the client state and wake up anyone waiting on this client
   1901 	 * device.
   1902 	 */
   1903 	i_mdi_client_update_state(ct);
   1904 
   1905 	cv_broadcast(&ct->ct_failover_cv);
   1906 	MDI_CLIENT_UNLOCK(ct);
   1907 	return (rv);
   1908 }
   1909 
   1910 /*
   1911  * i_mdi_lba_lb():
   1912  *		Logical-block load balancing.  I/Os whose block number falls in
   1913  *		the same region (b_blkno >> region_size) are sent down the same
   1914  *		path, which improves cache hits on some of the RAID devices.
   1915  *		Only ONLINE paths are considered (at some point we may want to
   1916  *		balance across target ports): preferred ones if any exist,
   1917  *		otherwise non-preferred ones.  The region number, modulo the
         *		number of such eligible paths, picks the Nth eligible path on
         *		the client's path list.
         * Return Values:
         *		MDI_SUCCESS	- *ret_pip is the chosen path, returned held
         *		MDI_FAILURE	- no path selected; mdi_select_path() then
         *				  defaults to round-robin
   1918  */
   1919 static int
   1920 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
   1921 {
   1922 	int		path_index = -1;
   1923 	int		online_path_count = 0;
   1924 	int		online_nonpref_path_count = 0;
   1925 	int		region_size = ct->ct_lb_args->region_size;
   1926 	mdi_pathinfo_t	*pip;
   1927 	mdi_pathinfo_t	*next;
   1928 	int		preferred = 0;	/* only meaningful if path_cnt != 0 */
   1929 	int		path_cnt;
   1930 
        	/* Pass 1: count the ONLINE paths of each preference class. */
   1931 	pip = ct->ct_path_head;
   1932 	while (pip) {
   1933 		MDI_PI_LOCK(pip);
   1934 		if (MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_ONLINE) {
   1935 			if (MDI_PI(pip)->pi_preferred)
   1936 				online_path_count++;
   1937 			else
   1938 				online_nonpref_path_count++;
   1939 		}
   1940 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
   1941 		MDI_PI_UNLOCK(pip);
   1942 		pip = next;
   1943 	}
   1944 	/* if found any online/preferred then use this type */
   1945 	if (online_path_count > 0) {
   1946 		path_cnt = online_path_count;
   1947 		preferred = 1;
   1948 	} else if (online_nonpref_path_count > 0) {
   1949 		path_cnt = online_nonpref_path_count;
   1950 		preferred = 0;
   1951 	} else {
   1952 		path_cnt = 0;
   1953 	}
   1954 	if (path_cnt) {
   1955 		path_index = (bp->b_blkno >> region_size) % path_cnt;
   1956 		pip = ct->ct_path_head;
        		/*
        		 * Pass 2: walk to the path_index'th eligible path.  Only
        		 * eligible paths consume the index, so paths in another state
        		 * or preference class interleaved on the list cannot make the
        		 * lookup miss.  A negative index (negative b_blkno) selects
        		 * nothing.
        		 */
   1957 		while (pip && path_index >= 0) {
   1958 			MDI_PI_LOCK(pip);
   1959 			if ((MDI_PI(pip)->pi_state ==
   1960 			    MDI_PATHINFO_STATE_ONLINE) &&
   1961 			    MDI_PI(pip)->pi_preferred == preferred) {
   1962 				if (path_index == 0) {
   1963 					MDI_PI_HOLD(pip);
   1964 					MDI_PI_UNLOCK(pip);
   1965 					*ret_pip = pip;
   1966 					return (MDI_SUCCESS);
   1967 				}
   1968 				path_index--;
   1969 			}
   1970 			next = (mdi_pathinfo_t *)
   1971 			    MDI_PI(pip)->pi_client_link;
   1972 			MDI_PI_UNLOCK(pip);
   1973 			pip = next;
   1974 		}
   1975 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
   1976 		    "lba %llx: path %s %p",
   1977 		    bp->b_lblkno, mdi_pi_spathname(pip), (void *)pip));
   1978 	}
   1979 	return (MDI_FAILURE);
   1980 }
   1981 
   1982 /*
   1983  * mdi_select_path():
   1984  *		select a path to access a client device.
   1985  *
   1986  *		mdi_select_path() function is called by the vHCI drivers to
   1987  *		select a path to route the I/O request to.  The caller passes
   1988  *		the block I/O data transfer structure ("buf") as one of the
   1989  *		parameters.  The mpxio framework uses the buf structure
   1990  *		contents to maintain per path statistics (total I/O size /
   1991  *		count pending).  If more than one online paths are available to
   1992  *		select, the framework automatically selects a suitable path
   1993  *		for routing I/O request. If a failover operation is active for
   1994  *		this client device the call shall be failed with MDI_BUSY error
   1995  *		code.
         *		NOTE(review): within this function "bp" is only consulted by
         *		the LOAD_BALANCE_LBA policy.
   1996  *
   1997  *		By default this function returns a suitable path in online
   1998  *		state based on the current load balancing policy.  Currently
   1999  *		we support LOAD_BALANCE_NONE (Previously selected online path
   2000  *		will continue to be used till the path is usable) and
   2001  *		LOAD_BALANCE_RR (Online paths will be selected in a round
   2002  *		robin fashion), LOAD_BALANCE_LBA (Online paths will be selected
   2003  *		based on the logical block).  The load balancing policy is set
   2004  *		through vHCI drivers configuration file (driver.conf).
   2005  *
   2006  *		vHCI drivers may override this default behavior by specifying
   2007  *		appropriate flags.  The meaning of the "arg" argument depends
   2008  *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
   2009  *		then the argument is the "path instance" of the path to select.
   2010  *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
   2011  *		"start_pip". A non NULL "start_pip" is the starting point to
   2012  *		walk and find the next appropriate path.  The following values
   2013  *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
   2014  *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
   2015  *		STANDBY path).
   2016  *
   2017  *		The non-standard behavior is used by the scsi_vhci driver,
   2018  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
   2019  *		attach of client devices (to avoid an unnecessary failover
   2020  *		when the STANDBY path comes up first), during failover
   2021  *		(to activate a STANDBY path as ONLINE).
   2022  *
   2023  *		The selected path is returned in a mdi_hold_path() state
   2024  *		(pi_ref_cnt). Caller should release the hold by calling
   2025  *		mdi_rele_path().
   2026  *
   2027  * Return Values:
   2028  *		MDI_SUCCESS	- Completed successfully
   2029  *		MDI_BUSY 	- Client device is busy failing over, or a
         *				  candidate path (or the requested path
         *				  instance) is in a transient state
   2030  *		MDI_NOPATH	- Client device is online, but no valid path are
   2031  *				  available to access this client device
   2032  *		MDI_FAILURE	- Invalid client device or state
         *				  (also: the requested path instance is in INIT
         *				  state or its pHCI has detached)
   2033  *		MDI_DEVI_ONLINING
   2034  *				- Client device (struct dev_info state) is in
   2035  *				  onlining state.
   2036  */
   2037 
   2038 /*ARGSUSED*/
   2039 int
   2040 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
   2041     void *arg, mdi_pathinfo_t **ret_pip)
   2042 {
   2043 	mdi_client_t	*ct;
   2044 	mdi_pathinfo_t	*pip;
   2045 	mdi_pathinfo_t	*next;
   2046 	mdi_pathinfo_t	*head;
   2047 	mdi_pathinfo_t	*start;
   2048 	client_lb_t	lbp;	/* load balancing policy */
   2049 	int		sb = 1;	/* standard behavior */
   2050 	int		preferred = 1;	/* preferred path */
   2051 	int		cond, cont = 1;
   2052 	int		retry = 0;
   2053 	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
   2054 	int		path_instance;	/* request specific path instance */
   2055 
   2056 	/* determine type of arg based on flags */
   2057 	if (flags & MDI_SELECT_PATH_INSTANCE) {
   2058 		path_instance = (int)(intptr_t)arg;
   2059 		start_pip = NULL;
   2060 	} else {
   2061 		path_instance = 0;
   2062 		start_pip = (mdi_pathinfo_t *)arg;
   2063 	}
   2064 
   2065 	if (flags != 0) {
   2066 		/*
   2067 		 * disable default behavior: any flag, including
        		 * MDI_SELECT_PATH_INSTANCE, skips the client state checks below
   2068 		 */
   2069 		sb = 0;
   2070 	}
   2071 
   2072 	*ret_pip = NULL;
   2073 	ct = i_devi_get_client(cdip);
   2074 	if (ct == NULL) {
   2075 		/* mdi extensions are NULL, Nothing more to do */
   2076 		return (MDI_FAILURE);
   2077 	}
   2078 
        	/* The client lock is held from here until every return below. */
   2079 	MDI_CLIENT_LOCK(ct);
   2080 
   2081 	if (sb) {
   2082 		if (MDI_CLIENT_IS_FAILED(ct)) {
   2083 			/*
   2084 			 * Client is not ready to accept any I/O requests.
   2085 			 * Fail this request.
   2086 			 */
   2087 			MDI_DEBUG(2, (MDI_NOTE, cdip,
   2088 			    "client state offline ct = %p", (void *)ct));
   2089 			MDI_CLIENT_UNLOCK(ct);
   2090 			return (MDI_FAILURE);
   2091 		}
   2092 
   2093 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
   2094 			/*
   2095 			 * Check for Failover is in progress. If so tell the
   2096 			 * caller that this device is busy.
   2097 			 */
   2098 			MDI_DEBUG(2, (MDI_NOTE, cdip,
   2099 			    "client failover in progress ct = %p",
   2100 			    (void *)ct));
   2101 			MDI_CLIENT_UNLOCK(ct);
   2102 			return (MDI_BUSY);
   2103 		}
   2104 
   2105 		/*
   2106 		 * Check to see whether the client device is attached.
   2107 		 * If not so, let the vHCI driver manually select a path
   2108 		 * (standby) and let the probe/attach process to continue.
   2109 		 */
   2110 		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
   2111 			MDI_DEBUG(4, (MDI_NOTE, cdip,
   2112 			    "devi is onlining ct = %p", (void *)ct));
   2113 			MDI_CLIENT_UNLOCK(ct);
   2114 			return (MDI_DEVI_ONLINING);
   2115 		}
   2116 	}
   2117 
   2118 	/*
   2119 	 * Cache in the client list head.  If head of the list is NULL
   2120 	 * return MDI_NOPATH
   2121 	 */
   2122 	head = ct->ct_path_head;
   2123 	if (head == NULL) {
   2124 		MDI_CLIENT_UNLOCK(ct);
   2125 		return (MDI_NOPATH);
   2126 	}
   2127 
   2128 	/* Caller is specifying a specific pathinfo path by path_instance */
   2129 	if (path_instance) {
   2130 		/* search for pathinfo with correct path_instance */
   2131 		for (pip = head;
   2132 		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
   2133 		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
   2134 			;
   2135 
   2136 		/* If path can't be selected then MDI_NOPATH is returned. */
   2137 		if (pip == NULL) {
   2138 			MDI_CLIENT_UNLOCK(ct);
   2139 			return (MDI_NOPATH);
   2140 		}
   2141 
   2142 		/*
   2143 		 * Verify state of path. When asked to select a specific
   2144 		 * path_instance, we select the requested path in any
   2145 		 * state (ONLINE, OFFLINE, STANDBY, FAULT) other than INIT.
   2146 		 * We don't however select paths where the pHCI has detached.
   2147 		 * NOTE: last pathinfo node of an opened client device may
   2148 		 * exist in an OFFLINE state after the pHCI associated with
   2149 		 * that path has detached (but pi_phci will be NULL if that
   2150 		 * has occurred).
   2151 		 */
   2152 		MDI_PI_LOCK(pip);
   2153 		if ((MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_INIT) ||
   2154 		    (MDI_PI(pip)->pi_phci == NULL)) {
   2155 			MDI_PI_UNLOCK(pip);
   2156 			MDI_CLIENT_UNLOCK(ct);
   2157 			return (MDI_FAILURE);
   2158 		}
   2159 
   2160 		/* Return MDI_BUSY if we have a transient condition */
   2161 		if (MDI_PI_IS_TRANSIENT(pip)) {
   2162 			MDI_PI_UNLOCK(pip);
   2163 			MDI_CLIENT_UNLOCK(ct);
   2164 			return (MDI_BUSY);
   2165 		}
   2166 
   2167 		/*
   2168 		 * Return the path in hold state. Caller should release the
   2169 		 * hold by calling mdi_rele_path()
   2170 		 */
   2171 		MDI_PI_HOLD(pip);
   2172 		MDI_PI_UNLOCK(pip);
   2173 		*ret_pip = pip;
   2174 		MDI_CLIENT_UNLOCK(ct);
   2175 		return (MDI_SUCCESS);
   2176 	}
   2177 
   2178 	/*
   2179 	 * for non default behavior, bypass current
   2180 	 * load balancing policy and always use LOAD_BALANCE_RR
   2181 	 * except that the start point will be adjusted based
   2182 	 * on the provided start_pip
   2183 	 */
   2184 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
   2185 
        	/*
        	 * NOTE(review): the switch has no default case.  A policy value
        	 * other than the three handled here skips path selection and ends
        	 * up returning MDI_NOPATH below.
        	 */
   2186 	switch (lbp) {
   2187 	case LOAD_BALANCE_NONE:
   2188 		/*
   2189 		 * Load balancing is None  or Alternate path mode
   2190 		 * Start looking for a online mdi_pathinfo node starting from
   2191 		 * last known selected path
        		 * The list is walked circularly, first for preferred paths
        		 * and then, once back at the start, for non-preferred ones.
   2192 		 */
   2193 		preferred = 1;
   2194 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
   2195 		if (pip == NULL) {
   2196 			pip = head;
   2197 		}
   2198 		start = pip;
   2199 		do {
   2200 			MDI_PI_LOCK(pip);
   2201 			/*
   2202 			 * No need to explicitly check if the path is disabled.
   2203 			 * Since we are checking for state == ONLINE and the
   2204 			 * same variable is used for DISABLE/ENABLE information.
   2205 			 */
   2206 			if ((MDI_PI(pip)->pi_state  ==
   2207 				MDI_PATHINFO_STATE_ONLINE) &&
   2208 				preferred == MDI_PI(pip)->pi_preferred) {
   2209 				/*
   2210 				 * Return the path in hold state. Caller should
   2211 				 * release the hold by calling mdi_rele_path()
   2212 				 */
   2213 				MDI_PI_HOLD(pip);
   2214 				MDI_PI_UNLOCK(pip);
   2215 				ct->ct_path_last = pip;
   2216 				*ret_pip = pip;
   2217 				MDI_CLIENT_UNLOCK(ct);
   2218 				return (MDI_SUCCESS);
   2219 			}
   2220 
   2221 			/*
   2222 			 * Path is busy.
        			 * Remember that a transient path was seen so that
        			 * MDI_BUSY is returned if nothing else is selected.
   2223 			 */
   2224 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
   2225 			    MDI_PI_IS_TRANSIENT(pip))
   2226 				retry = 1;
   2227 			/*
   2228 			 * Keep looking for a next available online path
   2229 			 */
   2230 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
   2231 			if (next == NULL) {
   2232 				next = head;
   2233 			}
   2234 			MDI_PI_UNLOCK(pip);
   2235 			pip = next;
        			/*
        			 * Back at the start: switch to the non-preferred pass,
        			 * or stop if that pass has completed as well.
        			 */
   2236 			if (start == pip && preferred) {
   2237 				preferred = 0;
   2238 			} else if (start == pip && !preferred) {
   2239 				cont = 0;
   2240 			}
   2241 		} while (cont);
   2242 		break;
   2243 
   2244 	case LOAD_BALANCE_LBA:
   2245 		/*
   2246 		 * Make sure we are looking
   2247 		 * for an online path. Otherwise, if it is for a STANDBY
   2248 		 * path request, it will go through and fetch an ONLINE
   2249 		 * path which is not desirable.
        		 * i_mdi_lba_lb() is called with the client lock held and
        		 * returns the path already held on success.  If it is not
        		 * tried or fails, fall through to round-robin.
   2250 		 */
   2251 		if ((ct->ct_lb_args != NULL) &&
   2252 			    (ct->ct_lb_args->region_size) && bp &&
   2253 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
   2254 			if (i_mdi_lba_lb(ct, ret_pip, bp)
   2255 				    == MDI_SUCCESS) {
   2256 				MDI_CLIENT_UNLOCK(ct);
   2257 				return (MDI_SUCCESS);
   2258 			}
   2259 		}
   2260 		/* FALLTHROUGH */
   2261 	case LOAD_BALANCE_RR:
   2262 		/*
   2263 		 * Load balancing is Round Robin. Start looking for a online
   2264 		 * mdi_pathinfo node starting from last known selected path
   2265 		 * as the start point.  If override flags are specified,
   2266 		 * process accordingly.
   2267 		 * If the search is already in effect(start_pip not null),
   2268 		 * then lets just use the same path preference to continue the
   2269 		 * traversal.
   2270 		 */
   2271 
        		/*
        		 * NOTE(review): pi_preferred of start_pip is read here without
        		 * taking MDI_PI_LOCK(start_pip).
        		 */
   2272 		if (start_pip != NULL) {
   2273 			preferred = MDI_PI(start_pip)->pi_preferred;
   2274 		} else {
   2275 			preferred = 1;
   2276 		}
   2277 
        		/*
        		 * The search begins at the node after the cursor: the last
        		 * selected path for standard behavior, start_pip otherwise.
        		 */
   2278 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
   2279 		if (start == NULL) {
   2280 			pip = head;
   2281 		} else {
   2282 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
   2283 			if (pip == NULL) {
   2284 				if ( flags & MDI_SELECT_NO_PREFERRED) {
   2285 					/*
   2286 					 * Return since we hit the end of list
   2287 					 */
   2288 					MDI_CLIENT_UNLOCK(ct);
   2289 					return (MDI_NOPATH);
   2290 				}
   2291 
   2292 				if (!sb) {
   2293 					if (preferred == 0) {
   2294 						/*
   2295 						 * Looks like we have completed
   2296 						 * the traversal as preferred
   2297 						 * value is 0. Time to bail out.
   2298 						 */
   2299 						*ret_pip = NULL;
   2300 						MDI_CLIENT_UNLOCK(ct);
   2301 						return (MDI_NOPATH);
   2302 					} else {
   2303 						/*
   2304 						 * Looks like we reached the
   2305 						 * end of the list. Lets enable
   2306 						 * traversal of non preferred
   2307 						 * paths.
   2308 						 */
   2309 						preferred = 0;
   2310 					}
   2311 				}
   2312 				pip = head;
   2313 			}
   2314 		}
   2315 		start = pip;
   2316 		do {
   2317 			MDI_PI_LOCK(pip);
        			/*
        			 * cond is set when this path matches the request.  The
        			 * flag tests below compare "flags" for exact equality,
        			 * so any other flag combination never matches.
        			 */
   2318 			if (sb) {
   2319 				cond = ((MDI_PI(pip)->pi_state ==
   2320 				    MDI_PATHINFO_STATE_ONLINE &&
   2321 					MDI_PI(pip)->pi_preferred ==
   2322 						preferred) ? 1 : 0);
   2323 			} else {
   2324 				if (flags == MDI_SELECT_ONLINE_PATH) {
   2325 					cond = ((MDI_PI(pip)->pi_state ==
   2326 					    MDI_PATHINFO_STATE_ONLINE &&
   2327 						MDI_PI(pip)->pi_preferred ==
   2328 						preferred) ? 1 : 0);
   2329 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
   2330 					cond = ((MDI_PI(pip)->pi_state ==
   2331 					    MDI_PATHINFO_STATE_STANDBY &&
   2332 						MDI_PI(pip)->pi_preferred ==
   2333 						preferred) ? 1 : 0);
   2334 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
   2335 				    MDI_SELECT_STANDBY_PATH)) {
   2336 					cond = (((MDI_PI(pip)->pi_state ==
   2337 					    MDI_PATHINFO_STATE_ONLINE ||
   2338 					    (MDI_PI(pip)->pi_state ==
   2339 					    MDI_PATHINFO_STATE_STANDBY)) &&
   2340 						MDI_PI(pip)->pi_preferred ==
   2341 						preferred) ? 1 : 0);
   2342 				} else if (flags ==
   2343 					(MDI_SELECT_STANDBY_PATH |
   2344 					MDI_SELECT_ONLINE_PATH |
   2345 					MDI_SELECT_USER_DISABLE_PATH)) {
   2346 					cond = (((MDI_PI(pip)->pi_state ==
   2347 					    MDI_PATHINFO_STATE_ONLINE ||
   2348 					    (MDI_PI(pip)->pi_state ==
   2349 					    MDI_PATHINFO_STATE_STANDBY) ||
   2350 						(MDI_PI(pip)->pi_state ==
   2351 					    (MDI_PATHINFO_STATE_ONLINE|
   2352 					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
   2353 						(MDI_PI(pip)->pi_state ==
   2354 					    (MDI_PATHINFO_STATE_STANDBY |
   2355 					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
   2356 						MDI_PI(pip)->pi_preferred ==
   2357 						preferred) ? 1 : 0);
   2358 				} else if (flags ==
   2359 				    (MDI_SELECT_STANDBY_PATH |
   2360 				    MDI_SELECT_ONLINE_PATH |
   2361 				    MDI_SELECT_NO_PREFERRED)) {
        					/* preference is deliberately not compared */
   2362 					cond = (((MDI_PI(pip)->pi_state ==
   2363 					    MDI_PATHINFO_STATE_ONLINE) ||
   2364 					    (MDI_PI(pip)->pi_state ==
   2365 					    MDI_PATHINFO_STATE_STANDBY))
   2366 					    ? 1 : 0);
   2367 				} else {
   2368 					cond = 0;
   2369 				}
   2370 			}
   2371 			/*
   2372 			 * No need to explicitly check if the path is disabled.
   2373 			 * Since we are checking for state == ONLINE and the
   2374 			 * same variable is used for DISABLE/ENABLE information.
   2375 			 */
   2376 			if (cond) {
   2377 				/*
   2378 				 * Return the path in hold state. Caller should
   2379 				 * release the hold by calling mdi_rele_path()
   2380 				 */
   2381 				MDI_PI_HOLD(pip);
   2382 				MDI_PI_UNLOCK(pip);
        				/* only the standard behavior moves the RR cursor */
   2383 				if (sb)
   2384 					ct->ct_path_last = pip;
   2385 				*ret_pip = pip;
   2386 				MDI_CLIENT_UNLOCK(ct);
   2387 				return (MDI_SUCCESS);
   2388 			}
   2389 			/*
   2390 			 * Path is busy.
   2391 			 */
   2392 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
   2393 			    MDI_PI_IS_TRANSIENT(pip))
   2394 				retry = 1;
   2395 
   2396 			/*
   2397 			 * Keep looking for a next available online path
        			 * (do_again is always entered with pip locked; the
        			 * goto further down relocks pip before jumping here)
   2398 			 */
   2399 do_again:
   2400 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
   2401 			if (next == NULL) {
   2402 				if ( flags & MDI_SELECT_NO_PREFERRED) {
   2403 					/*
   2404 					 * Bail out since we hit the end of list
   2405 					 */
   2406 					MDI_PI_UNLOCK(pip);
   2407 					break;
   2408 				}
   2409 
   2410 				if (!sb) {
   2411 					if (preferred == 1) {
   2412 						/*
   2413 						 * Looks like we reached the
   2414 						 * end of the list. Lets enable
   2415 						 * traversal of non preferred
   2416 						 * paths.
   2417 						 */
   2418 						preferred = 0;
   2419 						next = head;
   2420 					} else {
   2421 						/*
   2422 						 * We have done both the passes
   2423 						 * Preferred as well as for
   2424 						 * Non-preferred. Bail out now.
   2425 						 */
   2426 						cont = 0;
   2427 					}
   2428 				} else {
   2429 					/*
   2430 					 * Standard behavior case.
   2431 					 */
   2432 					next = head;
   2433 				}
   2434 			}
   2435 			MDI_PI_UNLOCK(pip);
   2436 			if (cont == 0) {
   2437 				break;
   2438 			}
   2439 			pip = next;
   2440 
   2441 			if (!sb) {
   2442 				/*
   2443 				 * We need to handle the selection of
   2444 				 * non-preferred path in the following
   2445 				 * case:
   2446 				 *
   2447 				 * +------+   +------+   +------+   +-----+
   2448 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
   2449 				 * +------+   +------+   +------+   +-----+
   2450 				 *
   2451 				 * If we start the search with B, we need to
   2452 				 * skip beyond B to pick C which is non -
   2453 				 * preferred in the second pass. The following
   2454 				 * test, if true, will allow us to skip over
   2455 				 * the 'start'(B in the example) to select
   2456 				 * other non preferred elements.
   2457 				 */
   2458 				if ((start_pip != NULL) && (start_pip == pip) &&
   2459 				    (MDI_PI(start_pip)->pi_preferred
   2460 				    != preferred)) {
   2461 					/*
   2462 					 * try again after going past the start
   2463 					 * pip
   2464 					 */
   2465 					MDI_PI_LOCK(pip);
   2466 					goto do_again;
   2467 				}
   2468 			} else {
   2469 				/*
   2470 				 * Standard behavior case
   2471 				 */
   2472 				if (start == pip && preferred) {
   2473 					/* look for nonpreferred paths */
   2474 					preferred = 0;
   2475 				} else if (start == pip && !preferred) {
   2476 					/*
   2477 					 * Exit condition
   2478 					 */
   2479 					cont = 0;
   2480 				}
   2481 			}
   2482 		} while (cont);
   2483 		break;
   2484 	}
   2485 
   2486 	MDI_CLIENT_UNLOCK(ct);
        	/*
        	 * No usable path was found.  If any path examined was in a
        	 * transient state, report MDI_BUSY rather than MDI_NOPATH.
        	 */
   2487 	if (retry == 1) {
   2488 		return (MDI_BUSY);
   2489 	} else {
   2490 		return (MDI_NOPATH);
   2491 	}
   2492 }
   2493 
   2494 /*
   2495  * mdi_get_next_phci_path():
   2496  *		Iterator over the paths of a client.  Pass pip == NULL to get
   2497  *		the first path on the client's list, or the previously returned
   2498  *		path to get its successor.  Returns NULL at the end of the list
   2499  *		or when ct_dip is not an MDI client.
   2500  *
   2501  * Note:
   2502  *		No lock is taken here.  The caller should hold the branch's
   2503  *		devinfo node to get a consistent snapshot of the list.  Even
   2504  *		then, the state and properties of each mdi_pathinfo node are
   2505  *		volatile; bracket accesses with mdi_pi_lock()/mdi_pi_unlock()
   2506  *		to read them consistently.
   2507  *
   2508  *		To use a node beyond the period the devinfo node is held (for
   2509  *		example for I/O), take a hold with mdi_hold_path() and drop it
   2510  *		with mdi_rele_path() when done.
   2511  */
   2512 mdi_pathinfo_t *
   2513 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
   2514 {
   2515 	mdi_client_t	*client;
   2516 	mdi_pathinfo_t	*next_path;
   2517 
   2518 	if (!MDI_CLIENT(ct_dip)) {
   2519 		return (NULL);
   2520 	}
   2521 
   2522 	client = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
   2523 	ASSERT(client != NULL);
   2524 
   2525 	/* A NULL cursor starts the walk at the head of the client's list. */
   2526 	next_path = (pip == NULL) ? (mdi_pathinfo_t *)client->ct_path_head :
   2527 	    (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
   2528 
   2529 	return (next_path);
   2530 }
   2531 
   2532 /*
   2533  * mdi_get_next_client_path():
   2534  *		Iterator over the paths of a pHCI.  pip == NULL yields the first
         *		path on the pHCI's list, otherwise the successor of pip on the
         *		pHCI link.  Returns NULL at the end of the list or when ph_dip
         *		is not a pHCI.
         * Note: the caveats listed for mdi_get_next_phci_path() apply here too.
   2535  */
   2536 mdi_pathinfo_t *
   2537 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
   2538 {
   2539 	mdi_phci_t	*phci;
   2540 
   2541 	if (!MDI_PHCI(ph_dip)) {
   2542 		return (NULL);
   2543 	}
   2544 
   2545 	phci = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
   2546 	ASSERT(phci != NULL);
   2547 
   2548 	/* Continue along the pHCI link, or start from the list head. */
   2549 	if (pip != NULL) {
   2550 		return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
   2551 	}
   2552 
   2553 	return ((mdi_pathinfo_t *)phci->ph_path_head);
   2554 }
   2555 
   2556 /*
   2557  * mdi_hold_path():
   2558  *		Take a hold on the mdi_pathinfo node, under the node's lock, to
   2559  *		guard it against an unexpected free.  NULL pip is ignored.
   2560  * Return Values: None
   2561  */
   2562 void
   2563 mdi_hold_path(mdi_pathinfo_t *pip)
   2564 {
   2565 	if (pip == NULL)
   2566 		return;
   2567 	MDI_PI_LOCK(pip);
   2568 	MDI_PI_HOLD(pip);
   2569 	MDI_PI_UNLOCK(pip);
   2570 }
   2571 
   2572 
   2573 /*
   2574  * mdi_rele_path():
   2575  *		Drop a hold on the mdi_pathinfo node that was obtained from
   2576  *		mdi_select_path() or taken explicitly with mdi_hold_path().
   2577  *		When pi_ref_cnt reaches zero, waiters on pi_ref_cv are woken.
   2578  *		NULL pip is ignored.
   2579  * Return Values: None
   2580  */
   2581 void
   2582 mdi_rele_path(mdi_pathinfo_t *pip)
   2583 {
   2584 	if (pip == NULL)
   2585 		return;
   2586 	MDI_PI_LOCK(pip);
   2587 	MDI_PI_RELE(pip);
   2588 	/* Last hold gone: wake every thread waiting on pi_ref_cv. */
   2589 	if (MDI_PI(pip)->pi_ref_cnt == 0)
   2590 		cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
   2591 	MDI_PI_UNLOCK(pip);
   2592 }
   2593 
   2594 /*
   2595  * mdi_pi_lock():
   2596  *		Acquire the lock of the mdi_pathinfo node.  A NULL pip trips the
   2597  *		ASSERT (where assertions are compiled in), else it is ignored.
   2598  * Note:	The caller must drop the lock again with mdi_pi_unlock().
   2599  */
   2600 void
   2601 mdi_pi_lock(mdi_pathinfo_t *pip)
   2602 {
   2603 	ASSERT(pip != NULL);
   2604 	if (pip == NULL)
   2605 		return;
   2606 	MDI_PI_LOCK(pip);
   2607 }
   2608 
   2609 
   2610 /*
   2611  * mdi_pi_unlock():
   2612  *		Release the lock of the mdi_pathinfo node.  A NULL pip trips the
   2613  *		ASSERT (where assertions are compiled in), else it is ignored.
   2614  * Note:	The node should have been locked with mdi_pi_lock().
   2615  */
   2616 void
   2617 mdi_pi_unlock(mdi_pathinfo_t *pip)
   2618 {
   2619 	ASSERT(pip != NULL);
   2620 	if (pip == NULL)
   2621 		return;
   2622 	MDI_PI_UNLOCK(pip);
   2623 }
   2624 
   2625 /*
   2626  * mdi_pi_find():
   2627  *		Search the list of mdi_pathinfo nodes attached to the
   2628  *		pHCI/Client device node whose path address matches "paddr".
   2629  *		Returns a pointer to the mdi_pathinfo node if a matching node is
   2630  *		found.
         *
         *		pdip  - devinfo node of the pHCI (must not be NULL)
         *		caddr - client unit address, or NULL
         *		paddr - path unit address to match (must not be NULL)
         *
         *		With caddr == NULL the pHCI's own path list is searched, and
         *		an offline pHCI yields NULL.  Otherwise the client named by
         *		caddr is looked up on the vHCI and its path list is searched
         *		for a node that belongs to this pHCI.
   2631  * Return Values:
   2632  *		mdi_pathinfo node handle
   2633  *		NULL
   2634  * Notes:
   2635  *		Caller need not hold any locks to call this function.
         *		The locks taken internally are dropped before returning and
         *		no hold is placed on the node that is returned.
   2636  */
   2637 mdi_pathinfo_t *
   2638 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
   2639 {
   2640 	mdi_phci_t		*ph;
   2641 	mdi_vhci_t		*vh;
   2642 	mdi_client_t		*ct;
   2643 	mdi_pathinfo_t		*pip = NULL;
   2644 
   2645 	MDI_DEBUG(2, (MDI_NOTE, pdip,
   2646 	    "caddr@%s paddr@%s", caddr ? caddr : "", paddr ? paddr : ""));
   2647 	if ((pdip == NULL) || (paddr == NULL)) {
   2648 		return (NULL);
   2649 	}
   2650 	ph = i_devi_get_phci(pdip);
   2651 	if (ph == NULL) {
   2652 		/*
   2653 		 * Invalid pHCI device, Nothing more to do.
   2654 		 */
   2655 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid phci"));
   2656 		return (NULL);
   2657 	}
   2658 
   2659 	vh = ph->ph_vhci;
   2660 	if (vh == NULL) {
   2661 		/*
   2662 		 * Invalid vHCI device, Nothing more to do.
   2663 		 */
   2664 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid vhci"));
   2665 		return (NULL);
   2666 	}
   2667 
   2668 	/*
   2669 	 * Look for pathinfo node identified by paddr.
   2670 	 */
   2671 	if (caddr == NULL) {
   2672 		/*
   2673 		 * Find a mdi_pathinfo node under pHCI list for a matching
   2674 		 * unit address.
   2675 		 */
   2676 		MDI_PHCI_LOCK(ph);
   2677 		if (MDI_PHCI_IS_OFFLINE(ph)) {
   2678 			MDI_DEBUG(2, (MDI_WARN, pdip,
   2679 			    "offline phci %p", (void *)ph));
   2680 			MDI_PHCI_UNLOCK(ph);
   2681 			return (NULL);
   2682 		}
   2683 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
   2684 
        		/* walk the pHCI link; pip is left NULL if nothing matches */
   2685 		while (pip != NULL) {
   2686 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
   2687 				break;
   2688 			}
   2689 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
   2690 		}
   2691 		MDI_PHCI_UNLOCK(ph);
   2692 		MDI_DEBUG(2, (MDI_NOTE, pdip,
   2693 		    "found %s %p", mdi_pi_spathname(pip), (void *)pip));
   2694 		return (pip);
   2695 	}
   2696 
   2697 	/*
   2698 	 * XXX - Is the rest of the code in this function really necessary?
   2699 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
   2700 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
   2701 	 * whether the search is based on the pathinfo nodes attached to
   2702 	 * the pHCI or the client node, the result will be the same.
   2703 	 */
   2704 
   2705 	/*
   2706 	 * Find the client device corresponding to 'caddr'
   2707 	 */
   2708 	MDI_VHCI_CLIENT_LOCK(vh);
   2709 
   2710 	/*
   2711 	 * XXX - Passing NULL to the following function works as long as the
   2712 	 * the client addresses (caddr) are unique per vhci basis.
   2713 	 */
   2714 	ct = i_mdi_client_find(vh, NULL, caddr);
   2715 	if (ct == NULL) {
   2716 		/*
   2717 		 * Client not found, Obviously mdi_pathinfo node has not been
   2718 		 * created yet.
   2719 		 */
   2720 		MDI_VHCI_CLIENT_UNLOCK(vh);
        		/* caddr is non-NULL on this path; the test is only defensive */
   2721 		MDI_DEBUG(2, (MDI_NOTE, pdip,
   2722 		    "client not found for caddr @%s", caddr ? caddr : ""));
   2723 		return (NULL);
   2724 	}
   2725 
   2726 	/*
   2727 	 * Hold the client lock and look for a mdi_pathinfo node with matching
   2728 	 * pHCI and paddr
   2729 	 */
   2730 	MDI_CLIENT_LOCK(ct);
   2731 
   2732 	/*
   2733 	 * Release the global mutex as it is no more needed. Note: We always
   2734 	 * respect the locking order while acquiring.
   2735 	 */
   2736 	MDI_VHCI_CLIENT_UNLOCK(vh);
   2737 
   2738 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
   2739 	while (pip != NULL) {
   2740 		/*
   2741 		 * Compare the unit address
        		 * (only paths that belong to this pHCI are considered)
   2742 		 */
   2743 		if ((MDI_PI(pip)->pi_phci == ph) &&
   2744 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
   2745 			break;
   2746 		}
   2747 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
   2748 	}
   2749 	MDI_CLIENT_UNLOCK(ct);
   2750 	MDI_DEBUG(2, (MDI_NOTE, pdip,
   2751 	    "found: %s %p", mdi_pi_spathname(pip), (void *)pip));
   2752 	return (pip);
   2753 }
   2754 
   2755 /*
   2756  * mdi_pi_alloc_compatible():
   2757  *		Allocate and initialize a new instance of a mdi_pathinfo node.
   2758  *		The mdi_pathinfo node returned by this function identifies a
   2759  *		unique device path and is capable of having properties attached
   2760  *		and passed to mdi_pi_online() to fully attach and online the
   2761  *		path and client device node.
   2762  *		The mdi_pathinfo node returned by this function must be
   2763  *		destroyed using mdi_pi_free() if the path is no longer
   2764  *		operational or if the caller fails to attach a client device
   2765  *		node when calling mdi_pi_online(). The framework will not free
   2766  *		the resources allocated.
   2767  *		This function can be called from both interrupt and kernel
   2768  *		contexts.  DDI_NOSLEEP flag should be used while calling
   2769  *		from interrupt contexts.
         *		NOTE(review): "flags" is not referenced anywhere in this
         *		function body (hence ARGSUSED) - confirm the statement above.
         *
         *		If the client (cname, caddr) does not exist yet it is created,
         *		together with its devinfo node; "compatible"/"ncompatible" are
         *		handed to i_mdi_devinfo_create() for that node.  If the client
         *		already has a path from this pHCI with unit address "paddr",
         *		that existing node is returned instead of a new one.
   2770  * Return Values:
   2771  *		MDI_SUCCESS
   2772  *		MDI_FAILURE	- NULL argument, detaching pHCI, or invalid
         *				  pHCI/vHCI
   2773  *		MDI_NOMEM	- the client devinfo node could not be created
         *		MDI_BUSY	- the pHCI is not in the ready state
   2774  */
   2775 /*ARGSUSED*/
   2776 int
   2777 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
   2778     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
   2779 {
   2780 	mdi_vhci_t	*vh;
   2781 	mdi_phci_t	*ph;
   2782 	mdi_client_t	*ct;
   2783 	mdi_pathinfo_t	*pip = NULL;
   2784 	dev_info_t	*cdip;
   2785 	int		rv = MDI_NOMEM;
   2786 	int		path_allocated = 0;
   2787 
   2788 	MDI_DEBUG(2, (MDI_NOTE, pdip,
   2789 	    "cname %s: caddr@%s paddr@%s",
   2790 	    cname ? cname : "", caddr ? caddr : "", paddr ? paddr : ""));
   2791 
   2792 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
   2793 	    ret_pip == NULL) {
   2794 		/* Nothing more to do */
   2795 		return (MDI_FAILURE);
   2796 	}
   2797 
   2798 	*ret_pip = NULL;
   2799 
   2800 	/* No allocations on detaching pHCI */
   2801 	if (DEVI_IS_DETACHING(pdip)) {
   2802 		/* Invalid pHCI device, return failure */
   2803 		MDI_DEBUG(1, (MDI_WARN, pdip,
   2804 		    "!detaching pHCI=%p", (void *)pdip));
   2805 		return (MDI_FAILURE);
   2806 	}
   2807 
   2808 	ph = i_devi_get_phci(pdip);
   2809 	ASSERT(ph != NULL);
   2810 	if (ph == NULL) {
   2811 		/* Invalid pHCI device, return failure */
   2812 		MDI_DEBUG(1, (MDI_WARN, pdip,
   2813 		    "!invalid pHCI=%p", (void *)pdip));
   2814 		return (MDI_FAILURE);
   2815 	}
   2816 
   2817 	MDI_PHCI_LOCK(ph);
   2818 	vh = ph->ph_vhci;
   2819 	if (vh == NULL) {
   2820 		/* Invalid vHCI device, return failure */
   2821 		MDI_DEBUG(1, (MDI_WARN, pdip,
   2822 		    "!invalid vHCI=%p", (void *)pdip));
   2823 		MDI_PHCI_UNLOCK(ph);
   2824 		return (MDI_FAILURE);
   2825 	}
   2826 
   2827 	if (MDI_PHCI_IS_READY(ph) == 0) {
   2828 		/*
   2829 		 * Do not allow new node creation when pHCI is in
   2830 		 * offline/suspended states
   2831 		 */
   2832 		MDI_DEBUG(1, (MDI_WARN, pdip,
   2833 		    "pHCI=%p is not ready", (void *)ph));
   2834 		MDI_PHCI_UNLOCK(ph);
   2835 		return (MDI_BUSY);
   2836 	}
        	/*
        	 * The pHCI stays marked unstable while the allocation is in
        	 * progress; it is marked stable again at the common exit below.
        	 */
   2837 	MDI_PHCI_UNSTABLE(ph);
   2838 	MDI_PHCI_UNLOCK(ph);
   2839 
   2840 	/* look for a matching client, create one if not found */
   2841 	MDI_VHCI_CLIENT_LOCK(vh);
   2842 	ct = i_mdi_client_find(vh, cname, caddr);
   2843 	if (ct == NULL) {
   2844 		ct = i_mdi_client_alloc(vh, cname, caddr);
   2845 		ASSERT(ct != NULL);
   2846 	}
   2847 
   2848 	if (ct->ct_dip == NULL) {
   2849 		/*
   2850 		 * Allocate a devinfo node
        		 * On failure rv still holds its initial value, MDI_NOMEM.
        		 * NOTE(review): the client is freed here whether it was just
        		 * allocated above or found by i_mdi_client_find(), and the
        		 * result of i_mdi_client_free() is ignored.
   2851 		 */
   2852 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
   2853 		    compatible, ncompatible);
   2854 		if (ct->ct_dip == NULL) {
   2855 			(void) i_mdi_client_free(vh, ct);
   2856 			goto fail;
   2857 		}
   2858 	}
   2859 	cdip = ct->ct_dip;
   2860 
        	/* tie the devinfo node back to its client structure */
   2861 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
   2862 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
   2863 
   2864 	MDI_CLIENT_LOCK(ct);
   2865 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
   2866 	while (pip != NULL) {
   2867 		/*
   2868 		 * Compare the unit address
        		 * (an existing path from this pHCI with the same address is
        		 * reused rather than duplicated)
   2869 		 */
   2870 		if ((MDI_PI(pip)->pi_phci == ph) &&
   2871 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
   2872 			break;
   2873 		}
   2874 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
   2875 	}
   2876 	MDI_CLIENT_UNLOCK(ct);
   2877 
   2878 	if (pip == NULL) {
   2879 		/*
   2880 		 * This is a new path for this client device.  Allocate and
   2881 		 * initialize a new pathinfo node
   2882 		 */
   2883 		pip = i_mdi_pi_alloc(ph, paddr, ct);
   2884 		ASSERT(pip != NULL);
   2885 		path_allocated = 1;
   2886 	}
   2887 	rv = MDI_SUCCESS;
   2888 
        	/*
        	 * Common exit, reached on success as well as through "goto fail".
        	 * On the failure path pip is still NULL and path_allocated is 0.
        	 */
   2889 fail:
   2890 	/*
   2891 	 * Release the global mutex.
   2892 	 */
   2893 	MDI_VHCI_CLIENT_UNLOCK(vh);
   2894 
   2895 	/*
   2896 	 * Mark the pHCI as stable
   2897 	 */
   2898 	MDI_PHCI_LOCK(ph);
   2899 	MDI_PHCI_STABLE(ph);
   2900 	MDI_PHCI_UNLOCK(ph);
   2901 	*ret_pip = pip;
   2902 
   2903 	MDI_DEBUG(2, (MDI_NOTE, pdip,
   2904 	    "alloc %s %p", mdi_pi_spathname(pip), (void *)pip));
   2905 
        	/*
        	 * Only a path created by this call is passed to vhcache_pi_add()
        	 * (presumably the vHCI's cached configuration - verify).
        	 */
   2906 	if (path_allocated)
   2907 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
   2908 
   2909 	return (rv);
   2910 }
   2911 
   2912 /*ARGSUSED*/
   2913 int
   2914 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
   2915     int flags, mdi_pathinfo_t **ret_pip)
   2916 {
   2917 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
   2918 	    flags, ret_pip));
   2919 }
   2920 
   2921 /*
   2922  * i_mdi_pi_alloc():
   2923  *		Allocate a mdi_pathinfo node and add to the pHCI path list
   2924  * Return Values:
   2925  *		mdi_pathinfo
   2926  */
   2927 /*ARGSUSED*/
   2928 static mdi_pathinfo_t *
   2929 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
   2930 {
   2931 	mdi_pathinfo_t	*pip;
   2932 	int		ct_circular;
   2933 	int		ph_circular;
   2934 	static char	path[MAXPATHLEN];	/* mdi_pathmap_mutex protects */
   2935 	char		*path_persistent;
   2936 	int		path_instance;
   2937 	mod_hash_val_t	hv;
   2938 
   2939 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
   2940 
   2941 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
   2942 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
   2943 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
   2944 	    MDI_PATHINFO_STATE_TRANSIENT;
   2945 
   2946 	if (MDI_PHCI_IS_USER_DISABLED(ph))
   2947 		MDI_PI_SET_USER_DISABLE(pip);
   2948 
   2949 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
   2950 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
   2951 
   2952 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
   2953 		MDI_PI_SET_DRV_DISABLE(pip);
   2954 
   2955 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
   2956 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
   2957 	MDI_PI(pip)->pi_client = ct;
   2958 	MDI_PI(pip)->pi_phci = ph;
   2959 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
   2960 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
   2961 
   2962         /*
   2963 	 * We form the "path" to the pathinfo node, and see if we have
   2964 	 * already allocated a 'path_instance' for that "path".  If so,
   2965 	 * we use the already allocated 'path_instance'.  If not, we
   2966 	 * allocate a new 'path_instance' and associate it with a copy of
   2967 	 * the "path" string (which is never freed). The association
   2968 	 * between a 'path_instance' this "path" string persists until
   2969 	 * reboot.
   2970 	 */
   2971         mutex_enter(&mdi_pathmap_mutex);
   2972 	(void) ddi_pathname(ph->ph_dip, path);
   2973 	(void) sprintf(path + strlen(path), "/%s@%s",
   2974 	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
   2975         if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
   2976                 path_instance = (uint_t)(intptr_t)hv;
   2977         } else {
   2978 		/* allocate a new 'path_instance' and persistent "path" */
   2979 		path_instance = mdi_pathmap_instance++;
   2980 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
   2981                 (void) mod_hash_insert(mdi_pathmap_bypath,
   2982                     (mod_hash_key_t)path_persistent,
   2983                     (mod_hash_val_t)(intptr_t)path_instance);
   2984 		(void) mod_hash_insert(mdi_pathmap_byinstance,
   2985 		    (mod_hash_key_t)(intptr_t)path_instance,
   2986 		    (mod_hash_val_t)path_persistent);
   2987 
   2988 		/* create shortpath name */
   2989 		(void) snprintf(path, sizeof(path), "%s%d/%s@%s",
   2990 		    ddi_driver_name(ph->ph_dip), ddi_get_instance(ph->ph_dip),
   2991 		    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
   2992 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
   2993 		(void) mod_hash_insert(mdi_pathmap_sbyinstance,
   2994 		    (mod_hash_key_t)(intptr_t)path_instance,
   2995 		    (mod_hash_val_t)path_persistent);
   2996         }
   2997         mutex_exit(&mdi_pathmap_mutex);
   2998 	MDI_PI(pip)->pi_path_instance = path_instance;
   2999 
   3000 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
   3001 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
   3002 	MDI_PI(pip)->pi_pprivate = NULL;
   3003 	MDI_PI(pip)->pi_cprivate = NULL;
   3004 	MDI_PI(pip)->pi_vprivate = NULL;
   3005 	MDI_PI(pip)->pi_client_link = NULL;
   3006 	MDI_PI(pip)->pi_phci_link = NULL;
   3007 	MDI_PI(pip)->pi_ref_cnt = 0;
   3008 	MDI_PI(pip)->pi_kstats = NULL;
   3009 	MDI_PI(pip)->pi_preferred = 1;
   3010 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
   3011 
   3012 	/*
   3013 	 * Lock both dev_info nodes against changes in parallel.
   3014 	 *
   3015 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
   3016 	 * This atypical operation is done to synchronize pathinfo nodes
   3017 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
   3018 	 * the pathinfo nodes are children of the Client.
   3019 	 */
   3020 	ndi_devi_enter(ct->ct_dip, &ct_circular);
   3021 	ndi_devi_enter(ph->ph_dip, &ph_circular);
   3022 
   3023 	i_mdi_phci_add_path(ph, pip);
   3024 	i_mdi_client_add_path(ct, pip);
   3025 
   3026 	ndi_devi_exit(ph->ph_dip, ph_circular);
   3027 	ndi_devi_exit(ct->ct_dip, ct_circular);
   3028 
   3029 	return (pip);
   3030 }
   3031 
   3032 /*
   3033  * mdi_pi_pathname_by_instance():
   3034  *	Lookup of "path" by 'path_instance'. Return "path".
   3035  *	NOTE: returned "path" remains valid forever (until reboot).
   3036  */
   3037 char *
   3038 mdi_pi_pathname_by_instance(int path_instance)
   3039 {
   3040 	char		*path;
   3041 	mod_hash_val_t	hv;
   3042 
   3043 	/* mdi_pathmap lookup of "path" by 'path_instance' */
   3044 	mutex_enter(&mdi_pathmap_mutex);
   3045 	if (mod_hash_find(mdi_pathmap_byinstance,
   3046 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
   3047 		path = (char *)hv;
   3048 	else
   3049 		path = NULL;
   3050 	mutex_exit(&mdi_pathmap_mutex);
   3051 	return (path);
   3052 }
   3053 
   3054 /*
   3055  * mdi_pi_spathname_by_instance():
   3056  *	Lookup of "shortpath" by 'path_instance'. Return "shortpath".
   3057  *	NOTE: returned "shortpath" remains valid forever (until reboot).
   3058  */
   3059 char *
   3060 mdi_pi_spathname_by_instance(int path_instance)
   3061 {
   3062 	char		*path;
   3063 	mod_hash_val_t	hv;
   3064 
   3065 	/* mdi_pathmap lookup of "path" by 'path_instance' */
   3066 	mutex_enter(&mdi_pathmap_mutex);
   3067 	if (mod_hash_find(mdi_pathmap_sbyinstance,
   3068 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
   3069 		path = (char *)hv;
   3070 	else
   3071 		path = NULL;
   3072 	mutex_exit(&mdi_pathmap_mutex);
   3073 	return (path);
   3074 }
   3075 
   3076 
   3077 /*
   3078  * i_mdi_phci_add_path():
   3079  * 		Add a mdi_pathinfo node to pHCI list.
   3080  * Notes:
   3081  *		Caller should per-pHCI mutex
   3082  */
   3083 static void
   3084 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
   3085 {
   3086 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
   3087 
   3088 	MDI_PHCI_LOCK(ph);
   3089 	if (ph->ph_path_head == NULL) {
   3090 		ph->ph_path_head = pip;
   3091 	} else {
   3092 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
   3093 	}
   3094 	ph->ph_path_tail = pip;
   3095 	ph->ph_path_count++;
   3096 	MDI_PHCI_UNLOCK(ph);
   3097 }
   3098 
   3099 /*
   3100  * i_mdi_client_add_path():
   3101  *		Add mdi_pathinfo node to client list
   3102  */
   3103 static void
   3104 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
   3105 {
   3106 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
   3107 
   3108 	MDI_CLIENT_LOCK(ct);
   3109 	if (ct->ct_path_head == NULL) {
   3110 		ct->ct_path_head = pip;
   3111 	} else {
   3112 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
   3113 	}
   3114 	ct->ct_path_tail = pip;
   3115 	ct->ct_path_count++;
   3116 	MDI_CLIENT_UNLOCK(ct);
   3117 }
   3118 
   3119 /*
   3120  * mdi_pi_free():
   3121  *		Free the mdi_pathinfo node and also client device node if this
   3122  *		is the last path to the device
   3123  * Return Values:
   3124  *		MDI_SUCCESS
   3125  *		MDI_FAILURE
   3126  *		MDI_BUSY
   3127  */
   3128 /*ARGSUSED*/
   3129 int
   3130 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
   3131 {
   3132 	int		rv = MDI_FAILURE;
   3133 	mdi_vhci_t	*vh;
   3134 	mdi_phci_t	*ph;
   3135 	mdi_client_t	*ct;
   3136 	int		(*f)();
   3137 	int		client_held = 0;
   3138 
   3139 	MDI_PI_LOCK(pip);
   3140 	ph = MDI_PI(pip)->pi_phci;
   3141 	ASSERT(ph != NULL);
   3142 	if (ph == NULL) {
   3143 		/*
   3144 		 * Invalid pHCI device, return failure
   3145 		 */
   3146 		MDI_DEBUG(1, (MDI_WARN, NULL,
   3147 		    "!invalid pHCI: pip %s %p",
   3148 		    mdi_pi_spathname(pip), (void *)pip));
   3149 		MDI_PI_UNLOCK(pip);
   3150 		return (MDI_FAILURE);
   3151 	}
   3152 
   3153 	vh = ph->ph_vhci;
   3154 	ASSERT(vh != NULL);
   3155 	if (vh == NULL) {
   3156 		/* Invalid pHCI device, return failure */
   3157 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
   3158 		    "!invalid vHCI: pip %s %p",
   3159 		    mdi_pi_spathname(pip), (void *)pip));
   3160 		MDI_PI_UNLOCK(pip);
   3161 		return (MDI_FAILURE);
   3162 	}
   3163 
   3164 	ct = MDI_PI(pip)->pi_client;
   3165 	ASSERT(ct != NULL);
   3166 	if (ct == NULL) {
   3167 		/*
   3168 		 * Invalid Client device, return failure
   3169 		 */
   3170 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
   3171 		    "!invalid client: pip %s %p",
   3172 		    mdi_pi_spathname(pip), (void *)pip));
   3173 		MDI_PI_UNLOCK(pip);
   3174 		return (MDI_FAILURE);
   3175 	}
   3176 
   3177 	/*
   3178 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
   3179 	 * if the node state is either offline or init and the reference count
   3180 	 * is zero.
   3181 	 */
   3182 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
   3183 	    MDI_PI_IS_INITING(pip))) {
   3184 		/*
   3185 		 * Node is busy
   3186 		 */
   3187 		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
   3188 		    "!busy: pip %s %p", mdi_pi_spathname(pip), (void *)pip));
   3189 		MDI_PI_UNLOCK(pip);
   3190 		return (MDI_BUSY);
   3191 	}
   3192 
   3193 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
   3194 		/*
   3195 		 * Give a chance for pending I/Os to complete.
   3196 		 */
   3197 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
   3198 		    "!%d cmds still pending on path: %s %p",
   3199 		    MDI_PI(pip)->pi_ref_cnt,
   3200 		    mdi_pi_spathname(pip), (void *)pip));
   3201 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
   3202 		    &MDI_PI(pip)->pi_mutex,
   3203 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
   3204 			/*
   3205 			 * The timeout time reached without ref_cnt being zero
   3206 			 * being signaled.
   3207 			 */
   3208 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
   3209 			    "!Timeout reached on path %s %p without the cond",
   3210 			    mdi_pi_spathname(pip), (void *)pip));
   3211 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
   3212 			    "!%d cmds still pending on path %s %p",
   3213 			    MDI_PI(pip)->pi_ref_cnt,
   3214 			    mdi_pi_spathname(pip), (void *)pip));
   3215 			MDI_PI_UNLOCK(pip);
   3216 			return (MDI_BUSY);
   3217 		}
   3218 	}
   3219 	if (MDI_PI(pip)->pi_pm_held) {
   3220 		client_held = 1;
   3221 	}
   3222 	MDI_PI_UNLOCK(pip);
   3223 
   3224 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
   3225 
   3226 	MDI_CLIENT_LOCK(ct);
   3227 
   3228 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
   3229 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
   3230 
   3231 	/*
   3232 	 * Wait till failover is complete before removing this node.
   3233 	 */
   3234 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
   3235 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
   3236 
   3237 	MDI_CLIENT_UNLOCK(ct);
   3238 	MDI_VHCI_CLIENT_LOCK(vh);
   3239 	MDI_CLIENT_LOCK(ct);
   3240 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
   3241 
   3242 	if (!MDI_PI_IS_INITING(pip)) {
   3243 		f = vh->vh_ops->vo_pi_uninit;
   3244 		if (f != NULL) {
   3245 			rv = (*f)(vh->vh_dip, pip, 0);
   3246 		}
   3247 	}
   3248 	/*
   3249 	 * If vo_pi_uninit() completed successfully.
   3250 	 */
   3251 	if (rv == MDI_SUCCESS) {
   3252 		if (client_held) {
   3253 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
   3254 			    "i_mdi_pm_rele_client\n"));
   3255 			i_mdi_pm_rele_client(ct, 1);
   3256 		}
   3257 		i_mdi_pi_free(ph, pip, ct);
   3258 		if (ct->ct_path_count == 0) {
   3259 			/*
   3260 			 * Client lost its last path.
   3261 			 * Clean up the client device
   3262 			 */
   3263 			MDI_CLIENT_UNLOCK(ct);
   3264 			(void) i_mdi_client_free(ct->ct_vhci, ct);
   3265 			MDI_VHCI_CLIENT_UNLOCK(vh);
   3266 			return (rv);
   3267 		}
   3268 	}
   3269 	MDI_CLIENT_UNLOCK(ct);
   3270 	MDI_VHCI_CLIENT_UNLOCK(vh);
   3271 
   3272 	if (rv == MDI_FAILURE)
   3273 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
   3274 
   3275 	return (rv);
   3276 }
   3277 
   3278 /*
   3279  * i_mdi_pi_free():
   3280  *		Free the mdi_pathinfo node
   3281  */
   3282 static void
   3283 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
   3284 {
   3285 	int	ct_circular;
   3286 	int	ph_circular;
   3287 
   3288 	ASSERT(MDI_CLIENT_LOCKED(ct));
   3289 
   3290 	/*
   3291 	 * remove any per-path kstats
   3292 	 */
   3293 	i_mdi_pi_kstat_destroy(pip);
   3294 
   3295 	/* See comments in i_mdi_pi_alloc() */
   3296 	ndi_devi_enter(ct->ct_dip, &ct_circular);
   3297 	ndi_devi_enter(ph->ph_dip, &ph_circular);
   3298 
   3299 	i_mdi_client_remove_path(ct, pip);
   3300 	i_mdi_phci_remove_path(ph, pip);
   3301 
   3302 	ndi_devi_exit(ph->ph_dip, ph_circular);
   3303 	ndi_devi_exit(ct->ct_dip, ct_circular);
   3304 
   3305 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
   3306 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
   3307 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
   3308 	if (MDI_PI(pip)->pi_addr) {
   3309 		kmem_free(MDI_PI(pip)->pi_addr,
   3310 		    strlen(MDI_PI(pip)->pi_addr) + 1);
   3311 		MDI_PI(pip)->pi_addr = NULL;
   3312 	}
   3313 
   3314 	if (MDI_PI(pip)->pi_prop) {
   3315 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
   3316 		MDI_PI(pip)->pi_prop = NULL;
   3317 	}
   3318 	kmem_free(pip, sizeof (struct mdi_pathinfo));
   3319 }
   3320 
   3321 
   3322 /*
   3323  * i_mdi_phci_remove_path():
   3324  * 		Remove a mdi_pathinfo node from pHCI list.
   3325  * Notes:
   3326  *		Caller should hold per-pHCI mutex
   3327  */
   3328 static void
   3329 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
   3330 {
   3331 	mdi_pathinfo_t	*prev = NULL;
   3332 	mdi_pathinfo_t	*path = NULL;
   3333 
   3334 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
   3335 
   3336 	MDI_PHCI_LOCK(ph);
   3337 	path = ph->ph_path_head;
   3338 	while (path != NULL) {
   3339 		if (path == pip) {
   3340 			break;
   3341 		}
   3342 		prev = path;
   3343 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
   3344 	}
   3345 
   3346 	if (path) {
   3347 		ph->ph_path_count--;
   3348 		if (prev) {
   3349 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
   3350 		} else {
   3351 			ph->ph_path_head =
   3352 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
   3353 		}
   3354 		if (ph->ph_path_tail == path) {
   3355 			ph->ph_path_tail = prev;
   3356 		}
   3357 	}
   3358 
   3359 	/*
   3360 	 * Clear the pHCI link
   3361 	 */
   3362 	MDI_PI(pip)->pi_phci_link = NULL;
   3363 	MDI_PI(pip)->pi_phci = NULL;
   3364 	MDI_PHCI_UNLOCK(ph);
   3365 }
   3366 
   3367 /*
   3368  * i_mdi_client_remove_path():
   3369  * 		Remove a mdi_pathinfo node from client path list.
   3370  */
   3371 static void
   3372 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
   3373 {
   3374 	mdi_pathinfo_t	*prev = NULL;
   3375 	mdi_pathinfo_t	*path;
   3376 
   3377 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
   3378 
   3379 	ASSERT(MDI_CLIENT_LOCKED(ct));
   3380 	path = ct->ct_path_head;
   3381 	while (path != NULL) {
   3382 		if (path == pip) {
   3383 			break;
   3384 		}
   3385 		prev = path;
   3386 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
   3387 	}
   3388 
   3389 	if (path) {
   3390 		ct->ct_path_count--;
   3391 		if (prev) {
   3392 			MDI_PI(prev)->pi_client_link =
   3393 			    MDI_PI(path)->pi_client_link;
   3394 		} else {
   3395 			ct->ct_path_head =
   3396 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
   3397 		}
   3398 		if (ct->ct_path_tail == path) {
   3399 			ct->ct_path_tail = prev;
   3400 		}
   3401 		if (ct->ct_path_last == path) {
   3402 			ct->ct_path_last = ct->ct_path_head;
   3403 		}
   3404 	}
   3405 	MDI_PI(pip)->pi_client_link = NULL;
   3406 	MDI_PI(pip)->pi_client = NULL;
   3407 }
   3408 
/*
 * i_mdi_pi_state_change():
 *		Common worker that moves a mdi_pathinfo node to the requested
 *		state (online, standby, fault or offline).  It runs the vHCI's
 *		vo_pi_init callback for a path that is still initializing,
 *		marks the path transient, calls the vHCI's vo_pi_state_change
 *		callback, and then updates the client state, onlining or
 *		offlining the client devinfo node where required.
 *
 * Arguments:
 *		pip	path whose state is to change
 *		state	requested MDI_PATHINFO_STATE_* value
 *		flag	NDI flags; NDI_USER_REQ is examined here and the
 *			whole value is handed to vo_pi_state_change
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY	pHCI not ready, or an ndi online/offline
 *				request returned NDI_BUSY
 *		A vo_pi_state_change result other than MDI_SUCCESS is
 *		returned to the caller unchanged.
 */
/*ARGSUSED*/
static int
i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
{
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	dev_info_t	*cdip;

	MDI_PI_LOCK(pip);

	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, NULL,
		    "!invalid phci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh);
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid vhci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid client device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid client: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	/*
	 * If this path has not been initialized yet, Callback vHCI driver's
	 * pathinfo node initialize entry point
	 */

	if (MDI_PI_IS_INITING(pip)) {
		/* The callback is made without the pip lock held */
		MDI_PI_UNLOCK(pip);
		f = vh->vh_ops->vo_pi_init;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
			if (rv != MDI_SUCCESS) {
				/*
				 * The path is left in its initing state;
				 * any vo_pi_init error is reported to the
				 * caller as MDI_FAILURE.
				 */
				MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
				    "!vo_pi_init failed: vHCI %p, pip %s %p",
				    (void *)vh, mdi_pi_spathname(pip),
				    (void *)pip));
				return (MDI_FAILURE);
			}
		}
		MDI_PI_LOCK(pip);
		MDI_PI_CLEAR_TRANSIENT(pip);
	}

	/*
	 * Do not allow state transition when pHCI is in offline/suspended
	 * states
	 */
	i_mdi_phci_lock(ph, pip);
	if (MDI_PHCI_IS_READY(ph) == 0) {
		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
		    "!pHCI not ready, pHCI=%p", (void *)ph));
		MDI_PI_UNLOCK(pip);
		i_mdi_phci_unlock(ph);
		return (MDI_BUSY);
	}
	/*
	 * Mark the pHCI unstable for the duration of the state change; the
	 * matching MDI_PHCI_STABLE() is at state_change_exit.
	 */
	MDI_PHCI_UNSTABLE(ph);
	i_mdi_phci_unlock(ph);

	/*
	 * Check if mdi_pathinfo state is in transient state.
	 * If yes, offlining is in progress and wait till transient state is
	 * cleared.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		while (MDI_PI_IS_TRANSIENT(pip)) {
			cv_wait(&MDI_PI(pip)->pi_state_cv,
			    &MDI_PI(pip)->pi_mutex);
		}
	}

	/*
	 * Grab the client lock in reverse order sequence and release the
	 * mdi_pathinfo mutex.
	 */
	i_mdi_client_lock(ct, pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * Wait till failover state is cleared
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Mark the mdi_pathinfo node state as transient
	 */
	MDI_PI_LOCK(pip);
	/*
	 * NOTE(review): there is no default case; any other 'state' value
	 * falls through the switch with no transient state set.
	 */
	switch (state) {
	case MDI_PATHINFO_STATE_ONLINE:
		MDI_PI_SET_ONLINING(pip);
		break;

	case MDI_PATHINFO_STATE_STANDBY:
		MDI_PI_SET_STANDBYING(pip);
		break;

	case MDI_PATHINFO_STATE_FAULT:
		/*
		 * Mark the pathinfo state as FAULTED
		 */
		MDI_PI_SET_FAULTING(pip);
		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
		break;

	case MDI_PATHINFO_STATE_OFFLINE:
		/*
		 * ndi_devi_offline() cannot hold pip or ct locks.
		 */
		MDI_PI_UNLOCK(pip);

		/*
		 * If this is a user initiated path online->offline operation
		 * who's success would transition a client from DEGRADED to
		 * FAILED then only proceed if we can offline the client first.
		 */
		/*
		 * NOTE(review): MDI_PI_IS_ONLINE(pip) below is evaluated
		 * after the pip lock was dropped just above - confirm this
		 * unlocked read is intended.
		 */
		cdip = ct->ct_dip;
		if ((flag & NDI_USER_REQ) &&
		    MDI_PI_IS_ONLINE(pip) &&
		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
			i_mdi_client_unlock(ct);
			rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN);
			if (rv != NDI_SUCCESS) {
				/*
				 * Convert to MDI error code
				 */
				switch (rv) {
				case NDI_BUSY:
					rv = MDI_BUSY;
					break;
				default:
					rv = MDI_FAILURE;
					break;
				}
				/*
				 * Neither the pip nor the client lock is
				 * held here; only the pHCI needs to be
				 * marked stable again.
				 */
				goto state_change_exit;
			} else {
				i_mdi_client_lock(ct, NULL);
			}
		}
		/*
		 * Mark the mdi_pathinfo node state as transient
		 */
		MDI_PI_LOCK(pip);
		MDI_PI_SET_OFFLINING(pip);
		break;
	}
	MDI_PI_UNLOCK(pip);
	MDI_CLIENT_UNSTABLE(ct);
	i_mdi_client_unlock(ct);

	/*
	 * Notify the vHCI of the state change with neither the client nor
	 * the pip lock held.  A vHCI without this entry point leaves rv as
	 * it was.
	 */
	f = vh->vh_ops->vo_pi_state_change;
	if (f != NULL)
		rv = (*f)(vh->vh_dip, pip, state, 0, flag);

	MDI_CLIENT_LOCK(ct);
	MDI_PI_LOCK(pip);
	if (rv == MDI_NOT_SUPPORTED) {
		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
	}
	if (rv != MDI_SUCCESS) {
		MDI_DEBUG(2, (MDI_WARN, ct->ct_dip,
		    "vo_pi_state_change failed: rv %x", rv));
	}
	/*
	 * Settle the transient state: on success clear the transient flag,
	 * on failure put back the saved old state.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		if (rv == MDI_SUCCESS) {
			MDI_PI_CLEAR_TRANSIENT(pip);
		} else {
			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
		}
	}

	/*
	 * Wake anyone waiting for this mdi_pathinfo node
	 */
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	/*
	 * Mark the client device as stable
	 */
	MDI_CLIENT_STABLE(ct);
	if (rv == MDI_SUCCESS) {
		/*
		 * The client state is only re-evaluated once ct_unstable
		 * has dropped back to zero.
		 */
		if (ct->ct_unstable == 0) {
			cdip = ct->ct_dip;

			/*
			 * Onlining the mdi_pathinfo node will impact the
			 * client state Update the client and dev_info node
			 * state accordingly
			 */
			/* from here until the conversion below rv is an NDI code */
			rv = NDI_SUCCESS;
			i_mdi_client_update_state(ct);
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip && !i_ddi_devi_attached(cdip) &&
				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
				    (state == MDI_PATHINFO_STATE_STANDBY))) {

					/*
					 * Must do ndi_devi_online() through
					 * hotplug thread for deferred
					 * attach mechanism to work
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_online(cdip, 0);
					MDI_CLIENT_LOCK(ct);
					if ((rv != NDI_SUCCESS) &&
					    (MDI_CLIENT_STATE(ct) ==
					    MDI_CLIENT_STATE_DEGRADED)) {
						/*
						 * ndi_devi_online failed.
						 * Reset client flags to
						 * offline.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_online failed "
						    "error %x", rv));
						MDI_CLIENT_SET_OFFLINE(ct);
					}
					if (rv != NDI_SUCCESS) {
						/* Reset the path state */
						MDI_PI_LOCK(pip);
						MDI_PI(pip)->pi_state =
						    MDI_PI_OLD_STATE(pip);
						MDI_PI_UNLOCK(pip);
					}
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/*
				 * This is the last path case for
				 * non-user initiated events.
				 */
				if (((flag & NDI_USER_REQ) == 0) &&
				    cdip && (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip,
					    NDI_DEVFS_CLEAN);
					MDI_CLIENT_LOCK(ct);

					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online as the path could not
						 * be offlined.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_offline failed: "
						    "error %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
				break;
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
				i_mdi_report_path_state(ct, pip);
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
	}
	MDI_CLIENT_UNLOCK(ct);

state_change_exit:
	/*
	 * Mark the pHCI as stable again.
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
   3734 
   3735 /*
   3736  * mdi_pi_online():
   3737  *		Place the path_info node in the online state.  The path is
   3738  *		now available to be selected by mdi_select_path() for
   3739  *		transporting I/O requests to client devices.
   3740  * Return Values:
   3741  *		MDI_SUCCESS
   3742  *		MDI_FAILURE
   3743  */
   3744 int
   3745 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
   3746 {
   3747 	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
   3748 	int		client_held = 0;
   3749 	int		rv;
   3750 
   3751 	ASSERT(ct != NULL);
   3752 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
   3753 	if (rv != MDI_SUCCESS)
   3754 		return (rv);
   3755 
   3756 	MDI_PI_LOCK(pip);
   3757 	if (MDI_PI(pip)->pi_pm_held == 0) {
   3758 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
   3759 		    "i_mdi_pm_hold_pip %p", (void *)pip));
   3760 		i_mdi_pm_hold_pip(pip);
   3761 		client_held = 1;
   3762 	}
   3763 	MDI_PI_UNLOCK(pip);
   3764 
   3765 	if (client_held) {
   3766 		MDI_CLIENT_LOCK(ct);
   3767 		if (ct->ct_power_cnt == 0) {
   3768 			rv = i_mdi_power_all_phci(ct);
   3769 		}
   3770 
   3771 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
   3772 		    "i_mdi_pm_hold_client %p", (void *)ct));
   3773 		i_mdi_pm_hold_client(ct, 1);
   3774 		MDI_CLIENT_UNLOCK(ct);
   3775 	}
   3776 
   3777 	return (rv);
   3778 }
   3779 
   3780 /*
   3781  * mdi_pi_standby():
   3782  *		Place the mdi_pathinfo node in standby state
   3783  *
   3784  * Return Values:
   3785  *		MDI_SUCCESS
   3786  *		MDI_FAILURE
   3787  */
   3788 int
   3789 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
   3790 {
   3791 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
   3792 }
   3793 
   3794 /*
   3795  * mdi_pi_fault():
   3796  *		Place the mdi_pathinfo node in fault'ed state
   3797  * Return Values:
   3798  *		MDI_SUCCESS
   3799  *		MDI_FAILURE
   3800  */
   3801 int
   3802 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
   3803 {
   3804 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
   3805 }
   3806 
   3807 /*
   3808  * mdi_pi_offline():
   3809  *		Offline a mdi_pathinfo node.
   3810  * Return Values:
   3811  *		MDI_SUCCESS
   3812  *		MDI_FAILURE
   3813  */
   3814 int
   3815 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
   3816 {
   3817 	int	ret, client_held = 0;
   3818 	mdi_client_t	*ct;
   3819 
   3820 	/*
   3821 	 * Original code overloaded NDI_DEVI_REMOVE to this interface, and
   3822 	 * used it to mean "user initiated operation" (i.e. devctl). Callers
   3823 	 * should now just use NDI_USER_REQ.
   3824 	 */
   3825 	if (flags & NDI_DEVI_REMOVE) {
   3826 		flags &= ~NDI_DEVI_REMOVE;
   3827 		flags |= NDI_USER_REQ;
   3828 	}
   3829 
   3830 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
   3831 
   3832 	if (ret == MDI_SUCCESS) {
   3833 		MDI_PI_LOCK(pip);
   3834 		if (MDI_PI(pip)->pi_pm_held) {
   3835 			client_held = 1;
   3836 		}
   3837 		MDI_PI_UNLOCK(pip);
   3838 
   3839 		if (client_held) {
   3840 			ct = MDI_PI(pip)->pi_client;
   3841 			MDI_CLIENT_LOCK(ct);
   3842 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
   3843 			    "i_mdi_pm_rele_client\n"));
   3844 			i_mdi_pm_rele_client(ct, 1);
   3845 			MDI_CLIENT_UNLOCK(ct);
   3846 		}
   3847 	}
   3848 
   3849 	return (ret);
   3850 }
   3851 
   3852 /*
   3853  * i_mdi_pi_offline():
   3854  *		Offline a mdi_pathinfo node and call the vHCI driver's callback
   3855  */
   3856 static int
   3857 i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
   3858 {
   3859 	dev_info_t	*vdip = NULL;
   3860 	mdi_vhci_t	*vh = NULL;
   3861 	mdi_client_t	*ct = NULL;
   3862 	int		(*f)();
   3863 	int		rv;
   3864 
   3865 	MDI_PI_LOCK(pip);
   3866 	ct = MDI_PI(pip)->pi_client;
   3867 	ASSERT(ct != NULL);
   3868 
   3869 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
   3870 		/*
   3871 		 * Give a chance for pending I/Os to complete.
   3872 		 */
   3873 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
   3874 		    "!%d cmds still pending on path %s %p",
   3875 		    MDI_PI(pip)->pi_ref_cnt, mdi_pi_spathname(pip),
   3876 		    (void *)pip));
   3877 		if (cv_timedwait(&MDI_PI(pip)->pi_ref_cv,
   3878 		    &MDI_PI(pip)->pi_mutex,
   3879 		    ddi_get_lbolt() + drv_usectohz(60 * 1000000)) == -1) {
   3880 			/*
   3881 			 * The timeout time reached without ref_cnt being zero
   3882 			 * being signaled.
   3883 			 */
   3884 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
   3885 			    "!Timeout reached on path %s %p without the cond",
   3886 			    mdi_pi_spathname(pip), (void *)pip));
   3887 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
   3888 			    "!%d cmds still pending on path %s %p",
   3889 			    MDI_PI(pip)->pi_ref_cnt,
   3890 			    mdi_pi_spathname(pip), (void *)pip));
   3891 		}
   3892 	}
   3893 	vh = ct->ct_vhci;
   3894 	vdip = vh->vh_dip;
   3895 
   3896 	/*
   3897 	 * Notify vHCI that has registered this event
   3898 	 */
   3899 	ASSERT(vh->vh_ops);
   3900 	f = vh->vh_ops->vo_pi_state_change;
   3901 
   3902 	if (f != NULL) {
   3903 		MDI_PI_UNLOCK(pip);
   3904 		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
   3905 		    flags)) != MDI_SUCCESS) {
   3906 			MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
   3907 			    "!vo_path_offline failed: vdip %s%d %p: path %s %p",
   3908 			    ddi_driver_name(vdip), ddi_get_instance(vdip),
   3909 			    (void *)vdip, mdi_pi_spathname(pip), (void *)pip));
   3910 		}
   3911 		MDI_PI_LOCK(pip);
   3912 	}
   3913 
   3914 	/*
   3915 	 * Set the mdi_pathinfo node state and clear the transient condition
   3916 	 */
   3917 	MDI_PI_SET_OFFLINE(pip);
   3918 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
   3919 	MDI_PI_UNLOCK(pip);
   3920 
   3921 	MDI_CLIENT_LOCK(ct);
   3922 	if (rv == MDI_SUCCESS) {
   3923 		if (ct->ct_unstable == 0) {
   3924 			dev_info_t	*cdip = ct->ct_dip;
   3925 
   3926 			/*
   3927 			 * Onlining the mdi_pathinfo node will impact the
   3928 			 * client state Update the client and dev_info node
   3929 			 * state accordingly
   3930 			 */
   3931 			i_mdi_client_update_state(ct);
   3932 			rv = NDI_SUCCESS;
   3933 			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
   3934 				if (cdip &&
   3935 				    (i_ddi_node_state(cdip) >=
   3936 				    DS_INITIALIZED)) {
   3937 					MDI_CLIENT_UNLOCK(ct);
   3938 					rv = ndi_devi_offline(cdip,
   3939 					    NDI_DEVFS_CLEAN);
   3940 					MDI_CLIENT_LOCK(ct);
   3941 					if (rv != NDI_SUCCESS) {
   3942 						/*
   3943 						 * ndi_devi_offline failed.
   3944 						 * Reset client flags to
   3945 						 * online.
   3946 						 */
   3947 						MDI_DEBUG(4, (MDI_WARN, cdip,
   3948 						    "ndi_devi_offline failed: "
   3949 						    "error %x", rv));
   3950 						MDI_CLIENT_SET_ONLINE(ct);
   3951 					}
   3952 				}
   3953 			}
   3954 			/*
   3955 			 * Convert to MDI error code
   3956 			 */
   3957 			switch (rv) {
   3958 			case NDI_SUCCESS:
   3959 				rv = MDI_SUCCESS;
   3960 				break;
   3961 			case NDI_BUSY:
   3962 				rv = MDI_BUSY;
   3963 				break;
   3964 			default:
   3965 				rv = MDI_FAILURE;
   3966 				break;
   3967 			}
   3968 		}
   3969 		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
   3970 		i_mdi_report_path_state(ct, pip);
   3971 	}
   3972 
   3973 	MDI_CLIENT_UNLOCK(ct);
   3974 
   3975 	/*
   3976 	 * Change in the mdi_pathinfo node state will impact the client state
   3977 	 */
   3978 	MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
   3979 	    "ct = %p pip = %p", (void *)ct, (void *)pip));
   3980 	return (rv);
   3981 }
   3982 
   3983 /*
   3984  * mdi_pi_get_node_name():
   3985  *              Get the name associated with a mdi_pathinfo node.
   3986  *              Since pathinfo nodes are not directly named, we
   3987  *              return the node_name of the client.
   3988  *
   3989  * Return Values:
   3990  *              char *
   3991  */
   3992 char *
   3993 mdi_pi_get_node_name(mdi_pathinfo_t *pip)
   3994 {
   3995 	mdi_client_t    *ct;
   3996 
   3997 	if (pip == NULL)
   3998 		return (NULL);
   3999 	ct = MDI_PI(pip)->pi_client;
   4000 	if ((ct == NULL) || (ct->ct_dip == NULL))
   4001 		return (NULL);
   4002 	return (ddi_node_name(ct->ct_dip));
   4003 }
   4004 
   4005 /*
   4006  * mdi_pi_get_addr():
   4007  *		Get the unit address associated with a mdi_pathinfo node
   4008  *
   4009  * Return Values:
   4010  *		char *
   4011  */
   4012 char *
   4013 mdi_pi_get_addr(mdi_pathinfo_t *pip)
   4014 {
   4015 	if (pip == NULL)
   4016 		return (NULL);
   4017 
   4018 	return (MDI_PI(pip)->pi_addr);
   4019 }
   4020 
   4021 /*
   4022  * mdi_pi_get_path_instance():
   4023  *		Get the 'path_instance' of a mdi_pathinfo node
   4024  *
   4025  * Return Values:
   4026  *		path_instance
   4027  */
   4028 int
   4029 mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
   4030 {
   4031 	if (pip == NULL)
   4032 		return (0);
   4033 
   4034 	return (MDI_PI(pip)->pi_path_instance);
   4035 }
   4036 
   4037 /*
   4038  * mdi_pi_pathname():
   4039  *		Return pointer to path to pathinfo node.
   4040  */
   4041 char *
   4042 mdi_pi_pathname(mdi_pathinfo_t *pip)
   4043 {
   4044 	if (pip == NULL)
   4045 		return (NULL);
   4046 	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
   4047 }
   4048 
   4049 /*
   4050  * mdi_pi_spathname():
   4051  *		Return pointer to shortpath to pathinfo node. Used for debug
   4052  *		messages, so return "" instead of NULL when unknown.
   4053  */
   4054 char *
   4055 mdi_pi_spathname(mdi_pathinfo_t *pip)
   4056 {
   4057 	char	*spath = "";
   4058 
   4059 	if (pip) {
   4060 		spath = mdi_pi_spathname_by_instance(
   4061 		    mdi_pi_get_path_instance(pip));
   4062 		if (spath == NULL)
   4063 			spath = "";
   4064 	}
   4065 	return (spath);
   4066 }
   4067 
   4068 char *
   4069 mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
   4070 {
   4071 	char *obp_path = NULL;
   4072 	if ((pip == NULL) || (path == NULL))
   4073 		return (NULL);
   4074 
   4075 	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
   4076 		(void) strcpy(path, obp_path);
   4077 		(void) mdi_prop_free(obp_path);
   4078 	} else {
   4079 		path = NULL;
   4080 	}
   4081 	return (path);
   4082 }
   4083 
   4084 int
   4085 mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
   4086 {
   4087 	dev_info_t *pdip;
   4088 	char *obp_path = NULL;
   4089 	int rc = MDI_FAILURE;
   4090 
   4091 	if (pip == NULL)
   4092 		return (MDI_FAILURE);
   4093 
   4094 	pdip = mdi_pi_get_phci(pip);
   4095 	if (pdip == NULL)
   4096 		return (MDI_FAILURE);
   4097 
   4098 	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
   4099 
   4100 	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
   4101 		(void) ddi_pathname(pdip, obp_path);
   4102 	}
   4103 
   4104 	if (component) {
   4105 		(void) strncat(obp_path, "/", MAXPATHLEN);
   4106 		(void) strncat(obp_path, component, MAXPATHLEN);
   4107 	}
   4108 	rc = mdi_prop_update_string(pip, "obp-path", obp_path);
   4109 
   4110 	if (obp_path)
   4111 		kmem_free(obp_path, MAXPATHLEN);
   4112 	return (rc);
   4113 }
   4114 
   4115 /*
   4116  * mdi_pi_get_client():
   4117  *		Get the client devinfo associated with a mdi_pathinfo node
   4118  *
   4119  * Return Values:
   4120  *		Handle to client device dev_info node
   4121  */
   4122 dev_info_t *
   4123 mdi_pi_get_client(mdi_pathinfo_t *pip)
   4124 {
   4125 	dev_info_t	*dip = NULL;
   4126 	if (pip) {
   4127 		dip = MDI_PI(pip)->pi_client->ct_dip;
   4128 	}
   4129 	return (dip);
   4130 }
   4131 
   4132 /*
   4133  * mdi_pi_get_phci():
   4134  *		Get the pHCI devinfo associated with the mdi_pathinfo node
   4135  * Return Values:
   4136  *		Handle to dev_info node
   4137  */
   4138 dev_info_t *
   4139 mdi_pi_get_phci(mdi_pathinfo_t *pip)
   4140 {
   4141 	dev_info_t	*dip = NULL;
   4142 	mdi_phci_t	*ph;
   4143 
   4144 	if (pip) {
   4145 		ph = MDI_PI(pip)->pi_phci;
   4146 		if (ph)
   4147 			dip = ph->ph_dip;
   4148 	}
   4149 	return (dip);
   4150 }
   4151 
   4152 /*
   4153  * mdi_pi_get_client_private():
   4154  *		Get the client private information associated with the
   4155  *		mdi_pathinfo node
   4156  */
   4157 void *
   4158 mdi_pi_get_client_private(mdi_pathinfo_t *pip)
   4159 {
   4160 	void *cprivate = NULL;
   4161 	if (pip) {
   4162 		cprivate = MDI_PI(pip)->pi_cprivate;
   4163 	}
   4164 	return (cprivate);
   4165 }
   4166 
   4167 /*
   4168  * mdi_pi_set_client_private():
   4169  *		Set the client private information in the mdi_pathinfo node
   4170  */
   4171 void
   4172 mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
   4173 {
   4174 	if (pip) {
   4175 		MDI_PI(pip)->pi_cprivate = priv;
   4176 	}
   4177 }
   4178 
   4179 /*
   4180  * mdi_pi_get_phci_private():
   4181  *		Get the pHCI private information associated with the
   4182  *		mdi_pathinfo node
   4183  */
   4184 caddr_t
   4185 mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
   4186 {
   4187 	caddr_t	pprivate = NULL;
   4188 
   4189 	if (pip) {
   4190 		pprivate = MDI_PI(pip)->pi_pprivate;
   4191 	}
   4192 	return (pprivate);
   4193 }
   4194 
   4195 /*
   4196  * mdi_pi_set_phci_private():
   4197  *		Set the pHCI private information in the mdi_pathinfo node
   4198  */
   4199 void
   4200 mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
   4201 {
   4202 	if (pip) {
   4203 		MDI_PI(pip)->pi_pprivate = priv;
   4204 	}
   4205 }
   4206 
   4207 /*
   4208  * mdi_pi_get_state():
   4209  *		Get the mdi_pathinfo node state. Transient states are internal
   4210  *		and not provided to the users
   4211  */
   4212 mdi_pathinfo_state_t
   4213 mdi_pi_get_state(mdi_pathinfo_t *pip)
   4214 {
   4215 	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
   4216 
   4217 	if (pip) {
   4218 		if (MDI_PI_IS_TRANSIENT(pip)) {
   4219 			/*
   4220 			 * mdi_pathinfo is in state transition.  Return the
   4221 			 * last good state.
   4222 			 */
   4223 			state = MDI_PI_OLD_STATE(pip);
   4224 		} else {
   4225 			state = MDI_PI_STATE(pip);
   4226 		}
   4227 	}
   4228 	return (state);
   4229 }
   4230 
   4231 /*
   4232  * mdi_pi_get_flags():
   4233  *		Get the mdi_pathinfo node flags.
   4234  */
   4235 uint_t
   4236 mdi_pi_get_flags(mdi_pathinfo_t *pip)
   4237 {
   4238 	return (pip ? MDI_PI(pip)->pi_flags : 0);
   4239 }
   4240 
   4241 /*
   4242  * Note that the following function needs to be the new interface for
   4243  * mdi_pi_get_state when mpxio gets integrated to ON.
   4244  */
   4245 int
   4246 mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
   4247 		uint32_t *ext_state)
   4248 {
   4249 	*state = MDI_PATHINFO_STATE_INIT;
   4250 
   4251 	if (pip) {
   4252 		if (MDI_PI_IS_TRANSIENT(pip)) {
   4253 			/*
   4254 			 * mdi_pathinfo is in state transition.  Return the
   4255 			 * last good state.
   4256 			 */
   4257 			*state = MDI_PI_OLD_STATE(pip);
   4258 			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
   4259 		} else {
   4260 			*state = MDI_PI_STATE(pip);
   4261 			*ext_state = MDI_PI_EXT_STATE(pip);
   4262 		}
   4263 	}
   4264 	return (MDI_SUCCESS);
   4265 }
   4266 
   4267 /*
   4268  * mdi_pi_get_preferred:
   4269  *	Get the preferred path flag
   4270  */
   4271 int
   4272 mdi_pi_get_preferred(mdi_pathinfo_t *pip)
   4273 {
   4274 	if (pip) {
   4275 		return (MDI_PI(pip)->pi_preferred);
   4276 	}
   4277 	return (0);
   4278 }
   4279 
   4280 /*
   4281  * mdi_pi_set_preferred:
   4282  *	Set the preferred path flag
   4283  */
   4284 void
   4285 mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
   4286 {
   4287 	if (pip) {
   4288 		MDI_PI(pip)->pi_preferred = preferred;
   4289 	}
   4290 }
   4291 
   4292 /*
   4293  * mdi_pi_set_state():
   4294  *		Set the mdi_pathinfo node state
   4295  */
   4296 void
   4297 mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
   4298 {
   4299 	uint32_t	ext_state;
   4300 
   4301 	if (pip) {
   4302 		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
   4303 		MDI_PI(pip)->pi_state = state;
   4304 		MDI_PI(pip)->pi_state |= ext_state;
   4305 
   4306 		/* Path has changed state, invalidate DINFOCACHE snap shot. */
   4307 		i_ddi_di_cache_invalidate();
   4308 	}
   4309 }
   4310 
   4311 /*
   4312  * Property functions:
   4313  */
   4314 int
   4315 i_map_nvlist_error_to_mdi(int val)
   4316 {
   4317 	int rv;
   4318 
   4319 	switch (val) {
   4320 	case 0:
   4321 		rv = DDI_PROP_SUCCESS;
   4322 		break;
   4323 	case EINVAL:
   4324 	case ENOTSUP:
   4325 		rv = DDI_PROP_INVAL_ARG;
   4326 		break;
   4327 	case ENOMEM:
   4328 		rv = DDI_PROP_NO_MEMORY;
   4329 		break;
   4330 	default:
   4331 		rv = DDI_PROP_NOT_FOUND;
   4332 		break;
   4333 	}
   4334 	return (rv);
   4335 }
   4336 
   4337 /*
   4338  * mdi_pi_get_next_prop():
   4339  * 		Property walk function.  The caller should hold mdi_pi_lock()
   4340  *		and release by calling mdi_pi_unlock() at the end of walk to
   4341  *		get a consistent value.
   4342  */
   4343 nvpair_t *
   4344 mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
   4345 {
   4346 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
   4347 		return (NULL);
   4348 	}
   4349 	ASSERT(MDI_PI_LOCKED(pip));
   4350 	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
   4351 }
   4352 
   4353 /*
   4354  * mdi_prop_remove():
   4355  * 		Remove the named property from the named list.
   4356  */
   4357 int
   4358 mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
   4359 {
   4360 	if (pip == NULL) {
   4361 		return (DDI_PROP_NOT_FOUND);
   4362 	}
   4363 	ASSERT(!MDI_PI_LOCKED(pip));
   4364 	MDI_PI_LOCK(pip);
   4365 	if (MDI_PI(pip)->pi_prop == NULL) {
   4366 		MDI_PI_UNLOCK(pip);
   4367 		return (DDI_PROP_NOT_FOUND);
   4368 	}
   4369 	if (name) {
   4370 		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
   4371 	} else {
   4372 		char		nvp_name[MAXNAMELEN];
   4373 		nvpair_t	*nvp;
   4374 		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
   4375 		while (nvp) {
   4376 			nvpair_t	*next;
   4377 			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
   4378 			(void) snprintf(nvp_name, sizeof(nvp_name), "%s",
   4379 			    nvpair_name(nvp));
   4380 			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
   4381 			    nvp_name);
   4382 			nvp = next;
   4383 		}
   4384 	}
   4385 	MDI_PI_UNLOCK(pip);
   4386 	return (DDI_PROP_SUCCESS);
   4387 }
   4388 
   4389 /*
   4390  * mdi_prop_size():
   4391  * 		Get buffer size needed to pack the property data.
   4392  * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
   4393  *		buffer size.
   4394  */
   4395 int
   4396 mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
   4397 {
   4398 	int	rv;
   4399 	size_t	bufsize;
   4400 
   4401 	*buflenp = 0;
   4402 	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
   4403 		return (DDI_PROP_NOT_FOUND);
   4404 	}
   4405 	ASSERT(MDI_PI_LOCKED(pip));
   4406 	rv = nvlist_size(MDI_PI(pip)->pi_prop,
   4407 	    &bufsize, NV_ENCODE_NATIVE);
   4408 	*buflenp = bufsize;
   4409 	return (i_map_nvlist_error_to_mdi(rv));
   4410 }
   4411 
   4412 /*
   4413  * mdi_prop_pack():
   4414  * 		pack the property list.  The caller should hold the
   4415  *		mdi_pathinfo_t node to get a consistent data
   4416  */
   4417 int
   4418 mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
   4419 {
   4420 	int	rv;
   4421 	size_t	bufsize;
   4422 
   4423 	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
   4424 		return (DDI_PROP_NOT_FOUND);
   4425 	}
   4426 
   4427 	ASSERT(MDI_PI_LOCKED(pip));
   4428 
   4429 	bufsize = buflen;
   4430 	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
   4431 	    NV_ENCODE_NATIVE, KM_SLEEP);
   4432 
   4433 	return (i_map_nvlist_error_to_mdi(rv));
   4434 }
   4435 
   4436 /*
   4437  * mdi_prop_update_byte():
   4438  *		Create/Update a byte property
   4439  */
   4440 int
   4441 mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
   4442 {
   4443 	int rv;
   4444 
   4445 	if (pip == NULL) {
   4446 		return (DDI_PROP_INVAL_ARG);
   4447 	}
   4448 	ASSERT(!MDI_PI_LOCKED(pip));
   4449 	MDI_PI_LOCK(pip);
   4450 	if (MDI_PI(pip)->pi_prop == NULL) {
   4451 		MDI_PI_UNLOCK(pip);
   4452 		return (