Home | History | Annotate | Download | only in client
      1 //
      2 // CDDL HEADER START
      3 //
      4 // The contents of this file are subject to the terms of the
      5 // Common Development and Distribution License (the License).
      6 // You may not use this file except in compliance with the License.
      7 //
      8 // You can obtain a copy of the license at usr/src/CDDL.txt
      9 // or http://www.opensolaris.org/os/licensing.
     10 // See the License for the specific language governing permissions
     11 // and limitations under the License.
     12 //
     13 // When distributing Covered Code, include this CDDL HEADER in each
     14 // file and include the License file at usr/src/CDDL.txt.
     15 // If applicable, add the following below this CDDL HEADER, with the
     16 // fields enclosed by brackets [] replaced with your own identifying
     17 // information: Portions Copyright [yyyy] [name of copyright owner]
     18 //
     19 // CDDL HEADER END
     20 //
     21 
     22 //
     23 // Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24 // Use is subject to license terms.
     25 //
     26 
     27 #pragma ident	"@(#)pxvfs.cc	1.39	08/05/20 SMI"
     28 
     29 #include <sys/types.h>
     30 #include <sys/systm.h>
     31 #include <sys/vfs.h>
     32 #include <sys/vnode.h>
     33 #include <sys/file.h>
     34 #include <sys/uio.h>
     35 #include <sys/dnlc.h>
     36 #include <sys/mount.h>
     37 #include <sys/statvfs.h>
     38 #include <sys/debug.h>
     39 #include <sys/cmn_err.h>
     40 #include <sys/fs_subr.h>
     41 #include <sys/pathname.h>
     42 #include <sys/fs/ufs_mount.h>
     43 #include <sys/mntent.h>
     44 #include <kstat.h>
     45 #include <sys/ddi.h>
     46 #include <sys/disp.h>
     47 
     48 #include <sys/sol_version.h>
     49 #include <sys/os.h>
     50 #include <sys/sol_conv.h>
     51 #include <solobj/solobj_impl.h>
     52 #include <nslib/ns.h>
     53 #include <orb/fault/fault_injection.h>
     54 #include <orb/monitor/monitor.h>
     55 #include <orb/infrastructure/clusterproc.h>
     56 
     57 #include "../version.h"
     58 #include <pxfs/common/pxfslib.h>
     59 #include <pxfs/device/device_replica_impl.h>
     60 #include <pxfs/device/device_service_mgr.h>
     61 #include <pxfs/mount/mount_client_impl.h>
     62 #include <pxfs/mount/mount_debug.h>
     63 #include <pxfs/lib/pxfs_debug.h>
     64 #include <pxfs/lib/pxfs_misc.h>
     65 #include <pxfs/client/pxfobj.h>
     66 #include <pxfs/client/pxfobjplus.h>
     67 #include <pxfs/client/fobj_client_impl.h>
     68 #include <pxfs/client/pxreg.h>
     69 #include <pxfs/client/pxdir.h>
     70 #include <pxfs/client/pxchr.h>
     71 #include <pxfs/client/pxlink.h>
     72 #include <pxfs/client/pxspecial.h>
     73 #include <pxfs/client/pxvfs.h>
     74 #include <pxfs/client/fsmgr_client_impl.h>
     75 
     76 #ifndef VXFS_DISABLED
     77 #include <pxfs/server/vxfs_dependent_impl.h>
     78 #endif
     79 
     80 #if SOL_VERSION >= __s9
     81 #define	PSARC_2001_038
     82 #endif
     83 
     84 //
     85 // For update of the mnttab modification time. The function
     86 // vfs_mnttab_modtimeupd is declared static in vfs.c for
     87 // Solaris 8 and 9. For Solaris 10, it is global.
     88 //
     89 #if SOL_VERSION >= __s10
     90 #define	GLOBAL_MNTTAB_MODTIME_INTERFACE
     91 #else
     92 extern timespec_t vfs_mnttab_mtime;
     93 #endif
     94 
     95 //
     96 // Constants for initializing various throttling variables.
     97 //
// Base size unit, in bytes.
const int	KILOBYTE = 1024;
// Declared 64-bit, so products such as 20 * MEGABYTE below are evaluated
// in 64-bit arithmetic.
const int64_t	MEGABYTE = 1024 * KILOBYTE;
const int	ONE_SECOND = 1000000; // 1 second in microseconds

// NOTE(review): the three values below are not const, unlike the units
// above; presumably so they can be patched as tunables -- confirm.
// They seed the pxvfs throttling members defined later in this file.
int64_t DATA_RATE_DEFAULT = 20 * MEGABYTE;	// 20 MB per second
int64_t DATA_RATE_MINIMUM = 2  * MEGABYTE;	// 2 MB per second
int	THROTTLE_MONITOR_INTERVAL = ONE_SECOND; // 1 second by default
    105 
    106 //lint -e666
    107 // PXFS is an extensive user of the inline function get_vp() within
    108 // the vnode macros:
    109 //	error = VOP_LOOKUP(get_vp(), ... );
    110 // There are no side effects to calling get_vp() repeatedly, flexelint
    111 // does not know that, but we do.
    112 //
    113 
    114 //
    115 // Static data member initializations.
    116 //
// The single pxvfs_inactive_threadpool instance. Created by
// pxvfs_inactive_threadpool::startup() and destroyed (and reset to NULL)
// by pxvfs_inactive_threadpool::shutdown().
pxvfs_inactive_threadpool
	*pxvfs_inactive_threadpool::the_pxvfs_inactive_threadpool = NULL;

pxvfs_list_t	pxvfs::all_pxvfs;	// list of all pxvfs structures
os::rwlock_t	pxvfs::all_pxvfs_lock;	// protects 'all_pxvfs'
int		pxvfs::pxfstype;	// number assigned by Solaris
// NOTE(review): presumably set to true to block unmounts; the code that
// reads or sets it is outside this part of the file -- confirm.
bool		pxvfs::unmounts_disabled = false;

// This value must be a non-zero positive number
int		cluster_fs_drain_queue_len = 50;

// NOTE(review): by their names these presumably describe the sizing of the
// proxy file object hash table (pxfobj_hash); their use is not visible
// here -- confirm.
uint_t		pxvfs::pxfobjhsz = 0;
uint_t		pxvfs::pxfobjhsz_max = 0;
uint_t		pxvfs::pxfobjh_len = 4;
// Node-wide kstat handle; NULL until it is created elsewhere.
kstat_t		*pxvfs::node_stats = NULL;
    132 
    133 //
    134 // The flushing of attributes is done by a kernel thread
    135 // with a priority higher than that of applications.
    136 // The attribute flush operation can starve normal activity.
    137 // These values control the flushing of attributes
    138 // The default values were chosen to allow approx. 100,000 files to be flushed
    139 // in 30 seconds, and to spread the work over that period.
    140 //
    141 // Number of files processed per interval
int		pxvfs::sync_all_attr_throttle = 40;

// Number of files processed during a sync before sleeping.
int		pxvfs::sync_filesystem_throttle = 40;

//
// Amount of sleep between intervals, in microseconds.
// The entries are ordered by file-count band, as noted on each entry:
// the more files there are, the shorter the sleep.
// NOTE(review): how the index into this table is chosen is not visible
// here -- confirm against the code that reads it.
//
os::usec_t	pxvfs::sync_all_attr_interval[] = {
			(os::usec_t)(120 * 1000),	// 120ms <10,000 files
			(os::usec_t)(60 * 1000),	// 60ms <20,000 files
			(os::usec_t)(40 * 1000),	// 40ms <30,000 files
			(os::usec_t)(30 * 1000),	// 30ms <40,000 files
			(os::usec_t)(20 * 1000)		// 20ms >=40,000 files
		};

// NOTE(review): presumably sync_all_attr_lock protects
// sync_all_attr_thread_running -- confirm.
bool		pxvfs::sync_all_attr_thread_running = false;
os::mutex_t	pxvfs::sync_all_attr_lock;
    158 
// Scheduling priority passed to set_sched_props() for the inactive
// threadpool in pxvfs_inactive_threadpool::startup().
const int	inactive_thread_priority = 65;

#ifdef DEBUG
// Debug-only counters.
// NOTE(review): presumably the number of pxvfs vget calls and how many of
// those were satisfied by a fid lookup hit; they are updated outside this
// part of the file -- confirm.
uint32_t	pxvfs_vget_number_calls = 0;
uint32_t	pxvfs_vget_number_fid_hits = 0;
#endif

// Number of async threads per thread-pool.
// Passed as the last argument when the pxvfs constructor creates its
// mem_async_threadpool.
int pxfs_async_threads = 15;
uint64_t pxvfs::async_task_count = 0;
    169 
    170 //
    171 // This should be pxvfs::pxfobj_hash_bkt but our compiler doesn't understand
    172 // that and our lint checker complains about it.
    173 //lint -e1038
// Hash bucket array; NULL until it is allocated elsewhere.
pxvfs::pxfobj_hash_bkt	*pxvfs::pxfobj_hash = NULL;
    175 //lint +e1038
    176 
    177 //
    178 // Assign default values for throttling variables. For description see
    179 // declaration of these members in pxvfs's class definition below.
    180 //
int64_t pxvfs::data_rate	 = DATA_RATE_DEFAULT;	// 20 MB per second
int64_t pxvfs::data_rate_default = DATA_RATE_DEFAULT;	// 20 MB per second
int64_t pxvfs::data_rate_minimum = DATA_RATE_MINIMUM;	// 2 MB per second
int64_t	pxvfs::bytes_in_window	 = DATA_RATE_DEFAULT;	// 20 MB
int	pxvfs::bandwidth_chunk	 = 8 * KILOBYTE;	// 8 KB

// Monitor thread wakes every 1 second by default.
int	pxvfs::throttle_monitor_interval = THROTTLE_MONITOR_INTERVAL;

//
// We don't want to keep more than 16 MB worth of I/O requests pending
// per client. Assuming an average size of 128 KB per I/O request, that
// is 128 pending I/O requests on the server (16 MB / 128 KB = 128).
//
int	pxvfs::max_permitted_ios = 128;

// Everything else initialized to zero.
int64_t		pxvfs::bytes_sent_in_second	= 0;
int64_t		pxvfs::bytes_written_in_second	= 0;
timespec_t	pxvfs::window_start		= {0L, 0};

// NOTE(review): judging by the names, io_pending_lock / io_pending_cv
// presumably guard and signal io_pending, and bandwidth_lock /
// bandwidth_cv / data_rate_lock the bandwidth and data rate values above;
// the code that uses them is outside this part of the file -- confirm.
os::mutex_t	pxvfs::io_pending_lock;
os::condvar_t	pxvfs::io_pending_cv;
int64_t		pxvfs::io_pending = 0;
os::condvar_t	pxvfs::bandwidth_cv;
os::mutex_t	pxvfs::bandwidth_lock;
os::mutex_t	pxvfs::data_rate_lock;
    208 
    209 //
    210 // class pxvfs_inactive_task methods
    211 //
    212 
    213 //
    214 // execute - this does the actual work for the task to clean up
    215 // inactive proxy vnodes.
    216 //
    217 void
    218 pxvfs_inactive_task::execute()
    219 {
    220 	pxvfs	*pxvfsp	= get_pxvfs();
    221 
    222 	pxvfsp->flags_lock.lock();
    223 
    224 	//
    225 	// Clean up some inactive proxy vnodes
    226 	//
    227 	pxvfsp->empty_inactive_list();
    228 
    229 	if (pxvfsp->inactive_list_cnt != 0) {
    230 		//
    231 		// There are still more inactive proxy vnodes.
    232 		// We do not process all inactive proxy vnodes
    233 		// at one time in order to let other file systems clean up.
    234 		// Requeue this work request.
    235 		// It is safe to requeue this task for either of two reasons:
    236 		// 1) this threadpool is single threaded.
    237 		// 2) this work task is already off the work list,
    238 		//	and so will not be queued twice concurrently.
    239 		//
    240 		pxvfs_inactive_threadpool::the().defer_processing(this);
    241 		pxvfsp->flags_lock.unlock();
    242 
    243 	} else {
    244 		//
    245 		// There are no more inactive proxy vnodes.
    246 		//
    247 		pxvfsp->flags &= ~pxvfs::PXFS_TASK_QUEUED;
    248 		pxvfsp->flags_lock.unlock();
    249 		VFS_RELE(pxvfsp->fs_vfs);
    250 	}
    251 }
    252 
    253 //
    254 // task_done - Method called when the threadpool decides to throw away a task.
    255 // This happens only during shutdown.
    256 // Cannot use the default implementation which does a "delete this",
    257 // because this task is embedded in another object.
    258 //
void
pxvfs_inactive_task::task_done()
{
	//
	// All work tasks should have been processed before shutdown, so
	// reaching this method is treated as a programming error.
	// Nothing is freed here: the task is embedded in another object
	// and must not be deleted on its own.
	//
	ASSERT(0);
}
    265 
    266 //
    267 // class pxvfs_inactive_threadpool methods
    268 //
    269 
    270 //
    271 // constructor - this threadpool uses two threads.
    272 //
pxvfs_inactive_threadpool::pxvfs_inactive_threadpool() :
	threadpool(false, 2, "pxvfs_inactive_threadpool", 2)
{
	//
	// All setup is done by the threadpool base class constructor.
	// NOTE(review): the meaning of the base class arguments is not
	// visible here. The comment above this constructor says the pool
	// uses two threads, while the comment in
	// pxvfs_inactive_task::execute() describes the pool as single
	// threaded -- confirm which is correct.
	//
}
    277 
pxvfs_inactive_threadpool::~pxvfs_inactive_threadpool()
{
	// The pool must not be destroyed while it still has tasks.
	ASSERT(task_count() == 0);
}
    282 
    283 //
    284 // startup - this method is called at modload time to initialize
    285 // this object.
    286 //
    287 // static
    288 void
    289 pxvfs_inactive_threadpool::startup()
    290 {
    291 	ASSERT(the_pxvfs_inactive_threadpool == NULL);
    292 	the_pxvfs_inactive_threadpool = new pxvfs_inactive_threadpool;
    293 
    294 	//
    295 	// Pxfs consume large amounts of memory.
    296 	// Use a higher thread priority for freeing resources.
    297 	//
    298 	(void) the_pxvfs_inactive_threadpool->
    299 	    set_sched_props(inactive_thread_priority);
    300 }
    301 
    302 //
    303 // shutdown - this method is called at modload time to shutdown
    304 // this object.
    305 //
    306 // static
void
pxvfs_inactive_threadpool::shutdown()
{
	//
	// Destroy the pool created by startup(). The destructor asserts
	// that no tasks remain. The pointer is reset afterwards so that
	// startup() can be called again.
	// NOTE(review): the header comment says this is called at
	// "modload time"; presumably module unload time is meant -- confirm.
	//
	delete the_pxvfs_inactive_threadpool;
	the_pxvfs_inactive_threadpool = NULL;
}
    313 
    314 //
    315 // class pxvfs methods
    316 //
    317 
    318 //
    319 // Constructor.
    320 //
    321 //lint -e668 -e1732 -e1733
    322 pxvfs::pxvfs(PXFS_VER::filesystem_ptr fsptr, fsmgr_client_impl *clientmgrp,
    323     const PXFS_VER::fs_info *fsinfop, int fstype, vfs_t *vfsp,
    324     uint32_t server_incarn) :
    325 	pxvfs_list_elem(this),
    326 	fs_rootvp(NULL),
    327 	server_incn(server_incarn),
    328 	flags(0),
    329 	active_cnt(0),
    330 	inactive_list_cnt(0),
    331 	fsmgr_client_implp(clientmgrp),
    332 	_syncdir_on(false),
    333 	_nocto_on(false),
    334 	_forcedirectio_on(false),
    335 	underlying_fs(UNKNOWN),
    336 	blk_reserve_invo_in_progress(false)
    337 {
    338 	ASSERT(clientmgrp != NULL);
    339 
    340 #ifdef	PXFS_KSTATS_ENABLED
    341 	char	*stats_name = new char[KSTAT_STRLEN];
    342 	(void) sprintf(stats_name, "client v1 (%d, %d)",
    343 	    (int)getmajor(fsinfop->fsdev), (int)getminor(fsinfop->fsdev));
    344 
    345 	//lint +e668 +e1732 +e1733
    346 	stats = kstat_create("pxfs", 0, stats_name,
    347 	    "pxvfs", KSTAT_TYPE_NAMED,
    348 	    PXVFS_STATS_MAX_NUM, KSTAT_FLAG_PERSISTENT);
    349 
    350 	delete [] stats_name;
    351 
    352 	if (stats != NULL) {
    353 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    354 		    [PXVFS_STATS_NUM_OPEN_FILES]), "Open Files",
    355 		    KSTAT_DATA_UINT32);
    356 		(KSTAT_NAMED_PTR(stats))[
    357 		    PXVFS_STATS_NUM_OPEN_FILES].value.ui32 = 0;
    358 
    359 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    360 		    [PXVFS_STATS_ACCESS_TOKEN_HITS]), "Access Cache Hits",
    361 		    KSTAT_DATA_UINT32);
    362 		(KSTAT_NAMED_PTR(stats))[
    363 		    PXVFS_STATS_ACCESS_TOKEN_HITS].value.ui32 = 0;
    364 
    365 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    366 		    [PXVFS_STATS_ACCESS_TOKEN_MISSES]), "Access Cache Misses",
    367 		    KSTAT_DATA_UINT32);
    368 		(KSTAT_NAMED_PTR(stats))[
    369 		    PXVFS_STATS_ACCESS_TOKEN_MISSES].value.ui32 = 0;
    370 
    371 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    372 		    [PXVFS_STATS_ACCESS_TOKEN_INVALS]), "Access Cache Invals",
    373 		    KSTAT_DATA_UINT32);
    374 		(KSTAT_NAMED_PTR(stats))[
    375 		    PXVFS_STATS_ACCESS_TOKEN_INVALS].value.ui32 = 0;
    376 
    377 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    378 		    [PXVFS_STATS_ATTR_TOKEN_HITS]), "Attribute Token Hits",
    379 		    KSTAT_DATA_UINT32);
    380 		(KSTAT_NAMED_PTR(stats))[
    381 		    PXVFS_STATS_ATTR_TOKEN_HITS].value.ui32 = 0;
    382 
    383 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    384 		    [PXVFS_STATS_ATTR_TOKEN_MISSES]), "Attribute Token Misses",
    385 		    KSTAT_DATA_UINT32);
    386 		(KSTAT_NAMED_PTR(stats))[
    387 		    PXVFS_STATS_ATTR_TOKEN_MISSES].value.ui32 = 0;
    388 
    389 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    390 		    [PXVFS_STATS_ATTR_TOKEN_INVALS]), "Attribute Token Invals",
    391 		    KSTAT_DATA_UINT32);
    392 		(KSTAT_NAMED_PTR(stats))[
    393 		    PXVFS_STATS_ATTR_TOKEN_INVALS].value.ui32 = 0;
    394 
    395 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    396 		    [PXVFS_STATS_DATA_TOKEN_HITS]), "Data Token Hits",
    397 		    KSTAT_DATA_UINT32);
    398 		(KSTAT_NAMED_PTR(stats))[
    399 		    PXVFS_STATS_DATA_TOKEN_HITS].value.ui32 = 0;
    400 
    401 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    402 		    [PXVFS_STATS_DATA_TOKEN_MISSES]), "Data Token Misses",
    403 		    KSTAT_DATA_UINT32);
    404 		(KSTAT_NAMED_PTR(stats))[
    405 		    PXVFS_STATS_DATA_TOKEN_MISSES].value.ui32 = 0;
    406 
    407 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    408 		    [PXVFS_STATS_DATA_TOKEN_INVALS]), "Data Token Invals",
    409 		    KSTAT_DATA_UINT32);
    410 		(KSTAT_NAMED_PTR(stats))[
    411 		    PXVFS_STATS_DATA_TOKEN_INVALS].value.ui32 = 0;
    412 
    413 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    414 		    [PXVFS_STATS_DATA_ALLOC]), "Number of bmap invocations",
    415 		    KSTAT_DATA_UINT32);
    416 		(KSTAT_NAMED_PTR(stats))[
    417 		    PXVFS_STATS_DATA_ALLOC].value.ui32 = 0;
    418 
    419 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    420 		    [PXVFS_STATS_DATA_TOKEN_RETRIES]), "Data Token Retries",
    421 		    KSTAT_DATA_UINT32);
    422 		(KSTAT_NAMED_PTR(stats))[
    423 		    PXVFS_STATS_DATA_TOKEN_RETRIES].value.ui32 = 0;
    424 
    425 		kstat_named_init(&(KSTAT_NAMED_PTR(stats)
    426 		    [PXVFS_STATS_THROTTLING_HITS]), "Mem_async Throttling Hits",
    427 		    KSTAT_DATA_UINT32);
    428 		(KSTAT_NAMED_PTR(stats))[
    429 		    PXVFS_STATS_THROTTLING_HITS].value.ui32 = 0;
    430 
    431 		kstat_install(stats);
    432 	}
    433 #else
    434 	stats = NULL;
    435 #endif	/* PXFS_KSTATS_ENABLED */
    436 
    437 	vfsp->vfs_data = (caddr_t)this;
    438 	vfsp->vfs_fstype = fstype;
    439 
    440 	//
    441 	// The VFS documentation says only that this field is the native block
    442 	// size of the file system. We have successfully used both
    443 	// the PXFS transfer size and that of the underlying file system VFS.
    444 	// We will use the block size of the underlying file system so
    445 	// as to be able to know the granual size of an allocation when
    446 	// filling in a hole.
    447 	//
    448 	vfsp->vfs_bsize = fsinfop->fsbsize;
    449 
    450 	vfsp->vfs_dev = fsinfop->fsdev;
    451 	vfsp->vfs_flag = fsinfop->fsflag | VFS_PXFS;
    452 
    453 	// XXX:	should be:	fs_vfs->vfs_fsid = conv(fsinfop->fsid);
    454 	//	but haven't been able to figure out how to write a conv method
    455 	//	the compiler will accept.
    456 	vfsp->vfs_fsid.val[0] = fsinfop->fsid.val[0];
    457 	vfsp->vfs_fsid.val[1] = fsinfop->fsid.val[1];
    458 
    459 	fs_vfs = vfsp;
    460 
    461 	// Check if the underlying filesystem is UFS.
    462 	if (strcmp("ufs", vfssw[fs_vfs->vfs_fstype].vsw_name) == 0) {
    463 		//
    464 		// XXXX Workaround for MO_TAG bug in vfs.c
    465 		//
    466 		mntopt_t *mop = vfs_hasopt(&vfsp->vfs_mntopts, "logging");
    467 
    468 		if (mop && (mop->mo_flags & MO_TAG) == 0) {
    469 			mop->mo_flags |= MO_TAG;
    470 		}
    471 		underlying_fs = UFS;
    472 #ifndef VXFS_DISABLED
    473 	} else if (strcmp("vxfs", vfssw[fs_vfs->vfs_fstype].vsw_name)
    474 	    == 0) {
    475 		underlying_fs = VXFS;
    476 #endif
    477 	} else if (strcmp("hsfs", vfssw[fs_vfs->vfs_fstype].vsw_name)
    478 	    == 0) {
    479 		underlying_fs = HSFS;
    480 	}
    481 
    482 	//
    483 	// We're storing a reference to the server-side fs,
    484 	// so we need to duplicate it.
    485 	//
    486 	ASSERT(!CORBA::is_nil(fsptr));
    487 	fs_fsobj = PXFS_VER::filesystem::_duplicate(fsptr);
    488 
    489 	//
    490 	// The value of the root vp is cached, but we can't pre-fill the
    491 	// cache here because filesystem::getroot() returns a packed
    492 	// vnode.  When we unpack the packed vnode we will call
    493 	// find_pxvfs() looking for this vfs, but it has not yet been
    494 	// constructed, so it will not be on the list of pxvfs structs and
    495 	// we'll end up recursively calling this constructor.
    496 	//
    497 
    498 	//
    499 	// Initialize the status of the pxfs client status to
    500 	// PXFS_VER::GREENZONE. It is safe to assume we are in GREENZONE.
    501 	// Because, during the first allocating write or a file creation
    502 	// we would call reserve_blocks(). This routine will contact the
    503 	// server (through get_reservation()) and find out the current status
    504 	// of the filesystem and act accordingly.
    505 	//
    506 	pxvfs_status = PXFS_VER::GREENZONE;
    507 
    508 	// Create the async threadpool for this file-system
    509 	mem_async_threadpool =
    510 	    new threadpool(true, 5, "apageout thr", pxfs_async_threads);
    511 
    512 	// We must succeed.
    513 	CL_PANIC(mem_async_threadpool != NULL);
    514 }
    515 
pxvfs::~pxvfs()
{
	// Drop the server-side file system reference that the constructor
	// duplicated.
	CORBA::release(fs_fsobj);

	// Assert that there are no extant client side objects for this pxvfs.
	ASSERT(fs_rootvp == NULL);
	ASSERT((flags & PXFS_FILE_ACTIVATE) == 0);
	ASSERT(llm_cb_list.empty());

	//
	// NOTE(review): mem_async_threadpool is allocated in the constructor
	// and is not deleted here (see the lint suppression below);
	// presumably it is torn down elsewhere before destruction -- confirm.
	//
} //lint !e1740 pointers are neither freed nor zero'ed by destructor
    525 
    526 //
    527 // get_pxvfs - This method supports the pxvfs_inactive_task
    528 // This is a virtual method, so do not try to make inline.
    529 //
pxvfs *
pxvfs::get_pxvfs()
{
	// The pxvfs object is its own owner for the inactive task, which
	// calls get_pxvfs() from pxvfs_inactive_task::execute().
	return (this);
}
    535 
    536 //
    537 // Called by mount_client_impl when the initial mount or a remount
    538 // occur.
    539 //
    540 void
    541 pxvfs::set_mntoptions(const char *mntoptions)
    542 {
    543 	if (pxfslib::exists_mntopt(mntoptions, MNTOPT_SYNCDIR, false)) {
    544 		_syncdir_on = true;
    545 	} else {
    546 		_syncdir_on = false;
    547 	}
    548 	if (pxfslib::exists_mntopt(mntoptions, MNTOPT_NOCTO, false)) {
    549 		_nocto_on = true;
    550 	} else {
    551 		_nocto_on = false;
    552 	}
    553 	if (pxfslib::exists_mntopt(mntoptions, MNTOPT_FORCEDIRECTIO, false)) {
    554 		_forcedirectio_on = true;
    555 	} else {
    556 		_forcedirectio_on = false;
    557 	}
    558 }
    559 
    560 //
// supported_bdev_fs - returns whether the specified file system type name
// is supported by PXFS as a wrapped block device file system.
    563 //
    564 bool
    565 pxvfs::supported_bdev_fs(char *fsname)
    566 {
    567 	//
    568 	// Table for specifying list of block device file systems
    569 	// that are currently supported as wrapped file systems.
    570 	//
    571 	struct supp_bdev_fs {
    572 		char    *fsname;
    573 		bool	supported;
    574 	};
    575 	static struct supp_bdev_fs supp_bdev_fs[] = {
    576 		{ "ufs",  true },
    577 		{ "hsfs", true },
    578 #ifndef VXFS_DISABLED
    579 		{ "vxfs", true }
    580 #endif
    581 	};
    582 
    583 	int	nbdev_fs = (int)(sizeof (supp_bdev_fs) /
    584 	    sizeof (struct supp_bdev_fs));
    585 
    586 	for (int i = 0; i < nbdev_fs; i++) {
    587 		if (strcmp(fsname, supp_bdev_fs[i].fsname) == 0) {
    588 			return (supp_bdev_fs[i].supported);
    589 		}
    590 	}
    591 	return (false);
    592 }
    593 
    594 //
    595 // Mount a global file system.
    596 // At this point, the mount point is locked locally with vn_vfswlock(),
    597 // the file system switch table is locked locally with RLOCK_VFSSW(),
    598 // and the vfs_t is locked locally with vfs_lock() in that order.
    599 //
    600 // static
    601 int
    602 pxvfs::mount(vfs *vfsp, vnode *mvp, struct mounta *uap, cred *cr)
    603 {
    604 	Environment	e;
    605 	int		error;
    606 
    607 	ASSERT(uap->flags & MS_GLOBAL);
    608 	ASSERT(vn_vfswlock_held(mvp));
    609 #ifndef PSARC_2001_038 // This assertion is no longer true for S9
    610 	ASSERT(VFSSW_LOCKED());
    611 #endif
    612 	// ASSERT(vfs_lock_held(vfsp));
    613 
    614 	//
    615 	// Disallow an overlaid mount on an extant mount point unless it's
    616 	// explicitly requested.
    617 	// XXX This check should be in domount() but requires a change to
    618 	// the VFS_MOUNT() interface.
    619 	// XXX Also, namefs doesn't do this check quite this way.
    620 	// XXX Note that we don't check that vp->v_type == VDIR.
    621 	// This should be checked for most file systems but not "namefs".
    622 	//
    623 	mutex_enter(&mvp->v_lock);
    624 	if ((uap->flags & MS_REMOUNT) == 0 &&
    625 	    (uap->flags & MS_OVERLAY) == 0 &&
    626 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT) != 0)) {
    627 		mutex_exit(&mvp->v_lock);
    628 		return (EBUSY);
    629 	}
    630 	mutex_exit(&mvp->v_lock);
    631 
    632 	//
    633 	// Verify that the mount client is already active, returning
    634 	// failure if it isn't.
    635 	//
    636 	if (!mount_client_impl::is_activated()) {
    637 		char		nodename[32];
    638 
    639 		(void) sprintf(nodename, "Node (%u)", orb_conf::node_number());
    640 		os::sc_syslog_msg msg(SC_SYSLOG_GLOBAL_MOUNT_TAG,
    641 		    nodename, NULL);
    642 		//
    643 		// SCMSGS
    644 		// @explanation
    645 		// A global mount command is attempted before the node has
    646 		// initialized the global file system name space. Typically
    647 		// this caused by trying to perform a global mount while the
    648 		// system is booted in single user mode.
    649 		// @user_action
    650 		// If the system is not at run level 2 or 3, change to run
    651 		// level 2 or 3 using the init(1M) command. Otherwise, check
    652 		// message logs for errors during boot.
    653 		//
    654 		(void) msg.log(SC_SYSLOG_WARNING, MESSAGE,
    655 		    "pxvfs:mount(): global mounts are not enabled"
    656 		    " (need to run \"clconfig -g\" first)\n");
    657 		return (ENODEV);
    658 	}
    659 
    660 	//
    661 	// We read in all the mount data from user space since there
    662 	// are many places where we need to examine the data before
    663 	// passing it to the VFS_MOUNT() of the underlying file system.
    664 	//
    665 	sol::mounta ma;
    666 	CORBA::String_var options;
    667 
    668 	if (uap->flags & MS_SYSSPACE) {
    669 		ma.spec = os::strdup(uap->spec);
    670 		ma.dir = os::strdup(uap->dir);
    671 		ma.fstype = os::strdup(uap->fstype);
    672 		ma.flags = uap->flags;
    673 		if (uap->datalen != 0) {
    674 			ma.data.load((uint_t)uap->datalen, (uint_t)uap->datalen,
    675 			    (uint8_t *)uap->dataptr, false);
    676 		}
    677 		if (uap->optlen != 0) {
    678 			ma.options.load((uint_t)uap->optlen,
    679 			    (uint_t)uap->optlen, (uint8_t *)uap->optptr, false);
    680 		}
    681 	} else {
    682 		// Don't support old mount formats.
    683 		if ((uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) == 0 ||
    684 		    (uintptr_t)uap->fstype < 256) {
    685 			return (EINVAL);
    686 		}
    687 
    688 		size_t len;
    689 		char *str = new char [MAXPATHLEN];
    690 
    691 		if (uap->spec != NULL) {
    692 			error = copyinstr(uap->spec, str, (size_t)MAXPATHLEN,
    693 			    &len);
    694 			if (error != 0) {
    695 				delete [] str;
    696 				return (error);
    697 			}
    698 			ma.spec = os::strcpy(new char [len], str);
    699 		}
    700 
    701 		error = copyinstr(uap->dir, str, (size_t)MAXPATHLEN, &len);
    702 		if (error != 0) {
    703 			delete [] str;
    704 			return (error);
    705 		}
    706 		ma.dir = os::strcpy(new char [len], str);
    707 
    708 		ma.fstype = new char [FSTYPSZ];
    709 		error = copyinstr(uap->fstype, ma.fstype, (size_t)FSTYPSZ,
    710 		    &len);
    711 		if (error != 0) {
    712 			delete [] str;
    713 			if (error == ENAMETOOLONG) {
    714 				error = EINVAL;
    715 			}
    716 			return (error);
    717 		}
    718 
    719 		if ((uap->flags & MS_DATA) != 0 && uap->datalen != 0) {
    720 			ma.data.length((uint_t)uap->datalen);
    721 			error = copyin(uap->dataptr, ma.data.buffer(),
    722 			    (size_t)uap->datalen);	//lint !e571
    723 			if (error != 0) {
    724 				delete [] str;
    725 				return (error);
    726 			}
    727 		}
    728 
    729 		if ((uap->flags & MS_OPTIONSTR) != 0 && uap->optlen != 0) {
    730 			ma.options.length((uint_t)uap->optlen);
    731 			error = copyin(uap->optptr, ma.options.buffer(),
    732 			    (size_t)uap->optlen);	//lint !e571
    733 			if (error != 0) {
    734 				delete [] str;
    735 				return (error);
    736 			}
    737 		}
    738 		MOUNT_DBPRINTF(
    739 		    MOUNT_TRACE_CLIENT,
    740 		    MOUNT_GREEN,
    741 		    ("\nma.spec %s \nma.dir %s \nma.flags %x \nma.fstype %s \n",
    742 		    (char *)ma.spec, (char *)ma.dir, ma.flags,
    743 		    (char *)ma.fstype));
    744 
    745 		if (ma.options.length() != 0) {
    746 			MOUNT_DBPRINTF(
    747 			    MOUNT_TRACE_CLIENT,
    748 			    MOUNT_GREEN,
    749 			    (" ma.options %s\n", ma.options.buffer()));
    750 		}
    751 
    752 		delete [] str;
    753 		ma.flags = uap->flags | MS_SYSSPACE;
    754 
    755 #ifndef VXFS_DISABLED
    756 		if (strcmp(ma.fstype, "vxfs") == 0) {
    757 			if (error = vxfs_copyinargs(ma)) {
    758 				return (error);
    759 			}
    760 		}
    761 #endif
    762 	}
    763 
    764 	// Get a reference to the local mount client.
    765 	fs::mount_client_var	clientv = mount_client_impl::get_client_ref();
    766 	solobj::cred_var	credobj = solobj_impl::conv(cr);
    767 
    768 	//
    769 	// For remounts, the mount point should be the root vnode for
    770 	// this file system.
    771 	//
    772 	if (ma.flags & MS_REMOUNT) {
    773 		// The mount point should be a PXFS vnode.
    774 		ASSERT(mvp->v_flag & VPXFS);
    775 
    776 		//
    777 		// Note: we are sharing the hold on the fobj and file system
    778 		// objects that the proxy vnode/vfs hold so don't release
    779 		// them.
    780 		//
    781 		PXFS_VER::filesystem_ptr	fsptr =
    782 		    VFSTOPXFS(vfsp)->get_fsobj();
    783 
    784 		PXFS_VER::fobj_ptr	fobjp =
    785 		    ((pxfobj *)pxnode::VTOPX(mvp))->getfobj();
    786 
    787 		uint32_t	vfsflags;
    788 
    789 		FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_C_B,
    790 				FaultFunctions::generic);
    791 
    792 		mount_client_impl::get_server()->remount_v1(fsptr, fobjp, ma,
    793 		    credobj, clientv, vfsflags, options, e);
    794 
    795 		FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_C_A,
    796 		    FaultFunctions::generic);
    797 
    798 		error = pxfslib::get_err(e);
    799 
    800 		if (error == 0) {
    801 			//
    802 			// The mount client on each node except this one
    803 			// builds the mount options table and sets the node
    804 			// mount flags. On this node, an options table is
    805 			// built in Solaris by domount, but we do it again
    806 			// to accommodate differences in underlying file
    807 			// systems (ie. VxFS).  Include the remount option
    808 			// for Solaris 10 command level mount option checks.
    809 			//
    810 			vfsp->vfs_flag = vfsflags;
    811 #ifdef GLOBAL_MNTTAB_MODTIME_INTERFACE
    812 			if (!pxfslib::exists_mntopt(options, "remount",
    813 			    false)) {
    814 				char *new_options;
    815 				//lint -e668
    816 				size_t		new_len = strlen(options) +
    817 				    strlen(",remount") + 1;
    818 				new_options = new char [new_len];
    819 				(void) strcpy(new_options, options);
    820 				(void) strcat(new_options, ",remount");
    821 				options = os::strdup(new_options);
    822 				delete [] new_options;
    823 			}
    824 			//
    825 			// vfs_mnttab_modtimeupd is global for Solaris 10 but
    826 			// is declared static for Solaris 8 and 9.
    827 			//
    828 			vfs_list_lock();
    829 			vfs_createopttbl(&vfsp->vfs_mntopts, options);
    830 			vfs_parsemntopts(&vfsp->vfs_mntopts,
    831 			    (char *)options, 1);
    832 			vfs_mnttab_modtimeupd();
    833 			vfs_list_unlock();
    834 			vfs_setmntopt(vfsp, MNTOPT_REMOUNT, NULL,
    835 			    VFS_NODISPLAY);
    836 #else
    837 
    838 			vfs_createopttbl(&vfsp->vfs_mntopts, options);
    839 			vfs_parsemntopts(&vfsp->vfs_mntopts, options, 1);
    840 			gethrestime(&vfs_mnttab_mtime);
    841 #endif
    842 			if (ma.options.length() > 0) {
    843 				VFSTOPXFS(vfsp)->set_mntoptions
    844 				    ((const char *)ma.options.buffer());
    845 			}
    846 		}
    847 
    848 		//
    849 		// Workaround for MO_TAG bug in vfs.c. pxvfs's constructor.
    850 		//
    851 		if (strcmp("ufs", vfssw[vfsp->vfs_fstype].vsw_name) == 0) {
    852 			mntopt_t *mop = vfs_hasopt(&vfsp->vfs_mntopts,
    853 			    "logging");
    854 			if (mop && (mop->mo_flags & MO_TAG) == 0) {
    855 				mop->mo_flags |= MO_TAG;
    856 			}
    857 		}
    858 
    859 		FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_C_E,
    860 			FaultFunctions::generic);
    861 
    862 		return (error);
    863 	}
    864 
    865 	bool			dev_is_ha;
    866 	CORBA::String_var	dev_name;
    867 	sol::nodeid_seq_t	dev_nids;
    868 
    869 	//
    870 	// Check that PXFS supports this file system type.
    871 	// Note: ma.spec can be NULL if mounting "namefs".
    872 	// XXX This code will need to change when new file system types
    873 	// are supported by PXFS.
    874 	//
    875 	if (!supported_bdev_fs(ma.fstype)) {
    876 		//
    877 		// Log message and bail out for unsupported file systems.
    878 		//
    879 
    880 		char		nodename[32];
    881 
    882 		(void) sprintf(nodename, "Node (%u)", orb_conf::node_number());
    883 		os::sc_syslog_msg msg(SC_SYSLOG_GLOBAL_MOUNT_TAG,
    884 		    nodename, NULL);
    885 		//
    886 		// SCMSGS
    887 		// @explanation
    888 		// A global mount is not supported for the specified file
    889 		// system.
    890 		// @user_action
    891 		// Check the release notes and documents about the support
    892 		// of the specified system.
    893 		//
    894 		(void) msg.log(SC_SYSLOG_NOTICE, MESSAGE,
    895 		    "pxvfs:mount(): global mount is not supported for "
    896 		    "filesystem type : '%s'\n", ma.fstype);
    897 		return (ENOTSUP);
    898 	} else {
    899 		//
    900 		// Lookup the device special file to determine the dev_t
    901 		// for the device.
    902 		// If this is a PXFS global device file, then
    903 		// contact DCS to start the device service and get the
    904 		// list of nodes we should create file system replicas on.
    905 		// For non-PXFS special files, just create a file system
    906 		// replica on this node.
    907 		//
    908 		vnode_t		*bvp;
    909 		error = lookupname(ma.spec, UIO_SYSSPACE, FOLLOW, NULL, &bvp);
    910 		if (error != 0) {
    911 			return (error);
    912 		}
    913 		if (bvp->v_type != VBLK) {
    914 			VN_RELE(bvp);
    915 			return (ENOTBLK);
    916 		}
    917 		if ((bvp->v_flag & VPXFS) == 0) {
    918 			//
    919 			// Not a PXFS special file.
    920 			//
    921 			VN_RELE(bvp);
    922 			dev_is_ha = false;
    923 			dev_nids.length(1);
    924 			dev_nids[0] = orb_conf::node_number();
    925 		} else {
    926 			//
    927 			// Contact DCS to get the list of nodes that this
    928 			// device is attached to and whether or not its an
    929 			// HA device. Also, pass a reference to the mount
    930 			// server's dc_callback object so the mount server
    931 			// is notified when this node configuration data
    932 			// changes (due to system administration commands).
    933 			//
    934 			dev_t devid = bvp->v_rdev;
    935 			VN_RELE(bvp);
    936 
    937 			sol::nodeid_seq_t_var nodes;
    938 
    939 			error = get_configured_nodes(dev_is_ha, dev_name,
    940 			    devid, nodes, e);
    941 			if (error != 0) {
    942 				return (error);
    943 			}
    944 			dev_nids = *nodes;
    945 		}
    946 	}
    947 
    948 	//
    949 	// Note: we have to release the read lock on vfssw[] since
    950 	// we may have to instantiate the file system (by calling
    951 	// domount() for the underlying file system) which could
    952 	// try to call WLOCK_VFSSW() and deadlock. We can safely
    953 	// release the read lock because the pxfs module won't
    954 	// be unloaded (we don't allow unloading but if we did,
    955 	// we would have control of that in _fini()).
    956 	// We do need to make sure we return with the read lock held
    957 	// so that domount("pxfs") can unlock it.
    958 	//
    959 	// But first, allocate a vfssw[] entry for the underlying
    960 	// file system type without loading the module.
    961 	//
    962 #ifdef PSARC_2001_038 // Need to grab the lock for S9
    963 	RLOCK_VFSSW();
    964 #endif
    965 	struct vfssw	*vswp = vfs_getvfsswbyname(ma.fstype);
    966 	RUNLOCK_VFSSW();
    967 	if (vswp == NULL) {
    968 		WLOCK_VFSSW();
    969 		if ((vswp = vfs_getvfsswbyname(ma.fstype)) == NULL) {
    970 			vswp = allocate_vfssw(ma.fstype);
    971 		}
    972 		WUNLOCK_VFSSW();
    973 		if (vswp == NULL) {
    974 			RLOCK_VFSSW();
    975 			return (EINVAL);
    976 		}
    977 	}
    978 	int	fstype = (int)(vswp - vfssw);
    979 	ASSERT(fstype); // see 4470243
    980 
    981 	//
    982 	// We pass the mount arguments to the mount server which
    983 	// will lock the mount point on all other nodes, create the
    984 	// file system object (instantiate the file system),
    985 	// create proxy file system objects on all other nodes, link
    986 	// the proxy into the file system name space and unlock the
    987 	// mount points. If all that goes well, we will create the proxy,
    988 	// link it into the name space, and unlock on this node after we
    989 	// return to domount().
    990 	// We have to pass the mount point vnode pointer to mount()
    991 	// since mount_client_impl::instantiate() may be called on
    992 	// this node.
    993 	//
    994 	PXFS_VER::filesystem_var	fsobj;
    995 	PXFS_VER::fs_info		fsinfo;
    996 
    997 	FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_C_B, FaultFunctions::generic);
    998 
    999 	mount_client_impl::get_server()->mount_v1(ma, (sol::uintptr_t)mvp,
   1000 	    credobj, clientv, dev_is_ha, dev_name, dev_nids,
   1001 	    fsobj, fsinfo, options, e);
   1002 
   1003 	FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_C_A, FaultFunctions::generic);
   1004 
   1005 	// Aquire the read lock before returning.
   1006 #ifndef PSARC_2001_038 // lock is no longer held, we don't grab it again for S9
   1007 	RLOCK_VFSSW();
   1008 #endif
   1009 
   1010 	error = pxfslib::get_err(e);
   1011 
   1012 	MOUNT_DBPRINTF(
   1013 	    MOUNT_TRACE_CLIENT,
   1014 	    (error ? MOUNT_RED : MOUNT_GREEN),
   1015 	    ("pxvfs:mount() mount error %d\n",
   1016 	    error));
   1017 
   1018 	if (error != 0) {
   1019 		return (error);
   1020 	}
   1021 
   1022 	//
   1023 	// The mount client on each node except this one builds the mount
   1024 	// options table. On this node an options table is built in Solaris
   1025 	// by domount, but we do it again here to accommodate differences
   1026 	// in underlying file systems (ie VxFS).
   1027 	//
   1028 	vfs_createopttbl(&vfsp->vfs_mntopts, options);
   1029 	vfs_parsemntopts(&vfsp->vfs_mntopts, (char *)options, 1);
   1030 
   1031 	//
   1032 	// At this point we are committed to creating the proxy file system
   1033 	// for this node and linking it into the name space.
   1034 	//
   1035 	uint32_t	server_incarn;
   1036 	ASSERT(!CORBA::is_nil(fsobj));
   1037 	fsmgr_client_impl	*clientmgrp = new fsmgr_client_impl();
   1038 	PXFS_VER::fsmgr_client_var	clientmgr = clientmgrp->get_objref();
   1039 
   1040 	uint32_t fs_blk_size;
   1041 	bool fastwrite;
   1042 	PXFS_VER::fsmgr_server_ptr	servermgr_p =
   1043 	    fsobj->bind_fs(clientmgr, orb_conf::node_number(),
   1044 	    server_incarn, fs_blk_size, fastwrite, e);
   1045 	//
   1046 	// The file system is already "mounted" on all other nodes.
   1047 	// So, go ahead and complete mounting on this node as well.
   1048 	//
   1049 	if (e.exception()) {
   1050 		//
   1051 		// We ignore comm failures since this can happen
   1052 		// if a node with the pxfs server crashes and
   1053 		// then any node joins the global name space. The
   1054 		// mount server will try to create a proxy vfs
   1055 		// for the dead file system and link it into the
   1056 		// name space. We need this to succeed so that
   1057 		// the dead file system can be unmounted properly.
   1058 		//
   1059 		if (CORBA::COMM_FAILURE::_exnarrow(e.exception()) == NULL) {
   1060 #ifdef DEBUG
   1061 			e.exception()->print_exception(
   1062 			    "pxvfs::mount(): ");
   1063 #endif
   1064 			MOUNT_DBPRINTF(
   1065 			    MOUNT_TRACE_CLIENT,
   1066 			    MOUNT_RED,
   1067 			    ("pxvfs:mount() exception from"
   1068 			    " fsobj->bind_fs()\n"));
   1069 		} else {
   1070 			servermgr_p = PXFS_VER::fsmgr_server::_nil();
   1071 			MOUNT_DBPRINTF(
   1072 			    MOUNT_TRACE_CLIENT,
   1073 			    MOUNT_RED,
   1074 			    ("pxvfs:mount() comm failure\n "));
   1075 		}
   1076 
   1077 		e.clear();
   1078 	}
   1079 
   1080 
   1081 	pxvfs	*pxvfsp = new pxvfs(fsobj, clientmgrp, &fsinfo, fstype, vfsp,
   1082 	    server_incarn);
   1083 
   1084 	pxvfsp->pxfs_bsize = fs_blk_size;
   1085 	pxvfsp->fastwrite = fastwrite;
   1086 	//
   1087 	// No blocks allocated initially.
   1088 	// The blocks get allocated during the first allocating write
   1089 	// from this node.
   1090 	//
   1091 	pxvfsp->blocks_available = 0;
   1092 
   1093 	//
   1094 	// Mount flags are set by the mount client on each node except
   1095 	// this one which initiated the mount(2) call, so we set them now
   1096 	// for this node.
   1097 	//
   1098 	if (ma.options.length() > 0) {
   1099 		pxvfsp->set_mntoptions((const char *)ma.options.buffer());
   1100 	}
   1101 
   1102 	// Finish initializing the linkages between fsmgr client and server.
   1103 	clientmgrp->set_pxvfsp(pxvfsp, servermgr_p);
   1104 
   1105 	//
   1106 	// Add ourself to the local list of all PXFS file systems and
   1107 	// keep a hold on the vfs struct while on the list.
   1108 	//
   1109 	all_pxvfs_lock.wrlock();
   1110 	ASSERT(search(fsobj) == NULL);
   1111 	all_pxvfs.prepend((pxvfs_list_elem *)pxvfsp);
   1112 	VFS_HOLD(vfsp);
   1113 	all_pxvfs_lock.unlock();
   1114 
   1115 	FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_C_E, FaultFunctions::generic);
   1116 
   1117 	CORBA::release(servermgr_p);
   1118 	return (0);
   1119 }
   1120 
   1121 //
   1122 // Unmount a global file system.
   1123 // At this point, the covered mount point is locked locally with vn_vfswlock(),
   1124 // and the vfs_t is locked locally with vfs_lock() in that order.
   1125 //
   1126 // Note: Purge of all dnlc entries for this vfs has been done in dounmount
   1127 // of the pxfs filesystem.
   1128 //
   1129 int
   1130 pxvfs::unmount(int umflags, cred *credp)
   1131 {
   1132 	Environment	e;
   1133 
   1134 	// ASSERT(vfs_lock_held(fs_vfs));
   1135 	ASSERT(!CORBA::is_nil(fs_fsobj));
   1136 
   1137 	//
   1138 	// When a cluster node is going down cleanly (ie. with shutdown,
   1139 	// poweroff, init, etc), the /sbin/umountall utility is run to
   1140 	// unmount all file systems.  But we don't want umountall to unmount
   1141 	// PXFS filesystems served out by other nodes and mounted cluster-wide.
   1142 	// We take advantage of the Solaris stop script facility associated
   1143 	// with the init program; in particluar rc0.d/K30MOUNTGFSYS is run
   1144 	// as a result of init 0 (this is done for both shutdown and
   1145 	// poweroff).  From the stop script global mounts are disabled using
   1146 	// the cladm system call (and ultimately pxvfs::disable_unmounts).
   1147 	//
   1148 	// When using the scshutdown command to take down all cluster nodes,
   1149 	// the global unmounts are of course valid and needed. In this
   1150 	// case, scshutdown performs unmounting of global filesytems prior
   1151 	// to doing the individual node shutdowns.
   1152 	//
   1153 	// For node take down commands which do not result in running init
   1154 	// (ie. halt, reboot, uadmin), filesystems are unmounted in the
   1155 	// kernel and PXFS is made aware that the unmount is occuring.
   1156 	// This is done using the PXFS_SYNC_CLOSE flag set during the sync
   1157 	// operation before unmount.  If PXFS_SYNC_CLOSE is set, PXFS can set
   1158 	// it's PXFS_SHUTDOWN flag.  This is set even if the node going down is
   1159 	// the only node serving out the filesystem.  This is desired to prevent
   1160 	// applications (which would be unaware of the unmount) from continuing
   1161 	// operations at the local mount point.  Rather, after the node serving
   1162 	// out the filesystem is down, applications will get EIO for further
   1163 	// operations and they can take appropriate action.
   1164 	//
   1165 	if (unmounts_disabled || flags & PXFS_SHUTDOWN) {
   1166 		return (EBUSY);
   1167 	}
   1168 
   1169 	flags_lock.lock();
   1170 	//
   1171 	// Set PXFS_UNMOUNTING to block creation of new proxy vnodes.
   1172 	//
   1173 	flags |= PXFS_UNMOUNTING;
   1174 	if (umflags & MS_FORCE) {
   1175 		flags |= PXFS_FORCE_UNMOUNTING;
   1176 	}
   1177 
   1178 	if (fs_rootvp != NULL) {
   1179 		// Release the cached root vnode.
   1180 		vnode_t		*vnodep = fs_rootvp;
   1181 		fs_rootvp = NULL;
   1182 		flags_lock.unlock();
   1183 		VN_RELE(vnodep);
   1184 	} else {
   1185 		flags_lock.unlock();
   1186 	}
   1187 
   1188 	//
   1189 	// For a normal unmount, there will be a check for filesystem busy.
   1190 	// If busy, we get a return value of true; else the return value is
   1191 	// false after waiting for the inactive vnode list to become empty.
   1192 	// For forced unmount there is a wait for one pass of processing
   1193 	// of the inactive list rather than waiting for an empty list; and
   1194 	// the return value is always false.
   1195 	//
   1196 	if (wait_empty_inactive_list(umflags & MS_FORCE ? true : false)) {
   1197 		MOUNT_DBPRINTF(
   1198 		    MOUNT_TRACE_CLIENT,
   1199 		    MOUNT_RED,
   1200 		    ("pxvfs(%p):unmount returning EBUSY\n",
   1201 		    this));
   1202 		return (EBUSY);
   1203 	}
   1204 
   1205 	//
   1206 	// Get object references for the mount service
   1207 	//
   1208 	solobj::cred_var	credobj = solobj_impl::conv(credp);
   1209 	fs::mount_client_var	clientv = mount_client_impl::get_client_ref();
   1210 
   1211 	fs::mount_server_var	mount_server_v =
   1212 	    mount_client_impl::get_server();
   1213 
   1214 	FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_B, FaultFunctions::generic);
   1215 	mount_server_v->unmount_v1(fs_fsobj, umflags, credobj, clientv,
   1216 	    orb_conf::node_number(), 0, e);
   1217 	FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_A, FaultFunctions::generic);
   1218 
   1219 	sol::error_t	error = pxfslib::get_err(e);
   1220 
   1221 	MOUNT_DBPRINTF(
   1222 	    MOUNT_TRACE_CLIENT,
   1223 	    (error ? MOUNT_RED : MOUNT_GREEN),
   1224 	    ("pxvfs:unmount(%p) unmount error %d\n",
   1225 	    this, error));
   1226 
   1227 	if (error != 0) {
   1228 		unmount_failed();
   1229 	}
   1230 	FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_E, FaultFunctions::generic);
   1231 	return (error);
   1232 }
   1233 
   1234 int
   1235 pxvfs::mountroot(enum whymountroot)
   1236 {
   1237 	MOUNT_DBPRINTF(
   1238 	    MOUNT_TRACE_CLIENT,
   1239 	    MOUNT_RED,
   1240 	    ("pxvfs:mountroot called\n")); // XXX
   1241 	return (ENOTSUP);
   1242 }
   1243 
   1244 //
   1245 // root - An out parameter will contain the root directory proxy vnode,
   1246 // and the return value identifies whether the operation succeeded or failed.
   1247 //
   1248 // Note that the current design imposes the restriction that
   1249 // the underlying filesystem sets
   1250 // its root once and does not change its root.
   1251 //
   1252 int
   1253 pxvfs::root(vnode **vnodepp)
   1254 {
   1255 	flags_lock.lock();
   1256 	while (flags & PXFS_UNMOUNTING) {
   1257 		//
   1258 		// Wait for pending unmount operation to complete
   1259 		//
   1260 		flags |= PXFS_FILE_ACTIVATE;
   1261 		flags_cv.wait(&flags_lock);
   1262 	}
   1263 	if (flags & PXFS_UNMOUNTED) {
   1264 		//
   1265 		// The file system was unmounted,
   1266 		// and hence all files are inaccessible
   1267 		//
   1268 		flags_lock.unlock();
   1269 		*vnodepp = NULL;
   1270 		return (EIO);
   1271 	}
   1272 	if (fs_rootvp != NULL) {
   1273 		//
   1274 		// The root directory vnode is cached
   1275 		//
   1276 		*vnodepp = fs_rootvp;
   1277 		VN_HOLD(*vnodepp);
   1278 		flags_lock.unlock();
   1279 		return (0);
   1280 	}
   1281 	flags_lock.unlock();
   1282 
   1283 	//
   1284 	// Call the pxfs server to get a root object.
   1285 	// The root directory proxy vnode remains in existence
   1286 	// while the proxy file system is mounted.
   1287 	// Because the lock is dropped, another thread could
   1288 	// concurrently ask for the root proxy directory.
   1289 	// So we have to deal with the case where we are not
   1290 	// the first to ask for the root directory.
   1291 	//
   1292 	PXFS_VER::fobj_var	rootobj;
   1293 	PXFS_VER::fobj_info	rootinfo;
   1294 	PXFS_VER::bind_info	binfo;
   1295 	Environment		e;
   1296 	uint32_t		server_incn_orig = get_server_incn();
   1297 
   1298 	fobj_client_impl	*client1p = new fobj_client_impl;
   1299 	fobj_client_impl	*client2p;
   1300 
   1301 	PXFS_VER::fobj_client_ptr	client1_p = client1p->get_objref();
   1302 
   1303 	PXFS_VER::fobj_client_ptr	client2_p =
   1304 	    PXFS_VER::fobj_client::_nil();
   1305 
   1306 	fs_fsobj->getroot(rootobj, rootinfo, binfo, client1_p, client2_p, e);
   1307 	sol::error_t	error = pxfslib::get_err(e);
   1308 	if (error != 0) {
   1309 		CORBA::release(client1_p);
   1310 		CORBA::release(client2_p);
   1311 		*vnodepp = NULL;
   1312 		return (error);
   1313 	}
   1314 	// The root directory supports caching
   1315 	ASSERT(!CORBA::is_nil(client2_p));
   1316 
   1317 	if (client1_p->_equiv(client2_p)) {
   1318 		//
   1319 		// This is the first thread to request root directory
   1320 		//
   1321 		// Do not allow the server to change while
   1322 		// in the middle of creating a proxy vnode
   1323 		//
   1324 		server_incn_lock.rdlock();
   1325 		if (server_incn_orig == get_server_incn()) {
   1326 			// Construct a proxy vnode for the root directory.
   1327 			*vnodepp =
   1328 			    get_pxfobj(rootobj, rootinfo, &binfo, client1p);
   1329 
   1330 			server_incn_lock.unlock();
   1331 		} else {
   1332 			//
   1333 			// The registration was orphanned by
   1334 			// either a failover or switchover
   1335 			//
   1336 			server_incn_lock.unlock();
   1337 
   1338 			error = connect_again(vnodepp, rootobj, rootinfo, binfo,
   1339 			    client1p, client1_p, e);
   1340 		}
   1341 
   1342 		if (*vnodepp == NULL) {
   1343 			//
   1344 			// Error because we are unmounting proxy file system
   1345 			//
   1346 			CORBA::release(client1_p);
   1347 			CORBA::release(client2_p);
   1348 			return (EIO);
   1349 		}
   1350 
   1351 		flags_lock.lock();
   1352 		if (fs_rootvp == NULL) {
   1353 			//
   1354 			// Cache the root directory proxy vnode.
   1355 			// The cache places its own hold on the
   1356 			// root directory proxy vnode.
   1357 			//
   1358 			fs_rootvp = *vnodepp;
   1359 			VN_HOLD(*vnodepp);
   1360 		}
   1361 		flags_lock.unlock();
   1362 
   1363 	} else {
   1364 		//
   1365 		// There already is a proxy vnode for root directory
   1366 		//
   1367 		client2p = (fobj_client_impl *)
   1368 		    (client2_p->_handler()->get_cookie());
   1369 
   1370 		//
   1371 		// The current client may not be ready
   1372 		//
   1373 		if (!client2p->wait_till_ok()) {
   1374 			//
   1375 			// Error because we are unmounting proxy file system
   1376 			//
   1377 			*vnodepp = NULL;
   1378 			CORBA::release(client1_p);
   1379 			CORBA::release(client2_p);
   1380 			return (EIO);
   1381 		}
   1382 		//
   1383 		// The existing client has completed initialization
   1384 		// and has a hold placed upon the proxy vnode.
   1385 		//
   1386 		*vnodepp = pxnode::PXTOV(client2p->get_pxfobjplusp());
   1387 
   1388 		flags_lock.lock();
   1389 		if (fs_rootvp == NULL) {
   1390 			//
   1391 			// Cache the root directory proxy vnode.
   1392 			// The cache places its own hold on the
   1393 			// root directory proxy vnode.
   1394 			//
   1395 			fs_rootvp = *vnodepp;
   1396 			VN_HOLD(*vnodepp);
   1397 		}
   1398 		flags_lock.unlock();
   1399 	}
   1400 	CORBA::release(client1_p);
   1401 	CORBA::release(client2_p);
   1402 
   1403 	ASSERT((*vnodepp)->v_flag & VROOT);
   1404 
   1405 	return (0);
   1406 }
   1407 
   1408 int
   1409 pxvfs::statvfs(struct statvfs64 *sp)
   1410 {
   1411 	Environment e;
   1412 
   1413 	//
   1414 	// We hold the root proxy vnode, because not all calls
   1415 	// to VFS_STATVFS hold the root vnode pointer
   1416 	// before making the VFS_STATVFS call.
   1417 	//
   1418 
   1419 	vnode	*vnodep;
   1420 	int	err = root(&vnodep);
   1421 	if (err != 0) {
   1422 		return (err);
   1423 	}
   1424 
   1425 	//lint -e64
   1426 	fs_fsobj->get_statistics(conv(*sp), e);
   1427 	//lint +e64
   1428 
   1429 	VN_RELE(vnodep);
   1430 	return (pxfslib::get_err(e));
   1431 }
   1432 
   1433 //
   1434 // sync_all_attr
   1435 // Walk the list of all pxfobj objects, and flush out dirty attributes.  This is
   1436 // triggered by fsflush calling vfs_sync with the SYNC_ATTR flag.
   1437 //
   1438 // static
   1439 void
   1440 pxvfs::sync_all_attr(void *)
   1441 {
   1442 	pxfobj		*pxfobjp;
   1443 	pxfobj		*prevp = NULL;
   1444 	int		file_count = 0;
   1445 	int		sleep_index;
   1446 
   1447 	for (uint_t idx = 0; idx < pxfobjhsz; idx++) {
   1448 		//
   1449 		// Initialize the iterator while holding the lock.
   1450 		//
   1451 		pxfobj_hash[idx].hlock.lock();
   1452 		pxfobj_list_t::ListIterator	iter(pxfobj_hash[idx].hlist);
   1453 
   1454 		//
   1455 		// Determine sleep time.
   1456 		// The exact value does not matter.
   1457 		// We pretend that the files are evenly distributed across
   1458 		// buckets.
   1459 		//
   1460 		switch ((pxfobj_hash[idx].hlist_cnt * pxfobjhsz) / 10000) {
   1461 		case 0:		sleep_index = 0;	// <10,000 files
   1462 				break;
   1463 		case 1:		sleep_index = 1;	// <20,000 files
   1464 				break;
   1465 		case 2:		sleep_index = 2;	// <30,000 files
   1466 				break;
   1467 		case 3:		sleep_index = 3;	// <40,000 files
   1468 				break;
   1469 		default:	sleep_index = 4;	// >=40,000 files
   1470 		};
   1471 
   1472 		for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) {
   1473 
   1474 			ASSERT(pxfobjp->is_inhashtable());
   1475 
   1476 			if (pxfobjp->is_inactive() ||
   1477 			    !pxfobjp->is_cached()) {
   1478 				//
   1479 				// Inactive processing will flush any
   1480 				// cached attributes.
   1481 				// Do not place hold on proxy vnode,
   1482 				// as that will stop inactive file cleanup.
   1483 				//
   1484 				file_count++;
   1485 				continue;
   1486 			}
   1487 			//
   1488 			// Ensure that the proxy vnode remains active
   1489 			// while doing the sync on this proxy vnode.
   1490 			// PXFS does not remove the proxy vnode from
   1491 			// this hash table while the proxy vnode remains active.
   1492 			// This ensures that the list iterator will be pointing
   1493 			// to a member of the hash table.
   1494 			//
   1495 			VN_HOLD(pxnode::PXTOV(pxfobjp));
   1496 
   1497 			//
   1498 			// Drop the lock to allow changes to the hash table.
   1499 			// The sync can take a long time. So this is needed.
   1500 			//
   1501 			pxfobj_hash[idx].hlock.unlock();
   1502 
   1503 			if (prevp) {
   1504 				//
   1505 				// It is possible that this operation will
   1506 				// render this proxy vnode inactive and expel
   1507 				// this proxy vnode from the hash table.
   1508 				//
   1509 				VN_RELE(pxnode::PXTOV(prevp));
   1510 			}
   1511 
   1512 			(void) ((pxfobjplus *)pxfobjp)->sync_attr();
   1513 			prevp = pxfobjp;
   1514 
   1515 			//
   1516 			// This flush can starve other work.
   1517 			// So we throttle the flush.
   1518 			//
   1519 			if (++file_count >= sync_all_attr_throttle) {
   1520 				//
   1521 				// Allow other work.
   1522 				//
   1523 				os::usecsleep(
   1524 				    sync_all_attr_interval[sleep_index] *
   1525 				    ((file_count +
   1526 				    (sync_all_attr_throttle - 1))
   1527 				    / sync_all_attr_throttle));
   1528 				file_count = 0;
   1529 			}
   1530 
   1531 			pxfobj_hash[idx].hlock.lock();
   1532 		}
   1533 		pxfobj_hash[idx].hlock.unlock();
   1534 	}
   1535 	if (prevp) {
   1536 		//
   1537 		// Release the hold placed on the proxy vnode that was
   1538 		// processed last.
   1539 		//
   1540 		VN_RELE(pxnode::PXTOV(prevp));
   1541 	}
   1542 	sync_all_attr_lock.lock();
   1543 	sync_all_attr_thread_running = false;
   1544 	sync_all_attr_lock.unlock();
   1545 }
   1546 
   1547 //
   1548 // This is a sync in order to unmount one file system.
   1549 //
   1550 // XXX We should do an invocation to the server so the sync is seen globally
   1551 // and also to sync the underlying file system.
   1552 //
   1553 int
   1554 pxvfs::sync(short flag, cred *credp)
   1555 {
   1556 	pxfobj	*pxfobjp;
   1557 	pxfobj	*prevp;
   1558 
   1559 	//
   1560 	// If we are panicing we cannot safely perform any file system
   1561 	// operation. The integrity of the kernel is suspect too. To avoid
   1562 	// further problems we ignore the sync if a panic is in progress.
   1563 	//
   1564 	if (panicstr) {
   1565 		return (0);
   1566 	}
   1567 
   1568 	if (flag & SYNC_ATTR) {
   1569 		//
   1570 		// Hand off SYNC_ATTR calls by the fsflush thread to a
   1571 		// separate thread.  This may take very long, so if the thread
   1572 		// is currently running (from a previous call), don't schedule
   1573 		// it again.
   1574 		//
   1575 		// PXFS could already be doing this operation. This avoids
   1576 		// duplicate work.
   1577 		//
   1578 		sync_all_attr_lock.lock();
   1579 		if (!sync_all_attr_thread_running) {
   1580 			sync_all_attr_thread_running = true;
   1581 
   1582 			// Do 'new's outside locks.
   1583 			sync_all_attr_lock.unlock();
   1584 			defer_task	*taskp =
   1585 			    new work_task(pxvfs::sync_all_attr, NULL);
   1586 			sync_all_attr_lock.lock();
   1587 
   1588 			common_threadpool::the().defer_processing(taskp);
   1589 		}
   1590 		sync_all_attr_lock.unlock();
   1591 		return (0);
   1592 	}
   1593 
   1594 	//
   1595 	// Remember if SYNC_CLOSE is set so we know in unmount()
   1596 	// that the unmount is due to the node being shut down.
   1597 	//
   1598 	if (flag & SYNC_CLOSE) {
   1599 		flags_lock.lock();
   1600 		flags |= PXFS_SHUTDOWN;
   1601 		flags_lock.unlock();
   1602 	}
   1603 
   1604 	prevp = NULL;
   1605 
   1606 	for (uint_t idx = 0; idx < pxfobjhsz; idx++) {
   1607 		//
   1608 		// Make sure the iterator is initialized while holding the
   1609 		// lock.
   1610 		//
   1611 		pxfobj_hash[idx].hlock.lock();
   1612 		pxfobj_list_t::ListIterator iter(pxfobj_hash[idx].hlist);
   1613 
   1614 		for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) {
   1615 
   1616 			ASSERT(pxfobjp->is_inhashtable());
   1617 			VN_HOLD(pxnode::PXTOV(pxfobjp));
   1618 
   1619 			pxfobj_hash[idx].hlock.unlock();
   1620 
   1621 			if (prevp != NULL) {
   1622 				VN_RELE(pxnode::PXTOV(prevp));
   1623 			}
   1624 
   1625 			//
   1626 			// We commit only those vnodes which belong to this
   1627 			// filesystem.
   1628 			//
   1629 			if ((pxnode::PXTOV(pxfobjp)->v_vfsp) == fs_vfs) {
   1630 #if	SOL_VERSION >= __s11
   1631 				(void) VOP_PUTPAGE(pxnode::PXTOV(pxfobjp),
   1632 				    (offset_t)0, (size_t)0, B_ASYNC, credp,
   1633 				    NULL);
   1634 #else
   1635 				(void) VOP_PUTPAGE(pxnode::PXTOV(pxfobjp),
   1636 				    (offset_t)0, (size_t)0, B_ASYNC, credp);
   1637 #endif
   1638 			}
   1639 
   1640 			prevp = pxfobjp;
   1641 			pxfobj_hash[idx].hlock.lock();
   1642 
   1643 			//
   1644 			// Although the list could have changed while the lock
   1645 			// was not held, the list element pointer in the
   1646 			// iterator should still be valid since we held the
   1647 			// vnode (i.e., it shouldn't be removed from the list).
   1648 			// XXX This is based on the knowledge of exact
   1649 			// implementation of the ListIterator. This
   1650 			// should work as of today, but we need to be careful
   1651 			// if iterator implementation changes.
   1652 			//
   1653 		}
   1654 		pxfobj_hash[idx].hlock.unlock();
   1655 	}
   1656 	if (prevp) {
   1657 		VN_RELE(pxnode::PXTOV(prevp));
   1658 	}
   1659 	return (0);
   1660 }
   1661 
   1662 //
   1663 // Sync all pxfs file systems.
   1664 //
   1665 // XXX We should do an invocation to the server so the sync is seen globally
   1666 // and also to sync the underlying file system.
   1667 //
   1668 int
   1669 pxvfs::sync_all(short flag, cred *cr)
   1670 {
   1671 	pxvfs	*pxvfsp;
   1672 	int	error = 0;
   1673 	int	result = 0;
   1674 
   1675 	all_pxvfs_lock.rdlock();
   1676 	for (all_pxvfs.atfirst();
   1677 	    (pxvfsp = all_pxvfs.get_current()) != NULL;
   1678 	    all_pxvfs.advance()) {
   1679 
   1680 		error = pxvfsp->sync(flag, cr);
   1681 
   1682 
   1683 		if (result == 0) {
   1684 			//
   1685 			// If multiple file systems encounter errors,
   1686 			// will return the first error encountered.
   1687 			//
   1688 			result = error;
   1689 		}
   1690 	}
   1691 	all_pxvfs_lock.unlock();
   1692 
   1693 	return (error);
   1694 }
   1695 
   1696 //
   1697 // fid_to_proxy_file - Find the proxy file object for the specified FID.
   1698 //
   1699 // The caller must eventually do VN_RELE to undo the VN_HOLD done here.
   1700 //
   1701 pxfobj *
   1702 pxvfs::fid_to_proxy_file(fid_t *wanted_fidp)
   1703 {
   1704 	pxfobj		*pxfobjp;
   1705 	const fid_t	*fidp;
   1706 
   1707 	uint_t		b_idx = pxfs_misc::hash_devt_fid(get_vfsp()->vfs_dev,
   1708 			    wanted_fidp, pxfobjhsz);
   1709 
   1710 	pxfobj_hash_bkt	&hbkt = pxfobj_hash[b_idx];
   1711 
   1712 	// Make sure the iterator is initialized while holding the lock.
   1713 	hbkt.hlock.lock();
   1714 	pxfobj_list_t::ListIterator iter(hbkt.hlist);
   1715 
   1716 	for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) {
   1717 		fidp = pxfobjp->get_fidp();
   1718 		if (fidp != NULL &&
   1719 		    fidp->fid_len == wanted_fidp->fid_len &&
   1720 		    (bcmp(fidp->fid_data, wanted_fidp->fid_data,
   1721 		    (size_t)wanted_fidp->fid_len) == 0)) {
   1722 			//
   1723 			// Found a match
   1724 			//
   1725 			// Make sure the proxy file object
   1726 			// stays around until we're finished
   1727 			//
   1728 			VN_HOLD(pxnode::PXTOV(pxfobjp));
   1729 			hbkt.hlock.unlock();
   1730 
   1731 			if (pxfobjp->can_cache()) {
   1732 				((pxfobjplus *)pxfobjp)->set_recycled();
   1733 			}
   1734 
   1735 			return (pxfobjp);
   1736 		}
   1737 	}
   1738 	hbkt.hlock.unlock();
   1739 
   1740 	// No match found
   1741 	return (NULL);
   1742 }
   1743 
   1744 //
   1745 // vget - Lookup a file based on the file ID.
   1746 // Return its vnode pointer held.
   1747 //
   1748 // This operation supports NFS operations.
   1749 //
   1750 int
   1751 pxvfs::vget(vnode **vnodepp, struct fid *ufidp)
   1752 {
   1753 	pxfobj		*pxfobjp;
   1754 	const fid_t	*fidp;
   1755 
   1756 	flags_lock.lock();
   1757 	while (flags & PXFS_UNMOUNTING) {
   1758 		//
   1759 		// Wait for pending unmount operation to complete
   1760 		//
   1761 		flags |= PXFS_FILE_ACTIVATE;
   1762 		flags_cv.wait(&flags_lock);
   1763 	}
   1764 	if (flags & PXFS_UNMOUNTED) {
   1765 		//
   1766 		// The file system was unmounted,
   1767 		// and hence all files are inaccessible
   1768 		//
   1769 		flags_lock.unlock();
   1770 		*vnodepp = NULL;
   1771 		return (EIO);
   1772 	}
   1773 	flags_lock.unlock();
   1774 
   1775 #ifdef DEBUG
   1776 	os::atomic_add_32(&pxvfs_vget_number_calls, 1);
   1777 #endif
   1778 
   1779 	uint_t		b_idx = pxfs_misc::hash_devt_fid(get_vfsp()->vfs_dev,
   1780 			    ufidp, pxfobjhsz);
   1781 
   1782 	pxfobj_hash_bkt	&hbkt = pxfobj_hash[b_idx];
   1783 
   1784 	// Make sure the iterator is initialized while holding the lock.
   1785 	hbkt.hlock.lock();
   1786 	pxfobj_list_t::ListIterator iter(hbkt.hlist);
   1787 
   1788 	for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) {
   1789 		fidp = pxfobjp->get_fidp();
   1790 		if (fidp != NULL) {
   1791 			//
   1792 			// See if the current proxy file object
   1793 			// has the fid of the file we are looking for.
   1794 			//
   1795 			if ((fidp->fid_len == ufidp->fid_len) &&
   1796 			    bcmp(fidp->fid_data, ufidp->fid_data,
   1797 			    (size_t)ufidp->fid_len) == 0) {
   1798 				//
   1799 				// Check that the file object of the
   1800 				// matched fid, belongs to this filesystem.
   1801 				//
   1802 				// The current hashing algorithm for
   1803 				// a pxfobj uses the dev_t and fid
   1804 				// to determine the hash index.
   1805 				// In order to retrieve the correct
   1806 				// pxfobj from the hash table,
   1807 				// both dev_t and fid need to match.
   1808 				//
   1809 				// Note that fids are unique only for
   1810 				// a given filesystem, not across filesystems.
   1811 				// It is very much possible that two files
   1812 				// belonging to different filesystems,
   1813 				// may have the same fid, and may end up
   1814 				// on the same hash chain.
   1815 				//
   1816 				vfs_t	*cur_vfsp =
   1817 				    pxnode::PXTOV(pxfobjp)->v_vfsp;
   1818 				if (cur_vfsp == get_vfsp()) {
   1819 					*vnodepp = pxnode::PXTOV(pxfobjp);
   1820 					VN_HOLD(*vnodepp);
   1821 					hbkt.hlock.unlock();
   1822 					if (pxfobjp->can_cache()) {
   1823 						((pxfobjplus *)pxfobjp)->
   1824 						    set_recycled();
   1825 					}
   1826 #ifdef DEBUG
   1827 					os::atomic_add_32(
   1828 					    &pxvfs_vget_number_fid_hits, 1);
   1829 #endif
   1830 					return (0);
   1831 				} else {
   1832 					PXFS_DBPRINTF(PXFS_TRACE_PXVFS,
   1833 					    PXFS_AMBER,
   1834 					    ("pxvfs:vget(%p) fid %p "
   1835 					    "false match vfsp %p\n",
   1836 					    this,  fidp->fid_data, cur_vfsp));
   1837 				}
   1838 			}
   1839 		}
   1840 	}
   1841 	hbkt.hlock.unlock();
   1842 
   1843 	//
   1844 	// Go to the pxfs server to do the lookup.
   1845 	//
   1846 	PXFS_VER::fobj_var	fobj_v;
   1847 	PXFS_VER::fobj_info	fobjinfo;
   1848 	PXFS_VER::bind_info	binfo;
   1849 	Environment		e;
   1850 	uint32_t		server_incn_orig = get_server_incn();
   1851 
   1852 	fs::fobjid_t		fobjid(ufidp->fid_len, ufidp->fid_len,
   1853 					(uint8_t *)ufidp->fid_data, false);
   1854 	//
   1855 	// If the new client is not used, unreference will clean it up.
   1856 	//
   1857 	fobj_client_impl	*client1p = new fobj_client_impl;
   1858 	fobj_client_impl	*client2p;
   1859 
   1860 	PXFS_VER::fobj_client_ptr	client1_p = client1p->get_objref();
   1861 
   1862 	PXFS_VER::fobj_client_ptr	client2_p =
   1863 	    PXFS_VER::fobj_client::_nil();
   1864 
   1865 	//
   1866 	// Contact the server to get the file object for the specified FID
   1867 	//
   1868 	fs_fsobj->getfobj(fobjid, fobj_v, fobjinfo, binfo,
   1869 	    client1_p, client2_p, e);
   1870 
   1871 	sol::error_t	error = pxfslib::get_err(e);
   1872 	if (error == 0) {
   1873 		if (CORBA::is_nil(client2_p) || client1_p->_equiv(client2_p)) {
   1874 			//
   1875 			// This means we are creating a new proxy vnode
   1876 			//
   1877 			if (CORBA::is_nil(client2_p)) {
   1878 				//
   1879 				// Client side caching is not used
   1880 				//
   1881 				client1p = NULL;
   1882 
   1883 				*vnodepp = get_pxfobj(fobj_v, fobjinfo, &binfo,
   1884 				    client1p);
   1885 			} else {
   1886 				//
   1887 				// Client side caching is used
   1888 				//
   1889 				// Do not allow the server to change while
   1890 				// in the middle of creating a proxy vnode
   1891 				//
   1892 				server_incn_lock.rdlock();
   1893 				if (server_incn_orig == get_server_incn()) {
   1894 					*vnodepp = get_pxfobj(fobj_v, fobjinfo,
   1895 					    &binfo, client1p);
   1896 					server_incn_lock.unlock();
   1897 				} else {
   1898 					//
   1899 					// The registration was orphanned by
   1900 					// either a failover or switchover
   1901 					//
   1902 					server_incn_lock.unlock();
   1903 
   1904 					error = connect_again(vnodepp,
   1905 					    fobj_v, fobjinfo, binfo,
   1906 					    client1p, client1_p, e);
   1907 				}
   1908 			}
   1909 			if (error == 0 && *vnodepp == NULL) {
   1910 				//
   1911 				// Although UFS returns a NULL vnode pointer
   1912 				// and error == 0,
   1913 				// the right thing to do is return EINVAL.
   1914 				//
   1915 				error = EINVAL;
   1916 			}
   1917 		} else {
   1918 			//
   1919 			// There already is a proxy vnode
   1920 			//
   1921 			client2p = (fobj_client_impl *)
   1922 			    (client2_p->_handler()->get_cookie());
   1923 
   1924 			//
   1925 			// The current client may not be ready
   1926 			//
   1927 			if (!client2p->wait_till_ok()) {
   1928 				//
   1929 				// The current client is going away
   1930 				//
   1931 				CORBA::release(client2_p);
   1932 				client2_p = PXFS_VER::fobj_client::_nil();
   1933 
   1934 				error = connect_again(vnodepp,
   1935 				    fobj_v, fobjinfo, binfo,
   1936 				    client1p, client1_p, e);
   1937 			} else {
   1938 				//
   1939 
   1940 				// Existing client has completed initialization
   1941 				// and has a hold placed upon it.
   1942 				//
   1943 				*vnodepp =
   1944 				    pxnode::PXTOV(client2p->get_pxfobjplusp());
   1945 			}
   1946 		}
   1947 	} else {
   1948 		//
   1949 		// This is so that NFS server returns stale file handle error
   1950 		// to the client. Current implementation checks for error or
   1951 		// NULL vnode pointer, but we set both to make sure we
   1952 		// behave the same way as UFS does.
   1953 		//
   1954 		*vnodepp = NULL;
   1955 	}
   1956 	CORBA::release(client1_p);
   1957 	CORBA::release(client2_p);
   1958 	return (error);
   1959 }
   1960 
   1961 //
   1962 // connect_again - The fobj_client originally given by the server went away.
   1963 // This method contacts the server and connects again with a new fobj_client.
   1964 //
   1965 int
   1966 pxvfs::connect_again(vnode **vnodepp,
   1967     PXFS_VER::fobj_ptr fobj_p,
   1968     PXFS_VER::fobj_info &fobjinfo,
   1969     PXFS_VER::bind_info &binfo,
   1970     fobj_client_impl *client1p,
   1971     PXFS_VER::fobj_client_ptr client1_p,
   1972     Environment &e)
   1973 {
   1974 	fobj_client_impl		*client2p;
   1975 
   1976 	PXFS_VER::fobj_client_ptr	client2_p =
   1977 	    PXFS_VER::fobj_client::_nil();
   1978 
   1979 	uint32_t		server_incn_orig;
   1980 
   1981 	//
   1982 	// Keep trying until we succeed or encounter a fatal error
   1983 	//
   1984 	while (true) {
   1985 		server_incn_orig = get_server_incn();
   1986 
   1987 		((PXFS_VER::fobjplus_ptr)fobj_p)->
   1988 		    cache_new_client(binfo, client1_p, client2_p, e);
   1989 
   1990 		if (e.exception()) {
   1991 			//
   1992 			// The file server object could not be restored after a
   1993 			// node failure.
   1994 			//
   1995 			e.clear();
   1996 			*vnodepp = NULL;
   1997 			return (EIO);
   1998 		}
   1999 
   2000 		//
   2001 		// We only reconnect for those file objects supporting caching.
   2002 		//
   2003 		ASSERT(!CORBA::is_nil(client2_p));
   2004 		if (client1_p->_equiv(client2_p)) {
   2005 			//
   2006 			// This means we are creating a new proxy vnode
   2007 			//
   2008 			// Do not allow the server to change while in the
   2009 			// middle of creating a proxy vnode.
   2010 			//
   2011 			server_incn_lock.rdlock();
   2012 			if (server_incn_orig != get_server_incn()) {
   2013 				//
   2014 				// The registration was orphanned by a
   2015 				// failover or a switchover
   2016 				//
   2017 				server_incn_lock.unlock();
   2018 				continue;
   2019 			}
   2020 
   2021 			*vnodepp = get_pxfobj(fobj_p, fobjinfo, &binfo,
   2022 			    client1p);
   2023 
   2024 			server_incn_lock.unlock();
   2025 
   2026 			if (*vnodepp == NULL) {
   2027 				//
   2028 				// This error can happen on the root directory
   2029 				// when the proxy file system is being unmounted
   2030 				//
   2031 				return (EIO);
   2032 			} else {
   2033 				return (0);
   2034 			}
   2035 		} else {
   2036 			//
   2037 			// There already is a proxy vnode
   2038 			//
   2039 			client2p = (fobj_client_impl *)
   2040 			    (client2_p->_handler()->get_cookie());
   2041 
   2042 			//
   2043 			// The current client may not be ready
   2044 			//
   2045 			if (client2p->wait_till_ok()) {
   2046 				//
   2047 				// Existing client has completed initialization
   2048 				// and has a hold placed upon it.
   2049 				//
   2050 				*vnodepp =
   2051 				    pxnode::PXTOV(client2p->get_pxfobjplusp());
   2052 
   2053 				return (0);
   2054 			}
   2055 
   2056 			//
   2057 			// The specified client is going away. Try again.
   2058 			//
   2059 			CORBA::release(client2_p);
   2060 			client2_p = PXFS_VER::fobj_client::_nil();
   2061 		}
   2062 	}
   2063 }
   2064 
//
// swapvp - Always fails with ENOSYS.
// Neither of the (unnamed) arguments is examined.
//
int
pxvfs::swapvp(vnode **, char *)
{
	return (ENOSYS);
}
   2070 
   2071 //
   2072 // get_pxfobj - Find or create a proxy vnode for the given fobj.
   2073 // The vnode is returned held and the caller should call VN_RELE()
   2074 // when finished using the vnode.
   2075 //
   2076 // The routine guarantees that there's at most one proxy vnode
   2077 // on a given client for a given fobj.
   2078 //
   2079 // Warning: this method assumes that the fobj belongs to the file system
   2080 // it is invoked on, and will create a proxy vnode for the fobj whether or not
   2081 // the fobj is in fact owned by the fs.
   2082 //
   2083 // Any proxy vnode that supports caching will have a fobj_client,
   2084 // and must have a non-null 'binfop'. The reverse is not true.
   2085 // For example, a lookup on a device will arrive here with a non-null binfop
   2086 // and no fobj_client. The unpack_vnode() method will call this method
   2087 // and provide null values for both binfop and the fobj_client.
   2088 //
   2089 vnode *
   2090 pxvfs::get_pxfobj(PXFS_VER::fobj_ptr fobjp, const PXFS_VER::fobj_info &fobjinfo,
   2091     PXFS_VER::bind_info *binfop, fobj_client_impl *clientp)
   2092 {
   2093 	pxfobj	*pxfobjp;
   2094 
   2095 	ASSERT(!CORBA::is_nil(fobjp));
   2096 
   2097 	//
   2098 	// During unmount, we need to be able to say that all vnodes are
   2099 	// inactive. Most new vnodes are created via directory operations
   2100 	// which are locked out when vn_vfswlock() is called on the mount
   2101 	// point. However, VFS_GET() (vget() above) and pxfobj::unpack_vnode()
   2102 	// are not blocked and so we have this code here to prevent new
   2103 	// pxfobj's from being created until unmount either succeeds or
   2104 	// fails completely.
   2105 	// Note: if we make it past the unlock, unmount() will return EBUSY
   2106 	//
   2107 	flags_lock.lock();
   2108 	while (flags & PXFS_UNMOUNTING) {
   2109 		flags |= PXFS_FILE_ACTIVATE;
   2110 		flags_cv.wait(&flags_lock);
   2111 	}
   2112 	if (flags & PXFS_UNMOUNTED) {
   2113 		flags_lock.unlock();
   2114 		return (NULL);
   2115 	}
   2116 	active_cnt++;
   2117 	flags_lock.unlock();
   2118 
   2119 	// Allocate a new proxy vnode with a reference count of one.
   2120 	pxfobjp = make_pxfobj(fobjp, fobjinfo, clientp);
   2121 
   2122 	if (clientp != NULL) {
   2123 		ASSERT(binfop != NULL);
   2124 
   2125 		switch (binfop->_d()) {
   2126 		case PXFS_VER::bt_fobj: {
   2127 			//
   2128 			// Only a proxy vnode of this type caches information
   2129 			//
   2130 			pxfobjplus	*pxfobjplusp = (pxfobjplus *)pxfobjp;
   2131 
   2132 			//
   2133 			// Initialize the attribute cache with
   2134 			// the data returned.
   2135 			//
   2136 			pxfobjplusp->install_attr(binfop->_u.bind_fobj.attr,
   2137 			    binfop->_u.bind_fobj.rights);
   2138 
   2139 			// Initialize the cachedata flag when appropriate
   2140 			pxfobjplusp->install_cachedata_flag(
   2141 			    binfop->_u.bind_fobj.cachedata);
   2142 
   2143 			//
   2144 			// Initialize the link from the proxy vnode object to
   2145 			// the fobj_client
   2146 			//
   2147 			ASSERT(clientp != NULL);
   2148 			pxfobjplusp->set_client(clientp);
   2149 
   2150 			//
   2151 			// Initialize the link from the fobj_client to the
   2152 			// proxy vnode.
   2153 			//
   2154 			// This must be the last part of initialization,
   2155 			// because waiting threads are unblocked.
   2156 			//
   2157 			clientp->set_pxfobjplus(pxfobjplusp);
   2158 
   2159 			break;
   2160 		}
   2161 
   2162 		default:
   2163 			//
   2164 			// No information is provided to the client
   2165 			//
   2166 			ASSERT(0);
   2167 			break;
   2168 		}
   2169 	}
   2170 	// Insert the new proxy vnode into the hash list of all proxy vnodes.
   2171 	pxfobj	*pxfobj2p = fobjhash_insert(pxfobjp);
   2172 	return (pxnode::PXTOV(pxfobj2p));
   2173 }
   2174 
   2175 //
   2176 // make_pxfobj
   2177 // This is called from get_pxfobj() to actually make a new proxy fobj object.
   2178 //
   2179 // Note: it should only do memory allocation and initialization for the
   2180 // proxy fobj. Other file systems can replace this default implementation
   2181 // and therefore common pxfobj initialization should go in get_pxfobj()
   2182 // instead.
   2183 //
   2184 // Proxy vnodes are created with a reference count of one.
   2185 //
   2186 pxfobj *
   2187 pxvfs::make_pxfobj(PXFS_VER::fobj_ptr fobjp,
   2188     const PXFS_VER::fobj_info &fobjinfo,
   2189     fobj_client_impl *clientp)
   2190 {
   2191 	pxfobj	*pxp;
   2192 
   2193 	ASSERT(!CORBA::is_nil(fobjp));
   2194 
   2195 	//
   2196 	// XXX - new() may lead to a deadlock if memory is exhausted
   2197 	//	 See the NFS nnode management code how deadlock
   2198 	//	 can be prevented.
   2199 	//
   2200 
   2201 	switch (fobjinfo.ftype) {
   2202 	case PXFS_VER::fobj_io: {
   2203 		PXFS_VER::io_var	iop = PXFS_VER::io::_narrow(fobjp);
   2204 		ASSERT(!CORBA::is_nil(iop));
   2205 		pxp = new pxchr(fs_vfs, iop, fobjinfo);
   2206 		break;
   2207 	}
   2208 	case PXFS_VER::fobj_file: {
   2209 		PXFS_VER::file_var	filep = PXFS_VER::file::_narrow(fobjp);
   2210 		ASSERT(!CORBA::is_nil(filep));
   2211 		pxp = new pxreg(clientp, fs_vfs, filep, fobjinfo);
   2212 		break;
   2213 	}
   2214 	case PXFS_VER::fobj_unixdir: {
   2215 		PXFS_VER::unixdir_var	udp = PXFS_VER::unixdir::_narrow(fobjp);
   2216 		ASSERT(!CORBA::is_nil(udp));
   2217 		pxp = new pxdir(clientp, fs_vfs, udp, fobjinfo);
   2218 		break;
   2219 	}
   2220 	case PXFS_VER::fobj_symbolic_link: {
   2221 		PXFS_VER::symbolic_link_var	linkp =
   2222 		    PXFS_VER::symbolic_link::_narrow(fobjp);
   2223 		ASSERT(!CORBA::is_nil(linkp));
   2224 		pxp = new pxlink(clientp, fs_vfs, linkp, fobjinfo);
   2225 		break;
   2226 	}
   2227 	case PXFS_VER::fobj_special: {
   2228 		PXFS_VER::special_var	spp = PXFS_VER::special::_narrow(fobjp);
   2229 		ASSERT(!CORBA::is_nil(spp));
   2230 		pxp = new pxspecial(fs_vfs, spp, fobjinfo);
   2231 		break;
   2232 	}
   2233 	case PXFS_VER::fobj_sobj:
   2234 	case PXFS_VER::fobj_fobj:
   2235 	case PXFS_VER::fobj_device:
   2236 	case PXFS_VER::fobj_procfile:
   2237 	default:
   2238 		os::panic("pxvfs:make_pxfobj unsupported fobj type %d",
   2239 			fobjinfo.ftype);
   2240 		// NOTREACHED
   2241 	}
   2242 
   2243 	return (pxp);
   2244 }
   2245 
   2246 //
   2247 // fobjhash_insert
   2248 // Insert a new (held) pxfobj into the hash table of all active pxfobj's.
   2249 // If a preexisting pxfobj is found, return it (held) instead.
   2250 // The caller is responsible for calling VN_RELE() on the associated vnode;
   2251 // The caller should not use or release the pointer passed to this
   2252 // routine.
   2253 //
   2254 pxfobj *
   2255 pxvfs::fobjhash_insert(pxfobj *new_pxfobjp)
   2256 {
   2257 	PXFS_VER::fobj_ptr	fobjp = new_pxfobjp->getfobj();
   2258 	pxfobj			*pxfobjp;
   2259 
   2260 	//
   2261 	// Search the hash table of all fobj objects to see if there is
   2262 	// already a proxy for 'fobjp'. We delete the one just
   2263 	// created if there is a duplicate since this will hold the
   2264 	// pxfobj hash lock for less time and has fewer locking issues
   2265 	// than trying to lock, search, create pxfobj, unlock.
   2266 	// The assumption is that deleting a duplicate does not
   2267 	// happen very often.
   2268 	//
   2269 	uint_t		b_idx = pxfs_misc::hash_devt_fid(
   2270 	    (new_pxfobjp->get_vp())->v_vfsp->vfs_dev,
   2271 	    new_pxfobjp->get_fidp(), pxfobjhsz);
   2272 
   2273 	pxfobj_hash_bkt	&hbkt = pxfobj_hash[b_idx];
   2274 
   2275 	hbkt.hlock.lock();
   2276 	pxfobj_list_t::ListIterator iter(hbkt.hlist);
   2277 
   2278 	for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) {
   2279 		if (fobjp->_equiv(pxfobjp->getfobj())) {
   2280 			//
   2281 			// We found an existing pxfobj.
   2282 			// We do a hold before releasing the hash lock so
   2283 			// that pxfobj_inactive() will see that we have
   2284 			// reclaimed the vnode.
   2285 			//
   2286 			VN_HOLD(pxnode::PXTOV(pxfobjp));
   2287 			hbkt.hlock.unlock();
   2288 
   2289 			//
   2290 			// Since we didn't use the new pxfobj, we have to
   2291 			// adjust the active count. Objects in the hash
   2292 			// table will be accounted for when pxfobj_inactive()
   2293 			// removes it from the hash table.
   2294 			//
   2295 			flags_lock.lock();
   2296 			ASSERT(active_cnt >= 2);
   2297 			active_cnt--;
   2298 			flags_lock.unlock();
   2299 			VN_RELE(pxnode::PXTOV(new_pxfobjp));
   2300 			return (pxfobjp);
   2301 		}
   2302 	}
   2303 
   2304 	hbkt.hlist_cnt++;
   2305 
   2306 	//
   2307 	// We expect the most recently created proxy file object
   2308 	// to be the most likely one to be used next.
   2309 	// So we put the newly created one at the front of the list.
   2310 	//
   2311 	new_pxfobjp->set_inhashtable();
   2312 	hbkt.hlist.prepend(new_pxfobjp);
   2313 
   2314 	hbkt.hlock.unlock();
   2315 
   2316 	PXFS_KSTATS(node_stats, ((KSTAT_NAMED_PTR(node_stats))
   2317 	    [PXVFS_NODE_STATS_NUM_OPEN_FILES].value.ui32++));
   2318 	PXFS_KSTATS(stats, ((KSTAT_NAMED_PTR(stats))
   2319 	    [PXVFS_STATS_NUM_OPEN_FILES].value.ui32++));
   2320 
   2321 	return (new_pxfobjp);
   2322 }
   2323 
   2324 //
   2325 // add_inactivelist - queue this proxy vnode for processing
   2326 // to become stale.
   2327 //
   2328 void
   2329 pxvfs::add_inactivelist(pxfobjplus *pxfobjplusp)
   2330 {
   2331 	flags_lock.lock();
   2332 
   2333 	//
   2334 	// Add ourself to the list of inactive proxy vnodes
   2335 	//
   2336 	inactive_list.append((inactive_list_elem *)pxfobjplusp);
   2337 	inactive_list_cnt++;
   2338 
   2339 	if ((flags & PXFS_TASK_QUEUED) == 0) {
   2340 		//
   2341 		// Prevent pxvfs from being destroyed until work completes.
   2342 		//
   2343 		VFS_HOLD(fs_vfs);
   2344 
   2345 		//
   2346 		// Since the system is not already scheduled
   2347 		// to clean up inactive proxy vnodes,
   2348 		// make the system clean up inactive proxy vnodes.
   2349 		//
   2350 		flags |= PXFS_TASK_QUEUED;
   2351 		pxvfs_inactive_threadpool::the().defer_processing(this);
   2352 	}
   2353 	flags_lock.unlock();
   2354 }
   2355 
   2356 //
   2357 // empty_inactive_list - Drain the inactive proxy vnode list.
   2358 // If someone is waiting, this method processes all inactive proxy vnodes.
   2359 // Otherwise, process a limited number of inactive proxy vnodes for
   2360 // this file system, because we want to allow the worker thread to clean up
   2361 // other file systems.
   2362 //
   2363 void
   2364 pxvfs::empty_inactive_list()
   2365 {
   2366 	ASSERT(cluster_fs_drain_queue_len > 0);
   2367 	ASSERT(flags_lock.lock_held());
   2368 
   2369 	pxfobj	*pxfobjp;
   2370 	int	drain_count = 0;
   2371 
   2372 	//
   2373 	// If a forced unmount operation is waiting, we will drain only
   2374 	// a limited amount - but at least what is currently on the list.
   2375 	//
   2376 	if ((flags & PXFS_FORCE_UNMOUNTING) && (flags & PXFS_INACTIVE_WAIT)) {
   2377 		drain_count = - (int)inactive_list_cnt;
   2378 	}
   2379 
   2380 	while ((pxfobjp = inactive_list.reapfirst()) != NULL) {
   2381 		flags_lock.unlock();
   2382 		pxfobjp->cleanup_proxy_vnode();
   2383 		flags_lock.lock();
   2384 
   2385 		//
   2386 		// Another thread might think the pxvfs object has no
   2387 		// work left to clean up inactive proxy vnodes when
   2388 		// this count is zero. In which case it might destroy
   2389 		// this object. Therefore must decrement this count after
   2390 		// all the work has been done and while holding the flags_lock.
   2391 		//
   2392 		ASSERT(inactive_list_cnt != 0);
   2393 		--inactive_list_cnt;
   2394 
   2395 		//
   2396 		// Only do a limited amount of processing for each file
   2397 		// system by the reaper unless someone is waiting.
   2398 		//
   2399 		if (++drain_count >= cluster_fs_drain_queue_len &&
   2400 		    (((flags & PXFS_INACTIVE_WAIT) == 0) ||
   2401 		    (flags & PXFS_FORCE_UNMOUNTING))) {
   2402 			break;
   2403 		}
   2404 	}
   2405 	//
   2406 	// If the inactive list is empty or a forced unmount is being attempted,
   2407 	// then wake up any other thread that is waiting for proxy vnodes
   2408 	// to be cleaned up.
   2409 	//
   2410 	if ((inactive_list_cnt == 0 || flags & PXFS_FORCE_UNMOUNTING) &&
   2411 	    ((flags & PXFS_INACTIVE_WAIT) != 0)) {
   2412 		//
   2413 		// Wake up any other thread that
   2414 		// is waiting for proxy vnodes to be cleaned up.
   2415 		//
   2416 		flags &= ~PXFS_INACTIVE_WAIT;
   2417 		flags_cv.broadcast();
   2418 	}
   2419 }
   2420 
   2421 //
   2422 // If this function is called in support of a normal unmount, and the
   2423 // filesystem is busy (non-zero active_cnt), then return immediately.
   2424 // If this is not for an unmount or if the unmount is forced, then wait
   2425 // for processing of the inactive vnode list.
   2426 //
   2427 // The return value is only significant for normal unmount operations
   2428 // (is_unmount == true and forced_umount = false). For this case, a
   2429 // value of true is returned if the filesystem is busy. For all other
   2430 // cases, false is returned.
   2431 //
   2432 // If filesystem does not have active files, set a flag to prevent new
   2433 // vnodes from being created (see get_pxfobj()).
   2434 //
   2435 bool
   2436 pxvfs::wait_empty_inactive_list(bool forced_unmount)
   2437 {
   2438 	int	try_count;
   2439 
   2440 	flags_lock.lock();
   2441 	if (!forced_unmount) {
   2442 		//
   2443 		// The only thing holding a vnode active may be something
   2444 		// transient, like an asynchronous write. For a regular unmount
   2445 		// allow a little more time for transient stuff to complete.
   2446 		//
   2447 		try_count = 6;
   2448 	} else {
   2449 		try_count = 1;
   2450 	}
   2451 
   2452 	for (; try_count > 0; try_count--) {
   2453 		// Wait for inactive proxy vnode processing to finish.
   2454 		while (inactive_list_cnt != 0) {
   2455 			flags |= PXFS_INACTIVE_WAIT;
   2456 			flags_cv.wait(&flags_lock);
   2457 			if (forced_unmount) {
   2458 				break;
   2459 			}
   2460 		}
   2461 		if (active_cnt != 0 && try_count > 1) {
   2462 			flags_lock.unlock();
   2463 			// Sleep for 1 second
   2464 			os::usecsleep((os::usec_t)1000000);
   2465 			flags_lock.lock();
   2466 		}
   2467 	}
   2468 	//
   2469 	// Check to see if any proxy vnodes are still in use.
   2470 	//
   2471 	bool	in_use;
   2472 	if (forced_unmount) {
   2473 		in_use = false;
   2474 	} else {
   2475 		in_use = (active_cnt != 0);
   2476 		if (in_use) {
   2477 			flags &= ~PXFS_UNMOUNTING;
   2478 			if (flags & PXFS_FILE_ACTIVATE) {
   2479 				flags &= ~PXFS_FILE_ACTIVATE;
   2480 				flags_cv.broadcast();
   2481 			}
   2482 		}
   2483 	}
   2484 	flags_lock.unlock();
   2485 	return (in_use);
   2486 }
   2487 
   2488 //
   2489 // pxfobj_inactive
   2490 // This is called when the last vnode reference to a proxy vnode
   2491 // is being released (i.e., the last call to VN_RELE() with v_count == 1).
   2492 //
   2493 // This supports proxy vnodes that never have dirty information.
   2494 // Thus there are no callbacks from the server on these proxy vnodes.
   2495 // This kind of proxy vnode can only be found through the hash table.
   2496 //
   2497 void
   2498 pxvfs::pxfobj_inactive(pxfobj *pxfobjp)
   2499 {
   2500 	//
   2501 	// Check to see if we are in the hash table.
   2502 	// If we are not, it is because we lost the race in
   2503 	// fobjhash_insert(). Once this kind of proxy vnode enters
   2504 	// the hash table, the proxy vnode remains there until destroyed.
   2505 	//
   2506 	if (pxfobjp->is_inhashtable()) {
   2507 		uint_t		b_idx = pxfs_misc::hash_devt_fid(
   2508 				    (pxfobjp->get_vp())->v_vfsp->vfs_dev,
   2509 				    pxfobjp->get_fidp(), pxfobjhsz);
   2510 
   2511 		pxfobj_hash_bkt		&hbkt = pxfobj_hash[b_idx];
   2512 		hbkt.hlock.lock();
   2513 
   2514 		vnode_t		*vnodep = pxfobjp->get_vp();
   2515 
   2516 		//
   2517 		// Check to see if we are still inactive (not reclaimed).
   2518 		//
   2519 		mutex_enter(&vnodep->v_lock);
   2520 		if (vnodep->v_count > 1) {
   2521 			//
   2522 			// We were reclaimed by vget() or fobjhash_insert().
   2523 			// Account for the missing decrement in vn_rele().
   2524 			//
   2525 			vnodep->v_count--;
   2526 			mutex_exit(&vnodep->v_lock);
   2527 			hbkt.hlock.unlock();
   2528 			return;
   2529 		}
   2530 		mutex_exit(&vnodep->v_lock);
   2531 
   2532 		//
   2533 		// We are now committed to releasing the vnode.
   2534 		//
   2535 		bool	removed = hbkt.hlist.erase(pxfobjp);
   2536 		CL_PANIC(removed);
   2537 
   2538 		ASSERT(hbkt.hlist_cnt != 0);
   2539 		hbkt.hlist_cnt--;
   2540 
   2541 		hbkt.hlock.unlock();
   2542 		flags_lock.lock();
   2543 		ASSERT(active_cnt != 0);
   2544 		active_cnt--;
   2545 		pxfobjp->not_inhashtable();
   2546 		flags_lock.unlock();
   2547 
   2548 		PXFS_KSTATS(node_stats, ((KSTAT_NAMED_PTR(node_stats))
   2549 		    [PXVFS_NODE_STATS_NUM_OPEN_FILES].value.ui32--));
   2550 		PXFS_KSTATS(stats, ((KSTAT_NAMED_PTR(stats))
   2551 		    [PXVFS_STATS_NUM_OPEN_FILES].value.ui32--));
   2552 	}
   2553 	pxfobjp->set_stale();
   2554 	delete pxfobjp;
   2555 }
   2556 
   2557 //
   2558 // pxfobjplus_inactive
   2559 // Dirty cached information has already been flushed to the server.
   2560 //
   2561 // If the proxy vnode is not in use, remove the proxy vnode from the
   2562 // hash table.
   2563 //
   2564 // Return Result True = the proxy vnode is not in the hash table
   2565 bool
   2566 pxvfs::pxfobjplus_inactive(pxfobjplus *pxfobjplusp)
   2567 {
   2568 	//
   2569 	// The proxy vnode may never have entered the hash table.
   2570 	//
   2571 	if (pxfobjplusp->is_inhashtable()) {
   2572 		//
   2573 		// The proxy vnode is in the hash table
   2574 		//
   2575 		uint_t	b_idx = pxfs_misc::hash_devt_fid(
   2576 			    (pxfobjplusp->get_vp())->v_vfsp->vfs_dev,
   2577 			    pxfobjplusp->get_fidp(), pxfobjhsz);
   2578 
   2579 		pxfobj_hash_bkt		&hbkt = pxfobj_hash[b_idx];
   2580 		hbkt.hlock.lock();
   2581 
   2582 		vnode_t		*vnodep = pxfobjplusp->get_vp();
   2583 
   2584 		//
   2585 		// Check to see if we are still inactive (not reclaimed).
   2586 		//
   2587 		mutex_enter(&vnodep->v_lock);
   2588 		if (vnodep->v_count > 1) {
   2589 			//
   2590 			// The proxy vnode was reclaimed.
   2591 			// Account for the missing decrement in vn_rele().
   2592 			//
   2593 			vnodep->v_count--;
   2594 			mutex_exit(&vnodep->v_lock);
   2595 			hbkt.hlock.unlock();
   2596 			return (false);
   2597 		}
   2598 		mutex_exit(&vnodep->v_lock);
   2599 
   2600 		//
   2601 		// We are now committed to releasing the vnode.
   2602 		//
   2603 
   2604 		bool	removed = hbkt.hlist.erase((pxfobj *)pxfobjplusp);
   2605 		CL_PANIC(removed);
   2606 
   2607 		ASSERT(hbkt.hlist_cnt != 0);
   2608 		hbkt.hlist_cnt--;
   2609 
   2610 		hbkt.hlock.unlock();
   2611 		flags_lock.lock();
   2612 		ASSERT(active_cnt != 0);
   2613 		active_cnt--;
   2614 		flags_lock.unlock();
   2615 		pxfobjplusp->not_inhashtable();
   2616 
   2617 		PXFS_KSTATS(node_stats, ((KSTAT_NAMED_PTR(node_stats))
   2618 		    [PXVFS_NODE_STATS_NUM_OPEN_FILES].value.ui32--));
   2619 		PXFS_KSTATS(stats, ((KSTAT_NAMED_PTR(stats))
   2620 		    [PXVFS_STATS_NUM_OPEN_FILES].value.ui32--));
   2621 	}
   2622 	pxfobjplusp->set_stale();
   2623 	return (true);
   2624 }
   2625 
   2626 //
   2627 // purge_caches - is called to prepare a filesystem for unmount/removal.
   2628 // Return true if file system is still in use (active vnodes present),
   2629 // false otherwise. In case of a forced unmount, return false always.
   2630 //
   2631 bool
   2632 pxvfs::purge_caches(bool force_unmount, cred *credp)
   2633 {
   2634 	// Purge all DNLC entries for this vfs.
   2635 	(void) dnlc_purge_vfsp(fs_vfs, 0);
   2636 
   2637 	flags_lock.lock();
   2638 	//
   2639 	// Set PXFS_UNMOUNTING to block creation of new proxy vnodes.
   2640 	//
   2641 	flags |= PXFS_UNMOUNTING;
   2642 	if (force_unmount) {
   2643 		flags |= PXFS_FORCE_UNMOUNTING;
   2644 	}
   2645 
   2646 	if (fs_rootvp != NULL) {
   2647 		// Release the cached root directory vnode
   2648 		vnode_t		*vnodep = fs_rootvp;
   2649 		fs_rootvp = NULL;
   2650 		flags_lock.unlock();
   2651 		VN_RELE(vnodep);
   2652 	} else {
   2653 		flags_lock.unlock();
   2654 	}
   2655 
   2656 	//
   2657 	// For normal unmount sync all the data (shortcut to VFS_SYNC(fs_vfs)).
   2658 	// Any possiblity of hanging because of I/O problems is avoided for
   2659 	// forced unmount.
   2660 	//
   2661 	if (!force_unmount) {
   2662 		(void) sync(0, credp);
   2663 	}
   2664 
   2665 	//
   2666 	// Lock the vfs to maintain file system status quo during unmount.
   2667 	// This has to be done after sync(), because ufs_update tries
   2668 	// to acquire the vfs_reflock. Thus we avoid deadlock in
   2669 	// traverse(), VFS_ROOT(), get_pxfobj().
   2670 	//
   2671 	vfs_lock_wait(fs_vfs);
   2672 
   2673 	//
   2674 	// Forced Unmount Case - make one pass at processing the inactive list,
   2675 	// and the return value is always false.
   2676 	//
   2677 	// Otherwise wait for the inactive list to be processed,
   2678 	// and the return value will be true if there are still active
   2679 	// proxy vnodes.
   2680 	//
   2681 	if (wait_empty_inactive_list(force_unmount)) {
   2682 		vfs_unlock(fs_vfs);
   2683 
   2684 		PXFS_DBPRINTF(
   2685 		    PXFS_TRACE_PXVFS,
   2686 		    PXFS_AMBER,
   2687 		    ("pxvfs:purge_caches(%p) active_cnt %d\n",
   2688 		    this, active_cnt));
   2689 
   2690 		return (true);
   2691 	} else {
   2692 		return (false);
   2693 	}
   2694 }
   2695 
   2696 //
   2697 // Called to flush the filesystem's dirty data.
   2698 // All the files is the hash bucket are flushed.
   2699 // Returns non-zero if an error is encountered; zero otherwise.
   2700 //
   2701 // revoke : This is set to true if called from revoke_allocation().
   2702 //    In which case we call sync_file_revoke() instead of sync_file().
   2703 //
   2704 int
   2705 pxvfs::sync_filesystem(cred *credp, bool revoke)
   2706 {
   2707 	int	error = 0;
   2708 	int	ret = 0;
   2709 	pxfobj	*pxfobjp;
   2710 	pxfobj	*prevp = NULL;
   2711 	int	file_count = 0;
   2712 
   2713 	for (uint_t idx = 0; idx < pxfobjhsz; idx++) {
   2714 		//
   2715 		// Make sure the iterator is initialized while holding the
   2716 		// lock.
   2717 		//
   2718 		pxfobj_hash[idx].hlock.lock();
   2719 		pxfobj_list_t::ListIterator iter(pxfobj_hash[idx].hlist);
   2720 		for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) {
   2721 
   2722 			ASSERT(pxfobjp->is_inhashtable());
   2723 
   2724 			//
   2725 			// Place a hold on the vnode so that it is not
   2726 			// released during proccessing.
   2727 			//
   2728 			VN_HOLD(pxnode::PXTOV(pxfobjp));
   2729 			pxfobj_hash[idx].hlock.unlock();
   2730 
   2731 			if (prevp != NULL) {
   2732 				//
   2733 				// It is possible that this operation will
   2734 				// render this proxy vnode inactive and expel
   2735 				// this proxy vnode from the hash table.
   2736 				//
   2737 				VN_RELE(pxnode::PXTOV(prevp));
   2738 			}
   2739 
   2740 			//
   2741 			// We skip those vnodes which don't belong to this
   2742 			// filesystem.
   2743 			//
   2744 			if ((pxnode::PXTOV(pxfobjp)->v_vfsp) != fs_vfs) {
   2745 				prevp = pxfobjp;
   2746 				pxfobj_hash[idx].hlock.lock();
   2747 				continue;
   2748 			}
   2749 
   2750 			//
   2751 			// Synchronously write out dirty data.
   2752 			// Wait for queued up async requests to complete.
   2753 			//
   2754 			if (revoke) {
   2755 				error = pxfobjp->sync_file_revoke();
   2756 			} else {
   2757 				error = pxfobjp->sync_file();
   2758 			}
   2759 
   2760 			if (error) {
   2761 				PXFS_DBPRINTF(
   2762 				    PXFS_TRACE_PXVFS,
   2763 				    PXFS_RED,
   2764 				    ("pxvfs::sync_filesystem(%p) sync_file(%p)"
   2765 				    " returned error %d \n",
   2766 				    this, pxfobjp, error));
   2767 
   2768 				// Return a non zero value.
   2769 				ret = 1;
   2770 			}
   2771 
   2772 			//
   2773 			// We cannot release the hold on this proxy vnode now
   2774 			// as the release can result in the vnode getting
   2775 			// expelled from the hash table. So we save a pointer
   2776 			// to the proxy vnode and do the release later.
   2777 			// Note: The release has to be done when the hlock is
   2778 			// not held.
   2779 			//
   2780 			prevp = pxfobjp;
   2781 
   2782 			//
   2783 			// This sync can starve other work.
   2784 			// So we throttle the sync.
   2785 			//
   2786 			file_count++;
   2787 			if (file_count >= sync_filesystem_throttle) {
   2788 				//
   2789 				// Allow other work. Sleep for 20 ms.
   2790 				//
   2791 				os::usecsleep((os::usec_t)(20000));
   2792 				file_count = 0;
   2793 			}
   2794 
   2795 			pxfobj_hash[idx].hlock.lock();
   2796 		}
   2797 		pxfobj_hash[idx].hlock.unlock();
   2798 	}
   2799 
   2800 	if (prevp != NULL) {
   2801 		//
   2802 		// Release the hold placed on the proxy vnode that was
   2803 		// processed last.
   2804 		//
   2805 		VN_RELE(pxnode::PXTOV(prevp));
   2806 	}
   2807 
   2808 	return (ret);
   2809 }
   2810 
   2811 //
   2812 // This is called by mount_client_impl::remove_notify() when the
   2813 // global unmount succeeds.
   2814 //
   2815 void
   2816 pxvfs::unmount_succeeded()
   2817 {
   2818 	// ASSERT(vfs_lock_held(fs_vfs));
   2819 	fs_vfs->vfs_flag |= VFS_UNMOUNTED;
   2820 
   2821 	//
   2822 	// Clear the PXFS_UNMOUNTING flag, set the PXFS_UNMOUNTED flag,
   2823 	// and wake up any get_pxfobj() sleepers.
   2824 	//
   2825 	flags_lock.lock();
   2826 	flags &= ~PXFS_UNMOUNTING;
   2827 	flags |= PXFS_UNMOUNTED;
   2828 	if (flags & PXFS_FILE_ACTIVATE) {
   2829 		flags &= ~PXFS_FILE_ACTIVATE;
   2830 		flags_cv.broadcast();
   2831 	}
   2832 	flags_lock.unlock();
   2833 
   2834 	fsmgr_client_implp->unmount_succeeded();
   2835 	fsmgr_client_implp = NULL;
   2836 
   2837 	//
   2838 	// Remove ourself from the all_pxvfs list (if we're there).
   2839 	// XXX how do we make sure that find_pxvfs() doesn't hand out a
   2840 	// pointer to us before we remove ourself from the list?
   2841 	//
   2842 	all_pxvfs_lock.wrlock();
   2843 	(void) all_pxvfs.erase((pxvfs_list_elem *)this);
   2844 	VFS_RELE(fs_vfs);	// Release list's hold on vfs_t
   2845 	all_pxvfs_lock.unlock();
   2846 
   2847 #ifdef	PXFS_KSTATS_ENABLED
   2848 	if (stats != NULL) {
   2849 		kstat_delete(stats);
   2850 	}
   2851 #endif
   2852 }
   2853 
   2854 //
   2855 // This is called by pxvfs::unmount() and mount_client_impl::unmount_failed()
   2856 // when a global unmount fails.
   2857 //
   2858 void
   2859 pxvfs::unmount_failed()
   2860 {
   2861 	// Clear the PXFS_UNMOUNTING flag and wake up any get_pxfobj() sleepers.
   2862 	flags_lock.lock();
   2863 	flags &= ~PXFS_UNMOUNTING;
   2864 	flags &= ~PXFS_FORCE_UNMOUNTING;
   2865 	if (flags & PXFS_FILE_ACTIVATE) {
   2866 		flags &= ~PXFS_FILE_ACTIVATE;
   2867 		flags_cv.broadcast();
   2868 	}
   2869 	flags_lock.unlock();
   2870 }
   2871 
   2872 //
   2873 // This routine is called to clean up if the file system server crashes.
   2874 // It is called from fsmgr_client_impl::_unreferenced() so it shouldn't
   2875 // take too much time.
   2876 //
   2877 void
   2878 pxvfs::cleanup()
   2879 {
   2880 	// Clean up sleeping locks.
   2881 	pxfs_llm_callback_impl *llmp;
   2882 	llm_cb_list_lock.lock();
   2883 	for (llm_cb_list.atfirst(); (llmp = llm_cb_list.get_current()) != NULL;
   2884 	    llm_cb_list.advance()) {
   2885 		llmp->signal(EIO);
   2886 	}
   2887 	llm_cb_list_lock.unlock();
   2888 }
   2889 
   2890 //
   2891 // new_file_system_primary - do processing needed when the system
   2892 // activates a new file system primary.
   2893 //
   2894 void
   2895 pxvfs::new_file_system_primary(uint32_t server_incarn, Environment &)
   2896 {
   2897 	//
   2898 	// The locking ensures that all in-progress client registrations
   2899 	// complete before changing the server incarnation.
   2900 	// This does not wait for invocations that have yet produced a reply.
   2901 	//
   2902 	server_incn_lock.wrlock();
   2903 	server_incn = server_incarn;
   2904 	server_incn_lock.unlock();
   2905 
   2906 	replay_sleeping_locks();
   2907 }
   2908 
   2909 //
   2910 // Replay all the sleeping locks that originated from this node.
   2911 //
   2912 void
   2913 pxvfs::replay_sleeping_locks()
   2914 {
   2915 	//
   2916 	// Walk through the list of callback objects, and wake them up with
   2917 	// 'RETRY_LOCK'.
   2918 	//
   2919 	pxfs_llm_callback_impl *llmp;
   2920 	llm_cb_list_lock.lock();
   2921 	for (llm_cb_list.atfirst(); (llmp = llm_cb_list.get_current()) != NULL;
   2922 	    llm_cb_list.advance()) {
   2923 		llmp->signal(pxfs_llm_callback_impl::RETRY_LOCK);
   2924 	}
   2925 	llm_cb_list_lock.unlock();
   2926 }
   2927 
   2928 //
   2929 // Insert callback object into list of callback objects.
   2930 //
   2931 void
   2932 pxvfs::insert_llm_cbobj(pxfs_llm_callback_impl *llmp)
   2933 {
   2934 	llm_cb_list_lock.lock();
   2935 	llm_cb_list.prepend(llmp);
   2936 	llm_cb_list_lock.unlock();
   2937 }
   2938 
   2939 //
   2940 // Remove callback object from list of callback objects.
   2941 //
   2942 void
   2943 pxvfs::remove_llm_cbobj(pxfs_llm_callback_impl *llmp)
   2944 {
   2945 	llm_cb_list_lock.lock();
   2946 	(void) llm_cb_list.erase(llmp);
   2947 	llm_cb_list_lock.unlock();
   2948 }
   2949 
   2950 //
   2951 // Calls made via pxfs/server/nlm_pxfs.cc when lockd on this node dies/restarts.
   2952 // When lockd dies, there are two calls, setting the status of the NLM locks
   2953 // from this node to 'FLK_NLM_SHUTTING_DOWN', and then 'FLK_NLM_DOWN'.  The
   2954 // first call interrupts all sleeping locks, and the second call discards all
   2955 // active locks.
   2956 //
   2957 // static
   2958 void
   2959 pxvfs::set_nlm_status(int32_t nlmid, PXFS_VER::nlm_status status)
   2960 {
   2961 	pxvfs		*pxvfsp;
   2962 	SList<pxvfs>	tmp_all_pxvfs;
   2963 
   2964 	//
   2965 	// Make a copy of 'all_pxvfs' so we don't have to lock/unlock
   2966 	// 'all_pxvfs'.
   2967 	//
   2968 	all_pxvfs_lock.rdlock();
   2969 	for (all_pxvfs.atfirst();
   2970 	    (pxvfsp = all_pxvfs.get_current()) != NULL;
   2971 	    all_pxvfs.advance()) {
   2972 		VFS_HOLD(pxvfsp->fs_vfs);
   2973 		tmp_all_pxvfs.prepend(pxvfsp);
   2974 	}
   2975 	all_pxvfs_lock.unlock();
   2976 
   2977 	// Call 'set_nlm_status' on each filesystem.
   2978 	Environment	e;
   2979 	while ((pxvfsp = tmp_all_pxvfs.reapfirst()) != NULL) {
   2980 		pxvfsp->get_fsobj()->set_nlm_status(nlmid, status, e);
   2981 		e.clear();
   2982 		VFS_RELE(pxvfsp->fs_vfs);
   2983 	}
   2984 }
   2985 
   2986 //
   2987 // Call made via pxfs/server/nlm_pxvfs.cc when statd on the client or server
   2988 // node dies and gets restarted.
   2989 //
   2990 // static
   2991 void
   2992 pxvfs::remove_file_locks(int32_t sysid)
   2993 {
   2994 	pxvfs		*pxvfsp;
   2995 	SList<pxvfs>	tmp_all_pxvfs;
   2996 
   2997 	//
   2998 	// Make a copy of 'all_pxvfs' so we don't have to lock/unlock
   2999 	// 'all_pxvfs'.
   3000 	//
   3001 	all_pxvfs_lock.rdlock();
   3002 	for (all_pxvfs.atfirst();
   3003 	    (pxvfsp = all_pxvfs.get_current()) != NULL;
   3004 	    all_pxvfs.advance()) {
   3005 		VFS_HOLD(pxvfsp->fs_vfs);
   3006 		tmp_all_pxvfs.prepend(pxvfsp);
   3007 	}
   3008 	all_pxvfs_lock.unlock();
   3009 
   3010 	// Call 'remove_file_locks' on each filesystem.
   3011 	Environment e;
   3012 	while ((pxvfsp = tmp_all_pxvfs.reapfirst()) != NULL) {
   3013 		pxvfsp->get_fsobj()->remove_file_locks(sysid, e);
   3014 		e.clear();
   3015 		VFS_RELE(pxvfsp->fs_vfs);
   3016 	}
   3017 }
   3018 
   3019 //
   3020 // find_pxvfs
   3021 // Return the pxvfs structure for a given PXFS file system object.
   3022 // Return NULL if the proxy could not be found or created.
   3023 // Otherwise, the pointer is returned held() and the caller should
   3024 // call rele() when finished using the pointer.
   3025 //
   3026 // If 'fsinfop' is not NULL and 'fsobj' does not already
   3027 // have a proxy, then use the file system info to create a new proxy.
   3028 //
   3029 pxvfs *
   3030 pxvfs::find_pxvfs(PXFS_VER::filesystem_ptr fsobj,
   3031     const PXFS_VER::fs_info *fsinfop)
   3032 {
   3033 	pxvfs		*pxvfsp;
   3034 	vfs_t		*vfsp;
   3035 	Environment	e;
   3036 
   3037 	//
   3038 	// The vast majority of calls will only need the read lock,
   3039 	// because the proxy file system vfs already exists.
   3040 	//
   3041 	all_pxvfs_lock.rdlock();
   3042 
   3043 	// Search the list of proxy file systems
   3044 	pxvfsp = search(fsobj);
   3045 	if (pxvfsp != NULL) {
   3046 		//
   3047 		// This proxy file system vfs exists
   3048 		//
   3049 		ASSERT((pxvfsp->flags & PXFS_UNMOUNTED) == 0);
   3050 		VFS_HOLD(pxvfsp->fs_vfs);
   3051 		all_pxvfs_lock.unlock();
   3052 		return (pxvfsp);
   3053 	}
   3054 
   3055 	//
   3056 	// If we aren't supposed to create a new proxy,
   3057 	// return NULL since we didn't find it.
   3058 	//
   3059 	if (fsinfop == NULL) {
   3060 		all_pxvfs_lock.unlock();
   3061 		return (NULL);
   3062 	}
   3063 
   3064 	//
   3065 	// Try to upgrade the read lock to a write lock. This can fail.
   3066 	//
   3067 	if (!all_pxvfs_lock.try_upgrade()) {
   3068 		// Lock upgrade attempt failed
   3069 		all_pxvfs_lock.unlock();
   3070 		all_pxvfs_lock.wrlock();
   3071 
   3072 		//
   3073 		// Normally nobody else will be attempting to
   3074 		// create this file system. But this code takes
   3075 		// the safe approach that will always work
   3076 		// in spite of orphan requests or anything else,
   3077 		// and the cost is minimal.
   3078 		//
   3079 		// Search the list of proxy file systems
   3080 		pxvfsp = search(fsobj);
   3081 		if (pxvfsp != NULL) {
   3082 			//
   3083 			// This proxy file system vfs exists
   3084 			//
   3085 			ASSERT((pxvfsp->flags & PXFS_UNMOUNTED) == 0);
   3086 			VFS_HOLD(pxvfsp->fs_vfs);
   3087 			all_pxvfs_lock.unlock();
   3088 			return (pxvfsp);
   3089 		}
   3090 	}
   3091 
   3092 	//
   3093 	// Set up the fsmgr_client/fsmgr_server connection.
   3094 	//
   3095 	fsmgr_client_impl		*clientmgrp = new fsmgr_client_impl();
   3096 	PXFS_VER::fsmgr_client_var	clientmgr = clientmgrp->get_objref();
   3097 
   3098 	FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_BIND_FS_B,
   3099 		FaultFunctions::generic);
   3100 
   3101 	uint32_t		server_incarn;
   3102 	uint32_t		fs_blk_size;
   3103 	bool			fastwrite_flag;
   3104 	PXFS_VER::fsmgr_server_ptr	servermgr_p = fsobj->
   3105 	    bind_fs(clientmgr, orb_conf::node_number(), server_incarn,
   3106 	    fs_blk_size, fastwrite_flag, e);
   3107 	if (e.exception()) {
   3108 		//
   3109 		// We ignore comm failures since this can happen
   3110 		// if a node with the pxfs server crashes and
   3111 		// then any node joins the global name space. The
   3112 		// mount server will try to create a proxy vfs
   3113 		// for the dead file system and link it into the
   3114 		// name space. We need this to succeed so that
   3115 		// the dead file system can be unmounted properly.
   3116 		//
   3117 		if (CORBA::COMM_FAILURE::_exnarrow(e.exception()) == NULL) {
   3118 #ifdef DEBUG
   3119 			e.exception()->print_exception(
   3120 			    "pxvfs::findpxvfs: ");
   3121 #endif
   3122 			MOUNT_DBPRINTF(
   3123 			    MOUNT_TRACE_CLIENT,
   3124 			    MOUNT_RED,
   3125 			    ("pxvfs:findpxvfs(): exception from"
   3126 			    " fsobj->bind_fs()\n"));
   3127 		} else {
   3128 			servermgr_p = PXFS_VER::fsmgr_server::_nil();
   3129 			MOUNT_DBPRINTF(
   3130 			    MOUNT_TRACE_CLIENT,
   3131 			    MOUNT_RED,
   3132 			    ("pxvfs:findpxvfs(): comm failure\n "));
   3133 		}
   3134 
   3135 		e.clear();
   3136 	}
   3137 
   3138 	//
   3139 	// Allocate a vfssw[] entry for the underlying file system type
   3140 	// but don't try to load the module.
   3141 	//
   3142 	RLOCK_VFSSW();
   3143 	struct vfssw	*vswp = vfs_getvfsswbyname((char *)fsinfop->fstype);
   3144 	if (vswp == NULL) {
   3145 		RUNLOCK_VFSSW();
   3146 		WLOCK_VFSSW();
   3147 		vswp = vfs_getvfsswbyname((char *)fsinfop->fstype);
   3148 		if (vswp == NULL)
   3149 			vswp = allocate_vfssw((char *)fsinfop->fstype);
   3150 		WUNLOCK_VFSSW();
   3151 		if (vswp == NULL) {
   3152 			all_pxvfs_lock.unlock();
   3153 			CORBA::release(servermgr_p);
   3154 			return (NULL);
   3155 		}
   3156 		RLOCK_VFSSW();
   3157 	}
   3158 	int	fstype = (int)(vswp - vfssw);
   3159 	RUNLOCK_VFSSW();
   3160 
   3161 	//
   3162 	// Create a new vfs.
   3163 	//
   3164 #if	SOL_VERSION >= __s11
   3165 	vfsp = vfs_alloc(KM_SLEEP);
   3166 #else
   3167 	vfsp = (vfs_t *)kmem_alloc(sizeof (vfs_t), KM_SLEEP);
   3168 #endif
   3169 	VFS_INIT(vfsp, pxfs_vfsopsp, (caddr_t)NULL);
   3170 	VFS_HOLD(vfsp);
   3171 	pxvfsp = new pxvfs(fsobj, clientmgrp, fsinfop, fstype, vfsp,
   3172 	    server_incarn);
   3173 
   3174 	pxvfsp->pxfs_bsize = fs_blk_size;
   3175 	pxvfsp->blocks_available = 0;
   3176 	pxvfsp->fastwrite = fastwrite_flag;
   3177 
   3178 	// Finish initializing the linkages between fsmgr client and server.
   3179 	clientmgrp->set_pxvfsp(pxvfsp, servermgr_p);
   3180 
   3181 	// Add ourself to the list of all PXFS file systems.
   3182 	all_pxvfs.prepend((pxvfs_list_elem *)pxvfsp);
   3183 	VFS_HOLD(vfsp);
   3184 	all_pxvfs_lock.unlock();
   3185 
   3186 	CORBA::release(servermgr_p);
   3187 	return (pxvfsp);
   3188 }
   3189 
   3190 //
   3191 // Search the all_pxvfs list for the given file system object.
   3192 // Return NULL if not found.
   3193 //
   3194 pxvfs *
   3195 pxvfs::search(PXFS_VER::filesystem_ptr fsobj)
   3196 {
   3197 	pxvfs	*p;
   3198 	pxvfs_list_t::ListIterator iter(all_pxvfs);
   3199 
   3200 	for (; (p = iter.get_current()) != NULL;
   3201 	    iter.advance()) {
   3202 		if (fsobj->_equiv(p->get_fsobj())) {
   3203 			return (p);
   3204 		}
   3205 	}
   3206 	return (NULL);
   3207 }
   3208 
   3209 //
   3210 // get_configured_nodes - obtains the nodes configured for this device.
   3211 // An important side effect is that DCS will start the device service
   3212 // if it is not already started.
   3213 //
   3214 int
   3215 pxvfs::get_configured_nodes(bool &dev_is_ha, CORBA::String_out dev_name,
   3216 	dev_t devid, sol::nodeid_seq_t_out nodes, Environment &e)
   3217 {
   3218 	int	error;
   3219 
   3220 #ifndef PSARC_2001_038 // no longer called with the lock held in Solaris 9
   3221 	RUNLOCK_VFSSW();
   3222 #endif
   3223 
   3224 	fs::dc_callback_var	callback =
   3225 	    mount_client_impl::get_server()->get_dc_callback(e);
   3226 
   3227 #ifndef PSARC_2001_038
   3228 	RLOCK_VFSSW();
   3229 #endif
   3230 
   3231 	if (e.exception()) {
   3232 		return (pxfslib::get_err(e));
   3233 	}
   3234 
   3235 	error = dcs_get_configured_nodes(devid, callback,
   3236 	    dev_is_ha, dev_name, nodes);
   3237 	//lint -e1746
   3238 
   3239 	return (error);
   3240 }
   3241 
   3242 void
   3243 pxvfs::disable_unmounts()
   3244 {
   3245 	unmounts_disabled = true;
   3246 }
   3247 
   3248 //
   3249 // memory_callback - the Memory Monitor executes this method whenever
   3250 // the memory state changes. This method purges all pxfs entries from
   3251 // the DNLC cache when the system is memory starved
   3252 //
   3253 // static
   3254 void
   3255 pxvfs::memory_callback(monitor::system_state_t state)
   3256 {
   3257 	switch (state) {
   3258 	case monitor::MEMORY_STARVED:
   3259 		//
   3260 		// PXFS enters its proxy vnode into the DNLC.
   3261 		// When PXFS is on top of UFS on the same node,
   3262 		// the DNLC on this node also contains UFS vnodes.
   3263 		// The UFS vnode entry in the DNLC is not removed
   3264 		// when the corresponding PXFS server file object goes away.
   3265 		// We need to purge both the client proxy vnode
   3266 		// and the server file system vnode.
   3267 		//
   3268 		// The DNLC is an optional performance enhancer.
   3269 		// Any file in active use will soon be entered again
   3270 		// into the DNLC.
   3271 		//
   3272 		// The DNLC method for removing a specific vnode
   3273 		// requires walking all of the DNLC entries.
   3274 		// So purge everything.
   3275 		//
   3276 		dnlc_purge();
   3277 
   3278 		PXFS_DBPRINTF(
   3279 		    PXFS_TRACE_PXVFS,
   3280 		    PXFS_AMBER,
   3281 		    ("pxvfs:memory_callback: state %d purged dnlc\n",
   3282 		    state));
   3283 		break;
   3284 	default:
   3285 		break;
   3286 	}
   3287 }
   3288 
   3289 //
   3290 // update_throughput() does bandwidth calculation. This method is called by
   3291 // every thread that does a successful page_out with the number of bytes
   3292 // transferred. In the case of async writes, it will be the aio callback
   3293 // that calls this method. After every re-calculation it signals all
   3294 // threads waiting for more bandwidth.
   3295 //
   3296 void
   3297 pxvfs::update_throughput(int bytes_xfrd)
   3298 {
   3299 	timespec_t	now = {0L, 0};
   3300 	uint64_t	current_rate;
   3301 	uint64_t	time_for_xfr;
   3302 
   3303 	gethrestime(&now);
   3304 
   3305 	//
   3306 	// If the current window started more than a second ago, store
   3307 	// current time, calculate new bytes per second rate and set bytes
   3308 	// available over next second to new data rate.
   3309 	//
   3310 	data_rate_lock.lock();
   3311 
   3312 	// increment total bytes transferred from window_start
   3313 	bytes_sent_in_second += bytes_xfrd;
   3314 
   3315 	//
   3316 	// The last througput update as less than a second ago, don't
   3317 	// recalculate data rate.
   3318 	//
   3319 	if (diff_timespec(window_start, now) < 1000) {
   3320 		data_rate_lock.unlock();
   3321 		return;
   3322 	}
   3323 
   3324 	gethrestime(&window_start);
   3325 
   3326 	//
   3327 	// Update recommended data rate
   3328 	//
   3329 	monitor::system_state_t sys_state = monitor::the().get_current_state();
   3330 	if (sys_state != monitor::MEMORY_PLENTY) {
   3331 		//
   3332 		// If memory is low we set the data rate to exactly the number
   3333 		// of bytes we committed in the last second.
   3334 		//
   3335 		data_rate = bytes_sent_in_second;
   3336 
   3337 		//
   3338 		// If less than configured minimum bytes were written and
   3339 		// memory state is not MEMORY_STARVED set data_rate to the
   3340 		// configured minimum data rate.
   3341 		//
   3342 		if (bytes_sent_in_second < data_rate_minimum &&
   3343 		    sys_state != monitor::MEMORY_STARVED) {
   3344 			data_rate = data_rate_minimum;
   3345 		}
   3346 	} else {
   3347 		//
   3348 		// If writers aren't using the available bandwidth, we
   3349 		// don't change data rate. Re-calculation happens only if
   3350 		// qouta in last second was exceeded. wait_for_bandwidth()
   3351 		// allows this over-run when there is plenty of memory.
   3352 		//
   3353 		if (bytes_written_in_second >= data_rate) {
   3354 			if (bytes_sent_in_second < data_rate_minimum) {
   3355 				//
   3356 				// Bytes written could be low because there
   3357 				// weren't enough writes in the last few
   3358 				// seconds. Ease off the throttling as we have
   3359 				// enough memory to accomodate dirty pages.
   3360 				// Set data rate to the configured minimum
   3361 				// instead of bytes_sent_in_second.
   3362 				//
   3363 				data_rate = data_rate_minimum;
   3364 			} else {
   3365 				//
   3366 				// Anticipate an increase by 6.25% over the
   3367 				// next second if current data-rate is more
   3368 				// than the configured default.
   3369 				//
   3370 				if (bytes_sent_in_second > data_rate_default) {
   3371 					data_rate = bytes_sent_in_second +
   3372 						    (bytes_sent_in_second/16);
   3373 				} else {
   3374 					//
   3375 					// If current data rate is below
   3376 					// configured default, we must get to
   3377 					// the default fast. We increase the
   3378 					// data rate at 50% every second.
   3379 					//
   3380 					data_rate = bytes_sent_in_second +
   3381 						    (bytes_sent_in_second/2);
   3382 				}
   3383 			}
   3384 		}
   3385 	}
   3386 
   3387 	// Reset per second accumulators..
   3388 	bytes_written_in_second = 0;
   3389 	bytes_sent_in_second = 0;
   3390 
   3391 	// ..and set byte quota for the current window.
   3392 	bytes_in_window = data_rate;
   3393 
   3394 	//
   3395 	// If there are no bytes available for writers to consume don't
   3396 	// wake anyone.
   3397 	//
   3398 	if (bytes_in_window != 0) {
   3399 		bandwidth_lock.lock();
   3400 		// Wakeup any thread waiting for bandwidth.
   3401 		bandwidth_cv.broadcast();
   3402 		bandwidth_lock.unlock();
   3403 	}
   3404 
   3405 	data_rate_lock.unlock();
   3406 }
   3407 
   3408 //
   3409 // 'throttle_monitor_thread' runs every second, updates the bandwidth and
   3410 // wakes up writers waiting for bandwidth. Without a per-second trigger,
   3411 // writers can wait indefinitely if there was no i/o scheduled.
   3412 //
   3413 void
   3414 pxvfs::throttle_monitor_thread(void *)
   3415 {
   3416 	while (true) {
   3417 		os::usecsleep(throttle_monitor_interval);
   3418 		//
   3419 		// Determine new bandwidth for this second with no bytes
   3420 		// transferred.
   3421 		//
   3422 		pxvfs::update_throughput(0);
   3423 	}
   3424 }
   3425 
   3426 // Create a new thread to monitor bandwidth per second.
   3427 int
   3428 pxvfs::launch_throttle_monitor_thread()
   3429 {
   3430 	//
   3431 	// Create a kernel thread in the SYS scheduling class.
   3432 	//
   3433 	if ((clnewlwp(throttle_monitor_thread,
   3434 			NULL, MINCLSYSPRI, NULL, NULL)) != 0) {
   3435 		return (-1);
   3436 	}
   3437 	return (0);
   3438 }
   3439 
   3440 //
   3441 // Called at modload time.
   3442 //
   3443 int
   3444 pxvfs::startup()
   3445 {
   3446 	//
   3447 	// The size of the pxfobj_hash table is computed in a way that
   3448 	// is similar to the way ufs calculates the size/max size of its
   3449 	// in-core inode hash table.
   3450 	//
   3451 	//lint -e64 -e419 -e712 -e747 -e534
   3452 	if (pxfobjhsz_max == 0) {
   3453 		pxfobjhsz_max =
   3454 		    (uint_t)1 << os::highbit((uint_t)ncsize / pxfobjh_len);
   3455 	}
   3456 
   3457 	if (pxfobjhsz == 0) {
   3458 		pxfobjhsz = pxfobjhsz_max;
   3459 	}
   3460 
   3461 	if (pxfobjhsz > pxfobjhsz_max) {
   3462 		pxfobjhsz = pxfobjhsz_max;
   3463 	}
   3464 
   3465 	pxfobj_hash = new pxfobj_hash_bkt[pxfobjhsz];
   3466 
   3467 	//
   3468 	// Create the pxvfs_inactive_threadpool,
   3469 	// which processes requests to reap inactive proxy vnodes
   3470 	//
   3471 	pxvfs_inactive_threadpool::startup();
   3472 
   3473 #ifdef	PXFS_KSTATS_ENABLED
   3474 	// Create the per-node kstat structure.
   3475 	node_stats = kstat_create("pxfs", 0,
   3476 	    "Per-node client v1 stats", "pxvfs",
   3477 	    KSTAT_TYPE_NAMED, PXVFS_NODE_STATS_MAX_NUM, KSTAT_FLAG_PERSISTENT);
   3478 
   3479 	if (node_stats != NULL) {
   3480 		kstat_named_init(&(KSTAT_NAMED_PTR(node_stats)
   3481 		    [PXVFS_NODE_STATS_NUM_OPEN_FILES]), "Open Files",
   3482 		    KSTAT_DATA_UINT32);
   3483 		PXFS_KSTATS(node_stats, ((KSTAT_NAMED_PTR(node_stats))
   3484 		    [PXVFS_NODE_STATS_NUM_OPEN_FILES].value.ui32 = 0));
   3485 
   3486 		kstat_install(node_stats);
   3487 	}
   3488 #else
   3489 	node_stats = NULL;
   3490 #endif	/* PXFS_KSTATS_ENABLED */
   3491 
   3492 	// Register pxfs purge method with Memory Monitor
   3493 	monitor::the().subscribe(memory_callback);
   3494 
   3495 	int error;
   3496 
   3497 	// Start the throttle monitoring thread.
   3498 	if ((error = launch_throttle_monitor_thread()) != 0) {
   3499 		char		nodename[32];
   3500 
   3501 		(void) sprintf(nodename, "Node (%u)", orb_conf::node_number());
   3502 		os::sc_syslog_msg msg(SC_SYSLOG_FILESYSTEM_TAG, nodename, NULL);
   3503 		//
   3504 		// SCMSGS
   3505 		// @explanation
   3506 		// Thread to support pxfs throttling could not be launched.
   3507 		// @user_action
   3508 		// Check if the node is short on resources.
   3509 		//
   3510 		(void) msg.log(SC_SYSLOG_WARNING, MESSAGE,
   3511 		    "pxvfs:startup() Failed to create throttle monitoring"
   3512 		    " thread.\n");
   3513 		return (error);
   3514 	}
   3515 
   3516 	return (0);
   3517 }
   3518 
   3519 int
   3520 pxvfs::shutdown()
   3521 {
   3522 	// Deregister pxfs purge method with Memory Monitor
   3523 	monitor::the().unsubscribe(memory_callback);
   3524 
   3525 	pxvfs_inactive_threadpool::shutdown();
   3526 
   3527 	delete [] pxfobj_hash;
   3528 	return (0);
   3529 }
   3530 
   3531 bool
   3532 pxvfs::is_unmounted()
   3533 {
   3534 	if (flags & PXFS_FORCE_UNMOUNTING) {
   3535 		return (true);
   3536 	} else {
   3537 		return (false);
   3538 	}
   3539 }
   3540 
   3541 //
   3542 // This routine tries to allocate blocks from the local cache
   3543 // ie. blocks_available.
   3544 // If there are not enough blocks to satisfy the current request, it contacts
   3545 // the server (get_reservatio()) to get more blocks.
   3546 // If the server decides to switch to REDZONE we block here waiting for
   3547 // the switch to complete and we return a '0' to indicate the switch.
   3548 // If the get_reservation() finds more blocks we do the reservation
   3549 // and returns the number of blocks allocated.
   3550 //
   3551 uint64_t
   3552 pxvfs::reserve_blocks(PXFS_VER::blkcnt_t want, bool no_redzone_wait)
   3553 {
   3554 	PXFS_VER::blkcnt_t refill_block_count = 0;
   3555 	PXFS_VER::server_status_t status;
   3556 	Environment env;
   3557 
   3558 	for (;;) {
   3559 		blocks_reservation.lock();
   3560 
   3561 		//
   3562 		// If there is an invocation or redzone switch in progress
   3563 		// wait for the invocation or switch to complete.
   3564 		//
   3565 		while (pxvfs_status == PXFS_VER::SWITCH_TO_REDZONE ||
   3566 		    blk_reserve_invo_in_progress) {
   3567 			//
   3568 			// This was a call for pre-reservation, must
   3569 			// not wait for revoke allocations to complete.
   3570 			//
   3571 			if (no_redzone_wait) {
   3572 				blocks_reservation.unlock();
   3573 				return (0);
   3574 			}
   3575 			blocks_reservation_cv.wait(&blocks_reservation);
   3576 		}
   3577 		if (pxvfs_status == PXFS_VER::REDZONE) {
   3578 			blocks_reservation.unlock();
   3579 			return (0);
   3580 		}
   3581 		ASSERT(pxvfs_status == PXFS_VER::GREENZONE);
   3582 
   3583 		if (want < blocks_available) {
   3584 			blocks_available = blocks_available - want;
   3585 			blocks_reservation.unlock();
   3586 			return (want);
   3587 		}
   3588 
   3589 		//
   3590 		// Clear available blocks and set flag to show block
   3591 		// reservation invocation is in progress.
   3592 		//
   3593 		blocks_available = 0;
   3594 		blk_reserve_invo_in_progress = true;
   3595 
   3596 		PXFS_DBPRINTF(PXFS_TRACE_PXVFS,
   3597 		    PXFS_AMBER,
   3598 		    ("pxvfs:reserve_blocks(%p) Calling get_reservation()\n",
   3599 		    this));
   3600 
   3601 		//
   3602 		// Get more reservation from server to fulfill the reuqest.
   3603 		//
   3604 		// Do not hold locks across invocations unless necessary. We
   3605 		// must not hold blocks_reservation lock while waiting for
   3606 		// server. Any thread needing disk blocks will block until the
   3607 		// invocation completes and this thread broadcasts a wakeup.
   3608 		//
   3609 		blocks_reservation.unlock();
   3610 		get_fsobj()->get_reservation(refill_block_count, status, env);
   3611 		blocks_reservation.lock();
   3612 
   3613 		env.clear();
   3614 
   3615 		ASSERT(refill_block_count >= 0);
   3616 
   3617 		//
   3618 		// Set local block reservation to what the server gave
   3619 		// us and clear invocation active flag.
   3620 		//
   3621 		blocks_available = refill_block_count;
   3622 		blk_reserve_invo_in_progress = false;
   3623 
   3624 		if (refill_block_count > 0) {
   3625 			//
   3626 			// Wake up any threads that came in while we were
   3627 			// waiting for server to allocate us blocks.
   3628 			//
   3629 			blocks_reservation_cv.broadcast();
   3630 			blocks_reservation.unlock();
   3631 		} else {
   3632 			//
   3633 			// Server is not able to give away reservation which
   3634 			// means server either has switched to REDZONE or
   3635 			// is in the process of switching to REDZONE.
   3636 			//
   3637 			pxvfs_status = status;
   3638 			blocks_reservation.unlock();
   3639 			PXFS_DBPRINTF(PXFS_TRACE_PXVFS,
   3640 			    PXFS_AMBER,
   3641 			    ("pxvfs:reserve_blocks(%p) pxvfs_status %d\n",
   3642 			    this, status));
   3643 		}
   3644 	}
   3645 }
   3646 
   3647 void
   3648 pxvfs::set_server_status(PXFS_VER::server_status_t status)
   3649 {
   3650 	blocks_reservation.lock();
   3651 	pxvfs_status = status;
   3652 	blocks_reservation_cv.broadcast();
   3653 	blocks_reservation.unlock();
   3654 }
   3655 
   3656 //
   3657 // Check if there is enough bandwidth to accomodate given bytes. If
   3658 // not, wait for throttle update routine to signal us whenever per
   3659 // second quota is updated.
   3660 //
   3661 int
   3662 pxvfs::wait_for_bandwidth(int bytes_needed, int &bytes_allocated)
   3663 {
   3664 	monitor::system_state_t sys_state;
   3665 
   3666 	sys_state = monitor::the().get_current_state();
   3667 
   3668 	// Round up to minimum allowed bandwidth allocation.
   3669 	bytes_needed = MAX(bytes_needed, bandwidth_chunk);
   3670 
   3671 	bandwidth_lock.lock();
   3672 
   3673 	while (bytes_needed > bytes_in_window) {
   3674 
   3675 		//
   3676 		// If there is plenty of memory, we allow over-run of the
   3677 		// calculated bandwidth once without waiting. The next
   3678 		// throughput updation will correct this if the server was
   3679 		// loaded.
   3680 		//
   3681 		if (sys_state == monitor::MEMORY_PLENTY &&
   3682 		    bytes_in_window != 0) {
   3683 			break;
   3684 		}
   3685 
   3686 		//
   3687 		// Wait for more bandwidth to be available
   3688 		//
   3689 		if (!bandwidth_cv.wait_sig(&bandwidth_lock)) {
   3690 			bandwidth_lock.unlock();
   3691 			return (EINTR);
   3692 		}
   3693 		//
   3694 		// If this write is bigger than current bandwidth no sense
   3695 		// in waiting for more. If we are under memory pressure
   3696 		// don't proceed until the pressure eases off.
   3697 		//
   3698 		if (bytes_needed > data_rate &&
   3699 		    monitor::the().get_current_state() !=
   3700 					monitor::MEMORY_STARVED) {
   3701 			break;
   3702 		}
   3703 	}
   3704 
   3705 	if (bytes_needed > bytes_in_window) {
   3706 		bytes_in_window = 0;
   3707 	} else {
   3708 		bytes_in_window -= bytes_needed;
   3709 	}
   3710 
   3711 	bytes_written_in_second += bytes_needed;
   3712 	bandwidth_lock.unlock();
   3713 
   3714 	bytes_allocated = bytes_needed;
   3715 	return (0);
   3716 }
   3717