1 // 2 // CDDL HEADER START 3 // 4 // The contents of this file are subject to the terms of the 5 // Common Development and Distribution License (the License). 6 // You may not use this file except in compliance with the License. 7 // 8 // You can obtain a copy of the license at usr/src/CDDL.txt 9 // or http://www.opensolaris.org/os/licensing. 10 // See the License for the specific language governing permissions 11 // and limitations under the License. 12 // 13 // When distributing Covered Code, include this CDDL HEADER in each 14 // file and include the License file at usr/src/CDDL.txt. 15 // If applicable, add the following below this CDDL HEADER, with the 16 // fields enclosed by brackets [] replaced with your own identifying 17 // information: Portions Copyright [yyyy] [name of copyright owner] 18 // 19 // CDDL HEADER END 20 // 21 22 // 23 // Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 // Use is subject to license terms. 25 // 26 27 #pragma ident "@(#)pxvfs.cc 1.39 08/05/20 SMI" 28 29 #include <sys/types.h> 30 #include <sys/systm.h> 31 #include <sys/vfs.h> 32 #include <sys/vnode.h> 33 #include <sys/file.h> 34 #include <sys/uio.h> 35 #include <sys/dnlc.h> 36 #include <sys/mount.h> 37 #include <sys/statvfs.h> 38 #include <sys/debug.h> 39 #include <sys/cmn_err.h> 40 #include <sys/fs_subr.h> 41 #include <sys/pathname.h> 42 #include <sys/fs/ufs_mount.h> 43 #include <sys/mntent.h> 44 #include <kstat.h> 45 #include <sys/ddi.h> 46 #include <sys/disp.h> 47 48 #include <sys/sol_version.h> 49 #include <sys/os.h> 50 #include <sys/sol_conv.h> 51 #include <solobj/solobj_impl.h> 52 #include <nslib/ns.h> 53 #include <orb/fault/fault_injection.h> 54 #include <orb/monitor/monitor.h> 55 #include <orb/infrastructure/clusterproc.h> 56 57 #include "../version.h" 58 #include <pxfs/common/pxfslib.h> 59 #include <pxfs/device/device_replica_impl.h> 60 #include <pxfs/device/device_service_mgr.h> 61 #include <pxfs/mount/mount_client_impl.h> 62 #include 
<pxfs/mount/mount_debug.h> 63 #include <pxfs/lib/pxfs_debug.h> 64 #include <pxfs/lib/pxfs_misc.h> 65 #include <pxfs/client/pxfobj.h> 66 #include <pxfs/client/pxfobjplus.h> 67 #include <pxfs/client/fobj_client_impl.h> 68 #include <pxfs/client/pxreg.h> 69 #include <pxfs/client/pxdir.h> 70 #include <pxfs/client/pxchr.h> 71 #include <pxfs/client/pxlink.h> 72 #include <pxfs/client/pxspecial.h> 73 #include <pxfs/client/pxvfs.h> 74 #include <pxfs/client/fsmgr_client_impl.h> 75 76 #ifndef VXFS_DISABLED 77 #include <pxfs/server/vxfs_dependent_impl.h> 78 #endif 79 80 #if SOL_VERSION >= __s9 81 #define PSARC_2001_038 82 #endif 83 84 // 85 // For update of the mnttab modification time. The function 86 // vfs_mnttab_modtimeupd is declared static in vfs.c for 87 // Solaris 8 and 9. For Solaris 10, it is global. 88 // 89 #if SOL_VERSION >= __s10 90 #define GLOBAL_MNTTAB_MODTIME_INTERFACE 91 #else 92 extern timespec_t vfs_mnttab_mtime; 93 #endif 94 95 // 96 // Constants for initializing various throttling variables. 97 // 98 const int KILOBYTE = 1024; 99 const int64_t MEGABYTE = 1024 * KILOBYTE; 100 const int ONE_SECOND = 1000000; // 1 second in microseconds 101 102 int64_t DATA_RATE_DEFAULT = 20 * MEGABYTE; // 20mb per second 103 int64_t DATA_RATE_MINIMUM = 2 * MEGABYTE; // 2mb per second 104 int THROTTLE_MONITOR_INTERVAL = ONE_SECOND; // 1 second by default 105 106 //lint -e666 107 // PXFS is an extensive user of the inline function get_vp() within 108 // the vnode macros: 109 // error = VOP_LOOKUP(get_vp(), ... ); 110 // There are no side effects to calling get_vp() repeatedly, flexelint 111 // does not know that, but we do. 112 // 113 114 // 115 // Static data member initializations. 
//
// Singleton pointer; set by pxvfs_inactive_threadpool::startup() and
// reset to NULL by pxvfs_inactive_threadpool::shutdown().
pxvfs_inactive_threadpool
    *pxvfs_inactive_threadpool::the_pxvfs_inactive_threadpool = NULL;

pxvfs_list_t	pxvfs::all_pxvfs;	// list of all pxvfs structures
os::rwlock_t	pxvfs::all_pxvfs_lock;	// protects 'all_pxvfs'
int		pxvfs::pxfstype;	// number assigned by Solaris
bool		pxvfs::unmounts_disabled = false;

// This value must be a non-zero positive number
int cluster_fs_drain_queue_len = 50;

uint_t		pxvfs::pxfobjhsz = 0;
uint_t		pxvfs::pxfobjhsz_max = 0;
uint_t		pxvfs::pxfobjh_len = 4;
kstat_t		*pxvfs::node_stats = NULL;

//
// The flushing of attributes is done by a kernel thread
// with a priority higher than that of applications.
// The attribute flush operation can starve normal activity.
// These values control the flushing of attributes.
// The default values were chosen to allow approx. 100,000 files to be flushed
// in 30 seconds, and to spread the work over that period.
//
// Number of files processed per interval
int	pxvfs::sync_all_attr_throttle = 40;

// Number of files processed during a sync before sleeping.
int	pxvfs::sync_filesystem_throttle = 40;

// Amount of sleep between intervals, indexed by file-count band.
os::usec_t	pxvfs::sync_all_attr_interval[] = {
	(os::usec_t)(120 * 1000),	// 120ms <10,000 files
	(os::usec_t)(60 * 1000),	// 60ms	<20,000 files
	(os::usec_t)(40 * 1000),	// 40ms	<30,000 files
	(os::usec_t)(30 * 1000),	// 30ms	<40,000 files
	(os::usec_t)(20 * 1000)		// 20ms	>=40,000 files
};

bool		pxvfs::sync_all_attr_thread_running = false;
os::mutex_t	pxvfs::sync_all_attr_lock;

// Passed to set_sched_props() by pxvfs_inactive_threadpool::startup().
const int inactive_thread_priority = 65;

#ifdef DEBUG
uint32_t	pxvfs_vget_number_calls = 0;
uint32_t	pxvfs_vget_number_fid_hits = 0;
#endif

//
// Number of async threads per thread-pool. The pxvfs constructor passes
// this value when it creates the per-file-system mem_async_threadpool.
//
int pxfs_async_threads = 15;
uint64_t pxvfs::async_task_count = 0;

//
// This should be pxvfs::pxfobj_hash_bkt but our compiler doesn't understand
// that and our lint checker complains about it.
//lint -e1038
pxvfs::pxfobj_hash_bkt	*pxvfs::pxfobj_hash = NULL;
//lint +e1038

//
// Assign default values for throttling variables. For a description see
// the declaration of these members in the pxvfs class definition.
// NOTE(review): the class is not defined in this file; presumably it lives
// in <pxfs/client/pxvfs.h> - confirm.
//
int64_t pxvfs::data_rate = DATA_RATE_DEFAULT;		// 20 MB per second
int64_t pxvfs::data_rate_default = DATA_RATE_DEFAULT;	// 20 MB per second
int64_t pxvfs::data_rate_minimum = DATA_RATE_MINIMUM;	// 2 MB per second
int64_t pxvfs::bytes_in_window = DATA_RATE_DEFAULT;	// 20 MB
int pxvfs::bandwidth_chunk = 8 * KILOBYTE;		// 8 KB

// Monitor thread wakes every 1 second by default.
int pxvfs::throttle_monitor_interval = THROTTLE_MONITOR_INTERVAL;

//
// We don't want to keep more than 16MB worth of I/O requests pending
// per client. Assuming an average size of 128kb per I/O request, that
// is 128 pending I/O requests on the server.
//
int pxvfs::max_permitted_ios = 128;

// Everything else initialized to zero.
int64_t pxvfs::bytes_sent_in_second = 0;
int64_t pxvfs::bytes_written_in_second = 0;
timespec_t pxvfs::window_start = {0L, 0};

os::mutex_t	pxvfs::io_pending_lock;
os::condvar_t	pxvfs::io_pending_cv;
int64_t		pxvfs::io_pending = 0;
os::condvar_t	pxvfs::bandwidth_cv;
os::mutex_t	pxvfs::bandwidth_lock;
os::mutex_t	pxvfs::data_rate_lock;

//
// class pxvfs_inactive_task methods
//

//
// execute - this does the actual work for the task to clean up
// inactive proxy vnodes.
216 // 217 void 218 pxvfs_inactive_task::execute() 219 { 220 pxvfs *pxvfsp = get_pxvfs(); 221 222 pxvfsp->flags_lock.lock(); 223 224 // 225 // Clean up some inactive proxy vnodes 226 // 227 pxvfsp->empty_inactive_list(); 228 229 if (pxvfsp->inactive_list_cnt != 0) { 230 // 231 // There are still more inactive proxy vnodes. 232 // We do not process all inactive proxy vnodes 233 // at one time in order to let other file systems clean up. 234 // Requeue this work request. 235 // It is safe to requeue this task for either of two reasons: 236 // 1) this threadpool is single threaded. 237 // 2) this work task is already off the work list, 238 // and so will not be queued twice concurrently. 239 // 240 pxvfs_inactive_threadpool::the().defer_processing(this); 241 pxvfsp->flags_lock.unlock(); 242 243 } else { 244 // 245 // There are no more inactive proxy vnodes. 246 // 247 pxvfsp->flags &= ~pxvfs::PXFS_TASK_QUEUED; 248 pxvfsp->flags_lock.unlock(); 249 VFS_RELE(pxvfsp->fs_vfs); 250 } 251 } 252 253 // 254 // task_done - Method called when the threadpool decides to throw away a task. 255 // This happens only during shutdown. 256 // Cannot use the default implementation which does a "delete this", 257 // because this task is embedded in another object. 258 // 259 void 260 pxvfs_inactive_task::task_done() 261 { 262 // All work tasks should have been processed before shutdown 263 ASSERT(0); 264 } 265 266 // 267 // class pxvfs_inactive_threadpool methods 268 // 269 270 // 271 // constructor - this threadpool uses two threads. 272 // 273 pxvfs_inactive_threadpool::pxvfs_inactive_threadpool() : 274 threadpool(false, 2, "pxvfs_inactive_threadpool", 2) 275 { 276 } 277 278 pxvfs_inactive_threadpool::~pxvfs_inactive_threadpool() 279 { 280 ASSERT(task_count() == 0); 281 } 282 283 // 284 // startup - this method is called at modload time to initialize 285 // this object. 
286 // 287 // static 288 void 289 pxvfs_inactive_threadpool::startup() 290 { 291 ASSERT(the_pxvfs_inactive_threadpool == NULL); 292 the_pxvfs_inactive_threadpool = new pxvfs_inactive_threadpool; 293 294 // 295 // Pxfs consume large amounts of memory. 296 // Use a higher thread priority for freeing resources. 297 // 298 (void) the_pxvfs_inactive_threadpool-> 299 set_sched_props(inactive_thread_priority); 300 } 301 302 // 303 // shutdown - this method is called at modload time to shutdown 304 // this object. 305 // 306 // static 307 void 308 pxvfs_inactive_threadpool::shutdown() 309 { 310 delete the_pxvfs_inactive_threadpool; 311 the_pxvfs_inactive_threadpool = NULL; 312 } 313 314 // 315 // class pxvfs methods 316 // 317 318 // 319 // Constructor. 320 // 321 //lint -e668 -e1732 -e1733 322 pxvfs::pxvfs(PXFS_VER::filesystem_ptr fsptr, fsmgr_client_impl *clientmgrp, 323 const PXFS_VER::fs_info *fsinfop, int fstype, vfs_t *vfsp, 324 uint32_t server_incarn) : 325 pxvfs_list_elem(this), 326 fs_rootvp(NULL), 327 server_incn(server_incarn), 328 flags(0), 329 active_cnt(0), 330 inactive_list_cnt(0), 331 fsmgr_client_implp(clientmgrp), 332 _syncdir_on(false), 333 _nocto_on(false), 334 _forcedirectio_on(false), 335 underlying_fs(UNKNOWN), 336 blk_reserve_invo_in_progress(false) 337 { 338 ASSERT(clientmgrp != NULL); 339 340 #ifdef PXFS_KSTATS_ENABLED 341 char *stats_name = new char[KSTAT_STRLEN]; 342 (void) sprintf(stats_name, "client v1 (%d, %d)", 343 (int)getmajor(fsinfop->fsdev), (int)getminor(fsinfop->fsdev)); 344 345 //lint +e668 +e1732 +e1733 346 stats = kstat_create("pxfs", 0, stats_name, 347 "pxvfs", KSTAT_TYPE_NAMED, 348 PXVFS_STATS_MAX_NUM, KSTAT_FLAG_PERSISTENT); 349 350 delete [] stats_name; 351 352 if (stats != NULL) { 353 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 354 [PXVFS_STATS_NUM_OPEN_FILES]), "Open Files", 355 KSTAT_DATA_UINT32); 356 (KSTAT_NAMED_PTR(stats))[ 357 PXVFS_STATS_NUM_OPEN_FILES].value.ui32 = 0; 358 359 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 360 
[PXVFS_STATS_ACCESS_TOKEN_HITS]), "Access Cache Hits", 361 KSTAT_DATA_UINT32); 362 (KSTAT_NAMED_PTR(stats))[ 363 PXVFS_STATS_ACCESS_TOKEN_HITS].value.ui32 = 0; 364 365 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 366 [PXVFS_STATS_ACCESS_TOKEN_MISSES]), "Access Cache Misses", 367 KSTAT_DATA_UINT32); 368 (KSTAT_NAMED_PTR(stats))[ 369 PXVFS_STATS_ACCESS_TOKEN_MISSES].value.ui32 = 0; 370 371 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 372 [PXVFS_STATS_ACCESS_TOKEN_INVALS]), "Access Cache Invals", 373 KSTAT_DATA_UINT32); 374 (KSTAT_NAMED_PTR(stats))[ 375 PXVFS_STATS_ACCESS_TOKEN_INVALS].value.ui32 = 0; 376 377 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 378 [PXVFS_STATS_ATTR_TOKEN_HITS]), "Attribute Token Hits", 379 KSTAT_DATA_UINT32); 380 (KSTAT_NAMED_PTR(stats))[ 381 PXVFS_STATS_ATTR_TOKEN_HITS].value.ui32 = 0; 382 383 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 384 [PXVFS_STATS_ATTR_TOKEN_MISSES]), "Attribute Token Misses", 385 KSTAT_DATA_UINT32); 386 (KSTAT_NAMED_PTR(stats))[ 387 PXVFS_STATS_ATTR_TOKEN_MISSES].value.ui32 = 0; 388 389 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 390 [PXVFS_STATS_ATTR_TOKEN_INVALS]), "Attribute Token Invals", 391 KSTAT_DATA_UINT32); 392 (KSTAT_NAMED_PTR(stats))[ 393 PXVFS_STATS_ATTR_TOKEN_INVALS].value.ui32 = 0; 394 395 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 396 [PXVFS_STATS_DATA_TOKEN_HITS]), "Data Token Hits", 397 KSTAT_DATA_UINT32); 398 (KSTAT_NAMED_PTR(stats))[ 399 PXVFS_STATS_DATA_TOKEN_HITS].value.ui32 = 0; 400 401 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 402 [PXVFS_STATS_DATA_TOKEN_MISSES]), "Data Token Misses", 403 KSTAT_DATA_UINT32); 404 (KSTAT_NAMED_PTR(stats))[ 405 PXVFS_STATS_DATA_TOKEN_MISSES].value.ui32 = 0; 406 407 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 408 [PXVFS_STATS_DATA_TOKEN_INVALS]), "Data Token Invals", 409 KSTAT_DATA_UINT32); 410 (KSTAT_NAMED_PTR(stats))[ 411 PXVFS_STATS_DATA_TOKEN_INVALS].value.ui32 = 0; 412 413 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 414 [PXVFS_STATS_DATA_ALLOC]), "Number of bmap invocations", 
415 KSTAT_DATA_UINT32); 416 (KSTAT_NAMED_PTR(stats))[ 417 PXVFS_STATS_DATA_ALLOC].value.ui32 = 0; 418 419 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 420 [PXVFS_STATS_DATA_TOKEN_RETRIES]), "Data Token Retries", 421 KSTAT_DATA_UINT32); 422 (KSTAT_NAMED_PTR(stats))[ 423 PXVFS_STATS_DATA_TOKEN_RETRIES].value.ui32 = 0; 424 425 kstat_named_init(&(KSTAT_NAMED_PTR(stats) 426 [PXVFS_STATS_THROTTLING_HITS]), "Mem_async Throttling Hits", 427 KSTAT_DATA_UINT32); 428 (KSTAT_NAMED_PTR(stats))[ 429 PXVFS_STATS_THROTTLING_HITS].value.ui32 = 0; 430 431 kstat_install(stats); 432 } 433 #else 434 stats = NULL; 435 #endif /* PXFS_KSTATS_ENABLED */ 436 437 vfsp->vfs_data = (caddr_t)this; 438 vfsp->vfs_fstype = fstype; 439 440 // 441 // The VFS documentation says only that this field is the native block 442 // size of the file system. We have successfully used both 443 // the PXFS transfer size and that of the underlying file system VFS. 444 // We will use the block size of the underlying file system so 445 // as to be able to know the granual size of an allocation when 446 // filling in a hole. 447 // 448 vfsp->vfs_bsize = fsinfop->fsbsize; 449 450 vfsp->vfs_dev = fsinfop->fsdev; 451 vfsp->vfs_flag = fsinfop->fsflag | VFS_PXFS; 452 453 // XXX: should be: fs_vfs->vfs_fsid = conv(fsinfop->fsid); 454 // but haven't been able to figure out how to write a conv method 455 // the compiler will accept. 456 vfsp->vfs_fsid.val[0] = fsinfop->fsid.val[0]; 457 vfsp->vfs_fsid.val[1] = fsinfop->fsid.val[1]; 458 459 fs_vfs = vfsp; 460 461 // Check if the underlying filesystem is UFS. 
462 if (strcmp("ufs", vfssw[fs_vfs->vfs_fstype].vsw_name) == 0) { 463 // 464 // XXXX Workaround for MO_TAG bug in vfs.c 465 // 466 mntopt_t *mop = vfs_hasopt(&vfsp->vfs_mntopts, "logging"); 467 468 if (mop && (mop->mo_flags & MO_TAG) == 0) { 469 mop->mo_flags |= MO_TAG; 470 } 471 underlying_fs = UFS; 472 #ifndef VXFS_DISABLED 473 } else if (strcmp("vxfs", vfssw[fs_vfs->vfs_fstype].vsw_name) 474 == 0) { 475 underlying_fs = VXFS; 476 #endif 477 } else if (strcmp("hsfs", vfssw[fs_vfs->vfs_fstype].vsw_name) 478 == 0) { 479 underlying_fs = HSFS; 480 } 481 482 // 483 // We're storing a reference to the server-side fs, 484 // so we need to duplicate it. 485 // 486 ASSERT(!CORBA::is_nil(fsptr)); 487 fs_fsobj = PXFS_VER::filesystem::_duplicate(fsptr); 488 489 // 490 // The value of the root vp is cached, but we can't pre-fill the 491 // cache here because filesystem::getroot() returns a packed 492 // vnode. When we unpack the packed vnode we will call 493 // find_pxvfs() looking for this vfs, but it has not yet been 494 // constructed, so it will not be on the list of pxvfs structs and 495 // we'll end up recursively calling this constructor. 496 // 497 498 // 499 // Initialize the status of the pxfs client status to 500 // PXFS_VER::GREENZONE. It is safe to assume we are in GREENZONE. 501 // Because, during the first allocating write or a file creation 502 // we would call reserve_blocks(). This routine will contact the 503 // server (through get_reservation()) and find out the current status 504 // of the filesystem and act accordingly. 505 // 506 pxvfs_status = PXFS_VER::GREENZONE; 507 508 // Create the async threadpool for this file-system 509 mem_async_threadpool = 510 new threadpool(true, 5, "apageout thr", pxfs_async_threads); 511 512 // We must succeed. 513 CL_PANIC(mem_async_threadpool != NULL); 514 } 515 516 pxvfs::~pxvfs() 517 { 518 CORBA::release(fs_fsobj); 519 520 // Assert that there are no extant client side objects for this pxvfs. 
521 ASSERT(fs_rootvp == NULL); 522 ASSERT((flags & PXFS_FILE_ACTIVATE) == 0); 523 ASSERT(llm_cb_list.empty()); 524 } //lint !e1740 pointers are neither freed nor zero'ed by destructor 525 526 // 527 // get_pxvfs - This method supports the pxvfs_inactive_task 528 // This is a virtual method, so do not try to make inline. 529 // 530 pxvfs * 531 pxvfs::get_pxvfs() 532 { 533 return (this); 534 } 535 536 // 537 // Called by mount_client_impl when the initial mount or a remount 538 // occur. 539 // 540 void 541 pxvfs::set_mntoptions(const char *mntoptions) 542 { 543 if (pxfslib::exists_mntopt(mntoptions, MNTOPT_SYNCDIR, false)) { 544 _syncdir_on = true; 545 } else { 546 _syncdir_on = false; 547 } 548 if (pxfslib::exists_mntopt(mntoptions, MNTOPT_NOCTO, false)) { 549 _nocto_on = true; 550 } else { 551 _nocto_on = false; 552 } 553 if (pxfslib::exists_mntopt(mntoptions, MNTOPT_FORCEDIRECTIO, false)) { 554 _forcedirectio_on = true; 555 } else { 556 _forcedirectio_on = false; 557 } 558 } 559 560 // 561 // supported_bdev_fs - returns the whether the specifed name is supported 562 // by PXFS as a file system. 563 // 564 bool 565 pxvfs::supported_bdev_fs(char *fsname) 566 { 567 // 568 // Table for specifying list of block device file systems 569 // that are currently supported as wrapped file systems. 570 // 571 struct supp_bdev_fs { 572 char *fsname; 573 bool supported; 574 }; 575 static struct supp_bdev_fs supp_bdev_fs[] = { 576 { "ufs", true }, 577 { "hsfs", true }, 578 #ifndef VXFS_DISABLED 579 { "vxfs", true } 580 #endif 581 }; 582 583 int nbdev_fs = (int)(sizeof (supp_bdev_fs) / 584 sizeof (struct supp_bdev_fs)); 585 586 for (int i = 0; i < nbdev_fs; i++) { 587 if (strcmp(fsname, supp_bdev_fs[i].fsname) == 0) { 588 return (supp_bdev_fs[i].supported); 589 } 590 } 591 return (false); 592 } 593 594 // 595 // Mount a global file system. 
596 // At this point, the mount point is locked locally with vn_vfswlock(), 597 // the file system switch table is locked locally with RLOCK_VFSSW(), 598 // and the vfs_t is locked locally with vfs_lock() in that order. 599 // 600 // static 601 int 602 pxvfs::mount(vfs *vfsp, vnode *mvp, struct mounta *uap, cred *cr) 603 { 604 Environment e; 605 int error; 606 607 ASSERT(uap->flags & MS_GLOBAL); 608 ASSERT(vn_vfswlock_held(mvp)); 609 #ifndef PSARC_2001_038 // This assertion is no longer true for S9 610 ASSERT(VFSSW_LOCKED()); 611 #endif 612 // ASSERT(vfs_lock_held(vfsp)); 613 614 // 615 // Disallow an overlaid mount on an extant mount point unless it's 616 // explicitly requested. 617 // XXX This check should be in domount() but requires a change to 618 // the VFS_MOUNT() interface. 619 // XXX Also, namefs doesn't do this check quite this way. 620 // XXX Note that we don't check that vp->v_type == VDIR. 621 // This should be checked for most file systems but not "namefs". 622 // 623 mutex_enter(&mvp->v_lock); 624 if ((uap->flags & MS_REMOUNT) == 0 && 625 (uap->flags & MS_OVERLAY) == 0 && 626 (mvp->v_count != 1 || (mvp->v_flag & VROOT) != 0)) { 627 mutex_exit(&mvp->v_lock); 628 return (EBUSY); 629 } 630 mutex_exit(&mvp->v_lock); 631 632 // 633 // Verify that the mount client is already active, returning 634 // failure if it isn't. 635 // 636 if (!mount_client_impl::is_activated()) { 637 char nodename[32]; 638 639 (void) sprintf(nodename, "Node (%u)", orb_conf::node_number()); 640 os::sc_syslog_msg msg(SC_SYSLOG_GLOBAL_MOUNT_TAG, 641 nodename, NULL); 642 // 643 // SCMSGS 644 // @explanation 645 // A global mount command is attempted before the node has 646 // initialized the global file system name space. Typically 647 // this caused by trying to perform a global mount while the 648 // system is booted in single user mode. 649 // @user_action 650 // If the system is not at run level 2 or 3, change to run 651 // level 2 or 3 using the init(1M) command. 
Otherwise, check 652 // message logs for errors during boot. 653 // 654 (void) msg.log(SC_SYSLOG_WARNING, MESSAGE, 655 "pxvfs:mount(): global mounts are not enabled" 656 " (need to run \"clconfig -g\" first)\n"); 657 return (ENODEV); 658 } 659 660 // 661 // We read in all the mount data from user space since there 662 // are many places where we need to examine the data before 663 // passing it to the VFS_MOUNT() of the underlying file system. 664 // 665 sol::mounta ma; 666 CORBA::String_var options; 667 668 if (uap->flags & MS_SYSSPACE) { 669 ma.spec = os::strdup(uap->spec); 670 ma.dir = os::strdup(uap->dir); 671 ma.fstype = os::strdup(uap->fstype); 672 ma.flags = uap->flags; 673 if (uap->datalen != 0) { 674 ma.data.load((uint_t)uap->datalen, (uint_t)uap->datalen, 675 (uint8_t *)uap->dataptr, false); 676 } 677 if (uap->optlen != 0) { 678 ma.options.load((uint_t)uap->optlen, 679 (uint_t)uap->optlen, (uint8_t *)uap->optptr, false); 680 } 681 } else { 682 // Don't support old mount formats. 
683 if ((uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) == 0 || 684 (uintptr_t)uap->fstype < 256) { 685 return (EINVAL); 686 } 687 688 size_t len; 689 char *str = new char [MAXPATHLEN]; 690 691 if (uap->spec != NULL) { 692 error = copyinstr(uap->spec, str, (size_t)MAXPATHLEN, 693 &len); 694 if (error != 0) { 695 delete [] str; 696 return (error); 697 } 698 ma.spec = os::strcpy(new char [len], str); 699 } 700 701 error = copyinstr(uap->dir, str, (size_t)MAXPATHLEN, &len); 702 if (error != 0) { 703 delete [] str; 704 return (error); 705 } 706 ma.dir = os::strcpy(new char [len], str); 707 708 ma.fstype = new char [FSTYPSZ]; 709 error = copyinstr(uap->fstype, ma.fstype, (size_t)FSTYPSZ, 710 &len); 711 if (error != 0) { 712 delete [] str; 713 if (error == ENAMETOOLONG) { 714 error = EINVAL; 715 } 716 return (error); 717 } 718 719 if ((uap->flags & MS_DATA) != 0 && uap->datalen != 0) { 720 ma.data.length((uint_t)uap->datalen); 721 error = copyin(uap->dataptr, ma.data.buffer(), 722 (size_t)uap->datalen); //lint !e571 723 if (error != 0) { 724 delete [] str; 725 return (error); 726 } 727 } 728 729 if ((uap->flags & MS_OPTIONSTR) != 0 && uap->optlen != 0) { 730 ma.options.length((uint_t)uap->optlen); 731 error = copyin(uap->optptr, ma.options.buffer(), 732 (size_t)uap->optlen); //lint !e571 733 if (error != 0) { 734 delete [] str; 735 return (error); 736 } 737 } 738 MOUNT_DBPRINTF( 739 MOUNT_TRACE_CLIENT, 740 MOUNT_GREEN, 741 ("\nma.spec %s \nma.dir %s \nma.flags %x \nma.fstype %s \n", 742 (char *)ma.spec, (char *)ma.dir, ma.flags, 743 (char *)ma.fstype)); 744 745 if (ma.options.length() != 0) { 746 MOUNT_DBPRINTF( 747 MOUNT_TRACE_CLIENT, 748 MOUNT_GREEN, 749 (" ma.options %s\n", ma.options.buffer())); 750 } 751 752 delete [] str; 753 ma.flags = uap->flags | MS_SYSSPACE; 754 755 #ifndef VXFS_DISABLED 756 if (strcmp(ma.fstype, "vxfs") == 0) { 757 if (error = vxfs_copyinargs(ma)) { 758 return (error); 759 } 760 } 761 #endif 762 } 763 764 // Get a reference to the local mount 
client. 765 fs::mount_client_var clientv = mount_client_impl::get_client_ref(); 766 solobj::cred_var credobj = solobj_impl::conv(cr); 767 768 // 769 // For remounts, the mount point should be the root vnode for 770 // this file system. 771 // 772 if (ma.flags & MS_REMOUNT) { 773 // The mount point should be a PXFS vnode. 774 ASSERT(mvp->v_flag & VPXFS); 775 776 // 777 // Note: we are sharing the hold on the fobj and file system 778 // objects that the proxy vnode/vfs hold so don't release 779 // them. 780 // 781 PXFS_VER::filesystem_ptr fsptr = 782 VFSTOPXFS(vfsp)->get_fsobj(); 783 784 PXFS_VER::fobj_ptr fobjp = 785 ((pxfobj *)pxnode::VTOPX(mvp))->getfobj(); 786 787 uint32_t vfsflags; 788 789 FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_C_B, 790 FaultFunctions::generic); 791 792 mount_client_impl::get_server()->remount_v1(fsptr, fobjp, ma, 793 credobj, clientv, vfsflags, options, e); 794 795 FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_C_A, 796 FaultFunctions::generic); 797 798 error = pxfslib::get_err(e); 799 800 if (error == 0) { 801 // 802 // The mount client on each node except this one 803 // builds the mount options table and sets the node 804 // mount flags. On this node, an options table is 805 // built in Solaris by domount, but we do it again 806 // to accommodate differences in underlying file 807 // systems (ie. VxFS). Include the remount option 808 // for Solaris 10 command level mount option checks. 809 // 810 vfsp->vfs_flag = vfsflags; 811 #ifdef GLOBAL_MNTTAB_MODTIME_INTERFACE 812 if (!pxfslib::exists_mntopt(options, "remount", 813 false)) { 814 char *new_options; 815 //lint -e668 816 size_t new_len = strlen(options) + 817 strlen(",remount") + 1; 818 new_options = new char [new_len]; 819 (void) strcpy(new_options, options); 820 (void) strcat(new_options, ",remount"); 821 options = os::strdup(new_options); 822 delete [] new_options; 823 } 824 // 825 // vfs_mnttab_modtimeupd is global for Solaris 10 but 826 // is declared static for Solaris 8 and 9. 
827 // 828 vfs_list_lock(); 829 vfs_createopttbl(&vfsp->vfs_mntopts, options); 830 vfs_parsemntopts(&vfsp->vfs_mntopts, 831 (char *)options, 1); 832 vfs_mnttab_modtimeupd(); 833 vfs_list_unlock(); 834 vfs_setmntopt(vfsp, MNTOPT_REMOUNT, NULL, 835 VFS_NODISPLAY); 836 #else 837 838 vfs_createopttbl(&vfsp->vfs_mntopts, options); 839 vfs_parsemntopts(&vfsp->vfs_mntopts, options, 1); 840 gethrestime(&vfs_mnttab_mtime); 841 #endif 842 if (ma.options.length() > 0) { 843 VFSTOPXFS(vfsp)->set_mntoptions 844 ((const char *)ma.options.buffer()); 845 } 846 } 847 848 // 849 // Workaround for MO_TAG bug in vfs.c. pxvfs's constructor. 850 // 851 if (strcmp("ufs", vfssw[vfsp->vfs_fstype].vsw_name) == 0) { 852 mntopt_t *mop = vfs_hasopt(&vfsp->vfs_mntopts, 853 "logging"); 854 if (mop && (mop->mo_flags & MO_TAG) == 0) { 855 mop->mo_flags |= MO_TAG; 856 } 857 } 858 859 FAULTPT_PXFS(FAULTNUM_PXFS_REMOUNT_C_E, 860 FaultFunctions::generic); 861 862 return (error); 863 } 864 865 bool dev_is_ha; 866 CORBA::String_var dev_name; 867 sol::nodeid_seq_t dev_nids; 868 869 // 870 // Check that PXFS supports this file system type. 871 // Note: ma.spec can be NULL if mounting "namefs". 872 // XXX This code will need to change when new file system types 873 // are supported by PXFS. 874 // 875 if (!supported_bdev_fs(ma.fstype)) { 876 // 877 // Log message and bail out for unsupported file systems. 878 // 879 880 char nodename[32]; 881 882 (void) sprintf(nodename, "Node (%u)", orb_conf::node_number()); 883 os::sc_syslog_msg msg(SC_SYSLOG_GLOBAL_MOUNT_TAG, 884 nodename, NULL); 885 // 886 // SCMSGS 887 // @explanation 888 // A global mount is not supported for the specified file 889 // system. 890 // @user_action 891 // Check the release notes and documents about the support 892 // of the specified system. 
893 // 894 (void) msg.log(SC_SYSLOG_NOTICE, MESSAGE, 895 "pxvfs:mount(): global mount is not supported for " 896 "filesystem type : '%s'\n", ma.fstype); 897 return (ENOTSUP); 898 } else { 899 // 900 // Lookup the device special file to determine the dev_t 901 // for the device. 902 // If this is a PXFS global device file, then 903 // contact DCS to start the device service and get the 904 // list of nodes we should create file system replicas on. 905 // For non-PXFS special files, just create a file system 906 // replica on this node. 907 // 908 vnode_t *bvp; 909 error = lookupname(ma.spec, UIO_SYSSPACE, FOLLOW, NULL, &bvp); 910 if (error != 0) { 911 return (error); 912 } 913 if (bvp->v_type != VBLK) { 914 VN_RELE(bvp); 915 return (ENOTBLK); 916 } 917 if ((bvp->v_flag & VPXFS) == 0) { 918 // 919 // Not a PXFS special file. 920 // 921 VN_RELE(bvp); 922 dev_is_ha = false; 923 dev_nids.length(1); 924 dev_nids[0] = orb_conf::node_number(); 925 } else { 926 // 927 // Contact DCS to get the list of nodes that this 928 // device is attached to and whether or not its an 929 // HA device. Also, pass a reference to the mount 930 // server's dc_callback object so the mount server 931 // is notified when this node configuration data 932 // changes (due to system administration commands). 933 // 934 dev_t devid = bvp->v_rdev; 935 VN_RELE(bvp); 936 937 sol::nodeid_seq_t_var nodes; 938 939 error = get_configured_nodes(dev_is_ha, dev_name, 940 devid, nodes, e); 941 if (error != 0) { 942 return (error); 943 } 944 dev_nids = *nodes; 945 } 946 } 947 948 // 949 // Note: we have to release the read lock on vfssw[] since 950 // we may have to instantiate the file system (by calling 951 // domount() for the underlying file system) which could 952 // try to call WLOCK_VFSSW() and deadlock. We can safely 953 // release the read lock because the pxfs module won't 954 // be unloaded (we don't allow unloading but if we did, 955 // we would have control of that in _fini()). 
956 // We do need to make sure we return with the read lock held 957 // so that domount("pxfs") can unlock it. 958 // 959 // But first, allocate a vfssw[] entry for the underlying 960 // file system type without loading the module. 961 // 962 #ifdef PSARC_2001_038 // Need to grab the lock for S9 963 RLOCK_VFSSW(); 964 #endif 965 struct vfssw *vswp = vfs_getvfsswbyname(ma.fstype); 966 RUNLOCK_VFSSW(); 967 if (vswp == NULL) { 968 WLOCK_VFSSW(); 969 if ((vswp = vfs_getvfsswbyname(ma.fstype)) == NULL) { 970 vswp = allocate_vfssw(ma.fstype); 971 } 972 WUNLOCK_VFSSW(); 973 if (vswp == NULL) { 974 RLOCK_VFSSW(); 975 return (EINVAL); 976 } 977 } 978 int fstype = (int)(vswp - vfssw); 979 ASSERT(fstype); // see 4470243 980 981 // 982 // We pass the mount arguments to the mount server which 983 // will lock the mount point on all other nodes, create the 984 // file system object (instantiate the file system), 985 // create proxy file system objects on all other nodes, link 986 // the proxy into the file system name space and unlock the 987 // mount points. If all that goes well, we will create the proxy, 988 // link it into the name space, and unlock on this node after we 989 // return to domount(). 990 // We have to pass the mount point vnode pointer to mount() 991 // since mount_client_impl::instantiate() may be called on 992 // this node. 993 // 994 PXFS_VER::filesystem_var fsobj; 995 PXFS_VER::fs_info fsinfo; 996 997 FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_C_B, FaultFunctions::generic); 998 999 mount_client_impl::get_server()->mount_v1(ma, (sol::uintptr_t)mvp, 1000 credobj, clientv, dev_is_ha, dev_name, dev_nids, 1001 fsobj, fsinfo, options, e); 1002 1003 FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_C_A, FaultFunctions::generic); 1004 1005 // Aquire the read lock before returning. 
1006 #ifndef PSARC_2001_038 // lock is no longer held, we don't grab it again for S9 1007 RLOCK_VFSSW(); 1008 #endif 1009 1010 error = pxfslib::get_err(e); 1011 1012 MOUNT_DBPRINTF( 1013 MOUNT_TRACE_CLIENT, 1014 (error ? MOUNT_RED : MOUNT_GREEN), 1015 ("pxvfs:mount() mount error %d\n", 1016 error)); 1017 1018 if (error != 0) { 1019 return (error); 1020 } 1021 1022 // 1023 // The mount client on each node except this one builds the mount 1024 // options table. On this node an options table is built in Solaris 1025 // by domount, but we do it again here to accommodate differences 1026 // in underlying file systems (ie VxFS). 1027 // 1028 vfs_createopttbl(&vfsp->vfs_mntopts, options); 1029 vfs_parsemntopts(&vfsp->vfs_mntopts, (char *)options, 1); 1030 1031 // 1032 // At this point we are committed to creating the proxy file system 1033 // for this node and linking it into the name space. 1034 // 1035 uint32_t server_incarn; 1036 ASSERT(!CORBA::is_nil(fsobj)); 1037 fsmgr_client_impl *clientmgrp = new fsmgr_client_impl(); 1038 PXFS_VER::fsmgr_client_var clientmgr = clientmgrp->get_objref(); 1039 1040 uint32_t fs_blk_size; 1041 bool fastwrite; 1042 PXFS_VER::fsmgr_server_ptr servermgr_p = 1043 fsobj->bind_fs(clientmgr, orb_conf::node_number(), 1044 server_incarn, fs_blk_size, fastwrite, e); 1045 // 1046 // The file system is already "mounted" on all other nodes. 1047 // So, go ahead and complete mounting on this node as well. 1048 // 1049 if (e.exception()) { 1050 // 1051 // We ignore comm failures since this can happen 1052 // if a node with the pxfs server crashes and 1053 // then any node joins the global name space. The 1054 // mount server will try to create a proxy vfs 1055 // for the dead file system and link it into the 1056 // name space. We need this to succeed so that 1057 // the dead file system can be unmounted properly. 
1058 // 1059 if (CORBA::COMM_FAILURE::_exnarrow(e.exception()) == NULL) { 1060 #ifdef DEBUG 1061 e.exception()->print_exception( 1062 "pxvfs::mount(): "); 1063 #endif 1064 MOUNT_DBPRINTF( 1065 MOUNT_TRACE_CLIENT, 1066 MOUNT_RED, 1067 ("pxvfs:mount() exception from" 1068 " fsobj->bind_fs()\n")); 1069 } else { 1070 servermgr_p = PXFS_VER::fsmgr_server::_nil(); 1071 MOUNT_DBPRINTF( 1072 MOUNT_TRACE_CLIENT, 1073 MOUNT_RED, 1074 ("pxvfs:mount() comm failure\n ")); 1075 } 1076 1077 e.clear(); 1078 } 1079 1080 1081 pxvfs *pxvfsp = new pxvfs(fsobj, clientmgrp, &fsinfo, fstype, vfsp, 1082 server_incarn); 1083 1084 pxvfsp->pxfs_bsize = fs_blk_size; 1085 pxvfsp->fastwrite = fastwrite; 1086 // 1087 // No blocks allocated initially. 1088 // The blocks get allocated during the first allocating write 1089 // from this node. 1090 // 1091 pxvfsp->blocks_available = 0; 1092 1093 // 1094 // Mount flags are set by the mount client on each node except 1095 // this one which initiated the mount(2) call, so we set them now 1096 // for this node. 1097 // 1098 if (ma.options.length() > 0) { 1099 pxvfsp->set_mntoptions((const char *)ma.options.buffer()); 1100 } 1101 1102 // Finish initializing the linkages between fsmgr client and server. 1103 clientmgrp->set_pxvfsp(pxvfsp, servermgr_p); 1104 1105 // 1106 // Add ourself to the local list of all PXFS file systems and 1107 // keep a hold on the vfs struct while on the list. 1108 // 1109 all_pxvfs_lock.wrlock(); 1110 ASSERT(search(fsobj) == NULL); 1111 all_pxvfs.prepend((pxvfs_list_elem *)pxvfsp); 1112 VFS_HOLD(vfsp); 1113 all_pxvfs_lock.unlock(); 1114 1115 FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_C_E, FaultFunctions::generic); 1116 1117 CORBA::release(servermgr_p); 1118 return (0); 1119 } 1120 1121 // 1122 // Unmount a global file system. 1123 // At this point, the covered mount point is locked locally with vn_vfswlock(), 1124 // and the vfs_t is locked locally with vfs_lock() in that order. 
1125 // 1126 // Note: Purge of all dnlc entries for this vfs has been done in dounmount 1127 // of the pxfs filesystem. 1128 // 1129 int 1130 pxvfs::unmount(int umflags, cred *credp) 1131 { 1132 Environment e; 1133 1134 // ASSERT(vfs_lock_held(fs_vfs)); 1135 ASSERT(!CORBA::is_nil(fs_fsobj)); 1136 1137 // 1138 // When a cluster node is going down cleanly (ie. with shutdown, 1139 // poweroff, init, etc), the /sbin/umountall utility is run to 1140 // unmount all file systems. But we don't want umountall to unmount 1141 // PXFS filesystems served out by other nodes and mounted cluster-wide. 1142 // We take advantage of the Solaris stop script facility associated 1143 // with the init program; in particluar rc0.d/K30MOUNTGFSYS is run 1144 // as a result of init 0 (this is done for both shutdown and 1145 // poweroff). From the stop script global mounts are disabled using 1146 // the cladm system call (and ultimately pxvfs::disable_unmounts). 1147 // 1148 // When using the scshutdown command to take down all cluster nodes, 1149 // the global unmounts are of course valid and needed. In this 1150 // case, scshutdown performs unmounting of global filesytems prior 1151 // to doing the individual node shutdowns. 1152 // 1153 // For node take down commands which do not result in running init 1154 // (ie. halt, reboot, uadmin), filesystems are unmounted in the 1155 // kernel and PXFS is made aware that the unmount is occuring. 1156 // This is done using the PXFS_SYNC_CLOSE flag set during the sync 1157 // operation before unmount. If PXFS_SYNC_CLOSE is set, PXFS can set 1158 // it's PXFS_SHUTDOWN flag. This is set even if the node going down is 1159 // the only node serving out the filesystem. This is desired to prevent 1160 // applications (which would be unaware of the unmount) from continuing 1161 // operations at the local mount point. 
Rather, after the node serving 1162 // out the filesystem is down, applications will get EIO for further 1163 // operations and they can take appropriate action. 1164 // 1165 if (unmounts_disabled || flags & PXFS_SHUTDOWN) { 1166 return (EBUSY); 1167 } 1168 1169 flags_lock.lock(); 1170 // 1171 // Set PXFS_UNMOUNTING to block creation of new proxy vnodes. 1172 // 1173 flags |= PXFS_UNMOUNTING; 1174 if (umflags & MS_FORCE) { 1175 flags |= PXFS_FORCE_UNMOUNTING; 1176 } 1177 1178 if (fs_rootvp != NULL) { 1179 // Release the cached root vnode. 1180 vnode_t *vnodep = fs_rootvp; 1181 fs_rootvp = NULL; 1182 flags_lock.unlock(); 1183 VN_RELE(vnodep); 1184 } else { 1185 flags_lock.unlock(); 1186 } 1187 1188 // 1189 // For a normal unmount, there will be a check for filesystem busy. 1190 // If busy, we get a return value of true; else the return value is 1191 // false after waiting for the inactive vnode list to become empty. 1192 // For forced unmount there is a wait for one pass of processing 1193 // of the inactive list rather than waiting for an empty list; and 1194 // the return value is always false. 1195 // 1196 if (wait_empty_inactive_list(umflags & MS_FORCE ? 
true : false)) { 1197 MOUNT_DBPRINTF( 1198 MOUNT_TRACE_CLIENT, 1199 MOUNT_RED, 1200 ("pxvfs(%p):unmount returning EBUSY\n", 1201 this)); 1202 return (EBUSY); 1203 } 1204 1205 // 1206 // Get object references for the mount service 1207 // 1208 solobj::cred_var credobj = solobj_impl::conv(credp); 1209 fs::mount_client_var clientv = mount_client_impl::get_client_ref(); 1210 1211 fs::mount_server_var mount_server_v = 1212 mount_client_impl::get_server(); 1213 1214 FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_B, FaultFunctions::generic); 1215 mount_server_v->unmount_v1(fs_fsobj, umflags, credobj, clientv, 1216 orb_conf::node_number(), 0, e); 1217 FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_A, FaultFunctions::generic); 1218 1219 sol::error_t error = pxfslib::get_err(e); 1220 1221 MOUNT_DBPRINTF( 1222 MOUNT_TRACE_CLIENT, 1223 (error ? MOUNT_RED : MOUNT_GREEN), 1224 ("pxvfs:unmount(%p) unmount error %d\n", 1225 this, error)); 1226 1227 if (error != 0) { 1228 unmount_failed(); 1229 } 1230 FAULTPT_PXFS(FAULTNUM_PXFS_UNMOUNT_C_E, FaultFunctions::generic); 1231 return (error); 1232 } 1233 1234 int 1235 pxvfs::mountroot(enum whymountroot) 1236 { 1237 MOUNT_DBPRINTF( 1238 MOUNT_TRACE_CLIENT, 1239 MOUNT_RED, 1240 ("pxvfs:mountroot called\n")); // XXX 1241 return (ENOTSUP); 1242 } 1243 1244 // 1245 // root - An out parameter will contain the root directory proxy vnode, 1246 // and the return value identifies whether the operation succeeded or failed. 1247 // 1248 // Note that the current design imposes the restriction that 1249 // the underlying filesystem sets 1250 // its root once and does not change its root. 
1251 // 1252 int 1253 pxvfs::root(vnode **vnodepp) 1254 { 1255 flags_lock.lock(); 1256 while (flags & PXFS_UNMOUNTING) { 1257 // 1258 // Wait for pending unmount operation to complete 1259 // 1260 flags |= PXFS_FILE_ACTIVATE; 1261 flags_cv.wait(&flags_lock); 1262 } 1263 if (flags & PXFS_UNMOUNTED) { 1264 // 1265 // The file system was unmounted, 1266 // and hence all files are inaccessible 1267 // 1268 flags_lock.unlock(); 1269 *vnodepp = NULL; 1270 return (EIO); 1271 } 1272 if (fs_rootvp != NULL) { 1273 // 1274 // The root directory vnode is cached 1275 // 1276 *vnodepp = fs_rootvp; 1277 VN_HOLD(*vnodepp); 1278 flags_lock.unlock(); 1279 return (0); 1280 } 1281 flags_lock.unlock(); 1282 1283 // 1284 // Call the pxfs server to get a root object. 1285 // The root directory proxy vnode remains in existence 1286 // while the proxy file system is mounted. 1287 // Because the lock is dropped, another thread could 1288 // concurrently ask for the root proxy directory. 1289 // So we have to deal with the case where we are not 1290 // the first to ask for the root directory. 
1291 // 1292 PXFS_VER::fobj_var rootobj; 1293 PXFS_VER::fobj_info rootinfo; 1294 PXFS_VER::bind_info binfo; 1295 Environment e; 1296 uint32_t server_incn_orig = get_server_incn(); 1297 1298 fobj_client_impl *client1p = new fobj_client_impl; 1299 fobj_client_impl *client2p; 1300 1301 PXFS_VER::fobj_client_ptr client1_p = client1p->get_objref(); 1302 1303 PXFS_VER::fobj_client_ptr client2_p = 1304 PXFS_VER::fobj_client::_nil(); 1305 1306 fs_fsobj->getroot(rootobj, rootinfo, binfo, client1_p, client2_p, e); 1307 sol::error_t error = pxfslib::get_err(e); 1308 if (error != 0) { 1309 CORBA::release(client1_p); 1310 CORBA::release(client2_p); 1311 *vnodepp = NULL; 1312 return (error); 1313 } 1314 // The root directory supports caching 1315 ASSERT(!CORBA::is_nil(client2_p)); 1316 1317 if (client1_p->_equiv(client2_p)) { 1318 // 1319 // This is the first thread to request root directory 1320 // 1321 // Do not allow the server to change while 1322 // in the middle of creating a proxy vnode 1323 // 1324 server_incn_lock.rdlock(); 1325 if (server_incn_orig == get_server_incn()) { 1326 // Construct a proxy vnode for the root directory. 1327 *vnodepp = 1328 get_pxfobj(rootobj, rootinfo, &binfo, client1p); 1329 1330 server_incn_lock.unlock(); 1331 } else { 1332 // 1333 // The registration was orphanned by 1334 // either a failover or switchover 1335 // 1336 server_incn_lock.unlock(); 1337 1338 error = connect_again(vnodepp, rootobj, rootinfo, binfo, 1339 client1p, client1_p, e); 1340 } 1341 1342 if (*vnodepp == NULL) { 1343 // 1344 // Error because we are unmounting proxy file system 1345 // 1346 CORBA::release(client1_p); 1347 CORBA::release(client2_p); 1348 return (EIO); 1349 } 1350 1351 flags_lock.lock(); 1352 if (fs_rootvp == NULL) { 1353 // 1354 // Cache the root directory proxy vnode. 1355 // The cache places its own hold on the 1356 // root directory proxy vnode. 
1357 // 1358 fs_rootvp = *vnodepp; 1359 VN_HOLD(*vnodepp); 1360 } 1361 flags_lock.unlock(); 1362 1363 } else { 1364 // 1365 // There already is a proxy vnode for root directory 1366 // 1367 client2p = (fobj_client_impl *) 1368 (client2_p->_handler()->get_cookie()); 1369 1370 // 1371 // The current client may not be ready 1372 // 1373 if (!client2p->wait_till_ok()) { 1374 // 1375 // Error because we are unmounting proxy file system 1376 // 1377 *vnodepp = NULL; 1378 CORBA::release(client1_p); 1379 CORBA::release(client2_p); 1380 return (EIO); 1381 } 1382 // 1383 // The existing client has completed initialization 1384 // and has a hold placed upon the proxy vnode. 1385 // 1386 *vnodepp = pxnode::PXTOV(client2p->get_pxfobjplusp()); 1387 1388 flags_lock.lock(); 1389 if (fs_rootvp == NULL) { 1390 // 1391 // Cache the root directory proxy vnode. 1392 // The cache places its own hold on the 1393 // root directory proxy vnode. 1394 // 1395 fs_rootvp = *vnodepp; 1396 VN_HOLD(*vnodepp); 1397 } 1398 flags_lock.unlock(); 1399 } 1400 CORBA::release(client1_p); 1401 CORBA::release(client2_p); 1402 1403 ASSERT((*vnodepp)->v_flag & VROOT); 1404 1405 return (0); 1406 } 1407 1408 int 1409 pxvfs::statvfs(struct statvfs64 *sp) 1410 { 1411 Environment e; 1412 1413 // 1414 // We hold the root proxy vnode, because not all calls 1415 // to VFS_STATVFS hold the root vnode pointer 1416 // before making the VFS_STATVFS call. 1417 // 1418 1419 vnode *vnodep; 1420 int err = root(&vnodep); 1421 if (err != 0) { 1422 return (err); 1423 } 1424 1425 //lint -e64 1426 fs_fsobj->get_statistics(conv(*sp), e); 1427 //lint +e64 1428 1429 VN_RELE(vnodep); 1430 return (pxfslib::get_err(e)); 1431 } 1432 1433 // 1434 // sync_all_attr 1435 // Walk the list of all pxfobj objects, and flush out dirty attributes. This is 1436 // triggered by fsflush calling vfs_sync with the SYNC_ATTR flag. 
1437 // 1438 // static 1439 void 1440 pxvfs::sync_all_attr(void *) 1441 { 1442 pxfobj *pxfobjp; 1443 pxfobj *prevp = NULL; 1444 int file_count = 0; 1445 int sleep_index; 1446 1447 for (uint_t idx = 0; idx < pxfobjhsz; idx++) { 1448 // 1449 // Initialize the iterator while holding the lock. 1450 // 1451 pxfobj_hash[idx].hlock.lock(); 1452 pxfobj_list_t::ListIterator iter(pxfobj_hash[idx].hlist); 1453 1454 // 1455 // Determine sleep time. 1456 // The exact value does not matter. 1457 // We pretend that the files are evenly distributed across 1458 // buckets. 1459 // 1460 switch ((pxfobj_hash[idx].hlist_cnt * pxfobjhsz) / 10000) { 1461 case 0: sleep_index = 0; // <10,000 files 1462 break; 1463 case 1: sleep_index = 1; // <20,000 files 1464 break; 1465 case 2: sleep_index = 2; // <30,000 files 1466 break; 1467 case 3: sleep_index = 3; // <40,000 files 1468 break; 1469 default: sleep_index = 4; // >=40,000 files 1470 }; 1471 1472 for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) { 1473 1474 ASSERT(pxfobjp->is_inhashtable()); 1475 1476 if (pxfobjp->is_inactive() || 1477 !pxfobjp->is_cached()) { 1478 // 1479 // Inactive processing will flush any 1480 // cached attributes. 1481 // Do not place hold on proxy vnode, 1482 // as that will stop inactive file cleanup. 1483 // 1484 file_count++; 1485 continue; 1486 } 1487 // 1488 // Ensure that the proxy vnode remains active 1489 // while doing the sync on this proxy vnode. 1490 // PXFS does not remove the proxy vnode from 1491 // this hash table while the proxy vnode remains active. 1492 // This ensures that the list iterator will be pointing 1493 // to a member of the hash table. 1494 // 1495 VN_HOLD(pxnode::PXTOV(pxfobjp)); 1496 1497 // 1498 // Drop the lock to allow changes to the hash table. 1499 // The sync can take a long time. So this is needed. 
1500 // 1501 pxfobj_hash[idx].hlock.unlock(); 1502 1503 if (prevp) { 1504 // 1505 // It is possible that this operation will 1506 // render this proxy vnode inactive and expel 1507 // this proxy vnode from the hash table. 1508 // 1509 VN_RELE(pxnode::PXTOV(prevp)); 1510 } 1511 1512 (void) ((pxfobjplus *)pxfobjp)->sync_attr(); 1513 prevp = pxfobjp; 1514 1515 // 1516 // This flush can starve other work. 1517 // So we throttle the flush. 1518 // 1519 if (++file_count >= sync_all_attr_throttle) { 1520 // 1521 // Allow other work. 1522 // 1523 os::usecsleep( 1524 sync_all_attr_interval[sleep_index] * 1525 ((file_count + 1526 (sync_all_attr_throttle - 1)) 1527 / sync_all_attr_throttle)); 1528 file_count = 0; 1529 } 1530 1531 pxfobj_hash[idx].hlock.lock(); 1532 } 1533 pxfobj_hash[idx].hlock.unlock(); 1534 } 1535 if (prevp) { 1536 // 1537 // Release the hold placed on the proxy vnode that was 1538 // processed last. 1539 // 1540 VN_RELE(pxnode::PXTOV(prevp)); 1541 } 1542 sync_all_attr_lock.lock(); 1543 sync_all_attr_thread_running = false; 1544 sync_all_attr_lock.unlock(); 1545 } 1546 1547 // 1548 // This is a sync in order to unmount one file system. 1549 // 1550 // XXX We should do an invocation to the server so the sync is seen globally 1551 // and also to sync the underlying file system. 1552 // 1553 int 1554 pxvfs::sync(short flag, cred *credp) 1555 { 1556 pxfobj *pxfobjp; 1557 pxfobj *prevp; 1558 1559 // 1560 // If we are panicing we cannot safely perform any file system 1561 // operation. The integrity of the kernel is suspect too. To avoid 1562 // further problems we ignore the sync if a panic is in progress. 1563 // 1564 if (panicstr) { 1565 return (0); 1566 } 1567 1568 if (flag & SYNC_ATTR) { 1569 // 1570 // Hand off SYNC_ATTR calls by the fsflush thread to a 1571 // separate thread. This may take very long, so if the thread 1572 // is currently running (from a previous call), don't schedule 1573 // it again. 
1574 // 1575 // PXFS could already be doing this operation. This avoids 1576 // duplicate work. 1577 // 1578 sync_all_attr_lock.lock(); 1579 if (!sync_all_attr_thread_running) { 1580 sync_all_attr_thread_running = true; 1581 1582 // Do 'new's outside locks. 1583 sync_all_attr_lock.unlock(); 1584 defer_task *taskp = 1585 new work_task(pxvfs::sync_all_attr, NULL); 1586 sync_all_attr_lock.lock(); 1587 1588 common_threadpool::the().defer_processing(taskp); 1589 } 1590 sync_all_attr_lock.unlock(); 1591 return (0); 1592 } 1593 1594 // 1595 // Remember if SYNC_CLOSE is set so we know in unmount() 1596 // that the unmount is due to the node being shut down. 1597 // 1598 if (flag & SYNC_CLOSE) { 1599 flags_lock.lock(); 1600 flags |= PXFS_SHUTDOWN; 1601 flags_lock.unlock(); 1602 } 1603 1604 prevp = NULL; 1605 1606 for (uint_t idx = 0; idx < pxfobjhsz; idx++) { 1607 // 1608 // Make sure the iterator is initialized while holding the 1609 // lock. 1610 // 1611 pxfobj_hash[idx].hlock.lock(); 1612 pxfobj_list_t::ListIterator iter(pxfobj_hash[idx].hlist); 1613 1614 for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) { 1615 1616 ASSERT(pxfobjp->is_inhashtable()); 1617 VN_HOLD(pxnode::PXTOV(pxfobjp)); 1618 1619 pxfobj_hash[idx].hlock.unlock(); 1620 1621 if (prevp != NULL) { 1622 VN_RELE(pxnode::PXTOV(prevp)); 1623 } 1624 1625 // 1626 // We commit only those vnodes which belong to this 1627 // filesystem. 
1628 // 1629 if ((pxnode::PXTOV(pxfobjp)->v_vfsp) == fs_vfs) { 1630 #if SOL_VERSION >= __s11 1631 (void) VOP_PUTPAGE(pxnode::PXTOV(pxfobjp), 1632 (offset_t)0, (size_t)0, B_ASYNC, credp, 1633 NULL); 1634 #else 1635 (void) VOP_PUTPAGE(pxnode::PXTOV(pxfobjp), 1636 (offset_t)0, (size_t)0, B_ASYNC, credp); 1637 #endif 1638 } 1639 1640 prevp = pxfobjp; 1641 pxfobj_hash[idx].hlock.lock(); 1642 1643 // 1644 // Although the list could have changed while the lock 1645 // was not held, the list element pointer in the 1646 // iterator should still be valid since we held the 1647 // vnode (i.e., it shouldn't be removed from the list). 1648 // XXX This is based on the knowledge of exact 1649 // implementation of the ListIterator. This 1650 // should work as of today, but we need to be careful 1651 // if iterator implementation changes. 1652 // 1653 } 1654 pxfobj_hash[idx].hlock.unlock(); 1655 } 1656 if (prevp) { 1657 VN_RELE(pxnode::PXTOV(prevp)); 1658 } 1659 return (0); 1660 } 1661 1662 // 1663 // Sync all pxfs file systems. 1664 // 1665 // XXX We should do an invocation to the server so the sync is seen globally 1666 // and also to sync the underlying file system. 1667 // 1668 int 1669 pxvfs::sync_all(short flag, cred *cr) 1670 { 1671 pxvfs *pxvfsp; 1672 int error = 0; 1673 int result = 0; 1674 1675 all_pxvfs_lock.rdlock(); 1676 for (all_pxvfs.atfirst(); 1677 (pxvfsp = all_pxvfs.get_current()) != NULL; 1678 all_pxvfs.advance()) { 1679 1680 error = pxvfsp->sync(flag, cr); 1681 1682 1683 if (result == 0) { 1684 // 1685 // If multiple file systems encounter errors, 1686 // will return the first error encountered. 1687 // 1688 result = error; 1689 } 1690 } 1691 all_pxvfs_lock.unlock(); 1692 1693 return (error); 1694 } 1695 1696 // 1697 // fid_to_proxy_file - Find the proxy file object for the specified FID. 1698 // 1699 // The caller must eventually do VN_RELE to undo the VN_HOLD done here. 
1700 // 1701 pxfobj * 1702 pxvfs::fid_to_proxy_file(fid_t *wanted_fidp) 1703 { 1704 pxfobj *pxfobjp; 1705 const fid_t *fidp; 1706 1707 uint_t b_idx = pxfs_misc::hash_devt_fid(get_vfsp()->vfs_dev, 1708 wanted_fidp, pxfobjhsz); 1709 1710 pxfobj_hash_bkt &hbkt = pxfobj_hash[b_idx]; 1711 1712 // Make sure the iterator is initialized while holding the lock. 1713 hbkt.hlock.lock(); 1714 pxfobj_list_t::ListIterator iter(hbkt.hlist); 1715 1716 for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) { 1717 fidp = pxfobjp->get_fidp(); 1718 if (fidp != NULL && 1719 fidp->fid_len == wanted_fidp->fid_len && 1720 (bcmp(fidp->fid_data, wanted_fidp->fid_data, 1721 (size_t)wanted_fidp->fid_len) == 0)) { 1722 // 1723 // Found a match 1724 // 1725 // Make sure the proxy file object 1726 // stays around until we're finished 1727 // 1728 VN_HOLD(pxnode::PXTOV(pxfobjp)); 1729 hbkt.hlock.unlock(); 1730 1731 if (pxfobjp->can_cache()) { 1732 ((pxfobjplus *)pxfobjp)->set_recycled(); 1733 } 1734 1735 return (pxfobjp); 1736 } 1737 } 1738 hbkt.hlock.unlock(); 1739 1740 // No match found 1741 return (NULL); 1742 } 1743 1744 // 1745 // vget - Lookup a file based on the file ID. 1746 // Return its vnode pointer held. 1747 // 1748 // This operation supports NFS operations. 
1749 // 1750 int 1751 pxvfs::vget(vnode **vnodepp, struct fid *ufidp) 1752 { 1753 pxfobj *pxfobjp; 1754 const fid_t *fidp; 1755 1756 flags_lock.lock(); 1757 while (flags & PXFS_UNMOUNTING) { 1758 // 1759 // Wait for pending unmount operation to complete 1760 // 1761 flags |= PXFS_FILE_ACTIVATE; 1762 flags_cv.wait(&flags_lock); 1763 } 1764 if (flags & PXFS_UNMOUNTED) { 1765 // 1766 // The file system was unmounted, 1767 // and hence all files are inaccessible 1768 // 1769 flags_lock.unlock(); 1770 *vnodepp = NULL; 1771 return (EIO); 1772 } 1773 flags_lock.unlock(); 1774 1775 #ifdef DEBUG 1776 os::atomic_add_32(&pxvfs_vget_number_calls, 1); 1777 #endif 1778 1779 uint_t b_idx = pxfs_misc::hash_devt_fid(get_vfsp()->vfs_dev, 1780 ufidp, pxfobjhsz); 1781 1782 pxfobj_hash_bkt &hbkt = pxfobj_hash[b_idx]; 1783 1784 // Make sure the iterator is initialized while holding the lock. 1785 hbkt.hlock.lock(); 1786 pxfobj_list_t::ListIterator iter(hbkt.hlist); 1787 1788 for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) { 1789 fidp = pxfobjp->get_fidp(); 1790 if (fidp != NULL) { 1791 // 1792 // See if the current proxy file object 1793 // has the fid of the file we are looking for. 1794 // 1795 if ((fidp->fid_len == ufidp->fid_len) && 1796 bcmp(fidp->fid_data, ufidp->fid_data, 1797 (size_t)ufidp->fid_len) == 0) { 1798 // 1799 // Check that the file object of the 1800 // matched fid, belongs to this filesystem. 1801 // 1802 // The current hashing algorithm for 1803 // a pxfobj uses the dev_t and fid 1804 // to determine the hash index. 1805 // In order to retrieve the correct 1806 // pxfobj from the hash table, 1807 // both dev_t and fid need to match. 1808 // 1809 // Note that fids are unique only for 1810 // a given filesystem, not across filesystems. 1811 // It is very much possible that two files 1812 // belonging to different filesystems, 1813 // may have the same fid, and may end up 1814 // on the same hash chain. 
1815 // 1816 vfs_t *cur_vfsp = 1817 pxnode::PXTOV(pxfobjp)->v_vfsp; 1818 if (cur_vfsp == get_vfsp()) { 1819 *vnodepp = pxnode::PXTOV(pxfobjp); 1820 VN_HOLD(*vnodepp); 1821 hbkt.hlock.unlock(); 1822 if (pxfobjp->can_cache()) { 1823 ((pxfobjplus *)pxfobjp)-> 1824 set_recycled(); 1825 } 1826 #ifdef DEBUG 1827 os::atomic_add_32( 1828 &pxvfs_vget_number_fid_hits, 1); 1829 #endif 1830 return (0); 1831 } else { 1832 PXFS_DBPRINTF(PXFS_TRACE_PXVFS, 1833 PXFS_AMBER, 1834 ("pxvfs:vget(%p) fid %p " 1835 "false match vfsp %p\n", 1836 this, fidp->fid_data, cur_vfsp)); 1837 } 1838 } 1839 } 1840 } 1841 hbkt.hlock.unlock(); 1842 1843 // 1844 // Go to the pxfs server to do the lookup. 1845 // 1846 PXFS_VER::fobj_var fobj_v; 1847 PXFS_VER::fobj_info fobjinfo; 1848 PXFS_VER::bind_info binfo; 1849 Environment e; 1850 uint32_t server_incn_orig = get_server_incn(); 1851 1852 fs::fobjid_t fobjid(ufidp->fid_len, ufidp->fid_len, 1853 (uint8_t *)ufidp->fid_data, false); 1854 // 1855 // If the new client is not used, unreference will clean it up. 
1856 // 1857 fobj_client_impl *client1p = new fobj_client_impl; 1858 fobj_client_impl *client2p; 1859 1860 PXFS_VER::fobj_client_ptr client1_p = client1p->get_objref(); 1861 1862 PXFS_VER::fobj_client_ptr client2_p = 1863 PXFS_VER::fobj_client::_nil(); 1864 1865 // 1866 // Contact the server to get the file object for the specified FID 1867 // 1868 fs_fsobj->getfobj(fobjid, fobj_v, fobjinfo, binfo, 1869 client1_p, client2_p, e); 1870 1871 sol::error_t error = pxfslib::get_err(e); 1872 if (error == 0) { 1873 if (CORBA::is_nil(client2_p) || client1_p->_equiv(client2_p)) { 1874 // 1875 // This means we are creating a new proxy vnode 1876 // 1877 if (CORBA::is_nil(client2_p)) { 1878 // 1879 // Client side caching is not used 1880 // 1881 client1p = NULL; 1882 1883 *vnodepp = get_pxfobj(fobj_v, fobjinfo, &binfo, 1884 client1p); 1885 } else { 1886 // 1887 // Client side caching is used 1888 // 1889 // Do not allow the server to change while 1890 // in the middle of creating a proxy vnode 1891 // 1892 server_incn_lock.rdlock(); 1893 if (server_incn_orig == get_server_incn()) { 1894 *vnodepp = get_pxfobj(fobj_v, fobjinfo, 1895 &binfo, client1p); 1896 server_incn_lock.unlock(); 1897 } else { 1898 // 1899 // The registration was orphanned by 1900 // either a failover or switchover 1901 // 1902 server_incn_lock.unlock(); 1903 1904 error = connect_again(vnodepp, 1905 fobj_v, fobjinfo, binfo, 1906 client1p, client1_p, e); 1907 } 1908 } 1909 if (error == 0 && *vnodepp == NULL) { 1910 // 1911 // Although UFS returns a NULL vnode pointer 1912 // and error == 0, 1913 // the right thing to do is return EINVAL. 
1914 // 1915 error = EINVAL; 1916 } 1917 } else { 1918 // 1919 // There already is a proxy vnode 1920 // 1921 client2p = (fobj_client_impl *) 1922 (client2_p->_handler()->get_cookie()); 1923 1924 // 1925 // The current client may not be ready 1926 // 1927 if (!client2p->wait_till_ok()) { 1928 // 1929 // The current client is going away 1930 // 1931 CORBA::release(client2_p); 1932 client2_p = PXFS_VER::fobj_client::_nil(); 1933 1934 error = connect_again(vnodepp, 1935 fobj_v, fobjinfo, binfo, 1936 client1p, client1_p, e); 1937 } else { 1938 // 1939 1940 // Existing client has completed initialization 1941 // and has a hold placed upon it. 1942 // 1943 *vnodepp = 1944 pxnode::PXTOV(client2p->get_pxfobjplusp()); 1945 } 1946 } 1947 } else { 1948 // 1949 // This is so that NFS server returns stale file handle error 1950 // to the client. Current implementation checks for error or 1951 // NULL vnode pointer, but we set both to make sure we 1952 // behave the same way as UFS does. 1953 // 1954 *vnodepp = NULL; 1955 } 1956 CORBA::release(client1_p); 1957 CORBA::release(client2_p); 1958 return (error); 1959 } 1960 1961 // 1962 // connect_again - The fobj_client originally given by the server went away. 1963 // This method contacts the server and connects again with a new fobj_client. 
1964 // 1965 int 1966 pxvfs::connect_again(vnode **vnodepp, 1967 PXFS_VER::fobj_ptr fobj_p, 1968 PXFS_VER::fobj_info &fobjinfo, 1969 PXFS_VER::bind_info &binfo, 1970 fobj_client_impl *client1p, 1971 PXFS_VER::fobj_client_ptr client1_p, 1972 Environment &e) 1973 { 1974 fobj_client_impl *client2p; 1975 1976 PXFS_VER::fobj_client_ptr client2_p = 1977 PXFS_VER::fobj_client::_nil(); 1978 1979 uint32_t server_incn_orig; 1980 1981 // 1982 // Keep trying until we succeed or encounter a fatal error 1983 // 1984 while (true) { 1985 server_incn_orig = get_server_incn(); 1986 1987 ((PXFS_VER::fobjplus_ptr)fobj_p)-> 1988 cache_new_client(binfo, client1_p, client2_p, e); 1989 1990 if (e.exception()) { 1991 // 1992 // The file server object could not be restored after a 1993 // node failure. 1994 // 1995 e.clear(); 1996 *vnodepp = NULL; 1997 return (EIO); 1998 } 1999 2000 // 2001 // We only reconnect for those file objects supporting caching. 2002 // 2003 ASSERT(!CORBA::is_nil(client2_p)); 2004 if (client1_p->_equiv(client2_p)) { 2005 // 2006 // This means we are creating a new proxy vnode 2007 // 2008 // Do not allow the server to change while in the 2009 // middle of creating a proxy vnode. 
2010 // 2011 server_incn_lock.rdlock(); 2012 if (server_incn_orig != get_server_incn()) { 2013 // 2014 // The registration was orphanned by a 2015 // failover or a switchover 2016 // 2017 server_incn_lock.unlock(); 2018 continue; 2019 } 2020 2021 *vnodepp = get_pxfobj(fobj_p, fobjinfo, &binfo, 2022 client1p); 2023 2024 server_incn_lock.unlock(); 2025 2026 if (*vnodepp == NULL) { 2027 // 2028 // This error can happen on the root directory 2029 // when the proxy file system is being unmounted 2030 // 2031 return (EIO); 2032 } else { 2033 return (0); 2034 } 2035 } else { 2036 // 2037 // There already is a proxy vnode 2038 // 2039 client2p = (fobj_client_impl *) 2040 (client2_p->_handler()->get_cookie()); 2041 2042 // 2043 // The current client may not be ready 2044 // 2045 if (client2p->wait_till_ok()) { 2046 // 2047 // Existing client has completed initialization 2048 // and has a hold placed upon it. 2049 // 2050 *vnodepp = 2051 pxnode::PXTOV(client2p->get_pxfobjplusp()); 2052 2053 return (0); 2054 } 2055 2056 // 2057 // The specified client is going away. Try again. 2058 // 2059 CORBA::release(client2_p); 2060 client2_p = PXFS_VER::fobj_client::_nil(); 2061 } 2062 } 2063 } 2064 2065 int 2066 pxvfs::swapvp(vnode **, char *) 2067 { 2068 return (ENOSYS); 2069 } 2070 2071 // 2072 // get_pxfobj - Find or create a proxy vnode for the given fobj. 2073 // The vnode is returned held and the caller should call VN_RELE() 2074 // when finished using the vnode. 2075 // 2076 // The routine guarantees that there's at most one proxy vnode 2077 // on a given client for a given fobj. 2078 // 2079 // Warning: this method assumes that the fobj belongs to the file system 2080 // it is invoked on, and will create a proxy vnode for the fobj whether or not 2081 // the fobj is in fact owned by the fs. 2082 // 2083 // Any proxy vnode that supports caching will have a fobj_client, 2084 // and must have a non-null 'binfop'. The reverse is not true. 
2085 // For example, a lookup on a device will arrive here with a non-null binfop 2086 // and no fobj_client. The unpack_vnode() method will call this method 2087 // and provide null values for both binfop and the fobj_client. 2088 // 2089 vnode * 2090 pxvfs::get_pxfobj(PXFS_VER::fobj_ptr fobjp, const PXFS_VER::fobj_info &fobjinfo, 2091 PXFS_VER::bind_info *binfop, fobj_client_impl *clientp) 2092 { 2093 pxfobj *pxfobjp; 2094 2095 ASSERT(!CORBA::is_nil(fobjp)); 2096 2097 // 2098 // During unmount, we need to be able to say that all vnodes are 2099 // inactive. Most new vnodes are created via directory operations 2100 // which are locked out when vn_vfswlock() is called on the mount 2101 // point. However, VFS_GET() (vget() above) and pxfobj::unpack_vnode() 2102 // are not blocked and so we have this code here to prevent new 2103 // pxfobj's from being created until unmount either succeeds or 2104 // fails completely. 2105 // Note: if we make it past the unlock, unmount() will return EBUSY 2106 // 2107 flags_lock.lock(); 2108 while (flags & PXFS_UNMOUNTING) { 2109 flags |= PXFS_FILE_ACTIVATE; 2110 flags_cv.wait(&flags_lock); 2111 } 2112 if (flags & PXFS_UNMOUNTED) { 2113 flags_lock.unlock(); 2114 return (NULL); 2115 } 2116 active_cnt++; 2117 flags_lock.unlock(); 2118 2119 // Allocate a new proxy vnode with a reference count of one. 2120 pxfobjp = make_pxfobj(fobjp, fobjinfo, clientp); 2121 2122 if (clientp != NULL) { 2123 ASSERT(binfop != NULL); 2124 2125 switch (binfop->_d()) { 2126 case PXFS_VER::bt_fobj: { 2127 // 2128 // Only a proxy vnode of this type caches information 2129 // 2130 pxfobjplus *pxfobjplusp = (pxfobjplus *)pxfobjp; 2131 2132 // 2133 // Initialize the attribute cache with 2134 // the data returned. 
2135 // 2136 pxfobjplusp->install_attr(binfop->_u.bind_fobj.attr, 2137 binfop->_u.bind_fobj.rights); 2138 2139 // Initialize the cachedata flag when appropriate 2140 pxfobjplusp->install_cachedata_flag( 2141 binfop->_u.bind_fobj.cachedata); 2142 2143 // 2144 // Initialize the link from the proxy vnode object to 2145 // the fobj_client 2146 // 2147 ASSERT(clientp != NULL); 2148 pxfobjplusp->set_client(clientp); 2149 2150 // 2151 // Initialize the link from the fobj_client to the 2152 // proxy vnode. 2153 // 2154 // This must be the last part of initialization, 2155 // because waiting threads are unblocked. 2156 // 2157 clientp->set_pxfobjplus(pxfobjplusp); 2158 2159 break; 2160 } 2161 2162 default: 2163 // 2164 // No information is provided to the client 2165 // 2166 ASSERT(0); 2167 break; 2168 } 2169 } 2170 // Insert the new proxy vnode into the hash list of all proxy vnodes. 2171 pxfobj *pxfobj2p = fobjhash_insert(pxfobjp); 2172 return (pxnode::PXTOV(pxfobj2p)); 2173 } 2174 2175 // 2176 // make_pxfobj 2177 // This is called from get_pxfobj() to actually make a new proxy fobj object. 2178 // 2179 // Note: it should only do memory allocation and initialization for the 2180 // proxy fobj. Other file systems can replace this default implementation 2181 // and therefore common pxfobj initialization should go in get_pxfobj() 2182 // instead. 2183 // 2184 // Proxy vnodes are created with a reference count of one. 2185 // 2186 pxfobj * 2187 pxvfs::make_pxfobj(PXFS_VER::fobj_ptr fobjp, 2188 const PXFS_VER::fobj_info &fobjinfo, 2189 fobj_client_impl *clientp) 2190 { 2191 pxfobj *pxp; 2192 2193 ASSERT(!CORBA::is_nil(fobjp)); 2194 2195 // 2196 // XXX - new() may lead to a deadlock if memory is exhausted 2197 // See the NFS nnode management code how deadlock 2198 // can be prevented. 
2199 // 2200 2201 switch (fobjinfo.ftype) { 2202 case PXFS_VER::fobj_io: { 2203 PXFS_VER::io_var iop = PXFS_VER::io::_narrow(fobjp); 2204 ASSERT(!CORBA::is_nil(iop)); 2205 pxp = new pxchr(fs_vfs, iop, fobjinfo); 2206 break; 2207 } 2208 case PXFS_VER::fobj_file: { 2209 PXFS_VER::file_var filep = PXFS_VER::file::_narrow(fobjp); 2210 ASSERT(!CORBA::is_nil(filep)); 2211 pxp = new pxreg(clientp, fs_vfs, filep, fobjinfo); 2212 break; 2213 } 2214 case PXFS_VER::fobj_unixdir: { 2215 PXFS_VER::unixdir_var udp = PXFS_VER::unixdir::_narrow(fobjp); 2216 ASSERT(!CORBA::is_nil(udp)); 2217 pxp = new pxdir(clientp, fs_vfs, udp, fobjinfo); 2218 break; 2219 } 2220 case PXFS_VER::fobj_symbolic_link: { 2221 PXFS_VER::symbolic_link_var linkp = 2222 PXFS_VER::symbolic_link::_narrow(fobjp); 2223 ASSERT(!CORBA::is_nil(linkp)); 2224 pxp = new pxlink(clientp, fs_vfs, linkp, fobjinfo); 2225 break; 2226 } 2227 case PXFS_VER::fobj_special: { 2228 PXFS_VER::special_var spp = PXFS_VER::special::_narrow(fobjp); 2229 ASSERT(!CORBA::is_nil(spp)); 2230 pxp = new pxspecial(fs_vfs, spp, fobjinfo); 2231 break; 2232 } 2233 case PXFS_VER::fobj_sobj: 2234 case PXFS_VER::fobj_fobj: 2235 case PXFS_VER::fobj_device: 2236 case PXFS_VER::fobj_procfile: 2237 default: 2238 os::panic("pxvfs:make_pxfobj unsupported fobj type %d", 2239 fobjinfo.ftype); 2240 // NOTREACHED 2241 } 2242 2243 return (pxp); 2244 } 2245 2246 // 2247 // fobjhash_insert 2248 // Insert a new (held) pxfobj into the hash table of all active pxfobj's. 2249 // If a preexisting pxfobj is found, return it (held) instead. 2250 // The caller is responsible for calling VN_RELE() on the associated vnode; 2251 // The caller should not use or release the pointer passed to this 2252 // routine. 
2253 // 2254 pxfobj * 2255 pxvfs::fobjhash_insert(pxfobj *new_pxfobjp) 2256 { 2257 PXFS_VER::fobj_ptr fobjp = new_pxfobjp->getfobj(); 2258 pxfobj *pxfobjp; 2259 2260 // 2261 // Search the hash table of all fobj objects to see if there is 2262 // already a proxy for 'fobjp'. We delete the one just 2263 // created if there is a duplicate since this will hold the 2264 // pxfobj hash lock for less time and has fewer locking issues 2265 // than trying to lock, search, create pxfobj, unlock. 2266 // The assumption is that deleting a duplicate does not 2267 // happen very often. 2268 // 2269 uint_t b_idx = pxfs_misc::hash_devt_fid( 2270 (new_pxfobjp->get_vp())->v_vfsp->vfs_dev, 2271 new_pxfobjp->get_fidp(), pxfobjhsz); 2272 2273 pxfobj_hash_bkt &hbkt = pxfobj_hash[b_idx]; 2274 2275 hbkt.hlock.lock(); 2276 pxfobj_list_t::ListIterator iter(hbkt.hlist); 2277 2278 for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) { 2279 if (fobjp->_equiv(pxfobjp->getfobj())) { 2280 // 2281 // We found an existing pxfobj. 2282 // We do a hold before releasing the hash lock so 2283 // that pxfobj_inactive() will see that we have 2284 // reclaimed the vnode. 2285 // 2286 VN_HOLD(pxnode::PXTOV(pxfobjp)); 2287 hbkt.hlock.unlock(); 2288 2289 // 2290 // Since we didn't use the new pxfobj, we have to 2291 // adjust the active count. Objects in the hash 2292 // table will be accounted for when pxfobj_inactive() 2293 // removes it from the hash table. 2294 // 2295 flags_lock.lock(); 2296 ASSERT(active_cnt >= 2); 2297 active_cnt--; 2298 flags_lock.unlock(); 2299 VN_RELE(pxnode::PXTOV(new_pxfobjp)); 2300 return (pxfobjp); 2301 } 2302 } 2303 2304 hbkt.hlist_cnt++; 2305 2306 // 2307 // We expect the most recently created proxy file object 2308 // to be the most likely one to be used next. 2309 // So we put the newly created one at the front of the list. 
2310 // 2311 new_pxfobjp->set_inhashtable(); 2312 hbkt.hlist.prepend(new_pxfobjp); 2313 2314 hbkt.hlock.unlock(); 2315 2316 PXFS_KSTATS(node_stats, ((KSTAT_NAMED_PTR(node_stats)) 2317 [PXVFS_NODE_STATS_NUM_OPEN_FILES].value.ui32++)); 2318 PXFS_KSTATS(stats, ((KSTAT_NAMED_PTR(stats)) 2319 [PXVFS_STATS_NUM_OPEN_FILES].value.ui32++)); 2320 2321 return (new_pxfobjp); 2322 } 2323 2324 // 2325 // add_inactivelist - queue this proxy vnode for processing 2326 // to become stale. 2327 // 2328 void 2329 pxvfs::add_inactivelist(pxfobjplus *pxfobjplusp) 2330 { 2331 flags_lock.lock(); 2332 2333 // 2334 // Add ourself to the list of inactive proxy vnodes 2335 // 2336 inactive_list.append((inactive_list_elem *)pxfobjplusp); 2337 inactive_list_cnt++; 2338 2339 if ((flags & PXFS_TASK_QUEUED) == 0) { 2340 // 2341 // Prevent pxvfs from being destroyed until work completes. 2342 // 2343 VFS_HOLD(fs_vfs); 2344 2345 // 2346 // Since the system is not already scheduled 2347 // to clean up inactive proxy vnodes, 2348 // make the system clean up inactive proxy vnodes. 2349 // 2350 flags |= PXFS_TASK_QUEUED; 2351 pxvfs_inactive_threadpool::the().defer_processing(this); 2352 } 2353 flags_lock.unlock(); 2354 } 2355 2356 // 2357 // empty_inactive_list - Drain the inactive proxy vnode list. 2358 // If someone is waiting, this method processes all inactive proxy vnodes. 2359 // Otherwise, process a limited number of inactive proxy vnodes for 2360 // this file system, because we want to allow the worker thread to clean up 2361 // other file systems. 2362 // 2363 void 2364 pxvfs::empty_inactive_list() 2365 { 2366 ASSERT(cluster_fs_drain_queue_len > 0); 2367 ASSERT(flags_lock.lock_held()); 2368 2369 pxfobj *pxfobjp; 2370 int drain_count = 0; 2371 2372 // 2373 // If a forced unmount operation is waiting, we will drain only 2374 // a limited amount - but at least what is currently on the list. 
2375 // 2376 if ((flags & PXFS_FORCE_UNMOUNTING) && (flags & PXFS_INACTIVE_WAIT)) { 2377 drain_count = - (int)inactive_list_cnt; 2378 } 2379 2380 while ((pxfobjp = inactive_list.reapfirst()) != NULL) { 2381 flags_lock.unlock(); 2382 pxfobjp->cleanup_proxy_vnode(); 2383 flags_lock.lock(); 2384 2385 // 2386 // Another thread might think the pxvfs object has no 2387 // work left to clean up inactive proxy vnodes when 2388 // this count is zero. In which case it might destroy 2389 // this object. Therefore must decrement this count after 2390 // all the work has been done and while holding the flags_lock. 2391 // 2392 ASSERT(inactive_list_cnt != 0); 2393 --inactive_list_cnt; 2394 2395 // 2396 // Only do a limited amount of processing for each file 2397 // system by the reaper unless someone is waiting. 2398 // 2399 if (++drain_count >= cluster_fs_drain_queue_len && 2400 (((flags & PXFS_INACTIVE_WAIT) == 0) || 2401 (flags & PXFS_FORCE_UNMOUNTING))) { 2402 break; 2403 } 2404 } 2405 // 2406 // If the inactive list is empty or a forced unmount is being attempted, 2407 // then wake up any other thread that is waiting for proxy vnodes 2408 // to be cleaned up. 2409 // 2410 if ((inactive_list_cnt == 0 || flags & PXFS_FORCE_UNMOUNTING) && 2411 ((flags & PXFS_INACTIVE_WAIT) != 0)) { 2412 // 2413 // Wake up any other thread that 2414 // is waiting for proxy vnodes to be cleaned up. 2415 // 2416 flags &= ~PXFS_INACTIVE_WAIT; 2417 flags_cv.broadcast(); 2418 } 2419 } 2420 2421 // 2422 // If this function is called in support of a normal unmount, and the 2423 // filesystem is busy (non-zero active_cnt), then return immediately. 2424 // If this is not for an unmount or if the unmount is forced, then wait 2425 // for processing of the inactive vnode list. 2426 // 2427 // The return value is only significant for normal unmount operations 2428 // (is_unmount == true and forced_umount = false). For this case, a 2429 // value of true is returned if the filesystem is busy. 
For all other 2430 // cases, false is returned. 2431 // 2432 // If filesystem does not have active files, set a flag to prevent new 2433 // vnodes from being created (see get_pxfobj()). 2434 // 2435 bool 2436 pxvfs::wait_empty_inactive_list(bool forced_unmount) 2437 { 2438 int try_count; 2439 2440 flags_lock.lock(); 2441 if (!forced_unmount) { 2442 // 2443 // The only thing holding a vnode active may be something 2444 // transient, like an asynchronous write. For a regular unmount 2445 // allow a little more time for transient stuff to complete. 2446 // 2447 try_count = 6; 2448 } else { 2449 try_count = 1; 2450 } 2451 2452 for (; try_count > 0; try_count--) { 2453 // Wait for inactive proxy vnode processing to finish. 2454 while (inactive_list_cnt != 0) { 2455 flags |= PXFS_INACTIVE_WAIT; 2456 flags_cv.wait(&flags_lock); 2457 if (forced_unmount) { 2458 break; 2459 } 2460 } 2461 if (active_cnt != 0 && try_count > 1) { 2462 flags_lock.unlock(); 2463 // Sleep for 1 second 2464 os::usecsleep((os::usec_t)1000000); 2465 flags_lock.lock(); 2466 } 2467 } 2468 // 2469 // Check to see if any proxy vnodes are still in use. 2470 // 2471 bool in_use; 2472 if (forced_unmount) { 2473 in_use = false; 2474 } else { 2475 in_use = (active_cnt != 0); 2476 if (in_use) { 2477 flags &= ~PXFS_UNMOUNTING; 2478 if (flags & PXFS_FILE_ACTIVATE) { 2479 flags &= ~PXFS_FILE_ACTIVATE; 2480 flags_cv.broadcast(); 2481 } 2482 } 2483 } 2484 flags_lock.unlock(); 2485 return (in_use); 2486 } 2487 2488 // 2489 // pxfobj_inactive 2490 // This is called when the last vnode reference to a proxy vnode 2491 // is being released (i.e., the last call to VN_RELE() with v_count == 1). 2492 // 2493 // This supports proxy vnodes that never have dirty information. 2494 // Thus there are no callbacks from the server on these proxy vnodes. 2495 // This kind of proxy vnode can only be found through the hash table. 
2496 // 2497 void 2498 pxvfs::pxfobj_inactive(pxfobj *pxfobjp) 2499 { 2500 // 2501 // Check to see if we are in the hash table. 2502 // If we are not, it is because we lost the race in 2503 // fobjhash_insert(). Once this kind of proxy vnode enters 2504 // the hash table, the proxy vnode remains there until destroyed. 2505 // 2506 if (pxfobjp->is_inhashtable()) { 2507 uint_t b_idx = pxfs_misc::hash_devt_fid( 2508 (pxfobjp->get_vp())->v_vfsp->vfs_dev, 2509 pxfobjp->get_fidp(), pxfobjhsz); 2510 2511 pxfobj_hash_bkt &hbkt = pxfobj_hash[b_idx]; 2512 hbkt.hlock.lock(); 2513 2514 vnode_t *vnodep = pxfobjp->get_vp(); 2515 2516 // 2517 // Check to see if we are still inactive (not reclaimed). 2518 // 2519 mutex_enter(&vnodep->v_lock); 2520 if (vnodep->v_count > 1) { 2521 // 2522 // We were reclaimed by vget() or fobjhash_insert(). 2523 // Account for the missing decrement in vn_rele(). 2524 // 2525 vnodep->v_count--; 2526 mutex_exit(&vnodep->v_lock); 2527 hbkt.hlock.unlock(); 2528 return; 2529 } 2530 mutex_exit(&vnodep->v_lock); 2531 2532 // 2533 // We are now committed to releasing the vnode. 2534 // 2535 bool removed = hbkt.hlist.erase(pxfobjp); 2536 CL_PANIC(removed); 2537 2538 ASSERT(hbkt.hlist_cnt != 0); 2539 hbkt.hlist_cnt--; 2540 2541 hbkt.hlock.unlock(); 2542 flags_lock.lock(); 2543 ASSERT(active_cnt != 0); 2544 active_cnt--; 2545 pxfobjp->not_inhashtable(); 2546 flags_lock.unlock(); 2547 2548 PXFS_KSTATS(node_stats, ((KSTAT_NAMED_PTR(node_stats)) 2549 [PXVFS_NODE_STATS_NUM_OPEN_FILES].value.ui32--)); 2550 PXFS_KSTATS(stats, ((KSTAT_NAMED_PTR(stats)) 2551 [PXVFS_STATS_NUM_OPEN_FILES].value.ui32--)); 2552 } 2553 pxfobjp->set_stale(); 2554 delete pxfobjp; 2555 } 2556 2557 // 2558 // pxfobjplus_inactive 2559 // Dirty cached information has already been flushed to the server. 2560 // 2561 // If the proxy vnode is not in use, remove the proxy vnode from the 2562 // hash table. 
2563 // 2564 // Return Result True = the proxy vnode is not in the hash table 2565 bool 2566 pxvfs::pxfobjplus_inactive(pxfobjplus *pxfobjplusp) 2567 { 2568 // 2569 // The proxy vnode may never have entered the hash table. 2570 // 2571 if (pxfobjplusp->is_inhashtable()) { 2572 // 2573 // The proxy vnode is in the hash table 2574 // 2575 uint_t b_idx = pxfs_misc::hash_devt_fid( 2576 (pxfobjplusp->get_vp())->v_vfsp->vfs_dev, 2577 pxfobjplusp->get_fidp(), pxfobjhsz); 2578 2579 pxfobj_hash_bkt &hbkt = pxfobj_hash[b_idx]; 2580 hbkt.hlock.lock(); 2581 2582 vnode_t *vnodep = pxfobjplusp->get_vp(); 2583 2584 // 2585 // Check to see if we are still inactive (not reclaimed). 2586 // 2587 mutex_enter(&vnodep->v_lock); 2588 if (vnodep->v_count > 1) { 2589 // 2590 // The proxy vnode was reclaimed. 2591 // Account for the missing decrement in vn_rele(). 2592 // 2593 vnodep->v_count--; 2594 mutex_exit(&vnodep->v_lock); 2595 hbkt.hlock.unlock(); 2596 return (false); 2597 } 2598 mutex_exit(&vnodep->v_lock); 2599 2600 // 2601 // We are now committed to releasing the vnode. 2602 // 2603 2604 bool removed = hbkt.hlist.erase((pxfobj *)pxfobjplusp); 2605 CL_PANIC(removed); 2606 2607 ASSERT(hbkt.hlist_cnt != 0); 2608 hbkt.hlist_cnt--; 2609 2610 hbkt.hlock.unlock(); 2611 flags_lock.lock(); 2612 ASSERT(active_cnt != 0); 2613 active_cnt--; 2614 flags_lock.unlock(); 2615 pxfobjplusp->not_inhashtable(); 2616 2617 PXFS_KSTATS(node_stats, ((KSTAT_NAMED_PTR(node_stats)) 2618 [PXVFS_NODE_STATS_NUM_OPEN_FILES].value.ui32--)); 2619 PXFS_KSTATS(stats, ((KSTAT_NAMED_PTR(stats)) 2620 [PXVFS_STATS_NUM_OPEN_FILES].value.ui32--)); 2621 } 2622 pxfobjplusp->set_stale(); 2623 return (true); 2624 } 2625 2626 // 2627 // purge_caches - is called to prepare a filesystem for unmount/removal. 2628 // Return true if file system is still in use (active vnodes present), 2629 // false otherwise. In case of a forced unmount, return false always. 
2630 // 2631 bool 2632 pxvfs::purge_caches(bool force_unmount, cred *credp) 2633 { 2634 // Purge all DNLC entries for this vfs. 2635 (void) dnlc_purge_vfsp(fs_vfs, 0); 2636 2637 flags_lock.lock(); 2638 // 2639 // Set PXFS_UNMOUNTING to block creation of new proxy vnodes. 2640 // 2641 flags |= PXFS_UNMOUNTING; 2642 if (force_unmount) { 2643 flags |= PXFS_FORCE_UNMOUNTING; 2644 } 2645 2646 if (fs_rootvp != NULL) { 2647 // Release the cached root directory vnode 2648 vnode_t *vnodep = fs_rootvp; 2649 fs_rootvp = NULL; 2650 flags_lock.unlock(); 2651 VN_RELE(vnodep); 2652 } else { 2653 flags_lock.unlock(); 2654 } 2655 2656 // 2657 // For normal unmount sync all the data (shortcut to VFS_SYNC(fs_vfs)). 2658 // Any possiblity of hanging because of I/O problems is avoided for 2659 // forced unmount. 2660 // 2661 if (!force_unmount) { 2662 (void) sync(0, credp); 2663 } 2664 2665 // 2666 // Lock the vfs to maintain file system status quo during unmount. 2667 // This has to be done after sync(), because ufs_update tries 2668 // to acquire the vfs_reflock. Thus we avoid deadlock in 2669 // traverse(), VFS_ROOT(), get_pxfobj(). 2670 // 2671 vfs_lock_wait(fs_vfs); 2672 2673 // 2674 // Forced Unmount Case - make one pass at processing the inactive list, 2675 // and the return value is always false. 2676 // 2677 // Otherwise wait for the inactive list to be processed, 2678 // and the return value will be true if there are still active 2679 // proxy vnodes. 2680 // 2681 if (wait_empty_inactive_list(force_unmount)) { 2682 vfs_unlock(fs_vfs); 2683 2684 PXFS_DBPRINTF( 2685 PXFS_TRACE_PXVFS, 2686 PXFS_AMBER, 2687 ("pxvfs:purge_caches(%p) active_cnt %d\n", 2688 this, active_cnt)); 2689 2690 return (true); 2691 } else { 2692 return (false); 2693 } 2694 } 2695 2696 // 2697 // Called to flush the filesystem's dirty data. 2698 // All the files is the hash bucket are flushed. 2699 // Returns non-zero if an error is encountered; zero otherwise. 
2700 // 2701 // revoke : This is set to true if called from revoke_allocation(). 2702 // In which case we call sync_file_revoke() instead of sync_file(). 2703 // 2704 int 2705 pxvfs::sync_filesystem(cred *credp, bool revoke) 2706 { 2707 int error = 0; 2708 int ret = 0; 2709 pxfobj *pxfobjp; 2710 pxfobj *prevp = NULL; 2711 int file_count = 0; 2712 2713 for (uint_t idx = 0; idx < pxfobjhsz; idx++) { 2714 // 2715 // Make sure the iterator is initialized while holding the 2716 // lock. 2717 // 2718 pxfobj_hash[idx].hlock.lock(); 2719 pxfobj_list_t::ListIterator iter(pxfobj_hash[idx].hlist); 2720 for (; (pxfobjp = iter.get_current()) != NULL; iter.advance()) { 2721 2722 ASSERT(pxfobjp->is_inhashtable()); 2723 2724 // 2725 // Place a hold on the vnode so that it is not 2726 // released during proccessing. 2727 // 2728 VN_HOLD(pxnode::PXTOV(pxfobjp)); 2729 pxfobj_hash[idx].hlock.unlock(); 2730 2731 if (prevp != NULL) { 2732 // 2733 // It is possible that this operation will 2734 // render this proxy vnode inactive and expel 2735 // this proxy vnode from the hash table. 2736 // 2737 VN_RELE(pxnode::PXTOV(prevp)); 2738 } 2739 2740 // 2741 // We skip those vnodes which don't belong to this 2742 // filesystem. 2743 // 2744 if ((pxnode::PXTOV(pxfobjp)->v_vfsp) != fs_vfs) { 2745 prevp = pxfobjp; 2746 pxfobj_hash[idx].hlock.lock(); 2747 continue; 2748 } 2749 2750 // 2751 // Synchronously write out dirty data. 2752 // Wait for queued up async requests to complete. 2753 // 2754 if (revoke) { 2755 error = pxfobjp->sync_file_revoke(); 2756 } else { 2757 error = pxfobjp->sync_file(); 2758 } 2759 2760 if (error) { 2761 PXFS_DBPRINTF( 2762 PXFS_TRACE_PXVFS, 2763 PXFS_RED, 2764 ("pxvfs::sync_filesystem(%p) sync_file(%p)" 2765 " returned error %d \n", 2766 this, pxfobjp, error)); 2767 2768 // Return a non zero value. 
2769 ret = 1; 2770 } 2771 2772 // 2773 // We cannot release the hold on this proxy vnode now 2774 // as the release can result in the vnode getting 2775 // expelled from the hash table. So we save a pointer 2776 // to the proxy vnode and do the release later. 2777 // Note: The release has to be done when the hlock is 2778 // not held. 2779 // 2780 prevp = pxfobjp; 2781 2782 // 2783 // This sync can starve other work. 2784 // So we throttle the sync. 2785 // 2786 file_count++; 2787 if (file_count >= sync_filesystem_throttle) { 2788 // 2789 // Allow other work. Sleep for 20 ms. 2790 // 2791 os::usecsleep((os::usec_t)(20000)); 2792 file_count = 0; 2793 } 2794 2795 pxfobj_hash[idx].hlock.lock(); 2796 } 2797 pxfobj_hash[idx].hlock.unlock(); 2798 } 2799 2800 if (prevp != NULL) { 2801 // 2802 // Release the hold placed on the proxy vnode that was 2803 // processed last. 2804 // 2805 VN_RELE(pxnode::PXTOV(prevp)); 2806 } 2807 2808 return (ret); 2809 } 2810 2811 // 2812 // This is called by mount_client_impl::remove_notify() when the 2813 // global unmount succeeds. 2814 // 2815 void 2816 pxvfs::unmount_succeeded() 2817 { 2818 // ASSERT(vfs_lock_held(fs_vfs)); 2819 fs_vfs->vfs_flag |= VFS_UNMOUNTED; 2820 2821 // 2822 // Clear the PXFS_UNMOUNTING flag, set the PXFS_UNMOUNTED flag, 2823 // and wake up any get_pxfobj() sleepers. 2824 // 2825 flags_lock.lock(); 2826 flags &= ~PXFS_UNMOUNTING; 2827 flags |= PXFS_UNMOUNTED; 2828 if (flags & PXFS_FILE_ACTIVATE) { 2829 flags &= ~PXFS_FILE_ACTIVATE; 2830 flags_cv.broadcast(); 2831 } 2832 flags_lock.unlock(); 2833 2834 fsmgr_client_implp->unmount_succeeded(); 2835 fsmgr_client_implp = NULL; 2836 2837 // 2838 // Remove ourself from the all_pxvfs list (if we're there). 2839 // XXX how do we make sure that find_pxvfs() doesn't hand out a 2840 // pointer to us before we remove ourself from the list? 
2841 // 2842 all_pxvfs_lock.wrlock(); 2843 (void) all_pxvfs.erase((pxvfs_list_elem *)this); 2844 VFS_RELE(fs_vfs); // Release list's hold on vfs_t 2845 all_pxvfs_lock.unlock(); 2846 2847 #ifdef PXFS_KSTATS_ENABLED 2848 if (stats != NULL) { 2849 kstat_delete(stats); 2850 } 2851 #endif 2852 } 2853 2854 // 2855 // This is called by pxvfs::unmount() and mount_client_impl::unmount_failed() 2856 // when a global unmount fails. 2857 // 2858 void 2859 pxvfs::unmount_failed() 2860 { 2861 // Clear the PXFS_UNMOUNTING flag and wake up any get_pxfobj() sleepers. 2862 flags_lock.lock(); 2863 flags &= ~PXFS_UNMOUNTING; 2864 flags &= ~PXFS_FORCE_UNMOUNTING; 2865 if (flags & PXFS_FILE_ACTIVATE) { 2866 flags &= ~PXFS_FILE_ACTIVATE; 2867 flags_cv.broadcast(); 2868 } 2869 flags_lock.unlock(); 2870 } 2871 2872 // 2873 // This routine is called to clean up if the file system server crashes. 2874 // It is called from fsmgr_client_impl::_unreferenced() so it shouldn't 2875 // take too much time. 2876 // 2877 void 2878 pxvfs::cleanup() 2879 { 2880 // Clean up sleeping locks. 2881 pxfs_llm_callback_impl *llmp; 2882 llm_cb_list_lock.lock(); 2883 for (llm_cb_list.atfirst(); (llmp = llm_cb_list.get_current()) != NULL; 2884 llm_cb_list.advance()) { 2885 llmp->signal(EIO); 2886 } 2887 llm_cb_list_lock.unlock(); 2888 } 2889 2890 // 2891 // new_file_system_primary - do processing needed when the system 2892 // activates a new file system primary. 2893 // 2894 void 2895 pxvfs::new_file_system_primary(uint32_t server_incarn, Environment &) 2896 { 2897 // 2898 // The locking ensures that all in-progress client registrations 2899 // complete before changing the server incarnation. 2900 // This does not wait for invocations that have yet produced a reply. 2901 // 2902 server_incn_lock.wrlock(); 2903 server_incn = server_incarn; 2904 server_incn_lock.unlock(); 2905 2906 replay_sleeping_locks(); 2907 } 2908 2909 // 2910 // Replay all the sleeping locks that originated from this node. 
2911 // 2912 void 2913 pxvfs::replay_sleeping_locks() 2914 { 2915 // 2916 // Walk through the list of callback objects, and wake them up with 2917 // 'RETRY_LOCK'. 2918 // 2919 pxfs_llm_callback_impl *llmp; 2920 llm_cb_list_lock.lock(); 2921 for (llm_cb_list.atfirst(); (llmp = llm_cb_list.get_current()) != NULL; 2922 llm_cb_list.advance()) { 2923 llmp->signal(pxfs_llm_callback_impl::RETRY_LOCK); 2924 } 2925 llm_cb_list_lock.unlock(); 2926 } 2927 2928 // 2929 // Insert callback object into list of callback objects. 2930 // 2931 void 2932 pxvfs::insert_llm_cbobj(pxfs_llm_callback_impl *llmp) 2933 { 2934 llm_cb_list_lock.lock(); 2935 llm_cb_list.prepend(llmp); 2936 llm_cb_list_lock.unlock(); 2937 } 2938 2939 // 2940 // Remove callback object from list of callback objects. 2941 // 2942 void 2943 pxvfs::remove_llm_cbobj(pxfs_llm_callback_impl *llmp) 2944 { 2945 llm_cb_list_lock.lock(); 2946 (void) llm_cb_list.erase(llmp); 2947 llm_cb_list_lock.unlock(); 2948 } 2949 2950 // 2951 // Calls made via pxfs/server/nlm_pxfs.cc when lockd on this node dies/restarts. 2952 // When lockd dies, there are two calls, setting the status of the NLM locks 2953 // from this node to 'FLK_NLM_SHUTTING_DOWN', and then 'FLK_NLM_DOWN'. The 2954 // first call interrupts all sleeping locks, and the second call discards all 2955 // active locks. 2956 // 2957 // static 2958 void 2959 pxvfs::set_nlm_status(int32_t nlmid, PXFS_VER::nlm_status status) 2960 { 2961 pxvfs *pxvfsp; 2962 SList<pxvfs> tmp_all_pxvfs; 2963 2964 // 2965 // Make a copy of 'all_pxvfs' so we don't have to lock/unlock 2966 // 'all_pxvfs'. 2967 // 2968 all_pxvfs_lock.rdlock(); 2969 for (all_pxvfs.atfirst(); 2970 (pxvfsp = all_pxvfs.get_current()) != NULL; 2971 all_pxvfs.advance()) { 2972 VFS_HOLD(pxvfsp->fs_vfs); 2973 tmp_all_pxvfs.prepend(pxvfsp); 2974 } 2975 all_pxvfs_lock.unlock(); 2976 2977 // Call 'set_nlm_status' on each filesystem. 
2978 Environment e; 2979 while ((pxvfsp = tmp_all_pxvfs.reapfirst()) != NULL) { 2980 pxvfsp->get_fsobj()->set_nlm_status(nlmid, status, e); 2981 e.clear(); 2982 VFS_RELE(pxvfsp->fs_vfs); 2983 } 2984 } 2985 2986 // 2987 // Call made via pxfs/server/nlm_pxvfs.cc when statd on the client or server 2988 // node dies and gets restarted. 2989 // 2990 // static 2991 void 2992 pxvfs::remove_file_locks(int32_t sysid) 2993 { 2994 pxvfs *pxvfsp; 2995 SList<pxvfs> tmp_all_pxvfs; 2996 2997 // 2998 // Make a copy of 'all_pxvfs' so we don't have to lock/unlock 2999 // 'all_pxvfs'. 3000 // 3001 all_pxvfs_lock.rdlock(); 3002 for (all_pxvfs.atfirst(); 3003 (pxvfsp = all_pxvfs.get_current()) != NULL; 3004 all_pxvfs.advance()) { 3005 VFS_HOLD(pxvfsp->fs_vfs); 3006 tmp_all_pxvfs.prepend(pxvfsp); 3007 } 3008 all_pxvfs_lock.unlock(); 3009 3010 // Call 'remove_file_locks' on each filesystem. 3011 Environment e; 3012 while ((pxvfsp = tmp_all_pxvfs.reapfirst()) != NULL) { 3013 pxvfsp->get_fsobj()->remove_file_locks(sysid, e); 3014 e.clear(); 3015 VFS_RELE(pxvfsp->fs_vfs); 3016 } 3017 } 3018 3019 // 3020 // find_pxvfs 3021 // Return the pxvfs structure for a given PXFS file system object. 3022 // Return NULL if the proxy could not be found or created. 3023 // Otherwise, the pointer is returned held() and the caller should 3024 // call rele() when finished using the pointer. 3025 // 3026 // If 'fsinfop' is not NULL and 'fsobj' does not already 3027 // have a proxy, then use the file system info to create a new proxy. 3028 // 3029 pxvfs * 3030 pxvfs::find_pxvfs(PXFS_VER::filesystem_ptr fsobj, 3031 const PXFS_VER::fs_info *fsinfop) 3032 { 3033 pxvfs *pxvfsp; 3034 vfs_t *vfsp; 3035 Environment e; 3036 3037 // 3038 // The vast majority of calls will only need the read lock, 3039 // because the proxy file system vfs already exists. 
3040 // 3041 all_pxvfs_lock.rdlock(); 3042 3043 // Search the list of proxy file systems 3044 pxvfsp = search(fsobj); 3045 if (pxvfsp != NULL) { 3046 // 3047 // This proxy file system vfs exists 3048 // 3049 ASSERT((pxvfsp->flags & PXFS_UNMOUNTED) == 0); 3050 VFS_HOLD(pxvfsp->fs_vfs); 3051 all_pxvfs_lock.unlock(); 3052 return (pxvfsp); 3053 } 3054 3055 // 3056 // If we aren't supposed to create a new proxy, 3057 // return NULL since we didn't find it. 3058 // 3059 if (fsinfop == NULL) { 3060 all_pxvfs_lock.unlock(); 3061 return (NULL); 3062 } 3063 3064 // 3065 // Try to upgrade the read lock to a write lock. This can fail. 3066 // 3067 if (!all_pxvfs_lock.try_upgrade()) { 3068 // Lock upgrade attempt failed 3069 all_pxvfs_lock.unlock(); 3070 all_pxvfs_lock.wrlock(); 3071 3072 // 3073 // Normally nobody else will be attempting to 3074 // create this file system. But this code takes 3075 // the safe approach that will always work 3076 // in spite of orphan requests or anything else, 3077 // and the cost is minimal. 3078 // 3079 // Search the list of proxy file systems 3080 pxvfsp = search(fsobj); 3081 if (pxvfsp != NULL) { 3082 // 3083 // This proxy file system vfs exists 3084 // 3085 ASSERT((pxvfsp->flags & PXFS_UNMOUNTED) == 0); 3086 VFS_HOLD(pxvfsp->fs_vfs); 3087 all_pxvfs_lock.unlock(); 3088 return (pxvfsp); 3089 } 3090 } 3091 3092 // 3093 // Set up the fsmgr_client/fsmgr_server connection. 
3094 // 3095 fsmgr_client_impl *clientmgrp = new fsmgr_client_impl(); 3096 PXFS_VER::fsmgr_client_var clientmgr = clientmgrp->get_objref(); 3097 3098 FAULTPT_PXFS(FAULTNUM_PXFS_MOUNT_BIND_FS_B, 3099 FaultFunctions::generic); 3100 3101 uint32_t server_incarn; 3102 uint32_t fs_blk_size; 3103 bool fastwrite_flag; 3104 PXFS_VER::fsmgr_server_ptr servermgr_p = fsobj-> 3105 bind_fs(clientmgr, orb_conf::node_number(), server_incarn, 3106 fs_blk_size, fastwrite_flag, e); 3107 if (e.exception()) { 3108 // 3109 // We ignore comm failures since this can happen 3110 // if a node with the pxfs server crashes and 3111 // then any node joins the global name space. The 3112 // mount server will try to create a proxy vfs 3113 // for the dead file system and link it into the 3114 // name space. We need this to succeed so that 3115 // the dead file system can be unmounted properly. 3116 // 3117 if (CORBA::COMM_FAILURE::_exnarrow(e.exception()) == NULL) { 3118 #ifdef DEBUG 3119 e.exception()->print_exception( 3120 "pxvfs::findpxvfs: "); 3121 #endif 3122 MOUNT_DBPRINTF( 3123 MOUNT_TRACE_CLIENT, 3124 MOUNT_RED, 3125 ("pxvfs:findpxvfs(): exception from" 3126 " fsobj->bind_fs()\n")); 3127 } else { 3128 servermgr_p = PXFS_VER::fsmgr_server::_nil(); 3129 MOUNT_DBPRINTF( 3130 MOUNT_TRACE_CLIENT, 3131 MOUNT_RED, 3132 ("pxvfs:findpxvfs(): comm failure\n ")); 3133 } 3134 3135 e.clear(); 3136 } 3137 3138 // 3139 // Allocate a vfssw[] entry for the underlying file system type 3140 // but don't try to load the module. 
3141 // 3142 RLOCK_VFSSW(); 3143 struct vfssw *vswp = vfs_getvfsswbyname((char *)fsinfop->fstype); 3144 if (vswp == NULL) { 3145 RUNLOCK_VFSSW(); 3146 WLOCK_VFSSW(); 3147 vswp = vfs_getvfsswbyname((char *)fsinfop->fstype); 3148 if (vswp == NULL) 3149 vswp = allocate_vfssw((char *)fsinfop->fstype); 3150 WUNLOCK_VFSSW(); 3151 if (vswp == NULL) { 3152 all_pxvfs_lock.unlock(); 3153 CORBA::release(servermgr_p); 3154 return (NULL); 3155 } 3156 RLOCK_VFSSW(); 3157 } 3158 int fstype = (int)(vswp - vfssw); 3159 RUNLOCK_VFSSW(); 3160 3161 // 3162 // Create a new vfs. 3163 // 3164 #if SOL_VERSION >= __s11 3165 vfsp = vfs_alloc(KM_SLEEP); 3166 #else 3167 vfsp = (vfs_t *)kmem_alloc(sizeof (vfs_t), KM_SLEEP); 3168 #endif 3169 VFS_INIT(vfsp, pxfs_vfsopsp, (caddr_t)NULL); 3170 VFS_HOLD(vfsp); 3171 pxvfsp = new pxvfs(fsobj, clientmgrp, fsinfop, fstype, vfsp, 3172 server_incarn); 3173 3174 pxvfsp->pxfs_bsize = fs_blk_size; 3175 pxvfsp->blocks_available = 0; 3176 pxvfsp->fastwrite = fastwrite_flag; 3177 3178 // Finish initializing the linkages between fsmgr client and server. 3179 clientmgrp->set_pxvfsp(pxvfsp, servermgr_p); 3180 3181 // Add ourself to the list of all PXFS file systems. 3182 all_pxvfs.prepend((pxvfs_list_elem *)pxvfsp); 3183 VFS_HOLD(vfsp); 3184 all_pxvfs_lock.unlock(); 3185 3186 CORBA::release(servermgr_p); 3187 return (pxvfsp); 3188 } 3189 3190 // 3191 // Search the all_pxvfs list for the given file system object. 3192 // Return NULL if not found. 3193 // 3194 pxvfs * 3195 pxvfs::search(PXFS_VER::filesystem_ptr fsobj) 3196 { 3197 pxvfs *p; 3198 pxvfs_list_t::ListIterator iter(all_pxvfs); 3199 3200 for (; (p = iter.get_current()) != NULL; 3201 iter.advance()) { 3202 if (fsobj->_equiv(p->get_fsobj())) { 3203 return (p); 3204 } 3205 } 3206 return (NULL); 3207 } 3208 3209 // 3210 // get_configured_nodes - obtains the nodes configured for this device. 3211 // An important side effect is that DCS will start the device service 3212 // if it is not already started. 
3213 // 3214 int 3215 pxvfs::get_configured_nodes(bool &dev_is_ha, CORBA::String_out dev_name, 3216 dev_t devid, sol::nodeid_seq_t_out nodes, Environment &e) 3217 { 3218 int error; 3219 3220 #ifndef PSARC_2001_038 // no longer called with the lock held in Solaris 9 3221 RUNLOCK_VFSSW(); 3222 #endif 3223 3224 fs::dc_callback_var callback = 3225 mount_client_impl::get_server()->get_dc_callback(e); 3226 3227 #ifndef PSARC_2001_038 3228 RLOCK_VFSSW(); 3229 #endif 3230 3231 if (e.exception()) { 3232 return (pxfslib::get_err(e)); 3233 } 3234 3235 error = dcs_get_configured_nodes(devid, callback, 3236 dev_is_ha, dev_name, nodes); 3237 //lint -e1746 3238 3239 return (error); 3240 } 3241 3242 void 3243 pxvfs::disable_unmounts() 3244 { 3245 unmounts_disabled = true; 3246 } 3247 3248 // 3249 // memory_callback - the Memory Monitor executes this method whenever 3250 // the memory state changes. This method purges all pxfs entries from 3251 // the DNLC cache when the system is memory starved 3252 // 3253 // static 3254 void 3255 pxvfs::memory_callback(monitor::system_state_t state) 3256 { 3257 switch (state) { 3258 case monitor::MEMORY_STARVED: 3259 // 3260 // PXFS enters its proxy vnode into the DNLC. 3261 // When PXFS is on top of UFS on the same node, 3262 // the DNLC on this node also contains UFS vnodes. 3263 // The UFS vnode entry in the DNLC is not removed 3264 // when the corresponding PXFS server file object goes away. 3265 // We need to purge both the client proxy vnode 3266 // and the server file system vnode. 3267 // 3268 // The DNLC is an optional performance enhancer. 3269 // Any file in active use will soon be entered again 3270 // into the DNLC. 3271 // 3272 // The DNLC method for removing a specific vnode 3273 // requires walking all of the DNLC entries. 3274 // So purge everything. 
3275 // 3276 dnlc_purge(); 3277 3278 PXFS_DBPRINTF( 3279 PXFS_TRACE_PXVFS, 3280 PXFS_AMBER, 3281 ("pxvfs:memory_callback: state %d purged dnlc\n", 3282 state)); 3283 break; 3284 default: 3285 break; 3286 } 3287 } 3288 3289 // 3290 // update_throughput() does bandwidth calculation. This method is called by 3291 // every thread that does a successful page_out with the number of bytes 3292 // transferred. In the case of async writes, it will be the aio callback 3293 // that calls this method. After every re-calculation it signals all 3294 // threads waiting for more bandwidth. 3295 // 3296 void 3297 pxvfs::update_throughput(int bytes_xfrd) 3298 { 3299 timespec_t now = {0L, 0}; 3300 uint64_t current_rate; 3301 uint64_t time_for_xfr; 3302 3303 gethrestime(&now); 3304 3305 // 3306 // If the current window started more than a second ago, store 3307 // current time, calculate new bytes per second rate and set bytes 3308 // available over next second to new data rate. 3309 // 3310 data_rate_lock.lock(); 3311 3312 // increment total bytes transferred from window_start 3313 bytes_sent_in_second += bytes_xfrd; 3314 3315 // 3316 // The last througput update as less than a second ago, don't 3317 // recalculate data rate. 3318 // 3319 if (diff_timespec(window_start, now) < 1000) { 3320 data_rate_lock.unlock(); 3321 return; 3322 } 3323 3324 gethrestime(&window_start); 3325 3326 // 3327 // Update recommended data rate 3328 // 3329 monitor::system_state_t sys_state = monitor::the().get_current_state(); 3330 if (sys_state != monitor::MEMORY_PLENTY) { 3331 // 3332 // If memory is low we set the data rate to exactly the number 3333 // of bytes we committed in the last second. 3334 // 3335 data_rate = bytes_sent_in_second; 3336 3337 // 3338 // If less than configured minimum bytes were written and 3339 // memory state is not MEMORY_STARVED set data_rate to the 3340 // configured minimum data rate. 
3341 // 3342 if (bytes_sent_in_second < data_rate_minimum && 3343 sys_state != monitor::MEMORY_STARVED) { 3344 data_rate = data_rate_minimum; 3345 } 3346 } else { 3347 // 3348 // If writers aren't using the available bandwidth, we 3349 // don't change data rate. Re-calculation happens only if 3350 // qouta in last second was exceeded. wait_for_bandwidth() 3351 // allows this over-run when there is plenty of memory. 3352 // 3353 if (bytes_written_in_second >= data_rate) { 3354 if (bytes_sent_in_second < data_rate_minimum) { 3355 // 3356 // Bytes written could be low because there 3357 // weren't enough writes in the last few 3358 // seconds. Ease off the throttling as we have 3359 // enough memory to accomodate dirty pages. 3360 // Set data rate to the configured minimum 3361 // instead of bytes_sent_in_second. 3362 // 3363 data_rate = data_rate_minimum; 3364 } else { 3365 // 3366 // Anticipate an increase by 6.25% over the 3367 // next second if current data-rate is more 3368 // than the configured default. 3369 // 3370 if (bytes_sent_in_second > data_rate_default) { 3371 data_rate = bytes_sent_in_second + 3372 (bytes_sent_in_second/16); 3373 } else { 3374 // 3375 // If current data rate is below 3376 // configured default, we must get to 3377 // the default fast. We increase the 3378 // data rate at 50% every second. 3379 // 3380 data_rate = bytes_sent_in_second + 3381 (bytes_sent_in_second/2); 3382 } 3383 } 3384 } 3385 } 3386 3387 // Reset per second accumulators.. 3388 bytes_written_in_second = 0; 3389 bytes_sent_in_second = 0; 3390 3391 // ..and set byte quota for the current window. 3392 bytes_in_window = data_rate; 3393 3394 // 3395 // If there are no bytes available for writers to consume don't 3396 // wake anyone. 3397 // 3398 if (bytes_in_window != 0) { 3399 bandwidth_lock.lock(); 3400 // Wakeup any thread waiting for bandwidth. 
3401 bandwidth_cv.broadcast(); 3402 bandwidth_lock.unlock(); 3403 } 3404 3405 data_rate_lock.unlock(); 3406 } 3407 3408 // 3409 // 'throttle_monitor_thread' runs every second, updates the bandwidth and 3410 // wakes up writers waiting for bandwidth. Without a per-second trigger, 3411 // writers can wait indefinitely if there was no i/o scheduled. 3412 // 3413 void 3414 pxvfs::throttle_monitor_thread(void *) 3415 { 3416 while (true) { 3417 os::usecsleep(throttle_monitor_interval); 3418 // 3419 // Determine new bandwidth for this second with no bytes 3420 // transferred. 3421 // 3422 pxvfs::update_throughput(0); 3423 } 3424 } 3425 3426 // Create a new thread to monitor bandwidth per second. 3427 int 3428 pxvfs::launch_throttle_monitor_thread() 3429 { 3430 // 3431 // Create a kernel thread in the SYS scheduling class. 3432 // 3433 if ((clnewlwp(throttle_monitor_thread, 3434 NULL, MINCLSYSPRI, NULL, NULL)) != 0) { 3435 return (-1); 3436 } 3437 return (0); 3438 } 3439 3440 // 3441 // Called at modload time. 3442 // 3443 int 3444 pxvfs::startup() 3445 { 3446 // 3447 // The size of the pxfobj_hash table is computed in a way that 3448 // is similar to the way ufs calculates the size/max size of its 3449 // in-core inode hash table. 3450 // 3451 //lint -e64 -e419 -e712 -e747 -e534 3452 if (pxfobjhsz_max == 0) { 3453 pxfobjhsz_max = 3454 (uint_t)1 << os::highbit((uint_t)ncsize / pxfobjh_len); 3455 } 3456 3457 if (pxfobjhsz == 0) { 3458 pxfobjhsz = pxfobjhsz_max; 3459 } 3460 3461 if (pxfobjhsz > pxfobjhsz_max) { 3462 pxfobjhsz = pxfobjhsz_max; 3463 } 3464 3465 pxfobj_hash = new pxfobj_hash_bkt[pxfobjhsz]; 3466 3467 // 3468 // Create the pxvfs_inactive_threadpool, 3469 // which processes requests to reap inactive proxy vnodes 3470 // 3471 pxvfs_inactive_threadpool::startup(); 3472 3473 #ifdef PXFS_KSTATS_ENABLED 3474 // Create the per-node kstat structure. 
3475 node_stats = kstat_create("pxfs", 0, 3476 "Per-node client v1 stats", "pxvfs", 3477 KSTAT_TYPE_NAMED, PXVFS_NODE_STATS_MAX_NUM, KSTAT_FLAG_PERSISTENT); 3478 3479 if (node_stats != NULL) { 3480 kstat_named_init(&(KSTAT_NAMED_PTR(node_stats) 3481 [PXVFS_NODE_STATS_NUM_OPEN_FILES]), "Open Files", 3482 KSTAT_DATA_UINT32); 3483 PXFS_KSTATS(node_stats, ((KSTAT_NAMED_PTR(node_stats)) 3484 [PXVFS_NODE_STATS_NUM_OPEN_FILES].value.ui32 = 0)); 3485 3486 kstat_install(node_stats); 3487 } 3488 #else 3489 node_stats = NULL; 3490 #endif /* PXFS_KSTATS_ENABLED */ 3491 3492 // Register pxfs purge method with Memory Monitor 3493 monitor::the().subscribe(memory_callback); 3494 3495 int error; 3496 3497 // Start the throttle monitoring thread. 3498 if ((error = launch_throttle_monitor_thread()) != 0) { 3499 char nodename[32]; 3500 3501 (void) sprintf(nodename, "Node (%u)", orb_conf::node_number()); 3502 os::sc_syslog_msg msg(SC_SYSLOG_FILESYSTEM_TAG, nodename, NULL); 3503 // 3504 // SCMSGS 3505 // @explanation 3506 // Thread to support pxfs throttling could not be launched. 3507 // @user_action 3508 // Check if the node is short on resources. 3509 // 3510 (void) msg.log(SC_SYSLOG_WARNING, MESSAGE, 3511 "pxvfs:startup() Failed to create throttle monitoring" 3512 " thread.\n"); 3513 return (error); 3514 } 3515 3516 return (0); 3517 } 3518 3519 int 3520 pxvfs::shutdown() 3521 { 3522 // Deregister pxfs purge method with Memory Monitor 3523 monitor::the().unsubscribe(memory_callback); 3524 3525 pxvfs_inactive_threadpool::shutdown(); 3526 3527 delete [] pxfobj_hash; 3528 return (0); 3529 } 3530 3531 bool 3532 pxvfs::is_unmounted() 3533 { 3534 if (flags & PXFS_FORCE_UNMOUNTING) { 3535 return (true); 3536 } else { 3537 return (false); 3538 } 3539 } 3540 3541 // 3542 // This routine tries to allocate blocks from the local cache 3543 // ie. blocks_available. 
3544 // If there are not enough blocks to satisfy the current request, it contacts 3545 // the server (get_reservatio()) to get more blocks. 3546 // If the server decides to switch to REDZONE we block here waiting for 3547 // the switch to complete and we return a '0' to indicate the switch. 3548 // If the get_reservation() finds more blocks we do the reservation 3549 // and returns the number of blocks allocated. 3550 // 3551 uint64_t 3552 pxvfs::reserve_blocks(PXFS_VER::blkcnt_t want, bool no_redzone_wait) 3553 { 3554 PXFS_VER::blkcnt_t refill_block_count = 0; 3555 PXFS_VER::server_status_t status; 3556 Environment env; 3557 3558 for (;;) { 3559 blocks_reservation.lock(); 3560 3561 // 3562 // If there is an invocation or redzone switch in progress 3563 // wait for the invocation or switch to complete. 3564 // 3565 while (pxvfs_status == PXFS_VER::SWITCH_TO_REDZONE || 3566 blk_reserve_invo_in_progress) { 3567 // 3568 // This was a call for pre-reservation, must 3569 // not wait for revoke allocations to complete. 3570 // 3571 if (no_redzone_wait) { 3572 blocks_reservation.unlock(); 3573 return (0); 3574 } 3575 blocks_reservation_cv.wait(&blocks_reservation); 3576 } 3577 if (pxvfs_status == PXFS_VER::REDZONE) { 3578 blocks_reservation.unlock(); 3579 return (0); 3580 } 3581 ASSERT(pxvfs_status == PXFS_VER::GREENZONE); 3582 3583 if (want < blocks_available) { 3584 blocks_available = blocks_available - want; 3585 blocks_reservation.unlock(); 3586 return (want); 3587 } 3588 3589 // 3590 // Clear available blocks and set flag to show block 3591 // reservation invocation is in progress. 3592 // 3593 blocks_available = 0; 3594 blk_reserve_invo_in_progress = true; 3595 3596 PXFS_DBPRINTF(PXFS_TRACE_PXVFS, 3597 PXFS_AMBER, 3598 ("pxvfs:reserve_blocks(%p) Calling get_reservation()\n", 3599 this)); 3600 3601 // 3602 // Get more reservation from server to fulfill the reuqest. 3603 // 3604 // Do not hold locks across invocations unless necessary. 
We 3605 // must not hold blocks_reservation lock while waiting for 3606 // server. Any thread needing disk blocks will block until the 3607 // invocation completes and this thread broadcasts a wakeup. 3608 // 3609 blocks_reservation.unlock(); 3610 get_fsobj()->get_reservation(refill_block_count, status, env); 3611 blocks_reservation.lock(); 3612 3613 env.clear(); 3614 3615 ASSERT(refill_block_count >= 0); 3616 3617 // 3618 // Set local block reservation to what the server gave 3619 // us and clear invocation active flag. 3620 // 3621 blocks_available = refill_block_count; 3622 blk_reserve_invo_in_progress = false; 3623 3624 if (refill_block_count > 0) { 3625 // 3626 // Wake up any threads that came in while we were 3627 // waiting for server to allocate us blocks. 3628 // 3629 blocks_reservation_cv.broadcast(); 3630 blocks_reservation.unlock(); 3631 } else { 3632 // 3633 // Server is not able to give away reservation which 3634 // means server either has switched to REDZONE or 3635 // is in the process of switching to REDZONE. 3636 // 3637 pxvfs_status = status; 3638 blocks_reservation.unlock(); 3639 PXFS_DBPRINTF(PXFS_TRACE_PXVFS, 3640 PXFS_AMBER, 3641 ("pxvfs:reserve_blocks(%p) pxvfs_status %d\n", 3642 this, status)); 3643 } 3644 } 3645 } 3646 3647 void 3648 pxvfs::set_server_status(PXFS_VER::server_status_t status) 3649 { 3650 blocks_reservation.lock(); 3651 pxvfs_status = status; 3652 blocks_reservation_cv.broadcast(); 3653 blocks_reservation.unlock(); 3654 } 3655 3656 // 3657 // Check if there is enough bandwidth to accomodate given bytes. If 3658 // not, wait for throttle update routine to signal us whenever per 3659 // second quota is updated. 3660 // 3661 int 3662 pxvfs::wait_for_bandwidth(int bytes_needed, int &bytes_allocated) 3663 { 3664 monitor::system_state_t sys_state; 3665 3666 sys_state = monitor::the().get_current_state(); 3667 3668 // Round up to minimum allowed bandwidth allocation. 
3669 bytes_needed = MAX(bytes_needed, bandwidth_chunk); 3670 3671 bandwidth_lock.lock(); 3672 3673 while (bytes_needed > bytes_in_window) { 3674 3675 // 3676 // If there is plenty of memory, we allow over-run of the 3677 // calculated bandwidth once without waiting. The next 3678 // throughput updation will correct this if the server was 3679 // loaded. 3680 // 3681 if (sys_state == monitor::MEMORY_PLENTY && 3682 bytes_in_window != 0) { 3683 break; 3684 } 3685 3686 // 3687 // Wait for more bandwidth to be available 3688 // 3689 if (!bandwidth_cv.wait_sig(&bandwidth_lock)) { 3690 bandwidth_lock.unlock(); 3691 return (EINTR); 3692 } 3693 // 3694 // If this write is bigger than current bandwidth no sense 3695 // in waiting for more. If we are under memory pressure 3696 // don't proceed until the pressure eases off. 3697 // 3698 if (bytes_needed > data_rate && 3699 monitor::the().get_current_state() != 3700 monitor::MEMORY_STARVED) { 3701 break; 3702 } 3703 } 3704 3705 if (bytes_needed > bytes_in_window) { 3706 bytes_in_window = 0; 3707 } else { 3708 bytes_in_window -= bytes_needed; 3709 } 3710 3711 bytes_written_in_second += bytes_needed; 3712 bandwidth_lock.unlock(); 3713 3714 bytes_allocated = bytes_needed; 3715 return (0); 3716 } 3717