1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the License). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/CDDL.txt 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/CDDL.txt. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets [] replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Portions of this source code were derived from Berkeley 4.3 BSD 32 * under license from the Regents of the University of California. 33 */ 34 35 #pragma ident "@(#)ufs_dependent_impl.cc 1.23 08/05/20 SMI" 36 37 #include <sys/file.h> 38 #include <sys/fcntl.h> 39 #include <sys/filio.h> 40 #include <sys/lockfs.h> 41 #include <sys/mntent.h> 42 #include <sys/vnode.h> 43 #include <sys/fs/ufs_filio.h> 44 45 #include <sys/sol_version.h> 46 #if SOL_VERSION >= __s10 47 #define _LEAST_PRIVS 48 #endif 49 #if defined(_LEAST_PRIVS) 50 #include <sys/policy.h> 51 #endif 52 53 #include <orb/infrastructure/orb_conf.h> 54 55 #include "../version.h" 56 #include <pxfs/lib/pxfs_debug.h> 57 #include <pxfs/server/ufs_dependent_impl.h> 58 #include <pxfs/server/fobj_impl.h> 59 #include <pxfs/server/io_impl.h> 60 #include <pxfs/server/fs_impl.h> 61 #include <pxfs/server/file_impl.h> 62 #include <pxfs/server/fobj_trans_states.h> 63 64 //lint -e1512 65 // 66 // Warning(1512) destructor for base class is not virtual -- In a 67 // final pass through all the classes, we have found a class that is 68 // the base class of a derivation and has a destructor but the 69 // destructor is not virtual. It is conventional for inherited classes 70 // to have virtual destructors so that is it safe to 'delete' a 71 // pointer to a base class. 72 // 73 // Ths classes prov_common_iter and prov_common_setin the file 74 // prov_common.h have to be changed to have virtual destructors. 75 // 76 77 //lint -e666 78 // PXFS do an extensive user of the inline function get_vp() within 79 // the vnode macros: 80 // error = VOP_LOOKUP(get_vp(), ... ); 81 // There are no side effects to calling get_vp() repeatedly, flexelint 82 // does not know that, but we do. 83 // 84 85 // Constructor. 86 ufs_dependent_impl::ufs_dependent_impl(fs_ii *fsp, const char *mntoptions) 87 { 88 _fsp = fsp; 89 90 _lockfs_info.lf_comment = NULL; 91 _locking_on = false; 92 93 ufs_dependent_impl::set_mntopts(mntoptions); 94 95 last_sync_time = os::gethrtime(); 96 } 97 98 // Virtual destructor. 99 ufs_dependent_impl::~ufs_dependent_impl() 100 { 101 // Free up lockfs comment, if any. 102 delete [] _lockfs_info.lf_comment; 103 } //lint !e1540 pointers are neither freed nor zero'ed by destructor 104 105 // 106 // Called by the constructor, or when the FS is mounted using new mount 107 // options (via the MS_REMOUNT flag to mount). 108 // 109 void 110 ufs_dependent_impl::set_mntopts(const char *mntoptions) 111 { 112 _forcedirectio_on = false; 113 _syncdir_on = false; 114 _nocto_on = false; 115 116 if (mntoptions == NULL) { 117 return; 118 } 119 120 // 121 // Look for the "forcedirectio" option. If this option is true, 122 // then directio mode is always enabled for open files in this 123 // filesystem. We flag the mntoption using the '_forcedirectio_on' 124 // variable, and then turn off caching by setting 'cachedata' to 125 // false in new_fobj(), which is called every time a new fobj is 126 // created for this FS. 127 // 128 if (pxfslib::exists_mntopt(mntoptions, MNTOPT_FORCEDIRECTIO, false)) { 129 // 130 // Found the "forcedirectio" mount option. Set the boolean 131 // variable for use in 'new_fobj'. 132 // 133 PXFS_DBPRINTF( 134 PXFS_TRACE_UFS, 135 PXFS_GREEN, 136 ("FS %x mounted with forcedirectio\n", _fsp)); 137 _forcedirectio_on = true; 138 } 139 140 if (pxfslib::exists_mntopt(mntoptions, MNTOPT_SYNCDIR, false)) { 141 _syncdir_on = true; 142 } 143 144 if (pxfslib::exists_mntopt(mntoptions, MNTOPT_NOCTO, true)) { 145 _nocto_on = true; 146 } 147 } 148 149 // 150 // Function called by the fs_impl object right after it creates a 151 // new fobj object. This function is meant to set fs-specific 152 // parameters for the fobj - in UFS's case, it would set the 'cachedata' 153 // flag based on whether the UFS mount was done with the "forcedirectio" 154 // option turned on. 155 // 156 void 157 ufs_dependent_impl::new_fobj(fobj_ii *fobjp) 158 { 159 ASSERT(_fsp == fobjp->get_fsp()); 160 161 if (_forcedirectio_on) { 162 PXFS_DBPRINTF( 163 PXFS_TRACE_UFS, 164 PXFS_GREEN, 165 ("Setting cachedata to false for %x\n", fobjp)); 166 // 167 // This UFS filesystem is mounted with the "forcedirectio" 168 // mount option. Take the directio path by default. 169 // 170 if (fobjp->get_ftype() == PXFS_VER::fobj_file) { 171 // 172 // Only file objects care about directio 173 // 174 file_ii *filep = (file_ii *)fobjp; 175 filep->init_cachedata_flag(false); 176 } 177 } 178 } 179 180 // 181 // Called from page_out and async_page_out. Retry VOP_ALLOC_DATA to ensure 182 // there's space on disk before writing data out. We should really do this 183 // only for files that are open across a failover. However, we don't have 184 // enough state today to know which files were open across a failover. 185 // 186 int 187 ufs_dependent_impl::fs_preprocess(vnode_t *vp, u_offset_t offset, size_t *len, 188 fdbuffer_t *fdb, int flags, cred_t *credp) 189 { 190 if (!_syncdir_on) { 191 int error; 192 PXFS_DBPRINTF( 193 PXFS_TRACE_UFS, 194 PXFS_GREEN, 195 ("fs_alloc_data(fs_preprocess): vp %p off %llx len %lx\n", 196 vp, offset, *len)); 197 error = fs_alloc_data(vp, offset, len, fdb, flags, credp); 198 if (error) { 199 return (error); 200 } 201 // 202 // We need to sync out UFS's log to disk to prevent bug 203 // 4362944. The bug occurs because a pageout can complete, 204 // but the inode may not have been updated with block 205 // allocation information when using pxfs without syncdir. 206 // So after a failover, we have a "holy" file although pages 207 // have been written to disk. By flushing UFS's log to disk, 208 // we ensure this file's inode has been updated on disk. 209 // 210 error = sync_if_necessary(os::gethrtime(), vp, credp); 211 return (error); 212 } else { 213 return (0); 214 } 215 } 216 217 // 218 // Function called by fobj_ii::cascaded_ioctl() to process ufs-specific ioctls. 219 // Returns true if the ioctl was processed in this function, and false 220 // if not. 221 // 222 // Keep in mind, that due to 4408967 (switchover/failover of locked filesystem 223 // hangs), we now do our _freeze_in_progress() check here, on a case by case 224 // basis (vs. how it was done in the past, were we assumed all ioctls were 225 // cascaded. 226 // 227 // Cascaded ioctls (where we depend on another invocation -- an example would be 228 // the directio case, where we need to flush everyone's cache) need this check 229 // to prevent deadlock. 230 // 231 // Others, like the lockfs ioctl (lockfs -u, in particular, was the motivation 232 // behind 4408967), are not cascaded and need to make it through to the 233 // underlying UFS filesystem so we can unblock freeze_primary. 234 // 235 bool 236 ufs_dependent_impl::process_cascaded_ioctl(sol::nodeid_t, 237 fobj_ii *fobjp, int32_t iocmd, sol::intptr_t arg, int32_t flag, 238 cred_t *crp, int *result, int &error, Environment &env) 239 { 240 struct lockfs fs_lockfs; 241 242 ASSERT(_fsp == fobjp->get_fsp()); 243 244 fs_lockfs.lf_comment = NULL; 245 246 switch (iocmd) { 247 case _FIOSATIME: 248 error = ioctl_fiosatime(fobjp, arg, flag, result, 249 crp, env); 250 return (true); 251 252 case _FIOLFS: 253 // Special handling is required for _FIOLFS only if PXFS is HA 254 if (_fsp->is_replicated()) { 255 if (!((fs_repl_impl *)_fsp)->_freeze_in_progress( 256 &env)) { 257 258 // We only sync when it is a write lock 259 #if defined(_LEAST_PRIVS) 260 error = get_lockfs_user_params(arg, flag, 261 crp, fs_lockfs, fobjp->get_vp()); 262 #else 263 error = get_lockfs_user_params(arg, flag, 264 crp, fs_lockfs); 265 #endif 266 if (error) { 267 if (fs_lockfs.lf_comment != NULL) { 268 delete [] fs_lockfs.lf_comment; 269 } 270 return (true); 271 } 272 273 if (LOCKFS_WLOCK == fs_lockfs.lf_lock) { 274 _fsp->sync_fs(crp); 275 #if SOL_VERSION >= __s11 276 error = VOP_IOCTL(fobjp->get_vp(), 277 _FIOFFS, arg, flag, crp, result, 278 NULL); 279 #else 280 error = VOP_IOCTL(fobjp->get_vp(), 281 _FIOFFS, arg, flag, crp, result); 282 #endif 283 } 284 } 285 env.clear(); 286 287 error = ioctl_fiolfs(fobjp, arg, flag, result, crp, 288 env); 289 return (true); 290 } 291 break; 292 293 case _FIOFFS: 294 if (_fsp->is_replicated() && 295 ((fs_repl_impl *)_fsp)->_freeze_in_progress(&env)) { 296 return (true); 297 } 298 _fsp->sync_fs(crp); 299 #if SOL_VERSION >= __s11 300 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp, 301 result, NULL); 302 #else 303 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp, 304 result); 305 #endif 306 return (true); 307 308 case _FIOSDIO: 309 if (_fsp->is_replicated() && 310 ((fs_repl_impl *)_fsp)->_freeze_in_progress(&env)) { 311 return (true); 312 } 313 _fsp->sync_fs(crp); 314 #if SOL_VERSION >= __s11 315 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp, 316 result, NULL); 317 #else 318 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp, 319 result); 320 #endif 321 return (true); 322 323 case _FIODIRECTIO: 324 if (_fsp->is_replicated() && 325 ((fs_repl_impl *)_fsp)->_freeze_in_progress(&env)) { 326 return (true); 327 } 328 fobjp->range_lock(); 329 FAULTPT_PXFS(FAULTNUM_PXFS_FIODIRECTIO_S_B, 330 FaultFunctions::generic); 331 #if SOL_VERSION >= __s11 332 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp, 333 result, NULL); 334 #else 335 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp, 336 result); 337 #endif 338 if (error == 0) { 339 ASSERT(arg == DIRECTIO_ON || arg == DIRECTIO_OFF); 340 341 if (_forcedirectio_on && arg == DIRECTIO_OFF) { 342 // 343 // If this FS was mounted with "forcedirectio", 344 // and some app. is trying to turn off direct 345 // io, ignore that call. This is what UFS 346 // does. 347 // 348 } else { 349 if (fobjp->get_ftype() == PXFS_VER::fobj_file) { 350 // 351 // Only file objects care about directio 352 // 353 file_ii *filep = (file_ii *)fobjp; 354 filep->dio_writes.wrlock(); 355 error = filep->set_cachedata_flag( 356 arg == DIRECTIO_OFF, env); 357 filep->dio_writes.unlock(); 358 } 359 } 360 } 361 FAULTPT_PXFS(FAULTNUM_PXFS_FIODIRECTIO_S_A, 362 FaultFunctions::generic); 363 fobjp->range_unlock(); 364 return (true); 365 366 case _FIOLOGDISABLE: 367 #if SOL_VERSION >= __s11 368 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp, 369 result, NULL); 370 #else 371 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp, 372 result); 373 #endif 374 return (true); 375 376 case _FIOISBUSY: { 377 // 378 // Contract-private interface for Legato. 379 // No ioctl is sent down to the underlying filesystem 380 // because, it will always return failure to this 381 // legato specific ioctl. Instead, pxfs tries to find 382 // out if this file has been opened on pxfs clients. 383 // 384 uint32_t ret_val; 385 #if defined(_LEAST_PRIVS) 386 if (secpolicy_fs_config(crp, (fobjp->get_vp())->v_vfsp) != 0) { 387 #else 388 if (! suser(crp)) { 389 #endif 390 error = EPERM; 391 } else { 392 ret_val = (uint32_t)fobjp->is_it_busy(); 393 if (suword32((int *)arg, ret_val)) { 394 error = EFAULT; 395 } else { 396 error = 0; 397 } 398 } 399 return (true); 400 } 401 402 case _FIOTUNE: 403 // Tune the file system's atrributes. 404 #if SOL_VERSION >= __s11 405 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, 406 flag, crp, result, NULL); 407 #else 408 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, 409 flag, crp, result); 410 #endif 411 FAULTPT_PXFS(FAULTNUM_PXFS_FIOTUNE_S, FaultFunctions::generic); 412 if ((error == 0) && fobjp->is_replicated()) { 413 // 414 // If the ioctl was successful, sync metadata 415 // so that the in-memory version of the superblock 416 // is committed to disk. 417 // 418 #if SOL_VERSION >= __s11 419 error = VOP_FSYNC(fobjp->get_vp(), FNODSYNC, crp, NULL); 420 #else 421 error = VOP_FSYNC(fobjp->get_vp(), FNODSYNC, crp); 422 #endif 423 } 424 return (true); 425 default: 426 break; 427 } 428 429 // The ioctl was not processed in a fs-specific manner. 430 return (false); 431 } 432 433 void 434 ufs_dependent_impl::replay_ioctl(fobj_ii *fobjp, int32_t iocmd, 435 sol::intptr_t arg, int32_t flag, int32_t &result, int &error) 436 { 437 // 438 // Set the copy_args 'pid' value to '0', to indicate that the 439 // kernel is performing the ioctl (see orb/copy.cc). 440 // 441 copy_args args(orb_conf::node_number(), 0); 442 copy::setcontext(&args); 443 444 switch (iocmd) { 445 case _FIODIRECTIO: 446 PXFS_DBPRINTF( 447 PXFS_TRACE_UFS, 448 PXFS_GREEN, 449 ("replaying directio ioctl\n")); 450 #if SOL_VERSION >= __s11 451 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, kcred, 452 &result, NULL); 453 #else 454 error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, kcred, 455 &result); 456 #endif 457 if (error) { 458 PXFS_DBPRINTF( 459 PXFS_TRACE_UFS, 460 PXFS_RED, 461 ("error replaying directio ioctl %d\n", error)); 462 } 463 return; 464 465 default: 466 break; 467 } 468 469 // Trap future references 470 copy::setcontext(NULL); 471 } 472 473 // 474 // Function called by fs_ii::convert_to_primary() to do any fs-specific 475 // conversions. 476 // 477 int 478 ufs_dependent_impl::convert_to_primary(fs_ii *fsp) 479 { 480 int error = 0; 481 482 // Replay the filesystem lock, if any existed. 483 if (_locking_on) { 484 // Make a local copy. 485 struct lockfs lkfs; 486 lkfs = _lockfs_info; 487 488 PXFS_DBPRINTF( 489 PXFS_TRACE_UFS, 490 PXFS_GREEN, 491 ("ufs_dependent_impl::convert_to_primary: " 492 "lkfs = %p", &lkfs)); 493 error = fs_dependent_impl::kernel_ioctl(fsp->get_vfsp(), 494 _FIOLFS, (intptr_t)&lkfs); 495 if (error != 0) { 496 // 497 // This is fatal because a locked filesystem is now 498 // unlocked after a *transparent* failure or switchover. 499 // We should not become the primary. 500 // 501 PXFS_DBPRINTF( 502 PXFS_TRACE_UFS, 503 PXFS_RED, 504 ("ufs_dependent_impl::" 505 "convert_to_primary: replaying of lock failed " 506 "- error %d\n", error)); 507 return (error); 508 } 509 } 510 fiolog_t arg; 511 512 arg.nbytes_requested = 0; 513 arg.nbytes_actual = 0; 514 arg.error = FIOLOG_ENONE; 515 516 if (!(fsp->device_is_lofi())) { 517 error = fs_dependent_impl::kernel_ioctl(fsp->get_vfsp(), 518 _FIOLOGENABLE, (intptr_t)&arg); 519 520 if (error != 0) { 521 PXFS_DBPRINTF( 522 PXFS_TRACE_UFS, 523 PXFS_RED, 524 ("ufs_dependent_impl::" 525 "convert_to_primary: logging ioctl" 526 "failed - error %d\n", error)); 527 return (error); 528 } 529 } 530 531 // 532 // Force the file system information back to the server 533 // during recovery mode. 534 // 535 common_threadpool::the().defer_processing(new fs_recovery_task( 536 fsp, fsp->get_server_incn())); 537 538 return (0); 539 } 540 541 void 542 ufs_dependent_impl::freeze_primary(const char *fs_name) 543 { 544 // 545 // Try to grab the _lockfs_lock mutex. This mutex could be protecting 546 // the _locking_on boolean. 547 // 548 _lockfs_lock.lock(); 549 while (_locking_on) { 550 char nodename[32]; 551 552 (void) sprintf(nodename, "Node (%u)", orb_conf::node_number()); 553 554 os::sc_syslog_msg msg(SC_SYSLOG_FILESYSTEM_TAG, 555 nodename, NULL); 556 // 557 // SCMSGS 558 // @explanation 559 // The file system has been locked with the _FIOLFS ioctl. It 560 // is necessary to perform an unlock _FIOLFS ioctl. The 561 // growfs(1M) or lockfs(1M) command may be responsible for 562 // this lock. 563 // @user_action 564 // An _FIOLFS LOCKFS_ULOCK ioctl is required to unlock the 565 // file system. 566 // 567 (void) msg.log(SC_SYSLOG_WARNING, MESSAGE, 568 "Filesystem (%s) is locked and cannot be frozen", 569 fs_name); 570 _fiolfs_cv.wait(&_lockfs_lock); 571 } 572 _lockfs_lock.unlock(); 573 } 574 575 // 576 // Helper function for dumping state to a new secondary. 577 // 578 void 579 ufs_dependent_impl::dump_state(REPL_PXFS_VER::fs_replica_ptr ckptp, 580 Environment &env) 581 { 582 if (_locking_on) { 583 ckptp->ckpt_lockfs_info((uint64_t)_lockfs_info.lf_lock, 584 (uint64_t)_lockfs_info.lf_flags, 585 (uint64_t)_lockfs_info.lf_key, 586 _lockfs_info.lf_comment, env); 587 env.clear(); 588 } 589 } 590 591 // 592 // ioctl_fiosatime - Helper function to perform the '_FIOSATIME' ufs ioctl. 593 // The UFS _FIOSATIME ioctl sets the access time of the file it is performed 594 // on. We need to downgrade the attribute caches before letting the ioctl 595 // through to UFS. Also, since UFS only does the update in-memory, PXFS needs 596 // to perform a sync on the file if the ioctl completes successfully. 597 // 598 sol::error_t 599 ufs_dependent_impl::ioctl_fiosatime(fobj_ii *fobjp, sol::intptr_t arg, 600 int32_t flag, int *ret_val, cred_t *crp, Environment &) 601 { 602 int error; 603 vnode_t *vp = fobjp->get_vp(); 604 605 // 606 // The _FIOSATIME sets the access time attribute of the vnode. Before 607 // we issue the ioctl, we need to downgrade the attribute caches just 608 // like in fobj_ii::set_attributes() 609 // 610 fobjp->attr_lock.wrlock(); 611 612 error = fobjp->downgrade_attr_all(PXFS_VER::attr_write, false, 0); 613 if (error != 0) { 614 fobjp->attr_lock.unlock(); 615 return (error); 616 } 617 618 // Issue the ioctl. 619 #if SOL_VERSION >= __s11 620 error = VOP_IOCTL(vp, _FIOSATIME, arg, flag, crp, ret_val, NULL); 621 #else 622 error = VOP_IOCTL(vp, _FIOSATIME, arg, flag, crp, ret_val); 623 #endif 624 625 if ((error == 0) && fobjp->is_replicated()) { 626 // 627 // If the ioctl was successful, write out the updated value to 628 // disk so that if there is a failover/switchover, the 629 // attribute value will remain consistent. 630 // XXX This is currently needed because UFS logging writes 631 // the log asynchronously. 632 // XXX is FDSYNC really correct ? 633 // 634 error = do_fsync(vp, crp); 635 } 636 637 fobjp->attr_lock.unlock(); 638 return (error); 639 } 640 641 // 642 // Helper function to perform the '_FIOLFS' ufs ioctl. 643 // The UFS _FIOLFS ioctl performs file-system locking. If the fs being locked 644 // is a HA fs, then PXFS needs to keep track of the current locking state of the 645 // fs, and further needs to replay this lock when a failover/switchover of the 646 // filesystem happens. This implies that we need to checkpoint the fs locking 647 // error over to the PXFS secondaries - this is done in this routine and 648 // also when a new secondary is added to this service. 649 // 650 sol::error_t 651 ufs_dependent_impl::ioctl_fiolfs(fobj_ii *fobjp, sol::intptr_t arg, 652 int32_t flag, int *ret_val, cred_t *crp, Environment &env) 653 { 654 int error; 655 struct lockfs fs_lockfs; 656 fobj_lockfs_state *state_obj; 657 int state; 658 sol::error_t err; 659 vnode_t *vp = fobjp->get_vp(); 660 661 ASSERT(fobjp->get_fsp() == _fsp); 662 ASSERT(fobjp->is_replicated()); 663 664 if ((state_obj = fobj_lockfs_state::retry(env)) != NULL) { 665 // 666 // This call is a retry. Get the stored data from the state 667 // object. Note that it is possible to reconstruct 'fs_lockfs' 668 // from the arguments to ioctl_fiolfs, but that is slower than 669 // just copying it out of the state object. 670 // 671 state_obj->get_args(&fs_lockfs, &state); 672 673 switch (state) { 674 case fobj_lockfs_state::INITIAL: 675 // Continue with the processing. 676 break; 677 678 case fobj_lockfs_state::COMMITED: 679 // 680 // The ioctl has already been replayed - we need to 681 // return success. But (sigh!) it's not as easy as 682 // that - we also need to copy out the appropriate 683 // value of lf_key to the user. 684 // 'lf_key' is a value stored by UFS associated 685 // with the current lock - UFS uses it to provide some 686 // protection against multiple threads doing locking on 687 // a filesystem without knowing about each other. We, 688 // of course, need to support this so that the lockfs 689 // protocol is truly transparent in a HA PXFS 690 // filesystem. 691 // 692 err = set_lockfs_user_params(arg, flag, 693 (uint64_t)_lockfs_info.lf_key); 694 return (err); 695 696 case fobj_lockfs_state::CANCELLED: 697 // 698 // The ioctl has completed before - just return the 699 // error code. 700 // 701 return (state_obj->get_error()); 702 703 default: 704 break; 705 } 706 } 707 708 // 709 // Copy the lockfs structure in from user space. 710 // Note that the get_lockfs_user_params() call returns an 711 // allocated buffer in fs_lockfs.lf_comment - this memory is 712 // freed before returning from this routine if there is an 713 // error. If there is no error, the fs_lockfs parameter 714 // (along with the allocated memory) is stored in the fs_ii 715 // object by the store_lockfs_params() call below, and 716 // subsequently freed by the fs object. 717 // 718 #if defined(_LEAST_PRIVS) 719 err = get_lockfs_user_params(arg, flag, crp, fs_lockfs, 720 fobjp->get_vp()); 721 #else 722 err = get_lockfs_user_params(arg, flag, crp, fs_lockfs); 723 #endif 724 if (err != 0) { 725 return (err); 726 } 727 728 // 729 // this is for 4413957 -- because we call VOP_RENAME() instead of 730 // VOP_REMOVE() and can't easily check for UFS delete lock enabled 731 // 732 if (LOCKFS_DLOCK == fs_lockfs.lf_lock) { 733 return (ENOTSUP); 734 } 735 736 // 737 // Checkpoint all the parameters over to the secondaries. The 738 // secondaries create a transaction object to store all the 739 // parameters and then the transaction object waits for the 740 // commit checkpoint to store the parameters with the fs 741 // secondaries. 742 // 743 fobjp->get_ckpt()->ckpt_lockfs_start((uint64_t)fs_lockfs.lf_lock, 744 (uint64_t)fs_lockfs.lf_flags, (uint64_t)fs_lockfs.lf_key, 745 fs_lockfs.lf_comment, env); 746 env.clear(); 747 748 // 749 // We need to serialize the lockfs ioctls on a per-filesystem basis 750 // so that we know which lock is curently in effect on the underlying 751 // filesystem. This guards for the race-condition if two lockfs 752 // calls get into UFS and the last one out of UFS is not the last one 753 // checkpointed across to the secondaries. 754 // 755 _lockfs_lock.lock(); 756 757 #if SOL_VERSION >= __s11 758 error = VOP_IOCTL(vp, _FIOLFS, arg, flag, crp, ret_val, NULL); 759 #else 760 error = VOP_IOCTL(vp, _FIOLFS, arg, flag, crp, ret_val); 761 #endif 762 763 if (error == 0) { 764 // 765 // Success. We must now store the parameters we used to 766 // lock the filesystem on the primary, and also commit this 767 // transaction on the secondaries. We do this to enable PXFS 768 // to replay the ioctl during a switchover/failover 769 // 770 // there are 2 cases: lockfs -u and everyone else 771 // 772 if (LOCKFS_ULOCK == fs_lockfs.lf_lock) { 773 _locking_on = false; 774 } else { 775 _locking_on = true; 776 } 777 store_lockfs_params(fs_lockfs); 778 fobjp->commit(env); 779 env.clear(); 780 _fiolfs_cv.broadcast(); 781 } else { 782 // 783 // The ioctl failed. Cancel the transaction on the secondaries 784 // Note that we are sending the error code over - this is 785 // returned to the user if there is a retry of this call. 786 // 787 fobjp->get_ckpt()->ckpt_lockfs_failure(error, env); 788 env.clear(); 789 delete [] fs_lockfs.lf_comment; 790 } 791 792 _lockfs_lock.unlock(); 793 794 return (error); 795 } 796 797 // 798 // Helper function to copy the user parameters to the _FIOLFS ioctl into kernel 799 // space. 800 // 801 #if defined(_LEAST_PRIVS) 802 sol::error_t 803 ufs_dependent_impl::get_lockfs_user_params(sol::intptr_t uarg, int32_t flag, 804 cred_t *crp, struct lockfs &lkfs, vnode_t *vp) 805 #else 806 sol::error_t 807 ufs_dependent_impl::get_lockfs_user_params(sol::intptr_t uarg, int32_t flag, 808 cred_t *crp, struct lockfs &lkfs) 809 #endif 810 { 811 char *comment; 812 813 // 814 // NOTE: This code is adapted from the ufs code that handles the 815 // '_FIOLFS' ioctl in ufs_vnops.c. If that code changes in any way, 816 // this should change with it. 817 // 818 #if defined(_LEAST_PRIVS) 819 if (secpolicy_fs_config(crp, vp->v_vfsp) != 0) { 820 #else 821 if (! suser(crp)) { 822 #endif 823 return (EPERM); 824 } 825 826 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 827 if (copyin((caddr_t)uarg, &lkfs, sizeof (struct lockfs))) { 828 return (EFAULT); 829 } 830 } 831 #ifdef _SYSCALL32_IMPL 832 else { 833 struct lockfs32 lkfs32; 834 /* Translate ILP32 lockfs to LP64 lockfs */ 835 if (copyin((caddr_t)uarg, &lkfs32, sizeof (struct lockfs32))) 836 return (EFAULT); 837 lkfs.lf_lock = (ulong_t)lkfs32.lf_lock; 838 lkfs.lf_flags = (ulong_t)lkfs32.lf_flags; 839 lkfs.lf_key = (ulong_t)lkfs32.lf_key; 840 lkfs.lf_comlen = (ulong_t)lkfs32.lf_comlen; 841 lkfs.lf_comment = (caddr_t)lkfs32.lf_comment; 842 } 843 #endif /* _SYSCALL32_IMPL */ 844 845 if (lkfs.lf_comlen) { 846 if (lkfs.lf_comlen > LOCKFS_MAXCOMMENTLEN) { 847 return (ENAMETOOLONG); 848 } 849 comment = new char[lkfs.lf_comlen]; 850 if (copyin(lkfs.lf_comment, comment, lkfs.lf_comlen)) { 851 delete [] comment; 852 return (EFAULT); 853 } 854 lkfs.lf_comment = comment; 855 } else { 856 lkfs.lf_comment = NULL; 857 } 858 859 return (0); 860 } 861 862 // 863 // Helper function used to copy out the current value of 'lf_key' to the user. 864 // This function only needs to be called if a retry comes in for a committed 865 // transaction. 866 // 867 sol::error_t 868 ufs_dependent_impl::set_lockfs_user_params(sol::intptr_t uarg, int32_t flag, 869 uint64_t lf_key) 870 { 871 // 872 // This function is modelled after ufs handling of the _FIOLFS ioctl. 873 // If you plan on making any changes here, look at ufs first. 874 // 875 if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 876 struct lockfs lkfs; 877 878 // Copy in the arguments. 879 if (copyin((caddr_t)uarg, &lkfs, sizeof (struct lockfs))) { 880 return (EFAULT); 881 } 882 // Set the key value to the current value. 883 lkfs.lf_key = (ulong_t)lf_key; 884 885 // ... and copy out. 886 (void) copyout(&lkfs, (caddr_t)uarg, sizeof (struct lockfs)); 887 } 888 #ifdef _SYSCALL32_IMPL 889 else { 890 // Do the same thing in 32 bits. 891 struct lockfs32 lkfs32; 892 893 // Copy in the arguments. 894 if (copyin((caddr_t)uarg, &lkfs32, sizeof (struct lockfs32))) 895 return (EFAULT); 896 897 // Set the key value to the current value. 898 lkfs32.lf_key = (uint32_t)lf_key; 899 900 // ... and copy out. 901 (void) copyout(&lkfs32, (caddr_t)uarg, 902 sizeof (struct lockfs32)); 903 } 904 #endif /* _SYSCALL32_IMPL */ 905 906 return (0); 907 } 908 909 // 910 // This is where the ckpt_lockfs_state() checkpoint ends up. 911 // We store the fs locking parameters in the '_lockfs_info' member. 912 // 'lockfs_info' is used during failovers/switchovers to replay the locking 913 // ioctl. 914 // 915 void 916 ufs_dependent_impl::ckpt_lockfs_state(uint64_t lf_lock, uint64_t lf_flags, 917 uint64_t lf_key, const char *lf_comment) 918 { 919 ASSERT(_fsp->is_replicated()); 920 ASSERT(_fsp->is_secondary()); 921 922 if (LOCKFS_ULOCK != lf_lock) { 923 PXFS_DBPRINTF( 924 PXFS_TRACE_UFS, 925 PXFS_GREEN, 926 ("locked via ckpt_lockfs_state()\n")); 927 _locking_on = true; 928 } else { 929 PXFS_DBPRINTF( 930 PXFS_TRACE_UFS, 931 PXFS_GREEN, 932 ("unlocked via ckpt_lockfs_state()\n")); 933 _locking_on = false; 934 } 935 936 delete [] _lockfs_info.lf_comment; 937 938 _lockfs_info.lf_lock = (ulong_t)lf_lock; 939 _lockfs_info.lf_flags = (ulong_t)lf_flags; 940 _lockfs_info.lf_key = (ulong_t)lf_key; 941 if (lf_comment == NULL) { 942 _lockfs_info.lf_comlen = 0; 943 _lockfs_info.lf_comment = NULL; 944 } else { 945 _lockfs_info.lf_comlen = strlen(lf_comment) + 1; 946 _lockfs_info.lf_comment = new char[_lockfs_info.lf_comlen]; 947 (void) strcpy(_lockfs_info.lf_comment, lf_comment); 948 } 949 } 950 951 // 952 // Called from fobj_ii::ioctl_fiolfs to set the locking parameters of the 953 // underlying filesystem. 954 // 955 void 956 ufs_dependent_impl::store_lockfs_params(struct lockfs &lkfs) 957 { 958 ASSERT(_lockfs_lock.lock_held()); 959 ASSERT(_fsp->is_replicated()); 960 ASSERT(!_fsp->is_secondary()); 961 962 delete [] _lockfs_info.lf_comment; 963 964 _lockfs_info = lkfs; 965 } 966 967 extern "C" int ufs_alloc_data(vnode_t *vp, u_offset_t offset, size_t *len, 968 fdbuffer_t *fdb, int flags, cred_t *cr); 969 extern "C" int ufs_rdwr_data(vnode_t *vp, u_offset_t offset, size_t len, 970 fdbuffer_t *fdb, int flags, cred_t *cr); 971 972 int 973 ufs_dependent_impl::fs_alloc_data(vnode_t *vp, u_offset_t offset, size_t *len, 974 fdbuffer_t *fdb, int flags, cred_t *cr) 975 { 976 // 977 // If this is a NFS thread, unset the T_DONTPEND flag so that 978 // ufs_alloc_data remains asynchronous. 979 // 980 bool nfs_thread = (bool)(curthread->t_flag & T_DONTPEND); 981 982 if (nfs_thread) { 983 curthread->t_flag &= ~T_DONTPEND; 984 } 985 PXFS_DBPRINTF( 986 PXFS_TRACE_UFS, 987 PXFS_GREEN, 988 ("fs_alloc_data: vp %p off %llx len %lx\n", 989 vp, offset, *len)); 990 int error = ufs_alloc_data(vp, offset, len, fdb, flags, cr); 991 if (nfs_thread) { 992 curthread->t_flag |= T_DONTPEND; 993 } 994 995 return (error); 996 } 997 998 int 999 ufs_dependent_impl::fs_rdwr_data(vnode_t *vp, u_offset_t offset, size_t len, 1000 size_t, fdbuffer_t *fdb, int flags, cred_t *cr) 1001 { 1002 PXFS_DBPRINTF( 1003 PXFS_TRACE_UFS, 1004 PXFS_GREEN, 1005 ("fs_rdwr_data: %s%s vp %p off %llx len %lx\n", 1006 flags & B_ASYNC ? "a" : "", 1007 flags & B_READ ? "read" : "write", 1008 vp, offset, len)); 1009 1010 return (ufs_rdwr_data(vp, offset, len, fdb, flags, cr)); 1011 } 1012 1013 // 1014 // fs_fsync - when "syncdir" option isn't enabled and this is an HA file system, 1015 // we need to flush the UFS log to disk so we don't lose file meta-data 1016 // after a failover. 1017 // 1018 int 1019 ufs_dependent_impl::fs_fsync(vnode_t *vnodep, cred_t *credp) 1020 { 1021 int error = 0; 1022 1023 if (!_fsp->is_replicated() || _syncdir_on) { 1024 // Nothing to do. 1025 return (0); 1026 } 1027 1028 FAULTPT_PXFS(FAULTNUM_PXFS_SYNCDIR, FaultFunctions::generic); 1029 // 1030 // We store the time we started doing the FSYNC and assign it to 1031 // last_sync_time only after VOP_FSYNC completes. This will make 1032 // threads that came in when a VOP_FSYNC was in progress to wait 1033 // until VOP_FSYNC completion. 1034 // 1035 os::hrtime_t tmp_last_sync_time = os::gethrtime(); 1036 1037 // 1038 // ufs_alloc_data() creates async log transactions. ufs_fsync() is 1039 // optimized for NFS threads to skip fsync if the last transaction 1040 // for that thread is already commited. PxFS depends on *all* 1041 // transactions created before ufs_fsync was called being commited. 1042 // The optimization for NFS breaks this guarantee. To work around 1043 // the above problem, we clear T_DONTPEND flag on this thread to 1044 // make this thread look like a non-NFS thread. 1045 // 1046 bool nfs_thread = (bool)(curthread->t_flag & T_DONTPEND); 1047 if (nfs_thread) { 1048 curthread->t_flag &= ~T_DONTPEND; 1049 } 1050 #if SOL_VERSION >= __s11 1051 error = VOP_FSYNC(vnodep, FNODSYNC, credp, NULL); 1052 #else 1053 error = VOP_FSYNC(vnodep, FNODSYNC, credp); 1054 #endif 1055 if (nfs_thread) { 1056 curthread->t_flag |= T_DONTPEND; 1057 } 1058 1059 last_sync_time = tmp_last_sync_time; 1060 1061 return (error); 1062 } 1063 1064 // 1065 // do_fsync - sync all information to disk. 1066 // 1067 int 1068 ufs_dependent_impl::do_fsync(vnode_t *vnodep, cred_t *credp) 1069 { 1070 int error; 1071 sync_lock.wrlock(); 1072 1073 // 1074 // We store the time we started doing VOP_FSYNC and assign it 1075 // to last_sync_time only after VOP_FSYNC completes. This will 1076 // make threads that came in when a VOP_FSYNC was in progress 1077 // to wait until VOP_FSYNC completion. 1078 // 1079 os::hrtime_t tmp_last_sync_time = os::gethrtime(); 1080 1081 // 1082 // If this is a NFS thread, unset the T_DONTPEND flag so that 1083 // the sync operation will not be skipped by optimizations 1084 // for NFS that do not apply to Pxfs. 1085 // 1086 bool nfs_thread = (bool)(curthread->t_flag & T_DONTPEND); 1087 1088 if (nfs_thread) { 1089 curthread->t_flag &= ~T_DONTPEND; 1090 } 1091 #if SOL_VERSION >= __s11 1092 error = VOP_FSYNC(vnodep, FSYNC, credp, NULL); 1093 #else 1094 error = VOP_FSYNC(vnodep, FSYNC, credp); 1095 #endif 1096 if (nfs_thread) { 1097 curthread->t_flag |= T_DONTPEND; 1098 } 1099 1100 last_sync_time = tmp_last_sync_time; 1101 1102 sync_lock.unlock(); 1103 return (error); 1104 } 1105 1106 // 1107 // Sync. out the in-memory log if we don't know that it was pushed to disk 1108 // after 'mod_time'. 1109 // 1110 int 1111 ufs_dependent_impl::sync_if_necessary(os::hrtime_t &mod_time, vnode *vnodep, 1112 cred_t *credp) 1113 { 1114 int error = 0; 1115 1116 if (mod_time >= last_sync_time) { 1117 sync_lock.wrlock(); 1118 if (mod_time >= last_sync_time) { 1119 error = fs_fsync(vnodep, credp); 1120 } 1121 sync_lock.unlock(); 1122 } 1123 1124 return (error); 1125 } 1126