Home | History | Annotate | Download | only in server
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the License).
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/CDDL.txt
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/CDDL.txt.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets [] replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     28 /*	  All Rights Reserved	  */
     29 
     30 /*
     31  * Portions of this source code were derived from Berkeley 4.3 BSD
     32  * under license from the Regents of the University of California.
     33  */
     34 
     35 #pragma ident	"@(#)ufs_dependent_impl.cc	1.23	08/05/20 SMI"
     36 
     37 #include <sys/file.h>
     38 #include <sys/fcntl.h>
     39 #include <sys/filio.h>
     40 #include <sys/lockfs.h>
     41 #include <sys/mntent.h>
     42 #include <sys/vnode.h>
     43 #include <sys/fs/ufs_filio.h>
     44 
     45 #include <sys/sol_version.h>
     46 #if SOL_VERSION >= __s10
     47 #define	_LEAST_PRIVS
     48 #endif
     49 #if defined(_LEAST_PRIVS)
     50 #include <sys/policy.h>
     51 #endif
     52 
     53 #include <orb/infrastructure/orb_conf.h>
     54 
     55 #include "../version.h"
     56 #include <pxfs/lib/pxfs_debug.h>
     57 #include <pxfs/server/ufs_dependent_impl.h>
     58 #include <pxfs/server/fobj_impl.h>
     59 #include <pxfs/server/io_impl.h>
     60 #include <pxfs/server/fs_impl.h>
     61 #include <pxfs/server/file_impl.h>
     62 #include <pxfs/server/fobj_trans_states.h>
     63 
//lint -e1512
//
// Warning(1512) destructor for base class is not virtual -- In a
// final pass through all the classes, we have found a class that is
// the base class of a derivation and has a destructor but the
// destructor is not virtual. It is conventional for inherited classes
// to have virtual destructors so that it is safe to 'delete' a
// pointer to a base class.
//
// The classes prov_common_iter and prov_common_set in the file
// prov_common.h have to be changed to have virtual destructors.
//
     76 
//lint -e666
// PXFS makes extensive use of the inline function get_vp() within
// the vnode macros:
//	error = VOP_LOOKUP(get_vp(), ... );
// There are no side effects to calling get_vp() repeatedly; flexelint
// does not know that, but we do.
//
     84 
     85 // Constructor.
     86 ufs_dependent_impl::ufs_dependent_impl(fs_ii *fsp, const char *mntoptions)
     87 {
     88 	_fsp = fsp;
     89 
     90 	_lockfs_info.lf_comment = NULL;
     91 	_locking_on = false;
     92 
     93 	ufs_dependent_impl::set_mntopts(mntoptions);
     94 
     95 	last_sync_time = os::gethrtime();
     96 }
     97 
     98 // Virtual destructor.
     99 ufs_dependent_impl::~ufs_dependent_impl()
    100 {
    101 	// Free up lockfs comment, if any.
    102 	delete [] _lockfs_info.lf_comment;
    103 } //lint !e1540 pointers are neither freed nor zero'ed by destructor
    104 
    105 //
    106 // Called by the constructor, or when the FS is mounted using new mount
    107 // options (via the MS_REMOUNT flag to mount).
    108 //
    109 void
    110 ufs_dependent_impl::set_mntopts(const char *mntoptions)
    111 {
    112 	_forcedirectio_on = false;
    113 	_syncdir_on = false;
    114 	_nocto_on = false;
    115 
    116 	if (mntoptions == NULL) {
    117 		return;
    118 	}
    119 
    120 	//
    121 	// Look for the "forcedirectio" option.  If this option is true,
    122 	// then directio mode is always enabled for open files in this
    123 	// filesystem.  We flag the mntoption using the '_forcedirectio_on'
    124 	// variable, and then turn off caching by setting 'cachedata' to
    125 	// false in new_fobj(), which is called every time a new fobj is
    126 	// created for this FS.
    127 	//
    128 	if (pxfslib::exists_mntopt(mntoptions, MNTOPT_FORCEDIRECTIO, false)) {
    129 		//
    130 		// Found the "forcedirectio" mount option.  Set the boolean
    131 		// variable for use in 'new_fobj'.
    132 		//
    133 		PXFS_DBPRINTF(
    134 		    PXFS_TRACE_UFS,
    135 		    PXFS_GREEN,
    136 		    ("FS %x mounted with forcedirectio\n", _fsp));
    137 		_forcedirectio_on = true;
    138 	}
    139 
    140 	if (pxfslib::exists_mntopt(mntoptions, MNTOPT_SYNCDIR, false)) {
    141 		_syncdir_on = true;
    142 	}
    143 
    144 	if (pxfslib::exists_mntopt(mntoptions, MNTOPT_NOCTO, true)) {
    145 		_nocto_on = true;
    146 	}
    147 }
    148 
    149 //
    150 // Function called by the fs_impl object right after it creates a
    151 // new fobj object.  This function is meant to set fs-specific
    152 // parameters for the fobj - in UFS's case, it would set the 'cachedata'
    153 // flag based on whether the UFS mount was done with the "forcedirectio"
    154 // option turned on.
    155 //
    156 void
    157 ufs_dependent_impl::new_fobj(fobj_ii *fobjp)
    158 {
    159 	ASSERT(_fsp == fobjp->get_fsp());
    160 
    161 	if (_forcedirectio_on) {
    162 		PXFS_DBPRINTF(
    163 		    PXFS_TRACE_UFS,
    164 		    PXFS_GREEN,
    165 		    ("Setting cachedata to false for %x\n", fobjp));
    166 		//
    167 		// This UFS filesystem is mounted with the "forcedirectio"
    168 		// mount option.  Take the directio path by default.
    169 		//
    170 		if (fobjp->get_ftype() == PXFS_VER::fobj_file) {
    171 			//
    172 			// Only file objects care about directio
    173 			//
    174 			file_ii		*filep = (file_ii *)fobjp;
    175 			filep->init_cachedata_flag(false);
    176 		}
    177 	}
    178 }
    179 
    180 //
    181 // Called from page_out and async_page_out. Retry VOP_ALLOC_DATA to ensure
    182 // there's space on disk before writing data out. We should really do this
    183 // only for files that are open across a failover. However, we don't have
    184 // enough state today to know which files were open across a failover.
    185 //
    186 int
    187 ufs_dependent_impl::fs_preprocess(vnode_t *vp, u_offset_t offset, size_t *len,
    188 	    fdbuffer_t *fdb, int flags, cred_t *credp)
    189 {
    190 	if (!_syncdir_on) {
    191 		int error;
    192 		PXFS_DBPRINTF(
    193 		    PXFS_TRACE_UFS,
    194 		    PXFS_GREEN,
    195 		    ("fs_alloc_data(fs_preprocess): vp %p off %llx len %lx\n",
    196 		    vp, offset, *len));
    197 		error = fs_alloc_data(vp, offset, len, fdb, flags, credp);
    198 		if (error) {
    199 			return (error);
    200 		}
    201 		//
    202 		// We need to sync out UFS's log to disk to prevent bug
    203 		// 4362944. The bug occurs because a pageout can complete,
    204 		// but the inode may not have been updated with block
    205 		// allocation information when using pxfs without syncdir.
    206 		// So after a failover, we have a "holy" file although pages
    207 		// have been written to disk. By flushing UFS's log to disk,
    208 		// we ensure this file's inode has been updated on disk.
    209 		//
    210 		error = sync_if_necessary(os::gethrtime(), vp, credp);
    211 		return (error);
    212 	} else {
    213 		return (0);
    214 	}
    215 }
    216 
    217 //
    218 // Function called by fobj_ii::cascaded_ioctl() to process ufs-specific ioctls.
    219 // Returns true if the ioctl was processed in this function, and false
    220 // if not.
    221 //
    222 // Keep in mind, that due to 4408967 (switchover/failover of locked filesystem
    223 // hangs), we now do our _freeze_in_progress() check here, on a case by case
    224 // basis (vs. how it was done in the past, were we assumed all ioctls were
    225 // cascaded.
    226 //
    227 // Cascaded ioctls (where we depend on another invocation -- an example would be
    228 // the directio case, where we need to flush everyone's cache) need this check
    229 // to prevent deadlock.
    230 //
    231 // Others, like the lockfs ioctl (lockfs -u, in particular, was the motivation
    232 // behind 4408967), are not cascaded and need to make it through to the
    233 // underlying UFS filesystem so we can unblock freeze_primary.
    234 //
    235 bool
    236 ufs_dependent_impl::process_cascaded_ioctl(sol::nodeid_t,
    237     fobj_ii *fobjp, int32_t iocmd, sol::intptr_t arg, int32_t flag,
    238     cred_t *crp, int *result, int &error, Environment &env)
    239 {
    240 	struct lockfs	fs_lockfs;
    241 
    242 	ASSERT(_fsp == fobjp->get_fsp());
    243 
    244 	fs_lockfs.lf_comment = NULL;
    245 
    246 	switch (iocmd) {
    247 	case _FIOSATIME:
    248 		error = ioctl_fiosatime(fobjp, arg, flag, result,
    249 		    crp, env);
    250 		return (true);
    251 
    252 	case _FIOLFS:
    253 		// Special handling is required for _FIOLFS only if PXFS is HA
    254 		if (_fsp->is_replicated()) {
    255 			if (!((fs_repl_impl *)_fsp)->_freeze_in_progress(
    256 			    &env)) {
    257 
    258 				// We only sync when it is a write lock
    259 #if defined(_LEAST_PRIVS)
    260 				error = get_lockfs_user_params(arg, flag,
    261 				    crp, fs_lockfs, fobjp->get_vp());
    262 #else
    263 				error = get_lockfs_user_params(arg, flag,
    264 				    crp, fs_lockfs);
    265 #endif
    266 				if (error) {
    267 					if (fs_lockfs.lf_comment != NULL) {
    268 						delete [] fs_lockfs.lf_comment;
    269 					}
    270 					return (true);
    271 				}
    272 
    273 				if (LOCKFS_WLOCK == fs_lockfs.lf_lock) {
    274 					_fsp->sync_fs(crp);
    275 #if	SOL_VERSION >= __s11
    276 					error = VOP_IOCTL(fobjp->get_vp(),
    277 					    _FIOFFS, arg, flag, crp, result,
    278 					    NULL);
    279 #else
    280 					error = VOP_IOCTL(fobjp->get_vp(),
    281 					    _FIOFFS, arg, flag, crp, result);
    282 #endif
    283 				}
    284 			}
    285 			env.clear();
    286 
    287 			error = ioctl_fiolfs(fobjp, arg, flag, result, crp,
    288 			    env);
    289 			return (true);
    290 		}
    291 		break;
    292 
    293 	case _FIOFFS:
    294 		if (_fsp->is_replicated() &&
    295 		    ((fs_repl_impl *)_fsp)->_freeze_in_progress(&env)) {
    296 				return (true);
    297 		}
    298 		_fsp->sync_fs(crp);
    299 #if	SOL_VERSION >= __s11
    300 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp,
    301 		    result, NULL);
    302 #else
    303 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp,
    304 		    result);
    305 #endif
    306 		return (true);
    307 
    308 	case _FIOSDIO:
    309 		if (_fsp->is_replicated() &&
    310 		    ((fs_repl_impl *)_fsp)->_freeze_in_progress(&env)) {
    311 				return (true);
    312 		}
    313 		_fsp->sync_fs(crp);
    314 #if	SOL_VERSION >= __s11
    315 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp,
    316 		    result, NULL);
    317 #else
    318 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp,
    319 		    result);
    320 #endif
    321 		return (true);
    322 
    323 	case _FIODIRECTIO:
    324 		if (_fsp->is_replicated() &&
    325 		    ((fs_repl_impl *)_fsp)->_freeze_in_progress(&env)) {
    326 				return (true);
    327 		}
    328 		fobjp->range_lock();
    329 		FAULTPT_PXFS(FAULTNUM_PXFS_FIODIRECTIO_S_B,
    330 		    FaultFunctions::generic);
    331 #if	SOL_VERSION >= __s11
    332 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp,
    333 		    result, NULL);
    334 #else
    335 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp,
    336 		    result);
    337 #endif
    338 		if (error == 0) {
    339 			ASSERT(arg == DIRECTIO_ON || arg == DIRECTIO_OFF);
    340 
    341 			if (_forcedirectio_on && arg == DIRECTIO_OFF) {
    342 				//
    343 				// If this FS was mounted with "forcedirectio",
    344 				// and some app. is trying to turn off direct
    345 				// io, ignore that call.  This is what UFS
    346 				// does.
    347 				//
    348 			} else {
    349 				if (fobjp->get_ftype() == PXFS_VER::fobj_file) {
    350 					//
    351 					// Only file objects care about directio
    352 					//
    353 					file_ii	*filep = (file_ii *)fobjp;
    354 					filep->dio_writes.wrlock();
    355 					error = filep->set_cachedata_flag(
    356 					    arg == DIRECTIO_OFF, env);
    357 					filep->dio_writes.unlock();
    358 				}
    359 			}
    360 		}
    361 		FAULTPT_PXFS(FAULTNUM_PXFS_FIODIRECTIO_S_A,
    362 		    FaultFunctions::generic);
    363 		fobjp->range_unlock();
    364 		return (true);
    365 
    366 	case _FIOLOGDISABLE:
    367 #if	SOL_VERSION >= __s11
    368 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp,
    369 		    result, NULL);
    370 #else
    371 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, crp,
    372 		    result);
    373 #endif
    374 		return (true);
    375 
    376 	case _FIOISBUSY: {
    377 		//
    378 		// Contract-private interface for Legato.
    379 		// No ioctl is sent down to the underlying filesystem
    380 		// because, it will always return failure to this
    381 		// legato specific ioctl. Instead, pxfs tries to find
    382 		// out if this file has been opened on pxfs clients.
    383 		//
    384 		uint32_t ret_val;
    385 #if defined(_LEAST_PRIVS)
    386 		if (secpolicy_fs_config(crp, (fobjp->get_vp())->v_vfsp) != 0) {
    387 #else
    388 		if (! suser(crp)) {
    389 #endif
    390 			error = EPERM;
    391 		} else {
    392 			ret_val = (uint32_t)fobjp->is_it_busy();
    393 			if (suword32((int *)arg, ret_val)) {
    394 				error = EFAULT;
    395 			} else {
    396 				error = 0;
    397 			}
    398 		}
    399 		return (true);
    400 	}
    401 
    402 	case _FIOTUNE:
    403 		// Tune the file system's atrributes.
    404 #if	SOL_VERSION >= __s11
    405 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg,
    406 			    flag, crp, result, NULL);
    407 #else
    408 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg,
    409 			    flag, crp, result);
    410 #endif
    411 		FAULTPT_PXFS(FAULTNUM_PXFS_FIOTUNE_S, FaultFunctions::generic);
    412 		if ((error == 0) && fobjp->is_replicated()) {
    413 			//
    414 			// If the ioctl was successful, sync metadata
    415 			// so that the in-memory version of the superblock
    416 			// is committed to disk.
    417 			//
    418 #if	SOL_VERSION >= __s11
    419 			error = VOP_FSYNC(fobjp->get_vp(), FNODSYNC, crp, NULL);
    420 #else
    421 			error = VOP_FSYNC(fobjp->get_vp(), FNODSYNC, crp);
    422 #endif
    423 		}
    424 		return (true);
    425 	default:
    426 		break;
    427 	}
    428 
    429 	// The ioctl was not processed in a fs-specific manner.
    430 	return (false);
    431 }
    432 
    433 void
    434 ufs_dependent_impl::replay_ioctl(fobj_ii *fobjp, int32_t iocmd,
    435     sol::intptr_t arg, int32_t flag, int32_t &result, int &error)
    436 {
    437 	//
    438 	// Set the copy_args 'pid' value to '0', to indicate that the
    439 	// kernel is performing the ioctl (see orb/copy.cc).
    440 	//
    441 	copy_args args(orb_conf::node_number(), 0);
    442 	copy::setcontext(&args);
    443 
    444 	switch (iocmd) {
    445 	case _FIODIRECTIO:
    446 		PXFS_DBPRINTF(
    447 		    PXFS_TRACE_UFS,
    448 		    PXFS_GREEN,
    449 		    ("replaying directio ioctl\n"));
    450 #if	SOL_VERSION >= __s11
    451 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, kcred,
    452 			    &result, NULL);
    453 #else
    454 		error = VOP_IOCTL(fobjp->get_vp(), iocmd, arg, flag, kcred,
    455 			    &result);
    456 #endif
    457 		if (error) {
    458 			PXFS_DBPRINTF(
    459 			    PXFS_TRACE_UFS,
    460 			    PXFS_RED,
    461 			    ("error replaying directio ioctl %d\n", error));
    462 		}
    463 		return;
    464 
    465 	default:
    466 		break;
    467 	}
    468 
    469 	// Trap future references
    470 	copy::setcontext(NULL);
    471 }
    472 
    473 //
    474 // Function called by fs_ii::convert_to_primary() to do any fs-specific
    475 // conversions.
    476 //
    477 int
    478 ufs_dependent_impl::convert_to_primary(fs_ii *fsp)
    479 {
    480 	int error = 0;
    481 
    482 	// Replay the filesystem lock, if any existed.
    483 	if (_locking_on) {
    484 		// Make a local copy.
    485 		struct lockfs lkfs;
    486 		lkfs = _lockfs_info;
    487 
    488 		PXFS_DBPRINTF(
    489 		    PXFS_TRACE_UFS,
    490 		    PXFS_GREEN,
    491 		    ("ufs_dependent_impl::convert_to_primary: "
    492 		    "lkfs = %p", &lkfs));
    493 		error = fs_dependent_impl::kernel_ioctl(fsp->get_vfsp(),
    494 		    _FIOLFS, (intptr_t)&lkfs);
    495 		if (error != 0) {
    496 			//
    497 			// This is fatal because a locked filesystem is now
    498 			// unlocked after a *transparent* failure or switchover.
    499 			// We should not become the primary.
    500 			//
    501 			PXFS_DBPRINTF(
    502 			    PXFS_TRACE_UFS,
    503 			    PXFS_RED,
    504 			    ("ufs_dependent_impl::"
    505 			    "convert_to_primary: replaying of lock failed "
    506 			    "- error %d\n", error));
    507 			return (error);
    508 		}
    509 	}
    510 	fiolog_t arg;
    511 
    512 	arg.nbytes_requested = 0;
    513 	arg.nbytes_actual = 0;
    514 	arg.error = FIOLOG_ENONE;
    515 
    516 	if (!(fsp->device_is_lofi())) {
    517 		error = fs_dependent_impl::kernel_ioctl(fsp->get_vfsp(),
    518 		    _FIOLOGENABLE, (intptr_t)&arg);
    519 
    520 		if (error != 0) {
    521 			PXFS_DBPRINTF(
    522 			    PXFS_TRACE_UFS,
    523 			    PXFS_RED,
    524 			    ("ufs_dependent_impl::"
    525 			    "convert_to_primary: logging ioctl"
    526 			    "failed - error %d\n", error));
    527 			return (error);
    528 		}
    529 	}
    530 
    531 	//
    532 	// Force the file system information back to the server
    533 	// during recovery mode.
    534 	//
    535 	common_threadpool::the().defer_processing(new fs_recovery_task(
    536 	    fsp, fsp->get_server_incn()));
    537 
    538 	return (0);
    539 }
    540 
    541 void
    542 ufs_dependent_impl::freeze_primary(const char *fs_name)
    543 {
    544 	//
    545 	// Try to grab the _lockfs_lock mutex. This mutex could be protecting
    546 	// the _locking_on boolean.
    547 	//
    548 	_lockfs_lock.lock();
    549 	while (_locking_on) {
    550 		char nodename[32];
    551 
    552 		(void) sprintf(nodename, "Node (%u)", orb_conf::node_number());
    553 
    554 		os::sc_syslog_msg msg(SC_SYSLOG_FILESYSTEM_TAG,
    555 		    nodename, NULL);
    556 		//
    557 		// SCMSGS
    558 		// @explanation
    559 		// The file system has been locked with the _FIOLFS ioctl. It
    560 		// is necessary to perform an unlock _FIOLFS ioctl. The
    561 		// growfs(1M) or lockfs(1M) command may be responsible for
    562 		// this lock.
    563 		// @user_action
    564 		// An _FIOLFS LOCKFS_ULOCK ioctl is required to unlock the
    565 		// file system.
    566 		//
    567 		(void) msg.log(SC_SYSLOG_WARNING, MESSAGE,
    568 		    "Filesystem (%s) is locked and cannot be frozen",
    569 		    fs_name);
    570 		_fiolfs_cv.wait(&_lockfs_lock);
    571 	}
    572 	_lockfs_lock.unlock();
    573 }
    574 
    575 //
    576 // Helper function for dumping state to a new secondary.
    577 //
    578 void
    579 ufs_dependent_impl::dump_state(REPL_PXFS_VER::fs_replica_ptr ckptp,
    580     Environment &env)
    581 {
    582 	if (_locking_on) {
    583 		ckptp->ckpt_lockfs_info((uint64_t)_lockfs_info.lf_lock,
    584 		    (uint64_t)_lockfs_info.lf_flags,
    585 		    (uint64_t)_lockfs_info.lf_key,
    586 		    _lockfs_info.lf_comment, env);
    587 		env.clear();
    588 	}
    589 }
    590 
    591 //
    592 // ioctl_fiosatime - Helper function to perform the '_FIOSATIME' ufs ioctl.
    593 // The UFS _FIOSATIME ioctl sets the access time of the file it is performed
    594 // on.  We need to downgrade the attribute caches before letting the ioctl
    595 // through to UFS.  Also, since UFS only does the update in-memory, PXFS needs
    596 // to perform a sync on the file if the ioctl completes successfully.
    597 //
    598 sol::error_t
    599 ufs_dependent_impl::ioctl_fiosatime(fobj_ii *fobjp, sol::intptr_t arg,
    600     int32_t flag, int *ret_val, cred_t *crp, Environment &)
    601 {
    602 	int	error;
    603 	vnode_t *vp = fobjp->get_vp();
    604 
    605 	//
    606 	// The _FIOSATIME sets the access time attribute of the vnode.  Before
    607 	// we issue the ioctl, we need to downgrade the attribute caches just
    608 	// like in fobj_ii::set_attributes()
    609 	//
    610 	fobjp->attr_lock.wrlock();
    611 
    612 	error = fobjp->downgrade_attr_all(PXFS_VER::attr_write, false, 0);
    613 	if (error != 0) {
    614 		fobjp->attr_lock.unlock();
    615 		return (error);
    616 	}
    617 
    618 	// Issue the ioctl.
    619 #if	SOL_VERSION >= __s11
    620 	error = VOP_IOCTL(vp, _FIOSATIME, arg, flag, crp, ret_val, NULL);
    621 #else
    622 	error = VOP_IOCTL(vp, _FIOSATIME, arg, flag, crp, ret_val);
    623 #endif
    624 
    625 	if ((error == 0) && fobjp->is_replicated()) {
    626 		//
    627 		// If the ioctl was successful, write out the updated value to
    628 		// disk so that if there is a failover/switchover, the
    629 		// attribute value will remain consistent.
    630 		// XXX This is currently needed because UFS logging writes
    631 		// the log asynchronously.
    632 		// XXX is FDSYNC really correct ?
    633 		//
    634 		error = do_fsync(vp, crp);
    635 	}
    636 
    637 	fobjp->attr_lock.unlock();
    638 	return (error);
    639 }
    640 
    641 //
    642 // Helper function to perform the '_FIOLFS' ufs ioctl.
    643 // The UFS _FIOLFS ioctl performs file-system locking.  If the fs being locked
    644 // is a HA fs, then PXFS needs to keep track of the current locking state of the
    645 // fs, and further needs to replay this lock when a failover/switchover of the
    646 // filesystem happens.  This implies that we need to checkpoint the fs locking
    647 // error over to the PXFS secondaries - this is done in this routine and
    648 // also when a new secondary is added to this service.
    649 //
    650 sol::error_t
    651 ufs_dependent_impl::ioctl_fiolfs(fobj_ii *fobjp, sol::intptr_t arg,
    652     int32_t flag, int *ret_val, cred_t *crp, Environment &env)
    653 {
    654 	int error;
    655 	struct lockfs fs_lockfs;
    656 	fobj_lockfs_state *state_obj;
    657 	int state;
    658 	sol::error_t err;
    659 	vnode_t *vp = fobjp->get_vp();
    660 
    661 	ASSERT(fobjp->get_fsp() == _fsp);
    662 	ASSERT(fobjp->is_replicated());
    663 
    664 	if ((state_obj = fobj_lockfs_state::retry(env)) != NULL) {
    665 		//
    666 		// This call is a retry.  Get the stored data from the state
    667 		// object.  Note that it is possible to reconstruct 'fs_lockfs'
    668 		// from the arguments to ioctl_fiolfs, but that is slower than
    669 		// just copying it out of the state object.
    670 		//
    671 		state_obj->get_args(&fs_lockfs, &state);
    672 
    673 		switch (state) {
    674 		case fobj_lockfs_state::INITIAL:
    675 			// Continue with the processing.
    676 			break;
    677 
    678 		case fobj_lockfs_state::COMMITED:
    679 			//
    680 			// The ioctl has already been replayed - we need to
    681 			// return success.  But (sigh!) it's not as easy as
    682 			// that - we also need to copy out the appropriate
    683 			// value of lf_key to the user.
    684 			// 'lf_key' is a value stored by UFS associated
    685 			// with the current lock - UFS uses it to provide some
    686 			// protection against multiple threads doing locking on
    687 			// a filesystem without knowing about each other.  We,
    688 			// of course, need to support this so that the lockfs
    689 			// protocol is truly transparent in a HA PXFS
    690 			// filesystem.
    691 			//
    692 			err = set_lockfs_user_params(arg, flag,
    693 			    (uint64_t)_lockfs_info.lf_key);
    694 			return (err);
    695 
    696 		case fobj_lockfs_state::CANCELLED:
    697 			//
    698 			// The ioctl has completed before - just return the
    699 			// error code.
    700 			//
    701 			return (state_obj->get_error());
    702 
    703 		default:
    704 			break;
    705 		}
    706 	}
    707 
    708 	//
    709 	// Copy the lockfs structure in from user space.
    710 	// Note that the get_lockfs_user_params() call returns an
    711 	// allocated buffer in fs_lockfs.lf_comment - this memory is
    712 	// freed before returning from this routine if there is an
    713 	// error.  If there is no error, the fs_lockfs parameter
    714 	// (along with the allocated memory) is stored in the fs_ii
    715 	// object by the store_lockfs_params() call below, and
    716 	// subsequently freed by the fs object.
    717 	//
    718 #if defined(_LEAST_PRIVS)
    719 	err = get_lockfs_user_params(arg, flag, crp, fs_lockfs,
    720 	    fobjp->get_vp());
    721 #else
    722 	err = get_lockfs_user_params(arg, flag, crp, fs_lockfs);
    723 #endif
    724 	if (err != 0) {
    725 		return (err);
    726 	}
    727 
    728 	//
    729 	// this is for 4413957 -- because we call VOP_RENAME() instead of
    730 	// VOP_REMOVE() and can't easily check for UFS delete lock enabled
    731 	//
    732 	if (LOCKFS_DLOCK == fs_lockfs.lf_lock) {
    733 		return (ENOTSUP);
    734 	}
    735 
    736 	//
    737 	// Checkpoint all the parameters over to the secondaries.  The
    738 	// secondaries create a transaction object to store all the
    739 	// parameters and then the transaction object waits for the
    740 	// commit checkpoint to store the parameters with the fs
    741 	// secondaries.
    742 	//
    743 	fobjp->get_ckpt()->ckpt_lockfs_start((uint64_t)fs_lockfs.lf_lock,
    744 	    (uint64_t)fs_lockfs.lf_flags, (uint64_t)fs_lockfs.lf_key,
    745 	    fs_lockfs.lf_comment, env);
    746 	env.clear();
    747 
    748 	//
    749 	// We need to serialize the lockfs ioctls on a per-filesystem basis
    750 	// so that we know which lock is curently in effect on the underlying
    751 	// filesystem.  This guards for the race-condition if two lockfs
    752 	// calls get into UFS and the last one out of UFS is not the last one
    753 	// checkpointed across to the secondaries.
    754 	//
    755 	_lockfs_lock.lock();
    756 
    757 #if	SOL_VERSION >= __s11
    758 	error = VOP_IOCTL(vp, _FIOLFS, arg, flag, crp, ret_val, NULL);
    759 #else
    760 	error = VOP_IOCTL(vp, _FIOLFS, arg, flag, crp, ret_val);
    761 #endif
    762 
    763 	if (error == 0) {
    764 		//
    765 		// Success.  We must now store the parameters we used to
    766 		// lock the filesystem on the primary, and also commit this
    767 		// transaction on the secondaries.  We do this to enable PXFS
    768 		// to replay the ioctl during a switchover/failover
    769 		//
    770 		// there are 2 cases: lockfs -u and everyone else
    771 		//
    772 		if (LOCKFS_ULOCK == fs_lockfs.lf_lock) {
    773 			_locking_on = false;
    774 		} else {
    775 			_locking_on = true;
    776 		}
    777 		store_lockfs_params(fs_lockfs);
    778 		fobjp->commit(env);
    779 		env.clear();
    780 		_fiolfs_cv.broadcast();
    781 	} else {
    782 		//
    783 		// The ioctl failed.  Cancel the transaction on the secondaries
    784 		// Note that we are sending the error code over - this is
    785 		// returned to the user if there is a retry of this call.
    786 		//
    787 		fobjp->get_ckpt()->ckpt_lockfs_failure(error, env);
    788 		env.clear();
    789 		delete [] fs_lockfs.lf_comment;
    790 	}
    791 
    792 	_lockfs_lock.unlock();
    793 
    794 	return (error);
    795 }
    796 
    797 //
    798 // Helper function to copy the user parameters to the _FIOLFS ioctl into kernel
    799 // space.
    800 //
    801 #if defined(_LEAST_PRIVS)
    802 sol::error_t
    803 ufs_dependent_impl::get_lockfs_user_params(sol::intptr_t uarg, int32_t flag,
    804     cred_t *crp, struct lockfs &lkfs, vnode_t *vp)
    805 #else
    806 sol::error_t
    807 ufs_dependent_impl::get_lockfs_user_params(sol::intptr_t uarg, int32_t flag,
    808     cred_t *crp, struct lockfs &lkfs)
    809 #endif
    810 {
    811 	char *comment;
    812 
    813 	//
    814 	// NOTE: This code is adapted from the ufs code that handles the
    815 	// '_FIOLFS' ioctl in ufs_vnops.c.  If that code changes in any way,
    816 	// this should change with it.
    817 	//
    818 #if defined(_LEAST_PRIVS)
    819 	if (secpolicy_fs_config(crp, vp->v_vfsp) != 0) {
    820 #else
    821 	if (! suser(crp)) {
    822 #endif
    823 		return (EPERM);
    824 	}
    825 
    826 	if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
    827 		if (copyin((caddr_t)uarg, &lkfs, sizeof (struct lockfs))) {
    828 			return (EFAULT);
    829 		}
    830 	}
    831 #ifdef _SYSCALL32_IMPL
    832 	else {
    833 		struct lockfs32 lkfs32;
    834 		/* Translate ILP32 lockfs to LP64 lockfs */
    835 		if (copyin((caddr_t)uarg, &lkfs32, sizeof (struct lockfs32)))
    836 			return (EFAULT);
    837 		lkfs.lf_lock = (ulong_t)lkfs32.lf_lock;
    838 		lkfs.lf_flags = (ulong_t)lkfs32.lf_flags;
    839 		lkfs.lf_key = (ulong_t)lkfs32.lf_key;
    840 		lkfs.lf_comlen = (ulong_t)lkfs32.lf_comlen;
    841 		lkfs.lf_comment = (caddr_t)lkfs32.lf_comment;
    842 	}
    843 #endif /* _SYSCALL32_IMPL */
    844 
    845 	if (lkfs.lf_comlen) {
    846 		if (lkfs.lf_comlen > LOCKFS_MAXCOMMENTLEN) {
    847 			return (ENAMETOOLONG);
    848 		}
    849 		comment = new char[lkfs.lf_comlen];
    850 		if (copyin(lkfs.lf_comment, comment, lkfs.lf_comlen)) {
    851 			delete [] comment;
    852 			return (EFAULT);
    853 		}
    854 		lkfs.lf_comment = comment;
    855 	} else {
    856 		lkfs.lf_comment = NULL;
    857 	}
    858 
    859 	return (0);
    860 }
    861 
    862 //
    863 // Helper function used to copy out the current value of 'lf_key' to the user.
    864 // This function only needs to be called if a retry comes in for a committed
    865 // transaction.
    866 //
    867 sol::error_t
    868 ufs_dependent_impl::set_lockfs_user_params(sol::intptr_t uarg, int32_t flag,
    869     uint64_t lf_key)
    870 {
    871 	//
    872 	// This function is modelled after ufs handling of the _FIOLFS ioctl.
    873 	// If you plan on making any changes here, look at ufs first.
    874 	//
    875 	if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
    876 		struct lockfs lkfs;
    877 
    878 		// Copy in the arguments.
    879 		if (copyin((caddr_t)uarg, &lkfs, sizeof (struct lockfs))) {
    880 			return (EFAULT);
    881 		}
    882 		// Set the key value to the current value.
    883 		lkfs.lf_key = (ulong_t)lf_key;
    884 
    885 		// ... and copy out.
    886 		(void) copyout(&lkfs, (caddr_t)uarg, sizeof (struct lockfs));
    887 	}
    888 #ifdef _SYSCALL32_IMPL
    889 	else {
    890 		// Do the same thing in 32 bits.
    891 		struct lockfs32 lkfs32;
    892 
    893 		// Copy in the arguments.
    894 		if (copyin((caddr_t)uarg, &lkfs32, sizeof (struct lockfs32)))
    895 			return (EFAULT);
    896 
    897 		// Set the key value to the current value.
    898 		lkfs32.lf_key = (uint32_t)lf_key;
    899 
    900 		// ... and copy out.
    901 		(void) copyout(&lkfs32, (caddr_t)uarg,
    902 		    sizeof (struct lockfs32));
    903 	}
    904 #endif /* _SYSCALL32_IMPL */
    905 
    906 	return (0);
    907 }
    908 
    909 //
    910 // This is where the ckpt_lockfs_state() checkpoint ends up.
    911 // We store the fs locking parameters in the '_lockfs_info' member.
    912 // 'lockfs_info' is used during failovers/switchovers to replay the locking
    913 // ioctl.
    914 //
    915 void
    916 ufs_dependent_impl::ckpt_lockfs_state(uint64_t lf_lock, uint64_t lf_flags,
    917     uint64_t lf_key, const char *lf_comment)
    918 {
    919 	ASSERT(_fsp->is_replicated());
    920 	ASSERT(_fsp->is_secondary());
    921 
    922 	if (LOCKFS_ULOCK != lf_lock) {
    923 		PXFS_DBPRINTF(
    924 		    PXFS_TRACE_UFS,
    925 		    PXFS_GREEN,
    926 		    ("locked via ckpt_lockfs_state()\n"));
    927 		_locking_on = true;
    928 	} else {
    929 		PXFS_DBPRINTF(
    930 		    PXFS_TRACE_UFS,
    931 		    PXFS_GREEN,
    932 		    ("unlocked via ckpt_lockfs_state()\n"));
    933 		_locking_on = false;
    934 	}
    935 
    936 	delete [] _lockfs_info.lf_comment;
    937 
    938 	_lockfs_info.lf_lock = (ulong_t)lf_lock;
    939 	_lockfs_info.lf_flags = (ulong_t)lf_flags;
    940 	_lockfs_info.lf_key = (ulong_t)lf_key;
    941 	if (lf_comment == NULL) {
    942 		_lockfs_info.lf_comlen = 0;
    943 		_lockfs_info.lf_comment = NULL;
    944 	} else {
    945 		_lockfs_info.lf_comlen = strlen(lf_comment) + 1;
    946 		_lockfs_info.lf_comment = new char[_lockfs_info.lf_comlen];
    947 		(void) strcpy(_lockfs_info.lf_comment, lf_comment);
    948 	}
    949 }
    950 
    951 //
    952 // Called from fobj_ii::ioctl_fiolfs to set the locking parameters of the
    953 // underlying filesystem.
    954 //
    955 void
    956 ufs_dependent_impl::store_lockfs_params(struct lockfs &lkfs)
    957 {
    958 	ASSERT(_lockfs_lock.lock_held());
    959 	ASSERT(_fsp->is_replicated());
    960 	ASSERT(!_fsp->is_secondary());
    961 
    962 	delete [] _lockfs_info.lf_comment;
    963 
    964 	_lockfs_info = lkfs;
    965 }
    966 
    967 extern "C" int ufs_alloc_data(vnode_t *vp, u_offset_t offset, size_t *len,
    968     fdbuffer_t *fdb, int flags, cred_t *cr);
    969 extern "C" int ufs_rdwr_data(vnode_t *vp, u_offset_t offset, size_t len,
    970     fdbuffer_t *fdb, int flags, cred_t *cr);
    971 
    972 int
    973 ufs_dependent_impl::fs_alloc_data(vnode_t *vp, u_offset_t offset, size_t *len,
    974     fdbuffer_t *fdb, int flags, cred_t *cr)
    975 {
    976 	//
    977 	// If this is a NFS thread, unset the T_DONTPEND flag so that
    978 	// ufs_alloc_data remains asynchronous.
    979 	//
    980 	bool nfs_thread = (bool)(curthread->t_flag & T_DONTPEND);
    981 
    982 	if (nfs_thread) {
    983 		curthread->t_flag &= ~T_DONTPEND;
    984 	}
    985 	PXFS_DBPRINTF(
    986 	    PXFS_TRACE_UFS,
    987 	    PXFS_GREEN,
    988 	    ("fs_alloc_data: vp %p off %llx len %lx\n",
    989 	    vp, offset, *len));
    990 	int error = ufs_alloc_data(vp, offset, len, fdb, flags, cr);
    991 	if (nfs_thread) {
    992 		curthread->t_flag |= T_DONTPEND;
    993 	}
    994 
    995 	return (error);
    996 }
    997 
    998 int
    999 ufs_dependent_impl::fs_rdwr_data(vnode_t *vp, u_offset_t offset, size_t len,
   1000     size_t, fdbuffer_t *fdb, int flags, cred_t *cr)
   1001 {
   1002 	PXFS_DBPRINTF(
   1003 	    PXFS_TRACE_UFS,
   1004 	    PXFS_GREEN,
   1005 	    ("fs_rdwr_data: %s%s vp %p off %llx len %lx\n",
   1006 	    flags & B_ASYNC ? "a" : "",
   1007 	    flags & B_READ ? "read" : "write",
   1008 	    vp, offset, len));
   1009 
   1010 	return (ufs_rdwr_data(vp, offset, len, fdb, flags, cr));
   1011 }
   1012 
   1013 //
   1014 // fs_fsync - when "syncdir" option isn't enabled and this is an HA file system,
   1015 // we need to flush the UFS log to disk so we don't lose file meta-data
   1016 // after a failover.
   1017 //
   1018 int
   1019 ufs_dependent_impl::fs_fsync(vnode_t *vnodep, cred_t *credp)
   1020 {
   1021 	int		error = 0;
   1022 
   1023 	if (!_fsp->is_replicated() || _syncdir_on) {
   1024 		// Nothing to do.
   1025 		return (0);
   1026 	}
   1027 
   1028 	FAULTPT_PXFS(FAULTNUM_PXFS_SYNCDIR, FaultFunctions::generic);
   1029 	//
   1030 	// We store the time we started doing the FSYNC and assign it to
   1031 	// last_sync_time only after VOP_FSYNC completes. This will make
   1032 	// threads that came in when a VOP_FSYNC was in progress to wait
   1033 	// until VOP_FSYNC completion.
   1034 	//
   1035 	os::hrtime_t	tmp_last_sync_time = os::gethrtime();
   1036 
   1037 	//
   1038 	// ufs_alloc_data() creates async log transactions. ufs_fsync() is
   1039 	// optimized for NFS threads to skip fsync if the last transaction
   1040 	// for that thread is already commited. PxFS depends on *all*
   1041 	// transactions created before ufs_fsync was called being commited.
   1042 	// The optimization for NFS breaks this guarantee. To work around
   1043 	// the above problem, we clear T_DONTPEND flag on this thread to
   1044 	// make this thread look like a non-NFS thread.
   1045 	//
   1046 	bool nfs_thread = (bool)(curthread->t_flag & T_DONTPEND);
   1047 	if (nfs_thread) {
   1048 		curthread->t_flag &= ~T_DONTPEND;
   1049 	}
   1050 #if	SOL_VERSION >= __s11
   1051 	error = VOP_FSYNC(vnodep, FNODSYNC, credp, NULL);
   1052 #else
   1053 	error = VOP_FSYNC(vnodep, FNODSYNC, credp);
   1054 #endif
   1055 	if (nfs_thread) {
   1056 		curthread->t_flag |= T_DONTPEND;
   1057 	}
   1058 
   1059 	last_sync_time = tmp_last_sync_time;
   1060 
   1061 	return (error);
   1062 }
   1063 
   1064 //
   1065 // do_fsync - sync all information to disk.
   1066 //
   1067 int
   1068 ufs_dependent_impl::do_fsync(vnode_t *vnodep, cred_t *credp)
   1069 {
   1070 	int		error;
   1071 	sync_lock.wrlock();
   1072 
   1073 	//
   1074 	// We store the time we started doing VOP_FSYNC and assign it
   1075 	// to last_sync_time only after VOP_FSYNC completes. This will
   1076 	// make threads that came in when a VOP_FSYNC was in progress
   1077 	// to wait until VOP_FSYNC completion.
   1078 	//
   1079 	os::hrtime_t	tmp_last_sync_time = os::gethrtime();
   1080 
   1081 	//
   1082 	// If this is a NFS thread, unset the T_DONTPEND flag so that
   1083 	// the sync operation will not be skipped by optimizations
   1084 	// for NFS that do not apply to Pxfs.
   1085 	//
   1086 	bool nfs_thread = (bool)(curthread->t_flag & T_DONTPEND);
   1087 
   1088 	if (nfs_thread) {
   1089 		curthread->t_flag &= ~T_DONTPEND;
   1090 	}
   1091 #if	SOL_VERSION >= __s11
   1092 	error = VOP_FSYNC(vnodep, FSYNC, credp, NULL);
   1093 #else
   1094 	error = VOP_FSYNC(vnodep, FSYNC, credp);
   1095 #endif
   1096 	if (nfs_thread) {
   1097 		curthread->t_flag |= T_DONTPEND;
   1098 	}
   1099 
   1100 	last_sync_time = tmp_last_sync_time;
   1101 
   1102 	sync_lock.unlock();
   1103 	return (error);
   1104 }
   1105 
   1106 //
   1107 // Sync. out the in-memory log if we don't know that it was pushed to disk
   1108 // after 'mod_time'.
   1109 //
   1110 int
   1111 ufs_dependent_impl::sync_if_necessary(os::hrtime_t &mod_time, vnode *vnodep,
   1112     cred_t *credp)
   1113 {
   1114 	int	error = 0;
   1115 
   1116 	if (mod_time >= last_sync_time) {
   1117 		sync_lock.wrlock();
   1118 		if (mod_time >= last_sync_time) {
   1119 			error = fs_fsync(vnodep, credp);
   1120 		}
   1121 		sync_lock.unlock();
   1122 	}
   1123 
   1124 	return (error);
   1125 }
   1126