Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * NFS Version 4 state recovery code.
     28  */
     29 
     30 #include <nfs/nfs4_clnt.h>
     31 #include <nfs/nfs4.h>
     32 #include <nfs/rnode4.h>
     33 #include <sys/cmn_err.h>
     34 #include <sys/cred.h>
     35 #include <sys/systm.h>
     36 #include <sys/flock.h>
     37 #include <sys/dnlc.h>
     38 #include <sys/ddi.h>
     39 #include <sys/disp.h>
     40 #include <sys/list.h>
     41 #include <sys/sdt.h>
     42 
     43 extern r4hashq_t *rtable4;
     44 
     45 /*
     46  * Information that describes what needs to be done for recovery.  It is
     47  * passed to a client recovery thread as well as passed to various recovery
     48  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
     49  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
     50  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
     51  * lock or open/close request, and it holds reference counts for the
     52  * various objects (vnode, etc.).  The recovery thread also uses flags set
     53  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
     54  * to save the error that originally triggered the recovery event -- will
     55  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
     56  * contains information about the request that got NFS4ERR_BAD_SEQID, and
     57  * it holds reference count for the various objects (vnode, open owner,
     58  * open stream, lock owner).
     59  */
     60 
typedef struct {
	mntinfo4_t *rc_mi;		/* filesystem affected by recovery */
	vnode_t *rc_vp1;		/* held vnode (VN_HOLD) or NULL */
	vnode_t *rc_vp2;		/* held vnode (VN_HOLD) or NULL */
	nfs4_recov_t rc_action;		/* NR_* action start_recovery acts on */
	stateid4 rc_stateid;
	bool_t rc_srv_reboot;		/* server has rebooted */
	nfs4_lost_rqst_t *rc_lost_rqst;	/* lost lock or open/close request */
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;			/* saved to set mi_error on failure */
	nfs4_bseqid_entry_t *rc_bseqid_rqst; /* rqst that got BAD_SEQID */
} recov_info_t;
     73 
     74 /*
     75  * How long to wait before trying again if there is an error doing
     76  * recovery, in seconds.
     77  */
     78 
     79 static int recov_err_delay = 1;
     80 
     81 /*
     82  * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
     83  * errors.  Expressed in seconds.  Default is defined as
     84  * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init()
     85  */
     86 time_t nfs4err_delay_time = 0;
     87 
/*
 * Tuneable to limit how many times "exempt" ops go OTW
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after the rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.
 */
     99 int nfs4_max_recov_error_retry = 3;
    100 
    101 /*
    102  * Number of seconds the recovery thread should pause before retry when the
    103  * filesystem has been forcibly unmounted.
    104  */
    105 
    106 int nfs4_unmount_delay = 1;
    107 
    108 #ifdef DEBUG
    109 
    110 /*
    111  * How long to wait (in seconds) between recovery operations on a given
    112  * file.  Normally zero, but could be set longer for testing purposes.
    113  */
    114 static int nfs4_recovdelay = 0;
    115 
    116 /*
    117  * Switch that controls whether to go into the debugger when recovery
    118  * fails.
    119  */
    120 static int nfs4_fail_recov_stop = 0;
    121 
    122 /*
    123  * Tuneables to debug client namespace interaction with server
    124  * mount points:
    125  *
    126  *	nfs4_srvmnt_fail_cnt:
    127  *		number of times EACCES returned because client
    128  *		attempted to cross server mountpoint
    129  *
    130  *	nfs4_srvmnt_debug:
    131  *		trigger console printf whenever client attempts
    132  *		to cross server mountpoint
    133  */
    134 int nfs4_srvmnt_fail_cnt = 0;
    135 int nfs4_srvmnt_debug = 0;
    136 #endif
    137 
/* forward references, in roughly alphabetic order */
    139 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
    140 	nfs4_error_t *);
    141 static void errs_to_action(recov_info_t *,
    142 	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
    143 	nfs_opnum4, nfs4_bseqid_entry_t *);
    144 static void flush_reinstate(nfs4_lost_rqst_t *);
    145 static void free_milist(mntinfo4_t **, int);
    146 static mntinfo4_t **make_milist(nfs4_server_t *, int *);
    147 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
    148 	nfs4_recov_state_t *, int, char *);
    149 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
    150 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
    151 static void nfs4_recov_thread(recov_info_t *);
    152 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
    153 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
    154 static cred_t *pid_to_cr(pid_t);
    155 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
    156 static void recov_bad_seqid(recov_info_t *);
    157 static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
    158 static void recov_clientid(recov_info_t *, nfs4_server_t *);
    159 static void recov_done(mntinfo4_t *, recov_info_t *);
    160 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
    161 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
    162 static void recov_openfiles(recov_info_t *, nfs4_server_t *);
    163 static void recov_stale(mntinfo4_t *, vnode_t *);
    164 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
    165 static void recov_throttle(recov_info_t *, vnode_t *);
    166 static void relock_skip_pid(locklist_t *, pid_t);
    167 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
    168 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
    169 	nfs4_server_t *);
    170 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
    171 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
    172 	nfs4_server_t *);
    173 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
    174 	vnode_t *);
    175 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
    176 
    177 /*
    178  * Return non-zero if the given errno, status, and rpc status codes
    179  * in the nfs4_error_t indicate that client recovery is needed.
    180  * "stateful" indicates whether the call that got the error establishes or
    181  * removes state on the server (open, close, lock, unlock, delegreturn).
    182  */
    183 
    184 int
    185 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
    186 {
    187 	int recov = 0;
    188 	mntinfo4_t *mi;
    189 
    190 	/*
    191 	 * Try failover if the error values justify it and if
    192 	 * it's a failover mount.  Don't try if the mount is in
    193 	 * progress, failures are handled explicitly by nfs4rootvp.
    194 	 */
    195 	if (nfs4_try_failover(ep)) {
    196 		mi = VFTOMI4(vfsp);
    197 		mutex_enter(&mi->mi_lock);
    198 		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
    199 		mutex_exit(&mi->mi_lock);
    200 		if (recov)
    201 			return (recov);
    202 	}
    203 
    204 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
    205 		/*
    206 		 * The server may have gotten the request, so for stateful
    207 		 * ops we need to resynchronize and possibly back out the
    208 		 * op.
    209 		 */
    210 		return (stateful);
    211 	}
    212 	if (ep->error != 0)
    213 		return (0);
    214 
    215 	/* stat values are listed alphabetically */
    216 	/*
    217 	 * There are two lists here: the errors for which we have code, and
    218 	 * the errors for which we plan to have code before FCS.  For the
    219 	 * second list, print a warning message but don't attempt recovery.
    220 	 */
    221 	switch (ep->stat) {
    222 	case NFS4ERR_BADHANDLE:
    223 	case NFS4ERR_BAD_SEQID:
    224 	case NFS4ERR_BAD_STATEID:
    225 	case NFS4ERR_DELAY:
    226 	case NFS4ERR_EXPIRED:
    227 	case NFS4ERR_FHEXPIRED:
    228 	case NFS4ERR_GRACE:
    229 	case NFS4ERR_OLD_STATEID:
    230 	case NFS4ERR_RESOURCE:
    231 	case NFS4ERR_STALE_CLIENTID:
    232 	case NFS4ERR_STALE_STATEID:
    233 	case NFS4ERR_WRONGSEC:
    234 	case NFS4ERR_STALE:
    235 		recov = 1;
    236 		break;
    237 #ifdef DEBUG
    238 	case NFS4ERR_LEASE_MOVED:
    239 	case NFS4ERR_MOVED:
    240 		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
    241 		    CE_WARN, "!Can't yet recover from NFS status %d",
    242 		    ep->stat);
    243 		break;
    244 #endif
    245 	}
    246 
    247 	return (recov);
    248 }
    249 
    250 /*
    251  * Some operations such as DELEGRETURN want to avoid invoking
    252  * recovery actions that will only mark the file dead.  If
    253  * better handlers are invoked for any of these errors, this
    254  * routine should be modified.
    255  */
    256 int
    257 nfs4_recov_marks_dead(nfsstat4 status)
    258 {
    259 	if (status == NFS4ERR_BAD_SEQID ||
    260 	    status == NFS4ERR_EXPIRED ||
    261 	    status == NFS4ERR_BAD_STATEID ||
    262 	    status == NFS4ERR_OLD_STATEID)
    263 		return (1);
    264 	return (0);
    265 }
    266 
    267 /*
    268  * Transfer the state recovery information in recovp to mi's resend queue,
    269  * and mark mi as having a lost state request.
    270  */
    271 static void
    272 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
    273 {
    274 	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
    275 
    276 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    277 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    278 
    279 	ASSERT(lrp != NULL && lrp->lr_op != 0);
    280 
    281 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
    282 	    "nfs4_enqueue_lost_rqst %p, op %d",
    283 	    (void *)lrp, lrp->lr_op));
    284 
    285 	mutex_enter(&mi->mi_lock);
    286 	mi->mi_recovflags |= MI4R_LOST_STATE;
    287 	if (lrp->lr_putfirst)
    288 		list_insert_head(&mi->mi_lost_state, lrp);
    289 	else
    290 		list_insert_tail(&mi->mi_lost_state, lrp);
    291 	recovp->rc_lost_rqst = NULL;
    292 	mutex_exit(&mi->mi_lock);
    293 
    294 	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
    295 	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
    296 }
    297 
    298 /*
    299  * Transfer the bad seqid recovery information in recovp to mi's
    300  * bad seqid queue, and mark mi as having a bad seqid request.
    301  */
    302 void
    303 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
    304 {
    305 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    306 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    307 	ASSERT(recovp->rc_bseqid_rqst != NULL);
    308 
    309 	mutex_enter(&mi->mi_lock);
    310 	mi->mi_recovflags |= MI4R_BAD_SEQID;
    311 	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
    312 	recovp->rc_bseqid_rqst = NULL;
    313 	mutex_exit(&mi->mi_lock);
    314 }
    315 
    316 /*
    317  * Initiate recovery.
    318  *
    319  * The nfs4_error_t contains the return codes that triggered a recovery
    320  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
    321  * being operated on.  vp1 and vp2 may be NULL.
    322  *
    323  * Multiple calls are okay.  If recovery is already underway, the call
    324  * updates the information about what state needs recovery but does not
    325  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
    326  * for proper synchronization with any recovery thread.
    327  *
    328  * This will return TRUE if recovery was aborted, and FALSE otherwise.
    329  */
    330 bool_t
    331 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
    332     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
    333     nfs4_bseqid_entry_t *bsep)
    334 {
    335 	recov_info_t *recovp;
    336 	nfs4_server_t *sp;
    337 	bool_t abort = FALSE;
    338 	bool_t gone = FALSE;
    339 
    340 	ASSERT(nfs_zone() == mi->mi_zone);
    341 	mutex_enter(&mi->mi_lock);
    342 	/*
    343 	 * If there is lost state, we need to kick off recovery even if the
    344 	 * filesystem has been unmounted or the zone is shutting down.
    345 	 */
    346 	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
    347 	if (gone) {
    348 		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
    349 		if (ep->error == EIO && lost_rqstp == NULL) {
    350 			/* failed due to forced unmount, no new lost state */
    351 			abort = TRUE;
    352 		}
    353 		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
    354 		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
    355 			/* some other failure, no existing lost state */
    356 			abort = TRUE;
    357 		}
    358 		if (abort) {
    359 			mutex_exit(&mi->mi_lock);
    360 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    361 			    "nfs4_start_recovery: fs unmounted"));
    362 			return (TRUE);
    363 		}
    364 	}
    365 	mi->mi_in_recovery++;
    366 	mutex_exit(&mi->mi_lock);
    367 
    368 	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
    369 	recovp->rc_orig_errors = *ep;
    370 	sp = find_nfs4_server(mi);
    371 	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
    372 	if (sp != NULL)
    373 		mutex_exit(&sp->s_lock);
    374 	start_recovery(recovp, mi, vp1, vp2, sp);
    375 	if (sp != NULL)
    376 		nfs4_server_rele(sp);
    377 	return (FALSE);
    378 }
    379 
    380 /*
    381  * Internal version of nfs4_start_recovery.  The difference is that the
    382  * caller specifies the recovery action, rather than the errors leading to
    383  * recovery.
    384  */
    385 static void
    386 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
    387     vnode_t *vp1, vnode_t *vp2)
    388 {
    389 	recov_info_t *recovp;
    390 
    391 	ASSERT(nfs_zone() == mi->mi_zone);
    392 	mutex_enter(&mi->mi_lock);
    393 	mi->mi_in_recovery++;
    394 	mutex_exit(&mi->mi_lock);
    395 
    396 	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
    397 	recovp->rc_action = what;
    398 	recovp->rc_srv_reboot = reboot;
    399 	recovp->rc_error = EIO;
    400 	start_recovery(recovp, mi, vp1, vp2, NULL);
    401 }
    402 
/*
 * Common back end for nfs4_start_recovery() and start_recovery_action().
 *
 * Dispatches on recovp->rc_action.  Some actions are handled entirely
 * here (e.g. marking files via recov_badstate(), recovering a
 * filehandle, arming a grace/delay wait); those leave through
 * out_no_thread.  The others record what is needed in mi_recovflags
 * and/or queue the lost or bad-seqid request on mi, then start
 * nfs4_recov_thread() unless MI4_RECOV_ACTIV shows one is already
 * running.
 *
 * The callers have already incremented mi_in_recovery.  On the
 * out_no_thread path this routine decrements it again (waking waiters
 * on mi_cv_in_recov when it reaches zero), drops the vfs and mntinfo4
 * holds taken at entry, and frees recovp.  When a thread is created,
 * recovp is handed to it along with holds on vp1 and vp2.
 *
 * vp1 and vp2 may be NULL.  sp may be NULL; it is only used for
 * NR_CLIENTID.
 */
static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: mi %p, what %s", (void*)mi,
	    nfs4_recov_action_to_str(recovp->rc_action)));

	/*
	 * Bump the reference on the vfs so that we can pass it to the
	 * recovery thread.
	 */
	VFS_HOLD(mi->mi_vfsp);
	MI4_HOLD(mi);
again:
	switch (recovp->rc_action) {
	case NR_FAILOVER:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		/* Only one server in the list: nothing to fail over to. */
		if (mi->mi_servers->sv_next == NULL)
			goto out_no_thread;
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		mutex_exit(&mi->mi_lock);

		if (recovp->rc_lost_rqst != NULL)
			nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	case NR_CLIENTID:
		/*
		 * If the filesystem has been unmounted, punt.
		 */
		if (sp == NULL)
			goto out_no_thread;

		/*
		 * If nobody else is working on the clientid, mark the
		 * clientid as being no longer set.  Then mark the specific
		 * filesystem being worked on.
		 */
		if (!nfs4_server_in_recovery(sp)) {
			mutex_enter(&sp->s_lock);
			sp->s_flags &= ~N4S_CLIENTID_SET;
			mutex_exit(&sp->s_lock);
		}
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_OPENFILES:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_WRONGSEC:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_SECINFO;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_EXPIRED:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
		goto out_no_thread;	/* no further recovery possible */

	case NR_BAD_STATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_FHEXPIRED:
	case NR_BADHANDLE:
		if (vp1 != NULL)
			recov_throttle(recovp, vp1);
		if (vp2 != NULL)
			recov_throttle(recovp, vp2);
		/*
		 * Recover the filehandle now, rather than using a
		 * separate thread.  We can do this because filehandle
		 * recovery is independent of any other state, and because
		 * we know that we are not competing with the recovery
		 * thread at this time.  recov_filehandle will deal with
		 * threads that are competing to recover this filehandle.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (vp1 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp1);
		if (vp2 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp2);
		goto out_no_thread;	/* no further recovery needed */

	case NR_STALE:
		/*
		 * NFS4ERR_STALE handling
		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
		 * indicate that we can and should failover.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

		if (vp1 != NULL)
			recov_stale(mi, vp1);
		if (vp2 != NULL)
			recov_stale(mi, vp2);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
			mutex_exit(&mi->mi_lock);
			goto out_no_thread;
		}
		mutex_exit(&mi->mi_lock);
		/* Failover was requested: rerun the switch as NR_FAILOVER. */
		recovp->rc_action = NR_FAILOVER;
		goto again;

	case NR_BAD_SEQID:
		/*
		 * With a saved request, queue it and continue on to the
		 * thread-start code below; without one, just mark the
		 * files.
		 */
		if (recovp->rc_bseqid_rqst) {
			enqueue_bseqid_rqst(recovp, mi);
			break;
		}

		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
		goto out_no_thread; /* no further recovery possible */

	case NR_OLDSTATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_GRACE:
		nfs4_set_grace_wait(mi);
		goto out_no_thread; /* no further action required for GRACE */

	case NR_DELAY:
		if (vp1)
			nfs4_set_delay_wait(vp1);
		goto out_no_thread; /* no further action required for DELAY */

	case NR_LOST_STATE_RQST:
	case NR_LOST_LOCK:
		nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	default:
		/* Unknown action: log an event and do nothing else. */
		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		goto out_no_thread;
	}

	/*
	 * If either file recently went through the same recovery, wait
	 * awhile.  This is in case there is some sort of bug; we might not
	 * be able to recover properly, but at least we won't bombard the
	 * server with calls, and we won't tie up the client.
	 */
	if (vp1 != NULL)
		recov_throttle(recovp, vp1);
	if (vp2 != NULL)
		recov_throttle(recovp, vp2);

	/*
	 * If there's already a recovery thread, don't start another one.
	 * The flags and queue entries set above stay in place on mi.
	 */

	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_RECOV_ACTIV) {
		mutex_exit(&mi->mi_lock);
		goto out_no_thread;
	}
	mi->mi_flags |= MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: starting new thread for mi %p", (void*)mi));

	/* Fill in what the thread needs, holding each vnode we pass. */
	recovp->rc_mi = mi;
	recovp->rc_vp1 = vp1;
	if (vp1 != NULL) {
		ASSERT(VTOMI4(vp1) == mi);
		VN_HOLD(recovp->rc_vp1);
	}
	recovp->rc_vp2 = vp2;
	if (vp2 != NULL) {
		ASSERT(VTOMI4(vp2) == mi);
		VN_HOLD(recovp->rc_vp2);
	}

	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
	    minclsyspri);
	return;

	/* not reached by thread creating call */
out_no_thread:
	/* Undo the caller's mi_in_recovery count and wake any waiters. */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	/* Drop the holds taken at the top of this function. */
	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	/*
	 * Free up resources that were allocated for us.
	 */
	kmem_free(recovp, sizeof (recov_info_t));
}
    630 
    631 static int
    632 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
    633     nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
    634 {
    635 	rnode4_t *rp;
    636 	int error = 0;
    637 	int exempt;
    638 
    639 	if (vp == NULL)
    640 		return (0);
    641 
    642 	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
    643 	rp = VTOR4(vp);
    644 	mutex_enter(&rp->r_statelock);
    645 
    646 	/*
    647 	 * If there was a recovery error, then allow op hints "exempt" from
    648 	 * recov errors to retry (currently 3 times).  Either r_error or
    649 	 * EIO is returned for non-exempt op hints.
    650 	 */
    651 	if (rp->r_flags & R4RECOVERR) {
    652 		if (exempt && rsp->rs_num_retry_despite_err <=
    653 		    nfs4_max_recov_error_retry) {
    654 
    655 			/*
    656 			 * Check to make sure that we haven't already inc'd
    657 			 * rs_num_retry_despite_err for current nfs4_start_fop
    658 			 * instance.  We don't want to double inc (if we were
    659 			 * called with vp2, then the vp1 call could have
    660 			 * already incremented.
    661 			 */
    662 			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
    663 				rsp->rs_num_retry_despite_err++;
    664 
    665 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    666 			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
    667 			    (void *)vp, rsp->rs_num_retry_despite_err));
    668 		} else {
    669 			error = (rp->r_error ? rp->r_error : EIO);
    670 			/*
    671 			 * An ESTALE error on a non-regular file is not
    672 			 * "sticky".  Return the ESTALE error once, but
    673 			 * clear the condition to allow future operations
    674 			 * to go OTW.  This will allow the client to
    675 			 * recover if the server has merely unshared then
    676 			 * re-shared the file system.  For regular files,
    677 			 * the unshare has destroyed the open state at the
    678 			 * server and we aren't willing to do a reopen (yet).
    679 			 */
    680 			if (error == ESTALE && vp->v_type != VREG) {
    681 				rp->r_flags &=
    682 				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
    683 				rp->r_error = 0;
    684 				error = ESTALE;
    685 			}
    686 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    687 			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
    688 			    str, (void *)vp,
    689 			    rsp->rs_num_retry_despite_err, error));
    690 		}
    691 	}
    692 
    693 	mutex_exit(&rp->r_statelock);
    694 	return (error);
    695 }
    696 
    697 /*
    698  * Initial setup code that every operation should call if it might invoke
    699  * client recovery.  Can block waiting for recovery to finish on a
    700  * filesystem.  Either vnode ptr can be NULL.
    701  *
    702  * Returns 0 if there are no outstanding errors.  Can return an
    703  * errno value under various circumstances (e.g., failed recovery, or
    704  * interrupted while waiting for recovery to finish).
    705  *
    706  * There must be a corresponding call to nfs4_end_op() to free up any locks
    707  * or resources allocated by this call (assuming this call succeeded),
    708  * using the same rsp that's passed in here.
    709  *
    710  * The open and lock seqid synchronization must be stopped before calling this
    711  * function, as it could lead to deadlock when trying to reopen a file or
    712  * reclaim a lock.  The synchronization is obtained with calls to:
    713  *   nfs4_start_open_seqid_sync()
    714  *   nfs4_start_lock_seqid_sync()
    715  *
    716  * *startrecovp is set TRUE if the caller should not bother with the
    717  * over-the-wire call, and just initiate recovery for the given request.
    718  * This is typically used for state-releasing ops if the filesystem has
    719  * been forcibly unmounted.  startrecovp may be NULL for
    720  * non-state-releasing ops.
    721  */
    722 
    723 int
    724 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    725     nfs4_recov_state_t *rsp, bool_t *startrecovp)
    726 {
    727 	int error = 0, rerr_cnt;
    728 	nfs4_server_t *sp = NULL;
    729 	nfs4_server_t *tsp;
    730 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
    731 	uint_t droplock_cnt;
    732 #ifdef DEBUG
    733 	void *fop_caller;
    734 #endif
    735 
    736 	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
    737 	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
    738 
    739 #ifdef	DEBUG
    740 	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
    741 		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
    742 		    fop_caller);
    743 	}
    744 	(void) tsd_set(nfs4_tsd_key, caller());
    745 #endif
    746 
    747 	rsp->rs_sp = NULL;
    748 	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
    749 	rerr_cnt = rsp->rs_num_retry_despite_err;
    750 
    751 	/*
    752 	 * Process the items that may delay() based on server response
    753 	 */
    754 	error = nfs4_wait_for_grace(mi, rsp);
    755 	if (error)
    756 		goto out;
    757 
    758 	if (vp1 != NULL) {
    759 		error = nfs4_wait_for_delay(vp1, rsp);
    760 		if (error)
    761 			goto out;
    762 	}
    763 
    764 	/* Wait for a delegation recall to complete. */
    765 
    766 	error = wait_for_recall(vp1, vp2, op, rsp);
    767 	if (error)
    768 		goto out;
    769 
    770 	/*
    771 	 * Wait for any current recovery actions to finish.  Note that a
    772 	 * recovery thread can still start up after wait_for_recovery()
    773 	 * finishes.  We don't block out recovery operations until we
    774 	 * acquire s_recovlock and mi_recovlock.
    775 	 */
    776 	error = wait_for_recovery(mi, op);
    777 	if (error)
    778 		goto out;
    779 
    780 	/*
    781 	 * Check to see if the rnode is already marked with a
    782 	 * recovery error.  If so, return it immediately.  But
    783 	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
    784 	 * clean up state on the server.
    785 	 */
    786 
    787 	if (vp1 != NULL) {
    788 		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
    789 			goto out;
    790 		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
    791 	}
    792 
    793 	if (vp2 != NULL) {
    794 		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
    795 			goto out;
    796 		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
    797 	}
    798 
    799 	/*
    800 	 * The lock order calls for us to acquire s_recovlock before
    801 	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
    802 	 * prevent races with the failover/migration code).  So acquire
    803 	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
    804 	 * s_recovlock and mi_recovlock, then verify that sp is still the
    805 	 * right object.  XXX Can we find a simpler way to deal with this?
    806 	 */
    807 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
    808 	    mi->mi_flags & MI4_INT)) {
    809 		error = EINTR;
    810 		goto out;
    811 	}
    812 get_sp:
    813 	sp = find_nfs4_server(mi);
    814 	if (sp != NULL) {
    815 		sp->s_otw_call_count++;
    816 		mutex_exit(&sp->s_lock);
    817 		droplock_cnt = mi->mi_srvset_cnt;
    818 	}
    819 	nfs_rw_exit(&mi->mi_recovlock);
    820 
    821 	if (sp != NULL) {
    822 		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
    823 		    mi->mi_flags & MI4_INT)) {
    824 			error = EINTR;
    825 			goto out;
    826 		}
    827 	}
    828 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
    829 	    mi->mi_flags & MI4_INT)) {
    830 		if (sp != NULL)
    831 			nfs_rw_exit(&sp->s_recovlock);
    832 		error = EINTR;
    833 		goto out;
    834 	}
    835 	/*
    836 	 * If the mntinfo4_t hasn't changed nfs4_sever_ts then
    837 	 * there's no point in double checking to make sure it
    838 	 * has switched.
    839 	 */
    840 	if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) {
    841 		tsp = find_nfs4_server(mi);
    842 		if (tsp != sp) {
    843 			/* try again */
    844 			if (tsp != NULL) {
    845 				mutex_exit(&tsp->s_lock);
    846 				nfs4_server_rele(tsp);
    847 				tsp = NULL;
    848 			}
    849 			if (sp != NULL) {
    850 				nfs_rw_exit(&sp->s_recovlock);
    851 				mutex_enter(&sp->s_lock);
    852 				sp->s_otw_call_count--;
    853 				mutex_exit(&sp->s_lock);
    854 				nfs4_server_rele(sp);
    855 				sp = NULL;
    856 			}
    857 			goto get_sp;
    858 		} else {
    859 			if (tsp != NULL) {
    860 				mutex_exit(&tsp->s_lock);
    861 				nfs4_server_rele(tsp);
    862 				tsp = NULL;
    863 			}
    864 		}
    865 	}
    866 
    867 	if (sp != NULL) {
    868 		rsp->rs_sp = sp;
    869 	}
    870 
    871 	/*
    872 	 * If the filesystem uses volatile filehandles, obtain a lock so
    873 	 * that we synchronize with renames.  Exception: mount operations
    874 	 * can change mi_fh_expire_type, which could be a problem, since
    875 	 * the end_op code needs to be consistent with the start_op code
    876 	 * about mi_rename_lock.  Since mounts don't compete with renames,
    877 	 * it's simpler to just not acquire the rename lock for mounts.
    878 	 */
    879 	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
    880 		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
    881 		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
    882 		    mi->mi_flags & MI4_INT)) {
    883 			nfs_rw_exit(&mi->mi_recovlock);
    884 			if (sp != NULL)
    885 				nfs_rw_exit(&sp->s_recovlock);
    886 			error = EINTR;
    887 			goto out;
    888 		}
    889 		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
    890 	}
    891 
    892 	if (OH_IS_STATE_RELE(op)) {
    893 		/*
    894 		 * For forced unmount, letting the request proceed will
    895 		 * almost always delay response to the user, so hand it off
    896 		 * to the recovery thread.  For exiting lwp's, we don't
    897 		 * have a good way to tell if the request will hang.  We
    898 		 * generally want processes to handle their own requests so
    899 		 * that they can be done in parallel, but if there is
    900 		 * already a recovery thread, hand the request off to it.
    901 		 * This will improve user response at no cost to overall
    902 		 * system throughput.  For zone shutdown, we'd prefer
    903 		 * the recovery thread to handle this as well.
    904 		 */
    905 		ASSERT(startrecovp != NULL);
    906 		mutex_enter(&mi->mi_lock);
    907 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
    908 			*startrecovp = TRUE;
    909 		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
    910 		    (mi->mi_flags & MI4_RECOV_ACTIV))
    911 			*startrecovp = TRUE;
    912 		else
    913 			*startrecovp = FALSE;
    914 		mutex_exit(&mi->mi_lock);
    915 	} else
    916 		if (startrecovp != NULL)
    917 			*startrecovp = FALSE;
    918 
    919 	ASSERT(error == 0);
    920 	return (error);
    921 
    922 out:
    923 	ASSERT(error != 0);
    924 	if (sp != NULL) {
    925 		mutex_enter(&sp->s_lock);
    926 		sp->s_otw_call_count--;
    927 		mutex_exit(&sp->s_lock);
    928 		nfs4_server_rele(sp);
    929 		rsp->rs_sp = NULL;
    930 	}
    931 	nfs4_end_op_recall(vp1, vp2, rsp);
    932 
    933 #ifdef	DEBUG
    934 	(void) tsd_set(nfs4_tsd_key, NULL);
    935 #endif
    936 	return (error);
    937 }
    938 
    939 /*
    940  * It is up to the caller to determine if rsp->rs_sp being NULL
    941  * is detrimental or not.
    942  */
    943 int
    944 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    945     nfs4_recov_state_t *rsp)
    946 {
    947 	ASSERT(rsp->rs_num_retry_despite_err == 0);
    948 	rsp->rs_num_retry_despite_err = 0;
    949 	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
    950 }
    951 
    952 /*
    953  * Release any resources acquired by nfs4_start_op().
    954  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
    955  *
    956  * The operation hint is used to avoid a deadlock by bypassing delegation
    957  * return logic for writes, which are done while returning a delegation.
    958  */
    959 
    960 void
    961 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    962     nfs4_recov_state_t *rsp, bool_t needs_recov)
    963 {
    964 	nfs4_server_t *sp = rsp->rs_sp;
    965 	rnode4_t *rp = NULL;
    966 
    967 #ifdef	lint
    968 	/*
    969 	 * The op hint isn't used any more, but might be in
    970 	 * the future.
    971 	 */
    972 	op = op;
    973 #endif
    974 
    975 #ifdef	DEBUG
    976 	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
    977 	(void) tsd_set(nfs4_tsd_key, NULL);
    978 #endif
    979 
    980 	nfs4_end_op_recall(vp1, vp2, rsp);
    981 
    982 	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
    983 		nfs_rw_exit(&mi->mi_rename_lock);
    984 
    985 	if (!needs_recov) {
    986 		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
    987 			/* may need to clear the delay interval */
    988 			if (vp1 != NULL) {
    989 				rp = VTOR4(vp1);
    990 				mutex_enter(&rp->r_statelock);
    991 				rp->r_delay_interval = 0;
    992 				mutex_exit(&rp->r_statelock);
    993 			}
    994 		}
    995 		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
    996 	}
    997 
    998 	/*
    999 	 * If the corresponding nfs4_start_op() found a sp,
   1000 	 * then there must still be a sp.
   1001 	 */
   1002 	if (sp != NULL) {
   1003 		nfs_rw_exit(&mi->mi_recovlock);
   1004 		nfs_rw_exit(&sp->s_recovlock);
   1005 		mutex_enter(&sp->s_lock);
   1006 		sp->s_otw_call_count--;
   1007 		cv_broadcast(&sp->s_cv_otw_count);
   1008 		mutex_exit(&sp->s_lock);
   1009 		nfs4_server_rele(sp);
   1010 	} else {
   1011 		nfs_rw_exit(&mi->mi_recovlock);
   1012 	}
   1013 }
   1014 
   1015 void
   1016 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
   1017     nfs4_recov_state_t *rsp, bool_t needrecov)
   1018 {
   1019 	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
   1020 }
   1021 
   1022 /*
   1023  * If the filesystem is going through client recovery, block until
   1024  * finished.
   1025  * Exceptions:
   1026  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
   1027  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
   1028  *
   1029  * Return value:
   1030  * - 0 if no errors
   1031  * - EINTR if the call was interrupted
   1032  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
   1033  *   op)
   1034  * - the errno value from the recovery thread, if recovery failed
   1035  */
   1036 
   1037 static int
   1038 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
   1039 {
   1040 	int error = 0;
   1041 
   1042 	mutex_enter(&mi->mi_lock);
   1043 
   1044 	while (mi->mi_recovflags != 0) {
   1045 		klwp_t *lwp = ttolwp(curthread);
   1046 
   1047 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
   1048 		    (mi->mi_flags & MI4_RECOV_FAIL))
   1049 			break;
   1050 		if (OH_IS_STATE_RELE(op_hint) &&
   1051 		    (curthread->t_proc_flag & TP_LWPEXIT))
   1052 			break;
   1053 
   1054 		if (lwp != NULL)
   1055 			lwp->lwp_nostop++;
   1056 		/* XXX - use different cv? */
   1057 		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
   1058 			error = EINTR;
   1059 			if (lwp != NULL)
   1060 				lwp->lwp_nostop--;
   1061 			break;
   1062 		}
   1063 		if (lwp != NULL)
   1064 			lwp->lwp_nostop--;
   1065 	}
   1066 
   1067 	if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
   1068 	    !OH_IS_STATE_RELE(op_hint)) {
   1069 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1070 		    "wait_for_recovery: forced unmount"));
   1071 		error = EIO;
   1072 	} else if (mi->mi_flags & MI4_RECOV_FAIL) {
   1073 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1074 		    "wait_for_recovery: fail since RECOV FAIL"));
   1075 		error = mi->mi_error;
   1076 	}
   1077 
   1078 	mutex_exit(&mi->mi_lock);
   1079 
   1080 	return (error);
   1081 }
   1082 
   1083 /*
   1084  * If the client received NFS4ERR_GRACE for this particular mount,
   1085  * the client blocks here until it is time to try again.
   1086  *
   1087  * Return value:
   1088  * - 0 if wait was successful
   1089  * - EINTR if the call was interrupted
   1090  */
   1091 
   1092 int
   1093 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
   1094 {
   1095 	int error = 0;
   1096 	time_t curtime, time_to_wait;
   1097 
   1098 	/* do a unprotected check to reduce mi_lock contention */
   1099 	if (mi->mi_grace_wait != 0) {
   1100 		mutex_enter(&mi->mi_lock);
   1101 
   1102 		if (mi->mi_grace_wait != 0) {
   1103 			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
   1104 				rsp->rs_flags |= NFS4_RS_GRACE_MSG;
   1105 
   1106 			curtime = gethrestime_sec();
   1107 
   1108 			if (curtime < mi->mi_grace_wait) {
   1109 
   1110 				time_to_wait = mi->mi_grace_wait - curtime;
   1111 
   1112 				mutex_exit(&mi->mi_lock);
   1113 
   1114 				delay(SEC_TO_TICK(time_to_wait));
   1115 
   1116 				curtime = gethrestime_sec();
   1117 
   1118 				mutex_enter(&mi->mi_lock);
   1119 
   1120 				if (curtime >= mi->mi_grace_wait)
   1121 					mi->mi_grace_wait = 0;
   1122 			} else {
   1123 				mi->mi_grace_wait = 0;
   1124 			}
   1125 		}
   1126 		mutex_exit(&mi->mi_lock);
   1127 	}
   1128 
   1129 	return (error);
   1130 }
   1131 
   1132 /*
   1133  * If the client received NFS4ERR_DELAY for an operation on a vnode,
   1134  * the client blocks here until it is time to try again.
   1135  *
   1136  * Return value:
   1137  * - 0 if wait was successful
   1138  * - EINTR if the call was interrupted
   1139  */
   1140 
   1141 int
   1142 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
   1143 {
   1144 	int error = 0;
   1145 	time_t curtime, time_to_wait;
   1146 	rnode4_t *rp;
   1147 
   1148 	ASSERT(vp != NULL);
   1149 
   1150 	rp = VTOR4(vp);
   1151 
   1152 	/* do a unprotected check to reduce r_statelock contention */
   1153 	if (rp->r_delay_wait != 0) {
   1154 		mutex_enter(&rp->r_statelock);
   1155 
   1156 		if (rp->r_delay_wait != 0) {
   1157 
   1158 			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
   1159 				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
   1160 				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
   1161 			}
   1162 
   1163 			curtime = gethrestime_sec();
   1164 
   1165 			if (curtime < rp->r_delay_wait) {
   1166 
   1167 				time_to_wait = rp->r_delay_wait - curtime;
   1168 
   1169 				mutex_exit(&rp->r_statelock);
   1170 
   1171 				delay(SEC_TO_TICK(time_to_wait));
   1172 
   1173 				curtime = gethrestime_sec();
   1174 
   1175 				mutex_enter(&rp->r_statelock);
   1176 
   1177 				if (curtime >= rp->r_delay_wait)
   1178 					rp->r_delay_wait = 0;
   1179 			} else {
   1180 				rp->r_delay_wait = 0;
   1181 			}
   1182 		}
   1183 		mutex_exit(&rp->r_statelock);
   1184 	}
   1185 
   1186 	return (error);
   1187 }
   1188 
   1189 /*
   1190  * The recovery thread.
         *
         * Entry point of the per-filesystem recovery thread.  'recovp'
         * describes the filesystem (rc_mi), the vnodes involved (rc_vp1,
         * rc_vp2) and the error to report if recovery fails (rc_error); it
         * is freed here before the thread exits.
         *
         * Outline:
         * - log RE_START, register for CPR, and record curthread in
         *   mi_recovthread;
         * - look up the nfs4_server_t for the filesystem;
         * - loop, each pass handling in order: forced unmount / zone
         *   shutdown, failover (MI4R_NEED_NEW_SERVER), clientid recovery,
         *   MI4R_NEED_SECINFO, MI4R_BAD_SEQID, MI4R_REOPEN_FILES and
         *   MI4R_LOST_STATE, until no recovery flags other than
         *   MI4R_SRV_REBOOT remain or MI4_RECOV_FAIL is set;
         * - release the server, clean the delegation list, call
         *   recov_done(), drop the vnode, vfs and mntinfo4 holds, wake
         *   mi_cv_in_recov waiters once mi_in_recovery reaches zero, and
         *   exit via zthread_exit().
   1191  */
   1192 
   1193 static void
   1194 nfs4_recov_thread(recov_info_t *recovp)
   1195 {
   1196 	mntinfo4_t *mi = recovp->rc_mi;
   1197 	nfs4_server_t *sp;
   1198 	int done = 0, error = 0;
   1199 	bool_t recov_fail = FALSE;
   1200 	callb_cpr_t cpr_info;
   1201 	kmutex_t cpr_lock;
   1202 
   1203 	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
   1204 	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
   1205 	    0, 0);
   1206 
   1207 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
   1208 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");
   1209 
   1210 	mutex_enter(&mi->mi_lock);
   1211 	mi->mi_recovthread = curthread;
   1212 	mutex_exit(&mi->mi_lock);
   1213 
   1214 	/*
   1215 	 * We don't really need protection here against failover or
   1216 	 * migration, since the current thread is the one that would make
   1217 	 * any changes, but hold mi_recovlock anyway for completeness (and
   1218 	 * to satisfy any ASSERTs).
   1219 	 */
   1220 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
   1221 	sp = find_nfs4_server(mi);
        	/* a non-NULL sp comes back with s_lock held; drop it right away */
   1222 	if (sp != NULL)
   1223 		mutex_exit(&sp->s_lock);
   1224 	nfs_rw_exit(&mi->mi_recovlock);
   1225 
   1226 	/*
   1227 	 * Do any necessary recovery, based on the information in recovp
   1228 	 * and any recovery flags.
   1229 	 */
   1230 
   1231 	do {
   1232 		mutex_enter(&mi->mi_lock);
   1233 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
   1234 			bool_t activesrv;
   1235 
   1236 			NFS4_DEBUG(nfs4_client_recov_debug &&
   1237 			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
   1238 			    "nfs4_recov_thread: file system has been "
   1239 			    "unmounted"));
   1240 			NFS4_DEBUG(nfs4_client_recov_debug &&
   1241 			    zone_status_get(curproc->p_zone) >=
   1242 			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
   1243 			    "nfs4_recov_thread: zone shutting down"));
   1244 			/*
   1245 			 * If the server has lost its state for us and
   1246 			 * the filesystem is unmounted, then the filesystem
   1247 			 * can be tossed, even if there are lost lock or
   1248 			 * lost state calls in the recovery queue.
   1249 			 */
   1250 			if (mi->mi_recovflags &
   1251 			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
   1252 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1253 				"nfs4_recov_thread: bailing out"));
   1254 				mi->mi_flags |= MI4_RECOV_FAIL;
   1255 				mi->mi_error = recovp->rc_error;
   1256 				recov_fail = TRUE;
   1257 			}
   1258 			/*
   1259 			 * We don't know if the server has any state for
   1260 			 * us, and the filesystem has been unmounted.  If
   1261 			 * there are "lost state" recovery items, keep
   1262 			 * trying to process them until there are no more
   1263 			 * mounted filesystems for the server.  Otherwise,
   1264 			 * bail out.  The reason we don't mark the
   1265 			 * filesystem as failing recovery is in case we
   1266 			 * have to do "lost state" recovery later (e.g., a
   1267 			 * user process exits).
   1268 			 */
   1269 			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
   1270 				done = 1;
   1271 				mutex_exit(&mi->mi_lock);
   1272 				break;
   1273 			}
   1274 			mutex_exit(&mi->mi_lock);
   1275 
        			/*
        			 * s_lock is taken here when sp is non-NULL and is
        			 * held until the matching mutex_exit() at the end
        			 * of this branch.
        			 */
   1276 			if (sp == NULL)
   1277 				activesrv = FALSE;
   1278 			else {
   1279 				mutex_enter(&sp->s_lock);
   1280 				activesrv = nfs4_fs_active(sp);
   1281 			}
   1282 			if (!activesrv) {
   1283 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1284 				    "no active fs for server %p",
   1285 				    (void *)sp));
   1286 				mutex_enter(&mi->mi_lock);
   1287 				mi->mi_flags |= MI4_RECOV_FAIL;
   1288 				mi->mi_error = recovp->rc_error;
   1289 				mutex_exit(&mi->mi_lock);
   1290 				recov_fail = TRUE;
   1291 				if (sp != NULL) {
   1292 					/*
   1293 					 * Mark the server instance as
   1294 					 * dead, so that nobody will attach
   1295 					 * a new filesystem.
   1296 					 */
   1297 					nfs4_mark_srv_dead(sp);
   1298 				}
   1299 			}
   1300 			if (sp != NULL)
   1301 				mutex_exit(&sp->s_lock);
   1302 		} else {
   1303 			mutex_exit(&mi->mi_lock);
   1304 		}
   1305 
   1306 		/*
   1307 		 * Check if we need to select a new server for a
   1308 		 * failover.  Choosing a new server will force at
   1309 		 * least a check of the clientid.
   1310 		 */
   1311 		mutex_enter(&mi->mi_lock);
   1312 		if (!recov_fail &&
   1313 		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
   1314 			mutex_exit(&mi->mi_lock);
        			/* may replace sp and may set recov_fail */
   1315 			recov_newserver(recovp, &sp, &recov_fail);
   1316 		} else
   1317 			mutex_exit(&mi->mi_lock);
   1318 
   1319 		/*
   1320 		 * Check if we need to recover the clientid.  This
   1321 		 * must be done before file and lock recovery, and it
   1322 		 * potentially affects the recovery threads for other
   1323 		 * filesystems, so it gets special treatment.
   1324 		 */
   1325 		if (sp != NULL && recov_fail == FALSE) {
   1326 			mutex_enter(&sp->s_lock);
   1327 			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
   1328 				mutex_exit(&sp->s_lock);
   1329 				recov_clientid(recovp, sp);
   1330 			} else {
   1331 				/*
   1332 				 * Unset this flag in case another recovery
   1333 				 * thread successfully recovered the clientid
   1334 				 * for us already.
   1335 				 */
   1336 				mutex_enter(&mi->mi_lock);
   1337 				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
   1338 				mutex_exit(&mi->mi_lock);
   1339 				mutex_exit(&sp->s_lock);
   1340 			}
   1341 		}
   1342 
   1343 		/*
   1344 		 * Check if we need to get the security information.
   1345 		 */
   1346 		mutex_enter(&mi->mi_lock);
   1347 		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
   1348 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1349 			mutex_exit(&mi->mi_lock);
   1350 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
   1351 			    RW_WRITER, 0);
   1352 			error = nfs4_secinfo_recov(recovp->rc_mi,
   1353 			    recovp->rc_vp1, recovp->rc_vp2);
   1354 			/*
   1355 			 * If error, nothing more can be done, stop
   1356 			 * the recovery.
   1357 			 */
   1358 			if (error) {
   1359 				mutex_enter(&mi->mi_lock);
   1360 				mi->mi_flags |= MI4_RECOV_FAIL;
   1361 				mi->mi_error = recovp->rc_error;
   1362 				mutex_exit(&mi->mi_lock);
   1363 				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
   1364 				    error, recovp->rc_vp1, recovp->rc_vp2,
   1365 				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1366 			}
   1367 			nfs_rw_exit(&mi->mi_recovlock);
   1368 		} else
   1369 			mutex_exit(&mi->mi_lock);
   1370 
   1371 		/*
   1372 		 * Check if there's a bad seqid to recover.
   1373 		 */
   1374 		mutex_enter(&mi->mi_lock);
   1375 		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
   1376 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1377 			mutex_exit(&mi->mi_lock);
   1378 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
   1379 			    RW_WRITER, 0);
   1380 			recov_bad_seqid(recovp);
   1381 			nfs_rw_exit(&mi->mi_recovlock);
   1382 		} else
   1383 			mutex_exit(&mi->mi_lock);
   1384 
   1385 		/*
   1386 		 * Next check for recovery that affects the entire
   1387 		 * filesystem.
   1388 		 */
   1389 		if (sp != NULL) {
   1390 			mutex_enter(&mi->mi_lock);
   1391 			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
   1392 			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1393 				mutex_exit(&mi->mi_lock);
   1394 				recov_openfiles(recovp, sp);
   1395 			} else
   1396 				mutex_exit(&mi->mi_lock);
   1397 		}
   1398 
   1399 		/*
   1400 		 * Send any queued state recovery requests.
   1401 		 */
   1402 		mutex_enter(&mi->mi_lock);
   1403 		if (sp != NULL &&
   1404 		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
   1405 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1406 			mutex_exit(&mi->mi_lock);
   1407 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
   1408 			    RW_WRITER, 0);
   1409 			nfs4_resend_lost_rqsts(recovp, sp);
        			/* the flag is cleared only once the queue is empty */
   1410 			if (list_head(&mi->mi_lost_state) == NULL) {
   1411 				/* done */
   1412 				mutex_enter(&mi->mi_lock);
   1413 				mi->mi_recovflags &= ~MI4R_LOST_STATE;
   1414 				mutex_exit(&mi->mi_lock);
   1415 			}
   1416 			nfs_rw_exit(&mi->mi_recovlock);
   1417 		} else {
   1418 			mutex_exit(&mi->mi_lock);
   1419 		}
   1420 
   1421 		/*
   1422 		 * See if there is anything more to do.  If not, announce
   1423 		 * that we are done and exit.
   1424 		 *
   1425 		 * Need mi_recovlock to keep 'sp' valid.  Must grab
   1426 		 * mi_recovlock before mi_lock to preserve lock ordering.
   1427 		 */
   1428 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
   1429 		mutex_enter(&mi->mi_lock);
   1430 		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
   1431 		    (mi->mi_flags & MI4_RECOV_FAIL)) {
   1432 			list_t local_lost_state;
   1433 			nfs4_lost_rqst_t *lrp;
   1434 
   1435 			/*
   1436 			 * We need to remove the lost requests before we
   1437 			 * unmark the mi as no longer doing recovery to
   1438 			 * avoid a race with a new thread putting new lost
   1439 			 * requests on the same mi (and the going away
   1440 			 * thread would remove the new lost requests).
   1441 			 *
   1442 			 * Move the lost requests to a local list since
   1443 			 * nfs4_remove_lost_rqst() drops mi_lock, and
   1444 			 * dropping the mi_lock would make our check to
   1445 			 * see if recovery is done no longer valid.
   1446 			 */
   1447 			list_create(&local_lost_state,
   1448 			    sizeof (nfs4_lost_rqst_t),
   1449 			    offsetof(nfs4_lost_rqst_t, lr_node));
   1450 			list_move_tail(&local_lost_state, &mi->mi_lost_state);
   1451 
   1452 			done = 1;
   1453 			mutex_exit(&mi->mi_lock);
   1454 			/*
   1455 			 * Now officially free the "moved"
   1456 			 * lost requests.
   1457 			 */
   1458 			while ((lrp = list_head(&local_lost_state)) != NULL) {
   1459 				list_remove(&local_lost_state, lrp);
   1460 				nfs4_free_lost_rqst(lrp, sp);
   1461 			}
   1462 			list_destroy(&local_lost_state);
   1463 		} else
   1464 			mutex_exit(&mi->mi_lock);
   1465 		nfs_rw_exit(&mi->mi_recovlock);
   1466 
   1467 		/*
   1468 		 * If the filesystem has been forcibly unmounted, there is
   1469 		 * probably no point in retrying immediately.  Furthermore,
   1470 		 * there might be user processes waiting for a chance to
   1471 		 * queue up "lost state" requests, so that they can exit.
   1472 		 * So pause here for a moment.  Same logic for zone shutdown.
   1473 		 */
   1474 		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
   1475 			mutex_enter(&mi->mi_lock);
   1476 			cv_broadcast(&mi->mi_failover_cv);
   1477 			mutex_exit(&mi->mi_lock);
   1478 			delay(SEC_TO_TICK(nfs4_unmount_delay));
   1479 		}
   1480 
   1481 	} while (!done);
   1482 
        	/*
        	 * Release the nfs4_server_t obtained before the loop (or its
        	 * replacement installed by recov_newserver()).
        	 */
   1483 	if (sp != NULL)
   1484 		nfs4_server_rele(sp);
   1485 
   1486 	/*
   1487 	 * Return all recalled delegations
   1488 	 */
   1489 	nfs4_dlistclean();
   1490 
        	/* recov_done() asserts that mi_lock is held */
   1491 	mutex_enter(&mi->mi_lock);
   1492 	recov_done(mi, recovp);
   1493 	mutex_exit(&mi->mi_lock);
   1494 
   1495 	/*
   1496 	 * Free up resources that were allocated for us.
   1497 	 */
   1498 	if (recovp->rc_vp1 != NULL)
   1499 		VN_RELE(recovp->rc_vp1);
   1500 	if (recovp->rc_vp2 != NULL)
   1501 		VN_RELE(recovp->rc_vp2);
   1502 
   1503 	/* now we are done using the mi struct, signal the waiters */
   1504 	mutex_enter(&mi->mi_lock);
   1505 	mi->mi_in_recovery--;
   1506 	if (mi->mi_in_recovery == 0)
   1507 		cv_broadcast(&mi->mi_cv_in_recov);
   1508 	mutex_exit(&mi->mi_lock);
   1509 
   1510 	VFS_RELE(mi->mi_vfsp);
   1511 	MI4_RELE(mi);
   1512 	kmem_free(recovp, sizeof (recov_info_t));
        	/*
        	 * cpr_lock is entered here and never explicitly exited before
        	 * mutex_destroy(); presumably CALLB_CPR_EXIT() drops it --
        	 * NOTE(review): confirm against the callb definitions.
        	 */
   1513 	mutex_enter(&cpr_lock);
   1514 	CALLB_CPR_EXIT(&cpr_info);
   1515 	mutex_destroy(&cpr_lock);
   1516 	zthread_exit();
   1517 }
   1518 
   1519 /*
   1520  * Log the end of recovery and notify any waiting threads.
   1521  */
   1522 
   1523 static void
   1524 recov_done(mntinfo4_t *mi, recov_info_t *recovp)
   1525 {
   1526 
   1527 	ASSERT(MUTEX_HELD(&mi->mi_lock));
   1528 
   1529 	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
   1530 	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1531 	mi->mi_recovthread = NULL;
   1532 	mi->mi_flags &= ~MI4_RECOV_ACTIV;
   1533 	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
   1534 	cv_broadcast(&mi->mi_failover_cv);
   1535 }
   1536 
   1537 /*
   1538  * State-specific recovery routines, by state.
   1539  */
   1540 
   1541 /*
   1542  * Failover.
   1543  *
   1544  * Replaces *spp with a reference to the new server, which must
   1545  * eventually be freed.
         *
         * recovp	- recovery description; recovp->rc_mi is the filesystem
         * spp		- in: current nfs4_server_t (may be NULL); out: the
         *		  server returned by nfs4_move_mi() when a different
         *		  server was selected, otherwise left unchanged
         * recov_fail	- set to TRUE if the filesystem or zone went away
         *		  while searching; not written otherwise
         *
         * mi_recovlock is held as writer for the whole routine and dropped
         * before each return.  The search pings the NFS NULL procedure of
         * each usable server (2 second timeout) and repeats, sleeping one
         * second (hz ticks) between passes, until some server answers.
         * MI4R_NEED_NEW_SERVER is then cleared; if the responder is not the
         * current server the root rnode, the cached filehandles and the
         * mntinfo4 are switched over to it and MI4R_REOPEN_FILES and
         * MI4R_REMAP_FILES are raised.
   1546  */
   1547 
   1548 static void
   1549 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
   1550 {
   1551 	mntinfo4_t *mi = recovp->rc_mi;
   1552 	servinfo4_t *svp = NULL;
   1553 	nfs4_server_t *osp = *spp;
   1554 	CLIENT *cl;
   1555 	enum clnt_stat status;
   1556 	struct timeval tv;
   1557 	int error;
   1558 	int oncethru = 0;
   1559 	rnode4_t *rp;
   1560 	int index;
   1561 	nfs_fh4 fh;
   1562 	char *snames;
   1563 	size_t len;
   1564 
   1565 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
   1566 
        	/* timeout used for each NULL-procedure ping below */
   1567 	tv.tv_sec = 2;
   1568 	tv.tv_usec = 0;
   1569 
   1570 #ifdef lint
   1571 	/*
   1572 	 * Lint can't follow the logic, so thinks that snames and len
   1573 	 * can be used before being set.  They can't, but lint can't
   1574 	 * figure it out.  To address the lint warning, initialize
   1575 	 * snames and len for lint.
   1576 	 */
   1577 	snames = NULL;
   1578 	len = 0;
   1579 #endif
   1580 
   1581 	/*
   1582 	 * Ping the null NFS procedure of every server in
   1583 	 * the list until one responds.  We always start
   1584 	 * at the head of the list and always skip the one
   1585 	 * that is current, since it's caused us a problem.
   1586 	 */
   1587 	while (svp == NULL) {
   1588 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
   1589 
   1590 			mutex_enter(&mi->mi_lock);
   1591 			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
        				/*
        				 * NOTE(review): the other sites in this
        				 * file that set MI4_RECOV_FAIL also store
        				 * recovp->rc_error in mi->mi_error; this
        				 * one does not, so mi_error keeps its
        				 * previous value.  Confirm whether that
        				 * is intentional.
        				 */
   1592 				mi->mi_flags |= MI4_RECOV_FAIL;
   1593 				mutex_exit(&mi->mi_lock);
   1594 				(void) nfs_rw_exit(&mi->mi_recovlock);
   1595 				*recov_fail = TRUE;
        				/* snames exists only after a failed pass */
   1596 				if (oncethru)
   1597 					kmem_free(snames, len);
   1598 				return;
   1599 			}
   1600 			mutex_exit(&mi->mi_lock);
   1601 
   1602 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1603 			if (svp->sv_flags & SV4_NOTINUSE) {
   1604 				nfs_rw_exit(&svp->sv_lock);
   1605 				continue;
   1606 			}
   1607 			nfs_rw_exit(&svp->sv_lock);
   1608 
        			/*
        			 * The current server is skipped only until one
        			 * complete pass has found no responder; after
        			 * that it is pinged like any other.
        			 */
   1609 			if (!oncethru && svp == mi->mi_curr_serv)
   1610 				continue;
   1611 
   1612 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
   1613 			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
   1614 			if (error)
   1615 				continue;
   1616 
   1617 			if (!(mi->mi_flags & MI4_INT))
   1618 				cl->cl_nosignal = TRUE;
   1619 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
   1620 			    xdr_void, NULL, tv);
   1621 			if (!(mi->mi_flags & MI4_INT))
   1622 				cl->cl_nosignal = FALSE;
   1623 			AUTH_DESTROY(cl->cl_auth);
   1624 			CLNT_DESTROY(cl);
   1625 			if (status == RPC_SUCCESS) {
   1626 				nfs4_queue_event(RE_FAILOVER, mi,
   1627 				    svp == mi->mi_curr_serv ? NULL :
   1628 				    svp->sv_hostname, 0, NULL, NULL, 0,
   1629 				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1630 				break;
   1631 			}
   1632 		}
   1633 
        		/* svp is NULL here only if the whole list was exhausted */
   1634 		if (svp == NULL) {
   1635 			if (!oncethru) {
   1636 				snames = nfs4_getsrvnames(mi, &len);
   1637 				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
   1638 				    0, 0, 0, FALSE, snames, 0, NULL);
   1639 				oncethru = 1;
   1640 			}
   1641 			delay(hz);
   1642 		}
   1643 	}
   1644 
        	/* a "not responding" fact was queued earlier; pair it with "ok" */
   1645 	if (oncethru) {
   1646 		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
   1647 		    0, NULL);
   1648 		kmem_free(snames, len);
   1649 	}
   1650 
   1651 #if DEBUG
   1652 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1653 	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
   1654 	nfs_rw_exit(&svp->sv_lock);
   1655 #endif
   1656 
   1657 	mutex_enter(&mi->mi_lock);
   1658 	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
   1659 	if (svp != mi->mi_curr_serv) {
   1660 		servinfo4_t *osvp = mi->mi_curr_serv;
   1661 
   1662 		mutex_exit(&mi->mi_lock);
   1663 
   1664 		/*
   1665 		 * Update server-dependent fields in the root vnode.
   1666 		 */
   1667 		index = rtable4hash(mi->mi_rootfh);
   1668 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
   1669 
   1670 		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
   1671 		if (rp != NULL) {
   1672 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1673 			    "recov_newserver: remapping %s", rnode4info(rp)));
   1674 			mutex_enter(&rp->r_statelock);
   1675 			rp->r_server = svp;
   1676 			PURGE_ATTRCACHE4_LOCKED(rp);
   1677 			mutex_exit(&rp->r_statelock);
   1678 			(void) nfs4_free_data_reclaim(rp);
   1679 			nfs4_purge_rddir_cache(RTOV4(rp));
   1680 			rw_exit(&rtable4[index].r_lock);
   1681 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1682 			    "recov_newserver: done with %s",
   1683 			    rnode4info(rp)));
   1684 			VN_RELE(RTOV4(rp));
   1685 		} else
   1686 			rw_exit(&rtable4[index].r_lock);
   1687 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
   1688 
   1689 		mutex_enter(&mi->mi_lock);
   1690 		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
   1691 		if (recovp->rc_srv_reboot)
   1692 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
   1693 		mi->mi_curr_serv = svp;
   1694 		mi->mi_failover++;
   1695 		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
   1696 		mutex_exit(&mi->mi_lock);
   1697 
        		/*
        		 * Point the shared root and server-parent filehandles at
        		 * the new server's handles, reading them under sv_lock.
        		 */
   1698 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1699 		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
   1700 		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
   1701 		sfh4_update(mi->mi_rootfh, &fh);
   1702 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
   1703 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
   1704 		sfh4_update(mi->mi_srvparentfh, &fh);
   1705 		nfs_rw_exit(&svp->sv_lock);
   1706 
        		/* hand the caller the new server; drop the old one */
   1707 		*spp = nfs4_move_mi(mi, osvp, svp);
   1708 		if (osp != NULL)
   1709 			nfs4_server_rele(osp);
   1710 	} else
   1711 		mutex_exit(&mi->mi_lock);
   1712 	(void) nfs_rw_exit(&mi->mi_recovlock);
   1713 }
   1714 
   1715 /*
   1716  * Clientid.
   1717  */
   1718 
   1719 static void
   1720 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
   1721 {
   1722 	mntinfo4_t *mi = recovp->rc_mi;
   1723 	int error = 0;
   1724 	int still_stale;
   1725 	int need_new_s;
   1726 
   1727 	ASSERT(sp != NULL);
   1728 
   1729 	/*
   1730 	 * Acquire the recovery lock and then verify that the clientid
   1731 	 * still needs to be recovered.  (Note that s_recovlock is supposed
   1732 	 * to be acquired before s_lock.)  Since the thread holds the
   1733 	 * recovery lock, no other thread will recover the clientid.
   1734 	 */
   1735 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
   1736 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
   1737 	mutex_enter(&sp->s_lock);
   1738 	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
   1739 	mutex_exit(&sp->s_lock);
   1740 
   1741 	if (still_stale) {
   1742 		nfs4_error_t n4e;
   1743 
   1744 		nfs4_error_zinit(&n4e);
   1745 		nfs4setclientid(mi, kcred, TRUE, &n4e);
   1746 		error = n4e.error;
   1747 		if (error != 0) {
   1748 
   1749 			/*
   1750 			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
   1751 			 * if so, just return and let recov_thread drive
   1752 			 * failover.
   1753 			 */
   1754 			mutex_enter(&mi->mi_lock);
   1755 			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
   1756 			mutex_exit(&mi->mi_lock);
   1757 
   1758 			if (need_new_s) {
   1759 				nfs_rw_exit(&mi->mi_recovlock);
   1760 				nfs_rw_exit(&sp->s_recovlock);
   1761 				return;
   1762 			}
   1763 
   1764 			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
   1765 			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1766 			mutex_enter(&mi->mi_lock);
   1767 			mi->mi_flags |= MI4_RECOV_FAIL;
   1768 			mi->mi_error = recovp->rc_error;
   1769 			mutex_exit(&mi->mi_lock);
   1770 			/* don't destroy the nfs4_server, let umount do it */
   1771 		}
   1772 	}
   1773 
   1774 	if (error == 0) {
   1775 		mutex_enter(&mi->mi_lock);
   1776 		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
   1777 		/*
   1778 		 * If still_stale isn't true, then another thread already
   1779 		 * recovered the clientid.  And that thread that set the
   1780 		 * clientid will have initiated reopening files on all the
   1781 		 * filesystems for the server, so we should not initiate
   1782 		 * reopening for this filesystem here.
   1783 		 */
   1784 		if (still_stale) {
   1785 			mi->mi_recovflags |= MI4R_REOPEN_FILES;
   1786 			if (recovp->rc_srv_reboot)
   1787 				mi->mi_recovflags |= MI4R_SRV_REBOOT;
   1788 		}
   1789 		mutex_exit(&mi->mi_lock);
   1790 	}
   1791 
   1792 	nfs_rw_exit(&mi->mi_recovlock);
   1793 
   1794 	if (error != 0) {
   1795 		nfs_rw_exit(&sp->s_recovlock);
   1796 		mutex_enter(&mi->mi_lock);
   1797 		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
   1798 			delay(SEC_TO_TICK(recov_err_delay));
   1799 		mutex_exit(&mi->mi_lock);
   1800 	} else {
   1801 		mntinfo4_t **milist;
   1802 		mntinfo4_t *tmi;
   1803 		int nummi, i;
   1804 
   1805 		/*
   1806 		 * Initiate recovery of open files for other filesystems.
   1807 		 * We create an array of filesystems, rather than just
   1808 		 * walking the filesystem list, to avoid deadlock issues
   1809 		 * with s_lock and mi_recovlock.
   1810 		 */
   1811 		milist = make_milist(sp, &nummi);
   1812 		for (i = 0; i < nummi; i++) {
   1813 			tmi = milist[i];
   1814 			if (tmi != mi) {
   1815 				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
   1816 				    RW_READER, 0);
   1817 				start_recovery_action(NR_OPENFILES, TRUE, tmi,
   1818 				    NULL, NULL);
   1819 				nfs_rw_exit(&tmi->mi_recovlock);
   1820 			}
   1821 		}
   1822 		free_milist(milist, nummi);
   1823 
   1824 		nfs_rw_exit(&sp->s_recovlock);
   1825 	}
   1826 }
   1827 
   1828 /*
   1829  * Return an array of filesystems associated with the given server.  The
   1830  * caller should call free_milist() to free the references and memory.
   1831  */
   1832 
   1833 static mntinfo4_t **
   1834 make_milist(nfs4_server_t *sp, int *nummip)
   1835 {
   1836 	int nummi, i;
   1837 	mntinfo4_t **milist;
   1838 	mntinfo4_t *tmi;
   1839 
   1840 	mutex_enter(&sp->s_lock);
   1841 	nummi = 0;
   1842 	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
   1843 		nummi++;
   1844 
   1845 	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);
   1846 
   1847 	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
   1848 	    tmi = tmi->mi_clientid_next) {
   1849 		milist[i] = tmi;
   1850 		VFS_HOLD(tmi->mi_vfsp);
   1851 	}
   1852 	mutex_exit(&sp->s_lock);
   1853 
   1854 	*nummip = nummi;
   1855 	return (milist);
   1856 }
   1857 
   1858 /*
   1859  * Free the filesystem list created by make_milist().
   1860  */
   1861 
   1862 static void
   1863 free_milist(mntinfo4_t **milist, int nummi)
   1864 {
   1865 	mntinfo4_t *tmi;
   1866 	int i;
   1867 
   1868 	for (i = 0; i < nummi; i++) {
   1869 		tmi = milist[i];
   1870 		VFS_RELE(tmi->mi_vfsp);
   1871 	}
   1872 	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
   1873 }
   1874 
   1875 /*
   1876  * Filehandle
   1877  */
   1878 
   1879 /*
   1880  * Lookup the filehandle for the given vnode and update the rnode if it has
   1881  * changed.
   1882  *
   1883  * Errors:
   1884  * - if the filehandle could not be updated because of an error that
   1885  *   requires further recovery, initiate that recovery and return.
   1886  * - if the filehandle could not be updated because of a signal, pretend we
   1887  *   succeeded and let someone else deal with it.
   1888  * - if the filehandle could not be updated and the filesystem has been
   1889  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
   1890  *   the forced unmount (to retry or not to retry, that is the question).
   1891  * - if the filehandle could not be updated because of some other error,
   1892  *   mark the rnode bad and return.
   1893  */
   1894 static void
   1895 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
   1896 {
   1897 	rnode4_t *rp = VTOR4(vp);
   1898 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   1899 	bool_t needrecov;
   1900 
   1901 	mutex_enter(&rp->r_statelock);
   1902 
   1903 	if (rp->r_flags & R4RECOVERR) {
   1904 		mutex_exit(&rp->r_statelock);
   1905 		return;
   1906 	}
   1907 
   1908 	/*
   1909 	 * If someone else is updating the filehandle, wait for them to
   1910 	 * finish and then let our caller retry.
   1911 	 */
   1912 	if (rp->r_flags & R4RECEXPFH) {
   1913 		while (rp->r_flags & R4RECEXPFH) {
   1914 			cv_wait(&rp->r_cv, &rp->r_statelock);
   1915 		}
   1916 		mutex_exit(&rp->r_statelock);
   1917 		return;
   1918 	}
   1919 	rp->r_flags |= R4RECEXPFH;
   1920 	mutex_exit(&rp->r_statelock);
   1921 
   1922 	if (action == NR_BADHANDLE) {
   1923 		/* shouldn't happen */
   1924 		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
   1925 		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1926 	}
   1927 
   1928 	nfs4_remap_file(mi, vp, 0, &e);
   1929 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
   1930 
   1931 	/*
   1932 	 * If we get BADHANDLE or FHEXPIRED in their handler, something is
   1933 	 * broken.  Don't try to recover, just mark the file dead.
   1934 	 */
   1935 	if (needrecov && e.error == 0 &&
   1936 	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
   1937 		needrecov = FALSE;
   1938 	if (needrecov) {
   1939 		(void) nfs4_start_recovery(&e, mi, vp,
   1940 		    NULL, NULL, NULL, OP_LOOKUP, NULL);
   1941 	} else if (e.error != EINTR &&
   1942 	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
   1943 	    (e.error != 0 || e.stat != NFS4_OK)) {
   1944 		nfs4_recov_fh_fail(vp, e.error, e.stat);
   1945 		/*
   1946 		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
   1947 		 * cstatat_getvp()) retries on ESTALE, which would cause
   1948 		 * an infinite loop.
   1949 		 */
   1950 	}
   1951 
   1952 	mutex_enter(&rp->r_statelock);
   1953 	rp->r_flags &= ~R4RECEXPFH;
   1954 	cv_broadcast(&rp->r_cv);
   1955 	mutex_exit(&rp->r_statelock);
   1956 }
   1957 
   1958 /*
   1959  * Stale Filehandle
   1960  */
   1961 
/*
 * Recover from NFS4ERR_STALE on vp, which belongs to filesystem mi.
 *
 * A stale filehandle can happen when an individual file has
 * been removed, or when an entire filesystem has been taken
 * offline.  To distinguish these cases, we do this:
 * - if a GETATTR with the current filehandle is okay, we do
 *   nothing (this can happen with two-filehandle ops)
 * - if the GETATTR fails, but a GETATTR of the root filehandle
 *   succeeds, mark the rnode with R4STALE, which will stop use
 * - if the GETATTR fails, and a GETATTR of the root filehandle
 *   also fails, we consider the problem filesystem-wide, so:
 *   - if we can failover, we should
 *   - if we can't failover, we should mark both the original
 *     vnode and the root bad
 *
 * Nothing is done if the rnode already has R4RECOVERR or R4STALE set.
 * Any reference held in rootvp (obtained from VFS_ROOT(), or taken with
 * VN_HOLD() when vp is itself the root) is dropped at the "out" label.
 */
static void
recov_stale(mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	vnode_t *rootvp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_res_t gar;
	char *fail_msg = "failed to recover from NFS4ERR_STALE";
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	/* Recovery has already failed for this rnode. */
	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked dead, rp %s",
		    rnode4info(rp)));
		return;
	}

	/* Someone has already concluded that this rnode is stale. */
	if (rp->r_flags & R4STALE) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked stale, rp %s",
		    rnode4info(rp)));
		return;
	}

	mutex_exit(&rp->r_statelock);

	/* Try a GETATTR on this vnode */
	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);

	/*
	 * Handle non-STALE recoverable errors
	 */
	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_GETATTR, NULL);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: error=%d, stat=%d seen on rp %s",
		    e.error, e.stat, rnode4info(rp)));
		goto out;
	}

	/* Are things OK for this vnode? */
	if (!e.error && e.stat == NFS4_OK) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: file appears fine, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/* Did we get an unrelated non-recoverable error? */
	if (e.error || e.stat != NFS4ERR_STALE) {
		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: unrelated fatal error, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/*
	 * From here on the GETATTR on vp is known to have returned
	 * NFS4ERR_STALE with no local error.
	 *
	 * If we don't appear to be dealing with the root node, find it.
	 */
	if ((vp->v_flag & VROOT) == 0) {
		nfs4_error_zinit(&e);
		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
		if (e.error) {
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't find root node for rp %s",
			    rnode4info(rp)));
			goto out;
		}
	}

	/* Try a GETATTR on the root vnode */
	if (rootvp != NULL) {
		nfs4_error_zinit(&e);
		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);

		/*
		 * Try recovery?
		 *
		 * NOTE(review): unlike the GETATTR on vp above, there is
		 * no "goto out" after starting recovery here; control
		 * always continues to the failover check below.
		 */
		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
			if (needrecov) {
				(void) nfs4_start_recovery(&e,
				    mi, rootvp, NULL, NULL, NULL,
				    OP_GETATTR, NULL);
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "recov_stale: error=%d, stat=%d seen "
				    "on rp %s", e.error, e.stat,
				    rnode4info(rp)));
			}
		}

		/*
		 * Check to see if a failover attempt is warranted
		 * NB: nfs4_try_failover doesn't check for STALE
		 * because recov_stale gets a shot first.  Now that
		 * recov_stale has failed, go ahead and try failover.
		 *
		 * If the getattr on the root filehandle was successful,
		 * then mark recovery as failed for 'vp' and exit.
		 * (As coded, the same path is taken for any root GETATTR
		 * result that nfs4_try_failover() rejects and whose status
		 * is not NFS4ERR_STALE.)
		 */
		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
			/*
			 * pass the original error to fail_recov, not
			 * the one from trying the root vnode.
			 */
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: root node OK, marking "
			    "dead rp %s", rnode4info(rp)));
			goto out;
		}
	}

	/*
	 * Here, we know that both the original file and the
	 * root filehandle (which may be the same) are stale.
	 * We want to fail over if we can, and if we can't, we
	 * want to mark everything in sight bad.
	 */
	if (FAILOVER_MOUNT4(mi)) {
		/* Only flag the need here; mi_recovflags is under mi_lock. */
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: failing over due to rp %s",
		    rnode4info(rp)));
		mutex_exit(&mi->mi_lock);
	} else {
		rnode4_t *rootrp;
		servinfo4_t *svp;

		/*
		 * Can't fail over, so mark things dead.
		 *
		 * If rootvp is set, we know we have a distinct
		 * non-root vnode which can be marked dead in
		 * the usual way.
		 *
		 * Then we want to mark the root vnode dead.
		 * Note that if rootvp wasn't set, our vp is
		 * actually the root vnode.
		 */
		if (rootvp != NULL) {
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't fail over, marking dead rp %s",
			    rnode4info(rp)));
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
		} else {
			/*
			 * Take our own hold so that the VN_RELE() at "out"
			 * is balanced in this case as well.
			 */
			rootvp = vp;
			VN_HOLD(rootvp);
		}

		/*
		 * Mark root dead, but quietly - since
		 * the root rnode is frequently recreated,
		 * we can encounter this at every access.
		 * Also mark recovery as failed on this VFS.
		 */
		rootrp = VTOR4(rootvp);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
		    "recov_stale: marking dead root rp %s",
		    rnode4info(rootrp)));
		mutex_enter(&rootrp->r_statelock);
		rootrp->r_flags |= (R4RECOVERR | R4STALE);
		rootrp->r_error = ESTALE;
		mutex_exit(&rootrp->r_statelock);
		mutex_enter(&mi->mi_lock);
		mi->mi_error = ESTALE;
		mutex_exit(&mi->mi_lock);

		/* Record on the current server entry that its root is stale. */
		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
		svp->sv_flags |= SV4_ROOT_STALE;
		nfs_rw_exit(&svp->sv_lock);
	}

out:
	if (rootvp)
		VN_RELE(rootvp);
}
   2161 
   2162 /*
   2163  * Locks.
   2164  */
   2165 
   2166 /*
   2167  * Reclaim all the active (acquired) locks for the given file.
   2168  * If a process lost a lock, the process is sent a SIGLOST.  This is not
   2169  * considered an error.
   2170  *
   2171  * Return values:
   2172  * Errors and status are returned via the nfs4_error_t parameter
   2173  * If an error indicates that recovery is needed, the caller is responsible
   2174  * for dealing with it.
   2175  */
   2176 
   2177 static void
   2178 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
   2179     fattr4_change pre_change)
   2180 {
   2181 	locklist_t *locks, *llp;
   2182 	rnode4_t *rp;
   2183 
   2184 	ASSERT(ep != NULL);
   2185 	nfs4_error_zinit(ep);
   2186 
   2187 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
   2188 		return;
   2189 
   2190 	nfs4_flush_lock_owners(VTOR4(vp));
   2191 
   2192 	/*
   2193 	 * If we get an error that requires recovery actions, just bail out
   2194 	 * and let the top-level recovery code handle it.
   2195 	 *
   2196 	 * If we get some other error, kill the process that owned the lock
   2197 	 * and mark its remaining locks (if any) as belonging to NOPID, so
   2198 	 * that we don't make any more reclaim requests for that process.
   2199 	 */
   2200 
   2201 	rp = VTOR4(vp);
   2202 	locks = flk_active_locks_for_vp(vp);
   2203 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
   2204 		int did_reclaim = 1;
   2205 
   2206 		ASSERT(llp->ll_vp == vp);
   2207 		if (llp->ll_flock.l_pid == NOPID)
   2208 			continue;
   2209 		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
   2210 		/*
   2211 		 * If we need to restart recovery, stop processing the
   2212 		 * list.  Some errors would be recoverable under other
   2213 		 * circumstances, but if they happen here we just give up
   2214 		 * on the lock.
   2215 		 */
   2216 		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
   2217 			if (ep->error != 0)
   2218 				break;
   2219 			if (!nfs4_recov_marks_dead(ep->stat))
   2220 				break;
   2221 		}
   2222 		/*
   2223 		 *   In case the server isn't offering us a grace period, or
   2224 		 * if we missed it, we might have opened & locked from scratch,
   2225 		 * rather than reopened/reclaimed.
   2226 		 *   We need to ensure that the object hadn't been otherwise
   2227 		 * changed during this time, by comparing the changeinfo.
   2228 		 *   We get passed the changeinfo from before the reopen by our
   2229 		 * caller, in pre_change.
   2230 		 *   The changeinfo from after the reopen is in rp->r_change,
   2231 		 * courtesy of the GETATTR in the reopen.
   2232 		 *   If they're different, then the file has changed, and we
   2233 		 * have to SIGLOST the app.
   2234 		 */
   2235 		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
   2236 			mutex_enter(&rp->r_statelock);
   2237 			if (pre_change != rp->r_change)
   2238 				ep->stat = NFS4ERR_NO_GRACE;
   2239 			mutex_exit(&rp->r_statelock);
   2240 		}
   2241 		if (ep->error != 0 || ep->stat != NFS4_OK) {
   2242 			if (ep->error != 0)
   2243 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
   2244 				    NULL, ep->error, vp, NULL, 0, NULL,
   2245 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
   2246 				    0, 0);
   2247 			else
   2248 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
   2249 				    NULL, 0, vp, NULL, ep->stat, NULL,
   2250 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
   2251 				    0, 0);
   2252 			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
   2253 			    ep->error, ep->stat);
   2254 			relock_skip_pid(llp, llp->ll_flock.l_pid);
   2255 
   2256 			/* Reinitialize the nfs4_error and continue */
   2257 			nfs4_error_zinit(ep);
   2258 		}
   2259 	}
   2260 
   2261 	if (locks != NULL)
   2262 		flk_free_locklist(locks);
   2263 }
   2264 
   2265 /*
   2266  * Reclaim the given lock.
   2267  * If the lock can't be reclaimed, the process is sent SIGLOST, but this is
   2268  * not considered an error.
   2269  *
   2270  * Errors are returned via the nfs4_error_t parameter.
   2271  */
   2272 static void
   2273 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
   2274     int *did_reclaimp)
   2275 {
   2276 	cred_t *cr;
   2277 	rnode4_t *rp = VTOR4(vp);
   2278 
   2279 	cr = pid_to_cr(flk->l_pid);
   2280 	if (cr == NULL) {
   2281 		nfs4_error_zinit(ep);
   2282 		ep->error = ESRCH;
   2283 		return;
   2284 	}
   2285 
   2286 	do {
   2287 		mutex_enter(&rp->r_statelock);
   2288 		if (rp->r_flags & R4RECOVERR) {
   2289 			/*
   2290 			 * This shouldn't affect other reclaims, so don't
   2291 			 * return an error.
   2292 			 */
   2293 			mutex_exit(&rp->r_statelock);
   2294 			break;
   2295 		}
   2296 		mutex_exit(&rp->r_statelock);
   2297 
   2298 		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
   2299 		    FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
   2300 		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
   2301 			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
   2302 			    vp, NULL);
   2303 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
   2304 
   2305 	crfree(cr);
   2306 }
   2307 
   2308 /*
   2309  * Open files.
   2310  */
   2311 
   2312 /*
   2313  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
   2314  * Returns 1 if the error is valid; 0 otherwise.
   2315  */
   2316 static int
   2317 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
   2318 {
   2319 	/*
   2320 	 * We should not be marking non-regular files as dead,
   2321 	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
   2322 	 */
   2323 	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
   2324 	    stat != NFS4ERR_BADNAME)
   2325 		return (0);
   2326 
   2327 	return (1);
   2328 }
   2329 
   2330 /*
   2331  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
   2332  * then mark the object dead.  Since we've had to do a lookup for
   2333  * filehandle recovery, we will mark the object dead if we got NOENT.
   2334  */
   2335 static void
   2336 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
   2337 {
   2338 	ASSERT(vp != NULL);
   2339 
   2340 	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
   2341 	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
   2342 		return;
   2343 
   2344 	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
   2345 }
   2346 
   2347 /*
   2348  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
   2349  * to mark only the data structure(s) that provided the bad value as being
   2350  * bad.  But for now we'll just mark the entire file.
   2351  */
   2352 
   2353 static void
   2354 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
   2355 {
   2356 	ASSERT(vp != NULL);
   2357 	recov_throttle(recovp, vp);
   2358 
   2359 	if (!nfs4_valid_recov_err_for_vp(vp, stat))
   2360 		return;
   2361 
   2362 	nfs4_fail_recov(vp, "", 0, stat);
   2363 }
   2364 
   2365 /*
   2366  * Free up the information saved for a lost state request.
   2367  */
   2368 static void
   2369 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
   2370 {
   2371 	component4 *filep;
   2372 	nfs4_open_stream_t *osp;
   2373 	int have_sync_lock;
   2374 
   2375 	NFS4_DEBUG(nfs4_lost_rqst_debug,
   2376 	    (CE_NOTE, "nfs4_free_lost_rqst:"));
   2377 
   2378 	switch (lrp->lr_op) {
   2379 	case OP_OPEN:
   2380 		filep = &lrp->lr_ofile;
   2381 		if (filep->utf8string_val) {
   2382 			kmem_free(filep->utf8string_val, filep->utf8string_len);
   2383 			filep->utf8string_val = NULL;
   2384 		}
   2385 		break;
   2386 	case OP_DELEGRETURN:
   2387 		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
   2388 		break;
   2389 	case OP_CLOSE:
   2390 		osp = lrp->lr_osp;
   2391 		ASSERT(osp != NULL);
   2392 		mutex_enter(&osp->os_sync_lock);
   2393 		have_sync_lock = 1;
   2394 		if (osp->os_pending_close) {
   2395 			/* clean up the open file state. */
   2396 			osp->os_pending_close = 0;
   2397 			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
   2398 		}
   2399 		if (have_sync_lock)
   2400 			mutex_exit(&osp->os_sync_lock);
   2401 		break;
   2402 	}
   2403 
   2404 	lrp->lr_op = 0;
   2405 	if (lrp->lr_oop != NULL) {
   2406 		open_owner_rele(lrp->lr_oop);
   2407 		lrp->lr_oop = NULL;
   2408 	}
   2409 	if (lrp->lr_osp != NULL) {
   2410 		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
   2411 		lrp->lr_osp = NULL;
   2412 	}
   2413 	if (lrp->lr_lop != NULL) {
   2414 		lock_owner_rele(lrp->lr_lop);
   2415 		lrp->lr_lop = NULL;
   2416 	}
   2417 	if (lrp->lr_flk != NULL) {
   2418 		kmem_free(lrp->lr_flk, sizeof (flock64_t));
   2419 		lrp->lr_flk = NULL;
   2420 	}
   2421 	if (lrp->lr_vp != NULL) {
   2422 		VN_RELE(lrp->lr_vp);
   2423 		lrp->lr_vp = NULL;
   2424 	}
   2425 	if (lrp->lr_dvp != NULL) {
   2426 		VN_RELE(lrp->lr_dvp);
   2427 		lrp->lr_dvp = NULL;
   2428 	}
   2429 	if (lrp->lr_cr != NULL) {
   2430 		crfree(lrp->lr_cr);
   2431 		lrp->lr_cr = NULL;
   2432 	}
   2433 
   2434 	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
   2435 }
   2436 
   2437 /*
   2438  * Remove any lost state requests and free them.
   2439  */
   2440 static void
   2441 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
   2442 {
   2443 	nfs4_lost_rqst_t *lrp;
   2444 
   2445 	mutex_enter(&mi->mi_lock);
   2446 	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
   2447 		list_remove(&mi->mi_lost_state, lrp);
   2448 		mutex_exit(&mi->mi_lock);
   2449 		nfs4_free_lost_rqst(lrp, sp);
   2450 		mutex_enter(&mi->mi_lock);
   2451 	}
   2452 	mutex_exit(&mi->mi_lock);
   2453 }
   2454 
/*
 * Reopen all the files for the given filesystem and reclaim any locks.
 *
 * recovp->rc_mi identifies the filesystem; sp is its server and must not
 * be NULL.  The work is done with sp->s_recovlock held as reader and
 * mi->mi_recovlock held as writer.  MI4R_REOPEN_FILES and MI4R_REMAP_FILES
 * are cleared from mi_recovflags only if everything completed without
 * error; otherwise they stay set.
 */

static void
recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_opinst_t *reopenlist = NULL, *rep;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	open_claim_type4 claim;
	int remap;
	char *fail_msg = "No such file or directory on replica";
	rnode4_t *rp;
	fattr4_change pre_change;

	ASSERT(sp != NULL);

	/*
	 * This check is to allow a 10ms pause before we reopen files
	 * it should allow the server time to have received the CB_NULL
	 * reply and update its internal structures such that (if
	 * applicable) we are granted a delegation on reopened files.
	 * The wait is skipped when N4S_CB_PINGED or N4S_CB_WAITER is
	 * already set.
	 */
	mutex_enter(&sp->s_lock);
	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
		sp->s_flags |= N4S_CB_WAITER;
		(void) cv_reltimedwait(&sp->wait_cb_null, &sp->s_lock,
		    drv_usectohz(N4S_CB_PAUSE_TIME), TR_CLOCK_TICK);
	}
	mutex_exit(&sp->s_lock);

	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	/*
	 * With volatile filehandles the root is remapped first.  A failure
	 * here is left in 'e', which causes the reopen pass below to be
	 * skipped.
	 */
	if (NFS4_VOLATILE_FH(mi)) {
		nfs4_remap_root(mi, &e, 0);
		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
			(void) nfs4_start_recovery(&e, mi, NULL,
			    NULL, NULL, NULL, OP_LOOKUP, NULL);
		}
	}

	/* After a server reboot opens are reclaimed (CLAIM_PREVIOUS). */
	mutex_enter(&mi->mi_lock);
	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
		claim = CLAIM_PREVIOUS;
	else
		claim = CLAIM_NULL;
	mutex_exit(&mi->mi_lock);

	if (e.error == 0 && e.stat == NFS4_OK) {
		/*
		 * Get a snapshot of open files in the filesystem.  Note
		 * that new opens will stall until the server's grace
		 * period is done.
		 */
		reopenlist = r4mkopenlist(mi);

		mutex_enter(&mi->mi_lock);
		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
		mutex_exit(&mi->mi_lock);
		/*
		 * Since we are re-establishing state on the
		 * server, its ok to blow away the saved lost
		 * requests since we don't need to reissue it.
		 */
		nfs4_remove_lost_rqsts(mi, sp);

		/*
		 * 'e' is clean at the top of every iteration: the loop is
		 * left as soon as an iteration ends with an error, and the
		 * NOENT case resets 'e' before continuing.
		 */
		for (rep = reopenlist; rep; rep = rep->re_next) {

			if (remap) {
				nfs4_remap_file(mi, rep->re_vp,
				    NFS4_REMAP_CKATTRS, &e);
			}
			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
				/*
				 * The current server does not have the file
				 * that is to be remapped.  This is most
				 * likely due to an improperly maintained
				 * replica.   The files that are missing from
				 * the server will be marked dead and logged
				 * in order to make sys admins aware of the
				 * problem.
				 */
				nfs4_fail_recov(rep->re_vp,
				    fail_msg, e.error, e.stat);
				/*
				 * We've already handled the error so clear it.
				 */
				nfs4_error_zinit(&e);
				continue;
			} else if (e.error == 0 && e.stat == NFS4_OK) {
				int j;

				/*
				 * Sample the change attribute before the
				 * reopen; relock_file() compares it with
				 * the value seen afterwards.
				 */
				rp = VTOR4(rep->re_vp);
				mutex_enter(&rp->r_statelock);
				pre_change = rp->r_change;
				mutex_exit(&rp->r_statelock);

				/* Reopen each open stream; stop at first failure. */
				for (j = 0; j < rep->re_numosp; j++) {
					nfs4_reopen(rep->re_vp, rep->re_osp[j],
					    &e, claim, FALSE, TRUE);
					if (e.error != 0 || e.stat != NFS4_OK)
						break;
				}
				if (nfs4_needs_recovery(&e, TRUE,
				    mi->mi_vfsp)) {
					(void) nfs4_start_recovery(&e, mi,
					    rep->re_vp, NULL, NULL, NULL,
					    OP_OPEN, NULL);
					break;
				}
			}
#ifdef DEBUG
			if (nfs4_recovdelay > 0)
				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
#endif
			/*
			 * pre_change is only read here when 'e' is clean,
			 * which implies the branch above assigned it.
			 */
			if (e.error == 0 && e.stat == NFS4_OK)
				relock_file(rep->re_vp, mi, &e, pre_change);

			if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
				(void) nfs4_start_recovery(&e, mi,
				    rep->re_vp, NULL, NULL, NULL, OP_LOCK,
				    NULL);
			/* Any remaining error ends the pass over the list. */
			if (e.error != 0 || e.stat != NFS4_OK)
				break;
		}

		/*
		 * Check to see if we need to remap files passed in
		 * via the recovery arguments; this will have been
		 * done for open files.  A failure here is not fatal.
		 */
		if (remap) {
			nfs4_error_t ignore;
			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
			    &ignore);
			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
			    &ignore);
		}
	}

	/* Only a fully successful pass clears the reopen/remap flags. */
	if (e.error == 0 && e.stat == NFS4_OK) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
		mutex_exit(&mi->mi_lock);
	}

	nfs_rw_exit(&mi->mi_recovlock);
	nfs_rw_exit(&sp->s_recovlock);

	if (reopenlist != NULL)
		r4releopenlist(reopenlist);
}
   2609 
/*
 * Resend the lost state requests queued on the filesystem's
 * mi_lost_state list (the filesystem is recovp->rc_mi).
 *
 * Requests are resent one at a time from the head of the list.  A request
 * is removed and freed once it has been resent, unless the result is an
 * error that recovery can deal with; in that case the request is left at
 * the head of the list and the function returns so the call can be
 * redriven later.
 *
 * The caller must hold mi->mi_recovlock as writer (asserted below).
 */

static void
nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
{
	nfs4_lost_rqst_t	*lrp, *tlrp;
	mntinfo4_t		*mi = recovp->rc_mi;
	nfs4_error_t		n4e;
#ifdef NOTYET
	uint32_t		deny_bits = 0;
#endif

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));

	ASSERT(mi != NULL);
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	/* mi_lock guards the list; it is not held across the resend. */
	mutex_enter(&mi->mi_lock);
	lrp = list_head(&mi->mi_lost_state);
	mutex_exit(&mi->mi_lock);
	while (lrp != NULL) {
		nfs4_error_zinit(&n4e);
		resend_one_op(lrp, &n4e, mi, sp);
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
		    n4e.stat));

		/*
		 * If we get a recovery error that we can actually
		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
		 * return and let the recovery thread redrive the call.
		 * Don't requeue unless the zone is still healthy.
		 */
		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
		    (nfs4_try_failover(&n4e) ||
		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
		    !nfs4_recov_marks_dead(n4e.stat)))) {
			/*
			 * For NFS4ERR_DELAY, NFS4ERR_GRACE and
			 * NFS4ERR_RESOURCE (and likewise for a forced
			 * unmount error), we want to delay a bit
			 * instead of pounding the server into submission.
			 * We have to do this manually; the normal
			 * processing for these errors only works for
			 * non-recovery requests.
			 */
			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
				delay(SEC_TO_TICK(nfs4err_delay_time));
			} else {
				(void) nfs4_start_recovery(&n4e,
				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
				    lrp->lr_op, NULL);
			}
			/* lrp stays on the list so it can be resent later. */
			return;
		}

		/*
		 * Done with this request (whether it succeeded or failed
		 * for good): unlink it, pick up the next head while still
		 * under mi_lock, then free it with the lock dropped.
		 */
		mutex_enter(&mi->mi_lock);
		list_remove(&mi->mi_lost_state, lrp);
		tlrp = lrp;
		lrp = list_head(&mi->mi_lost_state);
		mutex_exit(&mi->mi_lock);
		nfs4_free_lost_rqst(tlrp, sp);
	}
}
   2680 
   2681 /*
   2682  * Resend the given op, and issue any necessary undo call.
   2683  * Errors are returned via the nfs4_error_t parameter (ep).
         *
         * lrp is the lost request to resend.  OPEN, OPEN_DOWNGRADE and CLOSE
         * are resent from here.  LOCK/LOCKU and DELEGRETURN are handed to
         * resend_lock() and nfs4_resend_delegreturn(), and skip the undo logic
         * below.  Any other op is reported as RE_LOST_STATE_BAD_OP and fails
         * with EINVAL (DEBUG kernels panic first).
         *
         * The only "undo" done here is for OPEN: when the resent OPEN succeeds
         * it is closed again right away by close_after_open_resend().
   2684  */
   2685 
   2686 static void
   2687 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
   2688     mntinfo4_t *mi, nfs4_server_t *sp)
   2689 {
   2690 	vnode_t *vp;
   2691 	nfs4_open_stream_t *osp;
   2692 	cred_t *cr;
   2693 	uint32_t acc_bits;
   2694 
   2695 	vp = lrp->lr_vp;
   2696 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
   2697 	    "have a lost open/close request for vp %p", (void *)vp));
   2698 
   2699 	switch (lrp->lr_op) {
   2700 	case OP_OPEN:
        		/* vp is passed by reference and may come back changed */
   2701 		nfs4_resend_open_otw(&vp, lrp, ep);
   2702 		break;
   2703 	case OP_OPEN_DOWNGRADE:
        		/*
        		 * The downgrade is sent inside an open seqid sync on the
        		 * open owner, with the open stream's os_sync_lock held
        		 * across the call.
        		 */
   2704 		ASSERT(lrp->lr_oop != NULL);
   2705 		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
   2706 		ASSERT(!ep->error);	/* recov thread always succeeds */
   2707 		ASSERT(lrp->lr_osp != NULL);
   2708 		mutex_enter(&lrp->lr_osp->os_sync_lock);
   2709 		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
   2710 		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
   2711 		    ep, NULL, NULL);
   2712 		mutex_exit(&lrp->lr_osp->os_sync_lock);
   2713 		nfs4_end_open_seqid_sync(lrp->lr_oop);
   2714 		break;
   2715 	case OP_CLOSE:
        		/*
        		 * Derive the access bits to close from the open stream's
        		 * current share-access state, read under os_sync_lock.
        		 */
   2716 		osp = lrp->lr_osp;
   2717 		cr = lrp->lr_cr;
   2718 		acc_bits = 0;
   2719 		mutex_enter(&osp->os_sync_lock);
   2720 		if (osp->os_share_acc_read)
   2721 			acc_bits |= OPEN4_SHARE_ACCESS_READ;
   2722 		if (osp->os_share_acc_write)
   2723 			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
   2724 		mutex_exit(&osp->os_sync_lock);
   2725 		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
   2726 		    CLOSE_RESEND, 0, 0, 0);
   2727 		break;
   2728 	case OP_LOCK:
   2729 	case OP_LOCKU:
   2730 		resend_lock(lrp, ep);
   2731 		goto done;
   2732 	case OP_DELEGRETURN:
   2733 		nfs4_resend_delegreturn(lrp, ep, sp);
   2734 		goto done;
   2735 	default:
   2736 #ifdef DEBUG
   2737 		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
   2738 		    lrp->lr_op);
   2739 #endif
   2740 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
   2741 		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
   2742 		    TAG_NONE, TAG_NONE, 0, 0);
   2743 		nfs4_error_init(ep, EINVAL);
   2744 		return;
   2745 	}
   2746 
   2747 	/*
   2748 	 * No need to retry nor send an "undo" CLOSE in the
   2749 	 * event the server rebooted.
   2750 	 */
   2751 	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
   2752 	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
   2753 		goto done;
   2754 
   2755 	/*
   2756 	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
   2757 	 * to undo.  Undoing locking operations was handled by
   2758 	 * resend_lock().
   2759 	 */
   2760 	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
   2761 		goto done;
   2762 
   2763 	/*
   2764 	 * If we get any other error for OPEN, then don't attempt
   2765 	 * to undo the resend of the open (since it was never
   2766 	 * successful!).
   2767 	 */
   2768 	ASSERT(lrp->lr_op == OP_OPEN);
   2769 	if (ep->error || ep->stat != NFS4_OK)
   2770 		goto done;
   2771 
   2772 	/*
   2773 	 * Now let's undo our OPEN.
   2774 	 */
   2775 	nfs4_error_zinit(ep);
   2776 	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
   2777 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
   2778 	    "nfs4close_one: for vp %p got error %d stat %d",
   2779 	    (void *)vp, ep->error, ep->stat));
   2780 
   2781 done:
        	/*
        	 * Only a vnode that differs from the request's own vnode is
        	 * released here.  NOTE(review): presumably such a vnode comes
        	 * back held from nfs4_resend_open_otw() -- confirm.
        	 */
   2782 	if (vp != lrp->lr_vp)
   2783 		VN_RELE(vp);
   2784 }
   2785 
   2786 /*
   2787  * Undo a resent OPEN by closing the file again.
   2788  *
   2789  * The CLOSE is reissued for as long as the server answers
   2790  * NFS4ERR_FHEXPIRED.  Every other outcome, success included, ends the
   2791  * loop and is handed back to the caller through *ep.
   2792  *
   2793  * Pushing the CLOSE onto the front of the resend queue instead of
   2794  * sending it from here might be conceptually cleaner, and would match
   2795  * how lost lock requests are undone, but the current approach works
   2796  * and there is no reason to change it at this time.
   2797  */
   2798 
   2799 static void
   2800 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
   2801     nfs4_error_t *ep)
   2802 {
   2803 	do {
   2804 		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
   2805 		    CLOSE_AFTER_RESEND, 0, 0, 0);
   2806 		/* only an FHEXPIRED status with no errno is retried */
   2807 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
   2808 }
   2814 
   2815 /*
   2816  * Resend the given lost lock request.  Nothing is returned directly; the
   2817  * errno value and the NFS status code for the call are left in *ep.
   2818  *
   2819  * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
   2820  * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
   2821  * Let the recovery thread redrive the call if we get a recovery error that
   2822  * we can actually recover from.
         *
         * NOTE(review): the only action taken directly in this function is
         * nfs4_send_siglost() (plus flush_reinstate() for reinstated locks);
         * presumably the rnode is marked dead elsewhere -- confirm.
   2823  */
   2824 static void
   2825 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
   2826 {
   2827 	bool_t		send_siglost = FALSE;
   2828 	vnode_t		*vp = lrp->lr_vp;
   2829 
   2830 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
   2831 	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
   2832 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
   2833 
   2834 	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
   2835 	    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
   2836 
   2837 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
   2838 	    "nfs4frlock for vp %p returned error %d, stat %d",
   2839 	    (void *)vp, ep->error, ep->stat));
   2840 
        	/* Complete success: nothing more to do. */
   2841 	if (ep->error == 0 && ep->stat == 0)
   2842 		goto done;
        	/* A DENIED answer to a plain resend is passed back without SIGLOST. */
   2843 	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
   2844 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
   2845 		goto done;
   2846 
   2847 	/*
   2848 	 * If we failed with a non-recovery error, send SIGLOST and
   2849 	 * mark the file dead.
   2850 	 */
   2851 	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
   2852 		send_siglost = TRUE;
   2853 	else {
   2854 		/*
   2855 		 * Done with recovering LOST LOCK in the event the
   2856 		 * server rebooted or we've lost the lease.
   2857 		 */
   2858 		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
   2859 		    ep->stat == NFS4ERR_STALE_STATEID ||
   2860 		    ep->stat == NFS4ERR_EXPIRED)) {
   2861 			goto done;
   2862 		}
   2863 
   2864 		/*
   2865 		 * BAD_STATEID on an unlock indicates that the server has
   2866 		 * forgotten about the lock anyway, so act like the call
   2867 		 * was successful.
   2868 		 */
   2869 		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
   2870 		    lrp->lr_op == OP_LOCKU)
   2871 			goto done;
   2872 
   2873 		/*
   2874 		 * If we got a recovery error that we don't actually
   2875 		 * recover from, send SIGLOST.  If the filesystem was
   2876 		 * forcibly unmounted, we skip the SIGLOST because (a) it's
   2877 		 * unnecessary noise, and (b) there could be a new process
   2878 		 * with the same pid as the one that had generated the lost
   2879 		 * state request.
   2880 		 */
   2881 		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
   2882 		    nfs4_recov_marks_dead(ep->stat))) {
   2883 			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
   2884 				send_siglost = TRUE;
   2885 			goto done;
   2886 		}
   2887 
   2888 		/*
   2889 		 * If the filesystem was forcibly unmounted, we
   2890 		 * still need to synchronize with the server and
   2891 		 * release state.  Try again later.
   2892 		 */
   2893 		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
   2894 			goto done;
   2895 
   2896 		/*
   2897 		 * If we get a recovery error that we can actually
   2898 		 * recover from (such as ETIMEDOUT, FHEXPIRED),
   2899 		 * return and let the recovery thread redrive the call.
   2900 		 *
   2901 		 * For the three errors below, we want to delay a bit
   2902 		 * instead of pounding the server into submission.
   2903 		 */
   2904 		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
   2905 		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
   2906 		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
   2907 			delay(SEC_TO_TICK(recov_err_delay));
   2908 		goto done;
   2909 	}
   2910 
   2911 done:
   2912 	if (send_siglost) {
   2913 		cred_t *sv_cred;
   2914 
   2915 		/*
   2916 		 * Must be root or the actual thread being issued the
   2917 		 * SIGLOST for this to work, so just become root.
   2918 		 */
   2919 		sv_cred = curthread->t_cred;
   2920 		curthread->t_cred = kcred;
   2921 		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
   2922 		    ep->error, ep->stat);
   2923 		curthread->t_cred = sv_cred;
   2924 
   2925 		/*
   2926 		 * Flush any additional reinstantiation requests for
   2927 		 * this operation.  Sending multiple SIGLOSTs to the user
   2928 		 * process is unlikely to help and may cause trouble.
   2929 		 */
   2930 		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
   2931 			flush_reinstate(lrp);
   2932 	}
   2933 }
   2934 
   2935 /*
   2936  * Discard the lock reinstantiation requests that belong with the given
   2937  * lost request.  Only entries queued after lrp are examined; lrp itself
   2938  * is left for the generic lost state code to remove.
   2939  */
   2940 
   2941 static void
   2942 flush_reinstate(nfs4_lost_rqst_t *lrp)
   2943 {
   2944 	vnode_t *vp = lrp->lr_vp;
   2945 	mntinfo4_t *mi = VTOMI4(vp);
   2946 	pid_t pid = lrp->lr_flk->l_pid;
   2947 	nfs4_lost_rqst_t *cur, *next;
   2948 
   2949 	/*
   2950 	 * Any remaining reinstantiation requests are expected to sit
   2951 	 * together at the front of the lost state queue, so the walk ends
   2952 	 * at the first entry that is not a reinstated LOCK/LOCKU.
   2953 	 */
   2954 	mutex_enter(&mi->mi_lock);
   2955 	cur = list_next(&mi->mi_lost_state, lrp);
   2956 	while (cur != NULL) {
   2957 		/* fetch the successor before cur is unlinked and freed */
   2958 		next = list_next(&mi->mi_lost_state, cur);
   2959 		if ((cur->lr_op != OP_LOCK && cur->lr_op != OP_LOCKU) ||
   2960 		    cur->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
   2961 			break;
   2962 		ASSERT(cur->lr_vp == vp);
   2963 		ASSERT(cur->lr_flk->l_pid == pid);
   2964 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
   2965 		    "remove reinstantiation %p", (void *)cur));
   2966 		list_remove(&mi->mi_lost_state, cur);
   2967 		nfs4_free_lost_rqst(cur, NULL);
   2968 		cur = next;
   2969 	}
   2970 	mutex_exit(&mi->mi_lock);
   2971 }
   2975 
   2976 /*
   2977  * End of state-specific recovery routines.
   2978  */
   2979 
   2980 /*
   2981  * Allocate a lost request struct, initialize it from lost_rqstp (including
   2982  * bumping the reference counts for the referenced vnode, etc.), and hang
   2983  * it off of recovp.
         *
         * *action is set according to the op: NR_LOST_LOCK for LOCK/LOCKU,
         * NR_LOST_STATE_RQST for OPEN, OPEN_DOWNGRADE, CLOSE and DELEGRETURN.
         * For OPEN the file name in lost_rqstp->lr_ofile is moved, not copied,
         * into the new struct.  Any other op is reported as
         * RE_LOST_STATE_BAD_OP (after a panic on DEBUG kernels); *action is
         * then NR_UNUSED and no request is left on recovp.
         *
         * NOTE(review): the struct comes from kmem_alloc(), so fields that are
         * not assigned below for a given op keep whatever the allocator
         * returned -- confirm that consumers only read fields relevant to
         * lr_op.
   2984  */
   2985 
   2986 static void
   2987 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
   2988     nfs4_recov_t *action, mntinfo4_t *mi)
   2989 {
   2990 	nfs4_lost_rqst_t *destp;
   2991 
   2992 	ASSERT(recovp->rc_lost_rqst == NULL);
   2993 
   2994 	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
   2995 	recovp->rc_lost_rqst = destp;
   2996 
        	/* First the op-specific fields and the resulting recovery action. */
   2997 	if (lost_rqstp->lr_op == OP_LOCK ||
   2998 	    lost_rqstp->lr_op == OP_LOCKU) {
   2999 		ASSERT(lost_rqstp->lr_lop);
   3000 		*action = NR_LOST_LOCK;
   3001 		destp->lr_ctype = lost_rqstp->lr_ctype;
   3002 		destp->lr_locktype = lost_rqstp->lr_locktype;
   3003 	} else if (lost_rqstp->lr_op == OP_OPEN) {
   3004 		component4 *srcfp, *destfp;
   3005 
   3006 		destp->lr_oacc = lost_rqstp->lr_oacc;
   3007 		destp->lr_odeny = lost_rqstp->lr_odeny;
   3008 		destp->lr_oclaim = lost_rqstp->lr_oclaim;
   3009 		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
   3010 			destp->lr_ostateid = lost_rqstp->lr_ostateid;
   3011 
   3012 		srcfp = &lost_rqstp->lr_ofile;
   3013 		destfp = &destp->lr_ofile;
   3014 		/*
   3015 		 * Consume caller's utf8string
   3016 		 */
   3017 		destfp->utf8string_len = srcfp->utf8string_len;
   3018 		destfp->utf8string_val = srcfp->utf8string_val;
   3019 		srcfp->utf8string_len = 0;
   3020 		srcfp->utf8string_val = NULL;	/* make sure not reused */
   3021 
   3022 		*action = NR_LOST_STATE_RQST;
   3023 	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
   3024 		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
   3025 		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
   3026 
   3027 		*action = NR_LOST_STATE_RQST;
   3028 	} else if (lost_rqstp->lr_op == OP_CLOSE) {
   3029 		ASSERT(lost_rqstp->lr_oop);
   3030 		*action = NR_LOST_STATE_RQST;
   3031 	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
   3032 		*action = NR_LOST_STATE_RQST;
   3033 	} else {
   3034 #ifdef DEBUG
   3035 		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
   3036 		    lost_rqstp->lr_op);
   3037 #endif
   3038 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
   3039 		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
   3040 		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
   3041 		*action = NR_UNUSED;
   3042 		recovp->rc_lost_rqst = NULL;
   3043 		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
   3044 		return;
   3045 	}
   3046 
        	/*
        	 * Common to every op: copy each reference and take a hold on the
        	 * ones that are set.
        	 */
   3047 	destp->lr_op = lost_rqstp->lr_op;
   3048 	destp->lr_vp = lost_rqstp->lr_vp;
   3049 	if (destp->lr_vp)
   3050 		VN_HOLD(destp->lr_vp);
   3051 	destp->lr_dvp = lost_rqstp->lr_dvp;
   3052 	if (destp->lr_dvp)
   3053 		VN_HOLD(destp->lr_dvp);
   3054 	destp->lr_oop = lost_rqstp->lr_oop;
   3055 	if (destp->lr_oop)
   3056 		open_owner_hold(destp->lr_oop);
   3057 	destp->lr_osp = lost_rqstp->lr_osp;
   3058 	if (destp->lr_osp)
   3059 		open_stream_hold(destp->lr_osp);
   3060 	destp->lr_lop = lost_rqstp->lr_lop;
   3061 	if (destp->lr_lop)
   3062 		lock_owner_hold(destp->lr_lop);
   3063 	destp->lr_cr = lost_rqstp->lr_cr;
   3064 	if (destp->lr_cr)
   3065 		crhold(destp->lr_cr);
        	/* the flock, if any, is copied by value into its own allocation */
   3066 	if (lost_rqstp->lr_flk == NULL)
   3067 		destp->lr_flk = NULL;
   3068 	else {
   3069 		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
   3070 		*destp->lr_flk = *lost_rqstp->lr_flk;
   3071 	}
   3072 	destp->lr_putfirst = lost_rqstp->lr_putfirst;
   3073 }
   3074 
   3075 /*
   3076  * Map the given return values (errno and nfs4 status code) to a recovery
   3077  * action and fill in the following fields of recovp: rc_action,
   3078  * rc_srv_reboot, rc_stateid, rc_lost_rqst.
         *
         * rc_error and rc_bseqid_rqst are filled in as well.  The errno and
         * status being mapped are read from recovp->rc_orig_errors.
         *
         * Three cases are distinguished:
         *  - a failover-eligible error on a failover mount, EINTR, or EIO
         *    with 'unmounted' set: the lost request (if one was passed) is
         *    saved, and the action is NR_FAILOVER when failover applies;
         *  - any other non-zero errno: logged as unexpected, NR_CLIENTID;
         *  - errno of zero: the action is chosen from the NFS status, with
         *    unexpected statuses logged and mapped to NR_CLIENTID.
         *
         * rc_srv_reboot is TRUE only for NFS4ERR_STALE_CLIENTID and
         * NFS4ERR_STALE_STATEID.  An RF_ERR fact describing the result is
         * queued before returning.
   3079  */
   3080 
   3081 void
   3082 errs_to_action(recov_info_t *recovp,
   3083     nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
   3084     nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
   3085     nfs4_bseqid_entry_t *bsep)
   3086 {
   3087 	nfs4_recov_t action = NR_UNUSED;
   3088 	bool_t reboot = FALSE;
   3089 	int try_f;
   3090 	int error = recovp->rc_orig_errors.error;
   3091 	nfsstat4 stat = recovp->rc_orig_errors.stat;
   3092 
   3093 	bzero(&recovp->rc_stateid, sizeof (stateid4));
   3094 	recovp->rc_lost_rqst = NULL;
   3095 	recovp->rc_bseqid_rqst = NULL;
   3096 
   3097 	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
   3098 	    FAILOVER_MOUNT4(mi);
   3099 
   3100 	/*
   3101 	 * We start recovery for EINTR only in the lost lock
   3102 	 * or lost open/close case.
   3103 	 */
   3104 
        	/*
        	 * NOTE(review): in the first branch below, if failover does not
        	 * apply and no lost request was passed, nothing assigns action
        	 * and the ASSERT at the end fires on DEBUG kernels; presumably
        	 * callers always pass lost_rqstp for EINTR/EIO -- confirm.
        	 */
   3105 	if (try_f || error == EINTR || (error == EIO && unmounted)) {
   3106 		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
   3107 		if (lost_rqstp) {
   3108 			ASSERT(lost_rqstp->lr_op != 0);
   3109 			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
   3110 		}
   3111 		if (try_f)
   3112 			action = NR_FAILOVER;
   3113 	} else if (error != 0) {
   3114 		recovp->rc_error = error;
   3115 		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
   3116 		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   3117 		action = NR_CLIENTID;
   3118 	} else {
   3119 		recovp->rc_error = geterrno4(stat);
   3120 		switch (stat) {
   3121 #ifdef notyet
   3122 		case NFS4ERR_LEASE_MOVED:
   3123 			action = xxx;
   3124 			break;
   3125 		case NFS4ERR_MOVED:
   3126 			action = xxx;
   3127 			break;
   3128 #endif
   3129 		case NFS4ERR_BADHANDLE:
   3130 			action = NR_BADHANDLE;
   3131 			break;
   3132 		case NFS4ERR_BAD_SEQID:
   3133 			if (bsep)
   3134 				save_bseqid_rqst(bsep, recovp);
   3135 			action = NR_BAD_SEQID;
   3136 			break;
   3137 		case NFS4ERR_OLD_STATEID:
   3138 			action = NR_OLDSTATEID;
   3139 			break;
   3140 		case NFS4ERR_WRONGSEC:
   3141 			action = NR_WRONGSEC;
   3142 			break;
   3143 		case NFS4ERR_FHEXPIRED:
   3144 			action = NR_FHEXPIRED;
   3145 			break;
   3146 		case NFS4ERR_BAD_STATEID:
        			/*
        			 * With no server, or a server for which inlease()
        			 * is true, recover just the stateid (saved from
        			 * sidp when given); otherwise fall back to
        			 * clientid recovery.
        			 */
   3147 			if (sp == NULL || (sp != NULL && inlease(sp))) {
   3148 
   3149 				action = NR_BAD_STATEID;
   3150 				if (sidp)
   3151 					recovp->rc_stateid = *sidp;
   3152 			} else
   3153 				action = NR_CLIENTID;
   3154 			break;
   3155 		case NFS4ERR_EXPIRED:
   3156 			/*
   3157 			 * The client's lease has expired, either due
   3158 			 * to a network partition or perhaps a client
   3159 			 * error.  In either case, try an NR_CLIENTID
   3160 			 * style recovery.  reboot remains false, since
   3161 			 * there is no evidence the server has rebooted.
   3162 			 * This will cause CLAIM_NULL opens and lock
   3163 			 * requests without the reclaim bit.
   3164 			 */
   3165 			action = NR_CLIENTID;
   3166 
   3167 			DTRACE_PROBE4(nfs4__expired,
   3168 			    nfs4_server_t *, sp,
   3169 			    mntinfo4_t *, mi,
   3170 			    stateid4 *, sidp, int, op);
   3171 
   3172 			break;
   3173 		case NFS4ERR_STALE_CLIENTID:
   3174 		case NFS4ERR_STALE_STATEID:
   3175 			action = NR_CLIENTID;
   3176 			reboot = TRUE;
   3177 			break;
   3178 		case NFS4ERR_RESOURCE:
   3179 			/*
   3180 			 * If this had been a FAILOVER mount, then
   3181 			 * we'd have tried failover.  Since it's not,
   3182 			 * just delay a while and retry.
   3183 			 */
   3184 			action = NR_DELAY;
   3185 			break;
   3186 		case NFS4ERR_GRACE:
   3187 			action = NR_GRACE;
   3188 			break;
   3189 		case NFS4ERR_DELAY:
   3190 			action = NR_DELAY;
   3191 			break;
   3192 		case NFS4ERR_STALE:
   3193 			action = NR_STALE;
   3194 			break;
   3195 		default:
   3196 			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
   3197 			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
   3198 			    0, 0);
   3199 			action = NR_CLIENTID;
   3200 			break;
   3201 		}
   3202 	}
   3203 
   3204 	/* make sure action got set */
   3205 	ASSERT(action != NR_UNUSED);
   3206 	recovp->rc_srv_reboot = reboot;
   3207 	recovp->rc_action = action;
   3208 	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
   3209 	    NULL);
   3210 }
   3211 
   3212 /*
   3213  * Look up the process with the given pid and return its credential with
   3214  * a hold taken on it.  NULL is returned when no such process is found.
   3215  */
   3216 
   3217 static cred_t *
   3218 pid_to_cr(pid_t pid)
   3219 {
   3220 	cred_t *held = NULL;
   3221 	proc_t *procp;
   3222 
   3223 	mutex_enter(&pidlock);
   3224 	procp = prfind(pid);
   3225 	if (procp != NULL) {
   3226 		/* p_crlock is held while p_cred is read and held */
   3227 		mutex_enter(&procp->p_crlock);
   3228 		held = procp->p_cred;
   3229 		crhold(held);
   3230 		mutex_exit(&procp->p_crlock);
   3231 	}
   3232 	mutex_exit(&pidlock);
   3233 
   3234 	return (held);
   3235 }
   3236 
   3237 /*
   3238  * Post SIGLOST to the process with the given pid, if it can be found,
   3239  * and record the event in the recovery message queue.
   3240  *
   3241  * 'dump' selects the event type: RE_SIGLOST when TRUE, otherwise
   3242  * RE_SIGLOST_NO_DUMP.
   3243  */
   3244 void
   3245 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
   3246     int error, nfsstat4 stat)
   3247 {
   3248 	proc_t *target;
   3249 
   3250 	mutex_enter(&pidlock);
   3251 	if ((target = prfind(pid)) != NULL)
   3252 		psignal(target, SIGLOST);
   3253 	mutex_exit(&pidlock);
   3254 
   3255 	nfs4_queue_event((dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP), mi, NULL,
   3256 	    error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
   3257 }
   3258 
   3259 /*
   3260  * Walk the lock list and, for every entry whose flock is owned by the
   3261  * given pid, replace that owner pid with NOPID.
   3262  */
   3263 static void
   3264 relock_skip_pid(locklist_t *llp, pid_t pid)
   3265 {
   3266 	while (llp != NULL) {
   3267 		if (llp->ll_flock.l_pid == pid)
   3268 			llp->ll_flock.l_pid = NOPID;
   3269 		llp = llp->ll_next;
   3270 	}
   3271 }
   3272 
   3273 /*
   3274  * Mark a file as having failed recovery, after making a last-ditch effort
   3275  * to return any delegation.
   3276  *
   3277  * Sets r_error to EIO or ESTALE for the given vnode.
         *
         * ESTALE is used only when error is 0 and stat is NFS4ERR_STALE; every
         * other combination yields EIO.  'why' is recorded with the
         * RE_DEAD_FILE event, which is skipped when the filesystem has
         * VFS_UNMOUNTED set.  A file that already has R4RECOVERR or
         * R4RECOVERRP set is left untouched.
   3278  */
   3279 void
   3280 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
   3281 {
   3282 	rnode4_t *rp = VTOR4(vp);
   3283 
   3284 #ifdef DEBUG
   3285 	if (nfs4_fail_recov_stop)
   3286 		debug_enter("nfs4_fail_recov");
   3287 #endif
   3288 
        	/* Already failed, or a failure is already being processed. */
   3289 	mutex_enter(&rp->r_statelock);
   3290 	if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
   3291 		mutex_exit(&rp->r_statelock);
   3292 		return;
   3293 	}
   3294 
   3295 	/*
   3296 	 * Set R4RECOVERRP to indicate that a recovery error is in
   3297 	 * progress.  This will shut down reads and writes at the top
   3298 	 * half.  Don't set R4RECOVERR until after we've returned the
   3299 	 * delegation, otherwise it will fail.
   3300 	 */
   3301 
   3302 	rp->r_flags |= R4RECOVERRP;
   3303 	mutex_exit(&rp->r_statelock);
   3304 
        	/* r_statelock is not held across the delegation call */
   3305 	nfs4delegabandon(rp);
   3306 
   3307 	mutex_enter(&rp->r_statelock);
   3308 	rp->r_flags |= (R4RECOVERR | R4STALE);
   3309 	rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
   3310 	PURGE_ATTRCACHE4_LOCKED(rp);
   3311 	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
   3312 		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
   3313 		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
   3314 	mutex_exit(&rp->r_statelock);
   3315 
   3316 	dnlc_purge_vp(vp);
   3317 }
   3318 
   3319 /*
   3320  * recov_throttle: rate-limit repeated recovery of one file.  If the rnode
   3321  * last saw this same recovery action less than recov_err_delay seconds
   3322  * ago, sleep out the rest of that interval before returning.
   3323  *
   3324  * Side effects: records the current time and action in the rnode.
   3325  */
   3326 
   3327 static void
   3328 recov_throttle(recov_info_t *recovp, vnode_t *vp)
   3329 {
   3330 	rnode4_t *rp = VTOR4(vp);
   3331 	time_t now = gethrestime_sec();
   3332 	time_t remaining;
   3333 
   3334 	mutex_enter(&rp->r_statelock);
   3335 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   3336 	    "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
   3337 	    recovp->rc_action, now,
   3338 	    rp->r_recov_act, rp->r_last_recov));
   3339 	if (rp->r_recov_act == recovp->rc_action &&
   3340 	    now < rp->r_last_recov + recov_err_delay) {
   3341 		remaining = rp->r_last_recov + recov_err_delay - now;
   3342 		/* r_statelock is dropped for the duration of the sleep */
   3343 		mutex_exit(&rp->r_statelock);
   3344 		delay(SEC_TO_TICK(remaining));
   3345 		now = gethrestime_sec();
   3346 		mutex_enter(&rp->r_statelock);
   3347 	}
   3348 
   3349 	rp->r_recov_act = recovp->rc_action;
   3350 	rp->r_last_recov = now;
   3351 	mutex_exit(&rp->r_statelock);
   3352 }
   3353 
   3354 /*
   3355  * Handle NFS4ERR_GRACE: record, in mi_grace_wait, the time at which the
   3356  * next call to this filesystem will be permitted.
   3357  */
   3358 void
   3359 nfs4_set_grace_wait(mntinfo4_t *mi)
   3360 {
   3361 	mutex_enter(&mi->mi_lock);
   3362 	/* that time lies nfs4err_delay_time seconds past the current time */
   3363 	mi->mi_grace_wait = nfs4err_delay_time + gethrestime_sec();
   3364 	mutex_exit(&mi->mi_lock);
   3365 }
   3366 
   3367 /*
   3368  * React to NFS4ERR_DELAY by setting the time we'll permit the next call
   3369  * to this vnode.  The interval starts at NFS4_INITIAL_DELAY_INTERVAL and
   3370  * is doubled each time it is found non-zero, up to
   3371  * NFS4_MAX_DELAY_INTERVAL.
   3372  */
   3373 void
   3374 nfs4_set_delay_wait(vnode_t *vp)
   3375 {
   3376 	rnode4_t *rp = VTOR4(vp);
   3377 
   3378 	mutex_enter(&rp->r_statelock);
   3379 	if (rp->r_delay_interval != 0) {
   3380 		/* back off: double the previous interval, up to the cap */
   3381 		rp->r_delay_interval =
   3382 		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
   3383 	} else {
   3384 		/* no interval in effect yet: start out short */
   3385 		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
   3386 	}
   3387 	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
   3388 	mutex_exit(&rp->r_statelock);
   3389 }
   3390 
   3391 /*
         * Build a comma-separated list of the hostnames of the servers of
         * this mount that are not flagged SV4_NOTINUSE.
         *
   3392  * The caller is responsible for freeing the returned string.
         * The size of the allocation is passed back in *len.  When no server
         * is in use an empty string (allocation size 1) is returned.
         *
         * NOTE(review): the in-use flags are sampled once per pass, so this
         * presumably relies on the set of in-use servers not growing between
         * the sizing pass and the copy pass -- confirm.
   3393  */
   3394 static char *
   3395 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
   3396 {
   3397 	servinfo4_t *svp;
   3398 	char *srvnames;
   3399 	char *namep;
   3400 	size_t length;
   3401 
   3402 	/*
   3403 	 * Calculate the length of the string required to hold all
   3404 	 * of the server names plus either a comma or a null
   3405 	 * character following each individual one.
   3406 	 */
   3407 	length = 0;
   3408 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
   3409 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   3410 		if (svp->sv_flags & SV4_NOTINUSE) {
   3411 			nfs_rw_exit(&svp->sv_lock);
   3412 			continue;
   3413 		}
   3414 		nfs_rw_exit(&svp->sv_lock);
   3415 		length += svp->sv_hostnamelen;
   3416 	}
   3417 
        	/*
        	 * With no server in use there is nothing to size; still
        	 * allocate one byte so that an empty string can be returned.
        	 */
        	if (length == 0)
        		length = 1;
   3418 	srvnames = kmem_alloc(length, KM_SLEEP);
   3419 
   3420 	namep = srvnames;
   3421 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
   3422 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   3423 		if (svp->sv_flags & SV4_NOTINUSE) {
   3424 			nfs_rw_exit(&svp->sv_lock);
   3425 			continue;
   3426 		}
   3427 		nfs_rw_exit(&svp->sv_lock);
        		/*
        		 * The copy relies on sv_hostnamelen counting the name's
        		 * terminating null: namep ends up on that null, which is
        		 * then replaced by the separator.
        		 */
   3428 		(void) strcpy(namep, svp->sv_hostname);
   3429 		namep += svp->sv_hostnamelen - 1;
   3430 		*namep++ = ',';
   3431 	}
        	/*
        	 * Turn the trailing comma into the terminator.  If no name was
        	 * copied there is no comma to back up over, and backing up
        	 * would write in front of the buffer.
        	 */
        	if (namep != srvnames)
        		namep--;
   3432 	*namep = '\0';
   3433 
   3434 	*len = length;
   3435 
   3436 	return (srvnames);
   3437 }
   3438 
   3439 /* Attach to recovp a copy of bsep that carries holds of its own. */
   3440 static void
   3441 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
   3442 {
   3443 	nfs4_bseqid_entry_t *copy;
   3444 
   3445 	copy = kmem_alloc(sizeof (*copy), KM_SLEEP);
   3446 	recovp->rc_bseqid_rqst = copy;
   3447 
   3448 	copy->bs_pid = bsep->bs_pid;
   3449 	copy->bs_tag = bsep->bs_tag;
   3450 	copy->bs_seqid = bsep->bs_seqid;
   3451 
   3452 	/* each reference carried over is held on behalf of the copy */
   3453 	if ((copy->bs_oop = bsep->bs_oop) != NULL)
   3454 		open_owner_hold(copy->bs_oop);
   3455 	if ((copy->bs_lop = bsep->bs_lop) != NULL)
   3456 		lock_owner_hold(copy->bs_lop);
   3457 	if ((copy->bs_vp = bsep->bs_vp) != NULL)
   3458 		VN_HOLD(copy->bs_vp);
   3459 }
   3460 
   3461 static void
   3462 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
   3463 {
        	/* Drop whichever holds the entry carries, then free the entry. */
   3464 	if (bsep->bs_oop != NULL)
   3465 		open_owner_rele(bsep->bs_oop);
   3466 	if (bsep->bs_lop != NULL)
   3467 		lock_owner_rele(bsep->bs_lop);
   3468 	if (bsep->bs_vp != NULL)
   3469 		VN_RELE(bsep->bs_vp);
   3470 	kmem_free(bsep, sizeof (*bsep));
   3471 }
   3472 
   3473 /*
   3474  * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
   3475  * simply mark the open owner and open stream (if provided) as "bad".
   3476  * Then future uses of these data structures will be limited to basically
   3477  * just cleaning up the internal client state (no going OTW).
   3478  *
   3479  * The result of this is to return errors back to the app/usr when
   3480  * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
   3481  * succeed so progress can be made.
         *
         * In terms of the code below: every entry on mi->mi_bseqid_list is
         * logged as an RE_BAD_SEQID event, processed, removed and freed.  An
         * entry's open owner (if any) is given a new name and has its seqid
         * reset to 0.  An entry's lock owner (if any) is flagged
         * NFS4_BAD_SEQID_LOCK, the rnode is flagged R4LODANGLERS, and SIGLOST
         * is sent to the entry's pid.  Finally MI4R_BAD_SEQID is cleared from
         * mi_recovflags.
         *
         * The caller must hold mi_recovlock as writer (asserted).
   3482  */
   3483 void
   3484 recov_bad_seqid(recov_info_t *recovp)
   3485 {
   3486 	mntinfo4_t		*mi = recovp->rc_mi;
   3487 	nfs4_open_owner_t	*bad_oop;
   3488 	nfs4_lock_owner_t	*bad_lop;
   3489 	vnode_t			*vp;
   3490 	rnode4_t		*rp = NULL;
   3491 	pid_t			pid;
   3492 	nfs4_bseqid_entry_t	*bsep, *tbsep;
   3493 	int			error;
   3494 
   3495 	ASSERT(mi != NULL);
   3496 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
   3497 
   3498 	mutex_enter(&mi->mi_lock);
   3499 	bsep = list_head(&mi->mi_bseqid_list);
   3500 	mutex_exit(&mi->mi_lock);
   3501 
   3502 	/*
   3503 	 * Handle all the bad seqid entries on mi's list.
   3504 	 */
   3505 	while (bsep != NULL) {
   3506 		bad_oop = bsep->bs_oop;
   3507 		bad_lop = bsep->bs_lop;
   3508 		vp = bsep->bs_vp;
   3509 		pid = bsep->bs_pid;
   3510 
   3511 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   3512 		    "recov_bad_seqid: mark oop %p lop %p as bad for "
   3513 		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
   3514 		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
   3515 		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
   3516 		    bad_oop ?  bad_oop->oo_last_good_seqid : 0,
   3517 		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
   3518 		    nfs4_ctags[TAG_NONE].ct_str));
   3519 
   3520 		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
   3521 		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
   3522 		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
   3523 		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
   3524 
   3525 		if (bad_oop) {
   3526 			/* essentially reset the open owner */
   3527 			error = nfs4_start_open_seqid_sync(bad_oop, mi);
   3528 			ASSERT(!error);	/* recov thread always succeeds */
   3529 			bad_oop->oo_name = nfs4_get_new_oo_name();
   3530 			bad_oop->oo_seqid = 0;
   3531 			nfs4_end_open_seqid_sync(bad_oop);
   3532 		}
   3533 
   3534 		if (bad_lop) {
        			/*
        			 * Flag the lock owner and the rnode, then tell
        			 * the owning process that its lock was lost.
        			 */
   3535 			mutex_enter(&bad_lop->lo_lock);
   3536 			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
   3537 			mutex_exit(&bad_lop->lo_lock);
   3538 
   3539 			ASSERT(vp != NULL);
   3540 			rp = VTOR4(vp);
   3541 			mutex_enter(&rp->r_statelock);
   3542 			rp->r_flags |= R4LODANGLERS;
   3543 			mutex_exit(&rp->r_statelock);
   3544 
   3545 			nfs4_send_siglost(pid, mi, vp, TRUE,
   3546 			    0, NFS4ERR_BAD_SEQID);
   3547 		}
   3548 
        		/*
        		 * Unlink this entry and pick up the new list head under
        		 * mi_lock; the entry is freed after the lock is dropped.
        		 */
   3549 		mutex_enter(&mi->mi_lock);
   3550 		list_remove(&mi->mi_bseqid_list, bsep);
   3551 		tbsep = bsep;
   3552 		bsep = list_head(&mi->mi_bseqid_list);
   3553 		mutex_exit(&mi->mi_lock);
   3554 		free_bseqid_rqst(tbsep);
   3555 	}
   3556 
   3557 	mutex_enter(&mi->mi_lock);
   3558 	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
   3559 	mutex_exit(&mi->mi_lock);
   3560 }
   3561