Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * NFS Version 4 state recovery code.
     28  */
     29 
     30 #include <nfs/nfs4_clnt.h>
     31 #include <nfs/nfs4.h>
     32 #include <nfs/rnode4.h>
     33 #include <sys/cmn_err.h>
     34 #include <sys/cred.h>
     35 #include <sys/systm.h>
     36 #include <sys/flock.h>
     37 #include <sys/dnlc.h>
     38 #include <sys/ddi.h>
     39 #include <sys/disp.h>
     40 #include <sys/list.h>
     41 #include <sys/sdt.h>
     42 #include <sys/mount.h>
     43 #include <sys/door.h>
     44 #include <nfs/nfssys.h>
     45 #include <nfs/nfsid_map.h>
     46 #include <nfs/nfs4_idmap_impl.h>
     47 
     48 extern r4hashq_t *rtable4;
     49 
     50 /*
     51  * Information that describes what needs to be done for recovery.  It is
     52  * passed to a client recovery thread as well as passed to various recovery
     53  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
     54  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
     55  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
     56  * lock or open/close request, and it holds reference counts for the
     57  * various objects (vnode, etc.).  The recovery thread also uses flags set
     58  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
     59  * to save the error that originally triggered the recovery event -- will
     60  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
     61  * contains information about the request that got NFS4ERR_BAD_SEQID, and
     62  * it holds reference count for the various objects (vnode, open owner,
     63  * open stream, lock owner).
     64  */
     65 
typedef struct {
	mntinfo4_t *rc_mi;		/* filesystem affected by recovery */
	vnode_t *rc_vp1;		/* held (VN_HOLD) vnode, or NULL */
	vnode_t *rc_vp2;		/* held (VN_HOLD) vnode, or NULL */
	nfs4_recov_t rc_action;		/* recovery action to carry out */
	/* NOTE(review): presumably the stateid tied to the failure; confirm */
	stateid4 rc_stateid;
	bool_t rc_srv_reboot;		/* server has rebooted */
	/* lost lock or open/close request; owns refs on its objects */
	nfs4_lost_rqst_t *rc_lost_rqst;
	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
	int rc_error;			/* saved for mi_error if recovery fails */
	/* request that got NFS4ERR_BAD_SEQID; owns refs on its objects */
	nfs4_bseqid_entry_t *rc_bseqid_rqst;
	vnode_t *rc_moved_vp;		/* moved_vp handed to start_recovery() */
	char *rc_moved_nm;		/* moved_nm handed to start_recovery() */
} recov_info_t;
     80 
/*
 * How long to wait before trying again if there is an error doing
 * recovery, in seconds.
 */

static int recov_err_delay = 1;

/*
 * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
 * errors.  Expressed in seconds.  The default is defined as
 * NFS4ERR_DELAY_TIME, and this variable is initialized in
 * nfs4_subr_init().
 */
time_t nfs4err_delay_time = 0;

/*
 * Tuneable to limit how many times "exempt" ops go OTW (over the wire)
 * after a recovery error.  Exempt op hints are OH_CLOSE,
 * OH_LOCKU, OH_DELEGRETURN.  These previously always went
 * OTW even after rnode was "dead" due to recovery errors.
 *
 * The tuneable below limits the number of times a start_fop
 * invocation will retry the exempt hints.  After the limit
 * is reached, nfs4_start_fop will return an error just like
 * it would for non-exempt op hints.  The limit is consulted in
 * nfs4_check_recov_err().
 */
int nfs4_max_recov_error_retry = 3;

/*
 * Number of seconds the recovery thread should pause before retry when the
 * filesystem has been forcibly unmounted.
 */

int nfs4_unmount_delay = 1;
    114 
#ifdef DEBUG

/*
 * How long to wait (in seconds) between recovery operations on a given
 * file.  Normally zero, but could be set longer for testing purposes.
 */
static int nfs4_recovdelay = 0;

/*
 * Switch that controls whether to go into the debugger when recovery
 * fails.  Zero (the default) leaves the switch off.
 */
static int nfs4_fail_recov_stop = 0;

/*
 * Tuneables to debug client namespace interaction with server
 * mount points:
 *
 *	nfs4_srvmnt_fail_cnt:
 *		number of times EACCES was returned because the client
 *		attempted to cross a server mountpoint
 *
 *	nfs4_srvmnt_debug:
 *		trigger a console printf whenever the client attempts
 *		to cross a server mountpoint
 */
int nfs4_srvmnt_fail_cnt = 0;
int nfs4_srvmnt_debug = 0;
#endif	/* DEBUG */
    144 
    145 extern zone_key_t	nfs4clnt_zone_key;
    146 
    147 /* forward references, in alphabetic order */
    148 static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
    149 	nfs4_error_t *);
    150 static void errs_to_action(recov_info_t *,
    151 	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
    152 	nfs_opnum4, nfs4_bseqid_entry_t *);
    153 static void flush_reinstate(nfs4_lost_rqst_t *);
    154 static void free_milist(mntinfo4_t **, int);
    155 static mntinfo4_t **make_milist(nfs4_server_t *, int *);
    156 static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
    157 	nfs4_recov_state_t *, int, char *);
    158 static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
    159 static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
    160 static void nfs4_recov_thread(recov_info_t *);
    161 static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
    162 static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
    163 static cred_t *pid_to_cr(pid_t);
    164 static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
    165 static void recov_bad_seqid(recov_info_t *);
    166 static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
    167 static void recov_clientid(recov_info_t *, nfs4_server_t *);
    168 static void recov_done(mntinfo4_t *, recov_info_t *);
    169 static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
    170 static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
    171 static void recov_openfiles(recov_info_t *, nfs4_server_t *);
    172 static void recov_stale(mntinfo4_t *, vnode_t *);
    173 static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
    174 static void recov_throttle(recov_info_t *, vnode_t *);
    175 static void relock_skip_pid(locklist_t *, pid_t);
    176 static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
    177 static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
    178 	nfs4_server_t *);
    179 static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
    180 static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
    181 	nfs4_server_t *, vnode_t *, char *);
    182 static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
    183 	vnode_t *);
    184 static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
    185 
    186 /*
    187  * Return non-zero if the given errno, status, and rpc status codes
    188  * in the nfs4_error_t indicate that client recovery is needed.
    189  * "stateful" indicates whether the call that got the error establishes or
    190  * removes state on the server (open, close, lock, unlock, delegreturn).
    191  */
    192 
    193 int
    194 nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
    195 {
    196 	int recov = 0;
    197 	mntinfo4_t *mi;
    198 
    199 	/*
    200 	 * Try failover if the error values justify it and if
    201 	 * it's a failover mount.  Don't try if the mount is in
    202 	 * progress, failures are handled explicitly by nfs4rootvp.
    203 	 */
    204 	if (nfs4_try_failover(ep)) {
    205 		mi = VFTOMI4(vfsp);
    206 		mutex_enter(&mi->mi_lock);
    207 		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
    208 		mutex_exit(&mi->mi_lock);
    209 		if (recov)
    210 			return (recov);
    211 	}
    212 
    213 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
    214 		/*
    215 		 * The server may have gotten the request, so for stateful
    216 		 * ops we need to resynchronize and possibly back out the
    217 		 * op.
    218 		 */
    219 		return (stateful);
    220 	}
    221 	if (ep->error != 0)
    222 		return (0);
    223 
    224 	/* stat values are listed alphabetically */
    225 	/*
    226 	 * There are two lists here: the errors for which we have code, and
    227 	 * the errors for which we plan to have code before FCS.  For the
    228 	 * second list, print a warning message but don't attempt recovery.
    229 	 */
    230 	switch (ep->stat) {
    231 	case NFS4ERR_BADHANDLE:
    232 	case NFS4ERR_BAD_SEQID:
    233 	case NFS4ERR_BAD_STATEID:
    234 	case NFS4ERR_DELAY:
    235 	case NFS4ERR_EXPIRED:
    236 	case NFS4ERR_FHEXPIRED:
    237 	case NFS4ERR_GRACE:
    238 	case NFS4ERR_OLD_STATEID:
    239 	case NFS4ERR_RESOURCE:
    240 	case NFS4ERR_STALE_CLIENTID:
    241 	case NFS4ERR_STALE_STATEID:
    242 	case NFS4ERR_WRONGSEC:
    243 	case NFS4ERR_STALE:
    244 		recov = 1;
    245 		break;
    246 #ifdef DEBUG
    247 	case NFS4ERR_LEASE_MOVED:
    248 	case NFS4ERR_MOVED:
    249 		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
    250 		    CE_WARN, "!Can't yet recover from NFS status %d",
    251 		    ep->stat);
    252 		break;
    253 #endif
    254 	}
    255 
    256 	return (recov);
    257 }
    258 
    259 /*
    260  * Some operations such as DELEGRETURN want to avoid invoking
    261  * recovery actions that will only mark the file dead.  If
    262  * better handlers are invoked for any of these errors, this
    263  * routine should be modified.
    264  */
    265 int
    266 nfs4_recov_marks_dead(nfsstat4 status)
    267 {
    268 	if (status == NFS4ERR_BAD_SEQID ||
    269 	    status == NFS4ERR_EXPIRED ||
    270 	    status == NFS4ERR_BAD_STATEID ||
    271 	    status == NFS4ERR_OLD_STATEID)
    272 		return (1);
    273 	return (0);
    274 }
    275 
    276 /*
    277  * Transfer the state recovery information in recovp to mi's resend queue,
    278  * and mark mi as having a lost state request.
    279  */
    280 static void
    281 nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
    282 {
    283 	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
    284 
    285 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    286 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    287 
    288 	ASSERT(lrp != NULL && lrp->lr_op != 0);
    289 
    290 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
    291 	    "nfs4_enqueue_lost_rqst %p, op %d",
    292 	    (void *)lrp, lrp->lr_op));
    293 
    294 	mutex_enter(&mi->mi_lock);
    295 	mi->mi_recovflags |= MI4R_LOST_STATE;
    296 	if (lrp->lr_putfirst)
    297 		list_insert_head(&mi->mi_lost_state, lrp);
    298 	else
    299 		list_insert_tail(&mi->mi_lost_state, lrp);
    300 	recovp->rc_lost_rqst = NULL;
    301 	mutex_exit(&mi->mi_lock);
    302 
    303 	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
    304 	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
    305 }
    306 
    307 /*
    308  * Transfer the bad seqid recovery information in recovp to mi's
    309  * bad seqid queue, and mark mi as having a bad seqid request.
    310  */
    311 void
    312 enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
    313 {
    314 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    315 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    316 	ASSERT(recovp->rc_bseqid_rqst != NULL);
    317 
    318 	mutex_enter(&mi->mi_lock);
    319 	mi->mi_recovflags |= MI4R_BAD_SEQID;
    320 	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
    321 	recovp->rc_bseqid_rqst = NULL;
    322 	mutex_exit(&mi->mi_lock);
    323 }
    324 
    325 /*
    326  * Initiate recovery.
    327  *
    328  * The nfs4_error_t contains the return codes that triggered a recovery
    329  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
    330  * being operated on.  vp1 and vp2 may be NULL.
    331  *
    332  * Multiple calls are okay.  If recovery is already underway, the call
    333  * updates the information about what state needs recovery but does not
    334  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
    335  * for proper synchronization with any recovery thread.
    336  *
    337  * This will return TRUE if recovery was aborted, and FALSE otherwise.
    338  */
    339 bool_t
    340 nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
    341     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
    342     nfs4_bseqid_entry_t *bsep, vnode_t *moved_vp, char *moved_nm)
    343 {
    344 	recov_info_t *recovp;
    345 	nfs4_server_t *sp;
    346 	bool_t abort = FALSE;
    347 	bool_t gone = FALSE;
    348 
    349 	ASSERT(nfs_zone() == mi->mi_zone);
    350 	mutex_enter(&mi->mi_lock);
    351 	/*
    352 	 * If there is lost state, we need to kick off recovery even if the
    353 	 * filesystem has been unmounted or the zone is shutting down.
    354 	 */
    355 	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
    356 	if (gone) {
    357 		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
    358 		if (ep->error == EIO && lost_rqstp == NULL) {
    359 			/* failed due to forced unmount, no new lost state */
    360 			abort = TRUE;
    361 		}
    362 		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
    363 		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
    364 			/* some other failure, no existing lost state */
    365 			abort = TRUE;
    366 		}
    367 		if (abort) {
    368 			mutex_exit(&mi->mi_lock);
    369 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    370 			    "nfs4_start_recovery: fs unmounted"));
    371 			return (TRUE);
    372 		}
    373 	}
    374 	mi->mi_in_recovery++;
    375 	mutex_exit(&mi->mi_lock);
    376 
    377 	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
    378 	recovp->rc_orig_errors = *ep;
    379 	sp = find_nfs4_server(mi);
    380 	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
    381 	if (sp != NULL)
    382 		mutex_exit(&sp->s_lock);
    383 	start_recovery(recovp, mi, vp1, vp2, sp, moved_vp, moved_nm);
    384 	if (sp != NULL)
    385 		nfs4_server_rele(sp);
    386 	return (FALSE);
    387 }
    388 
    389 /*
    390  * Internal version of nfs4_start_recovery.  The difference is that the
    391  * caller specifies the recovery action, rather than the errors leading to
    392  * recovery.
    393  */
    394 static void
    395 start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
    396     vnode_t *vp1, vnode_t *vp2)
    397 {
    398 	recov_info_t *recovp;
    399 
    400 	ASSERT(nfs_zone() == mi->mi_zone);
    401 	mutex_enter(&mi->mi_lock);
    402 	mi->mi_in_recovery++;
    403 	mutex_exit(&mi->mi_lock);
    404 
    405 	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
    406 	recovp->rc_action = what;
    407 	recovp->rc_srv_reboot = reboot;
    408 	recovp->rc_error = EIO;
    409 	start_recovery(recovp, mi, vp1, vp2, NULL, NULL, NULL);
    410 }
    411 
/*
 * Common back end for nfs4_start_recovery() and start_recovery_action().
 *
 * Acts on recovp->rc_action.  Some actions only record what needs to be
 * done (by setting bits in mi_recovflags or by queueing the lost or bad
 * seqid request on mi) and then make sure a recovery thread is running
 * for mi.  The other actions are handled inline and leave through
 * out_no_thread.
 *
 * Both callers increment mi_in_recovery before calling here.  When no
 * thread is created, this routine decrements it again, releases the vfs
 * and mntinfo4 holds it took, and frees recovp.  When a thread is created,
 * recovp (with holds on rc_vp1 and rc_vp2) is passed to
 * nfs4_recov_thread(); presumably that thread then owns the holds and
 * recovp -- confirm against nfs4_recov_thread().
 *
 * vp1, vp2 and sp may be NULL.
 */
static void
start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp,
    vnode_t *moved_vp, char *moved_nm)
{
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: mi %p, what %s", (void*)mi,
	    nfs4_recov_action_to_str(recovp->rc_action)));

	/*
	 * Bump the reference on the vfs so that we can pass it to the
	 * recovery thread.
	 */
	VFS_HOLD(mi->mi_vfsp);
	MI4_HOLD(mi);
	/* NR_STALE jumps back here after switching rc_action to NR_FAILOVER */
again:
	switch (recovp->rc_action) {
	case NR_FAILOVER:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		/* no second entry on the server list: nothing to fail over to */
		if (mi->mi_servers->sv_next == NULL)
			goto out_no_thread;
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		mutex_exit(&mi->mi_lock);

		if (recovp->rc_lost_rqst != NULL)
			nfs4_enqueue_lost_rqst(recovp, mi);
		break;

	case NR_CLIENTID:
		/*
		 * If the filesystem has been unmounted, punt.
		 */
		if (sp == NULL)
			goto out_no_thread;

		/*
		 * If nobody else is working on the clientid, mark the
		 * clientid as being no longer set.  Then mark the specific
		 * filesystem being worked on.
		 */
		if (!nfs4_server_in_recovery(sp)) {
			mutex_enter(&sp->s_lock);
			sp->s_flags &= ~N4S_CLIENTID_SET;
			mutex_exit(&sp->s_lock);
		}
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_OPENFILES:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_REOPEN_FILES;
		if (recovp->rc_srv_reboot)
			mi->mi_recovflags |= MI4R_SRV_REBOOT;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_WRONGSEC:
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_SECINFO;
		mutex_exit(&mi->mi_lock);
		break;

	case NR_EXPIRED:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
		goto out_no_thread;	/* no further recovery possible */

	case NR_BAD_STATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_FHEXPIRED:
	case NR_BADHANDLE:
		if (vp1 != NULL)
			recov_throttle(recovp, vp1);
		if (vp2 != NULL)
			recov_throttle(recovp, vp2);
		/*
		 * Recover the filehandle now, rather than using a
		 * separate thread.  We can do this because filehandle
		 * recovery is independent of any other state, and because
		 * we know that we are not competing with the recovery
		 * thread at this time.  recov_filehandle will deal with
		 * threads that are competing to recover this filehandle.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
		if (vp1 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp1);
		if (vp2 != NULL)
			recov_filehandle(recovp->rc_action, mi, vp2);
		goto out_no_thread;	/* no further recovery needed */

	case NR_STALE:
		/*
		 * NFS4ERR_STALE handling
		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
		 * indicate that we can and should failover.
		 */
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

		if (vp1 != NULL)
			recov_stale(mi, vp1);
		if (vp2 != NULL)
			recov_stale(mi, vp2);
		mutex_enter(&mi->mi_lock);
		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
			mutex_exit(&mi->mi_lock);
			goto out_no_thread;
		}
		mutex_exit(&mi->mi_lock);
		/* failover was requested: redo the switch as NR_FAILOVER */
		recovp->rc_action = NR_FAILOVER;
		goto again;

	case NR_BAD_SEQID:
		/* a saved request is queued for the recovery thread */
		if (recovp->rc_bseqid_rqst) {
			enqueue_bseqid_rqst(recovp, mi);
			break;
		}

		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
		goto out_no_thread; /* no further recovery possible */

	case NR_OLDSTATEID:
		if (vp1 != NULL)
			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
		if (vp2 != NULL)
			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
		goto out_no_thread;	/* no further recovery possible */

	case NR_GRACE:
		nfs4_set_grace_wait(mi);
		goto out_no_thread; /* no further action required for GRACE */

	case NR_DELAY:
		/* only vp1 is marked for the delay wait */
		if (vp1)
			nfs4_set_delay_wait(vp1);
		goto out_no_thread; /* no further action required for DELAY */

	case NR_LOST_STATE_RQST:
	case NR_LOST_LOCK:
		nfs4_enqueue_lost_rqst(recovp, mi);
		break;
	default:
		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		goto out_no_thread;
	}

	/*
	 * If either file recently went through the same recovery, wait
	 * awhile.  This is in case there is some sort of bug; we might not
	 * be able to recover properly, but at least we won't bombard the
	 * server with calls, and we won't tie up the client.
	 */
	if (vp1 != NULL)
		recov_throttle(recovp, vp1);
	if (vp2 != NULL)
		recov_throttle(recovp, vp2);

	/*
	 * If there's already a recovery thread, don't start another one.
	 * The flag is tested and set under mi_lock, so only one caller
	 * gets to create the thread.
	 */

	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_RECOV_ACTIV) {
		mutex_exit(&mi->mi_lock);
		goto out_no_thread;
	}
	mi->mi_flags |= MI4_RECOV_ACTIV;
	mutex_exit(&mi->mi_lock);
	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
	    "start_recovery: starting new thread for mi %p", (void*)mi));

	/* fill in what the thread needs, holding each vnode passed along */
	recovp->rc_mi = mi;
	recovp->rc_vp1 = vp1;
	if (vp1 != NULL) {
		ASSERT(VTOMI4(vp1) == mi);
		VN_HOLD(recovp->rc_vp1);
	}
	recovp->rc_vp2 = vp2;
	if (vp2 != NULL) {
		ASSERT(VTOMI4(vp2) == mi);
		VN_HOLD(recovp->rc_vp2);
	}
	recovp->rc_moved_vp = moved_vp;
	recovp->rc_moved_nm = moved_nm;

	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
	    minclsyspri);
	return;

	/* not reached by thread creating call */
out_no_thread:
	/*
	 * Undo the caller's mi_in_recovery increment and wake up anybody
	 * waiting on mi_cv_in_recov once the count drops to zero.
	 */
	mutex_enter(&mi->mi_lock);
	mi->mi_in_recovery--;
	if (mi->mi_in_recovery == 0)
		cv_broadcast(&mi->mi_cv_in_recov);
	mutex_exit(&mi->mi_lock);

	/* drop the holds taken at the top of this function */
	VFS_RELE(mi->mi_vfsp);
	MI4_RELE(mi);
	/*
	 * Free up resources that were allocated for us.
	 */
	kmem_free(recovp, sizeof (recov_info_t));
}
    641 
    642 static int
    643 nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
    644     nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
    645 {
    646 	rnode4_t *rp;
    647 	int error = 0;
    648 	int exempt;
    649 
    650 	if (vp == NULL)
    651 		return (0);
    652 
    653 	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
    654 	rp = VTOR4(vp);
    655 	mutex_enter(&rp->r_statelock);
    656 
    657 	/*
    658 	 * If there was a recovery error, then allow op hints "exempt" from
    659 	 * recov errors to retry (currently 3 times).  Either r_error or
    660 	 * EIO is returned for non-exempt op hints.
    661 	 */
    662 	if (rp->r_flags & R4RECOVERR) {
    663 		if (exempt && rsp->rs_num_retry_despite_err <=
    664 		    nfs4_max_recov_error_retry) {
    665 
    666 			/*
    667 			 * Check to make sure that we haven't already inc'd
    668 			 * rs_num_retry_despite_err for current nfs4_start_fop
    669 			 * instance.  We don't want to double inc (if we were
    670 			 * called with vp2, then the vp1 call could have
    671 			 * already incremented.
    672 			 */
    673 			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
    674 				rsp->rs_num_retry_despite_err++;
    675 
    676 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    677 			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
    678 			    (void *)vp, rsp->rs_num_retry_despite_err));
    679 		} else {
    680 			error = (rp->r_error ? rp->r_error : EIO);
    681 			/*
    682 			 * An ESTALE error on a non-regular file is not
    683 			 * "sticky".  Return the ESTALE error once, but
    684 			 * clear the condition to allow future operations
    685 			 * to go OTW.  This will allow the client to
    686 			 * recover if the server has merely unshared then
    687 			 * re-shared the file system.  For regular files,
    688 			 * the unshare has destroyed the open state at the
    689 			 * server and we aren't willing to do a reopen (yet).
    690 			 */
    691 			if (error == ESTALE && vp->v_type != VREG) {
    692 				rp->r_flags &=
    693 				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
    694 				rp->r_error = 0;
    695 				error = ESTALE;
    696 			}
    697 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    698 			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
    699 			    str, (void *)vp,
    700 			    rsp->rs_num_retry_despite_err, error));
    701 		}
    702 	}
    703 
    704 	mutex_exit(&rp->r_statelock);
    705 	return (error);
    706 }
    707 
    708 /*
    709  * Initial setup code that every operation should call if it might invoke
    710  * client recovery.  Can block waiting for recovery to finish on a
    711  * filesystem.  Either vnode ptr can be NULL.
    712  *
    713  * Returns 0 if there are no outstanding errors.  Can return an
    714  * errno value under various circumstances (e.g., failed recovery, or
    715  * interrupted while waiting for recovery to finish).
    716  *
    717  * There must be a corresponding call to nfs4_end_op() to free up any locks
    718  * or resources allocated by this call (assuming this call succeeded),
    719  * using the same rsp that's passed in here.
    720  *
    721  * The open and lock seqid synchronization must be stopped before calling this
    722  * function, as it could lead to deadlock when trying to reopen a file or
    723  * reclaim a lock.  The synchronization is obtained with calls to:
    724  *   nfs4_start_open_seqid_sync()
    725  *   nfs4_start_lock_seqid_sync()
    726  *
    727  * *startrecovp is set TRUE if the caller should not bother with the
    728  * over-the-wire call, and just initiate recovery for the given request.
    729  * This is typically used for state-releasing ops if the filesystem has
    730  * been forcibly unmounted.  startrecovp may be NULL for
    731  * non-state-releasing ops.
    732  */
    733 
    734 int
    735 nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    736     nfs4_recov_state_t *rsp, bool_t *startrecovp)
    737 {
    738 	int error = 0, rerr_cnt;
    739 	nfs4_server_t *sp = NULL;
    740 	nfs4_server_t *tsp;
    741 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
    742 	uint_t droplock_cnt;
    743 #ifdef DEBUG
    744 	void *fop_caller;
    745 #endif
    746 
    747 	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
    748 	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
    749 
    750 #ifdef	DEBUG
    751 	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
    752 		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
    753 		    fop_caller);
    754 	}
    755 	(void) tsd_set(nfs4_tsd_key, caller());
    756 #endif
    757 
    758 	rsp->rs_sp = NULL;
    759 	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
    760 	rerr_cnt = rsp->rs_num_retry_despite_err;
    761 
    762 	/*
    763 	 * Process the items that may delay() based on server response
    764 	 */
    765 	error = nfs4_wait_for_grace(mi, rsp);
    766 	if (error)
    767 		goto out;
    768 
    769 	if (vp1 != NULL) {
    770 		error = nfs4_wait_for_delay(vp1, rsp);
    771 		if (error)
    772 			goto out;
    773 	}
    774 
    775 	/* Wait for a delegation recall to complete. */
    776 
    777 	error = wait_for_recall(vp1, vp2, op, rsp);
    778 	if (error)
    779 		goto out;
    780 
    781 	/*
    782 	 * Wait for any current recovery actions to finish.  Note that a
    783 	 * recovery thread can still start up after wait_for_recovery()
    784 	 * finishes.  We don't block out recovery operations until we
    785 	 * acquire s_recovlock and mi_recovlock.
    786 	 */
    787 	error = wait_for_recovery(mi, op);
    788 	if (error)
    789 		goto out;
    790 
    791 	/*
    792 	 * Check to see if the rnode is already marked with a
    793 	 * recovery error.  If so, return it immediately.  But
    794 	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
    795 	 * clean up state on the server.
    796 	 */
    797 
    798 	if (vp1 != NULL) {
    799 		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
    800 			goto out;
    801 		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
    802 	}
    803 
    804 	if (vp2 != NULL) {
    805 		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
    806 			goto out;
    807 		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
    808 	}
    809 
    810 	/*
    811 	 * The lock order calls for us to acquire s_recovlock before
    812 	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
    813 	 * prevent races with the failover/migration code).  So acquire
    814 	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
    815 	 * s_recovlock and mi_recovlock, then verify that sp is still the
    816 	 * right object.  XXX Can we find a simpler way to deal with this?
    817 	 */
    818 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
    819 	    mi->mi_flags & MI4_INT)) {
    820 		error = EINTR;
    821 		goto out;
    822 	}
    823 get_sp:
    824 	sp = find_nfs4_server(mi);
    825 	if (sp != NULL) {
    826 		sp->s_otw_call_count++;
    827 		mutex_exit(&sp->s_lock);
    828 		droplock_cnt = mi->mi_srvset_cnt;
    829 	}
    830 	nfs_rw_exit(&mi->mi_recovlock);
    831 
    832 	if (sp != NULL) {
    833 		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
    834 		    mi->mi_flags & MI4_INT)) {
    835 			error = EINTR;
    836 			goto out;
    837 		}
    838 	}
    839 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
    840 	    mi->mi_flags & MI4_INT)) {
    841 		if (sp != NULL)
    842 			nfs_rw_exit(&sp->s_recovlock);
    843 		error = EINTR;
    844 		goto out;
    845 	}
    846 	/*
    847 	 * If the mntinfo4_t hasn't changed nfs4_server_t's then
    848 	 * there's no point in double checking to make sure it
    849 	 * has switched.
    850 	 */
    851 	if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) {
    852 		tsp = find_nfs4_server(mi);
    853 		if (tsp != sp) {
    854 			/* try again */
    855 			if (tsp != NULL) {
    856 				mutex_exit(&tsp->s_lock);
    857 				nfs4_server_rele(tsp);
    858 				tsp = NULL;
    859 			}
    860 			if (sp != NULL) {
    861 				nfs_rw_exit(&sp->s_recovlock);
    862 				mutex_enter(&sp->s_lock);
    863 				sp->s_otw_call_count--;
    864 				mutex_exit(&sp->s_lock);
    865 				nfs4_server_rele(sp);
    866 				sp = NULL;
    867 			}
    868 			goto get_sp;
    869 		} else {
    870 			if (tsp != NULL) {
    871 				mutex_exit(&tsp->s_lock);
    872 				nfs4_server_rele(tsp);
    873 				tsp = NULL;
    874 			}
    875 		}
    876 	}
    877 
    878 	if (sp != NULL) {
    879 		rsp->rs_sp = sp;
    880 	}
    881 
    882 	/*
    883 	 * If the filesystem uses volatile filehandles, obtain a lock so
    884 	 * that we synchronize with renames.  Exception: mount operations
    885 	 * can change mi_fh_expire_type, which could be a problem, since
    886 	 * the end_op code needs to be consistent with the start_op code
    887 	 * about mi_rename_lock.  Since mounts don't compete with renames,
    888 	 * it's simpler to just not acquire the rename lock for mounts.
    889 	 */
    890 	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
    891 		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
    892 		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
    893 		    mi->mi_flags & MI4_INT)) {
    894 			nfs_rw_exit(&mi->mi_recovlock);
    895 			if (sp != NULL)
    896 				nfs_rw_exit(&sp->s_recovlock);
    897 			error = EINTR;
    898 			goto out;
    899 		}
    900 		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
    901 	}
    902 
    903 	if (OH_IS_STATE_RELE(op)) {
    904 		/*
    905 		 * For forced unmount, letting the request proceed will
    906 		 * almost always delay response to the user, so hand it off
    907 		 * to the recovery thread.  For exiting lwp's, we don't
    908 		 * have a good way to tell if the request will hang.  We
    909 		 * generally want processes to handle their own requests so
    910 		 * that they can be done in parallel, but if there is
    911 		 * already a recovery thread, hand the request off to it.
    912 		 * This will improve user response at no cost to overall
    913 		 * system throughput.  For zone shutdown, we'd prefer
    914 		 * the recovery thread to handle this as well.
    915 		 */
    916 		ASSERT(startrecovp != NULL);
    917 		mutex_enter(&mi->mi_lock);
    918 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
    919 			*startrecovp = TRUE;
    920 		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
    921 		    (mi->mi_flags & MI4_RECOV_ACTIV))
    922 			*startrecovp = TRUE;
    923 		else
    924 			*startrecovp = FALSE;
    925 		mutex_exit(&mi->mi_lock);
    926 	} else
    927 		if (startrecovp != NULL)
    928 			*startrecovp = FALSE;
    929 
    930 	ASSERT(error == 0);
    931 	return (error);
    932 
    933 out:
    934 	ASSERT(error != 0);
    935 	if (sp != NULL) {
    936 		mutex_enter(&sp->s_lock);
    937 		sp->s_otw_call_count--;
    938 		mutex_exit(&sp->s_lock);
    939 		nfs4_server_rele(sp);
    940 		rsp->rs_sp = NULL;
    941 	}
    942 	nfs4_end_op_recall(vp1, vp2, rsp);
    943 
    944 #ifdef	DEBUG
    945 	(void) tsd_set(nfs4_tsd_key, NULL);
    946 #endif
    947 	return (error);
    948 }
    949 
    950 /*
    951  * It is up to the caller to determine if rsp->rs_sp being NULL
    952  * is detrimental or not.
    953  */
    954 int
    955 nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    956     nfs4_recov_state_t *rsp)
    957 {
    958 	ASSERT(rsp->rs_num_retry_despite_err == 0);
    959 	rsp->rs_num_retry_despite_err = 0;
    960 	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
    961 }
    962 
    963 /*
    964  * Release any resources acquired by nfs4_start_op().
    965  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
    966  *
    967  * The operation hint is used to avoid a deadlock by bypassing delegation
    968  * return logic for writes, which are done while returning a delegation.
    969  */
    970 
    971 void
    972 nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    973     nfs4_recov_state_t *rsp, bool_t needs_recov)
    974 {
    975 	nfs4_server_t *sp = rsp->rs_sp;
    976 	rnode4_t *rp = NULL;
    977 
    978 #ifdef	lint
    979 	/*
    980 	 * The op hint isn't used any more, but might be in
    981 	 * the future.
    982 	 */
    983 	op = op;
    984 #endif
    985 
    986 #ifdef	DEBUG
    987 	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
    988 	(void) tsd_set(nfs4_tsd_key, NULL);
    989 #endif
    990 
    991 	nfs4_end_op_recall(vp1, vp2, rsp);
    992 
    993 	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
    994 		nfs_rw_exit(&mi->mi_rename_lock);
    995 
    996 	if (!needs_recov) {
    997 		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
    998 			/* may need to clear the delay interval */
    999 			if (vp1 != NULL) {
   1000 				rp = VTOR4(vp1);
   1001 				mutex_enter(&rp->r_statelock);
   1002 				rp->r_delay_interval = 0;
   1003 				mutex_exit(&rp->r_statelock);
   1004 			}
   1005 		}
   1006 		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
   1007 	}
   1008 
   1009 	/*
   1010 	 * If the corresponding nfs4_start_op() found a sp,
   1011 	 * then there must still be a sp.
   1012 	 */
   1013 	if (sp != NULL) {
   1014 		nfs_rw_exit(&mi->mi_recovlock);
   1015 		nfs_rw_exit(&sp->s_recovlock);
   1016 		mutex_enter(&sp->s_lock);
   1017 		sp->s_otw_call_count--;
   1018 		cv_broadcast(&sp->s_cv_otw_count);
   1019 		mutex_exit(&sp->s_lock);
   1020 		nfs4_server_rele(sp);
   1021 	} else {
   1022 		nfs_rw_exit(&mi->mi_recovlock);
   1023 	}
   1024 }
   1025 
   1026 void
   1027 nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
   1028     nfs4_recov_state_t *rsp, bool_t needrecov)
   1029 {
   1030 	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
   1031 }
   1032 
   1033 /*
   1034  * If the filesystem is going through client recovery, block until
   1035  * finished.
   1036  * Exceptions:
   1037  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
   1038  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
   1039  *
   1040  * Return value:
   1041  * - 0 if no errors
   1042  * - EINTR if the call was interrupted
   1043  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
   1044  *   op)
   1045  * - the errno value from the recovery thread, if recovery failed
   1046  */
   1047 
   1048 static int
   1049 wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
   1050 {
   1051 	int error = 0;
   1052 
   1053 	mutex_enter(&mi->mi_lock);
   1054 
   1055 	while (mi->mi_recovflags != 0) {
   1056 		klwp_t *lwp = ttolwp(curthread);
   1057 
   1058 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
   1059 		    (mi->mi_flags & MI4_RECOV_FAIL))
   1060 			break;
   1061 		if (OH_IS_STATE_RELE(op_hint) &&
   1062 		    (curthread->t_proc_flag & TP_LWPEXIT))
   1063 			break;
   1064 
   1065 		if (lwp != NULL)
   1066 			lwp->lwp_nostop++;
   1067 		/* XXX - use different cv? */
   1068 		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
   1069 			error = EINTR;
   1070 			if (lwp != NULL)
   1071 				lwp->lwp_nostop--;
   1072 			break;
   1073 		}
   1074 		if (lwp != NULL)
   1075 			lwp->lwp_nostop--;
   1076 	}
   1077 
   1078 	if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
   1079 	    !OH_IS_STATE_RELE(op_hint)) {
   1080 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1081 		    "wait_for_recovery: forced unmount"));
   1082 		error = EIO;
   1083 	} else if (mi->mi_flags & MI4_RECOV_FAIL) {
   1084 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1085 		    "wait_for_recovery: fail since RECOV FAIL"));
   1086 		error = mi->mi_error;
   1087 	}
   1088 
   1089 	mutex_exit(&mi->mi_lock);
   1090 
   1091 	return (error);
   1092 }
   1093 
   1094 /*
   1095  * If the client received NFS4ERR_GRACE for this particular mount,
   1096  * the client blocks here until it is time to try again.
   1097  *
   1098  * Return value:
   1099  * - 0 if wait was successful
   1100  * - EINTR if the call was interrupted
   1101  */
   1102 
   1103 int
   1104 nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
   1105 {
   1106 	int error = 0;
   1107 	time_t curtime, time_to_wait;
   1108 
   1109 	/* do a unprotected check to reduce mi_lock contention */
   1110 	if (mi->mi_grace_wait != 0) {
   1111 		mutex_enter(&mi->mi_lock);
   1112 
   1113 		if (mi->mi_grace_wait != 0) {
   1114 			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
   1115 				rsp->rs_flags |= NFS4_RS_GRACE_MSG;
   1116 
   1117 			curtime = gethrestime_sec();
   1118 
   1119 			if (curtime < mi->mi_grace_wait) {
   1120 
   1121 				time_to_wait = mi->mi_grace_wait - curtime;
   1122 
   1123 				mutex_exit(&mi->mi_lock);
   1124 
   1125 				delay(SEC_TO_TICK(time_to_wait));
   1126 
   1127 				curtime = gethrestime_sec();
   1128 
   1129 				mutex_enter(&mi->mi_lock);
   1130 
   1131 				if (curtime >= mi->mi_grace_wait)
   1132 					mi->mi_grace_wait = 0;
   1133 			} else {
   1134 				mi->mi_grace_wait = 0;
   1135 			}
   1136 		}
   1137 		mutex_exit(&mi->mi_lock);
   1138 	}
   1139 
   1140 	return (error);
   1141 }
   1142 
   1143 /*
   1144  * If the client received NFS4ERR_DELAY for an operation on a vnode,
   1145  * the client blocks here until it is time to try again.
   1146  *
   1147  * Return value:
   1148  * - 0 if wait was successful
   1149  * - EINTR if the call was interrupted
   1150  */
   1151 
   1152 int
   1153 nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
   1154 {
   1155 	int error = 0;
   1156 	time_t curtime, time_to_wait;
   1157 	rnode4_t *rp;
   1158 
   1159 	ASSERT(vp != NULL);
   1160 
   1161 	rp = VTOR4(vp);
   1162 
   1163 	/* do a unprotected check to reduce r_statelock contention */
   1164 	if (rp->r_delay_wait != 0) {
   1165 		mutex_enter(&rp->r_statelock);
   1166 
   1167 		if (rp->r_delay_wait != 0) {
   1168 
   1169 			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
   1170 				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
   1171 				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
   1172 			}
   1173 
   1174 			curtime = gethrestime_sec();
   1175 
   1176 			if (curtime < rp->r_delay_wait) {
   1177 
   1178 				time_to_wait = rp->r_delay_wait - curtime;
   1179 
   1180 				mutex_exit(&rp->r_statelock);
   1181 
   1182 				delay(SEC_TO_TICK(time_to_wait));
   1183 
   1184 				curtime = gethrestime_sec();
   1185 
   1186 				mutex_enter(&rp->r_statelock);
   1187 
   1188 				if (curtime >= rp->r_delay_wait)
   1189 					rp->r_delay_wait = 0;
   1190 			} else {
   1191 				rp->r_delay_wait = 0;
   1192 			}
   1193 		}
   1194 		mutex_exit(&rp->r_statelock);
   1195 	}
   1196 
   1197 	return (error);
   1198 }
   1199 
   1200 /*
   1201  * The recovery thread.
   1202  */
   1203 
   1204 static void
   1205 nfs4_recov_thread(recov_info_t *recovp)
   1206 {
   1207 	mntinfo4_t *mi = recovp->rc_mi;
   1208 	nfs4_server_t *sp;
   1209 	int done = 0, error = 0;
   1210 	bool_t recov_fail = FALSE;
   1211 	callb_cpr_t cpr_info;
   1212 	kmutex_t cpr_lock;
   1213 
   1214 	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
   1215 	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
   1216 	    0, 0);
   1217 
   1218 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
   1219 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");
   1220 
   1221 	mutex_enter(&mi->mi_lock);
   1222 	mi->mi_recovthread = curthread;
   1223 	mutex_exit(&mi->mi_lock);
   1224 
   1225 	/*
   1226 	 * We don't really need protection here against failover or
   1227 	 * migration, since the current thread is the one that would make
   1228 	 * any changes, but hold mi_recovlock anyway for completeness (and
   1229 	 * to satisfy any ASSERTs).
   1230 	 */
   1231 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
   1232 	sp = find_nfs4_server(mi);
   1233 	if (sp != NULL)
   1234 		mutex_exit(&sp->s_lock);
   1235 	nfs_rw_exit(&mi->mi_recovlock);
   1236 
   1237 	/*
   1238 	 * Do any necessary recovery, based on the information in recovp
   1239 	 * and any recovery flags.
   1240 	 */
   1241 
   1242 	do {
   1243 		mutex_enter(&mi->mi_lock);
   1244 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
   1245 			bool_t activesrv;
   1246 
   1247 			NFS4_DEBUG(nfs4_client_recov_debug &&
   1248 			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
   1249 			    "nfs4_recov_thread: file system has been "
   1250 			    "unmounted"));
   1251 			NFS4_DEBUG(nfs4_client_recov_debug &&
   1252 			    zone_status_get(curproc->p_zone) >=
   1253 			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
   1254 			    "nfs4_recov_thread: zone shutting down"));
   1255 			/*
   1256 			 * If the server has lost its state for us and
   1257 			 * the filesystem is unmounted, then the filesystem
   1258 			 * can be tossed, even if there are lost lock or
   1259 			 * lost state calls in the recovery queue.
   1260 			 */
   1261 			if (mi->mi_recovflags &
   1262 			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
   1263 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1264 				"nfs4_recov_thread: bailing out"));
   1265 				mi->mi_flags |= MI4_RECOV_FAIL;
   1266 				mi->mi_error = recovp->rc_error;
   1267 				recov_fail = TRUE;
   1268 			}
   1269 			/*
   1270 			 * We don't know if the server has any state for
   1271 			 * us, and the filesystem has been unmounted.  If
   1272 			 * there are "lost state" recovery items, keep
   1273 			 * trying to process them until there are no more
   1274 			 * mounted filesystems for the server.  Otherwise,
   1275 			 * bail out.  The reason we don't mark the
   1276 			 * filesystem as failing recovery is in case we
   1277 			 * have to do "lost state" recovery later (e.g., a
   1278 			 * user process exits).
   1279 			 */
   1280 			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
   1281 				done = 1;
   1282 				mutex_exit(&mi->mi_lock);
   1283 				break;
   1284 			}
   1285 			mutex_exit(&mi->mi_lock);
   1286 
   1287 			if (sp == NULL)
   1288 				activesrv = FALSE;
   1289 			else {
   1290 				mutex_enter(&sp->s_lock);
   1291 				activesrv = nfs4_fs_active(sp);
   1292 			}
   1293 			if (!activesrv) {
   1294 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1295 				    "no active fs for server %p",
   1296 				    (void *)sp));
   1297 				mutex_enter(&mi->mi_lock);
   1298 				mi->mi_flags |= MI4_RECOV_FAIL;
   1299 				mi->mi_error = recovp->rc_error;
   1300 				mutex_exit(&mi->mi_lock);
   1301 				recov_fail = TRUE;
   1302 				if (sp != NULL) {
   1303 					/*
   1304 					 * Mark the server instance as
   1305 					 * dead, so that nobody will attach
   1306 					 * a new filesystem.
   1307 					 */
   1308 					nfs4_mark_srv_dead(sp);
   1309 				}
   1310 			}
   1311 			if (sp != NULL)
   1312 				mutex_exit(&sp->s_lock);
   1313 		} else {
   1314 			mutex_exit(&mi->mi_lock);
   1315 		}
   1316 
   1317 		/*
   1318 		 * Check if we need to select a new server for a
   1319 		 * failover.  Choosing a new server will force at
   1320 		 * least a check of the clientid.
   1321 		 */
   1322 		mutex_enter(&mi->mi_lock);
   1323 		if (!recov_fail &&
   1324 		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
   1325 			mutex_exit(&mi->mi_lock);
   1326 			recov_newserver(recovp, &sp, &recov_fail);
   1327 		} else
   1328 			mutex_exit(&mi->mi_lock);
   1329 
   1330 		/*
   1331 		 * Check if we need to recover the clientid.  This
   1332 		 * must be done before file and lock recovery, and it
   1333 		 * potentially affects the recovery threads for other
   1334 		 * filesystems, so it gets special treatment.
   1335 		 */
   1336 		if (sp != NULL && recov_fail == FALSE) {
   1337 			mutex_enter(&sp->s_lock);
   1338 			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
   1339 				mutex_exit(&sp->s_lock);
   1340 				recov_clientid(recovp, sp);
   1341 			} else {
   1342 				/*
   1343 				 * Unset this flag in case another recovery
   1344 				 * thread successfully recovered the clientid
   1345 				 * for us already.
   1346 				 */
   1347 				mutex_enter(&mi->mi_lock);
   1348 				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
   1349 				mutex_exit(&mi->mi_lock);
   1350 				mutex_exit(&sp->s_lock);
   1351 			}
   1352 		}
   1353 
   1354 		/*
   1355 		 * Check if we need to get the security information.
   1356 		 */
   1357 		mutex_enter(&mi->mi_lock);
   1358 		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
   1359 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1360 			mutex_exit(&mi->mi_lock);
   1361 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
   1362 			    RW_WRITER, 0);
   1363 			error = nfs4_secinfo_recov(recovp->rc_mi,
   1364 			    recovp->rc_vp1, recovp->rc_vp2);
   1365 			/*
   1366 			 * If error, nothing more can be done, stop
   1367 			 * the recovery.
   1368 			 */
   1369 			if (error) {
   1370 				mutex_enter(&mi->mi_lock);
   1371 				mi->mi_flags |= MI4_RECOV_FAIL;
   1372 				mi->mi_error = recovp->rc_error;
   1373 				mutex_exit(&mi->mi_lock);
   1374 				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
   1375 				    error, recovp->rc_vp1, recovp->rc_vp2,
   1376 				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1377 			}
   1378 			nfs_rw_exit(&mi->mi_recovlock);
   1379 		} else
   1380 			mutex_exit(&mi->mi_lock);
   1381 
   1382 		/*
   1383 		 * Check if there's a bad seqid to recover.
   1384 		 */
   1385 		mutex_enter(&mi->mi_lock);
   1386 		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
   1387 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1388 			mutex_exit(&mi->mi_lock);
   1389 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
   1390 			    RW_WRITER, 0);
   1391 			recov_bad_seqid(recovp);
   1392 			nfs_rw_exit(&mi->mi_recovlock);
   1393 		} else
   1394 			mutex_exit(&mi->mi_lock);
   1395 
   1396 		/*
   1397 		 * Next check for recovery that affects the entire
   1398 		 * filesystem.
   1399 		 */
   1400 		if (sp != NULL) {
   1401 			mutex_enter(&mi->mi_lock);
   1402 			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
   1403 			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1404 				mutex_exit(&mi->mi_lock);
   1405 				recov_openfiles(recovp, sp);
   1406 			} else
   1407 				mutex_exit(&mi->mi_lock);
   1408 		}
   1409 
   1410 		/*
   1411 		 * Send any queued state recovery requests.
   1412 		 */
   1413 		mutex_enter(&mi->mi_lock);
   1414 		if (sp != NULL &&
   1415 		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
   1416 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1417 			mutex_exit(&mi->mi_lock);
   1418 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
   1419 			    RW_WRITER, 0);
   1420 			nfs4_resend_lost_rqsts(recovp, sp);
   1421 			if (list_head(&mi->mi_lost_state) == NULL) {
   1422 				/* done */
   1423 				mutex_enter(&mi->mi_lock);
   1424 				mi->mi_recovflags &= ~MI4R_LOST_STATE;
   1425 				mutex_exit(&mi->mi_lock);
   1426 			}
   1427 			nfs_rw_exit(&mi->mi_recovlock);
   1428 		} else {
   1429 			mutex_exit(&mi->mi_lock);
   1430 		}
   1431 
   1432 		/*
   1433 		 * See if there is anything more to do.  If not, announce
   1434 		 * that we are done and exit.
   1435 		 *
   1436 		 * Need mi_recovlock to keep 'sp' valid.  Must grab
   1437 		 * mi_recovlock before mi_lock to preserve lock ordering.
   1438 		 */
   1439 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
   1440 		mutex_enter(&mi->mi_lock);
   1441 		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
   1442 		    (mi->mi_flags & MI4_RECOV_FAIL)) {
   1443 			list_t local_lost_state;
   1444 			nfs4_lost_rqst_t *lrp;
   1445 
   1446 			/*
   1447 			 * We need to remove the lost requests before we
   1448 			 * unmark the mi as no longer doing recovery to
   1449 			 * avoid a race with a new thread putting new lost
   1450 			 * requests on the same mi (and the going away
   1451 			 * thread would remove the new lost requests).
   1452 			 *
   1453 			 * Move the lost requests to a local list since
   1454 			 * nfs4_remove_lost_rqst() drops mi_lock, and
   1455 			 * dropping the mi_lock would make our check to
   1456 			 * see if recovery is done no longer valid.
   1457 			 */
   1458 			list_create(&local_lost_state,
   1459 			    sizeof (nfs4_lost_rqst_t),
   1460 			    offsetof(nfs4_lost_rqst_t, lr_node));
   1461 			list_move_tail(&local_lost_state, &mi->mi_lost_state);
   1462 
   1463 			done = 1;
   1464 			mutex_exit(&mi->mi_lock);
   1465 			/*
   1466 			 * Now officially free the "moved"
   1467 			 * lost requests.
   1468 			 */
   1469 			while ((lrp = list_head(&local_lost_state)) != NULL) {
   1470 				list_remove(&local_lost_state, lrp);
   1471 				nfs4_free_lost_rqst(lrp, sp);
   1472 			}
   1473 			list_destroy(&local_lost_state);
   1474 		} else
   1475 			mutex_exit(&mi->mi_lock);
   1476 		nfs_rw_exit(&mi->mi_recovlock);
   1477 
   1478 		/*
   1479 		 * If the filesystem has been forcibly unmounted, there is
   1480 		 * probably no point in retrying immediately.  Furthermore,
   1481 		 * there might be user processes waiting for a chance to
   1482 		 * queue up "lost state" requests, so that they can exit.
   1483 		 * So pause here for a moment.  Same logic for zone shutdown.
   1484 		 */
   1485 		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
   1486 			mutex_enter(&mi->mi_lock);
   1487 			cv_broadcast(&mi->mi_failover_cv);
   1488 			mutex_exit(&mi->mi_lock);
   1489 			delay(SEC_TO_TICK(nfs4_unmount_delay));
   1490 		}
   1491 
   1492 	} while (!done);
   1493 
   1494 	if (sp != NULL)
   1495 		nfs4_server_rele(sp);
   1496 
   1497 	/*
   1498 	 * Return all recalled delegations
   1499 	 */
   1500 	nfs4_dlistclean();
   1501 
   1502 	mutex_enter(&mi->mi_lock);
   1503 	recov_done(mi, recovp);
   1504 	mutex_exit(&mi->mi_lock);
   1505 
   1506 	/*
   1507 	 * Free up resources that were allocated for us.
   1508 	 */
   1509 	if (recovp->rc_vp1 != NULL)
   1510 		VN_RELE(recovp->rc_vp1);
   1511 	if (recovp->rc_vp2 != NULL)
   1512 		VN_RELE(recovp->rc_vp2);
   1513 
   1514 	/* now we are done using the mi struct, signal the waiters */
   1515 	mutex_enter(&mi->mi_lock);
   1516 	mi->mi_in_recovery--;
   1517 	if (mi->mi_in_recovery == 0)
   1518 		cv_broadcast(&mi->mi_cv_in_recov);
   1519 	mutex_exit(&mi->mi_lock);
   1520 
   1521 	VFS_RELE(mi->mi_vfsp);
   1522 	MI4_RELE(mi);
   1523 	kmem_free(recovp, sizeof (recov_info_t));
   1524 	mutex_enter(&cpr_lock);
   1525 	CALLB_CPR_EXIT(&cpr_info);
   1526 	mutex_destroy(&cpr_lock);
   1527 	zthread_exit();
   1528 }
   1529 
   1530 /*
   1531  * Log the end of recovery and notify any waiting threads.
   1532  */
   1533 
   1534 static void
   1535 recov_done(mntinfo4_t *mi, recov_info_t *recovp)
   1536 {
   1537 
   1538 	ASSERT(MUTEX_HELD(&mi->mi_lock));
   1539 
   1540 	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
   1541 	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1542 	mi->mi_recovthread = NULL;
   1543 	mi->mi_flags &= ~MI4_RECOV_ACTIV;
   1544 	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
   1545 	cv_broadcast(&mi->mi_failover_cv);
   1546 }
   1547 
   1548 /*
   1549  * State-specific recovery routines, by state.
   1550  */
   1551 
   1552 /*
   1553  * Failover.
   1554  *
   1555  * Replaces *spp with a reference to the new server, which must
   1556  * eventually be freed.
   1557  */
   1558 
   1559 static void
   1560 recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
   1561 {
   1562 	mntinfo4_t *mi = recovp->rc_mi;
   1563 	servinfo4_t *svp = NULL;
   1564 	nfs4_server_t *osp = *spp;
   1565 	CLIENT *cl;
   1566 	enum clnt_stat status;
   1567 	struct timeval tv;
   1568 	int error;
   1569 	int oncethru = 0;
   1570 	rnode4_t *rp;
   1571 	int index;
   1572 	nfs_fh4 fh;
   1573 	char *snames;
   1574 	size_t len;
   1575 
   1576 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
   1577 
   1578 	tv.tv_sec = 2;
   1579 	tv.tv_usec = 0;
   1580 
   1581 #ifdef lint
   1582 	/*
   1583 	 * Lint can't follow the logic, so thinks that snames and len
   1584 	 * can be used before being set.  They can't, but lint can't
   1585 	 * figure it out.  To address the lint warning, initialize
   1586 	 * snames and len for lint.
   1587 	 */
   1588 	snames = NULL;
   1589 	len = 0;
   1590 #endif
   1591 
   1592 	/*
   1593 	 * Ping the null NFS procedure of every server in
   1594 	 * the list until one responds.  We always start
   1595 	 * at the head of the list and always skip the one
   1596 	 * that is current, since it's caused us a problem.
   1597 	 */
   1598 	while (svp == NULL) {
   1599 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
   1600 
   1601 			mutex_enter(&mi->mi_lock);
   1602 			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
   1603 				mi->mi_flags |= MI4_RECOV_FAIL;
   1604 				mutex_exit(&mi->mi_lock);
   1605 				(void) nfs_rw_exit(&mi->mi_recovlock);
   1606 				*recov_fail = TRUE;
   1607 				if (oncethru)
   1608 					kmem_free(snames, len);
   1609 				return;
   1610 			}
   1611 			mutex_exit(&mi->mi_lock);
   1612 
   1613 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1614 			if (svp->sv_flags & SV4_NOTINUSE) {
   1615 				nfs_rw_exit(&svp->sv_lock);
   1616 				continue;
   1617 			}
   1618 			nfs_rw_exit(&svp->sv_lock);
   1619 
   1620 			if (!oncethru && svp == mi->mi_curr_serv)
   1621 				continue;
   1622 
   1623 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
   1624 			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
   1625 			if (error)
   1626 				continue;
   1627 
   1628 			if (!(mi->mi_flags & MI4_INT))
   1629 				cl->cl_nosignal = TRUE;
   1630 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
   1631 			    xdr_void, NULL, tv);
   1632 			if (!(mi->mi_flags & MI4_INT))
   1633 				cl->cl_nosignal = FALSE;
   1634 			AUTH_DESTROY(cl->cl_auth);
   1635 			CLNT_DESTROY(cl);
   1636 			if (status == RPC_SUCCESS) {
   1637 				nfs4_queue_event(RE_FAILOVER, mi,
   1638 				    svp == mi->mi_curr_serv ? NULL :
   1639 				    svp->sv_hostname, 0, NULL, NULL, 0,
   1640 				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1641 				break;
   1642 			}
   1643 		}
   1644 
   1645 		if (svp == NULL) {
   1646 			if (!oncethru) {
   1647 				snames = nfs4_getsrvnames(mi, &len);
   1648 				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
   1649 				    0, 0, 0, FALSE, snames, 0, NULL);
   1650 				oncethru = 1;
   1651 			}
   1652 			delay(hz);
   1653 		}
   1654 	}
   1655 
   1656 	if (oncethru) {
   1657 		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
   1658 		    0, NULL);
   1659 		kmem_free(snames, len);
   1660 	}
   1661 
   1662 #if DEBUG
   1663 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1664 	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
   1665 	nfs_rw_exit(&svp->sv_lock);
   1666 #endif
   1667 
   1668 	mutex_enter(&mi->mi_lock);
   1669 	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
   1670 	if (svp != mi->mi_curr_serv) {
   1671 		servinfo4_t *osvp = mi->mi_curr_serv;
   1672 
   1673 		mutex_exit(&mi->mi_lock);
   1674 
   1675 		/*
   1676 		 * Update server-dependent fields in the root vnode.
   1677 		 */
   1678 		index = rtable4hash(mi->mi_rootfh);
   1679 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
   1680 
   1681 		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
   1682 		if (rp != NULL) {
   1683 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1684 			    "recov_newserver: remapping %s", rnode4info(rp)));
   1685 			mutex_enter(&rp->r_statelock);
   1686 			rp->r_server = svp;
   1687 			PURGE_ATTRCACHE4_LOCKED(rp);
   1688 			mutex_exit(&rp->r_statelock);
   1689 			(void) nfs4_free_data_reclaim(rp);
   1690 			nfs4_purge_rddir_cache(RTOV4(rp));
   1691 			rw_exit(&rtable4[index].r_lock);
   1692 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1693 			    "recov_newserver: done with %s",
   1694 			    rnode4info(rp)));
   1695 			VN_RELE(RTOV4(rp));
   1696 		} else
   1697 			rw_exit(&rtable4[index].r_lock);
   1698 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
   1699 
   1700 		mutex_enter(&mi->mi_lock);
   1701 		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
   1702 		if (recovp->rc_srv_reboot)
   1703 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
   1704 		mi->mi_curr_serv = svp;
   1705 		mi->mi_failover++;
   1706 		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
   1707 		mutex_exit(&mi->mi_lock);
   1708 
   1709 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1710 		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
   1711 		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
   1712 		sfh4_update(mi->mi_rootfh, &fh);
   1713 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
   1714 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
   1715 		sfh4_update(mi->mi_srvparentfh, &fh);
   1716 		nfs_rw_exit(&svp->sv_lock);
   1717 
   1718 		*spp = nfs4_move_mi(mi, osvp, svp);
   1719 		if (osp != NULL)
   1720 			nfs4_server_rele(osp);
   1721 	} else
   1722 		mutex_exit(&mi->mi_lock);
   1723 	(void) nfs_rw_exit(&mi->mi_recovlock);
   1724 }
   1725 
   1726 /*
   1727  * Clientid.
   1728  */
   1729 
   1730 static void
   1731 recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
   1732 {
   1733 	mntinfo4_t *mi = recovp->rc_mi;
   1734 	int error = 0;
   1735 	int still_stale;
   1736 	int need_new_s;
   1737 
   1738 	ASSERT(sp != NULL);
   1739 
   1740 	/*
   1741 	 * Acquire the recovery lock and then verify that the clientid
   1742 	 * still needs to be recovered.  (Note that s_recovlock is supposed
   1743 	 * to be acquired before s_lock.)  Since the thread holds the
   1744 	 * recovery lock, no other thread will recover the clientid.
   1745 	 */
   1746 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
   1747 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
   1748 	mutex_enter(&sp->s_lock);
   1749 	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
   1750 	mutex_exit(&sp->s_lock);
   1751 
   1752 	if (still_stale) {
   1753 		nfs4_error_t n4e;
   1754 
   1755 		nfs4_error_zinit(&n4e);
   1756 		nfs4setclientid(mi, kcred, TRUE, &n4e);
   1757 		error = n4e.error;
   1758 		if (error != 0) {
   1759 
   1760 			/*
   1761 			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
   1762 			 * if so, just return and let recov_thread drive
   1763 			 * failover.
   1764 			 */
   1765 			mutex_enter(&mi->mi_lock);
   1766 			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
   1767 			mutex_exit(&mi->mi_lock);
   1768 
   1769 			if (need_new_s) {
   1770 				nfs_rw_exit(&mi->mi_recovlock);
   1771 				nfs_rw_exit(&sp->s_recovlock);
   1772 				return;
   1773 			}
   1774 
   1775 			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
   1776 			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1777 			mutex_enter(&mi->mi_lock);
   1778 			mi->mi_flags |= MI4_RECOV_FAIL;
   1779 			mi->mi_error = recovp->rc_error;
   1780 			mutex_exit(&mi->mi_lock);
   1781 			/* don't destroy the nfs4_server, let umount do it */
   1782 		}
   1783 	}
   1784 
   1785 	if (error == 0) {
   1786 		mutex_enter(&mi->mi_lock);
   1787 		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
   1788 		/*
   1789 		 * If still_stale isn't true, then another thread already
   1790 		 * recovered the clientid.  And that thread that set the
   1791 		 * clientid will have initiated reopening files on all the
   1792 		 * filesystems for the server, so we should not initiate
   1793 		 * reopening for this filesystem here.
   1794 		 */
   1795 		if (still_stale) {
   1796 			mi->mi_recovflags |= MI4R_REOPEN_FILES;
   1797 			if (recovp->rc_srv_reboot)
   1798 				mi->mi_recovflags |= MI4R_SRV_REBOOT;
   1799 		}
   1800 		mutex_exit(&mi->mi_lock);
   1801 	}
   1802 
   1803 	nfs_rw_exit(&mi->mi_recovlock);
   1804 
   1805 	if (error != 0) {
   1806 		nfs_rw_exit(&sp->s_recovlock);
   1807 		mutex_enter(&mi->mi_lock);
   1808 		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
   1809 			delay(SEC_TO_TICK(recov_err_delay));
   1810 		mutex_exit(&mi->mi_lock);
   1811 	} else {
   1812 		mntinfo4_t **milist;
   1813 		mntinfo4_t *tmi;
   1814 		int nummi, i;
   1815 
   1816 		/*
   1817 		 * Initiate recovery of open files for other filesystems.
   1818 		 * We create an array of filesystems, rather than just
   1819 		 * walking the filesystem list, to avoid deadlock issues
   1820 		 * with s_lock and mi_recovlock.
   1821 		 */
   1822 		milist = make_milist(sp, &nummi);
   1823 		for (i = 0; i < nummi; i++) {
   1824 			tmi = milist[i];
   1825 			if (tmi != mi) {
   1826 				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
   1827 				    RW_READER, 0);
   1828 				start_recovery_action(NR_OPENFILES, TRUE, tmi,
   1829 				    NULL, NULL);
   1830 				nfs_rw_exit(&tmi->mi_recovlock);
   1831 			}
   1832 		}
   1833 		free_milist(milist, nummi);
   1834 
   1835 		nfs_rw_exit(&sp->s_recovlock);
   1836 	}
   1837 }
   1838 
   1839 /*
   1840  * Return an array of filesystems associated with the given server.  The
   1841  * caller should call free_milist() to free the references and memory.
   1842  */
   1843 
   1844 static mntinfo4_t **
   1845 make_milist(nfs4_server_t *sp, int *nummip)
   1846 {
   1847 	int nummi, i;
   1848 	mntinfo4_t **milist;
   1849 	mntinfo4_t *tmi;
   1850 
   1851 	mutex_enter(&sp->s_lock);
   1852 	nummi = 0;
   1853 	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
   1854 		nummi++;
   1855 
   1856 	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);
   1857 
   1858 	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
   1859 	    tmi = tmi->mi_clientid_next) {
   1860 		milist[i] = tmi;
   1861 		VFS_HOLD(tmi->mi_vfsp);
   1862 	}
   1863 	mutex_exit(&sp->s_lock);
   1864 
   1865 	*nummip = nummi;
   1866 	return (milist);
   1867 }
   1868 
   1869 /*
   1870  * Free the filesystem list created by make_milist().
   1871  */
   1872 
   1873 static void
   1874 free_milist(mntinfo4_t **milist, int nummi)
   1875 {
   1876 	mntinfo4_t *tmi;
   1877 	int i;
   1878 
   1879 	for (i = 0; i < nummi; i++) {
   1880 		tmi = milist[i];
   1881 		VFS_RELE(tmi->mi_vfsp);
   1882 	}
   1883 	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
   1884 }
   1885 
   1886 /*
   1887  * Filehandle
   1888  */
   1889 
   1890 /*
   1891  * Lookup the filehandle for the given vnode and update the rnode if it has
   1892  * changed.
   1893  *
   1894  * Errors:
   1895  * - if the filehandle could not be updated because of an error that
   1896  *   requires further recovery, initiate that recovery and return.
   1897  * - if the filehandle could not be updated because of a signal, pretend we
   1898  *   succeeded and let someone else deal with it.
   1899  * - if the filehandle could not be updated and the filesystem has been
   1900  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
   1901  *   the forced unmount (to retry or not to retry, that is the question).
   1902  * - if the filehandle could not be updated because of some other error,
   1903  *   mark the rnode bad and return.
   1904  */
   1905 static void
   1906 recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
   1907 {
   1908 	rnode4_t *rp = VTOR4(vp);
   1909 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   1910 	bool_t needrecov;
   1911 
   1912 	mutex_enter(&rp->r_statelock);
   1913 
   1914 	if (rp->r_flags & R4RECOVERR) {
   1915 		mutex_exit(&rp->r_statelock);
   1916 		return;
   1917 	}
   1918 
   1919 	/*
   1920 	 * If someone else is updating the filehandle, wait for them to
   1921 	 * finish and then let our caller retry.
   1922 	 */
   1923 	if (rp->r_flags & R4RECEXPFH) {
   1924 		while (rp->r_flags & R4RECEXPFH) {
   1925 			cv_wait(&rp->r_cv, &rp->r_statelock);
   1926 		}
   1927 		mutex_exit(&rp->r_statelock);
   1928 		return;
   1929 	}
   1930 	rp->r_flags |= R4RECEXPFH;
   1931 	mutex_exit(&rp->r_statelock);
   1932 
   1933 	if (action == NR_BADHANDLE) {
   1934 		/* shouldn't happen */
   1935 		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
   1936 		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1937 	}
   1938 
   1939 	nfs4_remap_file(mi, vp, 0, &e);
   1940 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
   1941 
   1942 	/*
   1943 	 * If we get BADHANDLE or FHEXPIRED in their handler, something is
   1944 	 * broken.  Don't try to recover, just mark the file dead.
   1945 	 */
   1946 	if (needrecov && e.error == 0 &&
   1947 	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
   1948 		needrecov = FALSE;
   1949 	if (needrecov) {
   1950 		(void) nfs4_start_recovery(&e, mi, vp,
   1951 		    NULL, NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
   1952 	} else if (e.error != EINTR &&
   1953 	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
   1954 	    (e.error != 0 || e.stat != NFS4_OK)) {
   1955 		nfs4_recov_fh_fail(vp, e.error, e.stat);
   1956 		/*
   1957 		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
   1958 		 * cstatat_getvp()) retries on ESTALE, which would cause
   1959 		 * an infinite loop.
   1960 		 */
   1961 	}
   1962 
   1963 	mutex_enter(&rp->r_statelock);
   1964 	rp->r_flags &= ~R4RECEXPFH;
   1965 	cv_broadcast(&rp->r_cv);
   1966 	mutex_exit(&rp->r_statelock);
   1967 }
   1968 
   1969 /*
   1970  * Stale Filehandle
   1971  */
   1972 
   1973 /*
   1974  * A stale filehandle can happen when an individual file has
   1975  * been removed, or when an entire filesystem has been taken
   1976  * offline.  To distinguish these cases, we do this:
   1977  * - if a GETATTR with the current filehandle is okay, we do
   1978  *   nothing (this can happen with two-filehandle ops)
   1979  * - if the GETATTR fails, but a GETATTR of the root filehandle
   1980  *   succeeds, mark the rnode with R4STALE, which will stop use
   1981  * - if the GETATTR fails, and a GETATTR of the root filehandle
   1982  *   also fails, we consider the problem filesystem-wide, so:
   1983  *   - if we can failover, we should
   1984  *   - if we can't failover, we should mark both the original
   1985  *     vnode and the root bad
   1986  */
static void
recov_stale(mntinfo4_t *mi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	/* rootvp stays NULL unless a distinct root vnode is looked up below */
	vnode_t *rootvp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_ga_res_t gar;
	char *fail_msg = "failed to recover from NFS4ERR_STALE";
	bool_t needrecov;

	mutex_enter(&rp->r_statelock);

	/* Nothing to do if the rnode is already marked dead ... */
	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked dead, rp %s",
		    rnode4info(rp)));
		return;
	}

	/* ... or already marked stale. */
	if (rp->r_flags & R4STALE) {
		mutex_exit(&rp->r_statelock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: already marked stale, rp %s",
		    rnode4info(rp)));
		return;
	}

	mutex_exit(&rp->r_statelock);

	/* Try a GETATTR on this vnode */
	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);

	/*
	 * Handle non-STALE recoverable errors
	 */
	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
		(void) nfs4_start_recovery(&e, mi, vp,
		    NULL, NULL, NULL, OP_GETATTR, NULL, NULL, NULL);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: error=%d, stat=%d seen on rp %s",
		    e.error, e.stat, rnode4info(rp)));
		goto out;
	}

	/* Are things OK for this vnode? */
	if (!e.error && e.stat == NFS4_OK) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: file appears fine, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/* Did we get an unrelated non-recoverable error? */
	if (e.error || e.stat != NFS4ERR_STALE) {
		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: unrelated fatal error, rp %s",
		    rnode4info(rp)));
		goto out;
	}

	/*
	 * From here on the GETATTR of vp returned NFS4ERR_STALE.
	 *
	 * If we don't appear to be dealing with the root node, find it.
	 */
	if ((vp->v_flag & VROOT) == 0) {
		nfs4_error_zinit(&e);
		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
		if (e.error) {
			/* report the original STALE, not the lookup error */
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't find root node for rp %s",
			    rnode4info(rp)));
			goto out;
		}
	}

	/* Try a GETATTR on the root vnode */
	if (rootvp != NULL) {
		nfs4_error_zinit(&e);
		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);

		/* Try recovery? */
		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
			if (needrecov) {
				(void) nfs4_start_recovery(&e,
				    mi, rootvp, NULL, NULL, NULL,
				    OP_GETATTR, NULL, NULL, NULL);
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "recov_stale: error=%d, stat=%d seen "
				    "on rp %s", e.error, e.stat,
				    rnode4info(rp)));
			}
		}

		/*
		 * Check to see if a failover attempt is warranted
		 * NB: nfs4_try_failover doesn't check for STALE
		 * because recov_stale gets a shot first.  Now that
		 * recov_stale has failed, go ahead and try failover.
		 *
		 * If the getattr on the root filehandle was successful,
		 * then mark recovery as failed for 'vp' and exit.
		 */
		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
			/*
			 * pass the original error to fail_recov, not
			 * the one from trying the root vnode.
			 */
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: root node OK, marking "
			    "dead rp %s", rnode4info(rp)));
			goto out;
		}
	}

	/*
	 * Here, we know that both the original file and the
	 * root filehandle (which may be the same) are stale.
	 * We want to fail over if we can, and if we can't, we
	 * want to mark everything in sight bad.
	 */
	if (FAILOVER_MOUNT4(mi)) {
		/* Only flag the need for a new server here. */
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "recov_stale: failing over due to rp %s",
		    rnode4info(rp)));
		mutex_exit(&mi->mi_lock);
	} else {
		rnode4_t *rootrp;
		servinfo4_t *svp;

		/*
		 * Can't fail over, so mark things dead.
		 *
		 * If rootvp is set, we know we have a distinct
		 * non-root vnode which can be marked dead in
		 * the usual way.
		 *
		 * Then we want to mark the root vnode dead.
		 * Note that if rootvp wasn't set, our vp is
		 * actually the root vnode.
		 */
		if (rootvp != NULL) {
			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "recov_stale: can't fail over, marking dead rp %s",
			    rnode4info(rp)));
			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
		} else {
			/* this hold is balanced by the VN_RELE at "out" */
			rootvp = vp;
			VN_HOLD(rootvp);
		}

		/*
		 * Mark root dead, but quietly - since
		 * the root rnode is frequently recreated,
		 * we can encounter this at every access.
		 * Also mark recovery as failed on this VFS.
		 */
		rootrp = VTOR4(rootvp);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
		    "recov_stale: marking dead root rp %s",
		    rnode4info(rootrp)));
		mutex_enter(&rootrp->r_statelock);
		rootrp->r_flags |= (R4RECOVERR | R4STALE);
		rootrp->r_error = ESTALE;
		mutex_exit(&rootrp->r_statelock);
		mutex_enter(&mi->mi_lock);
		mi->mi_error = ESTALE;
		mutex_exit(&mi->mi_lock);

		/* Record on the current server that its root is stale. */
		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
		svp->sv_flags |= SV4_ROOT_STALE;
		nfs_rw_exit(&svp->sv_lock);
	}

out:
	/* Release the root vnode if one was obtained or held above. */
	if (rootvp)
		VN_RELE(rootvp);
}
   2172 
   2173 /*
   2174  * Locks.
   2175  */
   2176 
   2177 /*
   2178  * Reclaim all the active (acquired) locks for the given file.
   2179  * If a process lost a lock, the process is sent a SIGLOST.  This is not
   2180  * considered an error.
   2181  *
   2182  * Return values:
   2183  * Errors and status are returned via the nfs4_error_t parameter
   2184  * If an error indicates that recovery is needed, the caller is responsible
   2185  * for dealing with it.
   2186  */
   2187 
   2188 static void
   2189 relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
   2190     fattr4_change pre_change)
   2191 {
   2192 	locklist_t *locks, *llp;
   2193 	rnode4_t *rp;
   2194 
   2195 	ASSERT(ep != NULL);
   2196 	nfs4_error_zinit(ep);
   2197 
   2198 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
   2199 		return;
   2200 
   2201 	nfs4_flush_lock_owners(VTOR4(vp));
   2202 
   2203 	/*
   2204 	 * If we get an error that requires recovery actions, just bail out
   2205 	 * and let the top-level recovery code handle it.
   2206 	 *
   2207 	 * If we get some other error, kill the process that owned the lock
   2208 	 * and mark its remaining locks (if any) as belonging to NOPID, so
   2209 	 * that we don't make any more reclaim requests for that process.
   2210 	 */
   2211 
   2212 	rp = VTOR4(vp);
   2213 	locks = flk_active_locks_for_vp(vp);
   2214 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
   2215 		int did_reclaim = 1;
   2216 
   2217 		ASSERT(llp->ll_vp == vp);
   2218 		if (llp->ll_flock.l_pid == NOPID)
   2219 			continue;
   2220 		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
   2221 		/*
   2222 		 * If we need to restart recovery, stop processing the
   2223 		 * list.  Some errors would be recoverable under other
   2224 		 * circumstances, but if they happen here we just give up
   2225 		 * on the lock.
   2226 		 */
   2227 		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
   2228 			if (ep->error != 0)
   2229 				break;
   2230 			if (!nfs4_recov_marks_dead(ep->stat))
   2231 				break;
   2232 		}
   2233 		/*
   2234 		 *   In case the server isn't offering us a grace period, or
   2235 		 * if we missed it, we might have opened & locked from scratch,
   2236 		 * rather than reopened/reclaimed.
   2237 		 *   We need to ensure that the object hadn't been otherwise
   2238 		 * changed during this time, by comparing the changeinfo.
   2239 		 *   We get passed the changeinfo from before the reopen by our
   2240 		 * caller, in pre_change.
   2241 		 *   The changeinfo from after the reopen is in rp->r_change,
   2242 		 * courtesy of the GETATTR in the reopen.
   2243 		 *   If they're different, then the file has changed, and we
   2244 		 * have to SIGLOST the app.
   2245 		 */
   2246 		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
   2247 			mutex_enter(&rp->r_statelock);
   2248 			if (pre_change != rp->r_change)
   2249 				ep->stat = NFS4ERR_NO_GRACE;
   2250 			mutex_exit(&rp->r_statelock);
   2251 		}
   2252 		if (ep->error != 0 || ep->stat != NFS4_OK) {
   2253 			if (ep->error != 0)
   2254 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
   2255 				    NULL, ep->error, vp, NULL, 0, NULL,
   2256 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
   2257 				    0, 0);
   2258 			else
   2259 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
   2260 				    NULL, 0, vp, NULL, ep->stat, NULL,
   2261 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
   2262 				    0, 0);
   2263 			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
   2264 			    ep->error, ep->stat);
   2265 			relock_skip_pid(llp, llp->ll_flock.l_pid);
   2266 
   2267 			/* Reinitialize the nfs4_error and continue */
   2268 			nfs4_error_zinit(ep);
   2269 		}
   2270 	}
   2271 
   2272 	if (locks != NULL)
   2273 		flk_free_locklist(locks);
   2274 }
   2275 
   2276 /*
   2277  * Reclaim the given lock.
   2278  * If the lock can't be reclaimed, the process is sent SIGLOST, but this is
   2279  * not considered an error.
   2280  *
   2281  * Errors are returned via the nfs4_error_t parameter.
   2282  */
   2283 static void
   2284 reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
   2285     int *did_reclaimp)
   2286 {
   2287 	cred_t *cr;
   2288 	rnode4_t *rp = VTOR4(vp);
   2289 
   2290 	cr = pid_to_cr(flk->l_pid);
   2291 	if (cr == NULL) {
   2292 		nfs4_error_zinit(ep);
   2293 		ep->error = ESRCH;
   2294 		return;
   2295 	}
   2296 
   2297 	do {
   2298 		mutex_enter(&rp->r_statelock);
   2299 		if (rp->r_flags & R4RECOVERR) {
   2300 			/*
   2301 			 * This shouldn't affect other reclaims, so don't
   2302 			 * return an error.
   2303 			 */
   2304 			mutex_exit(&rp->r_statelock);
   2305 			break;
   2306 		}
   2307 		mutex_exit(&rp->r_statelock);
   2308 
   2309 		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
   2310 		    FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
   2311 		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
   2312 			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
   2313 			    vp, NULL);
   2314 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
   2315 
   2316 	crfree(cr);
   2317 }
   2318 
   2319 /*
   2320  * Open files.
   2321  */
   2322 
   2323 /*
   2324  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
   2325  * Returns 1 if the error is valid; 0 otherwise.
   2326  */
   2327 static int
   2328 nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
   2329 {
   2330 	/*
   2331 	 * We should not be marking non-regular files as dead,
   2332 	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
   2333 	 */
   2334 	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
   2335 	    stat != NFS4ERR_BADNAME)
   2336 		return (0);
   2337 
   2338 	return (1);
   2339 }
   2340 
   2341 /*
   2342  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
   2343  * then mark the object dead.  Since we've had to do a lookup for
   2344  * filehandle recovery, we will mark the object dead if we got NOENT.
   2345  */
   2346 static void
   2347 nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
   2348 {
   2349 	ASSERT(vp != NULL);
   2350 
   2351 	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
   2352 	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
   2353 		return;
   2354 
   2355 	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
   2356 }
   2357 
   2358 /*
   2359  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
   2360  * to mark only the data structure(s) that provided the bad value as being
   2361  * bad.  But for now we'll just mark the entire file.
   2362  */
   2363 
   2364 static void
   2365 recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
   2366 {
   2367 	ASSERT(vp != NULL);
   2368 	recov_throttle(recovp, vp);
   2369 
   2370 	if (!nfs4_valid_recov_err_for_vp(vp, stat))
   2371 		return;
   2372 
   2373 	nfs4_fail_recov(vp, "", 0, stat);
   2374 }
   2375 
   2376 /*
   2377  * Free up the information saved for a lost state request.
   2378  */
   2379 static void
   2380 nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
   2381 {
   2382 	component4 *filep;
   2383 	nfs4_open_stream_t *osp;
   2384 	int have_sync_lock;
   2385 
   2386 	NFS4_DEBUG(nfs4_lost_rqst_debug,
   2387 	    (CE_NOTE, "nfs4_free_lost_rqst:"));
   2388 
   2389 	switch (lrp->lr_op) {
   2390 	case OP_OPEN:
   2391 		filep = &lrp->lr_ofile;
   2392 		if (filep->utf8string_val) {
   2393 			kmem_free(filep->utf8string_val, filep->utf8string_len);
   2394 			filep->utf8string_val = NULL;
   2395 		}
   2396 		break;
   2397 	case OP_DELEGRETURN:
   2398 		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
   2399 		break;
   2400 	case OP_CLOSE:
   2401 		osp = lrp->lr_osp;
   2402 		ASSERT(osp != NULL);
   2403 		mutex_enter(&osp->os_sync_lock);
   2404 		have_sync_lock = 1;
   2405 		if (osp->os_pending_close) {
   2406 			/* clean up the open file state. */
   2407 			osp->os_pending_close = 0;
   2408 			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
   2409 		}
   2410 		if (have_sync_lock)
   2411 			mutex_exit(&osp->os_sync_lock);
   2412 		break;
   2413 	}
   2414 
   2415 	lrp->lr_op = 0;
   2416 	if (lrp->lr_oop != NULL) {
   2417 		open_owner_rele(lrp->lr_oop);
   2418 		lrp->lr_oop = NULL;
   2419 	}
   2420 	if (lrp->lr_osp != NULL) {
   2421 		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
   2422 		lrp->lr_osp = NULL;
   2423 	}
   2424 	if (lrp->lr_lop != NULL) {
   2425 		lock_owner_rele(lrp->lr_lop);
   2426 		lrp->lr_lop = NULL;
   2427 	}
   2428 	if (lrp->lr_flk != NULL) {
   2429 		kmem_free(lrp->lr_flk, sizeof (flock64_t));
   2430 		lrp->lr_flk = NULL;
   2431 	}
   2432 	if (lrp->lr_vp != NULL) {
   2433 		VN_RELE(lrp->lr_vp);
   2434 		lrp->lr_vp = NULL;
   2435 	}
   2436 	if (lrp->lr_dvp != NULL) {
   2437 		VN_RELE(lrp->lr_dvp);
   2438 		lrp->lr_dvp = NULL;
   2439 	}
   2440 	if (lrp->lr_cr != NULL) {
   2441 		crfree(lrp->lr_cr);
   2442 		lrp->lr_cr = NULL;
   2443 	}
   2444 
   2445 	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
   2446 }
   2447 
   2448 /*
   2449  * Remove any lost state requests and free them.
   2450  */
   2451 static void
   2452 nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
   2453 {
   2454 	nfs4_lost_rqst_t *lrp;
   2455 
   2456 	mutex_enter(&mi->mi_lock);
   2457 	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
   2458 		list_remove(&mi->mi_lost_state, lrp);
   2459 		mutex_exit(&mi->mi_lock);
   2460 		nfs4_free_lost_rqst(lrp, sp);
   2461 		mutex_enter(&mi->mi_lock);
   2462 	}
   2463 	mutex_exit(&mi->mi_lock);
   2464 }
   2465 
   2466 /*
   2467  * Reopen all the files for the given filesystem and reclaim any locks.
   2468  */
   2469 
static void
recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
{
	mntinfo4_t *mi = recovp->rc_mi;
	nfs4_opinst_t *reopenlist = NULL, *rep;
	/* e carries the running error state through the whole function */
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	open_claim_type4 claim;
	int remap;
	char *fail_msg = "No such file or directory on replica";
	rnode4_t *rp;
	fattr4_change pre_change;

	ASSERT(sp != NULL);

	/*
	 * This check is to allow a 10ms pause before we reopen files
	 * it should allow the server time to have received the CB_NULL
	 * reply and update its internal structures such that (if
	 * applicable) we are granted a delegation on reopened files.
	 */
	mutex_enter(&sp->s_lock);
	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
		sp->s_flags |= N4S_CB_WAITER;
		(void) cv_reltimedwait(&sp->wait_cb_null, &sp->s_lock,
		    drv_usectohz(N4S_CB_PAUSE_TIME), TR_CLOCK_TICK);
	}
	mutex_exit(&sp->s_lock);

	/* Server recovery lock as reader, filesystem recovery lock as writer */
	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);

	/*
	 * With volatile filehandles, remap the root first.  A failure here
	 * is left in e, which skips the reopen pass below.
	 */
	if (NFS4_VOLATILE_FH(mi)) {
		nfs4_remap_root(mi, &e, 0);
		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
			(void) nfs4_start_recovery(&e, mi, NULL,
			    NULL, NULL, NULL, OP_LOOKUP, NULL, NULL, NULL);
		}
	}

	/* Pick the OPEN claim type: CLAIM_PREVIOUS after a server reboot. */
	mutex_enter(&mi->mi_lock);
	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
		claim = CLAIM_PREVIOUS;
	else
		claim = CLAIM_NULL;
	mutex_exit(&mi->mi_lock);

	if (e.error == 0 && e.stat == NFS4_OK) {
		/*
		 * Get a snapshot of open files in the filesystem.  Note
		 * that new opens will stall until the server's grace
		 * period is done.
		 */
		reopenlist = r4mkopenlist(mi);

		mutex_enter(&mi->mi_lock);
		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
		mutex_exit(&mi->mi_lock);
		/*
		 * Since we are re-establishing state on the
		 * server, its ok to blow away the saved lost
		 * requests since we don't need to reissue it.
		 */
		nfs4_remove_lost_rqsts(mi, sp);

		/*
		 * For each file in the snapshot: optionally remap it, reopen
		 * each of its open streams, then reclaim its locks.  The
		 * loop stops at the first error left in e.
		 */
		for (rep = reopenlist; rep; rep = rep->re_next) {

			if (remap) {
				nfs4_remap_file(mi, rep->re_vp,
				    NFS4_REMAP_CKATTRS, &e);
			}
			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
				/*
				 * The current server does not have the file
				 * that is to be remapped.  This is most
				 * likely due to an improperly maintained
				 * replica.   The files that are missing from
				 * the server will be marked dead and logged
				 * in order to make sys admins aware of the
				 * problem.
				 */
				nfs4_fail_recov(rep->re_vp,
				    fail_msg, e.error, e.stat);
				/*
				 * We've already handled the error so clear it.
				 */
				nfs4_error_zinit(&e);
				continue;
			} else if (e.error == 0 && e.stat == NFS4_OK) {
				int j;

				/*
				 * Save the change attribute from before the
				 * reopen; it is passed to relock_file() below.
				 */
				rp = VTOR4(rep->re_vp);
				mutex_enter(&rp->r_statelock);
				pre_change = rp->r_change;
				mutex_exit(&rp->r_statelock);

				/* Reopen each stream; stop at the first failure */
				for (j = 0; j < rep->re_numosp; j++) {
					nfs4_reopen(rep->re_vp, rep->re_osp[j],
					    &e, claim, FALSE, TRUE);
					if (e.error != 0 || e.stat != NFS4_OK)
						break;
				}
				if (nfs4_needs_recovery(&e, TRUE,
				    mi->mi_vfsp)) {
					(void) nfs4_start_recovery(&e, mi,
					    rep->re_vp, NULL, NULL, NULL,
					    OP_OPEN, NULL, NULL, NULL);
					/* leaves the per-file loop entirely */
					break;
				}
			}
#ifdef DEBUG
			if (nfs4_recovdelay > 0)
				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
#endif
			/* pre_change was set above whenever e is clean here */
			if (e.error == 0 && e.stat == NFS4_OK)
				relock_file(rep->re_vp, mi, &e, pre_change);

			if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
				(void) nfs4_start_recovery(&e, mi,
				    rep->re_vp, NULL, NULL, NULL, OP_LOCK,
				    NULL, NULL, NULL);
			if (e.error != 0 || e.stat != NFS4_OK)
				break;
		}

		/*
		 * Check to see if we need to remap files passed in
		 * via the recovery arguments; this will have been
		 * done for open files.  A failure here is not fatal.
		 */
		if (remap) {
			/* result is deliberately discarded */
			nfs4_error_t ignore;
			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
			    &ignore);
			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
			    &ignore);
		}
	}

	/*
	 * Only clear the reopen/remap flags if everything succeeded; if they
	 * stay set, the work is still marked as outstanding.
	 */
	if (e.error == 0 && e.stat == NFS4_OK) {
		mutex_enter(&mi->mi_lock);
		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
		mutex_exit(&mi->mi_lock);
	}

	nfs_rw_exit(&mi->mi_recovlock);
	nfs_rw_exit(&sp->s_recovlock);

	if (reopenlist != NULL)
		r4releopenlist(reopenlist);
}
   2620 
   2621 /*
   2622  * Resend the queued state recovery requests in "rqsts".
   2623  */
   2624 
   2625 static void
   2626 nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
   2627 {
   2628 	nfs4_lost_rqst_t	*lrp, *tlrp;
   2629 	mntinfo4_t		*mi = recovp->rc_mi;
   2630 	nfs4_error_t		n4e;
   2631 #ifdef NOTYET
   2632 	uint32_t		deny_bits = 0;
   2633 #endif
   2634 
   2635 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
   2636 
   2637 	ASSERT(mi != NULL);
   2638 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
   2639 
   2640 	mutex_enter(&mi->mi_lock);
   2641 	lrp = list_head(&mi->mi_lost_state);
   2642 	mutex_exit(&mi->mi_lock);
   2643 	while (lrp != NULL) {
   2644 		nfs4_error_zinit(&n4e);
   2645 		resend_one_op(lrp, &n4e, mi, sp);
   2646 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
   2647 		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
   2648 		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
   2649 		    n4e.stat));
   2650 
   2651 		/*
   2652 		 * If we get a recovery error that we can actually
   2653 		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
   2654 		 * return and let the recovery thread redrive the call.
   2655 		 * Don't requeue unless the zone is still healthy.
   2656 		 */
   2657 		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
   2658 		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
   2659 		    (nfs4_try_failover(&n4e) ||
   2660 		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
   2661 		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
   2662 		    !nfs4_recov_marks_dead(n4e.stat)))) {
   2663 			/*
   2664 			 * For these three errors, we want to delay a bit
   2665 			 * instead of pounding the server into submission.
   2666 			 * We have to do this manually; the normal
   2667 			 * processing for these errors only works for
   2668 			 * non-recovery requests.
   2669 			 */
   2670 			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
   2671 			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
   2672 			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
   2673 			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
   2674 				delay(SEC_TO_TICK(nfs4err_delay_time));
   2675 			} else {
   2676 				(void) nfs4_start_recovery(&n4e,
   2677 				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
   2678 				    lrp->lr_op, NULL, NULL, NULL);
   2679 			}
   2680 			return;
   2681 		}
   2682 
   2683 		mutex_enter(&mi->mi_lock);
   2684 		list_remove(&mi->mi_lost_state, lrp);
   2685 		tlrp = lrp;
   2686 		lrp = list_head(&mi->mi_lost_state);
   2687 		mutex_exit(&mi->mi_lock);
   2688 		nfs4_free_lost_rqst(tlrp, sp);
   2689 	}
   2690 }
   2691 
   2692 /*
   2693  * Resend the given op, and issue any necessary undo call.
   2694  * errors are returned via the nfs4_error_t parameter.
   2695  */
   2696 
   2697 static void
   2698 resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
   2699     mntinfo4_t *mi, nfs4_server_t *sp)
   2700 {
   2701 	vnode_t *vp;
   2702 	nfs4_open_stream_t *osp;
   2703 	cred_t *cr;
   2704 	uint32_t acc_bits;
   2705 
   2706 	vp = lrp->lr_vp;
   2707 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
   2708 	    "have a lost open/close request for vp %p", (void *)vp));
   2709 
   2710 	switch (lrp->lr_op) {
   2711 	case OP_OPEN:
   2712 		nfs4_resend_open_otw(&vp, lrp, ep);
   2713 		break;
   2714 	case OP_OPEN_DOWNGRADE:
   2715 		ASSERT(lrp->lr_oop != NULL);
   2716 		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
   2717 		ASSERT(!ep->error);	/* recov thread always succeeds */
   2718 		ASSERT(lrp->lr_osp != NULL);
   2719 		mutex_enter(&lrp->lr_osp->os_sync_lock);
   2720 		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
   2721 		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
   2722 		    ep, NULL, NULL);
   2723 		mutex_exit(&lrp->lr_osp->os_sync_lock);
   2724 		nfs4_end_open_seqid_sync(lrp->lr_oop);
   2725 		break;
   2726 	case OP_CLOSE:
   2727 		osp = lrp->lr_osp;
   2728 		cr = lrp->lr_cr;
   2729 		acc_bits = 0;
   2730 		mutex_enter(&osp->os_sync_lock);
   2731 		if (osp->os_share_acc_read)
   2732 			acc_bits |= OPEN4_SHARE_ACCESS_READ;
   2733 		if (osp->os_share_acc_write)
   2734 			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
   2735 		mutex_exit(&osp->os_sync_lock);
   2736 		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
   2737 		    CLOSE_RESEND, 0, 0, 0);
   2738 		break;
   2739 	case OP_LOCK:
   2740 	case OP_LOCKU:
   2741 		resend_lock(lrp, ep);
   2742 		goto done;
   2743 	case OP_DELEGRETURN:
   2744 		nfs4_resend_delegreturn(lrp, ep, sp);
   2745 		goto done;
   2746 	default:
   2747 #ifdef DEBUG
   2748 		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
   2749 		    lrp->lr_op);
   2750 #endif
   2751 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
   2752 		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
   2753 		    TAG_NONE, TAG_NONE, 0, 0);
   2754 		nfs4_error_init(ep, EINVAL);
   2755 		return;
   2756 	}
   2757 
   2758 	/*
   2759 	 * No need to retry nor send an "undo" CLOSE in the
   2760 	 * event the server rebooted.
   2761 	 */
   2762 	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
   2763 	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
   2764 		goto done;
   2765 
   2766 	/*
   2767 	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
   2768 	 * to undo.  Undoing locking operations was handled by
   2769 	 * resend_lock().
   2770 	 */
   2771 	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
   2772 		goto done;
   2773 
   2774 	/*
   2775 	 * If we get any other error for OPEN, then don't attempt
   2776 	 * to undo the resend of the open (since it was never
   2777 	 * successful!).
   2778 	 */
   2779 	ASSERT(lrp->lr_op == OP_OPEN);
   2780 	if (ep->error || ep->stat != NFS4_OK)
   2781 		goto done;
   2782 
   2783 	/*
   2784 	 * Now let's undo our OPEN.
   2785 	 */
   2786 	nfs4_error_zinit(ep);
   2787 	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
   2788 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
   2789 	    "nfs4close_one: for vp %p got error %d stat %d",
   2790 	    (void *)vp, ep->error, ep->stat));
   2791 
   2792 done:
   2793 	if (vp != lrp->lr_vp)
   2794 		VN_RELE(vp);
   2795 }
   2796 
   2797 /*
   2798  * Close a file that was opened via a resent OPEN.
   2799  * Most errors are passed back to the caller (via the return value and
   2800  * *statp), except for FHEXPIRED, which is retried.
   2801  *
   2802  * It might be conceptually cleaner to push the CLOSE request onto the
   2803  * front of the resend queue, rather than sending it here.  That would
   2804  * match the way we undo lost lock requests.  On the other
   2805  * hand, we've already got something that works, and there's no reason to
   2806  * change it at this time.
   2807  */
   2808 
   2809 static void
   2810 close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
   2811     nfs4_error_t *ep)
   2812 {
   2813 
   2814 	for (;;) {
   2815 		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
   2816 		    CLOSE_AFTER_RESEND, 0, 0, 0);
   2817 		if (ep->error == 0 && ep->stat == NFS4_OK)
   2818 			break;		/* success; done */
   2819 		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
   2820 			break;
   2821 		/* else retry FHEXPIRED */
   2822 	}
   2823 
   2824 }
   2825 
   2826 /*
   2827  * Resend the given lost lock request.  Return an errno value.  If zero,
   2828  * *statp is set to the NFS status code for the call.
   2829  *
   2830  * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
   2831  * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
   2832  * Let the recovery thread redrive the call if we get a recovery error that
   2833  * we can actually recover from.
   2834  */
   2835 static void
   2836 resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
   2837 {
   2838 	bool_t		send_siglost = FALSE;
   2839 	vnode_t		*vp = lrp->lr_vp;
   2840 
   2841 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
   2842 	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
   2843 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
   2844 
   2845 	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
   2846 	    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
   2847 
   2848 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
   2849 	    "nfs4frlock for vp %p returned error %d, stat %d",
   2850 	    (void *)vp, ep->error, ep->stat));
   2851 
   2852 	if (ep->error == 0 && ep->stat == 0)
   2853 		goto done;
   2854 	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
   2855 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
   2856 		goto done;
   2857 
   2858 	/*
   2859 	 * If we failed with a non-recovery error, send SIGLOST and
   2860 	 * mark the file dead.
   2861 	 */
   2862 	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
   2863 		send_siglost = TRUE;
   2864 	else {
   2865 		/*
   2866 		 * Done with recovering LOST LOCK in the event the
   2867 		 * server rebooted or we've lost the lease.
   2868 		 */
   2869 		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
   2870 		    ep->stat == NFS4ERR_STALE_STATEID ||
   2871 		    ep->stat == NFS4ERR_EXPIRED)) {
   2872 			goto done;
   2873 		}
   2874 
   2875 		/*
   2876 		 * BAD_STATEID on an unlock indicates that the server has
   2877 		 * forgotten about the lock anyway, so act like the call
   2878 		 * was successful.
   2879 		 */
   2880 		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
   2881 		    lrp->lr_op == OP_LOCKU)
   2882 			goto done;
   2883 
   2884 		/*
   2885 		 * If we got a recovery error that we don't actually
   2886 		 * recover from, send SIGLOST.  If the filesystem was
   2887 		 * forcibly unmounted, we skip the SIGLOST because (a) it's
   2888 		 * unnecessary noise, and (b) there could be a new process
   2889 		 * with the same pid as the one that had generated the lost
   2890 		 * state request.
   2891 		 */
   2892 		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
   2893 		    nfs4_recov_marks_dead(ep->stat))) {
   2894 			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
   2895 				send_siglost = TRUE;
   2896 			goto done;
   2897 		}
   2898 
   2899 		/*
   2900 		 * If the filesystem was forcibly unmounted, we
   2901 		 * still need to synchronize with the server and
   2902 		 * release state.  Try again later.
   2903 		 */
   2904 		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
   2905 			goto done;
   2906 
   2907 		/*
   2908 		 * If we get a recovery error that we can actually
   2909 		 * recover from (such as ETIMEDOUT, FHEXPIRED),
   2910 		 * return and let the recovery thread redrive the call.
   2911 		 *
   2912 		 * For the three errors below, we want to delay a bit
   2913 		 * instead of pounding the server into submission.
   2914 		 */
   2915 		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
   2916 		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
   2917 		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
   2918 			delay(SEC_TO_TICK(recov_err_delay));
   2919 		goto done;
   2920 	}
   2921 
   2922 done:
   2923 	if (send_siglost) {
   2924 		cred_t *sv_cred;
   2925 
   2926 		/*
   2927 		 * Must be root or the actual thread being issued the
   2928 		 * SIGLOST for this to work, so just become root.
   2929 		 */
   2930 		sv_cred = curthread->t_cred;
   2931 		curthread->t_cred = kcred;
   2932 		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
   2933 		    ep->error, ep->stat);
   2934 		curthread->t_cred = sv_cred;
   2935 
   2936 		/*
   2937 		 * Flush any additional reinstantiation requests for
   2938 		 * this operation.  Sending multiple SIGLOSTs to the user
   2939 		 * process is unlikely to help and may cause trouble.
   2940 		 */
   2941 		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
   2942 			flush_reinstate(lrp);
   2943 	}
   2944 }
   2945 
   2946 /*
   2947  * Remove any lock reinstantiation requests that correspond to the given
   2948  * lost request.  We only remove items that follow lrp in the queue,
   2949  * assuming that lrp will be removed by the generic lost state code.
   2950  */
   2951 
   2952 static void
   2953 flush_reinstate(nfs4_lost_rqst_t *lrp)
   2954 {
   2955 	vnode_t *vp;
   2956 	pid_t pid;
   2957 	mntinfo4_t *mi;
   2958 	nfs4_lost_rqst_t *nlrp;
   2959 
   2960 	vp = lrp->lr_vp;
   2961 	mi = VTOMI4(vp);
   2962 	pid = lrp->lr_flk->l_pid;
   2963 
   2964 	/*
   2965 	 * If there are any more reinstantation requests to get rid of,
   2966 	 * they should all be clustered at the front of the lost state
   2967 	 * queue.
   2968 	 */
   2969 	mutex_enter(&mi->mi_lock);
   2970 	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
   2971 	    lrp = nlrp) {
   2972 		nlrp = list_next(&mi->mi_lost_state, lrp);
   2973 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
   2974 			break;
   2975 		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
   2976 			break;
   2977 		ASSERT(lrp->lr_vp == vp);
   2978 		ASSERT(lrp->lr_flk->l_pid == pid);
   2979 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
   2980 		    "remove reinstantiation %p", (void *)lrp));
   2981 		list_remove(&mi->mi_lost_state, lrp);
   2982 		nfs4_free_lost_rqst(lrp, NULL);
   2983 	}
   2984 	mutex_exit(&mi->mi_lock);
   2985 }
   2986 
   2987 /*
   2988  * End of state-specific recovery routines.
   2989  */
   2990 
   2991 /*
   2992  * Allocate a lost request struct, initialize it from lost_rqstp (including
   2993  * bumping the reference counts for the referenced vnode, etc.), and hang
   2994  * it off of recovp.
   2995  */
   2996 
   2997 static void
   2998 nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
   2999     nfs4_recov_t *action, mntinfo4_t *mi)
   3000 {
   3001 	nfs4_lost_rqst_t *destp;
   3002 
   3003 	ASSERT(recovp->rc_lost_rqst == NULL);
   3004 
   3005 	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
   3006 	recovp->rc_lost_rqst = destp;
   3007 
   3008 	if (lost_rqstp->lr_op == OP_LOCK ||
   3009 	    lost_rqstp->lr_op == OP_LOCKU) {
   3010 		ASSERT(lost_rqstp->lr_lop);
   3011 		*action = NR_LOST_LOCK;
   3012 		destp->lr_ctype = lost_rqstp->lr_ctype;
   3013 		destp->lr_locktype = lost_rqstp->lr_locktype;
   3014 	} else if (lost_rqstp->lr_op == OP_OPEN) {
   3015 		component4 *srcfp, *destfp;
   3016 
   3017 		destp->lr_oacc = lost_rqstp->lr_oacc;
   3018 		destp->lr_odeny = lost_rqstp->lr_odeny;
   3019 		destp->lr_oclaim = lost_rqstp->lr_oclaim;
   3020 		if (lost_rqstp->lr_oclaim == CLAIM_DELEGATE_CUR)
   3021 			destp->lr_ostateid = lost_rqstp->lr_ostateid;
   3022 
   3023 		srcfp = &lost_rqstp->lr_ofile;
   3024 		destfp = &destp->lr_ofile;
   3025 		/*
   3026 		 * Consume caller's utf8string
   3027 		 */
   3028 		destfp->utf8string_len = srcfp->utf8string_len;
   3029 		destfp->utf8string_val = srcfp->utf8string_val;
   3030 		srcfp->utf8string_len = 0;
   3031 		srcfp->utf8string_val = NULL;	/* make sure not reused */
   3032 
   3033 		*action = NR_LOST_STATE_RQST;
   3034 	} else if (lost_rqstp->lr_op == OP_OPEN_DOWNGRADE) {
   3035 		destp->lr_dg_acc = lost_rqstp->lr_dg_acc;
   3036 		destp->lr_dg_deny = lost_rqstp->lr_dg_deny;
   3037 
   3038 		*action = NR_LOST_STATE_RQST;
   3039 	} else if (lost_rqstp->lr_op == OP_CLOSE) {
   3040 		ASSERT(lost_rqstp->lr_oop);
   3041 		*action = NR_LOST_STATE_RQST;
   3042 	} else if (lost_rqstp->lr_op == OP_DELEGRETURN) {
   3043 		*action = NR_LOST_STATE_RQST;
   3044 	} else {
   3045 #ifdef DEBUG
   3046 		cmn_err(CE_PANIC, "nfs4_save_lost_rqst: bad op %d",
   3047 		    lost_rqstp->lr_op);
   3048 #endif
   3049 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
   3050 		    lost_rqstp->lr_op, lost_rqstp->lr_vp, lost_rqstp->lr_dvp,
   3051 		    NFS4_OK, NULL, curproc->p_pid, TAG_NONE, TAG_NONE, 0, 0);
   3052 		*action = NR_UNUSED;
   3053 		recovp->rc_lost_rqst = NULL;
   3054 		kmem_free(destp, sizeof (nfs4_lost_rqst_t));
   3055 		return;
   3056 	}
   3057 
   3058 	destp->lr_op = lost_rqstp->lr_op;
   3059 	destp->lr_vp = lost_rqstp->lr_vp;
   3060 	if (destp->lr_vp)
   3061 		VN_HOLD(destp->lr_vp);
   3062 	destp->lr_dvp = lost_rqstp->lr_dvp;
   3063 	if (destp->lr_dvp)
   3064 		VN_HOLD(destp->lr_dvp);
   3065 	destp->lr_oop = lost_rqstp->lr_oop;
   3066 	if (destp->lr_oop)
   3067 		open_owner_hold(destp->lr_oop);
   3068 	destp->lr_osp = lost_rqstp->lr_osp;
   3069 	if (destp->lr_osp)
   3070 		open_stream_hold(destp->lr_osp);
   3071 	destp->lr_lop = lost_rqstp->lr_lop;
   3072 	if (destp->lr_lop)
   3073 		lock_owner_hold(destp->lr_lop);
   3074 	destp->lr_cr = lost_rqstp->lr_cr;
   3075 	if (destp->lr_cr)
   3076 		crhold(destp->lr_cr);
   3077 	if (lost_rqstp->lr_flk == NULL)
   3078 		destp->lr_flk = NULL;
   3079 	else {
   3080 		destp->lr_flk = kmem_alloc(sizeof (flock64_t), KM_SLEEP);
   3081 		*destp->lr_flk = *lost_rqstp->lr_flk;
   3082 	}
   3083 	destp->lr_putfirst = lost_rqstp->lr_putfirst;
   3084 }
   3085 
   3086 /*
   3087  * Map the given return values (errno and nfs4 status code) to a recovery
   3088  * action and fill in the following fields of recovp: rc_action,
   3089  * rc_srv_reboot, rc_stateid, rc_lost_rqst.
   3090  */
   3091 
   3092 void
   3093 errs_to_action(recov_info_t *recovp,
   3094     nfs4_server_t *sp, mntinfo4_t *mi, stateid4 *sidp,
   3095     nfs4_lost_rqst_t *lost_rqstp, int unmounted, nfs_opnum4 op,
   3096     nfs4_bseqid_entry_t *bsep)
   3097 {
   3098 	nfs4_recov_t action = NR_UNUSED;
   3099 	bool_t reboot = FALSE;
   3100 	int try_f;
   3101 	int error = recovp->rc_orig_errors.error;
   3102 	nfsstat4 stat = recovp->rc_orig_errors.stat;
   3103 
   3104 	bzero(&recovp->rc_stateid, sizeof (stateid4));
   3105 	recovp->rc_lost_rqst = NULL;
   3106 	recovp->rc_bseqid_rqst = NULL;
   3107 
   3108 	try_f = nfs4_try_failover(&recovp->rc_orig_errors) &&
   3109 	    FAILOVER_MOUNT4(mi);
   3110 
   3111 	/*
   3112 	 * We start recovery for EINTR only in the lost lock
   3113 	 * or lost open/close case.
   3114 	 */
   3115 
   3116 	if (try_f || error == EINTR || (error == EIO && unmounted)) {
   3117 		recovp->rc_error = (error != 0 ? error : geterrno4(stat));
   3118 		if (lost_rqstp) {
   3119 			ASSERT(lost_rqstp->lr_op != 0);
   3120 			nfs4_save_lost_rqst(lost_rqstp, recovp, &action, mi);
   3121 		}
   3122 		if (try_f)
   3123 			action = NR_FAILOVER;
   3124 	} else if (error != 0) {
   3125 		recovp->rc_error = error;
   3126 		nfs4_queue_event(RE_UNEXPECTED_ERRNO, mi, NULL, error, NULL,
   3127 		    NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   3128 		action = NR_CLIENTID;
   3129 	} else {
   3130 		recovp->rc_error = geterrno4(stat);
   3131 		switch (stat) {
   3132 #ifdef notyet
   3133 		case NFS4ERR_LEASE_MOVED:
   3134 			action = xxx;
   3135 			break;
   3136 #endif
   3137 		case NFS4ERR_MOVED:
   3138 			action = NR_MOVED;
   3139 			break;
   3140 		case NFS4ERR_BADHANDLE:
   3141 			action = NR_BADHANDLE;
   3142 			break;
   3143 		case NFS4ERR_BAD_SEQID:
   3144 			if (bsep)
   3145 				save_bseqid_rqst(bsep, recovp);
   3146 			action = NR_BAD_SEQID;
   3147 			break;
   3148 		case NFS4ERR_OLD_STATEID:
   3149 			action = NR_OLDSTATEID;
   3150 			break;
   3151 		case NFS4ERR_WRONGSEC:
   3152 			action = NR_WRONGSEC;
   3153 			break;
   3154 		case NFS4ERR_FHEXPIRED:
   3155 			action = NR_FHEXPIRED;
   3156 			break;
   3157 		case NFS4ERR_BAD_STATEID:
   3158 			if (sp == NULL || (sp != NULL && inlease(sp))) {
   3159 
   3160 				action = NR_BAD_STATEID;
   3161 				if (sidp)
   3162 					recovp->rc_stateid = *sidp;
   3163 			} else
   3164 				action = NR_CLIENTID;
   3165 			break;
   3166 		case NFS4ERR_EXPIRED:
   3167 			/*
   3168 			 * The client's lease has expired, either due
   3169 			 * to a network partition or perhaps a client
   3170 			 * error.  In either case, try an NR_CLIENTID
   3171 			 * style recovery.  reboot remains false, since
   3172 			 * there is no evidence the server has rebooted.
   3173 			 * This will cause CLAIM_NULL opens and lock
   3174 			 * requests without the reclaim bit.
   3175 			 */
   3176 			action = NR_CLIENTID;
   3177 
   3178 			DTRACE_PROBE4(nfs4__expired,
   3179 			    nfs4_server_t *, sp,
   3180 			    mntinfo4_t *, mi,
   3181 			    stateid4 *, sidp, int, op);
   3182 
   3183 			break;
   3184 		case NFS4ERR_STALE_CLIENTID:
   3185 		case NFS4ERR_STALE_STATEID:
   3186 			action = NR_CLIENTID;
   3187 			reboot = TRUE;
   3188 			break;
   3189 		case NFS4ERR_RESOURCE:
   3190 			/*
   3191 			 * If this had been a FAILOVER mount, then
   3192 			 * we'd have tried failover.  Since it's not,
   3193 			 * just delay a while and retry.
   3194 			 */
   3195 			action = NR_DELAY;
   3196 			break;
   3197 		case NFS4ERR_GRACE:
   3198 			action = NR_GRACE;
   3199 			break;
   3200 		case NFS4ERR_DELAY:
   3201 			action = NR_DELAY;
   3202 			break;
   3203 		case NFS4ERR_STALE:
   3204 			action = NR_STALE;
   3205 			break;
   3206 		default:
   3207 			nfs4_queue_event(RE_UNEXPECTED_STATUS, mi, NULL, 0,
   3208 			    NULL, NULL, stat, NULL, 0, TAG_NONE, TAG_NONE,
   3209 			    0, 0);
   3210 			action = NR_CLIENTID;
   3211 			break;
   3212 		}
   3213 	}
   3214 
   3215 	/* make sure action got set */
   3216 	ASSERT(action != NR_UNUSED);
   3217 	recovp->rc_srv_reboot = reboot;
   3218 	recovp->rc_action = action;
   3219 	nfs4_queue_fact(RF_ERR, mi, stat, action, op, reboot, NULL, error,
   3220 	    NULL);
   3221 }
   3222 
   3223 /*
   3224  * Return the (held) credential for the process with the given pid.
   3225  * May return NULL (e.g., process not found).
   3226  */
   3227 
   3228 static cred_t *
   3229 pid_to_cr(pid_t pid)
   3230 {
   3231 	proc_t *p;
   3232 	cred_t *cr;
   3233 
   3234 	mutex_enter(&pidlock);
   3235 	if ((p = prfind(pid)) == NULL) {
   3236 		mutex_exit(&pidlock);
   3237 		return (NULL);
   3238 	}
   3239 
   3240 	mutex_enter(&p->p_crlock);
   3241 	crhold(cr = p->p_cred);
   3242 	mutex_exit(&p->p_crlock);
   3243 	mutex_exit(&pidlock);
   3244 
   3245 	return (cr);
   3246 }
   3247 
   3248 /*
   3249  * Send SIGLOST to the given process and queue the event.
   3250  *
   3251  * The 'dump' boolean tells us whether this action should dump the
   3252  * in-kernel queue of recovery messages or not.
   3253  */
   3254 
   3255 void
   3256 nfs4_send_siglost(pid_t pid, mntinfo4_t *mi, vnode_t *vp, bool_t dump,
   3257     int error, nfsstat4 stat)
   3258 {
   3259 	proc_t *p;
   3260 
   3261 	mutex_enter(&pidlock);
   3262 	p = prfind(pid);
   3263 	if (p)
   3264 		psignal(p, SIGLOST);
   3265 	mutex_exit(&pidlock);
   3266 	nfs4_queue_event(dump ? RE_SIGLOST : RE_SIGLOST_NO_DUMP, mi,
   3267 	    NULL, error, vp, NULL, stat, NULL, pid, TAG_NONE, TAG_NONE, 0, 0);
   3268 }
   3269 
   3270 /*
   3271  * Scan the lock list for entries that match the given pid.  Change the
   3272  * pid in those that do to NOPID.
   3273  */
   3274 
   3275 static void
   3276 relock_skip_pid(locklist_t *llp, pid_t pid)
   3277 {
   3278 	for (; llp != NULL; llp = llp->ll_next) {
   3279 		if (llp->ll_flock.l_pid == pid)
   3280 			llp->ll_flock.l_pid = NOPID;
   3281 	}
   3282 }
   3283 
   3284 /*
   3285  * Mark a file as having failed recovery, after making a last-ditch effort
   3286  * to return any delegation.
   3287  *
   3288  * Sets r_error to EIO or ESTALE for the given vnode.
   3289  */
   3290 void
   3291 nfs4_fail_recov(vnode_t *vp, char *why, int error, nfsstat4 stat)
   3292 {
   3293 	rnode4_t *rp = VTOR4(vp);
   3294 
   3295 #ifdef DEBUG
   3296 	if (nfs4_fail_recov_stop)
   3297 		debug_enter("nfs4_fail_recov");
   3298 #endif
   3299 
   3300 	mutex_enter(&rp->r_statelock);
   3301 	if (rp->r_flags & (R4RECOVERR|R4RECOVERRP)) {
   3302 		mutex_exit(&rp->r_statelock);
   3303 		return;
   3304 	}
   3305 
   3306 	/*
   3307 	 * Set R4RECOVERRP to indicate that a recovery error is in
   3308 	 * progress.  This will shut down reads and writes at the top
   3309 	 * half.  Don't set R4RECOVERR until after we've returned the
   3310 	 * delegation, otherwise it will fail.
   3311 	 */
   3312 
   3313 	rp->r_flags |= R4RECOVERRP;
   3314 	mutex_exit(&rp->r_statelock);
   3315 
   3316 	nfs4delegabandon(rp);
   3317 
   3318 	mutex_enter(&rp->r_statelock);
   3319 	rp->r_flags |= (R4RECOVERR | R4STALE);
   3320 	rp->r_error = (error == 0 && stat == NFS4ERR_STALE) ? ESTALE : EIO;
   3321 	PURGE_ATTRCACHE4_LOCKED(rp);
   3322 	if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
   3323 		nfs4_queue_event(RE_DEAD_FILE, VTOMI4(vp), NULL, error,
   3324 		    vp, NULL, stat, why, 0, TAG_NONE, TAG_NONE, 0, 0);
   3325 	mutex_exit(&rp->r_statelock);
   3326 
   3327 	dnlc_purge_vp(vp);
   3328 }
   3329 
   3330 /*
   3331  * recov_throttle: if the file had the same recovery action within the
   3332  * throttle interval, wait for the throttle interval to finish before
   3333  * proceeding.
   3334  *
   3335  * Side effects: updates the rnode with the current recovery information.
   3336  */
   3337 
   3338 static void
   3339 recov_throttle(recov_info_t *recovp, vnode_t *vp)
   3340 {
   3341 	time_t curtime, time_to_wait;
   3342 	rnode4_t *rp = VTOR4(vp);
   3343 
   3344 	curtime = gethrestime_sec();
   3345 
   3346 	mutex_enter(&rp->r_statelock);
   3347 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   3348 	    "recov_throttle: now: (%d, %ld), last: (%d, %ld)",
   3349 	    recovp->rc_action, curtime,
   3350 	    rp->r_recov_act, rp->r_last_recov));
   3351 	if (recovp->rc_action == rp->r_recov_act &&
   3352 	    rp->r_last_recov + recov_err_delay > curtime) {
   3353 		time_to_wait = rp->r_last_recov + recov_err_delay - curtime;
   3354 		mutex_exit(&rp->r_statelock);
   3355 		delay(SEC_TO_TICK(time_to_wait));
   3356 		curtime = gethrestime_sec();
   3357 		mutex_enter(&rp->r_statelock);
   3358 	}
   3359 
   3360 	rp->r_last_recov = curtime;
   3361 	rp->r_recov_act = recovp->rc_action;
   3362 	mutex_exit(&rp->r_statelock);
   3363 }
   3364 
   3365 /*
   3366  * React to NFS4ERR_GRACE by setting the time we'll permit
   3367  * the next call to this filesystem.
   3368  */
   3369 void
   3370 nfs4_set_grace_wait(mntinfo4_t *mi)
   3371 {
   3372 	mutex_enter(&mi->mi_lock);
   3373 	/* Mark the time for the future */
   3374 	mi->mi_grace_wait = gethrestime_sec() + nfs4err_delay_time;
   3375 	mutex_exit(&mi->mi_lock);
   3376 }
   3377 
   3378 /*
   3379  * React to MFS4ERR_DELAY by setting the time we'll permit
   3380  * the next call to this vnode.
   3381  */
   3382 void
   3383 nfs4_set_delay_wait(vnode_t *vp)
   3384 {
   3385 	rnode4_t *rp = VTOR4(vp);
   3386 
   3387 	mutex_enter(&rp->r_statelock);
   3388 	/*
   3389 	 * Calculate amount we should delay, initial
   3390 	 * delay will be short and then we will back off.
   3391 	 */
   3392 	if (rp->r_delay_interval == 0)
   3393 		rp->r_delay_interval = NFS4_INITIAL_DELAY_INTERVAL;
   3394 	else
   3395 		/* calculate next interval value */
   3396 		rp->r_delay_interval =
   3397 		    MIN(NFS4_MAX_DELAY_INTERVAL, (rp->r_delay_interval << 1));
   3398 	rp->r_delay_wait = gethrestime_sec() + rp->r_delay_interval;
   3399 	mutex_exit(&rp->r_statelock);
   3400 }
   3401 
   3402 /*
   3403  * The caller is responsible for freeing the returned string.
   3404  */
   3405 static char *
   3406 nfs4_getsrvnames(mntinfo4_t *mi, size_t *len)
   3407 {
   3408 	servinfo4_t *svp;
   3409 	char *srvnames;
   3410 	char *namep;
   3411 	size_t length;
   3412 
   3413 	/*
   3414 	 * Calculate the length of the string required to hold all
   3415 	 * of the server names plus either a comma or a null
   3416 	 * character following each individual one.
   3417 	 */
   3418 	length = 0;
   3419 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
   3420 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   3421 		if (svp->sv_flags & SV4_NOTINUSE) {
   3422 			nfs_rw_exit(&svp->sv_lock);
   3423 			continue;
   3424 		}
   3425 		nfs_rw_exit(&svp->sv_lock);
   3426 		length += svp->sv_hostnamelen;
   3427 	}
   3428 
   3429 	srvnames = kmem_alloc(length, KM_SLEEP);
   3430 
   3431 	namep = srvnames;
   3432 	for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
   3433 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   3434 		if (svp->sv_flags & SV4_NOTINUSE) {
   3435 			nfs_rw_exit(&svp->sv_lock);
   3436 			continue;
   3437 		}
   3438 		nfs_rw_exit(&svp->sv_lock);
   3439 		(void) strcpy(namep, svp->sv_hostname);
   3440 		namep += svp->sv_hostnamelen - 1;
   3441 		*namep++ = ',';
   3442 	}
   3443 	*--namep = '\0';
   3444 
   3445 	*len = length;
   3446 
   3447 	return (srvnames);
   3448 }
   3449 
   3450 static void
   3451 save_bseqid_rqst(nfs4_bseqid_entry_t *bsep, recov_info_t *recovp)
   3452 {
   3453 	nfs4_bseqid_entry_t *destp;
   3454 
   3455 	destp = kmem_alloc(sizeof (nfs4_bseqid_entry_t), KM_SLEEP);
   3456 	recovp->rc_bseqid_rqst = destp;
   3457 
   3458 	if (bsep->bs_oop)
   3459 		open_owner_hold(bsep->bs_oop);
   3460 	destp->bs_oop = bsep->bs_oop;
   3461 	if (bsep->bs_lop)
   3462 		lock_owner_hold(bsep->bs_lop);
   3463 	destp->bs_lop = bsep->bs_lop;
   3464 	if (bsep->bs_vp)
   3465 		VN_HOLD(bsep->bs_vp);
   3466 	destp->bs_vp = bsep->bs_vp;
   3467 	destp->bs_pid = bsep->bs_pid;
   3468 	destp->bs_tag = bsep->bs_tag;
   3469 	destp->bs_seqid = bsep->bs_seqid;
   3470 }
   3471 
   3472 static void
   3473 free_bseqid_rqst(nfs4_bseqid_entry_t *bsep)
   3474 {
   3475 	if (bsep->bs_oop)
   3476 		open_owner_rele(bsep->bs_oop);
   3477 	if (bsep->bs_lop)
   3478 		lock_owner_rele(bsep->bs_lop);
   3479 	if (bsep->bs_vp)
   3480 		VN_RELE(bsep->bs_vp);
   3481 	kmem_free(bsep, sizeof (nfs4_bseqid_entry_t));
   3482 }
   3483 
   3484 /*
   3485  * We don't actually fully recover from NFS4ERR_BAD_SEQID.  We
   3486  * simply mark the open owner and open stream (if provided) as "bad".
   3487  * Then future uses of these data structures will be limited to basically
   3488  * just cleaning up the internal client state (no going OTW).
   3489  *
   3490  * The result of this is to return errors back to the app/usr when
   3491  * we receive NFS4ERR_BAD_SEQID, but also allow future/new calls to
   3492  * succeed so progress can be made.
   3493  */
   3494 void
   3495 recov_bad_seqid(recov_info_t *recovp)
   3496 {
   3497 	mntinfo4_t		*mi = recovp->rc_mi;
   3498 	nfs4_open_owner_t	*bad_oop;
   3499 	nfs4_lock_owner_t	*bad_lop;
   3500 	vnode_t			*vp;
   3501 	rnode4_t		*rp = NULL;
   3502 	pid_t			pid;
   3503 	nfs4_bseqid_entry_t	*bsep, *tbsep;
   3504 	int			error;
   3505 
   3506 	ASSERT(mi != NULL);
   3507 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
   3508 
   3509 	mutex_enter(&mi->mi_lock);
   3510 	bsep = list_head(&mi->mi_bseqid_list);
   3511 	mutex_exit(&mi->mi_lock);
   3512 
   3513 	/*
   3514 	 * Handle all the bad seqid entries on mi's list.
   3515 	 */
   3516 	while (bsep != NULL) {
   3517 		bad_oop = bsep->bs_oop;
   3518 		bad_lop = bsep->bs_lop;
   3519 		vp = bsep->bs_vp;
   3520 		pid = bsep->bs_pid;
   3521 
   3522 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   3523 		    "recov_bad_seqid: mark oop %p lop %p as bad for "
   3524 		    "vp %p tag %s pid %d: last good seqid %d for tag %s",
   3525 		    (void *)bad_oop, (void *)bad_lop, (void *)vp,
   3526 		    nfs4_ctags[bsep->bs_tag].ct_str, pid,
   3527 		    bad_oop ?  bad_oop->oo_last_good_seqid : 0,
   3528 		    bad_oop ? nfs4_ctags[bad_oop->oo_last_good_op].ct_str :
   3529 		    nfs4_ctags[TAG_NONE].ct_str));
   3530 
   3531 		nfs4_queue_event(RE_BAD_SEQID, mi, NULL,
   3532 		    0, vp, NULL, NFS4ERR_BAD_SEQID, NULL, pid, bsep->bs_tag,
   3533 		    bad_oop ? bad_oop->oo_last_good_op : TAG_NONE,
   3534 		    bsep->bs_seqid, bad_oop ? bad_oop->oo_last_good_seqid : 0);
   3535 
   3536 		if (bad_oop) {
   3537 			/* essentially reset the open owner */
   3538 			error = nfs4_start_open_seqid_sync(bad_oop, mi);
   3539 			ASSERT(!error);	/* recov thread always succeeds */
   3540 			bad_oop->oo_name = nfs4_get_new_oo_name();
   3541 			bad_oop->oo_seqid = 0;
   3542 			nfs4_end_open_seqid_sync(bad_oop);
   3543 		}
   3544 
   3545 		if (bad_lop) {
   3546 			mutex_enter(&bad_lop->lo_lock);
   3547 			bad_lop->lo_flags |= NFS4_BAD_SEQID_LOCK;
   3548 			mutex_exit(&bad_lop->lo_lock);
   3549 
   3550 			ASSERT(vp != NULL);
   3551 			rp = VTOR4(vp);
   3552 			mutex_enter(&rp->r_statelock);
   3553 			rp->r_flags |= R4LODANGLERS;
   3554 			mutex_exit(&rp->r_statelock);
   3555 
   3556 			nfs4_send_siglost(pid, mi, vp, TRUE,
   3557 			    0, NFS4ERR_BAD_SEQID);
   3558 		}
   3559 
   3560 		mutex_enter(&mi->mi_lock);
   3561 		list_remove(&mi->mi_bseqid_list, bsep);
   3562 		tbsep = bsep;
   3563 		bsep = list_head(&mi->mi_bseqid_list);
   3564 		mutex_exit(&mi->mi_lock);
   3565 		free_bseqid_rqst(tbsep);
   3566 	}
   3567 
   3568 	mutex_enter(&mi->mi_lock);
   3569 	mi->mi_recovflags &= ~MI4R_BAD_SEQID;
   3570 	mutex_exit(&mi->mi_lock);
   3571 }
   3572