Home | History | Annotate | Download | only in nfs
      1      0    stevel /*
      2      0    stevel  * CDDL HEADER START
      3      0    stevel  *
      4      0    stevel  * The contents of this file are subject to the terms of the
      5   1705   jwahlig  * Common Development and Distribution License (the "License").
      6   1705   jwahlig  * You may not use this file except in compliance with the License.
      7      0    stevel  *
      8      0    stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      0    stevel  * or http://www.opensolaris.org/os/licensing.
     10      0    stevel  * See the License for the specific language governing permissions
     11      0    stevel  * and limitations under the License.
     12      0    stevel  *
     13      0    stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14      0    stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      0    stevel  * If applicable, add the following below this CDDL HEADER, with the
     16      0    stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17      0    stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      0    stevel  *
     19      0    stevel  * CDDL HEADER END
     20      0    stevel  */
     21      0    stevel /*
     22   9858     Pavel  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23      0    stevel  * Use is subject to license terms.
     24      0    stevel  */
     25      0    stevel 
     26      0    stevel /*
     27      0    stevel  * NFS Version 4 state recovery code.
     28      0    stevel  */
     29      0    stevel 
     30      0    stevel #include <nfs/nfs4_clnt.h>
     31      0    stevel #include <nfs/nfs4.h>
     32      0    stevel #include <nfs/rnode4.h>
     33      0    stevel #include <sys/cmn_err.h>
     34      0    stevel #include <sys/cred.h>
     35      0    stevel #include <sys/systm.h>
     36      0    stevel #include <sys/flock.h>
     37      0    stevel #include <sys/dnlc.h>
     38      0    stevel #include <sys/ddi.h>
     39      0    stevel #include <sys/disp.h>
     40      0    stevel #include <sys/list.h>
     41      0    stevel #include <sys/sdt.h>
     42      0    stevel 
     43      0    stevel extern r4hashq_t *rtable4;
     44      0    stevel 
     45      0    stevel /*
     46      0    stevel  * Information that describes what needs to be done for recovery.  It is
     47      0    stevel  * passed to a client recovery thread as well as passed to various recovery
     48      0    stevel  * routines.  rc_mi, rc_vp1, and rc_vp2 refer to the filesystem and
     49      0    stevel  * vnode(s) affected by recovery.  rc_vp1 and rc_vp2 are references (use
     50      0    stevel  * VN_HOLD) or NULL.  rc_lost_rqst contains information about the lost
     51      0    stevel  * lock or open/close request, and it holds reference counts for the
     52      0    stevel  * various objects (vnode, etc.).  The recovery thread also uses flags set
     53      0    stevel  * in the mntinfo4_t or vnode_t to tell it what to do.  rc_error is used
     54      0    stevel  * to save the error that originally triggered the recovery event -- will
     55      0    stevel  * later be used to set mi_error if recovery doesn't work.  rc_bseqid_rqst
     56      0    stevel  * contains information about the request that got NFS4ERR_BAD_SEQID, and
     57      0    stevel  * it holds reference count for the various objects (vnode, open owner,
     58      0    stevel  * open stream, lock owner).
     59      0    stevel  */
     60      0    stevel 
     61      0    stevel typedef struct {
     62      0    stevel 	mntinfo4_t *rc_mi;		/* filesystem affected by recovery */
     63      0    stevel 	vnode_t *rc_vp1;		/* affected vnode (held), or NULL */
     64      0    stevel 	vnode_t *rc_vp2;		/* second affected vnode (held), or NULL */
     65      0    stevel 	nfs4_recov_t rc_action;		/* recovery action to carry out */
     66      0    stevel 	stateid4 rc_stateid;		/* NOTE(review): presumably the stateid of the failed op; confirm */
     67      0    stevel 	bool_t rc_srv_reboot;		/* server has rebooted */
     68      0    stevel 	nfs4_lost_rqst_t *rc_lost_rqst;	/* lost lock or open/close request */
     69      0    stevel 	nfs4_error_t rc_orig_errors;	/* original errors causing recovery */
     70      0    stevel 	int rc_error;			/* saved trigger error; later used for mi_error */
     71      0    stevel 	nfs4_bseqid_entry_t *rc_bseqid_rqst;	/* request that got NFS4ERR_BAD_SEQID */
     72      0    stevel } recov_info_t;
     73      0    stevel 
     74      0    stevel /*
     75      0    stevel  * How long to wait before trying again if there is an error doing
     76      0    stevel  * recovery, in seconds.
     77      0    stevel  */
     78      0    stevel 
     79      0    stevel static int recov_err_delay = 1;	/* seconds */
     80      0    stevel 
     81      0    stevel /*
     82      0    stevel  * How long to wait when processing NFS4ERR_GRACE or NFS4ERR_DELAY
     83      0    stevel  * errors.  Expressed in seconds.  Default is defined as
     84      0    stevel  * NFS4ERR_DELAY_TIME and this variable is initialized in nfs4_subr_init().
     85      0    stevel  */
     86      0    stevel time_t nfs4err_delay_time = 0;	/* seconds; 0 until nfs4_subr_init() sets it */
     87      0    stevel 
     88      0    stevel /*
     89      0    stevel  * Tuneable to limit how many times "exempt" ops go OTW
     90      0    stevel  * after a recovery error.  Exempt op hints are OH_CLOSE,
     91      0    stevel  * OH_LOCKU, OH_DELEGRETURN.  These previously always went
     92      0    stevel  * OTW even after rnode was "dead" due to recovery errors.
     93      0    stevel  *
     94      0    stevel  * The tuneable below limits the number of times a start_fop
     95      0    stevel  * invocation will retry the exempt hints.  After the limit
     96      0    stevel  * is reached, nfs4_start_fop will return an error just like
     97      0    stevel  * it would for non-exempt op hints.
     98      0    stevel  */
     99      0    stevel int nfs4_max_recov_error_retry = 3;	/* retry limit for exempt op hints */
    100      0    stevel 
    101      0    stevel /*
    102      0    stevel  * Number of seconds the recovery thread should pause before retry when the
    103      0    stevel  * filesystem has been forcibly unmounted.
    104      0    stevel  */
    105      0    stevel 
    106      0    stevel int nfs4_unmount_delay = 1;	/* seconds */
    107      0    stevel 
    108      0    stevel #ifdef DEBUG
    109      0    stevel 
    110      0    stevel /*
    111      0    stevel  * How long to wait (in seconds) between recovery operations on a given
    112      0    stevel  * file.  Normally zero, but could be set longer for testing purposes.
    113      0    stevel  */
    114      0    stevel static int nfs4_recovdelay = 0;	/* seconds; normally zero */
    115      0    stevel 
    116      0    stevel /*
    117      0    stevel  * Switch that controls whether to go into the debugger when recovery
    118      0    stevel  * fails.
    119      0    stevel  */
    120      0    stevel static int nfs4_fail_recov_stop = 0;	/* debugger-on-failure switch */
    121      0    stevel 
    122      0    stevel /*
    123      0    stevel  * Tuneables to debug client namespace interaction with server
    124      0    stevel  * mount points:
    125      0    stevel  *
    126      0    stevel  *	nfs4_srvmnt_fail_cnt:
    127      0    stevel  *		number of times EACCES returned because client
    128      0    stevel  *		attempted to cross server mountpoint
    129      0    stevel  *
    130      0    stevel  *	nfs4_srvmnt_debug:
    131      0    stevel  *		trigger console printf whenever client attempts
    132      0    stevel  *		to cross server mountpoint
    133      0    stevel  */
    134      0    stevel int nfs4_srvmnt_fail_cnt = 0;	/* count of EACCES returns, see above */
    135      0    stevel int nfs4_srvmnt_debug = 0;	/* console printf switch, see above */
    136      0    stevel #endif	/* DEBUG */
    137      0    stevel 
    138      0    stevel /* forward references, in alphabetic order */
    139      0    stevel static void close_after_open_resend(vnode_t *, cred_t *, uint32_t,
    140      0    stevel 	nfs4_error_t *);
    141      0    stevel static void errs_to_action(recov_info_t *,
    142      0    stevel 	nfs4_server_t *, mntinfo4_t *, stateid4 *, nfs4_lost_rqst_t *, int,
    143      0    stevel 	nfs_opnum4, nfs4_bseqid_entry_t *);
    144      0    stevel static void flush_reinstate(nfs4_lost_rqst_t *);
    145      0    stevel static void free_milist(mntinfo4_t **, int);
    146      0    stevel static mntinfo4_t **make_milist(nfs4_server_t *, int *);
    147      0    stevel static int nfs4_check_recov_err(vnode_t *, nfs4_op_hint_t,
    148      0    stevel 	nfs4_recov_state_t *, int, char *);
    149      0    stevel static char *nfs4_getsrvnames(mntinfo4_t *, size_t *);
    150      0    stevel static void nfs4_recov_fh_fail(vnode_t *, int, nfsstat4);
    151      0    stevel static void nfs4_recov_thread(recov_info_t *);
    152      0    stevel static void nfs4_remove_lost_rqsts(mntinfo4_t *, nfs4_server_t *);
    153      0    stevel static void nfs4_resend_lost_rqsts(recov_info_t *, nfs4_server_t *);
    154      0    stevel static cred_t *pid_to_cr(pid_t);
    155      0    stevel static void reclaim_one_lock(vnode_t *, flock64_t *, nfs4_error_t *, int *);
    156      0    stevel static void recov_bad_seqid(recov_info_t *);
    157      0    stevel static void recov_badstate(recov_info_t *, vnode_t *, nfsstat4);
    158      0    stevel static void recov_clientid(recov_info_t *, nfs4_server_t *);
    159      0    stevel static void recov_done(mntinfo4_t *, recov_info_t *);
    160      0    stevel static void recov_filehandle(nfs4_recov_t, mntinfo4_t *, vnode_t *);
    161      0    stevel static void recov_newserver(recov_info_t *, nfs4_server_t **, bool_t *);
    162      0    stevel static void recov_openfiles(recov_info_t *, nfs4_server_t *);
    163      0    stevel static void recov_stale(mntinfo4_t *, vnode_t *);
    164      0    stevel static void nfs4_free_lost_rqst(nfs4_lost_rqst_t *, nfs4_server_t *);
    165      0    stevel static void recov_throttle(recov_info_t *, vnode_t *);
    166      0    stevel static void relock_skip_pid(locklist_t *, pid_t);
    167      0    stevel static void resend_lock(nfs4_lost_rqst_t *, nfs4_error_t *);
    168      0    stevel static void resend_one_op(nfs4_lost_rqst_t *, nfs4_error_t *, mntinfo4_t *,
    169      0    stevel 	nfs4_server_t *);
    170      0    stevel static void save_bseqid_rqst(nfs4_bseqid_entry_t *, recov_info_t *);
    171      0    stevel static void start_recovery(recov_info_t *, mntinfo4_t *, vnode_t *, vnode_t *,
    172      0    stevel 	nfs4_server_t *);
    173      0    stevel static void start_recovery_action(nfs4_recov_t, bool_t, mntinfo4_t *, vnode_t *,
    174      0    stevel 	vnode_t *);
    175      0    stevel static int wait_for_recovery(mntinfo4_t *, nfs4_op_hint_t);
    176      0    stevel 
    177      0    stevel /*
    178      0    stevel  * Return non-zero if the given errno, status, and rpc status codes
    179      0    stevel  * in the nfs4_error_t indicate that client recovery is needed.
    180      0    stevel  * "stateful" indicates whether the call that got the error establishes or
    181      0    stevel  * removes state on the server (open, close, lock, unlock, delegreturn).
    182      0    stevel  */
    183      0    stevel 
    184      0    stevel int
    185      0    stevel nfs4_needs_recovery(nfs4_error_t *ep, bool_t stateful, vfs_t *vfsp)
    186      0    stevel {
    187      0    stevel 	int recov = 0;
    188      0    stevel 	mntinfo4_t *mi;
    189      0    stevel 
    190      0    stevel 	/*
    191      0    stevel 	 * Try failover if the error values justify it and if
    192      0    stevel 	 * it's a failover mount.  Don't try if the mount is in
    193      0    stevel 	 * progress, failures are handled explicitly by nfs4rootvp.
    194      0    stevel 	 */
    195      0    stevel 	if (nfs4_try_failover(ep)) {
    196      0    stevel 		mi = VFTOMI4(vfsp);
    197      0    stevel 		mutex_enter(&mi->mi_lock);
    198      0    stevel 		recov = FAILOVER_MOUNT4(mi) && !(mi->mi_flags & MI4_MOUNTING);
    199      0    stevel 		mutex_exit(&mi->mi_lock);
    200      0    stevel 		if (recov)
    201      0    stevel 			return (recov);
    202      0    stevel 	}
    203      0    stevel 
    204      0    stevel 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vfsp)) {
    205      0    stevel 		/*
    206      0    stevel 		 * The server may have gotten the request, so for stateful
    207      0    stevel 		 * ops we need to resynchronize and possibly back out the
    208      0    stevel 		 * op.
    209      0    stevel 		 */
    210      0    stevel 		return (stateful);
    211      0    stevel 	}
    212      0    stevel 	if (ep->error != 0)
    213      0    stevel 		return (0);
    214      0    stevel 
    215      0    stevel 	/* stat values are listed alphabetically */
    216      0    stevel 	/*
    217      0    stevel 	 * There are two lists here: the errors for which we have code, and
    218      0    stevel 	 * the errors for which we plan to have code before FCS.  For the
    219      0    stevel 	 * second list, print a warning message but don't attempt recovery.
    220      0    stevel 	 */
    221      0    stevel 	switch (ep->stat) {
    222      0    stevel 	case NFS4ERR_BADHANDLE:
    223      0    stevel 	case NFS4ERR_BAD_SEQID:
    224      0    stevel 	case NFS4ERR_BAD_STATEID:
    225      0    stevel 	case NFS4ERR_DELAY:
    226      0    stevel 	case NFS4ERR_EXPIRED:
    227      0    stevel 	case NFS4ERR_FHEXPIRED:
    228      0    stevel 	case NFS4ERR_GRACE:
    229      0    stevel 	case NFS4ERR_OLD_STATEID:
    230      0    stevel 	case NFS4ERR_RESOURCE:
    231      0    stevel 	case NFS4ERR_STALE_CLIENTID:
    232      0    stevel 	case NFS4ERR_STALE_STATEID:
    233      0    stevel 	case NFS4ERR_WRONGSEC:
    234      0    stevel 	case NFS4ERR_STALE:
    235      0    stevel 		recov = 1;
    236      0    stevel 		break;
    237      0    stevel #ifdef DEBUG
    238      0    stevel 	case NFS4ERR_LEASE_MOVED:
    239      0    stevel 	case NFS4ERR_MOVED:
    240      0    stevel 		zcmn_err(VFTOMI4(vfsp)->mi_zone->zone_id,
    241      0    stevel 		    CE_WARN, "!Can't yet recover from NFS status %d",
    242   5302  th199096 		    ep->stat);
    243      0    stevel 		break;
    244      0    stevel #endif
    245      0    stevel 	}
    246      0    stevel 
    247      0    stevel 	return (recov);
    248      0    stevel }
    249      0    stevel 
    250      0    stevel /*
    251      0    stevel  * Some operations such as DELEGRETURN want to avoid invoking
    252      0    stevel  * recovery actions that will only mark the file dead.  If
    253      0    stevel  * better handlers are invoked for any of these errors, this
    254      0    stevel  * routine should be modified.
    255      0    stevel  */
    256      0    stevel int
    257      0    stevel nfs4_recov_marks_dead(nfsstat4 status)
    258      0    stevel {
    259      0    stevel 	if (status == NFS4ERR_BAD_SEQID ||
    260      0    stevel 	    status == NFS4ERR_EXPIRED ||
    261      0    stevel 	    status == NFS4ERR_BAD_STATEID ||
    262      0    stevel 	    status == NFS4ERR_OLD_STATEID)
    263      0    stevel 		return (1);
    264      0    stevel 	return (0);
    265      0    stevel }
    266      0    stevel 
    267      0    stevel /*
    268      0    stevel  * Transfer the state recovery information in recovp to mi's resend queue,
    269      0    stevel  * and mark mi as having a lost state request.
    270      0    stevel  */
    271      0    stevel static void
    272      0    stevel nfs4_enqueue_lost_rqst(recov_info_t *recovp, mntinfo4_t *mi)
    273      0    stevel {
    274      0    stevel 	nfs4_lost_rqst_t *lrp = recovp->rc_lost_rqst;
    275      0    stevel 
    276      0    stevel 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    277      0    stevel 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    278      0    stevel 
    279      0    stevel 	ASSERT(lrp != NULL && lrp->lr_op != 0);
    280      0    stevel 
    281      0    stevel 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
    282   5302  th199096 	    "nfs4_enqueue_lost_rqst %p, op %d",
    283   5302  th199096 	    (void *)lrp, lrp->lr_op));
    284      0    stevel 
    285      0    stevel 	mutex_enter(&mi->mi_lock);
    286      0    stevel 	mi->mi_recovflags |= MI4R_LOST_STATE;
    287      0    stevel 	if (lrp->lr_putfirst)
    288      0    stevel 		list_insert_head(&mi->mi_lost_state, lrp);
    289      0    stevel 	else
    290      0    stevel 		list_insert_tail(&mi->mi_lost_state, lrp);
    291      0    stevel 	recovp->rc_lost_rqst = NULL;
    292      0    stevel 	mutex_exit(&mi->mi_lock);
    293      0    stevel 
    294      0    stevel 	nfs4_queue_event(RE_LOST_STATE, mi, NULL, lrp->lr_op, lrp->lr_vp,
    295   5302  th199096 	    lrp->lr_dvp, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
    296      0    stevel }
    297      0    stevel 
    298      0    stevel /*
    299      0    stevel  * Transfer the bad seqid recovery information in recovp to mi's
    300      0    stevel  * bad seqid queue, and mark mi as having a bad seqid request.
    301      0    stevel  */
    302      0    stevel void
    303      0    stevel enqueue_bseqid_rqst(recov_info_t *recovp, mntinfo4_t *mi)
    304      0    stevel {
    305      0    stevel 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    306      0    stevel 	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    307      0    stevel 	ASSERT(recovp->rc_bseqid_rqst != NULL);
    308      0    stevel 
    309      0    stevel 	mutex_enter(&mi->mi_lock);
    310      0    stevel 	mi->mi_recovflags |= MI4R_BAD_SEQID;
    311      0    stevel 	list_insert_tail(&mi->mi_bseqid_list, recovp->rc_bseqid_rqst);
    312      0    stevel 	recovp->rc_bseqid_rqst = NULL;
    313      0    stevel 	mutex_exit(&mi->mi_lock);
    314      0    stevel }
    315      0    stevel 
    316      0    stevel /*
    317      0    stevel  * Initiate recovery.
    318      0    stevel  *
    319      0    stevel  * The nfs4_error_t contains the return codes that triggered a recovery
    320      0    stevel  * attempt.  mi, vp1, and vp2 refer to the filesystem and files that were
    321      0    stevel  * being operated on.  vp1 and vp2 may be NULL.
    322      0    stevel  *
    323      0    stevel  * Multiple calls are okay.  If recovery is already underway, the call
    324      0    stevel  * updates the information about what state needs recovery but does not
    325      0    stevel  * start a new thread.  The caller should hold mi->mi_recovlock as a reader
    326      0    stevel  * for proper synchronization with any recovery thread.
    327      0    stevel  *
    328      0    stevel  * This will return TRUE if recovery was aborted, and FALSE otherwise.
    329      0    stevel  */
    330      0    stevel bool_t
    331      0    stevel nfs4_start_recovery(nfs4_error_t *ep, mntinfo4_t *mi, vnode_t *vp1,
    332      0    stevel     vnode_t *vp2, stateid4 *sid, nfs4_lost_rqst_t *lost_rqstp, nfs_opnum4 op,
    333      0    stevel     nfs4_bseqid_entry_t *bsep)
    334      0    stevel {
    335      0    stevel 	recov_info_t *recovp;
    336      0    stevel 	nfs4_server_t *sp;
    337      0    stevel 	bool_t abort = FALSE;
    338      0    stevel 	bool_t gone = FALSE;
    339      0    stevel 
    340    766  carlsonj 	ASSERT(nfs_zone() == mi->mi_zone);
    341      0    stevel 	mutex_enter(&mi->mi_lock);
    342      0    stevel 	/*
    343      0    stevel 	 * If there is lost state, we need to kick off recovery even if the
    344      0    stevel 	 * filesystem has been unmounted or the zone is shutting down.
    345      0    stevel 	 */
    346      0    stevel 	gone = FS_OR_ZONE_GONE4(mi->mi_vfsp);
    347      0    stevel 	if (gone) {
    348      0    stevel 		ASSERT(ep->error != EINTR || lost_rqstp != NULL);
    349      0    stevel 		if (ep->error == EIO && lost_rqstp == NULL) {
    350      0    stevel 			/* failed due to forced unmount, no new lost state */
    351      0    stevel 			abort = TRUE;
    352      0    stevel 		}
    353      0    stevel 		if ((ep->error == 0 || ep->error == ETIMEDOUT) &&
    354      0    stevel 		    !(mi->mi_recovflags & MI4R_LOST_STATE)) {
    355      0    stevel 			/* some other failure, no existing lost state */
    356      0    stevel 			abort = TRUE;
    357      0    stevel 		}
    358      0    stevel 		if (abort) {
    359      0    stevel 			mutex_exit(&mi->mi_lock);
    360      0    stevel 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    361   5302  th199096 			    "nfs4_start_recovery: fs unmounted"));
    362      0    stevel 			return (TRUE);
    363      0    stevel 		}
    364      0    stevel 	}
    365      0    stevel 	mi->mi_in_recovery++;
    366      0    stevel 	mutex_exit(&mi->mi_lock);
    367      0    stevel 
    368      0    stevel 	recovp = kmem_alloc(sizeof (recov_info_t), KM_SLEEP);
    369      0    stevel 	recovp->rc_orig_errors = *ep;
    370      0    stevel 	sp = find_nfs4_server(mi);
    371   5302  th199096 	errs_to_action(recovp, sp, mi, sid, lost_rqstp, gone, op, bsep);
    372      0    stevel 	if (sp != NULL)
    373      0    stevel 		mutex_exit(&sp->s_lock);
    374      0    stevel 	start_recovery(recovp, mi, vp1, vp2, sp);
    375      0    stevel 	if (sp != NULL)
    376      0    stevel 		nfs4_server_rele(sp);
    377      0    stevel 	return (FALSE);
    378      0    stevel }
    379      0    stevel 
    380      0    stevel /*
    381      0    stevel  * Internal version of nfs4_start_recovery.  The difference is that the
    382      0    stevel  * caller specifies the recovery action, rather than the errors leading to
    383      0    stevel  * recovery.
    384      0    stevel  */
    385      0    stevel static void
    386      0    stevel start_recovery_action(nfs4_recov_t what, bool_t reboot, mntinfo4_t *mi,
    387   5302  th199096     vnode_t *vp1, vnode_t *vp2)
    388      0    stevel {
    389      0    stevel 	recov_info_t *recovp;
    390      0    stevel 
    391    766  carlsonj 	ASSERT(nfs_zone() == mi->mi_zone);
    392      0    stevel 	mutex_enter(&mi->mi_lock);
    393      0    stevel 	mi->mi_in_recovery++;
    394      0    stevel 	mutex_exit(&mi->mi_lock);
    395      0    stevel 
    396      0    stevel 	recovp = kmem_zalloc(sizeof (recov_info_t), KM_SLEEP);
    397      0    stevel 	recovp->rc_action = what;
    398      0    stevel 	recovp->rc_srv_reboot = reboot;
    399      0    stevel 	recovp->rc_error = EIO;
    400      0    stevel 	start_recovery(recovp, mi, vp1, vp2, NULL);
    401      0    stevel }
    402      0    stevel 
    403      0    stevel static void
    404      0    stevel start_recovery(recov_info_t *recovp, mntinfo4_t *mi,
    405   5302  th199096     vnode_t *vp1, vnode_t *vp2, nfs4_server_t *sp)
    406      0    stevel {
    407      0    stevel 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    408   5302  th199096 	    "start_recovery: mi %p, what %s", (void*)mi,
    409   5302  th199096 	    nfs4_recov_action_to_str(recovp->rc_action)));
    410      0    stevel 
    411      0    stevel 	/*
    412      0    stevel 	 * Bump the reference on the vfs so that we can pass it to the
    413      0    stevel 	 * recovery thread.
    414      0    stevel 	 */
    415      0    stevel 	VFS_HOLD(mi->mi_vfsp);
    416   1705   jwahlig 	MI4_HOLD(mi);
    417      0    stevel again:
    418      0    stevel 	switch (recovp->rc_action) {
    419      0    stevel 	case NR_FAILOVER:
    420      0    stevel 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    421      0    stevel 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    422      0    stevel 		if (mi->mi_servers->sv_next == NULL)
    423      0    stevel 			goto out_no_thread;
    424      0    stevel 		mutex_enter(&mi->mi_lock);
    425      0    stevel 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
    426      0    stevel 		mutex_exit(&mi->mi_lock);
    427      0    stevel 
    428      0    stevel 		if (recovp->rc_lost_rqst != NULL)
    429      0    stevel 			nfs4_enqueue_lost_rqst(recovp, mi);
    430      0    stevel 		break;
    431      0    stevel 
    432      0    stevel 	case NR_CLIENTID:
    433      0    stevel 		/*
    434      0    stevel 		 * If the filesystem has been unmounted, punt.
    435      0    stevel 		 */
    436      0    stevel 		if (sp == NULL)
    437      0    stevel 			goto out_no_thread;
    438      0    stevel 
    439      0    stevel 		/*
    440      0    stevel 		 * If nobody else is working on the clientid, mark the
    441      0    stevel 		 * clientid as being no longer set.  Then mark the specific
    442      0    stevel 		 * filesystem being worked on.
    443      0    stevel 		 */
    444      0    stevel 		if (!nfs4_server_in_recovery(sp)) {
    445      0    stevel 			mutex_enter(&sp->s_lock);
    446      0    stevel 			sp->s_flags &= ~N4S_CLIENTID_SET;
    447      0    stevel 			mutex_exit(&sp->s_lock);
    448      0    stevel 		}
    449      0    stevel 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    450      0    stevel 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    451      0    stevel 		mutex_enter(&mi->mi_lock);
    452      0    stevel 		mi->mi_recovflags |= MI4R_NEED_CLIENTID;
    453      0    stevel 		if (recovp->rc_srv_reboot)
    454      0    stevel 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
    455      0    stevel 		mutex_exit(&mi->mi_lock);
    456      0    stevel 		break;
    457      0    stevel 
    458      0    stevel 	case NR_OPENFILES:
    459      0    stevel 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    460      0    stevel 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    461      0    stevel 		mutex_enter(&mi->mi_lock);
    462      0    stevel 		mi->mi_recovflags |= MI4R_REOPEN_FILES;
    463      0    stevel 		if (recovp->rc_srv_reboot)
    464      0    stevel 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
    465      0    stevel 		mutex_exit(&mi->mi_lock);
    466      0    stevel 		break;
    467      0    stevel 
    468      0    stevel 	case NR_WRONGSEC:
    469      0    stevel 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    470      0    stevel 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    471      0    stevel 		mutex_enter(&mi->mi_lock);
    472      0    stevel 		mi->mi_recovflags |= MI4R_NEED_SECINFO;
    473      0    stevel 		mutex_exit(&mi->mi_lock);
    474      0    stevel 		break;
    475      0    stevel 
    476      0    stevel 	case NR_EXPIRED:
    477      0    stevel 		if (vp1 != NULL)
    478      0    stevel 			recov_badstate(recovp, vp1, NFS4ERR_EXPIRED);
    479      0    stevel 		if (vp2 != NULL)
    480      0    stevel 			recov_badstate(recovp, vp2, NFS4ERR_EXPIRED);
    481      0    stevel 		goto out_no_thread;	/* no further recovery possible */
    482      0    stevel 
    483      0    stevel 	case NR_BAD_STATEID:
    484      0    stevel 		if (vp1 != NULL)
    485      0    stevel 			recov_badstate(recovp, vp1, NFS4ERR_BAD_STATEID);
    486      0    stevel 		if (vp2 != NULL)
    487      0    stevel 			recov_badstate(recovp, vp2, NFS4ERR_BAD_STATEID);
    488      0    stevel 		goto out_no_thread;	/* no further recovery possible */
    489      0    stevel 
    490      0    stevel 	case NR_FHEXPIRED:
    491      0    stevel 	case NR_BADHANDLE:
    492      0    stevel 		if (vp1 != NULL)
    493      0    stevel 			recov_throttle(recovp, vp1);
    494      0    stevel 		if (vp2 != NULL)
    495      0    stevel 			recov_throttle(recovp, vp2);
    496      0    stevel 		/*
    497      0    stevel 		 * Recover the filehandle now, rather than using a
    498      0    stevel 		 * separate thread.  We can do this because filehandle
    499      0    stevel 		 * recovery is independent of any other state, and because
    500      0    stevel 		 * we know that we are not competing with the recovery
    501      0    stevel 		 * thread at this time.  recov_filehandle will deal with
    502      0    stevel 		 * threads that are competing to recover this filehandle.
    503      0    stevel 		 */
    504      0    stevel 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    505      0    stevel 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    506      0    stevel 		if (vp1 != NULL)
    507      0    stevel 			recov_filehandle(recovp->rc_action, mi, vp1);
    508      0    stevel 		if (vp2 != NULL)
    509      0    stevel 			recov_filehandle(recovp->rc_action, mi, vp2);
    510      0    stevel 		goto out_no_thread;	/* no further recovery needed */
    511      0    stevel 
    512      0    stevel 	case NR_STALE:
    513      0    stevel 		/*
    514      0    stevel 		 * NFS4ERR_STALE handling
    515      0    stevel 		 * recov_stale() could set MI4R_NEED_NEW_SERVER to
    516      0    stevel 		 * indicate that we can and should failover.
    517      0    stevel 		 */
    518      0    stevel 		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
    519      0    stevel 		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
    520      0    stevel 
    521      0    stevel 		if (vp1 != NULL)
    522      0    stevel 			recov_stale(mi, vp1);
    523      0    stevel 		if (vp2 != NULL)
    524      0    stevel 			recov_stale(mi, vp2);
    525      0    stevel 		mutex_enter(&mi->mi_lock);
    526      0    stevel 		if ((mi->mi_recovflags & MI4R_NEED_NEW_SERVER) == 0) {
    527      0    stevel 			mutex_exit(&mi->mi_lock);
    528      0    stevel 			goto out_no_thread;
    529      0    stevel 		}
    530      0    stevel 		mutex_exit(&mi->mi_lock);
    531      0    stevel 		recovp->rc_action = NR_FAILOVER;
    532      0    stevel 		goto again;
    533      0    stevel 
    534      0    stevel 	case NR_BAD_SEQID:
    535      0    stevel 		if (recovp->rc_bseqid_rqst) {
    536      0    stevel 			enqueue_bseqid_rqst(recovp, mi);
    537      0    stevel 			break;
    538      0    stevel 		}
    539      0    stevel 
    540      0    stevel 		if (vp1 != NULL)
    541      0    stevel 			recov_badstate(recovp, vp1, NFS4ERR_BAD_SEQID);
    542      0    stevel 		if (vp2 != NULL)
    543      0    stevel 			recov_badstate(recovp, vp2, NFS4ERR_BAD_SEQID);
    544      0    stevel 		goto out_no_thread; /* no further recovery possible */
    545      0    stevel 
    546      0    stevel 	case NR_OLDSTATEID:
    547      0    stevel 		if (vp1 != NULL)
    548      0    stevel 			recov_badstate(recovp, vp1, NFS4ERR_OLD_STATEID);
    549      0    stevel 		if (vp2 != NULL)
    550      0    stevel 			recov_badstate(recovp, vp2, NFS4ERR_OLD_STATEID);
    551      0    stevel 		goto out_no_thread;	/* no further recovery possible */
    552      0    stevel 
    553      0    stevel 	case NR_GRACE:
    554      0    stevel 		nfs4_set_grace_wait(mi);
    555      0    stevel 		goto out_no_thread; /* no further action required for GRACE */
    556      0    stevel 
    557      0    stevel 	case NR_DELAY:
    558      0    stevel 		if (vp1)
    559      0    stevel 			nfs4_set_delay_wait(vp1);
    560      0    stevel 		goto out_no_thread; /* no further action required for DELAY */
    561      0    stevel 
    562      0    stevel 	case NR_LOST_STATE_RQST:
    563      0    stevel 	case NR_LOST_LOCK:
    564      0    stevel 		nfs4_enqueue_lost_rqst(recovp, mi);
    565      0    stevel 		break;
    566      0    stevel 
    567      0    stevel 	default:
    568      0    stevel 		nfs4_queue_event(RE_UNEXPECTED_ACTION, mi, NULL,
    569      0    stevel 		    recovp->rc_action, NULL, NULL, 0, NULL, 0, TAG_NONE,
    570      0    stevel 		    TAG_NONE, 0, 0);
    571      0    stevel 		goto out_no_thread;
    572      0    stevel 	}
    573      0    stevel 
    574      0    stevel 	/*
    575      0    stevel 	 * If either file recently went through the same recovery, wait
    576      0    stevel 	 * awhile.  This is in case there is some sort of bug; we might not
    577      0    stevel 	 * be able to recover properly, but at least we won't bombard the
    578      0    stevel 	 * server with calls, and we won't tie up the client.
    579      0    stevel 	 */
    580      0    stevel 	if (vp1 != NULL)
    581      0    stevel 		recov_throttle(recovp, vp1);
    582      0    stevel 	if (vp2 != NULL)
    583      0    stevel 		recov_throttle(recovp, vp2);
    584      0    stevel 
    585      0    stevel 	/*
    586      0    stevel 	 * If there's already a recovery thread, don't start another one.
    587      0    stevel 	 */
    588      0    stevel 
    589      0    stevel 	mutex_enter(&mi->mi_lock);
    590      0    stevel 	if (mi->mi_flags & MI4_RECOV_ACTIV) {
    591      0    stevel 		mutex_exit(&mi->mi_lock);
    592      0    stevel 		goto out_no_thread;
    593      0    stevel 	}
    594      0    stevel 	mi->mi_flags |= MI4_RECOV_ACTIV;
    595      0    stevel 	mutex_exit(&mi->mi_lock);
    596      0    stevel 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    597   5302  th199096 	    "start_recovery: starting new thread for mi %p", (void*)mi));
    598      0    stevel 
    599      0    stevel 	recovp->rc_mi = mi;
    600      0    stevel 	recovp->rc_vp1 = vp1;
    601      0    stevel 	if (vp1 != NULL) {
    602      0    stevel 		ASSERT(VTOMI4(vp1) == mi);
    603      0    stevel 		VN_HOLD(recovp->rc_vp1);
    604      0    stevel 	}
    605      0    stevel 	recovp->rc_vp2 = vp2;
    606      0    stevel 	if (vp2 != NULL) {
    607      0    stevel 		ASSERT(VTOMI4(vp2) == mi);
    608      0    stevel 		VN_HOLD(recovp->rc_vp2);
    609      0    stevel 	}
    610      0    stevel 
    611      0    stevel 	(void) zthread_create(NULL, 0, nfs4_recov_thread, recovp, 0,
    612   5302  th199096 	    minclsyspri);
    613      0    stevel 	return;
    614      0    stevel 
    615      0    stevel 	/* not reached by thread creating call */
    616      0    stevel out_no_thread:
    617      0    stevel 	mutex_enter(&mi->mi_lock);
    618      0    stevel 	mi->mi_in_recovery--;
    619    855   jwahlig 	if (mi->mi_in_recovery == 0)
    620    855   jwahlig 		cv_broadcast(&mi->mi_cv_in_recov);
    621      0    stevel 	mutex_exit(&mi->mi_lock);
    622      0    stevel 
    623      0    stevel 	VFS_RELE(mi->mi_vfsp);
    624   1705   jwahlig 	MI4_RELE(mi);
    625      0    stevel 	/*
    626      0    stevel 	 * Free up resources that were allocated for us.
    627      0    stevel 	 */
    628      0    stevel 	kmem_free(recovp, sizeof (recov_info_t));
    629      0    stevel }
    630      0    stevel 
    631      0    stevel static int
    632      0    stevel nfs4_check_recov_err(vnode_t *vp, nfs4_op_hint_t op,
    633   5302  th199096     nfs4_recov_state_t *rsp, int retry_err_cnt, char *str)
    634      0    stevel {
    635      0    stevel 	rnode4_t *rp;
    636      0    stevel 	int error = 0;
    637      0    stevel 	int exempt;
    638      0    stevel 
    639      0    stevel 	if (vp == NULL)
    640      0    stevel 		return (0);
    641      0    stevel 
    642      0    stevel 	exempt = (op == OH_CLOSE || op == OH_LOCKU || op == OH_DELEGRETURN);
    643      0    stevel 	rp = VTOR4(vp);
    644      0    stevel 	mutex_enter(&rp->r_statelock);
    645      0    stevel 
    646      0    stevel 	/*
    647      0    stevel 	 * If there was a recovery error, then allow op hints "exempt" from
    648      0    stevel 	 * recov errors to retry (currently 3 times).  Either r_error or
    649      0    stevel 	 * EIO is returned for non-exempt op hints.
    650      0    stevel 	 */
    651      0    stevel 	if (rp->r_flags & R4RECOVERR) {
    652      0    stevel 		if (exempt && rsp->rs_num_retry_despite_err <=
    653   5302  th199096 		    nfs4_max_recov_error_retry) {
    654      0    stevel 
    655      0    stevel 			/*
    656      0    stevel 			 * Check to make sure that we haven't already inc'd
    657      0    stevel 			 * rs_num_retry_despite_err for current nfs4_start_fop
    658      0    stevel 			 * instance.  We don't want to double inc (if we were
    659      0    stevel 			 * called with vp2, then the vp1 call could have
    660      0    stevel 			 * already incremented.
    661      0    stevel 			 */
    662      0    stevel 			if (retry_err_cnt == rsp->rs_num_retry_despite_err)
    663      0    stevel 				rsp->rs_num_retry_despite_err++;
    664      0    stevel 
    665      0    stevel 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    666   5302  th199096 			    "nfs4_start_fop: %s %p DEAD, cnt=%d", str,
    667   5302  th199096 			    (void *)vp, rsp->rs_num_retry_despite_err));
    668      0    stevel 		} else {
    669      0    stevel 			error = (rp->r_error ? rp->r_error : EIO);
    670      0    stevel 			/*
    671      0    stevel 			 * An ESTALE error on a non-regular file is not
    672      0    stevel 			 * "sticky".  Return the ESTALE error once, but
    673      0    stevel 			 * clear the condition to allow future operations
    674      0    stevel 			 * to go OTW.  This will allow the client to
    675      0    stevel 			 * recover if the server has merely unshared then
    676      0    stevel 			 * re-shared the file system.  For regular files,
    677      0    stevel 			 * the unshare has destroyed the open state at the
    678      0    stevel 			 * server and we aren't willing to do a reopen (yet).
    679      0    stevel 			 */
    680      0    stevel 			if (error == ESTALE && vp->v_type != VREG) {
    681      0    stevel 				rp->r_flags &=
    682   5302  th199096 				    ~(R4RECOVERR|R4RECOVERRP|R4STALE);
    683      0    stevel 				rp->r_error = 0;
    684      0    stevel 				error = ESTALE;
    685      0    stevel 			}
    686      0    stevel 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
    687   5302  th199096 			    "nfs4_start_fop: %s %p DEAD, cnt=%d error=%d",
    688   5302  th199096 			    str, (void *)vp,
    689   5302  th199096 			    rsp->rs_num_retry_despite_err, error));
    690      0    stevel 		}
    691      0    stevel 	}
    692   5302  th199096 
    693      0    stevel 	mutex_exit(&rp->r_statelock);
    694      0    stevel 	return (error);
    695      0    stevel }
    696      0    stevel 
    697      0    stevel /*
    698      0    stevel  * Initial setup code that every operation should call if it might invoke
    699      0    stevel  * client recovery.  Can block waiting for recovery to finish on a
    700      0    stevel  * filesystem.  Either vnode ptr can be NULL.
    701      0    stevel  *
    702      0    stevel  * Returns 0 if there are no outstanding errors.  Can return an
    703      0    stevel  * errno value under various circumstances (e.g., failed recovery, or
    704      0    stevel  * interrupted while waiting for recovery to finish).
    705      0    stevel  *
    706      0    stevel  * There must be a corresponding call to nfs4_end_op() to free up any locks
    707      0    stevel  * or resources allocated by this call (assuming this call succeeded),
    708      0    stevel  * using the same rsp that's passed in here.
    709      0    stevel  *
    710      0    stevel  * The open and lock seqid synchronization must be stopped before calling this
    711      0    stevel  * function, as it could lead to deadlock when trying to reopen a file or
    712      0    stevel  * reclaim a lock.  The synchronization is obtained with calls to:
    713      0    stevel  *   nfs4_start_open_seqid_sync()
    714      0    stevel  *   nfs4_start_lock_seqid_sync()
    715      0    stevel  *
    716      0    stevel  * *startrecovp is set TRUE if the caller should not bother with the
    717      0    stevel  * over-the-wire call, and just initiate recovery for the given request.
    718      0    stevel  * This is typically used for state-releasing ops if the filesystem has
    719      0    stevel  * been forcibly unmounted.  startrecovp may be NULL for
    720      0    stevel  * non-state-releasing ops.
    721      0    stevel  */
    722      0    stevel 
    723      0    stevel int
    724      0    stevel nfs4_start_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    725   5302  th199096     nfs4_recov_state_t *rsp, bool_t *startrecovp)
    726      0    stevel {
    727      0    stevel 	int error = 0, rerr_cnt;
    728      0    stevel 	nfs4_server_t *sp = NULL;
    729      0    stevel 	nfs4_server_t *tsp;
    730      0    stevel 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
    731   9858     Pavel 	uint_t droplock_cnt;
    732      0    stevel #ifdef DEBUG
    733      0    stevel 	void *fop_caller;
    734      0    stevel #endif
    735      0    stevel 
    736      0    stevel 	ASSERT(vp1 == NULL || vp1->v_vfsp == mi->mi_vfsp);
    737      0    stevel 	ASSERT(vp2 == NULL || vp2->v_vfsp == mi->mi_vfsp);
    738      0    stevel 
    739      0    stevel #ifdef	DEBUG
    740      0    stevel 	if ((fop_caller = tsd_get(nfs4_tsd_key)) != NULL) {
    741      0    stevel 		cmn_err(CE_PANIC, "Missing nfs4_end_fop: last caller %p",
    742   5302  th199096 		    fop_caller);
    743      0    stevel 	}
    744      0    stevel 	(void) tsd_set(nfs4_tsd_key, caller());
    745      0    stevel #endif
    746      0    stevel 
    747      0    stevel 	rsp->rs_sp = NULL;
    748      0    stevel 	rsp->rs_flags &= ~NFS4_RS_RENAME_HELD;
    749      0    stevel 	rerr_cnt = rsp->rs_num_retry_despite_err;
    750      0    stevel 
    751      0    stevel 	/*
    752      0    stevel 	 * Process the items that may delay() based on server response
    753      0    stevel 	 */
    754      0    stevel 	error = nfs4_wait_for_grace(mi, rsp);
    755      0    stevel 	if (error)
    756      0    stevel 		goto out;
    757      0    stevel 
    758      0    stevel 	if (vp1 != NULL) {
    759      0    stevel 		error = nfs4_wait_for_delay(vp1, rsp);
    760      0    stevel 		if (error)
    761      0    stevel 			goto out;
    762      0    stevel 	}
    763      0    stevel 
    764      0    stevel 	/* Wait for a delegation recall to complete. */
    765      0    stevel 
    766      0    stevel 	error = wait_for_recall(vp1, vp2, op, rsp);
    767      0    stevel 	if (error)
    768      0    stevel 		goto out;
    769      0    stevel 
    770      0    stevel 	/*
    771      0    stevel 	 * Wait for any current recovery actions to finish.  Note that a
    772      0    stevel 	 * recovery thread can still start up after wait_for_recovery()
    773      0    stevel 	 * finishes.  We don't block out recovery operations until we
    774      0    stevel 	 * acquire s_recovlock and mi_recovlock.
    775      0    stevel 	 */
    776      0    stevel 	error = wait_for_recovery(mi, op);
    777      0    stevel 	if (error)
    778      0    stevel 		goto out;
    779      0    stevel 
    780      0    stevel 	/*
    781      0    stevel 	 * Check to see if the rnode is already marked with a
    782      0    stevel 	 * recovery error.  If so, return it immediately.  But
    783      0    stevel 	 * always pass CLOSE, LOCKU, and DELEGRETURN so we can
    784      0    stevel 	 * clean up state on the server.
    785      0    stevel 	 */
    786      0    stevel 
    787      0    stevel 	if (vp1 != NULL) {
    788      0    stevel 		if (error = nfs4_check_recov_err(vp1, op, rsp, rerr_cnt, "vp1"))
    789      0    stevel 			goto out;
    790      0    stevel 		nfs4_check_remap(mi, vp1, NFS4_REMAP_CKATTRS, &e);
    791      0    stevel 	}
    792      0    stevel 
    793      0    stevel 	if (vp2 != NULL) {
    794      0    stevel 		if (error = nfs4_check_recov_err(vp2, op, rsp, rerr_cnt, "vp2"))
    795      0    stevel 			goto out;
    796      0    stevel 		nfs4_check_remap(mi, vp2, NFS4_REMAP_CKATTRS, &e);
    797      0    stevel 	}
    798      0    stevel 
    799      0    stevel 	/*
    800      0    stevel 	 * The lock order calls for us to acquire s_recovlock before
    801      0    stevel 	 * mi_recovlock, but we have to hold mi_recovlock to look up sp (to
    802      0    stevel 	 * prevent races with the failover/migration code).  So acquire
    803      0    stevel 	 * mi_recovlock, look up sp, drop mi_recovlock, acquire
    804      0    stevel 	 * s_recovlock and mi_recovlock, then verify that sp is still the
    805      0    stevel 	 * right object.  XXX Can we find a simpler way to deal with this?
    806      0    stevel 	 */
    807      0    stevel 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
    808      0    stevel 	    mi->mi_flags & MI4_INT)) {
    809      0    stevel 		error = EINTR;
    810      0    stevel 		goto out;
    811      0    stevel 	}
    812      0    stevel get_sp:
    813      0    stevel 	sp = find_nfs4_server(mi);
    814      0    stevel 	if (sp != NULL) {
    815      0    stevel 		sp->s_otw_call_count++;
    816    163  ek110237 		mutex_exit(&sp->s_lock);
    817   9858     Pavel 		droplock_cnt = mi->mi_srvset_cnt;
    818      0    stevel 	}
    819      0    stevel 	nfs_rw_exit(&mi->mi_recovlock);
    820      0    stevel 
    821      0    stevel 	if (sp != NULL) {
    822      0    stevel 		if (nfs_rw_enter_sig(&sp->s_recovlock, RW_READER,
    823   5302  th199096 		    mi->mi_flags & MI4_INT)) {
    824      0    stevel 			error = EINTR;
    825      0    stevel 			goto out;
    826      0    stevel 		}
    827      0    stevel 	}
    828      0    stevel 	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
    829   5302  th199096 	    mi->mi_flags & MI4_INT)) {
    830      0    stevel 		if (sp != NULL)
    831      0    stevel 			nfs_rw_exit(&sp->s_recovlock);
    832      0    stevel 		error = EINTR;
    833      0    stevel 		goto out;
    834      0    stevel 	}
    835      0    stevel 	/*
    836      0    stevel 	 * If the mntinfo4_t hasn't changed nfs4_sever_ts then
    837      0    stevel 	 * there's no point in double checking to make sure it
    838      0    stevel 	 * has switched.
    839      0    stevel 	 */
    840   9858     Pavel 	if (sp == NULL || droplock_cnt != mi->mi_srvset_cnt) {
    841      0    stevel 		tsp = find_nfs4_server(mi);
    842      0    stevel 		if (tsp != sp) {
    843      0    stevel 			/* try again */
    844      0    stevel 			if (tsp != NULL) {
    845      0    stevel 				mutex_exit(&tsp->s_lock);
    846      0    stevel 				nfs4_server_rele(tsp);
    847      0    stevel 				tsp = NULL;
    848      0    stevel 			}
    849      0    stevel 			if (sp != NULL) {
    850      0    stevel 				nfs_rw_exit(&sp->s_recovlock);
    851      0    stevel 				mutex_enter(&sp->s_lock);
    852      0    stevel 				sp->s_otw_call_count--;
    853      0    stevel 				mutex_exit(&sp->s_lock);
    854      0    stevel 				nfs4_server_rele(sp);
    855      0    stevel 				sp = NULL;
    856      0    stevel 			}
    857      0    stevel 			goto get_sp;
    858      0    stevel 		} else {
    859      0    stevel 			if (tsp != NULL) {
    860      0    stevel 				mutex_exit(&tsp->s_lock);
    861      0    stevel 				nfs4_server_rele(tsp);
    862      0    stevel 				tsp = NULL;
    863      0    stevel 			}
    864      0    stevel 		}
    865      0    stevel 	}
    866      0    stevel 
    867      0    stevel 	if (sp != NULL) {
    868      0    stevel 		rsp->rs_sp = sp;
    869      0    stevel 	}
    870      0    stevel 
    871      0    stevel 	/*
    872      0    stevel 	 * If the fileystem uses volatile filehandles, obtain a lock so
    873      0    stevel 	 * that we synchronize with renames.  Exception: mount operations
    874      0    stevel 	 * can change mi_fh_expire_type, which could be a problem, since
    875      0    stevel 	 * the end_op code needs to be consistent with the start_op code
    876      0    stevel 	 * about mi_rename_lock.  Since mounts don't compete with renames,
    877      0    stevel 	 * it's simpler to just not acquire the rename lock for mounts.
    878      0    stevel 	 */
    879      0    stevel 	if (NFS4_VOLATILE_FH(mi) && op != OH_MOUNT) {
    880      0    stevel 		if (nfs_rw_enter_sig(&mi->mi_rename_lock,
    881   5302  th199096 		    op == OH_VFH_RENAME ? RW_WRITER : RW_READER,
    882   5302  th199096 		    mi->mi_flags & MI4_INT)) {
    883      0    stevel 			nfs_rw_exit(&mi->mi_recovlock);
    884      0    stevel 			if (sp != NULL)
    885      0    stevel 				nfs_rw_exit(&sp->s_recovlock);
    886      0    stevel 			error = EINTR;
    887      0    stevel 			goto out;
    888      0    stevel 		}
    889      0    stevel 		rsp->rs_flags |= NFS4_RS_RENAME_HELD;
    890      0    stevel 	}
    891      0    stevel 
    892      0    stevel 	if (OH_IS_STATE_RELE(op)) {
    893      0    stevel 		/*
    894      0    stevel 		 * For forced unmount, letting the request proceed will
    895      0    stevel 		 * almost always delay response to the user, so hand it off
    896      0    stevel 		 * to the recovery thread.  For exiting lwp's, we don't
    897      0    stevel 		 * have a good way to tell if the request will hang.  We
    898      0    stevel 		 * generally want processes to handle their own requests so
    899      0    stevel 		 * that they can be done in parallel, but if there is
    900      0    stevel 		 * already a recovery thread, hand the request off to it.
    901      0    stevel 		 * This will improve user response at no cost to overall
    902      0    stevel 		 * system throughput.  For zone shutdown, we'd prefer
    903      0    stevel 		 * the recovery thread to handle this as well.
    904      0    stevel 		 */
    905      0    stevel 		ASSERT(startrecovp != NULL);
    906      0    stevel 		mutex_enter(&mi->mi_lock);
    907      0    stevel 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp))
    908      0    stevel 			*startrecovp = TRUE;
    909      0    stevel 		else if ((curthread->t_proc_flag & TP_LWPEXIT) &&
    910      0    stevel 		    (mi->mi_flags & MI4_RECOV_ACTIV))
    911      0    stevel 			*startrecovp = TRUE;
    912      0    stevel 		else
    913      0    stevel 			*startrecovp = FALSE;
    914      0    stevel 		mutex_exit(&mi->mi_lock);
    915      0    stevel 	} else
    916      0    stevel 		if (startrecovp != NULL)
    917      0    stevel 			*startrecovp = FALSE;
    918      0    stevel 
    919      0    stevel 	ASSERT(error == 0);
    920      0    stevel 	return (error);
    921      0    stevel 
    922      0    stevel out:
    923      0    stevel 	ASSERT(error != 0);
    924      0    stevel 	if (sp != NULL) {
    925      0    stevel 		mutex_enter(&sp->s_lock);
    926      0    stevel 		sp->s_otw_call_count--;
    927      0    stevel 		mutex_exit(&sp->s_lock);
    928      0    stevel 		nfs4_server_rele(sp);
    929      0    stevel 		rsp->rs_sp = NULL;
    930      0    stevel 	}
    931      0    stevel 	nfs4_end_op_recall(vp1, vp2, rsp);
    932      0    stevel 
    933      0    stevel #ifdef	DEBUG
    934      0    stevel 	(void) tsd_set(nfs4_tsd_key, NULL);
    935      0    stevel #endif
    936      0    stevel 	return (error);
    937      0    stevel }
    938      0    stevel 
    939      0    stevel /*
    940      0    stevel  * It is up to the caller to determine if rsp->rs_sp being NULL
    941      0    stevel  * is detrimental or not.
    942      0    stevel  */
    943      0    stevel int
    944      0    stevel nfs4_start_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
    945   5302  th199096     nfs4_recov_state_t *rsp)
    946      0    stevel {
    947      0    stevel 	ASSERT(rsp->rs_num_retry_despite_err == 0);
    948      0    stevel 	rsp->rs_num_retry_despite_err = 0;
    949      0    stevel 	return (nfs4_start_fop(mi, vp1, vp2, OH_OTHER, rsp, NULL));
    950      0    stevel }
    951      0    stevel 
    952      0    stevel /*
    953      0    stevel  * Release any resources acquired by nfs4_start_op().
    954      0    stevel  * 'sp' should be the nfs4_server pointer returned by nfs4_start_op().
    955      0    stevel  *
    956      0    stevel  * The operation hint is used to avoid a deadlock by bypassing delegation
    957      0    stevel  * return logic for writes, which are done while returning a delegation.
    958      0    stevel  */
    959      0    stevel 
    960      0    stevel void
    961      0    stevel nfs4_end_fop(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
    962   5302  th199096     nfs4_recov_state_t *rsp, bool_t needs_recov)
    963      0    stevel {
    964      0    stevel 	nfs4_server_t *sp = rsp->rs_sp;
    965      0    stevel 	rnode4_t *rp = NULL;
    966      0    stevel 
    967      0    stevel #ifdef	lint
    968      0    stevel 	/*
    969      0    stevel 	 * The op hint isn't used any more, but might be in
    970      0    stevel 	 * the future.
    971      0    stevel 	 */
    972      0    stevel 	op = op;
    973      0    stevel #endif
    974      0    stevel 
    975      0    stevel #ifdef	DEBUG
    976      0    stevel 	ASSERT(tsd_get(nfs4_tsd_key) != NULL);
    977      0    stevel 	(void) tsd_set(nfs4_tsd_key, NULL);
    978      0    stevel #endif
    979      0    stevel 
    980      0    stevel 	nfs4_end_op_recall(vp1, vp2, rsp);
    981      0    stevel 
    982      0    stevel 	if (rsp->rs_flags & NFS4_RS_RENAME_HELD)
    983      0    stevel 		nfs_rw_exit(&mi->mi_rename_lock);
    984      0    stevel 
    985      0    stevel 	if (!needs_recov) {
    986      0    stevel 		if (rsp->rs_flags & NFS4_RS_DELAY_MSG) {
    987      0    stevel 			/* may need to clear the delay interval */
    988      0    stevel 			if (vp1 != NULL) {
    989      0    stevel 				rp = VTOR4(vp1);
    990      0    stevel 				mutex_enter(&rp->r_statelock);
    991      0    stevel 				rp->r_delay_interval = 0;
    992      0    stevel 				mutex_exit(&rp->r_statelock);
    993      0    stevel 			}
    994      0    stevel 		}
    995      0    stevel 		rsp->rs_flags &= ~(NFS4_RS_GRACE_MSG|NFS4_RS_DELAY_MSG);
    996      0    stevel 	}
    997      0    stevel 
    998      0    stevel 	/*
    999      0    stevel 	 * If the corresponding nfs4_start_op() found a sp,
   1000      0    stevel 	 * then there must still be a sp.
   1001      0    stevel 	 */
   1002      0    stevel 	if (sp != NULL) {
   1003      0    stevel 		nfs_rw_exit(&mi->mi_recovlock);
   1004      0    stevel 		nfs_rw_exit(&sp->s_recovlock);
   1005      0    stevel 		mutex_enter(&sp->s_lock);
   1006      0    stevel 		sp->s_otw_call_count--;
   1007      0    stevel 		cv_broadcast(&sp->s_cv_otw_count);
   1008      0    stevel 		mutex_exit(&sp->s_lock);
   1009      0    stevel 		nfs4_server_rele(sp);
   1010      0    stevel 	} else {
   1011      0    stevel 		nfs_rw_exit(&mi->mi_recovlock);
   1012      0    stevel 	}
   1013      0    stevel }
   1014      0    stevel 
   1015      0    stevel void
   1016      0    stevel nfs4_end_op(mntinfo4_t *mi, vnode_t *vp1, vnode_t *vp2,
   1017   5302  th199096     nfs4_recov_state_t *rsp, bool_t needrecov)
   1018      0    stevel {
   1019      0    stevel 	nfs4_end_fop(mi, vp1, vp2, OH_OTHER, rsp, needrecov);
   1020      0    stevel }
   1021      0    stevel 
   1022      0    stevel /*
   1023      0    stevel  * If the filesystem is going through client recovery, block until
   1024      0    stevel  * finished.
   1025      0    stevel  * Exceptions:
   1026      0    stevel  * - state-releasing ops (CLOSE, LOCKU, DELEGRETURN) are allowed to proceed
   1027      0    stevel  *   if the filesystem has been forcibly unmounted or the lwp is exiting.
   1028      0    stevel  *
   1029      0    stevel  * Return value:
   1030      0    stevel  * - 0 if no errors
   1031      0    stevel  * - EINTR if the call was interrupted
   1032      0    stevel  * - EIO if the filesystem has been forcibly unmounted (non-state-releasing
   1033      0    stevel  *   op)
   1034      0    stevel  * - the errno value from the recovery thread, if recovery failed
   1035      0    stevel  */
   1036      0    stevel 
   1037      0    stevel static int
   1038      0    stevel wait_for_recovery(mntinfo4_t *mi, nfs4_op_hint_t op_hint)
   1039      0    stevel {
   1040      0    stevel 	int error = 0;
   1041      0    stevel 
   1042      0    stevel 	mutex_enter(&mi->mi_lock);
   1043      0    stevel 
   1044      0    stevel 	while (mi->mi_recovflags != 0) {
   1045      0    stevel 		klwp_t *lwp = ttolwp(curthread);
   1046      0    stevel 
   1047   6520  vv149972 		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) ||
   1048   6520  vv149972 		    (mi->mi_flags & MI4_RECOV_FAIL))
   1049      0    stevel 			break;
   1050      0    stevel 		if (OH_IS_STATE_RELE(op_hint) &&
   1051      0    stevel 		    (curthread->t_proc_flag & TP_LWPEXIT))
   1052      0    stevel 			break;
   1053      0    stevel 
   1054      0    stevel 		if (lwp != NULL)
   1055      0    stevel 			lwp->lwp_nostop++;
   1056      0    stevel 		/* XXX - use different cv? */
   1057      0    stevel 		if (cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock) == 0) {
   1058      0    stevel 			error = EINTR;
   1059      0    stevel 			if (lwp != NULL)
   1060      0    stevel 				lwp->lwp_nostop--;
   1061      0    stevel 			break;
   1062      0    stevel 		}
   1063      0    stevel 		if (lwp != NULL)
   1064      0    stevel 			lwp->lwp_nostop--;
   1065      0    stevel 	}
   1066      0    stevel 
   1067   6520  vv149972 	if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
   1068      0    stevel 	    !OH_IS_STATE_RELE(op_hint)) {
   1069      0    stevel 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1070   5302  th199096 		    "wait_for_recovery: forced unmount"));
   1071      0    stevel 		error = EIO;
   1072   6520  vv149972 	} else if (mi->mi_flags & MI4_RECOV_FAIL) {
   1073   6520  vv149972 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1074   6520  vv149972 		    "wait_for_recovery: fail since RECOV FAIL"));
   1075   6520  vv149972 		error = mi->mi_error;
   1076      0    stevel 	}
   1077      0    stevel 
   1078      0    stevel 	mutex_exit(&mi->mi_lock);
   1079      0    stevel 
   1080      0    stevel 	return (error);
   1081      0    stevel }
   1082      0    stevel 
   1083      0    stevel /*
   1084      0    stevel  * If the client received NFS4ERR_GRACE for this particular mount,
   1085      0    stevel  * the client blocks here until it is time to try again.
   1086      0    stevel  *
   1087      0    stevel  * Return value:
   1088      0    stevel  * - 0 if wait was successful
   1089      0    stevel  * - EINTR if the call was interrupted
   1090      0    stevel  */
   1091      0    stevel 
   1092      0    stevel int
   1093      0    stevel nfs4_wait_for_grace(mntinfo4_t *mi, nfs4_recov_state_t *rsp)
   1094      0    stevel {
   1095      0    stevel 	int error = 0;
   1096      0    stevel 	time_t curtime, time_to_wait;
   1097      0    stevel 
   1098      0    stevel 	/* do a unprotected check to reduce mi_lock contention */
   1099      0    stevel 	if (mi->mi_grace_wait != 0) {
   1100      0    stevel 		mutex_enter(&mi->mi_lock);
   1101      0    stevel 
   1102      0    stevel 		if (mi->mi_grace_wait != 0) {
   1103      0    stevel 			if (!(rsp->rs_flags & NFS4_RS_GRACE_MSG))
   1104      0    stevel 				rsp->rs_flags |= NFS4_RS_GRACE_MSG;
   1105      0    stevel 
   1106      0    stevel 			curtime = gethrestime_sec();
   1107      0    stevel 
   1108      0    stevel 			if (curtime < mi->mi_grace_wait) {
   1109      0    stevel 
   1110      0    stevel 				time_to_wait = mi->mi_grace_wait - curtime;
   1111      0    stevel 
   1112      0    stevel 				mutex_exit(&mi->mi_lock);
   1113      0    stevel 
   1114   5583  dm120769 				delay(SEC_TO_TICK(time_to_wait));
   1115      0    stevel 
   1116      0    stevel 				curtime = gethrestime_sec();
   1117      0    stevel 
   1118      0    stevel 				mutex_enter(&mi->mi_lock);
   1119      0    stevel 
   1120      0    stevel 				if (curtime >= mi->mi_grace_wait)
   1121      0    stevel 					mi->mi_grace_wait = 0;
   1122      0    stevel 			} else {
   1123      0    stevel 				mi->mi_grace_wait = 0;
   1124      0    stevel 			}
   1125      0    stevel 		}
   1126      0    stevel 		mutex_exit(&mi->mi_lock);
   1127      0    stevel 	}
   1128      0    stevel 
   1129      0    stevel 	return (error);
   1130      0    stevel }
   1131      0    stevel 
   1132      0    stevel /*
   1133      0    stevel  * If the client received NFS4ERR_DELAY for an operation on a vnode,
   1134      0    stevel  * the client blocks here until it is time to try again.
   1135      0    stevel  *
   1136      0    stevel  * Return value:
   1137      0    stevel  * - 0 if wait was successful
   1138      0    stevel  * - EINTR if the call was interrupted
   1139      0    stevel  */
   1140      0    stevel 
   1141      0    stevel int
   1142      0    stevel nfs4_wait_for_delay(vnode_t *vp, nfs4_recov_state_t *rsp)
   1143      0    stevel {
   1144      0    stevel 	int error = 0;
   1145      0    stevel 	time_t curtime, time_to_wait;
   1146      0    stevel 	rnode4_t *rp;
   1147      0    stevel 
   1148      0    stevel 	ASSERT(vp != NULL);
   1149      0    stevel 
   1150      0    stevel 	rp = VTOR4(vp);
   1151      0    stevel 
   1152      0    stevel 	/* do a unprotected check to reduce r_statelock contention */
   1153      0    stevel 	if (rp->r_delay_wait != 0) {
   1154      0    stevel 		mutex_enter(&rp->r_statelock);
   1155      0    stevel 
   1156      0    stevel 		if (rp->r_delay_wait != 0) {
   1157      0    stevel 
   1158      0    stevel 			if (!(rsp->rs_flags & NFS4_RS_DELAY_MSG)) {
   1159      0    stevel 				rsp->rs_flags |= NFS4_RS_DELAY_MSG;
   1160      0    stevel 				nfs4_mi_kstat_inc_delay(VTOMI4(vp));
   1161      0    stevel 			}
   1162      0    stevel 
   1163      0    stevel 			curtime = gethrestime_sec();
   1164      0    stevel 
   1165      0    stevel 			if (curtime < rp->r_delay_wait) {
   1166      0    stevel 
   1167      0    stevel 				time_to_wait = rp->r_delay_wait - curtime;
   1168      0    stevel 
   1169      0    stevel 				mutex_exit(&rp->r_statelock);
   1170      0    stevel 
   1171   5583  dm120769 				delay(SEC_TO_TICK(time_to_wait));
   1172      0    stevel 
   1173      0    stevel 				curtime = gethrestime_sec();
   1174      0    stevel 
   1175      0    stevel 				mutex_enter(&rp->r_statelock);
   1176      0    stevel 
   1177      0    stevel 				if (curtime >= rp->r_delay_wait)
   1178      0    stevel 					rp->r_delay_wait = 0;
   1179      0    stevel 			} else {
   1180      0    stevel 				rp->r_delay_wait = 0;
   1181      0    stevel 			}
   1182      0    stevel 		}
   1183      0    stevel 		mutex_exit(&rp->r_statelock);
   1184      0    stevel 	}
   1185      0    stevel 
   1186      0    stevel 	return (error);
   1187      0    stevel }
   1188      0    stevel 
   1189      0    stevel /*
   1190      0    stevel  * The recovery thread.
   1191      0    stevel  */
   1192      0    stevel 
   1193      0    stevel static void
   1194      0    stevel nfs4_recov_thread(recov_info_t *recovp)
   1195      0    stevel {
   1196      0    stevel 	mntinfo4_t *mi = recovp->rc_mi;
   1197      0    stevel 	nfs4_server_t *sp;
   1198      0    stevel 	int done = 0, error = 0;
   1199      0    stevel 	bool_t recov_fail = FALSE;
   1200      0    stevel 	callb_cpr_t cpr_info;
   1201      0    stevel 	kmutex_t cpr_lock;
   1202      0    stevel 
   1203      0    stevel 	nfs4_queue_event(RE_START, mi, NULL, mi->mi_recovflags,
   1204      0    stevel 	    recovp->rc_vp1, recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE,
   1205      0    stevel 	    0, 0);
   1206      0    stevel 
   1207      0    stevel 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
   1208      0    stevel 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Recov");
   1209      0    stevel 
   1210      0    stevel 	mutex_enter(&mi->mi_lock);
   1211      0    stevel 	mi->mi_recovthread = curthread;
   1212      0    stevel 	mutex_exit(&mi->mi_lock);
   1213      0    stevel 
   1214      0    stevel 	/*
   1215      0    stevel 	 * We don't really need protection here against failover or
   1216      0    stevel 	 * migration, since the current thread is the one that would make
   1217      0    stevel 	 * any changes, but hold mi_recovlock anyway for completeness (and
   1218      0    stevel 	 * to satisfy any ASSERTs).
   1219      0    stevel 	 */
   1220      0    stevel 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
   1221      0    stevel 	sp = find_nfs4_server(mi);
   1222      0    stevel 	if (sp != NULL)
   1223      0    stevel 		mutex_exit(&sp->s_lock);
   1224      0    stevel 	nfs_rw_exit(&mi->mi_recovlock);
   1225      0    stevel 
   1226      0    stevel 	/*
   1227      0    stevel 	 * Do any necessary recovery, based on the information in recovp
   1228      0    stevel 	 * and any recovery flags.
   1229      0    stevel 	 */
   1230      0    stevel 
   1231      0    stevel 	do {
   1232      0    stevel 		mutex_enter(&mi->mi_lock);
   1233      0    stevel 		if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
   1234      0    stevel 			bool_t activesrv;
   1235      0    stevel 
   1236      0    stevel 			NFS4_DEBUG(nfs4_client_recov_debug &&
   1237      0    stevel 			    mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED, (CE_NOTE,
   1238   5302  th199096 			    "nfs4_recov_thread: file system has been "
   1239   5302  th199096 			    "unmounted"));
   1240      0    stevel 			NFS4_DEBUG(nfs4_client_recov_debug &&
   1241      0    stevel 			    zone_status_get(curproc->p_zone) >=
   1242      0    stevel 			    ZONE_IS_SHUTTING_DOWN, (CE_NOTE,
   1243   5302  th199096 			    "nfs4_recov_thread: zone shutting down"));
   1244      0    stevel 			/*
   1245      0    stevel 			 * If the server has lost its state for us and
   1246      0    stevel 			 * the filesystem is unmounted, then the filesystem
   1247      0    stevel 			 * can be tossed, even if there are lost lock or
   1248      0    stevel 			 * lost state calls in the recovery queue.
   1249      0    stevel 			 */
   1250      0    stevel 			if (mi->mi_recovflags &
   1251      0    stevel 			    (MI4R_NEED_CLIENTID | MI4R_REOPEN_FILES)) {
   1252      0    stevel 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1253      0    stevel 				"nfs4_recov_thread: bailing out"));
   1254      0    stevel 				mi->mi_flags |= MI4_RECOV_FAIL;
   1255      0    stevel 				mi->mi_error = recovp->rc_error;
   1256      0    stevel 				recov_fail = TRUE;
   1257      0    stevel 			}
   1258      0    stevel 			/*
   1259      0    stevel 			 * We don't know if the server has any state for
   1260      0    stevel 			 * us, and the filesystem has been unmounted.  If
   1261      0    stevel 			 * there are "lost state" recovery items, keep
   1262      0    stevel 			 * trying to process them until there are no more
   1263      0    stevel 			 * mounted filesystems for the server.  Otherwise,
   1264      0    stevel 			 * bail out.  The reason we don't mark the
   1265      0    stevel 			 * filesystem as failing recovery is in case we
   1266      0    stevel 			 * have to do "lost state" recovery later (e.g., a
   1267      0    stevel 			 * user process exits).
   1268      0    stevel 			 */
   1269      0    stevel 			if (!(mi->mi_recovflags & MI4R_LOST_STATE)) {
   1270    855   jwahlig 				done = 1;
   1271      0    stevel 				mutex_exit(&mi->mi_lock);
   1272      0    stevel 				break;
   1273      0    stevel 			}
   1274      0    stevel 			mutex_exit(&mi->mi_lock);
   1275      0    stevel 
   1276      0    stevel 			if (sp == NULL)
   1277      0    stevel 				activesrv = FALSE;
   1278      0    stevel 			else {
   1279      0    stevel 				mutex_enter(&sp->s_lock);
   1280      0    stevel 				activesrv = nfs4_fs_active(sp);
   1281      0    stevel 			}
   1282      0    stevel 			if (!activesrv) {
   1283      0    stevel 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1284   5302  th199096 				    "no active fs for server %p",
   1285   5302  th199096 				    (void *)sp));
   1286      0    stevel 				mutex_enter(&mi->mi_lock);
   1287      0    stevel 				mi->mi_flags |= MI4_RECOV_FAIL;
   1288      0    stevel 				mi->mi_error = recovp->rc_error;
   1289      0    stevel 				mutex_exit(&mi->mi_lock);
   1290      0    stevel 				recov_fail = TRUE;
   1291      0    stevel 				if (sp != NULL) {
   1292      0    stevel 					/*
   1293      0    stevel 					 * Mark the server instance as
   1294      0    stevel 					 * dead, so that nobody will attach
   1295      0    stevel 					 * a new filesystem.
   1296      0    stevel 					 */
   1297      0    stevel 					nfs4_mark_srv_dead(sp);
   1298      0    stevel 				}
   1299      0    stevel 			}
   1300      0    stevel 			if (sp != NULL)
   1301      0    stevel 				mutex_exit(&sp->s_lock);
   1302      0    stevel 		} else {
   1303      0    stevel 			mutex_exit(&mi->mi_lock);
   1304      0    stevel 		}
   1305      0    stevel 
   1306      0    stevel 		/*
   1307      0    stevel 		 * Check if we need to select a new server for a
   1308      0    stevel 		 * failover.  Choosing a new server will force at
   1309      0    stevel 		 * least a check of the clientid.
   1310      0    stevel 		 */
   1311      0    stevel 		mutex_enter(&mi->mi_lock);
   1312      0    stevel 		if (!recov_fail &&
   1313      0    stevel 		    (mi->mi_recovflags & MI4R_NEED_NEW_SERVER)) {
   1314      0    stevel 			mutex_exit(&mi->mi_lock);
   1315      0    stevel 			recov_newserver(recovp, &sp, &recov_fail);
   1316      0    stevel 		} else
   1317      0    stevel 			mutex_exit(&mi->mi_lock);
   1318      0    stevel 
   1319      0    stevel 		/*
   1320      0    stevel 		 * Check if we need to recover the clientid.  This
   1321      0    stevel 		 * must be done before file and lock recovery, and it
   1322      0    stevel 		 * potentially affects the recovery threads for other
   1323      0    stevel 		 * filesystems, so it gets special treatment.
   1324      0    stevel 		 */
   1325      0    stevel 		if (sp != NULL && recov_fail == FALSE) {
   1326      0    stevel 			mutex_enter(&sp->s_lock);
   1327      0    stevel 			if (!(sp->s_flags & N4S_CLIENTID_SET)) {
   1328      0    stevel 				mutex_exit(&sp->s_lock);
   1329      0    stevel 				recov_clientid(recovp, sp);
   1330      0    stevel 			} else {
   1331      0    stevel 				/*
   1332      0    stevel 				 * Unset this flag in case another recovery
   1333      0    stevel 				 * thread successfully recovered the clientid
   1334      0    stevel 				 * for us already.
   1335      0    stevel 				 */
   1336      0    stevel 				mutex_enter(&mi->mi_lock);
   1337      0    stevel 				mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
   1338      0    stevel 				mutex_exit(&mi->mi_lock);
   1339      0    stevel 				mutex_exit(&sp->s_lock);
   1340      0    stevel 			}
   1341      0    stevel 		}
   1342      0    stevel 
   1343      0    stevel 		/*
   1344      0    stevel 		 * Check if we need to get the security information.
   1345      0    stevel 		 */
   1346      0    stevel 		mutex_enter(&mi->mi_lock);
   1347      0    stevel 		if ((mi->mi_recovflags & MI4R_NEED_SECINFO) &&
   1348      0    stevel 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1349      0    stevel 			mutex_exit(&mi->mi_lock);
   1350      0    stevel 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
   1351   5302  th199096 			    RW_WRITER, 0);
   1352      0    stevel 			error = nfs4_secinfo_recov(recovp->rc_mi,
   1353   5302  th199096 			    recovp->rc_vp1, recovp->rc_vp2);
   1354      0    stevel 			/*
   1355      0    stevel 			 * If error, nothing more can be done, stop
   1356      0    stevel 			 * the recovery.
   1357      0    stevel 			 */
   1358      0    stevel 			if (error) {
   1359      0    stevel 				mutex_enter(&mi->mi_lock);
   1360      0    stevel 				mi->mi_flags |= MI4_RECOV_FAIL;
   1361      0    stevel 				mi->mi_error = recovp->rc_error;
   1362      0    stevel 				mutex_exit(&mi->mi_lock);
   1363      0    stevel 				nfs4_queue_event(RE_WRONGSEC, mi, NULL,
   1364      0    stevel 				    error, recovp->rc_vp1, recovp->rc_vp2,
   1365      0    stevel 				    0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1366      0    stevel 			}
   1367      0    stevel 			nfs_rw_exit(&mi->mi_recovlock);
   1368      0    stevel 		} else
   1369      0    stevel 			mutex_exit(&mi->mi_lock);
   1370      0    stevel 
   1371      0    stevel 		/*
   1372      0    stevel 		 * Check if there's a bad seqid to recover.
   1373      0    stevel 		 */
   1374      0    stevel 		mutex_enter(&mi->mi_lock);
   1375      0    stevel 		if ((mi->mi_recovflags & MI4R_BAD_SEQID) &&
   1376      0    stevel 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1377      0    stevel 			mutex_exit(&mi->mi_lock);
   1378      0    stevel 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
   1379   5302  th199096 			    RW_WRITER, 0);
   1380      0    stevel 			recov_bad_seqid(recovp);
   1381      0    stevel 			nfs_rw_exit(&mi->mi_recovlock);
   1382      0    stevel 		} else
   1383      0    stevel 			mutex_exit(&mi->mi_lock);
   1384      0    stevel 
   1385      0    stevel 		/*
   1386      0    stevel 		 * Next check for recovery that affects the entire
   1387      0    stevel 		 * filesystem.
   1388      0    stevel 		 */
   1389      0    stevel 		if (sp != NULL) {
   1390      0    stevel 			mutex_enter(&mi->mi_lock);
   1391      0    stevel 			if ((mi->mi_recovflags & MI4R_REOPEN_FILES) &&
   1392      0    stevel 			    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1393      0    stevel 				mutex_exit(&mi->mi_lock);
   1394      0    stevel 				recov_openfiles(recovp, sp);
   1395      0    stevel 			} else
   1396      0    stevel 				mutex_exit(&mi->mi_lock);
   1397      0    stevel 		}
   1398      0    stevel 
   1399      0    stevel 		/*
   1400      0    stevel 		 * Send any queued state recovery requests.
   1401      0    stevel 		 */
   1402      0    stevel 		mutex_enter(&mi->mi_lock);
   1403      0    stevel 		if (sp != NULL &&
   1404      0    stevel 		    (mi->mi_recovflags & MI4R_LOST_STATE) &&
   1405      0    stevel 		    !(mi->mi_flags & MI4_RECOV_FAIL)) {
   1406      0    stevel 			mutex_exit(&mi->mi_lock);
   1407      0    stevel 			(void) nfs_rw_enter_sig(&mi->mi_recovlock,
   1408   5302  th199096 			    RW_WRITER, 0);
   1409      0    stevel 			nfs4_resend_lost_rqsts(recovp, sp);
   1410      0    stevel 			if (list_head(&mi->mi_lost_state) == NULL) {
   1411      0    stevel 				/* done */
   1412      0    stevel 				mutex_enter(&mi->mi_lock);
   1413      0    stevel 				mi->mi_recovflags &= ~MI4R_LOST_STATE;
   1414      0    stevel 				mutex_exit(&mi->mi_lock);
   1415      0    stevel 			}
   1416      0    stevel 			nfs_rw_exit(&mi->mi_recovlock);
   1417      0    stevel 		} else {
   1418      0    stevel 			mutex_exit(&mi->mi_lock);
   1419      0    stevel 		}
   1420      0    stevel 
   1421      0    stevel 		/*
   1422      0    stevel 		 * See if there is anything more to do.  If not, announce
   1423      0    stevel 		 * that we are done and exit.
   1424      0    stevel 		 *
   1425      0    stevel 		 * Need mi_recovlock to keep 'sp' valid.  Must grab
   1426      0    stevel 		 * mi_recovlock before mi_lock to preserve lock ordering.
   1427      0    stevel 		 */
   1428      0    stevel 		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
   1429      0    stevel 		mutex_enter(&mi->mi_lock);
   1430      0    stevel 		if ((mi->mi_recovflags & ~MI4R_SRV_REBOOT) == 0 ||
   1431      0    stevel 		    (mi->mi_flags & MI4_RECOV_FAIL)) {
   1432      0    stevel 			list_t local_lost_state;
   1433      0    stevel 			nfs4_lost_rqst_t *lrp;
   1434      0    stevel 
   1435      0    stevel 			/*
   1436      0    stevel 			 * We need to remove the lost requests before we
   1437      0    stevel 			 * unmark the mi as no longer doing recovery to
   1438      0    stevel 			 * avoid a race with a new thread putting new lost
   1439      0    stevel 			 * requests on the same mi (and the going away
   1440      0    stevel 			 * thread would remove the new lost requests).
   1441      0    stevel 			 *
   1442      0    stevel 			 * Move the lost requests to a local list since
   1443      0    stevel 			 * nfs4_remove_lost_rqst() drops mi_lock, and
   1444      0    stevel 			 * dropping the mi_lock would make our check to
   1445      0    stevel 			 * see if recovery is done no longer valid.
   1446      0    stevel 			 */
   1447      0    stevel 			list_create(&local_lost_state,
   1448      0    stevel 			    sizeof (nfs4_lost_rqst_t),
   1449      0    stevel 			    offsetof(nfs4_lost_rqst_t, lr_node));
   1450      0    stevel 			list_move_tail(&local_lost_state, &mi->mi_lost_state);
   1451      0    stevel 
   1452      0    stevel 			done = 1;
   1453      0    stevel 			mutex_exit(&mi->mi_lock);
   1454      0    stevel 			/*
   1455      0    stevel 			 * Now officially free the "moved"
   1456      0    stevel 			 * lost requests.
   1457      0    stevel 			 */
   1458      0    stevel 			while ((lrp = list_head(&local_lost_state)) != NULL) {
   1459      0    stevel 				list_remove(&local_lost_state, lrp);
   1460      0    stevel 				nfs4_free_lost_rqst(lrp, sp);
   1461      0    stevel 			}
   1462      0    stevel 			list_destroy(&local_lost_state);
   1463      0    stevel 		} else
   1464      0    stevel 			mutex_exit(&mi->mi_lock);
   1465      0    stevel 		nfs_rw_exit(&mi->mi_recovlock);
   1466      0    stevel 
   1467      0    stevel 		/*
   1468      0    stevel 		 * If the filesystem has been forcibly unmounted, there is
   1469      0    stevel 		 * probably no point in retrying immediately.  Furthermore,
   1470      0    stevel 		 * there might be user processes waiting for a chance to
   1471      0    stevel 		 * queue up "lost state" requests, so that they can exit.
   1472      0    stevel 		 * So pause here for a moment.  Same logic for zone shutdown.
   1473      0    stevel 		 */
   1474      0    stevel 		if (!done && FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
   1475      0    stevel 			mutex_enter(&mi->mi_lock);
   1476      0    stevel 			cv_broadcast(&mi->mi_failover_cv);
   1477      0    stevel 			mutex_exit(&mi->mi_lock);
   1478      0    stevel 			delay(SEC_TO_TICK(nfs4_unmount_delay));
   1479      0    stevel 		}
   1480      0    stevel 
   1481      0    stevel 	} while (!done);
   1482      0    stevel 
   1483      0    stevel 	if (sp != NULL)
   1484      0    stevel 		nfs4_server_rele(sp);
   1485      0    stevel 
   1486      0    stevel 	/*
   1487      0    stevel 	 * Return all recalled delegations
   1488      0    stevel 	 */
   1489      0    stevel 	nfs4_dlistclean();
   1490      0    stevel 
   1491    855   jwahlig 	mutex_enter(&mi->mi_lock);
   1492    855   jwahlig 	recov_done(mi, recovp);
   1493    855   jwahlig 	mutex_exit(&mi->mi_lock);
   1494    855   jwahlig 
   1495      0    stevel 	/*
   1496      0    stevel 	 * Free up resources that were allocated for us.
   1497      0    stevel 	 */
   1498      0    stevel 	if (recovp->rc_vp1 != NULL)
   1499      0    stevel 		VN_RELE(recovp->rc_vp1);
   1500      0    stevel 	if (recovp->rc_vp2 != NULL)
   1501      0    stevel 		VN_RELE(recovp->rc_vp2);
   1502   1126   jwahlig 
   1503    855   jwahlig 	/* now we are done using the mi struct, signal the waiters */
   1504    855   jwahlig 	mutex_enter(&mi->mi_lock);
   1505    855   jwahlig 	mi->mi_in_recovery--;
   1506    855   jwahlig 	if (mi->mi_in_recovery == 0)
   1507    855   jwahlig 		cv_broadcast(&mi->mi_cv_in_recov);
   1508    855   jwahlig 	mutex_exit(&mi->mi_lock);
   1509   1126   jwahlig 
   1510   1705   jwahlig 	VFS_RELE(mi->mi_vfsp);
   1511   1705   jwahlig 	MI4_RELE(mi);
   1512      0    stevel 	kmem_free(recovp, sizeof (recov_info_t));
   1513      0    stevel 	mutex_enter(&cpr_lock);
   1514      0    stevel 	CALLB_CPR_EXIT(&cpr_info);
   1515      0    stevel 	mutex_destroy(&cpr_lock);
   1516      0    stevel 	zthread_exit();
   1517      0    stevel }
   1518      0    stevel 
   1519      0    stevel /*
   1520      0    stevel  * Log the end of recovery and notify any waiting threads.
   1521      0    stevel  */
   1522      0    stevel 
   1523      0    stevel static void
   1524      0    stevel recov_done(mntinfo4_t *mi, recov_info_t *recovp)
   1525      0    stevel {
   1526      0    stevel 
   1527      0    stevel 	ASSERT(MUTEX_HELD(&mi->mi_lock));
   1528      0    stevel 
   1529      0    stevel 	nfs4_queue_event(RE_END, mi, NULL, 0, recovp->rc_vp1,
   1530   5302  th199096 	    recovp->rc_vp2, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1531      0    stevel 	mi->mi_recovthread = NULL;
   1532      0    stevel 	mi->mi_flags &= ~MI4_RECOV_ACTIV;
   1533      0    stevel 	mi->mi_recovflags &= ~MI4R_SRV_REBOOT;
   1534      0    stevel 	cv_broadcast(&mi->mi_failover_cv);
   1535      0    stevel }
   1536      0    stevel 
   1537      0    stevel /*
   1538      0    stevel  * State-specific recovery routines, by state.
   1539      0    stevel  */
   1540      0    stevel 
   1541      0    stevel /*
   1542      0    stevel  * Failover.
   1543      0    stevel  *
   1544      0    stevel  * Replaces *spp with a reference to the new server, which must
   1545      0    stevel  * eventually be freed.
   1546      0    stevel  */
   1547      0    stevel 
   1548      0    stevel static void
   1549      0    stevel recov_newserver(recov_info_t *recovp, nfs4_server_t **spp, bool_t *recov_fail)
   1550      0    stevel {
   1551      0    stevel 	mntinfo4_t *mi = recovp->rc_mi;
   1552      0    stevel 	servinfo4_t *svp = NULL;
   1553      0    stevel 	nfs4_server_t *osp = *spp;
   1554      0    stevel 	CLIENT *cl;
   1555      0    stevel 	enum clnt_stat status;
   1556      0    stevel 	struct timeval tv;
   1557      0    stevel 	int error;
   1558      0    stevel 	int oncethru = 0;
   1559      0    stevel 	rnode4_t *rp;
   1560      0    stevel 	int index;
   1561      0    stevel 	nfs_fh4 fh;
   1562      0    stevel 	char *snames;
   1563      0    stevel 	size_t len;
   1564      0    stevel 
   1565      0    stevel 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
   1566      0    stevel 
   1567      0    stevel 	tv.tv_sec = 2;
   1568      0    stevel 	tv.tv_usec = 0;
   1569      0    stevel 
   1570      0    stevel #ifdef lint
   1571      0    stevel 	/*
   1572      0    stevel 	 * Lint can't follow the logic, so thinks that snames and len
   1573      0    stevel 	 * can be used before being set.  They can't, but lint can't
   1574      0    stevel 	 * figure it out.  To address the lint warning, initialize
   1575      0    stevel 	 * snames and len for lint.
   1576      0    stevel 	 */
   1577      0    stevel 	snames = NULL;
   1578      0    stevel 	len = 0;
   1579      0    stevel #endif
   1580      0    stevel 
   1581      0    stevel 	/*
   1582      0    stevel 	 * Ping the null NFS procedure of every server in
   1583      0    stevel 	 * the list until one responds.  We always start
   1584      0    stevel 	 * at the head of the list and always skip the one
   1585      0    stevel 	 * that is current, since it's caused us a problem.
   1586      0    stevel 	 */
   1587      0    stevel 	while (svp == NULL) {
   1588      0    stevel 		for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
   1589      0    stevel 
   1590      0    stevel 			mutex_enter(&mi->mi_lock);
   1591      0    stevel 			if (FS_OR_ZONE_GONE4(mi->mi_vfsp)) {
   1592      0    stevel 				mi->mi_flags |= MI4_RECOV_FAIL;
   1593      0    stevel 				mutex_exit(&mi->mi_lock);
   1594      0    stevel 				(void) nfs_rw_exit(&mi->mi_recovlock);
   1595      0    stevel 				*recov_fail = TRUE;
   1596      0    stevel 				if (oncethru)
   1597      0    stevel 					kmem_free(snames, len);
   1598      0    stevel 				return;
   1599      0    stevel 			}
   1600      0    stevel 			mutex_exit(&mi->mi_lock);
   1601      0    stevel 
   1602      0    stevel 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1603      0    stevel 			if (svp->sv_flags & SV4_NOTINUSE) {
   1604      0    stevel 				nfs_rw_exit(&svp->sv_lock);
   1605      0    stevel 				continue;
   1606      0    stevel 			}
   1607      0    stevel 			nfs_rw_exit(&svp->sv_lock);
   1608      0    stevel 
   1609      0    stevel 			if (!oncethru && svp == mi->mi_curr_serv)
   1610      0    stevel 				continue;
   1611      0    stevel 
   1612      0    stevel 			error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
   1613      0    stevel 			    NFS_PROGRAM, NFS_V4, 0, 1, CRED(), &cl);
   1614      0    stevel 			if (error)
   1615      0    stevel 				continue;
   1616      0    stevel 
   1617      0    stevel 			if (!(mi->mi_flags & MI4_INT))
   1618      0    stevel 				cl->cl_nosignal = TRUE;
   1619      0    stevel 			status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
   1620      0    stevel 			    xdr_void, NULL, tv);
   1621      0    stevel 			if (!(mi->mi_flags & MI4_INT))
   1622      0    stevel 				cl->cl_nosignal = FALSE;
   1623      0    stevel 			AUTH_DESTROY(cl->cl_auth);
   1624      0    stevel 			CLNT_DESTROY(cl);
   1625      0    stevel 			if (status == RPC_SUCCESS) {
   1626      0    stevel 				nfs4_queue_event(RE_FAILOVER, mi,
   1627      0    stevel 				    svp == mi->mi_curr_serv ? NULL :
   1628      0    stevel 				    svp->sv_hostname, 0, NULL, NULL, 0,
   1629      0    stevel 				    NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1630      0    stevel 				break;
   1631      0    stevel 			}
   1632      0    stevel 		}
   1633      0    stevel 
   1634      0    stevel 		if (svp == NULL) {
   1635      0    stevel 			if (!oncethru) {
   1636      0    stevel 				snames = nfs4_getsrvnames(mi, &len);
   1637      0    stevel 				nfs4_queue_fact(RF_SRVS_NOT_RESPOND, mi,
   1638      0    stevel 				    0, 0, 0, FALSE, snames, 0, NULL);
   1639      0    stevel 				oncethru = 1;
   1640      0    stevel 			}
   1641      0    stevel 			delay(hz);
   1642      0    stevel 		}
   1643      0    stevel 	}
   1644      0    stevel 
   1645      0    stevel 	if (oncethru) {
   1646      0    stevel 		nfs4_queue_fact(RF_SRVS_OK, mi, 0, 0, 0, FALSE, snames,
   1647      0    stevel 		    0, NULL);
   1648      0    stevel 		kmem_free(snames, len);
   1649      0    stevel 	}
   1650      0    stevel 
   1651      0    stevel #if DEBUG
   1652      0    stevel 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1653      0    stevel 	ASSERT((svp->sv_flags & SV4_NOTINUSE) == 0);
   1654      0    stevel 	nfs_rw_exit(&svp->sv_lock);
   1655      0    stevel #endif
   1656      0    stevel 
   1657      0    stevel 	mutex_enter(&mi->mi_lock);
   1658      0    stevel 	mi->mi_recovflags &= ~MI4R_NEED_NEW_SERVER;
   1659      0    stevel 	if (svp != mi->mi_curr_serv) {
   1660      0    stevel 		servinfo4_t *osvp = mi->mi_curr_serv;
   1661      0    stevel 
   1662      0    stevel 		mutex_exit(&mi->mi_lock);
   1663      0    stevel 
   1664      0    stevel 		/*
   1665      0    stevel 		 * Update server-dependent fields in the root vnode.
   1666      0    stevel 		 */
   1667      0    stevel 		index = rtable4hash(mi->mi_rootfh);
   1668      0    stevel 		rw_enter(&rtable4[index].r_lock, RW_WRITER);
   1669      0    stevel 
   1670      0    stevel 		rp = r4find(&rtable4[index], mi->mi_rootfh, mi->mi_vfsp);
   1671      0    stevel 		if (rp != NULL) {
   1672      0    stevel 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1673      0    stevel 			    "recov_newserver: remapping %s", rnode4info(rp)));
   1674      0    stevel 			mutex_enter(&rp->r_statelock);
   1675      0    stevel 			rp->r_server = svp;
   1676      0    stevel 			PURGE_ATTRCACHE4_LOCKED(rp);
   1677      0    stevel 			mutex_exit(&rp->r_statelock);
   1678      0    stevel 			(void) nfs4_free_data_reclaim(rp);
   1679      0    stevel 			nfs4_purge_rddir_cache(RTOV4(rp));
   1680      0    stevel 			rw_exit(&rtable4[index].r_lock);
   1681      0    stevel 			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
   1682      0    stevel 			    "recov_newserver: done with %s",
   1683      0    stevel 			    rnode4info(rp)));
   1684      0    stevel 			VN_RELE(RTOV4(rp));
   1685      0    stevel 		} else
   1686      0    stevel 			rw_exit(&rtable4[index].r_lock);
   1687      0    stevel 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
   1688      0    stevel 
   1689      0    stevel 		mutex_enter(&mi->mi_lock);
   1690      0    stevel 		mi->mi_recovflags |= MI4R_REOPEN_FILES | MI4R_REMAP_FILES;
   1691      0    stevel 		if (recovp->rc_srv_reboot)
   1692      0    stevel 			mi->mi_recovflags |= MI4R_SRV_REBOOT;
   1693      0    stevel 		mi->mi_curr_serv = svp;
   1694      0    stevel 		mi->mi_failover++;
   1695      0    stevel 		mi->mi_flags &= ~MI4_BADOWNER_DEBUG;
   1696      0    stevel 		mutex_exit(&mi->mi_lock);
   1697      0    stevel 
   1698      0    stevel 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
   1699      0    stevel 		fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
   1700      0    stevel 		fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
   1701      0    stevel 		sfh4_update(mi->mi_rootfh, &fh);
   1702      0    stevel 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
   1703      0    stevel 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
   1704      0    stevel 		sfh4_update(mi->mi_srvparentfh, &fh);
   1705      0    stevel 		nfs_rw_exit(&svp->sv_lock);
   1706      0    stevel 
   1707      0    stevel 		*spp = nfs4_move_mi(mi, osvp, svp);
   1708      0    stevel 		if (osp != NULL)
   1709      0    stevel 			nfs4_server_rele(osp);
   1710      0    stevel 	} else
   1711      0    stevel 		mutex_exit(&mi->mi_lock);
   1712      0    stevel 	(void) nfs_rw_exit(&mi->mi_recovlock);
   1713      0    stevel }
   1714      0    stevel 
   1715      0    stevel /*
   1716      0    stevel  * Clientid.
   1717      0    stevel  */
   1718      0    stevel 
   1719      0    stevel static void
   1720      0    stevel recov_clientid(recov_info_t *recovp, nfs4_server_t *sp)
   1721      0    stevel {
   1722      0    stevel 	mntinfo4_t *mi = recovp->rc_mi;
   1723      0    stevel 	int error = 0;
   1724      0    stevel 	int still_stale;
   1725      0    stevel 	int need_new_s;
   1726      0    stevel 
   1727      0    stevel 	ASSERT(sp != NULL);
   1728      0    stevel 
   1729      0    stevel 	/*
   1730      0    stevel 	 * Acquire the recovery lock and then verify that the clientid
   1731      0    stevel 	 * still needs to be recovered.  (Note that s_recovlock is supposed
   1732      0    stevel 	 * to be acquired before s_lock.)  Since the thread holds the
   1733      0    stevel 	 * recovery lock, no other thread will recover the clientid.
   1734      0    stevel 	 */
   1735      0    stevel 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_WRITER, 0);
   1736      0    stevel 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
   1737      0    stevel 	mutex_enter(&sp->s_lock);
   1738      0    stevel 	still_stale = ((sp->s_flags & N4S_CLIENTID_SET) == 0);
   1739      0    stevel 	mutex_exit(&sp->s_lock);
   1740      0    stevel 
   1741      0    stevel 	if (still_stale) {
   1742      0    stevel 		nfs4_error_t n4e;
   1743      0    stevel 
   1744      0    stevel 		nfs4_error_zinit(&n4e);
   1745      0    stevel 		nfs4setclientid(mi, kcred, TRUE, &n4e);
   1746      0    stevel 		error = n4e.error;
   1747      0    stevel 		if (error != 0) {
   1748      0    stevel 
   1749      0    stevel 			/*
   1750      0    stevel 			 * nfs4setclientid may have set MI4R_NEED_NEW_SERVER,
   1751      0    stevel 			 * if so, just return and let recov_thread drive
   1752      0    stevel 			 * failover.
   1753      0    stevel 			 */
   1754      0    stevel 			mutex_enter(&mi->mi_lock);
   1755      0    stevel 			need_new_s = mi->mi_recovflags & MI4R_NEED_NEW_SERVER;
   1756      0    stevel 			mutex_exit(&mi->mi_lock);
   1757      0    stevel 
   1758      0    stevel 			if (need_new_s) {
   1759      0    stevel 				nfs_rw_exit(&mi->mi_recovlock);
   1760      0    stevel 				nfs_rw_exit(&sp->s_recovlock);
   1761      0    stevel 				return;
   1762      0    stevel 			}
   1763      0    stevel 
   1764      0    stevel 			nfs4_queue_event(RE_CLIENTID, mi, NULL, n4e.error, NULL,
   1765      0    stevel 			    NULL, n4e.stat, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1766      0    stevel 			mutex_enter(&mi->mi_lock);
   1767      0    stevel 			mi->mi_flags |= MI4_RECOV_FAIL;
   1768      0    stevel 			mi->mi_error = recovp->rc_error;
   1769      0    stevel 			mutex_exit(&mi->mi_lock);
   1770      0    stevel 			/* don't destroy the nfs4_server, let umount do it */
   1771      0    stevel 		}
   1772      0    stevel 	}
   1773      0    stevel 
   1774      0    stevel 	if (error == 0) {
   1775      0    stevel 		mutex_enter(&mi->mi_lock);
   1776      0    stevel 		mi->mi_recovflags &= ~MI4R_NEED_CLIENTID;
   1777      0    stevel 		/*
   1778      0    stevel 		 * If still_stale isn't true, then another thread already
   1779      0    stevel 		 * recovered the clientid.  And that thread that set the
   1780      0    stevel 		 * clientid will have initiated reopening files on all the
   1781      0    stevel 		 * filesystems for the server, so we should not initiate
   1782      0    stevel 		 * reopening for this filesystem here.
   1783      0    stevel 		 */
   1784      0    stevel 		if (still_stale) {
   1785      0    stevel 			mi->mi_recovflags |= MI4R_REOPEN_FILES;
   1786      0    stevel 			if (recovp->rc_srv_reboot)
   1787      0    stevel 				mi->mi_recovflags |= MI4R_SRV_REBOOT;
   1788      0    stevel 		}
   1789      0    stevel 		mutex_exit(&mi->mi_lock);
   1790      0    stevel 	}
   1791      0    stevel 
   1792      0    stevel 	nfs_rw_exit(&mi->mi_recovlock);
   1793      0    stevel 
   1794      0    stevel 	if (error != 0) {
   1795      0    stevel 		nfs_rw_exit(&sp->s_recovlock);
   1796      0    stevel 		mutex_enter(&mi->mi_lock);
   1797      0    stevel 		if ((mi->mi_flags & MI4_RECOV_FAIL) == 0)
   1798      0    stevel 			delay(SEC_TO_TICK(recov_err_delay));
   1799      0    stevel 		mutex_exit(&mi->mi_lock);
   1800      0    stevel 	} else {
   1801      0    stevel 		mntinfo4_t **milist;
   1802      0    stevel 		mntinfo4_t *tmi;
   1803      0    stevel 		int nummi, i;
   1804      0    stevel 
   1805      0    stevel 		/*
   1806      0    stevel 		 * Initiate recovery of open files for other filesystems.
   1807      0    stevel 		 * We create an array of filesystems, rather than just
   1808      0    stevel 		 * walking the filesystem list, to avoid deadlock issues
   1809      0    stevel 		 * with s_lock and mi_recovlock.
   1810      0    stevel 		 */
   1811      0    stevel 		milist = make_milist(sp, &nummi);
   1812      0    stevel 		for (i = 0; i < nummi; i++) {
   1813      0    stevel 			tmi = milist[i];
   1814      0    stevel 			if (tmi != mi) {
   1815      0    stevel 				(void) nfs_rw_enter_sig(&tmi->mi_recovlock,
   1816   5302  th199096 				    RW_READER, 0);
   1817      0    stevel 				start_recovery_action(NR_OPENFILES, TRUE, tmi,
   1818   5302  th199096 				    NULL, NULL);
   1819      0    stevel 				nfs_rw_exit(&tmi->mi_recovlock);
   1820      0    stevel 			}
   1821      0    stevel 		}
   1822      0    stevel 		free_milist(milist, nummi);
   1823      0    stevel 
   1824      0    stevel 		nfs_rw_exit(&sp->s_recovlock);
   1825      0    stevel 	}
   1826      0    stevel }
   1827      0    stevel 
   1828      0    stevel /*
   1829      0    stevel  * Return an array of filesystems associated with the given server.  The
   1830      0    stevel  * caller should call free_milist() to free the references and memory.
   1831      0    stevel  */
   1832      0    stevel 
   1833      0    stevel static mntinfo4_t **
   1834      0    stevel make_milist(nfs4_server_t *sp, int *nummip)
   1835      0    stevel {
   1836      0    stevel 	int nummi, i;
   1837      0    stevel 	mntinfo4_t **milist;
   1838      0    stevel 	mntinfo4_t *tmi;
   1839      0    stevel 
   1840      0    stevel 	mutex_enter(&sp->s_lock);
   1841      0    stevel 	nummi = 0;
   1842      0    stevel 	for (tmi = sp->mntinfo4_list; tmi != NULL; tmi = tmi->mi_clientid_next)
   1843      0    stevel 		nummi++;
   1844      0    stevel 
   1845   4254   jwahlig 	milist = kmem_alloc(nummi * sizeof (mntinfo4_t *), KM_SLEEP);
   1846      0    stevel 
   1847      0    stevel 	for (i = 0, tmi = sp->mntinfo4_list; tmi != NULL; i++,
   1848      0    stevel 	    tmi = tmi->mi_clientid_next) {
   1849      0    stevel 		milist[i] = tmi;
   1850      0    stevel 		VFS_HOLD(tmi->mi_vfsp);
   1851      0    stevel 	}
   1852      0    stevel 	mutex_exit(&sp->s_lock);
   1853      0    stevel 
   1854      0    stevel 	*nummip = nummi;
   1855      0    stevel 	return (milist);
   1856      0    stevel }
   1857      0    stevel 
   1858      0    stevel /*
   1859      0    stevel  * Free the filesystem list created by make_milist().
   1860      0    stevel  */
   1861      0    stevel 
   1862      0    stevel static void
   1863      0    stevel free_milist(mntinfo4_t **milist, int nummi)
   1864      0    stevel {
   1865      0    stevel 	mntinfo4_t *tmi;
   1866      0    stevel 	int i;
   1867      0    stevel 
   1868      0    stevel 	for (i = 0; i < nummi; i++) {
   1869      0    stevel 		tmi = milist[i];
   1870      0    stevel 		VFS_RELE(tmi->mi_vfsp);
   1871      0    stevel 	}
   1872      0    stevel 	kmem_free(milist, nummi * sizeof (mntinfo4_t *));
   1873      0    stevel }
   1874      0    stevel 
   1875      0    stevel /*
   1876      0    stevel  * Filehandle
   1877      0    stevel  */
   1878      0    stevel 
   1879      0    stevel /*
   1880      0    stevel  * Lookup the filehandle for the given vnode and update the rnode if it has
   1881      0    stevel  * changed.
   1882      0    stevel  *
   1883      0    stevel  * Errors:
   1884      0    stevel  * - if the filehandle could not be updated because of an error that
   1885      0    stevel  *   requires further recovery, initiate that recovery and return.
   1886      0    stevel  * - if the filehandle could not be updated because of a signal, pretend we
   1887      0    stevel  *   succeeded and let someone else deal with it.
   1888      0    stevel  * - if the filehandle could not be updated and the filesystem has been
   1889      0    stevel  *   forcibly unmounted, pretend we succeeded, and let the caller deal with
   1890      0    stevel  *   the forced unmount (to retry or not to retry, that is the question).
   1891      0    stevel  * - if the filehandle could not be updated because of some other error,
   1892      0    stevel  *   mark the rnode bad and return.
   1893      0    stevel  */
   1894      0    stevel static void
   1895      0    stevel recov_filehandle(nfs4_recov_t action, mntinfo4_t *mi, vnode_t *vp)
   1896      0    stevel {
   1897      0    stevel 	rnode4_t *rp = VTOR4(vp);
   1898      0    stevel 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   1899      0    stevel 	bool_t needrecov;
   1900      0    stevel 
   1901      0    stevel 	mutex_enter(&rp->r_statelock);
   1902      0    stevel 
   1903      0    stevel 	if (rp->r_flags & R4RECOVERR) {
   1904      0    stevel 		mutex_exit(&rp->r_statelock);
   1905      0    stevel 		return;
   1906      0    stevel 	}
   1907      0    stevel 
   1908      0    stevel 	/*
   1909      0    stevel 	 * If someone else is updating the filehandle, wait for them to
   1910      0    stevel 	 * finish and then let our caller retry.
   1911      0    stevel 	 */
   1912      0    stevel 	if (rp->r_flags & R4RECEXPFH) {
   1913      0    stevel 		while (rp->r_flags & R4RECEXPFH) {
   1914      0    stevel 			cv_wait(&rp->r_cv, &rp->r_statelock);
   1915      0    stevel 		}
   1916      0    stevel 		mutex_exit(&rp->r_statelock);
   1917      0    stevel 		return;
   1918      0    stevel 	}
   1919      0    stevel 	rp->r_flags |= R4RECEXPFH;
   1920      0    stevel 	mutex_exit(&rp->r_statelock);
   1921      0    stevel 
   1922      0    stevel 	if (action == NR_BADHANDLE) {
   1923      0    stevel 		/* shouldn't happen */
   1924      0    stevel 		nfs4_queue_event(RE_BADHANDLE, mi, NULL, 0,
   1925      0    stevel 		    vp, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
   1926      0    stevel 	}
   1927      0    stevel 
   1928      0    stevel 	nfs4_remap_file(mi, vp, 0, &e);
   1929      0    stevel 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
   1930      0    stevel 
   1931      0    stevel 	/*
   1932      0    stevel 	 * If we get BADHANDLE or FHEXPIRED in their handler, something is
   1933      0    stevel 	 * broken.  Don't try to recover, just mark the file dead.
   1934      0    stevel 	 */
   1935      0    stevel 	if (needrecov && e.error == 0 &&
   1936      0    stevel 	    (e.stat == NFS4ERR_BADHANDLE || e.stat == NFS4ERR_FHEXPIRED))
   1937      0    stevel 		needrecov = FALSE;
   1938      0    stevel 	if (needrecov) {
   1939      0    stevel 		(void) nfs4_start_recovery(&e, mi, vp,
   1940   5302  th199096 		    NULL, NULL, NULL, OP_LOOKUP, NULL);
   1941      0    stevel 	} else if (e.error != EINTR &&
   1942      0    stevel 	    !NFS4_FRC_UNMT_ERR(e.error, mi->mi_vfsp) &&
   1943      0    stevel 	    (e.error != 0 || e.stat != NFS4_OK)) {
   1944      0    stevel 		nfs4_recov_fh_fail(vp, e.error, e.stat);
   1945      0    stevel 		/*
   1946      0    stevel 		 * Don't set r_error to ESTALE.  Higher-level code (e.g.,
   1947      0    stevel 		 * cstatat_getvp()) retries on ESTALE, which would cause
   1948      0    stevel 		 * an infinite loop.
   1949      0    stevel 		 */
   1950      0    stevel 	}
   1951      0    stevel 
   1952      0    stevel 	mutex_enter(&rp->r_statelock);
   1953      0    stevel 	rp->r_flags &= ~R4RECEXPFH;
   1954      0    stevel 	cv_broadcast(&rp->r_cv);
   1955      0    stevel 	mutex_exit(&rp->r_statelock);
   1956      0    stevel }
   1957      0    stevel 
   1958      0    stevel /*
   1959      0    stevel  * Stale Filehandle
   1960      0    stevel  */
   1961      0    stevel 
   1962      0    stevel /*
   1963      0    stevel  * A stale filehandle can happen when an individual file has
   1964      0    stevel  * been removed, or when an entire filesystem has been taken
   1965      0    stevel  * offline.  To distinguish these cases, we do this:
   1966      0    stevel  * - if a GETATTR with the current filehandle is okay, we do
   1967      0    stevel  *   nothing (this can happen with two-filehandle ops)
   1968      0    stevel  * - if the GETATTR fails, but a GETATTR of the root filehandle
   1969      0    stevel  *   succeeds, mark the rnode with R4STALE, which will stop use
   1970      0    stevel  * - if the GETATTR fails, and a GETATTR of the root filehandle
   1971      0    stevel  *   also fails, we consider the problem filesystem-wide, so:
   1972      0    stevel  *   - if we can failover, we should
   1973      0    stevel  *   - if we can't failover, we should mark both the original
   1974      0    stevel  *     vnode and the root bad
   1975      0    stevel  */
   1976      0    stevel static void
   1977      0    stevel recov_stale(mntinfo4_t *mi, vnode_t *vp)
   1978      0    stevel {
   1979      0    stevel 	rnode4_t *rp = VTOR4(vp);
   1980      0    stevel 	vnode_t *rootvp = NULL;
   1981      0    stevel 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   1982      0    stevel 	nfs4_ga_res_t gar;
   1983      0    stevel 	char *fail_msg = "failed to recover from NFS4ERR_STALE";
   1984      0    stevel 	bool_t needrecov;
   1985      0    stevel 
   1986      0    stevel 	mutex_enter(&rp->r_statelock);
   1987      0    stevel 
   1988      0    stevel 	if (rp->r_flags & R4RECOVERR) {
   1989      0    stevel 		mutex_exit(&rp->r_statelock);
   1990      0    stevel 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1991      0    stevel 		    "recov_stale: already marked dead, rp %s",
   1992      0    stevel 		    rnode4info(rp)));
   1993      0    stevel 		return;
   1994      0    stevel 	}
   1995      0    stevel 
   1996      0    stevel 	if (rp->r_flags & R4STALE) {
   1997      0    stevel 		mutex_exit(&rp->r_statelock);
   1998      0    stevel 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   1999      0    stevel 		    "recov_stale: already marked stale, rp %s",
   2000      0    stevel 		    rnode4info(rp)));
   2001      0    stevel 		return;
   2002      0    stevel 	}
   2003      0    stevel 
   2004      0    stevel 	mutex_exit(&rp->r_statelock);
   2005      0    stevel 
   2006      0    stevel 	/* Try a GETATTR on this vnode */
   2007      0    stevel 	nfs4_getattr_otw_norecovery(vp, &gar, &e, CRED(), 0);
   2008      0    stevel 
   2009      0    stevel 	/*
   2010      0    stevel 	 * Handle non-STALE recoverable errors
   2011      0    stevel 	 */
   2012      0    stevel 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
   2013      0    stevel 	if (needrecov && (e.error != 0 || e.stat != NFS4ERR_STALE)) {
   2014      0    stevel 		(void) nfs4_start_recovery(&e, mi, vp,
   2015   5302  th199096 		    NULL, NULL, NULL, OP_GETATTR, NULL);
   2016      0    stevel 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   2017      0    stevel 		    "recov_stale: error=%d, stat=%d seen on rp %s",
   2018      0    stevel 		    e.error, e.stat, rnode4info(rp)));
   2019      0    stevel 		goto out;
   2020      0    stevel 	}
   2021      0    stevel 
   2022      0    stevel 	/* Are things OK for this vnode? */
   2023      0    stevel 	if (!e.error && e.stat == NFS4_OK) {
   2024      0    stevel 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   2025      0    stevel 		    "recov_stale: file appears fine, rp %s",
   2026      0    stevel 		    rnode4info(rp)));
   2027      0    stevel 		goto out;
   2028      0    stevel 	}
   2029      0    stevel 
   2030      0    stevel 	/* Did we get an unrelated non-recoverable error? */
   2031      0    stevel 	if (e.error || e.stat != NFS4ERR_STALE) {
   2032      0    stevel 		nfs4_fail_recov(vp, fail_msg, e.error, e.stat);
   2033      0    stevel 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   2034      0    stevel 		    "recov_stale: unrelated fatal error, rp %s",
   2035      0    stevel 		    rnode4info(rp)));
   2036      0    stevel 		goto out;
   2037      0    stevel 	}
   2038      0    stevel 
   2039      0    stevel 	/*
   2040      0    stevel 	 * If we don't appear to be dealing with the root node, find it.
   2041      0    stevel 	 */
   2042      0    stevel 	if ((vp->v_flag & VROOT) == 0) {
   2043      0    stevel 		nfs4_error_zinit(&e);
   2044      0    stevel 		e.error = VFS_ROOT(vp->v_vfsp, &rootvp);
   2045      0    stevel 		if (e.error) {
   2046      0    stevel 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
   2047      0    stevel 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   2048      0    stevel 			    "recov_stale: can't find root node for rp %s",
   2049      0    stevel 			    rnode4info(rp)));
   2050      0    stevel 			goto out;
   2051      0    stevel 		}
   2052      0    stevel 	}
   2053      0    stevel 
   2054      0    stevel 	/* Try a GETATTR on the root vnode */
   2055      0    stevel 	if (rootvp != NULL) {
   2056      0    stevel 		nfs4_error_zinit(&e);
   2057      0    stevel 		nfs4_getattr_otw_norecovery(rootvp, &gar, &e, CRED(), 0);
   2058      0    stevel 
   2059      0    stevel 		/* Try recovery? */
   2060      0    stevel 		if (e.error != 0 || e.stat != NFS4ERR_STALE) {
   2061      0    stevel 			needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
   2062      0    stevel 			if (needrecov) {
   2063      0    stevel 				(void) nfs4_start_recovery(&e,
   2064   5302  th199096 				    mi, rootvp, NULL, NULL, NULL,
   2065   5302  th199096 				    OP_GETATTR, NULL);
   2066      0    stevel 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   2067      0    stevel 				    "recov_stale: error=%d, stat=%d seen "
   2068      0    stevel 				    "on rp %s", e.error, e.stat,
   2069      0    stevel 				    rnode4info(rp)));
   2070      0    stevel 			}
   2071      0    stevel 		}
   2072      0    stevel 
   2073      0    stevel 		/*
   2074      0    stevel 		 * Check to see if a failover attempt is warranted
   2075      0    stevel 		 * NB: nfs4_try_failover doesn't check for STALE
   2076      0    stevel 		 * because recov_stale gets a shot first.  Now that
   2077      0    stevel 		 * recov_stale has failed, go ahead and try failover.
   2078      0    stevel 		 *
   2079      0    stevel 		 * If the getattr on the root filehandle was successful,
   2080      0    stevel 		 * then mark recovery as failed for 'vp' and exit.
   2081      0    stevel 		 */
   2082      0    stevel 		if (nfs4_try_failover(&e) == 0 && e.stat != NFS4ERR_STALE) {
   2083      0    stevel 			/*
   2084      0    stevel 			 * pass the original error to fail_recov, not
   2085      0    stevel 			 * the one from trying the root vnode.
   2086      0    stevel 			 */
   2087      0    stevel 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
   2088      0    stevel 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   2089      0    stevel 			    "recov_stale: root node OK, marking "
   2090      0    stevel 			    "dead rp %s", rnode4info(rp)));
   2091      0    stevel 			goto out;
   2092      0    stevel 		}
   2093      0    stevel 	}
   2094      0    stevel 
   2095      0    stevel 	/*
   2096      0    stevel 	 * Here, we know that both the original file and the
   2097      0    stevel 	 * root filehandle (which may be the same) are stale.
   2098      0    stevel 	 * We want to fail over if we can, and if we can't, we
   2099      0    stevel 	 * want to mark everything in sight bad.
   2100      0    stevel 	 */
   2101      0    stevel 	if (FAILOVER_MOUNT4(mi)) {
   2102      0    stevel 		mutex_enter(&mi->mi_lock);
   2103      0    stevel 		mi->mi_recovflags |= MI4R_NEED_NEW_SERVER;
   2104      0    stevel 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   2105      0    stevel 		    "recov_stale: failing over due to rp %s",
   2106      0    stevel 		    rnode4info(rp)));
   2107      0    stevel 		mutex_exit(&mi->mi_lock);
   2108      0    stevel 	} else {
   2109      0    stevel 		rnode4_t *rootrp;
   2110      0    stevel 		servinfo4_t *svp;
   2111      0    stevel 
   2112      0    stevel 		/*
   2113      0    stevel 		 * Can't fail over, so mark things dead.
   2114      0    stevel 		 *
   2115      0    stevel 		 * If rootvp is set, we know we have a distinct
   2116      0    stevel 		 * non-root vnode which can be marked dead in
   2117      0    stevel 		 * the usual way.
   2118      0    stevel 		 *
   2119      0    stevel 		 * Then we want to mark the root vnode dead.
   2120      0    stevel 		 * Note that if rootvp wasn't set, our vp is
   2121      0    stevel 		 * actually the root vnode.
   2122      0    stevel 		 */
   2123      0    stevel 		if (rootvp != NULL) {
   2124      0    stevel 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
   2125      0    stevel 			    "recov_stale: can't fail over, marking dead rp %s",
   2126      0    stevel 			    rnode4info(rp)));
   2127      0    stevel 			nfs4_fail_recov(vp, fail_msg, 0, NFS4ERR_STALE);
   2128      0    stevel 		} else {
   2129      0    stevel 			rootvp = vp;
   2130      0    stevel 			VN_HOLD(rootvp);
   2131      0    stevel 		}
   2132      0    stevel 
   2133      0    stevel 		/*
   2134      0    stevel 		 * Mark root dead, but quietly - since
   2135      0    stevel 		 * the root rnode is frequently recreated,
   2136      0    stevel 		 * we can encounter this at every access.
   2137      0    stevel 		 * Also mark recovery as failed on this VFS.
   2138      0    stevel 		 */
   2139      0    stevel 		rootrp = VTOR4(rootvp);
   2140      0    stevel 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_CONT,
   2141      0    stevel 		    "recov_stale: marking dead root rp %s",
   2142      0    stevel 		    rnode4info(rootrp)));
   2143      0    stevel 		mutex_enter(&rootrp->r_statelock);
   2144      0    stevel 		rootrp->r_flags |= (R4RECOVERR | R4STALE);
   2145      0    stevel 		rootrp->r_error = ESTALE;
   2146      0    stevel 		mutex_exit(&rootrp->r_statelock);
   2147      0    stevel 		mutex_enter(&mi->mi_lock);
   2148      0    stevel 		mi->mi_error = ESTALE;
   2149      0    stevel 		mutex_exit(&mi->mi_lock);
   2150      0    stevel 
   2151      0    stevel 		svp = mi->mi_curr_serv;
   2152      0    stevel 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
   2153      0    stevel 		svp->sv_flags |= SV4_ROOT_STALE;
   2154      0    stevel 		nfs_rw_exit(&svp->sv_lock);
   2155      0    stevel 	}
   2156      0    stevel 
   2157      0    stevel out:
   2158      0    stevel 	if (rootvp)
   2159      0    stevel 		VN_RELE(rootvp);
   2160      0    stevel }
   2161      0    stevel 
   2162      0    stevel /*
   2163      0    stevel  * Locks.
   2164      0    stevel  */
   2165      0    stevel 
   2166      0    stevel /*
   2167      0    stevel  * Reclaim all the active (acquired) locks for the given file.
   2168      0    stevel  * If a process lost a lock, the process is sent a SIGLOST.  This is not
   2169      0    stevel  * considered an error.
   2170      0    stevel  *
   2171      0    stevel  * Return values:
   2172      0    stevel  * Errors and status are returned via the nfs4_error_t parameter
   2173      0    stevel  * If an error indicates that recovery is needed, the caller is responsible
   2174      0    stevel  * for dealing with it.
   2175      0    stevel  */
   2176      0    stevel 
   2177      0    stevel static void
   2178      0    stevel relock_file(vnode_t *vp, mntinfo4_t *mi, nfs4_error_t *ep,
   2179      0    stevel     fattr4_change pre_change)
   2180      0    stevel {
   2181      0    stevel 	locklist_t *locks, *llp;
   2182      0    stevel 	rnode4_t *rp;
   2183      0    stevel 
   2184      0    stevel 	ASSERT(ep != NULL);
   2185      0    stevel 	nfs4_error_zinit(ep);
   2186      0    stevel 
   2187      0    stevel 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
   2188      0    stevel 		return;
   2189      0    stevel 
   2190      0    stevel 	nfs4_flush_lock_owners(VTOR4(vp));
   2191      0    stevel 
   2192      0    stevel 	/*
   2193      0    stevel 	 * If we get an error that requires recovery actions, just bail out
   2194      0    stevel 	 * and let the top-level recovery code handle it.
   2195      0    stevel 	 *
   2196      0    stevel 	 * If we get some other error, kill the process that owned the lock
   2197      0    stevel 	 * and mark its remaining locks (if any) as belonging to NOPID, so
   2198      0    stevel 	 * that we don't make any more reclaim requests for that process.
   2199      0    stevel 	 */
   2200      0    stevel 
   2201      0    stevel 	rp = VTOR4(vp);
   2202      0    stevel 	locks = flk_active_locks_for_vp(vp);
   2203      0    stevel 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
   2204      0    stevel 		int did_reclaim = 1;
   2205      0    stevel 
   2206      0    stevel 		ASSERT(llp->ll_vp == vp);
   2207      0    stevel 		if (llp->ll_flock.l_pid == NOPID)
   2208      0    stevel 			continue;
   2209      0    stevel 		reclaim_one_lock(vp, &llp->ll_flock, ep, &did_reclaim);
   2210      0    stevel 		/*
   2211      0    stevel 		 * If we need to restart recovery, stop processing the
   2212      0    stevel 		 * list.  Some errors would be recoverable under other
   2213      0    stevel 		 * circumstances, but if they happen here we just give up
   2214      0    stevel 		 * on the lock.
   2215      0    stevel 		 */
   2216      0    stevel 		if (nfs4_needs_recovery(ep, TRUE, vp->v_vfsp)) {
   2217      0    stevel 			if (ep->error != 0)
   2218      0    stevel 				break;
   2219      0    stevel 			if (!nfs4_recov_marks_dead(ep->stat))
   2220      0    stevel 				break;
   2221      0    stevel 		}
   2222      0    stevel 		/*
   2223      0    stevel 		 *   In case the server isn't offering us a grace period, or
   2224      0    stevel 		 * if we missed it, we might have opened & locked from scratch,
   2225      0    stevel 		 * rather than reopened/reclaimed.
   2226      0    stevel 		 *   We need to ensure that the object hadn't been otherwise
   2227      0    stevel 		 * changed during this time, by comparing the changeinfo.
   2228      0    stevel 		 *   We get passed the changeinfo from before the reopen by our
   2229      0    stevel 		 * caller, in pre_change.
   2230      0    stevel 		 *   The changeinfo from after the reopen is in rp->r_change,
   2231      0    stevel 		 * courtesy of the GETATTR in the reopen.
   2232      0    stevel 		 *   If they're different, then the file has changed, and we
   2233      0    stevel 		 * have to SIGLOST the app.
   2234      0    stevel 		 */
   2235      0    stevel 		if (ep->error == 0 && ep->stat == NFS4_OK && !did_reclaim) {
   2236      0    stevel 			mutex_enter(&rp->r_statelock);
   2237      0    stevel 			if (pre_change != rp->r_change)
   2238      0    stevel 				ep->stat = NFS4ERR_NO_GRACE;
   2239      0    stevel 			mutex_exit(&rp->r_statelock);
   2240      0    stevel 		}
   2241      0    stevel 		if (ep->error != 0 || ep->stat != NFS4_OK) {
   2242      0    stevel 			if (ep->error != 0)
   2243      0    stevel 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
   2244      0    stevel 				    NULL, ep->error, vp, NULL, 0, NULL,
   2245      0    stevel 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
   2246      0    stevel 				    0, 0);
   2247      0    stevel 			else
   2248      0    stevel 				nfs4_queue_event(RE_FAIL_RELOCK, mi,
   2249      0    stevel 				    NULL, 0, vp, NULL, ep->stat, NULL,
   2250      0    stevel 				    llp->ll_flock.l_pid, TAG_NONE, TAG_NONE,
   2251      0    stevel 				    0, 0);
   2252      0    stevel 			nfs4_send_siglost(llp->ll_flock.l_pid, mi, vp, TRUE,
   2253      0    stevel 			    ep->error, ep->stat);
   2254      0    stevel 			relock_skip_pid(llp, llp->ll_flock.l_pid);
   2255      0    stevel 
   2256      0    stevel 			/* Reinitialize the nfs4_error and continue */
   2257      0    stevel 			nfs4_error_zinit(ep);
   2258      0    stevel 		}
   2259      0    stevel 	}
   2260      0    stevel 
   2261      0    stevel 	if (locks != NULL)
   2262      0    stevel 		flk_free_locklist(locks);
   2263      0    stevel }
   2264      0    stevel 
   2265      0    stevel /*
   2266      0    stevel  * Reclaim the given lock.
   2267      0    stevel  * If the lock can't be reclaimed, the process is sent SIGLOST, but this is
   2268      0    stevel  * not considered an error.
   2269      0    stevel  *
   2270      0    stevel  * Errors are returned via the nfs4_error_t parameter.
   2271      0    stevel  */
   2272      0    stevel static void
   2273      0    stevel reclaim_one_lock(vnode_t *vp, flock64_t *flk, nfs4_error_t *ep,
   2274   5302  th199096     int *did_reclaimp)
   2275      0    stevel {
   2276      0    stevel 	cred_t *cr;
   2277      0    stevel 	rnode4_t *rp = VTOR4(vp);
   2278      0    stevel 
   2279      0    stevel 	cr = pid_to_cr(flk->l_pid);
   2280      0    stevel 	if (cr == NULL) {
   2281      0    stevel 		nfs4_error_zinit(ep);
   2282      0    stevel 		ep->error = ESRCH;
   2283      0    stevel 		return;
   2284      0    stevel 	}
   2285      0    stevel 
   2286      0    stevel 	do {
   2287      0    stevel 		mutex_enter(&rp->r_statelock);
   2288      0    stevel 		if (rp->r_flags & R4RECOVERR) {
   2289      0    stevel 			/*
   2290      0    stevel 			 * This shouldn't affect other reclaims, so don't
   2291      0    stevel 			 * return an error.
   2292      0    stevel 			 */
   2293      0    stevel 			mutex_exit(&rp->r_statelock);
   2294      0    stevel 			break;
   2295      0    stevel 		}
   2296      0    stevel 		mutex_exit(&rp->r_statelock);
   2297      0    stevel 
   2298      0    stevel 		nfs4frlock(NFS4_LCK_CTYPE_RECLAIM, vp, F_SETLK, flk,
   2299   5302  th199096 		    FREAD|FWRITE, 0, cr, ep, NULL, did_reclaimp);
   2300      0    stevel 		if (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED)
   2301      0    stevel 			start_recovery_action(NR_FHEXPIRED, TRUE, VTOMI4(vp),
   2302   5302  th199096 			    vp, NULL);
   2303      0    stevel 	} while (ep->error == 0 && ep->stat == NFS4ERR_FHEXPIRED);
   2304      0    stevel 
   2305      0    stevel 	crfree(cr);
   2306      0    stevel }
   2307      0    stevel 
   2308      0    stevel /*
   2309      0    stevel  * Open files.
   2310      0    stevel  */
   2311      0    stevel 
   2312      0    stevel /*
   2313      0    stevel  * Verifies if the nfsstat4 is a valid error for marking this vnode dead.
   2314      0    stevel  * Returns 1 if the error is valid; 0 otherwise.
   2315      0    stevel  */
   2316      0    stevel static int
   2317      0    stevel nfs4_valid_recov_err_for_vp(vnode_t *vp, nfsstat4 stat)
   2318      0    stevel {
   2319      0    stevel 	/*
   2320      0    stevel 	 * We should not be marking non-regular files as dead,
   2321      0    stevel 	 * except in very rare cases (eg: BADHANDLE or NFS4ERR_BADNAME).
   2322      0    stevel 	 */
   2323      0    stevel 	if (vp->v_type != VREG && stat != NFS4ERR_BADHANDLE &&
   2324      0    stevel 	    stat != NFS4ERR_BADNAME)
   2325      0    stevel 		return (0);
   2326      0    stevel 
   2327      0    stevel 	return (1);
   2328      0    stevel }
   2329      0    stevel 
   2330      0    stevel /*
   2331      0    stevel  * Failed attempting to recover a filehandle.  If 'stat' is valid for 'vp',
   2332      0    stevel  * then mark the object dead.  Since we've had to do a lookup for
   2333      0    stevel  * filehandle recovery, we will mark the object dead if we got NOENT.
   2334      0    stevel  */
   2335      0    stevel static void
   2336      0    stevel nfs4_recov_fh_fail(vnode_t *vp, int error, nfsstat4 stat)
   2337      0    stevel {
   2338      0    stevel 	ASSERT(vp != NULL);
   2339      0    stevel 
   2340      0    stevel 	if ((error == 0) && (stat != NFS4ERR_NOENT) &&
   2341      0    stevel 	    (!nfs4_valid_recov_err_for_vp(vp, stat)))
   2342      0    stevel 		return;
   2343      0    stevel 
   2344      0    stevel 	nfs4_fail_recov(vp, "can't recover filehandle", error, stat);
   2345      0    stevel }
   2346      0    stevel 
   2347      0    stevel /*
   2348      0    stevel  * Recovery from a "shouldn't happen" error.  In the long term, we'd like
   2349      0    stevel  * to mark only the data structure(s) that provided the bad value as being
   2350      0    stevel  * bad.  But for now we'll just mark the entire file.
   2351      0    stevel  */
   2352      0    stevel 
   2353      0    stevel static void
   2354      0    stevel recov_badstate(recov_info_t *recovp, vnode_t *vp, nfsstat4 stat)
   2355      0    stevel {
   2356      0    stevel 	ASSERT(vp != NULL);
   2357      0    stevel 	recov_throttle(recovp, vp);
   2358      0    stevel 
   2359      0    stevel 	if (!nfs4_valid_recov_err_for_vp(vp, stat))
   2360      0    stevel 		return;
   2361      0    stevel 
   2362      0    stevel 	nfs4_fail_recov(vp, "", 0, stat);
   2363      0    stevel }
   2364      0    stevel 
   2365      0    stevel /*
   2366      0    stevel  * Free up the information saved for a lost state request.
   2367      0    stevel  */
   2368      0    stevel static void
   2369      0    stevel nfs4_free_lost_rqst(nfs4_lost_rqst_t *lrp, nfs4_server_t *sp)
   2370      0    stevel {
   2371      0    stevel 	component4 *filep;
   2372      0    stevel 	nfs4_open_stream_t *osp;
   2373      0    stevel 	int have_sync_lock;
   2374      0    stevel 
   2375      0    stevel 	NFS4_DEBUG(nfs4_lost_rqst_debug,
   2376   5302  th199096 	    (CE_NOTE, "nfs4_free_lost_rqst:"));
   2377      0    stevel 
   2378      0    stevel 	switch (lrp->lr_op) {
   2379      0    stevel 	case OP_OPEN:
   2380      0    stevel 		filep = &lrp->lr_ofile;
   2381      0    stevel 		if (filep->utf8string_val) {
   2382      0    stevel 			kmem_free(filep->utf8string_val, filep->utf8string_len);
   2383      0    stevel 			filep->utf8string_val = NULL;
   2384      0    stevel 		}
   2385      0    stevel 		break;
   2386      0    stevel 	case OP_DELEGRETURN:
   2387      0    stevel 		nfs4delegreturn_cleanup(VTOR4(lrp->lr_vp), sp);
   2388      0    stevel 		break;
   2389      0    stevel 	case OP_CLOSE:
   2390      0    stevel 		osp = lrp->lr_osp;
   2391      0    stevel 		ASSERT(osp != NULL);
   2392      0    stevel 		mutex_enter(&osp->os_sync_lock);
   2393      0    stevel 		have_sync_lock = 1;
   2394      0    stevel 		if (osp->os_pending_close) {
   2395      0    stevel 			/* clean up the open file state. */
   2396      0    stevel 			osp->os_pending_close = 0;
   2397      0    stevel 			nfs4close_notw(lrp->lr_vp, osp, &have_sync_lock);
   2398      0    stevel 		}
   2399      0    stevel 		if (have_sync_lock)
   2400      0    stevel 			mutex_exit(&osp->os_sync_lock);
   2401      0    stevel 		break;
   2402      0    stevel 	}
   2403      0    stevel 
   2404      0    stevel 	lrp->lr_op = 0;
   2405      0    stevel 	if (lrp->lr_oop != NULL) {
   2406      0    stevel 		open_owner_rele(lrp->lr_oop);
   2407      0    stevel 		lrp->lr_oop = NULL;
   2408      0    stevel 	}
   2409      0    stevel 	if (lrp->lr_osp != NULL) {
   2410      0    stevel 		open_stream_rele(lrp->lr_osp, VTOR4(lrp->lr_vp));
   2411      0    stevel 		lrp->lr_osp = NULL;
   2412      0    stevel 	}
   2413      0    stevel 	if (lrp->lr_lop != NULL) {
   2414      0    stevel 		lock_owner_rele(lrp->lr_lop);
   2415      0    stevel 		lrp->lr_lop = NULL;
   2416      0    stevel 	}
   2417      0    stevel 	if (lrp->lr_flk != NULL) {
   2418      0    stevel 		kmem_free(lrp->lr_flk, sizeof (flock64_t));
   2419      0    stevel 		lrp->lr_flk = NULL;
   2420      0    stevel 	}
   2421      0    stevel 	if (lrp->lr_vp != NULL) {
   2422      0    stevel 		VN_RELE(lrp->lr_vp);
   2423      0    stevel 		lrp->lr_vp = NULL;
   2424      0    stevel 	}
   2425      0    stevel 	if (lrp->lr_dvp != NULL) {
   2426      0    stevel 		VN_RELE(lrp->lr_dvp);
   2427      0    stevel 		lrp->lr_dvp = NULL;
   2428      0    stevel 	}
   2429      0    stevel 	if (lrp->lr_cr != NULL) {
   2430      0    stevel 		crfree(lrp->lr_cr);
   2431      0    stevel 		lrp->lr_cr = NULL;
   2432      0    stevel 	}
   2433      0    stevel 
   2434      0    stevel 	kmem_free(lrp, sizeof (nfs4_lost_rqst_t));
   2435      0    stevel }
   2436      0    stevel 
   2437      0    stevel /*
   2438      0    stevel  * Remove any lost state requests and free them.
   2439      0    stevel  */
   2440      0    stevel static void
   2441      0    stevel nfs4_remove_lost_rqsts(mntinfo4_t *mi, nfs4_server_t *sp)
   2442      0    stevel {
   2443      0    stevel 	nfs4_lost_rqst_t *lrp;
   2444      0    stevel 
   2445      0    stevel 	mutex_enter(&mi->mi_lock);
   2446      0    stevel 	while ((lrp = list_head(&mi->mi_lost_state)) != NULL) {
   2447      0    stevel 		list_remove(&mi->mi_lost_state, lrp);
   2448      0    stevel 		mutex_exit(&mi->mi_lock);
   2449      0    stevel 		nfs4_free_lost_rqst(lrp, sp);
   2450      0    stevel 		mutex_enter(&mi->mi_lock);
   2451      0    stevel 	}
   2452      0    stevel 	mutex_exit(&mi->mi_lock);
   2453      0    stevel }
   2454      0    stevel 
   2455      0    stevel /*
   2456      0    stevel  * Reopen all the files for the given filesystem and reclaim any locks.
   2457      0    stevel  */
   2458      0    stevel 
   2459      0    stevel static void
   2460      0    stevel recov_openfiles(recov_info_t *recovp, nfs4_server_t *sp)
   2461      0    stevel {
   2462      0    stevel 	mntinfo4_t *mi = recovp->rc_mi;
   2463      0    stevel 	nfs4_opinst_t *reopenlist = NULL, *rep;
   2464      0    stevel 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
   2465      0    stevel 	open_claim_type4 claim;
   2466      0    stevel 	int remap;
   2467      0    stevel 	char *fail_msg = "No such file or directory on replica";
   2468      0    stevel 	rnode4_t *rp;
   2469      0    stevel 	fattr4_change pre_change;
   2470      0    stevel 
   2471      0    stevel 	ASSERT(sp != NULL);
   2472      0    stevel 
   2473      0    stevel 	/*
   2474      0    stevel 	 * This check is to allow a 10ms pause before we reopen files
   2475      0    stevel 	 * it should allow the server time to have received the CB_NULL
   2476      0    stevel 	 * reply and update its internal structures such that (if
   2477      0    stevel 	 * applicable) we are granted a delegation on reopened files.
   2478      0    stevel 	 */
   2479      0    stevel 	mutex_enter(&sp->s_lock);
   2480      0    stevel 	if ((sp->s_flags & (N4S_CB_PINGED | N4S_CB_WAITER)) == 0) {
   2481      0    stevel 		sp->s_flags |= N4S_CB_WAITER;
   2482  11066    rafael 		(void) cv_reltimedwait(&sp->wait_cb_null, &sp->s_lock,
   2483  11066    rafael 		    drv_usectohz(N4S_CB_PAUSE_TIME), TR_CLOCK_TICK);
   2484      0    stevel 	}
   2485      0    stevel 	mutex_exit(&sp->s_lock);
   2486      0    stevel 
   2487      0    stevel 	(void) nfs_rw_enter_sig(&sp->s_recovlock, RW_READER, 0);
   2488      0    stevel 	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
   2489      0    stevel 
   2490      0    stevel 	if (NFS4_VOLATILE_FH(mi)) {
   2491      0    stevel 		nfs4_remap_root(mi, &e, 0);
   2492      0    stevel 		if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
   2493      0    stevel 			(void) nfs4_start_recovery(&e, mi, NULL,
   2494   5302  th199096 			    NULL, NULL, NULL, OP_LOOKUP, NULL);
   2495      0    stevel 		}
   2496      0    stevel 	}
   2497      0    stevel 
   2498      0    stevel 	mutex_enter(&mi->mi_lock);
   2499      0    stevel 	if (recovp->rc_srv_reboot || (mi->mi_recovflags & MI4R_SRV_REBOOT))
   2500      0    stevel 		claim = CLAIM_PREVIOUS;
   2501      0    stevel 	else
   2502      0    stevel 		claim = CLAIM_NULL;
   2503      0    stevel 	mutex_exit(&mi->mi_lock);
   2504      0    stevel 
   2505      0    stevel 	if (e.error == 0 && e.stat == NFS4_OK) {
   2506      0    stevel 		/*
   2507      0    stevel 		 * Get a snapshot of open files in the filesystem.  Note
   2508      0    stevel 		 * that new opens will stall until the server's grace
   2509      0    stevel 		 * period is done.
   2510      0    stevel 		 */
   2511      0    stevel 		reopenlist = r4mkopenlist(mi);
   2512      0    stevel 
   2513      0    stevel 		mutex_enter(&mi->mi_lock);
   2514      0    stevel 		remap = mi->mi_recovflags & MI4R_REMAP_FILES;
   2515      0    stevel 		mutex_exit(&mi->mi_lock);
   2516      0    stevel 		/*
   2517      0    stevel 		 * Since we are re-establishing state on the
   2518      0    stevel 		 * server, its ok to blow away the saved lost
   2519      0    stevel 		 * requests since we don't need to reissue it.
   2520      0    stevel 		 */
   2521      0    stevel 		nfs4_remove_lost_rqsts(mi, sp);
   2522      0    stevel 
   2523      0    stevel 		for (rep = reopenlist; rep; rep = rep->re_next) {
   2524      0    stevel 
   2525      0    stevel 			if (remap) {
   2526      0    stevel 				nfs4_remap_file(mi, rep->re_vp,
   2527   5302  th199096 				    NFS4_REMAP_CKATTRS, &e);
   2528      0    stevel 			}
   2529      0    stevel 			if (e.error == ENOENT || e.stat == NFS4ERR_NOENT) {
   2530      0    stevel 				/*
   2531      0    stevel 				 * The current server does not have the file
   2532      0    stevel 				 * that is to be remapped.  This is most
   2533      0    stevel 				 * likely due to an improperly maintained
   2534      0    stevel 				 * replica.   The files that are missing from
   2535      0    stevel 				 * the server will be marked dead and logged
   2536      0    stevel 				 * in order to make sys admins aware of the
   2537      0    stevel 				 * problem.
   2538      0    stevel 				 */
   2539      0    stevel 				nfs4_fail_recov(rep->re_vp,
   2540   5302  th199096 				    fail_msg, e.error, e.stat);
   2541      0    stevel 				/*
   2542      0    stevel 				 * We've already handled the error so clear it.
   2543      0    stevel 				 */
   2544      0    stevel 				nfs4_error_zinit(&e);
   2545      0    stevel 				continue;
   2546      0    stevel 			} else if (e.error == 0 && e.stat == NFS4_OK) {
   2547      0    stevel 				int j;
   2548      0    stevel 
   2549      0    stevel 				rp = VTOR4(rep->re_vp);
   2550      0    stevel 				mutex_enter(&rp->r_statelock);
   2551      0    stevel 				pre_change = rp->r_change;
   2552      0    stevel 				mutex_exit(&rp->r_statelock);
   2553      0    stevel 
   2554      0    stevel 				for (j = 0; j < rep->re_numosp; j++) {
   2555      0    stevel 					nfs4_reopen(rep->re_vp, rep->re_osp[j],
   2556   5302  th199096 					    &e, claim, FALSE, TRUE);
   2557      0    stevel 					if (e.error != 0 || e.stat != NFS4_OK)
   2558      0    stevel 						break;
   2559      0    stevel 				}
   2560      0    stevel 				if (nfs4_needs_recovery(&e, TRUE,
   2561      0    stevel 				    mi->mi_vfsp)) {
   2562      0    stevel 					(void) nfs4_start_recovery(&e, mi,
   2563   5302  th199096 					    rep->re_vp, NULL, NULL, NULL,
   2564   5302  th199096 					    OP_OPEN, NULL);
   2565      0    stevel 					break;
   2566      0    stevel 				}
   2567      0    stevel 			}
   2568      0    stevel #ifdef DEBUG
   2569      0    stevel 			if (nfs4_recovdelay > 0)
   2570      0    stevel 				delay(MSEC_TO_TICK(nfs4_recovdelay * 1000));
   2571      0    stevel #endif
   2572      0    stevel 			if (e.error == 0 && e.stat == NFS4_OK)
   2573      0    stevel 				relock_file(rep->re_vp, mi, &e, pre_change);
   2574      0    stevel 
   2575      0    stevel 			if (nfs4_needs_recovery(&e, TRUE, mi->mi_vfsp))
   2576      0    stevel 				(void) nfs4_start_recovery(&e, mi,
   2577   5302  th199096 				    rep->re_vp, NULL, NULL, NULL, OP_LOCK,
   2578   5302  th199096 				    NULL);
   2579      0    stevel 			if (e.error != 0 || e.stat != NFS4_OK)
   2580      0    stevel 				break;
   2581      0    stevel 		}
   2582      0    stevel 
   2583      0    stevel 		/*
   2584      0    stevel 		 * Check to see if we need to remap files passed in
   2585      0    stevel 		 * via the recovery arguments; this will have been
   2586      0    stevel 		 * done for open files.  A failure here is not fatal.
   2587      0    stevel 		 */
   2588      0    stevel 		if (remap) {
   2589      0    stevel 			nfs4_error_t ignore;
   2590      0    stevel 			nfs4_check_remap(mi, recovp->rc_vp1, NFS4_REMAP_CKATTRS,
   2591   5302  th199096 			    &ignore);
   2592      0    stevel 			nfs4_check_remap(mi, recovp->rc_vp2, NFS4_REMAP_CKATTRS,
   2593   5302  th199096 			    &ignore);
   2594      0    stevel 		}
   2595      0    stevel 	}
   2596      0    stevel 
   2597      0    stevel 	if (e.error == 0 && e.stat == NFS4_OK) {
   2598      0    stevel 		mutex_enter(&mi->mi_lock);
   2599      0    stevel 		mi->mi_recovflags &= ~(MI4R_REOPEN_FILES | MI4R_REMAP_FILES);
   2600      0    stevel 		mutex_exit(&mi->mi_lock);
   2601      0    stevel 	}
   2602      0    stevel 
   2603      0    stevel 	nfs_rw_exit(&mi->mi_recovlock);
   2604      0    stevel 	nfs_rw_exit(&sp->s_recovlock);
   2605      0    stevel 
   2606      0    stevel 	if (reopenlist != NULL)
   2607      0    stevel 		r4releopenlist(reopenlist);
   2608      0    stevel }
   2609      0    stevel 
   2610      0    stevel /*
   2611      0    stevel  * Resend the queued state recovery requests in "rqsts".
   2612      0    stevel  */
   2613      0    stevel 
   2614      0    stevel static void
   2615      0    stevel nfs4_resend_lost_rqsts(recov_info_t *recovp, nfs4_server_t *sp)
   2616      0    stevel {
   2617      0    stevel 	nfs4_lost_rqst_t	*lrp, *tlrp;
   2618      0    stevel 	mntinfo4_t		*mi = recovp->rc_mi;
   2619    284  ek110237 	nfs4_error_t		n4e;
   2620      0    stevel #ifdef NOTYET
   2621      0    stevel 	uint32_t		deny_bits = 0;
   2622      0    stevel #endif
   2623      0    stevel 
   2624      0    stevel 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_resend_lost_rqsts"));
   2625      0    stevel 
   2626      0    stevel 	ASSERT(mi != NULL);
   2627      0    stevel 	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
   2628      0    stevel 
   2629      0    stevel 	mutex_enter(&mi->mi_lock);
   2630      0    stevel 	lrp = list_head(&mi->mi_lost_state);
   2631      0    stevel 	mutex_exit(&mi->mi_lock);
   2632      0    stevel 	while (lrp != NULL) {
   2633    284  ek110237 		nfs4_error_zinit(&n4e);
   2634    284  ek110237 		resend_one_op(lrp, &n4e, mi, sp);
   2635      0    stevel 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
   2636      0    stevel 		    "nfs4_resend_lost_rqsts: resend request: for vp %p got "
   2637    284  ek110237 		    "error %d stat %d", (void *)lrp->lr_vp, n4e.error,
   2638    284  ek110237 		    n4e.stat));
   2639      0    stevel 
   2640      0    stevel 		/*
   2641      0    stevel 		 * If we get a recovery error that we can actually
   2642      0    stevel 		 * recover from (such as ETIMEDOUT, FHEXPIRED), we
   2643      0    stevel 		 * return and let the recovery thread redrive the call.
   2644      0    stevel 		 * Don't requeue unless the zone is still healthy.
   2645      0    stevel 		 */
   2646      0    stevel 		if (zone_status_get(curproc->p_zone) < ZONE_IS_SHUTTING_DOWN &&
   2647    284  ek110237 		    nfs4_needs_recovery(&n4e, TRUE, mi->mi_vfsp) &&
   2648    284  ek110237 		    (nfs4_try_failover(&n4e) ||
   2649    284  ek110237 		    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp) ||
   2650    284  ek110237 		    (n4e.error == 0 && n4e.stat != NFS4ERR_BADHANDLE &&
   2651    284  ek110237 		    !nfs4_recov_marks_dead(n4e.stat)))) {
   2652      0    stevel 			/*
   2653      0    stevel 			 * For these three errors, we want to delay a bit
   2654      0    stevel 			 * instead of pounding the server into submission.
   2655      0    stevel 			 * We have to do this manually; the normal
   2656      0    stevel 			 * processing for these errors only works for
   2657      0    stevel 			 * non-recovery requests.
   2658      0    stevel 			 */
   2659    284  ek110237 			if ((n4e.error == 0 && n4e.stat == NFS4ERR_DELAY) ||
   2660    284  ek110237 			    (n4e.error == 0 && n4e.stat == NFS4ERR_GRACE) ||
   2661    284  ek110237 			    (n4e.error == 0 && n4e.stat == NFS4ERR_RESOURCE) ||
   2662    284  ek110237 			    NFS4_FRC_UNMT_ERR(n4e.error, mi->mi_vfsp)) {
   2663      0    stevel 				delay(SEC_TO_TICK(nfs4err_delay_time));
   2664      0    stevel 			} else {
   2665    284  ek110237 				(void) nfs4_start_recovery(&n4e,
   2666   5302  th199096 				    mi, lrp->lr_dvp, lrp->lr_vp, NULL, NULL,
   2667   5302  th199096 				    lrp->lr_op, NULL);
   2668      0    stevel 			}
   2669      0    stevel 			return;
   2670      0    stevel 		}
   2671      0    stevel 
   2672      0    stevel 		mutex_enter(&mi->mi_lock);
   2673      0    stevel 		list_remove(&mi->mi_lost_state, lrp);
   2674      0    stevel 		tlrp = lrp;
   2675      0    stevel 		lrp = list_head(&mi->mi_lost_state);
   2676      0    stevel 		mutex_exit(&mi->mi_lock);
   2677      0    stevel 		nfs4_free_lost_rqst(tlrp, sp);
   2678      0    stevel 	}
   2679      0    stevel }
   2680      0    stevel 
   2681      0    stevel /*
   2682      0    stevel  * Resend the given op, and issue any necessary undo call.
   2683      0    stevel  * errors are returned via the nfs4_error_t parameter.
   2684      0    stevel  */
   2685      0    stevel 
   2686      0    stevel static void
   2687      0    stevel resend_one_op(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
   2688   5302  th199096     mntinfo4_t *mi, nfs4_server_t *sp)
   2689      0    stevel {
   2690      0    stevel 	vnode_t *vp;
   2691      0    stevel 	nfs4_open_stream_t *osp;
   2692      0    stevel 	cred_t *cr;
   2693      0    stevel 	uint32_t acc_bits;
   2694      0    stevel 
   2695      0    stevel 	vp = lrp->lr_vp;
   2696      0    stevel 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
   2697      0    stevel 	    "have a lost open/close request for vp %p", (void *)vp));
   2698      0    stevel 
   2699      0    stevel 	switch (lrp->lr_op) {
   2700      0    stevel 	case OP_OPEN:
   2701      0    stevel 		nfs4_resend_open_otw(&vp, lrp, ep);
   2702      0    stevel 		break;
   2703      0    stevel 	case OP_OPEN_DOWNGRADE:
   2704      0    stevel 		ASSERT(lrp->lr_oop != NULL);
   2705      0    stevel 		ep->error = nfs4_start_open_seqid_sync(lrp->lr_oop, mi);
   2706      0    stevel 		ASSERT(!ep->error);	/* recov thread always succeeds */
   2707      0    stevel 		ASSERT(lrp->lr_osp != NULL);
   2708      0    stevel 		mutex_enter(&lrp->lr_osp->os_sync_lock);
   2709      0    stevel 		nfs4_open_downgrade(lrp->lr_dg_acc, lrp->lr_dg_deny,
   2710   5302  th199096 		    lrp->lr_oop, lrp->lr_osp, vp, lrp->lr_cr, lrp,
   2711   5302  th199096 		    ep, NULL, NULL);
   2712      0    stevel 		mutex_exit(&lrp->lr_osp->os_sync_lock);
   2713      0    stevel 		nfs4_end_open_seqid_sync(lrp->lr_oop);
   2714      0    stevel 		break;
   2715      0    stevel 	case OP_CLOSE:
   2716      0    stevel 		osp = lrp->lr_osp;
   2717      0    stevel 		cr = lrp->lr_cr;
   2718      0    stevel 		acc_bits = 0;
   2719      0    stevel 		mutex_enter(&osp->os_sync_lock);
   2720      0    stevel 		if (osp->os_share_acc_read)
   2721      0    stevel 			acc_bits |= OPEN4_SHARE_ACCESS_READ;
   2722      0    stevel 		if (osp->os_share_acc_write)
   2723      0    stevel 			acc_bits |= OPEN4_SHARE_ACCESS_WRITE;
   2724      0    stevel 		mutex_exit(&osp->os_sync_lock);
   2725      0    stevel 		nfs4close_one(vp, osp, cr, acc_bits, lrp, ep,
   2726   5302  th199096 		    CLOSE_RESEND, 0, 0, 0);
   2727      0    stevel 		break;
   2728      0    stevel 	case OP_LOCK:
   2729      0    stevel 	case OP_LOCKU:
   2730      0    stevel 		resend_lock(lrp, ep);
   2731      0    stevel 		goto done;
   2732      0    stevel 	case OP_DELEGRETURN:
   2733      0    stevel 		nfs4_resend_delegreturn(lrp, ep, sp);
   2734      0    stevel 		goto done;
   2735      0    stevel 	default:
   2736      0    stevel #ifdef DEBUG
   2737      0    stevel 		cmn_err(CE_PANIC, "resend_one_op: unexpected op: %d",
   2738   5302  th199096 		    lrp->lr_op);
   2739      0    stevel #endif
   2740      0    stevel 		nfs4_queue_event(RE_LOST_STATE_BAD_OP, mi, NULL,
   2741      0    stevel 		    lrp->lr_op, lrp->lr_vp, lrp->lr_dvp, NFS4_OK, NULL, 0,
   2742      0    stevel 		    TAG_NONE, TAG_NONE, 0, 0);
   2743      0    stevel 		nfs4_error_init(ep, EINVAL);
   2744      0    stevel 		return;
   2745      0    stevel 	}
   2746      0    stevel 
   2747      0    stevel 	/*
   2748      0    stevel 	 * No need to retry nor send an "undo" CLOSE in the
   2749      0    stevel 	 * event the server rebooted.
   2750      0    stevel 	 */
   2751      0    stevel 	if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
   2752      0    stevel 	    ep->stat == NFS4ERR_STALE_STATEID || ep->stat == NFS4ERR_EXPIRED))
   2753      0    stevel 		goto done;
   2754      0    stevel 
   2755      0    stevel 	/*
   2756      0    stevel 	 * If we resent a CLOSE or OPEN_DOWNGRADE, there's nothing
   2757      0    stevel 	 * to undo.  Undoing locking operations was handled by
   2758      0    stevel 	 * resend_lock().
   2759      0    stevel 	 */
   2760      0    stevel 	if (lrp->lr_op == OP_OPEN_DOWNGRADE || lrp->lr_op == OP_CLOSE)
   2761      0    stevel 		goto done;
   2762      0    stevel 
   2763      0    stevel 	/*
   2764      0    stevel 	 * If we get any other error for OPEN, then don't attempt
   2765      0    stevel 	 * to undo the resend of the open (since it was never
   2766      0    stevel 	 * successful!).
   2767      0    stevel 	 */
   2768      0    stevel 	ASSERT(lrp->lr_op == OP_OPEN);
   2769      0    stevel 	if (ep->error || ep->stat != NFS4_OK)
   2770      0    stevel 		goto done;
   2771      0    stevel 
   2772      0    stevel 	/*
   2773      0    stevel 	 * Now let's undo our OPEN.
   2774      0    stevel 	 */
   2775      0    stevel 	nfs4_error_zinit(ep);
   2776      0    stevel 	close_after_open_resend(vp, lrp->lr_cr, lrp->lr_oacc, ep);
   2777      0    stevel 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_one_op: "
   2778      0    stevel 	    "nfs4close_one: for vp %p got error %d stat %d",
   2779      0    stevel 	    (void *)vp, ep->error, ep->stat));
   2780      0    stevel 
   2781      0    stevel done:
   2782      0    stevel 	if (vp != lrp->lr_vp)
   2783      0    stevel 		VN_RELE(vp);
   2784      0    stevel }
   2785      0    stevel 
   2786      0    stevel /*
   2787      0    stevel  * Close a file that was opened via a resent OPEN.
   2788      0    stevel  * Most errors are passed back to the caller (via the return value and
   2789      0    stevel  * *statp), except for FHEXPIRED, which is retried.
   2790      0    stevel  *
   2791      0    stevel  * It might be conceptually cleaner to push the CLOSE request onto the
   2792      0    stevel  * front of the resend queue, rather than sending it here.  That would
   2793      0    stevel  * match the way we undo lost lock requests.  On the other
   2794      0    stevel  * hand, we've already got something that works, and there's no reason to
   2795      0    stevel  * change it at this time.
   2796      0    stevel  */
   2797      0    stevel 
   2798      0    stevel static void
   2799      0    stevel close_after_open_resend(vnode_t *vp, cred_t *cr, uint32_t acc_bits,
   2800   5302  th199096     nfs4_error_t *ep)
   2801      0    stevel {
   2802      0    stevel 
   2803      0    stevel 	for (;;) {
   2804      0    stevel 		nfs4close_one(vp, NULL, cr, acc_bits, NULL, ep,
   2805   5302  th199096 		    CLOSE_AFTER_RESEND, 0, 0, 0);
   2806      0    stevel 		if (ep->error == 0 && ep->stat == NFS4_OK)
   2807      0    stevel 			break;		/* success; done */
   2808      0    stevel 		if (ep->error != 0 || ep->stat != NFS4ERR_FHEXPIRED)
   2809      0    stevel 			break;
   2810      0    stevel 		/* else retry FHEXPIRED */
   2811      0    stevel 	}
   2812      0    stevel 
   2813      0    stevel }
   2814      0    stevel 
   2815      0    stevel /*
   2816      0    stevel  * Resend the given lost lock request.  Return an errno value.  If zero,
   2817      0    stevel  * *statp is set to the NFS status code for the call.
   2818      0    stevel  *
   2819      0    stevel  * Issue a SIGLOST and mark the rnode dead if we get a non-recovery error or
   2820      0    stevel  * a recovery error that we don't actually recover from yet (eg: BAD_SEQID).
   2821      0    stevel  * Let the recovery thread redrive the call if we get a recovery error that
   2822      0    stevel  * we can actually recover from.
   2823      0    stevel  */
   2824      0    stevel static void
   2825      0    stevel resend_lock(nfs4_lost_rqst_t *lrp, nfs4_error_t *ep)
   2826      0    stevel {
   2827      0    stevel 	bool_t		send_siglost = FALSE;
   2828      0    stevel 	vnode_t		*vp = lrp->lr_vp;
   2829      0    stevel 
   2830      0    stevel 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock:"));
   2831      0    stevel 	ASSERT(lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE ||
   2832      0    stevel 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND);
   2833      0    stevel 
   2834      0    stevel 	nfs4frlock(lrp->lr_ctype, vp, F_SETLK,
   2835   5302  th199096 	    lrp->lr_flk, FREAD|FWRITE, 0, lrp->lr_cr, ep, lrp, NULL);
   2836      0    stevel 
   2837      0    stevel 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "resend_lock: "
   2838      0    stevel 	    "nfs4frlock for vp %p returned error %d, stat %d",
   2839      0    stevel 	    (void *)vp, ep->error, ep->stat));
   2840      0    stevel 
   2841      0    stevel 	if (ep->error == 0 && ep->stat == 0)
   2842      0    stevel 		goto done;
   2843      0    stevel 	if (ep->error == 0 && ep->stat == NFS4ERR_DENIED &&
   2844      0    stevel 	    lrp->lr_ctype == NFS4_LCK_CTYPE_RESEND)
   2845      0    stevel 		goto done;
   2846      0    stevel 
   2847      0    stevel 	/*
   2848      0    stevel 	 * If we failed with a non-recovery error, send SIGLOST and
   2849      0    stevel 	 * mark the file dead.
   2850      0    stevel 	 */
   2851      0    stevel 	if (!nfs4_needs_recovery(ep, TRUE, vp->v_vfsp))
   2852      0    stevel 		send_siglost = TRUE;
   2853      0    stevel 	else {
   2854      0    stevel 		/*
   2855      0    stevel 		 * Done with recovering LOST LOCK in the event the
   2856      0    stevel 		 * server rebooted or we've lost the lease.
   2857      0    stevel 		 */
   2858      0    stevel 		if (ep->error == 0 && (ep->stat == NFS4ERR_STALE_CLIENTID ||
   2859      0    stevel 		    ep->stat == NFS4ERR_STALE_STATEID ||
   2860      0    stevel 		    ep->stat == NFS4ERR_EXPIRED)) {
   2861      0    stevel 			goto done;
   2862      0    stevel 		}
   2863      0    stevel 
   2864      0    stevel 		/*
   2865      0    stevel 		 * BAD_STATEID on an unlock indicates that the server has
   2866      0    stevel 		 * forgotten about the lock anyway, so act like the call
   2867      0    stevel 		 * was successful.
   2868      0    stevel 		 */
   2869      0    stevel 		if (ep->error == 0 && ep->stat == NFS4ERR_BAD_STATEID &&
   2870      0    stevel 		    lrp->lr_op == OP_LOCKU)
   2871      0    stevel 			goto done;
   2872      0    stevel 
   2873      0    stevel 		/*
   2874      0    stevel 		 * If we got a recovery error that we don't actually
   2875      0    stevel 		 * recover from, send SIGLOST.  If the filesystem was
   2876      0    stevel 		 * forcibly unmounted, we skip the SIGLOST because (a) it's
   2877      0    stevel 		 * unnecessary noise, and (b) there could be a new process
   2878      0    stevel 		 * with the same pid as the one that had generated the lost
   2879      0    stevel 		 * state request.
   2880      0    stevel 		 */
   2881      0    stevel 		if (ep->error == 0 && (ep->stat == NFS4ERR_BADHANDLE ||
   2882      0    stevel 		    nfs4_recov_marks_dead(ep->stat))) {
   2883      0    stevel 			if (!(vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
   2884      0    stevel 				send_siglost = TRUE;
   2885      0    stevel 			goto done;
   2886      0    stevel 		}
   2887      0    stevel 
   2888      0    stevel 		/*
   2889      0    stevel 		 * If the filesystem was forcibly unmounted, we
   2890      0    stevel 		 * still need to synchronize with the server and
   2891      0    stevel 		 * release state.  Try again later.
   2892      0    stevel 		 */
   2893      0    stevel 		if (NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))
   2894      0    stevel 			goto done;
   2895      0    stevel 
   2896      0    stevel 		/*
   2897      0    stevel 		 * If we get a recovery error that we can actually
   2898      0    stevel 		 * recover from (such as ETIMEDOUT, FHEXPIRED),
   2899      0    stevel 		 * return and let the recovery thread redrive the call.
   2900      0    stevel 		 *
   2901      0    stevel 		 * For the three errors below, we want to delay a bit
   2902      0    stevel 		 * instead of pounding the server into submission.
   2903      0    stevel 		 */
   2904      0    stevel 		if ((ep->error == 0 && ep->stat == NFS4ERR_DELAY) ||
   2905      0    stevel 		    (ep->error == 0 && ep->stat == NFS4ERR_GRACE) ||
   2906      0    stevel 		    (ep->error == 0 && ep->stat == NFS4ERR_RESOURCE))
   2907      0    stevel 			delay(SEC_TO_TICK(recov_err_delay));
   2908      0    stevel 		goto done;
   2909      0    stevel 	}
   2910      0    stevel 
   2911      0    stevel done:
   2912      0    stevel 	if (send_siglost) {
   2913      0    stevel 		cred_t *sv_cred;
   2914      0    stevel 
   2915      0    stevel 		/*
   2916      0    stevel 		 * Must be root or the actual thread being issued the
   2917      0    stevel 		 * SIGLOST for this to work, so just become root.
   2918      0    stevel 		 */
   2919      0    stevel 		sv_cred = curthread->t_cred;
   2920      0    stevel 		curthread->t_cred = kcred;
   2921      0    stevel 		nfs4_send_siglost(lrp->lr_flk->l_pid, VTOMI4(vp), vp, FALSE,
   2922      0    stevel 		    ep->error, ep->stat);
   2923      0    stevel 		curthread->t_cred = sv_cred;
   2924      0    stevel 
   2925      0    stevel 		/*
   2926      0    stevel 		 * Flush any additional reinstantiation requests for
   2927      0    stevel 		 * this operation.  Sending multiple SIGLOSTs to the user
   2928      0    stevel 		 * process is unlikely to help and may cause trouble.
   2929      0    stevel 		 */
   2930      0    stevel 		if (lrp->lr_ctype == NFS4_LCK_CTYPE_REINSTATE)
   2931      0    stevel 			flush_reinstate(lrp);
   2932      0    stevel 	}
   2933      0    stevel }
   2934      0    stevel 
   2935      0    stevel /*
   2936      0    stevel  * Remove any lock reinstantiation requests that correspond to the given
   2937      0    stevel  * lost request.  We only remove items that follow lrp in the queue,
   2938      0    stevel  * assuming that lrp will be removed by the generic lost state code.
   2939      0    stevel  */
   2940      0    stevel 
   2941      0    stevel static void
   2942      0    stevel flush_reinstate(nfs4_lost_rqst_t *lrp)
   2943      0    stevel {
   2944      0    stevel 	vnode_t *vp;
   2945      0    stevel 	pid_t pid;
   2946      0    stevel 	mntinfo4_t *mi;
   2947      0    stevel 	nfs4_lost_rqst_t *nlrp;
   2948      0    stevel 
   2949      0    stevel 	vp = lrp->lr_vp;
   2950      0    stevel 	mi = VTOMI4(vp);
   2951      0    stevel 	pid = lrp->lr_flk->l_pid;
   2952      0    stevel 
   2953      0    stevel 	/*
   2954      0    stevel 	 * If there are any more reinstantation requests to get rid of,
   2955      0    stevel 	 * they should all be clustered at the front of the lost state
   2956      0    stevel 	 * queue.
   2957      0    stevel 	 */
   2958      0    stevel 	mutex_enter(&mi->mi_lock);
   2959      0    stevel 	for (lrp = list_next(&mi->mi_lost_state, lrp); lrp != NULL;
   2960      0    stevel 	    lrp = nlrp) {
   2961      0    stevel 		nlrp = list_next(&mi->mi_lost_state, lrp);
   2962      0    stevel 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
   2963      0    stevel 			break;
   2964      0    stevel 		if (lrp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE)
   2965      0    stevel 			break;
   2966      0    stevel 		ASSERT(lrp->lr_vp == vp);
   2967      0    stevel 		ASSERT(lrp->lr_flk->l_pid == pid);
   2968      0    stevel 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
   2969   5302  th199096 		    "remove reinstantiation %p", (void *)lrp));
   2970      0    stevel 		list_remove(&mi->mi_lost_state, lrp);
   2971      0    stevel 		nfs4_free_lost_rqst(lrp, NULL);
   2972      0    stevel 	}
   2973      0    stevel 	mutex_exit(&mi->mi_lock);
   2974      0    stevel }
   2975      0    stevel 
   2976      0    stevel /*
   2977      0    stevel  * End of state-specific recovery routines.
   2978      0    stevel  */
   2979      0    stevel 
   2980      0    stevel /*
   2981      0    stevel  * Allocate a lost request struct, initialize it from lost_rqstp (including
   2982      0    stevel  * bumping the reference counts for the referenced vnode, etc.), and hang
   2983      0    stevel  * it off of recovp.
   2984      0    stevel  */
   2985      0    stevel 
   2986      0    stevel static void
   2987      0    stevel nfs4_save_lost_rqst(nfs4_lost_rqst_t *lost_rqstp, recov_info_t *recovp,
   2988   5302  th199096     nfs4_recov_t *action, mntinfo4_t *mi)
   2989      0    stevel {
   2990      0    stevel 	nfs4_lost_rqst_t *destp;
   2991      0    stevel 
   2992      0    stevel 	ASSERT(recovp->rc_lost_rqst == NULL);
   2993      0    stevel 
   2994      0    stevel 	destp = kmem_alloc(sizeof (nfs4_lost_rqst_t), KM_SLEEP);
   2995      0    stevel 	recovp->rc_lost_rqst = destp;
   2996      0    stevel 
   2997