Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/systm.h>
     28 #include <sys/sdt.h>
     29 #include <rpc/types.h>
     30 #include <rpc/auth.h>
     31 #include <rpc/auth_unix.h>
     32 #include <rpc/auth_des.h>
     33 #include <rpc/svc.h>
     34 #include <rpc/xdr.h>
     35 #include <nfs/nfs4.h>
     36 #include <nfs/nfs_dispatch.h>
     37 #include <nfs/nfs4_drc.h>
     38 
/*
 * Highest NFSv4 minor version this file accepts.  rfs4_minorvers_mismatch()
 * answers any COMPOUND whose minorversion is larger than this with
 * NFS4ERR_MINOR_VERS_MISMATCH.
 */
#define	NFS4_MAX_MINOR_VERSION	0

/*
 * This is the duplicate request cache for NFSv4.  rfs4_dispatch() hands
 * it to rfs4_find_dr() for every non-idempotent COMPOUND.  It starts out
 * NULL; presumably it is set from rfs4_init_drc() by code outside this
 * file -- TODO confirm.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default size of the duplicate request cache, in entries.
 * NOTE(review): not referenced in this file; presumably passed as
 * drc_size to rfs4_init_drc() -- confirm against the caller.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the replies into.  Do not
 * change this on the fly: rfs4_init_drc() sizes the bucket array from
 * the count it is given and rfs4_find_dr() reduces every XID modulo
 * that stored count.
 * NOTE(review): presumably this is the value handed to rfs4_init_drc()
 * as drc_hash_size -- confirm against the caller.
 */
uint32_t nfs4_drc_hash = 541;

/*
 * Forward declaration; the definition is at the end of this file and
 * the only use visible here is in rfs4_dispatch().
 */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
     58 
     59 /*
     60  * Initialize a duplicate request cache.
     61  */
     62 rfs4_drc_t *
     63 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
     64 {
     65 	rfs4_drc_t *drc;
     66 	uint32_t   bki;
     67 
     68 	ASSERT(drc_size);
     69 	ASSERT(drc_hash_size);
     70 
     71 	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
     72 
     73 	drc->max_size = drc_size;
     74 	drc->in_use = 0;
     75 
     76 	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
     77 
     78 	drc->dr_hash = drc_hash_size;
     79 
     80 	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
     81 
     82 	for (bki = 0; bki < drc_hash_size; bki++) {
     83 		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
     84 		    offsetof(rfs4_dupreq_t, dr_bkt_next));
     85 	}
     86 
     87 	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
     88 	    offsetof(rfs4_dupreq_t, dr_next));
     89 
     90 	return (drc);
     91 }
     92 
     93 /*
     94  * Destroy a duplicate request cache.
     95  */
     96 void
     97 rfs4_fini_drc(rfs4_drc_t *drc)
     98 {
     99 	rfs4_dupreq_t *drp, *drp_next;
    100 
    101 	ASSERT(drc);
    102 
    103 	/* iterate over the dr_cache and free the enties */
    104 	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
    105 
    106 		if (drp->dr_state == NFS4_DUP_REPLAY)
    107 			rfs4_compound_free(&(drp->dr_res));
    108 
    109 		if (drp->dr_addr.buf != NULL)
    110 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
    111 
    112 		drp_next = list_next(&(drc->dr_cache), drp);
    113 
    114 		kmem_free(drp, sizeof (rfs4_dupreq_t));
    115 	}
    116 
    117 	mutex_destroy(&drc->lock);
    118 	kmem_free(drc->dr_buckets,
    119 	    sizeof (list_t)*drc->dr_hash);
    120 	kmem_free(drc, sizeof (rfs4_drc_t));
    121 }
    122 
    123 /*
    124  * rfs4_dr_chstate:
    125  *
    126  * Change the state of a rfs4_dupreq. If it's not in transition
    127  * to the FREE state, return. If we are moving to the FREE state
    128  * then we need to clean up the compound results and move the entry
    129  * to the end of the list.
    130  */
    131 void
    132 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
    133 {
    134 	rfs4_drc_t *drc;
    135 
    136 	ASSERT(drp);
    137 	ASSERT(drp->drc);
    138 	ASSERT(drp->dr_bkt);
    139 	ASSERT(MUTEX_HELD(&drp->drc->lock));
    140 
    141 	drp->dr_state = new_state;
    142 
    143 	if (new_state != NFS4_DUP_FREE)
    144 		return;
    145 
    146 	drc = drp->drc;
    147 
    148 	/*
    149 	 * Remove entry from the bucket and
    150 	 * dr_cache list, free compound results.
    151 	 */
    152 	list_remove(drp->dr_bkt, drp);
    153 	list_remove(&(drc->dr_cache), drp);
    154 	rfs4_compound_free(&(drp->dr_res));
    155 }
    156 
    157 /*
    158  * rfs4_alloc_dr:
    159  *
    160  * Malloc a new one if we have not reached our maximum cache
    161  * limit, otherwise pick an entry off the tail -- Use if it
    162  * is marked as NFS4_DUP_FREE, or is an entry in the
    163  * NFS4_DUP_REPLAY state.
    164  */
    165 rfs4_dupreq_t *
    166 rfs4_alloc_dr(rfs4_drc_t *drc)
    167 {
    168 	rfs4_dupreq_t *drp_tail, *drp = NULL;
    169 
    170 	ASSERT(drc);
    171 	ASSERT(MUTEX_HELD(&drc->lock));
    172 
    173 	/*
    174 	 * Have we hit the cache limit yet ?
    175 	 */
    176 	if (drc->in_use < drc->max_size) {
    177 		/*
    178 		 * nope, so let's malloc a new one
    179 		 */
    180 		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
    181 		drp->drc = drc;
    182 		drc->in_use++;
    183 		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
    184 		return (drp);
    185 	}
    186 
    187 	/*
    188 	 * Cache is all allocated now traverse the list
    189 	 * backwards to find one we can reuse.
    190 	 */
    191 	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
    192 	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
    193 
    194 		switch (drp_tail->dr_state) {
    195 
    196 		case NFS4_DUP_FREE:
    197 			list_remove(&(drc->dr_cache), drp_tail);
    198 			DTRACE_PROBE1(nfss__i__drc_freeclaim,
    199 			    rfs4_dupreq_t *, drp_tail);
    200 			return (drp_tail);
    201 			/* NOTREACHED */
    202 
    203 		case NFS4_DUP_REPLAY:
    204 			/* grab it. */
    205 			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
    206 			DTRACE_PROBE1(nfss__i__drc_replayclaim,
    207 			    rfs4_dupreq_t *, drp_tail);
    208 			return (drp_tail);
    209 			/* NOTREACHED */
    210 		}
    211 	}
    212 	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
    213 	return (NULL);
    214 }
    215 
    216 /*
    217  * rfs4_find_dr:
    218  *
    219  * Search for an entry in the duplicate request cache by
    220  * calculating the hash index based on the XID, and examining
    221  * the entries in the hash bucket. If we find a match, return.
    222  * Once we have searched the bucket we call rfs4_alloc_dr() to
    223  * allocate a new entry, or reuse one that is available.
    224  */
    225 int
    226 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
    227 {
    228 
    229 	uint32_t	the_xid;
    230 	list_t		*dr_bkt;
    231 	rfs4_dupreq_t	*drp;
    232 	int		bktdex;
    233 
    234 	/*
    235 	 * Get the XID, calculate the bucket and search to
    236 	 * see if we need to replay from the cache.
    237 	 */
    238 	the_xid = req->rq_xprt->xp_xid;
    239 	bktdex = the_xid % drc->dr_hash;
    240 
    241 	dr_bkt = (list_t *)
    242 	    &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
    243 
    244 	DTRACE_PROBE3(nfss__i__drc_bktdex,
    245 	    int, bktdex,
    246 	    uint32_t, the_xid,
    247 	    list_t *, dr_bkt);
    248 
    249 	*dup = NULL;
    250 
    251 	mutex_enter(&drc->lock);
    252 	/*
    253 	 * Search the bucket for a matching xid and address.
    254 	 */
    255 	for (drp = list_head(dr_bkt); drp != NULL;
    256 	    drp = list_next(dr_bkt, drp)) {
    257 
    258 		if (drp->dr_xid == the_xid &&
    259 		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
    260 		    bcmp((caddr_t)drp->dr_addr.buf,
    261 		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
    262 		    drp->dr_addr.len) == 0) {
    263 
    264 			/*
    265 			 * Found a match so REPLAY the Reply
    266 			 */
    267 			if (drp->dr_state == NFS4_DUP_REPLAY) {
    268 				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
    269 				mutex_exit(&drc->lock);
    270 				*dup = drp;
    271 				DTRACE_PROBE1(nfss__i__drc_replay,
    272 				    rfs4_dupreq_t *, drp);
    273 				return (NFS4_DUP_REPLAY);
    274 			}
    275 
    276 			/*
    277 			 * This entry must be in transition, so return
    278 			 * the 'pending' status.
    279 			 */
    280 			mutex_exit(&drc->lock);
    281 			return (NFS4_DUP_PENDING);
    282 		}
    283 	}
    284 
    285 	drp = rfs4_alloc_dr(drc);
    286 	mutex_exit(&drc->lock);
    287 
    288 	/*
    289 	 * The DRC is full and all entries are in use. Upper function
    290 	 * should error out this request and force the client to
    291 	 * retransmit -- effectively this is a resource issue. NFSD
    292 	 * threads tied up with native File System, or the cache size
    293 	 * is too small for the server load.
    294 	 */
    295 	if (drp == NULL)
    296 		return (NFS4_DUP_ERROR);
    297 
    298 	/*
    299 	 * Init the state to NEW.
    300 	 */
    301 	drp->dr_state = NFS4_DUP_NEW;
    302 
    303 	/*
    304 	 * If needed, resize the address buffer
    305 	 */
    306 	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
    307 		if (drp->dr_addr.buf != NULL)
    308 			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
    309 		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
    310 		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
    311 		if (drp->dr_addr.buf == NULL) {
    312 			/*
    313 			 * If the malloc fails, mark the entry
    314 			 * as free and put on the tail.
    315 			 */
    316 			drp->dr_addr.maxlen = 0;
    317 			drp->dr_state = NFS4_DUP_FREE;
    318 			mutex_enter(&drc->lock);
    319 			list_insert_tail(&(drc->dr_cache), drp);
    320 			mutex_exit(&drc->lock);
    321 			return (NFS4_DUP_ERROR);
    322 		}
    323 	}
    324 
    325 
    326 	/*
    327 	 * Copy the address.
    328 	 */
    329 	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
    330 
    331 	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
    332 	    (caddr_t)drp->dr_addr.buf,
    333 	    drp->dr_addr.len);
    334 
    335 	drp->dr_xid = the_xid;
    336 	drp->dr_bkt = dr_bkt;
    337 
    338 	/*
    339 	 * Insert at the head of the bucket and
    340 	 * the drc lists..
    341 	 */
    342 	mutex_enter(&drc->lock);
    343 	list_insert_head(&drc->dr_cache, drp);
    344 	list_insert_head(dr_bkt, drp);
    345 	mutex_exit(&drc->lock);
    346 
    347 	*dup = drp;
    348 
    349 	return (NFS4_DUP_NEW);
    350 }
    351 
    352 /*
    353  *
    354  * This function handles the duplicate request cache,
    355  * NULL_PROC and COMPOUND procedure calls for NFSv4;
    356  *
    357  * Passed into this function are:-
    358  *
    359  * 	disp	A pointer to our dispatch table entry
    360  * 	req	The request to process
    361  * 	xprt	The server transport handle
    362  * 	ap	A pointer to the arguments
    363  *
    364  *
    365  * When appropriate this function is responsible for inserting
    366  * the reply into the duplicate cache or replaying an existing
    367  * cached reply.
    368  *
    369  * dr_stat 	reflects the state of the duplicate request that
    370  * 		has been inserted into or retrieved from the cache
    371  *
    372  * drp		is the duplicate request entry
    373  *
    374  */
    375 int
    376 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
    377 		SVCXPRT *xprt, char *ap)
    378 {
    379 
    380 	COMPOUND4res	 res_buf;
    381 	COMPOUND4res	*rbp;
    382 	COMPOUND4args	*cap;
    383 	cred_t		*cr = NULL;
    384 	int		 error = 0;
    385 	int		 dis_flags = 0;
    386 	int		 dr_stat = NFS4_NOT_DUP;
    387 	rfs4_dupreq_t	*drp = NULL;
    388 	int		 rv;
    389 
    390 	ASSERT(disp);
    391 
    392 	/*
    393 	 * Short circuit the RPC_NULL proc.
    394 	 */
    395 	if (disp->dis_proc == rpc_null) {
    396 		DTRACE_NFSV4_1(null__start, struct svc_req *, req);
    397 		if (!svc_sendreply(xprt, xdr_void, NULL)) {
    398 			DTRACE_NFSV4_1(null__done, struct svc_req *, req);
    399 			svcerr_systemerr(xprt);
    400 			return (1);
    401 		}
    402 		DTRACE_NFSV4_1(null__done, struct svc_req *, req);
    403 		return (0);
    404 	}
    405 
    406 	/* Only NFSv4 Compounds from this point onward */
    407 
    408 	rbp = &res_buf;
    409 	cap = (COMPOUND4args *)ap;
    410 
    411 	/*
    412 	 * Figure out the disposition of the whole COMPOUND
    413 	 * and record it's IDEMPOTENTCY.
    414 	 */
    415 	rfs4_compound_flagproc(cap, &dis_flags);
    416 
    417 	/*
    418 	 * If NON-IDEMPOTENT then we need to figure out if this
    419 	 * request can be replied from the duplicate cache.
    420 	 *
    421 	 * If this is a new request then we need to insert the
    422 	 * reply into the duplicate cache.
    423 	 */
    424 	if (!(dis_flags & RPC_IDEMPOTENT)) {
    425 		/* look for a replay from the cache or allocate */
    426 		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
    427 
    428 		switch (dr_stat) {
    429 
    430 		case NFS4_DUP_ERROR:
    431 			rfs4_resource_err(req, cap);
    432 			return (1);
    433 			/* NOTREACHED */
    434 
    435 		case NFS4_DUP_PENDING:
    436 			/*
    437 			 * reply has previously been inserted into the
    438 			 * duplicate cache, however the reply has
    439 			 * not yet been sent via svc_sendreply()
    440 			 */
    441 			return (1);
    442 			/* NOTREACHED */
    443 
    444 		case NFS4_DUP_NEW:
    445 			curthread->t_flag |= T_DONTPEND;
    446 			/* NON-IDEMPOTENT proc call */
    447 			rfs4_compound(cap, rbp, NULL, req, cr, &rv);
    448 			curthread->t_flag &= ~T_DONTPEND;
    449 
    450 			if (rv)		/* short ckt sendreply on error */
    451 				return (rv);
    452 
    453 			/*
    454 			 * dr_res must be initialized before calling
    455 			 * rfs4_dr_chstate (it frees the reply).
    456 			 */
    457 			drp->dr_res = res_buf;
    458 			if (curthread->t_flag & T_WOULDBLOCK) {
    459 				curthread->t_flag &= ~T_WOULDBLOCK;
    460 				/*
    461 				 * mark this entry as FREE and plop
    462 				 * on the end of the cache list
    463 				 */
    464 				mutex_enter(&drp->drc->lock);
    465 				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
    466 				list_insert_tail(&(drp->drc->dr_cache), drp);
    467 				mutex_exit(&drp->drc->lock);
    468 				return (1);
    469 			}
    470 			break;
    471 
    472 		case NFS4_DUP_REPLAY:
    473 			/* replay from the cache */
    474 			rbp = &(drp->dr_res);
    475 			break;
    476 		}
    477 	} else {
    478 		curthread->t_flag |= T_DONTPEND;
    479 		/* IDEMPOTENT proc call */
    480 		rfs4_compound(cap, rbp, NULL, req, cr, &rv);
    481 		curthread->t_flag &= ~T_DONTPEND;
    482 
    483 		if (rv)		/* short ckt sendreply on error */
    484 			return (rv);
    485 
    486 		if (curthread->t_flag & T_WOULDBLOCK) {
    487 			curthread->t_flag &= ~T_WOULDBLOCK;
    488 			return (1);
    489 		}
    490 	}
    491 
    492 	/*
    493 	 * Send out the replayed reply or the 'real' one.
    494 	 */
    495 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
    496 		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
    497 		    struct svc_req *, xprt,
    498 		    char *, rbp);
    499 		svcerr_systemerr(xprt);
    500 		error++;
    501 	}
    502 
    503 	/*
    504 	 * If this reply was just inserted into the duplicate cache
    505 	 * or it was replayed from the dup cache; (re)mark it as
    506 	 * available for replay
    507 	 *
    508 	 * At first glance, this 'if' statement seems a little strange;
    509 	 * testing for NFS4_DUP_REPLAY, and then calling...
    510 	 *
    511 	 *	rfs4_dr_chatate(NFS4_DUP_REPLAY)
    512 	 *
    513 	 * ... but notice that we are checking dr_stat, and not the
    514 	 * state of the entry itself, the entry will be NFS4_DUP_INUSE,
    515 	 * we do that so that we know not to prematurely reap it whilst
    516 	 * we resent it to the client.
    517 	 *
    518 	 */
    519 	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
    520 		mutex_enter(&drp->drc->lock);
    521 		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
    522 		mutex_exit(&drp->drc->lock);
    523 	} else if (dr_stat == NFS4_NOT_DUP) {
    524 		rfs4_compound_free(rbp);
    525 	}
    526 
    527 	return (error);
    528 }
    529 
    530 bool_t
    531 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
    532 {
    533 	COMPOUND4args *argsp;
    534 	COMPOUND4res res_buf, *resp;
    535 
    536 	if (req->rq_vers != 4)
    537 		return (FALSE);
    538 
    539 	argsp = (COMPOUND4args *)args;
    540 
    541 	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
    542 		return (FALSE);
    543 
    544 	resp = &res_buf;
    545 
    546 	/*
    547 	 * Form a reply tag by copying over the reqeuest tag.
    548 	 */
    549 	resp->tag.utf8string_val =
    550 	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
    551 	resp->tag.utf8string_len = argsp->tag.utf8string_len;
    552 	bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
    553 	    resp->tag.utf8string_len);
    554 	resp->array_len = 0;
    555 	resp->array = NULL;
    556 	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
    557 	if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
    558 		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
    559 		    SVCXPRT *, xprt, char *, resp);
    560 		svcerr_systemerr(xprt);
    561 	}
    562 	rfs4_compound_free(resp);
    563 	return (TRUE);
    564 }
    565 
    566 void
    567 rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
    568 {
    569 	COMPOUND4res res_buf, *rbp;
    570 	nfs_resop4 *resop;
    571 	PUTFH4res *resp;
    572 
    573 	rbp = &res_buf;
    574 
    575 	/*
    576 	 * Form a reply tag by copying over the request tag.
    577 	 */
    578 	rbp->tag.utf8string_val =
    579 	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
    580 	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
    581 	bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
    582 	    rbp->tag.utf8string_len);
    583 
    584 	rbp->array_len = 1;
    585 	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
    586 	    KM_SLEEP);
    587 	resop = &rbp->array[0];
    588 	resop->resop = argsp->array[0].argop;	/* copy first op over */
    589 
    590 	/* Any op will do, just need to access status field */
    591 	resp = &resop->nfs_resop4_u.opputfh;
    592 
    593 	/*
    594 	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
    595 	 * Note that all op numbers in the compound array were already
    596 	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
    597 	 */
    598 	resp->status = (resop->resop == OP_ILLEGAL ?
    599 	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
    600 
    601 	/* compound status is same as first op status */
    602 	rbp->status = resp->status;
    603 
    604 	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
    605 		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
    606 		    struct svc_req *, req->rq_xprt, char *, rbp);
    607 		svcerr_systemerr(req->rq_xprt);
    608 	}
    609 
    610 	UTF8STRING_FREE(rbp->tag);
    611 	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
    612 }
    613