Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/systm.h>
     27 #include <sys/cmn_err.h>
     28 #include <sys/kmem.h>
     29 #include <sys/disp.h>
     30 #include <sys/id_space.h>
     31 #include <sys/atomic.h>
     32 #include <rpc/rpc.h>
     33 #include <nfs/nfs4.h>
     34 #include <nfs/nfs4_db_impl.h>
     35 
/*
 * Upper bound, in seconds, on how long a reaper thread waits between
 * passes; reaper_thread() uses the smaller of this value and the
 * table's dbt_max_cache_time.
 */
static int rfs4_reap_interval = RFS4_REAP_INTERVAL;

/* Forward declarations for file-local helpers defined later in this file. */
static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
static void rfs4_dbe_destroy(rfs4_dbe_t *);
static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
static void rfs4_start_reaper(rfs4_table_t *);
     42 
     43 id_t
     44 rfs4_dbe_getid(rfs4_dbe_t *entry)
     45 {
     46 	return (entry->dbe_id);
     47 }
     48 
     49 void
     50 rfs4_dbe_hold(rfs4_dbe_t *entry)
     51 {
     52 	atomic_add_32(&entry->dbe_refcnt, 1);
     53 }
     54 
     55 /*
     56  * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
     57  */
     58 void
     59 rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
     60 {
     61 	atomic_add_32(&entry->dbe_refcnt, -1);
     62 }
     63 
     64 
     65 uint32_t
     66 rfs4_dbe_refcnt(rfs4_dbe_t *entry)
     67 {
     68 	return (entry->dbe_refcnt);
     69 }
     70 
     71 /*
     72  * Mark an entry such that the dbsearch will skip it.
     73  * Caller does not want this entry to be found any longer
     74  */
     75 void
     76 rfs4_dbe_invalidate(rfs4_dbe_t *entry)
     77 {
     78 	entry->dbe_invalid = TRUE;
     79 	entry->dbe_skipsearch = TRUE;
     80 }
     81 
     82 /*
     83  * Is this entry invalid?
     84  */
     85 bool_t
     86 rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
     87 {
     88 	return (entry->dbe_invalid);
     89 }
     90 
     91 time_t
     92 rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
     93 {
     94 	return (entry->dbe_time_rele);
     95 }
     96 
     97 /*
     98  * Use these to temporarily hide/unhide a db entry.
     99  */
    100 void
    101 rfs4_dbe_hide(rfs4_dbe_t *entry)
    102 {
    103 	rfs4_dbe_lock(entry);
    104 	entry->dbe_skipsearch = TRUE;
    105 	rfs4_dbe_unlock(entry);
    106 }
    107 
    108 void
    109 rfs4_dbe_unhide(rfs4_dbe_t *entry)
    110 {
    111 	rfs4_dbe_lock(entry);
    112 	entry->dbe_skipsearch = FALSE;
    113 	rfs4_dbe_unlock(entry);
    114 }
    115 
    116 void
    117 rfs4_dbe_rele(rfs4_dbe_t *entry)
    118 {
    119 	mutex_enter(entry->dbe_lock);
    120 	ASSERT(entry->dbe_refcnt > 1);
    121 	atomic_add_32(&entry->dbe_refcnt, -1);
    122 	entry->dbe_time_rele = gethrestime_sec();
    123 	mutex_exit(entry->dbe_lock);
    124 }
    125 
    126 void
    127 rfs4_dbe_lock(rfs4_dbe_t *entry)
    128 {
    129 	mutex_enter(entry->dbe_lock);
    130 }
    131 
    132 void
    133 rfs4_dbe_unlock(rfs4_dbe_t *entry)
    134 {
    135 	mutex_exit(entry->dbe_lock);
    136 }
    137 
    138 bool_t
    139 rfs4_dbe_islocked(rfs4_dbe_t *entry)
    140 {
    141 	return (mutex_owned(entry->dbe_lock));
    142 }
    143 
    144 clock_t
    145 rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
    146 {
    147 	return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
    148 }
    149 
    150 void
    151 rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
    152 {
    153 	cv_broadcast(entry->dbe_cv);
    154 }
    155 
    156 /* ARGSUSED */
    157 static int
    158 rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
    159 {
    160 	rfs4_dbe_t *entry = obj;
    161 
    162 	mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
    163 	cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
    164 
    165 	return (0);
    166 }
    167 
    168 static void
    169 rfs4_dbe_kmem_destructor(void *obj, void *private)
    170 {
    171 	rfs4_dbe_t *entry = obj;
    172 	/*LINTED*/
    173 	rfs4_table_t *table = private;
    174 
    175 	mutex_destroy(entry->dbe_lock);
    176 	cv_destroy(entry->dbe_cv);
    177 }
    178 
    179 rfs4_database_t *
    180 rfs4_database_create(uint32_t flags)
    181 {
    182 	rfs4_database_t *db;
    183 
    184 	db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
    185 	mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
    186 	db->db_tables = NULL;
    187 	db->db_debug_flags = flags;
    188 	db->db_shutdown_count = 0;
    189 	cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
    190 	return (db);
    191 }
    192 
    193 
    194 /*
    195  * The reaper threads that have been created for the tables in this
    196  * database must be stopped and the entries in the tables released.
    197  * Each table will be marked as "shutdown" and the reaper threads
    198  * poked and they will see that a shutdown is in progress and cleanup
    199  * and exit.  This function waits for all reaper threads to stop
    200  * before returning to the caller.
    201  */
    202 void
    203 rfs4_database_shutdown(rfs4_database_t *db)
    204 {
    205 	rfs4_table_t *table;
    206 
    207 	mutex_enter(db->db_lock);
    208 	for (table = db->db_tables; table; table = table->dbt_tnext) {
    209 		table->dbt_reaper_shutdown = TRUE;
    210 		mutex_enter(&table->dbt_reaper_cv_lock);
    211 		cv_broadcast(&table->dbt_reaper_wait);
    212 		db->db_shutdown_count++;
    213 		mutex_exit(&table->dbt_reaper_cv_lock);
    214 	}
    215 	while (db->db_shutdown_count > 0) {
    216 		cv_wait(&db->db_shutdown_wait, db->db_lock);
    217 	}
    218 	mutex_exit(db->db_lock);
    219 }
    220 
    221 /*
    222  * Given a database that has been "shutdown" by the function above all
    223  * of the table tables are destroyed and then the database itself
    224  * freed.
    225  */
    226 void
    227 rfs4_database_destroy(rfs4_database_t *db)
    228 {
    229 	rfs4_table_t *next, *tmp;
    230 
    231 	for (next = db->db_tables; next; ) {
    232 		tmp = next;
    233 		next = tmp->dbt_tnext;
    234 		rfs4_table_destroy(db, tmp);
    235 	}
    236 
    237 	mutex_destroy(db->db_lock);
    238 	kmem_free(db, sizeof (rfs4_database_t));
    239 }
    240 
    241 rfs4_table_t *
    242 rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
    243     uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
    244     void (*destroy)(rfs4_entry_t),
    245     bool_t (*expiry)(rfs4_entry_t),
    246     uint32_t size, uint32_t hashsize,
    247     uint32_t maxentries, id_t start)
    248 {
    249 	rfs4_table_t *table;
    250 	int len;
    251 	char *cache_name;
    252 	char *id_name;
    253 
    254 	table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
    255 	table->dbt_db = db;
    256 	rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
    257 	mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
    258 	mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
    259 	cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
    260 
    261 	len = strlen(tabname);
    262 	table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
    263 	cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
    264 	(void) strcpy(table->dbt_name, tabname);
    265 	(void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
    266 	table->dbt_max_cache_time = max_cache_time;
    267 	table->dbt_usize = size;
    268 	table->dbt_len = hashsize;
    269 	table->dbt_count = 0;
    270 	table->dbt_idxcnt = 0;
    271 	table->dbt_ccnt = 0;
    272 	table->dbt_maxcnt = idxcnt;
    273 	table->dbt_indices = NULL;
    274 	table->dbt_id_space = NULL;
    275 	table->dbt_reaper_shutdown = FALSE;
    276 
    277 	if (start >= 0) {
    278 		if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
    279 			maxentries = INT32_MAX - start;
    280 		id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
    281 		(void) sprintf(id_name, "%s_id_space", table->dbt_name);
    282 		table->dbt_id_space = id_space_create(id_name, start,
    283 		    maxentries + start);
    284 		kmem_free(id_name, len + 10);
    285 	}
    286 	table->dbt_maxentries = maxentries;
    287 	table->dbt_create = create;
    288 	table->dbt_destroy = destroy;
    289 	table->dbt_expiry = expiry;
    290 
    291 	table->dbt_mem_cache = kmem_cache_create(cache_name,
    292 	    sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
    293 	    0,
    294 	    rfs4_dbe_kmem_constructor,
    295 	    rfs4_dbe_kmem_destructor,
    296 	    NULL,
    297 	    table,
    298 	    NULL,
    299 	    0);
    300 	kmem_free(cache_name, len+13);
    301 
    302 	table->dbt_debug = db->db_debug_flags;
    303 
    304 	mutex_enter(db->db_lock);
    305 	table->dbt_tnext = db->db_tables;
    306 	db->db_tables = table;
    307 	mutex_exit(db->db_lock);
    308 
    309 	rfs4_start_reaper(table);
    310 
    311 	return (table);
    312 }
    313 
    314 void
    315 rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
    316 {
    317 	rfs4_table_t *p;
    318 	rfs4_index_t *idx;
    319 
    320 	ASSERT(table->dbt_count == 0);
    321 
    322 	mutex_enter(db->db_lock);
    323 	if (table == db->db_tables)
    324 		db->db_tables = table->dbt_tnext;
    325 	else {
    326 		for (p = db->db_tables; p; p = p->dbt_tnext)
    327 			if (p->dbt_tnext == table) {
    328 				p->dbt_tnext = table->dbt_tnext;
    329 				table->dbt_tnext = NULL;
    330 				break;
    331 			}
    332 		ASSERT(p != NULL);
    333 	}
    334 	mutex_exit(db->db_lock);
    335 
    336 	/* Destroy indices */
    337 	while (table->dbt_indices) {
    338 		idx = table->dbt_indices;
    339 		table->dbt_indices = idx->dbi_inext;
    340 		rfs4_index_destroy(idx);
    341 	}
    342 
    343 	rw_destroy(table->dbt_t_lock);
    344 	mutex_destroy(table->dbt_lock);
    345 	mutex_destroy(&table->dbt_reaper_cv_lock);
    346 	cv_destroy(&table->dbt_reaper_wait);
    347 
    348 	kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
    349 	if (table->dbt_id_space)
    350 		id_space_destroy(table->dbt_id_space);
    351 	kmem_cache_destroy(table->dbt_mem_cache);
    352 	kmem_free(table, sizeof (rfs4_table_t));
    353 }
    354 
    355 rfs4_index_t *
    356 rfs4_index_create(rfs4_table_t *table, char *keyname,
    357     uint32_t (*hash)(void *),
    358     bool_t (compare)(rfs4_entry_t, void *),
    359     void *(*mkkey)(rfs4_entry_t),
    360     bool_t createable)
    361 {
    362 	rfs4_index_t *idx;
    363 
    364 	ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
    365 
    366 	idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
    367 
    368 	idx->dbi_table = table;
    369 	idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
    370 	(void) strcpy(idx->dbi_keyname, keyname);
    371 	idx->dbi_hash = hash;
    372 	idx->dbi_compare = compare;
    373 	idx->dbi_mkkey = mkkey;
    374 	idx->dbi_tblidx = table->dbt_idxcnt;
    375 	table->dbt_idxcnt++;
    376 	if (createable) {
    377 		table->dbt_ccnt++;
    378 		if (table->dbt_ccnt > 1)
    379 			panic("Table %s currently can have only have one "
    380 			    "index that will allow creation of entries",
    381 			    table->dbt_name);
    382 		idx->dbi_createable = TRUE;
    383 	} else {
    384 		idx->dbi_createable = FALSE;
    385 	}
    386 
    387 	idx->dbi_inext = table->dbt_indices;
    388 	table->dbt_indices = idx;
    389 	idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
    390 	    KM_SLEEP);
    391 
    392 	return (idx);
    393 }
    394 
    395 void
    396 rfs4_index_destroy(rfs4_index_t *idx)
    397 {
    398 	kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
    399 	kmem_free(idx->dbi_buckets,
    400 	    sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
    401 	kmem_free(idx, sizeof (rfs4_index_t));
    402 }
    403 
    404 static void
    405 rfs4_dbe_destroy(rfs4_dbe_t *entry)
    406 {
    407 	rfs4_index_t *idx;
    408 	void *key;
    409 	int i;
    410 	rfs4_bucket_t *bp;
    411 	rfs4_table_t *table = entry->dbe_table;
    412 	rfs4_link_t *l;
    413 
    414 	NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
    415 	    (CE_NOTE, "Destroying entry %p from %s",
    416 	    (void*)entry, table->dbt_name));
    417 
    418 	mutex_enter(entry->dbe_lock);
    419 	ASSERT(entry->dbe_refcnt == 0);
    420 	mutex_exit(entry->dbe_lock);
    421 
    422 	/* Unlink from all indices */
    423 	for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
    424 		l = &entry->dbe_indices[idx->dbi_tblidx];
    425 		/* check and see if we were ever linked in to the index */
    426 		if (INVALID_LINK(l)) {
    427 			ASSERT(l->next == NULL && l->prev == NULL);
    428 			continue;
    429 		}
    430 		key = idx->dbi_mkkey(entry->dbe_data);
    431 		i = HASH(idx, key);
    432 		bp = &idx->dbi_buckets[i];
    433 		ASSERT(bp->dbk_head != NULL);
    434 		DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
    435 	}
    436 
    437 	/* Destroy user data */
    438 	if (table->dbt_destroy)
    439 		(*table->dbt_destroy)(entry->dbe_data);
    440 
    441 	if (table->dbt_id_space)
    442 		id_free(table->dbt_id_space, entry->dbe_id);
    443 
    444 	mutex_enter(table->dbt_lock);
    445 	table->dbt_count--;
    446 	mutex_exit(table->dbt_lock);
    447 
    448 	/* Destroy the entry itself */
    449 	kmem_cache_free(table->dbt_mem_cache, entry);
    450 }
    451 
    452 
    453 static rfs4_dbe_t *
    454 rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
    455 {
    456 	rfs4_dbe_t *entry;
    457 	int i;
    458 
    459 	NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
    460 	    (CE_NOTE, "Creating entry in table %s", table->dbt_name));
    461 
    462 	entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
    463 
    464 	entry->dbe_refcnt = 1;
    465 	entry->dbe_invalid = FALSE;
    466 	entry->dbe_skipsearch = FALSE;
    467 	entry->dbe_time_rele = 0;
    468 	entry->dbe_id = 0;
    469 
    470 	if (table->dbt_id_space)
    471 		entry->dbe_id = id;
    472 	entry->dbe_table = table;
    473 
    474 	for (i = 0; i < table->dbt_maxcnt; i++) {
    475 		entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
    476 		entry->dbe_indices[i].entry = entry;
    477 		/*
    478 		 * We mark the entry as not indexed by setting the low
    479 		 * order bit, since address are word aligned. This has
    480 		 * the advantage of causeing a trap if the address is
    481 		 * used. After the entry is linked in to the
    482 		 * corresponding index the bit will be cleared.
    483 		 */
    484 		INVALIDATE_ADDR(entry->dbe_indices[i].entry);
    485 	}
    486 
    487 	entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
    488 	bzero(entry->dbe_data, table->dbt_usize);
    489 	entry->dbe_data->dbe = entry;
    490 
    491 	if (!(*table->dbt_create)(entry->dbe_data, data)) {
    492 		kmem_cache_free(table->dbt_mem_cache, entry);
    493 		return (NULL);
    494 	}
    495 
    496 	mutex_enter(table->dbt_lock);
    497 	table->dbt_count++;
    498 	mutex_exit(table->dbt_lock);
    499 
    500 	return (entry);
    501 }
    502 
/*
 * Look up the entry matching "key" in index "idx", optionally creating
 * it when it does not exist.
 *
 *	idx		index to search; also selects the table.
 *	key		key handed to the index's hash and compare
 *			callbacks.
 *	create		in/out.  On entry, TRUE if a missing entry may be
 *			created.  Set to FALSE when an existing entry is
 *			found; otherwise left untouched.
 *	arg		passed through rfs4_dbe_create() to the table's
 *			create callback when a new entry is built.
 *	dbsearch_type	RFS4_DBS_INVALID allows entries whose
 *			dbe_skipsearch flag is set to be matched; any
 *			other value skips them.
 *
 * Returns the user data of the found or newly created entry, with an
 * additional hold placed on the entry for the caller.  Returns NULL
 * when nothing was found and creation was not requested or not
 * possible (no create callback, index not createable, table full), or
 * when the create callback failed.
 */
rfs4_entry_t
rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
    rfs4_dbsearch_type_t dbsearch_type)
{
	int already_done;
	uint32_t i;
	rfs4_table_t *table = idx->dbi_table;
	rfs4_index_t *ip;
	rfs4_bucket_t *bp;
	rfs4_link_t *l;
	rfs4_dbe_t *entry;
	id_t id = -1;	/* -1 means "no id reserved yet" */

	i = HASH(idx, key);
	bp = &idx->dbi_buckets[i];

	NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
	    (CE_NOTE, "Searching for key %p in table %s by %s",
	    key, table->dbt_name, idx->dbi_keyname));

	rw_enter(bp->dbk_lock, RW_READER);
	/*
	 * The search is restarted from here whenever the bucket lock had
	 * to be dropped and re-acquired (as writer), since another thread
	 * may have inserted a matching entry in the meantime.
	 */
retry:
	for (l = bp->dbk_head; l; l = l->next) {
		if (l->entry->dbe_refcnt > 0 &&
		    (l->entry->dbe_skipsearch == FALSE ||
		    (l->entry->dbe_skipsearch == TRUE &&
		    dbsearch_type == RFS4_DBS_INVALID)) &&
		    (*idx->dbi_compare)(l->entry->dbe_data, key)) {
			/*
			 * Re-check the reference count under the entry
			 * lock; the reaper drops it to 0 while holding
			 * that lock, and such an entry must not be
			 * returned.
			 */
			mutex_enter(l->entry->dbe_lock);
			if (l->entry->dbe_refcnt == 0) {
				mutex_exit(l->entry->dbe_lock);
				continue;
			}

			/* place an additional hold since we are returning */
			rfs4_dbe_hold(l->entry);

			mutex_exit(l->entry->dbe_lock);
			rw_exit(bp->dbk_lock);

			*create = FALSE;

			NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
			    (CE_NOTE, "Found entry %p for %p in table %s",
			    (void *)l->entry, key, table->dbt_name));

			/* An id reserved on an earlier pass is not needed. */
			if (id != -1)
				id_free(table->dbt_id_space, id);
			return (l->entry->dbe_data);
		}
	}

	/* Not found: give up unless creation is requested and possible. */
	if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
	    table->dbt_maxentries == table->dbt_count) {
		NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
		    (CE_NOTE, "Entry for %p in %s not found",
		    key, table->dbt_name));

		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);
		return (NULL);
	}

	/* Reserve an id for the new entry if the table hands out ids. */
	if (table->dbt_id_space && id == -1) {
		/* get an id but don't sleep for it */
		id = id_alloc_nosleep(table->dbt_id_space);
		if (id == -1) {
			/*
			 * Drop the bucket lock before a potentially
			 * blocking allocation, then re-take it as writer
			 * and search again.
			 */
			rw_exit(bp->dbk_lock);

			/* get an id, ok to sleep for it here */
			id = id_alloc(table->dbt_id_space);

			rw_enter(bp->dbk_lock, RW_WRITER);
			goto retry;
		}
	}

	/* get an exclusive lock on the bucket */
	if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
		NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
		    (CE_NOTE, "Trying to upgrade lock on "
		    "hash chain %d (%p) for  %s by %s",
		    i, (void*)bp, table->dbt_name, idx->dbi_keyname));

		/*
		 * The in-place upgrade failed; the lock is released while
		 * waiting for write access, so the chain must be searched
		 * again.
		 */
		rw_exit(bp->dbk_lock);
		rw_enter(bp->dbk_lock, RW_WRITER);
		goto retry;
	}

	/* create entry */
	entry = rfs4_dbe_create(table, id, arg);
	if (entry == NULL) {
		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);

		NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
		    (CE_NOTE, "Constructor for table %s failed",
		    table->dbt_name));
		return (NULL);
	}

	/*
	 * Add one ref for entry into table's hash - only one
	 * reference added even though there may be multiple indices
	 */
	rfs4_dbe_hold(entry);
	ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
	VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);

	already_done = idx->dbi_tblidx;
	rw_exit(bp->dbk_lock);

	/*
	 * Link the new entry into every other index of the table, using
	 * the key each index derives from the entry's data.
	 * NOTE(review): ENQUEUE_IDX is presumed to take the target
	 * bucket's lock itself -- confirm against its definition.
	 */
	for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
		if (ip->dbi_tblidx == already_done)
			continue;
		l = &entry->dbe_indices[ip->dbi_tblidx];
		i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
		ASSERT(i < ip->dbi_table->dbt_len);
		bp = &ip->dbi_buckets[i];
		ENQUEUE_IDX(bp, l);
	}

	NFS4_DEBUG(
	    table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
	    (CE_NOTE, "Entry %p created for %s = %p in table %s",
	    (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));

	return (entry->dbe_data);
}
    634 
    635 /*ARGSUSED*/
    636 boolean_t
    637 rfs4_cpr_callb(void *arg, int code)
    638 {
    639 	rfs4_table_t *table = rfs4_client_tab;
    640 	rfs4_bucket_t *buckets, *bp;
    641 	rfs4_link_t *l;
    642 	rfs4_client_t *cp;
    643 	int i;
    644 
    645 	/*
    646 	 * We get called for Suspend and Resume events.
    647 	 * For the suspend case we simply don't care!  Nor do we care if
    648 	 * there are no clients.
    649 	 */
    650 	if (code == CB_CODE_CPR_CHKPT || table == NULL) {
    651 		return (B_TRUE);
    652 	}
    653 
    654 	buckets = table->dbt_indices->dbi_buckets;
    655 
    656 	/*
    657 	 * When we get this far we are in the process of
    658 	 * resuming the system from a previous suspend.
    659 	 *
    660 	 * We are going to blast through and update the
    661 	 * last_access time for all the clients and in
    662 	 * doing so extend them by one lease period.
    663 	 */
    664 	for (i = 0; i < table->dbt_len; i++) {
    665 		bp = &buckets[i];
    666 		for (l = bp->dbk_head; l; l = l->next) {
    667 			cp = (rfs4_client_t *)l->entry->dbe_data;
    668 			cp->rc_last_access = gethrestime_sec();
    669 		}
    670 	}
    671 
    672 	return (B_TRUE);
    673 }
    674 
    675 /*
    676  * Given a table, lock each of the buckets and walk all entries (in
    677  * turn locking those) and calling the provided "callout" function
    678  * with the provided parameter.  Obviously used to iterate across all
    679  * entries in a particular table via the database locking hierarchy.
    680  * Obviously the caller must not hold locks on any of the entries in
    681  * the specified table.
    682  */
    683 void
    684 rfs4_dbe_walk(rfs4_table_t *table,
    685     void (*callout)(rfs4_entry_t, void *),
    686     void *data)
    687 {
    688 	rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
    689 	rfs4_link_t *l;
    690 	rfs4_dbe_t *entry;
    691 	int i;
    692 
    693 	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
    694 	    (CE_NOTE, "Walking entries in %s", table->dbt_name));
    695 
    696 	/* Walk the buckets looking for entries to release/destroy */
    697 	for (i = 0; i < table->dbt_len; i++) {
    698 		bp = &buckets[i];
    699 		rw_enter(bp->dbk_lock, RW_READER);
    700 		for (l = bp->dbk_head; l; l = l->next) {
    701 			entry = l->entry;
    702 			mutex_enter(entry->dbe_lock);
    703 			(*callout)(entry->dbe_data, data);
    704 			mutex_exit(entry->dbe_lock);
    705 		}
    706 		rw_exit(bp->dbk_lock);
    707 	}
    708 
    709 	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
    710 	    (CE_NOTE, "Walking entries complete %s", table->dbt_name));
    711 }
    712 
    713 
/*
 * Reclaim entries of "table" that nobody but the hash table itself
 * references.
 *
 *	table		table to reap; its first index's buckets are
 *			walked.
 *	cache_time	only used in the debug messages here.
 *	desired		when non-zero (and the table is not shutting
 *			down), stop after a bucket once at least this
 *			many entries have been reaped; 0 means no limit.
 *
 * Each bucket is processed in two phases.  First, with the bucket held
 * as reader, every entry whose reference count is exactly 1 and which
 * is eligible (table shutting down, no expiry callback, or the expiry
 * callback returns true) has its count dropped to 0 under the entry
 * lock.  Second, if anything was marked, the bucket lock is upgraded
 * to writer and every zero-reference entry is unlinked and destroyed.
 *
 * During a table shutdown each bucket is revisited, with a short delay
 * between passes, until it is empty.
 */
static void
rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
{
	rfs4_index_t *idx = table->dbt_indices;
	rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
	rfs4_link_t *l, *t;
	rfs4_dbe_t *entry;
	bool_t found;
	int i;
	int count = 0;	/* total number of entries marked for reaping */

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
	    desired, cache_time, table->dbt_name));

	/* Walk the buckets looking for entries to release/destroy */
	for (i = 0; i < table->dbt_len; i++) {
		bp = &buckets[i];
		do {
			found = FALSE;
			rw_enter(bp->dbk_lock, RW_READER);
			/* Phase 1: mark reclaimable entries (refcnt 1 -> 0). */
			for (l = bp->dbk_head; l; l = l->next) {
				entry = l->entry;
				/*
				 * Examine an entry.  Ref count of 1 means
				 * that the only reference is for the hash
				 * table reference.
				 */
				if (entry->dbe_refcnt != 1)
					continue;
				/* Re-check under the entry lock. */
				mutex_enter(entry->dbe_lock);
				if ((entry->dbe_refcnt == 1) &&
				    (table->dbt_reaper_shutdown ||
				    table->dbt_expiry == NULL ||
				    (*table->dbt_expiry)(entry->dbe_data))) {
					entry->dbe_refcnt--;
					count++;
					found = TRUE;
				}
				mutex_exit(entry->dbe_lock);
			}
			/* Phase 2: unlink and destroy what was marked. */
			if (found) {
				/*
				 * If the in-place upgrade fails, drop the
				 * lock and wait for write access instead.
				 */
				if (!rw_tryupgrade(bp->dbk_lock)) {
					rw_exit(bp->dbk_lock);
					rw_enter(bp->dbk_lock, RW_WRITER);
				}

				l = bp->dbk_head;
				while (l) {
					t = l;
					entry = t->entry;
					/*
					 * Advance before the current link
					 * is unlinked and its entry freed.
					 */
					l = l->next;
					if (entry->dbe_refcnt == 0) {
						DEQUEUE(bp->dbk_head, t);
						t->next = NULL;
						t->prev = NULL;
						/*
						 * Mark the link as no longer
						 * indexed so that
						 * rfs4_dbe_destroy() skips it
						 * for this index.
						 */
						INVALIDATE_ADDR(t->entry);
						rfs4_dbe_destroy(entry);
					}
				}
			}
			rw_exit(bp->dbk_lock);
			/*
			 * delay slightly if there is more work to do
			 * with the expectation that other reaper
			 * threads are freeing data structures as well
			 * and in turn will reduce ref counts on
			 * entries in this table allowing them to be
			 * released.  This is only done in the
			 * instance that the tables are being shut down.
			 */
			if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
				delay(hz/100);
		/*
		 * If this is a table shutdown, keep going until
		 * everything is gone
		 */
		} while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);

		/* Outside of shutdown, stop once enough has been reaped. */
		if (!table->dbt_reaper_shutdown && desired && count >= desired)
			break;
	}

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
	    count, cache_time, table->dbt_name));
}
    801 
    802 
    803 static void
    804 reaper_thread(caddr_t *arg)
    805 {
    806 	rfs4_table_t *table = (rfs4_table_t *)arg;
    807 	clock_t rc, time, wakeup;
    808 
    809 	NFS4_DEBUG(table->dbt_debug,
    810 	    (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));
    811 
    812 	CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
    813 	    callb_generic_cpr, "nfsv4Reaper");
    814 
    815 	time = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
    816 	wakeup = SEC_TO_TICK(time);
    817 
    818 	mutex_enter(&table->dbt_reaper_cv_lock);
    819 	do {
    820 		CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
    821 		rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
    822 		    &table->dbt_reaper_cv_lock, wakeup, TR_CLOCK_TICK);
    823 		CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
    824 		    &table->dbt_reaper_cv_lock);
    825 		rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
    826 	} while (rc != 0 && table->dbt_reaper_shutdown == FALSE);
    827 
    828 	CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
    829 
    830 	NFS4_DEBUG(table->dbt_debug,
    831 	    (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
    832 
    833 	/* Notify the database shutdown processing that the table is shutdown */
    834 	mutex_enter(table->dbt_db->db_lock);
    835 	table->dbt_db->db_shutdown_count--;
    836 	cv_signal(&table->dbt_db->db_shutdown_wait);
    837 	mutex_exit(table->dbt_db->db_lock);
    838 }
    839 
    840 static void
    841 rfs4_start_reaper(rfs4_table_t *table)
    842 {
    843 	if (table->dbt_max_cache_time == 0)
    844 		return;
    845 
    846 	(void) thread_create(NULL, 0, reaper_thread, table, 0, &p0, TS_RUN,
    847 	    minclsyspri);
    848 }
    849 
    850 #ifdef DEBUG
    851 void
    852 rfs4_dbe_debug(rfs4_dbe_t *entry)
    853 {
    854 	cmn_err(CE_NOTE, "Entry %p from table %s",
    855 	    (void *)entry, entry->dbe_table->dbt_name);
    856 	cmn_err(CE_CONT, "\trefcnt = %d id = %d",
    857 	    entry->dbe_refcnt, entry->dbe_id);
    858 }
    859 #endif
    860