Home | History | Annotate | Download | only in nfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/systm.h>
     27 #include <sys/cmn_err.h>
     28 #include <sys/kmem.h>
     29 #include <sys/disp.h>
     30 #include <sys/id_space.h>
     31 #include <sys/atomic.h>
     32 #include <rpc/rpc.h>
     33 #include <nfs/nfs4.h>
     34 #include <nfs/nfs4_db_impl.h>
     35 #include <sys/sdt.h>
     36 
/*
 * Default reap interval.  A table's effective interval is the smaller
 * of this value and the table's own maximum cache time.
 */
static int rfs4_reap_interval = RFS4_REAP_INTERVAL;

/* Forward declarations for file-local helpers defined further down. */
static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
static void rfs4_dbe_destroy(rfs4_dbe_t *);
static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
static void rfs4_start_reaper(rfs4_table_t *);

/*
 * Tunables controlling how aggressively a table is reaped as it fills.
 *
 * t_lowat - integer percentage of table entries	/etc/system only
 * t_hiwat - integer percentage of table entries	/etc/system only
 * t_lreap - integer percentage of table reap time	mdb or /etc/system
 * t_hreap - integer percentage of table reap time	mdb or /etc/system
 *
 * The watermarks are converted to entry counts when a table is created;
 * the reap percentages are applied each time the reap interval is
 * recomputed.  None of them may be zero (this is ASSERTed where used).
 */
uint32_t	t_lowat = 50;	/* reap at t_lreap when id's in use hit 50% */
uint32_t	t_hiwat = 75;	/* reap at t_hreap when id's in use hit 75% */
time_t		t_lreap = 50;	/* default to 50% of table's reap interval */
time_t		t_hreap = 10;	/* default to 10% of table's reap interval */
     54 
     55 id_t
     56 rfs4_dbe_getid(rfs4_dbe_t *entry)
     57 {
     58 	return (entry->dbe_id);
     59 }
     60 
     61 void
     62 rfs4_dbe_hold(rfs4_dbe_t *entry)
     63 {
     64 	atomic_add_32(&entry->dbe_refcnt, 1);
     65 }
     66 
     67 /*
     68  * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
     69  */
     70 void
     71 rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
     72 {
     73 	atomic_add_32(&entry->dbe_refcnt, -1);
     74 }
     75 
     76 
     77 uint32_t
     78 rfs4_dbe_refcnt(rfs4_dbe_t *entry)
     79 {
     80 	return (entry->dbe_refcnt);
     81 }
     82 
     83 /*
     84  * Mark an entry such that the dbsearch will skip it.
     85  * Caller does not want this entry to be found any longer
     86  */
     87 void
     88 rfs4_dbe_invalidate(rfs4_dbe_t *entry)
     89 {
     90 	entry->dbe_invalid = TRUE;
     91 	entry->dbe_skipsearch = TRUE;
     92 }
     93 
     94 /*
     95  * Is this entry invalid?
     96  */
     97 bool_t
     98 rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
     99 {
    100 	return (entry->dbe_invalid);
    101 }
    102 
    103 time_t
    104 rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
    105 {
    106 	return (entry->dbe_time_rele);
    107 }
    108 
    109 /*
    110  * Use these to temporarily hide/unhide a db entry.
    111  */
    112 void
    113 rfs4_dbe_hide(rfs4_dbe_t *entry)
    114 {
    115 	rfs4_dbe_lock(entry);
    116 	entry->dbe_skipsearch = TRUE;
    117 	rfs4_dbe_unlock(entry);
    118 }
    119 
    120 void
    121 rfs4_dbe_unhide(rfs4_dbe_t *entry)
    122 {
    123 	rfs4_dbe_lock(entry);
    124 	entry->dbe_skipsearch = FALSE;
    125 	rfs4_dbe_unlock(entry);
    126 }
    127 
    128 void
    129 rfs4_dbe_rele(rfs4_dbe_t *entry)
    130 {
    131 	mutex_enter(entry->dbe_lock);
    132 	ASSERT(entry->dbe_refcnt > 1);
    133 	atomic_add_32(&entry->dbe_refcnt, -1);
    134 	entry->dbe_time_rele = gethrestime_sec();
    135 	mutex_exit(entry->dbe_lock);
    136 }
    137 
    138 void
    139 rfs4_dbe_lock(rfs4_dbe_t *entry)
    140 {
    141 	mutex_enter(entry->dbe_lock);
    142 }
    143 
    144 void
    145 rfs4_dbe_unlock(rfs4_dbe_t *entry)
    146 {
    147 	mutex_exit(entry->dbe_lock);
    148 }
    149 
    150 bool_t
    151 rfs4_dbe_islocked(rfs4_dbe_t *entry)
    152 {
    153 	return (mutex_owned(entry->dbe_lock));
    154 }
    155 
    156 clock_t
    157 rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
    158 {
    159 	return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
    160 }
    161 
    162 void
    163 rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
    164 {
    165 	cv_broadcast(entry->dbe_cv);
    166 }
    167 
    168 /* ARGSUSED */
    169 static int
    170 rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
    171 {
    172 	rfs4_dbe_t *entry = obj;
    173 
    174 	mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
    175 	cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
    176 
    177 	return (0);
    178 }
    179 
    180 static void
    181 rfs4_dbe_kmem_destructor(void *obj, void *private)
    182 {
    183 	rfs4_dbe_t *entry = obj;
    184 	/*LINTED*/
    185 	rfs4_table_t *table = private;
    186 
    187 	mutex_destroy(entry->dbe_lock);
    188 	cv_destroy(entry->dbe_cv);
    189 }
    190 
    191 rfs4_database_t *
    192 rfs4_database_create(uint32_t flags)
    193 {
    194 	rfs4_database_t *db;
    195 
    196 	db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
    197 	mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
    198 	db->db_tables = NULL;
    199 	db->db_debug_flags = flags;
    200 	db->db_shutdown_count = 0;
    201 	cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
    202 	return (db);
    203 }
    204 
    205 
    206 /*
    207  * The reaper threads that have been created for the tables in this
    208  * database must be stopped and the entries in the tables released.
    209  * Each table will be marked as "shutdown" and the reaper threads
    210  * poked and they will see that a shutdown is in progress and cleanup
    211  * and exit.  This function waits for all reaper threads to stop
    212  * before returning to the caller.
    213  */
    214 void
    215 rfs4_database_shutdown(rfs4_database_t *db)
    216 {
    217 	rfs4_table_t *table;
    218 
    219 	mutex_enter(db->db_lock);
    220 	for (table = db->db_tables; table; table = table->dbt_tnext) {
    221 		table->dbt_reaper_shutdown = TRUE;
    222 		mutex_enter(&table->dbt_reaper_cv_lock);
    223 		cv_broadcast(&table->dbt_reaper_wait);
    224 		db->db_shutdown_count++;
    225 		mutex_exit(&table->dbt_reaper_cv_lock);
    226 	}
    227 	while (db->db_shutdown_count > 0) {
    228 		cv_wait(&db->db_shutdown_wait, db->db_lock);
    229 	}
    230 	mutex_exit(db->db_lock);
    231 }
    232 
    233 /*
    234  * Given a database that has been "shutdown" by the function above all
    235  * of the table tables are destroyed and then the database itself
    236  * freed.
    237  */
    238 void
    239 rfs4_database_destroy(rfs4_database_t *db)
    240 {
    241 	rfs4_table_t *next, *tmp;
    242 
    243 	for (next = db->db_tables; next; ) {
    244 		tmp = next;
    245 		next = tmp->dbt_tnext;
    246 		rfs4_table_destroy(db, tmp);
    247 	}
    248 
    249 	mutex_destroy(db->db_lock);
    250 	kmem_free(db, sizeof (rfs4_database_t));
    251 }
    252 
    253 rfs4_table_t *
    254 rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
    255     uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
    256     void (*destroy)(rfs4_entry_t),
    257     bool_t (*expiry)(rfs4_entry_t),
    258     uint32_t size, uint32_t hashsize,
    259     uint32_t maxentries, id_t start)
    260 {
    261 	rfs4_table_t	*table;
    262 	int		 len;
    263 	char		*cache_name;
    264 	char		*id_name;
    265 
    266 	table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
    267 	table->dbt_db = db;
    268 	rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
    269 	mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
    270 	mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
    271 	cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
    272 
    273 	len = strlen(tabname);
    274 	table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
    275 	cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
    276 	(void) strcpy(table->dbt_name, tabname);
    277 	(void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
    278 	table->dbt_max_cache_time = max_cache_time;
    279 	table->dbt_usize = size;
    280 	table->dbt_len = hashsize;
    281 	table->dbt_count = 0;
    282 	table->dbt_idxcnt = 0;
    283 	table->dbt_ccnt = 0;
    284 	table->dbt_maxcnt = idxcnt;
    285 	table->dbt_indices = NULL;
    286 	table->dbt_id_space = NULL;
    287 	table->dbt_reaper_shutdown = FALSE;
    288 
    289 	if (start >= 0) {
    290 		if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
    291 			maxentries = INT32_MAX - start;
    292 		id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
    293 		(void) sprintf(id_name, "%s_id_space", table->dbt_name);
    294 		table->dbt_id_space = id_space_create(id_name, start,
    295 		    maxentries + start);
    296 		kmem_free(id_name, len + 10);
    297 	}
    298 	ASSERT(t_lowat != 0);
    299 	table->dbt_id_lwat = (maxentries * t_lowat) / 100;
    300 	ASSERT(t_hiwat != 0);
    301 	table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
    302 	table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
    303 	table->dbt_maxentries = maxentries;
    304 	table->dbt_create = create;
    305 	table->dbt_destroy = destroy;
    306 	table->dbt_expiry = expiry;
    307 
    308 	table->dbt_mem_cache = kmem_cache_create(cache_name,
    309 	    sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
    310 	    0,
    311 	    rfs4_dbe_kmem_constructor,
    312 	    rfs4_dbe_kmem_destructor,
    313 	    NULL,
    314 	    table,
    315 	    NULL,
    316 	    0);
    317 	kmem_free(cache_name, len+13);
    318 
    319 	table->dbt_debug = db->db_debug_flags;
    320 
    321 	mutex_enter(db->db_lock);
    322 	table->dbt_tnext = db->db_tables;
    323 	db->db_tables = table;
    324 	mutex_exit(db->db_lock);
    325 
    326 	rfs4_start_reaper(table);
    327 
    328 	return (table);
    329 }
    330 
    331 void
    332 rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
    333 {
    334 	rfs4_table_t *p;
    335 	rfs4_index_t *idx;
    336 
    337 	ASSERT(table->dbt_count == 0);
    338 
    339 	mutex_enter(db->db_lock);
    340 	if (table == db->db_tables)
    341 		db->db_tables = table->dbt_tnext;
    342 	else {
    343 		for (p = db->db_tables; p; p = p->dbt_tnext)
    344 			if (p->dbt_tnext == table) {
    345 				p->dbt_tnext = table->dbt_tnext;
    346 				table->dbt_tnext = NULL;
    347 				break;
    348 			}
    349 		ASSERT(p != NULL);
    350 	}
    351 	mutex_exit(db->db_lock);
    352 
    353 	/* Destroy indices */
    354 	while (table->dbt_indices) {
    355 		idx = table->dbt_indices;
    356 		table->dbt_indices = idx->dbi_inext;
    357 		rfs4_index_destroy(idx);
    358 	}
    359 
    360 	rw_destroy(table->dbt_t_lock);
    361 	mutex_destroy(table->dbt_lock);
    362 	mutex_destroy(&table->dbt_reaper_cv_lock);
    363 	cv_destroy(&table->dbt_reaper_wait);
    364 
    365 	kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
    366 	if (table->dbt_id_space)
    367 		id_space_destroy(table->dbt_id_space);
    368 	kmem_cache_destroy(table->dbt_mem_cache);
    369 	kmem_free(table, sizeof (rfs4_table_t));
    370 }
    371 
    372 rfs4_index_t *
    373 rfs4_index_create(rfs4_table_t *table, char *keyname,
    374     uint32_t (*hash)(void *),
    375     bool_t (compare)(rfs4_entry_t, void *),
    376     void *(*mkkey)(rfs4_entry_t),
    377     bool_t createable)
    378 {
    379 	rfs4_index_t *idx;
    380 
    381 	ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
    382 
    383 	idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
    384 
    385 	idx->dbi_table = table;
    386 	idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
    387 	(void) strcpy(idx->dbi_keyname, keyname);
    388 	idx->dbi_hash = hash;
    389 	idx->dbi_compare = compare;
    390 	idx->dbi_mkkey = mkkey;
    391 	idx->dbi_tblidx = table->dbt_idxcnt;
    392 	table->dbt_idxcnt++;
    393 	if (createable) {
    394 		table->dbt_ccnt++;
    395 		if (table->dbt_ccnt > 1)
    396 			panic("Table %s currently can have only have one "
    397 			    "index that will allow creation of entries",
    398 			    table->dbt_name);
    399 		idx->dbi_createable = TRUE;
    400 	} else {
    401 		idx->dbi_createable = FALSE;
    402 	}
    403 
    404 	idx->dbi_inext = table->dbt_indices;
    405 	table->dbt_indices = idx;
    406 	idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
    407 	    KM_SLEEP);
    408 
    409 	return (idx);
    410 }
    411 
    412 void
    413 rfs4_index_destroy(rfs4_index_t *idx)
    414 {
    415 	kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
    416 	kmem_free(idx->dbi_buckets,
    417 	    sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
    418 	kmem_free(idx, sizeof (rfs4_index_t));
    419 }
    420 
    421 static void
    422 rfs4_dbe_destroy(rfs4_dbe_t *entry)
    423 {
    424 	rfs4_index_t *idx;
    425 	void *key;
    426 	int i;
    427 	rfs4_bucket_t *bp;
    428 	rfs4_table_t *table = entry->dbe_table;
    429 	rfs4_link_t *l;
    430 
    431 	NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
    432 	    (CE_NOTE, "Destroying entry %p from %s",
    433 	    (void*)entry, table->dbt_name));
    434 
    435 	mutex_enter(entry->dbe_lock);
    436 	ASSERT(entry->dbe_refcnt == 0);
    437 	mutex_exit(entry->dbe_lock);
    438 
    439 	/* Unlink from all indices */
    440 	for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
    441 		l = &entry->dbe_indices[idx->dbi_tblidx];
    442 		/* check and see if we were ever linked in to the index */
    443 		if (INVALID_LINK(l)) {
    444 			ASSERT(l->next == NULL && l->prev == NULL);
    445 			continue;
    446 		}
    447 		key = idx->dbi_mkkey(entry->dbe_data);
    448 		i = HASH(idx, key);
    449 		bp = &idx->dbi_buckets[i];
    450 		ASSERT(bp->dbk_head != NULL);
    451 		DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
    452 	}
    453 
    454 	/* Destroy user data */
    455 	if (table->dbt_destroy)
    456 		(*table->dbt_destroy)(entry->dbe_data);
    457 
    458 	if (table->dbt_id_space)
    459 		id_free(table->dbt_id_space, entry->dbe_id);
    460 
    461 	mutex_enter(table->dbt_lock);
    462 	table->dbt_count--;
    463 	mutex_exit(table->dbt_lock);
    464 
    465 	/* Destroy the entry itself */
    466 	kmem_cache_free(table->dbt_mem_cache, entry);
    467 }
    468 
    469 
    470 static rfs4_dbe_t *
    471 rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
    472 {
    473 	rfs4_dbe_t *entry;
    474 	int i;
    475 
    476 	NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
    477 	    (CE_NOTE, "Creating entry in table %s", table->dbt_name));
    478 
    479 	entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
    480 
    481 	entry->dbe_refcnt = 1;
    482 	entry->dbe_invalid = FALSE;
    483 	entry->dbe_skipsearch = FALSE;
    484 	entry->dbe_time_rele = 0;
    485 	entry->dbe_id = 0;
    486 
    487 	if (table->dbt_id_space)
    488 		entry->dbe_id = id;
    489 	entry->dbe_table = table;
    490 
    491 	for (i = 0; i < table->dbt_maxcnt; i++) {
    492 		entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
    493 		entry->dbe_indices[i].entry = entry;
    494 		/*
    495 		 * We mark the entry as not indexed by setting the low
    496 		 * order bit, since address are word aligned. This has
    497 		 * the advantage of causeing a trap if the address is
    498 		 * used. After the entry is linked in to the
    499 		 * corresponding index the bit will be cleared.
    500 		 */
    501 		INVALIDATE_ADDR(entry->dbe_indices[i].entry);
    502 	}
    503 
    504 	entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
    505 	bzero(entry->dbe_data, table->dbt_usize);
    506 	entry->dbe_data->dbe = entry;
    507 
    508 	if (!(*table->dbt_create)(entry->dbe_data, data)) {
    509 		kmem_cache_free(table->dbt_mem_cache, entry);
    510 		return (NULL);
    511 	}
    512 
    513 	mutex_enter(table->dbt_lock);
    514 	table->dbt_count++;
    515 	mutex_exit(table->dbt_lock);
    516 
    517 	return (entry);
    518 }
    519 
    520 static void
    521 rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
    522 {
    523 	clock_t		tabreap;
    524 	clock_t		reap_int;
    525 	uint32_t	in_use;
    526 
    527 	/*
    528 	 * Adjust the table's reap interval based on the
    529 	 * number of id's currently in use. Each table's
    530 	 * default remains the same if id usage subsides.
    531 	 */
    532 	ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
    533 	tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
    534 
    535 	in_use = table->dbt_count + 1;	/* see rfs4_dbe_create */
    536 	if (in_use >= table->dbt_id_hwat) {
    537 		ASSERT(t_hreap != 0);
    538 		reap_int = (tabreap * t_hreap) / 100;
    539 	} else if (in_use >= table->dbt_id_lwat) {
    540 		ASSERT(t_lreap != 0);
    541 		reap_int = (tabreap * t_lreap) / 100;
    542 	} else {
    543 		reap_int = tabreap;
    544 	}
    545 	table->dbt_id_reap = reap_int;
    546 	DTRACE_PROBE2(table__reap__interval, char *,
    547 	    table->dbt_name, time_t, table->dbt_id_reap);
    548 }
    549 
/*
 * Look up "key" in index "idx" and optionally create a new entry when
 * no match exists.
 *
 *	idx		index to search
 *	key		key to hash and compare against
 *	create		in: whether a missing entry may be created;
 *			out: set to FALSE when an existing entry is found
 *			(it is not written on any other path)
 *	arg		passed to the table's create callback
 *	dbsearch_type	RFS4_DBS_INVALID also matches entries that are
 *			marked to be skipped by searches
 *
 * Returns the user data of the found or newly created entry with an
 * extra hold placed on the entry, or NULL if there is no match and
 * creation is not requested, not permitted on this index, the table is
 * full, or the create callback fails.
 *
 * When the table has an id space, an id is allocated (possibly
 * sleeping) with the bucket lock dropped, after which the search is
 * repeated; an id that turns out not to be needed is freed again.
 */
rfs4_entry_t
rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
    rfs4_dbsearch_type_t dbsearch_type)
{
	int		 already_done;
	uint32_t	 i;
	rfs4_table_t	*table = idx->dbi_table;
	rfs4_index_t	*ip;
	rfs4_bucket_t	*bp;
	rfs4_link_t	*l;
	rfs4_dbe_t	*entry;
	id_t		 id = -1;	/* -1: no id allocated yet */

	i = HASH(idx, key);
	bp = &idx->dbi_buckets[i];

	NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
	    (CE_NOTE, "Searching for key %p in table %s by %s",
	    key, table->dbt_name, idx->dbi_keyname));

	/*
	 * The first pass runs with the bucket read-locked; passes reached
	 * through "goto retry" hold it as writer.
	 */
	rw_enter(bp->dbk_lock, RW_READER);
retry:
	for (l = bp->dbk_head; l; l = l->next) {
		/*
		 * Candidate: still referenced, visible to this kind of
		 * search, and matching the key.
		 */
		if (l->entry->dbe_refcnt > 0 &&
		    (l->entry->dbe_skipsearch == FALSE ||
		    (l->entry->dbe_skipsearch == TRUE &&
		    dbsearch_type == RFS4_DBS_INVALID)) &&
		    (*idx->dbi_compare)(l->entry->dbe_data, key)) {
			mutex_enter(l->entry->dbe_lock);
			/*
			 * Re-check under the entry lock; the count may have
			 * dropped to zero since the unlocked test above.
			 */
			if (l->entry->dbe_refcnt == 0) {
				mutex_exit(l->entry->dbe_lock);
				continue;
			}

			/* place an additional hold since we are returning */
			rfs4_dbe_hold(l->entry);

			mutex_exit(l->entry->dbe_lock);
			rw_exit(bp->dbk_lock);

			*create = FALSE;

			NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
			    (CE_NOTE, "Found entry %p for %p in table %s",
			    (void *)l->entry, key, table->dbt_name));

			/* give back an id allocated on an earlier pass */
			if (id != -1)
				id_free(table->dbt_id_space, id);
			return (l->entry->dbe_data);
		}
	}

	/*
	 * No match.  Give up unless creation was requested, the table
	 * has a create callback, this index allows creation and the
	 * table is not already at its entry limit.
	 */
	if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
	    table->dbt_maxentries == table->dbt_count) {
		NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
		    (CE_NOTE, "Entry for %p in %s not found",
		    key, table->dbt_name));

		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);
		return (NULL);
	}

	/*
	 * The table hands out ids and we do not have one yet: drop the
	 * bucket lock, allocate the id, then search again as writer since
	 * the chain may have changed while it was unlocked.
	 */
	if (table->dbt_id_space && id == -1) {
		rw_exit(bp->dbk_lock);

		/* get an id, ok to sleep for it here */
		id = id_alloc(table->dbt_id_space);
		ASSERT(id != -1);

		/* id usage changed; recompute the table's reap interval */
		mutex_enter(&table->dbt_reaper_cv_lock);
		rfs4_dbe_tabreap_adjust(table);
		mutex_exit(&table->dbt_reaper_cv_lock);

		rw_enter(bp->dbk_lock, RW_WRITER);
		goto retry;
	}

	/* get an exclusive lock on the bucket */
	if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
		NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
		    (CE_NOTE, "Trying to upgrade lock on "
		    "hash chain %d (%p) for  %s by %s",
		    i, (void*)bp, table->dbt_name, idx->dbi_keyname));

		/*
		 * The upgrade failed, so the lock has to be dropped and
		 * re-taken as writer; search again afterwards.
		 */
		rw_exit(bp->dbk_lock);
		rw_enter(bp->dbk_lock, RW_WRITER);
		goto retry;
	}

	/* create entry */
	entry = rfs4_dbe_create(table, id, arg);
	if (entry == NULL) {
		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);

		NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
		    (CE_NOTE, "Constructor for table %s failed",
		    table->dbt_name));
		return (NULL);
	}

	/*
	 * Add one ref for entry into table's hash - only one
	 * reference added even though there may be multiple indices
	 */
	rfs4_dbe_hold(entry);
	ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
	VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);

	already_done = idx->dbi_tblidx;
	rw_exit(bp->dbk_lock);

	/* Link the new entry into each of the table's other indices. */
	for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
		if (ip->dbi_tblidx == already_done)
			continue;
		l = &entry->dbe_indices[ip->dbi_tblidx];
		i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
		ASSERT(i < ip->dbi_table->dbt_len);
		bp = &ip->dbi_buckets[i];
		ENQUEUE_IDX(bp, l);
	}

	NFS4_DEBUG(
	    table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
	    (CE_NOTE, "Entry %p created for %s = %p in table %s",
	    (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));

	return (entry->dbe_data);
}
    682 
    683 /*ARGSUSED*/
    684 boolean_t
    685 rfs4_cpr_callb(void *arg, int code)
    686 {
    687 	rfs4_table_t *table = rfs4_client_tab;
    688 	rfs4_bucket_t *buckets, *bp;
    689 	rfs4_link_t *l;
    690 	rfs4_client_t *cp;
    691 	int i;
    692 
    693 	/*
    694 	 * We get called for Suspend and Resume events.
    695 	 * For the suspend case we simply don't care!  Nor do we care if
    696 	 * there are no clients.
    697 	 */
    698 	if (code == CB_CODE_CPR_CHKPT || table == NULL) {
    699 		return (B_TRUE);
    700 	}
    701 
    702 	buckets = table->dbt_indices->dbi_buckets;
    703 
    704 	/*
    705 	 * When we get this far we are in the process of
    706 	 * resuming the system from a previous suspend.
    707 	 *
    708 	 * We are going to blast through and update the
    709 	 * last_access time for all the clients and in
    710 	 * doing so extend them by one lease period.
    711 	 */
    712 	for (i = 0; i < table->dbt_len; i++) {
    713 		bp = &buckets[i];
    714 		for (l = bp->dbk_head; l; l = l->next) {
    715 			cp = (rfs4_client_t *)l->entry->dbe_data;
    716 			cp->rc_last_access = gethrestime_sec();
    717 		}
    718 	}
    719 
    720 	return (B_TRUE);
    721 }
    722 
    723 /*
    724  * Given a table, lock each of the buckets and walk all entries (in
    725  * turn locking those) and calling the provided "callout" function
    726  * with the provided parameter.  Obviously used to iterate across all
    727  * entries in a particular table via the database locking hierarchy.
    728  * Obviously the caller must not hold locks on any of the entries in
    729  * the specified table.
    730  */
    731 void
    732 rfs4_dbe_walk(rfs4_table_t *table,
    733     void (*callout)(rfs4_entry_t, void *),
    734     void *data)
    735 {
    736 	rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
    737 	rfs4_link_t *l;
    738 	rfs4_dbe_t *entry;
    739 	int i;
    740 
    741 	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
    742 	    (CE_NOTE, "Walking entries in %s", table->dbt_name));
    743 
    744 	/* Walk the buckets looking for entries to release/destroy */
    745 	for (i = 0; i < table->dbt_len; i++) {
    746 		bp = &buckets[i];
    747 		rw_enter(bp->dbk_lock, RW_READER);
    748 		for (l = bp->dbk_head; l; l = l->next) {
    749 			entry = l->entry;
    750 			mutex_enter(entry->dbe_lock);
    751 			(*callout)(entry->dbe_data, data);
    752 			mutex_exit(entry->dbe_lock);
    753 		}
    754 		rw_exit(bp->dbk_lock);
    755 	}
    756 
    757 	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
    758 	    (CE_NOTE, "Walking entries complete %s", table->dbt_name));
    759 }
    760 
    761 
/*
 * Reap entries from "table" by walking the buckets of its first index.
 *
 *	cache_time	only reported in the debug messages here; the
 *			expiry decision is made by the table's expiry
 *			callback
 *	desired		stop once at least this many entries have been
 *			reaped (checked after each bucket); 0 means no
 *			limit.  Ignored while the table is shutting down.
 *
 * An entry is reaped when the hash table holds its only reference and
 * either the table is shutting down, the table has no expiry callback,
 * or the callback says the entry has expired.  During shutdown each
 * bucket is revisited, with a short delay between attempts, until it
 * is empty.
 */
static void
rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
{
	rfs4_index_t *idx = table->dbt_indices;
	rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
	rfs4_link_t *l, *t;
	rfs4_dbe_t *entry;
	bool_t found;
	int i;
	int count = 0;	/* entries selected for destruction so far */

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
	    desired, cache_time, table->dbt_name));

	/* Walk the buckets looking for entries to release/destroy */
	for (i = 0; i < table->dbt_len; i++) {
		bp = &buckets[i];
		do {
			found = FALSE;
			/*
			 * Pass 1 (reader lock): drop the hash table's
			 * reference on every reapable entry, leaving it
			 * with a count of zero.
			 */
			rw_enter(bp->dbk_lock, RW_READER);
			for (l = bp->dbk_head; l; l = l->next) {
				entry = l->entry;
				/*
				 * Examine an entry.  Ref count of 1 means
				 * that the only reference is for the hash
				 * table reference.
				 */
				if (entry->dbe_refcnt != 1)
					continue;
				mutex_enter(entry->dbe_lock);
				/* re-check now that the entry is locked */
				if ((entry->dbe_refcnt == 1) &&
				    (table->dbt_reaper_shutdown ||
				    table->dbt_expiry == NULL ||
				    (*table->dbt_expiry)(entry->dbe_data))) {
					entry->dbe_refcnt--;
					count++;
					found = TRUE;
				}
				mutex_exit(entry->dbe_lock);
			}
			/*
			 * Pass 2 (writer lock): unlink and destroy every
			 * entry on the chain whose count is now zero.
			 */
			if (found) {
				if (!rw_tryupgrade(bp->dbk_lock)) {
					rw_exit(bp->dbk_lock);
					rw_enter(bp->dbk_lock, RW_WRITER);
				}

				l = bp->dbk_head;
				while (l) {
					t = l;
					entry = t->entry;
					/* advance first; t may be unlinked */
					l = l->next;
					if (entry->dbe_refcnt == 0) {
						DEQUEUE(bp->dbk_head, t);
						t->next = NULL;
						t->prev = NULL;
						/*
						 * Mark this link as unlinked
						 * so rfs4_dbe_destroy() skips
						 * this index.
						 */
						INVALIDATE_ADDR(t->entry);
						rfs4_dbe_destroy(entry);
					}
				}
			}
			rw_exit(bp->dbk_lock);
			/*
			 * delay slightly if there is more work to do
			 * with the expectation that other reaper
			 * threads are freeing data structures as well
			 * and in turn will reduce ref counts on
			 * entries in this table allowing them to be
			 * released.  This is only done in the
			 * instance that the tables are being shut down.
			 */
			if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
				delay(hz/100);
		/*
		 * If this is a table shutdown, keep going until
		 * everything is gone
		 */
		} while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);

		/* Outside of shutdown, stop once the requested number is met */
		if (!table->dbt_reaper_shutdown && desired && count >= desired)
			break;
	}

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
	    count, cache_time, table->dbt_name));
}
    849 
/*
 * Body of a table's reaper thread; "arg" is the rfs4_table_t to reap.
 *
 * The thread repeatedly waits on the table's reaper condition variable
 * for up to the table's current reap interval (in seconds) and then
 * reaps the table.  The loop ends once the wait returns 0 or the table
 * has been flagged for shutdown; a final reap has already been done at
 * that point.  On the way out the thread decrements the database's
 * shutdown count and signals the shutdown waiter.
 *
 * The reap itself runs with the table's reaper cv lock held.
 */
static void
reaper_thread(caddr_t *arg)
{
	rfs4_table_t	*table = (rfs4_table_t *)arg;
	clock_t		 rc;

	NFS4_DEBUG(table->dbt_debug,
	    (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));

	CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
	    callb_generic_cpr, "nfsv4Reaper");

	mutex_enter(&table->dbt_reaper_cv_lock);
	do {
		/* The wait is bracketed as a CPR-safe region. */
		CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
		rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
		    &table->dbt_reaper_cv_lock,
		    SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
		CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
		    &table->dbt_reaper_cv_lock);
		rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
		/*
		 * NOTE(review): rc == 0 presumably means the wait was
		 * interrupted by a signal - confirm against the
		 * cv_reltimedwait_sig documentation.
		 */
	} while (rc != 0 && table->dbt_reaper_shutdown == FALSE);

	/*
	 * NOTE(review): there is no explicit mutex_exit() of
	 * dbt_reaper_cv_lock; presumably CALLB_CPR_EXIT drops it - confirm.
	 */
	CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);

	NFS4_DEBUG(table->dbt_debug,
	    (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));

	/* Notify the database shutdown processing that the table is shutdown */
	mutex_enter(table->dbt_db->db_lock);
	table->dbt_db->db_shutdown_count--;
	cv_signal(&table->dbt_db->db_shutdown_wait);
	mutex_exit(table->dbt_db->db_lock);
}
    884 
    885 static void
    886 rfs4_start_reaper(rfs4_table_t *table)
    887 {
    888 	if (table->dbt_max_cache_time == 0)
    889 		return;
    890 
    891 	(void) thread_create(NULL, 0, reaper_thread, table, 0, &p0, TS_RUN,
    892 	    minclsyspri);
    893 }
    894 
    895 #ifdef DEBUG
    896 void
    897 rfs4_dbe_debug(rfs4_dbe_t *entry)
    898 {
    899 	cmn_err(CE_NOTE, "Entry %p from table %s",
    900 	    (void *)entry, entry->dbe_table->dbt_name);
    901 	cmn_err(CE_CONT, "\trefcnt = %d id = %d",
    902 	    entry->dbe_refcnt, entry->dbe_id);
    903 }
    904 #endif
    905