Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * The 512-byte leaf is broken into 32 16-byte chunks.
     28  * chunk number n means l_chunk[n], even though the header precedes it.
     29  * the names are stored null-terminated.
     30  */
     31 
     32 #include <sys/spa.h>
     33 #include <sys/dmu.h>
     34 #include <sys/zfs_context.h>
     35 #include <sys/fs/zfs.h>
     36 #include <sys/zap.h>
     37 #include <sys/zap_impl.h>
     38 #include <sys/zap_leaf.h>
     39 
     40 static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
     41 
     42 #define	CHAIN_END 0xffff /* end of the chunk chain */
     43 
     44 /* half the (current) minimum block size */
     45 #define	MAX_ARRAY_BYTES (8<<10)
     46 
     47 #define	LEAF_HASH(l, h) \
     48 	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
     49 	((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
     50 
     51 #define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
     52 
     53 
     54 static void
     55 zap_memset(void *a, int c, size_t n)
     56 {
     57 	char *cp = a;
     58 	char *cpend = cp + n;
     59 
     60 	while (cp < cpend)
     61 		*cp++ = c;
     62 }
     63 
     64 static void
     65 stv(int len, void *addr, uint64_t value)
     66 {
     67 	switch (len) {
     68 	case 1:
     69 		*(uint8_t *)addr = value;
     70 		return;
     71 	case 2:
     72 		*(uint16_t *)addr = value;
     73 		return;
     74 	case 4:
     75 		*(uint32_t *)addr = value;
     76 		return;
     77 	case 8:
     78 		*(uint64_t *)addr = value;
     79 		return;
     80 	}
     81 	ASSERT(!"bad int len");
     82 }
     83 
     84 static uint64_t
     85 ldv(int len, const void *addr)
     86 {
     87 	switch (len) {
     88 	case 1:
     89 		return (*(uint8_t *)addr);
     90 	case 2:
     91 		return (*(uint16_t *)addr);
     92 	case 4:
     93 		return (*(uint32_t *)addr);
     94 	case 8:
     95 		return (*(uint64_t *)addr);
     96 	}
     97 	ASSERT(!"bad int len");
     98 	return (0xFEEDFACEDEADBEEFULL);
     99 }
    100 
    101 void
    102 zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
    103 {
    104 	int i;
    105 	zap_leaf_t l;
    106 	l.l_bs = highbit(size)-1;
    107 	l.l_phys = buf;
    108 
    109 	buf->l_hdr.lh_block_type = 	BSWAP_64(buf->l_hdr.lh_block_type);
    110 	buf->l_hdr.lh_prefix = 		BSWAP_64(buf->l_hdr.lh_prefix);
    111 	buf->l_hdr.lh_magic = 		BSWAP_32(buf->l_hdr.lh_magic);
    112 	buf->l_hdr.lh_nfree = 		BSWAP_16(buf->l_hdr.lh_nfree);
    113 	buf->l_hdr.lh_nentries = 	BSWAP_16(buf->l_hdr.lh_nentries);
    114 	buf->l_hdr.lh_prefix_len = 	BSWAP_16(buf->l_hdr.lh_prefix_len);
    115 	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);
    116 
    117 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
    118 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
    119 
    120 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
    121 		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
    122 		struct zap_leaf_entry *le;
    123 
    124 		switch (lc->l_free.lf_type) {
    125 		case ZAP_CHUNK_ENTRY:
    126 			le = &lc->l_entry;
    127 
    128 			le->le_type =		BSWAP_8(le->le_type);
    129 			le->le_int_size =	BSWAP_8(le->le_int_size);
    130 			le->le_next =		BSWAP_16(le->le_next);
    131 			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
    132 			le->le_name_length =	BSWAP_16(le->le_name_length);
    133 			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
    134 			le->le_value_length =	BSWAP_16(le->le_value_length);
    135 			le->le_cd =		BSWAP_32(le->le_cd);
    136 			le->le_hash =		BSWAP_64(le->le_hash);
    137 			break;
    138 		case ZAP_CHUNK_FREE:
    139 			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
    140 			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
    141 			break;
    142 		case ZAP_CHUNK_ARRAY:
    143 			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
    144 			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
    145 			/* la_array doesn't need swapping */
    146 			break;
    147 		default:
    148 			ASSERT(!"bad leaf type");
    149 		}
    150 	}
    151 }
    152 
    153 void
    154 zap_leaf_init(zap_leaf_t *l, boolean_t sort)
    155 {
    156 	int i;
    157 
    158 	l->l_bs = highbit(l->l_dbuf->db_size)-1;
    159 	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
    160 	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
    161 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
    162 		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
    163 		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
    164 	}
    165 	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
    166 	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
    167 	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
    168 	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
    169 	if (sort)
    170 		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
    171 }
    172 
    173 /*
    174  * Routines which manipulate leaf chunks (l_chunk[]).
    175  */
    176 
    177 static uint16_t
    178 zap_leaf_chunk_alloc(zap_leaf_t *l)
    179 {
    180 	int chunk;
    181 
    182 	ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
    183 
    184 	chunk = l->l_phys->l_hdr.lh_freelist;
    185 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
    186 	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
    187 
    188 	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
    189 
    190 	l->l_phys->l_hdr.lh_nfree--;
    191 
    192 	return (chunk);
    193 }
    194 
    195 static void
    196 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
    197 {
    198 	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
    199 	ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
    200 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
    201 	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
    202 
    203 	zlf->lf_type = ZAP_CHUNK_FREE;
    204 	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
    205 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
    206 	l->l_phys->l_hdr.lh_freelist = chunk;
    207 
    208 	l->l_phys->l_hdr.lh_nfree++;
    209 }
    210 
    211 /*
    212  * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
    213  */
    214 
    215 static uint16_t
    216 zap_leaf_array_create(zap_leaf_t *l, const char *buf,
    217 	int integer_size, int num_integers)
    218 {
    219 	uint16_t chunk_head;
    220 	uint16_t *chunkp = &chunk_head;
    221 	int byten = 0;
    222 	uint64_t value;
    223 	int shift = (integer_size-1)*8;
    224 	int len = num_integers;
    225 
    226 	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
    227 
    228 	while (len > 0) {
    229 		uint16_t chunk = zap_leaf_chunk_alloc(l);
    230 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
    231 		int i;
    232 
    233 		la->la_type = ZAP_CHUNK_ARRAY;
    234 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
    235 			if (byten == 0)
    236 				value = ldv(integer_size, buf);
    237 			la->la_array[i] = value >> shift;
    238 			value <<= 8;
    239 			if (++byten == integer_size) {
    240 				byten = 0;
    241 				buf += integer_size;
    242 				if (--len == 0)
    243 					break;
    244 			}
    245 		}
    246 
    247 		*chunkp = chunk;
    248 		chunkp = &la->la_next;
    249 	}
    250 	*chunkp = CHAIN_END;
    251 
    252 	return (chunk_head);
    253 }
    254 
    255 static void
    256 zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
    257 {
    258 	uint16_t chunk = *chunkp;
    259 
    260 	*chunkp = CHAIN_END;
    261 
    262 	while (chunk != CHAIN_END) {
    263 		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
    264 		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
    265 		    ZAP_CHUNK_ARRAY);
    266 		zap_leaf_chunk_free(l, chunk);
    267 		chunk = nextchunk;
    268 	}
    269 }
    270 
    271 /* array_len and buf_len are in integers, not bytes */
    272 static void
    273 zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
    274     int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
    275     char *buf)
    276 {
    277 	int len = MIN(array_len, buf_len);
    278 	int byten = 0;
    279 	uint64_t value = 0;
    280 
    281 	ASSERT3U(array_int_len, <=, buf_int_len);
    282 
    283 	/* Fast path for one 8-byte integer */
    284 	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
    285 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
    286 		uint8_t *ip = la->la_array;
    287 		uint64_t *buf64 = (uint64_t *)buf;
    288 
    289 		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
    290 		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
    291 		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
    292 		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
    293 		return;
    294 	}
    295 
    296 	/* Fast path for an array of 1-byte integers (eg. the entry name) */
    297 	if (array_int_len == 1 && buf_int_len == 1 &&
    298 	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
    299 		while (chunk != CHAIN_END) {
    300 			struct zap_leaf_array *la =
    301 			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
    302 			bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
    303 			buf += ZAP_LEAF_ARRAY_BYTES;
    304 			chunk = la->la_next;
    305 		}
    306 		return;
    307 	}
    308 
    309 	while (len > 0) {
    310 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
    311 		int i;
    312 
    313 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
    314 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
    315 			value = (value << 8) | la->la_array[i];
    316 			byten++;
    317 			if (byten == array_int_len) {
    318 				stv(buf_int_len, buf, value);
    319 				byten = 0;
    320 				len--;
    321 				if (len == 0)
    322 					return;
    323 				buf += buf_int_len;
    324 			}
    325 		}
    326 		chunk = la->la_next;
    327 	}
    328 }
    329 
    330 /*
    331  * Only to be used on 8-bit arrays.
    332  * array_len is actual len in bytes (not encoded le_value_length).
    333  * namenorm is null-terminated.
    334  */
    335 static boolean_t
    336 zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len)
    337 {
    338 	int bseen = 0;
    339 
    340 	if (zn->zn_matchtype == MT_FIRST) {
    341 		char *thisname = kmem_alloc(array_len, KM_SLEEP);
    342 		boolean_t match;
    343 
    344 		zap_leaf_array_read(l, chunk, 1, array_len, 1,
    345 		    array_len, thisname);
    346 		match = zap_match(zn, thisname);
    347 		kmem_free(thisname, array_len);
    348 		return (match);
    349 	}
    350 
    351 	/* Fast path for exact matching */
    352 	while (bseen < array_len) {
    353 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
    354 		int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
    355 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
    356 		if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread))
    357 			break;
    358 		chunk = la->la_next;
    359 		bseen += toread;
    360 	}
    361 	return (bseen == array_len);
    362 }
    363 
    364 /*
    365  * Routines which manipulate leaf entries.
    366  */
    367 
    368 int
    369 zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
    370 {
    371 	uint16_t *chunkp;
    372 	struct zap_leaf_entry *le;
    373 
    374 	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
    375 
    376 again:
    377 	for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
    378 	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
    379 		uint16_t chunk = *chunkp;
    380 		le = ZAP_LEAF_ENTRY(l, chunk);
    381 
    382 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
    383 		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
    384 
    385 		if (le->le_hash != zn->zn_hash)
    386 			continue;
    387 
    388 		/*
    389 		 * NB: the entry chain is always sorted by cd on
    390 		 * normalized zap objects, so this will find the
    391 		 * lowest-cd match for MT_FIRST.
    392 		 */
    393 		ASSERT(zn->zn_matchtype == MT_EXACT ||
    394 		    (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
    395 		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
    396 		    le->le_name_length)) {
    397 			zeh->zeh_num_integers = le->le_value_length;
    398 			zeh->zeh_integer_size = le->le_int_size;
    399 			zeh->zeh_cd = le->le_cd;
    400 			zeh->zeh_hash = le->le_hash;
    401 			zeh->zeh_chunkp = chunkp;
    402 			zeh->zeh_leaf = l;
    403 			return (0);
    404 		}
    405 	}
    406 
    407 	/*
    408 	 * NB: we could of course do this in one pass, but that would be
    409 	 * a pain.  We'll see if MT_BEST is even used much.
    410 	 */
    411 	if (zn->zn_matchtype == MT_BEST) {
    412 		zn->zn_matchtype = MT_FIRST;
    413 		goto again;
    414 	}
    415 
    416 	return (ENOENT);
    417 }
    418 
    419 /* Return (h1,cd1 >= h2,cd2) */
    420 #define	HCD_GTEQ(h1, cd1, h2, cd2) \
    421 	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
    422 
    423 int
    424 zap_leaf_lookup_closest(zap_leaf_t *l,
    425     uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
    426 {
    427 	uint16_t chunk;
    428 	uint64_t besth = -1ULL;
    429 	uint32_t bestcd = ZAP_MAXCD;
    430 	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
    431 	uint16_t lh;
    432 	struct zap_leaf_entry *le;
    433 
    434 	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
    435 
    436 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
    437 		for (chunk = l->l_phys->l_hash[lh];
    438 		    chunk != CHAIN_END; chunk = le->le_next) {
    439 			le = ZAP_LEAF_ENTRY(l, chunk);
    440 
    441 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
    442 			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
    443 
    444 			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
    445 			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
    446 				ASSERT3U(bestlh, >=, lh);
    447 				bestlh = lh;
    448 				besth = le->le_hash;
    449 				bestcd = le->le_cd;
    450 
    451 				zeh->zeh_num_integers = le->le_value_length;
    452 				zeh->zeh_integer_size = le->le_int_size;
    453 				zeh->zeh_cd = le->le_cd;
    454 				zeh->zeh_hash = le->le_hash;
    455 				zeh->zeh_fakechunk = chunk;
    456 				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
    457 				zeh->zeh_leaf = l;
    458 			}
    459 		}
    460 	}
    461 
    462 	return (bestcd == ZAP_MAXCD ? ENOENT : 0);
    463 }
    464 
    465 int
    466 zap_entry_read(const zap_entry_handle_t *zeh,
    467     uint8_t integer_size, uint64_t num_integers, void *buf)
    468 {
    469 	struct zap_leaf_entry *le =
    470 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
    471 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
    472 
    473 	if (le->le_int_size > integer_size)
    474 		return (EINVAL);
    475 
    476 	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size,
    477 	    le->le_value_length, integer_size, num_integers, buf);
    478 
    479 	if (zeh->zeh_num_integers > num_integers)
    480 		return (EOVERFLOW);
    481 	return (0);
    482 
    483 }
    484 
    485 int
    486 zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
    487 {
    488 	struct zap_leaf_entry *le =
    489 	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
    490 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
    491 
    492 	zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
    493 	    le->le_name_length, 1, buflen, buf);
    494 	if (le->le_name_length > buflen)
    495 		return (EOVERFLOW);
    496 	return (0);
    497 }
    498 
    499 int
    500 zap_entry_update(zap_entry_handle_t *zeh,
    501 	uint8_t integer_size, uint64_t num_integers, const void *buf)
    502 {
    503 	int delta_chunks;
    504 	zap_leaf_t *l = zeh->zeh_leaf;
    505 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
    506 
    507 	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
    508 	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size);
    509 
    510 	if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
    511 		return (EAGAIN);
    512 
    513 	/*
    514 	 * We should search other chained leaves (via
    515 	 * zap_entry_remove,create?) otherwise returning EAGAIN will
    516 	 * just send us into an infinite loop if we have to chain
    517 	 * another leaf block, rather than being able to split this
    518 	 * block.
    519 	 */
    520 
    521 	zap_leaf_array_free(l, &le->le_value_chunk);
    522 	le->le_value_chunk =
    523 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
    524 	le->le_value_length = num_integers;
    525 	le->le_int_size = integer_size;
    526 	return (0);
    527 }
    528 
    529 void
    530 zap_entry_remove(zap_entry_handle_t *zeh)
    531 {
    532 	uint16_t entry_chunk;
    533 	struct zap_leaf_entry *le;
    534 	zap_leaf_t *l = zeh->zeh_leaf;
    535 
    536 	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
    537 
    538 	entry_chunk = *zeh->zeh_chunkp;
    539 	le = ZAP_LEAF_ENTRY(l, entry_chunk);
    540 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
    541 
    542 	zap_leaf_array_free(l, &le->le_name_chunk);
    543 	zap_leaf_array_free(l, &le->le_value_chunk);
    544 
    545 	*zeh->zeh_chunkp = le->le_next;
    546 	zap_leaf_chunk_free(l, entry_chunk);
    547 
    548 	l->l_phys->l_hdr.lh_nentries--;
    549 }
    550 
    551 int
    552 zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
    553     uint8_t integer_size, uint64_t num_integers, const void *buf,
    554     zap_entry_handle_t *zeh)
    555 {
    556 	uint16_t chunk;
    557 	uint16_t *chunkp;
    558 	struct zap_leaf_entry *le;
    559 	uint64_t namelen, valuelen;
    560 	int numchunks;
    561 
    562 	valuelen = integer_size * num_integers;
    563 	namelen = strlen(name) + 1;
    564 	ASSERT(namelen >= 2);
    565 
    566 	numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
    567 	    ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
    568 	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
    569 		return (E2BIG);
    570 
    571 	if (cd == ZAP_MAXCD) {
    572 		/* find the lowest unused cd */
    573 		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
    574 			cd = 0;
    575 
    576 			for (chunk = *LEAF_HASH_ENTPTR(l, h);
    577 			    chunk != CHAIN_END; chunk = le->le_next) {
    578 				le = ZAP_LEAF_ENTRY(l, chunk);
    579 				if (le->le_cd > cd)
    580 					break;
    581 				if (le->le_hash == h) {
    582 					ASSERT3U(cd, ==, le->le_cd);
    583 					cd++;
    584 				}
    585 			}
    586 		} else {
    587 			/* old unsorted format; do it the O(n^2) way */
    588 			for (cd = 0; cd < ZAP_MAXCD; cd++) {
    589 				for (chunk = *LEAF_HASH_ENTPTR(l, h);
    590 				    chunk != CHAIN_END; chunk = le->le_next) {
    591 					le = ZAP_LEAF_ENTRY(l, chunk);
    592 					if (le->le_hash == h &&
    593 					    le->le_cd == cd) {
    594 						break;
    595 					}
    596 				}
    597 				/* If this cd is not in use, we are good. */
    598 				if (chunk == CHAIN_END)
    599 					break;
    600 			}
    601 		}
    602 		/*
    603 		 * we would run out of space in a block before we could
    604 		 * have ZAP_MAXCD entries
    605 		 */
    606 		ASSERT3U(cd, <, ZAP_MAXCD);
    607 	}
    608 
    609 	if (l->l_phys->l_hdr.lh_nfree < numchunks)
    610 		return (EAGAIN);
    611 
    612 	/* make the entry */
    613 	chunk = zap_leaf_chunk_alloc(l);
    614 	le = ZAP_LEAF_ENTRY(l, chunk);
    615 	le->le_type = ZAP_CHUNK_ENTRY;
    616 	le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
    617 	le->le_name_length = namelen;
    618 	le->le_value_chunk =
    619 	    zap_leaf_array_create(l, buf, integer_size, num_integers);
    620 	le->le_value_length = num_integers;
    621 	le->le_int_size = integer_size;
    622 	le->le_hash = h;
    623 	le->le_cd = cd;
    624 
    625 	/* link it into the hash chain */
    626 	/* XXX if we did the search above, we could just use that */
    627 	chunkp = zap_leaf_rehash_entry(l, chunk);
    628 
    629 	l->l_phys->l_hdr.lh_nentries++;
    630 
    631 	zeh->zeh_leaf = l;
    632 	zeh->zeh_num_integers = num_integers;
    633 	zeh->zeh_integer_size = le->le_int_size;
    634 	zeh->zeh_cd = le->le_cd;
    635 	zeh->zeh_hash = le->le_hash;
    636 	zeh->zeh_chunkp = chunkp;
    637 
    638 	return (0);
    639 }
    640 
    641 /*
    642  * Determine if there is another entry with the same normalized form.
    643  * For performance purposes, either zn or name must be provided (the
    644  * other can be NULL).  Note, there usually won't be any hash
    645  * conflicts, in which case we don't need the concatenated/normalized
    646  * form of the name.  But all callers have one of these on hand anyway,
    647  * so might as well take advantage.  A cleaner but slower interface
    648  * would accept neither argument, and compute the normalized name as
    649  * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
    650  */
    651 boolean_t
    652 zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
    653     const char *name, zap_t *zap)
    654 {
    655 	uint64_t chunk;
    656 	struct zap_leaf_entry *le;
    657 	boolean_t allocdzn = B_FALSE;
    658 
    659 	if (zap->zap_normflags == 0)
    660 		return (B_FALSE);
    661 
    662 	for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
    663 	    chunk != CHAIN_END; chunk = le->le_next) {
    664 		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
    665 		if (le->le_hash != zeh->zeh_hash)
    666 			continue;
    667 		if (le->le_cd == zeh->zeh_cd)
    668 			continue;
    669 
    670 		if (zn == NULL) {
    671 			zn = zap_name_alloc(zap, name, MT_FIRST);
    672 			allocdzn = B_TRUE;
    673 		}
    674 		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
    675 		    le->le_name_chunk, le->le_name_length)) {
    676 			if (allocdzn)
    677 				zap_name_free(zn);
    678 			return (B_TRUE);
    679 		}
    680 	}
    681 	if (allocdzn)
    682 		zap_name_free(zn);
    683 	return (B_FALSE);
    684 }
    685 
    686 /*
    687  * Routines for transferring entries between leafs.
    688  */
    689 
    690 static uint16_t *
    691 zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
    692 {
    693 	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
    694 	struct zap_leaf_entry *le2;
    695 	uint16_t *chunkp;
    696 
    697 	/*
    698 	 * keep the entry chain sorted by cd
    699 	 * NB: this will not cause problems for unsorted leafs, though
    700 	 * it is unnecessary there.
    701 	 */
    702 	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
    703 	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
    704 		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
    705 		if (le2->le_cd > le->le_cd)
    706 			break;
    707 	}
    708 
    709 	le->le_next = *chunkp;
    710 	*chunkp = entry;
    711 	return (chunkp);
    712 }
    713 
    714 static uint16_t
    715 zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
    716 {
    717 	uint16_t new_chunk;
    718 	uint16_t *nchunkp = &new_chunk;
    719 
    720 	while (chunk != CHAIN_END) {
    721 		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
    722 		struct zap_leaf_array *nla =
    723 		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
    724 		struct zap_leaf_array *la =
    725 		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
    726 		int nextchunk = la->la_next;
    727 
    728 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
    729 		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
    730 
    731 		*nla = *la; /* structure assignment */
    732 
    733 		zap_leaf_chunk_free(l, chunk);
    734 		chunk = nextchunk;
    735 		*nchunkp = nchunk;
    736 		nchunkp = &nla->la_next;
    737 	}
    738 	*nchunkp = CHAIN_END;
    739 	return (new_chunk);
    740 }
    741 
    742 static void
    743 zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
    744 {
    745 	struct zap_leaf_entry *le, *nle;
    746 	uint16_t chunk;
    747 
    748 	le = ZAP_LEAF_ENTRY(l, entry);
    749 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
    750 
    751 	chunk = zap_leaf_chunk_alloc(nl);
    752 	nle = ZAP_LEAF_ENTRY(nl, chunk);
    753 	*nle = *le; /* structure assignment */
    754 
    755 	(void) zap_leaf_rehash_entry(nl, chunk);
    756 
    757 	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
    758 	nle->le_value_chunk =
    759 	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
    760 
    761 	zap_leaf_chunk_free(l, entry);
    762 
    763 	l->l_phys->l_hdr.lh_nentries--;
    764 	nl->l_phys->l_hdr.lh_nentries++;
    765 }
    766 
    767 /*
    768  * Transfer the entries whose hash prefix ends in 1 to the new leaf.
    769  */
    770 void
    771 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
    772 {
    773 	int i;
    774 	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
    775 
    776 	/* set new prefix and prefix_len */
    777 	l->l_phys->l_hdr.lh_prefix <<= 1;
    778 	l->l_phys->l_hdr.lh_prefix_len++;
    779 	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
    780 	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
    781 
    782 	/* break existing hash chains */
    783 	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
    784 
    785 	if (sort)
    786 		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
    787 
    788 	/*
    789 	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
    790 	 * the remaining entries
    791 	 *
    792 	 * NB: We could find entries via the hashtable instead. That
    793 	 * would be O(hashents+numents) rather than O(numblks+numents),
    794 	 * but this accesses memory more sequentially, and when we're
    795 	 * called, the block is usually pretty full.
    796 	 */
    797 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
    798 		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
    799 		if (le->le_type != ZAP_CHUNK_ENTRY)
    800 			continue;
    801 
    802 		if (le->le_hash & (1ULL << bit))
    803 			zap_leaf_transfer_entry(l, i, nl);
    804 		else
    805 			(void) zap_leaf_rehash_entry(l, i);
    806 	}
    807 }
    808 
    809 void
    810 zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
    811 {
    812 	int i, n;
    813 
    814 	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
    815 	    l->l_phys->l_hdr.lh_prefix_len;
    816 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
    817 	zs->zs_leafs_with_2n_pointers[n]++;
    818 
    819 
    820 	n = l->l_phys->l_hdr.lh_nentries/5;
    821 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
    822 	zs->zs_blocks_with_n5_entries[n]++;
    823 
    824 	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
    825 	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
    826 	    (1<<FZAP_BLOCK_SHIFT(zap));
    827 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
    828 	zs->zs_blocks_n_tenths_full[n]++;
    829 
    830 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
    831 		int nentries = 0;
    832 		int chunk = l->l_phys->l_hash[i];
    833 
    834 		while (chunk != CHAIN_END) {
    835 			struct zap_leaf_entry *le =
    836 			    ZAP_LEAF_ENTRY(l, chunk);
    837 
    838 			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) +
    839 			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length *
    840 			    le->le_int_size);
    841 			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
    842 			zs->zs_entries_using_n_chunks[n]++;
    843 
    844 			chunk = le->le_next;
    845 			nentries++;
    846 		}
    847 
    848 		n = nentries;
    849 		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
    850 		zs->zs_buckets_with_n_entries[n]++;
    851 	}
    852 }
    853