Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <_string_table.h>
     28 #include <strings.h>
     29 #include <sgs.h>
     30 #include <stdio.h>
     31 
     32 /*
     33  * This file provides the interfaces to build a Str_tbl suitable for use by
     34  * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
     35  * as created by ld(1).
     36  *
     37  * There are two modes which can be used when constructing a string table:
     38  *
     39  *	st_new(0)
     40  *		standard string table - no compression.  This is the
     41  *		traditional, fast method.
     42  *
     43  *	st_new(FLG_STTAB_COMPRESS)
     44  *		builds a compressed string table which both eliminates
     45  *		duplicate strings, and permits strings with common suffixes
     46  *		(atexit vs. exit) to overlap in the table.  This provides space
     47  *		savings for many string tables.  Although more work than the
     48  *		traditional method, the algorithms used are designed to scale
     49  *		and keep any overhead at a minimum.
     50  *
     51  * These string tables are built with a common interface in a two-pass manner.
     52  * The first pass finds all of the strings required for the string-table and
     53  * calculates the size required for the final string table.
     54  *
     55  * The second pass allocates the string table, populates the strings into the
     56  * table and returns the offsets the strings have been assigned.
     57  *
     58  * The calling sequence to build and populate a string table is:
     59  *
     60  *		st_new();		// initialize strtab
     61  *
     62  *		st_insert(st1);		// first pass of strings ...
     63  *					// calculates size required for
     64  *					// string table
     65  *
     66  *		st_delstring(st?);	// remove string previously
     67  *					// inserted
     68  *		st_insert(stN);
     69  *
     70  *		st_getstrtab_sz();	// freezes strtab and computes
     71  *					// size of table.
     72  *
     73  *		st_setstrbuf();		// associates a final destination
     74  *					// for the string table
     75  *
     76  *		st_setstring(st1);	// populate the string table
     77  *		...			// offsets are based off of second
     78  *					// pass	through the string table
     79  *		st_setstring(stN);
     80  *
     81  *		st_destroy();		// tear down string table
     82  *					// structures.
     83  *
     84  * String Suffix Compression Algorithm:
     85  *
     86  *   Here's a quick high level overview of the Suffix String
     87  *   compression algorithm used.  First - the heart of the algorithm
     88  *   is a Hash table list which represents a dictionary of all unique
     89  *   strings inserted into the string table.  The hash function for
     90  *   this table is a standard string hash except that the hash starts
     91  *   at the last character in the string (&str[n - 1]) and works towards
     92  *   the first character in the function (&str[0]).  As we compute the
     93  *   HASH value for a given string, we also compute the hash values
     94  *   for all of the possible suffix strings for that string.
     95  *
     96  *   As we compute the hash - at each character see if the current
     97  *   suffix string for that hash is already present in the table.  If
     98  *   it is, and the string is a master string.  Then change that
     99  *   string to a suffix string of the new string being inserted.
    100  *
    101  *   When the final hash value is found (hash for str[0...n]), check
    102  *   to see if it is in the hash table - if so increment the reference
    103  *   count for the string.  If it is not yet in the table, insert a
    104  *   new hash table entry for a master string.
    105  *
    106  *   The above method will find all suffixes of a given string given
    107  *   that the strings are inserted from shortest to longest.  That is
    108  *   why this is a two phase method, we first collect all of the
    109  *   strings and store them based off of their length in an AVL tree.
    110  *   Once all of the strings have been submitted we then start the
    111  *   hash table build by traversing the AVL tree in order and
    112  *   inserting the strings from shortest to longest as described
    113  *   above.
    114  */
    115 
    116 /* LINTLIBRARY */
    117 
    118 static int
    119 avl_len_compare(const void *n1, const void *n2)
    120 {
    121 	size_t	len1, len2;
    122 
    123 	len1 = ((LenNode *)n1)->ln_strlen;
    124 	len2 = ((LenNode *)n2)->ln_strlen;
    125 
    126 	if (len1 == len2)
    127 		return (0);
    128 	if (len2 < len1)
    129 		return (1);
    130 	return (-1);
    131 }
    132 
    133 static int
    134 avl_str_compare(const void *n1, const void *n2)
    135 {
    136 	const char	*str1, *str2;
    137 	int		rc;
    138 
    139 	str1 = ((StrNode *)n1)->sn_str;
    140 	str2 = ((StrNode *)n2)->sn_str;
    141 
    142 	rc = strcmp(str1, str2);
    143 	if (rc > 0)
    144 		return (1);
    145 	if (rc < 0)
    146 		return (-1);
    147 	return (0);
    148 }
    149 
    150 /*
    151  * Return an initialized Str_tbl - returns NULL on failure.
    152  *
    153  * flags:
    154  *	FLG_STTAB_COMPRESS - build a compressed string table
    155  */
    156 Str_tbl *
    157 st_new(uint_t flags)
    158 {
    159 	Str_tbl	*stp;
    160 
    161 	if ((stp = calloc(sizeof (*stp), 1)) == NULL)
    162 		return (NULL);
    163 
    164 	/*
    165 	 * Start with a leading '\0' - it's tradition.
    166 	 */
    167 	stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1;
    168 
    169 	/*
    170 	 * Do we compress this string table?
    171 	 */
    172 	stp->st_flags = flags;
    173 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
    174 		return (stp);
    175 
    176 	if ((stp->st_lentree = calloc(sizeof (*stp->st_lentree), 1)) == NULL)
    177 		return (NULL);
    178 
    179 	avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode),
    180 	    SGSOFFSETOF(LenNode, ln_avlnode));
    181 
    182 	return (stp);
    183 }
    184 
    185 /*
    186  * Insert a new string into the Str_tbl.  There are two AVL trees used.
    187  *
    188  *  -	The first LenNode AVL tree maintains a tree of nodes based on string
    189  *	sizes.
    190  *  -	Each LenNode maintains a StrNode AVL tree for each string.  Large
    191  *	applications have been known to contribute thousands of strings of
    192  *	the same size.  Should strings need to be removed (-z ignore), then
    193  *	the string AVL tree makes this removal efficient and scalable.
    194  */
    195 int
    196 st_insert(Str_tbl *stp, const char *str)
    197 {
    198 	size_t		len;
    199 	StrNode		*snp, sn = { 0 };
    200 	LenNode		*lnp, ln = { 0 };
    201 	avl_index_t	where;
    202 
    203 	/*
    204 	 * String table can't have been cooked
    205 	 */
    206 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
    207 
    208 	/*
    209 	 * Null strings always point to the head of the string
    210 	 * table - no reason to keep searching.
    211 	 */
    212 	if ((len = strlen(str)) == 0)
    213 		return (0);
    214 
    215 	stp->st_fullstrsize += len + 1;
    216 	stp->st_strcnt++;
    217 
    218 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
    219 		return (0);
    220 
    221 	/*
    222 	 * From the controlling string table, determine which LenNode AVL node
    223 	 * provides for this string length.  If the node doesn't exist, insert
    224 	 * a new node to represent this string length.
    225 	 */
    226 	ln.ln_strlen = len;
    227 	if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) {
    228 		if ((lnp = calloc(sizeof (*lnp), 1)) == NULL)
    229 			return (-1);
    230 		lnp->ln_strlen = len;
    231 		avl_insert(stp->st_lentree, lnp, where);
    232 
    233 		if ((lnp->ln_strtree = calloc(sizeof (*lnp->ln_strtree), 1)) ==
    234 		    NULL)
    235 			return (0);
    236 
    237 		avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode),
    238 		    SGSOFFSETOF(StrNode, sn_avlnode));
    239 	}
    240 
    241 	/*
    242 	 * From the string length AVL node determine whether a StrNode AVL node
    243 	 * provides this string.  If the node doesn't exist, insert a new node
    244 	 * to represent this string.
    245 	 */
    246 	sn.sn_str = str;
    247 	if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) {
    248 		if ((snp = calloc(sizeof (*snp), 1)) == NULL)
    249 			return (-1);
    250 		snp->sn_str = str;
    251 		avl_insert(lnp->ln_strtree, snp, where);
    252 	}
    253 	snp->sn_refcnt++;
    254 
    255 	return (0);
    256 }
    257 
    258 /*
    259  * Remove a previously inserted string from the Str_tbl.
    260  */
    261 int
    262 st_delstring(Str_tbl *stp, const char *str)
    263 {
    264 	size_t		len;
    265 	LenNode		*lnp, ln = { 0 };
    266 	StrNode		*snp, sn = { 0 };
    267 
    268 	/*
    269 	 * String table can't have been cooked
    270 	 */
    271 	assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
    272 
    273 	len = strlen(str);
    274 	stp->st_fullstrsize -= len + 1;
    275 
    276 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
    277 		return (0);
    278 
    279 	/*
    280 	 * Determine which LenNode AVL node provides for this string length.
    281 	 */
    282 	ln.ln_strlen = len;
    283 	if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) {
    284 		sn.sn_str = str;
    285 		if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) {
    286 			/*
    287 			 * Reduce the reference count, and if zero remove the
    288 			 * node.
    289 			 */
    290 			if (--snp->sn_refcnt == 0)
    291 				avl_remove(lnp->ln_strtree, snp);
    292 			return (0);
    293 		}
    294 	}
    295 
    296 	/*
    297 	 * No strings of this length, or no string itself - someone goofed.
    298 	 */
    299 	return (-1);
    300 }
    301 
    302 /*
    303  * Tear down a String_Table structure.
    304  */
    305 void
    306 st_destroy(Str_tbl *stp)
    307 {
    308 	Str_hash	*sthash, *psthash;
    309 	Str_master	*mstr, *pmstr;
    310 	uint_t		i;
    311 
    312 	/*
    313 	 * cleanup the master strings
    314 	 */
    315 	for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
    316 	    mstr = mstr->sm_next) {
    317 		if (pmstr)
    318 			free(pmstr);
    319 		pmstr = mstr;
    320 	}
    321 	if (pmstr)
    322 		free(pmstr);
    323 
    324 	if (stp->st_hashbcks) {
    325 		for (i = 0; i < stp->st_hbckcnt; i++) {
    326 			for (sthash = stp->st_hashbcks[i], psthash = 0;
    327 			    sthash; sthash = sthash->hi_next) {
    328 				if (psthash)
    329 					free(psthash);
    330 				psthash = sthash;
    331 			}
    332 			if (psthash)
    333 				free(psthash);
    334 		}
    335 		free(stp->st_hashbcks);
    336 	}
    337 	free(stp);
    338 }
    339 
    340 
    341 /*
    342  * For a given string - copy it into the buffer associated with
    343  * the string table - and return the offset it has been assigned.
    344  *
    345  * If a value of '-1' is returned - the string was not found in
    346  * the Str_tbl.
    347  */
    348 int
    349 st_setstring(Str_tbl *stp, const char *str, size_t *stoff)
    350 {
    351 	size_t		stlen;
    352 	uint_t		hashval;
    353 	Str_hash	*sthash;
    354 	Str_master	*mstr;
    355 	int		i;
    356 
    357 	/*
    358 	 * String table *must* have been previously cooked
    359 	 */
    360 	assert(stp->st_strbuf);
    361 
    362 	assert(stp->st_flags & FLG_STTAB_COOKED);
    363 	stlen = strlen(str);
    364 	/*
    365 	 * Null string always points to head of string table
    366 	 */
    367 	if (stlen == 0) {
    368 		*stoff = 0;
    369 		return (0);
    370 	}
    371 
    372 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
    373 		size_t		_stoff;
    374 
    375 		stlen++;	/* count for trailing '\0' */
    376 		_stoff = stp->st_nextoff;
    377 		/*
    378 		 * Have we overflowed our assigned buffer?
    379 		 */
    380 		if ((_stoff + stlen) > stp->st_fullstrsize)
    381 			return (-1);
    382 		memcpy(stp->st_strbuf + _stoff, str, stlen);
    383 		*stoff = _stoff;
    384 		stp->st_nextoff += stlen;
    385 		return (0);
    386 	}
    387 
    388 	/*
    389 	 * Calculate reverse hash for string.
    390 	 */
    391 	hashval = HASHSEED;
    392 	for (i = stlen; i >= 0; i--) {
    393 		hashval = ((hashval << 5) + hashval) +
    394 		    str[i];			/* h = ((h * 33) + c) */
    395 	}
    396 
    397 	for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
    398 	    sthash = sthash->hi_next) {
    399 		const char	*hstr;
    400 
    401 		if (sthash->hi_hashval != hashval)
    402 			continue;
    403 
    404 		hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen -
    405 		    sthash->hi_strlen];
    406 		if (strcmp(str, hstr) == 0)
    407 			break;
    408 	}
    409 
    410 	/*
    411 	 * Did we find the string?
    412 	 */
    413 	if (sthash == 0)
    414 		return (-1);
    415 
    416 	/*
    417 	 * Has this string been copied into the string table?
    418 	 */
    419 	mstr = sthash->hi_mstr;
    420 	if (mstr->sm_stroff == 0) {
    421 		size_t	mstrlen = mstr->sm_strlen + 1;
    422 
    423 		mstr->sm_stroff = stp->st_nextoff;
    424 
    425 		/*
    426 		 * Have we overflowed our assigned buffer?
    427 		 */
    428 		if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize)
    429 			return (-1);
    430 
    431 		(void) memcpy(stp->st_strbuf + mstr->sm_stroff,
    432 		    mstr->sm_str, mstrlen);
    433 		stp->st_nextoff += mstrlen;
    434 	}
    435 
    436 	/*
    437 	 * Calculate offset of (sub)string.
    438 	 */
    439 	*stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen;
    440 
    441 	return (0);
    442 }
    443 
    444 
    445 static int
    446 st_hash_insert(Str_tbl *stp, const char *str, size_t len)
    447 {
    448 	int		i;
    449 	uint_t		hashval = HASHSEED;
    450 	uint_t		bckcnt = stp->st_hbckcnt;
    451 	Str_hash	**hashbcks = stp->st_hashbcks;
    452 	Str_hash	*sthash;
    453 	Str_master	*mstr = 0;
    454 
    455 	/*
    456 	 * We use a classic 'Bernstein k=33' hash function.  But
    457 	 * instead of hashing from the start of the string to the
    458 	 * end, we do it in reverse.
    459 	 *
    460 	 * This way - we are essentially building all of the
    461 	 * suffix hashvalues as we go.  We can check to see if
    462 	 * any suffixes already exist in the tree as we generate
    463 	 * the hash.
    464 	 */
    465 	for (i = len; i >= 0; i--) {
    466 		hashval = ((hashval << 5) + hashval) +
    467 		    str[i];			/* h = ((h * 33) + c) */
    468 
    469 		for (sthash = hashbcks[hashval % bckcnt];
    470 		    sthash; sthash = sthash->hi_next) {
    471 			const char	*hstr;
    472 			Str_master	*_mstr;
    473 
    474 			if (sthash->hi_hashval != hashval)
    475 				continue;
    476 
    477 			_mstr = sthash->hi_mstr;
    478 			hstr = &_mstr->sm_str[_mstr->sm_strlen -
    479 			    sthash->hi_strlen];
    480 
    481 			if (strcmp(&str[i], hstr))
    482 				continue;
    483 
    484 			if (i == 0) {
    485 				/*
    486 				 * Entry already in table, increment refcnt and
    487 				 * get out.
    488 				 */
    489 				sthash->hi_refcnt++;
    490 				return (0);
    491 			} else {
    492 				/*
    493 				 * If this 'suffix' is presently a 'master
    494 				 * string, then take over it's record.
    495 				 */
    496 				if (sthash->hi_strlen == _mstr->sm_strlen) {
    497 					/*
    498 					 * we should only do this once.
    499 					 */
    500 					assert(mstr == 0);
    501 					mstr = _mstr;
    502 				}
    503 			}
    504 		}
    505 	}
    506 
    507 	/*
    508 	 * Do we need a new master string, or can we take over
    509 	 * one we already found in the table?
    510 	 */
    511 	if (mstr == 0) {
    512 		/*
    513 		 * allocate a new master string
    514 		 */
    515 		if ((mstr = calloc(sizeof (*mstr), 1)) == 0)
    516 			return (-1);
    517 		mstr->sm_next = stp->st_mstrlist;
    518 		stp->st_mstrlist = mstr;
    519 		stp->st_strsize += len + 1;
    520 	} else {
    521 		/*
    522 		 * We are taking over a existing master string, the string size
    523 		 * only increments by the difference between the current string
    524 		 * and the previous master.
    525 		 */
    526 		assert(len > mstr->sm_strlen);
    527 		stp->st_strsize += len - mstr->sm_strlen;
    528 	}
    529 
    530 	if ((sthash = calloc(sizeof (*sthash), 1)) == 0)
    531 		return (-1);
    532 
    533 	mstr->sm_hashval = sthash->hi_hashval = hashval;
    534 	mstr->sm_strlen = sthash->hi_strlen = len;
    535 	mstr->sm_str = str;
    536 	sthash->hi_refcnt = 1;
    537 	sthash->hi_mstr = mstr;
    538 
    539 	/*
    540 	 * Insert string element into head of hash list
    541 	 */
    542 	hashval = hashval % bckcnt;
    543 	sthash->hi_next = hashbcks[hashval];
    544 	hashbcks[hashval] = sthash;
    545 	return (0);
    546 }
    547 
    548 /*
    549  * Return amount of space required for the string table.
    550  */
    551 size_t
    552 st_getstrtab_sz(Str_tbl *stp)
    553 {
    554 	assert(stp->st_fullstrsize > 0);
    555 
    556 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
    557 		stp->st_flags |= FLG_STTAB_COOKED;
    558 		return (stp->st_fullstrsize);
    559 	}
    560 
    561 	if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
    562 		LenNode		*lnp;
    563 		void		*cookie;
    564 
    565 		stp->st_flags |= FLG_STTAB_COOKED;
    566 		/*
    567 		 * allocate a hash table about the size of # of
    568 		 * strings input.
    569 		 */
    570 		stp->st_hbckcnt = findprime(stp->st_strcnt);
    571 		if ((stp->st_hashbcks = calloc(sizeof (*stp->st_hashbcks),
    572 		    stp->st_hbckcnt)) == NULL)
    573 			return (0);
    574 
    575 		/*
    576 		 * We now walk all of the strings in the list, from shortest to
    577 		 * longest, and insert them into the hashtable.
    578 		 */
    579 		if ((lnp = avl_first(stp->st_lentree)) == NULL) {
    580 			/*
    581 			 * Is it possible we have an empty string table, if so,
    582 			 * the table still contains '\0', so return the size.
    583 			 */
    584 			if (avl_numnodes(stp->st_lentree) == 0) {
    585 				assert(stp->st_strsize == 1);
    586 				return (stp->st_strsize);
    587 			}
    588 			return (0);
    589 		}
    590 
    591 		while (lnp) {
    592 			StrNode	*snp;
    593 
    594 			/*
    595 			 * Walk the string lists and insert them into the hash
    596 			 * list.  Once a string is inserted we no longer need
    597 			 * it's entry, so the string can be freed.
    598 			 */
    599 			for (snp = avl_first(lnp->ln_strtree); snp;
    600 			    snp = AVL_NEXT(lnp->ln_strtree, snp)) {
    601 				if (st_hash_insert(stp, snp->sn_str,
    602 				    lnp->ln_strlen) == -1)
    603 					return (0);
    604 			}
    605 
    606 			/*
    607 			 * Now that the strings have been copied, walk the
    608 			 * StrNode tree and free all the AVL nodes.  Note,
    609 			 * avl_destroy_nodes() beats avl_remove() as the
    610 			 * latter balances the nodes as they are removed.
    611 			 * We just want to tear the whole thing down fast.
    612 			 */
    613 			cookie = NULL;
    614 			while ((snp = avl_destroy_nodes(lnp->ln_strtree,
    615 			    &cookie)) != NULL)
    616 				free(snp);
    617 			avl_destroy(lnp->ln_strtree);
    618 			free(lnp->ln_strtree);
    619 			lnp->ln_strtree = NULL;
    620 
    621 			/*
    622 			 * Move on to the next LenNode.
    623 			 */
    624 			lnp = AVL_NEXT(stp->st_lentree, lnp);
    625 		}
    626 
    627 		/*
    628 		 * Now that all of the strings have been freed, walk the
    629 		 * LenNode tree and free all of the AVL nodes.  Note,
    630 		 * avl_destroy_nodes() beats avl_remove() as the latter
    631 		 * balances the nodes as they are removed. We just want to
    632 		 * tear the whole thing down fast.
    633 		 */
    634 		cookie = NULL;
    635 		while ((lnp = avl_destroy_nodes(stp->st_lentree,
    636 		    &cookie)) != NULL)
    637 			free(lnp);
    638 		avl_destroy(stp->st_lentree);
    639 		free(stp->st_lentree);
    640 		stp->st_lentree = 0;
    641 	}
    642 
    643 	assert(stp->st_strsize > 0);
    644 	assert(stp->st_fullstrsize >= stp->st_strsize);
    645 
    646 	return (stp->st_strsize);
    647 }
    648 
    649 /*
    650  * Associate a buffer with a string table.
    651  */
    652 const char *
    653 st_getstrbuf(Str_tbl *stp)
    654 {
    655 	return (stp->st_strbuf);
    656 }
    657 
    658 int
    659 st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize)
    660 {
    661 	assert(stp->st_flags & FLG_STTAB_COOKED);
    662 
    663 	if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
    664 		if (bufsize < stp->st_fullstrsize)
    665 			return (-1);
    666 	} else {
    667 		if (bufsize < stp->st_strsize)
    668 			return (-1);
    669 	}
    670 
    671 	stp->st_strbuf = stbuf;
    672 #ifdef	DEBUG
    673 	/*
    674 	 * for debug builds - start with a stringtable filled in
    675 	 * with '0xff'.  This makes it very easy to spot unfilled
    676 	 * holes in the strtab.
    677 	 */
    678 	memset(stbuf, 0xff, bufsize);
    679 	stbuf[0] = '\0';
    680 #else
    681 	memset(stbuf, 0x0, bufsize);
    682 #endif
    683 	return (0);
    684 }
    685