Home | History | Annotate | Download | only in regex
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     28 /*	  All Rights Reserved	*/
     29 
     30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     31 
     32 /*
     33  * IMPORTANT NOTE:
     34  *
     35  * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS.
     36  * IT IS **NOT** CHARACTER SET INDEPENDENT.
     37  *
     38  */
     39 
     40 #pragma weak _regcmp = regcmp
     41 
     42 #include "lint.h"
     43 #include "mtlib.h"
     44 #include <limits.h>
     45 #include <stdarg.h>
     46 #include <stdlib.h>
     47 #include <thread.h>
     48 #include <wctype.h>
     49 #include <widec.h>
     50 #include <string.h>
     51 #include "tsd.h"
     52 
     53 
     54 /* CONSTANTS SHARED WITH regex() */
     55 
     56 #include "regex.h"
     57 
/* PRIVATE CONSTANTS */

/*
 * Named forms of the characters that regcmp() compares against while
 * scanning a regular expression, plus two sizing constants.
 */
#define	BACKSLASH		'\\'
#define	CIRCUMFLEX		'^'
#define	COMMA			','
#define	DASH			'-'
#define	DOLLAR_SIGN		'$'
#define	DOT			'.'
#define	LEFT_CURLY_BRACE	'{'
#define	LEFT_PAREN		'('
#define	LEFT_SQUARE_BRACKET	'['
#define	PLUS			'+'
#define	RIGHT_CURLY_BRACE	'}'
#define	RIGHT_PAREN		')'
#define	RIGHT_SQUARE_BRACKET	']'
/* mask used to keep the low-order byte of a group length */
#define	SINGLE_BYTE_MASK	0xff
/* number of entries in compilep_stack, i.e. the deepest group nesting */
#define	STRINGP_STACK_SIZE	50
#define	STAR			'*'
     76 
/* PRIVATE GLOBAL VARIABLES */

/*
 * Stack of positions in the compiled expression at which each currently
 * open parenthesized group starts.  regcmp() resets compilep_stackp to
 * one past the last element, push_compilep() decrements it before storing
 * and pop_compilep() increments it after loading, so the stack grows
 * toward compilep_stack[0].
 */
static char	*compilep_stack[STRINGP_STACK_SIZE];
static char	**compilep_stackp;
/* held by regcmp() from just before the stack is reset until it returns */
static mutex_t  regcmp_lock = DEFAULTMUTEX;
     82 
/* DECLARATIONS OF PRIVATE FUNCTIONS */

static int add_char(char *compilep, wchar_t wchar);
static int add_single_char_expr(char *compilep, wchar_t wchar);

/*
 * Common failure path for regcmp(): ends the variable argument list,
 * releases the lock, frees the partially compiled expression (if any)
 * and makes the enclosing function return a null pointer.
 *
 * NOTE: the expansion is a sequence of several statements, not a single
 * statement, so it must only be used inside a braced block; using it as
 * the unbraced body of an if/else would guard only the va_end() call.
 */
#define	ERROR_EXIT(mutex_lockp, arg_listp, compile_startp) \
\
	va_end(arg_listp); \
	lmutex_unlock(mutex_lockp); \
	if ((compile_startp) != (char *)0) \
		free((void *)compile_startp); \
	return ((char *)0)

static int get_count(int *countp, const char *regexp);
static int get_digit(const char *regexp);
static int get_wchar(wchar_t *wchar, const char *regexp);
static char *pop_compilep(void);
static char *push_compilep(char *compilep);
static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char);
    102 
    103 
    104 /* DEFINITIONS OF PUBLIC VARIABLES */
    105 
    106 int __i_size;
    107 
    108 /*
    109  * define thread-specific storage for __i_size
    110  *
    111  */
    112 int *
    113 ___i_size(void)
    114 {
    115 	if (thr_main())
    116 		return (&__i_size);
    117 	return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL));
    118 }
    119 
    120 #define		__i_size (*(___i_size()))
    121 
    122 /* DEFINITION OF regcmp() */
    123 
    124 extern char *
    125 regcmp(const char *regexp, ...)
    126 {
    127 	va_list		arg_listp;
    128 	size_t		arg_strlen;
    129 	boolean_t	can_repeat;
    130 	int		char_size;
    131 	unsigned int	class_length;
    132 	char		*compilep;
    133 	char		*compile_startp = (char *)0;
    134 	int		count_length;
    135 	wchar_t		current_char;
    136 	int		expr_length;
    137 	int		groupn;
    138 	unsigned int	group_length;
    139 	unsigned int	high_bits;
    140 	boolean_t	dash_indicates_range;
    141 	unsigned int	low_bits;
    142 	int		max_count;
    143 	int		min_count;
    144 	const char	*next_argp;
    145 	wchar_t		first_char_in_range;
    146 	char		*regex_typep;
    147 	int		return_arg_number;
    148 	int		substringn;
    149 
    150 	if (___i_size() == (int *)0)
    151 		return ((char *)0);
    152 
    153 	/*
    154 	 * When compiling a regular expression, regcmp() generates at most
    155 	 * two extra single-byte characters for each character in the
    156 	 * expression, so allocating three times the number of bytes in all
    157 	 * the strings that comprise the regular expression will ensure that
    158 	 * regcmp() won't overwrite the end of the allocated block when
    159 	 * compiling the expression.
    160 	 */
    161 
    162 	va_start(arg_listp, regexp);
    163 	next_argp = regexp;
    164 	arg_strlen = 0;
    165 	while (next_argp != (char *)0) {
    166 		arg_strlen += strlen(next_argp);
    167 		next_argp = va_arg(arg_listp, /* const */ char *);
    168 	}
    169 	va_end(arg_listp);
    170 
    171 	if (arg_strlen == 0)
    172 		return ((char *)0);
    173 	compile_startp = (char *)malloc(3 * arg_strlen);
    174 	if (compile_startp == (char *)0)
    175 		return ((char *)0);
    176 
    177 	lmutex_lock(&regcmp_lock);
    178 	__i_size = 0;
    179 	compilep = compile_startp;
    180 	compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE];
    181 
    182 	/* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */
    183 	va_start(arg_listp, regexp);
    184 	next_argp = va_arg(arg_listp, /* const */ char *);
    185 	char_size = get_wchar(&current_char, regexp);
    186 	if (char_size < 0) {
    187 		ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
    188 	} else if (char_size > 0) {
    189 		regexp += char_size;
    190 	} else /* (char_size == 0 ) */ {
    191 		regexp = next_argp;
    192 		next_argp = va_arg(arg_listp, /* const */ char *);
    193 		char_size = get_wchar(&current_char, regexp);
    194 		if (char_size <= 0) {
    195 			ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
    196 		} else {
    197 			regexp += char_size;
    198 		}
    199 	}
    200 
    201 	/* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */
    202 
    203 	if (current_char == CIRCUMFLEX) {
    204 		char_size = get_wchar(&current_char, regexp);
    205 		if (char_size < 0) {
    206 			ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
    207 		} else if (char_size > 0) {
    208 			regexp += char_size;
    209 			*compilep = (unsigned char)START_OF_STRING_MARK;
    210 			compilep++;
    211 		} else if /* (char_size == 0) && */ (next_argp != (char *)0) {
    212 			regexp = next_argp;
    213 			next_argp = va_arg(arg_listp, /* const */ char *);
    214 			char_size = get_wchar(&current_char, regexp);
    215 			if (char_size <= 0) {
    216 				ERROR_EXIT(&regcmp_lock, arg_listp,
    217 				    compile_startp);
    218 			} else {
    219 				regexp += char_size;
    220 			}
    221 			*compilep = (unsigned char)START_OF_STRING_MARK;
    222 			compilep++;
    223 		} else {
    224 			/* ((char_size==0) && (next_argp==(char *)0)) */
    225 			/*
    226 			 * the regular expression is "^"
    227 			 */
    228 			*compilep = (unsigned char)START_OF_STRING_MARK;
    229 			compilep++;
    230 			*compilep = (unsigned char)END_REGEX;
    231 			compilep++;
    232 			*compilep = '\0';
    233 			compilep++;
    234 			__i_size = (int)(compilep - compile_startp);
    235 			va_end(arg_listp);
    236 			lmutex_unlock(&regcmp_lock);
    237 			return (compile_startp);
    238 		}
    239 	}
    240 
    241 	/* COMPILE THE REGULAR EXPRESSION */
    242 
    243 	groupn = 0;
    244 	substringn = 0;
    245 	can_repeat = B_FALSE;
    246 	for (;;) {
    247 
    248 		/*
    249 		 * At the end of each iteration get the next character
    250 		 * from the regular expression and increment regexp to
    251 		 * point to the following character.  Exit when all
    252 		 * the characters in all the strings in the argument
    253 		 * list have been read.
    254 		 */
    255 
    256 		switch (current_char) {
    257 
    258 			/*
    259 			 * No fall-through.  Each case ends with either
    260 			 * a break or an error exit.  Each case starts
    261 			 * with compilep addressing the next location to
    262 			 * be written in the compiled regular expression,
    263 			 * and with regexp addressing the next character
    264 			 * to be read from the regular expression being
    265 			 * compiled.  Each case that doesn't return
    266 			 * increments regexp to address the next character
    267 			 * to be read from the regular expression and
    268 			 * increments compilep to address the next
    269 			 * location to be written in the compiled
    270 			 * regular expression.
    271 			 *
    272 			 * NOTE: The comments for each case give the meaning
    273 			 * of the regular expression compiled by the case
    274 			 * and the character string written to the compiled
    275 			 * regular expression by the case.  Each single
    276 			 * character
    277 			 * written to the compiled regular expression is
    278 			 * shown enclosed in angle brackets (<>).  Each
    279 			 * compiled regular expression begins with a marker
    280 			 * character which is shown as a named constant
    281 			 * (e.g. <ASCII_CHAR>). Character constants are
    282 			 * shown enclosed in single quotes (e.g. <'$'>).
    283 			 * All other single characters written to the
    284 			 * compiled regular expression are shown as lower
    285 			 * case variable names (e.g. <ascii_char> or
    286 			 * <multibyte_char>). Multicharacter
    287 			 * strings written to the compiled regular expression
    288 			 * are shown as variable names followed by elipses
    289 			 * (e.g. <regex...>).
    290 			 */
    291 
    292 		case DOLLAR_SIGN:
    293 			/* end of string marker or simple dollar sign */
    294 			/* compiles to <END_OF_STRING_MARK> or */
    295 			/* <ASCII_CHAR><'$'> */
    296 
    297 			char_size = get_wchar(&current_char, regexp);
    298 			if ((char_size == 0) && (next_argp == (char *)0)) {
    299 				can_repeat = B_FALSE;
    300 				*compilep = (unsigned char)END_OF_STRING_MARK;
    301 				compilep++;
    302 			} else {
    303 				can_repeat = B_TRUE;
    304 				*compilep = (unsigned char)ASCII_CHAR;
    305 				regex_typep = compilep;
    306 				compilep++;
    307 				*compilep = DOLLAR_SIGN;
    308 				compilep++;
    309 			}
    310 			break; /* end case DOLLAR_SIGN */
    311 
    312 		case DOT: /* any character */
    313 
    314 			/* compiles to <ANY_CHAR> */
    315 
    316 			can_repeat = B_TRUE;
    317 			*compilep = (unsigned char)ANY_CHAR;
    318 			regex_typep = compilep;
    319 			compilep++;
    320 
    321 			break; /* end case DOT */
    322 
    323 		case BACKSLASH: /* escaped character */
    324 
    325 			/*
    326 			 * compiles to <ASCII_CHAR><ascii_char> or
    327 			 * <MULTIBYTE_CHAR><multibyte_char>
    328 			 */
    329 
    330 			char_size = get_wchar(&current_char, regexp);
    331 			if (char_size <= 0) {
    332 				ERROR_EXIT(&regcmp_lock, arg_listp,
    333 				    compile_startp);
    334 			} else {
    335 				regexp += char_size;
    336 				can_repeat = B_TRUE;
    337 				expr_length = add_single_char_expr(
    338 				    compilep, current_char);
    339 				regex_typep = compilep;
    340 				compilep += expr_length;
    341 			}
    342 			break; /* end case '\\' */
    343 
    344 		case LEFT_SQUARE_BRACKET:
    345 			/* start of a character class expression */
    346 
    347 			/*
    348 			 * [^...c...] compiles to
    349 			 * <NOT_IN_CLASS><class_length><...c...>
    350 			 * [^...a-z...] compiles to
    351 			 * <NOT_IN_CLASS><class_length><...a<THRU>z...>
    352 			 * [...c...] compiles to
    353 			 * <IN_CLASS><class_length><...c...>
    354 			 * [...a-z...] compiles to
    355 			 * <IN_CLASS><class_length><...a<THRU>z...>
    356 			 *
    357 			 * NOTE: <class_length> includes the
    358 			 * <class_length> byte
    359 			 */
    360 
    361 			can_repeat = B_TRUE;
    362 			regex_typep = compilep;
    363 
    364 			/* DETERMINE THE CLASS TYPE */
    365 
    366 			/*
    367 			 * NOTE: This algorithm checks the value of the
    368 			 * "multibyte"
    369 			 * macro in <euc.h> (included in <widec.h> )
    370 			 * to find out if regcmp()
    371 			 * is compiling the regular expression in a
    372 			 * multibyte locale.
    373 			 */
    374 			char_size = get_wchar(&current_char, regexp);
    375 			if (char_size <= 0) {
    376 				ERROR_EXIT(&regcmp_lock, arg_listp,
    377 				    compile_startp);
    378 			} else if (current_char == CIRCUMFLEX) {
    379 				regexp++;
    380 				char_size = get_wchar(&current_char, regexp);
    381 				if (char_size <= 0) {
    382 					ERROR_EXIT(&regcmp_lock,
    383 					    arg_listp, compile_startp);
    384 				} else {
    385 					regexp += char_size;
    386 					if (!multibyte) {
    387 						*compilep = (unsigned char)
    388 						    NOT_IN_ASCII_CHAR_CLASS;
    389 					} else {
    390 						*compilep = (unsigned char)
    391 						    NOT_IN_MULTIBYTE_CHAR_CLASS;
    392 					}
    393 					/* leave space for <class_length> */
    394 					compilep += 2;
    395 				}
    396 			} else {
    397 				regexp += char_size;
    398 				if (!multibyte) {
    399 					*compilep = (unsigned char)
    400 					    IN_ASCII_CHAR_CLASS;
    401 				} else {
    402 					*compilep = (unsigned char)
    403 					    IN_MULTIBYTE_CHAR_CLASS;
    404 				}
    405 				/* leave space for <class_length> */
    406 				compilep += 2;
    407 			}
    408 
    409 			/* COMPILE THE CLASS */
    410 			/*
    411 			 * check for a leading right square bracket,
    412 			 * which is allowed
    413 			 */
    414 
    415 			if (current_char == RIGHT_SQUARE_BRACKET) {
    416 				/*
    417 				 * the leading RIGHT_SQUARE_BRACKET may
    418 				 * be part of a character range
    419 				 * expression like "[]-\]"
    420 				 */
    421 				dash_indicates_range = B_TRUE;
    422 				first_char_in_range = current_char;
    423 				char_size = get_wchar(&current_char, regexp);
    424 				if (char_size <= 0) {
    425 					ERROR_EXIT(&regcmp_lock,
    426 					    arg_listp, compile_startp);
    427 				} else {
    428 					regexp += char_size;
    429 					*compilep = RIGHT_SQUARE_BRACKET;
    430 					compilep++;
    431 				}
    432 			} else {
    433 				/*
    434 				 * decode the character in the following
    435 				 * while loop and decide then if it can
    436 				 * be the first character
    437 				 * in a character range expression
    438 				 */
    439 				dash_indicates_range = B_FALSE;
    440 			}
    441 
    442 			while (current_char != RIGHT_SQUARE_BRACKET) {
    443 				if (current_char != DASH) {
    444 					/*
    445 					 * if a DASH follows current_char,
    446 					 *  current_char, the DASH and the
    447 					 * character that follows the DASH
    448 					 * may form a character range
    449 					 * expression
    450 					 */
    451 					dash_indicates_range = B_TRUE;
    452 					first_char_in_range = current_char;
    453 					expr_length = add_char(
    454 					    compilep, current_char);
    455 					compilep += expr_length;
    456 
    457 				} else if /* (current_char == DASH) && */
    458 					(dash_indicates_range == B_FALSE) {
    459 					/*
    460 					 * current_char is a DASH, but
    461 					 * either begins the entire
    462 					 * character class or follows a
    463 					 * character that's already
    464 					 * part of a character range
    465 					 * expression, so it simply
    466 					 * represents the DASH character
    467 					 * itself
    468 					 */
    469 					*compilep = DASH;
    470 					compilep ++;
    471 					/*
    472 					 * if another DASH follows this
    473 					 * one, this DASH is part
    474 					 * of a character range expression
    475 					 * like "[--\]"
    476 					 */
    477 					dash_indicates_range = B_TRUE;
    478 					first_char_in_range = current_char;
    479 
    480 				} else /* ((current_char == DASH && */
    481 				/* (dash_indicates_range == B_TRUE)) */ {
    482 					/*
    483 					 * the DASH appears after a single
    484 					 * character that isn't
    485 					 * already part of a character
    486 					 * range expression, so it
    487 					 * and the characters preceding
    488 					 * and following it can form a
    489 					 * character range expression
    490 					 * like "[a-z]"
    491 					 */
    492 					char_size = get_wchar(
    493 					    &current_char, regexp);
    494 					if (char_size <= 0) {
    495 						ERROR_EXIT(&regcmp_lock,
    496 						    arg_listp, compile_startp);
    497 
    498 					} else if (current_char ==
    499 						RIGHT_SQUARE_BRACKET) {
    500 						/*
    501 						 * the preceding DASH is
    502 						 * the last character in the
    503 						 * class and represents the
    504 						 * DASH character itself
    505 						 */
    506 						*compilep = DASH;
    507 						compilep++;
    508 
    509 					} else if (valid_range(
    510 					    first_char_in_range,
    511 					    current_char) == B_FALSE) {
    512 
    513 						ERROR_EXIT(&regcmp_lock,
    514 						arg_listp, compile_startp);
    515 
    516 					} else {
    517 						/*
    518 						 * the DASH is part of a
    519 						 * character range
    520 						 * expression; encode the
    521 						 * rest of the expression
    522 						 */
    523 						regexp += char_size;
    524 						*compilep = (unsigned char)
    525 						    THRU;
    526 						compilep++;
    527 						expr_length = add_char(
    528 						    compilep, current_char);
    529 						compilep += expr_length;
    530 						/*
    531 						 * if a DASH follows this
    532 						 * character range
    533 						 * expression,
    534 						 * it represents the DASH
    535 						 * character itself
    536 						 */
    537 						dash_indicates_range =
    538 						    B_FALSE;
    539 					}
    540 				}
    541 
    542 				/* GET THE NEXT CHARACTER */
    543 
    544 				char_size = get_wchar(&current_char, regexp);
    545 				if (char_size <= 0) {
    546 					ERROR_EXIT(&regcmp_lock,
    547 					    arg_listp, compile_startp);
    548 				} else {
    549 					regexp += char_size;
    550 				}
    551 
    552 			}
    553 			/* end while (current_char != RIGHT_SQUARE_BRACKET) */
    554 
    555 			/* INSERT THE LENGTH OF THE CLASS INTO THE */
    556 			/* COMPILED EXPRESSION */
    557 
    558 			class_length = (unsigned int)
    559 			    (compilep - regex_typep - 1);
    560 			if ((class_length < 2) ||
    561 			    (class_length > MAX_SINGLE_BYTE_INT)) {
    562 				ERROR_EXIT(&regcmp_lock, arg_listp,
    563 				    compile_startp);
    564 			} else {
    565 				*(regex_typep + 1) = (unsigned char)
    566 				    class_length;
    567 			}
    568 			break; /* end case LEFT_SQUARE_BRACKET */
    569 
    570 		case LEFT_PAREN:
    571 
    572 			/*
    573 			 * start of a parenthesized group of regular
    574 			 * expressions compiles to <'\0'><'\0'>, leaving
    575 			 * space in the compiled regular expression for
    576 			 * <group_type|ADDED_LENGTH_BITS><group_length>
    577 			 */
    578 
    579 			if (push_compilep(compilep) == (char *)0) {
    580 				/*
    581 				 * groups can contain groups, so group
    582 				 * start pointers
    583 				 * must be saved and restored in sequence
    584 				 */
    585 				ERROR_EXIT(&regcmp_lock, arg_listp,
    586 				    compile_startp);
    587 			} else {
    588 				can_repeat = B_FALSE;
    589 				*compilep = '\0';	/* for debugging */
    590 				compilep++;
    591 				*compilep = '\0';	/* for debugging */
    592 				compilep++;
    593 			}
    594 			break; /* end case LEFT_PAREN */
    595 
    596 		case RIGHT_PAREN:
    597 			/* end of a marked group of regular expressions */
    598 
    599 			/*
    600 			 * (<regex>)$0-9 compiles to
    601 			 * <SAVED_GROUP><substringn><compiled_regex...>\
    602 			 * <END_SAVED_GROUP><substringn><return_arg_number>
    603 			 * (<regex>)* compiles to
    604 			 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>
    605 			 * <group_length> <compiled_regex...>
    606 			 * <END_GROUP|ZERO_OR_MORE><groupn>
    607 			 * (<regex>)+ compiles to
    608 			 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>
    609 			 * <group_length>\
    610 			 * <compiled_regex...><END_GROUP|ONE_OR_MORE>
    611 			 * <groupn>
    612 			 * (<regex>){...} compiles to
    613 			 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
    614 			 * <compiled_regex...><END_GROUP|COUNT><groupn>\
    615 			 * <minimum_repeat_count><maximum_repeat_count>
    616 			 * otherwise (<regex>) compiles to
    617 			 * <SIMPLE_GROUP><blank><compiled_regex...>
    618 			 * <END_GROUP><groupn>
    619 			 *
    620 			 * NOTE:
    621 			 *
    622 			 * group_length + (256 * ADDED_LENGTH_BITS) ==
    623 			 * length_of(<compiled_regex...><END_GROUP|...>
    624 			 * <groupn>)
    625 			 * which also ==
    626 			 * length_of(<group_type|ADDED_LENGTH_BITS>
    627 			 * <group_length>\ <compiled_regex...>)
    628 			 * groupn no longer seems to be used, but the code
    629 			 * still computes it to preserve backward
    630 			 * compatibility
    631 			 * with earlier versions of regex().
    632 			 */
    633 
    634 			/* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */
    635 
    636 			regex_typep = pop_compilep();
    637 			if (regex_typep == (char *)0) {
    638 				ERROR_EXIT(&regcmp_lock, arg_listp,
    639 				    compile_startp);
    640 			}
    641 			char_size = get_wchar(&current_char, regexp);
    642 			if (char_size < 0) {
    643 				ERROR_EXIT(&regcmp_lock, arg_listp,
    644 				    compile_startp);
    645 			} else if (char_size == 0) {
    646 				*regex_typep = SIMPLE_GROUP;
    647 				can_repeat = B_TRUE;
    648 				*compilep = (unsigned char)END_GROUP;
    649 				regex_typep = compilep;
    650 				compilep++;
    651 				*compilep = (unsigned char)groupn;
    652 				groupn++;
    653 				compilep++;
    654 			} else if (current_char == DOLLAR_SIGN) {
    655 				*regex_typep = SAVED_GROUP;
    656 				regex_typep++;
    657 				*regex_typep = (char)substringn;
    658 				can_repeat = B_FALSE;
    659 				regexp ++;
    660 				return_arg_number = get_digit(regexp);
    661 				if ((return_arg_number < 0) ||
    662 				    (substringn >= NSUBSTRINGS)) {
    663 					ERROR_EXIT(&regcmp_lock, arg_listp,
    664 					    compile_startp);
    665 				}
    666 				regexp++;
    667 				*compilep = (unsigned char)END_SAVED_GROUP;
    668 				compilep++;
    669 				*compilep = (unsigned char)substringn;
    670 				substringn++;
    671 				compilep++;
    672 				*compilep = (unsigned char)return_arg_number;
    673 				compilep++;
    674 			} else {
    675 				switch (current_char) {
    676 				case STAR:
    677 					*regex_typep = ZERO_OR_MORE_GROUP;
    678 					break;
    679 				case PLUS:
    680 					*regex_typep = ONE_OR_MORE_GROUP;
    681 					break;
    682 				case LEFT_CURLY_BRACE:
    683 					*regex_typep = COUNTED_GROUP;
    684 					break;
    685 				default:
    686 					*regex_typep = SIMPLE_GROUP;
    687 				}
    688 				if (*regex_typep != SIMPLE_GROUP) {
    689 					group_length = (unsigned int)
    690 						(compilep - regex_typep);
    691 					if (group_length >= 1024) {
    692 						ERROR_EXIT(&regcmp_lock,
    693 						arg_listp, compile_startp);
    694 					}
    695 					high_bits = group_length >>
    696 					    TIMES_256_SHIFT;
    697 					low_bits = group_length &
    698 					    SINGLE_BYTE_MASK;
    699 					*regex_typep =
    700 					    (unsigned char)
    701 					    ((unsigned int)
    702 						*regex_typep | high_bits);
    703 					regex_typep++;
    704 					*regex_typep =
    705 					    (unsigned char)low_bits;
    706 				}
    707 				can_repeat = B_TRUE;
    708 				*compilep = (unsigned char)END_GROUP;
    709 				regex_typep = compilep;
    710 				compilep++;
    711 				*compilep = (unsigned char)groupn;
    712 				groupn++;
    713 				compilep++;
    714 			}
    715 
    716 			break; /* end case RIGHT_PAREN */
    717 
    718 		case STAR: /* zero or more repetitions of the */
    719 				/* preceding expression */
    720 
    721 			/*
    722 			 * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\
    723 			 * <compiled_regex...>
    724 			 * (<regex...>)* compiles to
    725 			 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
    726 			 * <group_length><compiled_regex...>\
    727 			 * <END_GROUP|ZERO_OR_MORE><groupn>
    728 			 */
    729 
    730 			if (can_repeat == B_FALSE) {
    731 				ERROR_EXIT(&regcmp_lock, arg_listp,
    732 				    compile_startp);
    733 			} else {
    734 				can_repeat = B_FALSE;
    735 				*regex_typep = (unsigned char)
    736 				((unsigned int)*regex_typep | ZERO_OR_MORE);
    737 			}
    738 			break; /* end case '*' */
    739 
    740 		case PLUS:
    741 			/* one or more repetitions of the preceding */
    742 				/* expression */
    743 
    744 			/*
    745 			 * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\
    746 			 * <compiled_regex...> (<regex...>)+ compiles to
    747 			 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
    748 			 * <group_length><compiled_regex...>\
    749 			 * <END_GROUP|ONE_OR_MORE><groupn>
    750 			 */
    751 
    752 			if (can_repeat == B_FALSE) {
    753 				ERROR_EXIT(&regcmp_lock, arg_listp,
    754 					compile_startp);
    755 			} else {
    756 				can_repeat = B_FALSE;
    757 				*regex_typep =
    758 					(unsigned char)((unsigned int)*
    759 					regex_typep | ONE_OR_MORE);
    760 			}
    761 			break; /* end case '+' */
    762 
    763 		case LEFT_CURLY_BRACE:
    764 
    765 			/*
    766 			 * repeat the preceding regular expression
    767 			 * at least min_count times
    768 			 * and at most max_count times
    769 			 *
    770 			 * <regex...>{min_count} compiles to
    771 			 * <regex type|COUNT><compiled_regex...>
    772 			 * <min_count><min_count>
    773 			 *
    774 			 * <regex...>{min_count,} compiles to
    775 			 * <regex type|COUNT><compiled_regex...>
    776 			 * <min_count><UNLIMITED>
    777 			 *
    778 			 * <regex...>{min_count,max_count} compiles to
    779 			 * <regex type>|COUNT><compiled_regex...>
    780 			 * <min_count><max_count>
    781 			 *
    782 			 * (<regex...>){min_count,max_count} compiles to
    783 			 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
    784 			 * <compiled_regex...><END_GROUP|COUNT><groupn>\
    785 			 * <minimum_match_count><maximum_match_count>
    786 			 */
    787 
    788 			if (can_repeat == B_FALSE) {
    789 				ERROR_EXIT(&regcmp_lock, arg_listp,
    790 					compile_startp);
    791 			}
    792 			can_repeat = B_FALSE;
    793 			*regex_typep = (unsigned char)((unsigned int)*
    794 					regex_typep | COUNT);
    795 			count_length = get_count(&min_count, regexp);
    796 			if (count_length <= 0) {
    797 				ERROR_EXIT(&regcmp_lock, arg_listp,
    798 					compile_startp);
    799 			}
    800 			regexp += count_length;
    801 
    802 			if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */
    803 				regexp++;
    804 				max_count = min_count;
    805 			} else if (*regexp == COMMA) { /* {min_count,..} */
    806 				regexp++;
    807 				/* {min_count,}   */
    808 				if (*regexp == RIGHT_CURLY_BRACE) {
    809 					regexp++;
    810 					max_count = UNLIMITED;
    811 				} else { /* {min_count,max_count} */
    812 					count_length = get_count(
    813 						&max_count, regexp);
    814 					if (count_length <= 0) {
    815 						ERROR_EXIT(&regcmp_lock,
    816 						arg_listp, compile_startp);
    817 					}
    818 					regexp += count_length;
    819 					if (*regexp != RIGHT_CURLY_BRACE) {
    820 						ERROR_EXIT(&regcmp_lock,
    821 						arg_listp, compile_startp);
    822 					}
    823 					regexp++;
    824 				}
    825 			} else { /* invalid expression */
    826 				ERROR_EXIT(&regcmp_lock, arg_listp,
    827 					compile_startp);
    828 			}
    829 
    830 			if ((min_count > MAX_SINGLE_BYTE_INT) ||
    831 				((max_count != UNLIMITED) &&
    832 				(min_count > max_count))) {
    833 				ERROR_EXIT(&regcmp_lock, arg_listp,
    834 					compile_startp);
    835 			} else {
    836 				*compilep = (unsigned char)min_count;
    837 				compilep++;
    838 				*compilep = (unsigned char)max_count;
    839 				compilep++;
    840 			}
    841 			break; /* end case LEFT_CURLY_BRACE */
    842 
    843 		default: /* a single non-special character */
    844 
    845 			/*
    846 			 * compiles to <ASCII_CHAR><ascii_char> or
    847 			 * <MULTIBYTE_CHAR><multibyte_char>
    848 			 */
    849 
    850 			can_repeat = B_TRUE;
    851 			regex_typep = compilep;
    852 			expr_length = add_single_char_expr(compilep,
    853 					current_char);
    854 			compilep += expr_length;
    855 
    856 		} /* end switch (current_char) */
    857 
    858 		/* GET THE NEXT CHARACTER FOR THE WHILE LOOP */
    859 
    860 		char_size = get_wchar(&current_char, regexp);
    861 		if (char_size < 0) {
    862 			ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
    863 		} else if (char_size > 0) {
    864 			regexp += char_size;
    865 		} else if /* (char_size == 0) && */ (next_argp != (char *)0) {
    866 			regexp = next_argp;
    867 			next_argp = va_arg(arg_listp, /* const */ char *);
    868 			char_size = get_wchar(&current_char, regexp);
    869 			if (char_size <= 0) {
    870 				ERROR_EXIT(&regcmp_lock, arg_listp,
    871 					compile_startp);
    872 			} else {
    873 				regexp += char_size;
    874 			}
    875 		} else /* ((char_size == 0) && (next_argp == (char *)0)) */ {
    876 			if (pop_compilep() != (char *)0) {
    877 				/* unmatched parentheses */
    878 				ERROR_EXIT(&regcmp_lock, arg_listp,
    879 					compile_startp);
    880 			}
    881 			*compilep = (unsigned char)END_REGEX;
    882 			compilep++;
    883 			*compilep = '\0';
    884 			compilep++;
    885 			__i_size = (int)(compilep - compile_startp);
    886 			va_end(arg_listp);
    887 			lmutex_unlock(&regcmp_lock);
    888 			return (compile_startp);
    889 		}
    890 	} /* end for (;;) */
    891 
    892 } /* regcmp() */
    893 
    894 
    895 /* DEFINITIONS OF PRIVATE FUNCTIONS */
    896 
    897 static int
    898 add_char(char *compilep, wchar_t wchar)
    899 {
    900 	int expr_length;
    901 
    902 	if ((unsigned int)wchar <= (unsigned int)0x7f) {
    903 		*compilep = (unsigned char)wchar;
    904 		expr_length = 1;
    905 	} else {
    906 		expr_length = wctomb(compilep, wchar);
    907 	}
    908 	return (expr_length);
    909 }
    910 
    911 static int
    912 add_single_char_expr(char *compilep, wchar_t wchar)
    913 {
    914 	int expr_length = 0;
    915 
    916 	if ((unsigned int)wchar <= (unsigned int)0x7f) {
    917 		*compilep = (unsigned char)ASCII_CHAR;
    918 		compilep++;
    919 		*compilep = (unsigned char)wchar;
    920 		expr_length += 2;
    921 	} else {
    922 		*compilep = (unsigned char)MULTIBYTE_CHAR;
    923 		compilep++;
    924 		expr_length++;
    925 		expr_length += wctomb(compilep, wchar);
    926 	}
    927 	return (expr_length);
    928 }
    929 
    930 static int
    931 get_count(int *countp, const char *regexp)
    932 {
    933 	char count_char = '0';
    934 	int count = 0;
    935 	int count_length = 0;
    936 
    937 	if (regexp == (char *)0) {
    938 		return ((int)0);
    939 	} else {
    940 		count_char = *regexp;
    941 		while (('0' <= count_char) && (count_char <= '9')) {
    942 			count = (10 * count) + (int)(count_char - '0');
    943 			count_length++;
    944 			regexp++;
    945 			count_char = *regexp;
    946 		}
    947 	}
    948 	*countp = count;
    949 	return (count_length);
    950 }
    951 
    952 static int
    953 get_digit(const char *regexp)
    954 {
    955 	char digit;
    956 
    957 	if (regexp == (char *)0) {
    958 		return ((int)-1);
    959 	} else {
    960 		digit = *regexp;
    961 		if (('0' <= digit) && (digit <= '9')) {
    962 			return ((int)(digit - '0'));
    963 		} else {
    964 			return ((int)-1);
    965 		}
    966 	}
    967 }
    968 
    969 static int
    970 get_wchar(wchar_t *wcharp, const char *regexp)
    971 {
    972 	int char_size;
    973 
    974 	if (regexp == (char *)0) {
    975 		char_size = 0;
    976 		*wcharp = (wchar_t)((unsigned int)'\0');
    977 	} else if (*regexp == '\0') {
    978 		char_size = 0;
    979 		*wcharp = (wchar_t)((unsigned int)*regexp);
    980 	} else if ((unsigned char)*regexp <= (unsigned char)0x7f) {
    981 		char_size = 1;
    982 		*wcharp = (wchar_t)((unsigned int)*regexp);
    983 	} else {
    984 		char_size = mbtowc(wcharp, regexp, MB_LEN_MAX);
    985 	}
    986 	return (char_size);
    987 }
    988 
    989 static char *
    990 pop_compilep(void)
    991 {
    992 	char *compilep;
    993 
    994 	if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) {
    995 		return ((char *)0);
    996 	} else {
    997 		compilep = *compilep_stackp;
    998 		compilep_stackp++;
    999 		return (compilep);
   1000 	}
   1001 }
   1002 
   1003 static char *
   1004 push_compilep(char *compilep)
   1005 {
   1006 	if (compilep_stackp <= &compilep_stack[0]) {
   1007 		return ((char *)0);
   1008 	} else {
   1009 		compilep_stackp--;
   1010 		*compilep_stackp = compilep;
   1011 		return (compilep);
   1012 	}
   1013 }
   1014 
   1015 static boolean_t
   1016 valid_range(wchar_t lower_char, wchar_t upper_char)
   1017 {
   1018 	return (((lower_char <= 0x7f) && (upper_char <= 0x7f) &&
   1019 	    !iswcntrl(lower_char) && !iswcntrl(upper_char) &&
   1020 	    (lower_char < upper_char)) ||
   1021 	    (((lower_char & WCHAR_CSMASK) ==
   1022 	    (upper_char & WCHAR_CSMASK)) &&
   1023 	    (lower_char < upper_char)));
   1024 }
   1025