Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include "fields.h"
     29 
     30 /*
     31  * fields
     32  *
     33  * Overview
     34  *   By a field, we mean the various delimited character sequences within each
     35  *   line of the input files.  The sort key consists of an ordered sequence of
     36  *   fields, which need not include all possible fields for the given line.
     37  *   (Furthermore, not every line need contain sufficient fields for the fields
     38  *   given within the sort key.  In fact, none of the lines in the input stream
     39  *   need contain sufficient fields.)
     40  *
     41  *   There are two methods for specifying fields for sort(1); these are
     42  *   discussed in options.c.  Here we discuss only the internal representation
     43  *   of fields, as used for constructing the collation vector for each line as
     44  *   defined by the sort key.
     45  *
     46  * Representation
     47  *   The sort key is a singly-linked list of field specifiers.  At present,
     48  *   fields may belong to one of three species:  alphabetical, numerical, or
     49  *   monthly; the species (f_species) then indicates the conversion function
     50  *   (f_convert) used to transform the raw characters of the character sequence
     51  *   to a collatable form.  (In principle, this allows us to consider future
     52  *   field species such as hexadecimal.)
     53  *
     54  *   Fields and offsets are numbered such that zero refers to the first field or
     55  *   character, respectively.  Thus, the interpretation of a key specifier, m.n,
     56  *   is that the field begins at the nth character beyond the mth occurence of
     57  *   the key separator.  If the blanks flag has been specified, then the field
     58  *   begins at the nth non-blank character past the mth key separator.  If the
     59  *   key separator is unspecified, then the key separator is defined as one or
     60  *   more blank characters.
     61  *
     62  *   In general, the various options afforded by sort may be broken into two
     63  *   categories:  field species and field modifiers.  For each field species,
     64  *   there is one or more conversion routines that take a delimited character
     65  *   sequence and convert it to a character sequence collatable by strcmp() or
     66  *   memcmp().  For field species that may be further modified, such as the
     67  *   fold-to-uppercase option for alphabetic fields, the conversion routine may
     68  *   be aware of how the modifier affects collation.  Finally, the no-modifiers
     69  *   case may present an opportunity for a simplified, faster version.
     70  *
     71  * Code Structure
     72  *   The code paths for single-byte and multi-byte locales diverge significantly
     73  *   in fields.c.  Most routines have an *_wide() version, which produces an
     74  *   equivalent effect for line records whose data field is composed of wide
     75  *   characters (wchar_t).  However, the l_collated field of a line record is
     76  *   always composed of characters, so that the radix sorts provided in
     77  *   internal.c can work in both single- and multi-byte locales.  Thus, in the
     78  *   various convert_*_wide() routines, the output is placed in l_collated, with
     79  *   a length multiplier of 4.
     80  */
     81 
     82 #define	BEFORE_NUMBER	0x0
     83 #define	IN_NUMBER	0x1
     84 
     85 static char	numerical_separator;
     86 static char	numerical_decimal;
     87 static char	monetary_separator;
     88 static char	monetary_decimal;
     89 
     90 static wchar_t	w_numerical_separator;
     91 static wchar_t	w_numerical_decimal;
     92 static wchar_t	w_monetary_separator;
     93 static wchar_t	w_monetary_decimal;
     94 
     95 #define	MONTHS_IN_YEAR	12
     96 #define	MAX_MON_LEN	20
     97 
     98 enum { MO_NONE = 1, MO_OFFSET = 2 };
     99 
    100 static char	*months[MONTHS_IN_YEAR];
    101 static size_t	month_lengths[MONTHS_IN_YEAR];
    102 static wchar_t	*w_months[MONTHS_IN_YEAR];
    103 static size_t	w_month_lengths[MONTHS_IN_YEAR];
    104 
    105 #define	DECIMAL_CHAR		(numerical_decimal)
    106 #define	IS_BLANK(x)		(isspace((uchar_t)(x)) && (x) != '\n')
    107 #define	IS_SEPARATOR(x)		\
    108 	((numerical_separator != '\0' && (x) == numerical_separator) || \
    109 	(monetary_separator != '\0' && (x) == monetary_separator))
    110 #define	IS_DECIMAL(x)		\
    111 	((x) == numerical_decimal || \
    112 	(monetary_decimal != '\0' && (x) == monetary_decimal))
    113 #define	W_DECIMAL_CHAR		(w_numerical_decimal)
    114 #define	W_IS_BLANK(x)		(iswspace(x) && (x) != L'\n')
    115 #define	W_IS_SEPARATOR(x)	\
    116 	((numerical_separator != '\0' && (x) == w_numerical_separator) || \
    117 	(monetary_separator != '\0' && (x) == w_monetary_separator))
    118 #define	W_IS_DECIMAL(x)		\
    119 	(((x) == w_numerical_decimal) || \
    120 	(monetary_decimal != '\0' && (x) == w_monetary_decimal))
    121 
    122 #define	INTERFIELD_SEPARATOR '\0'
    123 #define	W_INTERFIELD_SEPARATOR L'\0'
    124 
    125 #define	INT_SIGN_FLIP_MASK 0x80000000
    126 #define	INT_SIGN_PASS_MASK 0x00000000
    127 
    128 /*
    129  * strx_ops_t, xfrm_len, and xfrm_cpy:  In the case where we are sorting in the
    130  * C locale, we want to avoid the expense of transforming strings to collatable
    131  * forms since, by definition, an arbitrary string in the C locale is already in
    132  * its collatable form.  Therefore, we construct a small ops vector (the
    133  * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into
    134  * strlen()-like behaviour, and xfrm_cpy() to make strncpy() appear
    135  * strxfrm()-like.
    136  */
    137 /*ARGSUSED*/
    138 static size_t
    139 xfrm_len(const char *s2, size_t len)
    140 {
    141 	return (strxfrm(NULL, s2, 0) + 1);
    142 }
    143 
    144 /*
    145  * The length represented by n includes a null character, so to return the
    146  * correct length we subtract 1.  Note that this function is only used by
    147  * field_convert_alpha, and isn't for general use, as it assumes that n is the
    148  * length of s2 plus a null character.
    149  */
    150 static size_t
    151 C_ncpy(char *s1, const char *s2, size_t n)
    152 {
    153 	(void) strncpy(s1, s2, n);
    154 	return (n - 1);
    155 }
    156 
    157 /*ARGSUSED*/
    158 static size_t
    159 C_len(const char *s, size_t len)
    160 {
    161 	ASSERT(s != NULL);
    162 	return (len);
    163 }
    164 
    165 typedef struct _strx_ops {
    166 	size_t	(*sx_len)(const char *, size_t);
    167 	size_t	(*sx_xfrm)(char *, const char *, size_t);
    168 } strx_ops_t;
    169 
    170 static const strx_ops_t C_ops = { C_len, C_ncpy };
    171 static const strx_ops_t SB_ops = { xfrm_len, strxfrm };
    172 
    173 static const strx_ops_t *xfrm_ops;
    174 
    175 static void
    176 field_initialize_separator(void)
    177 {
    178 	/*
    179 	 * A locale need not define all of the cases below:  only decimal_point
    180 	 * must be defined.  Furthermore, sort(1) has traditionally not used the
    181 	 * positive_sign and negative_sign, grouping, or currency_symbols (or
    182 	 * their numeric counterparts, if any).
    183 	 */
    184 	struct lconv *conv = localeconv();
    185 
    186 	if (!xstreql(conv->thousands_sep, "")) {
    187 		numerical_separator = *conv->thousands_sep;
    188 		(void) mbtowc(&w_numerical_separator, conv->thousands_sep,
    189 		    MB_CUR_MAX);
    190 	} else
    191 		numerical_separator = '\0';
    192 
    193 	if (!xstreql(conv->mon_thousands_sep, "")) {
    194 		monetary_separator = *conv->mon_thousands_sep;
    195 		(void) mbtowc(&w_monetary_separator, conv->mon_thousands_sep,
    196 		    MB_CUR_MAX);
    197 	} else
    198 		monetary_separator = '\0';
    199 
    200 	if (!xstreql(conv->mon_decimal_point, "")) {
    201 		monetary_decimal = *conv->mon_decimal_point;
    202 		(void) mbtowc(&w_monetary_decimal, conv->mon_decimal_point,
    203 		    MB_CUR_MAX);
    204 	} else
    205 		monetary_decimal = '\0';
    206 
    207 	numerical_decimal = *conv->decimal_point;
    208 	(void) mbtowc(&w_numerical_decimal, conv->decimal_point, MB_CUR_MAX);
    209 }
    210 
    211 static void
    212 field_initialize_month(int is_c_locale)
    213 {
    214 	int i;
    215 	int j;
    216 	struct tm this_month;
    217 	const char *c_months[MONTHS_IN_YEAR] = {
    218 		"JAN", "FEB", "MAR", "APR", "MAY", "JUN",
    219 		"JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
    220 	};
    221 
    222 	char month_name[MAX_MON_LEN * MB_LEN_MAX];
    223 	wchar_t	w_month_name[MAX_MON_LEN];
    224 
    225 	if (is_c_locale) {
    226 		for (i = 0; i < MONTHS_IN_YEAR; i++) {
    227 			months[i] = (char *)c_months[i];
    228 			month_lengths[i] = strlen(c_months[i]);
    229 		}
    230 		/*
    231 		 * We don't need to initialize the wide version of the month
    232 		 * names.
    233 		 */
    234 		return;
    235 	}
    236 
    237 	(void) memset(&this_month, 0, sizeof (this_month));
    238 
    239 	for (i = 0; i < MONTHS_IN_YEAR; i++) {
    240 		this_month.tm_mon = i;
    241 
    242 		(void) strftime(month_name, sizeof (month_name),
    243 		    "%b", &this_month);
    244 
    245 		for (j = 0; j < strlen(month_name); j++)
    246 			month_name[j] = toupper(month_name[j]);
    247 		(void) mbstowcs(w_month_name, month_name, MAX_MON_LEN);
    248 
    249 		months[i] = strdup(month_name);
    250 		month_lengths[i] = strlen(month_name);
    251 		w_months[i] = wsdup(w_month_name);
    252 		w_month_lengths[i] = wslen(w_month_name);
    253 	}
    254 }
    255 
    256 void
    257 field_initialize(sort_t *S)
    258 {
    259 	field_initialize_month(S->m_c_locale);
    260 	field_initialize_separator();
    261 
    262 	if (S->m_c_locale)
    263 		xfrm_ops = &C_ops;
    264 	else
    265 		xfrm_ops = &SB_ops;
    266 }
    267 
    268 field_t *
    269 field_new(sort_t *S)
    270 {
    271 	field_t	*F = safe_realloc(NULL, sizeof (field_t));
    272 
    273 	F->f_start_field = -1;
    274 	F->f_start_offset = -1;
    275 	F->f_end_field = -1;
    276 	F->f_end_offset = -1;
    277 	F->f_next = NULL;
    278 
    279 	if (S == NULL) {
    280 		F->f_species = ALPHA;
    281 		F->f_options = 0;
    282 	} else {
    283 		F->f_species = S->m_default_species;
    284 		F->f_options = S->m_field_options;
    285 	}
    286 
    287 	return (F);
    288 }
    289 
    290 void
    291 field_delete(field_t *F)
    292 {
    293 	free(F);
    294 }
    295 
    296 /*
    297  * The recursive implementation of field_add_to_chain() given below is
    298  * inappropriate if function calls are expensive, or a truly large number of
    299  * fields are anticipated.
    300  */
    301 void
    302 field_add_to_chain(field_t **F, field_t *A)
    303 {
    304 	if (*F == NULL)
    305 		*F = A;
    306 	else
    307 		field_add_to_chain(&((*F)->f_next), A);
    308 }
    309 
    310 #ifdef DEBUG
    311 #ifndef _LP64
    312 #define	FIELD_FMT \
    313 "\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n"
    314 #else /* !_LP64 */
    315 #define	FIELD_FMT \
    316 "\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n"
    317 #endif /* !_LP64 */
    318 
    319 /*
    320  * field_print is used only for debugging purposes.
    321  */
    322 void
    323 field_print(field_t *F)
    324 {
    325 	char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"};
    326 	int status = 0;
    327 
    328 	(void) fprintf(stderr, "Type: %s", field_names[F->f_species]);
    329 	(void) fprintf(stderr, "\tOptions: ");
    330 
    331 	if (F->f_options & FIELD_REVERSE_COMPARISONS) {
    332 		(void) fprintf(stderr, "REVERSE");
    333 		status++;
    334 	}
    335 	if (F->f_options & FIELD_DICTIONARY_ORDER) {
    336 		(void) fprintf(stderr, "DICTIONARY ");
    337 		status++;
    338 	}
    339 	if (F->f_options & FIELD_FOLD_UPPERCASE) {
    340 		(void) fprintf(stderr, "UPPERCASE ");
    341 		status++;
    342 	}
    343 	if (F->f_options & FIELD_IGNORE_NONPRINTABLES) {
    344 		(void) fprintf(stderr, "PRINTABLES ");
    345 		status++;
    346 	}
    347 	if (F->f_options & FIELD_IGNORE_BLANKS_START) {
    348 		(void) fprintf(stderr, "BLANKS_START ");
    349 		status++;
    350 	}
    351 	if (F->f_options & FIELD_IGNORE_BLANKS_END) {
    352 		(void) fprintf(stderr, "BLANKS_END ");
    353 		status++;
    354 	}
    355 
    356 	if (status == 0)
    357 		(void) fprintf(stderr, "NO_MODIFIERS");
    358 
    359 	(void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset,
    360 	    F->f_end_field, F->f_end_offset);
    361 }
    362 #endif /* DEBUG */
    363 
    364 static ssize_t
    365 field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks)
    366 {
    367 	char *S = L->l_data.sp;
    368 	char *T = S;
    369 	char *eol = S + L->l_data_length;
    370 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
    371 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
    372 	ssize_t ret;
    373 
    374 	ASSERT(is_end || field > -1);
    375 
    376 	if (is_end && field == -1)
    377 		return (L->l_data_length);
    378 
    379 	while (field-- > 0) {
    380 		while (T < eol && IS_BLANK(*T))
    381 			T++;
    382 
    383 		while (T < eol && !IS_BLANK(*T))
    384 			T++;
    385 	}
    386 
    387 	if ((!is_end || offset > 0) && is_blanks) {
    388 		while (IS_BLANK(*T))
    389 			T++;
    390 	}
    391 
    392 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
    393 		return (L->l_data_length);
    394 
    395 	return (ret);
    396 }
    397 
    398 static void
    399 field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
    400 {
    401 	ASSERT(F->f_start_field > -1);
    402 
    403 	*start = field_boundary(F, L, 0,
    404 	    F->f_options & FIELD_IGNORE_BLANKS_START);
    405 	*end = field_boundary(F, L, 1,
    406 	    F->f_options & FIELD_IGNORE_BLANKS_END);
    407 }
    408 
    409 static ssize_t
    410 field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks)
    411 {
    412 	wchar_t *S = L->l_data.wp;
    413 	wchar_t *T = S;
    414 	wchar_t *eol = S + L->l_data_length;
    415 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
    416 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
    417 	ssize_t ret;
    418 
    419 	ASSERT(is_end || field > -1);
    420 
    421 	if (is_end && field == -1)
    422 		return (L->l_data_length);
    423 
    424 	while (field-- > 0) {
    425 		while (T < eol && W_IS_BLANK(*T))
    426 			T++;
    427 
    428 		while (T < eol && !W_IS_BLANK(*T))
    429 			T++;
    430 	}
    431 
    432 	if ((!is_end || offset > 0) && is_blanks) {
    433 		while (W_IS_BLANK(*T))
    434 			T++;
    435 	}
    436 
    437 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
    438 		return (L->l_data_length);
    439 
    440 	return (ret);
    441 }
    442 
    443 static void
    444 field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
    445 {
    446 	ASSERT(F->f_start_field > -1);
    447 
    448 	*start = field_boundary_wide(F, L, 0,
    449 	    F->f_options & FIELD_IGNORE_BLANKS_START);
    450 	*end = field_boundary_wide(F, L, 1,
    451 	    F->f_options & FIELD_IGNORE_BLANKS_END);
    452 }
    453 
    454 static ssize_t
    455 field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks,
    456     vchar_t delimiter)
    457 {
    458 	char *S = L->l_data.sp;
    459 	char *T = S;
    460 	char *eol = S + L->l_data_length;
    461 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
    462 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
    463 	ssize_t ret;
    464 
    465 	ASSERT(is_end || field > -1);
    466 
    467 	if (is_end && field == -1)
    468 		return (L->l_data_length);
    469 
    470 	while (field-- > 0) {
    471 		T = xstrnchr(T, delimiter.sc, eol - T);
    472 		if (T == NULL || T > eol)
    473 			return (L->l_data_length);
    474 
    475 		T++;
    476 	}
    477 
    478 	if ((!is_end || offset != 0) && is_blanks) {
    479 		while (IS_BLANK(*T))
    480 			T++;
    481 	}
    482 
    483 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
    484 		if (S[L->l_data_length - 1] == delimiter.sc) {
    485 			return (L->l_data_length - 1);
    486 		} else {
    487 			return (L->l_data_length);
    488 		}
    489 	}
    490 
    491 	if (is_end && offset == 0)
    492 		ret--;
    493 
    494 	return (ret);
    495 }
    496 
    497 /*
    498  * field_delimit_tabbed() is called when a field separator has been defined
    499  * using the -t option.  The character at the offset, start, is either one or
    500  * more character positions past the delimiter marking the start of the
    501  * field, or at the end of the line.
    502  */
    503 static void
    504 field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end,
    505     vchar_t delimiter)
    506 {
    507 	ASSERT(F->f_start_field > -1);
    508 
    509 	*start = field_boundary_tabbed(F, L, 0, F->f_options &
    510 	    FIELD_IGNORE_BLANKS_START, delimiter);
    511 	*end = field_boundary_tabbed(F, L, 1, F->f_options &
    512 	    FIELD_IGNORE_BLANKS_END, delimiter);
    513 }
    514 
    515 static ssize_t
    516 field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks,
    517     vchar_t delimiter)
    518 {
    519 	wchar_t *S = L->l_data.wp;
    520 	wchar_t *T = S;
    521 	wchar_t *eol = S + L->l_data_length;
    522 	ssize_t field = is_end ? F->f_end_field : F->f_start_field;
    523 	ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
    524 	ssize_t ret;
    525 
    526 	ASSERT(is_end || field > -1);
    527 
    528 	if (is_end && field == -1)
    529 		return (L->l_data_length);
    530 
    531 	while (field-- > 0) {
    532 		T = xwsnchr(T, delimiter.wc, eol - T);
    533 		if (T == NULL || T > eol)
    534 			return (L->l_data_length);
    535 
    536 		T++;
    537 	}
    538 
    539 	if ((!is_end || offset != 0) && is_blanks) {
    540 		while (W_IS_BLANK(*T))
    541 			T++;
    542 	}
    543 
    544 	if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
    545 		if (S[L->l_data_length - 1] == delimiter.wc) {
    546 			return (L->l_data_length - 1);
    547 		} else {
    548 			return (L->l_data_length);
    549 		}
    550 	}
    551 
    552 	if (is_end && offset == 0)
    553 		ret--;
    554 
    555 	return (ret);
    556 }
    557 
    558 static void
    559 field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start,
    560     ssize_t *end, vchar_t delimiter)
    561 {
    562 	ASSERT(F->f_start_field > -1);
    563 
    564 	*start = field_boundary_tabbed_wide(F, L, 0, F->f_options &
    565 	    FIELD_IGNORE_BLANKS_START, delimiter);
    566 	*end = field_boundary_tabbed_wide(F, L, 1, F->f_options &
    567 	    FIELD_IGNORE_BLANKS_END, delimiter);
    568 }
    569 
    570 /*ARGSUSED*/
    571 ssize_t
    572 field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter,
    573     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
    574 {
    575 	int j;
    576 	ssize_t	val;
    577 	char month_candidate[MAX_MON_LEN * MB_LEN_MAX];
    578 	ssize_t month_length = data_length;
    579 	ssize_t month_offset = data_offset;
    580 
    581 	if (sizeof (char) > L->l_collate_bufsize - coll_offset)
    582 		return (-1);
    583 
    584 	(void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX);
    585 
    586 
    587 	/*
    588 	 * The month field formally begins with the first non-blank character.
    589 	 */
    590 	while (IS_BLANK(*(L->l_data.sp + month_offset))) {
    591 		month_offset++;
    592 		month_length--;
    593 	}
    594 
    595 	for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
    596 		month_candidate[j] = toupper((L->l_data.sp + month_offset)[j]);
    597 
    598 	for (j = 0; j < MONTHS_IN_YEAR; j++) {
    599 		if (xstrneql(month_candidate, months[j], month_lengths[j])) {
    600 			*(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET;
    601 			return (1);
    602 		}
    603 	}
    604 
    605 	/*
    606 	 * no matching month; copy string into field.  required behaviour is
    607 	 * that "month-free" keys sort before month-sortable keys, so insert
    608 	 * a "will sort first" token.
    609 	 */
    610 	*(L->l_collate.sp + coll_offset) = '\0' + MO_NONE;
    611 
    612 	val = field_convert_alpha_simple(F, L, delimiter, data_offset,
    613 	    data_length, coll_offset + 1);
    614 
    615 	if (val < 0)
    616 		return (-1);
    617 	else
    618 		return (val + 1);
    619 }
    620 
    621 /*ARGSUSED*/
    622 ssize_t
    623 field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
    624     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
    625 {
    626 	ssize_t j;
    627 	ssize_t val;
    628 	wchar_t month_candidate[MAX_MON_LEN];
    629 	wchar_t *month;
    630 	wchar_t *buffer = L->l_collate.wp + coll_offset;
    631 	ssize_t month_length = data_length;
    632 	ssize_t month_offset = data_offset;
    633 
    634 	if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) <
    635 	    sizeof (wchar_t))
    636 		return (-1);
    637 
    638 	(void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t));
    639 
    640 
    641 	while (W_IS_BLANK(*(L->l_data.wp + month_offset))) {
    642 		month_offset++;
    643 		month_length--;
    644 	}
    645 
    646 	month = L->l_data.wp + month_offset;
    647 
    648 	for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
    649 		month_candidate[j] = towupper(month[j]);
    650 
    651 	for (j = 0; j < MONTHS_IN_YEAR; j++)
    652 		if (xwcsneql(month_candidate, w_months[j],
    653 		    w_month_lengths[j])) {
    654 			*buffer = L'\0' + j + MO_OFFSET;
    655 			return (1);
    656 		}
    657 
    658 	*buffer = L'\0' + MO_NONE;
    659 
    660 	val = field_convert_alpha_wide(F, L, delimiter, data_offset,
    661 	    data_length, coll_offset + sizeof (wchar_t));
    662 
    663 	if (val < 0)
    664 		return (-1);
    665 	else
    666 		return (val + 1);
    667 }
    668 
    669 /*
    670  * field_convert_alpha() always fails with return value -1 if the converted
    671  * string would cause l_collate_length to exceed l_collate_bufsize
    672  */
    673 /*ARGSUSED*/
    674 ssize_t
    675 field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter,
    676     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
    677 {
    678 	static char *compose;
    679 	static ssize_t compose_length;
    680 
    681 	ssize_t	clength = 0;
    682 	ssize_t	dlength;
    683 	ssize_t	i;
    684 
    685 	if (compose_length < (data_length + 1)) {
    686 		compose_length = data_length + 1;
    687 		compose = safe_realloc(compose, compose_length * sizeof (char));
    688 	}
    689 
    690 	for (i = data_offset; i < data_offset + data_length; i++) {
    691 		char t = (L->l_data.sp)[i];
    692 
    693 		if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) &&
    694 		    !isprint((uchar_t)t))
    695 			continue;
    696 
    697 		if ((F->f_options & FIELD_DICTIONARY_ORDER) &&
    698 		    !isalnum((uchar_t)t) && !isspace((uchar_t)t))
    699 			continue;
    700 
    701 		if (F->f_options & FIELD_FOLD_UPPERCASE)
    702 			t = toupper(t);
    703 
    704 		compose[clength++] = t;
    705 	}
    706 	compose[clength] = '\0';
    707 
    708 	if ((dlength = xfrm_ops->sx_len(compose, clength)) <
    709 	    L->l_collate_bufsize - coll_offset)
    710 		return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
    711 		    compose, dlength + 1));
    712 	else
    713 		return ((ssize_t)-1);
    714 }
    715 
    716 /*ARGSUSED*/
    717 ssize_t
    718 field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter,
    719     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
    720 {
    721 	static char *compose;
    722 	static ssize_t compose_length;
    723 
    724 	ssize_t	clength;
    725 	ssize_t	dlength;
    726 
    727 	if (compose_length < (data_length + 1)) {
    728 		compose_length = data_length + 1;
    729 		compose = safe_realloc(compose, compose_length * sizeof (char));
    730 	}
    731 
    732 	(void) memcpy(compose, L->l_data.sp + data_offset, data_length);
    733 	clength = data_length;
    734 	compose[clength] = '\0';
    735 
    736 	if ((dlength = xfrm_ops->sx_len(compose, clength)) <
    737 	    L->l_collate_bufsize - coll_offset)
    738 		return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
    739 		    compose, dlength + 1));
    740 	else
    741 		return ((ssize_t)-1);
    742 }
    743 
    744 /*ARGSUSED*/
    745 ssize_t
    746 field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
    747     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
    748 {
    749 	wchar_t	*compose = safe_realloc(NULL, (data_length + 1) *
    750 	    sizeof (wchar_t));
    751 	ssize_t	clength = 0;
    752 	ssize_t	dlength;
    753 	ssize_t	i;
    754 	ssize_t ret;
    755 
    756 	for (i = data_offset; i < data_offset + data_length; i++) {
    757 		wchar_t	t = (L->l_data.wp)[i];
    758 
    759 		if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && !iswprint(t))
    760 			continue;
    761 
    762 		if ((F->f_options & FIELD_DICTIONARY_ORDER) && !iswalnum(t) &&
    763 		    !iswspace(t))
    764 			continue;
    765 
    766 		if (F->f_options & FIELD_FOLD_UPPERCASE)
    767 			t = towupper(t);
    768 
    769 		compose[clength++] = t;
    770 	}
    771 	compose[clength] = L'\0';
    772 
    773 	dlength = wcsxfrm(NULL, compose, (size_t)0);
    774 	if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize -
    775 	    coll_offset * sizeof (wchar_t)) {
    776 		ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose,
    777 		    (size_t)dlength + 1);
    778 	} else {
    779 		ret = (ssize_t)-1;
    780 	}
    781 
    782 	safe_free(compose);
    783 
    784 	return (ret);
    785 }
    786 
    787 /*
    788  * field_convert_numeric() converts the given field into a collatable numerical
    789  * sequence.  The sequence is ordered as { log, integer, separator, fraction },
    790  * with an optional sentinel component at the sequence end.
    791  */
    792 /*ARGSUSED*/
    793 ssize_t
    794 field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter,
    795     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
    796 {
    797 	char *number;
    798 	char *buffer = L->l_collate.sp + coll_offset;
    799 	ssize_t length;
    800 
    801 	char sign = '2';
    802 	int log_ten;
    803 	char *digits = buffer + 1 + sizeof (int) / sizeof (char);
    804 	size_t j = 0;
    805 	size_t i;
    806 
    807 	int state = BEFORE_NUMBER;
    808 
    809 	number = L->l_data.sp + data_offset;
    810 	length = data_length;
    811 
    812 	/*
    813 	 * Eat leading blanks, if any.
    814 	 */
    815 	for (i = 0; i < length; i++)
    816 		if (!IS_BLANK(number[i]))
    817 			break;
    818 
    819 	/*
    820 	 * Test that there is sufficient size in the collation buffer for our
    821 	 * number.  In addition to the possible remaining characters in the
    822 	 * field, we also require space for the sign (char), logarithm (int),
    823 	 * separator (char), and as many as two string terminators (for reverse
    824 	 * sorts).
    825 	 */
    826 	if (((length - i) + 4 * sizeof (char) + sizeof (int)) >
    827 	    (L->l_collate_bufsize - coll_offset))
    828 		return ((ssize_t)-1);
    829 
    830 	/*
    831 	 * If negative, set sign.
    832 	 */
    833 	if (number[i] == '-') {
    834 		i++;
    835 		sign = '0';
    836 	}
    837 
    838 	/*
    839 	 * Scan integer part; eat leading zeros.
    840 	 */
    841 	for (; i < length; i++) {
    842 		if (IS_SEPARATOR(number[i]))
    843 			continue;
    844 
    845 		if (number[i] == '0' && !(state & IN_NUMBER))
    846 			continue;
    847 
    848 		if (!isdigit((uchar_t)number[i]))
    849 			break;
    850 
    851 		state |= IN_NUMBER;
    852 		if (sign == '0')
    853 			digits[j++] = '0' + '9' - number[i];
    854 		else
    855 			digits[j++] = number[i];
    856 	}
    857 
    858 	if (i < length && IS_DECIMAL(number[i])) {
    859 		/*
    860 		 * Integer part terminated by decimal.
    861 		 */
    862 		digits[j] = DECIMAL_CHAR;
    863 		log_ten = j++;
    864 
    865 		/*
    866 		 * Scan fractional part.
    867 		 */
    868 		for (++i; i < length; i++) {
    869 			if (IS_SEPARATOR(number[i]))
    870 				continue;
    871 
    872 			if (!isdigit((uchar_t)number[i]))
    873 				break;
    874 
    875 			if (number[i] != '0')
    876 				state |= IN_NUMBER;
    877 
    878 			if (sign == '0')
    879 				digits[j++] = '0' + '9' - number[i];
    880 			else
    881 				digits[j++] = number[i];
    882 		}
    883 
    884 		if (sign == '0')
    885 			digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
    886 	} else {
    887 		/*
    888 		 * Nondigit or end of string seen.
    889 		 */
    890 		log_ten = (int)j;
    891 		if (sign == '0')
    892 			digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
    893 		else
    894 			digits[j] = INTERFIELD_SEPARATOR;
    895 	}
    896 
    897 	if ((state & IN_NUMBER) == 0) {
    898 		/*
    899 		 * A non-zero number was not detected; treat as defined zero.
    900 		 */
    901 		sign = '1';
    902 		log_ten = 0;
    903 		digits[0] = '0';
    904 		j = 1;
    905 	}
    906 
    907 	/*
    908 	 * We subtract a constant from the log of negative values so that
    909 	 * they will correctly precede positive values with a zero logarithm.
    910 	 */
    911 	if (sign == '0') {
    912 		if (j != 0)
    913 			log_ten = -log_ten - 2;
    914 		else
    915 			/*
    916 			 * Special case for -0.
    917 			 */
    918 			log_ten = -1;
    919 	}
    920 
    921 	buffer[0] = sign;
    922 
    923 	/*
    924 	 * Place logarithm in big-endian form.
    925 	 */
    926 	for (i = 0; i < sizeof (int); i++)
    927 		buffer[i + 1] = (log_ten << (i * NBBY))
    928 		    >> ((sizeof (int) - 1) * NBBY);
    929 
    930 	if (j + sizeof (char) + sizeof (int) <
    931 	    L->l_collate_bufsize - coll_offset)
    932 		return (j + 1 + sizeof (int));
    933 	else
    934 		return ((ssize_t)-1);
    935 }
    936 
    937 /*ARGSUSED*/
    938 ssize_t
    939 field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
    940     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
    941 {
    942 	wchar_t *number;
    943 	wchar_t *buffer = L->l_collate.wp + coll_offset;
    944 	char *lbuffer;
    945 	ssize_t length;
    946 
    947 	wchar_t	sign = L'2';
    948 	int log_ten;
    949 	wchar_t	*digits = buffer + 1 + sizeof (int)/sizeof (wchar_t);
    950 	size_t j = 0;
    951 	size_t i;
    952 
    953 	int state = BEFORE_NUMBER;
    954 
    955 	number = L->l_data.wp + data_offset;
    956 	length = data_length;
    957 
    958 	for (i = 0; i < length; i++)
    959 		if (!W_IS_BLANK(number[i]))
    960 			break;
    961 
    962 	if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) +
    963 	    sizeof (int)) > (L->l_collate_bufsize - coll_offset))
    964 		return ((ssize_t)-1);
    965 
    966 	if (number[i] == L'-') {
    967 		i++;
    968 		sign = L'0';
    969 	}
    970 
    971 	for (; i < length; i++) {
    972 		if (W_IS_SEPARATOR(number[i]))
    973 			continue;
    974 
    975 		if (number[i] == L'0' && !(state & IN_NUMBER))
    976 			continue;
    977 
    978 		if (!iswdigit(number[i]))
    979 			break;
    980 
    981 		state |= IN_NUMBER;
    982 		if (sign == L'0')
    983 			digits[j++] = L'0' + L'9' - number[i];
    984 		else
    985 			digits[j++] = number[i];
    986 	}
    987 
    988 	if (i < length && W_IS_DECIMAL(number[i])) {
    989 		digits[j] = W_DECIMAL_CHAR;
    990 		log_ten = j++;
    991 
    992 		for (++i; i < length; i++) {
    993 			if (W_IS_SEPARATOR(number[i]))
    994 				continue;
    995 
    996 			if (!iswdigit(number[i]))
    997 				break;
    998 
    999 			if (number[i] != L'0')
   1000 				state |= IN_NUMBER;
   1001 
   1002 			if (sign == L'0')
   1003 				digits[j++] = L'0' + L'9' - number[i];
   1004 			else
   1005 				digits[j++] = number[i];
   1006 		}
   1007 
   1008 		if (sign == L'0')
   1009 			digits[j++] = (wchar_t)(WCHAR_MAX -
   1010 			    W_INTERFIELD_SEPARATOR);
   1011 	} else {
   1012 		log_ten = (int)j;
   1013 		if (sign == L'0')
   1014 			digits[j++] = (wchar_t)(WCHAR_MAX -
   1015 			    W_INTERFIELD_SEPARATOR);
   1016 		else
   1017 			digits[j] = W_INTERFIELD_SEPARATOR;
   1018 	}
   1019 
   1020 	if ((state & IN_NUMBER) == 0) {
   1021 		sign = L'1';
   1022 		log_ten = 0;
   1023 		digits[0] = L'0';
   1024 		j = 1;
   1025 	}
   1026 
   1027 	if (sign == L'0') {
   1028 		if (j != 0)
   1029 			log_ten = -log_ten - 2;
   1030 		else
   1031 			log_ten = -1;
   1032 	}
   1033 
   1034 	buffer[0] = sign;
   1035 	/*
   1036 	 * Place logarithm in big-endian form.
   1037 	 */
   1038 	lbuffer = (char *)(buffer + 1);
   1039 	for (i = 0; i < sizeof (int); i++)
   1040 		lbuffer[i] = (log_ten << (i * NBBY))
   1041 		    >> ((sizeof (int) - 1) * NBBY);
   1042 
   1043 	if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) <
   1044 	    L->l_collate_bufsize - coll_offset * sizeof (wchar_t))
   1045 		return (j + 1 + sizeof (int) / sizeof (wchar_t));
   1046 	else
   1047 		return ((ssize_t)-1);
   1048 }
   1049 
   1050 /*
   1051  * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour
   1052  * when coll_offset exceeds l_collate_bufsize.
   1053  */
   1054 ssize_t
   1055 field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator)
   1056 {
   1057 	ssize_t coll_offset = 0;
   1058 	ssize_t	start, end, distance;
   1059 	field_t *cur_fieldp = F;
   1060 
   1061 	while (cur_fieldp != NULL) {
   1062 		/*
   1063 		 * delimit field
   1064 		 */
   1065 		if (!field_separator.sc)
   1066 			field_delimit(cur_fieldp, L, &start, &end);
   1067 		else
   1068 			field_delimit_tabbed(cur_fieldp, L, &start, &end,
   1069 			    field_separator);
   1070 
   1071 		distance = 0;
   1072 		if (end - start > 0 ||
   1073 		    (end - start == 0 && F->f_species == NUMERIC)) {
   1074 			/*
   1075 			 * Convert field, appending to collated field of line
   1076 			 * record.
   1077 			 */
   1078 			distance = cur_fieldp->f_convert(cur_fieldp, L,
   1079 			    field_separator, start, end - start, coll_offset);
   1080 
   1081 			/*
   1082 			 * branch should execute comparatively rarely
   1083 			 */
   1084 			if (distance == -1) {
   1085 				if (flags & FCV_REALLOC) {
   1086 					ASSERT(L->l_collate_bufsize > 0);
   1087 					L->l_collate_bufsize *= 2;
   1088 					L->l_collate.sp =
   1089 					    safe_realloc(L->l_collate.sp,
   1090 					    L->l_collate_bufsize);
   1091 
   1092 					__S(stats_incr_convert_reallocs());
   1093 					continue;
   1094 				} else {
   1095 					/*
   1096 					 * FCV_FAIL has been set.
   1097 					 */
   1098 					return (-1);
   1099 				}
   1100 			}
   1101 		}
   1102 
   1103 		if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
   1104 			xstrninv(L->l_collate.sp, coll_offset, distance);
   1105 			*(L->l_collate.sp + coll_offset + distance) =
   1106 			    (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
   1107 			distance++;
   1108 		}
   1109 
   1110 		ASSERT(distance >= 0);
   1111 		coll_offset += distance;
   1112 		if (coll_offset >= L->l_collate_bufsize) {
   1113 			if (flags & FCV_REALLOC) {
   1114 				ASSERT(L->l_collate_bufsize > 0);
   1115 				L->l_collate_bufsize *= 2;
   1116 				L->l_collate.sp = safe_realloc(L->l_collate.sp,
   1117 				    L->l_collate_bufsize);
   1118 
   1119 				__S(stats_incr_convert_reallocs());
   1120 			} else {
   1121 				return (-1);
   1122 			}
   1123 		}
   1124 		*(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR;
   1125 		coll_offset++;
   1126 
   1127 		cur_fieldp = cur_fieldp->f_next;
   1128 	}
   1129 
   1130 	L->l_collate_length = coll_offset;
   1131 
   1132 	return (L->l_collate_length);
   1133 }
   1134 
   1135 ssize_t
   1136 field_convert_wide(field_t *F, line_rec_t *L, int flags,
   1137     vchar_t field_separator)
   1138 {
   1139 	ssize_t coll_offset = 0;
   1140 	ssize_t	start, end, distance;
   1141 	field_t *cur_fieldp = F;
   1142 
   1143 	while (cur_fieldp != NULL) {
   1144 		if (!field_separator.wc)
   1145 			field_delimit_wide(cur_fieldp, L, &start, &end);
   1146 		else
   1147 			field_delimit_tabbed_wide(cur_fieldp, L, &start, &end,
   1148 			    field_separator);
   1149 
   1150 		distance = 0;
   1151 		if (end - start > 0 ||
   1152 		    end - start == 0 && F->f_species == NUMERIC) {
   1153 			distance = cur_fieldp->f_convert(cur_fieldp, L,
   1154 			    field_separator, start, end - start, coll_offset);
   1155 
   1156 			if (distance == -1) {
   1157 				if (flags & FCV_REALLOC) {
   1158 					ASSERT(L->l_collate_bufsize > 0);
   1159 					L->l_collate_bufsize *= 2;
   1160 					L->l_collate.wp = safe_realloc(
   1161 					    L->l_collate.wp,
   1162 					    L->l_collate_bufsize);
   1163 
   1164 					__S(stats_incr_convert_reallocs());
   1165 					continue;
   1166 				} else {
   1167 					return (-1);
   1168 				}
   1169 			}
   1170 		}
   1171 
   1172 		if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
   1173 			xwcsninv(L->l_collate.wp, coll_offset, distance);
   1174 			*(L->l_collate.wp + coll_offset + distance) =
   1175 			    WCHAR_MAX - INTERFIELD_SEPARATOR;
   1176 			distance++;
   1177 		}
   1178 
   1179 		ASSERT(distance >= 0);
   1180 		coll_offset += distance;
   1181 		if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) {
   1182 			if (flags & FCV_REALLOC) {
   1183 				ASSERT(L->l_collate_bufsize > 0);
   1184 				L->l_collate_bufsize *= 2;
   1185 				L->l_collate.wp = safe_realloc(L->l_collate.wp,
   1186 				    L->l_collate_bufsize);
   1187 
   1188 				__S(stats_incr_convert_reallocs());
   1189 			} else {
   1190 				return (-1);
   1191 			}
   1192 		}
   1193 		*(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR;
   1194 		coll_offset++;
   1195 
   1196 		cur_fieldp = cur_fieldp->f_next;
   1197 	}
   1198 
   1199 	L->l_collate_length = coll_offset * sizeof (wchar_t);
   1200 #ifdef _LITTLE_ENDIAN
   1201 	xwcsntomsb(L->l_collate.wp, coll_offset);
   1202 #endif /* _LITTLE_ENDIAN */
   1203 
   1204 	return (L->l_collate_length);
   1205 }
   1206 
   1207 /*
   1208  * line_convert() and line_convert_wide() are called when the collation vector
   1209  * of a given line has been exhausted, and we are performing the final,
   1210  * full-line comparison required by the sort specification.  Because we do not
   1211  * have a guarantee that l_data is null-terminated, we create an explicitly
   1212  * null-terminated copy suitable for transformation to a collatable form for the
   1213  * current locale.
   1214  */
   1215 static void
   1216 line_convert(line_rec_t *L)
   1217 {
   1218 	static ssize_t bufsize;
   1219 	static char *buffer;
   1220 
   1221 	if (L->l_raw_collate.sp != NULL)
   1222 		return;
   1223 
   1224 	if (L->l_data_length + 1 > bufsize) {
   1225 		buffer = safe_realloc(buffer, L->l_data_length + 1);
   1226 		bufsize = L->l_data_length + 1;
   1227 	}
   1228 
   1229 	(void) strncpy(buffer, L->l_data.sp, L->l_data_length);
   1230 	buffer[L->l_data_length] = '\0';
   1231 
   1232 	L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp,
   1233 	    xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
   1234 	xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer,
   1235 	    xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
   1236 
   1237 	__S(stats_incr_line_conversions());
   1238 }
   1239 
   1240 static void
   1241 line_convert_wide(line_rec_t *L)
   1242 {
   1243 	static wchar_t *buffer;
   1244 	static ssize_t bufsize;
   1245 
   1246 	ssize_t dlength;
   1247 
   1248 	if (L->l_raw_collate.wp != NULL)
   1249 		return;
   1250 
   1251 	if (L->l_data_length + 1 > bufsize) {
   1252 		buffer = safe_realloc(buffer, (L->l_data_length + 1) *
   1253 		    sizeof (wchar_t));
   1254 		bufsize = L->l_data_length + 1;
   1255 	}
   1256 
   1257 	(void) wcsncpy(buffer, L->l_data.wp, L->l_data_length);
   1258 	buffer[L->l_data_length] = L'\0';
   1259 
   1260 	dlength = wcsxfrm(NULL, buffer, 0) + 1;
   1261 	L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp, dlength *
   1262 	    sizeof (wchar_t));
   1263 	(void) wcsxfrm(L->l_raw_collate.wp, buffer, dlength);
   1264 
   1265 	__S(stats_incr_line_conversions());
   1266 }
   1267 
   1268 /*
   1269  * Our convention for collation is
   1270  *
   1271  *	A > B  => r > 0,
   1272  *	A == B => r = 0,
   1273  *	A < B  => r < 0
   1274  *
   1275  * This convention is consistent with the definition of memcmp(), strcmp(), and
   1276  * strncmp() in the C locale.  collated() and collated_wide() have two optional
   1277  * behaviours, which can be activated by setting the appropriate values in
   1278  * coll_flag:  COLL_UNIQUE, which returns 0 if the l_collate fields of the line
   1279  * records being compared are identical; COLL_DATA_ONLY, which ignores the
   1280  * l_collate field for the current comparison; and COLL_REVERSE, which flips the
   1281  * result for comparisons that fall through to an actual data comparison (since
   1282  * the collated vector should already reflect reverse ordering from field
   1283  * conversion).
   1284  */
   1285 int
   1286 collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
   1287 {
   1288 	ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
   1289 	int r;
   1290 	int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
   1291 	    INT_SIGN_PASS_MASK;
   1292 	ssize_t la, lb;
   1293 
   1294 	if (!(coll_flag & COLL_DATA_ONLY)) {
   1295 		if (ml > 0) {
   1296 			r = memcmp(A->l_collate.sp + depth,
   1297 			    B->l_collate.sp + depth, ml);
   1298 
   1299 			if (r)
   1300 				return (r);
   1301 		}
   1302 
   1303 		if (A->l_collate_length < B->l_collate_length)
   1304 			return (-1);
   1305 
   1306 		if (A->l_collate_length > B->l_collate_length)
   1307 			return (1);
   1308 	}
   1309 
   1310 	/*
   1311 	 * This is where we cut out, if we know that the current sort is over
   1312 	 * the entire line.
   1313 	 */
   1314 	if (coll_flag & COLL_UNIQUE)
   1315 		return (0);
   1316 
   1317 	line_convert(A);
   1318 	line_convert(B);
   1319 
   1320 	la = strlen(A->l_raw_collate.sp);
   1321 	lb = strlen(B->l_raw_collate.sp);
   1322 
   1323 	r = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp, MIN(la, lb));
   1324 
   1325 	if (r)
   1326 		return (r ^ mask);
   1327 
   1328 	if (la < lb)
   1329 		return (-1 ^ mask);
   1330 
   1331 	if (la > lb)
   1332 		return (1 ^ mask);
   1333 
   1334 	return (0);
   1335 }
   1336 
   1337 int
   1338 collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
   1339 {
   1340 	ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
   1341 	int r;
   1342 	int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
   1343 	    INT_SIGN_PASS_MASK;
   1344 	ssize_t la, lb;
   1345 
   1346 	if (!(coll_flag & COLL_DATA_ONLY)) {
   1347 		if (ml > 0) {
   1348 			r = memcmp(A->l_collate.sp + depth,
   1349 			    B->l_collate.sp + depth, ml);
   1350 
   1351 			if (r)
   1352 				return (r);
   1353 		}
   1354 		if (A->l_collate_length < B->l_collate_length)
   1355 			return (-1);
   1356 
   1357 		if (A->l_collate_length > B->l_collate_length)
   1358 			return (1);
   1359 	}
   1360 
   1361 	if (coll_flag & COLL_UNIQUE)
   1362 		return (0);
   1363 
   1364 	line_convert_wide(A);
   1365 	line_convert_wide(B);
   1366 
   1367 	la = wcslen(A->l_raw_collate.wp);
   1368 	lb = wcslen(B->l_raw_collate.wp);
   1369 
   1370 	r = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp,
   1371 	    (size_t)MIN(la, lb));
   1372 
   1373 	if (r)
   1374 		return (r ^ mask);
   1375 
   1376 	if (la < lb)
   1377 		return (-1 ^ mask);
   1378 
   1379 	if (la > lb)
   1380 		return (1 ^ mask);
   1381 
   1382 	return (0);
   1383 }
   1384