Home | History | Annotate | Download | only in gen
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 	.file	"strcasecmp.s"
     28 
     29 /*
     30  * The strcasecmp() function is a case insensitive versions of strcmp().
     31  * It assumes the ASCII character set and ignores differences in case
     32  * when comparing lower and upper case characters. In other words, it
     33  * behaves as if both strings had been converted to lower case using
     34  * tolower() in the "C" locale on each byte, and the results had then
     35  * been compared using strcmp().
     36  *
     37  * The assembly code below is an optimized version of the following C
     38  * reference:
     39  *
     40  * static const char charmap[] = {
     41  *	'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
     42  *	'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
     43  *	'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
     44  *	'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
     45  *	'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
     46  *	'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
     47  *	'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
     48  *	'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
     49  *	'\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
     50  *	'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
     51  *	'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
     52  *	'\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
     53  *	'\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
     54  *	'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
     55  *	'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
     56  *	'\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
     57  *	'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
     58  *	'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
     59  *	'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
     60  *	'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
     61  *	'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
     62  *	'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
     63  *	'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
     64  *	'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
     65  *	'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
     66  *	'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
     67  *	'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
     68  *	'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
     69  *	'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
     70  *	'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
     71  *	'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
     72  *	'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
     73  * };
     74  *
     75  * int
     76  * strcasecmp(const char *s1, const char *s2)
     77  * {
     78  *	const unsigned char	*cm = (const unsigned char *)charmap;
     79  *	const unsigned char	*us1 = (const unsigned char *)s1;
     80  *	const unsigned char	*us2 = (const unsigned char *)s2;
     81  *
     82  *	while (cm[*us1] == cm[*us2++])
     83  *		if (*us1++ == '\0')
     84  *			return (0);
     85  *	return (cm[*us1] - cm[*(us2 - 1)]);
     86  * }
     87  *
     88  * The following algorithm, from a 1987 news posting by Alan Mycroft, is
     89  * used for finding null bytes in a word:
     90  *
     91  * #define has_null(word) ((word - 0x01010101) & (~word & 0x80808080))
     92  *
     93  * The following algorithm is used for a wordwise tolower() operation:
     94  *
     95  * unsigned int
     96  * parallel_tolower (unsigned int x)
     97  * {
     98  *	unsigned int p;
     99  *	unsigned int q;
    100  *
    101  *	unsigned int m1 = 0x80808080;
    102  *	unsigned int m2 = 0x3f3f3f3f;
    103  *	unsigned int m3 = 0x25252525;
    104  *
    105  *	q = x & ~m1;// newb = byte & 0x7F
    106  *	p = q + m2; // newb > 0x5A --> MSB set
    107  *	q = q + m3; // newb < 0x41 --> MSB clear
    108  *	p = p & ~q; // newb > 0x40 && newb < 0x5B --> MSB set
    109  *	q = m1 & ~x;//  byte < 0x80 --> 0x80
    110  *	q = p & q;  // newb > 0x40 && newb < 0x5B && byte < 0x80 -> 0x80,else 0
    111  *	q = q >> 2; // newb > 0x40 && newb < 0x5B && byte < 0x80 -> 0x20,else 0
    112  *	return (x + q); // translate uppercase characters to lowercase
    113  * }
    114  *
    115  * Both algorithms have been tested exhaustively for all possible 2^32 inputs.
    116  */
    117 
    118 #include <sys/asm_linkage.h>
    119 
    120 	! The first part of this algorithm walks through the beginning of
    121 	! both strings a byte at a time until the source ptr is  aligned to
    122 	! a word boundary. During these steps, the bytes are translated to
    123 	! lower-case if they are upper-case, and are checked against
    124 	! the source string.
    125 
    126 	ENTRY(strcasecmp)
    127 
    128 	.align 32
    129 
    130 	save	%sp, -SA(WINDOWSIZE), %sp
    131 	subcc	%i0, %i1, %i2		! s1 == s2 ?
    132 	bz,pn	%ncc, .stringsequal	! yup, done, strings equal
    133 	andcc	%i0, 3, %i3		! s1 word-aligned ?
    134 	bz,pn	%ncc, .s1aligned1	! yup
    135 	sethi	%hi(0x80808080), %i4	! start loading Mycroft's magic1
    136 
    137 	ldub	[%i1 + %i2], %i0	! s1[0]
    138 	ldub	[%i1], %g1		! s2[0]
    139 	sub	%i0, 'A', %l0		! transform for faster uppercase check
    140 	sub	%g1, 'A', %l1		! transform for faster uppercase check
    141 	cmp	%l0, ('Z' - 'A')	! s1[0] uppercase?
    142 	bleu,a	.noxlate11		! yes
    143 	add	%i0, ('a' - 'A'), %i0	! s1[0] = tolower(s1[0])
    144 .noxlate11:
    145 	cmp	%l1, ('Z' - 'A')	! s2[0] uppercase?
    146 	bleu,a	.noxlate12		! yes
    147 	add	%g1, ('a' - 'A'), %g1	! s2[0] = tolower(s2[0])
    148 .noxlate12:
    149 	subcc	%i0, %g1, %i0		! tolower(s1[0]) != tolower(s2[0]) ?
    150 	bne,pn	%ncc, .done		! yup, done
    151 	inc	%i1			! s1++, s2++
    152 	addcc	%i0, %g1, %i0		! s1[0] == 0 ?
    153 	bz,pn	%ncc, .done		! yup, done, strings equal
    154 	cmp	%i3, 3			! s1 aligned now?
    155 	bz	%ncc, .s1aligned2	! yup
    156 	sethi	%hi(0x01010101), %i5	! start loading Mycroft's magic2
    157 
    158 	ldub	[%i1 + %i2], %i0	! s1[1]
    159 	ldub	[%i1], %g1		! s2[1]
    160 	sub	%i0, 'A', %l0		! transform for faster uppercase check
    161 	sub	%g1, 'A', %l1		! transform for faster uppercase check
    162 	cmp	%l0, ('Z' - 'A')	! s1[1] uppercase?
    163 	bleu,a	.noxlate21		! yes
    164 	add	%i0, ('a' - 'A'), %i0	! s1[1] = tolower(s1[1])
    165 .noxlate21:
    166 	cmp	%l1, ('Z' - 'A')	! s2[1] uppercase?
    167 	bleu,a	.noxlate22		! yes
    168 	add	%g1, ('a' - 'A'), %g1	! s2[1] = tolower(s2[1])
    169 .noxlate22:
    170 	subcc	%i0, %g1, %i0		! tolower(s1[1]) != tolower(s2[1]) ?
    171 	bne,pn	%ncc, .done		! yup, done
    172 	inc	%i1			! s1++, s2++
    173 	addcc	%i0, %g1, %i0		! s1[1] == 0 ?
    174 	bz,pn	%ncc, .done		! yup, done, strings equal
    175 	cmp	%i3, 2			! s1 aligned now?
    176 	bz	%ncc, .s1aligned3	! yup
    177 	or	%i4, %lo(0x80808080),%i4! finish loading Mycroft's magic1
    178 
    179 	ldub	[%i1 + %i2], %i0	! s1[2]
    180 	ldub	[%i1], %g1		! s2[2]
    181 	sub	%i0, 'A', %l0		! transform for faster uppercase check
    182 	sub	%g1, 'A', %l1		! transform for faster uppercase check
    183 	cmp	%l0, ('Z' - 'A')	! s1[2] uppercase?
    184 	bleu,a	.noxlate31		! yes
    185 	add	%i0, ('a' - 'A'), %i0	! s1[2] = tolower(s1[2])
    186 .noxlate31:
    187 	cmp	%l1, ('Z' - 'A')	! s2[2] uppercase?
    188 	bleu,a	.noxlate32		! yes
    189 	add	%g1, ('a' - 'A'), %g1	! s2[2] = tolower(s2[2])
    190 .noxlate32:
    191 	subcc	%i0, %g1, %i0		! tolower(s1[2]) != tolower(s2[2]) ?
    192 	bne,pn	%ncc, .done		! yup, done
    193 	inc	%i1			! s1++, s2++
    194 	addcc	%i0, %g1, %i0		! s1[2] == 0 ?
    195 	bz,pn	%ncc, .done		! yup, done, strings equal
    196 	or	%i5, %lo(0x01010101),%i5! finish loading Mycroft's magic2
    197 	ba	.s1aligned4		! s1 aligned now
    198 	andcc	%i1, 3, %i3		! s2 word-aligned ?
    199 
    200 	! Here, we initialize our checks for a zero byte and decide
    201 	! whether or not we can optimize further if we're fortunate
    202 	! enough to have a word aligned desintation
    203 
    204 .s1aligned1:
    205 	sethi	%hi(0x01010101), %i5	! start loading Mycroft's magic2
    206 .s1aligned2:
    207 	or	%i4, %lo(0x80808080),%i4! finish loading Mycroft's magic1
    208 .s1aligned3:
    209 	or	%i5, %lo(0x01010101),%i5! finish loading Mycroft's magic2
    210 	andcc	%i1, 3, %i3		! s2 word aligned ?
    211 .s1aligned4:
    212 	sethi	%hi(0x3f3f3f3f), %l2	! load m2 for parallel tolower()
    213 	sethi	%hi(0x25252525), %l3	! load m3 for parallel tolower()
    214 	or 	%l2, %lo(0x3f3f3f3f),%l2! finish loading m2
    215 	bz	.word4			! yup, s2 word-aligned
    216 	or 	%l3, %lo(0x25252525),%l3! finish loading m3
    217 
    218 	add	%i2, %i3, %i2		! start adjusting offset s1-s2
    219 	sll     %i3, 3, %l6    		! shift factor for left shifts
    220 	andn	%i1, 3, %i1		! round s1 pointer down to next word
    221 	sub	%g0, %l6, %l7		! shift factor for right shifts
    222 	orn	%i3, %g0, %i3		! generate all ones
    223 	lduw	[%i1], %i0		! new lower word from s2
    224 	srl	%i3, %l6, %i3		! mask for fixing up bytes
    225 	sll	%i0, %l6, %g1		! partial unaligned word from s2
    226 	orn	%i0, %i3, %i0		! force start bytes to non-zero
    227 	nop				! pad to align loop to 16-byte boundary
    228 	nop				! pad to align loop to 16-byte boundary
    229 
    230 	! This is the comparision procedure used if the destination is not
    231 	! word aligned, if it is, we use word4 & cmp4
    232 
    233 .cmp:
    234 	andn	%i4, %i0, %l4		! ~word & 0x80808080
    235 	sub	%i0, %i5, %l5		! word - 0x01010101
    236 	andcc	%l5, %l4, %g0		! (word - 0x01010101) & ~word & 0x80808080
    237 	bz,a,pt	%ncc, .doload		! null byte in previous aligned s2 word
    238 	lduw	[%i1 + 4], %i0		! load next aligned word from s2
    239 .doload:
    240 	srl	%i0, %l7, %i3		! byte 1 from new aligned word from s2
    241 	or	%g1, %i3, %g1		! merge to get unaligned word from s2
    242 	lduw	[%i1 + %i2], %i3	! x1 = word from s1
    243 	andn	%i3, %i4, %l0		! q1 = x1 & ~m1
    244 	andn	%g1, %i4, %l4		! q2 = x2 & ~m1
    245 	add	%l0, %l2, %l1		! p1 = q1 + m2
    246 	add	%l4, %l2, %l5		! p2 = q2 + m2
    247 	add	%l0, %l3, %l0		! q1 = q1 + m3
    248 	add	%l4, %l3, %l4		! q2 = q2 + m3
    249 	andn	%l1, %l0, %l1		! p1 = p1 & ~q1
    250 	andn	%l5, %l4, %l5		! p2 = p2 & ~q2
    251 	andn	%i4, %i3, %l0		! q1 = m1 & ~x1
    252 	andn	%i4, %g1, %l4		! q2 = m1 & ~x2
    253 	and	%l0, %l1, %l0		! q1 = p1 & q1
    254 	and	%l4, %l5, %l4		! q2 = p2 & q2
    255 	srl	%l0, 2, %l0		! q1 = q1 >> 2
    256 	srl	%l4, 2, %l4		! q2 = q2 >> 2
    257 	add	%l0, %i3, %i3		! lowercase word from s1
    258 	add	%l4, %g1, %g1		! lowercase word from s2
    259 	cmp	%i3, %g1		! tolower(*s1) != tolower(*s2) ?
    260 	bne	%icc, .wordsdiffer	! yup, now find byte that is different
    261 	add	%i1, 4, %i1		! s1+=4, s2+=4
    262 	andn	%i4, %i3, %l4		! ~word & 0x80808080
    263 	sub	%i3, %i5, %l5		! word - 0x01010101
    264 	andcc	%l5, %l4, %g0		! (word - 0x01010101) & ~word & 0x80808080
    265 	bz,pt	%ncc, .cmp		! no null-byte in s1 yet
    266 	sll	%i0, %l6, %g1		! partial unaligned word from s2
    267 
    268 	! words are equal but the end of s1 has been reached
    269 	! this means the strings must be equal
    270 .stringsequal:
    271 	ret				! return
    272 	restore	%g0, %g0, %o0		! return 0, i.e. strings are equal
    273 	nop				! pad
    274 
    275 
    276 	! we have a word aligned source and destination!  This means
    277 	! things get to go fast!
    278 
    279 .word4:
    280 	lduw	[%i1 + %i2], %i3	! x1 = word from s1
    281 
    282 .cmp4:
    283 	andn	%i3, %i4, %l0		! q1 = x1 & ~m1
    284 	lduw	[%i1], %g1		! x2 = word from s2
    285 	andn	%g1, %i4, %l4		! q2 = x2 & ~m1
    286 	add	%l0, %l2, %l1		! p1 = q1 + m2
    287 	add	%l4, %l2, %l5		! p2 = q2 + m2
    288 	add	%l0, %l3, %l0		! q1 = q1 + m3
    289 	add	%l4, %l3, %l4		! q2 = q2 + m3
    290 	andn	%l1, %l0, %l1		! p1 = p1 & ~q1
    291 	andn	%l5, %l4, %l5		! p2 = p2 & ~q2
    292 	andn	%i4, %i3, %l0		! q1 = m1 & ~x1
    293 	andn	%i4, %g1, %l4		! q2 = m1 & ~x2
    294 	and	%l0, %l1, %l0		! q1 = p1 & q1
    295 	and	%l4, %l5, %l4		! q2 = p2 & q2
    296 	srl	%l0, 2, %l0		! q1 = q1 >> 2
    297 	srl	%l4, 2, %l4		! q2 = q2 >> 2
    298 	add	%l0, %i3, %i3		! lowercase word from s1
    299 	add	%l4, %g1, %g1		! lowercase word from s2
    300 	cmp	%i3, %g1		! tolower(*s1) != tolower(*s2) ?
    301 	bne,pn	%icc, .wordsdiffer	! yup, now find mismatching character
    302 	add	%i1, 4, %i1		! s1+=4, s2+=4
    303 	andn	%i4, %i3, %l4		! ~word & 0x80808080
    304 	sub	%i3, %i5, %l5		! word - 0x01010101
    305 	andcc	%l5, %l4, %g0		! (word - 0x01010101) & ~word & 0x80808080
    306 	bz,a,pt	%icc, .cmp4		! no null-byte in s1 yet
    307 	lduw	[%i1 + %i2], %i3	! load word from s1
    308 
    309 	! words are equal but the end of s1 has been reached
    310 	! this means the strings must be equal
    311 .stringsequal4:
    312 	ret				! return
    313 	restore	%g0, %g0, %o0		! return 0, i.e. strings are equal
    314 
    315 .wordsdiffer:
    316 	srl	%g1, 24, %i2		! first byte of mismatching word in s2
    317 	srl	%i3, 24, %i1		! first byte of mismatching word in s1
    318 	subcc	%i1, %i2, %i0		! *s1-*s2
    319 	bnz,pn	%ncc, .done		! bytes differ, return difference
    320 	srl	%g1, 16, %i2		! second byte of mismatching word in s2
    321 	andcc	%i1, 0xff, %i0		! *s1 == 0 ?
    322 	bz,pn	%ncc, .done		! yup
    323 
    324 	! we know byte 1 is equal, so can compare bytes 1,2 as a group
    325 
    326 	srl	%i3, 16, %i1		! second byte of mismatching word in s1
    327 	subcc	%i1, %i2, %i0		! *s1-*s2
    328 	bnz,pn	%ncc, .done		! bytes differ, return difference
    329 	srl	%g1, 8, %i2		! third byte of mismatching word in s2
    330 	andcc	%i1, 0xff, %i0		! *s1 == 0 ?
    331 	bz,pn	%ncc, .done		! yup
    332 
    333 	! we know bytes 1, 2 are equal, so can compare bytes 1,2,3 as a group
    334 
    335 	srl	%i3, 8, %i1		! third byte of mismatching word in s1
    336 	subcc	%i1, %i2, %i0		! *s1-*s2
    337 	bnz,pn	%ncc, .done		! bytes differ, return difference
    338 	andcc	%i1, 0xff, %g0		! *s1 == 0 ?
    339 	bz,pn	%ncc, .stringsequal	! yup
    340 
    341 	! we know bytes 1,2,3 are equal, so can compare bytes 1,2,3,4 as group
    342 
    343 	subcc	%i3, %g1, %i0		! *s1-*s2
    344 	bz,a	.done			! bytes differ, return difference
    345 	andcc	%i3, 0xff, %i0		! *s1 == 0 ?
    346 
    347 .done:
    348 	ret				! return
    349 	restore	%i0, %g0, %o0		! return tolower(*s1) - tolower(*s2)
    350 
    351 	SET_SIZE(strcasecmp)
    352