Home | History | Annotate | Download | only in gen
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright (c) 2009, Intel Corporation
     24  * All rights reserved.
     25  */
     26 
     27 /*
     28  *	str[n]cmp - compare the characters of two strings
     29  */
     30 
     31 #include "SYS.h"
     32 #include "proc64_id.h"
     33 
     34 #define LABEL(s) .strcmp/**/s
     35 /* LABEL(s) above expands to the label name .strcmp<s>; the empty comment glues the two parts together (presumably relies on traditional cpp pasting - confirm). */
     36 #ifdef USE_AS_STRNCMP
     37 	/*
        	 * UPDATE_STRNCMP_COUNTER charges the bytes consumed by the first,
        	 * partial 16-byte block against the strncmp counter kept in %r11:
        	 *	%r9 = %r11 - (16 - %rcx)
        	 * where %rcx is the block offset that was used for the "shr %cl"
        	 * adjustment at the point of use. %r9 is clobbered. On fall-through
        	 * %r11 holds the new, non-zero count.
        	 *
     38 	 * Since the counter, %r11, is unsigned, we branch to strcmp_exitz
     39 	 * if the new counter > the old one or is 0.
        	 * (new > old means the subtraction wrapped, i.e. the limit ended
        	 * inside the bytes that were just found equal.)
     40 	 */
     41 #define UPDATE_STRNCMP_COUNTER				\
     42 	/* calculate left number to compare */		\
     43 	lea	-16(%rcx, %r11), %r9;			\
     44 	cmp	%r9, %r11;				\
     45 	jb	LABEL(strcmp_exitz);			\
     46 	test	%r9, %r9;				\
     47 	je	LABEL(strcmp_exitz);			\
     48 	mov	%r9, %r11
     49 #else
        	/* strcmp keeps no byte counter, so the macro expands to nothing */
     50 #define UPDATE_STRNCMP_COUNTER
     51 #endif
     52 
     53 	/*
     54 	 * This implementation uses SSE to compare up to 16 bytes at a time.
        	 *
        	 * Entry: %rdi and %rsi point at the two strings. For strncmp, %rdx
        	 * is the byte limit; it is copied to %r11, which serves as the
        	 * remaining-bytes counter from here on.
        	 *
        	 * Fast path: if a 16-byte load from either pointer stays inside its
        	 * 64-byte cache line (line offset <= 0x30), the first 16 bytes of
        	 * both strings are loaded unaligned and compared at once. When they
        	 * are equal and hold no null char, both pointers advance by 16 and
        	 * execution falls through into LABEL(crosscache).
     55 	 */
     56 #ifdef USE_AS_STRNCMP
     57 	ENTRY(strncmp)
     58 	test	%rdx, %rdx
     59 	je	LABEL(strcmp_exitz)	/* limit of 0: nothing to compare */
     60 	mov	%rdx, %r11		/* r11 = bytes left to compare */
     61 #else
     62 	ENTRY(strcmp)			/* (const char *, const char *) */
     63 #endif
     64 	mov	%esi, %ecx
     65 	mov	%edi, %eax
     66 	and	$0x3f, %rcx		/* rsi alignment in cache line */
     67 	and	$0x3f, %rax		/* rdi alignment in cache line */
     68 	cmp	$0x30, %ecx
     69 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
     70 	cmp	$0x30, %eax
     71 	ja	LABEL(crosscache)	/* rdi: 16-byte load will cross cache line */
     72 	movlpd	(%rdi), %xmm1		/* unaligned 16-byte loads, done as */
     73 	movlpd	(%rsi), %xmm2		/* two 8-byte halves per string */
     74 	movhpd	8(%rdi), %xmm1
     75 	movhpd	8(%rsi), %xmm2
     76 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
     77 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
     78 	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
     79 	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
     80 	pmovmskb %xmm1, %edx		/* one bit per byte: equal and not null */
     81 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
     82 	jnz	LABEL(less16bytes)	/* If not, found mismatch or null char */
     83 #ifdef USE_AS_STRNCMP
     84 	sub	$16, %r11
     85 	jbe	LABEL(strcmp_exitz)	/* limit reached: finish comparison */
     86 #endif
     87 	add	$16, %rsi		/* prepare to search next 16 bytes */
     88 	add	$16, %rdi		/* prepare to search next 16 bytes */
     89 
     90 	/*
     91 	 * Determine rdi and rsi string offsets from 16-byte alignment.
     92 	 * Use relative offset difference between the two to determine which case
     93 	 * below to use.
        	 *
        	 * Both pointers are rounded down to a 16-byte boundary. If the rdi
        	 * offset is the larger one, the offsets and the pointers are swapped
        	 * so that %ecx/%rsi always describe the string with the larger
        	 * offset, and %r8d is set to 0xffff to record the swap (it is 0
        	 * otherwise). Equal offsets go to ashr_0; otherwise the difference
        	 * %ecx - %eax (1..15) indexes LABEL(unaligned_table), which is
        	 * defined later in the file.
     94 	 */
     95 	.p2align 4
     96 LABEL(crosscache):
     97 	and	$0xfffffffffffffff0, %rsi	/* force %rsi to be 16 byte aligned */
     98 	and	$0xfffffffffffffff0, %rdi	/* force %rdi to be 16 byte aligned */
     99 	mov	$0xffff, %edx			/* for equivalent offset */
    100 	xor	%r8d, %r8d
    101 	and	$0xf, %ecx			/* offset of rsi */
    102 	and	$0xf, %eax			/* offset of rdi */
    103 	cmp	%eax, %ecx
    104 	je	LABEL(ashr_0)			/* both strings have the same alignment */
    105 	ja	LABEL(bigger)			/* rsi offset already the larger one */
    106 	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
    107 	xchg	%ecx, %eax
    108 	xchg	%rsi, %rdi
    109 LABEL(bigger):
    110 	mov	%rcx, %r9
    111 	sub	%rax, %r9			/* r9 = offset difference */
    112 	lea	LABEL(unaligned_table)(%rip), %r10
    113 	movslq	(%r10, %r9, 4), %r9		/* 32-bit entry, added to table base below */
    114 	lea	(%r10, %r9), %r10
    115 	jmp	*%r10				/* jump to corresponding case */
    116 
    117 /*
    118  * ashr_0 handles the following cases:
    119  * 	str1 offset = str2 offset
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the common offset
         * of both strings inside their first block and %edx is 0xffff. The
         * first block is compared with the bytes in front of the string start
         * shifted out of both masks; after that the loop compares whole
         * aligned blocks, indexed by %rcx.
    120  */
    121 	.p2align 4
    122 LABEL(ashr_0):
    123 	movdqa	(%rsi), %xmm1
    124 	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
    125 	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
    126 	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
    127 	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
    128 	pmovmskb %xmm1, %r9d
    129 	shr	%cl, %edx			/* adjust 0xffff for offset */
    130 	shr	%cl, %r9d			/* adjust for 16-byte offset */
    131 	sub	%r9d, %edx
    132 	/*
    133 	 * edx equals r9d only if every byte from the string start (offset
    134 	 * rcx) to the end of the block is equal and no null char was seen.
    135 	 */
    136 	jne	LABEL(less32bytes)		/* mismatch or null char */
    137 	UPDATE_STRNCMP_COUNTER
    138 	mov	$16, %rcx			/* index for loads */
    139 	mov	$16, %r9			/* NOTE(review): presumably read by the exit code - confirm */
    140 	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */
    141 
    142 	/*
    143 	 * Now both strings are aligned at 16-byte boundary. Loop over strings
    144 	 * checking 32-bytes per iteration.
        	 * %xmm0 is still all-zero after each passing check: it only picks
        	 * up non-zero bytes when a null char is found, and that case leaves
        	 * the loop through LABEL(exit).
    145 	 */
    146 	.p2align 4
    147 LABEL(loop_ashr_0):
    148 	movdqa	(%rsi, %rcx), %xmm1
    149 	movdqa	(%rdi, %rcx), %xmm2
    150 
    151 	pcmpeqb	%xmm1, %xmm0
    152 	pcmpeqb	%xmm2, %xmm1
    153 	psubb	%xmm0, %xmm1
    154 	pmovmskb %xmm1, %edx
    155 	sub	$0xffff, %edx
    156 	jnz	LABEL(exit)		/* mismatch or null char seen */
    157 
    158 #ifdef USE_AS_STRNCMP
    159 	sub	$16, %r11
    160 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    161 #endif
    162 	add	$16, %rcx
    163 	movdqa	(%rsi, %rcx), %xmm1
    164 	movdqa	(%rdi, %rcx), %xmm2
    165 
    166 	pcmpeqb	%xmm1, %xmm0
    167 	pcmpeqb	%xmm2, %xmm1
    168 	psubb	%xmm0, %xmm1
    169 	pmovmskb %xmm1, %edx
    170 	sub	$0xffff, %edx
    171 	jnz	LABEL(exit)		/* mismatch or null char seen */
    172 #ifdef USE_AS_STRNCMP
    173 	sub	$16, %r11
    174 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    175 #endif
    176 	add	$16, %rcx
    177 	jmp	LABEL(loop_ashr_0)
    178 
    179 /*
    180  * ashr_1 handles the following cases:
    181  * 	abs(str1 offset - str2 offset) = 15
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the larger of
         * the two string offsets (it belongs to %rsi) and %edx is 0xffff.
         * Every aligned %rsi block is compared against 16 bytes of the other
         * string rebuilt from two aligned %rdi blocks: the previous block
         * shifted right by 1 byte or-ed with the current block shifted left
         * by 15 bytes.
    182  */
    183 	.p2align 4
    184 LABEL(ashr_1):
    185 	pxor	%xmm0, %xmm0
    186 	movdqa	(%rdi), %xmm2
    187 	movdqa	(%rsi), %xmm1
    188 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
    189 	pslldq	$15, %xmm2		/* shift first string to align with second */
    190 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
    191 	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
    192 	pmovmskb %xmm2, %r9d
    193 	shr	%cl, %edx		/* adjust 0xffff for offset */
    194 	shr	%cl, %r9d		/* adjust for 16-byte offset */
    195 	sub	%r9d, %edx
    196 	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
    197 	movdqa	(%rdi), %xmm3		/* unshifted rdi block, merged below */
    198 	UPDATE_STRNCMP_COUNTER
    199 
    200 	pxor	%xmm0, %xmm0
    201 	mov	$16, %rcx		/* index for loads */
    202 	mov	$1, %r9d		/* rdi bytes already examined. Used in exit code */
    203 	/*
    204 	 * Setup %r10 value allows us to detect crossing a page boundary.
    205 	 * When %r10 goes positive we are crossing a page boundary and
    206 	 * need to do a nibble.
    207 	 */
    208 	lea	1(%rdi), %r10
    209 	and	$0xfff, %r10		/* offset into 4K page */
    210 	sub	$0x1000, %r10		/* subtract 4K pagesize */
    211 	movdqa	%xmm3, %xmm4
    212 
    213 	.p2align 4
    214 LABEL(loop_ashr_1):
    215 	add	$16, %r10
    216 	jg	LABEL(nibble_ashr_1)	/* cross page boundary */
    217 
    218 LABEL(gobble_ashr_1):
    219 	movdqa	(%rsi, %rcx), %xmm1
    220 	movdqa	(%rdi, %rcx), %xmm2
    221 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
    222 
    223 	psrldq	$1, %xmm3		/* tail of previous rdi block */
    224 	pslldq	$15, %xmm2		/* head of current rdi block */
    225 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    226 
    227 	pcmpeqb	%xmm1, %xmm0
    228 	pcmpeqb	%xmm2, %xmm1
    229 	psubb	%xmm0, %xmm1
    230 	pmovmskb %xmm1, %edx
    231 	sub	$0xffff, %edx
    232 	jnz	LABEL(exit)		/* mismatch or null char seen */
    233 
    234 #ifdef USE_AS_STRNCMP
    235 	sub	$16, %r11
    236 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    237 #endif
    238 	add	$16, %rcx
    239 	movdqa	%xmm4, %xmm3
    240 
    241 	add	$16, %r10
    242 	jg	LABEL(nibble_ashr_1)	/* cross page boundary */
    243 
    244 	movdqa	(%rsi, %rcx), %xmm1
    245 	movdqa	(%rdi, %rcx), %xmm2
    246 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    247 
    248 	psrldq	$1, %xmm3
    249 	pslldq 	$15, %xmm2
    250 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    251 
    252 	pcmpeqb	%xmm1, %xmm0
    253 	pcmpeqb	%xmm2, %xmm1
    254 	psubb	%xmm0, %xmm1
    255 	pmovmskb %xmm1, %edx
    256 	sub	$0xffff, %edx
    257 	jnz	LABEL(exit)		/* mismatch or null char seen */
    258 
    259 #ifdef USE_AS_STRNCMP
    260 	sub	$16, %r11
    261 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    262 #endif
    263 	add	$16, %rcx
    264 	movdqa	%xmm4, %xmm3
    265 	jmp	LABEL(loop_ashr_1)
    266 
    267 	/*
    268 	 * Nibble avoids loads across page boundary. This is to avoid a potential
    269 	 * access into unmapped memory.
        	 * Only the 15 bytes left over from the rdi block that is already
        	 * loaded (%xmm4) are compared with the rsi block. If they match
        	 * with no null char, and for strncmp more than 15 bytes remain,
        	 * the string continues, so %r10 is rewound by one page and the
        	 * normal gobble step loads the next rdi block.
    270 	 */
    271 	.p2align 4
    272 LABEL(nibble_ashr_1):
    273 	psrldq	$1, %xmm4		/* keep only bytes already loaded */
    274 	movdqa	(%rsi, %rcx), %xmm1
    275 	pcmpeqb	%xmm1, %xmm0
    276 	pcmpeqb	%xmm4, %xmm1
    277 	psubb	%xmm0, %xmm1
    278 	pmovmskb %xmm1, %edx
    279 	sub	$0x7fff, %edx		/* low 15 bytes must be equal, not null */
    280 	jnz	LABEL(exit)
    281 #ifdef USE_AS_STRNCMP
    282 	cmp	$15, %r11
    283 	jbe	LABEL(strcmp_exitz)	/* limit ends within those 15 bytes */
    284 #endif
    285 	pxor	%xmm0, %xmm0
    286 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
    287 	jmp	LABEL(gobble_ashr_1)
    288 
    289 /*
    290  * ashr_2 handles the following cases:
    291  * 	abs(str1 offset - str2 offset) = 14
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the larger of
         * the two string offsets (it belongs to %rsi) and %edx is 0xffff.
         * Every aligned %rsi block is compared against 16 bytes of the other
         * string rebuilt from two aligned %rdi blocks: the previous block
         * shifted right by 2 bytes or-ed with the current block shifted left
         * by 14 bytes.
    292  */
    293 	.p2align 4
    294 LABEL(ashr_2):
    295 	pxor	%xmm0, %xmm0
    296 	movdqa	(%rdi), %xmm2
    297 	movdqa	(%rsi), %xmm1
    298 	pcmpeqb	%xmm1, %xmm0		/* any null chars in rsi block? */
    299 	pslldq	$14, %xmm2		/* line rdi block up with rsi block */
    300 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
    301 	psubb	%xmm0, %xmm2		/* 0xff only where equal and not null */
    302 	pmovmskb %xmm2, %r9d
    303 	shr	%cl, %edx		/* adjust 0xffff for offset */
    304 	shr	%cl, %r9d		/* drop bytes before string start */
    305 	sub	%r9d, %edx
    306 	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
    307 	movdqa	(%rdi), %xmm3		/* unshifted rdi block, merged below */
    308 	UPDATE_STRNCMP_COUNTER
    309 
    310 	pxor	%xmm0, %xmm0
    311 	mov	$16, %rcx	/* index for loads */
    312 	mov	$2, %r9d	/* rdi bytes already examined. Used in exit code */
    313 	/*
    314 	 * Setup %r10 value allows us to detect crossing a page boundary.
    315 	 * When %r10 goes positive we are crossing a page boundary and
    316 	 * need to do a nibble.
    317 	 */
    318 	lea	2(%rdi), %r10
    319 	and	$0xfff, %r10	/* offset into 4K page */
    320 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    321 	movdqa	%xmm3, %xmm4
    322 
    323 	.p2align 4
    324 LABEL(loop_ashr_2):
    325 	add	$16, %r10
    326 	jg	LABEL(nibble_ashr_2)	/* cross page boundary */
    327 
    328 LABEL(gobble_ashr_2):
    329 	movdqa	(%rsi, %rcx), %xmm1
    330 	movdqa	(%rdi, %rcx), %xmm2
    331 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    332 
    333 	psrldq	$2, %xmm3		/* tail of previous rdi block */
    334 	pslldq	$14, %xmm2		/* head of current rdi block */
    335 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    336 
    337 	pcmpeqb	%xmm1, %xmm0
    338 	pcmpeqb	%xmm2, %xmm1
    339 	psubb	%xmm0, %xmm1
    340 	pmovmskb %xmm1, %edx
    341 	sub	$0xffff, %edx
    342 	jnz	LABEL(exit)		/* mismatch or null char seen */
    343 
    344 #ifdef USE_AS_STRNCMP
    345 	sub	$16, %r11
    346 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    347 #endif
    348 
    349 	add	$16, %rcx
    350 	movdqa	%xmm4, %xmm3
    351 
    352 	add	$16, %r10
    353 	jg	LABEL(nibble_ashr_2)	/* cross page boundary */
    354 
    355 	movdqa	(%rsi, %rcx), %xmm1
    356 	movdqa	(%rdi, %rcx), %xmm2
    357 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    358 
    359 	psrldq	$2, %xmm3
    360 	pslldq 	$14, %xmm2
    361 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    362 
    363 	pcmpeqb	%xmm1, %xmm0
    364 	pcmpeqb	%xmm2, %xmm1
    365 	psubb	%xmm0, %xmm1
    366 	pmovmskb %xmm1, %edx
    367 	sub	$0xffff, %edx
    368 	jnz	LABEL(exit)		/* mismatch or null char seen */
    369 
    370 #ifdef USE_AS_STRNCMP
    371 	sub	$16, %r11
    372 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    373 #endif
    374 
    375 	add	$16, %rcx
    376 	movdqa	%xmm4, %xmm3
    377 	jmp	LABEL(loop_ashr_2)
    378 
        	/*
        	 * Nibble: the next rdi block starts a new page, so compare only
        	 * the 14 bytes left over from the rdi block already in %xmm4. If
        	 * they match with no null char (and, for strncmp, more than 14
        	 * bytes remain), rewind %r10 by one page and resume in gobble.
        	 */
    379 	.p2align 4
    380 LABEL(nibble_ashr_2):
    381 	psrldq	$2, %xmm4		/* keep only bytes already loaded */
    382 	movdqa	(%rsi, %rcx), %xmm1
    383 	pcmpeqb	%xmm1, %xmm0
    384 	pcmpeqb	%xmm4, %xmm1
    385 	psubb	%xmm0, %xmm1
    386 	pmovmskb %xmm1, %edx
    387 	sub	$0x3fff, %edx		/* low 14 bytes must be equal, not null */
    388 	jnz	LABEL(exit)
    389 #ifdef USE_AS_STRNCMP
    390 	cmp	$14, %r11
    391 	jbe	LABEL(strcmp_exitz)	/* limit ends within those 14 bytes */
    392 #endif
    393 	pxor	%xmm0, %xmm0
    394 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
    395 	jmp	LABEL(gobble_ashr_2)
    396 
    397 /*
    398  * ashr_3 handles the following cases:
    399  * 	abs(str1 offset - str2 offset) = 13
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the larger of
         * the two string offsets (it belongs to %rsi) and %edx is 0xffff.
         * Every aligned %rsi block is compared against 16 bytes of the other
         * string rebuilt from two aligned %rdi blocks: the previous block
         * shifted right by 3 bytes or-ed with the current block shifted left
         * by 13 bytes.
    400  */
    401 	.p2align 4
    402 LABEL(ashr_3):
    403 	pxor	%xmm0, %xmm0
    404 	movdqa	(%rdi), %xmm2
    405 	movdqa	(%rsi), %xmm1
    406 	pcmpeqb	%xmm1, %xmm0		/* any null chars in rsi block? */
    407 	pslldq	$13, %xmm2		/* line rdi block up with rsi block */
    408 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
    409 	psubb	%xmm0, %xmm2		/* 0xff only where equal and not null */
    410 	pmovmskb %xmm2, %r9d
    411 	shr	%cl, %edx		/* adjust 0xffff for offset */
    412 	shr	%cl, %r9d		/* drop bytes before string start */
    413 	sub	%r9d, %edx
    414 	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
    415 	movdqa	(%rdi), %xmm3		/* unshifted rdi block, merged below */
    416 
    417 	UPDATE_STRNCMP_COUNTER
    418 
    419 	pxor	%xmm0, %xmm0
    420 	mov	$16, %rcx	/* index for loads */
    421 	mov	$3, %r9d	/* rdi bytes already examined. Used in exit code */
    422 	/*
    423 	 * Setup %r10 value allows us to detect crossing a page boundary.
    424 	 * When %r10 goes positive we are crossing a page boundary and
    425 	 * need to do a nibble.
    426 	 */
    427 	lea	3(%rdi), %r10
    428 	and	$0xfff, %r10	/* offset into 4K page */
    429 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    430 	movdqa	%xmm3, %xmm4
    431 
    432 	.p2align 4
    433 LABEL(loop_ashr_3):
    434 	add	$16, %r10
    435 	jg	LABEL(nibble_ashr_3)	/* cross page boundary */
    436 
    437 LABEL(gobble_ashr_3):
    438 	movdqa	(%rsi, %rcx), %xmm1
    439 	movdqa	(%rdi, %rcx), %xmm2
    440 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    441 
    442 	psrldq	$3, %xmm3		/* tail of previous rdi block */
    443 	pslldq	$13, %xmm2		/* head of current rdi block */
    444 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    445 
    446 	pcmpeqb	%xmm1, %xmm0
    447 	pcmpeqb	%xmm2, %xmm1
    448 	psubb	%xmm0, %xmm1
    449 	pmovmskb %xmm1, %edx
    450 	sub	$0xffff, %edx
    451 	jnz	LABEL(exit)		/* mismatch or null char seen */
    452 
    453 #ifdef USE_AS_STRNCMP
    454 	sub	$16, %r11
    455 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    456 #endif
    457 
    458 	add	$16, %rcx
    459 	movdqa	%xmm4, %xmm3
    460 
    461 	add	$16, %r10
    462 	jg	LABEL(nibble_ashr_3)	/* cross page boundary */
    463 
    464 	movdqa	(%rsi, %rcx), %xmm1
    465 	movdqa	(%rdi, %rcx), %xmm2
    466 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    467 
    468 	psrldq	$3, %xmm3
    469 	pslldq 	$13, %xmm2
    470 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    471 
    472 	pcmpeqb	%xmm1, %xmm0
    473 	pcmpeqb	%xmm2, %xmm1
    474 	psubb	%xmm0, %xmm1
    475 	pmovmskb %xmm1, %edx
    476 	sub	$0xffff, %edx
    477 	jnz	LABEL(exit)		/* mismatch or null char seen */
    478 
    479 #ifdef USE_AS_STRNCMP
    480 	sub	$16, %r11
    481 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    482 #endif
    483 
    484 	add	$16, %rcx
    485 	movdqa	%xmm4, %xmm3
    486 	jmp	LABEL(loop_ashr_3)
    487 
        	/*
        	 * Nibble: the next rdi block starts a new page, so compare only
        	 * the 13 bytes left over from the rdi block already in %xmm4. If
        	 * they match with no null char (and, for strncmp, more than 13
        	 * bytes remain), rewind %r10 by one page and resume in gobble.
        	 */
    488 	.p2align 4
    489 LABEL(nibble_ashr_3):
    490 	psrldq	$3, %xmm4		/* keep only bytes already loaded */
    491 	movdqa	(%rsi, %rcx), %xmm1
    492 	pcmpeqb	%xmm1, %xmm0
    493 	pcmpeqb	%xmm4, %xmm1
    494 	psubb	%xmm0, %xmm1
    495 	pmovmskb %xmm1, %edx
    496 	sub	$0x1fff, %edx		/* low 13 bytes must be equal, not null */
    497 	jnz	LABEL(exit)
    498 #ifdef USE_AS_STRNCMP
    499 	cmp	$13, %r11
    500 	jbe	LABEL(strcmp_exitz)	/* limit ends within those 13 bytes */
    501 #endif
    502 	pxor	%xmm0, %xmm0
    503 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
    504 	jmp	LABEL(gobble_ashr_3)
    505 
    506 /*
    507  * ashr_4 handles the following cases:
    508  * 	abs(str1 offset - str2 offset) = 12
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the larger of
         * the two string offsets (it belongs to %rsi) and %edx is 0xffff.
         * Every aligned %rsi block is compared against 16 bytes of the other
         * string rebuilt from two aligned %rdi blocks: the previous block
         * shifted right by 4 bytes or-ed with the current block shifted left
         * by 12 bytes.
    509  */
    510 	.p2align 4
    511 LABEL(ashr_4):
    512 	pxor	%xmm0, %xmm0
    513 	movdqa	(%rdi), %xmm2
    514 	movdqa	(%rsi), %xmm1
    515 	pcmpeqb	%xmm1, %xmm0		/* any null chars in rsi block? */
    516 	pslldq	$12, %xmm2		/* line rdi block up with rsi block */
    517 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
    518 	psubb	%xmm0, %xmm2		/* 0xff only where equal and not null */
    519 	pmovmskb %xmm2, %r9d
    520 	shr	%cl, %edx		/* adjust 0xffff for offset */
    521 	shr	%cl, %r9d		/* drop bytes before string start */
    522 	sub	%r9d, %edx
    523 	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
    524 	movdqa	(%rdi), %xmm3		/* unshifted rdi block, merged below */
    525 
    526 	UPDATE_STRNCMP_COUNTER
    527 
    528 	pxor	%xmm0, %xmm0
    529 	mov	$16, %rcx	/* index for loads */
    530 	mov	$4, %r9d	/* rdi bytes already examined. Used in exit code */
    531 	/*
    532 	 * Setup %r10 value allows us to detect crossing a page boundary.
    533 	 * When %r10 goes positive we are crossing a page boundary and
    534 	 * need to do a nibble.
    535 	 */
    536 	lea	4(%rdi), %r10
    537 	and	$0xfff, %r10	/* offset into 4K page */
    538 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    539 	movdqa	%xmm3, %xmm4
    540 
    541 	.p2align 4
    542 LABEL(loop_ashr_4):
    543 	add	$16, %r10
    544 	jg	LABEL(nibble_ashr_4)	/* cross page boundary */
    545 
    546 LABEL(gobble_ashr_4):
    547 	movdqa	(%rsi, %rcx), %xmm1
    548 	movdqa	(%rdi, %rcx), %xmm2
    549 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    550 
    551 	psrldq	$4, %xmm3		/* tail of previous rdi block */
    552 	pslldq	$12, %xmm2		/* head of current rdi block */
    553 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    554 
    555 	pcmpeqb	%xmm1, %xmm0
    556 	pcmpeqb	%xmm2, %xmm1
    557 	psubb	%xmm0, %xmm1
    558 	pmovmskb %xmm1, %edx
    559 	sub	$0xffff, %edx
    560 	jnz	LABEL(exit)		/* mismatch or null char seen */
    561 
    562 #ifdef USE_AS_STRNCMP
    563 	sub	$16, %r11
    564 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    565 #endif
    566 
    567 	add	$16, %rcx
    568 	movdqa	%xmm4, %xmm3
    569 
    570 	add	$16, %r10
    571 	jg	LABEL(nibble_ashr_4)	/* cross page boundary */
    572 
    573 	movdqa	(%rsi, %rcx), %xmm1
    574 	movdqa	(%rdi, %rcx), %xmm2
    575 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    576 
    577 	psrldq	$4, %xmm3
    578 	pslldq 	$12, %xmm2
    579 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    580 
    581 	pcmpeqb	%xmm1, %xmm0
    582 	pcmpeqb	%xmm2, %xmm1
    583 	psubb	%xmm0, %xmm1
    584 	pmovmskb %xmm1, %edx
    585 	sub	$0xffff, %edx
    586 	jnz	LABEL(exit)		/* mismatch or null char seen */
    587 
    588 #ifdef USE_AS_STRNCMP
    589 	sub	$16, %r11
    590 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    591 #endif
    592 
    593 	add	$16, %rcx
    594 	movdqa	%xmm4, %xmm3
    595 	jmp	LABEL(loop_ashr_4)
    596 
        	/*
        	 * Nibble: the next rdi block starts a new page, so compare only
        	 * the 12 bytes left over from the rdi block already in %xmm4. If
        	 * they match with no null char (and, for strncmp, more than 12
        	 * bytes remain), rewind %r10 by one page and resume in gobble.
        	 */
    597 	.p2align 4
    598 LABEL(nibble_ashr_4):
    599 	psrldq	$4, %xmm4		/* keep only bytes already loaded */
    600 	movdqa	(%rsi, %rcx), %xmm1
    601 	pcmpeqb	%xmm1, %xmm0
    602 	pcmpeqb	%xmm4, %xmm1
    603 	psubb	%xmm0, %xmm1
    604 	pmovmskb %xmm1, %edx
    605 	sub	$0x0fff, %edx		/* low 12 bytes must be equal, not null */
    606 	jnz	LABEL(exit)
    607 #ifdef USE_AS_STRNCMP
    608 	cmp	$12, %r11
    609 	jbe	LABEL(strcmp_exitz)	/* limit ends within those 12 bytes */
    610 #endif
    611 	pxor	%xmm0, %xmm0
    612 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
    613 	jmp	LABEL(gobble_ashr_4)
    614 
    615 /*
    616  * ashr_5 handles the following cases:
    617  * 	abs(str1 offset - str2 offset) = 11
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the larger of
         * the two string offsets (it belongs to %rsi) and %edx is 0xffff.
         * Every aligned %rsi block is compared against 16 bytes of the other
         * string rebuilt from two aligned %rdi blocks: the previous block
         * shifted right by 5 bytes or-ed with the current block shifted left
         * by 11 bytes.
    618  */
    619 	.p2align 4
    620 LABEL(ashr_5):
    621 	pxor	%xmm0, %xmm0
    622 	movdqa	(%rdi), %xmm2
    623 	movdqa	(%rsi), %xmm1
    624 	pcmpeqb	%xmm1, %xmm0		/* any null chars in rsi block? */
    625 	pslldq	$11, %xmm2		/* line rdi block up with rsi block */
    626 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
    627 	psubb	%xmm0, %xmm2		/* 0xff only where equal and not null */
    628 	pmovmskb %xmm2, %r9d
    629 	shr	%cl, %edx		/* adjust 0xffff for offset */
    630 	shr	%cl, %r9d		/* drop bytes before string start */
    631 	sub	%r9d, %edx
    632 	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
    633 	movdqa	(%rdi), %xmm3		/* unshifted rdi block, merged below */
    634 
    635 	UPDATE_STRNCMP_COUNTER
    636 
    637 	pxor	%xmm0, %xmm0
    638 	mov	$16, %rcx	/* index for loads */
    639 	mov	$5, %r9d	/* rdi bytes already examined. Used in exit code */
    640 	/*
    641 	 * Setup %r10 value allows us to detect crossing a page boundary.
    642 	 * When %r10 goes positive we are crossing a page boundary and
    643 	 * need to do a nibble.
    644 	 */
    645 	lea	5(%rdi), %r10
    646 	and	$0xfff, %r10	/* offset into 4K page */
    647 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    648 	movdqa	%xmm3, %xmm4
    649 
    650 	.p2align 4
    651 LABEL(loop_ashr_5):
    652 	add	$16, %r10
    653 	jg	LABEL(nibble_ashr_5)	/* cross page boundary */
    654 
    655 LABEL(gobble_ashr_5):
    656 	movdqa	(%rsi, %rcx), %xmm1
    657 	movdqa	(%rdi, %rcx), %xmm2
    658 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    659 
    660 	psrldq	$5, %xmm3		/* tail of previous rdi block */
    661 	pslldq	$11, %xmm2		/* head of current rdi block */
    662 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    663 
    664 	pcmpeqb	%xmm1, %xmm0
    665 	pcmpeqb	%xmm2, %xmm1
    666 	psubb	%xmm0, %xmm1
    667 	pmovmskb %xmm1, %edx
    668 	sub	$0xffff, %edx
    669 	jnz	LABEL(exit)		/* mismatch or null char seen */
    670 
    671 #ifdef USE_AS_STRNCMP
    672 	sub	$16, %r11
    673 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    674 #endif
    675 
    676 	add	$16, %rcx
    677 	movdqa	%xmm4, %xmm3
    678 
    679 	add	$16, %r10
    680 	jg	LABEL(nibble_ashr_5)	/* cross page boundary */
    681 
    682 	movdqa	(%rsi, %rcx), %xmm1
    683 	movdqa	(%rdi, %rcx), %xmm2
    684 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    685 
    686 	psrldq	$5, %xmm3
    687 	pslldq 	$11, %xmm2
    688 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    689 
    690 	pcmpeqb	%xmm1, %xmm0
    691 	pcmpeqb	%xmm2, %xmm1
    692 	psubb	%xmm0, %xmm1
    693 	pmovmskb %xmm1, %edx
    694 	sub	$0xffff, %edx
    695 	jnz	LABEL(exit)		/* mismatch or null char seen */
    696 
    697 #ifdef USE_AS_STRNCMP
    698 	sub	$16, %r11
    699 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    700 #endif
    701 
    702 	add	$16, %rcx
    703 	movdqa	%xmm4, %xmm3
    704 	jmp	LABEL(loop_ashr_5)
    705 
        	/*
        	 * Nibble: the next rdi block starts a new page, so compare only
        	 * the 11 bytes left over from the rdi block already in %xmm4. If
        	 * they match with no null char (and, for strncmp, more than 11
        	 * bytes remain), rewind %r10 by one page and resume in gobble.
        	 */
    706 	.p2align 4
    707 LABEL(nibble_ashr_5):
    708 	psrldq	$5, %xmm4		/* keep only bytes already loaded */
    709 	movdqa	(%rsi, %rcx), %xmm1
    710 	pcmpeqb	%xmm1, %xmm0
    711 	pcmpeqb	%xmm4, %xmm1
    712 	psubb	%xmm0, %xmm1
    713 	pmovmskb %xmm1, %edx
    714 	sub	$0x07ff, %edx		/* low 11 bytes must be equal, not null */
    715 	jnz	LABEL(exit)
    716 #ifdef USE_AS_STRNCMP
    717 	cmp	$11, %r11
    718 	jbe	LABEL(strcmp_exitz)	/* limit ends within those 11 bytes */
    719 #endif
    720  	pxor	%xmm0, %xmm0
    721 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
    722 	jmp	LABEL(gobble_ashr_5)
    723 
    724 /*
    725  * ashr_6 handles the following cases:
    726  * 	abs(str1 offset - str2 offset) = 10
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the larger of
         * the two string offsets (it belongs to %rsi) and %edx is 0xffff.
         * Every aligned %rsi block is compared against 16 bytes of the other
         * string rebuilt from two aligned %rdi blocks: the previous block
         * shifted right by 6 bytes or-ed with the current block shifted left
         * by 10 bytes.
    727  */
    728 	.p2align 4
    729 LABEL(ashr_6):
    730 	pxor	%xmm0, %xmm0
    731 	movdqa	(%rdi), %xmm2
    732 	movdqa	(%rsi), %xmm1
    733 	pcmpeqb	%xmm1, %xmm0		/* any null chars in rsi block? */
    734 	pslldq	$10, %xmm2		/* line rdi block up with rsi block */
    735 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
    736 	psubb	%xmm0, %xmm2		/* 0xff only where equal and not null */
    737 	pmovmskb %xmm2, %r9d
    738 	shr	%cl, %edx		/* adjust 0xffff for offset */
    739 	shr	%cl, %r9d		/* drop bytes before string start */
    740 	sub	%r9d, %edx
    741 	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
    742 	movdqa	(%rdi), %xmm3		/* unshifted rdi block, merged below */
    743 
    744 	UPDATE_STRNCMP_COUNTER
    745 
    746 	pxor	%xmm0, %xmm0
    747 	mov	$16, %rcx	/* index for loads */
    748 	mov	$6, %r9d	/* rdi bytes already examined. Used in exit code */
    749 	/*
    750 	 * Setup %r10 value allows us to detect crossing a page boundary.
    751 	 * When %r10 goes positive we are crossing a page boundary and
    752 	 * need to do a nibble.
    753 	 */
    754 	lea	6(%rdi), %r10
    755 	and	$0xfff, %r10	/* offset into 4K page */
    756 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    757 	movdqa	%xmm3, %xmm4
    758 
    759 	.p2align 4
    760 LABEL(loop_ashr_6):
    761 	add	$16, %r10
    762 	jg	LABEL(nibble_ashr_6)	/* cross page boundary */
    763 
    764 LABEL(gobble_ashr_6):
    765 	movdqa	(%rsi, %rcx), %xmm1
    766 	movdqa	(%rdi, %rcx), %xmm2
    767 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    768 
    769 	psrldq	$6, %xmm3		/* tail of previous rdi block */
    770 	pslldq	$10, %xmm2		/* head of current rdi block */
    771 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    772 
    773 	pcmpeqb	%xmm1, %xmm0
    774 	pcmpeqb	%xmm2, %xmm1
    775 	psubb	%xmm0, %xmm1
    776 	pmovmskb %xmm1, %edx
    777 	sub	$0xffff, %edx
    778 	jnz	LABEL(exit)		/* mismatch or null char seen */
    779 
    780 #ifdef USE_AS_STRNCMP
    781 	sub	$16, %r11
    782 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    783 #endif
    784 
    785 	add	$16, %rcx
    786 	movdqa	%xmm4, %xmm3
    787 
    788 	add	$16, %r10
    789 	jg	LABEL(nibble_ashr_6)	/* cross page boundary */
    790 
    791 	movdqa	(%rsi, %rcx), %xmm1
    792 	movdqa	(%rdi, %rcx), %xmm2
    793 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    794 
    795 	psrldq	$6, %xmm3
    796 	pslldq 	$10, %xmm2
    797 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    798 
    799 	pcmpeqb	%xmm1, %xmm0
    800 	pcmpeqb	%xmm2, %xmm1
    801 	psubb	%xmm0, %xmm1
    802 	pmovmskb %xmm1, %edx
    803 	sub	$0xffff, %edx
    804 	jnz	LABEL(exit)		/* mismatch or null char seen */
    805 
    806 #ifdef USE_AS_STRNCMP
    807 	sub	$16, %r11
    808 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    809 #endif
    810 
    811 	add	$16, %rcx
    812 	movdqa	%xmm4, %xmm3
    813 	jmp	LABEL(loop_ashr_6)
    814 
        	/*
        	 * Nibble: the next rdi block starts a new page, so compare only
        	 * the 10 bytes left over from the rdi block already in %xmm4. If
        	 * they match with no null char (and, for strncmp, more than 10
        	 * bytes remain), rewind %r10 by one page and resume in gobble.
        	 */
    815 	.p2align 4
    816 LABEL(nibble_ashr_6):
    817 	psrldq	$6, %xmm4		/* keep only bytes already loaded */
    818 	movdqa	(%rsi, %rcx), %xmm1
    819 	pcmpeqb	%xmm1, %xmm0
    820 	pcmpeqb	%xmm4, %xmm1
    821 	psubb	%xmm0, %xmm1
    822 	pmovmskb %xmm1, %edx
    823 	sub	$0x03ff, %edx		/* low 10 bytes must be equal, not null */
    824 	jnz	LABEL(exit)
    825 #ifdef USE_AS_STRNCMP
    826 	cmp	$10, %r11
    827 	jbe	LABEL(strcmp_exitz)	/* limit ends within those 10 bytes */
    828 #endif
    829  	pxor	%xmm0, %xmm0
    830 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
    831 	jmp	LABEL(gobble_ashr_6)
    832 
    833 /*
    834  * ashr_7 handles the following cases:
    835  * 	abs(str1 offset - str2 offset) = 9
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the larger of
         * the two string offsets (it belongs to %rsi) and %edx is 0xffff.
         * Every aligned %rsi block is compared against 16 bytes of the other
         * string rebuilt from two aligned %rdi blocks: the previous block
         * shifted right by 7 bytes or-ed with the current block shifted left
         * by 9 bytes.
    836  */
    837 	.p2align 4
    838 LABEL(ashr_7):
    839 	pxor	%xmm0, %xmm0
    840 	movdqa	(%rdi), %xmm2
    841 	movdqa	(%rsi), %xmm1
    842 	pcmpeqb	%xmm1, %xmm0		/* any null chars in rsi block? */
    843 	pslldq	$9, %xmm2		/* line rdi block up with rsi block */
    844 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
    845 	psubb	%xmm0, %xmm2		/* 0xff only where equal and not null */
    846 	pmovmskb %xmm2, %r9d
    847 	shr	%cl, %edx		/* adjust 0xffff for offset */
    848 	shr	%cl, %r9d		/* drop bytes before string start */
    849 	sub	%r9d, %edx
    850 	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
    851 	movdqa	(%rdi), %xmm3		/* unshifted rdi block, merged below */
    852 
    853 	UPDATE_STRNCMP_COUNTER
    854 
    855 	pxor	%xmm0, %xmm0
    856 	mov	$16, %rcx	/* index for loads */
    857 	mov	$7, %r9d	/* rdi bytes already examined. Used in exit code */
    858 	/*
    859 	 * Setup %r10 value allows us to detect crossing a page boundary.
    860 	 * When %r10 goes positive we are crossing a page boundary and
    861 	 * need to do a nibble.
    862 	 */
    863 	lea	7(%rdi), %r10
    864 	and	$0xfff, %r10	/* offset into 4K page */
    865 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    866 	movdqa	%xmm3, %xmm4
    867 
    868 	.p2align 4
    869 LABEL(loop_ashr_7):
    870 	add	$16, %r10
    871 	jg	LABEL(nibble_ashr_7)	/* cross page boundary */
    872 
    873 LABEL(gobble_ashr_7):
    874 	movdqa	(%rsi, %rcx), %xmm1
    875 	movdqa	(%rdi, %rcx), %xmm2
    876 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    877 
    878 	psrldq	$7, %xmm3		/* tail of previous rdi block */
    879 	pslldq	$9, %xmm2		/* head of current rdi block */
    880 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    881 
    882 	pcmpeqb	%xmm1, %xmm0
    883 	pcmpeqb	%xmm2, %xmm1
    884 	psubb	%xmm0, %xmm1
    885 	pmovmskb %xmm1, %edx
    886 	sub	$0xffff, %edx
    887 	jnz	LABEL(exit)		/* mismatch or null char seen */
    888 
    889 #ifdef USE_AS_STRNCMP
    890 	sub	$16, %r11
    891 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    892 #endif
    893 
    894 	add	$16, %rcx
    895 	movdqa	%xmm4, %xmm3
    896 
    897 	add	$16, %r10
    898 	jg	LABEL(nibble_ashr_7)	/* cross page boundary */
    899 
    900 	movdqa	(%rsi, %rcx), %xmm1
    901 	movdqa	(%rdi, %rcx), %xmm2
    902 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    903 
    904 	psrldq	$7, %xmm3
    905 	pslldq 	$9, %xmm2
    906 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    907 
    908 	pcmpeqb	%xmm1, %xmm0
    909 	pcmpeqb	%xmm2, %xmm1
    910 	psubb	%xmm0, %xmm1
    911 	pmovmskb %xmm1, %edx
    912 	sub	$0xffff, %edx
    913 	jnz	LABEL(exit)		/* mismatch or null char seen */
    914 
    915 #ifdef USE_AS_STRNCMP
    916 	sub	$16, %r11
    917 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
    918 #endif
    919 
    920 	add	$16, %rcx
    921 	movdqa	%xmm4, %xmm3
    922 	jmp	LABEL(loop_ashr_7)
    923 
        	/*
        	 * Nibble: the next rdi block starts a new page, so compare only
        	 * the 9 bytes left over from the rdi block already in %xmm4. If
        	 * they match with no null char (and, for strncmp, more than 9
        	 * bytes remain), rewind %r10 by one page and resume in gobble.
        	 */
    924 	.p2align 4
    925 LABEL(nibble_ashr_7):
    926 	psrldq	$7, %xmm4		/* keep only bytes already loaded */
    927 	movdqa	(%rsi, %rcx), %xmm1
    928 	pcmpeqb	%xmm1, %xmm0
    929 	pcmpeqb	%xmm4, %xmm1
    930 	psubb	%xmm0, %xmm1
    931 	pmovmskb %xmm1, %edx
    932 	sub	$0x01ff, %edx		/* low 9 bytes must be equal, not null */
    933 	jnz	LABEL(exit)
    934 #ifdef USE_AS_STRNCMP
    935 	cmp	$9, %r11
    936 	jbe	LABEL(strcmp_exitz)	/* limit ends within those 9 bytes */
    937 #endif
    938  	pxor	%xmm0, %xmm0
    939 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
    940 	jmp	LABEL(gobble_ashr_7)
    941 
    942 /*
    943  * ashr_8 handles the following cases:
    944  * 	abs(str1 offset - str2 offset) = 8
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the larger of
         * the two string offsets (it belongs to %rsi) and %edx is 0xffff.
         * Every aligned %rsi block is compared against 16 bytes of the other
         * string rebuilt from two aligned %rdi blocks: the previous block
         * shifted right by 8 bytes or-ed with the current block shifted left
         * by 8 bytes.
    945  */
    946 	.p2align 4
    947 LABEL(ashr_8):
    948 	pxor	%xmm0, %xmm0
    949 	movdqa	(%rdi), %xmm2
    950 	movdqa	(%rsi), %xmm1
    951 	pcmpeqb	%xmm1, %xmm0		/* any null chars in rsi block? */
    952 	pslldq	$8, %xmm2		/* line rdi block up with rsi block */
    953 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
    954 	psubb	%xmm0, %xmm2		/* 0xff only where equal and not null */
    955 	pmovmskb %xmm2, %r9d
    956 	shr	%cl, %edx		/* adjust 0xffff for offset */
    957 	shr	%cl, %r9d		/* drop bytes before string start */
    958 	sub	%r9d, %edx
    959 	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
    960 	movdqa	(%rdi), %xmm3		/* unshifted rdi block, merged below */
    961 
    962 	UPDATE_STRNCMP_COUNTER
    963 
    964 	pxor	%xmm0, %xmm0
    965 	mov	$16, %rcx	/* index for loads */
    966 	mov	$8, %r9d	/* rdi bytes already examined. Used in exit code */
    967 	/*
    968 	 * Setup %r10 value allows us to detect crossing a page boundary.
    969 	 * When %r10 goes positive we are crossing a page boundary and
    970 	 * need to do a nibble.
    971 	 */
    972 	lea	8(%rdi), %r10
    973 	and	$0xfff, %r10	/* offset into 4K page */
    974 	sub	$0x1000, %r10	/* subtract 4K pagesize */
    975 	movdqa	%xmm3, %xmm4
    976 
    977 	.p2align 4
    978 LABEL(loop_ashr_8):
    979 	add	$16, %r10
    980 	jg	LABEL(nibble_ashr_8)	/* cross page boundary */
    981 
    982 LABEL(gobble_ashr_8):
    983 	movdqa	(%rsi, %rcx), %xmm1
    984 	movdqa	(%rdi, %rcx), %xmm2
    985 	movdqa	%xmm2, %xmm4		/* store for next cycle */
    986 
    987 	psrldq	$8, %xmm3		/* tail of previous rdi block */
    988 	pslldq	$8, %xmm2		/* head of current rdi block */
    989 	por	%xmm3, %xmm2		/* merge into one 16byte value */
    990 
    991 	pcmpeqb	%xmm1, %xmm0
    992 	pcmpeqb	%xmm2, %xmm1
    993 	psubb	%xmm0, %xmm1
    994 	pmovmskb %xmm1, %edx
    995 	sub	$0xffff, %edx
    996 	jnz	LABEL(exit)		/* mismatch or null char seen */
    997 
    998 #ifdef USE_AS_STRNCMP
    999 	sub	$16, %r11
   1000 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
   1001 #endif
   1002 
   1003 	add	$16, %rcx
   1004 	movdqa	%xmm4, %xmm3
   1005 
   1006 	add	$16, %r10
   1007 	jg	LABEL(nibble_ashr_8)	/* cross page boundary */
   1008 
   1009 	movdqa	(%rsi, %rcx), %xmm1
   1010 	movdqa	(%rdi, %rcx), %xmm2
   1011 	movdqa	%xmm2, %xmm4		/* store for next cycle */
   1012 
   1013 	psrldq	$8, %xmm3
   1014 	pslldq 	$8, %xmm2
   1015 	por	%xmm3, %xmm2		/* merge into one 16byte value */
   1016 
   1017 	pcmpeqb	%xmm1, %xmm0
   1018 	pcmpeqb	%xmm2, %xmm1
   1019 	psubb	%xmm0, %xmm1
   1020 	pmovmskb %xmm1, %edx
   1021 	sub	$0xffff, %edx
   1022 	jnz	LABEL(exit)		/* mismatch or null char seen */
   1023 
   1024 #ifdef USE_AS_STRNCMP
   1025 	sub	$16, %r11
   1026 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
   1027 #endif
   1028 
   1029 	add	$16, %rcx
   1030 	movdqa	%xmm4, %xmm3
   1031 	jmp	LABEL(loop_ashr_8)
   1032 
        	/*
        	 * Nibble: the next rdi block starts a new page, so compare only
        	 * the 8 bytes left over from the rdi block already in %xmm4. If
        	 * they match with no null char (and, for strncmp, more than 8
        	 * bytes remain), rewind %r10 by one page and resume in gobble.
        	 */
   1033 	.p2align 4
   1034 LABEL(nibble_ashr_8):
   1035 	psrldq	$8, %xmm4		/* keep only bytes already loaded */
   1036 	movdqa	(%rsi, %rcx), %xmm1
   1037 	pcmpeqb	%xmm1, %xmm0
   1038 	pcmpeqb	%xmm4, %xmm1
   1039 	psubb	%xmm0, %xmm1
   1040 	pmovmskb %xmm1, %edx
   1041 	sub	$0x00ff, %edx		/* low 8 bytes must be equal, not null */
   1042 	jnz	LABEL(exit)
   1043 #ifdef USE_AS_STRNCMP
   1044 	cmp	$8, %r11
   1045 	jbe	LABEL(strcmp_exitz)	/* limit ends within those 8 bytes */
   1046 #endif
   1047  	pxor	%xmm0, %xmm0
   1048 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
   1049 	jmp	LABEL(gobble_ashr_8)
   1050 
   1051 /*
   1052  * ashr_9 handles the following cases:
   1053  * 	abs(str1 offset - str2 offset) = 7
         *
         * On entry %rsi and %rdi are 16-byte aligned, %ecx is the larger of
         * the two string offsets (it belongs to %rsi) and %edx is 0xffff.
         * Every aligned %rsi block is compared against 16 bytes of the other
         * string rebuilt from two aligned %rdi blocks: the previous block
         * shifted right by 9 bytes or-ed with the current block shifted left
         * by 7 bytes.
   1054  */
   1055 	.p2align 4
   1056 LABEL(ashr_9):
   1057 	pxor	%xmm0, %xmm0
   1058 	movdqa	(%rdi), %xmm2
   1059 	movdqa	(%rsi), %xmm1
   1060 	pcmpeqb	%xmm1, %xmm0		/* any null chars in rsi block? */
   1061 	pslldq	$7, %xmm2		/* line rdi block up with rsi block */
   1062 	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
   1063 	psubb	%xmm0, %xmm2		/* 0xff only where equal and not null */
   1064 	pmovmskb %xmm2, %r9d
   1065 	shr	%cl, %edx		/* adjust 0xffff for offset */
   1066 	shr	%cl, %r9d		/* drop bytes before string start */
   1067 	sub	%r9d, %edx
   1068 	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
   1069 	movdqa	(%rdi), %xmm3		/* unshifted rdi block, merged below */
   1070 
   1071 	UPDATE_STRNCMP_COUNTER
   1072 
   1073 	pxor	%xmm0, %xmm0
   1074 	mov	$16, %rcx	/* index for loads */
   1075 	mov	$9, %r9d	/* rdi bytes already examined. Used in exit code */
   1076 	/*
   1077 	 * Setup %r10 value allows us to detect crossing a page boundary.
   1078 	 * When %r10 goes positive we are crossing a page boundary and
   1079 	 * need to do a nibble.
   1080 	 */
   1081 	lea	9(%rdi), %r10
   1082 	and	$0xfff, %r10	/* offset into 4K page */
   1083 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1084 	movdqa	%xmm3, %xmm4
   1085 
   1086 	.p2align 4
   1087 LABEL(loop_ashr_9):
   1088 	add	$16, %r10
   1089 	jg	LABEL(nibble_ashr_9)	/* cross page boundary */
   1090 
   1091 LABEL(gobble_ashr_9):
   1092 	movdqa	(%rsi, %rcx), %xmm1
   1093 	movdqa	(%rdi, %rcx), %xmm2
   1094 	movdqa	%xmm2, %xmm4		/* store for next cycle */
   1095 
   1096 	psrldq	$9, %xmm3		/* tail of previous rdi block */
   1097 	pslldq	$7, %xmm2		/* head of current rdi block */
   1098 	por	%xmm3, %xmm2		/* merge into one 16byte value */
   1099 
   1100 	pcmpeqb	%xmm1, %xmm0
   1101 	pcmpeqb	%xmm2, %xmm1
   1102 	psubb	%xmm0, %xmm1
   1103 	pmovmskb %xmm1, %edx
   1104 	sub	$0xffff, %edx
   1105 	jnz	LABEL(exit)		/* mismatch or null char seen */
   1106 
   1107 #ifdef USE_AS_STRNCMP
   1108 	sub	$16, %r11
   1109 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
   1110 #endif
   1111 
   1112 	add	$16, %rcx
   1113 	movdqa	%xmm4, %xmm3
   1114 
   1115 	add	$16, %r10
   1116 	jg	LABEL(nibble_ashr_9)	/* cross page boundary */
   1117 
   1118 	movdqa	(%rsi, %rcx), %xmm1
   1119 	movdqa	(%rdi, %rcx), %xmm2
   1120 	movdqa	%xmm2, %xmm4		/* store for next cycle */
   1121 
   1122 	psrldq	$9, %xmm3
   1123 	pslldq 	$7, %xmm2
   1124 	por	%xmm3, %xmm2		/* merge into one 16byte value */
   1125 
   1126 	pcmpeqb	%xmm1, %xmm0
   1127 	pcmpeqb	%xmm2, %xmm1
   1128 	psubb	%xmm0, %xmm1
   1129 	pmovmskb %xmm1, %edx
   1130 	sub	$0xffff, %edx
   1131 	jnz	LABEL(exit)		/* mismatch or null char seen */
   1132 
   1133 #ifdef USE_AS_STRNCMP
   1134 	sub	$16, %r11
   1135 	jbe	LABEL(strcmp_exitz)	/* byte limit reached */
   1136 #endif
   1137 
   1138 	add	$16, %rcx
   1139 	movdqa	%xmm4, %xmm3		/* store for next cycle */
   1140 	jmp	LABEL(loop_ashr_9)
   1141 
        	/*
        	 * Nibble: the next rdi block starts a new page, so compare only
        	 * the 7 bytes left over from the rdi block already in %xmm4. If
        	 * they match with no null char (and, for strncmp, more than 7
        	 * bytes remain), rewind %r10 by one page and resume in gobble.
        	 */
   1142 	.p2align 4
   1143 LABEL(nibble_ashr_9):
   1144 	psrldq	$9, %xmm4		/* keep only bytes already loaded */
   1145 	movdqa	(%rsi, %rcx), %xmm1
   1146 	pcmpeqb	%xmm1, %xmm0
   1147 	pcmpeqb	%xmm4, %xmm1
   1148 	psubb	%xmm0, %xmm1
   1149 	pmovmskb %xmm1, %edx
   1150 	sub	$0x007f, %edx		/* low 7 bytes must be equal, not null */
   1151 	jnz	LABEL(exit)
   1152 #ifdef USE_AS_STRNCMP
   1153 	cmp	$7, %r11
   1154 	jbe	LABEL(strcmp_exitz)	/* limit ends within those 7 bytes */
   1155 #endif
   1156  	pxor	%xmm0, %xmm0
   1157 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
   1158 	jmp	LABEL(gobble_ashr_9)
   1159 
   1160 /*
   1161  * ashr_10 handles the following cases:
   1162  * 	abs(str1 offset - str2 offset) = 6
         *
         * Entry: the 16-byte block at (%rdi), shifted left by 6 bytes, is
         * compared with the 16-byte block at (%rsi) (both loaded with
         * movdqa).  The result mask and the incoming %edx are both shifted
         * right by %cl; if they differ, control goes to less32bytes.
         * NOTE(review): %edx, %cl and %rax are set up before this label,
         * outside this section -- confirm against the dispatch code.
         *
         * loop_ashr_10 / gobble_ashr_10 (body unrolled twice): every pass
         * joins the upper 6 bytes of the previous (%rdi) block with the lower
         * 10 bytes of the next one and compares those 16 bytes with the block
         * at (%rsi, %rcx).  Any mismatch or NUL leaves the loop through exit;
         * for strncmp, %r11 is reduced by 16 per pass and a count that runs
         * out returns through strcmp_exitz.
         *
         * nibble_ashr_10 is taken when %r10 becomes positive (see the page
         * boundary comment below).  It compares only the 6 bytes left over
         * from the last (%rdi) block, without loading another (%rdi) block,
         * and then rejoins the loop at gobble_ashr_10.
   1163  */
   1164 	.p2align 4
   1165 LABEL(ashr_10):
   1166 	pxor	%xmm0, %xmm0
   1167 	movdqa	(%rdi), %xmm2
   1168 	movdqa	(%rsi), %xmm1
   1169 	pcmpeqb	%xmm1, %xmm0		/* flag NUL bytes in the (%rsi) block */
   1170 	pslldq	$6, %xmm2		/* move the (%rdi) bytes up by 6 positions */
   1171 	pcmpeqb	%xmm1, %xmm2
   1172 	psubb	%xmm0, %xmm2		/* 0xff survives only for equal, non-NUL bytes */
   1173 	pmovmskb %xmm2, %r9d
   1174 	shr	%cl, %edx
   1175 	shr	%cl, %r9d
   1176 	sub	%r9d, %edx
   1177 	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
   1178 	movdqa	(%rdi), %xmm3
   1179 
   1180 	UPDATE_STRNCMP_COUNTER	/* strncmp only: adjust the %r11 count (see macro); may return 0 */
   1181 
   1182 	pxor	%xmm0, %xmm0
   1183 	mov	$16, %rcx	/* index for loads */
   1184 	mov	$10, %r9d	/* rdi bytes already examined. Used in exit code */
   1185 	/*
   1186 	 * Setup %r10 value allows us to detect crossing a page boundary.
   1187 	 * When %r10 goes positive we are crossing a page boundary and
   1188 	 * need to do a nibble.
   1189 	 */
   1190 	lea	10(%rdi), %r10
   1191 	and	$0xfff, %r10	/* offset into 4K page */
   1192 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1193 	movdqa	%xmm3, %xmm4		/* copy consumed by nibble_ashr_10 */
   1194 
   1195 	.p2align 4
   1196 LABEL(loop_ashr_10):
   1197 	add	$16, %r10
   1198 	jg	LABEL(nibble_ashr_10)
   1199 
   1200 LABEL(gobble_ashr_10):
   1201 	movdqa	(%rsi, %rcx), %xmm1
   1202 	movdqa	(%rdi, %rcx), %xmm2
   1203 	movdqa	%xmm2, %xmm4		/* unshifted copy for the next pass */
   1204 
   1205 	psrldq	$10, %xmm3		/* upper 6 bytes of the previous block */
   1206 	pslldq	$6, %xmm2		/* lower 10 bytes of the new block */
   1207 	por	%xmm3, %xmm2		/* 16 consecutive (%rdi)-side bytes */
   1208 
   1209 	pcmpeqb	%xmm1, %xmm0
   1210 	pcmpeqb	%xmm2, %xmm1
   1211 	psubb	%xmm0, %xmm1
   1212 	pmovmskb %xmm1, %edx
   1213 	sub	$0xffff, %edx		/* zero only if all 16 matched, no NUL */
   1214 	jnz	LABEL(exit)
   1215 
   1216 #ifdef USE_AS_STRNCMP
   1217 	sub	$16, %r11		/* return 0 if the count was <= 16 */
   1218 	jbe	LABEL(strcmp_exitz)
   1219 #endif
   1220 
   1221 	add	$16, %rcx
   1222 	movdqa	%xmm4, %xmm3
   1223 
   1224 	add	$16, %r10
   1225 	jg	LABEL(nibble_ashr_10)	/* cross page boundary */
   1226 
   1227 	movdqa	(%rsi, %rcx), %xmm1
   1228 	movdqa	(%rdi, %rcx), %xmm2
   1229 	movdqa	%xmm2, %xmm4
   1230 
   1231 	psrldq	$10, %xmm3
   1232 	pslldq 	$6, %xmm2
   1233 	por	%xmm3, %xmm2
   1234 
   1235 	pcmpeqb	%xmm1, %xmm0
   1236 	pcmpeqb	%xmm2, %xmm1
   1237 	psubb	%xmm0, %xmm1
   1238 	pmovmskb %xmm1, %edx
   1239 	sub	$0xffff, %edx
   1240 	jnz	LABEL(exit)
   1241 
   1242 #ifdef USE_AS_STRNCMP
   1243 	sub	$16, %r11
   1244 	jbe	LABEL(strcmp_exitz)
   1245 #endif
   1246 
   1247 	add	$16, %rcx
   1248 	movdqa	%xmm4, %xmm3
   1249 	jmp	LABEL(loop_ashr_10)
   1250 
   1251 	.p2align 4
   1252 LABEL(nibble_ashr_10):
   1253 	psrldq	$10, %xmm4		/* the 6 bytes not compared yet */
   1254 	movdqa	(%rsi, %rcx), %xmm1
   1255 	pcmpeqb	%xmm1, %xmm0
   1256 	pcmpeqb	%xmm4, %xmm1
   1257 	psubb	%xmm0, %xmm1
   1258 	pmovmskb %xmm1, %edx
   1259 	sub	$0x003f, %edx		/* zero only if those 6 bytes matched, no NUL */
   1260 	jnz	LABEL(exit)
   1261 #ifdef USE_AS_STRNCMP
   1262 	cmp	$6, %r11		/* return 0 if the count is <= 6 */
   1263 	jbe	LABEL(strcmp_exitz)
   1264 #endif
   1265  	pxor	%xmm0, %xmm0		/* clear any NUL flags before rejoining the loop */
   1266 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
   1267 	jmp	LABEL(gobble_ashr_10)
   1268 
   1269 /*
   1270  * ashr_11 handles the following cases:
   1271  * 	abs(str1 offset - str2 offset) = 5
         *
         * Entry: the 16-byte block at (%rdi), shifted left by 5 bytes, is
         * compared with the 16-byte block at (%rsi) (both loaded with
         * movdqa).  The result mask and the incoming %edx are both shifted
         * right by %cl; if they differ, control goes to less32bytes.
         * NOTE(review): %edx, %cl and %rax are set up before this label,
         * outside this section -- confirm against the dispatch code.
         *
         * loop_ashr_11 / gobble_ashr_11 (body unrolled twice): every pass
         * joins the upper 5 bytes of the previous (%rdi) block with the lower
         * 11 bytes of the next one and compares those 16 bytes with the block
         * at (%rsi, %rcx).  Any mismatch or NUL leaves the loop through exit;
         * for strncmp, %r11 is reduced by 16 per pass and a count that runs
         * out returns through strcmp_exitz.
         *
         * nibble_ashr_11 is taken when %r10 becomes positive (see the page
         * boundary comment below).  It compares only the 5 bytes left over
         * from the last (%rdi) block, without loading another (%rdi) block,
         * and then rejoins the loop at gobble_ashr_11.
   1272  */
   1273 	.p2align 4
   1274 LABEL(ashr_11):
   1275 	pxor	%xmm0, %xmm0
   1276 	movdqa	(%rdi), %xmm2
   1277 	movdqa	(%rsi), %xmm1
   1278 	pcmpeqb	%xmm1, %xmm0		/* flag NUL bytes in the (%rsi) block */
   1279 	pslldq	$5, %xmm2		/* move the (%rdi) bytes up by 5 positions */
   1280 	pcmpeqb	%xmm1, %xmm2
   1281 	psubb	%xmm0, %xmm2		/* 0xff survives only for equal, non-NUL bytes */
   1282 	pmovmskb %xmm2, %r9d
   1283 	shr	%cl, %edx
   1284 	shr	%cl, %r9d
   1285 	sub	%r9d, %edx
   1286 	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
   1287 	movdqa	(%rdi), %xmm3
   1288 
   1289 	UPDATE_STRNCMP_COUNTER	/* strncmp only: adjust the %r11 count (see macro); may return 0 */
   1290 
   1291 	pxor	%xmm0, %xmm0
   1292 	mov	$16, %rcx	/* index for loads */
   1293 	mov	$11, %r9d	/* rdi bytes already examined. Used in exit code */
   1294 	/*
   1295 	 * Setup %r10 value allows us to detect crossing a page boundary.
   1296 	 * When %r10 goes positive we are crossing a page boundary and
   1297 	 * need to do a nibble.
   1298 	 */
   1299 	lea	11(%rdi), %r10
   1300 	and	$0xfff, %r10	/* offset into 4K page */
   1301 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1302 	movdqa	%xmm3, %xmm4		/* copy consumed by nibble_ashr_11 */
   1303 
   1304 	.p2align 4
   1305 LABEL(loop_ashr_11):
   1306 	add	$16, %r10
   1307 	jg	LABEL(nibble_ashr_11)
   1308 
   1309 LABEL(gobble_ashr_11):
   1310 	movdqa	(%rsi, %rcx), %xmm1
   1311 	movdqa	(%rdi, %rcx), %xmm2
   1312 	movdqa	%xmm2, %xmm4		/* unshifted copy for the next pass */
   1313 
   1314 	psrldq	$11, %xmm3		/* upper 5 bytes of the previous block */
   1315 	pslldq	$5, %xmm2		/* lower 11 bytes of the new block */
   1316 	por	%xmm3, %xmm2		/* 16 consecutive (%rdi)-side bytes */
   1317 
   1318 	pcmpeqb	%xmm1, %xmm0
   1319 	pcmpeqb	%xmm2, %xmm1
   1320 	psubb	%xmm0, %xmm1
   1321 	pmovmskb %xmm1, %edx
   1322 	sub	$0xffff, %edx		/* zero only if all 16 matched, no NUL */
   1323 	jnz	LABEL(exit)
   1324 
   1325 #ifdef USE_AS_STRNCMP
   1326 	sub	$16, %r11		/* return 0 if the count was <= 16 */
   1327 	jbe	LABEL(strcmp_exitz)
   1328 #endif
   1329 
   1330 	add	$16, %rcx
   1331 	movdqa	%xmm4, %xmm3
   1332 
   1333 	add	$16, %r10
   1334 	jg	LABEL(nibble_ashr_11)	/* cross page boundary */
   1335 
   1336 	movdqa	(%rsi, %rcx), %xmm1
   1337 	movdqa	(%rdi, %rcx), %xmm2
   1338 	movdqa	%xmm2, %xmm4
   1339 
   1340 	psrldq	$11, %xmm3
   1341 	pslldq 	$5, %xmm2
   1342 	por	%xmm3, %xmm2
   1343 
   1344 	pcmpeqb	%xmm1, %xmm0
   1345 	pcmpeqb	%xmm2, %xmm1
   1346 	psubb	%xmm0, %xmm1
   1347 	pmovmskb %xmm1, %edx
   1348 	sub	$0xffff, %edx
   1349 	jnz	LABEL(exit)
   1350 
   1351 #ifdef USE_AS_STRNCMP
   1352 	sub	$16, %r11
   1353 	jbe	LABEL(strcmp_exitz)
   1354 #endif
   1355 
   1356 	add	$16, %rcx
   1357 	movdqa	%xmm4, %xmm3
   1358 	jmp	LABEL(loop_ashr_11)
   1359 
   1360 	.p2align 4
   1361 LABEL(nibble_ashr_11):
   1362 	psrldq	$11, %xmm4		/* the 5 bytes not compared yet */
   1363 	movdqa	(%rsi, %rcx), %xmm1
   1364 	pcmpeqb	%xmm1, %xmm0
   1365 	pcmpeqb	%xmm4, %xmm1
   1366 	psubb	%xmm0, %xmm1
   1367 	pmovmskb %xmm1, %edx
   1368 	sub	$0x001f, %edx		/* zero only if those 5 bytes matched, no NUL */
   1369 	jnz	LABEL(exit)
   1370 #ifdef USE_AS_STRNCMP
   1371 	cmp	$5, %r11		/* return 0 if the count is <= 5 */
   1372 	jbe	LABEL(strcmp_exitz)
   1373 #endif
   1374  	pxor	%xmm0, %xmm0		/* clear any NUL flags before rejoining the loop */
   1375 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
   1376 	jmp	LABEL(gobble_ashr_11)
   1377 
   1378 /*
   1379  * ashr_12 handles the following cases:
   1380  * 	abs(str1 offset - str2 offset) = 4
         *
         * Entry: the 16-byte block at (%rdi), shifted left by 4 bytes, is
         * compared with the 16-byte block at (%rsi) (both loaded with
         * movdqa).  The result mask and the incoming %edx are both shifted
         * right by %cl; if they differ, control goes to less32bytes.
         * NOTE(review): %edx, %cl and %rax are set up before this label,
         * outside this section -- confirm against the dispatch code.
         *
         * loop_ashr_12 / gobble_ashr_12 (body unrolled twice): every pass
         * joins the upper 4 bytes of the previous (%rdi) block with the lower
         * 12 bytes of the next one and compares those 16 bytes with the block
         * at (%rsi, %rcx).  Any mismatch or NUL leaves the loop through exit;
         * for strncmp, %r11 is reduced by 16 per pass and a count that runs
         * out returns through strcmp_exitz.
         *
         * nibble_ashr_12 is taken when %r10 becomes positive (see the page
         * boundary comment below).  It compares only the 4 bytes left over
         * from the last (%rdi) block, without loading another (%rdi) block,
         * and then rejoins the loop at gobble_ashr_12.
   1381  */
   1382 	.p2align 4
   1383 LABEL(ashr_12):
   1384 	pxor	%xmm0, %xmm0
   1385 	movdqa	(%rdi), %xmm2
   1386 	movdqa	(%rsi), %xmm1
   1387 	pcmpeqb	%xmm1, %xmm0		/* flag NUL bytes in the (%rsi) block */
   1388 	pslldq	$4, %xmm2		/* move the (%rdi) bytes up by 4 positions */
   1389 	pcmpeqb	%xmm1, %xmm2
   1390 	psubb	%xmm0, %xmm2		/* 0xff survives only for equal, non-NUL bytes */
   1391 	pmovmskb %xmm2, %r9d
   1392 	shr	%cl, %edx
   1393 	shr	%cl, %r9d
   1394 	sub	%r9d, %edx
   1395 	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
   1396 	movdqa	(%rdi), %xmm3
   1397 
   1398 	UPDATE_STRNCMP_COUNTER	/* strncmp only: adjust the %r11 count (see macro); may return 0 */
   1399 
   1400 	pxor	%xmm0, %xmm0
   1401 	mov	$16, %rcx	/* index for loads */
   1402 	mov	$12, %r9d	/* rdi bytes already examined. Used in exit code */
   1403 	/*
   1404 	 * Setup %r10 value allows us to detect crossing a page boundary.
   1405 	 * When %r10 goes positive we are crossing a page boundary and
   1406 	 * need to do a nibble.
   1407 	 */
   1408 	lea	12(%rdi), %r10
   1409 	and	$0xfff, %r10	/* offset into 4K page */
   1410 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1411 	movdqa	%xmm3, %xmm4		/* copy consumed by nibble_ashr_12 */
   1412 
   1413 	.p2align 4
   1414 LABEL(loop_ashr_12):
   1415 	add	$16, %r10
   1416 	jg	LABEL(nibble_ashr_12)
   1417 
   1418 LABEL(gobble_ashr_12):
   1419 	movdqa	(%rsi, %rcx), %xmm1
   1420 	movdqa	(%rdi, %rcx), %xmm2
   1421 	movdqa	%xmm2, %xmm4		/* unshifted copy for the next pass */
   1422 
   1423 	psrldq	$12, %xmm3		/* upper 4 bytes of the previous block */
   1424 	pslldq	$4, %xmm2		/* lower 12 bytes of the new block */
   1425 	por	%xmm3, %xmm2		/* 16 consecutive (%rdi)-side bytes */
   1426 
   1427 	pcmpeqb	%xmm1, %xmm0
   1428 	pcmpeqb	%xmm2, %xmm1
   1429 	psubb	%xmm0, %xmm1
   1430 	pmovmskb %xmm1, %edx
   1431 	sub	$0xffff, %edx		/* zero only if all 16 matched, no NUL */
   1432 	jnz	LABEL(exit)
   1433 
   1434 #ifdef USE_AS_STRNCMP
   1435 	sub	$16, %r11		/* return 0 if the count was <= 16 */
   1436 	jbe	LABEL(strcmp_exitz)
   1437 #endif
   1438 
   1439 	add	$16, %rcx
   1440 	movdqa	%xmm4, %xmm3
   1441 
   1442 	add	$16, %r10
   1443 	jg	LABEL(nibble_ashr_12)	/* cross page boundary */
   1444 
   1445 	movdqa	(%rsi, %rcx), %xmm1
   1446 	movdqa	(%rdi, %rcx), %xmm2
   1447 	movdqa	%xmm2, %xmm4
   1448 
   1449 	psrldq	$12, %xmm3
   1450 	pslldq 	$4, %xmm2
   1451 	por	%xmm3, %xmm2
   1452 
   1453 	pcmpeqb	%xmm1, %xmm0
   1454 	pcmpeqb	%xmm2, %xmm1
   1455 	psubb	%xmm0, %xmm1
   1456 	pmovmskb %xmm1, %edx
   1457 	sub	$0xffff, %edx
   1458 	jnz	LABEL(exit)
   1459 
   1460 #ifdef USE_AS_STRNCMP
   1461 	sub	$16, %r11
   1462 	jbe	LABEL(strcmp_exitz)
   1463 #endif
   1464 
   1465 	add	$16, %rcx
   1466 	movdqa	%xmm4, %xmm3
   1467 	jmp	LABEL(loop_ashr_12)
   1468 
   1469 	.p2align 4
   1470 LABEL(nibble_ashr_12):
   1471 	psrldq	$12, %xmm4		/* the 4 bytes not compared yet */
   1472 	movdqa	(%rsi, %rcx), %xmm1
   1473 	pcmpeqb	%xmm1, %xmm0
   1474 	pcmpeqb	%xmm4, %xmm1
   1475 	psubb	%xmm0, %xmm1
   1476 	pmovmskb %xmm1, %edx
   1477 	sub	$0x000f, %edx		/* zero only if those 4 bytes matched, no NUL */
   1478 	jnz	LABEL(exit)
   1479 #ifdef USE_AS_STRNCMP
   1480 	cmp	$4, %r11		/* return 0 if the count is <= 4 */
   1481 	jbe	LABEL(strcmp_exitz)
   1482 #endif
   1483  	pxor	%xmm0, %xmm0		/* clear any NUL flags before rejoining the loop */
   1484 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
   1485 	jmp	LABEL(gobble_ashr_12)
   1486 
   1487 /*
   1488  * ashr_13 handles the following cases:
   1489  * 	abs(str1 offset - str2 offset) = 3
         *
         * Entry: the 16-byte block at (%rdi), shifted left by 3 bytes, is
         * compared with the 16-byte block at (%rsi) (both loaded with
         * movdqa).  The result mask and the incoming %edx are both shifted
         * right by %cl; if they differ, control goes to less32bytes.
         * NOTE(review): %edx, %cl and %rax are set up before this label,
         * outside this section -- confirm against the dispatch code.
         *
         * loop_ashr_13 / gobble_ashr_13 (body unrolled twice): every pass
         * joins the upper 3 bytes of the previous (%rdi) block with the lower
         * 13 bytes of the next one and compares those 16 bytes with the block
         * at (%rsi, %rcx).  Any mismatch or NUL leaves the loop through exit;
         * for strncmp, %r11 is reduced by 16 per pass and a count that runs
         * out returns through strcmp_exitz.
         *
         * nibble_ashr_13 is taken when %r10 becomes positive (see the page
         * boundary comment below).  It compares only the 3 bytes left over
         * from the last (%rdi) block, without loading another (%rdi) block,
         * and then rejoins the loop at gobble_ashr_13.
   1490  */
   1491 	.p2align 4
   1492 LABEL(ashr_13):
   1493 	pxor	%xmm0, %xmm0
   1494 	movdqa	(%rdi), %xmm2
   1495 	movdqa	(%rsi), %xmm1
   1496 	pcmpeqb	%xmm1, %xmm0		/* flag NUL bytes in the (%rsi) block */
   1497 	pslldq	$3, %xmm2		/* move the (%rdi) bytes up by 3 positions */
   1498 	pcmpeqb	%xmm1, %xmm2
   1499 	psubb	%xmm0, %xmm2		/* 0xff survives only for equal, non-NUL bytes */
   1500 	pmovmskb %xmm2, %r9d
   1501 	shr	%cl, %edx
   1502 	shr	%cl, %r9d
   1503 	sub	%r9d, %edx
   1504 	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
   1505 	movdqa	(%rdi), %xmm3
   1506 
   1507 	UPDATE_STRNCMP_COUNTER	/* strncmp only: adjust the %r11 count (see macro); may return 0 */
   1508 
   1509 	pxor	%xmm0, %xmm0
   1510 	mov	$16, %rcx	/* index for loads */
   1511 	mov	$13, %r9d	/* rdi bytes already examined. Used in exit code */
   1512 	/*
   1513 	 * Setup %r10 value allows us to detect crossing a page boundary.
   1514 	 * When %r10 goes positive we are crossing a page boundary and
   1515 	 * need to do a nibble.
   1516 	 */
   1517 	lea	13(%rdi), %r10
   1518 	and	$0xfff, %r10	/* offset into 4K page */
   1519 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1520 	movdqa	%xmm3, %xmm4		/* copy consumed by nibble_ashr_13 */
   1521 
   1522 	.p2align 4
   1523 LABEL(loop_ashr_13):
   1524 	add	$16, %r10
   1525 	jg	LABEL(nibble_ashr_13)
   1526 
   1527 LABEL(gobble_ashr_13):
   1528 	movdqa	(%rsi, %rcx), %xmm1
   1529 	movdqa	(%rdi, %rcx), %xmm2
   1530 	movdqa	%xmm2, %xmm4		/* unshifted copy for the next pass */
   1531 
   1532 	psrldq	$13, %xmm3		/* upper 3 bytes of the previous block */
   1533 	pslldq	$3, %xmm2		/* lower 13 bytes of the new block */
   1534 	por	%xmm3, %xmm2		/* 16 consecutive (%rdi)-side bytes */
   1535 
   1536 	pcmpeqb	%xmm1, %xmm0
   1537 	pcmpeqb	%xmm2, %xmm1
   1538 	psubb	%xmm0, %xmm1
   1539 	pmovmskb %xmm1, %edx
   1540 	sub	$0xffff, %edx		/* zero only if all 16 matched, no NUL */
   1541 	jnz	LABEL(exit)
   1542 
   1543 #ifdef USE_AS_STRNCMP
   1544 	sub	$16, %r11		/* return 0 if the count was <= 16 */
   1545 	jbe	LABEL(strcmp_exitz)
   1546 #endif
   1547 
   1548 	add	$16, %rcx
   1549 	movdqa	%xmm4, %xmm3
   1550 
   1551 	add	$16, %r10
   1552 	jg	LABEL(nibble_ashr_13)	/* cross page boundary */
   1553 
   1554 	movdqa	(%rsi, %rcx), %xmm1
   1555 	movdqa	(%rdi, %rcx), %xmm2
   1556 	movdqa	%xmm2, %xmm4
   1557 
   1558 	psrldq	$13, %xmm3
   1559 	pslldq 	$3, %xmm2
   1560 	por	%xmm3, %xmm2
   1561 
   1562 	pcmpeqb	%xmm1, %xmm0
   1563 	pcmpeqb	%xmm2, %xmm1
   1564 	psubb	%xmm0, %xmm1
   1565 	pmovmskb %xmm1, %edx
   1566 	sub	$0xffff, %edx
   1567 	jnz	LABEL(exit)
   1568 
   1569 #ifdef USE_AS_STRNCMP
   1570 	sub	$16, %r11
   1571 	jbe	LABEL(strcmp_exitz)
   1572 #endif
   1573 
   1574 	add	$16, %rcx
   1575 	movdqa	%xmm4, %xmm3
   1576 	jmp	LABEL(loop_ashr_13)
   1577 
   1578 	.p2align 4
   1579 LABEL(nibble_ashr_13):
   1580 	psrldq	$13, %xmm4		/* the 3 bytes not compared yet */
   1581 	movdqa	(%rsi, %rcx), %xmm1
   1582 	pcmpeqb	%xmm1, %xmm0
   1583 	pcmpeqb	%xmm4, %xmm1
   1584 	psubb	%xmm0, %xmm1
   1585 	pmovmskb %xmm1, %edx
   1586 	sub	$0x0007, %edx		/* zero only if those 3 bytes matched, no NUL */
   1587 	jnz	LABEL(exit)
   1588 #ifdef USE_AS_STRNCMP
   1589 	cmp	$3, %r11		/* return 0 if the count is <= 3 */
   1590 	jbe	LABEL(strcmp_exitz)
   1591 #endif
   1592  	pxor	%xmm0, %xmm0		/* clear any NUL flags before rejoining the loop */
   1593 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
   1594 	jmp	LABEL(gobble_ashr_13)
   1595 
   1596 /*
   1597  * ashr_14 handles the following cases:
   1598  * 	abs(str1 offset - str2 offset) = 2
         *
         * Entry: the 16-byte block at (%rdi), shifted left by 2 bytes, is
         * compared with the 16-byte block at (%rsi) (both loaded with
         * movdqa).  The result mask and the incoming %edx are both shifted
         * right by %cl; if they differ, control goes to less32bytes.
         * NOTE(review): %edx, %cl and %rax are set up before this label,
         * outside this section -- confirm against the dispatch code.
         *
         * loop_ashr_14 / gobble_ashr_14 (body unrolled twice): every pass
         * joins the upper 2 bytes of the previous (%rdi) block with the lower
         * 14 bytes of the next one and compares those 16 bytes with the block
         * at (%rsi, %rcx).  Any mismatch or NUL leaves the loop through exit;
         * for strncmp, %r11 is reduced by 16 per pass and a count that runs
         * out returns through strcmp_exitz.
         *
         * nibble_ashr_14 is taken when %r10 becomes positive (see the page
         * boundary comment below).  It compares only the 2 bytes left over
         * from the last (%rdi) block, without loading another (%rdi) block,
         * and then rejoins the loop at gobble_ashr_14.
   1599  */
   1600 	.p2align 4
   1601 LABEL(ashr_14):
   1602 	pxor	%xmm0, %xmm0
   1603 	movdqa	(%rdi), %xmm2
   1604 	movdqa	(%rsi), %xmm1
   1605 	pcmpeqb	%xmm1, %xmm0		/* flag NUL bytes in the (%rsi) block */
   1606 	pslldq  $2, %xmm2		/* move the (%rdi) bytes up by 2 positions */
   1607 	pcmpeqb	%xmm1, %xmm2
   1608 	psubb	%xmm0, %xmm2		/* 0xff survives only for equal, non-NUL bytes */
   1609 	pmovmskb %xmm2, %r9d
   1610 	shr	%cl, %edx
   1611 	shr	%cl, %r9d
   1612 	sub	%r9d, %edx
   1613 	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
   1614 	movdqa	(%rdi), %xmm3
   1615 
   1616 	UPDATE_STRNCMP_COUNTER	/* strncmp only: adjust the %r11 count (see macro); may return 0 */
   1617 
   1618 	pxor	%xmm0, %xmm0
   1619 	mov	$16, %rcx	/* index for loads */
   1620 	mov	$14, %r9d	/* rdi bytes already examined. Used in exit code */
   1621 	/*
   1622 	 * Setup %r10 value allows us to detect crossing a page boundary.
   1623 	 * When %r10 goes positive we are crossing a page boundary and
   1624 	 * need to do a nibble.
   1625 	 */
   1626 	lea	14(%rdi), %r10
   1627 	and	$0xfff, %r10	/* offset into 4K page */
   1628 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1629 	movdqa	%xmm3, %xmm4		/* copy consumed by nibble_ashr_14 */
   1630 
   1631 	.p2align 4
   1632 LABEL(loop_ashr_14):
   1633 	add	$16, %r10
   1634 	jg	LABEL(nibble_ashr_14)
   1635 
   1636 LABEL(gobble_ashr_14):
   1637 	movdqa	(%rsi, %rcx), %xmm1
   1638 	movdqa	(%rdi, %rcx), %xmm2
   1639 	movdqa	%xmm2, %xmm4		/* unshifted copy for the next pass */
   1640 
   1641 	psrldq	$14, %xmm3		/* upper 2 bytes of the previous block */
   1642 	pslldq	$2, %xmm2		/* lower 14 bytes of the new block */
   1643 	por	%xmm3, %xmm2		/* 16 consecutive (%rdi)-side bytes */
   1644 
   1645 	pcmpeqb	%xmm1, %xmm0
   1646 	pcmpeqb	%xmm2, %xmm1
   1647 	psubb	%xmm0, %xmm1
   1648 	pmovmskb %xmm1, %edx
   1649 	sub	$0xffff, %edx		/* zero only if all 16 matched, no NUL */
   1650 	jnz	LABEL(exit)
   1651 
   1652 #ifdef USE_AS_STRNCMP
   1653 	sub	$16, %r11		/* return 0 if the count was <= 16 */
   1654 	jbe	LABEL(strcmp_exitz)
   1655 #endif
   1656 
   1657 	add	$16, %rcx
   1658 	movdqa	%xmm4, %xmm3
   1659 
   1660 	add	$16, %r10
   1661 	jg	LABEL(nibble_ashr_14)	/* cross page boundary */
   1662 
   1663 	movdqa	(%rsi, %rcx), %xmm1
   1664 	movdqa	(%rdi, %rcx), %xmm2
   1665 	movdqa	%xmm2, %xmm4
   1666 
   1667 	psrldq	$14, %xmm3
   1668 	pslldq 	$2, %xmm2
   1669 	por	%xmm3, %xmm2
   1670 
   1671 	pcmpeqb	%xmm1, %xmm0
   1672 	pcmpeqb	%xmm2, %xmm1
   1673 	psubb	%xmm0, %xmm1
   1674 	pmovmskb %xmm1, %edx
   1675 	sub	$0xffff, %edx
   1676 	jnz	LABEL(exit)
   1677 
   1678 #ifdef USE_AS_STRNCMP
   1679 	sub	$16, %r11
   1680 	jbe	LABEL(strcmp_exitz)
   1681 #endif
   1682 
   1683 	add	$16, %rcx
   1684 	movdqa	%xmm4, %xmm3
   1685 	jmp	LABEL(loop_ashr_14)
   1686 
   1687 	.p2align 4
   1688 LABEL(nibble_ashr_14):
   1689 	psrldq	$14, %xmm4		/* the 2 bytes not compared yet */
   1690 	movdqa	(%rsi, %rcx), %xmm1
   1691 	pcmpeqb	%xmm1, %xmm0
   1692 	pcmpeqb	%xmm4, %xmm1
   1693 	psubb	%xmm0, %xmm1
   1694 	pmovmskb %xmm1, %edx
   1695 	sub	$0x0003, %edx		/* zero only if those 2 bytes matched, no NUL */
   1696 	jnz	LABEL(exit)
   1697 #ifdef USE_AS_STRNCMP
   1698 	cmp	$2, %r11		/* return 0 if the count is <= 2 */
   1699 	jbe	LABEL(strcmp_exitz)
   1700 #endif
   1701  	pxor	%xmm0, %xmm0		/* clear any NUL flags before rejoining the loop */
   1702 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
   1703 	jmp	LABEL(gobble_ashr_14)
   1704 
   1705 /*
   1706  * ashr_15 handles the following cases:
   1707  * 	abs(str1 offset - str2 offset) = 1
         *
         * Entry: the 16-byte block at (%rdi), shifted left by 1 byte, is
         * compared with the 16-byte block at (%rsi) (both loaded with
         * movdqa).  The result mask and the incoming %edx are both shifted
         * right by %cl; if they differ, control goes to less32bytes.
         * NOTE(review): %edx, %cl and %rax are set up before this label,
         * outside this section -- confirm against the dispatch code.
         *
         * loop_ashr_15 / gobble_ashr_15 (body unrolled twice): every pass
         * joins the top byte of the previous (%rdi) block with the lower 15
         * bytes of the next one and compares those 16 bytes with the block
         * at (%rsi, %rcx).  Any mismatch or NUL leaves the loop through exit;
         * for strncmp, %r11 is reduced by 16 per pass and a count that runs
         * out returns through strcmp_exitz.
         *
         * nibble_ashr_15 is taken when %r10 becomes positive (see the page
         * boundary comment below).  It compares only the single byte left
         * over from the last (%rdi) block, without loading another (%rdi)
         * block, and then rejoins the loop at gobble_ashr_15.
   1708  */
   1709 	.p2align 4
   1710 LABEL(ashr_15):
   1711 	pxor	%xmm0, %xmm0
   1712 	movdqa	(%rdi), %xmm2
   1713 	movdqa	(%rsi), %xmm1
   1714 	pcmpeqb	%xmm1, %xmm0		/* flag NUL bytes in the (%rsi) block */
   1715 	pslldq	$1, %xmm2		/* move the (%rdi) bytes up by 1 position */
   1716 	pcmpeqb	%xmm1, %xmm2
   1717 	psubb	%xmm0, %xmm2		/* 0xff survives only for equal, non-NUL bytes */
   1718 	pmovmskb %xmm2, %r9d
   1719 	shr	%cl, %edx
   1720 	shr	%cl, %r9d
   1721 	sub	%r9d, %edx
   1722 	jnz	LABEL(less32bytes)	/* masks differ: finish in less32bytes */
   1723 
   1724 	movdqa	(%rdi), %xmm3
   1725 
   1726 	UPDATE_STRNCMP_COUNTER	/* strncmp only: adjust the %r11 count (see macro); may return 0 */
   1727 
   1728 	pxor	%xmm0, %xmm0
   1729 	mov	$16, %rcx	/* index for loads */
   1730 	mov	$15, %r9d	/* rdi bytes already examined. Used in exit code */
   1731 	/*
   1732 	 * Setup %r10 value allows us to detect crossing a page boundary.
   1733 	 * When %r10 goes positive we are crossing a page boundary and
   1734 	 * need to do a nibble.
   1735 	 */
   1736 	lea	15(%rdi), %r10
   1737 	and	$0xfff, %r10	/* offset into 4K page */
   1738 	sub	$0x1000, %r10	/* subtract 4K pagesize */
   1739 	movdqa	%xmm3, %xmm4		/* copy consumed by nibble_ashr_15 */
   1740 
   1741 	.p2align 4
   1742 LABEL(loop_ashr_15):
   1743 	add	$16, %r10
   1744 	jg	LABEL(nibble_ashr_15)
   1745 
   1746 LABEL(gobble_ashr_15):
   1747 	movdqa	(%rsi, %rcx), %xmm1
   1748 	movdqa	(%rdi, %rcx), %xmm2
   1749 	movdqa	%xmm2, %xmm4		/* unshifted copy for the next pass */
   1750 
   1751 	psrldq	$15, %xmm3		/* top byte of the previous block */
   1752 	pslldq	$1, %xmm2		/* lower 15 bytes of the new block */
   1753 	por	%xmm3, %xmm2		/* 16 consecutive (%rdi)-side bytes */
   1754 
   1755 	pcmpeqb	%xmm1, %xmm0
   1756 	pcmpeqb	%xmm2, %xmm1
   1757 	psubb	%xmm0, %xmm1
   1758 	pmovmskb %xmm1, %edx
   1759 	sub	$0xffff, %edx		/* zero only if all 16 matched, no NUL */
   1760 	jnz	LABEL(exit)
   1761 
   1762 #ifdef USE_AS_STRNCMP
   1763 	sub	$16, %r11		/* return 0 if the count was <= 16 */
   1764 	jbe	LABEL(strcmp_exitz)
   1765 #endif
   1766 
   1767 	add	$16, %rcx
   1768 	movdqa	%xmm4, %xmm3
   1769 
   1770 	add	$16, %r10
   1771 	jg	LABEL(nibble_ashr_15)	/* cross page boundary */
   1772 
   1773 	movdqa	(%rsi, %rcx), %xmm1
   1774 	movdqa	(%rdi, %rcx), %xmm2
   1775 	movdqa	%xmm2, %xmm4
   1776 
   1777 	psrldq	$15, %xmm3
   1778 	pslldq 	$1, %xmm2
   1779 	por	%xmm3, %xmm2
   1780 
   1781 	pcmpeqb	%xmm1, %xmm0
   1782 	pcmpeqb	%xmm2, %xmm1
   1783 	psubb	%xmm0, %xmm1
   1784 	pmovmskb %xmm1, %edx
   1785 	sub	$0xffff, %edx
   1786 	jnz	LABEL(exit)
   1787 
   1788 #ifdef USE_AS_STRNCMP
   1789 	sub	$16, %r11
   1790 	jbe	LABEL(strcmp_exitz)
   1791 #endif
   1792 
   1793 	add	$16, %rcx
   1794 	movdqa	%xmm4, %xmm3
   1795 	jmp	LABEL(loop_ashr_15)
   1796 
   1797 	.p2align 4
   1798 LABEL(nibble_ashr_15):
   1799 	psrldq	$15, %xmm4		/* the single byte not compared yet */
   1800 	movdqa	(%rsi, %rcx), %xmm1
   1801 	pcmpeqb	%xmm1, %xmm0
   1802 	pcmpeqb	%xmm4, %xmm1
   1803 	psubb	%xmm0, %xmm1
   1804 	pmovmskb %xmm1, %edx
   1805 	sub	$0x0001, %edx		/* zero only if that byte matched and is not NUL */
   1806 	jnz	LABEL(exit)
   1807 #ifdef USE_AS_STRNCMP
   1808 	cmp	$1, %r11		/* return 0 if the count is <= 1 */
   1809 	jbe	LABEL(strcmp_exitz)
   1810 #endif
   1811  	pxor	%xmm0, %xmm0		/* clear any NUL flags before rejoining the loop */
   1812 	sub	$0x1000, %r10		/* subtract 4K from %r10 */
   1813 	jmp	LABEL(gobble_ashr_15)
   1814 
   1815 	.p2align 4
        	/*
        	 * exit: reached from the ashr_N loops with a non-zero %edx after a
        	 * block compare.  Converts the loop state into byte pointers:
        	 * %rax = %r9 + %rcx - 16, then less32bytes adds %rax to %rdi and
        	 * %rcx to %rsi.  less32bytes is also entered directly from the
        	 * ashr_N entry checks; NOTE(review): %rax is presumably already set
        	 * on that path -- confirm against the code before the ashr labels.
        	 * If %r8d is non-zero the two pointers are exchanged before control
        	 * falls through to ret.
        	 */
   1816 LABEL(exit):
   1817 	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
   1818 LABEL(less32bytes):
   1819 	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
   1820 	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
   1821 	test	%r8d, %r8d
   1822 	jz	LABEL(ret)
   1823 	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
   1824 
   1825 	.p2align 4
   1826 LABEL(ret):
   1827 LABEL(less16bytes):
   1828 	/*
   1829 	 * Check to see if BSF is fast on this processor. If not, use a different
   1830 	 * exit tail.
        	 *
        	 * BSF path: bsf replaces the mask in %rdx with the index of its
        	 * lowest set bit.  The bytes at that index from (%rsi) and (%rdi)
        	 * are loaded into the cleared %ecx and %eax, and the function
        	 * returns %eax - %ecx.  For strncmp the index is first subtracted
        	 * from %r11, and strcmp_exitz is taken if %r11 <= index.
   1831 	 */
   1832 	testl	$USE_BSF,.memops_method(%rip)
   1833 	jz	LABEL(AMD_exit)		/* USE_BSF bit clear: use the bit-test tail */
   1834 	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
   1835 
   1836 #ifdef USE_AS_STRNCMP
   1837 	sub	%rdx, %r11
   1838 	jbe	LABEL(strcmp_exitz)
   1839 #endif
   1840 	xor	%ecx, %ecx		/* clear %ecx */
   1841 	xor	%eax, %eax		/* clear %eax */
   1842 
   1843 	movb	(%rsi, %rdx), %cl
   1844 	movb	(%rdi, %rdx), %al
   1845 
   1846 	sub	%ecx, %eax		/* result = (%rdi) byte - (%rsi) byte */
   1847 	ret
   1848 
   1849 #ifdef USE_AS_STRNCMP
        	/*
        	 * strncmp only: common "return 0" tail, branched to by the %r11
        	 * count checks (jb/je/jbe) throughout this file.
        	 */
   1850 LABEL(strcmp_exitz):
   1851 	xor	%eax, %eax		/* return value 0 */
   1852 	ret
   1853 #endif
   1854 
   1855 	/*
   1856 	 * This exit tail does not use the bsf instruction.
        	 *
        	 * The low byte of the mask (%dl) is scanned from bit 0 upward and
        	 * the first set bit selects the matching ByteN tail.  If %dl is
        	 * zero the scan continues in next_8_bytes.  If %dl is non-zero but
        	 * bits 0-6 are clear (so bit 7 is set), the code falls through and
        	 * compares offset 7 here, returning the (%rdi) value minus the
        	 * (%rsi) value.
   1857 	 */
   1858 	.p2align 4
   1859 LABEL(AMD_exit):
   1860 	test	%dl, %dl
   1861 	jz	LABEL(next_8_bytes)
   1862 
   1863 	test	$0x01, %dl
   1864 	jnz	LABEL(Byte0)
   1865 
   1866 	test	$0x02, %dl
   1867 	jnz	LABEL(Byte1)
   1868 
   1869 	test	$0x04, %dl
   1870 	jnz	LABEL(Byte2)
   1871 
   1872 	test	$0x08, %dl
   1873 	jnz	LABEL(Byte3)
   1874 
   1875 	test	$0x10, %dl
   1876 	jnz	LABEL(Byte4)
   1877 
   1878 	test	$0x20, %dl
   1879 	jnz	LABEL(Byte5)
   1880 
   1881 	test	$0x40, %dl
   1882 	jnz	LABEL(Byte6)
   1883 
   1884 #ifdef USE_AS_STRNCMP
   1885 	sub	$7, %r11		/* return 0 if the count is <= 7 */
   1886 	jbe	LABEL(strcmp_exitz)
   1887 #endif
   1888 	movzx	7(%rsi), %ecx		/* NOTE(review): no size suffix; byte load assumed */
   1889 	movzx	7(%rdi), %eax
   1890 
   1891 	sub	%ecx, %eax		/* result = (%rdi) value - (%rsi) value */
   1892 	ret
   1893 
   1894 	.p2align 4
   1895 LABEL(Byte0):
   1896 	/*
        	 * Offset 0 decides the result: return the (%rdi) value minus the
        	 * (%rsi) value.  The count check below is left commented out:
        	 *
   1897 	 * never need to handle byte 0 for strncmp
   1898 #ifdef USE_AS_STRNCMP
   1899 	sub	$0, %r11
   1900 	jbe	LABEL(strcmp_exitz)
   1901 #endif
   1902 	*/
   1903 	movzx	(%rsi), %ecx		/* NOTE(review): no size suffix; byte load assumed */
   1904 	movzx	(%rdi), %eax
   1905 
   1906 	sub	%ecx, %eax		/* result = (%rdi) value - (%rsi) value */
   1907 	ret
   1908 
   1909 	.p2align 4
   1910 LABEL(Byte1):			/* offset 1 decides the result */
   1911 
   1912 #ifdef USE_AS_STRNCMP
   1913 	sub	$1, %r11		/* return 0 if the count is <= 1 */
   1914 	jbe	LABEL(strcmp_exitz)
   1915 #endif
   1916 	movzx	1(%rsi), %ecx		/* NOTE(review): no size suffix; byte load assumed */
   1917 	movzx	1(%rdi), %eax
   1918 
   1919 	sub	%ecx, %eax		/* result = (%rdi) value - (%rsi) value */
   1920 	ret
   1921 
   1922 	.p2align 4
   1923 LABEL(Byte2):			/* offset 2 decides the result */
   1924 
   1925 #ifdef USE_AS_STRNCMP
   1926 	sub	$2, %r11		/* return 0 if the count is <= 2 */
   1927 	jbe	LABEL(strcmp_exitz)
   1928 #endif
   1929 	movzx	2(%rsi), %ecx		/* NOTE(review): no size suffix; byte load assumed */
   1930 	movzx	2(%rdi), %eax
   1931 
   1932 	sub	%ecx, %eax		/* result = (%rdi) value - (%rsi) value */
   1933 	ret
   1934 
   1935 	.p2align 4
   1936 LABEL(Byte3):			/* offset 3 decides the result */
   1937 
   1938 #ifdef USE_AS_STRNCMP
   1939 	sub	$3, %r11		/* return 0 if the count is <= 3 */
   1940 	jbe	LABEL(strcmp_exitz)
   1941 #endif
   1942 	movzx	3(%rsi), %ecx		/* NOTE(review): no size suffix; byte load assumed */
   1943 	movzx	3(%rdi), %eax
   1944 
   1945 	sub	%ecx, %eax		/* result = (%rdi) value - (%rsi) value */
   1946 	ret
   1947 
   1948 	.p2align 4
   1949 LABEL(Byte4):			/* offset 4 decides the result */
   1950 
   1951 #ifdef USE_AS_STRNCMP
   1952 	sub	$4, %r11		/* return 0 if the count is <= 4 */
   1953 	jbe	LABEL(strcmp_exitz)
   1954 #endif
   1955 	movzx	4(%rsi), %ecx		/* NOTE(review): no size suffix; byte load assumed */
   1956 	movzx	4(%rdi), %eax
   1957 
   1958 	sub	%ecx, %eax		/* result = (%rdi) value - (%rsi) value */
   1959 	ret
   1960 
   1961 	.p2align 4
   1962 LABEL(Byte5):			/* offset 5 decides the result */
   1963 
   1964 #ifdef USE_AS_STRNCMP
   1965 	sub	$5, %r11		/* return 0 if the count is <= 5 */
   1966 	jbe	LABEL(strcmp_exitz)
   1967 #endif
   1968 	movzx	5(%rsi), %ecx		/* NOTE(review): no size suffix; byte load assumed */
   1969 	movzx	5(%rdi), %eax
   1970 
   1971 	sub	%ecx, %eax		/* result = (%rdi) value - (%rsi) value */
   1972 	ret
   1973 
   1974 	.p2align 4
   1975 LABEL(Byte6):			/* offset 6 decides the result */
   1976 
   1977 #ifdef USE_AS_STRNCMP
   1978 	sub	$6, %r11		/* return 0 if the count is <= 6 */
   1979 	jbe	LABEL(strcmp_exitz)
   1980 #endif
   1981 	movzx	6(%rsi), %ecx		/* NOTE(review): no size suffix; byte load assumed */
   1982 	movzx	6(%rdi), %eax
   1983 
   1984 	sub	%ecx, %eax		/* result = (%rdi) value - (%rsi) value */
   1985 	ret
   1986 
   1987 	.p2align 4
        	/*
        	 * next_8_bytes: continuation of AMD_exit when the low mask byte
        	 * (%dl) was zero.  Both pointers are advanced by 8 (and, for
        	 * strncmp, 8 is taken off %r11, returning 0 if the count was <= 8);
        	 * the second mask byte (%dh) is then scanned from bit 0 upward and
        	 * the first set bit selects the matching ByteN tail.  If none of
        	 * bits 0-6 is set, the code falls through and compares offset 7.
        	 */
   1988 LABEL(next_8_bytes):
   1989 	add	$8, %rdi
   1990 	add	$8, %rsi
   1991 #ifdef USE_AS_STRNCMP
   1992 	sub	$8, %r11
   1993 	jbe	LABEL(strcmp_exitz)
   1994 #endif
   1995 	test	$0x01, %dh
   1996 	jnz	LABEL(Byte0)
   1997 
   1998 	test	$0x02, %dh
   1999 	jnz	LABEL(Byte1)
   2000 
   2001 	test	$0x04, %dh
   2002 	jnz	LABEL(Byte2)
   2003 
   2004 	test	$0x08, %dh
   2005 	jnz	LABEL(Byte3)
   2006 
   2007 	test	$0x10, %dh
   2008 	jnz	LABEL(Byte4)
   2009 
   2010 	test	$0x20, %dh
   2011 	jnz	LABEL(Byte5)
   2012 
   2013 	test	$0x40, %dh
   2014 	jnz	LABEL(Byte6)
   2015 
   2016 #ifdef USE_AS_STRNCMP
   2017 	sub	$7, %r11		/* return 0 if the remaining count is <= 7 */
   2018 	jbe	LABEL(strcmp_exitz)
   2019 #endif
   2020 	movzx	7(%rsi), %ecx		/* NOTE(review): no size suffix; byte load assumed */
   2021 	movzx	7(%rdi), %eax
   2022 
   2023 	sub	%ecx, %eax		/* result = (%rdi) value - (%rsi) value */
   2024 	ret
   2025 
   2026 	.pushsection .rodata
        	/*
        	 * Table of 16 .int entries placed in .rodata, 16-byte aligned.
        	 * Each entry is the distance from unaligned_table to one ashr_N
        	 * entry point: entry 0 is ashr_0, and entries 1..15 are ashr_15
        	 * down to ashr_1.  NOTE(review): the code that indexes this table
        	 * is outside this section -- confirm how the index is derived.
        	 */
   2027 	.p2align 4
   2028 LABEL(unaligned_table):
   2029 	.int	LABEL(ashr_0) - LABEL(unaligned_table)
   2030 	.int	LABEL(ashr_15) - LABEL(unaligned_table)
   2031 	.int	LABEL(ashr_14) - LABEL(unaligned_table)
   2032 	.int	LABEL(ashr_13) - LABEL(unaligned_table)
   2033 	.int	LABEL(ashr_12) - LABEL(unaligned_table)
   2034 	.int	LABEL(ashr_11) - LABEL(unaligned_table)
   2035 	.int	LABEL(ashr_10) - LABEL(unaligned_table)
   2036 	.int	LABEL(ashr_9) - LABEL(unaligned_table)
   2037 	.int	LABEL(ashr_8) - LABEL(unaligned_table)
   2038 	.int	LABEL(ashr_7) - LABEL(unaligned_table)
   2039 	.int	LABEL(ashr_6) - LABEL(unaligned_table)
   2040 	.int	LABEL(ashr_5) - LABEL(unaligned_table)
   2041 	.int	LABEL(ashr_4) - LABEL(unaligned_table)
   2042 	.int	LABEL(ashr_3) - LABEL(unaligned_table)
   2043 	.int	LABEL(ashr_2) - LABEL(unaligned_table)
   2044 	.int	LABEL(ashr_1) - LABEL(unaligned_table)
   2045 	.popsection
   2046 #ifdef USE_AS_STRNCMP
   2047 	SET_SIZE(strncmp)
   2048 #else
   2049 	SET_SIZE(strcmp)		/* (const char *, const char *) */
   2050 #endif
   2051