Home | History | Annotate | Download | only in gen
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright (c) 2009, Intel Corporation
     24  * All rights reserved.
     25  */
     26 
     27 /*
     28  *	str[n]cpy - copy [n] chars from second operand into first operand
     29  */
     30 #include "SYS.h"
     31 #include "proc64_id.h"
     32 
     33 #define LABEL(s) .strcpy/**/s
     34 
     35 #ifdef USE_AS_STRNCPY
     36 	ENTRY(strncpy)				/* (char *, const char *, size_t) */
     37 	test	%rdx, %rdx			/* count is a 64-bit size_t: test all 64 bits */
     38 	jz	LABEL(strncpy_exitz)		/* count == 0: nothing to copy */
     39 	mov	%rdx, %r8			/* r8 = count still to be satisfied */
     40 #else
     41 	ENTRY(strcpy)				/* (char *, const char *) */
     42 	xor	%rdx, %rdx
     43 #endif
     44 	mov	%esi, %ecx			/* low src address bits, masked to an offset below */
     45 	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
     46 	and	$0xf, %rcx			/* rcx = src offset within its 16 byte block */
     47 	mov	%rdi, %rax			/* save destination address for return value */
     48 
     49 
     50 	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char checks */
     51 	pcmpeqb	(%rsi), %xmm0			/* check 16 bytes in src for null */
     52 	pmovmskb %xmm0, %edx			/* edx = one bit per null byte found */
     53 	shr	%cl, %edx			/* adjust for offset from 16byte boundary */
     54 	test	%edx, %edx			/* edx will be 0 if chars are non-null */
     55 	jnz	LABEL(less16bytes)		/* null char found in first 16 bytes examined */
     56 #ifdef USE_AS_STRNCPY
     57 	/*
     58 	 * Check if the count is satisfied in first 16 bytes examined.
     59 	 */
     60 	lea	-16(%r8, %rcx), %r11		/* r11 = count + src offset - 16 */
     61 	cmp	$0, %r11
     62 	jle	LABEL(less16bytes)		/* signed test: r11 <= 0 */
     63 #endif
     64 	mov	%rcx, %r9			/* rsi alignment offset */
     65 	or	%edi, %ecx
     66 	and	$0xf, %ecx			/* zero only if src offset and low 4 dest bits are all 0 */
     67 	lea	-16(%r9), %r10			/* lea leaves the flags set by the and untouched */
     68 	jz	LABEL(ashr_0)			/* src and dest are both 16 byte aligned */
     69 
     70 	neg	%r10				/* max src bytes remaining in current dqword */
     71 
     72 	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation */
     73 	pcmpeqb	16(%rsi), %xmm0			/* check next 16 bytes in src for a null */
     74 	pmovmskb %xmm0, %edx
     75 	test	%edx, %edx
     76 	jnz	LABEL(less32bytes)		/* null char found in first 32 bytes examined */
     77 
     78 #ifdef USE_AS_STRNCPY
     79 	/*
     80 	 * If strncpy count <= 16 go to exit case
     81 	 */
     82 	sub	$16, %r8
     83 	jbe	LABEL(less32bytes_strncpy_truncation)
     84 #endif
     85 	/*
     86 	 * At least 16 bytes to copy to destination string. Move them now.
     87 	 * Don't worry about alignment.
     88 	 */
     89 	mov	(%rsi, %r9), %rdx		/* first 8 bytes of the string */
     90 	mov	%rdx, (%rdi)
     91 	mov	8(%rsi, %r9), %rdx		/* next 8 bytes of the string */
     92 	mov	%rdx, 8(%rdi)
     93 
     94 	/*
     95 	 * so far destination rdi may be aligned by 16, re-calculate rsi and
     96 	 * jump to corresponding src/dest relative offset case.
     97 	 * 	rcx is offset of rsi
     98 	 * 	rdx is offset of rdi
     99 	 */
    100 	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
    101 	mov	%rax, %rdx			/* rax contains original rdi */
    102 	xor	%rdi, %rdx			/* same effect as "and $0xf, %rdx" */
    103 #ifdef USE_AS_STRNCPY
    104 	/*
    105 	 * Will now do 16 byte aligned stores. Stores may overlap some bytes
    106 	 * (ie store twice) if destination was unaligned. Compensate here.
    107 	 */
    108 	add	%rdx, %r8			/* compensate for overlap */
    109 #endif
    110 
    111 	add	$16, %rdi			/* next 16 bytes for dest */
    112 
    113 	/*
    114 	 * align src to 16-byte boundary. Could be up or down depending on
    115 	 * whether src offset - dest offset > 0 (up) or
    116 	 *  src offset - dest offset < 0 (down).
    117 	 */
    118 	sub	%rdx, %r9			/* src offset - dest offset */
    119 
    120 	lea	16(%r9, %rsi), %rsi
    121 	mov	%esi, %ecx			/* for new src offset */
    122 	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
    123 
    124 	and	$0xf, %ecx			/* new src offset is 0 if rsi/rdi have same alignment */
    125 	jz	LABEL(ashr_0)
    126 
    127 #ifdef USE_AS_STRNCPY
    128 	xor	%edx, %edx			/* In case unaligned_exit is taken */
    129 #endif
    130 	/*
    131 	 * Jump to case corresponding to source/dest string relative offsets
    132 	 * Index = (16 + (src offset - dest offset)) % 16
    133 	 */
    134 	lea	-16(%rcx), %r10
    135 	mov	%rcx, %r9			/* r9 = new src offset */
    136 	neg	%r10				/* max src bytes remaining in current dqword */
    137 	lea	LABEL(unaligned_table)(%rip), %r11
    138 	movslq	(%r11, %rcx, 4), %rcx		/* 32-bit table entry, sign-extended */
    139 	lea	(%r11, %rcx), %rcx		/* entry is an offset from the table base */
    140 	jmp	*%rcx
    141 
    142 /*
    143  * ashr_0 handles the following cases:
    144  * 	src alignment offset = dest alignment offset
    145  */
    146 	.p2align 5
    147 LABEL(ashr_0):
    148 #ifdef USE_AS_STRNCPY
    149 	sub	$16, %r8		/* count -= 16 */
    150  	jbe	LABEL(strncpy_truncation_aligned)	/* count was <= 16 */
    151 #endif
    152 	movdqa	(%rsi), %xmm1		/* fetch 16 bytes from src string */
    153 	movdqa	%xmm1, (%rdi)		/* store 16 bytes into dest string */
    154 	add	$16, %rsi
    155 	add	$16, %rdi
    156 	pcmpeqb	(%rsi), %xmm0		/* check 16 bytes in src for a null */
    157 	pmovmskb %xmm0, %edx		/* edx = one bit per null byte found */
    158 
    159 	test	%edx, %edx		/* edx will be 0 if chars are non-null */
    160 	jnz	LABEL(aligned_16bytes)	/* exit tail */
    161 
    162 LABEL(ashr_0_loop):
    163 #ifdef USE_AS_STRNCPY
    164 	sub	$16, %r8		/* count -= 16 */
    165 	jbe	LABEL(strncpy_truncation_aligned)	/* count was <= 16 */
    166 #endif
    167 	movdqa	(%rsi, %rcx), %xmm1	/* step 1 of 4: copy the 16 bytes at index %rcx */
    168 	movdqa	%xmm1, (%rdi, %rcx)
    169 	add	$16, %rcx		/* index += 16 */
    170 	pcmpeqb	(%rsi, %rcx), %xmm0	/* null check of the next 16 src bytes */
    171 	pmovmskb %xmm0, %edx
    172 	test	%edx, %edx
    173 	jnz	LABEL(aligned_exit)	/* null found in the next 16 bytes */
    174 
    175 #ifdef USE_AS_STRNCPY
    176 	sub	$16, %r8
    177 	jbe	LABEL(strncpy_truncation_aligned)
    178 #endif
    179 	movdqa  (%rsi, %rcx), %xmm1	/* step 2 of 4: same sequence as step 1 */
    180 	movdqa  %xmm1, (%rdi, %rcx)
    181 	add	$16, %rcx
    182 	pcmpeqb  (%rsi, %rcx), %xmm0
    183 	pmovmskb  %xmm0, %edx
    184 	test	%edx, %edx
    185 	jnz	LABEL(aligned_exit)
    186 
    187 #ifdef USE_AS_STRNCPY
    188 	sub	$16, %r8
    189 	jbe	LABEL(strncpy_truncation_aligned)
    190 #endif
    191 	movdqa  (%rsi, %rcx), %xmm1	/* step 3 of 4: same sequence as step 1 */
    192 	movdqa  %xmm1, (%rdi, %rcx)
    193 
    194 	add	$16, %rcx
    195 	pcmpeqb  (%rsi, %rcx), %xmm0
    196 	pmovmskb  %xmm0, %edx
    197 	test	%edx, %edx
    198 	jnz	LABEL(aligned_exit)
    199 
    200 #ifdef USE_AS_STRNCPY
    201 	sub	$16, %r8
    202 	jbe	LABEL(strncpy_truncation_aligned)
    203 #endif
    204 	movdqa  (%rsi, %rcx), %xmm1	/* step 4 of 4: same sequence as step 1 */
    205 	movdqa  %xmm1, (%rdi, %rcx)
    206 	add	$16, %rcx
    207 	pcmpeqb  (%rsi, %rcx), %xmm0
    208 	pmovmskb  %xmm0, %edx
    209 	test	%edx, %edx
    210 	jz	LABEL(ashr_0_loop)	/* no null yet: run the four steps again */
    211 	jmp	LABEL(aligned_exit)
    212 
    213 
    214 /*
    215  * ashr_15 handles the following cases:
    216  * 	(16 + (src offset - dest offset)) % 16 = 15
    217  *
    218  * Based on above operation, start from (%r9 + rsi) to the left of this cache
    219  * bank, there is no null byte.
    220  */
    221 	.p2align 4
    222 LABEL(ashr_15):
    223 	xor	%ecx, %ecx				/* clear index */
    224 #ifdef USE_AS_STRNCPY
    225 	cmp	%r10, %r8				/* count (%r8) vs 16 - src offset (%r10) */
    226 	jbe	LABEL(unaligned_exit)			/* count <= %r10 (unsigned) */
    227 #endif
    228 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
    229 	jz	LABEL(ashr_15_use_sse2)			/* USE_SSSE3 bit is clear */
    230 
    231 	.p2align 4
    232 LABEL(ashr_15_use_ssse3):
    233 	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src dqword */
    234 	pcmpeqb	%xmm3, %xmm0				/* xmm0 is all zero here: mark null bytes */
    235 	pmovmskb %xmm0, %edx				/* edx = null byte bitmask */
    236 	test	%edx, %edx
    237 	jnz	LABEL(unaligned_exit)			/* null byte in this dqword */
    238 #ifdef USE_AS_STRNCPY
    239 	sub	$16, %r8				/* count -= 16 */
    240  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    241 #endif
    242 
    243 	#palignr $15, (%rsi, %rcx), %xmm3
    244 	.byte	0x66, 0x0F, 0x3A ,0x0F			/* palignr opcode bytes */
    245 	.byte	0x1c, 0x0e, 0x0f			/* modrm (xmm3 + sib), sib (%rsi,%rcx), imm8 = 15 */
    246 
    247 	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of the realigned 16 bytes */
    248 	add	$16, %rcx				/* index += 16 */
    249 
    250 #ifdef USE_AS_STRNCPY
    251 	cmp	%r10, %r8
    252 	jbe	LABEL(unaligned_exit)
    253 #endif
    254 	movdqa	16(%rsi, %rcx), %xmm3			/* second unrolled copy of the step above */
    255 	pcmpeqb %xmm3, %xmm0
    256 	pmovmskb %xmm0, %edx
    257 	test	%edx, %edx
    258 	jnz	LABEL(unaligned_exit)
    259 #ifdef USE_AS_STRNCPY
    260 	sub	$16, %r8
    261  	jbe	LABEL(strncpy_truncation_unaligned)
    262 #endif
    263 
    264 	#palignr $15, (%rsi, %rcx), %xmm3
    265 	.byte	0x66, 0x0F, 0x3A ,0x0F
    266 	.byte	0x1c, 0x0e, 0x0f
    267 
    268 	movdqa	%xmm3, (%rdi, %rcx)
    269 	add	$16, %rcx
    270 
    271 #ifdef USE_AS_STRNCPY
    272 	cmp	%r10, %r8
    273 	jbe	LABEL(unaligned_exit)
    274 #endif
    275 	jmp	LABEL(ashr_15_use_ssse3)		/* loop */
    276 
    277 	.p2align 4
    278 LABEL(ashr_15_use_sse2):
    279 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* null check of the next src dqword */
    280 	pmovmskb %xmm0, %edx
    281 	test	%edx, %edx
    282 	jnz	LABEL(unaligned_exit)
    283 #ifdef USE_AS_STRNCPY
    284 	sub	$16, %r8				/* count -= 16 */
    285  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    286 #endif
    287 
    288 	movdqa	16(%rsi, %rcx), %xmm3			/* next src dqword */
    289 	movdqa	(%rsi, %rcx), %xmm2			/* current src dqword */
    290 
    291 	psrldq	$15, %xmm2				/* drop the low 15 bytes of the current dqword */
    292 	pslldq	$1, %xmm3				/* shift the next dqword up by 1 byte */
    293 	por	%xmm2, %xmm3				/* same 16 bytes that palignr $15 produces */
    294 
    295 	movdqa	%xmm3, (%rdi, %rcx)
    296 	add	$16, %rcx
    297 #ifdef USE_AS_STRNCPY
    298 	cmp	%r10, %r8
    299 	jbe	LABEL(unaligned_exit)
    300 #endif
    301 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* second unrolled copy of the step above */
    302 	pmovmskb %xmm0, %edx
    303 	test	%edx, %edx
    304 	jnz	LABEL(unaligned_exit)
    305 #ifdef USE_AS_STRNCPY
    306 	sub	$16, %r8
    307  	jbe	LABEL(strncpy_truncation_unaligned)
    308 #endif
    309 
    310 	movdqa	16(%rsi, %rcx), %xmm3
    311 	movdqa	(%rsi, %rcx), %xmm2
    312 
    313 	psrldq	$15, %xmm2
    314 	pslldq	$1, %xmm3
    315 	por	%xmm2, %xmm3
    316 
    317 	movdqa	%xmm3, (%rdi, %rcx)
    318 	add	$16, %rcx
    319 #ifdef USE_AS_STRNCPY
    320 	cmp	%r10, %r8
    321 	jbe	LABEL(unaligned_exit)
    322 #endif
    323 	jmp	LABEL(ashr_15_use_sse2)			/* loop */
    324 
    325 
    326 /*
    327  * ashr_14 handles the following cases:
    328  * 	(16 + (src offset - dest offset)) % 16 = 14
    329  *
    330  * Based on above operation, start from (%r9 + rsi) to the left of this cache
    331  * bank, there is no null byte.
    332  */
    333 	.p2align 4
    334 LABEL(ashr_14):
    335 	xor	%ecx, %ecx				/* clear index */
    336 #ifdef USE_AS_STRNCPY
    337 	cmp	%r10, %r8				/* count (%r8) vs 16 - src offset (%r10) */
    338 	jbe	LABEL(unaligned_exit)			/* count <= %r10 (unsigned) */
    339 #endif
    340 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
    341 	jz	LABEL(ashr_14_use_sse2)			/* USE_SSSE3 bit is clear */
    342 
    343 	.p2align 4
    344 LABEL(ashr_14_use_ssse3):
    345 	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src dqword */
    346 	pcmpeqb	%xmm3, %xmm0				/* xmm0 is all zero here: mark null bytes */
    347 	pmovmskb %xmm0, %edx				/* edx = null byte bitmask */
    348 	test	%edx, %edx
    349 	jnz	LABEL(unaligned_exit)			/* null byte in this dqword */
    350 #ifdef USE_AS_STRNCPY
    351 	sub	$16, %r8				/* count -= 16 */
    352  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    353 #endif
    354 
    355 	#palignr $14, (%rsi, %rcx), %xmm3
    356 	.byte	0x66, 0x0F, 0x3A ,0x0F			/* palignr opcode bytes */
    357 	.byte	0x1c, 0x0e, 0x0e			/* modrm (xmm3 + sib), sib (%rsi,%rcx), imm8 = 14 */
    358 
    359 	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of the realigned 16 bytes */
    360 	add	$16, %rcx				/* index += 16 */
    361 
    362 #ifdef USE_AS_STRNCPY
    363 	cmp	%r10, %r8
    364 	jbe	LABEL(unaligned_exit)
    365 #endif
    366 	movdqa	16(%rsi, %rcx), %xmm3			/* second unrolled copy of the step above */
    367 	pcmpeqb %xmm3, %xmm0
    368 	pmovmskb %xmm0, %edx
    369 	test	%edx, %edx
    370 	jnz	LABEL(unaligned_exit)
    371 #ifdef USE_AS_STRNCPY
    372 	sub	$16, %r8
    373  	jbe	LABEL(strncpy_truncation_unaligned)
    374 #endif
    375 
    376 	#palignr $14, (%rsi, %rcx), %xmm3
    377 	.byte	0x66, 0x0F, 0x3A ,0x0F
    378 	.byte	0x1c, 0x0e, 0x0e
    379 
    380 	movdqa	%xmm3, (%rdi, %rcx)
    381 	add	$16, %rcx
    382 #ifdef USE_AS_STRNCPY
    383 	cmp	%r10, %r8
    384 	jbe	LABEL(unaligned_exit)
    385 #endif
    386 	jmp	LABEL(ashr_14_use_ssse3)		/* loop */
    387 
    388 	.p2align 4
    389 LABEL(ashr_14_use_sse2):
    390 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* null check of the next src dqword */
    391 	pmovmskb %xmm0, %edx
    392 	test	%edx, %edx
    393 	jnz	LABEL(unaligned_exit)
    394 #ifdef USE_AS_STRNCPY
    395 	sub	$16, %r8				/* count -= 16 */
    396  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    397 #endif
    398 
    399 	movdqa	16(%rsi, %rcx), %xmm3			/* next src dqword */
    400 	movdqa	(%rsi, %rcx), %xmm2			/* current src dqword */
    401 
    402 	psrldq	$14, %xmm2				/* drop the low 14 bytes of the current dqword */
    403 	pslldq	$2, %xmm3				/* shift the next dqword up by 2 bytes */
    404 	por	%xmm2, %xmm3				/* same 16 bytes that palignr $14 produces */
    405 
    406 	movdqa	%xmm3, (%rdi, %rcx)
    407 	add	$16, %rcx
    408 
    409 #ifdef USE_AS_STRNCPY
    410 	cmp	%r10, %r8
    411 	jbe	LABEL(unaligned_exit)
    412 #endif
    413 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* second unrolled copy of the step above */
    414 	pmovmskb %xmm0, %edx
    415 	test	%edx, %edx
    416 	jnz	LABEL(unaligned_exit)
    417 #ifdef USE_AS_STRNCPY
    418 	sub	$16, %r8
    419  	jbe	LABEL(strncpy_truncation_unaligned)
    420 #endif
    421 
    422 	movdqa	16(%rsi, %rcx), %xmm3
    423 	movdqa	(%rsi, %rcx), %xmm2
    424 
    425 	psrldq	$14, %xmm2
    426 	pslldq	$2, %xmm3
    427 	por	%xmm2, %xmm3
    428 
    429 	movdqa	%xmm3, (%rdi, %rcx)
    430 	add	$16, %rcx
    431 #ifdef USE_AS_STRNCPY
    432 	cmp	%r10, %r8
    433 	jbe	LABEL(unaligned_exit)
    434 #endif
    435 	jmp	LABEL(ashr_14_use_sse2)			/* loop */
    436 
    437 
    438 /*
    439  * ashr_13 handles the following cases:
    440  * 	(16 + (src offset - dest offset)) % 16 = 13
    441  *
    442  * Based on above operation, start from (%r9 + rsi) to the left of this cache
    443  * bank, there is no null byte.
    444  */
    445 	.p2align 4
    446 LABEL(ashr_13):
    447 	xor	%ecx, %ecx				/* clear index */
    448 #ifdef USE_AS_STRNCPY
    449 	cmp	%r10, %r8				/* count (%r8) vs 16 - src offset (%r10) */
    450 	jbe	LABEL(unaligned_exit)			/* count <= %r10 (unsigned) */
    451 #endif
    452 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
    453 	jz	LABEL(ashr_13_use_sse2)			/* USE_SSSE3 bit is clear */
    454 
    455 	.p2align 4
    456 LABEL(ashr_13_use_ssse3):
    457 	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src dqword */
    458 	pcmpeqb	%xmm3, %xmm0				/* xmm0 is all zero here: mark null bytes */
    459 	pmovmskb %xmm0, %edx				/* edx = null byte bitmask */
    460 	test	%edx, %edx
    461 	jnz	LABEL(unaligned_exit)			/* null byte in this dqword */
    462 #ifdef USE_AS_STRNCPY
    463 	sub	$16, %r8				/* count -= 16 */
    464  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    465 #endif
    466 
    467 	#palignr $13, (%rsi, %rcx), %xmm3
    468 	.byte	0x66, 0x0F, 0x3A ,0x0F			/* palignr opcode bytes */
    469 	.byte	0x1c, 0x0e, 0x0d			/* modrm (xmm3 + sib), sib (%rsi,%rcx), imm8 = 13 */
    470 
    471 	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of the realigned 16 bytes */
    472 	add	$16, %rcx				/* index += 16 */
    473 
    474 #ifdef USE_AS_STRNCPY
    475 	cmp	%r10, %r8
    476 	jbe	LABEL(unaligned_exit)
    477 #endif
    478 	movdqa	16(%rsi, %rcx), %xmm3			/* second unrolled copy of the step above */
    479 	pcmpeqb %xmm3, %xmm0
    480 	pmovmskb %xmm0, %edx
    481 	test	%edx, %edx
    482 	jnz	LABEL(unaligned_exit)
    483 #ifdef USE_AS_STRNCPY
    484 	sub	$16, %r8
    485  	jbe	LABEL(strncpy_truncation_unaligned)
    486 #endif
    487 
    488 	#palignr $13, (%rsi, %rcx), %xmm3
    489 	.byte	0x66, 0x0F, 0x3A ,0x0F
    490 	.byte	0x1c, 0x0e, 0x0d
    491 
    492 	movdqa	%xmm3, (%rdi, %rcx)
    493 	add	$16, %rcx
    494 #ifdef USE_AS_STRNCPY
    495 	cmp	%r10, %r8
    496 	jbe	LABEL(unaligned_exit)
    497 #endif
    498 	jmp	LABEL(ashr_13_use_ssse3)		/* loop */
    499 
    500 	.p2align 4
    501 LABEL(ashr_13_use_sse2):
    502 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* null check of the next src dqword */
    503 	pmovmskb %xmm0, %edx
    504 	test	%edx, %edx
    505 	jnz	LABEL(unaligned_exit)
    506 #ifdef USE_AS_STRNCPY
    507 	sub	$16, %r8				/* count -= 16 */
    508  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    509 #endif
    510 
    511 	movdqa	16(%rsi, %rcx), %xmm3			/* next src dqword */
    512 	movdqa	(%rsi, %rcx), %xmm2			/* current src dqword */
    513 
    514 	psrldq	$13, %xmm2				/* drop the low 13 bytes of the current dqword */
    515 	pslldq	$3, %xmm3				/* shift the next dqword up by 3 bytes */
    516 	por	%xmm2, %xmm3				/* same 16 bytes that palignr $13 produces */
    517 
    518 	movdqa	%xmm3, (%rdi, %rcx)
    519 	add	$16, %rcx
    520 
    521 #ifdef USE_AS_STRNCPY
    522 	cmp	%r10, %r8
    523 	jbe	LABEL(unaligned_exit)
    524 #endif
    525 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* second unrolled copy of the step above */
    526 	pmovmskb %xmm0, %edx
    527 	test	%edx, %edx
    528 	jnz	LABEL(unaligned_exit)
    529 #ifdef USE_AS_STRNCPY
    530 	sub	$16, %r8
    531  	jbe	LABEL(strncpy_truncation_unaligned)
    532 #endif
    533 
    534 	movdqa	16(%rsi, %rcx), %xmm3
    535 	movdqa	(%rsi, %rcx), %xmm2
    536 
    537 	psrldq	$13, %xmm2
    538 	pslldq	$3, %xmm3
    539 	por	%xmm2, %xmm3
    540 
    541 	movdqa	%xmm3, (%rdi, %rcx)
    542 	add	$16, %rcx
    543 #ifdef USE_AS_STRNCPY
    544 	cmp	%r10, %r8
    545 	jbe	LABEL(unaligned_exit)
    546 #endif
    547 	jmp	LABEL(ashr_13_use_sse2)			/* loop */
    548 
    549 
    550 /*
    551  * ashr_12 handles the following cases:
    552  * 	(16 + (src offset - dest offset)) % 16 = 12
    553  *
    554  * Based on above operation, start from (%r9 + rsi) to the left of this cache
    555  * bank, there is no null byte.
    556  */
    557 	.p2align 4
    558 LABEL(ashr_12):
    559 	xor	%ecx, %ecx				/* clear index */
    560 #ifdef USE_AS_STRNCPY
    561 	cmp	%r10, %r8				/* count (%r8) vs 16 - src offset (%r10) */
    562 	jbe	LABEL(unaligned_exit)			/* count <= %r10 (unsigned) */
    563 #endif
    564 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
    565 	jz	LABEL(ashr_12_use_sse2)			/* USE_SSSE3 bit is clear */
    566 
    567 	.p2align 4
    568 LABEL(ashr_12_use_ssse3):
    569 	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src dqword */
    570 	pcmpeqb	%xmm3, %xmm0				/* xmm0 is all zero here: mark null bytes */
    571 	pmovmskb %xmm0, %edx				/* edx = null byte bitmask */
    572 	test	%edx, %edx
    573 	jnz	LABEL(unaligned_exit)			/* null byte in this dqword */
    574 #ifdef USE_AS_STRNCPY
    575 	sub	$16, %r8				/* count -= 16 */
    576  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    577 #endif
    578 
    579 	#palignr $12, (%rsi, %rcx), %xmm3
    580 	.byte	0x66, 0x0F, 0x3A ,0x0F			/* palignr opcode bytes */
    581 	.byte	0x1c, 0x0e, 0x0c			/* modrm (xmm3 + sib), sib (%rsi,%rcx), imm8 = 12 */
    582 
    583 	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of the realigned 16 bytes */
    584 	add	$16, %rcx				/* index += 16 */
    585 
    586 #ifdef USE_AS_STRNCPY
    587 	cmp	%r10, %r8
    588 	jbe	LABEL(unaligned_exit)
    589 #endif
    590 	movdqa	16(%rsi, %rcx), %xmm3			/* second unrolled copy of the step above */
    591 	pcmpeqb %xmm3, %xmm0
    592 	pmovmskb %xmm0, %edx
    593 	test	%edx, %edx
    594 	jnz	LABEL(unaligned_exit)
    595 #ifdef USE_AS_STRNCPY
    596 	sub	$16, %r8
    597  	jbe	LABEL(strncpy_truncation_unaligned)
    598 #endif
    599 
    600 	#palignr $12, (%rsi, %rcx), %xmm3
    601 	.byte	0x66, 0x0F, 0x3A ,0x0F
    602 	.byte	0x1c, 0x0e, 0x0c
    603 
    604 	movdqa	%xmm3, (%rdi, %rcx)
    605 	add	$16, %rcx
    606 #ifdef USE_AS_STRNCPY
    607 	cmp	%r10, %r8
    608 	jbe	LABEL(unaligned_exit)
    609 #endif
    610 	jmp	LABEL(ashr_12_use_ssse3)		/* loop */
    611 
    612 	.p2align 4
    613 LABEL(ashr_12_use_sse2):
    614 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* null check of the next src dqword */
    615 	pmovmskb %xmm0, %edx
    616 	test	%edx, %edx
    617 	jnz	LABEL(unaligned_exit)
    618 #ifdef USE_AS_STRNCPY
    619 	sub	$16, %r8				/* count -= 16 */
    620  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    621 #endif
    622 
    623 	movdqa	16(%rsi, %rcx), %xmm3			/* next src dqword */
    624 	movdqa	(%rsi, %rcx), %xmm2			/* current src dqword */
    625 
    626 	psrldq	$12, %xmm2				/* drop the low 12 bytes of the current dqword */
    627 	pslldq	$4, %xmm3				/* shift the next dqword up by 4 bytes */
    628 	por	%xmm2, %xmm3				/* same 16 bytes that palignr $12 produces */
    629 
    630 	movdqa	%xmm3, (%rdi, %rcx)
    631 	add	$16, %rcx
    632 
    633 #ifdef USE_AS_STRNCPY
    634 	cmp	%r10, %r8
    635 	jbe	LABEL(unaligned_exit)
    636 #endif
    637 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* second unrolled copy of the step above */
    638 	pmovmskb %xmm0, %edx
    639 	test	%edx, %edx
    640 	jnz	LABEL(unaligned_exit)
    641 #ifdef USE_AS_STRNCPY
    642 	sub	$16, %r8
    643  	jbe	LABEL(strncpy_truncation_unaligned)
    644 #endif
    645 
    646 	movdqa	16(%rsi, %rcx), %xmm3
    647 	movdqa	(%rsi, %rcx), %xmm2
    648 
    649 	psrldq	$12, %xmm2
    650 	pslldq	$4, %xmm3
    651 	por	%xmm2, %xmm3
    652 
    653 	movdqa	%xmm3, (%rdi, %rcx)
    654 	add	$16, %rcx
    655 #ifdef USE_AS_STRNCPY
    656 	cmp	%r10, %r8
    657 	jbe	LABEL(unaligned_exit)
    658 #endif
    659 	jmp	LABEL(ashr_12_use_sse2)			/* loop */
    660 
    661 
    662 /*
    663  * ashr_11 handles the following cases:
    664  * 	(16 + (src offset - dest offset)) % 16 = 11
    665  *
    666  * Based on above operation, start from (%r9 + rsi) to the left of this cache
    667  * bank, there is no null byte.
    668  */
    669 	.p2align 4
    670 LABEL(ashr_11):
    671 	xor	%ecx, %ecx				/* clear index */
    672 #ifdef USE_AS_STRNCPY
    673 	cmp	%r10, %r8				/* count (%r8) vs 16 - src offset (%r10) */
    674 	jbe	LABEL(unaligned_exit)			/* count <= %r10 (unsigned) */
    675 #endif
    676 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
    677 	jz	LABEL(ashr_11_use_sse2)			/* USE_SSSE3 bit is clear */
    678 
    679 	.p2align 4
    680 LABEL(ashr_11_use_ssse3):
    681 	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src dqword */
    682 	pcmpeqb	%xmm3, %xmm0				/* xmm0 is all zero here: mark null bytes */
    683 	pmovmskb %xmm0, %edx				/* edx = null byte bitmask */
    684 	test	%edx, %edx
    685 	jnz	LABEL(unaligned_exit)			/* null byte in this dqword */
    686 #ifdef USE_AS_STRNCPY
    687 	sub	$16, %r8				/* count -= 16 */
    688  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    689 #endif
    690 
    691 	#palignr $11, (%rsi, %rcx), %xmm3
    692 	.byte	0x66, 0x0F, 0x3A ,0x0F			/* palignr opcode bytes */
    693 	.byte	0x1c, 0x0e, 0x0b			/* modrm (xmm3 + sib), sib (%rsi,%rcx), imm8 = 11 */
    694 
    695 	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of the realigned 16 bytes */
    696 	add	$16, %rcx				/* index += 16 */
    697 
    698 #ifdef USE_AS_STRNCPY
    699 	cmp	%r10, %r8
    700 	jbe	LABEL(unaligned_exit)
    701 #endif
    702 	movdqa	16(%rsi, %rcx), %xmm3			/* second unrolled copy of the step above */
    703 	pcmpeqb %xmm3, %xmm0
    704 	pmovmskb %xmm0, %edx
    705 	test	%edx, %edx
    706 	jnz	LABEL(unaligned_exit)
    707 #ifdef USE_AS_STRNCPY
    708 	sub	$16, %r8
    709  	jbe	LABEL(strncpy_truncation_unaligned)
    710 #endif
    711 
    712 	#palignr $11, (%rsi, %rcx), %xmm3
    713 	.byte	0x66, 0x0F, 0x3A ,0x0F
    714 	.byte	0x1c, 0x0e, 0x0b
    715 
    716 	movdqa	%xmm3, (%rdi, %rcx)
    717 	add	$16, %rcx
    718 #ifdef USE_AS_STRNCPY
    719 	cmp	%r10, %r8
    720 	jbe	LABEL(unaligned_exit)
    721 #endif
    722 	jmp	LABEL(ashr_11_use_ssse3)		/* loop */
    723 
    724 	.p2align 4
    725 LABEL(ashr_11_use_sse2):
    726 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* null check of the next src dqword */
    727 	pmovmskb %xmm0, %edx
    728 	test	%edx, %edx
    729 	jnz	LABEL(unaligned_exit)
    730 #ifdef USE_AS_STRNCPY
    731 	sub	$16, %r8				/* count -= 16 */
    732  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    733 #endif
    734 
    735 	movdqa	16(%rsi, %rcx), %xmm3			/* next src dqword */
    736 	movdqa	(%rsi, %rcx), %xmm2			/* current src dqword */
    737 
    738 	psrldq	$11, %xmm2				/* drop the low 11 bytes of the current dqword */
    739 	pslldq	$5, %xmm3				/* shift the next dqword up by 5 bytes */
    740 	por	%xmm2, %xmm3				/* same 16 bytes that palignr $11 produces */
    741 
    742 	movdqa	%xmm3, (%rdi, %rcx)
    743 	add	$16, %rcx
    744 
    745 #ifdef USE_AS_STRNCPY
    746 	cmp	%r10, %r8
    747 	jbe	LABEL(unaligned_exit)
    748 #endif
    749 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* second unrolled copy of the step above */
    750 	pmovmskb %xmm0, %edx
    751 	test	%edx, %edx
    752 	jnz	LABEL(unaligned_exit)
    753 #ifdef USE_AS_STRNCPY
    754 	sub	$16, %r8
    755  	jbe	LABEL(strncpy_truncation_unaligned)
    756 #endif
    757 
    758 	movdqa	16(%rsi, %rcx), %xmm3
    759 	movdqa	(%rsi, %rcx), %xmm2
    760 
    761 	psrldq	$11, %xmm2
    762 	pslldq	$5, %xmm3
    763 	por	%xmm2, %xmm3
    764 
    765 	movdqa	%xmm3, (%rdi, %rcx)
    766 	add	$16, %rcx
    767 #ifdef USE_AS_STRNCPY
    768 	cmp	%r10, %r8
    769 	jbe	LABEL(unaligned_exit)
    770 #endif
    771 	jmp	LABEL(ashr_11_use_sse2)			/* loop */
    772 
    773 
    774 /*
    775  * ashr_10 handles the following cases:
    776  * 	(16 + (src offset - dest offset)) % 16 = 10
    777  *
    778  * Based on above operation, start from (%r9 + rsi) to the left of this cache
    779  * bank, there is no null byte.
    780  */
    781 	.p2align 4
    782 LABEL(ashr_10):
    783 	xor	%ecx, %ecx				/* clear index */
    784 #ifdef USE_AS_STRNCPY
    785 	cmp	%r10, %r8				/* count (%r8) vs 16 - src offset (%r10) */
    786 	jbe	LABEL(unaligned_exit)			/* count <= %r10 (unsigned) */
    787 #endif
    788 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
    789 	jz	LABEL(ashr_10_use_sse2)			/* USE_SSSE3 bit is clear */
    790 
    791 	.p2align 4
    792 LABEL(ashr_10_use_ssse3):
    793 	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src dqword */
    794 	pcmpeqb	%xmm3, %xmm0				/* xmm0 is all zero here: mark null bytes */
    795 	pmovmskb %xmm0, %edx				/* edx = null byte bitmask */
    796 	test	%edx, %edx
    797 	jnz	LABEL(unaligned_exit)			/* null byte in this dqword */
    798 #ifdef USE_AS_STRNCPY
    799 	sub	$16, %r8				/* count -= 16 */
    800  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    801 #endif
    802 
    803 	#palignr $10, (%rsi, %rcx), %xmm3
    804 	.byte	0x66, 0x0F, 0x3A ,0x0F			/* palignr opcode bytes */
    805 	.byte	0x1c, 0x0e, 0x0a			/* modrm (xmm3 + sib), sib (%rsi,%rcx), imm8 = 10 */
    806 
    807 	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of the realigned 16 bytes */
    808 	add	$16, %rcx				/* index += 16 */
    809 
    810 #ifdef USE_AS_STRNCPY
    811 	cmp	%r10, %r8
    812 	jbe	LABEL(unaligned_exit)
    813 #endif
    814 	movdqa	16(%rsi, %rcx), %xmm3			/* second unrolled copy of the step above */
    815 	pcmpeqb %xmm3, %xmm0
    816 	pmovmskb %xmm0, %edx
    817 	test	%edx, %edx
    818 	jnz	LABEL(unaligned_exit)
    819 #ifdef USE_AS_STRNCPY
    820 	sub	$16, %r8
    821  	jbe	LABEL(strncpy_truncation_unaligned)
    822 #endif
    823 
    824 	#palignr $10, (%rsi, %rcx), %xmm3
    825 	.byte	0x66, 0x0F, 0x3A ,0x0F
    826 	.byte	0x1c, 0x0e, 0x0a
    827 
    828 	movdqa	%xmm3, (%rdi, %rcx)
    829 	add	$16, %rcx
    830 #ifdef USE_AS_STRNCPY
    831 	cmp	%r10, %r8
    832 	jbe	LABEL(unaligned_exit)
    833 #endif
    834 	jmp	LABEL(ashr_10_use_ssse3)		/* loop */
    835 
    836 	.p2align 4
    837 LABEL(ashr_10_use_sse2):
    838 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* null check of the next src dqword */
    839 	pmovmskb %xmm0, %edx
    840 	test	%edx, %edx
    841 	jnz	LABEL(unaligned_exit)
    842 #ifdef USE_AS_STRNCPY
    843 	sub	$16, %r8				/* count -= 16 */
    844  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    845 #endif
    846 
    847 	movdqa	16(%rsi, %rcx), %xmm3			/* next src dqword */
    848 	movdqa	(%rsi, %rcx), %xmm2			/* current src dqword */
    849 
    850 	psrldq	$10, %xmm2				/* drop the low 10 bytes of the current dqword */
    851 	pslldq	$6, %xmm3				/* shift the next dqword up by 6 bytes */
    852 	por	%xmm2, %xmm3				/* same 16 bytes that palignr $10 produces */
    853 
    854 	movdqa	%xmm3, (%rdi, %rcx)
    855 	add	$16, %rcx
    856 
    857 #ifdef USE_AS_STRNCPY
    858 	cmp	%r10, %r8
    859 	jbe	LABEL(unaligned_exit)
    860 #endif
    861 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* second unrolled copy of the step above */
    862 	pmovmskb %xmm0, %edx
    863 	test	%edx, %edx
    864 	jnz	LABEL(unaligned_exit)
    865 #ifdef USE_AS_STRNCPY
    866 	sub	$16, %r8
    867  	jbe	LABEL(strncpy_truncation_unaligned)
    868 #endif
    869 
    870 	movdqa	16(%rsi, %rcx), %xmm3
    871 	movdqa	(%rsi, %rcx), %xmm2
    872 
    873 	psrldq	$10, %xmm2
    874 	pslldq	$6, %xmm3
    875 	por	%xmm2, %xmm3
    876 
    877 	movdqa	%xmm3, (%rdi, %rcx)
    878 	add	$16, %rcx
    879 #ifdef USE_AS_STRNCPY
    880 	cmp	%r10, %r8
    881 	jbe	LABEL(unaligned_exit)
    882 #endif
    883 	jmp	LABEL(ashr_10_use_sse2)			/* loop */
    884 
    885 
    886 /*
    887  * ashr_9 handles the following cases:
    888  * 	(16 + (src offset - dest offset)) % 16 = 9
    889  *
    890  * Based on above operation, start from (%r9 + rsi) to the left of this cache
    891  * bank, there is no null byte.
    892  */
    893 	.p2align 4
    894 LABEL(ashr_9):
    895 	xor	%ecx, %ecx				/* clear index */
    896 #ifdef USE_AS_STRNCPY
    897 	cmp	%r10, %r8				/* count (%r8) vs 16 - src offset (%r10) */
    898 	jbe	LABEL(unaligned_exit)			/* count <= %r10 (unsigned) */
    899 #endif
    900 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
    901 	jz	LABEL(ashr_9_use_sse2)			/* USE_SSSE3 bit is clear */
    902 
    903 	.p2align 4
    904 LABEL(ashr_9_use_ssse3):
    905 	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src dqword */
    906 	pcmpeqb	%xmm3, %xmm0				/* xmm0 is all zero here: mark null bytes */
    907 	pmovmskb %xmm0, %edx				/* edx = null byte bitmask */
    908 	test	%edx, %edx
    909 	jnz	LABEL(unaligned_exit)			/* null byte in this dqword */
    910 #ifdef USE_AS_STRNCPY
    911 	sub	$16, %r8				/* count -= 16 */
    912  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    913 #endif
    914 
    915 	#palignr $9, (%rsi, %rcx), %xmm3
    916 	.byte	0x66, 0x0F, 0x3A ,0x0F			/* palignr opcode bytes */
    917 	.byte	0x1c, 0x0e, 0x09			/* modrm (xmm3 + sib), sib (%rsi,%rcx), imm8 = 9 */
    918 
    919 	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of the realigned 16 bytes */
    920 	add	$16, %rcx				/* index += 16 */
    921 
    922 #ifdef USE_AS_STRNCPY
    923 	cmp	%r10, %r8
    924 	jbe	LABEL(unaligned_exit)
    925 #endif
    926 	movdqa	16(%rsi, %rcx), %xmm3			/* second unrolled copy of the step above */
    927 	pcmpeqb %xmm3, %xmm0
    928 	pmovmskb %xmm0, %edx
    929 	test	%edx, %edx
    930 	jnz	LABEL(unaligned_exit)
    931 #ifdef USE_AS_STRNCPY
    932 	sub	$16, %r8
    933  	jbe	LABEL(strncpy_truncation_unaligned)
    934 #endif
    935 
    936 	#palignr $9, (%rsi, %rcx), %xmm3
    937 	.byte	0x66, 0x0F, 0x3A ,0x0F
    938 	.byte	0x1c, 0x0e, 0x09
    939 
    940 	movdqa	%xmm3, (%rdi, %rcx)
    941 	add	$16, %rcx
    942 #ifdef USE_AS_STRNCPY
    943 	cmp	%r10, %r8
    944 	jbe	LABEL(unaligned_exit)
    945 #endif
    946 	jmp	LABEL(ashr_9_use_ssse3)			/* loop */
    947 
    948 	.p2align 4
    949 LABEL(ashr_9_use_sse2):
    950 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* null check of the next src dqword */
    951 	pmovmskb %xmm0, %edx
    952 	test	%edx, %edx
    953 	jnz	LABEL(unaligned_exit)
    954 #ifdef USE_AS_STRNCPY
    955 	sub	$16, %r8				/* count -= 16 */
    956  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
    957 #endif
    958 
    959 	movdqa	16(%rsi, %rcx), %xmm3			/* next src dqword */
    960 	movdqa	(%rsi, %rcx), %xmm2			/* current src dqword */
    961 
    962 	psrldq	$9, %xmm2				/* drop the low 9 bytes of the current dqword */
    963 	pslldq	$7, %xmm3				/* shift the next dqword up by 7 bytes */
    964 	por	%xmm2, %xmm3				/* same 16 bytes that palignr $9 produces */
    965 
    966 	movdqa	%xmm3, (%rdi, %rcx)
    967 	add	$16, %rcx
    968 
    969 #ifdef USE_AS_STRNCPY
    970 	cmp	%r10, %r8
    971 	jbe	LABEL(unaligned_exit)
    972 #endif
    973 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* second unrolled copy of the step above */
    974 	pmovmskb %xmm0, %edx
    975 	test	%edx, %edx
    976 	jnz	LABEL(unaligned_exit)
    977 #ifdef USE_AS_STRNCPY
    978 	sub	$16, %r8
    979  	jbe	LABEL(strncpy_truncation_unaligned)
    980 #endif
    981 
    982 	movdqa	16(%rsi, %rcx), %xmm3
    983 	movdqa	(%rsi, %rcx), %xmm2
    984 
    985 	psrldq	$9, %xmm2
    986 	pslldq	$7, %xmm3
    987 	por	%xmm2, %xmm3
    988 
    989 	movdqa	%xmm3, (%rdi, %rcx)
    990 	add	$16, %rcx
    991 #ifdef USE_AS_STRNCPY
    992 	cmp	%r10, %r8
    993 	jbe	LABEL(unaligned_exit)
    994 #endif
    995 	jmp	LABEL(ashr_9_use_sse2)			/* loop */
    996 
    997 
    998 /*
    999  * ashr_8 handles the following cases:
   1000  * 	(16 + (src offset - dest offset)) % 16 = 8
   1001  *
   1002  * Based on above operation, start from (%r9 + rsi) to the left of this cache
   1003  * bank, there is no null byte.
   1004  */
   1005 	.p2align 4
   1006 LABEL(ashr_8):
   1007 	xor	%ecx, %ecx				/* clear index */
   1008 #ifdef USE_AS_STRNCPY
   1009 	cmp	%r10, %r8				/* count (%r8) vs 16 - src offset (%r10) */
   1010 	jbe	LABEL(unaligned_exit)			/* count <= %r10 (unsigned) */
   1011 #endif
   1012 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
   1013 	jz	LABEL(ashr_8_use_sse2)			/* USE_SSSE3 bit is clear */
   1014 
   1015 	.p2align 4
   1016 LABEL(ashr_8_use_ssse3):
   1017 	movdqa	16(%rsi, %rcx), %xmm3			/* next aligned src dqword */
   1018 	pcmpeqb	%xmm3, %xmm0				/* xmm0 is all zero here: mark null bytes */
   1019 	pmovmskb %xmm0, %edx				/* edx = null byte bitmask */
   1020 	test	%edx, %edx
   1021 	jnz	LABEL(unaligned_exit)			/* null byte in this dqword */
   1022 #ifdef USE_AS_STRNCPY
   1023 	sub	$16, %r8				/* count -= 16 */
   1024  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
   1025 #endif
   1026 
   1027 	#palignr $8, (%rsi, %rcx), %xmm3
   1028 	.byte	0x66, 0x0F, 0x3A ,0x0F			/* palignr opcode bytes */
   1029 	.byte	0x1c, 0x0e, 0x08			/* modrm (xmm3 + sib), sib (%rsi,%rcx), imm8 = 8 */
   1030 
   1031 	movdqa	%xmm3, (%rdi, %rcx)			/* aligned store of the realigned 16 bytes */
   1032 	add	$16, %rcx				/* index += 16 */
   1033 
   1034 #ifdef USE_AS_STRNCPY
   1035 	cmp	%r10, %r8
   1036 	jbe	LABEL(unaligned_exit)
   1037 #endif
   1038 	movdqa	16(%rsi, %rcx), %xmm3			/* second unrolled copy of the step above */
   1039 	pcmpeqb %xmm3, %xmm0
   1040 	pmovmskb %xmm0, %edx
   1041 	test	%edx, %edx
   1042 	jnz	LABEL(unaligned_exit)
   1043 #ifdef USE_AS_STRNCPY
   1044 	sub	$16, %r8
   1045  	jbe	LABEL(strncpy_truncation_unaligned)
   1046 #endif
   1047 
   1048 	#palignr $8, (%rsi, %rcx), %xmm3
   1049 	.byte	0x66, 0x0F, 0x3A ,0x0F
   1050 	.byte	0x1c, 0x0e, 0x08
   1051 
   1052 	movdqa	%xmm3, (%rdi, %rcx)
   1053 	add	$16, %rcx
   1054 #ifdef USE_AS_STRNCPY
   1055 	cmp	%r10, %r8
   1056 	jbe	LABEL(unaligned_exit)
   1057 #endif
   1058 	jmp	LABEL(ashr_8_use_ssse3)			/* loop */
   1059 
   1060 	.p2align 4
   1061 LABEL(ashr_8_use_sse2):
   1062 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* null check of the next src dqword */
   1063 	pmovmskb %xmm0, %edx
   1064 	test	%edx, %edx
   1065 	jnz	LABEL(unaligned_exit)
   1066 #ifdef USE_AS_STRNCPY
   1067 	sub	$16, %r8				/* count -= 16 */
   1068  	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
   1069 #endif
   1070 
   1071 	movdqa	16(%rsi, %rcx), %xmm3			/* next src dqword */
   1072 	movdqa	(%rsi, %rcx), %xmm2			/* current src dqword */
   1073 
   1074 	psrldq	$8, %xmm2				/* drop the low 8 bytes of the current dqword */
   1075 	pslldq	$8, %xmm3				/* shift the next dqword up by 8 bytes */
   1076 	por	%xmm2, %xmm3				/* same 16 bytes that palignr $8 produces */
   1077 
   1078 	movdqa	%xmm3, (%rdi, %rcx)
   1079 	add	$16, %rcx
   1080 
   1081 #ifdef USE_AS_STRNCPY
   1082 	cmp	%r10, %r8
   1083 	jbe	LABEL(unaligned_exit)
   1084 #endif
   1085 	pcmpeqb 16(%rsi, %rcx), %xmm0			/* second unrolled copy of the step above */
   1086 	pmovmskb %xmm0, %edx
   1087 	test	%edx, %edx
   1088 	jnz	LABEL(unaligned_exit)
   1089 #ifdef USE_AS_STRNCPY
   1090 	sub	$16, %r8
   1091  	jbe	LABEL(strncpy_truncation_unaligned)
   1092 #endif
   1093 
   1094 	movdqa	16(%rsi, %rcx), %xmm3
   1095 	movdqa	(%rsi, %rcx), %xmm2
   1096 
   1097 	psrldq	$8, %xmm2
   1098 	pslldq	$8, %xmm3
   1099 	por	%xmm2, %xmm3
   1100 
   1101 	movdqa	%xmm3, (%rdi, %rcx)
   1102 	add	$16, %rcx
   1103 #ifdef USE_AS_STRNCPY
   1104 	cmp	%r10, %r8
   1105 	jbe	LABEL(unaligned_exit)
   1106 #endif
   1107 	jmp	LABEL(ashr_8_use_sse2)			/* loop */
   1108 
   1109 
   1110 /*
   1111  * ashr_7 handles the following cases:
   1112  * 	(16 + (src offset - dest offset)) % 16 = 7
   1113  *
   1114  * Based on above operation, start from (%r9 + rsi) to the left of this cache
   1115  * bank, there is no null byte.
   1116  */
   1117 	.p2align 4
   1118 LABEL(ashr_7):	/* shift-by-7 case: pick the SSSE3 or the SSE2 copy loop */
   1119 	xor	%ecx, %ecx				/* clear index */
   1120 #ifdef USE_AS_STRNCPY
   1121 	cmp	%r10, %r8	/* strncpy: compare remaining count with r10 (set outside this section) */
   1122 	jbe	LABEL(unaligned_exit)	/* remaining <= r10: let the exit tail finish */
   1123 #endif
   1124 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
   1125 	jz	LABEL(ashr_7_use_sse2)	/* flag clear: take the SSE2 loop */
   1126 
   1127 	.p2align 4
   1128 LABEL(ashr_7_use_ssse3):	/* SSSE3 copy loop, unrolled twice: palignr $7 realigns the source data */
   1129 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1130 	pcmpeqb	%xmm3, %xmm0	/* flag null bytes in xmm3 (expects xmm0 == 0 on entry) */
   1131 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1132 	test	%edx, %edx
   1133 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1134 #ifdef USE_AS_STRNCPY
   1135 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1136  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1137 #endif
   1138 
   1139 	#palignr $7, (%rsi, %rcx), %xmm3
   1140 	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode bytes */
   1141 	.byte	0x1c, 0x0e, 0x07	/* ModRM and SIB for (%rsi,%rcx),%xmm3; imm8 = 7 */
   1142 
   1143 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1144 	add	$16, %rcx	/* advance the index shared by source and destination */
   1145 
   1146 #ifdef USE_AS_STRNCPY
   1147 	cmp	%r10, %r8
   1148 	jbe	LABEL(unaligned_exit)
   1149 #endif
   1150 	movdqa	16(%rsi, %rcx), %xmm3	/* second unrolled iteration: same steps as above */
   1151 	pcmpeqb %xmm3, %xmm0
   1152 	pmovmskb %xmm0, %edx
   1153 	test	%edx, %edx
   1154 	jnz	LABEL(unaligned_exit)
   1155 #ifdef USE_AS_STRNCPY
   1156 	sub	$16, %r8
   1157  	jbe	LABEL(strncpy_truncation_unaligned)
   1158 #endif
   1159 
   1160 	#palignr $7, (%rsi, %rcx), %xmm3
   1161 	.byte	0x66, 0x0F, 0x3A ,0x0F
   1162 	.byte	0x1c, 0x0e, 0x07
   1163 
   1164 	movdqa	%xmm3, (%rdi, %rcx)
   1165 	add	$16, %rcx
   1166 #ifdef USE_AS_STRNCPY
   1167 	cmp	%r10, %r8
   1168 	jbe	LABEL(unaligned_exit)
   1169 #endif
   1170 	jmp	LABEL(ashr_7_use_ssse3)	/* keep looping until a null or the count stops us */
   1171 
   1172 	.p2align 4
   1173 LABEL(ashr_7_use_sse2):	/* SSE2 copy loop, unrolled twice: shifts + por stand in for palignr $7 */
   1174 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* flag null bytes in the next 16 source bytes (expects xmm0 == 0 on entry) */
   1175 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1176 	test	%edx, %edx
   1177 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1178 #ifdef USE_AS_STRNCPY
   1179 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1180  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1181 #endif
   1182 
   1183 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1184 	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current aligned source block */
   1185 
   1186 	psrldq	$7, %xmm2	/* drop the low 7 bytes of the current block */
   1187 	pslldq	$9, %xmm3	/* move the low 7 bytes of the next block to the top */
   1188 	por	%xmm2, %xmm3	/* splice both parts into one 16-byte chunk */
   1189 
   1190 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1191 	add	$16, %rcx	/* advance the index shared by source and destination */
   1192 
   1193 #ifdef USE_AS_STRNCPY
   1194 	cmp	%r10, %r8
   1195 	jbe	LABEL(unaligned_exit)
   1196 #endif
   1197 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* second unrolled iteration: same steps as above */
   1198 	pmovmskb %xmm0, %edx
   1199 	test	%edx, %edx
   1200 	jnz	LABEL(unaligned_exit)
   1201 #ifdef USE_AS_STRNCPY
   1202 	sub	$16, %r8
   1203  	jbe	LABEL(strncpy_truncation_unaligned)
   1204 #endif
   1205 
   1206 	movdqa	16(%rsi, %rcx), %xmm3
   1207 	movdqa	(%rsi, %rcx), %xmm2
   1208 
   1209 	psrldq	$7, %xmm2
   1210 	pslldq	$9, %xmm3
   1211 	por	%xmm2, %xmm3
   1212 
   1213 	movdqa	%xmm3, (%rdi, %rcx)
   1214 	add	$16, %rcx
   1215 #ifdef USE_AS_STRNCPY
   1216 	cmp	%r10, %r8
   1217 	jbe	LABEL(unaligned_exit)
   1218 #endif
   1219 	jmp	LABEL(ashr_7_use_sse2)	/* keep looping until a null or the count stops us */
   1220 
   1221 
   1222 /*
   1223  * ashr_6 handles the following cases:
   1224  * 	(16 + (src offset - dest offset)) % 16 = 6
   1225  *
   1226  * Based on above operation, start from (%r9 + rsi) to the left of this cache
   1227  * bank, there is no null byte.
   1228  */
   1229 	.p2align 4
   1230 LABEL(ashr_6):	/* shift-by-6 case: pick the SSSE3 or the SSE2 copy loop */
   1231 	xor	%ecx, %ecx				/* clear index */
   1232 #ifdef USE_AS_STRNCPY
   1233 	cmp	%r10, %r8	/* strncpy: compare remaining count with r10 (set outside this section) */
   1234 	jbe	LABEL(unaligned_exit)	/* remaining <= r10: let the exit tail finish */
   1235 #endif
   1236 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
   1237 	jz	LABEL(ashr_6_use_sse2)	/* flag clear: take the SSE2 loop */
   1238 
   1239 	.p2align 4
   1240 LABEL(ashr_6_use_ssse3):	/* SSSE3 copy loop, unrolled twice: palignr $6 realigns the source data */
   1241 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1242 	pcmpeqb	%xmm3, %xmm0	/* flag null bytes in xmm3 (expects xmm0 == 0 on entry) */
   1243 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1244 	test	%edx, %edx
   1245 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1246 #ifdef USE_AS_STRNCPY
   1247 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1248  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1249 #endif
   1250 
   1251 	#palignr $6, (%rsi, %rcx), %xmm3
   1252 	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode bytes */
   1253 	.byte	0x1c, 0x0e, 0x06	/* ModRM and SIB for (%rsi,%rcx),%xmm3; imm8 = 6 */
   1254 
   1255 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1256 	add	$16, %rcx	/* advance the index shared by source and destination */
   1257 
   1258 #ifdef USE_AS_STRNCPY
   1259 	cmp	%r10, %r8
   1260 	jbe	LABEL(unaligned_exit)
   1261 #endif
   1262 	movdqa	16(%rsi, %rcx), %xmm3	/* second unrolled iteration: same steps as above */
   1263 	pcmpeqb %xmm3, %xmm0
   1264 	pmovmskb %xmm0, %edx
   1265 	test	%edx, %edx
   1266 	jnz	LABEL(unaligned_exit)
   1267 #ifdef USE_AS_STRNCPY
   1268 	sub	$16, %r8
   1269  	jbe	LABEL(strncpy_truncation_unaligned)
   1270 #endif
   1271 
   1272 	#palignr $6, (%rsi, %rcx), %xmm3
   1273 	.byte	0x66, 0x0F, 0x3A ,0x0F
   1274 	.byte	0x1c, 0x0e, 0x06
   1275 
   1276 	movdqa	%xmm3, (%rdi, %rcx)
   1277 	add	$16, %rcx
   1278 #ifdef USE_AS_STRNCPY
   1279 	cmp	%r10, %r8
   1280 	jbe	LABEL(unaligned_exit)
   1281 #endif
   1282 	jmp	LABEL(ashr_6_use_ssse3)	/* keep looping until a null or the count stops us */
   1283 
   1284 	.p2align 4
   1285 LABEL(ashr_6_use_sse2):	/* SSE2 copy loop, unrolled twice: shifts + por stand in for palignr $6 */
   1286 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* flag null bytes in the next 16 source bytes (expects xmm0 == 0 on entry) */
   1287 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1288 	test	%edx, %edx
   1289 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1290 #ifdef USE_AS_STRNCPY
   1291 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1292  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1293 #endif
   1294 
   1295 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1296 	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current aligned source block */
   1297 
   1298 	psrldq	$6, %xmm2	/* drop the low 6 bytes of the current block */
   1299 	pslldq	$10, %xmm3	/* move the low 6 bytes of the next block to the top */
   1300 	por	%xmm2, %xmm3	/* splice both parts into one 16-byte chunk */
   1301 
   1302 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1303 	add	$16, %rcx	/* advance the index shared by source and destination */
   1304 
   1305 #ifdef USE_AS_STRNCPY
   1306 	cmp	%r10, %r8
   1307 	jbe	LABEL(unaligned_exit)
   1308 #endif
   1309 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* second unrolled iteration: same steps as above */
   1310 	pmovmskb %xmm0, %edx
   1311 	test	%edx, %edx
   1312 	jnz	LABEL(unaligned_exit)
   1313 #ifdef USE_AS_STRNCPY
   1314 	sub	$16, %r8
   1315  	jbe	LABEL(strncpy_truncation_unaligned)
   1316 #endif
   1317 
   1318 	movdqa	16(%rsi, %rcx), %xmm3
   1319 	movdqa	(%rsi, %rcx), %xmm2
   1320 
   1321 	psrldq	$6, %xmm2
   1322 	pslldq	$10, %xmm3
   1323 	por	%xmm2, %xmm3
   1324 
   1325 	movdqa	%xmm3, (%rdi, %rcx)
   1326 	add	$16, %rcx
   1327 #ifdef USE_AS_STRNCPY
   1328 	cmp	%r10, %r8
   1329 	jbe	LABEL(unaligned_exit)
   1330 #endif
   1331 	jmp	LABEL(ashr_6_use_sse2)	/* keep looping until a null or the count stops us */
   1332 
   1333 
   1334 /*
   1335  * ashr_5 handles the following cases:
   1336  * 	(16 + (src offset - dest offset)) % 16 = 5
   1337  *
   1338  * Based on above operation, start from (%r9 + rsi) to the left of this cache
   1339  * bank, there is no null byte.
   1340  */
   1341 	.p2align 4
   1342 LABEL(ashr_5):	/* shift-by-5 case: pick the SSSE3 or the SSE2 copy loop */
   1343 	xor	%ecx, %ecx				/* clear index */
   1344 #ifdef USE_AS_STRNCPY
   1345 	cmp	%r10, %r8	/* strncpy: compare remaining count with r10 (set outside this section) */
   1346 	jbe	LABEL(unaligned_exit)	/* remaining <= r10: let the exit tail finish */
   1347 #endif
   1348 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
   1349 	jz	LABEL(ashr_5_use_sse2)	/* flag clear: take the SSE2 loop */
   1350 
   1351 	.p2align 4
   1352 LABEL(ashr_5_use_ssse3):	/* SSSE3 copy loop, unrolled twice: palignr $5 realigns the source data */
   1353 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1354 	pcmpeqb	%xmm3, %xmm0	/* flag null bytes in xmm3 (expects xmm0 == 0 on entry) */
   1355 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1356 	test	%edx, %edx
   1357 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1358 #ifdef USE_AS_STRNCPY
   1359 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1360  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1361 #endif
   1362 
   1363 	#palignr $5, (%rsi, %rcx), %xmm3
   1364 	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode bytes */
   1365 	.byte	0x1c, 0x0e, 0x05	/* ModRM and SIB for (%rsi,%rcx),%xmm3; imm8 = 5 */
   1366 
   1367 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1368 	add	$16, %rcx	/* advance the index shared by source and destination */
   1369 
   1370 #ifdef USE_AS_STRNCPY
   1371 	cmp	%r10, %r8
   1372 	jbe	LABEL(unaligned_exit)
   1373 #endif
   1374 	movdqa	16(%rsi, %rcx), %xmm3	/* second unrolled iteration: same steps as above */
   1375 	pcmpeqb %xmm3, %xmm0
   1376 	pmovmskb %xmm0, %edx
   1377 	test	%edx, %edx
   1378 	jnz	LABEL(unaligned_exit)
   1379 #ifdef USE_AS_STRNCPY
   1380 	sub	$16, %r8
   1381  	jbe	LABEL(strncpy_truncation_unaligned)
   1382 #endif
   1383 
   1384 	#palignr $5, (%rsi, %rcx), %xmm3
   1385 	.byte	0x66, 0x0F, 0x3A ,0x0F
   1386 	.byte	0x1c, 0x0e, 0x05
   1387 
   1388 	movdqa	%xmm3, (%rdi, %rcx)
   1389 	add	$16, %rcx
   1390 #ifdef USE_AS_STRNCPY
   1391 	cmp	%r10, %r8
   1392 	jbe	LABEL(unaligned_exit)
   1393 #endif
   1394 	jmp	LABEL(ashr_5_use_ssse3)	/* keep looping until a null or the count stops us */
   1395 
   1396 	.p2align 4
   1397 LABEL(ashr_5_use_sse2):	/* SSE2 copy loop, unrolled twice: shifts + por stand in for palignr $5 */
   1398 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* flag null bytes in the next 16 source bytes (expects xmm0 == 0 on entry) */
   1399 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1400 	test	%edx, %edx
   1401 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1402 #ifdef USE_AS_STRNCPY
   1403 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1404  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1405 #endif
   1406 
   1407 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1408 	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current aligned source block */
   1409 
   1410 	psrldq	$5, %xmm2	/* drop the low 5 bytes of the current block */
   1411 	pslldq	$11, %xmm3	/* move the low 5 bytes of the next block to the top */
   1412 	por	%xmm2, %xmm3	/* splice both parts into one 16-byte chunk */
   1413 
   1414 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1415 	add	$16, %rcx	/* advance the index shared by source and destination */
   1416 
   1417 #ifdef USE_AS_STRNCPY
   1418 	cmp	%r10, %r8
   1419 	jbe	LABEL(unaligned_exit)
   1420 #endif
   1421 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* second unrolled iteration: same steps as above */
   1422 	pmovmskb %xmm0, %edx
   1423 	test	%edx, %edx
   1424 	jnz	LABEL(unaligned_exit)
   1425 #ifdef USE_AS_STRNCPY
   1426 	sub	$16, %r8
   1427  	jbe	LABEL(strncpy_truncation_unaligned)
   1428 #endif
   1429 
   1430 	movdqa	16(%rsi, %rcx), %xmm3
   1431 	movdqa	(%rsi, %rcx), %xmm2
   1432 
   1433 	psrldq	$5, %xmm2
   1434 	pslldq	$11, %xmm3
   1435 	por	%xmm2, %xmm3
   1436 
   1437 	movdqa	%xmm3, (%rdi, %rcx)
   1438 	add	$16, %rcx
   1439 #ifdef USE_AS_STRNCPY
   1440 	cmp	%r10, %r8
   1441 	jbe	LABEL(unaligned_exit)
   1442 #endif
   1443 	jmp	LABEL(ashr_5_use_sse2)	/* keep looping until a null or the count stops us */
   1444 
   1445 
   1446 /*
   1447  * ashr_4 handles the following cases:
   1448  * 	(16 + (src offset - dest offset)) % 16 = 4
   1449  *
   1450  * Based on above operation, start from (%r9 + rsi) to the left of this cache
   1451  * bank, there is no null byte.
   1452  */
   1453 	.p2align 4
   1454 LABEL(ashr_4):	/* shift-by-4 case: pick the SSSE3 or the SSE2 copy loop */
   1455 	xor	%ecx, %ecx				/* clear index */
   1456 #ifdef USE_AS_STRNCPY
   1457 	cmp	%r10, %r8	/* strncpy: compare remaining count with r10 (set outside this section) */
   1458 	jbe	LABEL(unaligned_exit)	/* remaining <= r10: let the exit tail finish */
   1459 #endif
   1460 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
   1461 	jz	LABEL(ashr_4_use_sse2)	/* flag clear: take the SSE2 loop */
   1462 
   1463 	.p2align 4
   1464 LABEL(ashr_4_use_ssse3):	/* SSSE3 copy loop, unrolled twice: palignr $4 realigns the source data */
   1465 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1466 	pcmpeqb	%xmm3, %xmm0	/* flag null bytes in xmm3 (expects xmm0 == 0 on entry) */
   1467 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1468 	test	%edx, %edx
   1469 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1470 #ifdef USE_AS_STRNCPY
   1471 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1472  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1473 #endif
   1474 
   1475 	#palignr $4, (%rsi, %rcx), %xmm3
   1476 	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode bytes */
   1477 	.byte	0x1c, 0x0e, 0x04	/* ModRM and SIB for (%rsi,%rcx),%xmm3; imm8 = 4 */
   1478 
   1479 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1480 	add	$16, %rcx	/* advance the index shared by source and destination */
   1481 
   1482 #ifdef USE_AS_STRNCPY
   1483 	cmp	%r10, %r8
   1484 	jbe	LABEL(unaligned_exit)
   1485 #endif
   1486 	movdqa	16(%rsi, %rcx), %xmm3	/* second unrolled iteration: same steps as above */
   1487 	pcmpeqb %xmm3, %xmm0
   1488 	pmovmskb %xmm0, %edx
   1489 	test	%edx, %edx
   1490 	jnz	LABEL(unaligned_exit)
   1491 #ifdef USE_AS_STRNCPY
   1492 	sub	$16, %r8
   1493  	jbe	LABEL(strncpy_truncation_unaligned)
   1494 #endif
   1495 
   1496 	#palignr $4, (%rsi, %rcx), %xmm3
   1497 	.byte	0x66, 0x0F, 0x3A ,0x0F
   1498 	.byte	0x1c, 0x0e, 0x04
   1499 
   1500 	movdqa	%xmm3, (%rdi, %rcx)
   1501 	add	$16, %rcx
   1502 #ifdef USE_AS_STRNCPY
   1503 	cmp	%r10, %r8
   1504 	jbe	LABEL(unaligned_exit)
   1505 #endif
   1506 	jmp	LABEL(ashr_4_use_ssse3)	/* keep looping until a null or the count stops us */
   1507 
   1508 	.p2align 4
   1509 LABEL(ashr_4_use_sse2):	/* SSE2 copy loop, unrolled twice: shifts + por stand in for palignr $4 */
   1510 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* flag null bytes in the next 16 source bytes (expects xmm0 == 0 on entry) */
   1511 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1512 	test	%edx, %edx
   1513 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1514 #ifdef USE_AS_STRNCPY
   1515 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1516  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1517 #endif
   1518 
   1519 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1520 	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current aligned source block */
   1521 
   1522 	psrldq	$4, %xmm2	/* drop the low 4 bytes of the current block */
   1523 	pslldq	$12, %xmm3	/* move the low 4 bytes of the next block to the top */
   1524 	por	%xmm2, %xmm3	/* splice both parts into one 16-byte chunk */
   1525 
   1526 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1527 	add	$16, %rcx	/* advance the index shared by source and destination */
   1528 
   1529 #ifdef USE_AS_STRNCPY
   1530 	cmp	%r10, %r8
   1531 	jbe	LABEL(unaligned_exit)
   1532 #endif
   1533 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* second unrolled iteration: same steps as above */
   1534 	pmovmskb %xmm0, %edx
   1535 	test	%edx, %edx
   1536 	jnz	LABEL(unaligned_exit)
   1537 #ifdef USE_AS_STRNCPY
   1538 	sub	$16, %r8
   1539  	jbe	LABEL(strncpy_truncation_unaligned)
   1540 #endif
   1541 
   1542 	movdqa	16(%rsi, %rcx), %xmm3
   1543 	movdqa	(%rsi, %rcx), %xmm2
   1544 
   1545 	psrldq	$4, %xmm2
   1546 	pslldq	$12, %xmm3
   1547 	por	%xmm2, %xmm3
   1548 
   1549 	movdqa	%xmm3, (%rdi, %rcx)
   1550 	add	$16, %rcx
   1551 #ifdef USE_AS_STRNCPY
   1552 	cmp	%r10, %r8
   1553 	jbe	LABEL(unaligned_exit)
   1554 #endif
   1555 	jmp	LABEL(ashr_4_use_sse2)	/* keep looping until a null or the count stops us */
   1556 
   1557 
   1558 /*
   1559  * ashr_3 handles the following cases:
   1560  * 	(16 + (src offset - dest offset)) % 16 = 3
   1561  *
   1562  * Based on above operation, start from (%r9 + rsi) to the left of this cache
   1563  * bank, there is no null byte.
   1564  */
   1565 	.p2align 4
   1566 LABEL(ashr_3):	/* shift-by-3 case: pick the SSSE3 or the SSE2 copy loop */
   1567 	xor	%ecx, %ecx				/* clear index */
   1568 #ifdef USE_AS_STRNCPY
   1569 	cmp	%r10, %r8	/* strncpy: compare remaining count with r10 (set outside this section) */
   1570 	jbe	LABEL(unaligned_exit)	/* remaining <= r10: let the exit tail finish */
   1571 #endif
   1572 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
   1573 	jz	LABEL(ashr_3_use_sse2)	/* flag clear: take the SSE2 loop */
   1574 
   1575 	.p2align 4
   1576 LABEL(ashr_3_use_ssse3):	/* SSSE3 copy loop, unrolled twice: palignr $3 realigns the source data */
   1577 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1578 	pcmpeqb	%xmm3, %xmm0	/* flag null bytes in xmm3 (expects xmm0 == 0 on entry) */
   1579 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1580 	test	%edx, %edx
   1581 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1582 #ifdef USE_AS_STRNCPY
   1583 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1584  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1585 #endif
   1586 
   1587 	#palignr $3, (%rsi, %rcx), %xmm3
   1588 	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode bytes */
   1589 	.byte	0x1c, 0x0e, 0x03	/* ModRM and SIB for (%rsi,%rcx),%xmm3; imm8 = 3 */
   1590 
   1591 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1592 	add	$16, %rcx	/* advance the index shared by source and destination */
   1593 
   1594 #ifdef USE_AS_STRNCPY
   1595 	cmp	%r10, %r8
   1596 	jbe	LABEL(unaligned_exit)
   1597 #endif
   1598 	movdqa	16(%rsi, %rcx), %xmm3	/* second unrolled iteration: same steps as above */
   1599 	pcmpeqb %xmm3, %xmm0
   1600 	pmovmskb %xmm0, %edx
   1601 	test	%edx, %edx
   1602 	jnz	LABEL(unaligned_exit)
   1603 #ifdef USE_AS_STRNCPY
   1604 	sub	$16, %r8
   1605  	jbe	LABEL(strncpy_truncation_unaligned)
   1606 #endif
   1607 
   1608 	#palignr $3, (%rsi, %rcx), %xmm3
   1609 	.byte	0x66, 0x0F, 0x3A ,0x0F
   1610 	.byte	0x1c, 0x0e, 0x03
   1611 
   1612 	movdqa	%xmm3, (%rdi, %rcx)
   1613 	add	$16, %rcx
   1614 #ifdef USE_AS_STRNCPY
   1615 	cmp	%r10, %r8
   1616 	jbe	LABEL(unaligned_exit)
   1617 #endif
   1618 	jmp	LABEL(ashr_3_use_ssse3)	/* keep looping until a null or the count stops us */
   1619 
   1620 	.p2align 4
   1621 LABEL(ashr_3_use_sse2):	/* SSE2 copy loop, unrolled twice: shifts + por stand in for palignr $3 */
   1622 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* flag null bytes in the next 16 source bytes (expects xmm0 == 0 on entry) */
   1623 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1624 	test	%edx, %edx
   1625 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1626 #ifdef USE_AS_STRNCPY
   1627 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1628  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1629 #endif
   1630 
   1631 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1632 	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current aligned source block */
   1633 
   1634 	psrldq	$3, %xmm2	/* drop the low 3 bytes of the current block */
   1635 	pslldq	$13, %xmm3	/* move the low 3 bytes of the next block to the top */
   1636 	por	%xmm2, %xmm3	/* splice both parts into one 16-byte chunk */
   1637 
   1638 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1639 	add	$16, %rcx	/* advance the index shared by source and destination */
   1640 
   1641 #ifdef USE_AS_STRNCPY
   1642 	cmp	%r10, %r8
   1643 	jbe	LABEL(unaligned_exit)
   1644 #endif
   1645 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* second unrolled iteration: same steps as above */
   1646 	pmovmskb %xmm0, %edx
   1647 	test	%edx, %edx
   1648 	jnz	LABEL(unaligned_exit)
   1649 #ifdef USE_AS_STRNCPY
   1650 	sub	$16, %r8
   1651  	jbe	LABEL(strncpy_truncation_unaligned)
   1652 #endif
   1653 
   1654 	movdqa	16(%rsi, %rcx), %xmm3
   1655 	movdqa	(%rsi, %rcx), %xmm2
   1656 
   1657 	psrldq	$3, %xmm2
   1658 	pslldq	$13, %xmm3
   1659 	por	%xmm2, %xmm3
   1660 
   1661 	movdqa	%xmm3, (%rdi, %rcx)
   1662 	add	$16, %rcx
   1663 #ifdef USE_AS_STRNCPY
   1664 	cmp	%r10, %r8
   1665 	jbe	LABEL(unaligned_exit)
   1666 #endif
   1667 	jmp	LABEL(ashr_3_use_sse2)	/* keep looping until a null or the count stops us */
   1668 
   1669 
   1670 /*
   1671  * ashr_2 handles the following cases:
   1672  * 	(16 + (src offset - dest offset)) % 16 = 2
   1673  *
   1674  * Based on above operation, start from (%r9 + rsi) to the left of this cache
   1675  * bank, there is no null byte.
   1676  */
   1677 	.p2align 4
   1678 LABEL(ashr_2):	/* shift-by-2 case: pick the SSSE3 or the SSE2 copy loop */
   1679 	xor	%ecx, %ecx				/* clear index */
   1680 #ifdef USE_AS_STRNCPY
   1681 	cmp	%r10, %r8	/* strncpy: compare remaining count with r10 (set outside this section) */
   1682 	jbe	LABEL(unaligned_exit)	/* remaining <= r10: let the exit tail finish */
   1683 #endif
   1684 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
   1685 	jz	LABEL(ashr_2_use_sse2)	/* flag clear: take the SSE2 loop */
   1686 
   1687 	.p2align 4
   1688 LABEL(ashr_2_use_ssse3):	/* SSSE3 copy loop, unrolled twice: palignr $2 realigns the source data */
   1689 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1690 	pcmpeqb	%xmm3, %xmm0	/* flag null bytes in xmm3 (expects xmm0 == 0 on entry) */
   1691 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1692 	test	%edx, %edx
   1693 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1694 #ifdef USE_AS_STRNCPY
   1695 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1696  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1697 #endif
   1698 
   1699 	#palignr $2, (%rsi, %rcx), %xmm3
   1700 	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode bytes */
   1701 	.byte	0x1c, 0x0e, 0x02	/* ModRM and SIB for (%rsi,%rcx),%xmm3; imm8 = 2 */
   1702 
   1703 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1704 	add	$16, %rcx	/* advance the index shared by source and destination */
   1705 
   1706 #ifdef USE_AS_STRNCPY
   1707 	cmp	%r10, %r8
   1708 	jbe	LABEL(unaligned_exit)
   1709 #endif
   1710 	movdqa	16(%rsi, %rcx), %xmm3	/* second unrolled iteration: same steps as above */
   1711 	pcmpeqb %xmm3, %xmm0
   1712 	pmovmskb %xmm0, %edx
   1713 	test	%edx, %edx
   1714 	jnz	LABEL(unaligned_exit)
   1715 #ifdef USE_AS_STRNCPY
   1716 	sub	$16, %r8
   1717  	jbe	LABEL(strncpy_truncation_unaligned)
   1718 #endif
   1719 
   1720 	#palignr $2, (%rsi, %rcx), %xmm3
   1721 	.byte	0x66, 0x0F, 0x3A ,0x0F
   1722 	.byte	0x1c, 0x0e, 0x02
   1723 
   1724 	movdqa	%xmm3, (%rdi, %rcx)
   1725 	add	$16, %rcx
   1726 #ifdef USE_AS_STRNCPY
   1727 	cmp	%r10, %r8
   1728 	jbe	LABEL(unaligned_exit)
   1729 #endif
   1730 	jmp	LABEL(ashr_2_use_ssse3)	/* keep looping until a null or the count stops us */
   1731 
   1732 	.p2align 4
   1733 LABEL(ashr_2_use_sse2):	/* SSE2 copy loop, unrolled twice: shifts + por stand in for palignr $2 */
   1734 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* flag null bytes in the next 16 source bytes (expects xmm0 == 0 on entry) */
   1735 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1736 	test	%edx, %edx
   1737 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1738 #ifdef USE_AS_STRNCPY
   1739 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1740  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1741 #endif
   1742 
   1743 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1744 	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current aligned source block */
   1745 
   1746 	psrldq	$2, %xmm2	/* drop the low 2 bytes of the current block */
   1747 	pslldq	$14, %xmm3	/* move the low 2 bytes of the next block to the top */
   1748 	por	%xmm2, %xmm3	/* splice both parts into one 16-byte chunk */
   1749 
   1750 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1751 	add	$16, %rcx	/* advance the index shared by source and destination */
   1752 
   1753 #ifdef USE_AS_STRNCPY
   1754 	cmp	%r10, %r8
   1755 	jbe	LABEL(unaligned_exit)
   1756 #endif
   1757 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* second unrolled iteration: same steps as above */
   1758 	pmovmskb %xmm0, %edx
   1759 	test	%edx, %edx
   1760 	jnz	LABEL(unaligned_exit)
   1761 #ifdef USE_AS_STRNCPY
   1762 	sub	$16, %r8
   1763  	jbe	LABEL(strncpy_truncation_unaligned)
   1764 #endif
   1765 
   1766 	movdqa	16(%rsi, %rcx), %xmm3
   1767 	movdqa	(%rsi, %rcx), %xmm2
   1768 
   1769 	psrldq	$2, %xmm2
   1770 	pslldq	$14, %xmm3
   1771 	por	%xmm2, %xmm3
   1772 
   1773 	movdqa	%xmm3, (%rdi, %rcx)
   1774 	add	$16, %rcx
   1775 #ifdef USE_AS_STRNCPY
   1776 	cmp	%r10, %r8
   1777 	jbe	LABEL(unaligned_exit)
   1778 #endif
   1779 	jmp	LABEL(ashr_2_use_sse2)	/* keep looping until a null or the count stops us */
   1780 
   1781 
   1782 /*
   1783  * ashr_1 handles the following cases:
   1784  * 	(16 + (src offset - dest offset)) % 16 = 1
   1785  *
   1786  * Based on above operation, start from (%r9 + rsi) to the left of this cache
   1787  * bank, there is no null byte.
   1788  */
   1789 	.p2align 4
   1790 LABEL(ashr_1):	/* shift-by-1 case: pick the SSSE3 or the SSE2 copy loop */
   1791 	xor	%ecx, %ecx				/* clear index */
   1792 #ifdef USE_AS_STRNCPY
   1793 	cmp	%r10, %r8	/* strncpy: compare remaining count with r10 (set outside this section) */
   1794 	jbe	LABEL(unaligned_exit)	/* remaining <= r10: let the exit tail finish */
   1795 #endif
   1796 	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
   1797 	jz	LABEL(ashr_1_use_sse2)	/* flag clear: take the SSE2 loop */
   1798 
   1799 	.p2align 4
   1800 LABEL(ashr_1_use_ssse3):	/* SSSE3 copy loop, unrolled twice: palignr $1 realigns the source data */
   1801 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1802 	pcmpeqb	%xmm3, %xmm0	/* flag null bytes in xmm3 (expects xmm0 == 0 on entry) */
   1803 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1804 	test	%edx, %edx
   1805 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1806 #ifdef USE_AS_STRNCPY
   1807 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1808  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1809 #endif
   1810 
   1811 	#palignr $1, (%rsi, %rcx), %xmm3
   1812 	.byte	0x66, 0x0F, 0x3A ,0x0F	/* hand-encoded palignr opcode bytes */
   1813 	.byte	0x1c, 0x0e, 0x01	/* ModRM and SIB for (%rsi,%rcx),%xmm3; imm8 = 1 */
   1814 
   1815 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1816 	add	$16, %rcx	/* advance the index shared by source and destination */
   1817 
   1818 #ifdef USE_AS_STRNCPY
   1819 	cmp	%r10, %r8
   1820 	jbe	LABEL(unaligned_exit)
   1821 #endif
   1822 	movdqa	16(%rsi, %rcx), %xmm3	/* second unrolled iteration: same steps as above */
   1823 	pcmpeqb %xmm3, %xmm0
   1824 	pmovmskb %xmm0, %edx
   1825 	test	%edx, %edx
   1826 	jnz	LABEL(unaligned_exit)
   1827 #ifdef USE_AS_STRNCPY
   1828 	sub	$16, %r8
   1829  	jbe	LABEL(strncpy_truncation_unaligned)
   1830 #endif
   1831 	#palignr $1, (%rsi, %rcx), %xmm3
   1832 	.byte	0x66, 0x0F, 0x3A ,0x0F
   1833 	.byte	0x1c, 0x0e, 0x01
   1834 
   1835 	movdqa	%xmm3, (%rdi, %rcx)
   1836 	add	$16, %rcx
   1837 #ifdef USE_AS_STRNCPY
   1838 	cmp	%r10, %r8
   1839 	jbe	LABEL(unaligned_exit)
   1840 #endif
   1841 	jmp	LABEL(ashr_1_use_ssse3)	/* keep looping until a null or the count stops us */
   1842 
   1843 	.p2align 4
   1844 LABEL(ashr_1_use_sse2):	/* SSE2 copy loop, unrolled twice: shifts + por stand in for palignr $1 */
   1845 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* flag null bytes in the next 16 source bytes (expects xmm0 == 0 on entry) */
   1846 	pmovmskb %xmm0, %edx	/* edx = per-byte null mask */
   1847 	test	%edx, %edx
   1848 	jnz	LABEL(unaligned_exit)	/* null in the next block: finish in the exit tail */
   1849 #ifdef USE_AS_STRNCPY
   1850 	sub	$16, %r8	/* strncpy: 16 more bytes charged against the count */
   1851  	jbe	LABEL(strncpy_truncation_unaligned)	/* count ran out before a null was found */
   1852 #endif
   1853 	movdqa	16(%rsi, %rcx), %xmm3	/* xmm3 = next aligned source block */
   1854 	movdqa	(%rsi, %rcx), %xmm2	/* xmm2 = current aligned source block */
   1855 
   1856 	psrldq	$1, %xmm2	/* drop the low byte of the current block */
   1857 	pslldq	$15, %xmm3	/* move the low byte of the next block to the top */
   1858 	por	%xmm2, %xmm3	/* splice both parts into one 16-byte chunk */
   1859 
   1860 	movdqa	%xmm3, (%rdi, %rcx)	/* aligned 16-byte store to the destination */
   1861 	add	$16, %rcx	/* advance the index shared by source and destination */
   1862 
   1863 #ifdef USE_AS_STRNCPY
   1864 	cmp	%r10, %r8
   1865 	jbe	LABEL(unaligned_exit)
   1866 #endif
   1867 	pcmpeqb 16(%rsi, %rcx), %xmm0	/* second unrolled iteration: same steps as above */
   1868 	pmovmskb %xmm0, %edx
   1869 	test	%edx, %edx
   1870 	jnz	LABEL(unaligned_exit)
   1871 #ifdef USE_AS_STRNCPY
   1872 	sub	$16, %r8
   1873  	jbe	LABEL(strncpy_truncation_unaligned)
   1874 #endif
   1875 
   1876 	movdqa	16(%rsi, %rcx), %xmm3
   1877 	movdqa	(%rsi, %rcx), %xmm2
   1878 
   1879 	psrldq	$1, %xmm2
   1880 	pslldq	$15, %xmm3
   1881 	por	%xmm2, %xmm3
   1882 
   1883 	movdqa	%xmm3, (%rdi, %rcx)
   1884 	add	$16, %rcx
   1885 #ifdef USE_AS_STRNCPY
   1886 	cmp	%r10, %r8
   1887 	jbe	LABEL(unaligned_exit)
   1888 #endif
   1889 	jmp	LABEL(ashr_1_use_sse2)	/* keep looping until a null or the count stops us */
   1890 
   1891 
   1892 	/*
   1893 	 * Exit tail code:
   1894 	 * Up to 32 bytes are copied in the case of strcpy.
   1895 	 */
   1896 	.p2align 4
   1897 LABEL(less32bytes):	/* exit entry that starts from index 0 */
   1898 	xor	%ecx, %ecx
   1899 LABEL(unaligned_exit):	/* exit from the ashr_N loops: rcx = index, edx = null mask */
   1900 	add	%r9, %rsi		/* r9 holds offset of rsi */
   1901 	mov	%rcx, %r9		/* stash the index: cl is needed as the shift count */
   1902 	mov	%r10, %rcx		/* shift count comes from r10 (set outside this section) */
   1903 	shl	%cl, %edx		/* after shl, calculate the exact number to be filled */
   1904 	mov	%r9, %rcx		/* restore the index */
   1905 	.p2align 4
   1906 LABEL(aligned_exit):
   1907 	add	%rcx, %rdi		/* locate exact address for rdi */
   1908 LABEL(less16bytes):
   1909 	add	%rcx, %rsi		/* locate exact address for rsi */
   1910 LABEL(aligned_16bytes):
   1911 #ifdef USE_AS_STRNCPY
   1912 	/*
   1913 	 * Null found in 16bytes checked. Set bit in bitmask corresponding to
   1914 	 * the strncpy count argument. We will copy to the null (inclusive)
   1915 	 * or count whichever comes first.
   1916 	 */
   1917 	mov	$1, %r9d
   1918 	lea	-1(%r8), %rcx		/* bit position = remaining count - 1 */
   1919 	shl	%cl, %r9d		/* r9d = mask bit for the last byte the count allows */
   1920 	cmp	$32, %r8
   1921 	ja	LABEL(strncpy_tail)	/* count > 32: leave the mask untouched */
   1922 	or	%r9d, %edx		/* otherwise the count boundary acts like a null */
   1923 LABEL(strncpy_tail):
   1924 #endif
   1925 	/*
   1926 	 * Check to see if BSF is fast on this processor. If not, use a
   1927 	 * different exit tail.
   1928 	 */
   1929 	testb	$USE_BSF, .memops_method(%rip)
   1930 	jz	LABEL(AMD_exit)		/* flag clear: scan the mask with bit tests instead */
   1931 	bsf	%rdx, %rcx		/* Find byte with null char */
   1932 	lea	LABEL(tail_table)(%rip), %r11	/* r11 = table base (table defined outside this section) */
   1933 	movslq	(%r11, %rcx, 4), %rcx	/* sign-extended 32-bit entry for this bit index */
   1934 	lea	(%r11, %rcx), %rcx	/* entry is an offset from the table base */
   1935 	jmp	*%rcx			/* dispatch to the selected tail routine */
   1936 
   1937 #ifdef USE_AS_STRNCPY
   1938 	/*
   1939 	 * Count reached before null found.
   1940 	 */
   1941 	.p2align 4
   1942 LABEL(less32bytes_strncpy_truncation):	/* truncation entry that starts from index 0 */
   1943 	xor	%ecx, %ecx
   1944 LABEL(strncpy_truncation_unaligned):	/* count exhausted inside an ashr_N loop: rcx = index */
   1945 	add	%r9, %rsi		/* next src char to copy */
   1946 LABEL(strncpy_truncation_aligned):
   1947 	add	%rcx, %rdi		/* rdi = exact destination position */
   1948 	add	%rcx, %rsi		/* rsi = exact source position */
   1949 	add	$16, %r8		/* compensation */
   1950 	lea	-1(%r8), %rcx		/* table index = bytes still to copy - 1 */
   1951 	lea	LABEL(tail_table)(%rip), %r11	/* r11 = table base (table defined outside this section) */
   1952 	movslq	(%r11, %rcx, 4), %rcx	/* sign-extended 32-bit entry for that index */
   1953 	lea	(%r11, %rcx), %rcx	/* entry is an offset from the table base */
   1954 	jmp	*%rcx			/* dispatch to the selected tail routine */
   1955 
   1956 	.p2align 4
   1957 LABEL(strncpy_exitz):	/* reached from the strncpy entry when its count test finds zero: copy nothing */
   1958 	mov	%rdi, %rax		/* return the destination pointer */
   1959 	ret
   1960 #endif
   1961 
   1962 	.p2align 4
   1963 LABEL(AMD_exit):	/* exit dispatch used when USE_BSF is clear: find the lowest set bit of edx by testing */
   1964 	test	%dl, %dl
   1965 	jz	LABEL(AMD_exit_more_8)	/* nothing in bits 0-7: look at bits 8-15 */
   1966 	test	$0x01, %dl
   1967 	jnz	LABEL(tail_0)		/* lowest set bit k selects tail_k */
   1968 	test	$0x02, %dl
   1969 	jnz	LABEL(tail_1)
   1970 	test	$0x04, %dl
   1971 	jnz	LABEL(tail_2)
   1972 	test	$0x08, %dl
   1973 	jnz	LABEL(tail_3)
   1974 	test	$0x10, %dl
   1975 	jnz	LABEL(tail_4)
   1976 	test	$0x20, %dl
   1977 	jnz	LABEL(tail_5)
   1978 	test	$0x40, %dl
   1979 	jnz	LABEL(tail_6)		/* none of bits 0-6: bit 7 it is, fall into tail_7 */
   1980 
   1981 	.p2align 4
   1982 LABEL(tail_7):				/* 8 bytes */
   1983 	mov	(%rsi), %rcx
   1984 	mov	%rcx, (%rdi)		/* one 8-byte move copies everything */
   1985 #ifdef USE_AS_STRNCPY
   1986 	mov	$8, %cl			/* cl = bytes just copied, read by strncpy_fill_tail */
   1987 	sub	$8, %r8			/* r8 = count still unfilled */
   1988 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   1989 #endif
   1990 	ret
   1991 
   1992 #ifdef USE_AS_STRNCPY
   1993 	/*
   1994 	 * Null terminated src string shorter than count. Fill the rest of the
   1995 	 * destination with null chars.
   1996 	 */
   1997 	.p2align 4
   1998 LABEL(strncpy_fill_tail):	/* in: cl = bytes just copied at rdi, r8 = bytes left to null-fill, rax = return value */
   1999 	mov	%rax, %rdx		/* park the return value: rax/al carries the fill byte below */
   2000 	movzx	%cl, %rax		/* rax = number of bytes the tail just copied */
   2001 	mov	%r8, %rcx
   2002 	add	%rax, %rdi		/* rdi = first destination byte past the copied data */
   2003 	xor	%eax, %eax		/* fill value: zero */
   2004 	shr	$3, %rcx		/* qword count from the full 64-bit remainder (a 32-bit shift dropped counts >= 4GB) */
   2005 	jz	LABEL(strncpy_fill_less_8)	/* fewer than 8 bytes to fill: skip the qword pass */
   2006 
   2007 	rep	stosq			/* store rcx zero qwords, advancing rdi */
   2008 LABEL(strncpy_fill_less_8):
   2009 	mov	%r8, %rcx
   2010 	and	$7, %rcx		/* 0-7 trailing bytes left after the qword pass */
   2011 	jz	LABEL(strncpy_fill_return)
   2012 LABEL(strncpy_fill_less_7):
   2013 	sub	$1, %ecx		/* walk the trailing bytes from last to first */
   2014 	mov	%al, (%rdi, %rcx)	/* mov leaves the flags from sub intact for the jnz */
   2015 	jnz	LABEL(strncpy_fill_less_7)
   2016 LABEL(strncpy_fill_return):
   2017 	mov	%rdx, %rax		/* restore the return value parked at the top */
   2018 	ret
   2019 #endif
   2020 
   2021 	.p2align 4
   2022 LABEL(tail_0):				/* 1 byte */
   2023 	mov	(%rsi), %cl
   2024 	mov	%cl, (%rdi)		/* single byte move */
   2025 #ifdef USE_AS_STRNCPY
   2026 	mov	$1, %cl			/* cl = bytes just copied, read by strncpy_fill_tail */
   2027 	sub	$1, %r8			/* r8 = count still unfilled */
   2028 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2029 #endif
   2030 	ret
   2031 
   2032 	.p2align 4
   2033 LABEL(tail_1):				/* 2 bytes */
   2034 	mov	(%rsi), %cx
   2035 	mov	%cx, (%rdi)		/* one 2-byte move */
   2036 #ifdef USE_AS_STRNCPY
   2037 	mov	$2, %cl			/* cl = bytes just copied, read by strncpy_fill_tail */
   2038 	sub	$2, %r8			/* r8 = count still unfilled */
   2039 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2040 #endif
   2041 	ret
   2042 
   2043 	.p2align 4
   2044 LABEL(tail_2):				/* 3 bytes */
   2045 	mov	(%rsi), %cx
   2046 	mov	%cx, (%rdi)		/* bytes 0-1 */
   2047 	mov	1(%rsi), %cx
   2048 	mov	%cx, 1(%rdi)		/* bytes 1-2: overlapping move avoids a separate byte copy */
   2049 #ifdef USE_AS_STRNCPY
   2050 	mov	$3, %cl			/* cl = bytes just copied, read by strncpy_fill_tail */
   2051 	sub	$3, %r8			/* r8 = count still unfilled */
   2052 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2053 #endif
   2054 	ret
   2055 
   2056 	.p2align 4
   2057 LABEL(tail_3):				/* 4 bytes */
   2058 	mov	(%rsi), %ecx
   2059 	mov	%ecx, (%rdi)		/* one 4-byte move */
   2060 #ifdef USE_AS_STRNCPY
   2061 	mov	$4, %cl			/* cl = bytes just copied, read by strncpy_fill_tail */
   2062 	sub	$4, %r8			/* r8 = count still unfilled */
   2063 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2064 #endif
   2065 	ret
   2066 
   2067 	.p2align 4
   2068 LABEL(tail_4):				/* 5 bytes */
   2069 	mov	(%rsi), %ecx
   2070 	mov	%ecx, (%rdi)		/* bytes 0-3 */
   2071 	mov	1(%rsi), %edx
   2072 	mov	%edx, 1(%rdi)		/* bytes 1-4: overlapping 4-byte move */
   2073 #ifdef USE_AS_STRNCPY
   2074 	mov	$5, %cl			/* cl = bytes just copied, read by strncpy_fill_tail */
   2075 	sub	$5, %r8			/* r8 = count still unfilled */
   2076 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2077 #endif
   2078 	ret
   2079 
   2080 	.p2align 4
   2081 LABEL(tail_5):				/* 6 bytes */
   2082 	mov	(%rsi), %ecx
   2083 	mov	%ecx, (%rdi)		/* bytes 0-3 */
   2084 	mov	2(%rsi), %edx
   2085 	mov	%edx, 2(%rdi)		/* bytes 2-5: overlapping 4-byte move */
   2086 #ifdef USE_AS_STRNCPY
   2087 	mov	$6, %cl			/* cl = bytes just copied, read by strncpy_fill_tail */
   2088 	sub	$6, %r8			/* r8 = count still unfilled */
   2089 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2090 #endif
   2091 	ret
   2092 
   2093 	.p2align 4
   2094 LABEL(tail_6):				/* 7 bytes */
   2095 	mov	(%rsi), %ecx
   2096 	mov	%ecx, (%rdi)		/* bytes 0-3 */
   2097 	mov	3(%rsi), %edx
   2098 	mov	%edx,3(%rdi)		/* bytes 3-6: overlapping 4-byte move */
   2099 #ifdef USE_AS_STRNCPY
   2100 	mov	$7, %cl			/* cl = bytes just copied, read by strncpy_fill_tail */
   2101 	sub	$7, %r8			/* r8 = count still unfilled */
   2102 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2103 #endif
   2104 	ret
   2105 
   2106 	.p2align 4
   2107 LABEL(tail_8):				/* 9 bytes */
   2108 	mov	(%rsi), %rcx
   2109 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2110 	mov	5(%rsi), %edx
   2111 	mov	%edx, 5(%rdi)		/* bytes 5-8: overlapping 4-byte move */
   2112 #ifdef USE_AS_STRNCPY
   2113 	mov	$9, %cl			/* cl = bytes just copied, read by strncpy_fill_tail */
   2114 	sub	$9, %r8			/* r8 = count still unfilled */
   2115 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2116 #endif
   2117 	ret
   2118 
   2119 	.p2align 4
   2120 LABEL(AMD_exit_more_8):	/* bits 0-7 of edx were clear: find the lowest set bit among bits 8-15 */
   2121 	test	%dh, %dh
   2122 	jz	LABEL(AMD_exit_more_16)	/* nothing in bits 8-15: look at bits 16-23 */
   2123 	test	$0x01, %dh
   2124 	jnz	LABEL(tail_8)		/* lowest set bit k of dh selects tail_(8+k) */
   2125 	test	$0x02, %dh
   2126 	jnz	LABEL(tail_9)
   2127 	test	$0x04, %dh
   2128 	jnz	LABEL(tail_10)
   2129 	test	$0x08, %dh
   2130 	jnz	LABEL(tail_11)
   2131 	test	$0x10, %dh
   2132 	jnz	LABEL(tail_12)
   2133 	test	$0x20, %dh
   2134 	jnz	LABEL(tail_13)
   2135 	test	$0x40, %dh
   2136 	jnz	LABEL(tail_14)		/* none of those: bit 15 it is, fall into tail_15 */
   2137 
   2138 	.p2align 4
   2139 LABEL(tail_15):				/* 16 bytes */
   2140 	mov	(%rsi), %rcx
   2141 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2142 	mov	8(%rsi), %rdx
   2143 	mov	%rdx, 8(%rdi)		/* bytes 8-15 */
   2144 #ifdef USE_AS_STRNCPY
   2145 	mov	$16, %cl		/* cl = bytes just copied, read by strncpy_fill_tail */
   2146 	sub	$16, %r8		/* r8 = count still unfilled */
   2147 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2148 #endif
   2149 	ret
   2150 
   2151 	.p2align 4
   2152 LABEL(tail_9):				/* 10 bytes */
   2153 	mov	(%rsi), %rcx
   2154 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2155 	mov	6(%rsi), %edx
   2156 	mov	%edx, 6(%rdi)		/* bytes 6-9: overlapping 4-byte move */
   2157 #ifdef USE_AS_STRNCPY
   2158 	mov	$10, %cl		/* cl = bytes just copied, read by strncpy_fill_tail */
   2159 	sub	$10, %r8		/* r8 = count still unfilled */
   2160 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2161 #endif
   2162 	ret
   2163 
   2164 	.p2align 4
   2165 LABEL(tail_10):				/* 11 bytes */
   2166 	mov	(%rsi), %rcx
   2167 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2168 	mov	7(%rsi), %edx
   2169 	mov	%edx, 7(%rdi)		/* bytes 7-10: overlapping 4-byte move */
   2170 #ifdef USE_AS_STRNCPY
   2171 	mov	$11, %cl		/* cl = bytes just copied, read by strncpy_fill_tail */
   2172 	sub	$11, %r8		/* r8 = count still unfilled */
   2173 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2174 #endif
   2175 	ret
   2176 
   2177 	.p2align 4
   2178 LABEL(tail_11):				/* 12 bytes */
   2179 	mov	(%rsi), %rcx
   2180 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2181 	mov	8(%rsi), %edx
   2182 	mov	%edx, 8(%rdi)		/* bytes 8-11 */
   2183 #ifdef USE_AS_STRNCPY
   2184 	mov	$12, %cl		/* cl = bytes just copied, read by strncpy_fill_tail */
   2185 	sub	$12, %r8		/* r8 = count still unfilled */
   2186 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2187 #endif
   2188 	ret
   2189 
   2190 	.p2align 4
   2191 LABEL(tail_12):				/* 13 bytes */
   2192 	mov	(%rsi), %rcx
   2193 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2194 	mov	5(%rsi), %rcx
   2195 	mov	%rcx, 5(%rdi)		/* bytes 5-12: overlapping 8-byte move */
   2196 #ifdef USE_AS_STRNCPY
   2197 	mov	$13, %cl		/* cl = bytes just copied, read by strncpy_fill_tail */
   2198 	sub	$13, %r8		/* r8 = count still unfilled */
   2199 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2200 #endif
   2201 	ret
   2202 
   2203 	.p2align 4
   2204 LABEL(tail_13):				/* 14 bytes */
   2205 	mov	(%rsi), %rcx
   2206 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2207 	mov	6(%rsi), %rcx
   2208 	mov	%rcx, 6(%rdi)		/* bytes 6-13: overlapping 8-byte move */
   2209 #ifdef USE_AS_STRNCPY
   2210 	mov	$14, %cl		/* cl = bytes just copied, read by strncpy_fill_tail */
   2211 	sub	$14, %r8		/* r8 = count still unfilled */
   2212 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2213 #endif
   2214 	ret
   2215 
   2216 	.p2align 4
   2217 LABEL(tail_14):				/* 15 bytes */
   2218 	mov	(%rsi), %rcx
   2219 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2220 	mov	7(%rsi), %rcx
   2221 	mov	%rcx, 7(%rdi)		/* bytes 7-14: overlapping 8-byte move */
   2222 #ifdef USE_AS_STRNCPY
   2223 	mov	$15, %cl		/* cl = bytes just copied, read by strncpy_fill_tail */
   2224 	sub	$15, %r8		/* r8 = count still unfilled */
   2225 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2226 #endif
   2227 	ret
   2228 
   2229 	.p2align 4
   2230 LABEL(AMD_exit_more_16):	/* bits 0-15 of edx were clear: find the lowest set bit among bits 16-23 */
   2231 	shr	$16, %edx		/* bring bits 16-31 down so dl holds bits 16-23 */
   2232 	test	%dl, %dl
   2233 	jz	LABEL(AMD_exit_more_24)	/* nothing in bits 16-23: continue with the next group */
   2234 	test	$0x01, %dl
   2235 	jnz	LABEL(tail_16)		/* lowest set bit k of dl selects tail_(16+k) */
   2236 	test	$0x02, %dl
   2237 	jnz	LABEL(tail_17)
   2238 	test	$0x04, %dl
   2239 	jnz	LABEL(tail_18)
   2240 	test	$0x08, %dl
   2241 	jnz	LABEL(tail_19)
   2242 	test	$0x10, %dl
   2243 	jnz	LABEL(tail_20)
   2244 	test	$0x20, %dl
   2245 	jnz	LABEL(tail_21)
   2246 	test	$0x40, %dl
   2247 	jnz	LABEL(tail_22)		/* none of those: bit 23 it is, fall into tail_23 */
   2248 
   2249 	.p2align 4
   2250 LABEL(tail_23):				/* 24 bytes */
   2251 	mov	(%rsi), %rcx
   2252 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2253 	mov	8(%rsi), %rdx
   2254 	mov	%rdx, 8(%rdi)		/* bytes 8-15 */
   2255 	mov	16(%rsi), %rcx
   2256 	mov	%rcx, 16(%rdi)		/* bytes 16-23 */
   2257 #ifdef USE_AS_STRNCPY
   2258 	mov	$24, %cl		/* cl = bytes just copied, read by strncpy_fill_tail */
   2259 	sub	$24, %r8		/* r8 = count still unfilled */
   2260 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2261 #endif
   2262 	ret
   2263 
   2264 	.p2align 4
   2265 LABEL(tail_16):				/* 17 bytes */
   2266 	mov	(%rsi), %rcx
   2267 	mov	%rcx, (%rdi)		/* bytes 0-7 */
   2268 	mov	8(%rsi), %rdx
   2269 	mov	%rdx, 8(%rdi)		/* bytes 8-15 */
   2270 	mov	16(%rsi), %cl
   2271 	mov	%cl, 16(%rdi)		/* byte 16 */
   2272 #ifdef USE_AS_STRNCPY
   2273 	mov	$17, %cl		/* cl = bytes just copied, read by strncpy_fill_tail */
   2274 	sub	$17, %r8		/* r8 = count still unfilled */
   2275 	jnz	LABEL(strncpy_fill_tail)	/* count not used up: pad the rest with nulls */
   2276 #endif
   2277 	ret
   2278 
   2279 	.p2align 4
   2280 LABEL(tail_17):				/* copy 18 bytes from (%rsi) to (%rdi): two 8-byte moves plus one 2-byte move */
   2281 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2282 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2283 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2284 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2285 	mov	16(%rsi), %cx		/* load bytes 16..17 */
   2286 	mov	%cx, 16(%rdi)		/* store bytes 16..17 */
   2287 #ifdef USE_AS_STRNCPY
   2288 	mov	$18, %cl		/* cl = 18, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2289 	sub	$18, %r8		/* r8 -= 18; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2290 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2291 #endif	/* USE_AS_STRNCPY */
   2292 	ret
   2293 
   2294 	.p2align 4
   2295 LABEL(tail_18):				/* copy 19 bytes from (%rsi) to (%rdi): two 8-byte moves plus an overlapping 4-byte move */
   2296 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2297 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2298 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2299 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2300 	mov	15(%rsi), %ecx		/* load bytes 15..18; byte 15 overlaps the previous move */
   2301 	mov	%ecx,15(%rdi)		/* store bytes 15..18 */
   2302 #ifdef USE_AS_STRNCPY
   2303 	mov	$19, %cl		/* cl = 19, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2304 	sub	$19, %r8		/* r8 -= 19; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2305 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2306 #endif	/* USE_AS_STRNCPY */
   2307 	ret
   2308 
   2309 	.p2align 4
   2310 LABEL(tail_19):				/* copy 20 bytes from (%rsi) to (%rdi): two 8-byte moves plus one 4-byte move */
   2311 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2312 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2313 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2314 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2315 	mov	16(%rsi), %ecx		/* load bytes 16..19 */
   2316 	mov	%ecx, 16(%rdi)		/* store bytes 16..19 */
   2317 #ifdef USE_AS_STRNCPY
   2318 	mov	$20, %cl		/* cl = 20, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2319 	sub	$20, %r8		/* r8 -= 20; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2320 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2321 #endif	/* USE_AS_STRNCPY */
   2322 	ret
   2323 
   2324 	.p2align 4
   2325 LABEL(tail_20):				/* copy 21 bytes from (%rsi) to (%rdi): two 8-byte moves plus an overlapping 8-byte move */
   2326 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2327 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2328 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2329 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2330 	mov	13(%rsi), %rcx		/* load bytes 13..20; bytes 13..15 overlap the previous move */
   2331 	mov	%rcx, 13(%rdi)		/* store bytes 13..20 */
   2332 #ifdef USE_AS_STRNCPY
   2333 	mov	$21, %cl		/* cl = 21, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2334 	sub	$21, %r8		/* r8 -= 21; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2335 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2336 #endif	/* USE_AS_STRNCPY */
   2337 	ret
   2338 
   2339 	.p2align 4
   2340 LABEL(tail_21):				/* copy 22 bytes from (%rsi) to (%rdi): two 8-byte moves plus an overlapping 8-byte move */
   2341 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2342 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2343 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2344 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2345 	mov	14(%rsi), %rcx		/* load bytes 14..21; bytes 14..15 overlap the previous move */
   2346 	mov	%rcx, 14(%rdi)		/* store bytes 14..21 */
   2347 #ifdef USE_AS_STRNCPY
   2348 	mov	$22, %cl		/* cl = 22, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2349 	sub	$22, %r8		/* r8 -= 22; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2350 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2351 #endif	/* USE_AS_STRNCPY */
   2352 	ret
   2353 
   2354 	.p2align 4
   2355 LABEL(tail_22):				/* copy 23 bytes from (%rsi) to (%rdi): two 8-byte moves plus an overlapping 8-byte move */
   2356 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2357 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2358 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2359 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2360 	mov	15(%rsi), %rcx		/* load bytes 15..22; byte 15 overlaps the previous move */
   2361 	mov	%rcx, 15(%rdi)		/* store bytes 15..22 */
   2362 #ifdef USE_AS_STRNCPY
   2363 	mov	$23, %cl		/* cl = 23, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2364 	sub	$23, %r8		/* r8 -= 23; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2365 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2366 #endif	/* USE_AS_STRNCPY */
   2367 	ret
   2368 
   2369 	.p2align 4
   2370 LABEL(AMD_exit_more_24):	/* dispatch on dh; AMD_exit_more_16 jumps here after shifting edx right by 16, so dh holds the original bits 24..31 */
   2371 	test	$0x01, %dh		/* bits are tested lowest first, so the lowest set bit selects the target */
   2372 	jnz	LABEL(tail_24)		/* bit 0 -> tail_24 */
   2373 	test	$0x02, %dh
   2374 	jnz	LABEL(tail_25)		/* bit 1 -> tail_25 */
   2375 	test	$0x04, %dh
   2376 	jnz	LABEL(tail_26)		/* bit 2 -> tail_26 */
   2377 	test	$0x08, %dh
   2378 	jnz	LABEL(tail_27)		/* bit 3 -> tail_27 */
   2379 	test	$0x10, %dh
   2380 	jnz	LABEL(tail_28)		/* bit 4 -> tail_28 */
   2381 	test	$0x20, %dh
   2382 	jnz	LABEL(tail_29)		/* bit 5 -> tail_29 */
   2383 	test	$0x40, %dh
   2384 	jnz	LABEL(tail_30)		/* bit 6 -> tail_30; otherwise fall through into tail_31. NOTE(review): dh is not checked for zero here - presumably callers guarantee bit 7 is set in that case */
   2385 
   2386 	.p2align 4
   2387 LABEL(tail_31):				/* copy 32 bytes from (%rsi) to (%rdi) with four 8-byte moves; also the fall-through target of AMD_exit_more_24 */
   2388 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2389 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2390 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2391 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2392 	mov	16(%rsi), %rcx		/* load bytes 16..23 */
   2393 	mov	%rcx, 16(%rdi)		/* store bytes 16..23 */
   2394 	mov	24(%rsi), %rdx		/* load bytes 24..31 */
   2395 	mov	%rdx, 24(%rdi)		/* store bytes 24..31 */
   2396 #ifdef USE_AS_STRNCPY
   2397 	mov	$32, %cl		/* cl = 32, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2398 	sub	$32, %r8		/* r8 -= 32; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2399 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2400 #endif	/* USE_AS_STRNCPY */
   2401 	ret
   2402 
   2403 	.p2align 4
   2404 LABEL(tail_24):				/* copy 25 bytes from (%rsi) to (%rdi): three 8-byte moves plus an overlapping 4-byte move */
   2405 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2406 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2407 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2408 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2409 	mov	16(%rsi), %rcx		/* load bytes 16..23 */
   2410 	mov	%rcx, 16(%rdi)		/* store bytes 16..23 */
   2411 	mov	21(%rsi), %edx		/* load bytes 21..24; bytes 21..23 overlap the previous move */
   2412 	mov	%edx, 21(%rdi)		/* store bytes 21..24 */
   2413 #ifdef USE_AS_STRNCPY
   2414 	mov	$25, %cl		/* cl = 25, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2415 	sub	$25, %r8		/* r8 -= 25; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2416 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2417 #endif	/* USE_AS_STRNCPY */
   2418 	ret
   2419 
   2420 	.p2align 4
   2421 LABEL(tail_25):				/* copy 26 bytes from (%rsi) to (%rdi): three 8-byte moves plus an overlapping 4-byte move */
   2422 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2423 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2424 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2425 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2426 	mov	16(%rsi), %rcx		/* load bytes 16..23 */
   2427 	mov	%rcx, 16(%rdi)		/* store bytes 16..23 */
   2428 	mov	22(%rsi), %edx		/* load bytes 22..25; bytes 22..23 overlap the previous move */
   2429 	mov	%edx, 22(%rdi)		/* store bytes 22..25 */
   2430 #ifdef USE_AS_STRNCPY
   2431 	mov	$26, %cl		/* cl = 26, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2432 	sub	$26, %r8		/* r8 -= 26; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2433 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2434 #endif	/* USE_AS_STRNCPY */
   2435 	ret
   2436 
   2437 	.p2align 4
   2438 LABEL(tail_26):				/* copy 27 bytes from (%rsi) to (%rdi): three 8-byte moves plus an overlapping 4-byte move */
   2439 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2440 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2441 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2442 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2443 	mov	16(%rsi), %rcx		/* load bytes 16..23 */
   2444 	mov	%rcx, 16(%rdi)		/* store bytes 16..23 */
   2445 	mov	23(%rsi), %edx		/* load bytes 23..26; byte 23 overlaps the previous move */
   2446 	mov	%edx, 23(%rdi)		/* store bytes 23..26 */
   2447 #ifdef USE_AS_STRNCPY
   2448 	mov	$27, %cl		/* cl = 27, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2449 	sub	$27, %r8		/* r8 -= 27; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2450 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2451 #endif	/* USE_AS_STRNCPY */
   2452 	ret
   2453 
   2454 	.p2align 4
   2455 LABEL(tail_27):				/* copy 28 bytes from (%rsi) to (%rdi): three 8-byte moves plus one 4-byte move */
   2456 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2457 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2458 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2459 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2460 	mov	16(%rsi), %rcx		/* load bytes 16..23 */
   2461 	mov	%rcx, 16(%rdi)		/* store bytes 16..23 */
   2462 	mov	24(%rsi), %edx		/* load bytes 24..27 */
   2463 	mov	%edx, 24(%rdi)		/* store bytes 24..27 */
   2464 #ifdef USE_AS_STRNCPY
   2465 	mov	$28, %cl		/* cl = 28, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2466 	sub	$28, %r8		/* r8 -= 28; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2467 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2468 #endif	/* USE_AS_STRNCPY */
   2469 	ret
   2470 
   2471 	.p2align 4
   2472 LABEL(tail_28):				/* copy 29 bytes from (%rsi) to (%rdi): three 8-byte moves plus an overlapping 8-byte move */
   2473 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2474 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2475 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2476 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2477 	mov	16(%rsi), %rcx		/* load bytes 16..23 */
   2478 	mov	%rcx, 16(%rdi)		/* store bytes 16..23 */
   2479 	mov	21(%rsi), %rdx		/* load bytes 21..28; bytes 21..23 overlap the previous move */
   2480 	mov	%rdx, 21(%rdi)		/* store bytes 21..28 */
   2481 #ifdef USE_AS_STRNCPY
   2482 	mov	$29, %cl		/* cl = 29, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2483 	sub	$29, %r8		/* r8 -= 29; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2484 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2485 #endif	/* USE_AS_STRNCPY */
   2486 	ret
   2487 
   2488 	.p2align 4
   2489 LABEL(tail_29):				/* copy 30 bytes from (%rsi) to (%rdi): three 8-byte moves plus an overlapping 8-byte move */
   2490 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2491 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2492 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2493 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2494 	mov	16(%rsi), %rcx		/* load bytes 16..23 */
   2495 	mov	%rcx, 16(%rdi)		/* store bytes 16..23 */
   2496 	mov	22(%rsi), %rdx		/* load bytes 22..29; bytes 22..23 overlap the previous move */
   2497 	mov	%rdx, 22(%rdi)		/* store bytes 22..29 */
   2498 #ifdef USE_AS_STRNCPY
   2499 	mov	$30, %cl		/* cl = 30, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2500 	sub	$30, %r8		/* r8 -= 30; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2501 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2502 #endif	/* USE_AS_STRNCPY */
   2503 	ret
   2504 
   2505 	.p2align 4
   2506 LABEL(tail_30):				/* copy 31 bytes from (%rsi) to (%rdi): three 8-byte moves plus an overlapping 8-byte move */
   2507 	mov	(%rsi), %rcx		/* load bytes 0..7 */
   2508 	mov	%rcx, (%rdi)		/* store bytes 0..7 */
   2509 	mov	8(%rsi), %rdx		/* load bytes 8..15 */
   2510 	mov	%rdx, 8(%rdi)		/* store bytes 8..15 */
   2511 	mov	16(%rsi), %rcx		/* load bytes 16..23 */
   2512 	mov	%rcx, 16(%rdi)		/* store bytes 16..23 */
   2513 	mov	23(%rsi), %rdx		/* load bytes 23..30; byte 23 overlaps the previous move */
   2514 	mov	%rdx, 23(%rdi)		/* store bytes 23..30 */
   2515 #ifdef USE_AS_STRNCPY
   2516 	mov	$31, %cl		/* cl = 31, the number of bytes just copied; presumably read by strncpy_fill_tail (not visible here) - confirm */
   2517 	sub	$31, %r8		/* r8 -= 31; r8 looks like the remaining strncpy count - confirm; sets ZF for the jnz */
   2518 	jnz	LABEL(strncpy_fill_tail)	/* r8 still nonzero: continue at strncpy_fill_tail */
   2519 #endif	/* USE_AS_STRNCPY */
   2520 	ret
   2521 
   2522 	.pushsection .rodata
   2523 	.p2align 4
   2524 LABEL(tail_table):	/* 32 entries; entry N is the offset of tail_N from tail_table itself. The code that indexes it is not visible here - presumably it adds the table address back before jumping (confirm at the use site) */
   2525 	.int	LABEL(tail_0) - LABEL(tail_table)	/* entry 0: 1 byte */
   2526 	.int	LABEL(tail_1) - LABEL(tail_table)
   2527 	.int	LABEL(tail_2) - LABEL(tail_table)
   2528 	.int	LABEL(tail_3) - LABEL(tail_table)
   2529 	.int	LABEL(tail_4) - LABEL(tail_table)
   2530 	.int	LABEL(tail_5) - LABEL(tail_table)
   2531 	.int	LABEL(tail_6) - LABEL(tail_table)
   2532 	.int	LABEL(tail_7) - LABEL(tail_table)
   2533 	.int	LABEL(tail_8) - LABEL(tail_table)
   2534 	.int	LABEL(tail_9) - LABEL(tail_table)
   2535 	.int	LABEL(tail_10) - LABEL(tail_table)
   2536 	.int	LABEL(tail_11) - LABEL(tail_table)
   2537 	.int	LABEL(tail_12) - LABEL(tail_table)
   2538 	.int	LABEL(tail_13) - LABEL(tail_table)
   2539 	.int	LABEL(tail_14) - LABEL(tail_table)
   2540 	.int	LABEL(tail_15) - LABEL(tail_table)	/* entry 15: last of the first sixteen */
   2541 	.int	LABEL(tail_16) - LABEL(tail_table)
   2542 	.int	LABEL(tail_17) - LABEL(tail_table)
   2543 	.int	LABEL(tail_18) - LABEL(tail_table)
   2544 	.int	LABEL(tail_19) - LABEL(tail_table)
   2545 	.int	LABEL(tail_20) - LABEL(tail_table)
   2546 	.int	LABEL(tail_21) - LABEL(tail_table)
   2547 	.int	LABEL(tail_22) - LABEL(tail_table)
   2548 	.int	LABEL(tail_23) - LABEL(tail_table)
   2549 	.int	LABEL(tail_24) - LABEL(tail_table)
   2550 	.int	LABEL(tail_25) - LABEL(tail_table)
   2551 	.int	LABEL(tail_26) - LABEL(tail_table)
   2552 	.int	LABEL(tail_27) - LABEL(tail_table)
   2553 	.int	LABEL(tail_28) - LABEL(tail_table)
   2554 	.int	LABEL(tail_29) - LABEL(tail_table)
   2555 	.int	LABEL(tail_30) - LABEL(tail_table)
   2556 	.int	LABEL(tail_31) - LABEL(tail_table)	/* entry 31: 32 bytes */
   2557 
   2558 	.p2align 4
   2559 LABEL(unaligned_table):	/* 16 entries; entry N is the offset of ashr_N from unaligned_table itself. The ashr_N labels and the code that indexes this table are not visible here */
   2560 	.int	LABEL(ashr_0) - LABEL(unaligned_table)	/* entry 0 */
   2561 	.int	LABEL(ashr_1) - LABEL(unaligned_table)
   2562 	.int	LABEL(ashr_2) - LABEL(unaligned_table)
   2563 	.int	LABEL(ashr_3) - LABEL(unaligned_table)
   2564 	.int	LABEL(ashr_4) - LABEL(unaligned_table)
   2565 	.int	LABEL(ashr_5) - LABEL(unaligned_table)
   2566 	.int	LABEL(ashr_6) - LABEL(unaligned_table)
   2567 	.int	LABEL(ashr_7) - LABEL(unaligned_table)
   2568 	.int	LABEL(ashr_8) - LABEL(unaligned_table)
   2569 	.int	LABEL(ashr_9) - LABEL(unaligned_table)
   2570 	.int	LABEL(ashr_10) - LABEL(unaligned_table)
   2571 	.int	LABEL(ashr_11) - LABEL(unaligned_table)
   2572 	.int	LABEL(ashr_12) - LABEL(unaligned_table)
   2573 	.int	LABEL(ashr_13) - LABEL(unaligned_table)
   2574 	.int	LABEL(ashr_14) - LABEL(unaligned_table)
   2575 	.int	LABEL(ashr_15) - LABEL(unaligned_table)	/* entry 15 */
   2576 	.popsection
   2577 
   2578 #ifdef USE_AS_STRNCPY
   2579 	SET_SIZE(strncpy)
   2580 #else
   2581 	SET_SIZE(strcpy)			/* (char *, const char *) */
   2582 #endif
   2583