Home | History | Annotate | Download | only in gen
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Copyright (c) 2008, Intel Corporation
     29  * All rights reserved.
     30  */
     31 
     32 /*
     33  * memcpy.s - copies one block of memory to another
     34  *	Implements memcpy() and memmove() libc primitives.
     35  */
     36 
     37 	.file	"memcpy.s"
     38 
     39 #include <sys/asm_linkage.h>
     40 
     41 	ANSI_PRAGMA_WEAK(memmove,function)
     42 	ANSI_PRAGMA_WEAK(memcpy,function)
     43 
     44 #include "cache.h"
     45 #include "proc64_id.h"
     46 
     47 #define L(s) .memcpy/**/s
     48 
     49 /*
     50  * memcpy algorithm overview:
     51  *
     52  * Thresholds used below were determined experimentally.
     53  *
     54  * Pseudo code:
     55  *
     56  * NOTE: On AMD NO_SSE is always set.  Performance on Opteron did not improve
     57  * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
     58  * future AMD processors.
     59  *
     60  *
     61  * If (size <= 128 bytes) {
     62  *	do unrolled code (primarily 8-byte loads/stores) regardless of
     63  *	alignment.
     64  * } else {
     65  *	Align destination to 16-byte boundary
     66  *
     67  *      if (NO_SSE) {
     68  *		If (size > half of the largest level cache) {
     69  *			Use 8-byte non-temporal stores (64-bytes/loop)
     70  *		} else {
     71  *			if (size > 4K && size <= half l1 cache size) {
     72  *				Use rep movsq
     73  *			} else {
     74  *				Use 8-byte loads/stores (64 bytes per loop)
     75  *			}
     76  *		}
     77  *
     78  *	} else { **USE SSE**
     79  *		If (size > half of the largest level cache) {
     80  *			Use 16-byte non-temporal stores (128-bytes per loop)
     81  *		} else {
     82  *			If (both source and destination are aligned) {
     83  *			    Use 16-byte aligned loads and stores (128 bytes/loop)
     84  *			} else {
     85  *			    use pairs of xmm registers with SSE2 or SSSE3
     86  *			    instructions to concatenate and shift appropriately
     87  *			    to account for source unalignment. This enables
     88  *			    16-byte aligned loads to be done.
     89  *			}
     90  *		}
     91  *	}
     92  *
     93  *	Finish any remaining bytes via unrolled code above.
     94  * }
     95  *
     96  * memmove overview:
     97  *	memmove is the same as memcpy except one case where copy needs to be
     98  *	done backwards. The copy backwards code is done in a similar manner.
     99  */
    100 
    101 	ENTRY(memmove)
        	/*
        	 * memmove: %rdi = dst, %rsi = src, %rdx = len.
        	 * A backward copy is chosen only when src < dst < src + len;
        	 * every other case shares the forward path with memcpy.
        	 * Both branches are unsigned (jbe/jb) because they compare
        	 * addresses.
        	 */
    102 	cmp	%rsi,%rdi		# if dst <= src
    103 	jbe	L(CopyForward)		# then do copy forward
    104 	mov	%rsi,%r9		# move src to r9
    105 	add	%rdx,%r9		# add len to get addr of end of src
    106 	cmp	%r9,%rdi		# if dst < end of src
    107 	jb	L(CopyBackwards)	# then do copy backwards
    108 	jmp	L(CopyForward)		# dst >= src + len: copy forward
    109 
    110 	ENTRY (memcpy)
        	/*
        	 * memcpy / forward copy: %rdi = dst, %rsi = src, %rdx = len.
        	 * Register roles set up here:
        	 *	%r8  = number of bytes to copy
        	 *	%rcx = dst cursor, %rdx = src cursor
        	 *	%rax = original dst; the small-copy routines return
        	 *	       without touching it
        	 * Lengths 0..128 are dispatched through L(fwdPxQx), indexed by
        	 * the length; anything larger goes to L(ck_use_sse2).
        	 *
        	 * The length is an unsigned quantity, so the 128-byte test must
        	 * be an unsigned branch (ja).  With a signed branch a length
        	 * that has bit 63 set compares as "less than 128" and is then
        	 * used as an out-of-range index into L(fwdPxQx).
        	 */
    111 L(CopyForward):
    112 	mov    %rdx,%r8			# r8 = len
    113 	mov    %rdi,%rcx		# rcx = dst cursor
    114 	mov    %rsi,%rdx		# rdx = src cursor
    115 	mov    %rdi,%rax		# rax = original dst
    116 	lea    L(fwdPxQx)(%rip),%r11	# r11 = base of small-copy jump table
    117 	cmp    $0x80,%r8		# 128
    118 	ja     L(ck_use_sse2)		# unsigned: table index stays in 0..128
    119 	add    %r8,%rcx			# point both cursors just past the end;
    120 	add    %r8,%rdx			# tail routines use negative offsets
    121 
    122 	movslq (%r11,%r8,4),%r10	# r10 = signed 32-bit offset for this len
    123 	lea    (%r10,%r11,1),%r11	# r11 = table base + offset
    124 	jmpq   *%r11
    125 
    126 	.balign 16
    127 L(ShrtAlignNew):
        	/*
        	 * Reached from L(ck_use_sse2) when %rcx (dst) is not 16-byte
        	 * aligned.  Dispatches through L(AliPxQx) on dst & 0xf to the
        	 * routine that copies the leading bytes needed to align dst.
        	 */
    128 	lea    L(AliPxQx)(%rip),%r11	# r11 = base of alignment jump table
    129 	mov    %rcx,%r9
    130 	and    $0xf,%r9			# r9 = dst & 0xf (table index)
    131 
    132 	movslq (%r11,%r9,4),%r10	# r10 = signed 32-bit offset of the routine
    133 	lea    (%r10,%r11,1),%r11	# r11 = table base + offset
    134 	jmpq   *%r11
    135 
    136 	.balign 16
    137 L(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)	# index = copy length (0..0x80); 32-bit offsets from the table base
    138            .int        L(P1Q0)-L(fwdPxQx)	# PxQy copies x + 8*y bytes (y is a hex digit, G = 16)
    139            .int        L(P2Q0)-L(fwdPxQx)
    140            .int        L(P3Q0)-L(fwdPxQx)
    141            .int        L(P4Q0)-L(fwdPxQx)
    142            .int        L(P5Q0)-L(fwdPxQx)
    143            .int        L(P6Q0)-L(fwdPxQx)
    144            .int        L(P7Q0)-L(fwdPxQx)
    145 
    146            .int        L(P0Q1)-L(fwdPxQx)	# len 0x08..0x0f
    147            .int        L(P1Q1)-L(fwdPxQx)
    148            .int        L(P2Q1)-L(fwdPxQx)
    149            .int        L(P3Q1)-L(fwdPxQx)
    150            .int        L(P4Q1)-L(fwdPxQx)
    151            .int        L(P5Q1)-L(fwdPxQx)
    152            .int        L(P6Q1)-L(fwdPxQx)
    153            .int        L(P7Q1)-L(fwdPxQx)
    154 
    155            .int        L(P0Q2)-L(fwdPxQx)	# len 0x10..0x17
    156            .int        L(P1Q2)-L(fwdPxQx)
    157            .int        L(P2Q2)-L(fwdPxQx)
    158            .int        L(P3Q2)-L(fwdPxQx)
    159            .int        L(P4Q2)-L(fwdPxQx)
    160            .int        L(P5Q2)-L(fwdPxQx)
    161            .int        L(P6Q2)-L(fwdPxQx)
    162            .int        L(P7Q2)-L(fwdPxQx)
    163 
    164            .int        L(P0Q3)-L(fwdPxQx)	# len 0x18..0x1f
    165            .int        L(P1Q3)-L(fwdPxQx)
    166            .int        L(P2Q3)-L(fwdPxQx)
    167            .int        L(P3Q3)-L(fwdPxQx)
    168            .int        L(P4Q3)-L(fwdPxQx)
    169            .int        L(P5Q3)-L(fwdPxQx)
    170            .int        L(P6Q3)-L(fwdPxQx)
    171            .int        L(P7Q3)-L(fwdPxQx)
    172 
    173            .int        L(P0Q4)-L(fwdPxQx)	# len 0x20..0x27
    174            .int        L(P1Q4)-L(fwdPxQx)
    175            .int        L(P2Q4)-L(fwdPxQx)
    176            .int        L(P3Q4)-L(fwdPxQx)
    177            .int        L(P4Q4)-L(fwdPxQx)
    178            .int        L(P5Q4)-L(fwdPxQx)
    179            .int        L(P6Q4)-L(fwdPxQx)
    180            .int        L(P7Q4)-L(fwdPxQx)
    181 
    182            .int        L(P0Q5)-L(fwdPxQx)	# len 0x28..0x2f
    183            .int        L(P1Q5)-L(fwdPxQx)
    184            .int        L(P2Q5)-L(fwdPxQx)
    185            .int        L(P3Q5)-L(fwdPxQx)
    186            .int        L(P4Q5)-L(fwdPxQx)
    187            .int        L(P5Q5)-L(fwdPxQx)
    188            .int        L(P6Q5)-L(fwdPxQx)
    189            .int        L(P7Q5)-L(fwdPxQx)
    190 
    191            .int        L(P0Q6)-L(fwdPxQx)	# len 0x30..0x37
    192            .int        L(P1Q6)-L(fwdPxQx)
    193            .int        L(P2Q6)-L(fwdPxQx)
    194            .int        L(P3Q6)-L(fwdPxQx)
    195            .int        L(P4Q6)-L(fwdPxQx)
    196            .int        L(P5Q6)-L(fwdPxQx)
    197            .int        L(P6Q6)-L(fwdPxQx)
    198            .int        L(P7Q6)-L(fwdPxQx)
    199 
    200            .int        L(P0Q7)-L(fwdPxQx)	# len 0x38..0x3f
    201            .int        L(P1Q7)-L(fwdPxQx)
    202            .int        L(P2Q7)-L(fwdPxQx)
    203            .int        L(P3Q7)-L(fwdPxQx)
    204            .int        L(P4Q7)-L(fwdPxQx)
    205            .int        L(P5Q7)-L(fwdPxQx)
    206            .int        L(P6Q7)-L(fwdPxQx)
    207            .int        L(P7Q7)-L(fwdPxQx)
    208 
    209            .int        L(P0Q8)-L(fwdPxQx)	# len 0x40..0x47
    210            .int        L(P1Q8)-L(fwdPxQx)
    211            .int        L(P2Q8)-L(fwdPxQx)
    212            .int        L(P3Q8)-L(fwdPxQx)
    213            .int        L(P4Q8)-L(fwdPxQx)
    214            .int        L(P5Q8)-L(fwdPxQx)
    215            .int        L(P6Q8)-L(fwdPxQx)
    216            .int        L(P7Q8)-L(fwdPxQx)
    217 
    218            .int        L(P0Q9)-L(fwdPxQx)	# len 0x48..0x4f
    219            .int        L(P1Q9)-L(fwdPxQx)
    220            .int        L(P2Q9)-L(fwdPxQx)
    221            .int        L(P3Q9)-L(fwdPxQx)
    222            .int        L(P4Q9)-L(fwdPxQx)
    223            .int        L(P5Q9)-L(fwdPxQx)
    224            .int        L(P6Q9)-L(fwdPxQx)
    225            .int        L(P7Q9)-L(fwdPxQx)
    226 
    227            .int        L(P0QA)-L(fwdPxQx)	# len 0x50..0x57
    228            .int        L(P1QA)-L(fwdPxQx)
    229            .int        L(P2QA)-L(fwdPxQx)
    230            .int        L(P3QA)-L(fwdPxQx)
    231            .int        L(P4QA)-L(fwdPxQx)
    232            .int        L(P5QA)-L(fwdPxQx)
    233            .int        L(P6QA)-L(fwdPxQx)
    234            .int        L(P7QA)-L(fwdPxQx)
    235 
    236            .int        L(P0QB)-L(fwdPxQx)	# len 0x58..0x5f
    237            .int        L(P1QB)-L(fwdPxQx)
    238            .int        L(P2QB)-L(fwdPxQx)
    239            .int        L(P3QB)-L(fwdPxQx)
    240            .int        L(P4QB)-L(fwdPxQx)
    241            .int        L(P5QB)-L(fwdPxQx)
    242            .int        L(P6QB)-L(fwdPxQx)
    243            .int        L(P7QB)-L(fwdPxQx)
    244 
    245            .int        L(P0QC)-L(fwdPxQx)	# len 0x60..0x67
    246            .int        L(P1QC)-L(fwdPxQx)
    247            .int        L(P2QC)-L(fwdPxQx)
    248            .int        L(P3QC)-L(fwdPxQx)
    249            .int        L(P4QC)-L(fwdPxQx)
    250            .int        L(P5QC)-L(fwdPxQx)
    251            .int        L(P6QC)-L(fwdPxQx)
    252            .int        L(P7QC)-L(fwdPxQx)
    253 
    254            .int        L(P0QD)-L(fwdPxQx)	# len 0x68..0x6f
    255            .int        L(P1QD)-L(fwdPxQx)
    256            .int        L(P2QD)-L(fwdPxQx)
    257            .int        L(P3QD)-L(fwdPxQx)
    258            .int        L(P4QD)-L(fwdPxQx)
    259            .int        L(P5QD)-L(fwdPxQx)
    260            .int        L(P6QD)-L(fwdPxQx)
    261            .int        L(P7QD)-L(fwdPxQx)
    262 
    263            .int        L(P0QE)-L(fwdPxQx)	# len 0x70..0x77
    264            .int        L(P1QE)-L(fwdPxQx)
    265            .int        L(P2QE)-L(fwdPxQx)
    266            .int        L(P3QE)-L(fwdPxQx)
    267            .int        L(P4QE)-L(fwdPxQx)
    268            .int        L(P5QE)-L(fwdPxQx)
    269            .int        L(P6QE)-L(fwdPxQx)
    270            .int        L(P7QE)-L(fwdPxQx)
    271 
    272            .int        L(P0QF)-L(fwdPxQx)	# len 0x78..0x7f
    273            .int        L(P1QF)-L(fwdPxQx)
    274            .int        L(P2QF)-L(fwdPxQx)
    275            .int        L(P3QF)-L(fwdPxQx)
    276            .int        L(P4QF)-L(fwdPxQx)
    277            .int        L(P5QF)-L(fwdPxQx)
    278            .int        L(P6QF)-L(fwdPxQx)
    279            .int        L(P7QF)-L(fwdPxQx)
    280 
    281            .int        L(P0QG)-L(fwdPxQx)	# 0x80
    282 
    283 	   .balign 16
    284 L(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)	# index = dst & 0xf; 32-bit offsets from the table base; 0 = already aligned
    285            .int        L(A1Q0)-L(AliPxQx)	# entry i copies 16 - i bytes, then jumps to now_qw_aligned
    286            .int        L(A2Q0)-L(AliPxQx)
    287            .int        L(A3Q0)-L(AliPxQx)
    288            .int        L(A4Q0)-L(AliPxQx)
    289            .int        L(A5Q0)-L(AliPxQx)
    290            .int        L(A6Q0)-L(AliPxQx)
    291            .int        L(A7Q0)-L(AliPxQx)
    292            .int        L(A0Q1)-L(AliPxQx)	# dst & 0xf == 8: 8 bytes to go
    293            .int        L(A1Q1)-L(AliPxQx)
    294            .int        L(A2Q1)-L(AliPxQx)
    295            .int        L(A3Q1)-L(AliPxQx)
    296            .int        L(A4Q1)-L(AliPxQx)
    297            .int        L(A5Q1)-L(AliPxQx)
    298            .int        L(A6Q1)-L(AliPxQx)
    299            .int        L(A7Q1)-L(AliPxQx)	# dst & 0xf == 15: 1 byte to go
    300 
    301 	.balign 16
    302 L(A1Q0):			# dst & 0xf == 1: copy 15 bytes (1+2+4+8) to align dst
    303 	movzbq (%rdx),%r11		# 1 byte at offset 0
    304 	sub    $0xf,%r8			# len -= 15
    305 	mov    %r11b,(%rcx)
    306 
    307 	movzwq 0x1(%rdx),%r10		# 2 bytes at offset 1
    308 	mov    %r10w,0x1(%rcx)
    309 
    310 	mov    0x3(%rdx),%r9d		# 4 bytes at offset 3
    311 	mov    %r9d,0x3(%rcx)
    312 
    313 	mov    0x7(%rdx),%r11		# 8 bytes at offset 7
    314 	add    $0xf,%rdx		# src += 15
    315 	mov    %r11,0x7(%rcx)
    316 
    317 	add    $0xf,%rcx		# dst += 15, now 16-byte aligned
    318 	jmp    L(now_qw_aligned)
    319 
    320 	.balign 16
    321 L(A2Q0):			# dst & 0xf == 2: copy 14 bytes (2+4+8) to align dst
    322 	movzwq (%rdx),%r10		# 2 bytes at offset 0
    323 	sub    $0xe,%r8			# len -= 14
    324 	mov    %r10w,(%rcx)
    325 
    326 	mov    0x2(%rdx),%r9d		# 4 bytes at offset 2
    327 	mov    %r9d,0x2(%rcx)
    328 
    329 	mov    0x6(%rdx),%r11		# 8 bytes at offset 6
    330 	add    $0xe,%rdx		# src += 14
    331 	mov    %r11,0x6(%rcx)
    332 	add    $0xe,%rcx		# dst += 14, now 16-byte aligned
    333 	jmp    L(now_qw_aligned)
    334 
    335 	.balign 16
    336 L(A3Q0):			# dst & 0xf == 3: copy 13 bytes (1+4+8) to align dst
    337 	movzbq (%rdx),%r11		# 1 byte at offset 0
    338 	sub    $0xd,%r8			# len -= 13
    339 	mov    %r11b,(%rcx)
    340 
    341 	mov    0x1(%rdx),%r9d		# 4 bytes at offset 1
    342 	mov    %r9d,0x1(%rcx)
    343 
    344 	mov    0x5(%rdx),%r10		# 8 bytes at offset 5
    345 	add    $0xd,%rdx		# src += 13
    346 	mov    %r10,0x5(%rcx)
    347 
    348 	add    $0xd,%rcx		# dst += 13, now 16-byte aligned
    349 	jmp    L(now_qw_aligned)
    350 
    351 	.balign 16
    352 L(A4Q0):			# dst & 0xf == 4: copy 12 bytes (4+8) to align dst
    353 	mov    (%rdx),%r9d		# 4 bytes at offset 0
    354 	sub    $0xc,%r8			# len -= 12
    355 	mov    %r9d,(%rcx)
    356 
    357 	mov    0x4(%rdx),%r10		# 8 bytes at offset 4
    358 	add    $0xc,%rdx		# src += 12
    359 	mov    %r10,0x4(%rcx)
    360 
    361 	add    $0xc,%rcx		# dst += 12, now 16-byte aligned
    362 	jmp    L(now_qw_aligned)
    363 
    364 	.balign 16
    365 L(A5Q0):			# dst & 0xf == 5: copy 11 bytes (1+2+8) to align dst
    366 	movzbq (%rdx),%r11		# 1 byte at offset 0
    367 	sub    $0xb,%r8			# len -= 11
    368 	mov    %r11b,(%rcx)
    369 
    370 	movzwq 0x1(%rdx),%r10		# 2 bytes at offset 1
    371 	mov    %r10w,0x1(%rcx)
    372 
    373 	mov    0x3(%rdx),%r9		# 8 bytes at offset 3
    374 	add    $0xb,%rdx		# src += 11
    375 	mov    %r9,0x3(%rcx)
    376 
    377 	add    $0xb,%rcx		# dst += 11, now 16-byte aligned
    378 	jmp    L(now_qw_aligned)
    379 
    380 	.balign 16
    381 L(A6Q0):			# dst & 0xf == 6: copy 10 bytes (2+8) to align dst
    382 	movzwq (%rdx),%r10		# 2 bytes at offset 0
    383 	sub    $0xa,%r8			# len -= 10
    384 	mov    %r10w,(%rcx)
    385 
    386 	mov    0x2(%rdx),%r9		# 8 bytes at offset 2
    387 	add    $0xa,%rdx		# src += 10
    388 	mov    %r9,0x2(%rcx)
    389 
    390 	add    $0xa,%rcx		# dst += 10, now 16-byte aligned
    391 	jmp    L(now_qw_aligned)
    392 
    393 	.balign 16
    394 L(A7Q0):			# dst & 0xf == 7: copy 9 bytes (1+8) to align dst
    395 	movzbq (%rdx),%r11		# 1 byte at offset 0
    396 	sub    $0x9,%r8			# len -= 9
    397 	mov    %r11b,(%rcx)
    398 
    399 	mov    0x1(%rdx),%r10		# 8 bytes at offset 1
    400 	add    $0x9,%rdx		# src += 9
    401 	mov    %r10,0x1(%rcx)
    402 
    403 	add    $0x9,%rcx		# dst += 9, now 16-byte aligned
    404 	jmp    L(now_qw_aligned)
    405 
    406 	.balign 16
    407 L(A0Q1):			# dst & 0xf == 8: copy 8 bytes to align dst
    408 
    409 	mov    (%rdx),%r10		# 8 bytes at offset 0
    410 	add    $0x8,%rdx		# src += 8
    411 	sub    $0x8,%r8			# len -= 8
    412 	mov    %r10,(%rcx)
    413 
    414 	add    $0x8,%rcx		# dst += 8, now 16-byte aligned
    415 	jmp    L(now_qw_aligned)
    416 
    417 	.balign 16
    418 L(A1Q1):			# dst & 0xf == 9: copy 7 bytes (1+2+4) to align dst
    419 	movzbq (%rdx),%r11		# 1 byte at offset 0
    420 	sub    $0x7,%r8			# len -= 7
    421 	mov    %r11b,(%rcx)
    422 
    423 	movzwq 0x1(%rdx),%r10		# 2 bytes at offset 1
    424 	mov    %r10w,0x1(%rcx)
    425 
    426 	mov    0x3(%rdx),%r9d		# 4 bytes at offset 3
    427 	add    $0x7,%rdx		# src += 7
    428 	mov    %r9d,0x3(%rcx)
    429 	add    $0x7,%rcx		# dst += 7, now 16-byte aligned
    430 	jmp    L(now_qw_aligned)
    431 
    432 	.balign 16
    433 L(A2Q1):			# dst & 0xf == 10: copy 6 bytes (2+4) to align dst
    434 	movzwq (%rdx),%r10		# 2 bytes at offset 0
    435 	sub    $0x6,%r8			# len -= 6
    436 	mov    %r10w,(%rcx)
    437 	mov    0x2(%rdx),%r9d		# 4 bytes at offset 2
    438 	add    $0x6,%rdx		# src += 6
    439 	mov    %r9d,0x2(%rcx)
    440 	add    $0x6,%rcx		# dst += 6, now 16-byte aligned
    441 	jmp    L(now_qw_aligned)
    442 
    443 	.balign 16
    444 L(A3Q1):			# dst & 0xf == 11: copy 5 bytes (1+4) to align dst
    445 	movzbq (%rdx),%r11		# 1 byte at offset 0
    446 	sub    $0x5,%r8			# len -= 5
    447 	mov    %r11b,(%rcx)
    448 	mov    0x1(%rdx),%r9d		# 4 bytes at offset 1
    449 	add    $0x5,%rdx		# src += 5
    450 	mov    %r9d,0x1(%rcx)
    451 	add    $0x5,%rcx		# dst += 5, now 16-byte aligned
    452 	jmp    L(now_qw_aligned)
    453 
    454 	.balign 16
    455 L(A4Q1):			# dst & 0xf == 12: copy 4 bytes to align dst
    456 	mov    (%rdx),%r9d		# 4 bytes at offset 0
    457 	sub    $0x4,%r8			# len -= 4
    458 	add    $0x4,%rdx		# src += 4
    459 	mov    %r9d,(%rcx)
    460 	add    $0x4,%rcx		# dst += 4, now 16-byte aligned
    461 	jmp    L(now_qw_aligned)
    462 
    463 	.balign 16
    464 L(A5Q1):			# dst & 0xf == 13: copy 3 bytes (1+2) to align dst
    465 	movzbq (%rdx),%r11		# 1 byte at offset 0
    466 	sub    $0x3,%r8			# len -= 3
    467 	mov    %r11b,(%rcx)
    468 
    469 	movzwq 0x1(%rdx),%r10		# 2 bytes at offset 1
    470 	add    $0x3,%rdx		# src += 3
    471 	mov    %r10w,0x1(%rcx)
    472 
    473 	add    $0x3,%rcx		# dst += 3, now 16-byte aligned
    474 	jmp    L(now_qw_aligned)
    475 
    476 	.balign 16
    477 L(A6Q1):			# dst & 0xf == 14: copy 2 bytes to align dst
    478 	movzwq (%rdx),%r10		# 2 bytes at offset 0
    479 	sub    $0x2,%r8			# len -= 2
    480 	add    $0x2,%rdx		# src += 2
    481 	mov    %r10w,(%rcx)
    482 	add    $0x2,%rcx		# dst += 2, now 16-byte aligned
    483 	jmp    L(now_qw_aligned)
    484 
    485 	.balign 16
    486 L(A7Q1):			# dst & 0xf == 15: copy 1 byte to align dst
    487 	movzbq (%rdx),%r11		# 1 byte at offset 0
    488 	dec    %r8			# len -= 1
    489 	inc    %rdx			# src += 1
    490 	mov    %r11b,(%rcx)
    491 	inc    %rcx			# dst += 1, now 16-byte aligned
    492 	jmp    L(now_qw_aligned)
    493 
    494 
    495 	.balign 16
    496 L(P0QG):
        	/*
        	 * Tail copy for lengths that are a multiple of 8, up to 0x80.
        	 * All offsets are negative, so %rdx/%rcx must point just past
        	 * the end of the src/dst data.  Entering at L(P0Qn) copies the
        	 * final 8*n bytes by falling through the remaining 8-byte moves
        	 * (n is a hex digit, G = 16), then returns.  %rax is not
        	 * modified here.
        	 */
    497 	mov    -0x80(%rdx),%r9
    498 	mov    %r9,-0x80(%rcx)
    499 L(P0QF):
    500 	mov    -0x78(%rdx),%r10
    501 	mov    %r10,-0x78(%rcx)
    502 L(P0QE):
    503 	mov    -0x70(%rdx),%r9
    504 	mov    %r9,-0x70(%rcx)
    505 L(P0QD):
    506 	mov    -0x68(%rdx),%r10
    507 	mov    %r10,-0x68(%rcx)
    508 L(P0QC):
    509 	mov    -0x60(%rdx),%r9
    510 	mov    %r9,-0x60(%rcx)
    511 L(P0QB):
    512 	mov    -0x58(%rdx),%r10
    513 	mov    %r10,-0x58(%rcx)
    514 L(P0QA):
    515 	mov    -0x50(%rdx),%r9
    516 	mov    %r9,-0x50(%rcx)
    517 L(P0Q9):
    518 	mov    -0x48(%rdx),%r10
    519 	mov    %r10,-0x48(%rcx)
    520 L(P0Q8):
    521 	mov    -0x40(%rdx),%r9
    522 	mov    %r9,-0x40(%rcx)
    523 L(P0Q7):
    524 	mov    -0x38(%rdx),%r10
    525 	mov    %r10,-0x38(%rcx)
    526 L(P0Q6):
    527 	mov    -0x30(%rdx),%r9
    528 	mov    %r9,-0x30(%rcx)
    529 L(P0Q5):
    530 	mov    -0x28(%rdx),%r10
    531 	mov    %r10,-0x28(%rcx)
    532 L(P0Q4):
    533 	mov    -0x20(%rdx),%r9
    534 	mov    %r9,-0x20(%rcx)
    535 L(P0Q3):
    536 	mov    -0x18(%rdx),%r10
    537 	mov    %r10,-0x18(%rcx)
    538 L(P0Q2):
    539 	mov    -0x10(%rdx),%r9
    540 	mov    %r9,-0x10(%rcx)
    541 L(P0Q1):
    542 	mov    -0x8(%rdx),%r10
    543 	mov    %r10,-0x8(%rcx)
    544 L(P0Q0):
    545 	ret				# nothing (more) to copy
    546 
    547 	.balign 16
    548 L(P1QF):
        	/*
        	 * Tail copy for lengths of the form 8*n + 1.  Offsets are
        	 * negative from %rdx/%rcx (just past the end of src/dst).
        	 * Entering at L(P1Qn) falls through n 8-byte moves and then
        	 * the final single byte at L(P1Q0).
        	 */
    549 	mov    -0x79(%rdx),%r9
    550 	mov    %r9,-0x79(%rcx)
    551 L(P1QE):
    552 	mov    -0x71(%rdx),%r11
    553 	mov    %r11,-0x71(%rcx)
    554 L(P1QD):
    555 	mov    -0x69(%rdx),%r10
    556 	mov    %r10,-0x69(%rcx)
    557 L(P1QC):
    558 	mov    -0x61(%rdx),%r9
    559 	mov    %r9,-0x61(%rcx)
    560 L(P1QB):
    561 	mov    -0x59(%rdx),%r11
    562 	mov    %r11,-0x59(%rcx)
    563 L(P1QA):
    564 	mov    -0x51(%rdx),%r10
    565 	mov    %r10,-0x51(%rcx)
    566 L(P1Q9):
    567 	mov    -0x49(%rdx),%r9
    568 	mov    %r9,-0x49(%rcx)
    569 L(P1Q8):
    570 	mov    -0x41(%rdx),%r11
    571 	mov    %r11,-0x41(%rcx)
    572 L(P1Q7):
    573 	mov    -0x39(%rdx),%r10
    574 	mov    %r10,-0x39(%rcx)
    575 L(P1Q6):
    576 	mov    -0x31(%rdx),%r9
    577 	mov    %r9,-0x31(%rcx)
    578 L(P1Q5):
    579 	mov    -0x29(%rdx),%r11
    580 	mov    %r11,-0x29(%rcx)
    581 L(P1Q4):
    582 	mov    -0x21(%rdx),%r10
    583 	mov    %r10,-0x21(%rcx)
    584 L(P1Q3):
    585 	mov    -0x19(%rdx),%r9
    586 	mov    %r9,-0x19(%rcx)
    587 L(P1Q2):
    588 	mov    -0x11(%rdx),%r11
    589 	mov    %r11,-0x11(%rcx)
    590 L(P1Q1):
    591 	mov    -0x9(%rdx),%r10
    592 	mov    %r10,-0x9(%rcx)
    593 L(P1Q0):
    594 	movzbq -0x1(%rdx),%r9		# final byte
    595 	mov    %r9b,-0x1(%rcx)
    596 	ret
    597 
    598 	.balign 16
    599 L(P2QF):
        	/*
        	 * Tail copy for lengths of the form 8*n + 2.  Offsets are
        	 * negative from %rdx/%rcx (just past the end of src/dst).
        	 * Entering at L(P2Qn) falls through n 8-byte moves and then
        	 * the final 2 bytes at L(P2Q0).
        	 */
    600 	mov    -0x7a(%rdx),%r9
    601 	mov    %r9,-0x7a(%rcx)
    602 L(P2QE):
    603 	mov    -0x72(%rdx),%r11
    604 	mov    %r11,-0x72(%rcx)
    605 L(P2QD):
    606 	mov    -0x6a(%rdx),%r10
    607 	mov    %r10,-0x6a(%rcx)
    608 L(P2QC):
    609 	mov    -0x62(%rdx),%r9
    610 	mov    %r9,-0x62(%rcx)
    611 L(P2QB):
    612 	mov    -0x5a(%rdx),%r11
    613 	mov    %r11,-0x5a(%rcx)
    614 L(P2QA):
    615 	mov    -0x52(%rdx),%r10
    616 	mov    %r10,-0x52(%rcx)
    617 L(P2Q9):
    618 	mov    -0x4a(%rdx),%r9
    619 	mov    %r9,-0x4a(%rcx)
    620 L(P2Q8):
    621 	mov    -0x42(%rdx),%r11
    622 	mov    %r11,-0x42(%rcx)
    623 L(P2Q7):
    624 	mov    -0x3a(%rdx),%r10
    625 	mov    %r10,-0x3a(%rcx)
    626 L(P2Q6):
    627 	mov    -0x32(%rdx),%r9
    628 	mov    %r9,-0x32(%rcx)
    629 L(P2Q5):
    630 	mov    -0x2a(%rdx),%r11
    631 	mov    %r11,-0x2a(%rcx)
    632 L(P2Q4):
    633 	mov    -0x22(%rdx),%r10
    634 	mov    %r10,-0x22(%rcx)
    635 L(P2Q3):
    636 	mov    -0x1a(%rdx),%r9
    637 	mov    %r9,-0x1a(%rcx)
    638 L(P2Q2):
    639 	mov    -0x12(%rdx),%r11
    640 	mov    %r11,-0x12(%rcx)
    641 L(P2Q1):
    642 	mov    -0xa(%rdx),%r10
    643 	mov    %r10,-0xa(%rcx)
    644 L(P2Q0):
    645 	movzwq -0x2(%rdx),%r9		# final 2 bytes
    646 	mov    %r9w,-0x2(%rcx)
    647 	ret
    648 
    649 	.balign 16
    650 L(P3QF):
        	/*
        	 * Tail copy for lengths of the form 8*n + 3.  Offsets are
        	 * negative from %rdx/%rcx (just past the end of src/dst).
        	 * Entering at L(P3Qn) falls through n 8-byte moves and then
        	 * the final 3 bytes (2 + 1) at L(P3Q0).
        	 */
    651 	mov    -0x7b(%rdx),%r9
    652 	mov    %r9,-0x7b(%rcx)
    653 L(P3QE):
    654 	mov    -0x73(%rdx),%r11
    655 	mov    %r11,-0x73(%rcx)
    656 L(P3QD):
    657 	mov    -0x6b(%rdx),%r10
    658 	mov    %r10,-0x6b(%rcx)
    659 L(P3QC):
    660 	mov    -0x63(%rdx),%r9
    661 	mov    %r9,-0x63(%rcx)
    662 L(P3QB):
    663 	mov    -0x5b(%rdx),%r11
    664 	mov    %r11,-0x5b(%rcx)
    665 L(P3QA):
    666 	mov    -0x53(%rdx),%r10
    667 	mov    %r10,-0x53(%rcx)
    668 L(P3Q9):
    669 	mov    -0x4b(%rdx),%r9
    670 	mov    %r9,-0x4b(%rcx)
    671 L(P3Q8):
    672 	mov    -0x43(%rdx),%r11
    673 	mov    %r11,-0x43(%rcx)
    674 L(P3Q7):
    675 	mov    -0x3b(%rdx),%r10
    676 	mov    %r10,-0x3b(%rcx)
    677 L(P3Q6):
    678 	mov    -0x33(%rdx),%r9
    679 	mov    %r9,-0x33(%rcx)
    680 L(P3Q5):
    681 	mov    -0x2b(%rdx),%r11
    682 	mov    %r11,-0x2b(%rcx)
    683 L(P3Q4):
    684 	mov    -0x23(%rdx),%r10
    685 	mov    %r10,-0x23(%rcx)
    686 L(P3Q3):
    687 	mov    -0x1b(%rdx),%r9
    688 	mov    %r9,-0x1b(%rcx)
    689 L(P3Q2):
    690 	mov    -0x13(%rdx),%r11
    691 	mov    %r11,-0x13(%rcx)
    692 L(P3Q1):
    693 	mov    -0xb(%rdx),%r10
    694 	mov    %r10,-0xb(%rcx)
    695 	/*
    696 	 * These trailing loads/stores have to do all their loads 1st,
    697 	 * then do the stores.
    698 	 */
    699 L(P3Q0):
    700 	movzwq -0x3(%rdx),%r9		# load 2 bytes
    701 	movzbq -0x1(%rdx),%r10		# load last byte
    702 	mov    %r9w,-0x3(%rcx)
    703 	mov    %r10b,-0x1(%rcx)
    704 	ret
    705 
    706 	.balign 16
    707 L(P4QF):
        	/*
        	 * Tail copy for lengths of the form 8*n + 4.  Offsets are
        	 * negative from %rdx/%rcx (just past the end of src/dst).
        	 * Entering at L(P4Qn) falls through n 8-byte moves and then
        	 * the final 4 bytes at L(P4Q0).
        	 */
    708 	mov    -0x7c(%rdx),%r9
    709 	mov    %r9,-0x7c(%rcx)
    710 L(P4QE):
    711 	mov    -0x74(%rdx),%r11
    712 	mov    %r11,-0x74(%rcx)
    713 L(P4QD):
    714 	mov    -0x6c(%rdx),%r10
    715 	mov    %r10,-0x6c(%rcx)
    716 L(P4QC):
    717 	mov    -0x64(%rdx),%r9
    718 	mov    %r9,-0x64(%rcx)
    719 L(P4QB):
    720 	mov    -0x5c(%rdx),%r11
    721 	mov    %r11,-0x5c(%rcx)
    722 L(P4QA):
    723 	mov    -0x54(%rdx),%r10
    724 	mov    %r10,-0x54(%rcx)
    725 L(P4Q9):
    726 	mov    -0x4c(%rdx),%r9
    727 	mov    %r9,-0x4c(%rcx)
    728 L(P4Q8):
    729 	mov    -0x44(%rdx),%r11
    730 	mov    %r11,-0x44(%rcx)
    731 L(P4Q7):
    732 	mov    -0x3c(%rdx),%r10
    733 	mov    %r10,-0x3c(%rcx)
    734 L(P4Q6):
    735 	mov    -0x34(%rdx),%r9
    736 	mov    %r9,-0x34(%rcx)
    737 L(P4Q5):
    738 	mov    -0x2c(%rdx),%r11
    739 	mov    %r11,-0x2c(%rcx)
    740 L(P4Q4):
    741 	mov    -0x24(%rdx),%r10
    742 	mov    %r10,-0x24(%rcx)
    743 L(P4Q3):
    744 	mov    -0x1c(%rdx),%r9
    745 	mov    %r9,-0x1c(%rcx)
    746 L(P4Q2):
    747 	mov    -0x14(%rdx),%r11
    748 	mov    %r11,-0x14(%rcx)
    749 L(P4Q1):
    750 	mov    -0xc(%rdx),%r10
    751 	mov    %r10,-0xc(%rcx)
    752 L(P4Q0):
    753 	mov    -0x4(%rdx),%r9d		# final 4 bytes
    754 	mov    %r9d,-0x4(%rcx)
    755 	ret
    756 
    757 	.balign 16
    758 L(P5QF):
        	/*
        	 * Tail copy for lengths of the form 8*n + 5.  Offsets are
        	 * negative from %rdx/%rcx (just past the end of src/dst).
        	 * Entering at L(P5Qn) falls through n 8-byte moves and then
        	 * the final 5 bytes (4 + 1) at L(P5Q0).
        	 */
    759 	mov    -0x7d(%rdx),%r9
    760 	mov    %r9,-0x7d(%rcx)
    761 L(P5QE):
    762 	mov    -0x75(%rdx),%r11
    763 	mov    %r11,-0x75(%rcx)
    764 L(P5QD):
    765 	mov    -0x6d(%rdx),%r10
    766 	mov    %r10,-0x6d(%rcx)
    767 L(P5QC):
    768 	mov    -0x65(%rdx),%r9
    769 	mov    %r9,-0x65(%rcx)
    770 L(P5QB):
    771 	mov    -0x5d(%rdx),%r11
    772 	mov    %r11,-0x5d(%rcx)
    773 L(P5QA):
    774 	mov    -0x55(%rdx),%r10
    775 	mov    %r10,-0x55(%rcx)
    776 L(P5Q9):
    777 	mov    -0x4d(%rdx),%r9
    778 	mov    %r9,-0x4d(%rcx)
    779 L(P5Q8):
    780 	mov    -0x45(%rdx),%r11
    781 	mov    %r11,-0x45(%rcx)
    782 L(P5Q7):
    783 	mov    -0x3d(%rdx),%r10
    784 	mov    %r10,-0x3d(%rcx)
    785 L(P5Q6):
    786 	mov    -0x35(%rdx),%r9
    787 	mov    %r9,-0x35(%rcx)
    788 L(P5Q5):
    789 	mov    -0x2d(%rdx),%r11
    790 	mov    %r11,-0x2d(%rcx)
    791 L(P5Q4):
    792 	mov    -0x25(%rdx),%r10
    793 	mov    %r10,-0x25(%rcx)
    794 L(P5Q3):
    795 	mov    -0x1d(%rdx),%r9
    796 	mov    %r9,-0x1d(%rcx)
    797 L(P5Q2):
    798 	mov    -0x15(%rdx),%r11
    799 	mov    %r11,-0x15(%rcx)
    800 L(P5Q1):
    801 	mov    -0xd(%rdx),%r10
    802 	mov    %r10,-0xd(%rcx)
    803 	/*
    804 	 * These trailing loads/stores have to do all their loads 1st,
    805 	 * then do the stores.
    806 	 */
    807 L(P5Q0):
    808 	mov    -0x5(%rdx),%r9d		# load 4 bytes
    809 	movzbq -0x1(%rdx),%r10		# load last byte
    810 	mov    %r9d,-0x5(%rcx)
    811 	mov    %r10b,-0x1(%rcx)
    812 	ret
    813 
    814 	.balign 16
    815 L(P6QF):
        	/*
        	 * Tail copy for lengths of the form 8*n + 6.  Offsets are
        	 * negative from %rdx/%rcx (just past the end of src/dst).
        	 * Entering at L(P6Qn) falls through n 8-byte moves and then
        	 * the final 6 bytes (4 + 2) at L(P6Q0).
        	 */
    816 	mov    -0x7e(%rdx),%r9
    817 	mov    %r9,-0x7e(%rcx)
    818 L(P6QE):
    819 	mov    -0x76(%rdx),%r11
    820 	mov    %r11,-0x76(%rcx)
    821 L(P6QD):
    822 	mov    -0x6e(%rdx),%r10
    823 	mov    %r10,-0x6e(%rcx)
    824 L(P6QC):
    825 	mov    -0x66(%rdx),%r9
    826 	mov    %r9,-0x66(%rcx)
    827 L(P6QB):
    828 	mov    -0x5e(%rdx),%r11
    829 	mov    %r11,-0x5e(%rcx)
    830 L(P6QA):
    831 	mov    -0x56(%rdx),%r10
    832 	mov    %r10,-0x56(%rcx)
    833 L(P6Q9):
    834 	mov    -0x4e(%rdx),%r9
    835 	mov    %r9,-0x4e(%rcx)
    836 L(P6Q8):
    837 	mov    -0x46(%rdx),%r11
    838 	mov    %r11,-0x46(%rcx)
    839 L(P6Q7):
    840 	mov    -0x3e(%rdx),%r10
    841 	mov    %r10,-0x3e(%rcx)
    842 L(P6Q6):
    843 	mov    -0x36(%rdx),%r9
    844 	mov    %r9,-0x36(%rcx)
    845 L(P6Q5):
    846 	mov    -0x2e(%rdx),%r11
    847 	mov    %r11,-0x2e(%rcx)
    848 L(P6Q4):
    849 	mov    -0x26(%rdx),%r10
    850 	mov    %r10,-0x26(%rcx)
    851 L(P6Q3):
    852 	mov    -0x1e(%rdx),%r9
    853 	mov    %r9,-0x1e(%rcx)
    854 L(P6Q2):
    855 	mov    -0x16(%rdx),%r11
    856 	mov    %r11,-0x16(%rcx)
    857 L(P6Q1):
    858 	mov    -0xe(%rdx),%r10
    859 	mov    %r10,-0xe(%rcx)
    860 	/*
    861 	 * These trailing loads/stores have to do all their loads 1st,
    862 	 * then do the stores.
    863 	 */
    864 L(P6Q0):
    865 	mov    -0x6(%rdx),%r9d		# load 4 bytes
    866 	movzwq -0x2(%rdx),%r10		# load last 2 bytes
    867 	mov    %r9d,-0x6(%rcx)
    868 	mov    %r10w,-0x2(%rcx)
    869 	ret
    870 
    871 	.balign 16
    872 L(P7QF):
        	/*
        	 * Tail copy for lengths of the form 8*n + 7.  Offsets are
        	 * negative from %rdx/%rcx (just past the end of src/dst).
        	 * Entering at L(P7Qn) falls through n 8-byte moves and then
        	 * the final 7 bytes (4 + 2 + 1) at L(P7Q0).
        	 */
    873 	mov    -0x7f(%rdx),%r9
    874 	mov    %r9,-0x7f(%rcx)
    875 L(P7QE):
    876 	mov    -0x77(%rdx),%r11
    877 	mov    %r11,-0x77(%rcx)
    878 L(P7QD):
    879 	mov    -0x6f(%rdx),%r10
    880 	mov    %r10,-0x6f(%rcx)
    881 L(P7QC):
    882 	mov    -0x67(%rdx),%r9
    883 	mov    %r9,-0x67(%rcx)
    884 L(P7QB):
    885 	mov    -0x5f(%rdx),%r11
    886 	mov    %r11,-0x5f(%rcx)
    887 L(P7QA):
    888 	mov    -0x57(%rdx),%r10
    889 	mov    %r10,-0x57(%rcx)
    890 L(P7Q9):
    891 	mov    -0x4f(%rdx),%r9
    892 	mov    %r9,-0x4f(%rcx)
    893 L(P7Q8):
    894 	mov    -0x47(%rdx),%r11
    895 	mov    %r11,-0x47(%rcx)
    896 L(P7Q7):
    897 	mov    -0x3f(%rdx),%r10
    898 	mov    %r10,-0x3f(%rcx)
    899 L(P7Q6):
    900 	mov    -0x37(%rdx),%r9
    901 	mov    %r9,-0x37(%rcx)
    902 L(P7Q5):
    903 	mov    -0x2f(%rdx),%r11
    904 	mov    %r11,-0x2f(%rcx)
    905 L(P7Q4):
    906 	mov    -0x27(%rdx),%r10
    907 	mov    %r10,-0x27(%rcx)
    908 L(P7Q3):
    909 	mov    -0x1f(%rdx),%r9
    910 	mov    %r9,-0x1f(%rcx)
    911 L(P7Q2):
    912 	mov    -0x17(%rdx),%r11
    913 	mov    %r11,-0x17(%rcx)
    914 L(P7Q1):
    915 	mov    -0xf(%rdx),%r10
    916 	mov    %r10,-0xf(%rcx)
    917 	/*
    918 	 * These trailing loads/stores have to do all their loads 1st,
    919 	 * then do the stores.
    920 	 */
    921 L(P7Q0):
    922 	mov    -0x7(%rdx),%r9d		# load 4 bytes
    923 	movzwq -0x3(%rdx),%r10		# load 2 bytes
    924 	movzbq -0x1(%rdx),%r11		# load last byte
    925 	mov    %r9d,-0x7(%rcx)
    926 	mov    %r10w,-0x3(%rcx)
    927 	mov    %r11b,-0x1(%rcx)
    928 	ret
    929 
    930 	.balign 16
    930 L(ck_use_sse2):
    931 	/*
    932 	 * Align dest to 16 byte boundary.
    933 	 */
    934 	test   $0xf,%rcx		# rcx = dst cursor
    935 	jnz    L(ShrtAlignNew)		# unaligned: copy 1..15 bytes, then re-enter below
    936 
    937 L(now_qw_aligned):
        	/*
        	 * From here on %rcx (dst) is 16-byte aligned, %rdx is the src
        	 * cursor and %r8 the number of bytes still to copy.
        	 */
    938 	cmpl   $NO_SSE,.memops_method(%rip)
    939 	je     L(Loop8byte_pre)
    940 
    941 	/*
    942 	 * The fall-through path is to do SSE2 16-byte load/stores
    943 	 */
    944 
    945 	/*
    946 	 * If current move size is larger than half of the highest level cache
    947 	 * size, then do non-temporal moves.
    948 	 */
    949 	mov    .largest_level_cache_size(%rip),%r9d	# 32-bit load, zero-extended into r9
    950 	shr    %r9		# take half of it
    951 	cmp    %r9,%r8
    952 	jg     L(sse2_nt_move)		# NOTE(review): signed branch on a byte count; confirm ja is not intended
    953 
    954 	/*
    955 	 * If both the source and dest are aligned, then use the both aligned
    956 	 * logic. Well aligned data should reap the rewards.
    957 	 */
    958 	test   $0xf,%rdx
    959 	jz     L(pre_both_aligned)
    960 
    961 	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
    962 	testl  $USE_SSSE3,.memops_method(%rip)
    963 	jz     1f
    964 	lea    L(SSSE3_src)(%rip),%r10		# SSSE3
    965 
    966 1:
    967 	/*
    968 	 * if the src is not 16 byte aligned...
        	 *
        	 * Copy the first 16 bytes with an unaligned load, then round
        	 * the src cursor up to the next 16-byte boundary and preload
        	 * that aligned block into %xmm1.  The src cursor then trails
        	 * the data position matching %rcx by (src & 0xf) bytes; the
        	 * routine selected from the table in %r10 shifts and merges
        	 * neighbouring aligned blocks to compensate.
    969 	 */
    970 	mov    %rdx,%r11
    971 	and    $0xf,%r11		# r11 = src & 0xf (1..15 here), table index
    972 	movdqu (%rdx),%xmm0		# unaligned 16-byte load from src
    973 	movdqa %xmm0,(%rcx)		# aligned 16-byte store to dst
    974 	add    $0x10,%rdx
    975 	sub    %r11,%rdx		# rdx = next 16-byte boundary above old src
    976 	add    $0x10,%rcx		# dst += 16
    977 	sub    $0x10,%r8		# len -= 16
    978 	movdqa (%rdx),%xmm1		# preload aligned src block for the merge loop
    979 
    980 	movslq (%r10,%r11,4),%r9	# r9 = signed 32-bit offset of the routine
    981 	lea    (%r9,%r10,1),%r10	# r10 = table base + offset
    982 	jmpq   *%r10
    984 
    985 	    .balign 16
    986 L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)	# index = src & 0xf; 32-bit offsets from the table base
    987 	    .int        L(mov3dqa1) -L(SSSE3_src)
    988 	    .int        L(mov3dqa2) -L(SSSE3_src)
    989 	    .int        L(mov3dqa3) -L(SSSE3_src)
    990 	    .int        L(mov3dqa4) -L(SSSE3_src)
    991 	    .int        L(mov3dqa5) -L(SSSE3_src)
    992 	    .int        L(mov3dqa6) -L(SSSE3_src)
    993 	    .int        L(mov3dqa7) -L(SSSE3_src)
    994 	    .int        L(movdqa8)  -L(SSSE3_src)	# offset 8 uses the same movdqa8 routine as the SSE_src table
    995 	    .int        L(mov3dqa9) -L(SSSE3_src)
    996 	    .int        L(mov3dqa10)-L(SSSE3_src)
    997 	    .int        L(mov3dqa11)-L(SSSE3_src)
    998 	    .int        L(mov3dqa12)-L(SSSE3_src)
    999 	    .int        L(mov3dqa13)-L(SSSE3_src)
   1000 	    .int        L(mov3dqa14)-L(SSSE3_src)
   1001 	    .int        L(mov3dqa15)-L(SSSE3_src)
   1002 L(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)	# index = src & 0xf; 32-bit offsets from the table base
   1003 	    .int        L(movdqa1) -L(SSE_src)	# movdqaN handles a src misalignment of N bytes
   1004 	    .int        L(movdqa2) -L(SSE_src)
   1005 	    .int        L(movdqa3) -L(SSE_src)
   1006 	    .int        L(movdqa4) -L(SSE_src)
   1007 	    .int        L(movdqa5) -L(SSE_src)
   1008 	    .int        L(movdqa6) -L(SSE_src)
   1009 	    .int        L(movdqa7) -L(SSE_src)
   1010 	    .int        L(movdqa8) -L(SSE_src)
   1011 	    .int        L(movdqa9) -L(SSE_src)
   1012 	    .int        L(movdqa10)-L(SSE_src)
   1013 	    .int        L(movdqa11)-L(SSE_src)
   1014 	    .int        L(movdqa12)-L(SSE_src)
   1015 	    .int        L(movdqa13)-L(SSE_src)
   1016 	    .int        L(movdqa14)-L(SSE_src)
   1017 	    .int        L(movdqa15)-L(SSE_src)
   1018 
   1019 	.balign 16
   1019 L(movdqa1):
        	/*
        	 * SSE2 merge loop for src & 0xf == 1, 32 bytes per iteration.
        	 * On entry %xmm1 holds the aligned src block at (%rdx) and
        	 * %rcx is 16-byte aligned.  Each 16-byte output is
        	 * (previous block >> 1 byte) | (next block << 15 bytes).
        	 * Loops while at least 0x20 bytes remain, then continues at
        	 * L(movdqa_epi).
        	 */
   1020 	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
   1021 	movdqa 0x20(%rdx),%xmm0 # load the source buffer after that
   1022 	lea    0x20(%rdx),%rdx		# src += 32
   1023 	lea    -0x20(%r8),%r8		# len -= 32
   1024 
   1025 	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
   1026 	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
   1027 	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
   1028 	por    %xmm1,%xmm3 # OR them together
   1029 	cmp    $0x20,%r8		# flags are consumed by the jge below; the SSE ops, mov and lea in between leave them alone
   1030 
   1031 	psrldq $0x1,%xmm2  # shift right prev buffer (saved from last iteration)
   1032 	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
   1033 	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
   1034 	por    %xmm2,%xmm0 # OR them together
   1035 	movdqa %xmm3,(%rcx)     # store it
   1036 	movdqa %xmm0,0x10(%rcx) # store it
   1037 	lea    0x20(%rcx),%rcx		# dst += 32
   1038 
   1039 	jge    L(movdqa1)		# at least 32 bytes left: go around again
   1040 	jmp    L(movdqa_epi)
   1042 
   1043 	.balign 16
	/*
	 * SSE2 copy loop for shift 2: every 16-byte store is bytes 2..15 of
	 * the previous aligned source block followed by bytes 0..1 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1044 L(movdqa2):
   1045 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1046 	movdqa 0x10(%rdx),%xmm3
   1047 	movdqa 0x20(%rdx),%xmm0
   1048 	add    $0x20,%rdx
   1049 
   1050 	psrldq $0x2,%xmm1
   1051 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1052 	pslldq $0xe,%xmm3
   1053 	por    %xmm1,%xmm3
   1054 
   1055 	psrldq $0x2,%xmm2
   1056 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1057 	pslldq $0xe,%xmm0
   1058 	por    %xmm2,%xmm0
   1059 	movdqa %xmm3,(%rcx)
   1060 	movdqa %xmm0,0x10(%rcx)
   1061 
   1062 	add    $0x20,%rcx
   1063 	cmp    $0x20,%r8
   1064 	jge    L(movdqa2)
   1065 	jmp    L(movdqa_epi)
   1066 
   1067 	.balign 16
	/*
	 * SSE2 copy loop for shift 3: every 16-byte store is bytes 3..15 of
	 * the previous aligned source block followed by bytes 0..2 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1068 L(movdqa3):
   1069 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1070 	movdqa 0x10(%rdx),%xmm3
   1071 	movdqa 0x20(%rdx),%xmm0
   1072 	add    $0x20,%rdx
   1073 
   1074 	psrldq $0x3,%xmm1
   1075 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1076 	pslldq $0xd,%xmm3
   1077 	por    %xmm1,%xmm3
   1078 
   1079 	psrldq $0x3,%xmm2
   1080 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1081 	pslldq $0xd,%xmm0
   1082 	por    %xmm2,%xmm0
   1083 	movdqa %xmm3,(%rcx)
   1084 	movdqa %xmm0,0x10(%rcx)
   1085 
   1086 	add    $0x20,%rcx
   1087 	cmp    $0x20,%r8
   1088 	jge    L(movdqa3)
   1089 	jmp    L(movdqa_epi)
   1090 
   1091 	.balign 16
	/*
	 * SSE2 copy loop for shift 4: every 16-byte store is bytes 4..15 of
	 * the previous aligned source block followed by bytes 0..3 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1092 L(movdqa4):
   1093 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1094 	movdqa 0x10(%rdx),%xmm3
   1095 	movdqa 0x20(%rdx),%xmm0
   1096 	add    $0x20,%rdx
   1097 
   1098 	psrldq $0x4,%xmm1
   1099 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1100 	pslldq $0xc,%xmm3
   1101 	por    %xmm1,%xmm3
   1102 
   1103 	psrldq $0x4,%xmm2
   1104 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1105 	pslldq $0xc,%xmm0
   1106 	por    %xmm2,%xmm0
   1107 
   1108 	movdqa %xmm3,(%rcx)
   1109 	movdqa %xmm0,0x10(%rcx)
   1110 
   1111 	add    $0x20,%rcx
   1112 	cmp    $0x20,%r8
   1113 	jge    L(movdqa4)
   1114 	jmp    L(movdqa_epi)
   1115 
   1116 	.balign 16
	/*
	 * SSE2 copy loop for shift 5: every 16-byte store is bytes 5..15 of
	 * the previous aligned source block followed by bytes 0..4 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1117 L(movdqa5):
   1118 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1119 	movdqa 0x10(%rdx),%xmm3
   1120 	movdqa 0x20(%rdx),%xmm0
   1121 	add    $0x20,%rdx
   1122 
   1123 	psrldq $0x5,%xmm1
   1124 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1125 	pslldq $0xb,%xmm3
   1126 	por    %xmm1,%xmm3
   1127 
   1128 	psrldq $0x5,%xmm2
   1129 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1130 	pslldq $0xb,%xmm0
   1131 	por    %xmm2,%xmm0
   1132 
   1133 	movdqa %xmm3,(%rcx)
   1134 	movdqa %xmm0,0x10(%rcx)
   1135 
   1136 	add    $0x20,%rcx
   1137 	cmp    $0x20,%r8
   1138 	jge    L(movdqa5)
   1139 	jmp    L(movdqa_epi)
   1140 
   1141 	.balign 16
	/*
	 * SSE2 copy loop for shift 6: every 16-byte store is bytes 6..15 of
	 * the previous aligned source block followed by bytes 0..5 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1142 L(movdqa6):
   1143 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1144 	movdqa 0x10(%rdx),%xmm3
   1145 	movdqa 0x20(%rdx),%xmm0
   1146 	add    $0x20,%rdx
   1147 
   1148 	psrldq $0x6,%xmm1
   1149 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1150 	pslldq $0xa,%xmm3
   1151 	por    %xmm1,%xmm3
   1152 
   1153 	psrldq $0x6,%xmm2
   1154 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1155 	pslldq $0xa,%xmm0
   1156 	por    %xmm2,%xmm0
   1157 	movdqa %xmm3,(%rcx)
   1158 	movdqa %xmm0,0x10(%rcx)
   1159 
   1160 	add    $0x20,%rcx
   1161 	cmp    $0x20,%r8
   1162 	jge    L(movdqa6)
   1163 	jmp    L(movdqa_epi)
   1164 
   1165 	.balign 16
	/*
	 * SSE2 copy loop for shift 7: every 16-byte store is bytes 7..15 of
	 * the previous aligned source block followed by bytes 0..6 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1166 L(movdqa7):
   1167 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1168 	movdqa 0x10(%rdx),%xmm3
   1169 	movdqa 0x20(%rdx),%xmm0
   1170 	add    $0x20,%rdx
   1171 
   1172 	psrldq $0x7,%xmm1
   1173 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1174 	pslldq $0x9,%xmm3
   1175 	por    %xmm1,%xmm3
   1176 
   1177 	psrldq $0x7,%xmm2
   1178 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1179 	pslldq $0x9,%xmm0
   1180 	por    %xmm2,%xmm0
   1181 	movdqa %xmm3,(%rcx)
   1182 	movdqa %xmm0,0x10(%rcx)
   1183 
   1184 	add    $0x20,%rcx
   1185 	cmp    $0x20,%r8
   1186 	jge    L(movdqa7)
   1187 	jmp    L(movdqa_epi)
   1188 
   1189 	.balign 16
	/*
	 * Copy loop for shift 8, referenced from both jump tables.  With an
	 * 8-byte shift each output block is the high quadword of the previous
	 * aligned source block followed by the low quadword of the next one,
	 * which a single shufpd $0x1 produces (no shift/OR pair needed).
	 * %xmm1 holds the previous block on entry and is refreshed for the
	 * next pass.  Each pass moves 0x30 bytes and repeats while %r8
	 * (bytes left) is >= 0x30 (signed).
	 */
   1190 L(movdqa8):
   1191 	movdqa 0x10(%rdx),%xmm3
   1192 	sub    $0x30,%r8	# account for the 0x30 bytes this pass copies
   1193 	movdqa 0x20(%rdx),%xmm0
   1194 	movdqa 0x30(%rdx),%xmm5
   1195 	lea    0x30(%rdx),%rdx
   1196 
   1197 	shufpd $0x1,%xmm3,%xmm1	# xmm1 = { xmm1.high, xmm3.low }
   1198 	movdqa %xmm1,(%rcx)
   1199 
   1200 	cmp    $0x30,%r8	# flags are consumed by the jge at the loop bottom
   1201 
   1202 	shufpd $0x1,%xmm0,%xmm3	# xmm3 = { xmm3.high, xmm0.low }
   1203 	movdqa %xmm3,0x10(%rcx)
   1204 
   1205 	movdqa %xmm5,%xmm1	# carried into the next pass as the previous block
   1206 	shufpd $0x1,%xmm5,%xmm0	# xmm0 = { xmm0.high, xmm5.low }
   1207 	movdqa %xmm0,0x20(%rcx)
   1208 
   1209 	lea    0x30(%rcx),%rcx
   1210 
   1211 	jge    L(movdqa8)
   1212 	jmp    L(movdqa_epi)
   1213 
   1214 	.balign 16
	/*
	 * SSE2 copy loop for shift 9: every 16-byte store is bytes 9..15 of
	 * the previous aligned source block followed by bytes 0..8 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1215 L(movdqa9):
   1216 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1217 	movdqa 0x10(%rdx),%xmm3
   1218 	movdqa 0x20(%rdx),%xmm0
   1219 	add    $0x20,%rdx
   1220 
   1221 	psrldq $0x9,%xmm1
   1222 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1223 	pslldq $0x7,%xmm3
   1224 	por    %xmm1,%xmm3
   1225 
   1226 	psrldq $0x9,%xmm2
   1227 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1228 	pslldq $0x7,%xmm0
   1229 	por    %xmm2,%xmm0
   1230 	movdqa %xmm3,(%rcx)
   1231 	movdqa %xmm0,0x10(%rcx)
   1232 
   1233 	add    $0x20,%rcx
   1234 	cmp    $0x20,%r8
   1235 	jge    L(movdqa9)
   1236 	jmp    L(movdqa_epi)
   1237 
   1238 	.balign 16
	/*
	 * SSE2 copy loop for shift 10: every 16-byte store is bytes 10..15 of
	 * the previous aligned source block followed by bytes 0..9 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1239 L(movdqa10):
   1240 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1241 	movdqa 0x10(%rdx),%xmm3
   1242 	movdqa 0x20(%rdx),%xmm0
   1243 	add    $0x20,%rdx
   1244 
   1245 	psrldq $0xa,%xmm1
   1246 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1247 	pslldq $0x6,%xmm3
   1248 	por    %xmm1,%xmm3
   1249 
   1250 	psrldq $0xa,%xmm2
   1251 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1252 	pslldq $0x6,%xmm0
   1253 	por    %xmm2,%xmm0
   1254 	movdqa %xmm3,(%rcx)
   1255 	movdqa %xmm0,0x10(%rcx)
   1256 
   1257 	add    $0x20,%rcx
   1258 	cmp    $0x20,%r8
   1259 	jge    L(movdqa10)
   1260 	jmp    L(movdqa_epi)
   1261 
   1262 	.balign 16
	/*
	 * SSE2 copy loop for shift 11: every 16-byte store is bytes 11..15 of
	 * the previous aligned source block followed by bytes 0..10 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1263 L(movdqa11):
   1264 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1265 	movdqa 0x10(%rdx),%xmm3
   1266 	movdqa 0x20(%rdx),%xmm0
   1267 	add    $0x20,%rdx
   1268 
   1269 	psrldq $0xb,%xmm1
   1270 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1271 	pslldq $0x5,%xmm3
   1272 	por    %xmm1,%xmm3
   1273 
   1274 	psrldq $0xb,%xmm2
   1275 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1276 	pslldq $0x5,%xmm0
   1277 	por    %xmm2,%xmm0
   1278 	movdqa %xmm3,(%rcx)
   1279 	movdqa %xmm0,0x10(%rcx)
   1280 
   1281 	add    $0x20,%rcx
   1282 	cmp    $0x20,%r8
   1283 	jge    L(movdqa11)
   1284 	jmp    L(movdqa_epi)
   1285 
   1286 	.balign 16
	/*
	 * SSE2 copy loop for shift 12: every 16-byte store is bytes 12..15 of
	 * the previous aligned source block followed by bytes 0..11 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1287 L(movdqa12):
   1288 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1289 	movdqa 0x10(%rdx),%xmm3
   1290 	movdqa 0x20(%rdx),%xmm0
   1291 	add    $0x20,%rdx
   1292 
   1293 	psrldq $0xc,%xmm1
   1294 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1295 	pslldq $0x4,%xmm3
   1296 	por    %xmm1,%xmm3
   1297 
   1298 	psrldq $0xc,%xmm2
   1299 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1300 	pslldq $0x4,%xmm0
   1301 	por    %xmm2,%xmm0
   1302 	movdqa %xmm3,(%rcx)
   1303 	movdqa %xmm0,0x10(%rcx)
   1304 
   1305 	add    $0x20,%rcx
   1306 	cmp    $0x20,%r8
   1307 	jge    L(movdqa12)
   1308 	jmp    L(movdqa_epi)
   1309 
   1310 	.balign 16
	/*
	 * SSE2 copy loop for shift 13: every 16-byte store is bytes 13..15 of
	 * the previous aligned source block followed by bytes 0..12 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1311 L(movdqa13):
   1312 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1313 	movdqa 0x10(%rdx),%xmm3
   1314 	movdqa 0x20(%rdx),%xmm0
   1315 	add    $0x20,%rdx
   1316 
   1317 	psrldq $0xd,%xmm1
   1318 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1319 	pslldq $0x3,%xmm3
   1320 	por    %xmm1,%xmm3
   1321 
   1322 	psrldq $0xd,%xmm2
   1323 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1324 	pslldq $0x3,%xmm0
   1325 	por    %xmm2,%xmm0
   1326 	movdqa %xmm3,(%rcx)
   1327 	movdqa %xmm0,0x10(%rcx)
   1328 
   1329 	add    $0x20,%rcx
   1330 	cmp    $0x20,%r8
   1331 	jge    L(movdqa13)
   1332 	jmp    L(movdqa_epi)
   1333 
   1334 	.balign 16
	/*
	 * SSE2 copy loop for shift 14: every 16-byte store is bytes 14..15 of
	 * the previous aligned source block followed by bytes 0..13 of the
	 * next one (psrldq/pslldq/por).  %xmm1 holds the previous block on
	 * entry and is refreshed for the next pass.  Each pass moves 0x20
	 * bytes and repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1335 L(movdqa14):
   1336 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1337 	movdqa 0x10(%rdx),%xmm3
   1338 	movdqa 0x20(%rdx),%xmm0
   1339 	add    $0x20,%rdx
   1340 
   1341 	psrldq $0xe,%xmm1
   1342 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1343 	pslldq $0x2,%xmm3
   1344 	por    %xmm1,%xmm3
   1345 
   1346 	psrldq $0xe,%xmm2
   1347 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1348 	pslldq $0x2,%xmm0
   1349 	por    %xmm2,%xmm0
   1350 	movdqa %xmm3,(%rcx)
   1351 	movdqa %xmm0,0x10(%rcx)
   1352 
   1353 	add    $0x20,%rcx
   1354 	cmp    $0x20,%r8
   1355 	jge    L(movdqa14)
   1356 	jmp    L(movdqa_epi)
   1357 
   1358 	.balign 16
	/*
	 * SSE2 copy loop for shift 15: every 16-byte store is byte 15 of the
	 * previous aligned source block followed by bytes 0..14 of the next
	 * one (psrldq/pslldq/por).  %xmm1 holds the previous block on entry
	 * and is refreshed for the next pass.  Each pass moves 0x20 bytes and
	 * repeats while %r8 (bytes left) is >= 0x20 (signed).
	 */
   1359 L(movdqa15):
   1360 	sub    $0x20,%r8	# account for the 0x20 bytes this pass copies
   1361 	movdqa 0x10(%rdx),%xmm3
   1362 	movdqa 0x20(%rdx),%xmm0
   1363 	add    $0x20,%rdx
   1364 
   1365 	psrldq $0xf,%xmm1
   1366 	movdqa %xmm3,%xmm2	# unshifted copy, needed for the second store
   1367 	pslldq $0x1,%xmm3
   1368 	por    %xmm1,%xmm3
   1369 
   1370 	psrldq $0xf,%xmm2
   1371 	movdqa %xmm0,%xmm1	# carried into the next pass as the previous block
   1372 	pslldq $0x1,%xmm0
   1373 	por    %xmm2,%xmm0
   1374 	movdqa %xmm3,(%rcx)
   1375 	movdqa %xmm0,0x10(%rcx)
   1376 
   1377 	add    $0x20,%rcx
   1378 	cmp    $0x20,%r8
   1379 	jge    L(movdqa15)
	/* No jump needed: execution falls through into L(movdqa_epi). */
   1380 	#jmp   L(movdqa_epi)
   1381 
   1382 	.balign 16
	/*
	 * Common exit for the realigning copy loops above.  Adds %r11 back
	 * onto the load pointer %rdx (the dispatcher subtracted it so the
	 * loops could use aligned loads), then advances both pointers by the
	 * remaining count in %r8 and dispatches through the L(fwdPxQx) table
	 * of 32-bit offsets, indexed by %r8.
	 * NOTE(review): L(fwdPxQx) is defined elsewhere in the file;
	 * presumably its targets copy the final %r8 bytes using negative
	 * offsets from the advanced pointers - confirm against the table.
	 */
   1383 L(movdqa_epi):
   1384 	lea    L(fwdPxQx)(%rip),%r10
   1385 	add    %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop)
   1386 	add    %r8,%rcx
   1387 	add    %r8,%rdx
   1388 
   1389 	movslq (%r10,%r8,4),%r9	# sign-extended offset of the handler for %r8
   1390 	lea    (%r9,%r10,1),%r10	# offset + table base = handler address
   1391 	jmpq   *%r10
   1392 
   1393 	.balign 16
	/*
	 * SSSE3 copy loop for shift 1.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 1..15 of the previous aligned
	 * source block followed by byte 0 of the next one.  %xmm1 holds the
	 * previous block on entry and is refreshed for the next pass.  Each
	 * pass moves 0x30 bytes and repeats while %r8 (bytes left) is >= 0x30
	 * (signed).  Afterwards up to two more 16-byte blocks are copied
	 * before the byte tail is handed to L(movdqa_epi).
	 */
   1394 L(mov3dqa1):
   1395 	movdqa	0x10(%rdx),%xmm3 # load the upper source buffer
   1396 	sub	$0x30,%r8
   1397 	movdqa	0x20(%rdx),%xmm0 # load the upper source buffer
   1398 	movdqa	0x30(%rdx),%xmm5 # load the upper source buffer
   1399 	lea	0x30(%rdx),%rdx
   1400 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1401 
   1402 	movdqa	%xmm3,%xmm2       # unmodified copy, source for the second concat
   1403 	#palignr	$0x1,%xmm1,%xmm3
   1404 	.byte	0x66,0x0f,0x3a,0x0f
   1405 	.byte	0xd9,0x01
   1406 	movdqa	%xmm3,(%rcx)      # store it
   1407 
   1408 	movdqa	%xmm0,%xmm4       # unmodified copy, source for the third concat
   1409 	#palignr	$0x1,%xmm2,%xmm0
   1410 	.byte	0x66,0x0f,0x3a,0x0f
   1411 	.byte	0xc2,0x01
   1412 	movdqa	%xmm0,0x10(%rcx)  # store it
   1413 
   1414 	movdqa	%xmm5,%xmm1       # store off xmm reg for use next iteration
   1415 	#palignr	$0x1,%xmm4,%xmm5
   1416 	.byte	0x66,0x0f,0x3a,0x0f
   1417 	.byte	0xec,0x01
   1418 	movdqa	%xmm5,0x20(%rcx)  # store it
   1419 
   1420 	lea	0x30(%rcx),%rcx
   1421 	jge	L(mov3dqa1)
   1422 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1423 	cmp	$0x10,%r8
   1424 	jl	L(movdqa_epi)
   1425 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1426 	sub	$0x10,%r8
   1427 	lea	0x10(%rdx),%rdx
   1428 	movdqa	%xmm3,%xmm2		# save for use next concat
   1429 	#palignr	$0x1,%xmm1,%xmm3
   1430 	.byte	0x66,0x0f,0x3a,0x0f
   1431 	.byte	0xd9,0x01
   1432 
   1433 	cmp	$0x10,%r8
   1434 	movdqa	%xmm3,(%rcx)      	# store it
   1435 	lea	0x10(%rcx),%rcx
   1436 	jl	L(movdqa_epi)
   1437 
   1438 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1439 	sub	$0x10,%r8
   1440 	lea	0x10(%rdx),%rdx
   1441 	#palignr	$0x1,%xmm2,%xmm0
   1442 	.byte	0x66,0x0f,0x3a,0x0f
   1443 	.byte	0xc2,0x01
   1444 	movdqa	%xmm0,(%rcx)      	# store it
   1445 	lea	0x10(%rcx),%rcx
   1446 	jmp	L(movdqa_epi)
   1447 
   1448 	.balign 16
	/*
	 * SSSE3 copy loop for shift 2.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 2..15 of the previous aligned
	 * source block followed by bytes 0..1 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1449 L(mov3dqa2):
   1450 	movdqa	0x10(%rdx),%xmm3
   1451 	sub	$0x30,%r8
   1452 	movdqa	0x20(%rdx),%xmm0
   1453 	movdqa	0x30(%rdx),%xmm5
   1454 	lea	0x30(%rdx),%rdx
   1455 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1456 
   1457 	movdqa	%xmm3,%xmm2
   1458 	#palignr	$0x2,%xmm1,%xmm3
   1459 	.byte	0x66,0x0f,0x3a,0x0f
   1460 	.byte	0xd9,0x02
   1461 	movdqa	%xmm3,(%rcx)
   1462 
   1463 	movdqa	%xmm0,%xmm4
   1464 	#palignr	$0x2,%xmm2,%xmm0
   1465 	.byte	0x66,0x0f,0x3a,0x0f
   1466 	.byte	0xc2,0x02
   1467 	movdqa	%xmm0,0x10(%rcx)
   1468 
   1469 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1470 	#palignr	$0x2,%xmm4,%xmm5
   1471 	.byte	0x66,0x0f,0x3a,0x0f
   1472 	.byte	0xec,0x02
   1473 	movdqa	%xmm5,0x20(%rcx)
   1474 
   1475 	lea	0x30(%rcx),%rcx
   1476 	jge	L(mov3dqa2)
   1477 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1478 	cmp	$0x10,%r8
   1479 	jl	L(movdqa_epi)
   1480 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1481 	sub	$0x10,%r8
   1482 	lea	0x10(%rdx),%rdx
   1483 	movdqa	%xmm3,%xmm2		# save for use next concat
   1484 	#palignr	$0x2,%xmm1,%xmm3
   1485 	.byte	0x66,0x0f,0x3a,0x0f
   1486 	.byte	0xd9,0x02
   1487 
   1488 	cmp	$0x10,%r8
   1489 	movdqa	%xmm3,(%rcx)      	# store it
   1490 	lea	0x10(%rcx),%rcx
   1491 	jl	L(movdqa_epi)
   1492 
   1493 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1494 	sub	$0x10,%r8
   1495 	lea	0x10(%rdx),%rdx
   1496 	#palignr	$0x2,%xmm2,%xmm0
   1497 	.byte	0x66,0x0f,0x3a,0x0f
   1498 	.byte	0xc2,0x02
   1499 	movdqa	%xmm0,(%rcx)      	# store it
   1500 	lea	0x10(%rcx),%rcx
   1501 	jmp	L(movdqa_epi)
   1502 
   1503 	.balign 16
	/*
	 * SSSE3 copy loop for shift 3.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 3..15 of the previous aligned
	 * source block followed by bytes 0..2 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1504 L(mov3dqa3):
   1505 	movdqa	0x10(%rdx),%xmm3
   1506 	sub	$0x30,%r8
   1507 	movdqa	0x20(%rdx),%xmm0
   1508 	movdqa	0x30(%rdx),%xmm5
   1509 	lea	0x30(%rdx),%rdx
   1510 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1511 
   1512 	movdqa	%xmm3,%xmm2
   1513 	#palignr	$0x3,%xmm1,%xmm3
   1514 	.byte	0x66,0x0f,0x3a,0x0f
   1515 	.byte	0xd9,0x03
   1516 	movdqa	%xmm3,(%rcx)
   1517 
   1518 	movdqa	%xmm0,%xmm4
   1519 	#palignr	$0x3,%xmm2,%xmm0
   1520 	.byte	0x66,0x0f,0x3a,0x0f
   1521 	.byte	0xc2,0x03
   1522 	movdqa	%xmm0,0x10(%rcx)
   1523 
   1524 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1525 	#palignr	$0x3,%xmm4,%xmm5
   1526 	.byte	0x66,0x0f,0x3a,0x0f
   1527 	.byte	0xec,0x03
   1528 	movdqa	%xmm5,0x20(%rcx)
   1529 
   1530 	lea	0x30(%rcx),%rcx
   1531 	jge	L(mov3dqa3)
   1532 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1533 	cmp	$0x10,%r8
   1534 	jl	L(movdqa_epi)
   1535 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1536 	sub	$0x10,%r8
   1537 	lea	0x10(%rdx),%rdx
   1538 	movdqa	%xmm3,%xmm2		# save for use next concat
   1539 	#palignr	$0x3,%xmm1,%xmm3
   1540 	.byte	0x66,0x0f,0x3a,0x0f
   1541 	.byte	0xd9,0x03
   1542 
   1543 	cmp	$0x10,%r8
   1544 	movdqa	%xmm3,(%rcx)      	# store it
   1545 	lea	0x10(%rcx),%rcx
   1546 	jl	L(movdqa_epi)
   1547 
   1548 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1549 	sub	$0x10,%r8
   1550 	lea	0x10(%rdx),%rdx
   1551 	#palignr	$0x3,%xmm2,%xmm0
   1552 	.byte	0x66,0x0f,0x3a,0x0f
   1553 	.byte	0xc2,0x03
   1554 	movdqa	%xmm0,(%rcx)      	# store it
   1555 	lea	0x10(%rcx),%rcx
   1556 	jmp	L(movdqa_epi)
   1557 
   1558 	.balign 16
	/*
	 * SSSE3 copy loop for shift 4.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 4..15 of the previous aligned
	 * source block followed by bytes 0..3 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1559 L(mov3dqa4):
   1560 	movdqa	0x10(%rdx),%xmm3
   1561 	sub	$0x30,%r8
   1562 	movdqa	0x20(%rdx),%xmm0
   1563 	movdqa	0x30(%rdx),%xmm5
   1564 	lea	0x30(%rdx),%rdx
   1565 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1566 
   1567 	movdqa	%xmm3,%xmm2
   1568 	#palignr	$0x4,%xmm1,%xmm3
   1569 	.byte	0x66,0x0f,0x3a,0x0f
   1570 	.byte	0xd9,0x04
   1571 	movdqa	%xmm3,(%rcx)
   1572 
   1573 	movdqa	%xmm0,%xmm4
   1574 	#palignr	$0x4,%xmm2,%xmm0
   1575 	.byte	0x66,0x0f,0x3a,0x0f
   1576 	.byte	0xc2,0x04
   1577 	movdqa	%xmm0,0x10(%rcx)
   1578 
   1579 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1580 	#palignr	$0x4,%xmm4,%xmm5
   1581 	.byte	0x66,0x0f,0x3a,0x0f
   1582 	.byte	0xec,0x04
   1583 	movdqa	%xmm5,0x20(%rcx)
   1584 
   1585 	lea	0x30(%rcx),%rcx
   1586 	jge	L(mov3dqa4)
   1587 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1588 	cmp	$0x10,%r8
   1589 	jl	L(movdqa_epi)
   1590 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1591 	sub	$0x10,%r8
   1592 	lea	0x10(%rdx),%rdx
   1593 	movdqa	%xmm3,%xmm2		# save for use next concat
   1594 	#palignr	$0x4,%xmm1,%xmm3
   1595 	.byte	0x66,0x0f,0x3a,0x0f
   1596 	.byte	0xd9,0x04
   1597 
   1598 	cmp	$0x10,%r8
   1599 	movdqa	%xmm3,(%rcx)      	# store it
   1600 	lea	0x10(%rcx),%rcx
   1601 	jl	L(movdqa_epi)
   1602 
   1603 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1604 	sub	$0x10,%r8
   1605 	lea	0x10(%rdx),%rdx
   1606 	#palignr	$0x4,%xmm2,%xmm0
   1607 	.byte	0x66,0x0f,0x3a,0x0f
   1608 	.byte	0xc2,0x04
   1609 	movdqa	%xmm0,(%rcx)      	# store it
   1610 	lea	0x10(%rcx),%rcx
   1611 	jmp	L(movdqa_epi)
   1612 
   1613 	.balign 16
	/*
	 * SSSE3 copy loop for shift 5.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 5..15 of the previous aligned
	 * source block followed by bytes 0..4 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1614 L(mov3dqa5):
   1615 	movdqa	0x10(%rdx),%xmm3
   1616 	sub	$0x30,%r8
   1617 	movdqa	0x20(%rdx),%xmm0
   1618 	movdqa	0x30(%rdx),%xmm5
   1619 	lea	0x30(%rdx),%rdx
   1620 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1621 
   1622 	movdqa	%xmm3,%xmm2
   1623 	#palignr	$0x5,%xmm1,%xmm3
   1624 	.byte	0x66,0x0f,0x3a,0x0f
   1625 	.byte	0xd9,0x05
   1626 	movdqa	%xmm3,(%rcx)
   1627 
   1628 	movdqa	%xmm0,%xmm4
   1629 	#palignr	$0x5,%xmm2,%xmm0
   1630 	.byte	0x66,0x0f,0x3a,0x0f
   1631 	.byte	0xc2,0x05
   1632 	movdqa	%xmm0,0x10(%rcx)
   1633 
   1634 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1635 	#palignr	$0x5,%xmm4,%xmm5
   1636 	.byte	0x66,0x0f,0x3a,0x0f
   1637 	.byte	0xec,0x05
   1638 	movdqa	%xmm5,0x20(%rcx)
   1639 
   1640 	lea	0x30(%rcx),%rcx
   1641 	jge	L(mov3dqa5)
   1642 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1643 	cmp	$0x10,%r8
   1644 	jl	L(movdqa_epi)
   1645 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1646 	sub	$0x10,%r8
   1647 	lea	0x10(%rdx),%rdx
   1648 	movdqa	%xmm3,%xmm2		# save for use next concat
   1649 	#palignr	$0x5,%xmm1,%xmm3
   1650 	.byte	0x66,0x0f,0x3a,0x0f
   1651 	.byte	0xd9,0x05
   1652 
   1653 	cmp	$0x10,%r8
   1654 	movdqa	%xmm3,(%rcx)      	# store it
   1655 	lea	0x10(%rcx),%rcx
   1656 	jl	L(movdqa_epi)
   1657 
   1658 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1659 	sub	$0x10,%r8
   1660 	lea	0x10(%rdx),%rdx
   1661 	#palignr	$0x5,%xmm2,%xmm0
   1662 	.byte	0x66,0x0f,0x3a,0x0f
   1663 	.byte	0xc2,0x05
   1664 	movdqa	%xmm0,(%rcx)      	# store it
   1665 	lea	0x10(%rcx),%rcx
   1666 	jmp	L(movdqa_epi)
   1667 
   1668 	.balign 16
	/*
	 * SSSE3 copy loop for shift 6.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 6..15 of the previous aligned
	 * source block followed by bytes 0..5 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1669 L(mov3dqa6):
   1670 	movdqa	0x10(%rdx),%xmm3
   1671 	sub	$0x30,%r8
   1672 	movdqa	0x20(%rdx),%xmm0
   1673 	movdqa	0x30(%rdx),%xmm5
   1674 	lea	0x30(%rdx),%rdx
   1675 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1676 
   1677 	movdqa	%xmm3,%xmm2
   1678 	#palignr	$0x6,%xmm1,%xmm3
   1679 	.byte	0x66,0x0f,0x3a,0x0f
   1680 	.byte	0xd9,0x06
   1681 	movdqa	%xmm3,(%rcx)
   1682 
   1683 	movdqa	%xmm0,%xmm4
   1684 	#palignr	$0x6,%xmm2,%xmm0
   1685 	.byte	0x66,0x0f,0x3a,0x0f
   1686 	.byte	0xc2,0x06
   1687 	movdqa	%xmm0,0x10(%rcx)
   1688 
   1689 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1690 	#palignr	$0x6,%xmm4,%xmm5
   1691 	.byte	0x66,0x0f,0x3a,0x0f
   1692 	.byte	0xec,0x06
   1693 	movdqa	%xmm5,0x20(%rcx)
   1694 
   1695 	lea	0x30(%rcx),%rcx
   1696 	jge	L(mov3dqa6)
   1697 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1698 	cmp	$0x10,%r8
   1699 	jl	L(movdqa_epi)
   1700 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1701 	sub	$0x10,%r8
   1702 	lea	0x10(%rdx),%rdx
   1703 	movdqa	%xmm3,%xmm2		# save for use next concat
   1704 	#palignr	$0x6,%xmm1,%xmm3
   1705 	.byte	0x66,0x0f,0x3a,0x0f
   1706 	.byte	0xd9,0x06
   1707 
   1708 	cmp	$0x10,%r8
   1709 	movdqa	%xmm3,(%rcx)      	# store it
   1710 	lea	0x10(%rcx),%rcx
   1711 	jl	L(movdqa_epi)
   1712 
   1713 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1714 	sub	$0x10,%r8
   1715 	lea	0x10(%rdx),%rdx
   1716 	#palignr	$0x6,%xmm2,%xmm0
   1717 	.byte	0x66,0x0f,0x3a,0x0f
   1718 	.byte	0xc2,0x06
   1719 	movdqa	%xmm0,(%rcx)      	# store it
   1720 	lea	0x10(%rcx),%rcx
   1721 	jmp	L(movdqa_epi)
   1722 
   1723 	.balign 16
	/*
	 * SSSE3 copy loop for shift 7.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 7..15 of the previous aligned
	 * source block followed by bytes 0..6 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1724 L(mov3dqa7):
   1725 	movdqa	0x10(%rdx),%xmm3
   1726 	sub	$0x30,%r8
   1727 	movdqa	0x20(%rdx),%xmm0
   1728 	movdqa	0x30(%rdx),%xmm5
   1729 	lea	0x30(%rdx),%rdx
   1730 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1731 
   1732 	movdqa	%xmm3,%xmm2
   1733 	#palignr	$0x7,%xmm1,%xmm3
   1734 	.byte	0x66,0x0f,0x3a,0x0f
   1735 	.byte	0xd9,0x07
   1736 	movdqa	%xmm3,(%rcx)
   1737 
   1738 	movdqa	%xmm0,%xmm4
   1739 	#palignr	$0x7,%xmm2,%xmm0
   1740 	.byte	0x66,0x0f,0x3a,0x0f
   1741 	.byte	0xc2,0x07
   1742 	movdqa	%xmm0,0x10(%rcx)
   1743 
   1744 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1745 	#palignr	$0x7,%xmm4,%xmm5
   1746 	.byte	0x66,0x0f,0x3a,0x0f
   1747 	.byte	0xec,0x07
   1748 	movdqa	%xmm5,0x20(%rcx)
   1749 
   1750 	lea	0x30(%rcx),%rcx
   1751 	jge	L(mov3dqa7)
   1752 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1753 	cmp	$0x10,%r8
   1754 	jl	L(movdqa_epi)
   1755 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1756 	sub	$0x10,%r8
   1757 	lea	0x10(%rdx),%rdx
   1758 	movdqa	%xmm3,%xmm2		# save for use next concat
   1759 	#palignr	$0x7,%xmm1,%xmm3
   1760 	.byte	0x66,0x0f,0x3a,0x0f
   1761 	.byte	0xd9,0x07
   1762 
   1763 	cmp	$0x10,%r8
   1764 	movdqa	%xmm3,(%rcx)      	# store it
   1765 	lea	0x10(%rcx),%rcx
   1766 	jl	L(movdqa_epi)
   1767 
   1768 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1769 	sub	$0x10,%r8
   1770 	lea	0x10(%rdx),%rdx
   1771 	#palignr	$0x7,%xmm2,%xmm0
   1772 	.byte	0x66,0x0f,0x3a,0x0f
   1773 	.byte	0xc2,0x07
   1774 	movdqa	%xmm0,(%rcx)      	# store it
   1775 	lea	0x10(%rcx),%rcx
   1776 	jmp	L(movdqa_epi)
   1777 
   1778 	.balign 16
	/*
	 * SSSE3 copy loop for shift 9.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 9..15 of the previous aligned
	 * source block followed by bytes 0..8 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1779 L(mov3dqa9):
   1780 	movdqa	0x10(%rdx),%xmm3
   1781 	sub	$0x30,%r8
   1782 	movdqa	0x20(%rdx),%xmm0
   1783 	movdqa	0x30(%rdx),%xmm5
   1784 	lea	0x30(%rdx),%rdx
   1785 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1786 
   1787 	movdqa	%xmm3,%xmm2
   1788 	#palignr	$0x9,%xmm1,%xmm3
   1789 	.byte	0x66,0x0f,0x3a,0x0f
   1790 	.byte	0xd9,0x09
   1791 	movdqa	%xmm3,(%rcx)
   1792 
   1793 	movdqa	%xmm0,%xmm4
   1794 	#palignr	$0x9,%xmm2,%xmm0
   1795 	.byte	0x66,0x0f,0x3a,0x0f
   1796 	.byte	0xc2,0x09
   1797 	movdqa	%xmm0,0x10(%rcx)
   1798 
   1799 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1800 	#palignr	$0x9,%xmm4,%xmm5
   1801 	.byte	0x66,0x0f,0x3a,0x0f
   1802 	.byte	0xec,0x09
   1803 	movdqa	%xmm5,0x20(%rcx)
   1804 
   1805 	lea	0x30(%rcx),%rcx
   1806 	jge	L(mov3dqa9)
   1807 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1808 	cmp	$0x10,%r8
   1809 	jl	L(movdqa_epi)
   1810 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1811 	sub	$0x10,%r8
   1812 	lea	0x10(%rdx),%rdx
   1813 	movdqa	%xmm3,%xmm2		# save for use next concat
   1814 	#palignr	$0x9,%xmm1,%xmm3
   1815 	.byte	0x66,0x0f,0x3a,0x0f
   1816 	.byte	0xd9,0x09
   1817 
   1818 	cmp	$0x10,%r8
   1819 	movdqa	%xmm3,(%rcx)      	# store it
   1820 	lea	0x10(%rcx),%rcx
   1821 	jl	L(movdqa_epi)
   1822 
   1823 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1824 	sub	$0x10,%r8
   1825 	lea	0x10(%rdx),%rdx
   1826 	#palignr	$0x9,%xmm2,%xmm0
   1827 	.byte	0x66,0x0f,0x3a,0x0f
   1828 	.byte	0xc2,0x09
   1829 	movdqa	%xmm0,(%rcx)      	# store it
   1830 	lea	0x10(%rcx),%rcx
   1831 	jmp	L(movdqa_epi)
   1832 
   1833 	.balign 16
	/*
	 * SSSE3 copy loop for shift 10.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 10..15 of the previous aligned
	 * source block followed by bytes 0..9 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1834 L(mov3dqa10):
   1835 	movdqa	0x10(%rdx),%xmm3
   1836 	sub	$0x30,%r8
   1837 	movdqa	0x20(%rdx),%xmm0
   1838 	movdqa	0x30(%rdx),%xmm5
   1839 	lea	0x30(%rdx),%rdx
   1840 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1841 
   1842 	movdqa	%xmm3,%xmm2
   1843 	#palignr	$0xa,%xmm1,%xmm3
   1844 	.byte	0x66,0x0f,0x3a,0x0f
   1845 	.byte	0xd9,0x0a
   1846 	movdqa	%xmm3,(%rcx)
   1847 
   1848 	movdqa	%xmm0,%xmm4
   1849 	#palignr	$0xa,%xmm2,%xmm0
   1850 	.byte	0x66,0x0f,0x3a,0x0f
   1851 	.byte	0xc2,0x0a
   1852 	movdqa	%xmm0,0x10(%rcx)
   1853 
   1854 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1855 	#palignr	$0xa,%xmm4,%xmm5
   1856 	.byte	0x66,0x0f,0x3a,0x0f
   1857 	.byte	0xec,0x0a
   1858 	movdqa	%xmm5,0x20(%rcx)
   1859 
   1860 	lea	0x30(%rcx),%rcx
   1861 	jge	L(mov3dqa10)
   1862 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1863 	cmp	$0x10,%r8
   1864 	jl	L(movdqa_epi)
   1865 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1866 	sub	$0x10,%r8
   1867 	lea	0x10(%rdx),%rdx
   1868 	movdqa	%xmm3,%xmm2		# save for use next concat
   1869 	#palignr	$0xa,%xmm1,%xmm3
   1870 	.byte	0x66,0x0f,0x3a,0x0f
   1871 	.byte	0xd9,0x0a
   1872 
   1873 	cmp	$0x10,%r8
   1874 	movdqa	%xmm3,(%rcx)      	# store it
   1875 	lea	0x10(%rcx),%rcx
   1876 	jl	L(movdqa_epi)
   1877 
   1878 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1879 	sub	$0x10,%r8
   1880 	lea	0x10(%rdx),%rdx
   1881 	#palignr	$0xa,%xmm2,%xmm0
   1882 	.byte	0x66,0x0f,0x3a,0x0f
   1883 	.byte	0xc2,0x0a
   1884 	movdqa	%xmm0,(%rcx)      	# store it
   1885 	lea	0x10(%rcx),%rcx
   1886 	jmp	L(movdqa_epi)
   1887 
   1888 	.balign 16
	/*
	 * SSSE3 copy loop for shift 11.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 11..15 of the previous aligned
	 * source block followed by bytes 0..10 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1889 L(mov3dqa11):
   1890 	movdqa	0x10(%rdx),%xmm3
   1891 	sub	$0x30,%r8
   1892 	movdqa	0x20(%rdx),%xmm0
   1893 	movdqa	0x30(%rdx),%xmm5
   1894 	lea	0x30(%rdx),%rdx
   1895 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1896 
   1897 	movdqa	%xmm3,%xmm2
   1898 	#palignr	$0xb,%xmm1,%xmm3
   1899 	.byte	0x66,0x0f,0x3a,0x0f
   1900 	.byte	0xd9,0x0b
   1901 	movdqa	%xmm3,(%rcx)
   1902 
   1903 	movdqa	%xmm0,%xmm4
   1904 	#palignr	$0xb,%xmm2,%xmm0
   1905 	.byte	0x66,0x0f,0x3a,0x0f
   1906 	.byte	0xc2,0x0b
   1907 	movdqa	%xmm0,0x10(%rcx)
   1908 
   1909 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1910 	#palignr	$0xb,%xmm4,%xmm5
   1911 	.byte	0x66,0x0f,0x3a,0x0f
   1912 	.byte	0xec,0x0b
   1913 	movdqa	%xmm5,0x20(%rcx)
   1914 
   1915 	lea	0x30(%rcx),%rcx
   1916 	jge	L(mov3dqa11)
   1917 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1918 	cmp	$0x10,%r8
   1919 	jl	L(movdqa_epi)
   1920 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1921 	sub	$0x10,%r8
   1922 	lea	0x10(%rdx),%rdx
   1923 	movdqa	%xmm3,%xmm2		# save for use next concat
   1924 	#palignr	$0xb,%xmm1,%xmm3
   1925 	.byte	0x66,0x0f,0x3a,0x0f
   1926 	.byte	0xd9,0x0b
   1927 
   1928 	cmp	$0x10,%r8
   1929 	movdqa	%xmm3,(%rcx)      	# store it
   1930 	lea	0x10(%rcx),%rcx
   1931 	jl	L(movdqa_epi)
   1932 
   1933 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1934 	sub	$0x10,%r8
   1935 	lea	0x10(%rdx),%rdx
   1936 	#palignr	$0xb,%xmm2,%xmm0
   1937 	.byte	0x66,0x0f,0x3a,0x0f
   1938 	.byte	0xc2,0x0b
   1939 	movdqa	%xmm0,(%rcx)      	# store it
   1940 	lea	0x10(%rcx),%rcx
   1941 	jmp	L(movdqa_epi)
   1942 
   1943 	.balign 16
	/*
	 * SSSE3 copy loop for shift 12.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 12..15 of the previous aligned
	 * source block followed by bytes 0..11 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1944 L(mov3dqa12):
   1945 	movdqa	0x10(%rdx),%xmm3
   1946 	sub	$0x30,%r8
   1947 	movdqa	0x20(%rdx),%xmm0
   1948 	movdqa	0x30(%rdx),%xmm5
   1949 	lea	0x30(%rdx),%rdx
   1950 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   1951 
   1952 	movdqa	%xmm3,%xmm2
   1953 	#palignr	$0xc,%xmm1,%xmm3
   1954 	.byte	0x66,0x0f,0x3a,0x0f
   1955 	.byte	0xd9,0x0c
   1956 	movdqa	%xmm3,(%rcx)
   1957 
   1958 	movdqa	%xmm0,%xmm4
   1959 	#palignr	$0xc,%xmm2,%xmm0
   1960 	.byte	0x66,0x0f,0x3a,0x0f
   1961 	.byte	0xc2,0x0c
   1962 	movdqa	%xmm0,0x10(%rcx)
   1963 
   1964 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   1965 	#palignr	$0xc,%xmm4,%xmm5
   1966 	.byte	0x66,0x0f,0x3a,0x0f
   1967 	.byte	0xec,0x0c
   1968 	movdqa	%xmm5,0x20(%rcx)
   1969 
   1970 	lea	0x30(%rcx),%rcx
   1971 	jge	L(mov3dqa12)
   1972 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   1973 	cmp	$0x10,%r8
   1974 	jl	L(movdqa_epi)
   1975 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   1976 	sub	$0x10,%r8
   1977 	lea	0x10(%rdx),%rdx
   1978 	movdqa	%xmm3,%xmm2		# save for use next concat
   1979 	#palignr	$0xc,%xmm1,%xmm3
   1980 	.byte	0x66,0x0f,0x3a,0x0f
   1981 	.byte	0xd9,0x0c
   1982 
   1983 	cmp	$0x10,%r8
   1984 	movdqa	%xmm3,(%rcx)      	# store it
   1985 	lea	0x10(%rcx),%rcx
   1986 	jl	L(movdqa_epi)
   1987 
   1988 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   1989 	sub	$0x10,%r8
   1990 	lea	0x10(%rdx),%rdx
   1991 	#palignr	$0xc,%xmm2,%xmm0
   1992 	.byte	0x66,0x0f,0x3a,0x0f
   1993 	.byte	0xc2,0x0c
   1994 	movdqa	%xmm0,(%rcx)      	# store it
   1995 	lea	0x10(%rcx),%rcx
   1996 	jmp	L(movdqa_epi)
   1997 
   1998 	.balign 16
	/*
	 * SSSE3 copy loop for shift 13.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 13..15 of the previous aligned
	 * source block followed by bytes 0..12 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   1999 L(mov3dqa13):
   2000 	movdqa	0x10(%rdx),%xmm3
   2001 	sub	$0x30,%r8
   2002 	movdqa	0x20(%rdx),%xmm0
   2003 	movdqa	0x30(%rdx),%xmm5
   2004 	lea	0x30(%rdx),%rdx
   2005 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   2006 
   2007 	movdqa	%xmm3,%xmm2
   2008 	#palignr	$0xd,%xmm1,%xmm3
   2009 	.byte	0x66,0x0f,0x3a,0x0f
   2010 	.byte	0xd9,0x0d
   2011 	movdqa	%xmm3,(%rcx)
   2012 
   2013 	movdqa	%xmm0,%xmm4
   2014 	#palignr	$0xd,%xmm2,%xmm0
   2015 	.byte	0x66,0x0f,0x3a,0x0f
   2016 	.byte	0xc2,0x0d
   2017 	movdqa	%xmm0,0x10(%rcx)
   2018 
   2019 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   2020 	#palignr	$0xd,%xmm4,%xmm5
   2021 	.byte	0x66,0x0f,0x3a,0x0f
   2022 	.byte	0xec,0x0d
   2023 	movdqa	%xmm5,0x20(%rcx)
   2024 
   2025 	lea	0x30(%rcx),%rcx
   2026 	jge	L(mov3dqa13)
   2027 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   2028 	cmp	$0x10,%r8
   2029 	jl	L(movdqa_epi)
   2030 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   2031 	sub	$0x10,%r8
   2032 	lea	0x10(%rdx),%rdx
   2033 	movdqa	%xmm3,%xmm2		# save for use next concat
   2034 	#palignr	$0xd,%xmm1,%xmm3
   2035 	.byte	0x66,0x0f,0x3a,0x0f
   2036 	.byte	0xd9,0x0d
   2037 
   2038 	cmp	$0x10,%r8
   2039 	movdqa	%xmm3,(%rcx)      	# store it
   2040 	lea	0x10(%rcx),%rcx
   2041 	jl	L(movdqa_epi)
   2042 
   2043 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   2044 	sub	$0x10,%r8
   2045 	lea	0x10(%rdx),%rdx
   2046 	#palignr	$0xd,%xmm2,%xmm0
   2047 	.byte	0x66,0x0f,0x3a,0x0f
   2048 	.byte	0xc2,0x0d
   2049 	movdqa	%xmm0,(%rcx)      	# store it
   2050 	lea	0x10(%rcx),%rcx
   2051 	jmp	L(movdqa_epi)
   2052 
   2053 	.balign 16
	/*
	 * SSSE3 copy loop for shift 14.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is bytes 14..15 of the previous aligned
	 * source block followed by bytes 0..13 of the next one.  %xmm1 holds
	 * the previous block on entry and is refreshed for the next pass.
	 * Each pass moves 0x30 bytes and repeats while %r8 (bytes left) is
	 * >= 0x30 (signed).  Afterwards up to two more 16-byte blocks are
	 * copied before the byte tail is handed to L(movdqa_epi).
	 */
   2054 L(mov3dqa14):
   2055 	movdqa	0x10(%rdx),%xmm3
   2056 	sub	$0x30,%r8
   2057 	movdqa	0x20(%rdx),%xmm0
   2058 	movdqa	0x30(%rdx),%xmm5
   2059 	lea	0x30(%rdx),%rdx
   2060 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   2061 
   2062 	movdqa	%xmm3,%xmm2
   2063 	#palignr	$0xe,%xmm1,%xmm3
   2064 	.byte	0x66,0x0f,0x3a,0x0f
   2065 	.byte	0xd9,0x0e
   2066 	movdqa	%xmm3,(%rcx)
   2067 
   2068 	movdqa	%xmm0,%xmm4
   2069 	#palignr	$0xe,%xmm2,%xmm0
   2070 	.byte	0x66,0x0f,0x3a,0x0f
   2071 	.byte	0xc2,0x0e
   2072 	movdqa	%xmm0,0x10(%rcx)
   2073 
   2074 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   2075 	#palignr	$0xe,%xmm4,%xmm5
   2076 	.byte	0x66,0x0f,0x3a,0x0f
   2077 	.byte	0xec,0x0e
   2078 	movdqa	%xmm5,0x20(%rcx)
   2079 
   2080 	lea	0x30(%rcx),%rcx
   2081 	jge	L(mov3dqa14)
   2082 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   2083 	cmp	$0x10,%r8
   2084 	jl	L(movdqa_epi)
   2085 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   2086 	sub	$0x10,%r8
   2087 	lea	0x10(%rdx),%rdx
   2088 	movdqa	%xmm3,%xmm2		# save for use next concat
   2089 	#palignr	$0xe,%xmm1,%xmm3
   2090 	.byte	0x66,0x0f,0x3a,0x0f
   2091 	.byte	0xd9,0x0e
   2092 
   2093 	cmp	$0x10,%r8
   2094 	movdqa	%xmm3,(%rcx)      	# store it
   2095 	lea	0x10(%rcx),%rcx
   2096 	jl	L(movdqa_epi)
   2097 
   2098 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   2099 	sub	$0x10,%r8
   2100 	lea	0x10(%rdx),%rdx
   2101 	#palignr	$0xe,%xmm2,%xmm0
   2102 	.byte	0x66,0x0f,0x3a,0x0f
   2103 	.byte	0xc2,0x0e
   2104 	movdqa	%xmm0,(%rcx)      	# store it
   2105 	lea	0x10(%rcx),%rcx
   2106 	jmp	L(movdqa_epi)
   2107 
   2108 	.balign 16
	/*
	 * SSSE3 copy loop for shift 15.  palignr is emitted as raw .byte
	 * encodings; the intended mnemonic is kept in the comment above each
	 * one.  Every 16-byte store is byte 15 of the previous aligned source
	 * block followed by bytes 0..14 of the next one.  %xmm1 holds the
	 * previous block on entry and is refreshed for the next pass.  Each
	 * pass moves 0x30 bytes and repeats while %r8 (bytes left) is >= 0x30
	 * (signed).  Afterwards up to two more 16-byte blocks are copied
	 * before the byte tail is handed to L(movdqa_epi).
	 */
   2109 L(mov3dqa15):
   2110 	movdqa	0x10(%rdx),%xmm3
   2111 	sub	$0x30,%r8
   2112 	movdqa	0x20(%rdx),%xmm0
   2113 	movdqa	0x30(%rdx),%xmm5
   2114 	lea	0x30(%rdx),%rdx
   2115 	cmp	$0x30,%r8	# flags are consumed by the jge at the loop bottom
   2116 
   2117 	movdqa	%xmm3,%xmm2
   2118 	#palignr	$0xf,%xmm1,%xmm3
   2119 	.byte	0x66,0x0f,0x3a,0x0f
   2120 	.byte	0xd9,0x0f
   2121 	movdqa	%xmm3,(%rcx)
   2122 
   2123 	movdqa	%xmm0,%xmm4
   2124 	#palignr	$0xf,%xmm2,%xmm0
   2125 	.byte	0x66,0x0f,0x3a,0x0f
   2126 	.byte	0xc2,0x0f
   2127 	movdqa	%xmm0,0x10(%rcx)
   2128 
   2129 	movdqa	%xmm5,%xmm1	# carried into the next pass as the previous block
   2130 	#palignr	$0xf,%xmm4,%xmm5
   2131 	.byte	0x66,0x0f,0x3a,0x0f
   2132 	.byte	0xec,0x0f
   2133 	movdqa	%xmm5,0x20(%rcx)
   2134 
   2135 	lea	0x30(%rcx),%rcx
   2136 	jge	L(mov3dqa15)
   2137 
	/* Fewer than 0x30 bytes left: copy up to two more whole blocks. */
   2138 	cmp	$0x10,%r8
   2139 	jl	L(movdqa_epi)
   2140 	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
   2141 	sub	$0x10,%r8
   2142 	lea	0x10(%rdx),%rdx
   2143 	movdqa	%xmm3,%xmm2		# save for use next concat
   2144 	#palignr	$0xf,%xmm1,%xmm3
   2145 	.byte	0x66,0x0f,0x3a,0x0f
   2146 	.byte	0xd9,0x0f
   2147 
   2148 	cmp	$0x10,%r8
   2149 	movdqa	%xmm3,(%rcx)      	# store it
   2150 	lea	0x10(%rcx),%rcx
   2151 	jl	L(movdqa_epi)
   2152 
   2153 	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
   2154 	sub	$0x10,%r8
   2155 	lea	0x10(%rdx),%rdx
   2156 	#palignr	$0xf,%xmm2,%xmm0
   2157 	.byte	0x66,0x0f,0x3a,0x0f
   2158 	.byte	0xc2,0x0f
   2159 	movdqa	%xmm0,(%rcx)      	# store it
   2160 	lea	0x10(%rcx),%rcx
   2161 	jmp	L(movdqa_epi)
   2162 
   2163 	.balign 16
	/*
	 * Forward copy loop using non-temporal stores.  Each pass moves 0x40
	 * bytes: four unaligned 16-byte loads (movdqu) paired with four
	 * movntdq stores.  Both pointers are advanced at the top of the pass,
	 * so the data is addressed with negative offsets.  The loop repeats
	 * while %r8 (bytes left) is >= 0x40 (signed).
	 *
	 * On exit the remaining count is split into whole 16-byte chunks
	 * (%r9) and a sub-16 residue (left in %r8); the pointers are moved
	 * past the whole chunks and control goes through L(Fix16EndTable),
	 * indexed by the chunk count.
	 */
   2164 L(sse2_nt_move):
   2165 	lea	0x40(%rcx),%rcx
   2166 	lea	0x40(%rdx),%rdx
   2167 	lea	-0x40(%r8),%r8
   2168 
   2169 	/*
   2170 	 * doesn't matter if source is aligned for stuff out of cache.
   2171 	 * the mis-aligned penalty is masked by the slowness of main memory.
   2172 	 */
   2173 	prefetchnta 0x180(%rdx)
   2174 	movdqu	-0x40(%rdx),%xmm0
   2175 	movdqu	-0x30(%rdx),%xmm1
   2176 
   2177 	cmp	$0x40,%r8	# flags are consumed by the jge at the loop bottom
   2178 	movntdq	%xmm0,-0x40(%rcx)
   2179 	movntdq	%xmm1,-0x30(%rcx)
   2180 
   2181 	movdqu	-0x20(%rdx),%xmm2
   2182 	movdqu	-0x10(%rdx),%xmm3
   2183 
   2184 	movntdq	%xmm2,-0x20(%rcx)
   2185 	movntdq	%xmm3,-0x10(%rcx)
   2186 
   2187 	jge	L(sse2_nt_move)
   2188 
   2189 	lea	L(Fix16EndTable)(%rip),%r10
   2190 	mov	%r8,%r9
   2191 	and	$0xFFFFFFFFFFFFFFF0,%r9	# r9 = bytes left, rounded down to 16
   2192 	add	%r9,%rcx
   2193 	add	%r9,%rdx
   2194 	sub	%r9,%r8		# r8 = residue below 16 bytes
   2195 	shr	$0x4,%r9	# r9 = number of whole 16-byte chunks
   2196 	sfence			# order the non-temporal stores issued above
   2197 
	/*
	 * The table has four entries, so %r9 must be 0..3 here; that holds
	 * when %r8 was in 0..0x3f at loop exit.
	 * NOTE(review): relies on %r8 never going negative - confirm at the
	 * entry to this loop.
	 */
   2198 	movslq	(%r10,%r9,4),%r11
   2199 	lea	(%r11,%r10,1),%r10
   2200 	jmpq	*%r10
   2201 
   2202 	.balign 16
        	/*
        	 * L(Fix16EndTable): jump table with one 32-bit entry per number of
        	 * whole 16-byte chunks still to copy (0 through 3).  Each entry is
        	 * the distance from the start of the table to the matching
        	 * L(fix16_N) label.
        	 */
   2203 L(Fix16EndTable):
   2204 	.int    L(fix16_0)-L(Fix16EndTable)
   2205 	.int    L(fix16_1)-L(Fix16EndTable)
   2206 	.int    L(fix16_2)-L(Fix16EndTable)
   2207 	.int    L(fix16_3)-L(Fix16EndTable)
   2208 
   2209 	.balign 16
        	/*
        	 * L(fix16_3) .. L(fix16_0): fall-through chain.  Entering at
        	 * L(fix16_N) copies the N 16-byte chunks that lie just below
        	 * %rdx / %rcx (unaligned load, aligned store), then L(fix16_0)
        	 * deals with the remaining %r8 bytes.
        	 */
   2210 L(fix16_3):
   2211 	movdqu -0x30(%rdx),%xmm1
   2212 	movdqa %xmm1,-0x30(%rcx)
   2213 L(fix16_2):
   2214 	movdqu -0x20(%rdx),%xmm2
   2215 	movdqa %xmm2,-0x20(%rcx)
   2216 L(fix16_1):
   2217 	movdqu -0x10(%rdx),%xmm3
   2218 	movdqa %xmm3,-0x10(%rcx)
   2219 L(fix16_0):
        	/*
        	 * Advance both pointers by the leftover count and dispatch through
        	 * L(fwdPxQx), indexed by %r8.  The table is defined elsewhere in
        	 * this file; presumably its targets copy the %r8 bytes that end at
        	 * the adjusted pointers.
        	 */
   2220 	lea    L(fwdPxQx)(%rip),%r10
   2221 	add    %r8,%rdx
   2222 	add    %r8,%rcx
   2223 
   2224 	movslq (%r10,%r8,4),%r9
   2225 	lea    (%r9,%r10,1),%r10
   2226 	jmpq   *%r10
   2227 
   2228 	.balign 16
        	/*
        	 * L(pre_both_aligned): entry for the case where every access can use
        	 * movdqa, i.e. %rdx and %rcx are both 16-byte aligned.  With fewer
        	 * than 0x80 bytes the loop is skipped; otherwise execution falls
        	 * through into L(both_aligned).  %r8 is the running byte count.
        	 */
   2229 L(pre_both_aligned):
   2230 	cmp    $0x80,%r8
   2231 	jl     L(fix_16b)
   2232 
   2233 	.balign 16
   2234 L(both_aligned):
   2235 
   2236 	/*
   2237 	 * this 'paired' load/load/store/store seems to do best.
   2238 	 */
   2239 	movdqa (%rdx),%xmm0
   2240 	movdqa 0x10(%rdx),%xmm1
   2241 
   2242 	movdqa %xmm0,(%rcx)
   2243 	movdqa %xmm1,0x10(%rcx)
   2244 	lea    -0x80(%r8),%r8		# 0x80 bytes are consumed per pass
   2245 
   2246 	movdqa 0x20(%rdx),%xmm2
   2247 	movdqa 0x30(%rdx),%xmm3
   2248 
   2249 	movdqa %xmm2,0x20(%rcx)
   2250 	movdqa %xmm3,0x30(%rcx)
   2251 
   2252 	movdqa 0x40(%rdx),%xmm0
   2253 	movdqa 0x50(%rdx),%xmm1
   2254 	cmp    $0x80,%r8		# result is consumed by the jge at the loop bottom
   2255 
   2256 	movdqa %xmm0,0x40(%rcx)
   2257 	movdqa %xmm1,0x50(%rcx)
   2258 
   2259 	movdqa 0x60(%rdx),%xmm2
   2260 	movdqa 0x70(%rdx),%xmm3
   2261 	lea    0x80(%rdx),%rdx		# pointers are advanced with lea so the flags survive
   2262 	movdqa %xmm2,0x60(%rcx)
   2263 	movdqa %xmm3,0x70(%rcx)
   2264 	lea    0x80(%rcx),%rcx
   2265 	jge    L(both_aligned)
   2266 
        	/*
        	 * L(fix_16b): fewer than 0x80 bytes remain.  Advance both pointers
        	 * by the leftover count and dispatch through L(fwdPxQx), indexed by
        	 * %r8 (table defined elsewhere in this file).
        	 */
   2267 L(fix_16b):
   2268 	add    %r8,%rcx
   2269 	lea    L(fwdPxQx)(%rip),%r10
   2270 	add    %r8,%rdx
   2271 
   2272 	movslq (%r10,%r8,4),%r9
   2273 	lea    (%r9,%r10,1),%r10
   2274 	jmpq   *%r10
   2275 
   2276 	.balign 16
        	/*
        	 * L(Loop8byte_pre): choose one of the 8-byte copy strategies from
        	 * the byte count in %r8:
        	 *   %r8 >= half of .largest_level_cache_size	-> L(byte8_nt_top)
        	 *   %r8 <= 4096				-> L(byte8_top)
        	 *   %r8 <= .amd64cache1half			-> L(use_rep)
        	 *   anything else falls through into L(byte8_top)
        	 *
        	 * NOTE(review): the compares use the signed condition codes
        	 * (jge/jle), so %r8 is treated as a signed quantity here.
        	 */
   2277 L(Loop8byte_pre):
   2278 	# Use 8-byte moves
   2279 	mov    .largest_level_cache_size(%rip),%r9d
   2280 	shr    %r9		# take half of it
   2281 	cmp    %r9,%r8
   2282 	jge    L(byte8_nt_top)
   2283 	# Find out whether to use rep movsq
   2284 	cmp    $4096,%r8
   2285 	jle    L(byte8_top)
   2286 	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
   2287 	cmp    %r9,%r8
   2288 	jle    L(use_rep)
   2289 
   2290 	.balign     16
        	/*
        	 * L(byte8_top): forward copy loop made of eight 8-byte moves
        	 * (0x40 bytes per pass) through the scratch registers %r9, %r10
        	 * and %r11.  %rdx = source, %rcx = destination, %r8 = running
        	 * byte count.
        	 *
        	 * The loop repeats only while MORE than 0x40 bytes remain (jg), so
        	 * it can leave as many as 0x40 bytes for L(byte8_end).
        	 */
   2291 L(byte8_top):
   2292 	mov    (%rdx),%r9
   2293 	mov    0x8(%rdx),%r10
   2294 	lea    -0x40(%r8),%r8
   2295 	mov    %r9,(%rcx)
   2296 	mov    %r10,0x8(%rcx)
   2297 	mov    0x10(%rdx),%r11
   2298 	mov    0x18(%rdx),%r9
   2299 	mov    %r11,0x10(%rcx)
   2300 	mov    %r9,0x18(%rcx)
   2301 
   2302 	cmp    $0x40,%r8		# result is consumed by the jg at the loop bottom
   2303 	mov    0x20(%rdx),%r10
   2304 	mov    0x28(%rdx),%r11
   2305 	mov    %r10,0x20(%rcx)
   2306 	mov    %r11,0x28(%rcx)
   2307 	mov    0x30(%rdx),%r9
   2308 	mov    0x38(%rdx),%r10
   2309 	lea    0x40(%rdx),%rdx		# mov and lea leave the flags untouched
   2310 	mov    %r9,0x30(%rcx)
   2311 	mov    %r10,0x38(%rcx)
   2312 	lea    0x40(%rcx),%rcx
   2313 	jg     L(byte8_top)
   2314 
        	/*
        	 * L(byte8_end): advance both pointers by the leftover count and
        	 * dispatch through L(fwdPxQx), indexed by %r8 (table defined
        	 * elsewhere in this file).
        	 */
   2315 L(byte8_end):
   2316 	lea    L(fwdPxQx)(%rip),%r10
   2317 	lea    (%rdx,%r8,1),%rdx
   2318 	lea    (%rcx,%r8,1),%rcx
   2319 
   2320 	movslq (%r10,%r8,4),%r9
   2321 	lea    (%r9,%r10,1),%r10
   2322 	jmpq   *%r10
   2323 
   2324 	.balign	16
        	/*
        	 * L(use_rep): forward copy with 'rep movsq'.  The pointers are moved
        	 * into the registers the string instruction uses (%rsi, %rdi, %rcx)
        	 * and moved back afterwards so that L(byte8_end) finds them in
        	 * %rdx / %rcx again.  %r8 / 8 quadwords are copied; the low three
        	 * bits of %r8 are the bytes left for L(byte8_end).
        	 */
   2325 L(use_rep):
   2326 	mov    %rdx,%rsi		# %rsi = source
   2327 	mov    %rcx,%rdi		# %rdi = destination
   2328 	mov    %r8,%rcx			# %rcx = count
   2329 	shrq   $3,%rcx			# 8-byte word count
   2330 	rep
   2331 	  movsq
   2332 	mov    %rsi,%rdx		# source
   2333 	mov    %rdi,%rcx		# destination
   2334 	andq   $7,%r8			# remainder
   2335 	jnz    L(byte8_end)
   2336 	ret
   2337 
   2338 	.balign 16
        	/*
        	 * L(byte8_nt_top): forward copy loop, 0x40 bytes per pass, using
        	 * ordinary 8-byte loads and non-temporal 8-byte stores (movnti).
        	 * The source is prefetched 0x180 bytes ahead with prefetchnta.
        	 * %rdx = source, %rcx = destination, %r8 = running byte count.
        	 * The loop repeats while at least 0x40 bytes remain; an sfence is
        	 * then issued and the leftover goes to L(byte8_end).
        	 */
   2339 L(byte8_nt_top):
   2340 	sub    $0x40,%r8
   2341 	prefetchnta 0x180(%rdx)
   2342 	mov    (%rdx),%r9
   2343 	movnti %r9,(%rcx)
   2344 	mov    0x8(%rdx),%r10
   2345 	movnti %r10,0x8(%rcx)
   2346 	mov    0x10(%rdx),%r11
   2347 	movnti %r11,0x10(%rcx)
   2348 	mov    0x18(%rdx),%r9
   2349 	movnti %r9,0x18(%rcx)
   2350 	mov    0x20(%rdx),%r10
   2351 	movnti %r10,0x20(%rcx)
   2352 	mov    0x28(%rdx),%r11
   2353 	movnti %r11,0x28(%rcx)
   2354 	mov    0x30(%rdx),%r9
   2355 	movnti %r9,0x30(%rcx)
   2356 	mov    0x38(%rdx),%r10
   2357 	movnti %r10,0x38(%rcx)
   2358 
   2359 	lea    0x40(%rdx),%rdx
   2360 	lea    0x40(%rcx),%rcx
   2361 	cmp    $0x40,%r8
   2362 	jge    L(byte8_nt_top)
   2363 	sfence				# order the non-temporal stores ahead of the stores that follow
   2364 	jmp    L(byte8_end)
   2365 
   2366 	SET_SIZE(memcpy)
   2367 
   2368 	.balign 16
        	/*
        	 * L(CopyBackwards): copy from the highest address downwards.
        	 * On entry %rdi, %rsi and %rdx are read as destination, source and
        	 * length.  They are moved to the registers the rest of the file
        	 * works with: %rcx = destination, %rdx = source, %r8 = length.
        	 * Both pointers are then moved to one past the last byte.
        	 */
   2369 L(CopyBackwards):
   2370 	mov    %rdx,%r8
   2371 	mov    %rdi,%rcx
   2372 	mov    %rsi,%rdx
   2373 	mov    %rdi,%rax		# return value
   2374 
   2375 	# ck alignment of last byte
   2376 	lea    (%rcx,%r8,1),%rcx
   2377 	test   $0x7,%rcx
   2378 	lea    (%rdx,%r8,1),%rdx	# lea keeps the flags set by the test
   2379 	jne    L(bk_align)
   2380 
        	/*
        	 * L(bk_qw_aligned): more than 0x90 bytes go to the SSE2 / rep
        	 * selection code.  Anything up to 0x90 bytes is finished here:
        	 * both pointers are moved back to the first byte and the copy is
        	 * dispatched through L(bkPxQx), indexed by the length in %r8
        	 * (table defined elsewhere in this file).
        	 */
   2381 L(bk_qw_aligned):
   2382 	lea    L(bkPxQx)(%rip),%r10
   2383 
   2384 	cmp    $0x90,%r8		# 144
   2385 	jg     L(bk_ck_sse2_alignment)
   2386 
   2387 	sub    %r8,%rcx
   2388 	sub    %r8,%rdx
   2389 
   2390 	movslq (%r10,%r8,4),%r9
   2391 	lea    (%r9,%r10,1),%r10
   2392 	jmpq   *%r10
   2393 
   2394 	.balign 16
        	/*
        	 * L(bk_align): bring the destination end pointer (%rcx) down to an
        	 * 8-byte boundary by copying 1, 2 and/or 4 bytes from the top of
        	 * the buffers, as selected by the low three bits of %rcx.  %rdx
        	 * (source end) and %r8 (length) are adjusted in step.  Lengths of
        	 * 8 or less skip the alignment entirely.
        	 */
   2395 L(bk_align):
   2396 	# only align if len > 8
   2397 	cmp    $8,%r8
   2398 	jle    L(bk_qw_aligned)
   2399 	test   $0x1,%rcx
   2400 	je     L(bk_tst2)
   2401 	dec    %rcx
   2402 	dec    %rdx
   2403 	dec    %r8
   2404 	mov    (%rdx),%r9b		# move one byte
   2405 	mov    %r9b,(%rcx)
   2406 
   2407 L(bk_tst2):
   2408 	test   $0x2,%rcx
   2409 	je     L(bk_tst3)
   2410 
   2411 L(bk_got2):
   2412 	sub    $0x2,%rcx
   2413 	sub    $0x2,%rdx
   2414 	sub    $0x2,%r8
   2415 	movzwq (%rdx),%r9		# move two bytes
   2416 	mov    %r9w,(%rcx)
   2417 
   2418 L(bk_tst3):
   2419 	test   $0x4,%rcx
   2420 	je     L(bk_qw_aligned)
   2421 
   2422 L(bk_got3):
   2423 	sub    $0x4,%rcx
   2424 	sub    $0x4,%rdx
   2425 	sub    $0x4,%r8
   2426 	mov    (%rdx),%r9d		# move four bytes
   2427 	mov    %r9d,(%rcx)
   2428 	jmp    L(bk_qw_aligned)
   2429 
   2430 	.balign 16
        	/*
        	 * L(bk_ck_sse2_alignment): backward copy of more than 0x90 bytes.
        	 * When .memops_method equals NO_SSE the copy is done by
        	 * L(bk_use_rep).  Otherwise the destination end pointer (%rcx) is
        	 * brought to a 16-byte boundary, if it is not on one already, and
        	 * the SSE2 loop at L(bk_sse2_cpy) runs.
        	 */
   2431 L(bk_ck_sse2_alignment):
   2432 	cmpl   $NO_SSE,.memops_method(%rip)
   2433 	je     L(bk_use_rep)
   2434 	# check alignment of last byte
   2435 	test   $0xf,%rcx
   2436 	jz     L(bk_sse2_cpy)
   2437 
   2438 L(bk_sse2_align):
   2439 	# only here if already aligned on at least a qword bndry
   2440 	sub    $0x8,%rcx
   2441 	sub    $0x8,%rdx
   2442 	sub    $0x8,%r8
   2443 	mov    (%rdx),%r9		# one quadword completes the 16-byte alignment
   2444 	mov    %r9,(%rcx)
   2445 	#jmp   L(bk_sse2_cpy)
   2446 
   2447 	.balign 16
        	/*
        	 * L(bk_sse2_cpy): 0x80 bytes per pass, highest chunk first.
        	 * Loads are unaligned (movdqu), stores are aligned (movdqa).
        	 * The loop repeats while at least 0x80 bytes remain.
        	 */
   2448 L(bk_sse2_cpy):
   2449 	sub    $0x80,%rcx		# 128
   2450 	sub    $0x80,%rdx
   2451 	movdqu 0x70(%rdx),%xmm3
   2452 	movdqu 0x60(%rdx),%xmm2
   2453 	movdqa %xmm3,0x70(%rcx)
   2454 	movdqa %xmm2,0x60(%rcx)
   2455 	sub    $0x80,%r8
   2456 	movdqu 0x50(%rdx),%xmm1
   2457 	movdqu 0x40(%rdx),%xmm0
   2458 	movdqa %xmm1,0x50(%rcx)
   2459 	movdqa %xmm0,0x40(%rcx)
   2460 
   2461 	cmp    $0x80,%r8		# result is consumed by the jge at the loop bottom
   2462 	movdqu 0x30(%rdx),%xmm3
   2463 	movdqu 0x20(%rdx),%xmm2
   2464 	movdqa %xmm3,0x30(%rcx)
   2465 	movdqa %xmm2,0x20(%rcx)
   2466 	movdqu 0x10(%rdx),%xmm1
   2467 	movdqu (%rdx),%xmm0
   2468 	movdqa %xmm1,0x10(%rcx)
   2469 	movdqa %xmm0,(%rcx)
   2470 	jge    L(bk_sse2_cpy)
   2471 
        	/*
        	 * L(bk_sse2_cpy_end): move both pointers back to the first byte
        	 * and dispatch through L(bkPxQx), indexed by the leftover length
        	 * in %r8 (table defined elsewhere in this file).
        	 */
   2472 L(bk_sse2_cpy_end):
   2473 	lea    L(bkPxQx)(%rip),%r10
   2474 	sub    %r8,%rdx
   2475 	sub    %r8,%rcx
   2476 	movslq (%r10,%r8,4),%r9
   2477 	lea    (%r9,%r10,1),%r10
   2478 	jmpq   *%r10
   2479 
   2480 	.balign 16
        	/*
        	 * L(bk_use_rep): backward copy with 'rep movsq' and the direction
        	 * flag set.  On entry %rcx / %rdx point one past the last byte of
        	 * destination / source and %r8 is the length.  %r8 / 8 quadwords
        	 * are copied starting with the last quadword of each buffer; the
        	 * low three bits of %r8 are the bytes left at the start of the
        	 * buffers, which are finished through L(bkPxQx).
        	 */
   2481 L(bk_use_rep):
   2482 	xchg   %rcx,%r9			# %r9 = destination end; %rcx is reloaded below
   2483 	mov    %rdx,%rsi		# source
   2484 	mov    %r9,%rdi			# destination
   2485 	mov    %r8,%rcx			# count
   2486 	sub    $8,%rsi			# start at the last quadword of each buffer
   2487 	sub    $8,%rdi
   2488 	shr    $3,%rcx
   2489 	std				# reverse direction
   2490 	rep
   2491 	  movsq
   2492 	cld				# reset direction flag
   2493 
   2494 	xchg   %rcx,%r9			# %rcx = destination end again
   2495 	lea    L(bkPxQx)(%rip),%r10
   2496 	sub    %r8,%rdx			# back to the first byte of the source
   2497 	sub    %r8,%rcx			# back to the first byte of the destination
   2498 	andq   $7,%r8			# remainder
   2499 	jz     2f
   2500 	movslq (%r10,%r8,4),%r9
   2501 	lea    (%r9,%r10,1),%r10
   2502 	jmpq   *%r10
   2503 2:
   2504 	ret
   2505 
   2506 	.balign 16
        	/*
        	 * L(bkP0QI) .. L(bkP0Q0): fall-through chain of 8-byte moves for
        	 * the end of a backward copy.  %rdx and %rcx point at the first
        	 * byte of source and destination.  Entering at L(bkP0Qn)
        	 * (n = 0-9, A-I for 10-18) copies n quadwords, highest offset
        	 * first, and returns: 8*n bytes in total.
        	 */
   2507 L(bkP0QI):
   2508 	mov    0x88(%rdx),%r10
   2509 	mov    %r10,0x88(%rcx)
   2510 L(bkP0QH):
   2511 	mov    0x80(%rdx),%r10
   2512 	mov    %r10,0x80(%rcx)
   2513 L(bkP0QG):
   2514 	mov    0x78(%rdx),%r9
   2515 	mov    %r9,0x78(%rcx)
   2516 L(bkP0QF):
   2517 	mov    0x70(%rdx),%r11
   2518 	mov    %r11,0x70(%rcx)
   2519 L(bkP0QE):
   2520 	mov    0x68(%rdx),%r10
   2521 	mov    %r10,0x68(%rcx)
   2522 L(bkP0QD):
   2523 	mov    0x60(%rdx),%r9
   2524 	mov    %r9,0x60(%rcx)
   2525 L(bkP0QC):
   2526 	mov    0x58(%rdx),%r11
   2527 	mov    %r11,0x58(%rcx)
   2528 L(bkP0QB):
   2529 	mov    0x50(%rdx),%r10
   2530 	mov    %r10,0x50(%rcx)
   2531 L(bkP0QA):
   2532 	mov    0x48(%rdx),%r9
   2533 	mov    %r9,0x48(%rcx)
   2534 L(bkP0Q9):
   2535 	mov    0x40(%rdx),%r11
   2536 	mov    %r11,0x40(%rcx)
   2537 L(bkP0Q8):
   2538 	mov    0x38(%rdx),%r10
   2539 	mov    %r10,0x38(%rcx)
   2540 L(bkP0Q7):
   2541 	mov    0x30(%rdx),%r9
   2542 	mov    %r9,0x30(%rcx)
   2543 L(bkP0Q6):
   2544 	mov    0x28(%rdx),%r11
   2545 	mov    %r11,0x28(%rcx)
   2546 L(bkP0Q5):
   2547 	mov    0x20(%rdx),%r10
   2548 	mov    %r10,0x20(%rcx)
   2549 L(bkP0Q4):
   2550 	mov    0x18(%rdx),%r9
   2551 	mov    %r9,0x18(%rcx)
   2552 L(bkP0Q3):
   2553 	mov    0x10(%rdx),%r11
   2554 	mov    %r11,0x10(%rcx)
   2555 L(bkP0Q2):
   2556 	mov    0x8(%rdx),%r10
   2557 	mov    %r10,0x8(%rcx)
   2558 L(bkP0Q1):
   2559 	mov    (%rdx),%r9
   2560 	mov    %r9,(%rcx)
   2561 L(bkP0Q0):
   2562 	ret
   2563 
   2564 	.balign 16
        	/*
        	 * L(bkP1QI) .. L(bkP1Q0): fall-through chain for the end of a
        	 * backward copy whose length is 1 more than a multiple of 8.
        	 * %rdx and %rcx point at the first byte of source and destination.
        	 * Entering at L(bkP1Qn) (n = 0-9, A-I for 10-18) copies n
        	 * quadwords, highest offset first, then the single byte at offset
        	 * 0, and returns: 8*n + 1 bytes in total.
        	 */
   2565 L(bkP1QI):
   2566 	mov    0x89(%rdx),%r10
   2567 	mov    %r10,0x89(%rcx)
   2568 L(bkP1QH):
   2569 	mov    0x81(%rdx),%r11
   2570 	mov    %r11,0x81(%rcx)
   2571 L(bkP1QG):
   2572 	mov    0x79(%rdx),%r10
   2573 	mov    %r10,0x79(%rcx)
   2574 L(bkP1QF):
   2575 	mov    0x71(%rdx),%r9
   2576 	mov    %r9,0x71(%rcx)
   2577 L(bkP1QE):
   2578 	mov    0x69(%rdx),%r11
   2579 	mov    %r11,0x69(%rcx)
   2580 L(bkP1QD):
   2581 	mov    0x61(%rdx),%r10
   2582 	mov    %r10,0x61(%rcx)
   2583 L(bkP1QC):
   2584 	mov    0x59(%rdx),%r9
   2585 	mov    %r9,0x59(%rcx)
   2586 L(bkP1QB):
   2587 	mov    0x51(%rdx),%r11
   2588 	mov    %r11,0x51(%rcx)
   2589 L(bkP1QA):
   2590 	mov    0x49(%rdx),%r10
   2591 	mov    %r10,0x49(%rcx)
   2592 L(bkP1Q9):
   2593 	mov    0x41(%rdx),%r9
   2594 	mov    %r9,0x41(%rcx)
   2595 L(bkP1Q8):
   2596 	mov    0x39(%rdx),%r11
   2597 	mov    %r11,0x39(%rcx)
   2598 L(bkP1Q7):
   2599 	mov    0x31(%rdx),%r10
   2600 	mov    %r10,0x31(%rcx)
   2601 L(bkP1Q6):
   2602 	mov    0x29(%rdx),%r9
   2603 	mov    %r9,0x29(%rcx)
   2604 L(bkP1Q5):
   2605 	mov    0x21(%rdx),%r11
   2606 	mov    %r11,0x21(%rcx)
   2607 L(bkP1Q4):
   2608 	mov    0x19(%rdx),%r10
   2609 	mov    %r10,0x19(%rcx)
   2610 L(bkP1Q3):
   2611 	mov    0x11(%rdx),%r9
   2612 	mov    %r9,0x11(%rcx)
   2613 L(bkP1Q2):
   2614 	mov    0x9(%rdx),%r11
   2615 	mov    %r11,0x9(%rcx)
   2616 L(bkP1Q1):
   2617 	mov    0x1(%rdx),%r10
   2618 	mov    %r10,0x1(%rcx)
   2619 L(bkP1Q0):
   2620 	mov    (%rdx),%r9b		# the one odd byte at offset 0
   2621 	mov    %r9b,(%rcx)
   2622 	ret
   2623 
   2624 	.balign 16
        	/*
        	 * L(bkP2QI) .. L(bkP2Q0): fall-through chain for the end of a
        	 * backward copy whose length is 2 more than a multiple of 8.
        	 * %rdx and %rcx point at the first byte of source and destination.
        	 * Entering at L(bkP2Qn) (n = 0-9, A-I for 10-18) copies n
        	 * quadwords, highest offset first, then the 2-byte word at offset
        	 * 0, and returns: 8*n + 2 bytes in total.
        	 */
   2625 L(bkP2QI):
   2626 	mov    0x8a(%rdx),%r10
   2627 	mov    %r10,0x8a(%rcx)
   2628 L(bkP2QH):
   2629 	mov    0x82(%rdx),%r11
   2630 	mov    %r11,0x82(%rcx)
   2631 L(bkP2QG):
   2632 	mov    0x7a(%rdx),%r10
   2633 	mov    %r10,0x7a(%rcx)
   2634 L(bkP2QF):
   2635 	mov    0x72(%rdx),%r9
   2636 	mov    %r9,0x72(%rcx)
   2637 L(bkP2QE):
   2638 	mov    0x6a(%rdx),%r11
   2639 	mov    %r11,0x6a(%rcx)
   2640 L(bkP2QD):
   2641 	mov    0x62(%rdx),%r10
   2642 	mov    %r10,0x62(%rcx)
   2643 L(bkP2QC):
   2644 	mov    0x5a(%rdx),%r9
   2645 	mov    %r9,0x5a(%rcx)
   2646 L(bkP2QB):
   2647 	mov    0x52(%rdx),%r11
   2648 	mov    %r11,0x52(%rcx)
   2649 L(bkP2QA):
   2650 	mov    0x4a(%rdx),%r10
   2651 	mov    %r10,0x4a(%rcx)
   2652 L(bkP2Q9):
   2653 	mov    0x42(%rdx),%r9
   2654 	mov    %r9,0x42(%rcx)
   2655 L(bkP2Q8):
   2656 	mov    0x3a(%rdx),%r11
   2657 	mov    %r11,0x3a(%rcx)
   2658 L(bkP2Q7):
   2659 	mov    0x32(%rdx),%r10
   2660 	mov    %r10,0x32(%rcx)
   2661 L(bkP2Q6):
   2662 	mov    0x2a(%rdx),%r9
   2663 	mov    %r9,0x2a(%rcx)
   2664 L(bkP2Q5):
   2665 	mov    0x22(%rdx),%r11
   2666 	mov    %r11,0x22(%rcx)
   2667 L(bkP2Q4):
   2668 	mov    0x1a(%rdx),%r10
   2669 	mov    %r10,0x1a(%rcx)
   2670 L(bkP2Q3):
   2671 	mov    0x12(%rdx),%r9
   2672 	mov    %r9,0x12(%rcx)
   2673 L(bkP2Q2):
   2674 	mov    0xa(%rdx),%r11
   2675 	mov    %r11,0xa(%rcx)
   2676 L(bkP2Q1):
   2677 	mov    0x2(%rdx),%r10
   2678 	mov    %r10,0x2(%rcx)
   2679 L(bkP2Q0):
   2680 	mov    (%rdx),%r9w		# the two odd bytes at offset 0
   2681 	mov    %r9w,(%rcx)
   2682 	ret
   2683 
   2684 	.balign 16
        	/*
        	 * L(bkP3QI) .. L(bkP3Q0): fall-through chain for the end of a
        	 * backward copy whose length is 3 more than a multiple of 8.
        	 * %rdx and %rcx point at the first byte of source and destination.
        	 * Entering at L(bkP3Qn) (n = 0-9, A-I for 10-18) copies n
        	 * quadwords, highest offset first, then the three bytes at offsets
        	 * 0-2, and returns: 8*n + 3 bytes in total.
        	 */
   2685 L(bkP3QI):
   2686 	mov    0x8b(%rdx),%r10
   2687 	mov    %r10,0x8b(%rcx)
   2688 L(bkP3QH):
   2689 	mov    0x83(%rdx),%r11
   2690 	mov    %r11,0x83(%rcx)
   2691 L(bkP3QG):
   2692 	mov    0x7b(%rdx),%r10
   2693 	mov    %r10,0x7b(%rcx)
   2694 L(bkP3QF):
   2695 	mov    0x73(%rdx),%r9
   2696 	mov    %r9,0x73(%rcx)
   2697 L(bkP3QE):
   2698 	mov    0x6b(%rdx),%r11
   2699 	mov    %r11,0x6b(%rcx)
   2700 L(bkP3QD):
   2701 	mov    0x63(%rdx),%r10
   2702 	mov    %r10,0x63(%rcx)
   2703 L(bkP3QC):
   2704 	mov    0x5b(%rdx),%r9
   2705 	mov    %r9,0x5b(%rcx)
   2706 L(bkP3QB):
   2707 	mov    0x53(%rdx),%r11
   2708 	mov    %r11,0x53(%rcx)
   2709 L(bkP3QA):
   2710 	mov    0x4b(%rdx),%r10
   2711 	mov    %r10,0x4b(%rcx)
   2712 L(bkP3Q9):
   2713 	mov    0x43(%rdx),%r9
   2714 	mov    %r9,0x43(%rcx)
   2715 L(bkP3Q8):
   2716 	mov    0x3b(%rdx),%r11
   2717 	mov    %r11,0x3b(%rcx)
   2718 L(bkP3Q7):
   2719 	mov    0x33(%rdx),%r10
   2720 	mov    %r10,0x33(%rcx)
   2721 L(bkP3Q6):
   2722 	mov    0x2b(%rdx),%r9
   2723 	mov    %r9,0x2b(%rcx)
   2724 L(bkP3Q5):
   2725 	mov    0x23(%rdx),%r11
   2726 	mov    %r11,0x23(%rcx)
   2727 L(bkP3Q4):
   2728 	mov    0x1b(%rdx),%r10
   2729 	mov    %r10,0x1b(%rcx)
   2730 L(bkP3Q3):
   2731 	mov    0x13(%rdx),%r9
   2732 	mov    %r9,0x13(%rcx)
   2733 L(bkP3Q2):
   2734 	mov    0xb(%rdx),%r11
   2735 	mov    %r11,0xb(%rcx)
   2736 L(bkP3Q1):
   2737 	mov    0x3(%rdx),%r10
   2738 	mov    %r10,0x3(%rcx)
   2739 L(bkP3Q0): # last 3 bytes: word at offset 1, then byte at offset 0; each load is followed directly by its store
   2740 	mov    0x1(%rdx),%r9w
   2741 	mov    %r9w,0x1(%rcx)
   2742 	mov    (%rdx),%r10b
   2743 	mov    %r10b,(%rcx)
   2744 	ret
   2745 
   2746 	.balign 16
        	/*
        	 * L(bkP4QI) .. L(bkP4Q0): fall-through chain for the end of a
        	 * backward copy whose length is 4 more than a multiple of 8.
        	 * %rdx and %rcx point at the first byte of source and destination.
        	 * Entering at L(bkP4Qn) (n = 0-9, A-I for 10-18) copies n
        	 * quadwords, highest offset first, then the 4-byte word at offset
        	 * 0, and returns: 8*n + 4 bytes in total.
        	 */
   2747 L(bkP4QI):
   2748 	mov    0x8c(%rdx),%r10
   2749 	mov    %r10,0x8c(%rcx)
   2750 L(bkP4QH):
   2751 	mov    0x84(%rdx),%r11
   2752 	mov    %r11,0x84(%rcx)
   2753 L(bkP4QG):
   2754 	mov    0x7c(%rdx),%r10
   2755 	mov    %r10,0x7c(%rcx)
   2756 L(bkP4QF):
   2757 	mov    0x74(%rdx),%r9
   2758 	mov    %r9,0x74(%rcx)
   2759 L(bkP4QE):
   2760 	mov    0x6c(%rdx),%r11
   2761 	mov    %r11,0x6c(%rcx)
   2762 L(bkP4QD):
   2763 	mov    0x64(%rdx),%r10
   2764 	mov    %r10,0x64(%rcx)
   2765 L(bkP4QC):
   2766 	mov    0x5c(%rdx),%r9
   2767 	mov    %r9,0x5c(%rcx)
   2768 L(bkP4QB):
   2769 	mov    0x54(%rdx),%r11
   2770 	mov    %r11,0x54(%rcx)
   2771 L(bkP4QA):
   2772 	mov    0x4c(%rdx),%r10
   2773 	mov    %r10,0x4c(%rcx)
   2774 L(bkP4Q9):
   2775 	mov    0x44(%rdx),%r9
   2776 	mov    %r9,0x44(%rcx)
   2777 L(bkP4Q8):
   2778 	mov    0x3c(%rdx),%r11
   2779 	mov    %r11,0x3c(%rcx)
   2780 L(bkP4Q7):
   2781 	mov    0x34(%rdx),%r10
   2782 	mov    %r10,0x34(%rcx)
   2783 L(bkP4Q6):
   2784 	mov    0x2c(%rdx),%r9
   2785 	mov    %r9,0x2c(%rcx)
   2786 L(bkP4Q5):
   2787 	mov    0x24(%rdx),%r11
   2788 	mov    %r11,0x24(%rcx)
   2789 L(bkP4Q4):
   2790 	mov    0x1c(%rdx),%r10
   2791 	mov    %r10,0x1c(%rcx)
   2792 L(bkP4Q3):
   2793 	mov    0x14(%rdx),%r9
   2794 	mov    %r9,0x14(%rcx)
   2795 L(bkP4Q2):
   2796 	mov    0xc(%rdx),%r11
   2797 	mov    %r11,0xc(%rcx)
   2798 L(bkP4Q1):
   2799 	mov    0x4(%rdx),%r10
   2800 	mov    %r10,0x4(%rcx)
   2801 L(bkP4Q0):
   2802 	mov    (%rdx),%r9d		# the four odd bytes at offset 0
   2803 	mov    %r9d,(%rcx)
   2804 	ret
   2805 
   2806 	.balign 16
        	/*
        	 * L(bkP5QI) .. L(bkP5Q0): fall-through chain for the end of a
        	 * backward copy whose length is 5 more than a multiple of 8.
        	 * %rdx and %rcx point at the first byte of source and destination.
        	 * Entering at L(bkP5Qn) (n = 0-9, A-I for 10-18) copies n
        	 * quadwords, highest offset first, then the five bytes at offsets
        	 * 0-4, and returns: 8*n + 5 bytes in total.
        	 */
   2807 L(bkP5QI):
   2808 	mov    0x8d(%rdx),%r10
   2809 	mov    %r10,0x8d(%rcx)
   2810 L(bkP5QH):
   2811 	mov    0x85(%rdx),%r9
   2812 	mov    %r9,0x85(%rcx)
   2813 L(bkP5QG):
   2814 	mov    0x7d(%rdx),%r11
   2815 	mov    %r11,0x7d(%rcx)
   2816 L(bkP5QF):
   2817 	mov    0x75(%rdx),%r10
   2818 	mov    %r10,0x75(%rcx)
   2819 L(bkP5QE):
   2820 	mov    0x6d(%rdx),%r9
   2821 	mov    %r9,0x6d(%rcx)
   2822 L(bkP5QD):
   2823 	mov    0x65(%rdx),%r11
   2824 	mov    %r11,0x65(%rcx)
   2825 L(bkP5QC):
   2826 	mov    0x5d(%rdx),%r10
   2827 	mov    %r10,0x5d(%rcx)
   2828 L(bkP5QB):
   2829 	mov    0x55(%rdx),%r9
   2830 	mov    %r9,0x55(%rcx)
   2831 L(bkP5QA):
   2832 	mov    0x4d(%rdx),%r11
   2833 	mov    %r11,0x4d(%rcx)
   2834 L(bkP5Q9):
   2835 	mov    0x45(%rdx),%r10
   2836 	mov    %r10,0x45(%rcx)
   2837 L(bkP5Q8):
   2838 	mov    0x3d(%rdx),%r9
   2839 	mov    %r9,0x3d(%rcx)
   2840 L(bkP5Q7):
   2841 	mov    0x35(%rdx),%r11
   2842 	mov    %r11,0x35(%rcx)
   2843 L(bkP5Q6):
   2844 	mov    0x2d(%rdx),%r10
   2845 	mov    %r10,0x2d(%rcx)
   2846 L(bkP5Q5):
   2847 	mov    0x25(%rdx),%r9
   2848 	mov    %r9,0x25(%rcx)
   2849 L(bkP5Q4):
   2850 	mov    0x1d(%rdx),%r11
   2851 	mov    %r11,0x1d(%rcx)
   2852 L(bkP5Q3):
   2853 	mov    0x15(%rdx),%r10
   2854 	mov    %r10,0x15(%rcx)
   2855 L(bkP5Q2):
   2856 	mov    0xd(%rdx),%r9
   2857 	mov    %r9,0xd(%rcx)
   2858 L(bkP5Q1):
   2859 	mov    0x5(%rdx),%r11
   2860 	mov    %r11,0x5(%rcx)
   2861 L(bkP5Q0): # last 5 bytes: 4-byte word at offset 1, then byte at offset 0; each load is followed directly by its store
   2862 	mov    0x1(%rdx),%r9d
   2863 	mov    %r9d,0x1(%rcx)
   2864 	mov    (%rdx),%r10b
   2865 	mov    %r10b,(%rcx)
   2866 	ret
   2867 
   2868 	.balign 16
        	/*
        	 * L(bkP6QI) .. L(bkP6Q0): fall-through chain for the end of a
        	 * backward copy whose length is 6 more than a multiple of 8.
        	 * %rdx and %rcx point at the first byte of source and destination.
        	 * Entering at L(bkP6Qn) (n = 0-9, A-I for 10-18) copies n
        	 * quadwords, highest offset first, then the six bytes at offsets
        	 * 0-5, and returns: 8*n + 6 bytes in total.
        	 */
   2869 L(bkP6QI):
   2870 	mov    0x8e(%rdx),%r10
   2871 	mov    %r10,0x8e(%rcx)
   2872 L(bkP6QH):
   2873 	mov    0x86(%rdx),%r11
   2874 	mov    %r11,0x86(%rcx)
   2875 L(bkP6QG):
   2876 	mov    0x7e(%rdx),%r10
   2877 	mov    %r10,0x7e(%rcx)
   2878 L(bkP6QF):
   2879 	mov    0x76(%rdx),%r9
   2880 	mov    %r9,0x76(%rcx)
   2881 L(bkP6QE):
   2882 	mov    0x6e(%rdx),%r11
   2883 	mov    %r11,0x6e(%rcx)
   2884 L(bkP6QD):
   2885 	mov    0x66(%rdx),%r10
   2886 	mov    %r10,0x66(%rcx)
   2887 L(bkP6QC):
   2888 	mov    0x5e(%rdx),%r9
   2889 	mov    %r9,0x5e(%rcx)
   2890 L(bkP6QB):
   2891 	mov    0x56(%rdx),%r11
   2892 	mov    %r11,0x56(%rcx)
   2893 L(bkP6QA):
   2894 	mov    0x4e(%rdx),%r10
   2895 	mov    %r10,0x4e(%rcx)
   2896 L(bkP6Q9):
   2897 	mov    0x46(%rdx),%r9
   2898 	mov    %r9,0x46(%rcx)
   2899 L(bkP6Q8):
   2900 	mov    0x3e(%rdx),%r11
   2901 	mov    %r11,0x3e(%rcx)
   2902 L(bkP6Q7):
   2903 	mov    0x36(%rdx),%r10
   2904 	mov    %r10,0x36(%rcx)
   2905 L(bkP6Q6):
   2906 	mov    0x2e(%rdx),%r9
   2907 	mov    %r9,0x2e(%rcx)
   2908 L(bkP6Q5):
   2909 	mov    0x26(%rdx),%r11
   2910 	mov    %r11,0x26(%rcx)
   2911 L(bkP6Q4):
   2912 	mov    0x1e(%rdx),%r10
   2913 	mov    %r10,0x1e(%rcx)
   2914 L(bkP6Q3):
   2915 	mov    0x16(%rdx),%r9
   2916 	mov    %r9,0x16(%rcx)
   2917 L(bkP6Q2):
   2918 	mov    0xe(%rdx),%r11
   2919 	mov    %r11,0xe(%rcx)
   2920 L(bkP6Q1):
   2921 	mov    0x6(%rdx),%r10
   2922 	mov    %r10,0x6(%rcx)
   2923 L(bkP6Q0): # last 6 bytes: 4-byte word at offset 2, then 2-byte word at offset 0; each load is followed directly by its store
   2924 	mov    0x2(%rdx),%r9d
   2925 	mov    %r9d,0x2(%rcx)
   2926 	mov    (%rdx),%r10w
   2927 	mov    %r10w,(%rcx)
   2928 	ret
   2929 
   2930 	.balign 16
   2931 L(bkP7QI):
   2932 	mov    0x8f(%rdx),%r10
   2933 	mov    %r10,0x8f(%rcx)
   2934 L(bkP7QH):
   2935 	mov    0x87(%rdx),%r11
   2936 	mov    %r11,0x87(%rcx)
   2937 L(bkP7QG):
   2938 	mov    0x7f(%rdx),%r10
   2939 	mov    %r10,0x7f(%rcx)
   2940 L(bkP7QF):
   2941 	mov    0x77(%rdx),%r9
   2942 	mov    %r9,0x77(%rcx)
   2943 L(bkP7QE):
   2944 	mov    0x6f(%rdx),%r11
   2945 	mov    %r11,0x6f(%rcx)
   2946 L(bkP7QD):
   2947 	mov    0x67(%rdx),%r10
   2948 	mov    %r10,0x67(%rcx)
   2949 L(bkP7QC):
   2950 	mov    0x5f(%rdx),%r9
   2951 	mov    %r9,0x5f(%rcx)
   2952 L(bkP7QB):
   2953 	mov    0x57(%rdx),%r11
   2954 	mov    %r11,0x57(%rcx)
   2955 L(bkP7QA):
   2956 	mov    0x4f(%rdx),%r10
   2957 	mov    %r10,0x4f(%rcx)
   2958 L(bkP7Q9):
   2959 	mov    0x47(%rdx),%r9
   2960 	mov    %r9,0x47(%rcx)
   2961 L(bkP7Q8):
   2962 	mov    0x3f(%rdx),%r11
   2963 	mov    %r11,0x3f(%rcx)
   2964 L(bkP7Q7):
   2965 	mov    0x37(%rdx),%r10
   2966 	mov    %r10,0x37(%rcx)
   2967 L(bkP7Q6):
   2968 	mov    0x2f(%rdx),%r9
   2969 	mov    %r9,0x2f(%rcx)
   2970 L(bkP7Q5):
   2971 	mov    0x27(%rdx),%r11
   2972 	mov    %r11,0x27(%rcx)
   2973 L(bkP7Q4):
   2974 	mov    0x1f(%rdx),%r10
   2975 	mov    %r10,0x1f(%rcx)
   2976 L(bkP7Q3):
   2977 	mov    0x17(%rdx