Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 	.file	"memcpy.s"
     28 
     29 /*
     30  * memcpy(s1, s2, len)
     31  *
     32  * Copy s2 to s1, always copy n bytes.
     33  * Note: this C code does not work for overlapped copies.
     34  *       Memmove() and bcopy() do.
     35  *
     36  * Fast assembler language version of the following C-program for memcpy
     37  * which represents the `standard' for the C-library.
     38  *
     39  *	void *
     40  *	memcpy(void *s, const void *s0, size_t n)
     41  *	{
     42  *		if (n != 0) {
     43  *	   	    char *s1 = s;
     44  *		    const char *s2 = s0;
     45  *		    do {
     46  *			*s1++ = *s2++;
     47  *		    } while (--n != 0);
     48  *		}
     49  *		return (s);
     50  *	}
     51  */
     52 
     53 #include <sys/asm_linkage.h>
     54 #include <sys/sun4asi.h>
     55 #include <sys/trap.h>
     56 
     57 #define	ICACHE_LINE_SIZE	64	/* alignment used for the memcpy entry and for .large */
     58 #define	BLOCK_SIZE	64	/* bytes per block: unit for prefetch distances and block stores */
     59 #define	FPRS_FEF	0x4	/* mask of the FEF bit in %fprs */
     60 
     61 #define SHORTCOPY	3	/* memcpy: counts <= this go straight to the .smallleft tail */
     62 #define	SMALL_MAX	39	/* memcpy: counts above this take the .medium path */
     63 #define	MEDIUM_MAX	255	/* memcpy: counts above this take the 'large' setup in .mediumrejoin */
     64 #define MED_WMAX	256	/* max copy for medium word-aligned case */
     65 #define MED_MAX		256	/* max copy for medium longword-aligned case */
     66 
        /* BSTORE_SIZE may be predefined by the build; the value below is only the default. */
     67 #ifndef BSTORE_SIZE
     68 #define BSTORE_SIZE	256	/* min copy size for block store */
     69 #endif
     70 
     71 	ANSI_PRAGMA_WEAK(memmove,function)
     72 	ANSI_PRAGMA_WEAK(memcpy,function)
     73 
     74 	ENTRY(memmove)
        /*
         * memmove: copy %o2 bytes from the address in %o1 to the address in
         * %o0 and return the original %o0 value in %o0.
         *
         * Dispatch: control goes to .forcpy (the forward copy inside memcpy)
         * when the source address is >= the destination address (unsigned
         * compare), or when the byte count is <= (dst - src).  Only when
         * neither holds does execution fall through to .ovbc, which copies
         * backwards starting from the end of both buffers.
         *
         * Register use on the backward path:
         *	%o0	dst pointer, one past the next byte to be stored
         *	%o1	src pointer, one past the next byte to be loaded
         *	%o2	bytes remaining (biased inside some loops, see below)
         *	%o3-%o5	scratch; %o3 holds the saved %fprs in the FP paths
         *	%g1	original dst, moved back into %o0 on return
         */
     75 	cmp	%o1, %o0	! if from address is >= to use forward copy
     76 	bgeu	%ncc, .forcpy	! else use backward if ...
     77 	sub	%o0, %o1, %o4	! get difference of two addresses
     78 	cmp	%o2, %o4	! compare size and difference of addresses
     79 	bleu	%ncc, .forcpy	! if size is bigger, do overlapped copy
     80 	nop
     81 
     82         !
     83         ! an overlapped copy that must be done "backwards"
     84         !
     85 .ovbc:
     86 	mov	%o0, %g1		! save dest address for return val
     87 	add     %o1, %o2, %o1           ! get to end of source space
     88         add     %o0, %o2, %o0           ! get to end of destination space
     89 
        /*
         * Size dispatch: 24 bytes or more goes to .dbalign; fewer than 4
         * goes straight to the .byte tail.  The "sub %o2, 3" sits in the
         * delay slot of a non-annulled branch, so it executes on both paths.
         * It biases the count by -3 so that the bgu at the bottom of
         * .byte4loop keeps looping only while at least 4 more bytes remain;
         * .byte adds the 3 back.
         */
     90 	cmp	%o2, 24
     91 	bgeu,pn	%ncc, .dbalign
     92 	nop
     93 	cmp	%o2, 4
     94 	blt,pn	%ncc, .byte
     95 	sub	%o2, 3, %o2
     96 .byte4loop:
     97 	ldub	[%o1-1], %o3		! load last byte
     98 	stb	%o3, [%o0-1]		! store last byte
     99 	sub	%o1, 4, %o1
    100 	ldub	[%o1+2], %o3		! load 2nd from last byte
    101 	stb	%o3, [%o0-2]		! store 2nd from last byte
    102 	sub	%o0, 4, %o0
    103 	ldub	[%o1+1], %o3		! load 3rd from last byte
    104 	stb	%o3, [%o0+1]		! store 3rd from last byte
    105 	subcc	%o2, 4, %o2
    106 	ldub	[%o1], %o3		! load 4th from last byte
    107 	bgu,pt	%ncc, .byte4loop
    108 	stb	%o3, [%o0]		! store 4th from last byte
        /*
         * 0 to 3 bytes remain.  The bz below is not annulled, so the
         * "dec %o1" at .byteloop is its delay slot and also executes on the
         * way to .exit; %o1 is not read after that point.
         */
    109 .byte:
    110 	addcc	%o2, 3, %o2
    111 	bz,pt	%ncc, .exit
    112 .byteloop:
    113 	dec	%o1			! decrement src address
    114 	ldub	[%o1], %o3		! read a byte
    115 	dec	%o0			! decrement dst address
    116 	deccc	%o2			! decrement count
    117 	bgu,pt	%ncc, .byteloop		! loop until done
    118 	stb	%o3, [%o0]		! write byte
    119 .exit:
    120 	retl
    121 	mov	%g1, %o0
    122 
    123 	.align	16
    124 .dbalign:
        /*
         * Backward copy of 24 bytes or more.  First copy (%o0 & 7) single
         * bytes so that the dst end pointer becomes 8-byte aligned.  The
         * count is reduced by that amount in the delay slot of the bz, which
         * executes whether or not the branch is taken.
         */
    125 	andcc   %o0, 7, %o5		! bytes till DST 8 byte aligned
    126 	bz,pt	%ncc, .dbmed
    127 	sub	%o2, %o5, %o2		! update count
    128 .dbalign1:
    129 	dec	%o1			! decrement src address
    130 	ldub	[%o1], %o3		! read a byte
    131 	dec	%o0			! decrement dst address
    132 	deccc	%o5			! decrement count
    133 	bgu,pt	%ncc, .dbalign1		! loop until done
    134 	stb	%o3, [%o0]		! store a byte
    135 
    136 ! check for src long word alignment
    137 .dbmed:
    138 	andcc	%o1, 7, %g0		! chk src long word alignment
    139 	bnz,pn	%ncc, .dbbck
    140 	nop
    141 !
    142 ! Following code is for overlapping copies where src and dest
    143 ! are long word aligned
    144 !
        /*
         * Counts below 4095 skip to the integer loops at .dbmedl32enter.
         * Otherwise 64 bytes are moved per iteration through FP registers
         * %d2/%d4, with prefetches issued below the current src pointer.
         * %fprs is saved in %o3 and FEF is set without testing it first.
         * The count is biased by -63 so that the bgu at the bottom of
         * .dbmedl64 loops only while 64 or more bytes remain.
         */
    145 	cmp	%o2, 4095
    146 	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
    147 	nop
    148 	prefetch [%o1 - (1 * BLOCK_SIZE)], 20	! into the prefetch cache
    149 	sub	%o2, 63, %o2		! adjust length to allow cc test
    150 					! for end of loop
    151 	prefetch [%o1 - (2 * BLOCK_SIZE)], 20	! into the prefetch cache
    152 	rd	%fprs, %o3		! o3 = fprs
    153 	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
    154 	! So set it anyway, without checking.
    155 	prefetch [%o1 - (3 * BLOCK_SIZE)], 20	! into the prefetch cache
    156 	wr      %g0, 0x4, %fprs         ! fprs.fef = 1
    157 	prefetch [%o1 - (4 * BLOCK_SIZE)], 20	! into the prefetch cache
    158 .dbmedl64:
    159 	prefetch [%o1 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
    160 	ldd	[%o1-8], %d4		! load
    161 	subcc	%o2, 64, %o2		! decrement length count
    162 	std	%d4, [%o0-8]		! and store
    163 	ldd	[%o1-16], %d2		! a block of 64 bytes
    164 	sub	%o1, 64, %o1		! decrease src ptr by 64
    165 	std	%d2, [%o0-16]
    166 	sub	%o0, 64, %o0		! decrease dst ptr by 64
    167 	ldd	[%o1+40], %d4
    168 	std	%d4, [%o0+40]
    169 	ldd	[%o1+32], %d2
    170 	std	%d2, [%o0+32]
    171 	ldd	[%o1+24], %d4
    172 	std	%d4, [%o0+24]
    173 	ldd	[%o1+16], %d2
    174 	std	%d2, [%o0+16]
    175 	ldd	[%o1+8], %d4
    176 	std	%d4, [%o0+8]
    177 	ldd	[%o1], %d2
    178 	bgu,pt	%ncc, .dbmedl64		! repeat if at least 64 bytes left
    179 	std	%d2, [%o0]
        /*
         * Remove the -63 bias, then restore %fprs from %o3.  Only the saved
         * FEF bit is kept; the other bits are written back as zero.
         */
    180 	add	%o2, 63, %o2		! restore offset adjustment
    181 	and	%o3, 0x4, %o3           ! fprs.du = fprs.dl = 0
    182 	wr	%o3, %g0, %fprs         ! fprs = o3   restore fprs
    183 .dbmedl32enter:
        /*
         * Integer loop moving 32 bytes per iteration with ldx/stx.  The
         * count is biased by -31 for the loop-end test.
         */
    184 	subcc	%o2, 31, %o2		! adjust length to allow cc test
    185 					! for end of loop
    186 	ble,pt  %ncc, .dbmedl31		! skip big loop if less than 32
    187 	nop
    188 .dbmedl32:
    189 	ldx	[%o1-8], %o4		! load
    190 	subcc	%o2, 32, %o2		! decrement length count
    191 	stx	%o4, [%o0-8]		! and store
    192 	ldx	[%o1-16], %o3		! a block of 32 bytes
    193 	sub	%o1, 32, %o1		! decrease src ptr by 32
    194 	stx	%o3, [%o0-16]
    195 	ldx	[%o1+8], %o4
    196 	sub	%o0, 32, %o0		! decrease dst ptr by 32
    197 	stx	%o4, [%o0+8]
    198 	ldx	[%o1], %o3
    199 	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
    200 	stx	%o3, [%o0]
    201 .dbmedl31:
        /*
         * The count still carries the -31 bias here.  Adding 16 leaves a
         * -15 bias, so the ble skips the 16-byte move when 15 or fewer
         * bytes remain.
         */
    202 	addcc	%o2, 16, %o2		! adjust remaining count
    203 	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
    204 	nop				!
    205 	ldx	[%o1-8], %o4		! load and store 16 bytes
    206 	sub	%o1, 16, %o1		! decrease src ptr by 16
    207 	stx	%o4, [%o0-8]		!
    208 	sub	%o2, 16, %o2		! decrease count by 16
    209 	ldx	[%o1], %o3		!
    210 	sub	%o0, 16, %o0		! decrease dst ptr by 16
    211 	stx	%o3, [%o0]
    212 .dbmedl15:
        /*
         * The addcc removes the remaining bias, so %o2 is the true count
         * again.  One more 8-byte word is moved if possible.  Note that the
         * "bnz ... .dbremain" below is taken when bytes are still left; the
         * fall-through retl is the finished case.
         */
    213 	addcc	%o2, 15, %o2		! restore count
    214 	bz,pt	%ncc, .dbexit		! exit if finished
    215 	nop
    216 	cmp	%o2, 8
    217 	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
    218 	nop
    219 	ldx	[%o1-8], %o4		! load 8 bytes
    220 	sub	%o1, 8, %o1		! decrease src ptr by 8
    221 	stx	%o4, [%o0-8]		! and store 8 bytes
    222 	subcc	%o2, 8, %o2		! decrease count by 8
    223 	bnz	%ncc, .dbremain		! exit if finished
    224 	sub	%o0, 8, %o0		! decrease dst ptr by 8
    225 	retl
    226 	mov	%g1, %o0
    227 
    228 !
    229 ! Following code is for overlapping copies where src and dest
    230 ! are not long word aligned
    231 !
    232 	.align	16
    233 .dbbck:
        /*
         * %fprs is saved in %o3 and FEF is set, as in the aligned path.
         * alignaddr leaves the 8-byte-aligned src end address in %o5 and
         * sets GSR.ALIGN from the src misalignment for the faligndata
         * instructions below.  %d0 always holds the most recently loaded
         * (higher-addressed) double; each faligndata combines it with the
         * next lower double to produce 8 bytes for the aligned dst.
         *
         * %o1 is moved down by the count rounded down to a multiple of 8
         * (in the delay slot of the blt, so on both paths).  After the
         * 8-byte loops it therefore points at the end of the leftover
         * bytes handled by .dbremain.
         */
    234 	rd	%fprs, %o3		! o3 = fprs
    235 
    236 	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
    237 	! So set it anyway, without checking.
    238 	wr      %g0, 0x4, %fprs         ! fprs.fef = 1
    239 
    240 	alignaddr %o1, %g0, %o5		! align src
    241 	ldd	[%o5], %d0		! get first 8 byte block
    242 	andn	%o2, 7, %o4		! prepare src ptr for finishup code
    243 	cmp	%o2, 32
    244 	blt,pn	%ncc, .dbmv8
    245 	sub	%o1, %o4, %o1		!
        /*
         * The first ldd of .dbmv64 is the delay slot of the blt below, so
         * %d2 is already loaded when .dbmv32enter is entered from here.
         * Inside .dbmv64 the "cmp %o2, 64" sets the condition codes that
         * the bgeu at the bottom of the loop tests.
         */
    246 	cmp	%o2, 4095		! check for short memmoves
    247 	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
    248 .dbmv64:
    249 	ldd	[%o5-8], %d2		! load 8 bytes
    250 	ldd	[%o5-16], %d4		! load 8 bytes
    251 	sub	%o5, 64, %o5		!
    252 	ldd	[%o5+40], %d6		! load 8 bytes
    253 	sub	%o0, 64, %o0		!
    254 	ldd	[%o5+32], %d8		! load 8 bytes
    255 	sub	%o2, 64, %o2		! 64 less bytes to copy
    256 	ldd	[%o5+24], %d18		! load 8 bytes
    257 	cmp	%o2, 64			! do we have < 64 bytes remaining
    258 	ldd	[%o5+16], %d28		! load 8 bytes
    259 	ldd	[%o5+8], %d30		! load 8 bytes
    260 	prefetch [%o5 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
    261 	faligndata %d2, %d0, %d10	! extract 8 bytes out
    262 	ldd	[%o5], %d0		! load 8 bytes
    263 	std	%d10, [%o0+56]		! store the current 8 bytes
    264 	faligndata %d4, %d2, %d12	! extract 8 bytes out
    265 	std	%d12, [%o0+48]		! store the current 8 bytes
    266 	faligndata %d6, %d4, %d14	! extract 8 bytes out
    267 	std	%d14, [%o0+40]		! store the current 8 bytes
    268 	faligndata %d8, %d6, %d16	! extract 8 bytes out
    269 	std	%d16, [%o0+32]		! store the current 8 bytes
    270 	faligndata %d18, %d8, %d20	! extract 8 bytes out
    271 	std	%d20, [%o0+24]		! store the current 8 bytes
    272 	faligndata %d28, %d18, %d22	! extract 8 bytes out
    273 	std	%d22, [%o0+16]		! store the current 8 bytes
    274 	faligndata %d30, %d28, %d24	! extract 8 bytes out
    275 	std	%d24, [%o0+8]		! store the current 8 bytes
    276 	faligndata %d0, %d30, %d26	! extract 8 bytes out
    277 	bgeu,pt	%ncc, .dbmv64
    278 	std	%d26, [%o0]		! store the current 8 bytes
    279 
    280 	cmp	%o2, 32
    281 	blt,pn	%ncc, .dbmvx
    282 	nop
    283 .dbmv32:
    284 	ldd	[%o5-8], %d2		! load 8 bytes
    285 .dbmv32enter:
    286 	ldd	[%o5-16], %d4		! load 8 bytes
    287 	sub	%o5, 32, %o5		!
    288 	ldd	[%o5+8], %d6		! load 8 bytes
    289 	sub	%o0, 32, %o0		!
    290 	faligndata %d2, %d0, %d10	! extract 8 bytes out
    291 	ldd	[%o5], %d0		! load 8 bytes
    292 	sub     %o2,32, %o2		! 32 less bytes to copy
    293 	std	%d10, [%o0+24]		! store the current 8 bytes
    294 	cmp	%o2, 32			! do we have < 32 bytes remaining
    295 	faligndata %d4, %d2, %d12	! extract 8 bytes out
    296 	std	%d12, [%o0+16]		! store the current 8 bytes
    297 	faligndata %d6, %d4, %d14	! extract 8 bytes out
    298 	std	%d14, [%o0+8]		! store the current 8 bytes
    299 	faligndata %d0, %d6, %d16	! extract 8 bytes out
    300 	bgeu,pt	%ncc, .dbmv32
    301 	std	%d16, [%o0]		! store the current 8 bytes
    302 .dbmvx:
    303 	cmp	%o2, 8			! do we have < 8 bytes remaining
    304 	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
    305 	nop
    306 .dbmv8:
    307 	ldd	[%o5-8], %d2
    308 	sub	%o0, 8, %o0		! since we are at the end
    309 					! when we first enter the loop
    310 	sub     %o2, 8, %o2		! 8 less bytes to copy
    311 	sub	%o5, 8, %o5
    312 	cmp	%o2, 8			! do we have < 8 bytes remaining
    313 	faligndata %d2, %d0, %d8        ! extract 8 bytes out
    314 	std	%d8, [%o0]		! store the current 8 bytes
    315 	bgeu,pt	%ncc, .dbmv8
    316 	fmovd	%d2, %d0
        /*
         * Done with FP registers.  The wr that restores %fprs is in the
         * delay slot of the non-annulled bz, so it executes both when
         * exiting through .dbexit and when falling into .dbremain.
         */
    317 .dbmvfinish:
    318 	and	%o3, 0x4, %o3           ! fprs.du = fprs.dl = 0
    319 	tst	%o2
    320 	bz,pt	%ncc, .dbexit
    321 	wr	%o3, %g0, %fprs         ! fprs = o3   restore fprs
    322 
    323 .dbremain:
        /*
         * 1 to 7 bytes remain.  Copy 4 of them with byte loads and stores
         * if possible, then finish one byte at a time in .dbbyte.  As in
         * .byte, the non-annulled bz leaves "dec %o1" in its delay slot,
         * which is harmless on the way to .dbexit.
         */
    324 	cmp	%o2, 4
    325 	blt,pn	%ncc, .dbbyte
    326 	nop
    327 	ldub	[%o1-1], %o3		! load last byte
    328 	stb	%o3, [%o0-1]		! store last byte
    329 	sub	%o1, 4, %o1
    330 	ldub	[%o1+2], %o3		! load 2nd from last byte
    331 	stb	%o3, [%o0-2]		! store 2nd from last byte
    332 	sub	%o0, 4, %o0
    333 	ldub	[%o1+1], %o3		! load 3rd from last byte
    334 	stb	%o3, [%o0+1]		! store 3rd from last byte
    335 	subcc	%o2, 4, %o2
    336 	ldub	[%o1], %o3		! load 4th from last byte
    337 	stb	%o3, [%o0]		! store 4th from last byte
    338 	bz,pt	%ncc, .dbexit
    339 .dbbyte:
    340 	dec	%o1			! decrement src address
    341 	ldub	[%o1], %o3		! read a byte
    342 	dec	%o0			! decrement dst address
    343 	deccc	%o2			! decrement count
    344 	bgu,pt	%ncc, .dbbyte		! loop until done
    345 	stb	%o3, [%o0]		! write byte
    346 .dbexit:
    347 	retl
    348         mov     %g1, %o0
    349 	SET_SIZE(memmove)
    350 
    351 
    352 	.align ICACHE_LINE_SIZE
    353 	ENTRY(memcpy)
    354 					! adjust instruction alignment
    355 	nop				! Do not remove, these nops affect
    356 	nop				! icache alignment and performance
    357 .forcpy:
    358 	cmp	%o2, SMALL_MAX		! check for not small case
    359 	bgu,pn	%ncc, .medium		! go to larger cases
    360 	mov	%o0, %g1		! save %o0
    361 	cmp	%o2, SHORTCOPY		! check for really short case
    362 	ble,pt	%ncc, .smallleft	!
    363 	or	%o0, %o1, %o3		! prepare alignment check
    364 	andcc	%o3, 0x3, %g0		! test for alignment
    365 	bz,pt	%ncc, .smallword	! branch to word aligned case
    366 	sub	%o2, 3, %o2		! adjust count to allow cc zero test
    367 .smallnotalign4:
    368 	ldub	[%o1], %o3		! read byte
    369 	subcc	%o2, 4, %o2		! reduce count by 4
    370 	stb	%o3, [%o0]		! write byte
    371 	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
    372 	add	%o1, 4, %o1		! advance SRC by 4
    373 	stb	%o3, [%o0+1]
    374 	ldub	[%o1-2], %o3
    375 	add	%o0, 4, %o0		! advance DST by 4
    376 	stb	%o3, [%o0-2]
    377 	ldub	[%o1-1], %o3
    378 	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
    379 	stb	%o3, [%o0-1]
    380 	add	%o2, 3, %o2		! restore count
    381 .smallleft:
    382 	tst	%o2
    383 	bz,pt	%ncc, .smallexit
    384 	nop
    385 .smallleft3:				! 1, 2, or 3 bytes remain
    386 	ldub	[%o1], %o3		! load one byte
    387 	deccc	%o2			! reduce count for cc test
    388 	bz,pt	%ncc, .smallexit
    389 	stb	%o3, [%o0]		! store one byte
    390 	ldub	[%o1+1], %o3		! load second byte
    391 	deccc	%o2
    392 	bz,pt	%ncc, .smallexit
    393 	stb	%o3, [%o0+1]		! store second byte
    394 	ldub	[%o1+2], %o3		! load third byte
    395 	stb	%o3, [%o0+2]		! store third byte
    396 	retl
    397 	mov	%g1, %o0		! restore %o0
    398 
    399 	.align	16
    400 	nop				! affects loop icache alignment
    401 .smallwords:
    402 	lduw	[%o1], %o3		! read word
    403 .smallwordx:
    404 	subcc	%o2, 8, %o2		! update count
    405 	stw	%o3, [%o0]		! write word
    406 	add	%o1, 8, %o1		! update SRC
    407 	lduw	[%o1-4], %o3		! read word
    408 	add	%o0, 8, %o0		! update DST
    409 	bgu,pt	%ncc, .smallwords	! loop until done
    410 	stw	%o3, [%o0-4]		! write word
    411 	addcc	%o2, 7, %o2		! restore count
    412 	bz,pt	%ncc, .smallexit	! check for completion
    413 	nop
    414 	cmp	%o2, 4			! check for 4 or more bytes left
    415 	blt	.smallleft3		! if not, go to finish up
    416 	nop
    417 	lduw	[%o1], %o3
    418 	add	%o1, 4, %o1
    419 	subcc	%o2, 4, %o2
    420 	stw	%o3, [%o0]
    421 	add	%o0, 4, %o0
    422 	bnz,pt	%ncc, .smallleft3
    423 	nop
    424 	retl
    425 	mov	%g1, %o0		! restore %o0
    426 
    427 .smallword:
    428 	subcc	%o2, 4, %o2		! update count
    429 	bgu,pt	%ncc, .smallwordx
    430 	lduw	[%o1], %o3		! read word
    431 	addcc	%o2, 3, %o2		! restore count
    432 	bz,pt	%ncc, .smallexit
    433 	stw	%o3, [%o0]		! write word
    434 	deccc	%o2			! reduce count for cc test
    435 	ldub	[%o1+4], %o3		! load one byte
    436 	bz,pt	%ncc, .smallexit
    437 	stb	%o3, [%o0+4]		! store one byte
    438 	ldub	[%o1+5], %o3		! load second byte
    439 	deccc	%o2
    440 	bz,pt	%ncc, .smallexit
    441 	stb	%o3, [%o0+5]		! store second byte
    442 	ldub	[%o1+6], %o3		! load third byte
    443 	stb	%o3, [%o0+6]		! store third byte
    444 .smallexit:
    445 	retl
    446 	mov	%g1, %o0		! restore %o0
    447 	.align 16
    448 .medium:
    449 	neg	%o0, %o5
    450 	neg	%o1, %o3
    451 	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
    452 	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned
    453 
    454 	bz	%ncc, 2f
    455 	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
    456 				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned
    457 
    458 	sub	%o2, %o5, %o2	! update count
    459 
    460 1:
    461 	ldub	[%o1], %o4
    462 	deccc	%o5
    463 	inc	%o1
    464 	stb	%o4, [%o0]
    465 	bgu,pt	%ncc, 1b
    466 	inc	%o0
    467 
    468 	! Now DST is 8-byte aligned.  o0, o1, o2 are current.
    469 
    470 2:
    471 	andcc	%o1, 0x3, %g0		! test alignment
    472 	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
    473 					! if src, dst not aligned
    474 	prefetch [%o1 + (1 * BLOCK_SIZE)], 20
    475 
    476 /*
    477  * Handle all cases where src and dest are aligned on word
    478  * or long word boundaries.  Use unrolled loops for better
    479  * performance.  This option wins over standard large data
    480  * move when source and destination is in cache for medium
    481  * to short data moves.
    482  */
    483 	andcc	%o1, 0x7, %g0		! test word alignment
    484 	bz,pt	%ncc, .medlword		! branch to long word aligned case
    485 	prefetch [%o1 + (2 * BLOCK_SIZE)], 20
    486 	cmp	%o2, MED_WMAX		! limit to store buffer size
    487 	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
    488 	nop
    489 	subcc	%o2, 15, %o2		! adjust length to allow cc test
    490 					! for end of loop
    491 	ble,pt	%ncc, .medw15		! skip big loop if less than 16
    492 	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
    493 /*
    494  * no need to put prefetch in loop as prefetches have
    495  * already been issued for maximum loop size
    496  */
    497 .medw16:
    498 	ld	[%o1], %o4		! load
    499 	subcc	%o2, 16, %o2		! decrement length count
    500 	stw	%o4, [%o0]		! and store
    501 	ld	[%o1+4], %o3		! a block of 16 bytes
    502 	add	%o1, 16, %o1		! increase src ptr by 16
    503 	stw	%o3, [%o0+4]
    504 	ld	[%o1-8], %o4
    505 	add	%o0, 16, %o0		! increase dst ptr by 16
    506 	stw	%o4, [%o0-8]
    507 	ld	[%o1-4], %o3
    508 	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
    509 	stw	%o3, [%o0-4]
    510 .medw15:
    511 	addcc	%o2, 15, %o2		! restore count
    512 	bz,pt	%ncc, .medwexit		! exit if finished
    513 	nop
    514 	cmp	%o2, 8
    515 	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
    516 	nop				!
    517 	ld	[%o1], %o4		! load 4 bytes
    518 	subcc	%o2, 8, %o2		! decrease count by 8
    519 	stw	%o4, [%o0]		! and store 4 bytes
    520 	add	%o1, 8, %o1		! increase src ptr by 8
    521 	ld	[%o1-4], %o3		! load 4 bytes
    522 	add	%o0, 8, %o0		! increase dst ptr by 8
    523 	stw	%o3, [%o0-4]		! and store 4 bytes
    524 	bz	%ncc, .medwexit		! exit if finished
    525 	nop
    526 .medw7:					! count is ge 1, less than 8
    527 	cmp	%o2, 3			! check for 4 bytes left
    528 	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
    529 	nop				!
    530 	ld	[%o1], %o4		! load 4 bytes
    531 	sub	%o2, 4, %o2		! decrease count by 4
    532 	add	%o1, 4, %o1		! increase src ptr by 4
    533 	stw	%o4, [%o0]		! and store 4 bytes
    534 	add	%o0, 4, %o0		! increase dst ptr by 4
    535 	tst	%o2			! check for zero bytes left
    536 	bz	%ncc, .medwexit		! exit if finished
    537 	nop
    538 .medw3:					! count is known to be 1, 2, or 3
    539 	deccc	%o2			! reduce count by one
    540 	ldub	[%o1], %o3		! load one byte
    541 	bz,pt	%ncc, .medwexit		! exit if last byte
    542 	stb	%o3, [%o0]		! store one byte
    543 	ldub	[%o1+1], %o3		! load second byte
    544 	deccc	%o2			! reduce count by one
    545 	bz,pt	%ncc, .medwexit		! exit if last byte
    546 	stb	%o3, [%o0+1]		! store second byte
    547 	ldub	[%o1+2], %o3		! load third byte
    548 	stb	%o3, [%o0+2]		! store third byte
    549 .medwexit:
    550 	retl
    551 	mov	%g1, %o0		! restore %o0
    552 
    553 /*
    554  * Special case for handling when src and dest are both long word aligned
    555  * and total data to move is between SMALL_MAX and MED_MAX bytes
    556  */
    557 
    558 	.align 16
    559 	nop
    560 .medlword:				! long word aligned
    561 					! length > SMALL_MAX
    562 	cmp	%o2, MED_MAX		! limit to store buffer size
    563 	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
    564 	nop
    565 	subcc	%o2, 31, %o2		! adjust length to allow cc test
    566 					! for end of loop
    567 	ble,pt	%ncc, .medl31		! skip big loop if less than 32
    568 	prefetch [%o1 + (3 * BLOCK_SIZE)], 20	! into the l2 cache
    569 /*
    570  * no need to put prefetch in loop as prefetches have
    571  * already been issued for maximum loop size
    572  */
    573 .medl32:
    574 	ldx	[%o1], %o4		! load
    575 	subcc	%o2, 32, %o2		! decrement length count
    576 	stx	%o4, [%o0]		! and store
    577 	ldx	[%o1+8], %o3		! a block of 32 bytes
    578 	add	%o1, 32, %o1		! increase src ptr by 32
    579 	stx	%o3, [%o0+8]
    580 	ldx	[%o1-16], %o4
    581 	add	%o0, 32, %o0		! increase dst ptr by 32
    582 	stx	%o4, [%o0-16]
    583 	ldx	[%o1-8], %o3
    584 	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
    585 	stx	%o3, [%o0-8]
    586 .medl31:
    587 	addcc	%o2, 16, %o2		! adjust remaining count
    588 	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
    589 	nop				!
    590 	ldx	[%o1], %o4		! load and store 16 bytes
    591 	add	%o1, 16, %o1		! increase src ptr by 16
    592 	stx	%o4, [%o0]		!
    593 	sub	%o2, 16, %o2		! decrease count by 16
    594 	ldx	[%o1-8], %o3		!
    595 	add	%o0, 16, %o0		! increase dst ptr by 16
    596 	stx	%o3, [%o0-8]
    597 .medl15:
    598 	addcc	%o2, 15, %o2		! restore count
    599 	bz,pt	%ncc, .medwexit		! exit if finished
    600 	nop
    601 	cmp	%o2, 8
    602 	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
    603 	nop
    604 	ldx	[%o1], %o4		! load 8 bytes
    605 	add	%o1, 8, %o1		! increase src ptr by 8
    606 	stx	%o4, [%o0]		! and store 8 bytes
    607 	subcc	%o2, 8, %o2		! decrease count by 8
    608 	bz	%ncc, .medwexit		! exit if finished
    609 	add	%o0, 8, %o0		! increase dst ptr by 8
    610 	ba	.medw7
    611 	nop
    612 
    613 	.align 16
    614 	nop
    615 	nop
    616 	nop
    617 .mediumsetup:
    618 	prefetch [%o1 + (2 * BLOCK_SIZE)], 21
    619 .mediumrejoin:
    620 	rd	%fprs, %o4		! check for unused FPU
    621 
    622 	add	%o1, 8, %o1		! prepare to round SRC upward
    623 
    624 	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
    625 	or	%o5, 0x67f, %o5
    626 
    627 	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
    628 	bz,a	%ncc, 3f
    629 	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
    630 3:
    631 	cmp	%o2, MEDIUM_MAX
    632 	bmask	%o5, %g0, %g0
    633 
    634 	! Compute o5 (number of bytes that need copying using the main loop).
    635 	! First, compute for the medium case.
    636 	! Then, if large case, o5 is replaced by count for block alignment.
    637 	! Be careful not to read past end of SRC
    638 	! Currently, o2 is the actual count remaining
    639 	!            o3 is how much sooner we'll cross the alignment boundary
    640 	!                in SRC compared to in DST
    641 	!
    642 	! Examples:  Let # denote bytes that should not be accessed
    643 	!            Let x denote a byte already copied to align DST
    644 	!            Let . and - denote bytes not yet copied
    645 	!            Let | denote double alignment boundaries
    646 	!
    647 	!            DST:  ######xx|........|--------|..######   o2 = 18
    648 	!                          o0
    649 	!
    650 	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
    651 	!                          o1
    652 	!
    653 	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
    654 	!                                   o1
    655 	!
    656 	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
    657 	!                                   o1
    658 
    659 	or	%g0, -8, %o5
    660 	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1
    661 
    662 	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
    663 	add	%o5, %o2, %o5
    664 	add	%o5, %o3, %o5
    665 
    666 	bleu	%ncc, 4f
    667 	andn	%o5, 7, %o5		! 8 byte aligned count
    668 	neg	%o0, %o5		! 'large' case
    669 	and	%o5, BLOCK_SIZE-1, %o5  ! bytes till DST block aligned
    670 4:
    671 	brgez,a	%o3, .beginmedloop
    672 	ldd	[%o1-8], %d0
    673 
    674 	add	%o1, %o3, %o1		! back up o1
    675 5:
    676 	ldda	[%o1]ASI_FL8_P, %d2
    677 	inc	%o1
    678 	andcc	%o1, 7, %g0
    679 	bnz	%ncc, 5b
    680 	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
    681 
    682 .beginmedloop:
    683 	tst	%o5
    684 	bz	%ncc, .endmedloop
    685 	sub	%o2, %o5, %o2		! update count for later
    686 
    687 	! Main loop to write out doubles.  Note: o5 & 7 == 0
    688 
    689 	ldd	[%o1], %d2
    690 	subcc	%o5, 8, %o5		! update local count
    691 	bz,pn	%ncc, 1f
    692 	add	%o1, 8, %o1		! update SRC
    693 
    694 .medloop:
    695 	faligndata %d0, %d2, %d4
    696 	ldd	[%o1], %d0
    697 	subcc	%o5, 8, %o5		! update local count
    698 	add	%o1, 16, %o1		! update SRC
    699 	std	%d4, [%o0]
    700 	bz,pn	%ncc, 2f
    701 	faligndata %d2, %d0, %d6
    702 	ldd	[%o1 - 8], %d2
    703 	subcc	%o5, 8, %o5		! update local count
    704 	std	%d6, [%o0 + 8]
    705 	bnz,pt	%ncc, .medloop
    706 	add	%o0, 16, %o0		! update DST
    707 
    708 1:
    709 	faligndata %d0, %d2, %d4
    710 	fmovd	%d2, %d0
    711 	std	%d4, [%o0]
    712 	ba	.endmedloop
    713 	add	%o0, 8, %o0
    714 
    715 2:
    716 	std	%d6, [%o0 + 8]
    717 	sub	%o1, 8, %o1
    718 	add	%o0, 16, %o0
    719 
    720 
    721 .endmedloop:
    722 	! Currently, o1 is pointing to the next double-aligned byte in SRC
    723 	! The 8 bytes starting at [o1-8] are available in d0
    724 	! At least one, and possibly all, of these need to be written.
    725 
    726 	cmp	%o2, BLOCK_SIZE
    727 	bgu	%ncc, .large		! otherwise, less than 16 bytes left
    728 
    729 #if 0
    730 
    731 	/* This code will use partial stores.  */
    732 
    733 	mov	%g0, %o5
    734 	and	%o3, 7, %o3		! Number of bytes needed to completely
    735 					! fill %d0 with good (unwritten) data.
    736 
    737 	subcc	%o2, 8, %o2		! update count (maybe too much)
    738 	movl	%ncc, %o2, %o5
    739 	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
    740 	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)
    741 
    742 	bz	%ncc, 2f
    743 	alignaddr %o3, %g0, %g0		! set GSR.ALIGN
    744 
    745 1:
    746 	deccc	%o5
    747 	ldda	[%o1]ASI_FL8_P, %d2
    748 	inc	%o1
    749 	bgu	%ncc, 1b
    750 	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
    751 
    752 2:
    753 	not     %o3
    754 	faligndata %d0, %d0, %d0	! shift bytes to the left
    755 	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
    756 	edge8n	%g0, %o3, %o5
    757 	stda	%d0, [%o0]%o5, ASI_PST8_P
    758 	brlez	%o2, .mediumexit
    759 	add	%o0, %o3, %o0		! update DST to last stored byte
    760 3:
    761 	inc	%o0
    762 	deccc	%o2
    763 	ldub	[%o1], %o3
    764 	stb	%o3, [%o0]
    765 	bgu	%ncc, 3b
    766 	inc	%o1
    767 
    768 #else
    769 
    770 	andcc	%o3, 7, %o5		! Number of bytes needed to completely
    771 					! fill %d0 with good (unwritten) data.
    772 	bz	%ncc, 2f
    773 	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
    774 	cmp	%o2, 8
    775 	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
    776 	add	%o1, %o3, %o1 		! Back up %o1
    777 
    778 1:
    779 	deccc	%o5
    780 	ldda	[%o1]ASI_FL8_P, %d2
    781 	inc	%o1
    782 	bgu	%ncc, 1b
    783 	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
    784 
    785 2:
    786 	subcc	%o2, 8, %o2
    787 	std	%d0, [%o0]
    788 	bz	%ncc, .mediumexit
    789 	add	%o0, 8, %o0
    790 3:
    791 	ldub	[%o1], %o3
    792 	deccc	%o2
    793 	inc	%o1
    794 	stb	%o3, [%o0]
    795 	bgu	%ncc, 3b
    796 	inc	%o0
    797 #endif
    798 
    799 .mediumexit:
    800         wr      %o4, %g0, %fprs		! fprs = o4   restore fprs
    801 	retl
    802         mov     %g1, %o0
    803 
    804 
    805 	.align ICACHE_LINE_SIZE
    806 .large:
    807 	! The following test for BSTORE_SIZE is used to decide whether
    808 	! to store data with a block store or with individual stores.
    809 	! The block store wins when the amount of data is so large
    810 	! that it causes other application data to be moved out
    811 	! of the L1 or L2 cache.
    812 	! On a Panther, block store can lose more often because block
    813 	! store forces the stored data to be removed from the L3 cache.
    814 	!
    815 	sethi	%hi(BSTORE_SIZE),%o5
    816 	or	%o5,%lo(BSTORE_SIZE),%o5
    817 	cmp	%o2, %o5
    818 	bgu	%ncc, .xlarge
    819 
    820 	! %o0 I/O DST is 64-byte aligned
    821 	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
    822 	! %d0 I/O already loaded with SRC data from [%o1-8]
    823 	! %o2 I/O count (number of bytes that need to be written)
    824 	! %o3 I   Not written.  If zero, then SRC is double aligned.
    825 	! %o4 I   Not written.  Holds fprs.
    826 	! %o5   O The number of doubles that remain to be written.
    827 
    828 	! Load the rest of the current block
    829 	! Recall that %o1 is further into SRC than %o0 is into DST
    830 
    831 	prefetch [%o0 + (0 * BLOCK_SIZE)], 22
    832 	prefetch [%o0 + (1 * BLOCK_SIZE)], 22
    833 	prefetch [%o0 + (2 * BLOCK_SIZE)], 22
    834 	ldd	[%o1], %f2
    835 	prefetch [%o1 + (3 * BLOCK_SIZE)], 21
    836 	ldd	[%o1 + 0x8], %f4
    837 	faligndata %f0, %f2, %f32
    838 	ldd	[%o1 + 0x10], %f6
    839 	faligndata %f2, %f4, %f34
    840 	ldd	[%o1 + 0x18], %f8
    841 	faligndata %f4, %f6, %f36
    842 	ldd	[%o1 + 0x20], %f10
    843         or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
    844 	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
    845 	faligndata %f6, %f8, %f38
    846 	ldd	[%o1 + 0x28], %f12
    847 	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed lter)
    848 	faligndata %f8, %f10, %f40
    849 	ldd	[%o1 + 0x30], %f14
    850 	faligndata %f10, %f12, %f42
    851 	ldd	[%o1 + 0x38], %f0
    852 	sub	%o2, BLOCK_SIZE, %o2	! update count
    853 	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
    854 	add	%o1, BLOCK_SIZE, %o1		! update SRC
    855 
    856 	! Main loop.  Write previous block.  Load rest of current block.
    857 	! Some bytes will be loaded that won't yet be written.
    858 1:
    859 	ldd	[%o1], %f2
    860 	faligndata %f12, %f14, %f44
    861 	ldd	[%o1 + 0x8], %f4
    862 	faligndata %f14, %f0, %f46
    863 	std	%f32, [%o0]
    864 	std	%f34, [%o0+8]
    865 	std	%f36, [%o0+16]
    866 	std	%f38, [%o0+24]
    867 	std	%f40, [%o0+32]
    868 	std	%f42, [%o0+40]
    869 	std	%f44, [%o0+48]
    870 	std	%f46, [%o0+56]
    871 	sub	%o2, BLOCK_SIZE, %o2		! update count
    872 	prefetch [%o0 + (6 * BLOCK_SIZE)], 22
    873 	prefetch [%o0 + (3 * BLOCK_SIZE)], 22
    874 	add	%o0, BLOCK_SIZE, %o0		! update DST
    875 	ldd	[%o1 + 0x10], %f6
    876 	faligndata %f0, %f2, %f32
    877 	ldd	[%o1 + 0x18], %f8
    878 	faligndata %f2, %f4, %f34
    879 	ldd	[%o1 + 0x20], %f10
    880 	faligndata %f4, %f6, %f36
    881 	ldd	[%o1 + 0x28], %f12
    882 	faligndata %f6, %f8, %f38
    883 	ldd	[%o1 + 0x30], %f14
    884 	faligndata %f8, %f10, %f40
    885 	ldd	[%o1 + 0x38], %f0
    886 	faligndata %f10, %f12, %f42
    887 	cmp	%o2, BLOCK_SIZE + 8
    888 	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
    889 	bgu,pt	%ncc, 1b
    890 	add	%o1, BLOCK_SIZE, %o1	! update SRC
    891 	faligndata %f12, %f14, %f44
    892 	faligndata %f14, %f0, %f46
    893 	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
    894 	cmp	%o2, BLOCK_SIZE
    895 	bne	%ncc, 2f		! exactly 1 block remaining?
    896 	add	%o0, BLOCK_SIZE, %o0	! update DST
    897 	brz,a	%o3, 3f			! is SRC double aligned?
    898 	ldd	[%o1], %f2
    899 
    900 2:
    901 	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
    902 	add	%o5, %o3, %o5
    903 
    904 	membar	#StoreLoad|#StoreStore
    905 
    906 	ba	.beginmedloop
    907 	andn	%o5, 7, %o5		! 8 byte aligned count
    908 
    909 
    910 	! This is when there is exactly 1 block remaining and SRC is aligned
    911 3:
    912 	ldd	[%o1 + 0x8], %f4
    913 	ldd	[%o1 + 0x10], %f6
    914 	fsrc1	%f0, %f32
    915 	ldd	[%o1 + 0x18], %f8
    916 	fsrc1	%f2, %f34
    917 	ldd	[%o1 + 0x20], %f10
    918 	fsrc1	%f4, %f36
    919 	ldd	[%o1 + 0x28], %f12
    920 	fsrc1	%f6, %f38
    921 	ldd	[%o1 + 0x30], %f14
    922 	fsrc1	%f8, %f40
    923 	fsrc1	%f10, %f42
    924 	fsrc1	%f12, %f44
    925 	fsrc1	%f14, %f46
    926 	stda	%f32, [%o0]ASI_BLK_P
    927 	membar	#StoreLoad|#StoreStore
    928 	wr	%o4, 0, %fprs
    929 	retl
    930 	mov	%g1, %o0
    931 
    932 
    933 	.align 16
    934 	! The two nops here cause the loop starting at 1f below to be
    935 	! on a cache line boundary, improving performance.
    936 	nop
    937 	nop
    938 .xlarge:
    939 	! %o0 I/O DST is 64-byte aligned
    940 	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
    941 	! %d0 I/O already loaded with SRC data from [%o1-8]
    942 	! %o2 I/O count (number of bytes that need to be written)
    943 	! %o3 I   Not written.  If zero, then SRC is double aligned.
    944 	! %o4 I   Not written.  Holds fprs.
    945 	! %o5   O The number of doubles that remain to be written.
    946 
    947 	! Load the rest of the current block
    948 	! Recall that %o1 is further into SRC than %o0 is into DST
    949 
    950 	! prefetch [%o1 + (3 * BLOCK_SIZE)], 21
    951 	! executed in delay slot for branch to .xlarge
    952 	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
    953 	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
    954 	ldd	[%o1], %f2
    955 	prefetch [%o1 + (6 * BLOCK_SIZE)], 21
    956 	ldd	[%o1 + 0x8], %f4
    957 	faligndata %f0, %f2, %f32
    958 	ldd	[%o1 + 0x10], %f6
    959 	faligndata %f2, %f4, %f34
    960 	ldd	[%o1 + 0x18], %f8
    961 	faligndata %f4, %f6, %f36
    962 	ldd	[%o1 + 0x20], %f10
    963         or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
    964 	faligndata %f6, %f8, %f38
    965 	ldd	[%o1 + 0x28], %f12
    966 	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
    967 	faligndata %f8, %f10, %f40
    968 	ldd	[%o1 + 0x30], %f14
    969 	faligndata %f10, %f12, %f42
    970 	ldd	[%o1 + 0x38], %f0
    971 	sub	%o2, BLOCK_SIZE, %o2	! update count
    972 	prefetch [%o1 + (7 * BLOCK_SIZE)], 21
    973 	add	%o1, BLOCK_SIZE, %o1	! update SRC
    974 
    975 	! This point is 32-byte aligned because 24 instructions have been
    976 	! emitted since the previous alignment directive.
    977 
    978 
    979 	! Main loop.  Write previous block.  Load rest of current block.
    980 	! Some bytes will be loaded that won't yet be written.
    981 1:
    982 	ldd	[%o1], %f2
    983 	faligndata %f12, %f14, %f44
    984 	ldd	[%o1 + 0x8], %f4
    985 	faligndata %f14, %f0, %f46
    986 	stda	%f32, [%o0]ASI_BLK_P
    987 	sub	%o2, BLOCK_SIZE, %o2		! update count
    988 	ldd	[%o1 + 0x10], %f6
    989 	faligndata %f0, %f2, %f32
    990 	ldd	[%o1 + 0x18], %f8
    991 	faligndata %f2, %f4, %f34
    992 	ldd	[%o1 + 0x20], %f10
    993 	faligndata %f4, %f6, %f36
    994 	ldd	[%o1 + 0x28], %f12
    995 	faligndata %f6, %f8, %f38
    996 	ldd	[%o1 + 0x30], %f14
    997 	faligndata %f8, %f10, %f40
    998 	ldd	[%o1 + 0x38], %f0
    999 	faligndata %f10, %f12, %f42
   1000 	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
   1001 	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21
   1002 	add	%o0, BLOCK_SIZE, %o0		! update DST
   1003 	cmp	%o2, BLOCK_SIZE + 8
   1004 	! A second prefetch is important to correct for occasionally dropped
   1005 	! initial prefetches; 5*BLK works best over range of (src-dst) mod 1K.
   1006 	! A strong prefetch prevents drops on Panther, but Jaguar and earlier
   1007 	! US-III models treat strong prefetches as weak prefetches, so
   1008 	! to avoid regressions on customer hardware, we retain the prefetch.
   1009 	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
   1010 	bgu,pt	%ncc, 1b
   1011 	add	%o1, BLOCK_SIZE, %o1		! update SRC
   1012 
   1013 	faligndata %f12, %f14, %f44
   1014 	faligndata %f14, %f0, %f46
   1015 	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
   1016 	cmp	%o2, BLOCK_SIZE
   1017 	bne	%ncc, 2f		! exactly 1 block remaining?
   1018 	add	%o0, BLOCK_SIZE, %o0	! update DST
   1019 	brz,a	%o3, 3f			! is SRC double aligned?
   1020 	ldd	[%o1], %f2
   1021 
   1022 2:
   1023 	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
   1024 	add	%o5, %o3, %o5
   1025 
   1026 	membar	#StoreLoad|#StoreStore
   1027 
   1028 	ba	.beginmedloop
   1029 	andn	%o5, 7, %o5		! 8 byte aligned count
   1030 
   1031 
   1032 	! This is when there is exactly 1 block remaining and SRC is aligned
   1033 3:
   1034 	ldd	[%o1 + 0x8], %f4
   1035 	ldd	[%o1 + 0x10], %f6
   1036 	fsrc1	%f0, %f32
   1037 	ldd	[%o1 + 0x18], %f8
   1038 	fsrc1	%f2, %f34
   1039 	ldd	[%o1 + 0x20], %f10
   1040 	fsrc1	%f4, %f36
   1041 	ldd	[%o1 + 0x28], %f12
   1042 	fsrc1	%f6, %f38
   1043 	ldd	[%o1 + 0x30], %f14
   1044 	fsrc1	%f8, %f40
   1045 	fsrc1	%f10, %f42
   1046 	fsrc1	%f12, %f44
   1047 	fsrc1	%f14, %f46
   1048 	stda	%f32, [%o0]ASI_BLK_P
   1049 	membar	#StoreLoad|#StoreStore
   1050 	wr	%o4, 0, %fprs
   1051 	retl
   1052 	mov	%g1, %o0
   1053 
   1054 	SET_SIZE(memcpy)
   1055