Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 	.file	"memcpy.s"
     28 
     29 /*
     30  * memcpy(s1, s2, len)
     31  *
     32  * Copy s2 to s1, always copy n bytes.
     33  * Note: this C code does not work for overlapped copies.
     34  *       Memmove() and bcopy() do.
     35  *
     36  * Fast assembler language version of the following C-program for memcpy
     37  * which represents the `standard' for the C-library.
     38  *
     39  *	void *
     40  *	memcpy(void *s, const void *s0, size_t n)
     41  *	{
     42  *		if (n != 0) {
     43  *	   	    char *s1 = s;
     44  *		    const char *s2 = s0;
     45  *		    do {
     46  *			*s1++ = *s2++;
     47  *		    } while (--n != 0);
     48  *		}
     49  *		return (s);
     50  *	}
     51  */
     52 
     53 #include <sys/asm_linkage.h>
     54 #include <sys/sun4asi.h>
     55 #include <sys/trap.h>
     56 
        /*
         * Layout constants.
         *   ICACHE_LINE_SIZE  alignment applied to the memcpy entry point and
         *                     to the .large loop (.align ICACHE_LINE_SIZE).
         *   BLOCK_SIZE        unit used for prefetch distances and for the
         *                     64-byte-at-a-time copy loops.
         *   FPRS_FEF          value written to %fprs to set fprs.fef before
         *                     the floating point registers are used.
         */
     57 #define	ICACHE_LINE_SIZE	64
     58 #define	BLOCK_SIZE		64
     59 #define	FPRS_FEF		0x4
     60 
        /*
         * Size thresholds that select a copy strategy.
         *   ALIGNED8_FPCOPY_THRESHOLD / ALIGNED4_FPCOPY_THRESHOLD
         *                     8-byte / 4-byte aligned copies longer than this
         *                     leave the integer loops for .mediumrejoin.
         *   BST_THRESHOLD     counts above this branch from .large to .xlarge.
         */
     61 #define	ALIGNED8_FPCOPY_THRESHOLD	1024
     62 #define	ALIGNED4_FPCOPY_THRESHOLD	1024
     63 #define	BST_THRESHOLD			65536
     64 
        /*
         *   SHORTCOPY         counts <= this go straight to the byte copy at
         *                     .smallleft.
         *   SMALL_MAX         counts above this branch to .medium.
         *   MEDIUM_MAX        counts above this take the 'large' set-up in
         *                     .mediumrejoin (block-align DST first).
         *   MED_WMAX          NOTE(review): not referenced in the portion of
         *                     the file reviewed here - confirm it is still used.
         */
     65 #define	SHORTCOPY	3
     66 #define	SMALL_MAX	64
     67 #define	MEDIUM_MAX	255
     68 #define	MED_WMAX	256	/* max copy for medium word-aligned case */
     69 
        /*
         * Prefetch function codes used on the first source / destination
         * address at the memmove and memcpy entry points.
         * NOTE(review): presumably the implementation-specific "strong"
         * prefetch variants - confirm against the processor manual.
         */
     70 #define	N_READS_STRONG	20
     71 #define	N_WRITES_STRONG	22
     72 
     73 
        /*
         * NOTE(review): going by the macro name (defined in a header, not
         * visible here), these make memmove and memcpy weak function symbols.
         */
     74 	ANSI_PRAGMA_WEAK(memmove,function)
     75 	ANSI_PRAGMA_WEAK(memcpy,function)
     76 
        /*
         * memmove(s1, s2, len)
         *
         * On entry %o0 = destination, %o1 = source, %o2 = byte count.
         *
         * If the source address is >= the destination address, or the count
         * is <= (destination - source), control transfers to .forcpy, the
         * forward copy shared with memcpy.  Otherwise the destination
         * overlaps the tail of the source and the copy below runs from the
         * highest address downward.  On that backward path the original
         * destination is kept in %g1 and returned in %o0.
         *
         * Backward copy strategy:
         *   count <  64  4 bytes per iteration (.byte4loop), then single
         *                bytes (.byteloop).
         *   count >= 64  .dbalign copies single bytes until the (end of the)
         *                destination is 8-byte aligned, then either
         *                  .dbmedl32  source also 8-byte aligned: ldx/stx
         *                  .dbbck     source unaligned: ldd + faligndata +
         *                             std through the FP registers; %fprs is
         *                             saved in %o3 and written back with only
         *                             the FEF bit kept.
         *
         * Branches are followed by a delay slot; several loops below place
         * the first instruction of the next label in that slot on purpose.
         */
     77 	ENTRY(memmove)
     78 	prefetch [%o1], N_READS_STRONG
     79 	prefetch [%o0], N_WRITES_STRONG
     80 	cmp	%o1, %o0	! if from address is >= to use forward copy
     81 	bgeu	%ncc, .forcpy	! else use backward if ...
     82 	sub	%o0, %o1, %o4	! get difference of two addresses
     83 	cmp	%o2, %o4	! compare size and difference of addresses
     84 	bleu	%ncc, .forcpy	! forward copy unless size is bigger (overlap)
     85 	nop
     86 
     87 	!
     88 	! an overlapped copy that must be done "backwards"
     89 	!
     90 .ovbc:
     91 	mov	%o0, %g1		! save dest address for return val
     92 	add     %o1, %o2, %o1           ! get to end of source space
     93 	add     %o0, %o2, %o0           ! get to end of destination space
     94 
        	! 64 bytes or more: use the aligned 8-byte paths.
        	! Fewer than 4 bytes: go straight to the single-byte loop.
     95 	cmp	%o2, 64
     96 	bgeu,pn	%ncc, .dbalign
     97 	nop
     98 	cmp	%o2, 4
     99 	blt,pn	%ncc, .byte
    100 	sub	%o2, 3, %o2		! bias count by -3 (delay slot, both paths)
    101 .byte4loop:
    102 	ldub	[%o1-1], %o3		! load last byte
    103 	stb	%o3, [%o0-1]		! store last byte
    104 	sub	%o1, 4, %o1
    105 	ldub	[%o1+2], %o3		! load 2nd from last byte
    106 	stb	%o3, [%o0-2]		! store 2nd from last byte
    107 	sub	%o0, 4, %o0
    108 	ldub	[%o1+1], %o3		! load 3rd from last byte
    109 	stb	%o3, [%o0+1]		! store 3rd from last byte
    110 	subcc	%o2, 4, %o2
    111 	ldub	[%o1], %o3		! load 4th from last byte
    112 	bgu,pt	%ncc, .byte4loop
    113 	stb	%o3, [%o0]		! store 4th from last byte
    114 .byte:
    115 	addcc	%o2, 3, %o2		! undo the -3 bias; zero means done
        	! The delay slot of this branch is the "dec %o1" at .byteloop.
        	! When the branch is taken the extra decrement is harmless
        	! because %o1 is not used after .exit.
    116 	bz,pt	%ncc, .exit
    117 .byteloop:
    118 	dec	%o1			! decrement src address
    119 	ldub	[%o1], %o3		! read a byte
    120 	dec	%o0			! decrement dst address
    121 	deccc	%o2			! decrement count
    122 	bgu,pt	%ncc, .byteloop		! loop until done
    123 	stb	%o3, [%o0]		! write byte
    124 .exit:
    125 	retl
    126 	mov	%g1, %o0
    127 
    128 	.align	16
    129 .dbalign:
    130 	prefetch [%o1 - (4 * BLOCK_SIZE)], #one_read
    131 	prefetch [%o0 - (4 * BLOCK_SIZE)], #one_write
    132 	andcc   %o0, 7, %o5		! bytes till DST 8 byte aligned
    133 	bz,pt	%ncc, .dbmed
    134 	sub	%o2, %o5, %o2		! update count
    135 .dbalign1:
    136 	dec	%o1			! decrement src address
    137 	ldub	[%o1], %o3		! read a byte
    138 	dec	%o0			! decrement dst address
    139 	deccc	%o5			! decrement count
    140 	bgu,pt	%ncc, .dbalign1		! loop until done
    141 	stb	%o3, [%o0]		! store a byte
    142 
    143 ! check for src long word alignment
    144 .dbmed:
    145 	andcc	%o1, 7, %g0		! chk src long word alignment
    146 	bnz,pn	%ncc, .dbbck
    147 	nop
    148 !
    149 ! Following code is for overlapping copies where src and dest
    150 ! are long word aligned
    151 !
    152 !
    153 ! For SPARC64-VI, prefetch is effective for both integer and fp register
    154 ! operations. There are no benefits in using the fp registers for
    155 ! aligned data copying.
    156 
    157 .dbmedl32enter:
    158 	subcc	%o2, 31, %o2		! adjust length to allow cc test
    159 					! for end of loop
    160 	ble,pt  %ncc, .dbmedl31		! skip big loop if less than 32
    161 	nop
    162 .dbmedl32:
    163 	ldx	[%o1-8], %o4		! load
    164 	prefetch [%o1 - (8 * BLOCK_SIZE)], #one_read
    165 	subcc	%o2, 32, %o2		! decrement length count
    166 	stx	%o4, [%o0-8]		! and store
    167 	prefetch [%o0 - (8 * BLOCK_SIZE)], #one_write
    168 	ldx	[%o1-16], %o3		! a block of 32 bytes
    169 	sub	%o1, 32, %o1		! decrease src ptr by 32
    170 	stx	%o3, [%o0-16]
    171 	ldx	[%o1+8], %o4
    172 	sub	%o0, 32, %o0		! decrease dst ptr by 32
    173 	stx	%o4, [%o0+8]
    174 	ldx	[%o1], %o3
    175 	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
    176 	stx	%o3, [%o0]
    177 .dbmedl31:
    178 	addcc	%o2, 16, %o2		! adjust remaining count
    179 	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
    180 	nop				!
    181 	ldx	[%o1-8], %o4		! load and store 16 bytes
    182 	sub	%o1, 16, %o1		! decrease src ptr by 16
    183 	stx	%o4, [%o0-8]		!
    184 	sub	%o2, 16, %o2		! decrease count by 16
    185 	ldx	[%o1], %o3		!
    186 	sub	%o0, 16, %o0		! decrease dst ptr by 16
    187 	stx	%o3, [%o0]
    188 .dbmedl15:
    189 	addcc	%o2, 15, %o2		! restore count
    190 	bz,pt	%ncc, .dbexit		! exit if finished
    191 	nop
    192 	cmp	%o2, 8
    193 	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
    194 	nop
    195 	ldx	[%o1-8], %o4		! load 8 bytes
    196 	sub	%o1, 8, %o1		! decrease src ptr by 8
    197 	stx	%o4, [%o0-8]		! and store 8 bytes
    198 	subcc	%o2, 8, %o2		! decrease count by 8
    199 	bnz	%ncc, .dbremain		! bytes still left: finish in .dbremain
    200 	sub	%o0, 8, %o0		! decrease dst ptr by 8
    201 	retl
    202 	mov	%g1, %o0
    203 
    204 !
    205 ! Following code is for overlapping copies where src and dest
    206 ! are not long word aligned
    207 !
    208 	.align	16
    209 .dbbck:
    210 	rd	%fprs, %o3		! o3 = fprs
    211 
    212 	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
    213 	! So set it anyway, without checking.
    214 	wr	%g0, 0x4, %fprs		! fprs.fef = 1
    215 
        	! %o5 walks the source in aligned 8-byte blocks; %d0 always holds
        	! the most recently loaded (highest-address) block, and
        	! faligndata combines it with the next lower block.
    216 	alignaddr %o1, %g0, %o5		! align src
    217 	ldd	[%o5], %d0		! get first 8 byte block
    218 	andn	%o2, 7, %o4		! prepare src ptr for finishup code
    219 	cmp	%o2, 32
    220 	blt,pn	%ncc, .dbmv8
    221 	sub	%o1, %o4, %o1		!
    222 	cmp	%o2, 4095		! check for short memmoves
        	! The delay slot of this branch is the ldd at .dbmv64; it supplies
        	! the %d2 load that entering at .dbmv32enter would otherwise skip.
    223 	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
    224 .dbmv64:
    225 	ldd	[%o5-8], %d2		! load 8 bytes
    226 	ldd	[%o5-16], %d4		! load 8 bytes
    227 	sub	%o5, 64, %o5		!
    228 	ldd	[%o5+40], %d6		! load 8 bytes
    229 	sub	%o0, 64, %o0		!
    230 	ldd	[%o5+32], %d8		! load 8 bytes
    231 	sub	%o2, 64, %o2		! 64 less bytes to copy
    232 	ldd	[%o5+24], %d18		! load 8 bytes
    233 	cmp	%o2, 64			! do we have < 64 bytes remaining
    234 	ldd	[%o5+16], %d28		! load 8 bytes
    235 	ldd	[%o5+8], %d30		! load 8 bytes
    236 	faligndata %d2, %d0, %d10	! extract 8 bytes out
    237 	prefetch [%o5 - (5 * BLOCK_SIZE)], #one_read
    238 	ldd	[%o5], %d0		! load 8 bytes
    239 	std	%d10, [%o0+56]		! store the current 8 bytes
    240 	faligndata %d4, %d2, %d12	! extract 8 bytes out
    241 	prefetch [%o0 - (5 * BLOCK_SIZE)], #one_write
    242 	std	%d12, [%o0+48]		! store the current 8 bytes
    243 	faligndata %d6, %d4, %d14	! extract 8 bytes out
    244 	std	%d14, [%o0+40]		! store the current 8 bytes
    245 	faligndata %d8, %d6, %d16	! extract 8 bytes out
    246 	std	%d16, [%o0+32]		! store the current 8 bytes
    247 	faligndata %d18, %d8, %d20	! extract 8 bytes out
    248 	std	%d20, [%o0+24]		! store the current 8 bytes
    249 	faligndata %d28, %d18, %d22	! extract 8 bytes out
    250 	std	%d22, [%o0+16]		! store the current 8 bytes
    251 	faligndata %d30, %d28, %d24	! extract 8 bytes out
    252 	std	%d24, [%o0+8]		! store the current 8 bytes
    253 	faligndata %d0, %d30, %d26	! extract 8 bytes out
    254 	bgeu,pt	%ncc, .dbmv64
    255 	std	%d26, [%o0]		! store the current 8 bytes
    256 
    257 	cmp	%o2, 32
    258 	blt,pn	%ncc, .dbmvx
    259 	nop
    260 .dbmv32:
    261 	ldd	[%o5-8], %d2		! load 8 bytes
    262 .dbmv32enter:
    263 	ldd	[%o5-16], %d4		! load 8 bytes
    264 	sub	%o5, 32, %o5		!
    265 	ldd	[%o5+8], %d6		! load 8 bytes
    266 	sub	%o0, 32, %o0		!
    267 	faligndata %d2, %d0, %d10	! extract 8 bytes out
    268 	ldd	[%o5], %d0		! load 8 bytes
    269 	sub	%o2,32, %o2		! 32 less bytes to copy
    270 	std	%d10, [%o0+24]		! store the current 8 bytes
    271 	cmp	%o2, 32			! do we have < 32 bytes remaining
    272 	faligndata %d4, %d2, %d12	! extract 8 bytes out
    273 	std	%d12, [%o0+16]		! store the current 8 bytes
    274 	faligndata %d6, %d4, %d14	! extract 8 bytes out
    275 	std	%d14, [%o0+8]		! store the current 8 bytes
    276 	faligndata %d0, %d6, %d16	! extract 8 bytes out
    277 	bgeu,pt	%ncc, .dbmv32
    278 	std	%d16, [%o0]		! store the current 8 bytes
    279 .dbmvx:
    280 	cmp	%o2, 8			! do we have < 8 bytes remaining
    281 	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
    282 	nop
    283 .dbmv8:
    284 	ldd	[%o5-8], %d2
    285 	sub	%o0, 8, %o0		! since we are at the end
    286 					! when we first enter the loop
    287 	sub	%o2, 8, %o2		! 8 less bytes to copy
    288 	sub	%o5, 8, %o5
    289 	cmp	%o2, 8			! do we have < 8 bytes remaining
    290 	faligndata %d2, %d0, %d8	! extract 8 bytes out
    291 	std	%d8, [%o0]		! store the current 8 bytes
    292 	bgeu,pt	%ncc, .dbmv8
    293 	fmovd	%d2, %d0		! newest block becomes the "previous" one
    294 .dbmvfinish:
        	! Keep only the FEF bit of the %fprs value saved in %o3 and write
        	! it back (in the delay slot below) before leaving the FP path.
    295 	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
    296 	tst	%o2
    297 	bz,pt	%ncc, .dbexit
    298 	wr	%o3, %g0, %fprs		! fprs = o3   restore fprs
    299 
        	! Fewer than 8 bytes remain; %o1 is the byte-granular source
        	! pointer (on the FP path it was pre-positioned at .dbbck).
    300 .dbremain:
    301 	cmp	%o2, 4
    302 	blt,pn	%ncc, .dbbyte
    303 	nop
    304 	ldub	[%o1-1], %o3		! load last byte
    305 	stb	%o3, [%o0-1]		! store last byte
    306 	sub	%o1, 4, %o1
    307 	ldub	[%o1+2], %o3		! load 2nd from last byte
    308 	stb	%o3, [%o0-2]		! store 2nd from last byte
    309 	sub	%o0, 4, %o0
    310 	ldub	[%o1+1], %o3		! load 3rd from last byte
    311 	stb	%o3, [%o0+1]		! store 3rd from last byte
    312 	subcc	%o2, 4, %o2
    313 	ldub	[%o1], %o3		! load 4th from last byte
    314 	stb	%o3, [%o0]		! store 4th from last byte
        	! Delay slot of this branch is the "dec %o1" at .dbbyte; harmless
        	! when taken because %o1 is not used after .dbexit.
    315 	bz,pt	%ncc, .dbexit
    316 .dbbyte:
    317 	dec	%o1			! decrement src address
    318 	ldub	[%o1], %o3		! read a byte
    319 	dec	%o0			! decrement dst address
    320 	deccc	%o2			! decrement count
    321 	bgu,pt	%ncc, .dbbyte		! loop until done
    322 	stb	%o3, [%o0]		! write byte
    323 .dbexit:
    324 	retl
    325 	mov	%g1, %o0
    326 	SET_SIZE(memmove)
    327 
    328 
    329 	.align ICACHE_LINE_SIZE
    330 	ENTRY(memcpy)
    331 					! adjust instruction alignment
    332 	nop				! Do not remove, these nops affect
    333 	nop				! icache alignment and performance
    334 .forcpy:
    335 	prefetch [%o1], N_READS_STRONG
    336 	prefetch [%o0], N_WRITES_STRONG
    337 	cmp	%o2, SMALL_MAX		! check for not small case
    338 	bgu,pn	%ncc, .medium		! go to larger cases
    339 	mov	%o0, %g1		! save %o0
    340 	cmp	%o2, SHORTCOPY		! check for really short case
    341 	ble,pt	%ncc, .smallleft	!
    342 	or	%o0, %o1, %o3		! prepare alignment check
    343 	andcc	%o3, 0x3, %g0		! test for alignment
    344 	bz,pt	%ncc, .smallword	! branch to word aligned case
    345 	sub	%o2, 3, %o2		! adjust count to allow cc zero test
    346 .smallnotalign4:
    347 	ldub	[%o1], %o3		! read byte
    348 	subcc	%o2, 4, %o2		! reduce count by 4
    349 	stb	%o3, [%o0]		! write byte
    350 	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
    351 	add	%o1, 4, %o1		! advance SRC by 4
    352 	stb	%o3, [%o0+1]
    353 	ldub	[%o1-2], %o3
    354 	add	%o0, 4, %o0		! advance DST by 4
    355 	stb	%o3, [%o0-2]
    356 	ldub	[%o1-1], %o3
    357 	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
    358 	stb	%o3, [%o0-1]
    359 	add	%o2, 3, %o2		! restore count
    360 .smallleft:
    361 	tst	%o2
    362 	bz,pt	%ncc, .smallexit
    363 	nop
    364 .smallleft3:				! 1, 2, or 3 bytes remain
    365 	ldub	[%o1], %o3		! load one byte
    366 	deccc	%o2			! reduce count for cc test
    367 	bz,pt	%ncc, .smallexit
    368 	stb	%o3, [%o0]		! store one byte
    369 	ldub	[%o1+1], %o3		! load second byte
    370 	deccc	%o2
    371 	bz,pt	%ncc, .smallexit
    372 	stb	%o3, [%o0+1]		! store second byte
    373 	ldub	[%o1+2], %o3		! load third byte
    374 	stb	%o3, [%o0+2]		! store third byte
    375 	retl
    376 	mov	%g1, %o0		! restore %o0
    377 
    378 	.align	16
    379 	nop				! affects loop icache alignment
    380 .smallwords:
    381 	lduw	[%o1], %o3		! read word
    382 .smallwordx:
    383 	subcc	%o2, 8, %o2		! update count
    384 	stw	%o3, [%o0]		! write word
    385 	add	%o1, 8, %o1		! update SRC
    386 	lduw	[%o1-4], %o3		! read word
    387 	add	%o0, 8, %o0		! update DST
    388 	bgu,pt	%ncc, .smallwords	! loop until done
    389 	stw	%o3, [%o0-4]		! write word
    390 	addcc	%o2, 7, %o2		! restore count
    391 	bz,pt	%ncc, .smallexit	! check for completion
    392 	nop
    393 	cmp	%o2, 4			! check for 4 or more bytes left
    394 	blt	.smallleft3		! if not, go to finish up
    395 	nop
    396 	lduw	[%o1], %o3
    397 	add	%o1, 4, %o1
    398 	subcc	%o2, 4, %o2
    399 	stw	%o3, [%o0]
    400 	add	%o0, 4, %o0
    401 	bnz,pt	%ncc, .smallleft3
    402 	nop
    403 	retl
    404 	mov	%g1, %o0		! restore %o0
    405 
    406 .smallword:
    407 	subcc	%o2, 4, %o2		! update count
    408 	bgu,pt	%ncc, .smallwordx
    409 	lduw	[%o1], %o3		! read word
    410 	addcc	%o2, 3, %o2		! restore count
    411 	bz,pt	%ncc, .smallexit
    412 	stw	%o3, [%o0]		! write word
    413 	deccc	%o2			! reduce count for cc test
    414 	ldub	[%o1+4], %o3		! load one byte
    415 	bz,pt	%ncc, .smallexit
    416 	stb	%o3, [%o0+4]		! store one byte
    417 	ldub	[%o1+5], %o3		! load second byte
    418 	deccc	%o2
    419 	bz,pt	%ncc, .smallexit
    420 	stb	%o3, [%o0+5]		! store second byte
    421 	ldub	[%o1+6], %o3		! load third byte
    422 	stb	%o3, [%o0+6]		! store third byte
    423 .smallexit:
    424 	retl
    425 	mov	%g1, %o0		! restore %o0
    426 	.align 16
    427 .medium:
    428 	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
    429 	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
    430 	neg	%o0, %o5
    431 	neg	%o1, %o3
    432 	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
    433 	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned
    434 
    435 	bz	%ncc, 2f
    436 	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
    437 				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned
    438 
    439 	sub	%o2, %o5, %o2	! update count
    440 
    441 1:
    442 	ldub	[%o1], %o4
    443 	deccc	%o5
    444 	inc	%o1
    445 	stb	%o4, [%o0]
    446 	bgu,pt	%ncc, 1b
    447 	inc	%o0
    448 
    449 	! Now DST is 8-byte aligned.  o0, o1, o2 are current.
    450 
    451 2:
    452 	andcc	%o1, 0x3, %g0		! test alignment
    453 	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
    454 	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
    455 					! if src, dst not aligned
    456 	prefetch [%o0 + (1 * BLOCK_SIZE)], #one_write
    457 
    458 /*
    459  * Handle all cases where src and dest are aligned on word
    460  * or long word boundaries.  Use unrolled loops for better
    461  * performance.  This option wins over standard large data
    462  * move when source and destination is in cache for medium
    463  * to short data moves.
    464  */
    465 	andcc	%o1, 0x7, %g0		! test word alignment
    466 	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
    467 	bz,pt	%ncc, .medlword		! branch to long word aligned case
    468 	prefetch [%o0 + (2 * BLOCK_SIZE)], #one_write
    469 	cmp	%o2, ALIGNED4_FPCOPY_THRESHOLD	! limit to store buffer size
    470 	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
    471 	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
    472 	subcc	%o2, 15, %o2		! adjust length to allow cc test
    473 	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
    474 					! for end of loop
    475 	ble,pt	%ncc, .medw15		! skip big loop if less than 16
    476 	  .empty
    477 .medw16:
    478 	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
    479 	ld	[%o1], %o4		! load
    480 	subcc	%o2, 16, %o2		! decrement length count
    481 	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
    482 	stw	%o4, [%o0]		! and store
    483 	ld	[%o1+4], %o3		! a block of 16 bytes
    484 	add	%o1, 16, %o1		! increase src ptr by 16
    485 	stw	%o3, [%o0+4]
    486 	ld	[%o1-8], %o4
    487 	add	%o0, 16, %o0		! increase dst ptr by 16
    488 	stw	%o4, [%o0-8]
    489 	ld	[%o1-4], %o3
    490 	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
    491 	stw	%o3, [%o0-4]
    492 .medw15:
    493 	addcc	%o2, 15, %o2		! restore count
    494 	bz,pt	%ncc, .medwexit		! exit if finished
    495 	nop
    496 	cmp	%o2, 8
    497 	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
    498 	nop				!
    499 	ld	[%o1], %o4		! load 4 bytes
    500 	subcc	%o2, 8, %o2		! decrease count by 8
    501 	stw	%o4, [%o0]		! and store 4 bytes
    502 	add	%o1, 8, %o1		! increase src ptr by 8
    503 	ld	[%o1-4], %o3		! load 4 bytes
    504 	add	%o0, 8, %o0		! increase dst ptr by 8
    505 	stw	%o3, [%o0-4]		! and store 4 bytes
    506 	bz	%ncc, .medwexit		! exit if finished
    507 	nop
    508 .medw7:					! count is ge 1, less than 8
    509 	cmp	%o2, 3			! check for 4 bytes left
    510 	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
    511 	nop				!
    512 	ld	[%o1], %o4		! load 4 bytes
    513 	sub	%o2, 4, %o2		! decrease count by 4
    514 	add	%o1, 4, %o1		! increase src ptr by 4
    515 	stw	%o4, [%o0]		! and store 4 bytes
    516 	add	%o0, 4, %o0		! increase dst ptr by 4
    517 	tst	%o2			! check for zero bytes left
    518 	bz	%ncc, .medwexit		! exit if finished
    519 	nop
    520 .medw3:					! count is known to be 1, 2, or 3
    521 	deccc	%o2			! reduce count by one
    522 	ldub	[%o1], %o3		! load one byte
    523 	bz,pt	%ncc, .medwexit		! exit if last byte
    524 	stb	%o3, [%o0]		! store one byte
    525 	ldub	[%o1+1], %o3		! load second byte
    526 	deccc	%o2			! reduce count by one
    527 	bz,pt	%ncc, .medwexit		! exit if last byte
    528 	stb	%o3, [%o0+1]		! store second byte
    529 	ldub	[%o1+2], %o3		! load third byte
    530 	stb	%o3, [%o0+2]		! store third byte
    531 .medwexit:
    532 	retl
    533 	mov	%g1, %o0		! restore %o0
    534 
    535 /*
    536  * Special case for handling when src and dest are both long word aligned
    537  * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD
    538  * bytes.
    539  */
    540 
    541 	.align 16
    542 	nop
    543 .medlword:				! long word aligned
    544 					! is length > ALIGNED8_FPCOPY_THRESHOLD?
        	!
        	! Forward copy with SRC (%o1) and DST (%o0) both 8-byte aligned and
        	! %o2 bytes to move.  Counts above ALIGNED8_FPCOPY_THRESHOLD leave
        	! for .mediumrejoin; everything else is moved here with ldx/stx,
        	! 32 bytes per iteration, then 16, then 8, with the final 1-7 bytes
        	! handled by .medw7.  Returns through .medwexit.
        	!
        	! SRC lines are prefetched with #one_read and DST lines with
        	! #one_write, as in the other forward loops of this file.
        	!
    545 	cmp	%o2, ALIGNED8_FPCOPY_THRESHOLD
    546 	bgu,pt	%ncc, .mediumrejoin	! yes: rejoin main loop
    547 	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
    548 	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
    549 	subcc	%o2, 31, %o2		! adjust length to allow cc test
    550 					! for end of loop
        	! .empty: the delay slot is deliberately the first instruction
        	! of .medl32 (a prefetch, harmless on either path).
    551 	ble,pt	%ncc, .medl31		! skip big loop if less than 32
    552 	  .empty
    553 .medl32:
    554 	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
    555 	ldx	[%o1], %o4		! load
    556 	subcc	%o2, 32, %o2		! decrement length count
    557 	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
    558 	stx	%o4, [%o0]		! and store
    559 	ldx	[%o1+8], %o3		! a block of 32 bytes
    560 	add	%o1, 32, %o1		! increase src ptr by 32
    561 	stx	%o3, [%o0+8]
    562 	ldx	[%o1-16], %o4
    563 	add	%o0, 32, %o0		! increase dst ptr by 32
    564 	stx	%o4, [%o0-16]
    565 	ldx	[%o1-8], %o3
    566 	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
    567 	stx	%o3, [%o0-8]
    568 .medl31:
    569 	addcc	%o2, 16, %o2		! adjust remaining count
    570 	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
    571 	nop				!
    572 	ldx	[%o1], %o4		! load and store 16 bytes
    573 	add	%o1, 16, %o1		! increase src ptr by 16
    574 	stx	%o4, [%o0]		!
    575 	sub	%o2, 16, %o2		! decrease count by 16
    576 	ldx	[%o1-8], %o3		!
    577 	add	%o0, 16, %o0		! increase dst ptr by 16
    578 	stx	%o3, [%o0-8]
    579 .medl15:
    580 	addcc	%o2, 15, %o2		! restore count
    581 	bz,pt	%ncc, .medwexit		! exit if finished
    582 	nop
    583 	cmp	%o2, 8
    584 	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
    585 	nop
    586 	ldx	[%o1], %o4		! load 8 bytes
    587 	add	%o1, 8, %o1		! increase src ptr by 8
    588 	stx	%o4, [%o0]		! and store 8 bytes
    589 	subcc	%o2, 8, %o2		! decrease count by 8
    590 	bz	%ncc, .medwexit		! exit if finished
    591 	add	%o0, 8, %o0		! increase dst ptr by 8
    592 	ba	.medw7			! 1-7 bytes left: finish in .medw7
    593 	nop
    594 
    595 	.align 16
    596 	nop
    597 	nop
    598 	nop
    599 .mediumsetup:
    600 	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
    601 	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
    602 .mediumrejoin:
    603 	rd	%fprs, %o4		! check for unused FPU
    604 
    605 	add	%o1, 8, %o1		! prepare to round SRC upward
    606 
    607 	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
    608 	or	%o5, 0x67f, %o5
    609 
    610 	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
    611 	bz,a	%ncc, 3f
    612 	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
    613 3:
    614 	cmp	%o2, MEDIUM_MAX
    615 	bmask	%o5, %g0, %g0
    616 
    617 	! Compute o5 (number of bytes that need copying using the main loop).
    618 	! First, compute for the medium case.
    619 	! Then, if large case, o5 is replaced by count for block alignment.
    620 	! Be careful not to read past end of SRC
    621 	! Currently, o2 is the actual count remaining
    622 	!            o3 is how much sooner we'll cross the alignment boundary
    623 	!                in SRC compared to in DST
    624 	!
    625 	! Examples:  Let # denote bytes that should not be accessed
    626 	!            Let x denote a byte already copied to align DST
    627 	!            Let . and - denote bytes not yet copied
    628 	!            Let | denote double alignment boundaries
    629 	!
    630 	!            DST:  ######xx|........|--------|..######   o2 = 18
    631 	!                          o0
    632 	!
    633 	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
    634 	!                          o1
    635 	!
    636 	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
    637 	!                                   o1
    638 	!
    639 	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
    640 	!                                   o1
    641 
    642 	or	%g0, -8, %o5
    643 	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1
    644 
    645 	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
    646 	add	%o5, %o2, %o5
    647 	add	%o5, %o3, %o5
    648 
    649 	bleu	%ncc, 4f
    650 	andn	%o5, 7, %o5		! 8 byte aligned count
    651 	neg	%o0, %o5		! 'large' case
    652 	and	%o5, BLOCK_SIZE-1, %o5  ! bytes till DST block aligned
    653 4:
    654 	brgez,a	%o3, .beginmedloop
    655 	ldd	[%o1-8], %d0
    656 
    657 	add	%o1, %o3, %o1		! back up o1
    658 5:
    659 	ldda	[%o1]ASI_FL8_P, %d2
    660 	inc	%o1
    661 	andcc	%o1, 7, %g0
    662 	bnz	%ncc, 5b
    663 	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
    664 
    665 .beginmedloop:
    666 	tst	%o5
    667 	bz	%ncc, .endmedloop
    668 	sub	%o2, %o5, %o2		! update count for later
    669 
    670 	! Main loop to write out doubles.  Note: o5 & 7 == 0
    671 
    672 	ldd	[%o1], %d2
    673 	subcc	%o5, 8, %o5		! update local count
    674 	bz,pn	%ncc, 1f
    675 	add	%o1, 8, %o1		! update SRC
    676 
    677 .medloop:
    678 	faligndata %d0, %d2, %d4
    679 	ldd	[%o1], %d0
    680 	subcc	%o5, 8, %o5		! update local count
    681 	add	%o1, 16, %o1		! update SRC
    682 	std	%d4, [%o0]
    683 	bz,pn	%ncc, 2f
    684 	faligndata %d2, %d0, %d6
    685 	ldd	[%o1 - 8], %d2
    686 	subcc	%o5, 8, %o5		! update local count
    687 	std	%d6, [%o0 + 8]
    688 	bnz,pt	%ncc, .medloop
    689 	add	%o0, 16, %o0		! update DST
    690 
    691 1:
    692 	faligndata %d0, %d2, %d4
    693 	fmovd	%d2, %d0
    694 	std	%d4, [%o0]
    695 	ba	.endmedloop
    696 	add	%o0, 8, %o0
    697 
    698 2:
    699 	std	%d6, [%o0 + 8]
    700 	sub	%o1, 8, %o1
    701 	add	%o0, 16, %o0
    702 
    703 
    704 .endmedloop:
    705 	! Currently, o1 is pointing to the next double-aligned byte in SRC
    706 	! The 8 bytes starting at [o1-8] are available in d0
    707 	! At least one, and possibly all, of these need to be written.
    708 
    709 	cmp	%o2, BLOCK_SIZE
    710 	bgu	%ncc, .large		! otherwise, less than 16 bytes left
    711 
    712 #if 0
    713 
    714 	/* This code will use partial stores.  */
    715 
    716 	mov	%g0, %o5
    717 	and	%o3, 7, %o3		! Number of bytes needed to completely
    718 					! fill %d0 with good (unwritten) data.
    719 
    720 	subcc	%o2, 8, %o2		! update count (maybe too much)
    721 	movl	%ncc, %o2, %o5
    722 	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
    723 	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)
    724 
    725 	bz	%ncc, 2f
    726 	alignaddr %o3, %g0, %g0		! set GSR.ALIGN
    727 
    728 1:
    729 	deccc	%o5
    730 	ldda	[%o1]ASI_FL8_P, %d2
    731 	inc	%o1
    732 	bgu	%ncc, 1b
    733 	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
    734 
    735 2:
    736 	not	%o3
    737 	faligndata %d0, %d0, %d0	! shift bytes to the left
    738 	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
    739 	edge8n	%g0, %o3, %o5
    740 	stda	%d0, [%o0]%o5, ASI_PST8_P
    741 	brlez	%o2, .mediumexit
    742 	add	%o0, %o3, %o0		! update DST to last stored byte
    743 3:
    744 	inc	%o0
    745 	deccc	%o2
    746 	ldub	[%o1], %o3
    747 	stb	%o3, [%o0]
    748 	bgu	%ncc, 3b
    749 	inc	%o1
    750 
    751 #else
    752 
    753 	andcc	%o3, 7, %o5		! Number of bytes needed to completely
    754 					! fill %d0 with good (unwritten) data.
    755 	bz	%ncc, 2f
    756 	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
    757 	cmp	%o2, 8
    758 	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
    759 	add	%o1, %o3, %o1 		! Back up %o1
    760 
    761 1:
    762 	deccc	%o5
    763 	ldda	[%o1]ASI_FL8_P, %d2
    764 	inc	%o1
    765 	bgu	%ncc, 1b
    766 	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
    767 
    768 2:
    769 	subcc	%o2, 8, %o2
    770 	std	%d0, [%o0]
    771 	bz	%ncc, .mediumexit
    772 	add	%o0, 8, %o0
    773 3:
    774 	ldub	[%o1], %o3
    775 	deccc	%o2
    776 	inc	%o1
    777 	stb	%o3, [%o0]
    778 	bgu	%ncc, 3b
    779 	inc	%o0
    780 #endif
    781 
    782 .mediumexit:
    783         wr	%o4, %g0, %fprs		! fprs = o4   restore fprs
    784 	retl
    785         mov	%g1, %o0
    786 
    787 
    788 	.align ICACHE_LINE_SIZE
    789 .large:
    790 
    791 	! %o0 I/O DST is 64-byte aligned
    792 	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
    793 	! %d0 I/O already loaded with SRC data from [%o1-8]
    794 	! %o2 I/O count (number of bytes that need to be written)
    795 	! %o3 I   Not written.  If zero, then SRC is double aligned.
    796 	! %o4 I   Not written.  Holds fprs.
    797 	! %o5   O The number of doubles that remain to be written.
    798 
    799 	! Load the rest of the current block
    800 	! Recall that %o1 is further into SRC than %o0 is into DST
    801 
    802 	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
    803 	prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read
    804 
    805 	set	BST_THRESHOLD, %o5
    806 	cmp	%o2, %o5
    807 	bgu,pn	%icc, .xlarge
    808 	prefetch [%o1 + (12 * BLOCK_SIZE)], #one_read
    809 
    810 	ldd	[%o1], %f2
    811 	ldd	[%o1 + 0x8], %f4
    812 	faligndata %f0, %f2, %f32
    813 	ldd	[%o1 + 0x10], %f6
    814 	faligndata %f2, %f4, %f34
    815 	ldd	[%o1 + 0x18], %f8
    816 	faligndata %f4, %f6, %f36
    817 	ldd	[%o1 + 0x20], %f10
    818 	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
    819 	faligndata %f6, %f8, %f38
    820 	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
    821 	ldd	[%o1 + 0x28], %f12
    822 	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
    823 	faligndata %f8, %f10, %f40
    824 	ldd	[%o1 + 0x30], %f14
    825 	faligndata %f10, %f12, %f42
    826 	ldd	[%o1 + 0x38], %f0
    827 	sub	%o2, BLOCK_SIZE, %o2	! update count
    828 	add	%o1, BLOCK_SIZE, %o1	! update SRC
    829 
    830 	! Main loop.  Write previous block.  Load rest of current block.
    831 	! Some bytes will be loaded that won't yet be written.
    832 1:
    833 	ldd	[%o1], %f2
    834 	faligndata %f12, %f14, %f44
    835 	ldd	[%o1 + 0x8], %f4
    836 	faligndata %f14, %f0, %f46
    837 	std	%f32, [%o0]
    838 	std	%f34, [%o0+8]
    839 	std	%f36, [%o0+16]
    840 	std	%f38, [%o0+24]
    841 	std	%f40, [%o0+32]
    842 	std	%f42, [%o0+40]
    843 	std	%f44, [%o0+48]
    844 	std	%f46, [%o0+56]
    845 	sub	%o2, BLOCK_SIZE, %o2		! update count
    846 	prefetch [%o1 + (24 * BLOCK_SIZE) + BLOCK_SIZE], #one_read
    847 	add	%o0, BLOCK_SIZE, %o0		! update DST
    848 	ldd	[%o1 + 0x10], %f6
    849 	faligndata %f0, %f2, %f32
    850 	ldd	[%o1 + 0x18], %f8
    851 	faligndata %f2, %f4, %f34
    852 	ldd	[%o1 + 0x20], %f10
    853 	faligndata %f4, %f6, %f36
    854 	ldd	[%o1 + 0x28], %f12
    855 	faligndata %f6, %f8, %f38
    856 	ldd	[%o1 + 0x30], %f14
    857 	faligndata %f8, %f10, %f40
    858 	ldd	[%o1 + 0x38], %f0
    859 	faligndata %f10, %f12, %f42
    860 	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
    861 	cmp	%o2, BLOCK_SIZE + 8
    862 	prefetch [%o0 + (18 * BLOCK_SIZE)], #one_write
    863 	bgu,pt	%ncc, 1b
    864 	add	%o1, BLOCK_SIZE, %o1	! update SRC
    865 	faligndata %f12, %f14, %f44
    866 	faligndata %f14, %f0, %f46
    867 	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
    868 	cmp	%o2, BLOCK_SIZE
    869 	bne	%ncc, 2f		! exactly 1 block remaining?
    870 	add	%o0, BLOCK_SIZE, %o0	! update DST
    871 	brz,a	%o3, 3f			! is SRC double aligned?
    872 	ldd	[%o1], %f2
    873 
    874 2:
    875 	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
    876 	add	%o5, %o3, %o5
    877 
    878 	membar	#StoreLoad|#StoreStore
    879 
    880 	ba	.beginmedloop
    881 	andn	%o5, 7, %o5		! 8 byte aligned count
    882 
    883 
    884 	! This is when there is exactly 1 block remaining and SRC is aligned
    885 3:
    886 	ldd	[%o1 + 0x8], %f4
    887 	ldd	[%o1 + 0x10], %f6
    888 	fsrc1	%f0, %f32
    889 	ldd	[%o1 + 0x18], %f8
    890 	fsrc1	%f2, %f34
    891 	ldd	[%o1 + 0x20], %f10
    892 	fsrc1	%f4, %f36
    893 	ldd	[%o1 + 0x28], %f12
    894 	fsrc1	%f6, %f38
    895 	ldd	[%o1 + 0x30], %f14
    896 	fsrc1	%f8, %f40
    897 	fsrc1	%f10, %f42
    898 	fsrc1	%f12, %f44
    899 	fsrc1	%f14, %f46
    900 	stda	%f32, [%o0]ASI_BLK_P
    901 	membar	#StoreLoad|#StoreStore
    902 	wr	%o4, 0, %fprs
    903 	retl
    904 	mov	%g1, %o0
    905 
    906 
    907 	.align 16
    908 	! the two nops here cause the loop starting at 1f below to be
    909 	! on a cache line boundary, improving performance
    910 	nop
    911 	nop
    912 .xlarge:
    913 	! %o0 I/O DST is 64-byte aligned
    914 	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
    915 	! %d0 I/O already loaded with SRC data from [%o1-8]
    916 	! %o2 I/O count (number of bytes that need to be written)
    917 	! %o3 I   Not written.  If zero, then SRC is double aligned.
    918 	! %o4 I   Not written.  Holds fprs.
    919 	! %o5   O The 8-byte-aligned byte count (whole doubles) that remains to be written.
    920 
    921 	! Load the rest of the current block
    922 	! Recall that %o1 is further into SRC than %o0 is into DST
    923 
    924 	ldd	[%o1], %f2
    925 	ldd	[%o1 + 0x8], %f4
    926 	faligndata %f0, %f2, %f32
    927 	ldd	[%o1 + 0x10], %f6
    928 	faligndata %f2, %f4, %f34
    929 	ldd	[%o1 + 0x18], %f8
    930 	faligndata %f4, %f6, %f36
    931 	ldd	[%o1 + 0x20], %f10
    932 	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
    933 	faligndata %f6, %f8, %f38
    934 	ldd	[%o1 + 0x28], %f12
    935 	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
    936 	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
    937 	faligndata %f8, %f10, %f40
    938 	ldd	[%o1 + 0x30], %f14
    939 	faligndata %f10, %f12, %f42
    940 	ldd	[%o1 + 0x38], %f0
    941 	prefetch [%o1 + (17 * BLOCK_SIZE)], #one_read
    942 	sub	%o2, BLOCK_SIZE, %o2	! update count
    943 	add	%o1, BLOCK_SIZE, %o1	! update SRC
    944 
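    945 	! This point is intended to be 32-byte aligned; the comment here originally
    946 	! said 24 instructions appear since the previous alignment directive.
    947 	! NOTE(review): only 22 (2 nops + 20) are present now -- confirm alignment.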
    948 
    949 	! Main loop.  Write previous block.  Load rest of current block.
    950 	! Some bytes will be loaded that won't yet be written.
    951 1:
    952 	ldd	[%o1], %f2
    953 	faligndata %f12, %f14, %f44
    954 	ldd	[%o1 + 0x8], %f4
    955 	faligndata %f14, %f0, %f46
    956 	stda	%f32, [%o0]ASI_BLK_P
    957 	sub	%o2, BLOCK_SIZE, %o2		! update count
    958 	ldd	[%o1 + 0x10], %f6
    959 	faligndata %f0, %f2, %f32
    960 	ldd	[%o1 + 0x18], %f8
    961 	faligndata %f2, %f4, %f34
    962 	ldd	[%o1 + 0x20], %f10
    963 	faligndata %f4, %f6, %f36
    964 	ldd	[%o1 + 0x28], %f12
    965 	faligndata %f6, %f8, %f38
    966 	ldd	[%o1 + 0x30], %f14
    967 	prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
    968 	faligndata %f8, %f10, %f40
    969 	ldd	[%o1 + 0x38], %f0
    970 	faligndata %f10, %f12, %f42
    971 	prefetch [%o1 + (25 * BLOCK_SIZE)], #one_read
    972 	add	%o0, BLOCK_SIZE, %o0		! update DST
    973 	cmp	%o2, BLOCK_SIZE + 8
    974 	! second prefetch is important to correct for occasional dropped prefetches
    975 	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
    976 	bgu,pt	%ncc, 1b
    977 	add	%o1, BLOCK_SIZE, %o1		! update SRC
    978 
    979 	faligndata %f12, %f14, %f44
    980 	faligndata %f14, %f0, %f46
    981 	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
    982 	cmp	%o2, BLOCK_SIZE
    983 	bne	%ncc, 2f		! exactly 1 block remaining?
    984 	add	%o0, BLOCK_SIZE, %o0	! update DST
    985 	brz,a	%o3, 3f			! is SRC double aligned?
    986 	ldd	[%o1], %f2
    987 
    988 2:
    989 	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
    990 	add	%o5, %o3, %o5
    991 
    992 	membar	#StoreLoad|#StoreStore
    993 
    994 	ba	.beginmedloop
    995 	andn	%o5, 7, %o5		! 8 byte aligned count
    996 
    997 
    998 	! This is when there is exactly 1 block remaining and SRC is aligned
    999 3:
   1000 	ldd	[%o1 + 0x8], %f4
   1001 	ldd	[%o1 + 0x10], %f6
   1002 	fsrc1	%f0, %f32
   1003 	ldd	[%o1 + 0x18], %f8
   1004 	fsrc1	%f2, %f34
   1005 	ldd	[%o1 + 0x20], %f10
   1006 	fsrc1	%f4, %f36
   1007 	ldd	[%o1 + 0x28], %f12
   1008 	fsrc1	%f6, %f38
   1009 	ldd	[%o1 + 0x30], %f14
   1010 	fsrc1	%f8, %f40
   1011 	fsrc1	%f10, %f42
   1012 	fsrc1	%f12, %f44
   1013 	fsrc1	%f14, %f46
   1014 	stda	%f32, [%o0]ASI_BLK_P
   1015 	membar	#StoreLoad|#StoreStore
   1016 	wr	%o4, 0, %fprs
   1017 	retl
   1018 	mov	%g1, %o0
   1019 
   1020 	SET_SIZE(memcpy)
   1021