/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 * Register usage (memmove and memcpy, both leaf routines):
 *	%o0	in:  destination address
 *		out: the original destination address (kept in %g1)
 *	%o1	in:  source address
 *	%o2	in:  number of bytes to copy
 *	%o3, %o4, %o5, %g1	scratch
 *
 * Side effects of the floating point paths (.dbbck in memmove; .mediumrejoin,
 * .large and .xlarge in memcpy):
 *	- FPRS.FEF is forced on while copying.  On exit %fprs is rewritten
 *	  with only the FEF bit it had on entry (DU and DL are written as 0).
 *	- %gsr is modified (alignaddr, bmask) and is not restored.
 *	- Floating point registers are overwritten and are not restored.
 *
 * memmove() copies backwards only when the destination lies inside the
 * source range (src < dst < src + len); every other case branches to the
 * forward copy at .forcpy, which is shared with memcpy().
 */

#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#include <sys/trap.h>

#define	ICACHE_LINE_SIZE	64
#define	BLOCK_SIZE	64
#define	FPRS_FEF	0x4

#define	ALIGNED8_FPCOPY_THRESHOLD	1024
#define	ALIGNED4_FPCOPY_THRESHOLD	1024
#define	BST_THRESHOLD	65536

#define	SHORTCOPY	3
#define	SMALL_MAX	64
#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */

#define	N_READS_STRONG	20
#define	N_WRITES_STRONG	22


	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	prefetch [%o1], N_READS_STRONG
	prefetch [%o0], N_WRITES_STRONG
	cmp	%o1, %o0	! if from address is >= to use forward copy
	bgeu	%ncc, .forcpy	! else use backward if ...
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu	%ncc, .forcpy	! if size is bigger, do overlapped copy
	nop

	!
	! an overlapped copy that must be done "backwards"
	!
.ovbc:
	mov	%o0, %g1		! save dest address for return val
	add	%o1, %o2, %o1		! get to end of source space
	add	%o0, %o2, %o0		! get to end of destination space

	cmp	%o2, 64
	bgeu,pn	%ncc, .dbalign
	nop
	cmp	%o2, 4
	blt,pn	%ncc, .byte
	sub	%o2, 3, %o2		! bias count so the loop can test cc
.byte4loop:
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]		! store 4th from last byte
.byte:
	addcc	%o2, 3, %o2		! remove the bias: 0..3 bytes left
	bz,pt	%ncc, .exit		! (delay slot is the dec below)
.byteloop:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .byteloop		! loop until done
	stb	%o3, [%o0]		! write byte
.exit:
	retl
	mov	%g1, %o0

	.align 16
.dbalign:
	prefetch [%o1 - (4 * BLOCK_SIZE)], #one_read
	prefetch [%o0 - (4 * BLOCK_SIZE)], #one_write
	andcc	%o0, 7, %o5		! bytes till DST 8 byte aligned
	bz,pt	%ncc, .dbmed
	sub	%o2, %o5, %o2		! update count
.dbalign1:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o5			! decrement count
	bgu,pt	%ncc, .dbalign1		! loop until done
	stb	%o3, [%o0]		! store a byte

! check for src long word alignment
.dbmed:
	andcc	%o1, 7, %g0		! chk src long word alignment
	bnz,pn	%ncc, .dbbck
	nop
!
! Following code is for overlapping copies where src and dest
! are long word aligned
!
!
! For SPARC64-VI, prefetch is effective for both integer and fp register
! operations.  There are no benefits in using the fp registers for
! aligned data copying.

.dbmedl32enter:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .dbmedl31		! skip big loop if less than 32
	nop
.dbmedl32:
	ldx	[%o1-8], %o4		! load
	prefetch [%o1 - (8 * BLOCK_SIZE)], #one_read
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0-8]		! and store
	prefetch [%o0 - (8 * BLOCK_SIZE)], #one_write
	ldx	[%o1-16], %o3		! a block of 32 bytes
	sub	%o1, 32, %o1		! decrease src ptr by 32
	stx	%o3, [%o0-16]
	ldx	[%o1+8], %o4
	sub	%o0, 32, %o0		! decrease dst ptr by 32
	stx	%o4, [%o0+8]
	ldx	[%o1], %o3
	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0]
.dbmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1-8], %o4		! load and store 16 bytes
	sub	%o1, 16, %o1		! decrease src ptr by 16
	stx	%o4, [%o0-8]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1], %o3		!
	sub	%o0, 16, %o0		! decrease dst ptr by 16
	stx	%o3, [%o0]
.dbmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .dbexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load 8 bytes
	sub	%o1, 8, %o1		! decrease src ptr by 8
	stx	%o4, [%o0-8]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz	%ncc, .dbremain		! bytes still left: finish bytewise
	sub	%o0, 8, %o0		! decrease dst ptr by 8
	retl				! nothing left: return
	mov	%g1, %o0

!
! Following code is for overlapping copies where src and dest
! are not long word aligned
!
	.align 16
.dbbck:
	rd	%fprs, %o3		! o3 = fprs

	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
	! So set it anyway, without checking.
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1

	alignaddr %o1, %g0, %o5		! align src
	ldd	[%o5], %d0		! get first 8 byte block
	andn	%o2, 7, %o4		! prepare src ptr for finishup code
	cmp	%o2, 32
	blt,pn	%ncc, .dbmv8
	sub	%o1, %o4, %o1		! o1 = src position of the byte tail
	cmp	%o2, 4095		! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
					! (delay slot: first ldd of .dbmv64,
					! which stands in for the ldd at
					! .dbmv32 that .dbmv32enter skips)
.dbmv64:
	ldd	[%o5-8], %d2		! load 8 bytes
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 64, %o5		!
	ldd	[%o5+40], %d6		! load 8 bytes
	sub	%o0, 64, %o0		!
	ldd	[%o5+32], %d8		! load 8 bytes
	sub	%o2, 64, %o2		! 64 less bytes to copy
	ldd	[%o5+24], %d18		! load 8 bytes
	cmp	%o2, 64			! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28		! load 8 bytes
	ldd	[%o5+8], %d30		! load 8 bytes
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	prefetch [%o5 - (5 * BLOCK_SIZE)], #one_read
	ldd	[%o5], %d0		! load 8 bytes
	std	%d10, [%o0+56]		! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	prefetch [%o0 - (5 * BLOCK_SIZE)], #one_write
	std	%d12, [%o0+48]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]		! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]		! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]		! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]		! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64
	std	%d26, [%o0]		! store the current 8 bytes

	cmp	%o2, 32
	blt,pn	%ncc, .dbmvx
	nop
.dbmv32:
	ldd	[%o5-8], %d2		! load 8 bytes
.dbmv32enter:
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 32, %o5		!
	ldd	[%o5+8], %d6		! load 8 bytes
	sub	%o0, 32, %o0		!
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	sub	%o2, 32, %o2		! 32 less bytes to copy
	std	%d10, [%o0+24]		! store the current 8 bytes
	cmp	%o2, 32			! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]		! store the current 8 bytes
.dbmvx:
	cmp	%o2, 8			! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
	nop
.dbmv8:
	ldd	[%o5-8], %d2
	sub	%o0, 8, %o0		! since we are at the end
					! when we first enter the loop
	sub	%o2, 8, %o2		! 8 less bytes to copy
	sub	%o5, 8, %o5
	cmp	%o2, 8			! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8	! extract 8 bytes out
	std	%d8, [%o0]		! store the current 8 bytes
	bgeu,pt	%ncc, .dbmv8
	fmovd	%d2, %d0
.dbmvfinish:
	and	%o3, FPRS_FEF, %o3	! fprs.du = fprs.dl = 0
	tst	%o2
	bz,pt	%ncc, .dbexit
	wr	%o3, %g0, %fprs		! fprs = o3   restore fprs

.dbremain:
	cmp	%o2, 4
	blt,pn	%ncc, .dbbyte
	nop
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	stb	%o3, [%o0]		! store 4th from last byte
	bz,pt	%ncc, .dbexit		! (delay slot is the dec below)
.dbbyte:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .dbbyte		! loop until done
	stb	%o3, [%o0]		! write byte
.dbexit:
	retl
	mov	%g1, %o0
	SET_SIZE(memmove)


	.align ICACHE_LINE_SIZE
	ENTRY(memcpy)
					! adjust instruction alignment
	nop				! Do not remove, these nops affect
	nop				! icache alignment and performance
.forcpy:
	prefetch [%o1], N_READS_STRONG
	prefetch [%o0], N_WRITES_STRONG
	cmp	%o2, SMALL_MAX		! check for not small case
	bgu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallleft	!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	add	%o2, 3, %o2		! restore count
.smallleft:
	tst	%o2
	bz,pt	%ncc, .smallexit
	nop
.smallleft3:				! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
	retl
	mov	%g1, %o0		! restore %o0

	.align	16
	nop				! affects loop icache alignment
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	nop
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	.smallleft3		! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

.smallword:
	subcc	%o2, 4, %o2		! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0
	.align 16
.medium:
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
	neg	%o0, %o5
	neg	%o1, %o3
	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned

	bz	%ncc, 2f
	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned

	sub	%o2, %o5, %o2	! update count

1:
	ldub	[%o1], %o4
	deccc	%o5
	inc	%o1
	stb	%o4, [%o0]
	bgu,pt	%ncc, 1b
	inc	%o0

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.

2:
	andcc	%o1, 0x3, %g0		! test alignment
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
					! if src, dst not aligned
	prefetch [%o0 + (1 * BLOCK_SIZE)], #one_write

/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination is in cache for medium
 * to short data moves.
 */
	andcc	%o1, 0x7, %g0		! test word alignment
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
	bz,pt	%ncc, .medlword		! branch to long word aligned case
	prefetch [%o0 + (2 * BLOCK_SIZE)], #one_write
	cmp	%o2, ALIGNED4_FPCOPY_THRESHOLD	! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	subcc	%o2, 15, %o2		! adjust length to allow cc test
	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
					! for end of loop
	ble,pt	%ncc, .medw15		! skip big loop if less than 16
	.empty
.medw16:
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ld	[%o1], %o4		! load
	subcc	%o2, 16, %o2		! decrement length count
	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
	stw	%o4, [%o0]		! and store
	ld	[%o1+4], %o3		! a block of 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
	stw	%o3, [%o0-4]
.medw15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop				!
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 3			! check for 4 bytes left
	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
	nop				!
	ld	[%o1], %o4		! load 4 bytes
	sub	%o2, 4, %o2		! decrease count by 4
	add	%o1, 4, %o1		! increase src ptr by 4
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o0, 4, %o0		! increase dst ptr by 4
	tst	%o2			! check for zero bytes left
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw3:					! count is known to be 1, 2, or 3
	deccc	%o2			! reduce count by one
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2			! reduce count by one
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.medwexit:
	retl
	mov	%g1, %o0		! restore %o0

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD
 * bytes.
 */

	.align 16
	nop
.medlword:				! long word aligned
					! length > ALIGNED8_FPCOPY_THRESHOLD
					! is handed to the FP loop instead
	cmp	%o2, ALIGNED8_FPCOPY_THRESHOLD
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medl31		! skip big loop if less than 32
	.empty
.medl32:
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ldx	[%o1], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	! %o0 is the store target, so hint a write like every other
	! destination prefetch in this file.
	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3		!
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stx	%o4, [%o0]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bz	%ncc, .medwexit		! exit if finished
	add	%o0, 8, %o0		! increase dst ptr by 8
	ba	.medw7
	nop

	.align 16
	nop
	nop
	nop
.mediumsetup:
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.mediumrejoin:
	rd	%fprs, %o4		! check for unused FPU

	add	%o1, 8, %o1		! prepare to round SRC upward

	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
	or	%o5, 0x67f, %o5

	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 3f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3:
	cmp	%o2, MEDIUM_MAX
	bmask	%o5, %g0, %g0

	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!	    o3 is how much sooner we'll cross the alignment boundary
	!		in SRC compared to in DST
	!
	! Examples:  Let # denote bytes that should not be accessed
	!	     Let x denote a byte already copied to align DST
	!	     Let . and - denote bytes not yet copied
	!	     Let | denote double alignment boundaries
	!
	!            DST:  ######xx|........|--------|..######   o2 = 18
	!                           o0
	!
	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!                           o1
	!
	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!                                    o1
	!
	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
	!                                    o1

	or	%g0, -8, %o5
	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1

	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
	add	%o5, %o2, %o5
	add	%o5, %o3, %o5

	bleu	%ncc, 4f		! cc still from cmp %o2, MEDIUM_MAX
	andn	%o5, 7, %o5		! 8 byte aligned count
	neg	%o0, %o5		! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5	! bytes till DST block aligned
4:
	brgez,a	%o3, .beginmedloop
	ldd	[%o1-8], %d0

	add	%o1, %o3, %o1		! back up o1
5:
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	andcc	%o1, 7, %g0
	bnz	%ncc, 5b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

.beginmedloop:
	tst	%o5
	bz	%ncc, .endmedloop
	sub	%o2, %o5, %o2		! update count for later

	! Main loop to write out doubles.  Note: o5 & 7 == 0

	ldd	[%o1], %d2
	subcc	%o5, 8, %o5		! update local count
	bz,pn	%ncc, 1f
	add	%o1, 8, %o1		! update SRC

.medloop:
	faligndata %d0, %d2, %d4
	ldd	[%o1], %d0
	subcc	%o5, 8, %o5		! update local count
	add	%o1, 16, %o1		! update SRC
	std	%d4, [%o0]
	bz,pn	%ncc, 2f
	faligndata %d2, %d0, %d6
	ldd	[%o1 - 8], %d2
	subcc	%o5, 8, %o5		! update local count
	std	%d6, [%o0 + 8]
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0		! update DST

1:
	faligndata %d0, %d2, %d4
	fmovd	%d2, %d0
	std	%d4, [%o0]
	ba	.endmedloop
	add	%o0, 8, %o0

2:
	std	%d6, [%o0 + 8]
	sub	%o1, 8, %o1
	add	%o0, 16, %o0


.endmedloop:
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	cmp	%o2, BLOCK_SIZE
	bgu	%ncc, .large		! otherwise, less than 16 bytes left
					! NOTE: the delay slot of this branch
					! is the first instruction of the
					! active preprocessor arm below.
					! .large reloads %o5 and redoes its
					! own cmp before using either.

#if 0

	/* This code will use partial stores.  */

	mov	%g0, %o5
	and	%o3, 7, %o3		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.

	subcc	%o2, 8, %o2		! update count (maybe too much)
	movl	%ncc, %o2, %o5
	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)

	bz	%ncc, 2f
	alignaddr %o3, %g0, %g0		! set GSR.ALIGN

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	not	%o3
	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
	edge8n	%g0, %o3, %o5
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit
	add	%o0, %o3, %o0		! update DST to last stored byte
3:
	inc	%o0
	deccc	%o2
	ldub	[%o1], %o3
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o1

#else

	andcc	%o3, 7, %o5		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.
	bz	%ncc, 2f
	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
	cmp	%o2, 8
	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
	add	%o1, %o3, %o1		! Back up %o1

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	subcc	%o2, 8, %o2
	std	%d0, [%o0]
	bz	%ncc, .mediumexit
	add	%o0, 8, %o0
3:
	ldub	[%o1], %o3
	deccc	%o2
	inc	%o1
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o0
#endif

.mediumexit:
	wr	%o4, %g0, %fprs		! fprs = o4   restore fprs
	retl
	mov	%g1, %o0


	.align ICACHE_LINE_SIZE
.large:

	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read

	set	BST_THRESHOLD, %o5
	cmp	%o2, %o5
	! Use %ncc like every other length test in this file, so the full
	! width of the count is compared in a 64-bit build.
	bgu,pn	%ncc, .xlarge
	prefetch [%o1 + (12 * BLOCK_SIZE)], #one_read

	ldd	[%o1], %f2
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	std	%f32, [%o0]
	std	%f34, [%o0+8]
	std	%f36, [%o0+16]
	std	%f38, [%o0+24]
	std	%f40, [%o0+32]
	std	%f42, [%o0+40]
	std	%f44, [%o0+48]
	std	%f46, [%o0+56]
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (24 * BLOCK_SIZE) + BLOCK_SIZE], #one_read
	add	%o0, BLOCK_SIZE, %o0	! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o0 + (18 * BLOCK_SIZE)], #one_write
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs		! restore fprs (entry FEF bit only)
	retl
	mov	%g1, %o0


	.align 16
	! two nops here causes loop starting at 1f below to be
	! on a cache line boundary, improving performance
	nop
	nop
.xlarge:
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	ldd	[%o1], %f2
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	prefetch [%o1 + (17 * BLOCK_SIZE)], #one_read
	sub	%o2, BLOCK_SIZE, %o2	! update count
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! This point is 32-byte aligned since 24 instructions appear since
	! the previous alignment directive.
	! NOTE(review): only 22 instructions (2 nops + 20) now sit between
	! the .align 16 above and this point; confirm the intended alignment
	! before adding or removing instructions here.


	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	sub	%o2, BLOCK_SIZE, %o2	! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	prefetch [%o1 + (25 * BLOCK_SIZE)], #one_read
	add	%o0, BLOCK_SIZE, %o0	! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped
	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs		! restore fprs (entry FEF bit only)
	retl
	mov	%g1, %o0

	SET_SIZE(memcpy)