/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 */

#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#include <sys/trap.h>

#define	ICACHE_LINE_SIZE	64
#define	BLOCK_SIZE	64
#define	FPRS_FEF	0x4

#define	SHORTCOPY	3
#define	SMALL_MAX	39
#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */
#define	MED_MAX		256	/* max copy for medium longword-aligned case */

#ifndef BSTORE_SIZE
#define	BSTORE_SIZE	256	/* min copy size for block store */
#endif

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

/*
 * memmove(dst, src, len)
 *
 * In:	%o0 = destination, %o1 = source, %o2 = byte count
 * Out:	%o0 = original destination (held in %g1 while copying)
 *
 * If src >= dst, or if len <= (dst - src), control transfers to .forcpy
 * inside memcpy and the copy runs forward.  Otherwise the copy is done
 * backwards: both pointers are first advanced to the end of their
 * regions and every loop below walks towards lower addresses.
 *
 * Scratch registers used on the backward path: %o3, %o4, %o5, %g1 and,
 * on the paths that enable the FPU, floating point registers.  %fprs is
 * read into %o3 before FEF is set and is written back (masked with 0x4)
 * before those paths finish.
 */
	ENTRY(memmove)
	cmp	%o1, %o0	! if from address is >= to use forward copy
	bgeu	%ncc, .forcpy	! else use backward if ...
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu	%ncc, .forcpy	! if size is bigger, do overlapped copy
	nop

	!
	! an overlapped copy that must be done "backwards"
	!
.ovbc:
	mov	%o0, %g1		! save dest address for return val
	add	%o1, %o2, %o1		! get to end of source space
	add	%o0, %o2, %o0		! get to end of destination space

	cmp	%o2, 24
	bgeu,pn	%ncc, .dbalign
	nop
	cmp	%o2, 4
	blt,pn	%ncc, .byte
	sub	%o2, 3, %o2		! bias count by 3 for the 4-byte loop
.byte4loop:
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]		! store 4th from last byte
.byte:
	addcc	%o2, 3, %o2		! remove the bias; zero means done
	bz,pt	%ncc, .exit
	! The delay slot of the branch above is the "dec %o1" at .byteloop.
	! It also executes when the branch is taken; .exit does not read %o1.
.byteloop:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .byteloop		! loop until done
	stb	%o3, [%o0]		! write byte
.exit:
	retl
	mov	%g1, %o0		! return original dst

	.align	16
.dbalign:
	andcc	%o0, 7, %o5		! bytes till DST 8 byte aligned
	bz,pt	%ncc, .dbmed
	sub	%o2, %o5, %o2		! update count
.dbalign1:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o5			! decrement count
	bgu,pt	%ncc, .dbalign1		! loop until done
	stb	%o3, [%o0]		! store a byte

! check for src long word alignment
.dbmed:
	andcc	%o1, 7, %g0		! chk src long word alignment
	bnz,pn	%ncc, .dbbck
	nop
	!
	! Following code is for overlapping copies where src and dest
	! are long word aligned
	!
	cmp	%o2, 4095
	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
	nop
	prefetch [%o1 - (1 * BLOCK_SIZE)], 20	! into the prefetch cache
	sub	%o2, 63, %o2		! adjust length to allow cc test
					! for end of loop
	prefetch [%o1 - (2 * BLOCK_SIZE)], 20	! into the prefetch cache
	rd	%fprs, %o3		! o3 = fprs
	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
	! So set it anyway, without checking.
	prefetch [%o1 - (3 * BLOCK_SIZE)], 20	! into the prefetch cache
	wr	%g0, 0x4, %fprs		! fprs.fef = 1
	prefetch [%o1 - (4 * BLOCK_SIZE)], 20	! into the prefetch cache
.dbmedl64:
	prefetch [%o1 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	ldd	[%o1-8], %d4		! load
	subcc	%o2, 64, %o2		! decrement length count
	std	%d4, [%o0-8]		! and store
	ldd	[%o1-16], %d2		! a block of 64 bytes
	sub	%o1, 64, %o1		! decrease src ptr by 64
	std	%d2, [%o0-16]
	sub	%o0, 64, %o0		! decrease dst ptr by 64
	ldd	[%o1+40], %d4
	std	%d4, [%o0+40]
	ldd	[%o1+32], %d2
	std	%d2, [%o0+32]
	ldd	[%o1+24], %d4
	std	%d4, [%o0+24]
	ldd	[%o1+16], %d2
	std	%d2, [%o0+16]
	ldd	[%o1+8], %d4
	std	%d4, [%o0+8]
	ldd	[%o1], %d2
	bgu,pt	%ncc, .dbmedl64		! repeat if at least 64 bytes left
	std	%d2, [%o0]
	add	%o2, 63, %o2		! restore offset adjustment
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	wr	%o3, %g0, %fprs		! fprs = o3   restore fprs
.dbmedl32enter:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .dbmedl31		! skip big loop if less than 32
	nop
.dbmedl32:
	ldx	[%o1-8], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0-8]		! and store
	ldx	[%o1-16], %o3		! a block of 32 bytes
	sub	%o1, 32, %o1		! decrease src ptr by 32
	stx	%o3, [%o0-16]
	ldx	[%o1+8], %o4
	sub	%o0, 32, %o0		! decrease dst ptr by 32
	stx	%o4, [%o0+8]
	ldx	[%o1], %o3
	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0]
.dbmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1-8], %o4		! load and store 16 bytes
	sub	%o1, 16, %o1		! decrease src ptr by 16
	stx	%o4, [%o0-8]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1], %o3		!
	sub	%o0, 16, %o0		! decrease dst ptr by 16
	stx	%o3, [%o0]
.dbmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .dbexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load 8 bytes
	sub	%o1, 8, %o1		! decrease src ptr by 8
	stx	%o4, [%o0-8]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz	%ncc, .dbremain		! bytes still left: finish in .dbremain
	sub	%o0, 8, %o0		! decrease dst ptr by 8
	retl				! count reached zero: done
	mov	%g1, %o0

	!
	! Following code is for overlapping copies where src and dest
	! are not long word aligned
	!
	.align	16
.dbbck:
	rd	%fprs, %o3		! o3 = fprs

	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
	! So set it anyway, without checking.
	wr	%g0, 0x4, %fprs		! fprs.fef = 1

	alignaddr %o1, %g0, %o5		! align src
	ldd	[%o5], %d0		! get first 8 byte block
	andn	%o2, 7, %o4		! prepare src ptr for finishup code
	cmp	%o2, 32
	blt,pn	%ncc, .dbmv8
	sub	%o1, %o4, %o1		!
	cmp	%o2, 4095		! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
	! No nop follows the branch above: its delay slot is the first
	! instruction of .dbmv64 (ldd [%o5-8], %d2), which is the same load
	! that .dbmv32 issues just before .dbmv32enter, so %d2 is loaded on
	! both paths.
.dbmv64:
	ldd	[%o5-8], %d2		! load 8 bytes
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 64, %o5		!
	ldd	[%o5+40], %d6		! load 8 bytes
	sub	%o0, 64, %o0		!
	ldd	[%o5+32], %d8		! load 8 bytes
	sub	%o2, 64, %o2		! 64 less bytes to copy
	ldd	[%o5+24], %d18		! load 8 bytes
	cmp	%o2, 64			! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28		! load 8 bytes
	ldd	[%o5+8], %d30		! load 8 bytes
	prefetch [%o5 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	std	%d10, [%o0+56]		! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+48]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]		! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]		! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]		! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]		! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64		! tests the "cmp %o2, 64" above
	std	%d26, [%o0]		! store the current 8 bytes

	cmp	%o2, 32
	blt,pn	%ncc, .dbmvx
	nop
.dbmv32:
	ldd	[%o5-8], %d2		! load 8 bytes
.dbmv32enter:
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 32, %o5		!
	ldd	[%o5+8], %d6		! load 8 bytes
	sub	%o0, 32, %o0		!
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	sub	%o2,32, %o2		! 32 less bytes to copy
	std	%d10, [%o0+24]		! store the current 8 bytes
	cmp	%o2, 32			! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]		! store the current 8 bytes
.dbmvx:
	cmp	%o2, 8			! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
	nop
.dbmv8:
	ldd	[%o5-8], %d2
	sub	%o0, 8, %o0		! since we are at the end
					! when we first enter the loop
	sub	%o2, 8, %o2		! 8 less bytes to copy
	sub	%o5, 8, %o5
	cmp	%o2, 8			! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8	! extract 8 bytes out
	std	%d8, [%o0]		! store the current 8 bytes
	bgeu,pt	%ncc, .dbmv8
	fmovd	%d2, %d0
.dbmvfinish:
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	tst	%o2
	bz,pt	%ncc, .dbexit
	wr	%o3, %g0, %fprs		! fprs = o3   restore fprs

	! %o3 is free for use as a byte scratch register from here on;
	! the saved fprs value it held has already been written back.
.dbremain:
	cmp	%o2, 4
	blt,pn	%ncc, .dbbyte
	nop
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	stb	%o3, [%o0]		! store 4th from last byte
	bz,pt	%ncc, .dbexit
	! The delay slot of the branch above is the "dec %o1" at .dbbyte.
	! It also executes when the branch is taken; .dbexit does not
	! read %o1.
.dbbyte:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .dbbyte		! loop until done
	stb	%o3, [%o0]		! write byte
.dbexit:
	retl
	mov	%g1, %o0		! return original dst
	SET_SIZE(memmove)


/*
 * memcpy(dst, src, len)
 *
 * In:	%o0 = destination, %o1 = source, %o2 = byte count
 * Out:	%o0 = original destination (held in %g1 while copying)
 *
 * memmove also enters here, at .forcpy, for copies it can do forward.
 *
 * Dispatch, as implemented below:
 *   len <= SMALL_MAX	byte loop, or word loop when both pointers are
 *			4-byte aligned (.smallnotalign4 / .smallword).
 *   len >  SMALL_MAX	.medium: DST is first brought to 8-byte alignment.
 *	- SRC then 8-byte aligned and len <= MED_MAX:  ldx/stx (.medlword)
 *	- SRC then 4-byte aligned and len <= MED_WMAX: ld/stw  (.medw16)
 *	- otherwise .mediumrejoin: floating point registers with
 *	  alignaddr/faligndata (.medloop).  When len > MEDIUM_MAX the
 *	  loop only runs until DST is BLOCK_SIZE aligned, and .large or
 *	  .xlarge (remaining count > BSTORE_SIZE) copies whole blocks.
 *
 * On the .mediumrejoin paths %o4 holds (fprs & FPRS_FEF) and is written
 * back to %fprs before returning.
 */
	.align ICACHE_LINE_SIZE
	ENTRY(memcpy)
					! adjust instruction alignment
	nop				! Do not remove, these nops affect
	nop				! icache alignment and performance
.forcpy:
	cmp	%o2, SMALL_MAX		! check for not small case
	bgu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallleft	!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	add	%o2, 3, %o2		! restore count
.smallleft:
	tst	%o2
	bz,pt	%ncc, .smallexit
	nop
.smallleft3:				! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
	retl
	mov	%g1, %o0		! restore %o0

	.align	16
	nop				! affects loop icache alignment
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	nop
	cmp	%o2, 4			! check for 4 or more bytes left
	! NOTE(review): the branch below names no condition-code register,
	! unlike the surrounding %ncc branches - confirm this is intended.
	blt	.smallleft3		! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

.smallword:
	subcc	%o2, 4, %o2		! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0
	.align 16
.medium:
	neg	%o0, %o5
	neg	%o1, %o3
	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned

	bz	%ncc, 2f	! tests the andcc above (DST already aligned)
	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned

	sub	%o2, %o5, %o2	! update count

1:
	ldub	[%o1], %o4
	deccc	%o5
	inc	%o1
	stb	%o4, [%o0]
	bgu,pt	%ncc, 1b
	inc	%o0

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.

2:
	andcc	%o1, 0x3, %g0		! test alignment
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
					! if src, dst not aligned
	prefetch [%o1 + (1 * BLOCK_SIZE)], 20

/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination is in cache for medium
 * to short data moves.
 */
	andcc	%o1, 0x7, %g0		! test word alignment
	bz,pt	%ncc, .medlword		! branch to long word aligned case
	prefetch [%o1 + (2 * BLOCK_SIZE)], 20
	cmp	%o2, MED_WMAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 15, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medw15		! skip big loop if less than 16
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medw16:
	ld	[%o1], %o4		! load
	subcc	%o2, 16, %o2		! decrement length count
	stw	%o4, [%o0]		! and store
	ld	[%o1+4], %o3		! a block of 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
	stw	%o3, [%o0-4]
.medw15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop				!
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 3			! check for 4 bytes left
	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
	nop				!
	ld	[%o1], %o4		! load 4 bytes
	sub	%o2, 4, %o2		! decrease count by 4
	add	%o1, 4, %o1		! increase src ptr by 4
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o0, 4, %o0		! increase dst ptr by 4
	tst	%o2			! check for zero bytes left
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw3:					! count is known to be 1, 2, or 3
	deccc	%o2			! reduce count by one
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2			! reduce count by one
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.medwexit:
	retl
	mov	%g1, %o0		! restore %o0

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and MED_MAX bytes
 */

	.align 16
	nop
.medlword:				! long word aligned
					! length > SMALL_MAX
	cmp	%o2, MED_MAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medl31		! skip big loop if less than 32
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20	! into the l2 cache
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medl32:
	ldx	[%o1], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3		!
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stx	%o4, [%o0]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bz	%ncc, .medwexit		! exit if finished
	add	%o0, 8, %o0		! increase dst ptr by 8
	ba	.medw7
	nop

	.align 16
	nop
	nop
	nop
.mediumsetup:
	prefetch [%o1 + (2 * BLOCK_SIZE)], 21
.mediumrejoin:
	rd	%fprs, %o4		! check for unused FPU

	add	%o1, 8, %o1		! prepare to round SRC upward

	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
	or	%o5, 0x67f, %o5

	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 3f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1 (annulled if FEF was set)
3:
	cmp	%o2, MEDIUM_MAX		! cc is consumed by the bleu below
	bmask	%o5, %g0, %g0

	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!	    o3 is how much sooner we'll cross the alignment boundary
	!		in SRC compared to in DST
	!
	! Examples:  Let # denote bytes that should not be accessed
	!	     Let x denote a byte already copied to align DST
	!	     Let . and - denote bytes not yet copied
	!	     Let | denote double alignment boundaries
	!
	!           DST:  ######xx|........|--------|..######   o2 = 18
	!                          o0
	!
	! o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!                          o1
	!
	! o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!                                   o1
	!
	! o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
	!                                   o1

	or	%g0, -8, %o5
	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1

	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
	add	%o5, %o2, %o5
	add	%o5, %o3, %o5

	bleu	%ncc, 4f		! medium case: o2 <= MEDIUM_MAX
	andn	%o5, 7, %o5		! 8 byte aligned count
	neg	%o0, %o5		! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5	! bytes till DST block aligned
4:
	brgez,a	%o3, .beginmedloop
	ldd	[%o1-8], %d0

	add	%o1, %o3, %o1		! back up o1
5:
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	andcc	%o1, 7, %g0
	bnz	%ncc, 5b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

.beginmedloop:
	tst	%o5
	bz	%ncc, .endmedloop
	sub	%o2, %o5, %o2		! update count for later

	! Main loop to write out doubles.  Note: o5 & 7 == 0

	ldd	[%o1], %d2
	subcc	%o5, 8, %o5		! update local count
	bz,pn	%ncc, 1f
	add	%o1, 8, %o1		! update SRC

.medloop:
	faligndata %d0, %d2, %d4
	ldd	[%o1], %d0
	subcc	%o5, 8, %o5		! update local count
	add	%o1, 16, %o1		! update SRC
	std	%d4, [%o0]
	bz,pn	%ncc, 2f
	faligndata %d2, %d0, %d6
	ldd	[%o1 - 8], %d2
	subcc	%o5, 8, %o5		! update local count
	std	%d6, [%o0 + 8]
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0		! update DST

1:
	faligndata %d0, %d2, %d4
	fmovd	%d2, %d0
	std	%d4, [%o0]
	ba	.endmedloop
	add	%o0, 8, %o0

2:
	std	%d6, [%o0 + 8]
	sub	%o1, 8, %o1
	add	%o0, 16, %o0


.endmedloop:
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	cmp	%o2, BLOCK_SIZE
	bgu	%ncc, .large		! otherwise, less than 16 bytes left
	! The delay slot of the branch above is the first instruction of
	! whichever preprocessor arm below is compiled in.

#if 0

	/* This code will use partial stores.  */

	mov	%g0, %o5
	and	%o3, 7, %o3		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.

	subcc	%o2, 8, %o2		! update count (maybe too much)
	movl	%ncc, %o2, %o5
	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)

	bz	%ncc, 2f
	alignaddr %o3, %g0, %g0		! set GSR.ALIGN

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	not	%o3
	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
	edge8n	%g0, %o3, %o5
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit
	add	%o0, %o3, %o0		! update DST to last stored byte
3:
	inc	%o0
	deccc	%o2
	ldub	[%o1], %o3
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o1

#else

	andcc	%o3, 7, %o5		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.
	bz	%ncc, 2f
	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
	cmp	%o2, 8
	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
	add	%o1, %o3, %o1		! Back up %o1

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	subcc	%o2, 8, %o2
	std	%d0, [%o0]
	bz	%ncc, .mediumexit
	add	%o0, 8, %o0
3:
	ldub	[%o1], %o3
	deccc	%o2
	inc	%o1
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o0
#endif

.mediumexit:
	wr	%o4, %g0, %fprs		! fprs = o4   restore fprs
	retl
	mov	%g1, %o0


	.align ICACHE_LINE_SIZE
.large:
	! The following test for BSTORE_SIZE is used to decide whether
	! to store data with a block store or with individual stores.
	! The block store wins when the amount of data is so large
	! that it causes other application data to be moved out
	! of the L1 or L2 cache.
	! On a Panther, block store can lose more often because block
	! store forces the stored data to be removed from the L3 cache.
	!
	sethi	%hi(BSTORE_SIZE),%o5
	or	%o5,%lo(BSTORE_SIZE),%o5
	cmp	%o2, %o5
	bgu	%ncc, .xlarge
	! The delay slot of the branch above is the first prefetch below
	! (prefetch [%o0 + (0 * BLOCK_SIZE)], 22); it executes on both paths.

	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	prefetch [%o0 + (0 * BLOCK_SIZE)], 22
	prefetch [%o0 + (1 * BLOCK_SIZE)], 22
	prefetch [%o0 + (2 * BLOCK_SIZE)], 22
	ldd	[%o1], %f2
	prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	std	%f32, [%o0]
	std	%f34, [%o0+8]
	std	%f36, [%o0+16]
	std	%f38, [%o0+24]
	std	%f40, [%o0+32]
	std	%f42, [%o0+40]
	std	%f44, [%o0+48]
	std	%f46, [%o0+56]
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o0 + (6 * BLOCK_SIZE)], 22
	prefetch [%o0 + (3 * BLOCK_SIZE)], 22
	add	%o0, BLOCK_SIZE, %o0	! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs		! restore fprs
	retl
	mov	%g1, %o0


	.align 16
	! two nops here causes loop starting at 1f below to be
	! on a cache line boundary, improving performance
	nop
	nop
.xlarge:
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	! prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	! executed in delay slot for branch to .xlarge
	! NOTE(review): the instruction that actually follows the
	! "bgu %ncc, .xlarge" in .large is
	! prefetch [%o0 + (0 * BLOCK_SIZE)], 22, so the two lines above
	! look stale - confirm against the intended prefetch schedule.
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	ldd	[%o1], %f2
	prefetch [%o1 + (6 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (7 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! This point is 32-byte aligned since 24 instructions appear since
	! the previous alignment directive.


	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	sub	%o2, BLOCK_SIZE, %o2	! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21
	add	%o0, BLOCK_SIZE, %o0	! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped
	! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
	! strong prefetch prevents drops on Panther, but Jaguar and earlier
	! US-III models treat strong prefetches as weak prefetches
	! to avoid regressions on customer hardware, we retain the prefetch
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs		! restore fprs
	retl
	mov	%g1, %o0

	SET_SIZE(memcpy)