Home | History | Annotate | Download | only in gen
      1 /*
      2  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
      3  * Use is subject to license terms.
      4  */
      5 
      6 /*
      7  * Copyright (c) 2002 Advanced Micro Devices, Inc.
      8  *
      9  * All rights reserved.
     10  *
     11  * Redistribution and  use in source and binary  forms, with or
     12  * without  modification,  are   permitted  provided  that  the
     13  * following conditions are met:
     14  *
     15  * + Redistributions  of source  code  must  retain  the  above
     16  *   copyright  notice,   this  list  of   conditions  and  the
     17  *   following disclaimer.
     18  *
     19  * + Redistributions  in binary  form must reproduce  the above
     20  *   copyright  notice,   this  list  of   conditions  and  the
     21  *   following  disclaimer in  the  documentation and/or  other
     22  *   materials provided with the distribution.
     23  *
     24  * + Neither the  name of Advanced Micro Devices,  Inc. nor the
     25  *   names  of  its contributors  may  be  used  to endorse  or
     26  *   promote  products  derived   from  this  software  without
     27  *   specific prior written permission.
     28  *
     29  * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
     30  * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
     31  * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
     32  * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
     33  * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
     34  * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
     35  * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
     36  * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
     37  * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
     38  * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
     39  * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
     40  * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
     41  * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
     42  * POSSIBILITY OF SUCH DAMAGE.
     43  *
     44  * It is  licensee's responsibility  to comply with  any export
     45  * regulations applicable in licensee's jurisdiction.
     46  */
     47 
     48 	.file	"memcmp.s"
     49 
     50 #include <sys/asm_linkage.h>
     51 
     52 	ANSI_PRAGMA_WEAK(memcmp,function)
     53 
     54 #include "SYS.h"
     55 #include "cache.h"
     56 
     57 #define LABEL(s) .memcmp/**/s
     58 
     /*
      * memcmp -- compare two byte ranges.
      *
      * Registers as used by the code below:
      *   In:      %rdi = first buffer, %rsi = second buffer,
      *            %rdx = number of bytes to compare
      *   Out:     %eax = 0 if all bytes match; otherwise the first
      *            differing byte read through %rdi minus the byte read
      *            through %rsi (both zero-extended before subtracting)
      *   Scratch: %rcx, %r8, %r9, %r10, %r11, flags.  No stack is used.
      *
      * Strategy (chosen by the remaining count in %rdx):
      *   < 8 bytes      byte loop                      LABEL(1)
      *   < 32 bytes     8-byte loop                    LABEL(8)
      *   <= 2048 bytes  32-byte loop                   LABEL(32)
      *   larger         byte loop until %rsi is 8-byte aligned, then a
      *                  64-byte loop for up to .amd64cache1half bytes,
      *                  a 64-byte loop with prefetch for up to
      *                  .amd64cache2half bytes, then a 128-byte loop
      *                  with prefetch for the rest.
      *
      * The wide loops only detect that a chunk differs somewhere.  They
      * then branch to the next narrower loop with %rsi/%rdi/%rdx still
      * describing the start of that chunk, until the byte loop computes
      * the return value.  The same narrower loops handle the tail that is
      * left over when a wide loop runs out of whole chunks.
      *
      * Throughout, lea (not add/inc) advances the pointers so the flags
      * set by the preceding dec/inc survive until the loop-closing jnz.
      */
	ENTRY(memcmp)                 /* (const void *, const void*, size_t) */

LABEL(try1):
        cmp     $8, %rdx
        jae     LABEL(1after)           /* 8 bytes or more: try wider loops */

LABEL(1):                                /* 1-byte */
        test    %rdx, %rdx
        mov     $0, %eax                /* mov, not xor: keeps ZF from test */
        jz      LABEL(exit)             /* nothing left to compare: return 0 */

LABEL(1loop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax              /* %eax = byte difference = result */
        jnz     LABEL(exit)

        dec     %rdx

        lea     1 (%rdi), %rdi
        lea     1 (%rsi), %rsi

        jnz     LABEL(1loop)            /* tests ZF from dec %rdx */

        /* falling out of the loop: %eax is 0 from the last sub */

LABEL(exit):
        rep                             /* NOTE(review): presumably the */
        ret                             /* two-byte return form for AMD */

        .p2align 4

LABEL(1after):

LABEL(8try):
        cmp     $32, %rdx
        jae     LABEL(8after)

LABEL(8):                        /* 8-byte */
        /*
         * The word count is kept in the full 64-bit %rcx.  This label is
         * also reached (through LABEL(32)) from the wide loops, where
         * %rdx may be 4 GiB or more; a count taken from %edx alone could
         * be zero or too small, and the differing word would be skipped.
         */
        mov     %rdx, %rcx
        shr     $3, %rcx
        jz      LABEL(1)                /* fewer than 8 bytes left */

        .p2align 4

LABEL(8loop):
        mov     (%rsi), %rax
        cmp     (%rdi), %rax
        jne     LABEL(1)                /* byte loop locates the difference */

        sub     $8, %rdx
        dec     %rcx

        lea     8 (%rsi), %rsi
        lea     8 (%rdi), %rdi

        jnz     LABEL(8loop)            /* tests ZF from dec %rcx */

LABEL(8skip):
        and     $7, %edx                /* %rdx is already < 8 here */
        jnz     LABEL(1)                /* compare the 1..7 byte tail */

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(8after):

LABEL(32try):
        cmp     $2048, %rdx
        ja      LABEL(32after)

LABEL(32):                               /* 32-byte */
        /*
         * 64-bit chunk count, for the same reason as in LABEL(8): the
         * wide loops branch here with %rdx possibly 4 GiB or more.
         */
        mov     %rdx, %rcx
        shr     $5, %rcx
        jz      LABEL(8)                /* fewer than 32 bytes left */

        .p2align 4

LABEL(32loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10

        /* OR the four differences together: non-zero if any word differs */
        or      %rax,  %r8
        or       %r9, %r10
        or       %r8, %r10
        jnz     LABEL(8)                /* narrow down within this chunk */

        sub     $32, %rdx
        dec     %rcx

        lea     32 (%rsi), %rsi
        lea     32 (%rdi), %rdi

        jnz     LABEL(32loop)           /* tests ZF from dec %rcx */

LABEL(32skip):
        and     $31, %edx               /* %rdx is already < 32 here */
        jnz     LABEL(8)                /* compare the 1..31 byte tail */

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(32after):

        /* more than 2048 bytes; touch the limit that LABEL(64try) loads */
	prefetchnta _sref_(.amd64cache1half)	/* 3DNow: use prefetch */

LABEL(srctry):
        mov     %esi, %r8d      /* align by source */

        and     $7, %r8d                /* %r8d = %rsi modulo 8 */
        jz      LABEL(srcafter)  /* not unaligned */

LABEL(src):                      /* align */
        lea     -8 (%r8, %rdx), %rdx    /* %rdx -= (8 - misalignment) */
        sub     $8, %r8d                /* negative count of bytes to align */


LABEL(srcloop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax              /* %eax = byte difference = result */
        jnz     LABEL(exit)

        inc     %r8d                    /* counts up towards zero */

        lea     1 (%rdi), %rdi
        lea     1 (%rsi), %rsi

        jnz     LABEL(srcloop)          /* tests ZF from inc %r8d */

        .p2align 4

LABEL(srcafter):

LABEL(64try):
        /* %rcx = min(.amd64cache1half, %rdx) (unsigned) */
        mov     _sref_(.amd64cache1half), %rcx
        cmp     %rdx, %rcx
        cmova   %rdx, %rcx

LABEL(64):                               /* 64-byte */
        shr     $6, %rcx                /* number of 64-byte chunks */
        jz      LABEL(32)

        .p2align 4

LABEL(64loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        or      %r8,  %rax

        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz     LABEL(32)               /* difference in bytes 0..31 */

        mov     32 (%rsi), %rax
        mov     40 (%rsi),  %r8
        sub     32 (%rdi), %rax
        sub     40 (%rdi),  %r8
        or      %r8,  %rax

        mov     48 (%rsi),  %r9
        mov     56 (%rsi), %r10
        sub     48 (%rdi),  %r9
        sub     56 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz     LABEL(32)               /* difference in bytes 32..63 */

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(64loop)

LABEL(64skip):
        cmp     $2048, %rdx
        ja     LABEL(64after)           /* still large: prefetching loops */

        test    %edx, %edx              /* %rdx <= 2048, so %edx is enough */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(64after):

LABEL(pretry):

LABEL(pre):                              /* 64-byte prefetching */
        /* %rcx = min(.amd64cache2half, %rdx) (unsigned) */
        mov     _sref_(.amd64cache2half), %rcx
        cmp     %rdx, %rcx
        cmova   %rdx, %rcx

        shr     $6, %rcx                /* number of 64-byte chunks */
        jz      LABEL(preskip)

        /* first chunk, peeled out of the aligned loop below */
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r9
        mov     16 (%rsi), %r10
        mov     24 (%rsi), %r11
        sub        (%rdi), %rax
        sub      8 (%rdi), %r9
        sub     16 (%rdi), %r10
        sub     24 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     32 (%rdi), %rax
        sub     40 (%rdi), %r9
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        /*
         * If that was the only chunk, do not fall into the loop: it
         * decrements before testing, so entering with %rcx == 0 would
         * wrap the counter and run past the end of the buffers.
         */
        jz      LABEL(preskip)

        .p2align 4

LABEL(preloop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r9
        mov     16 (%rsi), %r10
        mov     24 (%rsi), %r11
        sub        (%rdi), %rax
        sub      8 (%rdi), %r9
        sub     16 (%rdi), %r10
        sub     24 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)               /* difference in bytes 0..31 */

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     32 (%rdi), %rax
        sub     40 (%rdi), %r9
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)               /* difference in bytes 32..63 */

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(preloop)


LABEL(preskip):
        cmp     $2048, %rdx
        ja      LABEL(preafter)         /* still large: 128-byte loop */

        test    %edx, %edx              /* %rdx <= 2048, so %edx is enough */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(preafter):

LABEL(128try):

LABEL(128):                              /* 128-byte */
        mov     %rdx, %rcx
        shr     $7, %rcx                /* number of 128-byte chunks */
        jz      LABEL(128skip)

        .p2align 4

LABEL(128loop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        /* bytes 0..63: OR of all eight word differences ends up in %rax */
        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        sub        (%rdi), %rax
        sub      8 (%rdi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi), %r9
        sub     24 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov     32 (%rsi), %r8
        mov     40 (%rsi), %r9
        sub     32 (%rdi), %r8
        sub     40 (%rdi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)

        prefetchnta 576 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 576 (%rdi)	/* 3DNow: use prefetch */

        /* bytes 64..127, same scheme */
        mov      64 (%rsi), %rax
        mov      72 (%rsi), %r8
        sub      64 (%rdi), %rax
        sub      72 (%rdi), %r8
        mov      80 (%rsi), %r9
        mov      88 (%rsi), %r10
        sub      80 (%rdi), %r9
        sub      88 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov      96 (%rsi), %r8
        mov     104 (%rsi), %r9
        sub      96 (%rdi), %r8
        sub     104 (%rdi), %r9
        mov     112 (%rsi), %r10
        mov     120 (%rsi), %r11
        sub     112 (%rdi), %r10
        sub     120 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)               /* pointers still at chunk start */

        sub     $128, %rdx
        dec     %rcx

        lea     128 (%rsi), %rsi
        lea     128 (%rdi), %rdi

        jnz     LABEL(128loop)          /* tests ZF from dec %rcx */

LABEL(128skip):
        and     $127, %edx              /* %rdx is already < 128 here */
        jnz     LABEL(32)               /* compare the 1..127 byte tail */

        xor     %eax, %eax
        ret

	SET_SIZE(memcmp)
    457