Home | History | Annotate | Download | only in gen
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright (c) 2009, Intel Corporation
     24  * All rights reserved.
     25  */
     26 
     27 /*
     28  *	strlen - calculate the length of string
     29  */
     30 
     31 #include "SYS.h"
     32 #include "proc64_id.h"
     33 
     34 #define LABEL(s) .strlen/**/s
     35 
     36 	/*
     37 	 * This implementation uses SSE instructions to compare up to 16 bytes
     38 	 * at a time looking for the end of string (null char).
     39 	 */
     40 	ENTRY(strlen)			/* (const char *s) */
     41 	mov	%rdi, %rsi		/* keep original %rdi value */
     42 	mov	%rsi, %rcx
     43 	pxor	%xmm0, %xmm0		/* 16 null chars */
     44 	and	$15, %rcx
     45 	jz	LABEL(align16_loop)	/* string is 16 byte aligned */
     46 
     47 	/*
     48 	 * Unaligned case. Round down to 16-byte boundary before comparing
     49 	 * 16 bytes for a null char. The code then compensates for any extra chars
     50 	 * preceding the start of the string.
     51 	 */
     52 LABEL(unalign16):
     53 	and	$0xfffffffffffffff0, %rsi
     54 
     55 	pcmpeqb	(%rsi), %xmm0
     56 	lea	16(%rdi), %rsi
     57 	pmovmskb %xmm0, %edx
     58 
     59 	shr	%cl, %edx		/* Compensate for bytes preceding the string */
     60 	test	%edx, %edx
     61 	jnz	LABEL(exit)
     62 	sub	%rcx, %rsi		/* no null, adjust to next 16-byte boundary */
     63 	pxor	%xmm0, %xmm0		/* clear xmm0, may have been changed... */
     64 
     65 	.p2align 4
     66 LABEL(align16_loop):			/* 16 byte aligned */
     67 	pcmpeqb	(%rsi), %xmm0		/* look for null bytes */
     68 	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */
     69 
     70 	add	$16, %rsi		/* prepare to search next 16 bytes */
     71 	test	%edx, %edx		/* if no null byte, %edx must be 0 */
     72 	jnz	LABEL(exit)		/* found a null */
     73 
     74 	pcmpeqb	(%rsi), %xmm0
     75 	pmovmskb %xmm0, %edx
     76 	add	$16, %rsi
     77 	test	%edx, %edx
     78 	jnz	LABEL(exit)
     79 
     80 	pcmpeqb	(%rsi), %xmm0
     81 	pmovmskb %xmm0, %edx
     82 	add	$16, %rsi
     83 	test	%edx, %edx
     84 	jnz	LABEL(exit)
     85 
     86 	pcmpeqb	(%rsi), %xmm0
     87 	pmovmskb %xmm0, %edx
     88 	add	$16, %rsi
     89 	test	%edx, %edx
     90 	jz	LABEL(align16_loop)
     91 
     92 	.p2align 4
     93 LABEL(exit):
     94 	neg	%rdi
     95 	/*
     96 	 * Check to see if BSF is fast on this processor. If not, use a different
     97 	 * exit tail to find first bit set indicating null byte match.
     98 	 */
     99 	testl	$USE_BSF, .memops_method(%rip)
    100 	jz	LABEL(AMD_exit)
    101 
    102 	lea	-16(%rdi, %rsi), %rax	/* calculate exact offset */
    103 	bsf	%edx, %ecx		/* Least significant 1 bit is index of null */
    104 	lea	(%rax, %rcx),%rax
    105 	ret
    106 
    107 	/*
    108 	 * This exit tail does not use the bsf instruction.
    109 	 */
    110 	.p2align 4
    111 LABEL(AMD_exit):
    112 	lea	-16(%rdi, %rsi), %rax
    113 	test	%dl, %dl
    114 	jz	LABEL(exit_high)
    115 	test	$0x01, %dl
    116 	jnz	LABEL(exit_tail0)
    117 
    118 	test	$0x02, %dl
    119 	jnz	LABEL(exit_tail1)
    120 
    121 	.p2align 4
    122 	test	$0x04, %dl
    123 	jnz	LABEL(exit_tail2)
    124 
    125 	test	$0x08, %dl
    126 	jnz	LABEL(exit_tail3)
    127 
    128 	test	$0x10, %dl
    129 	jnz	LABEL(exit_tail4)
    130 
    131 	test	$0x20, %dl
    132 	jnz	LABEL(exit_tail5)
    133 
    134 	test	$0x40, %dl
    135 	jnz	LABEL(exit_tail6)
    136 	add	$7, %rax
    137 	ret
    138 
    139 	.p2align 4
    140 LABEL(exit_high):
    141 	add	$8, %rax
    142 	test	$0x01, %dh
    143 	jnz	LABEL(exit_tail0)
    144 
    145 	test	$0x02, %dh
    146 	jnz	LABEL(exit_tail1)
    147 
    148 	test	$0x04, %dh
    149 	jnz	LABEL(exit_tail2)
    150 
    151 	test	$0x08, %dh
    152 	jnz	LABEL(exit_tail3)
    153 
    154 	test	$0x10, %dh
    155 	jnz	LABEL(exit_tail4)
    156 
    157 	test	$0x20, %dh
    158 	jnz	LABEL(exit_tail5)
    159 
    160 	test	$0x40, %dh
    161 	jnz	LABEL(exit_tail6)
    162 	add	$7, %rax
    163 	ret
    164 
    165 	.p2align 4
    166 LABEL(exit_tail0):
    167 	xor	%ecx, %ecx
    168 	ret
    169 
    170 	.p2align 4
    171 LABEL(exit_tail1):
    172 	add	$1, %rax
    173 	ret
    174 
    175 	.p2align 4
    176 LABEL(exit_tail2):
    177 	add	$2, %rax
    178 	ret
    179 
    180 	.p2align 4
    181 LABEL(exit_tail3):
    182 	add	$3, %rax
    183 	ret
    184 
    185 	.p2align 4
    186 LABEL(exit_tail4):
    187 	add	$4, %rax
    188 	ret
    189 
    190 	.p2align 4
    191 LABEL(exit_tail5):
    192 	add	$5, %rax
    193 	ret
    194 
    195 	.p2align 4
    196 LABEL(exit_tail6):
    197 	add	$6, %rax
    198 	ret
    199 	SET_SIZE(strlen)
    200