Re: improve strlen
- From: "hutch--" <spamtrap@xxxxxxxxxx>
- Date: Wed, 26 Oct 2005 17:51:53 +0000 (UTC)
jukka,
It probably depends wha you are running it on and how you have set it
up. It should be 16 byte aligned. This was tested on a PIV 2.8 gig
Prescott but if you have inlined this in a C compiler, check a few
things like how it protects registers on entry and exit and what
alignment it is running.
These are the timings I get wth a MASM test piece.
szLen_xx is the first version
szLen_x2 is the second version
Prescott PIV 2.8 gig
--------------------
1282 MS szLen_xx
859 MS szLen_x2
1266 MS szLen_xx
859 MS szLen_x2
1266 MS szLen_xx
859 MS szLen_x2
1266 MS szLen_xx
859 MS szLen_x2
AMD Sempron 2.4
---------------
1859 MS szLen_xx
1688 MS szLen_x2
1875 MS szLen_xx
1687 MS szLen_x2
1875 MS szLen_xx
1688 MS szLen_x2
1875 MS szLen_xx
1687 MS szLen_x2
> Unrolling like that kinda kills branch prediction, I suppose. And burns
> code/tracecache for no real gain..
Branch prediction penalties need to be weighed if in fact they are a
factor against jump reduction which is generally more useful as taken
jumps are usually slower than fallen through jumps. In te unroll the
unpredicted jump is only taken once on exit so its not a factor in loop
code.
Thos is the actual form of the MASM code that I derived the two posting
from.
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
align 16
szLen_x2 proc item:DWORD
mov [esp-4], esi
mov [esp-8], edi
mov [esp-12], ebx
mov [esp-16], ebp
mov ebx, 80808080h ; load immediates into registers
mov ebp, 4
mov eax, [esp+4]
mov ecx, eax ; copy EAX to ECX
add ecx, 3 ; align up by 4
and ecx, -4
sub ecx, eax ; calculate any misalignment in ecx
mov esi, ecx ; store ECX in ESI
jz proceed
sub eax, 1
@@:
add eax, 1
cmp BYTE PTR [eax], 0 ; scan for terminator for
je quit ; up to the 1st 3 bytes
sub ecx, 1
jns @B
jmp proceed
quit:
sub eax, [esp+4] ; calculate length if terminator
jmp outa_here ; is found in 1st 3 bytes
; ----------------
proceed: ; proceed with the rest
lea edx, [eax+3] ; pointer+3 used in the end
align 4
@@:
REPEAT 7
mov edi, [eax] ; read first 4 bytes
add eax, ebp ; increment pointer
lea ecx, [edi-01010101h] ; subtract 1 from each byte
not edi ; invert all bytes
and ecx, edi ; and these two
and ecx, ebx
jnz nxt ; exit loop on zero bytes
ENDM
mov edi, [eax] ; read first 4 bytes
add eax, ebp ; increment pointer
lea ecx, [edi-01010101h] ; subtract 1 from each byte
not edi ; invert all bytes
and ecx, edi ; and these two
and ecx, ebx
jz @B ; no zero bytes, continue loop
nxt:
test ecx, 00008080h ; test first two bytes
jnz @F
shr ecx, 16 ; not in the first 2 bytes
add eax, 2
@@:
shl cl, 1 ; use carry flag to avoid branch
sbb eax, edx ; compute length
add eax, esi ; add misalignment count
outa_here:
mov esi, [esp-4]
mov edi, [esp-8]
mov ebx, [esp-12]
mov ebp, [esp-16]
ret 4
szLen_x2 endp
OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef
Regards,
hutch at movsd dot com
.
- Follow-Ups:
- Re: improve strlen
- From: jukka@xxxxxxxxxxxx
- Re: improve strlen
- References:
- improve strlen
- From: Claudio Daffra
- Re: improve strlen
- From: spamtrap
- Re: improve strlen
- From: hutch--
- Re: improve strlen
- From: spamtrap
- Re: improve strlen
- From: jukka@xxxxxxxxxxxx
- Re: improve strlen
- From: jukka@xxxxxxxxxxxx
- Re: improve strlen
- From: jukka@xxxxxxxxxxxx
- Re: improve strlen
- From: hutch--
- Re: improve strlen
- From: jukka@xxxxxxxxxxxx
- improve strlen
- Prev by Date: Re: compiler generated output
- Next by Date: Re: compiler generated output
- Previous by thread: Re: improve strlen
- Next by thread: Re: improve strlen
- Index(es):
Relevant Pages
|