Re: improve strlen



jukka,

It probably depends wha you are running it on and how you have set it
up. It should be 16 byte aligned. This was tested on a PIV 2.8 gig
Prescott but if you have inlined this in a C compiler, check a few
things like how it protects registers on entry and exit and what
alignment it is running.

These are the timings I get wth a MASM test piece.

szLen_xx is the first version
szLen_x2 is the second version

Prescott PIV 2.8 gig
--------------------
1282 MS szLen_xx
859 MS szLen_x2
1266 MS szLen_xx
859 MS szLen_x2
1266 MS szLen_xx
859 MS szLen_x2
1266 MS szLen_xx
859 MS szLen_x2

AMD Sempron 2.4
---------------
1859 MS szLen_xx
1688 MS szLen_x2
1875 MS szLen_xx
1687 MS szLen_x2
1875 MS szLen_xx
1688 MS szLen_x2
1875 MS szLen_xx
1687 MS szLen_x2

> Unrolling like that kinda kills branch prediction, I suppose. And burns
> code/tracecache for no real gain..

Branch prediction penalties need to be weighed if in fact they are a
factor against jump reduction which is generally more useful as taken
jumps are usually slower than fallen through jumps. In te unroll the
unpredicted jump is only taken once on exit so its not a factor in loop
code.

Thos is the actual form of the MASM code that I derived the two posting
from.

OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE

align 16

szLen_x2 proc item:DWORD

mov [esp-4], esi
mov [esp-8], edi
mov [esp-12], ebx
mov [esp-16], ebp

mov ebx, 80808080h ; load immediates into registers
mov ebp, 4

mov eax, [esp+4]
mov ecx, eax ; copy EAX to ECX
add ecx, 3 ; align up by 4
and ecx, -4
sub ecx, eax ; calculate any misalignment in ecx
mov esi, ecx ; store ECX in ESI
jz proceed

sub eax, 1
@@:
add eax, 1
cmp BYTE PTR [eax], 0 ; scan for terminator for
je quit ; up to the 1st 3 bytes
sub ecx, 1
jns @B
jmp proceed

quit:
sub eax, [esp+4] ; calculate length if terminator
jmp outa_here ; is found in 1st 3 bytes

; ----------------

proceed: ; proceed with the rest
lea edx, [eax+3] ; pointer+3 used in the end

align 4
@@:
REPEAT 7
mov edi, [eax] ; read first 4 bytes
add eax, ebp ; increment pointer
lea ecx, [edi-01010101h] ; subtract 1 from each byte
not edi ; invert all bytes
and ecx, edi ; and these two
and ecx, ebx
jnz nxt ; exit loop on zero bytes
ENDM

mov edi, [eax] ; read first 4 bytes
add eax, ebp ; increment pointer
lea ecx, [edi-01010101h] ; subtract 1 from each byte
not edi ; invert all bytes
and ecx, edi ; and these two
and ecx, ebx
jz @B ; no zero bytes, continue loop

nxt:
test ecx, 00008080h ; test first two bytes
jnz @F
shr ecx, 16 ; not in the first 2 bytes
add eax, 2
@@:
shl cl, 1 ; use carry flag to avoid branch
sbb eax, edx ; compute length
add eax, esi ; add misalignment count

outa_here:
mov esi, [esp-4]
mov edi, [esp-8]
mov ebx, [esp-12]
mov ebp, [esp-16]

ret 4

szLen_x2 endp

OPTION PROLOGUE:PrologueDef
OPTION EPILOGUE:EpilogueDef

Regards,

hutch at movsd dot com

.



Relevant Pages