Re: improve strlen



jukka,

Here is a slightly tweaked version of the algorithm I posted. It unrolls
a block of code by 8 and replaces an immediate in the loop code with the
same value held in a spare register. On my test PIV it clocks about 22%
faster than the last version I posted.

I have done all of the testing on strings that are misaligned so that
the alignment code is forced to run.

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

; ---------------------------------------------------------------------------
; fn_00401460 - length of a zero-terminated byte string (strlen).
;
; In:     [esp+4] on entry = address of the string (one dword argument)
; Out:    eax = number of bytes that precede the first zero byte
; Keeps:  esi, edi, ebx, ebp (pushed on entry, popped before returning)
; Scratch: ecx, edx, flags
; Exit:   ret 4 - the routine removes its own argument from the stack
;
; Method: bytes are tested one at a time until the pointer is 4-byte
; aligned; after that the string is scanned one aligned dword at a time
; using
;       (x - 01010101h) AND (NOT x) AND 80808080h
; which is non-zero exactly when x contains a zero byte, and whose lowest
; set bit marks the first zero byte. The dword test is unrolled 8 times.
; Because whole aligned dwords are loaded, up to 3 bytes that follow the
; terminator inside the same aligned dword are read as well.
;
; Register roles in the dword scan:
;       eax = scan pointer (already advanced past the dword just loaded)
;       ebx = 80808080h    (bit 7 of every byte lane)
;       ebp = 4            (pointer step, kept in a register)
;       edx = aligned scan start + 3
;       esi = number of bytes between the argument and the aligned start
;       edi = current dword / its complement,  ecx = zero-byte mask
;
; The preserved registers are pushed instead of being parked at negative
; offsets from esp: 32-bit x86 has no red zone, so memory below the stack
; pointer may be overwritten at any moment by anything that borrows this
; thread's stack (exception dispatch, a debugger, signal-style delivery).
; ---------------------------------------------------------------------------
fn_00401460:

        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     ebx, 80808080h          ; lane high-bit mask for the zero test
        mov     ebp, 4                  ; dword stride

        mov     eax, [esp+14h]          ; string address: 4 pushes + return address
        mov     ecx, eax
        add     ecx, 3
        and     ecx, 0FFFFFFFCh         ; ecx = address rounded up to a multiple of 4
        sub     ecx, eax                ; ecx = bytes up to that boundary (0..3)
        mov     esi, ecx                ; remembered for the final length
        jz      lbl2                    ; already aligned: straight to the dword scan
        sub     eax, 1                  ; cancel the first increment at lbl0

; Byte-wise head. ecx counts down and the loop only leaves once it has gone
; negative, so it runs one time more than the byte count: the first aligned
; byte is tested here too, and eax is left on the aligned address.
lbl0:
        add     eax, 1
        cmp     BYTE PTR [eax], 0
        jz      lbl1
        sub     ecx, 1
        jns     lbl0
        jmp     lbl2

; Terminator found before the dword scan started.
lbl1:
        sub     eax, [esp+14h]          ; length = terminator address - string address
        jmp     lbl6

lbl2:
        lea     edx, [eax+3]            ; bias consumed by the sbb at lbl5

; The original had a hard-coded 2-byte no-op (mov edi, edi) at this point,
; presumably alignment padding; the directive stays correct if the code
; above changes size.
        align   4

; Dword scan, unrolled 8 times. Each step loads a dword, advances eax past
; it and leaves the zero-byte mask in ecx.
lbl3:
        mov     edi, [eax]
        add     eax, ebp
        lea     ecx, [edi-1010101h]
        not     edi
        and     ecx, edi
        and     ecx, ebx
        jnz     lbl4

        mov     edi, [eax]
        add     eax, ebp
        lea     ecx, [edi-1010101h]
        not     edi
        and     ecx, edi
        and     ecx, ebx
        jnz     lbl4

        mov     edi, [eax]
        add     eax, ebp
        lea     ecx, [edi-1010101h]
        not     edi
        and     ecx, edi
        and     ecx, ebx
        jnz     lbl4

        mov     edi, [eax]
        add     eax, ebp
        lea     ecx, [edi-1010101h]
        not     edi
        and     ecx, edi
        and     ecx, ebx
        jnz     lbl4

        mov     edi, [eax]
        add     eax, ebp
        lea     ecx, [edi-1010101h]
        not     edi
        and     ecx, edi
        and     ecx, ebx
        jnz     lbl4

        mov     edi, [eax]
        add     eax, ebp
        lea     ecx, [edi-1010101h]
        not     edi
        and     ecx, edi
        and     ecx, ebx
        jnz     lbl4

        mov     edi, [eax]
        add     eax, ebp
        lea     ecx, [edi-1010101h]
        not     edi
        and     ecx, edi
        and     ecx, ebx
        jnz     lbl4

        mov     edi, [eax]
        add     eax, ebp
        lea     ecx, [edi-1010101h]
        not     edi
        and     ecx, edi
        and     ecx, ebx
        jz      lbl3                    ; no zero byte in these 32 bytes: go round again

; A zero byte is in the dword just tested; eax points 4 bytes past that
; dword and ecx has bit 7 set in the lane of the first zero byte.
lbl4:
        test    ecx, 8080h              ; terminator in one of the two low lanes?
        jnz     lbl5
        shr     ecx, 10h                ; no: bring the two high lanes down...
        add     eax, 2                  ; ...and account for the two bytes skipped

; eax - edx is now (offset of this byte pair from the aligned start) + 1.
; shl moves the lower lane's flag into CF, so sbb subtracts one more when
; the terminator is the lower byte of the pair and nothing more when it is
; the upper byte.
lbl5:
        shl     cl, 1
        sbb     eax, edx
        add     eax, esi                ; plus the bytes before the aligned start

lbl6:
        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     4

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

Regards,

hutch at movsd dot com


.



Relevant Pages