Re: COMPARE HLL/ASM




OK, here is the whole story now:

It's not optimised at all, and there may well be faster algorithms,
but it contains not a single branch and uses only four general-purpose
registers, so its timing does not depend on the input values.

I got ~38 cycles on an AMD K7 under KESYS (cache-aligned, prefetched),
and an average of ~150 cycles on an AMD64 running XP Home.
It may be a bit slower on Intel CPUs because of the shifts.

As I just saw in the other post, you meant a 64-bit result,
so I expanded my first version to 128->64 bits. It now shows
85 cycles on the KESYS K7 and ~300 under Windows.
It seems the Microsoft stuff has heavy cache issues ;)

So I also tried a short code version and, what a surprise, it again
takes ~150 cycles per pass under Windows (~95 on my own system).
I think we are mostly measuring cache-miss penalties there, and the time
our code itself takes is only a minor factor under Windows.
I hope you can get better figures with Linux.
__
wolfgang


the short version (complete copy)[RosAsm]:

; ---- Data -------------------------------------------------------------
; NOTE(review): the code refers to these symbols in lower case (time,
; hstring, result, resultH) although they are declared with capitals; this
; presumably relies on the assembler ignoring case in symbol names - confirm.
;
; StdH    : receives the value returned by GetStdHandle (not used afterwards
;           in this listing).
; Time    : two values; first the start time-stamp (low at Time, high at
;           Time+4), later overwritten with the elapsed count.
; Hstring : the 16-character ASCII hex text to convert, stored as bytes.
; Result / ResultH : conversion output.
[StdH: 0]
[Time: 0 0]
[Hstring: B$'ffeeddccFEDCBA98']
[Result: 0]
[ResultH: 0]
; ---- Timing harness: prologue -------------------------------------------
main:
; 0-11 = -11, which is STD_OUTPUT_HANDLE in the Win32 API. The handle is only
; stored; per the author's remark below the call is a dummy.
push 0-11 |call 'KERNEL32.GetStdHandle' |mov D$StdH eax
;a dummy (the call above) to make the code + data pages alive before timing
_______________________________________
; CPUID (leaf 0) is executed only for its serialising effect, so earlier
; instructions have completed before the time-stamp is read.
; It overwrites eax, ebx, ecx and edx.
MOV eax 0
CPUID
; Start time-stamp: RDTSC returns the counter in edx:eax; keep it in Time.
RDTSC |mov D$time eax |mov D$time+4 edx
__________;TEST-AREA insert your code under test here:

; hex2bin (short, looping version)
; Converts the ASCII hex text at hstring into binary bytes stored from
; result upward. The text is read two characters at a time from its END
; towards its start while the write pointer moves upward, so the rightmost
; digit pair becomes the lowest-addressed output byte.
; In:   nothing; the addresses and the length (16) are hard-coded below.
; Out:  ecx/2 = 8 bytes written starting at result.
;       NOTE(review): that is presumably more than Result alone holds, so
;       the upper half is expected to land in ResultH declared right after
;       it - confirm the data layout.
; Uses: eax, ecx, esi, edi, flags.
; The input is not validated: only 0-9, A-F and a-f give meaningful output.
hex2bin: ;assuming string not empty and byte aligned (even size)
mov esi,hstring
mov edi,result
mov ecx,16 ;str-len
ALIGN 4
; ax = character pair ending at offset ecx: al = first (left) char, ah = second.
L0: mov ax, W$esi+ecx-2
; Subtract '0' from both bytes, then clear bits 5 and 7 of each byte, which
; folds lower case onto upper case:
;   '0'..'9' -> 00h..09h      'A'..'F' and 'a'..'f' -> 11h..16h
sub ax,3030h
and ax,5f5fh
; A value of 0Ah or more is a letter: 11h..16h - 7 = 0Ah..0Fh.
cmp al,0ah |jc L1> |sub al,7
L1: cmp ah,0ah |jc L2> |sub ah,7
; Pack: left digit into the high nibble, right digit into the low nibble.
L2: shl al,4 |or al ah
; Store the byte, advance the write pointer, step two characters to the left;
; the loop ends when ecx reaches 0.
mov B$edi,al |inc edi |sub ecx,2|jnz L0<
done:
___________;end of TEST area

; ---- Timing harness: epilogue -------------------------------------------
; Stop the clock: Time = (time-stamp now) - (time-stamp saved at the start).
; sub handles the low dword, sbb propagates the borrow into the high dword.
; NOTE(review): no serialising instruction precedes this RDTSC, so it is not
; ordered against the code under test; left unchanged so the figures stay
; comparable with earlier measurements.
RDTSC |sub eax D$time |sbb edx D$time+4 |mov D$time eax |mov D$time+4 edx
int3 ;break into the debugger: read Time / Result / ResultH in the debug view
; ExitProcess is a stdcall function taking one argument (the exit code).
; Pass 0 explicitly and CALL it; jumping to it without pushing an argument
; would make it take whatever happens to be on the stack as the exit code.
push 0 |call 'KERNEL32.ExitProcess'
____________________________________


____________________________________
The unrolled variant: (test code only, assumes a 16 byte string)

; hex2bin (unrolled, branch-free version; test code for a 16-character string)
; Converts the 16 ASCII hex digits at Hstring into two dwords:
;   result  <- value of digits 8..15 (the rightmost eight)
;   resultH <- value of digits 0..7  (the leftmost eight)
; The same 8-digit sequence is written out twice, once per output dword.
; Uses: eax, ebx, ecx, edx, flags; nothing is saved or restored.
; The input is not validated: every byte is assumed to be 0-9, A-F or a-f.
hex2bin:
; ---- first pass: digits 8..15 -> result ----
MOV eax D$Hstring+8
MOV edx D$Hstring+12
;hex2bin: ;** edx:eax to eax **
; The string's least significant digit is assumed to be rightmost; BSWAP puts
; the leftmost character of each group of four into the top byte.
BSWAP eax ;eax = chars 8..11, char 8 in the top byte
BSWAP edx ;edx = chars 12..15, char 12 in the top byte
; Per byte: subtract '0', then clear bits 5 and 7 (folds a-f onto A-F).
;   '0'..'9' -> 00h..09h      letters -> 11h..16h
SUB eax,30303030h
SUB edx,30303030h
AND eax,5f5f5f5fh ;make it upper case
AND edx,5f5f5f5fh
MOV ebx,eax ;working copy of chars 8..11
MOV ecx,edx ;working copy of chars 12..15
; Bit 4 of a byte is set only for letters. Isolate it, move it to bit 0 and
; multiply by 7: each byte of eax/edx becomes 7 for a letter, 0 for a digit.
AND eax,10101010h
AND edx,10101010h
SHR eax,4
SHR edx,4
IMUL eax,7
IMUL edx,7
; Letter correction without a branch: 11h..16h - 7 = 0Ah..0Fh.
SUB ebx,eax
SUB ecx,edx ; now one hex digit value (0..0Fh) per byte,
; leftmost digit in the top byte of ebx, rightmost in the low byte of ecx.
;int3
;packBCD: ; ebx:ecx ->eax
; Pack eight one-digit bytes into one dword, two digits per byte.
MOV eax ebx ; start with the top two digits
SHR eax 16 ;ah = digit 8, al = digit 9
SHL ah 4
OR ah al ;ah = packed digits 8,9 (top byte of the result)
SHL eax 8 ;move it up, freeing ax for the next pair
MOV ax bx ;ah = digit 10, al = digit 11
SHL ah 4
OR ah al ;ah = packed digits 10,11
SHL eax 8 ;the two high bytes are done now

MOV ax,cx ;ah = digit 14, al = digit 15 (least significant)
SHL ah,4
OR al,ah ;al = packed digits 14,15 (low byte of the result)
SHR ecx 16 ;ch = digit 12, cl = digit 13
MOV ah cl
SHL ch 4
OR ah ch ;ah = packed digits 12,13
;first dword done:
MOV D$result eax

; ---- second pass: digits 0..7 -> resultH ----
; Same instruction sequence as the first pass; only the source offsets
; (Hstring, Hstring+4) and the destination (resultH) differ.
MOV eax D$Hstring
MOV edx D$Hstring+4
;hex2bin: ;** edx:eax to eax **
BSWAP eax ;eax = chars 0..3, char 0 in the top byte
BSWAP edx ;edx = chars 4..7, char 4 in the top byte
SUB eax,30303030h
SUB edx,30303030h
AND eax,5f5f5f5fh
AND edx,5f5f5f5fh
MOV ebx,eax ;working copy of chars 0..3
MOV ecx,edx ;working copy of chars 4..7
AND eax,10101010h
AND edx,10101010h
SHR eax,4
SHR edx,4
IMUL eax,7
IMUL edx,7
SUB ebx,eax
SUB ecx,edx ; now one hex digit value (0..0Fh) per byte,
; leftmost digit in the top byte of ebx, rightmost in the low byte of ecx.
;packBCD: ; ebx:ecx ->eax
MOV eax ebx ; start with the top two digits
SHR eax 16 ;ah = digit 0, al = digit 1
SHL ah 4
OR ah al ;ah = packed digits 0,1 (top byte of the result)
SHL eax 8
MOV ax bx ;ah = digit 2, al = digit 3
SHL ah 4
OR ah al ;ah = packed digits 2,3
SHL eax 8 ;the two high bytes are done now

MOV ax,cx ;ah = digit 6, al = digit 7
SHL ah,4
OR al,ah ;al = packed digits 6,7 (low byte of the result)
SHR ecx 16 ;ch = digit 4, cl = digit 5
MOV ah cl
SHL ch 4
OR ah ch ;ah = packed digits 4,5
;second dword done:
MOV D$resultH eax
__


.



Relevant Pages