Re: bsr-algorithm?
- From: spamtrap <spamtrap@xxxxxxxxxx>
- Date: Fri, 23 May 2008 23:16:41 -0700 (PDT)
Hy;
Anybody got an idea about a branch-free 'emulation' of bsr?
Here it is - the code below treats input_vector as four 16-bit words
and performs BSR on them, leaving the results in mm0. For 0 values
it returns 0xffff, just like your fbpos function.
J.
Hm, it's the 'normal' log2-algorithm, right?
/* first-bit position */
static int fbpos(short int e) {
int f = e, fb = -1;
if (e > 0x00FF) fb += 8, e >>= 8;
if (e > 0x000F) fb += 4, e >>= 4;
if (e > 0x0003) fb += 2, e >>= 2;
if (e > 0x0001) fb += 1;
if (f > 0x0000) fb += 1;
return fb;
}
Oh, man, sometimes you don't see the forest for the trees, thanks.
It's integer only, which is great (MMX), you can do some more:
/* first-bit position */
static int fbpos(short int e) {
int s = 8;
int fb = (e ? 0 : -1);
int m = (e ? -1U : 0) >> s;
if (e > m) fb += s, e >>= s; s >>= 1; m >>= s;
if (e > m) fb += s, e >>= s; s >>= 1; m >>= s;
if (e > m) fb += s, e >>= s; s >>= 1; m >>= s;
if (e > m) fb += s;
return fb;
}
In MMX then:
const __int64 halfsize = 0x0000000000000008;
__asm {
movq mm1, [ input_vector ] ; e
pxor mm0, mm0
pxor mm3, mm3
pcmpeqw mm0, mm1 ; fb
movq mm2, [ halfsize ] ; s
pcmpeqw mm3, mm0 ; m
psrlw mm3, mm2 ; m >> s
; - 1st partition ----------------------------------------
movq mm4, mm1 ; e
pcmpgtw mm4, mm3 ; e > m
pand mm4, mm2 ; s<>0
padd mm0, mm4 ; fb += s<>0
psrlw mm2, 1 ; s >> 1
psrlw mm1, mm4 ; e >>= s<>0
psrlw mm3, mm2 ; m >> s
; - 2nd partition ----------------------------------------
movq mm4, mm1 ; e
pcmpgtw mm4, mm3 ; e > m
pand mm4, mm2 ; s<>0
padd mm0, mm4 ; fb += s<>0
psrlw mm2, 1 ; s >> 1
psrlw mm1, mm4 ; e >>= s<>0
psrlw mm3, mm2 ; m >> s
; - 3rd partition ----------------------------------------
movq mm4, mm1 ; e
pcmpgtw mm4, mm3 ; e > m
pand mm4, mm2 ; s<>0
padd mm0, mm4 ; fb += s<>0
psrlw mm2, 1 ; s >> 1
psrlw mm1, mm4 ; e >>= s<>0
psrlw mm3, mm2 ; m >> s
; - 4th partition ----------------------------------------
movq mm4, mm1 ; e
pcmpgtw mm4, mm3 ; e > m
pand mm4, mm2 ; s<>0
padd mm0, mm4 ; fb += s<>0
}
Nice, thanks!
Niels
.
- References:
- bsr-algorithm?
- From: spamtrap
- Re: bsr-algorithm?
- From: Jacek Wawrzaszek
- bsr-algorithm?
- Prev by Date: Re: bsr-algorithm?
- Next by Date: read floppy problem
- Previous by thread: Re: bsr-algorithm?
- Next by thread: AP-67 82C37 Application Note
- Index(es):