Fixed Point Square Root Improvements?

From: Chris Williams (spamtrap_at_crayne.org)
Date: 11/25/04


Date: 24 Nov 2004 20:14:06 -0800

I will be working with an embedded system with no FPU and need to be
able to find the square root of a 16.16 fixed point integer.

My current design is using a lookup table where the index is the root
(with two decimal bits) of the value pointed to (with four decimal
bits.) This doesn't cover every possible value, so I do a search for
the nearest value that is contained in the array.

Building the table:

sqrtTable[0x400];
for (L = 0; L < 0x400; L++) {
        sqrtTable[L] = (L * L);
}

(Actually I write the results out to a file for direct compilation in)

And my current function is as such:

__inline unsigned int fsqrt(unsigned int value) { //square root
function which works on a 16/16 fixed point integer

#ifndef PC //straight C code

        unsigned int testValue;

        unsigned int low = 0;
        unsigned int high = 0x3ff;
        unsigned int curr = 0x1ff;

        value >>= 12;

        while (1) {
                testValue = sqrtTable[curr];

                if (value == testValue) {
                        break;
                }
                else if (value < testValue) {
                        high = curr;
                }
                else {
                        low = curr;
                }

                if ((low + 1) == high || low == high) {
                        curr = low;
                        break;
                }

                curr = (low + high) >> 1;
        }

        return (curr << 14);

#else //PC only assembly
        unsigned int returnValue;

        _asm {
                mov eax, 0;
                mov edx, 0x3ff;
                mov edi, 0x1ff;

                lea esi, sqrtTable;

                mov ecx, value;
                shr ecx, 12;

                LOOP_TOP:
                        mov ebx, [esi + edi * 4];
                        cmp ebx, ecx;

                        je LOOP_END;
                        jl LESS;
                                mov edx, edi;

                                add edi, eax;
                                shr edi, 1;
                                jmp COMPARE_END;

                        LESS:
                                mov eax, edi;

                                add edi, edx;
                                shr edi, 1;
                        COMPARE_END:

                        mov ebx, eax;
                        cmp ebx, edx;
                        je CONVERGED;

                        inc ebx;
                        cmp ebx, edx;
                        je CONVERGED;

                        jmp LOOP_TOP;

                        CONVERGED:
                                mov edi, eax;
                LOOP_END:

                shl edi, 14;
                mov returnValue, edi;
        }
        return returnValue;

#endif
}

The C version appears to work about 2.2 times slower than C's sqrt()
on a float and the assembly at about 1.5.
Was just wondering if anyone had any recommendations? I am relatively
pleased to have gotten that close to the standard library, but
certainly wouldn't complain if I could get more speed and/or more
decimal bits with a different approach.

Thank you,
Chris Williams