Re: Optimization Questions
- From: "Mark_Larson" <spamtrap@xxxxxxxxxx>
- Date: 19 Mar 2007 11:15:13 -0700
On Mar 15, 2:46 pm, Randy Yates <spamt...@xxxxxxxxxx> wrote:
Thanks for all the references and examples, Mark.
FYI, here's my convolution routine. It's designed for use in a very
long FIR filter (>2000 taps), so instead of coding both the inner
and outer loops in assembly, I just coded the inner one, which is
where 99% of the time is.
I know I could unroll the loop, but basically I'm lazy. If anyone
sees any further optimizations that could be done with a single
loop, then please tell. By the way, the data can't be aligned on
8-word boundaries, hence I can't use MOVDQA's to load the data. The
coefficients are aligned.
--Randy
;
;***********************************************************************************************************************************
;
; Module: DecimatorMAC (Decimator Multiply-Accumulate)
;
; Programmer: Randy Yates
;
; Function:
;
; This function performs an inner product between the coefficients and the input data for use in the
; BTSC Decoder decimator routine. It computes a single output sample of an FIR filter.
;
; Register State Upon Entry: standard C calling convention
;
; Register State Upon Exit:
;
; EAX = inner product result in lower 16 bits
;
; C Prototype:
;
; int16_t DecimatorMAC(int16_t* inpPtr, uint32_t inpIdx, uint32_t inpMsk)
;
; Comments, Limitations, Assumptions:
;
; 1. Assumes processor has SSE2 extensions.
;
; 2. Assumes the coefficient length is a multiple of 8.
;
; 3. Assumes the coefficient order is reversed. Note that, for a symmetric filter, this is redundant
;
; 4. Assumes gcc C calling conventions (is this different than standard C calling conventions?)
;
;***********************************************************************************************************************************
;
GLOBAL DecimatorMAC
;
EXTERN decimatorFilterCoef,decimatorFilterLength
;
; Passed parameter stack offset definitions
;
prmPtr EQU 20
prmIdx EQU 24
prmMsk EQU 28
;
SECTION .data
;
; temporary storage for 8 samples (16 bytes) when the circular buffer boundary is crossed
;
boundarySamples:
;
DW 0, 0, 0, 0, 0, 0, 0, 0
;
k0: DD 0 ; intermediate loop counts - see description below
k1: DD 0
k2: DD 0
;
j DW 0
;
SECTION .text
;
DecimatorMAC:
;
; save registers which the caller needs preserved (gcc C calling conventions)
;
push ebp
push esi
push edi
push ebx
;
mov ebp, esp
;
mov esi, [ebp+prmPtr] ; esi is input data pointer
mov ebx, [ebp+prmMsk] ; ebx is mask
mov edi, [ebp+prmIdx] ; edi is data index
mov ecx, [decimatorFilterLength] ; ecx is decrementing loop counter
xor edx, edx ; edx is incrementing loop counter
;
; The standard convolution kernel is
;
; y[n] = sum_{m=0}^{M-1} h[m] * x[n-m].
;
; We could just as easily (and equivalently) let m run backwards:
;
; y[n] = sum_{m=M-1}^{0} h[m] * x[n-m].
;
; If we let k = M-1-m ==> m = M-1-k, this becomes
;
; y[n] = sum_{k=0}^{M-1} h[M-1-k] * x[n-M+1+k].
;
; The coefficients are in reverse order
;
; b[0] = h[M-1]
; b[1] = h[M-2]
; .
; .
; .
; b[M-1] = h[0]
;
; which implies b[k] = h[M-1-k] - precisely the expression above.
; Using k, the index into the data runs positive. Thus we need to
; compute n-M+1 as the starting point in the data, where n = inpIdx
; and M = decimatorFilterLength:
;
sub edi, ecx
add edi, 1
;
; use the mask to 'circularize" the index
;
and edi, ebx
;
; Compute the total SSE2 loop count k as the decimator filter length divided by 8 (we assume the
; decimator filter length is a multiple of eight) since we're operating on 8 16-bit samples at a time:
;
shr ecx, 3 ; divide ecx by 3 since we process 8 samples at a time
;
;********************
; Processing Approach
;********************
;
; Due to the possible disconuity in the input data across the circular buffer boundary,
; the processing is partitioned into three sections:
;
; 1. From n0, the starting point and the current value of [edi], to the last value n0 + 8*k0 - 1 such that
;
; n0 + 8*k0 - 1 <= M
;
; and
;
; n0 + 8*(k0+1) - 1 > M,
;
; where M = inpMsk. k0 is easily determined as
;
; k0 = min(floor((M + 1 - n0)/8), k),
;
; The SSE2 parallel computations are performed k0 times in this first section.
;
mov eax, ebx
add eax, 1
sub eax, edi
shr eax, 3
mov [k0], eax
sub eax, ecx
jna k0_OK
;
mov [k0], ecx
;
k0_OK:
;
; 2. In this section, k1 SSE2 parallel computations are performed using the temporary storage area as the
; data address, where
;
; k1 = min(k-k0, 1).
;
; Thus 0 or 8 samples are processed. The samples (if any) in this section may straddle the boundary of the
; circular buffer, so they are copied to the temporary storage area boundarySamples.
;
mov eax, ecx
sub eax, [k0]
je k1_OK
;
mov eax, 1
;
k1_OK:
;
mov [k1], eax
;
; 3. In this section, k2 SSE2 parallel computations are performed, where
;
; k2 = k - k0 - k1
;
mov eax, ecx
sub eax, [k0]
sub eax, [k1]
mov [k2], eax
;
; Initialize j, the section number (0, 1, 2) (kj)
;
mov word [j], 2
;
; Debugging label
;
kComplete:
;
; SSE2 register usage definition:
;
; xmm0 = input data and intermediate MAC (4 32-bit words)
; xmm1 = coef data
; xmm3 = quad 32-bit accumulator
;
; zero accumulator:
;
pxor xmm3, xmm3
;
; Get k0 in ecx for first section
;
mov ecx, [k0]
or ecx, ecx
je Checkj ; it's possible k0 = 0
;
; OK, we're all set: here's the main loop
;
MACLoop:
;
; Get 8 words of data:
;
; movdqu xmm0, [esi+2*edi]
movq xmm0, [esi+2*edi]
movhps xmm0, [esi+2*edi+8]
;
MACLoopResume:
;
; Get 8 words filter coefficients:
;
movdqa xmm1, [decimatorFilterCoef+2*edx]
;
; Do the double-quad intermediate MAC (xmm0):
;
pmaddwd xmm0, xmm1
;
; increment and circularize the data index
;
add edi, 8
and edi, ebx
;
; shift right arithmetically three to scale
;
psrad xmm0, 3
;
; decrement loop counter
;
add edx, 8
sub ecx, 1
;
; accumulate into xmm3
;
paddd xmm3, xmm0
jnz MACLoop
;
; check j
;
Checkj:
;
sub word [j], 1
je Sectionk2
jl Finished
;
Sectionk1
;
mov ecx, [k1]
or ecx, ecx
je Finished
;
; move 8 input samples into boundarySamples
;
mov ecx, 8
mov ebp, boundarySamples
;
WordMove:
;
mov ax, [esi+2*edi]
mov [ebp], ax
;
; increment and circularize the data index
;
add edi, 1
and edi, ebx
;
; increment the ebp index into the boundarySamples buffer
;
add ebp, 2
;
; decrement loop counter
;
sub ecx, 1
jnz WordMove
;
movdqu xmm0, [boundarySamples]
add ecx, 1
;
; radjust the data index
;
add edi, -8
and edi, ebx
jmp MACLoopResume
;
Sectionk2:
;
mov ecx, [k2]
or ecx, ecx
jne MACLoop
;
Finished:
;
; OK, we've got 4 intermediate 32-bit results packed into xmm3. Sum those
; and take the MSW and we've got our result.
;
; move result onto stack
;
movdqu [esp-16], xmm3
;
; move the first result into the accumulatr,
; then sum the other three
;
mov eax, dword [esp-16]
add eax, dword [esp-12]
add eax, dword [esp-08]
add eax, dword [esp-04]
;
; Finally, perform an arithmetic right shift by 16
;
shr eax, 16
;
; restore registers
;
RegisterRestore:
;
pop ebx
pop edi
pop esi
pop ebp
;
ret
;
END
--
% Randy Yates % "Remember the good old 1980's, when
%% Fuquay-Varina, NC % things were so uncomplicated?"
%%% 919-577-9882 % 'Ticket To The Moon'
%%%% <y...@xxxxxxxx> % *Time*, Electric Light Orchestrahttp://home.earthlink.net/~yatescr
couple quick tips
Get rid of the MOVDQU. MOVDQA takes 6 cycles on the p4, and MOVDQU
takes 10. That is almost twice as much! The trick is to align your
memory on a 16 byte boundary. I thought you said you were using
Linux? If so my red hat linux has support for a memalign()
insruction. if you don't you can search in malloc.h for the word
"align". I have a different alligned malloc on my Ubuntu, but again I
found it by searching for "align" in malloc.h. Memalign takes 2
parameters, the first is the alignment, the second is the size. on
redhat memalign doesn't need a special free(), you just call the
normal one. On my Ubuntu the malloc has a special aligned free. I
think on Ubuntu the functions are _aligned_malloc(), _aligned_free().
I searched for the word "free" in malloc.h to see if there were any
special free functions for the aligned mallocs.
fred = memalign(16,100000);
free(fred);
pshufd trick. there are 4 ports on the P4. ALL SSE/SSE2 go through
port 1. The one exception is any kind of move instruction whether it
is floating point, mmx, sse, or sse2 all go through port 0. Integer
instructions go through port 0 and port 1. So here is the rub.
Pshufd is 2 cycles faster on a P4 ( 4 cycles compared to 6 cycles on
the P4). But it goes down port 1. So in some cases it is faster to
replace all your movdqas with pshufd. The way I do it is I try all
combinations of replacing them. In general doing this if you have two
back to back movdqa's is gonna be faster.
PREVIOUS CODE
; Get 8 words of data:
;
movdqu xmm0, [esi+2*edi]
;
MACLoopResume:
;
; Get 8 words filter coefficients:
;
movdqa xmm1, [decimatorFilterCoef+2*edx]
REVISED code, changing movdqu to movdqa and adding a pshufd to second
movdqa, so they can execute in parallel
; Get 8 words of data:
;
movdqa xmm0, [esi+2*edi]
;
MACLoopResume:
;
; Get 8 words filter coefficients:
;
pshufd xmm1, [decimatorFilterCoef+2*edx],0E4h
16-bit code. don't use it. Switch to using movzx. 16 bit code on a
p4 runs horribly. Movzx is for unsigned data. Movsx is for signed
data, so make sure you use the right one based upon the data you are
using. You only need to change the code where you read the data into
a 16-bit register, writing one afterwards will be fast. The issue has
to do without partial register stalls. this is also an example of a
write-after-read, which I talk about below. Pre-read the value in EAX
with movzx or movsx 4 or 5 lines ahead if possible.
mov ax, [esi+2*edi]
mov [ebp], ax
;move me up 4 or 5 lines
movzx eax, dword ptr [esi+2*edi] ;gets rid of
one 16-bit access
mov [ebp], ah ;this avoids
a partial register stall because all of EAX was updated on the
previous instruction.
memory optimizations. Try and pre-read data into a register before
you use it. Memory access instructions usually take a number of
cycles longer since they have to get the data from memory first. So
pre-reading into a register will give a big speed up. I saw lots of
places in your code where you read data and then immediately use it.
Pre-read it a number of instructions ahead( 4 or 5 is a good rule of
thumb). For SSE2/SSE instructions you want more than several
instructions ahead. About twice of that.
second memory optimization trick. You can pre-read the data for SSE/
SSE2 registers on the previous loop. You move it to the end. You
aren't using that many SSE2 registers. You are using XMM0, XMM1,
XMM3. So you can use the rest to pre-read the data. For that to work
right, you have to add a read of them into the other SSE2 registers
above the first label in your loop
;add setting up XMM registers here outside the loop to pre-read
them for the first loop execution.
mov eax, ebx
add eax, 1
sub eax, edi
shr eax, 3
mov [k0], eax
sub eax, ecx
jna k0_OK
;
mov [k0], ecx
k0_OK:
redundant XMM code. the MOVQ and the MOVHPS do the same thing as the
single MOVDQU. get rid of them. Also align the memory you are using
and switch to MOVDQA.
MACLoop:
;
; Get 8 words of data:
;
movdqa xmm0, [esi+2*edi]
; movdqu xmm0, [esi+2*edi]
; movq xmm0, [esi+2*edi]
; movhps xmm0, [esi+2*edi+8]
;
if you read a register after just having written it, you get a stall.
Because the previous instruction has to finish executing before the
next instruction can execute. That's why I wanted you to pre-read all
your data before you need it. Otherwise it is going to stall the
bus. This is called a write-after-read. A read-after-a-write also
stalls. So if you just wrote data to memory or a register, and then
on the next instruction you use it as the source, it will stall. A
write means that the register or memory was the destination of the
previous instruction. And a read means the source is being used for
the next instruction. Memory reads can take a while, so you'd wanna
space these out 4 or so instructions, maybe more.
mov eax,[memory]
add ecx,eax
the p4 can theoritically execute 4 instructions in ALU instructions in
parallel ( the trace cache keeps that from happening since it only
issues 3 instructions at a time). Anyways, because of that you can
execute two ALU pieces of code by interleaving the instructions. This
usually gives a good speed up.
piece1_line1
piece2_line1
piece1_line2
piece2_line2
piece1_line3
piece2_line3
piece1_line4
piece2_line4
piece1_line5
piece2_line5
there are a few exceptions to that, for example, sometimes you have to
do a slow instruction in one of the pieces of code. If so you would
want to pad with more fast instructions from the first loop. Slow
instructions are like MUL and DIV. anything that takes more than a
few cycles to execute. Get a copy of the Intel optmization book. I
am assuming you have an Intel, since most people do.
repost your code after you've had a chance to play with it with these
optimization tips.
Mark
.
- Follow-Ups:
- Re: Optimization Questions
- From: Randy Yates
- Re: Optimization Questions
- References:
- Optimization Questions
- From: Randy Yates
- Re: Optimization Questions
- From: Mark_Larson
- Re: Optimization Questions
- From: Randy Yates
- Re: Optimization Questions
- From: Mark_Larson
- Re: Optimization Questions
- From: Randy Yates
- Optimization Questions
- Prev by Date: Re: increment values from stdin
- Next by Date: Re: Optimization Questions
- Previous by thread: Re: Optimization Questions
- Next by thread: Re: Optimization Questions
- Index(es):
Relevant Pages
|
|