Re: Optimizing Assembler Code



On Jun 14, 1:02 am, Frank <spamt...@xxxxxxxxxx> wrote:
I am working on an Open Source lexical analyzer generator
(http://quex.sourceforge.net).

Currently, I am busy optimizing the generated output. As a benchmark
I let it generate a lexical analyzer for C. Based on the analysis
time
for analyzing the ./kernel directory the linux sources I am measuring
the performance. Now, I am finding strange effects, such as when
using 'computed goto' (goto *variable) is slower than routing via
a switch statement (switch(variabel) { case 1: goto LABLE_1; ... })

I would need some support by someone who has excessive assembler
experience, in order to understand what it going on.

Please, contact me under the website above if anyone it interested.

Best Regards

Frank.

The compiler may generate rather different code for semantically
equivalent code.
Here's what I get for this code snippet:

// compiled with: gcc -c -O2 sw.c -S -osw.S
#include <stdio.h>

void fxn1(int var)
{
if (var == 0) goto label0;
else if (var == 1) goto label1;
else if (var == 2) goto label2;
else if (var == 3) goto label3;
else if (var == 4) goto label4;
else if (var == 5) goto label5;
else if (var == 6) goto label6;
else if (var == 7) goto label7;
return;

label0: printf("0\n"); return;
label1: printf("1\n"); return;
label2: printf("2\n"); return;
label3: printf("3\n"); return;
label4: printf("4\n"); return;
label5: printf("5\n"); return;
label6: printf("6\n"); return;
label7: printf("7\n"); return;
}

void fxn2(int var)
{
if (var < 4)
{
if (var < 2)
{
if (var == 0) goto label0;
else goto label1;
}
else
{
if (var == 2) goto label2;
else goto label3;
}
}
else
{
if (var < 6)
{
if (var == 4) goto label4;
else goto label5;
}
else
{
if (var == 6) goto label6;
else goto label7;
}
}
return;

label0: printf("0\n"); return;
label1: printf("1\n"); return;
label2: printf("2\n"); return;
label3: printf("3\n"); return;
label4: printf("4\n"); return;
label5: printf("5\n"); return;
label6: printf("6\n"); return;
label7: printf("6\n"); return;
}

void fxn3(int var)
{
switch (var)
{
case 0: goto label0;
case 1: goto label1;
case 2: goto label2;
case 3: goto label3;
case 4: goto label4;
case 5: goto label5;
case 6: goto label6;
case 7: goto label7;
}
return;

label0: printf("0\n"); return;
label1: printf("1\n"); return;
label2: printf("2\n"); return;
label3: printf("3\n"); return;
label4: printf("4\n"); return;
label5: printf("5\n"); return;
label6: printf("6\n"); return;
label7: printf("7\n"); return;
}

And here's the assembly output from the compiler:
.file "sw.c"
.section .text
LC7:
.ascii "7\0"
LC6:
.ascii "6\0"
LC5:
.ascii "5\0"
LC4:
.ascii "4\0"
LC3:
.ascii "3\0"
LC2:
.ascii "2\0"
LC1:
.ascii "1\0"
LC0:
.ascii "0\0"
.p2align 4,,15
..globl _fxn1
_fxn1:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
testl %eax, %eax
je L3
cmpl $1, %eax
je L6
cmpl $2, %eax
je L9
cmpl $3, %eax
je L12
cmpl $4, %eax
je L15
cmpl $5, %eax
je L18
cmpl $6, %eax
je L21
cmpl $7, %eax
je L24
popl %ebp
ret
L24:
movl $LC7, 8(%ebp)
.p2align 4,,7
L25:
popl %ebp
jmp _puts
L21:
movl $LC6, 8(%ebp)
jmp L25
L18:
movl $LC5, 8(%ebp)
jmp L25
L15:
movl $LC4, 8(%ebp)
jmp L25
L12:
movl $LC3, 8(%ebp)
jmp L25
L9:
movl $LC2, 8(%ebp)
jmp L25
L6:
movl $LC1, 8(%ebp)
jmp L25
.p2align 4,,7
L3:
movl $LC0, 8(%ebp)
jmp L25
.p2align 4,,15
..globl _fxn2
_fxn2:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
cmpl $3, %eax
jg L27
cmpl $1, %eax
jle L50
cmpl $2, %eax
je L35
L37:
movl $LC3, 8(%ebp)
.p2align 4,,7
L49:
popl %ebp
jmp _puts
.p2align 4,,7
L35:
movl $LC2, 8(%ebp)
jmp L49
.p2align 4,,7
L50:
testl %eax, %eax
jne L32
L30:
movl $LC0, 8(%ebp)
jmp L49
L32:
movl $LC1, 8(%ebp)
jmp L49
.p2align 4,,7
L27:
cmpl $5, %eax
jg L39
cmpl $4, %eax
je L41
L43:
movl $LC5, 8(%ebp)
jmp L49
L41:
movl $LC4, 8(%ebp)
jmp L49
L46:
.p2align 4,,7
L39:
L48:
movl $LC6, 8(%ebp)
jmp L49
.p2align 4,,15
..globl _fxn3
_fxn3:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
cmpl $7, %eax
ja L51
jmp *L69(,%eax,4)
.p2align 2
.p2align 2
L69:
.long L54
.long L56
.long L58
.long L60
.long L62
.long L64
.long L66
.long L68
L54:
movl $LC0, 8(%ebp)
.p2align 4,,7
L71:
popl %ebp
jmp _puts
L56:
movl $LC1, 8(%ebp)
jmp L71
L58:
movl $LC2, 8(%ebp)
jmp L71
L60:
movl $LC3, 8(%ebp)
jmp L71
L62:
movl $LC4, 8(%ebp)
jmp L71
L64:
movl $LC5, 8(%ebp)
jmp L71
L66:
movl $LC6, 8(%ebp)
jmp L71
L68:
movl $LC7, 8(%ebp)
jmp L71
.p2align 4,,7
L51:
popl %ebp
ret
.ident "GCC: (GNU) 3.3.4"

As you can see the compiler didn't optimize fxn1() much. It takes 7 CMP
+JMP's to get to label7. It didn't really optimize fxn2() either, but
there're 3 CMP+JMP's maximum in the path in the source and generated
code. fxn3(), OTOH, has just one CMP+JMP pair and the rest is handled
by a table. I guess many conditional branches cost a lot.

Alex

.



Relevant Pages