Re: GC in Jon's raytracing benchmark



Marco Antoniotti wrote:
> Rob Thorpe wrote:
> Without knowing much of the specifics,
> I'd bet that the OCaml compiler is just very good at stack allocating
> several intermediate values, especially vectors.
> I.e. it does DYNAMIC-EXTENT very well.

The vectors are all heap allocated, on the young heap where possible.

Have a look at the code around "caml_young_ptr" in the assembler generated
by ocamlopt for the "ray_sphere" function on x86, for example:

# camlRay2__ray_sphere_84 -- ocamlopt output for "ray_sphere"
# (32-bit x86, GNU as / AT&T syntax: op src, dst).
#
# Register roles, read off the instructions below:
#   %eax (copied to %esi) : pointer to 3 doubles, subtracted from the vector at %ecx
#   %ebx                  : pointer to 3 doubles, dotted with the difference vector
#   %ecx                  : pointer to 3 doubles
#   %edx                  : pointer to 1 double, squared and added to the discriminant
#   NOTE(review): these look like orig / dir / center / radius of a ray-sphere
#   intersection test -- confirm against the OCaml source.
#
# Result: a pointer in %eax -- either the word loaded from camlPervasives + 36
# or one of the two 12-byte blocks allocated below.
#
# Stack frame (24 bytes):
#   0(%esp)  scratch double (disc, then t2 = b + sqrt(disc), then t1 = b - sqrt(disc))
#   8(%esp)  sqrt(disc)
#   16(%esp) b = v . (%ebx)
#
# Every allocation bumps caml_young_ptr downwards and compares it with
# caml_young_limit; if it fell below the limit, caml_call_gc is called and the
# allocation sequence is retried from its own label.
#
# x87 compare idiom used three times: after "fldz; fcompl m", the mask 69 (0x45)
# keeps the C3|C2|C0 condition bits of the status word in %ah. All three clear
# means 0.0 > m, i.e. m is negative.
camlRay2__ray_sphere_84:
        subl $24, %esp
.L166:
        movl %eax, %esi
        # --- allocate 28 bytes: 4-byte header + 3 doubles for the vector v ---
.L167:  movl caml_young_ptr, %eax
        subl $28, %eax
        movl %eax, caml_young_ptr
        cmpl caml_young_limit, %eax
        jb .L168                        # below the limit: call the GC, then retry
        leal 4(%eax), %eax              # %eax = first field, just past the header
        # NOTE(review): 6398 = (6 << 10) | 254, presumably an OCaml block header
        # (6 words, float-array tag) -- confirm against the runtime's header layout.
        movl $6398, -4(%eax)
        # --- v = (%ecx) - (%esi), component by component ---
        fldl (%ecx)
        fsubl (%esi)
        fstpl (%eax)
        fldl 8(%ecx)
        fsubl 8(%esi)
        fstpl 8(%eax)
        fldl 16(%ecx)
        fsubl 16(%esi)
        fstpl 16(%eax)
        # --- b = v . (%ebx), kept at 16(%esp) ---
        fldl 8(%eax)
        fmull 8(%ebx)
        fldl (%eax)
        fmull (%ebx)
        faddp %st, %st(1)
        fldl 16(%eax)
        fmull 16(%ebx)
        faddp %st, %st(1)
        fstpl 16(%esp)
        # --- v . v ---
        fldl 8(%eax)
        fmull 8(%eax)
        fldl (%eax)
        fmull (%eax)
        faddp %st, %st(1)
        fldl 16(%eax)
        fmull 16(%eax)
        faddp %st, %st(1)
        # --- disc = b*b - v.v + r*r, where r is the double at (%edx) ---
        fldl 16(%esp)
        fmull 16(%esp)
        # NOTE(review): with GNU as's historical AT&T fsub/fsubr swap for
        # register destinations this yields st0 - st1 = b*b - v.v -- confirm.
        fsubp %st, %st(1)
        fldl (%edx)
        fmull (%edx)
        faddp %st, %st(1)
        fstpl 0(%esp)                   # 0(%esp) = disc
        # --- if disc is negative, return the constant ---
        fldz
        fcompl 0(%esp)
        fnstsw %ax                      # overwrites %eax; v is not used again
        andb $69, %ah
        jne .L165                       # some condition bit set: not (0.0 > disc)
        # NOTE(review): presumably the boxed `infinity` from Pervasives -- confirm.
        movl camlPervasives + 36, %eax
        addl $24, %esp
        ret
        .align 16
.L165:
        # --- sqrt(disc) -> 8(%esp); t2 = b + sqrt(disc) -> 0(%esp) ---
        fldl 0(%esp)
        fsqrt
        fstpl 8(%esp)
        fldl 16(%esp)
        faddl 8(%esp)
        fstpl 0(%esp)
        # --- allocate 12 bytes (4-byte header + 1 double) to hold t2; pointer in %ecx ---
.L170:  movl caml_young_ptr, %eax
        subl $12, %eax
        movl %eax, caml_young_ptr
        cmpl caml_young_limit, %eax
        jb .L171
        leal 4(%eax), %ecx
        # NOTE(review): 2301 = (2 << 10) | 253, presumably a 2-word boxed-float
        # header -- confirm against the runtime's header layout.
        movl $2301, -4(%ecx)
        fldl 0(%esp)
        fstpl (%ecx)
        # --- if t2 is negative, return the constant (the fresh block is unused) ---
        fldz
        fcompl 0(%esp)
        fnstsw %ax
        andb $69, %ah
        jne .L164                       # not (0.0 > t2)
        movl camlPervasives + 36, %eax
        addl $24, %esp
        ret
        .align 16
.L164:
        # --- t1 = b - sqrt(disc) -> 0(%esp) ---
        fldl 16(%esp)
        fsubl 8(%esp)
        fstpl 0(%esp)
        # --- allocate 12 bytes to hold t1; pointer in %ebx ---
.L173:  movl caml_young_ptr, %eax
        subl $12, %eax
        movl %eax, caml_young_ptr
        cmpl caml_young_limit, %eax
        jb .L174
        leal 4(%eax), %ebx
        movl $2301, -4(%ebx)
        fldl 0(%esp)
        fstpl (%ebx)
        # --- return the t1 block if 0.0 < t1, otherwise the t2 block ---
        fldz
        fcompl 0(%esp)
        fnstsw %ax
        andb $69, %ah
        cmpb $1, %ah                    # exactly C0 set <=> 0.0 < t1
        jne .L163
        movl %ebx, %eax                 # t1 block
        addl $24, %esp
        ret
        .align 16
.L163:
        movl %ecx, %eax                 # t2 block
        addl $24, %esp
        ret
        # --- out-of-line slow paths: run the GC, then retry the matching allocation ---
.L174:  call caml_call_gc
.L175:  jmp .L173
.L171:  call caml_call_gc
.L172:  jmp .L170
.L168:  call caml_call_gc
.L169:  jmp .L167

Here's the equivalent generated by gcc from a C implementation for
comparison:

# ray_sphere -- gcc output for the C implementation
# (32-bit x86, GNU as / AT&T syntax: op src, dst).
#
# Two pointer arguments are read from the caller's stack:
#   8(%ebp)  -> %eax : doubles read at offsets 0..16 ("a") and 24..40 ("d")
#   12(%ebp) -> %edx : doubles read at offsets 0..16 ("c") and 24 ("r")
#   NOTE(review): these look like a ray {orig, dir} and a sphere {center, radius}
#   -- confirm against the C source.
#
# Result: every exit path leaves one value in %st(0) before "leave; ret".
#
# No heap allocation takes place; intermediates live on the x87 register stack.
# Stack-state comments list st0 first. They assume GNU as's historical AT&T
# fsub/fsubr swap for register destinations -- NOTE(review): confirm.
#
# ftst compares st0 with 0.0. In the masks applied to %ah, 5 = C2|C0
# (st0 < 0 or unordered) and 69 = C3|C2|C0 (st0 <= 0 or unordered).
ray_sphere:
        pushl %ebp
        movl %esp, %ebp
        subl $32, %esp
        movl 8(%ebp), %eax
        movl 12(%ebp), %edx
        # --- v = c - a; fsubrl computes mem - st0 ---
        fldl 8(%eax)
        fsubrl 8(%edx)                  # vy
        fldl (%eax)
        fsubrl (%edx)                   # vx, vy
        fldl 16(%eax)
        fsubrl 16(%edx)                 # vz, vx, vy
        # v is also spilled to the frame; the slots are not read again in this listing
        fstl -8(%ebp)
        fxch %st(2)                     # vy, vx, vz
        fstl -16(%ebp)
        fxch %st(1)                     # vx, vy, vz
        fstl -24(%ebp)
        # --- b = v . d ---
        fld %st(0)
        fmull 24(%eax)
        fld %st(2)
        fmull 32(%eax)
        faddp %st, %st(1)
        fld %st(3)
        fmull 40(%eax)
        faddp %st, %st(1)               # b, vx, vy, vz
        # --- disc = b*b - v.v + r*r ---
        fld %st(0)
        fmul %st(1), %st                # b*b, b, vx, vy, vz
        fxch %st(2)
        fmul %st(0), %st                # vx^2, b, b*b, vy, vz
        fxch %st(3)
        fmul %st(0), %st                # vy^2, b, b*b, vx^2, vz
        faddp %st, %st(3)               # b, b*b, vx^2+vy^2, vz
        fxch %st(3)
        fmul %st(0), %st                # vz^2, b*b, vx^2+vy^2, b
        faddp %st, %st(2)               # b*b, v.v, b
        fsubp %st, %st(1)               # b*b - v.v, b
        fldl 24(%edx)
        fmul %st(0), %st                # r*r
        faddp %st, %st(1)               # disc, b
        # --- disc < 0 (or unordered): return the constant ---
        ftst
        fnstsw %ax
        testb $5, %ah
        jne .L44
        # --- t2 = b + sqrt(disc) ---
        fsqrt                           # s, b       (s = sqrt(disc))
        fld %st(1)
        fadd %st(1), %st                # t2, s, b
        # --- t2 < 0 (or unordered): return the constant ---
        ftst
        fnstsw %ax
        testb $5, %ah
        jne .L43
        # --- t1 = b - s ---
        fxch %st(1)                     # s, t2, b
        fsubrp %st, %st(2)              # t2, t1
        fxch %st(1)                     # t1, t2
        # --- t1 > 0: return t1; otherwise return t2 ---
        ftst
        fnstsw %ax
        testb $69, %ah
        jne .L41
        fstp %st(1)                     # drop t2, leaving t1 in st0
        leave
        ret
.L43:
        fstp %st(0)                     # pop t2, then fall through to pop s and b
        .p2align 4,,15
.L44:
        fstp %st(0)
        fstp %st(0)
        # .LC8 is a single-precision constant defined outside this listing.
        # NOTE(review): presumably +infinity -- confirm.
        flds .LC8
        leave
        ret
.L41:
        fstp %st(0)                     # pop t1, leaving t2 in st0
        leave
        ret

--
Dr Jon D Harrop, Flying Frog Consultancy
http://www.ffconsultancy.com
.



Relevant Pages