asm grep



In a thread that had drifted a long way from PSHUFW and the Nasm manual, I was complaining about the asmutils grep...

> Ummm... did I mention that the asmutils grep ended in a segfault on my
> (38M) testfile? Yeah. I haven't looked at why, yet. That one can use
> some attention. Glad you brought it up, Santosh!

Well... true to its name "gets" doesn't check for buffer overrun! :(

Only a highly unusual file will trigger it - requires a "run" of over 16k without a linefeed (0x0a). My "testfile" happened to do it (thought that was pretty much "text"...). Arguably not worth fixin'.

I also mentioned "lint", and that I thought "call strlen" could be improved... Well, I de-linted it, and "improved" strlen...

A while back, someone was asking about the advantages, or not, of using macros to replace calls to subroutines. Mostly, it "saves the call and ret". My "asm strlen" (someone showed me this... think it was C-the-developer), besides saving the call and ret, has the "advantage" that the "destination" can be any reg - don't need to move eax to wherever you really wanted it. I think it works out pretty well in this case.

I also complained that the speed was horrible. The issue is *exactly* the example Robert cited - "sys_read ebp, tmp, 1" - we're calling the OS for every byte. If the "C newbie" did the same, he'd be slow too, but he probably knows fread() and/or fgetc()... may not know that read() exists. These do the buffering for him, so he's probably gonna kick our ass!

To improve speed, we'd at *least* want to read a good big bufferfull at a time. Might be a good candidate for mmap. String matching could be made faster... The question would be where to stop... This thing won't do "regular expressions" so hardly deserves the name "grep"! Sure is small, though... and the speed isn't too painful on small files...

In any case, here's my current version, with the bug still in. As posted, it weighs in at 588 bytes (compared to 603 for the original). Uncommenting the "bugfix" brings it back up to 598. One step forward, two steps back...

Best,
Frank



; nasm -f bin -I/where/ever/asmutils-0.18/inc/ -d__LINUX__ -d__ELF -d__ELF_MACROS__ grep.asm
; chmod +x grep

;Copyright (C) 1999-2002 Konstantin Boldyshev <konst@xxxxxxxxxxxxxxxxx>
;
;$Id: grep.asm,v 1.6 2002/02/18 06:46:47 konst Exp $
;
;hackers' grep
;
;syntax: grep [-b] [-c] [-q] [-v] PATTERN [file...]
;
;-b print byte offset before each line of output
;-c print count of matching lines for each file (instead of actual lines)
;-q be quiet (suppress output, only set exit code)
;-v invert matching (select non-matching lines)
;
;there's no support for regexp, only pure string patterns.
;returns 0 on success (if pattern was found), 1 otherwise
;
;0.01: 19-Dec-1999 initial release (dumb and slow version)
;0.02: 14-Feb-2002 added -v option
;0.03: 18-Feb-2002 added -b, -c options,
; output filename when grepping several files

%include "system.inc"

;-----------------------------------------------------------------------
; astrlen - inline length of a NUL-terminated string (no call/ret)
;
;   astrlen src          ; eax  = length of the string at src
;   astrlen dest, src    ; dest = length of the string at src
;
; src  : register or label addressing the string; must differ from dest
; dest : eax, ebx, ecx, edx, esi, edi or ebp
; Clobbers: the destination register and the flags, nothing else.
;
; How it works: the counter starts at -1. "cmp byte [..], 1" sets CF only
; when the byte is 0, and INC does not modify CF, so the JNC after the INC
; still sees the result of the compare. The loop therefore exits with the
; counter equal to the index of the terminating NUL.
;-----------------------------------------------------------------------
%imacro astrlen 0-*

%if %0 = 1
    ; single operand - eax is the implied destination
    %ifidni %1, eax
        %error "single operand form - eax is implied destination!!!"
    %endif

        or      eax, byte -1
%%getlen:
        cmp     byte [%1 + eax + 1], 1
        inc     eax
        jnc     %%getlen

%elif %0 = 2
    ; two operand form: %1 = destination register, %2 = source
    %ifidni %1, %2
        %error "src and dest must not be the same!!!"
    %endif

    ; fake an "%ifnreg": accept only the general-purpose registers below
    %assign %%isreg 0

    %ifidni %1, eax
        %assign %%isreg 1
    %elifidni %1, ebx
        %assign %%isreg 1
    %elifidni %1, ecx
        %assign %%isreg 1
    %elifidni %1, edx
        %assign %%isreg 1
    %elifidni %1, esi
        %assign %%isreg 1
    %elifidni %1, edi
        %assign %%isreg 1
    %elifidni %1, ebp
        %assign %%isreg 1
    %endif

    %if %%isreg = 0
        %error "destination must be a GP register!!!"
    %endif

        or      %1, byte -1
%%getlen2:
        cmp     byte [%1 + %2 + 1], 1
        inc     %1
        jnc     %%getlen2

%else
    %error "usage: astrlen src (reg/label) or astrlen dest (reg), src (reg/label)."
%endif
%endm


CODESEG

; Option bits collected on the command line and kept in [flag]
%assign _q      1 << 0          ; -q  quiet: no output, exit code only
%assign _v      1 << 1          ; -v  select non-matching lines
%assign _c      1 << 2          ; -c  print a count instead of the lines
%assign _b      1 << 7          ; -b  print the byte offset before each line

; Size in bytes of the line buffer "buf"
%assign BUFSIZE 16 * 1024

; do_exit - leave the program, handing [retcode] to sys_exit.
; Reached by "jz do_exit" from START (no arguments) and from the
; file loop (no more file names).
do_exit:
sys_exit [retcode]

;-----------------------------------------------------------------------
; START - program entry point: parse options, then grep each file.
;
; The arguments are popped straight off the stack in the order
; argc, argv[0], argv[1], ... with a NULL pointer after the last one.
;
; Register roles:
;   al   option bits while the command line is parsed (stored in [flag])
;   ebx  number of arguments still to be consumed during parsing
;   edi  pattern (NUL-terminated) once parsing is done
;   ebp  handle of the input being read (STDIN when no file is named)
;   esi  line buffer (buf) while a line is examined
;
; NOTE: options are recognised by their first two bytes only, so a
;       pattern that begins with "-q", "-c", "-b" or "-v" is taken as
;       that option.
;-----------------------------------------------------------------------
START:
        _mov    ebp, STDIN              ; file handle (STDIN if no args)
        mov     byte [retcode], 1       ; "not found" until a line is selected
        xor     eax, eax                ; option bits start empty - do not rely
                                        ; on the loader handing us eax = 0

        pop     ebx                     ; ebx = argc
        dec     ebx                     ; program name does not count
        jz      do_exit                 ; nothing to search for

        pop     esi                     ; discard argv[0]
.s0:
        pop     edi                     ; next argument: an option or the pattern

        cmp     word [edi], "-q"
        jnz     .s2
        or      al, _q
.s1:
        dec     ebx
        jz      do_exit                 ; only options given: the next pop would
                                        ; fetch the NULL that ends argv
        jmps    .s0
.s2:
        cmp     word [edi], "-c"
        jnz     .s3
        or      al, _c
        jmps    .s1
.s3:
        cmp     word [edi], "-b"
        jnz     .s4
        or      al, _b
        jmps    .s1
.s4:
        cmp     word [edi], "-v"
        jnz     .proceed
        or      al, _v
        jmps    .s1

.proceed:
        mov     [flag], al
        dec     ebx                     ; ebx = number of file arguments
        jz      .mainloop               ; if no args - read STDIN
        mov     [argc], ebx

.next_file:
        pop     ebx                     ; file name pointer, or the NULL ending argv
        test    ebx, ebx
        jz      do_exit                 ; exit if no more args

        xor     eax, eax                ; per-file counters start at zero
        mov     [count], eax
        mov     [realoff], eax
        mov     [fname], ebx

        ; open O_RDONLY (first argument EMPTY: ebx already holds the name)
        sys_open EMPTY, O_RDONLY | O_LARGEFILE
        mov     ebp, eax
        test    eax, eax
        js      .next_file              ; open failed: skip this file silently

.mainloop:
        mov     esi, buf
        call    gets
        cmp     byte [tmp], 0           ; gets leaves 0 in tmp when it delivered a line
        jz      .find

        ; end of input (or read error) on this handle
        sys_close ebp                   ; release it so many file arguments do not
                                        ; exhaust the descriptor table
                                        ; (sys_close: assumed provided by system.inc)
        test    byte [flag], _c
        jz      .next_file

        call    write_fname
        call    write_count
        jmps    .next_file

.find:
        call    strstr                  ; eax = 0 when the pattern is not in the line

        mov     dl, [flag]

        test    eax, eax
        setz    bh                      ; bh = 1: pattern absent
        test    dl, _v
        setz    bl                      ; bl = 1: normal (non-inverted) selection

        xor     bl, bh                  ; non-zero exactly when the line is selected
        jz      .mainloop

.match:
        mov     byte [retcode], 0
        test    dl, _q
        jnz     .mainloop               ; quiet: exit code only

        inc     dword [count]
        test    dl, _c
        jnz     .mainloop               ; counting: lines are not printed

        call    write_fname             ; both helpers preserve all registers
        call    write_byteoff

        astrlen edx, esi                ; edx = length of the line in buf

        sys_write STDOUT, esi           ; no length argument given: edx set above

        jmp     .mainloop

;
;
;

;-----------------------------------------------------------------------
; write_fname - print "<file name>:" when more than one file was named.
;
; In:   [argc]  number of file arguments
;       [fname] pointer to the current file name (NUL-terminated)
; Out:  nothing; all registers are preserved (pusha/popa)
;
; The name's terminating NUL is temporarily replaced by ':' so that name
; and separator go out in a single write, and is put back afterwards.
;-----------------------------------------------------------------------
write_fname:
        cmp     dword [argc], byte 1    ; compare the whole dword, not just its low byte
        jbe     .return
        pusha
        mov     esi, [fname]

        astrlen edx, esi                ; edx = length of the name

        lea     edi, [esi + edx]        ; edi -> terminating NUL
        mov     byte [edi], ':'
        inc     edx                     ; include the ':' in the write
        sys_write STDOUT, esi
        mov     byte [edi], 0           ; restore through the saved pointer, so a
                                        ; failed or short write cannot misplace it
        popa
.return:
        ret

;-----------------------------------------------------------------------
; write_byteoff - print "<byte offset>:" when the -b option bit is set.
;
; In:   [flag]    option bits
;       [byteoff] offset to print
; Out:  nothing; all registers are preserved (pusha/popa)
;-----------------------------------------------------------------------
write_byteoff:
        test    byte [flag], _b
        jz      .return

        pusha
        mov     eax, [byteoff]
        call    itoa                    ; esi = first digit, edi = just past the last

        mov     byte [edi], ':'
        mov     edx, edi
        sub     edx, esi
        inc     edx                     ; edx = digits + ':'
        sys_write STDOUT, esi
        popa
.return:
        ret

;-----------------------------------------------------------------------
; write_count - print [count] in decimal, followed by __n.
; All registers are preserved (pusha/popa).
;-----------------------------------------------------------------------
write_count:
        pusha
        mov     eax, [count]
        call    itoa                    ; esi = first digit, edi = just past the last
        mov     byte [edi], __n         ; terminate the number with a line end
        lea     edx, [edi + 1]
        sub     edx, esi                ; edx = digits + line end
        sys_write STDOUT, esi
        popa
        ret

;-----------------------------------------------------------------------
; itoa - convert an unsigned 32-bit value to decimal ASCII in itoabuf.
;
; In:   eax = value
; Out:  esi = itoabuf (first digit)
;       edi = one past the last digit (no terminator is stored)
; Clobbers: eax, ecx (left at 10), edx, flags
;
; Recursive: each level divides by 10 and keeps its remainder on the
; stack until the higher-order digits have been stored, so the digits
; land in the buffer most significant first.
;-----------------------------------------------------------------------
itoa:
        _mov    edi, itoabuf
        _mov    ecx, 10
        mov     esi, edi

.printB:
        xor     edx, edx                ; edx:eax / 10 needs a clean high half
        div     ecx                     ; eax = quotient, edx = this digit
        test    eax, eax
        jz      .print0                 ; leading digit reached
        push    edx
        call    .printB                 ; emit the higher-order digits first
        pop     edx
.print0:
        add     dl, '0'
        mov     [edi], dl
        inc     edi
        ret


;-----------------------------------------------------------------------
; gets - read one line from the handle in ebp, one byte at a time.
;
; In:   esi = buffer, at least BUFSIZE bytes
;       ebp = file handle
; Out:  [tmp] = 0  a line was stored: its bytes up to and including the
;                  linefeed (__n), followed by a NUL
;       [tmp] != 0 no line was delivered (end of input or read error)
;       [byteoff]  offset at which this line started
;       [realoff]  advanced by the number of bytes consumed
;       All registers are preserved (pusha/popa).
;
; A run of BUFSIZE-1 bytes without a linefeed is cut there and returned
; as a line of its own, so the buffer can never be overrun.
;
; NOTE: bytes after the last linefeed of the input are read but are not
;       delivered as a line, unless the very last byte read is a NUL.
;-----------------------------------------------------------------------
gets:
        pusha
        mov     byte [tmp], 1           ; non-zero = "no line" unless proven otherwise

        push    dword [realoff]         ; byteoff = realoff (memory-to-memory copy)
        pop     dword [byteoff]

        lea     edi, [esi + BUFSIZE - 1] ; stop one shy of the end, so we can
                                        ; always zero-terminate

.read_byte:
        sys_read ebp, tmp, 1
        cmp     eax, edx                ; bytes read vs. the count just requested
                                        ; (edx presumably still holds the 1 passed above)
        jnz     .return                 ; short read: end of input or error

        inc     dword [realoff]

        mov     al, [tmp]
        mov     [esi], al
        inc     esi
        cmp     esi, edi
        je      .full                   ; buffer exhausted: cut the line here
        cmp     al, __n
        jnz     .read_byte
.full:
        mov     byte [esi], 0           ; terminate the line
        mov     byte [tmp], 0           ; signal "line available"

.return:
        popa
        ret

;-----------------------------------------------------------------------
; strstr - find a NUL-terminated needle in a NUL-terminated haystack.
;
; In:   esi = haystack
;       edi = needle
; Out:  eax = address of the first occurrence, or 0 when there is none
;             (an empty needle or an empty haystack yields 0)
;       esi, edi preserved
; Clobbers: ecx, flags
; Assumes the direction flag is clear (cmpsb walks forward).
;
; Still the simple approach - try every start position - but unlike a
; scan that resumes behind the mismatch it cannot skip over a match
; ("ab" in "aab"), and it never reads past the haystack's terminator:
; the needle holds no NUL within its length, so a compare that reaches
; the haystack's NUL mismatches there and stops.
;-----------------------------------------------------------------------
strstr:
        push    esi
        push    edi

        xor     eax, eax                ; default result: not found

        astrlen ecx, edi                ; ecx = needle length

        test    ecx, ecx
        jz      .return                 ; empty needle: nothing to look for

.next:
        cmp     byte [esi], 0
        jz      .return                 ; haystack exhausted

        push    esi                     ; cmpsb advances all three - keep copies
        push    edi
        push    ecx
        repz cmpsb                      ; ZF = 1 only if all ecx bytes were equal
        pop     ecx                     ; (pop leaves the flags alone)
        pop     edi
        pop     esi
        jz      .rets

        inc     esi                     ; retry one byte further on
        jmp     short .next

.rets:
        mov     eax, esi                ; start of the match

.return:
        pop     edi
        pop     esi
        ret

; Uninitialised data. The code stores single bytes into retcode and
; never writes the rest of it before sys_exit reads it - NOTE(review):
; this presumably relies on the section starting out zero-filled.
UDATASEG

; number of file arguments (set at .proceed; write_fname tests it against 1)
argc resd 1
; pointer to the name of the file currently being read
fname resd 1
; selected lines seen so far in the current file
count resd 1
; bytes consumed so far from the current file (advanced in gets)
realoff resd 1
; offset at which the most recently read line started (copied from realoff)
byteoff resd 1

; exit status: set to 1 at START, cleared to 0 once a line is selected
retcode resd 1
; one-byte read target for sys_read; afterwards 0 = line delivered by gets
tmp resb 1
; option bits (_q, _v, _c, _b)
flag resb 1
; output area of itoa (digits plus one separator byte)
itoabuf resb 0x10
; line buffer filled by gets
buf resb BUFSIZE

END
.



Relevant Pages

  • Re: Why There are no Asm Apps
    ... I doubt it's much more difficult in ASM than `c`. ... mov D$edi + TSkinProgressBar_Position eax ... mov eax D$edi + TSkinProgressBar_Step ...
    (alt.lang.asm)
  • Re: Betovs lies continue
    ... mov eax D$edi + TSkinSection_Text ... sub ecx eax|neg ecx ...
    (alt.lang.asm)
  • New Pos Functions
    ... test eax, eax ... mov ecx, ... cmp al, ...
    (borland.public.delphi.language.basm)
  • Re: Guillermitos Particles
    ... lea eax D$esi + SkinWindow.FreeClientRect ... not reading or copying things I ... Write to this memory, and then use "SetDIBitsToDevice" to output the final thing. ...
    (alt.lang.asm)