Re: Looking for super fast CSV parser



Sam Larson <sam212@xxxxxxxxx> wrote:

It seems to me that these Delphi database are nice and fast for 100k
rows, but if you load them up with tens of millions of rows of data,
performance starts to suffer. They have a tendancy to use a lot of CPU
cycles and will gobble up all available RAM.

Did you test DISQLite3? My findings show that DISQLite3 can insert the 20
million records and create an index on the date column in about 15 minutes.

To reproduce:

I assembled a small console project based on the record information gathered
from this thread. It uses the latest DISQLite3 Pro as database engine. The
application was compiled with D2007, is 365 KB in size including the embedded
database engine, and requires no external dependencies. The project source code
is copied below for anyone interested.

My test environment is 1 year old notebook with Intel Pentium Dual Core 1.8 MHz,
1 GB memory, 5200 rpm hard drive, WinXP SP2.

Running the project took about 5 minutes to insert the 20 million randomly
generated records. Creating the index took somewhat longer than that. Total
execution time was less than 15 minutes. The created database file is 1.77 GB in
size, including the index.

I expect the project to run considerably faster on server hardware, especially
with a faster hard drive. Increasing the database cache will also improve
performance at the cost of memory used. Adjust the "PRAGMA cache_size=400000"
according to your needs as described in the source comments.

Ralf

Test project source code follows:

{ ------------------------------------------------------------------------------

DISQLite3 project to profile a 20 million records insertion.
Use DISQLite3 Pro for best performance.

Visit the DISQLite3 Internet site for latest information and updates:

http://www.yunqa.de/delphi/

Copyright (c) 2005-2007 Ralf Junker, The Delphi Inspiration <delphi@xxxxxxxx>

------------------------------------------------------------------------------ }

program DISQLite3_20_Million;

{$I DI.inc}
{$I DISQLite3.inc}

{$APPTYPE CONSOLE}

uses
Windows, SysUtils, DISQLite3Api;

const
FILE_NAME = '20million.db3';
TOTAL_COUNT = 20000000;
TRANSACTION_DELTA = 50000;
DATE_START = 19770101;
DATE_END = 20071231;
DATE_DIFF = (DATE_END - DATE_START) / TOTAL_COUNT;

{$IFNDEF DISQLite3_Personal}
function ProgressCallback(UserData: Pointer): Integer;
begin
Write('.');
Result := 0;
end;
{$ENDIF !DISQLite3_Personal}

var
DB: sqlite3;
Stmt: sqlite3_stmt;
Date: Double;
ProductCode: array[0..6] of AnsiChar;
ProductLetter: AnsiChar;
i, j: Integer;
persec: Double;
totalsec: Integer;
tc_start, tc_Index: Cardinal;
begin
try
tc_start := GetTickCount;
DeleteFile(FILE_NAME);
sqlite3_check(sqlite3_open(FILE_NAME, @DB));
try
{ Tweak some DB settings to speed up inserts (DISQLite3 Pro only). }

{ Prevent other users from accessing the database while we are inserting.
}
sqlite3_exec_fast(DB, 'PRAGMA locking_mode=EXCLUSIVE');

{ Set the number of pages the DB engine buffers in memory.
400000 pages require about 450 MB of memory for this application.
Increasing this value further will speed up insertions and index
creation, but will also use more memory. Reduce if you have less
memory available to avoid OS paging. }
sqlite3_exec_fast(DB, 'PRAGMA cache_size=400000');

{ Noticably faster inserts at the cost of a little less ACID security. }
sqlite3_exec_fast(DB, 'PRAGMA synchronous=OFF');

{ Create the table. }
sqlite3_exec_fast(DB, 'CREATE TABLE t (' +
't1 TEXT,t2 TEXT,' +
'i1 INTEGER,' +
'r1 REAL,r2 REAL,r3 REAL,r4 REAL,r5 REAL,r6 REAL)');

{ Prepare an SQL statement. This is the fastes way to insert bulk
data with DISQLtie3. We will bind values to this statement below
and then step it once to perform the insert. }
sqlite3_check(sqlite3_prepare(DB, 'INSERT INTO t
(t1,t2,i1,r1,r2,r3,r4,r5,r6) VALUES (?,?,?,?,?,?,?,?,?)', -1, @Stmt, nil), DB);
try

{ Start a transaction to speed up inserts. It will be committed and
a new transaction started every TRANSACTION_DELTA inserts. }
sqlite3_exec_fast(DB, 'BEGIN TRANSACTION');

Date := DATE_START;
for i := 1 to TOTAL_COUNT do
begin
{ Bind a random product code. }
for j := Low(ProductCode) to High(ProductCode) do
ProductCode[j] := Char(Ord('A') + Random(24));
sqlite3_check(sqlite3_bind_text(Stmt, 1, PAnsiChar(@ProductCode),
SizeOf(ProductCode), SQLITE_STATIC), DB);
{ Bind a random product letter. }
ProductLetter := Char(Ord('A') + Random(24));
sqlite3_check(sqlite3_bind_text(Stmt, 2, @ProductLetter,
SizeOf(ProductLetter), SQLITE_STATIC), DB);
{ Bind a date, increasing from DATE_START. }
Date := Date + DATE_DIFF;
sqlite3_check(sqlite3_bind_int(Stmt, 3, Trunc(Date)),
DB);
{ Bind random floats, range 0 to 1000. }
sqlite3_check(sqlite3_bind_double(Stmt, 4, Random * 1000), DB);
sqlite3_check(sqlite3_bind_double(Stmt, 5, Random * 1000), DB);
sqlite3_check(sqlite3_bind_double(Stmt, 6, Random * 1000), DB);
sqlite3_check(sqlite3_bind_double(Stmt, 7, Random * 1000), DB);
sqlite3_check(sqlite3_bind_double(Stmt, 8, Random * 1000), DB);
sqlite3_check(sqlite3_bind_double(Stmt, 9, Random * 1000), DB);
{ Step the statement to perform the insert. }
sqlite3_check(sqlite3_step(Stmt), DB);
{ Reset the statement prepares it for the next insert. }
sqlite3_check(sqlite3_reset(Stmt), DB);

{ Every TRANSACTION_DELTA inserts, commit the transaction and
start a new one. Also output some performance statistics. }
if i mod TRANSACTION_DELTA = 0 then
begin
{ Commit current transaction. }
sqlite3_exec_fast(DB, 'COMMIT');
persec := i / ((GetTickCount - tc_start) / 1000);
totalsec := Round(TOTAL_COUNT / persec);
WriteLn(i: 8, ' - ',
persec: 7: 2, ' per sec - ',
i * 100 div TOTAL_COUNT: 3, '% done - est. total time: ',
totalsec div 60, ' min, ', totalsec mod 60, ' sec');
{ Start new transaction. }
sqlite3_exec_fast(DB, 'BEGIN TRANSACTION');
end;
end;

{ Commit current transaction. }
sqlite3_exec_fast(DB, 'COMMIT');

finally
sqlite3_check(sqlite3_finalize(Stmt), DB);
end;

WriteLn;
totalsec := (GetTickCount - tc_start) div 1000;
WriteLn('Insertion Time: ', totalsec div 60, ' min, ', totalsec mod 60, '
sec');

{$IFNDEF DISQLite3_Personal}
sqlite3_progress_handler(DB, 50000, ProgressCallback, nil);
{$ENDIF !DISQLite3_Personal}

tc_Index := GetTickCount;
WriteLn;
WriteLn('Creating index on date column ... this may take a while ... ');
sqlite3_exec_fast(DB, 'CREATE INDEX t_i1 ON t (i1)');
WriteLn;
totalsec := (GetTickCount - tc_Index) div 1000;
WriteLn('Index created in ', totalsec div 60, ' min, ', totalsec mod 60,
'sec');

finally
sqlite3_check(sqlite3_close(DB), DB);
WriteLn;
totalsec := (GetTickCount - tc_start) div 1000;
WriteLn('Total Time: ', totalsec div 60, ' min, ', totalsec mod 60,
'sec');

WriteLn;
WriteLn('Max database memory usage: ', sqlite3_memory_highwater(0) div
1024 div 1024, ' MB');
end;

except
on e: Exception do
WriteLn(e.Message);
end;

WriteLn;
WriteLn('Done - Press ENTER to Exit');
ReadLn;
end.

---
The Delphi Inspiration
http://www.yunqa.de/delphi/
.



Relevant Pages

  • Re: Robust single file database
    ... creating a database using DISQLite3, opening it, reading a value or changing ... You must be using an old version of DISQLite3. ... into the same transaction thus increasing insert rate. ...
    (borland.public.delphi.thirdpartytools.general)
  • Re: Recordset.AddNew and the recordset objects data retaining
    ... The statement you make about the individual updates not hitting the database ... to do with the updates being in a transaction but rather on whether you specify ... spill over and the memory starts paging out to disk that concerns me). ...
    (microsoft.public.data.ado)
  • Re: Interconnection Delay
    ... Are these JDBC connections? ... But I observed delays also for a single database ... A reason/excuse could be the particular transaction ... memory model that JET used. ...
    (comp.lang.java.databases)
  • Re: SQLCE 3.0 losing data after transaction
    ... Do you have a *small* code sample that illustrates this issue including code to create the database? ... Are you using DataSet or something less memory hungry like SqlCeResultSet? ... inserting many rows to the database we get a "not enough memory" ... And it occurs like there is a huge transaction and its roll backed. ...
    (microsoft.public.sqlserver.ce)
  • Re: Looking for super fast CSV parser
    ... I have updated DISQLite3 yesterday. ... Delphi database systems and even rivals big database players like MySql. ... The individual projects in this folder show how to generate data, export to CSV, ... Generate with memory helper table: ...
    (borland.public.delphi.thirdpartytools.general)