[PATCH] postgres support for DBIx::TextIndex

From: Jim Blomo (jim_at_rescomp.berkeley.edu)
Date: 03/18/04


Date: Wed, 17 Mar 2004 16:55:08 -0800
To: dbi-users@perl.org, dkoch@bizjournals.com

Hello,

I wrote a Pg.pm module for DBIx::TextIndex, which previously only
supported mysql. I've done testing with around ~150k documents and
everything seems to be working well. In order to get this working, I
had to change TextIndex.pm in a few ways:

- Pass primary key twice to $sth->execute for updates. This is because
  I had to break down mysql's nonstandard "replace into" to
  "delete from ...; insert into", so I needed an extra placeholder for
  the delete statement.
  **NOTE** This breaks the mysql (default) DB plugin.

- Use $sth->bind_param to do binding. This is because postgres' bytea
  type is not quoted properly by default. This should not break mysql.

- Deprecate use of Text::Unaccent. It looks like Text::Unaccent doesn't
  work correctly with Perl 5.8.2, probably because it uses some internal
  encoding for strings now. It doesn't pass `make test` either.

If someone who has mysql wants to look at this patch, I think mysql.pm
can be slightly changed to accommodate it. Please CC me on replies, as I
am not subscribed to this list.

Jim Blomo

diff -urN DBIx.orig/TextIndex/Pg.pm DBIx/TextIndex/Pg.pm
--- DBIx.orig/TextIndex/Pg.pm 1969-12-31 16:00:00.000000000 -0800
+++ DBIx/TextIndex/Pg.pm 2004-03-17 12:47:19.000000000 -0800
@@ -0,0 +1,515 @@
+#!/usr/bin/perl
+
+# PostgreSQL module for DBIx::TextIndex
+
+use strict;
+
+sub db_add_mask {
+ my $self = shift;
+ return <<END;
+DELETE FROM $self->{MASK_TABLE} where mask = ?;
+INSERT into $self->{MASK_TABLE} (mask, docs_vector) values (?, ?)
+END
+
+}
+
+sub db_delete_mask {
+ my $self = shift;
+ return <<END;
+delete from $self->{MASK_TABLE}
+where mask = ?
+END
+
+}
+
+sub db_drop_table {
+ my $self = shift;
+ my $table = shift;
+
+ if( $self->{INDEX_DBH}->selectrow_array("SELECT tablename FROM pg_tables WHERE tablename = '$table'") ) {
+ $self->{INDEX_DBH}->do("DROP TABLE $table");
+ }
+}
+
+sub db_table_exists {
+ my $self = shift;
+ my $table = shift;
+
+ return 1 if $self->{INDEX_DBH}->selectrow_array("SELECT tablename FROM pg_tables WHERE tablename = '$table'");
+ return 0;
+}
+
+sub db_create_collection_table {
+ my $self = shift;
+ return <<END;
+CREATE TABLE collection (
+ collection varchar(30) PRIMARY KEY default '',
+ version numeric(10,2) NOT NULL default 0.00,
+ max_indexed_id int NOT NULL default 0,
+ doc_table varchar(30) NOT NULL default '',
+ doc_id_field varchar(30) NOT NULL default '',
+ doc_fields varchar(250) NOT NULL default '',
+ charset varchar(50) NOT NULL default '',
+ stoplist varchar(255) NOT NULL default '',
+ proximity_index varchar(1) NOT NULL default '0',
+ error_empty_query varchar(255) NOT NULL default '',
+ error_quote_count varchar(255) NOT NULL default '',
+ error_no_results varchar(255) NOT NULL default '',
+ error_no_results_stop varchar(255) NOT NULL default '',
+ max_word_length int NOT NULL default 0,
+ result_threshold int NOT NULL default 0,
+ phrase_threshold int NOT NULL default 0,
+ min_wildcard_length int NOT NULL default 0,
+ decode_html_entities varchar(1) NOT NULL default '0',
+ scoring_method varchar(20) NOT NULL default '',
+ update_commit_interval int NOT NULL default 0
+)
+END
+
+}
+
+sub db_insert_collection_table_row {
+ my $self = shift;
+ my $row = shift;
+ my @fields;
+ my @values;
+ while (my ($field, $value) = each %$row) {
+ push @fields, $field;
+ push @values, $value;
+ }
+ my $collection_fields = join ', ', @fields;
+ my $place_holders = join ', ', (('?') x ($#fields + 1));
+ my $sql = <<END;
+insert into $self->{COLLECTION_TABLE}
+($collection_fields)
+values ($place_holders)
+END
+ $self->{INDEX_DBH}->do($sql, undef, @values);
+
+}
+
+sub db_fetch_max_indexed_id {
+ my $self = shift;
+
+ return <<END;
+SELECT max_indexed_id
+FROM $self->{COLLECTION_TABLE}
+WHERE collection = ?
+END
+
+}
+
+sub db_fetch_collection_version {
+ my $self = shift;
+
+ return <<END;
+select max(version) from $self->{COLLECTION_TABLE}
+END
+
+}
+
+sub db_collection_count {
+ my $self = shift;
+
+ return <<END;
+select count(*) from $self->{COLLECTION_TABLE}
+END
+
+}
+
+sub db_update_collection_info {
+ my $self = shift;
+ my $field = shift;
+
+ return <<END;
+update $self->{COLLECTION_TABLE}
+set $field = ?
+where collection = ?
+END
+
+}
+
+sub db_delete_collection_info {
+ my $self = shift;
+
+ return <<END;
+delete from $self->{COLLECTION_TABLE}
+where collection = ?
+END
+
+}
+
+sub db_store_collection_info {
+ my $self = shift;
+
+ my @collection_fields = @{$self->{COLLECTION_FIELDS}};
+ my $collection_fields = join ', ', @collection_fields;
+ my $place_holders = join ', ', (('?') x ($#collection_fields + 1));
+ return <<END;
+insert into $self->{COLLECTION_TABLE}
+($collection_fields)
+values
+($place_holders)
+END
+
+}
+
+sub db_fetch_collection_info {
+ my $self = shift;
+
+ my $collection_fields = join ', ', @{$self->{COLLECTION_FIELDS}};
+
+ return <<END;
+select
+$collection_fields
+from $self->{COLLECTION_TABLE}
+where collection = ?
+END
+
+}
+
+sub db_fetch_all_collection_rows {
+ my $self = shift;
+
+ return <<END;
+select * from $self->{COLLECTION_TABLE}
+END
+
+}
+
+sub db_phrase_scan_cz {
+ my $self = shift;
+ my $result_docs = shift;
+ my $fno = shift;
+
+ return <<END;
+select $self->{DOC_ID_FIELD}, $self->{DOC_FIELDS}->[$fno]
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} in ($result_docs)
+END
+
+}
+
+sub db_phrase_scan {
+ my $self = shift;
+ my $result_docs = shift;
+ my $fno = shift;
+
+ return <<END;
+select $self->{DOC_ID_FIELD}
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} IN ($result_docs)
+ and $self->{DOC_FIELDS}->[$fno] like ?
+END
+
+}
+
+sub db_fetch_maxtf {
+ my $self = shift;
+ my $fields = shift;
+
+ return <<END;
+select field_no, maxtf
+from $self->{MAXTF_TABLE}
+where field_no in ($fields)
+END
+
+}
+
+sub db_fetch_docweights {
+ my $self = shift;
+ my $fields = shift;
+
+ return <<END;
+select field_no, avg_docweight, docweights
+from $self->{DOCWEIGHTS_TABLE}
+where field_no in ($fields)
+END
+
+}
+
+
+sub db_fetch_all_docs_vector {
+ my $self = shift;
+ return <<END;
+SELECT all_docs_vector
+from $self->{ALL_DOCS_VECTOR_TABLE}
+END
+
+}
+
+sub db_update_all_docs_vector {
+ my $self = shift;
+ return <<END;
+DELETE FROM $self->{ALL_DOCS_VECTOR_TABLE} WHERE id = 1;
+INSERT INTO $self->{ALL_DOCS_VECTOR_TABLE}
+(id, all_docs_vector)
+VALUES (1, ?)
+END
+}
+
+sub db_docfreq_t {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select docfreq_t from $table
+where word = ?
+END
+
+}
+
+sub db_fetch_mask {
+ my $self = shift;
+
+ return <<END;
+select docs_vector
+from $self->{MASK_TABLE}
+where mask = ?
+END
+
+}
+
+sub db_fetch_term_docs {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select term_docs
+from $table
+where word = ?
+END
+
+}
+
+sub db_fetch_term_freq_and_docs {
+ my $self = shift;
+ my $table = shift;
+ return <<END;
+select docfreq_t, term_docs
+from $table
+where word = ?
+END
+
+}
+
+sub db_fetch_words {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select word
+from $table
+where word like ?
+END
+
+}
+
+sub db_ping_doc {
+ my $self = shift;
+
+ return <<END;
+select 1
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} = ?
+END
+
+}
+
+sub db_fetch_doc {
+ my $self = shift;
+ my $field = shift;
+
+ return <<END;
+select $field
+from $self->{DOC_TABLE}
+where $self->{DOC_ID_FIELD} = ?
+END
+
+}
+
+sub db_update_maxtf {
+ my $self = shift;
+
+ return <<END;
+DELETE FROM $self->{MAXTF_TABLE} WHERE field_no = ?;
+INSERT into $self->{MAXTF_TABLE} (field_no, maxtf) values (?, ?)
+END
+
+}
+
+sub db_update_docweights {
+ my $self = shift;
+
+ return <<END;
+DELETE FROM $self->{DOCWEIGHTS_TABLE} WHERE field_no = ?;
+INSERT into $self->{DOCWEIGHTS_TABLE} (field_no, avg_docweight, docweights) values (?, ?, ?)
+END
+
+}
+
+sub db_inverted_replace {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+DELETE FROM $table WHERE word = ?;
+INSERT into $table
+(word, docfreq_t, term_docs)
+values (?, ?, ?)
+END
+
+}
+
+sub db_inverted_remove {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+delete from $table
+where word = ?
+END
+
+}
+
+sub db_inverted_select {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select docfreq_t, term_docs
+from $table
+where word = ?
+END
+
+}
+
+sub db_create_mask_table {
+ my $self = shift;
+
+ return <<END;
+create table $self->{MASK_TABLE} (
+ mask varchar(100) primary key,
+ docs_vector text not null
+)
+END
+
+}
+
+sub db_create_docweights_table {
+ my $self = shift;
+ return <<END;
+create table $self->{DOCWEIGHTS_TABLE} (
+ field_no integer primary key,
+ avg_docweight float not null,
+ docweights bytea not null
+)
+END
+}
+
+sub db_create_maxterm_table {
+ my $self = shift;
+
+ return <<END;
+create table $self->{MAXTF_TABLE} (
+ field_no integer primary key,
+ maxtf bytea not null
+)
+END
+
+}
+
+sub db_create_all_docs_vector_table {
+ my $self = shift;
+
+ return <<END;
+CREATE TABLE $self->{ALL_DOCS_VECTOR_TABLE} (
+ id INT PRIMARY KEY,
+ all_docs_vector text NOT NULL
+)
+END
+}
+
+sub db_create_inverted_table {
+ my $self = shift;
+ my $table = shift;
+ my $max_word = $self->{MAX_WORD_LENGTH};
+
+ return <<END;
+create table $table (
+ word varchar($max_word) primary key,
+ docfreq_t int not null,
+ term_docs bytea not null
+)
+END
+
+}
+
+sub db_pindex_search {
+ my $self = shift;
+ my $fno = shift;
+ my $words = shift;
+ my $docs = shift;
+
+ return <<END;
+select word, doc, pos
+from $self->{PINDEX_TABLES}->[$fno]
+where doc in ($docs) and word in ($words)
+order by doc
+END
+
+}
+
+sub db_pindex_create {
+ my $self = shift;
+ my $table = shift;
+ my $max_word = $self->{MAX_WORD_LENGTH};
+
+ return <<END;
+create table $table (
+ word varchar($max_word) not null,
+ doc integer not null,
+ pos integer not null,
+ primary key (doc, word)
+)
+END
+
+}
+
+sub db_pindex_add {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+insert into $table (word, doc, pos)
+values (?, ?, ?)
+END
+
+}
+
+sub db_pindex_remove {
+ my $self = shift;
+ my $table = shift;
+ my $docs = shift;
+
+ return <<END;
+delete from $table
+where doc in ($docs)
+END
+
+}
+
+
+sub db_total_words {
+ my $self = shift;
+ my $table = shift;
+
+ return <<END;
+select SUM(docfreq_t)
+from $table
+END
+
+}
+
+1;
+
diff -urN DBIx.orig/TextIndex.pm DBIx/TextIndex.pm
--- DBIx.orig/TextIndex.pm 2003-10-01 13:30:51.000000000 -0700
+++ DBIx/TextIndex.pm 2004-03-17 16:39:29.000000000 -0800
@@ -13,7 +13,9 @@
 use DBIx::TextIndex::QueryParser;
 use DBIx::TextIndex::TermDocsCache;
 use HTML::Entities ();
-use Text::Unaccent qw(unac_string);
+#use Text::Unaccent qw(unac_string);
+
+use DBD::Pg;
 
 my $GEN = 'DBIx::TextIndex::Exception::General';
 my $DA = 'DBIx::TextIndex::Exception::DataAccess';
@@ -239,7 +241,7 @@
     $vector->Index_List_Store(@$ids);
 
     print "Adding mask ($mask) to table $self->{MASK_TABLE}\n" if $PA > 1;
- $self->{INDEX_DBH}->do($self->db_add_mask, undef, $mask, $vector->to_Enum);
+ $self->{INDEX_DBH}->do($self->db_add_mask, undef, $mask, $mask, $vector->to_Enum);
     return 1;
 }
 
@@ -429,7 +431,11 @@
             $maxtf[$doc_id] = 0;
         }
         my $packed_maxtf = pack 'w' x ($#maxtf + 1), @maxtf;
- $sth->execute($fno, $packed_maxtf);
+ #$sth->execute($fno, $packed_maxtf);
+ $sth->bind_param( 1, $fno );
+ $sth->bind_param( 2, $fno );
+ $sth->bind_param( 3, $packed_maxtf, { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth->execute();
     }
 }
 
@@ -450,7 +456,12 @@
         }
         my $packed_w_d = pack 'f*', @w_d;
         # FIXME: we should update the average, leave it alone for now
- $sth->execute($fno, $self->{AVG_W_D}->[$fno], $packed_w_d);
+ #$sth->execute($fno, $self->{AVG_W_D}->[$fno], $packed_w_d);
+ $sth->bind_param( 1, $fno );
+ $sth->bind_param( 2, $fno );
+ $sth->bind_param( 3, $self->{AVG_W_D}->[$fno] );
+ $sth->bind_param( 4, $packed_w_d, { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth->execute();
     }
 }
 
@@ -504,8 +515,14 @@
                 push @new_term_docs, ($term_docs->[$i], $term_docs->[$i + 1]);
             }
 
- $sth_replace->execute($word, $docfreq_t,
- pack_term_docs(\@new_term_docs));
+ #$sth_replace->execute($word, $docfreq_t,
+ # pack_term_docs(\@new_term_docs));
+ $sth_replace->bind_param( 1, $word );
+ $sth_replace->bind_param( 2, $word );
+ $sth_replace->bind_param( 3, $docfreq_t );
+ $sth_replace->bind_param( 4, pack_term_docs( \@new_term_docs ),
+ { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth_replace->execute();
         }
     }
 }
@@ -1706,7 +1723,7 @@
 sub _lc_and_unac {
     my $self = shift;
     my $s = shift;
- $s = unac_string($self->{CHARSET}, $s);
+ #$s = unac_string($self->{CHARSET}, $s);
     $s = lc($s);
     return $s;
 }
@@ -1749,7 +1766,11 @@
         }
         $maxtf[0] = 0 unless defined $maxtf[0];
         my $packed_maxtf = pack 'w' x ($#maxtf + 1), @maxtf;
- $sth->execute($fno, $packed_maxtf);
+ #$sth->execute($fno, $packed_maxtf);
+ $sth->bind_param( 1, $fno );
+ $sth->bind_param( 2, $fno );
+ $sth->bind_param( 3, $packed_maxtf, { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth->execute();
     }
     # Delete temporary in-memory structure
     delete($self->{NEW_MAXTF});
@@ -1780,7 +1801,12 @@
         $w_d[0] = 0 unless defined $w_d[0];
         # FIXME: this takes too much space
         my $packed_w_d = pack 'f*', @w_d;
- $sth->execute($fno, $avg_w_d, $packed_w_d);
+ #$sth->execute($fno, $avg_w_d, $packed_w_d);
+ $sth->bind_param( 1, $fno );
+ $sth->bind_param( 2, $fno );
+ $sth->bind_param( 3, $avg_w_d );
+ $sth->bind_param( 4, $packed_w_d, { pg_type => DBD::Pg::PG_BYTEA } );
+ $sth->execute();
     }
     # Delete temporary in-memory structure
     delete($self->{NEW_W_D});
@@ -1812,11 +1838,16 @@
 
             my $term_docs = pack_term_docs_append_vint($o_term_docs, $term_docs_vint);
 
- $i_sth->execute(
- $word,
- $self->{DOCFREQ_T}->[$fno]->{$word} + $o_docfreq_t,
- $term_docs,
- ) or warn $self->{INDEX_DBH}->err;
+ # $i_sth->execute(
+ # $word,
+ # $self->{DOCFREQ_T}->[$fno]->{$word} + $o_docfreq_t,
+ # $term_docs,
+ # ) or warn $self->{INDEX_DBH}->err;
+ $i_sth->bind_param( 1, $word );
+ $i_sth->bind_param( 2, $word );
+ $i_sth->bind_param( 3, $self->{DOCFREQ_T}->[$fno]->{$word} + $o_docfreq_t );
+ $i_sth->bind_param( 4, $term_docs, { pg_type => DBD::Pg::PG_BYTEA } );
+ $i_sth->execute() or warn $self->{INDEX_DBH}->err;
 
             delete($self->{TERM_DOCS_VINT}->[$fno]->{$word});
             $wc++;



Relevant Pages

  • sitemap generator for Perl
    ... I want to run the sitemap generator ... Returns the minimum number of links to traverse from the root URL of ... my $class = shift; ...
    (perl.beginners)
  • Re: How can I create instantiable objects (not classes)?
    ... a child object inherits not only its parent object's ... sub fee { ... my $class = shift; ... For example, an object of type Car might receive a message named "ticket," and since a car does not know what to do with a ticket, it would pass that message to an object of type Driver. ...
    (comp.lang.perl.misc)
  • Re: Packages and returning errors
    ... > array intact. ... sub is_a_instance_method { ... my $class = shift; ... You need to fix the scope of $error by moving its declaration outside ...
    (comp.lang.perl.misc)
  • Re: passing database data to a sub
    ... > I'm not sure of the difference, why isn't it a subroutine? ... > sure about this 'shift' thing anyway :-) ... > sub teardown ... > # Setup the template to use for the output. ...
    (perl.beginners)
  • Re: Massive failed FTP attempts.
    ... made it a little more generic so that it could monitor any log file. ... To run as a daemon and monitor proftpd, sending lockout notifications to ... sub prune_old_entries ... my $line = shift; ...
    (Security-Basics)