[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Koha-cvs] koha C4/Biblio.pm C4/Search.pm updater/updateda...
From: |
paul poulain |
Subject: |
[Koha-cvs] koha C4/Biblio.pm C4/Search.pm updater/updateda... |
Date: |
Wed, 02 May 2007 16:44:31 +0000 |
CVSROOT: /sources/koha
Module name: koha
Changes by: paul poulain <tipaul> 07/05/02 16:44:31
Modified files:
C4 : Biblio.pm Search.pm
updater : updatedatabase
misc/migration_tools: rebuild_nozebra.pl
Log message:
NoZebra SQL index management :
* adding 3 subs in Biblio.pm
- GetNoZebraIndexes, which reads the index structure from a new
system preference (added with this commit)
- _DelBiblioNoZebra, which retrieves all index entries for a biblio and
removes the biblio reference from them, in a variable
- _AddBiblioNoZebra, which adds index entries for a biblio.
Note that the _Add and _Del subs work only on a hash variable, to
speed things up in case of a modification (i.e. delete + add). The actual SQL update
is done in the ModZebra sub (which already existed and dealt with the zebra index).
I think the code needs to be tested more thoroughly, but it works at least
partially.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/C4/Biblio.pm?cvsroot=koha&r1=1.201&r2=1.202
http://cvs.savannah.gnu.org/viewcvs/koha/C4/Search.pm?cvsroot=koha&r1=1.134&r2=1.135
http://cvs.savannah.gnu.org/viewcvs/koha/updater/updatedatabase?cvsroot=koha&r1=1.162&r2=1.163
http://cvs.savannah.gnu.org/viewcvs/koha/misc/migration_tools/rebuild_nozebra.pl?cvsroot=koha&r1=1.2&r2=1.3
Patches:
Index: C4/Biblio.pm
===================================================================
RCS file: /sources/koha/koha/C4/Biblio.pm,v
retrieving revision 1.201
retrieving revision 1.202
diff -u -b -r1.201 -r1.202
--- C4/Biblio.pm 27 Apr 2007 14:00:49 -0000 1.201
+++ C4/Biblio.pm 2 May 2007 16:44:31 -0000 1.202
@@ -33,7 +33,7 @@
use vars qw($VERSION @ISA @EXPORT);
# set the version for version checking
-$VERSION = do { my @v = '$Revision: 1.201 $' =~ /\d+/g; shift(@v).".".join(
"_", map { sprintf "%03d", $_ } @v ); };
+$VERSION = do { my @v = '$Revision: 1.202 $' =~ /\d+/g; shift(@v).".".join(
"_", map { sprintf "%03d", $_ } @v ); };
@ISA = qw( Exporter );
@@ -113,6 +113,7 @@
&TransformHtmlToXml
&PrepareItemrecordDisplay
&char_decode
+ &GetNoZebraIndexes
);
=head1 NAME
@@ -607,7 +608,11 @@
}
return $error if $error;
- # Delete in Zebra
+ # Delete in Zebra. Be careful NOT to move this line after
_koha_delete_biblio
+ # for at least 2 reasons :
+ # - we need to read the biblio if NoZebra is set (to remove it from the
indexes
+ # - if something goes wrong, the biblio may be deleted from Koha but not
from zebra
+ # and we would have no way to remove it (except manually in zebra, but I
bet it would be very hard to handle the problem)
ModZebra($biblionumber,"delete_record","biblioserver");
# delete biblio from Koha tables and save in deletedbiblio
@@ -2705,97 +2710,254 @@
# replaced by a zebraqueue table, that is filled with ModZebra to run.
# the table is emptied by misc/cronjobs/zebraqueue_start.pl script
-my $sth=$dbh->prepare("insert into zebraqueue (biblio_auth_number
,server,operation) values(?,?,?)");
-$sth->execute($biblionumber,$server,$op);
-$sth->finish;
-
-#
-# my @Zconnbiblio;
-# my $tried = 0;
-# my $recon = 0;
-# my $reconnect = 0;
-# my $record;
-# my $shadow;
-#
-# reconnect:
-# $Zconnbiblio[0] = C4::Context->Zconn( $server, 0, 1 );
-#
-# if ( $server eq "biblioserver" ) {
-#
-# # it's unclear to me whether this should be in xml or MARC format
-# # but it is clear it should be nabbed from zebra rather than from
-# # the koha tables
-# $record = GetMarcBiblio($biblionumber);
-# $record = $record->as_xml_record() if $record;
-# # warn "RECORD $biblionumber => ".$record;
-# $shadow="biblioservershadow";
-#
-# # warn "RECORD $biblionumber => ".$record;
-# $shadow = "biblioservershadow";
-#
-# }
-# elsif ( $server eq "authorityserver" ) {
-# $record = C4::AuthoritiesMarc::XMLgetauthority( $dbh, $biblionumber
);
-# $shadow = "authorityservershadow";
-# } ## Add other servers as necessary
-#
-# my $Zpackage = $Zconnbiblio[0]->package();
-# $Zpackage->option( action => $op );
-# $Zpackage->option( record => $record );
-#
-# retry:
-# $Zpackage->send("update");
-# my $i;
-# my $event;
-#
-# while ( ( $i = ZOOM::event( \@Zconnbiblio ) ) != 0 ) {
-# $event = $Zconnbiblio[0]->last_event();
-# last if $event == ZOOM::Event::ZEND;
-# }
-#
-# my ( $error, $errmsg, $addinfo, $diagset ) = $Zconnbiblio[0]->error_x();
-# if ( $error == 10000 && $reconnect == 0 )
-# { ## This is serious ZEBRA server is not available -reconnect
-# warn "problem with zebra server connection";
-# $reconnect = 1;
-# my $res = system('sc start "Z39.50 Server"
>c:/zebraserver/error.log');
-#
-# #warn "Trying to restart ZEBRA Server";
-# #goto "reconnect";
-# }
-# elsif ( $error == 10007 && $tried < 2 )
-# { ## timeout --another 30 looonng seconds for this update
-# $tried = $tried + 1;
-# warn "warn: timeout, trying again";
-# goto "retry";
-# }
-# elsif ( $error == 10004 && $recon == 0 ) { ##Lost connection
-reconnect
-# $recon = 1;
-# warn "error: reconnecting to zebra";
-# goto "reconnect";
-#
-# # as a last resort, we save the data to the filesystem to be indexed in
batch
-# }
-# elsif ($error) {
-# warn
-# "Error-$server $op $biblionumber /errcode:, $error, /MSG:,$errmsg,$addinfo
\n";
-# $Zpackage->destroy();
-# $Zconnbiblio[0]->destroy();
-# ModZebrafiles( $dbh, $biblionumber, $record, $op, $server );
-# return;
-# }
-# if ( C4::Context->$shadow ) {
-# $Zpackage->send('commit');
-# while ( ( $i = ZOOM::event( \@Zconnbiblio ) ) != 0 ) {
-#
-# #waiting zebra to finish;
-# }
-# }
-# $Zpackage->destroy();
+ if (C4::Context->preference("NoZebra")) {
+ # lock the nozebra table : we will read index lines, update them in
Perl process
+ # and write everything in 1 transaction.
+ # lock the table to avoid someone else overwriting what we are doing
+ $dbh->do('LOCK TABLES nozebra WRITE,biblio WRITE,biblioitems WRITE,
systempreferences WRITE');
+ my %result; # the result hash that will be builded by deletion / add,
and written on mySQL at the end, to improve speed
+ my $record= GetMarcBiblio($biblionumber);
+ if ($op eq 'specialUpdate') {
+ # OK, we have to add or update the record
+ # 1st delete (virtually, in indexes) ...
+ %result = _DelBiblioNoZebra($biblionumber,$record);
+ # ... add the record
+ %result=_AddBiblioNoZebra($biblionumber,$record, %result);
+ } else {
+ # it's a deletion, delete the record...
+ %result=_DelBiblioNoZebra($biblionumber,$record);
+ }
+ # ok, now update the database...
+ my $sth = $dbh->prepare("UPDATE nozebra SET biblionumbers=? WHERE
indexname=? AND value=?");
+ foreach my $key (keys %result) {
+ foreach my $index (keys %{$result{$key}}) {
+ $sth->execute($result{$key}->{$index},$key,$index);
+ }
+ }
+ $dbh->do('UNLOCK TABLES');
+
+ } else {
+ #
+ # we use zebra, just fill zebraqueue table
+ #
+ my $sth=$dbh->prepare("insert into zebraqueue (biblio_auth_number
,server,operation) values(?,?,?)");
+ $sth->execute($biblionumber,$server,$op);
+ $sth->finish;
+ }
+}
+
+=head2 GetNoZebraIndexes
+
+=cut
+
+sub GetNoZebraIndexes {
+ my $index = C4::Context->preference('NoZebraIndexes');
+ my %indexes;
+ foreach my $line (split /('|"),/,$index) {
+ $line =~ /(.*)=>(.*)/;
+ my $index = substr($1,1); # get the index, don't forget to remove
initial ' or "
+ my $fields = $2;
+ $index =~ s/'|"| //g;
+ $fields =~ s/'|"| //g;
+ $indexes{$index}=$fields;
+ }
+ return %indexes;
}
=head1 INTERNAL FUNCTIONS
+=head2 _DelBiblioNoZebra($biblionumber,$record);
+
+ function to delete a biblio in NoZebra indexes
+ This function does NOT delete anything in database : it reads all the
indexes entries
+ that have to be deleted & delete them in the hash
+ The SQL part is done either :
+ - after the Add if we are modifying a biblio (delete + add again)
+ - immediatly after this sub if we are doing a true deletion.
+
+=cut
+
+
+sub _DelBiblioNoZebra {
+ my ($biblionumber,$record)=@_;
+ # Get the indexes
+ my $dbh = C4::Context->dbh;
+ # Get the indexes
+ my %index=GetNoZebraIndexes;
+ # get title of the record (to store the 10 first letters with the index)
+ my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title');
+ my $title = lc($record->subfield($titletag,$titlesubfield));
+
+ my %result;
+ # remove blancks comma (that could cause problem when decoding the string
for CQL retrieval) and regexp specific values
+ $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
+ # limit to 10 char, should be enough, and limit the DB size
+ $title = substr($title,0,10);
+ #parse each field
+ my $sth2=$dbh->prepare('SELECT biblionumbers FROM nozebra WHERE
indexname=? AND value=?');
+ foreach my $field ($record->fields()) {
+ #parse each subfield
+ next if $field->tag <10;
+ foreach my $subfield ($field->subfields()) {
+ my $tag = $field->tag();
+ my $subfieldcode = $subfield->[0];
+ my $indexed=0;
+ # check each index to see if the subfield is stored somewhere
+ # otherwise, store it in __RAW__ index
+ foreach my $key (keys %index) {
+# warn "examining $key index : ".$index{$key}." for $tag
$subfieldcode";
+ if ($index{$key} =~ /$tag\*/ or $index{$key} =~
/$tag$subfieldcode/) {
+ $indexed=1;
+ my $line= lc $subfield->[1];
+ # remove meaningless value in the field...
+ $line =~
s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+ # ... and split in words
+ foreach (split / /,$line) {
+ next unless $_; # skip empty values (multiple spaces)
+ # if the entry is already here, do nothing, the
biblionumber has already be removed
+ unless ($result{$key}->{$_} =~
/$biblionumber,$title\-(\d);/) {
+ # get the index value if it exist in the nozebra
table and remove the entry, otherwise, do nothing
+ $sth2->execute($key,$_);
+ my $existing_biblionumbers = $sth2->fetchrow;
+ # it exists
+ if ($existing_biblionumbers) {
+ warn " existing for $key $_:
$existing_biblionumbers";
+ $result{$key}->{$_} =$existing_biblionumbers;
+ $result{$key}->{$_} =~
s/$biblionumber,$title\-(\d);//;
+ }
+ }
+ }
+ }
+ }
+ # the subfield is not indexed, store it in __RAW__ index anyway
+ unless ($indexed) {
+ my $line= lc $subfield->[1];
+ $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/
/g;
+ # ... and split in words
+ foreach (split / /,$line) {
+ next unless $_; # skip empty values (multiple spaces)
+ # if the entry is already here, do nothing, the
biblionumber has already be removed
+ unless ($result{'__RAW__'}->{$_} =~
/$biblionumber,$title\-(\d);/) {
+ # get the index value if it exist in the nozebra table
and remove the entry, otherwise, do nothing
+ $sth2->execute('__RAW__',$_);
+ my $existing_biblionumbers = $sth2->fetchrow;
+ # it exists
+ if ($existing_biblionumbers) {
+ warn " existing for __RAW__ $_ :
$existing_biblionumbers";
+ $result{'__RAW__'}->{$_} =$existing_biblionumbers;
+ $result{'__RAW__'}->{$_} =~
s/$biblionumber,$title\-(\d);//;
+ }
+ }
+ }
+ }
+ }
+ }
+ return %result;
+}
+
+=head2 _AddBiblioNoZebra($biblionumber,$record,%result);
+
+ function to add a biblio to the NoZebra indexes
+
+=cut
+
+
+sub _AddBiblioNoZebra {
+ my ($biblionumber,$record,%result)=@_;
+ my $dbh = C4::Context->dbh;
+ # Get the indexes
+ my %index=GetNoZebraIndexes;
+ # get title of the record (to store the 10 first letters with the index)
+ my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title');
+ my $title = lc($record->subfield($titletag,$titlesubfield));
+
+ # remove blancks comma (that could cause problem when decoding the string
for CQL retrieval) and regexp specific values
+ $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
+ # limit to 10 char, should be enough, and limit the DB size
+ $title = substr($title,0,10);
+ #parse each field
+ my $sth2=$dbh->prepare('SELECT biblionumbers FROM nozebra WHERE
indexname=? AND value=?');
+ foreach my $field ($record->fields()) {
+ #parse each subfield
+ next if $field->tag <10;
+ foreach my $subfield ($field->subfields()) {
+ my $tag = $field->tag();
+ my $subfieldcode = $subfield->[0];
+ my $indexed=0;
+ # check each index to see if the subfield is stored somewhere
+ # otherwise, store it in __RAW__ index
+ foreach my $key (keys %index) {
+# warn "examining $key index : ".$index{$key}." for $tag
$subfieldcode";
+ if ($index{$key} =~ /$tag\*/ or $index{$key} =~
/$tag$subfieldcode/) {
+ $indexed=1;
+ my $line= lc $subfield->[1];
+ # remove meaningless value in the field...
+ $line =~
s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+ # ... and split in words
+ foreach (split / /,$line) {
+ next unless $_; # skip empty values (multiple spaces)
+ # if the entry is already here, improve weight
+# warn "managing $_";
+ if ($result{$key}->{$_} =~
/$biblionumber,$title\-(\d);/) {
+ my $weight=$1+1;
+ $result{$key}->{$_} =~
s/$biblionumber,$title\-(\d);//;
+ $result{$key}->{$_} .=
"$biblionumber,$title-$weight;";
+ } else {
+ # get the value if it exist in the nozebra table,
otherwise, create it
+ $sth2->execute($key,$_);
+ my $existing_biblionumbers = $sth2->fetchrow;
+ # it exists
+ if ($existing_biblionumbers) {
+ warn" existing : $existing_biblionumbers";
+ $result{$key}->{$_} =$existing_biblionumbers;
+ my $weight=$1+1;
+ $result{$key}->{$_} =~
s/$biblionumber,$title\-(\d);//;
+ $result{$key}->{$_} .=
"$biblionumber,$title-$weight;";
+ # create a new ligne for this entry
+ } else {
+ $dbh->do('INSERT INTO nozebra SET
indexname='.$dbh->quote($key).',value='.$dbh->quote($_));
+ $result{$key}->{$_}.="$biblionumber,$title-1;";
+ }
+ }
+ }
+ }
+ }
+ # the subfield is not indexed, store it in __RAW__ index anyway
+ unless ($indexed) {
+ my $line= lc $subfield->[1];
+ $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/
/g;
+ # ... and split in words
+ foreach (split / /,$line) {
+ next unless $_; # skip empty values (multiple spaces)
+ # if the entry is already here, improve weight
+ if ($result{'__RAW__'}->{$_} =~
/$biblionumber,$title\-(\d);/) {
+ my $weight=$1+1;
+ $result{'__RAW__'}->{$_} =~
s/$biblionumber,$title\-(\d);//;
+ $result{'__RAW__'}->{$_} .=
"$biblionumber,$title-$weight;";
+ } else {
+ # get the value if it exist in the nozebra table,
otherwise, create it
+ $sth2->execute('__RAW__',$_);
+ my $existing_biblionumbers = $sth2->fetchrow;
+ # it exists
+ if ($existing_biblionumbers) {
+ $result{'__RAW__'}->{$_} =$existing_biblionumbers;
+ my $weight=$1+1;
+ $result{'__RAW__'}->{$_} =~
s/$biblionumber,$title\-(\d);//;
+ $result{'__RAW__'}->{$_} .=
"$biblionumber,$title-$weight;";
+ # create a new ligne for this entry
+ } else {
+ $dbh->do('INSERT INTO nozebra SET
indexname="__RAW__",value='.$dbh->quote($_));
+
$result{'__RAW__'}->{$_}.="$biblionumber,$title-1;";
+ }
+ }
+ }
+ }
+ }
+ }
+ return %result;
+}
+
+
=head2 MARCitemchange
=over 4
@@ -3505,7 +3667,7 @@
# deal with UNIMARC field 100 (encoding) : create it if needed & set encoding
to unicode
if ( $encoding eq "UNIMARC" ) {
my $string;
- if ( $record->subfield( 100, "a" ) ) {
+ if ( length($record->subfield( 100, "a" )) == 35 ) {
$string = $record->subfield( 100, "a" );
my $f100 = $record->field(100);
$record->delete_field($f100);
@@ -3689,8 +3851,17 @@
=cut
-# $Id: Biblio.pm,v 1.201 2007/04/27 14:00:49 hdl Exp $
+# $Id: Biblio.pm,v 1.202 2007/05/02 16:44:31 tipaul Exp $
# $Log: Biblio.pm,v $
+# Revision 1.202 2007/05/02 16:44:31 tipaul
+# NoZebra SQL index management :
+# * adding 3 subs in Biblio.pm
+# - GetNoZebraIndexes, that get the index structure in a new systempreference
(added with this commit)
+# - _DelBiblioNoZebra, that retrieve all index entries for a biblio and remove
in a variable the biblio reference
+# - _AddBiblioNoZebra, that add index entries for a biblio.
+# Note that the 2 _Add and _Del subs work only in a hash variable, to speed up
things in case of a modif (ie : delete+add). The effective SQL update is done
in the ModZebra sub (that existed before, and dealed with zebra index).
+# I think the code has to be more deeply tested, but it works at least
partially.
+#
# Revision 1.201 2007/04/27 14:00:49 hdl
# Removing $dbh from GetMarcFromKohaField (dbh is not used in this function.)
#
Index: C4/Search.pm
===================================================================
RCS file: /sources/koha/koha/C4/Search.pm,v
retrieving revision 1.134
retrieving revision 1.135
diff -u -b -r1.134 -r1.135
--- C4/Search.pm 2 May 2007 11:57:11 -0000 1.134
+++ C4/Search.pm 2 May 2007 16:44:31 -0000 1.135
@@ -25,7 +25,7 @@
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
# set the version for version checking
-$VERSION = do { my @v = '$Revision: 1.134 $' =~ /\d+/g;
+$VERSION = do { my @v = '$Revision: 1.135 $' =~ /\d+/g;
shift(@v) . "." . join( "_", map { sprintf "%03d", $_ } @v );
};
@@ -1454,26 +1454,21 @@
$title =~ /(.*)-(\d)/;
# get weight
my $ranking =$2;
- # hint : the result is sorted by title.biblionumber because we can
have X biblios with the same title
- # and we don't want to get only 1 result for each of them !!!
# note that we + the ranking because ranking is calculated on
weight of EACH term requested.
# if we ask for "two towers", and "two" has weight 2 in biblio N,
and "towers" has weight 4 in biblio N
# biblio N has ranking = 6
- $count_ranking{$biblionumber}=0 unless
$count_ranking{$biblionumber};
$count_ranking{$biblionumber} =+ $ranking;
}
# build the result by "inverting" the count_ranking hash
# hing : as usual, we don't order by ranking only, to avoid having
only 1 result for each rank. We build an hash on concat(ranking,biblionumber)
instead
# warn "counting";
foreach (keys %count_ranking) {
- warn "$_ =".sprintf("%10d",$count_ranking{$_}).'-'.$_;
$result{sprintf("%10d",$count_ranking{$_}).'-'.$_} = $_;
}
# sort the hash and return the same structure as GetRecords (Zebra
querying)
my $result_hash;
my $numbers=0;
foreach my $key (sort {$b <=> $a} (keys %result)) {
- warn "KEY : $key = ".$result{$key};
$result_hash->{'RECORDS'}[$numbers++] = $result{$key};
}
# for the requested page, replace biblionumber by the complete record
Index: updater/updatedatabase
===================================================================
RCS file: /sources/koha/koha/updater/updatedatabase,v
retrieving revision 1.162
retrieving revision 1.163
diff -u -b -r1.162 -r1.163
--- updater/updatedatabase 30 Apr 2007 16:16:50 -0000 1.162
+++ updater/updatedatabase 2 May 2007 16:44:31 -0000 1.163
@@ -1,6 +1,6 @@
#!/usr/bin/perl
-# $Id: updatedatabase,v 1.162 2007/04/30 16:16:50 tipaul Exp $
+# $Id: updatedatabase,v 1.163 2007/05/02 16:44:31 tipaul Exp $
# Database Updater
# This script checks for required updates to the database.
@@ -530,6 +530,14 @@
explanation => 'Active this if you want NOT to use zebra
(large libraries should avoid this parameters)',
type => 'YesNo',
},
+ {
+ uniquefieldrequired => 'variable',
+ variable => 'NoZebraIndexes',
+ value => '0',
+ forceupdate => {'explanation' => 1, 'type' => 1},
+ explanation => "Enter a specific hash for NoZebra indexes.
Enter : 'indexname' => '100a,245a,500*','index2' => '...'",
+ type => 'Free',
+ },
],
userflags => [
{
@@ -1993,6 +2001,15 @@
exit;
# $Log: updatedatabase,v $
+# Revision 1.163 2007/05/02 16:44:31 tipaul
+# NoZebra SQL index management :
+# * adding 3 subs in Biblio.pm
+# - GetNoZebraIndexes, that get the index structure in a new systempreference
(added with this commit)
+# - _DelBiblioNoZebra, that retrieve all index entries for a biblio and remove
in a variable the biblio reference
+# - _AddBiblioNoZebra, that add index entries for a biblio.
+# Note that the 2 _Add and _Del subs work only in a hash variable, to speed up
things in case of a modif (ie : delete+add). The effective SQL update is done
in the ModZebra sub (that existed before, and dealed with zebra index).
+# I think the code has to be more deeply tested, but it works at least
partially.
+#
# Revision 1.162 2007/04/30 16:16:50 tipaul
# bugfix for updatedatabase : when there is no default value (NULL fields) +
removing bibliothesaurus table+adding NoZebra systempref (False by default)
#
Index: misc/migration_tools/rebuild_nozebra.pl
===================================================================
RCS file: /sources/koha/koha/misc/migration_tools/rebuild_nozebra.pl,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -b -r1.2 -r1.3
--- misc/migration_tools/rebuild_nozebra.pl 2 May 2007 11:57:11 -0000
1.2
+++ misc/migration_tools/rebuild_nozebra.pl 2 May 2007 16:44:31 -0000
1.3
@@ -14,7 +14,7 @@
$|=1; # flushes output
# limit for database dumping
-my $limit;# = "LIMIT 1000";
+my $limit = "LIMIT 100";
my $directory;
my $skip_export;
my $keep_export;
@@ -47,22 +47,7 @@
my $i=0;
my %result;
-my %index = (
- 'title' =>
'200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a',
- 'author'
=>'200f,600a,601a,604a,700a,700b,700c,700d,700a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d',
- 'isbn' => '010a',
- 'issn' => '011a',
- 'biblionumber' =>'0909',
- 'itemtype' => '200b',
- 'language' => '010a',
- 'publisher' => '210x',
- 'date' => '210d',
- 'note' =>
'300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a',
- 'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109',
- 'subject' => '600*,601*,606*,610*',
- 'dewey' => '676a',
- 'host-item' => '995a,995c',
- );
+my %index = GetNoZebraIndexes();
$|=1;
while (my ($biblionumber) = $sth->fetchrow) {
@@ -71,14 +56,11 @@
my $record = GetMarcBiblio($biblionumber);
# get title of the record (to store the 10 first letters with the index)
- my $title;
- if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
- $title = lc($record->subfield('200','a'));
- } else {
- $title = lc($record->subfield('245','a'));
- }
+ my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title');
+ my $title = lc($record->subfield($titletag,$titlesubfield));
+
# remove blancks comma (that could cause problem when decoding the string
for CQL retrieval) and regexp specific values
- $title =~ s/ |,|;|\[|\]|\(|\)|\*//g;
+ $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g;
# limit to 10 char, should be enough, and limit the DB size
$title = substr($title,0,10);
#parse each field
@@ -92,16 +74,20 @@
# check each index to see if the subfield is stored somewhere
# otherwise, store it in __RAW__ index
foreach my $key (keys %index) {
- if ($index{$key} =~ /$tag\*/ or $index{$key} =~
/$tag$subfield/) {
+ if ($index{$key} =~ /$tag\*/ or $index{$key} =~
/$tag$subfieldcode/) {
$indexed=1;
my $line= lc $subfield->[1];
- $line =~
s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
+ # remove meaningless value in the field...
+ $line =~
s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
+ # ... and split in words
foreach (split / /,$line) {
- # see if the entry is already here
+ next unless $_; # skip empty values (multiple spaces)
+ # if the entry is already here, improve weight
if ($result{$key}->{$_} =~
/$biblionumber,$title\-(\d);/) {
my $weight=$1+1;
$result{$key}->{$_} =~
s/$biblionumber,$title\-(\d);//;
$result{$key}->{$_} .=
"$biblionumber,$title-$weight;";
+ # otherwise, create it, with weight=1
} else {
$result{$key}->{$_}.="$biblionumber,$title-1;";
}
@@ -111,8 +97,9 @@
# the subfield is not indexed, store it in __RAW__ index anyway
unless ($indexed) {
my $line= lc $subfield->[1];
- $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\//
/g;
+ $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/
/g;
foreach (split / /,$line) {
+ next unless $_;
# warn $record->as_formatted."$_ =>".$title;
if ($result{__RAW__}->{$_} =~
/$biblionumber,$title\-(\d);/) {
my $weight=$1+1;
@@ -131,8 +118,8 @@
foreach my $key (keys %result) {
foreach my $index (keys %{$result{$key}}) {
$sth->execute($key,$index,$result{$key}->{$index});
- if (length($result{$key}->{$index}) >40000) {
- print length($result{$key}->{$index})." for $key / $index";
+ if (length($result{$key}->{$index}) > 40000) {
+ print length($result{$key}->{$index})."\n for $key / $index\n";
}
}
}
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Koha-cvs] koha C4/Biblio.pm C4/Search.pm updater/updateda...,
paul poulain <=