[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Koha-cvs] koha/misc build_marc_Tword.pl build_marc_word.pl [rel_3_0]
From: |
paul poulain |
Subject: |
[Koha-cvs] koha/misc build_marc_Tword.pl build_marc_word.pl [rel_3_0] |
Date: |
Fri, 17 Nov 2006 12:56:37 +0000 |
CVSROOT: /sources/koha
Module name: koha
Branch: rel_3_0
Changes by: paul poulain <tipaul> 06/11/17 12:56:37
Removed files:
misc : build_marc_Tword.pl build_marc_word.pl
Log message:
removing useless scripts
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/misc/build_marc_Tword.pl?cvsroot=koha&only_with_tag=rel_3_0&r1=1.3&r2=0
http://cvs.savannah.gnu.org/viewcvs/koha/misc/build_marc_word.pl?cvsroot=koha&only_with_tag=rel_3_0&r1=1.1&r2=0
Patches:
Index: build_marc_Tword.pl
===================================================================
RCS file: build_marc_Tword.pl
diff -N build_marc_Tword.pl
--- build_marc_Tword.pl 1 Jun 2005 18:55:08 -0000 1.3
+++ /dev/null 1 Jan 1970 00:00:00 -0000
@@ -1,129 +0,0 @@
-#!/usr/bin/perl -w
-#-----------------------------------
-# Script Name: build_marc_Tword.pl
-# Script Version: 0.1.0
-# Date: 2004/06/05
-
-# script to build a marc_Tword table.
-# create the table :
-# CREATE TABLE `marc_Tword` (
-# `word` varchar(80) NOT NULL default '',
-# `usedin` text NOT NULL,
-# `tagsubfield` varchar(4) NOT NULL default '',
-# PRIMARY KEY (`word`,`tagsubfield`)
-#) TYPE=MyISAM;
-# just to test the idea of a reversed index searching.
-# reversed index for searches on Title.
-# the marc_Tword table contains for each word & marc field/subfield, the list
of biblios using it, with the title
-# reminder : the inverted index is only done to search on a "contain". For a
"=" or "start by", the marc_subfield_table is perfect & correctly indexed.
-# if this POC becomes more than a POC, then I think we will have to build 1
table for each sorting (marc_Tword for title, Aword for author, Cword for
callnumber...)
-
-# FIXME :
-# * indexes empty words too (it's just a proof of concept)
-# * maybe it would be OK to store only 20 char of the title.
-
-use strict;
-use locale;
-use C4::Context;
-use C4::Biblio;
-my $dbh=C4::Context->dbh;
-use Time::HiRes qw(gettimeofday);
-
-# fields & subfields to ignore
-# in real situation, we should add a marc constraint on this.
-# ideally, we should not index isbn, as every one would be different, so it makes
the table very big.
-# but in this case we have to find a way to automatically search "isbn = XXX"
in marc_subfield_table
-
-my %ignore_list = (
- '001' =>1,
- '010b'=>1,
- '0909' => 1,
- '090a' => 1,
- '100' => 1,
- '105' => 1,
- '6069' => 1,
- '7009' => 1,
- '7019' => 1,
- '7109' => 1,
- '7129' => 1,
- '9959' => 1,
-);
-
-my $starttime = gettimeofday;
-
-$dbh->do("delete from marc_Tword");
-
-# parse every line
-my $query="SELECT
biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM
marc_subfield_table left join marc_biblio on
marc_biblio.bibid=marc_subfield_table.bibid left join biblio on
marc_biblio.biblionumber=biblio.biblionumber where tag=?";
-my $sth=$dbh->prepare($query);
-
-for (my $looptag=0;$looptag<=999;$looptag++) {
- print "******** SELECTING ".(sprintf "%03s",$looptag)."\n";
- $sth->execute(sprintf "%03s",$looptag);
- print "******** DONE \n";
- $|=1; # flushes output
-
- my $sthT=$dbh->prepare("select usedin from marc_Tword where
tagsubfield=? and word=?");
- my $updateT=$dbh->prepare("update marc_Tword set usedin=? where
tagsubfield=? and word=?");
- my $insertT=$dbh->prepare("insert into marc_Tword
(tagsubfield,word,usedin) values (?,?,?)");
- my $i=0;
- my $timeneeded;
- # 1st version, slower, but less RAM consuming
- # while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue,
$title) = $sth->fetchrow) {
- # next if $ignore_list{"$tag.$subfieldcode"};
- # $subfieldvalue =~
s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
- # # remove useless chars in the title.
- # $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
- # my @words = split / /, $subfieldvalue;
- # # and retrieve the reversed entry
- # foreach my $word (@words) {
- # $sthT->execute($tag.$subfieldcode,$word);
- # if (my ($usedin) = $sthT->fetchrow) {
- # # add the field & save it once again.
- # $usedin.=",$biblionumber-$title";
- #
$updateT->execute($usedin,$tag.$subfieldcode,$word);
- # } else {
- #
$insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
- # }
- # }
- # $timeneeded = gettimeofday - $starttime unless ($i % 100);
- # print "$i in $timeneeded s\n" unless ($i % 100);
- # print ".";
- # $i++;
- # }
-
- # 2nd version : faster (about 100 times !), but maybe too much RAM
consuming...
- my %largehash;
-# print "READING\n";
- $timeneeded = gettimeofday - $starttime unless ($i % 30000);
- print "READING $timeneeded s\n";
- while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title)
= $sth->fetchrow) {
- next unless $subfieldvalue;
- next if $ignore_list{$tag.$subfieldcode};
- $subfieldvalue =~
s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $subfieldvalue;
- # remove useless chars in the title.
- $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g
if $title;
- my @words = split / /, $subfieldvalue;
- # and retrieve the reversed entry
- foreach my $word (@words) {
- my $localkey = $tag.$subfieldcode.'|'.uc($word);
-
$largehash{$localkey}.=",".substr($title,0,15)."-$biblionumber";
- }
- $timeneeded = gettimeofday - $starttime unless ($i % 30000);
- print "$i in $timeneeded s\n" unless ($i % 30000);
- print "." unless ($i % 500);
- $i++;
- }
- $i=0;
- print "WRITING\n";
- foreach my $k (keys %largehash) {
- $k =~ /(.*)\|(.*)/;
- $insertT->execute($1,$2,$largehash{$k});
- $timeneeded = gettimeofday - $starttime unless ($i % 30000);
- print "$i in $timeneeded s\n" unless ($i % 30000);
- print "." unless ($i % 500);
- $i++;
- }
-}
-
-$dbh->disconnect();
Index: build_marc_word.pl
===================================================================
RCS file: build_marc_word.pl
diff -N build_marc_word.pl
--- build_marc_word.pl 11 Jun 2004 15:07:48 -0000 1.1
+++ /dev/null 1 Jan 1970 00:00:00 -0000
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-#-----------------------------------
-# Script Name: build_marc_word.pl
-# Script Version: 0.1.0
-# Date: 2004/06/05
-# Author: Joshua Ferraro [jmf at kados dot org]
-# Description: This script builds a new marc_word
-# table with a reduced number of tags (only those
-# tags that should be searched) allowing for
-# faster and more accurate searching when used
-# with the SearchMarc routines. Make sure that
-# the MARCaddword routine in Biblio.pm will index
-# characters >= 1 char; otherwise, searches like
-# "O'brian, Patrick" will fail as the search
-# routines will separate that query into "o",
-# "brian", and "patrick". (If "o" is not in the
-# database the search will fail)
-# Usage: build_marc_word.pl
-# Revision History:
-# 0.1.0 2004/06/11: first working version.
-# Thanks to Chris Cormack
-# for helping with the $data object
-# and Stephen Hedges for providing
-# the list of MARC tags.
-# FixMe:
-# *Should add a few parameters like 'delete from
-# marc_word' or make script ask user whether to
-# perform that task ...
-# *Add a 'status' report as the data is loaded ...
-#-----------------------------------
-use lib '/usr/local/koha/intranet/modules/';
-use strict;
-use C4::Context;
-use C4::Biblio;
-my $dbh=C4::Context->dbh;
-
-#Here is where you name the tags that you wish to index. If you
-# are using MARC21 this set of default tags should be fine but you
-# may need to add holdings tags specific to your library (e.g., holding
-# branch for Nelsonville is 942k but that may not be the case for your
-# library).
-my @tags=(
-
-#Tag documentation from http://lcweb.loc.gov/marc/bibliographic/ecbdhome.html
-
-"020a", # INTERNATIONAL STANDARD BOOK NUMBER
-"022a", # INTERNATIONAL STANDARD SERIAL NUMBER
-"100a", # MAIN ENTRY--PERSONAL NAME
-"110a", # MAIN ENTRY--CORPORATE NAME
-"110b", # Subordinate unit
-"110c", # Location of meeting
-"111a", # MAIN ENTRY--MEETING NAME
-"111c", # Location of meeting
-"130a", # MAIN ENTRY--UNIFORM TITLE
-"240a", # UNIFORM TITLE
-"245a", # TITLE STATEMENT
-"245b", # Remainder of title
-"245c", # Statement of responsibility, etc.
-"245p", # Name of part/section of a work
-"246a", # VARYING FORM OF TITLE
-"246b", # Remainder of title
-"260b", # PUBLICATION, DISTRIBUTION, ETC. (IMPRINT)
-"440a", # SERIES STATEMENT/ADDED ENTRY--TITLE
-"440p", # Name of part/section of a work
-"500a", # GENERAL NOTE
-"505t", # FORMATTED CONTENTS NOTE (t is Title)
-"511a", # PARTICIPANT OR PERFORMER NOTE
-"520a", # SUMMARY, ETC.
-"534a", # ORIGINAL VERSION NOTE
-"534k", # Key title of original
-"534t", # Title statement of original
-"586a", # AWARDS NOTE
-"600a", # SUBJECT ADDED ENTRY--PERSONAL NAME
-"610a", # SUBJECT ADDED ENTRY--CORPORATE NAME
-"611a", # SUBJECT ADDED ENTRY--MEETING NAME
-"630a", # SUBJECT ADDED ENTRY--UNIFORM TITLE
-"650a", # SUBJECT ADDED ENTRY--TOPICAL TERM
-"651a", # SUBJECT ADDED ENTRY--GEOGRAPHIC NAME
-"700a", # ADDED ENTRY--PERSONAL NAME
-"710a", # ADDED ENTRY--CORPORATE NAME
-"711a", # ADDED ENTRY--MEETING NAME
-"720a", # ADDED ENTRY--UNCONTROLLED NAME
-"730a", # ADDED ENTRY--UNIFORM TITLE
-"740a", # ADDED ENTRY--UNCONTROLLED RELATED/ANALYTICAL TITLE
-"752a", # ADDED ENTRY--HIERARCHICAL PLACE NAME
-"800a", # SERIES ADDED ENTRY--PERSONAL NAME
-"810a", # SERIES ADDED ENTRY--CORPORATE NAME
-"811a", # SERIES ADDED ENTRY--MEETING NAME
-"830a", # SERIES ADDED ENTRY--UNIFORM TITLE
-"942k" # Holdings Branch ?? Unique to NPL??
-);
-
-#note that subfieldcode in marc_subfield_table is subfieldid in marc_word ...
even
-#though there is another subfieldid in marc_subfield_table--very confusing
naming conventions!
-
-#For each tag we run a search to find the necessary data for building the
marc_word table
-foreach my $this_tagid(@tags) {
- my $query="SELECT
bibid,tag,tagorder,subfieldcode,subfieldorder,subfieldvalue FROM
marc_subfield_table WHERE tag=? AND subfieldcode=?";
- my $sth=$dbh->prepare($query);
-
- my ($tag, $subfieldid);
-
-#split the tag into tag, subfield
- if ($this_tagid =~ s/(\D+)//) {
- $subfieldid = $1;
- $tag = $this_tagid;
- }
-#Then we pass this information on to MARCaddword in Biblio.pm to actually
perform the import into marc_word
- $sth->execute($tag, $subfieldid);
- while (my $data=$sth->fetchrow_hashref()){
-
MARCaddword($dbh,$data->{'bibid'},$data->{'tag'},$data->{'tagorder'},$data->{'subfieldcode'},$data->{'subfieldorder'},$data->{'subfieldvalue'});
- }
-}
-$dbh->disconnect();
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Koha-cvs] koha/misc build_marc_Tword.pl build_marc_word.pl [rel_3_0],
paul poulain <=