#!/usr/bin/perl -w #******************************* # # Create a file of MARC records from # a file exported from DBText # # Field separator is 0x04 # Record ends with 0x03 # # (c) 2009 by Bob Ewart bob-ewart@earthlink.net # #****************************** use strict; use MARC::Record; use MARC::Field; #use Data::Dumper; use MARC::File::MARCMaker; use POSIX qw(strftime); use Getopt::Long; my $version = "version 2009-07-04"; my $copyright = "copyright (c) 2009 by Bob Ewart bob-ewart\@earthlink.net"; my $license = "licensed under GPL v2"; my $help = 0; my $show_version = 0; my $outfname = 'catalog.mrc'; my $errout = 'catalog_err.csv'; my $infname; my $max_records = 9999999; my $dfltDate = "02/05/2002"; my $barcode_low = 0; GetOptions("help!" => \$help, "version!" => \$show_version, "output=s" => \$outfname, "error=s" => \$errout, "records=i" => \$max_records, ); if ($show_version) { die "$version\n$copyright\n$license\n"; } my $narg = @ARGV; if ($narg) { $infname = $ARGV[0]; } else { $help = 1; } if ($help) { die "\nDBTextCatalog infile\n". " Convert DBText catalog dump to MARC records for Koha bulkimport\n". "Options:\n". " --help This text\n". " --version The version $version\n". " --output= File name for blank records (default $outfname)\n". " --error= File name for error records (default $errout)\n". " --records= Maximum number of records to input (default $max_records)\n". "\n"; } my %convert_type = ('BOOK' => 'BOOK', 'DVD' => 'DVD', 'EROTIC' => 'ARCHIVE', 'F-A' => 'ARCHIVE', 'F-IA' => 'ARCHIVE', 'KIDS' => 'BOOK', 'PAMPHLET' => 'PAMPHLET', 'PAMPNLET' => 'PAMPHLET', 'REC' => 'AUDIO', 'VID' => 'VIDEO', 'VIDEOTAPE'=> 'VIDEO', 'ZPR' => 'BOOK', 'XSERIAL' => 'OFFSITE', 'AUDIOBOOK'=> 'AUDIO', 'CD' => 'CD', 'YA' => 'BOOK', ); my @error_headers = ('ID','Class','Bar Code','Record Type','Title','Author','Record Created'); $/ = chr(3); my $sep = chr(4); my ($linein, @fields, @headers, $i, $n, %field_no); my ($field, $field_num); my ($marc_rcd, $marc_field, $field_contents); my ($type_field, $type, $class_field, $class, $location, $barcode); my %dup_barcode; my %type_count; my %drop_count = ('ARCHIVE' => 0, 'SERIAL' => 0, 'OFFSITE' => 0, 'PAMPHLET' => 0 ); my %loc_count; my $str; my $records = 1; my $record_out = 0; my $items_out = 0; my $invalid = 0; my $dup_in_record = 0; my $dup_other_record = 0; my $dup_circ = 0; my ($nfields, @sub_fields, @alt_titles); my ($a,$b,$c); my $last_seen = strftime '%Y-%m-%d' ,localtime; my @field_count; my ($field_desc, $field_alt); my $marc_leader = '00000nam 22000001a 4500'; my $home_branch = 'SLA'; my $organization = 'flflsla'; my %marc_fields = ( '010' => ['LC Card','a',' ',' '], #NR '020' => ['ISBN','a',' ',' '], #R '022' => ['ISSN','a',' ',' '], #R '040' => ['Source','a',' ',' '], #NR '050' => ['Class','a',' ','4'], #R '082' => ['Dewey','a','0',' '], #R '100' => ['Author','a','1',' '], #NR 7001 a '110' => ['Corporate Author','a','2',' '], #NR 7102 a '245' => ['Title','a','0','0', #NR 2463 a 'Subtitle','b', 'Responsibility','c'], '250' => ['Edition','a',' ',' '], #NR '260' => ['Place','a','3',' ', #R 'Publisher','b', 'Pub Date','c'], '300' => ['Physical Description','abc',' ',' '], #R '440' => ['Series','a',' ',' '], #R '500' => ['Notes','a',' ',' '], #R '520' => ['Abstract','a','3',' '], #R '650' => ['Descriptors','a',' ','4'], #R ); my $m_field = 0; my $m_code = 1; my $m_ind1 = 2; my $m_ind2 = 3; my %marc_norepeats = ( '010' => ['no'], '040' => ['no'], '100' => ['700','a','1',' '], '110' => ['710','a','2',' '], '245' => ['246','a','3',' '], '250' => ['no'], ); my %f952_fields =( # d => 'Record Created', e => 'Provenance', g => 'Actual Cost', o => 'Class', ); # Open all files open (CATALOG, '<', $infname) or die "Can't open $infname for input\n$!\n"; open (MARCOUT, '>', $outfname) or die "Can't open $outfname for output\n$!\n"; open (ERROUT, '>', $errout) or die "Can't open $errout for error output\n$!\n"; open (MRKOUT, '>', 'catalog.mrk') or die "Can't open catalog.mrk' for output\n$!\n"; #=============================================== # Process field name record #=============================================== $linein = ; chomp($linein); @headers = split /$sep/, $linein; my $nheaders = @headers; for ($i = 0; $i < @headers; $i++) { $field_no{$headers[$i]} = $i; $field_count[$i] = 0; } $type_field = $field_no{'Record Type'}; $class_field = $field_no{'Class'}; #=============================================== # Main Loop #=============================================== while ($linein = ) { chomp($linein); @fields = split /$sep/,$linein; # split it into fields $records++; $nfields = @fields; if ($nfields != $nheaders) { if ($nfields > $nheaders) { $nfields = $nheaders; # don't process extra fields } } for ($i = 0; $i < $nfields; $i++) { # Garbage reduction if ($fields[$i] !~ m/^\s*$/) { #non-blank field $fields[$i] =~ s/\r\n/\|/g; # change CRLFs to | $fields[$i] =~ s/\|\s*\|/\|/g; # get rid of blank sub-fields $fields[$i] =~ s/\|\|/|/g; $fields[$i] =~ s/^\s*\"\s*//; # get rid of enclosing double quotes $fields[$i] =~ s/\s*\"\s*$//; $fields[$i] =~ s/^\s*\'\s*//; # get rid of enclosing single quotes $fields[$i] =~ s/\s*\'\s*$//; $fields[$i] =~ s/^\s*\|//; # leading or $fields[$i] =~ s/\|\s*$//; # trailing | } } #===================== # determine the type #===================== $type = $fields[$type_field]; $class = $fields[$class_field]; if (!$type) { # Record Type is missing $type = 'BOOK'; # asssume book for now if (!$class) { # both class and type are missing print_error('class&type missing'); $class = 'missing'; } else { # Can we determine record type from the class if ($class =~ m/([a-z\-]+)/i) { if (length($1) > 2) { # not LoC code if (exists $convert_type{uc($class)}) { $type = $convert_type{uc($class)}; } else { print_error(); } } # LoC code leave it as a book } # No letters--weird } } else { $type =~ m/([a-z]+)/i; # strip all the garbage characters $type = uc($1); # make it uppper case } if (!$type) { $type = 'BOOK'; # } elsif ($fields[$class_field] =~ m/\(O/i) { # $type = 'OVERSIZE'; } $type = uc($type); if (exists $convert_type{$type}) { $type = $convert_type{$type}; # make it standard } if (exists $drop_count{$type}) { # Drop any record in the drop list $drop_count{$type}++; # Count dropped records next; } $type_count{$type}++; # Count output MARC records #================================== # OK -- Create the MARC Record #================================== $marc_rcd = MARC::Record->new(); $marc_rcd->leader($marc_leader); # save the DBText ID as the 001 field $marc_field = MARC::Field->new('001',$fields[$field_no{'ID'}]); $marc_rcd->append_fields($marc_field); # flflsla is our code $marc_field = MARC::Field->new('003',$organization); $marc_rcd->append_fields($marc_field); # # import data for MARC fields # foreach $field (sort keys %marc_fields) { $field_desc = $marc_fields{$field}; # a pointer to an array $field_num = $field_no{$field_desc->[$m_field]}; # the input field number from the field name $nfields = @$field_desc; # the size of the array $field_contents = $fields[$field_num]; # get the contents of the input field if ($field_contents && ($field_contents !~ m/^\s*$/)) { # we have data if ($field_contents =~ m/\|/) { # repeated data @sub_fields = split /\|/,$field_contents; if (exists $marc_norepeats{$field}) { # can't repeat in this MARC field $field_alt = $marc_norepeats{$field}; # another pointer if ($field_alt->[$m_field] eq 'no') { # no alternate either if ($field eq '040') { $field_contents = pop @sub_fields; # just get the last one } $marc_field = MARC::Field->new($field, $field_desc->[$m_ind1],$field_desc->[$m_ind2], $field_desc->[$m_code],$field_contents); $marc_rcd->append_fields($marc_field); # just flush the whole field out } else { # We have an alternate repeat field $field_contents = shift @sub_fields; # regular out for the first out_marc($field); $field_desc = $marc_norepeats{$field}; # alternate out for the rest $nfields = 2; while ($field_contents = shift @sub_fields) { if ($field_contents) { out_marc($field_desc->[$m_field]); } } } } else { # we can repeat this field while ($field_contents = shift @sub_fields) { if ($field_contents) { # shouldn't have any blank sub-fields out_marc($field); } } } } else { # Just a single field out_marc($field); } } # no data } # end of field loop #=============================================== # End of field import to standard MARC record # ---------------------------------------------- # Field count #=============================================== for ($i = 0; ($i < @headers) and ($i < @fields); $i++) { if ($fields[$i] !~ /^\s*$/) {$field_count[$i]++; } } #=============================================== # Now concentrate on the Koha item records #=============================================== # Time to do the 952 record if ($type eq 'BOOK') { if ($class =~ m/^fic/i) { $location = 'FIC'; } elsif ($class =~ m/^\W*b\s/i) { $location = 'B'; } elsif ($class =~ m/^\W*ya\s/i) { $location = 'YA'; } elsif ($class =~ m/^\W*kids/i) { $location = 'YA'; } elsif ($class =~ m/^\(o/i) { $location = 'OS'; } elsif ($class =~ m/^\s*ps\s*648/i) { $location = 'PULP'; } else { $location = 'NF'; } } else { $location = uc($type); } $loc_count{$location}++; $marc_field = MARC::Field->new('952',' ',' ','a',$home_branch,'c',$location); @sub_fields = ('x',++$record_out); if (($fields[$field_no{'Record Created'}]) && ($fields[$field_no{'Record Created'}] =~ m/(\d*)\D*(\d*)\D*(\d*)/)) { my $crmonth = $1; my $crday = $2; my $cryear = $3; if ($cryear < 50) { $cryear += 2000; } elsif ( $cryear < 2000) { $cryear += 1900; } push @sub_fields, 'd',sprintf("%4d-%02d-%02d",$cryear, $crmonth, $crday); } else { push @sub_fields, 'd',$dfltDate; } foreach $field (sort keys %f952_fields) { $field_contents = $fields[$field_no{$f952_fields{$field}}]; if ($field_contents) { push @sub_fields, $field, $field_contents; } } # fixed fields push @sub_fields, ('r',$last_seen, 'y',$type); $marc_field->add_subfields(@sub_fields); # process the barcode(s) $field_contents = $fields[$field_no{'Bar Code'}]; if (!$field_contents) { # generate one if ($fields[$field_no{'ISBN'}]) { $field_contents = $fields[$field_no{'ISBN'}].(($barcode_low %6) + 4); } else { $field_contents = '99'.strftime("%d%H%M%S",localtime).substr($barcode_low,-4); } $barcode_low++; } my $new_field = $marc_field->clone; @sub_fields = split /\|/,$field_contents; $n = 0; # print "Bar codes '$field_contents'->".join(', ',@sub_fields).".\n"; foreach $field_contents (sort @sub_fields) { if (!$field_contents) { next; } $field_contents =~ m/(\w{3,})/; $barcode = $1; if (length($barcode) < 3) { print "$fields[$field_no{'ID'}] '$fields[$field_no{'Bar Code'}]'->'$field_contents'->'$barcode'\n"; next; } if (!$barcode) { next; } $n++; if ($dup_barcode{$barcode}) { # We found a duplicate barcode if ($dup_barcode{$barcode} == $record_out) { print "Record $record_out barcodes $barcode duplicated\n"; $dup_in_record++; } else { print "Record $record_out barcode $barcode matches record $dup_barcode{$barcode}\n"; $dup_other_record++; } $barcode .= substr($barcode_low,-1); $barcode_low++; if ($dup_barcode{$barcode}) { print "---$barcode still matches record $dup_barcode{$barcode}\n"; $barcode .= 'Z'; } if ($field_contents =~ m/\*/) { $dup_circ++; } } $dup_barcode{$barcode} = $record_out; # print "$n==$field_contents->$barcode.\n"; $marc_field = $new_field->clone; $marc_field->add_subfields('p',$barcode,'t',$n); if ($field_contents !~ m/\*/) { $marc_field->add_subfields('7','1'); } $marc_rcd->add_fields($marc_field); $items_out++; } if ($n == 0) { $marc_field = $new_field->clone; $barcode = '99'.strftime("%d%H%M%S",localtime).substr($barcode_low,-4); $barcode_low++; $marc_field->add_subfields('p',$barcode,'t','1','7','1'); $marc_rcd->add_fields($marc_field); $items_out++; } # output record print MARCOUT $marc_rcd->as_usmarc(); print MRKOUT MARC::File::MARCMaker->encode($marc_rcd); # Debug stuff # # if (($records == 2825) || ($records == 3152) || # ($records == 3663) || ($records == 4038) || ($records == 5489)) { # print_record(); # } ($records <= $max_records) or last; } #=========================== # End of Main Loop #--------------------------- # Print Statistics #============================ $records--; print "\nCount Percent Field\n"; for ($i=0; $i < @field_count; $i++) { printf("%5d %5.1f %s\n",$field_count[$i],(100.0 * $field_count[$i])/$records, $headers[$i]); } print "\nTypes kept\n"; $n = 0; foreach $type (sort keys %type_count) { printf("%5d '%s'\n",$type_count{$type},$type); $n += $type_count{$type}; } printf("%5d *Total Kept\n",$n); print "\nTypes dropped\n"; $i = 0; foreach $type (sort keys %drop_count) { printf("%5d '%s'\n",$drop_count{$type},$type); $i+= $drop_count{$type}; } printf("%5d *Total Dropped\n",$i); printf("%5d **Total Records**\n",$i+$n); print "\nLocations\n"; $i = 0; foreach $type (sort keys %loc_count) { printf("%5d '%s'\n",$loc_count{$type},$type); $i+= $loc_count{$type}; } printf("%5d **Total**\n\n",$i); printf("%5d Records in\n",$records); printf("%5d Records out\n",$record_out); printf("%5d Items out\n\n",$items_out); printf("%5d Duplicate barcodes within a record\n",$dup_in_record); printf("%5d Duplicate barcodes to other records\n",$dup_other_record); printf("%5d Duplicate barcodes circulating\n\n",$dup_circ); #=============================== # End of main line #------------------------------- # Subroutines #=============================== sub print_error { my (@error_fields,$field_name, $field_value); @error_fields = (); foreach $field_name (@error_headers) { $field_value = $fields[$field_no{$field_name}]; if (!$field_value) { $field_value = 'missing';} if ($field_value =~ m/,/) { $field_value = '"'.$field_value.'"'; } push @error_fields, $field_value; } print ERROUT join(',',@error_fields)."\n"; $invalid++; } sub physical_description { # print "p_d $_[0],$field_desc->[$m_code],". # "$field_desc->[$m_ind1],$field_desc->[$m_ind2],'$field_contents'\n"; if ($field_contents =~ m/(.*?:)\s*(.*?;)\s*(.*)/){ $a = $1; $b = $2; $c = $3; $marc_field=MARC::Field->new($_[0],$field_desc->[$m_ind1], $field_desc->[$m_ind2],'a',$a,'b',$b,'c',$c); } elsif ($field_contents =~ m/(.*?;)\s*(.*)/) { $a = $1; $c = $2; $marc_field=MARC::Field->new($_[0],$field_desc->[$m_ind1], $field_desc->[$m_ind2],'a',$a,'c',$c); } elsif ($field_contents =~ m/(.*?p.)\s*(.*?)\s*(\d.*)/) { $a = $1; $b = $2; $c = $3; if ($b) { $marc_field=MARC::Field->new($_[0],$field_desc->[$m_ind1], $field_desc->[$m_ind2],'a',$a,'b',$b,'c',$c); } else { $marc_field=MARC::Field->new($_[0],$field_desc->[$m_ind1], $field_desc->[$m_ind2],'a',$a,'c',$c); } } elsif ($field_contents =~ m/(.*?p.)\s*(.+)/){ $a = $1; $c = $2; $marc_field=MARC::Field->new($_[0],$field_desc->[$m_ind1], $field_desc->[$m_ind2],'a',$a,'c',$c); } else { $marc_field=MARC::Field->new($_[0],$field_desc->[$m_ind1], $field_desc->[$m_ind2],'a',$field_contents); } } sub print_record { print "\nRecord $records\n"; for ($i = 0; $i < @headers; $i++) { print $headers[$i]; if ($fields[$i]) { print "=>$fields[$i].\n"; } else { print ".\n"; } } } sub out_marc { # print "out_marc for $_[0], $field_desc->[$m_ind1],$field_desc->[$m_ind2],". # "$field_desc->[$m_code],$field_contents.\n"; if (length($field_desc->[$m_code]) > 1) { physical_description($_[0]); } else { $marc_field = MARC::Field->new($_[0],$field_desc->[$m_ind1],$field_desc->[$m_ind2], $field_desc->[$m_code],$field_contents); if ($nfields > 4) { for ($i = 4; $i < $nfields; $i+=2) { $field_num = $field_no{$field_desc->[$i]}; $field_contents = $fields[$field_num]; if ($field_contents) { $marc_field->add_subfields($field_desc->[$i+1],$field_contents); } } } } $marc_rcd->append_fields($marc_field); }