#!/usr/bin/perl -w
use strict;

# Biblio-helper - A program to help find parts of biblio entries
# that may be publicly available.
#
# There is a very well developed branch of computing that comes
# from the library sciences area. Part of this effort comes from
# one of the largest libraries, that of the US Library of Congress
# (LOC). They have a format called MARC - MAchine Readable Cataloging.
# Its home on the web is http://www.loc.gov/marc/
# Somehow, there have gotten to be various flavours of MARC, so sometimes
# you need to look out for that. Of course, some people aren't satisfied
# with making localisations of a "standard", some people have to
# re-invent the wheel. So, there are other standards out there.
#
# MARC comes from a time when space was precious, and so their (US-LOC)
# format is full of cryptic details. There are efforts to move MARC into
# things like an XML based format, but for now you need to be able
# to work with MARC records (and possibly others). There is a perl
# module (actually, an older one and a newer one) called MARC and
# MARC::Record (Record is the newer one). In any event, this wraps
# a bunch of logic around a really cryptic format. If your biblio
# work involves books that have an ISBN or serials (journals) which
# have an ISSN, there are Business::ISBN and Business::ISSN modules
# which can help with that. I have read of one person using a barcode
# scanner to scan his books, using the Business::ISBN module to
# tidy up/verify the ISBN, and then the Net::Z3950 stuff to get
# biblio entries from US-LOC. Something about insurance companies
# not believing a person could have N books (N being some number
# bigger than 1).
#
# Above, I mentioned Z3950. This is a formalised standard (Z39.50 of
# ANSI/NISO I believe), for doing a type of database search over a
# network. And, it happens to be the one used most for things that
# either are libraries, or look like libraries.
#
# There are variations on the biblio formats at the servers, and
# there are variations on the hooks to search them. Would it surprise
# you to find there are also variations on the languages used to
# perform a search? I thought not. :-)
#
# There are a bunch of search languages, and again they tend to be
# more than a little cryptic. Net::Z3950 is built on top of YAZ
# (a toolkit from IndexData.dk), which by default uses a language
# called PQN (Prefix Query Notation) which apparently was invented
# by IndexData.dk to map simply to Z39.50 type-1 query structure.
# The other common search language (which seems to come in 2
# flavours?) is Common Command Language (CCL). CCL is apparently
# an implementation of ISO 8777.
#
# Documentation for PQN is in the YAZ documentation (YAZ User's
# Guide and Reference) under Query Syntax Parsers. This is not
# section 4.1, contrary to what the man page for Net::Z3950 says.
# Documentation for CCL is in the same section, which again is
# NOT section 4.1. CCL search queries are sent to the server as
# is; ccl2rpn queries are converted locally into PQN and then sent.
# Not all Z39.50 servers understand CCL, so sending a CCL query may
# not get you any joy. Having the local parser (yaz) convert
# the CCL to PQN won't work, as there is no hook installed yet to
# read a file which converts modifiers to attributes. So, you can
# try the more easily understandable CCL, but it may not work.
# Otherwise, you are stuck with figuring out PQN.
#
# And, it appears that some Z39.50 servers do not implement the
# standard completely/correctly. So, don't be too surprised if
# things don't work for you on the first try.
#
# Maybe I'm just being cynical because all this is new. But it
# sure seems obfuscated to me.
#
# Copyright Gordon Haverland, Matter Realisations, 2003
# Same terms as perl for the copyright, no warranty.
# perl@materialisations.com

use strict;
use warnings;

use Net::Z3950;
use MARC::File::USMARC;
# use Business::ISBN;           # To process via ISBN
# use MARC::Record;             # To understand a MARC record

# A couple of global variables, initialized in the BEGIN block at the
# bottom of this file.
#   %tv       - field description => index number for the US-LOC Voyager
#               database (e.g. 'Title' => 4)
#   %synonyms - lowercased field description => mixed case description
our( %tv, %synonyms );

my( $hostname, $port, $databaseName );

# How add_search_terms() glues a new term onto an existing search
# string: 'prefix' puts the new term in front, 'suffix' behind.
my $Search_Buildup = 'prefix';  # default

# Define our hostname, port, and databaseName.  All that is really
# required, is a hostname and port for the Connection, everything
# else can go in an ->option() statement.  Or, we can put everything
# in the Connection() statement, if we know it now.  I want to get
# "Full" MARC records from the US-LOC.  The type of record (Full or
# Brief) is supposed to be case insensitive, but some servers are
# case sensitive.  Best to send 'F' or 'B'.  The default port is 210.
$hostname     = 'z3950.loc.gov';
$port         = 7090;
$databaseName = 'Voyager';

my $conn = Net::Z3950::Connection->new(
    $hostname, $port,
    databaseName          => $databaseName,
    preferredRecordSyntax => Net::Z3950::RecordSyntax::USMARC(),
    elementSetName        => 'F',
);
# $conn->option(
#     preferredRecordSyntax => Net::Z3950::RecordSyntax::USMARC,
#     elementSetName        => 'f'
# );

# Did I get a connection?  If not, stop here: there is nothing to
# search and nothing to close.
unless( $conn ) {
    print STDERR "Sorry, couldn't open connection to $hostname\n";
    exit 1;
}

# Assume we are reading titles, one per line, out of the file "data".
# A lexical handle is used because the bareword DATA is Perl's special
# handle for the __DATA__ section.
open( my $titles_fh, '<', 'data' )
    or die "Can't open data for read: $!\n";

while( my $line = <$titles_fh> ) {
    chomp( $line );
    next if( $line =~ /^\s*$/ );    # nothing to search for

    # Generate a -prefix, -ccl, or -ccl2rpn query
    # -prefix (or PQN, the default) sets up a stack in a string.
    #   @stack = (inst 1, inst 2, ...)
    #   $query = { while( @stack ) { pop( @stack ) . ' '; } }
    # Sort of thing.
    # keywords - tokens with an @ sign and special meaning to the parser
    #   attrset, term, attr, and, or, not, prox, set
    # This is a MARC database, we need to search '@attr 1=7 ' or something.
    # The @attr 1=7 business is for search LOC (Library of Congress by ISBN)
    # I'm guessing that type 1 data is something like "string", since that
    # is the only type of query allowed by Voyager (US-LOC).
    # 7 = ISBN, 4 = ? (title?)
    # List of types from Voyager, is in the %tv table at the bottom.
    #
    # if term => SCALAR, it's a generic search
    # if term => {           # It's an attribute search_for string in field
    #     term_type  => 'attr',
    #     data_type  => 1,
    #     field_desc => 'string',
    #     search_for => 'string'
    # }
    # if term => {
    #     term_type => 'attrset',   (or set or term)
    #     arg       => 'string'
    # }
    my $search_string = add_search_terms(
        '',
        term => {
            term_type  => 'attr',
            data_type  => 1,
            field_desc => 'Title',
            search_for => $line,
        },
    );
    unless( defined( $search_string ) ) {
        print STDERR "Couldn't build a query for \"$line\"\n";
        next;
    }

    # search() hands back nothing on failure; the connection then
    # carries the reason.
    my $rs = $conn->search( -prefix => $search_string );
    unless( $rs ) {
        print STDERR "Search for \"$line\" failed: ", $conn->errmsg(), "\n";
        next;
    }

    # Records in a result set are numbered from 1.
    my $n_records = $rs->size();
    for my $i ( 1 .. $n_records ) {
        my $rec = $rs->record( $i );
        if( $rec ) {
            print '=====', $i, " =====\n", $rec->render();
            # Decode from the raw USMARC data, not from the rendered
            # (human readable) text.  The object is not used yet.
            my $marc_rec_obj = MARC::File::USMARC::decode( $rec->rawdata() );
        } else {
            printf "bad record %d\n", $i;
        }
    }
}   # End loop over reading titles from file.
close( $titles_fh );

$conn->close();
print "All done\n";
exit 0;

# ---------------------- Subroutines ------------------------

# add_search_terms( $search_string, term => $term )
# add_search_terms( $search_string, 'and' | 'or' | 'not' )
#
# Adds one PQN element to $search_string and returns the new string.
# $term is either a plain scalar (quoted and used as a generic search
# term) or a hash reference with a term_type of:
#   attr              - see get_attr() for the other keys
#   attrset|term|set  - needs an 'arg' key; becomes "@<term_type> <arg>"
#   prox              - not implemented; $search_string comes back as is
# The new element goes in front of $search_string, or behind it when
# $Search_Buildup is 'suffix'.  Elements are joined by a single space
# and an empty side contributes nothing.
#
# Returns nothing (undef in scalar context) on malformed arguments.  If
# an attr term can't be built, $search_string is returned unchanged.
sub add_search_terms {
    return if( @_ < 2 );

    my $search_string = shift;
    my $keyword       = $_[0];
    return unless( defined( $keyword ) );
    $search_string = '' unless( defined( $search_string ) );

    my $string = '';
    if( $keyword eq 'term' ) {
        # Exactly one "term => value" pair is allowed.
        return unless( @_ == 2 );
        my $term = $_[1];
        return unless( defined( $term ) );

        my $term_ref = ref( $term );
        if( $term_ref eq 'SCALAR' ) {
            print "huh?\n";
        } elsif( $term_ref eq '' ) {
            # Ordinary scalar, so ordinary search.  It is combined with
            # the existing search string like every other element.
            $string = '"' . $term . '"';
        } elsif( $term_ref eq 'HASH' ) {
            return unless( exists( $term->{term_type} ) );
            if( $term->{term_type} =~ /^attr$/i ) {
                $string = get_attr( $term );
                return $search_string unless( $string );
            } elsif( $term->{term_type} =~ /^(attrset|term|set)$/i ) {
                return unless( exists( $term->{arg} ) );
                $string = "\@$term->{term_type} $term->{arg}";
            } elsif( $term->{term_type} =~ /^prox$/i ) {
                print "Sorry, don't know proximity yet.\n";
            } else {
                print "Strange term_type $term->{term_type}\n";
                return;
            }
        }
    } else {
        # Not a term, so we have AND, OR or NOT
        if( $keyword =~ /^(and|or|not)$/i ) {
            $string = "\@$keyword";
        } else {
            print "Strange single word operator $keyword. Expecting AND, OR or NOT\n";
            return;
        }
    }

    return $search_string if( $string eq '' );
    return $string        if( $search_string eq '' );
    return $search_string . ' ' . $string if( $Search_Buildup eq 'suffix' );
    return $string . ' ' . $search_string;  # default, prefix
}

# get_attr( \%term )
#
# Builds the '@attr <data_type>=<index> "<search_for>"' element for an
# attribute search.  %term needs:
#   term_type  => 'attr'
#   field_desc => a key of %tv, or something syn_lookup() can resolve
#                 to one
#   search_for => the string to look for
#   data_type  => attribute type number; 1 is used when it is missing
#                 (the only type this script sends to Voyager)
# Returns nothing (undef in scalar context) when the element can't be
# built.
sub get_attr {
    my $hash = shift;

    return unless( ref( $hash ) eq 'HASH' );
    return unless( defined( $hash->{term_type} )
                   && $hash->{term_type} =~ /^attr$/i );
    return unless( defined( $hash->{field_desc} )
                   && defined( $hash->{search_for} ) );

    my $desc = $hash->{field_desc};
    unless( exists( $tv{$desc} ) ) {
        # Not an exact description; see if it resolves to one before
        # complaining.
        my $canonical = syn_lookup( $desc );
        unless( $canonical && exists( $tv{$canonical} ) ) {
            print "Sorry, $hash->{field_desc} isn't a valid description (type=value) field\n";
            return;
        }
        $desc = $canonical;
    }

    my $data_type = defined( $hash->{data_type} ) ? $hash->{data_type} : 1;
    return sprintf( '@attr %d=%d "%s"',
                    $data_type, $tv{$desc}, $hash->{search_for} );
}

# syn_lookup( $key )
#
# Looks $key up in %synonyms with Array::Lookup's lookup() and returns
# whatever lookup() returns.  The two callbacks report, through
# lookup_error(), a key that was not found and a key that is ambiguous.
# NOTE(review): %synonyms is keyed by lowercased descriptions and $key
# is passed through unchanged -- confirm that lookup() matches keys the
# way this relies on.
sub syn_lookup {
    my $key = shift;

    use Array::Lookup;

    # This is just like a hash lookup of $synonyms{$key}.
    my $value = lookup(
        $key, \%synonyms,
        sub {
            my( $bad_key, $table ) = @_;
            my @choices = keys( %{$table} );
            lookup_error( $bad_key, \@choices, 'not found',
                "Unknown description for search field '%s'; Use one of:\n" );
        },
        sub {
            my( $bad_key, $table ) = @_;
            my @choices = keys( %{$table} );
            lookup_error( $bad_key, \@choices, 'is ambiguous',
                "Ambiguous description '%s' for search field %s; Use one of:\n" );
        },
    );
    return $value;
}

BEGIN {
    # A copy of the field_type = index number for the US-LOC Voyager database.
    %tv = (
        'Personal name'                    => 1,
        'Corporate name'                   => 2,
        'Conference name'                  => 3,
        'Title'                            => 4,
        'Title series'                     => 5,
        'Uniform title'                    => 6,
        'ISBN'                             => 7,
        'ISSN'                             => 8,
        'LCCN'                             => 9,
        'Local number (035 field)'         => 12,
        'Dewey classification'             => 13,
        'LC call number'                   => 16,
        'NLM call number'                  => 17,
        'Other call number'                => 20,
        'Subject heading'                  => 21,
        'MeSH subject heading'             => 25,
        'LC subject heading'               => 27,
        'Date of publication'              => 31,
        'Title -- key'                     => 33,
        'Title -- variant'                 => 41,
        'Title -- former'                  => 42,
        'Title -- abbreviated'             => 43,
        'Number -- national bibliography'  => 48,
        'Number -- government pub.'        => 50,
        'Number -- music publisher'        => 51,
        'Code -- language'                 => 54,
        'Code -- geographic area'          => 55,
        'Code -- institution'              => 56,
        'Name and title'                   => 57,
        'Name geographic'                  => 58,
        'Place of publication'             => 59,
        'CODEN'                            => 60,
        'Note'                             => 63,
        'Name'                             => 1002,
        'Author'                           => 1003,
        'Author (personal name)'           => 1004,
        'Author (corporate name)'          => 1005,
        'Standard identifier'              => 1007,
        'LC children\'s subject'           => 1008,
        'Subject (personal name)'          => 1009,
        'Any (keyword)'                    => 1016,
        'Name of publisher'                => 1018,
        'Cartographic math data'           => 1024,
        'Standard technical report number' => 1027,
        'Material type (245$h)'            => 1031,
        'Electronic location and access'   => 1032,
        'Dissertation note'                => 1056,
        'Subject (name)'                   => 1074,
        'Subject (title)'                  => 1078,
        'Subject (topical)'                => 1079,
        'Additional format note'           => 1107,
        'Location'                         => 1108,
        'Credits/performers'               => 1185,
        'Electronic access'                => 1209,
    );

    # Make another hash, from the above data, with the key being the
    # lowercase string and the value being the mixed case string.
    foreach my $desc ( keys( %tv ) ) {
        $synonyms{ lc( $desc ) } = $desc;
    }
}