package ERC; require Exporter; # use Exporter (); use vars qw(@ISA @EXPORT @EXPORT_OK); @ISA = qw(Exporter); @EXPORT = qw(); @EXPORT_OK = qw( erc_elem erc_start erc_support erc_set_start erc_valencode erc_debug date2temper test_temper ); # ERC (Electronic Resource Citation) formatting routines (Perl module) # Alpha version -- for demonstration only. # # Store as 'ERC.pm' and type 'perldoc ERC' for documentation. # # Author: John A. Kunze, jak@ucop.edu, California Digital Library # # --------- # Copyright (c) 2003 UC Regents # # Permission to use, copy, modify, distribute, and sell this software and # its documentation for any purpose is hereby granted without fee, provided # that (i) the above copyright notices and this permission notice appear in # all copies of the software and related documentation, and (ii) the names # of the UC Regents and the University of California are not used in any # advertising or publicity relating to the software without the specific, # prior written permission of the University of California. # # THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, # EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY # WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. # # IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE FOR ANY # SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, # OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, # WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY # THEORY OF LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE # OR PERFORMANCE OF THIS SOFTWARE. # --------- my $debug = 0; # default is off; to set use erc_debug(1) # first four args required, the rest optional sub erc_start { my( $creator, $title, $date, $identifier, $description, $because )=@_; return "erc:\n" . erc_elem("who: $creator") . erc_elem("what: $title") . erc_elem("when: $date") . erc_elem("where: $identifier") . # xxx ? rule that how label must be present if there's a why? erc_elem(! defined($description) ? (defined($because) ? "how: (:unav) Unavailable" : "") : "how: $description") . erc_elem(! defined($because) ? "" : "why: $because") ; } sub erc_support { my( $who, $what, $when, $where )=@_; return "erc-support:\n" . erc_elem("who: $who") . erc_elem("what: $what") . erc_elem("when: $when") . erc_elem("where: $where") ; } sub erc_set_start { my( $who, $what, $when, $where, $howmany, $starting, $total )=@_; # xxx these should be wrapped in erc_elem() calls return "THUMP-Status: 0.1 200 OK\n" . "\n" . "|set: $who | $what | $when\n" . " | $where\n" . "here: $howmany | $starting | $total\n" . "\n" ; } # W3C-NOTE: YYYY-MM-DDThh:mm:ss.sTZD, where TZD = Z or +hh:mm or -hh:mm sub date2temper { my( $s_orig )=@_; my $s = lc($s_orig); $s =~ s/\s+//g; # squeeze all whitespace out $s =~ s/;/,/g; # change semi-colons to commas # Quick, crude heuristic check for common language constructs. # $s =~ /nodate/ || $s =~ /dateunknown/ || $s =~ /unknowndate/ || $s =~ /n\.d\./ and return "(:unkn) unknown"; # Clean up odd stuff, like final a period. $s =~ s/\.$//; # Check if squeezed date is already in TEMPER format and return, # with appropriate type tag, "[:]" (equivalent to [:td]), in # front if so. # XXX no timezone support (or definition) in TEMPER yet. # XXX limited support for lists or ranges # XXX no support for "ca." or "circa" # XXX no support for doubt: 1934? or 1926?-1934? # my $slen = length($s); $debug and print "# date2temper: s_orig=|$s_orig| s=|$s|\n"; $s =~ /^\d+$/ and $slen == 4 || # a year only $slen == 8 || # a particular day $slen == 14 # a particular second and return "[:] $s" ; # xxx kludge to do quick check for TEMPER year ranges and lists $s =~ /^\d\d\d\d([-,]\d\d\d\d)+$/ and return "[:] $s"; # Check a few obvious formats and convert to [:] if possible. # For now, look for evidence of the W3C datetime format (the # profile of ISO 8601 adopted by Dublin Core). # # Start by isolating any time zone. my $timezone; $s =~ s/Z$//i and $timezone = "0000" || 1 # force true or $s =~ s/([-+]\d\d):(\d\d)// and $timezone = $1 . $2 ; # Now try to isolate a time of day. my ($hours, $mins, $secs, $time); if ($s =~ s/T(\d\d):(\d\d)(:\d\d(\.\d+)?)?$//i) { $hours = $1; $mins = $2; $secs = $3; $fracsecs = $4; defined($secs) and $secs =~ s/^://; $time = $hours . $mins . (defined($secs) ? $secs : "00"); defined($timezone) and $time .= "/" . $timezone; # XXX temporarily use "/" } # By this point, $time is either set or undefined. my ($year, $month, $day, $date); # Look for month/day/year or month/year format if ($s =~ s,^(\d+)/(\d+)(/(\d+))?,,) { ($month, $day, $year) = ($1, $2, $4); ! defined($year) and $year = $day || 1 and $day = "00" || 1 ; # xxx should check length of $day and $month to see if == 4 # which means it was really a year length($year) == 2 and $year = ($year > 50 ? "20" : "19") . $year; length($month) == 1 and $month = "0" . $month; length($day) == 1 and $day = "0" . $day; $date = $year . $month . $day; length($s) == 0 and # no remainder (good result) return "[:] $date" or # remainder (something's wrong) return "$s_orig" . ($debug ? "remainder |$s|" : "") ; } # xxx note: no range checking has been done # Look for yyyy-mm-dd format. $s =~ s/^(\d\d\d\d)// and $year = $1; $s =~ s/^-(\d\d)// and $month = $1 || 1 # force true or $s =~ s/^-(\d)// and # catch non-standard 1-digit month $month = "0" . $1 ; $s =~ s/^-(\d\d)// and $day = $1 || 1 # force true or $s =~ s/^-(\d)// and # catch non-standard 1-digit day $day = "0" . $1 || 1 or $day = "00" || 1 ; # xxx note: no range checking has been done # By this point, if $year is undefined, we don't know this format. ! defined($year) and return $s_orig; # If by this point we have anything left in $s, there's a good # chance we're off course. # Now rebuild $date from scratch. $date = $year; defined($month) and # requires $day to be defined $date .= $month . $day; defined($time) and $date .= $time; length($s) == 0 and # no remainder (good result) return "[:] $date" or # remainder (partial result) return $s_orig #return "s=$date (remainder '$s' grokking $s_orig)" ; } # XXX belongs in separate test script sub test_temper { my @before_after = ( '2003-04-12T16:58:01.345Z', '20030412165801.345/0000', '2003-04-12T16:58:01', '20030412165801', '2003-04-12T16:58Z', '20030412165800/0000', '2003-04-12', '20030412', '2003-1-9', '20030109', '2003-04-12T00:00-01:00', '20030412000000/-0100', ' 1999 12 13 ', '19991213', '1/4/45', '19450104', ); my ($before, $after, $expected); while (@before_after) { $before = shift @before_after; $expected = shift @before_after; #print date2temper($before), "\n"; $after = date2temper($before); $after =~ s/\[:\] //; $after ne $expected and print "test failed, $before:\n\twanted\t|$expected|", "\n\tgot\t|$after|\n"; } } sub erc_debug { my( $n )=@_; $debug = $n; return 1; } sub erc_valencode { my( $valarrayref )=@_; my ($retval, $v) = ("", ""); for $v ( @{$valarrayref} ) { $v = entity_to_txt($v); $v =~ s/\|/%!/g; # encode any pre-existing separators $retval .= "$v | "; } $retval = substr($retval, 0, -3); # remove last separator return $retval; } use Text::Wrap; my $maxcols = 72; $Text::Wrap::columns = $maxcols; $Text::Wrap::huge = 'overflow'; # don't break long values # Make an ERC element. # # XXX needed any more? my $except_re = qr/([\001-\037\177-\377])/; sub erc_elem { my( $s )=@_; $s eq "" and # return an empty string untouched (add no \n) return $s; $s =~ s/\n//g; # replace every \n with " " -- this case is # not expected, but would screw things up $s =~ s/%/%%/g; # to preserve literal %, double it # XXX must be decoded by receiver # xxx ERC-encode ERC structural delims ? # XXX why do 4 bytes (instead of 2) show up in wget?? # # %-encode any chars that need it # $s =~ s/$except_re/ "%" . join("", unpack("H2", $1)) /ge; # fold lines longer than 72 chars and wrap with one tab's # indention (assume tabwidth=8, line length of 64=72-8 # wrap: initial tab = "", subsequent tab = "\t" $s = wrap("", "\t", $s); return $s . "\n"; # append newline to end element } use HTML::Entities; use Encode; sub entity_to_txt { my( $s )=@_; $s !~ /&\S+;/ and return $s; $s =~ s|"|"|gs; return encode("UTF8", decode_entities($s)); } 1; __END__ =head1 NAME ERC - Electronic Resource Citation tools, version 0.1 =head1 SYNOPSIS use ERC; $elem = erc_elem("$label: $string"); # Wraps text to 72 columns, appends newline to end the value. # The $label can be "who", "what", "when", "where", etc, or can # begin a new erc "story", such as "erc-support:\nPermanent". $erc = erc_start($creator, $title, $date, $identifier, [ $description, [ $because ] ]); # ... which is roughly short-hand for... $erc = "erc:\n" . erc_elem("who: $creator") . erc_elem("what: $title") . erc_elem("when: $date") . erc_elem("where: $identifier") . erc_elem("how: $description") . # optional erc_elem("why: $because"); # optional $erc .= "\n"; # two "\n" characters in a row terminate an ERC record $erc .= erc_support($who, $what, $when, $where); # Appends to an ERC an expression of the commitment made to the # the object: who made it, what it was, when it was made, and # where a fuller explanation of the commitment may be found. $set_intro = erc_set_start($who, $what, $when, $where, $howmany, $starting, $total); # Creates an HTTP header, blank line, special set descriptor record, # and another blank line. This precedes a set of ERC records (e.g., # 1 record, starting at record 1, from a result set of 1 record). =head1 DESCRIPTION The erc_elem function returns a plain text string representing an ERC element after doing some %-encoding of a line of data (in label: value format), and tries to perform some simple line wrapping for convenient printing and screen viewing. The erc_start function returns a text string representing the beginning of an ERC record. It is a short-hand way of calling erc_elem with the first four to six elements of an ERC (values for the first four are required). There is no "erc_end" function. Instead, the caller is responsible (after appending any number of extra elements) for properly terminating the ERC record, which ends when two "\n" (newline) characters appear in a row. Usually, this means appending a single newline since each element ends with a newline. The erc_support function returns a string similar in form to erc_start but telling a different story. While the first segment of an ERC tells the story of an expression of the object, this segment (often appended to the first) tells the story of the support commitment made to it: who made the commitment, what the nature of the commitment was, when it was made, and where a fuller explanation of the commitment may be found. The erc_set_start function returns a text string representing the preamble that normally introduces a set of one or more ERC records. The string has two parts. First is an HTTP header identifying the type of text stream that follows and a blank line; this part is meant to terminate a block of HTTP headers. Second is a special record (also ending with a blank line) that describes the particular set ERC records that follows it. The preamble identifies who created the set, what its title (preferably human-oriented) is, when it was created, and where an automated process can access the set. The last line of the preamble is a three-part element named "here". Its parts tell the recipient that here one can expect to find, respectively, $howmany records, starting at $starting, from a set of $total records. For example, 1, 1, and 1 for a one-record set, such as would be produced by a known-item search. =head1 REFERENCES A Metadata Kernel for Electronic Permanence (PDF) http://jodi.ecs.soton.ac.uk/Articles/v02/i02/Kunze/ =head1 HISTORY This is an alpha version of ERC tools. It is written in Perl. =head1 BUGS This API needs more functions, but which? Comments to jak@ucop.edu. =head1 AUTHOR John A. Kunze Ejak@ucop.eduE. =cut