Summary | Included libraries | Package variables | Synopsis | Description | General documentation | Methods |
WebCvs | Raw content |
use Bio::Biblio::IO;
my $io = new Bio::Biblio::IO (-format => 'medlinexml');
BEGIN | Code | |
_add_element | No description | Code |
_data2obj | No description | Code |
_debug_object_stack | No description | Code |
_eq_hash_elem | No description | Code |
_initialize | No description | Code |
_obj2obj | No description | Code |
_parse | No description | Code |
_process_citation | No description | Code |
handle_char | No description | Code |
handle_doc_end | No description | Code |
handle_doc_start | No description | Code |
handle_end | No description | Code |
handle_start | No description | Code |
next_bibref | No description | Code |
BEGIN | Top |
# set the version for version checking}
$VERSION = do { my @r = (q$$Revision: 1.5 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; $Revision = q$$Id: medlinexml.pm,v 1.5 2002/10/22 07:45:13 lapp Exp $;
_add_element | description | prev | next | Top |
my ($key, $element) = @_; my $peek = $ObjectStack[$#ObjectStack]; $$peek{$key} = [] unless $$peek{$key}; push (@{ $$peek{$key} }, (defined $element ? $element : pop @PCDataStack)); } # remove top of @PCDataStack and put it into top object at @ObjectStack under name $key}
_data2obj | description | prev | next | Top |
my ($key) = @_; my $peek = $ObjectStack[$#ObjectStack]; $$peek{$key} = pop @PCDataStack; } # remove top of @ObjectStack and put it into now-top at @ObjectStack under name $key}
_debug_object_stack | description | prev | next | Top |
my ($action, $element) = @_; if ($action =~ /^START/o) { $DEBUGSTACK{$element} = (@ObjectStack+0); } else { return if $element eq 'LastName'; print "Element $element starts on " . $DEBUGSTACK{$element} . 'and ends on ' . (@ObjectStack+0) . "\n" if $DEBUGSTACK{$element} != (@ObjectStack+0); } } 1; __END__}
_eq_hash_elem | description | prev | next | Top |
my ($rh, $key, $value) = @_; return (defined $$rh{$key} and $$rh{$key} eq $value); } #}
# --- only for debugging
#
use vars qw(%DEBUGSTACK); %DEBUGSTACK = ();
_initialize | description | prev | next | Top |
my ($self, @args) = @_; # make a hashtable from @args}
my %param = @args; @param { map { lc $_ } keys %param } = values %param; # lowercase keys
# copy all @args into this object (overwriting what may already be
# there) - changing '-key' into '_key', and making keys lowercase
my $new_key; foreach my $key (keys %param) { ($new_key = $key) =~ s/^-/_/; $self->{ lc $new_key } = $param { $key }; } # find the format for output - and put it into a global $Convert
# because it will be used by the event handler who knows nothing
# about this object
my $result = $self->{'_result'} || 'medline2ref'; $result = "\L$result"; # normalize capitalization to lower case
# a special case is 'raw' when no converting module is loaded
# and citations will be returned as a hashtable (the one which
# is created during parsing XML file/stream)
unless ($result eq 'raw') { # load module with output converter - as defined in $result
if (defined &Bio::Biblio::IO::_load_format_module ($result)) { $Convert = "Bio::Biblio::IO::$result"->new (@args); } } # create an instance of the XML parser
# (unless it is already there...)
$self->{'_xml_parser'} = new XML::Parser (Handlers => {Init =>\& handle_doc_start, Start =>\& handle_start, End =>\& handle_end, Char =>\& handle_char, Final =>\& handle_doc_end}) unless $self->{'_xml_parser'}; # if there is an argument '-callback' then start parsing at once -
# the registered event handlers will use 'callback' to report
# back after each citation
#
# we need to remember this situation also in a global variable
# because the event handler subroutines know nothing about this
# object (unfortunately)
if ($Callback = $self->{'_callback'}) { $self->_parse; } } # -----------------------------------------------------------------------------
_obj2obj | description | prev | next | Top |
my ($key) = @_; my $p = pop @ObjectStack; my $peek = $ObjectStack[$#ObjectStack]; $$peek{$key} = $p; } # check if a $key exists in a ref-hash $rh and if it is equal to $value}
_parse | description | prev | next | Top |
my ($self) = shift; if (defined $self->{'_file'}) { $self->{'_xml_parser'}->parsefile ($self->{'_file'}); } elsif (defined $self->{'_fh'}) { my $fh = $self->{'_fh'}; if (ref ($fh) and UNIVERSAL::isa ($fh, 'IO::Handler')) { $self->{'_xml_parser'}->parse ($fh); } else { my $data; $data .= $_ while <$fh>; $self->{'_xml_parser'}->parse ($data); } } elsif ($self->{'_data'}) { $self->{'_xml_parser'}->parse ($self->{'_data'}); } else { $self->throw ("XML source to be parsed is unknown. Should be given in the new()."); } # when parsing is done all citations have already been delivered}
# to the caller using her callbacks - and nothing to be stored
# here, or parser put all citations into global @Cittaions where
# we want to copy there into this instance - so any caller can
# start parsing other XML input without overwriting already read
# citations from the first parser
if (@Citations) { $self->{'_citations'} = []; foreach my $cit (@Citations) { push (@{ $self->{'_citations'} }, $cit); undef $cit; } undef @Citations; } } # ---------------------------------------------------------------------
#
# Here is an implementation of Bio::Biblio::IO methods
#
# ---------------------------------------------------------------------
# global variables used by the XML event handlers
# TBD: make them accessible at least ONLY from this module...
@Citations = (); $Callback = undef; $Convert = undef; @ObjectStack = (); # it has Hash-Ref elements
@PCDataStack = (); # it has String elements
_process_citation | description | prev | next | Top |
my ($citation) = @_; $citation = $Convert->convert ($citation) if defined $Convert; if ($Callback) { &$Callback ($citation); } else { push (@Citations, $citation); } } # add $element into an array named $key to the top object at @ObjectStack;}
# if $element is empty, take it from @PCDataStack
handle_char | description | prev | next | Top |
my ($expat, $str) = @_; # this may happen with whitespaces between tags;}
# but because I have not created an entry for data on the stack
# I can also ignore such data, can't I
return if $#PCDataStack < 0; $PCDataStack [$#PCDataStack] .= $str;
handle_doc_end | description | prev | next | Top |
undef @ObjectStack; undef @PCDataStack;}
handle_doc_start | description | prev | next | Top |
@Citations = (); @ObjectStack = (); @PCDataStack = ();}
handle_end | description | prev | next | Top |
my ($expat, $e) = @_; #}
# First I have to deal with those elements which are both PCDATA
# (and therefore they are on the pcdataStack) and which have an
# attribute list (therefore they are also known as a separate
# p-object on the objectStack.
#
if ($e eq 'QualifierName' or $e eq 'SubHeading') { my $p = pop @ObjectStack; # pSubHeading
$$p{'subHeading'} = pop @PCDataStack; &_add_element ('subHeadings', $p); # adding to pMeshHeadings
# &_debug_object_stack ("END", $e);
return; } elsif ($e eq 'GeneralNote') { my $p = pop @ObjectStack; # pGeneralNote
$$p{'generalNote'} = pop @PCDataStack; &_add_element ('generalNotes', $p); # adding to pMedlineCitation
# &_debug_object_stack ("END", $e);
return; } elsif ($e eq 'OtherID') { my $p = pop @ObjectStack; # pOtherID
$$p{'otherID'} = pop @PCDataStack; &_add_element ('otherIDs', $p); # adding to pMedlineCitation
# &_debug_object_stack ("END", $e);
return; } #
# both object and pcdata stacks elements mixed here together
# (the element names appear in the order of frequency in the
# medline data set)
#
if (exists $POP_DATA_AND_PEEK_OBJ{$e}) { &_data2obj ("\l$e"); } elsif (exists $POP_OBJ_AND_PEEK_OBJ{$e}) { &_obj2obj ("\l$e"); } elsif (exists $POP_AND_ADD_ELEMENT{$e}) { &_add_element ($POP_AND_ADD_ELEMENT{$e}, pop @ObjectStack); } elsif (exists $POP_AND_ADD_DATA_ELEMENT{$e}) { &_add_element ($POP_AND_ADD_DATA_ELEMENT{$e}); } elsif ($e eq 'Author' or $e eq 'Investigator') { my $pAuthor; my $p = pop @ObjectStack; # pPersonalName or pAuthor
if (&_eq_hash_elem ($p, 'type', 'PersonalName')) { $pAuthor = pop @ObjectStack; $$pAuthor{'personalName'} = $p; } else { $pAuthor = $p; } my $peek = $ObjectStack[$#ObjectStack]; # pMedlineCitation, pArticle or pBook
if (&_eq_hash_elem ($peek, 'type', 'MedlineCitation')) { &_add_element ('investigators', $pAuthor); } else { &_add_element ('authors', $pAuthor); } } elsif ($e eq 'MedlineJournalInfo') { &_obj2obj ('journalInfo'); } elsif ($e eq 'PMID') { my $peek = $ObjectStack[$#ObjectStack]; # pMedlineCitation, pReference or pDeleteCitation
if (&_eq_hash_elem ($peek, 'type', 'DeleteCitation')) { &_add_element ('PMIDs'); } else { $$peek{'PMID'} = pop @PCDataStack; } } elsif ($e eq 'MedlineID') { my $peek = $ObjectStack[$#ObjectStack]; # pMedlineCitation, pReference or pDeleteCitation
if (&_eq_hash_elem ($peek, 'type', 'DeleteCitation')) { &_add_element ('MedlineIDs'); } else { $$peek{'medlineID'} = pop @PCDataStack; } # } elsif ($e eq 'OtherAbstract') {
# my $pAbstract = pop @ObjectStack;
# my $pOtherAbstract = pop @ObjectStack;
# $$pOtherAbstract{'abstract'} = $pAbstract
# &_add_element ('otherAbstracts', $pOtherAbstract);
} elsif ($e eq 'Affiliation') { my $peek = $ObjectStack[$#ObjectStack]; if (&_eq_hash_elem ($peek, 'type', 'PersonalName')) { my $peek2 = $ObjectStack[$#ObjectStack - 1]; $$peek2{'affiliation'} = pop @PCDataStack; } else { $$peek{'affiliation'} = pop @PCDataStack; } } elsif ($e eq 'DeleteCitation') { pop @ObjectStack; ### warn ("'DeleteCitation' tag found. Not known what to do with it."); # silently ignored
} elsif ($e eq 'MedlineCitation') { #
# Here we finally have the whole citation ready.
#
&_process_citation (pop @ObjectStack); #
# ERROR: if we are here, there was an unexpected element
#
} elsif (exists $PCDATA_NAMES{$e}) { pop @PCDataStack; warn ("An unexpected element found: $e"); } # &_debug_object_stack ("END", $e);
} # what to do when we have the whole $citation ready
handle_start | description | prev | next | Top |
my ($expat, $e, %attrs) = @_; # &_debug_object_stack ("START", $e);}
#
# The #PCDATA elements which have an attribute list must
# be first here - because for them I create entries both on
# the @PCDataStack _and_ on @ObjectStack.
#
if ($e eq 'QualifierName' or $e eq 'SubHeading') { my %p = (); $p{'majorTopic'} = "Y" if $attrs{'MajorTopicYN'}; push (@ObjectStack,\% p); } if ($e eq 'GeneralNote') { my %p = (); $p{'owner'} = $attrs{'Owner'} if $attrs{'Owner'}; push (@ObjectStack,\% p); } if ($e eq 'OtherID') { my %p = (); $p{'source'} = $attrs{'Source'}; push (@ObjectStack,\% p); } #
# A special treatment is for attributes for personal name.
# Because there is no XML element 'PersonalName' I need to
# to put yet another object on @ObjectStack unless there is
# already one.
#
if ($e eq 'LastName' or $e eq 'FirstName' or $e eq 'MidleName' or $e eq 'Initials' or $e eq 'ForeName' or $e eq 'Suffix') { my $peek = $ObjectStack[$#ObjectStack]; push (@ObjectStack, {'type' => 'PersonalName'}) unless (ref $peek and &_eq_hash_elem ($peek, 'type', 'PersonalName')); } #
# Then we have #PCDATA elements without an attribute list.
# For them I create an entry on @PCDataStack.
#
if (exists $PCDATA_NAMES{$e}) { push (@PCDataStack, ''); #
# And finally, all non-PCDATA elements go to the objectStack
#
} elsif (exists $SIMPLE_TREATMENT{$e}) { push (@ObjectStack, {}); } elsif ($e eq 'PersonalNameSubject') { push (@ObjectStack, {'type' => 'PersonalName'}); } elsif ($e eq 'DescriptorName' or $e eq 'Descriptor') { if (&_eq_hash_elem (\%attrs, 'MajorTopicYN', "Y")) { my $peek = $ObjectStack[$#ObjectStack]; $$peek{'descriptorMajorTopic'} = "Y"; } } elsif ($e eq 'MedlineCitation' || $e eq 'NCBIArticle') { my %p = ( 'type' => 'MedlineCitation' ); $p{'owner'} = $attrs{'Owner'} if $attrs{'Owner'}; $p{'status'} = $attrs{'Status'} if $attrs{'Status'}; push (@ObjectStack,\% p); } elsif ($e eq 'GrantList') { if (&_eq_hash_elem (\%attrs, 'CompleteYN', "N")) { my $peek = $ObjectStack[$#ObjectStack]; $$peek{'grantListComplete'} = "N"; } } elsif ($e eq 'DataBankList') { if (&_eq_hash_elem (\%attrs, 'CompleteYN', "N")) { my $peek = $ObjectStack[$#ObjectStack]; $$peek{'dataBankListComplete'} = "N"; } } elsif ($e eq 'AuthorList') { if (&_eq_hash_elem (\%attrs, 'CompleteYN', "N")) { my $peek = $ObjectStack[$#ObjectStack]; $$peek{'authorListComplete'} = "N"; } } elsif ($e eq 'OtherAbstract') { my %p = (); $p{'type'} = $attrs{'Type'} if $attrs{'Type'}; push (@ObjectStack,\% p); # push (@ObjectStack, { 'type' => 'Abstract' });
}
next_bibref | description | prev | next | Top |
my ($self) = @_; $self->throw ("Method 'next_bibref' should not be called when a '-callback' argument given.") if $self->{'_callback'}; # parse the whole input into memory (global @Citations)}
# and then copy it into this object
$self->_parse unless $self->{'_citations'}; # return the next citation (and forget it here)
shift (@{ $self->{'_citations'} }); } # ---------------------------------------------------------------------
#
# Here are the event handlers (they do the real job!)
#
# Note that these methods do not know anything about the object they
# are part of - they are called as subroutines. not as methods.
# It also means that they need to use global variables to store and
# exchnage intermediate results.
#
# ---------------------------------------------------------------------
#
# This is a list of #PCDATA elements.
#
%PCDATA_NAMES = ( 'AbstractText' => 1, 'AccessionNumber' => 1, 'Acronym' => 1, 'Affiliation' => 1, 'Agency' => 1, 'ArticleTitle' => 1, 'CASRegistryNumber' => 1, 'CitationSubset' => 1, 'Coden' => 1, 'CollectionTitle' => 1, 'CollectiveName' => 1, 'CopyrightInformation' => 1, 'Country' => 1, 'DataBankName' => 1, 'DateOfElectronicPublication' => 1, 'Day' => 1, 'Descriptor' => 1, 'DescriptorName' => 1, 'EndPage' => 1, 'FirstName' => 1, 'ForeName' => 1, 'GeneralNote' => 1, 'GeneSymbol' => 1, 'GrantID' => 1, 'Hour' => 1, 'ISOAbbreviation' => 1, 'ISSN' => 1, 'Initials' => 1, 'Issue' => 1, 'Keyword' => 1, 'Language' => 1, 'LastName' => 1, 'MedlineCode' => 1, 'MedlineDate' => 1, 'MedlineID' => 1, 'MedlinePgn' => 1, 'MedlineTA' => 1, 'MiddleName' => 1, 'Minute' => 1, 'Month' => 1, 'NameOfSubstance' => 1, 'NlmUniqueID' => 1, 'Note' => 1, 'NumberOfReferences' => 1, 'OtherID' => 1, 'PMID' => 1, 'PublicationType' => 1, 'Publisher' => 1, 'QualifierName' => 1, 'RefSource' => 1, 'RegistryNumber' => 1, 'Season' => 1, 'Second' => 1, 'SpaceFlightMission' => 1, 'StartPage' => 1, 'SubHeading' => 1, 'Suffix' => 1, 'Title' => 1, 'VernacularTitle' => 1, 'Volume' => 1, 'Year' => 1, ); %SIMPLE_TREATMENT = ( 'MeshHeading' => 1, 'Author' => 1, 'Article' => 1, 'Book' => 1, 'Investigator' => 1, 'Chemical' => 1, 'Pagination' => 1, 'MedlineJournalInfo' => 1, 'JournalIssue' => 1, 'Journal' => 1, 'DateCreated' => 1, 'DateCompleted' => 1, 'DateRevised' => 1, 'PubDate' => 1, 'Abstract' => 1, 'Grant' => 1, 'CommentsCorrections' => 1, 'CommentOn' => 1, 'CommentIn' => 1, 'ErratumFor' => 1, 'ErratumIn' => 1, 'OriginalReportIn' => 1, 'RepublishedFrom' => 1, 'RepublishedIn' => 1, 'RetractionOf' => 1, 'RetractionIn' => 1, 'SummaryForPatientsIn' => 1, 'UpdateIn' => 1, 'UpdateOf' => 1, 'DataBank' => 1, 'KeywordList' => 1, 'DeleteCitation' => 1, ); %POP_DATA_AND_PEEK_OBJ = ( 'Descriptor' => 1, 'DescriptorName' => 1, 'Year' => 1, 'Month' => 1, 'Day' => 1, 'LastName' => 1, 'Initials' => 1, 'FirstName' => 1, 'ForeName' => 1, 'NameOfSubstance' => 1, 'RegistryNumber' => 1, 'CASRegistryNumber' => 1, 'MiddleName' => 1, 'NlmUniqueID' => 1, 'MedlineTA' => 1, 'MedlinePgn' => 1, 'MedlineCode' => 1, 'Country' => 1, 'ISSN' => 1, 'ArticleTitle' => 1, 'Issue' => 1, 'AbstractText' => 1, 'VernacularTitle' => 1, 'GrantID' => 1, 'Agency' => 1, 'Acronym' => 1, 'MedlineDate' => 1, 'NumberOfReferences' => 1, 'RefSource' => 1, 'DataBankName' => 1, 'CopyrightInformation' => 1, 'Suffix' => 1, 'Note' => 1, 'CollectiveName' => 1, 'Hour' => 1, 'Minute' => 1, 'Second' => 1, 'Season' => 1, 'Coden' => 1, 'ISOAbbreviation' => 1, 'Publisher' => 1, 'CollectionTitle' => 1, 'DateOfElectronicPublication' => 1, 'StartPage' => 1, 'EndPage' => 1, 'Volume' => 1, 'Title' => 1, ); %POP_OBJ_AND_PEEK_OBJ = ( 'Pagination' => 1, 'JournalIssue' => 1, 'Journal' => 1, 'DateCreated' => 1, 'Article' => 1, 'DateCompleted' => 1, 'DateRevised' => 1, 'CommentsCorrections' => 1, 'Book' => 1, 'PubDate' => 1, 'Abstract' => 1, ); %POP_AND_ADD_DATA_ELEMENT = ( 'Keyword' => 'keywords', 'PublicationType' => 'publicationTypes', 'CitationSubset' => 'citationSubsets', 'Language' => 'languages', 'AccessionNumber' => 'accessionNumbers', 'GeneSymbol' => 'geneSymbols', 'SpaceFlightMission' => 'spaceFlightMissions', ); %POP_AND_ADD_ELEMENT = ( 'OtherAbstract' => 'otherAbstracts', 'Chemical' => 'chemicals', 'KeywordList' => 'keywordLists', 'Grant' => 'grants', 'UpdateIn' => 'updateIns', 'CommentOn' => 'commentOns', 'CommentIn' => 'commentIns', 'DataBank' => 'dataBanks', 'PersonalNameSubject' => 'personalNameSubjects', 'ErratumFor' => 'erratumFors', 'ErratumIn' => 'erratumIns', 'RepublishedFrom' => 'republishedFroms', 'RepublishedIn' => 'republishedIns', 'RetractionOf' => 'retractionOfs', 'RetractionIn' => 'retractionIns', 'UpdateOf' => 'updateOfs', 'OriginalReportIn' => 'originalReportIns', 'SummaryForPatientsIn' => 'summaryForPatientsIns', 'MeshHeading' => 'meshHeadings', );
FEEDBACK | Top |
Mailing Lists | Top |
bioperl-l@bioperl.org - General discussion
http://bioperl.org/MailList.shtml - About the mailing lists
Reporting Bugs | Top |
bioperl-bugs@bioperl.org
http://bugzilla.bioperl.org/
AUTHOR | Top |
COPYRIGHT | Top |
DISCLAIMER | Top |
APPENDIX | Top |
VERSION and Revision | Top |
Usage : print $Bio::Biblio::IO::medlinexml::VERSION;
print $Bio::Biblio::IO::medlinexml::Revision;