This object implements an NCBI Blast XML parser.
There is one additional initialization flag beyond the SearchIO defaults:
the -tempfile flag. If specified as true, the parser
will write each report out to a temporary filehandle rather than
holding the entire report as a string in memory. The report has to be
buffered in the first place because NCBI reports have an unnecessary <?xml
version="1.0"?> at the beginning of each report, and RPS-BLAST reports
have an additional unnecessary RPS-BLAST tag at the top of each report.
So we have currently implemented a workaround that preparses the
file (yes, it makes the process slower, but it works).
# Compile-time setup of the lookup tables for NCBI BLAST XML element names,
# plus the optional load of Time::HiRes.
#
# %MODEMAP : XML container element name => lower-case name
#            (presumably the event type used by the element handlers
#            elsewhere in this file -- confirm there).
# %MAPPING : XML leaf element name => storage key.  A value is either
#              * a plain string key, or
#              * a single-pair hash ref { GROUP => NAME }, i.e. the value
#                is filed under NAME inside the GROUP sub-table.
BEGIN {
    %MODEMAP = (
        'BlastOutput' => 'result',
        'Hit'         => 'hit',
        'Hsp'         => 'hsp',
    );

    %MAPPING = (
        # --- HSP level ---
        'Hsp_bit-score'   => 'HSP-bits',
        'Hsp_score'       => 'HSP-score',
        'Hsp_evalue'      => 'HSP-evalue',
        'Hsp_query-from'  => 'HSP-query_start',
        'Hsp_query-to'    => 'HSP-query_end',
        'Hsp_hit-from'    => 'HSP-hit_start',
        'Hsp_hit-to'      => 'HSP-hit_end',
        'Hsp_positive'    => 'HSP-conserved',
        'Hsp_identity'    => 'HSP-identical',
        'Hsp_gaps'        => 'HSP-gaps',
        'Hsp_hitgaps'     => 'HSP-hit_gaps',
        'Hsp_querygaps'   => 'HSP-query_gaps',
        'Hsp_qseq'        => 'HSP-query_seq',
        'Hsp_hseq'        => 'HSP-hit_seq',
        'Hsp_midline'     => 'HSP-homology_seq',
        'Hsp_align-len'   => 'HSP-hsp_length',
        'Hsp_query-frame' => 'HSP-query_frame',
        'Hsp_hit-frame'   => 'HSP-hit_frame',
        'Hsp_num'         => 'HSP-order',

        # "from" is the start coordinate and "to" is the end coordinate,
        # matching the query/hit from/to entries above.
        'Hsp_pattern-from' => 'patternstart',
        'Hsp_pattern-to'   => 'patternend',
        'Hsp_density'      => 'hspdensity',

        # --- Hit level ---
        'Hit_id'             => 'HIT-name',
        'Hit_len'            => 'HIT-length',
        'Hit_accession'      => 'HIT-accession',
        'Hit_def'            => 'HIT-description',
        'Hit_num'            => 'HIT-order',
        'Iteration_iter-num' => 'HIT-iteration',
        'Iteration_stat'     => 'HIT-iteration_statistic',

        # --- Result (whole report) level ---
        'BlastOutput_program'   => 'RESULT-algorithm_name',
        'BlastOutput_version'   => 'RESULT-algorithm_version',
        'BlastOutput_query-def' => 'RESULT-query_description',
        'BlastOutput_query-len' => 'RESULT-query_length',
        'BlastOutput_db'        => 'RESULT-database_name',
        'BlastOutput_reference' => 'RESULT-program_reference',
        'BlastOutput_query-ID'  => 'runid',

        # Search parameters, grouped under 'RESULT-parameters'.
        'Parameters_matrix'      => { 'RESULT-parameters' => 'matrix' },
        'Parameters_expect'      => { 'RESULT-parameters' => 'expect' },
        'Parameters_include'     => { 'RESULT-parameters' => 'include' },
        'Parameters_sc-match'    => { 'RESULT-parameters' => 'match' },
        'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
        'Parameters_gap-open'    => { 'RESULT-parameters' => 'gapopen' },
        'Parameters_gap-extend'  => { 'RESULT-parameters' => 'gapext' },
        'Parameters_filter'      => { 'RESULT-parameters' => 'filter' },

        # Database size and search statistics; the hash-ref entries are
        # grouped under 'RESULT-statistics'.
        'Statistics_db-num'    => 'RESULT-database_entries',
        'Statistics_db-len'    => 'RESULT-database_letters',
        'Statistics_hsp-len'   => { 'RESULT-statistics' => 'hsplength' },
        'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace' },
        'Statistics_kappa'     => { 'RESULT-statistics' => 'kappa' },
        'Statistics_lambda'    => { 'RESULT-statistics' => 'lambda' },
        'Statistics_entropy'   => { 'RESULT-statistics' => 'entropy' },
    );

    # Time::HiRes is optional.  If it cannot be loaded, force $DEBUG off so
    # that the $DEBUG-guarded timing code in next_result never calls it.
    eval { require Time::HiRes };
    if ($@) { $DEBUG = 0; }
}
# next_result
#
# Reads the next report from the input stream and hands it to the XML parser
# stored in $self->{'_xmlparser'}, with $self as the handler.
#
# Because several reports may be concatenated in one stream, the input is
# first split: lines are collected until a second '<?xml version' line is
# seen, which is pushed back for the next call.  A line starting with
# 'RPS-BLAST' is dropped and recorded as $self->{'_type'} = 'RPSBLAST'.
# The collected text is held in memory, or in a temporary file when
# $self->use_tempfile is true.
#
# Returns : whatever the XML parser's parse() returns for the report;
#           undef when there is no more input or when parsing fails
#           (a parse failure is reported through $self->warn).
# Throws  : via $self->throw if the temporary file cannot be created or
#           rewound.
sub next_result {
    my ($self) = @_;

    my $data      = '';
    my $firstline = 1;

    my $tfh;
    if ( $self->use_tempfile ) {
        $tfh = IO::File->new_tmpfile
            or $self->throw("Unable to open temp file: $!");
        $tfh->autoflush(1);
    }

    my $okaytoprocess;

    # A lexical line variable is used on purpose: assigning to the global
    # $_ here would clobber (or, if it is aliased to a read-only value,
    # die on) the caller's $_.
    while ( defined( my $line = $self->_readline ) ) {
        if ( $line =~ /^RPS-BLAST/i ) {
            $self->{'_type'} = 'RPSBLAST';
            next;
        }

        # A second XML declaration marks the start of the next report.
        if ( $line =~ /^<\?xml version/ && !$firstline ) {
            $self->_pushback($line);
            last;
        }

        # NOTE(review): entities are decoded before the text reaches the
        # XML parser, so escaped markup characters (e.g. &lt; or &amp;)
        # become literal ones.  Presumably a deliberate workaround for
        # entities the parser would otherwise reject -- confirm.
        $line = decode_entities($line);
        $okaytoprocess = 1;

        if ( defined $tfh ) {
            print {$tfh} $line;
        }
        else {
            $data .= $line;
        }
        $firstline = 0;
    }

    # Nothing was read: end of input.  'return undef' (rather than a bare
    # return) is kept so list-context callers still receive one element.
    return undef unless $okaytoprocess;

    my %parser_args = ( 'Handler' => $self );
    if ( defined $tfh ) {
        seek( $tfh, 0, 0 )
            or $self->throw("Unable to rewind temp file: $!");
        $parser_args{'Source'} = { 'ByteStream' => $tfh };
    }
    else {
        $parser_args{'Source'} = { 'String' => $data };
    }

    my $result;
    my $starttime;
    if ($DEBUG) { $starttime = [ Time::HiRes::gettimeofday() ]; }

    eval {
        $result = $self->{'_xmlparser'}->parse(%parser_args);
        $self->{'_result_count'}++;    # only counted when parse() succeeds
    };
    if ($@) {
        $self->warn("error in parsing a report:\n $@");
        $result = undef;
    }

    if ($DEBUG) {
        $self->debug(
            sprintf( "parsing took %f seconds\n",
                Time::HiRes::tv_interval($starttime) )
        );
    }
    return $result;
}
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _