/i;
if(not $found_table) {
if($ref_skip) {
# Replacing an reference data with special HTML.
$ref_skip = 0 if /^\s+$/;
}
if($getNote) {
## SAC: created this test since we are no longer reading from STDIN.
$out_aref ? push(@$out_aref, $_) : print $_;
$getNote = 0 if m/^\s+$/;
} elsif( m/(.*), Up \d.*/ or /Date: +(.+)/ or /Start: +(.+?) +End:/ ) {
### Network BLAST reports from NCBI are time stamped as follows:
#Fri Apr 18 15:55:41 EDT 1997, Up 1 day, 19 mins, 1 user, load: 19.54, 19.13, 17.77
my $date = "BLASTed on: $1\n";
$out_aref ? push(@$out_aref, $date) : print $date;
} elsif ( /^(<\w+>)?(T?BLAST[NPX])\s+(.*?)/ ) {
$found_data = 1;
local($^W) = 0;
s#(\S+)\s+(.*)#
Program: $1 $2 $3
#o;
$out_aref ? push(@$out_aref, $_) : print $_;
$skip = 1;
$prog = $2;
if($prog =~ /BLASTN/) {
## Prevent the error at Entrez when you ask for a nucl
## entry with a protein GI number.
$$gi_link = $DbUrl{'gb_n'}; # nucleotide
} else {
$$gi_link = $DbUrl{'gb_p'}; # protein
}
} elsif ( m/^Query=/ ) {
# Keeping the "Query=" format to keep it parsable by Blast.pm
# (after stripping HTML).
s#Query= *(.*)#
$1 \nQuery= $1#o;
$out_aref ? push(@$out_aref, $_) : print $_;
$skip = 1;
} elsif ( /Reference:/) {
$ref_skip = 1;
} elsif ( /^Database:/ ) {
&_markup_database(\$_);
$out_aref ? push(@$out_aref, $_) : print $_;
if ( /non-redundant genbank/i and $prog =~ /TBLAST[NX]/i) {
$getGenBankAlert = 1;
}
$skip = 1;
} elsif ( /sequences;/ ) {
$str = "$_
";
$out_aref ? push(@$out_aref, $str) : print $str;
} elsif ( /^\s+\(\d+ letters\)\s+/ ) {
$str = "
    $_";
$out_aref ? push(@$out_aref, $str) : print $str;
} elsif ( /^(WARNING|NOTICE):/i ) {
s#WARNING: *(.*)#
$1: $1#o;
$out_aref ? push(@$out_aref, $_) : print $_;
$getNote = 1;
} elsif ( /Score +E\s*$/ or /Probability\s*$/ ) {
# Put the last HTML-formatted lines before the main body of report.
$found_table = 1;
$skip = 0;
$out_aref ? push(@$out_aref, $refs) : print $refs;
if($getGenBankAlert) {
$str = &_genbank_alert;
$out_aref ? push(@$out_aref, $str) : print $str;
}
$str = "\n
";
$out_aref ? push(@$out_aref, $str) : print $str;
}
} else {
&_markup_report(\$_);
}
if ($found_data and not($skip or $ref_skip)) {
$out_aref ? push(@$out_aref, $_) : print $_;
}
1;
} # end sub {}
}
=head2 _set_markup_data
Usage : n/a; utility method used by get_html_func()
Purpose : Sets various hashes and regexps used for adding HTML
: to raw Blast output.
Returns : n/a
Comments : These items need be set only once.
See Also : L<get_html_func>()
=cut
#-------------------
sub _set_markup_data {
#-------------------
    # Fills the package-level URL tables and the regexp fragments that the
    # markup substitutions interpolate. Takes no arguments; returns 1.

    # URL tables, both supplied by the $BioWWW object.
    %DbUrl  = $BioWWW->search_url('all');
    %SGDUrl = $BioWWW->sgd_url('all');

    # Regexp fragments, stored as plain strings for later interpolation.
    ($Signif, $Int, $Descrip, $Acc, $Pir_acc, $Word) = (
        '[\de.-]{3,}',          # a P-value or Expect value
        ' *\d\d*',              # an integer (leading blanks allowed)
        ' +.* {2,}?',           # a description line
        '[A-Z][\d.]+',          # GB/EMBL/DDBJ/SP accession number
        '[A-Z][A-Z0-9]{5,}',    # PIR accession number
        '[\w_.]+',              # a word; the dot allows for a version suffix
    );

    # Record that the set-up has been performed (also the return value).
    $_set_markup = 1;
}
=head2 _markup_database
Usage : n/a; utility method used by get_html_func()
Purpose : Converts a cryptic database ID into a readable name.
Returns : n/a
Comments : This is used for converting local database IDs into
: understandable terms. At present, it only recognizes
: databases used locally at SGD.
See Also : L<get_html_func>()
=cut
#---------------------
sub _markup_database {
#---------------------
    # Expands local database IDs found in a "Database:" line into readable
    # names and normalizes the spacing after the "Database:" label.
    #
    # Argument : reference to the scalar holding the line (modified in place).
    # Returns  : the rewritten line (value of the final assignment).
    my $line_ref = shift;
    my $text     = $$line_ref;

    # (local ID => readable name), applied in this order; only the first
    # occurrence of each ID is replaced.
    my @readable_name_for = (
        [ 'YeastN'     => 'S. cerevisiae GenBank Data Set; ' ],
        [ 'YeastP'     => 'Non-Redundant S. cerevisiae Protein Data Set; ' ],
        [ 'genoSC'     => 'Complete DNA Sequence for the S. cerevisiae Genome; ' ],
        [ 'YeastORF-P' => 'Translation of all Standard S.c. ORFs; ' ],
        [ 'YeastORF-N' => 'Coding Sequence of all Standard S.c. ORFs; ' ],
    );

    for my $entry (@readable_name_for) {
        my ($db_id, $db_name) = @$entry;
        $text =~ s#\Q$db_id\E#$db_name#;
    }

    # Collapse any run of blanks after the label to a single blank.
    $text =~ s#Database: *(.*)#Database: $1#;

    $$line_ref = $text;
}
=head2 _markup_report
Usage : n/a; utility function used by get_html_func()
Purpose : Adds HTML links to aid navigation of raw Blast output.
Returns : n/a
Comments : HTML-formatting is dependent on the Blast server that
: provided the Blast report. Currently, this function can handle reports
: produced by NCBI and SGD. Feel free to modify this function
: to accommodate reports produced by other servers/sites.
:
: This function is simply a collection of substitution regexps
: that recognize and modify the relevant lines of the Blast report.
: All non-header lines of the report are passed through this function,
: only the ones that match will get modified.
:
: The general scheme for adding links is as follows:
: (Some of the SGD markups do not follow this scheme precisely
: but this is the general trend.)
:
: For description lines in the summary table at the top of report:
:
: DB:SEQUENCE_ID DESCRIPTION SIGNIF_VAL
: DB = links to the indicated database (if not Gen/Embl/Ddbj).
: SEQUENCE_ID = links to GenBank entry for the sequence.
: SIGNIF_VAL = internal link to relevant alignment section.
:
: For the alignment sections in the body of the report:
:
: DB:SEQUENCE_ID (Back | Top) DESCRIPTION
: DB = links to the indicated database (if not Gen/Embl/Ddbj).
: SEQUENCE_ID = links to GenBank entry for the sequence.
: SIGNIF_VAL = internal link to alignment section.
: Back = internal link to description line in summary section.
: Top = internal link to top of page.
:
: 'DB' links are created for PDB, PIR, and SwissProt sequences.
:
: RE-PARSING HTML-FORMATTED REPORTS:
: ----------------------------------
: HTML-formatted reports generated by this module, as well as reports
: obtained from the NCBI servers, should be parsable
: by Bio::Tools::Blast.pm. Parsing HTML-formatted reports is
: slow, however, since the HTML must be removed prior to parsing.
: Parsing HTML-formatted reports is dependent on the specific structure
: of the HTML and is generally not recommended.
:
: Note that since URLs can change without notice, links will need updating.
: The links are obtained from Bio::Tools::WWW.pm; updating that module
: will update this as well.
:
Bugs : Some links to external databases are incorrect
: (in particular, for 'bbs' and 'prf' databases on NCBI Blast reports).
: Some links may fail as a result of the dynamic nature of the web.
: Hypertext links are not added to hits without database ids.
See Also : L, B, L()
=cut
#--------------------
sub _markup_report {
#--------------------
# Rewrites a single line of a Blast report in place.
#
# Argument : reference to the scalar holding the line. The referenced scalar
#            is overwritten with the (possibly modified) text on exit.
# Returns  : the final line text (value of the closing assignment); the
#            caller visible in this file invokes it in void context.
#
# The line is copied into a localized $_ and passed through every
# substitution below, in order. A substitution that does not match leaves
# the text untouched. The first group targets alignment-section header lines
# (leading '>'); the second group targets summary-table description lines.
#
# Most patterns interpolate $Word, $Acc, $Descrip, $Int or $Signif and carry
# the /o flag, so each is compiled once using whatever those package
# variables hold when the statement first runs (they are assigned in
# _set_markup_data); later changes to the variables have no effect.
#
# NOTE(review): the POD describes adding HTML links, but the replacement
# strings seen here contain no tags -- confirm the markup has not been lost.
my $line_ref = shift;
local $_ = $$line_ref;
##
## REGEXPS FOR ALIGNMENT SECTIONS (within the body of the report,
## the text above the list of HSPs).
##
## If the HSP alignment sections don't start with a '>' we have no way
## of finding them. This occurs with reports saved from HTML-formatted
## web pages, which we shouldn't be processing here anyway.
## To facilitate parsing of HTML-formatted reports by Bio::Tools::Blast.pm,
## the anchors should be added at the BEGINNING of the HSP
## alignment section lines and at the END of the description section lines.
# Removing " ! " added by GCG (first occurrence only).
s/ ! / /;
### NCBI-specific markups for HSP alignment section lines:
# Warnings are switched off for the remainder of this sub -- presumably
# because optional capture groups that did not participate in a match are
# interpolated as undef in the replacements; TODO confirm.
local($^W) = 0;
# GenBank/EMBL, DDBJ hits (GenBank Format):
s@^>(gb|emb|dbj|ref)\|($Word)(\|$Word)?(.*)$@$1:$2$3$4
(Back|Top)@o;
s@^>(gb|emb|dbj|ref)\|($Word)(\| \(?$Word\)?)(.*)$@$1:$2$3$4
(Back|Top)@o;
# PIR hits
s@^>pir\|\|($Word)( .*)$@pir:$1 $2
(Back|Top)@o;
# GI hits (GenBank Format): using a nested (())
s@^>(gi)\|($Word)( +\(($Word)\))( .*)$@$1:$2$3$5
(Back|Top)@o;
# GNL PID hits (GenBank Format):
s@^>(gnl)\|($Word)?(\|$Word) +\(($Word)\)( .*)$@$1:$2$3($4)$5
(Back|Top)@o;
# BBS and PRF hits (what db?) (GenBank Format):
s@^>(bbs|prf)\|\|?($Word)( .*)$@$1:$2$3
(Back|Top)@o;
# SwissProt hits:
s@^>sp\|($Word)\|($Word)?( .*)$@sp:$1|$2$3
(Back|Top)@o;
## PDB ids with or without a chain identifier (GenBank format)
s@^>pdb\|(\d\w{3})\|[\w ] (.*)$@pdb:$1 (Back|Top) $2@o;
### SGD-specific markups for HSP alignment section lines:
## PDB ids without chain identifier
s@^>PDB_UNIQUEP:(\d\w{3})_ (.*)$@PDB:$1 (Back|Top) $2@o;
## PDB ids with chain identifier
s@^>PDB_UNIQUEP:(\d\w{3})_([\w ]{1})(.*)$@PDB:$1 Chain:$2, (Back|Top) $3@o;
s@^>($Word)PEPT:GI_(\d+)(.*)$@$1:GI_$2 $3
(Back|Top)@o;
# The gcg blast dataset generating tools up-case all sbjct sequence IDs.
# This is fine for yeast but not worm. This is considered a hack here.
# (Lower-cases the part of the WORMPEPT id that follows the first dot.)
s@WORMPEPT:(\w+\.)(\S+)@WORMPEPT:$1\L$2\E@;
s@^>WORMPEPT:(\S+)(.*)$@WORMPEP:$1 $2
(Back|Top)@o;
s#^>(GB_$Word):($Word) ($Acc) (.*$)#$2|$3$4\t[GenBank / EMBL / SGD] #o;
# Sac's version: ORF name is an external link into SGD:
s@^>ORFP:(\S*) +([\w-]+)(.*$)@ORFP:$1 $2$3
     [Gene/Sequence Resources / ORF Map] Back|Top@o;
# Mike's version (disabled):
# s#^>ORFP:(\S*) (.*$)#ORFP:$1 $2\t[Gene/Sequence Resources / ORF Map] #o;
s#^>ORFN:(\S*) (.*$)#ORFN:$1 $2\t[Gene/Sequence Resources] / ORF Map #o;
s#^>NR_SC:GP-\S* gi\|(\w+)([\w\|]*) (.*$)#GenPept|$1 gp|$2 $3\t[GenPept / SGD] #o;
s#^>NR_SC:SW-$Word SW:($Word) ($Acc) (.*$)#SWISS|$1 $2 $3\t[SwissProt / Entrez]#o;
s#^>NR_SC:PIR-$Word PIR:($Word) (.*$)# PIR|$1 $2\t[PIR / Entrez]#o;
s#^>CHRS:([A-Z][0-9]*) (.*)$#$1 $2: [Gene/Sequence Resources / ORF Map]#o;
s#^>NOT:([A-Z]_[0-9]*-[0-9]*)( *)Chromosome ([0-9]*) from ([0-9]*) to ([0-9]*)$#$1 $2Chromosome $3 from $4 to $5 [Gene/Sequence Resources / ORF Map / Retrieve DNA]#o;
s#^>UTR5_SC_[0-9]*:(\S*) 5' untranslated region, chr(\S*) ([0-9]*) - ([0-9]*)(.*$)#UTR5:$1 $1 5' untranslated region, chr$2 $3 - $4, $5\t[Gene/Sequence Resources / ORF Map]#o;
# Hits without a db identifier.
# If any of the previous regexps succeed, the leading '>' will be removed.
# Otherwise, this regexp could cause trouble.
s@^>($Word)(.*)$@$1 $2
(Back|Top)@o;
##
## REGEXPS FOR SUMMARY TABLE LINES AT TOP OF REPORT (a.k.a. 'descriptions')
## (table of sequence id, description, score, P/Expect value, n)
##
## Not using bold face to highlight the sequence IDs since this can throw
## off formatting of the line when the IDs are different lengths. This led to
## the scores and P/Expect values not lining up properly.
### NCBI-specific markups for description lines:
# GenBank/EMBL, DDBJ hits (GenBank Format):
s@^ ?(gb|emb|dbj|ref)\|($Word)(\|$Word)?($Descrip)($Int +)($Signif)(.*)$@$1:$2$3$4$5$6$7@o;
s@^ ?(gb|emb|dbj|ref)\|($Word)(\| \(?$Word\)?)($Descrip)($Int +)($Signif)(.*)$@$1:$2$3$4$5$6$7@o;
# Missing inner ID
s@^ ?pir\|\|($Word)?($Descrip)($Int) ($Signif)(.*)$@pir:$1 $2$3 $4$5@o;
# GI hits (GenBank Format): using a nested (())
s@^ ?gi\|($Word)( +\(($Word)\))($Descrip)($Int) ($Signif)(.*)$@gi:$1$2$4$5 $6$7@o;
s@^ ?(gnl)\|($Word)?(\|$Word +)\(($Word)\)($Descrip)($Int) ($Signif)(.*)$@$1:$2$3($4)$5$6 $7$8@o;
s@^ ?(bbs|prf)\|\|?($Word)($Descrip)($Int) ($Signif)(.*)$@$1:$2 $3$4 $5$6@o;
## SwissProt accessions (GenBank format)
s@^ ?sp\|($Word)(\|$Word)?($Descrip)($Int) ($Signif)(.*)$@sp:$1$2$3$4 $5$6@o;
## PDB ids with or without a chain ID (GenBank format)
s@^ ?pdb\|($Word)\|($Word)?($Descrip)($Int) ($Signif)(.*)$@pdb:$1_$2$3$4 $5$6@o;
### SGD-specific markups for description lines:
## PDB ids without chain identifier
s@^ ?PDB_UNIQUEP:(\d\w{3})_($Descrip)($Int) ($Signif)(.*)$@PDB:$1 $2$3 $4$5@o;
## PDB ids with chain identifier
s@^ ?PDB_UNIQUEP:(\d\w{3})_(\w)($Descrip)($Int) ($Signif)(.*)$@PDB:$1 Chain:$2$3$4 $5$6@o;
s@^ ?($Word)PEPT:GI_(\d+)($Descrip)($Int) ($Signif)(.*)$@$1:GI_$2 $3 $4 $5 $6@o;
s@^ *WORMPEPT:(\S+)($Descrip)($Int) ($Signif)(.*)$@WORMPEP:$1 $2 $3 $4$5@o;
## Mike Cherry's markups. SAC note: added back database name to allow
## the HTML-formatted version to be parsable by Blast.pm.
s#^ ?(GB_$Word:)($Word)( *)($Acc)($Descrip)($Int) ( *$Signif) ( *\d*)$#GenBank\|$2\|$4 $3$5$6 $7 $8#o;
# Mike's version (disabled):
# s#^ ?(ORFP:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 $5 $6#o;
# My modification:
s@^ ?ORFP:(\S*) +([\w-]+)(.*[ ]{2,3})($Int) ($Signif) ($Int)$@ORFP:$1 $2$3$4 $5 $6@o;
s#^ ?(ORFN:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 $5 $6#o;
s#^ ?(NR_SC:GP-)(\S*) ( *)gi\|(\w+)([\w\|]*)($Descrip)($Int) ($Signif) ($Int)$#GenPept\|$4$3 gp|$2 $5$6$7 $8 $9#o;
s#^ ?(NR_SC:SW-)$Word ( *)SW:($Word) ($Acc)($Descrip)($Int) ($Signif) ($Int)$#SWISS\|$3 SW:$3 $4 $5$6 $7 $8#o;
s#^ ?(NR_SC:PIR-)$Word ( *)PIR:($Word)($Descrip)($Int) ($Signif) ($Int)$#PIR\|$3 $2 PIR:$3 $4$5 $6 $7#o;
s#^ ?(CHRS:)([A-Z][0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1Segment:$2 $3 $4 $5 $6#o;
s#^ ?(CHR[0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1 $2 $3 $4 $5#o;
s#^ ?(NOT:)([A-Z]_[0-9]*-[0-9]*)($Descrip)($Int) ($Signif) ($Int)$#$1$2 $3 $4 $5 $6#o;
s#^ ?(UTR5_SC_[0-9]*:)(\S*)($Descrip)($Int) ($Signif) ($Int)$#UTR5:$2 $3 $4 $5 $6#o;
# Hits without a db identifier.
s@^ ?($Word)($Descrip)($Int) ($Signif)(.*)$@$1$2$3 $4$5@o;
# Hand the (possibly rewritten) line back to the caller through the reference.
$$line_ref = $_;
}
=head2 _prog_ref_html
Usage : n/a; utility method used by get_html_func().
Purpose : Get the list of literature references and program descriptions
: for BLAST reports.
Returns : string with HTML
See Also : L<get_html_func>()
=cut
#------------------
sub _prog_ref_html {
#------------------
    # Returns the fixed block of literature references and program
    # descriptions as a single string. Takes no arguments.
    #
    # Deliberately not part of the returned list (not really a reference for
    # the Blast algorithm itself but an interesting usage):
    #   Gish, Warren, and David J. States (1993). Identification of protein
    #   coding regions by database similarity search.
    #   Nature Genetics 3:266-72.
    # This citation used to follow the heredoc with one line left
    # uncommented, which Perl tried to compile as code.
    return <<"QQ_REF_QQ";
References:
- Altschul, Stephen F., Warren Gish, Webb Miller, Eugene W. Myers, and David J. Lipman (1990).
Basic local alignment search tool.
J. Mol. Biol. 215: 403-10.
- Altschul et al. (1997), Gapped BLAST and PSI-BLAST:
a new generation of protein database search programs.
Nucl. Acids Res. 25: 3389-3402.
- Program Descriptions:
BLAST2 |
WU-BLAST2 |
Help Manual
HTML formatting provided by the Bioperl Blast module.
QQ_REF_QQ
}
=head2 _genbank_alert
Usage : n/a; utility method used by get_html_func().
Purpose : Get a special alert for BLAST reports against all of GenBank/EMBL.
Returns : string with HTML
See Also : L<get_html_func>()
=cut
#------------------
sub _genbank_alert {
#------------------
    # Builds the cautionary note about hits derived from DNA sequences that
    # contain more than one gene. Takes no arguments; returns the note as a
    # single string.
    my $alert_text = <<"QQ_GENBANK_QQ";
CAUTION: Hits reported on this page may be derived from DNA sequences
that contain more than one gene.
To avoid mis-interpretation, always check database entries
for any sequence of interest to verify that the similarity
occurs within the described sequence. (E.g., A DNA sequence
for gene X as reported in GenBank may contain a 5' or 3'
fragment of coding sequence for a neighboring gene Y, yet will
be listed as gene X, since gene Y had not yet been identified).
QQ_GENBANK_QQ

    return $alert_text;
}
=head2 strip_html
Usage : $boolean = &strip_html( string_ref );
: This method is exported.
Purpose : Removes HTML formatting from a supplied string.
: Attempts to restore the Blast report to enable
: parsing by Bio::Tools::Blast.pm.
Returns : Boolean: true if string was stripped, false if not.
Argument : string_ref = reference to a string containing the whole Blast
: report.
Throws : Croaks if the argument is not a scalar reference.
Comments : Based on code originally written by Alex Dong Li
: (ali@genet.sickkids.on.ca).
: This method does some Blast-specific stripping
: (adds back a '>' character in front of each HSP
: alignment listing).
:
: THIS METHOD IS HIGHLY ERROR-PRONE!
:
: Removal of the HTML tags and accurate reconstitution of the
: non-HTML-formatted report is highly dependent on structure of
: the HTML-formatted version. For example, it assumes that first
: line of each alignment section (HSP listing) starts with an
: <a name=...> anchor tag. This permits the reconstruction of the
: original report in which these lines begin with a ">".
: This is required for parsing.
:
: If the structure of the Blast report itself is not intended to
: be a standard, the structure of the HTML-formatted version
: is even less so. Therefore, the use of this method to
: reconstitute parsable Blast reports from HTML-format versions
: should be considered a temporary solution.
See Also : B
=cut
#---------------
sub strip_html {
#---------------
    # Removes HTML markup from a Blast report held in a scalar, in place.
    #
    # Argument : reference to the scalar containing the report text.
    # Returns  : 1 if any substitution changed the text, 0 otherwise.
    # Throws   : croaks if the argument is not a SCALAR reference.
    #
    # This may not be the best way to remove HTML tags. However, it is simple.
    # It won't work under the following conditions:
    #  1) if a quoted > appears in a tag (does this ever happen?)
    #  2) if a tag is split over multiple lines and this method is
    #     used to process one line at a time.
    my $string_ref = shift;

    ref $string_ref eq 'SCALAR' or
        croak ("Can't strip HTML: ".
               "Argument should be a SCALAR reference, not a ${\ref $string_ref}");

    # Nothing to strip from an undefined string.
    return 0 unless defined $$string_ref;

    my $str      = $$string_ref;
    my $stripped = 0;

    # Removing "<a name=...>" anchors at the start of a line and adding the
    # '>' character for HSP alignment listings. The newline before the anchor
    # is consumed as well, so the '>' lands on the line above, which is
    # presumably blank in the HTML reports -- TODO confirm.
    $str =~ s/(\A|\n)<a\s+name\s*=[^>]+> ?/>/sgi and $stripped = 1;

    # Removing all "<>" tags and non-breaking-space entities. Ordinary spaces
    # must be left alone or the column layout of the report is destroyed.
    $str =~ s/<[^>]+>|&nbsp;?//sgi and $stripped = 1;

    # Re-uniting any lone '>' characters with the text that follows them.
    $str =~ s/(\A|\n)>\s+/\n\n>/sgi and $stripped = 1;

    $$string_ref = $str;
    return $stripped;
}
1;
__END__
#####################################################################################
# END OF CLASS #
#####################################################################################