Raw content of Bio::EnsEMBL::Analysis::Runnable::Finished::Blast
package Bio::EnsEMBL::Analysis::Runnable::Finished::Blast;
use strict;
use warnings;
use BlastableVersion;
use Symbol;
use Bio::EnsEMBL::Analysis::Config::Blast;
use Bio::EnsEMBL::Analysis::Config::General;
use Bio::EnsEMBL::Utils::Exception qw(throw warning info);
use base ("Bio::EnsEMBL::Analysis::Runnable::Blast");
$ENV{BLASTMAT} = $MAT_DIR;
$ENV{BLASTFILTER} = $BIN_DIR;
$ENV{BLASTDB} = $BLAST_DIR;
BEGIN {
print STDERR "\nUSING " . __PACKAGE__ . "\n\n";
}
sub get_analysis {
my ($self) = @_;
my ($ana);
unless ( $ana = $self->{'_analysis'} ) {
my ($source) = $self->program =~ m{([^/]+)$}
or throw(
"Can't parse last element from path: '" . $self->program . "'" );
$ana = $self->{'_analysis'} = Bio::EnsEMBL::Analysis->new(
-db => $self->database,
-db_version => 1, # ARUUGA!!!
-program => $source,
-program_version => 1,
-gff_source => $source,
-gff_feature => 'similarity',
-logic_name => 'blast',
);
}
return $ana;
}
sub run_analysis {
my ($self) = @_;
DB:foreach my $database (@{$self->databases}) {
my $db = $database;
$db =~ s/.*\///;
#allow system call to adapt to using ncbi blastall.
#defaults to WU blast
my $command = $self->program;
my $blastype = "";
my $filename = $self->queryfile;
my $results_file = $self->create_filename($db, 'blast.out');
$self->files_to_delete($results_file);
$self->results_files($results_file);
if ($self->type eq 'ncbi') {
$command .= " -d $database -i $filename ";
} else {
$command .= " $database $filename -gi ";
}
$command .= $self->options. ' 2>&1 > '.$results_file;
info("Running blast ".$command);
my $fh;
unless (open($fh, "$command |")) {
$self->delete_files;
throw("Error opening Blast cmd <$command>." .
" Returned error $? BLAST EXIT: '" .
($? >> 8) . "'," ." SIGNAL '" . ($? & 127) .
"', There was " . ($? & 128 ? 'a' : 'no') .
" core dump");
}
# this loop reads the STDERR from the blast command
# checking for FATAL: messages (wublast) [what does ncbi blast say?]
# N.B. using simple die() to make it easier for RunnableDB to parse.
while(<$fh>){
if(/FATAL:(.+)/){
my $match = $1;
next DB if $match =~ /There is nothing in the requested database to search/;
# clean up before dying
$self->delete_files;
if($match =~ /no valid contexts/){
die qq{"VOID"\n}; # hack instead
}elsif($match =~ /Bus Error signal received/){
die qq{"BUS_ERROR"\n}; # can we work out which host?
}elsif($match =~ /Segmentation Violation signal received./){
die qq{"SEGMENTATION_FAULT"\n}; # can we work out which host?
}elsif($match =~ /Out of memory;(.+)/){
# (.+) will be something like "1050704 bytes were last
#requested."
die qq{"OUT_OF_MEMORY"\n};
# resenD to big mem machine by rulemanager
}elsif($match =~ /the query sequence is shorter
than the word length/){
#no valid context
die qq{"VOID"\n}; # hack instead
}else{
warning("Something FATAL happened to BLAST we've not ".
"seen before, please add it to Package: "
. __PACKAGE__ . ", File: " . __FILE__."\n[$match]\n");
die ($self->unknown_error_string."\n");
# send appropriate string
#as standard this will be failed so job can be retried
#when in pipeline
}
}elsif(/WARNING:(.+)/){
# ONLY a warning usually something like hspmax=xxxx was exceeded
# skip ...
}elsif(/^\s{10}(.+)/){ # ten spaces
# Continuation of a WARNING: message
# Hope this doesn't catch more than these.
# skip ...
}
}
unless(close $fh){
# checking for failures when closing.
# we should't get here but if we do then $? is translated
#below see man perlvar
$self->delete_files;
warning("Error running Blast cmd <$command>. Returned ".
"error $? BLAST EXIT: '" . ($? >> 8) .
"', SIGNAL '" . ($? & 127) . "', There was " .
($? & 128 ? 'a' : 'no') . " core dump");
die ($self->unknown_error_string."\n");
}
}
}
sub parse_results {
my ($self) = @_;
my $results = $self->results_files;
my $bplites = $self->parser->parse_files($results);
my $threshold_type = $self->parser->threshold_type;
my $threshold = $self->parser->threshold;
my $discard_overlaps = $self->parser->discard_overlaps;
my $coverage = $self->parser->coverage;
my $hits =
$self->parser->get_best_hits( $bplites, $threshold_type, $threshold );
my $query_length = $self->query->length
or throw("Couldn't get query length");
my $output =
$self->parser->_apply_coverage_filter( $query_length, $hits,
$threshold_type, $threshold, $coverage, $discard_overlaps );
$self->output($output);
return $output;
}
sub clean_databases {
my ( $self) = @_;
$self->{databases} = [];
}
sub clean_results_files {
my ($self) = @_;
$self->{'results_files'} = [];
}
sub databases {
my ( $self, @vals ) = @_;
if ( not exists $self->{databases} ) {
$self->{databases} = [];
}
foreach my $val (@vals) {
my $db_names = $val;
my @databases;
$db_names =~ s/\s//g;
foreach my $dbname ( split( ",", $db_names ) )
{ # allows the use of a comma separated list in $self->database
# prepend the environment variable $BLASTDB if
# database name is not an absoloute path
unless ( $dbname =~ m!^/! ) {
$dbname = $ENV{BLASTDB} . "/" . $dbname;
}
# If the expanded database name exists put this in
# the database array.
#
# If it doesn't exist then see if $database-1,$database-2 exist
# and put them in the database array
if ( -f $dbname ) {
push( @databases, $dbname );
}
else {
my $count = 1;
my $db_filename;
while ( -f ( $db_filename = "${dbname}-${count}" ) ) {
push( @databases, $db_filename );
$count++;
}
$! = undef
; # to stop pollution as it will be "No such file or directory" after while loop above.
}
}
if ( scalar(@databases) == 0 ) {
throw( "No databases exist for " . $db_names );
} else {
foreach my $db_name (@databases){
$self->get_db_version($db_name) if $db_name =~ /emnew_/;
}
$self->get_db_version($databases[0]);
push @{$self->{databases}}, @databases;
}
}
return $self->{databases};
}
=head2 get_db_version
Title : get_db_version
[ distinguished from RunnableDB::*::db_version_searched() ]
Useage : $self->get_db_version('/data/base/path')
$obj->get_db_version()
Function: Set a blast database version from the supplied path
Get a blast database version from previously supplied path
Uses tjrc''s BlastableVersion module.
Returns : String
Args : String (should be a full database path)
Caller : $self::fetch_databases()
RunnableDB::Finished_EST::db_version_searched()
=cut
sub get_db_version {
my ( $self, $db ) = @_;
my $debug_this = 0; # this just shows debug info.
my $force_dbi = 0; # this will force a dbi call SLOW!!!!!!
unless ( $self->{'_db_version_searched'} ) {
if ($db) {
$BlastableVersion::debug = $debug_this;
warning
"BlastableVersion is cvs revision ".$BlastableVersion::revision." \n"
if $debug_this;
my $ver = eval {
my $blast_ver = BlastableVersion->new();
$blast_ver->force_dbi($force_dbi); # if set will be SLOW.
$blast_ver->get_version($db);
$blast_ver;
};
throw("I failed to get a BlastableVersion for $db") if $@;
my $dbv = $ver->version();
my $sgv = $ver->sanger_version();
my $name = $ver->name();
my $date = $ver->date();
unless ($dbv) {
throw( "I know nothing about $db I tried to find out:\n"
. " - name <". $name . ">\n"
. " - date <". $date . ">\n"
. " - version <". $dbv . ">\n"
. " - sanger_version <". $sgv. ">\n" );
}
$self->{'_db_version_searched'} = $dbv;
}
else {
throw( "You've asked about what I searched, but I don't know."
. " It's not set. I need to be called with a database filename first"
);
# The code probably got here because of a problem with the MLDBM
# cache file on the machine this was running on.
# the cache file is stored @ /var/tmp/blast_versions
# try
}
}
return $self->{'_db_version_searched'};
}
sub DESTROY
{
my ( $self ) = @_;
# just do the cleanup
$self->delete_files;
}
1;
__END__
=head1 NAME - Bio::EnsEMBL::Analysis::Runnable::Finished::Blast
=head1 DESCRIPTION
Unlike Bio::EnsEMBL::Analysis::Runnable::Blast,
this module creates FeaturePairs from HSPs after
doing any depth filtering to save time and memory
when searching genomic sequences that generate
large numbers of blast matches.
=head2 usage of Bio::EnsEMBL::Analysis::Config::Blast with this module
BLAST_CONFIG =>
{
Uniprot =>
{
BLAST_PARSER => 'Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper',
PARSER_PARAMS => {
-regex => '^\w+\s+(\w+)',
-query_type => 'pep',
-database_type => 'pep',
-threshold_type => 'PVALUE',
-threshold => 0.01,
-coverage => 10,
-discard_overlaps => 1,
},
BLAST_FILTER =>'Bio::EnsEMBL::Analysis::Tools::FeatureFilter',
FILTER_PARAMS => {
-min_score => 200,
-prune => 1,
},
BLAST_PARAMS => {
-unknown_error_string => 'FAILED',
-type => 'wu',
},
},
DEFAULT =>
{
BLAST_PARSER => 'Bio::EnsEMBL::Analysis::Tools::BPliteWrapper',
PARSER_PARAMS => {
-regex => '^(\w+)',
-query_type => undef,
-database_type => undef,
},
BLAST_FILTER => undef,
FILTER_PARAMS => {},
BLAST_PARAMS => {
-unknown_error_string => 'FAILED',
-type => 'wu',
}
},
BLAST_AB_INITIO_LOGICNAME => 'Genscan'
}
=head1 AUTHOR
James Gilbert B jgrg@sanger.ac.uk
modified by Sindhu K. Pillai Bsp1@sanger.ac.uk