Raw content of Bio::EnsEMBL::Analysis::Runnable::BlastMiniGenewise
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable::BlastMiniGenewise
=head1 SYNOPSIS
my $obj = Bio::EnsEMBL::Analysis::Runnable::BlastMiniGenewise->new
('-genomic' => $genseq,
'-features' => $features,
'-protein' => $protein,
'-seqfetcher' => $seqfetcher,
'-check_repeated' => 1);
$obj->run
my @newfeatures = $obj->output;
(where $protein and $genseq are Bio::Seq objects,
$features are X objects and $seqfetcher is a
SeqFetcher object.)
=head1 DESCRIPTION
=head1 CONTACT
Describe contact details here
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
# Let the code begin...
package Bio::EnsEMBL::Analysis::Runnable::BlastMiniGenewise;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Analysis::Runnable::BlastMiniBuilder;
use Bio::EnsEMBL::Analysis::Runnable::MultiMiniGenewise;
use Bio::EnsEMBL::Analysis::Runnable::Blast;
use Bio::EnsEMBL::Analysis::Tools::BlastDB;
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::Analysis::Runnable::ExonerateTranscript;
use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Analysis::Tools::Logger qw(logger_info);
use Bio::EnsEMBL::Analysis::Tools::FilterBPlite;
use Bio::EnsEMBL::Analysis::Tools::FeatureFilter;
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable::BlastMiniBuilder);
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my( $ids, $seqfetcher, $minigenewise_options, $check_repeated,
$full_seq,$exonerate,$exonerate_path, $exonerate_options, $genewise_options,
$blast_parser, $blast_filter, $post_filter, $score_cutoff, $blastdb, $mmg_options) =
rearrange([qw(IDS SEQFETCHER MINIGENEWISE_OPTIONS
CHECK_REPEATED FULLSEQ EXONERATE EXONERATE_PATH
EXONERATE_OPTIONS GENEWISE_OPTIONS BLAST_PARSER
BLAST_FILTER POST_BLAST_FILTER SCORE_CUTOFF BLASTDB
MULTIMINIGENEWISE_OPTIONS)], @args);
####SETTING DEFAULTS#####
$self->check_repeated(1);
#########################
$self->seqfetcher($seqfetcher);
$self->ids($ids);
$self->minigenewise_options($minigenewise_options);
$self->full_seq($full_seq);
$self->exonerate($exonerate);
$self->exonerate_path($exonerate_path);
$self->exonerate_options($exonerate_options);
$self->check_repeated($check_repeated) if(defined($check_repeated));
$self->genewise_options($genewise_options);
$self->multiminigenewise_options($mmg_options);
$self->blast_parser($blast_parser);
$self->blast_filter($blast_filter);
$self->post_blast_filter($post_filter);
$self->score_cutoff($score_cutoff);
$self->blastdb($blastdb);
#note this is a default but due to the nature of it defaulting to on this
#way is needed
throw("No query sequence input") unless ($self->query);
throw("No seqfetcher provided") unless ($self->seqfetcher);
throw("No ids arrary ref provided") unless ($self->ids);
throw("No analysis defined") unless($self->analysis);
return $self;
}
sub ids{
my ($self, $arg) = @_;
if(defined $arg){
throw("BlastMiniGenewise::ids must be given an arrayref not a $arg")
unless(ref($arg) eq "ARRAY");
$self->{'ids'} = $arg;
}
return $self->{'ids'};
}
#Acessor methods
=head2 blast_filter
Title : blast_filter
Usage : $self->blast_filter($blast_filter)
Function: Get/set method for genewise blast_filter
Returns :
Args :
=cut
sub blast_filter{
my ($self, $arg) = @_;
if(defined $arg){
throw($arg." must have method filter_results")
unless($arg->can("filter_results"));
$self->{blast_filter} = $arg;
}
if(!$self->{blast_filter}){
$self->{blast_filter} = Bio::EnsEMBL::Analysis::Tools::FeatureFilter
->new(
-max_pvalue => 100,
-prune => 1,
-coverage => 100,
);
}
return $self->{blast_filter};
}
=head2 minigenewise_options
Title : minigenewise_options
Usage : $self->minigenewise_options($minigenewise_options)
Function: Get/set method for minigenewise minigenewise_options
Returns :
Args :
=cut
sub minigenewise_options{
my ($self, $arg) = @_;
if(defined $arg){
throw("BlastMiniGenewise::minigenewise_options must be given an hasref".
"not a $arg") unless(ref($arg) eq "HASH");
$self->{'minigenewise_options'} = $arg;
}
return $self->{'minigenewise_options'};
}
=head2 multiminigenewise_options
Title : multiminigenewise_options
Usage : $self->multiminigenewise_options($multiminigenewise_options)
Function: Get/set method for multiminigenewise multiminigenewise_options
Returns :
Args :
=cut
sub multiminigenewise_options{
my ($self, $arg) = @_;
if(defined $arg){
throw("BlastMiniGenewise::multiminigenewise_options must be given an hasref".
"not a $arg") unless(ref($arg) eq "HASH");
$self->{'multiminigenewise_options'} = $arg;
}
return $self->{'multiminigenewise_options'};
}
=head2 genewise_options
Title : genewise_options
Usage : $self->genewise_options($genewise_options)
Function: Get/set method for genewise genewise_options
Returns :
Args :
=cut
sub genewise_options{
my ($self, $arg) = @_;
if(defined $arg){
throw("BlastMiniGenewise::genewise_options must be given an hasref".
"not a $arg") unless(ref($arg) eq "HASH");
$self->{'genewise_options'} = $arg;
}
return $self->{'genewise_options'};
}
=head2 score_cutoff
Title : score_cutoff
Usage : $self->score_cutoff($score_cutoff)
Function: Get/set method for genewise score_cutoff
Returns :
Args :
=cut
sub score_cutoff{
my ($self, $arg) = @_;
if(defined($arg)){
$self->{score_cutoff} = $arg;
}
return $self->{score_cutoff};
}
sub post_blast_filter{
my ($self, $arg) = @_;
if(defined($arg)){
$self->{post_blast_filter} = $arg;
}
return $self->{post_blast_filter};
}
=head2 seqfetcher
Title : seqfetcher
Usage : $self->seqfetcher($seqfetcher)
Function: Get/set method for SeqFetcher
Returns : Bio::EnsEMBL::Analysis::SeqFetcher object
Args : Bio::EnsEMBL::Pipeline::SeqFetcher object
=cut
sub seqfetcher {
my( $self, $value ) = @_;
if ($value) {
throw("BlastMiniGenewise, seqfetcher must be a ".
"Bio::EnsEMBL::Pipeline::SeqFetcher")
unless ($value->isa("Bio::DB::RandomAccessI"));
$self->{'_seqfetcher'} = $value;
}
return $self->{'_seqfetcher'};
}
=head2 exonerate
Title : exonerate
Usage : $self->exonerate
Function: Get/Set method for using exonerate rather than Blast
Returns : 0 (False) or 1 (True)
Args : 0 (False) or 1 (True)
=cut
sub exonerate {
my( $self, $value ) = @_;
if ($value) {
$self->{'_exonerate'} = $value;
}
return $self->{'_exonerate'};
}
=head2 exonerate_path
Title : exonerate_path
Usage : $path = $self->exonerate_path
Function: Get/Set method for the path to the Exonerate executeable
Returns : String
Args : String
=cut
sub exonerate_path {
my( $self, $value ) = @_;
if($value){
my $path = $self->locate_executable($value);
$self->{'exonerate_path'} = $path;
}
throw($self->{'exonerate_path'}." is not executable")
if($self->{'exonerate_path'} && !(-x $self->{'exonerate_path'}));
return $self->{'exonerate_path'};
}
=head2 exonerate_options
Title : exonerate_options
Usage : $options = $self->exonerate_options
Function: Get/Set method for the options for running Exonerate
Returns : String
Args : String
=cut
sub exonerate_options {
my( $self, $value ) = @_;
if ($value) {
$self->{'_exonerate_options'} = $value;
}
return $self->{'_exonerate_options'};
}
=head2 full_seq
Title : full_seq
Usage : $self->full_seq
Function: Get/Set method for using the full genomic
: sequence rather than the mini seq
Returns : 0 (False) or 1 (True)
Args : 0 (False) or 1 (True)
=cut
sub full_seq {
my( $self, $value ) = @_;
if ($value) {
$self->{'_full_seq'} = $value;
}
return $self->{'_full_seq'};
}
sub blast_parser{
my ($self, $value) = @_;
if($value){
throw($value." must have a method called parse_file")
unless($value->can("parse_file"));
$self->{blast_parser} = $value;
}
if(!$self->{blast_parser}){
my $regex;
if($self->query->name =~ /^\S+\:\S*\:(\S+)\:\S*:\S*\:\S*/){
$regex = '^\S+\:\S*\:(\S+)\:\S*:\S*\:\S*';
}elsif ($self->query->name =~ /^(.*)\|(.*)\|(.*)/) {
$regex = '^.*\|(.*)\|.*';
} elsif ($self->query->name =~ /^..\:(.*)/) {
$regex = '^..\:(.*)';
}else {
$regex = '^(\w+)';
}
$self->{blast_parser} = Bio::EnsEMBL::Analysis::Tools::FilterBPlite
->new(
-regex => $regex,
-query_type => "pep",
-database_type => "dna",
);
}
return $self->{blast_parser};
}
sub blastdb{
my ($self, $arg) = @_;
if($arg){
throw($arg." must be a blastdb object")
unless($arg->isa("Bio::EnsEMBL::Analysis::Tools::BlastDB"));
$self->{blastdb} = $arg;
}
if(!$self->{blastdb}){
my $blastdb = Bio::EnsEMBL::Analysis::Tools::BlastDB
->new(
-sequences => [$self->query],
-output_dir => $self->workdir,
-mol_type => "DNA",
);
$self->{blastdb} = $blastdb;
}
return $self->{blastdb};
}
sub blastdb_file{
my ($self, $arg) = @_;
if($arg){
throw($arg." must exist as a file") unless(-e $arg);
$self->{blastdb_file} = $arg;
}
if(!$self->{blastdb_file}){
my $blastdb_file = $self->blastdb->create_blastdb;
foreach my $file(@{$self->blastdb->list_dbfiles}){
$self->files_to_delete($file);
}
$self->{blastdb_file} = $blastdb_file;
}
return $self->{blastdb_file};
}
#functional methods
=head2 run
Title : run.
Usage : $self->run()
Function: Performs a Blast search or an Exonerate search
: with all the protiens, passes the resulting proteins
: into a Bio::EnsEMBL::Analysis::Runnable::MultiMiniGenewise
: runnable
Returns : none
Args : none
=cut
sub run {
my ($self) = @_;
my $features = $self->run_alignment;
unless (@$features) {
print $self->query->name." has no associated features. ".
"Finishing run.\n";
return;
}
my $mg_runnables;
my @sorted_features = sort{$a->start <=> $b->start
|| $a->end <=> $b->end} @$features;
my %hash;
foreach my $f(@sorted_features){
if(!$hash{$f->hseqname}){
$hash{$f->hseqname} = 1;
}else{
$hash{$f->hseqname}++;
}
}
if ($self->check_repeated){
$mg_runnables = $self->build_runnables(@sorted_features);
} else {
my $runnable = $self->make_object($self->query, \@sorted_features);
push (@$mg_runnables, $runnable);
}
my $output_count;
foreach my $mg (@$mg_runnables){
$mg->run;
my @f = @{$mg->output};
$self->output(\@f);
$output_count += @f;
}
$self->delete_files;
return 1;
}
sub run_alignment{
my ($self) = @_;
my $seqs = $self->get_Sequences;
if(@$seqs != @{$self->ids}){
warning("Managed to get only ".scalar(@$seqs)." sequences from ".scalar(@{$self->ids}).
" for alignment run, check indices\n");
}
my $valid_seqs = $self->validate_Sequences($seqs);
if(@$valid_seqs != @$seqs){
warning("Only ".scalar(@$valid_seqs)." sequences were validated from ".@$seqs.
" sequences");
}
logger_info("There are ".@$valid_seqs." sequences to align using blast/exonerate");
my @features;
my @sorted_seqs = sort {$a->id cmp $b->id} @$valid_seqs;
foreach my $seq(@sorted_seqs){
if($self->exonerate){
push(@features, @{$self->run_exonerate($seq)});
}else{
#push(@features, @{$self->run_blast($seq)});
foreach my $f(@{$self->run_blast($seq)}){
$f->invert;
$f->slice($self->query);
push(@features, $f);
}
}
}
return \@features;
}
sub run_blast {
my ($self, $seq) = @_;
my $blastdb_file = $self->blastdb_file;
my $run = Bio::EnsEMBL::Analysis::Runnable::Blast
->new(
-query => $seq,
-program => 'wutblastn',
-database => $blastdb_file,
-parser => $self->blast_parser,
-filter => $self->blast_filter,
-analysis => $self->analysis,
);
$run->run;
my @blast_features;
foreach my $f (@{$run->output}){
my $id = $seq->id;
if($f->seqname =~ /$id/){
$f->seqname($id);
}else{
throw($f->seqname." doesn't contain ".$id);
}
$f->slice($self->query);
push(@blast_features,$f);
}
my @filter_features;
if($self->post_blast_filter){
#print STDERR "Filtering blast results\n";
#this code will through out any sets of features where
#none of the blast scores are higher than score set in config
#on the grounds its unlikely in that case that it will produce
#a good gene if a gene at all
my @fs;
my %feature_hash;
while(my $f = shift(@blast_features)){
if(!$feature_hash{$f->seqname}){
$feature_hash{$f->seqname} = [];
push(@{$feature_hash{$f->seqname}}, $f);
}else{
push(@{$feature_hash{$f->seqname}}, $f);
}
}
HIT: foreach my $hid(keys(%feature_hash)){
my @hit_features = @{$feature_hash{$hid}};
foreach my $f (@hit_features){
if($f->score > $self->score_cutoff){
push(@filter_features, @hit_features);
next HIT;
}
}
}
}else{
@filter_features = @blast_features;
}
return \@filter_features;
}
=head2 run_exonerate
Title : run_exonerate
Usage : @features = $self->run_exonerate()
Function: Uses Bio::Ensembl::Analysis::ExonerateTranscript to align
: the proteins to the slice. The supoporting features of
: the transcripts are returned
Returns : none
Args : list of Bio::EnsEMBL::BaseAlignFeature objects
=cut
sub run_exonerate {
my ($self)= @_;
my @features;
my @seqs = @{$self->get_Sequences};
if (@seqs != @{$self->ids}) {
warning("Managed to get only " . scalar(@seqs) . " of ".
scalar(@{$self->ids}) ."for Exonerate run; check indices\n");
}
my @valid_seqs = $self->validate_sequence(@seqs);
my @sorted_seqs = sort {$a->id cmp $b->id} @valid_seqs;
foreach my $seq (@sorted_seqs) {
my $exonerate = new Bio::EnsEMBL::Analysis::Runnable::ExonerateTranscript
(
-program => $self->exonerate_path,
-analysis => $self->analysis,
-query_seqs => ([$seq]),
-query_type => 'protein',
-target_seqs => ([$self->query]),
-target_type => 'dna',
-options => "-W 7 ". $self->exonerate_options,
);
eval {
$exonerate->run;
};
if ($@){
throw("Exonerate died on me$@\n");
}
my $transcripts = $exonerate->output;
unless (scalar(@$transcripts > 0)){
print "Didn't get a good exonerate alignment, trying again with a shorter word length\n";
$exonerate = new Bio::EnsEMBL::Analysis::Runnable::ExonerateTranscript
(
-program => $self->exonerate_path,
-analysis => $self->analysis,
-query_seqs => ([$seq]),
-query_type => 'protein',
-target_seqs => ([$self->query]),
-target_type => 'dna',
-options => "-W 5 ". $self->exonerate_options,
);
eval {
$exonerate->run;
};
if ($@){
throw("Exonerate died on me$@\n");
}
$transcripts = $exonerate->output;
}
unless (scalar(@$transcripts > 0)){
print STDERR "Exonerate cannot align ".$seq->display_id." \n";
next;
}
foreach my $trans (@{$exonerate->output}){
foreach my $exon (@{$trans->get_all_Exons}){
push @features, @{$exon->get_all_supporting_features};
}
}
}
return \@features;
}
#=head2 make_mmgw
=head2 make_object
Args [1] : $miniseq - a Bio::Seq object representing the
target sequence for the genewise run.
Args [2] : $features - reference to a list of
FeaturePairs generated by a blast run.
Example : $self->make_object($miniseq, $features);
Description: Takes a genomic sequence and builds a
MultiMiniGenewise runnable object using the
list of FeaturePairs.
Returntype : A list of
Bio::EnsEMBL::Analysis::Runnable::MultiMiniGenewise
Exceptions : none
Caller : $self->build_runnables
=cut
#sub make_mmgw {
sub make_object {
my ($self, $miniseq, $features, $cluster_start, $cluster_end) = @_;
# # Before we pass our blast generated features to
# # MultiMiniGenewise we must first convert them from
# # PepDnaAlignFeatures to FeaturePairs.
#
# my @newf;
# foreach my $f (@$features){
# my $newf = new Bio::EnsEMBL::FeaturePair(-feature1 => $f->feature2,
# -feature2 => $f->feature1);
# push(@newf,$newf);
# }
# Create a MultiMiniGenewise object with the features we've
# just converted.
my %pars = %{$self->multiminigenewise_options} if($self->multiminigenewise_options);
if (defined($cluster_end)) {
$pars{-cluster_start} = $cluster_start;
$pars{-cluster_end} = $cluster_end;
}
my $mg = Bio::EnsEMBL::Analysis::Runnable::MultiMiniGenewise->new
(
-query => $miniseq,
-features => $features,
-seqfetcher => $self->seqfetcher,
-minigenewise_options => $self->minigenewise_options,
-fullseq => $self->full_seq,
-genewise_options => $self->genewise_options,
-analysis => $self->analysis,
%pars,
);
%pars = ();
return $mg;
}
sub get_Sequences {
my ($self) = @_;
my @seq;
foreach my $id (@{$self->ids}) {
logger_info("Fetching ".$id." sequence\n");
my $seq = $self->get_Sequence($id);
if ($seq && $seq->length > 0) {
push(@seq,$seq);
} else {
print "Invalid sequence for $id - skipping\n";
}
}
return \@seq;
}
sub validate_Sequences {
my ($self, $seqs) = @_;
my @validated;
foreach my $seq (@$seqs) {
my $sequence = $seq->seq;
if ($sequence !~ /[^acgtn]/i) {
push (@validated, $seq);
} else {
$_ = $sequence;
my $len = length ($_);
my $invalidCharCount = tr/bB/xX/;
if ($invalidCharCount / $len > 0.05) {
warning("Ignoring ".$seq->display_id()
." contains more than 5% ($invalidCharCount) "
."odd nucleotide codes ($sequence)\n Type returns "
.$seq->moltype().")\n");
} else {
$seq->seq($_);
push (@validated, $seq);
}
}
}
return \@validated;
}
=head2 get_Sequence
Title : get_Sequence
Usage : my $seq = get_Sequence($id)
Function: Fetches sequences with id $id
Returns : Bio::PrimarySeq
Args : none
=cut
sub get_Sequence {
my ($self,$id) = @_;
my $seqfetcher = $self->seqfetcher;
my $seq;
my $name;
if($seqfetcher->can('db')){
my @dbs = $seqfetcher->db;
$name = $dbs[0];
}
my ($p, $f, $l) = caller;
if (!defined($id)) {
warning("No id input to get_Sequence");
}
eval {
$seq = $seqfetcher->get_Seq_by_acc($id);
};
if($@) {
warning("Problem fetching sequence for id [$id] with $seqfetcher $name $@\n");
return undef;
}
if(!defined($seq)){
warning("Could not find sequence for [$id] with $seqfetcher $name called by $f:$l");
}
return $seq;
}
1;