Raw content of Bio::EnsEMBL::Analysis::Runnable::BlastmiRNA
# Ensembl module for Bio::EnsEMBL::Analysis::Runnable::BlastmiRNA
#
# Copyright (c) 2004 Ensembl
#
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable::BlastmiRNA
=head1 SYNOPSIS
my $blast = Bio::EnsEMBL::Analysis::Runnable::BlastmiRNA->new
(
-query => $slice,
-program => 'wublastn',
-database => 'embl_vertrna',
-options => 'hitdist=40 -cpus=1',
-parser => $bplitewrapper,
-filter => $featurefilter,
);
$blast->run;
my @output =@{$blast->output};
=head1 DESCRIPTION
Modified blast runnable for specific use with miRNAs
Use for running BLASTN of genomic vs miRNAs prior to
miRNA analysis.
Keeps the coverage in the dna_align_feature score field.
Also clusters overlapping hits and picks the one with the lowest
evalue to represent that cluster.
=head1 CONTACT
Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::Runnable::BlastmiRNA;
use strict;
use warnings;
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Analysis::Runnable::Blast;
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable::Blast);
=head2 run
Arg : string, directory
Function : a generic run method. This checks the directory specifed
: to run it, write the query sequence to file, marks the query sequence
: file and results file for deletion, runs the analysis parses the
: results and deletes any files
Returntype: 1
Exceptions: throws if no query sequence is specified
Example :
=cut
sub run{
my ($self, $dir) = @_;
$self->workdir($dir) if($dir);
throw("Can't run ".$self." without a query sequence")
unless($self->query);
$self->checkdir();
my $filename = $self->write_seq_file();
$self->files_to_delete($filename);
$self->files_to_delete($self->resultsfile);
$self->run_analysis;
$self->parse_results;
$self->delete_files;
return 1;
}
=head2 parse_results
Arg [1] : Scalar coverage cutoff value
Function : overrides the parse results method in runnable to allow for
: coverage calculations
Returntype: none
Exceptions: none
Example : $self->parse_results(80);
=cut
sub parse_results{
my ($self,$coverage_cutoff) = @_;
my $results = $self->results_files;
my @daf_coverage_results;
my $filtered_output;
my $bplite = $self->parser->get_parsers($results);
foreach my $blast (@{$bplite}){
while( my $subject = $blast->nextSbjct){
while (my $hsp = $subject->nextHSP) {
my @daf_results;
my $hsp_length = $hsp->length."\t";
my $subject_length = $subject->{'LENGTH'};
$subject_length = 1 unless ($subject_length);
my $coverage = $hsp_length/$subject_length*100;
$coverage =~ s/\.\d+//;
if ($coverage_cutoff){
next unless($coverage > $coverage_cutoff);
}
$subject->name =~ /^\S+\s+(\w+)/;
my $name = $1;
push @daf_results, $self->parser->split_hsp($hsp,$name);
# add coverage into daf score
foreach my $daf(@daf_results){
#####################################################################
# swaps strands over if blast aligns +strand genomic to -ve strand in
# RFAM file. RFAM file sequences are the correct orientation whereas
# the genomic can be either, with unspliced DNA it becomes
# impossibe to tell automatically
if ($daf->hstrand == -1 && $daf->strand == 1){
$daf->strand(-1);
$daf->hstrand(1);
}
$daf->score($coverage);
$daf->external_db_id(3200);
push @daf_coverage_results, $daf;
}
}
}
}
return undef unless ( @daf_coverage_results);
my $output = $self->cluster(\@daf_coverage_results);
$self->output($output);
}
=head2 cluster
Arg [1] : Array ref of Bio::EnsEMBL::DnaDnaAlignFeature
Function : Clusters overlapping blast hits and chooses the one to represent the cluster.
Returntype: Array ref of Bio::EnsEMBL::DnaDnaAlignFeature
Exceptions: None
Example : my $output = $self->cluster(\@dna_align_features);
=cut
sub cluster{
my ($self,$dafs_ref)=@_;
my @dafs = @$dafs_ref;
@dafs = sort{$a->p_value <=> $b->p_value} @dafs;
my $start =0;
my @representative_sequences;
DAFS: foreach my $daf (@dafs){
my @cluster;
$start++;
next DAFS unless($daf);
push @cluster,$daf;
MATCHES: for (my $index = $start; $index <= $#dafs ; $index++){
next MATCHES unless ($dafs[$index]);
if ($daf->end >= $dafs[$index]->start() && $daf->start() <= $dafs[$index]->end){
push @cluster,$dafs[$index];
$dafs[$index] = undef;
}
}
# want to pick identical full length hits by preference
foreach my $daf ( @cluster ){
if($daf->score >= 100 && $daf->percent_id == 100){
push @representative_sequences, $daf;
next DAFS;
}
}
# otherwise sort by e_value
@cluster = sort{$b->p_value <=> $a->p_value} @cluster;
push @representative_sequences, pop @cluster;
}
return \@representative_sequences;
}
1;