Raw content of Bio::EnsEMBL::Analysis::Runnable::Infernal
# Ensembl module for Bio::EnsEMBL::Analysis::Runnable::Infernal
#
# Copyright (c) 2004 Ensembl
#
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable::Infernal
=head1 SYNOPSIS
my $runnable = Bio::EnsEMBL::Analysis::Runnable::Infernal->new
(
'-queries' => \@array_ref_of_dna_align_features,
'-analysis' => $self->analysis,
);
$runnable->run;
$output = $runnable->output;
=head1 DESCRIPTION
Runnable for Infernal (Runs ncRNA analysis on blast hits).
Wraps cmsearch, part of the Infernal suite of programs by Sean Eddy.
Parses results to build non-coding gene objects and a representation
of secondary structure which is string length encoded and stored as a
transcript attribute
=head1 CONTACT
Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::Runnable::Infernal;
use strict;
use warnings;
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::DBEntry;
use Bio::EnsEMBL::Analysis::Runnable::RNAFold;
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
my $verbose = 1;
=head2 new
Title : new
Usage : my $runnable = Bio::EnsEMBL::Analysis::Runnable::Infernal->new
Function : Instantiates new Infernal runnable
Returns : Bio::EnsEMBL::Analysis::Runnable::Infernal object
Exceptions : none
Args : Array ref of Bio::EnsEMBL::DnaDnaAlignFeature objects
=cut
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($queries,$thresholds) = rearrange(['QUERIES'], @args);
$self->queries($queries);
$self->get_thresholds;
return $self;
}
=head2 run
Title : run
Usage : my $runnable->run;
Function : Runs the runnable
Returns : None
Exceptions : Throws if no query sequence is provided
Args : None
=cut
sub run{
my ($self) = @_;
my $queries = $self->queries;
my @attributes;
my $descriptions = $self->get_descriptions;
foreach my $query (@$queries){
$self->query($self->get_sequence($query));
$self->throw("Can't run ".$self." without a query sequence")
unless($self->query);
$self->checkdir();
# write the seq file
my $filename = $self->write_seq_file();
$self->files_to_delete($filename);
$self->files_to_delete($self->resultsfile);
# run cmsearch
$self->run_analysis();
# parse the cmsearch results file - make a hash containing all
# the results ordered by score;
my $results = $self->parse_results($query);
next unless($results);
# make the gene object out of the highest scoring result that
# overlaps the blast hit
my $gene = $self->make_gene($results,$query,$descriptions);
$self->output($gene) if $gene;
}
$self->delete_files;
return 1;
}
=head2 run_analysis
Title : run_analysis
Usage : $runnable->run_analysis;
Function : Runs the analysis (cmsearch)
Returns : None
Exceptions : Throws in the event of cmsearch returning an error code
Args : None
=cut
sub run_analysis{
my ($self) = @_;
my $db = $self->analysis->db_file."/".$self->query->display_id.".cm";
my $command = $self->program;
my %thresholds = %{$self->thresholds};
my $domain = $self->query->display_id;
my $filename = $self->queryfile;
my $results_file = $self->create_filename("Infernal.$domain.out");
$self->files_to_delete($results_file);
$self->resultsfile($results_file);
$command .= " -W ".$thresholds{$domain}{'win'};
if ($thresholds{$domain}{'mode'} =~ /local/) {
$command .= " --local";
}
$command .= " $db $filename > $results_file";
print STDOUT "Running infernal ".$command."\n" if $verbose;
open(my $fh, "$command |") ||
$self->throw("Error opening Infernal cmd <$command>." .
" Returned error $? Infernal EXIT: '" .
($? >> 8) . "'," ." SIGNAL '" . ($? & 127) .
"', There was " . ($? & 128 ? 'a' : 'no') .
" core dump");
# this loop reads the STDERR from the blast command
while (<$fh>) {
if (/FATAL:(.+)/) {
my $match = $1;
$self->throw("Infernal failed to run - $match \n");
}
}
}
=head2 parse_results
Title : parse_results
Usage : $runnable->parse_results(\@dna_align_features)
Function : Parses all cmsearch output: rejects alignments where score is below
: the threshold defined for the RFAM family in question.
: The thresholds are defined in the file /data/blastdb/Rfam/Rfam.thr.
: A dna_align_feature representing the initial RFAM blast hit is used to determine
: the family and position on the genome.
: Parses all the results (there can be more than one) and stores them in
: an array of hashes which is then sorted by the cmsearch score
Returns : None
Exceptions : Throws if it cannot find, open or close the cmsearch output
Args : ref to array of hashes
=cut
sub parse_results{
my ($self, $daf) = @_;
my @dafs;
my $align;
my $line =0;
my $domain = substr($daf->hseqname,0,7);
my %thresholds = %{$self->thresholds};
my $threshold = $thresholds{$domain}{'thr'};
print STDERR "Domain $domain has threshold $threshold\n" if $verbose;
my $results = $self->resultsfile;
my $ff = $self->feature_factory;
if(!-e $results){
$self->throw("Can't parse a results file that dosen't exist ".$results.
" Infernal:parse_results");
}
my @results;
my ($hit,$start,$end,$score,$strand,$str) =0;
open(INFERNAL, $results) or $self->throw("FAILED to open ".$results.
" INFERNAL:parse_results");
LINE: while(){
chomp;
if ($_ =~ /^hit/){
if ($score && $threshold && $score > $threshold && $align->{'name'}){
$str = $self->parse_structure($align);
print STDERR "positve result at ".$daf->seq_region_name.":".$daf->seq_region_start."-".$daf->seq_region_end." strand ".
$daf->strand."\n" if $verbose;
push @results, {str => $str,
score => $score,
start => $start,
end => $end,
strand=> $strand};
}
$align = {};
$align->{'name'} = $domain ;
$line = -1;
}
if ($_ =~ /^CPU/){
$line = -1;
}
print STDERR "$line\t$_\n"if $verbose;
if ($score && $score >= $threshold && $line >= 0){
# parsing goes in 5 lines
# split the lines and store each element in a anonymous hash array
$line++;
if ($_){
push @{$align->{'str'}} , split//,substr($_,11,60) if ($line == 1);
push @{$align->{'target'}}, split//,substr($_,11,-9) if ($line == 2);
push @{$align->{'match'}} , split//,substr($_,11,60) if ($line == 3);
push @{$align->{'query'}} , split//,substr($_,11,-9) if ($line == 4);
}
else{
$line =0;
}
}
if ($_ =~ /^hit\s+(\d+)\s+:\s+(\d+)\s+(\d+)\s+(\d+).(\d+)\s+bits/){
$hit = $1;
$start = $2;
$end = $3;
$score = $4+$5/100;
print STDERR "hit - $hit\nscore - $score\n" if $verbose;;
if ($score >= $threshold){
if ($end < $start){
$strand = -1;
my $temp = $end;
$end = $start;
$start = $temp;
}
else {
$strand =1;
}
$line = 0;
}
else{
$line = -1;
}
}
}
if ($align->{'name'} && $score > $threshold){
print STDERR "positve result at ".$daf->seq_region_name.":".$daf->seq_region_start."-".$daf->seq_region_end." strand ".
$daf->strand."\n" if $verbose;
$str = $self->parse_structure($align);
push @results, {str => $str,
score => $score,
start => $start,
end => $end,
strand=> $strand};
}
close(INFERNAL) or $self->throw("FAILED to close ".$results.
" INFERNAL:parse_results");
# sort the results to get the highest scoring infernal alignment
@results = sort {$b->{'score'} <=> $a->{'score'}} @results;
if (@results){
return \@results;
}
else {
return;
}
}
=head2 parse_structure
Title : parse_structure
Usage : my $structure = $runnable->parse_structure(\%hash_ref_containing_parsed_cmsearch_results)
Function : Parses cmsearch alignment to create a structure line which represents the
: predicted secondary structure of the ungaped sequence
Returns : String representing a run length encoded form of the structure line
Exceptions : Throws if it cannot find, open or close the cmsearch output
Args : ref to an array of hashes wrapping the cmsearch output alignment
=cut
sub parse_structure{
my ($self,$align)=@_;
my @all_matches;
my @stack;
my @big_gaps;
my $matchstring;
my @matches;
my $big_gap=0;
my @attributes;
# Brace matching
# push open braces on to the stack
for (my $i=0 ; $i< scalar(@{$align->{'str'}}); $i++){
if ($align->{'str'}[$i] eq '(' or
$align->{'str'}[$i] eq '<' or
$align->{'str'}[$i] eq '[' or
$align->{'str'}[$i] eq '{'){
push @stack,$i;
}
# pop the positions of the open brace off the stack as you find close braces
if ($align->{'str'}[$i] eq ')' or
$align->{'str'}[$i] eq '}' or
$align->{'str'}[$i] eq ']' or
$align->{'str'}[$i] eq '>'){
$all_matches[$i] = pop @stack;
}
}
@stack = [];
# Need to do the reverse proces to get all matches
for (my $i = scalar(@{$align->{'str'}}-1); $i >=0 ; $i--){
if ($align->{'str'}[$i] eq ')' or
$align->{'str'}[$i] eq '}' or
$align->{'str'}[$i] eq ']' or
$align->{'str'}[$i] eq '>'){
push @stack,$i;
}
# pop the positions of the close brace off the stack as you find open braces
if ($align->{'str'}[$i] eq '(' or
$align->{'str'}[$i] eq '<' or
$align->{'str'}[$i] eq '[' or
$align->{'str'}[$i] eq '{'){
$all_matches[$i] = pop @stack;
}
}
for (my $i=0 ; $i< scalar(@{$align->{'str'}}); $i++){
# Parse out large gaps by looking for ~ on the str line;
if ($align->{'query'}[$i] eq '*'){
my $string;
for (my $j=$i+1 ; $j < scalar(@{$align->{'str'}}) ; $j++){
last if ($align->{'query'}[$j] eq '*');
$string .= $align->{'query'}[$j];
}
if ($string =~ /\[(.+)\]/){
my $gap = $1;
$gap =~ s/\D//g;
$big_gaps[$i] = $gap;
}
}
#skip over if you have a gap - beware gaps at the other end of the alignment;
if ($align->{'query'}[$i] eq '-'){
next;
}
# skip over if you have a missmatch
if ($align->{'match'}[$i] eq ' ' or
$align->{'match'}[$i] eq ':'){
$matches[$i] = '.';
next;
}
# Found a match
if (defined $all_matches[$i]){
# check there isnt a gap at the other end
if ($align->{'query'}[$all_matches[$i]] eq '-'){
$matches[$i] = '.';
next;
}
$matches[$i] = $align->{'str'}[$i];
$matches[$all_matches[$i]] = $align->{'str'}[$all_matches[$i]];
}
else {
$matches[$i] = $align->{'str'}[$i];
}
}
for (my $i=0 ; $i< scalar(@{$align->{'str'}}); $i++){
if ($big_gaps[$i]){
for (my $j = 0 ; $j < $big_gaps[$i] ; $j++){
$matchstring .= ".";
}
$i=$i+5;
next;
}
if ($matches[$i]){
$matchstring.= $matches[$i];
}
}
# make all characters into either (,),or .
$matchstring =~ s/[\<\[\{]/(/g;
$matchstring =~ s/[\>\]\}]/)/g;
$matchstring =~ s/[,:_-]/./g;
return $matchstring;
}
=head2 make_gene
Title : make_gene
Usage : my $gene = $runnable->make_gene($result_hash,$dna_align_feature)
Function : Creates the non coding gene object from the parsed result file.
: Takes all the results from the result file sorted by score and
: selects the highest scoring result that also overlaps the original BLAST hit
: Uses Bio::EnsEMBL::Analysis::Runnable::RNAfold to create a structure
: line for the gene. The structure obtained from the cmsearch alignment
: is used to constrain the folding prediction made by RNAfold.
: Adds the encoded structure as a transript attribute and xrefs are
: made based on the original BLAST hit
Returns : Bio::EnsEMBL::Gene
Exceptions : None
Args : ref to array of hashes containing parsed cmsearch results
: Bio::EnsEMBL::DnaDnaAlignFeature ($dna_align_feature)
=cut
sub make_gene{
my ($self,$results,$daf,$descriptions) = @_;
my $domain = substr($daf->hseqname,0,7);
my $description = $descriptions->{$domain}->{'description'};
my $name = $descriptions->{$domain}->{'name'};
my %thresholds = %{$self->thresholds};
my $padding = $thresholds{$domain}{'win'};
my %gene_hash;
my $slice = $daf->slice;
my @attributes;
my ($start,$end,$strand,$str,$score,$exon);
# step through all the results in order of score, highest first until
# you find a model that overlapping the blast hit
foreach my $result (@$results){
if ($daf->strand == 1){
$start = $daf->start - $padding + $result->{'start'} - 1 ;
$end = $daf->start - $padding + $result->{'end'} - 1;
} else {
$start = $daf->end + $padding - $result->{'end'} + 1 ;
$end = $daf->end + $padding - $result->{'start'} + 1;
}
$str = $result->{'str'};
$score = $result->{'score'};
# exons
# need to remove padding from exon coordinates to put them in the correct place
$exon = Bio::EnsEMBL::Exon->new
(
-start => $start,
-end => $end,
-strand => $daf->strand,
-slice => $slice,
-phase => -1,
-end_phase => -1
);
# reject if it falls of the start of the slice
next if ($exon->start < 1);
# reject if it falls of the end of the slice
next if ($exon->end > $slice->length);
# Only allow exons that overlap the origional dna_align_feature and
# have a secondary structure that is possible to parse
last if ($exon->overlaps($daf));
}
# return undef if no suitable candidates are found
return unless ($exon->start >= 1);
return unless ($exon->overlaps($daf));
# $daf->score($score);
$exon->add_supporting_features($daf);
# transcripts
my $transcript = Bio::EnsEMBL::Transcript->new;
$transcript->add_Exon($exon);
$transcript->start_Exon($exon);
$transcript->end_Exon($exon);
my $gene = Bio::EnsEMBL::Gene->new;
#Biotypes
$gene->biotype("misc_RNA");
$gene->biotype("snRNA") if($description =~ /uclear/ or
$description =~ /pliceosomal/ );
$gene->biotype("snoRNA") if($description =~ /ucleolar/);
$gene->biotype("rRNA") if($description =~ /ibosomal/);
$gene->confidence("NOVEL");
$gene->description($description." [Source: RFAM ".$self->analysis->db_version."]");
print STDERR "Rfam_id $domain ".$description."\n"if $verbose;;
$gene->analysis($self->analysis);
$gene->add_Transcript($transcript);
$transcript->biotype($gene->biotype);
# XREFS
my $xref = Bio::EnsEMBL::DBEntry->new
(
-primary_id => substr($daf->hseqname,0,7),
-display_id => $name,
-dbname => 'RFAM',
-release => 1,
-version => 0,
-description => $description." [Source: RFAM ".$self->analysis->db_version."]",
);
# Use RNA fold to tidy up the structure parsed from cmsearch results
my $seq = Bio::PrimarySeq->new(
-display_id => $domain,
-seq => $gene->seq,
);
my $RNAfold = Bio::EnsEMBL::Analysis::Runnable::RNAFold->new
(
-analysis => $self->analysis,
-sequence => $seq,
-structure => $str,
);
$RNAfold->run;
# return if there is no structure to display
return unless $RNAfold->structure;
# get the final structure encoded by run length
my @final_str = @{$RNAfold->encoded_str};
foreach my $str (@final_str){
# add the transcript attribute to the gene hash
my $attribute = Bio::EnsEMBL::Attribute->new
(-CODE => 'ncRNA',
-NAME => 'Structure',
-DESCRIPTION => 'RNA secondary structure line',
-VALUE => $str
);
push @attributes,$attribute;
}
# add the final structure to the gene as a transcript attribute
$gene_hash{'attrib'} = \@attributes;
$gene_hash{'gene'} = $gene;
$gene_hash{'xref'} = $xref;
print "Chosen hit and structure constraint : $start $end $strand $description $name\n$str\n";
return \%gene_hash;
}
=head2 get_sequence
Title : get_sequence
Usage : my $seq = $runnable->get_sequence($dna_align_feature)
Function : Makes a BioPerl seq object containing the sequecnce for cmsearch to run on.
: Uses the original RFAM blast hit represented by a dna alignment feature and adds
: 5' and 3' flanking sequence based on predefined values set for each RFAM familly
: in the /data/blastdb/Rfam/Rfam.thr file
Returns : Bio::Seq object
Exceptions : None
Args : Bio::EnsEMBL::DnaDnaAlignFeature
=cut
sub get_sequence{
my ($self,$daf)=@_;
my $domain = substr($daf->hseqname,0,7);
my %thresholds = %{$self->thresholds};
my $slice = $daf->slice;
my $padding = $thresholds{$domain}{'win'};
my $old_start = $daf->start;
my $old_end = $daf->end;
print STDERR "Using padding of $padding\t"if $verbose;
# add padding
$daf->start($daf->start-$padding);
$daf->end($daf->end+$padding);
my $seq = Bio::Seq->new(
-seq => $daf->seq,
-display_id => $domain,
);
# remove padding
$daf->start($old_start);
$daf->end($old_end);
print STDERR " total seq length = "if $verbose;;
print $seq->length."\n"if $verbose;;
return $seq;
}
=head2 get_thresholds
Title : get_thresholds
Usage : $runnable->get_thresholds
Function : Fetches and stores predefined thresholds set for each RFAM familly
: in the /data/blastdb/Rfam/Rfam.thr file
Returns : None
Exceptions : Throws if it cannot open the thresholds file
Args : Bio::EnsEMBL::DnaDnaAlignFeature
=cut
sub get_thresholds{
my($self)= @_;
# read threshold file
my %thr;
# full thresholds
open( T,$self->analysis->db_file."/Rfam.thr" ) or $self->throw("can't file the Rfam.thr file");
while() {
if( /^(RF\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*$/ ) {
$thr{ $1 } = { 'id' => $2, 'thr' => $3, 'win' => $4, 'mode' => $5 };
}
}
close T;
$self->thresholds(\%thr);
}
=head2 get_descriptions
Title : get_descriptions
Usage : $runnable->get_descriptions
Function : Fetches and stores predefined descriptions for each RFAM familly
: in the Rfam.descriptions file
Returns : none
Exceptions : Throws if it cannot open the descriptions file
Args : Bio::EnsEMBL::DnaDnaAlignFeature
=cut
sub get_descriptions{
my($self)= @_;
my %descriptions;
my $domain;
my $name;
# read descriptions file
return undef unless -e $self->analysis->db_file."/Rfam.seed";
open( T,$self->analysis->db_file."/Rfam.seed") or
$self->throw("can't file the ".$self->analysis->db_file."/Rfam.seed file");
while() {
chomp;
if ($_ =~ /^\#=GF AC (.+)/){
$domain = $1;
}
if ($_ =~ /^\#=GF DE (.+)/){
$descriptions{$domain}{'description'} = $1;
}
if ($_ =~ /^\#=GF ID (.+)/){
$descriptions{$domain}{'name'} = $1;
}
}
close T;
return \%descriptions if scalar(keys %descriptions > 0);
$self->throw("Unable to find descriptions");
return undef;
}
#################################################################
# Containers
=head2 queries
Title : queries
Usage : my %queries = %$runnable->queries
Function : Get/ set for the dna alignfeatures defining as the query sequences for cmsearch
Returns : Hash reference
Exceptions : None
Args : Array reference of Bio::EnsEMBL::DnaDnaAlignFeature
=cut
sub queries {
my ($self, $queries) = @_;
if ($queries){
$self->{'_queries'} = $queries;
}
return $self->{'_queries'};
}
=head2 thresholds
Title : thresholds
Usage : my %thresholds = %$runnable->thresholds
Function : Get/ set for the cmsearch thresholds for each RFAM familly
Returns : Hash reference
Exceptions : None
Args : Hash Reference
=cut
sub thresholds {
my ($self, $thresholds) = @_;
if ($thresholds){
$self->{'_thresholds'} = $thresholds;
}
return $self->{'_thresholds'};
}
=head2 output
Arg [1] : Bio::EnsEMBL::Analysis::Runnable
Arg [2] : hashref of output
Function : overrides runnable output method to allow storing of hash refs
Exceptions: throws if not passed an hashref
Example :
=cut
sub output{
my ($self, $output) = @_;
if(!$self->{'output'}){
$self->{'output'} = [];
}
if($output){
throw("Must pass Runnable:output an hashref not a ".$output)
unless(ref($output) eq 'HASH');
push(@{$self->{'output'}}, $output);
}
return $self->{'output'};
}
1;