Bio::EnsEMBL::Analysis::Runnable
Infernal
Toolbar
Summary
Bio::EnsEMBL::Analysis::Runnable::Infernal
Package variables
Privates (from "my" definitions)
$verbose = 1
Included modules
Inherit
Synopsis
my $runnable = Bio::EnsEMBL::Analysis::Runnable::Infernal->new
(
'-queries' => \@array_ref_of_dna_align_features,
'-analysis' => $self->analysis,
);
$runnable->run;
$output = $runnable->output;
Description
Runnable for Infernal (Runs ncRNA analysis on blast hits).
Wraps cmsearch, part of the Infernal suite of programs by Sean Eddy.
Parses results to build non-coding gene objects and a representation
of secondary structure which is string length encoded and stored as a
transcript attribute
Methods
Methods description
Title : get_descriptions Usage : $runnable->get_descriptions Function : Fetches and stores predefined descriptions for each RFAM familly : in the Rfam.descriptions file Returns : none Exceptions : Throws if it cannot open the descriptions file Args : Bio::EnsEMBL::DnaDnaAlignFeature |
Title : get_sequence Usage : my $seq = $runnable->get_sequence($dna_align_feature) Function : Makes a BioPerl seq object containing the sequecnce for cmsearch to run on. : Uses the original RFAM blast hit represented by a dna alignment feature and adds : 5' and 3' flanking sequence based on predefined values set for each RFAM familly : in the /data/blastdb/Rfam/Rfam.thr file Returns : Bio::Seq object Exceptions : None Args : Bio::EnsEMBL::DnaDnaAlignFeature |
Title : get_thresholds Usage : $runnable->get_thresholds Function : Fetches and stores predefined thresholds set for each RFAM familly : in the /data/blastdb/Rfam/Rfam.thr file Returns : None Exceptions : Throws if it cannot open the thresholds file Args : Bio::EnsEMBL::DnaDnaAlignFeature |
Title : make_gene Usage : my $gene = $runnable->make_gene($result_hash,$dna_align_feature) Function : Creates the non coding gene object from the parsed result file. : Takes all the results from the result file sorted by score and : selects the highest scoring result that also overlaps the original BLAST hit : Uses Bio::EnsEMBL::Analysis::Runnable::RNAfold to create a structure : line for the gene. The structure obtained from the cmsearch alignment : is used to constrain the folding prediction made by RNAfold. : Adds the encoded structure as a transript attribute and xrefs are : made based on the original BLAST hit Returns : Bio::EnsEMBL::Gene Exceptions : None Args : ref to array of hashes containing parsed cmsearch results : Bio::EnsEMBL::DnaDnaAlignFeature ($dna_align_feature) |
Title : new Usage : my $runnable = Bio::EnsEMBL::Analysis::Runnable::Infernal->new Function : Instantiates new Infernal runnable Returns : Bio::EnsEMBL::Analysis::Runnable::Infernal object Exceptions : none Args : Array ref of Bio::EnsEMBL::DnaDnaAlignFeature objects |
Arg [1] : Bio::EnsEMBL::Analysis::Runnable Arg [2] : hashref of output Function : overrides runnable output method to allow storing of hash refs Exceptions: throws if not passed an hashref Example : |
Title : parse_results Usage : $runnable->parse_results(\@dna_align_features) Function : Parses all cmsearch output: rejects alignments where score is below : the threshold defined for the RFAM family in question. : The thresholds are defined in the file /data/blastdb/Rfam/Rfam.thr. : A dna_align_feature representing the initial RFAM blast hit is used to determine : the family and position on the genome. : Parses all the results (there can be more than one) and stores them in : an array of hashes which is then sorted by the cmsearch score Returns : None Exceptions : Throws if it cannot find, open or close the cmsearch output Args : ref to array of hashes |
Title : parse_structure Usage : my $structure = $runnable->parse_structure(\%hash_ref_containing_parsed_cmsearch_results) Function : Parses cmsearch alignment to create a structure line which represents the : predicted secondary structure of the ungaped sequence Returns : String representing a run length encoded form of the structure line Exceptions : Throws if it cannot find, open or close the cmsearch output Args : ref to an array of hashes wrapping the cmsearch output alignment |
Title : queries Usage : my %queries = %$runnable->queries Function : Get/ set for the dna alignfeatures defining as the query sequences for cmsearch Returns : Hash reference Exceptions : None Args : Array reference of Bio::EnsEMBL::DnaDnaAlignFeature |
Title : run Usage : my $runnable->run; Function : Runs the runnable Returns : None Exceptions : Throws if no query sequence is provided Args : None |
Title : run_analysis Usage : $runnable->run_analysis; Function : Runs the analysis (cmsearch) Returns : None Exceptions : Throws in the event of cmsearch returning an error code Args : None |
Title : thresholds Usage : my %thresholds = %$runnable->thresholds Function : Get/ set for the cmsearch thresholds for each RFAM familly Returns : Hash reference Exceptions : None Args : Hash Reference |
Methods code
sub get_descriptions
{ my($self)= @_;
my %descriptions;
my $domain;
my $name;
return undef unless -e $self->analysis->db_file."/Rfam.seed";
open( T,$self->analysis->db_file."/Rfam.seed") or
$self->throw("can't file the ".$self->analysis->db_file."/Rfam.seed file");
while(<T>) {
chomp;
if ($_ =~ /^\#=GF AC (.+)/){
$domain = $1;
}
if ($_ =~ /^\#=GF DE (.+)/){
$descriptions{$domain}{'description'} = $1;
}
if ($_ =~ /^\#=GF ID (.+)/){
$descriptions{$domain}{'name'} = $1;
}
}
close T;
return\% descriptions if scalar(keys %descriptions > 0);
$self->throw("Unable to find descriptions");
return undef;
}
} |
sub get_sequence
{ my ($self,$daf)=@_;
my $domain = substr($daf->hseqname,0,7);
my %thresholds = %{$self->thresholds};
my $slice = $daf->slice;
my $padding = $thresholds{$domain}{'win'};
my $old_start = $daf->start;
my $old_end = $daf->end;
print STDERR "Using padding of $padding\t"if $verbose;
$daf->start($daf->start-$padding);
$daf->end($daf->end+$padding);
my $seq = Bio::Seq->new(
-seq => $daf->seq,
-display_id => $domain,
);
$daf->start($old_start);
$daf->end($old_end);
print STDERR " total seq length = "if $verbose;;
print $seq->length."\n"if $verbose;;
return $seq; } |
sub get_thresholds
{ my($self)= @_;
my %thr;
open( T,$self->analysis->db_file."/Rfam.thr" ) or $self->throw("can't file the Rfam.thr file");
while(<T>) {
if( /^(RF\d+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*$/ ) {
$thr{ $1 } = { 'id' => $2, 'thr' => $3, 'win' => $4, 'mode' => $5 };
}
}
close T;
$self->thresholds(\%thr); } |
sub make_gene
{ my ($self,$results,$daf,$descriptions) = @_;
my $domain = substr($daf->hseqname,0,7);
my $description = $descriptions->{$domain}->{'description'};
my $name = $descriptions->{$domain}->{'name'};
my %thresholds = %{$self->thresholds};
my $padding = $thresholds{$domain}{'win'};
my %gene_hash;
my $slice = $daf->slice;
my @attributes;
my ($start,$end,$strand,$str,$score,$exon);
foreach my $result (@$results){
if ($daf->strand == 1){
$start = $daf->start - $padding + $result->{'start'} - 1 ;
$end = $daf->start - $padding + $result->{'end'} - 1;
} else {
$start = $daf->end + $padding - $result->{'end'} + 1 ;
$end = $daf->end + $padding - $result->{'start'} + 1;
}
$str = $result->{'str'};
$score = $result->{'score'};
$exon = Bio::EnsEMBL::Exon->new
(
-start => $start,
-end => $end,
-strand => $daf->strand,
-slice => $slice,
-phase => -1,
-end_phase => -1
);
next if ($exon->start < 1);
next if ($exon->end > $slice->length);
last if ($exon->overlaps($daf));
}
return unless ($exon->start >= 1);
return unless ($exon->overlaps($daf));
$exon->add_supporting_features($daf);
my $transcript = Bio::EnsEMBL::Transcript->new;
$transcript->add_Exon($exon);
$transcript->start_Exon($exon);
$transcript->end_Exon($exon);
my $gene = Bio::EnsEMBL::Gene->new;
$gene->biotype("misc_RNA");
$gene->biotype("snRNA") if($description =~ /uclear/ or
$description =~ /pliceosomal/ );
$gene->biotype("snoRNA") if($description =~ /ucleolar/);
$gene->biotype("rRNA") if($description =~ /ibosomal/);
$gene->confidence("NOVEL");
$gene->description($description." [Source: RFAM ".$self->analysis->db_version."]");
print STDERR "Rfam_id $domain ".$description."\n"if $verbose;;
$gene->analysis($self->analysis);
$gene->add_Transcript($transcript);
$transcript->biotype($gene->biotype);
my $xref = Bio::EnsEMBL::DBEntry->new
(
-primary_id => substr($daf->hseqname,0,7),
-display_id => $name,
-dbname => 'RFAM',
-release => 1,
-version => 0,
-description => $description." [Source: RFAM ".$self->analysis->db_version."]",
);
my $seq = Bio::PrimarySeq->new(
-display_id => $domain,
-seq => $gene->seq,
);
my $RNAfold = Bio::EnsEMBL::Analysis::Runnable::RNAFold->new
(
-analysis => $self->analysis,
-sequence => $seq,
-structure => $str,
);
$RNAfold->run;
return unless $RNAfold->structure;
my @final_str = @{$RNAfold->encoded_str};
foreach my $str (@final_str){
my $attribute = Bio::EnsEMBL::Attribute->new
(-CODE => 'ncRNA',
-NAME => 'Structure',
-DESCRIPTION => 'RNA secondary structure line',
-VALUE => $str
);
push @attributes,$attribute;
}
$gene_hash{'attrib'} =\@ attributes;
$gene_hash{'gene'} = $gene;
$gene_hash{'xref'} = $xref;
print "Chosen hit and structure constraint : $start $end $strand $description $name\n$str\n";
return\% gene_hash; } |
sub new
{ my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($queries,$thresholds) = rearrange(['QUERIES'], @args);
$self->queries($queries);
$self->get_thresholds;
return $self; } |
sub output
{ my ($self, $output) = @_;
if(!$self->{'output'}){
$self->{'output'} = [];
}
if($output){
throw("Must pass Runnable:output an hashref not a ".$output)
unless(ref($output) eq 'HASH');
push(@{$self->{'output'}}, $output);
}
return $self->{'output'};
}
1; } |
sub parse_results
{ my ($self, $daf) = @_;
my @dafs;
my $align;
my $line =0;
my $domain = substr($daf->hseqname,0,7);
my %thresholds = %{$self->thresholds};
my $threshold = $thresholds{$domain}{'thr'};
print STDERR "Domain $domain has threshold $threshold\n" if $verbose;
my $results = $self->resultsfile;
my $ff = $self->feature_factory;
if(!-e $results){
$self->throw("Can't parse a results file that dosen't exist ".$results.
" Infernal:parse_results");
}
my @results;
my ($hit,$start,$end,$score,$strand,$str) =0;
open(INFERNAL, $results) or $self->throw("FAILED to open ".$results.
" INFERNAL:parse_results");
LINE: while(<INFERNAL>){
chomp;
if ($_ =~ /^hit/){
if ($score && $threshold && $score > $threshold && $align->{'name'}){
$str = $self->parse_structure($align);
print STDERR "positve result at ".$daf->seq_region_name.":".$daf->seq_region_start."-".$daf->seq_region_end." strand ".
$daf->strand."\n" if $verbose;
push @results, {str => $str,
score => $score,
start => $start,
end => $end,
strand=> $strand};
}
$align = {};
$align->{'name'} = $domain ;
$line = -1;
}
if ($_ =~ /^CPU/){
$line = -1;
}
print STDERR "$line\t$_\n"if $verbose;
if ($score && $score >= $threshold && $line >= 0){
$line++;
if ($_){
push @{$align->{'str'}} , split//,substr($_,11,60) if ($line == 1);
push @{$align->{'target'}}, split//,substr($_,11,-9) if ($line == 2);
push @{$align->{'match'}} , split//,substr($_,11,60) if ($line == 3);
push @{$align->{'query'}} , split//,substr($_,11,-9) if ($line == 4);
}
else{
$line =0;
}
}
if ($_ =~ /^hit\s+(\d+)\s+:\s+(\d+)\s+(\d+)\s+(\d+).(\d+)\s+bits/){
$hit = $1;
$start = $2;
$end = $3;
$score = $4+$5/100; print STDERR "hit - $hit\nscore - $score\n" if $verbose;;
if ($score >= $threshold){
if ($end < $start){
$strand = -1;
my $temp = $end;
$end = $start;
$start = $temp;
}
else {
$strand =1;
}
$line = 0;
}
else{
$line = -1;
}
}
}
if ($align->{'name'} && $score > $threshold){
print STDERR "positve result at ".$daf->seq_region_name.":".$daf->seq_region_start."-".$daf->seq_region_end." strand ".
$daf->strand."\n" if $verbose;
$str = $self->parse_structure($align);
push @results, {str => $str,
score => $score,
start => $start,
end => $end,
strand=> $strand};
}
close(INFERNAL) or $self->throw("FAILED to close ".$results.
" INFERNAL:parse_results");
@results = sort {$b->{'score'} <=> $a->{'score'}} @results;
if (@results){
return\@ results;
}
else {
return;
} } |
sub parse_structure
{ my ($self,$align)=@_;
my @all_matches;
my @stack;
my @big_gaps;
my $matchstring;
my @matches;
my $big_gap=0;
my @attributes;
for (my $i=0 ; $i< scalar(@{$align->{'str'}}); $i++){
if ($align->{'str'}[$i] eq '(' or
$align->{'str'}[$i] eq '<' or
$align->{'str'}[$i] eq '[' or
$align->{'str'}[$i] eq '{'){
push @stack,$i;
}
if ($align->{'str'}[$i] eq ')' or
$align->{'str'}[$i] eq '}' or
$align->{'str'}[$i] eq ']' or
$align->{'str'}[$i] eq '>'){
$all_matches[$i] = pop @stack;
}
}
@stack = [];
for (my $i = scalar(@{$align->{'str'}}-1); $i >=0 ; $i--){
if ($align->{'str'}[$i] eq ')' or
$align->{'str'}[$i] eq '}' or
$align->{'str'}[$i] eq ']' or
$align->{'str'}[$i] eq '>'){
push @stack,$i;
}
if ($align->{'str'}[$i] eq '(' or
$align->{'str'}[$i] eq '<' or
$align->{'str'}[$i] eq '[' or
$align->{'str'}[$i] eq '{'){
$all_matches[$i] = pop @stack;
}
}
for (my $i=0 ; $i< scalar(@{$align->{'str'}}); $i++){
if ($align->{'query'}[$i] eq '*'){
my $string;
for (my $j=$i+1 ; $j < scalar(@{$align->{'str'}}) ; $j++){
last if ($align->{'query'}[$j] eq '*');
$string .= $align->{'query'}[$j];
}
if ($string =~ /\[(.+)\]/){
my $gap = $1;
$gap =~ s/\D//g;
$big_gaps[$i] = $gap;
}
}
if ($align->{'query'}[$i] eq '-'){
next;
}
if ($align->{'match'}[$i] eq ' ' or
$align->{'match'}[$i] eq ':'){
$matches[$i] = '.';
next;
}
if (defined $all_matches[$i]){
if ($align->{'query'}[$all_matches[$i]] eq '-'){
$matches[$i] = '.';
next;
}
$matches[$i] = $align->{'str'}[$i];
$matches[$all_matches[$i]] = $align->{'str'}[$all_matches[$i]];
}
else {
$matches[$i] = $align->{'str'}[$i];
}
}
for (my $i=0 ; $i< scalar(@{$align->{'str'}}); $i++){
if ($big_gaps[$i]){
for (my $j = 0 ; $j < $big_gaps[$i] ; $j++){
$matchstring .= ".";
}
$i=$i+5;
next;
}
if ($matches[$i]){
$matchstring.= $matches[$i];
}
}
$matchstring =~ s/[\<\[\{]/(/g;
$matchstring =~ s/[\>\]\}]/)/g;
$matchstring =~ s/[,:_-]/./g;
return $matchstring; } |
sub queries
{ my ($self, $queries) = @_;
if ($queries){
$self->{'_queries'} = $queries;
}
return $self->{'_queries'}; } |
sub run
{ my ($self) = @_;
my $queries = $self->queries;
my @attributes;
my $descriptions = $self->get_descriptions;
foreach my $query (@$queries){
$self->query($self->get_sequence($query));
$self->throw("Can't run ".$self." without a query sequence")
unless($self->query);
$self->checkdir();
my $filename = $self->write_seq_file();
$self->files_to_delete($filename);
$self->files_to_delete($self->resultsfile);
$self->run_analysis();
my $results = $self->parse_results($query);
next unless($results);
my $gene = $self->make_gene($results,$query,$descriptions);
$self->output($gene) if $gene;
}
$self->delete_files;
return 1; } |
sub run_analysis
{ my ($self) = @_;
my $db = $self->analysis->db_file."/".$self->query->display_id.".cm";
my $command = $self->program;
my %thresholds = %{$self->thresholds};
my $domain = $self->query->display_id;
my $filename = $self->queryfile;
my $results_file = $self->create_filename("Infernal.$domain.out");
$self->files_to_delete($results_file);
$self->resultsfile($results_file);
$command .= " -W ".$thresholds{$domain}{'win'};
if ($thresholds{$domain}{'mode'} =~ /local/) {
$command .= " --local";
}
$command .= " $db $filename > $results_file";
print STDOUT "Running infernal ".$command."\n" if $verbose;
open(my $fh, "$command |") ||
$self->throw("Error opening Infernal cmd <$command>." .
" Returned error $? Infernal EXIT: '" .
($? >> 8) . "'," ." SIGNAL '" . ($? & 127) .
"', There was " . ($? & 128 ? 'a' : 'no') .
" core dump");
while (<$fh>) {
if (/FATAL:(.+)/) {
my $match = $1;
$self->throw("Infernal failed to run - $match\n ");
}
} } |
sub thresholds
{ my ($self, $thresholds) = @_;
if ($thresholds){
$self->{'_thresholds'} = $thresholds;
}
return $self->{'_thresholds'}; } |
General documentation