Raw content of Bio::EnsEMBL::Analysis::Runnable::miRNA
# Ensembl module for Bio::EnsEMBL::Analysis::Runnable::miRNA
#
# Copyright (c) 2004 Ensembl
#
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable:miRNA
=head1 SYNOPSIS
my $runnable = Bio::EnsEMBL::Analysis::Runnable::miRNA->new
(
-queries => \%families,
-analysis => $self->analysis,
);
$self->runnable($runnable);
=head1 DESCRIPTION
Takes blast alignments in the form of a hash ref of Bio::EnsEMBL::DnaDnaAlignFeatures
grouped by miRNA family.
Opens an EMBL format file of miRNA sequences used to create the blast database and
parses out the positions of the mature RNA for each dna align feature.
Determines if the mature sequence is wholly contained within the dna align feature.
Applies a percent identity filter to the blast alignments.
If the alignment meet these criteria it then runs RNAfold and parses the results.
If the results of RNAfold indicate that the sequence can form a hairpin structure with
a score < -20 it is classed as a miRNA.
Single exon gene objects are made and the DnaDnaAlignFeature is added as supporting evidence.
The predicted structure and coordinates of the mature sequence are added to the transcript as
transcript attributes
=head1 CONTACT
Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::Runnable::miRNA;
use strict;
use warnings;
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::DBEntry;
use Bio::EnsEMBL::Analysis::Runnable::RNAFold;
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
my $verbose = "yes";
=head2 new
Title : new
Usage : my $runnable = Bio::EnsEMBL::Analysis::Runnable::miRNA->new
: (
: -queries => \%families,
: -analysis => $self->analysis,
: );
Function : Instantiates new miRNA runnable
Returns : Bio::EnsEMBL::Analysis::Runnable::miRNA
Exceptions : none
Args : Hash ref of Bio::EnsEMBL::DnaDnaAlignFeature
=cut
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($queries,$thresholds) = rearrange(['QUERIES'], @args);
$self->queries($queries);
$self->throw("miRNA: dying because cannot find database".$self->analysis->db_file."\n")
unless (-e $self->analysis->db_file);
return $self;
}
=head2 new
Title : run
Usage : my $runnable->run;
Function : Run method.
Returns : None
Exceptions : Throws if no query sequences found
Args : None
=cut
sub run{
my ($self) = @_;
print STDERR "get miRNAs\n" if $verbose;
# fetch the coordinates of the mature miRNAs
$self->get_miRNAs;
my %queries = %{$self->queries};
$self->throw("Cannot find query sequences $@\n") unless %queries;
print STDERR "Run analysis\n" if $verbose;
FAM: foreach my $family (keys %queries){
DAF:foreach my $daf (@{$queries{$family}}){
# Does the alignment contain the mature sequence?
my ($align,$status) = $self->run_analysis($daf);
next DAF unless ($align && scalar @$align > 0);
# does the sequence fold into a hairpin ?
my $seq = Bio::PrimarySeq->new
(
-display_id => $daf->hseqname,
-seq => $daf->seq,
);
my $RNAfold = Bio::EnsEMBL::Analysis::Runnable::RNAFold->new
(
-analysis => $self->analysis,
-sequence => $seq
);
$RNAfold->run;
# get the final structure encoded by run length
my $structure = $RNAfold->encoded_str;
next DAF unless ($RNAfold->score < -20);
next DAF unless ($RNAfold->structure);
$self->display_stuff($daf,$RNAfold->structure,$align,$RNAfold->score) if $verbose;
# create the gene
$self->make_gene($daf,$structure,$align);
}
}
print STDERR "delete temp files\n" if $verbose;
$self->delete_files;
}
=head2 get_miRNAs
Title : get_miRNAs
Usage : $self->get_miRNAs;
Function : Opens and reads EMBL formatted file of miRNA sequences used to make
: the miRNA blast database
Returns : None
Exceptions : Throws if sequence file is not found / cannot be opened
Args : None
=cut
sub get_miRNAs{
my ($self)=@_;
my %miRNA;
my $seqIO = Bio::SeqIO->new(
-file => $self->analysis->db_file,
-format => "embl"
);
$self->throw("unable to open file ".$self->analysis->db_file."\n")
unless $seqIO;
while (my $seq = $seqIO->next_seq){
$miRNA{$seq->accession} = $seq;
}
$self->miRNAs(\%miRNA);
}
=head2 run_analysis
Title : run_analysis
Usage : my $align = $self->run_analysis($dna_align_feature);
Function : makes an alignment that represents the mature sequence of the miRNA
: tests to check that it contains the entire mature sequence and is of
: high enough percent identity
Returns : Bio::SimpleAlign object
Exceptions : Throws if sequence file is not found / cannot be opened
Args : Bio::EnsEMBL::DnaDnaAlignFeature
=cut
sub run_analysis{
my ($self,$daf)=@_;
my %miRNAs = %{$self->miRNAs};
my ($all_mature, $status);
my @mature_aligns;
if ($self->get_mature($daf)){
($all_mature,$status) = $self->get_mature($daf)
} else {
# ignore if you dont have a mature form identified.
$self->warning("No mature sequence identified for sequence ".$daf->hseqname." $@");
return 0;
}
# can have more than 1 mature sequence per hairpin
foreach my $mature(@$all_mature){
my $miRNA = $miRNAs{$daf->hseqname};
# length is 1 longer than end - start because it includes both the start and end base
my $miRNA_length = $mature->{'end'} - $mature->{'start'} +1;
# change the U to T so as not to confuse the BioPerl
my $seq = $miRNA->seq;
$seq =~ s/[uU]/T/g;
# make a fake slice to use as the hit sequence
my $slice = Bio::EnsEMBL::Slice->new
(
-seq_region_name => $daf->hseqname,
-start => 1,
-end => $miRNA->length,
-seq => $seq,
-strand => 1,
-coord_system => $daf->slice->coord_system,
);
$daf->hslice($slice);
# make an alignment of just the mature sequence
my $temp_align = $daf->restrict_between_positions($mature->{'start'},$mature->{'end'},"HSEQ");
unless ($temp_align)
{
# print "rejecting ".$daf->p_value." cos of not getting mature alignment\n";
next;
}
my $align = $temp_align->transfer($daf->feature_Slice);
# is the enitre mature sequence present in the alignment, no indels?
unless ( $align->alignment_length == $miRNA_length && $align->alignment_length == $align->length)
{
# print "rejecting ".$daf->dbID." coz of not getting aligned length\n";
next;
}
# is the mature alignment of high enough percent identity?
# also no gaps in the mature alignment
unless ( $align->get_SimpleAlign->overall_percentage_identity >= 90 )
{
# print "rejecting ".$daf->p_value." cos of aligned ID\n";
next;
}
push @mature_aligns,$align;
}
return \@mature_aligns,$status;
}
=head2 get_mature
Title : get_mature
Usage : my @mature = $self->get_mature($dna_align_feature);
Function : Returns the coordinates of the mature miRNA on the target sequence
Returns : Array containing 2 elements, the start and stop position (inclusive)
Exceptions : Throws if coordinates cannot be parsed or the miRNA cannot be found
Args : Bio::EnsEMBL::DnaDnaAlignFeature
=cut
sub get_mature{
my ($self,$query)=@_;
my %miRNAs = %{$self->miRNAs};
my $miRNA = $miRNAs{$query->hseqname};
my @mature;
my $status = "PUTATIVE";
unless ($miRNA){
print STDERR "Unable to locate miRNA fom embl file that corresponds to ".
$query->hseqname."$@ \n";
return 0;
}
my @feats = $miRNA->can('top_SeqFeatures') ? $miRNA->top_SeqFeatures : ();
foreach my $sf ( @feats ) {
my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf);
foreach my $fth ( @fth ) {
if( ! $fth->isa('Bio::SeqIO::FTHelper') ) {
$self->throw("Cannot process FTHelper... $fth $@\n");
}
my $location = $fth->loc;
$status = "KNOWN"
if ($fth->field->{"evidence"}[0] && $fth->field->{"evidence"}[0] eq "experimental");
if ($location =~ /(\d+)\.+(\d+)/){
push @mature,{ 'start' => $1,
'end' => $2
};
} else {
return 0;
}
}
}
return \@mature,$status;
}
=head2 make_gene
Title : make_gene
Usage : my $gene = $runnable->make_gene($dna_align_feature,$structure,$simple_alignment)
Function : Creates the non coding gene object from the parsed result file.
Returns : Hashref of Bio::EnsEMBL::Gene, Bio::EnsEMBL::Attribute and Bio::EnsEMBL::DBEntry
Exceptions : None
Args : dna_align_feature (Bio::EnsEMBL::DnaDnaAlignFeature)
: structure (String)
: alignment (Bio::SimpleAlign)
=cut
sub make_gene{
my ($self,$daf,$structure,$aligns) = @_;
my %miRNAs = %{$self->miRNAs};
my @mature;
my %gene_hash;
my $description = $miRNAs{$daf->hseqname}->display_id;
my @attributes;
# exons
my $slice = $daf->slice;
my $exon = Bio::EnsEMBL::Exon->new
(
-start => $daf->start,
-end => $daf->end,
-strand => $daf->strand,
-slice => $slice,
-phase => -1,
-end_phase => -1
);
# dont let it fall of the slice
return if ($exon->start < 1 or $exon->end > $slice->length);
$exon->add_supporting_features($daf);
# transcripts
my $transcript = Bio::EnsEMBL::Transcript->new;
$transcript->add_Exon($exon);
$transcript->start_Exon($exon);
$transcript->end_Exon($exon);
$transcript->biotype("miRNA");
# add the transcript attributes for the position of the mature miRNA as well as
# the predicted structure
foreach my $code(@$structure){
my $str_attrib = Bio::EnsEMBL::Attribute->new
(-CODE => 'ncRNA',
-NAME => 'Structure',
-DESCRIPTION => 'RNA secondary structure line',
-VALUE => $code
);
push @attributes,$str_attrib;
}
foreach my $align(@$aligns){
my $miRNA_attrib = Bio::EnsEMBL::Attribute->new
(-CODE => 'miRNA',
-NAME => 'Micro RNA',
-DESCRIPTION => 'Coordinates of the mature miRNA',
-VALUE => $align->start."-".$align->end,
);
push @attributes,$miRNA_attrib;
}
# gene
my $gene = Bio::EnsEMBL::Gene->new;
$gene->biotype('miRNA');
$gene->analysis($self->analysis);
$gene->add_Transcript($transcript);
$gene->status('NOVEL');
$gene_hash{'gene'} = $gene;
$gene_hash{'attrib'} = \@attributes;
$self->output(\%gene_hash);
}
sub display_stuff{
my ($self,$daf,$structure,$aligns,$score)=@_;
my $all_mature;
my $status;
my %miRNAs = %{$self->miRNAs};
my $description = $miRNAs{$daf->hseqname}->display_id;
($all_mature,$status) = $self->get_mature($daf);
print STDERR "DAF ".$daf->dbID." chr ".$daf->seq_region_name." ".$daf->seq_region_start." ".$daf->seq_region_end."\n";
foreach my $mature (@$all_mature){
print STDERR $daf->hseqname." $description miRNA at ".$mature->{'start'}." ".$mature->{'end'}."\n ";
}
print STDERR "target strand ".$daf->strand."\n";
print STDERR "Query start end ".$daf->hstart.":".$daf->hend.
" strand ".$daf->hstrand."\n";
print STDERR "Start ".$daf->start." end ".$daf->end.
" Hit end ".$daf->hstart." hit end ".$daf->hend."\n";
print STDERR "Score $score\n";
foreach my $align(@{$aligns}){
my $simple = $align->get_SimpleAlign;
foreach my $seq ( $simple->each_seq() ) {
print STDERR $seq->seq."\n";
}
print STDERR $simple->match_line."\n";
print STDERR "\n";
$simple = $daf->get_SimpleAlign;
foreach my $seq ( $simple->each_seq() ) {
print STDERR $seq->seq."\n";
}
print STDERR $simple->match_line."\n";
print STDERR "\n";
print STDERR "$structure\n";
for (my $i=1 ; $i< $align->start;$i++) {
print STDERR ".";
}
print STDERR substr($daf->seq,$align->start-1,$align->end-$align->start+1);
print STDERR "\nMirna position = ".$align->start." ". $align->end."\n";
}
}
##################################################################################
# Containers
=head2 queries
Title : queries
Usage : my %queries = %$runnable->queries
Function :
Returns : Hash reference
Exceptions : None
Args :
=cut
sub queries {
my ($self, $queries) = @_;
if ($queries){
$self->{'_queries'} = $queries;
}
return $self->{'_queries'};
}
=head2 miRNAs
Title : miRNAs
Usage : my %miRNAs = %{$self->miRNAs};
Function : Container for storing miRNA BioSeq objects
Returns : Hash reference to Bio::Seq object
Exceptions : None
Args : Hash reference to Bio::Seq object
=cut
sub miRNAs {
my ($self, $miRNAs) = @_;
if ($miRNAs){
$self->{'_miRNAs'} = $miRNAs;
}
return $self->{'_miRNAs'};
}
=head2 output
Arg [1] : Bio::EnsEMBL::Analysis::Runnable
Arg [2] : hashref of output
Function : overrides runnable output method to allow storing of hash refs
Exceptions: throws if not passed an hashref
Example :
=cut
sub output{
my ($self, $output) = @_;
if(!$self->{'output'}){
$self->{'output'} = [];
}
if($output){
throw("Must pass Runnable:output an hashref not a ".$output)
unless(ref($output) eq 'HASH');
push(@{$self->{'output'}}, $output);
}
return $self->{'output'};
}
1;