Raw content of Bio::EnsEMBL::Analysis::Runnable::MiniGenewise # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Analysis::Runnable::MiniGenewise =head1 SYNOPSIS my $obj = Bio::EnsEMBL::Analysis::Runnable::MiniGenewise->new( -genomic => $genseq, -features => $features) $obj->run my @newfeatures = $obj->output; =head1 DESCRIPTION =head1 CONTACT Describe contact details here =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::EnsEMBL::Analysis::Runnable::MiniGenewise; use vars qw(@ISA); use strict; use Bio::EnsEMBL::Analysis::Runnable::Genewise; use Bio::EnsEMBL::Analysis::Tools::MiniSeq; use Bio::EnsEMBL::Analysis::Tools::PairAlign; use Bio::EnsEMBL::FeaturePair; use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils qw(id coord_string); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils qw(create_Exon); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::EvidenceUtils qw(create_feature_from_gapped_pieces); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw(Transcript_info); use Bio::EnsEMBL::Analysis::Tools::Logger qw(logger_info); @ISA = qw(Bio::EnsEMBL::Analysis::Runnable ); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my( $protein, $features, $minimum_intron, $terminal_padding, $exon_padding, $max_iterate_dist, $genewise_options) = rearrange([qw(PROTEIN FEATURES MINIMUM_INTRON TERMINAL_PADDING EXON_PADDING MAX_SPLIT_ITERATE_DIST GENEWISE_OPTIONS)], @args); ####SETTING_DEFAULTS### $self->minimum_intron(1000); $self->exon_padding(200); $self->terminal_padding(20000); $self->max_split_iterate_distance(0); ####################### $self->protein_sequence($protein); $self->features($features); $self->minimum_intron($minimum_intron); $self->exon_padding($exon_padding); $self->terminal_padding($terminal_padding); $self->max_split_iterate_distance($max_iterate_dist); $self->genewise_options($genewise_options); throw("MiniGenewise needs a query sequence") unless($self->query); throw("MiniGenewise needs a peptide sequence") unless($self->protein_sequence); throw("MiniGenewise needs an array of features") unless($self->features && scalar(@{$self->features})); return $self; } #ACCESSOR METHODS =head2 get/set methods Arg [1] : Bio::EnsEMBL::Analysis::Runnable::MiniGenewise Arg [2] : various things depending on the accessor Function : to store the passed in variable Returntype: various Exceptions: methods will throw if the wrong type of variable is passed in Example : =cut sub genewise_options { my ($self,$arg) = @_; $self->{'_genewise_options'} = {} if(!$self->{'_genewise_options'}); if ($arg) { throw("Must pass genewise options a hash ref not a $arg") unless(ref($arg) eq "HASH"); $self->{'_genewise_options'} = $arg; } return $self->{'_genewise_options'}; } sub features { my ($self,$arg) = @_; $self->{features} = [] if(!$self->{features}); if($arg){ throw($arg." must be an arrayref of Bio::EnsEMBL::FeaturePair objects") unless(ref($arg) eq "ARRAY" && $arg->[0]->isa("Bio::EnsEMBL::FeaturePair")); $self->{features} = $arg; } return $self->{features}; } sub minimum_intron { my ($self,$arg) = @_; if (defined($arg)) { $self->{'_minimum_intron'} = $arg; } return $self->{'_minimum_intron'}; } sub exon_padding { my ($self,$arg) = @_; if (defined($arg)) { $self->{'_padding'} = $arg; } return $self->{'_padding'}; } sub terminal_padding { my ($self,$arg) = @_; if (defined($arg)) { $self->{'_terminal_padding'} = $arg; } return $self->{'_terminal_padding'}; } sub max_split_iterate_distance { my ($self,$arg) = @_; if (defined($arg)) { $self->{'_iterate'} = $arg; } return $self->{'_iterate'}; } sub protein_sequence { my( $self, $value ) = @_; if ($value) { throw("Input isn't a Bio::PrimarySeq but ".$value) unless($value->isa("Bio::PrimarySeqI")); $self->{'_protein_sequence'} = $value; } return $self->{'_protein_sequence'}; } sub miniseq{ my ($self, $value) = @_; if($value){ $self->{mini_seq} = $value; } return $self->{mini_seq}; } #FUNCTIONALITY sub run{ my ($self) = @_; my $must_try_again = 1; my $features = $self->features; my $gw_output; while($must_try_again){ $must_try_again = 0; $gw_output = $self->run_Genewise($features); if($gw_output eq "FAILED"){ return; } logger_info("Have ".@$gw_output." output from the genewise run"); if($self->max_split_iterate_distance){ my @bridge_features; foreach my $g(@$gw_output){ foreach my $e($g->get_all_Exons){ my @pieces = @{$self->miniseq->convert_SeqFeature($e)}; if (@pieces > 1) { @pieces = sort {$a->start <=> $b->start} @pieces; my $can_merge = 1; for(my $i=1; $i < @pieces; $i++) { my $dist = $pieces[$i]->start - $pieces[$i-1]->end - 1; if ($dist > $self->max_split_iterate_distance) { $can_merge = 0; last; } } if ($can_merge) { push @bridge_features, Bio::EnsEMBL::Feature ->new(-start => $pieces[0]->start, -end => $pieces[-1]->end); } } } } if(@bridge_features){ push(@$features, @bridge_features); $must_try_again = 1; warning($self->protein_sequence->id." has a predicted feature which splits ". " when mapped back to the genomic, trying again"); } } } my $output = []; foreach my $gene(@$gw_output){ my $new_gene = $self->convert_to_genomic($gene); warning("Failed to convert ".$gene." to genomic coordinates, skipping") unless($new_gene); push(@$output, $new_gene) if($new_gene); } $self->output($output); } sub run_Genewise{ my ($self, $features) = @_; if(!$features){ $features = $self->features; } # foreach my $f(@$features){ # print "MINIGENEWISE ".$f->start." ".$f->end." ".$f->strand." ".$f->hseqname."\n"; #} my $miniseq_object = $self->make_MiniSeq($features); my $miniseq = $miniseq_object->get_cDNA_sequence; my %params = %{$self->genewise_options} if($self->genewise_options); my $gw = new Bio::EnsEMBL::Analysis::Runnable::Genewise ( -query => $miniseq, -protein => $self->protein_sequence, -reverse => $self->is_reversed, -analysis => $self->analysis, %params ); #throw("Minigenewise Can't run without context in sequence") # if(!$miniseq =~ /[CATG]{3}/); eval{ $gw->run; }; if($@){ throw("Failed ".$gw." run $@"); # warning changed to throw() to prevent silent failing } my @output = @{$gw->output}; return \@output; } sub convert_to_genomic{ #my ($self, $gene) = @_; my ($self, $transcript) = @_; my $strand = 1; $strand = -1 if($self->is_reversed); my @converted_exons; my @all_supp_features; # need to convert all the exons and all the supporting features my @exons = @{$transcript->get_all_Exons}; my $nexon = scalar(@exons); TEXON:for(my $i=0; $i < $nexon; $i++) { my $exon = $exons[$i]; $exon->strand($strand); # need to convert whole exon back to genomic coordinates my @genomics = @{$self->miniseq->convert_SeqFeature($exon)}; if(!@genomics){ warning(id($exon)." ".coord_string($exon)." didn't produce any features ". "when converted to genomic coordinates, skipping"); return undef; } if (@genomics > 1) { # all hell will break loose as the sub alignments will probably # not map cheerfully and we may start introducing in frame stops ... # for now, ignore this feature. warning("Warning : feature converts into " . scalar(@genomics). " features; "); if ($i == $nexon - 1) { warning("Skipping last exon $i"); next TEXON; } elsif ($exon->phase == $exons[$i+1]->phase) { warning("Skipping compatible exon $i"); next TEXON; } else { warning("Cannot skip exon without adjusting phase; skipping gene"); return undef; } } else { my $genomic_exon = create_Exon($genomics[0]->start, $genomics[0]->end, $exon->phase, $exon->end_phase, $strand, $self->analysis, undef, undef, $self->query); # also need to convert each of the sub alignments back to genomic coordinates foreach my $sf (@{$exon->get_all_supporting_features}) { my @features; my @ungapped = $sf->ungapped_features; foreach my $aln (@ungapped) { my @alns = @{$self->miniseq->convert_PepFeaturePair($aln)}; if ($#alns > 0) { warning("Warning : sub_align feature converts into > 1 features " . scalar(@alns)); } my $align = create_feature_from_gapped_pieces ("Bio::EnsEMBL::DnaPepAlignFeature", \@alns, 100, undef, undef, $self->query, $self->protein_sequence->id, $self->analysis); push @features,$align; } push @all_supp_features,@features; my $gapped = create_feature_from_gapped_pieces ("Bio::EnsEMBL::DnaPepAlignFeature", \@features, 100, undef, undef, $self->query, $self->protein_sequence->id, $self->analysis); $genomic_exon->add_supporting_features($gapped); } push(@converted_exons,$genomic_exon); } } # make a new transcript from @converted_exons my $converted_transcript = new Bio::EnsEMBL::Transcript; my $converted_translation = new Bio::EnsEMBL::Translation; $converted_transcript->translation($converted_translation); if (scalar(@all_supp_features)) { my $daf = create_feature_from_gapped_pieces ("Bio::EnsEMBL::DnaPepAlignFeature", \@all_supp_features, 100, undef, undef, $self->query, $self->protein_sequence->id, $self->analysis); $converted_transcript->add_supporting_features($daf); } if ($#converted_exons < 0) { warning("Odd. No exons found in MiniGenewise running on ". $self->query->seq_region_name." ".$self->protein_sequence->id); return undef; } else { if ($converted_exons[0]->strand == -1) { @converted_exons = sort {$b->start <=> $a->start} @converted_exons; } else { @converted_exons = sort {$a->start <=> $b->start} @converted_exons; } $converted_translation->start_Exon($converted_exons[0]); $converted_translation->end_Exon ($converted_exons[$#converted_exons]); # phase is relative to the 5' end of the transcript (start translation) if ($converted_exons[0]->phase == 0) { $converted_translation->start(1); } elsif ($converted_exons[0]->phase == 1) { $converted_translation->start(3); } elsif ($converted_exons[0]->phase == 2) { $converted_translation->start(2); } $converted_translation->end ($converted_exons[$#converted_exons]->end - $converted_exons[$#converted_exons]->start + 1); foreach my $exon(@converted_exons){ $converted_transcript->add_Exon($exon); } } $converted_transcript->slice($self->query); return $converted_transcript; } =head2 make_MiniSeq Arg [1] : Bio::EnsEMBL::Analysis::Runnable::MiniGenewise Arg [2] : arrayref of Bio::EnsEMBL::FeaturePairs Function : create MiniSeq object from the given feature pairs Returntype: Bio::EnsEMBL::Analysis::Tools::MiniSeq Exceptions: Example : =cut sub make_MiniSeq { my ($self, $features) = @_; my $pairaln = new Bio::EnsEMBL::Analysis::Tools::PairAlign; my $ff = $self->feature_factory; my @genomic_features; my $prevend = 0; my $count = 0; my $mingap = $self->minimum_intron; my $seqname = $features->[0]->seqname; my @features = sort {$a->start <=> $b->start} @$features; FEAT:foreach my $f (@features) { my $start = $f->start - $self->exon_padding; my $end = $f->end + $self->exon_padding; if ($start < 1) { $start = 1; } if ($end > $self->query->length) { $end = $self->query->length; } my $gap = ($start - $prevend); # Extend the region is the gap between features is # below a certain size - otherwise start a new region if ($count > 0 && ($gap < $mingap)) { if ($end < $prevend) { $end = $prevend; } $genomic_features[$#genomic_features]->end($end); $prevend = $end; } else { my $newfeature = new Bio::EnsEMBL::Feature; $newfeature->seqname ($f->seqname); $newfeature->start ($start); $newfeature->end ($end); $newfeature->strand (1); $newfeature->slice($self->query); push(@genomic_features,$newfeature); $prevend = $end; } $count++; } # make a forward strand sequence, but tell genewise to run reversed if the # features are on the reverse strand - handled by _is_reversed @genomic_features = sort {$a->start <=> $b->start } @genomic_features; # pad the termini of the sequence with 20K (configurable?) of genomic # sequence - to catch small terminal exons that are missed by blast my $adjusted_start = $genomic_features[0]->start; $adjusted_start -= $self->terminal_padding; if($adjusted_start < 1 ) { $adjusted_start = 1; } $genomic_features[0]->start($adjusted_start); my $adjusted_end = $genomic_features[$#genomic_features]->end; $adjusted_end += $self->terminal_padding; if($adjusted_end > $self->query->length) { $adjusted_end = $self->query->length; } $genomic_features[$#genomic_features]->end($adjusted_end); my $current_coord = 1; foreach my $f (@genomic_features) { my $cdna_start = $current_coord; my $cdna_end = $current_coord + ($f->end - $f->start); my $fp = $ff->create_feature_pair($f->start, $f->end, $f->strand, undef, $cdna_start, $cdna_end, 1, $f->seqname.".cdna", undef, undef, $self->query->seq_region_name, $self->query, $self->analysis); $pairaln->addFeaturePair($fp); $current_coord = $cdna_end+1; } my $miniseq = new Bio::EnsEMBL::Analysis::Tools::MiniSeq(-id => 'test', -pairalign => $pairaln); $self->miniseq($miniseq); return $miniseq; } =head2 is_reversed Arg [1] : Bio::EnsEMBL::Analysis::Runnable::MiniSeq Function : Calculates if most of the features are on the forward or reverse strand Returntype: boolean Exceptions: warns if features are of mixed strand Example : =cut sub is_reversed { my ($self) = @_; if (!($self->{_reverse})) { my $strand = 0; my $fcount = 0; my $rcount = 0; foreach my $f (@{$self->features}) { if ($f->strand == 1){ $fcount++; } elsif ($f->strand == -1) { $rcount++; } } if ($fcount > $rcount) { $self->{_reverse} = 0; } else { $self->{_reverse} = 1; } if($fcount && $rcount){ warning("Forward and reverse strand features see to have been ". "passed to MiniGenewise together"); } } return $self->{_reverse}; } 1;