Raw content of Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils =head1 NAME Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils - utilities for transcript objects =head1 SYNOPSIS use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw(clone_Transcript); or use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils to get all methods =head1 DESCRIPTION All methods in this class should take a Bio::EnsEMBL::Transcript object as their first argument. The methods provided should carry out some standard functionality for said objects such as printing info, and cloning and checking phase consistency or splice sites etc =head1 CONTACT please send any questions to ensembl-dev@ebi.ac.uk =head1 METHODS the rest of the documention details the exported static class methods =cut package Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils; use strict; use warnings; use Exporter; use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning stack_trace_dump); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranslationUtils qw(print_Translation clone_Translation print_peptide); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils qw(print_Exon clone_Exon Exon_info exon_length_less_than_maximum Exon_info get_upstream_Intron get_downstream_Intron get_upstream_splice_sites get_downstream_splice_sites validate_Exon_coords); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::IntronUtils qw(intron_length_less_than_maximum get_splice_sites); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils qw(coord_string id empty_Object); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::EvidenceUtils qw (print_Evidence clone_Evidence); use Bio::EnsEMBL::Analysis::Tools::Utilities qw(write_seqfile); use Bio::EnsEMBL::Gene; use Bio::EnsEMBL::Transcript; use Bio::EnsEMBL::Translation; use Bio::EnsEMBL::Analysis::Tools::Logger qw(logger_info); use Bio::EnsEMBL::Analysis; use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Seg; use Bio::SeqIO; use vars qw (@ISA @EXPORT); @ISA = qw(Exporter); @EXPORT = qw( are_phases_consistent are_splice_sites_canonical are_strands_consistent attach_Slice_to_Transcript attach_Analysis_to_Transcript attach_Analysis_to_Transcript_no_support calculate_exon_phases clone_Transcript coding_coverage count_non_canonical_splice_sites dump_cDNA_file empty_Transcript evidence_coverage exon_lengths_all_less_than_maximum exonic_proportion get_downstream_Intron_from_Exon get_downstream_splice_sites get_evidence_ids get_upstream_splice_sites get_upstream_Intron_from_Exon has_no_unwanted_evidence intron_lengths_all_less_than_maximum is_not_folded is_spliced list_evidence low_complexity_less_than_maximum replace_stops_with_introns remove_initial_or_terminal_short_exons get_evidence_ids dump_cDNA_file convert_to_genes evidence_coverage get_upstream_Intron_from_Exon get_downstream_Intron_from_Exon get_upstream_splice_sites get_downstream_splice_sites empty_Transcript fully_load_Transcript all_exons_are_valid print_Transcript print_Transcript_and_Exons print_Transcript_evidence split_Transcript tidy_split_transcripts trim_cds_to_whole_codons Transcript_info evidence_coverage_greater_than_minimum count_real_introns identical_Transcripts set_start_codon set_stop_codon convert_translateable_exons_to_exon_extended_objects is_Transcript_sane ); =head2 convert_translateable_exons_to_exon_extended_objects Arg [0] : Bio::EnsEMBL::Transcript Function : converts all translateable Exons of a Bio::EnsEMBL::Transcript object into ExonExtended objects to access addtional attributes, like previouus exon etc Returntype: Arrayref of Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonExtended objects Example : =cut sub convert_translateable_exons_to_exon_extended_objects { my ( $transcript ) = @_ ; $transcript->sort; my $exons = $transcript->get_all_translateable_Exons; my @conv_exons ; for (my $i=0 ; $i < scalar ( @$exons) ; $i++) { my $pte = ${$exons}[$i] ; bless $pte,"Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonExtended" ; if ( ($i+1) <scalar(@$exons)) { $pte->next_exon(${$exons}[$i+1]) ; } if ( ($i-1)>=0) { $pte->prev_exon(${$exons}[$i-1]) ; } $pte->transcript($transcript) ; push @conv_exons , $pte ; } return \@conv_exons ; } =head2 print_Transcript Arg [1] : Bio::EnsEMBL::Transcript or Aref of Bio::EnsEMBL::Transcript-objects Arg [2] : string, this should be a string or spaces or tabs to indent the printed string Function : print information about the transcript and its children objects, using indent to make the format readable Returntype: n/a Exceptions: none Example : print_Transcript($transcript); =cut sub print_Transcript{ my ($tref, $indent) = @_; $indent = '' if(!$indent); my @transcripts ; if (ref($tref)=~m/ARRAY/){ @transcripts = @$tref; }else{ push @transcripts, $tref; } for my $transcript ( @transcripts ) { print Transcript_info($transcript, $indent)."\n"; my $translation_indent = $indent."\t"; print_Translation($transcript, $translation_indent) ; foreach my $exon(@{$transcript->get_all_Exons}){ my $exon_indent = $translation_indent."\t"; print_Exon($exon, $exon_indent); } } } =head2 print_Transcript_and_Exons Arg [1] : Bio::EnsEMBL::Transcript or Aref of Bio::EnsEMBL::Transcript-objects Arg [2] : string, this should be a string or spaces or tabs to indent the printed string Function : print information about the transcript and its Exons using indent to make the format readable Returntype: n/a Exceptions: none Examples : print_Transcript_and_Exons(\@transcript) print_Transcript_and_Exons($transcript) =cut sub print_Transcript_and_Exons{ my ($tref, $indent) = @_; $indent = '' if(!$indent); my @tr = (ref($tref)=~m/ARRAY/) ? @$tref : $tref ; for my $transcript ( @tr ) { print "\n" . Transcript_info($transcript, $indent)."\n"; #map { print Exon_info($_, $indent."\t")."\n" } @{$transcript->get_all_Exons} ; my $count = 1; foreach my $exon(@{$transcript->get_all_Exons}){ print $indent."\t".$count." ".Exon_info($exon)."\n"; $count++; } } } =head2 Transcript_info Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : string, indent Function : return string of info about the transcript Returntype: Exceptions: none Example : print_just_Transcript($transcript); =cut sub Transcript_info{ my ($transcript, $indent) = @_; $indent = '' if(!$indent); my $coord_string = coord_string($transcript); my $id = id($transcript); return $indent."TRANSCRIPT: ".$id." ".$coord_string; } =head2 print_Transcript_evidence Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : string, an indent Function : print the transcripts supporting evidence Returntype: n/a Exceptions: none Example : print_Transcript_evidence($transcript); =cut sub print_Transcript_evidence{ my ($transcript, $indent) = @_; $indent = '' if(!$indent); print $indent."TRANSCRIPT EVIDENCE:\n"; foreach my $evidence(@{$transcript->get_all_supporting_features}){ my $evidence_indent = $indent."\t"; print_Evidence($evidence, $evidence_indent); } } =head2 clone_Transcript Arg [1] : Bio::EnsEMBL::Transcript Function : produce a new copy of the transcript object passed in so it can be altered without impact on the original objects the only bit it doesnt keep is the adaptor so the cloned object can be stored Returntype: Bio::EnsEMBL::Transcript Exceptions: none Example : my $newtranscript = clone_Transcript($transcript); =cut sub clone_Transcript{ my ($transcript, $clone_xrefs) = @_; $clone_xrefs = 1 if(!defined($clone_xrefs)); my $newtranscript = new Bio::EnsEMBL::Transcript(); foreach my $exon(@{$transcript->get_all_Exons}){ my $newexon = clone_Exon($exon); $newtranscript->add_Exon($newexon); } foreach my $sf(@{$transcript->get_all_supporting_features}){ my $newsf = clone_Evidence($sf); $newtranscript->add_supporting_features($newsf); } my $newtranslation; if($transcript->translation){ $newtranslation = clone_Translation($transcript, $newtranscript, $clone_xrefs); } $newtranscript->translation($newtranslation); my $attribs = $transcript->get_all_Attributes(); $newtranscript->add_Attributes(@$attribs); $newtranscript->slice($transcript->slice); $newtranscript->biotype($transcript->biotype); $newtranscript->dbID($transcript->dbID); $newtranscript->stable_id($transcript->stable_id); $newtranscript->version($transcript->version); $newtranscript->analysis($transcript->analysis); if ($clone_xrefs){ foreach my $DBEntry (@{$transcript->get_all_DBEntries}){ $newtranscript->add_DBEntry($DBEntry); } $newtranscript->display_xref($transcript->display_xref); } return $newtranscript; } =head2 are_strands_consistent Arg [1] : Bio::EnsEMBL::Transcript Function : checks if strand is consistent between transcript and first exon and in multiexon genes between all exons Returntype: boolean, 1 if pass 0 if fail (ie strands inconsistent) Exceptions: none Example : throw("Strands not consistent") if(!are_strands_consistent($transcript)); =cut sub are_strands_consistent{ my ($transcript) = @_; my $exons = $transcript->get_all_Exons; if($exons->[0]->strand != $transcript->strand){ warn("Strands are inconsistent between the ". "first exon and the transcript for ". id($transcript)); return 0; } if(@$exons >= 2){ for(my $i = 1;$i < @$exons;$i++){ if($exons->[$i]->strand != $exons->[$i-1]->strand){ warn("Strands are inconsistent between ". "exon $i exon and exon ".($i-1)." for ". id($transcript)); return 0; } } } return 1; } =head2 exon_lengths_all_less_than_maximum Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : int, max length Function : checks if any of the exons of given transcript are longer than specified max length Returntype: boolean, 1 for pass, 0 for fail (ie exon beyond max length) Exceptions: none Example : =cut sub exon_lengths_all_less_than_maximum{ my ($transcript, $max_length) = @_; $transcript->sort; foreach my $exon(@{$transcript->get_all_Exons}){ if(!exon_length_less_than_maximum($exon, $max_length)){ warn("Transcript ".id($transcript)." has ". "exon longer than ".$max_length); return 0; } } return 1; } =head2 intron_lengths_all_less_than_maximum Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : int, max length Function : checks if any of the introns of given transcript are longer than specified max length Returntype: boolean, 1 for pass, 0 for fail (ie intron beyond max length) Exceptions: none Example : =cut sub intron_lengths_all_less_than_maximum{ my ($transcript, $max_length) = @_; $transcript->sort; foreach my $intron(@{$transcript->get_all_Introns}){ if(!intron_length_less_than_maximum($intron, $max_length)){ #warning("Transcript ".id($transcript)." has ". # "intron ".$intron->length." longer than ".$max_length); return 0; } } if(@{$transcript->get_all_Introns} == 0){ my $warn = "intron_lengths_all_less_than_maximum is an ". "inappropriate test for a single exon gene"; logger_info($warn); } return 1; } =head2 are_phases_consistent Arg [1] : Bio::EnsEMBL::Transcript Function : to check that end phase of one exon is always the same as start phase of the next. The only acceptable exception to this is an end phase of -1 and a start phase of 0/1/2 or vice versa as UTR exons always have end phase of -1 but the whole of the next exon may be coding and to give it a start phase of -1 is considered misleading Returntype: boolean, 1 for pass 0 for fail (ie phase inconsistency) Exceptions: none Example : =cut sub are_phases_consistent{ my ($transcript) = @_; my $exons = $transcript->get_all_Exons; if (not $transcript->translation) { # all phases should be -1 foreach my $e (@$exons) { if ($e->phase != -1 or $e->end_phase != -1) { warn("Non-coding transcript does not have -1 phases"); # return 0; } } } else { my $tr = $transcript->translation; for(my $i=1;$i < @$exons;$i++){ my $prev_end_phase = $exons->[$i-1]->end_phase; my $this_phase = $exons->[$i]->phase; if($prev_end_phase != $this_phase) { # exception: allow a -1<->0 transition if exon is # translation start exon if ($prev_end_phase == -1 and $this_phase == 0 and $tr->start_Exon == $exons->[$i]) { # okay next; } elsif ($prev_end_phase == 0 and $this_phase == -1 and $tr->end_Exon == $exons->[$i-1]) { # okay next; } else { warn("Coding transcript has inconsistent phases"); return 0; } } } } return 1; } =head2 calculate_exon_phases Arg [1] : Bio::EnsEMBL::Transcript Function : Given a transcript, calculates and sets exon phases according to translation and given start phase =cut sub calculate_exon_phases { my ($transcript, $start_phase) = @_; $transcript->sort; foreach my $e (@{$transcript->get_all_Exons}) { $e->phase(-1); $e->end_phase(-1); } if ($transcript->translation) { my $tr = $transcript->translation; my @exons = @{$transcript->get_all_Exons}; while($exons[0] != $tr->start_Exon) { shift @exons; } while($exons[-1] != $tr->end_Exon) { pop @exons; } # set phase of for first coding exon my $cds_len = $exons[0]->length; if ($tr->start == 1) { $exons[0]->phase($start_phase); $cds_len += $start_phase; } else { $cds_len -= ($tr->start - 1); } $exons[0]->end_phase($cds_len % 3); # set phase for internal coding exons for(my $i=1; $i < @exons; $i++) { $exons[$i]->phase($exons[$i-1]->end_phase); $exons[$i]->end_phase(($exons[$i]->length + $exons[$i]->phase) % 3); } # set phase for last coding exon if ($exons[-1]->length > $tr->end) { $exons[-1]->end_phase(-1); } } } =head2 is_not_folded Arg [1] : Bio::EnsEMBL::Transcript Function : check if any exons start before the previous exon finished Returntype:boolean, 1 for pass, 0 for fail Exceptions: Example : =cut sub is_not_folded{ my ($transcript) = @_; $transcript->sort; my $exons = $transcript->get_all_Exons; if(@$exons == 1){ my $warn = "is_not_folded ". "is an inappropriate test for a single ". "exon gene"; logger_info($warn); return 1; } for(my $i = 1;$i < @$exons;$i++){ $exons->[$i]->stable_id(''); if($exons->[$i]->strand == 1){ if($exons->[$i]->start < $exons->[$i-1]->end){ warning(id($transcript)." is folded"); warn($i." ".id($exons->[$i])." has a start which ". "is less than ".($i-1)." ".id($exons->[$i-1]). " end"); return 0; } }else{ if($exons->[$i]->end > $exons->[$i-1]->start){ warning(id($transcript)." is folded"); warn($i." ".id($exons->[$i])." has a end which ". "is greater than ".($i-1)." ".id($exons->[$i-1]). " start"); return 0; } } } return 1; } =head2 low_complexity_less_than_maximum Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : int, maximum low complexity Function : calculate how much low complexity a transcripts translation has and check if it is above the specificed threshold Returntype: boolean, 1 for pass 0 for fail Exceptions: none Example : =cut sub low_complexity_less_than_maximum{ my ($transcript, $complexity_threshold) = @_; my $peptide = $transcript->translate; my $seg = Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Seg->new ( -query => $peptide, -analysis => Bio::EnsEMBL::Analysis->new ( -logic_name => 'seg', -program_file => 'seg', ) ); $seg->run; my $low_complexity = $seg->get_low_complexity_length; logger_info(id($transcript)." has ".$low_complexity. " low complexity sequence"); #print_peptide($transcript); #print id($transcript)." has ".$low_complexity." low complexity sequence compared to". # " ".$complexity_threshold."\n"; if($low_complexity >= $complexity_threshold){ warn(id($transcript)."'s low ". "complexity (".$low_complexity.") is above ". "the threshold ".$complexity_threshold. "\n"); return 0; } return 1; } =head2 has_no_unwanted_evidence Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : hashref, a hash of ids which are unwanted Function : To ensure the given transcript is not supported by any of the unwanted evidence Returntype: boolean, 1 for no unwanted evidence, 0 for unwanted evidence Exceptions: Example : =cut sub has_no_unwanted_evidence{ my ($transcript, $ids) = @_; $ids = {} if(!$ids); $ids->{'NG_'} = 1; my $evidence = get_evidence_ids($transcript); foreach my $evi(keys(%$evidence)){ foreach my $unwanted (keys(%$ids)) { if($evi =~ /$unwanted/){ warn(id($transcript)." has ".$evi. " unwanted evidence"); return 0; } } } return 1; } =head2 get_evidence_ids Arg [1] : Bio::EnsEMBL::Transcript Function : gets a hashref of all the evidence supporting the given transcript Returntype: hashref Exceptions: Example : =cut sub get_evidence_ids{ my ($transcript) = @_; my %hash; foreach my $sf(@{$transcript->get_all_supporting_features}){ $hash{$sf->hseqname} = 1; } foreach my $exon(@{$transcript->get_all_Exons}){ foreach my $sf(@{$exon->get_all_supporting_features}){ $hash{$sf->hseqname} = 1; } } return \%hash; } =head2 is_spliced Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : int, intron size Function : calculates is the transcript contains at least one "real" intron Returntype: boolean 1 if it does, 0 if not Exceptions: Example : =cut sub is_spliced{ my ($transcript, $intron_size) = @_; my $count = count_real_introns($transcript, $intron_size); warning(id($transcript)." has no introns ". "longer than $intron_size bps") if(!$count); return 0 if(!$count); return 1; } =head2 count_real_introns Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : int, intron size Function : counts the number of introns longer than the specified size, by default this is 9bp Returntype: int, the number of real introns Exceptions: Example : =cut sub count_real_introns{ my ($transcript, $intron_size) = @_; $intron_size = 9 if(!$intron_size); my $real_count = 0 ; my @introns = @{$transcript->get_all_Introns}; my $warn = id($transcript)." has no introns. count_real". "_introns makes no sense in those terms"; logger_info($warn) if(@introns == 0); foreach my $intron(@introns){ $real_count++ if($intron->length > $intron_size); } logger_info(id($transcript)." has ".$real_count." introns ". "longer than ".$intron_size." out of ". @introns." introns"); return $real_count; } =head2 are_splice_sites_canonical Arg [1] : Bio::EnsEMBL::Transcript Function : check if all splice sites are canonical GT/AG, GC/AG or AT/AC pairings Returntype: boolean, 1 for yes 0 for no Exceptions: Example : =cut sub are_splice_sites_canonical{ my ($transcript) = @_; my $introns = $transcript->get_all_Introns; my $non_canonical_count = count_non_canonical_splice_sites($transcript); if($non_canonical_count){ warn(id($transcript)." contains ". $non_canonical_count." non canonical ". "splice sites out of ".@$introns. " introns"); return 0; } if(@$introns == 0){ my $warn ="are_splice_sites_canonical is an ". "inappropriate test for a single exon gene"; logger_info($warn); } return 1; } =head2 count_non_canonical_splice_site Arg [1] : Bio::EnsEMBL::Transcript Function : count the number of non canonical splice sites Returntype: int, the number of non canonical splice sites Exceptions: Example : =cut sub count_non_canonical_splice_sites{ my ($transcript) = @_; my $slice = $transcript->slice; my $none = 0; foreach my $intron(@{$transcript->get_all_Introns}){ my ($upstream_site, $downstream_site) = get_splice_sites($intron); $none++ unless(($upstream_site eq 'GT' && $downstream_site eq 'AG') || ($upstream_site eq 'AT' && $downstream_site eq 'AC') || ($upstream_site eq 'GC' && $downstream_site eq 'AG') ) } return $none; } =head2 exonic_proportion Arg [1] : Bio::EnsEMBL::Transcript Function : calculates what proportion of the transcript extent is made up of exonic sequence Returntype: int Exceptions: Example : =cut sub exonic_proportion{ my ($transcript) = @_; my $genomic_extent = ($transcript->end - $transcript->start) + 1; my $cdna_length = $transcript->length; my $value = ($cdna_length/$genomic_extent) * 100; return $value; } =head2 coding_coverage Arg [1] : Bio::EnsEMBL::Transcript Function : calculate the ratio of the translateable seq of the transcript to the full length sequence Returntype: int Exceptions: Example : =cut sub coding_coverage{ my ($transcript) = @_; my $cdna_length = $transcript->length; my $coding_seq_length = length($transcript->translateable_seq); my $value = ($coding_seq_length/$cdna_length) * 100; return $value; } =head2 list_evidence Arg [1] : Bio::EnsEMBL::Transcript Function : produce a unique list of evidence to support a gene Returntype: listref Exceptions: Example : =cut sub list_evidence{ my ($transcript) = @_; my $hash = get_evidence_ids($transcript); my @ids = keys(%$hash); return \@ids; } =head2 split_Transcript Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : int, max intron length Arg [3] : boolean, 1 to carry out checks, 0 to not be default is 1 Function : to split transcripts on introns which are too long, discard any single exon transcripts left behind and return the remaining Returntype: arrayref of Bio::EnsEMBL::Transcript objects Exceptions: throws if not passed a Bio::EnsEMBL::Transcript object =cut #This method is designed to split transcripts on #long introns and hopefully maintain clean and sensible #transcripts sub split_Transcript{ my ($transcript, $max_intron_length, $intron_numbers) = @_; throw("TranscriptUtils:split_Transcript will not work on single exon transcripts") if(@{$transcript->get_all_Exons} == 0); #cloning the transcript to ensure if all else fails #the original is fine my %intron_numbers; if (defined $intron_numbers) { map { $intron_numbers{$_} = 1 } @$intron_numbers; } my ($cds_start, $cds_end); if ($transcript->translation) { my ($c) = $transcript->cdna2genomic($transcript->cdna_coding_start, $transcript->cdna_coding_start); $cds_start = $c->start; ($c) = $transcript->cdna2genomic($transcript->cdna_coding_end, $transcript->cdna_coding_end); $cds_end = $c->start; if ($cds_start > $cds_end) { ($cds_start, $cds_end) = ($cds_end, $cds_start); } } throw("cds_start ".$cds_start." or cds_end.".$cds_end." is not defined") if(!$cds_start || !$cds_end); my $cloned_transcript = clone_Transcript($transcript); #creating first new transcript my $curr_transcript = new Bio::EnsEMBL::Transcript; $curr_transcript->biotype($transcript->biotype); $curr_transcript->analysis($transcript->analysis); my @split_transcripts; my $first_exon = 1; my $intron_count = 0; #adding first new transcript to list push(@split_transcripts, $curr_transcript); my $last_exon; #checking each intron INTRON: foreach my $intron(@{$cloned_transcript->get_all_Introns}){ $intron_count++; my $prev_exon = $intron->prev_Exon; my $next_exon = $intron->next_Exon; #If considering first intron and prev exon #is the first exon then need to add it to the #transcript and check if it is the start of translation if($first_exon){ $curr_transcript->add_Exon($prev_exon); $first_exon = 0; } #If the intron length is less than the specified max #then you can add it to the current transcript and #check if it is the start of translation or end of #translation if((not defined $max_intron_length or intron_length_less_than_maximum($intron, $max_intron_length)) and not exists $intron_numbers{$intron_count}) { $curr_transcript->add_Exon($next_exon); next INTRON; }else{ #If the intron is longer than maximum length a new #transcript must be started. This means the last exon #needs to be set as the end of the previous translation #and a new transcript must be created my $t = Bio::EnsEMBL::Transcript->new; $t->add_Exon($next_exon); $curr_transcript = $t; $curr_transcript->biotype($transcript->biotype); $curr_transcript->analysis($transcript->analysis); $last_exon = $next_exon; push(@split_transcripts, $curr_transcript); } } # now add the translations, and trim back transcripts that # do not end in whole codons my @processed; foreach my $stran (@split_transcripts) { if ($transcript->translation) { if ($stran->end >= $cds_start and $stran->start <= $cds_end) { # at least part of this transcript is coding my $tr = Bio::EnsEMBL::Translation->new; $stran->translation($tr); my @exons = @{$stran->get_all_Exons}; foreach my $e (@exons) { if ($cds_start >= $e->start and $cds_start < $e->end) { # start of translation is in this exon if ($stran->strand > 0) { $tr->start_Exon($e); $tr->start( $cds_start - $e->start + 1); } else { $tr->end_Exon($e); $tr->end( $e->end - $cds_start + 1); } } if ($cds_end >= $e->start and $cds_end <= $e->end) { if ($stran->strand > 0) { $tr->end_Exon($e); $tr->end( $cds_end - $e->start + 1); } else { $tr->start_Exon($e); $tr->start( $e->end - $cds_end + 1); } } } if (not $tr->start_Exon) { $tr->start_Exon($exons[0]); $tr->start(1); } if (not $tr->end_Exon) { $tr->end_Exon($exons[-1]); $tr->end($exons[-1]->length); } } } if ($stran->translation) { #$stran = trim_cds_to_whole_codons($stran); } foreach my $sf (@{$transcript->get_all_supporting_features}) { my @ugs; foreach my $ug ($sf->ungapped_features) { if ($ug->start >= $stran->start and $ug->end <= $stran->end) { push @ugs, $ug; } } if (@ugs) { my $newf = $sf->new(-features => \@ugs) if(@ugs); $stran->add_supporting_features($newf); } } push @processed, $stran; } my $info = "split_Transcript Returning " . scalar(@processed) . " transcripts"; logger_info($info); return \@processed; } =head2 tidy_split_transcripts Arg [1] : arrayref Bio::EnsEMBL::Transcript Arg [2] : Bio::EnsEMBL::Transcript Function : Intended to be called after split_Transcripts Performs a series of standard gene-buildy checks on the given transcripts, Args : The original transcripts, and the results of split_Transcript Returns : arrayref of transcripts that pass checks =cut sub tidy_split_transcripts{ my ($orig_tran, $split_trans) = @_; my @keep; foreach my $stran (@{$split_trans}) { if(@{$stran->get_all_Exons} == 1){ my ($exon) = @{$stran->get_all_Exons}; my $warn = id($stran)." only has one exon\n". Exon_info($exon); warning($warn); next; } if($stran->translate->seq =~ /\*/){ my $warn = Transcript_info($stran). " does not translate\n"; warning($warn); next; } my $initial_peptide = $orig_tran->translate->seq; if(!$initial_peptide =~ /$stran->translate->seq/){ my $warn = Transcript_info($stran)." translation ". "does not appear to be a subset of the original ". "translation"; warning($warn); next; } #and if the strands or phases are not consistent if(!are_strands_consistent($stran)){ my $warn = Transcript_info($stran)." has ". "inconsistent strands"; warning($warn); next; } if(!are_phases_consistent($stran)){ my $warn = Transcript_info($stran)." has ". "inconsistent phases"; warning($warn); next; } push @keep, $stran; } return(\@keep); } =head2 trim_cds_to_whole_codons Arg [1] : Bio::EnsEMBL::Transcript Function : when start/end of translation is flush to start/end of transcript, and phase is non-zero, trim back to whole codon. If we have UTR adjacent to a non-zero phase codon, then something is wrong, so best to leave it alone Returntype: Bio::EnsEMBL::Transcript Exceptions: Example : =cut sub trim_cds_to_whole_codons { my ($transcript) = @_; my $remove5 = 0; my $remove3 = 0; if ($transcript->translation) { my @exons = @{$transcript->get_all_Exons}; my $tr = $transcript->translation; if ($tr->start_Exon == $exons[0] and $tr->start_Exon->phase > 0 and $tr->start == 1) { $remove5 = (3 - $tr->start_Exon->phase) % 3; } if ($tr->end_Exon == $exons[-1] and $tr->end_Exon->end_phase > 0 and $tr->end == $tr->end_Exon->length) { $remove3 = $tr->end_Exon->end_phase; } if ($remove5 or $remove3) { my $cloned_transcript = clone_Transcript($transcript); my @exons = @{$cloned_transcript->get_all_Exons}; my $cloned_tr = $cloned_transcript->translation; if ($remove5) { while(@exons and $exons[0]->length <= $remove5) { $remove5 -= $exons[0]->length; shift @exons; } $cloned_transcript->flush_Exons; map { $cloned_transcript->add_Exon($_) } @exons; if ($cloned_transcript->strand > 0) { $exons[0]->start($exons[0]->start + $remove5); } else { $exons[0]->end($exons[0]->end - $remove5); } $exons[0]->phase(0); $cloned_tr->start_Exon($exons[0]); $cloned_tr->start(1); } if ($remove3) { while(@exons and $exons[-1]->length <= $remove3) { $remove3 -= $exons[-1]->length; pop @exons; } if ($cloned_transcript->strand > 0) { $exons[-1]->end($exons[-1]->end - $remove3); } else { $exons[-1]->start($exons[-1]->start + $remove3); } $exons[-1]->end_phase(0); $cloned_tr->end_Exon($exons[-1]); $cloned_tr->end($exons[-1]->length); } return $cloned_transcript; } } return $transcript; } =head2 replace_stops_with_introns Arg [1] : Bio::EnsEMBL::Transcript Function : replace any inframe stops with introns Returntype: Bio::EnsEMBL::Transcript Exceptions: Example : Notes : Needs extension to deal with transcript_supporting_feats =cut sub replace_stops_with_introns{ my ($transcript) = @_; $transcript->sort; my $newtranscript = clone_Transcript($transcript); my @exons = @{$newtranscript->get_all_Exons}; my $pep = $newtranscript->translate->seq; while($pep =~ /\*/g) { my $position = pos($pep); my @coords = $newtranscript->pep2genomic($position, $position); foreach my $stop (@coords) { # locate the exon that this stop lies in my @new_exons; foreach my $exon (@exons) { if ($stop->start >= $exon->start and $stop->end <= $exon->end) { # this stop lies completely within an exon. We split the exon # into two my $exon_left = Bio::EnsEMBL::Exon-> new(-slice => $exon->slice, -start => $exon->start, -end => $stop->start - 1, -strand => $exon->strand, -phase => $exon->strand < 0 ? 0 : $exon->phase, -end_phase => $exon->strand < 0 ? $exon->end_phase :0); my $exon_right = Bio::EnsEMBL::Exon-> new(-slice => $exon->slice, -start => $stop->end + 1, -end => $exon->end, -strand => $exon->strand, -phase => $exon->strand < 0 ? $exon->phase : 0, -end_phase => $exon->strand < 0 ? 0 : $exon->end_phase); # need to split the supporting features between the two my @sfs = @{$exon->get_all_supporting_features}; my (@ug_left, @ug_right); foreach my $f (@sfs) { foreach my $ug ($f->ungapped_features) { my $orignial_analysis = $ug->analysis; if ($ug->start >= $exon_left->start and $ug->end <= $exon_left->end) { #completely within the left-side of the split push @ug_left, $ug; } elsif ($ug->start >= $exon_right->start and $ug->end <= $exon_right->end) { #completely within the right-side of the split push @ug_right, $ug; } else { # this ug must span the split my $fp_left = Bio::EnsEMBL::FeaturePair->new(); if ($ug->slice) { $fp_left->slice($ug->slice); } $fp_left->seqname ($ug->seqname); $fp_left->strand ($ug->strand); $fp_left->hseqname ($ug->hseqname); $fp_left->score ($ug->score); $fp_left->percent_id($ug->percent_id); $fp_left->start ($ug->start); $fp_left->end ($stop->start - 1); $fp_left->external_db_id($ug->external_db_id); $fp_left->hcoverage($ug->hcoverage); $fp_left->analysis($orignial_analysis) ; my $fp_right = Bio::EnsEMBL::FeaturePair->new(); if ($ug->slice) { $fp_right->slice($ug->slice); } $fp_right->seqname ($ug->seqname); $fp_right->strand ($ug->strand); $fp_right->hseqname ($ug->hseqname); $fp_right->score ($ug->score); $fp_right->percent_id($ug->percent_id); $fp_right->start ($stop->end + 1); $fp_right->end ($ug->end); $fp_right->external_db_id($ug->external_db_id); $fp_right->hcoverage($ug->hcoverage); $fp_right->analysis($orignial_analysis) ; if ($exon->strand > 0) { $fp_left->hstart($ug->hstart); $fp_left->hend($fp_left->hstart + ($fp_left->length / 3) - 1); $fp_right->hend ($ug->hend); $fp_right->hstart($ug->hend - ($fp_right->length / 3) + 1); } else { $fp_left->hend ($ug->hend); $fp_left->hstart($ug->hend - ($fp_left->length / 3) + 1); $fp_right->hstart($ug->hstart); $fp_right->hend($fp_right->hstart + ($fp_right->length / 3) - 1); } if ($fp_left->end >= $fp_left->start) { push @ug_left, $fp_left; } if ($fp_right->end >= $fp_right->start) { push @ug_right, $fp_right; } } } } sub add_dna_align_features_by_hitname_and_analysis { my ( $ug_ref, $exon ) = @_ ; my %group_features_by_hitname_and_analysis ; for my $ug ( @$ug_ref ) { push @{$group_features_by_hitname_and_analysis{$ug->analysis->logic_name}{$ug->hseqname }} , $ug ; } for my $logic_name ( keys %group_features_by_hitname_and_analysis ) { for my $hseqname ( keys %{$group_features_by_hitname_and_analysis{$logic_name}}) { my @features = @{$group_features_by_hitname_and_analysis{$logic_name}{$hseqname}}; my $f = Bio::EnsEMBL::DnaPepAlignFeature->new(-features => \@features); $exon->add_supporting_features($f); } } return $exon ; } $exon_left = add_dna_align_features_by_hitname_and_analysis(\@ug_left,$exon_left) ; $exon_right =add_dna_align_features_by_hitname_and_analysis(\@ug_right,$exon_right) ; if ($exon->strand < 0) { if ($exon_right->end >= $exon_right->start) { push @new_exons, $exon_right; } if ($exon_left->end >= $exon_left->start) { push @new_exons, $exon_left; } } else { if ($exon_left->end >= $exon_left->start) { push @new_exons, $exon_left; } if ($exon_right->end >= $exon_right->start) { push @new_exons, $exon_right; } } } else { # this exon is unaffected by this stop push @new_exons, $exon; } } @exons = @new_exons; } } $newtranscript->flush_Exons; foreach my $exon (@exons) { $newtranscript->add_Exon($exon); } my $translation = Bio::EnsEMBL::Translation->new(); $translation->start_Exon($exons[0]); $translation->end_Exon($exons[-1]); $translation->start(1); $translation->end($exons[-1]->end - $exons[-1]->start + 1); $newtranscript->translation($translation); my $old_translation = $transcript->translation ; foreach my $DBEntry (@{$old_translation->get_all_DBEntries}){ $translation->add_DBEntry($DBEntry); } return $newtranscript; } =head2 remove_initial_or_terminal_short_exons Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : int, minimum length Function : to trim transcripts of initial or terminal exons which are consider too short, by default less than 3bp Returntype: Bio::EnsEMBL::Transcript Exceptions: Example : =cut sub remove_initial_or_terminal_short_exons{ my ($transcript, $min_length) = @_; $transcript->sort; $min_length = 3 unless($min_length); throw("TranscriptUtils::remove_initial_or_terminal_short_exons will not work ". " if ".id($transcript)." has no translation") if(!$transcript->translation); my %translateable = %{_identify_translateable_exons ($transcript)}; my $cloned_transcript = clone_Transcript($transcript); my $start_exon = $cloned_transcript->translation-> start_Exon; my $start_exon_id = $start_exon->start.":". $start_exon->end.":".$start_exon->strand.":". $start_exon->phase; my $end_exon = $cloned_transcript->translation->end_Exon; #If the first exon is shorter than min length if($cloned_transcript->start_Exon->length < $min_length){ my @exons = @{$cloned_transcript->get_all_Exons}; FIVE_PRIME_END: while(scalar(@exons)){ my $exon = shift(@exons); #throw away each exon which is shorter than the min lenght #and recreate the transcript from the point where the #exons become longer again #This follows pretty much the same princples are translation #creation in the spliting of transcripts if($exon->length > $min_length){ my $unique_string = $exon->start.":".$exon->end.":". $exon->strand.":".$exon->phase; if($translateable{$unique_string} && $exon ne $start_exon){ $cloned_transcript->translation->start_Exon ($exon); $cloned_transcript->translation->start(1); $exon = _trim_translation_start($exon); $cloned_transcript->flush_Exons; $cloned_transcript->add_Exon($exon); foreach my $e(@exons){ $cloned_transcript->add_Exon($e); } last FIVE_PRIME_END; } }else { print id($exon)." is being trimmed - it is too short\n"; my $warn = id($exon)." is being trimmed ". "it is too short\n".Exon_info($exon); logger_info($warn); } } } #This is as above but it starts from the last exon and works #backwards if($cloned_transcript->end_Exon->length < $min_length){ my @exons = @{$cloned_transcript->get_all_Exons}; THREE_PRIME_END: while(@exons){ my $exon = pop @exons; if($exon->length > $min_length){ my $unique_string = $exon->start.":".$exon->end.":". $exon->strand.":".$exon->phase; if($translateable{$unique_string} && $exon != $cloned_transcript->translation->end_Exon){ $cloned_transcript->translation->end_Exon($exon); $exon = _trim_translation_end($exon); } $cloned_transcript->flush_Exons; $cloned_transcript->add_Exon($exon); my $translation_end = $exon->end - $exon->start + 1; $cloned_transcript->translation->end($translation_end); foreach my $e(@exons){ $cloned_transcript->add_Exon($e); } last THREE_PRIME_END; }else{ print id($exon)." is being trimmed - it is too short\n"; my $warn = id($exon)." is being trimmed ". "it is too short\n".Exon_info($exon); logger_info($warn); } } } return $cloned_transcript; } =head2 _identify_translateable_exons Arg [1] : Bio::EnsEMBL::Transcript Function : get a list of exons in the translation Returntype: hashref Exceptions: Example : =cut #This method assumes the exons are in the correct 5'-3' order if #this ever changes this will fail to work without adding a sort sub _identify_translateable_exons{ my ($transcript) = @_; my $start_exon = $transcript->translation->start_Exon; my $end_exon = $transcript->translation->end_Exon; my %translateable; EXON:foreach my $ex(@{$transcript->get_all_Exons}){ my $unique_string = $ex->start.":".$ex->end.":". $ex->strand.":".$ex->phase; if($ex ne $start_exon && !%translateable){ next EXON; } if($ex eq $end_exon){ $translateable{$unique_string} = $ex; last EXON; } $translateable{$unique_string} = $ex; } return \%translateable; } =head2 _trim_translation_start/end Arg [1] : Bio::EnsEMBL::Exon Function : trim untranslated bases from the start or end of an exon Returntype: Bio::EnsEMBL::Exon Exceptions: Example : =cut #Should these be here, they are not exported but they take exons not transcripts #so don't really follow convention, if they are ever made public they should #This adjusts the appropriate end of an exon to ensure the #translation has no untranslated basepairs preceeding it sub _trim_translation_start{ my ($exon) = @_; my $addition = 3 - $exon->phase; $addition = 0 if ($exon->phase == 0); my $tmp_start; if($exon->strand == 1){ $tmp_start = $exon->start + $addition; $exon->start($tmp_start); }else{ $tmp_start = $exon->end - $addition; $exon->end($tmp_start); } $exon->phase(0); return $exon; } #as above but for translation ends sub _trim_translation_end{ my ($exon) = @_; my $tmp_end; if($exon->strand == 1){ $tmp_end = $exon->end - $exon->end_phase; $exon->end($tmp_end); }else{ $tmp_end = $exon->start + $exon->end_phase; $exon->start($tmp_end); } return $exon; } =head2 dump_cDNA_file Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : string, filename Arg [3] : string, format (optional) Function : dump file using Bio::SeqIO Returntype: filename Exceptions: throws if fails to write or close file Example : =cut sub dump_cDNA_file{ my ($transcript, $filename, $format) = @_; $format = 'fasta' if(!$format); logger_info("You are going to dump the cdna of ". id($transcript)." into ".$filename." format ". $format); my $seq = $transcript->seq; $seq->display_id(id($transcript)); $filename = write_seqfile($seq, $filename, $format); return $filename; } =head2 convert_to_genes( $tref, $analysis ) Arg[0] : Arrayref of Bio::EnsEMBL::Transcript objects Arg[1] : Bio::EnsEMBL::Analysis object (opt) Name : convert_to_genes Function : converts all transcripts to Genes (biotype of transcript becomes biotype of gene) Returntype : Arrayref of Bio::EnsEMBL::Gene objects =cut sub convert_to_genes { my ($tref, $analysis, $biotype ) = @_; my @genes ; my @tr = (ref($tref)=~m/ARRAY/) ? @$tref : ($tref) ; for my $t (@tr) { throw("Problems with ".id($t)." undef coords") if(!$t->start || !$t->end); fully_load_Transcript($t); $analysis = $t->analysis unless $analysis ; if($biotype){ $t->biotype($biotype); } my $g = Bio::EnsEMBL::Gene->new() ; $g->biotype( $t->biotype ) ; $g->add_Transcript($t); $g->analysis($analysis) ; $g->dbID($t->dbID); push @genes, $g ; } for my $g(@genes) { throw("there are no transcripts for this gene\n") if scalar(@{$g->get_all_Transcripts}) == 0 ; for my $tr ( @{$g->get_all_Transcripts} ) { throw("there are no exons for this transcript \n") if scalar(@{$tr->get_all_Exons}) == 0 ; } } foreach my $gene(@genes){ throw("Problems with ".id($gene)." undef coords") if(!$gene->start || !$gene->end); } return \@genes ; } =head2 evidence_coverage Arg [1] : Bio::EnsEMBL::Transcript Arg [2] : Bio::Seq Function : Returntype: Exceptions: Example : =cut #note this method is designed to work with protein evidence, it should work with #dna evidence but I am not sure sub evidence_coverage{ my ($transcript, $evidence) = @_; throw("Can't work out evidence coverage for ".id($transcript). " without any evidence") if(!$evidence); my $matches = 0; my $pstart = 0; my $pend = 0; my $evidence_name = $evidence->id; my $plength; foreach my $exon(@{$transcript->get_all_Exons}) { $pstart = 0; $pend = 0; foreach my $f(@{$exon->get_all_supporting_features}){ if($evidence_name ne $f->hseqname){ warning("$evidence_name ne " . $f->hseqname . "\n"); } if((!$pstart) || $pstart > $f->hstart){ $pstart = $f->hstart; } if((!$pend) || $pend < $f->hend){ $pend= $f->hend; } } $matches += ($pend - $pstart + 1); } $plength = $evidence->length; if(!defined($plength) || $plength == 0){ warning("TranscriptUtils: no sensible length for ".$evidence_name." assuming 0% ". "coverage"); return 0; } my $coverage = int(100 * $matches/$plength); foreach my $sf(@{$transcript->get_all_supporting_features}){ $sf->hcoverage($coverage) if($sf->hseqname eq $evidence_name); } return $coverage; } sub get_upstream_Intron_from_Exon{ my ($transcript, $exon) = @_; return get_upstream_Intron($exon, $transcript); } sub get_upstream_splice_sites_from_Exon{ my ($transcript, $exon) = @_; return get_upstream_splice_sites($exon, $transcript); } sub get_downstream_Intron_from_Exon{ my ($transcript, $exon) = @_; return get_downstream_Intron($exon, $transcript); } sub get_downstream_splice_sites_from_Exon{ my ($transcript, $exon) = @_; return get_downstream_splice_sites($exon, $transcript); } sub attach_Slice_to_Transcript{ my ($transcript, $slice) = @_; $transcript->slice($slice); foreach my $sf(@{$transcript->get_all_supporting_features}){ $sf->slice($slice); } foreach my $exon(@{$transcript->get_all_Exons}){ $exon->slice($slice); foreach my $sf(@{$exon->get_all_supporting_features}){ $sf->slice($slice); } } } sub attach_Analysis_to_Transcript{ my ($transcript, $analysis) = @_; $transcript->analysis($analysis); foreach my $sf(@{$transcript->get_all_supporting_features}){ $sf->analysis($analysis); } foreach my $exon(@{$transcript->get_all_Exons}){ $exon->analysis($analysis); foreach my $sf(@{$exon->get_all_supporting_features}){ $sf->analysis($analysis); } } } sub attach_Analysis_to_Transcript_no_support{ my ($transcript, $analysis) = @_; $transcript->analysis($analysis); } sub fully_load_Transcript{ my ($transcript, $keep_xrefs) = @_; $keep_xrefs = 1 if(!defined($keep_xrefs)); if ($transcript->translation) { $transcript->translate; $transcript->translation->get_all_Attributes; $transcript->translation->get_all_DBEntries if($keep_xrefs); $transcript->translation->get_all_ProteinFeatures; } EXON:foreach my $e(@{$transcript->get_all_Exons}){ $e->analysis; $e->stable_id; $e->get_all_supporting_features; } $transcript->stable_id; $transcript->analysis; $transcript->get_all_Attributes; $transcript->get_all_DBEntries if($keep_xrefs); $transcript->get_all_supporting_features; return $transcript; } sub empty_Transcript{ my ($transcript, $remove_stable_id, $remove_xrefs) = @_; fully_load_Transcript($transcript); foreach my $sf(@{$transcript->get_all_supporting_features}){ empty_Object($sf); } empty_Object($transcript->translation, $remove_stable_id) if($transcript->translation);; EXON:foreach my $e(@{$transcript->get_all_Exons}){ SF:foreach my $sf(@{$e->get_all_supporting_features}){ empty_Object($sf, $remove_stable_id); } empty_Object($e, $remove_stable_id); } $transcript->display_xref(undef) if($remove_xrefs); empty_Object($transcript, $remove_stable_id); return $transcript; } sub all_exons_are_valid{ my ($transcript, $max_length, $allow_negative_start) = @_; $transcript->sort; foreach my $exon(@{$transcript->get_all_Exons}){ throw(Transcript_info($transcript)." seems to contain an undefined exon") if(!$exon); if(!exon_length_less_than_maximum($exon, $max_length)){ warn("Transcript ".id($transcript)." has ". "exon longer than ".$max_length); return 0; } if(!validate_Exon_coords($exon, $allow_negative_start)){ warn("Transcript ".id($transcript)." has ". "invalid exon coords ".Exon_info($exon)); return 0; } } return 1; } sub evidence_coverage_greater_than_minimum{ my ($transcript, $evidence, $min_coverage) = @_; warning ("There has been no evidence passed in for ".Transcript_info($transcript). "Assumung coverage is fine") if(!$evidence); return 1 if(!$evidence); my $coverage = evidence_coverage($transcript, $evidence); warning(Transcript_info($transcript)." based on ".$evidence->id." has too ". "low coverage ".$coverage." of the evidence") unless($coverage > $min_coverage); return 0 unless($coverage > $min_coverage); return 1; } =head2 identical_Transcripts Title : identical_Transcripts Usage : $identical = identical_Transcripts($transcript1, $transcript2); Function: compares 2 Transcripts. DOES NOT CHECK TO SEE WHETHER PHASES ARE IDENTICAL OR WHETHER EVIDENCE IS IDENTICAL. Transcripts are compared on the basis of (i) number of Exons, (ii) start and end coordinates for each Transcript, (iii) strand Example : Returns : 1 if identical, 0 if not indentical Args : Transcript, Transcript =cut sub identical_Transcripts { my ($transcript1, $transcript2) = @_; my @exons1 = @{$transcript1->get_all_Exons()}; my @exons2 = @{$transcript2->get_all_Exons()}; # compare no. of Exons if (scalar(@exons1) != scalar(@exons2)) { return 0; } # compare Exon coordinates and strand foreach ( my $num = 0; $num < scalar(@exons1); $num++ ) { if ($exons1[$num]->strand != $exons2[$num]->strand) { return 0; } if ($exons1[$num]->start != $exons2[$num]->start) { return 0; } if ($exons1[$num]->end != $exons2[$num]->end) { return 0; } } # you may want to check the evidence and phase at some stage by adding a wrapper method return 1; } sub set_start_codon{ my ($transcript) = @_; # check transcript has a translation if(!$transcript->translation || !$transcript->translation->start_Exon){ warning("Transcript has no translation, or no start exon - maybe a pseudogene?"); return $transcript; } my $cloned_transcript = clone_Transcript($transcript); # useful info in genomic coordinates my $strand = @{$cloned_transcript->get_all_Exons}[0]->strand; my $translation = $cloned_transcript->translation; my $start_exon = $translation->start_Exon; my $cdna_coding_start = $cloned_transcript->cdna_coding_start; my $cdna_seq = uc($cloned_transcript->spliced_seq); my @pepgencoords = $cloned_transcript->pep2genomic(1,1); if(scalar(@pepgencoords) > 2) { logger_info("peptide start does not map cleanly - not modifying transcript"); return $cloned_transcript; } my $pepgenstart = $pepgencoords[0]->start; my $pepgenend = $pepgencoords[$#pepgencoords]->end; unless($pepgencoords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate') && $pepgencoords[$#pepgencoords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) { logger_info("peptide coordinate(s) maps to gap - not modifying transcript"); return $cloned_transcript; } ############################################################ # first see whether the transcript already begins with ATG my $first_codon = substr($cdna_seq, $cdna_coding_start-1, 3); if ( uc($first_codon) eq 'ATG' ){ logger_info("transcript already starts with ATG - no need to modify"); return $cloned_transcript; } ############################################################ # now look at the previous codon ############################################################ # first the simplest cases if($cdna_coding_start>3){ # the previous codon is in the cdna $first_codon = substr($cdna_seq, $cdna_coding_start-4, 3); if ($first_codon ne 'ATG'){ logger_info("Upstream codon is not an ATG - not modifying transcript"); return $cloned_transcript; }else{ # save current coords, just in case we need to revert my $current_translation_start = $cloned_transcript->translation->start; my $current_start_exon = $cloned_transcript->translation->start_Exon; my $current_start_exon_start = $current_start_exon->start; my $current_start_exon_end = $current_start_exon->end; my $current_start_exon_phase = $current_start_exon->phase; my $newstartexon; my $current_newstartexon_endphase; my @coords = $cloned_transcript->cdna2genomic($cdna_coding_start-3,$cdna_coding_start-1,$strand); my $new_start; my $new_end; # check not mapping to gaps unless($coords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate') && $coords[$#coords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) { logger_info("new coordinate(s) maps to gap - not modifying transcript"); return $cloned_transcript; } if (scalar(@coords) > 2){ print STDERR "coordinate mapping not done cleanly - not modifying transcript!\n"; return $cloned_transcript; }elsif(scalar(@coords) == 2){ logger_info("new start codon split across intron"); logger_info("coord[0] = " . $coords[0]->start . " " . $coords[0]->end); logger_info("coord[1] = " . $coords[1]->start . " " . $coords[1]->end); if($strand == 1){ $new_start = $coords[0]->start; $new_end = $coords[$#coords]->end;} else{ $new_start = $coords[0]->end; $new_end = $coords[$#coords]->start; } # find exon my $newstartexon = get_previous_Exon($cloned_transcript, $start_exon); if (!$newstartexon) { logger_info("Failed finding new start exon - not modifying transcript"); return $cloned_transcript; } # save in case we need to revert my $current_newstartexon_endphase = $newstartexon->end_phase; my $newphase; if ($strand == 1) { $newphase = $newstartexon->end - $new_start + 1; } else { $newphase = $new_start - $newstartexon->start + 1; } $start_exon->phase($newphase); $newstartexon->end_phase($newphase); $translation->start_Exon($newstartexon); $translation->start($newstartexon->length-$newphase+1); # make sure it still translates, and revert if necessary eval{ $cloned_transcript->translate; }; if($@){ logger_info("problem with modified transcript - reverting coordinates"); $cloned_transcript->start_Exon($current_start_exon); $cloned_transcript->start_Exon->start($current_start_exon_start); $cloned_transcript->start_Exon->end($current_start_exon_end); $translation->start($current_translation_start); $cloned_transcript->start_Exon->phase($current_start_exon_phase); if ($newstartexon){ $newstartexon->end_phase($current_newstartexon_endphase); } } $cloned_transcript->recalculate_coordinates; return $cloned_transcript; }else{ logger_info("New start codon doesn't split across introns - but which exon is it in"); $new_start = $coords[0]->start; $new_end = $coords[0]->end; if (($strand == 1 && $new_end == $pepgenstart-1) || ($strand == -1 && $new_start == $pepgenend+1)) { $translation->start($translation->start-3); } else{ # find exon my $newstartexon = get_previous_Exon($cloned_transcript, $start_exon); if (!$newstartexon) { logger_info("Failed finding new start exon - how can this be?"); return $cloned_transcript; } $current_newstartexon_endphase = $newstartexon->end_phase; # make the boundary phases 0 - the ATG is the last codon of $newstartexon # as we know it doesn't cross the intron $start_exon->phase(0); $newstartexon->end_phase(0); # Reset translation start exon $translation->start_Exon($newstartexon); $translation->start($newstartexon->length-2); } # make sure it still translates, and revert if necessary eval{ $cloned_transcript->translate; }; if($@){ logger_info("problem with modified transcript - reverting coordinates"); $cloned_transcript->start_Exon($current_start_exon); $cloned_transcript->start_Exon->start($current_start_exon_start); $cloned_transcript->start_Exon->end($current_start_exon_end); $translation->start($current_translation_start); $cloned_transcript->start_Exon->phase($current_start_exon_phase); $newstartexon->end_phase($current_newstartexon_endphase); } $cloned_transcript->recalculate_coordinates; return $cloned_transcript; } } } ############################################################ # more complex cases: the previous codon falls off the cdna else{ my $codon_start; my $codon_end; if ($strand == 1) { $codon_start = $pepgenstart - 3; $codon_end = $pepgenstart - 1; } else { $codon_start = $pepgenend + 1; $codon_end = $pepgenend + 3; } my $seq_adaptor = $start_exon->slice->adaptor->db->get_SequenceAdaptor; my $codonseq = uc(${$seq_adaptor->fetch_by_Slice_start_end_strand ($start_exon->slice, $codon_start,$codon_end, $strand)}); if ($codonseq ne "ATG") { logger_info("upstream codon (faling off the slice) is not ATG - not modifying transcript"); return $cloned_transcript; } else{ # fun fun fun # save current coordinates in case we need to revert my $current_start_exon = $start_exon; my $current_start_exon_start = $start_exon->start; my $current_start_exon_end = $start_exon->end; my $current_start_exon_phase = $start_exon->phase; my $current_start_exon_endphase = $start_exon->end_phase; my $current_translation_start = $translation->start; my $current_translation_end = $translation->end; if($strand == 1){ $start_exon->start($codon_start) } else{ $start_exon->end($codon_end) } $start_exon->phase(0); if ($translation->end_Exon == $start_exon){ $translation->end($translation->end + (4-$translation->start)); } $translation->start(1); # make sure it still translates, and revert if necessary eval{ $cloned_transcript->translate; }; if($@){ logger_info("problem with modified transcript - reverting coordinates"); $cloned_transcript->start_Exon($current_start_exon); $cloned_transcript->start_Exon->start($current_start_exon_start); $cloned_transcript->start_Exon->end($current_start_exon_end); $translation->start($current_translation_start); $translation->end($current_translation_end); $cloned_transcript->start_Exon->phase($current_start_exon_phase); $cloned_transcript->start_Exon->end_phase($current_start_exon_endphase); } $cloned_transcript->recalculate_coordinates; return $cloned_transcript; } } } sub get_previous_Exon{ my ($transcript, $exon ) = @_; # this order the exons 5' to 3' my @exons = @{$transcript->get_all_Exons}; for (my $i=0; $i<=$#exons; $i++ ){ if ( $exons[$i]->start == $exon->start && $exons[$i]->end == $exon->end && $exons[$i]->strand == $exon->strand && $i > 0 ){ return $exons[$i-1]; } } return undef; } sub get_next_Exon{ my ($transcript, $exon ) = @_; # this order the exons 5' to 3' my @exons = @{$transcript->get_all_Exons}; for (my $i=0; $i<=$#exons; $i++ ){ if ( $exons[$i]->start == $exon->start && $exons[$i]->end == $exon->end && $exons[$i]->strand == $exon->strand && ($i+1) <= $#exons ){ return $exons[$i+1]; } } return undef; } sub set_stop_codon{ my ($transcript) = @_; my $verbose = 0; unless ( $transcript->translation ){ logger_info("transcript has no translation - cannot put the stops"); return $transcript; } my $cloned_transcript = clone_Transcript($transcript); my $end = $cloned_transcript->translation->end; my $end_exon = $cloned_transcript->translation->end_Exon; ############################################################ # first see whether the transcript already include the stop: taa/tag/tga # this gives you the sequence 5' to 3' my $bioseq = $end_exon->seq; my $last_codon; if ( $end > 2 ){ $last_codon = $bioseq->subseq( $end - 2, $end ); }else{ my $donor = 3 - $end; my $acceptor = $end; my $previous_exon = get_previous_Exon( $cloned_transcript, $end_exon ); if ($previous_exon ){ my $subseq_start = $previous_exon->end - $previous_exon->start + 1 - $donor + 1; my $subseq_end = $previous_exon->end - $previous_exon->start + 1; my $donor_seq = $previous_exon->seq->subseq ($subseq_start, $subseq_end); my $acceptor_seq = $end_exon->seq->subseq( 1, $end ); $last_codon = $donor_seq.$acceptor_seq; } } if ( uc($last_codon) eq 'TAA' || uc($last_codon) eq 'TAG' || uc($last_codon) eq 'TGA' ){ logger_info("transcript already has a stop at the end - no need ". "to modify"); return $cloned_transcript; } ############################################################ # now look at the next codon ############################################################ # first the simplest case if ( $end + 3 <= ($end_exon->end - $end_exon->start + 1) ){ my $next_codon = $bioseq->subseq( $end+1, $end+3 ); if ( uc($next_codon) eq 'TAA' || uc($next_codon) eq 'TAG' || uc($next_codon) eq 'TGA'){ logger_info("simple-case: next codon is a stop - ". "extending translation\n"); $cloned_transcript->translation->end( $end + 3 ); $cloned_transcript->recalculate_coordinates; return $cloned_transcript; }else{ logger_info("next codon is not a stop - not modifying translation"); return $cloned_transcript; } } ############################################################ # more complex cases we need to know if there is a next exon: my $next_exon = get_next_Exon( $cloned_transcript, $end_exon ); if ( $next_exon ){ ############################################################ # how many bases of the next codon sit in $end_exon? my $donor_bases_count = ( $end_exon->end - $end_exon->start + 1 ) - $end; my $acceptor_bases_count = 3 - $donor_bases_count; ############################################################ # get the next codon my $next_bioseq = $next_exon->seq; my $donor; if ( $donor_bases_count == 0 ){ $donor = ''; } else{ $donor = $bioseq->subseq( $end+1, ($end_exon->end - $end_exon->start + 1 )); } my $acceptor = $next_bioseq->subseq( 1, $acceptor_bases_count ); my $next_codon = $donor.$acceptor; if ( uc($next_codon) eq 'TAA' || uc($next_codon) eq 'TAG' || uc($next_codon) eq 'TGA'){ logger_info("shared-codon: next codon is a stop - ". "extending translation"); $cloned_transcript->translation->end_Exon( $next_exon ); $cloned_transcript->translation->end( $acceptor_bases_count ); ############################################################ # re-set the phases: $end_exon->end_phase($donor_bases_count%3); $next_exon->phase( $donor_bases_count%3 ); $cloned_transcript->recalculate_coordinates; return $cloned_transcript; } else{ logger_info("next codon is not a stop - not modifying translation"); return $cloned_transcript; } }elsif( $end + 3 > ($end_exon->end - $end_exon->start + 1) ){ # there is no next exon and the next codon would fall off the end # of the exon # need to get the slice sequence my $adaptor = $end_exon->slice->adaptor; if ( $adaptor ){ my $donor_bases_count = ( $end_exon->end - $end_exon->start + 1 ) - $end; my $acceptor_bases_count = 3 - $donor_bases_count; # the sequence from the current end exon is: my $donor; if ( $donor_bases_count == 0 ){ $donor = ''; } else { $donor = $bioseq->subseq( $end+1, ( $end_exon->end - $end_exon->start + 1 )); } ############################################################ # here we distinguish the strands if ( $end_exon->strand == 1 ){ my $slice_start = $end_exon->slice->start; ############################################################ # calculate the next codon start/end in chr coordinates my $codon_start = $slice_start + ( $end_exon->start + $end - 1 ); my $codon_end = $codon_start + 2; my $codon_slice = $adaptor->fetch_by_region ($end_exon->slice->coord_system->name, $end_exon->slice->seq_region_name, $codon_start, $codon_end ); my $codon = $codon_slice->seq; ############################################################ if ( uc($codon) eq 'TAA' || uc($codon) eq 'TAG' || uc($codon) eq 'TGA'){ logger_info("forward-strand:next codon (falling off the exon) ". "is a stop - extending translation"); $end_exon->end( $end_exon->end + $acceptor_bases_count ); $cloned_transcript->translation->end( $end + 3 ); ############################################################ # update the exon sequence: my $seq_string = $end_exon->slice->subseq( $end_exon->start, $end_exon->end, $end_exon->strand ); $cloned_transcript->translation->end_Exon($end_exon); $cloned_transcript->recalculate_coordinates; return $cloned_transcript; } else{ logger_info("next codon (falling off the exon) is not a stop - not modifying"); return $cloned_transcript; } } else { my $slice_start = $end_exon->slice->start; ############################################################ # calculate the next codon start/end in chr coordinates my $codon_end = $slice_start + $end_exon->end - $end - 1; my $codon_start = $codon_end - 2; if($codon_start <= 0){ logger_info("Can't extend the transcript off the end of a ". $end_exon->slice->coord_system->name); return $cloned_transcript; } my $codon_slice = $adaptor->fetch_by_region ($end_exon->slice->coord_system->name, $end_exon->slice->seq_region_name, $codon_start, $codon_end ); my $pre_codon = $codon_slice->seq; # need to reverse and complement: my $codon; ( $codon = reverse $pre_codon ) =~tr/gatcGATC/ctagCTAG/; ; if ( uc($codon) eq 'TAA' || uc($codon) eq 'TAG' || uc($codon) eq 'TGA'){ logger_info("reverse-strand: next codon (falling off the exon) is a stop - extending translation\n"); $end_exon->start( $end_exon->start - $acceptor_bases_count); $cloned_transcript->translation->end( $end + 3 ); ############################################################ # update the exon sequence: my $seq_string = $end_exon->slice->subseq( $end_exon->start, $end_exon->end, $end_exon->strand ); $cloned_transcript->translation->end_Exon( $end_exon ); $cloned_transcript->recalculate_coordinates; return $cloned_transcript; } else { logger_info("next codon (falling off the exon) is not a stop - not modifying"); return $cloned_transcript; } } } else { logger_info("cannot get an adaptor to get the sequence - not modifying the translation"); return $cloned_transcript; } } else { logger_info("There is no downstream exon - and no stop codon beyond the last exon - not modifying"); return $cloned_transcript; } } =head2 is_Transcript_sane Arg [1] : Bio::EnsEMBL::Transcript Function : checks some simple facts about a transcript structure to ensure its sane Returntype: boolean Exceptions: none Example : =cut sub is_Transcript_sane{ my ($transcript) = @_; #exon coord sanity my $sane = 1; $transcript->sort; foreach my $exon(@{$transcript->get_all_Exons}){ if($exon->start > $exon->end){ $sane = 0; } } $sane = 0 unless(are_strands_consistent($transcript)); $sane = 0 unless(are_phases_consistent($transcript)); $sane = 0 unless(is_not_folded($transcript)); return $sane; } 1;