Raw content of Bio::EnsEMBL::Analysis::Runnable::HavanaAdder # # # BioPerl module for GeneBuilder # # Cared for by EnsEMBL <ensembl-dev@ebi.ac.uk> # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Analysis::Runnable::HavanaAdder =head1 SYNOPSIS # This is the main analysis database my $genebuilder = new Bio::EnsEMBL::Analysis::Runnable::HavanaAdder ( '-slice' => $self->query, '-input_id' => $self->input_id, ); =head1 DESCRIPTION This module reads your favourite annotations (ensembl, protein_coding,...) on the one hand, and manually curated plus features on the other hand. The product of Bio::EnsEMBL::Analysis::Runnable::HavanaAdder is a combination of both annotations where redundant transcripts are eliminated. The resulting transcripts are combined into genes. For more details, follow the list of methods called by build_Genes() method and the description in each one. =head1 CONTACT ensembl-dev@ebi.ac.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::EnsEMBL::Analysis::Runnable::HavanaAdder; use Bio::EnsEMBL::Transcript; use Bio::EnsEMBL::Exon; use Bio::EnsEMBL::Gene; use Bio::EnsEMBL::Analysis; use Bio::EnsEMBL::Slice; use Bio::EnsEMBL::Root; use Bio::EnsEMBL::DBEntry; use Bio::EnsEMBL::Attribute; use Bio::EnsEMBL::DnaDnaAlignFeature; use Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster; use Bio::EnsEMBL::Analysis::Config::GeneBuild::General qw ( GB_INPUTID_REGEX ); use Bio::EnsEMBL::Analysis::Config::HavanaAdder qw ( GB_ENSEMBL_INPUT_GENETYPE GB_HAVANA_INPUT_GENETYPE GB_HAVANA_INPUT_TRANSCRIPTTYPES MERGED_TRANSCRIPT_OUTPUT_TYPE HAVANA_LOGIC_NAME MERGED_GENE_LOGIC_NAME MERGED_TRANSCRIPT_LOGIC_NAME ); use vars qw(@ISA); use strict; @ISA = qw(Bio::EnsEMBL::Root); ############################################################ sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($slice,$input_id) = $self->_rearrange([qw(SLICE INPUT_ID)], @args); $self->throw("Must input a slice to HavanaAdder") unless defined($slice); $self->{_final_genes} = []; $self->{_gene_types} = []; $self->query($slice); $self->gene_types($GB_ENSEMBL_INPUT_GENETYPE); $self->gene_types($GB_HAVANA_INPUT_GENETYPE); $self->input_id($input_id); return $self; } ############################################################ =head2 input_id Function: get/set for input id Returns : string Args : string (it expects a string of the format chr_name.start_coord-end_coord =cut sub input_id { my ($self,$id) = @_; if (defined($id)) { $self->{_input_id} = $id; } return $self->{_input_id}; } ############################################################ =head2 build_Genes Example : my @genes = $self->build_Genes Description: builds genes. It is like the run method in Runnables. It calls everything that needs to be done. Returns : none Args : none Caller : Bio::EnsEMBL::Analysis::RunnableDB::HavanaAdder =cut sub build_Genes{ my ($self) = @_; print STDERR "Building genes...\n"; # get all genes of type defined in gene_types() on this slice $self->get_Genes; my @all_transcripts = $self->combined_Transcripts; # do a preliminary clustering my @preliminary_genes = $self->cluster_into_Genes(@all_transcripts); # merge redundant ensembl transcripts which match a havana one $self->_merge_redundant_transcripts(\@preliminary_genes); # make shared exons unique objects my @genes = $self->_make_shared_exons_unique( @preliminary_genes ); print STDERR scalar(@genes)." genes built\n"; $self->final_genes( @genes ); } sub _merge_redundant_transcripts{ my ($self, $genes) = @_; GENE: foreach my $gene(@$genes){ my @transcripts = @{$gene->get_all_Transcripts}; my @havana; my @ensembl; #print "number of transcript: ",scalar(@transcripts),"\n"; # are there any havana transcripts? TRANSCRIPT:foreach my $transcript(@transcripts){ foreach my $htranscript (@{$GB_HAVANA_INPUT_TRANSCRIPTTYPES}){ if($transcript->biotype eq $htranscript."_havana"){ #print "I'm a havana transcript\n"; push(@havana, $transcript); next TRANSCRIPT; } } push(@ensembl, $transcript); } if (!scalar(@havana)){ next GENE; } #print "havana tran: ",scalar(@havana),"\n"; #print "ensembl tran: ",scalar(@ensembl),"\n"; # compare each havana transcript to each ensembl one foreach my $ht(@havana){ # We add an attribute to the havana transcripts that show which supporting features it has $self->add_havana_attribute($ht,$ht); #print "Deleting havana transcript supporting features\n"; $ht->flush_supporting_features; #print "Number of ensembl transcripts: ",scalar(@ensembl),"\n"; my $delete_trans = 0; my @t_pair; foreach my $et(@ensembl){ #if ($delete_t = $self->are_matched_pair($ht, $et)){ my $delete_t = $self->are_matched_pair($ht, $et); # We check all posible match pairs and give preference to the one that shares CDS and UTR # This was added so only the best matching havana/ensembl pair is chossen and to avoid a one to many link if ($delete_t){ if ($delete_t == $et){ #print "I'm here\n"; $delete_trans = $delete_t; @t_pair = ($ht, $et); }elsif($delete_trans != $et && $delete_t == $ht){ $delete_trans = $delete_t; @t_pair = ($ht, $et); }elsif($delete_trans == 0){ $delete_trans = $delete_t; @t_pair = ($ht, $et); } } } if($delete_trans && $delete_trans != 0){ $self->set_transcript_relation($delete_trans, @t_pair); $t_pair[0]->biotype($MERGED_TRANSCRIPT_OUTPUT_TYPE); $t_pair[1]->biotype($MERGED_TRANSCRIPT_OUTPUT_TYPE); # We want to remove the redundant transcript unless both share CDS but have different UTR # structure as in that case we annotate both transcripts $self->_remove_transcript_from_gene($gene, $delete_trans) unless $delete_trans == 1; }else{ $self->add_ottt_xref($ht); } } } } # are_matched_pair check return 4 different possible values: # return 0 means keep both transcript as they have different coding region # return 1 means keep both as they have same coding but different UTR exon structire # return ($ensembl) means keep havana transcript and remove ensembl # return ($havana) means keep ensembl transcript and remove hanana sub are_matched_pair { my($self, $havana, $ensembl) = @_; # Fetch all exons in each transcript my @hexons = @{$havana->get_all_Exons}; my @eexons = @{$ensembl->get_all_Exons}; my @thexons = @{$havana->get_all_translateable_Exons}; my @teexons = @{$ensembl->get_all_translateable_Exons}; # Check that the number of exons is the same in both transcripts return 0 unless scalar(@hexons) == scalar(@eexons); #print "____________________________________\n"; #print "HAVANA ID: ",$havana->dbID, " ENSEMBL: ",$ensembl->dbID,"\n"; # double check translation coords #print "HAVANA TRANS START: ",$havana->translation->genomic_start," END: ",$havana->translation->genomic_end,"\n"; #print "ENSEMBL TRANS START: ",$ensembl->translation->genomic_start," END: ",$ensembl->translation->genomic_end,"\n"; return 0 unless($havana->translation->genomic_start == $ensembl->translation->genomic_start && $havana->translation->genomic_end == $ensembl->translation->genomic_end); # special case for single exon genes if(scalar(@hexons == 1)){ #print "SINGLE EXONS!\n"; if ($hexons[0]->start == $eexons[0]->start && $hexons[0]->end == $eexons[0]->end && $hexons[0]->strand == $eexons[0]->strand && $thexons[0]->coding_region_start($havana) == $teexons[0]->coding_region_start($ensembl) && $thexons[0]->coding_region_end($havana) == $teexons[0]->coding_region_end($ensembl) ){ # Both are exactly the same so we delete the Ensembl one unless the Ensembl one is already a merged one return $ensembl; }elsif($hexons[0]->start <= $eexons[0]->start && $hexons[0]->end >= $eexons[0]->end && $hexons[0]->strand == $eexons[0]->strand && $eexons[0]->start == $teexons[0]->coding_region_start($ensembl) && $eexons[0]->end == $teexons[0]->coding_region_end($ensembl) ){ # Ensembl gene don't have UTR and Havana has then delete Ensembl one return $ensembl; }elsif((($hexons[0]->start != $eexons[0]->start || $hexons[0]->end != $eexons[0]->end) && $hexons[0]->strand == $eexons[0]->strand) && ($eexons[0]->start != $teexons[0]->coding_region_start($ensembl) || $eexons[0]->end != $teexons[0]->coding_region_end($ensembl)) ){ # Both contain UTR keep ENSEMBL return $havana; }else{ # We can be here when genes have different UTR start/end and different CDS start/end # or when the UTR start/end is the same but the CDS start/end is different #print "Keep both single exon genes\n"; return 0; } } # if is a multi exons transcript else{ # First we check the internal coding structure of the transcript where everything has to be exactly equal #print "CHECKING INTERNAL EXONS \n"; for(my $i=1; $i<=($#thexons-1); $i++){ return 0 unless ($thexons[$i]->start == $teexons[$i]->start && $thexons[$i]->end == $teexons[$i]->end && $thexons[$i]->strand == $teexons[$i]->strand ); } #print "INTERNAL CODING EXONS ARE OK \n"; # now check the rest of the internal exons that are not coding. for(my $i=1; $i<=($#hexons-1); $i++){ return 1 unless ($hexons[$i]->start == $eexons[$i]->start && $hexons[$i]->end == $eexons[$i]->end && $hexons[$i]->strand == $eexons[$i]->strand ); } #print "INTERNAL UTR EXONS ARE OK \n"; # Then check if the first an last exon are the same in both transcripts. If just start and end of UTR are different keep ensembl one # CASE 1: Both coding and UTR are the same, keep Havana and delete Ensembl if ($hexons[0]->start == $eexons[0]->start && $hexons[0]->end == $eexons[0]->end && $hexons[0]->strand == $eexons[0]->strand && $hexons[-1]->start == $eexons[-1]->start && $hexons[-1]->end == $eexons[-1]->end && $hexons[-1]->strand == $eexons[-1]->strand ){ #print "MULTIEXON DELETE ENSEMBL\n"; return $ensembl; }elsif (#CASE 2": HAVANA HAS UTR AND ENSEMBL DOESNT, KEEP HAVANA. Forward strand $hexons[0]->strand == 1 && $hexons[0]->end == $eexons[0]->end && $hexons[0]->strand == $eexons[0]->strand && $hexons[-1]->start == $eexons[-1]->start && $hexons[-1]->strand == $eexons[-1]->strand && $eexons[0]->start == $teexons[0]->coding_region_start($ensembl) && $eexons[-1]->end == $teexons[-1]->coding_region_end($ensembl) && ($hexons[-1]->end != $eexons[-1]->end || $hexons[0]->start != $eexons[0]->start) ){ #print "MULTIEXON DELETE ENSEMBL\n"; return $ensembl; }elsif (# CASE 3: BOTH ENSEMBL AND HAVANA HAVE UTR BUT WITH DIFFERENT START/END, KEEP ENSEMBL. Forward strand $hexons[0]->strand == 1 && $hexons[0]->end == $eexons[0]->end && $hexons[0]->strand == $eexons[0]->strand && $hexons[-1]->start == $eexons[-1]->start && $hexons[-1]->strand == $eexons[-1]->strand && ($eexons[0]->start != $teexons[0]->coding_region_start($ensembl) || $eexons[-1]->end != $teexons[-1]->coding_region_end($ensembl)) && ($hexons[-1]->end != $eexons[-1]->end || $hexons[0]->start != $eexons[0]->start) ){ #print "MULTIEXON DELETE HAVANA\n"; return $havana; }elsif (# CASE 4: Same as case 2 but in reverse strand $hexons[0]->strand == -1 && $hexons[0]->start == $eexons[0]->start && $hexons[0]->strand == $eexons[0]->strand && $hexons[-1]->end == $eexons[-1]->end && $hexons[-1]->strand == $eexons[-1]->strand && $eexons[-1]->start == $teexons[-1]->coding_region_start($ensembl) && $eexons[0]->end == $teexons[0]->coding_region_end($ensembl) && ($hexons[0]->end != $eexons[0]->end || $hexons[-1]->start != $eexons[-1]->start) ){ #print "MULTIEXON DELETE ENSEMBL\n"; return $ensembl; }elsif (# CASE 5: Same as case 3 but in reverse strand $hexons[0]->strand == -1 && $hexons[0]->start == $eexons[0]->start && $hexons[0]->strand == $eexons[0]->strand && $hexons[-1]->end == $eexons[-1]->end && $hexons[-1]->strand == $eexons[-1]->strand && ($eexons[-1]->start != $teexons[-1]->coding_region_start($ensembl) || $eexons[0]->end != $teexons[0]->coding_region_end($ensembl)) && ($hexons[0]->end != $eexons[0]->end || $hexons[-1]->start != $eexons[-1]->start) ){ #print "MULTIEXON DELETE HAVANA\n"; return $havana; }else{ #print "Should I be here?\n"; #print "Keep MULTIEXON BOTH\n"; return 1; } } print " WEIRD CASE WE DID NOT THOUGHT ABOUT, CHECK RULES!\n"; return 0; } sub add_ottt_xref{ my($self, $ht) = @_; #TEST foreach my $entry(@{ $ht->get_all_DBEntries}){ if ($entry->dbname eq 'Vega_transcript'){ if($entry->primary_id eq $entry->display_id){ print "I am adding an OTTT xref to the transcript\n"; print "OTTT TO ADD: ",$entry->primary_id,"\n"; my $xref_ottt = new Bio::EnsEMBL::DBEntry ( -primary_id =>$entry->primary_id, -display_id =>$ht->display_id, -priority => 1, -xref_priority => 0, -version => 1, -release => 1, -dbname => 'OTTT' ); $xref_ottt->status("XREF"); $ht->add_DBEntry($xref_ottt); #END of TEST } } } } sub set_transcript_relation { # $t_pair[0] is the havana transcript and $t_pair[1] is the ensembl transcript my($self, $delete_t, @t_pair) = @_; # If both share CDS and UTR is different in structure and number of exons we still keep both, and we link them via Xref if ($delete_t == 1){ # transfer OTTT ID and/or ENST foreach my $entry(@{ $t_pair[0]->get_all_DBEntries}){ if ($entry->dbname eq 'Vega_transcript'){ if($entry->primary_id eq $entry->display_id){ my $newentry = new Bio::EnsEMBL::DBEntry ( -primary_id => $entry->primary_id, -display_id => $entry->display_id, -priority => 1, -xref_priority => 0, -version => 1, -release => 1, -dbname => 'shares_CDS_with_OTTT' ); $newentry->status("XREF"); $t_pair[1]->add_DBEntry($newentry); #TEST my $xref_ottt = new Bio::EnsEMBL::DBEntry ( -primary_id =>$entry->primary_id, -display_id =>$entry->display_id, -priority => 1, -xref_priority => 0, -version => 1, -release => 1, -dbname => 'OTTT' ); print "OTTT xref to be added here\n"; $xref_ottt->status("XREF"); $t_pair[0]->add_DBEntry($xref_ottt); #END of TEST } } } my $link_attrib = Bio::EnsEMBL::Attribute->new (-CODE => 'enst_link', -NAME => 'enst link', -DESCRIPTION => 'Code to link a OTTT with an ENST when they both share the CDS of ENST', -VALUE => $t_pair[1]->dbID); $t_pair[1]->add_Attributes($link_attrib); my $xref_entry = new Bio::EnsEMBL::DBEntry ( -primary_id =>$t_pair[1]->dbID, -display_id =>$t_pair[1]->dbID, -priority => 1, -xref_priority => 0, -version => 1, -release => 1, -dbname => 'shares_CDS_with_ENST' ); $xref_entry->status("XREF"); $t_pair[0]->add_DBEntry($xref_entry); print "OTTT TO ADD: ",$t_pair[0]->stable_id,"\n"; } # If transcript to delete is Havana we create an xref for the entry say that the transcript is CDS equal to ENSEMBL elsif ($delete_t == $t_pair[0]){ # transfer OTTT ID and/or ENST foreach my $entry(@{ $t_pair[0]->get_all_DBEntries}){ if ($entry->dbname eq 'Vega_transcript'){ if($entry->primary_id eq $entry->display_id){ my $newentry = new Bio::EnsEMBL::DBEntry ( -primary_id => $entry->primary_id, -display_id => $entry->display_id, -priority => 1, -xref_priority => 0, -version => 1, -release => 1, -dbname => 'shares_CDS_with_OTTT' ); $newentry->status("XREF"); $t_pair[1]->add_DBEntry($newentry); } } } # We add a transcript attribute to the ensembl transcript with the start and end coords of the Havana transcript that we will delete my $attrib_value = $t_pair[0]->slice->coord_system_name.":".$t_pair[0]->slice->coord_system->version.":".$t_pair[0]->slice->seq_region_name.":". $t_pair[0]->start.":".$t_pair[0]->end.":1"; # print "ATTRIB VALUE:---------- ",$attrib_value,"\n"; my $attribute = Bio::EnsEMBL::Attribute->new (-CODE => 'TranscriptEdge', -NAME => 'Transcript Edge', -DESCRIPTION => '', -VALUE => $attrib_value); $t_pair[1]->add_Attributes($attribute); # When we delete a Havana transcript we want to transfer the exon supporting features to the transcript we keep my @delete_e = @{$delete_t->get_all_Exons}; my @exons = @{$t_pair[1]->get_all_Exons}; if (scalar(@delete_e) == scalar(@exons)){ my $e; for ($e = 0, $e<scalar(@delete_e), $e++){ if($delete_e[$e] && $exons[$e]){ $self->transfer_supporting_evidence($delete_e[$e], $exons[$e]); } } } # We add attributes to the havana transcript showing which supporting features where used for the transcript in Havana $self->add_havana_attribute($t_pair[0],$t_pair[1]); }elsif ($delete_t == $t_pair[1]){ # If the transcript to delete is ENSEMBL we add an xref entry say that both transcripts are exact matches (including UTR) foreach my $entry(@{ $t_pair[0]->get_all_DBEntries}){ if ($entry->dbname eq 'Vega_transcript'){ if($entry->primary_id eq $entry->display_id){ my $enstentry = new Bio::EnsEMBL::DBEntry ( -primary_id => $entry->primary_id, -display_id => $entry->display_id, -version => 1, -release => 1, -priority => 1, -xref_priority => 0, -dbname => 'shares_CDS_and_UTR_with_OTTT' ); $enstentry->status("XREF"); $t_pair[0]->add_DBEntry($enstentry); } } } # Transfer the supporting features both for transcript and exon of the transcript to delete to the transcript we keep $self->transfer_supporting_features($delete_t,$t_pair[0]); # We add attributes to the havana transcript showing which supporting features where used for the transcript in Havana #$self->add_havana_attribute($t_pair[0],$t_pair[0]); } } sub add_havana_attribute{ my ($self, $transcript, $trans_to_add_attrib) = @_; my %evidence; my %t_evidence; foreach my $tsf (@{$transcript->get_all_supporting_features}){ $t_evidence{$tsf->hseqname} = 1; } foreach my $te_key (keys %t_evidence){ #print "Adding special attrib\n"; if($te_key->isa("Bio::EnsEMBL::DnaPepAlignFeature")){ my $attribute = Bio::EnsEMBL::Attribute->new (-CODE => 'tp_otter_support', -NAME => 'tp otter support', -DESCRIPTION => 'Evidence ID that was used as protein transcript supporting feature for building a gene in Vega', -VALUE => $ev_key); $trans_to_add_attrib->add_Attributes($attribute); } if($te_key->isa("Bio::EnsEMBL::DnaDnaAlignFeature")){ my $attribute = Bio::EnsEMBL::Attribute->new (-CODE => 'td_otter_support', -NAME => 'td otter support', -DESCRIPTION => 'Evidence ID that was used as cdna transcript supporting feature for building a gene in Vega', -VALUE => $ev_key); $trans_to_add_attrib->add_Attributes($attribute); } } foreach my $exon (@{$transcript->get_all_Exons}){ foreach my $sf (@{$exon->get_all_supporting_features}){ $evidence{$sf->hseqname} = 1; } } foreach my $ev_key (keys %evidence){ #print "Adding special attrib\n"; if($ev_key->isa("Bio::EnsEMBL::DnaPepAlignFeature")){ my $attribute = Bio::EnsEMBL::Attribute->new (-CODE => 'ep_otter_support', -NAME => 'ep otter support', -DESCRIPTION => 'Evidence ID that was used as protein exon supporting feature for building a gene in Vega', -VALUE => $ev_key); $trans_to_add_attrib->add_Attributes($attribute); } if($ev_key->isa("Bio::EnsEMBL::DnaPepAlignFeature")){ my $attribute = Bio::EnsEMBL::Attribute->new (-CODE => 'ed_otter_support', -NAME => 'ed otter support', -DESCRIPTION => 'Evidence ID that was used as cdna exon supporting feature for building a gene in Vega', -VALUE => $ev_key); $trans_to_add_attrib->add_Attributes($attribute); } } } sub transfer_supporting_features{ my ($self, $delete_t, $transcript) = @_; #print "TRANSCRIPT IS : ", $transcript,"\n"; my @exon_features; # Delete all the supporting features for the Havana Transcript #$transcript->flush_supporting_features; my @delete_tsf = @{ $delete_t->get_all_supporting_features }; #my @transcript_sf = @{ $transcript->get_all_supporting_features }; # print "NUMBER OF TRANSCRIPT SF: ",scalar(@transcript_sf),"\n"; # print " AND DELETE TSF: ", scalar(@delete_tsf),"\n"; DTSF: foreach my $dtsf (@delete_tsf){ next DTSF unless $dtsf->isa("Bio::EnsEMBL::FeaturePair"); $transcript->add_supporting_features($dtsf); } my @delete_e = @{$delete_t->get_all_Exons}; my @exons = @{$transcript->get_all_Exons}; if (scalar(@delete_e) == scalar(@exons)){ my $e; for ($e = 0, $e<scalar(@delete_e), $e++){ if($delete_e[$e] && $exons[$e]){ $self->transfer_supporting_evidence($delete_e[$e], $exons[$e]); } } } # print "NUMBER AFT ADDITTION: ", scalar(@{ $transcript->get_all_supporting_features }),"\n"; } sub _remove_transcript_from_gene { my ($self, $gene, $trans_to_del) = @_; my @newtrans; foreach my $trans (@{$gene->get_all_Transcripts}) { if ($trans != $trans_to_del) { push @newtrans,$trans; } } # The naughty bit! $gene->{_transcript_array} = []; foreach my $trans (@newtrans) { $gene->add_Transcript($trans); } return scalar(@newtrans); } ############################################################ sub _make_shared_exons_unique{ my ( $self, @genes ) = @_; my @pruned_genes; foreach my $gene ( @genes ){ # make different exon objects that are shared between transcripts # ( regarding attributes: start, end, etc ) # into unique exon objects my $new_gene = $self->prune_Exons($gene); push( @pruned_genes, $new_gene ); } return @pruned_genes; } ############################################################ =head2 get_Genes Description: retrieves ensembl and havana gene annotations with supporting evidence. ReturnType : none, but $self->combined_Transcripts is filled Args : none =cut sub get_Genes { my ($self) = @_; my @transcripts; my @genes; my $ensemblslice = $self->fetch_sequence($self->input_id, $self->ensembl_db); my $havanaslice = $self->fetch_sequence($self->input_id, $self->havana_db); print STDERR "Fetching ensembl genes\n"; EGENE: foreach my $egene (@{$ensemblslice->get_all_Genes_by_type($GB_ENSEMBL_INPUT_GENETYPE)}){ # Don't add those genes that contain only transcripts imported from HAVANA (this is important during a merge update) if ($egene->analysis->logic_name() eq $HAVANA_LOGIC_NAME){ next EGENE; }else{ push (@genes,$egene); } } print STDERR "Retrieved ".scalar(@genes)." genes of type ".$GB_ENSEMBL_INPUT_GENETYPE."\n"; print STDERR "Fetching havana genes\n"; my @hgenes = @{$havanaslice->get_all_Genes_by_type($GB_HAVANA_INPUT_GENETYPE)}; print STDERR "Retrieved ".scalar(@hgenes)." genes of type ".$GB_HAVANA_INPUT_GENETYPE."\n"; # We change the biotype of the havana genes/transcripts as it could happend to be the same as the ensembl ones foreach my $hgene(@hgenes){ my $biotype = $hgene->biotype."_havana"; $hgene->biotype($biotype); foreach my $htran (@{$hgene->get_all_Transcripts}) { my $tbiotype = $htran->biotype."_havana"; $htran->biotype($tbiotype); } } push(@genes, @hgenes); foreach my $gene(@genes){ TRANSCRIPT: foreach my $tran (@{$gene->get_all_Transcripts}) { #First we remove HAVANA only transcripts that are present in merged genes if($gene->analysis->logic_name() eq $MERGED_GENE_LOGIC_NAME && $tran->analysis->logic_name() eq $HAVANA_LOGIC_NAME){ next TRANSCRIPT; }elsif($tran->analysis->logic_name() eq $MERGED_TRANSCRIPT_LOGIC_NAME){ # In case of a merged transcript we want to distinguish the ones that came from HAVANA that have same CDS # but different UTR structure as we want to remove then. This is important for a merge update to avoid # then been wrongly identified as share CDS and UTR my $share_enst = 0; my $share_cds_and_utr = 0; my @dbentries = @{ $tran->get_all_DBEntries }; foreach my $dbentry (@dbentries){ if ($dbentry->dbname eq "shares_CDS_with_ENST"){ #print "On transcript: ",$tran->dbID," This is a HAVANA shares ENST\n"; #next TRANSCRIPT; $share_enst = 1; } if ($dbentry->dbname eq "shares_CDS_and_UTR_with_OTTT"){ $share_cds_and_utr = 1; #print "On transcript: ",$tran->dbID," This is a HAVANA shares CDS and UTR\n"; } } if ($share_enst == 1 && $share_cds_and_utr == 0){ next TRANSCRIPT; } } $self->flush_xref($tran); #Check if a transcript is in the discarded genes database before adding it to the merging list. if($self->check_transcript_in_discarded_db($tran) != 0){ #print "Transcript added\n"; push(@transcripts, $tran); } } } print STDERR "Finished fetching genes\n"; $self->combined_Transcripts(@transcripts); } sub check_transcript_in_discarded_db{ my ($self, $tran) = @_; my @exons = @{$tran->get_all_Exons}; my $discardedslice = $self->discarded_db->get_SliceAdaptor->fetch_by_region('toplevel',$tran->slice->seq_region_name,$tran->seq_region_start,$tran->seq_region_end); #print STDERR "Fetching discarded genes\n"; #print "NUMBER OF DISCARDED GENES: ",scalar(@{$discardedslice->get_all_Genes}),"\n"; DGENE: foreach my $dgene (@{$discardedslice->get_all_Genes}){ DTRANS:foreach my $dtran (@{$dgene->get_all_Transcripts}){ my @dexons = @{$dtran->get_all_Exons}; if(scalar(@exons) == scalar(@dexons)){ #print "Number of exons: ",scalar(@exons),"\n"; for (my $i=0; $i < scalar(@exons); $i++){ if ($exons[$i]->seq_region_start != $dexons[$i]->seq_region_start || $exons[$i]->strand != $dexons[$i]->strand || $exons[$i]->seq_region_end != $dexons[$i]->seq_region_end){ # if you enter here means that these two transcripts are not the same #print "transcript exon coordinates are different\n"; next DTRANS; } } # If you are here means that both transcripts are the same and $trans must be discarded print "transcript found in discarded db\n"; return 0; }else{ # if you enter here means that these two transcripts are not the same #print "transcript number of exons is different\n"; next DGENE; } } } #If we reach here means that no transcript in the discarded db is the same as our transcript so we keep it return 1; } sub flush_xref{ my ($self, $transcript) = @_; my @newxrefs; #print "THIS IS WHAT NEED EMPTYING: ",$transcript->get_all_DBEntries,"\n"; foreach my $tran_xref (@{$transcript->get_all_DBEntries}){ if ($tran_xref->dbname ne "shares_CDS_and_UTR_with_OTTT" && $tran_xref->dbname ne "shares_CDS_with_OTTT" && $tran_xref->dbname ne "shares_CDS_with_ENST" && $tran_xref->dbname ne "OTTT"){ push (@newxrefs, $tran_xref); } } # The naughty bit! $transcript->{dbentries} = []; #$transcript->{display_xref} = []; foreach my $newxref (@newxrefs) { $transcript->add_DBEntry($newxref); } # return scalar(@newtrans); } ###########################################################c =head2 cluster_Transcripts Description : It separates transcripts according to strand and then clusters each set of transcripts by calling _cluster_Transcripts_by_genomic_range() Args : Array of Bio::EnsEMBL::Transcript Return : Array of Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster =cut sub cluster_Transcripts { my ($self,@transcripts) = @_; my @forward_transcripts; my @reverse_transcripts; foreach my $transcript (@transcripts){ my @exons = @{ $transcript->get_all_Exons }; if ( $exons[0]->strand == 1 ){ push( @forward_transcripts, $transcript ); } else{ push( @reverse_transcripts, $transcript ); } } my @forward_clusters; my @reverse_clusters; if ( @forward_transcripts ){ @forward_clusters = $self->_cluster_Transcripts_by_genomic_range( @forward_transcripts ); } if ( @reverse_transcripts ){ @reverse_clusters = $self->_cluster_Transcripts_by_genomic_range( @reverse_transcripts ); } my @clusters; if ( @forward_clusters ){ push( @clusters, @forward_clusters); } if ( @reverse_clusters ){ push( @clusters, @reverse_clusters); } return @clusters; } ############################################################ =head2 _cluster_Transcripts_by_genomic_range Description : It clusters transcripts according to genomic overlap Args : Array of Bio::EnsEMBL::Transcript Return : Array of Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster =cut sub _cluster_Transcripts_by_genomic_range{ my ($self,@mytranscripts) = @_; # first sort the transcripts my @transcripts = sort { $a->start <=> $b->start ? $a->start <=> $b->start : $b->end <=> $a->end } @mytranscripts; # create a new cluster my $cluster=Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new(); my $count = 0; my @cluster_starts; my @cluster_ends; my @clusters; # put the first transcript into these cluster $cluster->put_Transcripts( $transcripts[0] ); $cluster_starts[$count] = $transcripts[0]->start; $cluster_ends[$count] = $transcripts[0]->end; # store the list of clusters push( @clusters, $cluster ); # loop over the rest of the transcripts LOOP1: for (my $c=1; $c<=$#transcripts; $c++){ #print STDERR "\nIn cluster ".($count+1)."\n"; #print STDERR "start: $cluster_starts[$count] end: $cluster_ends[$count]\n"; #print STDERR "comparing:\n"; #Bio::EnsEMBL::Analysis::Tools::TranscriptUtils->_print_Transcript( $transcripts[$c] ); if ( !( $transcripts[$c]->end < $cluster_starts[$count] || $transcripts[$c]->start > $cluster_ends[$count] ) ){ $cluster->put_Transcripts( $transcripts[$c] ); # re-adjust size of cluster if ($transcripts[$c]->start < $cluster_starts[$count]) { $cluster_starts[$count] = $transcripts[$c]->start; } if ( $transcripts[$c]->end > $cluster_ends[$count]) { $cluster_ends[$count] = $transcripts[$c]->end; } } else{ # else, create a new cluster with this feature $count++; $cluster = Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new(); $cluster->put_Transcripts( $transcripts[$c] ); $cluster_starts[$count] = $transcripts[$c]->start; $cluster_ends[$count] = $transcripts[$c]->end; # store it in the list of clusters push(@clusters,$cluster); } } return @clusters; } ############################################################ =head2 cluster_into_Genes Example : my @genes = $self->cluster_into_Genes(@transcripts); Description : it clusters transcripts into genes according to exon overlap. It will take care of difficult cases like transcripts within introns. It also unify exons that are shared among transcripts. Returns : a beautiful list of geen objects Args : a list of transcript objects =cut sub cluster_into_Genes{ my ($self, @transcripts_unsorted) = @_; my $num_trans = scalar(@transcripts_unsorted); # First clean the coding exon cache in case it has any exons stored from previous called to the cluster_into_Genes function. $self->clear_coding_exons_cache; my @transcripts_unsorted_translation; foreach my $tran(@transcripts_unsorted){ if ($tran->translation){ push (@transcripts_unsorted_translation, $tran); } } my @transcripts = sort { $a->coding_region_start <=> $b->coding_region_start ? $a->coding_region_start <=> $b->coding_region_start : $b->coding_region_end <=> $a->coding_region_end } @transcripts_unsorted_translation; my @clusters; # clusters transcripts by whether or not any coding exon overlaps with a coding exon in # another transcript (came from original prune in GeneBuilder) foreach my $tran (@transcripts) { # First clean the coding exon cache in case it has any exons stored from previous called to the cluster_into_Genes function. # $self->clear_coding_exons_cache; my @matching_clusters; CLUSTER: foreach my $cluster (@clusters) { # $self->clear_coding_exons_cache; #print "Transcript: ",$tran->stable_id," has coding region start: ",$tran->coding_region_start,"\n"; foreach my $cluster_transcript (@$cluster) { if ($tran->coding_region_end >= $cluster_transcript->coding_region_start && $tran->coding_region_start <= $cluster_transcript->coding_region_end) { # foreach my $exon1 (@{$tran->get_all_Exons}) { # foreach my $cluster_exon (@{$cluster_transcript->get_all_Exons}) { my $exons1 = get_coding_exons_for_transcript($tran); my $cluster_exons = get_coding_exons_for_transcript($cluster_transcript); foreach my $exon1 (@{$exons1}) { foreach my $cluster_exon (@{$cluster_exons}) { if ($exon1->overlaps($cluster_exon) && $exon1->strand == $cluster_exon->strand) { push (@matching_clusters, $cluster); next CLUSTER; } } } } } } if (scalar(@matching_clusters) == 0) { my @newcluster; push(@newcluster,$tran); push(@clusters,\@newcluster); } elsif (scalar(@matching_clusters) == 1) { push @{$matching_clusters[0]}, $tran; } else { # Merge the matching clusters into a single cluster my @new_clusters; my @merged_cluster; foreach my $clust (@matching_clusters) { push @merged_cluster, @$clust; } push @merged_cluster, $tran; push @new_clusters,\@merged_cluster; # Add back non matching clusters foreach my $clust (@clusters) { my $found = 0; MATCHING: foreach my $m_clust (@matching_clusters) { if ($clust == $m_clust) { $found = 1; last MATCHING; } } if (!$found) { push @new_clusters,$clust; } } @clusters = @new_clusters; } } # safety and sanity checks $self->check_Clusters(scalar(@transcripts), \@clusters); # make and store genes #print STDERR scalar(@clusters)." created, turning them into genes...\n"; my @genes; foreach my $cluster(@clusters){ my $count = 0; my $gene = new Bio::EnsEMBL::Gene; foreach my $transcript (@$cluster){ #print "Transcript Stable ID: ",$transcript->dbID,"\n"; $gene->add_Transcript($transcript); } push( @genes, $gene ); } return @genes; } ############################################################ =head2 get_coding_exons_for_transcript Example : my $exons1 = $self->get_coding_exons_for_gene($tran); Description : It returns the coding exons of a transcript and stores them in a hash to safe computer time Returns : An ArrayRef than contain Exon objects. Args : a transcript object =cut { my %coding_exon_cache; sub clear_coding_exons_cache { %coding_exon_cache = (); } sub get_coding_exons_for_transcript { my ($trans) = @_; if (exists($coding_exon_cache{$trans})) { return $coding_exon_cache{$trans}; } else { my %coding_hash; next if (!$trans->translation); foreach my $exon (@{$trans->get_all_translateable_Exons}) { $coding_hash{$exon} = $exon; } my @coding = sort { $a->start <=> $b->start } values %coding_hash; #my @coding = values %coding_hash; $coding_exon_cache{$trans} = \@coding; return $coding_exon_cache{$trans}; } } } ############################################################ sub check_Clusters{ my ($self, $num_transcripts, $clusters) = @_; #Safety checks my $ntrans = 0; my $cluster_num = 0; my %trans_check_hash; foreach my $cluster (@$clusters) { $ntrans += scalar(@$cluster); foreach my $trans (@$cluster) { if (defined($trans_check_hash{$trans})) { $self->throw("Transcript " . $trans->dbID . " added twice to clusters\n"); } $trans_check_hash{$trans} = 1; } if (!scalar(@$cluster)) { $self->throw("Empty cluster"); } } if ($ntrans != $num_transcripts) { $self->throw("Not all transcripts have been added into clusters $ntrans and " . $num_transcripts. " \n"); } #end safety checks return; } ############################################################ sub transcript_high{ my ($self,$tran) = @_; my $high; #$tran->sort; if ( $tran->start_Exon->strand == 1){ $high = $tran->end_Exon->end; } else{ $high = $tran->start_Exon->end; } return $high; } ############################################################ sub transcript_low{ my ($self,$tran) = @_; my $low; #$tran->sort; if ( $tran->start_Exon->strand == 1){ $low = $tran->start_Exon->start; } else{ $low = $tran->end_Exon->start; } return $low; } ############################################################ sub by_transcript_high { my $alow; my $blow; my $ahigh; my $bhigh; # alow and ahigh are the left most and right most coordinates for transcript $a if ($a->start_Exon->strand == 1) { $alow = $a->start_Exon->start; $ahigh = $a->end_Exon->end; } else { $alow = $a->end_Exon->start; $ahigh = $a->start_Exon->end; } # blow and bhigh are the left most and right most coordinates for transcript $b if ($b->start_Exon->strand == 1) { $blow = $b->start_Exon->start; $bhigh = $b->end_Exon->end; } else { $blow = $b->end_Exon->start; $bhigh = $b->start_Exon->end; } # return the ascending comparison of the right-most coordinates if they're different if ($ahigh != $bhigh) { return $ahigh <=> $bhigh; } # if they'r equal, return the ascending comparison of the left most coordinate else { return $alow <=> $blow; } } ############################################################ sub prune_Exons { my ($self,$gene) = @_; my @unique_Exons; # keep track of all unique exons found so far to avoid making duplicates # need to be very careful about translation->start_Exon and translation->end_Exon foreach my $tran (@{$gene->get_all_Transcripts}) { my @newexons; foreach my $exon (@{$tran->get_all_Exons}) { my $found; #always empty UNI:foreach my $uni (@unique_Exons) { if ($uni->start == $exon->start && $uni->end == $exon->end && $uni->strand == $exon->strand && $uni->phase == $exon->phase && $uni->end_phase == $exon->end_phase ) { $found = $uni; last UNI; } } if (defined($found)) { push(@newexons,$found); if ($exon == $tran->translation->start_Exon){ $tran->translation->start_Exon($found); } if ($exon == $tran->translation->end_Exon){ $tran->translation->end_Exon($found); } } else { push(@newexons,$exon); push(@unique_Exons, $exon); } } $tran->flush_Exons; foreach my $exon (@newexons) { $tran->add_Exon($exon); } #print "Uniq_tran sid: ",$tran->dbID,"\n"; } return $gene; } ############################################################ =head2 prune_features Description: prunes out duplicated features Returntype : array of Bio::EnsEMBL::SeqFeature Args : array of Bio::EnsEMBL::SeqFeature =cut sub prune_features { my ($self,$feature_hash) = @_; my @pruned; ID: foreach my $id (keys %{ $feature_hash }) { my @features = @{$feature_hash->{$id}}; @features = sort {$a->start <=> $b->start} @features; unless ( @features ){ print STDERR "No features here for id: $id\n"; next ID; } while ( @features && !defined $features[0] ){ #print STDERR "jumping an undefined feature\n"; shift @features; } my $prev = -1; FEATURE: foreach my $f (@features) { if ($prev != -1 && $f->hseqname eq $prev->hseqname && $f->start == $prev->start && $f->end == $prev->end && $f->hstart == $prev->hstart && $f->hend == $prev->hend && $f->strand == $prev->strand && $f->hstrand == $prev->hstrand) { #keep the one with highest score if ( $f->score > $prev->score ){ $prev->score( $f->score ); } #print STDERR "pruning duplicated feature\n"; #print STDERR "previous: ".$prev->gffstring."\n"; #print STDERR "thisone : ".$f->gffstring."\n"; next FEATURE; } else { push(@pruned,$f); $prev = $f; } } } return @pruned; } ############################################################ ############################################################ # # GETSET METHODS # ############################################################ # get/set method holding a reference to the db with genewise and combined genes, # havana genes and discarded genes # this reference is set in Bio::EnsEMBL::Analysis::RunnableDB::HavanaAdder sub ensembl_db{ my ($self,$ensembl_db) = @_; if ( $ensembl_db ){ $self->{_ensembl_db} = $ensembl_db; } return $self->{_ensembl_db}; } sub havana_db{ my ($self,$havana_db) = @_; if ( $havana_db ){ $self->{_havana_db} = $havana_db; } return $self->{_havana_db}; } sub discarded_db{ my ($self, $discarded_db) = @_; if ( $discarded_db ){ $self->{_discarded_db} = $discarded_db;; } return $self->{_discarded_db}; } ############################################################ sub combined_Transcripts { my ($self,@transcripts) = @_; if (!defined($self->{_genewise_andthelike_transcripts})) { $self->{_genewise_andthelike_transcripts} = []; } if (scalar @transcripts > 0) { push(@{$self->{_genewise_andthelike_transcripts}},@transcripts); } return @{$self->{_genewise_andthelike_transcripts}}; } ############################################################ #=head2 my_genes # # Description: this holds and returns the genes that are produced after putting together genewise, combined and # processed_supporte_ab_initio predictions and removing the redundant set, giving priority # to long CDSs + UTR # #=cut # # #sub my_genes { # my ($self,@genes) = @_; # # unless($self->{_my_genes}){ # $self->{_my_genes} = []; # } # # if (@genes){ # push(@{$self->{_my_genes}},@genes); # } # return @{$self->{_my_genes}}; #} ############################################################ =head2 final_genes Descripton: this holds/returns the final genes produced after clustering transcripts and sharing common exons =cut sub final_genes{ my ($self, @genes) = @_; if ( @genes ){ push( @{$self->{_final_genes}}, @genes ); } return @{$self->{_final_genes}}; } ############################################################ =head2 gene_types Description: get/set for the type(s) of genes (usually TGE_gw, similarity_genewise and combined_e2g genes) to be used in the genebuilder they get set in new() Does not include the ab inition predictions =cut sub gene_types { my ($self,$type) = @_; if (defined($type)) { push(@{$self->{_gene_types}},$type); } return @{$self->{_gene_types}}; } ############################################################ #=head2 predictions # # Description: get/set for the PredictionTranscripts. It is set in new() # #=cut # #sub predictions { # my ($self,@predictions) = @_; # # if(!$self->{_predictions}){ # $self->{_predictions} = []; # } # if ( @predictions ) { # push(@{$self->{_predictions}},@predictions); # } # return @{$self->{_predictions}}; #} ############################################################ sub features { my ($self,@features) = @_; if (!defined($self->{_feature})) { $self->{_feature} = []; } if ( scalar @features ) { push(@{$self->{_feature}},@features); } return @{$self->{_feature}}; } ############################################################ sub query { my ($self,$slice) = @_; if (defined($slice)) { $self->{_query} = $slice; } return $self->{_query}; } ############################################################ =head2 transfer_supporting_evidence Title : transfer_supporting_evidence Usage : $self->transfer_supporting_evidence($source_exon, $target_exon) Function: Transfers supporting evidence from source_exon to target_exon, after checking the coordinates are sane and that the evidence is not already in place. Returns : nothing, but $target_exon has additional supporting evidence =cut sub transfer_supporting_evidence{ my ($self, $source_exon, $target_exon) = @_; my @target_sf = @{$target_exon->get_all_supporting_features}; # print "target exon sf: \n"; # foreach my $tsf(@target_sf){ print STDERR $tsf; $self->print_FeaturePair($tsf); } # print "source exon: \n"; # keep track of features already transferred, so that we do not duplicate my %unique_evidence; my %hold_evidence; SOURCE_FEAT: foreach my $feat ( @{$source_exon->get_all_supporting_features}){ next SOURCE_FEAT unless $feat->isa("Bio::EnsEMBL::FeaturePair"); # skip duplicated evidence objects next SOURCE_FEAT if ( $unique_evidence{ $feat } ); # skip duplicated evidence if ( $hold_evidence{ $feat->hseqname }{ $feat->start }{ $feat->end }{ $feat->hstart }{ $feat->hend } ){ #print STDERR "Skipping duplicated evidence\n"; next SOURCE_FEAT; } #$self->print_FeaturePair($feat); TARGET_FEAT: foreach my $tsf (@target_sf){ next TARGET_FEAT unless $tsf->isa("Bio::EnsEMBL::FeaturePair"); if($feat->start == $tsf->start && $feat->end == $tsf->end && $feat->strand == $tsf->strand && $feat->hseqname eq $tsf->hseqname && $feat->hstart == $tsf->hstart && $feat->hend == $tsf->hend){ #print STDERR "feature already in target exon\n"; next SOURCE_FEAT; } } #print STDERR "from ".$source_exon->dbID." to ".$target_exon->dbID."\n"; #$self->print_FeaturePair($feat); # I may need to add a paranoid check to see that no exons longer than the current one are transferred $target_exon->add_supporting_features($feat); $unique_evidence{ $feat } = 1; $hold_evidence{ $feat->hseqname }{ $feat->start }{ $feat->end }{ $feat->hstart }{ $feat->hend } = 1; } } #fetches sequence from appropriate database sub fetch_sequence{ my ($self, $name, $db) = @_; my $sa = $db->get_SliceAdaptor; my $slice = $sa->fetch_by_name($name); return $slice; } 1;