Bio::EnsEMBL::Analysis::Runnable
TranscriptCoalescer
Toolbar
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
my $runnable = Bio::EnsEMBL::Analysis::Runnable::TranscriptCoalescer->new(
-query => $slice,
-program => 'snap',
);
$runnable->run;
my @predictions = @{$runnable->output};
Description
TranscriptCoalescer combines gene-structures from different evidence-sets
to longer predictions and adds translations to these predictions.
Methods
_all_exons_visited | No description | Code |
_check_if_lg_spans_st | No description | Code |
_compare_Genes | No description | Code |
add_translation_and_trans_supp_features_to_transcripts | No description | Code |
check_if_all_exons_are_overlapped | No description | Code |
compare | No description | Code |
edit_terminal_exons | No description | Code |
exon_recursion | No description | Code |
get_all_evidence_sets | No description | Code |
get_biotypes_of_evidence_set | No description | Code |
get_genes_by_biotype | No description | Code |
get_genes_by_evidence_set | No description | Code |
get_longest_3prim_term_exon_in_exon_cluster_of_this_exon | No description | Code |
get_longest_5prim_term_exon_in_exon_cluster_of_this_exon | No description | Code |
get_unvisited_exons_in_next_ec | No description | Code |
main_clustering_and_recursion | No description | Code |
merge_transcripts | No description | Code |
merge_transcripts_recursion | No description | Code |
new | Description | Code |
new_tr | No description | Code |
print_cluster_info | No description | Code |
print_exon | No description | Code |
print_object | No description | Code |
remove_overlapping_transcripts | No description | Code |
remove_redundant_transcripts | No description | Code |
run | Description | Code |
start_est_linking | No description | Code |
transcript_recursion | No description | Code |
Methods description
Function : creates TranscriptCoalescer-object Returnval : returns TranscripotCoalescer-object |
Arg : none Function : Runs the TranscriptCoalescer - clusters genes of EvidenceSet 'est' acc. to their genomic extent - Returnval : none |
Methods code
_all_exons_visited | description | prev | next | Top |
sub _all_exons_visited
{ my ( $exon_cluster_aref) = @_ ;
for my $ec (@$exon_cluster_aref) {
for my $e ( @{ $ec->get_all_Exons_in_ExonCluster } ) {
return 0 unless ( $e->visited ) ;
}
}
return 1 ; } |
sub _check_if_lg_spans_st
{ my ($lg, $st ) = @_ ;
if ($lg->seq_region_start <= $st->seq_region_start
&& $lg->seq_region_end >= $st->seq_region_end
&& $lg->seq_region_strand == $st->seq_region_strand
&& (@{$lg->get_all_Exons} >= @{$st->get_all_Exons } )
&& $lg ne $st ) {
return 1 ;
}
if ($st->seq_region_start <= $lg->seq_region_start
&& $st->seq_region_end >= $lg->seq_region_end
&& $st->seq_region_strand == $lg->seq_region_strand
&& (@{$st->get_all_Exons} >= @{$lg->get_all_Exons } )
&& $lg ne $st ) {
return 1 ;
}
return 0 ;
}
} |
sub _compare_Genes
{ my ($gene1,$gene2,$translate) = @_;
if ($gene1->end < $gene2->start || $gene1->start > $gene2->end) {
print "Gene 1 " . $gene1->start . " " . $gene1->end . "\n ";
print "Gene 2 " . $gene1->start . " " . $gene1->end . "\n ";
print "Failed extents check - returning 0\n";
return 0;
}
if ($translate) {
my $exons1 = get_coding_exons_for_gene($gene1);
my $exons2 = get_coding_exons_for_gene($gene2);
foreach my $exon1a (@$exons1) {
foreach my $exon2a (@$exons2) {
if ( ($exon1a->overlaps($exon2a)) && ($exon1a->strand == $exon2a->strand) ){
return 1;
}
}
}
} else {
foreach my $exon1a (@{$gene1->get_all_Exons}){
foreach my $exon2a (@{$gene2->get_all_Exons}){
if ( ($exon1a->overlaps($exon2a)) && ($exon1a->strand == $exon2a->strand) ){
return 1;
}
}
}
}
return 0;
}
} |
add_translation_and_trans_supp_features_to_transcripts | description | prev | next | Top |
sub add_translation_and_trans_supp_features_to_transcripts
{ my ($self, $trans,$min_translation_length) = @_ ;
my @trans_with_tl ;
TRANSCRIPT :for my $tr (@$trans) {
my $new_tr = compute_translation($tr) ;
unless ( $new_tr->translation ) {
print "skipping transcript - no translation !!\n " ;
next TRANSCRIPT ;
}
my $tr_length = $new_tr->length ; my $tl_length = $new_tr->translate->length ;
my $ratio = ( (3*$tl_length) / $tr_length)*100 ;
if ( $ratio > $MIN_TRANSLATION_LENGTH ){
push @trans_with_tl , $tr ;
}else {
print "Translation is shorter than $MIN_TRANSLATION_LENGTH %".
" of transcript length ($ratio) - Transcript will not be used\n"
if $self->{v} ;
if ($WRITE_FILTERED_TRANSCRIPTS) {
$ratio = int($ratio) ;
my $tl_length_biotype;
if ($ratio < 40 ) {
$tl_length_biotype = "translation_len_smaller_40_perc" ;
}else{
$tl_length_biotype = "translation_len_$ratio"."_perc" ;
}
$tr->biotype($tl_length_biotype) ;
push @trans_with_tl, $tr ;
}
}
}
return\@ trans_with_tl ; } |
check_if_all_exons_are_overlapped | description | prev | next | Top |
sub check_if_all_exons_are_overlapped
{ my ($lg,$st) = @_ ;
my $test_ex_ok = 0 ;
my %mark_exon ;
my %overlapped_test_exons ;
my $nr_same_exons_conserved = 0 ;
my $exon_3prim_mismatch = 0 ;
my $exon_5prim_mismatch = 0 ;
for my $lge (@{$lg->get_all_Exons }) {
for my $ste (@{$st->get_all_Exons }) {
if ( $lge->seq_region_start <= $ste->seq_region_start
&& $lge->seq_region_end >= $ste->seq_region_end
&& $lge->seq_region_strand eq $ste->seq_region_strand )
{
$nr_same_exons_conserved++ ;
$test_ex_ok++ ;
$mark_exon{$lge->hashkey} = $lge ;
$overlapped_test_exons{$ste->hashkey} = $ste ;
} else {
my $start_diff = abs( $ste->seq_region_start - $lge->seq_region_start ) ;
my $end_diff = abs( $ste->seq_region_end - $lge->seq_region_end ) ;
if ($ste->is_terminal_exon) {
if ($start_diff < 25 && $end_diff < 25 ) {
my $exon_conservation = $ste->get_percentage_exon_conversation_in_exon_cluster() ;
if ( $exon_conservation < 0.1 ) {
$mark_exon{$lge->hashkey} = $lge ;
$overlapped_test_exons{$ste}=$ste ;
$test_ex_ok++ ;
}else{
}
}
} else {
my $conservation = $ste->get_percentage_exon_conversation_in_exon_cluster() ;
if ($conservation < 0.1) {
}
}
$start_diff = abs( $ste->seq_region_start - $lge->seq_region_start ) ;
$end_diff = abs( $ste->seq_region_end - $lge->seq_region_end ) ;
if ($ste->is_5prim_exon ) {
$exon_5prim_mismatch = 1 ; }
if ($ste->is_3prim_exon) {
$exon_3prim_mismatch = 1 ; }
}
}
}
if ($test_ex_ok == scalar(@{ $st->get_all_Exons } ) ) {
for my $lger (@{$lg->get_all_Exons }) {
unless (exists $mark_exon{$lger->hashkey}) {
if ($st->seq_region_start <= $lger->seq_region_end
&& $st->seq_region_end >=$lger->seq_region_end ) {
return 0 ;
}else {
}
}
}
return 1 ; } elsif ( (scalar(@{ $st->get_all_Exons } ) - $nr_same_exons_conserved) < 2 ) {
if ($exon_3prim_mismatch || $exon_5prim_mismatch) {
return 1 ; }
}
return 0 ;
} |
sub compare
{ my ($ft1 , $ft2) = @_ ;
return 0 unless ($ft1 && $ft2) ;
if ( ($ft1->seq_region_start == $ft2->seq_region_start)
&& ($ft1->seq_region_end == $ft2->seq_region_end)
&& ($ft1->seq_region_strand== $ft2->seq_region_strand) ) {
return 1 ;
}
return 0 ; } |
sub edit_terminal_exons
{ my ($self,$aref) = @_ ;
for my $t (@$aref ) {
my @exons = @{ $t->get_all_Exons } ;
my $ex_5prim = $exons[0];
my $longest_exon_5 = get_longest_5prim_term_exon_in_exon_cluster_of_this_exon($ex_5prim) ;
$t = $t->exchange_exon($ex_5prim, $longest_exon_5) ;
my $ex_3prim = $exons[$#exons];
my $longest_exon_3 = get_longest_3prim_term_exon_in_exon_cluster_of_this_exon($ex_3prim) ;
$t = $t->exchange_exon($ex_3prim, $longest_exon_3) ;
}
print_Transcript_and_Exons($aref) if $self->{v} ;
return $aref ; } |
sub exon_recursion
{ my ($self, $all_ec, $exon, $last_exon, $transcript , $cnt, $href ) = @_ ;
if ($self->{v}) {
print "--> Starting exon recursion with this exon :\n\n " ;
print_object($exon) ;
print "\n\n" ;
}
$exon->visited("1") ;
my $tmp_biotype = $exon->biotype ."_temp" ;
my $init = 0 ;
if ( $cnt == 0 ) {
print "Initialisation\n" if $self->{v} ;
$init = 1 ;
$transcript = new Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended(
-BIOTYPE => $tmp_biotype ,
);
$$href{$exon}=$transcript ;
}
$cnt++ ;
if ( $exon->cluster eq $self->{end_terminal_ec} && !$exon->next_exon) {
print "End terminal cluster reached\n" if $self->{v} ;
return $transcript ;
}
if ( $exon->next_exon) {
$$href{$exon}=() ; print "Exon has next exon\n" if $self->{v} ;
unless ( compare ($exon, $last_exon) ) {
my $tr_tmp ;
if ($init) {
$transcript->add_Exon($exon) ;
$tr_tmp = $transcript ;
}else{
$tr_tmp = new_tr($transcript) ;
$transcript->add_Exon($exon) ;
}
$self->exon_recursion( $all_ec, $exon->next_exon , $exon, $tr_tmp , $cnt, $href);
my @t_exons = @{ $transcript->get_all_Exons } ;
my $exon_to_add ;
if ($exon->next_exon) {
if ($transcript->end_Exon ne $exon->next_exon) {
if ($transcript->seq_region_strand eq 1 ) {
if ($transcript->end_Exon->seq_region_end < $exon->next_exon->seq_region_start ) {
$exon_to_add = $exon->next_exon ;
}
} else {
if ($transcript->end_Exon->seq_region_start > $exon->next_exon->seq_region_end ) {
$exon_to_add = $exon->next_exon ;
}
}
}
}
if ($exon_to_add ) {
$transcript->add_Exon($exon_to_add) ;
}
return $transcript ;
} } else { print "Exon has no next exon - trying to find conserved intron to extend Transcript\n\n"
if $self->{v} ;
}
my @all_ex_in_ec = @{ $exon->cluster->get_all_Exons_in_ExonCluster() } ;
$$href{$exon}=() ;
print "Now walking through all exons in Cluster.... ( try to find last matching Intron)\n\n"
if $self->{v};
ALL_EXONS_IN_EC: for my $te ( @all_ex_in_ec ) {
my $nr_all_exons_in_ec = scalar(@all_ex_in_ec);
my $nr_exons_visited = keys %{$href} ;
if ($te eq $exon) {
print "exon is the same ---> skipping / next\n" if $self->{v}; next ;
}
if (exists $$href{$te} ) {
print "exon has been visited ---> skipping / next\n" if $self->{v}; next ;
}
if ($self->{v}){
print "Now testing this exon for conseverd intron:\n" ;
$self->print_exon($te," (this exon will be tested for cons. intron) ") ;
}
my ( $te_intron, $ex_intron ) ;
if ($te->prev_exon && $exon->prev_exon) {
print "\nExon and test-exon have prev. exon - building last intron\n" if $self->{v};
$te_intron = Bio::EnsEMBL::Intron->new($te->prev_exon,$te) ;
$ex_intron = Bio::EnsEMBL::Intron->new($exon->prev_exon,$exon) ;
} else {
}
print "Comparing introns if there are introns...\n" if $self->{v};
if ( compare($te_intron, $ex_intron ) ) {
print "Introns match !!\n" if $self->{v};
if (!exists($$href{$te})) {
print "Exon hasn't been visited\n" if $self->{v};
my $new_tr_2 ;
if ($last_exon) {
if ($self->{v}) {
print "Last_exon : $last_exon\n " ;
$self->print_exon($last_exon) ;
$self->print_exon($te->prev_exon) ;
print "switching transcript bcs conserved intron to this exon:\n";
print_exon($te->prev_exon) ;
print "\nCloning transcript -has the foolowing exons now :\n" ;
print "-------------------------------------------------\n" ;
}
$new_tr_2 = new_tr($transcript) ;
my @all_exons_new_tr = @{$new_tr_2->get_all_Exons} ;
for my $e (@all_exons_new_tr) {
$self->print_exon($e) ;
}
print "\n\n" if $self->{v};
} else {
$new_tr_2 = new_tr($transcript) ;
$last_exon = $exon ;
}
$new_tr_2 = new_tr($transcript) ;
$$href{$te}=1 ;
if ($self->{v}) {
print "new_starting_point:" ; $self->print_exon($te) ;
}
if ($te->next_exon) {
$self->exon_recursion ($all_ec,$te,$last_exon,$new_tr_2,$cnt,$href) ;
}else {
print "test exon has no next exon, skipping to start new recursiox\n" if $self->{v};
}
return ;
}else {
if ($self->{v}) {
print "exon has already been visited\n\n" ;
$self->print_exon($te," VISITED " ) ;
}
}
}elsif ( ($te->seq_region_end eq $exon->seq_region_end) && $te->next_exon) { }elsif ( ($te->seq_region_start eq $exon->seq_region_start) && $te->next_exon){ }else {
}
$$href{$te}=1 ;
} return ; } |
sub get_all_evidence_sets
{ my ($self) = @_ ;
return $self->{evidence_sets}; } |
sub get_biotypes_of_evidence_set
{ my ($self, $ev_set ) = @_ ;
return ${ $self->{evidence_sets}}{$ev_set};
}
} |
sub get_genes_by_biotype
{ my ($self, $biotype ) = @_ ;
return ${ $self->{all_genes_href}}{$biotype}; } |
sub get_genes_by_evidence_set
{ my ($self,$ev_set) = @_ ;
my @ev_set_genes ;
for my $biotype ( @{ $self->get_biotypes_of_evidence_set( $ev_set )} ) {
if ($self->get_genes_by_biotype($biotype)){
push @ev_set_genes, @{ $self->get_genes_by_biotype( $biotype ) } ;
}
}
return\@ ev_set_genes ; } |
get_longest_3prim_term_exon_in_exon_cluster_of_this_exon | description | prev | next | Top |
sub get_longest_3prim_term_exon_in_exon_cluster_of_this_exon
{ my ($small_exon ) = @_ ;
my $ec = $small_exon->cluster;
my @all_exons_in_clust = @{ $ec->get_all_Exons_in_ExonCluster } ;
my $longest_term_exon_in_clust = $small_exon ;
for my $exon (@all_exons_in_clust) {
if (!$exon->prev_exon && !$exon->next_exon) {
warning("Skipping this exon because this is a single exon\n") ;
next ;
}
if (!$exon->next_exon && ($exon->length>$longest_term_exon_in_clust->length) ) {
if ($exon->seq_region_strand eq "1" ) {
if ($exon->seq_region_start == $small_exon->seq_region_start) {
$longest_term_exon_in_clust = $exon ;
}
}elsif ($exon->seq_region_strand eq "-1") {
if ($exon->seq_region_end == $small_exon->seq_region_end ) {
$longest_term_exon_in_clust = $exon ;
}
}else {
throw("exon has no seq_region_strand assigned. exiting\n" ) ;
}
}
}
return $longest_term_exon_in_clust ;
}
1; } |
get_longest_5prim_term_exon_in_exon_cluster_of_this_exon | description | prev | next | Top |
sub get_longest_5prim_term_exon_in_exon_cluster_of_this_exon
{ my ($small_exon ) = @_ ;
my $ec = $small_exon->cluster;
my @all_exons_in_clust = @{ $ec->get_all_Exons_in_ExonCluster } ;
my $longest_term_exon_in_clust = $small_exon ;
for my $exon (@all_exons_in_clust) {
if (!$exon->prev_exon && !$exon->next_exon) {
warning("Skipping this exon because this is a single exon\n") ;
next ;
}
if (!$exon->prev_exon && ($exon->length>$longest_term_exon_in_clust->length) ) {
if ($exon->seq_region_strand eq "1" ) {
if ($exon->seq_region_end == $small_exon->seq_region_end) {
$longest_term_exon_in_clust = $exon ;
}
}elsif ($exon->seq_region_strand eq "-1") {
if ($exon->seq_region_start == $small_exon->seq_region_start ) {
$longest_term_exon_in_clust = $exon ;
}
}else {
throw("exon has no seq_region_strand assigned. exiting\n" ) ;
}
}
}
return $longest_term_exon_in_clust ; } |
sub get_unvisited_exons_in_next_ec
{ my ( $exon_cluster_aref) = @_ ;
my @unprocessed ;
for my $ec (@$exon_cluster_aref) {
for my $e ( @{ $ec->get_all_Exons_in_ExonCluster } ) {
push @unprocessed, $e unless ( $e->visited ) ;
}
return\@ unprocessed if ( @unprocessed > 0 ) ;
}
return [] ; } |
sub main_clustering_and_recursion
{ my ($self, $gene_cluster) = @_ ;
my @exon_clusters = @{ $gene_cluster->get_exon_clustering_from_gene_cluster() } ;
my @ex_cluster_est_evidence = @exon_clusters ;
$self->{start_terminal_ec} = $ex_cluster_est_evidence[0];
$self->{end_terminal_ec} = $ex_cluster_est_evidence[$#ex_cluster_est_evidence];
my @est_start_exons = @{ $ex_cluster_est_evidence[0]->get_all_Exons_of_EvidenceSet('est') } ;
@est_start_exons = grep {$_->next_exon} @est_start_exons ;
my @all_assembled_tr;
my $trans_aref = $self->transcript_recursion(\@
ex_cluster_est_evidence,\@
est_start_exons,
[]
) ;
my @pruned_transcripts ;
for my $transcript ( @$trans_aref ) {
my $ntr = new Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended( -BIOTYPE => $self->{new_biotype} ) ;
for my $exon ( @{ $transcript->get_all_Exons } ) {
$exon->biotype( $self->{new_biotype} ) ;
$ntr->add_Exon( $exon ) ;
}
push @pruned_transcripts , $ntr ;
}
return\@ pruned_transcripts ;
}
} |
sub merge_transcripts
{ my ($self, $base_trans, $trans_to_add , $merge_exon , $string) = @_ ;
my $new_biotype = $self->{new_biotype} . "_" . $merge_exon->biotype . "_merged_$string" ;
my $cloned_ex = new Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonExtended (
-start => $merge_exon->start ,
-end => $merge_exon->end ,
-phase => $merge_exon->phase ,
-end_phase => $merge_exon->end_phase,
-strand => $merge_exon->seq_region_strand ,
-slice => $merge_exon->slice ,
-analysis => $base_trans->analysis ) ;
$cloned_ex->biotype( $new_biotype ) ;
$cloned_ex->transcript($merge_exon->transcript) ;
$cloned_ex->add_supporting_features( @{$merge_exon->get_all_supporting_features}) ;
my %tmp = %{ $self->get_all_evidence_sets };
my %tmp2 ;
@tmp2{ @{ $tmp{est_merged} } } =();
unless (exists $tmp2{ $new_biotype } ) {
push @{ $tmp{'est_merged'}}, $new_biotype ;
$self->get_all_evidence_sets(\% tmp ) ;
}
my $old_merge_exon_biotype = $merge_exon->biotype ;
if ( length($new_biotype) > 40) {
warning("The stringlength of the biotype is too loong....mysql will shorten it\n") ;
}
my @base_exons = @{$base_trans->get_all_Exons()} ;
pop @base_exons ;
my @exons_to_add = @{ $trans_to_add->get_all_Exons} ;
shift @exons_to_add ;
my $tr_merged = new Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended(
-BIOTYPE => $new_biotype ,
-ANALYSIS =>$base_trans->analysis ,
);
my $hseq_name ;
if ($merge_exon->stable_id) {
$hseq_name = $merge_exon->stable_id ;
}elsif ($merge_exon->dbID) {
$hseq_name = $merge_exon->dbID ;
}else {
$hseq_name = $merge_exon->biotype ;
}
my $feat_pair = Bio::EnsEMBL::FeaturePair->new (
-start => $merge_exon->seq_region_start ,
-end => $merge_exon->seq_region_end ,
-strand => $merge_exon->seq_region_strand ,
-slice => $merge_exon->slice ,
-analysis =>$merge_exon->analysis ,
-hseqname =>$hseq_name ,
-hstart => 1 ,
-hend => ( $merge_exon->seq_region_end - $merge_exon->seq_region_start +1 ) ,
-hstrand => $merge_exon->seq_region_strand ,
-score => 0 ,
-percent_id => 0 ,
) ;
my $dna_align_feat = Bio::EnsEMBL::DnaDnaAlignFeature->new (-features =>[$feat_pair] ,
-analysis => $merge_exon->analysis ) ;
$cloned_ex->add_supporting_features ( $dna_align_feat) ;
for my $be (@base_exons ) {
$be->biotype( $new_biotype ) ;
$tr_merged->add_Exon( $be ) ;
}
$tr_merged->add_Exon ( $cloned_ex ) ;
for my $e (@exons_to_add) {
$e->biotype($new_biotype) ;
$tr_merged->add_Exon ($e) ;
}
return $tr_merged ; } |
sub merge_transcripts_recursion
{ my ( $self, $start_transcript , $transcripts_ref , $exon_clusters ) = @_ ;
my $tr1 = $start_transcript ;
my $tr2 ;
if (@$transcripts_ref > 0 ) {
$tr2 = shift @$transcripts_ref ;
my @new_transcripts ;
my $ex_3prim = $tr1->end_Exon();
my $merged_transcript ;
unless ($ex_3prim->next_exon ) {
if ( $tr2->start_Exon->overlaps($ex_3prim) && !$tr2->start_Exon->prev_exon ) {
if ($self->{v}){
print "no next exon - look for overlapping 5prm\n " ;
print "\nRECURSION : These transcripts could be merged:\n" ; print "="x80;
print "\ntr1 : " . $tr1->seq_region_start . " -- " .$tr1->seq_region_end .
" " . $tr1->biotype . "\n" ; ;
print "tr2 : " . $tr2->seq_region_start . " -- " .$tr2->seq_region_end .
" " . $tr2->biotype . "\n" ; ;
}
my $this_ec = $tr1->end_Exon->cluster ;
my @candidate_exons = @{ $this_ec->get_all_Exons_of_EvidenceSet('simgw')} ;
push @candidate_exons, @{ $this_ec->get_all_Exons_of_EvidenceSet('abinitio')} ;
for my $ce (@candidate_exons) {
if ($self->{v}){
print "RECURSION checking candidate exon which spans region :\n" ;
print "start : " . $ce->seq_region_start . "\t".
$ce->seq_region_end . " (" . $ce->analysis->logic_name . ")\n\n" ;
}
if ( $ce->prev_exon && $ce->next_exon ) {
my ( $tr1_boundary , $tr2_boundary ) ;
my $ce_start = $ce->seq_region_end ;
my $ce_end = $ce->seq_region_start ;
if ( $tr1->seq_region_strand eq '1' ){
$tr1_boundary = $tr2->start_Exon->seq_region_end ;
$tr2_boundary = $tr1->end_Exon->seq_region_start ;
} else {
$tr1_boundary = $tr1->end_Exon->seq_region_end ;
$tr2_boundary = $tr2->start_Exon->seq_region_start ;
}
if ($self->{v}){
print "check_exon has previous and next exon\n" ;
print "\n" ; print "tr1_boundary : " . $tr1_boundary . "\n" ;
print "ce_start : " . $ce_start . "\n" ;
print "\n" ; print "tr2_boundary : " . $tr2_boundary . "\n" ;
print "ce_end : " . $ce_end . "\n" ; print "\n\n" ; print "\n\n" ;
}
if ($ce_start == $tr1_boundary && $ce_end == $tr2_boundary ) {
print "Boundaries match - merging genes\n " if $self->{v};
my $tr1_last_intron = Bio::EnsEMBL::Intron->new($tr1->end_Exon->prev_exon,$tr1->end_Exon) ;
my $ce_last_intron = Bio::EnsEMBL::Intron->new( $ce->prev_exon,$ce ) ;
my $tr2_next_intron = Bio::EnsEMBL::Intron->new($tr2->start_Exon,$tr2->start_Exon->next_exon) ;
my $ce_next_intron = Bio::EnsEMBL::Intron->new($ce, $ce->next_exon ) ;
if ( compare ($tr1_last_intron, $ce_last_intron ) && compare ($tr2_next_intron, $ce_next_intron)) {
push @new_transcripts , $self->merge_transcripts ( $tr1, $tr2, $ce, 'no_strict') ;
$merged_transcript = $self->merge_transcripts ( $tr1, $tr2, $ce, 'no_strict') ;
$self->merge_transcripts_recursion ( new_tr($merged_transcript) , $transcripts_ref , $exon_clusters );
}
if ( $tr1->end_Exon->prev_exon->seq_region_end == $ce->prev_exon->seq_region_end
&& $tr1->end_Exon->prev_exon->seq_region_start == $ce->prev_exon->seq_region_start
&& $tr2->start_Exon->next_exon->seq_region_start == $ce->next_exon->seq_region_start
&& $tr2->start_Exon->next_exon->seq_region_end == $ce->next_exon->seq_region_end ) {
push @new_transcripts , $self->merge_transcripts ( $tr1, $tr2, $ce, 'strict' ) ;
$merged_transcript = $self->merge_transcripts ( $tr1, $tr2, $ce, 'strict') ;
$self->merge_transcripts_recursion ( new_tr($merged_transcript) , $transcripts_ref , $exon_clusters );
}
} else {
print "Boundaries does not match\n" if $self->{v};
}
}
} }
}
} else { push @{$self->{merged_tr} } , $start_transcript ;
return ;
}
my $tr_new = new_tr($tr2) ;
$self->merge_transcripts_recursion ( $tr_new , $transcripts_ref , $exon_clusters ); } |
sub new
{ my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
$self->{'_modified_genes'} =[] ; $self->{'_discarded_transcripts'} = []; $self->{'_genes'} = [];
my( $all_genes, $evidence_sets,$dnadb , $utils_verbosity ) =
rearrange([qw(
ALL_GENES
EVIDENCE_SETS
DNADB
UTILS_VERBOSITY
)], @args);
$self->{merged_tr}=[] ;
$self->{min_translation_length}=$MIN_TRANSLATION_LENGTH ;
$self->{dnadb}=$dnadb ;
$self->{new_biotype} = $NEW_BIOTYPE ;
${$evidence_sets}{'est_merged'}=[$self->{new_biotype}] ;
$self->{evidence_sets} = $evidence_sets;
$self->{write_filtered_transcripts} = $WRITE_FILTERED_TRANSCRIPTS ;
$self->{write_alternative_transcripts} = $WRITE_ALTERNATIVE_TRANSCRIPTS ;
$self->{adjudicate_simgw_est} = $ADJUDICATE_SIMGW_EST ;
$self->{all_genes_href} = $all_genes ; $self->{v} = $VERBOSE ; if (defined($utils_verbosity)) {
$self->{v} = 1 if ($utils_verbosity=~m/INFO/) ;
}
return $self ; } |
sub new_tr
{ my ($old_tr) = @_ ;
my $new_tr = new Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended(
-EXONS => $old_tr->get_all_Exons(),
-BIOTYPE =>$old_tr->biotype(),
) ;
$new_tr->ev_set($old_tr->ev_set) if $old_tr->isa("Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended") ;
return $new_tr ; } |
sub print_cluster_info
{ my ($slice,$cluster) = @_;
my $name = "GENE-CLUSTER : ";
$name = "EXON-CLUSTER" if ref($cluster)=~m/Exon/ ;
my $offset = $slice->start -1 ; my $cl_start = $offset + $cluster->start ;
my $cl_end= $offset + $cluster->end ;
print "$name start: $cl_start\tend: $cl_end\t" . $cluster->strand ."\n " ;
return ; } |
sub print_exon
{ my ($self, $ex,$string,$nr_tab) = @_ ;
if ($self->{v}) {
$string ="" unless $string ;
my $tr_dbID = "" ;
my $hseq_name = "" ;
$nr_tab = 0 unless $nr_tab;
my $tr = $ex->transcript() ;
print $ex->biotype if $ex->biotype ;
my @sup_feat = @{ $ex->transcript->get_all_supporting_features} ;
$tr_dbID = "\ttr-db-id " . $ex->transcript->dbID ;
if ( $sup_feat[0]) {
$hseq_name = $sup_feat[0]->hseqname ;
}
print " " . $hseq_name .
"\t". $ex->seq_region_start . "---". $ex->seq_region_end .
"\t". $ex->seq_region_end. "---". $ex->seq_region_start .
"\t". $ex->seq_region_strand .
"\t". $ex->biotype ."$string" .
"\n" ;
} } |
sub print_object
{ my ( $ex, $string ) = @_ ;
print "$string :\n" if $string ;
print "srs -- sre:" .
"\t". $ex->seq_region_start .
"---". $ex->seq_region_end .
"\t". $ex->seq_region_strand .
"\t". $ex->biotype .
"\n" ; } |
sub remove_overlapping_transcripts
{ my ($tref ) = @_ ;
my @transcripts ;
for my $tr (@$tref) {
push @transcripts, $tr unless ($tr->biotype=~m/^del_/) ;
}
@transcripts = sort { scalar(@{$a->get_all_Exons}) <=> scalar(@{$b->get_all_Exons}) } @transcripts;
if ( @{$transcripts[-1]->get_all_Exons} == 1){
@transcripts = sort { $a->length <=> $b->length } @transcripts;
}
my @ltr = reverse @transcripts ;
my @remove_tr ;
for (my $i=0 ; $i<@ltr; $i++) {
my $lg = $ltr[$i] ;
for (my $j=$i+1 ; $j < @ltr; $j++ ) {
my $st = $ltr[$j] ;
if ($lg ne $st) {
if ( _check_if_lg_spans_st ($lg,$st) ) {
if ( check_if_all_exons_are_overlapped ($lg,$st) ) {
push @remove_tr, $st ;
}
} }
}
}
my %seen ;
@seen{ @$tref } = @$tref ;
delete @seen{ @remove_tr } ;
my @diff = values %seen ;
my %tmp ;
@tmp{@remove_tr} = @remove_tr ;
@remove_tr = values %tmp ;
for my $t ( @remove_tr ) {
my $bt = "del_" . $t->biotype ;
$t->biotype($bt) ;
for my $e (@{ $t->get_all_Exons} ) {
}
}
return (\@diff,\@ remove_tr) ; } |
sub remove_redundant_transcripts
{ my ($tr_ref) = @_ ;
my @non_red_trans ;
my %tmp ;
for my $t (@$tr_ref){
my @ex= @{ $t->get_all_Exons } ;
my $string ;
for my $e (@ex) {
my $exon_hk = $e->hashkey ;
$string.=$exon_hk ;
}
push @{$tmp{$string}}, $t ;
}
for my $k (keys %tmp){
my @t = @{$tmp{$k}} ;
push @non_red_trans, $t[0];
}
return\@ non_red_trans ; } |
sub run
{ my ($self) = @_ ;
print " all genes fetched\n" if $self->{v};
my @allgenes = @{ $self->get_genes_by_evidence_set('est') } ;
my ($clusters, $non_clusters) = cluster_Genes(\@allgenes, $self->get_all_evidence_sets ) ;
push @$clusters, @$non_clusters if (scalar(@$non_clusters) > 0 ) ;
my (@all_merged_est_transcripts, @tr_to_delete ) ;
print "non_clusters : " . scalar(@$non_clusters) . "\n" if $self->{v};
GENE_CLUSTER: foreach my $gene_cluster (@$clusters) {
if ($self->{v}) {
print " processing gene_cluster : $gene_cluster\n" ;
print "\n" ; print_cluster_info($self->query,$gene_cluster ) ; print "="x80; print "\n" ;
}
print "Starting main_clustering_and_recursion\n" if $self->{v} ;
my $tr = $self->main_clustering_and_recursion($gene_cluster) ;
if ( $tr ) {
if ($self->{v}){
print "Stage 1: Number Genes resulting out of pure est-merging : ".scalar(@$tr)."\n";
print_Transcript_and_Exons($tr) ;
}
my @non_red_trans =
@{ remove_redundant_transcripts( $self->edit_terminal_exons(remove_redundant_transcripts($tr)))};
print "Stage 2: Number Trans. after term exon edit / removing redundancy : "
. scalar(@non_red_trans)."\n" if $self->{v};
my ($non_overlapping_trans , $removed_trans )=remove_overlapping_transcripts(\@non_red_trans) ;
print "Stage 3: Number non-overlapping Transcripts : "
. scalar(@$non_overlapping_trans)
. " (" . scalar(@$removed_trans) . " removed )\n\n" if $self->{v} ;
push @tr_to_delete , @$removed_trans if $self->{write_filtered_transcripts} ;
push @all_merged_est_transcripts , @$non_overlapping_trans ;
}
}
if ($VERBOSE) {
print "after the first round\n" ;
print "="x80 ; print "\n" ;
print_Transcript_and_Exons(\@all_merged_est_transcripts) ;
}
my @new_gene_set ;
push @new_gene_set , @{convert_to_genes(\@ all_merged_est_transcripts, $self->analysis ) } ;
push @new_gene_set , @{ $self->get_genes_by_evidence_set('abinitio') } ;
push @new_gene_set , @{ $self->get_genes_by_evidence_set('simgw') } ;
my ($clusters2, $non_clusters2) = cluster_Genes(\@new_gene_set, $self->get_all_evidence_sets ) ;
GENE_CLUSTERS: for my $gene_cluster (@$clusters2) {
my %sets_in_gene_cluster ;
@sets_in_gene_cluster { @{$gene_cluster->get_sets_included} }= () ;
if ( exists $sets_in_gene_cluster{est_merged} ) {
my @tr_merged =
map {@{$_->get_all_Transcripts}} $gene_cluster->get_Genes_by_Set ( 'est_merged' );
my $exon_clusters = $gene_cluster->get_exon_clustering_from_gene_cluster();
print "exon_clustering finished\n" if $self->{v};
if ($gene_cluster->strand == 1 ) {
@tr_merged = sort { $a->seq_region_start <=> $b->seq_region_start } @tr_merged ;
}else {
@tr_merged = sort { $b->seq_region_start <=> $a->seq_region_start } @tr_merged ;
}
my $start_transcript = shift @tr_merged ;
my $t = $self->merge_transcripts_recursion($start_transcript ,\@ tr_merged, $exon_clusters );
if ( @{ $self->{merged_tr} }>0 ){
my @cloned_tr = @{ $self->{merged_tr} } ;
$self->{merged_tr}=[] ; print_Transcript_and_Exons(\@cloned_tr) ;
@cloned_tr = @{ remove_redundant_transcripts(\@ cloned_tr ) } ;
my ($non_overlapping_trans, $removed_trans )= remove_overlapping_transcripts(\@ cloned_tr ) ;
push @tr_to_delete, @$removed_trans if ($self->{write_filtered_transcripts});
push @all_merged_est_transcripts , @$non_overlapping_trans ;
@all_merged_est_transcripts =
@{ remove_redundant_transcripts(\@ all_merged_est_transcripts ) };
($non_overlapping_trans, $removed_trans )
= remove_overlapping_transcripts(\@ all_merged_est_transcripts ) ;
@all_merged_est_transcripts = @$non_overlapping_trans ;
push @tr_to_delete , @$removed_trans if $self->{write_filtered_transcripts};
} else {
push @all_merged_est_transcripts , @tr_merged ;
}
} else {
}
}
if ($self->{adjudicate_simgw_est}) {
}
if ($WRITE_FILTERED_TRANSCRIPTS) {
push @all_merged_est_transcripts, @tr_to_delete ;
}
print "Having " . scalar( @all_merged_est_transcripts ) . " genes so far for this slice\n\n " ;
my @converted_transcripts ;
OLD_TR: for my $old_tr (@all_merged_est_transcripts) {
print "transcript\n " ;
my @new_exons ;
my @tsf ;
my @esf ;
OLD_EX: for my $oe ( @{$old_tr->get_all_Exons} ) {
push @tsf, @{$oe->transcript->get_all_supporting_features} ;
print "have " . scalar ( @tsf ) . " tsf for exon ".$oe->dbID."\n " ;
my $ne = new Bio::EnsEMBL::Exon(
-START =>$oe->start,
-END =>$oe->end,
-STRAND =>$oe->strand,
-SLICE =>$oe->slice,
-ANALYSIS =>$oe->analysis,
);
$ne->add_supporting_features(@{$oe->get_all_supporting_features} ) ;
push @esf, @{$oe->get_all_supporting_features} ;
push @new_exons, $ne ;
}
my $ntr = new Bio::EnsEMBL::Transcript (-EXONS =>\@ new_exons) ;
$ntr->biotype($old_tr->biotype) ;
$ntr->add_supporting_features(@tsf) ;
push @converted_transcripts, $ntr;
}
@all_merged_est_transcripts = @converted_transcripts ;
@all_merged_est_transcripts =
sort {$a->seq_region_start <=> $b->seq_region_start} @all_merged_est_transcripts ;
print_Transcript_and_Exons(\@all_merged_est_transcripts) if $self->{v};
my @trans_with_tl = @{$self->add_translation_and_trans_supp_features_to_transcripts(\@
all_merged_est_transcripts,$self->{min_translation_length}) } ;
$self->output ( convert_to_genes (\@ trans_with_tl, $self->analysis) ) ;
return ; } |
sub start_est_linking
{ my ( $self, $start_exons , $ex_cluster_real_ev ) = @_ ;
my @all_assembled_tr ;
if ($self->{v}){
print "starting est-linking with set of start-exons\n" ;
for (@$start_exons) {
print "START_exon: " ; print_object($_) ;
}
}
for my $act_start_exon (@$start_exons) {
if ($self->{v}){
print "\n\nSTART: Starting with exon :\n" ;
$self->print_exon($act_start_exon) ;
print "starting the exon_recursion\n" ;
}
my $t = $self->exon_recursion(
$ex_cluster_real_ev, $act_start_exon, undef, [], 0 , {} );
push @all_assembled_tr, $t if $t;
print "exon_recursion finished" if $self->{v} ;
}
if ($self->{v}){
print "\nAfter exon_recursion:\n" ;
print_Transcript_and_Exons(\@all_assembled_tr) ;
}
return\@ all_assembled_tr ; } |
transcript_recursion | description | prev | next | Top |
sub transcript_recursion
{ my ($self , $ex_clusters_real_ev, $exons ,$all_tr ) = @_ ;
print "\nTranscriptRecursion:\n=============================================\n" if $self->{v};
my @ex_clusters_real_evidence = @$ex_clusters_real_ev ;
my @start_exons ;
if ( scalar(@$all_tr)==0) {
print "initialisation - using start exons which have been handend over _tmp_\n" if $self->{v} ;
@start_exons = @$exons ;
} else {
@start_exons = @{ get_unvisited_exons_in_next_ec( $ex_clusters_real_ev ) } ;
my @tmp ;
if ( ( @start_exons) > 0 ) {
print "start_exons :" . scalar(@start_exons) . "\n\n" if $self->{v} ;
for my $se (@start_exons) {
if ($se->next_exon) {
if ($self->{v}) {
print "this exon_has_next_exon : " ;
print_object($se) ;
print "this is next exon : " ;
print_object($se->next_exon) ;
}
push @tmp, $se if $se->next_exon() ;
}
}
}
@start_exons = @tmp ;
}
if ( _all_exons_visited ( $ex_clusters_real_ev) ) {
print "\nall_exons_visited_returning_from_transcript_recursion\n" if $self->{v};
return ;
}
if (@start_exons > 0 ) {
print "starting est_linking\n" if $self->{v};
my $trans_aref = $self->start_est_linking(\@start_exons , $ex_clusters_real_ev ) ;
print_Transcript_and_Exons($trans_aref , "trans_aref_smart" ) if $self->{v};
push @$all_tr, @$trans_aref ;
print "finished est_linking\n" if $self->{v};
for my $ref (@$all_tr) {
if ( ref($ref)=~m/ARRAY/){
throw("having_array_that is not good");
}
}
}
$self->transcript_recursion ($ex_clusters_real_ev,\@ start_exons , $all_tr ) ;
if ($self->{v}){
print "END_OF_TRANSCRIPT_RECURSION\ntrying print_Transcript\n" ;
print_Transcript_and_Exons($all_tr,"debug_trans_rec") ;
}
return $all_tr ; } |
General documentation
Bio::EnsEMBL::Analysis::Runnable::TranscriptCoealescer
Name : get_all_evidence_sets
Arg[1] : none
Function : Returns hash of evidence_set_names($key) and biotypes
Returnval : Hashref $hash{evidence_set_name} = @\biotypes
Name : get_genes_by_evidence_set($evidence_set)
Arg[1] : String
Function : Returns all genes of an evidence set
(an evidence set contains genes of different biotypes ).
If there were any PredictionTranscripts specified as evidence set,
they were converted to genes and returned, too.
Name : get_genes_by_biotype($arg)
Arg[1] : String
Function : Returns all genes of a specific biotype (like all simgw_100 or genscan genes)
Returnval : Arrayref of Bio::EnsEMBL::Gene objects
Name : get_biotypes_of_evidence_set($arg)
Arg[1] : String
Function : Returns all biotypes specific evidence set (like simgw_100 , genscan )
Returnval : Arrayref of Strings
Arg : Arrayref to array of array of transcripts
Function : removes redundant transcripts by comparing their exon-hashkeys
Returns : Arrayref to Array of Transcripts