Raw content of Bio::EnsEMBL::Analysis::Runnable::GeneBuilder
package Bio::EnsEMBL::Analysis::Runnable::GeneBuilder;
use strict;
use warnings;
use vars qw(@ISA);
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils qw(transfer_supporting_evidence Exon_info);
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw(Transcript_info print_Transcript_and_Exons);
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::GeneUtils qw (Gene_info);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
sub new{
my ($class,@args) = @_;
#print "In GeneBuilder constructor with super class" . ref($class) . "\n";
my $self = $class->SUPER::new(@args);
my($genes, $blessed_biotypes, $max_transcript_number,
$min_short_intron_len, $max_short_intron_len,$output_biotype) =
rearrange([qw(GENES BLESSED_BIOTYPES MAX_TRANSCRIPT_PER_CLUSTER
MIN_SHORT_INTRON_LEN MAX_SHORT_INTRON_LEN OUTPUT_BIOTYPE)], @args);
print "HERE ARE THE ARGS: ", join(" ",@args),"\n";
print "HERE I AM: ", $min_short_intron_len,"\t", $max_short_intron_len,"\n";
########################
###SETTING DEFAULTS#####
$self->max_transcript_number(10);
#$self->min_short_intron_len(7);
#$self->max_short_intron_len(15);
$self->output_biotype($self->analysis->logic_name);
#########################
$self->input_genes($genes);
$self->blessed_biotypes($blessed_biotypes);
$self->max_transcript_number($max_transcript_number);
$self->min_short_intron_len($min_short_intron_len);
$self->max_short_intron_len($max_short_intron_len);
$self->output_biotype($output_biotype);
#data sanity
warning("Strange running the Genebuilder without any genes")
if(!$genes || scalar(@$genes) == 0);
return $self;
}
sub input_genes{
my ($self, $arg) = @_;
if($arg){
$self->{'input_genes'} = $arg;
}
return $self->{'input_genes'};
}
sub blessed_biotypes{
my ($self, $arg) = @_;
if($arg){
$self->{'blessed_biotypes'} = $arg;
}
return $self->{'blessed_biotypes'};
}
sub max_transcript_number{
my ($self, $arg) = @_;
if(defined $arg){
$self->{'max_transcript_number'} = $arg;
}
return $self->{'max_transcript_number'};
}
sub min_short_intron_len{
my ($self, $arg) = @_;
if(defined $arg){
$self->{'min_short_intron'} = $arg;
}
return $self->{'min_short_intron'};
}
sub max_short_intron_len{
my ($self, $arg) = @_;
if(defined $arg){
$self->{'max_short_intron'} = $arg;
}
return $self->{'max_short_intron'};
}
sub output_biotype{
my ($self, $arg) = @_;
if($arg){
$self->{'output_biotype'} = $arg;
}
return $self->{'output_biotype'};
}
sub run{
my ($self) = @_;
my $transcripts = $self->get_Transcripts;
if(!$transcripts || @$transcripts == 0){
print "GeneBuilder seems to have no transcripts to cluster\n";
return;
}
#cluster transcripts
#print "Have ".@$transcripts." transcripts to build from\n";
my $transcript_clusters = $self->cluster_Transcripts($transcripts);
#print "Have ".@$transcript_clusters." transcript clusters\n";
#prune transcripts
my $pruned_transcripts = $self->prune_Transcripts($transcript_clusters);
#print "Have ".@$pruned_transcripts." pruned transcripts\n";
#first gene cluster
my $initial_genes = $self->cluster_into_Genes($pruned_transcripts);
#print "Have ".@$initial_genes." initial gene structures\n";
#prune redundant cds
$self->prune_redundant_CDS($initial_genes);
#prune redundant transcripts
$self->prune_redundant_transcripts($initial_genes);
#select best transcripts
my $best_transcripts = $self->select_best_transcripts($initial_genes);
#print "Have ".@$best_transcripts." best transcripts\n";
#final cluster of genes
my $final_genes = $self->cluster_into_Genes($best_transcripts);
my $pruned_genes = $self->make_shared_exons_unique($final_genes);
$self->output($pruned_genes);
#print "Have ".@$final_genes." final gene clusters\n";
#return output
}
sub get_Transcripts{
my ($self, $transcripts) = @_;
if($transcripts){
$self->{'transcripts'} = $transcripts;
}
if(!$self->{'transcripts'}){
foreach my $gene(@{$self->input_genes}){
foreach my $transcript(@{$gene->get_all_Transcripts}){
push(@{$self->{'transcripts'}}, $transcript);
}
}
}
return $self->{'transcripts'};
}
sub cluster_Transcripts{
my ($self, $transcripts) = @_;
my @forward;
my @reverse;
foreach my $transcript(@{$transcripts}){
if($transcript->strand == 1){
push(@forward, $transcript);
}else{
push(@reverse, $transcript);
}
}
my $forward_clusters = $self->cluster_Transcripts_by_genomic_range(\@forward);
my $reverse_clusters = $self->cluster_Transcripts_by_genomic_range(\@reverse);
my @clusters;
push(@clusters, @{$forward_clusters}) if($forward_clusters);
push(@clusters, @{$reverse_clusters}) if($reverse_clusters);
return \@clusters;
}
sub cluster_Transcripts_by_genomic_range{
my ($self, $transcripts) = @_;
if(!$transcripts || @$transcripts == 0){
return undef;
}
my @transcripts = sort { $a->start <=> $b->start ? $a->start <=> $b->start : $b->end <=> $a->end } @$transcripts;
my @clusters;
my $cluster_count = 0;
my @cluster_starts;
my @cluster_ends;
my $cluster = Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new();
$cluster->put_Transcripts(0, $transcripts[0]);
$cluster_starts[$cluster_count] = $transcripts[0]->start;
$cluster_ends[$cluster_count] = $transcripts[0]->end;
push(@clusters, $cluster);
for (my $c=1; $c<=$#transcripts; $c++){
#if the current transcript end is not less than the current cluster start
#or the current transcript start is not greater than the current cluster end
#add the current transcript to the current cluster
#readjust the coords
if ( !( $transcripts[$c]->end < $cluster_starts[$cluster_count] ||
$transcripts[$c]->start > $cluster_ends[$cluster_count] ) ){
$cluster->put_Transcripts(0, $transcripts[$c] );
# re-adjust size of cluster
if ($transcripts[$c]->start < $cluster_starts[$cluster_count]) {
$cluster_starts[$cluster_count] = $transcripts[$c]->start;
}
if ( $transcripts[$c]->end > $cluster_ends[$cluster_count]) {
$cluster_ends[$cluster_count] = $transcripts[$c]->end;
}
}else{
#here the transcripts don't overlap so a new cluster is created
#this involves creating a new object, incrementing the cluster count
#and setting up new cluster start and ends
$cluster_count++;
$cluster = Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new();
$cluster->put_Transcripts(0, $transcripts[$c] );
$cluster_starts[$cluster_count] = $transcripts[$c]->start;
$cluster_ends[$cluster_count] = $transcripts[$c]->end;
# store it in the list of clusters
push(@clusters,$cluster);
}
}
return \@clusters;
}
sub prune_Transcripts {
my ($self, $transcript_clusters) = @_;
my @newtran;
my %blessed_genetypes = %{$self->blessed_biotypes};;
my $cluster_count = 0;
CLUSTER:
foreach my $transcript_cluster ( @$transcript_clusters ){
$cluster_count++;
#print STDERR "Cluster $cluster_count\n";
my $mytranscripts = $transcript_cluster->get_Transcripts;
########################
#
# sort the transcripts
#
########################
my @transcripts = @{$self->bin_sort_transcripts( $mytranscripts )};
##############################
#
# deal with single exon genes
#
##############################
# do we really just want to take the first transcript only? What about supporting
# evidence from other transcripts?
# also, if there's a very long single exon gene we will lose any underlying
# multi-exon transcripts
# this may increase problems with the loss of valid single exon genes as mentioned
# below. it's a balance between keeping multi exon transcripts and losing
# single exon ones
my $maxexon_number = 0;
foreach my $t (@transcripts){
if ( scalar(@{$t->get_all_Exons}) > $maxexon_number ){
$maxexon_number = scalar(@{$t->get_all_Exons});
}
}
if ($maxexon_number == 1){
# take the longest:
@transcripts = map { $_->[1] } sort { $b->[0]->length <=> $a->[0]->length } map{ [ $_->start_Exon, $_ ] } @transcripts;
my $tran = shift( @transcripts );
push (@newtran, $tran);
my @es = @{$tran->get_all_Exons};
my $e = $es[0];
foreach my $transcript (@transcripts){
# make sure we keep it if it's blessed
if(exists $blessed_genetypes{$transcript->type}){
push(@newtran, $transcript);
}
else{
foreach my $exon ( @{$transcript->get_all_Exons} ){
transfer_supporting_evidence($exon, $e);
}
}
}
next CLUSTER;
}
# otherwise we need to deal with multi exon transcripts and reject duplicates.
# links each exon in the transcripts of this cluster with a hash of other exons
#it is paired with
my %pairhash;
# allows retrieval of exon objects by exon->id - convenience
my %exonhash;
# keep track of single exon transcripts (automatically rejected if the "top"
# transcript is multi exon),
# so we can check their supporting evidence at the very end.
my %single_exon_rejects;
##############################
#
# prune redundant transcripts
#
##############################
TRANSCRIPT:
foreach my $tran (@transcripts) {
my @exons = @{$tran->get_all_Exons};
my $i = 0;
my $found = 1;
# if this transcript has already been seen, this
# will be used to transfer supporting evidence
my @evidence_pairs;
# 10.1.2002 VAC we know there's a potential problem here - single exon
# transcripts which are in a cluster where the longest transcriopt has > 1
# exon are not going to be considered in this loop, so they'll always be
# marked "transcript already seen" How to sort them out? If the single exon
# overlaps an exon in a multi exon transcript then by our rules it probably
# ought to be rejected the same way transcripts with shared exon-pairs are.
# 27/5/2003 VAC But the supporting evidence should be transferred if we can!
# Tough one.
# more of a problem is that if the transcript with the largest number of exons
# is really a single exon with frameshifts, it will get rejected here based on
# intron size but in addition any valid non-frameshifted single exon transcripts
# will get rejected - which is definitely not right.We need code to represent
# frameshifted exons more sensibly so the frameshifted one doesn't get through
#the check for single exon genes above.
EXONS:
for ($i = 0; $i < $#exons; $i++) {
my $foundpair = 0;
my $exon1 = $exons[$i];
my $exon2 = $exons[$i+1];
# Only count introns > 50 bp as real introns
my $intron;
if ($exon1->strand == 1) {
$intron = abs($exon2->start - $exon1->end - 1);
}
else {
$intron = abs($exon1->start - $exon2->end - 1);
}
if ($intron < $self->max_short_intron_len &&
$intron > $self->min_short_intron_len ) {
print STDERR "Intron too short: $intron bp. Transcript will be rejected\n";
$foundpair = 1;
# this pair will not be compared with other transcripts
} else {
# go through the exon pairs already stored in %pairhash.
# If there is a pair whose exon1 overlaps this exon1, and
# whose exon2 overlaps this exon2, then these two transcripts are paired
foreach my $first_exon_id (keys %pairhash) {
my $first_exon = $exonhash{$first_exon_id};
foreach my $second_exon_id (keys %{$pairhash{$first_exon}}) {
my $second_exon = $exonhash{$second_exon_id};
if ( $exon1->overlaps($first_exon) && $exon2->overlaps($second_exon) ) {
$foundpair = 1;
# eae: this method allows a transcript to be covered by exon pairs
# from different transcripts, rejecting possible
# splicing variants. Needs rethinking
# we put first the exon from the transcript being tested:
push( @evidence_pairs, [ $exon1 , $first_exon ] );
push( @evidence_pairs, [ $exon2 , $second_exon ] );
transfer_supporting_evidence($exon1, $first_exon);
transfer_supporting_evidence($first_exon, $exon1);
transfer_supporting_evidence($exon2, $second_exon);
transfer_supporting_evidence($second_exon, $exon2);
}
}
}
}
if ($foundpair == 0) {
# ie this exon pair does not overlap with a pair yet found in another
# transcript
$found = 0;
# ie currently this transcript is not paired with another
# store the exons so they can be retrieved by id
$exonhash{$exon1} = $exon1;
$exonhash{$exon2} = $exon2;
# store the pairing between these 2 exons
$pairhash{$exon1}{$exon2} = 1;
}
} # end of EXONS
# decide whether this is a new transcript or whether it has already been seen
# if it's blessed, we keep it and there's nothing more to do
if(exists $blessed_genetypes{$tran->type}){
push (@newtran, $tran);
}
elsif ($found == 0) {
push(@newtran,$tran);
@evidence_pairs = ();
} elsif ($found == 1 && $#exons == 0){
# save the transcript and check though at the end to see if we can transfer
# supporting evidence; if we try it now we may not (yet) have any exons that
# overlap in %exonhash
$single_exon_rejects{$tran} = $tran;
} else {
if ( $tran == $transcripts[0] ){
print STDERR "Strange, this is the first transcript in the cluster!\n";
}
## transfer supporting feature data. We transfer it to exons
foreach my $pair ( @evidence_pairs ){
my @pair = @$pair;
# first in the pair is the 'already seen' exon
my $source_exon = $pair[0];
my $target_exon = $pair[1];
transfer_supporting_evidence($source_exon, $target_exon)
}
}
} # end of this transcript
# check to see if we can transfer evidence from rejected single exon transcripts
foreach my $reject(values %single_exon_rejects){
my @exons = @{$reject->get_all_Exons};
# is there any supporting evidence?
my @sf = @{$exons[0]->get_all_supporting_features};
foreach my $stored_exon(values %exonhash){
if($exons[0]->overlaps($stored_exon)){
# note that we could end up with bizarre situations of a single exon
# transcript overlapping two exons in a multi exon transcript, so the
# supporting evidence would be transferred in entirety to both exons.
transfer_supporting_evidence($exons[0], $stored_exon);
}
}
}
} #end CLUSTER
return \@newtran;
}
sub cluster_into_Genes{
my ($self, $transcripts_unsorted) = @_;
my $num_trans = scalar(@$transcripts_unsorted);
my @transcripts = sort { $a->start <=> $b->start ? $a->start <=> $b->start : $b->end <=> $a->end } @$transcripts_unsorted;
my @clusters;
# clusters transcripts by whether or not any exon overlaps with an exon in
# another transcript (came from original prune in GeneBuilder)
foreach my $tran (@transcripts) {
my @matching_clusters;
CLUSTER:
foreach my $cluster (@clusters) {
foreach my $cluster_transcript (@$cluster) {
if ($tran->end >= $cluster_transcript->start &&
$tran->start <= $cluster_transcript->end) {
foreach my $exon1 (@{$tran->get_all_Exons}) {
foreach my $cluster_exon (@{$cluster_transcript->get_all_Exons}) {
if ($exon1->overlaps($cluster_exon)
&& $exon1->strand == $cluster_exon->strand) {
push (@matching_clusters, $cluster);
next CLUSTER;
}
}
}
}
}
}
if (scalar(@matching_clusters) == 0) {
my @newcluster;
push(@newcluster,$tran);
push(@clusters,\@newcluster);
} elsif (scalar(@matching_clusters) == 1) {
push @{$matching_clusters[0]}, $tran;
} else {
# Merge the matching clusters into a single cluster
my @new_clusters;
my @merged_cluster;
foreach my $clust (@matching_clusters) {
push @merged_cluster, @$clust;
}
push @merged_cluster, $tran;
push @new_clusters,\@merged_cluster;
# Add back non matching clusters
foreach my $clust (@clusters) {
my $found = 0;
MATCHING:
foreach my $m_clust (@matching_clusters) {
if ($clust == $m_clust) {
$found = 1;
last MATCHING;
}
}
if (!$found) {
push @new_clusters,$clust;
}
}
@clusters = @new_clusters;
}
}
# safety and sanity checks
$self->check_Clusters(scalar(@transcripts), \@clusters);
# make and store genes
my @genes;
foreach my $cluster(@clusters){
my $count = 0;
my $gene = new Bio::EnsEMBL::Gene;
$gene->biotype($self->output_biotype);
foreach my $transcript (@$cluster){
$gene->add_Transcript($transcript);
}
push( @genes, $gene );
}
return \@genes;
}
sub bin_sort_transcripts{
my ($self,$transcripts) = @_;
my %lengths;
my $max_num_exons = 0;
# sizehash holds transcript length - based on sum of exon lengths
my %sizehash;
# orfhash holds orf length - based on sum of translateable exon lengths
my %orfhash;
my %tran2orf;
my %tran2length;
# keeps track of to which transcript(s) this exon belong
my %exon2transcript;
foreach my $tran (@$transcripts) {
# keep track of number of exons in multiexon transcripts
my @exons = @{ $tran->get_all_Exons };
if(scalar(@exons) > $max_num_exons){
$max_num_exons = scalar(@exons);
}
# total exon length
my $length = 0;
foreach my $e ( @exons ){
$length += $e->end - $e->start + 1;
push ( @{ $exon2transcript{ $e } }, $tran );
}
$sizehash{$tran} = $length;
$tran2length{ $tran } = $length;
# now for ORF length
$length = 0;
foreach my $e(@{$tran->get_all_translateable_Exons}){
$length += $e->end - $e->start + 1;
}
$tran2orf{ $tran } = $length;
push(@{$orfhash{$length}}, $tran);
}
# VAC 15/02/2002 sort transcripts based on total exon length - this
# introduces a problem - we can (and have) masked good transcripts
# with long translations in favour of transcripts with shorter
# translations and long UTRs that are overall slightly longer. This
# is not good.
# better way? hold both total exon length and length of translateable exons.
# Then sort: long translation + UTR > long translation no UTR > short translation
# + UTR > short translation no UTR
##########
my @transcripts = ();
##########
my @orflengths = sort {$b <=> $a} (keys %orfhash);
# strict sort by translation length is just as wrong as strict sort by UTR length
# bin translation lengths - 4 bins (based on 25% length diff)? 10 bins
# (based on 10%)?
my %orflength_bin;
my $numbins = 4;
my $currbin = 1;
foreach my $orflength(@orflengths){
last if $currbin > $numbins;
my $percid = ($orflength*100)/$orflengths[0];
if ($percid > 100) {
$percid = 100;
}
my $currthreshold = $currbin * (100/$numbins);
$currthreshold = 100 - $currthreshold;
if($percid <$currthreshold) {
$currbin++;
}
my @tmp = @{$orfhash{$orflength}};
push(@{$orflength_bin{$currbin}}, @{$orfhash{$orflength}});
}
# now, foreach bin in %orflengthbin, sort by exonlength
$currbin = 1;
EXONLENGTH_SORT:
while( $currbin <= $numbins){
if(!defined $orflength_bin{$currbin} ){
$currbin++;
next EXONLENGTH_SORT;
}
my @sorted_transcripts = sort { $sizehash{$b} <=> $sizehash{$a} }
@{$orflength_bin{$currbin}};
push(@transcripts, @sorted_transcripts);
$currbin++;
}
return \@transcripts;
}
sub check_Clusters{
my ($self, $num_transcripts, $clusters) = @_;
#Safety checks
my $ntrans = 0;
my $cluster_num = 0;
my %trans_check_hash;
foreach my $cluster (@$clusters) {
$ntrans += scalar(@$cluster);
foreach my $trans (@$cluster) {
if (defined($trans_check_hash{$trans})) {
throw("Transcript " . $trans->dbID ." ".$trans->adaptor->dbc->dbname.
" added twice to clusters\n");
}
$trans_check_hash{$trans} = 1;
}
if (!scalar(@$cluster)) {
throw("Empty cluster");
}
}
if ($ntrans != $num_transcripts) {
throw("Not all transcripts have been added into clusters $ntrans and " . $num_transcripts. " \n");
}
#end safety checks
return;
}
sub prune_redundant_CDS {
my ( $self, $genes ) = @_;
my $nremoved = 0;
my %blessed_genetypes = %{$self->blessed_biotypes};
# For each gene
foreach my $gene (@$genes) {
my @trans_with_utrs;
my @trans_without_utrs;
# Separate into ones with UTRs and ones without
foreach my $trans (@{$gene->get_all_Transcripts}) {
$trans->sort;
my @exons = @{$trans->get_all_Exons};
if ($trans->translation) {
if ($trans->translation->start_Exon == $exons[0] &&
$trans->translation->start == 1 &&
$trans->translation->end_Exon == $exons[$#exons] &&
$trans->translation->end == $exons[$#exons]->length) {
push @trans_without_utrs, $trans;
} else {
push @trans_with_utrs, $trans;
}
} else {
$self->warn("No translation for transcript");
}
}
# Generate CDS exons just once (saves CPU time)
my %cds_exon_hash;
foreach my $trans (@{$gene->get_all_Transcripts}) {
if ($trans->translation) {
my @cds_exons = @{$trans->get_all_translateable_Exons};
$cds_exon_hash{$trans} = \@cds_exons;
}
}
# Compare CDSs of ones with UTRs to CDSs of ones without
foreach my $utr_trans (@trans_with_utrs) {
my $utr_trans_cds_exons = $cds_exon_hash{$utr_trans};
CDS_TRANS: foreach my $cds_trans (@trans_without_utrs) {
my $cds_trans_cds_exons = $cds_exon_hash{$cds_trans};
if (scalar(@$cds_trans_cds_exons) != scalar(@$utr_trans_cds_exons)) {
# print "Different numbers of exons\n";
next CDS_TRANS;
}
my @cds_exons = @$cds_trans_cds_exons;
foreach my $utr_trans_exon (@$utr_trans_cds_exons) {
my $cds_trans_exon = shift @cds_exons;
if ($cds_trans_exon->start != $utr_trans_exon->start ||
$cds_trans_exon->end != $utr_trans_exon->end ||
$cds_trans_exon->strand != $utr_trans_exon->strand) {
next CDS_TRANS;
}
}
if ( exists $blessed_genetypes{$cds_trans->biotype} &&
! exists $blessed_genetypes{$utr_trans->biotype}) {
# Hack to make sure transcript gets through if its like a blessed one
$utr_trans->biotype($cds_trans->biotype);
}
$nremoved++;
$self->remove_transcript_from_gene($gene,$cds_trans);
}
}
#Get the number of transcript in the @trans_with_utrs
my $num_of_utr_trans = scalar(@trans_with_utrs);
# Extra paranoid test to check if any similarity overlaps targetted models and
# get rid of them
}
}
sub remove_transcript_from_gene {
my ($self, $gene, $trans_to_del) = @_;
my @newtrans;
foreach my $trans (@{$gene->get_all_Transcripts}) {
if ($trans != $trans_to_del) {
push @newtrans,$trans;
}
}
$gene->{_transcript_array} = [];
foreach my $trans (@newtrans) {
$gene->add_Transcript($trans);
}
return scalar(@newtrans);
}
sub prune_redundant_transcripts {
my ($self,$genes) = @_;
my $nremoved = 0;
my %blessed_genetypes = %{$self->blessed_biotypes};
# For each gene
foreach my $gene (@$genes) {
# sort transcripts by total cdna length (apologies for the complexity of this line
# its done so as only to calculate cdna length once for each transcript
my @sorted_transcripts = map { $_->[1] } sort { $b->[0] <=> $a->[0] } map { [$_->length, $_] } @{$gene->get_all_Transcripts};
# so now longer UTR transcripts (with same introns will come first)
# Now generate the sorted (low to high) exon arrays and put in a hash (for speed)
# and cds start and end (genomic) hashes (for speed again)
my %sorted_exon_hash;
my %cds_start_hash;
my %cds_end_hash;
foreach my $trans (@sorted_transcripts) {
my @exons = sort {$a->start <=> $b->start} @{$trans->get_all_Exons};
$sorted_exon_hash{$trans} = \@exons;
if ($trans->translation) {
$cds_start_hash{$trans} = $trans->coding_region_start;
$cds_end_hash{$trans} = $trans->coding_region_end;
}
}
# We're going to generate a list of transcripts to remove from the gene
# and then remove them at the end
my @trans_to_remove;
# for transcript
for (my $i=0; $itranslation);
my @exons = @{$sorted_exon_hash{$trans}};
# for every other transcript
COMPTRANS:
for (my $j=$i+1; $jtranslation);
next if ($trans == $comp_trans);
next if ($cds_start_hash{$trans} !=
$cds_start_hash{$comp_trans});
next if ($cds_end_hash{$trans} != $cds_end_hash{$comp_trans});
my @comp_exons = @{$sorted_exon_hash{$comp_trans}};
if (scalar(@comp_exons) != scalar(@exons)){
next;
}
# special case for single exon
if (scalar(@exons) == 1) {
if ( exists $blessed_genetypes{$comp_trans->biotype} &&
! exists $blessed_genetypes{$trans->biotype}) {
#hack to ensure blessed biotypes are maintained
$trans->biotype($comp_trans->biotype);
}
$sorted_transcripts[$j] = undef;
push @trans_to_remove, $comp_trans;
} else {
for (my $k=0; $kend != $comp_exons[$k]->end ||
$exons[$k+1]->start != $comp_exons[$k+1]->start) {
#means exons are unique
next COMPTRANS;
}
}
if ( exists $blessed_genetypes{$comp_trans->biotype} &&
! exists $blessed_genetypes{$trans->biotype}) {
# Hack to make sure transcript gets through if its like a blessed one
$trans->biotype($comp_trans->biotype);
}
$sorted_transcripts[$j] = undef;
push @trans_to_remove, $comp_trans;
}
}
}
foreach my $trans (@trans_to_remove) {
$nremoved++;
$self->remove_transcript_from_gene($gene,$trans);
}
}
}
sub select_best_transcripts{
my ( $self, $genes ) = @_;
my @selected_transcripts;
# make sure we don't discard blessed transcripts
my %blessed_genetypes = %{$self->blessed_biotypes};
GENE:
foreach my $gene ( @$genes ){
# sort the transcripts, get the longest CDS + UTR first (like in
# prune_Transcripts(); blessed transcripts come at the top )
my $sorted_transcripts =
$self->bin_sort_transcripts( $gene->get_all_Transcripts );
my $count = 0;
TRAN:
foreach my $transcript( @$sorted_transcripts ){
$count++;
unless (exists $blessed_genetypes{$transcript->biotype}){
next TRAN if ($count > $self->max_transcript_number);
}
push ( @selected_transcripts, $transcript );
}
}
return \@selected_transcripts;
}
sub make_shared_exons_unique{
my ($self, $genes) = @_;
my @pruned;
foreach my $gene(@$genes){
my $new_gene = $self->prune_Exons($gene);
push(@pruned, $new_gene);
}
return \@pruned;
}
sub prune_Exons{
my ($self, $gene) = @_;
my @unique_Exons;
foreach my $tran (@{$gene->get_all_Transcripts}) {
my @newexons;
foreach my $exon (@{$tran->get_all_Exons}) {
my $found;
#always empty
UNI:foreach my $uni (@unique_Exons) {
if ($uni->start == $exon->start &&
$uni->end == $exon->end &&
$uni->strand == $exon->strand &&
$uni->phase == $exon->phase &&
$uni->end_phase == $exon->end_phase
) {
$found = $uni;
last UNI;
}
}
if (defined($found)) {
push(@newexons,$found);
if ($exon == $tran->translation->start_Exon){
$tran->translation->start_Exon($found);
}
if ($exon == $tran->translation->end_Exon){
$tran->translation->end_Exon($found);
}
} else {
push(@newexons,$exon);
push(@unique_Exons, $exon);
}
}
$tran->flush_Exons;
foreach my $exon (@newexons) {
$tran->add_Exon($exon);
}
}
return $gene;
}
1;