GeneBuilder documentation.

No package variables defined.

sub bin_sort_transcripts {

  my ($self,$transcripts) = @_;

  my %lengths;
  my $max_num_exons = 0;

  # sizehash holds transcript length - based on sum of exon lengths
  my %sizehash;

  # orfhash holds orf length - based on sum of translateable exon lengths
  my %orfhash;

  my %tran2orf;
  my %tran2length;

  # keeps track of to which transcript(s) this exon belong
  my %exon2transcript;

  foreach my $tran (@$transcripts) {

    # keep track of number of exons in multiexon transcripts
    my @exons = @{ $tran->get_all_Exons };
    if(scalar(@exons) > $max_num_exons){
      $max_num_exons = scalar(@exons);
    }

    # total exon length
    my $length = 0;
    foreach my $e ( @exons ){
      $length += $e->end - $e->start + 1;
      push ( @{ $exon2transcript{ $e } }, $tran );
    }
    $sizehash{$tran} = $length;
    $tran2length{ $tran } = $length;

    # now for ORF length
    $length = 0;
    foreach my $e(@{$tran->get_all_translateable_Exons}){
      $length += $e->end - $e->start + 1;
    }
    $tran2orf{ $tran } = $length;
    push(@{$orfhash{$length}}, $tran);
  }

  # VAC 15/02/2002 sort transcripts based on total exon length - this
  # introduces a problem - we can (and have) masked good transcripts
  # with long translations in favour of transcripts with shorter
  # translations and long UTRs that are overall slightly longer. This
  # is not good.
  # better way? hold both total exon length and length of translateable exons. 
  # Then sort: long translation + UTR > long translation no UTR > short translation 
  # + UTR > short translation no UTR
  ##########
  my @transcripts = ();
  ##########

  my @orflengths = sort {$b <=> $a} (keys %orfhash);

  # strict sort by translation length is just as wrong as strict sort by UTR length
  # bin translation lengths - 4 bins (based on 25% length diff)? 10 bins 
  # (based on 10%)?
  my %orflength_bin;
  my $numbins = 4;
  my $currbin = 1;

  foreach my $orflength(@orflengths){
    last if $currbin > $numbins;
    my $percid = ($orflength*100)/$orflengths[0];
    if ($percid > 100) { 
      $percid = 100; 
    }
    my $currthreshold = $currbin * (100/$numbins);
    $currthreshold    = 100 - $currthreshold;

    if($percid <$currthreshold) { 
      $currbin++; 
    }
    my @tmp = @{$orfhash{$orflength}};
    push(@{$orflength_bin{$currbin}}, @{$orfhash{$orflength}});
  }

  # now, foreach bin in %orflengthbin, sort by exonlength
  $currbin = 1;
 EXONLENGTH_SORT:
  while( $currbin <= $numbins){
    if(!defined $orflength_bin{$currbin} ){
      $currbin++;
      next EXONLENGTH_SORT;
    }	
    my @sorted_transcripts = sort { $sizehash{$b} <=> $sizehash{$a} } 
      @{$orflength_bin{$currbin}};
    push(@transcripts, @sorted_transcripts);
    $currbin++;
  }

  return\@ transcripts;

}

sub blessed_biotypes {

  my ($self, $arg) = @_;
  if($arg){
    $self->{'blessed_biotypes'} = $arg;
  }
  return $self->{'blessed_biotypes'};

}

sub check_Clusters {

  my ($self, $num_transcripts, $clusters) = @_;
  #Safety checks
  my $ntrans = 0;

  my $cluster_num = 0;

  my %trans_check_hash;
  foreach my $cluster (@$clusters) {
    $ntrans += scalar(@$cluster);

    foreach my $trans (@$cluster) {

      if (defined($trans_check_hash{$trans})) {
        throw("Transcript " . $trans->dbID ." ".$trans->adaptor->dbc->dbname. 
              " added twice to clusters\n");
      }
      $trans_check_hash{$trans} = 1;
    }
    if (!scalar(@$cluster)) {
      throw("Empty cluster");
    }
  }
  if ($ntrans != $num_transcripts) {
    throw("Not all transcripts have been added into clusters $ntrans and " . $num_transcripts. "\n ");
  }
  #end safety checks
  return;

}

sub cluster_Transcripts {

  my ($self, $transcripts) = @_;
  my @forward;
  my @reverse;
  foreach my $transcript(@{$transcripts}){
    if($transcript->strand == 1){
      push(@forward, $transcript);
    }else{
      push(@reverse, $transcript);
    }
  }
  my $forward_clusters = $self->cluster_Transcripts_by_genomic_range(\@forward);
  my $reverse_clusters = $self->cluster_Transcripts_by_genomic_range(\@reverse);
  my @clusters;
  push(@clusters, @{$forward_clusters}) if($forward_clusters);
  push(@clusters, @{$reverse_clusters}) if($reverse_clusters);
  return\@ clusters;

}

sub cluster_Transcripts_by_genomic_range {

  my ($self, $transcripts) = @_;
  if(!$transcripts || @$transcripts == 0){
    return undef;
  }
  my @transcripts = sort { $a->start <=> $b->start ? $a->start <=> $b->start : $b->end <=> $a->end } @$transcripts;

  my @clusters;
  my $cluster_count = 0;
  my @cluster_starts;
  my @cluster_ends;
  
  my $cluster = Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new();
  $cluster->put_Transcripts(0, $transcripts[0]);

  $cluster_starts[$cluster_count] = $transcripts[0]->start;
  $cluster_ends[$cluster_count] = $transcripts[0]->end;

  push(@clusters, $cluster);
  for (my $c=1; $c<=$#transcripts; $c++){
    #if the current transcript end is not less than the current cluster start
    #or the current transcript start is not greater than the current cluster end
    #add the current transcript to the current cluster
    #readjust the coords
    if ( !( $transcripts[$c]->end < $cluster_starts[$cluster_count] ||
	    $transcripts[$c]->start > $cluster_ends[$cluster_count] ) ){
      $cluster->put_Transcripts(0, $transcripts[$c] );
      
      # re-adjust size of cluster
      if ($transcripts[$c]->start < $cluster_starts[$cluster_count]) {
	$cluster_starts[$cluster_count] = $transcripts[$c]->start;
      }
      if ( $transcripts[$c]->end > $cluster_ends[$cluster_count]) {
	$cluster_ends[$cluster_count] =  $transcripts[$c]->end;
      }
    }else{
      #here the transcripts don't overlap so a new cluster is created
      #this involves creating a new object, incrementing the cluster count
      #and setting up new cluster start and ends
      $cluster_count++;
      $cluster = Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new();
      $cluster->put_Transcripts(0, $transcripts[$c] );
      $cluster_starts[$cluster_count] = $transcripts[$c]->start;
      $cluster_ends[$cluster_count]   = $transcripts[$c]->end;
      
      # store it in the list of clusters
      push(@clusters,$cluster);
    }
  }
  return\@ clusters;

}

sub cluster_into_Genes {

  my ($self, $transcripts_unsorted) = @_;
  my $num_trans = scalar(@$transcripts_unsorted);
  my @transcripts = sort { $a->start <=> $b->start ? $a->start <=> $b->start  : $b->end <=> $a->end } @$transcripts_unsorted;

  my @clusters;
  # clusters transcripts by whether or not any exon overlaps with an exon in 
  # another transcript (came from original prune in GeneBuilder)
  foreach my $tran (@transcripts) {
    my @matching_clusters;
  CLUSTER: 
    foreach my $cluster (@clusters) {
      foreach my $cluster_transcript (@$cluster) {
        if ($tran->end  >= $cluster_transcript->start &&
            $tran->start <= $cluster_transcript->end) {
          
          foreach my $exon1 (@{$tran->get_all_Exons}) {
            foreach my $cluster_exon (@{$cluster_transcript->get_all_Exons}) {
              if ($exon1->overlaps($cluster_exon) 
                  && $exon1->strand == $cluster_exon->strand) {
                push (@matching_clusters, $cluster);
                next CLUSTER;
              }
            }
          }
        }
      }
    }

    if (scalar(@matching_clusters) == 0) {
      my @newcluster;
      push(@newcluster,$tran);
      push(@clusters,\@newcluster);
    } elsif (scalar(@matching_clusters) == 1) {
      push @{$matching_clusters[0]}, $tran;
    } else {
      # Merge the matching clusters into a single cluster
      my @new_clusters;
      my @merged_cluster;
      foreach my $clust (@matching_clusters) {
        push @merged_cluster, @$clust;
      }
      push @merged_cluster, $tran;
      push @new_clusters,\@merged_cluster;
      # Add back non matching clusters
      foreach my $clust (@clusters) {
        my $found = 0;
      MATCHING: 
	foreach my $m_clust (@matching_clusters) {
          if ($clust == $m_clust) {
            $found = 1;
            last MATCHING;
          }
        }
        if (!$found) {
          push @new_clusters,$clust;
        }
      }
      @clusters =  @new_clusters;
    }
  }
  # safety and sanity checks
  $self->check_Clusters(scalar(@transcripts),\@ clusters);
  
  # make and store genes
  my @genes;
  foreach my $cluster(@clusters){
    my $count = 0;
    my $gene = new Bio::EnsEMBL::Gene;
    $gene->biotype($self->output_biotype);
    foreach my $transcript (@$cluster){
      $gene->add_Transcript($transcript);
    }
    push( @genes, $gene );
  }
  return\@ genes;

}

sub get_Transcripts {

  my ($self, $transcripts) = @_;
  if($transcripts){
    $self->{'transcripts'} = $transcripts;
  }
  if(!$self->{'transcripts'}){
    foreach my $gene(@{$self->input_genes}){
      foreach my $transcript(@{$gene->get_all_Transcripts}){
        push(@{$self->{'transcripts'}}, $transcript);
      }
    }
  }
  return $self->{'transcripts'};

}

sub input_genes {

  my ($self, $arg) = @_;
  if($arg){
    $self->{'input_genes'} = $arg;
  }
  return $self->{'input_genes'};

}

sub make_shared_exons_unique {

  my ($self, $genes) = @_;
  my @pruned;
  foreach my $gene(@$genes){
    my $new_gene = $self->prune_Exons($gene);
    push(@pruned, $new_gene);
  }
  return\@ pruned;

}

sub max_short_intron_len {

  my ($self, $arg) = @_;
  if(defined $arg){
    $self->{'max_short_intron'} = $arg;
  }
  return $self->{'max_short_intron'};

}

sub max_transcript_number {

  my ($self, $arg) = @_;
  if(defined $arg){
    $self->{'max_transcript_number'} = $arg;
  }
  return $self->{'max_transcript_number'};

}

sub min_short_intron_len {

  my ($self, $arg) = @_;
  if(defined $arg){
    $self->{'min_short_intron'} = $arg;
  }
  return $self->{'min_short_intron'};

}

sub new {

  my ($class,@args) = @_;
  #print "In GeneBuilder constructor with super class" . ref($class) . "\n";
  my $self = $class->SUPER::new(@args);
  my($genes, $blessed_biotypes, $max_transcript_number,
     $min_short_intron_len, $max_short_intron_len,$output_biotype) = 
    rearrange([qw(GENES BLESSED_BIOTYPES MAX_TRANSCRIPT_PER_CLUSTER
                  MIN_SHORT_INTRON_LEN MAX_SHORT_INTRON_LEN OUTPUT_BIOTYPE)], @args);
  print "HERE ARE THE ARGS: ", join("  ",@args),"\n";
  print "HERE I AM: ", $min_short_intron_len,"\t", $max_short_intron_len,"\n";

  ########################
  ###SETTING DEFAULTS#####
  $self->max_transcript_number(10);
  #$self->min_short_intron_len(7);
  #$self->max_short_intron_len(15);
  $self->output_biotype($self->analysis->logic_name);
  #########################

  $self->input_genes($genes);
  $self->blessed_biotypes($blessed_biotypes);
  $self->max_transcript_number($max_transcript_number);
  $self->min_short_intron_len($min_short_intron_len);
  $self->max_short_intron_len($max_short_intron_len);
  $self->output_biotype($output_biotype);

  #data sanity
  warning("Strange running the Genebuilder without any genes") 
    if(!$genes || scalar(@$genes) == 0);

  return $self;

}

sub output_biotype {

  my ($self, $arg) = @_;
  if($arg){
    $self->{'output_biotype'} = $arg;
  }
  return $self->{'output_biotype'};

}

sub prune_Exons {

  my ($self, $gene) = @_;
  my @unique_Exons;
  foreach my $tran (@{$gene->get_all_Transcripts}) {
    my @newexons;
    foreach my $exon (@{$tran->get_all_Exons}) {
      my $found;
      #always empty
    UNI:foreach my $uni (@unique_Exons) {
	if ($uni->start  == $exon->start  &&
	    $uni->end    == $exon->end    &&
	    $uni->strand == $exon->strand &&
	    $uni->phase  == $exon->phase  &&
	    $uni->end_phase == $exon->end_phase
	   ) {
	  $found = $uni;
	  last UNI;
	}
      }
      if (defined($found)) {
	push(@newexons,$found);
	if ($exon == $tran->translation->start_Exon){
	  $tran->translation->start_Exon($found);
	}
	if ($exon == $tran->translation->end_Exon){
	  $tran->translation->end_Exon($found);
	}
      } else {
	push(@newexons,$exon);
	push(@unique_Exons, $exon);
      }
    }          
    $tran->flush_Exons;
    foreach my $exon (@newexons) {
      $tran->add_Exon($exon);
    }
  }
  return $gene;
}

1;

}

sub prune_Transcripts {

  my ($self, $transcript_clusters) = @_;
  my @newtran;

  my %blessed_genetypes = %{$self->blessed_biotypes};;

  my $cluster_count = 0;
  
 CLUSTER:
  foreach my $transcript_cluster ( @$transcript_clusters ){
    $cluster_count++;
    #print STDERR "Cluster $cluster_count\n";
    my $mytranscripts = $transcript_cluster->get_Transcripts;
   
    ########################
    #
    # sort the transcripts
    #
    ########################
    my @transcripts = @{$self->bin_sort_transcripts( $mytranscripts )};
  
    ##############################
    #
    # deal with single exon genes
    #
    ##############################

    # do we really just want to take the first transcript only? What about supporting 
    # evidence from other transcripts?
    # also, if there's a very long single exon gene we will lose any underlying 
    # multi-exon transcripts
    # this may increase problems with the loss of valid single exon genes as mentioned
    # below. it's a balance between keeping multi exon transcripts and losing 
    # single exon ones
   
    my $maxexon_number = 0;
    foreach my $t (@transcripts){
      if ( scalar(@{$t->get_all_Exons}) > $maxexon_number ){
	$maxexon_number = scalar(@{$t->get_all_Exons});
      }
    }
    if ($maxexon_number == 1){
      # take the longest:
      @transcripts = map { $_->[1] } sort { $b->[0]->length <=> $a->[0]->length } map{ [ $_->start_Exon, $_ ] } @transcripts;
      my $tran = shift( @transcripts );
      push (@newtran, $tran);
      my @es = @{$tran->get_all_Exons};
      my $e  = $es[0];
      foreach my $transcript (@transcripts){
	# make sure we keep it if it's blessed
	if(exists $blessed_genetypes{$transcript->type}){
	  push(@newtran, $transcript);
	}
	else{
	  foreach my $exon ( @{$transcript->get_all_Exons} ){
            transfer_supporting_evidence($exon, $e);
	  }
	}
      }
      next CLUSTER;
    }
    # otherwise we need to deal with multi exon transcripts and reject duplicates.
    # links each exon in the transcripts of this cluster with a hash of other exons 
    #it is paired with
    my %pairhash;

    # allows retrieval of exon objects by exon->id - convenience
    my %exonhash;

    # keep track of single exon transcripts (automatically rejected if the "top" 
    # transcript is multi exon), 
    # so we can check their supporting evidence at the very end.
    my %single_exon_rejects;    

    ##############################
    #
    # prune redundant transcripts
    #
    ##############################
  TRANSCRIPT:
    foreach my $tran (@transcripts) {
      my @exons = @{$tran->get_all_Exons};

      my $i     = 0;
      my $found = 1;
      # if this transcript has already been seen, this
      # will be used to transfer supporting evidence
      my @evidence_pairs;

      # 10.1.2002 VAC we know there's a potential problem here - single exon 
      # transcripts which are in a  cluster where the longest transcriopt has > 1 
      # exon are not going to be considered in this loop, so they'll always be 
      # marked "transcript already seen" How to sort them out? If the single exon 
      # overlaps an exon in a multi exon transcript then by our rules it probably 
      # ought to be rejected the same way transcripts with shared exon-pairs are.
      # 27/5/2003 VAC But the supporting evidence should be transferred if we can! 
      # Tough one.
      # more of a problem is that if the transcript with the largest number of exons 
      # is really a single exon with frameshifts, it will get rejected here based on
      # intron size but in addition any valid non-frameshifted single exon transcripts
      # will get rejected - which is definitely not right.We need code to represent 
      # frameshifted exons more sensibly so the frameshifted one doesn't get through 
      #the check for single exon genes above.

    EXONS:
      for ($i = 0; $i < $#exons; $i++) {
	my $foundpair = 0;
	my $exon1 = $exons[$i];
	my $exon2 = $exons[$i+1];
		
	# Only count introns > 50 bp as real introns
	my $intron;
	if ($exon1->strand == 1) {
	  $intron = abs($exon2->start - $exon1->end - 1);
	}
	else {
	  $intron = abs($exon1->start - $exon2->end - 1);
	}
	
	if ($intron < $self->max_short_intron_len && 
            $intron > $self->min_short_intron_len ) {
	  print STDERR "Intron too short: $intron bp. Transcript will be rejected\n";
	  $foundpair = 1;	
          # this pair will not be compared with other transcripts
	} else {
	  # go through the exon pairs already stored in %pairhash. 
	  # If there is a pair whose exon1 overlaps this exon1, and 
	  # whose exon2 overlaps this exon2, then these two transcripts are paired
          
	  foreach my $first_exon_id (keys %pairhash) {
            my $first_exon = $exonhash{$first_exon_id};
            foreach my $second_exon_id (keys %{$pairhash{$first_exon}}) {
	      my $second_exon = $exonhash{$second_exon_id};
              if ( $exon1->overlaps($first_exon) && $exon2->overlaps($second_exon) ) {
		$foundpair = 1;
                # eae: this method allows a transcript to be covered by exon pairs
		# from different transcripts, rejecting possible
		# splicing variants. Needs rethinking
		
		# we put first the exon from the transcript being tested:
		push( @evidence_pairs, [ $exon1 , $first_exon  ] );
		push( @evidence_pairs, [ $exon2 , $second_exon ] );
		
		transfer_supporting_evidence($exon1, $first_exon);
		transfer_supporting_evidence($first_exon, $exon1);
		transfer_supporting_evidence($exon2, $second_exon);
		transfer_supporting_evidence($second_exon, $exon2);
	      }
	    }
	  }
	}
	if ($foundpair == 0) {	
          # ie this exon pair does not overlap with a pair yet found in another 
          # transcript
	  $found = 0;		
          # ie currently this transcript is not paired with another
	  # store the exons so they can be retrieved by id
	  $exonhash{$exon1} = $exon1;
	  $exonhash{$exon2} = $exon2;
	  # store the pairing between these 2 exons
	  $pairhash{$exon1}{$exon2} = 1;
        }
      }				# end of EXONS

      # decide whether this is a new transcript or whether it has already been seen
      # if it's blessed, we keep it and there's nothing more to do
      if(exists $blessed_genetypes{$tran->type}){
	push (@newtran, $tran);
      }
      elsif ($found == 0) {
	push(@newtran,$tran);
	@evidence_pairs = ();
      } elsif ($found == 1 && $#exons == 0){
	# save the transcript and check though at the end to see if we can transfer 
        # supporting evidence; if we try it now we may not (yet) have any exons that 
        # overlap in %exonhash
        $single_exon_rejects{$tran} = $tran;
      } else {
        if ( $tran == $transcripts[0] ){
	  print STDERR "Strange, this is the first transcript in the cluster!\n";
	}
		
	## transfer supporting feature data. We transfer it to exons
	foreach my $pair ( @evidence_pairs ){
	  my @pair = @$pair;
	
	  # first in the pair is the 'already seen' exon
	  my $source_exon = $pair[0];
	  my $target_exon = $pair[1];
	
          transfer_supporting_evidence($source_exon, $target_exon)
	}
      }
    } # end of this transcript

    # check to see if we can transfer evidence from rejected single exon transcripts
    foreach my $reject(values %single_exon_rejects){
      my @exons = @{$reject->get_all_Exons};

      # is there any supporting evidence?
      my @sf = @{$exons[0]->get_all_supporting_features};

      foreach my $stored_exon(values %exonhash){
   	if($exons[0]->overlaps($stored_exon)){
	  # note that we could end up with bizarre situations of a single exon 
          # transcript overlapping two exons in a multi exon transcript, so the 
          # supporting evidence would be transferred in entirety to both exons.
	  transfer_supporting_evidence($exons[0], $stored_exon);
	
	}
      }
    }
  } #end CLUSTER
  return\@ newtran;

}

sub prune_redundant_CDS {

  my ( $self, $genes ) = @_;

  my $nremoved = 0;
  my %blessed_genetypes = %{$self->blessed_biotypes};

  # For each gene
  foreach my $gene (@$genes) {
    my @trans_with_utrs;
    my @trans_without_utrs;
    
    # Separate into ones with UTRs and ones without
    foreach my $trans (@{$gene->get_all_Transcripts}) {
      $trans->sort;
      my @exons = @{$trans->get_all_Exons};
      if ($trans->translation) {
        if ($trans->translation->start_Exon == $exons[0] &&
            $trans->translation->start == 1 &&
            $trans->translation->end_Exon == $exons[$#exons] &&
            $trans->translation->end == $exons[$#exons]->length) {
          push @trans_without_utrs, $trans;
        } else {
          push @trans_with_utrs, $trans;
        }
      } else {
        $self->warn("No translation for transcript");
      }
    }
    
    # Generate CDS exons just once (saves CPU time)
    my %cds_exon_hash;
    foreach my $trans (@{$gene->get_all_Transcripts}) {
      if ($trans->translation) {
        my @cds_exons = @{$trans->get_all_translateable_Exons};
        $cds_exon_hash{$trans} =\@ cds_exons;
      }
    }
      
    # Compare CDSs of ones with UTRs to CDSs of ones without
    foreach my $utr_trans (@trans_with_utrs) {
      my $utr_trans_cds_exons = $cds_exon_hash{$utr_trans};
      
    CDS_TRANS: foreach my $cds_trans (@trans_without_utrs) {
        my $cds_trans_cds_exons = $cds_exon_hash{$cds_trans};
        if (scalar(@$cds_trans_cds_exons) != scalar(@$utr_trans_cds_exons)) {
          # print "Different numbers of exons\n";
          next CDS_TRANS;
        }
        my @cds_exons = @$cds_trans_cds_exons;
        foreach my $utr_trans_exon (@$utr_trans_cds_exons) {
          my $cds_trans_exon = shift @cds_exons;
          if ($cds_trans_exon->start     != $utr_trans_exon->start  ||
              $cds_trans_exon->end       != $utr_trans_exon->end    ||
              $cds_trans_exon->strand    != $utr_trans_exon->strand) {
            next CDS_TRANS;
          }
        }
        if (  exists $blessed_genetypes{$cds_trans->biotype} && 
              ! exists $blessed_genetypes{$utr_trans->biotype}) {
          # Hack to make sure transcript gets through if its like a blessed one
          $utr_trans->biotype($cds_trans->biotype);
        }
        $nremoved++;
        $self->remove_transcript_from_gene($gene,$cds_trans);
      }
    }
    #Get the number of transcript in the @trans_with_utrs 
    my $num_of_utr_trans = scalar(@trans_with_utrs);
    # Extra paranoid test to check if any similarity overlaps targetted models and 
    # get rid of them
  }

}

sub prune_redundant_transcripts {

  my ($self,$genes) = @_;

  my $nremoved = 0;
   my %blessed_genetypes = %{$self->blessed_biotypes};
  # For each gene
  foreach my $gene (@$genes) {
    # sort transcripts by total cdna length (apologies for the complexity of this line
    #  its done so as only to calculate cdna length once for each transcript

    my @sorted_transcripts = map { $_->[1] } sort { $b->[0] <=> $a->[0] } map { [$_->length, $_] } @{$gene->get_all_Transcripts};

    # so now longer UTR transcripts (with same introns will come first)
    # Now generate the sorted (low to high) exon arrays and put in a hash (for speed)
    # and cds start and end (genomic) hashes (for speed again)
    my %sorted_exon_hash;
    my %cds_start_hash;
    my %cds_end_hash;

    foreach my $trans (@sorted_transcripts) {
      my @exons = sort {$a->start <=> $b->start} @{$trans->get_all_Exons};
      $sorted_exon_hash{$trans} =\@ exons;
      if ($trans->translation) {
        $cds_start_hash{$trans} = $trans->coding_region_start;
        $cds_end_hash{$trans}   = $trans->coding_region_end;
      }
    }

    # We're going to generate a list of transcripts to remove from the gene
    # and then remove them at the end
    my @trans_to_remove;

    # for transcript

    for (my $i=0; $i<scalar(@sorted_transcripts); $i++) {
      my $trans = $sorted_transcripts[$i];
      next if (!defined($trans));
      next if (!$trans->translation);
      my @exons = @{$sorted_exon_hash{$trans}};
      #   for every other transcript
    COMPTRANS:
      for (my $j=$i+1; $j<scalar(@sorted_transcripts); $j++) {
        my $comp_trans = $sorted_transcripts[$j];
        next if (!defined($comp_trans));
        next if (!$comp_trans->translation);
        next if ($trans == $comp_trans);
        next if ($cds_start_hash{$trans} != 
                 $cds_start_hash{$comp_trans});
        next if ($cds_end_hash{$trans} != $cds_end_hash{$comp_trans});
        my @comp_exons = @{$sorted_exon_hash{$comp_trans}};
        if (scalar(@comp_exons) != scalar(@exons)){
          next;
        }

        # special case for single exon
        if (scalar(@exons) == 1) {
          if (  exists $blessed_genetypes{$comp_trans->biotype} && 
                ! exists $blessed_genetypes{$trans->biotype}) {
            #hack to ensure blessed biotypes are maintained
            $trans->biotype($comp_trans->biotype);
          }

          $sorted_transcripts[$j] = undef;
          push @trans_to_remove, $comp_trans;
        } else {
          for (my $k=0; $k<scalar(@exons) - 1; $k++) {
            if ($exons[$k]->end != $comp_exons[$k]->end ||
                $exons[$k+1]->start != $comp_exons[$k+1]->start) {
              #means exons are unique
              next COMPTRANS;
            }
          }
          if (  exists $blessed_genetypes{$comp_trans->biotype} && 
                ! exists $blessed_genetypes{$trans->biotype}) {
            # Hack to make sure transcript gets through if its like a blessed one
            $trans->biotype($comp_trans->biotype);
          }
          $sorted_transcripts[$j] = undef;
          push @trans_to_remove, $comp_trans;
        }
      }
    }
    foreach my $trans (@trans_to_remove) {
      $nremoved++;
      $self->remove_transcript_from_gene($gene,$trans);
    }
  }

}

sub remove_transcript_from_gene {

  my ($self, $gene, $trans_to_del)  = @_;
  my @newtrans;

  foreach my $trans (@{$gene->get_all_Transcripts}) {
    if ($trans != $trans_to_del) {
      push @newtrans,$trans;
    }
  }
  $gene->{_transcript_array} = [];
  foreach my $trans (@newtrans) {
    $gene->add_Transcript($trans);
  }
  return scalar(@newtrans);

}

sub run {

  my ($self) = @_;
  my $transcripts = $self->get_Transcripts;
  if(!$transcripts || @$transcripts == 0){
    print "GeneBuilder seems to have no transcripts to cluster\n";
    return;
  }
  #cluster transcripts
  #print "Have ".@$transcripts." transcripts to build from\n";
  my $transcript_clusters = $self->cluster_Transcripts($transcripts);
  #print "Have ".@$transcript_clusters." transcript clusters\n";
  #prune transcripts
  my $pruned_transcripts = $self->prune_Transcripts($transcript_clusters);
  #print "Have ".@$pruned_transcripts." pruned transcripts\n";
  #first gene cluster
  my $initial_genes = $self->cluster_into_Genes($pruned_transcripts);
  #print "Have ".@$initial_genes." initial gene structures\n";
  #prune redundant cds
  $self->prune_redundant_CDS($initial_genes);
  #prune redundant transcripts
  $self->prune_redundant_transcripts($initial_genes);
  #select best transcripts
  my $best_transcripts = $self->select_best_transcripts($initial_genes);
  #print "Have ".@$best_transcripts." best transcripts\n";
  #final cluster of genes
  my $final_genes = $self->cluster_into_Genes($best_transcripts);
  my $pruned_genes = $self->make_shared_exons_unique($final_genes);
  $self->output($pruned_genes);
  #print "Have ".@$final_genes." final gene clusters\n";
  #return output

}

sub select_best_transcripts {

  my ( $self, $genes ) = @_;
  my @selected_transcripts;

  # make sure we don't discard blessed transcripts
  my %blessed_genetypes = %{$self->blessed_biotypes};
 GENE:
  foreach my $gene ( @$genes ){
    # sort the transcripts, get the longest CDS + UTR first (like in 
    # prune_Transcripts(); blessed transcripts come at the top )
    my $sorted_transcripts = 
      $self->bin_sort_transcripts( $gene->get_all_Transcripts );
    my $count = 0;
  TRAN:
    foreach my $transcript( @$sorted_transcripts ){
      $count++;
      unless (exists $blessed_genetypes{$transcript->biotype}){
	next TRAN if ($count > $self->max_transcript_number);
      }
      push ( @selected_transcripts, $transcript );
    }
  }
  return\@ selected_transcripts;

}

General documentation

bin_sort_transcripts	No description	Code
blessed_biotypes	No description	Code
check_Clusters	No description	Code
cluster_Transcripts	No description	Code
cluster_Transcripts_by_genomic_range	No description	Code
cluster_into_Genes	No description	Code
get_Transcripts	No description	Code
input_genes	No description	Code
make_shared_exons_unique	No description	Code
max_short_intron_len	No description	Code
max_transcript_number	No description	Code
min_short_intron_len	No description	Code
new	No description	Code
output_biotype	No description	Code
prune_Exons	No description	Code
prune_Transcripts	No description	Code
prune_redundant_CDS	No description	Code
prune_redundant_transcripts	No description	Code
remove_transcript_from_gene	No description	Code
run	No description	Code
select_best_transcripts	No description	Code