TranscriptConsensus documentation.

sub collapse_cluster {

  my ($self,$cluster,$genes_by_strand) = @_;
  my @genes = $cluster->get_Genes;
  my @exon_clusters = @{$cluster->get_exon_clustering_from_gene_cluster};
  my $collapsed_cluster = Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster->new();

  foreach my $gene (@genes){
    foreach my $trans (@{$gene->get_all_Transcripts}){
      my @exons = sort {$a->start <=> $b->start } @{$trans->get_all_Exons};
      for (my $i =0 ; $i < scalar(@exons) ; $i++){
	my $exon_cluster;
	my $exon = $exons[$i];
	# figure out which exon cluster the exon belongs to
	# but only once for each non redundant exon
	unless ($collapsed_cluster->contains_exon($exon)){
	EC: foreach my $ec (@exon_clusters){
	    if ($ec->contains_exon($exon)){
	      $exon_cluster = $ec;
	      last EC;
	    }
	  }
	}
	# pass all the exons to the collapsed cluster object
	$collapsed_cluster->add_exon($exon,$exon_cluster);
	# cannot make an intron unless we have 2 exons to look at
	next if $i == 0;
	# make an intron object using exon extended so I can store the evidence sets
	my $intron = $collapsed_cluster->make_intron_from_exons($exons[$i-1],$exons[$i]);
	$intron->ev_set($exon->ev_set);
	$intron->biotype('intron');
	$collapsed_cluster->add_intron($intron);
      }
    }
  }

  # once we have added all of them into a nice non-redundant set
  # we need to assign them a score
  foreach my $exon (@{$collapsed_cluster->get_all_exons}){
    my $score = $self->score($collapsed_cluster,$exon,\@genes,'exon');
    $collapsed_cluster->exon_score($exon,$score);
  }

  foreach my $intron (@{$collapsed_cluster->get_all_introns}){
    # use all the genes on the strand when looking for introns overlapping genes as it helps
    # stop cluster joining
    my $score = $self->score($collapsed_cluster,$intron,$genes_by_strand->{$intron->strand},'intron');
    $collapsed_cluster->intron_score($intron,$score);
  }

  return $collapsed_cluster;

}

sub filter_genes {

  my ( $self, $genes) = @_;
  #print "FILTERING TRANSCRIPTS\n";
  #print "Have ".@$genes." genes to filter\n";
  my @genes_to_filter;
  my %singletons;
  my %non_con;
  my @filtered_genes;
  my %delete;
  my %exon_hash;
  my $gene_count = 0;

  for ( my $g = 0 ; $g < scalar(@$genes) ; $g++ ){
    # dont run on ests unless specifically asked to 
    unless ($FILTER_ESTS){
      next if ($genes->[$g]->get_all_Transcripts->[0]->ev_set eq 'est');
    }
    $gene_count++;
    foreach my $trans (@{$genes->[$g]->get_all_Transcripts}){
      # count non consensus splices
      $non_con{$g} =  count_non_canonical_splice_sites($trans);

      my @exons = @{$trans->get_all_Exons};
      foreach (my $i =0 ; $i < scalar(@exons) ; $i++){
	my $exon = $exons[$i];
	# need to be able to backtrack, which gene exon comes from..
	my $unique_key = $exon->start.":".$exon->end.":".$exon->strand;
	push @{$exon_hash{$unique_key}} , $g;
      }
    }
  }
  # dont run on very small clusters
  if ($gene_count <= 2){ 
    return $genes;
  }
  # once we have looked at all the exons in the cluster find the singletons
  foreach my $key (sort keys %exon_hash){
    if (scalar(@{$exon_hash{$key}}) == 1){
      # singleton
      $singletons{$exon_hash{$key}[0]}++;
      $delete{$exon_hash{$key}[0]} = 1;
    }
  }
  # gotcha
  if ( scalar (keys %delete) >= $gene_count - 2 ){
    # dont delete them if you will end up deleting everything!
    print "All transcripts look dodgy, not filtering\n" if $VERBOSE;
    return $genes;
  } else {
    for ( my $g = 0 ; $g < scalar(@$genes) ; $g++ ){
      if ( $FILTER_SINGLETONS && $singletons{$g} && $singletons{$g}>= $FILTER_SINGLETONS ){
	print  "Filtering out ".$genes->[$g]->dbID." ".$genes->[$g]->biotype." ".$genes->[$g]->start.":".$genes->[$g]->end.":".$genes->[$g]->strand." " if $VERBOSE;
	print  $singletons{$g}." singleton exons\n" if $VERBOSE;
	next;
	
      }
      if ( $FILTER_NON_CONSENSUS && $non_con{$g} && $non_con{$g} >= $FILTER_NON_CONSENSUS ){
	print  "Filtering out ".$genes->[$g]->dbID." ".$genes->[$g]->biotype." ".$genes->[$g]->start.":".$genes->[$g]->end.":".$genes->[$g]->strand." " if $VERBOSE;
	print  $non_con{$g}." non - consensus\n" if $VERBOSE;	
	next;
      }
      push @filtered_genes , $genes->[$g];
    }
  }
  return\@ filtered_genes;

}

sub make_genes {

  my ($self,$transcripts,$cluster,$collapsed_cluster)= @_;
  my @genes;
  my @final_genes;
  $self->throw("No Transcripts to use\n") unless defined($transcripts);
 SCORE:  foreach my $t (sort {$b->score <=> $a->score } @$transcripts){
    my $transcript = $self->clone_transcript_extended($t);
    my $cds_start_exon;
    my $cds_end_exon;
    my $similarity_tran = $transcript->similarity;
    # each smilarity gene only allowed once, just the highest scoring one

    my $translation = Bio::EnsEMBL::Translation->new();
    my $strand= 1;
    my $slice =  $similarity_tran->slice;
    my $weighted_score;
    if ($ADD_UTR){
      # dont compute the translation just set it
      # need the similarity gene that it was built from
      # so I can set the cds start and end on the transcript
      # if the start is in the same exon in the new transcript
      # just transfer it, if it is over the end of the new exon
      # trim back so that it is in the exon if you see what I mean
      # print stuff
      print "Transcript ".$transcript->start .":".$transcript->end .":".$transcript->strand." ".$transcript->biotype." " if $VERBOSE;
      print scalar(@{$transcript->get_all_Exons})." Exons\n" if $VERBOSE;
      my $last_exon;
      foreach my $exon (@{$transcript->get_all_Exons}){
	if ($last_exon){
	  my $intron = $collapsed_cluster->make_intron_from_exons($last_exon,$exon);
	}
	$last_exon = $exon;
      }
      if ($transcript->strand == -1){
	my $invertedslice = $similarity_tran->slice->invert;
	$strand = -1;
	$similarity_tran = $similarity_tran->transfer($invertedslice);
	$transcript = $transcript->transfer($invertedslice);
      }
      my $cds_start = $similarity_tran->coding_region_start;
      my $cds_end = $similarity_tran->coding_region_end;
      my @exon_array = @{$transcript->get_all_Exons};
      my @similarity_exons = sort { $a->start <=> $b->start } @{$similarity_tran->get_all_Exons};
      
      
      my $number = scalar(@exon_array);
      # translation start and end is upside down when on the opposite strand
      for ( my $i = 0 ; $i < scalar(@exon_array) ; $i ++ ){
	# check for overlaps 
	my $exon = $exon_array[$i];	  
	# set phases to non coding by default
	$exon->phase( -1 );
	$exon->end_phase( -1 );
	foreach my $se (@similarity_exons){
	  # set coding exons to have same phase as similarity exons
	  if ($exon->start == $se->start){
	    $exon->phase($se->phase);
	}
	  if ( $exon->end == $se->end ){
	    $exon->end_phase($se->end_phase);
	  }
	}
	if ($exon->start <= $cds_start && $exon->end >= $cds_start){
	  # translation start is in this exon
	  $translation->start_Exon($exon);
	  $translation->start($cds_start - $exon->start+1);
	  $exon->phase( -1 ) 	if $exon->start < $cds_start;
	}
	if ($exon->start <= $cds_end && $exon->end >= $cds_end){
	  # translation stop is in this exon
	  $translation->end_Exon($exon);
	  $translation->end($cds_end - $exon->start+1);
	  $exon->end_phase( -1 ) 	if $exon->end > $cds_end;
	}
	# find the utr exon that shares the internal boundary with the first simgw exon
	if ($exon->end == $similarity_exons[0]->end){
	  $cds_start_exon = $exon;
	}
	# find the utr exon that shares the internal boundary with the last simgw exon
	if ($exon->start == $similarity_exons[-1]->start){
	  $cds_end_exon = $exon;
	}
      }
      unless ( $translation->start_Exon && $translation->start_Exon eq $cds_start_exon ) {
	# the start is not inside an exon
	throw("Cannot find the coding sequence start exon\n") unless $cds_start_exon;
	# trim back to the exon that matches the internal boundary
	# stay in frame though
	$translation->start_Exon($cds_start_exon);
	my $trim = $cds_start_exon->start - $cds_start;
	if ( $trim%3 == 0 ){
	  $translation->start(1);
	} else {	
	  $translation->start(1+(3-$trim%3));
	}
	$cds_start_exon->phase( -1 ) 	if $cds_start_exon->start < $cds_start;
      }
      unless ($translation->end_Exon && $translation->end_Exon eq $cds_end_exon ){
	# the start is not inside an exon
	throw("Cannot find the coding sequence end exon\n") unless $cds_end_exon;
	# trim back to the exon that matches the internal boundary
	# stay in frame 
      $translation->end_Exon($cds_end_exon);
	my $trim = $cds_end - $cds_end_exon->end;
	if ( $trim%3 == 0 ){
	  $translation->end($cds_end_exon->length);
	} else {
	  $translation->end($cds_end_exon->length-(3-$trim%3));
	}
	$cds_end_exon->end_phase( -1 ) 	if $cds_end_exon->end > $cds_end;
      }
      $transcript->translation($translation);
      if ($strand == -1){
	# flip it back onto the - strand
	my $slice = $transcript->slice->invert;
	$transcript = $transcript->transfer($slice);
	$similarity_tran = $similarity_tran->transfer($slice);
      }	
    } else {
      # dont need to mess with translations if I am not adding UTR...
      my $newtranslation = clone_Translation($similarity_tran,$transcript);
      $transcript->translation($newtranslation);
      # fix phases
      my @exon_array = @{$transcript->get_all_Exons};
      my @similarity_exons = sort { $a->start <=> $b->start } @{$similarity_tran->get_all_Exons};
      for ( my $i = 0 ; $i < scalar(@exon_array) ; $i ++ ){
	# check for overlaps 
	my $exon = $exon_array[$i];
	foreach my $se (@similarity_exons){
	  # set coding exons to have same phase as similarity exons
	  if ($exon->start == $se->start){
	    $exon->phase($se->phase);
	  }
	  if ( $exon->end == $se->end ){
	    $exon->end_phase($se->end_phase);
	  }
	}
      }
    }
	
    # penalise transcripts where a utr exon overlaps another exon that was previously coding
    $weighted_score = $self->compare_translation($transcript,$collapsed_cluster);
    print $transcript->score."\t" if $VERBOSE;
    print "\npeptide weighted score $weighted_score\n\n" if $VERBOSE;
    $transcript->score( $weighted_score );
    $transcript->analysis($similarity_tran->analysis);
    $transcript->slice($similarity_tran->slice);
    my $gene = Bio::EnsEMBL::Gene->new;
    $gene->slice($transcript->slice);
    $gene->add_Transcript($transcript);
    $gene->analysis($transcript->analysis);
    push @genes,$gene;
  }
  # add the models of the single exon genes 
  push @genes, @{$self->add_single_exon_genes($cluster,$collapsed_cluster)};
  @genes = sort { $b->get_all_Transcripts->[0]->score <=> $a->get_all_Transcripts->[0]->score } @genes;
  my $top_score = $genes[0]->get_all_Transcripts->[0]->score;
  my $bottom_score = $genes[-1]->get_all_Transcripts->[0]->score;
  # prevent the whole division by zero thing
  $bottom_score-= 0.0001 if $bottom_score == 0;
  foreach my $gene (@genes){
    foreach my $transcript ( @{$gene->get_all_Transcripts}){
      # sort out exon phases - transfer them from the similarity gene
      # coding region start / stop is independent of strand so I will swap them if the strand is -ve
      my $biotype = $BAD_BIOTYPE;
      # how about if we include the top 5% of the gene scores
      if ($GOOD_PERCENT){
        my $top = ($transcript->score - $bottom_score );
        my $bottom = ($top_score - $bottom_score);
	if ($bottom && (($top/$bottom) * 100 >= 100 - $GOOD_PERCENT)){
	  $biotype = $GOOD_BIOTYPE;
	}
      } else {
	# we want the single top scoring model
	if ($transcript->score == $top_score){
	  $biotype = $GOOD_BIOTYPE;
	}
      }
      $biotype = $SMALL_BIOTYPE if (scalar(@genes) < $MIN_CONSENSUS);
      $gene->biotype($biotype);
      push @final_genes, $gene if $biotype;
    }
  }
  foreach my $final(@final_genes){
    foreach my $transcript(@{$final->get_all_Transcripts}){
      throw(Transcript_info($transcript)." has inconsistent phases") 
        unless(are_phases_consistent($transcript));
      #modify score to be independent of length i total / number of introns+exons
      #Should always be less than 1
      my $final_score = $transcript->score / (scalar(@{$transcript->get_all_Exons})*2-1);    
      # add score into trancript supporting feature score feild and move
      # coverage into the new covergae field
      foreach my $tsf ( @{$transcript->get_all_supporting_features}) {
	$tsf->hcoverage($tsf->score);
	$tsf->score(sprintf("%.4f", $final_score));
      }	
      print "Transcript ". ($transcript->start + $self->solexa_slice->start -1) .":".
	($transcript->end  + $self->solexa_slice->start-1) . ":".
	  $transcript->strand." ".
	    $final->biotype." " if $VERBOSE && $SOLEXA;
      print "Final score $final_score\n";
    }
  }
  $self->output(\@final_genes);
  return;

}

sub make_transcripts {

  my ($self,$cluster,$collapsed_cluster) = @_;
  my @genes = $cluster->get_Genes;
  my @est_genes;
  my @similarity_genes;
  my $longest_similarity = 0;
  my @all_transcripts;
  foreach my $gene (@genes){ 
    push @est_genes, $gene if  $gene->get_all_Transcripts->[0]->ev_set eq 'est';
    push @similarity_genes, $gene if $gene->get_all_Transcripts->[0]->ev_set  eq 'simgw';
  }

  foreach my $similarity (@similarity_genes) {
    foreach my $trans (@{$similarity->get_all_Transcripts}) {
      my @partial_transcripts;
      my @possible_transcripts;
      my @possible_start_utr;
      my @possible_end_utr;
      my @transcript_exons = sort {$a->start <=> $b->start } @{$trans->get_all_Exons};
      # single exon genes are dealt with elsewhere
      next if (scalar(@transcript_exons) <= 1);
      my $exon_count = $#transcript_exons;
      my $start_splice;
      my $end_splice;
      my @similarity_exons;
      my $almost_complete;
      my @start_utr;
      my @end_utr;
      # using start and end rather than 3' and 5' to save strand problems
      # start UTR
      $start_splice = $collapsed_cluster->make_intron_from_exons($transcript_exons[0],$transcript_exons[1]);
      $self->throw("Intron was not previously defined in the cluster\n")
	unless $collapsed_cluster->contains_intron($start_splice);
      my @joining_exons = @{$collapsed_cluster->last_exons_by_intron($start_splice)};
      foreach my $exon (@joining_exons) {
	# get all the transcripts belonging to the non redundant potential joining exons
	my @transcripts = @{$collapsed_cluster->transcript_by_feature($exon)};
	foreach my $trans (@transcripts) {
	  # only want est transcripts
	  next unless $trans->ev_set eq 'est' ;
	  push @start_utr, $trans if $ADD_UTR;
	}
      }
      # end UTR
      $end_splice = $collapsed_cluster->make_intron_from_exons($transcript_exons[$exon_count-1],$transcript_exons[$exon_count]);
      $self->throw("Intron was not previously defined in the cluster\n")
	unless $collapsed_cluster->contains_intron($end_splice);
      @joining_exons = @{$collapsed_cluster->next_exons_by_intron($end_splice)};
      foreach my $exon (@joining_exons) {
	# get all the transcripts belonging to the non redundant potential joining exons
	my @transcripts = @{$collapsed_cluster->transcript_by_feature($exon)};
	foreach my $trans (@transcripts) {
	  # only want est transcripts
	  next unless $trans->ev_set eq 'est' ;
	  push @end_utr, $trans if $ADD_UTR;
	}
      }

      # if we have start utr
      if ( scalar(@start_utr) > 0 ) {
	# make and score transcripts that represent all the start UTRs that will fit this similarity gene
	foreach my $est (@start_utr) {
	  my @exon_array;
	  # truncate the ests to the right length
	  foreach my $exon (@{$est->get_all_Exons}){
	    if ($exon->end <= $start_splice->start){
	      push @exon_array, $exon;
	    }
	  }
	  my $new_transcript = $self->transcript_from_exons(\@exon_array,undef,$collapsed_cluster)
	    if scalar(@exon_array > 0);
	  if ($new_transcript){
	    # keep the transcript supporting features
	    $new_transcript->add_supporting_features(@{$est->get_all_supporting_features});
	    push @possible_start_utr, $new_transcript;
	  }
	}
	# only add the highest scoring possible utr for this similarity gene
	@possible_start_utr = sort { $a->score <=> $b->score } @possible_start_utr if @possible_start_utr;
	push @partial_transcripts, $possible_start_utr[-1] if @possible_start_utr;
      }
      # make a new transcript conatining just the first similarity exon
      my @exon_array = (  $transcript_exons[0]  );
      my $new_transcript = $self->transcript_from_exons(\@exon_array,undef,$collapsed_cluster);
      $self->throw("Unable to make transcript using first exon of similarity gene\n")
	unless $new_transcript;
      push @partial_transcripts, $new_transcript if $new_transcript;

      # now we are in the similarity gene proper put together all the internal exons
      @similarity_exons = @{$trans->get_all_Exons};
      pop @similarity_exons;
      shift @similarity_exons;

      #  add the similarity exons to all the transcripts made so far
      foreach my $transcript (@partial_transcripts) {	
	my $score;
	$almost_complete = $self->transcript_from_exons(\@similarity_exons,$transcript,$collapsed_cluster);
	if ( scalar(@end_utr) > 0 ) {
	  # add all the possible end utrs
	  # also make one transcript without end utr
	  foreach my $est (@end_utr) {
	    my @exon_array;
	    # make an exon array out of the est starting at the end_splice intron
	    foreach my $exon (@{$est->get_all_Exons}){
	      if ($exon->start >= $end_splice->end){
		push @exon_array, $exon;
	      }
	    }
	    # to all the possible transcripts made so far
	    my $new_transcript = $self->transcript_from_exons(\@exon_array,$almost_complete,$collapsed_cluster)
	      if scalar(@exon_array > 0);
	    if ($new_transcript){
	      # keep the supporting features
	      $new_transcript->add_supporting_features(@{$est->get_all_supporting_features});
	      $new_transcript->similarity($trans);
	      push @possible_end_utr, $new_transcript;
	    }
	  }
	  # only add the highest scoring possible utr for this similarity gene
	   @possible_end_utr = sort { $a->score <=> $b->score } @possible_end_utr;
	   push @possible_transcripts, $possible_end_utr[-1];
	 }
	 #  add the last exon of the similarity gene
	 my @exon_array = (  $transcript_exons[-1]  );
	 my $new_transcript = $self->transcript_from_exons(\@exon_array,$almost_complete,$collapsed_cluster);
	 $new_transcript->similarity($trans);
	 $self->throw("Unable to make transcript using last exon of similarity gene\n")
	   unless $new_transcript;
	 push @possible_transcripts, $new_transcript if $new_transcript;
       }
      # only store the highest scoring transcript for each similarity gene
      @possible_transcripts = sort { $a->score <=> $b->score } @possible_transcripts if @possible_transcripts;
      my $chosen_transcript = $possible_transcripts[-1];
      # keep the transcript supporting feature of the similarity gene
      $chosen_transcript->add_supporting_features(@{$trans->get_all_supporting_features});
      $chosen_transcript->biotype($trans->biotype);
      push @all_transcripts, $chosen_transcript ;
    }
  }
  return\@ all_transcripts;

}

sub run {

   my ($self) = @_ ; 
  my $count =1;
  my $genes_by_strand;

  my @allgenes = @{ $self->get_genes_by_evidence_set('simgw') }  ;
  push @allgenes , @{ $self->get_genes_by_evidence_set('est') }  ;
  
  if ($FILTER_SINGLETONS or   $FILTER_NON_CONSENSUS){
    #@allgenes = @{$self->filter_genes(\@allgenes)};
    my @tmp;
    my ($clusters, $non_clusters) = cluster_Genes(\@allgenes, $self->get_all_evidence_sets ) ;
    foreach my $cluster(@$clusters, @$non_clusters){
      my @genes = $cluster->get_Genes;
      @genes = @{$self->filter_genes(\@genes)};
      push(@tmp, @genes);
    }
    @allgenes = @tmp;
  }

  my ($clusters, $non_clusters) = cluster_Genes(\@allgenes, $self->get_all_evidence_sets ) ;
  
  # create a hash of genes by strand to use when looking for overlapping genes
  # useful to look at ALL genes not just those in the cluster to prevent cluster joining
  foreach my $gene (@allgenes){
    push @{$genes_by_strand->{$gene->strand}}, $gene;
  }

  push @$clusters, @$non_clusters if (scalar(@$non_clusters) > 0 ) ;
  my $found;
  $count = 0;
  foreach my $cluster (@$clusters){
    my $simgw;
    # cluster has to contain at least one similarity gene to be worth continuing with
    my @genes = $cluster->get_Genes;
    #foreach my $gene(@genes){
     # my $new_gene = $gene->transform("toplevel");
     # print Gene_info($new_gene)."\n"; #if($gene->start == 6633986 && $gene->end == 6649904);
    #}
    next unless $cluster->get_Genes_by_Set('simgw');
    
    print "\nCluster $count\n" if $VERBOSE;
    print $cluster->start." ".$cluster->end." ".$cluster->strand."\n" if $VERBOSE;
    $count++;
    # collapse the cluster down to make a non redundat set of introns and exons with scores
    my $collapsed_cluster = $self->collapse_cluster($cluster,$genes_by_strand);

    # add UTR to the similarity transcripts and make scores for the transcripts
    my $transcripts = $self->make_transcripts($cluster,$collapsed_cluster);

    # sort the scored transcripts by score and make gene objects out of the best ones
    $self->make_genes($transcripts,$cluster,$collapsed_cluster);
  }

}

sub score {

  my ($self,$collapsed_cluster,$feature,$genes,$type) = @_;
  my $total_score;
  my $solexa_score = 0;
  # scores are calculated as the number of exoct matches / the number of overlaps
  # it is complicated by end exons and evidence sets however.
  $total_score = $collapsed_cluster->get_intron_count($feature) if $type eq 'intron';
  $total_score = $collapsed_cluster->get_exon_count($feature) if $type eq 'exon';
  $collapsed_cluster->exon_info($feature) if $VERBOSE;
  print " " if $VERBOSE;
  my $score = 0 ;
  # this way we penalise anything with less than exons than the minumum number of overlapping exons
  my $overlap = $MIN_CONSENSUS ;
  my $est_trans = $MIN_CONSENSUS;
  # get solexa data if it's there
  if ($SOLEXA &&  defined($self->solexa_slice) && $type eq 'exon'){
    #Fetch the slice from the solexa db that corresponds to the exon
    #Get all solexa reads with score above the cutoff ( 150 reccomended ) regardless of strand
    my $sub_slice = $self->solexa_slice->adaptor->fetch_by_region
      (
       'toplevel',
       $self->solexa_slice->seq_region_name,
       $feature->start + $self->solexa_slice->start,
       $feature->end + $self->solexa_slice->start,
       1);
    $self->throw("Sub slice not found " . $feature->start . " " .  $feature->end . " " .  $feature->strand . "\n")
      unless $sub_slice;
    # Fetch the features
    $solexa_score = scalar(@{$sub_slice->get_all_DnaAlignFeatures
			       (
				$$TRANSCRIPT_CONSENSUS_DB_CONFIG{"SOLEXA_DB"}->{"BIOTYPE"}[0],
				$SOLEXA_SCORE_CUTOFF)}
			  );
    print  "Solexa reads = $solexa_score\n" if $VERBOSE;
  }
  
  # the weighted score is:
  # 50% - proportion of exons in the stack that come from ests / number of overlapping est transcripts
  # 50% - the non est exons over the total number of overlapping transcripts of any type

  # count the number of overlapping exons / introns
  foreach my $gene (@$genes){
    foreach my $tran (@{$gene->get_all_Transcripts}){
      # if its an exon just add up how many genes of each type are in the cluster
      if ($type eq 'exon'){
	$est_trans++ if $tran->ev_set eq 'est';
	$overlap++;
      }
      next if scalar(@{$tran->get_all_Exons}) == 1;
      if ($type eq 'intron'){
	# for introns; count the number of overlapping trans from the entire slice
	# this creates a penatly for introns that span genes that
	# are not in the cluster because they dont share exons
	if ($feature->start < $tran->end() && $feature->end > $tran->start()){
	  $est_trans++ if $tran->ev_set eq 'est';
	  $overlap++;
	}	
      }
    }
  }
  # number of est exons that overlap the feature
  my $est_exons = $collapsed_cluster->ev_count_by_feature($feature,'est');

  # for end exon scores we allow the external boundary to be non identical
  if ($type eq 'exon' && $feature->is_terminal_exon &&  $feature->number_exons > 1){
    print "End exon $est_exons " if $VERBOSE;
    # count the total number of overlapping end exons
    $est_exons = $collapsed_cluster->ev_count_by_terminal_feature($feature,'est');
    $total_score = $collapsed_cluster->count_end_exon($feature);
    $self->throw("No evidence found for exon " . $collapsed_cluster->exon_info($feature) . "\n")
      unless defined($est_exons);
  }
  print "$type " if $VERBOSE;
  print "TOTAL $total_score OVERLAP $overlap " if $VERBOSE;

  if ($est_exons){
    print "WEIGHTED GENES $est_exons EST TRANS $est_trans " if $VERBOSE;
    # number of exons of the weighted type / number of overlapping trans of the weighted type;
    my $weighted_score = 0;
    my $unweighted_score = 0;
    if ($est_trans){
      $weighted_score = ($est_exons + $solexa_score) / ($est_trans + $solexa_score) ;
      print "weighted score $weighted_score " if $VERBOSE;
    }
    # other componnt of score comes from all the exons
    $unweighted_score =  ($total_score + $solexa_score) / ($overlap + $solexa_score) ;
    print "unweighted score $unweighted_score\t " if $VERBOSE;
    $score = ($weighted_score + $unweighted_score) / 2;
  } else {
    # there are no est exons overlaping this feature score is simple identical / non-identical overlaps
    $score += (($total_score + $solexa_score) / ($overlap + $solexa_score)) ;
    # if the feature overlaps an est transcript but not est exons, add a penalty
    if ($est_trans &&  $est_trans > 1.5 ){
      $score -= $EST_OVERLAP_PENALTY;
    }
  }
  # favor long end exons
  if ($feature->is_terminal_exon && $type eq 'exon' && $feature->number_exons > 1){
    # longest end exon 
    my $longest_end_exon = $collapsed_cluster->get_end_exon($feature)->length;
    my $multiplyer = $feature->length / $longest_end_exon;
    print "End exon penalty ".$feature->length." / ". $longest_end_exon." -.5\t" if $VERBOSE;
    if ($score >= 0){
      $score *= $multiplyer;
    } else {
      $score /= $multiplyer;
    }
    # end exons are penalised to prevent 'spindly' exons
    $score = $score - $END_EXON_PENALTY;
    print " ENDEXON " if $VERBOSE;
  } else {
    # penalty for  internal short exons / introns
      $score-= 0.5 if $feature->length < $SHORT_INTRON_PENALTY && $type eq 'intron';
      $score-= 0.5 if $feature->length < $SHORT_EXON_PENALTY   && $type eq 'exon';
  }
  print "$score\n " if $VERBOSE;
  return $score;

}

sub transcript_from_exons {

  my ($self, $exons, $transcript, $collapsed_cluster) = @_;
  my $new_transcript = Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended->new();
  my $prev_exon;
  my $score;
  @$exons = sort { $a->start <=> $b->start } @$exons;

  # if we are extending an existing transcript the previous exon is the last in the transcript
  if (defined($transcript)){
    $self->throw("Transcript needs to be a Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended not a $transcript\n")
      unless $transcript->isa("Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended");
    $score = $transcript->score;
    # clone the transcript array
    foreach my $exon (sort { $a->start <=> $b->start } @{$transcript->get_all_Exons}){
      $new_transcript->add_Exon($exon);
      $prev_exon = $exon;
    }
  }

  foreach my $exon (@$exons){

    my $collapsed_exon = $collapsed_cluster->get_exon($exon);
      unless ( $exon ){
	$collapsed_cluster->exon_info($collapsed_exon) if $VERBOSE;
	$self->throw("No score assigned to this exon\n");
      }
    if ($prev_exon){
      my $intron = $collapsed_cluster->make_intron_from_exons($prev_exon,$exon);
      unless ( $intron ){
	$collapsed_cluster->exon_info($intron) if $VERBOSE;
	$self->throw("No score assigned to this intron\n");
      }
      $score += $collapsed_cluster->intron_score($intron);
      # add a penalty to UTR exons
      $score -= $UTR_PENALTY if $exon->ev_set eq 'est';
    }
    $score += $collapsed_cluster->exon_score($collapsed_exon);
    $score -= $UTR_PENALTY if $exon->ev_set eq 'est';
    $prev_exon = $exon;
    $new_transcript->add_Exon($collapsed_exon);
  }
  $new_transcript->score($score);
  return $new_transcript;

}

add_single_exon_genes	Description	Code
clone_exon_extended	No description	Code
clone_gene	No description	Code
clone_transcript_extended	No description	Code
collapse_cluster	Description	Code
compare_translation	Description	Code
filter_genes	Description	Code
make_genes	Description	Code
make_transcripts	Description	Code
run	Description	Code
score	Description	Code
solexa_slice	No description	Code
transcript_from_exons	Description	Code