BestTargetted documentation.

No package variables defined.

my $runnable = Bio::EnsEMBL::Analysis::Runnable::BestTargetted->new(
-query => $slice,
);
$runnable->run;
my @genes = @{$runnable->output};

BestTargetted combines gene-structures from any number of different targetted runs
generating a new set containing the best genes from the sets.
NOTES
- Each gene has only a single transcript
- First, Genes are clustered into overlapping clusters. This does not mean that
every Transcript in the cluster will overlap with every other Transcript in
the cluster. It is more like TranscriptCoalescer where each Transcript in the
cluster overlaps with at least one other Transcript.
- Unclustered Transcripts (those that do not overlap with any other Transcript)
are automatically kept
- If Genes in a cluster consist of those from only one analysis (biotype),
then all the Genes in this cluster will be stored.
- The tricky bit. Now we have a cluster that contains Genes from more than one
analysis. The Transcripts in the this cluster are hashed by:
Protein -> Block -> list of genes in block
where Protein is the protein from which the Transcript is made and Block is a
location where all the Transcripts in this location overlap with every other
Transcript, and all Transcripts are made from the same protein.
- Foreach block, we want to find the best protein. This is achieved by ordering
the Genes in the block by biotype, with Genes from the preferred analysis
being first in the list. The Transcript translations are compared against the
actual source protein. The first Transcript to match exactly is stored. If no
Transcripts match exactly, but all Transcripts have the same Tranlasion, then
store the first Transcript (Gene). If no Transcripts match exactly and they
have different translation sequences, then use exonerate to find the best Gene.
If there is a choice between 2 short genes from one analysis vs 1 long gene
from a second analysis, then choose the 2 short genes (provided they have a
good enough score/percent_id according to Exonerate).
- also, whene there is only one gene in a protein cluster, we might get suspicious.
In these cases, exonerate the gene to check it's score and percent_id to the
original protein. (Often a bad alignment has been produced.) Flag if the
exonerate results indicate that we may have a dodgy alignmennt.

sub all_genes {

  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_all_genes} = $val;
  }

  return $self->{_all_genes};

}

sub biotypes {

  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_bt_biotypes} = $val;
  }

  return $self->{_bt_biotypes};

}

sub check_for_pairs {

  my ($genes) = @_;
  my %logics;

  print "we have ". scalar @$genes ." genes to compare\n";
  
  # all the genes passed in are made from the same protein.
  # see if there are any NON-OVERLAPPING genes of the same biotype  
  foreach my $outer (@$genes) {
    foreach my $inner (@$genes) {
      if (($outer->biotype eq $inner->biotype) && ($outer->start > $inner->end || $outer->end < $inner->start)) {
        #they have the same biotype
        #they do not overlap
	$logics{$outer->biotype} += 1;
      }
    }
  }
  foreach my $k (keys %logics){
    print "Double count $k ".($logics{$k} / 2)."\n"; #divided by 2 because each pair counted twice
  }

 #print if a biotype has >2 transcripts in that cluster
  foreach my $k (keys %logics){
    if ($logics{$k} > 2){
      print "Eek - more than 1 pair of doubles for a single biotype\n";
      last;
    }
  }

  return keys %logics;

}

sub check_gene {

  my ($self, $gene) = @_;

  my $pass_checks = 1;
  my $hit_name;
  my @transcripts = @{$gene->get_all_Transcripts};
  my $protein_acc;

  if (scalar(@transcripts) != 1) {
    print STDERR "Gene with dbID ".$gene->dbID." has ".scalar(@transcripts)." transcripts\n";
    $pass_checks = 0;
  }

  # This bit requires a bit of tweaking for cdna2genome:
  # the genes from cdna2genome have cdnas as transcript_supporting_evidence
  # so will be dna_align_features with a hseqname=cdna_id. 
  # For targetted run, tsfs are protein_align_features
  # with hseqname=protein_id. 
  foreach my $transcript (@transcripts) {
    my @tsfs = @{$transcript->get_all_supporting_features};
    if (scalar(@tsfs) == 1) {
      $hit_name = $tsfs[0]->hseqname;
      my $entry_obj1 = $self->seqfetcher->get_entry_by_acc($hit_name);
      #print "PRIMARY primary $hit_name ".$entry_obj1."\n";

     # my @namesp = @{$self->seqfetcher->secondary_namespaces};
     # if (scalar(@namesp ) >0) {
     #   foreach my $sn (@namesp) {
     #     print "Namespace '$sn'\n";
     #   }
     # }

      #my $secondary_header_id;
      if ($entry_obj1 =~ m/^\>(\S+)\n/) {
        $protein_acc = $1;
      } elsif ($entry_obj1 =~ m/^\>(\S+)\s+(\S+)\n/) {
      #  $secondary_header_id = $2;
        $protein_acc = $2;
      } else {
        throw("Entry has unusual header :\" $entry_obj1\" - hit_name\" $hit_name\"");
      }

      #if ($secondary_header_id) {
      #  print STDERR "fetching secondary seq with 'ID' and acc $secondary_header_id\n\n";
      #  my @seq_objs = $self->seqfetcher->get_Seq_by_secondary('ID', $secondary_header_id);
      #  foreach my $seq_obj2 (@seq_objs) {
      #    print "SECONDARY SEQ $hit_name $secondary_header_id\n ".$seq_obj2->seq."\n";
      #  }
      #}

      #if ($secondary_header_id) {
      #  my $sseq_obj = $self->seqfetcher->get_Seq_by_acc($secondary_header_id);
      #  print ">$secondary_header_id\n".$sseq_obj->seq."\n";
      #}

      # print "index name ".$self->seqfetcher->index_name."\n"; # prints /lustre/work1/ensembl/ba1/cow4/Seq/BestTargetted_proteome_final/proteome.fa
       
    } else {
      print STDERR "Transcript with dbID ".$transcript->dbID.
           " has ".scalar(@tsfs)." tsfs. ".
           "Should only have one transcript_supporting_feature\n";
      $pass_checks = 0;
    }

    # check for a translation
    if (!defined($transcript->translation)) {
      print STDERR "Transcript with dbID ".$transcript->dbID." does not translate.\n";
      $pass_checks = 0;
    }
  }
  return $pass_checks, $protein_acc;

}

sub cluster_by_protein {

  my ($self, $cluster) = @_;
  my %protein_hash;

  # get all genes made from 1 protein
  
  # for this method, we want to use the actual protein
  # so, for cdna2genome, we use the protein id (which
  # is the second accession in the header line)
  GENE: foreach my $gene ($cluster->get_Genes()) {
    my ($gene_ok, $protein) = check_gene($self, $gene);
    
    if (!$gene_ok) {
      warning("Gene check failed for gene ".$gene->dbID." (in cluster with 2 or more Sets) - removed from analysis");
      next GENE;
    }
    
    my $duplicate; 
    if (!exists $protein_hash{$protein}{$gene->biotype}) {
      push @{$protein_hash{$protein}{$gene->biotype}}, $gene;
    } else {
      GOT: foreach my $got_gene (@{$protein_hash{$protein}{$gene->biotype}}){
        if ($gene->biotype ne $got_gene->biotype) {
          next GOT;
        }
	# we assume that each gene has only one Transcript
	my @transcripts1 = @{$gene->get_all_Transcripts()};
	my @transcripts2 = @{$got_gene->get_all_Transcripts()}; 

	if (identical_Transcripts($transcripts1[0], $transcripts2[0])) {
          $duplicate = 1; 
	}  
      }
      if (!$duplicate) {
        push @{$protein_hash{$protein}{$gene->biotype}}, $gene;
      }    
    }
    

    # If genes 1, 2 and 3 are all made from the same protein,
    # then they will be grouped together even if gene 2
    # and gene3 do not overlap.
    #                      gene 1
    #         <--------------------------------------->
    #   gene 2                                     gene 3
    # <---------->                              <-------------->

  }
  return\% protein_hash;

}

sub copy_empty_gene {

  my ($analysis, $gene) = @_;
  
  my $newgene = new Bio::EnsEMBL::Gene;
  $newgene->biotype( $gene->biotype);
  $newgene->analysis($analysis);
  
  return $newgene;

}

sub filter_transcripts {

  my ($h) = @_;
  my %hash = %{$h};
  my %hash2;
  #my %filtered = %{filter_transcripts(\%protein_clusters)};
  # fix doubles and overlappings
  foreach my $protein (keys %hash) {
    print "Filtering protein $protein\n";
    foreach my $biotype (keys %{$hash{$protein}}) {
      print "Filtering biotypes in random order $biotype\n";
      my @genes = @{$hash{$protein}{$biotype}};
      print "  started off with ".scalar(@genes)." genes\n";
      foreach my $g (@genes) {
        print "... gene ".$g->dbID."\n";
      }
      if (scalar(@genes) > 1) {
        print "We have doubles or overlapping\n";
        my @genes2 = @{tidy_up($hash{$protein}{$biotype})};
        print "  now have ".scalar(@genes2)." genes left\n";
        push @{$hash2{$protein}{$biotype}}, @{tidy_up($hash{$protein}{$biotype})}; 
      } else {
        push @{$hash2{$protein}{$biotype}},  @genes;
      }
    }
  }
  return\% hash2;

}

sub get_alternate_transcripts {

  my ($self, $cluster) = @_;

  my %alternates;
  my %seen;

  #print "\n\nStarting new cluster with ".scalar($cluster->get_Genes())." genes:\n";
  
  GENE: foreach my $gene ($cluster->get_Genes()) {
    #print "Doing GENE ".$gene->dbID."...";
    if (exists $seen{$gene->dbID}) {
      #print "Already seen. Skipping...\n";
      next GENE;
    }
    # CHECK THAT:
    # each gene has only 1 transcript
    # each transcript translates
    # each transcript has one tsf
    my ($gene_ok, $protein) = check_gene($self, $gene);
    if (!$gene_ok) {
      throw("Gene check failed for gene ".$gene->dbID."");
    }
    
    # add to the hash
    push @{$alternates{$protein}{$gene->dbID}}, $gene;
    $seen{$gene->dbID} = 1;
    my $exon_string = make_exon_string(@{$gene->get_all_Transcripts}[0]);

    INNER: foreach my $inner_gene ($cluster->get_Genes()) {
      #print "Doing INNER gene ".$inner_gene->dbID."...";
      if (exists $seen{$inner_gene->dbID}) {
        #print "Already seen. Skipping...\n";
        next INNER;
      }
      # DO CHECKS as above
      my ($inner_gene_ok, $inner_protein) = check_gene($self, $inner_gene);
      if (!$inner_gene_ok) {
        throw("Gene check failed for gene ".$inner_gene->dbID."");
      }
      if ($protein ne $inner_protein) {
        #print "Proteins from GENE $protein and INNER $inner_protein do not match. Skipping...\n";
        next INNER;
      }
      my $inner_exon_string = make_exon_string(@{$inner_gene->get_all_Transcripts}[0]);

      # Now look for unique genes made form the same protein
      if ($exon_string eq $inner_exon_string) {
        # they are identical
        #print "Genes ".$gene->dbID." and ".$inner_gene->dbID." have same exon string\n";
        $seen{$inner_gene->dbID} = 1; 
        next INNER;
      } elsif ($gene->start <= $inner_gene->end && $gene->end >= $inner_gene->start) {
        # they overlap and should be investigated
        # but does it overlap with everything in the array?
        #print "outer GENE ".$gene->dbID." overlaps with INNER gene ".$inner_gene->dbID."\n";
        if (overlaps_all($alternates{$protein}{$gene->dbID}, $inner_gene) ) {
          #print "  INNER gene ".$inner_gene->dbID." overlaps all in cluster so far\n";
          # not sure how this affects a "joiner" - a transcript that joins 2 clusters
          push @{$alternates{$protein}{$gene->dbID}}, $inner_gene;
          $seen{$inner_gene->dbID} = 1;
        }
        # what happens when we are dealing with a case like that of gene 1?
        # if gene 1 can only be used once, the we might miss out on something important
        # how often will this occur though, in reality?
        #                      gene 1
        #         <--------------------------------------->
        #   gene 2                                     gene 3
        # <---------->                              <-------------->

      } else {
        # they do not overlap so ignore
        #print "  INNER gene ".$inner_gene->dbID." does not overlap all in cluster so far\n";
        print "Genes ".$gene->dbID." (exon string ".$exon_string.") and ".
              $inner_gene->dbID." (exon string $inner_exon_string) do not overlap.\n"; 
      } #if ($exon_string eq $inner_exon_string) {
    } #inner
  } # genes
  return\% alternates;

}

sub get_best_gene {

  my ($self, $protein, $biotypes, $hsh) = @_;

  my %hash = %{$hsh};

  # we don't wanna make a gene here if only one analysis has built here
  my $biotypes_represented;
  my $token_gene;
  foreach my $biotype (keys %{$hash{$protein}}) {
    if (scalar(@{$hash{$protein}{$biotype}}) > 0) {
      $biotypes_represented++;
      $token_gene = $hash{$protein}{$biotype}->[0];
    }
  }
  if ($biotypes_represented == 1 && scalar(@$biotypes) > 2) {
    print "FLAG_NULL1: For $protein, only one biotype represented. See gene ".$token_gene->dbID." (".$token_gene->biotype.")\n";
    if (!defined $self->keep_single_analysis) {
      print "FLAG_NULL2: For $protein, am returning no genes.\n";
      return [];
    }
  }

  # which biotypes have more than 1 gene made from $protein?
  my @doubles;
  my $num_genes = 0;
  #foreach my $type (@$biotypes) {
    foreach my $biotype (keys %{$hash{$protein}}) {
        #my @g = $hash->{$protein}->{$biotype};
        #print "have ".scalar(@g)." genes, existing...\n"; 
        print "biotype $biotype has ".scalar(@{$hash{$protein}{$biotype}})." genes\n ";
        $num_genes += scalar(@{$hash{$protein}{$biotype}});
        if (scalar(@{$hash{$protein}{$biotype}}) > 1) {
          print "*** Have ".$biotype." doubles so need to be careful ***\n";
          push @doubles, $biotype;

        }
    }
  #} 

  # fetch the protein
  # for cdna2genome, we might need to fix things here. Getting a protein seq this way
  # we need to modify the input config file to have peptide seq with a cdna id header
  my $pep = $self->seqfetcher->get_Seq_by_acc($protein)->seq;
  if (!$pep) {
    throw("Unable to fetch peptide sequence");
  }
  my @best;
  my $exonerate; # flag
  print "Finding best gene for protein $protein...\n";

  if (scalar(@doubles) > 0) {
    print "We ave doubles so let's just use them\n";
    OUTER: foreach my $biotype (@$biotypes) {
      INNER: foreach my $type (@doubles) {
        if ($biotype eq $type) {
          print "The most favoured set of doubles is $biotype so we takes them\n";
          foreach my $gene (@{$hash{$protein}{$biotype}}) {
            print "Keeping gene ".$gene->dbID."\n";
          }
          push @best, @{$hash{$protein}{$biotype}};
          return\@ best;
        }
      }
    }
  }

  # Go thru each gene in order of favourite to least fav biotype
  # See whether its tln matches the peptide
  my @match_pep;
  my %tln_groups;
  my $tln_seq;
  OUTER: foreach my $type (@$biotypes) {
    INNER: foreach my $biotype (keys %{$hash{$protein}}) {
      if ($biotype ne $type) {
        next INNER;
      }
      if (scalar(@{$hash{$protein}{$biotype}}) > 0) {
        foreach my $gene (@{$hash{$protein}{$biotype}}) {
          print "Got gene dbid ".$gene->dbID." ".$gene->biotype."\n";
          foreach my $transcript (@{$gene->get_all_Transcripts}) {
            $tln_seq = $transcript->translate->seq;
            if ($tln_seq eq $pep) {
              push @match_pep, $gene;
            }
            push @{$tln_groups{$tln_seq}}, $gene; 
          }
        }
      } else {
        print "No genes for biotype $biotype\n";
      }
    }
  }
  my $num_tln_groups = scalar(keys %tln_groups);
  print "Have $num_tln_groups different possible translations represented.\n"; 
  foreach my $s (keys %tln_groups) {
    print "option ".$tln_groups{$s}[0]->biotype.": $s\n";
  }


# Now go thru the results and see whether we have any 
  # exact matches or whether we need to exonerate
  if (scalar(@match_pep == 1)) {
    # we don't care if we have doubles or not
    print "Only one exact match to protein.\n Keeping only gene ".$match_pep[0]->dbID." (".$match_pep[0]->biotype.")\n";
    @best = ( $match_pep[0] );

  } elsif (scalar(@match_pep > 1)) {
    print "More than one exact match to protein.\n"; 
    if (scalar(@doubles) > 0) {
      print "Have doubles and more than one match to the protein.\n";
      $exonerate = 1;
    } else {
      print "Keeping first gene ".$match_pep[0]->dbID.
            " from array as we prefer it's analysis (".$match_pep[0]->biotype.")\n";
      @best = ( $match_pep[0] );
    }

  } else {
    print "No exact match to protein. ";
    # we may have the translated sequences being all the same, 
    # or we may have the translated sequences being different.
    # take the first gene if they're all the same
    if (($num_genes > 1) && (scalar(keys %tln_groups) == 1) && (scalar(@doubles < 1))) {
      print "All transcripts have the same translated sequence.\n Keeping first gene ".$tln_groups{$tln_seq}->[0]->dbID.
            " from array as we prefer it's analysis (".$tln_groups{$tln_seq}->[0]->biotype.")\n";
      print "*** Maybe we should exonerate or look at exon structure to decide what to choose?\n";
      @best = ( $tln_groups{$tln_seq}->[0] );
    } else {
      print "We have $num_genes genes, ".scalar(keys %tln_groups)." different sequences ".
            " and ".scalar(@doubles)." doubles. Let's dig deeper.\n";
      $exonerate = 1;
    }
  }
 

  # More clarification is needed.
  # So we do exonerates
  # we keep track of all genes' scores and percent_ids for later use
  if (defined $exonerate) {
    print "Exonerate...\n";
    my $gene_tln_comp;
    my $max_gene_score = 0;
    my $max_perc_id = 0;
    my $best;
    my $first_best;
    my %scores;
    # make sure that you're using the correct protein and pep for cdna2genome
    my $target = Bio::Seq->new( -display_id => $protein, -seq => $pep);
    foreach my $biotype (@$biotypes) {
      if (defined $hash{$protein}{$biotype}) {
        foreach my $gene (@{$hash{$protein}{$biotype}}) {
          if (!defined $best) {
            # sometimes, when there is only one gene and it is very short, it has a score = 0
            print "First gene is ".$gene->dbID."\n";
            $best = $gene;
            $first_best = $gene;
          }
          foreach my $transcript (@{$gene->get_all_Transcripts}) {
            print "Running transcript ".$transcript->dbID."\n";
            my $query = Bio::Seq->new( -display_id => $transcript->dbID, -seq => $transcript->translate->seq);
            my ($gene_score, $perc_id) = @{$self->run_exonerate($query, $target)}; 
           # my $gene_score = shift @{$self->run_exonerate($query, $target)};
    
            print "Gene ".$gene->dbID." (".$gene->biotype.") has score $gene_score and percent_id $perc_id (ratio ".
                  ($transcript->translation->length)/($transcript->end - $transcript->start).")\n";
            $scores{$gene->biotype}{$gene->dbID}{'score'} = $gene_score;
            $scores{$gene->biotype}{$gene->dbID}{'percent_id'} = $perc_id; 
            $scores{$gene->biotype}{$gene->dbID}{'ratio'} = ($transcript->translation->length)/($transcript->end - $transcript->start);
            $scores{$gene->biotype}{$gene->dbID}{'gene'} = $gene;
            if ($gene_score > $max_gene_score) {
              $max_gene_score = $gene_score;
              $best = $gene;
              print "Gene with max score $max_gene_score is now ".$gene->dbID." (with percent_id $perc_id)\n";
            } # $gene_score > $max_gene_score
            if ($perc_id > $max_perc_id) {
              $max_perc_id = $perc_id;
            }
          } # reach my $transcript (@{$gene->get_all_Transcripts}) 
        } # foreach my $gene (@$genes) 
      }
    } # foreach my $biotype (@$biotypes) 

    # ok, we have all info that we need. Now 
    # see what we're dealing with. Are there doubles 
    # or can we just take the gene with the highest score? 
    my %allowed_doubles;
    if (scalar(@doubles) == 0) {
      print "No doubles, Checking the ratio\n";
      if ($scores{$best->biotype}{$best->dbID}{'ratio'} >= 0.8*($scores{$first_best->biotype}{$first_best->dbID}{'ratio'})) {
        @best = ($best);
        print "ratio OK. Keeping gene ".$best->dbID." biotype ".$best->biotype.
             " , ratio ".$scores{$best->biotype}{$best->dbID}{'ratio'}." vs ".
              $scores{$first_best->biotype}{$first_best->dbID}{'ratio'}."\n";
      } else {
        print "Ratio too low. Befault to 1st biotype. Keeping gene ".$first_best->dbID." biotype ".$first_best->biotype."\n";
        @best = ($first_best);
      }

    } elsif (scalar(@doubles) >= 1) {
      print "We have ".scalar(@doubles)." sets of doubles\n";
      # check that all genes have score >= 90% of the max
      # and then choose the biotype with the highest collective score
      foreach my $dbl_biotype (@doubles) {
        $allowed_doubles{$dbl_biotype} = 0;
        foreach my $gid (keys %{$scores{$dbl_biotype}}) {
          if ($scores{$dbl_biotype}{$gid}{'score'} >= 0.9*$max_gene_score) {
            # gene has sufficiently high score that we want to leep it
            $allowed_doubles{$dbl_biotype} += $scores{$dbl_biotype}{$gid}{'score'};
          } else {
            print "  Not acceptable: gene $gid has score ".$scores{$dbl_biotype}{$gid}{'score'}." which is too low\n";
          } #if
        } # gid
      } # foreach my $dbl_biotype (@$doubles)  
    
      # choose the one with the max score
      # note that if 2 biotypes have the same max score, then there is no
      # distinguishing which one is betetr
      my $best_biotype;
      my $max_dbl_score = 0;
      
      foreach my $dbl_biotype (@$biotypes) {
	
	if (defined $dbl_biotype && ($allowed_doubles{$dbl_biotype} > $max_dbl_score)) {
          $max_dbl_score = $allowed_doubles{$dbl_biotype};
          $best_biotype = $dbl_biotype;
        } 
      }
      if ($max_dbl_score > 0) {
        # we take the doubles
        foreach my $gene (@{$hash{$protein}{$best_biotype}}) {
          if (($gene->biotype eq $best_biotype) && ($scores{$gene->biotype}{$gene->dbID}{'score'} >= 0.9*$max_gene_score)) {
            print "Keeping gene ".$gene->dbID." biotype ".$gene->biotype."\n";
            push @best, $gene;
            if ($scores{$gene->biotype}{$gene->dbID}{'percent_id'} < 95) {
              print "WARNING_FLAG: percent_id is less than max\n";
            }
          } else {
            print "(Not keeping ".$gene->dbID." as score too low)\n";
          }
        }
      }
    }
    foreach my $b (@best) {
      if ($scores{$b->biotype}{$b->dbID}{'percent_id'} < 90) {
        print "FLAG: best_gene ".$b->dbID." has percent_id ".$scores{$b->biotype}{$b->dbID}{'percent_id'}.
              "<90 (start ".$b->start." end ".$b->end.")\n";
      }
    }
  } #if (defined $exonerate)  
  return\@ best;

}

sub get_ordered_genes {

  my ($self, $genes) = @_;

  my @ordered;
  my %seen;  
  
  print "block has ".scalar(@$genes)." genes\n";
  foreach my $logic (@{$self->biotypes}) {
    foreach my $gene (@$genes) {  
      if (!exists $seen{$gene} && ($logic eq $gene->biotype)) {
        print "  Got in cluster: gene ".$gene->dbID." of biotype ".$gene->biotype."\n";
        push @ordered, $gene;
        $seen{$gene} = 1;
      }
    }
  }
  return\@ ordered;

}

sub keep_single_analysis {

  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_bt_keep_single_analysis} = $val;
  }

  return $self->{_bt_keep_single_analysis};
}



##########################################
### local class
##########################################

package Bio::EnsEMBL::Analysis::Runnable::BestTargettedExonerate;

use Bio::EnsEMBL::Analysis::Runnable::BaseExonerate;

use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable::BaseExonerate);

}

sub logic_names {

  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_bt_logic_names} = $val;
  }

  return $self->{_bt_logic_names};

}

sub make_exon_string {

  my ($transcript) = @_;
  my $exon_string = "E";
  
  my @exons = sort {$a->start <=> $b->start} @{$transcript->get_all_translateable_Exons};
  foreach my $exon (@exons) {
    $exon_string .= ":".$exon->start.":".$exon->end;
  }
  return $exon_string;

}

sub make_outgenes {

  my ($analysis, $gene, $trans) = @_;
  
  my $outgene = copy_empty_gene($analysis, $gene);

  if (not defined $trans){
    my @trans = @{$gene->get_all_Transcripts};
  
    if (scalar @trans > 1){
      print STDERR "gene ".$$gene->dbID." has >1 transcript\n";
    }else{
      $trans = $trans[0];
    }
  }

  my $outtrans = clone_Transcript($trans); 
  $outtrans->analysis($analysis);
  $outtrans->dbID(undef);
  
 
  #update the transcript_supporting_feature analysis_ids to match the transcript
  foreach my $sf(@{$outtrans->get_all_supporting_features}){
    $sf->dbID(undef);
    $sf->analysis($analysis);
  }
  
  #update the supporting_feature analysis_ids to match the transcript
  foreach my $exon (@{$outtrans->get_all_Exons}){
    $exon->dbID(undef);
    foreach my $sf (@{$exon->get_all_supporting_features}){
      $sf->dbID(undef);
      $sf->analysis($analysis);
    }
  }
  
  $outgene->add_Transcript($outtrans);
 
  return $outgene;

}

sub new {

  my ($class,@args) = @_;
  my $self = $class->SUPER::new(@args);

    
  my( $biotypes, $seqfetcher, $verbose, $genes, $keep_single_analysis ) = 
      rearrange([qw(
                     BIOTYPES 
                     SEQFETCHER
                     VERBOSE
                     GENES
                     KEEP_SINGLE_ANALYSIS
                    )], @args);

  $self->seqfetcher($seqfetcher) if defined $seqfetcher;
  $self->biotypes($biotypes) if defined $biotypes;
  $self->verbose($verbose) if defined $verbose;
  $self->all_genes($genes) if defined $genes;
  $self->keep_single_analysis($keep_single_analysis) if defined $keep_single_analysis;

  return $self ;

}

sub overlaps_all {

  my ($gene_array, $gene)  = @_;
  my $num_overlapped;
  my $overlaps_all;

  foreach my $g (@$gene_array) {
    if ($g->start <= $gene->end && $g->end >= $gene->start) {
      $num_overlapped++;
    }
  }
  if ($num_overlapped == scalar(@$gene_array)) {
    $overlaps_all = 1;
    #print "overlaps all $num_overlapped genes\n";
  } else {
    print "does not overlap all genes\n";
  }
  return $overlaps_all;

}

sub run {

   my ($self) = @_ ; 
  
  my %outgenes;
   
  # get all genes using both logic names on slice and cluster them
  my %logic_hash;
  foreach my $logic (@{$self->biotypes}) {
    print STDERR "got $logic\n";
    $logic_hash{$logic} = [$logic];
  }
 
 
  # get all genes
  my @allgenes = @{$self->all_genes};

 
  # do clustering
  my ($clusters, $non_clusters) = cluster_Genes(\@allgenes,\% logic_hash ) ; 
  if ($self->verbose){
    print @$non_clusters ." non_clusters and ".@$clusters ." clusters\n";
  }


  # # #
  # loop thru non-clusters (single genes)
  # # #
  foreach my $non_cluster ( @$non_clusters ) {
    my @genes = $non_cluster->get_Genes();
    GENE: foreach my $gene (@genes){
      # need to check that they have a tln
      my ($gene_ok, $protein) = check_gene($self, $gene);
      if (!$gene_ok) {
        warning("Gene check failed for unclustered gene ".$gene->dbID." - removed from analysis");
        next GENE;
      }

      if ($self->verbose){
        print "storing non_cluster_gene ".$gene->dbID."\n";
      }
      $outgenes{$gene} = make_outgenes($self->analysis, $gene);
    }
  }


  # # #  
  # Now look at the clusters 
  # # #
  my @twoways;  
  foreach my $cluster (@$clusters) {
    my @inc_sets = @{$cluster->get_sets_included};
    # does the cluster contain genes from more than one set?
    if (scalar @inc_sets >= 2) {
      push @twoways, $cluster;
    } elsif (scalar @inc_sets == 1) { 
      GENE: foreach my $gene ($cluster->get_Genes_by_Set($inc_sets[0])) {
        # need to check that they have a tln
        my ($gene_ok, $protein) = check_gene($self, $gene);
        if (!$gene_ok) {
          warning("Gene check failed for gene ".$gene->dbID." (in cluster of only one Set) - removed from analysis");
          next GENE;
        }
        if ($self->verbose){ 
           my $gene_id = $gene->dbID ? $gene->dbID : "" ; 
           print "storing single_set_cluster_gene ".$gene_id ." ".$inc_sets[0]."\n";
         }
        $outgenes{$gene} = make_outgenes($self->analysis, $gene);
      } # end of foreach my $gene ($cluster->get_Genes_by_Set($inc_sets[0]))
    } # end of if (scalar @inc_sets >= 2) 
  } #end of foreach my $cluster (@$clusters) 
  
  print "Got " . scalar(@twoways) . " twoways\n";


  # # #
  # Now look at the clusters that contain genes of more than one biotype / biotype
  # # #
  foreach my $cluster (@twoways) {
    # check that genes have only one transcript
    # ignore gene if it does not translate
    # make a hash with keys = protein
    # and values = an array of all genes whose transcripts are made from this protein
    my $protein_clusters = cluster_by_protein($self, $cluster);    
    my $filtered = filter_transcripts($protein_clusters);

    foreach my $protein (keys %{$protein_clusters}) {  

      print "fetching protein from index : $protein\n " ;    
      my $fetched_protein;  

      #   
      # Seqfetcher does NOT throw just returns undef so we have to use eval block here to  
      # make the job die - otherwise job dies quietly which is BAD as it will be recorded successfully 
      #    
      
      eval { 
            $fetched_protein = $self->seqfetcher->get_Seq_by_acc($protein) ;     
           }; 

      if ($@) {    
         throw ( " Could not get sequence for protein $protein\n from index ". $self->seqfetcher->index_name . 
                 "\n If you use OBDA Index : Maybe your keys are redundant ?\n\n $@ " ) ; 
      }   

      unless ( $fetched_protein ) { 
        throw("SeqFetcher cannot get_Seq_by_acc for protein '$protein' in indexfile : ".  $self->seqfetcher->index_name );
      }  

      my $pep = $fetched_protein->seq; 
      print "Comparing transcripts made from protein $protein with seq $pep\n";
      if (!$pep) {
        throw("Unable to fetch peptide sequence for protein $protein");
      }
  
      # filter the genes
      my @best_genes = @{$self->get_best_gene($protein, $self->biotypes, $filtered)};
      foreach my $best_gene (@best_genes) {
        if (! exists $outgenes{$best_gene}){
          $outgenes{$best_gene} = make_outgenes($self->analysis, $best_gene);
        }
      } 
    } #foreach my $prot (keys %protein_clusters) 
  } # oreach my $cluster (@twoways) 
  my @outgenes;
  foreach my $gene ( keys %outgenes ) {
    push @outgenes, $outgenes{$gene};
  }
  $self->output(\@outgenes);

  #return ;

}

sub run_exonerate {

  my ($self, $query, $target) = @_;
  
  my $exonerate = Bio::EnsEMBL::Analysis::Runnable::BestTargettedExonerate->new
      (
       -program     => $self->program,
       -analysis    => $self->analysis,
       -query_seqs  => [$query],
       -target_seqs => [$target],
       -options     => "--model affine:local",
       );
       
  
  $exonerate->run;
  my ($score, $perc_id) = @{$exonerate->output} ;
  
  return [$score, $perc_id];
}

########################

}

sub seqfetcher {

  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_bt_seqfetcher} = $val;
  }

  return $self->{_bt_seqfetcher};

}

sub tidy_up {

  my ($genes) = @_;

  print "In tidy up...have ".scalar(@$genes)." genes\n";
  my @ordered = sort {($a->end - $a->start) <=> ($b->end - $b->start)} @$genes;

  # make an array holding the shortest gene
  my @non_overlapping;
  push @non_overlapping, $ordered[0];

  # add in genes if they don't overlap
  my $overlaps = 0;
  OUTER: foreach my $outer (@ordered) {
    INNER: foreach my $inner (@non_overlapping) {
      if ($outer->dbID == $inner->dbID) {
        next OUTER;
      }
      if (($outer->start <= $inner->end && $outer->end >= $inner->start) 
       || ($inner->start <= $outer->end && $inner->end >= $outer->start)) {
        print "overlaps... checking if it extends the smaller gene... ";
        my $outer_transc = $outer->get_all_Transcripts->[0];
        my $inner_transc = $inner->get_all_Transcripts->[0];
        #my $outer_transc_seq = $outer->get_all_Transcripts->[0]->translateable_seq;
        #my $inner_transc_seq = $inner->get_all_Transcripts->[0]->translateable_seq;
        my $exact_match;
        for (my $i=0; $i< scalar(@{$outer_transc->get_all_Exons}); $i++) {
          for (my $j=0; $j<scalar(@{$inner_transc->get_all_Exons}); $j++) {
            if ($outer_transc->get_all_Exons->[$i]->start == $inner_transc->get_all_Exons->[$j]->start &&
                $outer_transc->get_all_Exons->[$i]->end == $inner_transc->get_all_Exons->[$j]->end) {
              $exact_match++;
              #next;
            }
          }
        }
        if ($exact_match == scalar(@{$inner_transc->get_all_Exons})) { 
        #if ($outer_transc->translateable_seq =~ m/$inner_transc_seq/) {
          print "EXTENDS\n";
        } else {
          print "OVERLAP\n";
          $overlaps++;
          next OUTER;
        }
      }
    }
    if ($overlaps < 1) {
      print "ADDING A DOUBLE\n";
      push @non_overlapping, $outer;
    }
  }

  # now, we might be able 
  return\@ non_overlapping;

}

sub verbose {

  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_bt_verbose} = $val;
  }

  return $self->{_bt_verbose};

}

General documentation

Bio::EnsEMBL::Analysis::Runnable::BestTargetted

Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk

all_genes	No description	Code
biotypes	No description	Code
check_for_pairs	No description	Code
check_gene	No description	Code
cluster_by_protein	No description	Code
copy_empty_gene	Description	Code
filter_transcripts	No description	Code
get_alternate_transcripts	Description	Code
get_best_gene	No description	Code
get_ordered_genes	No description	Code
keep_single_analysis	No description	Code
logic_names	No description	Code
make_exon_string	No description	Code
make_outgenes	Description	Code
new	No description	Code
overlaps_all	No description	Code
run	Description	Code
run_exonerate	Description	Code
seqfetcher	No description	Code
tidy_up	No description	Code
verbose	No description	Code