FindSplitGenes documentation.

sub fetch_input {

  my( $self ) = @_;  

  my $dbname = 'Compara' ;  
  my @potential_stable_ids_for_recovery ; 

  # INSERT INTO `analysis` VALUES (100,'2006-11-22 16:39:35','FindSplitGenes',NULL,NULL,NULL,NULL,NULL,NULL,NULL,'FindSplitGenes',NULL,NULL,NULL); 

  my $dbc = Bio::EnsEMBL::Registry->get_DBAdaptor($dbname ,'compara') ;  
  my $gdba = Bio::EnsEMBL::Registry->get_adaptor($dbname,'compara','GenomeDB');
  my $mlssa = Bio::EnsEMBL::Registry->get_adaptor($dbname,'compara','MethodLinkSpeciesSet');

  unless ($dbc){ 
    throw ("Can't get Adaptor for comara ")  ; 
  }  

  my $method_link_type = "ENSEMBL_ORTHOLOGUES";

  # this should be the same so REVOCER_SPLIT_GENES ... can be removed ...
  my $qy_species  = $$MAIN_CONFIG{QUERY_SPECIES};  
  my $targeted_species = $$FIND_SPLIT_GENES{ANALYSIS_SETS}{$self->post_logic_name}{TARGETTED}; 

  my $informant_species = $$FIND_SPLIT_GENES{ANALYSIS_SETS}{$self->post_logic_name}{INFORMANT}; 

  #my $tg2_species = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{TARGETED};

  my $informant_perc_cov = $$FIND_SPLIT_GENES{ANALYSIS_SETS}{$self->post_logic_name}{INFORMANT_PERC_COV} ; 
  my $targeted_perc_cov  = $$FIND_SPLIT_GENES{ANALYSIS_SETS}{$self->post_logic_name}{TARGETTED_PERC_COV} ;   

  my $informant_gdb = $gdba->fetch_by_name_assembly($informant_species);
  my $targeted_gdb = $gdba->fetch_by_name_assembly($targeted_species); 
  my $mlss = $mlssa->fetch_by_method_link_type_GenomeDBs($method_link_type,[$informant_gdb, $targeted_gdb]);

  my $no_chr_constraint = 0;
 

  my $sql = "SELECT im.stable_id FROM
  homology h, homology_member thm, member tm, homology_member ihm, member im 
  WHERE h.homology_id=thm.homology_id AND 
  thm.member_id=tm.member_id AND 
  h.homology_id=ihm.homology_id AND 
  ihm.member_id=im.member_id AND 
  h.method_link_species_set_id = ? AND 
  tm.genome_db_id = ? AND im.genome_db_id = ? AND 
  ihm.perc_cov < ? AND thm.perc_cov > ? 
  group by im.stable_id;";
  
  my $sth = $dbc->prepare($sql);
  $sth->execute($mlss->dbID, $targeted_gdb->dbID, $informant_gdb->dbID,$informant_perc_cov, $targeted_perc_cov);
  
  $sql = "SELECT h.description,im.stable_id,im.chr_name,im.chr_start,im.chr_end,im.chr_strand,
  ihm.perc_cov,tm.stable_id,tm.chr_name,tm.chr_start,tm.chr_end,tm.chr_strand,thm.perc_cov FROM 
  homology h, homology_member thm, member tm, homology_member ihm, member im 
  WHERE h.homology_id=thm.homology_id AND 
  thm.member_id=tm.member_id AND 
  h.homology_id=ihm.homology_id AND 
  ihm.member_id=im.member_id AND 
  h.method_link_species_set_id = ? AND 
  tm.genome_db_id = ? AND im.genome_db_id = ? AND 
  im.stable_id = ?";
  
  my $stable_id;
  $sth->bind_columns(\$stable_id);
  my $sth1 = $dbc->prepare($sql);
  
  my $targeted_split_genes = 0;
  my %stable_ids;
  my ($min_start, $max_end);
  
  print "#Possible split genes in $targeted_species #using $informant_species as informant\n";
  
  while ($sth->fetch) {
    $sth1->execute($mlss->dbID, $targeted_gdb->dbID, $informant_gdb->dbID, $stable_id);
  
    next if ($sth1->rows < 2);
  
    my ($description,$im_stable_id,$im_chr_name,$im_chr_start,$im_chr_end,$im_chr_strand,
        $ihm_perc_cov,$tm_stable_id,$tm_chr_name,$tm_chr_start,$tm_chr_end,$tm_chr_strand,
        $thm_perc_cov);
  
    $sth1->bind_columns(\$description,\$im_stable_id,\$im_chr_name,\$
                        im_chr_start,\$im_chr_end,\$im_chr_strand,\$
                        ihm_perc_cov,\$tm_stable_id,\$tm_chr_name,\$
                        tm_chr_start,\$tm_chr_end,\$tm_chr_strand,\$
                         thm_perc_cov);
  
    my @split_gene_data = ();
    my $split_gene_chr; 

    while ($sth1->fetch) { 

      unless (defined $split_gene_chr) {
        $split_gene_chr = $tm_chr_name;
      }
      if ($ihm_perc_cov >$targeted_perc_cov) {# || $split_gene_chr ne $tm_chr_name) 
        @split_gene_data = ();
        $split_gene_chr = undef;
        last;
      }
      if (!$no_chr_constraint && $split_gene_chr ne $tm_chr_name) {
        @split_gene_data = ();
        $split_gene_chr = undef;
        last;
      } 

      push @split_gene_data, [$description,$im_stable_id,$im_chr_name,$im_chr_start,
                              $im_chr_end,$im_chr_strand,$ihm_perc_cov,$tm_stable_id,
                              $tm_chr_name,$tm_chr_start,$tm_chr_end,$tm_chr_strand,
                              $thm_perc_cov]; 

      $min_start = $tm_chr_start unless (defined $min_start);
      $min_start = $tm_chr_start if ($min_start > $tm_chr_start);
      $max_end = $tm_chr_end unless (defined $max_end);
      $max_end = $tm_chr_end if ($max_end < $tm_chr_end);
      $stable_ids{$im_stable_id} =1;
      $stable_ids{$tm_stable_id} =1;
  } 

  push @potential_stable_ids_for_recovery, $im_stable_id ; 

  if (scalar @split_gene_data) {
    printf "#%-5s %20s %5s %9s %9s %6s %3s %20s %5s %9s %9s %6s %3s\n", qw(desc stable_id chr start end strand cov stable_id chr start end strand cov);
    foreach my $gene_piece (@split_gene_data) {
      printf " %-5s %20s %5s %9d %9d %6s %3d %20s %5s %9d %9d %6s %3d\n",@{$gene_piece};
    }
    print "#----\n";
    $targeted_split_genes++;
  }
  %stable_ids = ();
  $min_start = undef;
  $max_end = undef; 

} 

  print "#Potentially $targeted_split_genes $targeted_species split genes\n";  

  $self->split_genes(\@potential_stable_ids_for_recovery) ;

}

sub get_longest_transcripts {

  my ($ga, $stable_ids) = @_ ;
  my @cds ;

  GENES : for my $sid (@$stable_ids ) {
    my $gene = $ga->fetch_by_stable_id ($sid) ;
    my $maxlen = 0;
    my $nexonmax = 0;
    my $longest_transcript;

    foreach my $trans (@{$gene->get_all_Transcripts}) {
      my $len = 0;
      if ($trans->translation) {
        my @trans_exons = @{$trans->get_all_translateable_Exons()};
        foreach my $exon (@trans_exons) {
          $len+= $exon->length;
        }
        if ($len > $maxlen) {
          $maxlen = $len;
          $longest_transcript = $trans;
          $nexonmax = scalar($trans->get_all_Exons)
        }
      }
    }

    if (!defined($longest_transcript)) {
      print "No longest_transcript  transcript found for " . $gene->stable_id . "\n" ;
      next GENES ;
    }
    my $transstr = $longest_transcript->translate->seq;
    $transstr =~ s/(.{1,60})/$1\n/g;
    #print ">$sid\n$transstr\n\n" ; 
    if ($transstr =~m/\*/ ) {
      print "SKIPPING " . $longest_transcript->translation->stable_id .
        " : translation contain stop codon\n$transstr\n\n";
      next GENES ;
    }
    my $cds = $longest_transcript->translate ;
    $cds->display_id( $longest_transcript->translation->stable_id)  ;
    push @cds, $cds ;
  }
  my @rand_cds ;
  # randomize    
  return shuffle(\@cds) ;
}


1;

}

fetch_input	No description	Code
get_longest_transcripts	No description	Code
new	No description	Code
read_and_check_config	No description	Code
run	No description	Code
split_genes	No description	Code
write_output	No description	Code