OrthoTree documentation.

sub _treefam_genepairlink_stats {

  # This method is only useful to compare treefam v genetree
  my $self = shift;

  my $starttime = time()*1000;
  my $tmp_time = time();
  my $tree = $self->{'protein_tree'};

  my @all_protein_leaves = @{$tree->get_all_leaves};

  #precalculate the ancestor species_hash (caches into the metadata of nodes)
  #also augments the Duplication tagging
  $self->get_ancestor_species_hash($tree);

  #compare every gene in the tree with every other
  #each gene/gene pairing is a potential ortholog/paralog
  #and thus we need to analyze every possibility
  #Accomplish by creating a fully connected graph between all the
  #genes under the tree (hybrid graph structure) and then analyze each
  #gene/gene link

  $tmp_time = time();
  printf("build fully linked graph\n") if($self->debug);
  my @genepairlinks;
  while (my $protein1 = shift @all_protein_leaves) {
    foreach my $protein2 (@all_protein_leaves) {
      my $ancestor = $protein1->find_first_shared_ancestor($protein2);
      my $taxon_level = $self->get_ancestor_taxon_level($ancestor);
      my $distance = $protein1->distance_to_ancestor($ancestor) +
                     $protein2->distance_to_ancestor($ancestor);
      my $genepairlink = new Bio::EnsEMBL::Compara::Graph::Link
        ($protein1, $protein2, $distance);
      $genepairlink->add_tag("hops", 0);
      $genepairlink->add_tag("ancestor", $ancestor);
      $genepairlink->add_tag("taxon_name", $taxon_level->name);
      push @genepairlinks, $genepairlink;
    }
  }
  printf("%1.3f secs build links and features\n", time()-$tmp_time) 
    if($self->debug>1);

  $self->{'protein_tree'}->print_tree($self->{'tree_scale'}) 
    if($self->debug);

  #sort the gene/gene links by distance
  #   makes debug display easier to read, not required by algorithm
  $tmp_time = time();
  printf("sort links\n") if($self->debug);
  my @sorted_genepairlinks = 
    sort {$a->distance_between <=> $b->distance_between} @genepairlinks;
  printf("%1.3f secs to sort links\n", time()-$tmp_time) if($self->debug > 1);

  #analyze every gene pair (genepairlink) to get its classification
  printf("analyze links\n") if($self->debug);
  $tmp_time = time();
  my $count = 0;
  foreach my $genepairlink (@sorted_genepairlinks) {
    my ($protein1, $protein2) = $genepairlink->get_nodes;
    #run feature detectors: precalcs and caches into metadata
    $self->genepairlink_check_dups($genepairlink);
    $self->genepairlink_fetch_homology($genepairlink) if($self->debug);

    #do classification analysis : as filter stack
    if($self->inspecies_paralog_test($genepairlink)) { }
    elsif($self->direct_ortholog_test($genepairlink)) { } 
    elsif($self->ancient_residual_test($genepairlink)) { } 
    elsif($self->one2many_ortholog_test($genepairlink)) { } 
    elsif($self->outspecies_test($genepairlink)) { }
    else {
      printf
        ("OOPS!!!! %s - %s\n",
         $protein1->gene_member->stable_id,
         $protein2->gene_member->stable_id
        );
    }
    my $stid1;
    if (defined($protein1->gene_member)) {
      $stid1 = $protein1->gene_member->stable_id;
    } elsif (defined($protein1->get_tagvalue("G"))) {
      $stid1 = $protein1->get_tagvalue("G");
    } else {
      $stid1 = "unknown";
    }
    my $stid2;
    if (defined($protein2->gene_member)) {
      $stid2 = $protein2->gene_member->stable_id;
    } elsif (defined($protein2->get_tagvalue("G"))) {
      $stid2 = $protein2->get_tagvalue("G");
    } else {
      $stid2 = "unknown";
    }
    my @stids = sort ($stid1,$stid2);
    my $type = $genepairlink->get_tagvalue('orthotree_type');
    my $subtype = $genepairlink->get_tagvalue('orthotree_subtype') || 'NA';
    my $tree_type = "GT";
    $tree_type = "TF" if $self->{'_treefam'};
    my $tree_id = $self->{'protein_tree'}->node_id unless $self->{'_treefam'};
    $tree_id = $self->{'_treefam'} if $self->{'_treefam'};
    my $dup = 
      $genepairlink->get_tagvalue('ancestor')->get_tagvalue('Duplication');
    my $dup_alg = 
      $genepairlink->get_tagvalue('ancestor')->get_tagvalue('Duplication_alg') || 0;
    $self->{_gpresults} .= "$tree_type,$tree_id,$stids[0]"."_"."$stids[1]".","."$type,$subtype,$dup,$dup_alg\n";
    $self->{_homologytable} .= "$stids[0]".","."$stids[1]".","."$type,$subtype\n";
    print STDERR "analyze links $count\n" if ($count % 500 == 0);
  }
  printf("%1.3f secs to analyze genepair links\n", time()-$tmp_time) 
    if($self->debug > 1);
  $self->store_homologies if ($self->{debug});

}

sub analyze_genepairlink {

  my $self = shift;
  my $genepairlink = shift;

  my ($protein1, $protein2) = $genepairlink->get_nodes;

  #run feature detectors: precalcs and caches into metadata
  $self->genepairlink_check_dups($genepairlink);
  $self->genepairlink_fetch_homology($genepairlink) if($self->debug);

  #do classification analysis : as filter stack
  if($self->inspecies_paralog_test($genepairlink)) { }
  elsif($self->direct_ortholog_test($genepairlink)) { } 
  elsif($self->ancient_residual_test($genepairlink)) { } 
  elsif($self->one2many_ortholog_test($genepairlink)) { } 
  elsif($self->outspecies_test($genepairlink)) { }
  else {
    printf
      (
       "OOPS!!!! %s - %s\n",
       $protein1->gene_member->stable_id,
       $protein2->gene_member->stable_id
      );
  }

  $self->{'old_homology_count'}++ 
    if($genepairlink->get_tagvalue('old_homology'));

  my $type = $genepairlink->get_tagvalue('orthotree_type');
  if($type) {
    if(!defined($self->{'orthotree_homology_counts'}->{$type})) {
      $self->{'orthotree_homology_counts'}->{$type} = 1;
    } else {
      $self->{'orthotree_homology_counts'}->{$type}++;
    }
  }

  #display results
  $self->display_link_analysis($genepairlink) if($self->debug >1);

  return undef;

}

sub ancient_residual_test {

  my $self = shift;
  my $genepairlink = shift;

  #test 3: getting a bit more complex:
  #  - genes are from different species
  #  - there is evidence for duplication events elsewhere in the history
  #  - but these two genes are the only remaining representative of
  #    the ancestor

  my ($pep1, $pep2) = $genepairlink->get_nodes;
  return undef if($pep1->genome_db_id == $pep2->genome_db_id);

  my $ancestor = $genepairlink->get_tagvalue('ancestor');
  my $species_hash = $self->get_ancestor_species_hash($ancestor);

  #check these are the only representatives of the ancestor
  my $count1 = $species_hash->{$pep1->genome_db_id};
  my $count2 = $species_hash->{$pep2->genome_db_id};

  return undef if($count1>1);
  return undef if($count2>1);

  #passed all the tests -> it's a simple ortholog
  # print $ancestor->node_id, " ", $ancestor->name,"\n";

  # little hack to work around some weird treefam trees
  if ($self->{'_treefam'}) {
    my $dup_value = $ancestor->get_tagvalue("Duplication");
    if ($dup_value eq '') {
      $dup_value = 0;
      $ancestor->add_tag("Duplication",0) ;
    }
  }

#  my $sis_value = $ancestor->get_tagvalue("species_intersection_score");
  my $sis_value = $ancestor->get_tagvalue("duplication_confidence_score");
  if($ancestor->get_tagvalue("Duplication") > 0 && $sis_value ne '0') {
#    $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
    $genepairlink->add_tag("orthotree_type", 'apparent_ortholog_one2one');
    my $taxon = $self->get_ancestor_taxon_level($ancestor);
    $genepairlink->add_tag("orthotree_subtype", $taxon->name);
    # Duplication_confidence_score
    if ('' eq $ancestor->get_tagvalue("duplication_confidence_score")) {
      $self->duplication_confidence_score($ancestor);
      $genepairlink->{duplication_confidence_score} = $ancestor->get_tagvalue("duplication_confidence_score");
    }
  } else {
#    $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
    $genepairlink->add_tag("orthotree_type", 'ortholog_one2one');
    my $taxon = $self->get_ancestor_taxon_level($ancestor);
    $genepairlink->add_tag("orthotree_subtype", $taxon->name);
  }
  return 1;

}

sub check_job_fail_options {

  my $self = shift;

  if ( $self->{'protein_tree'}->get_tagvalue('gene_count') > $self->{'max_gene_count'} ) {
    $self->dataflow_output_id($self->input_id, 2);
    $self->input_job->update_status('FAILED');
    $self->{'protein_tree'}->release_tree;
    $self->{'protein_tree'} = undef;
    throw("OrthoTree : cluster size over threshold and FAIL it");
  }

  if($self->input_job->retry_count >= 2) {
    # Send to QuickTreeBreak
    $self->dataflow_output_id($self->input_id, 2);
    $self->input_job->update_status('FAILED');

    $self->DESTROY;
    throw("OrthoTree job failed >=3 times: try something else and FAIL it");
  }

  if ($self->input_job->retry_count >= 1) {
    if ($self->{'protein_tree'}->get_tagvalue('gene_count') > 400 && !defined($self->worker->{HIGHMEM})) {
      $self->input_job->adaptor->reset_highmem_job_by_dbID($self->input_job->dbID);
      $self->DESTROY;
      throw("OrthoTree job too big: try something else and FAIL it");
    }
  }

}

sub delete_old_homologies {

  my $self = shift;

  return undef unless ($self->input_job->retry_count > 0);

  print "deleting old homologies\n" if ($self->debug);

  # New method all in one go -- requires key on tree_node_id
  my $delete_time = time();
  my $tree_node_id = $self->{protein_tree}->node_id;
  my $sql1 = "delete h.*, hm.* from homology h, homology_member hm where h.homology_id=hm.homology_id and h.tree_node_id=$tree_node_id";
  my $sth1 = $self->dbc->prepare($sql1);
  $sth1->execute;
  $sth1->finish;
  printf("%1.3f secs build links and features\n", time()-$delete_time);

#   # Old method one by one
#   my %homology_ids;
#   my $sql = "select homology_id from homology_member where member_id = ?";
#   my $sth = $self->dbc->prepare($sql);

#   foreach my $leaf (@{$self->{'protein_tree'}->get_all_leaves}) {
#     $sth->execute($leaf->gene_member->member_id);
#     while (my $aref = $sth->fetchrow_arrayref) {
#       my ($homology_id) = @$aref;
#       $homology_ids{$homology_id} = 1;
#     }
#   }

#   my @list_ids;
#   foreach my $id (keys %homology_ids) {
#     my $delete_time = time();
#     push @list_ids, $id;
#     if (scalar @list_ids == 2000) {
#       my $sql1 = "delete from homology where homology_id in (".join(",",@list_ids).")";
#       my $sql2 = "delete from homology_member where homology_id in (".join(",",@list_ids).")";
#       my $sth1 = $self->dbc->prepare($sql1);
#       my $sth2 = $self->dbc->prepare($sql2);
#       $sth1->execute;
#       $sth2->execute;
#       $sth1->finish;
#       $sth2->finish;
#       @list_ids = ();
#     }
#     printf("%1.3f secs build links and features\n", time()-$delete_time);
#   }

#   if (scalar @list_ids) {
#     my $sql1 = "delete from homology where homology_id in (".join(",",@list_ids).")";
#     my $sql2 = "delete from homology_member where homology_id in (".join(",",@list_ids).")";
#     my $sth1 = $self->dbc->prepare($sql1);
#     my $sth2 = $self->dbc->prepare($sql2);
#     $sth1->execute;
#     $sth2->execute;
#     $sth1->finish;
#     $sth2->finish;
#     @list_ids = ();
#   }

  $self->{old_homologies_deleted} = 1;
  return undef;

}

sub delete_old_orthotree_tags {

  my $self = shift;

  return undef unless ($self->input_job->retry_count > 0);

  print "deleting old orthotree tags\n" if ($self->debug);
  my @node_ids;

#   # Old method -- too slow
#   my $left_index  = $self->{'protein_tree'}->left_index;
#   my $right_index = $self->{'protein_tree'}->right_index;
#   my $tree_root_node_id = $self->{'protein_tree'}->node_id;
#   # Include the root_id as well as the rest of the nodes within the tree
#   push @node_ids, $tree_root_node_id;
#   my $sql = "select ptn.node_id from protein_tree_node ptn where ptn.left_index>$left_index and ptn.right_index<$right_index";
#   my $sth = $self->dbc->prepare($sql);
#   $sth->execute;
#   while (my $aref = $sth->fetchrow_arrayref) {
#     my ($node_id) = @$aref;
#     push @node_ids, $node_id;
#   }

  foreach my $node ($self->{'protein_tree'}->get_all_subnodes) {
    push @node_ids, $node->node_id;
  }

  my @list_ids;
  foreach my $id (@node_ids) {
    push @list_ids, $id;
    if (scalar @list_ids == 2000) {
      my $sql = "delete from protein_tree_tag where node_id in (".join(",",@list_ids).") and tag in ('duplication_confidence_score','taxon_id','taxon_name','OrthoTree_runtime_msec','OrthoTree_types_hashstr')";
      my $sth = $self->dbc->prepare($sql);
      $sth->execute;
      $sth->finish;
      @list_ids = ();
    }
  }

  if (scalar @list_ids) {
    my $sql = "delete from protein_tree_tag where node_id in (".join(",",@list_ids).") and tag in ('duplication_confidence_score','taxon_id','taxon_name','OrthoTree_runtime_msec','OrthoTree_types_hashstr')";
    my $sth = $self->dbc->prepare($sql);
    $sth->execute;
    $sth->finish;
    @list_ids = ();
  }

  return undef;

}

sub direct_ortholog_test {

  my $self = shift;
  my $genepairlink = shift;

  #strictest ortholog test: 
  #  - genes are from different species
  #  - no ancestral duplication events
  #  - these genes are only copies of the ancestor for their species

  return undef if($genepairlink->get_tagvalue('has_dups'));

  my ($pep1, $pep2) = $genepairlink->get_nodes;
  return undef if($pep1->genome_db_id == $pep2->genome_db_id);

  my $ancestor = $genepairlink->get_tagvalue('ancestor');
  my $species_hash = $self->get_ancestor_species_hash($ancestor);

  #RAP seems to miss some duplication events so check the species 
  #counts for these two species to make sure they are the only
  #representatives of these species under the ancestor
  my $count1 = $species_hash->{$pep1->genome_db_id};
  my $count2 = $species_hash->{$pep2->genome_db_id};

  return undef if($count1>1);
  return undef if($count2>1);

  #passed all the tests -> it's a simple ortholog
#  $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
  $genepairlink->add_tag("orthotree_type", 'ortholog_one2one');
  my $taxon = $self->get_ancestor_taxon_level($ancestor);
  $genepairlink->add_tag("orthotree_subtype", $taxon->name);
  return 1;

}

sub display_link_analysis {

  my $self = shift;
  my $genepairlink = shift;

  #display raw feature analysis
  my ($protein1, $protein2) = $genepairlink->get_nodes;
  my $ancestor = $genepairlink->get_tagvalue('ancestor');
  printf("%21s(%7d) - %21s(%7d) : %10.3f dist : %3d hops : ", 
    $protein1->gene_member->stable_id, $protein1->gene_member->member_id,
    $protein2->gene_member->stable_id, $protein2->gene_member->member_id,
    $genepairlink->distance_between, $genepairlink->get_tagvalue('hops'));

  if($genepairlink->get_tagvalue('has_dups')) { printf("%5s ", 'DUP');
  } else { printf("%5s ", ""); }

  my $homology = $genepairlink->get_tagvalue('old_homology');
  if($homology) { printf("%5s ", $homology->description);
  } else { printf("%5s ", ""); }

  print("ancestor:(");
  my $dup_value = $ancestor->get_tagvalue("Duplication");
#  my $sis_value = $ancestor->get_tagvalue("species_intersection_score");
  my $sis_value = $ancestor->get_tagvalue("duplication_confidence_score");
  if($dup_value eq '1' || $dup_value eq '2'){
    if ($sis_value eq '0') {
      print("DD  ");
    } else {
      print("DUP ");
    }
  }
  else{print"    ";}
  printf("%9s)", $ancestor->node_id);

  my $taxon_level = $ancestor->get_tagvalue('taxon_level');
  printf(" %s %s %s\n", 
         $genepairlink->get_tagvalue('orthotree_type'), 
         $genepairlink->get_tagvalue('orthotree_subtype'),
         $taxon_level->name
        );

  return undef;

}

sub duplication_confidence_score {

  my $self = shift;
  my $ancestor = shift;

  # This assumes bifurcation!!! No multifurcations allowed
  my ($child_a, $child_b, $dummy) = @{$ancestor->children};
  throw("tree is multifurcated in duplication_confidence_score\n") if (defined($dummy));
  my @child_a_gdbs = keys %{$self->get_ancestor_species_hash($child_a)};
  my @child_b_gdbs = keys %{$self->get_ancestor_species_hash($child_b)};
  my %seen = ();  my @gdb_a = grep { ! $seen{$_} ++ } @child_a_gdbs;
     %seen = ();  my @gdb_b = grep { ! $seen{$_} ++ } @child_b_gdbs;
  my @isect = my @diff = my @union = (); my %count;
  foreach my $e (@gdb_a, @gdb_b) { $count{$e}++ }
  foreach my $e (keys %count) {
    push(@union, $e); push @{ $count{$e} == 2 ?\@ isect :\@ diff }, $e; 
  }

  my $duplication_confidence_score = 0;
  my $scalar_isect = scalar(@isect);
  my $scalar_union = scalar(@union);
  $duplication_confidence_score = 
    (($scalar_isect)/$scalar_union) unless (0 == $scalar_isect);

  $ancestor->store_tag
    (
     "duplication_confidence_score",
     $duplication_confidence_score
    ) unless ($self->{'_readonly'});

  my $rounded_duplication_confidence_score = (int((100.0 * $scalar_isect / $scalar_union + 0.5)));
  my $species_intersection_score = $ancestor->get_tagvalue("species_intersection_score");
  unless (defined($species_intersection_score) && $species_intersection_score ne '') {
    my $ancestor_node_id = $ancestor->node_id;
    warn("Difference in the ProteinTree: duplication_confidence_score [$duplication_confidence_score] whereas species_intersection_score [$species_intersection_score] is undefined in njtree - ancestor $ancestor_node_id\n");
    return;
  }
  if ($species_intersection_score ne $rounded_duplication_confidence_score && !defined($self->{_readonly})) {
    my $ancestor_node_id = $ancestor->node_id;
    $self->throw("Inconsistency in the ProteinTree: duplication_confidence_score [$duplication_confidence_score] != species_intersection_score [$species_intersection_score] -  $ancestor_node_id\n");
  }

}

sub fetch_input {

  my( $self) = @_;

  $self->{'tree_scale'} = 1;
  $self->{'store_homologies'} = 1;
  $self->{'max_gene_count'} = 200;
  $self->{all_between} = 0;
  $self->{no_between} = 0.25;

  $self->throw("No input_id") unless defined($self->input_id);

  #create a Compara::DBAdaptor which shares the same DBI handle
  #with the Pipeline::DBAdaptor that is based into this runnable
  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new
    (
     -DBCONN=>$self->db->dbc
    );

  $self->get_params($self->parameters);
  $self->get_params($self->input_id);

  if($self->{analysis_data_id}) {
    my $analysis_data_id = $self->{analysis_data_id};
    my $analysis_data_params = $self->db->get_AnalysisDataAdaptor->fetch_by_dbID($analysis_data_id);
    $self->get_params($analysis_data_params);
  }

  $self->print_params if($self->debug);

  my $starttime = time();
  $self->{'treeDBA'} = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
  $self->{homologyDBA} = $self->{'comparaDBA'}->get_HomologyAdaptor;
  $self->{'protein_tree'} =  $self->{'treeDBA'}->
         fetch_tree_at_node_id($self->{'protein_tree_id'});
  $self->check_job_fail_options;

  if($self->debug) {
    $self->{'protein_tree'}->print_tree($self->{'tree_scale'});
    printf("time to fetch tree : %1.3f secs\n" , time()-$starttime);
  }
  unless($self->{'protein_tree'}) {
    throw("undefined ProteinTree as input\n");
  }
  $self->delete_old_homologies;
  $self->delete_old_orthotree_tags;
  $self->load_species_tree;

  return 1;

}

sub genepairlink_check_dups {

  my $self = shift;
  my $genepairlink = shift;

  my ($pep1, $pep2) = $genepairlink->get_nodes;

  my $ancestor = $genepairlink->get_tagvalue('ancestor');
  my $has_dup=0;
  my %nodes_between;
  my $tnode = $pep1;
  do {
    $tnode = $tnode->parent;
    my $dup_value = $tnode->get_tagvalue("Duplication");
#    my $sis_value = $tnode->get_tagvalue("species_intersection_score");
    my $sis_value = $tnode->get_tagvalue("duplication_confidence_score");
    $dup_value = 0 unless (defined($dup_value) && $dup_value ne '');
    $sis_value = 0 unless (defined($sis_value) && $sis_value ne '');
    unless ($sis_value eq '0') {
      if($dup_value > 0) {
        $has_dup = 1;
      }
    }
    $nodes_between{$tnode->node_id} = $tnode;
  } while(!($tnode->equals($ancestor)));

  $tnode = $pep2;
  do {
    $tnode = $tnode->parent;
    my $dup_value = $tnode->get_tagvalue("Duplication");
#    my $sis_value = $tnode->get_tagvalue("species_intersection_score");
    my $sis_value = $tnode->get_tagvalue("duplication_confidence_score");
    $dup_value = 0 unless (defined($dup_value) && $dup_value ne '');
    $sis_value = 0 unless (defined($sis_value) && $sis_value ne '');
    $genepairlink->{duplication_confidence_score} = $sis_value;
    unless ($sis_value eq '0') {
      if($dup_value > 0) {
        $has_dup = 1;
      }
    }
    $nodes_between{$tnode->node_id} = $tnode;
  } while(!($tnode->equals($ancestor)));

  $genepairlink->add_tag("hops", scalar(keys(%nodes_between)));
  $genepairlink->add_tag("has_dups", $has_dup);
  return undef;
}


########################################################
#
# Classification analysis
#
########################################################

}

sub generate_attribute_arguments {

  my ($self, $protein1, $protein2, $type) = @_;

  my $new_aln1_cigarline = "";
  my $new_aln2_cigarline = "";

  my $perc_id1 = 0;
  my $perc_pos1 = 0;
  my $perc_id2 = 0;
  my $perc_pos2 = 0;
  # This speeds up the pipeline for this portion of the homology table
  if ($type eq 'between_species_paralog') {
    return ($new_aln1_cigarline, $perc_id1, $perc_pos1, $new_aln2_cigarline, $perc_id2, $perc_pos2);
  }

  my $identical_matches = 0;
  my $positive_matches = 0;
  my $m_hash = $self->get_matrix_hash;

  my ($aln1state, $aln2state);
  my ($aln1count, $aln2count);

  # my @aln1 = split(//, $protein1->alignment_string); # Speed up
  # my @aln2 = split(//, $protein2->alignment_string);
  my $alignment_string = $protein1->alignment_string;
  my @aln1 = unpack("A1" x length($alignment_string), $alignment_string);
  $alignment_string = $protein2->alignment_string;
  my @aln2 = unpack("A1" x length($alignment_string), $alignment_string);

  for (my $i=0; $i <= $#aln1; $i++) {
    next if ($aln1[$i] eq "-" && $aln2[$i] eq "-");
    my ($cur_aln1state, $cur_aln2state) = qw(M M);
    if ($aln1[$i] eq "-") {
      $cur_aln1state = "D";
    }
    if ($aln2[$i] eq "-") {
      $cur_aln2state = "D";
    }
    if ($cur_aln1state eq "M" && $cur_aln2state eq "M" && $aln1[$i] eq $aln2[$i]) {
      $identical_matches++;
      $positive_matches++;
    } elsif ($cur_aln1state eq "M" && $cur_aln2state eq "M" && $m_hash->{uc $aln1[$i]}{uc $aln2[$i]} > 0) {
        $positive_matches++;
    }
    unless (defined $aln1state) {
      $aln1count = 1;
      $aln2count = 1;
      $aln1state = $cur_aln1state;
      $aln2state = $cur_aln2state;
      next;
    }
    if ($cur_aln1state eq $aln1state) {
      $aln1count++;
    } else {
      if ($aln1count == 1) {
        $new_aln1_cigarline .= $aln1state;
      } else {
        $new_aln1_cigarline .= $aln1count.$aln1state;
      }
      $aln1count = 1;
      $aln1state = $cur_aln1state;
    }
    if ($cur_aln2state eq $aln2state) {
      $aln2count++;
    } else {
      if ($aln2count == 1) {
        $new_aln2_cigarline .= $aln2state;
      } else {
        $new_aln2_cigarline .= $aln2count.$aln2state;
      }
      $aln2count = 1;
      $aln2state = $cur_aln2state;
    }
  }
  if ($aln1count == 1) {
    $new_aln1_cigarline .= $aln1state;
  } else {
    $new_aln1_cigarline .= $aln1count.$aln1state;
  }
  if ($aln2count == 1) {
    $new_aln2_cigarline .= $aln2state;
  } else {
    $new_aln2_cigarline .= $aln2count.$aln2state;
  }
  my $seq_length1 = $protein1->seq_length;
  unless (0 == $seq_length1) {
    $perc_id1 = $identical_matches*100.0/$seq_length1;
    $perc_pos1 = $positive_matches*100.0/$seq_length1;
  }
  my $seq_length2 = $protein2->seq_length;
  unless (0 == $seq_length2) {
    $perc_id2 = $identical_matches*100.0/$seq_length2;
    $perc_pos2 = $positive_matches*100.0/$seq_length2;
  }

#   my $perc_id1 = $identical_matches*100.0/$protein1->seq_length;
#   my $perc_pos1 = $positive_matches*100.0/$protein1->seq_length;
#   my $perc_id2 = $identical_matches*100.0/$protein2->seq_length;
#   my $perc_pos2 = $positive_matches*100.0/$protein2->seq_length;

  return ($new_aln1_cigarline, $perc_id1, $perc_pos1, $new_aln2_cigarline, $perc_id2, $perc_pos2);

}

sub get_ancestor_species_hash {

  my $self = shift;
  my $node = shift;

  my $species_hash = $node->get_tagvalue('species_hash');
  return $species_hash if($species_hash);

  $species_hash = {};
  my $duplication_hash = {};
  my $is_dup=0;

  if($node->isa('Bio::EnsEMBL::Compara::AlignedMember')) {
    my $node_genome_db_id = $node->genome_db_id;
    $species_hash->{$node_genome_db_id} = 1;
    $node->add_tag('species_hash', $species_hash);
    return $species_hash;
  }

  foreach my $child (@{$node->children}) {
    my $t_species_hash = $self->get_ancestor_species_hash($child);
    next unless(defined($t_species_hash)); #shouldn't happen
    foreach my $genome_db_id (keys(%$t_species_hash)) {
      unless(defined($species_hash->{$genome_db_id})) {
        $species_hash->{$genome_db_id} = $t_species_hash->{$genome_db_id};
      } else {
        #this species already existed in one of the other children
        #this means this species was duplicated at this point between
        #the species
        $is_dup=1;
        $duplication_hash->{$genome_db_id} = 1;
        $species_hash->{$genome_db_id} += $t_species_hash->{$genome_db_id};
      }
    }
  }
  
  #printf("\ncalc_ancestor_species_hash : %s\n", $self->encode_hash($species_hash));
  #$node->print_tree(20);
  
  $node->add_tag("species_hash", $species_hash);
  if($is_dup && !($self->{'_treefam'})) {
    my $original_duplication_value = $node->get_tagvalue("Duplication");
    $original_duplication_value = 0 
      unless (defined $original_duplication_value && $original_duplication_value ne '');

    if ($original_duplication_value == 0) {
      # RAP did not predict a duplication here
      $node->add_tag("duplication_hash", $duplication_hash);
      $node->store_tag("Duplication", 1)  unless ($self->{'_readonly'});
      $node->store_tag("Duplication_alg", 'species_count') 
        unless ($self->{'_readonly'});

    } elsif ($original_duplication_value == 1) {
      my $dup_alg = $node->get_tagvalue("Duplication_alg");
      if (defined $dup_alg and $dup_alg ne 'species_count') {
        # RAP did predict a duplication here but not species_count
        $node->add_tag("duplication_hash", $duplication_hash);
        $node->store_tag("Duplication", 2) unless ($self->{'_readonly'});
        $node->store_tag("Duplication_alg", 'species_count') 
          unless ($self->{'_readonly'});
      }
    }
  }
  return $species_hash;

}

sub get_ancestor_taxon_level {

  my $self = shift;
  my $ancestor = shift;

  my $taxon_level = $ancestor->get_tagvalue('taxon_level');
  return $taxon_level if($taxon_level);

  #printf("\ncalculate ancestor taxon level\n");
  my $taxon_tree = $self->{'taxon_tree'};
  my $species_hash = $self->get_ancestor_species_hash($ancestor);

  foreach my $gdbID (keys(%$species_hash)) {
    my $gdb = 
      $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_dbID($gdbID);
    my $taxon = $taxon_tree->find_node_by_node_id($gdb->taxon_id);

    unless($taxon) {
      throw("oops missing taxon " . $gdb->taxon_id ."\n");
    }

    if($taxon_level) {
      $taxon_level = $taxon_level->find_first_shared_ancestor($taxon);
    } else {
      $taxon_level = $taxon;
    }
  }
  my $taxon_id = $taxon_level->get_tagvalue("taxon_id");
  $ancestor->add_tag("taxon_level", $taxon_level);
  $ancestor->store_tag("taxon_id", $taxon_id) 
    unless ($self->{'_readonly'});
  $ancestor->store_tag("taxon_name", $taxon_level->name) 
    unless ($self->{'_readonly'});

  #$ancestor->print_tree($self->{'tree_scale'});
  #$taxon_level->print_tree(10);

  return $taxon_level;

}

sub get_matrix_hash {

  my $self = shift;

  return $self->{'matrix_hash'} if (defined $self->{'matrix_hash'});

  my $BLOSUM62 = "#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
";
  my $matrix_string;
  my @lines = split(/\n/,$BLOSUM62);
  foreach my $line (@lines) {
    next if ($line =~ /^\#/);
    if ($line =~ /^[A-Z\*\s]+$/) {
      $matrix_string .= sprintf "$line\n";
    } else {
      my @t = split(/\s+/,$line);
      shift @t;
      #       print scalar @t,"\n";
      $matrix_string .= sprintf(join(" ",@t)."\n");
    }
  }

  my %matrix_hash;
  @lines = ();
  @lines = split /\n/, $matrix_string;
  my $lts = shift @lines;
  $lts =~ s/^\s+//;
  $lts =~ s/\s+$//;
  my @letters = split /\s+/, $lts;

  foreach my $letter (@letters) {
    my $line = shift @lines;
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    my @penalties = split /\s+/, $line;
    die "Size of letters array and penalties array are different\n"
      unless (scalar @letters == scalar @penalties);
    for (my $i=0; $i < scalar @letters; $i++) {
      $matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i];
    }
  }

  $self->{'matrix_hash'} =\% matrix_hash;

  return $self->{'matrix_hash'};
}

1;

}

sub inspecies_paralog_test {

  my $self = shift;
  my $genepairlink = shift;

  #simplest paralog test: 
  #  - both genes are from the same species
  #  - and just label with taxonomic level

  my ($pep1, $pep2) = $genepairlink->get_nodes;
  return undef unless($pep1->genome_db_id == $pep2->genome_db_id);

  my $ancestor = $genepairlink->get_tagvalue('ancestor');
  my $taxon = $self->get_ancestor_taxon_level($ancestor);

  #my $species_hash = $self->get_ancestor_species_hash($ancestor);
  #foreach my $gdbID (keys(%$species_hash)) {
  #  return undef unless($gdbID == $pep1->genome_db_id);
  #}

  #passed all the tests -> it's an inspecies_paralog
#  $genepairlink->add_tag("orthotree_type", 'inspecies_paralog');
#  $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
  $genepairlink->add_tag("orthotree_type", 'within_species_paralog');
  $genepairlink->add_tag("orthotree_subtype", $taxon->name);
  # Duplication_confidence_score
  if ('' eq $ancestor->get_tagvalue("duplication_confidence_score")) {
    $self->duplication_confidence_score($ancestor);
    $genepairlink->{duplication_confidence_score} = $ancestor->get_tagvalue("duplication_confidence_score");
  }
  return 1;

}

sub one2many_ortholog_test {

  my $self = shift;
  my $genepairlink = shift;

  #test 4: getting a bit more complex yet again:
  #  - genes are from different species
  #  - but there is evidence for duplication events in the history
  #  - one of the genes is the only remaining representative of the
  #  ancestor in its species
  #  - but the other gene has multiple copies in it's species 
  #  (first level of orthogroup analysis)

  my ($pep1, $pep2) = $genepairlink->get_nodes;
  return undef if($pep1->genome_db_id == $pep2->genome_db_id);

  my $ancestor = $genepairlink->get_tagvalue('ancestor');
  my $species_hash = $self->get_ancestor_species_hash($ancestor);

  my $count1 = $species_hash->{$pep1->genome_db_id};
  my $count2 = $species_hash->{$pep2->genome_db_id};

  #one of the genes must be the only copy of the gene
  #and the other must appear more than once in the ancestry
  return undef unless 
    (
     ($count1==1 and $count2>1) or ($count1>1 and $count2==1)
    );

  my $dup_value = $ancestor->get_tagvalue("Duplication");
#  my $sis_value = $ancestor->get_tagvalue("species_intersection_score");
  my $sis_value = $ancestor->get_tagvalue("duplication_confidence_score");
  if($dup_value > 0 && $sis_value ne '0') {
    return undef;
  }

  #passed all the tests -> it's a one2many ortholog
#  $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
  $genepairlink->add_tag("orthotree_type", 'ortholog_one2many');
  my $taxon = $self->get_ancestor_taxon_level($ancestor);
  $genepairlink->add_tag("orthotree_subtype", $taxon->name);
  return 1;

}

sub outspecies_test {

  my $self = shift;
  my $genepairlink = shift;

  #last test: left over pairs:
  #  - genes are from different species
  #  - if ancestor is 'DUP' -> paralog else 'ortholog'

  my ($pep1, $pep2) = $genepairlink->get_nodes;
  return undef if($pep1->genome_db_id == $pep2->genome_db_id);

  my $ancestor = $genepairlink->get_tagvalue('ancestor');
  my $taxon = $self->get_ancestor_taxon_level($ancestor);

  #ultra simple ortho/paralog classification
  my $dup_value = $ancestor->get_tagvalue("Duplication");
#  my $sis_value = $ancestor->get_tagvalue("species_intersection_score");
  my $sis_value = $ancestor->get_tagvalue("duplication_confidence_score");
  unless ($dup_value eq '') {
    if($dup_value > 0 && $sis_value ne '0') {
#      $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
      $genepairlink->add_tag("orthotree_type", 'between_species_paralog');
      $genepairlink->add_tag("orthotree_subtype", $taxon->name);
      # Duplication_confidence_score
      if ('' eq $ancestor->get_tagvalue("duplication_confidence_score")) {
        $self->duplication_confidence_score($ancestor);
       $genepairlink->{duplication_confidence_score} = $ancestor->get_tagvalue("duplication_confidence_score");
      }
    } else {
#      $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
      $genepairlink->add_tag("orthotree_type", 'ortholog_many2many');
      $genepairlink->add_tag("orthotree_subtype", $taxon->name);
    }
  }
  return 1;
}


########################################################
#
# ProteinTree input/output section
#
########################################################

}

sub run_analysis {

  my $self = shift;

  my $starttime = time()*1000;
  my $tmp_time = time();
  my $tree = $self->{'protein_tree'};

  print "Getting all leaves\n";
  my @all_protein_leaves = @{$tree->get_all_leaves};
  my $tree_node_id = $tree->node_id;

  #precalculate the ancestor species_hash (caches into the metadata of
  #nodes) also augments the Duplication tagging
  printf("Calculating ancestor species hash\n") if ($self->debug);
  $self->get_ancestor_species_hash($tree);

  if($self->debug) {
    $self->{'protein_tree'}->print_tree($self->{'tree_scale'});
    printf("%d proteins in tree\n", scalar(@all_protein_leaves));
  }

  #compare every gene in the tree with every other each gene/gene
  #pairing is a potential ortholog/paralog and thus we need to analyze
  #every possibility
  #Accomplish by creating a fully connected graph between all the
  #genes under the tree (hybrid graph structure) and then analyze each
  #gene/gene link
  $tmp_time = time();
  printf("build fully linked graph\n") if($self->debug);
  my @genepairlinks;
  my $graphcount = 0;
  while (my $protein1 = shift @all_protein_leaves) {
    foreach my $protein2 (@all_protein_leaves) {
      my $ancestor = $protein1->find_first_shared_ancestor($protein2);
      # Line below will only become faster than above if we find a way to calculate long parent->parent journeys.
      # This is probably doable by looking at the right1/left1 right2/left2 distances between the 2 proteins
      # my $ancestor_indexed = $self->{'treeDBA'}->fetch_first_shared_ancestor_indexed($protein1,$protein2);
      my $taxon_level = $self->get_ancestor_taxon_level($ancestor);
      my $distance = $protein1->distance_to_ancestor($ancestor) +
                     $protein2->distance_to_ancestor($ancestor);
      my $genepairlink = new Bio::EnsEMBL::Compara::Graph::Link
        (
         $protein1, $protein2, $distance
        );
      $genepairlink->add_tag("hops", 0);
      $genepairlink->add_tag("ancestor", $ancestor);
      $genepairlink->add_tag("taxon_name", $taxon_level->name);
      $genepairlink->add_tag("tree_node_id", $tree_node_id);
      push @genepairlinks, $genepairlink;
    }
    print STDERR "build graph $graphcount\n" if ($graphcount++ % 10 == 0);
  }
  printf("%1.3f secs build links and features\n", time()-$tmp_time) 
    if($self->debug>1);

  $self->{'protein_tree'}->print_tree($self->{'tree_scale'}) 
    if($self->debug);

  #sort the gene/gene links by distance
  #   makes debug display easier to read, not required by algorithm
  $tmp_time = time();
  printf("sort links\n") if($self->debug);
  my @sorted_genepairlinks = 
    sort {$a->distance_between <=> $b->distance_between} @genepairlinks;
  printf("%1.3f secs to sort links\n", time()-$tmp_time) if($self->debug > 1);

  #analyze every gene pair (genepairlink) to get its classification
  printf("analyze links\n") if($self->debug);
  printf("%d links\n", scalar(@genepairlinks)) if ($self->debug);
  $tmp_time = time();
  $self->{'old_homology_count'} = 0;
  $self->{'orthotree_homology_counts'} = {};
  foreach my $genepairlink (@sorted_genepairlinks) {
    $self->analyze_genepairlink($genepairlink);
  }
  printf("%1.3f secs to analyze genepair links\n", time()-$tmp_time) 
    if($self->debug > 1);
  
  #display summary stats of analysis 
  my $runtime = time()*1000-$starttime;  
  $self->{'protein_tree'}->store_tag('OrthoTree_runtime_msec', $runtime) 
    unless ($self->{'_readonly'});
  if($self->debug) {
    printf("%d proteins in tree\n", scalar(@{$tree->get_all_leaves}));
    printf("%d pairings\n", scalar(@genepairlinks));
    printf("%d old homologies\n", $self->{'old_homology_count'});
    printf("orthotree homologies\n");
    foreach my $type (keys(%{$self->{'orthotree_homology_counts'}})) {
      printf
        (
         "  %13s : %d\n", $type, $self->{'orthotree_homology_counts'}->{$type}
        );
    }
  }
  $self->{'homology_links'} =\@ sorted_genepairlinks;
  $DB::single=1;1;
  return undef;

}

sub store_gene_link_as_homology {

  my $self = shift;
  my $genepairlink  = shift;

  my $type = $genepairlink->get_tagvalue('orthotree_type');
  return unless($type);
  my $subtype = $genepairlink->get_tagvalue('taxon_name');
  my $ancestor = $genepairlink->get_tagvalue('ancestor');
  my $tree_node_id = $genepairlink->get_tagvalue('tree_node_id');
  warn("Tag tree_node_id undefined\n") unless(defined($tree_node_id) && $tree_node_id ne '');

  my ($protein1, $protein2) = $genepairlink->get_nodes;

  #
  # create method_link_species_set
  #
  my $mlss = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
  $mlss->method_link_type("ENSEMBL_ORTHOLOGUES") 
    unless ($type eq 'between_species_paralog' || $type eq 'within_species_paralog');
  $mlss->method_link_type("ENSEMBL_PARALOGUES") if ($type eq 'between_species_paralog' || $type eq 'within_species_paralog');
  if ($protein1->genome_db->dbID == $protein2->genome_db->dbID) {
    $mlss->species_set([$protein1->genome_db]);
  } else {
    $mlss->species_set([$protein1->genome_db, $protein2->genome_db]);
  }
  $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor->store($mlss) unless ($self->{'_readonly'});

  # create an Homology object
  my $homology = new Bio::EnsEMBL::Compara::Homology;
  $homology->description($type);
  $homology->subtype($subtype);
  # $homology->node_id($ancestor->node_id);
  $homology->ancestor_node_id($ancestor->node_id);
  $homology->tree_node_id($tree_node_id);
  $homology->method_link_type($mlss->method_link_type);
  $homology->method_link_species_set($mlss);

  my $key = $mlss->dbID . "_" . $protein1->dbID;
  $self->{_homology_consistency}{$key}{$type} = 1;
  #$homology->dbID(-1);

  # NEED TO BUILD THE Attributes (ie homology_members)
  my ($cigar_line1, $perc_id1, $perc_pos1,
      $cigar_line2, $perc_id2, $perc_pos2) = 
        $self->generate_attribute_arguments($protein1, $protein2,$type);

  # QUERY member
  #
  my $attribute;
  $attribute = new Bio::EnsEMBL::Compara::Attribute;
  $attribute->peptide_member_id($protein1->dbID);
  $attribute->cigar_line($cigar_line1);
  $attribute->perc_cov(100);
  $attribute->perc_id(int($perc_id1));
  $attribute->perc_pos(int($perc_pos1));

  $homology->add_Member_Attribute([$protein1->gene_member, $attribute]);

  #
  # HIT member
  #
  $attribute = new Bio::EnsEMBL::Compara::Attribute;
  $attribute->peptide_member_id($protein2->dbID);
  $attribute->cigar_line($cigar_line2);
  $attribute->perc_cov(100);
  $attribute->perc_id(int($perc_id2));
  $attribute->perc_pos(int($perc_pos2));

  $homology->add_Member_Attribute([$protein2->gene_member, $attribute]);

  ## Check if it has already been stored, in which case we dont need to store again
  my $matching_homology = 0;
  if ($self->input_job->retry_count > 0 && !defined($self->{old_homologies_deleted})) {
    my $member_id1 = $protein1->gene_member->member_id;
    my $member_id2 = $protein2->gene_member->member_id;
    if ($member_id1 == $member_id2) {
      my $tree_id = $self->{protein_tree}->node_id;
      my $pmember_id1 = $protein1->member_id; my $pstable_id1 = $protein1->stable_id;
      my $pmember_id2 = $protein2->member_id; my $pstable_id2 = $protein2->stable_id;
      $self->throw("$member_id1 ($pmember_id1 - $pstable_id1) and $member_id2 ($pmember_id2 - $pstable_id2) shouldn't be the same");
    }
    my $stored_homology = @{$self->{homologyDBA}->fetch_by_Member_id_Member_id($member_id1,$member_id2)}[0];
    if (defined($stored_homology)) {
      $matching_homology = 1;
      $matching_homology = 0 if ($stored_homology->description ne $homology->description);
      $matching_homology = 0 if ($stored_homology->subtype ne $homology->subtype);
      $matching_homology = 0 if ($stored_homology->ancestor_node_id ne $homology->ancestor_node_id);
      $matching_homology = 0 if ($stored_homology->tree_node_id ne $homology->tree_node_id);
      $matching_homology = 0 if ($stored_homology->method_link_type ne $homology->method_link_type);
      $matching_homology = 0 if ($stored_homology->method_link_species_set->dbID ne $homology->method_link_species_set->dbID);
    }

    # Delete old one, then proceed to store new one
    if (defined($stored_homology) && (0 == $matching_homology)) {
      my $homology_id = $stored_homology->dbID;
      my $sql1 = "delete from homology where homology_id=$homology_id";
      my $sql2 = "delete from homology_member where homology_id=$homology_id";
      my $sth1 = $self->dbc->prepare($sql1);
      my $sth2 = $self->dbc->prepare($sql2);
      $sth1->execute;
      $sth2->execute;
      $sth1->finish;
      $sth2->finish;
    }
  }
  if($self->{'store_homologies'} && 0 == $matching_homology) {
    $self->{'homologyDBA'}->store($homology);
  }

  my $stable_id;
  if($protein1->taxon_id < $protein2->taxon_id) {
    $stable_id = $protein1->taxon_id . "_" . $protein2->taxon_id . "_";
  } else {
    $stable_id = $protein2->taxon_id . "_" . $protein1->taxon_id . "_";
  }
  $stable_id .= sprintf ("%011.0d",$homology->dbID) if($homology->dbID);
  $homology->stable_id($stable_id);
  #TODO: update the stable_id of the homology

#   if($self->debug) {
#     print("store: ");
#     $homology->print_homology;
#   }

  return undef;

}

sub store_homologies {

  my $self = shift;
  my $a=1;
  my $hlinkscount = 0;
  foreach my $genepairlink (@{$self->{'homology_links'}}) {
    $self->display_link_analysis($genepairlink) if($self->debug>2);
#     unless (defined($self->{all_between})) {
    my $type = $genepairlink->get_tagvalue("orthotree_type");
    my $dcs = $genepairlink->{duplication_confidence_score};
    $DB::single=$a;1;
    next if ($type eq 'between_species_paralog' && $dcs > $self->{no_between});
#     }
    $self->store_gene_link_as_homology($genepairlink);
    print STDERR "homology links $hlinkscount\n" if ($hlinkscount++ % 500 == 0);
  }

  my $counts_str = $self->encode_hash($self->{'orthotree_homology_counts'});
  printf("$counts_str\n");

  $self->check_homology_consistency;

  $self->{'protein_tree'}->store_tag(
      'OrthoTree_types_hashstr', 
      $self->encode_hash($self->{'orthotree_homology_counts'})) unless ($self->{'_readonly'});

  # This has to go to OrthoTree because we haven't stored taxon_id and taxon_name yet here
  # Go through and calculate species sampling and count.
    # Create our taxonomic tree.
#   my $ta = $self->{'comparaDBA'}->get_NCBITaxonAdaptor;
#   my $gdb_list = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_all;
#   my $taxon_tree=undef;
#   foreach my $gdb (@{$gdb_list}) {
#     next if ($gdb->name =~ /Ancestral/);
#     next if ($gdb->name =~ /ilurana/);
#     my $taxon = $ta->fetch_node_by_taxon_id($gdb->taxon_id);
#     $taxon->no_autoload_children;
#     $taxon_tree = $taxon->root unless($taxon_tree);
#     $taxon_tree->merge_node_via_shared_ancestor($taxon);
#   }
#   $taxon_tree = $taxon_tree->minimize_tree;
#   my $tree_species_sampling = 0;
#   my $tree_species_num = 0;
#   my $num_leaves = $self->{'protein_tree'}->num_leaves;
#   if ($num_leaves >= 4) {
#     my $taxon_id = $self->{'protein_tree'}->get_tagvalue("taxon_id");
#     my $taxon_name = $self->{'protein_tree'}->get_tagvalue("taxon_name");
#     #print "ID: $taxon_id  $taxon_name\n";
#     my $potential_species = scalar @{$taxon_tree->find_node_by_node_id($taxon_id)->get_all_leaves};
#     if ($potential_species > 1) {
#       my %species;
#       my @species_array = map {$_->taxon->name} @{$self->{'protein_tree'}->get_all_leaves};  # Create an array of species names.
#       @species{@species_array} = (1) x @species_array;  # Fill the hashtable with the taxon names of all represented species.
#       my $represented_species = scalar keys %species; # Count up the number of unique represented species.
#       my $species_sampling = $represented_species / $potential_species;
#       # Tree species sampling.
#       $tree_species_sampling = sprintf("%.4f",$species_sampling);  # Format the decimals.
#       # Tree species number.
#       $tree_species_num = $represented_species;
#     } else {
#       $tree_species_sampling = 1;
#       $tree_species_num = 1;
#     }
#   }
#   $self->{'protein_tree'}->store_tag("tree_species_sampling",$tree_species_sampling);
#   $self->{'protein_tree'}->store_tag("tree_species_num",$tree_species_num);
  $DB::single=1;1;
  return undef;

}

DESTROY	No description	Code
_slurp	No description	Code
_treefam_genepairlink_stats	No description	Code
analyze_genepairlink	No description	Code
ancient_residual_test	No description	Code
check_homology_consistency	No description	Code
check_job_fail_options	No description	Code
delete_old_homologies	No description	Code
delete_old_homologies_old	No description	Code
delete_old_orthotree_tags	No description	Code
direct_ortholog_test	No description	Code
display_link_analysis	No description	Code
duplication_confidence_score	No description	Code
fetch_input	Description	Code
genepairlink_check_dups	No description	Code
genepairlink_fetch_homology	No description	Code
generate_attribute_arguments	No description	Code
get_ancestor_species_hash	No description	Code
get_ancestor_taxon_level	No description	Code
get_matrix_hash	No description	Code
get_params	No description	Code
inspecies_paralog_test	No description	Code
load_species_tree	No description	Code
load_species_tree_from_file	No description	Code
load_species_tree_from_tax	No description	Code
one2many_ortholog_test	No description	Code
outspecies_test	No description	Code
print_params	No description	Code
run	Description	Code
run_analysis	No description	Code
store_gene_link_as_homology	No description	Code
store_homologies	No description	Code
write_output	Description	Code