Bio::EnsEMBL::Compara::RunnableDB HomologyCluster
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Compara::RunnableDB::HomologyCluster
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
Bio::EnsEMBL::Compara::Homology
Bio::EnsEMBL::Compara::NestedSet
Bio::EnsEMBL::Hive
Data::UUID
Switch
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Bio::EnsEMBL::Hive::Process
Synopsis
my $aa = $sdba->get_AnalysisAdaptor;
my $analysis = $aa->fetch_by_logic_name('HomologyCluster');
my $rdb = new Bio::EnsEMBL::Compara::RunnableDB::HomologyCluster(
-input_id => "{'species_set'=>[1,2,3,14]}",
-analysis => $analysis);
$rdb->fetch_input
$rdb->run;
Description
This is a compara specific runnableDB, that based on an input_id
of arrayrefs of genome_db_ids, and from this species set realtionship
it will search through the homology data and build SingleLinkage Clusters
and store them into a NestedSet datastructure. This is the first step in
the Tree analysis production system.
Methods
build_cluster_around_gene_stable_id
No description
Code
build_homology_clusters
No description
Code
dataflow_clusters
No description
Code
fetch_homology_peptide_pairs_by_mlss
No description
Code
fetch_input
No description
Code
get_orthologue_cluster
No description
Code
get_params
No description
Code
grow_clusters_with_homology
No description
Code
grow_dbclusters_with_peppair
No description
Code
grow_memclusters_with_peppairDescriptionCode
run
No description
Code
store_clusters
No description
Code
write_output
No description
Code
Methods description
grow_memclusters_with_peppaircode    nextTop
  Description: Takes a pair of peptide_member_id and uses the NestedSet objects
to build a 3 layer tree in memory. There is a single root for the entire build
process, and each cluster is a child of this root. The members are children of
the clusters. During the build process the member leaves are assigned a node_id
equal to the peptide_member_id so they can be found via 'find_node_by_node_id'.
After the process is completed, each cluster can then be stored in a faster
bulk insert process.
Methods code
build_cluster_around_gene_stable_iddescriptionprevnextTop
sub build_cluster_around_gene_stable_id {
  my $self = shift;
  my $gene_stable_id = shift;

  my $root = new Bio::EnsEMBL::Compara::NestedSet;
  $root->name("PROTEIN_TREES");
  $self->{'comparaDBA'}->get_ProteinTreeAdaptor->store($root);

  my $MA = $self->{'comparaDBA'}->get_MemberAdaptor;
  my $gene_member = $MA->fetch_by_source_stable_id("ENSEMBLGENE", $gene_stable_id);

  throw("couldn't find gene member $gene_stable_id") unless($gene_member);
  
  my $start = time();
  my $ortho_set = {};
  my $member_set = {};
  $self->get_orthologue_cluster($gene_member, $ortho_set, $member_set, 0);

  printf("cluster has %d links\n", scalar(keys(%{$ortho_set})));
  printf("cluster has %d genes\n", scalar(keys(%{$member_set})));
  printf("%1.3f msec\n", 1000.0*(time() - $start));

  foreach my $homology (values(%{$ortho_set})) {
    $self->grow_clusters_with_homology($root, $homology);
  }

  printf("cluster has %d genes\n", scalar(keys(%{$member_set})));
  foreach my $member (values(%{$member_set})) {
    $member->print_member;
  }
  $root->print_tree;
}
build_homology_clustersdescriptionprevnextTop
sub build_homology_clusters {
  my $self = shift;
  
  my $build_mode = 'direct';
  
  return unless($self->{'species_set'});
  my @species_set = @{$self->{'species_set'}};
  return unless @species_set;

  my $mlssDBA = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor;
  my $homologyDBA = $self->{'comparaDBA'}->get_HomologyAdaptor;
  my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
  my $starttime = time();
   
  my $root = new Bio::EnsEMBL::Compara::NestedSet;
  $root->name("PROTEIN_TREES");
  $treeDBA->store($root);
  printf("root_id %d\n", $root->node_id);
    
  #
# create Cluster MLSS
#
$self->{'cluster_mlss'} = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet; $self->{'cluster_mlss'}->method_link_type('PROTEIN_TREES'); my @genomeDB_set; foreach my $gdb_id (@species_set) { my $gdb = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_dbID($gdb_id); push @genomeDB_set, $gdb; } $self->{'cluster_mlss'}->species_set(\@genomeDB_set); $mlssDBA->store($self->{'cluster_mlss'}); printf("MLSS %d\n", $self->{'cluster_mlss'}->dbID); $self->{'member_leaves'} = {}; my $ug = new Data::UUID; #
# load all the Paralogue pairs, building clusters as we load
# get all homologies for each MLSS
#
foreach my $gdb_id1 (@species_set) { printf("\nfind MLSS for genome %d\n", $gdb_id1); my $mlss = $mlssDBA->fetch_by_method_link_type_genome_db_ids ("ENSEMBL_PARALOGUES",[$gdb_id1]); next unless(defined($mlss)); printf("fetch all paralogues for mlss_id=%d\n", $mlss->dbID); $starttime = time(); my $homology_list = $self->fetch_homology_peptide_pairs_by_mlss($mlss); printf(" %1.3f secs to fetch paralogues\n", (time()-$starttime)); printf(" %d paralogue pairs\n", scalar(@{$homology_list})); $starttime = time(); while(@{$homology_list}) { my $pep_pair = shift @{$homology_list}; $self->grow_memclusters_with_peppair($root, $pep_pair); } printf(" %1.3f secs to load/process homologies\n", (time()-$starttime)); } #
#get all MLSS for each homology pair in this species set
#get all homologies for each MLSS
#
while (my $gdb_id1 = shift @species_set) { foreach my $gdb_id2 (@species_set) { printf("\nfind MLSS for genome pair %d/%d\n", $gdb_id1, $gdb_id2); my $mlss = $mlssDBA->fetch_by_method_link_type_genome_db_ids ("ENSEMBL_ORTHOLOGUES",[$gdb_id1, $gdb_id2]); next unless(defined($mlss)); printf("fetch all homologies for mlss_id=%d\n", $mlss->dbID); $starttime = time(); my $homology_list = undef; switch ($build_mode) { case 'API' { $homology_list = $homologyDBA->fetch_all_by_MethodLinkSpeciesSet($mlss); } case 'direct' { $homology_list = $self->fetch_homology_peptide_pairs_by_mlss($mlss); } } printf(" %1.3f secs to fetch homologies\n", (time()-$starttime)); printf(" %d homologies\n", scalar(@{$homology_list})); $starttime = time(); my $counter=0; while(@{$homology_list}) { $counter++; #if($counter % 200 == 0) { printf("%10d homologies done\n", $counter); }
switch($build_mode) { case 'API' { my $homology = shift @{$homology_list}; $self->grow_clusters_with_homology($root, $homology); } case 'direct' { my $pep_pair = shift @{$homology_list}; $self->grow_memclusters_with_peppair($root, $pep_pair); } } } printf(" %1.3f secs to load/process homologies\n", (time()-$starttime)); } } if($build_mode eq 'direct') { $self->store_clusters($root); } $self->dataflow_clusters($root);
}
dataflow_clustersdescriptionprevnextTop
sub dataflow_clusters {
  my $self = shift;
  my $root = shift;
  
  my $clusters = $root->children;
  foreach my $cluster (@{$clusters}) {
    my $output_id = sprintf("{'protein_tree_id'=>%d}", $cluster->node_id);
    $self->dataflow_output_id($output_id, 2);
  }
}


1;
}
fetch_homology_peptide_pairs_by_mlssdescriptionprevnextTop
sub fetch_homology_peptide_pairs_by_mlss {
  my $self = shift;
  my $mlss = shift;
  
  my $starttime = time();

  my $sql = "SELECT hm.homology_id, peptide_member_id ".
            "FROM homology_member hm join homology using (homology_id ) ".
            "WHERE method_link_species_set_id = ? order by hm.homology_id";
  #print("$sql\n");
my $sth = $self->dbc->prepare($sql); $sth->execute($mlss->dbID); #print(" done with fetch\n");
my $homology_hash = {}; while( my $ref = $sth->fetchrow_arrayref() ) { my ($homology_id, $peptide_member_id) = @$ref; push @{$homology_hash->{$homology_id}}, $peptide_member_id; } $sth->finish; #printf("loaded %d homologies pep_pairs in %1.3f secs\n", scalar(keys(%$homology_hash)), (time()-$starttime));
my @pairs = values(%$homology_hash); return\@ pairs;
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_;

  $self->{'species_set'} = undef;
  $self->throw("No input_id") unless defined($self->input_id);

  #create a Compara::DBAdaptor which shares the same DBI handle
#with the pipeline DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc); $self->get_params($self->input_id); return 1;
}
get_orthologue_clusterdescriptionprevnextTop
sub get_orthologue_cluster {
  my $self = shift;
  my $gene = shift;
  my $ortho_set = shift;
  my $member_set = shift;

  return if($member_set->{$gene->dbID});

  $gene->print_member("query gene\n") if($self->debug);
  $member_set->{$gene->dbID} = $gene;

  my $homologies = $self->{'comparaDBA'}->get_HomologyAdaptor->fetch_by_Member($gene);
  printf("fetched %d homologies\n", scalar(@$homologies)) if($self->debug);

  foreach my $homology (@{$homologies}) {
    next if($ortho_set->{$homology->dbID});
    next if($homology->method_link_type ne 'ENSEMBL_ORTHOLOGUES');

    foreach my $member_attribute (@{$homology->get_all_Member_Attribute}) {
      my ($member, $attribute) = @{$member_attribute};
      next if($member->dbID == $gene->dbID); #skip query gene
$member->print_member if($self->debug); printf("adding homology_id %d to cluster\n", $homology->dbID) if($self->debug); $ortho_set->{$homology->dbID} = $homology; $self->get_orthologue_cluster($member, $ortho_set, $member_set, $self->debug); } } printf("done with search query %s\n", $gene->stable_id) if($self->debug); } #########################################################################
#
# new fast algorithm idea:
# AVOIDS most of the API :(
# 1) preload all members from all genomes into MLSS linked group
# a) create 1 'un parented' node and member for each peptide
# 2) load 'homologies' as simple peptide_member_id pair SQL query
# 3) loop through peptide_pair, use root_id,parent_id to grow clusters
#
#
#########################################################################
}
get_paramsdescriptionprevnextTop
sub get_params {
  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n");
  
  my $params = eval($param_string);
  return unless($params);

  foreach my $key (keys %$params) {
    print("  $key : ", $params->{$key}, "\n");
  }

  if (defined $params->{'species_set'}) {
    $self->{'species_set'} = $params->{'species_set'};
  }
  if (defined $params->{'gene_stable_id'}) {
    $self->{'gene_stable_id'} = $params->{'gene_stable_id'};
  }
  
  return;
}
grow_clusters_with_homologydescriptionprevnextTop
sub grow_clusters_with_homology {
  my $self = shift;
  my $root = shift;
  my $homology = shift;
  

  my ($alignedMember1, $alignedMember2) = $homology->get_AlignedMember_pair;
  $alignedMember1->method_link_species_set_id($self->{'cluster_mlss'}->dbID);
  $alignedMember2->method_link_species_set_id($self->{'cluster_mlss'}->dbID);  
  
  my $proteinTreeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
  my $treeMember1 = $proteinTreeDBA->fetch_AlignedMember_by_member_id_mlssID(
              $alignedMember1->member_id, $self->{'cluster_mlss'}->dbID);
  my $treeMember2 = $proteinTreeDBA->fetch_AlignedMember_by_member_id_mlssID(
              $alignedMember2->member_id, $self->{'cluster_mlss'}->dbID);

  if(!defined($treeMember1) and !defined($treeMember2)) {
    #neither member is in a cluster so create new cluster with just these 2 members
# printf("create new cluster\n");
my $cluster = new Bio::EnsEMBL::Compara::NestedSet; $root->store_child($cluster); $cluster->store_child($alignedMember1); $cluster->store_child($alignedMember2); } elsif(defined($treeMember1) and !defined($treeMember2)) { # printf("add member to cluster %d\n", $treeMember1->parent->node_id);
# $alignedMember2->print_member;
$treeMember1->parent->store_child($alignedMember2); } elsif(!defined($treeMember1) and defined($treeMember2)) { # printf("add member to cluster %d\n", $treeMember2->parent->node_id);
# $alignedMember1->print_member;
$treeMember2->parent->store_child($alignedMember1); } elsif(defined($treeMember1) and defined($treeMember2)) { if($treeMember1->parent->equals($treeMember2->parent)) { # printf("both members already in same cluster %d\n", $treeMember1->parent->node_id);
} else { #this member already belongs to a different cluster -> need to merge clusters
# print("MERGE clusters\n");
$proteinTreeDBA->merge_nodes($treeMember1->parent, $treeMember2->parent); } } } ##################################################
#
# single cluster build : gene start, recursive search
# Mainly used for debugging and testing.
# Too slow to be used as a primary production algorithm.
#
##################################################
}
grow_dbclusters_with_peppairdescriptionprevnextTop
sub grow_dbclusters_with_peppair {
  my $self = shift;
  my $root = shift;
  my $pep_pair = shift;

  #printf("homology peptide pair : %d - %d\n", $pep_pair->[0], $pep_pair->[1]); 
my $mlss_id = $self->{'cluster_mlss'}->dbID; my $proteinTreeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor; my $starttime = time(); my $treeMember1 = $proteinTreeDBA->fetch_AlignedMember_by_member_id_mlssID($pep_pair->[0], $mlss_id); my $treeMember2 = $proteinTreeDBA->fetch_AlignedMember_by_member_id_mlssID($pep_pair->[1], $mlss_id); printf(" %1.3f secs to fetch AlignedMember\n", (time()-$starttime)); my $alignedMember1 = undef; if(!defined($treeMember1)) { $alignedMember1 = new Bio::EnsEMBL::Compara::AlignedMember; $alignedMember1->method_link_species_set_id($mlss_id); $alignedMember1->member_id($pep_pair->[0]); } my $alignedMember2 = undef; if(!defined($treeMember2)) { $alignedMember2 = new Bio::EnsEMBL::Compara::AlignedMember; $alignedMember2->method_link_species_set_id($mlss_id); $alignedMember2->member_id($pep_pair->[1]); } if(!defined($treeMember1) and !defined($treeMember2)) { #neither member is in a cluster so create new cluster with just these 2 members
# printf("create new cluster\n");
my $cluster = new Bio::EnsEMBL::Compara::NestedSet; $root->store_child($cluster); my $starttime = time(); $cluster->store_child($alignedMember1); printf(" %1.3f secs to store member in new cluster\n", (time()-$starttime)); $cluster->store_child($alignedMember2); } elsif(defined($treeMember1) and !defined($treeMember2)) { # printf("add member to cluster %d\n", $treeMember1->parent->node_id);
# $alignedMember2->print_member;
my $starttime = time(); $treeMember1->parent->store_child($alignedMember2); printf(" %1.3f secs to store member in exisiting cluster\n", (time()-$starttime)); } elsif(!defined($treeMember1) and defined($treeMember2)) { # printf("add member to cluster %d\n", $treeMember2->parent->node_id);
# $alignedMember1->print_member;
$treeMember2->parent->store_child($alignedMember1); } elsif(defined($treeMember1) and defined($treeMember2)) { if($treeMember1->parent->equals($treeMember2->parent)) { # printf("both members already in same cluster %d\n", $treeMember1->parent->node_id);
} else { #this member already belongs to a different cluster -> need to merge clusters
# print("MERGE clusters\n");
$proteinTreeDBA->merge_nodes($treeMember1->parent, $treeMember2->parent); } }
}
grow_memclusters_with_peppairdescriptionprevnextTop
sub grow_memclusters_with_peppair {
  my $self = shift;
  my $root = shift;
  my $pep_pair = shift;

  #printf("homology peptide pair : %d - %d\n", $pep_pair->[0], $pep_pair->[1]); 
my $mlss_id = $self->{'cluster_mlss'}->dbID; my ($treeMember1, $treeMember2); $treeMember1 = $self->{'member_leaves'}->{$pep_pair->[0]}; $treeMember2 = $self->{'member_leaves'}->{$pep_pair->[1]}; if(!defined($treeMember1)) { $treeMember1 = new Bio::EnsEMBL::Compara::AlignedMember; $treeMember1->method_link_species_set_id($mlss_id); $treeMember1->member_id($pep_pair->[0]); $treeMember1->node_id($pep_pair->[0]); $self->{'member_leaves'}->{$pep_pair->[0]} = $treeMember1; } if(!defined($treeMember2)) { $treeMember2 = new Bio::EnsEMBL::Compara::AlignedMember; $treeMember2->method_link_species_set_id($mlss_id); $treeMember2->member_id($pep_pair->[1]); $treeMember2->node_id($pep_pair->[1]); $self->{'member_leaves'}->{$pep_pair->[1]} = $treeMember2; } if(!defined($treeMember1->parent) and !defined($treeMember2->parent)) { #neither member is in a cluster so create new cluster with just these 2 members
# printf("create new cluster\n");
my $cluster = new Bio::EnsEMBL::Compara::NestedSet; $root->add_child($cluster); $cluster->add_child($treeMember1); $cluster->add_child($treeMember2); } elsif(defined($treeMember1->parent) and !defined($treeMember2->parent)) { # printf("add member to cluster %d\n", $treeMember1->parent->node_id);
# $treeMember2->print_member;
$treeMember1->parent->add_child($treeMember2); } elsif(!defined($treeMember1->parent) and defined($treeMember2->parent)) { # printf("add member to cluster %d\n", $treeMember2->parent->node_id);
# $treeMember1->print_member;
$treeMember2->parent->add_child($treeMember1); } elsif(defined($treeMember1->parent) and defined($treeMember2->parent)) { if($treeMember1->parent->equals($treeMember2->parent)) { # printf("both members already in same cluster %d\n", $treeMember1->parent->node_id);
} else { #this member already belongs to a different cluster -> need to merge clusters
# print("MERGE clusters\n");
my $parent2 = $treeMember2->parent; $treeMember1->parent->merge_children($treeMember2->parent); $parent2->disavow_parent; #releases from root
} }
}
rundescriptionprevnextTop
sub run {
  my $self = shift;  
  return 1;
}
store_clustersdescriptionprevnextTop
sub store_clusters {
  my $self = shift;
  my $root = shift;
  
  my $starttime = time();

  my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;

  my $leaves = $root->get_all_leaves;
  printf("loaded %d leaves\n", scalar(@$leaves));
  my $count=0;
  foreach my $mem (@$leaves) { $count++ if($mem->isa('Bio::EnsEMBL::Compara::AlignedMember'));}
  printf("loaded %d leaves which are members\n", $count);
  printf("loaded %d members in hash\n", scalar(keys(%{$self->{'member_leaves'}})));
  printf("%d clusters generated\n", $root->get_child_count);  
  
  my $clusters = $root->children;
  my $counter=0;
  foreach my $cluster (@{$clusters}) {
    $cluster->build_leftright_indexing;
    
    $treeDBA->store($cluster);
    if($counter++ % 200 == 0) { printf("%10d clusters stored\n", $counter); }
  }
  printf("  %1.3f secs to store clusters\n", (time()-$starttime));
}
write_outputdescriptionprevnextTop
sub write_output {
  my $self = shift;
  
  if($self->{'gene_stable_id'}) {
    $self->build_cluster_around_gene_stable_id($self->{'gene_stable_id'});
  } else {
    $self->build_homology_clusters();
  }
  
  return 1;
}

##########################################
#
# internal methods
#
##########################################
}
General documentation
CONTACTTop
  Contact Jessica Severin on module implemetation/design detail: jessica@ebi.ac.uk
Contact Abel Ureta-Vidal on EnsEMBL/Compara: abel@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _