Raw content of Bio::EnsEMBL::Compara::RunnableDB::HomologyCluster # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Compara::RunnableDB::HomologyCluster =cut =head1 SYNOPSIS my $aa = $sdba->get_AnalysisAdaptor; my $analysis = $aa->fetch_by_logic_name('HomologyCluster'); my $rdb = new Bio::EnsEMBL::Compara::RunnableDB::HomologyCluster( -input_id => "{'species_set'=>[1,2,3,14]}", -analysis => $analysis); $rdb->fetch_input $rdb->run; =cut =head1 DESCRIPTION This is a compara specific runnableDB, that based on an input_id of arrayrefs of genome_db_ids, and from this species set realtionship it will search through the homology data and build SingleLinkage Clusters and store them into a NestedSet datastructure. This is the first step in the Tree analysis production system. =cut =head1 CONTACT Contact Jessica Severin on module implemetation/design detail: jessica@ebi.ac.uk Contact Abel Ureta-Vidal on EnsEMBL/Compara: abel@ebi.ac.uk Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk =cut =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::EnsEMBL::Compara::RunnableDB::HomologyCluster; use strict; use Switch; use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor; use Bio::EnsEMBL::Hive; use Bio::EnsEMBL::Compara::NestedSet; use Bio::EnsEMBL::Compara::Homology; use Data::UUID; use Time::HiRes qw(time gettimeofday tv_interval); our @ISA = qw(Bio::EnsEMBL::Hive::Process); sub fetch_input { my( $self) = @_; $self->{'species_set'} = undef; $self->throw("No input_id") unless defined($self->input_id); #create a Compara::DBAdaptor which shares the same DBI handle #with the pipeline DBAdaptor that is based into this runnable $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc); $self->get_params($self->input_id); return 1; } sub get_params { my $self = shift; my $param_string = shift; return unless($param_string); print("parsing parameter string : ",$param_string,"\n"); my $params = eval($param_string); return unless($params); foreach my $key (keys %$params) { print(" $key : ", $params->{$key}, "\n"); } if (defined $params->{'species_set'}) { $self->{'species_set'} = $params->{'species_set'}; } if (defined $params->{'gene_stable_id'}) { $self->{'gene_stable_id'} = $params->{'gene_stable_id'}; } return; } sub run { my $self = shift; return 1; } sub write_output { my $self = shift; if($self->{'gene_stable_id'}) { $self->build_cluster_around_gene_stable_id($self->{'gene_stable_id'}); } else { $self->build_homology_clusters(); } return 1; } ########################################## # # internal methods # ########################################## sub build_homology_clusters { my $self = shift; my $build_mode = 'direct'; return unless($self->{'species_set'}); my @species_set = @{$self->{'species_set'}}; return unless @species_set; my $mlssDBA = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor; my $homologyDBA = $self->{'comparaDBA'}->get_HomologyAdaptor; my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor; my $starttime = time(); my $root = new Bio::EnsEMBL::Compara::NestedSet; $root->name("PROTEIN_TREES"); $treeDBA->store($root); printf("root_id %d\n", $root->node_id); # # create Cluster MLSS # $self->{'cluster_mlss'} = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet; $self->{'cluster_mlss'}->method_link_type('PROTEIN_TREES'); my @genomeDB_set; foreach my $gdb_id (@species_set) { my $gdb = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_dbID($gdb_id); push @genomeDB_set, $gdb; } $self->{'cluster_mlss'}->species_set(\@genomeDB_set); $mlssDBA->store($self->{'cluster_mlss'}); printf("MLSS %d\n", $self->{'cluster_mlss'}->dbID); $self->{'member_leaves'} = {}; my $ug = new Data::UUID; # # load all the Paralogue pairs, building clusters as we load # get all homologies for each MLSS # foreach my $gdb_id1 (@species_set) { printf("\nfind MLSS for genome %d\n", $gdb_id1); my $mlss = $mlssDBA->fetch_by_method_link_type_genome_db_ids ("ENSEMBL_PARALOGUES",[$gdb_id1]); next unless(defined($mlss)); printf("fetch all paralogues for mlss_id=%d\n", $mlss->dbID); $starttime = time(); my $homology_list = $self->fetch_homology_peptide_pairs_by_mlss($mlss); printf(" %1.3f secs to fetch paralogues\n", (time()-$starttime)); printf(" %d paralogue pairs\n", scalar(@{$homology_list})); $starttime = time(); while(@{$homology_list}) { my $pep_pair = shift @{$homology_list}; $self->grow_memclusters_with_peppair($root, $pep_pair); } printf(" %1.3f secs to load/process homologies\n", (time()-$starttime)); } # #get all MLSS for each homology pair in this species set #get all homologies for each MLSS # while (my $gdb_id1 = shift @species_set) { foreach my $gdb_id2 (@species_set) { printf("\nfind MLSS for genome pair %d/%d\n", $gdb_id1, $gdb_id2); my $mlss = $mlssDBA->fetch_by_method_link_type_genome_db_ids ("ENSEMBL_ORTHOLOGUES",[$gdb_id1, $gdb_id2]); next unless(defined($mlss)); printf("fetch all homologies for mlss_id=%d\n", $mlss->dbID); $starttime = time(); my $homology_list = undef; switch ($build_mode) { case 'API' { $homology_list = $homologyDBA->fetch_all_by_MethodLinkSpeciesSet($mlss); } case 'direct' { $homology_list = $self->fetch_homology_peptide_pairs_by_mlss($mlss); } } printf(" %1.3f secs to fetch homologies\n", (time()-$starttime)); printf(" %d homologies\n", scalar(@{$homology_list})); $starttime = time(); my $counter=0; while(@{$homology_list}) { $counter++; #if($counter % 200 == 0) { printf("%10d homologies done\n", $counter); } switch($build_mode) { case 'API' { my $homology = shift @{$homology_list}; $self->grow_clusters_with_homology($root, $homology); } case 'direct' { my $pep_pair = shift @{$homology_list}; $self->grow_memclusters_with_peppair($root, $pep_pair); } } } printf(" %1.3f secs to load/process homologies\n", (time()-$starttime)); } } if($build_mode eq 'direct') { $self->store_clusters($root); } $self->dataflow_clusters($root); } sub grow_clusters_with_homology { my $self = shift; my $root = shift; my $homology = shift; my ($alignedMember1, $alignedMember2) = $homology->get_AlignedMember_pair; $alignedMember1->method_link_species_set_id($self->{'cluster_mlss'}->dbID); $alignedMember2->method_link_species_set_id($self->{'cluster_mlss'}->dbID); my $proteinTreeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor; my $treeMember1 = $proteinTreeDBA->fetch_AlignedMember_by_member_id_mlssID( $alignedMember1->member_id, $self->{'cluster_mlss'}->dbID); my $treeMember2 = $proteinTreeDBA->fetch_AlignedMember_by_member_id_mlssID( $alignedMember2->member_id, $self->{'cluster_mlss'}->dbID); if(!defined($treeMember1) and !defined($treeMember2)) { #neither member is in a cluster so create new cluster with just these 2 members # printf("create new cluster\n"); my $cluster = new Bio::EnsEMBL::Compara::NestedSet; $root->store_child($cluster); $cluster->store_child($alignedMember1); $cluster->store_child($alignedMember2); } elsif(defined($treeMember1) and !defined($treeMember2)) { # printf("add member to cluster %d\n", $treeMember1->parent->node_id); # $alignedMember2->print_member; $treeMember1->parent->store_child($alignedMember2); } elsif(!defined($treeMember1) and defined($treeMember2)) { # printf("add member to cluster %d\n", $treeMember2->parent->node_id); # $alignedMember1->print_member; $treeMember2->parent->store_child($alignedMember1); } elsif(defined($treeMember1) and defined($treeMember2)) { if($treeMember1->parent->equals($treeMember2->parent)) { # printf("both members already in same cluster %d\n", $treeMember1->parent->node_id); } else { #this member already belongs to a different cluster -> need to merge clusters # print("MERGE clusters\n"); $proteinTreeDBA->merge_nodes($treeMember1->parent, $treeMember2->parent); } } } ################################################## # # single cluster build : gene start, recursive search # Mainly used for debugging and testing. # Too slow to be used as a primary production algorithm. # ################################################## sub build_cluster_around_gene_stable_id { my $self = shift; my $gene_stable_id = shift; my $root = new Bio::EnsEMBL::Compara::NestedSet; $root->name("PROTEIN_TREES"); $self->{'comparaDBA'}->get_ProteinTreeAdaptor->store($root); my $MA = $self->{'comparaDBA'}->get_MemberAdaptor; my $gene_member = $MA->fetch_by_source_stable_id("ENSEMBLGENE", $gene_stable_id); throw("couldn't find gene member $gene_stable_id") unless($gene_member); my $start = time(); my $ortho_set = {}; my $member_set = {}; $self->get_orthologue_cluster($gene_member, $ortho_set, $member_set, 0); printf("cluster has %d links\n", scalar(keys(%{$ortho_set}))); printf("cluster has %d genes\n", scalar(keys(%{$member_set}))); printf("%1.3f msec\n", 1000.0*(time() - $start)); foreach my $homology (values(%{$ortho_set})) { $self->grow_clusters_with_homology($root, $homology); } printf("cluster has %d genes\n", scalar(keys(%{$member_set}))); foreach my $member (values(%{$member_set})) { $member->print_member; } $root->print_tree; } sub get_orthologue_cluster { my $self = shift; my $gene = shift; my $ortho_set = shift; my $member_set = shift; return if($member_set->{$gene->dbID}); $gene->print_member("query gene\n") if($self->debug); $member_set->{$gene->dbID} = $gene; my $homologies = $self->{'comparaDBA'}->get_HomologyAdaptor->fetch_by_Member($gene); printf("fetched %d homologies\n", scalar(@$homologies)) if($self->debug); foreach my $homology (@{$homologies}) { next if($ortho_set->{$homology->dbID}); next if($homology->method_link_type ne 'ENSEMBL_ORTHOLOGUES'); foreach my $member_attribute (@{$homology->get_all_Member_Attribute}) { my ($member, $attribute) = @{$member_attribute}; next if($member->dbID == $gene->dbID); #skip query gene $member->print_member if($self->debug); printf("adding homology_id %d to cluster\n", $homology->dbID) if($self->debug); $ortho_set->{$homology->dbID} = $homology; $self->get_orthologue_cluster($member, $ortho_set, $member_set, $self->debug); } } printf("done with search query %s\n", $gene->stable_id) if($self->debug); } ######################################################################### # # new fast algorithm idea: # AVOIDS most of the API :( # 1) preload all members from all genomes into MLSS linked group # a) create 1 'un parented' node and member for each peptide # 2) load 'homologies' as simple peptide_member_id pair SQL query # 3) loop through peptide_pair, use root_id,parent_id to grow clusters # # ######################################################################### sub fetch_homology_peptide_pairs_by_mlss { my $self = shift; my $mlss = shift; my $starttime = time(); my $sql = "SELECT hm.homology_id, peptide_member_id ". "FROM homology_member hm join homology using (homology_id ) ". "WHERE method_link_species_set_id = ? order by hm.homology_id"; #print("$sql\n"); my $sth = $self->dbc->prepare($sql); $sth->execute($mlss->dbID); #print(" done with fetch\n"); my $homology_hash = {}; while( my $ref = $sth->fetchrow_arrayref() ) { my ($homology_id, $peptide_member_id) = @$ref; push @{$homology_hash->{$homology_id}}, $peptide_member_id; } $sth->finish; #printf("loaded %d homologies pep_pairs in %1.3f secs\n", scalar(keys(%$homology_hash)), (time()-$starttime)); my @pairs = values(%$homology_hash); return \@pairs; } sub grow_dbclusters_with_peppair { my $self = shift; my $root = shift; my $pep_pair = shift; #printf("homology peptide pair : %d - %d\n", $pep_pair->[0], $pep_pair->[1]); my $mlss_id = $self->{'cluster_mlss'}->dbID; my $proteinTreeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor; my $starttime = time(); my $treeMember1 = $proteinTreeDBA->fetch_AlignedMember_by_member_id_mlssID($pep_pair->[0], $mlss_id); my $treeMember2 = $proteinTreeDBA->fetch_AlignedMember_by_member_id_mlssID($pep_pair->[1], $mlss_id); printf(" %1.3f secs to fetch AlignedMember\n", (time()-$starttime)); my $alignedMember1 = undef; if(!defined($treeMember1)) { $alignedMember1 = new Bio::EnsEMBL::Compara::AlignedMember; $alignedMember1->method_link_species_set_id($mlss_id); $alignedMember1->member_id($pep_pair->[0]); } my $alignedMember2 = undef; if(!defined($treeMember2)) { $alignedMember2 = new Bio::EnsEMBL::Compara::AlignedMember; $alignedMember2->method_link_species_set_id($mlss_id); $alignedMember2->member_id($pep_pair->[1]); } if(!defined($treeMember1) and !defined($treeMember2)) { #neither member is in a cluster so create new cluster with just these 2 members # printf("create new cluster\n"); my $cluster = new Bio::EnsEMBL::Compara::NestedSet; $root->store_child($cluster); my $starttime = time(); $cluster->store_child($alignedMember1); printf(" %1.3f secs to store member in new cluster\n", (time()-$starttime)); $cluster->store_child($alignedMember2); } elsif(defined($treeMember1) and !defined($treeMember2)) { # printf("add member to cluster %d\n", $treeMember1->parent->node_id); # $alignedMember2->print_member; my $starttime = time(); $treeMember1->parent->store_child($alignedMember2); printf(" %1.3f secs to store member in exisiting cluster\n", (time()-$starttime)); } elsif(!defined($treeMember1) and defined($treeMember2)) { # printf("add member to cluster %d\n", $treeMember2->parent->node_id); # $alignedMember1->print_member; $treeMember2->parent->store_child($alignedMember1); } elsif(defined($treeMember1) and defined($treeMember2)) { if($treeMember1->parent->equals($treeMember2->parent)) { # printf("both members already in same cluster %d\n", $treeMember1->parent->node_id); } else { #this member already belongs to a different cluster -> need to merge clusters # print("MERGE clusters\n"); $proteinTreeDBA->merge_nodes($treeMember1->parent, $treeMember2->parent); } } } =head2 grow_memclusters_with_peppair Description: Takes a pair of peptide_member_id and uses the NestedSet objects to build a 3 layer tree in memory. There is a single root for the entire build process, and each cluster is a child of this root. The members are children of the clusters. During the build process the member leaves are assigned a node_id equal to the peptide_member_id so they can be found via 'find_node_by_node_id'. After the process is completed, each cluster can then be stored in a faster bulk insert process. =cut sub grow_memclusters_with_peppair { my $self = shift; my $root = shift; my $pep_pair = shift; #printf("homology peptide pair : %d - %d\n", $pep_pair->[0], $pep_pair->[1]); my $mlss_id = $self->{'cluster_mlss'}->dbID; my ($treeMember1, $treeMember2); $treeMember1 = $self->{'member_leaves'}->{$pep_pair->[0]}; $treeMember2 = $self->{'member_leaves'}->{$pep_pair->[1]}; if(!defined($treeMember1)) { $treeMember1 = new Bio::EnsEMBL::Compara::AlignedMember; $treeMember1->method_link_species_set_id($mlss_id); $treeMember1->member_id($pep_pair->[0]); $treeMember1->node_id($pep_pair->[0]); $self->{'member_leaves'}->{$pep_pair->[0]} = $treeMember1; } if(!defined($treeMember2)) { $treeMember2 = new Bio::EnsEMBL::Compara::AlignedMember; $treeMember2->method_link_species_set_id($mlss_id); $treeMember2->member_id($pep_pair->[1]); $treeMember2->node_id($pep_pair->[1]); $self->{'member_leaves'}->{$pep_pair->[1]} = $treeMember2; } if(!defined($treeMember1->parent) and !defined($treeMember2->parent)) { #neither member is in a cluster so create new cluster with just these 2 members # printf("create new cluster\n"); my $cluster = new Bio::EnsEMBL::Compara::NestedSet; $root->add_child($cluster); $cluster->add_child($treeMember1); $cluster->add_child($treeMember2); } elsif(defined($treeMember1->parent) and !defined($treeMember2->parent)) { # printf("add member to cluster %d\n", $treeMember1->parent->node_id); # $treeMember2->print_member; $treeMember1->parent->add_child($treeMember2); } elsif(!defined($treeMember1->parent) and defined($treeMember2->parent)) { # printf("add member to cluster %d\n", $treeMember2->parent->node_id); # $treeMember1->print_member; $treeMember2->parent->add_child($treeMember1); } elsif(defined($treeMember1->parent) and defined($treeMember2->parent)) { if($treeMember1->parent->equals($treeMember2->parent)) { # printf("both members already in same cluster %d\n", $treeMember1->parent->node_id); } else { #this member already belongs to a different cluster -> need to merge clusters # print("MERGE clusters\n"); my $parent2 = $treeMember2->parent; $treeMember1->parent->merge_children($treeMember2->parent); $parent2->disavow_parent; #releases from root } } } sub store_clusters { my $self = shift; my $root = shift; my $starttime = time(); my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor; my $leaves = $root->get_all_leaves; printf("loaded %d leaves\n", scalar(@$leaves)); my $count=0; foreach my $mem (@$leaves) { $count++ if($mem->isa('Bio::EnsEMBL::Compara::AlignedMember'));} printf("loaded %d leaves which are members\n", $count); printf("loaded %d members in hash\n", scalar(keys(%{$self->{'member_leaves'}}))); printf("%d clusters generated\n", $root->get_child_count); my $clusters = $root->children; my $counter=0; foreach my $cluster (@{$clusters}) { $cluster->build_leftright_indexing; $treeDBA->store($cluster); if($counter++ % 200 == 0) { printf("%10d clusters stored\n", $counter); } } printf(" %1.3f secs to store clusters\n", (time()-$starttime)); } sub dataflow_clusters { my $self = shift; my $root = shift; my $clusters = $root->children; foreach my $cluster (@{$clusters}) { my $output_id = sprintf("{'protein_tree_id'=>%d}", $cluster->node_id); $self->dataflow_output_id($output_id, 2); } } 1;