Raw content of Bio::EnsEMBL::Compara::RunnableDB::OrthoTree
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::RunnableDB::OrthoTree
=cut
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $otree = Bio::EnsEMBL::Compara::RunnableDB::OrthoTree->new (
-db => $db,
-input_id => $input_id,
-analysis => $analysis );
$otree->fetch_input(); #reads from DB
$otree->run();
$otree->output();
$otree->write_output(); #writes to DB
=cut
=head1 DESCRIPTION
This Analysis/RunnableDB is designed to take ProteinTree as input
This must already have a rooted tree with duplication/sepeciation tags
on the nodes.
It analyzes that tree structure to pick Orthologues and Paralogs for
each genepair.
input_id/parameters format eg: "{'protein_tree_id'=>1234}"
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
=cut
=head1 CONTACT
Contact Albert Vilella on module implementation: avilella@ebi.ac.uk
Contact Jessica Severin on module design detail: jessica@ebi.ac.uk
Contact Abel Ureta-Vidal on EnsEMBL/Compara: abel@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::RunnableDB::OrthoTree;
use strict;
use Getopt::Long;
use IO::File;
use File::Basename;
use Time::HiRes qw(time gettimeofday tv_interval);
use Scalar::Util qw(looks_like_number);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::Member;
use Bio::EnsEMBL::Compara::Graph::Link;
use Bio::EnsEMBL::Compara::Graph::Node;
use Bio::EnsEMBL::Compara::Graph::NewickParser;
use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
use Bio::EnsEMBL::Compara::Homology;
use Bio::SimpleAlign;
use Bio::AlignIO;
use Bio::EnsEMBL::Hive;
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
=head2 fetch_input
Title : fetch_input
Usage : $self->fetch_input
Function: Fetches input data for repeatmasker from the database
Returns : none
Args : none
=cut
sub fetch_input {
my( $self) = @_;
$self->{'tree_scale'} = 1;
$self->{'store_homologies'} = 1;
$self->{'max_gene_count'} = 200;
$self->{all_between} = 0;
$self->{no_between} = 0.25;
$self->throw("No input_id") unless defined($self->input_id);
#create a Compara::DBAdaptor which shares the same DBI handle
#with the Pipeline::DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new
(
-DBCONN=>$self->db->dbc
);
$self->get_params($self->parameters);
$self->get_params($self->input_id);
if($self->{analysis_data_id}) {
my $analysis_data_id = $self->{analysis_data_id};
my $analysis_data_params = $self->db->get_AnalysisDataAdaptor->fetch_by_dbID($analysis_data_id);
$self->get_params($analysis_data_params);
}
$self->print_params if($self->debug);
my $starttime = time();
$self->{'treeDBA'} = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
$self->{homologyDBA} = $self->{'comparaDBA'}->get_HomologyAdaptor;
$self->{'protein_tree'} = $self->{'treeDBA'}->
fetch_tree_at_node_id($self->{'protein_tree_id'});
$self->check_job_fail_options;
if($self->debug) {
$self->{'protein_tree'}->print_tree($self->{'tree_scale'});
printf("time to fetch tree : %1.3f secs\n" , time()-$starttime);
}
unless($self->{'protein_tree'}) {
throw("undefined ProteinTree as input\n");
}
$self->delete_old_homologies;
$self->delete_old_orthotree_tags;
$self->load_species_tree;
return 1;
}
=head2 run
Title : run
Usage : $self->run
Function: runs OrthoTree
Returns : none
Args : none
=cut
sub run
{
my $self = shift;
$self->run_analysis;
}
=head2 write_output
Title : write_output
Usage : $self->write_output
Function: parse clustalw output and update homology and
homology_member tables
Returns : none
Args : none
=cut
sub write_output {
my $self = shift;
$self->store_homologies;
}
sub check_job_fail_options
{
my $self = shift;
if ( $self->{'protein_tree'}->get_tagvalue('gene_count') > $self->{'max_gene_count'} ) {
$self->dataflow_output_id($self->input_id, 2);
$self->input_job->update_status('FAILED');
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
throw("OrthoTree : cluster size over threshold and FAIL it");
}
if($self->input_job->retry_count >= 2) {
# Send to QuickTreeBreak
$self->dataflow_output_id($self->input_id, 2);
$self->input_job->update_status('FAILED');
$self->DESTROY;
throw("OrthoTree job failed >=3 times: try something else and FAIL it");
}
if ($self->input_job->retry_count >= 1) {
if ($self->{'protein_tree'}->get_tagvalue('gene_count') > 400 && !defined($self->worker->{HIGHMEM})) {
$self->input_job->adaptor->reset_highmem_job_by_dbID($self->input_job->dbID);
$self->DESTROY;
throw("OrthoTree job too big: try something else and FAIL it");
}
}
}
sub DESTROY {
my $self = shift;
if($self->{'protein_tree'}) {
printf("OrthoTree::DESTROY releasing protein_tree\n") if($self->debug);
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
}
if($self->{'taxon_tree'}) {
printf("OrthoTree::DESTROY releasing taxon_tree\n") if($self->debug);
$self->{'taxon_tree'}->release_tree;
$self->{'taxon_tree'} = undef;
}
$self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
}
##########################################
#
# internal methods
#
##########################################
sub get_params {
my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n") if($self->debug);
my $params = eval($param_string);
return unless($params);
if($self->debug) {
foreach my $key (keys %$params) {
print(" $key : ", $params->{$key}, "\n");
}
}
foreach my $key (qw[cdna species_tree_file protein_tree_id use_genomedb_id max_gene_count analysis_data_id]) {
my $value = $params->{$key};
$self->{$key} = $value if defined $value;
}
return;
}
sub print_params {
my $self = shift;
print("params:\n");
printf(" tree_id : %d\n", $self->{'protein_tree_id'});
}
sub run_analysis {
my $self = shift;
my $starttime = time()*1000;
my $tmp_time = time();
my $tree = $self->{'protein_tree'};
print "Getting all leaves\n";
my @all_protein_leaves = @{$tree->get_all_leaves};
my $tree_node_id = $tree->node_id;
#precalculate the ancestor species_hash (caches into the metadata of
#nodes) also augments the Duplication tagging
printf("Calculating ancestor species hash\n") if ($self->debug);
$self->get_ancestor_species_hash($tree);
if($self->debug) {
$self->{'protein_tree'}->print_tree($self->{'tree_scale'});
printf("%d proteins in tree\n", scalar(@all_protein_leaves));
}
#compare every gene in the tree with every other each gene/gene
#pairing is a potential ortholog/paralog and thus we need to analyze
#every possibility
#Accomplish by creating a fully connected graph between all the
#genes under the tree (hybrid graph structure) and then analyze each
#gene/gene link
$tmp_time = time();
printf("build fully linked graph\n") if($self->debug);
my @genepairlinks;
my $graphcount = 0;
while (my $protein1 = shift @all_protein_leaves) {
foreach my $protein2 (@all_protein_leaves) {
my $ancestor = $protein1->find_first_shared_ancestor($protein2);
# Line below will only become faster than above if we find a way to calculate long parent->parent journeys.
# This is probably doable by looking at the right1/left1 right2/left2 distances between the 2 proteins
# my $ancestor_indexed = $self->{'treeDBA'}->fetch_first_shared_ancestor_indexed($protein1,$protein2);
my $taxon_level = $self->get_ancestor_taxon_level($ancestor);
my $distance = $protein1->distance_to_ancestor($ancestor) +
$protein2->distance_to_ancestor($ancestor);
my $genepairlink = new Bio::EnsEMBL::Compara::Graph::Link
(
$protein1, $protein2, $distance
);
$genepairlink->add_tag("hops", 0);
$genepairlink->add_tag("ancestor", $ancestor);
$genepairlink->add_tag("taxon_name", $taxon_level->name);
$genepairlink->add_tag("tree_node_id", $tree_node_id);
push @genepairlinks, $genepairlink;
}
print STDERR "build graph $graphcount\n" if ($graphcount++ % 10 == 0);
}
printf("%1.3f secs build links and features\n", time()-$tmp_time)
if($self->debug>1);
$self->{'protein_tree'}->print_tree($self->{'tree_scale'})
if($self->debug);
#sort the gene/gene links by distance
# makes debug display easier to read, not required by algorithm
$tmp_time = time();
printf("sort links\n") if($self->debug);
my @sorted_genepairlinks =
sort {$a->distance_between <=> $b->distance_between} @genepairlinks;
printf("%1.3f secs to sort links\n", time()-$tmp_time) if($self->debug > 1);
#analyze every gene pair (genepairlink) to get its classification
printf("analyze links\n") if($self->debug);
printf("%d links\n", scalar(@genepairlinks)) if ($self->debug);
$tmp_time = time();
$self->{'old_homology_count'} = 0;
$self->{'orthotree_homology_counts'} = {};
foreach my $genepairlink (@sorted_genepairlinks) {
$self->analyze_genepairlink($genepairlink);
}
printf("%1.3f secs to analyze genepair links\n", time()-$tmp_time)
if($self->debug > 1);
#display summary stats of analysis
my $runtime = time()*1000-$starttime;
$self->{'protein_tree'}->store_tag('OrthoTree_runtime_msec', $runtime)
unless ($self->{'_readonly'});
if($self->debug) {
printf("%d proteins in tree\n", scalar(@{$tree->get_all_leaves}));
printf("%d pairings\n", scalar(@genepairlinks));
printf("%d old homologies\n", $self->{'old_homology_count'});
printf("orthotree homologies\n");
foreach my $type (keys(%{$self->{'orthotree_homology_counts'}})) {
printf
(
" %13s : %d\n", $type, $self->{'orthotree_homology_counts'}->{$type}
);
}
}
$self->{'homology_links'} = \@sorted_genepairlinks;
$DB::single=1;1;
return undef;
}
sub analyze_genepairlink
{
my $self = shift;
my $genepairlink = shift;
my ($protein1, $protein2) = $genepairlink->get_nodes;
#run feature detectors: precalcs and caches into metadata
$self->genepairlink_check_dups($genepairlink);
$self->genepairlink_fetch_homology($genepairlink) if($self->debug);
#do classification analysis : as filter stack
if($self->inspecies_paralog_test($genepairlink)) { }
elsif($self->direct_ortholog_test($genepairlink)) { }
elsif($self->ancient_residual_test($genepairlink)) { }
elsif($self->one2many_ortholog_test($genepairlink)) { }
elsif($self->outspecies_test($genepairlink)) { }
else {
printf
(
"OOPS!!!! %s - %s\n",
$protein1->gene_member->stable_id,
$protein2->gene_member->stable_id
);
}
$self->{'old_homology_count'}++
if($genepairlink->get_tagvalue('old_homology'));
my $type = $genepairlink->get_tagvalue('orthotree_type');
if($type) {
if(!defined($self->{'orthotree_homology_counts'}->{$type})) {
$self->{'orthotree_homology_counts'}->{$type} = 1;
} else {
$self->{'orthotree_homology_counts'}->{$type}++;
}
}
#display results
$self->display_link_analysis($genepairlink) if($self->debug >1);
return undef;
}
sub display_link_analysis
{
my $self = shift;
my $genepairlink = shift;
#display raw feature analysis
my ($protein1, $protein2) = $genepairlink->get_nodes;
my $ancestor = $genepairlink->get_tagvalue('ancestor');
printf("%21s(%7d) - %21s(%7d) : %10.3f dist : %3d hops : ",
$protein1->gene_member->stable_id, $protein1->gene_member->member_id,
$protein2->gene_member->stable_id, $protein2->gene_member->member_id,
$genepairlink->distance_between, $genepairlink->get_tagvalue('hops'));
if($genepairlink->get_tagvalue('has_dups')) { printf("%5s ", 'DUP');
} else { printf("%5s ", ""); }
my $homology = $genepairlink->get_tagvalue('old_homology');
if($homology) { printf("%5s ", $homology->description);
} else { printf("%5s ", ""); }
print("ancestor:(");
my $dup_value = $ancestor->get_tagvalue("Duplication");
# my $sis_value = $ancestor->get_tagvalue("species_intersection_score");
my $sis_value = $ancestor->get_tagvalue("duplication_confidence_score");
if($dup_value eq '1' || $dup_value eq '2'){
if ($sis_value eq '0') {
print("DD ");
} else {
print("DUP ");
}
}
else{print" ";}
printf("%9s)", $ancestor->node_id);
my $taxon_level = $ancestor->get_tagvalue('taxon_level');
printf(" %s %s %s\n",
$genepairlink->get_tagvalue('orthotree_type'),
$genepairlink->get_tagvalue('orthotree_subtype'),
$taxon_level->name
);
return undef;
}
sub load_species_tree {
my $self = shift @_;
my $starttime = time();
my $tree = (exists $self->{'species_tree_file'})
? $self->load_species_tree_from_file
: $self->load_species_tree_from_tax
;
$self->{'taxon_tree'} = $tree;
if($self->debug) {
$tree->print_tree(1);
printf("%1.3f secs for load species tree\n", time()-$starttime);
}
}
sub load_species_tree_from_tax
{
my $self = shift;
printf("load_species_tree_from_tax\n") if($self->debug);
my $starttime = time();
my $taxonDBA = $self->{'comparaDBA'}->get_NCBITaxonAdaptor;
my $gdb_list = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_all;
my $root=undef;
foreach my $gdb (@$gdb_list) {
next if ($gdb->name =~ /Ancestral/);
my $taxon = $taxonDBA->fetch_node_by_taxon_id($gdb->taxon_id);
$taxon->no_autoload_children;
# $taxon->add_tag("taxon_id") = $taxon->taxon_id; # homogenize with load_species_tree_from_file
$taxon->{_tags}{taxon_id} = $taxon->taxon_id; # line above seems to fail somehow
$root = $taxon->root unless($root);
$root->merge_node_via_shared_ancestor($taxon);
}
$root = $root->minimize_tree;
return $root;
}
sub load_species_tree_from_file {
my $self = shift @_;
my $taxonDBA = $self->{'comparaDBA'}->get_NCBITaxonAdaptor;
print "load_species_tree_from_file\n" if $self->debug;
my $newick = $self->_slurp($self->{species_tree_file});
my $tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick);
foreach my $node (@{$tree->all_nodes_in_graph}) {
my ($id) = split('-',$node->name);
$id =~ s/\*//; # internal nodes have asterisk
if (looks_like_number($id)) {
$node->node_id($id);
my $ncbi_node = $taxonDBA->fetch_node_by_taxon_id($id);
$node->name($ncbi_node->name) if (defined $ncbi_node);
} else { # doesnt look like number
$node->name($id);
}
$node->add_tag('taxon_id', $id);
}
return $tree;
}
sub _slurp {
my ($self, $file_name) = @_;
my $slurped;
{
local $/ = undef;
open(my $fh, '<', $file_name);
$slurped = <$fh>;
close($fh);
}
return $slurped;
}
sub get_ancestor_species_hash
{
my $self = shift;
my $node = shift;
my $species_hash = $node->get_tagvalue('species_hash');
return $species_hash if($species_hash);
$species_hash = {};
my $duplication_hash = {};
my $is_dup=0;
if($node->isa('Bio::EnsEMBL::Compara::AlignedMember')) {
my $node_genome_db_id = $node->genome_db_id;
$species_hash->{$node_genome_db_id} = 1;
$node->add_tag('species_hash', $species_hash);
return $species_hash;
}
foreach my $child (@{$node->children}) {
my $t_species_hash = $self->get_ancestor_species_hash($child);
next unless(defined($t_species_hash)); #shouldn't happen
foreach my $genome_db_id (keys(%$t_species_hash)) {
unless(defined($species_hash->{$genome_db_id})) {
$species_hash->{$genome_db_id} = $t_species_hash->{$genome_db_id};
} else {
#this species already existed in one of the other children
#this means this species was duplicated at this point between
#the species
$is_dup=1;
$duplication_hash->{$genome_db_id} = 1;
$species_hash->{$genome_db_id} += $t_species_hash->{$genome_db_id};
}
}
}
#printf("\ncalc_ancestor_species_hash : %s\n", $self->encode_hash($species_hash));
#$node->print_tree(20);
$node->add_tag("species_hash", $species_hash);
if($is_dup && !($self->{'_treefam'})) {
my $original_duplication_value = $node->get_tagvalue("Duplication");
$original_duplication_value = 0
unless (defined $original_duplication_value && $original_duplication_value ne '');
if ($original_duplication_value == 0) {
# RAP did not predict a duplication here
$node->add_tag("duplication_hash", $duplication_hash);
$node->store_tag("Duplication", 1) unless ($self->{'_readonly'});
$node->store_tag("Duplication_alg", 'species_count')
unless ($self->{'_readonly'});
} elsif ($original_duplication_value == 1) {
my $dup_alg = $node->get_tagvalue("Duplication_alg");
if (defined $dup_alg and $dup_alg ne 'species_count') {
# RAP did predict a duplication here but not species_count
$node->add_tag("duplication_hash", $duplication_hash);
$node->store_tag("Duplication", 2) unless ($self->{'_readonly'});
$node->store_tag("Duplication_alg", 'species_count')
unless ($self->{'_readonly'});
}
}
}
return $species_hash;
}
sub get_ancestor_taxon_level
{
my $self = shift;
my $ancestor = shift;
my $taxon_level = $ancestor->get_tagvalue('taxon_level');
return $taxon_level if($taxon_level);
#printf("\ncalculate ancestor taxon level\n");
my $taxon_tree = $self->{'taxon_tree'};
my $species_hash = $self->get_ancestor_species_hash($ancestor);
foreach my $gdbID (keys(%$species_hash)) {
my $gdb =
$self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_dbID($gdbID);
my $taxon = $taxon_tree->find_node_by_node_id($gdb->taxon_id);
unless($taxon) {
throw("oops missing taxon " . $gdb->taxon_id ."\n");
}
if($taxon_level) {
$taxon_level = $taxon_level->find_first_shared_ancestor($taxon);
} else {
$taxon_level = $taxon;
}
}
my $taxon_id = $taxon_level->get_tagvalue("taxon_id");
$ancestor->add_tag("taxon_level", $taxon_level);
$ancestor->store_tag("taxon_id", $taxon_id)
unless ($self->{'_readonly'});
$ancestor->store_tag("taxon_name", $taxon_level->name)
unless ($self->{'_readonly'});
#$ancestor->print_tree($self->{'tree_scale'});
#$taxon_level->print_tree(10);
return $taxon_level;
}
sub duplication_confidence_score {
my $self = shift;
my $ancestor = shift;
# This assumes bifurcation!!! No multifurcations allowed
my ($child_a, $child_b, $dummy) = @{$ancestor->children};
throw("tree is multifurcated in duplication_confidence_score\n") if (defined($dummy));
my @child_a_gdbs = keys %{$self->get_ancestor_species_hash($child_a)};
my @child_b_gdbs = keys %{$self->get_ancestor_species_hash($child_b)};
my %seen = (); my @gdb_a = grep { ! $seen{$_} ++ } @child_a_gdbs;
%seen = (); my @gdb_b = grep { ! $seen{$_} ++ } @child_b_gdbs;
my @isect = my @diff = my @union = (); my %count;
foreach my $e (@gdb_a, @gdb_b) { $count{$e}++ }
foreach my $e (keys %count) {
push(@union, $e); push @{ $count{$e} == 2 ? \@isect : \@diff }, $e;
}
my $duplication_confidence_score = 0;
my $scalar_isect = scalar(@isect);
my $scalar_union = scalar(@union);
$duplication_confidence_score =
(($scalar_isect)/$scalar_union) unless (0 == $scalar_isect);
$ancestor->store_tag
(
"duplication_confidence_score",
$duplication_confidence_score
) unless ($self->{'_readonly'});
my $rounded_duplication_confidence_score = (int((100.0 * $scalar_isect / $scalar_union + 0.5)));
my $species_intersection_score = $ancestor->get_tagvalue("species_intersection_score");
unless (defined($species_intersection_score) && $species_intersection_score ne '') {
my $ancestor_node_id = $ancestor->node_id;
warn("Difference in the ProteinTree: duplication_confidence_score [$duplication_confidence_score] whereas species_intersection_score [$species_intersection_score] is undefined in njtree - ancestor $ancestor_node_id\n");
return;
}
if ($species_intersection_score ne $rounded_duplication_confidence_score && !defined($self->{_readonly})) {
my $ancestor_node_id = $ancestor->node_id;
$self->throw("Inconsistency in the ProteinTree: duplication_confidence_score [$duplication_confidence_score] != species_intersection_score [$species_intersection_score] - $ancestor_node_id\n");
}
}
sub genepairlink_fetch_homology
{
my $self = shift;
my $genepairlink = shift;
my ($member1, $member2) = $genepairlink->get_nodes;
my $sql = "select homology.homology_id from homology ".
"join homology_member hm1 using(homology_id) ".
"join homology_member hm2 using (homology_id) " .
"where hm1.peptide_member_id=? and hm2.peptide_member_id=?";
my $sth = $self->dbc->prepare($sql);
$sth->execute($member1->member_id, $member2->member_id);
my $ref = $sth->fetchrow_arrayref();
return undef unless($ref);
$sth->finish;
my ($homology_id) = @$ref;
return undef unless($homology_id);
my $homology =
$self->{'comparaDBA'}->get_HomologyAdaptor->fetch_by_dbID($homology_id);
$genepairlink->add_tag("old_homology", $homology);
return $homology;
}
sub delete_old_orthotree_tags
{
my $self = shift;
return undef unless ($self->input_job->retry_count > 0);
print "deleting old orthotree tags\n" if ($self->debug);
my @node_ids;
# # Old method -- too slow
# my $left_index = $self->{'protein_tree'}->left_index;
# my $right_index = $self->{'protein_tree'}->right_index;
# my $tree_root_node_id = $self->{'protein_tree'}->node_id;
# # Include the root_id as well as the rest of the nodes within the tree
# push @node_ids, $tree_root_node_id;
# my $sql = "select ptn.node_id from protein_tree_node ptn where ptn.left_index>$left_index and ptn.right_index<$right_index";
# my $sth = $self->dbc->prepare($sql);
# $sth->execute;
# while (my $aref = $sth->fetchrow_arrayref) {
# my ($node_id) = @$aref;
# push @node_ids, $node_id;
# }
foreach my $node ($self->{'protein_tree'}->get_all_subnodes) {
push @node_ids, $node->node_id;
}
my @list_ids;
foreach my $id (@node_ids) {
push @list_ids, $id;
if (scalar @list_ids == 2000) {
my $sql = "delete from protein_tree_tag where node_id in (".join(",",@list_ids).") and tag in ('duplication_confidence_score','taxon_id','taxon_name','OrthoTree_runtime_msec','OrthoTree_types_hashstr')";
my $sth = $self->dbc->prepare($sql);
$sth->execute;
$sth->finish;
@list_ids = ();
}
}
if (scalar @list_ids) {
my $sql = "delete from protein_tree_tag where node_id in (".join(",",@list_ids).") and tag in ('duplication_confidence_score','taxon_id','taxon_name','OrthoTree_runtime_msec','OrthoTree_types_hashstr')";
my $sth = $self->dbc->prepare($sql);
$sth->execute;
$sth->finish;
@list_ids = ();
}
return undef;
}
sub delete_old_homologies
{
my $self = shift;
return undef unless ($self->input_job->retry_count > 0);
print "deleting old homologies\n" if ($self->debug);
# New method all in one go -- requires key on tree_node_id
my $delete_time = time();
my $tree_node_id = $self->{protein_tree}->node_id;
my $sql1 = "delete h.*, hm.* from homology h, homology_member hm where h.homology_id=hm.homology_id and h.tree_node_id=$tree_node_id";
my $sth1 = $self->dbc->prepare($sql1);
$sth1->execute;
$sth1->finish;
printf("%1.3f secs build links and features\n", time()-$delete_time);
# # Old method one by one
# my %homology_ids;
# my $sql = "select homology_id from homology_member where member_id = ?";
# my $sth = $self->dbc->prepare($sql);
# foreach my $leaf (@{$self->{'protein_tree'}->get_all_leaves}) {
# $sth->execute($leaf->gene_member->member_id);
# while (my $aref = $sth->fetchrow_arrayref) {
# my ($homology_id) = @$aref;
# $homology_ids{$homology_id} = 1;
# }
# }
# my @list_ids;
# foreach my $id (keys %homology_ids) {
# my $delete_time = time();
# push @list_ids, $id;
# if (scalar @list_ids == 2000) {
# my $sql1 = "delete from homology where homology_id in (".join(",",@list_ids).")";
# my $sql2 = "delete from homology_member where homology_id in (".join(",",@list_ids).")";
# my $sth1 = $self->dbc->prepare($sql1);
# my $sth2 = $self->dbc->prepare($sql2);
# $sth1->execute;
# $sth2->execute;
# $sth1->finish;
# $sth2->finish;
# @list_ids = ();
# }
# printf("%1.3f secs build links and features\n", time()-$delete_time);
# }
# if (scalar @list_ids) {
# my $sql1 = "delete from homology where homology_id in (".join(",",@list_ids).")";
# my $sql2 = "delete from homology_member where homology_id in (".join(",",@list_ids).")";
# my $sth1 = $self->dbc->prepare($sql1);
# my $sth2 = $self->dbc->prepare($sql2);
# $sth1->execute;
# $sth2->execute;
# $sth1->finish;
# $sth2->finish;
# @list_ids = ();
# }
$self->{old_homologies_deleted} = 1;
return undef;
}
sub delete_old_homologies_old
{
my $self = shift;
my $genepairlink = shift;
return undef unless ($self->input_job->retry_count > 0);
my ($member1, $member2) = $genepairlink->get_nodes;
my @homologies = @{$self->{'comparaDBA'}->get_HomologyAdaptor->fetch_by_Member_Member_method_link_type
($member1->gene_member, $member2->gene_member, 'ENSEMBL_ORTHOLOGUES')};
push @homologies, @{$self->{'comparaDBA'}->get_HomologyAdaptor->fetch_by_Member_Member_method_link_type
($member1->gene_member, $member2->gene_member, 'ENSEMBL_PARALOGUES')};
my $sql1 = "DELETE FROM homology WHERE homology_id=?";
my $sth1 = $self->dbc->prepare($sql1);
my $sql2 = "DELETE FROM homology_member WHERE homology_id=?";
my $sth2 = $self->dbc->prepare($sql2);
foreach my $homology (@homologies) {
$sth1->execute($homology->dbID);
$sth2->execute($homology->dbID);
}
$sth1->finish;
$sth2->finish;
return undef;
}
sub genepairlink_check_dups
{
my $self = shift;
my $genepairlink = shift;
my ($pep1, $pep2) = $genepairlink->get_nodes;
my $ancestor = $genepairlink->get_tagvalue('ancestor');
my $has_dup=0;
my %nodes_between;
my $tnode = $pep1;
do {
$tnode = $tnode->parent;
my $dup_value = $tnode->get_tagvalue("Duplication");
# my $sis_value = $tnode->get_tagvalue("species_intersection_score");
my $sis_value = $tnode->get_tagvalue("duplication_confidence_score");
$dup_value = 0 unless (defined($dup_value) && $dup_value ne '');
$sis_value = 0 unless (defined($sis_value) && $sis_value ne '');
unless ($sis_value eq '0') {
if($dup_value > 0) {
$has_dup = 1;
}
}
$nodes_between{$tnode->node_id} = $tnode;
} while(!($tnode->equals($ancestor)));
$tnode = $pep2;
do {
$tnode = $tnode->parent;
my $dup_value = $tnode->get_tagvalue("Duplication");
# my $sis_value = $tnode->get_tagvalue("species_intersection_score");
my $sis_value = $tnode->get_tagvalue("duplication_confidence_score");
$dup_value = 0 unless (defined($dup_value) && $dup_value ne '');
$sis_value = 0 unless (defined($sis_value) && $sis_value ne '');
$genepairlink->{duplication_confidence_score} = $sis_value;
unless ($sis_value eq '0') {
if($dup_value > 0) {
$has_dup = 1;
}
}
$nodes_between{$tnode->node_id} = $tnode;
} while(!($tnode->equals($ancestor)));
$genepairlink->add_tag("hops", scalar(keys(%nodes_between)));
$genepairlink->add_tag("has_dups", $has_dup);
return undef;
}
########################################################
#
# Classification analysis
#
########################################################
sub direct_ortholog_test
{
my $self = shift;
my $genepairlink = shift;
#strictest ortholog test:
# - genes are from different species
# - no ancestral duplication events
# - these genes are only copies of the ancestor for their species
return undef if($genepairlink->get_tagvalue('has_dups'));
my ($pep1, $pep2) = $genepairlink->get_nodes;
return undef if($pep1->genome_db_id == $pep2->genome_db_id);
my $ancestor = $genepairlink->get_tagvalue('ancestor');
my $species_hash = $self->get_ancestor_species_hash($ancestor);
#RAP seems to miss some duplication events so check the species
#counts for these two species to make sure they are the only
#representatives of these species under the ancestor
my $count1 = $species_hash->{$pep1->genome_db_id};
my $count2 = $species_hash->{$pep2->genome_db_id};
return undef if($count1>1);
return undef if($count2>1);
#passed all the tests -> it's a simple ortholog
# $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
$genepairlink->add_tag("orthotree_type", 'ortholog_one2one');
my $taxon = $self->get_ancestor_taxon_level($ancestor);
$genepairlink->add_tag("orthotree_subtype", $taxon->name);
return 1;
}
sub inspecies_paralog_test
{
my $self = shift;
my $genepairlink = shift;
#simplest paralog test:
# - both genes are from the same species
# - and just label with taxonomic level
my ($pep1, $pep2) = $genepairlink->get_nodes;
return undef unless($pep1->genome_db_id == $pep2->genome_db_id);
my $ancestor = $genepairlink->get_tagvalue('ancestor');
my $taxon = $self->get_ancestor_taxon_level($ancestor);
#my $species_hash = $self->get_ancestor_species_hash($ancestor);
#foreach my $gdbID (keys(%$species_hash)) {
# return undef unless($gdbID == $pep1->genome_db_id);
#}
#passed all the tests -> it's an inspecies_paralog
# $genepairlink->add_tag("orthotree_type", 'inspecies_paralog');
# $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
$genepairlink->add_tag("orthotree_type", 'within_species_paralog');
$genepairlink->add_tag("orthotree_subtype", $taxon->name);
# Duplication_confidence_score
if ('' eq $ancestor->get_tagvalue("duplication_confidence_score")) {
$self->duplication_confidence_score($ancestor);
$genepairlink->{duplication_confidence_score} = $ancestor->get_tagvalue("duplication_confidence_score");
}
return 1;
}
sub ancient_residual_test
{
my $self = shift;
my $genepairlink = shift;
#test 3: getting a bit more complex:
# - genes are from different species
# - there is evidence for duplication events elsewhere in the history
# - but these two genes are the only remaining representative of
# the ancestor
my ($pep1, $pep2) = $genepairlink->get_nodes;
return undef if($pep1->genome_db_id == $pep2->genome_db_id);
my $ancestor = $genepairlink->get_tagvalue('ancestor');
my $species_hash = $self->get_ancestor_species_hash($ancestor);
#check these are the only representatives of the ancestor
my $count1 = $species_hash->{$pep1->genome_db_id};
my $count2 = $species_hash->{$pep2->genome_db_id};
return undef if($count1>1);
return undef if($count2>1);
#passed all the tests -> it's a simple ortholog
# print $ancestor->node_id, " ", $ancestor->name,"\n";
# little hack to work around some weird treefam trees
if ($self->{'_treefam'}) {
my $dup_value = $ancestor->get_tagvalue("Duplication");
if ($dup_value eq '') {
$dup_value = 0;
$ancestor->add_tag("Duplication",0) ;
}
}
# my $sis_value = $ancestor->get_tagvalue("species_intersection_score");
my $sis_value = $ancestor->get_tagvalue("duplication_confidence_score");
if($ancestor->get_tagvalue("Duplication") > 0 && $sis_value ne '0') {
# $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
$genepairlink->add_tag("orthotree_type", 'apparent_ortholog_one2one');
my $taxon = $self->get_ancestor_taxon_level($ancestor);
$genepairlink->add_tag("orthotree_subtype", $taxon->name);
# Duplication_confidence_score
if ('' eq $ancestor->get_tagvalue("duplication_confidence_score")) {
$self->duplication_confidence_score($ancestor);
$genepairlink->{duplication_confidence_score} = $ancestor->get_tagvalue("duplication_confidence_score");
}
} else {
# $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
$genepairlink->add_tag("orthotree_type", 'ortholog_one2one');
my $taxon = $self->get_ancestor_taxon_level($ancestor);
$genepairlink->add_tag("orthotree_subtype", $taxon->name);
}
return 1;
}
sub one2many_ortholog_test
{
my $self = shift;
my $genepairlink = shift;
#test 4: getting a bit more complex yet again:
# - genes are from different species
# - but there is evidence for duplication events in the history
# - one of the genes is the only remaining representative of the
# ancestor in its species
# - but the other gene has multiple copies in it's species
# (first level of orthogroup analysis)
my ($pep1, $pep2) = $genepairlink->get_nodes;
return undef if($pep1->genome_db_id == $pep2->genome_db_id);
my $ancestor = $genepairlink->get_tagvalue('ancestor');
my $species_hash = $self->get_ancestor_species_hash($ancestor);
my $count1 = $species_hash->{$pep1->genome_db_id};
my $count2 = $species_hash->{$pep2->genome_db_id};
#one of the genes must be the only copy of the gene
#and the other must appear more than once in the ancestry
return undef unless
(
($count1==1 and $count2>1) or ($count1>1 and $count2==1)
);
my $dup_value = $ancestor->get_tagvalue("Duplication");
# my $sis_value = $ancestor->get_tagvalue("species_intersection_score");
my $sis_value = $ancestor->get_tagvalue("duplication_confidence_score");
if($dup_value > 0 && $sis_value ne '0') {
return undef;
}
#passed all the tests -> it's a one2many ortholog
# $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
$genepairlink->add_tag("orthotree_type", 'ortholog_one2many');
my $taxon = $self->get_ancestor_taxon_level($ancestor);
$genepairlink->add_tag("orthotree_subtype", $taxon->name);
return 1;
}
sub outspecies_test
{
my $self = shift;
my $genepairlink = shift;
#last test: left over pairs:
# - genes are from different species
# - if ancestor is 'DUP' -> paralog else 'ortholog'
my ($pep1, $pep2) = $genepairlink->get_nodes;
return undef if($pep1->genome_db_id == $pep2->genome_db_id);
my $ancestor = $genepairlink->get_tagvalue('ancestor');
my $taxon = $self->get_ancestor_taxon_level($ancestor);
#ultra simple ortho/paralog classification
my $dup_value = $ancestor->get_tagvalue("Duplication");
# my $sis_value = $ancestor->get_tagvalue("species_intersection_score");
my $sis_value = $ancestor->get_tagvalue("duplication_confidence_score");
unless ($dup_value eq '') {
if($dup_value > 0 && $sis_value ne '0') {
# $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
$genepairlink->add_tag("orthotree_type", 'between_species_paralog');
$genepairlink->add_tag("orthotree_subtype", $taxon->name);
# Duplication_confidence_score
if ('' eq $ancestor->get_tagvalue("duplication_confidence_score")) {
$self->duplication_confidence_score($ancestor);
$genepairlink->{duplication_confidence_score} = $ancestor->get_tagvalue("duplication_confidence_score");
}
} else {
# $self->delete_old_homologies_old($genepairlink) unless ($self->{'_readonly'});
$genepairlink->add_tag("orthotree_type", 'ortholog_many2many');
$genepairlink->add_tag("orthotree_subtype", $taxon->name);
}
}
return 1;
}
########################################################
#
# ProteinTree input/output section
#
########################################################
sub store_homologies
{
my $self = shift;
my $a=1;
my $hlinkscount = 0;
foreach my $genepairlink (@{$self->{'homology_links'}}) {
$self->display_link_analysis($genepairlink) if($self->debug>2);
# unless (defined($self->{all_between})) {
my $type = $genepairlink->get_tagvalue("orthotree_type");
my $dcs = $genepairlink->{duplication_confidence_score};
$DB::single=$a;1;
next if ($type eq 'between_species_paralog' && $dcs > $self->{no_between});
# }
$self->store_gene_link_as_homology($genepairlink);
print STDERR "homology links $hlinkscount\n" if ($hlinkscount++ % 500 == 0);
}
my $counts_str = $self->encode_hash($self->{'orthotree_homology_counts'});
printf("$counts_str\n");
$self->check_homology_consistency;
$self->{'protein_tree'}->store_tag(
'OrthoTree_types_hashstr',
$self->encode_hash($self->{'orthotree_homology_counts'})) unless ($self->{'_readonly'});
# This has to go to OrthoTree because we haven't stored taxon_id and taxon_name yet here
# Go through and calculate species sampling and count.
# Create our taxonomic tree.
# my $ta = $self->{'comparaDBA'}->get_NCBITaxonAdaptor;
# my $gdb_list = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_all;
# my $taxon_tree=undef;
# foreach my $gdb (@{$gdb_list}) {
# next if ($gdb->name =~ /Ancestral/);
# next if ($gdb->name =~ /ilurana/);
# my $taxon = $ta->fetch_node_by_taxon_id($gdb->taxon_id);
# $taxon->no_autoload_children;
# $taxon_tree = $taxon->root unless($taxon_tree);
# $taxon_tree->merge_node_via_shared_ancestor($taxon);
# }
# $taxon_tree = $taxon_tree->minimize_tree;
# my $tree_species_sampling = 0;
# my $tree_species_num = 0;
# my $num_leaves = $self->{'protein_tree'}->num_leaves;
# if ($num_leaves >= 4) {
# my $taxon_id = $self->{'protein_tree'}->get_tagvalue("taxon_id");
# my $taxon_name = $self->{'protein_tree'}->get_tagvalue("taxon_name");
# #print "ID: $taxon_id $taxon_name\n";
# my $potential_species = scalar @{$taxon_tree->find_node_by_node_id($taxon_id)->get_all_leaves};
# if ($potential_species > 1) {
# my %species;
# my @species_array = map {$_->taxon->name} @{$self->{'protein_tree'}->get_all_leaves}; # Create an array of species names.
# @species{@species_array} = (1) x @species_array; # Fill the hashtable with the taxon names of all represented species.
# my $represented_species = scalar keys %species; # Count up the number of unique represented species.
# my $species_sampling = $represented_species / $potential_species;
# # Tree species sampling.
# $tree_species_sampling = sprintf("%.4f",$species_sampling); # Format the decimals.
# # Tree species number.
# $tree_species_num = $represented_species;
# } else {
# $tree_species_sampling = 1;
# $tree_species_num = 1;
# }
# }
# $self->{'protein_tree'}->store_tag("tree_species_sampling",$tree_species_sampling);
# $self->{'protein_tree'}->store_tag("tree_species_num",$tree_species_num);
$DB::single=1;1;
return undef;
}
sub store_gene_link_as_homology
{
my $self = shift;
my $genepairlink = shift;
my $type = $genepairlink->get_tagvalue('orthotree_type');
return unless($type);
my $subtype = $genepairlink->get_tagvalue('taxon_name');
my $ancestor = $genepairlink->get_tagvalue('ancestor');
my $tree_node_id = $genepairlink->get_tagvalue('tree_node_id');
warn("Tag tree_node_id undefined\n") unless(defined($tree_node_id) && $tree_node_id ne '');
my ($protein1, $protein2) = $genepairlink->get_nodes;
#
# create method_link_species_set
#
my $mlss = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
$mlss->method_link_type("ENSEMBL_ORTHOLOGUES")
unless ($type eq 'between_species_paralog' || $type eq 'within_species_paralog');
$mlss->method_link_type("ENSEMBL_PARALOGUES") if ($type eq 'between_species_paralog' || $type eq 'within_species_paralog');
if ($protein1->genome_db->dbID == $protein2->genome_db->dbID) {
$mlss->species_set([$protein1->genome_db]);
} else {
$mlss->species_set([$protein1->genome_db, $protein2->genome_db]);
}
$self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor->store($mlss) unless ($self->{'_readonly'});
# create an Homology object
my $homology = new Bio::EnsEMBL::Compara::Homology;
$homology->description($type);
$homology->subtype($subtype);
# $homology->node_id($ancestor->node_id);
$homology->ancestor_node_id($ancestor->node_id);
$homology->tree_node_id($tree_node_id);
$homology->method_link_type($mlss->method_link_type);
$homology->method_link_species_set($mlss);
my $key = $mlss->dbID . "_" . $protein1->dbID;
$self->{_homology_consistency}{$key}{$type} = 1;
#$homology->dbID(-1);
# NEED TO BUILD THE Attributes (ie homology_members)
my ($cigar_line1, $perc_id1, $perc_pos1,
$cigar_line2, $perc_id2, $perc_pos2) =
$self->generate_attribute_arguments($protein1, $protein2,$type);
# QUERY member
#
my $attribute;
$attribute = new Bio::EnsEMBL::Compara::Attribute;
$attribute->peptide_member_id($protein1->dbID);
$attribute->cigar_line($cigar_line1);
$attribute->perc_cov(100);
$attribute->perc_id(int($perc_id1));
$attribute->perc_pos(int($perc_pos1));
$homology->add_Member_Attribute([$protein1->gene_member, $attribute]);
#
# HIT member
#
$attribute = new Bio::EnsEMBL::Compara::Attribute;
$attribute->peptide_member_id($protein2->dbID);
$attribute->cigar_line($cigar_line2);
$attribute->perc_cov(100);
$attribute->perc_id(int($perc_id2));
$attribute->perc_pos(int($perc_pos2));
$homology->add_Member_Attribute([$protein2->gene_member, $attribute]);
## Check if it has already been stored, in which case we dont need to store again
my $matching_homology = 0;
if ($self->input_job->retry_count > 0 && !defined($self->{old_homologies_deleted})) {
my $member_id1 = $protein1->gene_member->member_id;
my $member_id2 = $protein2->gene_member->member_id;
if ($member_id1 == $member_id2) {
my $tree_id = $self->{protein_tree}->node_id;
my $pmember_id1 = $protein1->member_id; my $pstable_id1 = $protein1->stable_id;
my $pmember_id2 = $protein2->member_id; my $pstable_id2 = $protein2->stable_id;
$self->throw("$member_id1 ($pmember_id1 - $pstable_id1) and $member_id2 ($pmember_id2 - $pstable_id2) shouldn't be the same");
}
my $stored_homology = @{$self->{homologyDBA}->fetch_by_Member_id_Member_id($member_id1,$member_id2)}[0];
if (defined($stored_homology)) {
$matching_homology = 1;
$matching_homology = 0 if ($stored_homology->description ne $homology->description);
$matching_homology = 0 if ($stored_homology->subtype ne $homology->subtype);
$matching_homology = 0 if ($stored_homology->ancestor_node_id ne $homology->ancestor_node_id);
$matching_homology = 0 if ($stored_homology->tree_node_id ne $homology->tree_node_id);
$matching_homology = 0 if ($stored_homology->method_link_type ne $homology->method_link_type);
$matching_homology = 0 if ($stored_homology->method_link_species_set->dbID ne $homology->method_link_species_set->dbID);
}
# Delete old one, then proceed to store new one
if (defined($stored_homology) && (0 == $matching_homology)) {
my $homology_id = $stored_homology->dbID;
my $sql1 = "delete from homology where homology_id=$homology_id";
my $sql2 = "delete from homology_member where homology_id=$homology_id";
my $sth1 = $self->dbc->prepare($sql1);
my $sth2 = $self->dbc->prepare($sql2);
$sth1->execute;
$sth2->execute;
$sth1->finish;
$sth2->finish;
}
}
if($self->{'store_homologies'} && 0 == $matching_homology) {
$self->{'homologyDBA'}->store($homology);
}
my $stable_id;
if($protein1->taxon_id < $protein2->taxon_id) {
$stable_id = $protein1->taxon_id . "_" . $protein2->taxon_id . "_";
} else {
$stable_id = $protein2->taxon_id . "_" . $protein1->taxon_id . "_";
}
$stable_id .= sprintf ("%011.0d",$homology->dbID) if($homology->dbID);
$homology->stable_id($stable_id);
#TODO: update the stable_id of the homology
# if($self->debug) {
# print("store: ");
# $homology->print_homology;
# }
return undef;
}
sub check_homology_consistency {
my $self = shift;
if ($self->{debug}) {
print "checking homology consistency\n";
foreach my $mlss_member_id ( keys %{$self->{_homology_consistency}} ) {
my $count = scalar(keys %{$self->{_homology_consistency}{$mlss_member_id}});
if ($count > 1) {
my ($mlss, $member_id) = split("_",$mlss_member_id);
print "mlss member_id : $mlss $member_id\n";
}
}
}
foreach my $mlss_member_id ( keys %{$self->{_homology_consistency}} ) {
my $count = scalar(keys %{$self->{_homology_consistency}{$mlss_member_id}});
if ($count > 1) {
my ($mlss, $member_id) = split("_",$mlss_member_id);
$self->throw("Inconsistent homologies in mlss $mlss and member_id $member_id");
}
}
}
sub _treefam_genepairlink_stats
{
# This method is only useful to compare treefam v genetree
my $self = shift;
my $starttime = time()*1000;
my $tmp_time = time();
my $tree = $self->{'protein_tree'};
my @all_protein_leaves = @{$tree->get_all_leaves};
#precalculate the ancestor species_hash (caches into the metadata of nodes)
#also augments the Duplication tagging
$self->get_ancestor_species_hash($tree);
#compare every gene in the tree with every other
#each gene/gene pairing is a potential ortholog/paralog
#and thus we need to analyze every possibility
#Accomplish by creating a fully connected graph between all the
#genes under the tree (hybrid graph structure) and then analyze each
#gene/gene link
$tmp_time = time();
printf("build fully linked graph\n") if($self->debug);
my @genepairlinks;
while (my $protein1 = shift @all_protein_leaves) {
foreach my $protein2 (@all_protein_leaves) {
my $ancestor = $protein1->find_first_shared_ancestor($protein2);
my $taxon_level = $self->get_ancestor_taxon_level($ancestor);
my $distance = $protein1->distance_to_ancestor($ancestor) +
$protein2->distance_to_ancestor($ancestor);
my $genepairlink = new Bio::EnsEMBL::Compara::Graph::Link
($protein1, $protein2, $distance);
$genepairlink->add_tag("hops", 0);
$genepairlink->add_tag("ancestor", $ancestor);
$genepairlink->add_tag("taxon_name", $taxon_level->name);
push @genepairlinks, $genepairlink;
}
}
printf("%1.3f secs build links and features\n", time()-$tmp_time)
if($self->debug>1);
$self->{'protein_tree'}->print_tree($self->{'tree_scale'})
if($self->debug);
#sort the gene/gene links by distance
# makes debug display easier to read, not required by algorithm
$tmp_time = time();
printf("sort links\n") if($self->debug);
my @sorted_genepairlinks =
sort {$a->distance_between <=> $b->distance_between} @genepairlinks;
printf("%1.3f secs to sort links\n", time()-$tmp_time) if($self->debug > 1);
#analyze every gene pair (genepairlink) to get its classification
printf("analyze links\n") if($self->debug);
$tmp_time = time();
my $count = 0;
foreach my $genepairlink (@sorted_genepairlinks) {
my ($protein1, $protein2) = $genepairlink->get_nodes;
#run feature detectors: precalcs and caches into metadata
$self->genepairlink_check_dups($genepairlink);
$self->genepairlink_fetch_homology($genepairlink) if($self->debug);
#do classification analysis : as filter stack
if($self->inspecies_paralog_test($genepairlink)) { }
elsif($self->direct_ortholog_test($genepairlink)) { }
elsif($self->ancient_residual_test($genepairlink)) { }
elsif($self->one2many_ortholog_test($genepairlink)) { }
elsif($self->outspecies_test($genepairlink)) { }
else {
printf
("OOPS!!!! %s - %s\n",
$protein1->gene_member->stable_id,
$protein2->gene_member->stable_id
);
}
my $stid1;
if (defined($protein1->gene_member)) {
$stid1 = $protein1->gene_member->stable_id;
} elsif (defined($protein1->get_tagvalue("G"))) {
$stid1 = $protein1->get_tagvalue("G");
} else {
$stid1 = "unknown";
}
my $stid2;
if (defined($protein2->gene_member)) {
$stid2 = $protein2->gene_member->stable_id;
} elsif (defined($protein2->get_tagvalue("G"))) {
$stid2 = $protein2->get_tagvalue("G");
} else {
$stid2 = "unknown";
}
my @stids = sort ($stid1,$stid2);
my $type = $genepairlink->get_tagvalue('orthotree_type');
my $subtype = $genepairlink->get_tagvalue('orthotree_subtype') || 'NA';
my $tree_type = "GT";
$tree_type = "TF" if $self->{'_treefam'};
my $tree_id = $self->{'protein_tree'}->node_id unless $self->{'_treefam'};
$tree_id = $self->{'_treefam'} if $self->{'_treefam'};
my $dup =
$genepairlink->get_tagvalue('ancestor')->get_tagvalue('Duplication');
my $dup_alg =
$genepairlink->get_tagvalue('ancestor')->get_tagvalue('Duplication_alg') || 0;
$self->{_gpresults} .= "$tree_type,$tree_id,$stids[0]"."_"."$stids[1]".","."$type,$subtype,$dup,$dup_alg\n";
$self->{_homologytable} .= "$stids[0]".","."$stids[1]".","."$type,$subtype\n";
print STDERR "analyze links $count\n" if ($count % 500 == 0);
}
printf("%1.3f secs to analyze genepair links\n", time()-$tmp_time)
if($self->debug > 1);
$self->store_homologies if ($self->{debug});
}
sub generate_attribute_arguments {
my ($self, $protein1, $protein2, $type) = @_;
my $new_aln1_cigarline = "";
my $new_aln2_cigarline = "";
my $perc_id1 = 0;
my $perc_pos1 = 0;
my $perc_id2 = 0;
my $perc_pos2 = 0;
# This speeds up the pipeline for this portion of the homology table
if ($type eq 'between_species_paralog') {
return ($new_aln1_cigarline, $perc_id1, $perc_pos1, $new_aln2_cigarline, $perc_id2, $perc_pos2);
}
my $identical_matches = 0;
my $positive_matches = 0;
my $m_hash = $self->get_matrix_hash;
my ($aln1state, $aln2state);
my ($aln1count, $aln2count);
# my @aln1 = split(//, $protein1->alignment_string); # Speed up
# my @aln2 = split(//, $protein2->alignment_string);
my $alignment_string = $protein1->alignment_string;
my @aln1 = unpack("A1" x length($alignment_string), $alignment_string);
$alignment_string = $protein2->alignment_string;
my @aln2 = unpack("A1" x length($alignment_string), $alignment_string);
for (my $i=0; $i <= $#aln1; $i++) {
next if ($aln1[$i] eq "-" && $aln2[$i] eq "-");
my ($cur_aln1state, $cur_aln2state) = qw(M M);
if ($aln1[$i] eq "-") {
$cur_aln1state = "D";
}
if ($aln2[$i] eq "-") {
$cur_aln2state = "D";
}
if ($cur_aln1state eq "M" && $cur_aln2state eq "M" && $aln1[$i] eq $aln2[$i]) {
$identical_matches++;
$positive_matches++;
} elsif ($cur_aln1state eq "M" && $cur_aln2state eq "M" && $m_hash->{uc $aln1[$i]}{uc $aln2[$i]} > 0) {
$positive_matches++;
}
unless (defined $aln1state) {
$aln1count = 1;
$aln2count = 1;
$aln1state = $cur_aln1state;
$aln2state = $cur_aln2state;
next;
}
if ($cur_aln1state eq $aln1state) {
$aln1count++;
} else {
if ($aln1count == 1) {
$new_aln1_cigarline .= $aln1state;
} else {
$new_aln1_cigarline .= $aln1count.$aln1state;
}
$aln1count = 1;
$aln1state = $cur_aln1state;
}
if ($cur_aln2state eq $aln2state) {
$aln2count++;
} else {
if ($aln2count == 1) {
$new_aln2_cigarline .= $aln2state;
} else {
$new_aln2_cigarline .= $aln2count.$aln2state;
}
$aln2count = 1;
$aln2state = $cur_aln2state;
}
}
if ($aln1count == 1) {
$new_aln1_cigarline .= $aln1state;
} else {
$new_aln1_cigarline .= $aln1count.$aln1state;
}
if ($aln2count == 1) {
$new_aln2_cigarline .= $aln2state;
} else {
$new_aln2_cigarline .= $aln2count.$aln2state;
}
my $seq_length1 = $protein1->seq_length;
unless (0 == $seq_length1) {
$perc_id1 = $identical_matches*100.0/$seq_length1;
$perc_pos1 = $positive_matches*100.0/$seq_length1;
}
my $seq_length2 = $protein2->seq_length;
unless (0 == $seq_length2) {
$perc_id2 = $identical_matches*100.0/$seq_length2;
$perc_pos2 = $positive_matches*100.0/$seq_length2;
}
# my $perc_id1 = $identical_matches*100.0/$protein1->seq_length;
# my $perc_pos1 = $positive_matches*100.0/$protein1->seq_length;
# my $perc_id2 = $identical_matches*100.0/$protein2->seq_length;
# my $perc_pos2 = $positive_matches*100.0/$protein2->seq_length;
return ($new_aln1_cigarline, $perc_id1, $perc_pos1, $new_aln2_cigarline, $perc_id2, $perc_pos2);
}
sub get_matrix_hash {
my $self = shift;
return $self->{'matrix_hash'} if (defined $self->{'matrix_hash'});
my $BLOSUM62 = "# Matrix made by matblas from blosum62.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 62
# Entropy = 0.6979, Expected = -0.5209
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
";
my $matrix_string;
my @lines = split(/\n/,$BLOSUM62);
foreach my $line (@lines) {
next if ($line =~ /^\#/);
if ($line =~ /^[A-Z\*\s]+$/) {
$matrix_string .= sprintf "$line\n";
} else {
my @t = split(/\s+/,$line);
shift @t;
# print scalar @t,"\n";
$matrix_string .= sprintf(join(" ",@t)."\n");
}
}
my %matrix_hash;
@lines = ();
@lines = split /\n/, $matrix_string;
my $lts = shift @lines;
$lts =~ s/^\s+//;
$lts =~ s/\s+$//;
my @letters = split /\s+/, $lts;
foreach my $letter (@letters) {
my $line = shift @lines;
$line =~ s/^\s+//;
$line =~ s/\s+$//;
my @penalties = split /\s+/, $line;
die "Size of letters array and penalties array are different\n"
unless (scalar @letters == scalar @penalties);
for (my $i=0; $i < scalar @letters; $i++) {
$matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i];
}
}
$self->{'matrix_hash'} = \%matrix_hash;
return $self->{'matrix_hash'};
}
1;