Raw content of Bio::EnsEMBL::Compara::RunnableDB::QuickTreeBreak
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::RunnableDB::QuickTreeBreak
=cut
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $quicktreebreak = Bio::EnsEMBL::Compara::RunnableDB::QuickTreeBreak->new
(
-db => $db,
-input_id => $input_id,
-analysis => $analysis
);
$quicktreebreak->fetch_input(); #reads from DB
$quicktreebreak->run();
$quicktreebreak->output();
$quicktreebreak->write_output(); #writes to DB
=cut
=head1 DESCRIPTION
This Analysis/RunnableDB is designed to take ProteinTree as input.
This must already have a multiple alignment run on it. It uses that
alignment as input into the QuickTree program which then generates a
simple phylogenetic tree to be broken down into 2 pieces.
Google QuickTree to get the latest tar.gz from the Sanger.
Google sreformat to get the sequence reformatter that switches from fasta to stockholm.
input_id/parameters format eg: "{'protein_tree_id'=>1234,'clusterset_id'=>1}"
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
=cut
=head1 CONTACT
Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
Contact Javier Herrero on EnsEMBL/Compara: jherrero@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::RunnableDB::QuickTreeBreak;
use strict;
use Getopt::Long;
use IO::File;
use File::Basename;
use Time::HiRes qw(time gettimeofday tv_interval);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::Member;
use Bio::EnsEMBL::Compara::Graph::NewickParser;
use Bio::SimpleAlign;
use Bio::AlignIO;
use Bio::EnsEMBL::Hive;
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
=head2 fetch_input
Title : fetch_input
Usage : $self->fetch_input
Function: Fetches input data for repeatmasker from the database
Returns : none
Args : none
=cut
sub fetch_input {
my( $self) = @_;
$self->{'max_gene_count'} = 1500; # Can be overriden later
$self->throw("No input_id") unless defined($self->input_id);
#create a Compara::DBAdaptor which shares the same DBI handle
#with the pipeline DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new
(
-DBCONN=>$self->db->dbc
);
$self->get_params($self->parameters);
$self->get_params($self->input_id);
$self->print_params if($self->debug);
$self->check_job_fail_options;
$self->check_if_exit_cleanly;
unless($self->{'protein_tree'}) {
throw("undefined ProteinTree as input\n");
}
return 1;
}
=head2 run
Title : run
Usage : $self->run
Function: runs NJTREE PHYML
Returns : none
Args : none
=cut
sub run {
my $self = shift;
$self->check_if_exit_cleanly;
$self->run_quicktreebreak;
}
=head2 write_output
Title : write_output
Usage : $self->write_output
Function: stores proteintree
Returns : none
Args : none
=cut
sub write_output {
my $self = shift;
$self->check_if_exit_cleanly;
$self->store_proteintrees;
}
sub DESTROY {
my $self = shift;
if($self->{'protein_tree'}) {
printf("QuickTreeBreak::DESTROY releasing tree\n") if($self->debug);
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
$self->{'max_subtree'}->release_tree;
$self->{'new_subtree'}->release_tree;
$self->{'remaining_subtree'}->release_tree;
$self->{'max_subtree'} = undef;
$self->{'new_subtree'} = undef;
$self->{'remaining_subtree'} = undef;
}
$self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
}
##########################################
#
# internal methods
#
##########################################
sub get_params {
my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n") if($self->debug);
my $params = eval($param_string);
return unless($params);
if($self->debug) {
foreach my $key (keys %$params) {
print(" $key : ", $params->{$key}, "\n");
}
}
if(defined($params->{'protein_tree_id'})) {
$self->{'protein_tree'} =
$self->{'comparaDBA'}->get_ProteinTreeAdaptor->
fetch_node_by_node_id($params->{'protein_tree_id'});
}
$self->{'max_gene_count'} =
$params->{'max_gene_count'} if(defined($params->{'max_gene_count'}));
$self->{'clusterset_id'} =
$params->{'clusterset_id'} if(defined($params->{'clusterset_id'}));
foreach my $key (qw[max_gene_count use_genomedb_id]) {
my $value = $params->{$key};
$self->{$key} = $value if defined $value;
}
return;
}
sub print_params {
my $self = shift;
print("params:\n");
print(" tree_id : ", $self->{'protein_tree'}->node_id,"\n") if($self->{'protein_tree'});
}
sub run_quicktreebreak {
my $self = shift;
my $starttime = time()*1000;
$self->{'input_aln'} = $self->dumpTreeMultipleAlignmentToWorkdir
(
$self->{'protein_tree'}
);
return unless($self->{'input_aln'});
$self->{'newick_file'} = $self->{'input_aln'} . "_quicktreebreak_tree.txt ";
my $quicktreebreak_executable = $self->analysis->program_file || '';
unless (-e $quicktreebreak_executable) {
$quicktreebreak_executable = "/nfs/acari/avilella/src/quicktree/quicktree_1.1/bin/quicktree";
}
throw("can't find a quicktree executable to run\n")
unless(-e $quicktreebreak_executable);
my $cmd = $quicktreebreak_executable;
$cmd .= " -out t -in a";
$cmd .= " ". $self->{'input_aln'};
#/nfs/acari/avilella/src/quicktree/quicktree_1.1/bin/quicktree -out t
# -in a /tmp/worker.12270/proteintree_517373.stk
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);
print("$cmd\n") if($self->debug);
open(RUN, "$cmd |") or $self->throw("error running quicktree, $!\n");
my @output = ;
my $exit_status = close(RUN);
if (!$exit_status) {
$self->throw("error running quicktree, $!\n");
}
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
foreach my $line (@output) {
$line =~ s/\n//;
$self->{quicktree_newick_string} .= $line;
}
#parse the tree into the datastucture
$self->generate_subtrees;
my $runtime = time()*1000-$starttime;
$self->{'protein_tree'}->store_tag('QuickTreeBreak_runtime_msec', $runtime);
}
sub check_job_fail_options {
my $self = shift;
if ($self->input_job->retry_count > 8) {
# $self->input_job->adaptor->reset_highmem_job_by_dbID($self->input_job->dbID);
$self->input_job->update_status('FAILED');
$self->DESTROY;
throw("QuickTree job failed");
}
}
########################################################
#
# ProteinTree input/output section
#
########################################################
sub dumpTreeMultipleAlignmentToWorkdir {
my $self = shift;
my $tree = shift;
$self->{original_leafcount} = scalar(@{$tree->get_all_leaves});
if($self->{original_leafcount}<3) {
printf(STDERR "tree cluster %d has <3 proteins - can not build a tree\n",
$tree->node_id);
return undef;
}
$self->{'file_root'} =
$self->worker_temp_directory. "proteintree_". $tree->node_id;
$self->{'file_root'} =~ s/\/\//\//g; # converts any // in path to /
my $aln_file = $self->{'file_root'} . ".aln";
return $aln_file if(-e $aln_file);
if($self->debug) {
printf("dumpTreeMultipleAlignmentToWorkdir : %d members\n", $self->{original_leafcount});
print("aln_file = '$aln_file'\n");
}
open(OUTSEQ, ">$aln_file")
or $self->throw("Error opening $aln_file for write");
# Using append_taxon_id will give nice seqnames_taxonids needed for
# njtree species_tree matching
my %sa_params = ($self->{use_genomedb_id}) ? ('-APPEND_GENOMEDB_ID', 1) :
('-APPEND_TAXON_ID', 1);
my $sa = $tree->get_SimpleAlign
(
-id_type => 'MEMBER',
%sa_params
);
$sa->set_displayname_flat(1);
my $alignIO = Bio::AlignIO->newFh
(
-fh => \*OUTSEQ,
-format => "fasta"
);
print $alignIO $sa;
close OUTSEQ;
print STDERR "Using sreformat to change to stockholm format\n" if ($self->debug);
my $stk_file = $self->{'file_root'} . ".stk";
my $cmd = "/usr/local/ensembl/bin/sreformat stockholm $aln_file > $stk_file";
unless( system("$cmd") == 0) {
print("$cmd\n");
$self->check_job_fail_options;
throw("error running sreformat, $!\n");
}
$self->{'input_aln'} = $stk_file;
return $stk_file;
}
sub store_proteintrees {
my $self = shift;
$self->delete_original_cluster;
$self->store_clusters;
if($self->debug >1) {
print("done storing\n");
}
return undef;
}
sub store_clusters {
my $self = shift;
my $mlssDBA = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor;
my $pafDBA = $self->{'comparaDBA'}->get_PeptideAlignFeatureAdaptor;
my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
my $starttime = time();
my $clusterset = $treeDBA->fetch_node_by_node_id($self->{'clusterset_id'});
throw("no clusterset found: $!\n") unless($clusterset);
$clusterset->no_autoload_children; # Important so that only the two below are used
$clusterset->add_child($self->{new_subtree});
$clusterset->add_child($self->{remaining_subtree});
my $clusters = $clusterset->children;
foreach my $cluster (@{$clusters}) {
my $node_id = $treeDBA->store($cluster);
#calc residue count total
my $leafcount = scalar(@{$cluster->get_all_leaves});
$cluster->store_tag('gene_count', $leafcount);
$cluster->store_tag('original_cluster', $self->{'original_cluster'}->node_id);
print STDERR "Stored $node_id with $leafcount leaves\n" if ($self->debug);
# Dataflow clusters
my $output_id = sprintf("{'protein_tree_id'=>%d, 'clusterset_id'=>%d}",
$node_id, $clusterset->node_id);
$self->dataflow_output_id($output_id, 2);
print STDERR "Created QuickTreeBreak job for $node_id\n";
}
}
sub delete_original_cluster {
my $self = shift;
my $delete_time = time();
my $original_cluster = $self->{'original_cluster'}->node_id;
# $original_cluster->store_tag('cluster_had_to_be_broken_down',1);
$self->delete_old_orthotree_tags;
my $tree_node_id = $original_cluster;
my $sql1 = "delete h.*, hm.* from homology h, homology_member hm where h.homology_id=hm.homology_id and h.tree_node_id=$tree_node_id";
my $sth1 = $self->dbc->prepare($sql1);
$sth1->execute;
$sth1->finish;
$self->{original_cluster}->adaptor->delete_node_and_under($self->{original_cluster});
printf("%1.3f secs to delete old cluster $original_cluster\n", time()-$delete_time);
return 1;
}
sub delete_old_orthotree_tags {
my $self = shift;
print "deleting old orthotree tags\n" if ($self->debug);
my @node_ids;
my $left_index = $self->{'protein_tree'}->left_index;
my $right_index = $self->{'protein_tree'}->right_index;
my $tree_root_node_id = $self->{'protein_tree'}->node_id;
# Include the root_id as well as the rest of the nodes within the tree
push @node_ids, $tree_root_node_id;
my $sql = "select ptn.node_id from protein_tree_node ptn where ptn.left_index>$left_index and ptn.right_index<$right_index";
my $sth = $self->dbc->prepare($sql);
$sth->execute;
while (my $aref = $sth->fetchrow_arrayref) {
my ($node_id) = @$aref;
push @node_ids, $node_id;
}
my @list_ids;
foreach my $id (@node_ids) {
push @list_ids, $id;
if (scalar @list_ids == 2000) {
my $sql = "delete from protein_tree_tag where node_id in (".join(",",@list_ids).") and tag in ('duplication_confidence_score','taxon_id','taxon_name','OrthoTree_runtime_msec','OrthoTree_types_hashstr')";
my $sth = $self->dbc->prepare($sql);
$sth->execute;
$sth->finish;
@list_ids = ();
}
}
if (scalar @list_ids) {
my $sql = "delete from protein_tree_tag where node_id in (".join(",",@list_ids).") and tag in ('duplication_confidence_score','taxon_id','taxon_name','OrthoTree_runtime_msec','OrthoTree_types_hashstr')";
my $sth = $self->dbc->prepare($sql);
$sth->execute;
$sth->finish;
@list_ids = ();
}
return undef;
}
sub generate_subtrees {
my $self = shift;
my $newick = $self->{'quicktree_newick_string'};
my $tree = $self->{'protein_tree'};
#cleanup old tree structure-
# flatten and reduce to only AlignedMember leaves
$tree->flatten_tree;
$tree->print_tree(20) if($self->debug);
foreach my $node (@{$tree->get_all_leaves}) {
next if($node->isa('Bio::EnsEMBL::Compara::AlignedMember'));
$node->disavow_parent;
}
#parse newick into a new tree object structure
my $newtree =
Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick);
$newtree->print_tree(20) if($self->debug > 1);
# get rid of the taxon_id needed by njtree -- name tag
foreach my $leaf (@{$newtree->get_all_leaves}) {
my $quicktreebreak_name = $leaf->get_tagvalue('name');
$quicktreebreak_name =~ /(\d+)\_\d+/;
my $member_name = $1;
$leaf->add_tag('name', $member_name);
bless $leaf, "Bio::EnsEMBL::Compara::AlignedMember";
$leaf->member_id($member_name);
}
# Break the tree by immediate children recursively
my @children;
my $keep_braking = 1;
$self->{max_subtree} = $newtree;
while ($keep_braking) {
@children = @{$self->{max_subtree}->children};
my $max_num_leaves = 0;
foreach my $child (@children) {
my $num_leaves = scalar(@{$child->get_all_leaves});
if ($num_leaves > $max_num_leaves) {
$max_num_leaves = $num_leaves;
$self->{max_subtree} = $child;
}
}
# Broke down to half, happy with it
my $proportion = ($max_num_leaves*100/$self->{original_leafcount});
print STDERR "QuickTreeBreak iterate -- $max_num_leaves ($proportion)\n" if ($self->debug);
if ($proportion <= 50) {
$keep_braking = 0;
}
}
# Create a copy of what is not max_subtree
$self->{remaining_subtree} = $self->{protein_tree}->copy;
$self->{new_subtree} = $self->{protein_tree}->copy;
$self->{new_subtree}->flatten_tree;
$self->{remaining_subtree}->flatten_tree;
my $subtree_leaves;
foreach my $leaf (@{$self->{max_subtree}->get_all_leaves}) {
$subtree_leaves->{$leaf->member_id} = 1;
}
foreach my $leaf (@{$self->{new_subtree}->get_all_leaves}) {
unless (defined $subtree_leaves->{$leaf->member_id}) {
print $leaf->name," leaf disavowing parent\n" if $self->debug;
$leaf->disavow_parent;
}
}
foreach my $leaf (@{$self->{remaining_subtree}->get_all_leaves}) {
if (defined $subtree_leaves->{$leaf->member_id}) {
print $leaf->name," leaf disavowing parent\n" if $self->debug;
$leaf->disavow_parent;
}
}
$self->{remaining_subtree} = $self->{remaining_subtree}->minimize_tree;
$self->{new_subtree} = $self->{new_subtree}->minimize_tree;
# Some checks
$self->throw("QuickTreeBreak: Failed to generate subtrees: $!\n") unless(defined($self->{'new_subtree'}) && defined($self->{'remaining_subtree'}));
my $final_original_num = scalar @{$self->{protein_tree}->get_all_leaves};
my $final_max_num = scalar @{$self->{new_subtree}->get_all_leaves};
my $final_remaining_num = scalar @{$self->{remaining_subtree}->get_all_leaves};
if(($final_max_num + $final_remaining_num) != $final_original_num) {
$self->throw("QuickTreeBreak: Incorrect sum of leaves [$final_max_num + $final_remaining_num != $final_original_num]: $!\n");
}
$self->{'original_cluster'} = $self->{protein_tree};
return undef;
}
1;