Raw content of Bio::EnsEMBL::Compara::RunnableDB::NJTREE_PHYML
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::RunnableDB::NJTREE_PHYML
=cut
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $njtree_phyml = Bio::EnsEMBL::Compara::RunnableDB::NJTREE_PHYML->new
(
-db => $db,
-input_id => $input_id,
-analysis => $analysis
);
$njtree_phyml->fetch_input(); #reads from DB
$njtree_phyml->run();
$njtree_phyml->output();
$njtree_phyml->write_output(); #writes to DB
=cut
=head1 DESCRIPTION
This Analysis/RunnableDB is designed to take ProteinTree as input
This must already have a multiple alignment run on it. It uses that alignment
as input into the NJTREE PHYML program which then generates a phylogenetic tree
input_id/parameters format eg: "{'protein_tree_id'=>1234}"
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
=cut
=head1 CONTACT
Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
Contact Abel Ureta-Vidal on EnsEMBL/Compara: abel@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::RunnableDB::NJTREE_PHYML;
use strict;
use Getopt::Long;
use IO::File;
use File::Basename;
use Time::HiRes qw(time gettimeofday tv_interval);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::Member;
use Bio::EnsEMBL::Compara::Graph::NewickParser;
use Bio::SimpleAlign;
use Bio::AlignIO;
use Bio::EnsEMBL::Hive;
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
=head2 fetch_input
Title : fetch_input
Usage : $self->fetch_input
Function: Fetches input data for repeatmasker from the database
Returns : none
Args : none
=cut
sub fetch_input {
my( $self) = @_;
$self->{'cdna'} = 1;
$self->{'bootstrap'} = 1;
$self->{'max_gene_count'} = 200;
$self->throw("No input_id") unless defined($self->input_id);
#create a Compara::DBAdaptor which shares the same DBI handle
#with the pipeline DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new
(
-DBCONN=>$self->db->dbc
);
$self->get_params($self->parameters);
$self->get_params($self->input_id);
$self->print_params if($self->debug);
$self->check_job_fail_options;
$self->check_if_exit_cleanly;
unless($self->{'protein_tree'}) {
throw("undefined ProteinTree as input\n");
}
if ($self->{'protein_tree'}->get_tagvalue('gene_count')
> $self->{'max_gene_count'}) {
# 3 is QuickTreeBreak -- 2 is NJTREE_PHYML jackknife
$self->dataflow_output_id($self->input_id, 3);
$self->input_job->update_status('FAILED');
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
throw("NJTREE_PHYML : cluster size over threshold and FAIL it");
}
return 1;
}
=head2 run
Title : run
Usage : $self->run
Function: runs NJTREE PHYML
Returns : none
Args : none
=cut
sub run {
my $self = shift;
$self->check_if_exit_cleanly;
$self->run_njtree_phyml;
}
=head2 write_output
Title : write_output
Usage : $self->write_output
Function: stores proteintree
Returns : none
Args : none
=cut
sub write_output {
my $self = shift;
$self->check_if_exit_cleanly;
$self->store_proteintree;
}
sub DESTROY {
my $self = shift;
if($self->{'protein_tree'}) {
printf("NJTREE_PHYML::DESTROY releasing tree\n") if($self->debug);
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
}
$self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
}
##########################################
#
# internal methods
#
##########################################
sub get_params {
my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n") if($self->debug);
my $params = eval($param_string);
return unless($params);
if (defined $params->{'njtree_phyml_analysis_data_id'}) {
my $njtree_phyml_analysis_data_id = $params->{'njtree_phyml_analysis_data_id'};
my $ada = $self->db->get_AnalysisDataAdaptor;
my $new_params = eval($ada->fetch_by_dbID($njtree_phyml_analysis_data_id));
if (defined $new_params) {
$params = $new_params;
}
}
if($self->debug) {
foreach my $key (keys %$params) {
print(" $key : ", $params->{$key}, "\n");
}
}
if(defined($params->{'protein_tree_id'})) {
$self->{'protein_tree'} =
$self->{'comparaDBA'}->get_ProteinTreeAdaptor->
fetch_node_by_node_id($params->{'protein_tree_id'});
}
$self->{'jackknife'} = $params->{'jackknife'} if(defined($params->{'jackknife'}));
$self->{'cdna'} = $params->{'cdna'} if(defined($params->{'cdna'}));
$self->{'max_gene_count'} =
$params->{'max_gene_count'} if(defined($params->{'max_gene_count'}));
foreach my $key (qw[cdna max_gene_count species_tree_file honeycomb_dir use_genomedb_id]) {
my $value = $params->{$key};
$self->{$key} = $value if defined $value;
}
return;
}
sub print_params {
my $self = shift;
print("params:\n");
print(" tree_id : ", $self->{'protein_tree'}->node_id,"\n") if($self->{'protein_tree'});
print(" cdna : ", $self->{'cdna'},"\n");
}
sub run_njtree_phyml
{
my $self = shift;
my $starttime = time()*1000;
$self->{'cdna'} = 1; #always use cdna for njtree_phyml
$self->{'input_aln'} = $self->dumpTreeMultipleAlignmentToWorkdir
(
$self->{'protein_tree'}
);
return unless($self->{'input_aln'});
$self->{'newick_file'} = $self->{'input_aln'} . "_njtree_phyml_tree.txt ";
my $njtree_phyml_executable = $self->analysis->program_file || '';
unless (-e $njtree_phyml_executable) {
if (`uname -m` =~ /ia64/) {
$njtree_phyml_executable = "/nfs/acari/avilella/src/treesoft/trunk/treebest/ia64/treebest";
} else {
# it is a linux machine
# md5sum 91a9da7ad7d38ebedd5ce363a28d509b
# $njtree_phyml_executable = "/lustre/work1/ensembl/avilella/bin/i386/njtree_gcc";
# $njtree_phyml_executable = "/nfs/acari/avilella/src/_treesoft/treebest/treebest";
$njtree_phyml_executable = "/nfs/acari/avilella/src/treesoft/trunk/treebest/treebest";
}
}
throw("can't find a njtree executable to run\n")
unless(-e $njtree_phyml_executable);
throw("can't find species_tree_file\n")
unless(-e $self->{'species_tree_file'});
# ./njtree best -f spec-v4.1.nh -p tree -o $BASENAME.best.nhx \
# $BASENAME.nucl.mfa -b 100 2>&1/dev/null
my $cmd = $njtree_phyml_executable;
if (1 == $self->{'bootstrap'}) {
$cmd .= " best ";
if (defined($self->{'species_tree_file'})) {
$cmd .= " -f ". $self->{'species_tree_file'};
}
$cmd .= " ". $self->{'input_aln'};
$cmd .= " -p tree ";
$cmd .= " -o " . $self->{'newick_file'};
my $logfile = $self->worker_temp_directory. "proteintree_". $self->{'protein_tree'}->node_id . ".log";
my $errfile = $self->worker_temp_directory. "proteintree_". $self->{'protein_tree'}->node_id . ".err";
$cmd .= " 1>$logfile 2>$errfile";
# $cmd .= " 2>&1 > /dev/null" unless($self->debug);
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);
print("$cmd\n") if($self->debug);
my $worker_temp_directory = $self->worker_temp_directory;
unless(system("cd $worker_temp_directory; $cmd") == 0) {
print("$cmd\n");
open ERRFILE,"$errfile" or die "couldnt open logfile $errfile: $!\n";
while () {
if ($_ =~ /NNI/) {
# Do jack-knife treebest starting by the sequence with more Ns
my $jackknife_value = $self->{'jackknife'} if (defined($self->{'jackknife'}));
$jackknife_value++;
my $output_id = sprintf("{'protein_tree_id'=>%d, 'clusterset_id'=>1, 'jackknife'=>$jackknife_value}", $self->{'protein_tree'}->node_id);
$self->input_job->input_id($output_id);
$self->dataflow_output_id($output_id, 2);
$self->input_job->update_status('FAILED');
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
warn("NJTREE_PHYML : NNI error, resubmitting as jackknife NJTREE_PHYML");
return undef;
}
}
$self->check_job_fail_options;
throw("error running njtree phyml, $!\n");
}
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
} elsif (0 == $self->{'bootstrap'}) {
# first part
# ./njtree phyml -nS -f species_tree.nh -p 0.01 -o $BASENAME.cons.nh $BASENAME.nucl.mfa
$cmd = $njtree_phyml_executable;
$cmd .= " phyml -nS";
if (defined($self->{'species_tree_file'})) {
$cmd .= " -f ". $self->{'species_tree_file'};
}
$cmd .= " ". $self->{'input_aln'};
$cmd .= " -p 0.01 ";
$self->{'intermediate_newick_file'} = $self->{'input_aln'} . "_intermediate_njtree_phyml_tree.txt ";
$cmd .= " -o " . $self->{'intermediate_newick_file'};
$cmd .= " 2>&1 > /dev/null" unless($self->debug);
print("$cmd\n") if($self->debug);
my $worker_temp_directory = $self->worker_temp_directory;
unless(system("cd $worker_temp_directory; $cmd") == 0) {
print("$cmd\n");
$self->check_job_fail_options;
throw("error running njtree phyml noboot (step 1 of 2), $!\n");
}
# second part
# nice -n 19 ./njtree sdi -s species_tree.nh $BASENAME.cons.nh > $BASENAME.cons.nhx
$cmd = $njtree_phyml_executable;
$cmd .= " sdi ";
if (defined($self->{'species_tree_file'})) {
$cmd .= " -s ". $self->{'species_tree_file'};
}
$cmd .= " ". $self->{'intermediate_newick_file'};
$cmd .= " 1> " . $self->{'newick_file'};
$cmd .= " 2> /dev/null" unless($self->debug);
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);
print("$cmd\n") if($self->debug);
my $worker_temp_directory = $self->worker_temp_directory;
unless(system("cd $worker_temp_directory; $cmd") == 0) {
print("$cmd\n");
$self->check_job_fail_options;
throw("error running njtree phyml noboot (step 2 of 2), $!\n");
}
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
} else {
throw("NJTREE PHYML -- wrong bootstrap option");
}
#parse the tree into the datastucture
$self->parse_newick_into_proteintree;
my $runtime = time()*1000-$starttime;
$self->{'protein_tree'}->store_tag('NJTREE_PHYML_runtime_msec', $runtime);
}
sub check_job_fail_options
{
my $self = shift;
if ($self->input_job->retry_count == 1) {
if ($self->{'protein_tree'}->get_tagvalue('gene_count') > 200 && !defined($self->worker->{HIGHMEM})) {
$self->input_job->adaptor->reset_highmem_job_by_dbID($self->input_job->dbID);
$self->DESTROY;
throw("NJTREE PHYML job too big: try something else and FAIL it");
}
}
# This will go to QuickTreeBreak
if($self->input_job->retry_count >= 2 && !defined($self->{'jackknife'})) {
$self->dataflow_output_id($self->input_id, 3);
$self->input_job->update_status('FAILED');
$self->DESTROY;
throw("NJTREE PHYML job failed >=3 times: try something else and FAIL it");
}
}
########################################################
#
# ProteinTree input/output section
#
########################################################
sub dumpTreeMultipleAlignmentToWorkdir
{
my $self = shift;
my $tree = shift;
my $leafcount = scalar(@{$tree->get_all_leaves});
if($leafcount<3) {
printf(STDERR "tree cluster %d has <3 proteins - can not build a tree\n",
$tree->node_id);
return undef;
}
$self->{'file_root'} =
$self->worker_temp_directory. "proteintree_". $tree->node_id;
$self->{'file_root'} =~ s/\/\//\//g; # converts any // in path to /
my $aln_file = $self->{'file_root'} . ".aln";
return $aln_file if(-e $aln_file);
if($self->debug) {
printf("dumpTreeMultipleAlignmentToWorkdir : %d members\n", $leafcount);
print("aln_file = '$aln_file'\n");
}
open(OUTSEQ, ">$aln_file")
or $self->throw("Error opening $aln_file for write");
# Using append_taxon_id will give nice seqnames_taxonids needed for
# njtree species_tree matching
my %sa_params = ($self->{use_genomedb_id}) ? ('-APPEND_GENOMEDB_ID', 1) :
('-APPEND_TAXON_ID', 1);
my $sa = $tree->get_SimpleAlign
(
-id_type => 'MEMBER',
-cdna=>$self->{'cdna'},
-stop2x => 1,
%sa_params
);
$sa->set_displayname_flat(1);
if (defined($self->{'jackknife'})) {
# my $coverage_hash;
my $empty_hash;
foreach my $seq ($sa->each_seq) {
my $sequence = $seq->seq;
$sequence =~ s/\-//g;
my $full_length = length($sequence);
$sequence =~ s/N//g;
my $covered_length = length($sequence);
my $coverage_proportion = $covered_length/$full_length;
my $empty_length = $full_length - $covered_length;
# $coverage_hash->{$coverage_proportion} = $seq->display_id if ($coverage_proportion < 1);
$empty_hash->{$empty_length} = $seq->display_id if ($empty_length > 0);
}
my @lowest = sort {$b<=>$a} keys %$empty_hash;
my $i = 0;
while ($i < $self->{jackknife}) {
$sa->remove_seq($sa->each_seq_with_id($empty_hash->{$lowest[$i]}));
$sa = $sa->remove_gaps(undef,1);
$i++;
}
}
my $alignIO = Bio::AlignIO->newFh
(
-fh => \*OUTSEQ,
-format => "fasta"
);
print $alignIO $sa;
close OUTSEQ;
$self->{'input_aln'} = $aln_file;
return $aln_file;
}
sub store_proteintree
{
my $self = shift;
return unless($self->{'protein_tree'});
printf("PHYML::store_proteintree\n") if($self->debug);
my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
$treeDBA->sync_tree_leftright_index($self->{'protein_tree'});
$treeDBA->store($self->{'protein_tree'});
$treeDBA->delete_nodes_not_in_tree($self->{'protein_tree'});
if($self->debug >1) {
print("done storing - now print\n");
$self->{'protein_tree'}->print_tree;
}
$self->{'protein_tree'}->store_tag('PHYML_alignment', 'njtree_phyml');
$self->{'protein_tree'}->store_tag('reconciliation_method', 'njtree_best');
$self->store_tags($self->{'protein_tree'});
$self->_store_tree_tags();
return undef;
}
sub store_tags
{
my $self = shift;
my $node = shift;
if(defined($self->{'jackknife'})) {
my $leaf_count = $self->{protein_tree}->num_leaves;
$self->{protein_tree}->adaptor->delete_tag($self->{protein_tree}->node_id,'gene_count');
$self->{protein_tree}->store_tag('gene_count', $leaf_count);
}
if($node->get_tagvalue("Duplication") eq '1') {
if($self->debug) { printf("store duplication : "); $node->print_node; }
$node->store_tag('Duplication', 1);
} else {
$node->store_tag('Duplication', 0);
}
if (defined($node->get_tagvalue("B"))) {
my $bootstrap_value = $node->get_tagvalue("B");
if (defined($bootstrap_value) && $bootstrap_value ne '') {
if ($self->debug) {
printf("store bootstrap : $bootstrap_value "); $node->print_node;
}
$node->store_tag('Bootstrap', $bootstrap_value);
}
}
if (defined($node->get_tagvalue("DD"))) {
my $dubious_dup = $node->get_tagvalue("DD");
if (defined($dubious_dup) && $dubious_dup ne '') {
if ($self->debug) {
printf("store dubious_duplication : $dubious_dup "); $node->print_node;
}
$node->store_tag('dubious_duplication', $dubious_dup);
}
}
if (defined($node->get_tagvalue("E"))) {
my $n_lost = $node->get_tagvalue("E");
$n_lost =~ s/.{2}//; # get rid of the initial $-
my @lost_taxa = split('-',$n_lost);
my %lost_taxa;
foreach my $taxon (@lost_taxa) {
$lost_taxa{$taxon} = 1;
}
foreach my $taxon (keys %lost_taxa) {
if ($self->debug) {
printf("store lost_taxon_id : $taxon "); $node->print_node;
}
$node->store_tag('lost_taxon_id', $taxon);
}
}
if (defined($node->get_tagvalue("SISi"))) {
my $sis_score = $node->get_tagvalue("SISi");
if (defined($sis_score) && $sis_score ne '') {
if ($self->debug) {
printf("store SISi : $sis_score "); $node->print_node;
}
$node->store_tag('SISi', $sis_score);
}
}
if (defined($node->get_tagvalue("SISu"))) {
my $sis_score = $node->get_tagvalue("SISu");
if (defined($sis_score) && $sis_score ne '') {
if ($self->debug) {
printf("store SISu : $sis_score "); $node->print_node;
}
$node->store_tag('SISu', $sis_score);
}
}
if (defined($node->get_tagvalue("SIS"))) {
my $sis_score = $node->get_tagvalue("SIS");
if (defined($sis_score) && $sis_score ne '') {
if ($self->debug) {
printf("store species_intersection_score : $sis_score "); $node->print_node;
}
$node->store_tag('species_intersection_score', $sis_score);
}
}
if (defined($node->get_tagvalue("SIS1"))) {
my $sis_score = $node->get_tagvalue("SIS1");
if (defined($sis_score) && $sis_score ne '') {
if ($self->debug) {
printf("store SIS1 : $sis_score "); $node->print_node;
}
$node->store_tag('SIS1', $sis_score);
}
}
if (defined($node->get_tagvalue("SIS2"))) {
my $sis_score = $node->get_tagvalue("SIS2");
if (defined($sis_score) && $sis_score ne '') {
if ($self->debug) {
printf("store SIS2 : $sis_score "); $node->print_node;
}
$node->store_tag('SIS2', $sis_score);
}
}
# if (defined($node->get_tagvalue("SIS3"))) {
# my $sis_score = $node->get_tagvalue("SIS3");
# if (defined($sis_score) && $sis_score ne '') {
# if ($self->debug) {
# printf("store SIS3 : $sis_score "); $node->print_node;
# }
# $node->store_tag('SIS3', $sis_score);
# }
# }
foreach my $child (@{$node->children}) {
$self->store_tags($child);
}
return undef;
}
sub parse_newick_into_proteintree
{
my $self = shift;
my $newick_file = $self->{'newick_file'};
my $tree = $self->{'protein_tree'};
#cleanup old tree structure-
# flatten and reduce to only AlignedMember leaves
$tree->flatten_tree;
$tree->print_tree(20) if($self->debug);
foreach my $node (@{$tree->get_all_leaves}) {
next if($node->isa('Bio::EnsEMBL::Compara::AlignedMember'));
$node->disavow_parent;
}
#parse newick into a new tree object structure
my $newick = '';
print("load from file $newick_file\n") if($self->debug);
open (FH, $newick_file) or throw("Couldnt open newick file [$newick_file]");
while() { $newick .= $_; }
close(FH);
my $newtree =
Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick);
$newtree->print_tree(20) if($self->debug > 1);
# get rid of the taxon_id needed by njtree -- name tag
foreach my $leaf (@{$newtree->get_all_leaves}) {
my $njtree_phyml_name = $leaf->get_tagvalue('name');
$njtree_phyml_name =~ /(\d+)\_\d+/;
my $member_name = $1;
$leaf->add_tag('name', $member_name);
}
# Leaves of newick tree are named with member_id of members from
# input tree move members (leaves) of input tree into newick tree to
# mirror the 'member_id' nodes
foreach my $member (@{$tree->get_all_leaves}) {
my $tmpnode = $newtree->find_node_by_name($member->member_id);
if($tmpnode) {
$tmpnode->add_child($member, 0.0);
$tmpnode->minimize_node; #tmpnode is now redundant so it is removed
} else {
print("unable to find node in newick for member");
$member->print_member;
}
}
# Merge the trees so that the children of the newick tree are now
# attached to the input tree's root node
$tree->merge_children($newtree);
# Newick tree is now empty so release it
$newtree->release_tree;
$tree->print_tree if($self->debug);
# check here on the leaf to test if they all are AlignedMembers as
# minimize_tree/minimize_node might not work properly
foreach my $leaf (@{$self->{'protein_tree'}->get_all_leaves}) {
unless($leaf->isa('Bio::EnsEMBL::Compara::AlignedMember')) {
throw("Phyml tree does not have all leaves as AlignedMember\n");
}
}
return undef;
}
sub _store_tree_tags {
my $self = shift;
my $tree = $self->{'protein_tree'};
my $pta = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
print "Storing Tree tags...\n";
$tree->_load_tags();
my @leaves = @{$tree->get_all_leaves};
my @nodes = @{$tree->get_all_nodes};
# Node is a root node (full tree).
my $node_is_root = 0;
my @roots = @{$pta->fetch_all_roots()};
foreach my $root (@roots) {
$node_is_root = 1 if ($root->node_id == $tree->parent->node_id);
}
$tree->store_tag("node_is_root",$node_is_root);
# Node is leaf node.
my $node_is_leaf = 0;
$node_is_leaf = 1 if ($tree->is_leaf);
$tree->store_tag("node_is_leaf",$node_is_leaf);
# Tree number of leaves.
my $tree_num_leaves = scalar(@leaves);
$tree->store_tag("tree_num_leaves",$tree_num_leaves);
# Tree number of human peptides contained.
my $num_hum_peps = 0;
foreach my $leaf (@leaves) {
$num_hum_peps++ if ($leaf->taxon_id == 9606);
}
$tree->store_tag("tree_num_human_peps",$num_hum_peps);
# Tree max root-to-tip distance.
my $tree_max_length = $tree->max_distance;
$tree->store_tag("tree_max_length",$tree_max_length);
# Tree max single branch length.
my $tree_max_branch = 0;
foreach my $node (@nodes) {
my $dist = $node->distance_to_parent;
$tree_max_branch = $dist if ($dist > $tree_max_branch);
}
$tree->store_tag("tree_max_branch",$tree_max_branch);
# Tree number of duplications and speciations.
my $tree_num_leaves = scalar(@{$tree->get_all_leaves});
my $num_dups = 0;
my $num_specs = 0;
foreach my $node (@{$tree->get_all_nodes}) {
my $dup = $node->get_tagvalue("Duplication");
if ($dup ne '' && $dup > 0) {
$num_dups++;
} else {
$num_specs++;
}
}
$tree->store_tag("tree_num_dup_nodes",$num_dups);
$tree->store_tag("tree_num_spec_nodes",$num_specs);
print "Done storing stuff!\n" if ($self->debug);
}
1;