Bio::EnsEMBL::Compara::RunnableDB PHYML
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Compara::RunnableDB::PHYML
Package variables
No package variables defined.
Included modules
Bio::AlignIO
Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
Bio::EnsEMBL::Compara::Graph::NewickParser
Bio::EnsEMBL::Compara::Member
Bio::EnsEMBL::DBSQL::DBAdaptor
Bio::EnsEMBL::Hive
Bio::SimpleAlign
File::Basename
Getopt::Long
IO::File
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Bio::EnsEMBL::Hive::Process
Synopsis
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $phyml = Bio::EnsEMBL::Compara::RunnableDB::PHYML->new (
-db => $db,
-input_id => $input_id
-analysis => $analysis );
$phyml->fetch_input(); #reads from DB
$phyml->run();
$phyml->output();
$phyml->write_output(); #writes to DB
Description
This Analysis/RunnableDB is designed to take ProteinTree as input
This must already have a multiple alignment run on it. It uses that alignment
as input into the PHYML program which then generates a phylogenetic tree
input_id/parameters format eg: "{'protein_tree_id'=>1234}"
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
Methods
DESTROY
No description
Code
check_job_fail_options
No description
Code
dumpTreeMultipleAlignmentToWorkdir
No description
Code
fetch_inputDescriptionCode
get_params
No description
Code
parse_newick_into_proteintree
No description
Code
print_params
No description
Code
runDescriptionCode
run_phyml
No description
Code
store_proteintree
No description
Code
write_outputDescriptionCode
Methods description
fetch_inputcode    nextTop
    Title   :   fetch_input
Usage : $self->fetch_input
Function: Fetches input data for repeatmasker from the database
Returns : none
Args : none
runcodeprevnextTop
    Title   :   run
Usage : $self->run
Function: runs PHYML
Returns : none
Args : none
write_outputcodeprevnextTop
    Title   :   write_output
Usage : $self->write_output
Function: parse clustalw output and update family and family_member tables
Returns : none
Args : none
Methods code
DESTROYdescriptionprevnextTop
sub DESTROY {
  my $self = shift;

  if($self->{'protein_tree'}) {
    printf("PHYML::DESTROY  releasing tree\n") if($self->debug);
    $self->{'protein_tree'}->release_tree;
    $self->{'protein_tree'} = undef;
  }

  $self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
}


##########################################
#
# internal methods
#
##########################################
}
check_job_fail_optionsdescriptionprevnextTop
sub check_job_fail_options {
  my $self = shift;

  if($self->input_job->retry_count >= 2) {
    $self->dataflow_output_id($self->input_id, 2);
    $self->input_job->update_status('FAILED');
  
    if($self->{'protein_tree'}) {
      $self->{'protein_tree'}->release_tree;
      $self->{'protein_tree'} = undef;
    }
    throw("PHYML job failed >=3 times: try something else and FAIL it");
  }
}


########################################################
#
# ProteinTree input/output section
#
########################################################
}
dumpTreeMultipleAlignmentToWorkdirdescriptionprevnextTop
sub dumpTreeMultipleAlignmentToWorkdir {
  my $self = shift;
  my $tree = shift;
  
  my $leafcount = scalar(@{$tree->get_all_leaves});  
  if($leafcount<3) {
    printf(STDERR "tree cluster %d has <3 proteins - can not build a tree\n", $tree->node_id);
    return undef;
  }
  
  $self->{'file_root'} = $self->worker_temp_directory. "proteintree_". $tree->node_id;
  $self->{'file_root'} =~ s/\/\//\//g;  # converts any // in path to /
my $clw_file = $self->{'file_root'} . ".aln"; return $clw_file if(-e $clw_file); if($self->debug) { printf("dumpTreeMultipleAlignmentToWorkdir : %d members\n", $leafcount); print("clw_file = '$clw_file'\n"); } open(OUTSEQ, ">$clw_file") or $self->throw("Error opening $clw_file for write"); my $sa = $tree->get_SimpleAlign(-id_type => 'MEMBER', -cdna=>$self->{'cdna'}, -stop2x => 1); my $alignIO = Bio::AlignIO->newFh(-fh =>\* OUTSEQ, -interleaved => 1, -format => "phylip" ); print $alignIO $sa; close OUTSEQ; $self->{'input_aln'} = $clw_file; return $clw_file;
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_;

  $self->{'transition_transversion_ratio'}           = 0.0;
  $self->{'number_of_substitution_rate_categories'}  = 4;
  $self->{'gamma_distribution_parameter'}            = 1;
  $self->{'cdna'}                                    = 1;
  $self->{'max_gene_count'} = 1000000;

  $self->check_job_fail_options;

#  if($self->input_job->retry_count >= 3) {
# $self->dataflow_output_id($self->input_id, 2);
# $self->input_job->update_status('FAILED');
# throw("PHYML job failed >3 times: try something else and FAIL it");
# }
$self->throw("No input_id") unless defined($self->input_id); #create a Compara::DBAdaptor which shares the same DBI handle
#with the pipeline DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc); $self->get_params($self->parameters); $self->get_params($self->input_id); $self->print_params if($self->debug); unless($self->{'protein_tree'}) { throw("undefined ProteinTree as input\n"); } if ($self->{'protein_tree'}->get_tagvalue('gene_count') > $self->{'max_gene_count'}) { $self->dataflow_output_id($self->input_id, 2); $self->input_job->update_status('FAILED'); $self->{'protein_tree'}->release_tree; $self->{'protein_tree'} = undef; throw("PHYML : cluster size over threshold and FAIL it"); } return 1;
}
get_paramsdescriptionprevnextTop
sub get_params {
  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n") if($self->debug);
  
  my $params = eval($param_string);
  return unless($params);

  if($self->debug) {
    foreach my $key (keys %$params) {
      print("  $key : ", $params->{$key}, "\n");
    }
  }
    
  if(defined($params->{'protein_tree_id'})) {
    $self->{'protein_tree'} =  
         $self->{'comparaDBA'}->get_ProteinTreeAdaptor->
         fetch_node_by_node_id($params->{'protein_tree_id'});
  }
  $self->{'cdna'} = $params->{'cdna'} if(defined($params->{'cdna'}));
  $self->{'max_gene_count'} = $params->{'max_gene_count'} if(defined($params->{'max_gene_count'}));
  
  return;
}
parse_newick_into_proteintreedescriptionprevnextTop
sub parse_newick_into_proteintree {
  my $self = shift;
  my $newick_file =  $self->{'newick_file'};
  my $tree = $self->{'protein_tree'};
  
  #cleanup old tree structure- 
# flatten and reduce to only AlignedMember leaves
$tree->flatten_tree; $tree->print_tree if($self->debug); foreach my $node (@{$tree->get_all_leaves}) { next if($node->isa('Bio::EnsEMBL::Compara::AlignedMember')); $node->disavow_parent; } #parse newick into a new tree object structure
my $newick = ''; print("load from file $newick_file\n") if($self->debug); open (FH, $newick_file) or throw("Could not open newick file [$newick_file]"); while(<FH>) { $newick .= $_; } close(FH); my $newtree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick); $newtree->print_tree if($self->debug > 1); #leaves of newick tree are named with member_id of members from input tree
#move members (leaves) of input tree into newick tree to mirror the 'member_id' nodes
foreach my $member (@{$tree->get_all_leaves}) { my $tmpnode = $newtree->find_node_by_name($member->member_id); if($tmpnode) { $tmpnode->add_child($member, 0.0); $tmpnode->minimize_node; #tmpnode is now redundant so it is removed
} else { print("unable to find node in newick for member"); $member->print_member; } } # merge the trees so that the children of the newick tree are now attached to the
# input tree's root node
$tree->merge_children($newtree); #newick tree is now empty so release it
$newtree->release_tree; $tree->print_tree if($self->debug); # check here on the leaf to test if they all are AlignedMembers as
# minimize_tree/minimize_node might not work properly
foreach my $leaf (@{$self->{'protein_tree'}->get_all_leaves}) { unless($leaf->isa('Bio::EnsEMBL::Compara::AlignedMember')) { throw("Phyml tree does not have all leaves as AlignedMember\n"); } } return undef; } 1;
}
print_paramsdescriptionprevnextTop
sub print_params {
  my $self = shift;

  print("params:\n");
  print("  tree_id   : ", $self->{'protein_tree'}->node_id,"\n") if($self->{'protein_tree'});
  print("  cdna      : ", $self->{'cdna'},"\n");
}
rundescriptionprevnextTop
sub run {
  my $self = shift;
  $self->run_phyml;
}
run_phymldescriptionprevnextTop
sub run_phyml {
  my $self = shift;

  my $starttime = time()*1000;

  $self->{'input_aln'} = $self->dumpTreeMultipleAlignmentToWorkdir($self->{'protein_tree'});
  return unless($self->{'input_aln'});
  
  $self->{'newick_file'} = $self->{'input_aln'} . "_phyml_tree.txt ";

  my $phyml_executable = $self->analysis->program_file;
  unless (-e $phyml_executable) {
    $phyml_executable = "/nfs/acari/jessica/bin/alpha-dec-osf4.0/phyml";
    if (-e "/proc/version") {
      # it is a linux machine
$phyml_executable = "/nfs/acari/jessica/bin/i386/phyml"; } } unless (-e $phyml_executable) { $phyml_executable = "/usr/local/ensembl/bin/phyml"; } throw("can't find a phyml executable to run\n") unless(-e $phyml_executable); #./phyml seqs2 1 i 1 0 JTT 0.0 4 1.0 BIONJ n n
my $cmd = $phyml_executable; $cmd .= " ". $self->{'input_aln'}; if($self->{'cdna'}) { $cmd .= " 0 i 1 0 HKY e 0.0 4 e BIONJ y y"; } else { $cmd .= " 1 i 1 0 WAG 0.0 4 e BIONJ y y "; #AA, interleaved, 1 dataset, no bootstrap
} $cmd .= " 2>&1 > /dev/null" unless($self->debug); $self->{'comparaDBA'}->dbc->disconnect_when_inactive(1); print("$cmd\n") if($self->debug); unless(system($cmd) == 0) { print("$cmd\n"); $self->check_job_fail_options; throw("error running phyml, $!\n"); } $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0); #parse the tree into the datastucture
$self->parse_newick_into_proteintree; my $runtime = time()*1000-$starttime; $self->{'protein_tree'}->store_tag('PHYML_runtime_msec', $runtime);
}
store_proteintreedescriptionprevnextTop
sub store_proteintree {
  my $self = shift;

  return unless($self->{'protein_tree'});

  printf("PHYML::store_proteintree\n") if($self->debug);
  my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
  $treeDBA->sync_tree_leftright_index($self->{'protein_tree'});
  $treeDBA->store($self->{'protein_tree'});
  $treeDBA->delete_nodes_not_in_tree($self->{'protein_tree'});
  
  if($self->debug >1) {
    print("done storing - now print\n");
    $self->{'protein_tree'}->print_tree;
  }
  
  if($self->{'cdna'}) {
    $self->{'protein_tree'}->store_tag('PHYML_alignment', 'cdna');
  } else {
    $self->{'protein_tree'}->store_tag('PHYML_alignment', 'aa');
  }
  $self->{'protein_tree'}->store_tag('tree_method', 'PHYML');
  return undef;
}
write_outputdescriptionprevnextTop
sub write_output {
  my $self = shift;

  $self->store_proteintree;
}
General documentation
CONTACTTop
  Contact Jessica Severin on module implemetation/design detail: jessica@ebi.ac.uk
Contact Abel Ureta-Vidal on EnsEMBL/Compara: abel@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _