Bio::EnsEMBL::Compara::RunnableDB
PHYML
Toolbar
Summary
Bio::EnsEMBL::Compara::RunnableDB::PHYML
Package variables
No package variables defined.
Included modules
File::Basename
Getopt::Long
IO::File
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Synopsis
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $phyml = Bio::EnsEMBL::Compara::RunnableDB::PHYML->new (
-db => $db,
-input_id => $input_id
-analysis => $analysis );
$phyml->fetch_input(); #reads from DB
$phyml->run();
$phyml->output();
$phyml->write_output(); #writes to DB
Description
This Analysis/RunnableDB is designed to take ProteinTree as input
This must already have a multiple alignment run on it. It uses that alignment
as input into the PHYML program which then generates a phylogenetic tree
input_id/parameters format eg: "{'protein_tree_id'=>1234}"
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
Methods
Methods description
Title : fetch_input Usage : $self->fetch_input Function: Fetches input data for repeatmasker from the database Returns : none Args : none |
Title : run Usage : $self->run Function: runs PHYML Returns : none Args : none |
Title : write_output Usage : $self->write_output Function: parse clustalw output and update family and family_member tables Returns : none Args : none |
Methods code
sub DESTROY
{ my $self = shift;
if($self->{'protein_tree'}) {
printf("PHYML::DESTROY releasing tree\n") if($self->debug);
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
}
$self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
}
} |
sub check_job_fail_options
{
my $self = shift;
if($self->input_job->retry_count >= 2) {
$self->dataflow_output_id($self->input_id, 2);
$self->input_job->update_status('FAILED');
if($self->{'protein_tree'}) {
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
}
throw("PHYML job failed >=3 times: try something else and FAIL it");
}
}
} |
dumpTreeMultipleAlignmentToWorkdir | description | prev | next | Top |
sub dumpTreeMultipleAlignmentToWorkdir
{
my $self = shift;
my $tree = shift;
my $leafcount = scalar(@{$tree->get_all_leaves});
if($leafcount<3) {
printf(STDERR "tree cluster %d has <3 proteins - can not build a tree\n", $tree->node_id);
return undef;
}
$self->{'file_root'} = $self->worker_temp_directory. "proteintree_". $tree->node_id;
$self->{'file_root'} =~ s/\/\//\//g;
my $clw_file = $self->{'file_root'} . ".aln";
return $clw_file if(-e $clw_file);
if($self->debug) {
printf("dumpTreeMultipleAlignmentToWorkdir : %d members\n", $leafcount);
print("clw_file = '$clw_file'\n");
}
open(OUTSEQ, ">$clw_file")
or $self->throw("Error opening $clw_file for write");
my $sa = $tree->get_SimpleAlign(-id_type => 'MEMBER', -cdna=>$self->{'cdna'}, -stop2x => 1);
my $alignIO = Bio::AlignIO->newFh(-fh =>\* OUTSEQ,
-interleaved => 1,
-format => "phylip"
);
print $alignIO $sa;
close OUTSEQ;
$self->{'input_aln'} = $clw_file;
return $clw_file; } |
sub fetch_input
{ my( $self) = @_;
$self->{'transition_transversion_ratio'} = 0.0;
$self->{'number_of_substitution_rate_categories'} = 4;
$self->{'gamma_distribution_parameter'} = 1;
$self->{'cdna'} = 1;
$self->{'max_gene_count'} = 1000000;
$self->check_job_fail_options;
$self->throw("No input_id") unless defined($self->input_id);
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
$self->get_params($self->parameters);
$self->get_params($self->input_id);
$self->print_params if($self->debug);
unless($self->{'protein_tree'}) {
throw("undefined ProteinTree as input\n");
}
if ($self->{'protein_tree'}->get_tagvalue('gene_count') > $self->{'max_gene_count'}) {
$self->dataflow_output_id($self->input_id, 2);
$self->input_job->update_status('FAILED');
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
throw("PHYML : cluster size over threshold and FAIL it");
}
return 1; } |
sub get_params
{ my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n") if($self->debug);
my $params = eval($param_string);
return unless($params);
if($self->debug) {
foreach my $key (keys %$params) {
print(" $key : ", $params->{$key}, "\n");
}
}
if(defined($params->{'protein_tree_id'})) {
$self->{'protein_tree'} =
$self->{'comparaDBA'}->get_ProteinTreeAdaptor->
fetch_node_by_node_id($params->{'protein_tree_id'});
}
$self->{'cdna'} = $params->{'cdna'} if(defined($params->{'cdna'}));
$self->{'max_gene_count'} = $params->{'max_gene_count'} if(defined($params->{'max_gene_count'}));
return; } |
sub parse_newick_into_proteintree
{
my $self = shift;
my $newick_file = $self->{'newick_file'};
my $tree = $self->{'protein_tree'};
$tree->flatten_tree;
$tree->print_tree if($self->debug);
foreach my $node (@{$tree->get_all_leaves}) {
next if($node->isa('Bio::EnsEMBL::Compara::AlignedMember'));
$node->disavow_parent;
}
my $newick = '';
print("load from file $newick_file\n") if($self->debug);
open (FH, $newick_file) or throw("Could not open newick file [$newick_file]");
while(<FH>) { $newick .= $_; }
close(FH);
my $newtree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick);
$newtree->print_tree if($self->debug > 1);
foreach my $member (@{$tree->get_all_leaves}) {
my $tmpnode = $newtree->find_node_by_name($member->member_id);
if($tmpnode) {
$tmpnode->add_child($member, 0.0);
$tmpnode->minimize_node; } else {
print("unable to find node in newick for member");
$member->print_member;
}
}
$tree->merge_children($newtree);
$newtree->release_tree;
$tree->print_tree if($self->debug);
foreach my $leaf (@{$self->{'protein_tree'}->get_all_leaves}) {
unless($leaf->isa('Bio::EnsEMBL::Compara::AlignedMember')) {
throw("Phyml tree does not have all leaves as AlignedMember\n");
}
}
return undef;
}
1; } |
sub print_params
{ my $self = shift;
print("params:\n");
print(" tree_id : ", $self->{'protein_tree'}->node_id,"\n") if($self->{'protein_tree'});
print(" cdna : ", $self->{'cdna'},"\n"); } |
sub run
{
my $self = shift;
$self->run_phyml; } |
sub run_phyml
{
my $self = shift;
my $starttime = time()*1000;
$self->{'input_aln'} = $self->dumpTreeMultipleAlignmentToWorkdir($self->{'protein_tree'});
return unless($self->{'input_aln'});
$self->{'newick_file'} = $self->{'input_aln'} . "_phyml_tree.txt ";
my $phyml_executable = $self->analysis->program_file;
unless (-e $phyml_executable) {
$phyml_executable = "/nfs/acari/jessica/bin/alpha-dec-osf4.0/phyml";
if (-e "/proc/version") {
$phyml_executable = "/nfs/acari/jessica/bin/i386/phyml";
}
}
unless (-e $phyml_executable) {
$phyml_executable = "/usr/local/ensembl/bin/phyml";
}
throw("can't find a phyml executable to run\n") unless(-e $phyml_executable);
my $cmd = $phyml_executable;
$cmd .= " ". $self->{'input_aln'};
if($self->{'cdna'}) {
$cmd .= " 0 i 1 0 HKY e 0.0 4 e BIONJ y y";
} else {
$cmd .= " 1 i 1 0 WAG 0.0 4 e BIONJ y y "; }
$cmd .= " 2>&1 > /dev/null" unless($self->debug);
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);
print("$cmd\n") if($self->debug);
unless(system($cmd) == 0) {
print("$cmd\n");
$self->check_job_fail_options;
throw("error running phyml, $!\n");
}
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
$self->parse_newick_into_proteintree;
my $runtime = time()*1000-$starttime;
$self->{'protein_tree'}->store_tag('PHYML_runtime_msec', $runtime); } |
sub store_proteintree
{
my $self = shift;
return unless($self->{'protein_tree'});
printf("PHYML::store_proteintree\n") if($self->debug);
my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
$treeDBA->sync_tree_leftright_index($self->{'protein_tree'});
$treeDBA->store($self->{'protein_tree'});
$treeDBA->delete_nodes_not_in_tree($self->{'protein_tree'});
if($self->debug >1) {
print("done storing - now print\n");
$self->{'protein_tree'}->print_tree;
}
if($self->{'cdna'}) {
$self->{'protein_tree'}->store_tag('PHYML_alignment', 'cdna');
} else {
$self->{'protein_tree'}->store_tag('PHYML_alignment', 'aa');
}
$self->{'protein_tree'}->store_tag('tree_method', 'PHYML');
return undef; } |
sub write_output
{ my $self = shift;
$self->store_proteintree; } |
General documentation
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _