Bio::EnsEMBL::Compara::RunnableDB QuickTreeBreak
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Compara::RunnableDB::QuickTreeBreak
Package variables
No package variables defined.
Included modules
Bio::AlignIO
Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
Bio::EnsEMBL::Compara::Graph::NewickParser
Bio::EnsEMBL::Compara::Member
Bio::EnsEMBL::DBSQL::DBAdaptor
Bio::EnsEMBL::Hive
Bio::SimpleAlign
File::Basename
Getopt::Long
IO::File
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Bio::EnsEMBL::Hive::Process
Synopsis
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $quicktreebreak = Bio::EnsEMBL::Compara::RunnableDB::QuickTreeBreak->new
(
-db => $db,
-input_id => $input_id,
-analysis => $analysis
);
$quicktreebreak->fetch_input(); #reads from DB
$quicktreebreak->run();
$quicktreebreak->output();
$quicktreebreak->write_output(); #writes to DB
Description
This Analysis/RunnableDB is designed to take ProteinTree as input.
This must already have a multiple alignment run on it. It uses that
alignment as input into the QuickTree program which then generates a
simple phylogenetic tree to be broken down into 2 pieces.
Google QuickTree to get the latest tar.gz from the Sanger.
Google sreformat to get the sequence reformatter that switches from fasta to stockholm.
input_id/parameters format eg: "{'protein_tree_id'=>1234,'clusterset_id'=>1}"
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
Methods
DESTROY
No description
Code
check_job_fail_options
No description
Code
delete_old_orthotree_tags
No description
Code
delete_original_cluster
No description
Code
dumpTreeMultipleAlignmentToWorkdir
No description
Code
fetch_inputDescriptionCode
generate_subtrees
No description
Code
get_params
No description
Code
print_params
No description
Code
runDescriptionCode
run_quicktreebreak
No description
Code
store_clusters
No description
Code
store_proteintrees
No description
Code
write_outputDescriptionCode
Methods description
fetch_inputcode    nextTop
    Title   :   fetch_input
Usage : $self->fetch_input
Function: Fetches input data for repeatmasker from the database
Returns : none
Args : none
runcodeprevnextTop
    Title   :   run
Usage : $self->run
Function: runs NJTREE PHYML
Returns : none
Args : none
write_outputcodeprevnextTop
    Title   :   write_output
Usage : $self->write_output
Function: stores proteintree
Returns : none
Args : none
Methods code
DESTROYdescriptionprevnextTop
sub DESTROY {
  my $self = shift;

  if($self->{'protein_tree'}) {
    printf("QuickTreeBreak::DESTROY releasing tree\n") if($self->debug);

    $self->{'protein_tree'}->release_tree;
    $self->{'protein_tree'} = undef;

    $self->{'max_subtree'}->release_tree;
    $self->{'new_subtree'}->release_tree;
    $self->{'remaining_subtree'}->release_tree;

    $self->{'max_subtree'} = undef;
    $self->{'new_subtree'} = undef;
    $self->{'remaining_subtree'} = undef;
  }

  $self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
}


##########################################
#
# internal methods
#
##########################################
}
check_job_fail_optionsdescriptionprevnextTop
sub check_job_fail_options {
  my $self = shift;

  if ($self->input_job->retry_count > 8) {
    # $self->input_job->adaptor->reset_highmem_job_by_dbID($self->input_job->dbID);
$self->input_job->update_status('FAILED'); $self->DESTROY; throw("QuickTree job failed"); } } ########################################################
#
# ProteinTree input/output section
#
########################################################
}
delete_old_orthotree_tagsdescriptionprevnextTop
sub delete_old_orthotree_tags {
  my $self = shift;

  print "deleting old orthotree tags\n" if ($self->debug);
  my @node_ids;
  my $left_index  = $self->{'protein_tree'}->left_index;
  my $right_index = $self->{'protein_tree'}->right_index;
  my $tree_root_node_id = $self->{'protein_tree'}->node_id;
  # Include the root_id as well as the rest of the nodes within the tree
push @node_ids, $tree_root_node_id; my $sql = "select ptn.node_id from protein_tree_node ptn where ptn.left_index>$left_index and ptn.right_index<$right_index"; my $sth = $self->dbc->prepare($sql); $sth->execute; while (my $aref = $sth->fetchrow_arrayref) { my ($node_id) = @$aref; push @node_ids, $node_id; } my @list_ids; foreach my $id (@node_ids) { push @list_ids, $id; if (scalar @list_ids == 2000) { my $sql = "delete from protein_tree_tag where node_id in (".join(",",@list_ids).") and tag in ('duplication_confidence_score','taxon_id','taxon_name','OrthoTree_runtime_msec','OrthoTree_types_hashstr')"; my $sth = $self->dbc->prepare($sql); $sth->execute; $sth->finish; @list_ids = (); } } if (scalar @list_ids) { my $sql = "delete from protein_tree_tag where node_id in (".join(",",@list_ids).") and tag in ('duplication_confidence_score','taxon_id','taxon_name','OrthoTree_runtime_msec','OrthoTree_types_hashstr')"; my $sth = $self->dbc->prepare($sql); $sth->execute; $sth->finish; @list_ids = (); } return undef;
}
delete_original_clusterdescriptionprevnextTop
sub delete_original_cluster {
  my $self = shift;

  my $delete_time = time();
  my $original_cluster = $self->{'original_cluster'}->node_id;
 #   $original_cluster->store_tag('cluster_had_to_be_broken_down',1);
$self->delete_old_orthotree_tags; my $tree_node_id = $original_cluster; my $sql1 = "delete h.*, hm.* from homology h, homology_member hm where h.homology_id=hm.homology_id and h.tree_node_id=$tree_node_id"; my $sth1 = $self->dbc->prepare($sql1); $sth1->execute; $sth1->finish; $self->{original_cluster}->adaptor->delete_node_and_under($self->{original_cluster}); printf("%1.3f secs to delete old cluster $original_cluster\n", time()-$delete_time); return 1;
}
dumpTreeMultipleAlignmentToWorkdirdescriptionprevnextTop
sub dumpTreeMultipleAlignmentToWorkdir {
  my $self = shift;
  my $tree = shift;

  $self->{original_leafcount} = scalar(@{$tree->get_all_leaves});
  if($self->{original_leafcount}<3) {
    printf(STDERR "tree cluster %d has <3 proteins - can not build a tree\n", 
           $tree->node_id);
    return undef;
  }

  $self->{'file_root'} = 
    $self->worker_temp_directory. "proteintree_". $tree->node_id;
  $self->{'file_root'} =~ s/\/\//\//g;  # converts any // in path to /
my $aln_file = $self->{'file_root'} . ".aln"; return $aln_file if(-e $aln_file); if($self->debug) { printf("dumpTreeMultipleAlignmentToWorkdir : %d members\n", $self->{original_leafcount}); print("aln_file = '$aln_file'\n"); } open(OUTSEQ, ">$aln_file") or $self->throw("Error opening $aln_file for write"); # Using append_taxon_id will give nice seqnames_taxonids needed for
# njtree species_tree matching
my %sa_params = ($self->{use_genomedb_id}) ? ('-APPEND_GENOMEDB_ID', 1) : ('-APPEND_TAXON_ID', 1); my $sa = $tree->get_SimpleAlign ( -id_type => 'MEMBER', %sa_params ); $sa->set_displayname_flat(1); my $alignIO = Bio::AlignIO->newFh ( -fh =>\* OUTSEQ, -format => "fasta" ); print $alignIO $sa; close OUTSEQ; print STDERR "Using sreformat to change to stockholm format\n" if ($self->debug); my $stk_file = $self->{'file_root'} . ".stk"; my $cmd = "/usr/local/ensembl/bin/sreformat stockholm $aln_file > $stk_file"; unless( system("$cmd") == 0) { print("$cmd\n"); $self->check_job_fail_options; throw("error running sreformat, $!\n"); } $self->{'input_aln'} = $stk_file; return $stk_file;
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_;

  $self->{'max_gene_count'} = 1500; # Can be overriden later
$self->throw("No input_id") unless defined($self->input_id); #create a Compara::DBAdaptor which shares the same DBI handle
#with the pipeline DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new ( -DBCONN=>$self->db->dbc ); $self->get_params($self->parameters); $self->get_params($self->input_id); $self->print_params if($self->debug); $self->check_job_fail_options; $self->check_if_exit_cleanly; unless($self->{'protein_tree'}) { throw("undefined ProteinTree as input\n"); } return 1;
}
generate_subtreesdescriptionprevnextTop
sub generate_subtrees {
  my $self = shift;
  my $newick =  $self->{'quicktree_newick_string'};
  my $tree = $self->{'protein_tree'};

  #cleanup old tree structure- 
# flatten and reduce to only AlignedMember leaves
$tree->flatten_tree; $tree->print_tree(20) if($self->debug); foreach my $node (@{$tree->get_all_leaves}) { next if($node->isa('Bio::EnsEMBL::Compara::AlignedMember')); $node->disavow_parent; } #parse newick into a new tree object structure
my $newtree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick); $newtree->print_tree(20) if($self->debug > 1); # get rid of the taxon_id needed by njtree -- name tag
foreach my $leaf (@{$newtree->get_all_leaves}) { my $quicktreebreak_name = $leaf->get_tagvalue('name'); $quicktreebreak_name =~ /(\d+)\_\d+/; my $member_name = $1; $leaf->add_tag('name', $member_name); bless $leaf, "Bio::EnsEMBL::Compara::AlignedMember"; $leaf->member_id($member_name); } # Break the tree by immediate children recursively
my @children; my $keep_braking = 1; $self->{max_subtree} = $newtree; while ($keep_braking) { @children = @{$self->{max_subtree}->children}; my $max_num_leaves = 0; foreach my $child (@children) { my $num_leaves = scalar(@{$child->get_all_leaves}); if ($num_leaves > $max_num_leaves) { $max_num_leaves = $num_leaves; $self->{max_subtree} = $child; } } # Broke down to half, happy with it
my $proportion = ($max_num_leaves*100/$self->{original_leafcount});
print STDERR "QuickTreeBreak iterate -- $max_num_leaves ($proportion)\n" if ($self->debug); if ($proportion <= 50) { $keep_braking = 0; } } # Create a copy of what is not max_subtree
$self->{remaining_subtree} = $self->{protein_tree}->copy; $self->{new_subtree} = $self->{protein_tree}->copy; $self->{new_subtree}->flatten_tree; $self->{remaining_subtree}->flatten_tree; my $subtree_leaves; foreach my $leaf (@{$self->{max_subtree}->get_all_leaves}) { $subtree_leaves->{$leaf->member_id} = 1; } foreach my $leaf (@{$self->{new_subtree}->get_all_leaves}) { unless (defined $subtree_leaves->{$leaf->member_id}) { print $leaf->name," leaf disavowing parent\n" if $self->debug; $leaf->disavow_parent; } } foreach my $leaf (@{$self->{remaining_subtree}->get_all_leaves}) { if (defined $subtree_leaves->{$leaf->member_id}) { print $leaf->name," leaf disavowing parent\n" if $self->debug; $leaf->disavow_parent; } } $self->{remaining_subtree} = $self->{remaining_subtree}->minimize_tree; $self->{new_subtree} = $self->{new_subtree}->minimize_tree; # Some checks
$self->throw("QuickTreeBreak: Failed to generate subtrees: $!\n") unless(defined($self->{'new_subtree'}) && defined($self->{'remaining_subtree'})); my $final_original_num = scalar @{$self->{protein_tree}->get_all_leaves}; my $final_max_num = scalar @{$self->{new_subtree}->get_all_leaves}; my $final_remaining_num = scalar @{$self->{remaining_subtree}->get_all_leaves}; if(($final_max_num + $final_remaining_num) != $final_original_num) { $self->throw("QuickTreeBreak: Incorrect sum of leaves [$final_max_num + $final_remaining_num != $final_original_num]: $!\n"); } $self->{'original_cluster'} = $self->{protein_tree}; return undef; } 1;
}
get_paramsdescriptionprevnextTop
sub get_params {
  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n") if($self->debug);

  my $params = eval($param_string);
  return unless($params);

  if($self->debug) {
    foreach my $key (keys %$params) {
      print("  $key : ", $params->{$key}, "\n");
    }
  }

  if(defined($params->{'protein_tree_id'})) {
    $self->{'protein_tree'} =  
         $self->{'comparaDBA'}->get_ProteinTreeAdaptor->
         fetch_node_by_node_id($params->{'protein_tree_id'});
  }

  $self->{'max_gene_count'} = 
    $params->{'max_gene_count'} if(defined($params->{'max_gene_count'}));

  $self->{'clusterset_id'} = 
    $params->{'clusterset_id'} if(defined($params->{'clusterset_id'}));

  foreach my $key (qw[max_gene_count use_genomedb_id]) {
    my $value = $params->{$key};
    $self->{$key} = $value if defined $value;
  }

  return;
}
print_paramsdescriptionprevnextTop
sub print_params {
  my $self = shift;

  print("params:\n");
  print("  tree_id   : ", $self->{'protein_tree'}->node_id,"\n") if($self->{'protein_tree'});
}
rundescriptionprevnextTop
sub run {
  my $self = shift;
  $self->check_if_exit_cleanly;
  $self->run_quicktreebreak;
}
run_quicktreebreakdescriptionprevnextTop
sub run_quicktreebreak {
  my $self = shift;

  my $starttime = time()*1000;

  $self->{'input_aln'} = $self->dumpTreeMultipleAlignmentToWorkdir
    (
     $self->{'protein_tree'}
    );
  return unless($self->{'input_aln'});

  $self->{'newick_file'} = $self->{'input_aln'} . "_quicktreebreak_tree.txt ";

  my $quicktreebreak_executable = $self->analysis->program_file || '';

  unless (-e $quicktreebreak_executable) {
    $quicktreebreak_executable = "/nfs/acari/avilella/src/quicktree/quicktree_1.1/bin/quicktree";
  }

  throw("can't find a quicktree executable to run\n") 
    unless(-e $quicktreebreak_executable);

  my $cmd = $quicktreebreak_executable;
  $cmd .= " -out t -in a";
  $cmd .= " ". $self->{'input_aln'};

  #/nfs/acari/avilella/src/quicktree/quicktree_1.1/bin/quicktree -out t
# -in a /tmp/worker.12270/proteintree_517373.stk
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(1); print("$cmd\n") if($self->debug); open(RUN, "$cmd |") or $self->throw("error running quicktree, $!\n"); my @output = <RUN>; my $exit_status = close(RUN); if (!$exit_status) { $self->throw("error running quicktree, $!\n"); } $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0); foreach my $line (@output) { $line =~ s/\n//; $self->{quicktree_newick_string} .= $line; } #parse the tree into the datastucture
$self->generate_subtrees; my $runtime = time()*1000-$starttime; $self->{'protein_tree'}->store_tag('QuickTreeBreak_runtime_msec', $runtime);
}
store_clustersdescriptionprevnextTop
sub store_clusters {
  my $self = shift;

  my $mlssDBA = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor;
  my $pafDBA = $self->{'comparaDBA'}->get_PeptideAlignFeatureAdaptor;
  my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
  my $starttime = time();

  my $clusterset = $treeDBA->fetch_node_by_node_id($self->{'clusterset_id'});
  throw("no clusterset found: $!\n") unless($clusterset);

  $clusterset->no_autoload_children; # Important so that only the two below are used
$clusterset->add_child($self->{new_subtree}); $clusterset->add_child($self->{remaining_subtree}); my $clusters = $clusterset->children; foreach my $cluster (@{$clusters}) { my $node_id = $treeDBA->store($cluster); #calc residue count total
my $leafcount = scalar(@{$cluster->get_all_leaves}); $cluster->store_tag('gene_count', $leafcount); $cluster->store_tag('original_cluster', $self->{'original_cluster'}->node_id); print STDERR "Stored $node_id with $leafcount leaves\n" if ($self->debug); # Dataflow clusters
my $output_id = sprintf("{'protein_tree_id'=>%d, 'clusterset_id'=>%d}", $node_id, $clusterset->node_id); $self->dataflow_output_id($output_id, 2); print STDERR "Created QuickTreeBreak job for $node_id\n"; }
}
store_proteintreesdescriptionprevnextTop
sub store_proteintrees {
  my $self = shift;

  $self->delete_original_cluster;
  $self->store_clusters;

  if($self->debug >1) {
    print("done storing\n");
  }

  return undef;
}
write_outputdescriptionprevnextTop
sub write_output {
  my $self = shift;

  $self->check_if_exit_cleanly;
  $self->store_proteintrees;
}
General documentation
CONTACTTop
  Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
Contact Javier Herrero on EnsEMBL/Compara: jherrero@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _