Raw content of Bio::EnsEMBL::Analysis::Runnable::Ortheus
# Ensembl module for Bio::EnsEMBL::Analysis::Runnable::Ortheus
#
# Copyright (c) 2007 Ensembl
#
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable::Ortheus
=head1 SYNOPSIS
my $runnable = new Bio::EnsEMBL::Analysis::Runnable::Ortheus
(-workdir => $workdir,
-fasta_files => $fasta_files,
-tree_string => $tree_string,
-program => "/path/to/program");
$runnable->run;
my @output = @{$runnable->output};
=head1 DESCRIPTION
Ortheus expects to run the ortheus, a program for reconstructing the ancestral history
of a set of sequences. It is able to infer a tree (or one may be provided) and then
create a tree alignment of the sequences that includes ML reconstructions of the
ancestral sequences. Ortheus is based upon a probabilistic transducer model and handles
the evolution of both substitutions, insertions and deletions.
=head1 CONTACT
Ortheus was developed by Benedict Paten in the group of Ewan Birney at the EBI. You contact
him at bjp (AT) ebi (DOT) ac (DOT) uk.
Questions regarding this module should be addressed to the Ensembl development list: ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::Runnable::Ortheus;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::Utils::Argument;
use Bio::EnsEMBL::Analysis::Config::Compara;
use Bio::EnsEMBL::Compara::DBSQL::ProteinTreeAdaptor;
use Bio::EnsEMBL::Compara::GenomicAlign;
use Bio::EnsEMBL::Compara::GenomicAlignBlock;
use Bio::EnsEMBL::Compara::GenomicAlignTree;
use Bio::EnsEMBL::Compara::Graph::NewickParser;
use Data::Dumper;
use Bio::EnsEMBL::Analysis::Runnable;
our @ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
my $java_exe = "/nfs/acari/bpaten/bin/jre1.6.0/bin/java";
my $uname = `uname`;
$uname =~ s/[\r\n]+//;
my $default_exonerate = "exonerate-1.0.0";
my $default_jar_file = "pecan.0.6.jar";
my $default_java_class = "bp.pecan.Pecan";
my $estimate_tree = "~/pecan/EstimateTree.py";
=head2 new
Arg [1] : -workdir => "/path/to/working/directory"
Arg [2] : -fasta_files => "/path/to/fasta/file"
Arg [3] : -tree_string => "/path/to/tree/file" (optional)
Arg [4] : -parameters => "parameter" (optional)
Function : contruct a new Bio::EnsEMBL::Analysis::Runnable::Mlagan
runnable
Returntype: Bio::EnsEMBL::Analysis::Runnable::Mlagan
Exceptions: none
Example :
=cut
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($workdir, $fasta_files, $tree_string, $species_tree, $species_order, $parameters, $jar_file, $java_class, $exonerate, $options) =
rearrange(['WORKDIR', 'FASTA_FILES', 'TREE_STRING', 'SPECIES_TREE',
'SPECIES_ORDER', 'PARAMETERS', 'JAR_FILE', 'JAVA_CLASS', 'EXONERATE', 'OPTIONS'], @args);
chdir $self->workdir;
$self->fasta_files($fasta_files) if (defined $fasta_files);
if (defined $tree_string) {
$self->tree_string($tree_string)
} elsif ($species_tree and $species_order and @$species_order) {
$self->species_tree($species_tree);
$self->species_order($species_order);
}
$self->parameters($parameters) if (defined $parameters);
$self->options($options) if (defined $options);
# # $self->jar_file($jar_file) if (defined $jar_file);
# # $self->java_class($java_class) if (defined $java_class);
# # unless (defined $self->program) {
# # if (defined($self->analysis) and defined($self->analysis->program)) {
# # $self->program($self->analysis->program);
# # } else {
# # $self->program($java_exe);
# # }
# # }
# # unless (defined $self->jar_file) {
# # $self->jar_file($default_jar_file);
# # }
# # unless (defined $self->java_class) {
# # $self->java_class($default_java_class);
# # }
unless (defined $self->exonerate) {
$self->exonerate($default_exonerate);
}
# #
# # # Try to locate jar file in usual places...
# # if (!-e $self->jar_file) {
# # $default_jar_file = $self->jar_file;
# # if (-e "/usr/local/pecan/$default_jar_file") {
# # $self->jar_file("/usr/local/pecan/$default_jar_file");
# # } elsif (-e "/usr/local/ensembl/pecan/$default_jar_file") {
# # $self->jar_file("/usr/local/ensembl/pecan/$default_jar_file");
# # } elsif (-e "/usr/local/ensembl/bin/$default_jar_file") {
# # $self->jar_file("/usr/local/ensembl/bin/$default_jar_file");
# # } elsif (-e "/usr/local/bin/pecan/$default_jar_file") {
# # $self->jar_file("/usr/local/bin/pecan/$default_jar_file");
# # } elsif (-e $ENV{HOME}."/pecan/$default_jar_file") {
# # $self->jar_file($ENV{HOME}."/pecan/$default_jar_file");
# # } elsif (-e $ENV{HOME}."/Downloads/$default_jar_file") {
# # $self->jar_file($ENV{HOME}."/Downloads/$default_jar_file");
# # } else {
# # throw("Cannot find Pecan JAR file!");
# # }
# # }
return $self;
}
sub fasta_files {
my $self = shift;
$self->{'_fasta_files'} = shift if(@_);
return $self->{'_fasta_files'};
}
sub tree_string {
my $self = shift;
$self->{'_tree_string'} = shift if(@_);
return $self->{'_tree_string'};
}
sub species_tree {
my $self = shift;
$self->{'_species_tree'} = shift if(@_);
return $self->{'_species_tree'};
}
sub species_order {
my $self = shift;
$self->{'_species_order'} = shift if(@_);
return $self->{'_species_order'};
}
sub parameters {
my $self = shift;
$self->{'_parameters'} = shift if(@_);
return $self->{'_parameters'};
}
sub jar_file {
my $self = shift;
$self->{'_jar_file'} = shift if(@_);
return $self->{'_jar_file'};
}
sub java_class {
my $self = shift;
$self->{'_java_class'} = shift if(@_);
return $self->{'_java_class'};
}
sub exonerate {
my $self = shift;
$self->{'_exonerate'} = shift if(@_);
return $self->{'_exonerate'};
}
sub options {
my $self = shift;
$self->{'_options'} = shift if(@_);
return $self->{'_options'};
}
=head2 run_analysis
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Mlagan
Arg [2] : string, program name
Function : create and open a commandline for the program trf
Returntype: none
Exceptions: throws if the program in not executable or if the results
file doesnt exist
Example :
=cut
sub run_analysis {
my ($self, $program) = @_;
$self->run_ortheus;
#move this to compara module instead because it is easier to keep track
#of the 2x composite fragments. And also it removes the need to create
#compara objects in analysis module.
#$self->parse_results;
return 1;
}
sub run_ortheus {
my $self = shift;
chdir $self->workdir;
# throw("Python [$PYTHON] is not executable") unless ($PYTHON && -x $PYTHON);
throw("Ortheus [$ORTHEUS] does not exist") unless ($ORTHEUS && -e $ORTHEUS);
my $command = "$PYTHON $ORTHEUS";
# $command .= " -cp ".$self->jar_file." ".$self->java_class;
$command .= " -l \"#-j 0\" "; #-R\"select[mem>6000] rusage[mem=6000]\" -M6000000 ";
if (@{$self->fasta_files}) {
$command .= " -e";
foreach my $fasta_file (@{$self->fasta_files}) {
$command .= " $fasta_file";
}
}
#add more java memory by using java parameters set in $self->parameters
my $java_params = "";
if ($self->parameters) {
$java_params = $self->parameters;
}
#Add -X to fix -ve indices in array bug suggested by BP
$command .= " -m \"$JAVA " . $java_params . "\" -k \"#-J $EXONERATE -X\"";
if ($self->tree_string) {
$command .= " -d '" . $self->tree_string . "'";
# # TODO Not sure which elsif is right. Need to check this!!
# } elsif ($self->species_tree and $self->species_order and @{$self->species_order} > 2) {
# $command .= " -s $SEMPHY -z '".$self->species_tree."' -A ".join(" ", @{$self->species_order});
} elsif ($self->species_tree and $self->species_order and @{$self->species_order}) {
$command .= " -s $SEMPHY -z '".$self->species_tree."' -A ".join(" ", @{$self->species_order});
} else {
$command .= " -s $SEMPHY";
}
$command .= " -f output.$$.mfa -g output.$$.tree";
#append any additional options to command
if ($self->options) {
$command .= " " . $self->options;
}
print "Running ortheus: " . $command . "\n";
unless (system($command) == 0) {
throw("ortheus execution failed\n");
}
}
=head2 parse_results
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Mlagan
Function : parse the specifed file and produce RepeatFeatures
Returntype: nine
Exceptions: throws if fails to open or close the results file
Example :
=cut
sub parse_results{
my ($self, $run_number) = @_;
## The output file contains one fasta aligned sequence per original sequence + ancestral sequences.
## The first seq. corresponds to the fist leaf of the tree, the second one will be an internal
## node, the third is the second leaf and so on. The fasta header in the result file correspond
## to the names of the leaves for the leaf nodes and to the concatenation of the names of all the
## underlying leaves for internal nodes. For instance:
## ----------------------------------
## >0
## ACTTGG--CCGT
## >0_1
## ACTTGGTTCCGT
## >1
## ACTTGGTTCCGT
## >1_2_3
## ACTTGCTTCCGT
## >2
## CCTTCCTTCCGT
## ----------------------------------
## The sequence of fasta files and leaves in the tree have the same order. If Ortheus is run
## with a given tree, the sequences in the file follow the tree. If Ortheus estimate the tree,
## the tree output file contains also the right order of files:
## ----------------------------------
## ((1:0.0157,0:0.0697):0.0000,2:0.0081);
## /tmp/file3.fa /tmp/file1.fa /tmp/file2.fa
## ----------------------------------
# $self->workdir("/home/jherrero/ensembl/worker.8139/");
my $tree_file = $self->workdir . "/output.$$.tree";
if (-e $tree_file) {
## Ortheus estimated the tree. Overwrite the order of the fasta files and get the tree
open(F, $tree_file) || throw("Could not open tree file <$tree_file>");
my ($newick, $files) = ;
close(F);
$newick =~ s/[\r\n]+$//;
$self->tree_string($newick);
$files =~ s/[\r\n]+$//;
my $all_files = [split(" ", $files)];
$self->fasta_files($all_files);
print STDERR "NEWICK: $newick\nFILES: ", join(" -- ", @$all_files), "\n";
}
# $self->tree_string("((0:0.06969,1:0.015698):1e-05,2:0.008148):1e-05;");
# $self->fasta_files(["/home/jherrero/ensembl/worker.8139/seq1.fa", "/home/jherrero/ensembl/worker.8139/seq2.fa", "/home/jherrero/ensembl/worker.8139/seq3.fa"]);
my (@ordered_leaves) = $self->tree_string =~ /[(,]([^(:)]+)/g;
print STDERR "NEWICK: ", $self->tree_string, "\nLEAVES: ", join(" -- ", @ordered_leaves), "\nFILES: ", join(" -- ", @{$self->fasta_files}), "\n";
my $alignment_file = $self->workdir . "/output.$$.mfa";
# my $alignment_file = $self->workdir . "/output.8139.mfa";
my $this_genomic_align_block = new Bio::EnsEMBL::Compara::GenomicAlignBlock;
open(F, $alignment_file) || throw("Could not open $alignment_file");
my $seq = "";
my $this_genomic_align;
my $tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($self->tree_string);
$tree->print_tree(100);
print $tree->newick_format("simple"), "\n";
print join(" -- ", map {$_->name} @{$tree->get_all_leaves}), "\n";
print "Reading $alignment_file...\n";
my $ids;
foreach my $this_file (@{$self->fasta_files}) {
push(@$ids, qx"head -1 $this_file");
push(@$ids, undef); ## There is an internal node after each leaf..
}
pop(@$ids); ## ...except for the last leaf which is the end of the tree
#print join(" :: ", @$ids), "\n\n";
while () {
next if (/^\s*$/);
chomp;
## FASTA headers correspond to the tree and the order of the leaves in the tree corresponds
## to the order of the files
if (/^>/) {
print "PARSING $_\n";
print $tree->newick_format(), "\n";
my ($name) = $_ =~ /^>(.+)/;
if (defined($this_genomic_align) and $seq) {
$this_genomic_align->aligned_sequence($seq);
$this_genomic_align_block->add_GenomicAlign($this_genomic_align);
}
my $header = shift(@$ids);
$this_genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign;
if (!defined($header)) {
print "INTERNAL NODE $name\n";
my $this_node;
foreach my $this_leaf_name (split("_", $name)) {
if ($this_node) {
my $other_node = $tree->find_node_by_name($this_leaf_name);
if (!$other_node) {
throw("Cannot find node <$this_leaf_name>\n");
}
$this_node = $this_node->find_first_shared_ancestor($other_node);
} else {
print $tree->newick_format();
print "LEAF: $this_leaf_name\n";
$this_node = $tree->find_node_by_name($this_leaf_name);
}
}
print join("_", map {$_->name} @{$this_node->get_all_leaves}), "\n";
## INTERNAL NODE: dnafrag_id and dnafrag_end must be edited somewhere else
$this_genomic_align->dnafrag_id(-1);
$this_genomic_align->dnafrag_start(1);
$this_genomic_align->dnafrag_end(0);
$this_genomic_align->dnafrag_strand(1);
bless($this_node, "Bio::EnsEMBL::Compara::GenomicAlignTree");
$this_node->genomic_align($this_genomic_align);
$this_node->name($name);
} elsif ($header =~ /^>DnaFrag(\d+)\|(.+)\.(\d+)\-(\d+)\:(\-?1)$/) {
print "leaf_name?? $name\n";
my $this_leaf = $tree->find_node_by_name($name);
if (!$this_leaf) {
print $tree->newick_format(), "\n";
die "";
}
print "$this_leaf\n";
# print "****** $name -- $header -- ";
# if ($this_leaf) {
# $this_leaf->print_node();
# } else {
# print "[none]\n";
# }
#information extracted from fasta header
$this_genomic_align->dnafrag_id($1);
$this_genomic_align->dnafrag_start($3);
$this_genomic_align->dnafrag_end($4);
$this_genomic_align->dnafrag_strand($5);
bless($this_leaf, "Bio::EnsEMBL::Compara::GenomicAlignTree");
$this_leaf->genomic_align($this_genomic_align);
} else {
throw("Error while parsing the FASTA header. It must start by \">DnaFrag#####\" where ##### is the dnafrag_id\n$_");
}
$seq = "";
} else {
$seq .= $_;
}
}
close F;
if ($this_genomic_align->dnafrag_id == -1) {
} else {
$this_genomic_align->aligned_sequence($seq);
$this_genomic_align_block->add_GenomicAlign($this_genomic_align);
}
print $tree->newick_format("simple"), "\n";
print join(" -- ", map {$_->node_id."+".$_->genomic_align->dnafrag_id} (@{$tree->get_all_nodes()})), "\n";
$self->output([$tree]);
}
1;