Raw content of Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment
# Ensembl module for Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment
#
# Copyright (c) 2007 Ensembl
#
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment
=head1 SYNOPSIS
my $runnable = new Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment
(-workdir => $workdir,
-multi_fasta_file => "/path/to/mfa/file",
-tree_string => $tree_string,
-taxon_species_tree => "tree in Newick format"
-parameters => $parameters, (optional),
-options => $options);
$runnable->run;
my @output = @{$runnable->output};
=head1 DESCRIPTION
This module creates a new tree for those alignments which contain a segmental duplication. The module will runs treeBest where there are more than 3 sequences in an alignment, otherwise it will run semphy. This module is still under development.
=head1 CONTACT
Questions regarding this module should be addressed to the Ensembl development list: ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::Utils::Argument;
use Bio::EnsEMBL::Analysis::Config::Compara;
use Bio::EnsEMBL::Compara::DBSQL::ProteinTreeAdaptor;
use Bio::EnsEMBL::Compara::GenomicAlign;
use Bio::EnsEMBL::Compara::GenomicAlignBlock;
use Bio::EnsEMBL::Compara::GenomicAlignTree;
use Bio::EnsEMBL::Compara::Graph::NewickParser;
use Data::Dumper;
use File::Basename;
use Bio::EnsEMBL::Analysis::Runnable;
our @ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
=head2 new
Arg [1] : -workdir => "/path/to/working/directory"
Arg [2] : -multi_fasta_file => "/path/to/mfa/file"
Arg [3] : -tree_string => $tree_string (optional)
Arg [4] : -taxon_species_tree => $taxon_species_tree
Arg [5] : -parameters => $parameters (optional)
Arg [6] : -options => $options (optional)
Function : contruct a new Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment
runnable
Returntype: Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment
Exceptions: none
Example :
=cut
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($workdir, $multi_fasta_file, $tree_string, $taxon_species_tree, $parameters, $options) =
rearrange(['WORKDIR', 'MULTI_FASTA_FILE', 'TREE_STRING',
'TAXON_SPECIES_TREE', 'PARAMETERS', 'OPTIONS'], @args);
chdir $self->workdir;
$self->multi_fasta_file($multi_fasta_file) if (defined $multi_fasta_file);
$self->tree_string($tree_string) if (defined $tree_string);
$self->taxon_species_tree($taxon_species_tree) if (defined $taxon_species_tree);
$self->parameters($parameters) if (defined $parameters);
$self->options($options) if (defined $options);
return $self;
}
sub multi_fasta_file {
my $self = shift;
$self->{'_multi_fasta_file'} = shift if(@_);
return $self->{'_multi_fasta_file'};
}
sub tree_string {
my $self = shift;
$self->{'_tree_string'} = shift if(@_);
return $self->{'_tree_string'};
}
sub taxon_species_tree {
my $self = shift;
$self->{'_taxon_species_tree'} = shift if(@_);
return $self->{'_taxon_species_tree'};
}
sub parameters {
my $self = shift;
$self->{'_parameters'} = shift if(@_);
return $self->{'_parameters'};
}
sub options {
my $self = shift;
$self->{'_options'} = shift if(@_);
return $self->{'_options'};
}
=head2 run_analysis
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment
Arg [2] : string, program name
Description: run treebest if more than 3 sequences in the alignment else run
semphy
Returntype : none
Exceptions : throws if the program in not executable or if the results
file doesnt exist
Example :
Status : At risk
=cut
sub run_analysis {
my ($self, $program) = @_;
#find how many sequences are in the mfa file
my $num_sequences = get_num_sequences($self->multi_fasta_file);
#treebest phyml needs at least 4 sequences to run. If I have less than
#4, then run semphy instead.
if ($num_sequences < 4) {
$self->run_semphy_2x;
} else {
$self->run_treebest_2x;
}
#move this to compara module instead because it is easier to keep track
#of the 2x composite fragments. And also it removes the need to create
#compara objects in analysis module.
#$self->parse_results;
return 1;
}
#Find how many sequences in mfa file
sub get_num_sequences {
my ($mfa) = @_;
my $cnt = 0;
open (MFA, "$mfa") || throw("Couldn't open $mfa");
while () {
$cnt++ if (/^>/);
}
close(MFA);
return $cnt;
}
=head2 run_treebest_2x
Arg [1] : none
Description: create and open a commandline for the program treebest
Returntype : none
Exceptions : throws if the program in not executable or if the results
file doesnt exist
Example :
Status : At risk
=cut
sub run_treebest_2x {
my ($self) = @_;
#Check I don't already have a tree
return if ($self->tree_string);
chdir $self->workdir;
my $tree_file = "output.$$.tree";
#write species tree (with taxon_ids) to file in workdir
my $species_tree_file = "species_tree.$$.tree";
open(F, ">$species_tree_file") || throw("Couldn't open $species_tree_file for writing");
print (F $self->taxon_species_tree);
close(F);
#Run treebeset
my $command = "$TREEBEST phyml -Snf $species_tree_file " . $self->multi_fasta_file . " | $TREEBEST sdi -rs $species_tree_file - > $tree_file";
print "Running treebest $command\n";
#run treebest to create tree
unless (system($command) == 0) {
throw("treebest execution failed\n");
os_error_warning($command);
}
#read in new tree
if (-e $tree_file) {
## Treebest estimated the tree. Overwrite the order of the fasta files and get the tree
open(F, $tree_file) || throw("Could not open tree file <$tree_file>");
my $newick;
while () {
$newick .= $_;
}
close(F);
$newick =~ s/[\r\n]+$//;
rearrange_multi_fasta_file($newick, $self->multi_fasta_file);
}
}
=head2 run_semphy_2x
Arg [1] : none
Description: create and open a commandline for the program semphy
Returntype : none
Exceptions : throws if the program in not executable or if the results
file doesnt exist
Example :
Status : At risk
=cut
sub run_semphy_2x {
my ($self) = @_;
#Check I don't already have a tree
return if ($self->tree_string);
chdir $self->workdir;
my $tree_file = "output.$$.tree";
#Run semphy directly
my $command = "$SEMPHY --treeoutputfile=" . $tree_file . " -a 4 --hky -J -H -S --ACGprob=0.300000,0.200000,0.200000 --sequence=" . $self->multi_fasta_file;
print "Running semphy $command\n";
#run semphy to create tree
unless (system($command) == 0) {
throw("semphy execution failed\n");
os_error_warning($command);
}
#read in new tree
if (-e $tree_file) {
## Semphy estimated the tree. Overwrite the order of the fasta files and get the tree
open(F, $tree_file) || throw("Could not open tree file <$tree_file>");
my ($newick) = ;
close(F);
$newick =~ s/[\r\n]+$//;
rearrange_multi_fasta_file($newick, $self->multi_fasta_file);
}
#print "FINAL tree string $tree_string\n";
}
=head2 os_error_warning
Arg [1] : string: The command line argument of an exec() or
system() call.
Example : (system($command) == 0) || os_error_warning($command);
Description: Evaluates an eventual operating system error in
$! ($OS_ERROR) or the 16 bit wait(2) status word in
$? ($CHILD_ERROR) after an exec() or system() call.
It generates a warning message in both cases.
Returntype : bool: true on success.
Exceptions : none
Caller : general
Status : At Risk
=cut
sub os_error_warning {
my $command = shift;
my $warning = '';
$warning .= 'Encountered ';
if ($!) {
$warning .= "OS_ERROR \'$!\'";
} else {
$warning .= 'CHILD_ERROR ';
$warning .= 'Exit code: ';
$warning .= ($? >> 8);
$warning .= ' ';
$warning .= 'Signal: ';
$warning .= ($? & 127);
$warning .= ' ';
if ($? & 128) {
$warning .= ' ';
$warning .= 'Core dump ';
}
}
$warning .= " on following command:\n";
$warning .= $command;
warning($warning);
return 1;
}
sub rearrange_multi_fasta_file {
my ($tree_string, $mfa_file) = @_;
my (@ordered_leaves) = $tree_string =~ /(seq[^:]+)/g;
my $ordered_names;
for (my $i = 0; $i < @ordered_leaves; $i++) {
my $item;
$item->{name} = ">" . $ordered_leaves[$i];
$item->{leaf} = $ordered_leaves[$i];
push @$ordered_names, $item;
}
my %mfa_data;
open(MFA, $mfa_file) || throw("Could not open mfa file <$mfa_file>");
my $name;
my $seq;
while () {
next if (/^\s*$/);
chomp;
if (/^>/) {
if ($name) {
$mfa_data{$name} = $seq;
$seq = "";
}
$name = $_;
} else {
$seq .= $_;
}
}
$mfa_data{$name} = $seq;
close(MFA);
#overwrite existing mfa file
open MFA, ">$mfa_file" || throw("Couldn't open $mfa_file");
foreach my $ordered_name (@$ordered_names) {
#for ancestral
next if (!defined $mfa_data{$ordered_name->{name}});
print MFA ">".$ordered_name->{leaf} . "\n";
my $seq = $mfa_data{$ordered_name->{name}};
$seq =~ s/(.{80})/$1\n/g;
chomp $seq;
print MFA $seq,"\n";
}
close MFA;
}
1;