Raw content of Bio::EnsEMBL::Compara::RunnableDB::Muscle
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::RunnableDB::Muscle
=cut
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $muscle = Bio::EnsEMBL::Compara::RunnableDB::Muscle->new (
-db => $db,
-input_id => $input_id,
-analysis => $analysis );
$muscle->fetch_input(); #reads from DB
$muscle->run();
$muscle->output();
$muscle->write_output(); #writes to DB
=cut
=head1 DESCRIPTION
This Analysis/RunnableDB is designed to take a Family (or Homology) as input
Run a MUSCLE multiple alignment on it, and store the resulting alignment
back into the family_member table.
input_id/parameters format eg: "{'family_id'=>1234,'options'=>'-maxiters 2'}"
family_id : use family_id to run multiple alignment on its members
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
options : commandline options to pass to the 'muscle' program
=cut
=head1 CONTACT
Contact Jessica Severin on module implemetation/design detail: jessica@ebi.ac.uk
Contact Abel Ureta-Vidal on EnsEMBL/Compara: abel@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::RunnableDB::Muscle;
use strict;
use Getopt::Long;
use IO::File;
use File::Basename;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::BaseAlignFeature;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::DBSQL::PeptideAlignFeatureAdaptor;
use Bio::EnsEMBL::Compara::Member;
use Time::HiRes qw(time gettimeofday tv_interval);
use POSIX qw(ceil floor);
use Bio::EnsEMBL::Hive;
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
=head2 fetch_input
Title : fetch_input
Usage : $self->fetch_input
Function: Fetches input data for repeatmasker from the database
Returns : none
Args : none
=cut
sub fetch_input {
my( $self) = @_;
#$self->{'options'} = "-maxiters 1 -diags1 -sv"; #fast options
$self->{'options'} = "";
$self->{'muscle_starttime'} = time()*1000;
$self->{'max_gene_count'} = 1500;
$self->check_job_fail_options;
# if($self->input_job->retry_count >= 3) {
# $self->dataflow_output_id($self->input_id, 2);
# $self->input_job->update_status('FAILED');
# throw("Muscle job failed >3 times: try something else and FAIL it");
# }
$self->throw("No input_id") unless defined($self->input_id);
#create a Compara::DBAdaptor which shares the same DBI handle
#with the pipeline DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
$self->get_params($self->parameters);
$self->get_params($self->input_id);
$self->print_params if($self->debug);
$self->check_if_exit_cleanly;
if($self->{'family'}) {
$self->{'input_fasta'} = $self->dumpFamilyPeptidesToWorkdir($self->{'family'});
} elsif($self->{'protein_tree'}) {
if ($self->{'protein_tree'}->get_tagvalue('gene_count') > $self->{'max_gene_count'}) {
$self->dataflow_output_id($self->input_id, 2);
$self->input_job->update_status('FAILED');
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
throw("Muscle : cluster size over threshold and FAIL it");
}
$self->{'input_fasta'} = $self->dumpProteinTreeToWorkdir($self->{'protein_tree'});
} else {
throw("undefined family as input\n");
}
return 1;
}
=head2 run
Title : run
Usage : $self->run
Function: runs MUSCLE
Returns : none
Args : none
=cut
sub run
{
my $self = shift;
$self->check_if_exit_cleanly;
return unless($self->{'input_fasta'});
$self->{'muscle_output'} = $self->run_muscle;
}
=head2 write_output
Title : write_output
Usage : $self->write_output
Function: parse muscle output and update family and family_member tables
Returns : none
Args : none
=cut
sub write_output {
my $self = shift;
$self->check_if_exit_cleanly;
if($self->{'family'}) {
$self->parse_and_store_alignment_into_family;
} elsif($self->{'protein_tree'}) {
$self->parse_and_store_alignment_into_proteintree;
#done so release the tree
$self->{'protein_tree'}->release_tree;
} else {
throw("undefined family as input\n");
}
$self->cleanup_job_tmp_files unless($self->debug);
}
##########################################
#
# internal methods
#
##########################################
sub get_params {
my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n") if($self->debug);
my $params = eval($param_string);
return unless($params);
if($self->debug) {
foreach my $key (keys %$params) {
print(" $key : ", $params->{$key}, "\n");
}
}
if(defined($params->{'family_id'})) {
$self->{'family'} = $self->{'comparaDBA'}->get_FamilyAdaptor->fetch_by_dbID($params->{'family_id'});
}
if(defined($params->{'protein_tree_id'})) {
$self->{'protein_tree'} =
$self->{'comparaDBA'}->get_ProteinTreeAdaptor->
fetch_node_by_node_id($params->{'protein_tree_id'});
}
if(defined($params->{'clusterset_id'})) {
$self->{'clusterset_id'} = $params->{'clusterset_id'};
}
$self->{'options'} = $params->{'options'} if(defined($params->{'options'}));
$self->{'max_gene_count'} = $params->{'max_gene_count'} if(defined($params->{'max_gene_count'}));
return;
}
sub print_params {
my $self = shift;
print(" params:\n");
print(" family_id : ", $self->{'family'}->dbID,"\n") if($self->{'family'});
print(" options : ", $self->{'options'},"\n") if($self->{'options'});
}
sub run_muscle
{
my $self = shift;
my $input_fasta = $self->{'input_fasta'};
my $muscle_output = $input_fasta .".msc";
$muscle_output =~ s/\/\//\//g; # converts any // in path to /
my $muscle_executable = $self->analysis->program_file;
unless (-e $muscle_executable) {
$muscle_executable = "/nfs/acari/abel/bin/alpha-dec-osf4.0/muscle";
if (-e "/proc/version") {
# it is a linux machine
$muscle_executable = "/nfs/acari/abel/bin/i386/muscle";
}
}
throw("can't find a muscle executable to run\n") unless(-e $muscle_executable);
my $cmd = $muscle_executable;
$cmd .= " ". "-maxiters 3 " if($self->input_job->retry_count >= 2);
$cmd .= " ". $self->{'options'};
$cmd .= " -clw -nocore -verbose -quiet ";
$cmd .= " -in " . $input_fasta;
$cmd .= " -out $muscle_output -log $muscle_output.log";
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);
print("$cmd\n") if($self->debug);
unless(system($cmd) == 0) {
$self->check_job_fail_options;
throw("error running muscle, $!\n");
}
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
$self->{'muscle_output'} = $muscle_output;
return $muscle_output;
}
sub cleanup_job_tmp_files
{
my $self = shift;
unlink ($self->{'input_fasta'}) if($self->{'input_fasta'});
if($self->{'muscle_output'}) {
unlink ($self->{'muscle_output'});
unlink ($self->{'muscle_output'} . ".log");
}
}
sub check_job_fail_options
{
my $self = shift;
if($self->input_job->retry_count >= 2) {
$self->dataflow_output_id($self->input_id, 2);
$self->input_job->update_status('FAILED');
if($self->{'protein_tree'}) {
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
}
throw("Muscle job failed >=3 times: try something else and FAIL it");
}
}
##############################################################
#
# Family input/output section
#
##############################################################
sub dumpFamilyPeptidesToWorkdir
{
my $self = shift;
my $family = shift;
my $fastafile = $self->worker_temp_directory. "family_". $family->dbID. ".fasta";
$fastafile =~ s/\/\//\//g; # converts any // in path to /
return $fastafile if(-e $fastafile);
print("fastafile = '$fastafile'\n") if($self->debug);
#
# get only peptide members
#
my $seq_id_hash = {};
my @members_attributes;
push @members_attributes,@{$family->get_Member_Attribute_by_source('ENSEMBLPEP')};
push @members_attributes,@{$family->get_Member_Attribute_by_source('Uniprot/SWISSPROT')};
push @members_attributes,@{$family->get_Member_Attribute_by_source('Uniprot/SPTREMBL')};
if(scalar @members_attributes <= 1) {
$self->update_single_peptide_family($family);
return undef; #so muscle isn't run
}
open(OUTSEQ, ">$fastafile")
or $self->throw("Error opening $fastafile for write");
foreach my $member_attribute (@members_attributes) {
my ($member,$attribute) = @{$member_attribute};
my $member_stable_id = $member->stable_id;
next if($seq_id_hash->{$member->sequence_id});
$seq_id_hash->{$member->sequence_id} = 1;
my $seq = $member->sequence;
$seq =~ s/(.{72})/$1\n/g;
chomp $seq;
print OUTSEQ ">$member_stable_id\n$seq\n";
}
close OUTSEQ;
return $fastafile;
}
sub update_single_peptide_family
{
my $self = shift;
my $family = shift;
my $familyMemberList = $family->get_all_Member_Attribute();
foreach my $familyMember (@{$familyMemberList}) {
my ($member,$attribute) = @{$familyMember};
next unless($member->sequence);
next if($member->source_name eq 'ENSEMBLGENE');
$attribute->cigar_line(length($member->sequence)."M");
printf("single_pepide_family %s : %s\n", $member->stable_id, $attribute->cigar_line) if($self->debug);
}
}
sub parse_and_store_alignment_into_family
{
my $self = shift;
my $muscle_output = $self->{'muscle_output'};
my $family = $self->{'family'};
if($muscle_output and -e $muscle_output) {
$family->read_clustalw($muscle_output);
}
my $familyDBA = $self->{'comparaDBA'}->get_FamilyAdaptor;
#
# post process and copy cigar_line between duplicate sequences
#
my $cigar_hash = {};
my $familyMemberList = $family->get_all_Member_Attribute();
#first build up a hash of cigar_lines that are defined
foreach my $familyMember (@{$familyMemberList}) {
my ($member,$attribute) = @{$familyMember};
next unless($member->sequence_id);
next unless(defined($attribute->cigar_line));
next if($attribute->cigar_line eq '');
next if($attribute->cigar_line eq 'NULL');
$cigar_hash->{$member->sequence_id} = $attribute->cigar_line;
}
#next loop again to copy (via sequence_id) into members
#missing cigar_lines and then store them
foreach my $familyMember (@{$familyMemberList}) {
my ($member,$attribute) = @{$familyMember};
next if($member->source_name eq 'ENSEMBLGENE');
next unless($member->sequence_id);
my $cigar_line = $cigar_hash->{$member->sequence_id};
next unless($cigar_line);
$attribute->cigar_line($cigar_line);
printf("update family_member %s : %s\n",$member->stable_id, $attribute->cigar_line) if($self->debug);
$familyDBA->update_relation([$member, $attribute]);
}
}
########################################################
#
# ProteinTree input/output section
#
########################################################
sub update_single_peptide_tree
{
my $self = shift;
my $tree = shift;
foreach my $member (@{$tree->get_all_leaves}) {
next unless($member->isa('Bio::EnsEMBL::Compara::AlignedMember'));
next unless($member->sequence);
$member->cigar_line(length($member->sequence)."M");
$self->{'comparaDBA'}->get_ProteinTreeAdaptor->store($member);
printf("single_pepide_tree %s : %s\n", $member->stable_id, $member->cigar_line) if($self->debug);
}
}
sub dumpProteinTreeToWorkdir
{
my $self = shift;
my $tree = shift;
my $fastafile = $self->worker_temp_directory. "proteintree_". $tree->node_id. ".fasta";
$fastafile =~ s/\/\//\//g; # converts any // in path to /
return $fastafile if(-e $fastafile);
print("fastafile = '$fastafile'\n") if($self->debug);
open(OUTSEQ, ">$fastafile")
or $self->throw("Error opening $fastafile for write");
my $seq_id_hash = {};
my $residues = 0;
my $member_list = $tree->get_all_leaves;
$tree->store_tag('gene_count', scalar(@$member_list));
foreach my $member (@{$member_list}) {
next if($seq_id_hash->{$member->sequence_id});
$seq_id_hash->{$member->sequence_id} = 1;
my $seq = $member->sequence;
$residues += $member->seq_length;
$seq =~ s/(.{72})/$1\n/g;
chomp $seq;
print OUTSEQ ">". $member->sequence_id. "\n$seq\n";
}
close OUTSEQ;
if(scalar keys (%{$seq_id_hash}) <= 1) {
$self->update_single_peptide_tree($tree);
return undef; #so muscle isn't run
}
$tree->store_tag('cluster_residue_count', $residues);
return $fastafile;
}
sub parse_and_store_alignment_into_proteintree
{
my $self = shift;
my $muscle_output = $self->{'muscle_output'};
my $tree = $self->{'protein_tree'};
return unless($muscle_output);
#
# parse alignment file into hash: combine alignment lines
#
my %align_hash;
my $FH = IO::File->new();
$FH->open($muscle_output) || throw("Could not open alignment file [$muscle_output]");
<$FH>; #skip header
while(<$FH>) {
next if($_ =~ /^\s+/); #skip lines that start with space
chomp;
my ($id, $align) = split;
$align_hash{$id} ||= '';
$align_hash{$id} .= $align;
}
$FH->close;
#
# convert clustalw alignment string into a cigar_line
#
my $alignment_length;
foreach my $id (keys %align_hash) {
my $alignment_string = $align_hash{$id};
unless (defined $alignment_length) {
$alignment_length = length($alignment_string);
} else {
if ($alignment_length != length($alignment_string)) {
throw("While parsing the alignment, some id did not return the expected alignment length\n");
}
}
$alignment_string =~ s/\-([A-Z])/\- $1/g;
$alignment_string =~ s/([A-Z])\-/$1 \-/g;
my @cigar_segments = split " ",$alignment_string;
my $cigar_line = "";
foreach my $segment (@cigar_segments) {
my $seglength = length($segment);
$seglength = "" if ($seglength == 1);
if ($segment =~ /^\-+$/) {
$cigar_line .= $seglength . "D";
} else {
$cigar_line .= $seglength . "M";
}
}
$align_hash{$id} = $cigar_line;
}
#
# align cigar_line to member and store
#
foreach my $member (@{$tree->get_all_leaves}) {
if ($align_hash{$member->sequence_id} eq "") {
throw("muscle did produce an empty cigar_line for ".$member->stable_id."\n");
}
$member->cigar_line($align_hash{$member->sequence_id});
## Check that the cigar length (Ms) matches the sequence length
my @cigar_match_lengths = map { if ($_ eq '') {$_ = 1} else {$_ = $_;} } map { $_ =~ /^(\d*)/ } ( $member->cigar_line =~ /(\d*[M])/g );
my $seq_cigar_length; map { $seq_cigar_length += $_ } @cigar_match_lengths;
my $member_sequence = $member->sequence; $member_sequence =~ s/\*//g;
if ($seq_cigar_length != length($member_sequence)) {
throw("While storing the cigar line, the returned cigar length did not match the sequence length\n");
}
#
printf("update protein_tree_member %s : %s\n",$member->stable_id, $member->cigar_line) if($self->debug);
$self->{'comparaDBA'}->get_ProteinTreeAdaptor->store($member);
}
$tree->store_tag('alignment_method', 'Muscle');
my $runtime = floor(time()*1000-$self->{'muscle_starttime'});
$tree->store_tag('MUSCLE_runtime_msec', $runtime);
# Fetch the alignment and calculate the percent identity
my $sa = $tree->get_SimpleAlign;
my $avg_pi = $sa->average_percentage_identity;
$tree->store_tag('MUSCLE_avg_percent_identity', sprintf("%.3f",$avg_pi));
# Also store alignment length
$tree->store_tag('MUSCLE_alignment_length', $sa->length);
return undef;
}
1;