Bio::EnsEMBL::Compara::RunnableDB Muscle
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Compara::RunnableDB::Muscle
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::BaseAlignFeature
Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
Bio::EnsEMBL::Compara::DBSQL::PeptideAlignFeatureAdaptor
Bio::EnsEMBL::Compara::Member
Bio::EnsEMBL::DBSQL::DBAdaptor
Bio::EnsEMBL::Hive
File::Basename
Getopt::Long
IO::File
POSIX qw ( ceil floor )
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Bio::EnsEMBL::Hive::Process
Synopsis
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $muscle = Bio::EnsEMBL::Compara::RunnableDB::Muscle->new (
-db => $db,
-input_id => $input_id,
-analysis => $analysis );
$muscle->fetch_input(); #reads from DB
$muscle->run();
$muscle->output();
$muscle->write_output(); #writes to DB
Description
This Analysis/RunnableDB is designed to take a Family (or Homology) as input
Run a MUSCLE multiple alignment on it, and store the resulting alignment
back into the family_member table.
input_id/parameters format eg: "{'family_id'=>1234,'options'=>'-maxiters 2'}"
family_id : use family_id to run multiple alignment on its members
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
options : commandline options to pass to the 'muscle' program
Methods
check_job_fail_options
No description
Code
cleanup_job_tmp_files
No description
Code
dumpFamilyPeptidesToWorkdir
No description
Code
dumpProteinTreeToWorkdir
No description
Code
fetch_inputDescriptionCode
get_params
No description
Code
parse_and_store_alignment_into_family
No description
Code
parse_and_store_alignment_into_proteintree
No description
Code
print_params
No description
Code
runDescriptionCode
run_muscle
No description
Code
update_single_peptide_family
No description
Code
update_single_peptide_tree
No description
Code
write_outputDescriptionCode
Methods description
fetch_inputcode    nextTop
    Title   :   fetch_input
Usage : $self->fetch_input
Function: Fetches input data for repeatmasker from the database
Returns : none
Args : none
runcodeprevnextTop
    Title   :   run
Usage : $self->run
Function: runs MUSCLE
Returns : none
Args : none
write_outputcodeprevnextTop
    Title   :   write_output
Usage : $self->write_output
Function: parse muscle output and update family and family_member tables
Returns : none
Args : none
Methods code
check_job_fail_optionsdescriptionprevnextTop
sub check_job_fail_options {
  my $self = shift;

  if($self->input_job->retry_count >= 2) {
    $self->dataflow_output_id($self->input_id, 2);
    $self->input_job->update_status('FAILED');

    if($self->{'protein_tree'}) {
      $self->{'protein_tree'}->release_tree;
      $self->{'protein_tree'} = undef;
    }
    throw("Muscle job failed >=3 times: try something else and FAIL it");
  }
}

##############################################################
#
# Family input/output section
#
##############################################################
}
cleanup_job_tmp_filesdescriptionprevnextTop
sub cleanup_job_tmp_files {
  my $self = shift;
  
  unlink ($self->{'input_fasta'}) if($self->{'input_fasta'});
  if($self->{'muscle_output'}) {
    unlink ($self->{'muscle_output'});
    unlink ($self->{'muscle_output'} . ".log");
  }
}
dumpFamilyPeptidesToWorkdirdescriptionprevnextTop
sub dumpFamilyPeptidesToWorkdir {
  my $self = shift;
  my $family = shift;

  my $fastafile = $self->worker_temp_directory. "family_". $family->dbID. ".fasta";
  $fastafile =~ s/\/\//\//g;  # converts any // in path to /
return $fastafile if(-e $fastafile); print("fastafile = '$fastafile'\n") if($self->debug); #
# get only peptide members
#
my $seq_id_hash = {}; my @members_attributes; push @members_attributes,@{$family->get_Member_Attribute_by_source('ENSEMBLPEP')}; push @members_attributes,@{$family->get_Member_Attribute_by_source('Uniprot/SWISSPROT')}; push @members_attributes,@{$family->get_Member_Attribute_by_source('Uniprot/SPTREMBL')}; if(scalar @members_attributes <= 1) { $self->update_single_peptide_family($family); return undef; #so muscle isn't run
} open(OUTSEQ, ">$fastafile") or $self->throw("Error opening $fastafile for write"); foreach my $member_attribute (@members_attributes) { my ($member,$attribute) = @{$member_attribute}; my $member_stable_id = $member->stable_id; next if($seq_id_hash->{$member->sequence_id}); $seq_id_hash->{$member->sequence_id} = 1; my $seq = $member->sequence; $seq =~ s/(.{72})/$1\n/g; chomp $seq; print OUTSEQ ">$member_stable_id\n$seq\n"; } close OUTSEQ; return $fastafile;
}
dumpProteinTreeToWorkdirdescriptionprevnextTop
sub dumpProteinTreeToWorkdir {
  my $self = shift;
  my $tree = shift;

  my $fastafile = $self->worker_temp_directory. "proteintree_". $tree->node_id. ".fasta";
  $fastafile =~ s/\/\//\//g;  # converts any // in path to /
return $fastafile if(-e $fastafile); print("fastafile = '$fastafile'\n") if($self->debug); open(OUTSEQ, ">$fastafile") or $self->throw("Error opening $fastafile for write"); my $seq_id_hash = {}; my $residues = 0; my $member_list = $tree->get_all_leaves; $tree->store_tag('gene_count', scalar(@$member_list)); foreach my $member (@{$member_list}) { next if($seq_id_hash->{$member->sequence_id}); $seq_id_hash->{$member->sequence_id} = 1; my $seq = $member->sequence; $residues += $member->seq_length; $seq =~ s/(.{72})/$1\n/g; chomp $seq; print OUTSEQ ">". $member->sequence_id. "\n$seq\n"; } close OUTSEQ; if(scalar keys (%{$seq_id_hash}) <= 1) { $self->update_single_peptide_tree($tree); return undef; #so muscle isn't run
} $tree->store_tag('cluster_residue_count', $residues); return $fastafile;
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_;

  #$self->{'options'} = "-maxiters 1 -diags1 -sv"; #fast options
$self->{'options'} = ""; $self->{'muscle_starttime'} = time()*1000; $self->{'max_gene_count'} = 1500; $self->check_job_fail_options; # if($self->input_job->retry_count >= 3) {
# $self->dataflow_output_id($self->input_id, 2);
# $self->input_job->update_status('FAILED');
# throw("Muscle job failed >3 times: try something else and FAIL it");
# }
$self->throw("No input_id") unless defined($self->input_id); #create a Compara::DBAdaptor which shares the same DBI handle
#with the pipeline DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc); $self->get_params($self->parameters); $self->get_params($self->input_id); $self->print_params if($self->debug); $self->check_if_exit_cleanly; if($self->{'family'}) { $self->{'input_fasta'} = $self->dumpFamilyPeptidesToWorkdir($self->{'family'}); } elsif($self->{'protein_tree'}) { if ($self->{'protein_tree'}->get_tagvalue('gene_count') > $self->{'max_gene_count'}) { $self->dataflow_output_id($self->input_id, 2); $self->input_job->update_status('FAILED'); $self->{'protein_tree'}->release_tree; $self->{'protein_tree'} = undef; throw("Muscle : cluster size over threshold and FAIL it"); } $self->{'input_fasta'} = $self->dumpProteinTreeToWorkdir($self->{'protein_tree'}); } else { throw("undefined family as input\n"); } return 1;
}
get_paramsdescriptionprevnextTop
sub get_params {
  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n") if($self->debug);
  
  my $params = eval($param_string);
  return unless($params);

  if($self->debug) {
    foreach my $key (keys %$params) {
      print("  $key : ", $params->{$key}, "\n");
    }
  }
    
  if(defined($params->{'family_id'})) {
    $self->{'family'} =  $self->{'comparaDBA'}->get_FamilyAdaptor->fetch_by_dbID($params->{'family_id'});
  }
  if(defined($params->{'protein_tree_id'})) {
    $self->{'protein_tree'} =  
         $self->{'comparaDBA'}->get_ProteinTreeAdaptor->
         fetch_node_by_node_id($params->{'protein_tree_id'});
  }
  if(defined($params->{'clusterset_id'})) {
    $self->{'clusterset_id'} = $params->{'clusterset_id'};
  }
  $self->{'options'} = $params->{'options'} if(defined($params->{'options'}));
  $self->{'max_gene_count'} = $params->{'max_gene_count'} if(defined($params->{'max_gene_count'}));
  return;
}
parse_and_store_alignment_into_familydescriptionprevnextTop
sub parse_and_store_alignment_into_family {
  my $self = shift;
  my $muscle_output =  $self->{'muscle_output'};
  my $family = $self->{'family'};
    
  if($muscle_output and -e $muscle_output) {
    $family->read_clustalw($muscle_output);
  }

  my $familyDBA = $self->{'comparaDBA'}->get_FamilyAdaptor;

  # 
# post process and copy cigar_line between duplicate sequences
#
my $cigar_hash = {}; my $familyMemberList = $family->get_all_Member_Attribute(); #first build up a hash of cigar_lines that are defined
foreach my $familyMember (@{$familyMemberList}) { my ($member,$attribute) = @{$familyMember}; next unless($member->sequence_id); next unless(defined($attribute->cigar_line)); next if($attribute->cigar_line eq ''); next if($attribute->cigar_line eq 'NULL'); $cigar_hash->{$member->sequence_id} = $attribute->cigar_line; } #next loop again to copy (via sequence_id) into members
#missing cigar_lines and then store them
foreach my $familyMember (@{$familyMemberList}) { my ($member,$attribute) = @{$familyMember}; next if($member->source_name eq 'ENSEMBLGENE'); next unless($member->sequence_id); my $cigar_line = $cigar_hash->{$member->sequence_id}; next unless($cigar_line); $attribute->cigar_line($cigar_line); printf("update family_member %s : %s\n",$member->stable_id, $attribute->cigar_line) if($self->debug); $familyDBA->update_relation([$member, $attribute]); } } ########################################################
#
# ProteinTree input/output section
#
########################################################
}
parse_and_store_alignment_into_proteintreedescriptionprevnextTop
sub parse_and_store_alignment_into_proteintree {
  my $self = shift;
  my $muscle_output =  $self->{'muscle_output'};
  my $tree = $self->{'protein_tree'};
  
  return unless($muscle_output);
  
  #
# parse alignment file into hash: combine alignment lines
#
my %align_hash; my $FH = IO::File->new(); $FH->open($muscle_output) || throw("Could not open alignment file [$muscle_output]"); <$FH>; #skip header
while(<$FH>) { next if($_ =~ /^\s+/); #skip lines that start with space
chomp; my ($id, $align) = split; $align_hash{$id} ||= ''; $align_hash{$id} .= $align; } $FH->close; #
# convert clustalw alignment string into a cigar_line
#
my $alignment_length; foreach my $id (keys %align_hash) { my $alignment_string = $align_hash{$id}; unless (defined $alignment_length) { $alignment_length = length($alignment_string); } else { if ($alignment_length != length($alignment_string)) { throw("While parsing the alignment, some id did not return the expected alignment length\n"); } } $alignment_string =~ s/\-([A-Z])/\- $1/g; $alignment_string =~ s/([A-Z])\-/$1 \-/g; my @cigar_segments = split " ",$alignment_string; my $cigar_line = ""; foreach my $segment (@cigar_segments) { my $seglength = length($segment); $seglength = "" if ($seglength == 1); if ($segment =~ /^\-+$/) { $cigar_line .= $seglength . "D"; } else { $cigar_line .= $seglength . "M"; } } $align_hash{$id} = $cigar_line; } #
# align cigar_line to member and store
#
foreach my $member (@{$tree->get_all_leaves}) { if ($align_hash{$member->sequence_id} eq "") { throw("muscle did produce an empty cigar_line for ".$member->stable_id."\n"); } $member->cigar_line($align_hash{$member->sequence_id}); ## Check that the cigar length (Ms) matches the sequence length
my @cigar_match_lengths = map { if ($_ eq '') {$_ = 1} else {$_ = $_;} } map { $_ =~ /^(\d*)/ } ( $member->cigar_line =~ /(\d*[M])/g ); my $seq_cigar_length; map { $seq_cigar_length += $_ } @cigar_match_lengths; my $member_sequence = $member->sequence; $member_sequence =~ s/\*//g; if ($seq_cigar_length != length($member_sequence)) { throw("While storing the cigar line, the returned cigar length did not match the sequence length\n"); } #
printf("update protein_tree_member %s : %s\n",$member->stable_id, $member->cigar_line) if($self->debug); $self->{'comparaDBA'}->get_ProteinTreeAdaptor->store($member); } $tree->store_tag('alignment_method', 'Muscle'); my $runtime = floor(time()*1000-$self->{'muscle_starttime'}); $tree->store_tag('MUSCLE_runtime_msec', $runtime); # Fetch the alignment and calculate the percent identity
my $sa = $tree->get_SimpleAlign; my $avg_pi = $sa->average_percentage_identity; $tree->store_tag('MUSCLE_avg_percent_identity', sprintf("%.3f",$avg_pi)); # Also store alignment length
$tree->store_tag('MUSCLE_alignment_length', $sa->length); return undef; } 1;
}
print_paramsdescriptionprevnextTop
sub print_params {
  my $self = shift;

  print(" params:\n");
  print("   family_id     : ", $self->{'family'}->dbID,"\n") if($self->{'family'});
  print("   options       : ", $self->{'options'},"\n") if($self->{'options'});
}
rundescriptionprevnextTop
sub run {
  my $self = shift;

  $self->check_if_exit_cleanly;
  return unless($self->{'input_fasta'});  
  $self->{'muscle_output'} = $self->run_muscle;
}
run_muscledescriptionprevnextTop
sub run_muscle {
  my $self = shift;
  my $input_fasta = $self->{'input_fasta'};

  my $muscle_output = $input_fasta .".msc";
  $muscle_output =~ s/\/\//\//g;  # converts any // in path to /
my $muscle_executable = $self->analysis->program_file; unless (-e $muscle_executable) { $muscle_executable = "/nfs/acari/abel/bin/alpha-dec-osf4.0/muscle"; if (-e "/proc/version") { # it is a linux machine
$muscle_executable = "/nfs/acari/abel/bin/i386/muscle"; } } throw("can't find a muscle executable to run\n") unless(-e $muscle_executable); my $cmd = $muscle_executable; $cmd .= " ". "-maxiters 3 " if($self->input_job->retry_count >= 2); $cmd .= " ". $self->{'options'}; $cmd .= " -clw -nocore -verbose -quiet "; $cmd .= " -in " . $input_fasta; $cmd .= " -out $muscle_output -log $muscle_output.log"; $self->{'comparaDBA'}->dbc->disconnect_when_inactive(1); print("$cmd\n") if($self->debug); unless(system($cmd) == 0) { $self->check_job_fail_options; throw("error running muscle, $!\n"); } $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0); $self->{'muscle_output'} = $muscle_output; return $muscle_output;
}
update_single_peptide_familydescriptionprevnextTop
sub update_single_peptide_family {
  my $self   = shift;
  my $family = shift;
  
  my $familyMemberList = $family->get_all_Member_Attribute();

  foreach my $familyMember (@{$familyMemberList}) {
    my ($member,$attribute) = @{$familyMember};
    next unless($member->sequence);
    next if($member->source_name eq 'ENSEMBLGENE');
    $attribute->cigar_line(length($member->sequence)."M");
    printf("single_pepide_family %s : %s\n", $member->stable_id, $attribute->cigar_line) if($self->debug);
  }
}
update_single_peptide_treedescriptionprevnextTop
sub update_single_peptide_tree {
  my $self   = shift;
  my $tree   = shift;
  
  foreach my $member (@{$tree->get_all_leaves}) {
    next unless($member->isa('Bio::EnsEMBL::Compara::AlignedMember'));
    next unless($member->sequence);
    $member->cigar_line(length($member->sequence)."M");
    $self->{'comparaDBA'}->get_ProteinTreeAdaptor->store($member);
    printf("single_pepide_tree %s : %s\n", $member->stable_id, $member->cigar_line) if($self->debug);
  }
}
write_outputdescriptionprevnextTop
sub write_output {
  my $self = shift;

  $self->check_if_exit_cleanly;
  if($self->{'family'}) {
    $self->parse_and_store_alignment_into_family;
  } elsif($self->{'protein_tree'}) {
    $self->parse_and_store_alignment_into_proteintree;
    #done so release the tree
$self->{'protein_tree'}->release_tree; } else { throw("undefined family as input\n"); } $self->cleanup_job_tmp_files unless($self->debug); } ##########################################
#
# internal methods
#
##########################################
}
General documentation
CONTACTTop
  Contact Jessica Severin on module implemetation/design detail: jessica@ebi.ac.uk
Contact Abel Ureta-Vidal on EnsEMBL/Compara: abel@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _