Bio::EnsEMBL::Compara::RunnableDB
MCoffee
Toolbar
Summary
Bio::EnsEMBL::Compara::RunnableDB::MCoffee
Package variables
No package variables defined.
Included modules
File::Basename
File::Path
Getopt::Long
IO::File
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Synopsis
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $mcoffee = Bio::EnsEMBL::Compara::RunnableDB::Mcoffee->new (
-db => $db,
-input_id => $input_id,
-analysis => $analysis );
$mcoffee->fetch_input(); #reads from DB
$mcoffee->run();
$mcoffee->output();
$mcoffee->write_output(); #writes to DB
Description
This Analysis/RunnableDB is designed to take a protein_tree cluster as input
Run an MCOFFEE multiple alignment on it, and store the resulting alignment
back into the protein_tree_member and protein_tree_member_score table.
input_id/parameters format eg: "{'protein_tree_id'=>726093, 'clusterset_id'=>1}"
protein_tree_id : use family_id to run multiple alignment on its members
options : commandline options to pass to the 'mcoffee' program
Methods
DESTROY | No description | Code |
_get_alternate_alignment_tree | No description | Code |
_store_aln_tags | No description | Code |
_to_cigar_line | No description | Code |
dumpProteinTreeToWorkdir | No description | Code |
fetch_input | Description | Code |
get_params | No description | Code |
parse_and_store_alignment_into_proteintree | No description | Code |
run | Description | Code |
run_mcoffee | No description | Code |
update_single_peptide_tree | No description | Code |
write_output | Description | Code |
Methods description
Title : fetch_input Usage : $self->fetch_input Function: Fetches input data for mcoffee from the database Returns : none Args : none |
Title : run Usage : $self->run Function: runs MCOFFEE Returns : none Args : none |
Title : write_output Usage : $self->write_output Function: parse mcoffee output and update protein_tree_member tables Returns : none Args : none |
Methods code
sub DESTROY
{ my $self = shift;
if($self->{'protein_tree'}) {
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
}
unlink ($self->{'input_params'}) if($self->{'input_params'});
unlink ($self->{'input_fasta'}) if($self->{'input_fasta'});
if($self->{'mcoffee_output'}) {
unlink ($self->{'mcoffee_output'});
unlink ($self->{'mcoffee_output'} . ".log");
}
$self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
}
} |
sub _get_alternate_alignment_tree
{ my $self = shift;
my $pta = shift;
my $node_id = shift;
my $table = shift;
my $tree = $pta->fetch_node_by_node_id($node_id);
foreach my $leaf (@{$tree->get_all_leaves}) {
undef $leaf->{'cdna_alignment_string'};
undef $leaf->{'alignment_string'};
my $id = $leaf->member_id;
my $cmd = "SELECT cigar_line FROM $table where member_id=$id;";
my $sth = $pta->prepare($cmd);
$sth->execute();
my $data = $sth->fetchrow_hashref();
$sth->finish();
my $cigar = $data->{'cigar_line'};
die "No cigar line for member $id!\n" unless ($cigar);
$leaf->cigar_line($cigar);
}
return $tree;
}
1; } |
sub _store_aln_tags
{ my $self = shift;
my $tree = shift || $self->{'protein_tree'};
my $output_table = $self->{'output_table'};
my $pta = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
print "Storing Alignment tags...\n";
if ($output_table ne "protein_tree_member") {
$tree = $self->_get_alternate_alignment_tree($pta,$tree->node_id,$output_table);
}
my $sa = $tree->get_SimpleAlign;
my $aln_pi = $sa->average_percentage_identity;
$tree->store_tag("aln_percent_identity",$aln_pi);
my $aln_length = $sa->length;
$tree->store_tag("aln_length",$aln_length);
my $aln_runtime = int(time()*1000-$self->{'mcoffee_starttime'});
$tree->store_tag("aln_runtime",$aln_runtime);
my $aln_method = $self->{'method'};
$tree->store_tag("aln_method",$aln_method);
my $aln_num_residues = $sa->no_residues;
$tree->store_tag("aln_num_residues",$aln_num_residues);
my ($from_clusterset_id, $to_clusterset_id) = split(":",$self->{'redo'});
my $redo_tag = "MCoffee_redo_".$from_clusterset_id."_".$to_clusterset_id;
$tree->store_tag("$redo_tag",$self->{'protein_tree_id'}) if ($self->{'redo'}); } |
sub _to_cigar_line
{ my $self = shift;
my $alignment_string = shift;
$alignment_string =~ s/\-([A-Z])/\- $1/g;
$alignment_string =~ s/([A-Z])\-/$1 \-/g;
my @cigar_segments = split " ",$alignment_string;
my $cigar_line = "";
foreach my $segment (@cigar_segments) {
my $seglength = length($segment);
$seglength = "" if ($seglength == 1);
if ($segment =~ /^\-+$/) {
$cigar_line .= $seglength . "D";
} else {
$cigar_line .= $seglength . "M";
}
}
return $cigar_line; } |
sub dumpProteinTreeToWorkdir
{
my $self = shift;
my $tree = shift;
my $use_exon_boundaries = shift;
my $fastafile;
if ($use_exon_boundaries) {
$fastafile = $self->worker_temp_directory. "proteintree_exon_". $tree->node_id. ".fasta";
} else {
my $node_id = $tree->node_id;
$fastafile = $self->worker_temp_directory. "proteintree_". $node_id. ".fasta";
}
$fastafile =~ s/\/\//\//g; return $fastafile if(-e $fastafile);
print("fastafile = '$fastafile'\n") if ($self->debug);
open(OUTSEQ, ">$fastafile")
or $self->throw("Error opening $fastafile for write!");
my $seq_id_hash = {};
my $residues = 0;
my $member_list = $tree->get_all_leaves;
$DB::single=1;1;
$self->{'tag_gene_count'} = scalar(@{$member_list});
foreach my $member (@{$member_list}) {
my $longest_member = $member->gene_member->get_longest_peptide_Member;
unless ($longest_member->member_id eq $member->member_id) {
$member->disavow_parent;
$self->{treeDBA}->delete_flattened_leaf($member);
my $updated_gene_count = $tree->get_tagvalue('gene_count');
$updated_gene_count--;
$tree->adaptor->delete_tag($tree->node_id,'gene_count');
$tree->store_tag('gene_count', $updated_gene_count);
next;
}
return undef unless ($member->isa("Bio::EnsEMBL::Compara::AlignedMember"));
next if($seq_id_hash->{$member->sequence_id});
$seq_id_hash->{$member->sequence_id} = 1;
my $seq = '';
if ($use_exon_boundaries) {
$seq = $member->get_exon_bounded_sequence;
} else {
$seq = $member->sequence;
}
$residues += $member->seq_length;
$seq =~ s/(.{72})/$1\n/g;
chomp $seq;
print OUTSEQ ">". $member->sequence_id. "\n$seq\n";
}
close OUTSEQ;
if(scalar keys (%{$seq_id_hash}) <= 1) {
$self->update_single_peptide_tree($tree);
}
$self->{'tag_residue_count'} = $residues;
return $fastafile; } |
sub fetch_input
{ my( $self) = @_;
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
my $p = '';
$p = 'use_exon_boundaries';
$self->{$p} = 0;
$p = 'method';
$self->{$p} = 'fmcoffee';
$p = 'output_table';
$self->{$p} = 'protein_tree_member';
$p = 'max_gene_count';
$self->{$p} = 1500;
$p = 'options';
$self->{$p} = '';
$self->get_params($self->parameters);
$self->get_params($self->input_id);
$self->check_if_exit_cleanly;
$self->{treeDBA} = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
if (defined $self->{'protein_tree_id'}) {
my $id = $self->{'protein_tree_id'};
$self->{'protein_tree'} = $self->{'comparaDBA'}->get_ProteinTreeAdaptor->fetch_node_by_node_id($id);
$self->{'protein_tree'}->flatten_tree; $self->{'input_fasta'} = $self->dumpProteinTreeToWorkdir($self->{'protein_tree'});
if ($self->{'use_exon_boundaries'}) {
$self->{'input_fasta_exons'} = $self->dumpProteinTreeToWorkDir($self->{'protein_tree'},1); }
}
if (defined($self->{'redo'}) && $self->{'method'} eq 'unalign') {
$self->{redo_sa} = $self->{'protein_tree'}->get_SimpleAlign(-id_type => 'MEMBER');
$self->{redo_sa}->set_displayname_flat(1);
$self->{redo_alnname} = $self->worker_temp_directory . $self->{protein_tree}->node_id . ".fasta";
my $alignout = Bio::AlignIO->new(-file => ">".$self->{redo_alnname},
-format => "fasta");
$alignout->write_aln($self->{redo_sa});
}
if ($self->{'method'} eq 'cmcoffee') {
if ($self->{'protein_tree'}->get_tagvalue('gene_count') > 100) {
$self->{'method'} = 'mafft';
print "MCoffee, auto-switch method to mafft because gene count > 100\n ";
}
}
print "RETRY COUNT:".$self->input_job->retry_count()."\n";
if ($self->input_job->retry_count >= 2) {
$self->{'method'} = 'fmcoffee';
}
if ($self->input_job->retry_count >= 3) {
$self->{'method'} = 'mafft';
}
print "MCoffee alignment method: ".$self->{'method'}."\n";
if (!defined($self->{'protein_tree'})) {
$self->DESTROY;
throw("MCoffee job no input protein_tree");
}
if (!$self->{'input_fasta'}) {
$self->DESTROY;
throw("MCoffee job, error writing input Fasta");
}
if ($self->{'protein_tree'}->get_tagvalue('gene_count') > $self->{'max_gene_count'}) {
$self->dataflow_output_id($self->input_id, 2);
$self->input_job->update_status('FAILED');
$self->DESTROY;
throw("Mcoffee job too big: try something else and FAIL it");
}
if ($self->input_job->retry_count >= 5) {
$self->dataflow_output_id($self->input_id, 2);
$self->input_job->update_status('FAILED');
$self->DESTROY;
throw("Mcoffee job failed >=3 times: try something else and FAIL it");
}
return 1; } |
sub get_params
{ my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n") if($self->debug);
my $params = eval($param_string);
return unless($params);
if($self->debug) {
foreach my $key (keys %$params) {
print(" $key\t=>\t", $params->{$key}, "\n");
}
}
my $p;
$p = 'method';
$self->{$p} = $params->{$p} if (defined($params->{$p}));
$p = 'cutoff';
$self->{$p} = $params->{$p} if (defined($params->{$p}));
$p = 'output_table';
$self->{$p} = $params->{$p} if (defined($params->{$p}));
$p = 'protein_tree_id';
$self->{$p} = $params->{$p} if (defined($params->{$p}));
$p = 'clusterset_id';
$self->{$p} = $params->{$p} if (defined($params->{$p}));
$p = 'options';
$self->{$p} = $params->{$p} if (defined($params->{$p}));
$p = 'max_gene_count';
$self->{$p} = $params->{$p} if (defined($params->{$p}));
$p = 'use_exon_boundaries';
$self->{$p} = $params->{$p} if (defined($params->{$p}));
$p = 'redo';
$self->{$p} = $params->{$p} if (defined($params->{$p}));
return; } |
parse_and_store_alignment_into_proteintree | description | prev | next | Top |
sub parse_and_store_alignment_into_proteintree
{
my $self = shift;
my $mcoffee_output = $self->{'mcoffee_output'};
my $mcoffee_scores = $self->{'mcoffee_scores'};
my $tree = $self->{'protein_tree'};
return unless($mcoffee_output and -e $mcoffee_output);
use Bio::AlignIO;
my $alignio = Bio::AlignIO->new(-file => "$mcoffee_output",
-format => "fasta");
my $aln = $alignio->next_aln();
my %align_hash;
foreach my $seq ($aln->each_seq) {
my $id = $seq->display_id;
$align_hash{$id} = $seq->seq;
}
my %score_hash;
if (defined $mcoffee_scores) {
my $FH = IO::File->new();
$FH->open($mcoffee_scores) || throw("Could not open alignment scores file [$mcoffee_scores]");
<$FH>; my $i=0;
while(<$FH>) {
$i++;
next if ($i < 7); next if($_ =~ /^\s+/); if ($_ =~ /:/) {
my ($id,$overall_score) = split(/:/,$_);
$id =~ s/^\s+|\s+$//g;
$overall_score =~ s/^\s+|\s+$//g;
print "___".$id."___".$overall_score."___\n";
next;
}
chomp;
my ($id, $align) = split;
$score_hash{$id} ||= '';
$score_hash{$id} .= $align;
}
$FH->close;
}
my $alignment_length;
foreach my $id (keys %align_hash) {
next if ($id eq 'cons');
my $alignment_string = $align_hash{$id};
unless (defined $alignment_length) {
$alignment_length = length($alignment_string);
} else {
if ($alignment_length != length($alignment_string)) {
throw("While parsing the alignment, some id did not return the expected alignment length\n");
}
}
$align_hash{$id} = $self->_to_cigar_line(uc($alignment_string));
}
if (defined($self->{redo}) && $self->{'output_table'} eq 'protein_tree_member') {
my ($from_clusterset_id, $to_clusterset_id) = split(":",$self->{'redo'});
throw("malformed redo option: ". $self->{'redo'}." should be like 1:1000000")
unless (defined($from_clusterset_id) && defined($to_clusterset_id));
my $clone_tree = $self->{protein_tree}->copy;
my $clusterset = $self->{treeDBA}->fetch_node_by_node_id($to_clusterset_id);
$clusterset->add_child($clone_tree);
$self->{treeDBA}->store($clone_tree);
$self->_store_aln_tags($clone_tree);
$tree->release_tree; $tree = $clone_tree;
}
foreach my $member (@{$tree->get_all_leaves}) {
if ($align_hash{$member->sequence_id} eq "" && $align_hash{$member->member_id} eq "") {
throw("mcoffee produced an empty cigar_line for ".$member->stable_id."\n");
}
$member->cigar_line($align_hash{$member->sequence_id} || $align_hash{$member->member_id});
my @cigar_match_lengths = map { if ($_ eq '') {$_ = 1} else {$_ = $_;} } map { $_ =~ /^(\d*)/ } ( $member->cigar_line =~ /(\d*[M])/g );
my $seq_cigar_length; map { $seq_cigar_length += $_ } @cigar_match_lengths;
my $member_sequence = $member->sequence; $member_sequence =~ s/\*//g;
if ($seq_cigar_length != length($member_sequence)) {
print $member_sequence."\n".$member->cigar_line."\n" if ($self->debug);
throw("While storing the cigar line, the returned cigar length did not match the sequence length\n");
}
if ($self->{'output_table'} eq 'protein_tree_member') {
$self->{'comparaDBA'}->get_ProteinTreeAdaptor->store($member);
} else {
my $table_name = $self->{'output_table'};
printf("Updating $table_name %s : %s\n",$member->stable_id,$member->cigar_line) if ($self->debug);
my $sth = $self->{treeDBA}->prepare("INSERT ignore INTO $table_name
(node_id,member_id,method_link_species_set_id,cigar_line) VALUES (?,?,?,?)");
$sth->execute($member->node_id,$member->member_id,$member->method_link_species_set_id,$member->cigar_line);
$sth->finish;
}
if (defined $self->{'mcoffee_scores'}) {
my $table_name = $self->{'output_table'} . "_score";
my $sth = $self->{treeDBA}->prepare("INSERT ignore INTO $table_name
(node_id,member_id,method_link_species_set_id,cigar_line) VALUES (?,?,?,?)");
my $score_string = $score_hash{$member->sequence_id} || '';
$score_string =~ s/[^\d-]/9/g; printf("Updating $table_name %s : %s\n",$member->stable_id,$score_string) if ($self->debug);
$sth->execute($member->node_id,$member->member_id,$member->method_link_species_set_id,$score_string);
$sth->finish;
}
}
}
} |
sub run
{
my $self = shift;
$self->check_if_exit_cleanly;
$self->{'mcoffee_starttime'} = time()*1000;
$self->run_mcoffee; } |
sub run_mcoffee
{
my $self = shift;
my $input_fasta = $self->{'input_fasta'};
my $mcoffee_output = $self->worker_temp_directory . "output.mfa";
$mcoffee_output =~ s/\/\//\//g; $self->{'mcoffee_output'} = $mcoffee_output;
my $mcoffee_scores = $mcoffee_output . ".score_ascii";
$mcoffee_scores =~ s/\/\//\//g; $self->{'mcoffee_scores'} = $mcoffee_scores;
my $tree_temp = $self->worker_temp_directory . "tree_temp.dnd";
$tree_temp =~ s/\/\//\//g; $self->{'mcoffee_tree'} = $tree_temp;
my $mcoffee_executable = $self->analysis->program_file;
unless (-e $mcoffee_executable) {
print "Using default T-Coffee executable!\n";
$mcoffee_executable = "/nfs/acari/avilella/src/tcoffee/compara/t_coffee";
}
throw("can't find a M-Coffee executable to run\n") unless(-e $mcoffee_executable);
my $tempdir = $self->worker_temp_directory;
print "TEMP DIR: $tempdir\n" if ($self->debug);
$self->{'input_params'} = $self->worker_temp_directory. "temp.params";
my $paramsfile = $self->{'input_params'};
$paramsfile =~ s/\/\//\//g; open(OUTPARAMS, ">$paramsfile")
or throw("Error opening $paramsfile for write");
my $method_string = '-method=';
if ($self->{'method'} && $self->{'method'} eq 'cmcoffee') {
$method_string .= "mafftgins_msa, muscle_msa, kalign_msa, probcons_msa"; } elsif ($self->{'method'} eq 'fmcoffee') {
$method_string .= "mafft_msa, muscle_msa, clustalw_msa, kalign_msa";
} elsif ($self->{'method'} eq 'muscle') {
$method_string .= "muscle_msa";
} elsif ($self->{'method'} eq 'mafft') {
$method_string .= "mafft_msa";
} elsif ($self->{'method'} eq 'prank') {
$method_string .= "prank_msa";
} elsif (defined($self->{'redo'}) && $self->{'method'} eq 'unalign') {
my $cutoff = $self->{'cutoff'} || 2;
$method_string = " -other_pg seq_reformat -in " . $self->{redo_alnname} ." -action +aln2overaln lower $cutoff 1>$mcoffee_output";
} else {
throw ("Improper method parameter: ".$self->{'method'});
}
if ($self->{'use_exon_boundaries'}) {
$method_string .= ", exon_pair";
my $exon_file = $self->{'input_fasta_exons'};
print OUTPARAMS "-template_file=$exon_file\n";
}
$method_string .= "\n";
print OUTPARAMS $method_string;
print OUTPARAMS "-mode=mcoffee\n";
print OUTPARAMS "-output=fasta_aln,score_ascii\n";
print OUTPARAMS "-outfile=$mcoffee_output\n";
print OUTPARAMS "-newtree=$tree_temp\n";
close(OUTPARAMS);
my $t_env_filename = $tempdir . "t_coffee_env";
open(TCOFFEE_ENV, ">$t_env_filename")
or throw("Error opening $t_env_filename for write");
print TCOFFEE_ENV "http_proxy_4_TCOFFEE=\n";
print TCOFFEE_ENV "EMAIL_4_TCOFFEE=cedric.notredame\@europe.com\n";
close TCOFFEE_ENV;
my $cmd = $mcoffee_executable;
$cmd .= " ".$input_fasta unless ($self->{redo});
$cmd .= " ". $self->{'options'};
if (defined($self->{'redo'}) && $self->{'method'} eq 'unalign') {
$self->{'mcoffee_scores'} = undef; $cmd .= " ". $method_string;
} else {
$cmd .= " -parameters=$paramsfile";
}
print("$cmd\n") if($self->debug);
my $prefix = "export HOME_4_TCOFFEE=\"$tempdir\";";
$prefix .= "export DIR_4_TCOFFEE=\"$tempdir\";";
$prefix .= "export TMP_4_TCOFFEE=\"$tempdir\";";
$prefix .= "export CACHE_4_TCOFFEE=\"$tempdir\";";
$prefix .= "export NO_ERROR_REPORT_4_TCOFFEE=1;";
$prefix .= "export MAFFT_BINARIES=/nfs/acari/gj1/bin/mafft-bins/binaries;";
print $prefix.$cmd."\n" if ($self->debug);
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);
my $rc;
if ($self->{'method'} eq 'mafft') {
my $mafft_executable = "/nfs/acari/avilella/src/mafft/mafft-6.522/scripts/mafft";
BEGIN {$ENV{MAFFT_BINARIES} = '/nfs/acari/avilella/src/mafft/mafft-6.522/binaries'; }
print STDERR "### $mafft_executable --auto $input_fasta > $mcoffee_output\n";
$rc = system("$mafft_executable --auto $input_fasta > $mcoffee_output");
$self->{'mcoffee_scores'} = undef; } else {
$DB::single=1;1;
$rc = system($prefix.$cmd);
}
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
unless($rc == 0) {
$self->DESTROY;
throw("MCoffee job, error running executable: $\n");
}
}
} |
sub update_single_peptide_tree
{
my $self = shift;
my $tree = shift;
foreach my $member (@{$tree->get_all_leaves}) {
next unless($member->isa('Bio::EnsEMBL::Compara::AlignedMember'));
next unless($member->sequence);
$member->cigar_line(length($member->sequence)."M");
$self->{'comparaDBA'}->get_ProteinTreeAdaptor->store($member);
printf("single_pepide_tree %s : %s\n", $member->stable_id, $member->cigar_line) if($self->debug);
} } |
sub write_output
{ my $self = shift;
$self->check_if_exit_cleanly;
$self->parse_and_store_alignment_into_proteintree;
$self->_store_aln_tags unless ($self->{redo}); } |
General documentation
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _