Bio::EnsEMBL::Compara::RunnableDB
Sitewise_dNdS
Toolbar
Summary
Bio::EnsEMBL::Compara::RunnableDB::Sitewise_dNdS
Package variables
No package variables defined.
Included modules
Cwd
File::Basename
Getopt::Long
IO::File
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Synopsis
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $sitewise_dNdS = Bio::EnsEMBL::Compara::RunnableDB::Sitewise_dNdS->new
(
-db => $db,
-input_id => $input_id,
-analysis => $analysis
);
$sitewise_dNdS->fetch_input(); #reads from DB
$sitewise_dNdS->run();
$sitewise_dNdS->output();
$sitewise_dNdS->write_output(); #writes to DB
Description
This Analysis/RunnableDB is designed to take a ProteinTree or subtree
as input. This must already have a multiple alignment and a genetree
run on it. It uses the alignment and tree as input into the SLR
program which then generates annotations for the codons/peptides and
the branches in the genetree.
input_id/parameters format eg: "{'protein_tree_id'=>1234}"
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
Methods
Methods description
Title : fetch_input Usage : $self->fetch_input Function: Fetches input data for repeatmasker from the database Returns : none Args : none |
Title : run Usage : $self->run Function: runs SLR Returns : none Args : none |
Title : write_output Usage : $self->write_output Function: stores SLR annotations Returns : none Args : none |
Methods code
sub DESTROY
{ my $self = shift;
if ($self->{'protein_tree'}) {
printf("Sitewise_dNdS::DESTROY releasing tree\n") if($self->debug);
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
}
$self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
}
} |
sub check_job_fail_options
{
my $self = shift;
if ($self->input_job->retry_count >= 2) {
$self->dataflow_output_id($self->input_id, 2);
$self->input_job->update_status('FAILED');
if ($self->{'protein_tree'}) {
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
}
throw("Sitewise_dNdS job failed >=3 times: try something else and FAIL it");
} } |
sub fetch_input
{ my( $self) = @_;
$self->check_job_fail_options;
throw("No input_id") unless defined($self->input_id);
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new
(
-DBCONN=>$self->db->dbc
);
$self->get_params($self->parameters);
$self->get_params($self->input_id);
$self->print_params if($self->debug);
$self->check_if_exit_cleanly;
unless ($self->{'protein_tree'}) {
throw("undefined ProteinTree as input\n");
}
my $num_leaves = $self->{'protein_tree'}->num_leaves;
$self->{'protein_tree'}->print_tree(10) if ($self->debug);
if ($num_leaves < 4) {
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
return undef;
}
$self->{'cds_aln'} = $self->{'protein_tree'}->get_SimpleAlign(-cdna => 1);
$self->{'tree'} = $self->{'protein_tree'}->newick_format("int_node_id");
return 1; } |
sub get_params
{ my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n") if($self->debug);
my $params = eval($param_string);
return unless($params);
if (defined($params->{'protein_tree_id'})) {
$self->{'protein_tree'} =
$self->{'comparaDBA'}->get_ProteinTreeAdaptor->
fetch_node_by_node_id($params->{'protein_tree_id'});
} elsif (defined($params->{'saturated'})) {
$self->{'saturated'} = $params->{'saturated'};
}
return; } |
sub print_params
{ my $self = shift;
print("params:\n");
print(" tree_id : ", $self->{'protein_tree'}->node_id,"\n") if($self->{'protein_tree'}); } |
sub run
{ my $self = shift;
$self->check_if_exit_cleanly;
$self->run_sitewise_dNdS; } |
sub run_gblocks
{
my $self = shift;
my $gmin = shift;
my $aln = shift;
printf("Sitewise_dNdS::run_gblocks\n") if($self->debug);
throw("Sitewise_dNdS : error getting Peptide SimpleAlign") unless (defined($aln));
my $aln_length = $aln->length;
my $tree_id = $self->{'protein_tree'}->node_id;
my $tmpdir = $self->worker_temp_directory;
my $filename = "$tmpdir". "$tree_id.fasta";
my $tmpfile = Bio::AlignIO->new
(-file => ">$filename",
-format => 'fasta');
$tmpfile->write_aln($aln);
$tmpfile->close;
my $min_leaves_gblocks = int(($self->{'protein_tree'}->num_leaves+1) * $gmin + 0.5);
my $cmd = "echo -e\" o\n$filename\nt\nb\n2\n$min_leaves_gblocks\n5\n5\ng\nm\nq\n\" | /software/ensembl/compara/bin/Gblocks 2>/dev/null 1>/dev/null";
my $ret = system("$cmd");
open FLANKS, "$filename-gb.htm" or die "$!\n";
my $segments_string;
while (<FLANKS>) {
chomp $_;
next unless ($_ =~ /Flanks/);
$segments_string = $_;
last;
}
close FLANKS;
$segments_string =~ s/Flanks\: //g;
$segments_string =~ s/\s+$//g;
$self->{flanks} = $segments_string;
$self->{'protein_tree'}->store_tag('Gblocks_flanks', $segments_string); } |
sub run_sitewise_dNdS
{
my $self = shift;
return undef unless (defined($self->{protein_tree}));
$self->{starttime} = time()*1000;
my $slrexe = $self->analysis->program_file;
unless (-e $slrexe) {
$slrexe = "/software/ensembl/compara/bin/Slr_ensembl";
}
throw("can't find an slr executable to run\n")
unless(-e $slrexe);
my $aln = $self->{'cds_aln'};
my $tree_string = $self->{'tree'};
open(my $fake_fh, "+<",\$ tree_string);
my $treein = new Bio::TreeIO
(-fh => $fake_fh,
-format => 'newick');
my $tree = $treein->next_tree;
$treein->close;
throw("can't find cds_aln\n") if ( ! $aln );
throw("can't find tree\n") if ( ! $tree );
my $ct = 1;
my %order;
foreach my $node ($tree->get_leaf_nodes) {
$order{$node->id} = $ct++;
}
my @seq; my @ids;
foreach my $seq ( $aln->each_seq() ) {
push @seq, $seq;
push @ids, $seq->display_id;
}
my @sorted = map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
my $sorted_aln = Bio::SimpleAlign->new();
foreach (@sorted) {
$sorted_aln->add_seq($_);
}
$self->run_gblocks(0.5,$sorted_aln);
my $mask;
my $aln_length = $sorted_aln->length;
foreach my $seq ($sorted_aln->each_seq) {
my $seqstring = $seq->seq;
my $start = 1;
foreach my $segment ($self->{flanks} =~ /(\[\d+ \d+\])/g) {
my ($segm_start,$segm_end) = $segment =~ /\[(\d+) (\d+)\]/;
my $mask = 'N' x ($segm_start-$start);
if (0 != $segm_start-$start) {
substr($seqstring,($start-1),($segm_start-$start),$mask);
}
$start = $segm_end + 1;
}
if ($start < $aln_length) { my $mask = 'N' x ($aln_length-$start+1); substr($seqstring,($start-1),($aln_length-$start+1),$mask);
}
$seq->seq($seqstring);
}
throw("malformed masked alignment of the wrong length") unless ($aln_length == $sorted_aln->length);
my $tmpdir = $self->worker_temp_directory;
my $alnout = Bio::AlignIO->new
('-format' => 'phylip',
'-file' => ">$tmpdir/aln",
'-interleaved' => 0,
'-idlinebreak' => 1,
'-idlength' => $aln->maxdisplayname_length + 1);
$alnout->write_aln($sorted_aln);
$alnout->close();
undef $alnout;
my $treeout = Bio::TreeIO->new('-format' => 'newick',
'-file' => ">$tmpdir/tree");
$treeout->_print(sprintf("%d 1\n",($ct-1)));
$treeout->write_tree($tree);
$treeout->close();
my $slr_ctl = "$tmpdir/slr.ctl";
open(SLR, ">$slr_ctl") or throw("cannot open $slr_ctl for writing");
print SLR "seqfile\: aln\n";
print SLR "treefile\: tree\n";
my $outfile = "slr.res";
print SLR "outfile\: $outfile\n";
if (defined($self->{'saturated'})) {
print SLR "saturated\: ". $self->{'saturated'} . "\n";
}
if (defined($self->{'gencode'})) {
print SLR "gencode\: ". $self->{'gencode'} . "\n";
}
print SLR "aminof\: 1\n"; close(SLR);
my ($rc,$results) = (1);
{
my $cwd = cwd();
my $exit_status;
chdir($tmpdir);
my $run;
my $quiet = ''; $quiet = ' 2>/dev/null' unless ($self->debug);
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);
open($run, "$slrexe $quiet |") or throw("Cannot open exe $slrexe");
my @output = <$run>;
$exit_status = close($run);
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
$self->{error_string} = (join('',@output));
if ( (grep { /is saturated/ } @output)) {
$results->{saturated} = $self->{'saturated'};
my $min = 999;
foreach my $line (grep { /is saturated/ } @output) {
$line =~ /length = (\S+)/;
$min = $1 if ($1 < $min);
}
$results->{saturated} = sprintf("%2.3f",$min);
if (-e "$tmpdir/subtrees.out") {
my $treeio = Bio::TreeIO->new
('-format' => 'newick',
'-file' => "$tmpdir/subtrees.out");
my $tree_num= 1;
while ( my $tree = $treeio->next_tree ) {
$results->{trees}{$tree_num} = $tree;
$tree_num++;
}
}
chdir($cwd);
$self->{'results'} = $results;
$self->{'rc'} = $rc;
return undef;
}
foreach my $outline (@output) {
if ($outline =~ /lnL = (\S+)/) {
$results->{lnL} = $1;
}
if ($outline =~ /kappa = (\S+)/) {
$results->{kappa} = $1;
}
if ($outline =~ /omega = (\S+)/) {
$results->{omega} = $1;
}
}
if ( (grep { /\berr(or)?: /io } @output) || !$exit_status) {
warn("There was an error - see error_string for the program output");
warn('Error string: '.$self->{error_string}) if $self->debug;
$rc = 0;
}
eval {
open RESULTS, "$tmpdir/$outfile" or die "couldnt open results file: $!\n";
my $okay = 0;
my $sites;
my $type = 'default';
while (<RESULTS>) {
chomp $_;
if ( /^\#/ ) {
next;
}
if ( /\!/ ) {
$type = 'random';
} elsif ( /\+\+\+\+\s+/ ) {
$type = 'positive4';
} elsif ( /\+\+\+\s+/ ) {
$type = 'positive3';
} elsif ( /\+\+\s+/ ) {
$type = 'positive2';
} elsif ( /\+\s+/ ) {
$type = 'positive1';
} elsif ( /\-\-\-\-\s+/ ) {
$type = 'negative4';
} elsif ( /\-\-\-\s+/ ) {
$type = 'negative3';
} elsif ( /\-\-\s+/ ) {
$type = 'negative2';
} elsif ( /\-\s+/ ) {
$type = 'negative1';
} elsif ( /Constant/ ) {
$type = 'constant';
} elsif ( /All gaps/ ) {
$type = 'all_gaps';
} elsif ( /Single character/ ) {
$type = 'single_character';
} elsif ( /Synonymous/ ) {
$type = 'synonymous';
} else {
$type = 'default';
}
if ( /^\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/ ) {
push @{$sites->{$type}}, [$1,$2,$3,$4,$5,$6,$7,$8,$9];
} else {
warn("error parsing the results: $_\n");
}
}
$results->{sites} = $sites;
close RESULTS;
};
if ( $@ ) {
warn($self->{error_string});
}
chdir($cwd);
}
$self->{'results'} = $results;
$self->{'rc'} = $rc;
return undef; } |
sub store_sitewise_dNdS
{
my $self = shift;
printf("Sitewise_dNdS::store_sitewise_dNdS\n") if($self->debug);
my $runtime = time()*1000-$self->{starttime};
$self->{'protein_tree'}->store_tag('Sitewise_dNdS_runtime_msec', $runtime);
my $results = $self->{'results'};
my $aa_aln = $self->{'protein_tree'}->get_SimpleAlign;
my @gap_col_matrix = @{$aa_aln->gap_col_matrix};
$self->{memberDBA} = $self->{'comparaDBA'}->get_MemberAdaptor;
my $root_id = $self->{'protein_tree'}->node_id;
my $subroot_id = $self->{'protein_tree'}->subroot->node_id;
$self->{'protein_tree'}->store_tag('Sitewise_dNdS_subroot_id', $subroot_id);
my $threshold_on_branch_ds = $self->{'saturated'};
my $sth;
foreach my $type (keys %{$results->{sites}}) {
foreach my $position (@{$results->{sites}{$type}}) {
my ($site, $neutral, $optimal, $omega, $lower, $upper, $lrt_stat, $pval, $adj_pval, $q_value) = @$position;
my $nseq_ngaps = 0; foreach my $val (values %{$gap_col_matrix[$site-1]}) {$nseq_ngaps++ unless (1 == $val)};
my $optimalc; if (0 != $nseq_ngaps) {$optimalc = $optimal / $nseq_ngaps;} else {$optimalc = $optimal;} $sth = $self->{'comparaDBA'}->dbc->prepare ("INSERT INTO sitewise_aln (aln_position, node_id, tree_node_id, omega, omega_lower, omega_upper, optimal, ncod, threshold_on_branch_ds, type) VALUES (?,?,?,?,?,?,?,?,?,?)"); $sth->execute($site,
$root_id,
$subroot_id,
$omega,
$lower,
$upper,
$optimalc,
$nseq_ngaps,
$threshold_on_branch_ds,
$type);
}
}
$sth->finish();
return undef;
}
1; } |
sub write_output
{ my $self = shift;
unless (defined($self->{results})) {
$self->input_job->update_status('FAILED');
return undef;
}
$self->check_if_exit_cleanly;
if (defined($self->{'results'}{saturated})) {
$self->{'protein_tree'}->store_tag('Sitewise_dNdS_saturated', $self->{'results'}{saturated});
if (defined($self->{'results'}{trees})) {
$self->{treeDBA} = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
foreach my $subtree_num (keys %{$self->{'results'}{trees}}) {
my $subtree = $self->{'results'}{trees}{$subtree_num};
my @nodes;
foreach my $node ($subtree->get_nodes) {
if ($node->is_Leaf) {
push @nodes, $node->id;
}
}
my $members;
foreach my $node (@nodes) {
$node =~ s/\s+//;
$members->{$node} = 1;
}
my $partial_tree = $self->{treeDBA}->fetch_node_by_node_id($self->{'protein_tree'}->node_id);
foreach my $leaf (@{$partial_tree->get_all_leaves}) {
next if (defined($members->{$leaf->stable_id}));
$leaf->disavow_parent;
$partial_tree = $partial_tree->minimize_tree;
}
my $subroot = $partial_tree->node_id;
$partial_tree->release_tree;
next if ($partial_tree->num_leaves >= $self->{'protein_tree'}->num_leaves);
my $output_id = sprintf("{'protein_tree_id'=>%d, 'clusterset_id'=>1}", $subroot);
$self->input_job->input_id($output_id);
$self->dataflow_output_id($output_id, 2);
}
}
$self->input_job->update_status('FAILED');
$self->{'protein_tree'}->release_tree;
$self->{'protein_tree'} = undef;
warn("Sitewise_dNdS : cluster saturated, creating jobs for subtrees and FAIL it");
return undef;
} elsif (defined($self->{'results'}{sites})) {
$self->store_sitewise_dNdS;
} else {
$self->{'protein_tree'}->release_tree;
} } |
General documentation
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _