Bio::EnsEMBL::Compara::RunnableDB Sitewise_dNdS
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Compara::RunnableDB::Sitewise_dNdS
Package variables
No package variables defined.
Included modules
Bio::AlignIO
Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
Bio::EnsEMBL::DBSQL::DBAdaptor
Bio::EnsEMBL::Hive
Bio::SimpleAlign
Bio::TreeIO
Cwd
File::Basename
Getopt::Long
IO::File
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Bio::EnsEMBL::Hive::Process
Synopsis
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $sitewise_dNdS = Bio::EnsEMBL::Compara::RunnableDB::Sitewise_dNdS->new
(
-db => $db,
-input_id => $input_id,
-analysis => $analysis
);
$sitewise_dNdS->fetch_input(); #reads from DB
$sitewise_dNdS->run();
$sitewise_dNdS->output();
$sitewise_dNdS->write_output(); #writes to DB
Description
This Analysis/RunnableDB is designed to take a ProteinTree or subtree
as input. This must already have a multiple alignment and a genetree
run on it. It uses the alignment and tree as input into the SLR
program which then generates annotations for the codons/peptides and
the branches in the genetree.
input_id/parameters format eg: "{'protein_tree_id'=>1234}"
protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
Methods
DESTROY
No description
Code
check_job_fail_options
No description
Code
fetch_inputDescriptionCode
get_params
No description
Code
print_params
No description
Code
runDescriptionCode
run_gblocks
No description
Code
run_sitewise_dNdS
No description
Code
store_sitewise_dNdS
No description
Code
write_outputDescriptionCode
Methods description
fetch_inputcode    nextTop
    Title   :   fetch_input
Usage : $self->fetch_input
Function: Fetches input data for repeatmasker from the database
Returns : none
Args : none
runcodeprevnextTop
    Title   :   run
Usage : $self->run
Function: runs SLR
Returns : none
Args : none
write_outputcodeprevnextTop
    Title   :   write_output
Usage : $self->write_output
Function: stores SLR annotations
Returns : none
Args : none
Methods code
DESTROYdescriptionprevnextTop
sub DESTROY {
  my $self = shift;

  if ($self->{'protein_tree'}) {
    printf("Sitewise_dNdS::DESTROY  releasing tree\n") if($self->debug);
    $self->{'protein_tree'}->release_tree;
    $self->{'protein_tree'} = undef;
  }

  $self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
}


##########################################
#
# internal methods
#
##########################################
}
check_job_fail_optionsdescriptionprevnextTop
sub check_job_fail_options {
    my $self = shift;

    if ($self->input_job->retry_count >= 2) {
      $self->dataflow_output_id($self->input_id, 2);
      $self->input_job->update_status('FAILED');

      if ($self->{'protein_tree'}) {
        $self->{'protein_tree'}->release_tree;
        $self->{'protein_tree'} = undef;
      }
      throw("Sitewise_dNdS job failed >=3 times: try something else and FAIL it");
    }
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_;

  $self->check_job_fail_options;
  throw("No input_id") unless defined($self->input_id);

  #create a Compara::DBAdaptor which shares the same DBI handle
#with the Pipeline::DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new ( -DBCONN=>$self->db->dbc ); $self->get_params($self->parameters); $self->get_params($self->input_id); $self->print_params if($self->debug); $self->check_if_exit_cleanly; unless ($self->{'protein_tree'}) { throw("undefined ProteinTree as input\n"); } my $num_leaves = $self->{'protein_tree'}->num_leaves; $self->{'protein_tree'}->print_tree(10) if ($self->debug); if ($num_leaves < 4) { $self->{'protein_tree'}->release_tree; $self->{'protein_tree'} = undef; return undef; # throw("Sitewise_dNdS : cluster size under 4 threshold and FAIL it");
} $self->{'cds_aln'} = $self->{'protein_tree'}->get_SimpleAlign(-cdna => 1); $self->{'tree'} = $self->{'protein_tree'}->newick_format("int_node_id"); return 1;
}
get_paramsdescriptionprevnextTop
sub get_params {
  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n") if($self->debug);

  my $params = eval($param_string);
  return unless($params);

  if (defined($params->{'protein_tree_id'})) {
    $self->{'protein_tree'} = 
      $self->{'comparaDBA'}->get_ProteinTreeAdaptor->
        fetch_node_by_node_id($params->{'protein_tree_id'});
  } elsif (defined($params->{'saturated'})) {
        $self->{'saturated'} = $params->{'saturated'};
  }

  return;
}
print_paramsdescriptionprevnextTop
sub print_params {
  my $self = shift;

  print("params:\n");
  print("  tree_id   : ", $self->{'protein_tree'}->node_id,"\n") if($self->{'protein_tree'});
}
rundescriptionprevnextTop
sub run {
  my $self = shift;
  $self->check_if_exit_cleanly;
  $self->run_sitewise_dNdS;
}
run_gblocksdescriptionprevnextTop
sub run_gblocks {
    my $self = shift;
    my $gmin = shift;
    my $aln  = shift;

    printf("Sitewise_dNdS::run_gblocks\n") if($self->debug);

    throw("Sitewise_dNdS : error getting Peptide SimpleAlign") unless (defined($aln));

    my $aln_length = $aln->length;
    my $tree_id = $self->{'protein_tree'}->node_id;
    my $tmpdir = $self->worker_temp_directory;
    my $filename = "$tmpdir". "$tree_id.fasta";
    my $tmpfile = Bio::AlignIO->new
      (-file => ">$filename",
       -format => 'fasta');
    $tmpfile->write_aln($aln);
    $tmpfile->close;
    my $min_leaves_gblocks = int(($self->{'protein_tree'}->num_leaves+1) * $gmin + 0.5);
    my $cmd = "echo -e\" o\n$filename\nt\nb\n2\n$min_leaves_gblocks\n5\n5\ng\nm\nq\n\" | /software/ensembl/compara/bin/Gblocks 2>/dev/null 1>/dev/null";

    my $ret = system("$cmd");
    open FLANKS, "$filename-gb.htm" or die "$!\n";
    my $segments_string;
    while (<FLANKS>) {
      chomp $_;
      next unless ($_ =~ /Flanks/);
      $segments_string = $_;
      last;
    }
    close FLANKS;
    $segments_string =~ s/Flanks\: //g;
    $segments_string =~ s/\s+$//g;

    $self->{flanks} = $segments_string;
    $self->{'protein_tree'}->store_tag('Gblocks_flanks', $segments_string);
}
run_sitewise_dNdSdescriptionprevnextTop
sub run_sitewise_dNdS {
  my $self = shift;

  return undef unless (defined($self->{protein_tree}));
  $self->{starttime} = time()*1000;

  my $slrexe = $self->analysis->program_file;
  unless (-e $slrexe) {
    $slrexe = "/software/ensembl/compara/bin/Slr_ensembl";
  }

  throw("can't find an slr executable to run\n") 
    unless(-e $slrexe);

  my $aln = $self->{'cds_aln'};

  my $tree_string = $self->{'tree'};
  open(my $fake_fh, "+<",\$ tree_string);
  my $treein = new Bio::TreeIO
    (-fh => $fake_fh,
     -format => 'newick');
  my $tree = $treein->next_tree;
  $treein->close;
  throw("can't find cds_aln\n") if ( ! $aln );
  throw("can't find tree\n") if ( ! $tree );

  # Reorder the alignment according to the tree
my $ct = 1; my %order; foreach my $node ($tree->get_leaf_nodes) { $order{$node->id} = $ct++; } my @seq; my @ids; foreach my $seq ( $aln->each_seq() ) { push @seq, $seq; push @ids, $seq->display_id; } # use the map-sort-map idiom:
my @sorted = map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq; my $sorted_aln = Bio::SimpleAlign->new(); foreach (@sorted) { $sorted_aln->add_seq($_); } # Mask the aligment
$self->run_gblocks(0.5,$sorted_aln); my $mask; my $aln_length = $sorted_aln->length; foreach my $seq ($sorted_aln->each_seq) { my $seqstring = $seq->seq; my $start = 1; foreach my $segment ($self->{flanks} =~ /(\[\d+ \d+\])/g) { my ($segm_start,$segm_end) = $segment =~ /\[(\d+) (\d+)\]/; my $mask = 'N' x ($segm_start-$start); if (0 != $segm_start-$start) { substr($seqstring,($start-1),($segm_start-$start),$mask); } $start = $segm_end + 1; } if ($start < $aln_length) { # last segment to mask
my $mask = 'N' x ($aln_length-$start+1); substr($seqstring,($start-1),($aln_length-$start+1),$mask); } $seq->seq($seqstring); } throw("malformed masked alignment of the wrong length") unless ($aln_length == $sorted_aln->length); ####
my $tmpdir = $self->worker_temp_directory; my $alnout = Bio::AlignIO->new ('-format' => 'phylip', '-file' => ">$tmpdir/aln", '-interleaved' => 0, '-idlinebreak' => 1, '-idlength' => $aln->maxdisplayname_length + 1); $alnout->write_aln($sorted_aln); $alnout->close(); undef $alnout; my $treeout = Bio::TreeIO->new('-format' => 'newick', '-file' => ">$tmpdir/tree"); # We need to add a line with the num of leaves ($ct-1) and the
# num of trees (1)
$treeout->_print(sprintf("%d 1\n",($ct-1))); $treeout->write_tree($tree); $treeout->close(); # now let's print the ctl file.
# many of the these programs are finicky about what the filename is
# and won't even run without the properly named file.
my $slr_ctl = "$tmpdir/slr.ctl"; open(SLR, ">$slr_ctl") or throw("cannot open $slr_ctl for writing"); print SLR "seqfile\: aln\n"; print SLR "treefile\: tree\n"; my $outfile = "slr.res"; print SLR "outfile\: $outfile\n"; if (defined($self->{'saturated'})) { print SLR "saturated\: ". $self->{'saturated'} . "\n"; } if (defined($self->{'gencode'})) { print SLR "gencode\: ". $self->{'gencode'} . "\n"; } print SLR "aminof\: 1\n"; # aminof
close(SLR); my ($rc,$results) = (1); { my $cwd = cwd(); my $exit_status; chdir($tmpdir); my $run; my $quiet = ''; $quiet = ' 2>/dev/null' unless ($self->debug); $self->{'comparaDBA'}->dbc->disconnect_when_inactive(1); open($run, "$slrexe $quiet |") or throw("Cannot open exe $slrexe"); my @output = <$run>; $exit_status = close($run); $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0); $self->{error_string} = (join('',@output)); if ( (grep { /is saturated/ } @output)) { $results->{saturated} = $self->{'saturated'}; my $min = 999; foreach my $line (grep { /is saturated/ } @output) { $line =~ /length = (\S+)/; $min = $1 if ($1 < $min); } $results->{saturated} = sprintf("%2.3f",$min); if (-e "$tmpdir/subtrees.out") { my $treeio = Bio::TreeIO->new ('-format' => 'newick', '-file' => "$tmpdir/subtrees.out"); my $tree_num= 1; while ( my $tree = $treeio->next_tree ) { $results->{trees}{$tree_num} = $tree; $tree_num++; } } chdir($cwd); $self->{'results'} = $results; $self->{'rc'} = $rc; return undef; } foreach my $outline (@output) { if ($outline =~ /lnL = (\S+)/) { $results->{lnL} = $1; } if ($outline =~ /kappa = (\S+)/) { $results->{kappa} = $1; } if ($outline =~ /omega = (\S+)/) { $results->{omega} = $1; } } if ( (grep { /\berr(or)?: /io } @output) || !$exit_status) { warn("There was an error - see error_string for the program output"); warn('Error string: '.$self->{error_string}) if $self->debug; $rc = 0; } eval { open RESULTS, "$tmpdir/$outfile" or die "couldnt open results file: $!\n"; my $okay = 0; my $sites; my $type = 'default'; while (<RESULTS>) { chomp $_; if ( /^\#/ ) { next; } if ( /\!/ ) { $type = 'random'; } # random is last
elsif ( /\+\+\+\+\s+/ ) { $type = 'positive4'; } elsif ( /\+\+\+\s+/ ) { $type = 'positive3'; } elsif ( /\+\+\s+/ ) { $type = 'positive2'; } elsif ( /\+\s+/ ) { $type = 'positive1'; } elsif ( /\-\-\-\-\s+/ ) { $type = 'negative4'; } elsif ( /\-\-\-\s+/ ) { $type = 'negative3'; } elsif ( /\-\-\s+/ ) { $type = 'negative2'; } elsif ( /\-\s+/ ) { $type = 'negative1'; } elsif ( /Constant/ ) { $type = 'constant'; } elsif ( /All gaps/ ) { $type = 'all_gaps'; } elsif ( /Single character/ ) { $type = 'single_character'; } elsif ( /Synonymous/ ) { $type = 'synonymous'; } else { $type = 'default'; } if ( /^\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/ ) { push @{$sites->{$type}}, [$1,$2,$3,$4,$5,$6,$7,$8,$9]; } else { warn("error parsing the results: $_\n"); } } $results->{sites} = $sites; close RESULTS; }; if ( $@ ) { warn($self->{error_string}); } chdir($cwd); } $self->{'results'} = $results; $self->{'rc'} = $rc; return undef;
}
store_sitewise_dNdSdescriptionprevnextTop
sub store_sitewise_dNdS {
  my $self = shift;

  printf("Sitewise_dNdS::store_sitewise_dNdS\n") if($self->debug);

  my $runtime = time()*1000-$self->{starttime};
  $self->{'protein_tree'}->store_tag('Sitewise_dNdS_runtime_msec', $runtime);

  my $results = $self->{'results'};
  my $aa_aln = $self->{'protein_tree'}->get_SimpleAlign;
  my @gap_col_matrix = @{$aa_aln->gap_col_matrix};
  $self->{memberDBA} = $self->{'comparaDBA'}->get_MemberAdaptor;
  my $root_id = $self->{'protein_tree'}->node_id;

  # We store a tag with the subroot_id so that we can do easy mapping
# from subtree to root
my $subroot_id = $self->{'protein_tree'}->subroot->node_id; $self->{'protein_tree'}->store_tag('Sitewise_dNdS_subroot_id', $subroot_id); my $threshold_on_branch_ds = $self->{'saturated'}; my $sth; foreach my $type (keys %{$results->{sites}}) { # next if ($type =~ /all_gaps/);
foreach my $position (@{$results->{sites}{$type}}) { # Site Neutral Optimal Omega lower upper LRT_Stat Pval Adj.Pval Q-value Result Note
# 1 4.77 3.44 0.0000 0.0000 1.4655 2.6626 1.0273e-01 8.6803e-01 1.7835e-02 Constant;
# 0 1 2 3 4 5 6 7 8 9
my ($site, $neutral, $optimal, $omega, $lower, $upper, $lrt_stat, $pval, $adj_pval, $q_value) = @$position; my $nseq_ngaps = 0; foreach my $val (values %{$gap_col_matrix[$site-1]}) {$nseq_ngaps++ unless (1 == $val)}; my $optimalc; if (0 != $nseq_ngaps) {$optimalc = $optimal / $nseq_ngaps;} else {$optimalc = $optimal;}
$sth = $self->{'comparaDBA'}->dbc->prepare
("INSERT INTO sitewise_aln
(aln_position,
node_id,
tree_node_id,
omega,
omega_lower,
omega_upper,
optimal,
ncod,
threshold_on_branch_ds,
type) VALUES (?,?,?,?,?,?,?,?,?,?)");
$sth->execute($site, $root_id, $subroot_id, $omega, $lower, $upper, $optimalc, $nseq_ngaps, $threshold_on_branch_ds, $type); # This stuff is disabled right now
# my $stored_id = $sth->{'mysql_insertid'};
# if ($type !~ /default/) {
# foreach my $seq ($aln->each_seq) {
# next unless ($seq->display_id =~ /ENSP0/ || $seq->display_id =~ /ENSMUSP0/); # only store human and mouse
# my $seq_location;
# eval { $seq_location = $seq->location_from_column($site);};
# if ($@) {
# # gaps before the first nucleotide, skip
# next;
# }
# my $location_type;
# eval { $location_type = $seq_location->location_type;};
# if ($@) {
# # gaps before the first nucleotide, skip
# next;
# }
# if ($seq_location->location_type eq 'EXACT') {
# my $member = $self->{memberDBA}->fetch_by_source_stable_id("ENSEMBLPEP",$seq->display_id);
# my $member_id = $member->dbID;
# my $member_position = $seq_location->start;
# my $aa = $seq->subseq($seq_location->start,$seq_location->end);
# my $sth = $self->{'comparaDBA'}->dbc->prepare
# ("INSERT INTO sitewise_member
# (sitewise_id,
# member_id,
# member_position) VALUES (?,?,?)");
# $sth->execute($stored_id,
# $member_id,
# $member_position);
# }
# }
# }
} } $sth->finish(); return undef; } 1;
}
write_outputdescriptionprevnextTop
sub write_output {
  my $self = shift;

  unless (defined($self->{results})) {
    $self->input_job->update_status('FAILED');
    return undef;
  }
  $self->check_if_exit_cleanly;
  if (defined($self->{'results'}{saturated})) {
    $self->{'protein_tree'}->store_tag('Sitewise_dNdS_saturated', $self->{'results'}{saturated});
    # create another job for each subtree
if (defined($self->{'results'}{trees})) { $self->{treeDBA} = $self->{'comparaDBA'}->get_ProteinTreeAdaptor; foreach my $subtree_num (keys %{$self->{'results'}{trees}}) { my $subtree = $self->{'results'}{trees}{$subtree_num}; my @nodes; foreach my $node ($subtree->get_nodes) { if ($node->is_Leaf) { push @nodes, $node->id; } } my $members; foreach my $node (@nodes) { $node =~ s/\s+//; $members->{$node} = 1; } my $partial_tree = $self->{treeDBA}->fetch_node_by_node_id($self->{'protein_tree'}->node_id); # print STDERR "[subtree $subtree_num] ",time()-$self->{starttime}," secs...\n" if ($self->debug); $self->{starttime} = time();
foreach my $leaf (@{$partial_tree->get_all_leaves}) { next if (defined($members->{$leaf->stable_id})); $leaf->disavow_parent; $partial_tree = $partial_tree->minimize_tree; } my $subroot = $partial_tree->node_id; $partial_tree->release_tree; next if ($partial_tree->num_leaves >= $self->{'protein_tree'}->num_leaves); my $output_id = sprintf("{'protein_tree_id'=>%d, 'clusterset_id'=>1}", $subroot); $self->input_job->input_id($output_id); $self->dataflow_output_id($output_id, 2); } } $self->input_job->update_status('FAILED'); $self->{'protein_tree'}->release_tree; $self->{'protein_tree'} = undef; warn("Sitewise_dNdS : cluster saturated, creating jobs for subtrees and FAIL it"); return undef; #throw("Sitewise_dNdS : cluster saturated, creating jobs for subtrees and FAIL it");
} elsif (defined($self->{'results'}{sites})) { $self->store_sitewise_dNdS; } else { # something wrong went on
$self->{'protein_tree'}->release_tree; }
}
General documentation
CONTACTTop
  Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _