Bio::EnsEMBL::Compara::Production::GenomicAlignBlock Gerp
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::Gerp
Package variables
Privates (from "my" definitions)
%BIN_DIR;
$CONS_FILE_SUFFIX = ".elems"
$program_version = 2.1
$ALIGN_FILE = "gerp_alignment.mfa"
$TREE_FILE = "gerp_tree.nw"
$PARAM_FILE_SUFFIX = ".tmp"
$RATES_FILE_SUFFIX = ".rates"
Included modules
Bio::AlignIO
Bio::EnsEMBL::Compara::ConservationScore
Bio::EnsEMBL::Compara::DnaFragRegion
Bio::EnsEMBL::Compara::Graph::NewickParser
Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor
Bio::EnsEMBL::Hive::Process
Bio::EnsEMBL::Utils::Exception
Bio::LocatableSeq
Bio::SimpleAlign
File::Basename
Inherit
Bio::EnsEMBL::Hive::Process
Synopsis
    $gerp->fetch_input();
$gerp->run();
$gerp->write_output(); writes to database
Description
    Given a mulitple alignment Bio::EnsEMBL::Compara::DBSQL::GenomicAlignBlock 
identifier it fetches GenomicAlignBlocks from a compara database and runs
the program GERP.pl. It then parses the output and writes the constrained
elements in the GenomicAlignBlock table and the conserved scores in the
ConservationScore table
Methods
_build_tree_string
No description
Code
_calculateNeutralRate
No description
Code
_get_name_from_GenomicAlign
No description
Code
_parse_cons_file
No description
Code
_parse_rates_file
No description
Code
_parse_results
No description
Code
_parse_results_v2
No description
Code
_update_tree
No description
Code
_writeMultiFastaAlignment
No description
Code
constrained_element_method_link_type
No description
Code
fetch_inputDescriptionCode
free_aligned_sequence
No description
Code
genomic_align_block_id
No description
Code
get_params
No description
Code
options
No description
Code
param_file
No description
Code
param_file_tmp
No description
Code
program
No description
Code
program_file
No description
Code
program_version
No description
Code
runDescriptionCode
run_gerp
No description
Code
run_gerp_v2
No description
Code
species_set
No description
Code
tree_file
No description
Code
window_sizes
No description
Code
write_outputDescriptionCode
Methods description
fetch_inputcode    nextTop
    Title   :   fetch_input
Usage : $self->fetch_input
Function: Fetches input data for gerp from the database
Returns : none
Args : none
runcodeprevnextTop
    Title   :   run
Usage : $self->run
Function: Run gerp
Returns : none
Args : none
write_outputcodeprevnextTop
    Title   :   write_output
Usage : $self->write_output
Function: Write results to the database
Returns : 1
Args : none
Methods code
_build_tree_stringdescriptionprevnextTop
sub _build_tree_string {
    my ($self, $genomic_aligns) = @_;
    
    my $tree_file = $self->tree_file;

    my $newick = "";
    if (-e $tree_file) {
	open NEWICK_FILE, $tree_file || throw("Can not open $tree_file");
	$newick = join("", <NEWICK_FILE>);
	close NEWICK_FILE;
    } else {
	## Look in the meta table
my $meta_adaptor = $self->{'comparaDBA'}->get_MetaContainer; $tree_file =~ s/.*\/([^\/]+)$/$1/; $newick = $meta_adaptor->list_value_by_key("$tree_file")->[0]; } return if (!$newick); $newick =~ s/^\s*//; $newick =~ s/\s*$//; $newick =~ s/[\r\n]//g; my $tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick); $tree = $self->_update_tree($tree, $genomic_aligns); my $tree_string = $tree->newick_simple_format; # Remove quotes around node labels
$tree_string =~ s/"(_\d+_)"/$1/g; $tree->release_tree; $self->{'modified_tree_file'} = $self->worker_temp_directory . $TREE_FILE; open (TREE, ">$self->{'modified_tree_file'}") or throw "error writing alignment ($self->{'modified_tree_file'}) file\n"; print TREE $tree_string; close TREE; return $tree_string; } # Arg [1] : Bio::EnsEMBL::Compara::NestedSet $tree_root
# Example : $self->_update_nodes_names($tree);
# Description: This method updates the tree by removing or
# duplicating the leaves according to the original
# tree and the set of GenomicAligns. The tree nodes
# will be renamed .....
# Returntype : Bio::EnsEMBL::Compara::NestedSet (a tree)
# Exception :
# Warning :
}
_calculateNeutralRatedescriptionprevnextTop
sub _calculateNeutralRate {
    my ($tree_string) = @_;

    my @num_list = ($tree_string =~ /:(\d*.\d*)/g);
    my $neutral_rate = 0;
    foreach my $num (@num_list) {
	$neutral_rate += $num;
    }

    return $neutral_rate;
}

#write out multiple alignment in mfa format to $self->worker_temp_directory
}
_get_name_from_GenomicAligndescriptionprevnextTop
sub _get_name_from_GenomicAlign {
  my ($genomic_align) = @_;

  my $name = "_" . $genomic_align->dnafrag->genome_db->dbID . "_" .
    $genomic_align->dnafrag_id . "_" .
    $genomic_align->dnafrag_start . "_" .
    $genomic_align->dnafrag_end . "_";

  return $name;
}


1;
}
_parse_cons_filedescriptionprevnextTop
sub _parse_cons_file {
    my ($self, $cons_file, $version) = @_;

    my $gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
    my $gab = $gaba->fetch_by_dbID($self->genomic_align_block_id);

    my $mlssa = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor;

    my $mlss = $mlssa->fetch_by_method_link_type_genome_db_ids($self->constrained_element_method_link_type, 
							       $self->species_set);
    unless ($mlss) {
	throw("Invalid method_link_species_set\n");
    }
    my $constrained_element_adaptor = $self->{'comparaDBA'}->get_ConstrainedElementAdaptor;
    unless ($constrained_element_adaptor) {
	throw("could not get a constrained_element_adaptor\n");
    }

    open CONS, $cons_file || throw("Could not open $cons_file");
    my @constrained_elements;
    while (<CONS>) {
	unless (/^#/) {
                chomp;
                #extract info from constraints file
my ($start, $end, $length, $rej_subs, $p_value); ($start, $end, $length, $rej_subs, $p_value) = split /\s/,$_; #create new genomic align blocks by converting alignment
#coords to chromosome coords
my $constrained_gab = $gab->restrict_between_alignment_positions($start, $end, "skip"); my $constrained_element_block; my ($taxonomic_level) = join(" ", $mlss->name=~/\b[a-z]+\b/g); #feeble hack to get the taxonomic level
foreach my $genomic_align (@{$constrained_gab->get_all_GenomicAligns}) { my $constrained_element = new Bio::EnsEMBL::Compara::ConstrainedElement( -reference_dnafrag_id => $genomic_align->dnafrag_id, -start => $genomic_align->dnafrag_start, -end => $genomic_align->dnafrag_end, -score => $rej_subs, -p_value => $p_value, -method_link_species_set => $mlss->dbID, -taxonomic_level => $taxonomic_level, ); push(@$constrained_element_block, $constrained_element); } push(@constrained_elements, $constrained_element_block); } } close(CONS); #store in constrained_element table
$constrained_element_adaptor->store($mlss,\@ constrained_elements); } #This method parses the gerp rates file and stores the multiple alignment
#genomic align block id, window size, position, expected scores
#and difference (expected-observed) scores in the conservation_score table
#rates_file : full filename of rates file
#example : $self->_parse_rates_file(/tmp/worker.2580193/gerp_alignment.mfa.rates);
}
_parse_rates_filedescriptionprevnextTop
sub _parse_rates_file {
    my ($self, $rates_file, $version) = @_;
    my %win_cnt;
    my $win_sizes = eval($self->window_sizes);

    my $j;
    
    my $obs_no_score = -1.0; #uncalled observed score
my $exp_no_score = 0.0; #uncalled expected score
my $diff_no_score = 0.0; my $bucket; my $gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor; my $gab = $gaba->fetch_by_dbID($self->genomic_align_block_id); my $cs_adaptor = $self->{'comparaDBA'}->get_ConservationScoreAdaptor; #create and initialise bucket structure for each window size
for ($j = 0; $j < scalar(@$win_sizes); $j++) { $bucket->{$win_sizes->[$j]} = {called => 0, #no. called bases in block
win_called => 0, #no. called bases in window
cnt => 0, #no. bases in block
exp => 0, #current expected score
exp_scores => "",#expected score string
diff => 0, #current diff score
diff_scores => "",#diff score string
delete_cnt => 0, #no. uncalled bases
pos => 0, #current alignment position
start_pos => 1}; #alignment position of this block
} #length of uncalled region to allow neighboring called regions to be joined
#doesn't want to be too big because I need to store these uncalled values.
my $merge_dist = 10; #maximum called length before cut into smaller parts
my $max_called_dist = 1000; #read in rates file
open (RATES,$rates_file) or throw "Could not open rates ($rates_file) file\n"; while (<RATES>) { if (/^#/) { next; } chomp; my ($obs, $exp, $diff); if ($version == 1) { ($obs, $exp) = split/\t/,$_; if ($obs == $obs_no_score) { $diff = $diff_no_score; } else { $diff = $exp - $obs; } } elsif ($version == 2) { ($exp, $diff) = split/\t/,$_; } else { print STDERR "Invalid version, must be 1 or 2\n"; return; } for ($j = 0; $j < scalar(@$win_sizes); $j++) { #store called obs, exp and diff in each win_size bucket and keep a count of these with win_called
#if ($diff != $diff_no_score) {
if (($version == 1 && $obs != $obs_no_score) || ($version == 2 && $exp != $exp_no_score)) { $bucket->{$win_sizes->[$j]}->{exp} += $exp; $bucket->{$win_sizes->[$j]}->{diff} += $diff; $bucket->{$win_sizes->[$j]}->{win_called}++; } #total count for this bucket
$bucket->{$win_sizes->[$j]}->{win_cnt}++; #increment alignment position
$bucket->{$win_sizes->[$j]}->{pos}++; #if the bucket is full....
if ($bucket->{$win_sizes->[$j]}->{win_cnt} == $win_sizes->[$j]) { #re-initialise win_cnt
$bucket->{$win_sizes->[$j]}->{win_cnt} = 0; #FOUND CALLED SCORE
#if ($bucket->{$win_sizes->[$j]}->{diff} != $diff_no_score) {
#Use the fact that win_called keeps count of how many called
#scores I have. If win_called is 0, no called scores have
#been found
if ($bucket->{$win_sizes->[$j]}->{win_called} > 0) { #count how many called bases in this block
$bucket->{$win_sizes->[$j]}->{called}++; #if found delete_cnt uncalled bases which is less
#than merge_dist then add these to the
#relevant score string. If this makes the score
#string bigger than max_called_dist, then
#store the object in the database
if ($bucket->{$win_sizes->[$j]}->{delete_cnt} > 0 && $bucket->{$win_sizes->[$j]}->{delete_cnt} <= $merge_dist) { #add uncalled values to span merge_dist
for (my $i=0; $i < $bucket->{$win_sizes->[$j]}->{delete_cnt}; $i++) { $bucket->{$win_sizes->[$j]}->{cnt}++; $bucket->{$win_sizes->[$j]}->{exp_scores} .= "$exp_no_score "; $bucket->{$win_sizes->[$j]}->{diff_scores} .= "$diff_no_score "; #store in database
if ($bucket->{$win_sizes->[$j]}->{cnt} == $max_called_dist) { my $conservation_score = new Bio::EnsEMBL::Compara::ConservationScore( -genomic_align_block => $gab, -window_size => $win_sizes->[$j], -position => $bucket->{$win_sizes->[$j]}->{start_pos}, -expected_score => $bucket->{$win_sizes->[$j]}->{exp_scores}, -diff_score => $bucket->{$win_sizes->[$j]}->{diff_scores}); $cs_adaptor->store($conservation_score); #reset bucket values
$bucket->{$win_sizes->[$j]}->{cnt} = 0; $bucket->{$win_sizes->[$j]}->{called} = 0; $bucket->{$win_sizes->[$j]}->{exp_scores} = ""; $bucket->{$win_sizes->[$j]}->{diff_scores} = ""; $bucket->{$win_sizes->[$j]}->{start_pos} += $max_called_dist; } } } #first time found called score so save current pos
#as start_pos for this block
if ($bucket->{$win_sizes->[$j]}->{called} == 1) { #12.04.07 kfb fixed bug which occurs when the
#1000th score is in a region of 10 or less
#uncalled bases because the next start_pos was
#set to be the next called position instead of
#the next (uncalled) position.
#$bucket->{$win_sizes->[$j]}->{start_pos} = $bucket->{$win_sizes->[$j]}->{pos};
$bucket->{$win_sizes->[$j]}->{start_pos} = $bucket->{$win_sizes->[$j]}->{pos} - ($bucket->{$win_sizes->[$j]}->{cnt} * $win_sizes->[$j]); } #average over the number of called scores in a
#window and append to score string
#$bucket->{$win_sizes->[$j]}->{exp_scores} .= ($bucket->{$win_sizes->[$j]}->{exp}/$bucket->{$win_sizes->[$j]}->{win_called}) . " ";
#$bucket->{$win_sizes->[$j]}->{diff_scores} .= ($bucket->{$win_sizes->[$j]}->{diff}/$bucket->{$win_sizes->[$j]}->{win_called}) . " ";
$bucket->{$win_sizes->[$j]}->{exp_scores} .= ($bucket->{$win_sizes->[$j]}->{exp}/$win_sizes->[$j]) . " ";
$bucket->{$win_sizes->[$j]}->{diff_scores} .= ($bucket->{$win_sizes->[$j]}->{diff}/$win_sizes->[$j]) . " ";
#increment counter of total called scores in this
#block
$bucket->{$win_sizes->[$j]}->{cnt}++; #if have max_called_dist scores in the score
#string, then store them in the database
if ($bucket->{$win_sizes->[$j]}->{cnt} == $max_called_dist) { my $conservation_score = new Bio::EnsEMBL::Compara::ConservationScore( -genomic_align_block => $gab, -window_size => $win_sizes->[$j], -position => $bucket->{$win_sizes->[$j]}->{start_pos}, -expected_score => $bucket->{$win_sizes->[$j]}->{exp_scores}, -diff_score => $bucket->{$win_sizes->[$j]}->{diff_scores}); $cs_adaptor->store($conservation_score); #reinitialise bucket values
$bucket->{$win_sizes->[$j]}->{cnt} = 0; $bucket->{$win_sizes->[$j]}->{called} = 0; $bucket->{$win_sizes->[$j]}->{exp_scores} = ""; $bucket->{$win_sizes->[$j]}->{diff_scores} = ""; $bucket->{$win_sizes->[$j]}->{start_pos} += $max_called_dist; } #reset count for uncalled bases
$bucket->{$win_sizes->[$j]}->{delete_cnt} = 0; } else { #FOUND UNCALLED SCORE
#count how many uncalled bases in this block
$bucket->{$win_sizes->[$j]}->{delete_cnt}++; #add previous called values to the database
if ($bucket->{$win_sizes->[$j]}->{called} > 0 && $bucket->{$win_sizes->[$j]}->{delete_cnt} > $merge_dist) { my $conservation_score = new Bio::EnsEMBL::Compara::ConservationScore( -genomic_align_block => $gab, -window_size => $win_sizes->[$j], -position => $bucket->{$win_sizes->[$j]}->{start_pos}, -expected_score => $bucket->{$win_sizes->[$j]}->{exp_scores}, -diff_score => $bucket->{$win_sizes->[$j]}->{diff_scores}); $cs_adaptor->store($conservation_score); $bucket->{$win_sizes->[$j]}->{cnt} = 0; $bucket->{$win_sizes->[$j]}->{called} = 0; $bucket->{$win_sizes->[$j]}->{exp_scores} = ""; $bucket->{$win_sizes->[$j]}->{diff_scores} = ""; } } $bucket->{$win_sizes->[$j]}->{exp} = 0; $bucket->{$win_sizes->[$j]}->{diff} = 0; $bucket->{$win_sizes->[$j]}->{win_called} = 0; } } } #store last lot
for ($j = 0; $j < scalar(@$win_sizes); $j++) { if ($bucket->{$win_sizes->[$j]}->{delete_cnt} >= 0 && $bucket->{$win_sizes->[$j]}->{delete_cnt} <= $merge_dist) { my $conservation_score = new Bio::EnsEMBL::Compara::ConservationScore( -genomic_align_block => $gab, -window_size => $win_sizes->[$j], -position => $bucket->{$win_sizes->[$j]}->{start_pos}, -expected_score => $bucket->{$win_sizes->[$j]}->{exp_scores}, -diff_score => $bucket->{$win_sizes->[$j]}->{diff_scores}); $cs_adaptor->store($conservation_score); } } } # Arg [1] : array reference of genomic_aligns
# Example : $self->_build_tree_string();
# Description: This method sets the tree_string using the original
# species tree and the set of GenomicAligns. The
# tree is edited by the _update_tree method
# (see _update_tree elsewhere in this document)
# Returntype : -none-
# Exception :
# Warning :
}
_parse_resultsdescriptionprevnextTop
sub _parse_results {
  my ($self) = @_;

  my $alignment_file;

  #default values as defined in GERP.pl
my $rej_subs_min = 8.5; my $merge_distance = 1; #read in options from param file which GERP uses to generate it's output
#file names.
open (PARAM_FILE, $self->param_file) || throw "Could not open file " . $self->param_file; while (<PARAM_FILE>) { chomp; my ($flag,$value) = split /\s+/,$_; if ($flag eq "alignment") { $alignment_file = $self->worker_temp_directory . "/" . $value; } if ($flag eq "rej_subs_min") { $rej_subs_min = $value; } if ($flag eq "merge_distance") { $merge_distance = $value; } } close (PARAM_FILE); #generate rates and constraints file names
my $rates_file = "$alignment_file.rates"; my $cons_file = "$rates_file\_RS$rej_subs_min\_md$merge_distance\_cons.txt"; $self->_parse_cons_file($cons_file, 1); $self->_parse_rates_file($rates_file, 1); } #Parse results for Gerp version 2.1
}
_parse_results_v2descriptionprevnextTop
sub _parse_results_v2 {
  my ($self,) = @_;

  #generate rates and constraints file names
$self->_parse_rates_file($self->{'mfa_file'}.$RATES_FILE_SUFFIX, 2); $self->_parse_cons_file($self->{'mfa_file'}.$RATES_FILE_SUFFIX.$CONS_FILE_SUFFIX, 2); } #This method parses the gerp constraints file and stores the values as
#genomic_align_blocks
#cons_file : full filename of constraints file
#example : $self->_parse_cons_file(/tmp/worker.2580193/gerp_alignment.mfa.rates_RS8.5_md6_cons.txt);
}
_update_treedescriptionprevnextTop
sub _update_tree {
    my $self = shift;
    my $tree = shift;
    my $all_genomic_aligns = shift;

    my $idx = 1;
    foreach my $this_leaf (@{$tree->get_all_leaves}) {
	my $these_genomic_aligns = [];
	## Look for GenomicAligns belonging to this genome_db_id
foreach my $this_genomic_align (@$all_genomic_aligns) { if ($this_genomic_align->dnafrag->genome_db_id == $this_leaf->name) { push (@$these_genomic_aligns, $this_genomic_align); } } if (@$these_genomic_aligns == 1) { ## If only 1 has been found...
$this_leaf->name(_get_name_from_GenomicAlign($these_genomic_aligns->[0])); } elsif (@$these_genomic_aligns > 1) { ## If more than 1 has been found, create as many bifurcations as needed
for (my $i=0; $i<@$these_genomic_aligns-1; $i++) { my $new_node = new Bio::EnsEMBL::Compara::NestedSet; $new_node->name(_get_name_from_GenomicAlign($these_genomic_aligns->[$i])); $new_node->distance_to_parent(0); $this_leaf->add_child($new_node); my $new_internal_node = new Bio::EnsEMBL::Compara::NestedSet; $new_internal_node->distance_to_parent(0); $this_leaf->add_child($new_internal_node); $this_leaf = $new_internal_node; } $this_leaf->name(_get_name_from_GenomicAlign($these_genomic_aligns->[-1])); } else { ## If none has been found...
$this_leaf->disavow_parent; $tree = $tree->minimize_tree; } } if ($tree->get_child_count == 1) { my $child = $tree->children->[0]; $child->parent->merge_children($child); $child->disavow_parent; } ## Gerp wants unrooted trees. Takes one of the two children and attach the other one to it
if ($tree->get_child_count() == 2 and scalar(@{$tree->get_all_leaves()}) >= 3) { my $distance = $tree->children->[0]->distance_to_parent + $tree->children->[1]->distance_to_parent; my $new_root; my $new_child; if ($tree->children->[0]->get_child_count >= 2) { $new_root = $tree->children->[0]; $new_child = $tree->children->[1]; } else { $new_root = $tree->children->[1]; $new_child = $tree->children->[0]; } $new_root->disavow_parent(); $new_child->disavow_parent; $new_child->distance_to_parent($distance); $new_root->add_child($new_child); $tree = $new_root; } return $tree; } # Arg [1] : Bio::EnsEMBL::Compara::GenomicAlign
# Example : _get_name_from_GenomicAlign($genomic_align);
# Description:
# Returntype : string $name
# Exception :
# Warning :
}
_writeMultiFastaAlignmentdescriptionprevnextTop
sub _writeMultiFastaAlignment {
    my $self = shift;
    my $object = shift;

    #write out the alignment file
$self->{'mfa_file'} = $self->worker_temp_directory . $ALIGN_FILE; open (ALIGN, ">$self->{'mfa_file'}") or throw "error writing alignment ($self->{'mfa_file'}) file\n"; #create mfa file of multiple alignment from genomic align block
my $segments; if (UNIVERSAL::isa($object, "Bio::EnsEMBL::Compara::GenomicAlignTree")) { $segments = $object->get_all_leaves; } elsif (UNIVERSAL::isa($object, "Bio::EnsEMBL::Compara::GenomicAlignBlock")) { $segments = $object->get_all_GenomicAligns; } foreach my $this_segment (@{$segments}) { #my $seq_name = $genomic_align->dnafrag->genome_db->name;
#$seq_name =~ s/(\w*) (\w*)/$1_$2/;
#add _ to either side of genome_db id to make GERP like it
# Note: the name of the sequence must match the name of the
# corresponding leaf in the tree!
my $seq_name; if (UNIVERSAL::isa($this_segment, "Bio::EnsEMBL::Compara::GenomicAlignTree")) { $seq_name = $this_segment->name; } else { $seq_name = _get_name_from_GenomicAlign($this_segment); } my $aligned_sequence = $this_segment->aligned_sequence; $aligned_sequence =~ s/(.{80})/$1\n/g; $aligned_sequence =~ s/\./\-/g; chomp($aligned_sequence); print ALIGN ">$seq_name\n$aligned_sequence\n"; free_aligned_sequence($this_segment); } close ALIGN;
}
constrained_element_method_link_typedescriptionprevnextTop
sub constrained_element_method_link_type {
  my $self = shift;
  $self->{'_constrained_element_method_link_type'} = shift if(@_);
  return $self->{'_constrained_element_method_link_type'};
}

#read options from analysis table
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_;

  #create a Compara::DBAdaptor which shares the same DBI handle
#with $self->db (Hive DBAdaptor)
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc); $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0); #read from analysis table
$self->get_params($self->parameters); #read from analysis_job table
$self->get_params($self->input_id); unless (defined $self->program_version) { if (defined($self->analysis) and defined($self->analysis->program_version)) { $self->program_version($self->analysis->program_version); } else { $self->program_version($program_version); } } my $gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor; my $gab = $gaba->fetch_by_dbID($self->genomic_align_block_id); my $gas = $gab->get_all_GenomicAligns; #print "NUM GAS " . scalar(@$gas) . "\n";
#only run gerp if there are more than 2 genomic aligns. Gerp requires more
#than 2 sequences to be represented at a position
if (scalar(@$gas) > 2) { #decide whether to use GenomicAlignTree object or species tree.
my $mlss = $gab->method_link_species_set; my $method_link_class = $mlss->method_link_class; my $tree_string; if ($method_link_class =~ /GenomicAlignTree/) { #use GenomicAlignTree
my $gata = $self->{'comparaDBA'}->get_GenomicAlignTreeAdaptor; my $gat = $gata->fetch_by_GenomicAlignBlock($gab); foreach my $leaf (@{$gat->get_all_leaves}) { my $genomic_align = (sort {$a->dbID <=> $b->dbID} @{$leaf->genomic_align_group->get_all_GenomicAligns})[0]; my $name = "_" . $genomic_align->genome_db->dbID . "_" . $genomic_align->dnafrag_id . "_" . $genomic_align->dnafrag_start . "_" . $genomic_align->dnafrag_end . "_"; $leaf->name($name); } $tree_string = $gat->newick_simple_format(); $self->{'modified_tree_file'} = $self->worker_temp_directory . $TREE_FILE; open (TREE, ">$self->{'modified_tree_file'}") or throw "error writing alignment ($self->{'modified_tree_file'}) file\n"; print TREE $tree_string; close TREE; #write out multiple alignment as a mfa file.
$self->_writeMultiFastaAlignment($gat); } else { #use species tree
#write out modified tree depending on sequences represented in this
#genomic align block (based on Javier's _update_tree in Pecan.pm)
$tree_string = $self->_build_tree_string($gas); #write out multiple alignment as a mfa file.
$self->_writeMultiFastaAlignment($gab); } #if param_file defined, assume use GERP.pl else assume use gerpcol
if ($self->program_version == 1 && (defined $self->param_file)) { #calculate neutral rate if not given in the parameter file
open (PARAM_FILE, $self->param_file) || throw "Could not open file " . $self->param_file; my $neutral_rate; while (<PARAM_FILE>) { chomp; my ($flag,$value) = split /\s+/,$_; if ($flag eq "neutral_rate") { $neutral_rate = $value; } } close(PARAM_FILE); #copy param file into temporary param file in worker directory
my ($filename) = fileparse($self->param_file); $self->param_file_tmp($self->worker_temp_directory . $filename . $PARAM_FILE_SUFFIX); my $cp_cmd = "cp " . $self->param_file . " " . $self->param_file_tmp; unless (system ($cp_cmd) == 0) { throw("error copying " . $self->param_file . " to " . $self->param_file_tmp . "\n"); } #if param file doesn't have neutral rate, then append the calculated one
if (!defined $neutral_rate) { $neutral_rate = _calculateNeutralRate($tree_string); open (PARAM_FILE_TMP, ">>".$self->param_file_tmp) || throw "Could not open file " . $self->param_file_tmp . " for writing"; print PARAM_FILE_TMP "neutral_rate\t$neutral_rate\n"; } close (PARAM_FILE_TMP); } $self->{'run_gerp'} = 1; } else { $self->{'run_gerp'} = 0; } return 1;
}
free_aligned_sequencedescriptionprevnextTop
sub free_aligned_sequence {
    my ($leaf) = @_;

    my $genomic_align_group = $leaf->genomic_align_group;

    foreach my $this_genomic_align (@{$genomic_align_group->get_all_GenomicAligns}) {
	undef($this_genomic_align->{'aligned_sequence'});
    }
}


#run gerp version 1
}
genomic_align_block_iddescriptionprevnextTop
sub genomic_align_block_id {
  my $self = shift;
  $self->{'_genomic_align_block_id'} = shift if(@_);
  return $self->{'_genomic_align_block_id'};
}

#read species_set from analysis_job table
}
get_paramsdescriptionprevnextTop
sub get_params {
    my $self         = shift;
    my $param_string = shift;

    return unless($param_string);
    
    my $params = eval($param_string);
    return unless($params);

    if (defined($params->{'program'})) {
	$self->program($params->{'program'}); 
    }
    
    #read from parameters in analysis table
if (defined($params->{'param_file'})) { $self->param_file($params->{'param_file'}); } if (defined($params->{'tree_file'})) { $self->tree_file($params->{'tree_file'}); } if (defined($params->{'window_sizes'})) { $self->window_sizes($params->{'window_sizes'}); } if (defined($params->{'constrained_element_method_link_type'})) { $self->constrained_element_method_link_type($params->{'constrained_element_method_link_type'}); } if (defined($params->{'options'})) { $self->options($params->{'options'}); } #read from input_id in analysis_job table
if (defined($params->{'genomic_align_block_id'})) { $self->genomic_align_block_id($params->{'genomic_align_block_id'}); } if(defined($params->{'species_set'})) { $self->species_set($params->{'species_set'}); } return 1; } #Calculate the neutral rate of a tree by summing all the branch lengths.
#Returns the neutral rate
}
optionsdescriptionprevnextTop
sub options {
  my $self = shift;
  $self->{'_options'} = shift if(@_);
  return $self->{'_options'};
}

#read from parameters of analysis table
}
param_filedescriptionprevnextTop
sub param_file {
  my $self = shift;
  $self->{'_param_file'} = shift if(@_);
  return $self->{'_param_file'};
}

#read from parameters of analysis table
}
param_file_tmpdescriptionprevnextTop
sub param_file_tmp {
  my $self = shift;
  $self->{'_param_file_tmp'} = shift if(@_);
  return $self->{'_param_file_tmp'};
}


##########################################
#
# internal methods
#
##########################################
}
programdescriptionprevnextTop
sub program {
  my $self = shift;
  $self->{'_program'} = shift if(@_);
  return $self->{'_program'};
}
program_filedescriptionprevnextTop
sub program_file {
  my $self = shift;
  $self->{'_program_file'} = shift if(@_);
  return $self->{'_program_file'};
}

#read from parameters of analysis table
}
program_versiondescriptionprevnextTop
sub program_version {
  my $self = shift;
  $self->{'_program_version'} = shift if(@_);
  return $self->{'_program_version'};
}

#read from parameters of analysis table
}
rundescriptionprevnextTop
sub run {
    my $self = shift;

    #only run gerp if there are sufficient species present in the genomic align block
if (!$self->{'run_gerp'}) { return; } if ($self->program_version == 1) { $self->run_gerp; } elsif ($self->program_version == 2.1) { $self->run_gerp_v2; } else { throw("Invalid version number. Valid values are 1 or 2 or 2.1\n"); }
}
run_gerpdescriptionprevnextTop
sub run_gerp {
    my $self = shift;

    #change directory to where the temporary mfa and tree file are written
chdir $self->worker_temp_directory; unless (defined $self->program_file) { if (defined($self->analysis) and defined($self->analysis->program_file)) { $self->program_file($self->analysis->program_file); } else { #$self->program_file("$BIN_DIR[0]/GERP.pl");
$self->program_file("$BIN_DIR{$self->program_version}/GERP.pl"); } } throw($self->program_file . " is not executable Gerp::run ") unless ($self->program_file && -x $self->program_file); my $command = $self->program_file; if ($self->param_file) { $command .= " " . $self->param_file_tmp; } #run gerp with parameter file
unless (system($command) == 0) { throw("gerp execution failed\n"); } } #run gerp version 2.1
}
run_gerp_v2descriptionprevnextTop
sub run_gerp_v2 {
    my ($self, $bin_dir) = @_;
    my @program_files;
    my $gerpcol_path;
    my $gerpelem_path;

    #change directory to where the temporary mfa and tree file are written
chdir $self->worker_temp_directory; unless (defined $self->program_file) { if (defined($self->analysis) and defined($self->analysis->program_file)) { $gerpcol_path = $self->analysis->program_file . "/gerpcol"; $gerpelem_path = $self->analysis->program_file . "/gerpelem"; } else { $gerpcol_path = "$BIN_DIR{$self->program_version}/gerpcol"; $gerpelem_path = "$BIN_DIR{$self->program_version}/gerpelem"; } } throw($gerpcol_path . " is not executable Gerp::run ") unless ($gerpcol_path && -x $gerpcol_path); throw($gerpelem_path . " is not executable Gerp::run ") unless ($gerpelem_path && -x $gerpelem_path); #run gerpcol
my $command = $gerpcol_path; $command .= " -t " . $self->{'modified_tree_file'} . " -f " . $self->{'mfa_file'}; print STDERR "command $command\n"; unless (system($command) == 0) { throw("gerpcol execution failed\n"); } #run gerpelem
$command = $gerpelem_path; $command .= " -f " . $self->{'mfa_file'}.$RATES_FILE_SUFFIX; print STDERR "command $command\n"; unless (system($command) == 0) { throw("gerpelem execution failed\n"); } } #Parse results for Gerp version 1
#parse the param file to find the values required to determine what GERP
#called the rates and constrained elements
}
species_setdescriptionprevnextTop
sub species_set {
  my $self = shift;
  $self->{'_species_set'} = shift if(@_);
  return $self->{'_species_set'};
}

#read method_link_type from analysis table
}
tree_filedescriptionprevnextTop
sub tree_file {
  my $self = shift;
  $self->{'_tree_file'} = shift if(@_);
  return $self->{'_tree_file'};
}

#read from parameters of analysis table
}
window_sizesdescriptionprevnextTop
sub window_sizes {
  my $self = shift;
  $self->{'_window_sizes'} = shift if(@_);
  return $self->{'_window_sizes'};
}

#name of temporary parameter file
}
write_outputdescriptionprevnextTop
sub write_output {
    my ($self) = @_;
  
    #if haven't run gerp, don't try to store any results!
if (!$self->{'run_gerp'}) { return 1; } print STDERR "Write Output\n"; #parse results and store constraints and conserved elements in database
if ($self->program_version == 1) { $self->_parse_results; } elsif ($self->program_version == 2.1) { $self->_parse_results_v2; } else { throw("Invalid version number. Valid values are 1 or 2.1\n"); } return 1; } ##########################################
#
# getter/setter methods
#
##########################################
#read from input_id from analysis_job table
}
General documentation
AUTHOR - Kathryn BealTop
This modules is part of the Ensembl project
Email kbeal@ebi.ac.uk
CONTACTTop
This modules is part of the EnsEMBL project ()
Questions can be posted to the ensembl-dev mailing list: ensembl-dev@ebi.ac.uk
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _