Bio::EnsEMBL::Compara::Production::GenomicAlignBlock
PhastCons
Toolbar
Summary
Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::PhastCons
Package variables
Privates (from "my" definitions)
$MSA_VIEW_EXE = "$BIN_DIR/msa_view"
$METHOD_LINK_TYPE = "PHASTCONS_CONSTRAINED_ELEMENT"
$BIN_DIR = "/software/ensembl/compara/phast"
$PHAST_CONS_EXE = "$BIN_DIR/phastCons"
Included modules
Inherit
Synopsis
$phastCons->fetch_input();
$phastCons->run();
$phastCons->write_output(); writes to database
Description
Given a genomic_align_tree Bio::EnsEMBL::Compara::DBSQL::GenomicAlignTree
identifier it fetches the alignment and the tree from a compara database and runs
the program phastCons. At the moment, only conserved elements are supported,
the parser for conservation scores is not implemented yet.
WARNING: This is preliminary version nd the paths/options/etc are hard-coded.
Please, refer to the TODO section for more details.
Methods
Methods description
Title : fetch_input Usage : $self->fetch_input Function: Fetches input data for gerp from the database Returns : none Args : none |
Title : run Usage : $self->run Function: Run gerp Returns : none Args : none |
Title : write_output Usage : $self->write_output Function: Write results to the database Returns : 1 Args : none |
Methods code
constrained_element_method_link_type | description | prev | next | Top |
sub constrained_element_method_link_type
{ my $self = shift;
$self->{'_constrained_element_method_link_type'} = shift if(@_);
return $self->{'_constrained_element_method_link_type'};
}
} |
sub fetch_input
{ my( $self) = @_;
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
$self->get_params($self->parameters);
$self->get_params($self->input_id);
my $gata = $self->{'comparaDBA'}->get_GenomicAlignTreeAdaptor;
my $root_id = $self->root_id;
$self->{gat} = $gata->fetch_node_by_node_id($root_id);
if ($root_id != $self->{gat}->node_id) {
die "Cannot find a tree with this node ID: ".$root_id."\n";
}
my $reference_leaves = [];
my @species_to_skip = ("Gorilla gorilla", "Pongo pygmaeus");
my @uninformative_species = ("Pan troglodytes");
my $reference_species = "Homo sapiens";
foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
my $species_name = $leaf->genomic_align_group->genome_db->name;
if (grep {$_ eq $species_name} @species_to_skip) {
$leaf->disavow_parent;
$self->{gat} = $self->{gat}->minimize_tree;
next;
}
$leaf->name("gat_".$leaf->node_id);
if (grep {$_ eq $species_name} @uninformative_species) {
push(@{$self->{uninformative_leaves}}, $leaf->node_id);
} elsif (!$reference_species or $species_name eq $reference_species) {
push(@{$self->{uninformative_leaves}}, $leaf->node_id);
push(@$reference_leaves, $leaf->node_id);
}
}
$self->{mod_file} = $self->worker_temp_directory . "gat_".$root_id.".mod";
open(MOD, ">".$self->{mod_file}) or die;
print MOD "ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
BACKGROUND: 0.295000 0.205000 0.205000 0.295000
-0.976030 0.165175 0.539722 0.271133
0.237691 -0.990352 0.189637 0.563024
0.776673 0.189637 -1.248143 0.281833
0.271133 0.391254 0.195849 -0.858237
TREE: ";
print MOD $self->{gat}->newick_format("simple"), "\n";
close(MOD);
foreach my $this_ref_node_id (@$reference_leaves) {
my $root_file_name = $self->worker_temp_directory . "gat_".$root_id.".$this_ref_node_id";
my $multifasta_file = "$root_file_name.mfa";
my $ref_fasta_file = "$root_file_name.fa";
my $suff_stats_file = "$root_file_name.ss";
open(MULTIFASTA, ">$multifasta_file") or die;
my $chr_name;
foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
next if ($leaf->node_id != $this_ref_node_id);
if ($leaf->genomic_align_group->get_all_GenomicAligns->[0]->dnafrag_strand == -1) {
$self->{gat}->reverse_complement;
}
print MULTIFASTA ">", $leaf->name, "\n";
my $aligned_sequence = $leaf->aligned_sequence;
$aligned_sequence =~ tr/./-/;
print MULTIFASTA $aligned_sequence, "\n";
open(FASTA, ">$ref_fasta_file ") or die;
print FASTA ">", $leaf->name, "\n";
my $original_sequence = $leaf->aligned_sequence;
$original_sequence =~ tr/./-/;
print FASTA $original_sequence, "\n";
close(FASTA);
last;
}
foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
next if ($leaf->node_id == $this_ref_node_id);
print MULTIFASTA ">", $leaf->name, "\n";
my $aligned_sequence = $leaf->aligned_sequence;
$aligned_sequence =~ tr/./-/;
print MULTIFASTA $aligned_sequence, "\n";
}
close(MULTIFASTA);
my $run_str = "$MSA_VIEW_EXE -i FASTA $multifasta_file -o SS".
" --refseq 2$ref_fasta_file > $suff_stats_file";
system($run_str) == 0
or die "$run_str failed: $?";
die if (!-e $suff_stats_file);
$self->{ss_files}->{$this_ref_node_id} = $suff_stats_file;
}
foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
free_aligned_sequence($leaf);
}
return 1; } |
sub free_aligned_sequence
{ my ($leaf) = @_;
my $genomic_align_group = $leaf->genomic_align_group;
foreach my $this_genomic_align (@{$genomic_align_group->get_all_GenomicAligns}) {
undef($this_genomic_align->{'aligned_sequence'});
}
}
1; } |
sub get_params
{ my $self = shift;
my $param_string = shift;
return unless($param_string);
my $params = eval($param_string);
return unless($params);
if (defined($params->{'program'})) {
$self->program($params->{'program'});
}
if (defined($params->{'param_file'})) {
$self->param_file($params->{'param_file'});
}
if (defined($params->{'tree_file'})) {
$self->tree_file($params->{'tree_file'});
}
if (defined($params->{'window_sizes'})) {
$self->window_sizes($params->{'window_sizes'});
}
if (defined($params->{'constrained_element_method_link_type'})) {
$self->constrained_element_method_link_type($params->{'constrained_element_method_link_type'});
}
if (defined($params->{'options'})) {
$self->options($params->{'options'});
}
if (defined($params->{'root_id'})) {
$self->root_id($params->{'root_id'});
}
if(defined($params->{'species_set'})) {
$self->species_set($params->{'species_set'});
}
return 1; } |
sub options
{ my $self = shift;
$self->{'_options'} = shift if(@_);
return $self->{'_options'};
}
} |
sub param_file
{ my $self = shift;
$self->{'_param_file'} = shift if(@_);
return $self->{'_param_file'};
}
} |
sub param_file_tmp
{ my $self = shift;
$self->{'_param_file_tmp'} = shift if(@_);
return $self->{'_param_file_tmp'};
}
} |
sub program
{ my $self = shift;
$self->{'_program'} = shift if(@_);
return $self->{'_program'}; } |
sub program_file
{ my $self = shift;
$self->{'_program_file'} = shift if(@_);
return $self->{'_program_file'};
}
} |
sub program_version
{ my $self = shift;
$self->{'_program_version'} = shift if(@_);
return $self->{'_program_version'};
}
} |
sub root_id
{ my $self = shift;
$self->{'_root_id'} = shift if(@_);
return $self->{'_root_id'};
}
} |
sub run
{ my $self = shift;
while (my ($node_id, $ss_file_name) = each %{$self->{ss_files}}) {
my $bed_file_name = $ss_file_name;
$bed_file_name =~ s/(\.ss)?$/\.bed/;
my $run_str = "$PHAST_CONS_EXE $ss_file_name ". $self->{mod_file}.
" -i SS --rho 0.3 --expected-length 45 --target-coverage 0.3";
if ($self->{uninformative_leaves} and @{$self->{uninformative_leaves}}) {
$run_str .= " --not-informative gat_".join(",gat_", grep {$_ ne $node_id} @{$self->{uninformative_leaves}});
}
$run_str .= " --most-conserved $bed_file_name --no-post-probs".
" --seqname gat_".$node_id." --idpref phastCons.$node_id";
print $run_str, "\n";
system($run_str) == 0
or die "$run_str failed: $?";
die if (!-e $bed_file_name);
$self->{bed_files}->{$node_id} = $bed_file_name;
} } |
sub tree_file
{ my $self = shift;
$self->{'_tree_file'} = shift if(@_);
return $self->{'_tree_file'};
}
} |
sub write_output
{ my ($self) = @_;
print STDERR "Write Output\n";
my $gat_method_link_species_set = $self->{gat}->get_all_leaves->[0]->
genomic_align_group->get_all_GenomicAligns->[0]->method_link_species_set;
my $phastCons_mlss = $gat_method_link_species_set;
$phastCons_mlss = $gat_method_link_species_set->adaptor->fetch_by_method_link_type_GenomeDBs(
$METHOD_LINK_TYPE, $phastCons_mlss->species_set);
print "MLSS: ", $phastCons_mlss->dbID;
my $constrained_element_adaptor = $gat_method_link_species_set->adaptor->db->get_ConstrainedElementAdaptor;
while (my ($node_id, $bed_file_name) = each %{$self->{bed_files}}) {
print "OUTPUT FOR NODE $node_id: $bed_file_name\n";
print qx""head -20 $bed_file_name", "[...]\n\n";
# Find the leaf
my $genomic_align_node;
foreach my $leaf (@{$self->{gat}->get_all_leaves}) {
if ($leaf->node_id == $node_id) {
$genomic_align_node = $leaf;
last;
}
}
## We always run phastCons on the FWD strand
my $genomic_align = $genomic_align_node->genomic_align_group->get_all_GenomicAligns->[0];
my $dnafrag_id = $genomic_align->dnafrag_id;
my $genomic_align_start = $genomic_align->dnafrag_start;
## Store the constrained elements
my @constrained_elements;
open(BED, $bed_file_name) or die;
while (<BED>) {
my ($seq_name, $start0, $end, @rest) = split(/\s+/, $_);
#create new genomic align blocks by converting alignment
#coords to chromosome coords
my $constrained_element_block;
my $constrained_element = new Bio::EnsEMBL::Compara::ConstrainedElement(
-reference_dnafrag_id => $dnafrag_id,
-start => $genomic_align_start + $start0,
-end => $genomic_align_start + $end - 1,
-method_link_species_set_id => $phastCons_mlss->dbID,
-score => 0,
);
push(@$constrained_element_block, $constrained_element);
push(@constrained_elements, $constrained_element_block);
}
close(BED);
#store in constrained_element table
$constrained_element_adaptor->store($phastCons_mlss,\@ constrained_elements);
}
return 1;
}
##########################################
#
# getter/setter methods
#
##########################################
#read from input_id from analysis_job table } |
General documentation
AUTHOR - Javier Herrero | Top |
This modules is part of the EnsEMBL project (
)
Questions can be posted to the ensembl-dev mailing list:
ensembl-dev@ebi.ac.uk
- Implement the parsing and storing of conservation scores
- Move the path to software to a configuration module
- Allow to specify the method_link_type for phastCons using the input_id or the parameters hash
- Allow to change the model file, in particular the alphabet, background distribution and
rate matrix.
- Allow to specify phastCons options using the input_id or the parameters hash
- Allow to specify the species you want to skip (if any), the uninformative species
and the reference species using the input_id or the parameters hash.
- Support to run phastCons with no reference sequence.
- Maybe store the MethodLinkSpeciesSet in the database if it doesn't exist yet.
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _