Raw content of Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor
#
# Ensembl module for Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor
#
# Cared for by Javier Herrero
#
# Copyright EnsEMBL Team
#
# You may distribute this module under the same terms as perl itself
#
# pod documentation - main docs before the code
=head1 NAME
Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor - An AlignSlice can be used to map genes from one species onto another one. This adaptor is used to fetch all the data needed for an AlignSlice from the database.
=head1 INHERITANCE
This module inherits attributes and methods from Bio::EnsEMBL::DBSQL::BaseAdaptor
=head1 SYNOPSIS
use Bio::EnsEMBL::Registry;
## Load adaptors using the Registry
Bio::EnsEMBL::Registry->load_all();
## Fetch the query slice
my $query_slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor(
"Homo sapiens", "core", "Slice");
my $query_slice = $query_slice_adaptor->fetch_by_region(
"chromosome", "14", 50000001, 50010001);
## Fetch the method_link_species_set
my $mlss_adaptor = Bio::EnsEMBL::Registry->get_adaptor(
"Compara26", "compara", "MethodLinkSpeciesSet");
my $method_link_species_set = $mlss_adaptor->fetch_by_method_link_type_registry_aliases(
"BLASTZ_NET", ["Homo sapiens", "Rattus norvegicus"]);
## Fetch the align_slice
my $align_slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor(
"Compara26",
"compara",
"AlignSlice"
);
my $align_slice = $align_slice_adaptor->fetch_by_Slice_MethodLinkSpeciesSet(
$query_slice,
$method_link_species_set,
"expanded"
);
=head1 OBJECT ATTRIBUTES
=over
=item db (from SUPER class)
=back
=head1 AUTHORS
Javier Herrero (jherrero@ebi.ac.uk)
=head1 COPYRIGHT
Copyright (c) 2004. EnsEMBL Team
You may distribute this module under the same terms as perl itself
=head1 CONTACT
This modules is part of the EnsEMBL project ()
Questions can be posted to the ensembl-dev mailing list:
ensembl-dev@ebi.ac.uk
=head1 APPENDIX
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
=cut
# Let the code begin...
package Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor;
use strict;
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw warning info);
use Bio::EnsEMBL::Compara::AlignSlice;
our @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
=head2 new (CONSTRUCTOR)
Arg :
Example :
Description: Creates a new AlignSliceAdaptor object
Returntype : Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor
Exceptions : none
Caller : Bio::EnsEMBL::Registry->get_adaptor
=cut
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
return $self;
}
=head2 fetch_by_Slice_MethodLinkSpeciesSet
Arg[1] : Bio::EnsEMBL::Slice $query_slice
Arg[2] : Bio::EnsEMBL::Compara::MethodLinkSpeciesSet $method_link_species_set
Arg[3] : [optional] boolean $expanded (def. FALSE)
Arg[4] : [optional] boolean $solve_overlapping (def. FALSE)
Arg[5] : [optional] Bio::EnsEMBL::Slice $target_slice
Example :
my $align_slice = $align_slice_adaptor->fetch_by_Slice_MethodLinkSpeciesSet(
$query_slice, $method_link_species_set);
Description: Fetches from the database all the data needed for the AlignSlice
corresponding to the $query_slice and the given
$method_link_species_set. Setting $expanded to anything different
from 0 or "" will create an AlignSlice in "expanded" mode. This means
that gaps are allowed in the reference species in order to allocate
insertions from other species.
By default overlapping alignments are ignored. You can choose to
reconciliate the alignments by means of a fake alignment setting the
solve_overlapping option to TRUE.
In order to restrict the AlignSlice to alignments with a given
genomic region, you can specify a target_slice. All alignments which
do not match this slice will be ignored.
Returntype : Bio::EnsEMBL::Compara::AlignSlice
Exceptions : thrown if wrong arguments are given
Caller : $object->methodname
=cut
sub fetch_by_Slice_MethodLinkSpeciesSet {
my ($self, $reference_slice, $method_link_species_set, $expanded, $solve_overlapping, $target_slice) = @_;
throw("[$reference_slice] is not a Bio::EnsEMBL::Slice")
unless ($reference_slice and ref($reference_slice) and
$reference_slice->isa("Bio::EnsEMBL::Slice"));
throw("[$method_link_species_set] is not a Bio::EnsEMBL::Compara::MethodLinkSpeciesSet")
unless ($method_link_species_set and ref($method_link_species_set) and
$method_link_species_set->isa("Bio::EnsEMBL::Compara::MethodLinkSpeciesSet"));
# Use cache whenever possible
my $key = $reference_slice->name.":".$method_link_species_set->dbID.":".($expanded?"exp":"cond").
":".($solve_overlapping?"fake-overlap":"non-overlap");
if (defined($target_slice)) {
throw("[$target_slice] is not a Bio::EnsEMBL::Slice")
unless ($target_slice and ref($target_slice) and
$target_slice->isa("Bio::EnsEMBL::Slice"));
$key .= ":".$target_slice->name();
}
return $self->{'_cache'}->{$key} if (defined($self->{'_cache'}->{$key}));
my $genomic_align_block_adaptor = $self->db->get_GenomicAlignBlockAdaptor;
my $genomic_align_blocks = $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice(
$method_link_species_set,
$reference_slice
);
## Remove all alignments not matching the target slice if any
if (defined($target_slice)) {
## Get the DnaFrag for the target Slice
my $target_dnafrag = $self->db->get_DnaFragAdaptor->fetch_by_Slice($target_slice);
if (!$target_dnafrag) {
throw("Cannot get a DnaFrag for the target Slice");
}
## Loop through all the alignment blocks and test whether they match the target slice or not
for (my $i = 0; $i < @$genomic_align_blocks; $i++) {
my $this_genomic_align_block = $genomic_align_blocks->[$i];
my $hits_the_target_slice = 0;
foreach my $this_genomic_align (@{$this_genomic_align_block->get_all_non_reference_genomic_aligns}) {
if ($this_genomic_align->dnafrag->dbID == $target_dnafrag->dbID and
$this_genomic_align->dnafrag_start <= $target_slice->end and
$this_genomic_align->dnafrag_end >= $target_slice->start) {
$hits_the_target_slice = 1;
last;
}
}
if (!$hits_the_target_slice) {
splice(@$genomic_align_blocks, $i, 1);
$i--;
}
}
}
my $genomic_align_trees = ();
my $species_order;
if ($method_link_species_set->method_link_class =~ /GenomicAlignTree/ and @$genomic_align_blocks) {
my $genomic_align_tree_adaptor = $self->db->get_GenomicAlignTreeAdaptor;
foreach my $this_genomic_align_block (@$genomic_align_blocks) {
# print $this_genomic_align_block->reference_genomic_align, "\n";
my $this_genomic_align_tree = $genomic_align_tree_adaptor->
fetch_by_GenomicAlignBlock($this_genomic_align_block);
push(@$genomic_align_trees, $this_genomic_align_tree);
# $this_genomic_align_tree->print();
# foreach my $this_ga (@{$this_genomic_align_tree->get_all_sorted_genomic_align_nodes}) {
# print $this_ga->genomic_align->genome_db->name(), "\n";
# }
}
my $last_node_id = undef;
my $tree_order;
foreach my $this_genomic_align_tree (@$genomic_align_trees) {
if ($last_node_id) {
$tree_order->{$this_genomic_align_tree->node_id}->{prev} = $last_node_id;
$tree_order->{$last_node_id}->{next} = $this_genomic_align_tree;
}
$last_node_id = $this_genomic_align_tree->node_id;
}
## First tree. Build the species order using the first tree only
foreach my $this_genomic_align_node (@{$genomic_align_trees->[0]->get_all_sorted_genomic_align_nodes}) {
next if (!@{$this_genomic_align_node->get_all_GenomicAligns});
my $this_genomic_align = $this_genomic_align_node->get_all_GenomicAligns->[0];
my $genome_db = $this_genomic_align->genome_db;
my $this_node_id = $this_genomic_align_node->node_id;
my $right_node_id = _get_right_node_id($this_genomic_align_node);
my $genomic_align_ids = [];
foreach my $each_genomic_align (@{$this_genomic_align_node->get_all_GenomicAligns}) {
push (@$genomic_align_ids, $each_genomic_align->dbID);
}
push(@$species_order,
{
genome_db => $genome_db,
right_node_id => $right_node_id,
genomic_align_ids => $genomic_align_ids,
# # last_node => $this_genomic_align_node,
});
}
$| = 1;
## Combine the first tree with the second, the resulting order with the third and so on
foreach my $this_genomic_align_tree (@$genomic_align_trees) {
my $next_genomic_align_tree = $tree_order->{$this_genomic_align_tree->node_id}->{next};
next if (!$next_genomic_align_tree);
# # # print STDERR "\nBEFORE:\n - ", join("\n - ", map {
# # # $_->{genome_db}->name." (".($_->{right_node_id} or "***").") [".
# # # join(" : ", @{$_->{genomic_align_ids}})."]"
# # # } @$species_order), "\n";
_combine_genomic_align_trees($species_order, $this_genomic_align_tree, $next_genomic_align_tree);
# $next_genomic_align_tree->print();
# # # print STDERR "\nAFTER:\n - ", join("\n - ", map {
# # # $_->{genome_db}->name." (".($_->{right_node_id} or "***").") [".
# # # join(" : ", @{$_->{genomic_align_ids}})."]"
# # # } @$species_order), "\n";
# # # ;
}
}
my $align_slice = new Bio::EnsEMBL::Compara::AlignSlice(
-adaptor => $self,
-reference_Slice => $reference_slice,
-Genomic_Align_Blocks => $genomic_align_blocks,
-Genomic_Align_Trees => $genomic_align_trees,
-species_order => $species_order,
-method_link_species_set => $method_link_species_set,
-expanded => $expanded,
-solve_overlapping => $solve_overlapping,
);
$self->{'_cache'}->{$key} = $align_slice;
return $align_slice;
}
=head2 fetch_by_GenomicAlignBlock
Arg[1] : Bio::EnsEMBL::Compara::GenomicAlignBlock $genomic_align_block
Arg[2] : [optional] boolean $expanded (def. FALSE)
Arg[3] : [optional] boolean $solve_overlapping (def. FALSE)
Example :
my $align_slice = $align_slice_adaptor->fetch_by_GenomicAlignBlock(
$genomic_align_block);
Description: Uses this genomic_aling_block to create an AlignSlice.
Setting $expanded to anything different
from 0 or "" will create an AlignSlice in "expanded" mode. This means
that gaps are allowed in the reference species in order to allocate
insertions from other species.
By default overlapping alignments are ignored. You can choose to
reconciliate the alignments by means of a fake alignment setting the
solve_overlapping option to TRUE.
Returntype : Bio::EnsEMBL::Compara::AlignSlice
Exceptions : thrown if arg[1] is not a Bio::EnsEMBL::Compara::GenomicAlignBlock
Exceptions : thrown if $genomic_align_block has no method_link_species_set
Caller : $object->methodname
=cut
sub fetch_by_GenomicAlignBlock {
my ($self, $genomic_align_block, $expanded, $solve_overlapping) = @_;
throw("[$genomic_align_block] is not a Bio::EnsEMBL::Compara::GenomicAlignBlock")
unless (UNIVERSAL::isa($genomic_align_block, "Bio::EnsEMBL::Compara::GenomicAlignBlock"));
my $method_link_species_set = $genomic_align_block->method_link_species_set();
throw("GenomicAlignBlock [$genomic_align_block] has no MethodLinkSpeciesSet")
unless ($method_link_species_set);
my $reference_genomic_align = $genomic_align_block->reference_genomic_align;
if (!$reference_genomic_align) {
$genomic_align_block->reference_genomic_align($genomic_align_block->get_all_GenomicAligns->[0]);
$reference_genomic_align = $genomic_align_block->reference_genomic_align;
}
my $reference_slice = $reference_genomic_align->get_Slice();
# Use cache whenever possible
my $key;
if ($genomic_align_block->dbID) {
$key = "gab_".$genomic_align_block->dbID.":".($expanded?"exp":"cond").
":".($solve_overlapping?"fake-overlap":"non-overlap");
} else {
$key = "gab_".$genomic_align_block.":".($expanded?"exp":"cond").
":".($solve_overlapping?"fake-overlap":"non-overlap");
}
return $self->{'_cache'}->{$key} if (defined($self->{'_cache'}->{$key}));
my $align_slice = new Bio::EnsEMBL::Compara::AlignSlice(
-adaptor => $self,
-reference_Slice => $reference_slice,
-Genomic_Align_Blocks => [$genomic_align_block],
-method_link_species_set => $method_link_species_set,
-expanded => $expanded,
-solve_overlapping => $solve_overlapping,
-preserve_blocks => 1,
);
$self->{'_cache'}->{$key} = $align_slice;
return $align_slice;
}
=head2 flush_cache
Arg[1] : none
Example : $align_slice_adaptor->flush_cache()
Description: Destroy the cache
Returntype : none
Exceptions : none
Caller : $object->methodname
=cut
sub flush_cache {
my ($self) = @_;
foreach my $align_slice (values (%{$self->{'_cache'}})) {
$align_slice->DESTROY;
}
undef $self->{'_cache'};
}
=head2 _combine_genomic_align_trees
Arg[1] : listref $species_order
Arg[2] : Bio::EnsEMBL::Compara::GenomicAlignTree $this_tree
Arg[3] : Bio::EnsEMBL::Compara::GenomicAlignTree $next_tree
Example :
Description: This method tries to accommodate the nodes in $next_tree
into $species_order. It uses several approaches. If there
is information available about left and right node IDs, it
will use it to link the nodes. Alternatively, it will rely
on the species names to do its best. When a new species name
appears in the $next_tree, it will try to insert it in the right
position.
Returntype : none
Exceptions : none
Caller : $object->methodname
=cut
sub _combine_genomic_align_trees {
my ($species_order, $this_tree, $next_tree) = @_;
# $this_tree is already taken into account in the species_order. $next_tree is the new info to combine
my $species_counter = 0;
my $existing_node_ids; # Lists all node_ids in the next tree
my $existing_right_node_ids;
my $next_species_names; # Lists all species names in the next tree
my $existing_species_names; # Lists all species names in the $species_order tracks
## Initialise values
foreach my $this_genomic_align_node (@{$next_tree->get_all_sorted_genomic_align_nodes}) {
my $this_node_id = $this_genomic_align_node->node_id;
$existing_node_ids->{$this_node_id} = 1;
push(@$next_species_names, $this_genomic_align_node->genomic_align_group->genome_db->name)
if ($this_genomic_align_node->genomic_align_group and
$this_genomic_align_node->genomic_align_group->genome_db->name ne "Ancestral sequences");
}
foreach my $species_def (@$species_order) {
my $right_node_id = $species_def->{right_node_id};
$existing_right_node_ids->{$right_node_id} = 1 if ($right_node_id);
push(@$existing_species_names, $species_def->{genome_db}->name);
}
## MAIN LOOP. For each of the nodes in $next_tree, try to find the best position in $species_order.
## First, rely on the right_node_id, then on the species_name. If $next_tree has a new species_name,
## include it. If no good position has been found, append the node to the end of the $species_order.
foreach my $this_genomic_align_node (@{$next_tree->get_all_sorted_genomic_align_nodes}) {
next if (!@{$this_genomic_align_node->get_all_GenomicAligns});
my $this_genomic_align = $this_genomic_align_node->get_all_GenomicAligns->[0];
my $this_genome_db = $this_genomic_align->genome_db;
my $this_node_id = $this_genomic_align_node->node_id;
my $this_right_node_id = _get_right_node_id($this_genomic_align_node);
my $these_genomic_align_ids = [];
foreach my $each_genomic_align (@{$this_genomic_align_node->get_all_GenomicAligns}) {
push (@$these_genomic_align_ids, $each_genomic_align->dbID);
}
## DEBUG info
# print "Inserting ", $this_genome_db->name, " into the species_order\n";
my $match = 0;
## SECONDARY LOOP. Note that the $species_counter is not reset at the end of the loop.
## This ensures that we do not add two nodes to the same species_order track and that we
## preserve the order in all existing and in the new tree.
while (!$match and $species_counter < @$species_order) {
my $species_genome_db = $species_order->[$species_counter]->{genome_db};
my $species_right_node_id = $species_order->[$species_counter]->{right_node_id};
$match = 1;
## 1. Use info from species_right_node_id if available
if (defined($species_right_node_id) and $species_right_node_id == $this_node_id) {
$species_order->[$species_counter]->{right_node_id} = $this_right_node_id;
# # $species_order->[$species_counter]->{last_node} = $this_genomic_align_node;
push (@{$species_order->[$species_counter]->{genomic_align_ids}}, @$these_genomic_align_ids);
## DEBUG info
# print "NODE LINK!\n";
# for (my $i = 0; $i<@$species_order; $i++) {
# if ($i == $species_counter) {
# print $species_order->[$i]->{genome_db}->name, "***\n";
# } else {
# print $species_order->[$i]->{genome_db}->name, "\n";
# }
# }
## 2. Force insertions in a math to the next right_node is expected
} elsif (defined($species_right_node_id) and exists($existing_node_ids->{$species_right_node_id})) {
splice(@$species_order, $species_counter, 0, {
genome_db => $this_genome_db,
right_node_id => $this_right_node_id,
genomic_align_ids => [@$these_genomic_align_ids],
});
## DEBUG info
# print "FORCE INSERT!\n";
# for (my $i = 0; $i<@$species_order; $i++) {
# if ($i == $species_counter) {
# print $species_order->[$i]->{genome_db}->name, "***\n";
# } else {
# print $species_order->[$i]->{genome_db}->name, "\n";
# }
# }
## 3. If there is no info about right node or this points to a node not found in next tree,
## rely on the species name
} elsif ($this_genome_db->name eq $species_genome_db->name
and (!defined($species_right_node_id) or
!defined($existing_node_ids->{$species_right_node_id}))
) {
$species_order->[$species_counter]->{right_node_id} = $this_right_node_id;
push (@{$species_order->[$species_counter]->{genomic_align_ids}}, @$these_genomic_align_ids);
## DEBUG info
# print "MATCH!\n";
# for (my $i = 0; $i<@$species_order; $i++) {
# if ($i == $species_counter) {
# print $species_order->[$i]->{genome_db}->name, "***\n";
# } else {
# print $species_order->[$i]->{genome_db}->name, "\n";
# }
# }
## 4. Insert this species if not found in the remaining set of existing species (in species_order)
} elsif (!defined($existing_right_node_ids->{$this_node_id})
and !grep {$_ eq $this_genome_db->name} @$existing_species_names) {
splice(@$species_order, $species_counter, 0, {
genome_db => $this_genome_db,
right_node_id => $this_right_node_id,
genomic_align_ids => [@$these_genomic_align_ids],
});
## DEBUG info
# print "INSERT!\n";
# for (my $i = 0; $i<@$species_order; $i++) {
# if ($i == $species_counter) {
# print $species_order->[$i]->{genome_db}->name, "***\n";
# } else {
# print $species_order->[$i]->{genome_db}->name, "\n";
# }
# }
## Unset $match, try with the next $species_order track.
} else {
$match = 0;
}
$species_counter++;
shift(@$existing_species_names);
}
## 5. We have not found any good position for this node: add a new track to the $species_order
if (!$match) {
push(@$species_order, {
genome_db => $this_genome_db,
right_node_id => $this_right_node_id,
genomic_align_ids => [@$these_genomic_align_ids],
});
$species_counter++;
## DEBUG info
# print "APPEND!\n";
# for (my $i = 0; $i<@$species_order; $i++) {
# if ($i == $species_counter) {
# print $species_order->[$i]->{genome_db}->name, "***\n";
# } else {
# print $species_order->[$i]->{genome_db}->name, "\n";
# }
# }
}
## DEBUG info
# print "[ENTER]";
# ;
shift(@$next_species_names);
}
return;
}
sub _get_right_node_id {
my ($this_genomic_align_node) = @_;
my $use_right = 1;
$use_right = 1 - $use_right if (!$this_genomic_align_node->root->get_original_strand);
my $neighbour_node;
if ($use_right) {
$neighbour_node = $this_genomic_align_node->right_node;
} else {
$neighbour_node = $this_genomic_align_node->left_node;
}
if ($neighbour_node) {
return $neighbour_node->node_id;
}
return undef;
}
1;