Raw content of Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor
#
# Ensembl module for Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor
#
# Copyright (c) 2004 Ensembl
#
# You may distribute this module under the same terms as perl itself
#
#
=head1 NAME
Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor
=head1 SYNOPSIS
$vdb = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(...);
$db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...);
# tell the variation database where core database information can be
# be found
$vdb->dnadb($db);
$afa = $vdb->get_AlleleFeatureAdaptor();
$sa = $db->get_SliceAdaptor();
# Get a VariationFeature by its internal identifier
$af = $afa->fetch_by_dbID(145);
# get all AlleleFeatures in a region
$slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6);
foreach $af (@{$afa->fetch_all_by_Slice($slice)}) {
print $af->start(), '-', $af->end(), ' ', $af->allele(), "\n";
}
=head1 DESCRIPTION
This adaptor provides database connectivity for AlleleFeature objects.
Genomic locations of alleles in samples can be obtained from the
database using this adaptor. See the base class BaseFeatureAdaptor for more information.
=head1 AUTHOR - Daniel Rios
=head1 CONTACT
Post questions to the Ensembl development list ensembl-dev@ebi.ac.uk
=head1 METHODS
=cut
use strict;
use warnings;
package Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor;
use Bio::EnsEMBL::Variation::AlleleFeature;
use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::EnsEMBL::Utils::Sequence qw(expand);
use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code);
our @ISA = ('Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor');
=head2 from_IndividualSlice
Arg[0] : (optional) int $number
Example : $afa->from_IndividualSlice(1);
Description : Getter/Setter to know wether the Adaptor has been called
from and IndividualSlice (and must create the objects from the
genotype table) or from the StrainSlice (and must create the objects
from the allele table)
ReturnType : int
Exceptions : none
Caller : general
Status : At Risk
=cut
sub from_IndividualSlice{
my $self = shift;
return $self->{'from_IndividualSlice'} = shift if(@_);
return $self->{'from_IndividualSlice'};
}
=head2 fetch_all_by_Slice
Arg[0] : Bio::EnsEMBL::Slice $slice
Arg[1] : (optional) Bio::EnsEMBL::Variation::Individual $individual
Example : my $vf = $vfa->fetch_all_by_Slice($slice,$individual);
Description : Gets all the VariationFeatures in a certain Slice for a given
Individual (if provided)
ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature
Exceptions : thrown on bad arguments
Caller : general
Status : At Risk
=cut
sub fetch_all_by_Slice{
my $self = shift;
my $slice = shift;
my $individual = shift;
if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) {
throw('Bio::EnsEMBL::Slice arg expected');
}
if (defined $individual){
if(!ref($individual) || !$individual->isa('Bio::EnsEMBL::Variation::Individual')) {
throw('Bio::EnsEMBL::Variation::Individual arg expected');
}
if(!defined($individual->dbID())) {
throw("Individual arg must have defined dbID");
}
}
%{$self->{'_slice_feature_cache'}} = (); #clean the cache to avoid caching problems
my $genotype_adaptor = $self->db->get_IndividualGenotypeAdaptor; #get genotype adaptor
my $genotypes = $genotype_adaptor->fetch_all_by_Slice($slice,$individual); #and get all genotype data
my $afs = $self->SUPER::fetch_all_by_Slice($slice); #get all AlleleFeatures within the Slice
my $last_position = 0;
my $new_afs = [];
my $string1;
my $string2;
my $alleles = {}; #hash to alleles, so we can check the genotype is within the alleles
#we need to merge genotype data with AlleleFeatures to assign alleles
foreach my $af (@{$afs}){
$alleles = {}; #flush the hash
map {$alleles->{$_}++} split /\//,$af->{'_vf_allele_string'}; #create a hash with the alleles
#both, genotypes and af should be sorted
for (my $i = $last_position;$i<@{$genotypes};$i++){
#need to reverse the alleles if they are in different strand, since the genotype
#table is stored always in the positive strand
$string1 = $genotypes->[$i]->allele1;
$string2 = $genotypes->[$i]->allele2;
if ($af->seq_region_strand == -1){
$string1 =~ tr/ACGTN-/TGCAN-/;
$string2 =~ tr/ACGTN-/TGCAN-/;
}
#we need to check the genotypes are within the alleles of the variation
if ($genotypes->[$i]->start == $af->seq_region_start && (exists $alleles->{$string1} || exists $alleles->{$string2})){
$alleles = {}; #flush the alleles hash
delete $af->{'_vf_allele_string'}; #and remove the attribute
if ($af->seq_region_strand == -1){
$genotypes->[$i]->allele1($string1);
$genotypes->[$i]->allele2($string2);
}
if ($genotypes->[$i]->allele2 eq 'N'){
$af->{'_half_genotype'} = 1;
$af->allele_string($genotypes->[$i]->allele1); #for half genotypes
}
else{
$af->{'_half_genotype'} = 0;
#we need to put the allele and Individual in the AlleleFeature object
if($genotypes->[$i]->allele1 eq '-' and $genotypes->[$i]->allele1 eq '-'){
1;
}
$af->allele_string(ambiguity_code($genotypes->[$i]->allele1 . '/' . $genotypes->[$i]->allele2)); #for heterozigous alleles
}
# $af->allele_string($genotypes->[$i]->allele1); #if it is strain, both alleles should be the same, might be changed in the future
$af->individual($genotypes->[$i]->individual);
$last_position++;
push @{$new_afs},$af;
}
elsif ($genotypes->[$i]->start < $af->seq_region_start){
$last_position++; #this should not happen, it means the genotype has no allele feature
}
else{
last;
}
}
}
return $new_afs;
}
=head2 fetch_all_IndividualSlice
Arg[0] : Bio::EnsEMBL::Slice $slice
Arg[1] : Bio::EnsEMBL::Variation::Population $population
Example : my $vf = $vfa->fetch_all_by_Slice($slice,$population);
Description : Gets all the VariationFeatures in a certain Slice for a given
Population
ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature
Exceptions : thrown on bad arguments
Caller : general
Status : At Risk
=cut
sub fetch_all_IndividualSlice{
my $self = shift;
my $slice = shift;
if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) {
throw('Bio::EnsEMBL::Slice arg expected');
}
#indicate to select the data from the single_bp table
$self->_multiple_bp(0);
#call the method fetch_all_by_Slice_constraint with the population constraint for the single_bp
my $allele_features = $self->fetch_all_by_Slice($slice);
#remove from the cache the result of the query (would not run the second one)
# my $key = uc(join(':', $slice->name, $constraint));
# delete $self->{'_slice_feature_cache'}->{$key};
#indicate to select the data from the multiple_bp table
$self->_multiple_bp(1);
#and add the same information for the multiple_bp table
# my $allele_features_multiple = $self->fetch_all_by_Slice_constraint($slice,$constraint);
# push @{$allele_features},$allele_features_multiple if (@{$allele_features_multiple} > 0);
#and include then the result of both queries
# $self->{'_slice_feature_cache'}->{$key} = $allele_features;
return $allele_features;
}
sub _tables{
my $self = shift;
if ($self->from_IndividualSlice){
return (['variation_feature','vf'], ['individual_genotype_single_bp','ig']) if (!$self->_multiple_bp());
return (['variation_feature','vf'], ['individual_genotype_multiple_bp','ig']) if ($self->_multiple_bp());
}
else{
return (['variation_feature','vf'], ['source','s FORCE INDEX(PRIMARY)']);
}
}
sub _columns{
my $self = shift;
return ('vf.variation_id' ,'ig.sample_id', 'CONCAT(ig.allele_1,"|",ig.allele_2) as alleles',
'vf.seq_region_id', 'vf.seq_region_start', 'vf.seq_region_end',
'vf.seq_region_strand', 'vf.variation_name') if ($self->from_IndividualSlice());
return qw(vf.variation_id
vf.seq_region_id vf.seq_region_start vf.seq_region_end
vf.seq_region_strand vf.variation_name s.name vf.variation_feature_id vf.allele_string);
}
sub _default_where_clause{
my $self = shift;
return "ig.variation_id = vf.variation_id" if ($self->from_IndividualSlice());
return "vf.source_id = s.source_id";
}
sub _objs_from_sth{
my ($self, $sth, $mapper, $dest_slice) = @_;
#
# This code is ugly because an attempt has been made to remove as many
# function calls as possible for speed purposes. Thus many caches and
# a fair bit of gymnastics is used.
#
my $sa = $self->db()->dnadb()->get_SliceAdaptor();
my @features;
my %slice_hash;
my %sr_name_hash;
my %sr_cs_hash;
my ($variation_id, $seq_region_id,
$seq_region_start,$seq_region_end, $seq_region_strand, $variation_name, $source_name, $variation_feature_id, $allele_string );
$sth->bind_columns(\$variation_id,
\$seq_region_id,\$seq_region_start,\$seq_region_end,\$seq_region_strand,
\$variation_name, \$source_name, \$variation_feature_id, \$allele_string);
my $asm_cs;
my $cmp_cs;
my $asm_cs_vers;
my $asm_cs_name;
my $cmp_cs_vers;
my $cmp_cs_name;
if($mapper) {
$asm_cs = $mapper->assembled_CoordSystem();
$cmp_cs = $mapper->component_CoordSystem();
$asm_cs_name = $asm_cs->name();
$asm_cs_vers = $asm_cs->version();
$cmp_cs_name = $cmp_cs->name();
$cmp_cs_vers = $cmp_cs->version();
}
my $dest_slice_start;
my $dest_slice_end;
my $dest_slice_strand;
my $dest_slice_length;
if($dest_slice) {
$dest_slice_start = $dest_slice->start();
$dest_slice_end = $dest_slice->end();
$dest_slice_strand = $dest_slice->strand();
$dest_slice_length = $dest_slice->length();
}
FEATURE: while($sth->fetch()) {
#get the slice object
my $slice = $slice_hash{"ID:".$seq_region_id};
if(!$slice) {
$slice = $sa->fetch_by_seq_region_id($seq_region_id);
$slice_hash{"ID:".$seq_region_id} = $slice;
$sr_name_hash{$seq_region_id} = $slice->seq_region_name();
$sr_cs_hash{$seq_region_id} = $slice->coord_system();
}
#
# remap the feature coordinates to another coord system
# if a mapper was provided
#
if($mapper) {
my $sr_name = $sr_name_hash{$seq_region_id};
my $sr_cs = $sr_cs_hash{$seq_region_id};
($sr_name,$seq_region_start,$seq_region_end,$seq_region_strand) =
$mapper->fastmap($sr_name, $seq_region_start, $seq_region_end,
$seq_region_strand, $sr_cs);
#skip features that map to gaps or coord system boundaries
next FEATURE if(!defined($sr_name));
#get a slice in the coord system we just mapped to
if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) {
$slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} ||=
$sa->fetch_by_region($cmp_cs_name, $sr_name,undef, undef, undef,
$cmp_cs_vers);
} else {
$slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||=
$sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef,
$asm_cs_vers);
}
}
#
# If a destination slice was provided convert the coords
# If the dest_slice starts at 1 and is foward strand, nothing needs doing
#
if($dest_slice) {
if($dest_slice_start != 1 || $dest_slice_strand != 1) {
if($dest_slice_strand == 1) {
$seq_region_start = $seq_region_start - $dest_slice_start + 1;
$seq_region_end = $seq_region_end - $dest_slice_start + 1;
} else {
my $tmp_seq_region_start = $seq_region_start;
$seq_region_start = $dest_slice_end - $seq_region_end + 1;
$seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1;
$seq_region_strand *= -1;
}
#throw away features off the end of the requested slice
if($seq_region_end < 1 || $seq_region_start > $dest_slice_length) {
next FEATURE;
}
}
$slice = $dest_slice;
}
#allele and sample table comes from the Compressed genotype adaptor
# $allele = ambiguity_code($allele) if ($self->from_IndividualSlice);
push @features, Bio::EnsEMBL::Variation::AlleleFeature->new_fast(
{'start' => $seq_region_start,
'end' => $seq_region_end,
'strand' => $seq_region_strand,
'slice' => $slice,
'allele_string' => '',
'variation_name' => $variation_name,
'adaptor' => $self,
'source' => $source_name,
'_variation_id' => $variation_id,
'_variation_feature_id' => $variation_feature_id,
'_vf_allele_string' => $allele_string,
'_sample_id' => ''});
}
return\@features;
}
#internal function used to determine wether selecting data from the individual_genotype_single_bp table or the multiple_bp
sub _multiple_bp{
my $self = shift;
$self->{'multiple_bp'} = shift if (@_);
return $self->{'multiple_bp'};
}
=head2 get_all_synonym_sources
Args[1] : Bio::EnsEMBL::Variation::AlleleFeature vf
Example : my @sources = @{$af_adaptor->get_all_synonym_sources($af)};
Description : returns a list of all the sources for synonyms of this
AlleleFeature
ReturnType : reference to list of strings
Exceptions : none
Caller : general
Status : At Risk
: Variation database is under development.
=cut
sub get_all_synonym_sources{
my $self = shift;
my $af = shift;
my %sources;
my @sources;
if(!ref($af) || !$af->isa('Bio::EnsEMBL::Variation::AlleleFeature')) {
throw("Bio::EnsEMBL::Variation::AlleleFeature argument expected");
}
if (!defined($af->{'_variation_id'}) && !defined($af->{'variation'})){
warning("Not possible to get synonym sources for the AlleleFeature: you need to attach a Variation first");
return \@sources;
}
#get the variation_id
my $variation_id;
if (defined ($af->{'_variation_id'})){
$variation_id = $af->{'_variation_id'};
}
else{
$variation_id = $af->variation->dbID();
}
#and go to the varyation_synonym table to get the extra sources
my $source_name;
my $sth = $self->prepare(qq{SELECT s.name
FROM variation_synonym vs, source s
WHERE s.source_id = vs.source_id
AND vs.variation_id = ?
});
$sth->bind_param(1,$variation_id,SQL_INTEGER);
$sth->execute();
$sth->bind_columns(\$source_name);
while ($sth->fetch){
$sources{$source_name}++;
}
@sources = keys(%sources);
return \@sources;
}
1;