# Raw content of Bio::EnsEMBL::ExternalData::SangerSNP::VariationAdaptor
# EnsEMBL Sanger SNP adaptor
#
# Copyright EnsEMBL
#
# Author: Steve Searle
#
=head1 NAME
Bio::EnsEMBL::ExternalData::SangerSNP::VariationAdaptor
=head1 SYNOPSIS
A SNP adaptor which sits over the Sanger SNP database. Provides a means of
getting SNPs out of the Sanger SNP database as
Bio::EnsEMBL::Variation::VariationFeature objects.
=head1 CONTACT
Post questions to the EnsEMBL developer list:
=head1 APPENDIX
=cut
use strict;
package Bio::EnsEMBL::ExternalData::SangerSNP::VariationAdaptor;
use Bio::EnsEMBL::ExternalData::Variation;
use Bio::EnsEMBL::SNP;
use Bio::EnsEMBL::Variation::VariationFeature;
use Bio::EnsEMBL::Variation::Variation;
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::External::ExternalFeatureAdaptor;
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
# Inheritance set-up: this adaptor is both an EnsEMBL database adaptor and an
# external feature adaptor. With Perl's default method resolution the parents
# are searched left to right, so BaseAdaptor is consulted before
# ExternalFeatureAdaptor.
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor Bio::EnsEMBL::External::ExternalFeatureAdaptor );
# fetch_all_by_chr_start_end
#
# Fetch the Sanger SNPs mapped to a chromosome region and return them as
# Bio::EnsEMBL::Variation::VariationFeature objects.
#
#   Arg [1]  : $chr   - chromosome name, compared with CHROM_SEQ.DATABASE_SEQNAME
#   Arg [2]  : $start - lower bound for the mapped SNP position
#   Arg [3]  : $end   - upper bound for the mapped SNP position
#   Returns  : array reference of VariationFeature objects (possibly empty)
#
# The region filter is applied in SQL to
# (MAPPED_SNP.POSITION + SEQ_SEQ_MAP.START_COORDINATE - 1) using BETWEEN, i.e.
# both bounds are inclusive. Rows flagged with an IGNORE_REASON or lacking an
# orientation are excluded. A SNP that appears more than once with the same
# ID and start is reported on STDERR and only the first occurrence is kept.
#
# All caller-supplied values are passed to the database as bind values rather
# than being pasted into the SQL text.
# NOTE(review): if DATABASE_NAME / DATABASE_VERSION / DATABASE_SEQNAME are
# fixed-width CHAR columns on Oracle, confirm bound comparisons still match.
sub fetch_all_by_chr_start_end {
    my ($self, $chr, $start, $end) = @_;

    # The assembly string is split into a name (trailing digits removed) and
    # a version (the trailing digits only).
    my $assembly = $self->ensembl_db->get_CoordSystemAdaptor->fetch_all->[0]->version();
    (my $assembly_name    = $assembly) =~ s/[0-9]*$//;
    (my $assembly_version = $assembly) =~ s/[A-Z,a-z]*([0-9]*)$/$1/;

    my $query = q{
        SELECT MAPPED_SNP.ID_SNP,
               (MAPPED_SNP.POSITION + SEQ_SEQ_MAP.START_COORDINATE -1) AS snppos,
               (MAPPED_SNP.END_POSITION + SEQ_SEQ_MAP.START_COORDINATE -1) AS snpendpos,
               (MAPPED_SNP.IS_REVCOMP * SEQ_SEQ_MAP.CONTIG_ORIENTATION) AS snpstrand,
               CHROM_SEQ.DATABASE_SEQNAME as chrname,
               SNP_SUMMARY.ALLELES,
               SNP_SUMMARY.DEFAULT_NAME
          FROM DATABASE_DICT,
               CHROM_SEQ,
               SEQ_SEQ_MAP,
               MAPPED_SNP,
               SNP_SUMMARY
         WHERE DATABASE_DICT.DATABASE_NAME = ?
           AND DATABASE_DICT.DATABASE_VERSION = ?
           AND CHROM_SEQ.DATABASE_SOURCE = DATABASE_DICT.ID_DICT
           AND CHROM_SEQ.IS_CURRENT = 1
           AND CHROM_SEQ.DATABASE_SEQNAME = ?
           AND CHROM_SEQ.ID_CHROMSEQ = SEQ_SEQ_MAP.ID_CHROMSEQ
           AND MAPPED_SNP.ID_SEQUENCE = SEQ_SEQ_MAP.SUB_SEQUENCE
           AND SNP_SUMMARY.ID_SNP = MAPPED_SNP.ID_SNP
           AND MAPPED_SNP.IGNORE_REASON IS NULL
           AND MAPPED_SNP.IS_REVCOMP IS NOT NULL
           AND (MAPPED_SNP.POSITION + SEQ_SEQ_MAP.START_COORDINATE -1) BETWEEN ? AND ?
         ORDER BY MAPPED_SNP.ID_SNP, SNPPOS
    };

    my $sth = $self->prepare($query);
    # Bind order follows the placeholder order in the statement above.
    $sth->execute($assembly_name, $assembly_version, $chr, $start, $end);

    my @snps;
    my %seen;         # "ID_SNP:start" keys of features already emitted
    my %slice_for;    # chromosome name => Slice, fetched once per name

    while (my $row = $sth->fetchrow_hashref) {
        if ($row->{SNPSTRAND} != 1 && $row->{SNPSTRAND} != -1) {
            print STDERR "Got non 1 or -1 strand for " . $row->{ID_SNP} . "\n";
        }

        # Feature coordinates: take (SNPENDPOS, SNPPOS) when the position is
        # not below the end position, or when the allele string contains '-'
        # and the two positions are adjacent; otherwise (SNPPOS, SNPENDPOS).
        my $feat_start;
        my $feat_end;
        if ($row->{SNPPOS} >= $row->{SNPENDPOS} ||
            ($row->{ALLELES} =~ /-/ && abs($row->{SNPPOS} - $row->{SNPENDPOS}) == 1)) {
            $feat_start = $row->{SNPENDPOS};
            $feat_end   = $row->{SNPPOS};
        } else {
            $feat_start = $row->{SNPPOS};
            $feat_end   = $row->{SNPENDPOS};
        }

        my $seen_key = $row->{ID_SNP} . ":" . $feat_start;
        if (exists($seen{$seen_key})) {
            print STDERR "Warning: Skipping duplicate for " . $row->{ID_SNP} . " at $feat_start\n";
            next;
        }

        # new_fast blesses the hash directly, skipping argument checking.
        my $varfeat = Bio::EnsEMBL::Variation::VariationFeature->new_fast(
            {
                'dbID'           => $row->{ID_SNP},
                'adaptor'        => $self,
                'variation_name' => $row->{DEFAULT_NAME},
                'start'          => $feat_start,
                'end'            => $feat_end,
                'strand'         => $row->{SNPSTRAND},
                'allele_string'  => $row->{ALLELES},
                'source'         => 'SangerSNP',
            });

        # Look each chromosome slice up once instead of once per row.
        my $chr_name = $row->{CHRNAME};
        if (!exists $slice_for{$chr_name}) {
            $slice_for{$chr_name} =
                $self->ensembl_db->get_SliceAdaptor->fetch_by_region('chromosome', $chr_name);
        }
        $varfeat->slice($slice_for{$chr_name});

        # add minimal Variation object
        # NOTE(review): this object is constructed but never attached to
        # $varfeat or returned; the construction is kept unchanged - confirm
        # whether it was meant to be handed to the feature.
        my $var = Bio::EnsEMBL::Variation::Variation->new(
            -dbID    => $row->{'ID_SNP'},
            -ADAPTOR => $self,
            -NAME    => $row->{'DEFAULT_NAME'},
            -SOURCE  => 'Glovar',
        );

        push @snps, $varfeat;
        $seen{$seen_key} = 1;
    }

    return \@snps;
}
# coordinate_systems
#
# Report the coordinate system this adaptor declares: the single string
# "ASSEMBLY". Takes no arguments of interest (any invocant is ignored).
# NOTE(review): presumably consulted by the ExternalFeatureAdaptor parent
# class to decide which fetch method to call - confirm against that class.
sub coordinate_systems {
    return 'ASSEMBLY';
}
# fetch_by_dbID_position_range
#
# Fetch the mapping of one SNP that lies inside a given chromosome range.
#
#   Arg [1]  : $dbID        - SNP identifier (SNP_SUMMARY.ID_SNP)
#   Arg [2]  : $range_chr   - chromosome name the mapping must be on
#   Arg [3]  : $range_start - lowest accepted feature start
#   Arg [4]  : $range_end   - highest accepted feature start
#   Returns  : a Bio::EnsEMBL::Variation::VariationFeature, or undef when no
#              mapping of the SNP falls in the range
#
# All current mappings of the SNP are read from the database; the range test
# is applied in Perl to the feature start after the start/end ordering below.
# If several mappings qualify, a message is printed to STDERR and the first
# one (in ID_SNP, SNPPOS order) is returned.
#
# The assembly string is obtained the same way as in
# fetch_all_by_chr_start_end (first coordinate system's version), and all
# values are passed to the database as bind values instead of being pasted
# into the SQL text.
# NOTE(review): if DATABASE_NAME / DATABASE_VERSION are fixed-width CHAR
# columns on Oracle, confirm bound comparisons still match.
sub fetch_by_dbID_position_range {
    my ($self, $dbID, $range_chr, $range_start, $range_end) = @_;

    # Name = assembly string without trailing digits; version = those digits.
    my $assembly = $self->ensembl_db->get_CoordSystemAdaptor->fetch_all->[0]->version();
    (my $assembly_name    = $assembly) =~ s/[0-9]*$//;
    (my $assembly_version = $assembly) =~ s/[A-Z,a-z]*([0-9]*)$/$1/;

    my $query = q{
        SELECT DISTINCT MAPPED_SNP.ID_SNP,
               (MAPPED_SNP.POSITION + SEQ_SEQ_MAP.START_COORDINATE -1) AS snppos,
               (MAPPED_SNP.END_POSITION + SEQ_SEQ_MAP.START_COORDINATE -1) AS snpendpos,
               (MAPPED_SNP.IS_REVCOMP * SEQ_SEQ_MAP.CONTIG_ORIENTATION) AS snpstrand,
               CHROM_SEQ.DATABASE_SEQNAME as chrname,
               SNP_SUMMARY.ALLELES,
               SNP_SUMMARY.DEFAULT_NAME
          FROM DATABASE_DICT,
               CHROM_SEQ,
               SEQ_SEQ_MAP,
               MAPPED_SNP,
               SNP_SUMMARY
         WHERE DATABASE_DICT.DATABASE_NAME = ?
           AND DATABASE_DICT.DATABASE_VERSION = ?
           AND CHROM_SEQ.DATABASE_SOURCE = DATABASE_DICT.ID_DICT
           AND CHROM_SEQ.IS_CURRENT = 1
           AND CHROM_SEQ.ID_CHROMSEQ = SEQ_SEQ_MAP.ID_CHROMSEQ
           AND MAPPED_SNP.ID_SEQUENCE = SEQ_SEQ_MAP.SUB_SEQUENCE
           AND SNP_SUMMARY.ID_SNP = MAPPED_SNP.ID_SNP
           AND MAPPED_SNP.IS_REVCOMP IS NOT NULL
           AND SNP_SUMMARY.ID_SNP = ?
         ORDER BY MAPPED_SNP.ID_SNP, SNPPOS
    };

    my $sth = $self->prepare($query);
    # Bind order follows the placeholder order in the statement above.
    $sth->execute($assembly_name, $assembly_version, $dbID);

    my @snps;
    while (my $row = $sth->fetchrow_hashref) {
        # Feature coordinates: take (SNPENDPOS, SNPPOS) when the position is
        # not below the end position, or when the allele string contains '-'
        # and the two positions are adjacent; otherwise (SNPPOS, SNPENDPOS).
        my $feat_start;
        my $feat_end;
        if ($row->{SNPPOS} >= $row->{SNPENDPOS} ||
            ($row->{ALLELES} =~ /-/ && abs($row->{SNPPOS} - $row->{SNPENDPOS}) == 1)) {
            $feat_start = $row->{SNPENDPOS};
            $feat_end   = $row->{SNPPOS};
        } else {
            $feat_start = $row->{SNPPOS};
            $feat_end   = $row->{SNPENDPOS};
        }

        # Discard mappings on another chromosome or starting outside the range.
        if ($row->{CHRNAME} ne $range_chr || $feat_start < $range_start || $feat_start > $range_end) {
            next;
        }

        # new_fast blesses the hash directly, skipping argument checking.
        my $varfeat = Bio::EnsEMBL::Variation::VariationFeature->new_fast(
            {
                'dbID'           => $row->{ID_SNP},
                'adaptor'        => $self,
                'variation_name' => $row->{DEFAULT_NAME},
                'start'          => $feat_start,
                'end'            => $feat_end,
                'strand'         => $row->{SNPSTRAND},
                'allele_string'  => $row->{ALLELES},
                'source'         => 'SangerSNP',
            });
        $varfeat->slice($self->ensembl_db->get_SliceAdaptor->fetch_by_region('chromosome', $row->{CHRNAME}));

        # add minimal Variation object
        # NOTE(review): this object is constructed but never attached to
        # $varfeat or returned; the construction is kept unchanged - confirm
        # whether it was meant to be handed to the feature.
        my $var = Bio::EnsEMBL::Variation::Variation->new(
            -dbID    => $row->{'ID_SNP'},
            -ADAPTOR => $self,
            -NAME    => $row->{'DEFAULT_NAME'},
            -SOURCE  => 'Glovar',
        );

        push @snps, $varfeat;
    }

    if (scalar(@snps) > 1) {
        print STDERR "Got multiple vars for $dbID - only returning 1\n";
    }

    return $snps[0];
}
# fetch_all_by_dbID
#
# Fetch every current mapping of one SNP, on any chromosome.
#
#   Arg [1]  : $dbID - SNP identifier (SNP_SUMMARY.ID_SNP)
#   Returns  : array reference of Bio::EnsEMBL::Variation::VariationFeature
#              objects in ID_SNP, SNPPOS order (possibly empty)
#
# The assembly string is obtained the same way as in
# fetch_all_by_chr_start_end (first coordinate system's version), and all
# values are passed to the database as bind values instead of being pasted
# into the SQL text.
# NOTE(review): if DATABASE_NAME / DATABASE_VERSION are fixed-width CHAR
# columns on Oracle, confirm bound comparisons still match.
sub fetch_all_by_dbID {
    my ($self, $dbID) = @_;

    # Name = assembly string without trailing digits; version = those digits.
    my $assembly = $self->ensembl_db->get_CoordSystemAdaptor->fetch_all->[0]->version();
    (my $assembly_name    = $assembly) =~ s/[0-9]*$//;
    (my $assembly_version = $assembly) =~ s/[A-Z,a-z]*([0-9]*)$/$1/;

    my $query = q{
        SELECT DISTINCT MAPPED_SNP.ID_SNP,
               (MAPPED_SNP.POSITION + SEQ_SEQ_MAP.START_COORDINATE -1) AS snppos,
               (MAPPED_SNP.END_POSITION + SEQ_SEQ_MAP.START_COORDINATE -1) AS snpendpos,
               (MAPPED_SNP.IS_REVCOMP * SEQ_SEQ_MAP.CONTIG_ORIENTATION) AS snpstrand,
               CHROM_SEQ.DATABASE_SEQNAME as chrname,
               SNP_SUMMARY.ALLELES,
               SNP_SUMMARY.DEFAULT_NAME
          FROM DATABASE_DICT,
               CHROM_SEQ,
               SEQ_SEQ_MAP,
               MAPPED_SNP,
               SNP_SUMMARY
         WHERE DATABASE_DICT.DATABASE_NAME = ?
           AND DATABASE_DICT.DATABASE_VERSION = ?
           AND CHROM_SEQ.DATABASE_SOURCE = DATABASE_DICT.ID_DICT
           AND CHROM_SEQ.IS_CURRENT = 1
           AND CHROM_SEQ.ID_CHROMSEQ = SEQ_SEQ_MAP.ID_CHROMSEQ
           AND MAPPED_SNP.ID_SEQUENCE = SEQ_SEQ_MAP.SUB_SEQUENCE
           AND SNP_SUMMARY.ID_SNP = MAPPED_SNP.ID_SNP
           AND MAPPED_SNP.IS_REVCOMP IS NOT NULL
           AND SNP_SUMMARY.ID_SNP = ?
         ORDER BY MAPPED_SNP.ID_SNP, SNPPOS
    };

    my $sth = $self->prepare($query);
    # Bind order follows the placeholder order in the statement above.
    $sth->execute($assembly_name, $assembly_version, $dbID);

    my @snps;
    while (my $row = $sth->fetchrow_hashref) {
        # Feature coordinates: take (SNPENDPOS, SNPPOS) when the position is
        # not below the end position, or when the allele string contains '-'
        # and the two positions are adjacent; otherwise (SNPPOS, SNPENDPOS).
        my $feat_start;
        my $feat_end;
        if ($row->{SNPPOS} >= $row->{SNPENDPOS} ||
            ($row->{ALLELES} =~ /-/ && abs($row->{SNPPOS} - $row->{SNPENDPOS}) == 1)) {
            $feat_start = $row->{SNPENDPOS};
            $feat_end   = $row->{SNPPOS};
        } else {
            $feat_start = $row->{SNPPOS};
            $feat_end   = $row->{SNPENDPOS};
        }

        # new_fast blesses the hash directly, skipping argument checking.
        my $varfeat = Bio::EnsEMBL::Variation::VariationFeature->new_fast(
            {
                'dbID'           => $row->{ID_SNP},
                'adaptor'        => $self,
                'variation_name' => $row->{DEFAULT_NAME},
                'start'          => $feat_start,
                'end'            => $feat_end,
                'strand'         => $row->{SNPSTRAND},
                'allele_string'  => $row->{ALLELES},
                'source'         => 'SangerSNP',
            });
        $varfeat->slice($self->ensembl_db->get_SliceAdaptor->fetch_by_region('chromosome', $row->{CHRNAME}));

        # add minimal Variation object
        # NOTE(review): this object is constructed but never attached to
        # $varfeat or returned; the construction is kept unchanged - confirm
        # whether it was meant to be handed to the feature.
        my $var = Bio::EnsEMBL::Variation::Variation->new(
            -dbID    => $row->{'ID_SNP'},
            -ADAPTOR => $self,
            -NAME    => $row->{'DEFAULT_NAME'},
            -SOURCE  => 'Glovar',
        );

        push @snps, $varfeat;
    }

    return \@snps;
}
1;