Raw content of Bio::EnsEMBL::ExternalData::Glovar::GlovarSNPAdaptor
=head1 NAME
Bio::EnsEMBL::ExternalData::Glovar::GlovarSNPAdaptor -
Object adaptor for Glovar SNPs
=head1 SYNOPSIS
$glodb = Bio::EnsEMBL::ExternalData::Glovar::DBAdaptor->new(
-user => 'ensro',
-pass => 'secret',
-dbname => 'snp',
-host => 'go_host',
-driver => 'Oracle'
);
my $glovar_adaptor = $glodb->get_GlovarSNPAdaptor;
my $snps = $glovar_adaptor->fetch_all_by_clone_accession('AL100005', 'AL100005', 1, 10000);
=head1 DESCRIPTION
This module is an entry point into a glovar database. It allows you to retrieve
SNPs from Glovar.
=head1 AUTHORS
Tony Cox
Patrick Meidl
=head1 CONTACT
Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::ExternalData::Glovar::GlovarSNPAdaptor;
use strict;
use Bio::EnsEMBL::Variation::VariationFeature;
use Bio::EnsEMBL::Variation::Variation;
use Bio::EnsEMBL::Variation::TranscriptVariation;
use Bio::EnsEMBL::Variation::PopulationGenotype;
use Bio::EnsEMBL::Variation::Population;
use Bio::EnsEMBL::Variation::Allele;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::ExternalData::Glovar::GlovarAdaptor;
use Bio::EnsEMBL::Utils::Eprof qw(eprof_start eprof_end eprof_dump);
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::ExternalData::Glovar::GlovarAdaptor);
# map Glovar validation states to Ensembl ones
our %VSTATE_MAP = (
'Observed' => 'observed',
'Sanger Verified' => 'submitter',
'Verified' => 'submitter',
'Two-hit' => 'doublehit',
'Non-polymorphic' => 'non-polymorphic',
'HapMap Verified' => 'hapmap',
);
# map Glovar consequence and position type to Ensembl consequence type
our %CONSEQUENCE_TYPE_MAP = (
'Coding Synonymous' => 'SYNONYMOUS_CODING',
'Coding Non-synonymous' => 'NON_SYNONYMOUS_CODING',
'Coding Sop gained' => 'STOP_GAINED',
'Coding Stop lost' => 'STOP_LOST',
'Non-coding exonic Non-coding' => 'UTR',
'Intronic Non-coding' => 'INTRONIC',
'Upstream Non-coding' => 'UPSTREAM',
);
=head2 fetch_all_by_clone_accession
Arg[1] : clone internal ID
Arg[2] : clone embl accession
Arg[3] : clone start coordinate
Arg[4] : clone end coordinate
Example : @list = @{$glovar_adaptor->fetch_all_by_clone_accession('AL100005', 'AL100005', 1, 10000)};
Description: Retrieves a list of SNPs on a clone in clone coordinates.
Returntype : Listref of Bio::EnsEMBL::Variation::VariationFeature objects
Exceptions : none
Caller : $self->fetch_all_by_Clone
=cut
sub fetch_all_by_clone_accession {
my ($self, $embl_acc, $embl_version, $cl_start, $cl_end) = @_;
#&eprof_start('clone_sql');
# get info on clone
my @cloneinfo = $self->fetch_clone_by_accession($embl_acc);
return([]) unless (@cloneinfo);
my ($nt_name, $id_seq, $clone_start, $clone_end, $clone_strand) = @cloneinfo;
## temporary hack for SNP density script: skip vega-specific clones
#unless ($nt_name =~ /^NT/) {
# warn "WARNING: Skipping vega-specific clone ($embl_acc).\n";
# return ([]);
#}
# now get the SNPs on this clone
# get only features in the desired region of the clone
my ($q_start, $q_end);
if ($clone_strand == 1) {
$q_start = $clone_start + $cl_start - 1;
$q_end = $clone_start + $cl_end + 1;
} else{
$q_start = $clone_end - $cl_end - 1;
$q_end = $clone_end - $cl_start + 1;
}
my $q2 = qq(
SELECT
distinct(sgc.id_snp) as id_snp,
ss.id_snp as internal_id,
ss.default_name as id_default,
ms.position as snp_start,
ms.end_position as snp_end,
ms.is_revcomp as snp_strand,
scd.description as validated,
ss.alleles as alleles,
svd.description as snpclass,
ss.is_private as private,
ptd.description as pos_type,
cs.design_entry as expt_id,
sgc_dict.description as consequence
FROM
mapped_snp ms,
snp,
snpvartypedict svd,
snp_confirmation_dict scd,
snp_summary ss
LEFT JOIN (
snp_gene_consequence sgc
INNER JOIN
coding_sequence cs on sgc.id_codingseq = cs.id_codingseq
AND cs.design_entry = ?
INNER JOIN
sgc_dict on sgc.consequence = sgc_dict.id_dict
) on sgc.id_snp = ss.id_snp
LEFT JOIN
position_type_dict ptd on sgc.position_description = ptd.id_dict
WHERE ms.id_sequence = ?
AND ms.id_snp = ss.id_snp
AND ss.id_snp = snp.id_snp
AND snp.var_type = svd.id_dict
AND ss.confirmation_status = scd.id_dict
AND ms.position BETWEEN $q_start AND $q_end
);
my $sth;
eval {
$sth = $self->prepare($q2);
$sth->execute($self->consequence_exp, $id_seq);
};
if ($@){
warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
return([]);
}
my (%varfeats, %cons);
while (my $row = $sth->fetchrow_hashref()) {
return([]) unless keys %{$row};
warn "WARNING: private data!" if $row->{'PRIVATE'};
## filter SNPs without strand (this is gruft in mapped_snp)
next unless $row->{'SNP_STRAND'};
## calculate coords depending on clone orientation
my ($start, $end);
$row->{'SNP_END'} ||= $row->{'SNP_START'};
if ($clone_strand == 1) {
$start = $row->{'SNP_START'} - $clone_start + 1;
$end = $row->{'SNP_END'} - $clone_start + 1;
} else {
$start = $clone_end - $row->{'SNP_END'} + 1;
$end = $clone_end -$row->{'SNP_START'} + 1;
}
my $strand = $row->{'SNP_STRAND'}*$clone_strand;
my $key = join(":", $row->{'ID_DEFAULT'}, $start, $end, $strand);
my $consequence_type = $CONSEQUENCE_TYPE_MAP{$row->{'POS_TYPE'}." ".$row->{'CONSEQUENCE'}} || '_';
my $cons_rank = $Bio::EnsEMBL::Variation::VariationFeature::CONSEQUENCE_TYPES{uc($consequence_type)} || 99;
if ((!$cons{$key}) or ($cons_rank < $cons{$key})) {
# VariationFeature
my $varfeat = Bio::EnsEMBL::Variation::VariationFeature->new_fast(
{
'dbID' => $row->{'INTERNAL_ID'},
'adaptor' => $self,
'variation_name' => $row->{'ID_DEFAULT'},
'start' => $start,
'end' => $end,
'strand' => $strand,
'allele_string' => $row->{'ALLELES'},
'source' => 'Glovar',
'validation_code' => [ $VSTATE_MAP{$row->{'VALIDATED'}} ],
'consequence_type' => [ $consequence_type ],
});
# add minimal Variation object (needed for DBLinks)
my $var = Bio::EnsEMBL::Variation::Variation->new(
-dbID => $row->{'INTERNAL_ID'},
-ADAPTOR => $self,
-NAME => $row->{'ID_DEFAULT'},
-SOURCE => 'Glovar',
);
$self->get_DBLinks($var);
$varfeat->variation($var);
$cons{$key} = $cons_rank;
$varfeats{$key} = $varfeat;
}
}
#&eprof_end('clone_sql');
#&eprof_dump(\*STDERR);
return [values %varfeats];
}
=head2 fetch_SNP_by_id
Arg[1] : String - Variation ID
Example : my $variation = $glovar_adaptor->fetch_SNP_by_id($id);
Description : retrieve variations from Glovar by ID
Return type : Listref of Bio::EnsEMBL::Variation::VariationFeature objects.
Exceptions : none
Caller : $self
=cut
sub fetch_SNP_by_id {
my ($self, $id) = @_;
#&eprof_start('fetch_snp_by_id');
my $dnadb = Bio::EnsEMBL::Registry->get_DNAAdaptor($ENV{'ENSEMBL_SPECIES'}, 'glovar');
unless ($dnadb) {
warn "ERROR: No dnadb attached to Glovar.\n";
return;
}
## SNP query
my $q1 = qq(
SELECT
distinct(ss.id_snp) as internal_id,
ss.default_name as id_default,
ms.position as snp_start,
ms.end_position as snp_end,
ms.is_revcomp as snp_strand,
scd.description as validated,
ss.alleles as alleles,
svd.description as snpclass,
cd.chromosome as chr_name,
sseq.id_sequence as nt_id,
sseq.database_seqnname as seq_name,
sseq.database_source as database_source
FROM
snp_sequence sseq,
mapped_snp ms,
snp,
snpvartypedict svd,
snp_confirmation_dict scd,
snp_summary ss,
chromosomedict cd
WHERE ss.default_name = ?
AND sseq.id_sequence = ms.id_sequence
AND ms.id_snp = ss.id_snp
AND ss.id_snp = snp.id_snp
AND snp.var_type = svd.id_dict
AND ss.confirmation_status = scd.id_dict
AND sseq.chromosome = cd.id_dict
AND sseq.is_current = 1
);
my @snps = ();
my $sth1;
eval {
$sth1 = $self->prepare($q1);
$sth1->execute($id);
};
if ($@){
warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
return([]);
}
# loop over all SNP mappings found and pick one; mappings to clones
# take preference over mappings to NT_contigs
my (@seq_nt, @seq_clone);
while (my $r = $sth1->fetchrow_hashref()) {
return([]) unless keys %{$r};
if ($r->{'SEQ_NAME'} =~ /^NT/) {
push @seq_nt, $r;
} else {
push @seq_clone, $r;
}
}
# if more than one NT or clone mapping has been returned, something is
# wrong with snp_sequence.is_current, so print a warning
if (@seq_nt > 1 or @seq_clone > 1) {
warning(
"More than one mapping of SNP to NT_contig ("
. join(", ", map { $_->{'SEQ_NAME'} } @seq_nt)
. ") or clone ("
. join(", ", map { $_->{'SEQ_NAME'} } @seq_clone)
. ")."
);
}
# loop over all mappings returned; try clones first; you're done once
# you've managed to transform to chromosomal coords successfully
my $varfeat;
SEQ:
foreach my $row (@seq_clone, @seq_nt) {
$row->{'SNP_END'} ||= $row->{'SNP_START'};
my $snp_start = $row->{'SNP_START'};
my $id_seq = $row->{'NT_ID'};
# get clone the SNP is on
my $q2 = qq(
SELECT
cs.database_seqname as embl_acc,
csm.start_coordinate as clone_start,
csm.end_coordinate as clone_end,
csm.contig_orientation as clone_strand
FROM clone_seq cs,
clone_seq_map csm
WHERE csm.id_sequence = '$id_seq'
AND cs.id_cloneseq = csm.id_cloneseq
AND (csm.start_coordinate < $snp_start)
AND (csm.end_coordinate > $snp_start)
);
my $sth2;
eval {
$sth2 = $self->prepare($q2);
$sth2->execute();
};
if ($@){
warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
return([]);
}
my $j;
CLONE:
while (my ($embl_acc, $clone_start, $clone_end, $clone_strand) = $sth2->fetchrow_array()) {
$j++;
## map to chromosome
# you might get more than one clone back from Glovar, since it
# contains the whole clones, not only the golden parts; if you
# managed to transform to chromosome, you already had the right
# one, so skip
next CLONE if ($varfeat);
# get clone from core db
# if no clone was found, you've picked the wrong snp_sequence
# try the next one
my $clone = $dnadb->get_SliceAdaptor->fetch_by_region('clone', $embl_acc);
next CLONE unless ($clone);
# calculate clone coordinates for SNP
my ($start, $end);
if ($clone_strand == 1) {
$start = $row->{'SNP_START'} - $clone_start + 1;
$end = $row->{'SNP_END'} - $clone_start + 1;
} else {
$start = $clone_end - $row->{'SNP_END'} + 1;
$end = $clone_end - $row->{'SNP_START'} + 1;
}
next CLONE if ($start < 0);
$varfeat = Bio::EnsEMBL::Variation::VariationFeature->new(-dbID => $row->{'INTERNAL_ID'});
$varfeat->slice($clone);
$varfeat->start($start);
$varfeat->end($end);
$varfeat->strand($row->{'SNP_STRAND'}*$clone_strand);
$varfeat = $varfeat->transform('chromosome');
}
warn "WARNING: Multiple clones ($j) returned" if ($j > 1);
# try next snp_sequence if you couldn't transform this one
next SEQ unless $varfeat;
my $var = Bio::EnsEMBL::Variation::Variation->new(-dbID => $row->{'INTERNAL_ID'});
$varfeat->variation_name($id);
$var->name($id);
$varfeat->seq_region_name($row->{'CHR_NAME'});
$varfeat->source('Glovar');
$var->source('Glovar');
$var->adaptor($self);
# alleles
$varfeat->allele_string($row->{'ALLELES'});
# temporary hack; change sql query to get data from snp_variation,
# allele_frequency, population
foreach my $al (split(/\|/, $row->{'ALLELES'})) {
my $allele = Bio::EnsEMBL::Variation::Allele->new(-allele => $al);
$var->add_Allele($allele);
}
# get flanking sequence from core
my $slice = $dnadb->get_SliceAdaptor->fetch_by_region(
'chromosome',
$row->{'CHR_NAME'},
$varfeat->start - 25,
$varfeat->end + 25,
);
$slice = $slice->invert if ($row->{'SNP_STRAND'} == -1);
my $seq = $slice->seq;
# determine end of upstream sequence depending on range type (in-dels
# of type "between", i.e. start !== end, are actually inserts)
my $up_end = 25;
$up_end++ if (($row->{'SNPCLASS'} eq "SNP - indel") && ($varfeat->start ne $varfeat->end));
$var->five_prime_flanking_seq(substr($seq, 0, $up_end));
$var->three_prime_flanking_seq(substr($seq, 26));
# consequences and DBLinks
$self->get_consequences($varfeat);
$self->get_DBLinks($var);
# validation state
$varfeat->add_validation_state($VSTATE_MAP{$row->{'VALIDATED'}});
$var->add_validation_state($VSTATE_MAP{$row->{'VALIDATED'}});
# population genotypes
# $self->get_population_genotypes($var);
# add variation to variationFeature
$varfeat->variation($var);
}
#&eprof_end('fetch_snp_by_id');
#&eprof_dump(\*STDERR);
$varfeat ? (return [$varfeat]) : (return([]));
}
=head2 get_DBLinks
Arg[1] : Bio::EnsEMBL::SNP object
Example : $glovar_adaptor->get_DBLinks($snp, '104567');
Description : adds external database links to snp object
Return type : none
Exceptions : none
Caller : $self
=cut
sub get_DBLinks {
my ($self, $var) = @_;
my $q = qq(
SELECT
snp_name.SNP_NAME as NAME,
snpnametypedict.DESCRIPTION as TYPE
FROM
snp_name,
snpnametypedict
WHERE snp_name.ID_SNP = ?
AND snp_name.SNP_NAME_TYPE = snpnametypedict.ID_DICT
);
my $sth;
eval {
$sth = $self->prepare($q);
$sth->execute($var->dbID);
};
if ($@){
warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
return;
}
while (my $xref = $sth->fetchrow_hashref()) {
$var->add_synonym($xref->{'TYPE'}, $xref->{'NAME'});
}
}
=head2 get_consequences
Arg[1] : Bio::EnsEMBL::Variation::Variation object
Example : $glovar_adaptor->get_consequences($var);
Description : Adds a TranscriptVariation object to the variation
Return type : none
Exceptions : none
Caller : $self
=cut
sub get_consequences {
my ($self, $varfeat) = @_;
my $q = qq(
SELECT
ptd.description as pos_type,
sgc_dict.description as consequence,
cs.name as transcript_stable_id,
sgc.transcript_position as cdna_start
FROM
coding_sequence cs,
snp_gene_consequence sgc
LEFT JOIN
position_type_dict ptd on sgc.position_description = ptd.id_dict
LEFT JOIN
sgc_dict on sgc.consequence = sgc_dict.id_dict
WHERE sgc.id_snp = ?
AND sgc.id_codingseq = cs.id_codingseq
AND cs.design_entry = ?
);
my $sth;
eval {
$sth = $self->prepare($q);
$sth->execute($varfeat->{'dbID'}, $self->consequence_exp);
};
if ($@){
warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
return;
}
while (my $row = $sth->fetchrow_hashref) {
# add consequence
my $consequence_type = $CONSEQUENCE_TYPE_MAP{$row->{'POS_TYPE'}." ".$row->{'CONSEQUENCE'}};
my $key = join(":", $row->{'TRANSCRIPT_STABLE_ID'}, $row->{'CDNA_START'}, $consequence_type);
# only add consequence once (workaround for duplicates in db)
my $found_tvar;
foreach my $oldtvar (@{ $varfeat->get_all_TranscriptVariations || [] }) {
if (join(":", $oldtvar->transcript->stable_id,
$oldtvar->cdna_start,
$oldtvar->consequence_type
) eq $key) {
$found_tvar = 1;
}
}
unless ($found_tvar) {
$varfeat->add_consequence_type( [$consequence_type] );
# add TranscriptVariation object
my $trans = Bio::EnsEMBL::Transcript->new(
-STABLE_ID => $row->{'TRANSCRIPT_STABLE_ID'},
);
my $tvar = Bio::EnsEMBL::Variation::TranscriptVariation->new_fast({
transcript => $trans,
cdna_start => $row->{'CDNA_START'},
cdna_end => $row->{'CDNA_START'},
consequence_type => [ $consequence_type ],
});
$varfeat->add_TranscriptVariation($tvar);
}
}
}
=head2 get_population_genotypes
Arg[1] : Bio::EnsEMBL::Variation::Variation object
Example : $self->get_population_genotypes($var);
Description : Adds PopulationGenotype objects to a variation by fetching
population genotype info from the db
Return type : none
Exceptions : thrown if no Bio::EnsEMBL::Variation::Variation is supplied
Caller : internal
=cut
sub get_population_genotypes {
my ($self, $var) = @_;
unless ($var->isa('Bio::EnsEMBL::Variation::Variation')) {
throw("You must provide a Bio::EnsEMBL::Variation::Variation object.");
}
my $q = qq(
SELECT
sv.var_string as allele,
af.frequency as frequency,
p.description as pop_name,
ao.sample_size as sample_size
FROM
snp_variation sv,
allele_frequency af,
assay_outcome ao,
population p
WHERE sv.id_snp = ?
AND sv.id_var = af.id_var
AND af.id_outcome = ao.id_outcome
AND ao.id_pop = p.id_pop
);
my $sth;
eval {
$sth = $self->prepare($q);
$sth->execute($var->dbID);
};
if ($@){
warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
return;
}
my $pop;
while (my $row = $sth->fetchrow_hashref) {
# collect genotypes by population
$pop->{$row->{'POP_NAME'}}->{'sample_size'} = $row->{'SAMPLE_SIZE'};
push @{ $pop->{$row->{'POP_NAME'}}->{'frequency'} }, $row->{'FREQUENCY'};
push @{ $pop->{$row->{'POP_NAME'}}->{'alleles'} }, $row->{'ALLELE'};
}
foreach my $p (keys %$pop) {
my $population = Bio::EnsEMBL::Variation::Population->new(
-dbID => $p,
-NAME => $p,
-SIZE => $pop->{$p}->{'size'},
);
# add PopulationGenotype for each genotype
# genotype frequencies are calculated by combinatorics from allele
# frequencies, which is biologically not correct.
# ToDo: find the right data in the database
my ($allele1, $allele2) = @{ $pop->{$p}->{'alleles'} };
my ($freq1, $freq2) = @{ $pop->{$p}->{'frequency'} };
# homozygous genotypes
my $pop_genotype = Bio::EnsEMBL::Variation::PopulationGenotype->new(
-dbID => $p,
-ALLELE1 => $allele1,
-ALLELE2 => $allele1,
-FREQUENCY => $freq1**2,
);
$pop_genotype->population($population);
$var->add_PopulationGenotype($pop_genotype);
my $pop_genotype = Bio::EnsEMBL::Variation::PopulationGenotype->new(
-dbID => $p,
-ALLELE1 => $allele2,
-ALLELE2 => $allele2,
-FREQUENCY => $freq2**2,
);
$pop_genotype->population($population);
$var->add_PopulationGenotype($pop_genotype);
# heterozygous genotype
my $pop_genotype = Bio::EnsEMBL::Variation::PopulationGenotype->new(
-dbID => $p,
-ALLELE1 => $allele1,
-ALLELE2 => $allele2,
-FREQUENCY => $freq1*$freq2*2,
);
$pop_genotype->population($population);
$var->add_PopulationGenotype($pop_genotype);
}
}
=head2 consequence_exp
Arg[1] : (optional) consequence experiment id
Example : $glovar_adaptor->consequence_ext(2046);
Description : getter/setter for the consequence experiment
(coding_sequence.design_entry in the glovar db)
Return type : String - consequence experiment id
Exceptions : none
Caller : general
=cut
sub consequence_exp {
my ($self, $exp) = @_;
if ($exp) {
$self->{'consequence_exp'} = $exp;
}
return $self->{'consequence_exp'};
}
=head2 get_source_version
Example : my $version = $glovar_adaptor->get_source_version;
Description : This method is just for compatibility with
Bio::EnsEMBL::Variation API.
Return type : undef
Exceptions : none
Caller : general
=cut
sub get_source_version {
return undef;
}
=head2 coordinate_systems
Arg[1] : none
Example : my @coord_systems = $glovar_adaptor->coordinate_systems;
Description : This method returns a list of coordinate systems which are
implemented by this class. A minimum of one valid coordinate
system must be implemented. Valid coordinate systems are:
'SLICE', 'ASSEMBLY', 'CONTIG', and 'CLONE'.
Return type : list of strings
Exceptions : none
Caller : internal
=cut
sub coordinate_systems {
return ('CLONE');
}
=head2 track_name
Arg[1] : none
Example : my $track_name = $snp_adaptor->track_name;
Description : returns the track name
Return type : String - track name
Exceptions : none
Caller : Bio::EnsEMBL::Slice,
Bio::EnsEMBL::ExternalData::ExternalFeatureAdaptor
=cut
sub track_name {
return("GlovarSNP");
}
1;