GlovarSNPAdaptor documentation.

sub fetch_SNP_by_id {

    my ($self, $id) = @_;
    #&eprof_start('fetch_snp_by_id');
    
    my $dnadb = Bio::EnsEMBL::Registry->get_DNAAdaptor($ENV{'ENSEMBL_SPECIES'}, 'glovar');
    unless ($dnadb) {
        warn "ERROR: No dnadb attached to Glovar.\n";
        return;
    }
    
    ## SNP query
    my $q1 = qq(
        SELECT
                distinct(ss.id_snp)         as internal_id,
                ss.default_name             as id_default,
                ms.position                 as snp_start,
                ms.end_position             as snp_end,
                ms.is_revcomp               as snp_strand,
                scd.description             as validated,
                ss.alleles                  as alleles,
                svd.description             as snpclass,
                cd.chromosome               as chr_name,
                sseq.id_sequence            as nt_id,
                sseq.database_seqnname      as seq_name,
                sseq.database_source        as database_source
        FROM    
                snp_sequence sseq,
                mapped_snp ms,
                snp,
                snpvartypedict svd,
                snp_confirmation_dict scd,
                snp_summary ss,
                chromosomedict cd
        WHERE   ss.default_name = ?
        AND     sseq.id_sequence = ms.id_sequence
        AND     ms.id_snp = ss.id_snp
        AND     ss.id_snp = snp.id_snp
        AND     snp.var_type = svd.id_dict
        AND     ss.confirmation_status = scd.id_dict
        AND     sseq.chromosome = cd.id_dict
        AND     sseq.is_current = 1
    );

    my @snps = ();
    my $sth1;

    eval {
        $sth1 = $self->prepare($q1);
        $sth1->execute($id);
    }; 
    if ($@){
        warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
        return([]);
    }

    # loop over all SNP mappings found and pick one; mappings to clones
    # take preference over mappings to NT_contigs
    my (@seq_nt, @seq_clone);
    while (my $r = $sth1->fetchrow_hashref()) {
        return([]) unless keys %{$r};
        if ($r->{'SEQ_NAME'} =~ /^NT/) {
            push @seq_nt, $r;
        } else {
            push @seq_clone, $r;
        }
    }
    # if more than one NT or clone mapping has been returned, something is
    # wrong with snp_sequence.is_current, so print a warning
    if (@seq_nt > 1 or @seq_clone > 1) {
        warning(
            "More than one mapping of SNP to NT_contig ("
            . join(", ", map { $_->{'SEQ_NAME'} } @seq_nt)
            . ") or clone (" 
            . join(", ", map { $_->{'SEQ_NAME'} } @seq_clone)
            . ")."
        );
    }

    # loop over all mappings returned; try clones first; you're done once
    # you've managed to transform to chromosomal coords successfully
    my $varfeat;
    SEQ:
    foreach my $row (@seq_clone, @seq_nt) {
        $row->{'SNP_END'} ||= $row->{'SNP_START'};
        my $snp_start = $row->{'SNP_START'};
        my $id_seq = $row->{'NT_ID'};

        # get clone the SNP is on
        my $q2 = qq(
            SELECT
                    cs.database_seqname     as embl_acc,
                    csm.start_coordinate    as clone_start,
                    csm.end_coordinate      as clone_end,
                    csm.contig_orientation  as clone_strand
            FROM    clone_seq cs,
                    clone_seq_map csm
            WHERE   csm.id_sequence = '$id_seq'
            AND     cs.id_cloneseq = csm.id_cloneseq
            AND     (csm.start_coordinate < $snp_start)
            AND     (csm.end_coordinate > $snp_start)
        );
        my $sth2;
        eval {
            $sth2 = $self->prepare($q2);
            $sth2->execute();
        }; 
        if ($@){
            warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
            return([]);
        }
        my $j;
        CLONE:
        while (my ($embl_acc, $clone_start, $clone_end, $clone_strand) = $sth2->fetchrow_array()) {
            $j++;
            ## map to chromosome
            # you might get more than one clone back from Glovar, since it
            # contains the whole clones, not only the golden parts; if you
            # managed to transform to chromosome, you already had the right
            # one, so skip
            next CLONE if ($varfeat);
            
            # get clone from core db
            # if no clone was found, you've picked the wrong snp_sequence
            # try the next one
            my $clone = $dnadb->get_SliceAdaptor->fetch_by_region('clone', $embl_acc);
            next CLONE unless ($clone);
            
            # calculate clone coordinates for SNP
            my ($start, $end);
            if ($clone_strand == 1) {
                $start = $row->{'SNP_START'} - $clone_start + 1;
                $end = $row->{'SNP_END'} - $clone_start + 1;
            } else {
                $start = $clone_end - $row->{'SNP_END'} + 1;
                $end = $clone_end - $row->{'SNP_START'} + 1;
            }
            next CLONE if ($start < 0);

            $varfeat = Bio::EnsEMBL::Variation::VariationFeature->new(-dbID => $row->{'INTERNAL_ID'});
            $varfeat->slice($clone);
            $varfeat->start($start);
            $varfeat->end($end);
            $varfeat->strand($row->{'SNP_STRAND'}*$clone_strand);
            $varfeat = $varfeat->transform('chromosome');
        }
        warn "WARNING: Multiple clones ($j) returned" if ($j > 1);
        
        # try next snp_sequence if you couldn't transform this one
        next SEQ unless $varfeat;
        
        my $var = Bio::EnsEMBL::Variation::Variation->new(-dbID => $row->{'INTERNAL_ID'});
        $varfeat->variation_name($id);
        $var->name($id);
        $varfeat->seq_region_name($row->{'CHR_NAME'});
        $varfeat->source('Glovar');
        $var->source('Glovar');
        $var->adaptor($self);
        
        # alleles
        $varfeat->allele_string($row->{'ALLELES'});
        # temporary hack; change sql query to get data from snp_variation,
        # allele_frequency, population
        foreach my $al (split(/\|/, $row->{'ALLELES'})) {
            my $allele = Bio::EnsEMBL::Variation::Allele->new(-allele => $al);
            $var->add_Allele($allele);
        }

        # get flanking sequence from core
        my $slice = $dnadb->get_SliceAdaptor->fetch_by_region(
            'chromosome',
            $row->{'CHR_NAME'},
            $varfeat->start - 25,
            $varfeat->end + 25,
        );
        $slice = $slice->invert if ($row->{'SNP_STRAND'} == -1);
        my $seq = $slice->seq;

        # determine end of upstream sequence depending on range type (in-dels
        # of type "between", i.e. start !== end, are actually inserts)
        my $up_end = 25;
        $up_end++ if (($row->{'SNPCLASS'} eq "SNP - indel") && ($varfeat->start ne $varfeat->end));
        $var->five_prime_flanking_seq(substr($seq, 0, $up_end));
        $var->three_prime_flanking_seq(substr($seq, 26));
        # consequences and  DBLinks
        $self->get_consequences($varfeat);
        $self->get_DBLinks($var);

        # validation state
        $varfeat->add_validation_state($VSTATE_MAP{$row->{'VALIDATED'}});
        $var->add_validation_state($VSTATE_MAP{$row->{'VALIDATED'}});

        # population genotypes
        # $self->get_population_genotypes($var);

        # add variation to variationFeature
        $varfeat->variation($var);
    }

    #&eprof_end('fetch_snp_by_id');
    #&eprof_dump(\*STDERR);

    $varfeat ? (return [$varfeat]) : (return([]));

}

sub fetch_all_by_clone_accession {

    my ($self, $embl_acc, $embl_version, $cl_start, $cl_end) = @_;

    #&eprof_start('clone_sql');
    
    # get info on clone
    my @cloneinfo = $self->fetch_clone_by_accession($embl_acc);
    return([]) unless (@cloneinfo);
    my ($nt_name, $id_seq, $clone_start, $clone_end, $clone_strand) = @cloneinfo;

    ## temporary hack for SNP density script: skip vega-specific clones
    #unless ($nt_name =~ /^NT/) {
    #    warn "WARNING: Skipping vega-specific clone ($embl_acc).\n";
    #    return ([]);
    #}

    # now get the SNPs on this clone
    # get only features in the desired region of the clone
    my ($q_start, $q_end);
    if ($clone_strand == 1) {
        $q_start = $clone_start + $cl_start - 1;
        $q_end = $clone_start + $cl_end + 1;
    } else{
        $q_start = $clone_end - $cl_end - 1;
        $q_end = $clone_end - $cl_start + 1;
    }
    my $q2 = qq(
        SELECT
                distinct(sgc.id_snp)    as id_snp,
                ss.id_snp               as internal_id,
                ss.default_name         as id_default,
                ms.position             as snp_start,
                ms.end_position         as snp_end,
                ms.is_revcomp           as snp_strand,
                scd.description         as validated,
                ss.alleles              as alleles,
                svd.description         as snpclass,
                ss.is_private           as private,
                ptd.description         as pos_type,
                cs.design_entry         as expt_id,
                sgc_dict.description    as consequence
        FROM    
                mapped_snp ms,
                snp,
                snpvartypedict svd,
                snp_confirmation_dict scd,
                snp_summary ss
        LEFT JOIN (
                snp_gene_consequence sgc
                INNER JOIN
                    coding_sequence cs on sgc.id_codingseq = cs.id_codingseq
                    AND cs.design_entry = ?
                INNER JOIN
                    sgc_dict on sgc.consequence = sgc_dict.id_dict
                ) on sgc.id_snp = ss.id_snp
        LEFT JOIN
                position_type_dict ptd on sgc.position_description = ptd.id_dict
        WHERE   ms.id_sequence = ?
        AND     ms.id_snp = ss.id_snp
        AND     ss.id_snp = snp.id_snp
        AND     snp.var_type = svd.id_dict
        AND     ss.confirmation_status = scd.id_dict
        AND     ms.position BETWEEN $q_start AND $q_end
    );

    my $sth;
    eval {
        $sth = $self->prepare($q2);
        $sth->execute($self->consequence_exp, $id_seq);
    }; 
    if ($@){
        warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
        return([]);
    }

    my (%varfeats, %cons);
    while (my $row = $sth->fetchrow_hashref()) {
        return([]) unless keys %{$row};
        warn "WARNING: private data!" if $row->{'PRIVATE'};

        ## filter SNPs without strand (this is gruft in mapped_snp)
        next unless $row->{'SNP_STRAND'};

        ## calculate coords depending on clone orientation
        my ($start, $end);
        $row->{'SNP_END'} ||= $row->{'SNP_START'};
        if ($clone_strand == 1) {
            $start = $row->{'SNP_START'} - $clone_start + 1;
            $end = $row->{'SNP_END'} - $clone_start + 1;
        } else {
            $start = $clone_end - $row->{'SNP_END'} + 1;
            $end = $clone_end -$row->{'SNP_START'} + 1;
        }

        my $strand = $row->{'SNP_STRAND'}*$clone_strand;
        my $key = join(":", $row->{'ID_DEFAULT'}, $start, $end, $strand);
        my $consequence_type = $CONSEQUENCE_TYPE_MAP{$row->{'POS_TYPE'}." ".$row->{'CONSEQUENCE'}} || '_';
        my $cons_rank = $Bio::EnsEMBL::Variation::VariationFeature::CONSEQUENCE_TYPES{uc($consequence_type)} || 99;

        if ((!$cons{$key}) or ($cons_rank < $cons{$key})) {
            # VariationFeature
            my $varfeat = Bio::EnsEMBL::Variation::VariationFeature->new_fast(
                {
                    'dbID'              => $row->{'INTERNAL_ID'},
                    'adaptor'           => $self,
                    'variation_name'    => $row->{'ID_DEFAULT'},
                    'start'             => $start,
                    'end'               => $end,
                    'strand'            => $strand,
                    'allele_string'     => $row->{'ALLELES'},
                    'source'            => 'Glovar',
                    'validation_code'   => [ $VSTATE_MAP{$row->{'VALIDATED'}} ],
                    'consequence_type'  => [ $consequence_type ],
                });

            # add minimal Variation object (needed for DBLinks)
            my $var = Bio::EnsEMBL::Variation::Variation->new(
                    -dbID               => $row->{'INTERNAL_ID'},
                    -ADAPTOR            => $self,
                    -NAME               => $row->{'ID_DEFAULT'},
                    -SOURCE             => 'Glovar',
            );
            $self->get_DBLinks($var);
            $varfeat->variation($var);
            
            $cons{$key} = $cons_rank;
            $varfeats{$key} = $varfeat; 
        }
    }

    #&eprof_end('clone_sql');
    #&eprof_dump(\*STDERR);
    
    return [values %varfeats];

}

sub get_consequences {

    my ($self, $varfeat) = @_;
    my $q = qq(
        SELECT
                ptd.description         as pos_type,
                sgc_dict.description    as consequence,
                cs.name                 as transcript_stable_id,
                sgc.transcript_position as cdna_start
        FROM    
                coding_sequence cs,
                snp_gene_consequence sgc
        LEFT JOIN
                position_type_dict ptd on sgc.position_description = ptd.id_dict
        LEFT JOIN
                sgc_dict on sgc.consequence = sgc_dict.id_dict
        WHERE   sgc.id_snp = ?
        AND     sgc.id_codingseq = cs.id_codingseq
        AND     cs.design_entry = ?
    );
    my $sth;
    eval {
        $sth = $self->prepare($q);
        $sth->execute($varfeat->{'dbID'}, $self->consequence_exp);
    }; 
    if ($@){
        warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
        return;
    }

    while (my $row = $sth->fetchrow_hashref) {
        # add consequence
        my $consequence_type = $CONSEQUENCE_TYPE_MAP{$row->{'POS_TYPE'}." ".$row->{'CONSEQUENCE'}};

        my $key = join(":", $row->{'TRANSCRIPT_STABLE_ID'}, $row->{'CDNA_START'}, $consequence_type);

        # only add consequence once (workaround for duplicates in db)
        my $found_tvar;
        foreach my $oldtvar (@{ $varfeat->get_all_TranscriptVariations || [] }) {
            if (join(":",   $oldtvar->transcript->stable_id,
                            $oldtvar->cdna_start,
                            $oldtvar->consequence_type
                    ) eq $key) {
                $found_tvar = 1;
            }
        }
        
        unless ($found_tvar) {
            $varfeat->add_consequence_type( [$consequence_type] );

            # add TranscriptVariation object
            my $trans = Bio::EnsEMBL::Transcript->new(
                -STABLE_ID => $row->{'TRANSCRIPT_STABLE_ID'},
            );
            my $tvar = Bio::EnsEMBL::Variation::TranscriptVariation->new_fast({
                transcript          => $trans,
                cdna_start          => $row->{'CDNA_START'},
                cdna_end            => $row->{'CDNA_START'},
                consequence_type    => [ $consequence_type ],
            });
            $varfeat->add_TranscriptVariation($tvar);
        }
    }

}

sub get_population_genotypes {

    my ($self, $var) = @_;
    unless ($var->isa('Bio::EnsEMBL::Variation::Variation')) {
        throw("You must provide a Bio::EnsEMBL::Variation::Variation object.");
    }
    my $q = qq(
        SELECT
                sv.var_string       as allele,
                af.frequency        as frequency,
                p.description       as pop_name,
                ao.sample_size      as sample_size
        FROM
                snp_variation sv,
                allele_frequency af,
                assay_outcome ao,
                population p
        WHERE   sv.id_snp = ?
        AND     sv.id_var = af.id_var
        AND     af.id_outcome = ao.id_outcome
        AND     ao.id_pop = p.id_pop
    );
    my $sth;
    eval {
        $sth = $self->prepare($q);
        $sth->execute($var->dbID);
    }; 
    if ($@){
        warn("ERROR: SQL failed in " . (caller(0))[3] . "\n$@");
        return;
    }

    my $pop;
    while (my $row = $sth->fetchrow_hashref) {
        # collect genotypes by population
        $pop->{$row->{'POP_NAME'}}->{'sample_size'} = $row->{'SAMPLE_SIZE'};
        push @{ $pop->{$row->{'POP_NAME'}}->{'frequency'} }, $row->{'FREQUENCY'};
        push @{ $pop->{$row->{'POP_NAME'}}->{'alleles'} }, $row->{'ALLELE'};
    }

    foreach my $p (keys %$pop) {
        my $population = Bio::EnsEMBL::Variation::Population->new(
            -dbID       => $p,
            -NAME       => $p,
            -SIZE       => $pop->{$p}->{'size'},
        );

        # add PopulationGenotype for each genotype
        # genotype frequencies are calculated by combinatorics from allele
        # frequencies, which is biologically not correct.
        # ToDo: find the right data in the database
        my ($allele1, $allele2) = @{ $pop->{$p}->{'alleles'} };
        my ($freq1, $freq2) = @{ $pop->{$p}->{'frequency'} };

        # homozygous genotypes
        my $pop_genotype = Bio::EnsEMBL::Variation::PopulationGenotype->new(
            -dbID       => $p,
            -ALLELE1    => $allele1,
            -ALLELE2    => $allele1,
            -FREQUENCY  => $freq1**2,
        );
        $pop_genotype->population($population);
        $var->add_PopulationGenotype($pop_genotype);
        
        my $pop_genotype = Bio::EnsEMBL::Variation::PopulationGenotype->new(
            -dbID       => $p,
            -ALLELE1    => $allele2,
            -ALLELE2    => $allele2,
            -FREQUENCY  => $freq2**2,
        );
        $pop_genotype->population($population);
        $var->add_PopulationGenotype($pop_genotype);

        # heterozygous genotype
        my $pop_genotype = Bio::EnsEMBL::Variation::PopulationGenotype->new(
            -dbID       => $p,
            -ALLELE1    => $allele1,
            -ALLELE2    => $allele2,
            -FREQUENCY  => $freq1*$freq2*2,
        );
        $pop_genotype->population($population);
        $var->add_PopulationGenotype($pop_genotype);
    }

}

consequence_exp	Description	Code
coordinate_systems	Description	Code
fetch_SNP_by_id	Description	Code
fetch_all_by_clone_accession	Description	Code
get_DBLinks	Description	Code
get_consequences	Description	Code
get_population_genotypes	Description	Code
get_source_version	Description	Code
track_name	Description	Code