Raw content of Bio::EnsEMBL::Utils::TranscriptSNPs
=head1 LICENSE
Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at .
Questions may also be sent to the Ensembl help desk at
.
=cut
=head1 NAME
TranscriptSNPs - A utility class used to obtain information about the
relationships between a transcript and SNPs
=head1 SYNOPSIS
use Bio::EnsEMBL::Utils::TranscriptSNPs;
# get and type all snps in the region of the transcript
%snps = %{
Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_SNPs( $transcript,
$flanking ) };
# get all snps overlapping the transcript in cdna coordinates
%snps =
%{ Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_cdna_SNPs(
$transcript) };
# get the peptide variations caused by a set of SNPs
%variations = %{
Bio::EnsEMBL::Utils::TranscriptSNPs::get_all_peptide_variations(
$transcript, $snps ) };
=head1 DESCRIPTION
This is a utility class which can be used to get snps associated with a
transcript, and to determine the amino acid changes caused by the SNPs
=head1 METHODS
=cut
package Bio::EnsEMBL::Utils::TranscriptSNPs;
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
=head2 get_all_peptide_variations
Arg [1] : $transcript the transcript to obtain the peptide variations for
Arg [2] : $snps listref of coding snps in cdna coordinates
Example : $pep_hash = get_all_peptide_variations($transcript, \@snps);
Description: Takes a list of coding snps on this transcript in
which are in cdna coordinates and returns a hash with peptide
coordinate keys and listrefs of alternative amino acids as
values. The SNPs must additionally have a strand of 1 for the
sake of simplicity. Normally these could be generated using the
get_all_cdna_SNPs method.
Note that the peptide encoded by the reference sequence is
also present in the results and that duplicate peptides
(e.g. resulting from synonomous mutations) are discarded.
It is possible to have greated than two peptides variations
at a given location given adjacent or overlapping snps.
Insertion/deletion variations are ignored by this method.
Example of a data structure that could be returned:
{ 1 => ['I', 'M'],
10 => ['I', 'T'],
37 => ['N', 'D'],
56 => ['G', 'E'],
118 => ['R', 'K'],
159 => ['D', 'E'],
167 => ['Q', 'R'],
173 => ['H', 'Q'] }
Returntype : hashref
Exceptions : none
Caller : general
=cut
sub get_all_peptide_variations {
my $transcript = shift;
my $snps = shift;
if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
throw('Bio::EnsEMBL::Transcript argument is required.');
}
if(!ref($snps) eq 'ARRAY') {
throw('Reference to a list of Bio::EnsEMBL::SNP objects is required');
}
my $codon_table = Bio::Tools::CodonTable->new;
my $codon_length = 3;
my $cdna = $transcript->spliced_seq;
my $variant_alleles;
my $translation_start = $transcript->cdna_coding_start;
foreach my $snp (@$snps) {
#skip variations not on a single base
next if ($snp->start != $snp->end);
my $start = $snp->start;
my $strand = $snp->strand;
#calculate offset of the nucleotide from codon start (0|1|2)
my $codon_pos = ($start - $translation_start) % $codon_length;
#calculate the peptide coordinate of the snp
my $peptide = ($start - $translation_start +
($codon_length - $codon_pos)) / $codon_length;
#retrieve the codon
my $codon = substr($cdna, $start - $codon_pos-1, $codon_length);
#store each alternative allele by its location in the peptide
my @alleles = split(/\/|\|/, lc($snp->allele_string));
#my @alleles = split(/\/|\|/, lc($snp->alleles));
foreach my $allele (@alleles) {
next if $allele eq '-'; #skip deletions
next if CORE::length($allele) != 1; #skip insertions
#get_all_cdna_SNPs always gives strand of 1 now
#if($strand == -1) {
# #complement the allele if the snp is on the reverse strand
# $allele =~
# tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
#}
#create a data structure of variant alleles sorted by both their
#peptide position and their position within the peptides codon
$variant_alleles ||= {};
if(exists $variant_alleles->{$peptide}) {
my $alleles_arr = $variant_alleles->{$peptide}->[1];
push @{$alleles_arr->[$codon_pos]}, $allele;
} else {
#create a list of 3 lists (one list for each codon position)
my $alleles_arr = [[],[],[]];
push @{$alleles_arr->[$codon_pos]}, $allele;
$variant_alleles->{$peptide} = [$codon, $alleles_arr];
}
}
}
my %out;
#now generate all possible codons for each peptide and translate them
foreach my $peptide (keys %$variant_alleles) {
my ($codon, $alleles) = @{$variant_alleles->{$peptide}};
#need to push original nucleotides onto each position
#so that all possible combinations can be generated
push @{$alleles->[0]}, substr($codon,0,1);
push @{$alleles->[1]}, substr($codon,1,1);
push @{$alleles->[2]}, substr($codon,2,1);
my %alt_amino_acids;
foreach my $a1 (@{$alleles->[0]}) {
substr($codon, 0, 1) = $a1;
foreach my $a2 (@{$alleles->[1]}) {
substr($codon, 1, 1) = $a2;
foreach my $a3 (@{$alleles->[2]}) {
substr($codon, 2, 1) = $a3;
my $aa = $codon_table->translate($codon);
#print "$codon translation is $aa\n";
$alt_amino_acids{$aa} = 1;
}
}
}
my @aas = keys %alt_amino_acids;
$out{$peptide} = \@aas;
}
return \%out;
}
=head2 get_all_SNPs
Arg [1] : Bio::EnsEMBL::Transcript $transcript
Arg [2] : (optional) int $flanking
The number of basepairs of transcript flanking sequence to
retrieve snps from (default 0)
Arg [3] : $source type of database source (dbSNP, Glovar)
Example : $snp_hashref = get_all_transcript_SNPs($transcript)
Description: Retrieves all snps found within the region of the
provided transcript
The snps are returned in a hash with keys corresponding
to the region the snp was found in. Possible keys are:
'three prime UTR', 'five prime UTR', 'coding', 'intronic',
'three prime flanking', 'five prime flanking'
If no flanking argument is provided no flanking snps will be
obtained.
The listrefs which are the values of the returned hash
contain snps in coordinates of the transcript region
(i.e. first base = first base of the first exon on the
postive strand - flanking bases + 1)
Multiple base variations and inserts/deletes are discarded
by this method and not used.
Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
values
Exceptions : none
Caller : general
=cut
sub get_all_SNPs {
my $transcript = shift;
my $flanking = shift || 0;
my $source = shift;
if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
throw('Bio::EnsEMBL::Transcript argument required.');
}
my $slice = $transcript->slice();
if(!$slice) {
warning("Cannot obtain SNPs for transcript without attached Slice.");
return {};
}
my $sa = $slice->adaptor();
if(!$sa) {
warning('Cannot obtain SNPs for transcript unless attached slice ' .
'has attached adaptor');
return {};
}
my %snp_hash;
# retrieve slice in the region of the transcript
$slice = $sa->fetch_by_Feature($transcript, $flanking );
# copy transcript, to work in coord system we are interested in
$transcript = $transcript->transfer( $slice );
# get all snps in the transcript region
my $snps;
if ($source eq 'glovar') {
$snps = $slice->get_all_ExternalFeatures('GlovarSNP');
}
elsif ($source eq 'variation') {
$snps = $slice->get_all_VariationFeatures;
}
else {
$snps = $slice->get_all_SNPs; # dont need once use new snp api (i think)
}
my $trans_start = $flanking + 1;
my $trans_end = $slice->length - $flanking;
my $trans_strand = $transcript->get_all_Exons->[0]->strand;
# classify each snp
foreach my $snp (@$snps) {
my $key;
if(($trans_strand == 1 && $snp->end < $trans_start) ||
($trans_strand == -1 && $snp->start > $trans_end)) {
# this snp is upstream from the transcript
$key = 'five prime flanking';
}
elsif(($trans_strand == 1 && $snp->start > $trans_end) ||
($trans_strand == -1 && $snp->start < $trans_start)) {
# this snp is downstream from the transcript
$key = 'three prime flanking';
}
else {
#snp is inside transcript region check if it overlaps an exon
foreach my $e (@{$transcript->get_all_Exons}) {
if($snp->end >= $e->start && $snp->start <= $e->end) {
# this snp is in an exon
if(($trans_strand == 1 &&
$snp->end < $transcript->coding_region_start) ||
($trans_strand == -1 &&
$snp->start > $transcript->coding_region_end)) {
# this snp is in the 5' UTR
$key = 'five prime UTR';
}
elsif(($trans_strand == 1 &&
$snp->start > $transcript->coding_region_end)||
($trans_strand == -1 &&
$snp->end < $transcript->coding_region_start)) {
# this snp is in the 3' UTR
$key = 'three prime UTR';
}
else {
# snp is coding
$key = 'coding';
}
last;
}
}
unless($key) {
# snp was not in an exon and is therefore intronic
$key = 'intronic';
}
}
unless($key) {
#warning('SNP could not be mapped. In/Dels not supported yet...');
next;
}
if(exists $snp_hash{$key}) {
push @{$snp_hash{$key}}, $snp;
}
else {
$snp_hash{$key} = [$snp];
}
}
return \%snp_hash;
}
=head2 get_all_cdna_SNPs
Arg [1] : Bio::EnsEMBL::Transcript $transcript
Arg [2] : $source type of database source (dbSNP, Glovar)
Example : $cdna_snp_hasref = $transcript->get_all_cdna_SNPs;
Description: Retrieves all snps found within exons of the provided
transcript.
The snps are returned in a hash with three keys corresponding
to the region the snp was found in. Valid keys are:
'three prime UTR', 'five prime UTR', 'coding'
The listrefs which are the values of the returned hash
contain snps in CDNA coordinates.
Multiple base variations and insertions/deletions are not
used by this function and are discarded.
Returntype : hasref with string keys and listrefs of Bio::EnsEMBL::SNPs for
values
Exceptions : none
Caller : general
=cut
sub get_all_cdna_SNPs {
my ($transcript, $source) = @_;
#retrieve all of the snps from this transcript
my $all_snps = get_all_SNPs($transcript, 0, $source);
my %snp_hash;
my @cdna_types = ('three prime UTR', 'five prime UTR','coding');
my $slice = $transcript->slice();
my $sa = $slice->adaptor();
$slice = $sa->fetch_by_Feature($transcript);
# copy transcript in order to work in coord system of interest
$transcript = $transcript->transfer($slice);
foreach my $type (@cdna_types) {
$snp_hash{$type} = [];
foreach my $snp (@{$all_snps->{$type}}) {
my @coords = $transcript->genomic2cdna($snp->start, $snp->end,
$snp->strand);
#skip snps that don't map cleanly (possibly an indel...)
if(scalar(@coords) != 1) {
#warning("snp of type $type does not map cleanly\n");
next;
}
my ($coord) = @coords;
unless($coord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
#warning("snp of type $type maps to gap\n");
next;
}
my $alleles;
my $ambicode;
# get alleles and ambig_code (with fallback to old snp API)
$alleles = $snp->allele_string || $snp->{'alleles'};
$ambicode = $snp->ambig_code || $snp->{'_ambiguity_code'};
#we arbitrarily put the SNP on the +ve strand because it is easier to
#work with in the webcode
if($coord->strand == -1) {
$alleles =~
tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
$ambicode =~
tr/acgthvmrdbkynwsACGTDBKYHVMRNWS\//tgcadbkyhvmrnwsTGCAHVMRDBKYNWS\//;
}
#copy the snp and convert to cdna coords...
my $new_snp;
%$new_snp = %$snp;
bless $new_snp, ref $snp;
$new_snp->start($coord->start);
$new_snp->end($coord->end);
$new_snp->strand(1);
$new_snp->allele_string($alleles);
$new_snp->ambig_code($ambicode);
push @{$snp_hash{$type}}, $new_snp;
}
}
return \%snp_hash;
}
1;