Raw content of Bio::EnsEMBL::ExternalData::GeneSNP
#
# BioPerl module for Bio::EnsEMBL::ExternalData::GeneSNP
#
# Cared for by Heikki Lehvaslaiho
#
# Copyright Heikki Lehvaslaiho
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
Bio::EnsEMBL::ExternalData::GeneSNP - class to expand genomic SNP
into full gene variation description
=head1 SYNOPSIS
$genesnp = new Bio::EnsEMBL::ExternalData::GeneSNP
(-gene => $gene,
-contig => $contig
);
# $snp is a Bio::EnsEMBL::ExternalData::Variation object
$var = $genesnp->snp2gene($snp);
# $var is a Bio::Variation::SeqDiff object
# or
@var = $genesnp->snps2gene(@snps);
=head1 DESCRIPTION
Bio::EnsEMBL::ExternalData::GeneSNP takes in
L objects in genomic (clone,
contig or virtual contig) coordinates and calculates for DNA, RNA and
protein changes in gene coordinates based on information in a
L object. The returned object for each SNP
description is L which links to
L compliant objects.
A SNP passed into a GeneSNP object is taken to be a gene SNP (gSNP) if
it is within 5kb of coding region. A RNA and protein level description
objct is created only for coding SNPs (cSNP).
$is_cSNP = 1 if $var->AAChange;
The IDs are kept as follows:
- All the DBLinks are L objects which are
passed from Variant into DNAMutation. Primary ID of a SNP is
$var->DNAMutation->id
- The contig ID is kept in SeqDiff ID : $var->id
- The ID of the transcript ID used to calculate SeqDiff is:
$var->rna_id
- The ID of the exon where SNP is located is kept in
DNAMutation->region_value. The string looks like 'ENSE00000012499
(1)'. The order number is there to keep track of introns which do
not have IDs, only an order number. To get exon ID only:
$var->DNAMutation->region_value =~ /\w+/
=head1 CONTACT
Heikki Lehvaslaiho
Address:
EMBL Outstation, European Bioinformatics Institute
Wellcome Trust Genome Campus, Hinxton
Cambs. CB10 1SD, United Kingdom
=cut
=head1 APPENDIX
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _
=cut
# Let the code begin...
package Bio::EnsEMBL::ExternalData::GeneSNP;
use vars qw(@ISA );
use strict;
use Bio::EnsEMBL::Root;
# Object preamble - inheritance
@ISA = qw ( Bio::EnsEMBL::Root );
#use Carp;
use Bio::Variation::SeqDiff;
use Bio::Variation::DNAMutation;
use Bio::Variation::RNAChange;
use Bio::Variation::AAChange;
use Bio::Variation::Allele;
use Bio::Tools::CodonTable;
use Bio::EnsEMBL::ExternalData::Variation;
use Bio::Annotation::DBLink;
sub new {
my($class,@args) = @_;
my $self;
$self = {};
bless $self, $class;
my ($gene, $contig, $transcript) =
$self->_rearrange([qw(GENE
CONTIG
TRANSCRIPT
)],@args);
$gene && $self->gene($gene);
$contig && $self->contig($contig);
$transcript && $self->transcript($transcript);
return $self; # success - we hope!
}
=head2 gene
Title : gene
Usage : $geneobj = $obj->gene;
Function: Returns or sets the link-reference to a Gene object.
If there is no link, it will return undef
Returns : an obj_ref or undef
=cut
sub gene {
my ($self,$value) = @_;
if (defined $value) {
if( ! $value->isa('Bio::EnsEMBL::Gene') ) {
$self->throw("Is not a Bio::EnsEMBL::Gene object but a [$value]");
return (undef);
}
else {
$self->{'gene'} = $value;
}
}
unless (exists $self->{'gene'}) {
return (undef);
} else {
return $self->{'gene'};
}
}
=head2 transcript
Title : transcript
Usage : $transcriptobj = $obj->transcript;
Function: Returns or sets the link-reference to a Transcript object.
If there is no link, it will return undef
Returns : an obj_ref or undef
=cut
sub transcript {
my ($self,$value) = @_;
if (defined $value) {
if( ! $value->isa('Bio::EnsEMBL::Transcript') ) {
$self->throw("Is not a Bio::EnsEMBL::Transcript object but a [$value]");
return (undef);
}
else {
$self->{'transcript'} = $value;
}
}
unless (exists $self->{'transcript'}) {
return (undef);
} else {
return $self->{'transcript'};
}
}
=head2 contig
Title : contig
Usage : $contigobj = $obj->contig;
Function: Returns or sets the link-reference to a Contig object.
If there is no link, it will return undef
Returns : an obj_ref or undef
=cut
sub contig {
my ($self,$value) = @_;
if (defined $value) {
$self->{'contig'} = $value;
}
unless (exists $self->{'contig'}) {
return (undef);
} else {
return $self->{'contig'};
}
}
sub snps2transcript {
my ($self, @snps) = @_;
my $seqDiff = undef;
#sanity checks
$self->transcript || $self->throw("Set transcript to a Bio::EnsEMBL::Transcript object");
$self->contig || $self->throw("Set contig to a Bio::EnsEMBL::RawContig compliant object");
my @seqDiffs = ();
my $rna = $self->transcript;
my $aa = $rna->translation; #Bio::Ensembl::Translation object
my $aaseq = $rna->translate;
foreach my $snp (@snps) {
my $seqDiff = $self->_calculate_gene_coordinates($snp, $rna, $aa, $aaseq);
$seqDiff && push @seqDiffs, $seqDiff;
}
return @seqDiffs;
}
sub snps2gene {
my ($self, @snps) = @_;
my $seqDiff = undef;
#sanity checks
$self->gene || $self->throw("Set gene to a Bio::EnsEMBL::Gene object");
$self->contig || $self->throw("Set contig to a Bio::EnsEMBL::RawContig compliant object");
my @seqDiffs = ();
foreach my $rna ($self->gene->each_Transcript) {
my $aa = $rna->translation; #Bio::Ensembl::Translation object
my $aaseq = $rna->translate;
foreach my $snp (@snps) {
my $seqDiff = $self->_calculate_gene_coordinates($snp, $rna, $aa, $aaseq);
$seqDiff && push @seqDiffs, $seqDiff;
}
}
return @seqDiffs;
}
sub snp2gene {
my ($self, $snp) = @_;
my $seqDiff = undef;
#sanity checks
#$snp || $self->throw("Give a Bio::EnsEMBL::ExternalData::Variation object as an argument");
#$snp->isa('Bio::EnsEMBL::ExternalData::Variation') ||
# $self->throw("Is not a Bio::EnsEMBL::ExternalData::Variation object but a [$snp]");
$self->gene || $self->throw("Set gene to a Bio::EnsEMBL::Gene object");
$self->contig || $self->throw("Set contig to a Bio::EnsEMBL::RawContig compliant object");
my @seqDiffs = ();
foreach my $rna ($self->gene->each_Transcript) {
#print STDERR $rna->dna_seq->seq, "\n";
my $aa = $rna->translation; #Bio::Ensembl::Translation object
#print $aa->id, ", ", $aa->start, ", ", $aa->end, "\n";
my $aaseq = $rna->translate;
#print STDERR $aaseq->seq, "\n";
$seqDiff = $self->_calculate_gene_coordinates($snp, $rna, $aa, $aaseq);
$seqDiff && push @seqDiffs, $seqDiff;
}
return @seqDiffs;
}
sub _calculate_gene_coordinates {
my ($self, $snp, $rna, $aa, $aaseq) = @_;
my $seqDiff = undef;
$snp || $self->throw("Give a Bio::EnsEMBL::ExternalData::Variation object as an argument");
$snp->isa('Bio::EnsEMBL::ExternalData::Variation') ||
$self->throw("Is not a Bio::EnsEMBL::ExternalData::Variation object but a [$snp]");
$rna->isa('Bio::EnsEMBL::Transcript') ||
$self->throw("Is not a Bio::EnsEMBL::Transcript object but a [$rna]");
$aa->isa('Bio::EnsEMBL::Translation') ||
$self->throw("Is not a Bio::EnsEMBL::Translation object but a [$aaseq]");
$aaseq->isa('Bio::PrimarySeqI') ||
$self->throw("Is not a Bio::PrimarySeqI object but a [$aaseq]");
#reject the SNP if it is not withing 5kb of the coding region
return undef if $snp->start < ($aa->start - 5000);
return undef if $snp->end > ($aa->end + 5000);
#only real SNPs taken into account!
return undef if $snp->start != $snp->start;
#
#first create the container object
#
$seqDiff = Bio::Variation::SeqDiff->new();
$seqDiff->moltype('dna');
$seqDiff->numbering('entry');
#$seqDiff->dna_ori(); leave undefined,
#we do not want to burden the object with long contig sequnce
$seqDiff->rna_ori($rna->dna_seq->seq);
$seqDiff->aa_ori($rna->translate); #slow opeation?
$seqDiff->id($self->contig->id);
$seqDiff->rna_id($rna->id);
$seqDiff->offset($aa->start -1);
$seqDiff->cds_end($aa->end - $aa->start + 1);
#
# DNA level
#
my $dna_start = $snp->start - $seqDiff->offset;
$dna_start < 1 && $dna_start--; # no 0 in the coordinate system!
my $dnamut = Bio::Variation::DNAMutation->new
(-start => ($dna_start),
-end => ($dna_start),
);
$dnamut->mut_number(1);
$dnamut->proof('experimental'); # given coordianate system
#$dnamut->isMutation(0);
my (@alleles) = split (/\|/, $snp->alleles);
my $dnaA1 = Bio::Variation::Allele->new;
my $a1 = shift @alleles;
$dnamut->length(CORE::length $a1);
$dnaA1->seq($a1);
$dnamut->allele_ori($dnaA1);
foreach my $alleleseq (@alleles) {
my $A2 = Bio::Variation::Allele->new;
$A2->seq($alleleseq);
$dnamut->add_Allele($A2);
}
$seqDiff->add_Variant($dnamut);
foreach my $link ($snp->each_DBLink) {
$dnamut->add_DBLink;
}
my $start_pos = $snp->start;
$dnamut->upStreamSeq
(lc substr($self->contig->primary_seq->seq, $snp->start -25, 25));
$dnamut->dnStreamSeq
(lc substr($self->contig->primary_seq->seq, $snp->start +1, 25));
my $ref_allele = lc substr($self->contig->primary_seq->seq, $snp->start, 1 );
$dnaA1->seq ne $ref_allele &&
$self->warn("Found DNA ref allele: $ref_allele!");
#what if the SNP is closer to either end than 25 nt?
# where in the gene region the snp is?
if ($snp->start < $rna->start_exon->start) {
$dnamut->region('5\'upstream');
}
elsif ($snp->start > $rna->end_exon->end) {
$dnamut->region("3'downstream");
}
elsif ($snp->start < $aa->start) {
$dnamut->region("5'UTR");
}
elsif ($snp->start > $aa->end) {
$dnamut->region("3'UTR");
} else { #coding
my $last_exon_end = 0;
my $exoncount = 1;
#my $this_exon;
my $transcript_loc;
foreach my $exon ($rna->each_Exon) {
if ($snp->start < $exon->start) {
$dnamut->region('intron');
$dnamut->region_value($exoncount);
last;
}
elsif ($snp->start >= $exon->start and $snp->start <= $exon->end) {
$dnamut->region('exon');
$dnamut->region_value($exon->id. " ($exoncount)");
#$thisexon = $exon->id;
last;
}
$exoncount++;
}
}
#
# RNA level
#
my $rnachange = undef;
#if ($dnamut->region =~ /UTR/ or $dnamut->region eq 'exon' ) {
if ($dnamut->region eq 'exon' ) { # mRNA affected
$seqDiff->rna_offset($rna->start_exon->phase -1);
# new method into Bio::Ensembl::Transcript
my $rna_pos = $rna->rna_pos($snp->start);
$rnachange = Bio::Variation::RNAChange->new
(-start => $rna_pos - $seqDiff->rna_offset,
-end => $rna_pos - $seqDiff->rna_offset,
);
$rnachange->length(1);
$rnachange->mut_number(1);
$seqDiff->add_Variant($rnachange);
$dnamut->RNAChange($rnachange);
$rnachange->DNAMutation($dnamut);
$rnachange->proof('computed');
$rnachange->allele_ori($dnaA1);
foreach my $alleleseq (@alleles) {
my $A2 = Bio::Variation::Allele->new;
$A2->seq($alleleseq);
$rnachange->add_Allele($A2);
}
$rnachange->codon_pos(($rnachange->start -1 )% 3 +1);
if ($rna_pos < 25 ) {
$rnachange->upStreamSeq($dnamut->upStreamSeq);
} else {
$rnachange->upStreamSeq
(lc substr($seqDiff->rna_ori, $rna_pos -25, 25));
}
if ($rna_pos > $seqDiff->cds_end - 25 ) {
$rnachange->dnStreamSeq($dnamut->dnStreamSeq);
} else {
$rnachange->dnStreamSeq
(lc substr($seqDiff->rna_ori, $rna_pos + 1, 25));
}
my $ref_allele = lc substr($seqDiff->rna_ori, $rna_pos,1);
$dnaA1->seq ne $ref_allele &&
$self->warn("Found RNA ref allele: $ref_allele!");
}
#
# Protein level
#
if ($dnamut->region eq 'exon' ) { # coding region affected
my $aachange = Bio::Variation::AAChange->new
(-start => (int($rnachange->start / 3 + 1))
);
$aachange->end($aachange->start);
$aachange->proof('computed');
$seqDiff->add_Variant($aachange);
$rnachange->AAChange($aachange);
$aachange->RNAChange($rnachange);
$aachange->mut_number(1);
my $ct = new Bio::Tools::CodonTable;
my $aa_allele_ori = $ct->translate($rnachange->codon_ori);
my $aa_o = Bio::Variation::Allele->new;
$aa_o->seq($aa_allele_ori) if $aa_allele_ori;
$aachange->allele_ori($aa_o);
my $aa_allele_mut = $ct->translate($rnachange->codon_mut);
my $aa_m = Bio::Variation::Allele->new;
$aa_m->seq($aa_allele_mut) if $aa_allele_mut;
$aachange->add_Allele($aa_m);
my $aa_length_ori = CORE::length($aa_allele_ori);
$aachange->length($aa_length_ori);
$aachange->end($aachange->start + $aa_length_ori - 1 );
#terminator codon?
my $ref_allele = substr($aaseq->seq, $aachange->start -1 ,1);
$aa_allele_ori ne $ref_allele &&
$self->warn ("Found AA ref allele '$ref_allele' when expected to see $aa_allele_ori");
}
return $seqDiff;
}
1;