Raw content of Bio::EnsEMBL::Variation::Utils::Sequence # EnsEMBL module for Bio::EnsEMBL::Variation::Utils::Sequence # # =head1 NAME Bio::EnsEMBL::Variation::Utils::Sequence - Utility functions for sequences =head1 SYNOPSIS use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code variation_class); my $alleles = 'A|C'; print "my alleles = $alleles\n"; my $ambig_code = ambiguity_code($alleles); print "my ambiguity code is $ambig_code\n"; print "my SNP class is = variation_class($alleles)"; =head1 METHODS =cut use strict; use warnings; package Bio::EnsEMBL::Variation::Utils::Sequence; use Bio::EnsEMBL::Utils::Exception qw(warning); use Exporter; use vars qw(@ISA @EXPORT_OK); @ISA = qw(Exporter); @EXPORT_OK = qw(&ambiguity_code &variation_class &unambiguity_code &sequence_with_ambiguity); =head2 ambiguity_code Arg[1] : string $alleles Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code) my $alleles = 'A|C'; my $ambig_code = ambiguity_code($alleles); print "the ambiguity code for $alleles is: ",$ambig_code; Description : returns the ambiguity code for a SNP allele ReturnType : String The ambiguity code Exceptions : None Caller : Variation, VariationFeature =cut sub ambiguity_code { my $alleles = shift; my %duplicates; #hash containing all alleles to remove duplicates map {$duplicates{$_}++} split /[\|\/\\]/, $alleles; $alleles = uc( join '', sort keys %duplicates ); my %ambig = qw(AC M ACG V ACGT N ACT H AG R AGT D AT W CG S CGT B CT Y GT K C C A A T T G G - -); #we will need to decide what to do with alleles like -A. Is that possible?? return $ambig{$alleles}; } =head2 unambiguity_code Arg[1] : string $alleles Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code) my $ambiguity_code = 'M'; my $alleles = unambiguity_code($ambiguity_code); print "the alleles for ambiguity code $ambiguity_code is: ",$alleles; Description : returns the alleles for an ambiguity code ReturnType : String The Alleles, alphabetically sorted and in capital Exceptions : None Caller : Variation, VariationFeature =cut sub unambiguity_code { my $ambiguity_code = shift; my %unambig = qw(M AC V ACG N ACGT H ACT R AG D AGT W AT S CG B CGT Y CT K GT C CC A AA T TT G GG - --); #we will need to decide what to do with alleles like -A. Is that possible?? return $unambig{$ambiguity_code}; } =head2 variation_class Arg[1] : string $alleles Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (variation_class) my $alleles = 'A|C'; my $variation_class = variation_class($alleles); print "the variation class for the alleles $alleles is: ",$variation_class; Description : return the class of the alleles according to dbSNP classification(SNP,indel,mixed,mnp...) ReturnType : String. The class of the alleles Exceptions : none Caller : Variation, VariationFeature =cut sub variation_class{ my $alleles = shift; return 'snp' if $alleles =~ /^[ACGT]([\|\\\/][ACGT])+$/i; return 'cnv' if (($alleles eq 'cnv') || ($alleles eq 'CNV')); my @alleles = split /[\|\/\\]/, $alleles; if (@alleles == 1){ #(HETEROZYGOUS) 1 allele return 'het' } elsif(@alleles == 2){ if ((($alleles[0] =~ tr/ACTGN//)== length($alleles[0]) && ($alleles[1] =~ tr/-//) == 1) || (($alleles[0] =~ tr/-//) == 1 && ($alleles[1] =~ tr/ACTGN//) == length($alleles[1]))){ #A/- 2 alleles return 'in-del' } elsif (($alleles[0] =~ /LARGE/) || ($alleles[1] =~ /LARGE/)){ #(LARGEDELETION) 2 alleles return 'named' } elsif (($alleles[0] =~ tr/ACTG//) > 1 || ($alleles[1] =~ tr/ACTG//) > 1){ #AA/GC 2 alleles return 'mnp' } else{ warning("not possible to determine class for @alleles"); return ''; } } elsif(@alleles > 2){ if ($alleles[0] =~ /\d+/){ #(CA)14/15/16/17 > 2 alleles, all of them contain the number of repetitions of the allele return 'microsat' } elsif ((grep {/-/} @alleles) > 0){ #-/A/T/TTA > 2 alleles return 'mixed' } else{ # warning("not possible to determine class of alleles " . @alleles); return ''; } } else{ warning("no alleles available "); return ''; } } =head2 sequence_with_ambiguity Arg[1] : Bio::EnsEMBL::DBSQL::DBAdaptor $dbCore Arg[2] : Bio::EnsEMBL::Variation::DBSQL::DBAdaptor $dbVar Arg[3] : string $chr Arg[4] : int $start Arg[5] : int $end Arg[6] : int $strand Example : use Bio::EnsEMBL::Variation::Utils::Sequence qw (sequence_with_ambiguity) my $slice = sequence_with_ambiguity($dbCore,$dbVar,1,100,200); print "the sequence with ambiguity code for your region is: ",$slice->seq() Description : given a region, returns a Bio::EnsEMBL::Slice object with the sequence set with ambiguity codes ReturnType : Bio::EnsEMBL::Slice object Exceptions : none Caller : general =cut sub sequence_with_ambiguity{ my ($dbCore,$dbVar,$chr,$start,$end,$strand) = @_; my $slice; if (ref($dbCore) ne 'Bio::EnsEMBL::DBSQL::DBAdaptor'){ warning('You need to provide a Bio::EnsEMBL::DBSQL::DBAdaptor as a first argument'); return $slice; } if (ref($dbVar) ne 'Bio::EnsEMBL::Variation::DBSQL::DBAdaptor'){ warning('You need to provide a Bio::EnsEMBL::Variation::DBSQL::DBAdaptor object as second argument'); return $slice; } my $slice_adaptor = $dbCore->get_SliceAdaptor(); my $vf_adaptor = $dbVar->get_VariationFeatureAdaptor; $slice = $slice_adaptor->fetch_by_region('chromosome',$chr,$start,$end,$strand); #get the slice my $seq = $slice->seq; foreach my $vf (@{$vf_adaptor->fetch_all_by_Slice($slice)}){ substr($seq,$vf->start-1,1,$vf->ambig_code); } $slice->{'seq'} = $seq; return $slice; } 1;