IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq
objects from an ambiguous Seq object (one containing non-standard characters
with the meanings shown below).
Extended DNA / RNA alphabet:
(includes symbols for nucleotide ambiguity)
------------------------------------------
Symbol Meaning Nucleic Acid
------------------------------------------
A A Adenine
C C Cytosine
G G Guanine
T T Thymine
U U Uracil
M A or C
R A or G
W A or T
S C or G
Y C or T
K G or T
V A or C or G
H A or C or T
D A or G or T
B C or G or T
X G or A or T or C
N G or A or T or C
IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
-----------------------------------
Amino Acid alphabet:
------------------------------------------
Symbol Meaning
------------------------------------------
A Alanine
B Aspartic Acid, Asparagine
C Cysteine
D Aspartic Acid
E Glutamic Acid
F Phenylalanine
G Glycine
H Histidine
I Isoleucine
K Lysine
L Leucine
M Methionine
N Asparagine
P Proline
Q Glutamine
R Arginine
S Serine
T Threonine
U Selenocysteine
V Valine
W Tryptophan
X Unknown
Y Tyrosine
Z Glutamic Acid, Glutamine
* Terminator
IUPAC-IUP AMINO ACID SYMBOLS:
Biochem J. 1984 Apr 15; 219(2): 345-373
Eur J Biochem. 1993 Apr 1; 213(1): 2
BEGIN {
    # Nucleotide lookup: each symbol maps to an array ref holding the
    # symbols it stands for.  Unambiguous symbols map to themselves.
    %IUB = (
        ( map { ( $_ => [$_] ) } qw(A C G T U) ),

        # two-way ambiguity codes
        M => [ 'A', 'C' ],
        R => [ 'A', 'G' ],
        W => [ 'A', 'T' ],
        S => [ 'C', 'G' ],
        Y => [ 'C', 'T' ],
        K => [ 'G', 'T' ],

        # three-way ambiguity codes
        V => [ 'A', 'C', 'G' ],
        H => [ 'A', 'C', 'T' ],
        D => [ 'A', 'G', 'T' ],
        B => [ 'C', 'G', 'T' ],

        # X and N both expand to all four of G, A, T, C
        X => [ 'G', 'A', 'T', 'C' ],
        N => [ 'G', 'A', 'T', 'C' ],
    );

    # Amino-acid lookup, same layout.  Only B and Z expand to more than one
    # symbol; every other entry (including X and '*') maps to itself.
    %IUP = (
        ( map { ( $_ => [$_] ) }
              qw(A C D E F G H I K L M N P Q R S T U V W X Y) ),
        '*' => ['*'],

        B => [ 'D', 'N' ],
        Z => [ 'E', 'Q' ],
    );
}
# Constructor.
#
# Arguments: either the named form (-seq => $seqobj) or the sequence object
#            as the first positional argument.  The object must be a
#            Bio::Seq (or subclass) whose alphabet() is dna, rna or protein.
# Returns:   the new object, primed so that the first call to next_seq()
#            yields the first expansion.
# Throws:    - if no Bio::Seq object is supplied,
#            - if the alphabet is neither dna/rna nor protein,
#            - if the sequence contains a symbol that is absent from the
#              lookup table for its alphabet,
#            - if the sequence is empty.
#
# The supplied sequence object itself (not a copy) is stored in _SeqObj.
sub new {
    my ( $class, @args ) = @_;
    my $self = $class->SUPER::new(@args);

    my ($seq) = $self->_rearrange( [qw(SEQ)], @args );
    if ( !defined($seq) && @args && ref( $args[0] ) ) {
        $seq = $args[0];
    }

    # Check blessed() before calling ->isa so that an undefined value or an
    # unblessed reference produces our own error message rather than Perl's
    # "Can't call method" failure.
    require Scalar::Util;
    ( Scalar::Util::blessed($seq) && $seq->isa('Bio::Seq') )
        or $self->throw("Must supply a Seq.pm object to IUPAC!");
    $self->{'_SeqObj'} = $seq;

    # Pick the lookup table that matches the sequence's alphabet.
    my $alphabet = $seq->alphabet();
    my $code_table;
    if ( defined $alphabet && $alphabet =~ m/^[dr]na$/i ) {
        $code_table = \%IUB;    # nucleotide seq object
    }
    elsif ( defined $alphabet && $alphabet =~ m/^protein$/i ) {
        $code_table = \%IUP;    # amino acid seq object
    }
    else {
        $self->throw("You must specify the 'type' of sequence provided to IUPAC");
    }

    my $residues = $seq->seq();
    $residues = '' unless defined $residues;

    # _alpha holds, per position, the array ref of symbols that position may
    # expand to.  Reject unknown symbols here: an undef entry would only
    # surface later as an obscure failure inside next_seq().
    my @alpha;
    foreach my $symbol ( split( '', $residues ) ) {
        my $expansion = $code_table->{ uc($symbol) };
        defined $expansion
            or $self->throw(
            "Symbol '$symbol' is not a valid IUPAC code for a $alphabet sequence");
        push @alpha, $expansion;
    }
    $self->{'_alpha'} = \@alpha;

    # _string holds, per position, the index of the currently selected
    # expansion in the matching _alpha entry.
    $self->{'_string'} = [ (0) x scalar(@alpha) ];
    scalar @{ $self->{'_string'} } or $self->throw("Sequence has zero-length!");

    # Start position 0 one step "before" its first choice, so that the first
    # increment performed by next_seq() lands on index 0.
    $self->{'_string'}->[0] = -1;

    return $self;
}
# Produce the next unambiguous expansion of the stored sequence.
#
# Returns: the stored sequence object (_SeqObj) with its seq() overwritten
#          by the next expansion and its desc() tagged with
#          " [Bio::Tools::IUPAC-generated unique sequence # N]" (N is
#          comma-grouped, e.g. 1,024); undef once every expansion has been
#          produced.
#
# NOTE: the same object is returned on every call and is modified in place,
# so a caller that needs to keep an expansion must copy it before calling
# next_seq() again.
#
# The per-position counters in _string work like an odometer with position 0
# as the fastest-moving digit: the first position that can still advance is
# incremented; positions that have reached their last choice are reset to 0
# and the "carry" moves on to the next position.
sub next_seq {
    my ($self) = @_;

    # Once exhausted, stay exhausted.  The carry below leaves the counters
    # partly reset, so running the loop again would emit duplicates.
    # (Explicit undef keeps the value callers have always received here.)
    return undef if $self->{'_done'};

    my $counters = $self->{'_string'};
    my $choices  = $self->{'_alpha'};
    my $last_pos = $#{$counters};

    for my $i ( 0 .. $last_pos ) {

        # A position with a single choice can never advance; skip it unless
        # its counter is non-zero (the -1 start marker on position 0).
        next unless $counters->[$i] || @{ $choices->[$i] } > 1;

        if ( $counters->[$i] == $#{ $choices->[$i] } ) {

            # This position is on its last choice.  If it is also the last
            # position there is nothing left to carry into.
            last if $i == $last_pos;
            $counters->[$i] = 0;
            next;
        }

        $counters->[$i]++;

        # Assemble the expansion by picking, for each position, the symbol
        # selected by that position's counter.
        my $pos      = 0;
        my $expanded = join( '', map { $choices->[ $pos++ ]->[$_] } @{$counters} );

        my $seq_obj = $self->{'_SeqObj'};
        $seq_obj->seq($expanded);

        my $desc = $seq_obj->desc();
        $desc = "" unless defined $desc;

        # Comma-group a copy of the running count for display; _num itself
        # stays a plain number.
        $self->{'_num'}++;
        my $label = $self->{'_num'};
        1 while $label =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;

        # Replace a tag left by a previous call, or append one at the end.
        $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $label\]/;
        $seq_obj->desc($desc);

        return $seq_obj;
    }

    # Reached either by the explicit "last" above or by running off the end
    # of the loop (all remaining positions unambiguous).  Previously the
    # second case had no return statement at all.
    $self->{'_done'} = 1;
    return undef;
}