Raw content of Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment
# Cared for by Dan Andrews
#
# Copyright EnsEMBL
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment
=head1 SYNOPSIS
use Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment;
# Create our object, specifying the genetic code needed to
# translate our sequences. The commonest codes are:
# universal => 1
# vertebrate mitochondrial => 2
# These numbers are the same as those you need to specify to
# translate any Bio::Seq object. Hence if you need a truly
# oddball genetic code, check the Bio::Seq module documentation.
my $cba =
Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment->new(
-genetic_code => 1,
-seqs => \@seqs);
# The sequences passed to the module should be a reference to
# an array of Bio::Seq objects. Everything is going to fall apart
# if you pass amino acid sequences here, so make sure they are
# nucleotide sequences.
# To run the the actual alignment process do the following.
# The return value is an array of Bio::Seq objects, that
# when dumped to a multiple fasta file or the like, will
# display a multiple alignment.
my $align = $cba->run_alignment;
foreach my $seq (@$align){
my $string = $seq->seq;
$string =~ s/(.{60})/$1\n/g;
print STDOUT ">" . $seq->display_id . "\n$string";
}
=head1 DESCRIPTION
=head1 CONTACT
Post general queries to B
=cut
package Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment;
use strict;
use Bio::EnsEMBL::Root;
use Bio::Seq;
use Bio::SeqIO;
use Bio::Tools::Run::Alignment::Clustalw;
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(warning throw);
sub new {
my ($class, @args) = @_;
my $self = bless {},$class;
my ($genetic_code,
$seqs,
) = rearrange([qw(GENETIC_CODE
SEQS
)], @args);
unless (defined $genetic_code){
warning("Genetic code not specified. Defaulting " .
"to the so-called universal code.");
$genetic_code = 1;
}
$self->genetic_code($genetic_code);
$self->sequences($seqs) if defined $seqs;
return $self;
}
sub filename {
my $self = shift;
$self->{filename} = shift if @_;
return $self->{filename}
}
sub sequences_from_file {
my $self = shift;
my $seqio = Bio::SeqIO->new(-file => $self->filename,
-format => 'fasta');
my @seqs;
while (my $seq = $seqio->next_seq){
push @seqs, $seq;
}
$self->sequences(\@seqs);
return 1
}
sub sequences {
my $self = shift;
if (@_) {
$self->{_sequences} = shift;
foreach my $seq (@{$self->{_sequences}}){
throw("Unexpected sequence type. Should be Bio::Seq, " .
"but was [$seq]")
unless $seq->isa("Bio::Seq");
}
}
return $self->{_sequences}
}
### Alignment Stuff ###
sub run_alignment {
my ($self) = @_;
my $clustalw = Bio::Tools::Run::Alignment::Clustalw->new();
my $seqs = $self->sequences;
unless (scalar @$seqs > 1) {
die "It takes more than one sequence to clustalw";
return 0
}
my %nt_seqs;
my %desc_lookup;
foreach my $nt_seq (@$seqs){
$nt_seqs{$nt_seq->display_id} = $nt_seq;
$desc_lookup{$nt_seq->display_id} = $nt_seq->desc;
}
my @aa_seqs;
foreach my $seq (@$seqs) {
push (@aa_seqs, $seq->translate(undef, undef, undef, $self->genetic_code));
}
my $alignment = $clustalw->align(\@aa_seqs);
my @aligned_seqs;
my $longest_seq = 0;
foreach my $aligned_seq ($alignment->each_seq){
# Tack description lines back onto our aligned sequences.
$aligned_seq->desc($desc_lookup{$aligned_seq->display_id});
push(@aligned_seqs, $aligned_seq);
$longest_seq = $aligned_seq->length if $aligned_seq->length > $longest_seq;
}
foreach my $aligned_seq (@aligned_seqs) {
my $nt_work_seq = $nt_seqs{$aligned_seq->display_id}->seq;
$nt_work_seq =~ s/(...)/$1:/g;
my @nt_seq_array = split /:/, $nt_work_seq;
my @aa_seq_array = split //, $aligned_seq->seq;
while (scalar @aa_seq_array < $longest_seq){
push(@aa_seq_array, '-');
print "Adding gap to make sequences same length.\n";
}
# throw("nt and aa sequences are not of corresponding lengths.")
# if scalar @nt_seq_array != scalar @aa_seq_array;
my $aligned_nt_string = '';
foreach my $aa (@aa_seq_array) {
if ($aa eq '.' || $aa eq '-'){
$aligned_nt_string .= '---';
} else {
my $codon = shift @nt_seq_array;
# Pad out any partial codon (some do exist).
unless ($codon =~ /.../){$codon = '---';}
$aligned_nt_string .= $codon;
}
}
# If the nt sequence included a stop codon, tack this
# on here (the aa sequence will be missing this).
if (scalar @nt_seq_array == 1 && $nt_seq_array[-1] =~ /.../){
$aligned_nt_string .= join '', @nt_seq_array;
}
my $aligned_nt_bioseq = Bio::Seq->new(-display_id => $aligned_seq->display_id,
-desc => $aligned_seq->desc,
-seq => $aligned_nt_string);
$self->_alignment('add', $aligned_nt_bioseq);
}
return $self->_alignment;
}
sub _alignment {
my ($self, $action, $seq) = @_;
if (defined $action && $action eq 'add') {
push (@{$self->{_alignment}}, $seq);
} elsif (defined $action && $action eq 'clear') {
$self->{_alignment} = [];
} elsif (defined $action) {
warning("CodonBasedAlignment::_alignment - unknown action specified ".
"[$action]. Must be \'add\' or \'clear\'");
}
return $self->{_alignment};
}
sub genetic_code {
my $self = shift;
if (@_){
$self->{_genetic_code} = shift;
}
throw("Genetic code not specified.")
unless $self->{_genetic_code};
return $self->{_genetic_code}
}
return 1;