Raw content of Bio::EnsEMBL::Pipeline::Alignment::AlignmentSeq
# Cared for by Dan Andrews
#
# Copyright EnsEMBL
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Pipeline::Alignment::AlignmentSeq.pm
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 CONTACT
Post general queries to B
=cut
package Bio::EnsEMBL::Pipeline::Alignment::AlignmentSeq;
use vars qw(@ISA);
use strict;
use Bio::Seq;
use Bio::EnsEMBL::Utils::Exception qw(throw warning info);
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
@ISA = qw();
sub new {
my ($class, @args) = @_;
my $self = bless {},$class;
my ($name,
$seq,
$deletions,
$start,
$exon,
$type) = rearrange([qw(NAME
SEQ
DELETIONS
START
EXON
TYPE)],@args);
# Optional arguments
$self->seq($seq) if $seq;
$self->name($name) if $name;
$self->start($start) if $start;
$self->exon($exon) if $exon;
$self->type($type) if $type;
return $self;
}
sub name {
my $self = shift;
if (@_) {
$self->{'_name'} = shift;
}
return $self->{'_name'};
}
sub seq {
my $self = shift;
if (@_) {
$self->{'_seq'} = shift;
}
$self->throw('No sequence attached to object')
unless $self->{'_seq'};
return $self->{'_seq'};
}
# Returns the sequence as an array, with one base per array element.
sub seq_array {
my $self = shift;
$self->throw("Cant retrieve seq_array when there is no sequence.")
unless $self->seq;
my @array = split //, $self->seq;
return \@array;
}
# Slightly formatted output
sub seq_with_newlines {
my ($self, $line_length) = @_;
$line_length = 60 unless defined $line_length;
my $seq_string = $self->seq;
$seq_string =~ s/(.{$line_length})/$1\n/g;
return $seq_string;
}
# Writes the sequence back from an array, where one base is
# resident in each array element. Empty elements will be
# written as gaps '-'
sub store_seq_array {
my $self = shift;
my $input_array = shift;
$self->throw("Array for storage contains nothing.")
unless (scalar @$input_array > 0);
my $concat_seq;
for (my $i = 0; $i < scalar @$input_array; $i++){
if (! defined $input_array->[$i] ||
$input_array->[$i] eq ''){
$concat_seq .= '-';
} else {
$concat_seq .= $input_array->[$i];
}
}
$self->seq($concat_seq);
return 1;
}
sub fetch_base_at_position {
my ($self, $base_position) = @_;
$self->throw("Must specify a coordinate in order to retrieve a base from the sequence.")
unless $base_position;
$base_position--;
return substr $self->seq, $base_position, 1;
}
# Specify the insertion position and length of
# gap for insertion into both the sequence and
# the deletion sequence.
sub insert_gap {
my ($self, $insert_position, $gap_length) = @_;
unless (defined $insert_position && $gap_length > 0){
throw("Need to specify gap insertion position [$insert_position] " .
"and length [$gap_length]");
}
my $gap = '-' x $gap_length;
# Stick the gap in the sequence
my $seq = $self->seq;
if (length $seq < ($insert_position - 1)){
info("Attempting to insert gap in sequence [" . $self->name .
"] beyond the end of the sequence. Sequence length [".
length($seq) ."] Insert position [" .
($insert_position - 1) . "]\n");
return 0
}
my $new_seq = substr($seq, 0, $insert_position - 1) .
$gap . substr($seq, $insert_position - 1);
$self->seq($new_seq);
# Take note of the gap in our set of deletion coordinates
for (my $i = 0; $i < $gap_length; $i++) {
$self->increment_deletions_above($insert_position+$i)
}
return 1
}
sub all_gaps {
my $self = shift;
my $sequence_array = $self->seq_array;
my @gap_coordinates;
for (my $i = 0; $i < scalar @$sequence_array; $i++) {
push @gap_coordinates, $i+1 if $sequence_array->[$i] eq '-'
}
return \@gap_coordinates
}
sub add_deletion {
my ($self, $position, $length) = @_;
throw("Add deletion requires a numeric argument, not this [" .
ref($position) . "]")
if (ref($position) ne '');
if ($position > $self->length) {
warning("Attempting to insert a deletion past the end of sequence.");
return 1
}
$length = 1 unless $length;
$self->deletion_hash->{$position} = $length
unless (defined $self->deletion_hash->{$position} &&
$self->deletion_hash->{$position} > $length);
return 1
}
sub increment_deletions_above {
my ($self, $coord) = @_;
my $deletion_hash = $self->deletion_hash;
my @coords = keys %$deletion_hash;
for (my $i = 0; $i < scalar @coords; $i++) {
if ($deletion_hash->{$coords[$i]}){
my $deletion_length = $deletion_hash->{$coords[$i]};
delete $deletion_hash->{$coords[$i]};
$deletion_hash->{$coords[$i]+1} = $deletion_length;
}
}
return 1
}
sub deletion_hash {
my $self = shift;
if (@_){
$self->{'_deletions'} = shift
}
unless ($self->{'_deletions'}){
$self->{'_deletions'} = {};
}
return $self->{'_deletions'}
}
sub deletion_coords {
my $self = shift;
my @deletion_coords = keys %{$self->deletion_hash};
return \@deletion_coords
}
# I'm not sure what this is used for...
sub start {
my $self = shift;
if (@_) {
$self->{'_start'} = shift;
}
return $self->{'_start'}
}
# Find the lengths of gaps both upstream and downstream of
# our sequence.
sub _end_gaps {
my $self = shift;
# Calculate both up and down stream gaps at the same
# time for increased overall speed (assuming that
# both numbers will ultimately be needed).
my $seq = $self->seq;
$seq =~ s/^(\-*)/$1/;
my $upstream_gaps = length $1;
$seq = reverse $seq;
$seq =~ s/^(\-*)/$1/;
my $downstream_gaps = length $1;
$self->{'_first_base_coord'} = $upstream_gaps + 1;
$self->{'_last_base_coord'} = $self->length - $downstream_gaps;
return 1
}
# Find the first non-gap base of our aligned sequence
sub first_base_coord {
my $self = shift;
unless (defined $self->{'_first_base_coord'}){
$self->_end_gaps;
}
return $self->{'_first_base_coord'}
}
# Find the last non-gap base of our aligned_sequence
sub last_base_coord {
my $self = shift;
unless (defined $self->{'_last_base_coord'}){
$self->_end_gaps;
}
return $self->{'_last_base_coord'}
}
# This stores the exon number. Used by
# Bio::EnsEMBL::Pipeline::Alignment::EvidenceAlignment
sub exon {
my $self = shift;
if (@_) {
$self->{'_exon'} = shift;
}
return $self->{'_exon'};
}
# Sequence type store - can be 'nucleotide' or 'protein'
sub type {
my $self = shift;
if (@_) {
my $type = shift;
$self->throw("Unknown sequence type - needs to be either " .
"'nucleotide' or 'protein'.")
unless ($type eq 'nucleotide' || $type eq 'protein');
$self->{'_type'} = $type;
}
return $self->{'_type'};
}
# Useful to know
sub length {
my $self = shift;
return scalar @{$self->seq_array};
}
# Text dump sequence with identifier
sub fasta_string {
my ($self, $line_length) = @_;
$line_length = 60
unless $line_length;
return '>' . $self->name . "\n" .
$self->seq_with_newlines($line_length) . "\n";
}
1;