Raw content of Bio::EnsEMBL::Analysis::Tools::PolyAClipping
#!/usr/local/ensembl/bin/perl -w
use strict;
use warnings;
=pod
=head1 NAME
polA-clipping.pl
=head1 DESCRIPTION
script to parse a fasta file and identify sequences with polyA/T tails/heads
these are then clipped and stored in a file
Copied directly from sd3's /ensembl-pipeline/scripts/EST/new_polyA_clipping.pl
and made into a rough module
Can also return which end of a sequence was clipped (head or tail) and how many bases were removed
clipping:
the non-A/T sequences at the ends must be <=10bp (set by $buffer)
the polyA/T string must be >4bp to be removed
it only clips polyA tails or polyT heads using a sliding window of 3 bp
the clipping is recursive but only clips one end of a sequence
the head/tail is only clipped if the polyA/T string is longer than the non-polyA/T string at the end of the sequence
perl new_polyA_clipping.pl sequences.fasta polyat_clipped.out
=cut
package Bio::EnsEMBL::Analysis::Tools::PolyAClipping;
use Bio::Seq;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Exporter;
use vars qw(@ISA @EXPORT);
@ISA = qw(Exporter);
@EXPORT = qw(clip_if_necessary);
=head2 clip_if_necessary
Arg [1] : Bio::Seq
Arg [2] : Int - buffer size (optional)
Arg [3] : Int - window_size (optional)
Example : my ($clipped, $clip_end, $num_bases_removed) = clip_if_necessary($cdna,$buffer,$window)
Description: Decides whether a cDNA needs to be clipped
Returntype : String - the clipped sequence - or undef
Exceptions : none
=cut
sub clip_if_necessary {
my ($unprepared, $buffer, $window_size) = @_;
my $clipped = undef;
my $clipped_seq = "";
my $clip_end;
if (!defined $buffer) {
#length of region allowed to mismatch at ends of sequence
$buffer = 10;
}
if (!defined $window_size) {
$window_size = 3;
}
my $end_region = $buffer + $window_size;
my $unclipped = prepare_seq($unprepared);
#print ">".$unclipped->display_id."\n".$unclipped->seq."\n";
# # #
# If the sequence is long enough, clip it
# # #
my $seq = $unclipped->seq;
if (length $seq >= $end_region) {
#check for poly-A/T tails/heads - which should have mostly been removed...
#decide whether looking for polyT head or polyA tail:
my $head = substr($seq, 0, $end_region);
my $tail = substr($seq, -$end_region);
my $t_count = 0;
my $a_count = 0;
while ($head=~/([Tt]+)/g) { #will match greedily
if (length $1 > $t_count) {
$t_count = length $1;
}
}
while ($tail=~/([Aa]+)/g) { #will match greedily
if (length $1 > $a_count) {
$a_count = length $1;
}
}
#decide whether to clip heads, tails or unsure... set appropriate flag
if (($a_count > $t_count) && ($a_count > 4)) { #call the subroutine to trim the polyA tail:
$clipped_seq = clip_polya($seq, "tail", $buffer, $window_size, $end_region);
$clip_end = "tail";
} elsif (($a_count < $t_count) && ($t_count > 4)) { #call the subroutine to trim the polyT head:
$clipped_seq = clip_polya($seq, "head", $buffer, $window_size, $end_region);
$clip_end = "head";
} else {
if ($a_count > 4) { #only do for ones which appear to have a head/tail:
#tied - not sure which to do -try both and choose afterwards
my $clipped_head = clip_polya($seq, "head", $buffer, $window_size, $end_region);
my $clipped_tail = clip_polya($seq, "tail", $buffer, $window_size, $end_region);
#choose one which clipped the most:
if (length $clipped_head < length $clipped_tail) {
$clipped_seq = $clipped_head;
$clip_end = "head";
} elsif (length $clipped_tail < length $clipped_head) {
$clipped_seq = $clipped_tail;
$clip_end = "tail";
} else { #still can't tell, leave as original seq
$clipped_seq = $seq;
}
} else { # $a_count <= 4
#not going to be clipped
$clipped_seq = $seq;
}
}
} else { # length $seq < $end_region so no need to clip
$clipped_seq = $seq;
}
# # #
# Check that we have a clipped sequence
# # #
my $num_bases_removed;
if (defined $clipped_seq) {
$clipped = new Bio::Seq;
eval {
$clipped->seq($clipped_seq);
$clipped->display_id($unclipped->display_id);
$clipped->desc("");
};
#print "$unclipped->display_id: $seq\n$cdna->display_id: $clipped_seq\n\n";
$num_bases_removed = (length $seq) - (length $clipped_seq);
} else {
#the entire sequence seems to be polyA/T tail/head
warning("Sequence ".$unclipped->display_id." has been removed:\n$seq\n");
}
# We are now finished reading in all the sequences
return ($clipped, $clip_end, $num_bases_removed);
}
=head2 prepare_seq
Arg [1] : Bio::Seq
Example : my $prepared = prepare_seq($unprepared)
Description: Checks the display_id is OK
Makes all sequence uppercase
Returntype : Bio::Seq
Exceptions : Can't read id
=cut
sub prepare_seq {
my ($unclipped) = @_;
my $prepared = new Bio::Seq;
# # #
# just want the id on the first line:
# # #
my $tmp = $unclipped->display_id;
$tmp =~ s/\>//;
my $id;
if ($tmp =~ m/^gi.+ref\|(N[MR]_.+)\|/) {
$id = $1;
} elsif ($tmp =~ m/^[\w\d]+\s([\w\.\d]+)\s.+\n{1}?/) {
$id = $1;
} elsif ($tmp =~m/^[\w\.\d]+\s.+\n{1}?/) {
#already in correct format - do nothing
$id = $tmp;
} elsif ($tmp =~ m/[\w\.\d]+/) {
$id = $tmp;
} else {
throw("Cannot extract the input id from: \n".$unclipped->id."\nTry making your file so that the first line of each entry ".
"only contains the id, eg: \n>BC000830.1\n");
}
# make a new clipped cdna
$prepared->display_id($id);
# change seq to uppercase (as pattern matching later on is case-sensitive)
my $seq = $unclipped->seq;
$seq =~ tr/a-z/A-Z/;
$prepared->seq($seq);
return $prepared;
}
=head2 clip_polya
Arg [1] : String - the cDNA sequence
Arg [2] : String - "head" or "tail" - end of sequence to be clipped
Arg [3] : Int - buffer
Arg [4] : Int - window
Arg [5] : Int - end region
Example : $clipped_seq = clip_polya($seq, "head");
Description: Clips polyA or polyT sequences (case-sensitive)
Returntype : String - the clipped sequence - or undef
Exceptions : none
=cut
sub clip_polya {
my ($seq, $end, $buffer, $window_size, $end_region) = @_;
my @seq = split//, $seq;
my $length = length $seq;
my $a_count = 0;
my $t_count = 0;
my $clipped_seq = $seq;
# # #
# Count the number of consecutive A's or T's
# # #
if ($end eq "tail") {
my $tail = substr($seq, -$end_region);
while ($tail =~ m/([Aa]+)/g){ #will match greedily
if (length $1 > $a_count){
$a_count = length $1;
}
}
} elsif ($end eq "head") {
my $head = substr($seq, 0, $end_region);
while ($head =~ m/([Tt]+)/g){ #will match greedily
if (length $1 > $t_count){
$t_count = length $1;
}
}
}
# # #
# Treat the tail (high a_count) and head (high t_count) differently
# first look at tails
# looking only for polyA tail - use moving window looking for strings of 2/3 A's
# # #
if ($a_count > 4) {
#moving through seq starting from end - allow for buffer region:
for (my $i = ($length - 1); $i > ($length - $buffer); $i--) {
my $match = 0;
for (my $j = $i; $j > ($i - $window_size); $j--) { #check a window
if ($seq[$j] eq 'A') {
$match++;
}
}
if ($match > 1) { #if (2+)/3 = A - looks like a polyA tail
#in a polyA region - want to see how far this extends:
my $pos = $i;
while ($pos > ($window_size - 1)) {
#move the window one position:
if (ord($seq[$pos]) == 65) { #65 = decimal for 'A'
$match--;
}
if (ord($seq[$pos - $window_size]) == 65) {
$match++;
}
if ($match < 2) {
#at end of the polyA region:
#find length of polyA region: polya_len = (seq length - non-tail length - post-tail buffer length)
my $polya_len = $length - ($pos - ($window_size - 1)) - ($length - ($i + 1));
#test to see if polyA string > post polyA region:
if ($polya_len > ($length - $i)) {
#we now want to clip end of sequence
#identify last non-A in this window:
my $len;
for (my $j = ($pos - $window_size); $j <= $pos; $j++) {
if (ord($seq[$j]) != 65) {
$len = ($j + 1);
} else {
last;
}
}
$clipped_seq = substr($seq, 0, $len);
#now, it might be that the sequence look something like ....AAAAACGAAAAA
#in which case the newly clipped ....AAAAACG can be reexamined
$clipped_seq = clip_polya($clipped_seq, $end, $buffer, $window_size, $end_region);
} # end of if ($polya_len > ($length - $i)) {
$pos = 0; #break out of while loop
} # end of if ($match < 2) {
$pos--;
} # end of while ($pos > ($window_size - 1)) {
if ($match > 1) {
#then the while loop was finished whilst still on a polyA-tail
$clipped_seq = undef;
}
last; #move onto a new sequence
} # end of if ($match > 1) {
} # close the forlopp [i]
# # #
# now look at the heads
# looking only for polyT head - use moving window looking for strings of 2/3 T's
# # #
} elsif ($t_count > 4) {
#moving through seq from front:
for (my $i = 0; $i <= $buffer; $i++) {
my $match = 0;
for (my $j = $i; $j < ($i + $window_size); $j++) { #check a window
if ($seq[$j] eq 'T') {
$match++;
}
}
if ($match > 1) { #if (2+)/3 = T - looks like a polyT head
my $pos = $i;
#in a polyT region - want to see how far this extends:
while ($pos < ($length - $window_size)) {
#move the window one position
if (ord($seq[$pos]) == 84) { #eq 'T'
$match--;
}
if (ord($seq[$pos + $window_size]) == 84) {
$match++;
}
if ($match < 2) {
#at end of polyT region:
#find length of polyT region: polyt_len = (head length - pre-head buffer length)
my $polyt_len = ($pos + ($window_size - 1)) - ($i - 1);
#test to see if polyT string > pre polyT region:
if ($polyt_len > $i) {
#we now want to clip front of sequence:
#identify first non-T in this window:
my $len;
for (my $j = ($pos + ($window_size)); $j >= $pos; $j--) {
if (ord($seq[$j]) != 84) {
$len = $j;
} else {
last;
}
}
$clipped_seq = substr($seq, $len);
#now, it might be that the sequence look something like TTTTTCGTTTTT...
#in which case the newly clipped CGTTTTT... can be reexamined
$clipped_seq = clip_polya($clipped_seq, $end, $buffer, $window_size, $end_region);
} # end of if ($match < 2) {
$pos = $length; #break out of while loop
} # end of while ($pos < ($length - $window_size)) {
$pos++;
} # end of while ($pos < ($length - $window_size)) {
if ($match > 1) {
#then the while loop was finished whilst still on a polyA-tail
$clipped_seq = undef;
}
last; #move onto a new sequence
} # end of if ($match > 1) {
} # end of for (my $i = 0; $i <= $buffer; $i++) {
} else {
# no polyA or polyT
# do nothing
} # end of } elsif ($t_count > 4) {
return $clipped_seq;
}
1;