script to parse a fasta file and identify sequences with polyA/T tails/heads
these are then clipped and stored in a file
Copied directly from sd3's /ensembl-pipeline/scripts/EST/new_polyA_clipping.pl
and made into a rough module
Can also return which end of a sequence was clipped (head or tail) and how many bases were removed
clipping:
the non-A/T sequences at the ends must be <=10bp (set by $buffer)
the polyA/T string must be >4bp to be removed
it only clips polyA tails or polyT heads using a sliding window of 3 bp
the clipping is recursive but only clips one end of a sequence
the head/tail is only clipped if the polyA/T string is longer than the non-polyA/T string at the end of the sequence
perl new_polyA_clipping.pl sequences.fasta polyat_clipped.out
sub clip_if_necessary
{ my ($unprepared, $buffer, $window_size) = @_;
my $clipped = undef;
my $clipped_seq = "";
my $clip_end;
if (!defined $buffer) {
$buffer = 10;
}
if (!defined $window_size) {
$window_size = 3;
}
my $end_region = $buffer + $window_size;
my $unclipped = prepare_seq($unprepared);
my $seq = $unclipped->seq;
if (length $seq >= $end_region) {
my $head = substr($seq, 0, $end_region);
my $tail = substr($seq, -$end_region);
my $t_count = 0;
my $a_count = 0;
while ($head=~/([Tt]+)/g) { if (length $1 > $t_count) {
$t_count = length $1;
}
}
while ($tail=~/([Aa]+)/g) { if (length $1 > $a_count) {
$a_count = length $1;
}
}
if (($a_count > $t_count) && ($a_count > 4)) { $clipped_seq = clip_polya($seq, "tail", $buffer, $window_size, $end_region);
$clip_end = "tail";
} elsif (($a_count < $t_count) && ($t_count > 4)) { $clipped_seq = clip_polya($seq, "head", $buffer, $window_size, $end_region);
$clip_end = "head";
} else {
if ($a_count > 4) { my $clipped_head = clip_polya($seq, "head", $buffer, $window_size, $end_region);
my $clipped_tail = clip_polya($seq, "tail", $buffer, $window_size, $end_region);
if (length $clipped_head < length $clipped_tail) {
$clipped_seq = $clipped_head;
$clip_end = "head";
} elsif (length $clipped_tail < length $clipped_head) {
$clipped_seq = $clipped_tail;
$clip_end = "tail";
} else { $clipped_seq = $seq;
}
} else { $clipped_seq = $seq;
}
}
} else { $clipped_seq = $seq;
}
my $num_bases_removed;
if (defined $clipped_seq) {
$clipped = new Bio::Seq;
eval {
$clipped->seq($clipped_seq);
$clipped->display_id($unclipped->display_id);
$clipped->desc("");
};
$num_bases_removed = (length $seq) - (length $clipped_seq);
} else {
warning("Sequence ".$unclipped->display_id." has been removed:\n$seq\n");
}
return ($clipped, $clip_end, $num_bases_removed); } |
sub clip_polya
{ my ($seq, $end, $buffer, $window_size, $end_region) = @_;
my @seq = split//, $seq;
my $length = length $seq;
my $a_count = 0;
my $t_count = 0;
my $clipped_seq = $seq;
if ($end eq "tail") {
my $tail = substr($seq, -$end_region);
while ($tail =~ m/([Aa]+)/g){ #will match greedily if (length $1 > $a_count){ $a_count = length $1; }
}
} elsif ($end eq "head") {
my $head = substr($seq, 0, $end_region);
while ($head =~ m/([Tt]+)/g){ #will match greedily if (length $1 > $t_count){ $t_count = length $1; }
}
}
if ($a_count > 4) {
for (my $i = ($length - 1); $i > ($length - $buffer); $i--) {
my $match = 0;
for (my $j = $i; $j > ($i - $window_size); $j--) { if ($seq[$j] eq 'A') {
$match++;
}
}
if ($match > 1) { my $pos = $i;
while ($pos > ($window_size - 1)) {
if (ord($seq[$pos]) == 65) { $match--;
}
if (ord($seq[$pos - $window_size]) == 65) {
$match++;
}
if ($match < 2) {
my $polya_len = $length - ($pos - ($window_size - 1)) - ($length - ($i + 1));
if ($polya_len > ($length - $i)) {
my $len;
for (my $j = ($pos - $window_size); $j <= $pos; $j++) {
if (ord($seq[$j]) != 65) {
$len = ($j + 1);
} else {
last;
}
}
$clipped_seq = substr($seq, 0, $len);
$clipped_seq = clip_polya($clipped_seq, $end, $buffer, $window_size, $end_region);
} $pos = 0; } $pos--;
}
if ($match > 1) {
$clipped_seq = undef;
}
last; } }
} elsif ($t_count > 4) {
for (my $i = 0; $i <= $buffer; $i++) {
my $match = 0;
for (my $j = $i; $j < ($i + $window_size); $j++) { if ($seq[$j] eq 'T') {
$match++;
}
}
if ($match > 1) { my $pos = $i;
while ($pos < ($length - $window_size)) {
if (ord($seq[$pos]) == 84) { $match--;
}
if (ord($seq[$pos + $window_size]) == 84) {
$match++;
}
if ($match < 2) {
my $polyt_len = ($pos + ($window_size - 1)) - ($i - 1);
if ($polyt_len > $i) {
my $len;
for (my $j = ($pos + ($window_size)); $j >= $pos; $j--) {
if (ord($seq[$j]) != 84) {
$len = $j;
} else {
last;
}
}
$clipped_seq = substr($seq, $len);
$clipped_seq = clip_polya($clipped_seq, $end, $buffer, $window_size, $end_region);
} $pos = $length; } $pos++;
}
if ($match > 1) {
$clipped_seq = undef;
}
last; } } } else {
} return $clipped_seq;
}
1; } |
sub prepare_seq
{ my ($unclipped) = @_;
my $prepared = new Bio::Seq;
my $tmp = $unclipped->display_id;
$tmp =~ s/\>//;
my $id;
if ($tmp =~ m/^gi.+ref\|(N[MR]_.+)\|/) { $id = $1; } elsif ($tmp =~ m/^[\w\d]+\s([\w\.\d]+)\s.+\n{1}?/) { $id = $1; } elsif ($tmp =~m/^[\w\.\d]+\s.+\n{1}?/) {
$id = $tmp;
} elsif ($tmp =~ m/[\w\.\d]+/) { $id = $tmp; } else {
throw("Cannot extract the input id from:\n ".$unclipped->id."\nTry making your file so that the first line of each entry ".
"only contains the id, eg:\n >BC000830.1\n");
}
$prepared->display_id($id);
my $seq = $unclipped->seq;
$seq =~ tr/a-z/A-Z/;
$prepared->seq($seq);
return $prepared; } |