Bio::EnsEMBL::Analysis::Tools PolyAClipping
SummaryIncluded librariesPackage variablesDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
polA-clipping.pl
Package variables
Globals (from "use vars" definitions)
@EXPORT = qw(clip_if_necessary)
Included modules
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Bio::Seq
Exporter
Inherit
Exporter
Synopsis
No synopsis!
Description
         
script to parse a fasta file and identify sequences with polyA/T tails/heads
these are then clipped and stored in a file
Copied directly from sd3's /ensembl-pipeline/scripts/EST/new_polyA_clipping.pl
and made into a rough module
Can also return which end of a sequence was clipped (head or tail) and how many bases were removed
clipping:
the non-A/T sequences at the ends must be <=10bp (set by $buffer)
the polyA/T string must be >4bp to be removed
it only clips polyA tails or polyT heads using a sliding window of 3 bp
the clipping is recursive but only clips one end of a sequence
the head/tail is only clipped if the polyA/T string is longer than the non-polyA/T string at the end of the sequence
perl new_polyA_clipping.pl sequences.fasta polyat_clipped.out
Methods
clip_if_necessaryDescriptionCode
clip_polyaDescriptionCode
prepare_seqDescriptionCode
Methods description
clip_if_necessary code    nextTop
  Arg [1]    : Bio::Seq  
Arg [2] : Int - buffer size (optional)
Arg [3] : Int - window_size (optional)
Example : my ($clipped, $clip_end, $num_bases_removed) = clip_if_necessary($cdna,$buffer,$window)
Description: Decides whether a cDNA needs to be clipped

Returntype : String - the clipped sequence - or undef
Exceptions : none
clip_polya codeprevnextTop
  Arg [1]    : String - the cDNA sequence 
Arg [2] : String - "head" or "tail" - end of sequence to be clipped
Arg [3] : Int - buffer
Arg [4] : Int - window
Arg [5] : Int - end region
Example : $clipped_seq = clip_polya($seq, "head");
Description: Clips polyA or polyT sequences (case-sensitive)
Returntype : String - the clipped sequence - or undef
Exceptions : none
prepare_seqcodeprevnextTop
  Arg [1]    : Bio::Seq 
Example : my $prepared = prepare_seq($unprepared)
Description: Checks the display_id is OK
Makes all sequence uppercase
Returntype : Bio::Seq
Exceptions : Can't read id
Methods code
clip_if_necessarydescriptionprevnextTop
sub clip_if_necessary {
  my ($unprepared, $buffer, $window_size) = @_;
  my $clipped = undef;
  my $clipped_seq = "";
  my $clip_end;

  if (!defined $buffer) {
    #length of region allowed to mismatch at ends of sequence
$buffer = 10; } if (!defined $window_size) { $window_size = 3; } my $end_region = $buffer + $window_size; my $unclipped = prepare_seq($unprepared); #print ">".$unclipped->display_id."\n".$unclipped->seq."\n";
# # #
# If the sequence is long enough, clip it
# # #
my $seq = $unclipped->seq; if (length $seq >= $end_region) { #check for poly-A/T tails/heads - which should have mostly been removed...
#decide whether looking for polyT head or polyA tail:
my $head = substr($seq, 0, $end_region); my $tail = substr($seq, -$end_region); my $t_count = 0; my $a_count = 0; while ($head=~/([Tt]+)/g) { #will match greedily
if (length $1 > $t_count) { $t_count = length $1; } } while ($tail=~/([Aa]+)/g) { #will match greedily
if (length $1 > $a_count) { $a_count = length $1; } } #decide whether to clip heads, tails or unsure... set appropriate flag
if (($a_count > $t_count) && ($a_count > 4)) { #call the subroutine to trim the polyA tail:
$clipped_seq = clip_polya($seq, "tail", $buffer, $window_size, $end_region); $clip_end = "tail"; } elsif (($a_count < $t_count) && ($t_count > 4)) { #call the subroutine to trim the polyT head:
$clipped_seq = clip_polya($seq, "head", $buffer, $window_size, $end_region); $clip_end = "head"; } else { if ($a_count > 4) { #only do for ones which appear to have a head/tail:
#tied - not sure which to do -try both and choose afterwards
my $clipped_head = clip_polya($seq, "head", $buffer, $window_size, $end_region); my $clipped_tail = clip_polya($seq, "tail", $buffer, $window_size, $end_region); #choose one which clipped the most:
if (length $clipped_head < length $clipped_tail) { $clipped_seq = $clipped_head; $clip_end = "head"; } elsif (length $clipped_tail < length $clipped_head) { $clipped_seq = $clipped_tail; $clip_end = "tail"; } else { #still can't tell, leave as original seq
$clipped_seq = $seq; } } else { # $a_count <= 4
#not going to be clipped
$clipped_seq = $seq; } } } else { # length $seq < $end_region so no need to clip
$clipped_seq = $seq; } # # #
# Check that we have a clipped sequence
# # #
my $num_bases_removed; if (defined $clipped_seq) { $clipped = new Bio::Seq; eval { $clipped->seq($clipped_seq); $clipped->display_id($unclipped->display_id); $clipped->desc(""); }; #print "$unclipped->display_id: $seq\n$cdna->display_id: $clipped_seq\n\n";
$num_bases_removed = (length $seq) - (length $clipped_seq); } else { #the entire sequence seems to be polyA/T tail/head
warning("Sequence ".$unclipped->display_id." has been removed:\n$seq\n"); } # We are now finished reading in all the sequences
return ($clipped, $clip_end, $num_bases_removed);
}
clip_polyadescriptionprevnextTop
sub clip_polya {
  my ($seq, $end, $buffer, $window_size, $end_region) = @_;  
  
  my @seq = split//, $seq;
  my $length = length $seq;
  my $a_count = 0;
  my $t_count = 0;
  
  my $clipped_seq = $seq;
  
  # # #
# Count the number of consecutive A's or T's
# # #
if ($end eq "tail") { my $tail = substr($seq, -$end_region); while ($tail =~ m/([Aa]+)/g){ #will match greedily
if (length
$1 > $a_count){
$a_count = length $1;
} } } elsif ($end eq "head") { my $head = substr($seq, 0, $end_region); while ($head =~ m/([Tt]+)/g){ #will match greedily
if (length
$1 > $t_count){
$t_count = length $1;
} } } # # #
# Treat the tail (high a_count) and head (high t_count) differently
# first look at tails
# looking only for polyA tail - use moving window looking for strings of 2/3 A's
# # #
if ($a_count > 4) { #moving through seq starting from end - allow for buffer region:
for (my $i = ($length - 1); $i > ($length - $buffer); $i--) { my $match = 0; for (my $j = $i; $j > ($i - $window_size); $j--) { #check a window
if ($seq[$j] eq 'A') { $match++; } } if ($match > 1) { #if (2+)/3 = A - looks like a polyA tail
#in a polyA region - want to see how far this extends:
my $pos = $i; while ($pos > ($window_size - 1)) { #move the window one position:
if (ord($seq[$pos]) == 65) { #65 = decimal for 'A'
$match--; } if (ord($seq[$pos - $window_size]) == 65) { $match++; } if ($match < 2) { #at end of the polyA region:
#find length of polyA region: polya_len = (seq length - non-tail length - post-tail buffer length)
my $polya_len = $length - ($pos - ($window_size - 1)) - ($length - ($i + 1)); #test to see if polyA string > post polyA region:
if ($polya_len > ($length - $i)) { #we now want to clip end of sequence
#identify last non-A in this window:
my $len; for (my $j = ($pos - $window_size); $j <= $pos; $j++) { if (ord($seq[$j]) != 65) { $len = ($j + 1); } else { last; } } $clipped_seq = substr($seq, 0, $len); #now, it might be that the sequence look something like ....AAAAACGAAAAA
#in which case the newly clipped ....AAAAACG can be reexamined
$clipped_seq = clip_polya($clipped_seq, $end, $buffer, $window_size, $end_region); } # end of if ($polya_len > ($length - $i)) {
$pos = 0; #break out of while loop
} # end of if ($match < 2) {
$pos--; } # end of while ($pos > ($window_size - 1)) {
if ($match > 1) { #then the while loop was finished whilst still on a polyA-tail
$clipped_seq = undef; } last; #move onto a new sequence
} # end of if ($match > 1) {
} # close the forlopp [i]
# # #
# now look at the heads
# looking only for polyT head - use moving window looking for strings of 2/3 T's
# # #
} elsif ($t_count > 4) { #moving through seq from front:
for (my $i = 0; $i <= $buffer; $i++) { my $match = 0; for (my $j = $i; $j < ($i + $window_size); $j++) { #check a window
if ($seq[$j] eq 'T') { $match++; } } if ($match > 1) { #if (2+)/3 = T - looks like a polyT head
my $pos = $i; #in a polyT region - want to see how far this extends:
while ($pos < ($length - $window_size)) { #move the window one position
if (ord($seq[$pos]) == 84) { #eq 'T'
$match--; } if (ord($seq[$pos + $window_size]) == 84) { $match++; } if ($match < 2) { #at end of polyT region:
#find length of polyT region: polyt_len = (head length - pre-head buffer length)
my $polyt_len = ($pos + ($window_size - 1)) - ($i - 1); #test to see if polyT string > pre polyT region:
if ($polyt_len > $i) { #we now want to clip front of sequence:
#identify first non-T in this window:
my $len; for (my $j = ($pos + ($window_size)); $j >= $pos; $j--) { if (ord($seq[$j]) != 84) { $len = $j; } else { last; } } $clipped_seq = substr($seq, $len); #now, it might be that the sequence look something like TTTTTCGTTTTT...
#in which case the newly clipped CGTTTTT... can be reexamined
$clipped_seq = clip_polya($clipped_seq, $end, $buffer, $window_size, $end_region); } # end of if ($match < 2) {
$pos = $length; #break out of while loop
} # end of while ($pos < ($length - $window_size)) {
$pos++; } # end of while ($pos < ($length - $window_size)) {
if ($match > 1) { #then the while loop was finished whilst still on a polyA-tail
$clipped_seq = undef; } last; #move onto a new sequence
} # end of if ($match > 1) {
} # end of for (my $i = 0; $i <= $buffer; $i++) {
} else { # no polyA or polyT
# do nothing
} # end of } elsif ($t_count > 4) {
return $clipped_seq; } 1;
}
prepare_seqdescriptionprevnextTop
sub prepare_seq {
  my ($unclipped) = @_;
  my $prepared = new Bio::Seq;
    # # #
# just want the id on the first line:
# # #
my $tmp = $unclipped->display_id; $tmp =~ s/\>//; my $id; if ($tmp =~ m/^gi.+ref\|(N[MR]_.+)\|/) {
$id = $1;
} elsif ($tmp =~ m/^[\w\d]+\s([\w\.\d]+)\s.+\n{1}?/) {
$id = $1;
} elsif ($tmp =~m/^[\w\.\d]+\s.+\n{1}?/) { #already in correct format - do nothing
$id = $tmp; } elsif ($tmp =~ m/[\w\.\d]+/) {
$id = $tmp;
} else { throw("Cannot extract the input id from:\n ".$unclipped->id."\nTry making your file so that the first line of each entry ". "only contains the id, eg:\n >BC000830.1\n"); } # make a new clipped cdna
$prepared->display_id($id); # change seq to uppercase (as pattern matching later on is case-sensitive)
my $seq = $unclipped->seq; $seq =~ tr/a-z/A-Z/; $prepared->seq($seq); return $prepared;
}
General documentation
No general documentation available.