my $seq; # a Bio::Seq object
my $polyA = Bio::EnsEMBL::Utils::PolyA->new();
# returns a new Bio::Seq object with the trimmed sequence
my $trimmed_seq = $polyA->clip($seq);
# cat put Ns in the place of the polyA/polyT tail
my $masked_seq = $polyA->mask($seq);
# can put in lower case the polyA/polyT using any flag:
my $softmasked_seq = $poly->mask( $seq, 'soft' );
It reads a Bio::Seq object, it first finds out whether it has a
polyA or a polyT and then performs one operation in the seq string:
clipping, masking or softmasking. It then returns a new Bio::Seq
object with the new sequence.
sub _clip
{ my ($self,$clip) = @_;
if (defined($clip)){
$self->{_clip} = $clip;
}
return $self->{_clip};
}
} |
sub _find_polyA
{ my ($self, $seq) = @_;
my $new_seq;
my $length = length($seq);
my $check_polyT = substr( $seq, 0, 6 );
my $check_polyA = substr( $seq, -6 );
my $t_count = $check_polyT =~ tr/Tt//;
my $a_count = $check_polyA =~ tr/Aa//;
if ( $a_count >= 5 && $a_count > $t_count ){
my $length_to_mask = 0;
my ($piece, $count ) = (3,0);
my $n_count = 0;
while( $length_to_mask < $length ){
my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece);
$count = $chunk =~ tr/Aa//;
$n_count = $chunk =~ tr/Nn//;
if ( ($count + $n_count) >= 2*( $piece )/3 ){ $length_to_mask += 3; }
else{
last;
}
}
if ( $length_to_mask > 0 ){
my $last_base = substr( $seq, ( $length - $length_to_mask ), 1);
my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1);
if ( !( $last_base eq 'A' || $last_base eq 'a') ){
$length_to_mask--;
}
elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){
$length_to_mask++;
}
my $clipped_seq = substr( $seq, 0, $length - $length_to_mask );
my $mask;
if ( $self->_clip ){
$mask = "";
}
elsif( $self->_mask ){
$mask = "N" x ($length_to_mask);
}
elsif ( $self->_softmask ){
$mask = lc substr( $seq, ( $length - $length_to_mask ) );
}
$new_seq = $clipped_seq . $mask;
}
else{
$new_seq = $seq;
}
}
elsif( $t_count >=5 && $t_count > $a_count ){
my $length_to_mask = -3;
my ($piece, $count) = (3,3);
my $n_count = 0;
while ( $length_to_mask < $length ){
my $chunk = substr( $seq, $length_to_mask + 3, $piece );
$count = $chunk =~ tr/Tt//;
$n_count = $chunk =~ tr/Nn//;
if ( ($count+$n_count) >= 2*( $piece )/3 ){ $length_to_mask +=3; }
else{
last;
}
}
if ( $length_to_mask >= 0 ){
my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 );
my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 );
if ( !( $last_base eq 'T' || $last_base eq 't' ) ){
$length_to_mask--;
}
elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){
$length_to_mask++;
}
my $clipped_seq = substr( $seq, $length_to_mask + 3);
my $mask;
if ( $self->_clip ){
$mask = "";
}
elsif( $self->_mask ){
$mask = "N" x ($length_to_mask+3);
}
elsif ($self->_softmask){
$mask = lc substr( $seq, 0, ($length_to_mask + 3) );
}
$new_seq = $mask.$clipped_seq;
}
else{
$new_seq = $seq;
}
}
else{
$new_seq = $seq;
}
return $new_seq;
}
} |
sub _mask
{ my ($self,$mask) = @_;
if (defined($mask)){
$self->{_mask} = $mask;
}
return $self->{_mask};
}
} |
sub _softmask
{ my ($self,$softmask) = @_;
if (defined($softmask)){
$self->{_softmask} = $softmask;
}
return $self->{_softmask};
}
} |
sub clip
{ my ($self, $bioseq) = @_;
my $seq = $bioseq->seq;
$self->_clip(1);
$self->_mask(0);
$self->_softmask(0);
my $new_seq = $self->_find_polyA($seq);
my $cdna = Bio::Seq->new();
if (length($new_seq) > 0){
$cdna->seq($new_seq);
}
else {
print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n";
print "Returning undef\n";
return undef;
}
$cdna->display_id( $bioseq->display_id );
$cdna->desc( $bioseq->desc );
return $cdna;
}
} |
sub has_polyA_track
{ my ($self, $seq) = @_;
my $new_seq;
my $length = length($seq);
my $check_polyT = substr( $seq, 0, 10 );
my $check_polyA = substr( $seq, -10 );
print STDERR "polyA: $check_polyA\n";
my $t_count = $check_polyT =~ tr/Tt//;
my $a_count = $check_polyA =~ tr/Aa//;
if ( $a_count >=7 || $t_count >=7 ){
return 1;
}
else{
return 0;
}
}
1; } |
sub mask
{ my ($self, $bioseq, $flag ) = @_;
my $seq = $bioseq->seq;
$self->_clip(0);
if ( $flag ){
$self->_mask(0);
$self->_softmask(1);
}
else{
$self->_mask(1);
$self->_softmask(0);
}
my $new_seq = $self->_find_polyA($seq);
my $cdna = new Bio::Seq;
$cdna->seq($new_seq);
$cdna->display_id( $bioseq->display_id );
$cdna->desc( $bioseq->desc );
return $cdna;
}
} |
sub new
{ my ($class) = @_;
if (ref($class)){
$class = ref($class);
}
my $self = {};
bless($self,$class);
return $self;
}
} |