Raw content of Bio::EnsEMBL::Utils::PolyA =head1 LICENSE Copyright (c) 1999-2009 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see /info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <ensembl-dev@ebi.ac.uk>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =cut =head1 NAME Bio::EnsEMBL::Utils::PolyA =head1 SYNOPSIS my $seq; # a Bio::Seq object my $polyA = Bio::EnsEMBL::Utils::PolyA->new(); # returns a new Bio::Seq object with the trimmed sequence my $trimmed_seq = $polyA->clip($seq); # cat put Ns in the place of the polyA/polyT tail my $masked_seq = $polyA->mask($seq); # can put in lower case the polyA/polyT using any flag: my $softmasked_seq = $poly->mask( $seq, 'soft' ); =head1 DESCRIPTION It reads a Bio::Seq object, it first finds out whether it has a polyA or a polyT and then performs one operation in the seq string: clipping, masking or softmasking. It then returns a new Bio::Seq object with the new sequence. =head1 METHODS =cut package Bio::EnsEMBL::Utils::PolyA; use Bio::EnsEMBL::Root; use Bio::Seq; use vars qw(@ISA); use strict; @ISA = qw(Bio::EnsEMBL::Root); =head2 new =cut sub new{ my ($class) = @_; if (ref($class)){ $class = ref($class); } my $self = {}; bless($self,$class); return $self; } ############################################################ sub clip{ my ($self, $bioseq) = @_; # print STDERR "past a $bioseq\n"; my $seq = $bioseq->seq; $self->_clip(1); $self->_mask(0); $self->_softmask(0); my $new_seq = $self->_find_polyA($seq); my $cdna = Bio::Seq->new(); if (length($new_seq) > 0){ $cdna->seq($new_seq); } else { print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n"; print "Returning undef\n"; return undef; } $cdna->display_id( $bioseq->display_id ); $cdna->desc( $bioseq->desc ); return $cdna; } ############################################################ sub mask{ my ($self, $bioseq, $flag ) = @_; my $seq = $bioseq->seq; $self->_clip(0); if ( $flag ){ $self->_mask(0); $self->_softmask(1); } else{ $self->_mask(1); $self->_softmask(0); } my $new_seq = $self->_find_polyA($seq); my $cdna = new Bio::Seq; $cdna->seq($new_seq); $cdna->display_id( $bioseq->display_id ); $cdna->desc( $bioseq->desc ); return $cdna; } ############################################################ ############################################################ sub _find_polyA{ my ($self, $seq) = @_; my $new_seq; my $length = length($seq); # is it a polyA or polyT? my $check_polyT = substr( $seq, 0, 6 ); my $check_polyA = substr( $seq, -6 ); my $t_count = $check_polyT =~ tr/Tt//; my $a_count = $check_polyA =~ tr/Aa//; #### polyA #### if ( $a_count >= 5 && $a_count > $t_count ){ # we calculate the number of bases we want to chop my $length_to_mask = 0; # we start with 3 bases my ($piece, $count ) = (3,0); # count also the number of Ns, consider the Ns as potential As my $n_count = 0; # take 3 by 3 bases from the end while( $length_to_mask < $length ){ my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece); $count = $chunk =~ tr/Aa//; $n_count = $chunk =~ tr/Nn//; if ( ($count + $n_count) >= 2*( $piece )/3 ){ $length_to_mask += 3; } else{ last; } } if ( $length_to_mask > 0 ){ # do not mask the last base if it is not an A: my $last_base = substr( $seq, ( $length - $length_to_mask ), 1); my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1); if ( !( $last_base eq 'A' || $last_base eq 'a') ){ $length_to_mask--; } elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){ $length_to_mask++; } my $clipped_seq = substr( $seq, 0, $length - $length_to_mask ); my $mask; if ( $self->_clip ){ $mask = ""; } elsif( $self->_mask ){ $mask = "N" x ($length_to_mask); } elsif ( $self->_softmask ){ $mask = lc substr( $seq, ( $length - $length_to_mask ) ); } $new_seq = $clipped_seq . $mask; } else{ $new_seq = $seq; } } #### polyT #### elsif( $t_count >=5 && $t_count > $a_count ){ # calculate the number of bases to chop my $length_to_mask = -3; # we start with 3 bases: my ($piece, $count) = (3,3); # count also the number of Ns, consider the Ns as potential As my $n_count = 0; # take 3 by 3 bases from the beginning while ( $length_to_mask < $length ){ my $chunk = substr( $seq, $length_to_mask + 3, $piece ); #print STDERR "length to mask: $length_to_mask\n"; #print "chunk: $chunk\n"; $count = $chunk =~ tr/Tt//; $n_count = $chunk =~ tr/Nn//; if ( ($count+$n_count) >= 2*( $piece )/3 ){ $length_to_mask +=3; } else{ last; } } if ( $length_to_mask >= 0 ){ # do not chop the last base if it is not a A: #print STDERR "clipping sequence $seq\n"; my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 ); my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 ); if ( !( $last_base eq 'T' || $last_base eq 't' ) ){ $length_to_mask--; } elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){ $length_to_mask++; } my $clipped_seq = substr( $seq, $length_to_mask + 3); my $mask; if ( $self->_clip ){ $mask = ""; } elsif( $self->_mask ){ $mask = "N" x ($length_to_mask+3); } elsif ($self->_softmask){ $mask = lc substr( $seq, 0, ($length_to_mask + 3) ); } $new_seq = $mask.$clipped_seq; } else{ $new_seq = $seq; } } else{ # we cannot be sure of what it is # do not clip $new_seq = $seq; } return $new_seq; } ############################################################ sub _mask{ my ($self,$mask) = @_; if (defined($mask)){ $self->{_mask} = $mask; } return $self->{_mask}; } ############################################################ sub _clip{ my ($self,$clip) = @_; if (defined($clip)){ $self->{_clip} = $clip; } return $self->{_clip}; } ############################################################ sub _softmask{ my ($self,$softmask) = @_; if (defined($softmask)){ $self->{_softmask} = $softmask; } return $self->{_softmask}; } ############################################################ sub has_polyA_track{ my ($self, $seq) = @_; my $new_seq; my $length = length($seq); # is it a polyA or polyT? my $check_polyT = substr( $seq, 0, 10 ); my $check_polyA = substr( $seq, -10 ); print STDERR "polyA: $check_polyA\n"; my $t_count = $check_polyT =~ tr/Tt//; my $a_count = $check_polyA =~ tr/Aa//; ## testing with this short cut if ( $a_count >=7 || $t_count >=7 ){ return 1; } else{ return 0; } } ################ 1;