Bio::EnsEMBL::Utils PolyA
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Utils::PolyA
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Root
Bio::Seq
Inherit
Bio::EnsEMBL::Root
Synopsis
  my $seq;    # a Bio::Seq object
my $polyA = Bio::EnsEMBL::Utils::PolyA->new();
# returns a new Bio::Seq object with the trimmed sequence my $trimmed_seq = $polyA->clip($seq); # cat put Ns in the place of the polyA/polyT tail my $masked_seq = $polyA->mask($seq); # can put in lower case the polyA/polyT using any flag: my $softmasked_seq = $poly->mask( $seq, 'soft' );
Description
  It reads a Bio::Seq object, it first finds out whether it has a
polyA or a polyT and then performs one operation in the seq string:
clipping, masking or softmasking. It then returns a new Bio::Seq
object with the new sequence.
Methods
_clip
No description
Code
_find_polyA
No description
Code
_mask
No description
Code
_softmask
No description
Code
clip
No description
Code
has_polyA_track
No description
Code
mask
No description
Code
newDescriptionCode
Methods description
newcode    nextTop
Methods code
_clipdescriptionprevnextTop
sub _clip {
  my ($self,$clip) = @_;
  if (defined($clip)){
    $self->{_clip} = $clip;
  }
  return $self->{_clip};
}

############################################################
}
_find_polyAdescriptionprevnextTop
sub _find_polyA {
  my ($self, $seq) = @_;
  my $new_seq;
  my $length = length($seq);
  
  # is it a polyA or polyT?
my $check_polyT = substr( $seq, 0, 6 ); my $check_polyA = substr( $seq, -6 ); my $t_count = $check_polyT =~ tr/Tt//; my $a_count = $check_polyA =~ tr/Aa//; #### polyA ####
if ( $a_count >= 5 && $a_count > $t_count ){ # we calculate the number of bases we want to chop
my $length_to_mask = 0; # we start with 3 bases
my ($piece, $count ) = (3,0); # count also the number of Ns, consider the Ns as potential As
my $n_count = 0; # take 3 by 3 bases from the end
while( $length_to_mask < $length ){ my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece); $count = $chunk =~ tr/Aa//; $n_count = $chunk =~ tr/Nn//; if ( ($count + $n_count) >= 2*( $piece )/3 ){
$length_to_mask += 3;
} else{ last; } } if ( $length_to_mask > 0 ){ # do not mask the last base if it is not an A:
my $last_base = substr( $seq, ( $length - $length_to_mask ), 1); my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1); if ( !( $last_base eq 'A' || $last_base eq 'a') ){ $length_to_mask--; } elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){ $length_to_mask++; } my $clipped_seq = substr( $seq, 0, $length - $length_to_mask ); my $mask; if ( $self->_clip ){ $mask = ""; } elsif( $self->_mask ){ $mask = "N" x ($length_to_mask); } elsif ( $self->_softmask ){ $mask = lc substr( $seq, ( $length - $length_to_mask ) ); } $new_seq = $clipped_seq . $mask; } else{ $new_seq = $seq; } } #### polyT ####
elsif( $t_count >=5 && $t_count > $a_count ){ # calculate the number of bases to chop
my $length_to_mask = -3; # we start with 3 bases:
my ($piece, $count) = (3,3); # count also the number of Ns, consider the Ns as potential As
my $n_count = 0; # take 3 by 3 bases from the beginning
while ( $length_to_mask < $length ){ my $chunk = substr( $seq, $length_to_mask + 3, $piece ); #print STDERR "length to mask: $length_to_mask\n";
#print "chunk: $chunk\n";
$count = $chunk =~ tr/Tt//; $n_count = $chunk =~ tr/Nn//; if ( ($count+$n_count) >= 2*( $piece )/3 ){
$length_to_mask +=3;
} else{ last; } } if ( $length_to_mask >= 0 ){ # do not chop the last base if it is not a A:
#print STDERR "clipping sequence $seq\n";
my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 ); my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 ); if ( !( $last_base eq 'T' || $last_base eq 't' ) ){ $length_to_mask--; } elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){ $length_to_mask++; } my $clipped_seq = substr( $seq, $length_to_mask + 3); my $mask; if ( $self->_clip ){ $mask = ""; } elsif( $self->_mask ){ $mask = "N" x ($length_to_mask+3); } elsif ($self->_softmask){ $mask = lc substr( $seq, 0, ($length_to_mask + 3) ); } $new_seq = $mask.$clipped_seq; } else{ $new_seq = $seq; } } else{ # we cannot be sure of what it is
# do not clip
$new_seq = $seq; } return $new_seq; } ############################################################
}
_maskdescriptionprevnextTop
sub _mask {
  my ($self,$mask) = @_;
  if (defined($mask)){
    $self->{_mask} = $mask;
  }
  return $self->{_mask};
}

############################################################
}
_softmaskdescriptionprevnextTop
sub _softmask {
  my ($self,$softmask) = @_;
  if (defined($softmask)){
    $self->{_softmask} = $softmask;
  }
  return $self->{_softmask};
}

############################################################
}
clipdescriptionprevnextTop
sub clip {
  my ($self, $bioseq) = @_;

  # print STDERR "past a $bioseq\n";
my $seq = $bioseq->seq; $self->_clip(1); $self->_mask(0); $self->_softmask(0); my $new_seq = $self->_find_polyA($seq); my $cdna = Bio::Seq->new(); if (length($new_seq) > 0){ $cdna->seq($new_seq); } else { print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n"; print "Returning undef\n"; return undef; } $cdna->display_id( $bioseq->display_id ); $cdna->desc( $bioseq->desc ); return $cdna; } ############################################################
}
has_polyA_trackdescriptionprevnextTop
sub has_polyA_track {
  my ($self, $seq) = @_;
  my $new_seq;
  my $length = length($seq);
  
  # is it a polyA or polyT?
my $check_polyT = substr( $seq, 0, 10 ); my $check_polyA = substr( $seq, -10 ); print STDERR "polyA: $check_polyA\n"; my $t_count = $check_polyT =~ tr/Tt//; my $a_count = $check_polyA =~ tr/Aa//; ## testing with this short cut
if ( $a_count >=7 || $t_count >=7 ){ return 1; } else{ return 0; } } ################
1;
}
maskdescriptionprevnextTop
sub mask {
  my ($self, $bioseq, $flag ) = @_;
  
  my $seq = $bioseq->seq;
  $self->_clip(0);
  if ( $flag ){
    $self->_mask(0);
    $self->_softmask(1);
  }
  else{
    $self->_mask(1);
    $self->_softmask(0);
  }
  my $new_seq = $self->_find_polyA($seq);
  my $cdna = new Bio::Seq;
  $cdna->seq($new_seq);
  $cdna->display_id( $bioseq->display_id );
  $cdna->desc( $bioseq->desc );
  
  return $cdna;
}

############################################################
############################################################
}
newdescriptionprevnextTop
sub new {
  my ($class) = @_;
   if (ref($class)){
    $class = ref($class);
  }
  my $self = {};
  bless($self,$class);
  
  return $self;
}


############################################################
}
General documentation
LICENSETop
  Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license. For license details, please see /info/about/code_licence.html
CONTACTTop
  Please email comments or questions to the public Ensembl
developers list at <ensembl-dev@ebi.ac.uk>.
Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>.