Bio::EnsEMBL::Analysis::Tools
FeatureFilter
Toolbar
Summary
Bio::EnsEMBL::Analysis::Tools::FeatureFilter
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
my $featurefilter = new Bio::EnsEMBL::Analysis::Tools::FeatureFilter
new->(
-min_score => 200,
-max_pvalue => 0.01,
-coverage => 5,
-prune => 1,
);
my @filtered_results = @{$featurefilter->filter_results(\@results)};
Description
This is the standard module used for filtering blast results
It filters on 4 separate things, It will throw out features whose score
is below a certain value or whose pvalue is above a certain value. Coverage
keeps all hit ids where at least of its features base pairs is covered by
less than its value in features (10 as standard). Prune and hard prune
function in the same manner but prune only considers sets of features with
the same hit id but hard prune considers all of them. The feature set
which is passed to prune_features is first split on the basis of strand
then each set of features on one strand are considered and the number of
features which cover each base pair in the range of basepairs covered is
counted. Then starting with the lowest scoring features and the highest
covered base pairs first a single feature is thrown out if at least one
of its base pairs covered the query sequence too much (this level is
also set by coverage)
here is an asci drawing which should hopefully explain. In this system
there are no min scores and coverage is set to 5 but nothing is thrown
away by the initial coverage filter as every feature has every feature has
at least on base pair whose coverage is below 5. The features with bits
scores of 50 and 100 though would be thrown away by prune as base 21 is
covered by 7 features and these are the lowest scoring of those 7 features
if normal prune was used these features would all need to share the same
hit id but if hard prune was used that would not be true
bit score
---------------- 50
--------------------- 100
--------- 250
--------------- 500
------------------- 750
----------- 1000
-------------------------- 1250
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
1 5 1 1 2 2 3
0 5 0 5 0
Methods
Methods description
Arg [1] : Bio::EnsEMBL::Analysis::Tools::FeatureFilter Arg [2] : arrayref of features, there is not type checking but it is expected that the features have start, end, score and hseqname methods (Bio::EnsEMBL::FeaturePairs is what it was written against) Function : filter the given features by score, pvalue coverage Returntype: arrayref Exceptions: throws if passed nothing or not an arrayref Example : |
Arg [1] : Bio::EnsEMBL::Analysis::Tools::FeatureFilter Arg [2] : int, minimum score Arg [3] : int, maximum p value Arg [4] : int, maximun coverage Arg [5] : int, toggle whether to prune features Arg [6] : int, toggle whether to use hard prune Arg [7] : int, toggle whether to filter on coverage or not Function : create a new FeatureFilter object Returntype: Bio::EnsEMBL::Analysis::Tools::FeatureFilter Exceptions: none Example : |
Arg [1] : Bio::EnsEMBL::Analysis::Tools::FeatureFilter Arg [2] : arrayref of features Function : calls to prune_features_by_strand with each strand Returntype: arrayref of features Exceptions: none Example : |
Arg [1] : Bio::EnsEMBL::Analysis::Tools::FeatureFilter Arg [2] : int, strand (must be 1 or -1) Arg [3] : arrayref of features Function : works out coverage of each base pair and throws out features of a base pair is covered by to many hits. The lowest scoring features are thrown out first Returntype: arrayref of features Exceptions: none Example : |
Methods code
sub coverage
{ my $self = shift;
$self->{'coverage'} = shift if(@_);
return $self->{'coverage'}; } |
sub filter_on_coverage
{ my $self = shift;
$self->{'filter_on_coverage'} = shift if(@_);
return $self->{'filter_on_coverage'};
}
} |
sub filter_results
{ my ($self, $features) = @_;
if(!$features || ref($features) ne 'ARRAY'){
throw("Must pass filter_results an arrayref not ".$features.
" FeatureFilter::filter_results");
}
my %validhit;
my %hitarray;
my %totalscore;
my $maxend = 0;
@$features = sort { $b->score <=> $a->score } @$features;
foreach my $f(@$features){
if($f->score > $self->min_score){
if(!exists $validhit{$f->hseqname}){
$validhit{$f->hseqname} = 0;
$totalscore{$f->hseqname} = 0;
}
$totalscore{$f->hseqname} += $f->score;
if(!$f->can('p_value') || !defined $f->p_value
|| $f->p_value < $self->max_pvalue){
if($validhit{$f->hseqname} < $f->score){
$validhit{$f->hseqname} = $f->score;
}
if($f->end > $maxend){
$maxend = $f->end;
}
}
}
if($validhit{$f->hseqname}){
if(!exists($hitarray{$f->hseqname})){
$hitarray{$f->hseqname} = [];
}
push(@{$hitarray{$f->hseqname}}, $f);
}
}
my $total_count = 0;
foreach my $id(keys(%hitarray)){
$total_count += @{$hitarray{$id}};
}
my @hit_ids = sort {$validhit{$b} <=> $validhit{$a}
or $totalscore{$b} <=> $totalscore{$a}
or $a cmp $b
} keys %validhit;
my %accepted_hit_ids;
if($self->filter_on_coverage){
my @strands = (1, -1);
foreach my $strand(@strands){
my @list;
$list[$maxend] = 0; NAME:foreach my $name(@hit_ids){
my $hole = 0;
FEATURE:foreach my $f(@{$hitarray{$name}}){
next FEATURE if $f->strand != $strand;
if( (($f->score > $self->min_score) && ($f->can('p_value'))
&& defined $f->p_value && $f->p_value < $self->max_pvalue) ){
INDEX:foreach my $i ( $f->start .. $f->end ) {
unless( $list[$i] ){
$list[$i] = 0;
}
if( $list[$i] < $self->coverage ) {
$hole = 1;
last;
}
}
}
}
if($hole == 0){
next ;
}
$accepted_hit_ids{$name} = 1;
foreach my $f ( @{$hitarray{$name}} ) {
if ($f->strand == $strand) {
for my $i ( $f->start .. $f->end ) {
$list[$i]++;
}
}
}
}
}
}else{
foreach my $name(keys(%hitarray)){
$accepted_hit_ids{$name} = 1;
}
}
$total_count=0;
foreach my $id(keys(%accepted_hit_ids)){
$total_count += @{$hitarray{$id}};
}
my @features;
foreach my $name(keys(%accepted_hit_ids)){
my @tmp = @{$hitarray{$name}};
my $remaining;
if($self->prune){
$remaining = $self->prune_features(\@tmp);
}else{
$remaining =\@ tmp;
}
push(@features, @$remaining);
}
if($self->hard_prune){ @features = @{$self->prune_features(\@features)};
}
return\@ features; } |
sub hard_prune
{ my $self = shift;
$self->{'hard_prune'} = shift if(@_);
return $self->{'hard_prune'}; } |
sub max_pvalue
{ my $self = shift;
$self->{'max_pvalue'} = shift if(@_);
return $self->{'max_pvalue'}; } |
sub min_score
{ my $self = shift;
$self->{'min_score'} = shift if(@_);
return $self->{'min_score'}; } |
sub new
{ my ($class,@args) = @_;
my $self = bless {},$class;
&verbose('WARNING');
my ($min_score, $max_pvalue, $coverage,
$prune, $hard_prune, $filter_on_coverage) =
rearrange(['MIN_SCORE', 'MAX_PVALUE', 'COVERAGE',
'PRUNE', 'HARD_PRUNE', 'FILTER_ON_COVERAGE'], @args);
$self->min_score(-100000);
$self->max_pvalue(0.1);
$self->coverage(10);
$self->prune(0);
$self->hard_prune(0);
$self->filter_on_coverage(1);
$self->min_score($min_score) if(defined $min_score);
$self->max_pvalue($max_pvalue) if(defined $max_pvalue);
$self->coverage($coverage) if(defined $coverage);
$self->prune($prune) if(defined $prune);
$self->hard_prune($hard_prune) if(defined $hard_prune);
$self->filter_on_coverage($filter_on_coverage)
if(defined($filter_on_coverage));
return $self;
}
} |
sub prune
{ my $self = shift;
$self->{'prune'} = shift if(@_);
return $self->{'prune'}; } |
sub prune_features
{ my ($self, $features) = @_;
my @output;
my $forward = $self->prune_features_by_strand(1, $features);
push(@output, @$forward) if($forward);
my $reverse = $self->prune_features_by_strand(-1, $features);
push(@output, @$reverse) if($reverse);
return\@ output; } |
sub prune_features_by_strand
{ my ($self, $strand, $in) = @_;
my @input_for_strand;
foreach my $f (@$in) {
push @input_for_strand, $f if $f->strand eq $strand;
}
return () if !@input_for_strand;
my @sorted_fs = sort{ $a->start <=> $b->start } @input_for_strand;
my $first_base = $sorted_fs[0]->start;
@sorted_fs = sort{ $a->end <=> $b->end } @input_for_strand;
my $last_base = $sorted_fs[$#sorted_fs]->end;
my @fs_per_base;
foreach my $base ($first_base..$last_base) {
$fs_per_base[$base] = 0; }
foreach my $f (@input_for_strand) {
foreach my $covered_base ($f->start..$f->end) {
$fs_per_base[$covered_base]++;
}
}
@sorted_fs = sort { $a->score <=> $b->score } @input_for_strand;
@input_for_strand = ();
my $max_coverage = $self->coverage;
my @over_covered_bases;
foreach my $base ($first_base..$last_base) {
my $excess_fs = $fs_per_base[$base] - $max_coverage;
if ($excess_fs > 0) {
push @over_covered_bases, $base;
}
}
@over_covered_bases = sort { $fs_per_base[$b] <=> $fs_per_base[$a] }
@over_covered_bases;
foreach my $base (@over_covered_bases) {
my $f_no = 0;
while ($fs_per_base[$base] > $max_coverage) {
my $start = $sorted_fs[$f_no]->start;
my $end = $sorted_fs[$f_no]->end;
if ($start <= $base and $end >= $base) { splice @sorted_fs, $f_no, 1; foreach my $was_covered ($start..$end) {
$fs_per_base[$was_covered]--;
}
} else { $f_no++;
}
}
}
return\@ sorted_fs;
}
1; } |
General documentation
Arg [1] : Bio::EnsEMBL::Analysis::Tools::FeatureFilter
Arg [2] : variable, generally int or string
Function : This describes the 6 container methods below
min_score, max_pvalue, coverage, prune, hard_prune and
filter on coverage. The all take, store and return their give
variable
Returntype: int/string
Exceptions: none
Example : none