Bio::EnsEMBL::Analysis::Tools::Finished
BPliteWrapper
Toolbar
Summary
Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
no synopsis
Description
This module is a wrapper for Tools::BPliteWrapper to provide an interface between
Bio::EnsEMBL::Analysis::Runnable::Finished::Blast and BPlite. This method fits model
for standard blast parsers as it provides the parse_file method which
returns an array of results. This method uses BPliteWrapper to parse the
file and contains methods to do filtering which is called by the Runnable::Finished::Blast
It contains the actual methods to create FeaturePairs from HSPs after
doing any depth filtering to save time and memory when searching genomic sequences that generate
large numbers of blast matches.
Methods
Methods description
Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : integer, coverage Function : container Returntype: integer Exceptions: throws if string is not >= 0 or <= 255 Example : |
Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : string, discard_overlap Function : container Returntype: string Example : |
Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : string, filename Function : opens file using Filehandle and passes filehandle to BPlite Returntype: Bio::EnsEMBL::Analysis::Tools::BPlite Exceptions: none Example : |
Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [FILENAME] : string, filename Arg [REGEX] : string, regex Arg [QUERY_TYPE] : string, query sequence type, should be pep or dna Arg [DATABASE_TYPE] : string, database sequence type as above Arg [ANALYSIS] : Bio::EnsEMBL::Analysis object Arg [COVERAGE] : integer, coverage , defined in config file Arg [THRESHOLD_TYPE] : string, threshold_type defined in config file Arg [THRESHOLD] : integer, threshold defined in config file Arg [DISCARD_OVERLAPS]: flag, 1 or 0 defined in config file Function : Returntype: Exceptions: Example : |
Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : string filename Function : using BPlite to parse the blast output Returntype: arrayref Exceptions: throws if file does not exist Example : |
Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : integer, threshold Function : container Returntype: integer Example : |
Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : string, threshold type Function : container Returntype: string Exceptions: throws if string is not either PVALUE or PID Example : |
Methods code
_apply_coverage_filter | description | prev | next | Top |
sub _apply_coverage_filter
{
my ( $self, $query_length, $best_hits,$threshold_type,$threshold,$coverage,$discard_overlaps ) = @_;
my %id_list;
my @output;
my $max_coverage = defined($coverage) ? $coverage : 0;
$discard_overlaps = $discard_overlaps || 0;
throw("Max coverage '$max_coverage' is beyond limit of method '255'") if $max_coverage > 255;
my $coverage_map = "\0" x ( $query_length + 1 );
my $ana = $self->analysis;
my (@bin_numbers);
if ( $threshold_type eq 'PID' ) {
@bin_numbers = sort { $b <=> $a } keys %$best_hits;
}
elsif ( $threshold_type eq 'PVALUE' ) {
@bin_numbers = sort { $a <=> $b } keys %$best_hits;
}
else {
throw("Unknown threshold type '$threshold_type'");
}
foreach my $bin_n (@bin_numbers) {
my $score_hits = $best_hits->{$bin_n};
foreach my $top_score ( sort { $b <=> $a } keys %$score_hits ) {
my $hits = $score_hits->{$top_score};
foreach my $hit (@$hits) {
my $keep_hit = 0;
my ( $name, @hsps ) = @$hit;
my $ct=scalar(@hsps);
@hsps = $self->_discard_worst_overlapping(@hsps) if $discard_overlaps;
unless ($max_coverage == 0){
foreach my $hsp (@hsps) {
my $q = $hsp->query;
foreach my $i ( $q->start .. $q->end ) {
my $depth = unpack( 'C', substr( $coverage_map, $i, 1 ) );
if ( $depth < $max_coverage ) {
$keep_hit = 1;
$depth++;
substr( $coverage_map, $i, 1 ) = pack( 'C', $depth );
}
}
}
}
if ($keep_hit || $max_coverage == 0) {
foreach my $hsp (@hsps) {
push (@output,$self->split_hsp( $hsp, $name, $ana ));
}
}
}
}
}
print "*** FINISHED PARSING ***\n";
$self->output(\@output); } |
sub _discard_worst_overlapping
{
my $self = shift;
my @hsps = sort { $a->query->start <=> $b->query->start } @_;
my $first = shift @hsps;
my $start = $first->start;
my $end = $first->end;
my $current = [$first]; my @bins = ($current);
while ( my $hsp = shift @hsps ) {
my $q = $hsp->query;
if ( $q->start <= $end and $q->end >= $start ) {
push ( @$current, $hsp );
if ( $q->end > $end ) {
$end = $q->end;
}
}
else {
$current = [$hsp];
push ( @bins, $current );
$start = $hsp->start;
$end = $hsp->end;
}
}
foreach my $bin (@bins) {
if ( @$bin == 1 ) {
push ( @hsps, $bin->[0] );
}
else {
my @bin_hsps = sort { $a->percent <=> $b->percent || $a->score <=> $b->score || $b->P <=> $a->P } @$bin;
shift (@bin_hsps);
if ( @bin_hsps == 1 ) {
push ( @hsps, $bin_hsps[0] );
}
else {
push ( @hsps, $self->_discard_worst_overlapping(@bin_hsps) );
}
}
}
return @hsps;
}
1; } |
sub coverage
{ my ($self, $coverage) = @_;
if ( $coverage ) {
throw ("Coverage set should be between 0 and 255") unless ($coverage >= 0 || $coverage <= 255 );
$self->{'coverage'} = $coverage;
}
return $self->{'coverage'}; } |
sub discard_overlaps
{ my ($self, $dtype) = @_;
if ( $dtype ) {
$self->{'discard_overlaps'} = $dtype;
}
return $self->{'discard_overlaps'}; } |
sub get_best_hits
{
my ($self, $parsers,$thresh_type,$threshold) = @_;
my $regex = $self->regex;
my $best_hits = {};
PARSER:
foreach my $parser(@$parsers){
HIT:
while(my $sbjct = $parser->nextSbjct){
my ($name) = $sbjct->name =~ /$regex/;
throw("Error parsing name from ".$sbjct->name." check your ".
"blast setup and blast headers") unless($name);
my $hsps=[];
my $above_thresh=0;
my $best_value=1e6;
my $top_score = 0;
HSP:
while (my $hsp = $sbjct->nextHSP) {
push(@$hsps, $hsp);
my $score=$hsp->score;
my $pc=$hsp->percent;
my $p =$hsp->P;
if ( $thresh_type eq 'PVALUE' ) {
if ( $p <= $threshold ) {
$above_thresh = 1;
}
if ( $p < $best_value ) {
$best_value = $p;
}
}
elsif ( $thresh_type eq 'PID' ) {
$best_value = 0;
if ( $pc >= $threshold ) {
$above_thresh = 1;
}
if ( $pc > $best_value ) {
$best_value = $pc;
}
}
else {
throw("Unknown threshold type '$thresh_type'");
}
if ( $score > $top_score ) {
$top_score = $score;
}
}
next unless $above_thresh;
my $best = $best_hits->{$best_value}{$top_score} ||= [];
unshift ( @$hsps, $name );
push ( @$best, $hsps );
}
}
$parsers = [];
return $best_hits; } |
sub get_parsers
{ my ($self, $files) = @_;
if(!$files){
$files = $self->filenames;
}
my @parsers;
foreach my $file (@$files) {
my $fh = new FileHandle;
$fh->open("<".$file);
my $parser = Bio::EnsEMBL::Analysis::Tools::BPlite->new('-fh' => $fh);
push(@parsers,$parser);
}
return\@ parsers; } |
sub new
{ my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
&verbose('WARNING');
my ($regex, $query, $target, $analysis,$coverage,$threshold_type,$threshold,$discard_overlaps) =
rearrange(['REGEX', 'QUERY_TYPE', 'DATABASE_TYPE',
'ANALYSIS','COVERAGE','THRESHOLD_TYPE','THRESHOLD','DISCARD_OVERLAPS'], @args);
$self->regex('(\w+)\s+');
$self->regex($regex) if(defined $regex);
$self->query_type($query) if($query);
$self->database_type($target) if($target);
$self->analysis($analysis);
$self->threshold_type($threshold_type);
$self->threshold($threshold);
$self->coverage($coverage);
$self->discard_overlaps($discard_overlaps);
return $self; } |
sub parse_files
{
my ($self, $files) = @_;
$self->clean_output;
$self->clean_filenames;
$self->filenames($files);
my $bplites = $self->get_parsers($files);
return $bplites; } |
sub threshold
{ my ($self, $tvalue) = @_;
if ( $tvalue ) {
$self->{'threshold'} = $tvalue;
}
return $self->{'threshold'}; } |
sub threshold_type
{
my ($self, $ttype) = @_;
if ( $ttype ) {
throw ("Threshold_type should be either PVALUE or PID") unless ($ttype eq 'PVALUE' || $ttype eq 'PID');
$self->{'threshold_type'} = $ttype;
}
return $self->{'threshold_type'}; } |
General documentation
Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper
Arg [2] : Bio::EnsEMBL::Analysis::Tools::BPlite
Function : get the hsps from bplite and return the best hits
Returntype: none
Exceptions: throws if regex does not produce a result
Example :