Raw content of Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper # Ensembl module for Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper # # Copyright (c) 2004 Ensembl # =head1 NAME Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper =head1 SYNOPSIS no synopsis =head1 DESCRIPTION This module is a wrapper for Tools::BPliteWrapper to provide an interface between Bio::EnsEMBL::Analysis::Runnable::Finished::Blast and BPlite. This method fits model for standard blast parsers as it provides the parse_file method which returns an array of results. This method uses BPliteWrapper to parse the file and contains methods to do filtering which is called by the Runnable::Finished::Blast It contains the actual methods to create FeaturePairs from HSPs after doing any depth filtering to save time and memory when searching genomic sequences that generate large numbers of blast matches. =head1 CONTACT Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk =cut package Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper; use strict; use warnings; use FileHandle; use Bio::EnsEMBL::Root; use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use Bio::EnsEMBL::Analysis::Tools::FeatureFactory; use Bio::EnsEMBL::DnaDnaAlignFeature; use Bio::EnsEMBL::DnaPepAlignFeature; use Bio::EnsEMBL::PepDnaAlignFeature; use base 'Bio::EnsEMBL::Analysis::Tools::BPliteWrapper'; =head2 new Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [FILENAME] : string, filename Arg [REGEX] : string, regex Arg [QUERY_TYPE] : string, query sequence type, should be pep or dna Arg [DATABASE_TYPE] : string, database sequence type as above Arg [ANALYSIS] : Bio::EnsEMBL::Analysis object Arg [COVERAGE] : integer, coverage , defined in config file Arg [THRESHOLD_TYPE] : string, threshold_type defined in config file Arg [THRESHOLD] : integer, threshold defined in config file Arg [DISCARD_OVERLAPS]: flag, 1 or 0 defined in config file Function : Returntype: Exceptions: Example : =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); &verbose('WARNING'); my ($regex, $query, $target, $analysis,$coverage,$threshold_type,$threshold,$discard_overlaps) = rearrange(['REGEX', 'QUERY_TYPE', 'DATABASE_TYPE', 'ANALYSIS','COVERAGE','THRESHOLD_TYPE','THRESHOLD','DISCARD_OVERLAPS'], @args); ###################### #SETTING THE DEFAULTS# ###################### $self->regex('(\w+)\s+'); ###################### $self->regex($regex) if(defined $regex); $self->query_type($query) if($query); $self->database_type($target) if($target); $self->analysis($analysis); $self->threshold_type($threshold_type); $self->threshold($threshold); $self->coverage($coverage); $self->discard_overlaps($discard_overlaps); return $self; } =head2 threshold_type Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : string, threshold type Function : container Returntype: string Exceptions: throws if string is not either PVALUE or PID Example : =cut sub threshold_type{ my ($self, $ttype) = @_; if ( $ttype ) { throw ("Threshold_type should be either PVALUE or PID") unless ($ttype eq 'PVALUE' || $ttype eq 'PID'); $self->{'threshold_type'} = $ttype; } return $self->{'threshold_type'}; } =head2 threshold Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : integer, threshold Function : container Returntype: integer Example : =cut sub threshold{ my ($self, $tvalue) = @_; if ( $tvalue ) { $self->{'threshold'} = $tvalue; } return $self->{'threshold'}; } =head2 coverage Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : integer, coverage Function : container Returntype: integer Exceptions: throws if string is not >= 0 or <= 255 Example : =cut sub coverage{ my ($self, $coverage) = @_; if ( $coverage ) { throw ("Coverage set should be between 0 and 255") unless ($coverage >= 0 || $coverage <= 255 ); $self->{'coverage'} = $coverage; } return $self->{'coverage'}; } =head2 discard_overlaps Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : string, discard_overlap Function : container Returntype: string Example : =cut sub discard_overlaps{ my ($self, $dtype) = @_; if ( $dtype ) { $self->{'discard_overlaps'} = $dtype; } return $self->{'discard_overlaps'}; } =head2 parse_files Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : string filename Function : using BPlite to parse the blast output Returntype: arrayref Exceptions: throws if file does not exist Example : =cut sub parse_files{ my ($self, $files) = @_; $self->clean_output; $self->clean_filenames; $self->filenames($files); my $bplites = $self->get_parsers($files); return $bplites; } =head2 get_parsers Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : string, filename Function : opens file using Filehandle and passes filehandle to BPlite Returntype: Bio::EnsEMBL::Analysis::Tools::BPlite Exceptions: none Example : =cut sub get_parsers { my ($self, $files) = @_; if(!$files){ $files = $self->filenames; } my @parsers; foreach my $file (@$files) { my $fh = new FileHandle; $fh->open("<".$file); my $parser = Bio::EnsEMBL::Analysis::Tools::BPlite->new('-fh' => $fh); push(@parsers,$parser); } return \@parsers; } =head2 get_hsps Arg [1] : Bio::EnsEMBL::Analysis::Tools::Finished::BPliteWrapper Arg [2] : Bio::EnsEMBL::Analysis::Tools::BPlite Function : get the hsps from bplite and return the best hits Returntype: none Exceptions: throws if regex does not produce a result Example : =cut sub get_best_hits{ my ($self, $parsers,$thresh_type,$threshold) = @_; my $regex = $self->regex; my $best_hits = {}; PARSER: foreach my $parser(@$parsers){ HIT: while(my $sbjct = $parser->nextSbjct){ my ($name) = $sbjct->name =~ /$regex/; throw("Error parsing name from ".$sbjct->name." check your ". "blast setup and blast headers") unless($name); my $hsps=[]; my $above_thresh=0; my $best_value=1e6; my $top_score = 0; HSP: while (my $hsp = $sbjct->nextHSP) { push(@$hsps, $hsp); my $score=$hsp->score; my $pc=$hsp->percent; my $p =$hsp->P; if ( $thresh_type eq 'PVALUE' ) { if ( $p <= $threshold ) { $above_thresh = 1; } if ( $p < $best_value ) { $best_value = $p; } } elsif ( $thresh_type eq 'PID' ) { $best_value = 0; if ( $pc >= $threshold ) { $above_thresh = 1; } if ( $pc > $best_value ) { $best_value = $pc; } } else { throw("Unknown threshold type '$thresh_type'"); } #get top_score if ( $score > $top_score ) { $top_score = $score; } } next unless $above_thresh; my $best = $best_hits->{$best_value}{$top_score} ||= []; #Put as first element of hsps array unshift ( @$hsps, $name ); push ( @$best, $hsps ); }#while next->sbjt }#foreach parser $parsers = []; return $best_hits; } sub _apply_coverage_filter { my ( $self, $query_length, $best_hits,$threshold_type,$threshold,$coverage,$discard_overlaps ) = @_; my %id_list; my @output; my $max_coverage = defined($coverage) ? $coverage : 0; $discard_overlaps = $discard_overlaps || 0; throw("Max coverage '$max_coverage' is beyond limit of method '255'") if $max_coverage > 255; # Make a string of nulls (zeroes) the length of the query my $coverage_map = "\0" x ( $query_length + 1 ); my $ana = $self->analysis; # Loop through from best to worst according # to our threshold type. my (@bin_numbers); if ( $threshold_type eq 'PID' ) { @bin_numbers = sort { $b <=> $a } keys %$best_hits; } elsif ( $threshold_type eq 'PVALUE' ) { @bin_numbers = sort { $a <=> $b } keys %$best_hits; } else { throw("Unknown threshold type '$threshold_type'"); } foreach my $bin_n (@bin_numbers) { my $score_hits = $best_hits->{$bin_n}; # Loop through from best to worst according # to score within threshold bin foreach my $top_score ( sort { $b <=> $a } keys %$score_hits ) { my $hits = $score_hits->{$top_score}; # Loop through each hit with this score foreach my $hit (@$hits) { my $keep_hit = 0; my ( $name, @hsps ) = @$hit; my $ct=scalar(@hsps); # Don't keep multiple matches to the same sequence # at the same genomic location. @hsps = $self->_discard_worst_overlapping(@hsps) if $discard_overlaps; unless ($max_coverage == 0){ foreach my $hsp (@hsps) { my $q = $hsp->query; foreach my $i ( $q->start .. $q->end ) { my $depth = unpack( 'C', substr( $coverage_map, $i, 1 ) ); if ( $depth < $max_coverage ) { $keep_hit = 1; $depth++; # Increment depth at this position in the map substr( $coverage_map, $i, 1 ) = pack( 'C', $depth ); } } } } # Make FeaturePairs if we want to keep this hit if ($keep_hit || $max_coverage == 0) { foreach my $hsp (@hsps) { push (@output,$self->split_hsp( $hsp, $name, $ana )); } } } } } print "*** FINISHED PARSING ***\n"; $self->output(\@output); } sub _discard_worst_overlapping { my $self = shift; my @hsps = sort { $a->query->start <=> $b->query->start } @_; # Put all the hsps hits into overlapping bins my $first = shift @hsps; my $start = $first->start; my $end = $first->end; my $current = [$first]; # The current bin my @bins = ($current); while ( my $hsp = shift @hsps ) { my $q = $hsp->query; # Does this hsp overlap the current bin? # compare the first hsp with the second hsp, if both overlap then the current array will have both the first hsp and the second hsp if ( $q->start <= $end and $q->end >= $start ) { push ( @$current, $hsp ); if ( $q->end > $end ) { $end = $q->end; } } else { $current = [$hsp]; push ( @bins, $current ); $start = $hsp->start; $end = $hsp->end; } } foreach my $bin (@bins) { if ( @$bin == 1 ) { push ( @hsps, $bin->[0] ); } else { # Remove the hsp with the lowest percent identity # (or lowest score or highest P value) my @bin_hsps = sort { $a->percent <=> $b->percent || $a->score <=> $b->score || $b->P <=> $a->P } @$bin; shift (@bin_hsps); if ( @bin_hsps == 1 ) { # We are left with 1 hsp, so add it. push ( @hsps, $bin_hsps[0] ); } else { # Remaining hsps may not be overlapping push ( @hsps, $self->_discard_worst_overlapping(@bin_hsps) ); } } } return @hsps; } 1;