Raw content of Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
# Ensembl module for Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
#
# Copyright (c) 2004 Ensembl
#
=head1 NAME
Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
=head1 SYNOPSIS
my $parser = Bio::EnsEMBL::Analysis::Tools::BPliteWrapper->
new(
-regex => '^\w+\s+(\w+)'
-query_type => 'dna',
-database_type => 'pep',
);
my @results = @{$parser->parse_results('blast.out')};
=head1 DESCRIPTION
This module is a wrapper for BPlite to provide an interface between
Bio::EnsEMBL::Analysis::Runnable::Blast and BPlite. This method fits model
for standard blast parsers as it provides the parse_file method which
returns an array of results. This method just uses BPlite to parse the
file it does no pre or post filtering and as such will not mimic the
behaviour or the current blast runnable in the pipeline code
=head1 CONTACT
Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::Tools::BPliteWrapper;
use strict;
use warnings;
use FileHandle;
use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Analysis::Tools::FeatureFactory;
use Bio::EnsEMBL::Analysis::Tools::BPlite;
use Bio::EnsEMBL::DnaDnaAlignFeature;
use Bio::EnsEMBL::DnaPepAlignFeature;
use Bio::EnsEMBL::PepDnaAlignFeature;
use vars qw (@ISA);
@ISA = qw();
=head2 new
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [FILENAME] : string, filename
Arg [REGEX] : string, regex
Arg [QUERY_TYPE] : string, query sequence type, should be pep or dna
Arg [DATABASE_TYPE] : string, database sequence type as above
Arg [ANALYSIS] : Bio::EnsEMBL::Analysis object
Function :
Returntype:
Exceptions:
Example :
=cut
sub new{
my ($caller,@args) = @_;
my $class = ref($caller) || $caller;
my $self = bless({}, $class);
&verbose('WARNING');
my ($regex, $query, $target, $analysis) =
rearrange(['REGEX', 'QUERY_TYPE', 'DATABASE_TYPE',
'ANALYSIS'], @args);
######################
#SETTING THE DEFAULTS#
######################
$self->regex('(\w+)\s+');
######################
$self->regex($regex) if(defined $regex);
$self->query_type($query) if($query);
$self->database_type($target) if($target);
$self->analysis($analysis);
return $self;
}
=head2 regex
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : string, regex
Function : container
Returntype: string, regex
Exceptions:
Example :
=cut
sub regex{
my $self = shift;
$self->{'regex'} = shift if(@_);
return $self->{'regex'};
}
sub filenames{
my ($self, $files) = @_;
if(!$self->{'filenames'}){
$self->{'filenames'} = [];
}
if($files){
throw($files." must be an arrayref BPliteWrapper:filenames ")
unless(ref($files) eq 'ARRAY');
push(@{$self->{'filenames'}}, @{$files});
}
return $self->{'filenames'};
}
=head2 query_type
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : string, query sequence type
Function : container
Returntype: string
Exceptions: throws if string is not either dna or pep
Example :
=cut
sub query_type{
my ($self, $dtype) = @_;
if($dtype){
$dtype = lc($dtype);
throw("Query type must be either dna or pep not ".$dtype)
unless($dtype eq 'pep' || $dtype eq 'dna');
$self->{'query_type'} = $dtype;
}
return $self->{'query_type'};
}
=head2 database_type
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : string, database sequence type
Function : container
Returntype: string
Exceptions: throws if string is not either dna or pep
Example :
=cut
sub database_type{
my ($self, $dtype) = @_;
if($dtype){
$dtype = lc($dtype);
throw("Database type must be either dna or pep not ".$dtype)
unless($dtype eq 'pep' || $dtype eq 'dna');
$self->{'database_type'} = $dtype;
}
return $self->{'database_type'};
}
=head2 output
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : arrayref of output features
Function : store the output features in an array
Returntype: arrayref
Exceptions: throws if not passed an arrayref
Example :
=cut
sub output{
my ($self, $output) = @_;
if(!$self->{'output'}){
$self->{'output'} = [];
}
if($output){
throw("Must pass and arrayref not a ".$output." BPliteWrapper:output")
unless(ref($output) eq 'ARRAY');
push(@{$self->{'output'}}, @$output);
}
return $self->{'output'};
}
=head2 clean_output
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Function : empties the internal output array
Returntype: none
Exceptions: none
Example :
=cut
sub clean_output{
my ($self) = @_;
$self->{'output'} = [];
}
sub clean_filenames{
my ($self) = @_;
$self->{'filenames'} = [];
}
=head2 feature_factory
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : Bio::EnsEMBL::Analysis::Tools::FeatureFactory
Function : container for feature factory creates one if one is requested
but one does not currently exist
Returntype: Bio::EnsEMBL::Analysis::Tools::FeatureFactory
Exceptions:
Example :
=cut
sub feature_factory{
my ($self, $feature_factory) = @_;
if($feature_factory){
$self->{'feature_factory'} = $feature_factory;
}
if(!$self->{'feature_factory'}){
$self->{'feature_factory'} =
Bio::EnsEMBL::Analysis::Tools::FeatureFactory->new();
}
return $self->{'feature_factory'};
}
=head2 analysis
Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB
Arg [2] : Bio::EnsEMBL::Analysis
Function : container for analysis object
Returntype: Bio::EnsEMBL::Analysis
Exceptions: throws passed incorrect object type
Example :
=cut
sub analysis{
my $self = shift;
my $analysis = shift;
if($analysis){
throw("Must pass RunnableDB:analysis a Bio::EnsEMBL::Analysis".
"not a ".$analysis) unless($analysis->isa
('Bio::EnsEMBL::Analysis'));
$self->{'analysis'} = $analysis;
}
return $self->{'analysis'};
}
=head2 parse_file
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : string filename
Function : using BPlite to parse the blast output
Returntype: arrayref
Exceptions: throws if file does not exist
Example :
=cut
sub parse_files{
my ($self, $files) = @_;
$self->clean_output;
$self->clean_filenames;
$self->filenames($files);
my $bplites = $self->get_parsers($files);
$self->get_hsps($bplites);
return $self->output;
}
=head2 get_parser
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : string, filename
Function : opens file using Filehandle and passes filehandle to BPlite
Returntype: Bio::EnsEMBL::Analysis::Tools::BPlite
Exceptions: none
Example :
=cut
sub get_parsers {
my ($self, $files) = @_;
if(!$files){
$files = $self->filenames;
}
my @parsers;
foreach my $file (@$files) {
my $fh = new FileHandle;
$fh->open("<".$file);
my $parser = Bio::EnsEMBL::Analysis::Tools::BPlite->new('-fh' => $fh);
push(@parsers,$parser);
}
return \@parsers;
}
=head2 get_hsps
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : Bio::EnsEMBL::Analysis::Tools::BPlite
Function : get the hsps from bplite and turn then into features
Returntype: none
Exceptions: throws if regex does not produce a result
Example :
=cut
sub get_hsps{
my ($self, $parsers) = @_;
my $regex = $self->regex;
my @output;
PARSER:foreach my $parser(@$parsers){
NAME: while(my $sbjct = $parser->nextSbjct){
my ($name) = $sbjct->name =~ /$regex/;
throw("Error parsing name from ".$sbjct->name." check your ".
"blast setup and blast headers") unless($name);
HSP: while (my $hsp = $sbjct->nextHSP) {
push(@output, $self->split_hsp($hsp, $name));
}
}
}
$parsers = [];
$self->output(\@output);
}
=head2 split_hsp
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : Bio::EnsEMBL::Analysis::Tools::BPlite::HSP
Function : turn hsp into alignfeature
Returntype: Bio::EnsEMBL::BaseAlignFeature
Exceptions:
Example :
=cut
sub split_hsp {
my ($self,$hsp,$name) = @_;
my $qstrand = $hsp->query->strand;
my $hstrand = $hsp->subject->strand;
my ($qinc, $hinc) = $self->find_increments($qstrand,$hstrand);
my @qchars = split(//,$hsp->querySeq); # split alignment into array of
# chars
my @hchars = split(//,$hsp->sbjctSeq); # ditto for hit sequence
my $qstart = $hsp->query->start(); # Start off the feature pair start
my $hstart = $hsp->subject->start(); # ditto
my $qend = $hsp->query->start(); # Set the feature pair end also
my $hend = $hsp->subject->start(); # ditto
if ($qstrand == -1) {
$qstart = $hsp->query->end;
$qend = $hsp->query->end;
}
if ($hstrand == -1) {
$hstart = $hsp->subject->end;
$hend = $hsp->subject->end;
}
my $count = 0; # counter for the bases in the alignment
my $found = 0; # flag saying whether we have a feature pair
my @tmpf;
while ($count <= $#qchars) {
# We have hit an ungapped region. Increase the query and hit
#counters and flag that we have a feature pair.
if ($qchars[$count] ne '-' &&
$hchars[$count] ne '-') {
$qend += $qinc;
$hend += $hinc;
$found = 1;
} else {
# We have hit a gapped region. If the feature pair flag is set
# ($found) then make a feature pair, store it and reset the start
# and end variables.
my $query_seqname;
if ($hsp->query->can('seq_id')) {
$query_seqname = $hsp->query->seq_id;
} else {
$query_seqname = $hsp->query->seqname;
}
if ($found == 1) {
my $fp = $self->convert_to_featurepair($qstart, $qend, $qstrand,
$qinc, $hstart, $hend,
$hstrand, $hinc, $name,
$query_seqname,
$hsp->score,
$hsp->percent, $hsp->P,
$hsp->positive,
$hsp->match);
push(@tmpf,$fp);
}
# We're in a gapped region. We need to increment the sequence that
# doesn't have the gap in it to keep the coordinates correct.
# We also need to reset the current end coordinates.
if ($qchars[$count] ne '-') {
$qstart = $qend + $qinc;
} else {
$qstart = $qend;
}
if ($hchars[$count] ne '-') {
$hstart = $hend + $hinc;
} else {
$hstart = $hend;
}
$qend = $qstart;
$hend = $hstart;
$found = 0;
}
$count++;
}
# Remember the last feature
if ($found == 1) {
my $query_seqname;
if ($hsp->query->can('seq_id')) {
$query_seqname = $hsp->query->seq_id;
} else {
$query_seqname = $hsp->query->seqname;
}
my $fp = $self->convert_to_featurepair($qstart, $qend, $qstrand,
$qinc, $hstart, $hend,
$hstrand, $hinc, $name,
$query_seqname,
$hsp->score,
$hsp->percent, $hsp->P,
$hsp->positive,
$hsp->match);
push(@tmpf,$fp);
}
my $fp;
$qinc = abs( $qinc );
$hinc = abs( $hinc );
if( $qinc == 3 && $hinc == 1 ) {
$fp = Bio::EnsEMBL::DnaPepAlignFeature->new(-features => \@tmpf);
} elsif( $qinc == 1 && $hinc == 3 ) {
$fp = Bio::EnsEMBL::PepDnaAlignFeature->new(-features => \@tmpf);
} elsif( $qinc == 1 && $hinc == 1 ) {
$fp = Bio::EnsEMBL::DnaDnaAlignFeature->new(-features => \@tmpf);
} else {
throw( "Hardcoded values wrong?? " );
}
# helps debugging subsequent steps
$fp->{'qseq'} = $hsp->querySeq();
$fp->{'sseq'} = $hsp->sbjctSeq();
# for compara
$fp->positive_matches($hsp->positive);
$fp->identical_matches($hsp->match);
if($fp->hstart > $fp->hend){
throw("Failed start ".$fp->hstart." is greater than end ".$fp->hend." ".
"for ".$fp->hseqname."\n");
}
return $fp;
}
=head2 find_increments
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : int, query strand
Arg [3] : int, hit strand
Function : work out the query and hit increments
Returntype: int, int query inc, hit inc
Exceptions: none
Example :
=cut
sub find_increments{
my ($self, $qstrand, $hstrand) = @_;
my $qinc = 1 * $qstrand;
my $hinc = 1 * $hstrand;
my $qtype = lc($self->query_type);
my $htype = lc($self->database_type);
if ($qtype eq 'dna' && $htype eq 'pep') {
$qinc = 3 * $qinc;
}
if ($qtype eq 'pep' && $htype eq 'dna') {
$hinc = 3 * $hinc;
}
return ($qinc,$hinc);
}
=head2 convert_to_featurepair
Arg [1] : Bio::EnsEMBL::Analysis::Tools::BPliteWrapper
Arg [2] : int, query start
Arg [3] : int, query end
Arg [4] : int, query strand
Arg [5] : int, query increment
Arg [6] : int, hit start
Arg [7] : int, hit end
Arg [8] : int, hit strand
Arg [9] : int, hit increment
Arg [10] : string, name
Arg [11] : string, query name
Arg [12] : int, bit score
Arg [13] : int, percent identity
Arg [14] : int, p value
Arg [15] : int, positive matches
Arg [16] : int, matches
Function : take values and taking account for inc values make feature
pair
Returntype: Bio::EnsEMBL::FeaturePair
Exceptions:
Example :
=cut
sub convert_to_featurepair{
my ($self, $qstart, $qend, $qstrand, $qinc, $hstart, $hend, $hstrand,
$hinc, $name, $seqname,$score, $percent, $pvalue, $positive, $matches) = @_;
my $tmpqend = $qend; $tmpqend -= $qinc;
my $tmphend = $hend; $tmphend -= $hinc;
my $tmpqstart = $qstart;
my $tmphstart = $hstart;
# This is for dna-pep alignments. The actual end base
# will be +- 2 bases further on.
if (abs($qinc) > 1) {
$tmpqend += $qstrand * 2;
}
if (abs($hinc) > 1) {
$tmphend += $hstrand * 2;
}
# Make sure start is always < end
if ($tmpqstart > $tmpqend) {
my $tmp = $tmpqstart;
$tmpqstart = $tmpqend;
$tmpqend = $tmp;
}
if ($tmphstart > $tmphend) {
my $tmp = $tmphstart;
$tmphstart = $tmphend;
$tmphend = $tmp;
}
my $fp = $self->feature_factory->create_feature_pair($tmpqstart,
$tmpqend, $qstrand,
$score, $tmphstart,
$tmphend, $hstrand,
$name, $percent,
$pvalue, $seqname,
undef,
$self->analysis,
$positive,
$matches);
return $fp;
}
1;