Raw content of Bio::EnsEMBL::Analysis::Runnable::BlastRfam # Ensembl module for Bio::EnsEMBL::Analysis::Runnable::Blast # # Copyright (c) 2004 Ensembl # =head1 NAME Bio::EnsEMBL::Analysis::Runnable::BlastRfam =head1 SYNOPSIS my $blast = Bio::EnsEMBL::Analysis::Runnable::BlastRfam-> new( -query => $slice, -program => 'wublastn', -database => 'embl_vertrna', -options => 'hitdist=40 -cpus=1', -parser => $bplitewrapper, -filter => $featurefilter, ); $blast->run; my @output =@{$blast->output}; =head1 DESCRIPTION Modified blast runnable for specific use with RFAMSEQ. Use for running BLASTN of genomic vs RFAMSEQ prior to ncRNA analysis using Infernal. Keeps the coverage in the dna_align_feature score field. Also clusters overlapping hits and picks the one with the lowest evalue to represent that cluster. =head1 CONTACT Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk =cut package Bio::EnsEMBL::Analysis::Runnable::BlastRfam; use strict; use warnings; use Bio::EnsEMBL::Analysis::Runnable; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use Bio::EnsEMBL::Analysis::Runnable::Blast; use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Analysis::Runnable::Blast); =head2 new Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Blast Arg [Parser] : A blast parser object must meet specified interface Arg [Filter] : A Filter object must meet specified interface Arg [Database]: string, database name/path Arg [Type] : string, wu or ncbi to specify which type of input Arg [Unknown_error_string] : the string to throw if the blast runs fails with an unexpected error 4 Function : create a Blast runnable Returntype: Bio::EnsEMBL::Analysis::Runnable::Blast Exceptions: throws if not given a database name or if not given a parser object Example : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($params ) = rearrange(['PARAMS'], @args); $self->params($params) if $params; return $self; } =head2 run Arg [1] : Bio::EnsEMBL::Analysis::Runnable Arg [2] : string, directory Function : a generic run method. This checks the directory specifed to run it, write the query sequence to file, marks the query sequence file and results file for deletion, runs the analysis parses the results and deletes any files Returntype: 1 Exceptions: throws if no query sequence is specified Example : =cut sub run{ my ($self, $dir) = @_; my $coverage = 70; # Coverage filter for very common blast hits $self->workdir($dir) if($dir); throw("Can't run ".$self." without a query sequence") unless($self->query); $self->checkdir(); eval { my $low_copy_db = $self->params->{'lowcopy'}; $self->throw("DATABASE NOT FOUND $low_copy_db") unless -e $low_copy_db; my $filename = $self->write_seq_file(); $self->files_to_delete($filename); $self->files_to_delete($self->resultsfile); $self->run_ncbi_analysis; $self->parse_results(20); $self->{'results_files'} = (); $self->run_analysis; $self->parse_results($coverage); $self->delete_files; }; if ($@){ print "$@\n";} return 1; } =head2 parse_results Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Blast Function : override results parsing to allow for coverage calculations Returntype: none Exceptions: none Example : =cut sub parse_results{ my ($self,$coverage_cutoff) = @_; my $results = $self->results_files; my @daf_coverage_results; my $filtered_output; my $bplite = $self->parser->get_parsers($results); foreach my $blast (@{$bplite}){ while( my $subject = $blast->nextSbjct){ while (my $hsp = $subject->nextHSP) { my @daf_results; my $hsp_length = $hsp->length."\t"; my $subject_length = $subject->{'LENGTH'}; $subject_length = 1 unless ($subject_length); my $coverage = $hsp_length/$subject_length*100; $coverage =~ s/\.\d+//; if ($coverage_cutoff){ next unless($coverage > $coverage_cutoff); } $subject->name =~ /^(\S+)\/\S+\s+(\w+);\w+/; my $name = $2."-".$1; push @daf_results, $self->parser->split_hsp($hsp,$name); # add coverage into daf score? foreach my $daf(@daf_results){ # swaps strands over if blast aligns +strand genomic to -ve strand in # RFAM file, RFAM file sequences are the correct orientation # Whereas the genomic can be either, with unspliced DNA it becomes # impossibe to tell automatically if ($daf->hstrand == -1 && $daf->strand == 1){ $daf->strand(-1); $daf->hstrand(1); } $daf->score($coverage); $daf->external_db_id(4200); push @daf_coverage_results, $daf; } } } } return undef unless ( @daf_coverage_results); my $output = $self->cluster(\@daf_coverage_results); $self->output($output); } # foreach hit, look at all the other hits and push into an array anything that overlaps with it # then sort the array and take the top scoring hits for heach familly that lives in it. sub cluster{ my ($self,$dafs_ref)=@_; my @dafs = @$dafs_ref; @dafs = sort{$a->p_value <=> $b->p_value} @dafs; my $start =0; my @representative_sequences; DAFS: foreach my $daf (@dafs){ $start ++; my %family_cluster; next DAFS unless($daf); my $RFAM = substr($daf->hseqname,0,7); push @{$family_cluster{$RFAM}},$daf; MATCHES: for (my $index = $start; $index <= $#dafs ; $index ++){ next MATCHES unless ($dafs[$index]); if ($daf->end >= $dafs[$index]->start() && $daf->start() <= $dafs[$index]->end){ my $familly = substr($dafs[$index]->hseqname,0,7); push @{$family_cluster{$familly}},$dafs[$index]; $dafs[$index] = undef; } } foreach my $domain (keys %family_cluster){ my @temp_array = @{$family_cluster{$domain}}; # get highest scoring alignment for each family to be representative @temp_array = sort{$a->p_value <=> $b->p_value} @temp_array; push @representative_sequences,shift @temp_array; } } return \@representative_sequences; } ##################################################### # USE THIS TO RUN NCBI BLAST IF YOU ARE SO INCLINED # ##################################################### =head2 run_analysis Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Blast Function : gets a list of databases to run against and constructs commandlines against each one and runs them Returntype: none Exceptions: throws if there is a problem opening the commandline or if blast produces an error Example : =cut sub run_ncbi_analysis { my ($self) = @_; # had coded options used in SGJs Rfam scan my $database = $self->params->{'lowcopy'}; my $options = "-W7 -F F -b 1000000 -v 1000000 "; my $command = "/software/bin/blastall"; my $filename = $self->queryfile; my $results_file = $self->create_filename("Infernal", 'blast.out'); $self->files_to_delete($results_file); $self->results_files($results_file); $command .= " -p blastn -d $database -i $filename "; $command .= "$options 2>&1 > ".$results_file; print "Running blast ".$command."\n"; open(my $fh, "$command |") || throw("Error opening Blast cmd <$command>." . " Returned error $? BLAST EXIT: '" . ($? >> 8) . "'," ." SIGNAL '" . ($? & 127) . "', There was " . ($? & 128 ? 'a' : 'no') . " core dump"); # this loop reads the STDERR from the blast command # checking for FATAL: messages (wublast) [what does ncbi blast say?] # N.B. using simple die() to make it easier for RunnableDB to parse. while(<$fh>){ if(/FATAL:(.+)/){ my $match = $1; print $match; # clean up before dying $self->delete_files; if($match =~ /no valid contexts/){ die qq{"VOID"\n}; # hack instead }elsif($match =~ /Bus Error signal received/){ die qq{"BUS_ERROR"\n}; # can we work out which host? }elsif($match =~ /Segmentation Violation signal received./){ die qq{"SEGMENTATION_FAULT"\n}; # can we work out which host? }elsif($match =~ /Out of memory;(.+)/){ # (.+) will be something like "1050704 bytes were last #requested." die qq{"OUT_OF_MEMORY"\n}; # resenD to big mem machine by rulemanager }elsif($match =~ /the query sequence is shorter than the word length/){ #no valid context die qq{"VOID"\n}; # hack instead }else{ warning("Something FATAL happened to BLAST we've not ". "seen before, please add it to Package: " . __PACKAGE__ . ", File: " . __FILE__); die ($self->unknown_error_string."\n"); # send appropriate string #as standard this will be failed so job can be retried #when in pipeline } }elsif(/WARNING:(.+)/){ # ONLY a warning usually something like hspmax=xxxx was exceeded # skip ... }elsif(/^\s{10}(.+)/){ # ten spaces # Continuation of a WARNING: message # Hope this doesn't catch more than these. # skip ... } } unless(close $fh){ # checking for failures when closing. # we should't get here but if we do then $? is translated #below see man perlvar warning("Error running Blast cmd <$command>. Returned ". "error $? BLAST EXIT: '" . ($? >> 8) . "', SIGNAL '" . ($? & 127) . "', There was " . ($? & 128 ? 'a' : 'no') . " core dump"); die ($self->unknown_error_string."\n"); } } sub params{ my $self = shift; $self->{'params'} = shift if(@_); return $self->{'params'}; } 1;