# Ensembl module for Bio::EnsEMBL::Analysis::Runnable
#
# Copyright (c) 2004 Ensembl
#

=head1 NAME

  Bio::EnsEMBL::Analysis::Runnable

=head1 SYNOPSIS

  my $repeat_masker = Bio::EnsEMBL::Analysis::Runnable::RepeatMasker->
  new(
      -query    => 'slice',
      -program  => 'repeatmasker',
      -options  => '-low',
      -analysis => $analysis,
     );
  $repeat_masker->run;
  my @repeats = @{$repeat_masker->output};

=head1 DESCRIPTION

This module is the base class for our Runnables. Runnables are there to
provide modules which can run different analyses and then parse the
results into core api objects.

This module provides some base functionality.

The constructor can take 8 different named arguments. The analysis
object is obligatory and must be passed in. The next 3 arguments, query,
program and options, are the most important as it is with these that the
Runnable knows what to run, on what sequences and with what command line
options. The next 4 are directory paths which can be determined from the
config file Bio::EnsEMBL::Analysis::Config::General, but arguments are
placed here so they can be overridden if desired.

The other base functionality includes some container methods, an output
method, as well as methods for finding files and executables and writing
sequence to fasta files.

All Runnables are expected to have 2 methods, run and output. run is the
method which should run the analysis and output is where the results
should be stored.

Generic versions of these methods are provided here. The run method
expects the runnable's program to fit the commandline model used by
run_analysis, "program options queryfile > resultsfile", or the child to
implement its own run_analysis. If the run method is used the child
Runnable must implement a parse_results method, as each analysis
generally has its own output format and as such it can't be made
generic.

The output method provided simply holds an array of results and can be
given an arrayref to push onto that array.

For more details about the specification look at Runnable.spec in the
ensembl-doc cvs module.

=head1 CONTACT

Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk

=cut

package Bio::EnsEMBL::Analysis::Runnable;

use strict;
use warnings;

use Bio::SeqIO;
#use Bio::EnsEMBL::Root;
use Bio::EnsEMBL::FeaturePair;
use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Analysis::Programs;
use Bio::EnsEMBL::Analysis::Config::General;
use Bio::EnsEMBL::Analysis::Tools::FeatureFactory;
use Bio::EnsEMBL::Analysis::Tools::Utilities qw(create_file_name write_seqfile);
use Bio::EnsEMBL::Analysis::Tools::Logger qw(logger_info);
use vars qw (@ISA);

@ISA = qw();


=head2 new

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : Bio::EnsEMBL::Slice
  Arg [3]   : string, name/path of program
  Arg [4]   : string commandline options for the program
  Arg [5]   : string path to working dir
  Arg [6]   : string, path to bin dir
  Arg [7]   : string, path to library dir
  Arg [8]   : string, path to data dir
  Arg [9]   : Bio::EnsEMBL::Analysis;
  Function  : create a new Bio::EnsEMBL::Analysis::Runnable
  Returntype: Bio::EnsEMBL::Analysis::Runnable
  Exceptions: throws if not passed an analysis object
  Example   : $runnable = Bio::EnsEMBL::Analysis::Runnable::RepeatMasker
  ->new
  (
   -query => $self->query,
   -program => $self->analysis->program_file,
   $self->parameters_hash,
  );

=cut

sub new {
  my ($class, @args) = @_;
  my $self = bless {}, $class;

  my ($query, $program, $options, $workdir, $bindir, $libdir,
      $datadir, $analysis) = rearrange(['QUERY', 'PROGRAM', 'OPTIONS',
                                        'WORKDIR', 'BINDIR', 'LIBDIR',
                                        'DATADIR', 'ANALYSIS'], @args);
  if (!$analysis) {
    throw("Can't create a Runnable without an analysis object");
  }

  # Every value goes through its accessor so the accessors' validation
  # (type checks, executable lookup, directory creation) is applied.
  $self->query($query);
  $self->program($program);
  $self->options($options);
  $self->workdir($workdir);
  $self->bindir($bindir);
  $self->libdir($libdir);
  $self->datadir($datadir);
  $self->analysis($analysis);

  return $self;
}


#containers

=head2 containers

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string
  Function  : container for specified variable. This pod refers to the
  four methods below: options, bindir, libdir and datadir. These are
  simple containers which don't do more than hold and return a given
  value. options falls back to the empty string; bindir, libdir and
  datadir fall back to $BIN_DIR, $LIB_DIR and $DATA_DIR respectively
  Returntype: string
  Exceptions: none
  Example   : my $options = $self->options;

=cut

sub options {
  my $self = shift;
  $self->{'options'} = shift if (@_);
  return $self->{'options'} || '';
}

sub bindir {
  my $self = shift;
  $self->{'bindir'} = shift if (@_);
  return $self->{'bindir'} || $BIN_DIR;
}

sub libdir {
  my $self = shift;
  $self->{'libdir'} = shift if (@_);
  return $self->{'libdir'} || $LIB_DIR;
}

sub datadir {
  my $self = shift;
  $self->{'datadir'} = shift if (@_);
  return $self->{'datadir'} || $DATA_DIR;
}


=head2 workdir

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, path to working directory
  Function  : If given a working directory which doesn't exist, and no
  working directory has been stored on this object yet, it will be
  created. As standard it defaults to $ANALYSIS_WORK_DIR (imported
  config) and then to /tmp
  Returntype: string, directory
  Exceptions: none (a warning is issued if the directory can't be created)
  Example   : my $dir = $self->workdir;

=cut

sub workdir {
  my $self = shift;
  my $workdir = shift;

  if ($workdir) {
    # NOTE(review): creation is only attempted the first time a workdir
    # is stored; a later call with a different, missing directory does
    # not create it. Kept as in the original - confirm this is intended.
    if (!$self->{'workdir'}) {
      if (!(-d $workdir)) {
        # The mode must be the octal literal 0777; the string '777' is
        # decimal 777 (octal 01411) and yields unintended permissions.
        mkdir($workdir, 0777)
          or warning("Failed to create working directory ".$workdir.
                     " Runnable:workdir : $!");
      }
    }
    $self->{'workdir'} = $workdir;
  }
  return $self->{'workdir'} || $ANALYSIS_WORK_DIR || '/tmp';
}


=head2 query

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : Bio::EnsEMBL::Slice
  Function  : container for the query sequence
  Returntype: Bio::EnsEMBL::Slice
  Exceptions: throws if passed an object which isn't a Bio::PrimarySeqI
  Example   : my $query = $self->query;

=cut

sub query {
  my $self = shift;
  my $slice = shift;

  if ($slice) {
    # eval guards against unblessed references, on which ->isa would die
    # with a generic perl error rather than the message below.
    my $is_seq = do {
      local $@;
      eval { $slice->isa('Bio::PrimarySeqI') };
    };
    throw("Must pass Runnable::query a Bio::PrimarySeqI not a ".
          $slice) unless ($is_seq);
    $self->{'query'} = $slice;
  }
  return $self->{'query'};
}


=head2 program

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, path to program
  Function  : uses locate_executable to find the path of the executable
  Returntype: string, path to program
  Exceptions: throws if program path isn't executable
  Example   : my $program = $self->program;

=cut

sub program {
  my $self = shift;
  my $program = shift;

  if ($program) {
    my $path = $self->locate_executable($program);
    $self->{'program'} = $path;
  }
  # Checked on every call, not just when setting.
  throw($self->{'program'}." is not executable")
    if ($self->{'program'} && !(-x $self->{'program'}));
  return $self->{'program'};
}


=head2 output

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : arrayref of output
  Function  : pushes passed in arrayref onto the output array
  Returntype: arrayref
  Exceptions: throws if not passed an arrayref
  Example   : $self->output(\@features);

=cut

sub output {
  my ($self, $output) = @_;

  if (!$self->{'output'}) {
    $self->{'output'} = [];
  }
  if ($output) {
    throw("Must pass Runnable:output an arrayref not a ".$output)
      unless (ref($output) eq 'ARRAY');
    push(@{$self->{'output'}}, @$output);
  }
  return $self->{'output'};
}


=head2 feature_factory

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : Bio::EnsEMBL::Analysis::Tools::FeatureFactory
  Function  : container for a feature factory object. If none is defined
  when one is requested a new one is created.
  Returntype: Bio::EnsEMBL::Analysis::Tools::FeatureFactory
  Exceptions: none
  Example   : my $ff = $self->feature_factory;

=cut

sub feature_factory {
  my ($self, $feature_factory) = @_;

  if ($feature_factory) {
    $self->{'feature_factory'} = $feature_factory;
  }
  if (!$self->{'feature_factory'}) {
    $self->{'feature_factory'} =
      Bio::EnsEMBL::Analysis::Tools::FeatureFactory->new();
  }
  return $self->{'feature_factory'};
}


=head2 analysis

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : Bio::EnsEMBL::Analysis
  Function  : container for analysis object
  Returntype: Bio::EnsEMBL::Analysis
  Exceptions: throws if passed incorrect object type
  Example   : my $analysis = $self->analysis;

=cut

sub analysis {
  my $self = shift;
  my $analysis = shift;

  if ($analysis) {
    # eval guards against unblessed references, on which ->isa would die
    # with a generic perl error rather than the message below.
    my $is_analysis = do {
      local $@;
      eval { $analysis->isa('Bio::EnsEMBL::Analysis') };
    };
    throw("Must pass RunnableDB:analysis a Bio::EnsEMBL::Analysis ".
          "not a ".$analysis) unless ($is_analysis);
    $self->{'analysis'} = $analysis;
  }
  return $self->{'analysis'};
}


=head2 files_to_delete/protect

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, file name
  Function  : both these methods create a hash keyed on file name, the
  first a list of files to delete, the second a list of files to protect
  Returntype: hashref
  Exceptions: none
  Example   : $self->files_to_delete($filename);

=cut

sub files_to_delete {
  my ($self, $file) = @_;

  if (!$self->{'del_list'}) {
    $self->{'del_list'} = {};
  }
  if ($file) {
    $self->{'del_list'}->{$file} = 1;
  }
  return $self->{'del_list'};
}

sub files_to_protect {
  my ($self, $file) = @_;

  if (!$self->{'protect_list'}) {
    $self->{'protect_list'} = {};
  }
  if ($file) {
    $self->{'protect_list'}->{$file} = 1;
  }
  return $self->{'protect_list'};
}


=head2 queryfile

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, filename
  Function  : will hold a given filename, or if one is requested but
  none defined it will use the create_filename method to create a
  filename. If the resultsfile name hasn't yet been defined it will set
  that to be queryfilename.out
  Returntype: string, filename
  Exceptions: none
  Example   : my $queryfile = $self->queryfile;

=cut

sub queryfile {
  my ($self, $filename) = @_;

  if ($filename) {
    $self->{'queryfile'} = $filename;
  }
  if (!$self->{'queryfile'}) {
    $self->{'queryfile'} = $self->create_filename('seq', 'fa');
  }
  if (!$self->resultsfile) {
    my $resultsfile = $self->{'queryfile'}.".out";
    $self->resultsfile($resultsfile);
  }
  return $self->{'queryfile'};
}


=head2 resultsfile

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, file name
  Function  : container for the results filename
  Returntype: string
  Exceptions: none
  Example   : my $resultsfile = $self->resultsfile;

=cut

sub resultsfile {
  my ($self, $filename) = @_;

  if ($filename) {
    $self->{'resultsfile'} = $filename;
  }
  return $self->{'resultsfile'};
}


#utility methods

=head2 create_filename

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, stem of filename
  Arg [3]   : string, extension of filename
  Arg [4]   : directory file should live in
  Function  : create a filename containing the PID and a random number
  with the specified directory, stem and extension. The directory
  defaults to the workdir
  Returntype: string, filename
  Exceptions: throw if directory specified doesn't exist
  Example   : my $queryfile = $self->create_filename('seq', 'fa');

=cut

sub create_filename {
  my ($self, $stem, $ext, $dir) = @_;

  if (!$dir) {
    $dir = $self->workdir;
  }
  $stem = '' if (!$stem);
  $ext = '' if (!$ext);
  throw($dir." doesn't exist Runnable:create_filename")
    unless (-d $dir);

  my $num = int(rand(100000));
  my $file = $dir."/".$stem.".".$$.".".$num.".".$ext;
  # Draw a new random number until the name doesn't clash with a file
  # already on disk. The file itself is not created here.
  while (-e $file) {
    $num = int(rand(100000));
    $file = $dir."/".$stem.".".$$.".".$num.".".$ext;
  }
  return $file;
}


=head2 locate_executable

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, program name
  Function  : first checks if the passed in name is executable, if not
  checks if the name catted with the bindir is executable, if not then
  uses Bio::EnsEMBL::Analysis::Programs to find where the program is
  Returntype: full path of program
  Exceptions: throws if no name of program is passed in
  Example   : my $path = $self->locate_executable('repeatmasker');

=cut

sub locate_executable {
  my ($self, $name) = @_;

  my $path;
  if ($name) {
    if (-x $name) {
      $path = $name;
    } elsif ($self->bindir && -x ($self->bindir."/$name")) {
      $path = $self->bindir."/$name";
    } else {
      # Last resort: ask the Programs module, then read the path from
      # its %Program_Paths hash.
      Bio::EnsEMBL::Analysis::Programs->import($name);
      $path = $Bio::EnsEMBL::Analysis::Programs::Program_Paths{$name};
    }
  } else {
    throw("Must pass Runnable:locate_executable a name if the program ".
          "is to be located");
  }
  return $path;
}


=head2 write_seq_file

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : Bio::Seq
  Arg [3]   : filename
  Function  : writes a sequence to file using write_seqfile from
  Bio::EnsEMBL::Analysis::Tools::Utilities. The sequence defaults to the
  query and the filename to the queryfile
  Returntype: string, filename
  Exceptions: none raised here directly
  Example   : my $filename = $self->write_seq_file;

=cut

sub write_seq_file {
  my ($self, $seq, $filename) = @_;

  if (!$seq) {
    $seq = $self->query;
  }
  if (!$filename) {
    $filename = $self->queryfile;
  }
  $filename = write_seqfile($seq, $filename);
  return $filename;
}


=head2 find_file

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, filename
  Function  : checks for the file's existence under the name given and
  then in the data and lib dirs, and returns the first path found
  Returntype: string, file path
  Exceptions: throws if can't find file
  Example   : my $path = $self->find_file($filename);

=cut

sub find_file {
  my ($self, $file) = @_;

  my $found;
  if (-e $file) {
    $found = $file;
  } elsif ($self->datadir && -e ($self->datadir."/".$file)) {
    $found = $self->datadir."/".$file;
  } elsif ($self->libdir && -e ($self->libdir."/".$file)) {
    $found = $self->libdir."/".$file;
  } else {
    throw($file." doesn't exist Runnable:find_file");
  }
  return $found;
}


=head2 delete_files

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : hashref, keyed on filenames to delete
  Arg [3]   : hashref keyed on filename to protect
  Function  : will unlink any file which exists on the first list but
  not on the second. The lists default to files_to_delete and
  files_to_protect
  Returntype: arrayref of protected filenames
  Exceptions: none
  Example   : $self->delete_files;

=cut

sub delete_files {
  my ($self, $filehash, $protected_hash) = @_;

  if (!$filehash) {
    $filehash = $self->files_to_delete;
  }
  if (!$protected_hash) {
    $protected_hash = $self->files_to_protect;
  }
  foreach my $name (keys(%$filehash)) {
    if (!$protected_hash->{$name}) {
      # The result of unlink is not checked; a file that can't be
      # removed is silently left behind.
      unlink $name;
    }
  }
  my @protected = keys(%$protected_hash);
  return \@protected;
}


=head2 clean_output

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Function  : empties output array. As some runnabledbs use the output
  array as a place holder, this offers a simple manner to empty it for
  reuse
  Returntype: arrayref that used to be contained by $self->{'output'};
  Exceptions: none
  Example   : my $old_output = $self->clean_output;

=cut

sub clean_output {
  my ($self) = @_;

  my $array = $self->{'output'};
  $self->{'output'} = [];
  return $array;
}


=head2 checkdir

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, directory (defaults to workdir)
  Arg [3]   : number, space limit in Gb (defaults to 0.01)
  Function  : check if specified directory has enough space and then
  changes into that directory
  Returntype: none
  Exceptions: throws if not enough diskspace or if can't change into
  specified directory
  Example   : $self->checkdir;

=cut

sub checkdir {
  my ($self, $dir, $spacelimit) = @_;

  if (!$dir) {
    $dir = $self->workdir;
  }
  if (!$spacelimit) {
    $spacelimit = 0.01;
  }
  throw("Not enough diskspace on ".$dir." RunnableDB:checkdir")
    unless ($self->diskspace($dir, $spacelimit));
  chdir($dir) or throw("FAILED to open ".$dir." Runnable::checkdir");
}


=head2 diskspace

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, directory
  Arg [3]   : number, space limit in Gb
  Function  : checks how much space is available in the specified
  directory using df -kP
  Returntype: int, binary toggle, returns 0 if not enough space, 1 if
  there is
  Exceptions: opens df using a pipe, throws if failed to open or close
  that pipe (closing fails when df exits with a non zero status)
  Example   : my $ok = $self->diskspace('/tmp', 0.01);

=cut

sub diskspace {
  my ($self, $dir, $limit) = @_;

  $limit = 0 if (!defined($limit));
  my $Gb = 1024 ** 3;

  # List form pipe open: df is run directly, so $dir is never seen by a
  # shell. With no directory given df reports on every filesystem.
  my @command = ('df', '-kP');
  push(@command, $dir) if (defined($dir) && length($dir));

  # 'or' (not '||') so the check applies to open's return value rather
  # than to its last argument.
  open(my $df, '-|', @command)
    or throw("FAILED to open 'df' pipe ".
             "Runnable::diskspace : $!\n");

  my $status = 1;
  my $seen_header = 0;
  while (my $line = <$df>) {
    # The first line of df output is the column header.
    if (!$seen_header) {
      $seen_header = 1;
      next;
    }
    my @values = split(' ', $line);
    next if (@values < 4);
    # Column 4 of df -kP is taken as available space in 1024 byte blocks.
    my $space_in_Gb = $values[3] * 1024 / $Gb;
    $status = 0 if ($space_in_Gb < $limit);
  }

  close($df)
    or throw("FAILED to close 'df' pipe ".
             "Runnable::diskspace : ".
             ($! ? $! : "df exited with status ".($? >> 8))."\n");
  return $status;
}


=head2 run

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, directory
  Function  : a generic run method. This checks the directory specified
  to run in, writes the query sequence to file, marks the query sequence
  file and results file for deletion, runs the analysis, parses the
  results and deletes any files
  Returntype: 1
  Exceptions: throws if no query sequence is specified
  Example   : $runnable->run;

=cut

sub run {
  my ($self, $dir) = @_;

  $self->workdir($dir) if ($dir);
  throw("Can't run ".$self." without a query sequence")
    unless ($self->query);
  $self->checkdir();
  my $filename = $self->write_seq_file();
  $self->files_to_delete($filename);
  $self->files_to_delete($self->resultsfile);
  $self->run_analysis();
  $self->parse_results;
  $self->delete_files;
  return 1;
}


=head2 run_analysis

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Arg [2]   : string, program name
  Function  : constructs a generic commandline in the form
  program options queryfile > resultsfile
  and runs it with system
  Returntype: none
  Exceptions: throws if program isn't defined or is not executable, or
  if the command returns a non zero status
  Example   : $self->run_analysis;

=cut

sub run_analysis {
  my ($self, $program) = @_;

  if (!$program) {
    $program = $self->program;
  }
  # defined() check avoids an uninitialized value warning while building
  # the message when no program has been set at all.
  throw((defined($program) ? $program : '').
        " is not executable Runnable::run_analysis ")
    unless ($program && -x $program);

  # The command is a single string because it relies on the shell for
  # the '>' redirection and for splitting the options string.
  my $command = $program." ";
  $command .= $self->options." " if ($self->options);
  $command .= $self->queryfile." > ".$self->resultsfile;
  logger_info("Running analysis ".$command);
  system($command) == 0 or throw("FAILED to run ".$command);
}


=head2 parse_results

  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable
  Function  : place holder to indicate a child Runnable should implement
  this method
  Returntype: none
  Exceptions: throws as this method should be implemented by any child
  module
  Example   : $self->parse_results;

=cut

sub parse_results {
  my ($self) = @_;

  throw("Need to implement parse results in ".$self.
        " Runnable won't provide this functionality for you");
}

1;