Raw content of Bio::EnsEMBL::Analysis::Runnable::Genefinder # Ensembl module for Bio::EnsEMBL::Analysis::Runnable::Genefinder # # Copyright (c) 2005 Ensembl # =head1 NAME Bio::EnsEMBL::Analysis::Runnable::Genefinder =head1 SYNOPSIS my $runnable = Bio::EnsEMBL::Analysis::Runnable::Genefinder->new( -query => $slice, -program => 'genefinder', ); $runnable->run; my @predictions = @{$runnable->output}; =head1 DESCRIPTION Wrapper to run the genefinder gene predictor and then parse the results into prediction transcripts =head1 CONTACT Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk =cut package Bio::EnsEMBL::Analysis::Runnable::Genefinder; use strict; use warnings; use Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio); =head2 new Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Genefinder Arg [2] : string, tablename file Arg [3] : string, intron penalties file Arg [4] : string, exon penalties file Function : create a Genefinder runnable Returntype: Bio::EnsEMBL::Analysis::Runnable::Genefinder Exceptions: Example : =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($tablenamefile, $intronpenalty, $exonpenalty) = rearrange(['TABLENAMEFILE', 'INTRONPENALTY', 'EXONPENALTY'], @args); ###################### #SETTING THE DEFAULTS# ###################### $self->tablenamefile('/usr/local/ensembl/data/nemtables/sanger.tablenamefile.cod'); $self->intronpenalty('/usr/local/ensembl/data/nemtables/intron_penalty.lookup'); $self->exonpenalty('/usr/local/ensembl/data/nemtables/exon_penalty.lookup'); $self->program('genefinder') if(!$self->program); ###################### $self->tablenamefile($tablenamefile); $self->intronpenalty($intronpenalty); $self->exonpenalty($exonpenalty); return $self; } =head2 run_analysis Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Genefinder Arg [2] : string, program name Function : create and open a commandline for the program genefinder Returntype: none Exceptions: throws if the program in not executable or the system command fails to execute or the appropriate files are not found Example : =cut sub run_analysis{ my ($self, $program) = @_; if(!$program){ $program = $self->program; } throw($program." is not executable Genefinder::run_analysis ") unless($program && -x $program); my $err_string = $self->filecheck; throw("Filecheck found missing files $err_string") if($err_string); my $command = $program .' -tablenamefile '.$self->tablenamefile. ' -intronPenaltyFile '.$self->intronpenalty.' -exonPenaltyFile '. $self->exonpenalty.' '.$self->queryfile.' > '. $self->resultsfile; print "Running analysis ".$command."\n"; my $value = system($command); #or throw("FAILED to run ".$command); throw("FAILED to run ".$command) if(!-s $self->resultsfile); } =head2 filecheck Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Genefinder Function : make sure the config files exist Returntype: string, any errors found Exceptions: Example : =cut sub filecheck{ my ($self) = @_; my $err_string; if(!-e $self->tablenamefile){ $err_string = "Tablenamefile ".$self->tablenamefile." does not ". "exist\n"; } if(!-e $self->intronpenalty){ $err_string .= "IntronPenalty ".$self->intronpenalty." does not". " exists\n"; } if(!-e $self->exonpenalty){ $err_string .= "ExonPenalty ".$self->exonpenalty." does not ". "exist\n"; } return $err_string; } =head2 parse_results Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Genefinder Arg [2] : string, resultsfile name Function : parse the results file into prediction exons then collate them into prediction transcripts and calculate their phases Returntype: none Exceptions: throws if cant open or close results file or the parsing doesnt work Example : =cut sub parse_results{ my ($self, $results) = @_; if(!$results){ $results = $self->resultsfile; } open(OUT, "<".$results) or throw("FAILED to open ".$results. "Genefinder:parse_results"); my $ff = $self->feature_factory; my $prefix; my @forward_lines; my @reverse_lines; while(<OUT>){ # this loop sorts the lines in to foward # and reverse genes chomp; if (/Sequence: (\S+)/) { $prefix = $1; } if($_ =~ /\.\./){ if($_ =~ /\*/){ push(@reverse_lines, $_); }else{ push(@forward_lines, $_); } } } my $gene_count = $self->parse_lines(\@forward_lines, $prefix, $ff, 0, 1); $gene_count = $self->parse_lines(\@reverse_lines, $prefix, $ff, $gene_count, -1); close(OUT) or throw("FAILED to close ".$results. "Genefinder:parse_results"); $self->create_transcripts; $self->calculate_phases; } #accessor methods =head2 calculate_phases Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Genefinder Function : works out which phase to make the exons to get a complete cds Returntype: none Exceptions: throws if the number of transcripts on the way out isnt the same as the number on the way in Example : =cut sub calculate_phases{ my ($self) = @_; my @phases = (0, 1, 2); my @transcripts = @{$self->output}; my @checked_transcripts; my $ff = $self->feature_factory; $self->clean_output; TRANS:foreach my $transcript(@transcripts){ PHASE:foreach my $phase(@phases){ my $temp_exons = $self->set_phases($phase, $transcript->get_all_Exons, ); my $new = $ff->create_prediction_transcript($temp_exons, $self->query, $self->analysis); if($new->translate->seq !~ /\*/){ push(@checked_transcripts, $new); next TRANS; } } } if(scalar(@checked_transcripts) != scalar(@transcripts)){ throw("One of the transcripts doesn't translate as there are ". @transcripts." from genefinder but only ". @checked_transcripts." transcripts which translate"); } $self->output(\@checked_transcripts); } =head2 parse_lines Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Genefinder Arg [2] : array, of lines from file Arg [3] : string Arg [4] : Bio::EnsEMBL::Analysis::Tools::FeatureFactory Arg [5] : int, starting count of genes Arg [6] : int, 1 or -1 for strand Function : parse lines into PredictionExon objects Returntype: int, new gene count (incremented for each line parsed) Exceptions: throws if * is found and strand isnt -1 and gives a warning about phase inconsistencies Example : =cut sub parse_lines{ my ($self, $lines, $prefix, $ff, $gene_count, $strand) = @_; my $phase = 0; foreach my $line(@$lines){ print $line."\n"; $line =~ s/\[TSL: \S+\s+\S+\]//; $line =~ s/start//; $line =~ s/end//; #end is stripped off $line =~ s/\[ -?(\d+\.\d+) \]//; my $score = $1; if($line =~ s/\*// && $strand != -1){ throw("Transcript is reverse strand but it isn't ". "being set to that"); } $line =~ s/\*//; if($strand == -1){ my @elements = split /\s+/, $line; $line = ''; foreach my $v (reverse(@elements)){ $line .= $v." "; } } my @values = split /\s+/, $line; if(!$values[0]){ my $first = shift @values; } if($values[0] =~ /U(\d)/){ $phase = $1; my $phase_variable = shift @values; } my $count = 0; my $exonname = $prefix.".".$gene_count; foreach my $coord(@values){ if($coord =~ /U\d/){ next; } my ($start, $end) = split /\.\./, $coord; my $end_phase = ($phase + ($end-$start) + 1)%3; my $exon = $ff->create_prediction_exon($start, $end, $strand, $score, 0, $phase, $exonname, $self->query, $self->analysis); $self->exon_groups($exonname, $exon); $phase = $end_phase; my $count++; if($values[$count] && $values[$count] =~ /U(\d)/){ if($phase != $1){ warning(" phase ".$phase." and continuation phase ".$1." aren't the same may be issues with translation\n"); } } } $gene_count++; } return $gene_count; } =head2 accessor methods Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Genefinder Arg [2] : string Function : these are all acessor methods for config filenames Returntype: string Exceptions: Example : =cut sub tablenamefile{ my ($self, $arg) = @_; if($arg){ $self->{'tablenamefile'} = $arg; } return $self->{'tablenamefile'}; } sub exonpenalty{ my ($self, $arg) = @_; if($arg){ $self->{'exonpenalty'} = $arg; } return $self->{'exonpenalty'}; } sub intronpenalty{ my ($self, $arg) = @_; if($arg){ $self->{'intronpenalty'} = $arg; } return $self->{'intronpenalty'}; } 1;