Raw content of Bio::EnsEMBL::Analysis::RunnableDB::ExonerateForGenewise # # Ensembl module for Bio::EnsEMBL::Analysis::RunnableDB # # Copyright (c) 2008 Ensembl # =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::ExonerateForGenewise =head1 SYNOPSIS my $EFG = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateForGenewise->new() $EFG->fetch_input; $EFG->run; $EFG->write_output; =head1 DESCRIPTION This module replaces Bio::EnsEMBL::Pipeline::RunnableDB::TargettedExonerate =cut package Bio::EnsEMBL::Analysis::RunnableDB::ExonerateForGenewise; use vars qw(@ISA); use strict; use Bio::EnsEMBL::Analysis::RunnableDB::BlastMiniGenewise; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::Argument qw (rearrange); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw(Transcript_info set_stop_codon set_start_codon attach_Slice_to_Transcript evidence_coverage list_evidence); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ConfigDependent::TranscriptUtils qw( low_complexity_less_than_maximum ) ; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranslationUtils qw(contains_internal_stops); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::GeneUtils qw(Gene_info print_Gene); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils qw(id coord_string lies_inside_of_slice); use Bio::EnsEMBL::Analysis::Tools::Logger; use Bio::EnsEMBL::KillList::KillList; use Bio::EnsEMBL::Analysis::Config::GeneBuild::BlastMiniGenewise qw(GENEWISE_CONFIG_BY_LOGIC); use Bio::EnsEMBL::Analysis::Tools::Utilities qw(create_file_name write_seqfile); @ISA = qw ( Bio::EnsEMBL::Analysis::RunnableDB::BlastMiniGenewise ); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->read_and_check_config($GENEWISE_CONFIG_BY_LOGIC); $self->extra_sanity_check(); return $self; } sub create_bmg_runnables{ my ($self, $hits_hash, $seqfetcher, $analysis) = @_; throw("RunnableDB::BlastMiniGenewise Can't create BlastMiniGenewise ". "runnables without a hash of hits") unless($hits_hash); $seqfetcher = $self->seqfetcher if(!$seqfetcher); $analysis = $self->analysis if(!$analysis); my %exonerate_parameters = %{$self->EXONERATE_PARAMETERS}; my %params_hash = %{$self->parameters_hash}; if($self->LIMIT_TO_FEATURE_RANGE == 0){ my $genomic_file = write_seqfile($self->query, create_file_name("genomic", "fa", "/tmp/")); $self->genomic_file($genomic_file); my @ids = sort keys(%$hits_hash); my $seqs = $self->get_protein_sequences(\@ids); my $peptide_file = write_seqfile($seqs, create_file_name("peptide", "fa", "/tmp/")); $self->peptide_file($peptide_file); my $r = Bio::EnsEMBL::Analysis::Runnable::ExonerateTranscript->new ( -analysis => $self->analysis, -target_file => $genomic_file, -query_type => 'protein', -query_file => $peptide_file, -program => $self->analysis->program_file, %exonerate_parameters, %params_hash, ); $self->runnable($r); }else{ foreach my $id(keys(%$hits_hash)){ my $features = $hits_hash->{$id}; foreach my $feature(@$features){ #This should be made a non overlapping range my $start = $feature->seq_region_start - $self->FEATURE_RANGE_PADDING; my $end = $feature->seq_region_end + $self->FEATURE_RANGE_PADDING; $start = 1 if($start < 1); $end = $self->query->seq_region_length if($end > $self->query->seq_region_length); my $name = ($self->query->coord_system->name.":". $self->query->coord_system->version.":". $self->query->seq_region_name.":". $start.":".$end.":". $self->query->strand); my $db = $self->get_dbadaptor($self->GENE_SOURCE_DB); my $query = $self->fetch_sequence($name, $db, $self->REPEATMASKING); logger_info("Creating ExonerateTranscript Runnable over a limited range with " .$id." and ".$seqfetcher." to run on ".$query->name); my $genomic_file = write_seqfile($query, create_file_name("genomic", "fa", "/tmp/")); $self->genomic_file($genomic_file); my $seqs = $self->get_protein_sequences([$id]); my $peptide_file = write_seqfile($seqs, create_file_name("peptide", "fa", "/tmp/")); $self->peptide_file($peptide_file); my $r = Bio::EnsEMBL::Analysis::Runnable::ExonerateTranscript->new ( -analysis => $self->analysis, -target_file => $genomic_file, -query_type => 'protein', -query_file => $peptide_file, %exonerate_parameters, %params_hash, ); $self->runnable($r); } } } } sub process_transcripts{ my ($self, $transcripts) = @_; my @complete_transcripts; foreach my $transcript(@$transcripts){ my $query_name = $transcript->start_Exon->seqname; my $unmasked_slice = $self->fetch_sequence($query_name, $self->output_db); attach_Slice_to_Transcript($transcript, $unmasked_slice); my $start_t = set_start_codon($transcript); my $end_t = set_stop_codon($start_t); if(contains_internal_stops($transcript)){ warning(Transcript_info($transcript)." contains internal stops will be thrown ". "This will be thrown away"); } push(@complete_transcripts, $end_t); } foreach my $file($self->genomic_file, @{$self->peptide_file}){ unlink $file; } return \@complete_transcripts } =head2 filter_genes Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::BlastMiniGenewise Arg [2] : arrayref of Genes Function : filters the gene sets using the defined filter Returntype: n/a, stores the genes in the output array Exceptions: Example : =cut sub filter_genes{ my ($self, $genes) = @_; my $output; my $rejected_set; my $filtered_set; if($self->filter_object){ ($filtered_set, $rejected_set) = $self->filter_object->filter_genes($genes); #$self->rejected_set($rejected_set); foreach my $gene(@{$filtered_set}){ my $transcripts = $gene->get_all_Transcripts; my $transcript = $transcripts->[0]; my $evidence = $self->get_Transcript_supporting_evidence($transcript); my $coverage = evidence_coverage($transcript, $evidence); if(low_complexity_less_than_maximum($transcript, $coverage)){ push(@$output, $gene); }else{ $gene->description("low_complexity is greater than evidence coverage"); push(@$rejected_set, $gene); } } }else{ $self->output($genes); } $self->output($output); $self->rejected_set($rejected_set); } sub genomic_file{ my ($self) = @_; if(@_){ $self->{genomic_file} = shift; } return $self->{genomic_file}; } sub peptide_file{ my ($self) = @_; if(@_){ my $file = shift; push(@{$self->{peptide_file}}, $file); } return $self->{peptide_file}; } sub get_protein_sequences{ my ($self, $ids) = @_; my $seqfetcher = $self->seqfetcher; my $seqs; my $seq; foreach my $id(@$ids){ eval { $seq = $seqfetcher->get_Seq_by_acc($id); }; if ($@) { throw("Problem fetching sequence for [$id]: [$@]\n"); } if(!$seq){ throw("Can't fetch the sequence $id with ".$seqfetcher." - it's not in the index\n"); } push(@$seqs, $seq); } return $seqs; } sub extra_sanity_check{ my ($self) = @_; throw("ExonerateAlignFeatures needs an exonerate parameters hash defined") if(!$self->EXONERATE_PARAMETERS); } sub get_Transcript_supporting_evidence{ my ($self, $transcript, $seqfetcher) = @_; $seqfetcher = $self->seqfetcher if(!$seqfetcher); throw("Can't get ".$transcript." transcrpts supporting features without ". "a seqfetcher object") if(!$seqfetcher); my $ids = list_evidence($transcript); my $sequence; foreach my $id(@$ids){ $sequence = $seqfetcher->get_Seq_by_acc($id); last if($sequence); } return $sequence; } 1; 1;