Raw content of Bio::EnsEMBL::Analysis::RunnableDB::IncrementalBuild package Bio::EnsEMBL::Analysis::RunnableDB::IncrementalBuild; use vars qw(@ISA); use strict; use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild; use Bio::EnsEMBL::Analysis::Config::GeneBuild::IncrementalBuild; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::Argument qw (rearrange); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::GeneUtils qw(empty_Gene); @ISA = qw ( Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild ); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->read_and_check_config($INCREMENTAL_HASH); return $self; } sub fetch_input{ my ($self) = @_; my $secondary_slice = $self->fetch_sequence($self->input_id, $self->secondary_database); my $secondary_genes = $self->get_Genes($secondary_slice, $self->SECONDARY_BIOTYPES); my $primary_slice = $self->fetch_sequence($self->input_id, $self->primary_database); my $primary_genes = $self->get_Genes($primary_slice, $self->PRIMARY_BIOTYPES); $self->query($secondary_slice); $self->secondary_genes($secondary_genes); $self->primary_genes($primary_genes); $self->primary_slice($primary_slice); } sub run{ my ($self) = @_; my $mask_regions = $self->mask_gene_regions($self->primary_genes); my @non_overlaps; my @overlaps; my @genes; if($self->PRE_FILTER){ #print "Running a prefilter\n"; my $filtered_genes = $self->filter_object->filter_genes($self->secondary_genes); push(@genes, @$filtered_genes); }else{ push(@genes, @{$self->secondary_genes}); } #print "Have ".@genes." secondary genes\n"; foreach my $gene(@genes){ my $keep_gene = 1; my $mask_reg_idx = 0; my @test_regions = ({start => $gene->start, end => $gene->end}); FEAT: foreach my $f (@test_regions) { #print "Comparing ".$f->{'start'}." ".$f->{'end'}."\n"; #print "have ".@$mask_regions." mask regions\n"; for( ; $mask_reg_idx < @$mask_regions; ) { my $mask_reg = $mask_regions->[$mask_reg_idx]; #print "To ".$mask_reg->{'start'}." ".$mask_reg->{'end'}."\n"; if ($mask_reg->{'start'} > $f->{'end'}) { # no mask regions will overlap this feature next FEAT; } elsif ( $mask_reg->{'end'} >= $f->{'start'}) { # overlap #print "Overlap found\n"; $keep_gene = 0; last FEAT; } else { $mask_reg_idx++; } } } if ($keep_gene) { my $gene_to_keep; if($self->CALCULATE_TRANSLATION){ $gene_to_keep = $self->calculate_translation($gene); }else{ $gene_to_keep = $gene; } push @non_overlaps, $gene_to_keep if($gene_to_keep); }else{ push(@overlaps, $gene); } } #print "Have ".@{$self->primary_genes}." primary genes\n"; #print "Have ".@non_overlaps." non overlapping genes\n"; #print "Have ".@overlaps." overlapping genes\n"; my @non_overlapping_genes; if($self->POST_FILTER){ my $filtered_genes = $self->filter_object->filter_genes(\@non_overlaps); push(@non_overlapping_genes, @$filtered_genes); }else{ push(@non_overlapping_genes, @non_overlaps); } my @output; push(@output, @non_overlapping_genes); if($self->NEW_BIOTYPE){ foreach my $output(@output){ $output->biotype($self->NEW_BIOTYPE); } } my $primary_genes = $self->primary_genes; print "Adding primary genes to set\n" if($self->STORE_PRIMARY); push(@output, @$primary_genes) if($self->STORE_PRIMARY); $self->output(\@output); #print "have ".@output." genes to store\n"; $self->overlapping_genes(\@overlaps); } sub write_output{ my ($self) = @_; my $db = $self->output_database; my $gene_adaptor = $db->get_GeneAdaptor; foreach my $gene (@{$self->output}){ empty_Gene($gene); eval{ $gene_adaptor->store($gene); }; if($@){ throw("Failed to store gene $@"); } } } sub overlapping_genes{ my ($self, $value) = @_; if($value){ $self->{'overlapping_genes'} = $value; } return $self->{'overlapping_genes'}; } sub secondary_genes{ my ($self, $value) = @_; if($value){ $self->{'secondary_genes'} = $value; } return $self->{'secondary_genes'}; } sub primary_genes{ my ($self, $value) = @_; if($value){ $self->{'primary_genes'} = $value; } return $self->{'primary_genes'}; } sub primary_slice{ my ($self, $value) = @_; if($value){ $self->{'primary_slice'} = $value; } return $self->{'primary_slice'}; } sub secondary_database{ my ($self, $value) = @_; if($value){ $self->{'secondary_database'} = $value; } if(!$self->{'secondary_database'}){ my $db = $self->get_dbadaptor($self->SECONDARY_DB); $db->dnadb($self->db); $self->{'secondary_database'} = $db; } return $self->{'secondary_database'}; } sub primary_database{ my ($self, $value) = @_; if($value){ $self->{'primary_database'} = $value; } if(!$self->{'primary_database'}){ my $db = $self->get_dbadaptor($self->PRIMARY_DB); $db->dnadb($self->db); $self->{'primary_database'} = $db; } return $self->{'primary_database'}; } sub output_database{ my ($self, $value) = @_; if($value){ $self->{'output_database'} = $value; } if(!$self->{'output_database'}){ my $db = $self->get_dbadaptor($self->OUTPUT_DB); $db->dnadb($self->db); $self->{'output_database'} = $db; } return $self->{'output_database'}; } sub PRIMARY_BIOTYPES{ my ($self,$value) = @_; if ($value) { $self->{'PRIMARY_BIOTYPES'} = $value; } if (exists($self->{'PRIMARY_BIOTYPES'})) { return $self->{'PRIMARY_BIOTYPES'}; } else { return undef; } } sub PRIMARY_DB{ my ($self,$value) = @_; if ($value) { $self->{'PRIMARY_DB'} = $value; } if (exists($self->{'PRIMARY_DB'})) { return $self->{'PRIMARY_DB'}; } else { return undef; } } sub SECONDARY_BIOTYPES{ my ($self,$value) = @_; if ($value) { $self->{'SECONDARY_BIOTYPES'} = $value; } if (exists($self->{'SECONDARY_BIOTYPES'})) { return $self->{'SECONDARY_BIOTYPES'}; } else { return undef; } } sub SECONDARY_DB{ my ($self,$value) = @_; if ($value) { $self->{'SECONDARY_DB'} = $value; } if (exists($self->{'SECONDARY_DB'})) { return $self->{'SECONDARY_DB'}; } else { return undef; } } sub SECONDARY_FILTER{ my ($self,$value) = @_; if ($value) { $self->{'SECONDARY_FILTER'} = $value; } if (exists($self->{'SECONDARY_FILTER'})) { return $self->{'SECONDARY_FILTER'}; } else { return undef; } } sub SECONDARY_PADDING{ my ($self,$value) = @_; if ($value) { $self->{'SECONDARY_PADDING'} = $value; } if (exists($self->{'SECONDARY_PADDING'})) { return $self->{'SECONDARY_PADDING'}; } else { return undef; } } sub OUTPUT_DB{ my ($self,$value) = @_; if ($value) { $self->{'OUTPUT_DB'} = $value; } if (exists($self->{'OUTPUT_DB'})) { return $self->{'OUTPUT_DB'}; } else { return undef; } } sub PRE_FILTER{ my ($self,$value) = @_; if (defined $value) { $self->{'PRE_FILTER'} = $value; } if (exists($self->{'PRE_FILTER'})) { return $self->{'PRE_FILTER'}; } else { return undef; } } sub POST_FILTER{ my ($self,$value) = @_; if (defined $value) { $self->{'POST_FILTER'} = $value; } if (exists($self->{'POST_FILTER'})) { return $self->{'POST_FILTER'}; } else { return undef; } } sub STORE_PRIMARY{ my ($self,$value) = @_; if (defined $value) { $self->{'STORE_PRIMARY'} = $value; } if (exists($self->{'STORE_PRIMARY'})) { return $self->{'STORE_PRIMARY'}; } else { return undef; } } sub CALCULATE_TRANSLATION{ my ($self,$value) = @_; if (defined $value) { $self->{'CALCULATE_TRANSLATION'} = $value; } if (exists($self->{'CALCULATE_TRANSLATION'})) { return $self->{'CALCULATE_TRANSLATION'}; } else { return undef; } } sub DISCARD_IF_NO_ORF{ my ($self,$value) = @_; if (defined $value) { $self->{'DISCARD_IF_NO_ORF'} = $value; } if (exists($self->{'DISCARD_IF_NO_ORF'})) { return $self->{'DISCARD_IF_NO_ORF'}; } else { return undef; } } sub NEW_BIOTYPE{ my ($self,$value) = @_; if ($value) { $self->{'NEW_BIOTYPE'} = $value; } if (exists($self->{'NEW_BIOTYPE'})) { return $self->{'NEW_BIOTYPE'}; } else { return undef; } } sub read_and_check_config { my $self = shift; $self->SUPER::read_and_check_config($INCREMENTAL_HASH); ####### #CHECKS ####### foreach my $config_var (qw(PRIMARY_DB SECONDARY_DB OUTPUT_DB)) { throw("You must define $config_var in config for logic '". $self->analysis->logicname."'") if not defined $self->$config_var; } if($self->SECONDARY_FILTER){ if($self->SECONDARY_FILTER->{OBJECT}){ my $module = $self->SECONDARY_FILTER->{OBJECT}; my $pars = $self->SECONDARY_FILTER->{PARAMETERS}; (my $class = $module) =~ s/::/\//g; eval{ require "$class.pm"; }; throw("Couldn't require ".$class. " IncrementalBuild:read_and_check_config $@") if($@); $self->filter_object($module->new(%{$pars})); } } } sub filter_object{ my ($self,$value) = @_; if ( $value) { $self->{'filter_object'} = $value; } if (exists($self->{'filter_object'})) { return $self->{'filter_object'}; } else { return undef; } } sub mask_gene_regions{ my ($self, $genes) = @_; #print "Generating mask regions for ".@$genes." genes\n"; my @mask_gene_regions; foreach my $gene(@$genes){ push @mask_gene_regions, { start => $gene->start, end => $gene->end }; } @mask_gene_regions = sort {$a->{'start'} <=> $b->{'start'} } @mask_gene_regions; #print "Returning ".@mask_gene_regions." mask regions\n"; return \@mask_gene_regions; } sub calculate_translation{ my ($self, $gene) = @_; my $gene_to_keep = Bio::EnsEMBL::Gene->new(); $gene_to_keep->biotype($gene->biotype); $gene_to_keep->confidence($gene->confidence); $gene_to_keep->analysis($self->analysis); $gene_to_keep->source($gene->source); $gene_to_keep->stable_id($gene->stable_id); my @new_transcripts; TRANS:foreach my $transcript (@{$gene->get_all_Transcripts}){ my $seq = $transcript->slice->seq; #print "The slice seq of the transcript = ".$seq."\n"; #print "dbadaptor of transcript ". # $transcript->adaptor->db->dnadb->dbname."\n"; my $new_translation = Bio::EnsEMBL::Pipeline::Tools::TranslationUtils->return_translation($transcript); if(!$new_translation && $self->DISCARD_IF_NO_ORF){ next TRANS; }else{ $transcript->translation($new_translation); push(@new_transcripts, $transcript); } } if(@new_transcripts >= 1){ foreach my $new_transcript(@new_transcripts){ $gene_to_keep->add_Transcript($new_transcript); } return $gene_to_keep; }else{ return undef; } } sub get_Genes{ my ($self, $slice, $biotypes) = @_; my @genes; if(@$biotypes){ foreach my $biotype(@$biotypes){ push(@genes, @{$slice->get_all_Genes_by_type($biotype)}); } }else{ push(@genes, @{$slice->get_all_Genes}); } return \@genes; } 1;