Raw content of Bio::EnsEMBL::Analysis::RunnableDB::FindPartialGenes.

Raw content of Bio::EnsEMBL::Analysis::RunnableDB::FindPartialGenes =pod =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::FindPartialGenes ; =head1 SYNOPSIS my $orthologueanalysis = Bio::EnsEMBL::Analysis::RunnableDB::FindPartialGenes->new ( -analysis => $analysis_obj, ); $orthologueanalysis->fetch_input(); $orthologueanalysis->run(); $orthologueanalysis->output(); $orthologueanalysis->write_output(); =head1 DESCRIPTION This object uses information from a Ensembl Compara database to find partial gene predictions in an organism. It's used to fetch the input from different databases as well as writing results the results to the database.a =head1 CONTACT Post general queries to B<ensembl-dev@ebi.ac.uk> =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a '_' =cut package Bio::EnsEMBL::Analysis::RunnableDB::FindPartialGenes; use strict; use Bio::SeqIO; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Analysis::RunnableDB; use Bio::EnsEMBL::Gene; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::GeneUtils qw(get_transcript_with_longest_CDS get_one2one_orth_for_gene_in_other_species ) ; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils; use Bio::EnsEMBL::Analysis::Tools::Utilities; use Bio::EnsEMBL::Analysis::Config::GeneBuild::OrthologueEvaluator; use Bio::EnsEMBL::Analysis::Config::Exonerate2Genes; use Bio::EnsEMBL::Registry; use Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor; use Bio::EnsEMBL::Pipeline::DBSQL::RuleAdaptor; use Bio::EnsEMBL::Pipeline::Rule; use vars qw(@ISA); use Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator; @ISA = qw ( Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator ) ; sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->{_partials}={}; return $self; } # fetches the genes out of the QUERY database # ( db which contains the predictions of species we're working on ) sub fetch_input { my( $self ) = @_; # fetch all protein_coding genes from QUERY_SPECIES $self->get_initial_geneset( $$MAIN_CONFIG{QUERY_SPECIES}, $$FIND_PARTIAL_GENES{DEFAULT_GENE_BIOTYPES} ); print "SUMMARY : " . scalar( @{$self->genes}) . " genes fetched\n" ; } sub run { my ( $self ) = @_; # process all genes on this slice # comparision species are defined in config file my $qy_species = $$MAIN_CONFIG{QUERY_SPECIES}; my $tg1_species = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{MAIN_ORTH}; my $tg2_species = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{TEST_SPEC_1}; my $tg3_species = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{TEST_SPEC_2}; # loop trough QUERY genes in species (dog) QUERY_GENES : for my $query_gene (@{$self->genes} ) { # # this code is mainly a cut-and-paste from Steve's find_partials.pl for mono3 # my $tg1_homol = get_one2one_orth_for_gene_in_other_species($query_gene,$tg1_species); if ($tg1_homol) { # we found a ONE-2-ONE between QUERY (dog) and MAIN_ORTH (human) # now we check if there are ortholgoues between human and test1 || human and test2 my $tg2_homol = get_one2one_orth_for_gene_in_other_species($tg1_homol,$tg2_species); my $tg3_homol = get_one2one_orth_for_gene_in_other_species($tg1_homol,$tg3_species); if ($tg2_homol || $tg3_homol) { my ($longest_qy,$len_longest_qy,$nexon_longest_qy) = get_transcript_with_longest_CDS($query_gene); my ($longest_tg1,$len_longest_tg1,$nexon_longest_tg1) = get_transcript_with_longest_CDS($tg1_homol); my ($longest_tg2,$len_longest_tg2,$nexon_longest_tg2) = get_transcript_with_longest_CDS($tg2_homol) if ($tg2_homol); my ($longest_tg3,$len_longest_tg3,$nexon_longest_tg3) = get_transcript_with_longest_CDS($tg3_homol) if ($tg3_homol); #print "Comparing lengths " . $qy_species . " $len_longest_qy " . $tg1_species . " $len_longest_tg1" . " " . $tg2_species ." " . (defined($tg2_homol) ? $len_longest_tg2 : "N/A") . " " . $tg3_species ." " . (defined($tg3_homol) ? $len_longest_tg3 : "N/A") . "\n"; my $ratio_tg1_tg2 = 10; my $ratio_tg1_tg3 = 10; $ratio_tg1_tg2 = $len_longest_tg2/$len_longest_tg1 if ($tg2_homol); $ratio_tg1_tg3 = $len_longest_tg3/$len_longest_tg1 if ($tg3_homol); my $ratio_tg1_qy = $len_longest_qy/$len_longest_tg1; # if length_of_human_cds vs length_of_mouse_cds between 0.9 and 1.1 (+-10%) # OR if length_of_human_cds vs legnght_of_monodelphis cds between 0.9 and 1.1 # and if predicted gene in query-species is less than 75% long than 1:1 orth in human # these values are set in the config my $ratio_tg1_tg2_low = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{RATIO_MAIN_vs_SPEC_1_LOW}; my $ratio_tg1_tg2_high = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{RATIO_MAIN_vs_SPEC_1_HIGH}; my $ratio_tg1_tg3_low = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{RATIO_MAIN_vs_SPEC_2_LOW}; my $ratio_tg1_tg3_high = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{RATIO_MAIN_vs_SPEC_2_HIGH}; my $ratio_tg1_qy_75 = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{RATIO_MAIN_vs_QUERY}; if ($ratio_tg1_tg2 > $ratio_tg1_tg2_low && $ratio_tg1_tg2 < $ratio_tg1_tg2_high || $ratio_tg1_tg3 > $ratio_tg1_tg3_low && $ratio_tg1_tg3 < $ratio_tg1_tg3_high ) { if ($ratio_tg1_qy < $ratio_tg1_qy_75 ) { print "Significant length difference for :" . $qy_species . "\t" . $longest_qy->stable_id . "\t" . $query_gene->stable_id . " (" . ($query_gene->external_name ? $query_gene->external_name : '') . ")" . " with " . $tg1_species . "\t" . $longest_tg1->stable_id . "\t" . $tg1_homol->stable_id . " (" . $tg1_homol->external_name . ")\n"; my $transcript_stable_id = $longest_tg1->stable_id; if ( $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{IGNORE_ORTHOLOGUES_ON_MT_REGIONS} ) { if ( $tg1_homol->slice->seq_region_name =~m/MT/){ print "ignore MT regions is switched on \n" ; print "\nIgnoring homolog : $transcript_stable_id 'cause it's on an MT region \n" ; } else { $self->partials($transcript_stable_id, $tg1_homol, $longest_tg1, $query_gene ) ; } } else { $self->partials($transcript_stable_id, $tg1_homol, $longest_tg1, $query_gene ) ; } } } } } } } sub partials { my ($self, $tsi, $tg1_homol, $longest_tg1_trans, $query_gene ) = @_ ; if ( $tsi ) { my $already_stored = 0 ; if (!exists($self->{_partials}{$tsi})) { $self->{_partials}{$tsi}{gene} = $tg1_homol; $self->{_partials}{$tsi}{matches} = []; $self->{_partials}{$tsi}{transcript} = []; } else { for my $longest_tr ( @{$self->{_partials}{$tsi}{transcript}} ) { if ( $longest_tr eq $longest_tg1_trans ) { $already_stored = 1 ; } } } unless ( $already_stored ) { push @{$self->{_partials}{$tsi}{matches}}, $query_gene; push @{$self->{_partials}{$tsi}{transcript}}, $longest_tg1_trans; } } return $self->{_partials}; } sub write_output{ my ($self) = @_; my %identified_partials = %{ $self->partials } ; # info output foreach my $id (keys %identified_partials) { my $gene = $identified_partials{$id}{gene}; print "#desc stable_id chr start end strand cov". " stable_id chr start end strand cov\n"; foreach my $match_gene (@{$identified_partials{$id}{matches}}) { printf " %-9s %18s %3s %9d %9d %2d 0 %18s %3s %8d %8d %2d 0\n", "NA",$gene->stable_id, $gene->slice->seq_region_name, $gene->start, $gene->end, $gene->strand, $match_gene->stable_id, $match_gene->slice->seq_region_name, $match_gene->start, $match_gene->end, $match_gene->strand; } print "#----\n"; } my @sequences_to_write ; foreach my $id (keys %identified_partials) { foreach my $longest_missing_trans (@{$identified_partials{$id}{transcript}}) { #print $longest_missing_trans ."\t" . $longest_missing_trans->stable_id . "\n" ; push @sequences_to_write, $longest_missing_trans; } # this loops trough the genes in query-speices (new_species) #foreach my $longest_missing_trans (@{$identified_partials{$id}{matches}}) { # print "TEST" . $longest_missing_trans ."\t" . $longest_missing_trans->stable_id . "\n" ; #} } # sequences need to be proteins so let's translate them # and check for stop codon before my @output ; TRANSLATIONS: for my $trans ( @sequences_to_write ) { if ( $trans->translation ) { my $translate = $trans->translate; my $transstr = $translate->seq() ; $transstr =~ s/(.{1,60})/$1\n/g; if ($transstr =~m/\*/ ) { print "SKIPPING " . $trans->translation->stable_id . " : translation contain stop codon\n$transstr\n\n"; next TRANSLATIONS ; } $translate->display_id($trans->translation->stable_id) ; push @output, $translate ; } } my $written_files = $self->chunk_and_write_fasta_sequences( shuffle(\@output) ) ; $self->upload_input_ids( $written_files ) ; } sub read_and_check_config { (my $self) = @_ ; $self->SUPER::read_and_check_config(); # - registry file is checked by OrthologueEvaluator # - compara init file and schema-versions checked by OrtholgoueAnalysis # check config for FindPartialGenes my %analysis_sets = %{$$FIND_PARTIAL_GENES{ANALYSIS_SETS}} ; foreach my $logic_name ( keys %analysis_sets ) { unless ( exists $$EXONERATE_CONFIG_BY_LOGIC{$logic_name} ) { throw("You have defined a logic_name ". "\"$logic_name\"\n in our OrthologueEvaluator.pm". " configuration file but there is no configuration for such an analysis in the\n". " Exonerate2Genes-config, so i don't know where to write the genes to.\n". " I suggest to add a configuration for $logic_name to your ". "Exoneate2Genes.pm config\n") ; } my %subsets = %{$analysis_sets{$logic_name}}; for my $species ( qw ( MAIN_ORTH TEST_SPEC_1 TEST_SPEC_2) ) { print "analysing $subsets{$species}\n" ; my $sa = Bio::EnsEMBL::Registry->get_adaptor($subsets{$species},'core',"Slice" ) ; unless ( $sa ) { throw ( " can't get adaptor for species $subsets{$species} - maybe typo in config ?"); } } my $analysis = $self->db->get_adaptor("Analysis")->fetch_by_logic_name($logic_name) ; unless ( $analysis) { throw("There's no analysis $logic_name in your database - this analysis is normally added by". " the setup script\n" ) ; } } } # ## ## ## methods which need to go into Utils : ## ## # # #sub get_transcript_with_longest_cds { # my ($ga, $stable_ids) = @_ ; # my @cds ; # # GENES : for my $sid (@$stable_ids ) { # my $gene = $ga->fetch_by_stable_id ($sid) ; # my $maxlen = 0; # my $nexonmax = 0; # my $longest_transcript; # # foreach my $trans (@{$gene->get_all_Transcripts}) { # my $len = 0; # if ($trans->translation) { # my @trans_exons = @{$trans->get_all_translateable_Exons()}; # foreach my $exon (@trans_exons) { # $len+= $exon->length; # } # if ($len > $maxlen) { # $maxlen = $len; # $longest_transcript = $trans; # $nexonmax = scalar($trans->get_all_Exons) # } # } # } # # if (!defined($longest_transcript)) { # print "No longest_transcript transcript found for " . $gene->stable_id . "\n" ; # next GENES ; # } # my $transstr = $longest_transcript->translate->seq; # $transstr =~ s/(.{1,60})/$1\n/g; # #print ">$sid\n$transstr\n\n" ; # if ($transstr =~m/\*/ ) { # print "SKIPPING " . $longest_transcript->translation->stable_id . # " : translation contain stop codon\n$transstr\n\n"; # next GENES ; # } # my $cds = $longest_transcript->translate ; # $cds->display_id( $longest_transcript->translation->stable_id) ; # push @cds, $cds ; # } # my @rand_cds ; # # randomize # return shuffle(\@cds) ; #} # # 1;