Raw content of Bio::EnsEMBL::Analysis::RunnableDB::FindMissingOrthologues.

Raw content of Bio::EnsEMBL::Analysis::RunnableDB::FindMissingOrthologues =pod =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::FindMissingOrthologues; =head1 SYNOPSIS my $orthologueanalysis = Bio::EnsEMBL::Analysis::RunnableDB::FindMissingOrthologues->new( -analysis => $analysis_obj, ); $orthologueanalysis->fetch_input(); $orthologueanalysis->run(); $orthologueanalysis->output(); $orthologueanalysis->write_output(); =head1 DESCRIPTION This object wraps Bio::EnsEMBL::Analysis::Runnable::OrthologueEvaluator and is used to fetch the input from different databases as well as writing results the results to the database.a =head1 CONTACT Post general queries to B<ensembl-dev@ebi.ac.uk> =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a '_' =cut package Bio::EnsEMBL::Analysis::RunnableDB::FindMissingOrthologues; use strict; use Bio::SeqIO; use Bio::EnsEMBL::Utils::Exception qw(throw warning verbose); use Bio::EnsEMBL::Analysis::RunnableDB; use Bio::EnsEMBL::Gene; use Bio::EnsEMBL::Analysis::Config::GeneBuild::OrthologueEvaluator; use Bio::EnsEMBL::Analysis::Config::Exonerate2Genes; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::GeneUtils qw ( get_one2one_orth_for_gene_in_other_species get_transcript_with_longest_CDS ) ; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranslationUtils qw (contains_internal_stops); use Bio::EnsEMBL::Analysis::Tools::Utilities qw (shuffle ) ; use Bio::EnsEMBL::Registry; use Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor; use Bio::EnsEMBL::Pipeline::DBSQL::RuleAdaptor; use Bio::EnsEMBL::Pipeline::Rule; use vars qw(@ISA); use Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator; @ISA = qw ( Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator ) ; sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->read_and_check_config ; $self->verbose(1) ; my $trusted_one2one_set= $$FIND_MISSING_ORTHOLOGUES{ANALYSIS_SETS}{$self->post_logic_name}; print "Searching for 1:1 orthoologues between $$trusted_one2one_set[0] and $$trusted_one2one_set[1]\n" ; $self->species_1($$trusted_one2one_set[0]) ; $self->species_2($$trusted_one2one_set[1]) ; verbose('WARNING') ; return $self; } # first analysis runs module FindMissingOrthologues and identifies possible targets, dumps them and # creates input_ids , than exonerate2genes wrapper runs with logich_name in Exoner2Genes config .... # --> do the logic_names in both configs need to be the same ? ?? # fetches the genes which have homologues in different species sub fetch_input { my( $self ) = @_; # get all genes on slice for 1st species in array (species 1) print "getting all genes for " . $self->species_1 . " type $$FIND_MISSING_ORTHOLOGUES{DEFAULT_GENE_BIOTYPES}\n"; $self->get_initial_geneset($self->species_1, $$FIND_MISSING_ORTHOLOGUES{DEFAULT_GENE_BIOTYPES}) ; } sub run { my ($self) = @_; my $query_spec = $$MAIN_CONFIG{QUERY_SPECIES}; my (%missing_orth,%trusted_orth); my $no_one2one_found = 0 ; QUERY_GENES : for my $gene_spec1 (@{$self->genes} ) { print " processing gene " . $gene_spec1->stable_id . " species : " . $self->species_1 ."\n" ; print "looking if gene has homolog in " . $self->species_2 . " \n" ; my $tg1_homol = get_one2one_orth_for_gene_in_other_species($gene_spec1 ,$self->species_2) ; if ( $tg1_homol ) { print "the gene has one2one-homology in " . $self->species_2 . " and is considered as trusted !\n" ; $trusted_orth{$gene_spec1->stable_id}=1; print "checking if gene has homology in new build : $query_spec\n" ; my $look_if_new_build_has_homologue = 0 ; foreach my $homolog_to_check ( @{ $gene_spec1->get_all_homologous_Genes()} ) { my ($check_homg, $check_homology, $check_species ) = @$homolog_to_check ; if ($check_species eq $query_spec ) { print "orth. between $query_spec and $check_species found\n" ; $look_if_new_build_has_homologue = 1 ; #next QUERY_GENES ; } #else { # print "flagging gene as missing orth : " . $gene_spec1->stable_id . "\n" ; # $missing_orth{$gene_spec1->stable_id} = $gene_spec1 ; #} } if ( $look_if_new_build_has_homologue == 0 ) { $missing_orth{$gene_spec1->stable_id} = $gene_spec1 ; print "\nFOUND MISS ORTH :" . $gene_spec1->stable_id . "\n\n" ; } } else { print " - no 1:1 homologue found for " . $self->species_2 . " so we don't consider this gene as trusted : " . $gene_spec1->stable_id . "\n" ; $no_one2one_found++; } } print "number of all genes fetched : " . scalar ( @{$self->genes } ) . "\n" ; print "\n\nNumber of genes out of ".$self->species_1 ." which don't have a one2one orth in " .$self->species_2 . " : $no_one2one_found\n"; for ( keys %missing_orth) { print "missing_orthologue: $_\n"; } print scalar( keys %missing_orth)." missing orthologues identifed beteen " . $self->species_1 . " and $query_spec ( using " . $self->species_2 ." as informant )\n"; # dump sequences of identified missing orthologues my @input_id_file_names ; my @longest_transcripts; for my $stable_id ( keys %missing_orth ) { # check for stop codon my ($ltr, $m,$n) = get_transcript_with_longest_CDS($missing_orth{$stable_id}) ; unless (contains_internal_stops($ltr)){ push @longest_transcripts, $ltr; } else { print "Skipping transcript because it contains internal stops\n" ; } } print " getting longest transcripts and their translation for the missing orth\n" ; my @cds ; for my $tr ( @longest_transcripts ) { my $cds = $tr->translate; $cds->display_id($tr->translation->stable_id ) ; push @cds, $cds ; } print " use -write option in the test-runnable to dump the identfied sequences to disk\n" ; print " and upload the new input_ids into the reference-db for the post-analysis (exonerate)\n" ; $self->output(shuffle(\@cds)) ; } sub write_output{ my ($self) = @_; my $nr_of_seqs_in_file = 1 ; # chunk-size my $written_files = $self->chunk_and_write_fasta_sequences($self->output, undef,undef,undef, $nr_of_seqs_in_file ) ; if ( $written_files ) { print scalar(@$written_files) . " fasta-files written for " . $self->species_1."\n" ; print "\nsequence written to files:\n" ; print join ("\n", @$written_files) ; print "\n\n"; $self->upload_input_ids( $written_files ); print STDERR "input_ids uploaded\n" ; } else { print STDERR "NO input_ids uploaded\n" ; } } sub check_transcript { my $tr = shift ; my $transstr = $tr->translate->seq; $transstr =~ s/(.{1,60})/$1\n/g; #print ">$sid\n$transstr\n\n" ; if ($transstr =~m/\*/ ) { print "SKIPPING " . $tr->translation->stable_id . " : translation contain stop codon\n$transstr\n\n"; return undef ; } return $tr ; } sub filter_homologies { my ( $all_homologies , $look_for_this_species) = @_ ; my @result ; HOMOLOGIES :for my $homology ( @$all_homologies ) { my @all_member_attributes = @{$homology->get_all_Member_Attribute} ; # first object is source itself so don't process this shift @all_member_attributes ; MA: foreach my $member_attribute (@all_member_attributes) { my ($new_member, $attribute) = @{$member_attribute}; my $species_name_of_orthologue = $new_member->genome_db->name ; if ( $species_name_of_orthologue =~m/$look_for_this_species/) { push @result, $homology ; } } } return \@result ; } sub read_and_check_config { (my $self) = @_ ; $self->SUPER::read_and_check_config(); # - registry file is checked by OrthologueEvaluator # - compara init file and schema-versions checked by OrtholgoueAnalysis # check config for FindMissingOrthologues my %config_hash = %{$FIND_MISSING_ORTHOLOGUES}; } # methods which need to go into Utils : sub get_longest_transcripts{ my ($ga, $stable_ids) = @_ ; my @cds ; GENES : for my $sid (@$stable_ids ) { my $gene = $ga->fetch_by_stable_id ($sid) ; my $maxlen = 0; my $nexonmax = 0; my $longest_transcript; foreach my $trans (@{$gene->get_all_Transcripts}) { my $len = 0; if ($trans->translation) { my @trans_exons = @{$trans->get_all_translateable_Exons()}; foreach my $exon (@trans_exons) { $len+= $exon->length; } if ($len > $maxlen) { $maxlen = $len; $longest_transcript = $trans; $nexonmax = scalar($trans->get_all_Exons) } } } if (!defined($longest_transcript)) { print "No longest_transcript transcript found for " . $gene->stable_id . "\n" ; next GENES ; } my $transstr = $longest_transcript->translate->seq; $transstr =~ s/(.{1,60})/$1\n/g; #print ">$sid\n$transstr\n\n" ; if ($transstr =~m/\*/ ) { print "SKIPPING " . $longest_transcript->translation->stable_id . " : translation contain stop codon\n$transstr\n\n"; next GENES ; } my $cds = $longest_transcript->translate ; $cds->display_id( $longest_transcript->translation->stable_id) ; push @cds, $cds ; } my @rand_cds ; # randomize return shuffle(\@cds) ; } 1;