Raw content of Bio::EnsEMBL::Analysis::RunnableDB::FindFalseIntrons =pod =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::FindFalseIntrons =head1 SYNOPSIS my $fs = Bio::EnsEMBL::Analysis::RunnableDB::FindFalseIntrons->new( -analysis => $analysis_obj, # ); $fs->fetch_input(); $fs->run(); $fs->output(); $fs->write_output(); =head1 DESCRIPTION This object wraps Bio::EnsEMBL::Analysis::Runnable::OrthologueEvaluator and is used to fetch the input from different databases as well as writing results the results to the database.a Configuration used to run this module is contained in the following files : Bio::EnsEMBL::Analysis::Config::GeneBuild::ExamineGeneSets; Bio::EnsEMBL::Analysis::Config::GeneBuild::OrthologueEvaluator; The analysis FindFalseIntrons uses input-ids of the following format : Coordsystem:Assembly:Seq_region_name:Start:End:Strand chromosome:BROADD2:12:1333:890890:1 chromosome:BROADD2:3:30434100:30529400:1 =head1 CONTACT Post general queries to B<ensembl-dev@ebi.ac.uk> =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a '_' =cut package Bio::EnsEMBL::Analysis::RunnableDB::FindFalseIntrons; use strict; use Bio::EnsEMBL::Analysis::Config::GeneBuild::ExamineGeneSets; use Bio::EnsEMBL::Analysis::Config::GeneBuild::OrthologueEvaluator; use Bio::EnsEMBL::Analysis::Config::GeneBuild::Databases; use Bio::EnsEMBL::Utils::Exception qw(throw warning info); use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild; use Bio::EnsEMBL::Analysis::RunnableDB::ExamineGeneSets; use Bio::EnsEMBL::Registry; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::GeneUtils qw( get_transcript_with_longest_CDS get_one2one_homology_for_gene_in_other_species get_one2one_orth_for_gene_in_other_species ); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::HomologyUtils qw( get_gene_obj_out_of_compara_homology_object ); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw( list_evidence convert_translateable_exons_to_exon_extended_objects ); use vars qw(@ISA); @ISA = qw ( Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator ) ; sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->read_and_check_config(); # $self->verbose(50000) ; $self->orthologues_species( $$FIND_FALSE_INTRONS{ANALYSIS_SETS}{$self->analysis->logic_name} ) ; #print join ("\n" , @{$$FIND_FALSE_INTRONS{ANALYSIS_SETS}{$self->analysis->logic_name}} ) ; return $self; } # inspect all introns of the QUERY_SPECIES of genes with certain # biotypes ( see OrthologueEvaluator.pm config ) sub run { my ($self) = @_; my @genes = @{$self->genes} ; my $cnt =0 ; my %input_ids_to_upload ; my %transcripts_marked_for_deletion; my %intron_attributes; GENES: for my $g ( @genes ) { $cnt++; print "\n\nprocessing gene-nr $cnt / ".scalar(@genes) . " : " . $g->stable_id . "\n" ; print "="x80; print "\n"; # we only use the translation length of the transcript with the longest # CDS - otherwise statistics will be slightly different for genebuilds # where genes have lots of transcripts. we also only count the nr of exons # of the transcript with the longest CDS my ($t_long,$cdsl, $n_exon_max) = get_transcript_with_longest_CDS($g); unless ( $t_long) { print " longest transcript does not exist\n" ; next GENES; } my ( $unique_introns , $i2t ) = get_unique_introns_of_transcripts ( $g ) ; my %unique_introns = %{$unique_introns} ; ; my %i2t = %{$i2t}; # process a list of non-redundant introns of all transcripts seperately and independent from genes / transcripts $self->{_false_introns_alt_tr} = {} ; $self->{_ftr_false_introns} = {} ; $self->{_coding_ex_in_alt_tr_which_aligns_false_intron} = {}; my %intron_is_coding_in_homologues ; for my $intron ( values %unique_introns ) { if ( $intron->length < 150 ) { print "\nINTRON : " . $intron->length . "< 150\n========================================\n"; my $aligned_homolog_trans = $self->exonerate_intron_vs_homologs( $intron, $g, $i2t); if ( scalar( @$aligned_homolog_trans) > 0 ) { # intron aligned against homologues protein so make input_id to recover intron print "INTRON_IS_CODING_IN_HOMOLOGUES : " . join ("," , map { $_->stable_id } @$aligned_homolog_trans ) . "\n\n" ; $intron_is_coding_in_homologues{$intron} = $aligned_homolog_trans ; # recover id is : # chromosome:Btau_3.1:Un:287653469:287755066:1:other_protein:Q5W135.1:203:ENSP00000354519:345781 my @exonerateVSGenwise_ids = @{$self->get_recover_ids( $g,$t_long, $aligned_homolog_trans)} ; # flag the recovered intron for the recovery with RecoverFalseIntrons.pm my @iid = split /\:/, $self->input_id ; my $my_slice = $self->db->get_SliceAdaptor->fetch_by_region(undef,$iid[2]) ; # create simple_feature ( we could also change the method how we align the intron ( i.e. use exonerate # out of a runnableDB and than use the parser as well and store the whole alignemnt in the dna_align_feature table ) my $feature = Bio::EnsEMBL::SimpleFeature->new( -seq_region_start => $intron->seq_region_start , -seq_region_end => $intron->seq_region_end, -start => $intron->seq_region_start , -end => $intron->seq_region_end, -strand=> $intron->seq_region_strand, -analysis => $self->analysis() , -score => 0, -display_label => $intron->prev_Exon->stable_id."_".$intron->next_Exon->stable_id , -slice => $my_slice, ) ; # put input-ids / recovery on hold until we checked against alt transcript if we need to recover or if # FI is coding in alt trans $intron_attributes{$intron}{feature} = $feature ; $intron_attributes{$intron}{input_ids} = $self->get_recover_ids( $g,$t_long, $aligned_homolog_trans) ; # $sfa->store($feature) ; # for my $exVSgen_id ( @exonerateVSGenwise_ids ) { # $input_ids_to_upload{$exVSgen_id.":".$feature->dbID}=1; # $intron_attributes{$intron}{input_id} = $exVSgen_id ; # print "FEATURE : $intron \t $exVSgen_id:".$feature->dbID . "\n" ; # } } else { print "no hit - intron did not align\n" ; } print "\n\n" ; } else { # // length < xxx #print "intron too long \n" ; } } # // unique introns # # NOW we make decision if we want to flag for recovering, or if we flag as 'alternative transcript with introns which are coding in # alt. transcript and in homolog' - which basically mean it's a low quality one ...... # print "Summary:\n=====================\n" ; my %alt_trans_ex_coding; my %false_introns = %{ $self->{_false_introns_alt_tr} } ; for my $k ( keys %false_introns ) { for my $tr ( @{$false_introns{$k}} ) { $alt_trans_ex_coding{$tr->stable_id}++ ; #print $tr->stable_id ."\n"; } } print "\n" ; foreach my $tr_coding_stable_id ( keys %alt_trans_ex_coding ) { print "ALT_CODING_TR : $tr_coding_stable_id\t$alt_trans_ex_coding{$tr_coding_stable_id} introns recovered\n" ; } print "\n" ; # false transcript->stable_id - intron intron intron my %false_transcripts= %{ $self->{_ftr_false_introns} } ; for my $f_tsi ( keys %false_transcripts) { print "FalseTranscript " . $f_tsi ."\t: ".scalar(@{$false_transcripts{$f_tsi}})." possible_false_introns.\n" ; my $nr_false_introns_recovered = 0 ; for my $false_intron ( @{$false_transcripts{$f_tsi }} ) { print "\tFI coding in HOMOLOGUES: ".join ("," , map { $_->stable_id } @{$intron_is_coding_in_homologues{$false_intron}}) . "\n"; print "\tFI coding in alt.Trans : ".join (" ,", map { $_->stable_id } @{${$self->{_false_introns_alt_tr}}{$false_intron}} ) ; my %fi = %{ $self->{_coding_ex_in_alt_tr_which_aligns_false_intron}} ; print "\tFI-start : " . $fi{$false_intron}{false_intron}->seq_region_start . "\t" ; print " FI-end : " . $fi{$false_intron}{false_intron}->seq_region_end . "\t" ; print "\toverlapping_coding_ex_in_alt_tr: ". $fi{$false_intron}{overlapping_coding_exon}->stable_id . "\n\n" ; $nr_false_introns_recovered++; } for my $false_intron ( @{$false_transcripts{$f_tsi }} ) { my $delete_recovery_id = 0 ; if ( (scalar( @{$false_transcripts{$f_tsi }}) == $nr_false_introns_recovered) && (scalar( @{$intron_is_coding_in_homologues{$false_intron}}) == scalar(@{$self->orthologues_species}) ) ) { # all FalseIntrons are coding in alternative transcript as well as in both Orthologues print "marked_for_deletion_alt_coding_and_coding_in_homologues : $f_tsi\n" ; #$transcripts_marked_for_deletion{coding_in_alt_tr}{$f_tsi}=1; #$delete_recovery_id = 1 ; } elsif ( scalar ( @{$false_transcripts{$f_tsi }} ) == $nr_false_introns_recovered ) { # all FalseIntrons are recovered in alt trans print "transcript could be marked for deletion as it's coding in alt. trans : $f_tsi\n" ; #$transcripts_marked_for_deletion{coding_in_alt_tr}{$f_tsi}=1; #$delete_recovery_id = 1 ; } else { foreach my $intron_key ( keys %intron_attributes ) { print "intron_key $intron_key\n" ; my $simple_feature = $intron_attributes{$intron_key}{feature}; print "simple_feature $simple_feature\n" ; my $feature = $intron_attributes{$intron_key}{feature}; my @recover_ids = @{ $intron_attributes{$intron_key}{input_ids} } ; for my $rci ( @recover_ids ) { print " recover_id $rci\n" ; } } } if ( $delete_recovery_id ) { print "deleteing recovery_ids as FalseIntron is coding:\n" ; delete $intron_attributes{$false_intron}{feature}; delete $intron_attributes{$false_intron}{input_ids} ; } } } #my $nr_of_false_introns_in_transcript = keys %false_introns ; #my $nr_coding_ex = keys %{ $self->{_coding_ex_in_alt_tr_which_aligns_false_intron}} ; # # print "FI in trans : $nr_of_false_introns_in_transcript \n" ; # print "CODING EX : $nr_coding_ex\n" ; # # if ( $nr_of_false_introns_in_transcript == $nr_coding_ex ) { # # all false introns are recovered in alt. transcript - gene can be flagged for deletion, # print "Flagged for deletion : " ; # # don't upload input_ids to recover # } # } # // GENES my $sfa = $self->db->get_SimpleFeatureAdaptor() ; # upload SimpleFeatures for FalseIntrons with coordinates for my $false_intron ( keys %intron_attributes ) { my $feature = $intron_attributes{$false_intron}{feature}; if ( $feature ) { $sfa->store($feature) ; for my $exVSgen_id ( @{ $intron_attributes{$false_intron}{input_ids} }){ #print FEATURE : $feature->dbID . "\n" ; $input_ids_to_upload{$exVSgen_id.":".$feature->dbID}=1; #$intron_attributes{$false_intron}{input_id} = $exVSgen_id ; } } } # # upload stable-id's of alt. transcripts which have been flagged for removal # for my $set ( keys %transcripts_marked_for_deletion ) { for my $tsi ( keys %{$transcripts_marked_for_deletion{$set}}) { my $rm_transcript = $self->db->get_TranscriptAdaptor->fetch_by_stable_id($tsi) ; my $feature = Bio::EnsEMBL::SimpleFeature->new( -seq_region_start => $rm_transcript->seq_region_start , -seq_region_end => $rm_transcript->seq_region_end, -start => $rm_transcript->seq_region_start , -end => $rm_transcript->seq_region_end, -strand=> $rm_transcript->seq_region_strand, -analysis => $self->analysis() , -score => 0, -display_label => $rm_transcript->stable_id."_$set", -slice => $rm_transcript->slice, ) ; $sfa->store($feature) ; print $feature->display_id() . " stored for further investigation in SimpleFeature table \n" ; } } # NOW check analysis and upload input_ids # THIS CAN ALL GO INTO WRITE_OUTPUT # check if we got a post-analysis stored in the db otherwise set one up my $analysis_obj = new Bio::EnsEMBL::Pipeline::Analysis( -logic_name => "RecoverFalseIntrons", -program=> "exonerate", -module => "RecoverFalseIntrons", -input_id_type => 'prot_slice_form', ); my $submit_ana = new Bio::EnsEMBL::Pipeline::Analysis( -logic_name => "Submit_RecoverFalseIntrons", -module => "Dummy", -input_id_type => 'prot_slice_form', ); my $pa = $self->pipeline_adaptor($self->db) ; my $aa = $pa->get_AnalysisAdaptor; $aa->store($submit_ana); $aa->store($analysis_obj); # set up rules / conditions for post analysis print "input_ids:\n" ; print join("\n", keys %input_ids_to_upload)."\n\n" ; print "reformatting input_ids as we use the new format:\n" ; my @old_input_ids = keys %input_ids_to_upload; my @new_input_ids = @{reformat_input_ids(\@old_input_ids)} ; print "new_input_ids:\n" ; for my $nid ( @new_input_ids ) { print "new_input_id " . $nid . "\n" ; print "new_input_id_length " . length($nid) . "\n" ; } # $self->upload_input_ids( [keys %input_ids_to_upload] , "RecoverFalseIntrons" , "prot_slice_form") ; $self->upload_input_ids( \@new_input_ids , "Submit_RecoverFalseIntrons" , "prot_slice_form") ; my $goal_analysis = $aa->fetch_by_logic_name("RecoverFalseIntrons") ; my $rule = Bio::EnsEMBL::Pipeline::Rule->new(-goalanalysis => $goal_analysis); # check if rule has already been stored ... my $ra = $self->pipeline_adaptor($self->db)->get_RuleAdaptor(); my $store_rule = 1 ; RULE: foreach my $old_rule( $ra->fetch_all ) { if ( $goal_analysis->logic_name eq $old_rule->goalAnalysis->logic_name ) { $store_rule = 0 ; } } if ( $store_rule ) { $ra->store($rule) ; } Bio::EnsEMBL::Registry->load_all($$MAIN_CONFIG{LOCATION_OF_COMPARA_REGISTRY_FILE},1,1); my @dba = @{Bio::EnsEMBL::Registry->get_all_DBAdaptors()}; for ( @dba ) { $_->disconnect_when_inactive(1); } } sub reformat_input_ids { my ($input_id_aref ) = @_ ; my @new_ids ; my %tmp; for my $id ( @{$input_id_aref} ) { #my ($cs,$asm,$sname,$start,$end,$strand,$query_stable_id,$hom_stable_id,$sf_id) = split /\:/,$id ; my @components = split /\:/,$id ; my $sf_id = pop @components ; my $hom_stable_id = pop @components ; my $key = join ( ":", @components ) ; push @{$tmp{$key}{$hom_stable_id}}, $sf_id ; } for my $region_key ( keys %tmp ) { my %sf_id_index ; print "processing region and transcript : "; print $region_key . "\n" ; my %hom_to_sf_id = %{$tmp{$region_key}} ; for my $stable_id ( keys %hom_to_sf_id ) { print $stable_id . "\t" ; my @sf_ids = @{$hom_to_sf_id{ $stable_id}}; for my $sf_id ( @sf_ids ) { print "$sf_id\t" ; $sf_id_index{$sf_id} = 1; } print "\n" ; } my $sf_db_id_string = ""; my $counter = 0 ; for my $id ( sort keys %sf_id_index ) { $sf_db_id_string .= $id .","; $sf_id_index{$id}=$counter ; print $id . " --> " . $counter . "\n" ; $counter++; } my $hom_string =""; my $sf_string =""; for my $stable_id ( keys %hom_to_sf_id ) { $hom_string .=$stable_id."," ; for my $sf_id ( sort @{$hom_to_sf_id{ $stable_id}} ) { print "\t$sf_id\t" ; $sf_string .= $sf_id_index{$sf_id}. ","; } print "\n"; $sf_string =~s/,$//; $sf_string .= "_"; } $sf_string=~s/_$//; $sf_string=~s/,$//; $sf_db_id_string=~s/,$//; $hom_string =~s/,$//; push @new_ids , $region_key . ":" . $hom_string .":" . $sf_string . ":" . $sf_db_id_string ; } return \@new_ids ; } sub exonerate_intron_vs_homologs { my ($self,$intron_to_check, $gene, $i2t, $transcript_of_intron ) = @_ ; my %intron_aligns_in_x_homologues ; my %i2t = %{$i2t}; my @aligned_homolog_transcripts ; my @homo_genes = @{ $self->get_homologues_genes($gene) } ; if ( scalar(@homo_genes) == 0 ) { print "no homologs found\n" ; } HOMOLOG: for my $homolog_gene ( @homo_genes ) { print $homolog_gene . "\t" . $homolog_gene->stable_id . "\n" ; # build array of coding exons for homologues transcript my @introns_in_homolog_gene ; TRANS: for my $ht ( @{$homolog_gene->get_all_Transcripts } ) { push @introns_in_homolog_gene, @{$ht->get_all_Introns}; } TRANS: for my $homology_trans ( @{$homolog_gene->get_all_Transcripts } ) { my $transcript_of_intron = $i2t{$intron_to_check}; my $intron_id = $transcript_of_intron->stable_id."_".$intron_to_check->prev_Exon->stable_id."_".$intron_to_check->next_Exon->stable_id; my $query_length = exonerate_sequences($intron_to_check, $homology_trans , $intron_id, $transcript_of_intron); if ( $query_length > 0 ) { push @aligned_homolog_transcripts, $homology_trans ; # # check if the intron which aligned to the CDS-exon of one of the homologs is # coding sequence in one of the other alternatively spliced transcripts # of the gene to investigate ? or of the other homologs ? # TRANSCRIPTS: for my $tr ( @{$gene->get_all_Transcripts} ) { my $tr2 = $transcript_of_intron->stable_id ; if ( $tr->stable_id =~m/$tr2/) { next TRANSCRIPTS ; } #my $query_length = exonerate_sequences( $intron_to_check, $tr, $intron_id,$transcript_of_intron ) ; # compare coordinates of introns of one transcript and exons of the alternative transcript : my @exons = @{$tr->get_all_Exons} ; for my $e ( @exons ) { if ( $e->overlaps($intron_to_check)){ print "\nExon in alternative transcript overlaps FalseIntron - Intron will not be recovered.\n"; print "Exon : " . $e->stable_id ." ( " . $tr->stable_id . ") overlaps_intron in $intron_id\n" ; print "Positive: " . $tr->stable_id . " vs " . $transcript_of_intron->stable_id . "\n" ; # check if the false intron is maybe coding in one of the alternative transcripts $self->intron_aligns_alternative_transcript($intron_to_check, $transcript_of_intron, $e, $tr, $homology_trans ) ; } } } # OK now identify the intron -vs- coding-exon-in-homolog pair ... # get all coding exons of homolog my %hom_coding_alinging_exons = %{ exonerate_intron_vs_exons($intron_to_check, $homology_trans, $intron_id, $transcript_of_intron) } ; print "exonerating done \n" ; for my $exon_stable_id ( keys %hom_coding_alinging_exons ) { my ( $exon, $query_length ) = @{$hom_coding_alinging_exons{$exon_stable_id}} ; for my $hom_trans_intron ( @introns_in_homolog_gene ) { if ( $hom_trans_intron->overlaps($exon) ) { print "\nXXXXXXXX\nIntron overlaps " . $exon->stable_id . " - better not recover this as it's \nXXXXXXXXX\n" ; print $hom_trans_intron->prev_Exon->stable_id . " " . $hom_trans_intron->next_Exon->stable_id . " " . $exon->stable_id . "\n" ; print "\nXXXXXXXXXXXXXX\n" ; } } } } } # // foreach transcript } # // foreach homolog return (\@aligned_homolog_transcripts) ; } # report sub intron_aligns_alternative_transcript { my ($self, $false_intron, $false_transcript, $overlapping_exon , $alt_transcript, $homology_trans ) = @_ ; # Report for Gene X : # - has xx 1:1 homologues with yy alt. spliced transcripts # - has xx false-introns # - false-introns are recovered in tr1, tr2 # - are All false-introns recovered in one transcript ? # - nr of transcripts with false introns $transcript-> @{introns} # - ${intron} -> recovered in false Intron ${$self->{_coding_ex_in_alt_tr_which_aligns_false_intron}}{$false_intron}{false_intron}= $false_intron; ${$self->{_coding_ex_in_alt_tr_which_aligns_false_intron}}{$false_intron}{overlapping_coding_exon}= $overlapping_exon; push @{${ $self->{_false_introns_alt_tr}}{$false_intron}}, $alt_transcript; push @{${ $self->{_intron_coding_in_hom}}{$false_intron}}, $homology_trans; # my %tmp = %{$self->_false_introns_alt_tr} ; # print join ( "\n" , keys %tmp ) ; # print $tmp{$false_intron} ; push @{ ${$self->{_ftr_false_introns}} {$false_transcript->stable_id} }, $false_intron ; } sub exonerate_sequences { my ($intron_to_check , $target, $intron_id, $transcript_of_intron,$version) = @_ ; unless ( $intron_id) { $intron_id = $intron_to_check->display_id ; } my $out_dir = "/tmp" ; my $f1= $out_dir."/".$intron_id.".query" ; # intron seq ( short seq ) my $f2= $out_dir."/".$target->stable_id .".target"; # gene seq/ genome seq my $f3= $out_dir."/".$target->stable_id .".exonerate"; open (FH,">$f1") || die "Cant write file $f1\n" ; # this was set before in one of the subroutines # may need to worry if this gets redundant / if it's unset ....... # write INTRON to file my $seq = $intron_to_check->seq; $seq=~ s/(.{1,60})/$1\n/g; print FH ">Intron $intron_id\n" . $seq; close(FH) ; # write TRANSCRIPT to file open (FH,">$f2") || die "Cant write file $f2\n" ; #($seq=$target->seq->seq )=~ s/(.{1,60})/$1\n/g; ($seq=$target->translateable_seq )=~ s/(.{1,60})/$1\n/g; print FH ">".$target->stable_id . "\n$seq"; close(FH) ; my @versions = qw ( 1.0.0 ) ; my @options = ( " " , " --exhaustive " ) ; my $query_length =0 ; my $cmd ; for my $version ( @versions ) { for my $extra_opt ( @options ) { my $exonerate_version ; if ( $version =~m/0.7.1/ || $version=~m/1.0.0/) { $exonerate_version = "/usr/local/ensembl/bin/exonerate-$version" ; } else { $exonerate_version = "/software/ensembl/bin/exonerate-1.4.0 " ; } my @alignment ; $cmd = $exonerate_version . " -q $f1 -t $f2 -m affine:local --bestn 1 "; $cmd .= $extra_opt ; $cmd.=" > $f3"; system("$cmd") ; my $hit = 0 ; my $raw_score ; open (F,"$f3") || die " can't read $f3\n" ; foreach my $l (<F>){ $hit = 1 if ($l=~/C4 Alignment:/) ; #$hit = 0 if $l=~/vulgar/; push @alignment , $l if $hit ; if ( $l=~m/Query range:/){ my @it = split/\s+/,$l ; if ( $it[5]>$it[3]) { $query_length = ($it[5] - $it[3])+1 ; } else { $query_length = ($it[3] - $it[5])+1 ; } } } close(F); # print info if ( $hit ) { #print "\n$f1 \n $f2 \n $f3\n" ; print "hit_with $version $extra_opt \n$cmd" ; print "\nIntron_exonerate_alignment_OK : " . $transcript_of_intron->stable_id." [ ". $intron_to_check->prev_Exon->stable_id." <intron> " .$intron_to_check->next_Exon->stable_id . "] ALIGNS " . $target->stable_id ." ] QLEN $query_length iLEN " . $intron_to_check->length."\n\n" ; print $intron_id . "aligns perfect \n" if ( $intron_to_check->length == $query_length ); #if ( $query_length % 3 == 0 ){ # print " alignment_length_mod_3\n" ; #}else{ # print " alignment_frameshift\n"; #} #print "\n\n".join("",@alignment) ; system("rm $f1"); system("rm $f2"); system("rm $f3"); return $query_length ; } else { # print "no_hit \n" ; } # jhvx } # next version } # next option #system("rm $f1"); system("rm $f2"); system("rm $f3"); return $query_length ; } sub exonerate_intron_vs_exons { my ($intron_to_check , $homology_trans, $intron_id, $transcript_of_intron ) = @_ ; print "INTRON_EXONERATING to identify CODING exons in homolog.....................\n"; unless ( $intron_id) { $intron_id = $intron_to_check->display_id ; } my $out_dir = "/tmp"; my $f1= $out_dir."/".$intron_id.".query" ; # intron seq ( short seq ) my @alignment ; # write INTRON seq to file open (FH,">$f1") || die "Cant write file $f1\n" ; my $seq = $intron_to_check->seq; $seq=~ s/(.{1,60})/$1\n/g; print FH ">Intron $intron_id\n" . $seq; close(FH) ; my %aligning_hom_exons ; my @homology_exons = @{$homology_trans->get_all_translateable_Exons}; for my $hom_exon ( @homology_exons ) { my $f2= $out_dir."/".$hom_exon->stable_id .".homology_trans"; # gene seq/ genome seq my $f3= $out_dir."/".$hom_exon->stable_id .".exonerate"; # write HOMOLOGS CODING EXON seq to file open (FH,">$f2") || die "Cant write file $f2\n" ; ($seq=$hom_exon->seq->seq )=~ s/(.{1,60})/$1\n/g; print FH ">".$hom_exon->stable_id . "\n$seq"; close(FH) ; # used for first comparision my $exonerate_version ="/usr/local/ensembl/bin/exonerate-1.0.0" ; my $cmd = "$exonerate_version -q $f1 -t $f2 -m affine:local --bestn 1 "; $cmd .=" > $f3"; system("$cmd") ; my $hit = 0 ; my $query_length =0 ; my $raw_score ; open (F,"$f3") || die " can't read $f3\n" ; foreach my $l (<F>){ if ($l=~/C4 Alignment:/) { $hit = 1; $l = "INTRON vs CDS-EXON-HOMOLOG " . $l ; print "\nINTRON vs CDS-EXON-HOMOLOG\n"; } push @alignment , $l if $hit ; if ( $l=~m/Query range:/){ my @it = split/\s+/,$l ; if ( $it[5]>$it[3]) { $query_length = ($it[5] - $it[3])+1 ; } else { $query_length = ($it[3] - $it[5])+1 ; } } } close(F); if ( $hit ) { print "\ntest-intron aligns vs coding exon : " . $transcript_of_intron->stable_id. " [ ". $intron_to_check->prev_Exon->stable_id." <intron> " .$intron_to_check->next_Exon->stable_id . "] ALIGNS " . $hom_exon->stable_id ." ] QLEN $query_length iLEN " . $intron_to_check->length."\n\n" ; #print "\n\n".join("",@alignment) ; $aligning_hom_exons{$hom_exon->stable_id} = [ $hom_exon , $query_length] ; }else { print "intron does not match exon " . $hom_exon->stable_id . "\n" ; } system("rm $f2"); system("rm $f3"); } system("rm $f1"); return \%aligning_hom_exons ; } sub get_unique_introns_of_transcripts { my ( $g ) = @_; my (%unique_introns, %i2t) ; for my $tr ( @{$g->get_all_Transcripts}){ my @tr_exons = @{ $tr->get_all_translateable_Exons}; # not sure if this is needed for exons but it's definitely needed for introns ! if ( $tr->seq_region_strand == 1) { @tr_exons = sort { $a->seq_region_start <=> $b->seq_region_start } @tr_exons ; } else { @tr_exons = sort { $b->seq_region_start <=> $a->seq_region_start } @tr_exons ; } my $intron_count = 0 ; foreach ( my $i=0; $i < scalar(@tr_exons) ; $i++) { if ( $i+1 < scalar(@tr_exons) ){ $intron_count++; my $intron = new Bio::EnsEMBL::Intron( $tr_exons[$i], $tr_exons[$i+1] ) ; $i2t{$intron} = $tr; # push @{ $i2t{$intron} } , $tr; my $intron_hkey = $intron->slice->name."-".$intron->seq_region_start."-". $intron->seq_region_end."-".$intron->seq_region_strand; unless ( exists $unique_introns{$intron_hkey}) { $unique_introns{$intron_hkey}=$intron ; } } # all (nr_exons - 1 ) introns processed } # all exons } # // transcripts return ( \%unique_introns, \%i2t) ; } sub get_recover_ids { my ($self, $gene, $t_long, $aref ) = @_ ; my $g_sid = $gene->stable_id ? $gene->stable_id : "NO_ID_".$gene->dbID ; my %recover_ids ; my ($start, $end , $chr ) = get_coords($gene) ; #print " coord_system of gene : " . $gene->slice->coord_system_name()."\n" ; #print " coord_system of gene : " . $gene->slice->coord_system()."\n" ; #print " coord_system of gene : " . $gene->slice->coord_system->version()."\n" ; #print " coord_system of gene : " . $gene->slice->coord_system->name()."\n" ; my $cs = $gene->slice->coord_system->name(); my $csv = $gene->slice->coord_system->version(); unless ( $csv ) { warning("Error: Could not get coord_system_version for coord_system with name : ".$cs ."\n"); } for my $homolog ( @$aref ) { $recover_ids{"$cs:$csv:$chr:$start:$end:1:".$t_long->stable_id.":".$homolog->stable_id}=1; } return [keys %recover_ids] ; } sub get_homologues_genes { my ($self, $gene) = @_ ; #my @specs_to_check = ( $self->first_species, $self->second_species ) ; my @specs_to_check = @{$$FIND_FALSE_INTRONS{ANALYSIS_SETS}{$self->analysis->logic_name}} ; my @homologs; # SPECIES: foreach my $spec ( @specs_to_check ) { # my $homology = get_one2one_homology_for_gene_in_other_species( $gene ,$spec ) ; # # if ( $homology ) { # my $homol_gene = get_gene_obj_out_of_compara_homology_object($homology, $spec) ; # # if ( $homol_gene ) { # if ( $homol_gene->biotype=~m/protein_coding/ ) { # push @homologs , $homol_gene ; # } # } # } # } # for ( @homologs ) { # print "METHOD_1 identified homologs : " . $_->stable_id . "\t". $_ . "\n" ; # } my @hg_2 ; SPECIES: foreach my $other_species ( @specs_to_check ) { print "getting homolog for $other_species\n" ; my $hg = get_one2one_orth_for_gene_in_other_species($gene, $other_species) ; if ( $hg ) { if ( $hg->biotype=~m/protein_coding/ ) { push @hg_2 , $hg; } } } return \@hg_2; } sub read_and_check_config { (my $self) = @_ ; # # Check the compara config file in Config/OrthologueEvaluator; # throw("Your compara-registry-file LOCATION_OF_COMPARA_REGISTRY_FILE does not exist !!". "\nCheck your config !") unless ( -e $$MAIN_CONFIG{LOCATION_OF_COMPARA_REGISTRY_FILE} ) ; print STDERR "reading compara-config file : $$MAIN_CONFIG{LOCATION_OF_COMPARA_REGISTRY_FILE}\n"; # check compara configuration file and schema-versions of dbs Bio::EnsEMBL::Registry->load_all($$MAIN_CONFIG{LOCATION_OF_COMPARA_REGISTRY_FILE},1,1); my @dba = @{Bio::EnsEMBL::Registry->get_all_DBAdaptors()}; my %tmp; for my $a ( @dba ) { push @{$tmp{ $a->get_MetaContainer->get_schema_version }} , $a->dbname ; } if ( keys %tmp > 1 ) { warning("You're using databases with different schemas - this can cause problems ...\n" ) ; for ( keys %tmp ) { print "schema $_:\n". join("\n" , @{$tmp{$_}} ) . "\n\n" ; } } for my $a ( @dba ) { $a->disconnect_when_inactive(1); } } # work error !!!! # # substr outside of string at # # /lustre/work1/ensembl/jhv/project_genestructure_comparison/cvs/ensembl_42/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm line 264. # sub get_coords{ my ($gene) = @_ ; my ($start, $end, $chr ) ; if ( ($gene->seq_region_start - 50000 ) < 0 ) { $start = $gene->slice->start ; } else { $start = $gene->seq_region_start - 50000 ; } if (($gene->seq_region_start + 50000 ) > $gene->slice->end) { $end = $gene->slice->length ; } else { $end = $gene->seq_region_end + 50000 ; } $chr = $gene->slice->seq_region_name ; return ( $start, $end, $chr ) ; } sub fetch_input { my( $self ) = @_; print "\n\nChecking introns for the species configured in ". " Config/GeneBuild/OrthologEvaluator.pm : $$MAIN_CONFIG{QUERY_SPECIES}\n" ; $self->cmp_species( $$MAIN_CONFIG{QUERY_SPECIES} ); print "getting adaptor from registry ... " . $self->cmp_species . "\n" ; my $ga = Bio::EnsEMBL::Registry->get_adaptor($self->cmp_species,"core","gene"); unless ( $ga ) { print " I can't get a gene-adaptor for " . $self->cmp_species . "\n" . " Maybe you forgot to add the alias for this to your compara registry file\n" ; exit(0); } print "==> Fetching gene from db : " . $ga->db->dbname . " \@ " . $ga->db->host . " : ". $ga->db->port ."\n" ; my $sa = Bio::EnsEMBL::Registry->get_adaptor($self->cmp_species,"core","slice"); throw("Can't get GeneAdaptor from Registry") unless $ga ; my @array = split(/:/,$self->input_id) ; my $slice ; my @genes_for_stats; # we can use any input_id to investigate all genes in genome, or we can # use a slice-name like 'chromosome:NCBI34:X:100:200' to just inspect a slice my @limit_to_biotypes = @{$$FIND_FALSE_INTRONS{DEFAULT_GENE_BIOTYPES}}; if(scalar(@array) < 3 || scalar(@array) > 6) { # process all genes print "Fetching all genes in genome - you can also supply a slice name to limit to a certain slice ...\n" ; if ( @limit_to_biotypes ) { print "Limiting analysis to biotypes : "; for ( @limit_to_biotypes ){ print "$_\t" ; push @genes_for_stats, @{$ga->fetch_all_by_biotype($_)} ; } print "\n"; }else{ @genes_for_stats = @{$ga->fetch_all}; } }else { # only process a certain slice my $slice = $sa->fetch_by_name ( $self->input_id) ; print "==> Getting genes on slice ".$self->input_id . "\n" ; if ( @limit_to_biotypes ){ print "==> Limiting analysis to biotypes :\n"; for my $bt ( @limit_to_biotypes){ print "\t- $bt\n" ; push @genes_for_stats, @{$slice->get_all_Genes_by_type($bt)} ; } print "\n\n"; }else{ print "==> using all biotypes \n" ; push @genes_for_stats, @{$slice->get_all_Genes}; } } # this code is just for test purposes to run on small sets my $short = 0 ; if ( $short ) { print "SHORT==1 ONLY RUNNING ON SUBSET !!!!!!!!!!!!!\n"; @genes_for_stats = splice(@genes_for_stats,0,10); } print "==> have ".scalar(@genes_for_stats)." genes fetched for ".$self->cmp_species()."\n"; $self->genes(\@genes_for_stats) ; } use vars '$AUTOLOAD'; sub AUTOLOAD { my ($self,$val) = @_; (my $routine_name=$AUTOLOAD)=~s/.*:://; #trim package name $self->{$routine_name}=$val if $val ; return $self->{$routine_name} ; } sub DESTROY {} # required due to AUTOLOAD 1;