Raw content of Bio::EnsEMBL::Analysis::RunnableDB::PSILC # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::PSILC; =head1 SYNOPSIS my $runnabledb = Bio::EnsEMBL::Analysis::RunnableDB::PSILC->new ( -db => $db_adaptor, -input_id => flag start:flag end, -analysis => $analysis, ); $runnabledb->fetch_input(); $runnabledb->run(); my @array = @{$runnabledb->output}; $runnabledb->write_output(); =head1 DESCRIPTION Prepares and runs PSILC. PSILC is a pseudogene predicting program that compares DNA alignments of transcripts from the organism of interest and homologous sequences from 3 other organisms. It uses regions of conservation within the seqence alignment defined by PFAM domains to make its predictions. It uses flagged seqences as input ids that were identified for further analysis by Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB.pm or Bio::EnsEMBL::Analysis::RunnableDB::Spliced_elsewhere. Takes start and end flag ids as input, fetches all flags that lie within the specified range that are associated with the psilc analysis, retrieves the associated gene objects. Identifies all the transcripts in the set that contain PFAM domains (requires protein annotationto be run before the pseudogene tests) Runs a BLASTP search to identify orthologs from the 3 comparison databases, then clusters the results using clustalw and passed the alignment to PSILC to run over. Parses the resuls and modifies gene objects to make them into pseudogenes if required and then writes the genes to the final database. There is a script ensembl-personal/sw4/Scripts/Pseudogenes/prepare_PSILC.pl that prepares the module for pipelining, makes input ids etc. A second script: ensembl-personal/sw4/Scripts/Pseudogenes/make_PSILC_bsubs.pl which will prepare the blast databases to use from the 3 species. The module is configured through: Bio::EnsEMBL::Analysis::Config::Pseudogene.pm =head1 CONTACT Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk =cut package Bio::EnsEMBL::Analysis::RunnableDB::PSILC; use strict; use warnings; use Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB; use Bio::EnsEMBL::Analysis::Runnable::PSILC_BlastP; use Bio::EnsEMBL::Analysis::Config::Pseudogene; use Bio::EnsEMBL::Analysis::Runnable::PSILC; use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild; use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild); my $runnable; =head2 fetch_input Arg [1] : none Description: Opens and stores connections to the 3 ortholog databases and the genebuild databases, parses the flag input id and fetches the genes to run PSILC on. Determines which transcripts contain PFAM domains and stores the domains internally in a hash that ties the transcript id to the PFAM id Returntype : none Exceptions : throws if the input ids are not recognised. Caller : general =cut sub fetch_input{ my ($self)=@_; # open the dbs, get the adaptors and store them my ($start, $end); my $count=0; my @genes; my $subjectdb = new Bio::EnsEMBL::DBSQL::DBAdaptor ( '-host' => $PSILC_SUBJECT_DBHOST, '-user' => 'ensro', '-dbname' => $PSILC_SUBJECT_DBNAME, '-port' => $PSILC_SUBJECT_DBPORT, ); my $orth1db = new Bio::EnsEMBL::DBSQL::DBAdaptor ( '-host' => $PSILC_ORTH1_DBHOST, '-user' => 'ensro', '-dbname' => $PSILC_ORTH1_DBNAME, '-port' => $PSILC_ORTH1_DBPORT, ); my $orth2db = new Bio::EnsEMBL::DBSQL::DBAdaptor ( '-host' => $PSILC_ORTH2_DBHOST, '-user' => 'ensro', '-dbname' => $PSILC_ORTH2_DBNAME, '-port' => $PSILC_ORTH2_DBPORT, ); $self->species_db($SUBJECT,$subjectdb); $self->species_db($ORTH1,$orth1db); $self->species_db($ORTH2,$orth2db); #store repeat db internally my $dna_db = $self->get_dbadaptor($DNA_DBNAME) ; $self->rep_db($dna_db); #genes come from final genebuild database my $genes_db = $self->get_dbadaptor("GENEBUILD_DB"); $self->gene_db($genes_db); my $ga = $genes_db->get_GeneAdaptor; my $fa = Bio::EnsEMBL::Pipeline::DBSQL::FlagAdaptor->new($self->db); my $ids = $fa->fetch_by_analysis($self->analysis); $self->throw("No flags found for analysis $PSILC_LOGIC_NAME\n") unless (scalar(@{$ids}>0)); if ($self->input_id =~ /(\d+):(\d+)/) { $start = $1; $end = $2; } else { $self->throw("Input id not recognised\n"); } # get ids foreach my $flag (@{$ids}) { my $gene; if ($flag->dbID >= $start && $flag->dbID <= $end) { $count++; if ($flag->ensembl_id =~ /\w+/){ $gene = $ga->fetch_by_stable_id($flag->ensembl_id); } else { $gene = $ga->fetch_by_dbID($flag->ensembl_id); } push @genes, $self->lazy_load($gene); } } $self->genes(\@genes); # get all the genes out of your core database with pfam domain hits # If $REP_TRANSCRIPT flag is set only select one transcript per gene # the longest translation that has a pfam domain my $pfam_transcripts =0; my %domains; GENE: foreach my $gene (@genes) { my %already_seen; my @rep_transcripts; my $domain; TRANS: foreach my $trans (@{$gene->get_all_Transcripts}) { my %already_seen; if ($trans->translation) { my $pep = $trans->translation; #If the transcript has a pfam domain store translation object foreach my $pf (@{$pep->get_all_ProteinFeatures}) { if ($pf->analysis->logic_name eq 'Pfam') { # remove the transcript from the array unless it has a pfam domain push @rep_transcripts,$trans; next TRANS; } } } } next GENE unless (scalar (@rep_transcripts > 0)); if (scalar(@rep_transcripts > 1) && $REP_TRANSCRIPT){ # arrange transcripts by length so you can pick the longest as rep if needed @rep_transcripts = sort {$a->length <=> $b->length} @rep_transcripts; my @temp = pop @rep_transcripts; print $temp[0]->stable_id."\n"; @rep_transcripts = @temp; } # othewise store domains found in the rep transcript foreach my $rep_transcript(@rep_transcripts){ foreach my $pf (@{$rep_transcript->translation->get_all_ProteinFeatures}) { unless ($already_seen{$pf->hseqname}) { if ($pf->analysis->logic_name eq 'Pfam') { my $domain_id = $pf->hseqname; $domain_id =~ s/\.\d+//; $domain .= "$domain_id\t$domain_id\n"; $already_seen{$pf->hseqname} =1; } } } $pfam_transcripts++; $domains{$rep_transcript->dbID}=$domain; } } print STDERR "Found $pfam_transcripts Pfam transcripts out of ".scalar(@genes)." genes \n"; $self->domains(\%domains); return 1; } =head2 run Arg [1] : none Description: Runs a BLASTP search on each transcript with a PFAM domain, then clusters the sequences using clustalw follwed by running PSILC Returntype : none Exceptions : Caller : general =cut sub run{ my ($self) = @_; my $ga = $self->gene_db->get_GeneAdaptor; $self->make_dir; my @genes = @{$self->genes}; my $pfam_transcript; my %domains = %{$self->domains}; PFAM: foreach my $gene(@genes){ my $pseudo_trans = 0; my $pfam_trans = 0; TRANSCRIPT: foreach my $transcript (@{$gene->get_all_Transcripts}){ my $PSILC_results; if ($domains{$transcript->dbID}){ $pfam_transcript = $transcript; $pfam_trans++; } else { next TRANSCRIPT; } if ($pfam_transcript->stable_id) { print STDERR "Running analysis for ".$pfam_transcript->stable_id."\n"; } else { print STDERR "Running analysis for ".$pfam_transcript->dbID."\n"; } # run blast comparison against the three databases my $blast_results = $self->run_blast($pfam_transcript); # fetch the sequences of the homologs identified in blast my @homologs = @{$self->fetch_trans($blast_results)}; if (scalar(@homologs) >= 2) { $PSILC_results = $self->run_PSILC($pfam_transcript,\@homologs); } unless ($PSILC_results) { print STDERR "Not enough sequences to continue for transcript ".$pfam_transcript->dbID; print STDERR " could only find ".scalar(@homologs)."\n"; $self->real_genes($gene); next PFAM; } if ($PSILC_results->{'postNMax'} > 0.95){ $pseudo_trans++; } } if ($pfam_trans > 0 && $pseudo_trans == $pfam_trans){ $self->pseudo_genes($gene) } else { $self->real_genes($gene); } } $self->output($self->pseudo_genes); $self->output($self->real_genes); return 1; } =head2 fetch_trans Arg [1] : HASH ref Description: fetches transcripts from the 3 ortholog databases, ensures sequences are non-redundant and limits the transcripts to a pre set number of sequences from each species Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor Exceptions : throws unless passed a Bio::EnsEMBL::DBSQL::DBAdaptor, throws if the species name is not recognised Caller : general =cut sub fetch_trans{ my ($self,$blast_results) =@_; my %transcript_hash; my $transcript_count; my @homologs; my %already_seen; SPECIES: foreach my $species (keys %{$blast_results}){ my $ta = $self->species_db($species)->get_TranscriptAdaptor; print "Fetching $species transcripts\n"; $transcript_count=0; TRANS: foreach my $trans_id (@{$blast_results->{$species}}){; next TRANS if ($already_seen{$trans_id}); $transcript_count++; next SPECIES unless ($transcript_count <= $PS_SPECIES_LIMIT); my $transcript = $ta->fetch_by_translation_stable_id($trans_id); $already_seen{$trans_id} = 1; unless ($transcript && $transcript->isa("Bio::EnsEMBL::Transcript")){ $self->warn("Cannot find transcript $trans_id for species ".$transcript_hash{$trans_id}." I get a $transcript\n$@\n"); } next TRANS unless ($transcript->translateable_seq); push @homologs,$transcript; } } return \@homologs; } =head2 run_blast Arg [1] : Bio::EnsEMBL::Transcript Description: runs the BLASTP runnable Returntype : Hash reference Exceptions : none Caller : general =cut sub run_blast{ my ($self,$pfam_transcript) = @_; my $blast = Bio::EnsEMBL::Analysis::Runnable::PSILC_BlastP->new ( '-trans' => $pfam_transcript, '-analysis' => $self->analysis, ); $blast->run; return $blast->output; } =head2 species_db Arg [1] : Bio::EnsEMBL::Transcript, Array contining homolog transcripts Description: runs the PSILC runnable Returntype : Hash reference Exceptions : none Caller : general =cut sub run_PSILC{ my ($self,$pfam_transcript,$homologs) = @_; my %domains = %{$self->domains}; # Run PSILC analysis my $PSILC = Bio::EnsEMBL::Analysis::Runnable::PSILC->new ( '-trans' => $pfam_transcript, '-homologs' => $homologs, '-analysis' => $self->analysis, '-domain' => \$domains{$pfam_transcript->dbID}, '-input_id' => \$self->input_id, ); eval{ $PSILC->run; }; if ($@){ $self->warn("PSILC FAILED TO RUN $@\n"); return 0; } return $PSILC->output; } =head2 species_db Arg [1] : scalar species name, Bio::EnsEMBL::DBSQL::DBAdaptor Description: get/set db adaptors for the three species chosen Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor Exceptions : throws unless passed a Bio::EnsEMBL::DBSQL::DBAdaptor, throws if the species name is not recognised Caller : general =cut sub species_db{ my ($self,$species,$db)= @_; if ($db){ unless ($db->isa("Bio::EnsEMBL::DBSQL::DBAdaptor")){ $self->throw("$species object is not a Bio::EnsEMBL::DBSQL::DBAdaptor, it is a $db$@\n"); } if ($species eq $SUBJECT){ $self->{'_db_subject'} = $db; } if ($species eq $ORTH1){ $self->{'_db_orth1'} = $db; } if ($species eq $ORTH2){ $self->{'_db_orth2'} = $db; } } if ($species eq $SUBJECT){ return $self->{'_db_subject'}; } if ($species eq $ORTH1){ return $self->{'_db_orth1'}; } if ($species eq $ORTH2){ return $self->{'_db_orth2'}; } $self->throw("Species not recognised : $species\n"); return; } =head2 make_dir Arg [1] : none Description: creates output directory for storing PSILC results Returntype : none Exceptions : none Caller : general =cut sub make_dir{ my ($self) = @_; my $input_id = $self->input_id; if ($PSILC_WORK_DIR){ system("mkdir $PSILC_WORK_DIR/$input_id"); } else{ $self->throw("Cannot make output directory\n"); } return 1; } ##################################################################### # Containers =head2 domains Arg [1] : Hash Description: get/set for hash tying pfam domain identifiers to transcript dbID Returntype : Hash Exceptions : none Caller : general =cut sub domains { my ($self, $domains) = @_; if ($domains) { $self->{'_domains'} = $domains; } return $self->{'_domains'}; } =head2 transcripts Arg [1] : Bio::EnsEMBL::Transcript Description: get/set for transcripts Returntype : Bio::EnsEMBL::Transcript Exceptions : none Caller : general =cut sub transcripts { my ($self, $transcript) = @_; if ($transcript) { push @{$self->{'_transcripts'}}, $transcript; } return $self->{'_transcripts'}; } 1;