Bio::EnsEMBL::Analysis::RunnableDB PSILC
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Analysis::RunnableDB::PSILC;
Package variables
Privates (from "my" definitions)
$runnable;
Included modules
Bio::EnsEMBL::Analysis::Config::Pseudogene
Bio::EnsEMBL::Analysis::Runnable::PSILC
Bio::EnsEMBL::Analysis::Runnable::PSILC_BlastP
Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild
Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB
Inherit
Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB
Synopsis
my $runnabledb = Bio::EnsEMBL::Analysis::RunnableDB::PSILC->new
(
-db => $db_adaptor,
-input_id => flag start:flag end,
-analysis => $analysis,
);
$runnabledb->fetch_input();
$runnabledb->run();
my @array = @{$runnabledb->output};
$runnabledb->write_output();
Description
Prepares and runs PSILC.
PSILC is a pseudogene predicting program that compares DNA alignments of
transcripts from the organism of interest and homologous sequences from 3
other organisms. It uses regions of conservation within the seqence alignment
defined by PFAM domains to make its predictions.
It uses flagged seqences as input ids that were identified for further
analysis by Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB.pm or
Bio::EnsEMBL::Analysis::RunnableDB::Spliced_elsewhere.
Takes start and end flag ids as input, fetches all flags that lie within the
specified range that are associated with the psilc analysis, retrieves the
associated gene objects. Identifies all the transcripts in the set that contain
PFAM domains (requires protein annotationto be run before the pseudogene tests)
Runs a BLASTP search to identify orthologs from the 3 comparison databases,
then clusters the results using clustalw and passed the alignment to PSILC to
run over.
Parses the resuls and modifies gene objects to make them into pseudogenes if
required and then writes the genes to the final database.
There is a script ensembl-personal/sw4/Scripts/Pseudogenes/prepare_PSILC.pl
that prepares the module for pipelining, makes input ids etc.
A second script:
ensembl-personal/sw4/Scripts/Pseudogenes/make_PSILC_bsubs.pl
which will prepare the blast databases to use from the 3 species.
The module is configured through:
Bio::EnsEMBL::Analysis::Config::Pseudogene.pm
Methods
domainsDescriptionCode
fetch_inputDescriptionCode
fetch_transDescriptionCode
make_dirDescriptionCode
runDescriptionCode
run_PSILC
No description
Code
run_blastDescriptionCode
species_dbDescriptionCode
transcriptsDescriptionCode
Methods description
domainscode    nextTop
  Arg [1]    : Hash
Description: get/set for hash tying pfam domain identifiers to transcript dbID
Returntype : Hash
Exceptions : none
Caller : general
fetch_inputcodeprevnextTop
  Arg [1]    : none
Description: Opens and stores connections to the 3 ortholog databases and the genebuild databases, parses the
flag input id and fetches the genes to run PSILC on. Determines which transcripts contain
PFAM domains and stores the domains internally in a hash that ties the transcript id to the
PFAM id
Returntype : none
Exceptions : throws if the input ids are not recognised.
Caller : general
fetch_transcodeprevnextTop
  Arg [1]    : HASH ref
Description: fetches transcripts from the 3 ortholog databases, ensures sequences
are non-redundant and limits the transcripts to a pre set number of sequences from each species
Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
Exceptions : throws unless passed a Bio::EnsEMBL::DBSQL::DBAdaptor, throws if the species name is not recognised
Caller : general
make_dircodeprevnextTop
  Arg [1]    : none
Description: creates output directory for storing PSILC results
Returntype : none
Exceptions : none
Caller : general
run codeprevnextTop
  Arg [1]    : none
Description: Runs a BLASTP search on each transcript with a PFAM domain, then clusters
the sequences using clustalw follwed by running PSILC
Returntype : none
Exceptions :
Caller : general
run_blastcodeprevnextTop
  Arg [1]    : Bio::EnsEMBL::Transcript
Description: runs the BLASTP runnable
Returntype : Hash reference
Exceptions : none
Caller : general
species_db(2)codeprevnextTop
  Arg [1]    : scalar species name, Bio::EnsEMBL::DBSQL::DBAdaptor
Description: get/set db adaptors for the three species chosen
Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
Exceptions : throws unless passed a Bio::EnsEMBL::DBSQL::DBAdaptor, throws if the species name is not recognised
Caller : general
transcriptscodeprevnextTop
  Arg [1]    : Bio::EnsEMBL::Transcript
Description: get/set for transcripts
Returntype : Bio::EnsEMBL::Transcript
Exceptions : none
Caller : general
Methods code
domainsdescriptionprevnextTop
sub domains {
  my ($self, $domains) = @_;
  if ($domains) {
    $self->{'_domains'} = $domains;
  }
  return $self->{'_domains'};
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my ($self)=@_;
  # open the dbs, get the adaptors and store them
my ($start, $end); my $count=0; my @genes; my $subjectdb = new Bio::EnsEMBL::DBSQL::DBAdaptor ( '-host' => $PSILC_SUBJECT_DBHOST, '-user' => 'ensro', '-dbname' => $PSILC_SUBJECT_DBNAME, '-port' => $PSILC_SUBJECT_DBPORT, ); my $orth1db = new Bio::EnsEMBL::DBSQL::DBAdaptor ( '-host' => $PSILC_ORTH1_DBHOST, '-user' => 'ensro', '-dbname' => $PSILC_ORTH1_DBNAME, '-port' => $PSILC_ORTH1_DBPORT, ); my $orth2db = new Bio::EnsEMBL::DBSQL::DBAdaptor ( '-host' => $PSILC_ORTH2_DBHOST, '-user' => 'ensro', '-dbname' => $PSILC_ORTH2_DBNAME, '-port' => $PSILC_ORTH2_DBPORT, ); $self->species_db($SUBJECT,$subjectdb); $self->species_db($ORTH1,$orth1db); $self->species_db($ORTH2,$orth2db); #store repeat db internally
my $dna_db = $self->get_dbadaptor($DNA_DBNAME) ; $self->rep_db($dna_db); #genes come from final genebuild database
my $genes_db = $self->get_dbadaptor("GENEBUILD_DB"); $self->gene_db($genes_db); my $ga = $genes_db->get_GeneAdaptor; my $fa = Bio::EnsEMBL::Pipeline::DBSQL::FlagAdaptor->new($self->db); my $ids = $fa->fetch_by_analysis($self->analysis); $self->throw("No flags found for analysis $PSILC_LOGIC_NAME\n") unless (scalar(@{$ids}>0)); if ($self->input_id =~ /(\d+):(\d+)/) { $start = $1; $end = $2; } else { $self->throw("Input id not recognised\n"); } # get ids
foreach my $flag (@{$ids}) { my $gene; if ($flag->dbID >= $start && $flag->dbID <= $end) { $count++; if ($flag->ensembl_id =~ /\w+/){ $gene = $ga->fetch_by_stable_id($flag->ensembl_id); } else { $gene = $ga->fetch_by_dbID($flag->ensembl_id); } push @genes, $self->lazy_load($gene); } } $self->genes(\@genes); # get all the genes out of your core database with pfam domain hits
# If $REP_TRANSCRIPT flag is set only select one transcript per gene
# the longest translation that has a pfam domain
my $pfam_transcripts =0; my %domains; GENE: foreach my $gene (@genes) { my %already_seen; my @rep_transcripts; my $domain; TRANS: foreach my $trans (@{$gene->get_all_Transcripts}) { my %already_seen; if ($trans->translation) { my $pep = $trans->translation; #If the transcript has a pfam domain store translation object
foreach my $pf (@{$pep->get_all_ProteinFeatures}) { if ($pf->analysis->logic_name eq 'Pfam') { # remove the transcript from the array unless it has a pfam domain
push @rep_transcripts,$trans; next TRANS; } } } } next GENE unless (scalar (@rep_transcripts > 0)); if (scalar(@rep_transcripts > 1) && $REP_TRANSCRIPT){ # arrange transcripts by length so you can pick the longest as rep if needed
@rep_transcripts = sort {$a->length <=> $b->length} @rep_transcripts; my @temp = pop @rep_transcripts; print $temp[0]->stable_id."\n"; @rep_transcripts = @temp; } # othewise store domains found in the rep transcript
foreach my $rep_transcript(@rep_transcripts){ foreach my $pf (@{$rep_transcript->translation->get_all_ProteinFeatures}) { unless ($already_seen{$pf->hseqname}) { if ($pf->analysis->logic_name eq 'Pfam') { my $domain_id = $pf->hseqname; $domain_id =~ s/\.\d+//; $domain .= "$domain_id\t$domain_id\n"; $already_seen{$pf->hseqname} =1; } } } $pfam_transcripts++; $domains{$rep_transcript->dbID}=$domain; } } print STDERR "Found $pfam_transcripts Pfam transcripts out of ".scalar(@genes)." genes\n "; $self->domains(\%domains); return 1;
}
fetch_transdescriptionprevnextTop
sub fetch_trans {
  my ($self,$blast_results) =@_;
  my %transcript_hash;
  my $transcript_count;
  my @homologs;
  my %already_seen;
 SPECIES: foreach my $species (keys %{$blast_results}){
    my $ta = $self->species_db($species)->get_TranscriptAdaptor;
    print "Fetching $species transcripts\n";
    $transcript_count=0;
  TRANS:   foreach my $trans_id (@{$blast_results->{$species}}){;
      next TRANS if ($already_seen{$trans_id});
      $transcript_count++;
      next SPECIES unless ($transcript_count <= $PS_SPECIES_LIMIT);
      my $transcript = $ta->fetch_by_translation_stable_id($trans_id);
      $already_seen{$trans_id} = 1;
      unless ($transcript && $transcript->isa("Bio::EnsEMBL::Transcript")){
	$self->warn("Cannot find transcript $trans_id for species ".$transcript_hash{$trans_id}." I get a $transcript\n$@\n");
      }
      next TRANS unless ($transcript->translateable_seq);
      push @homologs,$transcript;
    }
  }
  return\@ homologs;
}
make_dirdescriptionprevnextTop
sub make_dir {
  my ($self) = @_;
  my $input_id = $self->input_id;  
  if ($PSILC_WORK_DIR){
    system("mkdir $PSILC_WORK_DIR/$input_id");
  }
  else{
    $self->throw("Cannot make output directory\n");
  }
  return 1;
}



#####################################################################
# Containers
}
rundescriptionprevnextTop
sub run {
  my ($self) = @_;
  my $ga = $self->gene_db->get_GeneAdaptor;
  $self->make_dir;
  my @genes = @{$self->genes};
  my $pfam_transcript;
  my %domains = %{$self->domains};

 PFAM:  foreach my $gene(@genes){
    my $pseudo_trans = 0;
    my $pfam_trans = 0;
  TRANSCRIPT: foreach my $transcript (@{$gene->get_all_Transcripts}){
      my $PSILC_results;
      if ($domains{$transcript->dbID}){
        $pfam_transcript = $transcript;
	$pfam_trans++;
      }
      else {
	next TRANSCRIPT;
      }
      if ($pfam_transcript->stable_id) {
	print STDERR "Running analysis for ".$pfam_transcript->stable_id."\n";
      } else {
	print STDERR "Running analysis for ".$pfam_transcript->dbID."\n";
      }
      # run blast comparison against the three databases
my $blast_results = $self->run_blast($pfam_transcript); # fetch the sequences of the homologs identified in blast
my @homologs = @{$self->fetch_trans($blast_results)}; if (scalar(@homologs) >= 2) { $PSILC_results = $self->run_PSILC($pfam_transcript,\@homologs); } unless ($PSILC_results) { print STDERR "Not enough sequences to continue for transcript ".$pfam_transcript->dbID; print STDERR " could only find ".scalar(@homologs)."\n"; $self->real_genes($gene); next PFAM; } if ($PSILC_results->{'postNMax'} > 0.95){ $pseudo_trans++; } } if ($pfam_trans > 0 && $pseudo_trans == $pfam_trans){ $self->pseudo_genes($gene) } else { $self->real_genes($gene); } } $self->output($self->pseudo_genes); $self->output($self->real_genes); return 1;
}
run_PSILCdescriptionprevnextTop
sub run_PSILC {
  my ($self,$pfam_transcript,$homologs) = @_;
  my %domains = %{$self->domains};
  # Run PSILC analysis
my $PSILC = Bio::EnsEMBL::Analysis::Runnable::PSILC->new ( '-trans' => $pfam_transcript, '-homologs' => $homologs, '-analysis' => $self->analysis, '-domain' =>\$ domains{$pfam_transcript->dbID}, '-input_id' =>\$ self->input_id, ); eval{ $PSILC->run; }; if ($@){ $self->warn("PSILC FAILED TO RUN $@\n"); return 0; } return $PSILC->output;
}
run_blastdescriptionprevnextTop
sub run_blast {
  my ($self,$pfam_transcript) = @_;
  my $blast = Bio::EnsEMBL::Analysis::Runnable::PSILC_BlastP->new 
    (
     '-trans'    => $pfam_transcript,
     '-analysis' => $self->analysis,
    );
  $blast->run;
  return $blast->output;
}
species_dbdescriptionprevnextTop
sub species_db {
  my ($self,$species,$db)= @_;
  if ($db){
    unless ($db->isa("Bio::EnsEMBL::DBSQL::DBAdaptor")){
      $self->throw("$species object is not a Bio::EnsEMBL::DBSQL::DBAdaptor, it is a $db$@\n");
    }
    if ($species eq $SUBJECT){
      $self->{'_db_subject'} = $db;
    }
    if ($species eq $ORTH1){
      $self->{'_db_orth1'} = $db;
    }
    if ($species eq $ORTH2){
      $self->{'_db_orth2'} = $db;
    }
  }
  if ($species eq $SUBJECT){
    return $self->{'_db_subject'};
  }
  if ($species eq $ORTH1){
    return $self->{'_db_orth1'};
  }
  if ($species eq $ORTH2){
    return $self->{'_db_orth2'};
  }
  $self->throw("Species not recognised : $species\n");
  return;
}
transcriptsdescriptionprevnextTop
sub transcripts {
  my ($self, $transcript) = @_;
  if ($transcript) {
    push @{$self->{'_transcripts'}}, $transcript;
  }
  return $self->{'_transcripts'};
}



1;
}
General documentation
CONTACTTop
Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk