PSILC documentation.

Prepares and runs PSILC.
PSILC is a pseudogene predicting program that compares DNA alignments of
transcripts from the organism of interest and homologous sequences from 3
other organisms. It uses regions of conservation within the seqence alignment
defined by PFAM domains to make its predictions.
It uses flagged seqences as input ids that were identified for further
analysis by Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB.pm or
Bio::EnsEMBL::Analysis::RunnableDB::Spliced_elsewhere.
Takes start and end flag ids as input, fetches all flags that lie within the
specified range that are associated with the psilc analysis, retrieves the
associated gene objects. Identifies all the transcripts in the set that contain
PFAM domains (requires protein annotationto be run before the pseudogene tests)
Runs a BLASTP search to identify orthologs from the 3 comparison databases,
then clusters the results using clustalw and passed the alignment to PSILC to
run over.
Parses the resuls and modifies gene objects to make them into pseudogenes if
required and then writes the genes to the final database.
There is a script ensembl-personal/sw4/Scripts/Pseudogenes/prepare_PSILC.pl
that prepares the module for pipelining, makes input ids etc.
A second script:
ensembl-personal/sw4/Scripts/Pseudogenes/make_PSILC_bsubs.pl
which will prepare the blast databases to use from the 3 species.
The module is configured through:
Bio::EnsEMBL::Analysis::Config::Pseudogene.pm

sub fetch_input {

  my ($self)=@_;
  # open the dbs, get the adaptors and store them
  my ($start, $end);
  my $count=0;
  my @genes; 
  my $subjectdb = new Bio::EnsEMBL::DBSQL::DBAdaptor
    (
     '-host'   => $PSILC_SUBJECT_DBHOST,
     '-user'   => 'ensro',
     '-dbname' => $PSILC_SUBJECT_DBNAME,
     '-port'   => $PSILC_SUBJECT_DBPORT,
    );
  my $orth1db = new Bio::EnsEMBL::DBSQL::DBAdaptor
    (
     '-host'   => $PSILC_ORTH1_DBHOST,
     '-user'   => 'ensro',
     '-dbname' => $PSILC_ORTH1_DBNAME,
     '-port'   => $PSILC_ORTH1_DBPORT,
    );  
  my $orth2db = new Bio::EnsEMBL::DBSQL::DBAdaptor
    (
     '-host'   => $PSILC_ORTH2_DBHOST,
     '-user'   => 'ensro',
     '-dbname' => $PSILC_ORTH2_DBNAME,
     '-port'   => $PSILC_ORTH2_DBPORT,
    ); 

  $self->species_db($SUBJECT,$subjectdb);
  $self->species_db($ORTH1,$orth1db);
  $self->species_db($ORTH2,$orth2db); 

  #store repeat db internally
  my $dna_db = $self->get_dbadaptor($DNA_DBNAME) ;
  $self->rep_db($dna_db);

  #genes come from final genebuild database
  my $genes_db = $self->get_dbadaptor("GENEBUILD_DB");
  $self->gene_db($genes_db);
 
  my $ga = $genes_db->get_GeneAdaptor;
  my $fa = Bio::EnsEMBL::Pipeline::DBSQL::FlagAdaptor->new($self->db);
  my $ids = $fa->fetch_by_analysis($self->analysis);
  $self->throw("No flags found for analysis $PSILC_LOGIC_NAME\n")  unless (scalar(@{$ids}>0));
  if ($self->input_id =~ /(\d+):(\d+)/) {
    $start = $1;
    $end = $2;
  } else {
    $self->throw("Input id not recognised\n");
  }
  # get ids
  foreach my $flag (@{$ids}) {
    my $gene;
    if ($flag->dbID >= $start && $flag->dbID <= $end) {
      $count++;
      if ($flag->ensembl_id =~ /\w+/){
	$gene = $ga->fetch_by_stable_id($flag->ensembl_id);
      }
      else {
	$gene = $ga->fetch_by_dbID($flag->ensembl_id);
      }
      push @genes, $self->lazy_load($gene);
    }
  }
  $self->genes(\@genes);

  # get all the genes out of your core database with pfam domain hits
  # If $REP_TRANSCRIPT flag is set only select one transcript per gene 
  # the longest translation that has a pfam domain

  my $pfam_transcripts =0;
  my %domains;

 GENE: foreach my $gene (@genes) {  
    my %already_seen;
    my @rep_transcripts;
    my $domain;
  TRANS: foreach my $trans (@{$gene->get_all_Transcripts}) {
      my %already_seen;
      if ($trans->translation) {
	my $pep = $trans->translation;
	#If the transcript has a pfam domain store translation object
	foreach my $pf (@{$pep->get_all_ProteinFeatures}) {
	  if ($pf->analysis->logic_name eq 'Pfam') {
	    # remove the transcript from the array unless it has a pfam domain
	    push @rep_transcripts,$trans;
	    next TRANS;
	  }
	}
      }
    }
    next GENE unless (scalar (@rep_transcripts > 0));
    if (scalar(@rep_transcripts > 1) && $REP_TRANSCRIPT){
      # arrange transcripts by length so you can pick the longest as rep if needed
      @rep_transcripts = sort {$a->length <=> $b->length} @rep_transcripts;
      my @temp = pop @rep_transcripts;
      print $temp[0]->stable_id."\n";
      @rep_transcripts = @temp;
    }
    # othewise store domains found in the rep transcript
    foreach my $rep_transcript(@rep_transcripts){
      foreach my $pf (@{$rep_transcript->translation->get_all_ProteinFeatures}) {
	unless ($already_seen{$pf->hseqname}) {
	    if ($pf->analysis->logic_name eq 'Pfam') {
	      my $domain_id = $pf->hseqname;
	      $domain_id =~ s/\.\d+//;
	      $domain .= "$domain_id\t$domain_id\n";
	      $already_seen{$pf->hseqname} =1;
	    }
	  }
      }
      $pfam_transcripts++;
      $domains{$rep_transcript->dbID}=$domain;
    }
  }
  print STDERR "Found $pfam_transcripts Pfam transcripts out of ".scalar(@genes)." genes\n ";
  $self->domains(\%domains);
  return 1;

}

sub fetch_trans {

  my ($self,$blast_results) =@_;
  my %transcript_hash;
  my $transcript_count;
  my @homologs;
  my %already_seen;
 SPECIES: foreach my $species (keys %{$blast_results}){
    my $ta = $self->species_db($species)->get_TranscriptAdaptor;
    print "Fetching $species transcripts\n";
    $transcript_count=0;
  TRANS:   foreach my $trans_id (@{$blast_results->{$species}}){;
      next TRANS if ($already_seen{$trans_id});
      $transcript_count++;
      next SPECIES unless ($transcript_count <= $PS_SPECIES_LIMIT);
      my $transcript = $ta->fetch_by_translation_stable_id($trans_id);
      $already_seen{$trans_id} = 1;
      unless ($transcript && $transcript->isa("Bio::EnsEMBL::Transcript")){
	$self->warn("Cannot find transcript $trans_id for species ".$transcript_hash{$trans_id}." I get a $transcript\n$@\n");
      }
      next TRANS unless ($transcript->translateable_seq);
      push @homologs,$transcript;
    }
  }
  return\@ homologs;

}

sub run {

  my ($self) = @_;
  my $ga = $self->gene_db->get_GeneAdaptor;
  $self->make_dir;
  my @genes = @{$self->genes};
  my $pfam_transcript;
  my %domains = %{$self->domains};

 PFAM:  foreach my $gene(@genes){
    my $pseudo_trans = 0;
    my $pfam_trans = 0;
  TRANSCRIPT: foreach my $transcript (@{$gene->get_all_Transcripts}){
      my $PSILC_results;
      if ($domains{$transcript->dbID}){
        $pfam_transcript = $transcript;
	$pfam_trans++;
      }
      else {
	next TRANSCRIPT;
      }
      if ($pfam_transcript->stable_id) {
	print STDERR "Running analysis for ".$pfam_transcript->stable_id."\n";
      } else {
	print STDERR "Running analysis for ".$pfam_transcript->dbID."\n";
      }
      # run blast comparison against the three databases
      my $blast_results = $self->run_blast($pfam_transcript);
      # fetch the sequences of the homologs identified in blast
      my @homologs = @{$self->fetch_trans($blast_results)};
      if (scalar(@homologs) >= 2) {
	$PSILC_results = $self->run_PSILC($pfam_transcript,\@homologs);
      }
      unless ($PSILC_results) {
	print STDERR "Not enough sequences to continue for transcript ".$pfam_transcript->dbID;
	print STDERR " could only find ".scalar(@homologs)."\n";
	$self->real_genes($gene);
	next PFAM;
      }
      if ($PSILC_results->{'postNMax'} > 0.95){
	$pseudo_trans++;
      }
    }
    if ($pfam_trans > 0 && $pseudo_trans == $pfam_trans){
      $self->pseudo_genes($gene)
    }
    else {
      $self->real_genes($gene);
    }
  }
  $self->output($self->pseudo_genes);
  $self->output($self->real_genes);
  return 1;

}

sub species_db {

  my ($self,$species,$db)= @_;
  if ($db){
    unless ($db->isa("Bio::EnsEMBL::DBSQL::DBAdaptor")){
      $self->throw("$species object is not a Bio::EnsEMBL::DBSQL::DBAdaptor, it is a $db$@\n");
    }
    if ($species eq $SUBJECT){
      $self->{'_db_subject'} = $db;
    }
    if ($species eq $ORTH1){
      $self->{'_db_orth1'} = $db;
    }
    if ($species eq $ORTH2){
      $self->{'_db_orth2'} = $db;
    }
  }
  if ($species eq $SUBJECT){
    return $self->{'_db_subject'};
  }
  if ($species eq $ORTH1){
    return $self->{'_db_orth1'};
  }
  if ($species eq $ORTH2){
    return $self->{'_db_orth2'};
  }
  $self->throw("Species not recognised : $species\n");
  return;

}

domains	Description	Code
fetch_input	Description	Code
fetch_trans	Description	Code
make_dir	Description	Code
run	Description	Code
run_PSILC	No description	Code
run_blast	Description	Code
species_db	Description	Code
transcripts	Description	Code