Bio::EnsEMBL::Analysis::RunnableDB
PSILC
Toolbar
Summary
Bio::EnsEMBL::Analysis::RunnableDB::PSILC;
Package variables
Privates (from "my" definitions)
$runnable;
Included modules
Bio::EnsEMBL::Analysis::Config::Pseudogene
Inherit
Synopsis
my $runnabledb = Bio::EnsEMBL::Analysis::RunnableDB::PSILC->new
(
-db => $db_adaptor,
-input_id => flag start:flag end,
-analysis => $analysis,
);
$runnabledb->fetch_input();
$runnabledb->run();
my @array = @{$runnabledb->output};
$runnabledb->write_output();
Description
Prepares and runs PSILC.
PSILC is a pseudogene predicting program that compares DNA alignments of
transcripts from the organism of interest and homologous sequences from 3
other organisms. It uses regions of conservation within the seqence alignment
defined by PFAM domains to make its predictions.
It uses flagged seqences as input ids that were identified for further
analysis by Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB.pm or
Bio::EnsEMBL::Analysis::RunnableDB::Spliced_elsewhere.
Takes start and end flag ids as input, fetches all flags that lie within the
specified range that are associated with the psilc analysis, retrieves the
associated gene objects. Identifies all the transcripts in the set that contain
PFAM domains (requires protein annotationto be run before the pseudogene tests)
Runs a BLASTP search to identify orthologs from the 3 comparison databases,
then clusters the results using clustalw and passed the alignment to PSILC to
run over.
Parses the resuls and modifies gene objects to make them into pseudogenes if
required and then writes the genes to the final database.
There is a script ensembl-personal/sw4/Scripts/Pseudogenes/prepare_PSILC.pl
that prepares the module for pipelining, makes input ids etc.
A second script:
ensembl-personal/sw4/Scripts/Pseudogenes/make_PSILC_bsubs.pl
which will prepare the blast databases to use from the 3 species.
The module is configured through:
Bio::EnsEMBL::Analysis::Config::Pseudogene.pm
Methods
Methods description
Arg [1] : Hash Description: get/set for hash tying pfam domain identifiers to transcript dbID Returntype : Hash Exceptions : none Caller : general |
Arg [1] : none Description: Opens and stores connections to the 3 ortholog databases and the genebuild databases, parses the flag input id and fetches the genes to run PSILC on. Determines which transcripts contain PFAM domains and stores the domains internally in a hash that ties the transcript id to the PFAM id Returntype : none Exceptions : throws if the input ids are not recognised. Caller : general |
Arg [1] : HASH ref Description: fetches transcripts from the 3 ortholog databases, ensures sequences are non-redundant and limits the transcripts to a pre set number of sequences from each species Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor Exceptions : throws unless passed a Bio::EnsEMBL::DBSQL::DBAdaptor, throws if the species name is not recognised Caller : general |
Arg [1] : none Description: creates output directory for storing PSILC results Returntype : none Exceptions : none Caller : general |
Arg [1] : none Description: Runs a BLASTP search on each transcript with a PFAM domain, then clusters the sequences using clustalw follwed by running PSILC Returntype : none Exceptions : Caller : general |
Arg [1] : Bio::EnsEMBL::Transcript Description: runs the BLASTP runnable Returntype : Hash reference Exceptions : none Caller : general |
Arg [1] : scalar species name, Bio::EnsEMBL::DBSQL::DBAdaptor Description: get/set db adaptors for the three species chosen Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor Exceptions : throws unless passed a Bio::EnsEMBL::DBSQL::DBAdaptor, throws if the species name is not recognised Caller : general |
Arg [1] : Bio::EnsEMBL::Transcript Description: get/set for transcripts Returntype : Bio::EnsEMBL::Transcript Exceptions : none Caller : general |
Methods code
sub domains
{ my ($self, $domains) = @_;
if ($domains) {
$self->{'_domains'} = $domains;
}
return $self->{'_domains'}; } |
sub fetch_input
{ my ($self)=@_;
my ($start, $end);
my $count=0;
my @genes;
my $subjectdb = new Bio::EnsEMBL::DBSQL::DBAdaptor
(
'-host' => $PSILC_SUBJECT_DBHOST,
'-user' => 'ensro',
'-dbname' => $PSILC_SUBJECT_DBNAME,
'-port' => $PSILC_SUBJECT_DBPORT,
);
my $orth1db = new Bio::EnsEMBL::DBSQL::DBAdaptor
(
'-host' => $PSILC_ORTH1_DBHOST,
'-user' => 'ensro',
'-dbname' => $PSILC_ORTH1_DBNAME,
'-port' => $PSILC_ORTH1_DBPORT,
);
my $orth2db = new Bio::EnsEMBL::DBSQL::DBAdaptor
(
'-host' => $PSILC_ORTH2_DBHOST,
'-user' => 'ensro',
'-dbname' => $PSILC_ORTH2_DBNAME,
'-port' => $PSILC_ORTH2_DBPORT,
);
$self->species_db($SUBJECT,$subjectdb);
$self->species_db($ORTH1,$orth1db);
$self->species_db($ORTH2,$orth2db);
my $dna_db = $self->get_dbadaptor($DNA_DBNAME) ;
$self->rep_db($dna_db);
my $genes_db = $self->get_dbadaptor("GENEBUILD_DB");
$self->gene_db($genes_db);
my $ga = $genes_db->get_GeneAdaptor;
my $fa = Bio::EnsEMBL::Pipeline::DBSQL::FlagAdaptor->new($self->db);
my $ids = $fa->fetch_by_analysis($self->analysis);
$self->throw("No flags found for analysis $PSILC_LOGIC_NAME\n") unless (scalar(@{$ids}>0));
if ($self->input_id =~ /(\d+):(\d+)/) {
$start = $1;
$end = $2;
} else {
$self->throw("Input id not recognised\n");
}
foreach my $flag (@{$ids}) {
my $gene;
if ($flag->dbID >= $start && $flag->dbID <= $end) {
$count++;
if ($flag->ensembl_id =~ /\w+/){
$gene = $ga->fetch_by_stable_id($flag->ensembl_id);
}
else {
$gene = $ga->fetch_by_dbID($flag->ensembl_id);
}
push @genes, $self->lazy_load($gene);
}
}
$self->genes(\@genes);
my $pfam_transcripts =0;
my %domains;
GENE: foreach my $gene (@genes) {
my %already_seen;
my @rep_transcripts;
my $domain;
TRANS: foreach my $trans (@{$gene->get_all_Transcripts}) {
my %already_seen;
if ($trans->translation) {
my $pep = $trans->translation;
foreach my $pf (@{$pep->get_all_ProteinFeatures}) {
if ($pf->analysis->logic_name eq 'Pfam') {
push @rep_transcripts,$trans;
next TRANS;
}
}
}
}
next GENE unless (scalar (@rep_transcripts > 0));
if (scalar(@rep_transcripts > 1) && $REP_TRANSCRIPT){
@rep_transcripts = sort {$a->length <=> $b->length} @rep_transcripts;
my @temp = pop @rep_transcripts;
print $temp[0]->stable_id."\n";
@rep_transcripts = @temp;
}
foreach my $rep_transcript(@rep_transcripts){
foreach my $pf (@{$rep_transcript->translation->get_all_ProteinFeatures}) {
unless ($already_seen{$pf->hseqname}) {
if ($pf->analysis->logic_name eq 'Pfam') {
my $domain_id = $pf->hseqname;
$domain_id =~ s/\.\d+//;
$domain .= "$domain_id\t$domain_id\n";
$already_seen{$pf->hseqname} =1;
}
}
}
$pfam_transcripts++;
$domains{$rep_transcript->dbID}=$domain;
}
}
print STDERR "Found $pfam_transcripts Pfam transcripts out of ".scalar(@genes)." genes\n ";
$self->domains(\%domains);
return 1; } |
sub fetch_trans
{ my ($self,$blast_results) =@_;
my %transcript_hash;
my $transcript_count;
my @homologs;
my %already_seen;
SPECIES: foreach my $species (keys %{$blast_results}){
my $ta = $self->species_db($species)->get_TranscriptAdaptor;
print "Fetching $species transcripts\n";
$transcript_count=0;
TRANS: foreach my $trans_id (@{$blast_results->{$species}}){;
next TRANS if ($already_seen{$trans_id});
$transcript_count++;
next SPECIES unless ($transcript_count <= $PS_SPECIES_LIMIT);
my $transcript = $ta->fetch_by_translation_stable_id($trans_id);
$already_seen{$trans_id} = 1;
unless ($transcript && $transcript->isa("Bio::EnsEMBL::Transcript")){
$self->warn("Cannot find transcript $trans_id for species ".$transcript_hash{$trans_id}." I get a $transcript\n$@\n");
}
next TRANS unless ($transcript->translateable_seq);
push @homologs,$transcript;
}
}
return\@ homologs; } |
sub make_dir
{ my ($self) = @_;
my $input_id = $self->input_id;
if ($PSILC_WORK_DIR){
system("mkdir $PSILC_WORK_DIR/$input_id");
}
else{
$self->throw("Cannot make output directory\n");
}
return 1;
}
} |
sub run
{ my ($self) = @_;
my $ga = $self->gene_db->get_GeneAdaptor;
$self->make_dir;
my @genes = @{$self->genes};
my $pfam_transcript;
my %domains = %{$self->domains};
PFAM: foreach my $gene(@genes){
my $pseudo_trans = 0;
my $pfam_trans = 0;
TRANSCRIPT: foreach my $transcript (@{$gene->get_all_Transcripts}){
my $PSILC_results;
if ($domains{$transcript->dbID}){
$pfam_transcript = $transcript;
$pfam_trans++;
}
else {
next TRANSCRIPT;
}
if ($pfam_transcript->stable_id) {
print STDERR "Running analysis for ".$pfam_transcript->stable_id."\n";
} else {
print STDERR "Running analysis for ".$pfam_transcript->dbID."\n";
}
my $blast_results = $self->run_blast($pfam_transcript);
my @homologs = @{$self->fetch_trans($blast_results)};
if (scalar(@homologs) >= 2) {
$PSILC_results = $self->run_PSILC($pfam_transcript,\@homologs);
}
unless ($PSILC_results) {
print STDERR "Not enough sequences to continue for transcript ".$pfam_transcript->dbID;
print STDERR " could only find ".scalar(@homologs)."\n";
$self->real_genes($gene);
next PFAM;
}
if ($PSILC_results->{'postNMax'} > 0.95){
$pseudo_trans++;
}
}
if ($pfam_trans > 0 && $pseudo_trans == $pfam_trans){
$self->pseudo_genes($gene)
}
else {
$self->real_genes($gene);
}
}
$self->output($self->pseudo_genes);
$self->output($self->real_genes);
return 1; } |
sub run_PSILC
{ my ($self,$pfam_transcript,$homologs) = @_;
my %domains = %{$self->domains};
my $PSILC = Bio::EnsEMBL::Analysis::Runnable::PSILC->new
(
'-trans' => $pfam_transcript,
'-homologs' => $homologs,
'-analysis' => $self->analysis,
'-domain' =>\$ domains{$pfam_transcript->dbID},
'-input_id' =>\$ self->input_id,
);
eval{
$PSILC->run;
};
if ($@){
$self->warn("PSILC FAILED TO RUN $@\n");
return 0;
}
return $PSILC->output; } |
sub species_db
{ my ($self,$species,$db)= @_;
if ($db){
unless ($db->isa("Bio::EnsEMBL::DBSQL::DBAdaptor")){
$self->throw("$species object is not a Bio::EnsEMBL::DBSQL::DBAdaptor, it is a $db$@\n");
}
if ($species eq $SUBJECT){
$self->{'_db_subject'} = $db;
}
if ($species eq $ORTH1){
$self->{'_db_orth1'} = $db;
}
if ($species eq $ORTH2){
$self->{'_db_orth2'} = $db;
}
}
if ($species eq $SUBJECT){
return $self->{'_db_subject'};
}
if ($species eq $ORTH1){
return $self->{'_db_orth1'};
}
if ($species eq $ORTH2){
return $self->{'_db_orth2'};
}
$self->throw("Species not recognised : $species\n");
return; } |
sub transcripts
{ my ($self, $transcript) = @_;
if ($transcript) {
push @{$self->{'_transcripts'}}, $transcript;
}
return $self->{'_transcripts'};
}
1; } |
General documentation