Raw content of Bio::EnsEMBL::Analysis::RunnableDB::Spliced_elsewhere
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::Spliced_elsewhere;
=head1 SYNOPSIS
my $runnabledb = Bio::EnsEMBL::Analysis::RunnableDB::Spliced_elsewhere->new
(
-db => $dbadaptor,
-input_id => flag ids,
-analysis => $analysis
);
$runnabledb->fetch_input();
$runnabledb->run();
$runnabledb->write_output();
=head1 DESCRIPTION
Spliced elsewhere checks single exon transcripts and looks for copies
elsewhere in the genome that contain introns ie: processed pseudogenes.
The module runs on chunk files generated by scripts in:
ensembl-personal/sw4/Scripts/Pseudogenes
which partition the database into single and multiexon genes.The single exon
genes are written to flat files, the multiexon genes are written into the target
databse and also formatted as a blast db.
The module calls a gene spliced elsewhere if it is > 80% identical to another EnsEMBL
transcript with > 80 % coverage and the target gene has a genomic span > 3x the
span of the query. These numbers are configurable through:
Bio::EnsEMBL::Analysis::Config::Pseudogene
The databses used by the module are configured through:
Bio::EnsEMBL::Analysis::Config::Databases;
Runs as a part of the larger pseudogene analysis. Uses flagged single exon genes
identified by Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB.pm
=head1 CONTACT
Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::Spliced_elsewhere;
use strict;
use Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB;
use Bio::EnsEMBL::Analysis::Config::Pseudogene;
use Bio::EnsEMBL::Analysis::Runnable::Spliced_elsewhere;
use Bio::EnsEMBL::Pipeline::DBSQL::FlagAdaptor;
use Bio::EnsEMBL::Analysis::Runnable::BaseExonerate;
use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild;
use vars qw(@ISA);
@ISA = qw( Bio::EnsEMBL::Analysis::RunnableDB::Pseudogene_DB Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild);
=head2 fetch_input
Title : fetch_input
Usage : $self->fetch_input
Function: Fetches input data for Spliced_elsewhere.pm from the database and flat files
Returns : none
Args : none
=cut
sub fetch_input{
my ($self)=@_;
my ($start, $end);
my @genes;
my $count=0;
my %parameters;
if ($self->parameters_hash) {
%parameters = %{$self->parameters_hash};
}
my $runname = "Bio::EnsEMBL::Analysis::Runnable::Spliced_elsewhere";
my $dna_db = $self->get_dbadaptor("REFERENCE_DB") ;
#genes come from final genebuild database
my $genes_db = $self->get_dbadaptor($self->PS_INPUT_DATABASE);
$self->gene_db($genes_db);
my $ga = $genes_db->get_GeneAdaptor;
my $fa = Bio::EnsEMBL::Pipeline::DBSQL::FlagAdaptor->new($self->db);
my $ids = $fa->fetch_by_analysis($self->analysis);
$self->throw("No flags found for analysis " .
$self->SPLICED_ELSEWHERE_LOGIC_NAME ."\n") unless (scalar(@{$ids}>0));
if ($self->input_id =~ /(\d+):(\d+)/) {
$start = $1;
$end = $2;
} else {
$self->throw("Input id not recognised\n");
}
# get ids
foreach my $flag (@{$ids}) {
if ($flag->dbID >= $start && $flag->dbID <= $end) {
$count++;
my $gene = $ga->fetch_by_dbID($flag->ensembl_id);
push @genes, $self->lazy_load($gene);
}
}
print "$count genes retrieved\n";
my $runnable = $runname->new
(
'-genes' => \@genes,
'-analysis' => $self->analysis,
'-PS_MULTI_EXON_DIR' => $self->PS_MULTI_EXON_DIR,
);
$self->runnable($runnable);
return 1;
}
=head2 run
Title : run
Usage : $self->run
Function: Overrides run method in parent class
Returns : none
Args : none
=cut
sub run {
my($self) = @_;
foreach my $runnable (@{$self->runnable}) {
$self->throw("Runnable module not set") unless ($runnable);
$runnable->run;
$self->parse_results($runnable->output);
$self->output($self->real_genes);
# If you want to store retrotransposed genes for psilc
return 1 unless ($self->retro_genes);
if ($self->RETROTRANSPOSED ) {
# Store gene dbIDS for PSILC
my @id_list;
foreach my $gene (@{$self->retro_genes}) {
push @id_list, $gene->dbID;
}
$self->store_ids(\@id_list,"psilc");
} else {
# store retrotransposed genes as pseudo or real depending on config
foreach my $gene (@{$self->retro_genes}) {
if ($self->RETRO_TYPE eq 'pseudogene') {
$self->pseudo_genes($gene);
} else {
$gene->type($self->RETRO_TYPE);
$self->output([$gene]);
}
}
}
$self->output($self->pseudo_genes);
}
return 1;
}
# Blast parsing done in runnable db as it needs acess to database to determine
# gene span
=head2 parse_results
Title : parse_results
Usage : $self->parse_results
Function: Parses blast results to find spliced copies.
Tests percent ID, coverage and span.
Span is calculated as the distance between two genomic cordinates
corresponding to the HSP subject start and end positions
Returns : none
Args : none
=cut
sub parse_results{
my ($self,$results)=@_;
my $ta = $self->gene_db->get_TranscriptAdaptor;
my $ga = $self->gene_db->get_GeneAdaptor;
RESULT:foreach my $result_hash_ref (@{$results}) {
my %result_hash = %{$result_hash_ref};
my %trans_type ;
next RESULT unless ( %result_hash );
TRANS: foreach my $id (keys %result_hash){
my $retro_trans = $ta->fetch_by_dbID($id);
my $retro_span = $retro_trans->cdna_coding_end- $retro_trans->cdna_coding_start;
# catch results where the blast job failed
if ( $result_hash{$id} eq 'NONE' ){
# gene is very unique we have to call it real really
push @{$trans_type{'real'}},$retro_trans;
next TRANS;
}
if ( $result_hash{$id} eq 'REPEATS' ) {
# gene is covered with low complexity repeats probably - let it through
push @{$trans_type{'real'}},$retro_trans;
next TRANS;
}
my @dafs = @{$result_hash{$id}};
@dafs = sort {$a->p_value <=> $b->p_value} @dafs;
DAF: foreach my $daf (@dafs) {
my $retro_coverage = 0;
my $real_coverage = 0;
# is the percent id above threshold?
next DAF unless ($daf->percent_id > $self->PS_PERCENT_ID_CUTOFF);
next DAF unless ($daf->p_value < $self->PS_P_VALUE_CUTOFF);
# dont want reverse matches
next DAF unless ($daf->strand == $daf->hstrand ) ;
# tighten up the result set
next DAF unless ($daf->percent_id > 80 or ( $daf->percent_id > 70 and $daf->p_value < 1.0e-140 ));
my @align_blocks = $daf->ungapped_features;
# is the coverage above the threshold?
# There are a few things to consider the coverage of the PROTEIN sitting in the genomic say > 50 %
# There is how much GENOMIC dna has been aligned as well - thats a good sign it is a retro gene
# lets work out the coverage exon at a time
foreach my $e ( @{$retro_trans->get_all_Exons} ) {
# put the exons onto the same slice as the dafs
my $slice = $retro_trans->feature_Slice;
my $exon = $e->transfer($slice);
foreach my $block ( @align_blocks ) {
# compensate for the 1kb padding on the retro transcript
$block->start($block->start-1000);
$block->end($block->end-1000);
$real_coverage+= $block->length;
if ( $exon->overlaps($block) ) {
if ( $exon->start < $block->start && $exon->end < $block->end ) {
# Bs|---------------|------Be
# Es---|===============|Ee
$retro_coverage+= $exon->end - $block->start ;
} elsif ( $exon->start >= $block->start && $exon->end >= $block->end ) {
# Bs-----|---------------|Be
# Es|===============|-----Ee
$retro_coverage+= $block->end - $exon->start;
} elsif ( $exon->start < $block->start && $exon->end >= $block->end ) {
# Bs|---------------------|Be
# Es----|=====================|-----Ee
$retro_coverage+= $block->end - $block->start;
} elsif ( $exon->start >= $block->start && $exon->end < $block->end ) {
# Bs|---|---------------------|---|Be
# Es|=====================|Ee
$retro_coverage+= $exon->end - $exon->start;
}
}
}
}
my $coverage = int(($retro_coverage / $retro_trans->length)*100);
my $aligned_genomic = $real_coverage - $retro_coverage;
next DAF unless ($coverage > $self->PS_RETOTRANSPOSED_COVERAGE);
next DAF unless ($aligned_genomic > $self->PS_ALIGNED_GENOMIC);
my $real_trans;
# Throw if transcript cannot be found
eval{
$real_trans = $ta->fetch_by_stable_id($daf->hseqname);
unless ( $real_trans ) {
$real_trans = $ta->fetch_by_dbID($daf->hseqname);
}
};
if ($@) {
$self->throw("Unable to find transcript $daf->hseqname \n$@\n");
next;
}
##################################################################
# real span is the genomic span of the subject HSP
# hstart+3 and $daf->hend-3 move inwards at both ends of the HSP by
# 3 residues, this is so that if 1 residue is sitting on a new exon
# it wont include that in the span, needs to overlap by 3 residues
# before it gets included
my $real_span;
my @genomic_coords = $real_trans->cdna2genomic($daf->hstart+9,$daf->hend-9);
@genomic_coords = sort {$a->start <=> $b->start} @genomic_coords;
$real_span = $genomic_coords[$#genomic_coords]->end - $genomic_coords[0]->start;
# Is the span higher than the allowed ratio?
if ($real_span / $retro_span > $self->PS_SPAN_RATIO ) {
print STDERR "---------------------------------------------------------------------------------------------------\n";
print STDERR "DAF: " .
$daf->start . " " .
$daf->end . " " .
$daf->strand . " " .
$daf->hstart . " " .
$daf->hend . " " .
$daf->hstrand . " " .
$daf->hseqname . " " .
$daf->cigar_string . "\n";
print STDERR "transcript id ". $retro_trans->dbID." matches transcriptid " . $daf->hseqname . " at "
. $daf->percent_id . " %ID and $coverage % coverage with $aligned_genomic flanking genomic bases aligned pseudogene span of $retro_span vs real gene span of $real_span\n";
print STDERR "P-value = " . $daf->p_value . "\n";
print STDERR "Coverage $coverage% - $retro_coverage bp of gene and $aligned_genomic bp of genomic $real_coverage total\n";
print STDERR ">".$retro_trans->dbID;
print STDERR "\n".$retro_trans->feature_Slice->expand(1000,1000)->get_repeatmasked_seq->seq."\n";
print STDERR ">".$real_trans->dbID;
print STDERR "\n".$real_trans->seq->seq."\n";
# Gene is pseudo, store it internally
push @{$trans_type{'pseudo'}},$retro_trans;
next TRANS;
}
}
# transcript is real
push @{$trans_type{'real'}},$retro_trans;
}
unless (defined($trans_type{'real'})) {
# if all transcripts are pseudo get label the gene as a pseudogene
my $gene = $ga->fetch_by_transcript_id($trans_type{'pseudo'}[0]->dbID);
my @pseudo_trans = @{$gene->get_all_Transcripts};
@pseudo_trans = sort {$a->length <=> $b->length} @pseudo_trans;
my $only_transcript_to_keep = pop @pseudo_trans;
$self->transcript_to_keep($only_transcript_to_keep);
foreach my $pseudo_transcript (@pseudo_trans) {
my $blessed = $self->_remove_transcript_from_gene($gene,$pseudo_transcript);
if ( $blessed ) {
print STDERR "Blessed transcript " . $pseudo_transcript->display_id .
" is a retro transcript \n";
}
}
$gene->biotype($self->RETRO_TYPE);
$self->retro_genes($gene);
next RESULT;
}
# gene has at least one real transcript so it is real
$self->real_genes($ga->fetch_by_transcript_id($trans_type{'real'}[0]->dbID));
}
return 1;
}
######################################
# CONTAINERS
=head2 retro_genes
Arg [1] : Bio::EnsEMBL::Gene
Description: get/set for retrogenes
Returntype : Bio::EnsEMBL::Gene
Exceptions : none
Caller : general
=cut
sub retro_genes {
my ($self, $retro_gene) = @_;
if ($retro_gene) {
unless ($retro_gene->isa("Bio::EnsEMBL::Gene")){
$self->throw("retro gene is not a Bio::EnsEMBL::Gene, it is a $retro_gene");
}
push @{$self->{'_retro_gene'}},$retro_gene;
}
return $self->{'_retro_gene'};
}
1;