Summary | Included libraries | Package variables | Synopsis | Description | General documentation | Methods |
WebCvs | Raw content |
fetch_input | Description | Code |
parse_results | Description | Code |
retro_genes | Description | Code |
run | Description | Code |
fetch_input | code | next | Top |
Title : fetch_input Usage : $self->fetch_input Function: Fetches input data for Spliced_elsewhere.pm from the database and flat files Returns : none Args : none |
parse_results | code | prev | next | Top |
Title : parse_results Usage : $self->parse_results Function: Parses blast results to find spliced copies. Tests percent ID, coverage and span. Span is calculated as the distance between two genomic cordinates corresponding to the HSP subject start and end positions Returns : none Args : none |
retro_genes | code | prev | next | Top |
Arg [1] : Bio::EnsEMBL::Gene Description: get/set for retrogenes Returntype : Bio::EnsEMBL::Gene Exceptions : none Caller : general |
run | code | prev | next | Top |
Title : run Usage : $self->run Function: Overrides run method in parent class Returns : none Args : none |
fetch_input | description | prev | next | Top |
my ($self)=@_; my ($start, $end); my @genes; my $count=0; my %parameters; if ($self->parameters_hash) { %parameters = %{$self->parameters_hash}; } my $runname = "Bio::EnsEMBL::Analysis::Runnable::Spliced_elsewhere"; my $dna_db = $self->get_dbadaptor("REFERENCE_DB") ; #genes come from final genebuild database}
my $genes_db = $self->get_dbadaptor($self->PS_INPUT_DATABASE); $self->gene_db($genes_db); my $ga = $genes_db->get_GeneAdaptor; my $fa = Bio::EnsEMBL::Pipeline::DBSQL::FlagAdaptor->new($self->db); my $ids = $fa->fetch_by_analysis($self->analysis); $self->throw("No flags found for analysis " . $self->SPLICED_ELSEWHERE_LOGIC_NAME ."\n") unless (scalar(@{$ids}>0)); if ($self->input_id =~ /(\d+):(\d+)/) { $start = $1; $end = $2; } else { $self->throw("Input id not recognised\n"); } # get ids
foreach my $flag (@{$ids}) { if ($flag->dbID >= $start && $flag->dbID <= $end) { $count++; my $gene = $ga->fetch_by_dbID($flag->ensembl_id); push @genes, $self->lazy_load($gene); } } print "$count genes retrieved\n"; my $runnable = $runname->new ( '-genes' =>\@ genes, '-analysis' => $self->analysis, '-PS_MULTI_EXON_DIR' => $self->PS_MULTI_EXON_DIR, ); $self->runnable($runnable); return 1;
parse_results | description | prev | next | Top |
my ($self,$results)=@_; my $ta = $self->gene_db->get_TranscriptAdaptor; my $ga = $self->gene_db->get_GeneAdaptor; RESULT:foreach my $result_hash_ref (@{$results}) { my %result_hash = %{$result_hash_ref}; my %trans_type ; next RESULT unless ( %result_hash ); TRANS: foreach my $id (keys %result_hash){ my $retro_trans = $ta->fetch_by_dbID($id); my $retro_span = $retro_trans->cdna_coding_end- $retro_trans->cdna_coding_start; # catch results where the blast job failed}
if ( $result_hash{$id} eq 'NONE' ){ # gene is very unique we have to call it real really
push @{$trans_type{'real'}},$retro_trans; next TRANS; } if ( $result_hash{$id} eq 'REPEATS' ) { # gene is covered with low complexity repeats probably - let it through
push @{$trans_type{'real'}},$retro_trans; next TRANS; } my @dafs = @{$result_hash{$id}}; @dafs = sort {$a->p_value <=> $b->p_value} @dafs; DAF: foreach my $daf (@dafs) { my $retro_coverage = 0; my $real_coverage = 0; # is the percent id above threshold?
next DAF unless ($daf->percent_id > $self->PS_PERCENT_ID_CUTOFF); next DAF unless ($daf->p_value < $self->PS_P_VALUE_CUTOFF); # dont want reverse matches
next DAF unless ($daf->strand == $daf->hstrand ) ; # tighten up the result set
next DAF unless ($daf->percent_id > 80 or ( $daf->percent_id > 70 and $daf->p_value < 1.0e-140 )); my @align_blocks = $daf->ungapped_features; # is the coverage above the threshold?
# There are a few things to consider the coverage of the PROTEIN sitting in the genomic say > 50 %
# There is how much GENOMIC dna has been aligned as well - thats a good sign it is a retro gene
# lets work out the coverage exon at a time
foreach my $e ( @{$retro_trans->get_all_Exons} ) { # put the exons onto the same slice as the dafs
my $slice = $retro_trans->feature_Slice; my $exon = $e->transfer($slice); foreach my $block ( @align_blocks ) { # compensate for the 1kb padding on the retro transcript
$block->start($block->start-1000); $block->end($block->end-1000); $real_coverage+= $block->length; if ( $exon->overlaps($block) ) { if ( $exon->start < $block->start && $exon->end < $block->end ) { # Bs|---------------|------Be
# Es---|===============|Ee
$retro_coverage+= $exon->end - $block->start ; } elsif ( $exon->start >= $block->start && $exon->end >= $block->end ) { # Bs-----|---------------|Be
# Es|===============|-----Ee
$retro_coverage+= $block->end - $exon->start; } elsif ( $exon->start < $block->start && $exon->end >= $block->end ) { # Bs|---------------------|Be
# Es----|=====================|-----Ee
$retro_coverage+= $block->end - $block->start; } elsif ( $exon->start >= $block->start && $exon->end < $block->end ) { # Bs|---|---------------------|---|Be
# Es|=====================|Ee
$retro_coverage+= $exon->end - $exon->start; } } } } my $coverage = int(($retro_coverage / $retro_trans->length)*100);
my $aligned_genomic = $real_coverage - $retro_coverage; next DAF unless ($coverage > $self->PS_RETOTRANSPOSED_COVERAGE); next DAF unless ($aligned_genomic > $self->PS_ALIGNED_GENOMIC); my $real_trans; # Throw if transcript cannot be found
eval{ $real_trans = $ta->fetch_by_stable_id($daf->hseqname); unless ( $real_trans ) { $real_trans = $ta->fetch_by_dbID($daf->hseqname); } }; if ($@) { $self->throw("Unable to find transcript $daf->hseqname\n $@\n"); next; } ##################################################################
# real span is the genomic span of the subject HSP
# hstart+3 and $daf->hend-3 move inwards at both ends of the HSP by
# 3 residues, this is so that if 1 residue is sitting on a new exon
# it wont include that in the span, needs to overlap by 3 residues
# before it gets included
my $real_span; my @genomic_coords = $real_trans->cdna2genomic($daf->hstart+9,$daf->hend-9); @genomic_coords = sort {$a->start <=> $b->start} @genomic_coords; $real_span = $genomic_coords[$#genomic_coords]->end - $genomic_coords[0]->start; # Is the span higher than the allowed ratio?
if ($real_span / $retro_span > $self->PS_SPAN_RATIO ) {
print STDERR "---------------------------------------------------------------------------------------------------\n";
print STDERR "DAF: " . $daf->start . " " . $daf->end . " " . $daf->strand . " " . $daf->hstart . " " . $daf->hend . " " . $daf->hstrand . " " . $daf->hseqname . " " . $daf->cigar_string . "\n"; print STDERR "transcript id ". $retro_trans->dbID." matches transcriptid " . $daf->hseqname . " at " . $daf->percent_id . " %ID and $coverage % coverage with $aligned_genomic flanking genomic bases aligned pseudogene span of $retro_span vs real gene span of $real_span\n"; print STDERR "P-value = " . $daf->p_value . "\n"; print STDERR "Coverage $coverage% - $retro_coverage bp of gene and $aligned_genomic bp of genomic $real_coverage total\n"; print STDERR ">".$retro_trans->dbID; print STDERR "\n".$retro_trans->feature_Slice->expand(1000,1000)->get_repeatmasked_seq->seq."\n"; print STDERR ">".$real_trans->dbID; print STDERR "\n".$real_trans->seq->seq."\n"; # Gene is pseudo, store it internally
push @{$trans_type{'pseudo'}},$retro_trans; next TRANS; } } # transcript is real
push @{$trans_type{'real'}},$retro_trans; } unless (defined($trans_type{'real'})) { # if all transcripts are pseudo get label the gene as a pseudogene
my $gene = $ga->fetch_by_transcript_id($trans_type{'pseudo'}[0]->dbID); my @pseudo_trans = @{$gene->get_all_Transcripts}; @pseudo_trans = sort {$a->length <=> $b->length} @pseudo_trans; my $only_transcript_to_keep = pop @pseudo_trans; $self->transcript_to_keep($only_transcript_to_keep); foreach my $pseudo_transcript (@pseudo_trans) { my $blessed = $self->_remove_transcript_from_gene($gene,$pseudo_transcript); if ( $blessed ) { print STDERR "Blessed transcript " . $pseudo_transcript->display_id . " is a retro transcript\n "; } } $gene->biotype($self->RETRO_TYPE); $self->retro_genes($gene); next RESULT; } # gene has at least one real transcript so it is real
$self->real_genes($ga->fetch_by_transcript_id($trans_type{'real'}[0]->dbID)); } return 1; } ######################################
# CONTAINERS
retro_genes | description | prev | next | Top |
my ($self, $retro_gene) = @_; if ($retro_gene) { unless ($retro_gene->isa("Bio::EnsEMBL::Gene")){ $self->throw("retro gene is not a Bio::EnsEMBL::Gene, it is a $retro_gene"); } push @{$self->{'_retro_gene'}},$retro_gene; } return $self->{'_retro_gene'}; } 1;}
run | description | prev | next | Top |
my($self) = @_; foreach my $runnable (@{$self->runnable}) { $self->throw("Runnable module not set") unless ($runnable); $runnable->run; $self->parse_results($runnable->output); $self->output($self->real_genes); # If you want to store retrotransposed genes for psilc}
return 1 unless ($self->retro_genes); if ($self->RETROTRANSPOSED ) { # Store gene dbIDS for PSILC
my @id_list; foreach my $gene (@{$self->retro_genes}) { push @id_list, $gene->dbID; } $self->store_ids(\@id_list,"psilc"); } else { # store retrotransposed genes as pseudo or real depending on config
foreach my $gene (@{$self->retro_genes}) { if ($self->RETRO_TYPE eq 'pseudogene') { $self->pseudo_genes($gene); } else { $gene->type($self->RETRO_TYPE); $self->output([$gene]); } } } $self->output($self->pseudo_genes); } return 1; } # Blast parsing done in runnable db as it needs acess to database to determine
# gene span
CONTACT | Top |