Raw content of Bio::EnsEMBL::Analysis::Runnable::Pseudogene
#
# You may distribute this module under the same terms as perl itself
#
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable::Pseudogene
=head1 SYNOPSIS
my $runnable = Bio::EnsEMBL::Analysis::Runnable::Pseudogene->new
(
'-genes' => \@_genes,
'-repeat_features' => \%repeat_blocks,
);
$runnable->run;
$runnable->output;
Where output returns an array of modified genes.
Repeat blocks is a hash of repeats covering each gene merged into blocks.
=head1 DESCRIPTION
Runnable for PseudogeneDB
Runs tests to identiy pseudogenes:
Calls it a pseudo gene if:
1. All of the introns are frameshifted.
2. Real introns are covered with repeats.
3. Real introns in a two exon gene are covered with a protein feature
Pseudogene takes a Bio::EnsEMBL::Slice object and assesses genes and transcripts for evidence of retrotransposition.
In the case of genes being identified as pseudogenes, the gene objects have their type set to pseudogene and all but the longest transcripts and translations are deleted.
If the gene has 1 or more pseudo transcripts but has other transcritps that are valid the dubious transcripts are removed The resulting gene objects are returned in an array.
=head1 CONTACT
Mail to B
=cut
package Bio::EnsEMBL::Analysis::Runnable::Pseudogene;
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::SeqFeature;
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
# Object preamble - inherits from Bio::EnsEMBL::Root;
=head2 new
Args : various
Description: Runnable constructor
Returntype : Bio::EnsEMBL::Analysis::Runnable::Pseudogene
Caller : general
=cut
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
#SET UP ANY INSTANCE VARIABLES
$self->{'_modified_genes'} =[] ; # array ref to modified genes to write to new db
$self->{'_discarded_transcripts'} = []; # array ref to discarded transcripts
$self->{'_genes'} = []; #array of genes to test;
$self->{'_repeats'} = {}; # hash of repeat blocks corresponding to each gene;
$self->{'_real'} = 0; # scalar number of real genes identified
$self->{'_pseudogenes'} = 0; #scalar number of pseudogenes identified
$self->{'_indeterminate_genes'} = []; #array of indeterminategenes dbIDs identified
$self->{'_single_exon_genes'} = []; #array of single exon gene dbIDs identified
$self->{'_multi_exon_genes'} = []; #array of multiple exon gene to make into a blast database for spliced elsewhere
my( $genes,$repeat_features,$PS_REPEAT_TYPES,$PS_FRAMESHIFT_INTRON_LENGTH,$PS_MAX_INTRON_LENGTH,
$PS_MAX_INTRON_COVERAGE,$PS_MAX_EXON_COVERAGE,
$PS_NUM_FRAMESHIFT_INTRONS,$PS_NUM_REAL_INTRONS,$SINGLE_EXON,
$INDETERMINATE,$PS_MIN_EXONS,$PS_MULTI_EXON_DIR,
$BLESSED_BIOTYPES,$PS_PSEUDO_TYPE,$PS_BIOTYPE,$PS_REPEAT_TYPE,$DEBUG) = rearrange
([qw(
GENES
REPEAT_FEATURES
PS_REPEAT_TYPES
PS_FRAMESHIFT_INTRON_LENGTH
PS_MAX_INTRON_LENGTH
PS_MAX_INTRON_COVERAGE
PS_MAX_EXON_COVERAGE
PS_NUM_FRAMESHIFT_INTRONS
PS_NUM_REAL_INTRONS
SINGLE_EXON
INDETERMINATE
PS_MIN_EXONS
PS_MULTI_EXON_DIR
BLESSED_BIOTYPES
PS_PSEUDO_TYPE
PS_BIOTYPE
PS_REPEAT_TYPE
DEBUG )], @args);
$self->genes($genes);
$self->repeats($repeat_features);
$self->PS_REPEAT_TYPES($PS_REPEAT_TYPES);
$self->PS_FRAMESHIFT_INTRON_LENGTH($PS_FRAMESHIFT_INTRON_LENGTH);
$self->PS_MAX_INTRON_LENGTH($PS_MAX_INTRON_LENGTH);
$self->PS_MAX_INTRON_COVERAGE($PS_MAX_INTRON_COVERAGE);
$self->PS_MAX_EXON_COVERAGE($PS_MAX_EXON_COVERAGE);
$self->PS_NUM_FRAMESHIFT_INTRONS($PS_NUM_FRAMESHIFT_INTRONS);
$self->PS_NUM_REAL_INTRONS($PS_NUM_REAL_INTRONS);
$self->SINGLE_EXON($SINGLE_EXON);
$self->INDETERMINATE($INDETERMINATE);
$self->PS_MIN_EXONS($PS_MIN_EXONS);
$self->PS_MULTI_EXON_DIR($PS_MULTI_EXON_DIR);
$self->BLESSED_BIOTYPES($BLESSED_BIOTYPES);
$self->PS_PSEUDO_TYPE($PS_PSEUDO_TYPE);
$self->PS_BIOTYPE($PS_BIOTYPE);
$self->PS_REPEAT_TYPE($PS_REPEAT_TYPE);
$self->DEBUG($DEBUG);
return $self;
}
=head2 run
Arg [none] :
Description: runs the runnable
Returntype : none
Exceptions : none
Caller : general
=cut
sub run {
my ($self) = @_;
$self->test_genes;
$self->summary;
if ($self->SINGLE_EXON){
# Write out multiple exon genes for making into blast db for Spliced elsewhere
my $filename = $self->create_filename('multi_exon_seq','fasta',$self->PS_MULTI_EXON_DIR);
$self->write_seq_array($self->multi_exon_genes,$filename);
}
return 0;
}
=head2 summary
Arg [none] :
Description: prints out some data about the results
Returntype : none
Exceptions : none
Caller : general
=cut
sub summary {
my ($self) = @_;
if ( $self->real){print STDERR $self->real." real genes identified \n";}
if ( $self->pseudogenes){print STDERR $self->pseudogenes." pseudogenes identified \n";}
if ( $self->repeatgenes){print STDERR $self->repeatgenes." repeat genes identified \n";}
if ( $self->discarded_transcripts){print STDERR scalar(@{$self->discarded_transcripts})." pseudotranscripts to be chucked \n";}
foreach my $transcript (@{$self->discarded_transcripts}) {
print STDERR $transcript->dbID."\n";
}
if ($self->overlooked_genes){
print STDERR "Overlooked genes\n";
foreach my $hash_ref (@{$self->overlooked_genes}) {
my %hash = %{$hash_ref};
foreach my $gene (keys %hash){
print STDERR "$gene ".scalar(@{$hash{$gene}})."\t transcript ".@{$hash{$gene}}[0]->dbID." \n";
}
}
}
if ($self->SINGLE_EXON){
print STDERR scalar(@{$self->single_exon_genes})." single exon genes identified and held back for further study\n";
}
if ($self->INDETERMINATE){
print STDERR scalar(@{$self->indeterminate_genes})." indeterminate genes identified and held back for further study\n";
}
return 1;
}
=head2 test_genes
Arg [none] :
Description: Check genes one transcript at at a time pushes test result for each transcript onto an array, tests array to make final decision genes are classed as pseudogenes if the following criteria are met:
1. At least 80% of the introns are covered with repeats and the total intron length is smaller than 5kb and the gene has a least 1 real and 1 frameshifted intron (default values).
2. All of the introns are short frameshifted introns
3. A two exon gene with intron covered with a protein feature
Returntype : none
Exceptions : none
Caller : general
=cut
sub test_genes{
my $self = shift;
my @evidence;
my $num=0;
my $pseudo= 0;
my $possible = 0;
my @genes = @{$self->genes};
GENE: foreach my $gene (@genes) {
my %trans_type = ();
TRANS: foreach my $transcript (@{$gene->get_all_Transcripts}) {
my $evidence = $self->transcript_evidence($transcript,$gene);
# store the single exon gene for further analysis unless they are all covered by repeats
if ($self->SINGLE_EXON){
if (scalar(@{$transcript->get_all_Exons()})==1) {
unless ( $evidence->{'covered_exons'} && $evidence->{'covered_exons'} >= $self->PS_MAX_EXON_COVERAGE ) {
push @{$trans_type{'single_exon'}}, $transcript;
next TRANS;
}
}
}
$num++;
#transcript tests
#CALL PSEUDOGENE IF AT LEAST 80% COVERAGE OF INTRONS BY REPEATS
#AT LEAST 1 F/S EXON AND 1 REAL EXON (?)
#TOTAL INTRON LENGTH < 5K
if ($evidence->{'total_intron_len'} < $self->PS_MAX_INTRON_LENGTH &&
$evidence->{'frameshift_introns'} >= $self->PS_NUM_FRAMESHIFT_INTRONS &&
$evidence->{'real_introns'} >= $self->PS_NUM_REAL_INTRONS &&
$evidence->{'covered_introns'} >= $self->PS_MAX_INTRON_COVERAGE ) {
push @{$trans_type{'pseudo'}}, $transcript;
print STDERR $gene->dbID." - repeats in introns in transcript ".$transcript->dbID."\n" if $self->DEBUG;
print STDERR join (', ',%{$evidence}),"\n" if $self->DEBUG;
next TRANS;
}
#ALL FRAMESHIFTED - it is a pseudogene
if ($evidence->{'num_introns'} &&
$evidence->{'frameshift_introns'} == $evidence->{'num_introns'}) {
push@{$trans_type{'pseudo'}}, $transcript;
next TRANS;
}
# repeats covering exons
if ($evidence->{'covered_exons'} &&
$evidence->{'covered_exons'} >= $self->PS_MAX_EXON_COVERAGE ) {
push@{$trans_type{'repeat'}}, $transcript;
next TRANS;
}
if ($self->INDETERMINATE){
# Tests for genes that look odd to run PSILC over
# anything with any frameshifts above the cutoff or
# filled introns but lacking the other criteria
if ($evidence->{'frameshift_introns'} >= $self->PS_NUM_FRAMESHIFT_INTRONS or
$evidence->{'covered_introns'} >= $self->PS_MAX_INTRON_COVERAGE ) {
push @{$trans_type{'indeterminate'}}, $transcript;
print STDERR $gene->dbID." - looks a little dodgy ".$transcript->dbID."\n" if $self->DEBUG;
print STDERR join (', ',%{$evidence}),"\n" if $self->DEBUG;
next TRANS;
}
}
# Tests for situation where 2 exon gene has a protein feasture
# covering intron, ie it has spliced around somthing bad...
if ($evidence->{'real_introns'} == 1) {
my $judgement = $self->protein_covered_intron($transcript,$gene);
if ($judgement eq "dodgy"){
push @{$trans_type{'pseudo'}}, $transcript;
next TRANS;
}
}
# transcript passes all tests, it is real
push @{$trans_type{'real'}}, $transcript;
push @{$trans_type{'not_multi_exon'}}, $transcript
if scalar @{$transcript->get_all_Exons} < $self->PS_MIN_EXONS;
}
#########################################
# gene tests
#########################################
# All transcripts have only one exon
# put them to be stored for spliced elsewhere
# if specified in config
# Dont store IG segment genes, just protein codinf
if ($self->SINGLE_EXON){
if ($trans_type{'single_exon'}){
if ( $gene->biotype eq $self->PS_BIOTYPE ){
unless ($trans_type{'pseudo'} or
$trans_type{'indeterminate'} or
$trans_type{'real'} or
$trans_type{'repeat'}) {
$self->single_exon_genes($gene->dbID);
next GENE;
}
}
}
}
#################################################################
# gene is pseudogene, all transcripts are pseudo
# set type to pseudogene chuck away all but the longest transcript
if ($trans_type{'pseudo'}){
unless ($trans_type{'single_exon'} or
$trans_type{'indeterminate'} or
$trans_type{'real'} or
$trans_type{'repeat'}) {
$gene->biotype($self->PS_PSEUDO_TYPE);
my @pseudo_trans = @{$trans_type{'pseudo'}};
@pseudo_trans = sort {$a->length <=> $b->length} @pseudo_trans;
my $only_transcript_to_keep = pop @pseudo_trans;
$self->transcript_to_keep($only_transcript_to_keep);
foreach my $pseudo_transcript (@pseudo_trans) {
my $blessed = $self->_remove_transcript_from_gene($gene,$pseudo_transcript);
if ( $blessed ) {
print STDERR "Blessed transcript " . $pseudo_transcript->display_id .
" is a pseudo transcript \n";
}
}
$self->modified_genes($gene);
$self->pseudogenes(1);
next GENE;
}
}
##################################################
# gene is indeterminate, if it has no real
# transcripts - all either pseudo or indeterminate
# dont add gene to list to return instead get its
# dbID for later
if ($self->INDETERMINATE){
if ($trans_type{'indeterminate'}){
unless ($trans_type{'real'} or
$trans_type{'single_exon'} or
$trans_type{'repeat'} ){
$self->indeterminate_genes($gene->dbID);
next GENE;
}
}
}
###############################################
# gene is real but has some dodgy transcripts
# delete the dodgy transcripts from the gene
# do you want to throw away indeterminate transcripts?
# - not at the moment
if ($trans_type{'pseudo'}){
if ($trans_type{'real'} or
$trans_type{'single_exon'} or
$trans_type{'indeterminate'} or
$trans_type{'repeat'}){
foreach my $trans (@{$trans_type{'pseudo'}}) {
my $blessed = $self->_remove_transcript_from_gene($gene,$trans);
if ( $blessed ) {
print STDERR "Blessed transcript " . $trans->display_id .
" is a pseudo transcript \n";
}
}
$self->modified_genes($gene);
$self->multi_exon_genes($gene) unless $trans_type{'not_multi_exon'};
$self->pseudogenes(1);
next GENE;
}
}
####################################
# gene and transcripts are real
unless ($trans_type{'pseudo'} or $trans_type{'repeat'}) {
$self->modified_genes($gene);
$self->multi_exon_genes($gene) unless $trans_type{'not_multi_exon'};
$self->real(1);
next GENE;
}
if ($trans_type{'repeat'}) {
# if a repeat type is specified label the gene as a repeat and store it
# otherwise we can just leave it - it wont get written to the final db
if ($self->PS_REPEAT_TYPE) {
my @pseudo_trans = @{$gene->get_all_Transcripts};
@pseudo_trans = sort {$a->length <=> $b->length} @pseudo_trans;
my $only_transcript_to_keep = pop @pseudo_trans;
$self->transcript_to_keep($only_transcript_to_keep);
foreach my $pseudo_transcript (@pseudo_trans) {
my $blessed = $self->_remove_transcript_from_gene($gene,$pseudo_transcript);
if ( $blessed ) {
print STDERR "Blessed transcript " . $pseudo_transcript->display_id .
" is covered with repeats \n";
}
}
$gene->biotype($self->PS_REPEAT_TYPE);
$self->modified_genes($gene);
$self->repeatgenes(1);
} else {
print STDERR "Gene " . $gene->display_id . " is covered with repeats - ignoring it\n";
$self->repeatgenes(1);
}
next GENE;
}
#########################################
# should be nothing left after this point
# might be good to catch anything left and flag it as weird
$self->overlooked_genes(\%trans_type);
}
return 1;
}
=head2 transcript_evidence
Arg [none] : Bio::EnsEMBL::Transcript
Description: Test individual transcripts return a hash containing results evidence
Returntype : hash
Exceptions : none
Caller : general
=cut
##############################################################
#test individual transcripts return a hash containing evidence
sub transcript_evidence{
my ($self,$transcript,$gene) =@_;
my $repeat_blocks = $self->get_repeats($gene);
my $results;
my @exons = @{$transcript->get_all_Exons};
@exons = sort {$a->start <=> $b->start} @exons;
my $prev_exon = undef;
my $total_intron_len = 0;
my $covered_intron_len = 0;
my $total_exon_len = 0;
my $covered_exon_len = 0;
my $n_real_intron = 0;
my $n_frameshift_intron = 0;
my $covered_introns = 0 ;
my $covered_exons = 0 ;
foreach my $exon (@exons) {
my $seq_feature_exon = Bio::EnsEMBL::Feature->new(
-START => $exon->start,
-END => $exon->end,
-STRAND => $exon->strand
);
# Do intron
if (defined($prev_exon)) {
my $intron = Bio::EnsEMBL::Feature->new(
-START => $prev_exon->end+1,
-END => $exon->start-1,
-STRAND => $exon->strand
);
if ($intron->length > $self->PS_FRAMESHIFT_INTRON_LENGTH) {
$n_real_intron++;
} else {
$n_frameshift_intron++;
}
$total_intron_len+=$intron->length;
$covered_intron_len+=$self->_len_covered($intron,$repeat_blocks);
}
# Do exon
$total_exon_len+=$exon->length;
$covered_exon_len+=$self->_len_covered($seq_feature_exon,$repeat_blocks);
$prev_exon = $exon;
}
#calculate percentage coverage
if ($total_intron_len > 0) {
$covered_introns = (($covered_intron_len/$total_intron_len)*100);
}
if ($total_exon_len > 0) {
$covered_exons = (($covered_exon_len/$total_exon_len)*100)
}
$results = {
'num_introns' => $#exons,
'total_exon_len' => $total_exon_len,
'covered_exons' => $covered_exons,
'total_intron_len' => $total_intron_len,
'covered_introns' => $covered_introns,
'real_introns' => $n_real_intron,
'frameshift_introns' => $n_frameshift_intron
};
return $results;
}
=head2 len_covered
Args : Bio::Seq::Feature object, reference to an array of repeat blocks
Description: measures how much of the seq feature (intron or exon) is covered by repeat blocks
Returntype : scalar
=cut
sub _len_covered {
my ($self,$feat,$repeat_blocks_ref) = @_;
# print STDERR "FT " . $feat->start . " " . $feat->end . "\n";
my $covered_len = 0;
RBLOOP: foreach my $repeat_block (@$repeat_blocks_ref) {
# print STDERR "RB " . $repeat_block->start . " " . $repeat_block->end . "\n";
if ($repeat_block->overlaps($feat, 'ignore')) {
my $inter = $self->intersection($feat,$repeat_block);
$covered_len += $inter->length;
} elsif ($repeat_block->start > $feat->end) {
last RBLOOP;
}
}
return $covered_len;
}
=head2 protein_covered_intron
Args : Bio::EnsEMBL::Transcript object, Bio::EnsEMBL::Gene object
Description: decides if 'real' intron in transcript is covered with a protein feature
Returntype : scalar
=cut
sub protein_covered_intron{
my ($self,$transcript,$gene) =@_;
my %seq_features;
my $identified;
my @all_exons = @{$transcript->get_all_Exons};
@all_exons = sort {$a->start <=> $b->start} @all_exons;
my @exons;
# find real intron
EXON: for (my $i = 1 ; $i <= $#all_exons ; $i++){
my $intron_length = $all_exons[$i]->start - $all_exons[$i-1]->end;
if ($intron_length > 9) {
# real intron
push @exons , $all_exons[$i-1];
push @exons , $all_exons[$i];
last EXON;
}
}
$self->throw("real intron not found for gene " . $gene->dbID . " exons : @all_exons\n") unless (scalar(@exons) == 2);
my @exon_features = @{$exons[0]->get_all_supporting_features};
push @exon_features,@{$exons[1]->get_all_supporting_features};
########################################
# make a seq feature represening the intron
my $intron = Bio::EnsEMBL::Feature->new(
-START => $exons[0]->end+1,
-END => $exons[1]->start-1,
-STRAND => $exons[0]->strand
);
if (@exon_features) {
###########################################################
# get featues off both exons, split them into ungapped
# sections and test them against the intron
# feature see if there is an overlap (80% by default)
# need to group features by sequence name
FEATURES: foreach my $feat (@exon_features) {
my @sub_features = $feat->ungapped_features;
foreach my $subfeature (@sub_features) {
my $seq_feature = Bio::EnsEMBL::Feature->new(
-START => $subfeature->start,
-END => $subfeature->end,
-STRAND => $subfeature->strand
);
push @{$seq_features{$feat->hseqname}}, $seq_feature;
}
}
}
foreach my $key (keys %seq_features){
if ($intron && scalar(@{$seq_features{$key}}) > 0) {
my @features = @{$seq_features{$key}};
my @feature_blocks;
#########################################################
# merge overlapping features together for protein features
# grouped by name
my $curblock = undef;
@features = sort {$a->start <=> $b->start} @features;
foreach my $feature (@features){
if (defined($curblock) && $curblock->end >= $feature->start) {
if ($feature->end > $curblock->end) {
$curblock->end($feature->end);
}
} else {
$curblock = Bio::EnsEMBL::Feature->new(
-START => $feature->start,
-END => $feature->end,
-STRAND => $feature->strand
);
push (@feature_blocks,$curblock);
}
}
my $coverage = $self->_len_covered($intron,\@feature_blocks)."\n";
if ($coverage/$intron->length*100 > $self->PS_MAX_INTRON_COVERAGE) {
$identified++;
print STDERR $transcript->dbID." two exon with $key covering intron ".$coverage/$intron->length*100 . "%.\t";
# need more than one peice of protein evidence to make the call
if ($identified >1){
print STDERR "\ncalling ".$transcript->dbID." as having a protein covered intron\n";
return "dodgy";
}
else{
print "need another piece of evidence though....\n";
}
}
}
}
return 1;
}
=head2 remove_transcript_from_gene
Args : Bio::EnsEMBL::Gene object , Bio::EnsEMBL::Transcript object
Description: steves method for removing unwanted transcripts from genes
Returntype : scalar
=cut
sub _remove_transcript_from_gene {
my ($self, $gene, $trans_to_del) = @_;
# transcript is blessed dont delete it
return 'BLESSED' if $self->BLESSED_BIOTYPES->{$trans_to_del->biotype};
$self->discarded_transcripts($trans_to_del);
$trans_to_del->translation(undef);
my @newtrans;
foreach my $trans (@{$gene->get_all_Transcripts}) {
if ($trans != $trans_to_del) {
push @newtrans,$trans;
}
}
# The naughty bit!
$gene->{_transcript_array} = [];
foreach my $trans (@newtrans) {
$gene->add_Transcript($trans);
}
return ;
}
=head2 transcript_to_keep
Args : Bio::EnsEMBL::Transcript object
Description: removes the translation provided it is not a blessed transcript
Returntype : scalar
=cut
sub transcript_to_keep {
my ($self, $trans_to_keep) = @_;
return if $self->BLESSED_BIOTYPES->{$trans_to_keep->biotype};
$trans_to_keep->translation(undef);
return;
}
=head2 intersection
Arg [none] :
Description: returns length of intersecion of two features
Returntype : scalar
Exceptions : throws if not given two Bio::EsEMBL::Feature objects
Caller : len_covered
=cut
sub intersection {
my ($self,$feat1,$feat2) = @_;
unless ($feat1->isa("Bio::EnsEMBL::Feature") && $feat2->isa("Bio::EnsEMBL::Feature")){
$self->throw("object is not a Bio::EnsEMBL::Feature they are feat1 $feat1, feat2 $feat2\n$@\n");
}
my @start = sort {$a<=>$b}
($feat1->start(), $feat2->start());
my @end = sort {$a<=>$b}
($feat1->end(), $feat2->end());
my $start = pop @start;
my $end = shift @end;
if($start > $end) {
return undef;
} else {
return Bio::EnsEMBL::Feature->new('-start' => $start,
'-end' => $end,
'-strand' => $feat1->strand
);
}
}
=head2 output
Arg [none] :
Description: returns output of running Pseudogene
Returntype : @{Bio::EnsEMBL::Gene}
Exceptions : none
Caller : general
=cut
sub output {
my ($self) = @_;
return $self->modified_genes;
}
=head2 write_seq_array
Arg [1] : Bio::EnsEMBL::Analysis::Runnable
Arg [2] : Array of Bio::Ensembl::Gene objects
Arg [3] : filename
Function : This uses Bio::SeqIO to dump a transcript sequence to a fasta file
Returntype: string, filename
Exceptions: throw if failed to write sequence
Example :
=cut
sub write_seq_array{
my ($self, $genes, $filename) = @_;
return 0 unless (scalar(@{$genes}>0));
if(!$filename){
$self->throw("FAILED to write genes - no filename");
}
my $seqout = Bio::SeqIO->new(
-file => ">".$filename,
-format => 'Fasta',
);
eval{
foreach my $gene (@{$genes}){
foreach my $transcript (@{$gene->get_all_Transcripts}){
next unless ( $transcript->translateable_seq );
$seqout->write_seq($transcript->seq);
}
}
};
if($@){
$self->throw("FAILED to write to $filename Runnable:write_seq_file\n\n$@");
}
return $filename;
}
#################################################################################
# Container methods
=head2 modified_genes
Arg [1] : array ref
Description: get/set modified gene set to return
Returntype : array ref to Bio::EnsEMBL::Gene objects
Exceptions : throws if not a Bio::EnsEMBL::Gene
Caller : general
=cut
sub modified_genes {
my ($self, $modified_genes) = @_;
if ($modified_genes) {
unless ($modified_genes->isa("Bio::EnsEMBL::Gene")){
$self->throw("Input isn't a Bio::EnsEMBL::Gene, it is a $modified_genes\n$@");
}
push @{$self->{'_modified_genes'}}, $modified_genes;
}
return $self->{'_modified_genes'};
}
=head2 multi_exon_genes
Arg [1] : array ref
Description: get/set multi exon gene set to return
Returntype : array ref to Bio::EnsEMBL::Gene objects
Exceptions : throws if not a Bio::EnsEMBL::Gene
Caller : general
=cut
sub multi_exon_genes {
my ($self, $multi_exon_genes) = @_;
if ($multi_exon_genes) {
unless ($multi_exon_genes->isa("Bio::EnsEMBL::Gene")){
$self->throw("Input isn't a Bio::EnsEMBL::Gene, it is a $multi_exon_genes\n$@");
}
push @{$self->{'_multi_exon_genes'}}, $multi_exon_genes;
}
return $self->{'_multi_exon_genes'};
}
=head2 discarded transcripts
Arg [1] : array ref
Description: get/set modified gene set to throw away
Returntype : array ref to Bio::EnsEMBL::Gene objects
Exceptions : throws if not a Bio::EnsEMBL::Gene
Caller : general
=cut
sub discarded_transcripts {
my ($self, $discarded_transcripts) = @_;
if ( $discarded_transcripts) {
unless ($discarded_transcripts->isa("Bio::EnsEMBL::Transcript")){
$self->throw("Input isn't a Bio::EnsEMBL::Gene, it is a $discarded_transcripts\n$@");
}
push @{$self->{'_discarded_transcripts'}}, $discarded_transcripts;
}
return $self->{'_discarded_transcripts'};
}
=head2 genes
Arg [1] : array ref
Description: get/set gene set to run over
Returntype : array ref to Bio::EnsEMBL::Gene objects
Exceptions : throws if not a Bio::EnsEMBL::Gene
Caller : general
=cut
sub genes {
my ($self, $genes) = @_;
if ($genes) {
foreach my $gene (@{$genes}) {
unless ($gene->isa("Bio::EnsEMBL::Gene")){
$self->throw("Input isn't a Bio::EnsEMBL::Gene, it is a $gene\n$@");
}
}
$self->{'_genes'} = $genes;
}
return $self->{'_genes'};
}
=head2 repeats
Arg [1] : array ref
Description: set repeat set to test genes against
Returntype : none
Exceptions : throws if not a Bio::EnsEMBL::SeqFeature
Caller : general
=cut
sub repeats {
my ($self, $repeats) = @_;
foreach my $repeat_array (values %{$repeats}) {
foreach my $repeat (@{$repeat_array}) {
unless ($repeat->isa("Bio::EnsEMBL::Feature")){
$self->throw("Input is not a Bio::EnsEMBL::Feature, it is a $repeat");
}
}
}
$self->{'_repeats'} = $repeats;
return 1;
}
=head2 get_repeats
Arg [1] : array ref
Description: get repeat array using a gene object as the key
Returntype : array ref to Bio::EnsEMBL::SeqFeature objects
Exceptions : warns if no values corresponding to key
Caller : general
=cut
sub get_repeats {
my ($self, $gene) = @_;
return $self->{'_repeats'}->{$gene};
}
=head2 real
Arg [1] : none
Description: scalar number of real genes found
Returntype : scalar integer
Exceptions : none
Caller : general
=cut
sub real {
my ($self,$num) = @_;
if ($num){
$self->{'_real'} += $num;
}
return $self->{'_real'};
}
=head2 pseudogenes
Arg [1] : none
Description: scalar number of pseudogenes found
Returntype : scalar integer
Exceptions : none
Caller : general
=cut
sub pseudogenes{
my ($self,$num) = @_;
if ($num){
$self->{'_pseudogenes'}+= $num;
}
return $self->{'_pseudogenes'};
}
sub repeatgenes{
my ($self,$num) = @_;
if ($num){
$self->{'_repeatgenes'}+= $num;
}
return $self->{'_repeatgenes'};
}
sub indeterminate_genes{
my ($self,$input) = @_;
if ($input){
push @{$self->{'_indeterminate_genes'}}, $input;
}
return $self->{'_indeterminate_genes'};
}
sub single_exon_genes{
my ($self,$input) = @_;
if ($input){
push @{$self->{'_single_exon_genes'}}, $input;
}
return $self->{'_single_exon_genes'};
}
sub overlooked_genes{
my ($self,$input) = @_;
if ($input){
push @{$self->{'_overlooked_genes'}}, $input;
}
return $self->{'_overlooked_genes'};
}
sub PS_FRAMESHIFT_INTRON_LENGTH{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_FRAMESHIFT_INTRON_LENGTH'} = $arg;
}
return $self->{'PS_FRAMESHIFT_INTRON_LENGTH'};
}
sub PS_MAX_INTRON_LENGTH{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_MAX_INTRON_LENGTH'} = $arg;
}
return $self->{'PS_MAX_INTRON_LENGTH'};
}
sub PS_REPEAT_TYPES{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_REPEAT_TYPES'} = $arg;
}
return $self->{'PS_REPEAT_TYPES'};
}
sub PS_MAX_INTRON_COVERAGE{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_MAX_INTRON_COVERAGE'} = $arg;
}
return $self->{'PS_MAX_INTRON_COVERAGE'};
}
sub PS_MAX_EXON_COVERAGE{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_MAX_EXON_COVERAGE'} = $arg;
}
return $self->{'PS_MAX_EXON_COVERAGE'};
}
sub PS_NUM_FRAMESHIFT_INTRONS {
my ($self, $arg) = @_;
if($arg){
$self->{'PS_NUM_FRAMESHIFT_INTRONS'} = $arg;
}
return $self->{'PS_NUM_FRAMESHIFT_INTRONS'};
}
sub PS_NUM_REAL_INTRONS{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_NUM_REAL_INTRONS'} = $arg;
}
return $self->{'PS_NUM_REAL_INTRONS'};
}
sub SINGLE_EXON{
my ($self, $arg) = @_;
if($arg){
$self->{'SINGLE_EXON'} = $arg;
}
return $self->{'SINGLE_EXON'};
}
sub INDETERMINATE{
my ($self, $arg) = @_;
if($arg){
$self->{'INDETERMINATE'} = $arg;
}
return $self->{'INDETERMINATE'};
}
sub PS_MIN_EXONS{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_MIN_EXONS'} = $arg;
}
return $self->{'PS_MIN_EXONS'};
}
sub PS_MULTI_EXON_DIR{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_MULTI_EXON_DIR'} = $arg;
}
return $self->{'PS_MULTI_EXON_DIR'};
}
sub BLESSED_BIOTYPES{
my ($self, $arg) = @_;
if($arg){
$self->{'BLESSED_BIOTYPES'} = $arg;
}
return $self->{'BLESSED_BIOTYPES'};
}
sub PS_BIOTYPE{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_BIOTYPE'} = $arg;
}
return $self->{'PS_BIOTYPE'};
}
sub PS_PSEUDO_TYPE{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_PSEUDO_TYPE'} = $arg;
}
return $self->{'PS_PSEUDO_TYPE'};
}
sub PS_REPEAT_TYPE{
my ($self, $arg) = @_;
if($arg){
$self->{'PS_REPEAT_TYPE'} = $arg;
}
return $self->{'PS_REPEAT_TYPE'};
}
sub DEBUG{
my ($self, $arg) = @_;
if($arg){
$self->{'DEBUG'} = $arg;
}
return $self->{'DEBUG'};
}
1;