ensembl-analysis
TranscriptChecker
Toolbar
Summary
Bio::EnsEMBL::Test::TranscriptChecker
Package variables
No package variables defined.
Included modules
Inherit
Checker
Synopsis
Module to check the validity of a transcript
Description
Performs various checks on a Bio::EnsEMBL::Transcript object. These include:
1. Transcripts with no exons
2. Transcripts with more than a specified maximum number of exons
3. Exon rank ordering is correct
4. Exons don't overlap or contain other exons
5. Transcript translates
6. Translation is longer than a specified minimum length
7. Short or long introns
8. Introns with non GT .. AG splice sites (warnings)
9. Short or long exons
10. ATG immediately after 5' UTR (warning)
11. There are supporting features for every exon (warning)
12. The translation is not all X
This class does not use stable_ids but instead dbIDs because stable_ids are
not set until after the gene build.
Methods
adaptor | No description | Code |
check | No description | Code |
check_Intron | No description | Code |
check_Phases | No description | Code |
check_Structure | No description | Code |
check_Supporting_Evidence | No description | Code |
check_Translation | No description | Code |
check_UTRs | No description | Code |
get_initial_frame | No description | Code |
maxexonstranscript | No description | Code |
maxshortexonlen | No description | Code |
maxshortintronlen | No description | Code |
minlongexonlen | No description | Code |
minlongintronlen | No description | Code |
minshortexonlen | No description | Code |
minshortintronlen | No description | Code |
mintranslationlen | No description | Code |
new | No description | Code |
output | No description | Code |
print_raw_data | No description | Code |
slice | No description | Code |
transcript | Description | Code |
Methods description
Title : transcript Usage : $obj->transcript($newval) Function: Returns : value of transcript Args : newvalue (optional) |
Methods code
sub adaptor
{ my ( $self, $arg ) = @_;
if( defined $arg ) {
$self->{_adaptor} = $arg;
}
return $self->{_adaptor}; } |
sub check
{ my $self = shift;
my $transcript = $self->transcript;
my @exons = @{$transcript->get_all_Exons()};
my $numexon = scalar(@exons);
if ($numexon == 0) {
$self->add_Error("No exons\n", 'noexons');
return;
} elsif ($numexon > $self->maxexonstranscript) {
$self->add_Error("Unusually large number of exons (" .
$numexon . ")\n", 'manyexons');
}
my @sortedexons;
my $transstart;
my $transend;
my $vcoffset = 0;
if ($self->slice) {
$vcoffset = $self->slice->start - 1;
}
if ($exons[0]->strand == 1) {
@sortedexons = sort {$a->start <=> $b->start} @exons ;
$transstart = ($sortedexons[0]->start+$vcoffset);
$transend = ($sortedexons[$#sortedexons]->end+$vcoffset);
} else {
@sortedexons = sort {$b->start <=> $a->start} @exons;
$transstart = ($sortedexons[$#sortedexons]->start+$vcoffset);
$transend = ($sortedexons[0]->end+$vcoffset);
}
my $exnum = 0;
EXON: foreach my $exon (@sortedexons) {
if ($exon != $exons[$exnum++]) {
$self->add_Error("Incorrect exon ranks (first at exon " .
$exons[$exnum++]->dbID . ")\n".
"NOTE: Further error checking (except translations) " .
"done with resorted exons.\n" ,'exonrank');
last EXON;
}
}
$self->check_Structure(\@sortedexons, $transstart, $transend);
$self->check_Phases;
if (defined($self->transcript->translation)) {
$self->check_Translation;
$self->check_UTRs(\@sortedexons);
}
$self->check_Supporting_Evidence(\@sortedexons); } |
sub check_Intron
{ my $self = shift;
my $prev_exon = shift || $self->throw("Prev_exon must be passed");
my $exon = shift || $self->throw("Exon must be passed");
if ($prev_exon->overlaps($exon)) {
$self->add_Error("Overlapping exons in transcript for exons " . $prev_exon->dbID .
" and " . $exon->dbID . "\n", 'overlapexon');
return;
}
my $intron;
if (Bio::EnsEMBL::Intron->can('upstream_Exon')) {
$intron = new Bio::EnsEMBL::Intron;
$intron->upstream_Exon($prev_exon);
$intron->downstream_Exon($exon);
} else {
$intron = new Bio::EnsEMBL::Intron($prev_exon, $exon);
}
my $intlen = $intron->length;
if ($intlen >= $self->minshortintronlen &&
$intlen <= $self->maxshortintronlen) {
$self->add_Error("Short intron (" . $intlen .
" bases) between exons " . $prev_exon->dbID .
" and " . $exon->dbID . "\n", 'shortintron');
} elsif ($intlen >= $self->minlongintronlen) {
$self->add_Error("Long intron (" . $intlen . ") between exons " .
$prev_exon->dbID . " and " . $exon->dbID .
"\n", 'longintron');
}
if ($intlen >= 2) {
my $intseq = $intron->seq;
if (ref($intseq)) {
$intseq = $intseq->seq;
}
if (substr($intseq,0,2) ne "GT") {
$self->add_Warning("Non consensus 5' intron splice site sequence (".
substr($intseq,0,2) . ") after exon " .
$prev_exon->dbID . " (intron length = " . $intlen .
")\n", 'noncons5');
}
if (substr($intseq,-2,2) ne "AG") {
$self->add_Warning("Non consensus 3' intron splice site sequence (".
substr($intseq,-2,2) .
") before exon " . $exon->dbID . " (intron length = ".
$intlen . ")\n", 'noncons3');
}
} } |
sub check_Phases
{ my $self = shift;
if (defined($self->transcript->translation)) {
my @exons = @{$self->transcript->get_all_translateable_Exons};
my $prev_exon = $exons[0];
my $start_phase = $prev_exon->phase;
if ($start_phase == -1) {
$start_phase = 0;
}
my $zero_phase_count = 0;
if ($start_phase == 0) {
$zero_phase_count++;
}
my @calc_phases;
my $phase = $start_phase;
for (my $i=0; $i<scalar(@exons);$i++) {
push @calc_phases, $phase;
$phase = (($exons[$i]->length + $phase) % 3);
}
for (my $i=1; $i<scalar(@exons);$i++) {
my $exon=$exons[$i];
if ($exon->phase == 0) {
$zero_phase_count++;
}
if ($exon->phase != $prev_exon->end_phase) {
$self->add_Error("EndPhase/Phase mismatch (" . $prev_exon->end_phase . " and " . $exon->phase .
") between exons " . $prev_exon->dbID . " and " . $exon->dbID .
"\n", 'endphase_phase');
}
if ($exon->phase != $calc_phases[$i]) {
$self->add_Error("Phase wrong for " . $exon->dbID . " is " . $exon->phase . " should be " .
$calc_phases[$i] . "\n", 'startphase');
}
my $calc_endphase = (($exon->length + $calc_phases[$i]) % 3);
if ($exon->end_phase != $calc_endphase && $i != $#exons) {
$self->add_Error("EndPhase wrong for " . $exon->dbID . " is " . $exon->end_phase . " should be " .
$calc_endphase . "\n", 'endphase');
}
$prev_exon = $exon;
}
if ($zero_phase_count/scalar(@exons) < 0.20 && scalar(@exons) > 8) { $self->add_Warning("Suspiciously high number of non phase zero exon (num phase zero = $zero_phase_count nexon = " . scalar(@exons).")\n"); }
} } |
sub check_Structure
{ my $self = shift;
my $sortedexons = shift;
my $transstart = shift;
my $transend = shift;
my $prev_exon = undef;
my $totalexonlen = 0;
foreach my $exon (@$sortedexons) {
my $exlen = $exon->length;
$totalexonlen+=$exlen;
if ($exlen >= $self->minshortexonlen &&
$exlen <= $self->maxshortexonlen) {
$self->add_Error("Short exon (" . $exlen .
" bases) for exon " . $exon->dbID . "\n", 'shortexon');
} elsif ($exon->length >= $self->minlongexonlen) {
$self->add_Error("Long exon (" . $exlen .
" bases) for exon " . $exon->dbID . "\n", 'longexon');
}
if (defined $prev_exon) {
if ($exon->strand != $prev_exon->strand) {
$self->add_Error("Exons on different strands\n",'mixedstrands');
} elsif (($exon->strand == 1 && $exon->start < $prev_exon->end) ||
($exon->strand == -1 && $exon->end > $prev_exon->start)) {
$self->add_Error("Incorrect exon ordering (maybe duplicate exons or embedded exons)\n", 'exonorder');
} else {
$self->check_Intron($prev_exon, $exon);
}
}
$prev_exon = $exon;
}
my $exondensity = $totalexonlen/($transend-$transstart+1); if (scalar(@$sortedexons) > 1) {
if ($exondensity > 0.8) {
$self->add_Warning("Unusually high exon density (exon len/genomic len) (value $exondensity)\n");
} elsif ($exondensity < 0.005) {
$self->add_Warning("Unusually low exon density (exon len/genomic len) (value $exondensity)\n");
}
}
}
} |
sub check_Supporting_Evidence
{ my $self = shift;
my $exons = shift;
EXON: foreach my $exon (@$exons) {
if (!scalar(@{$exon->get_all_supporting_features})) {
}
} } |
sub check_Translation
{ my $self = shift;
my $pepseq = undef;
return if (!defined($self->transcript->translation));
eval {
$pepseq = $self->transcript->translate;
};
if (defined($pepseq)) {
my $pepseqstr = $pepseq->seq;
my $peplen = length($pepseqstr);
if ($pepseqstr =~ /\*/) {
$self->add_Error("Translation failed - Translation contains stop codons\n",'transstop');
$self->get_initial_frame;
return 1;
} elsif ($peplen == 0) {
$self->add_Error("Translation failed - Translation has zero length\n",'transzerolen');
return 1;
} elsif ($pepseqstr =~ /^X*$/) {
$self->add_Error("Translation failed - Translation is all X (probably on sequence gap)\n",'transallx');
return 1;
} elsif ($peplen < $self->mintranslationlen) {
$self->add_Error("Short (" . $peplen . " residue) translation\n",'transminlen');
}
return 0;
} else {
$self->add_Error("Translation failed.\n",'transfail');
return 1;
} } |
sub check_UTRs
{ my $self = shift;
my $exons = shift;
my $translation = $self->transcript->translation;
return if (!defined($translation));
my $rank = 0;
my $trans_start_exon = $translation->start_Exon;
my $trans_end_exon = $translation->end_Exon;
my $foundstart = 0;
my $foundend = 0;
my $start_is_atg = 0;
my $stop_is_term = 0;
EXON: foreach my $exon (@$exons) {
if ($exon == $trans_start_exon) {
$foundstart = 1;
if ($translation->start > 3 || $rank > 0) {
my $startcodon = substr($exon->seq->seq,$translation->start-1,3);
if ($startcodon ne "ATG") {
$self->add_Warning("No ATG at five prime of transcript CDS with UTR (has $startcodon)\n");
} else {
$start_is_atg = 1;
}
my $len = $exon->length;
my $end_phase = $exon->end_phase;
if ($exon == $trans_end_exon) { $len = $translation->end; }
if ($end_phase == -1) { $end_phase = 0; }
if (($len - $translation->start + 1 + ((3-$end_phase)%3)) % 3) {
$self->add_Warning("Translation start not on codon boundary (or end_phase error on first coding exon)\n");
if ($start_is_atg) {
$self->add_Error("ATG as first three bases in case where first codon supposedly incomplete\n");
}
}
}
}
if ($exon == $trans_end_exon) {
$foundend = 1;
if ($translation->end < ($exon->length-2) || $rank < scalar(@$exons) - 1) {
my $stopcodon = substr($exon->seq->seq,$translation->end-3,3);
if ($stopcodon !~ /TAA|TAG|TGA/) {
$self->add_Warning("No TAA, TAG or TGA at three prime end of transcript CDS with UTR (has $stopcodon)\n");
} else {
$stop_is_term = 1;
}
}
}
$rank++;
}
if (!$foundstart) {
$self->add_Error("Didn't find translation->start_exon (" .
$trans_start_exon->dbID . ")\n");
}
if (!$foundend) {
$self->add_Error("Didn't find translation->end_exon (" .
$trans_end_exon->dbID . ")\n");
}
my $cdslen = 0;
foreach my $exon (@{$self->transcript->get_all_translateable_Exons}) {
$cdslen += $exon->length;
}
if ($cdslen%3) {
if ($start_is_atg && $stop_is_term) {
$self->add_Error("CDS length not multiple of 3 in transcript with ATG->Stop\n");
} else {
$self->add_Warning("CDS length not multiple of 3\n");
}
} } |
sub get_initial_frame
{ my ($self) = @_;
my $trans = $self->transcript;
$trans->sort;
my $cdna = new Bio::Seq(-seq => $trans->translateable_seq);
foreach my $frame (0 .. 2) {
my $pepseq = $cdna->translate(undef, undef, $frame)->seq;
chop $pepseq;
if ($pepseq !~ /\*/) {
$self->add_Error("Frame $frame does translate\n",'wrongframe');
return $frame;
}
}
return 0; } |
sub maxexonstranscript
{ my ( $self, $arg ) = @_;
if (defined $arg) {
$self->{_maxexonstranscript} = $arg;
}
return $self->{_maxexonstranscript}; } |
sub maxshortexonlen
{ my ( $self, $arg ) = @_;
if (defined $arg) {
$self->{_maxshortexonlen} = $arg;
}
return $self->{_maxshortexonlen}; } |
sub maxshortintronlen
{ my ( $self, $arg ) = @_;
if (defined $arg) {
$self->{_maxshortintronlen} = $arg;
}
return $self->{_maxshortintronlen}; } |
sub minlongexonlen
{ my ( $self, $arg ) = @_;
if (defined $arg) {
$self->{_minlongexonlen} = $arg;
}
return $self->{_minlongexonlen}; } |
sub minlongintronlen
{ my ( $self, $arg ) = @_;
if (defined $arg) {
$self->{_minlongintronlen} = $arg;
}
return $self->{_minlongintronlen}; } |
sub minshortexonlen
{ my ( $self, $arg ) = @_;
if (defined $arg) {
$self->{_minshortexonlen} = $arg;
}
return $self->{_minshortexonlen}; } |
sub minshortintronlen
{ my ( $self, $arg ) = @_;
if (defined $arg) {
$self->{_minshortintronlen} = $arg;
}
return $self->{_minshortintronlen}; } |
sub mintranslationlen
{ my ( $self, $arg ) = @_;
if (defined $arg) {
$self->{_mintranslationlen} = $arg;
}
return $self->{_mintranslationlen}; } |
sub new
{ my ($class, @args) = @_;
my $self = bless {},$class;
my ( $transcript, $minshortintronlen, $maxshortintronlen, $minlongintronlen,
$minshortexonlen, $maxshortexonlen, $minlongexonlen, $maxexonstranscript,
$mintranslationlen, $ignorewarnings, $slice, $adaptor ) = rearrange
( [ qw ( TRANSCRIPT
MINSHORTINTRONLEN
MAXSHORTINTRONLEN
MINLONGINTRONLEN
MINSHORTEXONLEN
MAXSHORTEXONLEN
MINLONGEXONLEN
MAXEXONTRANSCRIPT
MINTRANSLATIONLEN
IGNOREWARNINGS
SLICE
ADAPTOR
)], @args );
if( !defined $transcript ) {
$self->throw("Transcript must be set in new for TranscriptChecker")
}
$self->transcript($transcript);
if( defined $minshortintronlen ) {
$self->minshortintronlen( $minshortintronlen );
} else {
$self->minshortintronlen(3);
}
if( defined $maxshortintronlen ) {
$self->maxshortintronlen( $maxshortintronlen );
} else {
$self->maxshortintronlen(50);
}
if( defined $minlongintronlen ) {
$self->minlongintronlen( $minlongintronlen );
} else {
$self->minlongintronlen(100000);
}
if( defined $minshortexonlen ) {
$self->minshortexonlen( $minshortexonlen );
} else {
$self->minshortexonlen(3);
}
if( defined $maxshortexonlen ) {
$self->maxshortexonlen( $maxshortexonlen );
} else {
$self->maxshortexonlen(10);
}
if( defined $minlongexonlen ) {
$self->minlongexonlen( $minlongexonlen );
} else {
$self->minlongexonlen(50000);
}
if( defined $mintranslationlen ) {
$self->mintranslationlen( $mintranslationlen );
} else {
$self->mintranslationlen(10);
}
if( defined $maxexonstranscript) {
$self->maxexonstranscript( $maxexonstranscript );
} else {
$self->maxexonstranscript(150);
}
if( defined $ignorewarnings) { $self->ignorewarnings($ignorewarnings); }
if( defined $slice) { $self->slice($slice); }
if( defined $adaptor ) { $self->adaptor( $adaptor )}
$self->{_errors} = [];
$self->{_warnings} = [];
return $self; } |
sub output
{ my $self = shift;
my $transcript = $self->transcript;
print "\n===\n";
print "Transcript " . $transcript->dbID . " " .
(defined($transcript->stable_id) ? $transcript->stable_id : "") . "\n";
my $vcoffset = 0;
if ($self->slice) {
$vcoffset = $self->slice->start - 1;
}
my $translation = $transcript->translation;
printf " %9s %9s %9s %9s %9s %9s %9s\n", "dbID","Start","End","Strand","Length","Phase","EndPhase";
foreach my $exon (@{$transcript->get_all_Exons()}) {
printf " Exon %9d %9d %9d %9d %9d %9d %9d", $exon->dbID, $exon->start + $vcoffset, $exon->end + $vcoffset,
$exon->strand, ($exon->end-$exon->start+1), $exon->phase, $exon->end_phase;
if (defined($translation)) {
if ($exon == $translation->start_Exon) {
print " S (" . $translation->start . " (" .
(($exon->strand == 1 ? $exon->start+$translation->start-1 : $exon->end-$translation->start+1) + $vcoffset) ."))";
}
if ($exon == $translation->end_Exon) {
print " E (" . $translation->end . " (" .
(($exon->strand == 1 ? $exon->start+$translation->end-1 : $exon->end-$translation->end+1) + $vcoffset) ."))";
}
}
print "\n";
}
if ($translation) {
my $pepstr = $transcript->translate->seq;
$pepstr =~ s/(.{0,80})/ $1\n/g;
print "\nTranslation (" . $translation->dbID . "):\n";
print $pepstr;
}
$self->SUPER::output;
if ($self->adaptor) {
$self->print_raw_data;
} } |
sub slice
{ my $self = shift;
if( @_ ) {
my $value = shift;
if (!($value->isa('Bio::EnsEMBL::Slice'))) {
$self->throw("vc passed a non Bio::EnsEMBL::Slice object\n");
}
$self->{_slice} = $value;
}
return $self->{_slice}; } |
sub transcript
{ my $self = shift;
if( @_ ) {
my $value = shift;
if (!($value->isa('Bio::EnsEMBL::Transcript'))) {
$self->throw("transcript passed a non Bio::EnsEMBL::Transcript object\n");
}
$self->{_transcript} = $value;
}
return $self->{_transcript}; } |
General documentation
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _