Raw content of Transcript
use strict;
use warnings;
package Transcript;
#
# A set of utility methods for dealing with Interim transcripts
# and creating real transcripts out of them.
#
use StatMsg;
use InterimTranscript;
use Utils qw(print_exon);
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Utils::Exception qw(throw info);
use constant MAX_INTRON_LEN => 2e6;
#
# sanity checks the interim exons, and splits this
# interim transcript into parts
#
sub check_iexons {
my $itranscript = shift;
my $itranscript_array = shift;
my $prev_start = undef;
my $prev_end = undef;
my $transcript_seq_region = undef;
my $transcript_strand = undef;
info("checking exons for : " . $itranscript->stable_id());
my $first = 1;
foreach my $iexon (@{$itranscript->get_all_Exons}) {
if ($iexon->fail() || $iexon->is_fatal()) {
info(" failed/fatal exon, splitting transcript");
# split this transcript in two, and restart the processing
# at the beginning of the second transcript
my $first_transcript = split_itrans($itranscript, $iexon);
if ($first_transcript) {
$itranscript_array ||= [];
push @$itranscript_array, $first_transcript;
}
return check_iexons($itranscript, $itranscript_array);
}
#sanity check: expect first exon to have cdna_start = 1
if($first && $iexon->cdna_start != 1) {
print_exon($iexon);
throw("Unexpected: first exon does not have cdna_start = 1");
}
$first = 0;
# sanity check: start must be less than or equal to end
if ($iexon->end() < $iexon->start()) {
throw("Unexpected: exon start less than end:\n" .
$iexon->stable_id().": ".$iexon->start().'-'.$iexon->end());
}
# sanity check: cdna length must equal length
if($iexon->length != $iexon->cdna_end - $iexon->cdna_start + 1) {
throw("Unexpected: exon cdna length != exon length:\n" .
$iexon->stable_id().": ".$iexon->start().'-'.$iexon->end() ."\n" .
" " . $iexon->cdna_start.'-'.$iexon->cdna_end());
}
if (!defined($transcript_seq_region)) {
$transcript_seq_region = $iexon->seq_region();
} elsif ($transcript_seq_region ne $iexon->seq_region()) {
info(" scaffold span, splitting transcript");
my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT|StatMsg::SCAFFOLD_SPAN);
$itranscript->add_StatMsg($stat_msg);
my $keep_exon = 1;
my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);
if ($first_transcript) {
$itranscript_array ||= [];
push @$itranscript_array, $first_transcript;
}
return check_iexons($itranscript, $itranscript_array);
}
# watch out for exons that come in the wrong order
if((defined($prev_end) && $iexon->strand() == 1 &&
$prev_end > $iexon->start()) ||
(defined($prev_start) && $iexon->strand() == -1 &&
$prev_start < $iexon->end())) {
info(" inversion, splitting transcript");
my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::INVERT);
$itranscript->add_StatMsg($stat_msg);
my $keep_exon = 1;
my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);
if ($first_transcript) {
$itranscript_array ||= [];
push @$itranscript_array, $first_transcript;
}
return check_iexons($itranscript, $itranscript_array);
}
if (!defined($transcript_strand)) {
$transcript_strand = $iexon->strand();
} elsif ($transcript_strand != $iexon->strand()) {
info(" strand flip, splitting transcript");
my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::STRAND_FLIP);
$itranscript->add_StatMsg($stat_msg);
my $keep_exon = 1;
my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);
if ($first_transcript) {
$itranscript_array ||= [];
push @$itranscript_array, $first_transcript;
}
return check_iexons($itranscript, $itranscript_array);
}
# watch out for extremely long introns
my $intron_len = 0;
if(defined($prev_start)) {
if($iexon->strand() == 1) {
$intron_len = $iexon->start - $prev_end + 1;
} else {
$intron_len = $prev_start - $iexon->end + 1;
}
}
if($intron_len > MAX_INTRON_LEN) {
info(" very long intron, splitting transcripts");
my $keep_exon = 1;
my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);
if($first_transcript) {
push @$itranscript_array, $first_transcript;
}
return check_iexons($itranscript, $itranscript_array);
}
$prev_end = $iexon->end();
$prev_start = $iexon->start();
}
$itranscript_array ||= [];
#
# if there exons left after all the splitting,
# then add this transcript to the array
#
my $total_exons = scalar(@{$itranscript->get_all_Exons});
if ($total_exons > 0) {
push @$itranscript_array, $itranscript;
} else {
info(" no exons left in transcript");
}
return $itranscript_array;
}
#
# Splits an interim transcript into two parts by discarding
# a 'bad exon' in the middle.
#
# If the 'bad exon' was the first exon then undef is returned, otherwise
# the first transcript is returned.
#
# The passed in transcript is adjusted to become the second transcript
#
# The keep_exon argument is a flag. If true the 'bad exon' is kept as part
# of the second transcript, otherwise it is discarded.
#
sub split_itrans {
my $itrans = shift;
my $bad_exon = shift; #fulcrum exon to split on
my $keep_exon = shift;
my @remaining_exons = @{$itrans->get_all_Exons()};
my @first_exons;
my $first_trans = InterimTranscript->new();
$first_trans->stable_id($itrans->stable_id());
$first_trans->version($itrans->version);
my $cur_exon = shift(@remaining_exons);
# all the exons until the 'bad exon' are loaded into the first transcript
info("==FIRST TRANSCRIPT:\n");
while($cur_exon && $cur_exon != $bad_exon) {
print_exon($cur_exon);
push @first_exons, $cur_exon;
$first_trans->add_Exon($cur_exon);
$cur_exon = shift(@remaining_exons);
}
if(!$cur_exon) {
throw("unexpected: could not find bad exon in transcript");
}
info("==BAD EXON: ". (($keep_exon) ? 'keeping' : 'discarding'));
print_exon($bad_exon);
# keep the 'bad exon' if the flag was set
if($keep_exon) {
unshift @remaining_exons, $bad_exon;
}
# the remaining exons are reloaded into the second (original) transcript
$itrans->flush_Exons();
info("==SECOND TRANSCRIPT:\n");
foreach my $exon (@remaining_exons) {
print_exon($exon);
$itrans->add_Exon($exon);
}
#
# set the coding end/start of the first transcript
#
if(@first_exons) {
my $first_ex = $first_exons[0];
my $last_ex = $first_exons[$#first_exons];
# set the coding start and coding end to same as original transcript,
# then adjust for ommitted exons
$first_trans->cdna_coding_start($itrans->cdna_coding_start());
$first_trans->cdna_coding_end($itrans->cdna_coding_end());
if($last_ex->cdna_end() < $first_trans->cdna_coding_start() ||
$first_ex->cdna_start() > $first_trans->cdna_coding_end()) {
# entire transcript is UTR and is non-coding
$first_trans->cdna_coding_start(1);
$first_trans->cdna_coding_end(0);
}
elsif($last_ex->cdna_end() >= $first_trans->cdna_coding_start() &&
$last_ex->cdna_end() < $first_trans->cdna_coding_end()) {
# coding sequence is cut by coding end
$first_trans->cdna_coding_end($last_ex->cdna_end());
}
}
#
# adjust the coding end/start of the second transcript
#
if(@remaining_exons) {
my $first_ex = $remaining_exons[0];
my $last_ex = $remaining_exons[$#remaining_exons];
# all of the cdna coodinates need to be shifted up by the
# amount of bases used by the first transcript & bad exon
my $cdna_shift = 0;
foreach my $ex (@first_exons) {
$cdna_shift += $ex->length();
info("cdna_shift = $cdna_shift\n");
}
# if we threw away the 'bad' exon we want to take into account
# how much sequence it had,
if(!$keep_exon &&
defined($bad_exon->cdna_start()) &&
defined($bad_exon->cdna_end())) {
$cdna_shift += $bad_exon->length();
}
info("Shifting CDNA of second transcript by $cdna_shift\n");
if($cdna_shift) {
foreach my $ex (@remaining_exons) {
if(!$ex->fail()) {
$ex->cdna_start($ex->cdna_start() - $cdna_shift);
$ex->cdna_end($ex->cdna_end() - $cdna_shift);
}
}
$itrans->move_cdna_coding_start(-$cdna_shift);
$itrans->move_cdna_coding_end(-$cdna_shift);
if($itrans->cdna_coding_start() < 1) {
# transcript was split in middle of cds
$itrans->cdna_coding_start(1);
}
if($itrans->cdna_coding_end() < 1) {
#there is no coding sequence left in this transcript
$itrans->cdna_coding_end(0);
}
}
}
return (@first_exons) ? $first_trans : undef;
}
#
# creates proper ensembl transcripts and exons from interim transcripts
# and exons.
#
sub make_Transcript {
my $itrans = shift;
my $slice_adaptor = shift;
my $transcript = Bio::EnsEMBL::Transcript->new();
$transcript->stable_id($itrans->stable_id);
$transcript->version($itrans->version);
info("making final transcript for ". $itrans->stable_id);
my $translation;
# the whole translation may have been deleted
# discard translation if mrna is less than a codon in length
if($itrans->cdna_coding_end - $itrans->cdna_coding_start + 1 < 3) {
$translation = undef;
} else {
$translation = Bio::EnsEMBL::Translation->new();
$transcript->translation($translation);
}
foreach my $iexon (@{$itrans->get_all_Exons}) {
my $slice =
$slice_adaptor->fetch_by_region('chromosome', $iexon->seq_region,
undef, undef,undef, 'CHIMP1');
my $exon = Bio::EnsEMBL::Exon->new
(-START => $iexon->start(),
-END => $iexon->end(),
-STRAND => $iexon->strand(),
-PHASE => $iexon->start_phase(),
-END_PHASE => $iexon->end_phase(),
-STABLE_ID => $iexon->stable_id(),
-SLICE => $slice);
$transcript->add_Exon($exon);
# see if this exon is the start or end exon of the translation
if ($translation) {
if ($iexon->cdna_start() <= $itrans->cdna_coding_start() &&
$iexon->cdna_end() >= $itrans->cdna_coding_start()) {
my $translation_start =
$itrans->cdna_coding_start() - $iexon->cdna_start() + 1;
$translation->start_Exon($exon);
$translation->start($translation_start);
}
if ($iexon->cdna_start() <= $itrans->cdna_coding_end() &&
$iexon->cdna_end() >= $itrans->cdna_coding_end()) {
my $translation_end =
$itrans->cdna_coding_end() - $iexon->cdna_start() + 1;
$translation->end_Exon($exon);
$translation->end($translation_end);
}
}
}
if($translation && !$translation->start_Exon()) {
print STDERR "Could not find translation start exon in transcript.\n";
print STDERR "FIRST EXON:\n";
print_exon($itrans->get_all_Exons->[0]);
print STDERR "LAST EXON:\n";
print_exon($itrans->get_all_Exons->[-1], $itrans);
throw("Unexpected: Could not find translation start exon in transcript\n");
}
if($translation && !$translation->end_Exon()) {
print STDERR "Could not find translation end exon in transcript.\n";
print STDERR "FIRST EXON:\n";
print_exon($itrans->get_all_Exons->[0]);
print STDERR "LAST EXON:\n";
print_exon($itrans->get_all_Exons->[-1], $itrans);
throw("Unexpected: Could not find translation end exon in transcript\n");
}
return $transcript;
}
1;