Transcript documentation.

sub check_iexons {

  my $itranscript = shift;
  my $itranscript_array   = shift;

  my $prev_start = undef;
  my $prev_end = undef;
  my $transcript_seq_region = undef;
  my $transcript_strand     = undef;

  info("checking exons for : " . $itranscript->stable_id());

  my $first = 1;

  foreach my $iexon (@{$itranscript->get_all_Exons}) {

    if ($iexon->fail() || $iexon->is_fatal()) {
      info("  failed/fatal exon, splitting transcript");

      # split this transcript in two, and restart the processing
      # at the beginning of the second transcript
      my $first_transcript = split_itrans($itranscript, $iexon);

      if ($first_transcript) {
        $itranscript_array ||= [];
        push @$itranscript_array, $first_transcript;
      }
      return check_iexons($itranscript, $itranscript_array);
    }

    #sanity check: expect first exon to have cdna_start = 1
    if($first && $iexon->cdna_start != 1) {
      print_exon($iexon);
      throw("Unexpected: first exon does not have cdna_start = 1");
    }
    $first = 0;

    # sanity check: start must be less than or equal to end
    if ($iexon->end() < $iexon->start()) {
      throw("Unexpected: exon start less than end:\n" .
            $iexon->stable_id().": ".$iexon->start().'-'.$iexon->end());
    }

    # sanity check: cdna length must equal length
    if($iexon->length != $iexon->cdna_end - $iexon->cdna_start + 1) {
      throw("Unexpected: exon cdna length != exon length:\n" .
            $iexon->stable_id().": ".$iexon->start().'-'.$iexon->end() ."\n" .
            "                 " . $iexon->cdna_start.'-'.$iexon->cdna_end());
    }

    if (!defined($transcript_seq_region)) {
      $transcript_seq_region = $iexon->seq_region();
    } elsif ($transcript_seq_region ne $iexon->seq_region()) {

      info("  scaffold span, splitting transcript");

      my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT|StatMsg::SCAFFOLD_SPAN);
      $itranscript->add_StatMsg($stat_msg);

      my $keep_exon = 1;
      my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);

      if ($first_transcript) {
        $itranscript_array ||= [];
        push @$itranscript_array, $first_transcript;
      }
      return check_iexons($itranscript, $itranscript_array);
    }


    # watch out for exons that come in the wrong order

    if((defined($prev_end) && $iexon->strand() == 1 &&
        $prev_end > $iexon->start()) ||
       (defined($prev_start) && $iexon->strand() == -1 &&
        $prev_start < $iexon->end())) {

      info("  inversion, splitting transcript");

      my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::INVERT);
      $itranscript->add_StatMsg($stat_msg);

      my $keep_exon = 1;
      my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);

      if ($first_transcript) {
        $itranscript_array ||= [];
        push @$itranscript_array, $first_transcript;
      }
      return check_iexons($itranscript, $itranscript_array);
    }

    if (!defined($transcript_strand)) {
      $transcript_strand = $iexon->strand();
    } elsif ($transcript_strand != $iexon->strand()) {
      info("  strand flip, splitting transcript");
      my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::STRAND_FLIP);
      $itranscript->add_StatMsg($stat_msg);

      my $keep_exon = 1;
      my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);

      if ($first_transcript) {
        $itranscript_array ||= [];
        push @$itranscript_array, $first_transcript;
      }
      return check_iexons($itranscript, $itranscript_array);
    }

    # watch out for extremely long introns
    my $intron_len = 0;

    if(defined($prev_start)) {
      if($iexon->strand() == 1) {
        $intron_len = $iexon->start - $prev_end + 1;
      } else {
        $intron_len = $prev_start - $iexon->end + 1;
      }
    }

    if($intron_len > MAX_INTRON_LEN) {
      info("  very long intron, splitting transcripts");
      my $keep_exon = 1;
      my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);

      if($first_transcript) {
        push @$itranscript_array, $first_transcript;
      }
      return check_iexons($itranscript, $itranscript_array);
    }

    $prev_end = $iexon->end();
    $prev_start = $iexon->start();
  }

  $itranscript_array ||= [];

  #
  # if there exons left after all the splitting,
  # then add this transcript to the array
  #
  my $total_exons = scalar(@{$itranscript->get_all_Exons});
  if ($total_exons > 0) {
    push @$itranscript_array, $itranscript;
  } else {
    info("  no exons left in transcript");
  }

  return $itranscript_array;
}


#
# Splits an interim transcript into two parts by discarding
# a 'bad exon' in the middle.
#
# If the 'bad exon' was the first exon then undef is returned, otherwise
# the first transcript is returned.
#
# The passed in transcript is adjusted to become the second transcript
#
# The keep_exon argument is a flag.  If true the 'bad exon' is kept as part
# of the second transcript, otherwise it is discarded.
#

}

sub make_Transcript {

  my $itrans = shift;
  my $slice_adaptor = shift;

  my $transcript = Bio::EnsEMBL::Transcript->new();
  $transcript->stable_id($itrans->stable_id);
  $transcript->version($itrans->version);

  info("making final transcript for ". $itrans->stable_id);

  my $translation;

  # the whole translation may have been deleted
  # discard translation if mrna is less than a codon in length
  if($itrans->cdna_coding_end - $itrans->cdna_coding_start + 1 < 3) {
    $translation = undef;
  } else {
    $translation = Bio::EnsEMBL::Translation->new();
    $transcript->translation($translation);
  }

  foreach my $iexon (@{$itrans->get_all_Exons}) {
    my $slice =
      $slice_adaptor->fetch_by_region('chromosome', $iexon->seq_region,
                                     undef, undef,undef, 'CHIMP1');

    my $exon = Bio::EnsEMBL::Exon->new
      (-START     => $iexon->start(),
       -END       => $iexon->end(),
       -STRAND    => $iexon->strand(),
       -PHASE     => $iexon->start_phase(),
       -END_PHASE => $iexon->end_phase(),
       -STABLE_ID => $iexon->stable_id(),
       -SLICE     => $slice);

    $transcript->add_Exon($exon);

    # see if this exon is the start or end exon of the translation
    if ($translation) {
      if ($iexon->cdna_start() <= $itrans->cdna_coding_start() &&
          $iexon->cdna_end()   >= $itrans->cdna_coding_start()) {
        my $translation_start =
          $itrans->cdna_coding_start() - $iexon->cdna_start() + 1;
        $translation->start_Exon($exon);
        $translation->start($translation_start);
      }

      if ($iexon->cdna_start() <= $itrans->cdna_coding_end() &&
          $iexon->cdna_end()   >= $itrans->cdna_coding_end()) {
        my $translation_end =
          $itrans->cdna_coding_end() - $iexon->cdna_start() + 1;
        $translation->end_Exon($exon);
        $translation->end($translation_end);
      }
    }
  }

  if($translation && !$translation->start_Exon()) {
    print STDERR "Could not find translation start exon in transcript.\n";
    print STDERR "FIRST EXON:\n";
    print_exon($itrans->get_all_Exons->[0]);
    print STDERR "LAST EXON:\n";
    print_exon($itrans->get_all_Exons->[-1], $itrans);
    throw("Unexpected: Could not find translation start exon in transcript\n");
  }
  if($translation && !$translation->end_Exon()) {
    print STDERR "Could not find translation end exon in transcript.\n";
    print STDERR "FIRST EXON:\n";
    print_exon($itrans->get_all_Exons->[0]);
    print STDERR "LAST EXON:\n";
    print_exon($itrans->get_all_Exons->[-1], $itrans);
    throw("Unexpected: Could not find translation end exon in transcript\n");
  }

  return $transcript;
}



1;

}

sub split_itrans {

  my $itrans = shift;
  my $bad_exon = shift; #fulcrum exon to split on
  my $keep_exon = shift;

  my @remaining_exons = @{$itrans->get_all_Exons()};
  my @first_exons;
  my $first_trans = InterimTranscript->new();
  $first_trans->stable_id($itrans->stable_id());
  $first_trans->version($itrans->version);

  my $cur_exon = shift(@remaining_exons);

  # all the exons until the 'bad exon' are loaded into the first transcript

  info("==FIRST TRANSCRIPT:\n");
  while($cur_exon && $cur_exon != $bad_exon) {
    print_exon($cur_exon);
    push @first_exons, $cur_exon;
    $first_trans->add_Exon($cur_exon);
    $cur_exon = shift(@remaining_exons);
  }

  if(!$cur_exon) {
    throw("unexpected: could not find bad exon in transcript");
  }

  info("==BAD EXON: ". (($keep_exon) ? 'keeping' : 'discarding'));
  print_exon($bad_exon);

  # keep the 'bad exon' if the flag was set
  if($keep_exon) {
    unshift @remaining_exons, $bad_exon;
  }

  # the remaining exons are reloaded into the second (original) transcript
  $itrans->flush_Exons();
  info("==SECOND TRANSCRIPT:\n");
  foreach my $exon (@remaining_exons) {
    print_exon($exon);
    $itrans->add_Exon($exon);
  }

  #
  # set the coding end/start of the first transcript
  #
  if(@first_exons) {
    my $first_ex = $first_exons[0];
    my $last_ex = $first_exons[$#first_exons];

    # set the coding start and coding end to same as original transcript,
    # then adjust for ommitted exons
    $first_trans->cdna_coding_start($itrans->cdna_coding_start());
    $first_trans->cdna_coding_end($itrans->cdna_coding_end());

    if($last_ex->cdna_end()    < $first_trans->cdna_coding_start() ||
       $first_ex->cdna_start() > $first_trans->cdna_coding_end()) {
      # entire transcript is UTR and is non-coding
      $first_trans->cdna_coding_start(1);
      $first_trans->cdna_coding_end(0);
    }
    elsif($last_ex->cdna_end() >= $first_trans->cdna_coding_start() &&
          $last_ex->cdna_end() < $first_trans->cdna_coding_end()) {
      # coding sequence is cut by coding end
      $first_trans->cdna_coding_end($last_ex->cdna_end());
    }
  }

  #
  # adjust the coding end/start of the second transcript
  #
  if(@remaining_exons) {
    my $first_ex = $remaining_exons[0];
    my $last_ex  = $remaining_exons[$#remaining_exons];

    # all of the cdna coodinates need to be shifted up by the
    # amount of bases used by the first transcript & bad exon
    my $cdna_shift = 0;
    foreach my $ex (@first_exons) {
      $cdna_shift += $ex->length();
      info("cdna_shift = $cdna_shift\n");
    }

    # if we threw away the 'bad' exon we want to take into account
    # how much sequence it had,
    if(!$keep_exon &&
       defined($bad_exon->cdna_start()) &&
       defined($bad_exon->cdna_end())) {
      $cdna_shift += $bad_exon->length();
    }

    info("Shifting CDNA of second transcript by $cdna_shift\n");

    if($cdna_shift) {
      foreach my $ex (@remaining_exons) {
        if(!$ex->fail()) {
          $ex->cdna_start($ex->cdna_start() - $cdna_shift);
          $ex->cdna_end($ex->cdna_end() - $cdna_shift);
        }
      }
      $itrans->move_cdna_coding_start(-$cdna_shift);
      $itrans->move_cdna_coding_end(-$cdna_shift);

      if($itrans->cdna_coding_start() < 1) {
        # transcript was split in middle of cds
        $itrans->cdna_coding_start(1);
      }
      if($itrans->cdna_coding_end() < 1) {
        #there is no coding sequence left in this transcript
        $itrans->cdna_coding_end(0);
      }
    }
  }

  return (@first_exons) ? $first_trans : undef;
}

#
# creates proper ensembl transcripts and exons from interim transcripts
# and exons.
#

}

check_iexons	No description	Code
make_Transcript	No description	Code
split_itrans	No description	Code