Bio::EnsEMBL::Analysis::Tools::WGA2Genes GeneScaffold
SummaryIncluded librariesPackage variablesSynopsisGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold
Package variables
Privates (from "my" definitions)
$GENE_SCAFFOLD_CS_NAME = 'genescaffold'
$INTERPIECE_PADDING = 100
$NEAR_CONTIG_END = 15
$FROM_CS_NAME = 'chromosome'
$TO_CS_NAME = 'scaffold'
Included modules
Bio::EnsEMBL::Analysis::Tools::WGA2Genes::CoordUtils
Bio::EnsEMBL::Slice
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Bio::EnsEMBL::Utils::Sequence qw ( reverse_comp )
Inherit
Bio::EnsEMBL::Slice
Synopsis
A Bio::EnsEMBL::Slice that is comprised
of pieces of different target sequences, inferred by an alignment
of the target to a finished, query genome sequence. This object
extends Slice with mappings to/from the query and target
Assumptions:
- that the given GenomicAlignBlocks are sorted with respect
to the reference and non-overlapping on query and target
- that the given list of features is sorted with respect to
the reference
Description
No description!
Methods
_check_direct_target_coordinates
No description
Code
_check_transcripts
No description
Code
_construct_sequence
No description
Code
_make_alignment_mapper
No description
Code
alignment_mapper
No description
Code
direct_target_slice
No description
Code
from_mapper
No description
Code
from_slice
No description
Code
max_readthrough_dist
No description
Code
new
No description
Code
place_transcript
No description
Code
project_down
No description
Code
project_up
No description
Code
stringify_alignment
No description
Code
to_mapper
No description
Code
to_slices
No description
Code
Methods description
None available.
Methods code
_check_direct_target_coordinatesdescriptionprevnextTop
sub _check_direct_target_coordinates {
  my ($blocks, $slices) = @_;

  my ($tname, $tstrand);

  my @blocks = @$blocks;
  for(my $i=0; $i < @blocks; $i++) {
    my ($right_to) = @{$blocks[$i]->get_all_non_reference_genomic_aligns};
    if ($i > 0) { 
      my ($left_to) = @{$blocks[$i-1]->get_all_non_reference_genomic_aligns};

      if ($left_to->dnafrag->name ne $right_to->dnafrag->name or
          $left_to->dnafrag_strand != $right_to->dnafrag_strand or
          ($left_to->dnafrag_strand > 0 and $left_to->dnafrag_end >= $right_to->dnafrag_start) or
          ($left_to->dnafrag_strand < 0 and $left_to->dnafrag_start <= $right_to->dnafrag_end)) {
        
        throw("Cannot use direct target coordinates with inconsistent set of blocks");
      }
    }

    $tname = $right_to->dnafrag->name if not defined $tname;
    $tstrand = $right_to->dnafrag_strand if not defined $tstrand;
  }

  my $target_slice;
  if ($tstrand < 0) {
    $target_slice = $slices->{$tname}->invert;
  } else {
    $target_slice = $slices->{$tname};
  }

  return $target_slice;
}


##############################################
# Get/Sets
##############################################
}
_check_transcriptsdescriptionprevnextTop
sub _check_transcripts {
  my ($trans) = @_;

  if (scalar(@$trans) == 0) {
    throw("Attempt to create GeneScaffod with transcript list");
  }

  foreach my $t (@$trans) {
    if (not $t->translation) {
     # throw("Attempt to create GeneScaffold with non-coding Transcript (".$t->stable_id.")");
warn("Attempt to create GeneScaffold with non-coding Transcript (".$t->stable_id.")"); } if (length($t->translateable_seq) % 3 != 0) { # throw("Attempt to create GeneScaffold with non-mod-3 coding length Transcript (".$t->stable_id.")");
warn("Attempt to create GeneScaffold with non-mod-3 coding length Transcript (".$t->stable_id.")"); } } } #################################################
}
_construct_sequencedescriptionprevnextTop
sub _construct_sequence {
  my ($gen_al_blocks,
      $map,
      $transcripts,
      $from_slice,
      $to_slices,
      $max_readthrough_dist,
      $extend_into_gaps,
      $add_gaps,
      $direct_target_slice) = @_;

  # Basic gene-scaffold structure is taken directly from the given block list
my @block_coord_pairs; foreach my $bl (@{$gen_al_blocks}) { my $qy_al = $bl->reference_genomic_align; my ($tg_al) = @{$bl->get_all_non_reference_genomic_aligns}; my $from_coord = Bio::EnsEMBL::Mapper::Coordinate->new($qy_al->dnafrag->name, $qy_al->dnafrag_start, $qy_al->dnafrag_end, 1); my $to_coord = Bio::EnsEMBL::Mapper::Coordinate->new($tg_al->dnafrag->name, $tg_al->dnafrag_start, $tg_al->dnafrag_end, $tg_al->dnafrag_strand); my $pair = Bio::EnsEMBL::Mapper::Pair->new($from_coord, $to_coord); push @block_coord_pairs, $pair; } @block_coord_pairs = sort { $a->from->start <=> $b->from->start } @block_coord_pairs; # we now proceed to amend the structure with inserted gaps
# step 1: flatten the exons from the transcripts into a non-overlapping
# list, ommitting terminal exons that map completely to gaps
my $tran_coords = transcripts_to_coords($map, $FROM_CS_NAME, @$transcripts); # step 2: infer list of exon regions that map to gaps. We are only
# interested, at this stage, in regions outside the blocks, because
# these are the ones with potential to be "filled"
my @fillable_gaps; foreach my $tc (sort { $a->start <=> $b->start } @$tran_coords) { my $current_pos = $tc->start; foreach my $c ($map->map_coordinates($tc->id, $tc->start, $tc->end, 1, $FROM_CS_NAME)) { my $from_coord = Bio::EnsEMBL::Mapper::Coordinate->new($tc->id, $current_pos, $current_pos + $c->length - 1, 1); $current_pos += $c->length; if ($c->isa("Bio::EnsEMBL::Mapper::Gap")) { # only consider gaps that lies outside blocks
my $overlaps_block = 0; foreach my $bl (@block_coord_pairs) { if ($bl->from->start <= $from_coord->end and $bl->from->end >= $from_coord->start) { $overlaps_block = 1; last; } } if (not $overlaps_block) { push @fillable_gaps, Bio::EnsEMBL::Mapper::Pair->new($from_coord, $c); } } } } # Non-fillable gaps:
# 1. Gaps before the first or after the last block
# 2. If one of the flanking coords is on the same CDS region
# as the gap, and the end of the aligned region does
# not align to a sequence-level gap
# 3. If the 2 flanking coords are consistent and not
# separated by a sequence-level gap
#
my @all_coord_pairs = (@block_coord_pairs, @fillable_gaps); @all_coord_pairs = sort { $a->from->start <=> $b->from->start } @all_coord_pairs; my @extend_pairs; if ($extend_into_gaps) { my (%pairs_to_remove, @replacements); for(my $i=0; $i < @all_coord_pairs; $i++) { my $this_pair = $all_coord_pairs[$i]; if ($this_pair->to->isa("Bio::EnsEMBL::Mapper::Gap")) { # if it's gap that can be filled, leave it. Otherwise, remove it
my ($left_non_gap, $right_non_gap); for(my $j=$i-1; $j>=0; $j--) { if ($all_coord_pairs[$j]->to-> isa("Bio::EnsEMBL::Mapper::Coordinate")) { $left_non_gap = $all_coord_pairs[$j]; last; } } for(my $j=$i+1; $j < @all_coord_pairs; $j++) { if ($all_coord_pairs[$j]->to-> isa("Bio::EnsEMBL::Mapper::Coordinate")) { $right_non_gap = $all_coord_pairs[$j]; last; } } my ($ex_left, $ex_left_up, $ex_left_down) = extend_coord($left_non_gap->to, $to_slices->{$left_non_gap->to->id}); my ($ex_right, $ex_right_up, $ex_right_down) = extend_coord($right_non_gap->to, $to_slices->{$right_non_gap->to->id}); # flanking coords are inconsistent,
# which means that they come from different chains.
# By chain filtering then, they must either come
# from different target sequences, or be separable in the
# same target sequence by a sequence-level gap. Either
# way, we can nominally "fill" the gap.
#
# However, if the gap coincides with the end of a block,
# and furthermore if the block end conincides with the
# end of a sequence-level piece in the target, it is
# more appropriate to extend the exon into the existing
# gap; in that case, we replace the gap with a fake
# piece of alignment
if (not check_consistent_coords($left_non_gap->to, $right_non_gap->to) or $ex_left->start > $ex_right->end or $ex_left->end < $ex_right->start) { if ($left_non_gap->from->end == $this_pair->from->start - 1 and $right_non_gap->from->start == $this_pair->from->end + 1) { my $remove_coord = 1; my (@replace_coord); if ($left_non_gap->to->strand > 0 and $ex_left->end - $left_non_gap->to->end <= $NEAR_CONTIG_END) { $remove_coord = 0; if (defined $ex_left_down and $this_pair->to->length <= $ex_left_down->start - $ex_left->end - 1) { push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate ->new($ex_left->id, $ex_left->end + 1, $ex_left->end + $this_pair->to->length, $ex_left->strand); } } elsif ($left_non_gap->to->strand < 0 and $ex_left->start - $left_non_gap->to->start <= $NEAR_CONTIG_END) { $remove_coord = 0; if (defined $ex_left_up and $this_pair->to->length <= $ex_left->start - $ex_left_up->end - 1) { push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate ->new($ex_left->id, $ex_left->start - $this_pair->to->length, $ex_left->start - 1, $ex_left->strand); } } if ($right_non_gap->to->strand > 0 and $right_non_gap->to->start - $ex_right->start <= $NEAR_CONTIG_END) { $remove_coord = 0; if (defined $ex_right_up and $this_pair->to->length <= $ex_right->start - $ex_right_up->end - 1) { push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate ->new($ex_right->id, $ex_right->start - $this_pair->to->length, $ex_right->start - 1, $ex_right->strand); } } elsif ($right_non_gap->to->strand < 0 and $ex_right->end - $right_non_gap->to->end <= $NEAR_CONTIG_END) { $remove_coord = 0; if (defined $ex_right_down and $this_pair->to->length <= $ex_right_down->start - $ex_right->end - 1) { push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate ->new($ex_right->id, $ex_right->end + 1, $ex_right->end + $this_pair->to->length, $ex_right->strand); } } if ($remove_coord) { # gap does not align with the end of a contig; junk it
$pairs_to_remove{$this_pair} = 1; } elsif (@replace_coord) { # arbitrarily chose the first one
push @replacements, [$this_pair, $replace_coord[0]]; } } elsif ($left_non_gap->from->end == $this_pair->from->start - 1) { if ($left_non_gap->to->strand > 0 and $ex_left->end - $left_non_gap->to->end <= $NEAR_CONTIG_END) { if (defined $ex_left_down and $this_pair->to->length <= $ex_left_down->start - $ex_left->end - 1) { push @replacements, [$this_pair, Bio::EnsEMBL::Mapper::Coordinate ->new($ex_left->id, $ex_left->end + 1, $ex_left->end + $this_pair->to->length, $ex_left->strand)]; } } elsif ($left_non_gap->to->strand < 0 and $ex_left->start - $left_non_gap->to->start <= $NEAR_CONTIG_END) { if (defined $ex_left_up and $this_pair->to->length <= $ex_left->start - $ex_left_up->end - 1) { push @replacements, [$this_pair, Bio::EnsEMBL::Mapper::Coordinate ->new($ex_left->id, $ex_left->start - $this_pair->to->length, $ex_left->start - 1, $ex_left->strand)]; } } else { # gap does not align with the end of a contig; junk it
$pairs_to_remove{$this_pair} = 1; } } elsif ($right_non_gap->from->start == $this_pair->from->end + 1) { if ($right_non_gap->to->strand > 0 and $right_non_gap->to->start - $ex_right->start <= $NEAR_CONTIG_END) { if (defined $ex_right_up and $this_pair->to->length <= $ex_right->start - $ex_right_up->end - 1) { push @replacements, [$this_pair, Bio::EnsEMBL::Mapper::Coordinate ->new($ex_right->id, $ex_right->start - $this_pair->to->length, $ex_right->start - 1, $ex_right->strand)]; } } elsif ($right_non_gap->to->strand < 0 and $ex_right->end - $right_non_gap->to->end <= $NEAR_CONTIG_END) { if (defined $ex_right_down and $this_pair->to->length <= $ex_right_down->start - $ex_right->end - 1) { push @replacements, [$this_pair, Bio::EnsEMBL::Mapper::Coordinate ->new($ex_right->id, $ex_right->end + 1, $ex_right->end + $this_pair->to->length, $ex_right->strand)]; } } else { # gap does not align with the end of a contig; junk it
$pairs_to_remove{$this_pair} = 1; } } # else this gap is an isolate. It can be kept iff the coords are
# on different chains, or on the same chain but on different
# contigs; we've already determined that the components can
# be separated, so fine
} else { $pairs_to_remove{$this_pair} = 1; } } } @all_coord_pairs = grep { not exists $pairs_to_remove{$_} } @all_coord_pairs; @replacements = sort { $a->[1]->id cmp $b->[1]->id or $a->[1]->start <=> $b->[1]->start; } @replacements; while(@replacements) { my $el = shift @replacements; my ($pair, $rep) = @$el; my $fill = 1; if (@replacements) { my $nel = shift @replacements; my ($npair, $nrep) = @$nel; if ($rep->id eq $nrep->id and $rep->start <= $nrep->end and $rep->end >= $nrep->start) { # we have over-filled a gap. Remove these fills
$fill = 0; } else { unshift @replacements, $nel; } } if ($fill) { $pair->to($rep); push @extend_pairs, $pair; } } } if (not $add_gaps) { @all_coord_pairs = grep { not $_->to->isa("Bio::EnsEMBL::Mapper::Gap") } @all_coord_pairs; } # merge adjacent targets
# we want to be able to account for small, frame-preserving
# insertions in the target sequence with respect to the query.
# To give the later, gene-projection code the opportunity to
# "read through" these insertions, we have to merge togther
# adjacent, consistent targets that are within this "maximum
# read-through" distance
my @merged_pairs; for(my $i=0; $i<@all_coord_pairs; $i++) { my $this_pair = $all_coord_pairs[$i]; if ($this_pair->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and @merged_pairs and $merged_pairs[-1]->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and check_consistent_coords($merged_pairs[-1]->to, $this_pair->to)) { my $dist = distance_between_coords($merged_pairs[-1]->to, $this_pair->to); if ($dist <= $max_readthrough_dist) { my $last_pair = pop @merged_pairs; my $new_from = merge_coords($last_pair->from, $this_pair->from); my $new_to = merge_coords($last_pair->to, $this_pair->to); # check that the new merged coord will not result in an overlap
my $overlap = 0; foreach my $tg (@merged_pairs) { if ($tg->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and $tg->to->id eq $new_to->id and $tg->to->start < $new_to->end and $tg->to->end > $new_to->start) { $overlap = 1; last; } } if (not $overlap) { for (my $j=$i+1; $j < @all_coord_pairs; $j++) { my $tg = $all_coord_pairs[$j]; if ($tg->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and $tg->to->id eq $new_to->id and $tg->to->start < $new_to->end and $tg->to->end > $new_to->start) { $overlap = 1; last; } } } if ($overlap) { push @merged_pairs, $last_pair, $this_pair; } else { push @merged_pairs, Bio::EnsEMBL::Mapper::Pair->new($new_from, $new_to); } } else { push @merged_pairs, $this_pair; } } else { push @merged_pairs, $this_pair; } } #########################################################
my $t_map = Bio::EnsEMBL::Mapper->new($TO_CS_NAME, $GENE_SCAFFOLD_CS_NAME); my $q_map = Bio::EnsEMBL::Mapper->new($FROM_CS_NAME, $GENE_SCAFFOLD_CS_NAME); my ($seq, $last_end_pos) = ("", 0); for(my $i=0; $i < @merged_pairs; $i++) { my $pair = $merged_pairs[$i]; if ($direct_target_slice) { if ($pair->to->isa("Bio::EnsEMBL::Mapper::Coordinate")) { my ($gs_start, $gs_end); if ($direct_target_slice->strand < 0) { $gs_start = $direct_target_slice->length - $pair->to->end + 1; $gs_end = $direct_target_slice->length - $pair->to->start + 1; } else { $gs_start = $pair->to->start; $gs_end = $pair->to->end; } $t_map->add_map_coordinates($pair->to->id, $pair->to->start, $pair->to->end, $pair->to->strand, $GENE_SCAFFOLD_CS_NAME, $gs_start, $gs_end); } } else{ if ($pair->to->isa("Bio::EnsEMBL::Mapper::Coordinate")) { # the sequence itself
my $slice = $to_slices->{$pair->to->id}; my $this_seq = $slice->subseq($pair->to->start, $pair->to->end); if ($pair->to->strand < 0) { reverse_comp(\$this_seq); } $seq .= $this_seq; $t_map->add_map_coordinates($pair->to->id, $pair->to->start, $pair->to->end, $pair->to->strand, $GENE_SCAFFOLD_CS_NAME, $last_end_pos + 1, $last_end_pos + $pair->to->length); } else { # the sequence itself
$seq .= ('n' x $pair->from->length); # and the map. This is a target gap we have "filled", so no position
# in target, but a position in query
$q_map->add_map_coordinates($from_slice->seq_region_name, $pair->from->start, $pair->from->end, 1, $GENE_SCAFFOLD_CS_NAME, $last_end_pos + 1, $last_end_pos + $pair->from->length); } # add padding between the pieces
if ($i < @merged_pairs - 1) { $last_end_pos += $pair->to->length + $INTERPIECE_PADDING; $seq .= ('n' x $INTERPIECE_PADDING); } } } #
# add the gaps we have extended into to the query map
#
foreach my $pair (@extend_pairs) { my ($coord) = $t_map->map_coordinates($pair->to->id, $pair->to->start, $pair->to->end, $pair->to->strand, $TO_CS_NAME); $q_map->add_map_coordinates($from_slice->seq_region_name, $pair->from->start, $pair->from->end, 1, $GENE_SCAFFOLD_CS_NAME, $coord->start, $coord->end); } #
# finally add of the original alignment pieces to the query map
#
my @aln_coords = $map->map_coordinates($from_slice->seq_region_name, $from_slice->start, $from_slice->end, 1, $FROM_CS_NAME); my $current_start = $from_slice->start; foreach my $c (@aln_coords) { my $current_end = $current_start + $c->length - 1; if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) { # get the gene_scaffold position from the target map
my ($coord) = $t_map->map_coordinates($c->id, $c->start, $c->end, $c->strand, $TO_CS_NAME); $q_map->add_map_coordinates($from_slice->seq_region_name, $current_start, $current_end, 1, $GENE_SCAFFOLD_CS_NAME, $coord->start, $coord->end); } $current_start += $c->length; } return ($seq, $q_map, $t_map); } #################################################
}
_make_alignment_mapperdescriptionprevnextTop
sub _make_alignment_mapper {
  my ($gen_al_blocks) = @_;

  my $mapper = Bio::EnsEMBL::Mapper->new($FROM_CS_NAME,
                                         $TO_CS_NAME);

  foreach my $bl (@$gen_al_blocks) {
    foreach my $ugbl (@{$bl->get_all_ungapped_GenomicAlignBlocks}) {      
      my ($from_bl) = $ugbl->reference_genomic_align;
      my ($to_bl)   = @{$ugbl->get_all_non_reference_genomic_aligns};

      $mapper->add_map_coordinates($from_bl->dnafrag->name,
                                   $from_bl->dnafrag_start,
                                   $from_bl->dnafrag_end,
                                   $from_bl->dnafrag_strand * $to_bl->dnafrag_strand,
                                   $to_bl->dnafrag->name,
                                   $to_bl->dnafrag_start,
                                   $to_bl->dnafrag_end);
    }
  }

  return $mapper;
}


##############################################
}
alignment_mapperdescriptionprevnextTop
sub alignment_mapper {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_alignment_mapper} = $val;
  }

  return $self->{_alignment_mapper};
}
direct_target_slicedescriptionprevnextTop
sub direct_target_slice {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_direct_target} = $val;
  }

  return $self->{_direct_target};
}
from_mapperdescriptionprevnextTop
sub from_mapper {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_from_mapper} = $val;
  }

  return $self->{_from_mapper};
}
from_slicedescriptionprevnextTop
sub from_slice {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_from_slice} = $val;
  }

  return $self->{_from_slice};
}
max_readthrough_distdescriptionprevnextTop
sub max_readthrough_dist {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_max_readthrough} = $val;
  }

  return $self->{_max_readthrough};
}
newdescriptionprevnextTop
sub new {
  my ($caller, %given_args) = @_;

  my $class = ref($caller) || $caller;

  my ($name,
      $genomic_align_blocks,
      $transcripts,
      $from_slice,
      $to_slices,
      $max_readthrough_dist,
      $extend_into_gaps,
      $add_gaps,
      $direct_target_slice,
      ) = rearrange([qw(
                        NAME
                        GENOMIC_ALIGN_BLOCKS
                        TRANSCRIPTS
                        FROM_SLICE
                        TO_SLICES
                        MAX_READTHROUGH_DIST
                        EXTEND_INTO_GAPS
                        ADD_GAPS
                        DIRECT_TARGET_COORDS
                        )], %given_args);

  _check_transcripts($transcripts);

  $name = "GeneScaffold" if not defined $name;
  $max_readthrough_dist = 15 if not defined $max_readthrough_dist;  

  if ($direct_target_slice) {
    if ($add_gaps or $extend_into_gaps) {
      warning("GeneScaffold: cannot use -direct_target_coords with either of " . 
              "-add_gaps or -extend_into_gaps; unsetting both");
      $add_gaps = 0;
      $extend_into_gaps = 0;
    }

    # check that the genomic align blocks will allow this
$direct_target_slice = _check_direct_target_coordinates($genomic_align_blocks, $to_slices); } my $aln_map = _make_alignment_mapper($genomic_align_blocks); my ($gs_seq, $from_mapper, $to_mapper) = _construct_sequence($genomic_align_blocks, $aln_map, $transcripts, $from_slice, $to_slices, $max_readthrough_dist, $extend_into_gaps, $add_gaps, $direct_target_slice); return undef if not defined $from_mapper; # print "1: ", $direct_target_slice,"\n2: ",$direct_target_slice->length,"\n3: ",length($gs_seq),"\n";
my $gs_end = $direct_target_slice ? $direct_target_slice->length : length($gs_seq); my $self = $class->SUPER::new(-coord_system => Bio::EnsEMBL::CoordSystem->new(-name => $GENE_SCAFFOLD_CS_NAME, -rank => 1), -seq_region_name => $name, -seq => ( $gs_seq eq "" ? undef : $gs_seq), -start => 1, -end => $gs_end, ); $self->direct_target_slice($direct_target_slice) if defined $direct_target_slice; $self->max_readthrough_dist($max_readthrough_dist) if defined $max_readthrough_dist; $self->from_slice($from_slice); $self->to_slices($to_slices); $self->alignment_mapper($aln_map); $self->from_mapper($from_mapper); $self->to_mapper($to_mapper); return $self; } ###################################################################
# FUNCTION : place_transcript
#
# Description:
# Takes a transcript, and uses the mapping between
# query coords and gene scaffold coords to produces a transcript
# that is the result of "projecting" the original transcript,
# through alignment, onto the gene scaffold.
###################################################################
}
place_transcriptdescriptionprevnextTop
sub place_transcript {
  my ($self, 
      $tran,
      $add_attributes,
      $external_db_id) = @_;

  my (@all_coords, @new_exons);
  
  my @orig_exons = @{$tran->get_all_translateable_Exons};

  @orig_exons = sort { $a->start <=> $b->start } @orig_exons;
  
  my $orig_exon_coords = transcripts_to_coords($self->alignment_mapper,
                                               $FROM_CS_NAME,
                                               $tran);

  
  my $restricted_range;
  if (@$orig_exon_coords) {

   # print "ORIGINAL EXON COORDS: First start: ",$orig_exon_coords->[0]->start, " last end: ",$orig_exon_coords->[-1]->end," length: ",$orig_exon_coords->[-1]->end-$orig_exon_coords->[0]->start+1 ,"\n";
$restricted_range = Bio::EnsEMBL::Mapper::Coordinate->new($orig_exon_coords->[0]->id, $orig_exon_coords->[0]->start, $orig_exon_coords->[-1]->end, $orig_exon_coords->[0]->strand); } else { #print "ORIGINAL EXON COORDS: First start: ",$orig_exons[0]->start, " last end: ",$orig_exons[-1]->end," length ",$orig_exons[-1]->end-$orig_exons[0]->start+1,"\n";
$restricted_range = Bio::EnsEMBL::Mapper::Coordinate->new($orig_exons[0]->slice->seq_region_name, $orig_exons[0]->start, $orig_exons[-1]->end, $orig_exons[0]->strand); } my %filled_coords; # look for regions that map down to filled gaps. If any of these
# lie outside the restricted range (i.e. extent of the transcript
# within which gaps were potentially filled), then they must
# be gaps that were filled for a different transcript. In that
# case, discard these pieces
foreach my $orig_exon (@orig_exons) { #print "Single exon original coords: START: ", $orig_exon->start," END: ", $orig_exon->end," length ",$orig_exon->end-$orig_exon->start+1,"\n";
my @crds = $self->from_mapper->map_coordinates($orig_exon->slice->seq_region_name, $orig_exon->start, $orig_exon->end, 1, $FROM_CS_NAME); foreach my $c (@crds) { if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) { #print "Exon Mapper from coordinates: START:", $c->start, " END: ",$c->end," length ",$c->end-$c->start+1,"\n";
my ($tc) = $self->to_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME, $c->start, $c->end, 1, $GENE_SCAFFOLD_CS_NAME); if ($tc->isa("Bio::EnsEMBL::Mapper::Coordinate")) { # this piece may partially extend into a real sequence gap;
# break it up into its constituent pieces
#print "TC Mapper from coordinates: START:", $tc->start, " END: ",$tc->end," length ",$tc->end-$tc->start+1,"\n";
my @alncrds = $self->alignment_mapper->map_coordinates($tc->id, $tc->start, $tc->end, $tc->strand, $TO_CS_NAME); my $current_start = $c->start; foreach my $ac (@alncrds) { #print "AC coords START: ", $ac->start, " END: ".$ac->end," length ",$ac->end-$ac->start+1,"\n";
my $current_end = $current_start + $ac->length - 1; my $newc = Bio::EnsEMBL::Mapper::Coordinate->new($c->id, $current_start, $current_end, $c->strand); $current_start += $ac->length; if ($ac->isa("Bio::EnsEMBL::Mapper::Gap")) { #print "\nAC is a GAP\n\n";
$filled_coords{$newc} = 1; } push @all_coords, $newc; } } else { $filled_coords{$c} = 1; push @all_coords, $c; } } else { push @all_coords, $c; } } } my @kept_coords; foreach my $c (@all_coords) { if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate") and exists $filled_coords{$c}) { # this is a filled coord, so MUST correspond to exactly one piece of the
# original query sequence
#print "KEPT coord START: ",$c->start," END: ",$c->end," length ",$c->end-$c->start+1,"\n";
my ($oc) = $self->from_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME, $c->start, $c->end, 1, $GENE_SCAFFOLD_CS_NAME); if ($oc->start >= $restricted_range->start and $oc->end <= $restricted_range->end) { push @kept_coords, $c; } else { push @kept_coords, Bio::EnsEMBL::Mapper::Gap->new(1, $c->length); } } else { #print "KEPT coord START: ",$c->start," END: ",$c->end," length ",$c->end-$c->start+1,"\n";
push @kept_coords, $c; } } @all_coords = @kept_coords; #
# now massage the gaps to account for frameshifts
#
my $need_another_pass; do { $need_another_pass = 0; my (@proc_coords, @gap_indices); # merge gaps
foreach my $c (@all_coords) { if ($c->isa("Bio::EnsEMBL::Mapper::Gap")) { #print "GAP PRESENT START: ",$c->start," END: ",$c->end," length ",$c->end-$c->start+1,"\n";
if (@proc_coords and $proc_coords[-1]->isa("Bio::EnsEMBL::Mapper::Gap")) { $proc_coords[-1]->end( $proc_coords[-1]->end + $c->length ); #print "Extending existing GAP\n";
} else { #print "ADDing new GAP with length ",$c->length ,"\n";
push @proc_coords, Bio::EnsEMBL::Mapper::Gap->new(1, $c->length); push @gap_indices, scalar(@proc_coords) - 1; } } else { push @proc_coords, $c; } } GAP: foreach my $idx (@gap_indices) { #print "I have to handle a GAP\n";
my $gap = $proc_coords[$idx]; my $frameshift = $gap->length % 3; if ($frameshift) { #print "!!! Have a frameshift at " . $gap->start . "\n";
my $bases_to_remove = 3 - $frameshift; # calculate "surplus" bases on incomplete codons to left and right
my ($left_surplus, $right_surplus) = (0,0); for(my $j=$idx-1; $j >= 0; $j--) { $left_surplus += $proc_coords[$j]->length; } for(my $j=$idx+1; $j < @proc_coords; $j++) { $right_surplus += $proc_coords[$j]->length; } $left_surplus = $left_surplus % 3; $right_surplus = $right_surplus % 3; if ($left_surplus) { # eat left
$bases_to_remove = $left_surplus; my $left_coord = $proc_coords[$idx - 1]; if ($left_coord->length > $bases_to_remove) { $gap->end($gap->end + $bases_to_remove); $left_coord->end( $left_coord->end - $bases_to_remove ); } else { # we need to eat away the whole of this coord
$proc_coords[$idx-1] = Bio::EnsEMBL::Mapper::Gap->new(1,$left_coord->length); } } if ($right_surplus) { $bases_to_remove = $right_surplus; my $right_coord = $proc_coords[$idx + 1]; if ($right_coord->length > $bases_to_remove) { $gap->end($gap->end + $bases_to_remove); $right_coord->start( $right_coord->start + $bases_to_remove); } else { # we need to eat away the whole of this coord
$proc_coords[$idx+1] = Bio::EnsEMBL::Mapper::Gap->new(1,$right_coord->length); } } $need_another_pass = 1; last GAP; } } @all_coords = @proc_coords; } while ($need_another_pass); my $start_not_found = $all_coords[0]->isa("Bio::EnsEMBL::Mapper::Gap") ? 1 : 0; my $end_not_found = $all_coords[-1]->isa("Bio::EnsEMBL::Mapper::Gap") ? 1 : 0; #
# The remaining non-gap pieces are the exons
#
@all_coords = grep { $_->isa("Bio::EnsEMBL::Mapper::Coordinate") } @all_coords; return 0 if not @all_coords; my $sl = $self->direct_target_slice ? $self->direct_target_slice : $self; foreach my $coord (@all_coords) { push @new_exons, Bio::EnsEMBL::Exon->new(-start => $coord->start, -end => $coord->end, -strand => $tran->strand, -slice => $sl); } if ($tran->strand < 0) { @new_exons = sort { $b->start <=> $a->start } @new_exons; } else { @new_exons = sort { $a->start <=> $b->start } @new_exons; } #
# calculate phases, and add supporting features
#
my $source_pep = $tran->translate->seq; my $source_cds_len = 0; map { $source_cds_len += $_->length } @orig_exons; if (length($source_pep) + 1 == ($source_cds_len / 3)) {
# stop was removed
$source_pep .= "*";
} my $transcript_aligned_aas = 0; my $transcript_identical_aas = 0; my $incomplete_codon_bps = 0; my ($previous_exon); foreach my $exon (@new_exons) { # if ($exon->start == 16141929) {
# print "Modifying exon->start\n";
# $exon->start($exon->start+2);
# }
if (defined $previous_exon) { $exon->phase($previous_exon->end_phase); } else { $exon->phase(0); } # $exon->end_phase((($exon->end - $exon->start + 1) + $exon->phase)%3);
#print " Exon length = " . $exon->length . " phase = " . $exon->phase . "\n";
$exon->end_phase((($exon->length) + $exon->phase)%3); #print " Setting Exon end_phase to " . $exon->end_phase . "\n";
my $exon_aligned_aas = 0; my $exon_identical_aas = 0; # need to map back to the genomic coords to get the supporting feature
# for this exon;
my $extent_start = $exon->start; my $extent_end = $exon->end; if ($exon->strand > 0) { $extent_start += 3 - $exon->phase if $exon->phase; $extent_end -= $exon->end_phase if $exon->end_phase; } else { $extent_start += $exon->end_phase if $exon->end_phase; $extent_end -= 3 - $exon->phase if $exon->phase; } #print "Exon phase : ",$exon->phase,"\n";
#print "Exon end phase : ",$exon->end_phase,"\n";
#print "Previous exon end phase : ",$previous_exon->end_phase,"\n" if ($previous_exon);
if ($extent_end > $extent_start) { #print "extent start = " . $extent_start . " extent_end = " . $extent_end ."\n";
# if not, we've eaten away the whole exon, so there is no support
$incomplete_codon_bps += ($extent_start - $exon->start); $incomplete_codon_bps += ($exon->end - $extent_end); my @gen_coords = $self->from_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME, $extent_start, $extent_end, 1, $GENE_SCAFFOLD_CS_NAME); #print " Have " . scalar(@gen_coords) . " gen_coords\n";
my @fps; my $cur_gs_start = $extent_start; #print "Exon coords = " . $exon->start . " " . $exon->end ."\n";
foreach my $g_coord (@gen_coords) { #print " g_coord = " . $g_coord->start . " " . $g_coord->end ."\n";
#print "EXTEND LENGTH: ",($extent_end - $extent_start+1),"\n";
#print "G LENGTH: ",($g_coord->end- $g_coord->start+1),"\t length: ",$g_coord->length,"\n";
my $cur_gs_end = $cur_gs_start + $g_coord->length - 1; if ($g_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { my ($p_coord) = $tran->genomic2pep($g_coord->start, $g_coord->end, $tran->strand); my $p_substr = uc(substr($source_pep, $p_coord->start - 1, $p_coord->length)); my $gs_reg = $sl->subseq($cur_gs_start, $cur_gs_end, $exon->strand); #print "SEQ: ",length($gs_reg),"\n";
my $gs_p_substr = uc(Bio::PrimarySeq->new(-seq => $gs_reg)->translate->seq); #print $p_substr, "\n AND: ",$gs_p_substr,"\n";
if (length($p_substr) != length ($gs_p_substr)) { #print "Length from source_pep = ", length($p_substr), "\n AND: from genomic translation ",length($gs_p_substr),"\n";
warning("Pep segments differ; cannot calculate percentage identity"); } else { $exon_aligned_aas += length($p_substr); my @p1 = split(//, $p_substr); my @p2 = split(//, $gs_p_substr); for(my $i=0; $i < @p1; $i++) { if ($p1[$i] eq $p2[$i]) { $exon_identical_aas++; } } } my $fp = Bio::EnsEMBL::FeaturePair-> new(-seqname => $self->seq_region_name, -start => $cur_gs_start, -end => $cur_gs_end, -strand => $exon->strand, -score => 100.0, -hseqname => $tran->translation->stable_id, -hstart => $p_coord->start, -external_db_id => $external_db_id, -hend => $p_coord->end, -hstrand => $p_coord->strand, -slice => $sl); push @fps, $fp; } $cur_gs_start += $g_coord->length; } if (@fps) { my $f = Bio::EnsEMBL::DnaPepAlignFeature->new(-features =>\@ fps); $f->percent_id(100 * ($exon_identical_aas / $exon_aligned_aas));
$exon->add_supporting_features($f); } } else { #print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Short exon of " . $exon->length . " bases\n";
$incomplete_codon_bps += $exon->length; } $transcript_aligned_aas += $exon_aligned_aas; $transcript_identical_aas += $exon_identical_aas; $previous_exon = $exon; } # optimistically, we are going to assume that split codons
# are identical and covered. This of course may not always
# be the case, but it's too fiddly (for now) to do it properly.
$transcript_aligned_aas += int($incomplete_codon_bps / 3);
$transcript_identical_aas += int($incomplete_codon_bps / 3);
#
# merge abutting exons; deals with exon fusion events, and
# small, frame-preserving insertions in the target
#
my @merged_exons; foreach my $exon (@new_exons) { if (@merged_exons) { my $prev_exon = pop @merged_exons; my ($new_start, $new_end); if ($tran->strand < 0) { my $intron_len = $prev_exon->start - $exon->end - 1; if ($intron_len % 3 == 0 and $intron_len <= $self->max_readthrough_dist) { $new_start = $exon->start; $new_end = $prev_exon->end; } } else { my $intron_len = $exon->start - $prev_exon->end - 1; if ($intron_len % 3 == 0 and $intron_len <= $self->max_readthrough_dist) { $new_start = $prev_exon->start; $new_end = $exon->end; } } if (defined $new_start and defined $new_end) { my $merged_exon = Bio::EnsEMBL::Exon-> new(-start => $new_start, -end => $new_end, -strand => $tran->strand, -phase => $prev_exon->phase, -end_phase => $exon->end_phase, -slice => $exon->slice); my @ug_feats; my $len_l = 0; my $pid_l = 0; if (@{$prev_exon->get_all_supporting_features}) { my ($sf) = @{$prev_exon->get_all_supporting_features}; $pid_l = $sf->percent_id; foreach my $ug ($sf->ungapped_features) { push @ug_feats, $ug; $len_l += $ug->length; } } my $len_r = 0; my $pid_r = 0; if (@{$exon->get_all_supporting_features}) { my ($sf) = @{$exon->get_all_supporting_features}; $pid_r = $sf->percent_id; foreach my $ug ($sf->ungapped_features) { push @ug_feats, $ug; $len_r += $ug->length; } } if (@ug_feats) { map { $_->percent_id(100) } @ug_feats; my $new_sup_feat = Bio::EnsEMBL::DnaPepAlignFeature-> new(-features =>\@ ug_feats); $new_sup_feat->percent_id((($len_l * $pid_l) + ($len_r * $pid_r)) / ($len_l + $len_r));
$merged_exon->add_supporting_features($new_sup_feat); } push @merged_exons, $merged_exon; next; } else { push @merged_exons, $prev_exon; push @merged_exons, $exon; } } else { push @merged_exons, $exon; } } my $proj_tran = Bio::EnsEMBL::Transcript->new(); map { $proj_tran->add_Exon($_) } @merged_exons; #
# do transcript-level supporting features/attributes
#
my (@trans_fps); foreach my $exon (@merged_exons) { if (@{$exon->get_all_supporting_features}) { my ($sf) = @{$exon->get_all_supporting_features}; my @e_fps = $sf->ungapped_features; # need to reset the pids here, otherwise the API complains
map { $_->percent_id(100) } @e_fps; push @trans_fps, @e_fps; } } if (@trans_fps) { my $t_sf = Bio::EnsEMBL::DnaPepAlignFeature-> new(-features =>\@ trans_fps); # use score to hold coverage, so that it is stored somewhere when written to db
$t_sf->score(100 * ($transcript_aligned_aas / length($source_pep)));
$t_sf->hcoverage( 100 * ($transcript_aligned_aas / length($source_pep)));
$t_sf->percent_id(100 * ($transcript_identical_aas / $transcript_aligned_aas));
$proj_tran->add_supporting_features($t_sf); # now copy this hcoverage value to all the exon supporting features
foreach my $exon (@{$proj_tran->get_all_Exons}) { foreach my $exon_sf (@{$exon->get_all_supporting_features}) { $exon_sf->hcoverage($t_sf->hcoverage); } } } else { # there are no complete codons in any exon!
# this is clearly rubbish, so bail out
return 0; } #
# set translation
#
my $translation = Bio::EnsEMBL::Translation->new(); $translation->start_Exon($merged_exons[0]); $translation->start(1); $translation->end_Exon($merged_exons[-1]); $translation->end($merged_exons[-1]->end - $merged_exons[-1]->start + 1); $proj_tran->translation($translation); my $pep = $proj_tran->translate; if (not defined $pep) { # this can happen if the transcript comprises a single stop codon only
return 0; } my $prop_non_gap = 100 - (100 * (($pep->seq =~ tr/X/X/) / $pep->length));
my $num_stops = $pep->seq =~ tr/\*/\*/; my $num_gaps = $pep->seq =~ tr/X/X/; #
# finally, attributes
#
my @attributes; my $perc_id_attr = Bio::EnsEMBL::Attribute-> new(-code => 'HitSimilarity', -name => 'hit similarity', -description => 'percentage id to parent transcripts', -value => sprintf("%.1f", 100 * ($transcript_identical_aas / $transcript_aligned_aas)));
push @attributes, $perc_id_attr; my $cov_attr = Bio::EnsEMBL::Attribute-> new(-code => 'HitCoverage', -name => 'hit coverage', -description => 'coverage of parent transcripts', -value => sprintf("%.1f", 100 * ($transcript_aligned_aas / length($source_pep))));
push @attributes, $cov_attr; my $gap_attr = Bio::EnsEMBL::Attribute-> new(-code => 'PropNonGap', -name => 'proportion non gap', -description => 'proportion non gap', -value => sprintf("%.1f", $prop_non_gap)); push @attributes, $gap_attr; my $prop_non_gap_of_coverage_attr = Bio::EnsEMBL::Attribute-> new(-code => 'NonGapHCov', -name => 'proportion non gap of hit coverage', -description => 'proportion non gap of hit coverage', -value => sprintf("%.1f", 100 * (($transcript_aligned_aas - $num_gaps) / length($source_pep))));
push @attributes, $prop_non_gap_of_coverage_attr; if ($start_not_found and $tran->strand > 0 or $end_not_found and $tran->strand < 0) { my $attr = Bio::EnsEMBL::Attribute-> new(-code => 'StartNotFound', -name => 'start not found', -description => 'start not found', -value => 1); push @attributes, $attr; } if ($end_not_found and $tran->strand > 0 or $start_not_found and $tran->strand < 0) { my $attr = Bio::EnsEMBL::Attribute-> new(-code => 'EndNotFound', -name => 'end not found', -description => 'end not found', -value => 1); push @attributes, $attr; } my $stop_attr = Bio::EnsEMBL::Attribute-> new(-code => 'NumStops', -name => 'number of stops', -desc => 'Number of stops before editing', -value => $num_stops); push @attributes, $stop_attr; # indentify gap exons
my $gap_exons = 0; foreach my $e (@{$proj_tran->get_all_Exons}) { my ($coord) = $self->to_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME, $e->start, $e->end, 1, $GENE_SCAFFOLD_CS_NAME); if ($coord->isa("Bio::EnsEMBL::Mapper::Gap")) { $gap_exons++; } } my $gap_exon_attr = Bio::EnsEMBL::Attribute-> new(-code => 'GapExons', -name => 'gap exons', -description => 'number of gap exons', -value => $gap_exons); push @attributes, $gap_exon_attr; my $tranid_attr = Bio::EnsEMBL::Attribute-> new(-code => 'SourceTran', -name => 'source transcript', -description => 'source transcript', -value => $tran->stable_id); push @attributes, $tranid_attr; if ($add_attributes) { $proj_tran->add_Attributes(@attributes); } if ($self->direct_target_slice and $self->direct_target_slice->strand < 0) { $proj_tran = $proj_tran->transfer($self->direct_target_slice->invert); } return $proj_tran;
}
project_downdescriptionprevnextTop
sub project_down {
  my ($self) = @_;

  my @comps = $self->to_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
                                                1,
                                                $self->length,
                                                1,
                                                $GENE_SCAFFOLD_CS_NAME);
                                                  
  my @segments;
  
  my $current_pos = 1;
  foreach my $c (@comps) {
    my $start = $current_pos;
    my $end   = $current_pos + $c->length - 1;
    $current_pos += $c->length;
    
    if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
      my $tlsl = $self->to_slices->{$c->id};
      
      my $slice = $tlsl->adaptor->fetch_by_region($tlsl->coord_system->name,
                                                  $tlsl->seq_region_name,
                                                  $c->start,
                                                  $c->end,
                                                  $c->strand);
      
      push @segments, bless([$start, $end, $slice],
                            "Bio::EnsEMBL::ProjectionSegment");
    }
  }

  return @segments;
}


###############################################
# Internal helper methods
###############################################
#################################################
}
project_updescriptionprevnextTop
sub project_up {
  my ($self) = @_;

  my $tlsl = $self->from_slice;

  my @comps = $self->from_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
                                                  1,
                                                  $self->length,
                                                  1,
                                                  $GENE_SCAFFOLD_CS_NAME);
                                                  
  my @segments;
  
  my $current_pos = 1;
  foreach my $c (@comps) {
    my $start = $current_pos;
    my $end   = $current_pos + $c->length - 1;
    $current_pos += $c->length;
    
    if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
      
      my $slice = $tlsl->adaptor->fetch_by_region($tlsl->coord_system->name,
                                                  $tlsl->seq_region_name,
                                                  $c->start,
                                                  $c->end,
                                                  1);
      
      push @segments, bless([$start, $end, $slice],
                            "Bio::EnsEMBL::ProjectionSegment");
    }
  }

  return @segments;
}
stringify_alignmentdescriptionprevnextTop
sub stringify_alignment {
  my ($self) = @_;

  my @coords = $self->alignment_mapper->map_coordinates($self->from_slice->seq_region_name,
                                                        $self->from_slice->start,
                                                        $self->from_slice->end,
                                                        1,
                                                        $FROM_CS_NAME);
  my $string = "";
  my $position = $self->from_slice->start;
  foreach my $c (@coords) {
    my $ref_start = $position;
    my $ref_end = $position + $c->length - 1;
    $position += $c->length;
    
    $string .= sprintf("%s %d %d ", $self->from_slice->seq_region_name, $ref_start, $ref_end);

    if ($c->isa("Bio::EnsEMBL::Mapper::Gap")) {
      $string .= "[GAP]\n";
    } else {
      $string .= "[%s %d %d %s]\n", $c->id, $c->start, $c->end, $c->strand;
    }
  }

  return $string;
}
to_mapperdescriptionprevnextTop
sub to_mapper {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_to_mapper} = $val;
  }
  return $self->{_to_mapper};

}



1;
}
to_slicesdescriptionprevnextTop
sub to_slices {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_to_slices} = $val;
  }

  return $self->{_to_slices};
}
General documentation
No general documentation available.