Raw content of Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold
=head1 NAME
Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold
=head1 SYNOPSIS
A Bio::EnsEMBL::Slice that is comprised
of pieces of different target sequences, inferred by an alignment
of the target to a finished, query genome sequence. This object
extends Slice with mappings to/from the query and target
Assumptions:
- that the given GenomicAlignBlocks are sorted with respect
to the reference and non-overlapping on query and target
- that the given list of features is sorted with respect to
the reference
=cut
package Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold;
use strict;
use vars qw(@ISA);
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Slice;
use Bio::EnsEMBL::Analysis::Tools::WGA2Genes::CoordUtils;
@ISA = qw(Bio::EnsEMBL::Slice);
my $FROM_CS_NAME = 'chromosome';
my $TO_CS_NAME = 'scaffold';
my $GENE_SCAFFOLD_CS_NAME = 'genescaffold';
my $INTERPIECE_PADDING = 100;
my $NEAR_CONTIG_END = 15;
###############################################
sub new {
my ($caller, %given_args) = @_;
my $class = ref($caller) || $caller;
my ($name,
$genomic_align_blocks,
$transcripts,
$from_slice,
$to_slices,
$max_readthrough_dist,
$extend_into_gaps,
$add_gaps,
$direct_target_slice,
) = rearrange([qw(
NAME
GENOMIC_ALIGN_BLOCKS
TRANSCRIPTS
FROM_SLICE
TO_SLICES
MAX_READTHROUGH_DIST
EXTEND_INTO_GAPS
ADD_GAPS
DIRECT_TARGET_COORDS
)], %given_args);
_check_transcripts($transcripts);
$name = "GeneScaffold" if not defined $name;
$max_readthrough_dist = 15 if not defined $max_readthrough_dist;
if ($direct_target_slice) {
if ($add_gaps or $extend_into_gaps) {
warning("GeneScaffold: cannot use -direct_target_coords with either of " .
"-add_gaps or -extend_into_gaps; unsetting both");
$add_gaps = 0;
$extend_into_gaps = 0;
}
# check that the genomic align blocks will allow this
$direct_target_slice = _check_direct_target_coordinates($genomic_align_blocks,
$to_slices);
}
my $aln_map = _make_alignment_mapper($genomic_align_blocks);
my ($gs_seq, $from_mapper, $to_mapper) =
_construct_sequence($genomic_align_blocks,
$aln_map,
$transcripts,
$from_slice,
$to_slices,
$max_readthrough_dist,
$extend_into_gaps,
$add_gaps,
$direct_target_slice);
return undef if not defined $from_mapper;
# print "1: ", $direct_target_slice,"\n2: ",$direct_target_slice->length,"\n3: ",length($gs_seq),"\n";
my $gs_end = $direct_target_slice
? $direct_target_slice->length
: length($gs_seq);
my $self = $class->SUPER::new(-coord_system =>
Bio::EnsEMBL::CoordSystem->new(-name => $GENE_SCAFFOLD_CS_NAME,
-rank => 1),
-seq_region_name => $name,
-seq => ( $gs_seq eq "" ? undef : $gs_seq),
-start => 1,
-end => $gs_end,
);
$self->direct_target_slice($direct_target_slice)
if defined $direct_target_slice;
$self->max_readthrough_dist($max_readthrough_dist)
if defined $max_readthrough_dist;
$self->from_slice($from_slice);
$self->to_slices($to_slices);
$self->alignment_mapper($aln_map);
$self->from_mapper($from_mapper);
$self->to_mapper($to_mapper);
return $self;
}
###################################################################
# FUNCTION : place_transcript
#
# Description:
# Takes a transcript, and uses the mapping between
# query coords and gene scaffold coords to produces a transcript
# that is the result of "projecting" the original transcript,
# through alignment, onto the gene scaffold.
###################################################################
sub place_transcript {
my ($self,
$tran,
$add_attributes,
$external_db_id) = @_;
my (@all_coords, @new_exons);
my @orig_exons = @{$tran->get_all_translateable_Exons};
@orig_exons = sort { $a->start <=> $b->start } @orig_exons;
my $orig_exon_coords = transcripts_to_coords($self->alignment_mapper,
$FROM_CS_NAME,
$tran);
my $restricted_range;
if (@$orig_exon_coords) {
# print "ORIGINAL EXON COORDS: First start: ",$orig_exon_coords->[0]->start, " last end: ",$orig_exon_coords->[-1]->end," length: ",$orig_exon_coords->[-1]->end-$orig_exon_coords->[0]->start+1 ,"\n";
$restricted_range = Bio::EnsEMBL::Mapper::Coordinate->new($orig_exon_coords->[0]->id,
$orig_exon_coords->[0]->start,
$orig_exon_coords->[-1]->end,
$orig_exon_coords->[0]->strand);
} else {
#print "ORIGINAL EXON COORDS: First start: ",$orig_exons[0]->start, " last end: ",$orig_exons[-1]->end," length ",$orig_exons[-1]->end-$orig_exons[0]->start+1,"\n";
$restricted_range = Bio::EnsEMBL::Mapper::Coordinate->new($orig_exons[0]->slice->seq_region_name,
$orig_exons[0]->start,
$orig_exons[-1]->end,
$orig_exons[0]->strand);
}
my %filled_coords;
# look for regions that map down to filled gaps. If any of these
# lie outside the restricted range (i.e. extent of the transcript
# within which gaps were potentially filled), then they must
# be gaps that were filled for a different transcript. In that
# case, discard these pieces
foreach my $orig_exon (@orig_exons) {
#print "Single exon original coords: START: ", $orig_exon->start," END: ", $orig_exon->end," length ",$orig_exon->end-$orig_exon->start+1,"\n";
my @crds = $self->from_mapper->map_coordinates($orig_exon->slice->seq_region_name,
$orig_exon->start,
$orig_exon->end,
1,
$FROM_CS_NAME);
foreach my $c (@crds) {
if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
#print "Exon Mapper from coordinates: START:", $c->start, " END: ",$c->end," length ",$c->end-$c->start+1,"\n";
my ($tc) = $self->to_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
$c->start,
$c->end,
1,
$GENE_SCAFFOLD_CS_NAME);
if ($tc->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
# this piece may partially extend into a real sequence gap;
# break it up into its constituent pieces
#print "TC Mapper from coordinates: START:", $tc->start, " END: ",$tc->end," length ",$tc->end-$tc->start+1,"\n";
my @alncrds = $self->alignment_mapper->map_coordinates($tc->id,
$tc->start,
$tc->end,
$tc->strand,
$TO_CS_NAME);
my $current_start = $c->start;
foreach my $ac (@alncrds) {
#print "AC coords START: ", $ac->start, " END: ".$ac->end," length ",$ac->end-$ac->start+1,"\n";
my $current_end = $current_start + $ac->length - 1;
my $newc = Bio::EnsEMBL::Mapper::Coordinate->new($c->id,
$current_start,
$current_end,
$c->strand);
$current_start += $ac->length;
if ($ac->isa("Bio::EnsEMBL::Mapper::Gap")) {
#print "\nAC is a GAP\n\n";
$filled_coords{$newc} = 1;
}
push @all_coords, $newc;
}
} else {
$filled_coords{$c} = 1;
push @all_coords, $c;
}
} else {
push @all_coords, $c;
}
}
}
my @kept_coords;
foreach my $c (@all_coords) {
if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate") and
exists $filled_coords{$c}) {
# this is a filled coord, so MUST correspond to exactly one piece of the
# original query sequence
#print "KEPT coord START: ",$c->start," END: ",$c->end," length ",$c->end-$c->start+1,"\n";
my ($oc) = $self->from_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
$c->start,
$c->end,
1,
$GENE_SCAFFOLD_CS_NAME);
if ($oc->start >= $restricted_range->start and
$oc->end <= $restricted_range->end) {
push @kept_coords, $c;
} else {
push @kept_coords, Bio::EnsEMBL::Mapper::Gap->new(1, $c->length);
}
} else {
#print "KEPT coord START: ",$c->start," END: ",$c->end," length ",$c->end-$c->start+1,"\n";
push @kept_coords, $c;
}
}
@all_coords = @kept_coords;
#
# now massage the gaps to account for frameshifts
#
my $need_another_pass;
do {
$need_another_pass = 0;
my (@proc_coords, @gap_indices);
# merge gaps
foreach my $c (@all_coords) {
if ($c->isa("Bio::EnsEMBL::Mapper::Gap")) {
#print "GAP PRESENT START: ",$c->start," END: ",$c->end," length ",$c->end-$c->start+1,"\n";
if (@proc_coords and
$proc_coords[-1]->isa("Bio::EnsEMBL::Mapper::Gap")) {
$proc_coords[-1]->end( $proc_coords[-1]->end + $c->length );
#print "Extending existing GAP\n";
} else {
#print "ADDing new GAP with length ",$c->length ,"\n";
push @proc_coords, Bio::EnsEMBL::Mapper::Gap->new(1,
$c->length);
push @gap_indices, scalar(@proc_coords) - 1;
}
} else {
push @proc_coords, $c;
}
}
GAP: foreach my $idx (@gap_indices) {
#print "I have to handle a GAP\n";
my $gap = $proc_coords[$idx];
my $frameshift = $gap->length % 3;
if ($frameshift) {
#print "!!! Have a frameshift at " . $gap->start . "\n";
my $bases_to_remove = 3 - $frameshift;
# calculate "surplus" bases on incomplete codons to left and right
my ($left_surplus, $right_surplus) = (0,0);
for(my $j=$idx-1; $j >= 0; $j--) {
$left_surplus += $proc_coords[$j]->length;
}
for(my $j=$idx+1; $j < @proc_coords; $j++) {
$right_surplus += $proc_coords[$j]->length;
}
$left_surplus = $left_surplus % 3;
$right_surplus = $right_surplus % 3;
if ($left_surplus) {
# eat left
$bases_to_remove = $left_surplus;
my $left_coord = $proc_coords[$idx - 1];
if ($left_coord->length > $bases_to_remove) {
$gap->end($gap->end + $bases_to_remove);
$left_coord->end( $left_coord->end - $bases_to_remove );
} else {
# we need to eat away the whole of this coord
$proc_coords[$idx-1] =
Bio::EnsEMBL::Mapper::Gap->new(1,$left_coord->length);
}
}
if ($right_surplus) {
$bases_to_remove = $right_surplus;
my $right_coord = $proc_coords[$idx + 1];
if ($right_coord->length > $bases_to_remove) {
$gap->end($gap->end + $bases_to_remove);
$right_coord->start( $right_coord->start + $bases_to_remove);
} else {
# we need to eat away the whole of this coord
$proc_coords[$idx+1] =
Bio::EnsEMBL::Mapper::Gap->new(1,$right_coord->length);
}
}
$need_another_pass = 1;
last GAP;
}
}
@all_coords = @proc_coords;
} while ($need_another_pass);
my $start_not_found = $all_coords[0]->isa("Bio::EnsEMBL::Mapper::Gap") ? 1 : 0;
my $end_not_found = $all_coords[-1]->isa("Bio::EnsEMBL::Mapper::Gap") ? 1 : 0;
#
# The remaining non-gap pieces are the exons
#
@all_coords = grep { $_->isa("Bio::EnsEMBL::Mapper::Coordinate") } @all_coords;
return 0 if not @all_coords;
my $sl = $self->direct_target_slice
? $self->direct_target_slice
: $self;
foreach my $coord (@all_coords) {
push @new_exons, Bio::EnsEMBL::Exon->new(-start => $coord->start,
-end => $coord->end,
-strand => $tran->strand,
-slice => $sl);
}
if ($tran->strand < 0) {
@new_exons = sort { $b->start <=> $a->start } @new_exons;
} else {
@new_exons = sort { $a->start <=> $b->start } @new_exons;
}
#
# calculate phases, and add supporting features
#
my $source_pep = $tran->translate->seq;
my $source_cds_len = 0;
map { $source_cds_len += $_->length } @orig_exons;
if (length($source_pep) + 1 == ($source_cds_len / 3)) {
# stop was removed
$source_pep .= "*";
}
my $transcript_aligned_aas = 0;
my $transcript_identical_aas = 0;
my $incomplete_codon_bps = 0;
my ($previous_exon);
foreach my $exon (@new_exons) {
# if ($exon->start == 16141929) {
# print "Modifying exon->start\n";
# $exon->start($exon->start+2);
# }
if (defined $previous_exon) {
$exon->phase($previous_exon->end_phase);
} else {
$exon->phase(0);
}
# $exon->end_phase((($exon->end - $exon->start + 1) + $exon->phase)%3);
#print " Exon length = " . $exon->length . " phase = " . $exon->phase . "\n";
$exon->end_phase((($exon->length) + $exon->phase)%3);
#print " Setting Exon end_phase to " . $exon->end_phase . "\n";
my $exon_aligned_aas = 0;
my $exon_identical_aas = 0;
# need to map back to the genomic coords to get the supporting feature
# for this exon;
my $extent_start = $exon->start;
my $extent_end = $exon->end;
if ($exon->strand > 0) {
$extent_start += 3 - $exon->phase if $exon->phase;
$extent_end -= $exon->end_phase if $exon->end_phase;
} else {
$extent_start += $exon->end_phase if $exon->end_phase;
$extent_end -= 3 - $exon->phase if $exon->phase;
}
#print "Exon phase : ",$exon->phase,"\n";
#print "Exon end phase : ",$exon->end_phase,"\n";
#print "Previous exon end phase : ",$previous_exon->end_phase,"\n" if ($previous_exon);
if ($extent_end > $extent_start) {
#print "extent start = " . $extent_start . " extent_end = " . $extent_end ."\n";
# if not, we've eaten away the whole exon, so there is no support
$incomplete_codon_bps += ($extent_start - $exon->start);
$incomplete_codon_bps += ($exon->end - $extent_end);
my @gen_coords = $self->from_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
$extent_start,
$extent_end,
1,
$GENE_SCAFFOLD_CS_NAME);
#print " Have " . scalar(@gen_coords) . " gen_coords\n";
my @fps;
my $cur_gs_start = $extent_start;
#print "Exon coords = " . $exon->start . " " . $exon->end ."\n";
foreach my $g_coord (@gen_coords) {
#print " g_coord = " . $g_coord->start . " " . $g_coord->end ."\n";
#print "EXTEND LENGTH: ",($extent_end - $extent_start+1),"\n";
#print "G LENGTH: ",($g_coord->end- $g_coord->start+1),"\t length: ",$g_coord->length,"\n";
my $cur_gs_end = $cur_gs_start + $g_coord->length - 1;
if ($g_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my ($p_coord) = $tran->genomic2pep($g_coord->start,
$g_coord->end,
$tran->strand);
my $p_substr = uc(substr($source_pep, $p_coord->start - 1, $p_coord->length));
my $gs_reg = $sl->subseq($cur_gs_start, $cur_gs_end, $exon->strand);
#print "SEQ: ",length($gs_reg),"\n";
my $gs_p_substr = uc(Bio::PrimarySeq->new(-seq => $gs_reg)->translate->seq);
#print $p_substr, "\n AND: ",$gs_p_substr,"\n";
if (length($p_substr) != length ($gs_p_substr)) {
#print "Length from source_pep = ", length($p_substr), "\n AND: from genomic translation ",length($gs_p_substr),"\n";
warning("Pep segments differ; cannot calculate percentage identity");
} else {
$exon_aligned_aas += length($p_substr);
my @p1 = split(//, $p_substr);
my @p2 = split(//, $gs_p_substr);
for(my $i=0; $i < @p1; $i++) {
if ($p1[$i] eq $p2[$i]) {
$exon_identical_aas++;
}
}
}
my $fp = Bio::EnsEMBL::FeaturePair->
new(-seqname => $self->seq_region_name,
-start => $cur_gs_start,
-end => $cur_gs_end,
-strand => $exon->strand,
-score => 100.0,
-hseqname => $tran->translation->stable_id,
-hstart => $p_coord->start,
-external_db_id => $external_db_id,
-hend => $p_coord->end,
-hstrand => $p_coord->strand,
-slice => $sl);
push @fps, $fp;
}
$cur_gs_start += $g_coord->length;
}
if (@fps) {
my $f = Bio::EnsEMBL::DnaPepAlignFeature->new(-features => \@fps);
$f->percent_id(100 * ($exon_identical_aas / $exon_aligned_aas));
$exon->add_supporting_features($f);
}
} else {
#print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Short exon of " . $exon->length . " bases\n";
$incomplete_codon_bps += $exon->length;
}
$transcript_aligned_aas += $exon_aligned_aas;
$transcript_identical_aas += $exon_identical_aas;
$previous_exon = $exon;
}
# optimistically, we are going to assume that split codons
# are identical and covered. This of course may not always
# be the case, but it's too fiddly (for now) to do it properly.
$transcript_aligned_aas += int($incomplete_codon_bps / 3);
$transcript_identical_aas += int($incomplete_codon_bps / 3);
#
# merge abutting exons; deals with exon fusion events, and
# small, frame-preserving insertions in the target
#
my @merged_exons;
foreach my $exon (@new_exons) {
if (@merged_exons) {
my $prev_exon = pop @merged_exons;
my ($new_start, $new_end);
if ($tran->strand < 0) {
my $intron_len = $prev_exon->start - $exon->end - 1;
if ($intron_len % 3 == 0 and
$intron_len <= $self->max_readthrough_dist) {
$new_start = $exon->start;
$new_end = $prev_exon->end;
}
} else {
my $intron_len = $exon->start - $prev_exon->end - 1;
if ($intron_len % 3 == 0 and
$intron_len <= $self->max_readthrough_dist) {
$new_start = $prev_exon->start;
$new_end = $exon->end;
}
}
if (defined $new_start and defined $new_end) {
my $merged_exon = Bio::EnsEMBL::Exon->
new(-start => $new_start,
-end => $new_end,
-strand => $tran->strand,
-phase => $prev_exon->phase,
-end_phase => $exon->end_phase,
-slice => $exon->slice);
my @ug_feats;
my $len_l = 0;
my $pid_l = 0;
if (@{$prev_exon->get_all_supporting_features}) {
my ($sf) = @{$prev_exon->get_all_supporting_features};
$pid_l = $sf->percent_id;
foreach my $ug ($sf->ungapped_features) {
push @ug_feats, $ug;
$len_l += $ug->length;
}
}
my $len_r = 0;
my $pid_r = 0;
if (@{$exon->get_all_supporting_features}) {
my ($sf) = @{$exon->get_all_supporting_features};
$pid_r = $sf->percent_id;
foreach my $ug ($sf->ungapped_features) {
push @ug_feats, $ug;
$len_r += $ug->length;
}
}
if (@ug_feats) {
map { $_->percent_id(100) } @ug_feats;
my $new_sup_feat = Bio::EnsEMBL::DnaPepAlignFeature->
new(-features => \@ug_feats);
$new_sup_feat->percent_id((($len_l * $pid_l) + ($len_r * $pid_r))
/ ($len_l + $len_r));
$merged_exon->add_supporting_features($new_sup_feat);
}
push @merged_exons, $merged_exon;
next;
} else {
push @merged_exons, $prev_exon;
push @merged_exons, $exon;
}
} else {
push @merged_exons, $exon;
}
}
my $proj_tran = Bio::EnsEMBL::Transcript->new();
map { $proj_tran->add_Exon($_) } @merged_exons;
#
# do transcript-level supporting features/attributes
#
my (@trans_fps);
foreach my $exon (@merged_exons) {
if (@{$exon->get_all_supporting_features}) {
my ($sf) = @{$exon->get_all_supporting_features};
my @e_fps = $sf->ungapped_features;
# need to reset the pids here, otherwise the API complains
map { $_->percent_id(100) } @e_fps;
push @trans_fps, @e_fps;
}
}
if (@trans_fps) {
my $t_sf = Bio::EnsEMBL::DnaPepAlignFeature->
new(-features => \@trans_fps);
# use score to hold coverage, so that it is stored somewhere when written to db
$t_sf->score(100 * ($transcript_aligned_aas / length($source_pep)));
$t_sf->hcoverage( 100 * ($transcript_aligned_aas / length($source_pep)));
$t_sf->percent_id(100 * ($transcript_identical_aas / $transcript_aligned_aas));
$proj_tran->add_supporting_features($t_sf);
# now copy this hcoverage value to all the exon supporting features
foreach my $exon (@{$proj_tran->get_all_Exons}) {
foreach my $exon_sf (@{$exon->get_all_supporting_features}) {
$exon_sf->hcoverage($t_sf->hcoverage);
}
}
} else {
# there are no complete codons in any exon!
# this is clearly rubbish, so bail out
return 0;
}
#
# set translation
#
my $translation = Bio::EnsEMBL::Translation->new();
$translation->start_Exon($merged_exons[0]);
$translation->start(1);
$translation->end_Exon($merged_exons[-1]);
$translation->end($merged_exons[-1]->end - $merged_exons[-1]->start + 1);
$proj_tran->translation($translation);
my $pep = $proj_tran->translate;
if (not defined $pep) {
# this can happen if the transcript comprises a single stop codon only
return 0;
}
my $prop_non_gap = 100 - (100 * (($pep->seq =~ tr/X/X/) / $pep->length));
my $num_stops = $pep->seq =~ tr/\*/\*/;
my $num_gaps = $pep->seq =~ tr/X/X/;
#
# finally, attributes
#
my @attributes;
my $perc_id_attr = Bio::EnsEMBL::Attribute->
new(-code => 'HitSimilarity',
-name => 'hit similarity',
-description => 'percentage id to parent transcripts',
-value => sprintf("%.1f",
100 * ($transcript_identical_aas / $transcript_aligned_aas)));
push @attributes, $perc_id_attr;
my $cov_attr = Bio::EnsEMBL::Attribute->
new(-code => 'HitCoverage',
-name => 'hit coverage',
-description => 'coverage of parent transcripts',
-value => sprintf("%.1f",
100 * ($transcript_aligned_aas / length($source_pep))));
push @attributes, $cov_attr;
my $gap_attr = Bio::EnsEMBL::Attribute->
new(-code => 'PropNonGap',
-name => 'proportion non gap',
-description => 'proportion non gap',
-value => sprintf("%.1f",
$prop_non_gap));
push @attributes, $gap_attr;
my $prop_non_gap_of_coverage_attr = Bio::EnsEMBL::Attribute->
new(-code => 'NonGapHCov',
-name => 'proportion non gap of hit coverage',
-description => 'proportion non gap of hit coverage',
-value => sprintf("%.1f",
100 * (($transcript_aligned_aas - $num_gaps) / length($source_pep))));
push @attributes, $prop_non_gap_of_coverage_attr;
if ($start_not_found and $tran->strand > 0 or
$end_not_found and $tran->strand < 0) {
my $attr = Bio::EnsEMBL::Attribute->
new(-code => 'StartNotFound',
-name => 'start not found',
-description => 'start not found',
-value => 1);
push @attributes, $attr;
}
if ($end_not_found and $tran->strand > 0 or
$start_not_found and $tran->strand < 0) {
my $attr = Bio::EnsEMBL::Attribute->
new(-code => 'EndNotFound',
-name => 'end not found',
-description => 'end not found',
-value => 1);
push @attributes, $attr;
}
my $stop_attr = Bio::EnsEMBL::Attribute->
new(-code => 'NumStops',
-name => 'number of stops',
-desc => 'Number of stops before editing',
-value => $num_stops);
push @attributes, $stop_attr;
# indentify gap exons
my $gap_exons = 0;
foreach my $e (@{$proj_tran->get_all_Exons}) {
my ($coord) = $self->to_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
$e->start,
$e->end,
1,
$GENE_SCAFFOLD_CS_NAME);
if ($coord->isa("Bio::EnsEMBL::Mapper::Gap")) {
$gap_exons++;
}
}
my $gap_exon_attr = Bio::EnsEMBL::Attribute->
new(-code => 'GapExons',
-name => 'gap exons',
-description => 'number of gap exons',
-value => $gap_exons);
push @attributes, $gap_exon_attr;
my $tranid_attr = Bio::EnsEMBL::Attribute->
new(-code => 'SourceTran',
-name => 'source transcript',
-description => 'source transcript',
-value => $tran->stable_id);
push @attributes, $tranid_attr;
if ($add_attributes) {
$proj_tran->add_Attributes(@attributes);
}
if ($self->direct_target_slice and
$self->direct_target_slice->strand < 0) {
$proj_tran = $proj_tran->transfer($self->direct_target_slice->invert);
}
return $proj_tran;
}
sub stringify_alignment {
my ($self) = @_;
my @coords = $self->alignment_mapper->map_coordinates($self->from_slice->seq_region_name,
$self->from_slice->start,
$self->from_slice->end,
1,
$FROM_CS_NAME);
my $string = "";
my $position = $self->from_slice->start;
foreach my $c (@coords) {
my $ref_start = $position;
my $ref_end = $position + $c->length - 1;
$position += $c->length;
$string .= sprintf("%s %d %d ", $self->from_slice->seq_region_name, $ref_start, $ref_end);
if ($c->isa("Bio::EnsEMBL::Mapper::Gap")) {
$string .= "[GAP]\n";
} else {
$string .= "[%s %d %d %s]\n", $c->id, $c->start, $c->end, $c->strand;
}
}
return $string;
}
sub project_up {
my ($self) = @_;
my $tlsl = $self->from_slice;
my @comps = $self->from_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
1,
$self->length,
1,
$GENE_SCAFFOLD_CS_NAME);
my @segments;
my $current_pos = 1;
foreach my $c (@comps) {
my $start = $current_pos;
my $end = $current_pos + $c->length - 1;
$current_pos += $c->length;
if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my $slice = $tlsl->adaptor->fetch_by_region($tlsl->coord_system->name,
$tlsl->seq_region_name,
$c->start,
$c->end,
1);
push @segments, bless([$start, $end, $slice],
"Bio::EnsEMBL::ProjectionSegment");
}
}
return @segments;
}
sub project_down {
my ($self) = @_;
my @comps = $self->to_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
1,
$self->length,
1,
$GENE_SCAFFOLD_CS_NAME);
my @segments;
my $current_pos = 1;
foreach my $c (@comps) {
my $start = $current_pos;
my $end = $current_pos + $c->length - 1;
$current_pos += $c->length;
if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my $tlsl = $self->to_slices->{$c->id};
my $slice = $tlsl->adaptor->fetch_by_region($tlsl->coord_system->name,
$tlsl->seq_region_name,
$c->start,
$c->end,
$c->strand);
push @segments, bless([$start, $end, $slice],
"Bio::EnsEMBL::ProjectionSegment");
}
}
return @segments;
}
###############################################
# Internal helper methods
###############################################
#################################################
sub _check_transcripts {
my ($trans) = @_;
if (scalar(@$trans) == 0) {
throw("Attempt to create GeneScaffod with transcript list");
}
foreach my $t (@$trans) {
if (not $t->translation) {
# throw("Attempt to create GeneScaffold with non-coding Transcript (".$t->stable_id.")");
warn("Attempt to create GeneScaffold with non-coding Transcript (".$t->stable_id.")");
}
if (length($t->translateable_seq) % 3 != 0) {
# throw("Attempt to create GeneScaffold with non-mod-3 coding length Transcript (".$t->stable_id.")");
warn("Attempt to create GeneScaffold with non-mod-3 coding length Transcript (".$t->stable_id.")");
}
}
}
#################################################
sub _construct_sequence {
my ($gen_al_blocks,
$map,
$transcripts,
$from_slice,
$to_slices,
$max_readthrough_dist,
$extend_into_gaps,
$add_gaps,
$direct_target_slice) = @_;
# Basic gene-scaffold structure is taken directly from the given block list
my @block_coord_pairs;
foreach my $bl (@{$gen_al_blocks}) {
my $qy_al = $bl->reference_genomic_align;
my ($tg_al) = @{$bl->get_all_non_reference_genomic_aligns};
my $from_coord = Bio::EnsEMBL::Mapper::Coordinate->new($qy_al->dnafrag->name,
$qy_al->dnafrag_start,
$qy_al->dnafrag_end,
1);
my $to_coord = Bio::EnsEMBL::Mapper::Coordinate->new($tg_al->dnafrag->name,
$tg_al->dnafrag_start,
$tg_al->dnafrag_end,
$tg_al->dnafrag_strand);
my $pair = Bio::EnsEMBL::Mapper::Pair->new($from_coord,
$to_coord);
push @block_coord_pairs, $pair;
}
@block_coord_pairs = sort { $a->from->start <=> $b->from->start } @block_coord_pairs;
# we now proceed to amend the structure with inserted gaps
# step 1: flatten the exons from the transcripts into a non-overlapping
# list, ommitting terminal exons that map completely to gaps
my $tran_coords = transcripts_to_coords($map,
$FROM_CS_NAME,
@$transcripts);
# step 2: infer list of exon regions that map to gaps. We are only
# interested, at this stage, in regions outside the blocks, because
# these are the ones with potential to be "filled"
my @fillable_gaps;
foreach my $tc (sort { $a->start <=> $b->start } @$tran_coords) {
my $current_pos = $tc->start;
foreach my $c ($map->map_coordinates($tc->id,
$tc->start,
$tc->end,
1,
$FROM_CS_NAME)) {
my $from_coord = Bio::EnsEMBL::Mapper::Coordinate->new($tc->id,
$current_pos,
$current_pos + $c->length - 1,
1);
$current_pos += $c->length;
if ($c->isa("Bio::EnsEMBL::Mapper::Gap")) {
# only consider gaps that lies outside blocks
my $overlaps_block = 0;
foreach my $bl (@block_coord_pairs) {
if ($bl->from->start <= $from_coord->end and
$bl->from->end >= $from_coord->start) {
$overlaps_block = 1;
last;
}
}
if (not $overlaps_block) {
push @fillable_gaps, Bio::EnsEMBL::Mapper::Pair->new($from_coord,
$c);
}
}
}
}
# Non-fillable gaps:
# 1. Gaps before the first or after the last block
# 2. If one of the flanking coords is on the same CDS region
# as the gap, and the end of the aligned region does
# not align to a sequence-level gap
# 3. If the 2 flanking coords are consistent and not
# separated by a sequence-level gap
#
my @all_coord_pairs = (@block_coord_pairs, @fillable_gaps);
@all_coord_pairs = sort { $a->from->start <=> $b->from->start } @all_coord_pairs;
my @extend_pairs;
if ($extend_into_gaps) {
my (%pairs_to_remove, @replacements);
for(my $i=0; $i < @all_coord_pairs; $i++) {
my $this_pair = $all_coord_pairs[$i];
if ($this_pair->to->isa("Bio::EnsEMBL::Mapper::Gap")) {
# if it's gap that can be filled, leave it. Otherwise, remove it
my ($left_non_gap, $right_non_gap);
for(my $j=$i-1; $j>=0; $j--) {
if ($all_coord_pairs[$j]->to->
isa("Bio::EnsEMBL::Mapper::Coordinate")) {
$left_non_gap = $all_coord_pairs[$j];
last;
}
}
for(my $j=$i+1; $j < @all_coord_pairs; $j++) {
if ($all_coord_pairs[$j]->to->
isa("Bio::EnsEMBL::Mapper::Coordinate")) {
$right_non_gap = $all_coord_pairs[$j];
last;
}
}
my ($ex_left, $ex_left_up, $ex_left_down) =
extend_coord($left_non_gap->to,
$to_slices->{$left_non_gap->to->id});
my ($ex_right, $ex_right_up, $ex_right_down) =
extend_coord($right_non_gap->to,
$to_slices->{$right_non_gap->to->id});
# flanking coords are inconsistent,
# which means that they come from different chains.
# By chain filtering then, they must either come
# from different target sequences, or be separable in the
# same target sequence by a sequence-level gap. Either
# way, we can nominally "fill" the gap.
#
# However, if the gap coincides with the end of a block,
# and furthermore if the block end conincides with the
# end of a sequence-level piece in the target, it is
# more appropriate to extend the exon into the existing
# gap; in that case, we replace the gap with a fake
# piece of alignment
if (not check_consistent_coords($left_non_gap->to,
$right_non_gap->to) or
$ex_left->start > $ex_right->end or
$ex_left->end < $ex_right->start) {
if ($left_non_gap->from->end == $this_pair->from->start - 1 and
$right_non_gap->from->start == $this_pair->from->end + 1) {
my $remove_coord = 1;
my (@replace_coord);
if ($left_non_gap->to->strand > 0 and
$ex_left->end - $left_non_gap->to->end <= $NEAR_CONTIG_END) {
$remove_coord = 0;
if (defined $ex_left_down and
$this_pair->to->length <= $ex_left_down->start - $ex_left->end - 1) {
push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate
->new($ex_left->id,
$ex_left->end + 1,
$ex_left->end + $this_pair->to->length,
$ex_left->strand);
}
} elsif ($left_non_gap->to->strand < 0 and
$ex_left->start - $left_non_gap->to->start <= $NEAR_CONTIG_END) {
$remove_coord = 0;
if (defined $ex_left_up and
$this_pair->to->length <= $ex_left->start - $ex_left_up->end - 1) {
push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate
->new($ex_left->id,
$ex_left->start - $this_pair->to->length,
$ex_left->start - 1,
$ex_left->strand);
}
}
if ($right_non_gap->to->strand > 0 and
$right_non_gap->to->start - $ex_right->start <= $NEAR_CONTIG_END) {
$remove_coord = 0;
if (defined $ex_right_up and
$this_pair->to->length <= $ex_right->start - $ex_right_up->end - 1) {
push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate
->new($ex_right->id,
$ex_right->start - $this_pair->to->length,
$ex_right->start - 1,
$ex_right->strand);
}
} elsif ($right_non_gap->to->strand < 0 and
$ex_right->end - $right_non_gap->to->end <= $NEAR_CONTIG_END) {
$remove_coord = 0;
if (defined $ex_right_down and
$this_pair->to->length <= $ex_right_down->start - $ex_right->end - 1) {
push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate
->new($ex_right->id,
$ex_right->end + 1,
$ex_right->end + $this_pair->to->length,
$ex_right->strand);
}
}
if ($remove_coord) {
# gap does not align with the end of a contig; junk it
$pairs_to_remove{$this_pair} = 1;
} elsif (@replace_coord) {
# arbitrarily chose the first one
push @replacements, [$this_pair,
$replace_coord[0]];
}
} elsif ($left_non_gap->from->end == $this_pair->from->start - 1) {
if ($left_non_gap->to->strand > 0 and
$ex_left->end - $left_non_gap->to->end <= $NEAR_CONTIG_END) {
if (defined $ex_left_down and
$this_pair->to->length <= $ex_left_down->start - $ex_left->end - 1) {
push @replacements, [$this_pair,
Bio::EnsEMBL::Mapper::Coordinate
->new($ex_left->id,
$ex_left->end + 1,
$ex_left->end + $this_pair->to->length,
$ex_left->strand)];
}
} elsif ($left_non_gap->to->strand < 0 and
$ex_left->start - $left_non_gap->to->start <= $NEAR_CONTIG_END) {
if (defined $ex_left_up and
$this_pair->to->length <= $ex_left->start - $ex_left_up->end - 1) {
push @replacements, [$this_pair,
Bio::EnsEMBL::Mapper::Coordinate
->new($ex_left->id,
$ex_left->start - $this_pair->to->length,
$ex_left->start - 1,
$ex_left->strand)];
}
} else {
# gap does not align with the end of a contig; junk it
$pairs_to_remove{$this_pair} = 1;
}
} elsif ($right_non_gap->from->start == $this_pair->from->end + 1) {
if ($right_non_gap->to->strand > 0 and
$right_non_gap->to->start - $ex_right->start <= $NEAR_CONTIG_END) {
if (defined $ex_right_up and
$this_pair->to->length <= $ex_right->start - $ex_right_up->end - 1) {
push @replacements, [$this_pair,
Bio::EnsEMBL::Mapper::Coordinate
->new($ex_right->id,
$ex_right->start - $this_pair->to->length,
$ex_right->start - 1,
$ex_right->strand)];
}
} elsif ($right_non_gap->to->strand < 0 and
$ex_right->end - $right_non_gap->to->end <= $NEAR_CONTIG_END) {
if (defined $ex_right_down and
$this_pair->to->length <= $ex_right_down->start - $ex_right->end - 1) {
push @replacements, [$this_pair,
Bio::EnsEMBL::Mapper::Coordinate
->new($ex_right->id,
$ex_right->end + 1,
$ex_right->end + $this_pair->to->length,
$ex_right->strand)];
}
} else {
# gap does not align with the end of a contig; junk it
$pairs_to_remove{$this_pair} = 1;
}
}
# else this gap is an isolate. It can be kept iff the coords are
# on different chains, or on the same chain but on different
# contigs; we've already determined that the components can
# be separated, so fine
}
else {
$pairs_to_remove{$this_pair} = 1;
}
}
}
@all_coord_pairs = grep { not exists $pairs_to_remove{$_} } @all_coord_pairs;
@replacements = sort {
$a->[1]->id cmp $b->[1]->id or
$a->[1]->start <=> $b->[1]->start;
} @replacements;
while(@replacements) {
my $el = shift @replacements;
my ($pair, $rep) = @$el;
my $fill = 1;
if (@replacements) {
my $nel = shift @replacements;
my ($npair, $nrep) = @$nel;
if ($rep->id eq $nrep->id and
$rep->start <= $nrep->end and
$rep->end >= $nrep->start) {
# we have over-filled a gap. Remove these fills
$fill = 0;
} else {
unshift @replacements, $nel;
}
}
if ($fill) {
$pair->to($rep);
push @extend_pairs, $pair;
}
}
}
if (not $add_gaps) {
@all_coord_pairs = grep { not $_->to->isa("Bio::EnsEMBL::Mapper::Gap") } @all_coord_pairs;
}
# merge adjacent targets
# we want to be able to account for small, frame-preserving
# insertions in the target sequence with respect to the query.
# To give the later, gene-projection code the opportunity to
# "read through" these insertions, we have to merge togther
# adjacent, consistent targets that are within this "maximum
# read-through" distance
my @merged_pairs;
for(my $i=0; $i<@all_coord_pairs; $i++) {
my $this_pair = $all_coord_pairs[$i];
if ($this_pair->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and
@merged_pairs and
$merged_pairs[-1]->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and
check_consistent_coords($merged_pairs[-1]->to,
$this_pair->to)) {
my $dist = distance_between_coords($merged_pairs[-1]->to,
$this_pair->to);
if ($dist <= $max_readthrough_dist) {
my $last_pair = pop @merged_pairs;
my $new_from = merge_coords($last_pair->from,
$this_pair->from);
my $new_to = merge_coords($last_pair->to,
$this_pair->to);
# check that the new merged coord will not result in an overlap
my $overlap = 0;
foreach my $tg (@merged_pairs) {
if ($tg->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and
$tg->to->id eq $new_to->id and
$tg->to->start < $new_to->end and
$tg->to->end > $new_to->start) {
$overlap = 1;
last;
}
}
if (not $overlap) {
for (my $j=$i+1; $j < @all_coord_pairs; $j++) {
my $tg = $all_coord_pairs[$j];
if ($tg->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and
$tg->to->id eq $new_to->id and
$tg->to->start < $new_to->end and
$tg->to->end > $new_to->start) {
$overlap = 1;
last;
}
}
}
if ($overlap) {
push @merged_pairs, $last_pair, $this_pair;
} else {
push @merged_pairs, Bio::EnsEMBL::Mapper::Pair->new($new_from,
$new_to);
}
} else {
push @merged_pairs, $this_pair;
}
}
else {
push @merged_pairs, $this_pair;
}
}
#########################################################
my $t_map = Bio::EnsEMBL::Mapper->new($TO_CS_NAME,
$GENE_SCAFFOLD_CS_NAME);
my $q_map = Bio::EnsEMBL::Mapper->new($FROM_CS_NAME,
$GENE_SCAFFOLD_CS_NAME);
my ($seq, $last_end_pos) = ("", 0);
for(my $i=0; $i < @merged_pairs; $i++) {
my $pair = $merged_pairs[$i];
if ($direct_target_slice) {
if ($pair->to->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my ($gs_start, $gs_end);
if ($direct_target_slice->strand < 0) {
$gs_start = $direct_target_slice->length - $pair->to->end + 1;
$gs_end = $direct_target_slice->length - $pair->to->start + 1;
} else {
$gs_start = $pair->to->start;
$gs_end = $pair->to->end;
}
$t_map->add_map_coordinates($pair->to->id,
$pair->to->start,
$pair->to->end,
$pair->to->strand,
$GENE_SCAFFOLD_CS_NAME,
$gs_start,
$gs_end);
}
} else{
if ($pair->to->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
# the sequence itself
my $slice = $to_slices->{$pair->to->id};
my $this_seq = $slice->subseq($pair->to->start, $pair->to->end);
if ($pair->to->strand < 0) {
reverse_comp(\$this_seq);
}
$seq .= $this_seq;
$t_map->add_map_coordinates($pair->to->id,
$pair->to->start,
$pair->to->end,
$pair->to->strand,
$GENE_SCAFFOLD_CS_NAME,
$last_end_pos + 1,
$last_end_pos + $pair->to->length);
} else {
# the sequence itself
$seq .= ('n' x $pair->from->length);
# and the map. This is a target gap we have "filled", so no position
# in target, but a position in query
$q_map->add_map_coordinates($from_slice->seq_region_name,
$pair->from->start,
$pair->from->end,
1,
$GENE_SCAFFOLD_CS_NAME,
$last_end_pos + 1,
$last_end_pos + $pair->from->length);
}
# add padding between the pieces
if ($i < @merged_pairs - 1) {
$last_end_pos +=
$pair->to->length + $INTERPIECE_PADDING;
$seq .= ('n' x $INTERPIECE_PADDING);
}
}
}
#
# add the gaps we have extended into to the query map
#
foreach my $pair (@extend_pairs) {
my ($coord) = $t_map->map_coordinates($pair->to->id,
$pair->to->start,
$pair->to->end,
$pair->to->strand,
$TO_CS_NAME);
$q_map->add_map_coordinates($from_slice->seq_region_name,
$pair->from->start,
$pair->from->end,
1,
$GENE_SCAFFOLD_CS_NAME,
$coord->start,
$coord->end);
}
#
# finally add of the original alignment pieces to the query map
#
my @aln_coords = $map->map_coordinates($from_slice->seq_region_name,
$from_slice->start,
$from_slice->end,
1,
$FROM_CS_NAME);
my $current_start = $from_slice->start;
foreach my $c (@aln_coords) {
my $current_end = $current_start + $c->length - 1;
if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
# get the gene_scaffold position from the target map
my ($coord) = $t_map->map_coordinates($c->id,
$c->start,
$c->end,
$c->strand,
$TO_CS_NAME);
$q_map->add_map_coordinates($from_slice->seq_region_name,
$current_start,
$current_end,
1,
$GENE_SCAFFOLD_CS_NAME,
$coord->start,
$coord->end);
}
$current_start += $c->length;
}
return ($seq, $q_map, $t_map);
}
#################################################
sub _make_alignment_mapper {
my ($gen_al_blocks) = @_;
my $mapper = Bio::EnsEMBL::Mapper->new($FROM_CS_NAME,
$TO_CS_NAME);
foreach my $bl (@$gen_al_blocks) {
foreach my $ugbl (@{$bl->get_all_ungapped_GenomicAlignBlocks}) {
my ($from_bl) = $ugbl->reference_genomic_align;
my ($to_bl) = @{$ugbl->get_all_non_reference_genomic_aligns};
$mapper->add_map_coordinates($from_bl->dnafrag->name,
$from_bl->dnafrag_start,
$from_bl->dnafrag_end,
$from_bl->dnafrag_strand * $to_bl->dnafrag_strand,
$to_bl->dnafrag->name,
$to_bl->dnafrag_start,
$to_bl->dnafrag_end);
}
}
return $mapper;
}
##############################################
sub _check_direct_target_coordinates {
my ($blocks, $slices) = @_;
my ($tname, $tstrand);
my @blocks = @$blocks;
for(my $i=0; $i < @blocks; $i++) {
my ($right_to) = @{$blocks[$i]->get_all_non_reference_genomic_aligns};
if ($i > 0) {
my ($left_to) = @{$blocks[$i-1]->get_all_non_reference_genomic_aligns};
if ($left_to->dnafrag->name ne $right_to->dnafrag->name or
$left_to->dnafrag_strand != $right_to->dnafrag_strand or
($left_to->dnafrag_strand > 0 and $left_to->dnafrag_end >= $right_to->dnafrag_start) or
($left_to->dnafrag_strand < 0 and $left_to->dnafrag_start <= $right_to->dnafrag_end)) {
throw("Cannot use direct target coordinates with inconsistent set of blocks");
}
}
$tname = $right_to->dnafrag->name if not defined $tname;
$tstrand = $right_to->dnafrag_strand if not defined $tstrand;
}
my $target_slice;
if ($tstrand < 0) {
$target_slice = $slices->{$tname}->invert;
} else {
$target_slice = $slices->{$tname};
}
return $target_slice;
}
##############################################
# Get/Sets
##############################################
sub alignment_mapper {
my ($self, $val) = @_;
if (defined $val) {
$self->{_alignment_mapper} = $val;
}
return $self->{_alignment_mapper};
}
sub direct_target_slice {
my ($self, $val) = @_;
if (defined $val) {
$self->{_direct_target} = $val;
}
return $self->{_direct_target};
}
sub max_readthrough_dist {
my ($self, $val) = @_;
if (defined $val) {
$self->{_max_readthrough} = $val;
}
return $self->{_max_readthrough};
}
sub from_slice {
my ($self, $val) = @_;
if (defined $val) {
$self->{_from_slice} = $val;
}
return $self->{_from_slice};
}
sub from_mapper {
my ($self, $val) = @_;
if (defined $val) {
$self->{_from_mapper} = $val;
}
return $self->{_from_mapper};
}
sub to_slices {
my ($self, $val) = @_;
if (defined $val) {
$self->{_to_slices} = $val;
}
return $self->{_to_slices};
}
sub to_mapper {
my ($self, $val) = @_;
if (defined $val) {
$self->{_to_mapper} = $val;
}
return $self->{_to_mapper};
}
1;