None available.
sub _construct_sequence
{ my ($gen_al_blocks,
$map,
$transcripts,
$from_slice,
$to_slices,
$max_readthrough_dist,
$extend_into_gaps,
$add_gaps,
$direct_target_slice) = @_;
my @block_coord_pairs;
foreach my $bl (@{$gen_al_blocks}) {
my $qy_al = $bl->reference_genomic_align;
my ($tg_al) = @{$bl->get_all_non_reference_genomic_aligns};
my $from_coord = Bio::EnsEMBL::Mapper::Coordinate->new($qy_al->dnafrag->name,
$qy_al->dnafrag_start,
$qy_al->dnafrag_end,
1);
my $to_coord = Bio::EnsEMBL::Mapper::Coordinate->new($tg_al->dnafrag->name,
$tg_al->dnafrag_start,
$tg_al->dnafrag_end,
$tg_al->dnafrag_strand);
my $pair = Bio::EnsEMBL::Mapper::Pair->new($from_coord,
$to_coord);
push @block_coord_pairs, $pair;
}
@block_coord_pairs = sort { $a->from->start <=> $b->from->start } @block_coord_pairs;
my $tran_coords = transcripts_to_coords($map,
$FROM_CS_NAME,
@$transcripts);
my @fillable_gaps;
foreach my $tc (sort { $a->start <=> $b->start } @$tran_coords) {
my $current_pos = $tc->start;
foreach my $c ($map->map_coordinates($tc->id,
$tc->start,
$tc->end,
1,
$FROM_CS_NAME)) {
my $from_coord = Bio::EnsEMBL::Mapper::Coordinate->new($tc->id,
$current_pos,
$current_pos + $c->length - 1,
1);
$current_pos += $c->length;
if ($c->isa("Bio::EnsEMBL::Mapper::Gap")) {
my $overlaps_block = 0;
foreach my $bl (@block_coord_pairs) {
if ($bl->from->start <= $from_coord->end and
$bl->from->end >= $from_coord->start) {
$overlaps_block = 1;
last;
}
}
if (not $overlaps_block) {
push @fillable_gaps, Bio::EnsEMBL::Mapper::Pair->new($from_coord,
$c);
}
}
}
}
my @all_coord_pairs = (@block_coord_pairs, @fillable_gaps);
@all_coord_pairs = sort { $a->from->start <=> $b->from->start } @all_coord_pairs;
my @extend_pairs;
if ($extend_into_gaps) {
my (%pairs_to_remove, @replacements);
for(my $i=0; $i < @all_coord_pairs; $i++) {
my $this_pair = $all_coord_pairs[$i];
if ($this_pair->to->isa("Bio::EnsEMBL::Mapper::Gap")) {
my ($left_non_gap, $right_non_gap);
for(my $j=$i-1; $j>=0; $j--) {
if ($all_coord_pairs[$j]->to->
isa("Bio::EnsEMBL::Mapper::Coordinate")) {
$left_non_gap = $all_coord_pairs[$j];
last;
}
}
for(my $j=$i+1; $j < @all_coord_pairs; $j++) {
if ($all_coord_pairs[$j]->to->
isa("Bio::EnsEMBL::Mapper::Coordinate")) {
$right_non_gap = $all_coord_pairs[$j];
last;
}
}
my ($ex_left, $ex_left_up, $ex_left_down) =
extend_coord($left_non_gap->to,
$to_slices->{$left_non_gap->to->id});
my ($ex_right, $ex_right_up, $ex_right_down) =
extend_coord($right_non_gap->to,
$to_slices->{$right_non_gap->to->id});
if (not check_consistent_coords($left_non_gap->to,
$right_non_gap->to) or
$ex_left->start > $ex_right->end or
$ex_left->end < $ex_right->start) {
if ($left_non_gap->from->end == $this_pair->from->start - 1 and
$right_non_gap->from->start == $this_pair->from->end + 1) {
my $remove_coord = 1;
my (@replace_coord);
if ($left_non_gap->to->strand > 0 and
$ex_left->end - $left_non_gap->to->end <= $NEAR_CONTIG_END) {
$remove_coord = 0;
if (defined $ex_left_down and
$this_pair->to->length <= $ex_left_down->start - $ex_left->end - 1) {
push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate
->new($ex_left->id,
$ex_left->end + 1,
$ex_left->end + $this_pair->to->length,
$ex_left->strand);
}
} elsif ($left_non_gap->to->strand < 0 and
$ex_left->start - $left_non_gap->to->start <= $NEAR_CONTIG_END) {
$remove_coord = 0;
if (defined $ex_left_up and
$this_pair->to->length <= $ex_left->start - $ex_left_up->end - 1) {
push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate
->new($ex_left->id,
$ex_left->start - $this_pair->to->length,
$ex_left->start - 1,
$ex_left->strand);
}
}
if ($right_non_gap->to->strand > 0 and
$right_non_gap->to->start - $ex_right->start <= $NEAR_CONTIG_END) {
$remove_coord = 0;
if (defined $ex_right_up and
$this_pair->to->length <= $ex_right->start - $ex_right_up->end - 1) {
push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate
->new($ex_right->id,
$ex_right->start - $this_pair->to->length,
$ex_right->start - 1,
$ex_right->strand);
}
} elsif ($right_non_gap->to->strand < 0 and
$ex_right->end - $right_non_gap->to->end <= $NEAR_CONTIG_END) {
$remove_coord = 0;
if (defined $ex_right_down and
$this_pair->to->length <= $ex_right_down->start - $ex_right->end - 1) {
push @replace_coord, Bio::EnsEMBL::Mapper::Coordinate
->new($ex_right->id,
$ex_right->end + 1,
$ex_right->end + $this_pair->to->length,
$ex_right->strand);
}
}
if ($remove_coord) {
$pairs_to_remove{$this_pair} = 1;
} elsif (@replace_coord) {
push @replacements, [$this_pair,
$replace_coord[0]];
}
} elsif ($left_non_gap->from->end == $this_pair->from->start - 1) {
if ($left_non_gap->to->strand > 0 and
$ex_left->end - $left_non_gap->to->end <= $NEAR_CONTIG_END) {
if (defined $ex_left_down and
$this_pair->to->length <= $ex_left_down->start - $ex_left->end - 1) {
push @replacements, [$this_pair,
Bio::EnsEMBL::Mapper::Coordinate
->new($ex_left->id,
$ex_left->end + 1,
$ex_left->end + $this_pair->to->length,
$ex_left->strand)];
}
} elsif ($left_non_gap->to->strand < 0 and
$ex_left->start - $left_non_gap->to->start <= $NEAR_CONTIG_END) {
if (defined $ex_left_up and
$this_pair->to->length <= $ex_left->start - $ex_left_up->end - 1) {
push @replacements, [$this_pair,
Bio::EnsEMBL::Mapper::Coordinate
->new($ex_left->id,
$ex_left->start - $this_pair->to->length,
$ex_left->start - 1,
$ex_left->strand)];
}
} else {
$pairs_to_remove{$this_pair} = 1;
}
} elsif ($right_non_gap->from->start == $this_pair->from->end + 1) {
if ($right_non_gap->to->strand > 0 and
$right_non_gap->to->start - $ex_right->start <= $NEAR_CONTIG_END) {
if (defined $ex_right_up and
$this_pair->to->length <= $ex_right->start - $ex_right_up->end - 1) {
push @replacements, [$this_pair,
Bio::EnsEMBL::Mapper::Coordinate
->new($ex_right->id,
$ex_right->start - $this_pair->to->length,
$ex_right->start - 1,
$ex_right->strand)];
}
} elsif ($right_non_gap->to->strand < 0 and
$ex_right->end - $right_non_gap->to->end <= $NEAR_CONTIG_END) {
if (defined $ex_right_down and
$this_pair->to->length <= $ex_right_down->start - $ex_right->end - 1) {
push @replacements, [$this_pair,
Bio::EnsEMBL::Mapper::Coordinate
->new($ex_right->id,
$ex_right->end + 1,
$ex_right->end + $this_pair->to->length,
$ex_right->strand)];
}
} else {
$pairs_to_remove{$this_pair} = 1;
}
}
}
else {
$pairs_to_remove{$this_pair} = 1;
}
}
}
@all_coord_pairs = grep { not exists $pairs_to_remove{$_} } @all_coord_pairs;
@replacements = sort {
$a->[1]->id cmp $b->[1]->id or
$a->[1]->start <=> $b->[1]->start;
} @replacements;
while(@replacements) {
my $el = shift @replacements;
my ($pair, $rep) = @$el;
my $fill = 1;
if (@replacements) {
my $nel = shift @replacements;
my ($npair, $nrep) = @$nel;
if ($rep->id eq $nrep->id and
$rep->start <= $nrep->end and
$rep->end >= $nrep->start) {
$fill = 0;
} else {
unshift @replacements, $nel;
}
}
if ($fill) {
$pair->to($rep);
push @extend_pairs, $pair;
}
}
}
if (not $add_gaps) {
@all_coord_pairs = grep { not $_->to->isa("Bio::EnsEMBL::Mapper::Gap") } @all_coord_pairs;
}
my @merged_pairs;
for(my $i=0; $i<@all_coord_pairs; $i++) {
my $this_pair = $all_coord_pairs[$i];
if ($this_pair->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and
@merged_pairs and
$merged_pairs[-1]->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and
check_consistent_coords($merged_pairs[-1]->to,
$this_pair->to)) {
my $dist = distance_between_coords($merged_pairs[-1]->to,
$this_pair->to);
if ($dist <= $max_readthrough_dist) {
my $last_pair = pop @merged_pairs;
my $new_from = merge_coords($last_pair->from,
$this_pair->from);
my $new_to = merge_coords($last_pair->to,
$this_pair->to);
my $overlap = 0;
foreach my $tg (@merged_pairs) {
if ($tg->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and
$tg->to->id eq $new_to->id and
$tg->to->start < $new_to->end and
$tg->to->end > $new_to->start) {
$overlap = 1;
last;
}
}
if (not $overlap) {
for (my $j=$i+1; $j < @all_coord_pairs; $j++) {
my $tg = $all_coord_pairs[$j];
if ($tg->to->isa("Bio::EnsEMBL::Mapper::Coordinate") and
$tg->to->id eq $new_to->id and
$tg->to->start < $new_to->end and
$tg->to->end > $new_to->start) {
$overlap = 1;
last;
}
}
}
if ($overlap) {
push @merged_pairs, $last_pair, $this_pair;
} else {
push @merged_pairs, Bio::EnsEMBL::Mapper::Pair->new($new_from,
$new_to);
}
} else {
push @merged_pairs, $this_pair;
}
}
else {
push @merged_pairs, $this_pair;
}
}
my $t_map = Bio::EnsEMBL::Mapper->new($TO_CS_NAME,
$GENE_SCAFFOLD_CS_NAME);
my $q_map = Bio::EnsEMBL::Mapper->new($FROM_CS_NAME,
$GENE_SCAFFOLD_CS_NAME);
my ($seq, $last_end_pos) = ("", 0);
for(my $i=0; $i < @merged_pairs; $i++) {
my $pair = $merged_pairs[$i];
if ($direct_target_slice) {
if ($pair->to->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my ($gs_start, $gs_end);
if ($direct_target_slice->strand < 0) {
$gs_start = $direct_target_slice->length - $pair->to->end + 1;
$gs_end = $direct_target_slice->length - $pair->to->start + 1;
} else {
$gs_start = $pair->to->start;
$gs_end = $pair->to->end;
}
$t_map->add_map_coordinates($pair->to->id,
$pair->to->start,
$pair->to->end,
$pair->to->strand,
$GENE_SCAFFOLD_CS_NAME,
$gs_start,
$gs_end);
}
} else{
if ($pair->to->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my $slice = $to_slices->{$pair->to->id};
my $this_seq = $slice->subseq($pair->to->start, $pair->to->end);
if ($pair->to->strand < 0) {
reverse_comp(\$this_seq);
}
$seq .= $this_seq;
$t_map->add_map_coordinates($pair->to->id,
$pair->to->start,
$pair->to->end,
$pair->to->strand,
$GENE_SCAFFOLD_CS_NAME,
$last_end_pos + 1,
$last_end_pos + $pair->to->length);
} else {
$seq .= ('n' x $pair->from->length);
$q_map->add_map_coordinates($from_slice->seq_region_name,
$pair->from->start,
$pair->from->end,
1,
$GENE_SCAFFOLD_CS_NAME,
$last_end_pos + 1,
$last_end_pos + $pair->from->length);
}
if ($i < @merged_pairs - 1) {
$last_end_pos +=
$pair->to->length + $INTERPIECE_PADDING;
$seq .= ('n' x $INTERPIECE_PADDING);
}
}
}
foreach my $pair (@extend_pairs) {
my ($coord) = $t_map->map_coordinates($pair->to->id,
$pair->to->start,
$pair->to->end,
$pair->to->strand,
$TO_CS_NAME);
$q_map->add_map_coordinates($from_slice->seq_region_name,
$pair->from->start,
$pair->from->end,
1,
$GENE_SCAFFOLD_CS_NAME,
$coord->start,
$coord->end);
}
my @aln_coords = $map->map_coordinates($from_slice->seq_region_name,
$from_slice->start,
$from_slice->end,
1,
$FROM_CS_NAME);
my $current_start = $from_slice->start;
foreach my $c (@aln_coords) {
my $current_end = $current_start + $c->length - 1;
if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my ($coord) = $t_map->map_coordinates($c->id,
$c->start,
$c->end,
$c->strand,
$TO_CS_NAME);
$q_map->add_map_coordinates($from_slice->seq_region_name,
$current_start,
$current_end,
1,
$GENE_SCAFFOLD_CS_NAME,
$coord->start,
$coord->end);
}
$current_start += $c->length;
}
return ($seq, $q_map, $t_map);
}
} |
sub new
{ my ($caller, %given_args) = @_;
my $class = ref($caller) || $caller;
my ($name,
$genomic_align_blocks,
$transcripts,
$from_slice,
$to_slices,
$max_readthrough_dist,
$extend_into_gaps,
$add_gaps,
$direct_target_slice,
) = rearrange([qw(
NAME
GENOMIC_ALIGN_BLOCKS
TRANSCRIPTS
FROM_SLICE
TO_SLICES
MAX_READTHROUGH_DIST
EXTEND_INTO_GAPS
ADD_GAPS
DIRECT_TARGET_COORDS
)], %given_args);
_check_transcripts($transcripts);
$name = "GeneScaffold" if not defined $name;
$max_readthrough_dist = 15 if not defined $max_readthrough_dist;
if ($direct_target_slice) {
if ($add_gaps or $extend_into_gaps) {
warning("GeneScaffold: cannot use -direct_target_coords with either of " .
"-add_gaps or -extend_into_gaps; unsetting both");
$add_gaps = 0;
$extend_into_gaps = 0;
}
$direct_target_slice = _check_direct_target_coordinates($genomic_align_blocks,
$to_slices);
}
my $aln_map = _make_alignment_mapper($genomic_align_blocks);
my ($gs_seq, $from_mapper, $to_mapper) =
_construct_sequence($genomic_align_blocks,
$aln_map,
$transcripts,
$from_slice,
$to_slices,
$max_readthrough_dist,
$extend_into_gaps,
$add_gaps,
$direct_target_slice);
return undef if not defined $from_mapper;
my $gs_end = $direct_target_slice
? $direct_target_slice->length
: length($gs_seq);
my $self = $class->SUPER::new(-coord_system =>
Bio::EnsEMBL::CoordSystem->new(-name => $GENE_SCAFFOLD_CS_NAME,
-rank => 1),
-seq_region_name => $name,
-seq => ( $gs_seq eq "" ? undef : $gs_seq),
-start => 1,
-end => $gs_end,
);
$self->direct_target_slice($direct_target_slice)
if defined $direct_target_slice;
$self->max_readthrough_dist($max_readthrough_dist)
if defined $max_readthrough_dist;
$self->from_slice($from_slice);
$self->to_slices($to_slices);
$self->alignment_mapper($aln_map);
$self->from_mapper($from_mapper);
$self->to_mapper($to_mapper);
return $self;
}
} |
sub place_transcript
{ my ($self,
$tran,
$add_attributes,
$external_db_id) = @_;
my (@all_coords, @new_exons);
my @orig_exons = @{$tran->get_all_translateable_Exons};
@orig_exons = sort { $a->start <=> $b->start } @orig_exons;
my $orig_exon_coords = transcripts_to_coords($self->alignment_mapper,
$FROM_CS_NAME,
$tran);
my $restricted_range;
if (@$orig_exon_coords) {
$restricted_range = Bio::EnsEMBL::Mapper::Coordinate->new($orig_exon_coords->[0]->id,
$orig_exon_coords->[0]->start,
$orig_exon_coords->[-1]->end,
$orig_exon_coords->[0]->strand);
} else {
$restricted_range = Bio::EnsEMBL::Mapper::Coordinate->new($orig_exons[0]->slice->seq_region_name,
$orig_exons[0]->start,
$orig_exons[-1]->end,
$orig_exons[0]->strand);
}
my %filled_coords;
foreach my $orig_exon (@orig_exons) {
my @crds = $self->from_mapper->map_coordinates($orig_exon->slice->seq_region_name,
$orig_exon->start,
$orig_exon->end,
1,
$FROM_CS_NAME);
foreach my $c (@crds) {
if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my ($tc) = $self->to_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
$c->start,
$c->end,
1,
$GENE_SCAFFOLD_CS_NAME);
if ($tc->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my @alncrds = $self->alignment_mapper->map_coordinates($tc->id,
$tc->start,
$tc->end,
$tc->strand,
$TO_CS_NAME);
my $current_start = $c->start;
foreach my $ac (@alncrds) {
my $current_end = $current_start + $ac->length - 1;
my $newc = Bio::EnsEMBL::Mapper::Coordinate->new($c->id,
$current_start,
$current_end,
$c->strand);
$current_start += $ac->length;
if ($ac->isa("Bio::EnsEMBL::Mapper::Gap")) {
$filled_coords{$newc} = 1;
}
push @all_coords, $newc;
}
} else {
$filled_coords{$c} = 1;
push @all_coords, $c;
}
} else {
push @all_coords, $c;
}
}
}
my @kept_coords;
foreach my $c (@all_coords) {
if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate") and
exists $filled_coords{$c}) {
my ($oc) = $self->from_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
$c->start,
$c->end,
1,
$GENE_SCAFFOLD_CS_NAME);
if ($oc->start >= $restricted_range->start and
$oc->end <= $restricted_range->end) {
push @kept_coords, $c;
} else {
push @kept_coords, Bio::EnsEMBL::Mapper::Gap->new(1, $c->length);
}
} else {
push @kept_coords, $c;
}
}
@all_coords = @kept_coords;
my $need_another_pass;
do {
$need_another_pass = 0;
my (@proc_coords, @gap_indices);
foreach my $c (@all_coords) {
if ($c->isa("Bio::EnsEMBL::Mapper::Gap")) {
if (@proc_coords and
$proc_coords[-1]->isa("Bio::EnsEMBL::Mapper::Gap")) {
$proc_coords[-1]->end( $proc_coords[-1]->end + $c->length );
} else {
push @proc_coords, Bio::EnsEMBL::Mapper::Gap->new(1,
$c->length);
push @gap_indices, scalar(@proc_coords) - 1;
}
} else {
push @proc_coords, $c;
}
}
GAP: foreach my $idx (@gap_indices) {
my $gap = $proc_coords[$idx];
my $frameshift = $gap->length % 3;
if ($frameshift) {
my $bases_to_remove = 3 - $frameshift;
my ($left_surplus, $right_surplus) = (0,0);
for(my $j=$idx-1; $j >= 0; $j--) {
$left_surplus += $proc_coords[$j]->length;
}
for(my $j=$idx+1; $j < @proc_coords; $j++) {
$right_surplus += $proc_coords[$j]->length;
}
$left_surplus = $left_surplus % 3;
$right_surplus = $right_surplus % 3;
if ($left_surplus) {
$bases_to_remove = $left_surplus;
my $left_coord = $proc_coords[$idx - 1];
if ($left_coord->length > $bases_to_remove) {
$gap->end($gap->end + $bases_to_remove);
$left_coord->end( $left_coord->end - $bases_to_remove );
} else {
$proc_coords[$idx-1] =
Bio::EnsEMBL::Mapper::Gap->new(1,$left_coord->length);
}
}
if ($right_surplus) {
$bases_to_remove = $right_surplus;
my $right_coord = $proc_coords[$idx + 1];
if ($right_coord->length > $bases_to_remove) {
$gap->end($gap->end + $bases_to_remove);
$right_coord->start( $right_coord->start + $bases_to_remove);
} else {
$proc_coords[$idx+1] =
Bio::EnsEMBL::Mapper::Gap->new(1,$right_coord->length);
}
}
$need_another_pass = 1;
last GAP;
}
}
@all_coords = @proc_coords;
} while ($need_another_pass);
my $start_not_found = $all_coords[0]->isa("Bio::EnsEMBL::Mapper::Gap") ? 1 : 0;
my $end_not_found = $all_coords[-1]->isa("Bio::EnsEMBL::Mapper::Gap") ? 1 : 0;
@all_coords = grep { $_->isa("Bio::EnsEMBL::Mapper::Coordinate") } @all_coords;
return 0 if not @all_coords;
my $sl = $self->direct_target_slice
? $self->direct_target_slice
: $self;
foreach my $coord (@all_coords) {
push @new_exons, Bio::EnsEMBL::Exon->new(-start => $coord->start,
-end => $coord->end,
-strand => $tran->strand,
-slice => $sl);
}
if ($tran->strand < 0) {
@new_exons = sort { $b->start <=> $a->start } @new_exons;
} else {
@new_exons = sort { $a->start <=> $b->start } @new_exons;
}
my $source_pep = $tran->translate->seq;
my $source_cds_len = 0;
map { $source_cds_len += $_->length } @orig_exons;
if (length($source_pep) + 1 == ($source_cds_len / 3)) { # stop was removed $source_pep .= "*"; }
my $transcript_aligned_aas = 0;
my $transcript_identical_aas = 0;
my $incomplete_codon_bps = 0;
my ($previous_exon);
foreach my $exon (@new_exons) {
if (defined $previous_exon) {
$exon->phase($previous_exon->end_phase);
} else {
$exon->phase(0);
}
$exon->end_phase((($exon->length) + $exon->phase)%3);
my $exon_aligned_aas = 0;
my $exon_identical_aas = 0;
my $extent_start = $exon->start;
my $extent_end = $exon->end;
if ($exon->strand > 0) {
$extent_start += 3 - $exon->phase if $exon->phase;
$extent_end -= $exon->end_phase if $exon->end_phase;
} else {
$extent_start += $exon->end_phase if $exon->end_phase;
$extent_end -= 3 - $exon->phase if $exon->phase;
}
if ($extent_end > $extent_start) {
$incomplete_codon_bps += ($extent_start - $exon->start);
$incomplete_codon_bps += ($exon->end - $extent_end);
my @gen_coords = $self->from_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
$extent_start,
$extent_end,
1,
$GENE_SCAFFOLD_CS_NAME);
my @fps;
my $cur_gs_start = $extent_start;
foreach my $g_coord (@gen_coords) {
my $cur_gs_end = $cur_gs_start + $g_coord->length - 1;
if ($g_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
my ($p_coord) = $tran->genomic2pep($g_coord->start,
$g_coord->end,
$tran->strand);
my $p_substr = uc(substr($source_pep, $p_coord->start - 1, $p_coord->length));
my $gs_reg = $sl->subseq($cur_gs_start, $cur_gs_end, $exon->strand);
my $gs_p_substr = uc(Bio::PrimarySeq->new(-seq => $gs_reg)->translate->seq);
if (length($p_substr) != length ($gs_p_substr)) {
warning("Pep segments differ; cannot calculate percentage identity");
} else {
$exon_aligned_aas += length($p_substr);
my @p1 = split(//, $p_substr);
my @p2 = split(//, $gs_p_substr);
for(my $i=0; $i < @p1; $i++) {
if ($p1[$i] eq $p2[$i]) {
$exon_identical_aas++;
}
}
}
my $fp = Bio::EnsEMBL::FeaturePair->
new(-seqname => $self->seq_region_name,
-start => $cur_gs_start,
-end => $cur_gs_end,
-strand => $exon->strand,
-score => 100.0,
-hseqname => $tran->translation->stable_id,
-hstart => $p_coord->start,
-external_db_id => $external_db_id,
-hend => $p_coord->end,
-hstrand => $p_coord->strand,
-slice => $sl);
push @fps, $fp;
}
$cur_gs_start += $g_coord->length;
}
if (@fps) {
my $f = Bio::EnsEMBL::DnaPepAlignFeature->new(-features =>\@ fps);
$f->percent_id(100 * ($exon_identical_aas / $exon_aligned_aas)); $exon->add_supporting_features($f);
}
} else {
$incomplete_codon_bps += $exon->length;
}
$transcript_aligned_aas += $exon_aligned_aas;
$transcript_identical_aas += $exon_identical_aas;
$previous_exon = $exon;
}
$transcript_aligned_aas += int($incomplete_codon_bps / 3); $transcript_identical_aas += int($incomplete_codon_bps / 3);
my @merged_exons;
foreach my $exon (@new_exons) {
if (@merged_exons) {
my $prev_exon = pop @merged_exons;
my ($new_start, $new_end);
if ($tran->strand < 0) {
my $intron_len = $prev_exon->start - $exon->end - 1;
if ($intron_len % 3 == 0 and
$intron_len <= $self->max_readthrough_dist) {
$new_start = $exon->start;
$new_end = $prev_exon->end;
}
} else {
my $intron_len = $exon->start - $prev_exon->end - 1;
if ($intron_len % 3 == 0 and
$intron_len <= $self->max_readthrough_dist) {
$new_start = $prev_exon->start;
$new_end = $exon->end;
}
}
if (defined $new_start and defined $new_end) {
my $merged_exon = Bio::EnsEMBL::Exon->
new(-start => $new_start,
-end => $new_end,
-strand => $tran->strand,
-phase => $prev_exon->phase,
-end_phase => $exon->end_phase,
-slice => $exon->slice);
my @ug_feats;
my $len_l = 0;
my $pid_l = 0;
if (@{$prev_exon->get_all_supporting_features}) {
my ($sf) = @{$prev_exon->get_all_supporting_features};
$pid_l = $sf->percent_id;
foreach my $ug ($sf->ungapped_features) {
push @ug_feats, $ug;
$len_l += $ug->length;
}
}
my $len_r = 0;
my $pid_r = 0;
if (@{$exon->get_all_supporting_features}) {
my ($sf) = @{$exon->get_all_supporting_features};
$pid_r = $sf->percent_id;
foreach my $ug ($sf->ungapped_features) {
push @ug_feats, $ug;
$len_r += $ug->length;
}
}
if (@ug_feats) {
map { $_->percent_id(100) } @ug_feats;
my $new_sup_feat = Bio::EnsEMBL::DnaPepAlignFeature->
new(-features =>\@ ug_feats);
$new_sup_feat->percent_id((($len_l * $pid_l) + ($len_r * $pid_r))
/ ($len_l + $len_r)); $merged_exon->add_supporting_features($new_sup_feat);
}
push @merged_exons, $merged_exon;
next;
} else {
push @merged_exons, $prev_exon;
push @merged_exons, $exon;
}
} else {
push @merged_exons, $exon;
}
}
my $proj_tran = Bio::EnsEMBL::Transcript->new();
map { $proj_tran->add_Exon($_) } @merged_exons;
my (@trans_fps);
foreach my $exon (@merged_exons) {
if (@{$exon->get_all_supporting_features}) {
my ($sf) = @{$exon->get_all_supporting_features};
my @e_fps = $sf->ungapped_features;
map { $_->percent_id(100) } @e_fps;
push @trans_fps, @e_fps;
}
}
if (@trans_fps) {
my $t_sf = Bio::EnsEMBL::DnaPepAlignFeature->
new(-features =>\@ trans_fps);
$t_sf->score(100 * ($transcript_aligned_aas / length($source_pep))); $t_sf->hcoverage( 100 * ($transcript_aligned_aas / length($source_pep))); $t_sf->percent_id(100 * ($transcript_identical_aas / $transcript_aligned_aas)); $proj_tran->add_supporting_features($t_sf);
foreach my $exon (@{$proj_tran->get_all_Exons}) {
foreach my $exon_sf (@{$exon->get_all_supporting_features}) {
$exon_sf->hcoverage($t_sf->hcoverage);
}
}
} else {
return 0;
}
my $translation = Bio::EnsEMBL::Translation->new();
$translation->start_Exon($merged_exons[0]);
$translation->start(1);
$translation->end_Exon($merged_exons[-1]);
$translation->end($merged_exons[-1]->end - $merged_exons[-1]->start + 1);
$proj_tran->translation($translation);
my $pep = $proj_tran->translate;
if (not defined $pep) {
return 0;
}
my $prop_non_gap = 100 - (100 * (($pep->seq =~ tr/X/X/) / $pep->length)); my $num_stops = $pep->seq =~ tr/\*/\*/;
my $num_gaps = $pep->seq =~ tr/X/X/;
my @attributes;
my $perc_id_attr = Bio::EnsEMBL::Attribute->
new(-code => 'HitSimilarity',
-name => 'hit similarity',
-description => 'percentage id to parent transcripts',
-value => sprintf("%.1f",
100 * ($transcript_identical_aas / $transcript_aligned_aas))); push @attributes, $perc_id_attr;
my $cov_attr = Bio::EnsEMBL::Attribute->
new(-code => 'HitCoverage',
-name => 'hit coverage',
-description => 'coverage of parent transcripts',
-value => sprintf("%.1f",
100 * ($transcript_aligned_aas / length($source_pep)))); push @attributes, $cov_attr;
my $gap_attr = Bio::EnsEMBL::Attribute->
new(-code => 'PropNonGap',
-name => 'proportion non gap',
-description => 'proportion non gap',
-value => sprintf("%.1f",
$prop_non_gap));
push @attributes, $gap_attr;
my $prop_non_gap_of_coverage_attr = Bio::EnsEMBL::Attribute->
new(-code => 'NonGapHCov',
-name => 'proportion non gap of hit coverage',
-description => 'proportion non gap of hit coverage',
-value => sprintf("%.1f",
100 * (($transcript_aligned_aas - $num_gaps) / length($source_pep)))); push @attributes, $prop_non_gap_of_coverage_attr;
if ($start_not_found and $tran->strand > 0 or
$end_not_found and $tran->strand < 0) {
my $attr = Bio::EnsEMBL::Attribute->
new(-code => 'StartNotFound',
-name => 'start not found',
-description => 'start not found',
-value => 1);
push @attributes, $attr;
}
if ($end_not_found and $tran->strand > 0 or
$start_not_found and $tran->strand < 0) {
my $attr = Bio::EnsEMBL::Attribute->
new(-code => 'EndNotFound',
-name => 'end not found',
-description => 'end not found',
-value => 1);
push @attributes, $attr;
}
my $stop_attr = Bio::EnsEMBL::Attribute->
new(-code => 'NumStops',
-name => 'number of stops',
-desc => 'Number of stops before editing',
-value => $num_stops);
push @attributes, $stop_attr;
my $gap_exons = 0;
foreach my $e (@{$proj_tran->get_all_Exons}) {
my ($coord) = $self->to_mapper->map_coordinates($GENE_SCAFFOLD_CS_NAME,
$e->start,
$e->end,
1,
$GENE_SCAFFOLD_CS_NAME);
if ($coord->isa("Bio::EnsEMBL::Mapper::Gap")) {
$gap_exons++;
}
}
my $gap_exon_attr = Bio::EnsEMBL::Attribute->
new(-code => 'GapExons',
-name => 'gap exons',
-description => 'number of gap exons',
-value => $gap_exons);
push @attributes, $gap_exon_attr;
my $tranid_attr = Bio::EnsEMBL::Attribute->
new(-code => 'SourceTran',
-name => 'source transcript',
-description => 'source transcript',
-value => $tran->stable_id);
push @attributes, $tranid_attr;
if ($add_attributes) {
$proj_tran->add_Attributes(@attributes);
}
if ($self->direct_target_slice and
$self->direct_target_slice->strand < 0) {
$proj_tran = $proj_tran->transfer($self->direct_target_slice->invert);
}
return $proj_tran; } |