Raw content of Bio::EnsEMBL::Analysis::Tools::WGA2Genes::CoordUtils =head1 NAME =head1 SYNOPSIS =head1 DESCRIPTION =cut package Bio::EnsEMBL::Analysis::Tools::WGA2Genes::CoordUtils; use strict; use Exporter; use vars qw (@ISA @EXPORT); use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning stack_trace_dump); @ISA = qw(Exporter); @EXPORT = qw(check_consistent_coords distance_between_coords merge_coords extend_coord transcripts_to_coords ); ###################################################### sub check_consistent_coords { my ($cj, $ci) = @_; if ($ci->id ne $cj->id or $ci->strand != $cj->strand) { return 0; } else { if ($ci->strand == 1) { # j should come before i in genomic coords if (not $cj->end < $ci->start) { return 0; } } else { # j should come after i if (not $ci->end < $cj->start) { return 0; } } } return 1; } ###################################################### sub distance_between_coords { my ($cj, $ci) = @_; if ($ci->strand == 1) { # j should come before i in genomic coords return $ci->start - $cj->end - 1; } else { return $cj->start - $ci->end - 1; } } ###################################################### sub merge_coords { my ($ci, $cj) = @_; # assumption: coords are consistent my ($start, $end); if ($cj->start > $ci->start) { ($start, $end) = ($ci->start, $cj->end); } else { ($start, $end) = ($cj->start, $ci->end); } return Bio::EnsEMBL::Mapper::Coordinate->new($ci->id, $start, $end, $ci->strand); } ###################################################### sub separate_coords { my ($ci, $cj, $target_slice) = @_; my ($ci_slice, $cj_slice, $between_slice); my $sa = $target_slice->adaptor; if (defined $ci and defined $cj) { $ci_slice = $sa->fetch_by_region('toplevel', $target_slice->seq_region_name, $ci->start, $ci->end); $cj_slice = $sa->fetch_by_region('toplevel', $target_slice->seq_region_name, $cj->start, $cj->end); $between_slice = $sa->fetch_by_region('toplevel', $target_slice->seq_region_name, $ci->end + 1, $cj->start - 1); } elsif (defined $ci) { $ci_slice = $sa->fetch_by_region('toplevel', $target_slice->seq_region_name, $ci->start, $ci->end); $between_slice = $sa->fetch_by_region('toplevel', $target_slice->seq_region_name, $ci->end + 1, $target_slice->length); } elsif (defined $cj) { $cj_slice = $sa->fetch_by_region('toplevel', $target_slice->seq_region_name, $cj->start, $cj->end); $between_slice = $sa->fetch_by_region('toplevel', $target_slice->seq_region_name, 1, $cj->start - 1); } # following puts the Projection segments back into reference coords. my (@seq_level_i, @seq_level_j, @between); if (defined $ci_slice) { @seq_level_i = map { { from_start => $_->from_start + $ci_slice->start - 1, from_end => $_->from_end + $ci_slice->start - 1, to_Slice => $_->to_Slice, }; } @{$ci_slice->project('seqlevel')}; } if (defined $cj_slice) { @seq_level_j = map { { from_start => $_->from_start + $cj_slice->start - 1, from_end => $_->from_end + $cj_slice->start - 1, to_Slice => $_->to_Slice, }; } @{$cj_slice->project('seqlevel')}; } @between = map { { from_start => $_->from_start + $between_slice->start - 1, from_end => $_->from_end + $between_slice->start - 1, to_Slice => $_->to_Slice, }; } @{$between_slice->project('seqlevel')}; # if the whole between slice is accounted for by seq-level bits, # then the ci and ck cannot be split either side of a gap my $between_seq_coverage = 0; foreach my $bit (@between) { $between_seq_coverage += $bit->{from_end} - $bit->{from_start} +1; } if (# no i coord given, so extend j not @seq_level_i or # no j coord given, so extend i not @seq_level_j or # ci ends in a gap, so the two must be separable $seq_level_i[-1]->{from_end} < $ci->end or # cj starts with a gap so the two must be separable $seq_level_j[0]->{from_start} > $cj->start or # between region contained a sequence gap $between_seq_coverage < $between_slice->length) { # return a pair that extends to the first gap on each side my ($new_left, $new_right); if (defined $ci) { if (not @between or $between[0]->{from_start} > $between_slice->start) { $new_left = Bio::EnsEMBL::Mapper::Coordinate-> new($ci->id, $ci->start, $ci->end, $ci->strand); } else { $new_left = Bio::EnsEMBL::Mapper::Coordinate-> new($ci->id, $ci->start, $between[0]->{from_end}, $ci->strand); } } if (defined $cj) { if (not @between or $between[-1]->{from_end} < $between_slice->end) { $new_right = Bio::EnsEMBL::Mapper::Coordinate-> new($cj->id, $cj->start, $cj->end, $cj->strand); } else { $new_right = Bio::EnsEMBL::Mapper::Coordinate-> new($cj->id, $between[-1]->{from_start}, $cj->end, $cj->strand); } } if (defined $new_left and defined $new_right) { return ($new_left, $new_right); } elsif (defined $new_left and not defined $new_right) { return ($new_left, undef); } elsif (defined $new_right and not defined $new_left) { return (undef, $new_right); } } return (undef, undef); } ############################################################ sub extend_coord { my ($coord, $target_slice) = @_; # this method pushes $coord out to the left and right # to the point of the next sequence-level gap throw("Cannot extend coord; wrong slice given") if $coord->id ne $target_slice->seq_region_name; my @proj = @{$target_slice->project('seqlevel')}; my ($new_start, $new_end, $up_seg, $down_seg); foreach my $seg (@proj) { if ($seg->from_start <= $coord->start and $seg->from_end >= $coord->start) { $new_start = $seg->from_start; } elsif ($seg->from_start <= $coord->start) { $up_seg = $seg; } if ($seg->from_start <= $coord->end and $seg->from_end >= $coord->end) { $new_end = $seg->from_end; } elsif ($seg->from_end >= $coord->end) { $down_seg = $seg if not defined $down_seg; } } throw("Could not find sequence-level pieces for " . $coord->id . "/" . $coord->start . "-" . $coord->end) if not defined $new_start or not defined $new_end; my $ex_coord = Bio::EnsEMBL::Mapper::Coordinate->new($coord->id, $new_start, $new_end, $coord->strand); my ($up_coord, $down_coord); if (defined $up_seg) { $up_coord = Bio::EnsEMBL::Mapper::Coordinate ->new($coord->id, $up_seg->from_start, $up_seg->from_end, $coord->strand); } if (defined $down_seg) { $down_coord = Bio::EnsEMBL::Mapper::Coordinate ->new($coord->id, $down_seg->from_start, $down_seg->from_end, $coord->strand); } return ($ex_coord, $up_coord, $down_coord); } ###################################################### sub transcripts_to_coords { my ($map, $from_cs_name, @trans) = @_; my (@all_good_bits, @results); foreach my $tr (@trans) { my @exons = sort {$a->start <=> $b->start} @{$tr->get_all_translateable_Exons}; my @bits; for(my $i = 0; $i < @exons; $i++) { my $e = $exons[$i]; my @c = $map->map_coordinates($e->slice->seq_region_name, $e->start, $e->end, 1, $from_cs_name); my $current_pos = $e->start; foreach my $c (@c) { my $current_end = $current_pos + $c->length - 1; if ($c->isa("Bio::EnsEMBL::Mapper::Coordinate")) { push @bits, Bio::EnsEMBL::Mapper::Coordinate->new($e->slice->seq_region_name, $current_pos, $current_end, 1); } else { push @bits, $c; } $current_pos += $c->length; } } # policy: only consider gaps as worth filling if they are flanked # by aligned sequence that contains at least one full codon my $total_bps = 0; my $first_good = -1; for(my $i=0; $i < @bits; $i++) { if ($bits[$i]->isa("Bio::EnsEMBL::Mapper::Coordinate")) { my $spare_start = (3 - ($total_bps % 3)) % 3; my $spare_end = ($bits[$i]->length - $spare_start) % 3; my $codons = ($bits[$i]->length - $spare_start - $spare_end) / 3; if ($codons >= 1) { $first_good = $i; last; } } $total_bps += $bits[$i]->length; } next if $first_good < 0; @bits = splice(@bits, $first_good, scalar(@bits) - $first_good); $total_bps = 0; $first_good = -1; for(my $i=scalar(@bits) - 1; $i >= 0; $i--) { if ($bits[$i]->isa("Bio::EnsEMBL::Mapper::Coordinate")) { my $spare_end = (3 - ($total_bps % 3)) % 3; my $spare_start = ($bits[$i]->length - $spare_end) % 3; my $codons = ($bits[$i]->length - $spare_start - $spare_end) / 3; if ($codons >= 1) { $first_good = $i; last; } } $total_bps += $bits[$i]->length; } next if $first_good < 0; @bits = splice(@bits, 0, $first_good + 1); push @all_good_bits, @bits; } @all_good_bits = sort { $a->start <=> $b->start } @all_good_bits; foreach my $g (@all_good_bits) { if (not @results or $results[-1]->end < $g->start - 1) { push @results, Bio::EnsEMBL::Mapper::Coordinate-> new($trans[0]->slice->seq_region_name, $g->start, $g->end, 1); } else { if ($g->end > $results[-1]->end) { $results[-1]->end($g->end); } } } return \@results; } 1;