Bio::EnsEMBL::Analysis::Tools::WGA2Genes ChainUtils
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Utils::Exception qw ( verbose throw warning )
Exporter
Inherit
Exporter
Synopsis
Description
Methods
filter_block_overlap_chains
No description
Code
filter_inconsistent_chains
No description
Code
filter_irrelevant_chains
No description
Code
flatten_chains
No description
Code
stringify_chains
No description
Code
Methods description
None available.
Methods code
filter_block_overlap_chainsdescriptionprevnextTop
sub filter_block_overlap_chains {
  my ($raw_chains, $given_blocks) = @_;

  my @chains_to_process;

  foreach my $chain (@$raw_chains) {
    # if this chain conatains a block that overlaps with any of the 
# excluded regions on the target, ignore it.
my $keep_chain = 1; CHAIN: foreach my $block (@$chain) { my ($tga) = @{$block->get_all_non_reference_genomic_aligns}; foreach my $ex_block (@$given_blocks) { my ($ex_tga) = @{$ex_block->get_all_non_reference_genomic_aligns}; if ($tga->dnafrag->name eq $ex_tga->dnafrag->name and $tga->dnafrag_end >= $ex_tga->dnafrag_start and $tga->dnafrag_start <= $ex_tga->dnafrag_end) { $keep_chain = 0; last CHAIN; } } } if ($keep_chain) { push @chains_to_process, $chain; } } return\@ chains_to_process; } ###################################################################
# FUNCTION: filter_inconsistent_chains
#
# Decription:
#
# Progressively projects chains on to the query sequence (in order
# of score), rejecting chains that "interfere" with the set of
# chains retained so far. Interference is defined as an overlap
# in the query at the block level
###################################################################
}
filter_inconsistent_chainsdescriptionprevnextTop
sub filter_inconsistent_chains {
  my ($input_chains, $overlap_chain_thresh) = @_;

  my (@sorted_chains, @to_ignore_chains, @kept_chains);

  foreach my $bl_list (@$input_chains) {
    my $qga_l = $bl_list->[0]->reference_genomic_align;
    my $qga_r = $bl_list->[-1]->reference_genomic_align;
    my ($tga_l) = @{$bl_list->[0]->get_all_non_reference_genomic_aligns};
    my ($tga_r) = @{$bl_list->[-1]->get_all_non_reference_genomic_aligns};
    
    push @sorted_chains, {
      blocks => $bl_list,
      score  => $bl_list->[0]->score,
      qid    => $qga_l->dnafrag->name,
      qstart  => $qga_l->dnafrag_start,
      qend    => $qga_r->dnafrag_end,
      tid     => $tga_l->dnafrag->name,
      tstart  => $tga_l->dnafrag_strand > 0 ? $tga_l->dnafrag_start : $tga_r->dnafrag_start,
      tend    => $tga_l->dnafrag_strand > 0 ? $tga_r->dnafrag_end : $tga_l->dnafrag_end,
      tstrand => $tga_l->dnafrag_strand,
    };      

  }
  
  for(my $i=0; $i < @sorted_chains; $i++) {

    next if $to_ignore_chains[$i];

    my (@ov_chains, @block_ov_regs);

    my $c = $sorted_chains[$i];

    my @these_blocks = @{$c->{blocks}};

    foreach my $kc (@kept_chains) {
      if ($c->{qstart} <= $kc->{qend} and 
          $c->{qend} >= $kc->{qstart}) {

        push @ov_chains, $kc;
      }
    }

    for (my $i=0; $i < @these_blocks; $i++) {
      my $b = $these_blocks[$i];
      my $qga = $b->reference_genomic_align;

      $block_ov_regs[$i] = []; 

      KCHAIN: foreach my $kc (@ov_chains) {
        foreach my $kb (@{$kc->{blocks}}) {
          
          my $k_qga = $kb->reference_genomic_align;
          
          if ($qga->dnafrag_start <= $k_qga->dnafrag_end and
              $qga->dnafrag_end >= $k_qga->dnafrag_start) {

            my $ov_start = $k_qga->dnafrag_start;
            $ov_start = $qga->dnafrag_start 
                if $qga->dnafrag_start > $ov_start;
            my $ov_end = $k_qga->dnafrag_end;
            $ov_end = $qga->dnafrag_end 
                if $qga->dnafrag_end < $ov_end;

            push @{$block_ov_regs[$i]}, {
              start => $ov_start,
              end   => $ov_end,
            };
          }
        }
      }
    }

    my $rejected = 0;

    if (not grep { scalar(@{$_}) > 0 } @block_ov_regs) {
      # no overlapping blocks, so keep chain as is
push @kept_chains, $c; } else { # if the amount of overlap only constitutes a small proportion
# of the chain, keep the chain (with the overlapping blocks
# removed)
my $total_chain_len = 0; for(my $i=0; $i < @these_blocks; $i++) { my $b = $these_blocks[$i]; my $ga = $b->reference_genomic_align; $total_chain_len += $ga->dnafrag_end - $ga->dnafrag_start + 1; } # flatten the overlapping regions
for (my $i=0; $i < @block_ov_regs; $i++) { my @regs = @{$block_ov_regs[$i]}; my @new_regs; foreach my $reg (sort { $a->{start} <=> $b->{start} } @regs) { if (not @new_regs or $reg->{start} > $new_regs[-1]->{end}) { push @new_regs, $reg; } else { if ($reg->{end} > $new_regs[-1]->{end}) { $new_regs[-1]->{end} = $reg->{end}; } } } $block_ov_regs[$i] =\@ new_regs; } # we're only going to allow simple truncation of the chain
# to resolve the overlap, at one end, or both. If by
# truncating the chain in this way we lose too much of
# it, again we reject the whole thing.
# find the longest, contigous, non-overlapping stretch of chain
my @contig_chain_parts; my $last_was_non_ov = 0; for(my $i=0; $i < @these_blocks; $i++) { my $block = $these_blocks[$i]; my $qga = $block->reference_genomic_align; my $ovs = $block_ov_regs[$i]; if (@$ovs) { if ($ovs->[0]->{start} > $qga->dnafrag_start) { if ($last_was_non_ov) { push @{$contig_chain_parts[-1]}, { index => $i, start => $qga->dnafrag_start, end => $ovs->[0]->{start} - 1, }; } else { push @contig_chain_parts, [{ index => $i, start => $qga->dnafrag_start, end => $ovs->[0]->{start} - 1, }]; } } for(my $j=1; $j < @$ovs; $j++) { push @contig_chain_parts, [{ index => $i, start => $ovs->[$j-1]->{end} + 1, end => $ovs->[$j]->{start} - 1, }]; } if ($ovs->[-1]->{end} < $qga->dnafrag_end) { push @contig_chain_parts, [{ index => $i, start => $ovs->[-1]->{end} + 1, end => $qga->dnafrag_end, }]; $last_was_non_ov = 1; } else { $last_was_non_ov = 0; } } else { if ($last_was_non_ov) { push @{$contig_chain_parts[-1]}, { index => $i, start => $qga->dnafrag_start, end => $qga->dnafrag_end, }; } else { push @contig_chain_parts, [{ index => $i, start => $qga->dnafrag_start, end => $qga->dnafrag_end, }]; } $last_was_non_ov = 1; } } # find longest
my ($longest, $longest_len); if (not @contig_chain_parts) { $longest_len = 0; } else { foreach my $reg_list (@contig_chain_parts) { my $len = 0; foreach my $reg (@$reg_list) { $len += $reg->{end} - $reg->{start} + 1; } if (not defined $longest or $len > $longest_len) { $longest = $reg_list; $longest_len = $len; } } } # we've identifed a sub-region of the chain to keep.
# But is it worth keeping?
if ($longest_len / $total_chain_len > $overlap_chain_thresh) {
my
@bs;
foreach my $el (@$longest) { my $block = $these_blocks[$el->{index}]; my $st = $el->{start}; my $en = $el->{end}; my $qga = $block->reference_genomic_align; if ($st != $qga->dnafrag_start or $en != $qga->dnafrag_end) { my $new_block = $block->restrict_between_reference_positions($st, $en); $new_block->score($block->score); push @bs, $new_block; } else { push @bs, $block; } }; my $qga_l = $bs[0]->reference_genomic_align; my $qga_r = $bs[-1]->reference_genomic_align; my ($tga_l) = @{$bs[0]->get_all_non_reference_genomic_aligns}; my ($tga_r) = @{$bs[-1]->get_all_non_reference_genomic_aligns}; $c->{blocks} =\@ bs; $c->{qstart} = $qga_l->dnafrag_start; $c->{qend} = $qga_r->dnafrag_end; $c->{tstart} = ($tga_l->dnafrag_strand > 0) ? $tga_l->dnafrag_start : $tga_r->dnafrag_start; $c->{tend} = ($tga_l->dnafrag_strand > 0) ? $tga_r->dnafrag_end : $tga_l->dnafrag_end; push @kept_chains, $c; } else { $rejected = 1; } } if ($rejected) { # also reject all lower scoring chains of the same target
for(my $j=$i+1; $j < @sorted_chains; $j++) { if ($sorted_chains[$j]->{tid} eq $c->{tid}) { $to_ignore_chains[$j] = 1; } } } else { # also keep all lower scoring chains of the same target that
# are (a) consistent with this one in terms of coords and
# (b) do not interfere with any kept chains so far
for(my $j=$i+1; $j < @sorted_chains; $j++) { my $oc = $sorted_chains[$j]; my $consistent = 0; if ($oc->{tid} eq $c->{tid} and $oc->{tstrand} eq $c->{tstrand}) { if ($oc->{qstart} > $c->{qend}) { if ($oc->{tstrand} > 0 and $oc->{tstart} > $c->{tend} or $oc->{tstrand} < 0 and $oc->{tend} < $c->{tstart}) { $consistent = 1; } } elsif ($oc->{qend} < $c->{qstart}) { if ($oc->{tstrand} > 0 and $c->{tstart} > $oc->{tend} or $oc->{tstrand} < 0 and $c->{tend} < $oc->{tstart}) { $consistent = 1; } } } if ($consistent) { # any query overlap with other retained chains?
for(my $k=0; $k < @kept_chains; $k++) { if ($oc->{qstart} <= $kept_chains[$k]->{qend} and $oc->{qend} >= $kept_chains[$k]->{qstart}) { $consistent = 0; last; } } if ($consistent) { push @kept_chains, $oc; $to_ignore_chains[$j] = 1; } } } } } return [map { $_->{blocks} } @kept_chains]; } ###################################################################
# FUNCTION: flatten_chains
#
# Description:
# Takes a list of chains of GenomicAlignBlocks and computes
# and flattens it into a list of GenomicAlignBlocks
# NOTE: The method assumes that the input chains are non-
# overlapping in the reference sequence at the block level. If
# the '$check' flag is supplied, this is checked and an exception
# thrown if this is not the case
####################################################################
}
filter_irrelevant_chainsdescriptionprevnextTop
sub filter_irrelevant_chains {
  my ($chains, $feats) = @_;

  my @kept_chains;

  my @feats = sort { $a->start <=> $b->start } @$feats;

  foreach my $c (@$chains) {
    my $chain_overlaps_feat = 0;

    my $j=0;
    BLOCK: foreach my $b (@$c) {
      my $b_st = $b->reference_genomic_align->dnafrag_start;
      my $b_en = $b->reference_genomic_align->dnafrag_end;

      foreach my $this_f (@feats) {

        if ($this_f->end < $b_st) {
          next;
        } elsif ($this_f->start > $b_en) {
          last;
        } else {
          # block overlap
$chain_overlaps_feat = 1; last BLOCK; } } } if ($chain_overlaps_feat) { push @kept_chains, $c; } } return\@ kept_chains; } ###################################################################
# FUNCTION: filter_block_overlap_chains
#
# Decription:
#
# This function filters out chains from the first list that have
# block overlap with any of the given blocks
###################################################################
}
flatten_chainsdescriptionprevnextTop
sub flatten_chains {
  my ($chains, $check) = @_;

  my @blocks;
  foreach my $chain (@$chains) {
    push @blocks, @$chain;
  }
  
  @blocks = sort {
    $a->reference_genomic_align->dnafrag_start <=>
        $b->reference_genomic_align->dnafrag_start;
  } @blocks;

  if ($check) {
    for(my $i=1; $i < @blocks; $i++) {
      my $this = $blocks[$i];
      my $prev = $blocks[$i-1];

      if ($this->reference_genomic_align->dnafrag_start < 
          $prev->reference_genomic_align->dnafrag_end) {
        throw("Error when calculating simple net; overlapping blocks");
      }
    }
  }

  return\@ blocks;
}
stringify_chainsdescriptionprevnextTop
sub stringify_chains {
  my ($chains) = @_;

  my $str = "";

  foreach my $c (@$chains) {
    $str .= sprintf "CHAIN %d\n", $c->[0]->score;
    foreach my $b (@$c) {
      my $q = $b->reference_genomic_align;
      my ($t) = @{$b->get_all_non_reference_genomic_aligns};
      $str .= sprintf(" %s/%d-%d, %s/%d-%d %d (%d)\n", 
                      $q->dnafrag->name, 
                      $q->dnafrag_start, 
                      $q->dnafrag_end, 
                      $t->dnafrag->name, 
                      $t->dnafrag_start, 
                      $t->dnafrag_end, 
                      $t->dnafrag_strand, 
                      $t->level_id);
    }
  }


  return $str;
}



1;
}
General documentation
No general documentation available.