Raw content of Bio::EnsEMBL::Analysis::Tools::WGA2Genes::ChainUtils =head1 NAME =head1 SYNOPSIS =head1 DESCRIPTION =cut package Bio::EnsEMBL::Analysis::Tools::WGA2Genes::ChainUtils; use strict; use Exporter; use vars qw (@ISA @EXPORT); use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning); @ISA = qw(Exporter); @EXPORT = qw(filter_irrelevant_chains filter_inconsistent_chains filter_block_overlap_chains flatten_chains stringify_chains ); ################################################################### # FUNCTION: filter_irrelevant_chains # # Decription: # This function returns the chains that have a block overlap # with at least one feature in the given list (thus removing # "irrelevant" chains that do not interact with exons) ################################################################### sub filter_irrelevant_chains { my ($chains, $feats) = @_; my @kept_chains; my @feats = sort { $a->start <=> $b->start } @$feats; foreach my $c (@$chains) { my $chain_overlaps_feat = 0; my $j=0; BLOCK: foreach my $b (@$c) { my $b_st = $b->reference_genomic_align->dnafrag_start; my $b_en = $b->reference_genomic_align->dnafrag_end; foreach my $this_f (@feats) { if ($this_f->end < $b_st) { next; } elsif ($this_f->start > $b_en) { last; } else { # block overlap $chain_overlaps_feat = 1; last BLOCK; } } } if ($chain_overlaps_feat) { push @kept_chains, $c; } } return \@kept_chains; } ################################################################### # FUNCTION: filter_block_overlap_chains # # Decription: # # This function filters out chains from the first list that have # block overlap with any of the given blocks ################################################################### sub filter_block_overlap_chains { my ($raw_chains, $given_blocks) = @_; my @chains_to_process; foreach my $chain (@$raw_chains) { # if this chain conatains a block that overlaps with any of the # excluded regions on the target, ignore it. my $keep_chain = 1; CHAIN: foreach my $block (@$chain) { my ($tga) = @{$block->get_all_non_reference_genomic_aligns}; foreach my $ex_block (@$given_blocks) { my ($ex_tga) = @{$ex_block->get_all_non_reference_genomic_aligns}; if ($tga->dnafrag->name eq $ex_tga->dnafrag->name and $tga->dnafrag_end >= $ex_tga->dnafrag_start and $tga->dnafrag_start <= $ex_tga->dnafrag_end) { $keep_chain = 0; last CHAIN; } } } if ($keep_chain) { push @chains_to_process, $chain; } } return \@chains_to_process; } ################################################################### # FUNCTION: filter_inconsistent_chains # # Decription: # # Progressively projects chains on to the query sequence (in order # of score), rejecting chains that "interfere" with the set of # chains retained so far. Interference is defined as an overlap # in the query at the block level ################################################################### sub filter_inconsistent_chains { my ($input_chains, $overlap_chain_thresh) = @_; my (@sorted_chains, @to_ignore_chains, @kept_chains); foreach my $bl_list (@$input_chains) { my $qga_l = $bl_list->[0]->reference_genomic_align; my $qga_r = $bl_list->[-1]->reference_genomic_align; my ($tga_l) = @{$bl_list->[0]->get_all_non_reference_genomic_aligns}; my ($tga_r) = @{$bl_list->[-1]->get_all_non_reference_genomic_aligns}; push @sorted_chains, { blocks => $bl_list, score => $bl_list->[0]->score, qid => $qga_l->dnafrag->name, qstart => $qga_l->dnafrag_start, qend => $qga_r->dnafrag_end, tid => $tga_l->dnafrag->name, tstart => $tga_l->dnafrag_strand > 0 ? $tga_l->dnafrag_start : $tga_r->dnafrag_start, tend => $tga_l->dnafrag_strand > 0 ? $tga_r->dnafrag_end : $tga_l->dnafrag_end, tstrand => $tga_l->dnafrag_strand, }; } for(my $i=0; $i < @sorted_chains; $i++) { next if $to_ignore_chains[$i]; my (@ov_chains, @block_ov_regs); my $c = $sorted_chains[$i]; my @these_blocks = @{$c->{blocks}}; foreach my $kc (@kept_chains) { if ($c->{qstart} <= $kc->{qend} and $c->{qend} >= $kc->{qstart}) { push @ov_chains, $kc; } } for (my $i=0; $i < @these_blocks; $i++) { my $b = $these_blocks[$i]; my $qga = $b->reference_genomic_align; $block_ov_regs[$i] = []; KCHAIN: foreach my $kc (@ov_chains) { foreach my $kb (@{$kc->{blocks}}) { my $k_qga = $kb->reference_genomic_align; if ($qga->dnafrag_start <= $k_qga->dnafrag_end and $qga->dnafrag_end >= $k_qga->dnafrag_start) { my $ov_start = $k_qga->dnafrag_start; $ov_start = $qga->dnafrag_start if $qga->dnafrag_start > $ov_start; my $ov_end = $k_qga->dnafrag_end; $ov_end = $qga->dnafrag_end if $qga->dnafrag_end < $ov_end; push @{$block_ov_regs[$i]}, { start => $ov_start, end => $ov_end, }; } } } } my $rejected = 0; if (not grep { scalar(@{$_}) > 0 } @block_ov_regs) { # no overlapping blocks, so keep chain as is push @kept_chains, $c; } else { # if the amount of overlap only constitutes a small proportion # of the chain, keep the chain (with the overlapping blocks # removed) my $total_chain_len = 0; for(my $i=0; $i < @these_blocks; $i++) { my $b = $these_blocks[$i]; my $ga = $b->reference_genomic_align; $total_chain_len += $ga->dnafrag_end - $ga->dnafrag_start + 1; } # flatten the overlapping regions for (my $i=0; $i < @block_ov_regs; $i++) { my @regs = @{$block_ov_regs[$i]}; my @new_regs; foreach my $reg (sort { $a->{start} <=> $b->{start} } @regs) { if (not @new_regs or $reg->{start} > $new_regs[-1]->{end}) { push @new_regs, $reg; } else { if ($reg->{end} > $new_regs[-1]->{end}) { $new_regs[-1]->{end} = $reg->{end}; } } } $block_ov_regs[$i] = \@new_regs; } # we're only going to allow simple truncation of the chain # to resolve the overlap, at one end, or both. If by # truncating the chain in this way we lose too much of # it, again we reject the whole thing. # find the longest, contigous, non-overlapping stretch of chain my @contig_chain_parts; my $last_was_non_ov = 0; for(my $i=0; $i < @these_blocks; $i++) { my $block = $these_blocks[$i]; my $qga = $block->reference_genomic_align; my $ovs = $block_ov_regs[$i]; if (@$ovs) { if ($ovs->[0]->{start} > $qga->dnafrag_start) { if ($last_was_non_ov) { push @{$contig_chain_parts[-1]}, { index => $i, start => $qga->dnafrag_start, end => $ovs->[0]->{start} - 1, }; } else { push @contig_chain_parts, [{ index => $i, start => $qga->dnafrag_start, end => $ovs->[0]->{start} - 1, }]; } } for(my $j=1; $j < @$ovs; $j++) { push @contig_chain_parts, [{ index => $i, start => $ovs->[$j-1]->{end} + 1, end => $ovs->[$j]->{start} - 1, }]; } if ($ovs->[-1]->{end} < $qga->dnafrag_end) { push @contig_chain_parts, [{ index => $i, start => $ovs->[-1]->{end} + 1, end => $qga->dnafrag_end, }]; $last_was_non_ov = 1; } else { $last_was_non_ov = 0; } } else { if ($last_was_non_ov) { push @{$contig_chain_parts[-1]}, { index => $i, start => $qga->dnafrag_start, end => $qga->dnafrag_end, }; } else { push @contig_chain_parts, [{ index => $i, start => $qga->dnafrag_start, end => $qga->dnafrag_end, }]; } $last_was_non_ov = 1; } } # find longest my ($longest, $longest_len); if (not @contig_chain_parts) { $longest_len = 0; } else { foreach my $reg_list (@contig_chain_parts) { my $len = 0; foreach my $reg (@$reg_list) { $len += $reg->{end} - $reg->{start} + 1; } if (not defined $longest or $len > $longest_len) { $longest = $reg_list; $longest_len = $len; } } } # we've identifed a sub-region of the chain to keep. # But is it worth keeping? if ($longest_len / $total_chain_len > $overlap_chain_thresh) { my @bs; foreach my $el (@$longest) { my $block = $these_blocks[$el->{index}]; my $st = $el->{start}; my $en = $el->{end}; my $qga = $block->reference_genomic_align; if ($st != $qga->dnafrag_start or $en != $qga->dnafrag_end) { my $new_block = $block->restrict_between_reference_positions($st, $en); $new_block->score($block->score); push @bs, $new_block; } else { push @bs, $block; } }; my $qga_l = $bs[0]->reference_genomic_align; my $qga_r = $bs[-1]->reference_genomic_align; my ($tga_l) = @{$bs[0]->get_all_non_reference_genomic_aligns}; my ($tga_r) = @{$bs[-1]->get_all_non_reference_genomic_aligns}; $c->{blocks} = \@bs; $c->{qstart} = $qga_l->dnafrag_start; $c->{qend} = $qga_r->dnafrag_end; $c->{tstart} = ($tga_l->dnafrag_strand > 0) ? $tga_l->dnafrag_start : $tga_r->dnafrag_start; $c->{tend} = ($tga_l->dnafrag_strand > 0) ? $tga_r->dnafrag_end : $tga_l->dnafrag_end; push @kept_chains, $c; } else { $rejected = 1; } } if ($rejected) { # also reject all lower scoring chains of the same target for(my $j=$i+1; $j < @sorted_chains; $j++) { if ($sorted_chains[$j]->{tid} eq $c->{tid}) { $to_ignore_chains[$j] = 1; } } } else { # also keep all lower scoring chains of the same target that # are (a) consistent with this one in terms of coords and # (b) do not interfere with any kept chains so far for(my $j=$i+1; $j < @sorted_chains; $j++) { my $oc = $sorted_chains[$j]; my $consistent = 0; if ($oc->{tid} eq $c->{tid} and $oc->{tstrand} eq $c->{tstrand}) { if ($oc->{qstart} > $c->{qend}) { if ($oc->{tstrand} > 0 and $oc->{tstart} > $c->{tend} or $oc->{tstrand} < 0 and $oc->{tend} < $c->{tstart}) { $consistent = 1; } } elsif ($oc->{qend} < $c->{qstart}) { if ($oc->{tstrand} > 0 and $c->{tstart} > $oc->{tend} or $oc->{tstrand} < 0 and $c->{tend} < $oc->{tstart}) { $consistent = 1; } } } if ($consistent) { # any query overlap with other retained chains? for(my $k=0; $k < @kept_chains; $k++) { if ($oc->{qstart} <= $kept_chains[$k]->{qend} and $oc->{qend} >= $kept_chains[$k]->{qstart}) { $consistent = 0; last; } } if ($consistent) { push @kept_chains, $oc; $to_ignore_chains[$j] = 1; } } } } } return [map { $_->{blocks} } @kept_chains]; } ################################################################### # FUNCTION: flatten_chains # # Description: # Takes a list of chains of GenomicAlignBlocks and computes # and flattens it into a list of GenomicAlignBlocks # NOTE: The method assumes that the input chains are non- # overlapping in the reference sequence at the block level. If # the '$check' flag is supplied, this is checked and an exception # thrown if this is not the case #################################################################### sub flatten_chains { my ($chains, $check) = @_; my @blocks; foreach my $chain (@$chains) { push @blocks, @$chain; } @blocks = sort { $a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start; } @blocks; if ($check) { for(my $i=1; $i < @blocks; $i++) { my $this = $blocks[$i]; my $prev = $blocks[$i-1]; if ($this->reference_genomic_align->dnafrag_start < $prev->reference_genomic_align->dnafrag_end) { throw("Error when calculating simple net; overlapping blocks"); } } } return \@blocks; } sub stringify_chains { my ($chains) = @_; my $str = ""; foreach my $c (@$chains) { $str .= sprintf "CHAIN %d\n", $c->[0]->score; foreach my $b (@$c) { my $q = $b->reference_genomic_align; my ($t) = @{$b->get_all_non_reference_genomic_aligns}; $str .= sprintf(" %s/%d-%d, %s/%d-%d %d (%d)\n", $q->dnafrag->name, $q->dnafrag_start, $q->dnafrag_end, $t->dnafrag->name, $t->dnafrag_start, $t->dnafrag_end, $t->dnafrag_strand, $t->level_id); } } return $str; } 1;