Raw content of Bio::EnsEMBL::Analysis::Runnable::AlignmentNets
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Pipeline::Runnable::AlignmentNets
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 CONTACT
Describe contact details here
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
# Let the code begin...
package Bio::EnsEMBL::Analysis::Runnable::AlignmentNets;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::DnaDnaAlignFeature;
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($chains,
$chains_sorted,
$query_lengths,
$target_lengths,
$chain_net,
$net_syntenic,
$net_filter,
$filter_non_syntenic,
$min_chain_score
) = rearrange([qw(
CHAINS
CHAINS_SORTED
QUERY_LENGTHS
TARGET_LENGTHS
CHAINNET
NETSYNTENIC
NETFILTER
FILTER_NON_SYNTENIC
MIN_CHAIN_SCORE
)],
@args);
throw("You must supply a ref to array of alignment chains")
if not defined $chains;
throw("You must supply a reference to an hash of query seq. lengths")
if not defined $query_lengths;
throw("You must supply a reference to an hash of query seq. lengths")
if not defined $target_lengths;
throw("You must supply the chainNet executable")
if not defined $chain_net;
if (defined $filter_non_syntenic) {
throw("You must supply the netSyntenic executable when doing synteny filtering")
if not defined $net_syntenic;
throw("You must supply the netFilter executable when doing synteny filtering")
if not defined $net_filter;
}
$self->query_length_hash($query_lengths);
$self->target_length_hash($target_lengths);
$self->chainNet($chain_net);
$self->filter_non_syntenic($filter_non_syntenic) if defined $filter_non_syntenic;
$self->netSyntenic($net_syntenic) if defined $net_syntenic;
$self->netFilter($net_filter) if defined $net_filter;
$self->min_chain_score($min_chain_score) if defined $min_chain_score;
$self->chains($chains);
$self->chains_sorted(defined $chains_sorted ? $chains_sorted : 0 );
return $self;
}
=head2 run
Title : run
Usage : $self->run()
Function:
Returns : none
Args :
=cut
sub run {
my ($self) = @_;
my $res_chains;
my ($query_name) = keys %{$self->query_length_hash};
my $work_dir = $self->workdir . "/$query_name.$$.ChainNet";
while (-e $work_dir) {
$work_dir .= ".$$";
}
my $chain_file = "$work_dir/$query_name.chain";
my $query_length_file = "$work_dir/$query_name.query.lengths";
my $target_length_file = "$work_dir/$query_name.target.lengths";
my $query_net_file = "$work_dir/$query_name.query.net";
my $target_net_file = "$work_dir/$query_name.target.net";
my $fh;
mkdir $work_dir;
##############################
# write the seq length files
##############################
foreach my $el ([$query_length_file, $self->query_length_hash],
[$target_length_file, $self->target_length_hash]) {
my ($file, $hash) = @$el;
open $fh, ">$file" or
throw("Could not open seq length file '$file' for writing");
foreach my $k (keys %{$hash}) {
print $fh $k, "\t", $hash->{$k}, "\n";
}
close($fh);
}
##############################
# sort chains, if necessary
##############################
if (not $self->chains_sorted) {
for(my $i=0; $i < @{$self->chains}; $i++) {
if ($self->chains->[$i]->[0]->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
$self->chains->[$i] = [
sort {
$a->reference_genomic_align->dnafrag_start <=>
$a->reference_genomic_align->dnafrag_end
} @{$self->chains->[$i]}
];
} else {
$self->chains->[$i] = [
sort { $a->start <=> $b->start } @{$self->chains->[$i]}
];
}
}
}
##############################
# write chains
##############################
open $fh, ">$chain_file" or
throw("could not open chain file '$chain_file' for writing\n");
$self->write_chains($fh);
close($fh);
##################################
# Get the Nets from chainNet
##################################
my @arg_list;
if (defined $self->min_chain_score) {
@arg_list = ("-minScore=" . $self->min_chain_score);
}
push @arg_list, ($chain_file,
$query_length_file,
$target_length_file,
$query_net_file,
$target_net_file);
system($self->chainNet, @arg_list)
and throw("Something went wrong with chainNet");
##################################
# Apply the synteny filter if requested
##################################
if ($self->filter_non_syntenic) {
my $syntenic_net_file = "$work_dir/$query_name.query.synteny_annotated.net";
my $filtered_net_file = "$work_dir/$query_name.query.synteny.net";
system($self->netSyntenic, $query_net_file, $syntenic_net_file)
and throw("Something went wrong with netSyntenic");
open(FILTER, $self->netFilter . " -syn $syntenic_net_file |") or
throw("Could not run netFilter");
open(FILTERED,">$filtered_net_file")
or throw("Could not open filtered net file for writing");
while() {
print FILTERED $_;
}
close(FILTERED);
close(FILTER) or throw("Something went wrong with netFilter");
unlink $syntenic_net_file;
unlink $query_net_file;
$query_net_file = $filtered_net_file;
}
open $fh, $query_net_file or
throw("Could not open net file '$query_net_file' for reading\n");
$res_chains = $self->parse_Net_file($fh);
close($fh);
unlink $chain_file, $query_length_file, $target_length_file, $query_net_file, $target_net_file;
rmdir $work_dir;
$self->output($res_chains);
return 1;
}
#####################################################
sub write_chains {
my ($self, $fh) = @_;
# in the absence of a chain score, we will take the score of the
# first block in the chain to be the score
for(my $chain_id=1; $chain_id <= @{$self->chains}; $chain_id++) {
my $chain = $self->chains->[$chain_id-1];
my (@ungapped_features,
$chain_score,
$query_name,
$query_strand,
$target_name,
$target_strand);
foreach my $gf (@$chain) {
if (not defined $query_name) {
# all members of the chain must come from the same
# query and target, and be on the same strand on those
# sequences, otherwise all bets are off
if ($gf->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
my $qga = $gf->reference_genomic_align;
my ($tga) = @{$gf->get_all_non_reference_genomic_aligns};
$query_name = $qga->dnafrag->name,
$target_name = $tga->dnafrag->name,
$query_strand = $qga->dnafrag_strand;
$target_strand = $tga->dnafrag_strand;
} else {
# assume Bio::EnsEMBL::DnaDnaAlignFeature
$query_name = $gf->seqname;
$target_name = $gf->hseqname,
$query_strand = $gf->strand;
$target_strand = $gf->hstrand;
}
# the chain must be written with respect to the forward strand
# of the query. Since we are dealing with the ungapped blocks below,
# this can be achieved by swapping the strands if the query is reverse.
if ($query_strand == -1) {
$query_strand *= -1;
$target_strand *= -1;
}
}
if (not defined $chain_score or $chain_score < $gf->score) {
$chain_score = $gf->score;
}
if ($gf->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
foreach my $uf (@{$gf->get_all_ungapped_GenomicAlignBlocks}) {
my $qga = $uf->reference_genomic_align;
my ($tga) = @{$uf->get_all_non_reference_genomic_aligns};
my $sens_f = {
q_start => $qga->dnafrag_start,
q_end => $qga->dnafrag_end,
t_start => $tga->dnafrag_start,
t_end => $tga->dnafrag_end,
len => $qga->dnafrag_end - $qga->dnafrag_start + 1,
};
if ($target_strand == -1) {
$sens_f->{t_start} = $self->target_length_hash->{$tga->dnafrag->name} - $tga->dnafrag_end + 1;
$sens_f->{t_end} = $self->target_length_hash->{$tga->dnafrag->name} - $tga->dnafrag_start + 1;
}
push @ungapped_features, $sens_f;
}
} else {
foreach my $uf ($gf->ungapped_features) {
my $sens_f = {
q_start => $uf->start,
q_end => $uf->end,
t_start => $uf->hstart,
t_end => $uf->hend,
len => $uf->end - $uf->start + 1,
};
if ($target_strand == -1) {
$sens_f->{t_start} = $self->target_length_hash->{$uf->hseqname} - $uf->hend + 1;
$sens_f->{t_end} = $self->target_length_hash->{$uf->hseqname} - $uf->hstart + 1;
}
push @ungapped_features, $sens_f;
}
}
}
@ungapped_features = sort {$a->{q_start} <=> $b->{q_start}} @ungapped_features;
# write chain header here
# printf($fh "chain %d %s %d %s %d %d %s %d %s %s %s %d\n",
print $fh join(" ",("chain",
$chain_score,
$query_name,
$self->query_length_hash->{$query_name},
$query_strand == -1 ? "-" : "+",
$ungapped_features[0]->{q_start} - 1,
$ungapped_features[-1]->{q_end},
$target_name,
$self->target_length_hash->{$target_name},
$target_strand == -1 ? "-" : "+",
$ungapped_features[0]->{t_start} - 1,
$ungapped_features[-1]->{t_end},
$chain_id)), "\n";
for (my $i = 0; $i < @ungapped_features; $i++) {
my $f = $ungapped_features[$i];
print $fh $f->{len};
if ($i == @ungapped_features - 1) {
print $fh "\n";
} else {
my $next_f = $ungapped_features[$i+1];
my $q_gap = $next_f->{q_start} - $f->{q_end} - 1;
my $t_gap = $next_f->{t_start} - $f->{t_end} - 1;
print $fh "\t$q_gap\t$t_gap\n";
}
}
print $fh "\n";
}
}
sub parse_Net_file {
my ($self, $fh) = @_;
my (%new_chains, %new_chain_scores, @last_gap, @last_parent_chain);
while(<$fh>) {
/(\s+)fill\s+(\d+)\s+(\d+)\s+\S+\s+\S+\s+\d+\s+\d+\s+(.+)$/ and do {
my $indent = length($1) - 1;
my $level_id = int( $indent / 2 ) + 1;
my $q_start = $2 + 1;
my $q_end = $q_start + $3 - 1;
my $rest = $4;
my ($score) = $rest =~ /score\s+(\d+)/;
my ($chain_id) = $rest =~ /id\s+(\d+)/;
$new_chain_scores{$chain_id} += $score;
my ($restricted_fps)
= $self->restrict_between_positions($self->chains->[$chain_id-1],
$q_start,
$q_end);
foreach my $fp (@$restricted_fps) {
$fp->score($score);
if ($fp->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
foreach my $align (@{$fp->genomic_align_array}) {
$align->level_id($level_id);
}
} else {
$fp->level_id($level_id);
}
}
if (@$restricted_fps) {
push @{$new_chains{$chain_id}}, @$restricted_fps;
if ($indent > 0) {
# the new alignment has been inserted into the parent gap
# need to split parent chain into two:
# begin -> $insert_start - 1,
# $insert_end + 1 -> end
my $parent_chain = $new_chains{$last_parent_chain[$indent - 2]};
my ($insert_start, $insert_end) = ($last_gap[$indent - 1]->[0],
$last_gap[$indent - 1]->[1]);
my ($left_start, $right_end);
if ($parent_chain->[0]->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
$left_start = $parent_chain->[0]->reference_genomic_align->dnafrag_start;
} else {
$left_start = $parent_chain->[0]->start;
}
if ($parent_chain->[-1]->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
$right_end = $parent_chain->[-1]->reference_genomic_align->dnafrag_end;
} else {
$right_end = $parent_chain->[-1]->end;
}
my ($chain1) = $self->restrict_between_positions($parent_chain,
$left_start,
$insert_start - 1);
my ($chain2) = $self->restrict_between_positions($parent_chain,
$insert_end + 1,
$right_end);
$new_chains{$last_parent_chain[$indent - 2]} = [@$chain1, @$chain2];
}
$last_parent_chain[$indent] = $chain_id;
}
};
/^(\s+)gap\s+(\d+)\s+(\d+)/ and do {
my $indent = length($1) - 1;
my $q_insert_start = $2 + 1;
my $q_insert_end = $q_insert_start + $3 - 1;
$last_gap[$indent] = [$q_insert_start, $q_insert_end];
};
}
foreach my $cid (keys %new_chains) {
my $chain_score = $new_chain_scores{$cid};
foreach my $fp (@{$new_chains{$cid}}) {
$fp->score($chain_score);
}
}
return [values %new_chains];
}
sub restrict_between_positions {
my ($self, $chain, $q_start, $q_end) = @_;
#my @blocks = @$chain;
my @new_chain;
#my $chain_left = [];
#my $chain_right = [];
my $first_idx = $self->_bin_search_start($chain, $q_start);
my $last_idx = $self->_bin_search_end($chain, $q_end);
if ($first_idx < 0) {
# all blocks to left of range
#$chain_left = $chain;
#$chain_right = [];
} elsif ($last_idx < 0) {
# all blocks to right of range
#$chain_left = [];
#$chain_right = $chain;
} else {
if ($first_idx > 0) {
#@$chain_left = @{@$chain}[0..$first_idx-1];
}
if ($last_idx < scalar(@$chain) - 1) {
#@$chain_right = @{@$chain}[$last_idx+1..scalar(@$chain)-1];
}
if ($first_idx <= $last_idx) {
@new_chain = @{@$chain}[$first_idx..$last_idx];
}
# may be necessary to cut the boundary blocks
if (@new_chain) {
my $block = shift @new_chain;
my $b_start = $block->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")
? $block->reference_genomic_align->dnafrag_start
: $block->start;
my $b_end = $block->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")
? $block->reference_genomic_align->dnafrag_end
: $block->end;
if ($b_start < $q_start) {
# need to cut the block;
my $inside;
# my $outside
if ($block->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
#$outside = $block->restrict_between_reference_positions($b_start, $q_start - 1);
$inside = $block->restrict_between_reference_positions($q_start, $b_end);
} else {
#$outside = $block->restrict_between_positions($b_start, $q_start - 1, "SEQ");
$inside = $block->restrict_between_positions($q_start, $b_end, "SEQ");
}
#push @$chain_left, $outside;
unshift @new_chain, $inside;
} else {
unshift @new_chain, $block;
}
$block = pop @new_chain;
$b_start = $block->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")
? $block->reference_genomic_align->dnafrag_start
: $block->start;
$b_end = $block->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")
? $block->reference_genomic_align->dnafrag_end
: $block->end;
if ($b_end > $q_end) {
# need to cut the block;
my $inside;
#my $outside;
if ($block->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
$inside = $block->restrict_between_reference_positions($b_start, $q_end);
#$outside = $block->restrict_between_reference_positions($q_end + 1, $b_end);
} else {
$inside = $block->restrict_between_positions($b_start, $q_end, "SEQ");
#$outside = $block->restrict_between_positions($q_end + 1, $b_end, "SEQ");
}
#unshift @$chain_right, $outside;
push @new_chain, $inside;
} else {
push @new_chain, $block;
}
}
}
#return (\@new_chain,
# $chain_left,
# $chain_right);
return (\@new_chain);
}
sub _bin_search_start {
my ($self, $blocks, $position) = @_;
# find index of left-most block that ends to the right of $position
my ($left, $right) = (0, scalar(@$blocks));
while ($right - $left > 0) {
my $mid = int(($left + $right) / 2);
my ($block_end, $block_start);
if ($blocks->[0]->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
$block_end = $blocks->[$mid]->reference_genomic_align->dnafrag_end;
} else {
$block_end = $blocks->[$mid]->end;
}
if ($block_end < $position) {
$left = $mid + 1;
} else {
$right = $mid;
}
}
if ($right > scalar(@$blocks)-1) {
return -1;
} else {
return $right;
}
}
sub _bin_search_end {
my ($self, $blocks, $position) = @_;
# find index of rightmost block that starts before $position
my ($left, $right) = (-1, scalar(@$blocks)-1);
while ($right - $left > 0) {
my $mid = int(($left + $right + 1) / 2);
my ($block_end, $block_start);
if ($blocks->[0]->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
$block_start = $blocks->[$mid]->reference_genomic_align->dnafrag_start;
} else {
$block_start = $blocks->[$mid]->start;
}
if ($block_start > $position) {
$right = $mid - 1;
} else {
$left = $mid;
}
}
# returns -1 if all blocks to right of $position
return $right;
}
#####################
# instance vars
#####################
sub query_length_hash {
my ($self, $val) = @_;
if (defined $val) {
$self->{_query_lengths_hashref} = $val;
}
return $self->{_query_lengths_hashref};
}
sub target_length_hash {
my ($self, $hash_ref) = @_;
if (defined $hash_ref) {
$self->{_target_lengths_hashref} = $hash_ref;
}
return $self->{_target_lengths_hashref};
}
sub chains {
my ($self, $val) = @_;
if (defined $val) {
$self->{_chains} = $val;
}
return $self->{_chains};
}
sub chains_sorted {
my ($self, $val) = @_;
if (defined $val) {
$self->{_chains_sorted} = $val;
}
return $self->{_chains_sorted};
}
sub filter_non_syntenic {
my ($self, $val) = @_;
if (defined $val) {
$self->{_filter_non_syntenic} = $val;
}
return $self->{_filter_non_syntenic};
}
sub min_chain_score {
my ($self, $val) = @_;
if (defined $val) {
$self->{_min_chain_score} = $val;
}
if (not exists $self->{_min_chain_score}) {
return undef;
} else {
return $self->{_min_chain_score};
}
}
##############
#### programs
##############
sub chainNet {
my ($self,$arg) = @_;
if (defined($arg)) {
$self->{_chainNet} = $arg;
}
return $self->{_chainNet};
}
sub netSyntenic {
my ($self,$arg) = @_;
if (defined($arg)) {
$self->{_netSyntenic} = $arg;
}
return $self->{_netSyntenic};
}
sub netFilter {
my ($self,$arg) = @_;
if (defined($arg)) {
$self->{_netFilter} = $arg;
}
return $self->{_netFilter};
}
1;