Raw content of Bio::EnsEMBL::Analysis::RunnableDB::AlignmentNets
# Cared for by Ensembl
#
# Copyright GRL & EBI
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::AlignmentNets
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::DBAdaptor->new($locator);
my $genscan = Bio::EnsEMBL::Analysis::RunnableDB::AlignmentNets->new (
-db => $db,
-input_id => $input_id
-analysis => $analysis );
$genscan->fetch_input();
$genscan->run();
$genscan->write_output(); #writes to DB
=head1 DESCRIPTION
Given an compara MethodLinkSpeciesSet identifer, and a reference genomic
slice identifer, fetches the GenomicAlignBlocks from the given compara
database, infers chains from the group identifiers, and then forms
an alignment net from the chains and writes the result
back to the database.
This module (at least for now) relies heavily on Jim Kent\'s Axt tools.
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::AlignmentNets;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Analysis::Config::General;
use Bio::EnsEMBL::Analysis::RunnableDB::AlignmentFilter;
use Bio::EnsEMBL::Analysis::Config::AlignmentFilter;
use Bio::EnsEMBL::Analysis::Runnable::AlignmentNets;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::DnaDnaAlignFeature;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::AlignmentFilter);
############################################################
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
$self->read_and_check_config($NET_CONFIG_BY_LOGIC);
return $self;
}
############################################################
sub fetch_input {
my( $self) = @_;
$self->throw("No input id") unless defined($self->input_id);
my ($seq_name, $seq_start, $seq_end);
if ($self->input_id =~ /^([^:]+):(\d+):(\d+)$/) {
($seq_name, $seq_start, $seq_end) = ($1, $2, $3);
} elsif ($self->input_id =~ /(\S+)/) {
$seq_name = $1;
} else {
throw("Input id could not be parsed: ", $self->input_id);
}
if ($self->QUERY_CORE_DB) {
my $q_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->new(%{$self->QUERY_CORE_DB});
my $sl = $q_dbh->get_SliceAdaptor->fetch_by_region('toplevel',
$seq_name);
my @segs = @{$sl->project('seqlevel')};
$self->query_seq_level_projection(\@segs);
#foreach my $seg (@segs) {
# printf("FROM_START %d FROM_END %d TO_SLICE %s TO_START %d TO_END %d\n",
# $seg->from_start, $seg->from_end, $seg->to_Slice->seq_region_name,
# $seg->to_Slice->start, $seg->to_Slice->end);
#}
}
my $compara_dbh = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(%{$self->COMPARA_DB});
my $query_species = $self->QUERY_SPECIES;
my $target_species = $self->TARGET_SPECIES;
my $q_gdb = $compara_dbh->get_GenomeDBAdaptor->fetch_by_name_assembly($query_species);
my $t_gdb = $compara_dbh->get_GenomeDBAdaptor->fetch_by_name_assembly($target_species);
throw("Could not get GenomeDB for '$query_species'") if not defined $q_gdb;
throw("Could not get GenomeDB for '$target_species'") if not defined $t_gdb;
################################################################
# get the compara data: MethodLinkSpeciesSet, reference DnaFrag,
# and GenomicAlignBlocks
################################################################
my $mlss = $compara_dbh->get_MethodLinkSpeciesSetAdaptor
->fetch_by_method_link_type_GenomeDBs($self->INPUT_METHOD_LINK_TYPE,
[$q_gdb, $t_gdb]);
throw("No MethodLinkSpeciesSet for :\n" .
$self->INPUT_METHOD_LINK_TYPE . "\n" .
$query_species . "\n" .
$target_species)
if not $mlss;
my $out_mlss = $compara_dbh->get_MethodLinkSpeciesSetAdaptor
->fetch_by_method_link_type_GenomeDBs($self->OUTPUT_METHOD_LINK_TYPE,
[$q_gdb, $t_gdb]);
throw("No MethodLinkSpeciesSet for :\n" .
$self->OUTPUT_METHOD_LINK_TYPE . "\n" .
$query_species . "\n" .
$target_species)
if not $out_mlss;
######## needed for output####################
$self->output_MethodLinkSpeciesSet($out_mlss);
my $ref_dnafrag = $compara_dbh->get_DnaFragAdaptor->fetch_by_GenomeDB_and_name($q_gdb,
$seq_name);
my $gen_al_blocks = $compara_dbh->get_GenomicAlignBlockAdaptor
->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss, $ref_dnafrag, $seq_start, $seq_end);
###################################################################
# get the target slices and bin the GenomicAlignBlocks by group id
###################################################################
my (%features_by_group);
foreach my $block (@$gen_al_blocks) {
my ($qy_al) = $block->reference_genomic_align;
my ($tg_al) = @{$block->get_all_non_reference_genomic_aligns};
if (not exists($self->query_DnaFrag_hash->{$qy_al->dnafrag->name})) {
######### needed for output ######################################
$self->query_DnaFrag_hash->{$qy_al->dnafrag->name} = $qy_al->dnafrag;
}
if (not exists($self->target_DnaFrag_hash->{$tg_al->dnafrag->name})) {
######### needed for output #######################################
$self->target_DnaFrag_hash->{$tg_al->dnafrag->name} = $tg_al->dnafrag;
}
my $daf_cigar = $self->daf_cigar_from_compara_cigars($qy_al->cigar_line,
$tg_al->cigar_line);
my $daf = Bio::EnsEMBL::DnaDnaAlignFeature->new
(-seqname => $qy_al->dnafrag->name,
-start => $qy_al->dnafrag_start,
-end => $qy_al->dnafrag_end,
-strand => $qy_al->dnafrag_strand,
-hseqname => $tg_al->dnafrag->name,
-hstart => $tg_al->dnafrag_start,
-hend => $tg_al->dnafrag_end,
-hstrand => $tg_al->dnafrag_strand,
-score => $block->score,
-cigar_string => $daf_cigar,
-group_id => $block->group_id);
push @{$features_by_group{$group_id}}, $daf;
}
$self->chains($self->sort_chains_by_max_block_score([values %features_by_group]));
if ($self->METHOD eq 'STANDARD' or $self->METHOD eq 'SYNTENIC') {
my $run = $self->make_runnable($self->METHOD eq 'SYNTENIC');
$self->runnable($run);
}
}
############################################################
sub run{
my ($self) = @_;
if (@{$self->runnable}) {
$self->SUPER::run;
} else {
my $filtered_chains;
if ($self->METHOD eq 'SIMPLE_HIGH') {
$filtered_chains = $self->calculate_simple_high_net($self->chains);
} elsif ($self->METHOD eq 'SIMPLE_LOW') {
$filtered_chains = $self->calculate_simple_low_net($self->chains);
}
if (defined $filtered_chains) {
my $converted_out = $self->convert_output($filtered_chains);
$self->output($converted_out);
}
}
}
###############################################################
sub calculate_simple_high_net {
my ($self, $chains) = @_;
my (@net_chains, @retained_blocks, %contigs_of_kept_blocks);
# Junk chains that have extent overlap
# with retained chains so far
foreach my $c (@$chains) {
my @b = sort { $a->start <=> $b->start } @$c;
my $c_st = $b[0]->start;
my $c_en = $b[-1]->end;
my $keep_chain = 1;
foreach my $oc (@net_chains) {
my @ob = sort { $a->start <=> $b->start } @$oc;
my $oc_st = $ob[0]->start;
my $oc_en = $ob[-1]->end;
if ($oc_st <= $c_en and $oc_en >= $c_st) {
# overlap; junk
$keep_chain = 0;
last;
}
}
if ($keep_chain) {
my (%contigs_of_blocks, @split_blocks);
foreach my $bl (@b) {
my ($inside_seg, @overlap_segs);
foreach my $seg (@{$self->query_seq_level_projection}) {
if ($bl->start >= $seg->from_start and
$bl->end <= $seg->from_end) {
$inside_seg = $seg;
last;
} elsif ($seg->from_start <= $bl->end and
$seg->from_end >= $bl->start) {
push @overlap_segs, $seg;
} elsif ($seg->from_start > $bl->end) {
last;
}
}
if (defined $inside_seg) {
push @split_blocks, $bl;
$contigs_of_blocks{$bl} = $inside_seg;
} else {
my @cut_blocks;
foreach my $seg (@overlap_segs) {
my ($reg_start, $reg_end) = ($bl->start, $bl->end);
$reg_start = $seg->from_start if $seg->from_start > $reg_start;
$reg_end = $seg->from_end if $seg->from_end < $reg_end;
my $cut_block = $bl->restrict_between_positions($reg_start,
$reg_end,
"SEQ");
if (defined $cut_block) {
push @cut_blocks, $cut_block;
$contigs_of_blocks{$cut_block} = $seg;
}
}
push @split_blocks, @cut_blocks;
}
}
@b = @split_blocks;
my %new_block;
map { $new_block{$_} = 1 } @b;
my @new_list = (@retained_blocks, @b);
@new_list = sort { $a->start <=> $b->start } @new_list;
NEWBLOCK:
for(my $i = 0; $i < @new_list; $i++) {
my $bl = $new_list[$i];
if (exists($new_block{$bl})) {
my ($flank_left, $flank_right);
if ($i > 0 and
not exists($new_block{$new_list[$i-1]})) {
$flank_left = $new_list[$i-1];
}
if ($i < scalar(@new_list) - 1 and
not exists($new_block{$new_list[$i+1]})) {
$flank_right = $new_list[$i+1];
}
if (defined $flank_left and
$contigs_of_blocks{$bl}->to_Slice->seq_region_name eq
$contigs_of_kept_blocks{$flank_left}->to_Slice->seq_region_name
) {
$keep_chain = 0;
last NEWBLOCK;
}
if (defined $flank_right and
$contigs_of_blocks{$bl}->to_Slice->seq_region_name eq
$contigs_of_kept_blocks{$flank_right}->to_Slice->seq_region_name
) {
$keep_chain = 0;
last NEWBLOCK;
}
}
}
if ($keep_chain) {
foreach my $bid (keys %contigs_of_blocks) {
$contigs_of_kept_blocks{$bid} = $contigs_of_blocks{$bid};
}
push @net_chains, \@b;
push @retained_blocks, @b;
@retained_blocks = sort { $a->start <=> $b->start } @retained_blocks;
}
}
}
return \@net_chains;
}
################################################################
sub calculate_simple_low_net {
my ($self, $chains) = @_;
my (@net_chains, @retained_blocks, %contigs_of_kept_blocks);
foreach my $c (@$chains) {
my @b = sort { $a->start <=> $b->start } @$c;
my $keep_chain = 1;
BLOCK: foreach my $b (@b) {
OTHER_BLOCK: foreach my $ob (@retained_blocks) {
if ($ob->start <= $b->end and $ob->end >= $b->start) {
$keep_chain = 0;
last BLOCK;
} elsif ($ob->start > $b->end) {
last OTHER_BLOCK;
}
}
}
if ($keep_chain) {
my (%contigs_of_blocks, @split_blocks);
foreach my $bl (@b) {
my ($inside_seg, @overlap_segs);
foreach my $seg (@{$self->query_seq_level_projection}) {
if ($bl->start >= $seg->from_start and
$bl->end <= $seg->from_end) {
$inside_seg = $seg;
last;
} elsif ($seg->from_start <= $bl->end and
$seg->from_end >= $bl->start) {
push @overlap_segs, $seg;
} elsif ($seg->from_start > $bl->end) {
last;
}
}
if (defined $inside_seg) {
push @split_blocks, $bl;
$contigs_of_blocks{$bl} = $inside_seg;
} else {
my @cut_blocks;
foreach my $seg (@overlap_segs) {
my ($reg_start, $reg_end) = ($bl->start, $bl->end);
$reg_start = $seg->from_start if $seg->from_start > $reg_start;
$reg_end = $seg->from_end if $seg->from_end < $reg_end;
my $cut_block = $bl->restrict_between_positions($reg_start,
$reg_end,
"SEQ");
if (defined $cut_block) {
push @cut_blocks, $cut_block;
$contigs_of_blocks{$cut_block} = $seg;
}
}
push @split_blocks, @cut_blocks;
}
}
@b = @split_blocks;
my %new_block;
map { $new_block{$_} = 1 } @b;
my @new_list = (@retained_blocks, @b);
@new_list = sort { $a->start <=> $b->start } @new_list;
NEWBLOCK:
for(my $i = 0; $i < @new_list; $i++) {
my $bl = $new_list[$i];
if (exists($new_block{$bl})) {
my ($flank_left, $flank_right);
if ($i > 0 and
not exists($new_block{$new_list[$i-1]})) {
$flank_left = $new_list[$i-1];
}
if ($i < scalar(@new_list) - 1 and
not exists($new_block{$new_list[$i+1]})) {
$flank_right = $new_list[$i+1];
}
if (defined $flank_left and
#($flank_left->hseqname ne $bl->hseqname or
# $flank_left->hstrand != $bl->hstrand or
# ($bl->hstrand > 0 and $flank_left->hend >= $bl->hstart) or
# ($bl->hstrand < 0 and $flank_left->hstart <= $bl->hend)) and
$contigs_of_blocks{$bl}->to_Slice->seq_region_name eq
$contigs_of_kept_blocks{$flank_left}->to_Slice->seq_region_name
) {
$keep_chain = 0;
last NEWBLOCK;
}
if (defined $flank_right and
#($flank_right->hseqname ne $bl->hseqname or
# $flank_right->hstrand != $bl->hstrand or
# ($bl->hstrand > 0 and $flank_right->hstart <= $bl->hend) or
# ($bl->hstrand < 0 and $flank_right->hend >= $bl->hstart)) and
$contigs_of_blocks{$bl}->to_Slice->seq_region_name eq
$contigs_of_kept_blocks{$flank_right}->to_Slice->seq_region_name
) {
$keep_chain = 0;
last NEWBLOCK;
}
}
}
if ($keep_chain) {
foreach my $bid (keys %contigs_of_blocks) {
$contigs_of_kept_blocks{$bid} = $contigs_of_blocks{$bid};
}
push @net_chains, \@b;
push @retained_blocks, @b;
@retained_blocks = sort { $a->start <=> $b->start } @retained_blocks;
}
}
}
return \@net_chains;
}
############################################################
sub make_runnable {
my ($self, $syntenic) = @_;
my (%query_lengths, %target_lengths);
foreach my $nm (keys %{$self->query_DnaFrag_hash}) {
$query_lengths{$nm} = $self->query_DnaFrag_hash->{$nm}->length;
}
foreach my $nm (keys %{$self->target_DnaFrag_hash}) {
$target_lengths{$nm} = $self->target_DnaFrag_hash->{$nm}->length;
}
my %parameters = (-analysis => $self->analysis,
-chains => $self->chains,
-query_lengths => \%query_lengths,
-target_lengths => \%target_lengths,
-min_chain_score => $self->MIN_CHAIN_SCORE,
-filter_non_syntenic => $syntenic);
$parameters{-chainNet} = $self->CHAIN_NET
? $self->CHAIN_NET
: $BIN_DIR . "/" . "chainNet";
$parameters{-netSyntenic} = $self->NET_SYNTENIC
? $self->NET_SYNTENIC
: $BIN_DIR . "/" . "netSyntenic";
$parameters{-netFilter} = $self->NET_FILTER
? $self->NET_FILTER
: $BIN_DIR . "/" . "netFilter";
my $run = Bio::EnsEMBL::Analysis::Runnable::AlignmentNets->new(%parameters);
return $run;
}
##############################################################
sub chains {
my ($self,$value) = @_;
if (defined $value) {
$self->{_chains} = $value;
}
return $self->{_chains};
}
sub query_seq_level_projection {
my ($self, $val) = @_;
if (defined $val) {
$self->{_query_proj_segs} = $val;
}
return $self->{_query_proj_segs};
}
####################################
# config variable holders
####################################
sub read_and_check_config {
my ($self, $hash) = @_;
$self->SUPER::read_and_check_config($hash);
my $logic = $self->analysis->logic_name;
foreach my $var (qw(INPUT_METHOD_LINK_TYPE
OUTPUT_METHOD_LINK_TYPE
QUERY_SPECIES
TARGET_SPECIES
COMPARA_DB
METHOD)) {
throw("You must define $var in config for logic '$logic'" .
" or in the DEFAULT entry")
if not $self->$var;
}
# check for sanity
my %allowable_methods =
(
STANDARD => 1,
SYNTENIC => 1,
SIMPLE_HIGH => 1,
SIMPLE_MEDIUM => 1,
SIMPLE_LOW => 1,
);
if ($self->METHOD and
not $allowable_methods{$self->METHOD}) {
throw("You must set METHOD to one of the reserved names\n" .
"See the .example file for these names");
}
}
sub QUERY_CORE_DB {
my ($self, $val) = @_;
if (defined $val) {
$self->{_query_core_db} = $val;
}
return $self->{_query_core_db};
}
sub QUERY_SPECIES {
my ($self, $val) = @_;
if (defined $val) {
$self->{_query_species} = $val;
}
return $self->{_query_species};
}
sub TARGET_SPECIES {
my ($self, $val) = @_;
if (defined $val) {
$self->{_target_species} = $val;
}
return $self->{_target_species};
}
sub METHOD {
my ($self, $val) = @_;
if (defined $val) {
$self->{_primary_method} = $val;
}
return $self->{_primary_method};
}
sub CHAIN_NET {
my ($self, $val) = @_;
if (defined $val) {
$self->{_chain_net_prog} = $val;
}
return $self->{_chain_net_prog};
}
sub NET_SYNTENIC {
my ($self, $val) = @_;
if (defined $val) {
$self->{_net_syntenic_prog} = $val;
}
return $self->{_net_syntenic_prog};
}
sub NET_FILTER {
my ($self, $val) = @_;
if (defined $val) {
$self->{_net_filter_prog} = $val;
}
return $self->{_net_filter_prog};
}
1;