Raw content of Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::SimpleNets
# Cared for by Ensembl
#
# Copyright GRL & EBI
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::Production::GenomicAlign::AlignmentSimple
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::DBAdaptor->new($locator);
my $genscan = Bio::EnsEMBL::Compara::Production::GenomicAlign::SimpleNets->new (
-db => $db,
-input_id => $input_id
-analysis => $analysis );
$genscan->fetch_input();
$genscan->run();
$genscan->write_output(); #writes to DB
=head1 DESCRIPTION
Given an compara MethodLinkSpeciesSet identifer, and a reference genomic
slice identifer, fetches the GenomicAlignBlocks from the given compara
database, infers chains from the group identifiers, and then forms
an alignment net from the chains and writes the result
back to the database.
This module implements some simple net-inspired functionality directly
in Perl, and does not rely on Jim Kent's original Axt tools
=cut
package Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::SimpleNets;
use strict;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Hive::Process;
use Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::AlignmentProcessing;
our @ISA = qw(Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::AlignmentProcessing);
############################################################
sub get_params {
my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n");
my $params = eval($param_string);
return unless($params);
$self->SUPER::get_params($param_string);
if (defined($params->{'qy_dnafrag_id'})) {
$self->QUERY_DNAFRAG_ID($params->{'qy_dnafrag_id'});
}
if (defined($params->{'tg_genomedb_id'})) {
$self->TARGET_GENOMEDB_ID($params->{'tg_genomedb_id'});
}
if (defined $params->{'net_method'}) {
$self->NET_METHOD($params->{'net_method'});
}
return 1;
}
=head2 fetch_input
Title : fetch_input
Usage : $self->fetch_input
Function:
Returns : nothing
Args : none
=cut
sub fetch_input {
my( $self) = @_;
$self->SUPER::fetch_input;
$self->compara_dba->dbc->disconnect_when_inactive(0);
my $mlssa = $self->compara_dba->get_MethodLinkSpeciesSetAdaptor;
my $dnafa = $self->compara_dba->get_DnaFragAdaptor;
my $gdba = $self->compara_dba->get_GenomeDBAdaptor;
my $gaba = $self->compara_dba->get_GenomicAlignBlockAdaptor;
$self->get_params($self->analysis->parameters);
$self->get_params($self->input_id);
################################################################
# get the compara data: MethodLinkSpeciesSet, reference DnaFrag,
# and GenomicAlignBlocks
################################################################
my $qy_dnafrag;
if ($self->QUERY_DNAFRAG_ID) {
$qy_dnafrag = $dnafa->fetch_by_dbID($self->QUERY_DNAFRAG_ID);
my @seq_level_bits = @{$qy_dnafrag->slice->project('seqlevel')};
$self->query_seq_level_projection(\@seq_level_bits);
}
throw("Could not fetch DnaFrag with dbID " . $self->QUERY_DNAFRAG_ID )
if not defined $qy_dnafrag;
my $tg_gdb;
if ($self->TARGET_GENOMEDB_ID) {
$tg_gdb = $gdba->fetch_by_dbID($self->TARGET_GENOMEDB_ID);
}
throw("Could not fetch GenomeDB with dbID " . $self->TARGET_GENOMEDB_ID)
if not defined $tg_gdb;
my $mlss = $mlssa->fetch_by_method_link_type_GenomeDBs($self->INPUT_METHOD_LINK_TYPE,
[$qy_dnafrag->genome_db, $tg_gdb]);
throw("No MethodLinkSpeciesSet for " . $self->INPUT_METHOD_LINK_TYPE)
if not defined $mlss;
my $out_mlss = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
$out_mlss->method_link_type($self->OUTPUT_METHOD_LINK_TYPE);
$out_mlss->species_set($mlss->species_set);
$mlssa->store($out_mlss);
######## needed for output####################
$self->output_MethodLinkSpeciesSet($out_mlss);
if ($self->input_job->retry_count > 0) {
print STDERR "Deleting alignments as it is a rerun\n";
$self->delete_alignments($out_mlss,
$qy_dnafrag);
}
my $gabs = $gaba->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss,
$qy_dnafrag);
###################################################################
# get the target slices and bin the GenomicAlignBlocks by group id
###################################################################
my %chains;
while (my $gab = shift @{$gabs}) {
my ($qy_ga) = $gab->reference_genomic_align;
my ($tg_ga) = @{$gab->get_all_non_reference_genomic_aligns};
my $group_id = $gab->group_id;
if (not exists $chains{$group_id}) {
$chains{$group_id} = {
score => $gab->score,
query_name => $qy_ga->dnafrag->name,
query_pos => $qy_ga->dnafrag_start,
target_name => $tg_ga->dnafrag->name,
target_pos => $tg_ga->dnafrag_start,
blocks => [],
};
} else {
if ($gab->score > $chains{$group_id}->{score}) {
$chains{$group_id}->{score} = $gab->score;
}
if ($chains{$group_id}->{query_pos} > $qy_ga->dnafrag_start) {
$chains{$group_id}->{query_pos} = $qy_ga->dnafrag_start;
}
if ($chains{$group_id}->{target_pos} > $tg_ga->dnafrag_start) {
$chains{$group_id}->{target_pos} = $tg_ga->dnafrag_start;
}
}
push @{$chains{$group_id}->{blocks}}, $gab;
}
# sort the blocks within each chain
foreach my $group_id (keys %chains) {
$chains{$group_id}->{blocks} = [sort {
$a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start;
} @{$chains{$group_id}->{blocks}}];
}
# now sort the chains by score. Ties are resolved by target and location
# to make the sort deterministic
my @chains;
foreach my $group_id (sort { $chains{$b}->{score} <=> $chains{$a}->{score} or
$chains{$a}->{target_name} cmp $chains{$b}->{target_name} or
$chains{$a}->{target_pos} <=> $chains{$b}->{target_pos} or
$chains{$a}->{query_pos} <=> $chains{$b}->{query_pos}
} keys %chains) {
push @chains, $chains{$group_id}->{blocks};
}
$self->input_chains(\@chains);
}
sub run {
my ($self) = @_;
my $output;
if ($self->NET_METHOD) {
no strict 'refs';
my $method = $self->NET_METHOD;
$output = $self->$method;
} else {
$output = $self->ContigAwareNet();
}
$self->cleanse_output($output);
$self->output($output);
}
sub write_output {
my $self = shift;
my $disconnect_when_inactive_default = $self->db->dbc->disconnect_when_inactive;
$self->compara_dba->dbc->disconnect_when_inactive(0);
$self->SUPER::write_output;
$self->compara_dba->dbc->disconnect_when_inactive($disconnect_when_inactive_default);
}
############################
# specific net methods
###########################
my @ALLOWABLE_METHODS = qw(ContigAwareNet);
sub SUPPORTED_METHOD {
my ($class, $method ) = @_;
my $allowed = 0;
foreach my $meth (@ALLOWABLE_METHODS) {
if ($meth eq $method) {
$allowed = 1;
last;
}
}
return $allowed;
}
sub ContigAwareNet {
my ($self) = @_;
my $chains = $self->input_chains;
# assumption 1: chains are sorted from "best" to "worst"
# assumption 2: each chain is sorted from start to end in query (ref) sequence
my (@net_chains, @retained_blocks, %contigs_of_kept_blocks);
foreach my $c (@$chains) {
my @blocks = @$c;
my $keep_chain = 1;
BLOCK: foreach my $block (@blocks) {
my $qga = $block->reference_genomic_align;
#my ($tga) = @{$block->get_all_non_reference_genomic_aligns};
OTHER_BLOCK: foreach my $oblock (@retained_blocks) {
my $oqga = $oblock->reference_genomic_align;
#my ($otga) = @{$ob->get_all_non_reference_genomic_aligns};
if ($oqga->dnafrag_start <= $qga->dnafrag_end and
$oqga->dnafrag_end >= $qga->dnafrag_start) {
$keep_chain = 0;
last BLOCK;
} elsif ($oqga->dnafrag_start > $qga->dnafrag_end) {
last OTHER_BLOCK;
}
}
}
if ($keep_chain) {
my (%contigs_of_blocks, @split_blocks);
# the following chops the blocks into pieces such that each block
# lies completely within a sequence-level region (contig). It's rare
# that this is not the case anyway, but it's best to be sure...
foreach my $block (@blocks) {
my ($inside_seg, @overlap_segs);
my $qga = $block->reference_genomic_align;
foreach my $seg (@{$self->query_seq_level_projection}) {
if ($qga->dnafrag_start >= $seg->from_start and
$qga->dnafrag_end <= $seg->from_end) {
$inside_seg = $seg;
last;
} elsif ($seg->from_start <= $qga->dnafrag_end and
$seg->from_end >= $qga->dnafrag_start) {
push @overlap_segs, $seg;
} elsif ($seg->from_start > $qga->dnafrag_end) {
last;
}
}
if (defined $inside_seg) {
push @split_blocks, $block;
$contigs_of_blocks{$block} = $inside_seg;
} else {
my @cut_blocks;
foreach my $seg (@overlap_segs) {
my ($reg_start, $reg_end) = ($qga->dnafrag_start, $qga->dnafrag_end);
$reg_start = $seg->from_start if $seg->from_start > $reg_start;
$reg_end = $seg->from_end if $seg->from_end < $reg_end;
my $cut_block = $block->restrict_between_reference_positions($reg_start,
$reg_end);
$cut_block->score($block->score);
if (defined $cut_block) {
push @cut_blocks, $cut_block;
$contigs_of_blocks{$cut_block} = $seg;
}
}
push @split_blocks, @cut_blocks;
}
}
@blocks = @split_blocks;
# only retain blocks that lie on different contigs from all retained
# blocks so far
my @diff_contig_blocks;
my %kept_contigs = reverse %contigs_of_kept_blocks;
foreach my $block (@blocks) {
if (not exists $kept_contigs{$contigs_of_blocks{$block}}) {
push @diff_contig_blocks, $block;
}
}
# calculate what proportion of the overall chain remains; reject if
# the proportion is less than 50%
my $kept_len = 0;
my $total_len = 0;
map {
$kept_len += $_->reference_genomic_align->dnafrag_end - $_->reference_genomic_align->dnafrag_start + 1;
} @diff_contig_blocks;
map {
$total_len += $_->reference_genomic_align->dnafrag_end - $_->reference_genomic_align->dnafrag_start + 1;
} @blocks;
if ($kept_len / $total_len > 0.5) {
foreach my $bid (keys %contigs_of_blocks) {
$contigs_of_kept_blocks{$bid} = $contigs_of_blocks{$bid};
}
push @net_chains, \@diff_contig_blocks;
push @retained_blocks, @diff_contig_blocks;
@retained_blocks = sort {
$a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start;
} @retained_blocks;
}
}
}
# fetch all genomic_aligns from the result blocks to avoid cacheing issues
# when storing
foreach my $ch (@net_chains) {
foreach my $bl (@{$ch}) {
foreach my $al (@{$bl->get_all_GenomicAligns}) {
$al->dnafrag;
}
}
}
return \@net_chains;
}
#############################
sub input_chains {
my ($self, $val) = @_;
if (defined $val) {
$self->{_query_chains} = $val;
}
return $self->{_query_chains};
}
sub query_seq_level_projection {
my ($self, $val) = @_;
if (defined $val) {
$self->{_query_seq_level_bits} = $val;
}
return $self->{_query_seq_level_bits};
}
#########################################
# config vars
sub NET_METHOD {
my ($self, $val) = @_;
if (defined $val) {
$self->{_net_type} = $val;
}
return $self->{_net_type};
}
sub QUERY_DNAFRAG_ID {
my ($self,$value) = @_;
if (defined $value) {
$self->{'_query_dnafrag_id'} = $value;
}
return $self->{'_query_dnafrag_id'};
}
sub TARGET_GENOMEDB_ID {
my ($self,$value) = @_;
if (defined $value) {
$self->{'_target_genomedb_id'} = $value;
}
return $self->{'_target_genomedb_id'};
}
1;