Raw content of Bio::EnsEMBL::Analysis::RunnableDB::LayerAnnotation
=head1 NAME
LayerAnnotation - DESCRIPTION of Object
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 CONTACT
ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::LayerAnnotation;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild;
use Bio::EnsEMBL::Analysis::Config::GeneBuild::LayerAnnotation;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::GeneUtils;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw (rearrange);
@ISA = qw (
Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild
);
sub new{
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
$self->read_and_check_config($LAYERANNOTATION_CONFIG_BY_LOGIC);
# other stuff here
return $self;
}
###################################
sub fetch_input {
my ($self) = @_;
foreach my $dbname (@{$self->SOURCEDB_REFS}) {
my $dbh = $self->get_dbadaptor($dbname);
my $slice = $dbh->get_SliceAdaptor->fetch_by_name($self->input_id);
my $tlslice = $dbh->get_SliceAdaptor->fetch_by_region($slice->coord_system->name,
$slice->seq_region_name);
foreach my $layer (@{$self->layers}) {
foreach my $tp (@{$layer->biotypes}) {
foreach my $g (@{$slice->get_all_Genes_by_type($tp)}) {
$g = $g->transfer($tlslice);
push @{$layer->genes}, $g;
}
}
}
}
}
#####################################
sub run {
my ($self) = @_;
my (@retained_genes);
my @layers = @{$self->layers};
for(my $i=0; $i < @layers; $i++) {
my $layer = $layers[$i];
if ($layer->genes) {
my @layer_genes = sort {$a->start <=> $b->start} @{$layer->genes};
my @compare_genes;
my %filter_against = map { $_ => 1 } @{$layer->filter_against};
for(my $j = $i-1; $j>=0; $j--) {
if (exists $filter_against{$layers[$j]->id} and
@{$layers[$j]->genes}) {
push @compare_genes, @{$layers[$j]->genes};
}
}
@compare_genes = sort {$a->start <=> $b->start} @compare_genes;
if ($layer->filter_object) {
@layer_genes = @{$layer->filter_object->filter(\@layer_genes, \@compare_genes)};
} else {
@layer_genes = @{$self->generic_filter(\@layer_genes, \@compare_genes)};
}
if (not $layer->discard) {
push @retained_genes, @layer_genes;
}
$layer->genes(\@layer_genes);
}
}
$self->output(\@retained_genes);
}
##################################
sub write_output {
my($self) = @_;
my $target_db = $self->get_dbadaptor($self->TARGETDB_REF);
my $g_adap = $target_db->get_GeneAdaptor;
# fully loading gene is required for the store to work
# reliably. However, fully loading all genes to be stored
# up front is expensive in memory. Therefore, load, store and
# discard one gene at a time
my $out = $self->output;
while(my $g = shift @$out) {
fully_load_Gene($g);
$g_adap->store($g);
}
return 1;
}
#####################################
sub generic_filter {
my ($self, $these, $others) = @_;
# interference is judged by overlap at exon level
# assumption is that @others is sorted by gene start
my @filtered;
my $cur_idx = 0;
foreach my $obj (@$these) {
my (@genomic_overlap, $left_bound);
for(my $i=$cur_idx; $i < @$others; $i++) {
my $o_obj = $others->[$i];
if ($o_obj->end >= $obj->start and not defined $left_bound) {
$left_bound = $i;
}
if ($o_obj->end < $obj->start) {
next;
} elsif ($o_obj->start > $obj->end) {
last;
} else {
push @genomic_overlap, $o_obj;
}
}
$cur_idx = $left_bound if defined $left_bound;
my $exon_overlap = 0;
if (@genomic_overlap) {
my @exons = @{$obj->get_all_Exons};
OG: foreach my $o_obj (@genomic_overlap) {
foreach my $oe (@{$o_obj->get_all_Exons}) {
foreach my $e (@exons) {
if ($oe->strand == $e->strand and
$oe->end >= $e->start and
$oe->start <= $e->end) {
$exon_overlap = 1;
last OG;
}
}
}
}
}
if (not $exon_overlap) {
push @filtered, $obj;
}
}
return \@filtered;
}
####################################
sub layers {
my ($self, $val) = @_;
if (defined $val) {
$self->{_runnabledb_layers} = $val;
}
return $self->{_runnabledb_layers};
}
####################################
sub read_and_check_config {
my ($self, $hash) = @_;
$self->SUPER::read_and_check_config($hash);
# check that all relevant vars have been defined
foreach my $i (qw(LAYERS
SOURCEDB_REFS
TARGETDB_REF)) {
throw("You must define $i in config")
if not $self->$i;
}
# check types
throw("Config var LAYERS must be an array reference")
if ref($self->LAYERS) ne "ARRAY";
throw("Config var SOURCEDB_REFS must be an array reference")
if ref($self->SOURCEDB_REFS) ne "ARRAY";
throw("Config var TARGETDB_REF must be an scalar")
if ref($self->TARGETDB_REF);
my (%biotypes, %layer_ids, @layers);
# finally, check the integrity of the LAYERS
foreach my $el (@{$self->LAYERS}) {
throw("Elements of LAYERS must be hash references")
if ref($el) ne "HASH";
throw("Each element of LAYERS must contain a layer ID")
if not exists $el->{ID};
throw("LAYER " . $el->{ID} . " should a list of BIOTYPES")
if not exists $el->{BIOTYPES} or ref($el->{BIOTYPES}) ne "ARRAY";
my $layer_id = $el->{ID};
my @biotypes = @{$el->{BIOTYPES}};
my $discard = 0;
my ($filter, @filter_against);
if (exists $el->{DISCARD} and $el->{DISCARD}) {
$discard = 1;
}
foreach my $tp (@biotypes) {
if (exists $biotypes{$tp}) {
throw("biotype $tp occurs more than once");
}
$biotypes{$tp} = 1;
}
if (exists $el->{FILTER_AGAINST}) {
throw("In layer $layer_id FILTER_AGAINST must contain a list of layer ids")
if ref($el->{FILTER_AGAINST}) ne "ARRAY";
@filter_against = @{$el->{FILTER_AGAINST}};
foreach my $id (@filter_against) {
throw("In FILTER_AGAINST in layer $layer_id, '$id' is not the name ".
"of a higher level layer")
if not exists $layer_ids{$id};
}
}
if (exists $el->{FILTER}) {
$self->require_module($el->{FILTER});
$filter = $el->{FILTER}->new;
}
push @layers, Bio::EnsEMBL::Analysis::RunnableDB::LayerAnnotation::Layer
->new(-id => $layer_id,
-discard => $discard,
-biotypes => \@biotypes,
-filter_object => $filter,
-filter_against => \@filter_against);
$layer_ids{$layer_id} = 1;
}
$self->layers(\@layers);
}
################################################
sub LAYERS {
my ($self, $val) = @_;
if (defined $val) {
$self->{_layers} = $val;
}
return $self->{_layers};
}
sub SOURCEDB_REFS {
my ($self, $val) = @_;
if (defined $val) {
$self->{_inputdb_names} = $val;
}
return $self->{_inputdb_names};
}
sub TARGETDB_REF {
my ($self, $val) = @_;
if (defined $val) {
$self->{_outputdb_name} = $val;
}
return $self->{_outputdb_name};
}
##############################################
package Bio::EnsEMBL::Analysis::RunnableDB::LayerAnnotation::Layer;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw (rearrange);
sub new {
my ($class, @args) = @_;
my $self = bless {}, $class;
my ($id,
$discard,
$genes,
$filter_object,
$filter_against,
$biotypes) = rearrange
(['ID','DISCARD','GENES','FILTER_OBJECT','FILTER_AGAINST','BIOTYPES'],@args);
throw("Layers must have an id") if not defined $id;
throw("Layers must have a list of biotypes") if not defined $biotypes;
$discard = 0 if not defined $discard;
$genes = [] if not defined $genes;
$filter_against = [] if not defined $filter_against;
$self->id($id);
$self->filter_object($filter_object) if defined $filter_object;
$self->filter_against($filter_against);
$self->biotypes($biotypes);
$self->discard($discard);
$self->genes($genes);
return $self;
}
sub id{
my ($self,$value) = @_;
if (defined $value) {
$self->{_layer_id} = $value;
}
return $self->{_layer_id};
}
sub filter_against{
my ($self,$value) = @_;
if (defined $value) {
$self->{_layer_filter_against} = $value;
}
return $self->{_layer_filter_against};
}
sub filter_object{
my ($self,$value) = @_;
if (defined $value) {
$self->{_layer_filter} = $value;
}
return $self->{_layer_filter};
}
sub discard{
my ($self,$value) = @_;
if (defined $value) {
$self->{_layer_discard} = $value;
}
return $self->{_layer_discard};
}
sub biotypes{
my ($self,$value) = @_;
if (defined $value) {
$self->{_layer_biotypes} = $value;
}
return $self->{_layer_biotypes};
}
sub genes{
my ($self,$value) = @_;
if (defined $value) {
$self->{_layer_genes} = $value;
}
return $self->{_layer_genes};
}
1;