Raw content of Bio::EnsEMBL::IdMapping::Cache
=head1 LICENSE
Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at .
Questions may also be sent to the Ensembl help desk at
.
=cut
=head1 NAME
Bio::EnsEMBL::IdMapping::Cache - a cache to hold data objects used by the
IdMapping application
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=cut
package Bio::EnsEMBL::IdMapping::Cache;
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append);
use Bio::EnsEMBL::IdMapping::TinyGene;
use Bio::EnsEMBL::IdMapping::TinyTranscript;
use Bio::EnsEMBL::IdMapping::TinyTranslation;
use Bio::EnsEMBL::IdMapping::TinyExon;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Storable qw(nstore retrieve);
use Digest::MD5 qw(md5_hex);
# define available cache names here
my @cache_names = qw(
exons_by_id
transcripts_by_id
transcripts_by_exon_id
translations_by_id
genes_by_id
genes_by_transcript_id
);
=head2 new
Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
Example : my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
-LOGGER => $logger,
-CONF => $conf,
);
Description : constructor
Return type : Bio::EnsEMBL::IdMapping::Cache object
Exceptions : thrown on wrong or missing arguments
Caller : general
Status : At Risk
: under development
=cut
sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my ($logger, $conf, $load_instance) =
rearrange(['LOGGER', 'CONF', 'LOAD_INSTANCE'], @_);
unless ($logger->isa('Bio::EnsEMBL::Utils::Logger')) {
throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
}
unless ($conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
}
my $self = {};
bless ($self, $class);
# initialise
$self->logger($logger);
$self->conf($conf);
if ($load_instance) {
$self->read_instance_from_file;
}
return $self;
}
=head2 build_cache_by_slice
Arg[1] : String $dbtype - db type (source|target)
Arg[2] : String $slice_name - the name of a slice (format as returned by
Bio::EnsEMBL::Slice->name)
Example : my ($num_genes, $filesize) = $cache->build_cache_by_slice(
'source', 'chromosome:NCBI36:X:1:1000000:-1');
Description : Builds a cache of genes, transcripts, translations and exons
needed by the IdMapping application and serialises the resulting
cache object to a file, one slice at a time.
Return type : list of the number of genes processed and the size of the
serialised cache file
Exceptions : thrown on invalid slice name
Caller : general
Status : At Risk
: under development
=cut
sub build_cache_by_slice {
my $self = shift;
my $dbtype = shift;
my $slice_name = shift;
# set cache method (required for loading cache later)
$self->cache_method('BY_SEQ_REGION');
my $dba = $self->get_DBAdaptor($dbtype);
my $sa = $dba->get_SliceAdaptor;
my $slice = $sa->fetch_by_name($slice_name);
unless ($slice) {
throw("Could not retrieve slice $slice_name.");
}
my $genes = $slice->get_all_Genes(undef, undef, 1);
# find common coord_system
my $common_cs_found = $self->find_common_coord_systems;
# find out whether native coord_system is a common coord_system.
# if so, you don't need to project.
# also don't project if no common coord_system present
my $need_project = 1;
my $csid = join(':', $slice->coord_system_name,
$slice->coord_system->version);
if ($self->is_common_cs($csid) or !$self->highest_common_cs) {
$need_project = 0;
}
# build cache
my $type = "$dbtype.$slice_name";
my $num_genes = $self->build_cache_from_genes($type, $genes, $need_project);
undef $genes;
# write cache to file, then flush cache to reclaim memory
my $size = $self->write_all_to_file($type);
return $num_genes, $size;
}
=head2 build_cache_all
Arg[1] : String $dbtype - db type (source|target)
Example : my ($num_genes, $filesize) = $cache->build_cache_all('source');
Description : Builds a cache of genes, transcripts, translations and exons
needed by the IdMapping application and serialises the
resulting cache object to a file. All genes across the genome
are processed in one go. This method should be used when
build_cache_by_seq_region can't be used due to a large number
of toplevel seq_regions (e.g. 2x genomes).
Return type : list of the number of genes processed and the size of the
serialised cache file
Exceptions : thrown on invalid slice name
Caller : general
Status : At Risk
: under development
=cut
sub build_cache_all {
my $self = shift;
my $dbtype = shift;
# set cache method (required for loading cache later)
$self->cache_method('ALL');
my $dba = $self->get_DBAdaptor($dbtype);
my $ga = $dba->get_GeneAdaptor;
my $genes = $ga->fetch_all;
# find common coord_system
my $common_cs_found = $self->find_common_coord_systems;
# Build cache. Setting $need_project to 'CHECK' will cause
# build_cache_from_genes() to check the coordinate system for each gene.
my $type = "$dbtype.ALL";
my $num_genes = $self->build_cache_from_genes($type, $genes, 'CHECK');
undef $genes;
# write cache to file, then flush cache to reclaim memory
my $size = $self->write_all_to_file($type);
return $num_genes, $size;
}
=head2 build_cache_from_genes
Arg[1] : String $type - cache type
Arg[2] : Listref of Bio::EnsEMBL::Genes $genes - genes to build cache
from
Arg[3] : Boolean $need_project - indicate if we need to project exons to
common coordinate system
Example : $cache->build_cache_from_genes(
'source.chromosome:NCBI36:X:1:100000:1', \@genes);
Description : Builds the cache by fetching transcripts, translations and exons
for a list of genes from the database, and creating lightweight
Bio::EnsEMBL::IdMapping::TinyFeature objects containing only the
data needed by the IdMapping application. These objects are
attached to a name cache in this cache object. Exons only need
to be projected to a commond coordinate system if their native
coordinate system isn't common to source and target assembly
itself.
Return type : int - number of genes after filtering
Exceptions : thrown on wrong or missing arguments
Caller : internal
Status : At Risk
: under development
=cut
sub build_cache_from_genes {
my $self = shift;
my $type = shift;
my $genes = shift;
my $need_project = shift;
throw("You must provide a type.") unless $type;
throw("You must provide a listref of genes.") unless (ref($genes) eq 'ARRAY');
# biotype filter
if ( $self->conf->param('biotypes') ) {
$genes = $self->filter_biotypes($genes);
}
my $num_genes = scalar(@$genes);
# initialise cache for the given type.
$self->{'cache'}->{$type} = {};
#my $i = 0;
#my $num_genes = scalar(@$genes);
#my $progress_id = $self->logger->init_progress($num_genes);
# loop over genes sorted by gene location.
# the sort will hopefully improve assembly mapper cache performance and
# therefore speed up exon sequence retrieval
foreach my $gene (sort { $a->start <=> $b->start } @$genes) {
#$self->logger->log_progressbar($progress_id, ++$i, 2);
#$self->logger->log_progress($num_genes, ++$i, 20, 3, 1);
if ($need_project eq 'CHECK') {
# find out whether native coord_system is a common coord_system.
# if so, you don't need to project.
# also don't project if no common coord_system present
if ($self->highest_common_cs) {
my $csid = join(':', $gene->slice->coord_system_name,
$gene->slice->coord_system->version);
if ($self->is_common_cs($csid)) {
$need_project = 0;
}
} else {
$need_project = 0;
}
}
# create lightweigt gene
my $lgene = Bio::EnsEMBL::IdMapping::TinyGene->new_fast([
$gene->dbID,
$gene->stable_id,
$gene->version,
$gene->created_date,
$gene->modified_date,
$gene->start,
$gene->end,
$gene->strand,
$gene->slice->seq_region_name,
$gene->biotype,
$gene->status,
$gene->analysis->logic_name,
($gene->is_known ? 1 : 0),
]);
# build gene caches
$self->add('genes_by_id', $type, $gene->dbID, $lgene);
# transcripts
foreach my $tr (@{ $gene->get_all_Transcripts }) {
my $ltr = Bio::EnsEMBL::IdMapping::TinyTranscript->new_fast([
$tr->dbID,
$tr->stable_id,
$tr->version,
$tr->created_date,
$tr->modified_date,
$tr->start,
$tr->end,
$tr->strand,
$tr->length,
md5_hex($tr->spliced_seq),
($tr->is_known ? 1 : 0),
]);
$lgene->add_Transcript($ltr);
# build transcript caches
$self->add('transcripts_by_id', $type, $tr->dbID, $ltr);
$self->add('genes_by_transcript_id', $type, $tr->dbID, $lgene);
# translation (if there is one)
if (my $tl = $tr->translation) {
my $ltl = Bio::EnsEMBL::IdMapping::TinyTranslation->new_fast([
$tl->dbID,
$tl->stable_id,
$tl->version,
$tl->created_date,
$tl->modified_date,
$tr->dbID,
$tr->translate->seq,
($tr->is_known ? 1 : 0),
]);
$ltr->add_Translation($ltl);
$self->add('translations_by_id', $type, $tl->dbID, $ltl);
undef $tl;
}
# exons
foreach my $exon (@{ $tr->get_all_Exons }) {
my $lexon = Bio::EnsEMBL::IdMapping::TinyExon->new_fast([
$exon->dbID,
$exon->stable_id,
$exon->version,
$exon->created_date,
$exon->modified_date,
$exon->start,
$exon->end,
$exon->strand,
$exon->slice->seq_region_name,
$exon->slice->coord_system_name,
$exon->slice->coord_system->version,
$exon->slice->subseq($exon->start, $exon->end, $exon->strand),
$exon->phase,
$need_project,
]);
# get coordinates in common coordinate system if needed
if ($need_project) {
my @seg = @{ $exon->project($self->highest_common_cs,
$self->highest_common_cs_version) };
if (scalar(@seg) == 1) {
my $sl = $seg[0]->to_Slice;
$lexon->common_start($sl->start);
$lexon->common_end($sl->end);
$lexon->common_strand($sl->strand);
$lexon->common_sr_name($sl->seq_region_name);
}
}
$ltr->add_Exon($lexon);
$self->add('exons_by_id', $type, $exon->dbID, $lexon);
$self->add_list('transcripts_by_exon_id', $type, $exon->dbID, $ltr);
undef $exon;
}
undef $tr;
}
undef $gene;
}
return $num_genes;
}
=head2 filter_biotypes
Arg[1] : Listref of Bio::EnsEMBL::Genes $genes - the genes to filter
Example : my @filtered = @{ $cache->filter_biotypes(\@genes) };
Description : Filters a list of genes by biotype. Biotypes are taken from the
IdMapping configuration parameter 'biotypes'.
Return type : Listref of Bio::EnsEMBL::Genes (or empty list)
Exceptions : none
Caller : internal
Status : At Risk
: under development
=cut
sub filter_biotypes {
my $self = shift;
my $genes = shift;
my $filtered = [];
foreach my $biotype ($self->conf->param('biotypes')) {
push @$filtered, grep { $_->biotype eq $biotype } @$genes;
}
return $filtered;
}
=head2 add
Arg[1] : String $name - a cache name (e.g. 'genes_by_id')
Arg[2] : String type - a cache type (e.g. "source.$slice_name")
Arg[3] : String $key - key of this entry (e.g. a gene dbID)
Arg[4] : Bio::EnsEMBL::IdMappping::TinyFeature $val - value to cache
Example : $cache->add('genes_by_id',
'source.chromosome:NCBI36:X:1:1000000:1', '1234', $tiny_gene);
Description : Adds a TinyFeature object to a named cache.
Return type : Bio::EnsEMBL::IdMapping::TinyFeature
Exceptions : thrown on wrong or missing arguments
Caller : internal
Status : At Risk
: under development
=cut
sub add {
my $self = shift;
my $name = shift;
my $type = shift;
my $key = shift;
my $val = shift;
throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
throw("You must provide a cache type.") unless $type;
throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
$self->{'cache'}->{$type}->{$name}->{$key} = $val;
return $self->{'cache'}->{$type}->{$name}->{$key};
}
=head2 add_list
Arg[1] : String $name - a cache name (e.g. 'genes_by_id')
Arg[2] : String type - a cache type (e.g. "source.$slice_name")
Arg[3] : String $key - key of this entry (e.g. a gene dbID)
Arg[4] : List of Bio::EnsEMBL::IdMappping::TinyFeature @val - values
to cache
Example : $cache->add_list('transcripts_by_exon_id',
'source.chromosome:NCBI36:X:1:1000000:1', '1234',
$tiny_transcript1, $tiny_transcript2);
Description : Adds a list of TinyFeature objects to a named cache.
Return type : Listref of Bio::EnsEMBL::IdMapping::TinyFeature objects
Exceptions : thrown on wrong or missing arguments
Caller : internal
Status : At Risk
: under development
=cut
sub add_list {
my $self = shift;
my $name = shift;
my $type = shift;
my $key = shift;
my @vals = @_;
throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
throw("You must provide a cache type.") unless $type;
throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
push @{ $self->{'cache'}->{$type}->{$name}->{$key} }, @vals;
return $self->{'cache'}->{$type}->{$name}->{$key};
}
sub get_by_key {
my $self = shift;
my $name = shift;
my $type = shift;
my $key = shift;
throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
throw("You must provide a cache type.") unless $type;
throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
# transparently load cache from file unless already loaded
unless ($self->{'instance'}->{'loaded'}->{"$type"}) {
$self->read_and_merge($type);
}
return $self->{'cache'}->{$type}->{$name}->{$key};
}
sub get_by_name {
my $self = shift;
my $name = shift;
my $type = shift;
throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
throw("You must provide a cache type.") unless $type;
# transparently load cache from file unless already loaded
unless ($self->{'instance'}->{'loaded'}->{$type}) {
$self->read_and_merge($type);
}
return $self->{'cache'}->{$type}->{$name} || {};
}
sub get_count_by_name {
my $self = shift;
my $name = shift;
my $type = shift;
throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
throw("You must provide a cache type.") unless $type;
# transparently load cache from file unless already loaded
unless ($self->{'instance'}->{'loaded'}->{$type}) {
$self->read_and_merge($type);
}
return scalar(keys %{ $self->get_by_name($name, $type) });
}
sub find_common_coord_systems {
my $self = shift;
# get adaptors for source db
my $s_dba = $self->get_DBAdaptor('source');
my $s_csa = $s_dba->get_CoordSystemAdaptor;
my $s_sa = $s_dba->get_SliceAdaptor;
# get adaptors for target db
my $t_dba = $self->get_DBAdaptor('target');
my $t_csa = $t_dba->get_CoordSystemAdaptor;
my $t_sa = $t_dba->get_SliceAdaptor;
# find common coord_systems
my @s_coord_systems = @{ $s_csa->fetch_all };
my @t_coord_systems = @{ $t_csa->fetch_all };
my $found_highest = 0;
SOURCE:
foreach my $s_cs (@s_coord_systems) {
TARGET:
foreach my $t_cs (@t_coord_systems) {
if ($s_cs->name eq $t_cs->name) {
# test for identical coord_system version
if ($s_cs->version and ($s_cs->version ne $t_cs->version)) {
next TARGET;
}
# test for at least 50% identical seq_regions
if ($self->seq_regions_compatible($s_cs, $s_sa, $t_sa)) {
$self->add_common_cs($s_cs);
unless ($found_highest) {
$self->highest_common_cs($s_cs->name);
$self->highest_common_cs_version($s_cs->version);
}
$found_highest = 1;
next SOURCE;
}
}
}
}
return $found_highest;
}
sub seq_regions_compatible {
my $self = shift;
my $cs = shift;
my $s_sa = shift;
my $t_sa = shift;
unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) {
throw('You must provide a CoordSystem');
}
unless ($s_sa and $t_sa and $s_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')
and $t_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')) {
throw('You must provide a source and target SliceAdaptor');
}
my %sr_match;
my $equal = 0;
my $s_seq_regions = $s_sa->fetch_all($cs->name, $cs->version);
my $t_seq_regions = $t_sa->fetch_all($cs->name, $cs->version);
# sanity check to prevent divison by zero
my $s_count = scalar(@$s_seq_regions);
my $t_count = scalar(@$t_seq_regions);
return(0) if ($s_count == 0 or $t_count == 0);
foreach my $s_sr (@$s_seq_regions) {
$sr_match{$s_sr->seq_region_name} = $s_sr->length;
}
foreach my $t_sr (@$t_seq_regions) {
if (exists($sr_match{$t_sr->seq_region_name})) {
$equal++;
# return false if we have a region with same name but different length
return(0) unless ($sr_match{$t_sr->seq_region_name} == $t_sr->length);
}
}
if ($equal/$s_count > 0.5 and $equal/$t_count > 0.5) {
return(1);
} else {
$self->logger->info("Only $equal seq_regions identical for ".$cs->name." ".$cs->version."\n");
return(0);
}
}
sub check_db_connection {
my $self = shift;
my $dbtype = shift;
my $err = 0;
eval {
my $dba = $self->get_DBAdaptor($dbtype);
$dba->dbc->connect;
};
if ($@) {
$self->logger->warning("Can't connect to $dbtype db: $@\n");
$err++;
} else {
$self->logger->debug("Connection to $dbtype db ok.\n");
$self->{'_db_conn_ok'}->{$dbtype} = 1;
}
return $err;
}
sub check_db_read_permissions {
my $self = shift;
my $dbtype = shift;
# skip this check if db connection failed (this prevents re-throwing
# exceptions).
return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
my $err = 0;
my %privs = %{ $self->get_db_privs($dbtype) };
unless ($privs{'SELECT'} or $privs{'ALL PRIVILEGES'}) {
$self->logger->warning("User doesn't have read permission on $dbtype db.\n");
$err++;
} else {
$self->logger->debug("Read permission on $dbtype db ok.\n");
}
return $err;
}
sub check_db_write_permissions {
my $self = shift;
my $dbtype = shift;
# skip this check if db connection failed (this prevents re-throwing
# exceptions).
return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
my $err = 0;
unless ($self->do_upload) {
$self->logger->debug("No uploads, so write permission on $dbtype db not required.\n");
return $err;
}
my %privs = %{ $self->get_db_privs($dbtype) };
unless ($privs{'INSERT'} or $privs{'ALL PRIVILEGES'}) {
$self->logger->warning("User doesn't have write permission on $dbtype db.\n");
$err++;
} else {
$self->logger->debug("Write permission on $dbtype db ok.\n");
}
return $err;
}
sub do_upload {
my $self = shift;
if ($self->conf->param('dry_run') or
! ($self->conf->param('upload_events') or
$self->conf->param('upload_stable_ids') or
$self->conf->param('upload_archive'))) {
return 0;
} else {
return 1;
}
}
sub get_db_privs {
my $self = shift;
my $dbtype = shift;
my %privs = ();
my $r;
# get privileges from mysql db
eval {
my $dbc = $self->get_DBAdaptor($dbtype)->dbc;
my $sql = qq(SHOW GRANTS FOR ).$dbc->username;
my $sth = $dbc->prepare($sql);
$sth->execute;
($r) = $sth->fetchrow_array;
$sth->finish;
};
if ($@) {
$self->logger->warning("Error obtaining privileges from $dbtype db: $@\n");
return {};
}
# parse the output
$r =~ s/GRANT (.*) ON .*/$1/i;
foreach my $p (split(',', $r)) {
# trim leading and trailing whitespace
$p =~ s/^\s+//;
$p =~ s/\s+$//;
$privs{uc($p)} = 1;
}
return \%privs;
}
sub check_empty_tables {
my $self = shift;
my $dbtype = shift;
# skip this check if db connection failed (this prevents re-throwing
# exceptions).
return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
my $err = 0;
my $c = 0;
if ($self->conf->param('no_check_empty_tables') or !$self->do_upload) {
$self->logger->debug("Won't check for empty stable ID and archive tables in $dbtype db.\n");
return $err;
}
eval {
my @tables = qw(
gene_stable_id
transcript_stable_id
translation_stable_id
exon_stable_id
stable_id_event
mapping_session
gene_archive
peptide_archive
);
my $dba = $self->get_DBAdaptor($dbtype);
foreach my $table (@tables) {
if ($c = $self->fetch_value_from_db($dba, "SELECT COUNT(*) FROM $table")) {
$self->logger->warning("$table table in $dbtype db has $c entries.\n");
$err++;
}
}
};
if ($@) {
$self->logger->warning("Error retrieving stable ID and archive table row counts from $dbtype db: $@\n");
$err++;
} elsif (!$err) {
$self->logger->debug("All stable ID and archive tables in $dbtype db are empty.\n");
}
return $err;
}
sub check_sequence {
my $self = shift;
my $dbtype = shift;
# skip this check if db connection failed (this prevents re-throwing
# exceptions).
return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
my $err = 0;
my $c = 0;
eval {
my $dba = $self->get_DBAdaptor($dbtype);
unless ($c = $self->fetch_value_from_db($dba, "SELECT COUNT(*) FROM dna")) {
$err++;
}
};
if ($@) {
$self->logger->warning("Error retrieving dna table row count from $dbtype db: $@\n");
$err++;
} elsif ($err) {
$self->logger->warning("No sequence found in $dbtype db.\n");
} else {
$self->logger->debug(ucfirst($dbtype)." db has sequence ($c entries).\n");
}
return $err;
}
sub check_meta_entries {
my $self = shift;
my $dbtype = shift;
# skip this check if db connection failed (this prevents re-throwing
# exceptions).
return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
my $err = 0;
my $assembly_default;
my $schema_version;
eval {
my $dba = $self->get_DBAdaptor($dbtype);
$assembly_default = $self->fetch_value_from_db($dba,
qq(SELECT meta_value FROM meta WHERE meta_key = 'assembly.default'));
$schema_version = $self->fetch_value_from_db($dba,
qq(SELECT meta_value FROM meta WHERE meta_key = 'schema_version'));
};
if ($@) {
$self->logger->warning("Error retrieving dna table row count from $dbtype db: $@\n");
return ++$err;
}
unless ($assembly_default) {
$self->logger->warning("No meta.assembly_default value found in $dbtype db.\n");
$err++;
} else {
$self->logger->debug("meta.assembly_default value found ($assembly_default).\n");
}
unless ($schema_version) {
$self->logger->warning("No meta.schema.version value found in $dbtype db.\n");
$err++;
} else {
$self->logger->debug("meta.schema.version value found ($schema_version).\n");
}
return $err;
}
sub fetch_value_from_db {
my $self = shift;
my $dba = shift;
my $sql = shift;
unless ($dba and ref($dba) and $dba->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
throw("Need a Bio::EnsEMBL::DBSQL::DBAdaptor.");
}
unless ($sql) {
throw("Need an SQL statement to execute.\n");
}
my $sth = $dba->dbc->prepare($sql);
$sth->execute;
my ($c) = $sth->fetchrow_array;
return $c;
}
sub get_DBAdaptor {
my $self = shift;
my $prefix = shift;
unless ($self->{'_dba'}->{$prefix}) {
# connect to database
my $dba = new Bio::EnsEMBL::DBSQL::DBAdaptor(
-host => $self->conf->param("${prefix}host"),
-port => $self->conf->param("${prefix}port"),
-user => $self->conf->param("${prefix}user"),
-pass => $self->conf->param("${prefix}pass"),
-dbname => $self->conf->param("${prefix}dbname"),
-group => $prefix,
);
# explicitely set the dnadb to itself - by default the Registry assumes
# a group 'core' for this now
$dba->dnadb($dba);
$self->{'_dba'}->{$prefix} = $dba;
}
return $self->{'_dba'}->{$prefix};
}
sub cache_file_exists {
my $self = shift;
my $type = shift;
throw("You must provide a cache type.") unless $type;
my $cache_file = $self->cache_file($type);
if (-e $cache_file) {
$self->logger->info("Cache file found for $type.\n", 2);
$self->logger->debug("Will read from $cache_file.\n", 2);
return 1;
} else {
$self->logger->info("No cache file found for $type.\n", 2);
$self->logger->info("Will build cache from db.\n", 2);
return 0;
}
}
sub cache_file {
my $self = shift;
my $type = shift;
throw("You must provide a cache type.") unless $type;
return $self->dump_path."/$type.object_cache.ser";
}
sub instance_file {
my $self = shift;
return $self->dump_path."/cache_instance.ser";
}
sub dump_path {
my $self = shift;
$self->{'dump_path'} ||= path_append($self->conf->param('basedir'), 'cache');
return $self->{'dump_path'};
}
sub write_all_to_file {
my $self = shift;
my $type = shift;
throw("You must provide a cache type.") unless $type;
my $size = 0;
$size += $self->write_to_file($type);
$size += $self->write_instance_to_file;
return parse_bytes($size);
}
sub write_to_file {
my $self = shift;
my $type = shift;
throw("You must provide a cache type.") unless $type;
unless ($self->{'cache'}->{$type}) {
$self->logger->warning("No features found in $type. Won't write cache file.\n");
return;
}
my $cache_file = $self->cache_file($type);
eval { nstore($self->{'cache'}->{$type}, $cache_file) };
if ($@) {
throw("Unable to store $cache_file: $@\n");
}
my $size = -s $cache_file;
return $size;
}
sub write_instance_to_file {
my $self = shift;
my $instance_file = $self->instance_file;
eval { nstore($self->{'instance'}, $instance_file) };
if ($@) {
throw("Unable to store $instance_file: $@\n");
}
my $size = -s $instance_file;
return $size;
}
sub read_from_file {
my $self = shift;
my $type = shift;
throw("You must provide a cache type.") unless $type;
my $cache_file = $self->cache_file($type);
if (-s $cache_file) {
#$self->logger->info("Reading cache from file...\n", 0, 'stamped');
#$self->logger->info("Cache file $cache_file.\n", 1);
eval { $self->{'cache'}->{$type} = retrieve($cache_file); };
if ($@) {
throw("Unable to retrieve cache: $@");
}
#$self->logger->info("Done.\n", 0, 'stamped');
} else {
$self->logger->warning("Cache file $cache_file not found or empty.\n");
}
return $self->{'cache'}->{$type};
}
sub read_and_merge {
my $self = shift;
my $dbtype = shift;
unless ($dbtype eq 'source' or $dbtype eq 'target') {
throw("Db type must be 'source' or 'target'.");
}
# read cache from single or multiple files, depending on caching strategy
my $cache_method = $self->cache_method;
if ($cache_method eq 'ALL') {
$self->read_from_file("$dbtype.ALL");
} elsif ($cache_method eq 'BY_SEQ_REGION') {
foreach my $slice_name (@{ $self->slice_names($dbtype) }) {
$self->read_from_file("$dbtype.$slice_name");
}
} else {
throw("Unknown cache method: $cache_method.");
}
$self->merge($dbtype);
# flag as being loaded
$self->{'instance'}->{'loaded'}->{$dbtype} = 1;
}
sub merge {
my $self = shift;
my $dbtype = shift;
unless ($dbtype eq 'source' or $dbtype eq 'target') {
throw("Db type must be 'source' or 'target'.");
}
foreach my $type (keys %{ $self->{'cache'} || {} }) {
next unless ($type =~ /^$dbtype/);
foreach my $name (keys %{ $self->{'cache'}->{$type} || {} }) {
foreach my $key (keys %{ $self->{'cache'}->{$type}->{$name} || {} }) {
if (defined $self->{'cache'}->{$dbtype}->{$name}->{$key}) {
# warning("Duplicate key in cache: $name|$dbtype|$key. Skipping.\n");
} else {
$self->{'cache'}->{$dbtype}->{$name}->{$key} =
$self->{'cache'}->{$type}->{$name}->{$key};
}
delete $self->{'cache'}->{$type}->{$name}->{$key};
}
delete $self->{'cache'}->{$type}->{$name};
}
delete $self->{'cache'}->{$type};
}
}
sub read_instance_from_file {
my $self = shift;
my $instance_file = $self->instance_file;
unless (-s $instance_file) {
throw("No valid cache instance file found at $instance_file.");
}
eval { $self->{'instance'} = retrieve($instance_file); };
if ($@) {
throw("Unable to retrieve cache instance: $@");
}
return $self->{'instance'};
}
sub slice_names {
my $self = shift;
my $dbtype = shift;
throw("You must provide a db type (source|target).") unless $dbtype;
my $dba = $self->get_DBAdaptor($dbtype);
my $sa = $dba->get_SliceAdaptor;
my @slice_names = ();
if ($self->conf->param('chromosomes')) {
# filter by chromosome
foreach my $chr ($self->conf->param('chromosomes')) {
my $slice = $sa->fetch_by_region('chromosome', $chr);
push @slice_names, $slice->name;
}
} elsif ($self->conf->param('region')) {
# filter by region (specific slice)
# don't use SliceAdaptor->fetch_by_name() since this will fail if assembly
# versions are different for source and target db
my ($cs, $version, $name, $start, $end, $strand) =
split(/:/, $self->conf->param('region'));
my $slice = $sa->fetch_by_region($cs, $name, $start, $end);
push @slice_names, $slice->name;
} else {
# fetch all genes, but do in junks to save memory
my $ga = $dba->get_GeneAdaptor;
foreach my $srid (@{ $ga->list_seq_region_ids }) {
my $slice = $sa->fetch_by_seq_region_id($srid);
push @slice_names, $slice->name;
}
}
return \@slice_names;
}
=head2 logger
Arg[1] :
Example :
Description :
Return type :
Exceptions :
Caller :
Status :
=cut
sub logger {
my $self = shift;
$self->{'logger'} = shift if (@_);
return $self->{'logger'};
}
=head2 conf
Arg[1] :
Example :
Description :
Return type :
Exceptions :
Caller :
Status :
=cut
sub conf {
my $self = shift;
$self->{'conf'} = shift if (@_);
return $self->{'conf'};
}
sub cache_method {
my $self = shift;
$self->{'instance'}->{'cache_method'} = shift if (@_);
return $self->{'instance'}->{'cache_method'};
}
sub highest_common_cs {
my $self = shift;
$self->{'instance'}->{'hccs'} = shift if (@_);
return $self->{'instance'}->{'hccs'};
}
sub highest_common_cs_version {
my $self = shift;
$self->{'instance'}->{'hccsv'} = shift if (@_);
return $self->{'instance'}->{'hccsv'};
}
sub add_common_cs {
my $self = shift;
my $cs = shift;
unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) {
throw('You must provide a CoordSystem');
}
my $csid = join(':', $cs->name, $cs->version);
$self->{'instance'}->{'ccs'}->{$csid} = 1;
}
sub is_common_cs {
my $self = shift;
my $csid = shift;
return $self->{'instance'}->{'ccs'}->{$csid};
}
1;