# Raw content of Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor
#
# EnsEMBL module for Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor
#
#
=head1 NAME
Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(...);
my $csa = $db->get_CoordSystemAdaptor();
#
# Fetch by name, schema_build and version(opt).
#
$cs = $csa->fetch_by_name_schema_build_version('chromosome', '39_36a', 'NCBI36');
#As this is a multi-assembly DB, we have to accommodate the idea of schema versions, which will
#enable a mapping from the feature table back to the assembly/core DB of origin.
#Some of the old core methods may not work, as they assume that there will only be one default version,
#whereas there may be multiple default versions, one for each assembly/schema_build
#
# Get all coord systems in the database:
#
foreach my $cs (@{$csa->fetch_all()}) {
print $cs->name, ' ', $cs->version, "\n";
}
#
# Fetching by name:
#
#use the default version of coord_system 'chromosome' (e.g. NCBI33):
$cs = $csa->fetch_by_name('chromosome');
#get an explicit version of coord_system 'chromosome':
$cs = $csa->fetch_by_name('chromosome', 'NCBI34');
#get all coord_systems of name 'chromosome':
foreach $cs (@{$csa->fetch_all_by_name('chromosome')}) {
print $cs->name, ' ', $cs->version, "\n";
}
#
# Fetching by rank:
#
$cs = $csa->fetch_by_rank(2);
#
# Fetching the pseudo coord system 'toplevel'
#
#Get the default top_level coord system:
$cs = $csa->fetch_top_level();
#can also use an alias in fetch_by_name:
$cs = $csa->fetch_by_name('toplevel');
#can also request toplevel using rank=0
$cs = $csa->fetch_by_rank(0);
#
# Fetching by sequence level:
#
#Get the coord system which is used to store sequence:
$cs = $csa->fetch_sequence_level();
#can also use an alias in fetch_by_name:
$cs = $csa->fetch_by_name('seqlevel');
#
# Fetching by id
#
$cs = $csa->fetch_by_dbID(1);
=head1 DESCRIPTION
The Funcgen CoordSystemAdaptor works slightly differently from the core version. As
the Funcgen DB stores features mapped to multiple core/dna DBs the schema and
data versions(i.e. the last bit of the DB name) have to be stored. This maintains
a link between the seq_region_id stored in the Funcgen DB and the seq_region and assembly
tables stored in the core DB on which the features were originally built.
Default versions or ranking has not yet been tested.
This adaptor allows the querying of information from the coordinate system
adaptor.
Note that many coordinate systems do not have a concept of a version
for the entire coordinate system (though they may have a per-sequence version).
The 'chromosome' coordinate system usually has a version (i.e. the
assembly version) but the clonal coordinate system does not (despite having
individual sequence versions). In the case where a coordinate system does
not have a version an empty string ('') is used instead.
=head1 AUTHOR
This module was written by Nathan Johnson, based on the core CoordSystemAdaptor
written by Graham McVicker.
=head1 CONTACT
Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk
=head1 METHODS
=cut
use strict;
use warnings;
package Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor;
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Funcgen::CoordSystem;
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
=head2 new

  Arg [1]    : See BaseAdaptor for arguments (none specific to this
               subclass)
  Example    : $cs = $db->get_CoordSystemAdaptor(); #better than new()
  Description: Creates a new CoordSystem adaptor and loads the coord_system
               table into memory. Rows sharing a coord_system_id are folded
               into one Funcgen CoordSystem object; each row contributes the
               core information (rank, attributes, core coord_system_id) for
               one schema_build. Finished objects are cached by dbID and by
               lower-cased name.
  Returntype : Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor
  Exceptions : none
  Caller     :
  Status     : At risk

=cut

sub new {
  my $caller = shift;
  my $class  = ref($caller) || $caller;
  my $self   = $class->SUPER::new(@_);

  # Caches:
  #   _name_cache         lower-cased name => listref of CoordSystems
  #   _dbID_cache         coord_system_id  => CoordSystem
  #   _is_sequence_level  coord_system_id  => 1
  #   _is_default_version coord_system_id  => 1
  # No rank cache is built; rank is only meaningful per schema_build.
  $self->{$_} = {}
    for qw(_name_cache _dbID_cache _is_sequence_level _is_default_version);

  # Restrict to the current species for multi-species DBs. Ordering by
  # coord_system_id keeps all rows of one CoordSystem adjacent, which the
  # folding loop below relies on.
  my @bind;
  my $query = 'SELECT coord_system_id, name, rank, version, attrib, schema_build, core_coord_system_id FROM coord_system';

  if ($self->is_multispecies()) {
    $query .= ' where species_id =?';
    @bind = ($self->species_id());
  }

  $query .= ' order by coord_system_id';

  my $sth = $self->prepare($query);
  $sth->execute(@bind);

  my ($row_id, $cs_name, $rank, $cs_version, $attrib_set, $schema_build, $core_cs_id);
  $sth->bind_columns(\$row_id, \$cs_name, \$rank, \$cs_version,
                     \$attrib_set, \$schema_build, \$core_cs_id);

  # Registers a fully populated CoordSystem in the dbID and name caches.
  my $remember = sub {
    my ($done) = @_;
    $self->{'_dbID_cache'}->{$done->dbID()} = $done;
    push @{ $self->{'_name_cache'}->{lc($done->name())} }, $done;
    return;
  };

  my $current;    # CoordSystem being assembled from consecutive rows

  while ($sth->fetch()) {
    # Per-row attribute flags, passed on as this schema_build's core info.
    my %flag = (sequence_level => 0, default_version => 0);

    if ($attrib_set) {
      foreach my $attrib (split(',', $attrib_set)) {
        $self->{"_is_$attrib"}->{$row_id} = 1;
        $flag{$attrib} = 1 if exists $flag{$attrib};
      }
    }

    # A change of coord_system_id means the previous object is complete.
    if (!$current || $row_id != $current->dbID()) {
      $remember->($current) if $current;

      $current = Bio::EnsEMBL::Funcgen::CoordSystem->new(
        -DBID    => $row_id,
        -ADAPTOR => $self,
        -NAME    => $cs_name,
        -VERSION => $cs_version,
      );
    }

    $current->add_core_coord_system_info(
      -RANK                 => $rank,
      -SEQUENCE_LEVEL       => $flag{'sequence_level'},
      -DEFAULT              => $flag{'default_version'},
      -SCHEMA_BUILD         => $schema_build,
      -CORE_COORD_SYSTEM_ID => $core_cs_id,
      -IS_STORED            => 1,
    );
  }

  # The last CoordSystem is never followed by an id change, so cache it here.
  $remember->($current) if $current;

  $sth->finish();

  # Mapping paths and the 'toplevel' pseudo coord system are not set up
  # here; neither is implemented across multiple DBs yet.
  return $self;
}
=head2 fetch_by_name

  Arg [1]    : string $name
               The name of the coordinate system to retrieve (matched
               case-insensitively). Alternatively this may be one of the
               aliases 'toplevel' or 'seqlevel'.
  Arg [2]    : optional - string $version
               The version of the coordinate system to retrieve (matched
               case-insensitively). If not specified, the version of the
               dnadb 'chromosome' coordinate system is used.
  Example    : $coord_sys = $csa->fetch_by_name('chromosome', 'NCBI36');

               # toplevel is a pseudo coord system representing the highest
               # coord system in a given region
               $coord_sys = $csa->fetch_by_name('toplevel');

               # seqlevel is an alias for the sequence level coordinate system
               $coord_sys = $csa->fetch_by_name('seqlevel');
  Description: Retrieves a coordinate system by its name and, optionally,
               version. Returns undef (with a warning) when nothing matches.
  Returntype : Bio::EnsEMBL::Funcgen::CoordSystem or undef
  Exceptions : throw if no name argument provided
               warning if no matching coordinate system is found
  Caller     : general
  Status     : At risk

=cut

# TODO: the aliases and the default version are resolved against the schema
# build / assembly of the current dnadb. Callers that work across several
# schema builds have to set the dnadb appropriately before calling this.

sub fetch_by_name {
  my $self    = shift;
  my $name    = shift;
  my $version = shift;

  # Validate before touching the DB so a bad call fails fast and cleanly.
  throw('Mandatory argument \'name\'') if(! $name);

  $name = lc($name);
  # The version is optional; avoid lc(undef), which warns under 'use warnings'.
  $version = defined($version) ? lc($version) : '';

  my $sbuild = $self->db->_get_schema_build($self->db->dnadb());

  warn "Using dnadb(".$sbuild.") to acquire $name" if($name =~ /level/);

  # NOTE(review): the two *_by_schema_build methods are not defined in the
  # visible part of this file - confirm they exist before relying on aliases.
  if($name eq 'seqlevel') {
    return $self->fetch_sequence_level_by_schema_build($sbuild);
  } elsif($name eq 'toplevel') {
    return $self->fetch_top_level_by_schema_build($sbuild);
  }

  if(! exists($self->{'_name_cache'}->{$name})) {
    if($name =~ /top/) {
      warn("Did you mean 'toplevel' coord system instead of '$name'?");
    } elsif($name =~ /seq/) {
      warn("Did you mean 'seqlevel' coord system instead of '$name'?");
    }
    return undef;
  }

  # The dnadb assembly is only needed to pick a default, so look it up only
  # when no version was requested. It stays undef (matches nothing) when the
  # dnadb has no 'chromosome' coord system, instead of dying on a method call
  # on an undefined value.
  # NOTE(review): assumes $self->db->get_CoordSystemAdaptor returns the
  # core/dnadb adaptor rather than this adaptor - confirm in DBAdaptor.
  my $assembly;
  if(! $version) {
    my $core_cs = $self->db->get_CoordSystemAdaptor->fetch_by_name('chromosome');
    if($core_cs) {
      $assembly = $core_cs->version();
      $assembly = '' if(! defined($assembly));
    }
  }

  my $found_cs;

  foreach my $cs (@{$self->{'_name_cache'}->{$name}}) {
    my $cs_version = $cs->version();
    $cs_version = '' if(! defined($cs_version));

    if($version) {
      # Explicit version requested: first case-insensitive match wins.
      next if(lc($cs_version) ne $version);
    } else {
      # No version requested: take the one on the current dnadb assembly.
      next if(! defined($assembly) || $cs_version ne $assembly);
    }

    $found_cs = $cs;
    last;
  }

  if(! $found_cs) {
    if($version) {
      warn "No coord system found for $sbuild version '$version'";
    } else {
      warn "Could not find default CoordSystem for '$sbuild', use next ranking?";
    }
    return undef;
  }

  return $found_cs;
}
=head2 fetch_all

  Arg [1]    : none
  Example    : foreach my $cs (@{$csa->fetch_all()}) {
                 print $cs->name(), ' ', $cs->version(), "\n";
               }
  Description: Intended to retrieve every coordinate system defined in the DB
               in ascending order of rank. Currently NOT implemented: the
               rank cache it would read from is not built by this adaptor,
               so the method always throws.
  Returntype : listref of Bio::EnsEMBL::Funcgen::CoordSystems (once implemented)
  Exceptions : always throws - rank cache not implemented
  Caller     : general
  Status     : at risk

=cut

sub fetch_all {
  my ($self) = @_;

  throw('Not implement rank cache');

  # Unreachable until a rank cache exists: collect the cached coordinate
  # systems in ascending rank order.
  my $by_rank      = $self->{'_rank_cache'};
  my @ranked_ranks = sort { $a <=> $b } keys %{$by_rank};
  my @coord_systems = map { $by_rank->{$_} } @ranked_ranks;

  return \@coord_systems;
}
=head2 fetch_by_rank

  Arg [1]    : int $rank
  Example    : my $cs = $coord_sys_adaptor->fetch_by_rank(1);
  Description: Intended to retrieve a CoordSystem via its rank, with 0
               reserved for the pseudo coordinate system 'toplevel'.
               Currently NOT implemented: this adaptor does not build a rank
               cache, so the method always throws.
  Returntype : Bio::EnsEMBL::Funcgen::CoordSystem (once implemented)
  Exceptions : always throws - rank cache not implemented
  Caller     : general
  Status     : At risk

=cut

sub fetch_by_rank {
  my $self = shift;
  my $rank = shift;

  # Was 'thrw(...)', which died with "Undefined subroutine" instead of
  # raising the intended exception.
  throw('not implemented rank cache yet');

  # Unreachable until a rank cache exists; kept as the intended behaviour.
  throw("Rank argument must be defined.") if(!defined($rank));
  throw("Rank argument must be a non-negative integer.") if($rank !~ /^\d+$/);

  if($rank == 0) {
    return $self->fetch_top_level();
  }

  return $self->{'_rank_cache'}->{$rank};
}
=head2 fetch_all_by_name

  Arg [1]    : string $name
               The name of the coordinate system to retrieve, matched
               case-insensitively. The aliases 'toplevel' and 'seqlevel' are
               delegated to fetch_top_level and fetch_sequence_level.
  Example    : foreach my $cs (@{$csa->fetch_all_by_name('chromosome')}){
                 print $cs->name(), ' ', $cs->version();
               }
  Description: Retrieves all cached coordinate systems of a particular name.
               An empty list is returned for an unknown name.
  Returntype : listref of Bio::EnsEMBL::Funcgen::CoordSystem objects
  Exceptions : throw if no name argument provided
  Caller     : general
  Status     : Medium

=cut

sub fetch_all_by_name {
  my $self = shift;
  my $name = lc(shift);    # cache keys are lower case

  if (!$name) {
    throw('Name argument is required');
  }

  # Aliases resolve to a single coordinate system each.
  return [$self->fetch_sequence_level()] if $name eq 'seqlevel';
  return [$self->fetch_top_level()]      if $name eq 'toplevel';

  my $cached = $self->{'_name_cache'}->{$name};
  return $cached ? $cached : [];
}
=head2 fetch_by_dbID

  Arg [1]    : int dbID
  Example    : $cs = $csa->fetch_by_dbID(4);
  Description: Retrieves a coord_system via its internal identifier from the
               in-memory cache, or undef if no coordinate system with the
               provided id is cached.
  Returntype : Bio::EnsEMBL::Funcgen::CoordSystem or undef
  Exceptions : throw if no dbID argument is provided
  Caller     : general
  Status     : Stable

=cut

sub fetch_by_dbID {
  my ($self, $dbID) = @_;

  if (!$dbID) {
    throw('dbID argument is required');
  }

  my $cached = $self->{'_dbID_cache'}->{$dbID};
  return $cached ? $cached : undef;
}
=head2 fetch_top_level

  Arg [1]    : none
  Example    : $cs = $csa->fetch_top_level();
  Description: Intended to retrieve the toplevel pseudo coordinate system.
               Currently NOT implemented for multiple schema builds, so the
               method always throws.
  Returntype : a Bio::EnsEMBL::Funcgen::CoordSystem object (once implemented)
  Exceptions : always throws - not yet implemented with schema_build
  Caller     : general
  Status     : At risk

=cut

sub fetch_top_level {
  my ($self) = @_;

  throw("Not yet implemented with schema_build");

  # Unreachable until toplevel is supported across schema builds.
  my $top_level = $self->{'_top_level'};
  return $top_level;
}
=head2 fetch_sequence_level

  Arg [1]    : none
  Example    : $cs = $csa->fetch_sequence_level();
  Description: Intended to retrieve the coordinate system at which sequence
               is stored. Currently NOT implemented for multiple schema
               builds, so the method always throws.
  Returntype : Bio::EnsEMBL::Funcgen::CoordSystem (once implemented)
  Exceptions : always throws - not yet implemented with schema_build
  Caller     : general
  Status     : At risk

=cut

sub fetch_sequence_level {
  my ($self) = @_;

  throw("Not yet implemented with schema_build");

  # Unreachable until sequence level is supported across schema builds:
  # exactly one sequence_level coord system is expected.
  my @seq_level_ids = keys %{ $self->{'_is_sequence_level'} };

  if (!@seq_level_ids) {
    throw('No sequence_level coord_system is defined');
  }

  throw('Multiple sequence_level coord_systems are defined.' .
        'Only one is currently supported')
    if @seq_level_ids > 1;

  my ($only_id) = @seq_level_ids;
  return $self->{'_dbID_cache'}->{$only_id};
}
=head2 get_mapping_path
Arg [1] : Bio::EnsEMBL::CoordSystem $cs1
Arg [2] : Bio::EnsEMBL::CoordSystem $cs2
Example : foreach my $cs (@{$csa->get_mapping_path($cs1,$cs2)}) { ... }
Description: Given two coordinate systems this will return a mapping path
between them if one has been defined. Allowed Mapping paths are
explicitly defined in the meta table. The following is an
example:
mysql> select * from meta where meta_key = 'assembly.mapping';
+---------+------------------+--------------------------------------+
| meta_id | meta_key | meta_value |
+---------+------------------+--------------------------------------+
| 20 | assembly.mapping | chromosome:NCBI34|contig |
| 21 | assembly.mapping | clone|contig |
| 22 | assembly.mapping | supercontig|contig |
| 23 | assembly.mapping | chromosome:NCBI34|contig|clone |
| 24 | assembly.mapping | chromosome:NCBI34|contig|supercontig |
| 25 | assembly.mapping | supercontig|contig|clone |
+---------+------------------+--------------------------------------+
For a one-step mapping path to be valid there needs to be
a relationship between the two coordinate systems defined in
the assembly table. Two step mapping paths work by building
on the one-step mapping paths which are already defined.
The first coordinate system in a one step mapping path must
be the assembled coordinate system and the second must be
the component.
Example of use:
my $cs1 = $cs_adaptor->fetch_by_name('contig');
my $cs2 = $cs_adaptor->fetch_by_name('chromosome');
my @path = @{$cs_adaptor->get_mapping_path($cs1,$cs2)};
if(!@path) {
print "No mapping path.";
}
elsif(@path == 2) {
print "2 step mapping path.";
print "Assembled = " . $path[0]->name() . "\n";
print "Component = " . $path[1]->name() . "\n";
} else {
print "Multi step mapping path\n";
}
Returntype : reference to a list of Bio::EnsEMBL::CoordSystem objects
Exceptions : none
Caller : general
Status : At risk
=cut
#Need to be redirected to the core/dnadb of interest
# Returns a listref of CoordSystems describing how to map between $cs1 and
# $cs2, or an empty listref if no path is known.
#
# Lookup order:
#   1. an explicit cached path keyed "name1:version1|name2:version2"
#   2. the same with the two coordinate systems swapped
#   3. an implicit three-element path [cs, mid, other cs], guessed from two
#      cached two-element paths that share a middle coordinate system; a
#      guessed path is cached and a warning is issued.
#
# NOTE(review): in the visible part of this file the code in new() that fills
# '_mapping_paths' from the meta table is commented out, so the cache appears
# to start empty - confirm whether anything else populates it.
sub get_mapping_path {
my $self = shift;
my $cs1 = shift;
my $cs2 = shift;
# Both arguments must be CoordSystem objects (core or a subclass).
if(!ref($cs1) || !ref($cs2) ||
!$cs1->isa('Bio::EnsEMBL::CoordSystem') ||
!$cs2->isa('Bio::EnsEMBL::CoordSystem')) {
throw('Two Bio::EnsEMBL::CoordSystem arguments expected.');
}
# Cache keys are "name:version"; a version-less system yields a trailing ':'.
my $key1 = $cs1->name() . ":" . $cs1->version();
my $key2 = $cs2->name() . ":" . $cs2->version();
# Try the explicit path in the requested orientation, then reversed.
my $path = $self->{'_mapping_paths'}->{"$key1|$key2"};
return $path if($path);
$path = $self->{'_mapping_paths'}->{"$key2|$key1"};
if(!$path) {
# No path was explicitly defined, but we might be able to guess a
# suitable path. We only guess for missing 2 step paths.
# %mid1 collects partners of $cs1 seen so far, %mid2 partners of $cs2,
# both keyed by "name:version".
my %mid1;
my %mid2;
foreach my $path (values(%{$self->{'_mapping_paths'}})) {
# Only two-element paths are used as building blocks.
next if(@$path != 2);
# $match is the index of the element that is NOT $cs1, i.e. its partner.
my $match = undef;
if($path->[0]->equals($cs1)) {
$match = 1;
} elsif($path->[1]->equals($cs1)) {
$match = 0;
}
if(defined($match)) {
my $mid = $path->[$match];
my $midkey = $mid->name() . ':' . $mid->version();
# is the same cs mapped to by other cs?
if($mid2{$midkey}) {
# $mid is also a partner of $cs2: cs1 -> mid -> cs2 is a usable path.
my $path = [$cs1,$mid,$cs2];
$self->{'_mapping_paths'}->{"$key1|$key2"} = $path;
# Strip the trailing ':' of version-less keys, for the message only.
$key1 =~ s/\:$//;
$key2 =~ s/\:$//;
$midkey =~ s/\:$//;
warning("Using implicit mapping path between '$key1' and '$key2' " .
"coord systems.\n" .
"An explicit 'assembly.mapping' entry should be added " .
"to the meta table.\nExample: " .
"'$key1|$midkey|$key2'\n");
return $path;
} else {
$mid1{$midkey} = $mid;
}
}
# Same check from the $cs2 side, against the partners of $cs1.
$match = undef;
if($path->[0]->equals($cs2)) {
$match = 1;
} elsif($path->[1]->equals($cs2)) {
$match = 0;
}
if(defined($match)) {
my $mid = $path->[$match];
my $midkey = $mid->name() . ':' . $mid->version();
# is the same cs mapped to by other cs?
if($mid1{$midkey}) {
# Found via $cs2 first, so the path is stored in cs2 -> mid -> cs1 order.
my $path = [$cs2,$mid,$cs1];
$self->{'_mapping_paths'}->{"$key2|$key1"} = $path;
$key1 =~ s/\:$//;
$key2 =~ s/\:$//;
$midkey =~ s/\:$//;
warning("Using implicit mapping path between '$key1' and '$key2' " .
"coord systems.\n" .
"An explicit 'assembly.mapping' entry should be added " .
"to the meta table.\nExample: " .
"'$key1|$midkey|$key2'\n");
return $path;
} else {
$mid2{$midkey} = $mid;
}
}
}
}
# Either the reversed explicit path, or nothing at all.
return $path || [];
}
=head2 _fetch_by_attrib

  Arg [1]    : string $attrib
               Attribute name as stored in coord_system.attrib, e.g.
               'sequence_level' or 'default_version'.
  Arg [2]    : optional - string $version
               Version to match, compared case-insensitively.
  Example    : my $cs = $self->_fetch_by_attrib('sequence_level', 'NCBI36');
  Description: Returns a cached coordinate system carrying the attribute.
               With a version, the coordinate system of that version is
               returned. Without one, the default version is returned; if no
               default exists, the one with the lowest dbID is returned and a
               warning is issued.
  Returntype : Bio::EnsEMBL::Funcgen::CoordSystem
  Exceptions : throw if no coordinate system has the attribute
               throw if a version was given but does not exist
  Caller     : internal
  Status     : At risk

=cut

sub _fetch_by_attrib {
  my $self    = shift;
  my $attrib  = shift;
  my $version = shift;

  $version = lc($version) if($version);

  # Sorted so that both the match order and the fallback are deterministic
  # rather than dependent on hash ordering.
  my @dbIDs = sort { $a <=> $b } keys %{$self->{"_is_$attrib"}};

  throw("No $attrib coordinate system defined") if(!@dbIDs);

  foreach my $dbID (@dbIDs) {
    my $cs = $self->{'_dbID_cache'}->{$dbID};

    if($version) {
      # Lower-case both sides: comparing the lower-cased request against the
      # raw stored version could never match a mixed-case version.
      my $cs_version = $cs->version();
      return $cs if(defined($cs_version) && lc($cs_version) eq $version);
    } elsif($self->{'_is_default_version'}->{$dbID}) {
      return $cs;
    }
  }

  #specifically requested attrib system was not found
  if($version) {
    throw("$attrib coord_system with version [$version] does not exist");
  }

  #coordsystem with attrib exists but no default is defined:
  my $dbID = shift @dbIDs;
  my $cs   = $self->{'_dbID_cache'}->{$dbID};
  my $v    = $cs->version();

  warning("No default version for $attrib coord_system exists. " .
          "Using version [$v] arbitrarily");

  return $cs;
}
=head2 _fetch_all_by_attrib

  Arg [1]    : string $attrib
               Attribute name as stored in coord_system.attrib, e.g.
               'sequence_level' or 'default_version'.
  Example    : my @cs = @{$self->_fetch_all_by_attrib('default_version')};
  Description: Returns every cached coordinate system flagged with the given
               attribute, in no particular order.
  Returntype : listref of Bio::EnsEMBL::Funcgen::CoordSystem objects
  Exceptions : none
  Caller     : internal
  Status     : At risk

=cut

sub _fetch_all_by_attrib {
  my ($self, $attrib) = @_;

  # "_is_<attrib>" maps coord_system_id => 1 for systems carrying the flag.
  my @flagged = map { $self->{"_dbID_cache"}->{$_} }
                keys %{ $self->{"_is_$attrib"} };

  return \@flagged;
}
=head2 store
Arg [1] : Bio::EnsEMBL::Funcgen::CoordSystem
Example : $csa->store($coord_system);
Description: Stores a CoordSystem object in the database.
Returntype : Bio::EnsEMBL::Funcgen::CoordSystem (the stored object)
Exceptions : Warning if CoordSystem is already stored in this database.
Caller : none
Status : At risk
=cut
# Stores a Funcgen CoordSystem: one coord_system row is inserted for every
# schema_build in the object's 'core_cache' that is not yet flagged
# IS_STORED. The first insert for an object without a dbID obtains the
# coord_system_id from the DB; later rows reuse that id. Afterwards the
# adaptor's dbID and name caches are updated and the CoordSystem is returned.
#
# Throws if the argument is not a Funcgen CoordSystem, if the name is empty
# or one of the reserved aliases, if a rank is not a non-zero integer, or if
# no dbID comes back from the insert.
#
# NOTE(review): SQL_VARCHAR/SQL_INTEGER are not imported in the visible part
# of this file; presumably they are exported by BaseAdaptor - confirm.
sub store {
my $self = shift;
my $cs = shift;
if(!$cs || !ref($cs) || !$cs->isa('Bio::EnsEMBL::Funcgen::CoordSystem')) {
throw('CoordSystem argument expected.');
}
my $sth;
my $db = $self->db();
my $name = $cs->name();
my $version = $cs->version();
# 'toplevel' and 'seqlevel' are aliases, never real rows.
if($name eq 'toplevel' || $name eq 'seqlevel' || !$name) {
throw("[$name] is not a valid name for a storable CoordSystem.");
}
# One coord_system row per schema_build held by this CoordSystem.
foreach my $sbuild(keys %{$cs->{'core_cache'}}){
my $rank = $cs->{'core_cache'}->{$sbuild}->{'RANK'};
my $seqlevel = $cs->{'core_cache'}->{$sbuild}->{'SEQUENCE_LEVEL'};
my $default = $cs->{'core_cache'}->{$sbuild}->{'DEFAULT'};
my $ccs_id = $cs->{'core_cache'}->{$sbuild}->{'CORE_COORD_SYSTEM_ID'};
#
# Do lots of sanity checking to prevent bad data from being entered
#
# Skip schema builds whose row is already flagged as stored.
if($cs->{'core_cache'}->{$sbuild}->{'IS_STORED'}) {
#Doesn't this only check on dbID?
next;
}
#if($seqlevel && keys(%{$self->{'_is_sequence_level'}})) {
# throw("There can only be one sequence level CoordSystem.");
#}
#if(exists $self->{'_name_cache'}->{lc($name)}) {
# my @coord_systems = @{$self->{'_name_cache'}->{lc($name)}};
# foreach my $c (@coord_systems) {
# if(lc($c->version()) eq lc($version)) {
# warning("CoordSystem $name $version is already in db.\n");
# return;
# }
# if($default && $self->{'_is_default_version'}->{$c->dbID()}) {
# throw("There can only be one default version of CoordSystem $name");
# }
# }
#}
# Rank must consist of digits only and must not be 0.
if($rank !~ /^\d+$/) {
throw("Rank attribute must be a positive integer not [$rank]");
}
if($rank == 0) {
throw("Only toplevel CoordSystem may have rank of 0.");
}
#if(defined($self->{'_rank_cache'}->{$rank})) {
# throw("CoordSystem with rank [$rank] already exists.");
#}
# Build the comma separated attrib value; undef when no flag is set.
my @attrib;
push @attrib, 'default_version' if($default);
push @attrib, 'sequence_level' if($seqlevel);
my $attrib_str = (@attrib) ? join(',', @attrib) : undef;
#
# store the coordinate system in the database
#
if(! $cs->dbID()){
# First row for this CoordSystem: insert without an id and read the new
# id back from the statement handle.
$sth = $self->prepare('insert into coord_system (name, version, attrib, rank, schema_build, core_coord_system_id, species_id) values (?,?,?,?,?,?,?)');
$sth->bind_param(1, $name, SQL_VARCHAR);
$sth->bind_param(2, $version, SQL_VARCHAR);
$sth->bind_param(3, $attrib_str, SQL_VARCHAR);
$sth->bind_param(4, $rank, SQL_INTEGER);
$sth->bind_param(5, $sbuild, SQL_VARCHAR);
$sth->bind_param(6, $ccs_id, SQL_INTEGER);
$sth->bind_param(7, $self->species_id(), SQL_INTEGER);
$sth->execute();
# 'mysql_insertid' is a MySQL driver attribute.
my $dbID = $sth->{'mysql_insertid'};
$sth->finish();
if(!$dbID) {
throw("Did not get dbID from store of CoordSystem.");
}
$cs->dbID($dbID);
$cs->adaptor($self);
}else{
# The CoordSystem already has an id: add a row for this schema_build
# under the same coord_system_id.
#can we prep this out of the loop
#we don't know until we're in it
my $sql = 'insert into coord_system (coord_system_id, name, version, attrib, rank, schema_build, core_coord_system_id, species_id) values (?,?,?,?,?,?,?,?)';
$sth = $db->dbc->prepare($sql);
$sth->bind_param(1, $cs->dbID(), SQL_INTEGER);
$sth->bind_param(2, $name, SQL_VARCHAR);
$sth->bind_param(3, $version, SQL_VARCHAR);
$sth->bind_param(4, $attrib_str, SQL_VARCHAR);
$sth->bind_param(5, $rank, SQL_INTEGER);
$sth->bind_param(6, $sbuild, SQL_VARCHAR);
$sth->bind_param(7, $ccs_id, SQL_INTEGER);
$sth->bind_param(8, $self->species_id(), SQL_INTEGER);
$sth->execute();
$sth->finish();
}
# Flag this schema_build so a later store() call skips it.
$cs->{'core_cache'}{$sbuild}{'IS_STORED'} = 1;
}
#
# update the internal caches that are used for fetching
#
#$self->{'_is_default_version'}->{$dbID} = 1 if($default);
#$self->{'_is_sequence_level'}->{$dbID} = 1 if($seqlevel);
$self->{'_name_cache'}->{lc($name)} ||= [];
#$self->{'_rank_cache'}->{$rank} ||= [];
$self->{'_dbID_cache'}->{$cs->dbID()} = $cs;
#this will duplicate CS in cache if we add a core cs and then store
#same with rank cache, need to replace
# Replace a cached entry of the same version in place ($name_cs aliases the
# array element inside foreach); otherwise append the CoordSystem.
my $push = 1;
foreach my $name_cs(@{$self->{'_name_cache'}->{lc($name)}}){
if($name_cs->version() eq $cs->version()){
$push = 0;
$name_cs = $cs;
}
}
push @{$self->{'_name_cache'}->{lc($name)}}, $cs if $push;
#$push = 1;
#this could result in mixed rank cs in the same rank cache
#push @{$self->{'_rank_cache'}->{$rank}}, $cs;
#need to rethink rank cache? make it schema_rank cache
return $cs;
}
=head2 validate_and_store_coord_system

  Arg [1]    : Bio::EnsEMBL::CoordSystem (could also be Funcgen::CoordSystem)
  Example    : my $funcgen_cs = $csa->validate_and_store_coord_system($core_cs);
  Description: Given a stored CoordSystem, retrieves the corresponding Funcgen
               CoordSystem or creates a new one. If the Funcgen CoordSystem
               does not yet know the schema build of the given CoordSystem's
               DB, that core information is added and a store is attempted;
               a failed store only produces a warning.
  Returntype : Bio::EnsEMBL::Funcgen::CoordSystem
  Exceptions : throw if arg not valid and stored
  Caller     : general
  Status     : At risk - just have validate and let DBAdaptor store totally new CSs?

=cut

# Accepts either a core CoordSystem or a Funcgen CoordSystem. Funcgen
# coord_system_ids do not match those of the core DBs, so lookup is done by
# name and version and the schema build is recorded per CoordSystem.

sub validate_and_store_coord_system {
  my $self = shift;
  my $cs   = shift;

  my $is_stored_cs = ref($cs) && $cs->isa('Bio::EnsEMBL::CoordSystem') && $cs->dbID();
  throw('Must provide a valid stored Bio::EnsEMBL::CoordSystem') if !$is_stored_cs;

  # Schema build of the DB the incoming CoordSystem came from.
  my $sbuild = $self->db->_get_schema_build($cs->adaptor->db());

  my $fg_cs = $self->fetch_by_name($cs->name(), $cs->version());

  if (!$fg_cs) {
    # Unversioned levels other than 'clone' are given the version of the
    # 'chromosome' coord system of the same DB.
    my $version;

    if ($cs->name ne 'clone' && (! $cs->version)) {
      $version = $cs->adaptor->fetch_by_name('chromosome')->version;
    }

    $fg_cs = Bio::EnsEMBL::Funcgen::CoordSystem->new(
      -NAME    => $cs->name(),
      -VERSION => $version || $cs->version(),
    );

    warn "Created new CoordSystem:\t".$fg_cs->name().":".$fg_cs->version()."\n";
  }

  # Nothing to add when this schema build is already known.
  return $fg_cs if $fg_cs->contains_schema_build($sbuild);

  $fg_cs->add_core_coord_system_info(
    -RANK                 => $cs->rank(),
    -SEQUENCE_LEVEL       => $cs->is_sequence_level(),
    -DEFAULT              => $cs->is_default(),
    -SCHEMA_BUILD         => $sbuild,
    -CORE_COORD_SYSTEM_ID => $cs->dbID(),
    -IS_STORED            => 0,
  );

  # Storing is done here rather than in the per-feature _pre_store to keep
  # imports fast; users without write permission fall back to the comparable
  # CoordSystem with a warning.
  eval { $fg_cs = $self->store($fg_cs) };

  if ($@) {
    warning("$@\nYou do not have permisson to store the CoordSystem for schema_build $sbuild\n".
            "Using comparable CoordSystem:\t".$fg_cs->name.':'.$fg_cs->version."\n");
  }

  return $fg_cs;
}
1;