Raw content of Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor
#
# EnsEMBL module for Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor
#
# Copyright (c) 2006 Ensembl
#
# You may distribute this module under the same terms as perl itself
=head1 NAME
Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor - An Base class for all
Funcgen FeatureAdaptors, redefines some methods to use the Funcgen DB
=head1 SYNOPSIS
Abstract class - should not be instantiated. Implementation of
abstract methods must be performed by subclasses.
=head1 DESCRIPTION
This is a base adaptor for Funcgen feature adaptors. This base class is simply a way
of eliminating code duplication through the implementation of methods
common to all Funcgen feature adaptors.
=head1 CONTACT
Contact Ensembl development list for info:
=head1 METHODS
=cut
package Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor;
use vars qw(@ISA @EXPORT);
use strict;
use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
use Bio::EnsEMBL::Utils::Cache;
use Bio::EnsEMBL::Utils::Exception qw(warning throw deprecate stack_trace_dump);
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor);
@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
#Remove these?
our $SLICE_FEATURE_CACHE_SIZE = 4;
our $MAX_SPLIT_QUERY_SEQ_REGIONS = 3;
### To do ###
#Add Translation method for fetch by FeatureSets methods?
#Test and implement feature mapping between coord systems
#Correct/Document methods!!!
#Implement externaldb db_name version registry
=head2 generic_fetch
Arg [1] : (optional) string $constraint
An SQL query constraint (i.e. part of the WHERE clause)
Arg [2] : (optional) Bio::EnsEMBL::AssemblyMapper $mapper
A mapper object used to remap features
as they are retrieved from the database
Arg [3] : (optional) Bio::EnsEMBL::Slice $slice
A slice that features should be remapped to
Example : $fts = $a->generic_fetch('contig_id in (1234, 1235)', 'Swall');
Description: Wrapper method for core BaseAdaptor, build seq_region cache for features
Returntype : ARRAYREF of Bio::EnsEMBL::SeqFeature in contig coordinates
Exceptions : none
Caller : FeatureAdaptor classes
Status : at risk
=cut
sub generic_fetch {
my $self = shift;
#need to wrap _generic_fetch to always generate the
#seq_region_cache other wise non-slice based fetch methods fail
#build seq_region cache here once for entire query
$self->build_seq_region_cache();
return $self->SUPER::generic_fetch(@_);
}
=head2 fetch_all_by_Slice_constraint
Arg [1] : Bio::EnsEMBL::Slice $slice
the slice from which to obtain features
Arg [2] : (optional) string $constraint
An SQL query constraint (i.e. part of the WHERE clause)
Arg [3] : (optional) string $logic_name
the logic name of the type of features to obtain
Example : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5');
Description: Returns a listref of features created from the database which
are on the Slice defined by $slice and fulfill the SQL
constraint defined by $constraint. If logic name is defined,
only features with an analysis of type $logic_name will be
returned.
Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
Exceptions : thrown if $slice is not defined
Caller : Bio::EnsEMBL::Slice
Status : Stable
=cut
sub fetch_all_by_Slice_constraint {
my($self, $slice, $constraint, $logic_name) = @_;
my @result;
if(!ref($slice) || !$slice->isa("Bio::EnsEMBL::Slice")) {
throw("Bio::EnsEMBL::Slice argument expected.");
}
$constraint ||= '';
my $fg_cs = $self->db->get_FGCoordSystemAdaptor->fetch_by_name(
$slice->coord_system->name(),
$slice->coord_system->version()
);
if(! defined $fg_cs){
warn "No CoordSystem present for ".$slice->coord_system->name().":".$slice->coord_system->version();
return \@result;
}
#build seq_region cache here once for entire query
$self->build_seq_region_cache($slice);#, $fg_cs);
my @tables = $self->_tables;
my (undef, $syn) = @{$tables[0]};
#$constraint .= ' AND ' if $constraint;#constraint can be empty string
#$constraint .= " ${syn}.coord_system_id=".$fg_cs->dbID();
$constraint = $self->_logic_name_to_constraint($constraint, $logic_name);
#if the logic name was invalid, undef was returned
return [] if(!defined($constraint));
#check the cache and return if we have already done this query
#Added schema_build to feature cache for EFG
my $key = uc(join(':', $slice->name, $constraint, $self->db->_get_schema_build($slice->adaptor->db())));
if(exists($self->{'_slice_feature_cache'}->{$key})) {
return $self->{'_slice_feature_cache'}->{$key};
}
my $sa = $slice->adaptor();
# Hap/PAR support: retrieve normalized 'non-symlinked' slices
my @proj = @{$sa->fetch_normalized_slice_projection($slice)};
if(@proj == 0) {
throw('Could not retrieve normalized Slices. Database contains ' .
'incorrect assembly_exception information.');
}
# Want to get features on the FULL original slice
# as well as any symlinked slices
# Filter out partial slices from projection that are on
# same seq_region as original slice
my $sr_id = $slice->get_seq_region_id();
@proj = grep { $_->to_Slice->get_seq_region_id() != $sr_id } @proj;
my $segment = bless([1,$slice->length(),$slice ],
'Bio::EnsEMBL::ProjectionSegment');
push( @proj, $segment );
# construct list of Hap/PAR boundaries for entire seq region
my @bounds;
my $ent_slice = $sa->fetch_by_seq_region_id($sr_id);
$ent_slice = $ent_slice->invert() if($slice->strand == -1);
my @ent_proj = @{$sa->fetch_normalized_slice_projection($ent_slice)};
shift @ent_proj; # skip first
@bounds = map {$_->from_start - $slice->start() + 1} @ent_proj;
# fetch features for the primary slice AND all symlinked slices
foreach my $seg (@proj) {
my $offset = $seg->from_start();
my $seg_slice = $seg->to_Slice();
my $features = $self->_slice_fetch($seg_slice, $constraint); ## NO RESULTS? This is a problem with the cs->equals method?
# if this was a symlinked slice offset the feature coordinates as needed
if($seg_slice->name() ne $slice->name()) {
FEATURE:
foreach my $f (@$features) {
if($offset != 1) {
$f->{'start'} += $offset-1;
$f->{'end'} += $offset-1;
}
# discard boundary crossing features from symlinked regions
foreach my $bound (@bounds) {
if($f->{'start'} < $bound && $f->{'end'} >= $bound) {
next FEATURE;
}
}
$f->{'slice'} = $slice;
push @result, $f;
}
}
else {
push @result, @$features;
}
}
$self->{'_slice_feature_cache'}->{$key} = \@result;
return \@result;
}
=head2 build_seq_region_cache
Arg [1] : optional - Bio::EnsEMBL::Slice
the slice from which to obtain features
Example : $self->build_seq_region_cache();
Description: Builds the seq_region_id translation caches
Returntype : None
Exceptions : thrown if optional Slice argument is not valid
Caller : self
Status : At risk - should be private _build_seq_region_cache? Change arg to DBAdaptor? Or remove if we are building the full cache?
=cut
#do we even need to have the coord system?
#so long as we are only using one schema build i.e. one dnadb defualt = current
#slice and fg_cs are optional
#need to look at this
sub build_seq_region_cache{
my ($self, $slice) = @_;
if(defined $slice){
throw('Optional argument must be a Bio::EnsEMBL::Slice') if(! ( ref($slice) && $slice->isa('Bio::EnsEMBL::Slice')));
}
my $dnadb = (defined $slice) ? $slice->adaptor->db() : $self->db->dnadb();
my $schema_build = $self->db->_get_schema_build($dnadb);
my $sql = 'select sr.core_seq_region_id, sr.seq_region_id from seq_region sr';
my @args = ($schema_build);
if($self->is_multispecies()) {
$sql.= ', coord_system cs where sr.coord_system_id = cs.coord_system_id and cs.species_id=? and';
unshift(@args, $self->species_id());
}
else {
$sql.= ' where';
}
$sql.=' sr.schema_build =?';
#Can't maintain these caches as we may be adding to them when storing
#Caches seem like they are the wrong way around but this is still keeping it the original way
$self->{'seq_region_cache'} = {};
$self->{'core_seq_region_cache'} = {};
my $sth = $self->prepare($sql);
$sth->execute(@args);
while(my $ref = $sth->fetchrow_arrayref()) {
$self->{seq_region_cache}->{$ref->[0]} = $ref->[1];
$self->{core_seq_region_cache}->{$ref->[1]} = $ref->[0];
}
$sth->finish();
return;
}
#Need to separate these into by slice and by cs?
#We could just use an old cs/schema_build to grab the correct slice based on the name
#however we want to at least warn that the db needs updating
#The problem is that we want to detect whether the seq_region_id is present for _pre_store
#but automatically use a comparable seq_region for the normal fetch_methods.
#So we need to split or add $?
sub get_seq_region_id_by_Slice{
my ($self, $slice, $fg_cs, $test_present) = @_;
if(! ($slice && ref($slice) && $slice->isa("Bio::EnsEMBL::Slice"))){
throw('You must provide a valid Bio::EnsEMBL::Slice');
}
my ($core_sr_id, $fg_sr_id);
#Slice should always have an adaptor, no?
if( $slice->adaptor() ) {
$core_sr_id = $slice->adaptor()->get_seq_region_id($slice);
}
else {
$core_sr_id = $self->db()->get_SliceAdaptor()->get_seq_region_id($slice);
}
#This does not work!! When updating for a new schema_build we get the first
#seq_region stored, than for each subsequent one, it arbitrarily assigns a value from the hash even tho the
#the exists condition isn't met!
#my $fg_sr_id = $self->{'seq_region_cache'}{$core_sr_id} if exists $self->{'seq_region_cache'}{$core_sr_id};
#Can't replicate this using a normal hash
#This cache has been built based on the schema_build
if (exists $self->{'seq_region_cache'}{$core_sr_id}){
$fg_sr_id = $self->{'seq_region_cache'}{$core_sr_id};
}
if(! $fg_sr_id && ref($fg_cs)){
#This is used to store new seq_region info along side previous stored seq_regions of the same version
if( ! $fg_cs->isa('Bio::EnsEMBL::Funcgen::CoordSystem')){
throw('Must pass as valid Bio::EnsEMBL::Funcgen:CoordSystem to retrieve seq_region_ids for forwards compatibility, passed '.$fg_cs);
}
my $sql = 'select seq_region_id from seq_region where coord_system_id =? and name =?';
my $sth = $self->prepare($sql);
$sth->execute($fg_cs->dbID(), $slice->seq_region_name());
#This may not exist, so we need to catch it here?
($fg_sr_id) = $sth->fetchrow_array();
$sth->finish();
#if we are providing forward comptaiblity
#Then we know the eFG DB doesn't have the core seq_region_ids in the DB
#Hence retrieving the slice will fail in _obj_from_sth
#So we need to set it internally here
#Then pick it up when get_core_seq_region_id is called for the first time
#and populate the cache with the value
$self->{'_tmp_core_seq_region_cache'} = {(
$fg_sr_id => $core_sr_id
)};
}
elsif(! $fg_sr_id && ! $test_present) {
#This generally happens when using a new core db with a efg db that hasn't been updated
#Default to name match or throw if not present in DB
my $schema_build = $self->db->_get_schema_build($slice->adaptor->db());
my $core_cs = $slice->coord_system;
#This is basically avoiding the mapping of core to efg seq_region_ids
#via schema_build(of the new core db) as we are matching directly to the seq_name
my $sql = 'select distinct(seq_region_id) from seq_region sr, coord_system cs where sr.coord_system_id=cs.coord_system_id and sr.name=? and cs.name =?';
my @args = ($slice->seq_region_name(), $core_cs->name());
if($core_cs->is_top_level()) {
$sql.= ' and cs.version =?';
push(@args, $core_cs->version());
}
if($self->is_multispecies()) {
$sql.=' and cs.species_id=?';
push(@args, $self->species_id());
}
my $sth = $self->prepare($sql);
$sth->execute(@args);
($fg_sr_id) = $sth->fetchrow_array();
$sth->finish();
if(! $fg_sr_id){
throw('Cannot find previously stored seq_region for: '.$core_cs->name.':'.$core_cs->version.':'.$slice->seq_region_name.
"\nYou need to update your eFG seq_regions to match your core DB using: update_DB_for_release.pl\n");
}
warn 'Defaulting to previously store seq_region for: '.$core_cs->name.':'.
$core_cs->version.':'.$slice->seq_region_name.
"\nYou need to update your eFG seq_regions to match your core DB using: update_DB_for_release.pl\n";
}
return $fg_sr_id;
}
sub get_core_seq_region_id{
my ($self, $fg_sr_id) = @_;
#This is based on what the current schema_build is
#to enable multi-schema/assmelby look up
#we will have to nest these in schema_build caches
#and use the schema_build of the slice which is passed to acquire the core_seq_region_id
#likewise for reverse, i.e. when we store.
my $core_sr_id = $self->{'core_seq_region_cache'}{$fg_sr_id};
if(! defined $core_sr_id && exists $self->{'_tmp_core_seq_region_cache'}{$fg_sr_id}){
$self->{'core_seq_region_cache'}{$fg_sr_id} = $self->{'_tmp_core_seq_region_cache'}{$fg_sr_id};
#Delete here so we don't have schema_build specific sr_ids hanging around for another query
delete $self->{'_tmp_core_seq_region_cache'}{$fg_sr_id};
$core_sr_id = $self->{'core_seq_region_cache'}{$fg_sr_id};
}
return $core_sr_id;
}
=head2 _pre_store
Arg [1] : Bio::EnsEMBL::Feature
Example : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5');
Description: Helper function containing some common feature storing functionality
Given a Feature this will return a copy (or the same feature if no changes
to the feature are needed) of the feature which is relative to the start
of the seq_region it is on. The seq_region_id of the seq_region it is on
is also returned. This method will also ensure that the database knows which coordinate
systems that this feature is stored in. This supercedes teh core method, to trust the
slice the feature has been generated on i.e. from the dnadb. Also handles multi-coordsys
aspect, generating new coord_system_ids as appropriate
Returntype : Bio::EnsEMBL::Feature and the seq_region_id it is mapped to
Exceptions : thrown if $slice is not defined
Caller : Bio::EnsEMBL::"Type"FeatureAdaptors
Status : At risk
=cut
sub _pre_store {
my $self = shift;
my $feature = shift;
if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
throw('Expected Feature argument.');
}
$self->_check_start_end_strand($feature->start(),$feature->end(),
$feature->strand());
my $db = $self->db();
my $slice = $feature->slice();
if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) {
throw('Feature must be attached to Slice to be stored.');
}
# make sure feature coords are relative to start of entire seq_region
if($slice->start != 1 || $slice->strand != 1) {
throw("You must generate your feature on a slice starting at 1 with strand 1");
#move feature onto a slice of the entire seq_region
#$slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(),
# $slice->seq_region_name(),
# undef, #start
# undef, #end
# undef, #strand
# $slice->coord_system->version());
#$feature = $feature->transfer($slice);
#if(!$feature) {
# throw('Could not transfer Feature to slice of ' .
# 'entire seq_region prior to storing');
#}
}
# Ensure this type of feature is known to be stored in this coord system.
my $cs = $slice->coord_system;#from core/dnadb
#retrieve corresponding Funcgen coord_system and set id in feature
my $csa = $self->db->get_FGCoordSystemAdaptor();#had to call it FG as we were getting the core adaptor
my $fg_cs = $csa->validate_and_store_coord_system($cs);
$fg_cs = $csa->fetch_by_name($cs->name(), $cs->version());
#removed as we don't want this to slow down import
#my $sbuild = $self->db->_get_schema_build($slice->adaptor->db());
#if(! $fg_cs->contains_schema_build($sbuild)){
#warn "Adding new schema build $sbuild to CoordSystem\n";
#$fg_cs->add_core_coord_system_info(
# -RANK => $cs->rank(),
# -SEQ_LVL => $cs->is_sequence_level(),
# -DEFAULT_LVL => $cs->is_default(),
# -SCHEMA_BUILD => $sbuild,
# -CORE_COORD_SYSTEM_ID => $cs->dbID(),
# -IS_STORED => 0,
# );
#$fg_cs = $csa->store($fg_cs);
#}
#$feature->coord_system_id($fg_cs->dbID());
my ($tab) = $self->_tables();
my $tabname = $tab->[0];
#Need to do this for Funcgen DB
my $mcc = $db->get_MetaCoordContainer();
$mcc->add_feature_type($fg_cs, $tabname, $feature->length);
#build seq_region cache here once for entire query
$self->build_seq_region_cache($slice);#, $fg_cs);
#Now need to check whether seq_region is already stored
#1 is test present flag
my $seq_region_id = $self->get_seq_region_id_by_Slice($slice, undef, 1);
if(! $seq_region_id){
#check whether we have an equivalent seq_region_id
$seq_region_id = $self->get_seq_region_id_by_Slice($slice, $fg_cs);
my $schema_build = $self->db->_get_schema_build($slice->adaptor->db());
my $sql;
my @args = ($slice->seq_region_name(), $fg_cs->dbID(), $slice->get_seq_region_id(), $schema_build);
#Add to comparable seq_region
if($seq_region_id) {
$sql = 'insert into seq_region(seq_region_id, name, coord_system_id, core_seq_region_id, schema_build) values (?,?,?,?,?)';
unshift(@args, $seq_region_id);
}
#No compararble seq_region
else{
$sql = 'insert into seq_region(name, coord_system_id, core_seq_region_id, schema_build) values (?,?,?,?)';
}
my $sth = $self->prepare($sql);
#Need to eval this
eval{$sth->execute(@args);};
if(!$@){
$seq_region_id = $sth->{'mysql_insertid'};
}
}
#my $seq_region_id = $slice->get_seq_region_id();
#would never get called as we're not validating against a different core DB
#if(!$seq_region_id) {
# throw('Feature is associated with seq_region which is not in this dnadb.');
#}
#why are returning seq_region id here..is thi snot just in the feature slice?
#This is actually essential as the seq_region_ids are not stored in the slice
#retrieved from slice adaptor
return ($feature, $seq_region_id);
}
#This is causing problems with remapping
sub _slice_fetch {
my $self = shift;
my $slice = shift;
my $orig_constraint = shift;
my $slice_start = $slice->start();
my $slice_end = $slice->end();
my $slice_strand = $slice->strand();
my $slice_cs = $slice->coord_system();
#my $slice_seq_region = $slice->seq_region_name();
#### Here!!! We Need to translate the seq_regions IDs to efg seq_region_ids
#we need to fetch the seq_region ID based on the coord_system id and the name
#we don't want to poulate with the eFG seq_region_id, jsut the core one, as we need to maintain core info in the slice.
#we need to cache the seq_region_id mappings for the coord_system and schema_build,
#so we don't do it for every feature in a slice
#should we just reset it in temporarily in fetch above, other wise we lose cache between projections
#should we add seq_region cache to coord_system/adaptor?
#Then we can access it from here and never change the slice seq_region_id
#how are we accessing the eFG CS? As we only have the core CS from the slice here?
#No point in having in eFG CS if we're just having to recreate it for everyquery
#CSA already has all CS's cached, so query overhead would be mininal
#But cache overhead for all CS seq_region_id caches could be quite large.
#we're going to need it in the _pre_store, so having it in the CS would be natural there
#can we just cache dynamically instead?
#we're still calling for csa rather than accessing from cache
#could we cache the csa in the base feature adaptor?
#my $slice_seq_region_id = $slice->get_seq_region_id();
#get the synonym and name of the primary_table
my @tabs = $self->_tables;
my ($tab_name, $tab_syn) = @{$tabs[0]};
#find out what coordinate systems the features are in
my $mcc = $self->db->get_MetaCoordContainer();
my @feat_css=();
my $mca = $self->db->get_MetaContainer();
my $value_list = $mca->list_value_by_key( $tab_name."build.level" );
if( @$value_list and $slice->is_toplevel()) {
push @feat_css, $slice_cs;
}
else{
@feat_css = @{$mcc->fetch_all_CoordSystems_by_feature_type($tab_name)};
}
my $asma = $self->db->get_AssemblyMapperAdaptor();
my @features;
# fetch the features from each coordinate system they are stored in
COORD_SYSTEM: foreach my $feat_cs (@feat_css) {
my $mapper;
my @coords;
my @ids;
if($feat_cs->equals($slice_cs)) {
# no mapping is required if this is the same coord system
my $max_len = $self->_max_feature_length() ||
$mcc->fetch_max_length_by_CoordSystem_feature_type($feat_cs,$tab_name);
my $constraint = $orig_constraint;
#my $sr_id;
#if( $slice->adaptor() ) {
#$sr_id = $self->get_seq_region_id_by_Slice($slice->adaptor()->get_seq_region_id($slice));
#} else {
# $sr_id = $self->get_seq_region_id_by_Slice($self->db()->get_SliceAdaptor()->get_seq_region_id($slice));
# }
my $sr_id = $self->get_seq_region_id_by_Slice($slice, $feat_cs);
$constraint .= " AND " if($constraint);
$constraint .=
"${tab_syn}.seq_region_id = $sr_id AND " .
"${tab_syn}.seq_region_start <= $slice_end AND " .
"${tab_syn}.seq_region_end >= $slice_start";
if($max_len) {
my $min_start = $slice_start - $max_len;
$constraint .=
" AND ${tab_syn}.seq_region_start >= $min_start";
}
my $fs = $self->generic_fetch($constraint,undef,$slice);
# features may still have to have coordinates made relative to slice
# start
$fs = _remap($fs, $mapper, $slice);
push @features, @$fs;
}
#can't do remapping yet as AssemblyMapper expects a core CS
#change AssemblyMapper
#or do we just create a core CS just for the remap and convert back when done
#else {
# $mapper = $asma->fetch_by_CoordSystems($slice_cs, $feat_cs);
# next unless defined $mapper;
# Get list of coordinates and corresponding internal ids for
# regions the slice spans
# @coords = $mapper->map($slice_seq_region, $slice_start, $slice_end,
# $slice_strand, $slice_cs);
# @coords = grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @coords;
# next COORD_SYSTEM if(!@coords);
# @ids = map {$_->id()} @coords;
##coords are now id rather than name
## @ids = @{$asma->seq_regions_to_ids($feat_cs, \@ids)};
# When regions are large and only partially spanned by slice
# it is faster to to limit the query with start and end constraints.
# Take simple approach: use regional constraints if there are less
# than a specific number of regions covered.
# if(@coords > $MAX_SPLIT_QUERY_SEQ_REGIONS) {
# my $constraint = $orig_constraint;
# my $id_str = join(',', @ids);
# $constraint .= " AND " if($constraint);
# $constraint .= "${tab_syn}.seq_region_id IN ($id_str)";
# my $fs = $self->generic_fetch($constraint, $mapper, $slice);
# $fs = _remap($fs, $mapper, $slice);
# push @features, @$fs;
# } else {
# do multiple split queries using start / end constraints
# my $max_len = $self->_max_feature_length() ||
# $mcc->fetch_max_length_by_CoordSystem_feature_type($feat_cs,
# $tab_name);
# my $len = @coords;
# for(my $i = 0; $i < $len; $i++) {
# my $constraint = $orig_constraint;
# $constraint .= " AND " if($constraint);
# $constraint .=
# "${tab_syn}.seq_region_id = " . $ids[$i] . " AND " .
# "${tab_syn}.seq_region_start <= " . $coords[$i]->end() . " AND ".
# "${tab_syn}.seq_region_end >= " . $coords[$i]->start();
# if($max_len) {
# my $min_start = $coords[$i]->start() - $max_len;
# $constraint .=
# " AND ${tab_syn}.seq_region_start >= $min_start";
# }
# my $fs = $self->generic_fetch($constraint,$mapper,$slice);
# $fs = _remap($fs, $mapper, $slice);
# push @features, @$fs;
# }
# }
### }
} #COORD system loop
return \@features;
}
#
# Given a list of features checks if they are in the correct coord system
# by looking at the first features slice. If they are not then they are
# converted and placed on the slice.
#
sub _remap {
my ($features, $mapper, $slice) = @_;
#check if any remapping is actually needed
if(@$features && (!$features->[0]->isa('Bio::EnsEMBL::Feature') ||
$features->[0]->slice == $slice)) {
return $features;
}
#remapping has not been done, we have to do our own conversion from
#to slice coords
my @out;
my $slice_start = $slice->start();
my $slice_end = $slice->end();
my $slice_strand = $slice->strand();
my $slice_cs = $slice->coord_system();
my ($seq_region, $start, $end, $strand);
#my $slice_seq_region_id = $slice->get_seq_region_id();
my $slice_seq_region = $slice->seq_region_name();
foreach my $f (@$features) {
#since feats were obtained in contig coords, attached seq is a contig
my $fslice = $f->slice();
if(!$fslice) {
throw("Feature does not have attached slice.\n");
}
my $fseq_region = $fslice->seq_region_name();
my $fseq_region_id = $fslice->get_seq_region_id();
my $fcs = $fslice->coord_system();
if(!$slice_cs->equals($fcs)) {
#slice of feature in different coord system, mapping required
($seq_region, $start, $end, $strand) =
$mapper->fastmap($fseq_region_id,$f->start(),$f->end(),$f->strand(),$fcs);
# undefined start means gap
next if(!defined $start);
} else {
$start = $f->start();
$end = $f->end();
$strand = $f->strand();
$seq_region = $f->slice->seq_region_name();
}
# maps to region outside desired area
next if ($start > $slice_end) || ($end < $slice_start) ||
($slice_seq_region ne $seq_region);
#shift the feature start, end and strand in one call
if($slice_strand == -1) {
$f->move( $slice_end - $end + 1, $slice_end - $start + 1, $strand * -1 );
} else {
$f->move( $start - $slice_start + 1, $end - $slice_start + 1, $strand );
}
$f->slice($slice);
push @out,$f;
}
return \@out;
}
=head2 fetch_all_by_external_name
Arg [1] : String $external_name
An external identifier of the feature to be obtained
Arg [2] : (optional) String $external_db_name
The name of the external database from which the
identifier originates.
Example : my @features =
@{ $adaptor->fetch_all_by_external_name( 'NP_065811.1') };
Description: Retrieves all features which are associated with
an external identifier such as a GO term, Swissprot
identifer, etc. Usually there will only be a single
feature returned in the list reference, but not
always. Features are returned in their native
coordinate system, i.e. the coordinate system in which
they are stored in the database. If they are required
in another coordinate system the Feature::transfer or
Feature::transform method can be used to convert them.
If no features with the external identifier are found,
a reference to an empty list is returned.
Returntype : arrayref of Bio::EnsEMBL::Feature objects
Exceptions : none
Caller : general
Status : at risk
=cut
sub fetch_all_by_external_name {
my ( $self, $external_name, $external_db_name ) = @_;
my $entryAdaptor = $self->db->get_DBEntryAdaptor();
my (@ids);
my @tmp = split/::/, ref($self);
my $feature_type = pop @tmp;
$feature_type =~ s/FeatureAdaptor//;
my $xref_method = 'list_'.lc($feature_type).'_feature_ids_by_extid';
if(! $entryAdaptor->can($xref_method)){
warn "Does not yet accomodate $feature_type feature external names";
return;
}
else{
@ids = $entryAdaptor->$xref_method($external_name, $external_db_name);
}
return $self->fetch_all_by_dbID_list( \@ids );
}
=head2 fetch_all_by_stable_Feature_FeatureSets
Arg [1] : string - seq_region_name i.e. chromosome name.
Arg [2] : string - seq_region_start of current slice bound
Arg [3] : string - seq_region_end of current slice bound.
Arg [4] : Bio::EnsEMBL::Gene|Transcript|Translation
Arg [5] : arrayref - Bio::EnsEMBL::Funcgen::FeatureSet
Example : ($start, $end) = $self->_set_bounds_by_regulatory_feature_xref
($trans_chr, $start, $end, $transcript, $fsets);
Description: Internal method to set an xref Slice bounds given a list of
FeatureSets.
Returntype : List - ($start, $end);
Exceptions : throw if incorrent args provided
Caller : self
Status : at risk
=cut
#We really only want this method to work on storables
#Namely all SetFeatures, could extend to FeatureTypes
#This is a reimplementation of what has been used in the
#Funcgen::SliceAdaptor::_set_bounds_by_xref_FeatureSet
#we could have a fourth arg here which is a coderef to filter the feature returned?
#This could be used by the SliceAdaptor to only return features which exceed a current slice
#We would still have to test the start and end in the caller, so not a massive win
#for too much code obfucation. Altho we are maintaining two versions of this code
#which could get out of sync, epsecially with respect to external_db names
#This is currently feature generic
#So can call from one adaptor but will return features from all adaptors?
#Where can we put this?
#DBEntry adaptor?
#The only logical place for this is really a convinience method in the core BaseFeatureAdaptor
#This removes the problem of the generic nature of the returned data, as the focus is with the
#caller which is a singular feature, therefore, not mixing of data types
#restrict to one feature type for now. but this will be slower as we will have to make the list call
#for each type of feature_set passed, so if these are mixed they will be redundant
#Let's just put a warning in
#Keep this structure so we can port it to a convinience method somewhere
#and slim down later?
#This does not descend Gene>Transcript>Translation
#Now we have conflicting standards
#FeatureSets was kep as a list to prevent having to pass [$fset] for one FeatureSet
#This also elimated the need to test ref == ARRAY
#But now we want an extra optional descend flag
#Should we change back?
#Or should we wrap this method in one which will decend?
#Write wrappers for now, as we will have to write the descend loop anyway
#But this does not resolve the @fsets vs [$fset] issue
#Will have to do this in the wrappers anyway, so change back
#and live with the dichotomy of FeatureSet implementations for now
sub fetch_all_by_stable_Storable_FeatureSets{
my ($self, $obj, $fsets) = @_;
my ($extdb_name);
my $dbe_adaptor = $self->db->get_DBEntryAdaptor;
#Do we need a central registry for ensembl db names, what about versions?
if(ref($obj) && $obj->isa('Bio::EnsEMBL::Storable') && $obj->can('stable_id')){
my @tmp = split/::/, ref($obj);
my $obj_type = pop @tmp;
my $group = $obj->adaptor->db->group;
#Could sanity check for funcgen here?
if (! defined $group){
throw('You must pass a stable Bio::EnsEMBL::Feature with an attached DBAdaptor with the group attribute set');
}
$extdb_name = 'ensembl_'.$group.'_'.$obj_type;
#warn "ext db name is $extdb_name";
}
else{
throw('Must pass a stable Bio::EnsEMBL::Feature, you passed a '.$obj);
}
#Set which eFG features we want to look at.
if(ref($fsets) ne 'ARRAY' || scalar(@$fsets) == 0){
throw('Must define an array of Bio::EnsEMBL::FeatureSets to extend xref Slice bound. You passed: '.$fsets);
}
my %feature_set_types;
foreach my $fset(@$fsets){
$self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::FeatureSet', $fset);
$feature_set_types{$fset->type} ||= [];
push @{$feature_set_types{$fset->type}}, $fset;
}
#We can list the outer loop here and put in the BaseFeatureAdaptor, or possible storable as we do have FeatureType xrefs.
#This would be useful for fetching all the efg features for a given xref and FeatureSets
#Don't implement as a parent sub and call from here as this would mean looping through array twice.
#Altho we could pass a code ref to do the filtering?
#Just copy and paste for now to avoid obfuscation
my @features;
#Get xrefs for each eFG feature set type
foreach my $fset_type(keys %feature_set_types){
#my $xref_method = 'list_'.$fset_type.'_feature_ids_by_extid';
#e.g. list_regulatory_feature_ids_by_extid
#Do type test here
my $adaptor_type = ucfirst($fset_type).'FeatureAdaptor';
#remove this for generic method
next if ref($self) !~ /$adaptor_type/;
#This is for generic method
#my $adaptor_method = 'get_'.ucfirst($fset_type).'FeatureAdaptor';
#my $adaptor = $self->db->$adaptor_method;
my %feature_set_ids;
map $feature_set_ids{$_->dbID} = 1, @{$feature_set_types{$fset_type}};
my $cnt = 0;
#Change this self to adaptor for generic method
foreach my $efg_feature(@{$self->fetch_all_by_external_name($obj->stable_id, $extdb_name)}){
#Skip if it is not in one of our FeatureSets
next if ! exists $feature_set_ids{$efg_feature->feature_set->dbID};
push @features, $efg_feature;
}
}
return \@features;
}
sub fetch_all_by_Gene_FeatureSets{
my ($self, $gene, $fsets, $dblinks) = @_;
if(! ( ref($gene) && $gene->isa('Bio::EnsEMBL::Gene'))){
throw("You must pass a valid Bio::EnsEMBL::Gene object");
}
my @features = @{$self->fetch_all_by_stable_Storable_FeatureSets($gene, $fsets)};
if($dblinks){
foreach my $transcript(@{$gene->get_all_Transcripts}){
push @features, @{$self->fetch_all_by_Transcript_FeatureSets($transcript, $fsets, $dblinks)};
}
}
return \@features;
}
sub fetch_all_by_Transcript_FeatureSets{
my ($self, $transc, $fsets, $dblinks) = @_;
if(! ( ref($transc) && $transc->isa('Bio::EnsEMBL::Transcript'))){
throw("You must pass a valid Bio::EnsEMBL::Transcript object");
}
my @features = @{$self->fetch_all_by_stable_Storable_FeatureSets($transc, $fsets)};
if($dblinks){
my $translation = $transc->translation;
push @features, @{$self->fetch_all_by_stable_Storable_FeatureSets($translation, $fsets)} if $translation;
}
return \@features;
}
=head2 fetch_by_display_label
Arg [1] : String $label - display label of feature to fetch
Example : my $feat = $adaptor->fetch_by_display_label("BRCA2");
Description: Returns the feature which has the given display label or
undef if there is none. If there are more than 1, only the first
is reported.
Returntype : Bio::EnsEMBL::Funcgen::Feature
Exceptions : none
Caller : general
Status : At risk
=cut
sub fetch_by_display_label {
my $self = shift;
my $label = shift;
my @tables = $self->_tables();
#my $syn = $tables[0]->[1];
my $constraint = "x.display_label = '$label'";# AND ${syn}.is_current = 1";
my ($feature) = @{ $self->generic_fetch($constraint) };
return $feature;
}
1;