Raw content of Bio::EnsEMBL::ExternalData::Family::DBSQL::FamilyAdaptor
# $Id: FamilyAdaptor.pm,v 1.15 2003/05/09 14:35:59 abel Exp $
#
# BioPerl module for Bio::EnsEMBL::ExternalData::Family::DBSQL::FamilyAdaptor
#
# Initially cared for by Philip Lijnzaad
# Now cared for by Elia Stupka and Abel Ureta-Vidal
#
# Copyright EnsEMBL
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
FamilyAdaptor - database adaptor for fetching and storing protein Family objects
This object represents a family coming from a database of protein families.
=head1 SYNOPSIS
use Bio::EnsEMBL::ExternalData::Family::DBSQL::DBAdaptor;
my $famdb = new Bio::EnsEMBL::ExternalData::Family::DBSQL::DBAdaptor(-user => 'myusername',
-dbname => 'myfamily_db',
-host => 'myhost');
my $fam_adtor = $famdb->get_FamilyAdaptor;
my $fam = $fam_adtor->fetch_by_stable_id('ENSF000013034');
my @fam = @{$fam_adtor->fetch_by_dbname_id('SPTR', 'P000123')};
@fam = @{$fam_adtor->fetch_by_description_with_wildcards('interleukin',1)};
@fam = @{$fam_adtor->fetch_all()};
### You can add the FamilyAdaptor as an 'external adaptor' to the 'main'
### Ensembl database object, then use it as:
$ensdb = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-user....);
$ensdb->add_db_adaptor('MyfamilyAdaptor', $fam_adtor);
# then later on, elsewhere:
$fam_adtor = $ensdb->get_db_adaptor('MyfamilyAdaptor');
# also available:
$ensdb->get_all_db_adaptors;
$ensdb->remove_db_adaptor('MyfamilyAdaptor');
=head1 DESCRIPTION
This module is an entry point into a database of protein families,
clustering SWISSPROT/TREMBL and ensembl protein sets using the TRIBE MCL algorithm.
The clustering neatly follows the SWISSPROT DE-lines, which are
taken as the description of the whole family.
The objects can be read from and written to a family database.
For more info, see ensembl-doc/family.txt
=head1 CONTACT
Philip Lijnzaad [original perl modules]
Anton Enright [TRIBE algorithm]
Elia Stupka [refactoring]
Abel Ureta-Vidal [multispecies migration]
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::ExternalData::Family::DBSQL::FamilyAdaptor;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::ExternalData::Family::Family;
use Bio::EnsEMBL::ExternalData::Family::FamilyMember;
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
use Bio::EnsEMBL::ExternalData::Family::FamilyConf;
use Bio::EnsEMBL::ExternalData::Family::Taxon;
@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
=head2 list_familyIds
Arg [1] : none
Example : @family_ids = @{$family_adaptor->list_familyIds()};
Description: Gets an array of internal ids for all families in the current db
Returntype : reference to a list of ints
Exceptions : none
Caller : ?
=cut
# Collect the internal family_id of every row of the family table.
# Returns a reference to an array of ids (an empty array when there are no rows).
sub list_familyIds {
    my $self = shift;

    my $handle = $self->prepare("SELECT family_id FROM family");
    $handle->execute;

    my @family_ids;
    while (1) {
        # an empty list from fetchrow marks the end of the result set
        my @row = $handle->fetchrow;
        last unless @row;
        push @family_ids, $row[0];
    }
    $handle->finish;

    return \@family_ids;
}
=head2 fetch_by_dbID
Arg [1] : int $dbID
Example : $fam = $FamilyAdaptor->fetch_by_dbID(1);
Description: fetches a Family given its database identifier
Returntype : an Bio::EnsEMBL::ExternalData::Family::Family object
Exceptions : when missing arguments
Caller : general
=cut
# Look up a single Family by its internal database identifier.
#   $fid - family_id of the wanted family (must be defined)
# Returns whatever _get_family() yields for the generated query.
# Throws when $fid is undefined.
# NOTE(review): $fid is interpolated directly into the SQL text, so only
# trusted numeric ids should be passed in.
sub fetch_by_dbID {
    my ($self, $fid) = @_;

    unless (defined $fid) {
        $self->throw("Should give a defined family_id as argument\n");
    }

    my $sql = join "\n",
        "SELECT family_id,stable_id,description,release,annotation_confidence_score",
        "FROM family",
        "WHERE family_id = $fid";

    return $self->_get_family($sql);
}
=head2 fetch_by_stable_id
Arg [1] : string $family_stable_id
Example : $fam = $FamilyAdaptor->fetch_by_stable_id('ENSF00000000009');
Description: fetches a Family given its stable identifier
Returntype : an Bio::EnsEMBL::ExternalData::Family::Family object
Exceptions : when missing arguments
Caller : general
=cut
# Fetch a Family by its stable identifier (e.g. 'ENSF00000000009').
#   $stable_id - stable id of the wanted family (must be defined)
# Returns the result of fetch_by_dbID() for the matching family_id.
# Throws when $stable_id is undefined or when no family row carries it.
sub fetch_by_stable_id {
    my ($self, $stable_id) = @_;

    $self->throw("Should give a defined family_stable_id as argument\n")
        unless (defined $stable_id);

    # Bind the stable id instead of interpolating it into the SQL text, so
    # that quotes or other special characters in the value cannot alter the
    # statement (same placeholder style as fetch_alignment and store).
    my $sth = $self->prepare("SELECT family_id FROM family WHERE stable_id = ?");
    $sth->execute($stable_id);
    my ($id) = $sth->fetchrow_array;
    $sth->finish;

    $id || $self->throw("Could not find family for stable id $stable_id");

    return $self->fetch_by_dbID($id);
}
=head2 fetch_by_dbname_id
Arg [1] : string $dbname
Arg [2] : string $member_stable_id
Example : $fams = $FamilyAdaptor->fetch_by_dbname_id('SPTR', 'P01235');
Description: find the family to which the given database and member_stable_id belong
Returntype : an array reference of Bio::EnsEMBL::ExternalData::Family::Family objects
(could be empty or contain more than one Family in the case of ENSEMBLGENE only)
Exceptions : when missing arguments
Caller : general
=cut
# Find the families containing a given member of a given external database.
#   $dbname  - name of the external database (matched against external_db.name)
#   $extm_id - external member id (matched against family_members.external_member_id)
# Returns the result of _get_families() for the generated query.
# Throws when either argument is undefined.
# NOTE(review): both values are interpolated into the SQL text unescaped.
sub fetch_by_dbname_id {
    my ($self, $dbname, $extm_id) = @_;

    if (!defined $dbname || !defined $extm_id) {
        $self->throw("Should give defined databasename and member_stable_id as arguments\n");
    }

    my $sql = join "\n",
        "SELECT f.family_id, f.stable_id, f.description,",
        "f.release, f.annotation_confidence_score",
        "FROM family f, family_members fm, external_db edb",
        "WHERE f.family_id = fm.family_id",
        "AND fm.external_db_id = edb.external_db_id",
        "AND edb.name = '$dbname'",
        "AND fm.external_member_id = '$extm_id'";

    return $self->_get_families($sql);
}
=head2 fetch_by_dbname_taxon_member
Arg [1] : string $dbname
Arg [2] : int $taxon_id
Arg [3] : string $member_stable_id
Example : $fams = $FamilyAdaptor->fetch_by_dbname_taxon_member('ENSEMBLGENE', '9606', 'ENSG000001101002');
Description: find the family to which the given database, taxon_id and member_stable_id belong
Returntype : an array reference of Bio::EnsEMBL::ExternalData::Family::Family objects
(could be empty or contain more than one Family in the case of ENSEMBLGENE only)
Exceptions : when missing arguments
Caller : general
=cut
# Find the families containing a given member of a given external database,
# restricted to one taxon.
#   $dbname   - name of the external database (matched against external_db.name)
#   $taxon_id - taxon id (matched against family_members.taxon_id)
#   $extm_id  - external member id (matched against family_members.external_member_id)
# Returns the result of _get_families() for the generated query.
# Throws when any argument is undefined.
# NOTE(review): all three values are interpolated into the SQL text unescaped.
sub fetch_by_dbname_taxon_member {
    my ($self, $dbname, $taxon_id, $extm_id) = @_;

    if (!defined $dbname || !defined $taxon_id || !defined $extm_id) {
        $self->throw("Should give defined databasename and taxon_id and member_stable_id as arguments\n");
    }

    my $sql = join "\n",
        "SELECT f.family_id, f.stable_id, f.description,",
        "f.release, f.annotation_confidence_score",
        "FROM family f, family_members fm, external_db edb",
        "WHERE f.family_id = fm.family_id",
        "AND fm.external_db_id = edb.external_db_id",
        "AND edb.name = '$dbname'",
        "AND fm.external_member_id = '$extm_id'",
        "AND fm.taxon_id = $taxon_id";

    return $self->_get_families($sql);
}
=head2 fetch_by_description_with_wildcards
Arg [1] : string $description
Arg [2] : int $wildcard (optional)
if set to 1, wildcards are added and the search is a slower LIKE search
Example : $fams = $FamilyAdaptor->fetch_by_description_with_wildcards('REDUCTASE',1);
Description: simplistic substring searching on the description to get the families
matching the description. (The search is currently case-insensitive;
this may change if SPTR changes to case-preservation)
Returntype : an array reference of Bio::EnsEMBL::ExternalData::Family::Family objects
Exceptions : none
Caller : general
=cut
# Search families by description.
#   $desc     - description text to look for
#   $wildcard - when true, the text is upper-cased, wrapped in '%' and matched
#               with LIKE; otherwise the description must equal $desc
# Returns the result of _get_families() for the generated query.
# NOTE(review): $desc is interpolated into the SQL text unescaped.
sub fetch_by_description_with_wildcards{
    my ($self, $desc, $wildcard) = @_;

    my $condition = $wildcard
        ? "WHERE f.description LIKE '%\U$desc\E%'"
        : "WHERE f.description = '$desc'";

    my $sql = join "\n",
        "SELECT f.family_id, f.stable_id, f.description,",
        "f.release, f.annotation_confidence_score",
        "FROM family f",
        $condition;

    return $self->_get_families($sql);
}
=head2 fetch_all
Args : none
Example : $FamilyAdaptor->fetch_all
Description: get all the families from a family database
Returntype : an array reference of Bio::EnsEMBL::ExternalData::Family::Family objects
Exceptions : none
Caller : general
=cut
# Fetch every family in the database.
# Returns the result of _get_families() for an unrestricted SELECT on family.
sub fetch_all {
    my $self = shift;

    my $sql = join "\n",
        "SELECT f.family_id, f.stable_id, f.description,",
        "f.release, f.annotation_confidence_score",
        "FROM family f";

    return $self->_get_families($sql);
}
=head2 fetch_Taxon_by_dbname_dbID
Arg [1] : string $dbname
Either "ENSEMBLGENE", "ENSEMBLPEP" or "SPTR"
Arg [2] : int dbID
a family_id
Example : $FamilyAdaptor->fetch_Taxon_by_dbname_dbID('ENSEMBLGENE',1)
Description: get all the taxons that belong to a particular database and family_id
Returntype : an array reference of Bio::EnsEMBL::ExternalData::Family::Taxon objects
(which may be empty)
Exceptions : when missing argument
Caller : general
=cut
# Fetch the distinct taxa of the members of one family that come from one
# external database.
#   $dbname - name of the external database (matched against external_db.name)
#   $dbID   - family_id of the family
# Returns a reference to an array holding one fetch_by_taxon_id() result per
# distinct taxon_id found (the array may be empty).
# Throws when either argument is undefined.
sub fetch_Taxon_by_dbname_dbID {
    my ($self, $dbname, $dbID) = @_;

    $self->throw("Should give defined databasename and family_id as arguments\n")
        unless (defined $dbname && defined $dbID);

    # Both values are bound rather than interpolated so that they cannot
    # change the shape of the statement.
    my $sth = $self->prepare(
        "SELECT distinct(taxon_id) as taxon_id
           FROM family f, family_members fm, external_db edb
          WHERE f.family_id = fm.family_id
            AND fm.external_db_id = edb.external_db_id
            AND f.family_id = ?
            AND edb.name = ?");
    $sth->execute($dbID, $dbname);

    my @taxons = ();
    my $taxon_adaptor;
    while (defined (my $rowhash = $sth->fetchrow_hashref)) {
        # Looked up lazily and only once: no rows means no adaptor request,
        # many rows no longer mean many identical requests.
        $taxon_adaptor ||= $self->db->get_TaxonAdaptor;
        my $taxon = $taxon_adaptor->fetch_by_taxon_id($rowhash->{taxon_id});
        push @taxons, $taxon;
    }

    return \@taxons;
}
=head2 known_databases
Args : none
Example : $FamilyAdaptor->known_databases
Description: get all database name, source of the family members
Returntype : an array reference of string
Exceptions : none
Caller : general
=cut
# Return the names of the external databases the family members come from.
# The list is obtained from _known_databases() on first use and kept in
# $self->{_known_databases} for subsequent calls.
sub known_databases {
    my $self = shift;

    $self->{_known_databases} = $self->_known_databases()
        unless defined $self->{_known_databases};

    return $self->{_known_databases};
}
=head2 get_max_id
Args : none
Example : $FamilyAdaptor->get_max_id
Description: find the highest family_stable_id (ENSFxxx) in this database (needed for mapping).
Returntype : string
Exceptions : none
Caller : general
=cut
# Return max(stable_id) of the family table (first column of the first row).
# A second argument is accepted for compatibility but is not used.
sub get_max_id {
    my ($self, $db) = @_;

    my $sth = $self->prepare("select max(stable_id) from family");
    $sth->execute;

    my ($highest_stable_id) = $sth->fetchrow_array;
    return $highest_stable_id;
}
=head2 fetch_alignment
Arg [1] : Bio::EnsEMBL::ExternalData::Family::Family $family
Example : $family_adaptor->fetch_alignment($family);
Description: Retrieves the alignment strings for all the members of a
family
Returntype : none
Exceptions : none
Caller : FamilyMember::align_string
=cut
# Load the alignment strings of all members of a family and hand each one to
# the corresponding member object.
#   $family - object providing get_all_members() and dbID()
# Returns nothing. When the family has no members no query is issued.
sub fetch_alignment {
    my ($self, $family) = @_;

    my $members = $family->get_all_members;
    if (!@$members) {
        return;
    }

    my $sql = join "\n",
        "SELECT family_member_id, alignment",
        "FROM family_members",
        "WHERE family_id = ?";
    my $sth = $self->prepare($sql);
    $sth->execute($family->dbID);

    # index the alignments by family member id
    my %alignment_of;
    for my $row (@{ $sth->fetchall_arrayref }) {
        my ($member_id, $alignment) = @{$row}[0, 1];
        $alignment_of{$member_id} = $alignment;
    }
    $sth->finish;

    # set the alignment string of each member (undef when no row matched)
    $_->alignment_string($alignment_of{ $_->dbID() }) for @$members;

    return;
}
##################
# internal methods
#internal method used in multiple calls above to build family objects from table data
# Run a family query and build one Family object per returned row.
#   $q           - SQL text selecting family_id, stable_id, description,
#                  release and annotation_confidence_score
#   @bind_values - optional values for '?' placeholders in $q; callers that
#                  pass only $q behave exactly as before
# Returns a reference to an array of Family objects (possibly empty); each
# object has this adaptor attached.
sub _get_families {
    my ($self, $q, @bind_values) = @_;

    my $sth = $self->prepare($q);
    $sth->execute(@bind_values);

    my @fams = ();
    while (defined (my $rowhash = $sth->fetchrow_hashref)) {
        # direct method call instead of the ambiguous indirect-object "new Class"
        my $fam = Bio::EnsEMBL::ExternalData::Family::Family->new;

        $fam->adaptor($self);
        $fam->dbID($rowhash->{family_id});
        $fam->stable_id($rowhash->{stable_id});
        $fam->description($rowhash->{description});
        $fam->release($rowhash->{release});
        $fam->annotation_confidence_score($rowhash->{annotation_confidence_score});

        push(@fams, $fam);
    }

    return \@fams;
}
# get one or no family, given some query
# Run a family query that is expected to match at most one family.
#   $q - SQL text, passed unchanged to _get_families()
# Returns the single Family, or undef when nothing matched.
# Throws when more than one family comes back.
sub _get_family {
    my ($self, $q) = @_;

    my $families = $self->_get_families($q);
    my $found    = @{$families};

    if ($found > 1) {
        # family_id and stable_id are unique keys, so this signals inconsistent data
        $self->throw("Internal database error, expecting at most one Family.\n"
            . "Check data coherence, e.g. have two families with different family_id have the same stable id.\n");
    }

    return $families->[0];
}
# Read the names of all rows of the external_db table.
# Returns a reference to an array of names.
# Throws when the table yields no rows.
sub _known_databases {
    my $self = shift;

    my $sth = $self->prepare("SELECT name FROM external_db");
    $sth->execute;

    my @names;
    while (1) {
        # an empty list marks the end of the result set
        my @row = $sth->fetchrow_array;
        last unless @row;
        push @names, $row[0];
    }

    $self->throw("didn't find any database") unless @names;

    return \@names;
}
###############
# store methods
=head2 store
Arg [1] : Bio::EnsEMBL::ExternalData::Family::Family $fam
Example : $FamilyAdaptor->store($fam)
Description: Stores a family object into a family database
Returntype : int
the database family identifier, if the family was stored correctly
Exceptions : thrown if Arg [1] is not a Bio::EnsEMBL::ExternalData::Family::Family
Caller : general
=cut
# Store a Family and its members in the family database.
#   $fam - a Bio::EnsEMBL::ExternalData::Family::Family object
# Returns the family_id of the family. When a row with the same stable_id
# already exists its family_id is returned and nothing is written.
# Throws when $fam is not a Family object (including undef and unblessed
# values).
sub store {
    my ($self, $fam) = @_;

    # The isa() call is wrapped in eval so that undef or an unblessed reference
    # leads to the intended exception below instead of perl's own
    # "Can't call method "isa" ..." error.
    my $is_family = do {
        local $@;
        eval { $fam->isa('Bio::EnsEMBL::ExternalData::Family::Family') };
    };
    $is_family ||
        $self->throw("You have to store a Bio::EnsEMBL::ExternalData::Family::Family object, not a $fam");

    # already stored under this stable id? then just report its id
    my $q = "SELECT family_id from family where stable_id = ?";
    $q = $self->prepare($q);
    $q->execute($fam->stable_id);
    my $rowhash = $q->fetchrow_hashref;
    if ($rowhash->{family_id}) {
        return $rowhash->{family_id};
    }

    $q = "INSERT INTO family (stable_id, description, release, annotation_confidence_score) VALUES (?,?,?,?)";
    $q = $self->prepare($q);
    $q->execute($fam->stable_id,$fam->description,$fam->release,$fam->annotation_confidence_score);
    # the new family_id is read from the statement handle's mysql_insertid attribute
    $fam->dbID($q->{'mysql_insertid'});

    # store every member, registering its source database first if necessary
    my $member_adaptor = $self->db->get_FamilyMemberAdaptor;
    foreach my $member (@{$fam->get_all_members}) {
        $member->external_db_id($self->_store_db_if_needed($member->database));
        $member_adaptor->store($fam->dbID,$member);
    }

    return $fam->dbID;
}
# Return the external_db_id for a database name, inserting a new external_db
# row when the name is not present yet.
#   $dbname - name of the external database
# Returns the existing external_db_id, or the mysql_insertid attribute of the
# INSERT statement handle for a newly created row.
sub _store_db_if_needed {
    my ($self, $dbname) = @_;

    my $lookup = $self->prepare("select external_db_id from external_db where name = ?");
    $lookup->execute($dbname);
    my $existing = $lookup->fetchrow_hashref;
    return $existing->{external_db_id} if $existing->{external_db_id};

    # unknown database name: register it
    my $insert = $self->prepare("INSERT INTO external_db (name) VALUES (?)");
    $insert->execute($dbname);
    return $insert->{'mysql_insertid'};
}
=head2 convert_store_family
Arg [1] : -family => \@Bio::Cluster::SequenceFamily
Example : $FamilyAdaptor->convert_store_family(-family=>\@family)
Description: converts Bio::Cluster::SequenceFamily objects into a Bio::EnsEMBL::ExternalData::Family objects
and store.
Returntype : array of dbIDs
the database family identifiers of the stored families
Exceptions :
Caller : general
=cut
# Convert cluster family objects into EnsEMBL Family objects and store them.
#   -family => reference to an array of family objects providing
#              get_members, family_id, description and annotation_score
# Settings are read from %FamilyConf: ENSEMBL_SPECIES, <SPECIES>_TAXON,
# FAMILY_PREFIX, FAMILY_START (defaults to 1), RELEASE and EXTERNAL_DBNAME.
# Returns the list of ids returned by store(), one per input family.
# Throws when a member ends up without a taxon id.
sub convert_store_family {
    my ($self, @args) = @_;

    my ($family) = $self->_rearrange([qw(FAMILY)], @args);

    my %conf = %Bio::EnsEMBL::ExternalData::Family::FamilyConf::FamilyConf;
    my @ens_species   = split(',', $conf{'ENSEMBL_SPECIES'});
    my $family_prefix = $conf{"FAMILY_PREFIX"};
    my $release       = $conf{'RELEASE'};
    my $ext_db_name   = $conf{'EXTERNAL_DBNAME'};

    # one taxon description string per configured species
    my @taxon_str;
    foreach my $sp (@ens_species) {
        $sp = uc $sp;
        push @taxon_str, $conf{"$sp"."_TAXON"};
    }
    my %ens_taxon_info = $self->_setup_ens_taxon(@taxon_str);

    my @id;
    my $family_count = $conf{"FAMILY_START"} || 1;

    foreach my $fam (@{$family}) {
        my @members = $fam->get_members;
        my @ens_mem;

        foreach my $mem (@members) {
            my $taxon = $mem->species;

            # No taxon id on the member: build the taxon from the configured
            # entry whose prefix matches the member's display id.
            # NOTE(review): the prefix is used as a regular expression as-is.
            if (!$taxon->ncbi_taxid) {
                foreach my $key (keys %ens_taxon_info) {
                    if ($mem->display_id =~ /$key/) {
                        my %taxon_hash = %{$ens_taxon_info{$key}};
                        my @class = split(':', $taxon_hash{'taxon_classification'});
                        $taxon = Bio::EnsEMBL::ExternalData::Family::Taxon->new(-classification => \@class);
                        $taxon->common_name($taxon_hash{'taxon_common_name'});
                        $taxon->sub_species($taxon_hash{'taxon_sub_species'});
                        $taxon->ncbi_taxid($taxon_hash{'taxon_id'});
                        last;
                    }
                }
            }
            bless $taxon, "Bio::EnsEMBL::ExternalData::Family::Taxon";

            my $member = Bio::EnsEMBL::ExternalData::Family::FamilyMember->new();
            $member->family_id($fam->family_id);

            # A member without any 'dblink' annotation used to die here on an
            # undefined $annot; it now falls through to the default database
            # name assigned further down.
            my ($annot) = $mem->annotation->get_Annotations('dblink');
            $member->database(uc $annot->database) if (defined $annot);

            $member->stable_id($mem->display_name);
            $taxon->ncbi_taxid || $self->throw($mem->id." has no taxon id!");
            $self->db->get_TaxonAdaptor->store_if_needed($taxon);
            $member->taxon_id($taxon->ncbi_taxid);
            # NOTE(review): this attaches the FamilyAdaptor to a FamilyMember;
            # confirm that this is intended rather than the FamilyMemberAdaptor.
            $member->adaptor($self);
            $member->database(uc $ext_db_name) if (! defined $member->database || $member->database eq "");
            push @ens_mem, $member;
        }

        # "%011d" zero-pads to 11 digits. The former "%011.0d" carried an
        # explicit precision, which makes perl ignore the 0 flag and pad with
        # spaces. The prefix is passed as an argument so a '%' in it cannot be
        # taken for a format directive.
        my $stable_id = sprintf("%s%011d", $family_prefix, $family_count);
        $family_count++;

        my $ens_fam = Bio::EnsEMBL::ExternalData::Family::Family->new(
            -stable_id   => $stable_id,
            -members     => \@ens_mem,
            -description => $fam->description,
            -score       => $fam->annotation_score);
        # The constructor used to be given a misspelled "-adpator" key, so the
        # adaptor was never attached; set it through the accessor instead.
        $ens_fam->adaptor($self);
        $ens_fam->release($release);
        #$ens_fam->annotation_confidence_score($fam->annotation_score);

        push @id, $self->store($ens_fam);
    }

    return @id;
}
#process ensembl taxon information for FamilyConf.pm
# Parse taxon description strings of the form "KEY=value;KEY=value;...".
#   @taxon_str - one description string per species
# Returns a hash (as a flat list) mapping each string's PREFIX value to a
# hash reference of its remaining key/value pairs. Empty values ("KEY=;")
# are turned into the literal string 'undef'.
sub _setup_ens_taxon {
    my ($self, @taxon_str) = @_;

    my %info_by_prefix;
    for my $spec (@taxon_str) {
        # give empty fields a value so that every field splits into a pair
        $spec =~ s/=;/=undef;/g;

        # flatten all "key=value" fields into one list, then pair them up
        my @tokens;
        push @tokens, split('=', $_) for split(';', $spec);
        my %fields = @tokens;

        # PREFIX is the index key and is not kept among the taxon fields
        my $prefix = delete $fields{'PREFIX'};
        $info_by_prefix{$prefix} = \%fields;
    }

    return %info_by_prefix;
}
###########################################
#
# Deprecated methods. Will be deleted soon.
# Deprecated: emits a warning through $self->warn and forwards to fetch_by_dbID.
sub get_Family_by_id {
    my $self = shift;
    my ($family_id) = @_;

    my $notice = join "\n",
        'FamilyAdaptor->get_Family_by_id is a deprecated method!',
        'Calling FamilyAdaptor->fetch_by_dbID instead!';
    $self->warn($notice);

    return $self->fetch_by_dbID($family_id);
}
# Deprecated: emits a warning through $self->warn and forwards to
# fetch_by_dbname_id with the database name 'ENSEMBLGENE'.
sub get_Family_of_Ensembl_gene_id {
    my $self = shift;
    my ($gene_id) = @_;

    my $notice = join "\n",
        'FamilyAdaptor->get_Family_of_Ensembl_gene_id is a deprecated method!',
        'Calling FamilyAdaptor->fetch_by_dbname_id instead!';
    $self->warn($notice);

    return $self->fetch_by_dbname_id('ENSEMBLGENE', $gene_id);
}
# Deprecated: emits a warning through $self->warn and forwards to
# fetch_by_dbname_id with the database name 'ENSEMBLPEP'.
sub get_Family_of_Ensembl_pep_id {
    my $self = shift;
    my ($peptide_id) = @_;

    my $notice = join "\n",
        'FamilyAdaptor->get_Family_of_Ensembl_pep_id is a deprecated method!',
        'Calling FamilyAdaptor->fetch_by_dbname_id instead!';
    $self->warn($notice);

    return $self->fetch_by_dbname_id('ENSEMBLPEP', $peptide_id);
}
# Deprecated: emits a warning through $self->warn and forwards to fetch_all.
sub all_Families {
    my $self = shift;

    my $notice = join "\n",
        'FamilyAdaptor->all_Families is a deprecated method!',
        'Calling FamilyAdaptor->fetch_all instead!';
    $self->warn($notice);

    return $self->fetch_all;
}
1;