# Raw content of Bio::EnsEMBL::Compara::DBSQL::GenomeDBAdaptor
#
# Ensembl module for Bio::EnsEMBL::Compara::DBSQL::GenomeDBAdaptor
#
# Cared for by Ewan Birney <birney@ebi.ac.uk>
#
# Copyright Ewan Birney
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::EnsEMBL::Compara::DBSQL::GenomeDBAdaptor - Object adaptor for the genome_db table

=head1 SYNOPSIS

  use Bio::EnsEMBL::Registry;

  my $reg = "Bio::EnsEMBL::Registry";

  $reg->load_registry_from_db(-host=>"ensembldb.ensembl.org", -user=>"anonymous");
  my $genome_db_adaptor = $reg->get_adaptor("Multi", "compara", "GenomeDB");

  $genome_db_adaptor->store($genome_db);

  $genome_db = $genome_db_adaptor->fetch_by_dbID(22);
  $all_genome_dbs = $genome_db_adaptor->fetch_all();
  $genome_db = $genome_db_adaptor->fetch_by_name_assembly("Homo sapiens", 'NCBI36');
  $genome_db = $genome_db_adaptor->fetch_by_registry_name("human");
  $genome_db = $genome_db_adaptor->fetch_by_Slice($slice);

=head1 DESCRIPTION

This module is intended to access data in the genome_db table. The genome_db
table stores information about each species, including the taxon_id, species
name, assembly, genebuild and the location of the core database.

=head1 AUTHOR - Ewan Birney

This module is part of the Ensembl project.

Email birney@ebi.ac.uk

Describe contact details here

=head1 APPENDIX

The rest of the documentation details each of the object methods. Internal
methods are usually preceded with a _

=cut

# Let the code begin...
package Bio::EnsEMBL::Compara::DBSQL::GenomeDBAdaptor;

use strict;
use warnings;

use Scalar::Util qw(blessed);

use Bio::EnsEMBL::DBSQL::BaseAdaptor;
use Bio::EnsEMBL::Compara::GenomeDB;
use Bio::EnsEMBL::Utils::Exception;

our @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);

# Hashes for storing a cross-referencing of compared genomes.
# NOTE: nothing in this module fills these hashes any more, so the deprecated
# check_for_consensus_db() and check_for_query_db() methods always return 0.
my %genome_consensus_xreflist;
my %genome_query_xreflist;


=head2 fetch_by_dbID

  Arg [1]    : int $dbid
  Example    : $genome_db = $gdba->fetch_by_dbID(1);
  Description: Retrieves a GenomeDB object via its internal identifier.
               The genome_db table is read and cached on first use.
  Returntype : Bio::EnsEMBL::Compara::GenomeDB, or undef if no GenomeDB
               with this dbID is in the cache
  Exceptions : thrown if $dbid is not defined
  Caller     : general
  Status     : Stable

=cut

sub fetch_by_dbID {
  my ($self, $dbid) = @_;

  if (!defined $dbid) {
    throw("Must fetch by dbid");
  }

  # check to see whether all the GenomeDBs have already been created
  $self->create_GenomeDBs unless defined $self->{'_GenomeDB_cache'};

  # A bogus dbID is simply absent from the cache, which yields undef
  return $self->{'_cache'}->{$dbid};
}


=head2 fetch_all

  Args       : none
  Example    : my $all_genome_dbs = $genome_db_adaptor->fetch_all();
  Description: gets all GenomeDBs for this compara database
  Returntype : listref Bio::EnsEMBL::Compara::GenomeDB (in no particular
               order; empty if the genome_db table is empty)
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub fetch_all {
  my ($self) = @_;

  $self->create_GenomeDBs unless defined $self->{'_GenomeDB_cache'};

  # "|| {}" keeps this safe even if the cache hash was never created
  return [ values %{ $self->{'_cache'} || {} } ];
}


=head2 fetch_by_name_assembly

  Arg [1]    : string $name
  Arg [2]    : (optional) string $assembly
  Example    : $gdb = $gdba->fetch_by_name_assembly("Homo sapiens", 'NCBI36');
  Description: Retrieves a genome db using the name of the species and
               the assembly version. When $assembly is undefined or empty,
               the row flagged assembly_default = 1 for $name is used.
  Returntype : Bio::EnsEMBL::Compara::GenomeDB
  Exceptions : thrown if $name is missing, or if a GenomeDB of name $name
               and $assembly cannot be found
  Caller     : general
  Status     : Stable

=cut

sub fetch_by_name_assembly {
  my ($self, $name, $assembly) = @_;

  unless ($name) {
    throw('name arguments are required');
  }

  my ($sql, @bind_values);
  if (defined $assembly && $assembly ne '') {
    $sql = "SELECT genome_db_id FROM genome_db WHERE name = ? AND assembly = ?";
    @bind_values = ($name, $assembly);
  } else {
    $sql = "SELECT genome_db_id FROM genome_db WHERE name = ? AND assembly_default = 1";
    @bind_values = ($name);
  }

  my $sth = $self->prepare($sql);
  $sth->execute(@bind_values);
  my ($id) = $sth->fetchrow_array();
  # Release the handle before a possible throw so it is never left active
  $sth->finish;

  if (!defined $id) {
    throw("No GenomeDB with this name [$name] and assembly [".
        ($assembly or "--undef--")."]");
  }

  return $self->fetch_by_dbID($id);
}


=head2 fetch_by_registry_name

  Arg [1]    : string $name
  Example    : $gdb = $gdba->fetch_by_registry_name("human");
  Description: Retrieves a genome db using the name of the species as
               used in the registry configuration file. Any alias is
               acceptable as well.
  Returntype : Bio::EnsEMBL::Compara::GenomeDB
  Exceptions : thrown if $name is missing or if no core DBAdaptor is
               registered for $name
  Caller     : general
  Status     : Stable

=cut

sub fetch_by_registry_name {
  my ($self, $name) = @_;

  unless ($name) {
    throw('name arguments are required');
  }

  # This file never loads the Registry at compile time, so make sure it is
  # available before calling a class method on it.
  require Bio::EnsEMBL::Registry;

  my $species_db_adaptor = Bio::EnsEMBL::Registry->get_DBAdaptor($name, "core");
  if (!$species_db_adaptor) {
    throw("Cannot connect to core database for $name!");
  }

  return $self->fetch_by_core_DBAdaptor($species_db_adaptor);
}


=head2 fetch_by_Slice

  Arg [1]    : Bio::EnsEMBL::Slice $slice
  Example    : $gdb = $gdba->fetch_by_Slice($slice);
  Description: Retrieves the genome db corresponding to this
               Bio::EnsEMBL::Slice object
  Returntype : Bio::EnsEMBL::Compara::GenomeDB
  Exceptions : thrown if $slice is not a Bio::EnsEMBL::Slice or has no
               adaptor attached
  Caller     : general
  Status     : Stable

=cut

sub fetch_by_Slice {
  my ($self, $slice) = @_;

  unless (blessed($slice) && $slice->isa("Bio::EnsEMBL::Slice")) {
    my $shown = defined $slice ? $slice : '';
    throw("[$shown] must be a Bio::EnsEMBL::Slice");
  }
  unless ($slice->adaptor) {
    throw("[$slice] must have an adaptor");
  }

  my $core_dba = $slice->adaptor()->db();
  return $self->fetch_by_core_DBAdaptor($core_dba);
}


=head2 fetch_by_taxon_id

  Arg [1]    : int $taxon_id
  Example    : $gdb = $gdba->fetch_by_taxon_id(1234);
  Description: Retrieves a genome db using the NCBI taxon_id of the species.
               Only rows flagged assembly_default = 1 are considered.
  Returntype : Bio::EnsEMBL::Compara::GenomeDB
  Exceptions : thrown if GenomeDB of taxon_id $taxon_id cannot be found. Will
               warn if the taxon returns more than one GenomeDB (possible in
               some branches of the Taxonomy); the one with the lowest
               genome_db_id is then returned
  Caller     : general
  Status     : Stable

=cut

sub fetch_by_taxon_id {
  my ($self, $taxon_id) = @_;

  unless ($taxon_id) {
    throw('taxon_id argument is required');
  }

  my $sql = "SELECT genome_db_id FROM genome_db".
      " WHERE taxon_id = ? AND assembly_default = 1".
      " ORDER BY genome_db_id";
  my $sth = $self->prepare($sql);
  $sth->execute($taxon_id);

  # Collect every matching row. A single fetchrow_array() call only returns
  # the first row, which would make the "more than one row" warning below
  # unreachable.
  my @ids;
  while (my ($this_id) = $sth->fetchrow_array()) {
    push @ids, $this_id;
  }
  $sth->finish;

  if (!@ids) {
    throw("No GenomeDB with this taxon_id [$taxon_id]");
  }

  my $id = $ids[0];
  if (@ids > 1) {
    warning("taxon_id [${taxon_id}] returned more than one row. Returning the first at ID [${id}]");
  }

  return $self->fetch_by_dbID($id);
}


=head2 fetch_by_core_DBAdaptor

  Arg [1]     : Bio::EnsEMBL::DBSQL::DBAdaptor
  Example     : my $gdb = $gdba->fetch_by_core_DBAdaptor($core_dba);
  Description : For a given core database adaptor object; this method will
                return the GenomeDB instance. The species name comes from the
                core meta container and the assembly from the version of the
                first coordinate system returned by the CoordSystemAdaptor.
  Returntype  : Bio::EnsEMBL::Compara::GenomeDB
  Exceptions  : thrown if the argument is not an object, if no name is found
                for the adaptor, or if the core database has no coordinate
                system
  Caller      : general
  Status      : Stable

=cut

sub fetch_by_core_DBAdaptor {
  my ($self, $core_dba) = @_;

  unless (blessed($core_dba)) {
    throw("Must provide a core Bio::EnsEMBL::DBSQL::DBAdaptor");
  }

  my $species_name =
      $self->get_species_name_from_core_MetaContainer($core_dba->get_MetaContainer());

  my ($highest_cs) = @{ $core_dba->get_CoordSystemAdaptor->fetch_all() || [] };
  unless ($highest_cs) {
    throw("No coordinate system found in the core database for [$species_name]");
  }

  return $self->fetch_by_name_assembly($species_name, $highest_cs->version());
}


=head2 get_species_name_from_core_MetaContainer

  Arg [1]     : Bio::EnsEMBL::MetaContainer
  Example     : $gdba->get_species_name_from_core_MetaContainer($slice->adaptor->db->get_MetaContainer);
  Description : Returns the name of a species which was used to
                name the GenomeDB from a meta container. Can be
                the species binomial name or the value of the
                meta item species.sql_name (which takes priority when set)
  Returntype  : Scalar string
  Exceptions  : thrown if no name is found
  Caller      : general
  Status      : Stable

=cut

sub get_species_name_from_core_MetaContainer {
  my ($self, $meta_container) = @_;

  my ($species_name) = @{ $meta_container->list_value_by_key('species.sql_name') };

  # Fall back on the binomial name only when species.sql_name is absent
  if (!defined $species_name) {
    $species_name = $meta_container->get_Species->binomial;
  }

  if (!$species_name) {
    throw('Species name was still empty/undefined after looking for species.sql_name and binomial name');
  }

  return $species_name;
}


=head2 store

  Arg [1]    : Bio::EnsEMBL::Compara::GenomeDB $gdb
  Example    : $gdba->store($gdb);
  Description: Stores a genome database object in the compara database if
               it has not been stored already. A row counts as already
               stored when taxon_id, name, assembly and genebuild all match;
               in that case only assembly_default and locator are updated.
               On insert, if $gdb already carries a dbID, the new row is
               renumbered to that dbID. The dbID and adaptor of $gdb are
               set before returning. The internal id of the stored genomeDB
               is returned.
  Returntype : int
  Exceptions : thrown if the argument is not a Bio::EnsEMBL::Compara:GenomeDB
               or if it lacks a name, assembly or taxon_id (a taxon_id of 0
               with no assembly is accepted)
  Caller     : general
  Status     : Stable

=cut

sub store {
  my ($self, $gdb) = @_;

  unless (blessed($gdb) && $gdb->isa('Bio::EnsEMBL::Compara::GenomeDB')) {
    my $shown = defined $gdb ? $gdb : '';
    throw("Must have genomedb arg [$shown]");
  }

  my $name             = $gdb->name;
  my $assembly         = $gdb->assembly;
  my $assembly_default = $gdb->assembly_default;
  my $taxon_id         = $gdb->taxon_id;
  my $genebuild        = $gdb->genebuild;
  my $locator          = $gdb->locator;

  if ((($taxon_id || 0) == 0) and !$assembly) {
    $assembly = "";
  } else {
    unless ($name && $assembly && $taxon_id) {
      throw("genome db must have a name, assembly, and taxon_id");
    }
  }

  # Look for an existing row. Values are passed as bind parameters instead of
  # being interpolated into the SQL. Undefined values are sent as '' because
  # that is what the former interpolated statement compared against.
  my $lookup_sth = $self->prepare(
      "SELECT genome_db_id FROM genome_db".
      " WHERE taxon_id = ? AND name = ? AND assembly = ? AND genebuild = ?");
  $lookup_sth->execute(
      map { defined $_ ? $_ : '' } ($taxon_id, $name, $assembly, $genebuild));
  my ($dbID) = $lookup_sth->fetchrow_array();
  $lookup_sth->finish;

  if (!$dbID) {
    # if the genome db has not been stored before, store it now
    my $insert_sth = $self->prepare(qq(
        INSERT into genome_db (name,assembly,taxon_id,assembly_default,genebuild,locator)
        VALUES (?,?,?,?,?,?)
      ));
    $insert_sth->bind_param(1, $name,             SQL_VARCHAR);
    $insert_sth->bind_param(2, $assembly,         SQL_VARCHAR);
    $insert_sth->bind_param(3, $taxon_id,         SQL_INTEGER);
    $insert_sth->bind_param(4, $assembly_default, SQL_TINYINT);
    $insert_sth->bind_param(5, $genebuild,        SQL_VARCHAR);
    $insert_sth->bind_param(6, $locator,          SQL_VARCHAR);

    if ($insert_sth->execute()) {
      $dbID = $insert_sth->{'mysql_insertid'};

      # The caller asked for a specific dbID: renumber the fresh row
      if ($gdb->dbID) {
        my $renumber_sth = $self->prepare(
            "UPDATE genome_db SET genome_db_id = ? WHERE genome_db_id = ?");
        if ($renumber_sth->execute($gdb->dbID, $dbID)) {
          $dbID = $gdb->dbID;
        }
      }
    }
  } else {
    # Already stored: refresh the two mutable columns only
    my $update_sth = $self->prepare(
        "UPDATE genome_db SET assembly_default = ?, locator = ?".
        " WHERE genome_db_id = ?");
    $update_sth->execute(
        (map { defined $_ ? $_ : '' } ($assembly_default, $locator)),
        $dbID);
  }

  # update the genomeDB object so that its dbID and adaptor are set
  $gdb->dbID($dbID);
  $gdb->adaptor($self);

  return $dbID;
}


=head2 create_GenomeDBs

  Arg [1]    : none
  Example    : none
  Description: Reads the genomedb table and creates an internal cache of the
               values of the table (one GenomeDB object per row, keyed by
               genome_db_id), then calls sync_with_registry().
  Returntype : none
  Exceptions : none
  Caller     : internal
  Status     : Stable

=cut

sub create_GenomeDBs {
  my ($self) = @_;

  # The consensus/query cross-reference hashes used to be populated here
  # from the genomic_align_genome table; that code was retired.

  # grab all the possible species databases in the genome db table
  my $sth = $self->prepare("
     SELECT genome_db_id, name, assembly, taxon_id, assembly_default, genebuild, locator
     FROM genome_db
   ");
  $sth->execute;

  # build a genome db for each species; start from an empty hash so that an
  # empty table still leaves a usable cache
  $self->{'_cache'} = {};
  while (my ($dbid, $name, $assembly, $taxon_id,
             $assembly_default, $genebuild, $locator) = $sth->fetchrow_array()) {
    my $gdb = Bio::EnsEMBL::Compara::GenomeDB->new();
    $gdb->name($name);
    $gdb->assembly($assembly);
    $gdb->taxon_id($taxon_id);
    $gdb->assembly_default($assembly_default);
    $gdb->dbID($dbid);
    $gdb->adaptor($self);
    $gdb->genebuild($genebuild);
    $gdb->locator($locator);

    $self->{'_cache'}->{$dbid} = $gdb;
  }
  $sth->finish;

  # Flag the cache as loaded BEFORE syncing: sync_with_registry() calls
  # fetch_all(), which would otherwise call back into this method.
  $self->{'_GenomeDB_cache'} = 1;

  $self->sync_with_registry();
}


=head2 check_for_consensus_db [DEPRECATED]

  DEPRECATED : consensus and query sequences are not used anymore.
               Please, refer to Bio::EnsEMBL::Compara::GenomicAlignBlock
               for more details.

  Arg[1]     : Bio::EnsEMBL::Compara::GenomeDB $query_genomedb
  Arg[2]     : Bio::EnsEMBL::Compara::GenomeDB $consensus_genomedb
  Arg[3]     : int $method_link_id
  Example    : none
  Description: Checks to see whether a consensus genome database has been
               analysed against the specific query genome database.
               Returns 1 if the query genomeDB is cross-referenced from the
               consensus genomeDB for this method link. A 0 is returned if
               no match is found.
  Returntype : int ( 0 or 1 )
  Exceptions : none
  Caller     : Bio::EnsEMBL::Compara::GenomeDB.pm

=cut

sub check_for_consensus_db {
  my ($self, $query_gdb, $con_gdb, $method_link_id) = @_;

  deprecate("consensus and query sequences are not used anymore.".
      " Please, refer to Bio::EnsEMBL::Compara::GenomicAlignBlock".
      " for more details");

  # just to make things a wee bit more readable
  my $cid = $con_gdb->dbID;
  my $qid = $query_gdb->dbID;

  my $xref_key = $cid .":" .$method_link_id;
  return 0 unless exists $genome_consensus_xreflist{$xref_key};

  my $matches = grep { $qid == $_ } @{ $genome_consensus_xreflist{$xref_key} };
  return $matches ? 1 : 0;
}


=head2 check_for_query_db [DEPRECATED]

  DEPRECATED : consensus and query sequences are not used anymore.
               Please, refer to Bio::EnsEMBL::Compara::GenomicAlignBlock
               for more details.

  Arg[1]     : Bio::EnsEMBL::Compara::GenomeDB $consensus_genomedb
  Arg[2]     : Bio::EnsEMBL::Compara::GenomeDB $query_genomedb
  Arg[3]     : int $method_link_id
  Example    : none
  Description: Checks to see whether a query genome database has been
               analysed against the specific consensus genome database.
               Returns 1 if the consensus genomeDB is cross-referenced from
               the query genomeDB for this method link. A 0 is returned if
               no match is found.
  Returntype : int ( 0 or 1 )
  Exceptions : none
  Caller     : Bio::EnsEMBL::Compara::GenomeDB.pm

=cut

sub check_for_query_db {
  my ($self, $con_gdb, $query_gdb, $method_link_id) = @_;

  deprecate("consensus and query sequences are not used anymore.".
      " Please, refer to Bio::EnsEMBL::Compara::GenomicAlignBlock".
      " for more details");

  # just to make things a wee bit more readable
  my $cid = $con_gdb->dbID;
  my $qid = $query_gdb->dbID;

  my $xref_key = $qid .":" .$method_link_id;
  return 0 unless exists $genome_query_xreflist{$xref_key};

  my $matches = grep { $cid == $_ } @{ $genome_query_xreflist{$xref_key} };
  return $matches ? 1 : 0;
}


=head2 get_all_db_links

  Arg[1]     : Bio::EnsEMBL::Compara::GenomeDB $query_genomedb
  Arg[2]     : int $method_link_id
  Example    : none
  Description: For the GenomeDB object passed in, check is run to
               verify which other genomes it has been analysed against
               irrespective as to whether this was as the consensus
               or query genome. Every method_link_species_set of the given
               method link that contains the GenomeDB is inspected and the
               other GenomeDBs of those sets are returned, each dbID
               appearing once.
  Returntype : listref of Bio::EnsEMBL::Compara::GenomeDBs (possibly empty,
               in no particular order)
  Exceptions : none
  Caller     : Bio::EnsEMBL::Compara::GenomeDB.pm
  Status     : At risk

=cut

sub get_all_db_links {
  my ($self, $ref_gdb, $method_link_id) = @_;

  my $mlss_adaptor = $self->db->get_MethodLinkSpeciesSetAdaptor;
  my $method_link_type =
      $mlss_adaptor->get_method_link_type_from_method_link_id($method_link_id);
  my $method_link_species_sets =
      $mlss_adaptor->fetch_all_by_method_link_type_GenomeDB($method_link_type, $ref_gdb);

  # Keyed by dbID so that a genome is reported once even if two sets hold
  # distinct objects for it
  my %linked_gdb_for;
  my $ref_dbID = $ref_gdb->dbID;
  foreach my $this_mlss (@{ $method_link_species_sets || [] }) {
    foreach my $this_genome_db (@{ $this_mlss->species_set }) {
      my $this_dbID = $this_genome_db->dbID;
      next if ($this_dbID eq $ref_dbID);
      $linked_gdb_for{$this_dbID} = $this_genome_db;
    }
  }

  return [ values %linked_gdb_for ];
}


=head2 sync_with_registry

  Example    : none
  Description: Synchronize all the cached genome_db objects
               db_adaptor (connections to core databases)
               with those set in Bio::EnsEMBL::Registry.
               Order of precedence is Registry.conf > ComparaConf > genome_db.locator
               Does nothing if Bio::EnsEMBL::Registry cannot be loaded.
  Returntype : none
  Exceptions : none
  Caller     : Bio::EnsEMBL::DBSQL::DBAdaptor
  Status     : At risk

=cut

sub sync_with_registry {
  my $self = shift;

  # The Registry is optional: silently skip the sync when it is unavailable
  return unless eval { require Bio::EnsEMBL::Registry; 1 };

  foreach my $genome_db (@{ $self->fetch_all() }) {
    my $name = $genome_db->name;
    my $registry_name;
    my $coreDBA;

    # First choice: a registry entry named "<name> <assembly>"
    if ($genome_db->assembly) {
      $registry_name = $name ." ". $genome_db->assembly;
      if (Bio::EnsEMBL::Registry->alias_exists($registry_name)) {
        $coreDBA = Bio::EnsEMBL::Registry->get_DBAdaptor($registry_name, 'core');
      }
    }

    # Second choice: a registry entry under the plain species name
    if (!defined($coreDBA) and Bio::EnsEMBL::Registry->alias_exists($name)) {
      $coreDBA = Bio::EnsEMBL::Registry->get_DBAdaptor($name, 'core');
      Bio::EnsEMBL::Registry->add_alias($name, $registry_name) if ($registry_name);
    }

    if ($coreDBA) {
      # defined in registry so override any previous connection
      # and set in GenomeDB object (ie either locator or compara.conf)
      $genome_db->db_adaptor($coreDBA);
      next;
    }

    # fetch from genome_db which may be from a compara.conf or from a locator
    $coreDBA = $genome_db->db_adaptor();
    next unless defined($coreDBA);

    if (Bio::EnsEMBL::Registry->alias_exists($name)) {
      Bio::EnsEMBL::Registry->add_alias($name, $registry_name) if ($registry_name);
    } elsif ($registry_name) {
      Bio::EnsEMBL::Registry->add_DBAdaptor($registry_name, 'core', $coreDBA);
      Bio::EnsEMBL::Registry->add_alias($registry_name, $name);
    } else {
      # No assembly, hence no "<name> <assembly>" key: register the adaptor
      # under the species name rather than under an undefined name
      Bio::EnsEMBL::Registry->add_DBAdaptor($name, 'core', $coreDBA);
    }
  }
}


=head2 deleteObj

  Arg         : none
  Example     : none
  Description : Called automatically by DBConnection during object destruction
                phase. Clears the cache to avoid memory leaks.
  Returntype  : none
  Exceptions  : none
  Caller      : general
  Status      : Stable

=cut

sub deleteObj {
  my $self = shift;

  # Empty the cache in place: the cached GenomeDBs hold a reference back to
  # this adaptor
  if ($self->{'_cache'}) {
    %{ $self->{'_cache'} } = ();
  }

  $self->SUPER::deleteObj;
}


1;