Raw content of BioMart::Registry # $Id: Registry.pm,v 1.10.2.1 2008/07/30 15:35:59 syed Exp $ # # BioMart module for BioMart::Registry # # You may distribute this module under the same terms as perl # itself. # POD documentation - main docs before the code. =head1 NAME BioMart::Registry =head1 SYNOPSIS TODO: Synopsis here. =head1 DESCRIPTION The registry is created by the initializer (BioMart::Initializer::getRegistry()) and it acts as a repository of datasets and links between datasets for the entire BioMart system, and all client code. The registry is also responsible for setting up all possible links between datasets once these are populated with filters and attributes. =head1 AUTHOR - Arek Kasprzyk, Syed Haider, Andreas Kahari, Damian Smedley =head1 CONTACT This module is part of the BioMart project http://www.biomart.org Questions can be posted to the mart-dev mailing list: mart-dev@ebi.ac.uk =head1 METHODS =cut #------------------------------------------------------------------------ package BioMart::Registry; use strict; use warnings; use BioMart::Links; use Data::Dumper; use Storable qw(store retrieve freeze nfreeze thaw); local $Storable::Deparse = 1; $Storable::forgive_me = 1; use Cwd; use BioMart::Dataset::TableSet; use BioMart::Dataset::GenomicSequence; use BioMart::Dataset::GenomicMAlign; use constant INF => 10_000; # Used in __Dijkstra(), must be # larger than the total number # of datasets. # Extends BioMart::Root use base qw(BioMart::Root); BEGIN{ # We want to use XML::Parser if installed, as it's faster than XML::SAX no strict 'refs'; my $fail; unless( 'XML::'->{'Parser::'} ){ # Check if already used eval "require XML::Parser"; if( $@ ){ $fail ++ } else{ XML::Parser->import() } } unless( $fail ){ $XML::Simple::PREFERRED_PARSER = 'XML::Parser' } } #------------------------------------------------------------------------ =head2 new Usage : my $registry = BioMart::Registry->new(); Description: Creates a new BioMart::Registry object. Return type: A BioMart::Registry object. Exceptions : Caller : =cut sub _new { my ($self, @params) = shift; $self->SUPER::_new(@params); $self->attr('defaultDatasetName', undef); $self->attr('virtualSchemas',[]); $self->attr('mode','MEMORY'); ## default to MEMORY other option is LAZYLOAD $self->attr('dirPath',undef); $self->attr('settingsConfParams', undef); } #------------------------------------------------------------------------ =head2 getMode Usage : $registry->getMode() $registry->setMode('LAZYLOAD'); To set Description: get the mode, default to MEMORY Return type: A string Exceptions : none Caller : caller =cut sub getMode { my ($self) = @_; return $self->get('mode'); } #------------------------------------------------------------------------ =head2 setMode Usage : $registry->getMode() $registry->setMode('LAZYLOAD'); To set Description: get the mode, default to MEMORY, Also sets the same mode for DATASETI objects Return type: none Exceptions : none Caller : caller =cut sub setMode { my ($self, $val) = @_; if ($val) { $self->set('mode', $val); } foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ foreach my $location (@{$virtualSchema->getAllLocations}){ foreach my $dataset (@{$location->getAllDatasets}){ $dataset->setMode($val); } } } } #------------------------------------------------------------------------ =head2 getDirPath Usage : $registry->getDirPath() $registry->setDirPath('/abc/def/'); To set Description: get the path to the folder taht contains registry file, where confTrees, _portables, XML directories live Return type: A string Exceptions : none Caller : caller =cut sub getDirPath { my ($self) = @_; return $self->get('dirPath'); } #------------------------------------------------------------------------- =head2 setDirPath Usage : $registry->getDirPath() $registry->setDirPath('/abc/def/'); To set Description: get the path to the folder taht contains registry file, where confTrees, _portables, XML directories live, also set the same path for DATASETI objects Return type: none Exceptions : none Caller : caller =cut sub setDirPath { my ($self, $val) = @_; if ($val) { $self->set('dirPath', $val); } foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ foreach my $location (@{$virtualSchema->getAllLocations}){ foreach my $dataset (@{$location->getAllDatasets}){ $dataset->setDirPath($val); } } } } #------------------------------------------------------------------------ =head2 cleanXMLs Usage : $registry->cleanXMLs() Description: deletes all existing XML files from disk, for action => clean, during Initializer->new Return type: none Exceptions : none Caller : caller =cut sub cleanXMLs { my ($self) = @_; my $cleanFile = $self->getDirPath(); my $v_schemas = $self->getAllVirtualSchemas(); foreach my $schema (@$v_schemas) { my $databases = $self->getAllDatabaseNames($schema->name()); ## databases are locations as per old API calls foreach my $database_name (@$databases) { my $datasets = $self->getAllDataSetsByDatabaseName($schema->name(), $database_name); foreach my $dataset_name(@$datasets) { my $dataset = $self->getDatasetByName($schema->name(), $dataset_name); my $interfacesList = $dataset->interfaces(); # should return a comma separated list of interfaces my @interfacesArray = split /,/,$interfacesList; foreach my $interface(@interfacesArray) { my $temp; $temp = $cleanFile; $temp .= $schema->name()."/XML/".$dataset->locationName().".".$dataset->name().".".$interface; #$temp .= $schema->name()."/XML/".$dataset->getParam('configurator')->get('location')->database().".".$dataset->name().".".$interface; if (-e $temp) { unlink $temp; } } } } } } #------------------------------------------------------------------------ =head2 getAllVirtualSchemaNames Usage : foreach my $vSchemas ( @{ $registry->getAllVirtualSchemas} ) {...} Description: get an arrayref of all virtualSchemas within the registry This will always return at least 'defaultSchema'. Return type: An arrayref of strings Exceptions : none Caller : caller =cut sub getAllVirtualSchemaNames { my ($self,$visible_only) = @_; my @names; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ if ($visible_only){ push @names, $virtualSchema->name if ($virtualSchema->visible == 1); } else{ push @names, $virtualSchema->name; } } @names = sort(@names); return \@names; } #------------------------------------------------------------------------ =head2 getDefaultVirtualSchema Usage : my $default = $registry->getDefaultVirtualSchema; Description: gets the default virtualSchema if any as defined in the registry XML Return type: String Exceptions : none Caller : caller =cut sub getDefaultVirtualSchema { my $self = shift; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ return $virtualSchema->name if ($virtualSchema->default == 1); } } =head2 getAllDatasetNames Usage : my @dataSetNames = $registry->getAllDatasetsNames($virtualSchema); my @visibleDatasetNames = $registry->getAllDatasetsNames($virtualSchema, $visible_only); Description: Gets all names of all datasets within a virtualSchema, or all visible datasets within a virtualSchema. If $visible_only is defined, only visible datasets are returned. Return type: An array of strings in list context, or a reference to such an array in scalar context. Exceptions : none Caller : caller =cut sub getAllDatasetNames { my ($self, $virtualSchemaName, $visible_only) = @_; # sort all the dataset Names per Mart based on their displayNames if visible_only = 1 my @dataSetNames; my %sortedNames; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ next unless ($virtualSchema->name eq $virtualSchemaName); foreach my $location (@{$virtualSchema->getAllLocations}){ %sortedNames=(); foreach my $dataset (@{$location->getAllDatasets}){ if ($visible_only){ $sortedNames{$dataset->displayName} = $dataset->name if ($dataset->visible == 1); } else{ # directly pushing as invisible datasets often has no display Name push @dataSetNames,$dataset->name; } } if ($visible_only){ foreach my $displayName ( sort keys %sortedNames ) { push @dataSetNames, $sortedNames{$displayName}; } } } } return (wantarray() ? @dataSetNames : \@dataSetNames); } #------------------------------------------------------------------------ =head2 getAllDisplayNames Usage : my @dataSetNames = $registry->getAllDisplayNames($virtualSchema); my @dataSetNames = $registry->getAllDisplayNames($virtualSchema, $visible_only); Description: Gets all display names of all datasets within a virtualSchema. If $visible_only is defined, displayNames are returned only for visible datasets. Return type: An array of strings in list context, or a reference to such an array in scalar context. Exceptions : none Caller : caller =cut sub getAllDisplayNames { my ($self, $virtualSchemaName, $visible_only) = @_; my @dataSetNames; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ next unless ($virtualSchema->name eq $virtualSchemaName); foreach my $location (@{$virtualSchema->getAllLocations}){ if ($visible_only){ push @dataSetNames, @{$location->getAllVisibleDatasetDisplayNames}; } else { foreach my $dataset (@{$location->getAllDatasets}){ push @dataSetNames,$dataset->displayName; } } } } return (wantarray() ? @dataSetNames : \@dataSetNames); } #------------------------------------------------------------------------ =head2 getLinksBetween Usage : my @link = $registry->getLinkBetween($virtualSchema, $exportingDatasetName, $importingDatasetName); Description: Returns the BioMart::Links object that links the given exportingDataset to the given importingDataset within the given virtualSchema, if one exists. Return type: BioMart::Dataset::Links object, or undef it no link exists between the two datasets within the given virtualSchema. Exceptions : exporting or importing DatasetName are not known in the configuration Caller : =cut sub getLinkBetween { my ($self,$virtualSchema,$exportingDatasetName,$importingDatasetName) = @_; my $link; my $exportingDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $exportingDatasetName); my $importingDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $importingDatasetName); LINK: foreach my $li (@{ $exportingDatasetEntry->{'links'} }) { my $targetDatasetName = $li->targetDataset(); if ($importingDatasetName && $importingDatasetName eq $targetDatasetName) { $link = $li; last LINK; } } return $link; } #------------------------------------------------------------------------ =head2 getDatasetByName Usage : my $dataSet = $registry->getDatasetByName($virtualSchema, $DatasetName); Description: Gets a named dataset from the registry within the given virtualSchema. Return type: A BioMart::DatasetI object, or undef Exceptions : none Caller : caller =cut sub getDatasetByName { my ($self, $virtualSchema, $dataSetName ,$martUser) = @_; my $dataSetEntry = $self->__fetchDatasetEntry($virtualSchema, $dataSetName, $martUser); return $dataSetEntry; } #------------------------------------------------------------------------ =head2 getAttribute Usage : my $att = $registry->getAttribute($dataset, $attribute, $schema, $interface); Description: Retrieve attribute object given its internal name and dataset Return type: A BioMart::Configuration::Attribute object, or undef Exceptions : none Caller : caller =cut sub getAttribute { my ($self, $dataset_name, $attributename, $schema_name, $interface) = @_; $schema_name ||= 'default'; $interface ||= 'default'; # Retrieve config-info for this dataset, return undef if can't be found my $dataset_obj = $self->getDatasetByName($schema_name,$dataset_name); if(!defined($dataset_obj)) { my $errmsg = "Dataset $schema_name\.$dataset_name not found in registry"; BioMart::Exception::Configuration->throw($errmsg); } my $dataset_conf = $dataset_obj->getConfigurationTree($interface); BioMart::Exception::Configuration->throw("No config-tree found for dataset $schema_name\. $dataset_name") if (!$dataset_conf); # Query config-tree for attribute, return undef if we can't find it my $attribute = $dataset_conf->getAttributeByName($attributename); if(!defined($attribute)) { my $errmsg = "Attribute '$attributename' not found in dataset $schema_name\.$dataset_name"; BioMart::Exception::Configuration->throw($errmsg); } # returning softwareVersion each time as thats the only quiet method of doing so # no additional memory headache as conftree is already requested for a particular att/filter my $softwareVersion = $dataset_conf->software_version(); if(wantarray()) { return ($attribute, $softwareVersion); } else { return $attribute; } #return $attribute; } =head2 getFilter Usage : my $filter = $registry->getFilter($dataset, $filtname, $schema, $interface); Description: Retrieve filter object given its internal name and dataset Return type: A BioMart::Configuration::BaseFilter implementing object, or undef Exceptions : none Caller : caller =cut sub getFilter { my ($self, $dataset_name, $filtername, $schema_name, $interface) = @_; $schema_name ||= 'default'; $interface ||= 'default'; # Retrieve config-info for this dataset, return undef if can't be found my $dataset_obj = $self->getDatasetByName($schema_name,$dataset_name); if(!defined($dataset_obj)) { my $errmsg = "Dataset $schema_name\.$dataset_name not found in registry"; BioMart::Exception::Configuration->throw($errmsg); } my $dataset_conf = $dataset_obj->getConfigurationTree($interface); BioMart::Exception::Configuration->throw("No config-tree found for dataset $schema_name\. $dataset_name") if (!$dataset_conf); # Query config-tree for attribute, return undef if we can't find it my $filter = $dataset_conf->getFilterByName($filtername); if(!defined($filter)) { my $errmsg = "Filter '$filtername' not found in dataset $schema_name\.$dataset_name"; BioMart::Exception::Configuration->throw($errmsg); } # returning softwareVersion each time as thats the only quiet method of doing so # no additional memory headache as conftree is already requested for a particular att/filter my $softwareVersion = $dataset_conf->software_version(); if(wantarray()) { return ($filter, $softwareVersion); } else { return $filter; } #return $filter; } =head2 getConfigTreeForDataset Usage : my $confTree = $registry->getConfigTreeForDataset($dataset, $schema, $interface); Description: Retrieve ConfigTree object given its dataset and interface Return type: A BioMart::Configuration::ConfigurationTree object, or undef Exceptions : none Caller : caller =cut sub getConfigTreeForDataset { my ($self, $dataset_name, $schema_name,$interface) = @_; $schema_name ||= 'default'; $interface ||= 'default'; my $dataset = $self->getDatasetByName($schema_name,$dataset_name); if(!defined($dataset)) { my $errmsg = "Can't find dataset $schema_name\.$dataset_name in registry"; BioMart::Exception::Configuration->throw($errmsg); } return $dataset->getConfigurationTree($interface); } #------------------------------------------------------------------------ =head2 getDefaultDataset Usage : my $dataset = $registry->getDefaultDataset; Description: Retrieve the default Schema's default Database' default Dataset Return type: Dataset object Exceptions : none Caller : caller =cut sub getDefaultDataset { my ($self) = @_; my $default_schema = $self->getDefaultVirtualSchema() || 'default'; my $default_database = $self->getDefaultDatabase($default_schema); $default_database ||= $self->getAllDatabaseNames($default_schema, 1)->[0]; my $dataset_names = $self->getAllDataSetsByDatabaseName($default_schema, $default_database, 1); foreach my $dataset_name(@$dataset_names) { my $dataset = $self->getDatasetByName($default_schema, $dataset_name); my $is_default = $dataset->getConfigurationTree('default')->defaultDataset(); if($is_default && $is_default eq 'true') { return $dataset; } } # Return first dset on list if no dsets are flagged as default return $self->getDatasetByName($default_schema, $dataset_names->[0]); } #------------------------------------------------------------------------ =head2 getAllDatabaseNames Usage : my $databases = $registry->getAllDatabaseNames($virtualSchema); my $visible_databases = $registry->getAllDatabaseNames($virtualSchema, $visible_only); Description: Gets a ref to an array of database names from a particular virtualSchema in the registry. ('defaultSchema' is the virtualSchema assigned to any database not explicitly assigned to a virtualSchema). If $visible_only is defined, returns names only for visible databases. Return type: A ref to an array of Strings Exceptions : none Caller : caller =cut sub getAllDatabaseNames { my ($self, $virtualSchemaName, $visible_only) = @_; my @dataBaseNames; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ next unless ($virtualSchema->name eq $virtualSchemaName); foreach my $location (@{$virtualSchema->getAllLocations}){ if ($visible_only){ push @dataBaseNames,$location->displayName if ($location->visible == 1); } else{ push @dataBaseNames,$location->displayName; } } } return (wantarray() ? @dataBaseNames : \@dataBaseNames); } =head2 getDefaultDatabase Usage : my $databases = $registry->getDefaultDatabase($virtualSchema); Description: Gets the default database name for a particular virtualScema from the registry. Return type: String Exceptions : none Caller : caller =cut sub getDefaultDatabase { my $self = shift; my $virtualSchemaName = shift; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ next unless ($virtualSchema->name eq $virtualSchemaName); foreach my $location (@{$virtualSchema->getAllLocations}){ return $location->displayName if ($location->default && $location->default == 1); } # incase no default location, use the first one as default location foreach my $location (@{$virtualSchema->getAllLocations}) { return $location->displayName; } } return ''; } #------------------------------------------------------------------------ =head2 getAllDataSetsByDatabaseName Usage : my $dataSets = $registry->getAllDatasetsByDatabaseName($virtualSchema, $DatabaseName); my $visible_dataSets = $registry->getAllDatasetsByDatabaseName($virtualSchema, $DatabaseName, $visible_only); Description: Gets a ref to an array of dataset names from the registry for a particular database, in a particular virtualSchema. If $visible_only is defined returns names only for visible datasets. Return type: A ref to an array of Strings. Exceptions : none Caller : caller =cut sub getAllDataSetsByDatabaseName { my ($self, $virtualSchemaName, $databaseName, $visible_only) = @_; # sort all the dataset Names in a Mart based on their displayNames if visible_only is true my @dataSetNames; my %sortedNames; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ next unless ($virtualSchema->name eq $virtualSchemaName); foreach my $location (@{$virtualSchema->getAllLocations}){ next unless ($location->displayName eq $databaseName); %sortedNames=(); foreach my $dataset (@{$location->getAllDatasets}){ if ($visible_only){ $sortedNames{$dataset->displayName} = $dataset->name if ($dataset->visible == 1); } else{ # directly pushing as invisible datasets often has no display Name push @dataSetNames,$dataset->name; } } if ($visible_only){ foreach my $displayName ( sort keys %sortedNames ) { push @dataSetNames, $sortedNames{$displayName}; } } } } return (wantarray() ? @dataSetNames : \@dataSetNames); } #------------------------------------------------------------------------ =head2 _createAllLinks Usage : $registry->_createAllLinks(); Description: Investigates all datasets in the registry and creates all possible links between them. Any already existing link will be removed. Also precalculates dataset clusters and shortest paths between datasets. Return type: Exceptions : Caller : =cut sub _createAllLinks { my ($self) = @_; warn("Setting possible links between datasets\n"); # Count them my $totalDSCount = 0; my $currentDSCount = 0; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ foreach my $location (@{$virtualSchema->getAllLocations}){ $totalDSCount += scalar @{$location->getAllDatasets}; } } foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ foreach my $location (@{$virtualSchema->getAllLocations}){ foreach my $dataset (@{$location->getAllDatasets}){ delete $dataset->{'links'}; delete $dataset->{'cluster'}; delete $dataset->{'pathHash'}; $currentDSCount++; printf STDERR "\r....(scanning) %d%%",(100*($currentDSCount/$totalDSCount)); # add any missing placeholder datasets before create links my $dataSetName = $dataset->name; my @interfaces = split(/\,/,$dataset->interfaces); foreach my $interface(@interfaces){ next if (!${$dataset->get('configurationTrees')} {$interface});# only do for cached configTrees my $configTree = #$dataset->getConfigurationTree($interface); $dataset->getConfigurationTree($interface,'CREATE_ALL_LINKS'); my $configurator = $dataset->getConfigurator(); # incase placeholder datasets not added yet $configurator-> addPlaceHolderDatasets($configTree, $virtualSchema->name, ); } } #---------------------reset the configurations trees to 'DISK' if ($self->getMode() eq 'LAZYLOAD') ### this call is made by Web.pm, when disk flag is set { foreach my $dataset (@{$location->getAllDatasets}) { my $dataSetName = $dataset->name; my @interfaces = split(/\,/,$dataset->interfaces); foreach my $interface(@interfaces){ $dataset->setConfigurationTree($interface, 'LAZYLOAD'); } } } #--------------------- } } print STDERR "\n"; $currentDSCount = 0; my $squaredDSCount = $totalDSCount*$totalDSCount; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ foreach my $location (@{$virtualSchema->getAllLocations}){ foreach my $datasetA (@{$location->getAllDatasets}){ foreach my $locationB (@{$virtualSchema->getAllLocations}){ foreach my $datasetB (@{$locationB->getAllDatasets}){ $currentDSCount++; printf STDERR "\r....(linking) %d%%",(100*($currentDSCount/$squaredDSCount)); next if ($datasetA->name eq $datasetB->name); $self->__linkDatasets($virtualSchema->name, $datasetA->name, $datasetB->name); } } } } } print STDERR "\n"; $currentDSCount = 0; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ foreach my $location (@{$virtualSchema->getAllLocations}){ foreach my $dataset (@{$location->getAllDatasets}){ $currentDSCount++; printf STDERR "\r....(sorting) %d%%",(100*($currentDSCount/$totalDSCount)); $dataset->{'pathHash'} = $self->__Dijkstra( $virtualSchema->name, $dataset->name); } } } print STDERR "\n"; $self->__cluster($totalDSCount); # add placeholder filts/atts now all datasets initialised as can # then recover real filts/atts from cached configurationTrees and # also check the linking is valid $currentDSCount = 0; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ my $virtualSchemaName = $virtualSchema->name; foreach my $location (@{$virtualSchema->getAllLocations}){ foreach my $dataset (@{$location->getAllDatasets}){ $currentDSCount++; printf STDERR "\r....(resolving) %d%%",(100*($currentDSCount/$totalDSCount)); my $dataSetName = $dataset->name; my @interfaces = split(/\,/,$dataset->interfaces); foreach my $interface(@interfaces){ next if (!${$dataset->get('configurationTrees')} {$interface});# only do for cached configTrees my $configTree = #$dataset->getConfigurationTree($interface); $dataset->getConfigurationTree($interface,'CREATE_ALL_LINKS'); my $configurator = $dataset->getConfigurator(); $configurator->addPlaceHolders($configTree, $virtualSchemaName, $dataSetName, $interface); # cause dependsOn stuff to be distributed and populated $configurator->resolveDependsOn($configTree, $virtualSchemaName, $dataSetName, $interface); } } #---------------------reset the configurations trees to 'DISK' if ($self->getMode() eq 'LAZYLOAD') ### this call is made by Web.pm, when disk flag is set { foreach my $dataset (@{$location->getAllDatasets}) { my $dataSetName = $dataset->name; my @interfaces = split(/\,/,$dataset->interfaces); foreach my $interface(@interfaces){ $dataset->setConfigurationTree($interface, 'LAZYLOAD'); } } } #--------------------- } } print STDERR "\n"; } #------------------------------------------------------------------------ =head2 getPath Usage : my @dataSetNames = $registry->getPath($virtualSchema, $sourceDataset, $targetDataset); Description: Returns the shortest path from $sourceDataset to $targetDataset withing the given virtualSchema. Return type: Array of scalars (strings, dataset names), or undef if a path could not be found. Exceptions : Caller : =cut sub getPath { my ($self, $virtualSchema, $sourceDataset, $targetDataset) = @_; my $sourceDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $sourceDataset); my $targetDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $targetDataset); my $pathHash = $sourceDatasetEntry->{'pathHash'}; my @path; my $currentDataset = $targetDataset; while (defined $currentDataset) { unshift @path, $currentDataset; $currentDataset = $pathHash->{$currentDataset}; } if (!$path[0] || ($path[0] ne $sourceDataset) || !$path[-1] || ($path[-1] ne $targetDataset)) { return undef; } return (wantarray() ? @path : \@path); } #------------------------------------------------------------------------ =head2 configure Usage : $registry->configure(); $registry->configure([['hsapiens_gene_ensembl'], ['mmusculus_gene_ensembl']); $registry->configure([['hsapiens_gene_ensembl', 'default', 'default'], ['mmusculus_gene_ensembl', 'default', 'default']]); Description: The method caches configurationTrees in the Dataset objects for the specified datasets (or all if none are specified) and builds all possible links between subsystem by calling Registry::createAllLinks() Return type: Exceptions : Caller : =cut sub configure { my ($self,$datasets) = @_; if (!$datasets) { $self->_getConfigurationTrees; } else { foreach my $dataset(@{$datasets}){ $self->getConfigTreeForDataset(${$dataset}[0], ${$dataset}[1] || 'default', ${$dataset}[2] || 'default'); } $self->_createAllLinks(); } #---------------------------------------------------------------- if ($self->getMode() eq 'LAZYLOAD') { my $v_schemas = $self->getAllVirtualSchemas(); foreach my $schema (@$v_schemas) { my $databases = $self->getAllDatabaseNames($schema->name()); ## databases are locations as per old API calls foreach my $database_name (@$databases) { my $datasets = $self->getAllDataSetsByDatabaseName($schema->name(), $database_name); foreach my $dataset_name(@$datasets) { my $dataset = $self->getDatasetByName($schema->name(), $dataset_name); my $interfacesList = $dataset->interfaces(); # should return a comma separated list of interfaces my @interfacesArray = split /,/,$interfacesList; foreach my $interface(@interfacesArray) { $dataset->setExportables($interface, 'LAZYLOAD'); $dataset->setImportables($interface, 'LAZYLOAD'); } } } } } #---------------------------------------------------------------- } #------------------------------------------------------------------------ =head2 _getConfigurationTrees Usage : $registry->_getConfigurationTrees(); Description: The method caches all configurationTrees in the Dataset pbjects and builds all possible links between subsystem by calling Registry::createAllLinks() Return type: Exceptions : Caller : =cut sub _getConfigurationTrees { my $self = shift; my $dsCounter; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ foreach my $location (@{$virtualSchema->getAllLocations}){ $dsCounter=0; foreach my $dataset (@{$location->getAllDatasets}){ my @interfaces = split(/\,/,$dataset->interfaces); foreach my $interface(@interfaces){ $dsCounter++; $dataset->getConfigurationTree($interface,$dsCounter); } } } } $self->_createAllLinks(); } sub toXML { my ($self, $registryXML) = @_; if ($registryXML) { $self->{'registryXML'}=$registryXML; } return $self->{'registryXML'}; } =head2 interface_type Usage : $registry->interface_type Description: get/set the interface_type Returntype : string interface_type Exceptions : none Caller : caller =cut sub interface_type { my ($self, $value) = @_; if ($value) { $self->{'interface_type'}= $value; } return $self->{'interface_type'}; } =head2 addVirtualSchema Usage : usage Description: Description Returntype : Exceptions : none Caller : caller =cut sub addVirtualSchema { my ($self, $virtualSchema) = @_; my $virtualSchemas = $self->get('virtualSchemas'); push @{$virtualSchemas}, $virtualSchema; } #------------------------------------------------------------------------ =head2 getVirtualSchemaByName Usage : Description: Fetches a virtualSchema entry. Caller : =cut sub getVirtualSchemaByName { my ($self, $virtualSchemaName) = @_; my $retSchema; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ return $virtualSchema if ($virtualSchema->name eq $virtualSchemaName); } return $retSchema; } =head2 getAllVirtualSchemas Usage : usage Description: Description Returntype : Exceptions : none Caller : caller =cut sub getAllVirtualSchemas { my $self = shift; return $self->get('virtualSchemas'); } =head2 removeVirtualSchema Usage : usage Description: Description Returntype : Exceptions : none Caller : caller =cut sub removeVirtualSchema { my ($self,$virtualSchema) = @_; my $virtualSchemas = $self->getAllVirtualSchemas; my $i = 0; foreach my $vs (@$virtualSchemas){ if ($vs->name eq $virtualSchema->name){ splice @$virtualSchemas,$i,1; last; } $i++; } } sub getDatasetsExportingTo { my ($self,$virtualSchema,$to_dataset) = @_; my @exporting_datasets; my $dataset_names = $self->getAllDatasetNames($virtualSchema,1); foreach my $fr_dataset (@{$dataset_names}){ next if ($fr_dataset eq $to_dataset); push @exporting_datasets, $fr_dataset if ($self->getLinkBetween( $virtualSchema,$fr_dataset,$to_dataset)); } return @exporting_datasets; } # only called by linked dataset panel to reverse the order of DBs in linking Menu sub getDatasetsExportingTo_reverseDBs { my ($self,$virtualSchema,$to_dataset) = @_; my @exporting_datasets; my $dataset_names = $self->getAllDatasetNames($virtualSchema,1); foreach my $fr_dataset (@{$dataset_names}) { next if ($fr_dataset eq $to_dataset); push @exporting_datasets, $fr_dataset if ($self->getLinkBetween( $virtualSchema,$fr_dataset,$to_dataset)); } # get LocationName of this dataset my $to_datasetLocationName = $self->getDatasetByName($virtualSchema, $to_dataset)->locationDisplayName(); # maintain the same order of datasets by rearrange them as # the mart of first DS selection should go at the bottom of second dataset list my @allMarts; my %martsHash; my %martsHashDefaultDS; my @revisedOrder; foreach my $virtualSchemaObj (@{$self->getAllVirtualSchemas}) { next unless ($virtualSchemaObj->name eq $virtualSchema); foreach my $location (@{$virtualSchemaObj->getAllLocations(1)}) { next if ($location->displayName() eq $to_datasetLocationName); push @allMarts, $location->displayName(); } push @allMarts, $to_datasetLocationName; } # assigning datasets to their marts, except default datasets, they need to be sorted foreach my $dsName (@exporting_datasets) { my $dsObj = $self->getDatasetByName($virtualSchema, $dsName); my $dbName = $dsObj->locationDisplayName(); if ($dsObj->getConfigurationTree('default')->defaultDataset()) { #push @{$martsHashDefaultDS{$dbName}}, $dsName; push @{$martsHash{$dbName}}, $dsName; } else { push @{$martsHash{$dbName}}, $dsName; } } # sorting default datasets and adding them to their respective mart keys foreach my $dbName (keys %martsHashDefaultDS) { if ($martsHashDefaultDS{$dbName}) { foreach my $dsName (reverse sort(@{$martsHashDefaultDS{$dbName}})) { unshift @{$martsHash{$dbName}}, $dsName; } } } # determine if splitter-line is required or not. # this is only required when linking within the mart and across # marts exists my $splitter_line = 0; $splitter_line = 1 if($martsHash{$to_datasetLocationName} && scalar keys %martsHash > 1); foreach my $martName (@allMarts) { push @revisedOrder, "splitter-line" if($splitter_line && $martName eq $to_datasetLocationName); foreach (@{$martsHash{$martName}}) { push @revisedOrder, $_; } } return @revisedOrder; } #------------------------------------------------------------------------ # internal =head2 __linkDatasets (internal) Usage : $self->linkDatasets( $virtualSchema, $sourceDatasetName, $targetDatasetName); Description: Links two datasets in the same virtualSchema, if possible. Caller : BioMart::Registry =cut sub __linkDatasets { my ($self, $virtualSchema, $sourceDatasetName, $targetDatasetName) = @_; my $sourceDataset = $self->__fetchDatasetEntry($virtualSchema, $sourceDatasetName); return unless ($sourceDataset); my $targetDataset = $self->__fetchDatasetEntry($virtualSchema, $targetDatasetName); return unless ($targetDataset); my $link = BioMart::Links->new($self); $link->sourceDataset($sourceDatasetName); $link->targetDataset($targetDatasetName); my $haveLink = 0; my @sourceInterfaces = split(/\,/,$sourceDataset->interfaces); my @targetInterfaces = split(/\,/,$targetDataset->interfaces); foreach my $importable (@{ $targetDataset->getImportables() }) { foreach my $exportable (@{ $sourceDataset->getExportables() }) { if ($importable->linkName eq $exportable->linkName) { # do not link on DAS or GFF type exp/imp pairs next if ($importable->type() ne 'link' || $exportable->type ne 'link'); # versions must be compatible as well if exists for both next if (($importable->linkVersion && $exportable->linkVersion) && ($importable->linkVersion ne $exportable->linkVersion)); $link->addLink($virtualSchema, $importable->linkName); $haveLink = 1; } } } if ($haveLink) { push @{ $sourceDataset->{'links'} }, $link; push @{ $targetDataset->{'links'} }, $link; } } #------------------------------------------------------------------------ =head2 __fetchDatasetEntry (internal) Usage : my $dataSetEntry = $self->__fetchDatasetEntry($virtualSchema, $DatasetName); Description: Fetches a dataset entry. Throws an exception if the dataset does not exist. Caller : BioMart::Registry =cut sub __fetchDatasetEntry { my ($self, $virtualSchemaName, $dataSetName, $martUser) = @_; return unless ($virtualSchemaName && $dataSetName); my $retDataset; OUTER:foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ next unless ($virtualSchema->name eq $virtualSchemaName); foreach my $location (@{$virtualSchema->getAllLocations}){ next if ($martUser && $location->martUser && $location->martUser ne $martUser); my $dataSetEntry = $location->getDatasetByName($dataSetName); if (defined $dataSetEntry) { return $dataSetEntry; } } } return $retDataset; } #------------------------------------------------------------------------ =head2 __Dijkstra (internal) Usage : my $pathHash = $registry->__Dijkstra($virtualSchema, $sourceDataset); Description: Computes all shortest paths from $sourceDataset within a particular virtualSchema. See "http://en.wikipedia.org/wiki/Dijkstra's_algorithm" Caller : BioMart::Registry::createAllLinks() =cut sub __Dijkstra { my ($self, $virtualSchema, $dataSetName) = @_; my @vertices = $self->getAllDatasetNames($virtualSchema); my %dist; my %path; foreach my $vertex (@vertices) { $dist{$vertex} = INF; } $dist{$dataSetName} = 0; while (scalar @vertices > 0) { my $min_vert_idx = 0; my $min_vert = $vertices[$min_vert_idx]; my $min_dist = $dist{$min_vert}; for (my $vertex_idx = 1; $vertex_idx < scalar @vertices; ++$vertex_idx) { my $vertex = $vertices[$vertex_idx]; if ($dist{$vertex} < $min_dist) { $min_vert_idx = $vertex_idx; $min_vert = $vertex; $min_dist = $dist{$vertex}; } } if ($min_dist == INF) { # Exhausted a disjoint set of datasets. last; } splice @vertices, $min_vert_idx, 1; my @edges = $self->__getLinksFrom($virtualSchema, $min_vert); foreach my $edge (@edges) { my $vertex = $edge->targetDataset(); if ($dist{$vertex} > $dist{$min_vert} + 1) { $dist{$vertex} = $dist{$min_vert} + 1; $path{$vertex} = $min_vert; } } } return \%path; } #------------------------------------------------------------------------ =head2 __cluster (internal) Usage : $registry->__cluster(); Description: Finds all of the connected sets of the datasets within each virtualSchema. Assigns a cluster ID to the members of each set. Caller : BioMart::Registry::createAllLinks() =cut sub __cluster { my $self = shift; my $totalDSCount = shift; my $cluster = 0; my $currentDSCount = 0; foreach my $virtualSchema (@{$self->getAllVirtualSchemas}){ my $vSchema = $virtualSchema->name; foreach my $location (@{$virtualSchema->getAllLocations}){ foreach my $seedDatasetEntry (@{$location->getAllDatasets}){ $currentDSCount++; print STDERR "\r....(clustering) ".$currentDSCount."/".$totalDSCount." "; my $seedDataset = $seedDatasetEntry->name; next if (defined $seedDatasetEntry->{'cluster'}); my @vertices = ( $seedDataset ); ++$cluster; while (scalar @vertices > 0) { my $dataSet = shift @vertices; print STDERR "\r....(clustering) ".$currentDSCount."/".$totalDSCount." - ".(scalar @vertices)." remain "; foreach my $otherDataset ( $self->__getDatasetsImportingFrom($vSchema, $dataSet), $self->__getDatasetsExportingTo($vSchema, $dataSet)) { my $otherDatasetEntry = $self->getDatasetByName ($vSchema,$otherDataset); next if ($otherDatasetEntry->{'cluster'}); push @vertices, $otherDataset; } my $thisEntry = $self->getDatasetByName($vSchema,$dataSet); $thisEntry->{'cluster'} = $cluster; } } } } print STDERR "\n"; } #-------------------------------------------------------------------- =head2 __getDatasetsImportingFrom (internal) Usage : my @dataSetNames = $registry->__getDatasetsImportingFrom($virtualSchema, $DatasetName); Description: Returns the names of the datasets that import from the named dataset within the given virtualSchema. Return type: An array of strings in list context, or a reference to such an array in scalar context. Exceptions : Caller : =cut sub __getDatasetsImportingFrom { my ($self, $virtualSchema, $dataSetName) = @_; my $dataSetEntry = $self->__fetchDatasetEntry($virtualSchema, $dataSetName); my @dataSetNames; foreach my $link (@{ $dataSetEntry->{'links'} }) { my $targetDatasetName = $link->targetDataset(); my $targetDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $targetDatasetName); my $sourceDatasetName = $link->sourceDataset(); my $sourceDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $sourceDatasetName); if ($sourceDatasetName eq $dataSetName) { push @dataSetNames, $targetDatasetName; } } return (wantarray() ? @dataSetNames : \@dataSetNames); } #------------------------------------------------------------------------ =head2 __getDatasetsExportingTo (internal) Usage : my @dataSetNames = $registry->__getDatasetsExportingTo($virtualSchema, $DatasetName); Description: Returns the names of the datasets that export to the named dataset within the given virtualSchema. Return type: An array of strings in list context, or a reference to such an array in scalar context. Exceptions : Caller : =cut sub __getDatasetsExportingTo { my ($self, $virtualSchema, $dataSetName) = @_; my $dataSetEntry = $self->__fetchDatasetEntry($virtualSchema, $dataSetName); my @dataSetNames; foreach my $link (@{ $dataSetEntry->{'links'} }) { my $targetDatasetName = $link->targetDataset(); my $targetDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $targetDatasetName); my $sourceDatasetName = $link->sourceDataset(); my $sourceDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $sourceDatasetName); if ($targetDatasetName eq $dataSetName) { push @dataSetNames, $sourceDatasetName; } } return (wantarray() ? @dataSetNames : \@dataSetNames); } #------------------------------------------------------------------------ =head2 __getLinksFrom (internal) Usage : my @links = $registry->__getLinksFrom($virtualSchema, $DatasetName); Description: Returns all BioMart::Links objects defining a directional link from the named Dataset to a Dataset able to import from the named Dataset within the given virtualSchema. Return type: An array of BioMart::Links objects in list context, or a reference to such an array in scalar context. Exceptions : Caller : =cut sub __getLinksFrom { my ($self, $virtualSchema, $dataSetName) = @_; my $dataSetEntry = $self->__fetchDatasetEntry($virtualSchema, $dataSetName); my @links; foreach my $link (@{ $dataSetEntry->{'links'} }) { my $targetDatasetName = $link->targetDataset(); my $targetDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $targetDatasetName); my $sourceDatasetName = $link->sourceDataset(); my $sourceDatasetEntry = $self->__fetchDatasetEntry($virtualSchema, $sourceDatasetName); if ($sourceDatasetName eq $dataSetName) { push @links, $link; } } return (wantarray() ? @links : \@links); } =head2 settingsParams Usage : $registry->settingsParams($settingsHash); Description: adds all params in hash passed to registry These params come from settings.conf only used by Web.pm and martview Return type: Exceptions : Caller : Initializer's getRegistry() =cut sub settingsParams { my ($self, $hash) = @_; if($hash) { $self->set('settingsConfParams', $hash); } return $self->get('settingsConfParams'); } 1; # vim: et