Raw content of Bio::EnsEMBL::Funcgen::Utils::HealthChecker
=head1 NAME
Bio::EnsEMBL::Funcgen::Utils::Helper
=head1 SYNOPSIS
e.g.
my $object = Bio::EnsEMBL::Object->new
(
logging => 1,
log_file => "/tmp/Misc.log",
debug_level => 2,
debug_file => "/tmp/Misc.dbg",
);
$object->log("This is a log message.");
$object->debug(1,"This is a debug message.");
$object->system("rmdir /tmp/test");
----------------------------------------------------------------------------
=head1 OPTIONS
=over 8
=item B<-debug>
Turns on and defines the verbosity of debugging output, 1-3, default = 0 = off
=over 8
=item B<-log_file|l>
Defines the log file, default = "${instance}.log"
=item B<-help>
Print a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=back
=head1 DESCRIPTION
B performs several health check and update methods prior to release.
=cut
=head1 NOTES
=head1 AUTHOR(S)
Nathan Johnson, njohnson@ebi.ac.uk
=cut
################################################################################
package Bio::EnsEMBL::Funcgen::Utils::HealthChecker;
use strict;
use Bio::EnsEMBL::Funcgen::Utils::Helper;
use Bio::EnsEMBL::Funcgen::ProbeFeature;
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Utils::Exception qw( throw );
use Bio::EnsEMBL::Analysis;
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);
#TO DO
# 1 DONE Print all fails and warnings in summary at end of script.
# 2 validate_RegulatoryFeature_Sets
################################################################################
sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my $self = $class->SUPER::new(@_);
#validate and set type, analysis and feature_set here
my ($db, $builds, $skip_mc, $check_displayable, $skip_analyse, $meta_coord_tables) =
rearrange(['DB', 'BUILDS', 'SKIP_META_COORD', 'CHECK_DISPLAYABLE', 'SKIP_ANALYSE', 'META_COORD_TABLES'], @_);
if (! ($db && ref($db) &&
$db->isa('Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor'))){
throw('You must provide a valid Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor');
}
#test connection
$db->dbc->db_handle;
$self->{'db'} = $db;
$self->{'mysql_connect_string'} = 'mysql -h'.$db->dbc->host.' -u'.$db->dbc->username.' -p'
.$db->dbc->password.' '.$db->dbc->dbname.' -P'.$db->dbc->port;
$self->{'dbname'} = $db->dbc->dbname;
$self->{'builds'} = (scalar(@$builds)>0) ? $builds : ['DEFAULT'];
$self->{'skip_meta_coord'} = $skip_mc;
if(defined $meta_coord_tables){
throw('-skip_meta_coord is set, Cannot build meta_coord entries for tables '.join(', ', @$meta_coord_tables));
if(! ref($meta_coord_tables) eq 'ARRAY'){
throw('-meta_coord_tables parameter must be an array ref');
}
@{$self->{'meta_coord_tables'}} = @$meta_coord_tables;
}
$self->{'skip_analyse'} = $skip_analyse;
$self->{'check_displayable'} = $check_displayable;
#$self->log_header(
return $self;
}
sub db{
my ($self) = @_;
return $self->{'db'};
}
=head2
Arg[0] :
Example :
Description: Wrapper method to perform all common update functions
Returntype :
Exceptions : None
Caller : General
Status : at risk
=cut
sub update_db_for_release{
my ($self, @args) = @_;
if(@args){
}
#do seq_region_update to validate dnadb first
#hence avoiding redoing longer methods
$self->validate_new_seq_regions;#($force_srs);
$self->update_meta_schema_version;
$self->update_meta_coord;
$self->check_meta_strings;
$self->analyse_and_optimise_tables;
$self->log('??? Have you dumped/copied GFF dumps ???');
$self->log("??? Have you diff'd the sql for each species vs. a fresh schema ???");
$self->log('Need to implement check meta string check');
#Log footer? Pass optional counts hash?
$self->log('Finished updating '.$self->{'dbname'}." for release\n\n");
}
sub validate_new_seq_regions{
my ($self, $force) = @_;
#do we need to add the none default levels here?
#or are we only bothered about those which constitute the toplevel?
#To make sure we have all the correct levels in eFG we need to get all the names.
#then get all by name from the core db and set them as the dnadb.
# we also need to get all the toplevel seq_regions and store them in the seq_region table
#use BaseFeatureAdaptor::_pre_store with and array of pseudo feature on each top level slice
#Validate the efgdb and dnadb schema version are the same first
if(! $force){
my $efgdb_sm = join('_', @{$self->get_schema_and_build($self->{'dbname'})});
my $dnadb_sm = join('_', @{$self->get_schema_and_build($self->{'dbname'})});
if($efgdb_sm ne $dnadb_sm){
$self->report("WARNING Skipped validate_new_seq_regions as schema_versions are mismatched:\t".
"efgdb $efgdb_sm\tdnadb $dnadb_sm");
return 0;
}
}
my $pf_adaptor = $self->db->get_ProbeFeatureAdaptor();
my $slice_adaptor = $self->db->dnadb->get_SliceAdaptor();
$self->log_header('Validating new coord_systems/seq_regions');
foreach my $build(@{$self->{'builds'}}){
$self->log("Importing seq_region/coord_system info for build:\t".$build);
foreach my $slice(@{$slice_adaptor->fetch_all('toplevel', $build, 1)}){
#1 is non-reference flag, essential for haplotype regions
if($slice->start() != 1){
$self->log("Reslicing slice:\t".$slice->name());
#we must have some sort of PAR linked region i.e. Y
$slice = $slice_adaptor->fetch_by_region($slice->coord_system_name(), $slice->seq_region_name());
}
#we need test if it needs doing first?
#we would need to test for the coord_systems outside of this loop
#and then for each seq_region inside the loop if the coord_system is present
$self->log("_pre_storing seq_region info for slice:\t".$slice->name());
my $pseudo_feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new
(
-slice => $slice,
-start => 0,
-end => 0,
-strand => 0,
);
$pf_adaptor->_pre_store($pseudo_feature);
#This will create a meta_coord entry of max_length 1 for features which have an absent meta_coord entry
}
}
$self->log("Finished validating seq_regions\n");
return;
}
sub update_meta_schema_version{
my ($self) = @_;
my $schema_version = $self->get_schema_and_build($self->{'dbname'})->[0];
my $sql = 'DELETE from meta where meta_key="schema_version"';
$self->db->dbc->db_handle->do($sql);
$sql = "INSERT into meta(meta_key, species_id, meta_value) values ('schema_version', NULL, '$schema_version')";
$self->db->dbc->db_handle->do($sql);
$self->log_header("Updated meta.schema_version to $schema_version");
}
sub update_meta_coord{
my ($self, @table_names) = @_;
my $species_id = $self->db()->species_id();
if($self->{'skip_meta_coord'}){
$self->log("Skipping meta_coord update\n");
return;
}
$self->log_header('Updating meta_coord table');
#set default table_name
if(! @table_names || scalar(@table_names) == 0){
#Can we do this via DBAdaptor and get all available adaptors which are BaseFeatureAdaptors then grab the first table name
if(defined $self->{'meta_coord_tables'}){
@table_names = @{$self->{'meta_coord_tables'}};
}
else{#default
@table_names = qw(
regulatory_feature
probe_feature
external_feature
annotated_feature
result_feature
);
}
}
#backup meta coord
#if(system("mysql -h$host -P$port -u$user -p$pass -N "
# . "-e 'SELECT * FROM meta_coord' ${species}_funcgen_${schema_build}"
# . "> ${species}_funcgen_${schema_build}.meta_coord.backup"
# ) != 0 ){
if(system($self->{'mysql_connect_string'}." -e 'SELECT * FROM meta_coord'"
. '> '.$self->{'dbname'}.'meta_coord.backup'
) != 0 ){
throw("Can't dump the original meta_coord for back up");#will this get copied to log?
}
else {
$self->log('Original meta_coord table backed up in '. $self->{'dbname'}.'.meta_coord.backup');
}
#Update each max_length for table_name and coord_system
foreach my $table_name(@table_names){
my $sql1 = "select distinct(cs.name), mc.coord_system_id, cs.version, mc.max_length from coord_system cs, meta_coord mc where mc.table_name='$table_name' and mc.coord_system_id=cs.coord_system_id and cs.species_id = $species_id";
$self->log('');
$self->log("Updating meta_coord max_length for $table_name:\n\tname\tcoord_system_id\tversion\tmax_length");
#can we test for emtpy array here? Then skip delete.
my @info = @{$self->db->dbc->db_handle->selectall_arrayref($sql1)};
#log this
map {$self->log("\t".join("\t", @{$_})."\n")} @info;
# Clean old entries
$self->log("Deleting old meta_coord entries");
my $sql = "DELETE mc FROM meta_coord mc, coord_system cs WHERE mc.table_name ='$table_name' and mc.coord_system_id = cs.coord_system_id and cs.species_id = $species_id";
$self->db->dbc->db_handle->do($sql);
# Generate new max_lengths
$self->log("Generating new max_lengths");
#Is this query running for each redundant cs_id?
#would it be more efficient to retrieve the NR cs_ids first and loop the query for each cs_id?
#Can we get the dbID of the largest feature for ease of checking?
#This won't work as we're grouping by coord_system
#would need to select distinct coord_system_id for table first
#This may well slow down quite a bit doing it this way
$sql = "select distinct s.coord_system_id from coord_system cs, seq_region s, $table_name t WHERE t.seq_region_id = s.seq_region_id and s.coord_system_id = cs.coord_system_id and cs.species_id = $species_id";
my @cs_ids = @{$self->db->dbc->db_handle->selectall_arrayref($sql)};
#Convert single element arrayrefs to scalars
map $_ = ${$_}[0], @cs_ids;
$self->log("New max_lengths for $table_name are:");
#wtf?
#map {$self->log(join("\t", @{$_}))} ['coord_system_id', 'max_length', 'longest record dbID'];
$self->log(join("\t", ('coord_system_id', 'max_length', 'longest record dbID')));
foreach my $cs_id(@cs_ids){
#This will always give a length of 1 even if there are no features present
$sql = "SELECT s.coord_system_id, (t.seq_region_end - t.seq_region_start + 1 ) as max, t.${table_name}_id "
. "FROM $table_name t, seq_region s, coord_system cs "
. "WHERE t.seq_region_id = s.seq_region_id "
. "and s.coord_system_id=${cs_id} "
. "and s.coord_system_id = cs.coord_system_id and cs.species_id = $species_id"
. "order by max desc limit 1";
@info = @{$self->db->dbc->db_handle->selectall_arrayref($sql)};
#Convert one multi element array_ref into array
@info = @{$info[0]};
$self->log(join("\t\t", @info));
$sql = "INSERT INTO meta_coord values(\"${table_name}\", \"${cs_id}\", \"$info[1]\")";
#$sql = "INSERT INTO meta_coord "
#. "SELECT '$table_name', s.coord_system_id, "
# . "MAX( t.seq_region_end - t.seq_region_start + 1 ) "
# . "FROM $table_name t, seq_region s "
# . "WHERE t.seq_region_id = s.seq_region_id "
# . "GROUP BY s.coord_system_id";
$self->db->dbc->db_handle->do($sql);
}
#$self->log("New max_lengths for $table_name are:");
#@info = @{$self->db->dbc->db_handle->selectall_arrayref($sql1)};
#map {$self->log(join("\t", @{$_})."\n")} @info;
}
$self->log("Finished updating meta_coord max_lengths\n");
return;
}
sub check_meta_strings{
my ($self, $all_builds) = @_;
$self->log_header('Checking meta strings');
warn "Need to check/update rebuild.version and regbuild.initial_release_date";
#update flag?
my @regf_fsets;
my $passed = 1;
my $fset_a = $self->db->get_FeatureSetAdaptor;
my $mc = $self->db->get_MetaContainer;
my $regf_a = $self->db->get_RegulatoryFeatureAdaptor;
if($all_builds){
@regf_fsets = @{$fset_a->fetch_all_by_type('regulatory')};
}else{
my $fset = $fset_a->fetch_by_name('RegulatoryFeatures');
push @regf_fsets, $fset if defined $fset;
}
#Need to remove/implement this
my @meta_keys = ('reguild.feature_set_ids', 'regbuild.feature_type_ids');
#What about anchor/seed sets?
if(scalar(@regf_fsets) == 0){
$self->report("WARNING: Found no regulatory FeatureSets for check_meta_strings");
}
else{
$self->log_header("Validating meta entries for FeatureSets:\t".join("\t", (map $_->name, @regf_fsets)));
#How do we validate this?
#Check all feature_sets exist
#Pull back some features from a test slice and check the number of bits match.
#Check the feature_type string exists and matches else create.
foreach my $fset(@regf_fsets){
#get version number of build
my (undef, $build_version) = split/v/, $fset->name;
$build_version = (defined $build_version) ? '_v'.$build_version : '';
my $fset_string_key = 'regbuild.feature_set_ids'.$build_version;
my $ftype_string_key = 'regbuild.feature_type_ids'.$build_version;
my $fset_string = $mc->list_value_by_key($fset_string_key)->[0];
my $ftype_string = $mc->list_value_by_key($ftype_string_key)->[0];
$self->log('Validating '.$fset->name.":\n\t$fset_string_key($fset_string) vs $ftype_string_key($ftype_string)");
#Test fset vs ftype string
if(! defined $fset_string && ! defined $ftype_string){
$self->report("FAIL:\tNo $fset_string_key or $ftype_string_key found in meta table");
}
elsif(! defined $fset_string){
$self->report("FAIL:\tNo $fset_string_key found in meta table");
}
else{
my @fset_ids = split/,/, $fset_string;
my @ftype_ids;
my @new_ftype_ids;
my $ftype_fail = 0;
if(defined $ftype_string){
@ftype_ids = split/,/, $ftype_string;
}
else{
$self->report("WARNING:\tNo $ftype_string_key found in meta table, will update using $fset_string_key");
}
#Now need to work backwards through ftypes to remove pseudo ftypes before validating
#New string should be A,A,A;S,S,S,S,S,S;P,P,P
#Where A is and Anchor/Seed set
#S is a supporting set
#P is a pseudo feature type e.g. TSS proximal
if(scalar(@fset_ids) != scalar(@ftype_ids)){
$self->report("FAIL:\tLength mismatch between $fset_string_key and $ftype_string_key");
}
foreach my $i(0..$#fset_ids){
my $supporting_set_id = $fset_ids[$i];
my $sset = $fset_a->fetch_by_dbID($supporting_set_id);
if(! defined $sset){
$self->report("FAIL:\t$fset_string_key $supporting_set_id does not exist in the DB");
}
else{
#test/build ftype string
if(defined $ftype_string){
if($sset->feature_type->dbID != $ftype_ids[$i]){
$ftype_fail = 1;
$self->report("FAIL:\t$fset_string_key $supporting_set_id(".$sset->name.") FeatureType(".$sset->feature_type->name.") does not match $ftype_string_key $ftype_ids[$i]");
}
}
push @new_ftype_ids, $sset->feature_type->dbID;
}
}
#Set ftype_string
#This will not account for pseudo ftypes? Remove!!!?
my $new_ftype_string = join(',', @new_ftype_ids);
if(! defined $ftype_string){
$self->log("Updating $ftype_string_key to:\t$new_ftype_string");
$self->db->dbc->db_handle->do("INSERT into meta values(NULL, '$ftype_string_key', '$new_ftype_string')");
}
elsif($ftype_fail){
$self->report("FAIL:\t$ftype_string_key($ftype_string) does not match $fset_string_key types($new_ftype_string)");
}
#Finally validate versus a reg feat
#Need to change this to ftype string rather than fset string?
my ($regf_dbID) = @{$self->db->dbc->db_handle->selectrow_arrayref('select regulatory_feature_id from regulatory_feature where feature_set_id='.$fset->dbID.' limit 1')};
if(! defined $regf_dbID){
$self->report("FAIL:\tNo RegulatoryFeatures found for FeatureSet ".$fset->name);
}
else{
my $rf_string = $regf_a->fetch_by_dbID($regf_dbID)->{'display_label'};#Direct access to avoid feature type
if(length($rf_string) != scalar(@fset_ids)){
$self->report("FAIL:\tRegulatory string length mismatch between RegulatoryFeature($regf_dbID) and $fset_string_key:\n$rf_string(".length($rf_string).")\n$fset_string(".scalar(@fset_ids).")");
}
}
}
}
}
return;
}
#Change this to log sets and incorporate RegFeat FeatureSet as standard
sub log_data_sets{
my $self = shift;
my ($status);
my $txt = 'Checking ';
$status = 'DISPLAYABLE' if($self->{'check_displayable'});
my @dsets;
my $dsets = $self->db->get_DataSetAdaptor->fetch_all($status);
@dsets = @$dsets if defined $dsets;
$txt .= scalar(@dsets).' ';
$txt.= $status.' ' if($self->{'check_displayable'});
$txt .= 'DataSets';
$self->log_header($txt);
foreach my $dset(@dsets){
$self->log_set("Found DataSet:\t\t", $dset) ;
my $fset = $dset->product_FeatureSet;
$self->log_set("Product FeatureSet:\t", $fset) if $fset;
#my @supporting_sets = @{$dset->get_supporting_sets};
if(my @supporting_sets = @{$dset->get_supporting_sets}){
map $self->log_set("SupportingSet:\t\t", $_), @supporting_sets;
}
}
#$self->log_header('Checking Regulatory FeatureSets');
return;
}
sub log_set{
my ($self, $text, $set) = @_;
#if(! $set->isa('Bio::EnsEMBL::Funcgen::DataSet')){
# $text .= $set->set_type.":\t";
# }
$text .= $set->display_label.'('.$set->name.')';
$text .= "\tDISPLAYABLE" if($set->is_displayable);
$self->log($text);
return;
}
sub check_stable_ids{
my ($self, @slices) = @_;
my $species_id = $self->db()->species_id();
$self->log_header('Checking stable IDs');
my $fset_a = $self->db->get_FeatureSetAdaptor;
my $fset = $fset_a->fetch_by_name('RegulatoryFeatures');
if(! $fset){
$self->report('WARNING: No RegulatoryFeatures FeatureSet found');
}
else{
#Can't count NULL field, so have to count regulatory_ffeature_id!!!
my $sql = 'select count(rf.regulatory_feature_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.stable_id is NULL and rf.seq_region_id = sr.seq_region_id and sr.coord_system_id = cs.coord_system_id and cs.species_id = $species_id and rf.feature_set_id='.$fset->dbID;
#warn "sql is $sql";
my ($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)};
if($null_sids){
$self->report("FAIL: Found a total of $null_sids NULL stable IDs");
my $slice_a = $self->db->get_SliceAdaptor;
if(! @slices){
@slices = @{$slice_a->fetch_all('toplevel', 1)};
}
foreach my $slice(@slices){
my $sr_name=$slice->seq_region_name;
$sql = 'select count(rf.stable_id) from regulatory_feature rf, seq_region sr, coord_system cs where rf.seq_region_id=sr.seq_region_id and sr.name="'.$sr_name.'" and sr.coord_system_id = cs.coord_system_id and cs.species_id = $species_id and rf.stable_id is NULL and rf.feature_set_id='.$fset->dbID;
($null_sids) = @{$self->db->dbc->db_handle->selectrow_arrayref($sql)};
$self->log('Slice '.$slice->name." has $null_sids NULL stable IDs");
}
}
else{
$self->log('No NULL stable IDs found');
}
}
return;
}
sub validate_RegulatoryFeature_Sets{
#checks feature data and supporting sets
#links between DatSet and FeatureSet, i.e. correct naming, not linking to old set
#Displayable
}
sub analyse_and_optimise_tables{
my $self = shift;
#myisamchk --analyze. or analyze statement
if($self->{'skip_analyse'}){
$self->log('Skipping analyse/optimise tables');
return;
}
my $sql = 'show tables;';
my @tables = @{$self->db->dbc->db_handle->selectall_arrayref($sql)};
map $_ = "@{$_}", @tables;
my $analyse_sql = 'analyze table ';
my $optimise_sql = 'optimize table ';
foreach my $table(@tables){
$self->log("Analysing and optimising table $table:");
my @anal_info = @{$self->db->dbc->db_handle->selectall_arrayref($analyse_sql.$table)};
foreach my $line_ref(@anal_info){
$self->log(join("\t", @$line_ref));
}
my @opt_info = @{$self->db->dbc->db_handle->selectall_arrayref($optimise_sql.$table)};
foreach my $line_ref(@opt_info){
$self->log(join("\t", @$line_ref));
}
}
return;
}
### Check for regulatory meta entries for all regulatory feature_sets
1;