Raw content of Bio::EnsEMBL::Compara::RunnableDB::UpdatePAFIds
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::RunnableDB::UpdatePAFIds
=cut
=head1 SYNOPSIS
my $aa = $sdba->get_AnalysisAdaptor;
my $analysis = $aa->fetch_by_logic_name('UpdatePAFIds');
my $rdb = new Bio::EnsEMBL::Compara::RunnableDB::UpdatePAFIds(
-input_id => 1,
-analysis => $analysis);
$rdb->fetch_input
$rdb->run;
=cut
=head1 DESCRIPTION
This is a compara specific runnableDB, that based on an input_id
of arrayrefs of genome_db_ids, and from this species set relationship
it will search through the peptide_align_feature data and build
SingleLinkage Clusters and store them into a NestedSet datastructure.
This is the first step in the ProteinTree analysis production system.
=cut
=head1 CONTACT
Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
Contact Abel Ureta-Vidal on EnsEMBL/Compara: abel@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::RunnableDB::UpdatePAFIds;
use strict;
use Switch;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive;
use Time::HiRes qw(time gettimeofday tv_interval);
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
sub fetch_input {
my( $self) = @_;
$self->{'species_set'} = undef;
#create a Compara::DBAdaptor which shares the same DBI handle
#with the pipeline DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
$self->{gdba} = $self->{'comparaDBA'}->get_GenomeDBAdaptor;
$self->get_params($self->parameters);
my @species_set = @{$self->{'species_set'}};
my %seen;
foreach my $gdb_id (@species_set) {
next if (defined($seen{$gdb_id})); # Make sure we dont have repeated gdbs, specially for setS in Old Homology
my $gdb = $self->{gdba}->fetch_by_dbID($gdb_id);
push @{$self->{'genomeDB_set'}}, $gdb;
$seen{$gdb_id} = 1;
}
# Before we were using all the species in genome_db, which is ok for
# EnsEMBL Compara, but could cause problems for people running their
# stuff on subsets of genome_db
# $self->{'genomeDB_set'} = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_all;
return 1;
}
sub get_params {
my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n");
my $params = eval($param_string);
return unless($params);
foreach my $key (keys %$params) {
print(" $key : ", $params->{$key}, "\n");
}
# Species_set is usually for the new genetree pipeline
if (defined $params->{'species_set'}) {
$self->{'species_set'} = $params->{'species_set'};
}
# Species_sets is usually for the old homology pipeline
if (defined $params->{'species_sets'}) {
foreach my $species_set (@{$params->{'species_sets'}}) {
push @{$self->{'species_set'}}, @$species_set;
}
}
return;
}
sub run
{
my $self = shift;
$self->updatepafids();
return 1;
}
sub write_output {
my $self = shift;
return 1;
}
##########################################
#
# internal methods
#
##########################################
# This will make sure that the indexes for paf are fine
sub updatepafids {
my $self = shift;
my $starttime = time();
my @tbl_names;
foreach my $gdb (@{$self->{'genomeDB_set'}}) {
my $gdb_id = $gdb->dbID;
my $species_name = lc($gdb->name);
$species_name =~ s/\ /\_/g;
my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
push @tbl_names, $tbl_name;
}
# Find all the max, start from the smallest
my $top_max;
foreach my $tbl_name (sort @tbl_names) {
my $sql = "SELECT MAX(peptide_align_feature_id) as max".
" FROM $tbl_name";
my $sth = $self->dbc->prepare($sql);
$sth->execute();
my $first_offset_hash = $sth->fetchrow_hashref;
my $first_offset = $first_offset_hash->{max};
$top_max->{$first_offset} = $tbl_name;
}
my ($first_tbl_name, @rest_tbl_names) = map {$top_max->{$_}} sort {$b<=>$a} keys %{$top_max};
# First offset -- first table remains as it is
my $sql = "SELECT MAX(peptide_align_feature_id) as max".
" FROM $first_tbl_name";
my $sth = $self->dbc->prepare($sql);
$sth->execute();
my $first_offset_hash = $sth->fetchrow_hashref;
my $first_offset = $first_offset_hash->{max};
# Subsequent offsets -- subsequent tables are offsetted
foreach my $tbl_name (sort @rest_tbl_names) {
my $sql = "SELECT MIN(peptide_align_feature_id) as min".
" FROM $tbl_name";
my $sth = $self->dbc->prepare($sql);
$sth->execute();
my $offset_hash = $sth->fetchrow_hashref;
my $offset = $offset_hash->{min};
if ($offset > 1) {
$sql = "SELECT MAX(peptide_align_feature_id) as max".
" FROM $tbl_name";
$sth = $self->dbc->prepare($sql);
$sth->execute();
my $second_offset_hash = $sth->fetchrow_hashref;
my $second_offset = $second_offset_hash->{max};
$first_offset = $second_offset;
next;
} # Dont reupdate it if done before
$sql = "SELECT MAX(peptide_align_feature_id) as max".
" FROM $tbl_name";
$sth = $self->dbc->prepare($sql);
$sth->execute();
my $second_offset_hash = $sth->fetchrow_hashref;
my $second_offset = $second_offset_hash->{max};
# my $sql2 = "UPDATE $tbl_name".
# " SET peptide_align_feature_id=peptide_align_feature_id+$first_offset";
# my $sth2 = $self->dbc->prepare($sql2);
# print STDERR "Executing [", $sth2->sql, "].\n";
# $sth2->execute();
#####
my $temp_tbl_name = $tbl_name . "_temp";
my $sql2 = "CREATE TABLE $temp_tbl_name LIKE $tbl_name";
my $sth2 = $self->dbc->prepare($sql2);
print STDERR "Executing [", $sth2->sql, "].\n";
$sth2->execute();
$sql2 = "ALTER TABLE $temp_tbl_name AUTO_INCREMENT=$first_offset";
$sth2 = $self->dbc->prepare($sql2);
print STDERR "Executing [", $sth2->sql, "].\n";
$sth2->execute();
$sql2 = "ALTER TABLE $temp_tbl_name DISABLE KEYS";
$sth2 = $self->dbc->prepare($sql2);
print STDERR "Executing [", $sth2->sql, "].\n";
$sth2->execute();
$sql2 = "INSERT INTO $temp_tbl_name (qmember_id, hmember_id, qgenome_db_id, hgenome_db_id, analysis_id, qstart, qend, hstart, hend, score, evalue, align_length, identical_matches, perc_ident, positive_matches, perc_pos, hit_rank, cigar_line) select qmember_id, hmember_id, qgenome_db_id, hgenome_db_id, analysis_id, qstart, qend, hstart, hend, score, evalue, align_length, identical_matches, perc_ident, positive_matches, perc_pos, hit_rank, cigar_line FROM $tbl_name";
$sth2 = $self->dbc->prepare($sql2);
print STDERR "Executing [", $sth2->sql, "].\n";
$sth2->execute();
$sql2 = "DROP TABLE $tbl_name";
$sth2 = $self->dbc->prepare($sql2);
print STDERR "Executing [", $sth2->sql, "].\n";
$sth2->execute();
$sql2 = "RENAME TABLE $temp_tbl_name TO $tbl_name";
$sth2 = $self->dbc->prepare($sql2);
print STDERR "Executing [", $sth2->sql, "].\n";
$sth2->execute();
#####
$first_offset += $second_offset;
}
printf(" %1.3f secs to Update PAF Ids\n", (time()-$starttime));
}
1;