Raw content of Bio::EnsEMBL::Compara::RunnableDB::HclusterPrepare
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::RunnableDB::HclusterPrepare
=cut
=head1 SYNOPSIS
my $aa = $sdba->get_AnalysisAdaptor;
my $analysis = $aa->fetch_by_logic_name('HclusterPrepare');
my $rdb = new Bio::EnsEMBL::Compara::RunnableDB::HclusterPrepare(
-input_id => "{'species_set'=>[1,2,3,14]}",
-analysis => $analysis);
$rdb->fetch_input
$rdb->run;
=cut
=head1 DESCRIPTION
Blah
=cut
=head1 CONTACT
Contact Albert Vilella on module implemetation/design detail: avilella@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::RunnableDB::HclusterPrepare;
use strict;
use Switch;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive;
use Bio::EnsEMBL::Compara::NestedSet;
use Bio::EnsEMBL::Compara::Homology;
use Bio::EnsEMBL::Compara::Graph::ConnectedComponents;
use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
use Time::HiRes qw(time gettimeofday tv_interval);
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
sub fetch_input {
my( $self) = @_;
$self->{'species_set'} = undef;
$self->throw("No input_id") unless defined($self->input_id);
#create a Compara::DBAdaptor which shares the same DBI handle
#with the Pipeline::DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
$self->{gdba} = $self->{'comparaDBA'}->get_GenomeDBAdaptor;
$self->get_params($self->parameters);
my $input_gdb_id = $self->input_id;
my $gdb = $self->{gdba}->fetch_by_dbID($input_gdb_id);
throw("no genome_db for $input_gdb_id") unless(defined($gdb));
$self->{gdb} = $gdb;
return 1;
}
sub get_params {
my $self = shift;
my $param_string = shift;
return if ($param_string eq "1");
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n");
my $params = eval($param_string);
return unless($params);
foreach my $key (keys %$params) {
print(" $key : ", $params->{$key}, "\n");
}
if (defined $params->{'species_set'}) {
$self->{'species_set'} = $params->{'species_set'};
}
if (defined $params->{'fasta_dir'}) {
$self->{'fasta_dir'} = $params->{'fasta_dir'};
}
if (defined $params->{'outgroups'}) {
foreach my $outgroup (@{$params->{'outgroups'}}) {
$self->{outgroups}{$outgroup} = 1;
}
}
print("parameters...\n");
printf(" fasta_dir : %d\n", $self->{'fasta_dir'});
printf(" species_set : (%s)\n", join(',', @{$self->{'species_set'}}));
printf(" outgroups : (%s)\n", join(',', keys %{$self->{'outgroups'}}));
return;
}
sub run
{
my $self = shift;
$self->analyze_table();
$self->fetch_categories();
$self->fetch_distances();
return 1;
}
sub write_output {
my $self = shift;
# $self->store_clusters;
# $self->dataflow_clusters;
# # modify input_job so that it now contains the clusterset_id
# my $outputHash = {};
# $outputHash = eval($self->input_id) if(defined($self->input_id) && $self->input_id =~ /^\s*\{.*\}\s*$/);
# $outputHash->{'clusterset_id'} = $self->{'clusterset_id'};
# my $output_id = $self->encode_hash($outputHash);
# $self->input_job->input_id($output_id);
return 1;
}
##########################################
#
# internal methods
#
##########################################
# This will make sure that the indexes for paf are fine
sub analyze_table {
my $self = shift;
my $starttime = time();
my $gdb = $self->{gdb};
my $gdb_id = $gdb->dbID;
my $species_name = lc($gdb->name);
$species_name =~ s/\ /\_/g;
my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
# Re-enable the keys before starting the queries
my $sql = "ALTER TABLE $tbl_name ENABLE KEYS";
print("$sql\n") if ($self->debug);
my $sth = $self->dbc->prepare($sql);
$sth->execute();
$sql = "ANALYZE TABLE $tbl_name";
#print("$sql\n");
$sth = $self->dbc->prepare($sql);
$sth->execute();
printf(" %1.3f secs to ANALYZE TABLE\n", (time()-$starttime));
}
sub fetch_distances {
my $self = shift;
return unless($self->{'gdb'});
my $gdb = $self->{'gdb'};
return unless $gdb;
my $starttime = time();
my $gdb_id = $gdb->dbID;
my $species_name = lc($gdb->name);
$species_name =~ s/\ /\_/g;
my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
my $species_set_string = join (",",@{$self->{species_set}});
my $sql = "SELECT ".
"concat(qmember_id,'_',qgenome_db_id), ".
"concat(hmember_id,'_',hgenome_db_id), ".
"IF(evalue<1e-199,100,ROUND(-log10(evalue)/2)) ".
"FROM $tbl_name WHERE qgenome_db_id=$gdb_id and hgenome_db_id in ($species_set_string);";
print("$sql\n");
my $sth = $self->dbc->prepare($sql);
$sth->execute();
printf("%1.3f secs to execute\n", (time()-$starttime));
print(" done with fetch\n");
my $filename = $self->{fasta_dir} . "/" . "$tbl_name.hcluster.txt";
open FILE, ">$filename" or die $!;
while ( my $ref = $sth->fetchrow_arrayref() ) {
my ($query_id, $hit_id, $score) = @$ref;
print FILE "$query_id\t$hit_id\t$score\n";
}
$sth->finish;
close FILE;
printf("%1.3f secs to process\n", (time()-$starttime));
}
sub fetch_categories {
my $self = shift;
return unless($self->{'gdb'});
my $gdb = $self->{'gdb'};
return unless $gdb;
my $starttime = time();
my $gdb_id = $gdb->dbID;
my $species_name = lc($gdb->name);
$species_name =~ s/\ /\_/g;
my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
my $sql = "SELECT DISTINCT ".
"qmember_id ".
"FROM $tbl_name WHERE qgenome_db_id=$gdb_id;";
print("$sql\n");
my $sth = $self->dbc->prepare($sql);
$sth->execute();
printf("%1.3f secs to execute\n", (time()-$starttime));
print(" done with fetch\n");
my $filename = $self->{fasta_dir} . "/" . "$tbl_name.hcluster.cat";
my $outgroup = 1;
$outgroup = 2 if (defined($self->{outgroups}{$gdb_id}));
my $member_id_hash;
while ( my $ref = $sth->fetchrow_arrayref() ) {
my ($member_id) = @$ref;
$member_id_hash->{$member_id} = 1;
}
$sth->finish;
printf("%1.3f secs to gather distinct\n", (time()-$starttime));
open FILE, ">$filename" or die $!;
foreach my $member_id (keys %$member_id_hash) {
print FILE "$member_id"."_","$gdb_id\t$outgroup\n";
}
close FILE;
printf("%1.3f secs to process\n", (time()-$starttime));
}
1;