HclusterPrepare documentation.

my $aa = $sdba->get_AnalysisAdaptor;
my $analysis = $aa->fetch_by_logic_name('HclusterPrepare');
my $rdb = new Bio::EnsEMBL::Compara::RunnableDB::HclusterPrepare(
-input_id => "{'species_set'=>[1,2,3,14]}",
-analysis => $analysis);
$rdb->fetch_input
$rdb->run;

sub analyze_table {

  my $self = shift;

  my $starttime = time();

  my $gdb = $self->{gdb};
  my $gdb_id = $gdb->dbID;
  my $species_name = lc($gdb->name);
  $species_name =~ s/\ /\_/g;
  my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
  # Re-enable the keys before starting the queries
  my $sql = "ALTER TABLE $tbl_name ENABLE KEYS";

  print("$sql\n") if ($self->debug);
  my $sth = $self->dbc->prepare($sql);
  $sth->execute();

  $sql = "ANALYZE TABLE $tbl_name";

  #print("$sql\n");
  $sth = $self->dbc->prepare($sql);
  $sth->execute();
  printf("  %1.3f secs to ANALYZE TABLE\n", (time()-$starttime));

}

sub fetch_categories {

  my $self = shift;

  return unless($self->{'gdb'});
  my $gdb = $self->{'gdb'};
  return unless $gdb;

  my $starttime = time();


  my $gdb_id = $gdb->dbID;
  my $species_name = lc($gdb->name);
  $species_name =~ s/\ /\_/g;
  my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
  my $sql = "SELECT DISTINCT ".
            "qmember_id ".
             "FROM $tbl_name WHERE qgenome_db_id=$gdb_id;";
  print("$sql\n");
  my $sth = $self->dbc->prepare($sql);
  $sth->execute();
  printf("%1.3f secs to execute\n", (time()-$starttime));
  print("  done with fetch\n");

  my $filename = $self->{fasta_dir} . "/" . "$tbl_name.hcluster.cat";

  my $outgroup = 1;
  $outgroup = 2 if (defined($self->{outgroups}{$gdb_id}));

  my $member_id_hash;
  while ( my $ref  = $sth->fetchrow_arrayref() ) {
    my ($member_id) = @$ref;
    $member_id_hash->{$member_id} = 1;
  }
  $sth->finish;
  printf("%1.3f secs to gather distinct\n", (time()-$starttime));
  open FILE, ">$filename" or die $!;
  foreach my $member_id (keys %$member_id_hash) {
    print FILE "$member_id"."_","$gdb_id\t$outgroup\n";
  }
  close FILE;
  printf("%1.3f secs to process\n", (time()-$starttime));
}

1;

}

sub fetch_distances {

  my $self = shift;

  return unless($self->{'gdb'});
  my $gdb = $self->{'gdb'};
  return unless $gdb;

  my $starttime = time();

  my $gdb_id = $gdb->dbID;
  my $species_name = lc($gdb->name);
  $species_name =~ s/\ /\_/g;
  my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
  my $species_set_string = join (",",@{$self->{species_set}});
  my $sql = "SELECT ".
            "concat(qmember_id,'_',qgenome_db_id), ".
            "concat(hmember_id,'_',hgenome_db_id), ".
            "IF(evalue<1e-199,100,ROUND(-log10(evalue)/2)) ".
             "FROM $tbl_name WHERE qgenome_db_id=$gdb_id and hgenome_db_id in ($species_set_string);";
  print("$sql\n");
  my $sth = $self->dbc->prepare($sql);
  $sth->execute();
  printf("%1.3f secs to execute\n", (time()-$starttime));
  print("  done with fetch\n");
  my $filename = $self->{fasta_dir} . "/" . "$tbl_name.hcluster.txt";
  open FILE, ">$filename" or die $!;
  while ( my $ref  = $sth->fetchrow_arrayref() ) {
    my ($query_id, $hit_id, $score) = @$ref;
    print FILE "$query_id\t$hit_id\t$score\n";
  }
  $sth->finish;
  close FILE;
  printf("%1.3f secs to process\n", (time()-$starttime));

}

sub fetch_input {

  my( $self) = @_;

  $self->{'species_set'} = undef;
  $self->throw("No input_id") unless defined($self->input_id);

  #create a Compara::DBAdaptor which shares the same DBI handle
  #with the Pipeline::DBAdaptor that is based into this runnable
  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
  $self->{gdba} = $self->{'comparaDBA'}->get_GenomeDBAdaptor;

  $self->get_params($self->parameters);

  my $input_gdb_id = $self->input_id;
  my $gdb = $self->{gdba}->fetch_by_dbID($input_gdb_id);
  throw("no genome_db for $input_gdb_id") unless(defined($gdb));
  $self->{gdb} = $gdb;

  return 1;

}

sub get_params {

  my $self         = shift;
  my $param_string = shift;

  return if ($param_string eq "1");

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n");

  my $params = eval($param_string);
  return unless($params);

  foreach my $key (keys %$params) {
    print("  $key : ", $params->{$key}, "\n");
  }

  if (defined $params->{'species_set'}) {
    $self->{'species_set'} = $params->{'species_set'};
  }
  if (defined $params->{'fasta_dir'}) {
    $self->{'fasta_dir'} = $params->{'fasta_dir'};
  }
  if (defined $params->{'outgroups'}) {
    foreach my $outgroup (@{$params->{'outgroups'}}) {
      $self->{outgroups}{$outgroup} = 1;
    }
  }

  print("parameters...\n");
  printf("  fasta_dir    : %d\n", $self->{'fasta_dir'});
  printf("  species_set  : (%s)\n", join(',', @{$self->{'species_set'}}));
  printf("  outgroups    : (%s)\n", join(',', keys %{$self->{'outgroups'}}));

  return;

}

sub write_output {

  my $self = shift;

#   $self->store_clusters;
#   $self->dataflow_clusters;

#   # modify input_job so that it now contains the clusterset_id
#   my $outputHash = {};
#   $outputHash = eval($self->input_id) if(defined($self->input_id) && $self->input_id =~ /^\s*\{.*\}\s*$/);
#   $outputHash->{'clusterset_id'} = $self->{'clusterset_id'};
#   my $output_id = $self->encode_hash($outputHash);
#   $self->input_job->input_id($output_id);

  return 1;
}

##########################################
#
# internal methods
#
##########################################

# This will make sure that the indexes for paf are fine

}

  Contact Albert Vilella on module implemetation/design detail: avilella@ebi.ac.uk
  Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk

analyze_table	No description	Code
fetch_categories	No description	Code
fetch_distances	No description	Code
fetch_input	No description	Code
get_params	No description	Code
run	No description	Code
write_output	No description	Code