PAFCluster documentation.

sub BRH_grow_for_species {

  my $self = shift;
  my ($gdb_id1, $gdb_id2) = @_;

  return unless($self->{'include_brh'});

  my $starttime = time();

  my $gdb1 = $self->{gdba}->fetch_by_dbID($gdb_id1);
  my $species_name1 = lc($gdb1->name);
  $species_name1 =~ s/\ /\_/g;
  my $tbl_name1 = "peptide_align_feature"."_"."$species_name1"."_"."$gdb_id1";
  my $gdb2 = $self->{gdba}->fetch_by_dbID($gdb_id2);
  my $species_name2 = lc($gdb2->name);
  $species_name2 =~ s/\ /\_/g;
  my $tbl_name2 = "peptide_align_feature"."_"."$species_name2"."_"."$gdb_id2";

  my $sql = "SELECT paf1.qmember_id, paf1.hmember_id, paf1.score, paf1.hit_rank ".
            "FROM $tbl_name1 paf1 ".
            "JOIN $tbl_name2 paf2 ".
            "  ON( paf1.qmember_id = paf2.hmember_id and paf1.hmember_id = paf2.qmember_id)  ".
            "WHERE paf1.qgenome_db_id = $gdb_id1 AND paf1.hgenome_db_id = $gdb_id2 ".
            "AND   paf2.qgenome_db_id = $gdb_id2 AND paf2.hgenome_db_id = $gdb_id1 ".
            "AND paf1.hit_rank=1 and paf2.hit_rank =1";

  print("$sql\n");
  my $sth = $self->dbc->prepare($sql);
  $sth->execute();
  printf("  %1.3f secs to fetch BRHs via PAF\n", (time()-$starttime));

  my $midtime = time();
  my $paf_counter=0;
  while( my $ref  = $sth->fetchrow_arrayref() ) {
    my ($pep1_id, $pep2_id, $score, $hit_rank) = @$ref;
    $paf_counter++;
    #my $pep_pair = [$pep1_id, $pep2_id];
    #$self->grow_memclusters_with_peppair($pep_pair);
    $self->{'ccEngine'}->add_connection($pep1_id, $pep2_id);
  }

  printf("  %d clusters so far\n", $self->{'ccEngine'}->get_cluster_count);
  printf("  %d members in hash\n", $self->{'ccEngine'}->get_component_count);
  printf("  %1.3f secs to process %d BRH PAFs\n", time()-$midtime, $paf_counter);
  printf("  %1.3f secs to load/process\n", (time()-$starttime));

}

sub analyze_table {

  my $self = shift;

  my $starttime = time();

  foreach my $gdb_id (@{$self->{'species_set'}}) {
    my $gdb = $self->{gdba}->fetch_by_dbID($gdb_id);
    my $species_name = lc($gdb->name);
    $species_name =~ s/\ /\_/g;
    my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
    $DB::single=1;1;
    # Re-enable the keys before starting the queries
    my $sql = "ALTER TABLE $tbl_name ENABLE KEYS";

    #print("$sql\n");
    my $sth = $self->dbc->prepare($sql);
    $sth->execute();

    $sql = "ANALYZE TABLE $tbl_name";

    #print("$sql\n");
    $sth = $self->dbc->prepare($sql);
    $sth->execute();
  }
  printf("  %1.3f secs to ANALYZE TABLE\n", (time()-$starttime));

}

sub build_paf_clusters {

  my $self = shift;

  return unless($self->{'species_set'});
  my @species_set = @{$self->{'species_set'}};
  return unless @species_set;

  my $starttime = time();

  # create ConnectedComponents cluster building engine
  $self->{'ccEngine'} = new Bio::EnsEMBL::Compara::Graph::ConnectedComponents;

  #
  # load all the self equal hits for each genome so we have our reference score
  #

  $self->fetch_selfhit_score;

  #
  # for each species pair, get all 'high scoring' hits and build clusters
  #

  while (my $gdb_id1 = shift @species_set) {
    #first get paralogues
    $self->threshold_grow_for_species($gdb_id1);

    foreach my $gdb_id2 (@species_set) {
      $starttime = time();
      $self->BRH_grow_for_species($gdb_id1, $gdb_id2);
      $self->threshold_grow_for_species($gdb_id1, $gdb_id2);
    }
  }
}


#########################################################################
#
# new fast algorithm idea:
#  1) use light weight query to get 'homologies' as a peptide_pair
#     array reference of two member_ids
#  2) use NestedSet/AlignedMember objects in light-weight mode
#     by only storing member_ids
#  3) build clusters in memory (uses very little now)
#  4) store
#
#########################################################################

}

sub fetch_input {

  my( $self) = @_;

  $self->{'species_set'} = undef;
  $self->throw("No input_id") unless defined($self->input_id);

  #create a Compara::DBAdaptor which shares the same DBI handle
  #with the pipeline DBAdaptor that is based into this runnable
  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
  $self->{gdba} = $self->{'comparaDBA'}->get_GenomeDBAdaptor;
  $self->{'selfhit_score_hash'} = {};
  $self->{'no_filters'} = 0;
  $self->{'all_bests'} = 0;
  $self->{'include_brh'} = 1;
  $self->{'bsr_threshold'} = 0.25;
  $self->{'clusterset_id'} = undef;

  $self->get_params($self->parameters);
  $self->get_params($self->input_id);

  my @species_set = @{$self->{'species_set'}};
  $self->{'cluster_mlss'} = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
  $self->{'cluster_mlss'}->method_link_type('PROTEIN_TREES');
  my @genomeDB_set;
  foreach my $gdb_id (@species_set) {
    my $gdb = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_dbID($gdb_id);
    throw("print gdb not defined for gdb_id = $gdb_id\n") unless (defined $gdb);
    push @genomeDB_set, $gdb;
  }
  $self->{'cluster_mlss'}->species_set(\@genomeDB_set);

  return 1;

}

sub fetch_selfhit_score {

  my $self= shift;

  return undef unless(($self->{'bsr_threshold'} >0.0) and ($self->{'bsr_threshold'} < 1.0));

  my $starttime = time();
  foreach my $gdb_id (@{$self->{'species_set'}}) {
    my $gdb = $self->{gdba}->fetch_by_dbID($gdb_id);
    my $species_name = lc($gdb->name);
    $species_name =~ s/\ /\_/g;
    my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
    my $sql = "SELECT qmember_id, score ".
      "FROM $tbl_name paf ".
        "WHERE qmember_id=hmember_id ".
          "AND qgenome_db_id=$gdb_id";
    print("$sql\n");
    my $sth = $self->dbc->prepare($sql);
    $sth->execute();
    print("  done with fetch\n");
    while ( my $ref  = $sth->fetchrow_arrayref() ) {
      my ($member_id, $score) = @$ref;
      $self->{'selfhit_score_hash'}->{$member_id} = $score;
    }
    $sth->finish;
  }
  printf("%1.3f secs to process\n", (time()-$starttime));

}

sub get_params {

  my $self         = shift;
  my $param_string = shift;

  return if ($param_string eq "1");

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n");

  my $params = eval($param_string);
  return unless($params);

  foreach my $key (keys %$params) {
    print("  $key : ", $params->{$key}, "\n");
  }

  if (defined $params->{'species_set'}) {
    $self->{'species_set'} = $params->{'species_set'};
  }
  if (defined $params->{'gene_stable_id'}) {
    $self->{'gene_stable_id'} = $params->{'gene_stable_id'};
  }
  if (defined $params->{'all_best'}) {
    $self->{'all_bests'} = $params->{'all_best'};
  }
  if (defined $params->{'no_filters'}) {
    $self->{'no_filters'} = $params->{'no_filters'};
  }
  if (defined $params->{'bsr_threshold'}) {
    $self->{'bsr_threshold'} = $params->{'bsr_threshold'};
  }
  if (defined $params->{'brh'}) {
    $self->{'include_brh'} = $params->{'brh'};
  }
  $self->{'max_gene_count'} = 1000000;
  if (defined $params->{'max_gene_count'}) {
    $self->{'max_gene_count'} = $params->{'max_gene_count'};
  }

  print("parameters...\n");
  printf("  species_set    : (%s)\n", join(',', @{$self->{'species_set'}}));
  printf("  BRH            : %d\n", $self->{'include_brh'});
  printf("  all_blast_hits : %d\n", $self->{'no_filters'});
  printf("  all_bests      : %d\n", $self->{'all_bests'});
  printf("  bsr_threshold  : %1.3f\n", $self->{'bsr_threshold'});

  return;

}

sub store_clusters {

  my $self = shift;

  return unless($self->{'species_set'});
#  my @species_set = @{$self->{'species_set'}};
#  return unless @species_set;
  return unless ($self->{'cluster_mlss'});

  my $mlssDBA = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor;
  my $pafDBA = $self->{'comparaDBA'}->get_PeptideAlignFeatureAdaptor;
  my $treeDBA = $self->{'comparaDBA'}->get_ProteinTreeAdaptor;
  my $starttime = time();

  my $clusterset = $self->{'ccEngine'}->clusterset;
  throw("no clusters generated") unless($clusterset);

  $clusterset->name("PROTEIN_TREES");
  $treeDBA->store_node($clusterset);
  printf("root_id %d\n", $clusterset->node_id);
  $self->{'clusterset_id'} = $clusterset->node_id;

  #
  # create Cluster MLSS
  #
#  $self->{'cluster_mlss'} = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
#  $self->{'cluster_mlss'}->method_link_type('PROTEIN_TREES');
#  my @genomeDB_set;
#  foreach my $gdb_id (@species_set) {
#    my $gdb = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_dbID($gdb_id);
#
#    push @genomeDB_set, $gdb;
#  }
#  $self->{'cluster_mlss'}->species_set(\@genomeDB_set);
  $mlssDBA->store($self->{'cluster_mlss'});
  printf("MLSS %d\n", $self->{'cluster_mlss'}->dbID);

  #
  # Go through all the leaves which were generated by ConnectedComponents
  # and convert them into AlignedMember objects with additional data
  # to allow them to be stored correctly
  #
  my $mlss_id = $self->{'cluster_mlss'}->dbID;
  my $leaves = $clusterset->get_all_leaves;
  foreach my $leaf (@$leaves) {
    #leaves are NestedSet objects, bless to make into AlignedMember objects
    bless $leaf, "Bio::EnsEMBL::Compara::AlignedMember";

    #the building method uses member_id's to reference unique nodes
    #which are stored in the node_id value, copy to member_id
    $leaf->member_id($leaf->node_id);
    $leaf->method_link_species_set_id($mlss_id);
  }


  printf("storing the clusters\n");
  printf("    loaded %d leaves\n", scalar(@$leaves));
  my $count=0;
  foreach my $mem (@$leaves) { $count++ if($mem->isa('Bio::EnsEMBL::Compara::AlignedMember'));}
  printf("    loaded %d leaves which are members\n", $count);
  printf("    loaded %d members in hash\n", $self->{'ccEngine'}->get_component_count);
  printf("    %d clusters generated\n", $self->{'ccEngine'}->get_cluster_count);

  my $clusters = $clusterset->children;
  my $counter=1;
  foreach my $cluster (@{$clusters}) {
    $treeDBA->store($cluster);

    #calc residue count total
    my $leafcount = scalar(@{$cluster->get_all_leaves});
    $cluster->store_tag('gene_count', $leafcount);
    $cluster->store_tag('include_brh', $self->{'include_brh'});
    $cluster->store_tag('bsr_threshold', $self->{'bsr_threshold'});

    if($counter++ % 200 == 0) { printf("%10d clusters stored\n", $counter); }
  }
  printf("  %1.3f secs to store clusters\n", (time()-$starttime));
  printf("tree_root : %d\n", $clusterset->node_id);

}

sub threshold_grow_for_species {

  my $self = shift;
  my @species_set = @_;

  return undef unless
    ($self->{'all_bests'} or 
     (($self->{'bsr_threshold'} >0.0) and ($self->{'bsr_threshold'} < 1.0)));

  my $starttime = time();
  my $species_string = "(" . join(',', @species_set) . ")";

  my $midtime;
  my $paf_counter;
  my $included_pair_count;
  my $included_bests_count;
  foreach my $gdb_id (@species_set) {
    my $gdb = $self->{gdba}->fetch_by_dbID($gdb_id);
    my $species_name = lc($gdb->name);
    $species_name =~ s/\ /\_/g;
    my $tbl_name = "peptide_align_feature"."_"."$species_name"."_"."$gdb_id";
    my $sql = "SELECT paf.qmember_id, paf.hmember_id, paf.score, paf.hit_rank ".
      "FROM $tbl_name paf ".
        "WHERE paf.qmember_id != paf.hmember_id ".
          "AND paf.hgenome_db_id in $species_string ";
    # !! why do we have that here? Surely adding this constraint we may loose within_species_paralogues...
    if(scalar(@species_set) > 1) {
      $sql .= "AND paf.qgenome_db_id != paf.hgenome_db_id ";
    }
    print("$sql\n");
    my $sth = $self->dbc->prepare($sql);
    $sth->execute();
    printf("  %1.3f secs to fetch PAFs\n", (time()-$starttime));

    $midtime = time();
    $paf_counter=0;
    $included_pair_count=0;
    $included_bests_count=0;
    while( my $ref  = $sth->fetchrow_arrayref() ) {
      my ($pep1_id, $pep2_id, $score, $hit_rank) = @$ref;

      $paf_counter++;

      my $include_pair = 0;
      if($self->{'no_filters'}) {
        $include_pair = 1;
    }

      if(!$include_pair and $self->{'all_bests'} and $hit_rank==1) {
        $included_bests_count++;
        $include_pair = 1;
    }

      if(!$include_pair and ($self->{'bsr_threshold'} < 1.0)) {
        unless(defined($self->{'selfhit_score_hash'}->{$pep1_id})) {
          printf("member_pep %d missing self_hit\n", $pep1_id);
        }
        unless(defined($self->{'selfhit_score_hash'}->{$pep2_id})) {
          printf("member_pep %d missing self_hit\n", $pep2_id);
        }

      #find largest self hit blast score to use as reference
        my $ref_score = $self->{'selfhit_score_hash'}->{$pep1_id};
        my $ref2_score = $self->{'selfhit_score_hash'}->{$pep2_id};
        if (!defined($ref_score) or 
            (defined($ref2_score) and ($ref2_score > $ref_score))) {
          $ref_score = $ref2_score;
        }

        #do blast score ratio (BSR) filter (
        if (defined($ref_score) and ($score / $ref_score > $self->{'bsr_threshold'})) {
          $include_pair=1;
        }
      }

      if ($include_pair) {
        $included_pair_count++;
        $self->{'ccEngine'}->add_connection($pep1_id, $pep2_id);
      }
    }
  }

  printf("  %d clusters so far\n", $self->{'ccEngine'}->get_cluster_count);
  printf("  %d members in hash\n", $self->{'ccEngine'}->get_component_count);
  printf("  %1.3f secs to process %d PAFs => %d picked (%d best + %d threshold)\n", 
         time()-$midtime, $paf_counter, $included_pair_count, $included_bests_count, 
         $included_pair_count - $included_bests_count);
  printf("  %1.3f secs to load/process\n", (time()-$starttime));

}

sub write_output {

  my $self = shift;

  $self->store_clusters;

  $self->dataflow_clusters;

  # modify input_job so that it now contains the clusterset_id
  my $outputHash = {};
  $outputHash = eval($self->input_id) if(defined($self->input_id) && $self->input_id =~ /^\s*\{.*\}\s*$/);
  $outputHash->{'clusterset_id'} = $self->{'clusterset_id'};
  my $output_id = $self->encode_hash($outputHash);
  $self->input_job->input_id($output_id);

  return 1;
}

##########################################
#
# internal methods
#
##########################################

# This will make sure that the indexes for paf are fine

}

BRH_grow_for_species	No description	Code
analyze_table	No description	Code
build_paf_clusters	No description	Code
dataflow_clusters	No description	Code
fetch_input	No description	Code
fetch_selfhit_score	No description	Code
get_params	No description	Code
run	No description	Code
store_clusters	No description	Code
threshold_grow_for_species	No description	Code
write_output	No description	Code