Bio::EnsEMBL::Compara::Production::GenomicAlignBlock ChunkAndGroupDna
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::ChunkAndGroupDna
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::RunnableDB
Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor
Bio::EnsEMBL::Compara::Production::DnaCollection
Bio::EnsEMBL::Compara::Production::DnaFragChunk
Bio::EnsEMBL::Compara::Production::DnaFragChunkSet
Bio::EnsEMBL::DBLoader
Bio::EnsEMBL::DBSQL::DBAdaptor
Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor
Bio::EnsEMBL::Hive::Process
Bio::EnsEMBL::Utils::Exception qw ( throw warning verbose )
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Bio::EnsEMBL::Analysis::RunnableDB Bio::EnsEMBL::Hive::Process
Synopsis
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $runnable = Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::ChunkAndGroupDna->new (
-db => $db,
-input_id => $input_id
-analysis => $analysis );
$runnable->fetch_input(); #reads from DB
$runnable->run();
$runnable->write_output(); #writes to DB
Description
This object chunks the Dna from a genome_db and creates and stores the
chunks as DnaFragChunk objects into the compara database
Methods
create_chunk_analysis
No description
Code
create_chunks
No description
Code
create_dnafrag_chunks
No description
Code
fetch_inputDescriptionCode
get_params
No description
Code
print_params
No description
Code
run
No description
Code
submit_job
No description
Code
write_output
No description
Code
Methods description
fetch_inputcode    nextTop
    Title   :   fetch_input
Usage : $self->fetch_input
Function: prepares global variables and DB connections
Returns : none
Args : none
Methods code
create_chunk_analysisdescriptionprevnextTop
sub create_chunk_analysis {
  #routine to create one analysis per chunk
my $self = shift; my $chunk = shift; my $analysisDBA = $self->db->get_AnalysisAdaptor(); my $gdb = $chunk->dnafrag->genome_db; my $logic_name = $self->{'create_analysis_prefix'} ."_". $gdb->dbID ."_". $gdb->assembly ."_". $chunk->dbID; #print("look for analysis ", $self->{'create_analysis_prefix'}, "\n");
my $analysis = $analysisDBA->fetch_by_logic_name($self->{'create_analysis_prefix'}); unless($analysis) { $analysis = Bio::EnsEMBL::Analysis->new( -db => '', -db_file => '', -db_version => '1', -logic_name => $logic_name, -program => 'blastz', -module => 'Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::BlastZ', ); } return unless($analysis); my $template_analysis_stats = $analysis->stats; $analysis->adaptor(0); $analysis->dbID(0); $analysis->logic_name($logic_name); #print("new logic_name : ", $analysis->logic_name, "\n");
my $param_hash = {}; if($analysis->parameters and ($analysis->parameters =~ /^{/)) { #print("parsing parameters : ", $analysis->parameters, "\n");
$param_hash = eval($analysis->parameters); } $param_hash->{'dbChunk'} = $chunk->dbID; $analysis->parameters(main::encode_hash($param_hash)); #print("new parameters : ", $analysis->parameters, "\n");
$analysisDBA->store($analysis); my $stats = $analysis->stats; $stats->batch_size($template_analysis_stats->batch_size); $stats->hive_capacity($template_analysis_stats->hive_capacity); $stats->update(); return; } 1;
}
create_chunksdescriptionprevnextTop
sub create_chunks {
  my $self      = shift;

  my $genome_db = $self->{'genome_db'};

  my $collectionDBA = $self->{'comparaDBA'}->get_DnaCollectionAdaptor;
  if ($self->{'collection_id'}) {
    $self->{'dna_collection'} = $collectionDBA->fetch_by_dbID($self->{'collection_id'});
  } else {
    $self->{'dna_collection'} = new Bio::EnsEMBL::Compara::Production::DnaCollection;
    $self->{'dna_collection'}->description($self->{'collection_name'});
    $self->{'dna_collection'}->dump_loc($self->{'dump_loc'}) if(defined($self->{'dump_loc'}));
    $collectionDBA->store($self->{'dna_collection'});
  }
  throw("couldn't get a DnaCollection for ChunkAndGroup analysis\n") unless($self->{'dna_collection'});
  
  $genome_db->db_adaptor->dbc->disconnect_when_inactive(0);
  my $SliceAdaptor = $genome_db->db_adaptor->get_SliceAdaptor;
  my $dnafragDBA = $self->{'comparaDBA'}->get_DnaFragAdaptor;

  my $chromosomes = [];
  if(defined $self->{'region'}) {
    my ($coord_system_name, $seq_region_name, $seq_region_start, $seq_region_end) = split(/:/,  $self->{'region'});
    if (defined $seq_region_name && $seq_region_name ne "") {
      print("fetch by region coord:$coord_system_name seq_name:$seq_region_name\n");
      push @{$chromosomes}, $SliceAdaptor->fetch_by_region($coord_system_name, $seq_region_name);
    } else {
      print("fetch by region coord:$coord_system_name\n");
      $chromosomes = $SliceAdaptor->fetch_all($coord_system_name);
    }
  } else {
      #default for $include_non_reference = 0, $include_duplicates = 0
$chromosomes = $SliceAdaptor->fetch_all('toplevel',undef, $self->{'include_non_reference'}, $self->{'include_duplicates'}); } print("number of seq_regions ".scalar @{$chromosomes}."\n"); $self->{'chunkset_counter'} = 1; $self->{'current_chunkset'} = new Bio::EnsEMBL::Compara::Production::DnaFragChunkSet; $self->{'current_chunkset'}->description(sprintf("collection_id:%d group:%d", $self->{'dna_collection'}->dbID, $self->{'chunkset_counter'}++)); my $starttime = time(); foreach my $chr (@{$chromosomes}) { #print "fetching dnafrag\n";
if (defined $self->{'region'}) { next unless (scalar @{$chr->get_all_Attributes('toplevel')}); } my ($dnafrag) = @{$dnafragDBA->fetch_all_by_GenomeDB_region( $genome_db, $chr->coord_system->name(), #$self->{'coordinate_system'},
$chr->seq_region_name)}; #if($dnafrag) { print(" already stores as dbID ", $dnafrag->dbID, "\n"); }
unless($dnafrag) { #
# create dnafrag for this chromosome
#
#print "loading dnafrag for ".$chr->name."...\n";
$dnafrag = new Bio::EnsEMBL::Compara::DnaFrag; $dnafrag->name($chr->seq_region_name); #ie just 22
$dnafrag->genome_db($genome_db); $dnafrag->coord_system_name($chr->coord_system->name()); $dnafrag->length($chr->length); $dnafragDBA->store_if_needed($dnafrag); } $self->create_dnafrag_chunks($dnafrag, $chr->start, $chr->end); } #save the current_chunkset if it isn't empty
if($self->{'current_chunkset'}->count > 0) { $self->{'comparaDBA'}->get_DnaFragChunkSetAdaptor->store($self->{'current_chunkset'}); #$self->submit_job($self->{'current_chunkset'});
$self->{'dna_collection'}->add_dna_object($self->{'current_chunkset'}); } #
# finish by storing all the dna_objects of the collection
#
$collectionDBA->store($self->{'dna_collection'}); print "genome_db ",$genome_db->dbID, " : total time ", (time()-$starttime), " secs\n";
}
create_dnafrag_chunksdescriptionprevnextTop
sub create_dnafrag_chunks {
  my $self = shift;
  my $dnafrag = shift;
  my $region_start = (shift or 1);
  my $region_end = (shift or $dnafrag->length);

 #return if($dnafrag->display_id =~ /random/);
my $dnafragDBA = $self->{'comparaDBA'}->get_DnaFragAdaptor; my ($coord_system_name, $seq_region_name, $seq_region_start, $seq_region_end) = split(/:/, $self->{'region'}) if($self->{'region'}); if (defined $seq_region_start && defined $seq_region_end) { $region_end = $seq_region_end; $region_start = $seq_region_start; } #If chunk_size is not set then set it to be the fragment length
#overlap must be 0 in this case.
my $chunk_size = $self->{'chunk_size'}; my $overlap = $self->{'overlap'}; if (!defined $chunk_size) { $chunk_size = $region_end; $overlap = 0; } #print "dnafrag : ", $dnafrag->display_id, "n";
#print " sequence length : ",$length,"\n";
my $lasttime = time(); #initialise chunk_start and chunk_end to be the dnafrag start and end
my $chunk_start = $region_start; my $chunk_end = $region_start; while ($chunk_end < $region_end) { my $chunk = new Bio::EnsEMBL::Compara::Production::DnaFragChunk(); $chunk->dnafrag($dnafrag); $chunk->seq_start($chunk_start); $chunk_end = $chunk_start + $chunk_size - 1; #if current $chunk_end is too long, trim to be $region_end
if ($chunk_end > $region_end) { $chunk_end = $region_end; } $chunk->seq_end($chunk_end); $chunk->masking_analysis_data_id($self->{'masking_analysis_data_id'}); if($self->{'masking_options'}) { $chunk->masking_options($self->{'masking_options'}); } if($self->{'store_seq'}) { $chunk->bioseq; #fetches sequence and stores internally in ->sequence variable
} #print "store chunk " . $chunk->dnafrag->name . " " . $chunk->seq_start . " " . $chunk->seq_end . " " . length($chunk->bioseq->seq) . "\n";
$self->{'comparaDBA'}->get_DnaFragChunkAdaptor->store($chunk); # do grouping if requested
if($self->{'group_set_size'} and ($chunk->length < $self->{'group_set_size'})) { if(($self->{'current_chunkset'}->count > 0) and (($self->{'current_chunkset'}->total_basepairs + $chunk->length) > $self->{'group_set_size'})) { #set has hit max, so save it
$self->{'comparaDBA'}->get_DnaFragChunkSetAdaptor->store($self->{'current_chunkset'}); $self->{'dna_collection'}->add_dna_object($self->{'current_chunkset'}); #$self->submit_job($self->{'current_chunkset'});
if($self->debug) { printf("created chunkSet(%d) %d chunks, %1.3f mbase\n", $self->{'current_chunkset'}->dbID, $self->{'current_chunkset'}->count, $self->{'current_chunkset'}->total_basepairs/1000000.0);
} $self->{'current_chunkset'} = new Bio::EnsEMBL::Compara::Production::DnaFragChunkSet; $self->{'current_chunkset'}->description(sprintf("collection_id:%d group:%d", $self->{'dna_collection'}->dbID, $self->{'chunkset_counter'}++)); } $self->{'current_chunkset'}->add_DnaFragChunk($chunk); if($self->debug) { printf("chunkSet %d chunks, %1.3f mbase\n", $self->{'current_chunkset'}->count, $self->{'current_chunkset'}->total_basepairs/1000000.0);
} } else { #not doing grouping so put the $chunk directly into the collection
$self->{'dna_collection'}->add_dna_object($chunk); if($self->debug) { printf("dna_collection : chunk (%d) %s\n",$chunk->dbID, $chunk->display_id); } } $self->submit_job($chunk) if($self->{'analysis_job'}); $self->create_chunk_analysis($chunk) if($self->{'create_analysis_prefix'}); $chunk_start = $chunk_end - $overlap + 1; } #print "Done\n";
#print scalar(time()-$lasttime), " secs to chunk, and store\n";
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_;

  #
# parameters which can be set either via
# $self->parameters OR
# $self->input_id
#
$self->{'genome_db_id'} = 0; # 'gdb'
$self->{'store_seq'} = 0; $self->{'store_chunk'} = 0; $self->{'overlap'} = 0; $self->{'chunk_size'} = undef; $self->{'region'} = undef; $self->{'masking_analysis_data_id'} = 0; $self->{'masking_options'} = undef; $self->{'group_set_size'} = undef; $self->{'include_non_reference'} = 0; # eg haplotypes
$self->{'include_duplicates'} = 0; # eg PAR region
$self->{'analysis_job'} = undef; $self->{'create_analysis_prefix'} = undef; # 'analysis', 'job'
$self->{'collection_id'} = undef; $self->{'collection_name'} = undef; $self->get_params($self->parameters); $self->get_params($self->input_id); throw("No genome_db specified") unless defined($self->{'genome_db_id'}); $self->print_params; #create a Compara::DBAdaptor which shares the same DBI handle
#with the DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN => $self->db->dbc); #get the Compara::GenomeDB object for the genome_db_id
$self->{'genome_db'} = $self->{'comparaDBA'}->get_GenomeDBAdaptor-> fetch_by_dbID($self->{'genome_db_id'}); throw("Can't fetch genome_db for id=".$self->{'genome_db_id'}) unless($self->{'genome_db'}); #using genome_db_id, connect to external core database
my $coreDBA = $self->{'genome_db'}->db_adaptor(); throw("Can't connect to genome database for id=".$self->{'genome_db_id'}) unless($coreDBA); return 1;
}
get_paramsdescriptionprevnextTop
sub get_params {
  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n");
  
  my $params = eval($param_string);
  return unless($params);

  foreach my $key (keys %$params) {
    print("  $key : ", $params->{$key}, "\n");
  }
  
  if($params->{'input_data_id'}) {
    my $input_id = $self->db->get_AnalysisDataAdaptor->fetch_by_dbID($params->{'input_data_id'});
    $self->get_params($input_id);
  }
      
  $self->{'store_seq'} = $params->{'store_seq'} if(defined($params->{'store_seq'}));
  $self->{'store_chunk'} = $params->{'store_chunk'} if(defined($params->{'store_chunk'}));
  $self->{'chunk_size'} = $params->{'chunk_size'} if(defined($params->{'chunk_size'}));
  $self->{'overlap'} = $params->{'overlap'} if(defined($params->{'overlap'}));

  $self->{'dump_loc'} = $params->{'dump_loc'} if(defined($params->{'dump_loc'}));

  $self->{'genome_db_id'} = $params->{'gdb'} if(defined($params->{'gdb'}));
  $self->{'genome_db_id'} = $params->{'genome_db_id'} if(defined($params->{'genome_db_id'}));

  $self->{'region'} = $params->{'region'} if(defined($params->{'region'}));

  $self->{'masking_options'} = $params->{'masking_options'}
    if(defined($params->{'masking_options'}));
  $self->{'masking_analysis_data_id'} = $params->{'masking_analysis_data_id'}
    if(defined($params->{'masking_analysis_data_id'}));
    
  $self->{'create_analysis_prefix'} = $params->{'analysis_template'} 
    if(defined($params->{'analysis_template'}));
  $self->{'analysis_job'} = $params->{'analysis_job'} if(defined($params->{'analysis_job'}));

  $self->{'group_set_size'} = $params->{'group_set_size'} if(defined($params->{'group_set_size'}));

  $self->{'collection_name'} = $params->{'collection_name'} if(defined($params->{'collection_name'}));
  $self->{'collection_id'} = $params->{'collection_id'} if(defined($params->{'collection_id'}));

  $self->{'include_non_reference'} = $params->{'include_non_reference'} if(defined($params->{'include_non_reference'}));
  $self->{'include_duplicates'} = $params->{'include_duplicates'} if(defined($params->{'include_duplicates'}));
  
  return;
}
print_paramsdescriptionprevnextTop
sub print_params {
  my $self = shift;

  print(" params:\n");
  print("   genome_db_id             : ", $self->{'genome_db_id'},"\n");
  print("   region                   : ", $self->{'region'},"\n") if($self->{'region'});
  print("   store_seq                : ", $self->{'store_seq'},"\n");
  print("   store_chunk              : ", $self->{'store_chunk'},"\n");
  print("   chunk_size               : ", $self->{'chunk_size'},"\n");
  print("   overlap                  : ", $self->{'overlap'} ,"\n");
  print("   masking_analysis_data_id : ", $self->{'masking_analysis_data_id'} ,"\n");
  print("   masking_options          : ", $self->{'masking_options'} ,"\n") if($self->{'masking_options'});
  print("   include_non_reference    : ", $self->{'include_non_reference'} ,"\n");
  print("   include_duplicates       : ", $self->{'include_duplicates'} ,"\n");
}
rundescriptionprevnextTop
sub run {
  my $self = shift;

  # main routine which takes a genome_db_id (from input_id) and
# access the ensembl_core database, useing the SliceAdaptor
# it will load all slices, all genes, and all transscripts
# and convert them into members to be stored into compara
$self->create_chunks; return 1;
}
submit_jobdescriptionprevnextTop
sub submit_job {
  my $self  = shift;
  my $chunk = shift;

  unless($self->{'submit_analysis'}) {
    #print("\ncreate Submit Analysis\n");
my $gdb = $chunk->dnafrag->genome_db; my $logic_name = $self->{'analysis_job'} ."_". $gdb->dbID ."_". $gdb->assembly; #print(" see if analysis '$logic_name' is in database\n");
my $analysis = $self->{'comparaDBA'}->get_AnalysisAdaptor->fetch_by_logic_name($logic_name); #if($analysis) { print(" YES in database with analysis_id=".$analysis->dbID()); }
unless($analysis) { #print(" NOPE: go ahead and insert\n");
$analysis = Bio::EnsEMBL::Analysis->new( -db => '', -db_file => '', -db_version => '1', -parameters => "", -logic_name => $logic_name, -module => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy', ); $self->db->get_AnalysisAdaptor()->store($analysis); my $stats = $analysis->stats; $stats->batch_size(3); $stats->hive_capacity(11); $stats->status('BLOCKED'); $stats->update(); } $self->{'submit_analysis'} = $analysis; } my $input_id = "{'qyChunk'=>" . $chunk->dbID . "}"; Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor->CreateNewJob ( -input_id => $input_id, -analysis => $self->{'submit_analysis'}, -input_job_id => 0, ); return;
}
write_outputdescriptionprevnextTop
sub write_output {
  
  my $self = shift;

  my $outputHash = {};
  $outputHash = eval($self->input_id) if(defined($self->input_id));
  $outputHash->{'collection_id'} = $self->{'dna_collection'}->dbID;
  my $output_id = main::encode_hash($outputHash);

  print("output_id = $output_id\n");
  $self->input_id($output_id);                    
  return 1;
}



######################################
#
# subroutines
#
#####################################
}
General documentation
CONTACTTop
Describe contact details here
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _