Bio::EnsEMBL::Compara::Production::GenomicAlignBlock
ChunkAndGroupDna
Toolbar
Summary
Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::ChunkAndGroupDna
Package variables
No package variables defined.
Included modules
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Synopsis
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $runnable = Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::ChunkAndGroupDna->new (
-db => $db,
-input_id => $input_id,
-analysis => $analysis );
$runnable->fetch_input(); #reads from DB
$runnable->run();
$runnable->write_output(); #writes to DB
Description
This object chunks the DNA from a genome_db, then creates the chunks as
DnaFragChunk objects and stores them in the compara database.
Methods
create_chunk_analysis | No description | Code |
create_chunks | No description | Code |
create_dnafrag_chunks | No description | Code |
fetch_input | Description | Code |
get_params | No description | Code |
print_params | No description | Code |
run | No description | Code |
submit_job | No description | Code |
write_output | No description | Code |
Methods description
Title : fetch_input Usage : $self->fetch_input Function: prepares global variables and DB connections Returns : 1 on success Args : none |
Methods code
create_chunk_analysis | description | prev | next | Top |
# create_chunk_analysis
#
# Derives a per-chunk analysis from the template analysis whose logic_name is
# $self->{'create_analysis_prefix'} and stores it through the AnalysisAdaptor.
# The new logic_name is "<prefix>_<genome_db dbID>_<assembly>_<chunk dbID>" and
# the chunk's dbID is added to the analysis parameters under the 'dbChunk' key.
# When no template analysis is found a new 'blastz' analysis is built instead.
# batch_size and hive_capacity are copied from the template's stats.
#
# Arg [1]  : the chunk; the code calls ->dbID and ->dnafrag->genome_db on it
# Returns  : nothing
sub create_chunk_analysis
{
  my $self  = shift;
  my $chunk = shift;

  my $analysisDBA = $self->db->get_AnalysisAdaptor();
  my $gdb         = $chunk->dnafrag->genome_db;
  my $logic_name  = join("_",
                         $self->{'create_analysis_prefix'},
                         $gdb->dbID,
                         $gdb->assembly,
                         $chunk->dbID);

  # the template is looked up by the bare prefix, not by the per-chunk name
  my $analysis = $analysisDBA->fetch_by_logic_name($self->{'create_analysis_prefix'});
  unless($analysis) {
    $analysis = Bio::EnsEMBL::Analysis->new(
        -db         => '',
        -db_file    => '',
        -db_version => '1',
        -logic_name => $logic_name,
        -program    => 'blastz',
        -module     => 'Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::BlastZ',
      );
  }
  return unless($analysis);

  # grab the template's stats BEFORE the object is detached from its adaptor
  my $template_analysis_stats = $analysis->stats;

  # resetting adaptor and dbID makes store() below treat this as a new analysis
  $analysis->adaptor(0);
  $analysis->dbID(0);
  $analysis->logic_name($logic_name);

  my $param_hash = {};
  my $parameters = $analysis->parameters;
  # the brace is escaped so the pattern is an unambiguous literal '{'
  if($parameters and ($parameters =~ /^\{/)) {
    # NOTE(review): string eval executes whatever Perl is held in the
    # parameters string; it is assumed to come from a trusted database.
    my $parsed = eval($parameters);
    if(ref($parsed) eq 'HASH') {
      $param_hash = $parsed;
    } else {
      # previously a failed or non-hash eval was silently used as the hash
      warn("create_chunk_analysis: template parameters did not evaluate to a hash, ignoring them"
           . ($@ ? " : $@" : "") . "\n");
    }
  }
  $param_hash->{'dbChunk'} = $chunk->dbID;
  $analysis->parameters(main::encode_hash($param_hash));

  $analysisDBA->store($analysis);

  my $stats = $analysis->stats;
  $stats->batch_size($template_analysis_stats->batch_size);
  $stats->hive_capacity($template_analysis_stats->hive_capacity);
  $stats->update();

  return;
}
1; } |
# create_chunks
#
# Obtains the DnaCollection for this run (fetched by $self->{'collection_id'}
# or newly created and stored), gathers the slices to process from the core
# database of $self->{'genome_db'}, and calls create_dnafrag_chunks() once per
# slice.  A DnaFrag is looked up for every slice and created when missing.
# Any chunks still held in the current chunk set are stored at the end and the
# collection is stored again.
#
# Reads   : genome_db, comparaDBA, collection_id, collection_name, dump_loc,
#           region, include_non_reference, include_duplicates
# Writes  : dna_collection, chunkset_counter, current_chunkset
# Throws  : when no DnaCollection could be obtained, or when fetch_by_region
#           hands back an undefined slice for the requested region
# Returns : the result of the final print statement
sub create_chunks
{
  my $self = shift;

  my $genome_db     = $self->{'genome_db'};
  my $collectionDBA = $self->{'comparaDBA'}->get_DnaCollectionAdaptor;

  if ($self->{'collection_id'}) {
    $self->{'dna_collection'} = $collectionDBA->fetch_by_dbID($self->{'collection_id'});
  } else {
    $self->{'dna_collection'} = Bio::EnsEMBL::Compara::Production::DnaCollection->new();
    $self->{'dna_collection'}->description($self->{'collection_name'});
    $self->{'dna_collection'}->dump_loc($self->{'dump_loc'}) if(defined($self->{'dump_loc'}));
    $collectionDBA->store($self->{'dna_collection'});
  }
  throw("couldn't get a DnaCollection for ChunkAndGroup analysis\n") unless($self->{'dna_collection'});

  # keep the core database connection open while slices are being processed
  $genome_db->db_adaptor->dbc->disconnect_when_inactive(0);
  my $SliceAdaptor = $genome_db->db_adaptor->get_SliceAdaptor;
  my $dnafragDBA   = $self->{'comparaDBA'}->get_DnaFragAdaptor;

  my $chromosomes = [];
  if(defined $self->{'region'}) {
    # region is "coord_system:seq_region[:start:end]"; start and end are
    # interpreted by create_dnafrag_chunks, not here
    my ($coord_system_name, $seq_region_name) = split(/:/, $self->{'region'});
    if (defined $seq_region_name && $seq_region_name ne "") {
      print("fetch by region coord:$coord_system_name seq_name:$seq_region_name\n");
      my @fetched = $SliceAdaptor->fetch_by_region($coord_system_name, $seq_region_name);
      # an undefined slice would otherwise only fail further down with an
      # opaque "can't call method on undefined value" error
      throw("couldn't fetch a slice for region '".$self->{'region'}."'\n")
        if(grep { !defined($_) } @fetched);
      push @{$chromosomes}, @fetched;
    } else {
      print("fetch by region coord:$coord_system_name\n");
      $chromosomes = $SliceAdaptor->fetch_all($coord_system_name);
    }
  } else {
    $chromosomes = $SliceAdaptor->fetch_all('toplevel', undef,
                                            $self->{'include_non_reference'},
                                            $self->{'include_duplicates'});
  }
  print("number of seq_regions ".scalar(@{$chromosomes})."\n");

  $self->{'chunkset_counter'} = 1;
  $self->{'current_chunkset'} = Bio::EnsEMBL::Compara::Production::DnaFragChunkSet->new();
  $self->{'current_chunkset'}->description(sprintf("collection_id:%d group:%d",
                                           $self->{'dna_collection'}->dbID,
                                           $self->{'chunkset_counter'}++));

  my $starttime = time();
  foreach my $chr (@{$chromosomes}) {
    # with an explicit region, only slices carrying a 'toplevel' attribute are used
    if (defined $self->{'region'}) {
      next unless (scalar @{$chr->get_all_Attributes('toplevel')});
    }

    my ($dnafrag) = @{$dnafragDBA->fetch_all_by_GenomeDB_region(
                         $genome_db,
                         $chr->coord_system->name(), $chr->seq_region_name)};
    unless($dnafrag) {
      # no DnaFrag stored yet for this slice: build one from the slice itself
      $dnafrag = Bio::EnsEMBL::Compara::DnaFrag->new();
      $dnafrag->name($chr->seq_region_name);
      $dnafrag->genome_db($genome_db);
      $dnafrag->coord_system_name($chr->coord_system->name());
      $dnafrag->length($chr->length);
      $dnafragDBA->store_if_needed($dnafrag);
    }
    $self->create_dnafrag_chunks($dnafrag, $chr->start, $chr->end);
  }

  # flush the chunk set that was still being filled when the loop ended
  if($self->{'current_chunkset'}->count > 0) {
    $self->{'comparaDBA'}->get_DnaFragChunkSetAdaptor->store($self->{'current_chunkset'});
    $self->{'dna_collection'}->add_dna_object($self->{'current_chunkset'});
  }

  $collectionDBA->store($self->{'dna_collection'});

  print "genome_db ",$genome_db->dbID, " : total time ", (time()-$starttime), " secs\n";
}
# create_dnafrag_chunks
#
# Cuts one DnaFrag into DnaFragChunk objects of $self->{'chunk_size'} bases,
# consecutive chunks sharing $self->{'overlap'} bases, and stores each chunk
# through the DnaFragChunkAdaptor.  When no chunk_size is configured the whole
# region becomes a single chunk.  A chunk shorter than
# $self->{'group_set_size'} is added to the current chunk set (which is stored
# and replaced once adding the chunk would exceed group_set_size); any other
# chunk is added directly to the DnaCollection.
#
# Arg [1]  : the DnaFrag to chunk
# Arg [2]  : (optional) region start, defaults to 1
# Arg [3]  : (optional) region end, defaults to the DnaFrag's length
#            When $self->{'region'} carries both a start and an end field,
#            those override Arg [2] and Arg [3].
# Throws   : when the configured overlap is not smaller than the chunk size
#            and the region needs more than one chunk (the loop could
#            otherwise never finish)
# Returns  : nothing
sub create_dnafrag_chunks
{
  my $self         = shift;
  my $dnafrag      = shift;
  my $region_start = (shift or 1);
  my $region_end   = (shift or $dnafrag->length);

  # Declared inside a real block: "my ... if COND" has undefined behaviour and
  # may leave values from an earlier call in the variables.
  if($self->{'region'}) {
    my (undef, undef, $seq_region_start, $seq_region_end) = split(/:/, $self->{'region'});
    if (defined $seq_region_start && defined $seq_region_end) {
      $region_end   = $seq_region_end;
      $region_start = $seq_region_start;
    }
  }

  my $chunk_size = $self->{'chunk_size'};
  my $overlap    = $self->{'overlap'};
  if (!defined $chunk_size) {
    # no chunking requested: one chunk covering the whole region
    $chunk_size = $region_end;
    $overlap    = 0;
  }

  my $chunk_start = $region_start;
  my $chunk_end   = $region_start;

  while ($chunk_end < $region_end) {
    $chunk_end = $chunk_start + $chunk_size - 1;
    if ($chunk_end > $region_end) {
      $chunk_end = $region_end;
    }

    # the next chunk starts $overlap bases before the end of this one
    my $next_chunk_start = $chunk_end - $overlap + 1;
    if (($chunk_end < $region_end) and ($next_chunk_start <= $chunk_start)) {
      throw("chunk_size ($chunk_size) must be larger than overlap ($overlap); "
            ."chunking cannot advance past position $chunk_start\n");
    }

    my $chunk = Bio::EnsEMBL::Compara::Production::DnaFragChunk->new();
    $chunk->dnafrag($dnafrag);
    $chunk->seq_start($chunk_start);
    $chunk->seq_end($chunk_end);
    $chunk->masking_analysis_data_id($self->{'masking_analysis_data_id'});
    if($self->{'masking_options'}) {
      $chunk->masking_options($self->{'masking_options'});
    }

    # NOTE(review): presumably calling bioseq loads the sequence so that it is
    # stored together with the chunk - confirm against DnaFragChunk
    if($self->{'store_seq'}) {
      $chunk->bioseq;
    }

    $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->store($chunk);

    if($self->{'group_set_size'} and ($chunk->length < $self->{'group_set_size'})) {
      # small chunk: collect it in a chunk set, starting a new set first when
      # this chunk would push the current one over group_set_size
      if(($self->{'current_chunkset'}->count > 0) and
         (($self->{'current_chunkset'}->total_basepairs + $chunk->length) > $self->{'group_set_size'}))
      {
        $self->{'comparaDBA'}->get_DnaFragChunkSetAdaptor->store($self->{'current_chunkset'});
        $self->{'dna_collection'}->add_dna_object($self->{'current_chunkset'});
        if($self->debug) {
          printf("created chunkSet(%d) %d chunks, %1.3f mbase\n",
                 $self->{'current_chunkset'}->dbID, $self->{'current_chunkset'}->count,
                 $self->{'current_chunkset'}->total_basepairs/1000000.0);
        }
        $self->{'current_chunkset'} = Bio::EnsEMBL::Compara::Production::DnaFragChunkSet->new();
        $self->{'current_chunkset'}->description(sprintf("collection_id:%d group:%d",
                                                 $self->{'dna_collection'}->dbID,
                                                 $self->{'chunkset_counter'}++));
      }

      $self->{'current_chunkset'}->add_DnaFragChunk($chunk);
      if($self->debug) {
        printf("chunkSet %d chunks, %1.3f mbase\n",
               $self->{'current_chunkset'}->count,
               $self->{'current_chunkset'}->total_basepairs/1000000.0);
      }
    }
    else {
      # chunk is added to the collection on its own
      $self->{'dna_collection'}->add_dna_object($chunk);
      if($self->debug) {
        printf("dna_collection : chunk (%d) %s\n",$chunk->dbID, $chunk->display_id);
      }
    }

    $self->submit_job($chunk) if($self->{'analysis_job'});
    $self->create_chunk_analysis($chunk) if($self->{'create_analysis_prefix'});

    $chunk_start = $next_chunk_start;
  }

  return;
}
# fetch_input
#
# Resets the run-time options to their defaults, overlays them first with the
# analysis parameters and then with the job's input_id (both parsed by
# get_params), and opens the database handles used by run(): the compara
# adaptor in $self->{'comparaDBA'} and the GenomeDB in $self->{'genome_db'}.
#
# Throws   : when no genome_db_id was supplied, when the GenomeDB cannot be
#            fetched, or when the GenomeDB has no core database adaptor
# Returns  : 1
sub fetch_input
{
  my ($self) = @_;

  # defaults; get_params only overwrites the keys that are actually supplied
  $self->{'genome_db_id'}             = 0;
  $self->{'store_seq'}                = 0;
  $self->{'store_chunk'}              = 0;
  $self->{'overlap'}                  = 0;
  $self->{'chunk_size'}               = undef;
  $self->{'region'}                   = undef;
  $self->{'masking_analysis_data_id'} = 0;
  $self->{'masking_options'}          = undef;
  $self->{'group_set_size'}           = undef;
  $self->{'include_non_reference'}    = 0;
  $self->{'include_duplicates'}       = 0;
  $self->{'analysis_job'}             = undef;
  $self->{'create_analysis_prefix'}   = undef;
  $self->{'collection_id'}            = undef;
  $self->{'collection_name'}          = undef;

  # input_id is applied last, so it wins over the analysis parameters
  $self->get_params($self->parameters);
  $self->get_params($self->input_id);

  # genome_db_id defaults to 0, so a defined() test could never fail here;
  # test for a true value instead
  throw("No genome_db specified") unless($self->{'genome_db_id'});
  $self->print_params;

  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN => $self->db->dbc);

  $self->{'genome_db'} = $self->{'comparaDBA'}->get_GenomeDBAdaptor->
                           fetch_by_dbID($self->{'genome_db_id'});
  throw("Can't fetch genome_db for id=".$self->{'genome_db_id'}) unless($self->{'genome_db'});

  # make sure the core database behind the GenomeDB is reachable
  my $coreDBA = $self->{'genome_db'}->db_adaptor();
  throw("Can't connect to genome database for id=".$self->{'genome_db_id'}) unless($coreDBA);

  return 1;
}
# get_params
#
# Evaluates a parameter string that is expected to yield a hash reference and
# copies every recognised, defined entry into $self.  When the hash contains a
# true 'input_data_id', the value fetched for it through the
# AnalysisDataAdaptor is parsed first, so entries of the current string
# override it.
#
# Arg [1]  : string holding Perl source for a hash reference,
#            e.g. "{'chunk_size'=>1000000,'overlap'=>0}"
# Returns  : nothing; empty strings and strings that do not evaluate to a
#            hash reference are ignored
sub get_params
{
  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n");

  # NOTE(review): string eval executes arbitrary Perl found in the parameter
  # string; it is assumed to come from trusted pipeline configuration.
  my $params = eval($param_string);
  # a true value that is not a hash reference must not be dereferenced below
  return unless($params and (ref($params) eq 'HASH'));

  foreach my $key (keys %$params) {
    print(" $key : ", $params->{$key}, "\n");
  }

  if($params->{'input_data_id'}) {
    my $input_id = $self->db->get_AnalysisDataAdaptor->fetch_by_dbID($params->{'input_data_id'});
    $self->get_params($input_id);
  }

  # [ key in the parameter hash => key in $self ], applied in this order:
  # 'genome_db_id' comes after 'gdb' so that it wins when both are given
  my @param_to_field = (
    [ 'store_seq'                => 'store_seq'                ],
    [ 'store_chunk'              => 'store_chunk'              ],
    [ 'chunk_size'               => 'chunk_size'               ],
    [ 'overlap'                  => 'overlap'                  ],
    [ 'dump_loc'                 => 'dump_loc'                 ],
    [ 'gdb'                      => 'genome_db_id'             ],
    [ 'genome_db_id'             => 'genome_db_id'             ],
    [ 'region'                   => 'region'                   ],
    [ 'masking_options'          => 'masking_options'          ],
    [ 'masking_analysis_data_id' => 'masking_analysis_data_id' ],
    [ 'analysis_template'        => 'create_analysis_prefix'   ],
    [ 'analysis_job'             => 'analysis_job'             ],
    [ 'group_set_size'           => 'group_set_size'           ],
    [ 'collection_name'          => 'collection_name'          ],
    [ 'collection_id'            => 'collection_id'            ],
    [ 'include_non_reference'    => 'include_non_reference'    ],
    [ 'include_duplicates'       => 'include_duplicates'       ],
  );

  foreach my $mapping (@param_to_field) {
    my ($param_key, $field) = @{$mapping};
    $self->{$field} = $params->{$param_key} if(defined($params->{$param_key}));
  }

  return;
}
# print_params
#
# Writes the current option values held in $self to standard output, one per
# line.  'region' and 'masking_options' are only printed when they hold a
# true value.
#
# Returns  : the result of the last print call
sub print_params
{
  my ($self) = @_;

  # [ label, key in $self, print only when the value is true ]
  my @rows = (
    [ " genome_db_id : ",             'genome_db_id',             0 ],
    [ " region : ",                   'region',                   1 ],
    [ " store_seq : ",                'store_seq',                0 ],
    [ " store_chunk : ",              'store_chunk',              0 ],
    [ " chunk_size : ",               'chunk_size',               0 ],
    [ " overlap : ",                  'overlap',                  0 ],
    [ " masking_analysis_data_id : ", 'masking_analysis_data_id', 0 ],
    [ " masking_options : ",          'masking_options',          1 ],
    [ " include_non_reference : ",    'include_non_reference',    0 ],
    [ " include_duplicates : ",       'include_duplicates',       0 ],
  );

  print(" params:\n");

  my $printed;
  foreach my $row (@rows) {
    my ($label, $field, $only_if_set) = @{$row};
    next if($only_if_set and !$self->{$field});
    $printed = print($label, $self->{$field}, "\n");
  }

  return $printed;
}
# run
#
# Performs the actual work of this module by delegating to create_chunks().
#
# Returns  : 1
sub run
{
  my ($self) = @_;

  $self->create_chunks();

  return 1;
}
# submit_job
#
# Creates one job for the given chunk, with input_id "{'qyChunk'=><chunk dbID>}".
# The analysis the job is attached to is named
# "<analysis_job>_<genome_db dbID>_<assembly>"; it is looked up on the first
# call, created with status 'BLOCKED' when it does not exist, and then kept in
# $self->{'submit_analysis'} for later calls.
#
# Arg [1]  : the chunk; the code calls ->dbID and ->dnafrag->genome_db on it
# Returns  : nothing
sub submit_job
{
  my ($self, $chunk) = @_;

  if(!$self->{'submit_analysis'}) {
    my $genome_db  = $chunk->dnafrag->genome_db;
    my $logic_name = join("_", $self->{'analysis_job'}, $genome_db->dbID, $genome_db->assembly);

    my $submit_analysis = $self->{'comparaDBA'}->get_AnalysisAdaptor->fetch_by_logic_name($logic_name);

    if(!$submit_analysis) {
      # first use of this logic_name: register a placeholder analysis
      $submit_analysis = Bio::EnsEMBL::Analysis->new(
          -db         => '',
          -db_file    => '',
          -db_version => '1',
          -parameters => "",
          -logic_name => $logic_name,
          -module     => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy',
        );
      $self->db->get_AnalysisAdaptor()->store($submit_analysis);

      my $analysis_stats = $submit_analysis->stats;
      $analysis_stats->batch_size(3);
      $analysis_stats->hive_capacity(11);
      $analysis_stats->status('BLOCKED');
      $analysis_stats->update();
    }

    # remembered so the lookup above happens only once per object
    $self->{'submit_analysis'} = $submit_analysis;
  }

  my $input_id = sprintf("{'qyChunk'=>%s}", $chunk->dbID);

  Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor->CreateNewJob (
        -input_id       => $input_id,
        -analysis       => $self->{'submit_analysis'},
        -input_job_id   => 0,
        );

  return;
}
# write_output
#
# Re-evaluates the job's input_id as a hash, adds the dbID of the
# DnaCollection built by run() under 'collection_id', encodes the hash with
# main::encode_hash, prints it and writes it back as the new input_id.
#
# Returns  : 1
sub write_output
{
  my ($self) = @_;

  # start from the evaluated input_id when there is one, else an empty hash
  my $output_hash = defined($self->input_id) ? eval($self->input_id) : {};
  $output_hash->{'collection_id'} = $self->{'dna_collection'}->dbID;

  my $output_id = main::encode_hash($output_hash);
  print("output_id = $output_id\n");

  $self->input_id($output_id);

  return 1;
}
} |
General documentation
Describe contact details here
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _