# Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::CreateFilterDuplicatesJobs
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::CreateFilterDuplicatesJobs
=cut
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
my $repmask = Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::CreateFilterDuplicatesJobs->new (
-db => $db,
-input_id => $input_id,
-analysis => $analysis );
$repmask->fetch_input(); #reads from DB
$repmask->run();
$repmask->output();
$repmask->write_output(); #writes to DB
=cut
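=head1 DESCRIPTION
Creates one job for the FilterDuplicates analysis (identified by the 'logic_name' parameter) for every distinct dnafrag id in the DnaCollection named by the 'collection_name' parameter. If a 'region' parameter is given, its start and end fields are passed on to each created job.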
=cut
=head1 CONTACT
Describe contact details here
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::CreateFilterDuplicatesJobs;
use strict;
#use Bio::EnsEMBL::Hive;
#use Bio::EnsEMBL::Hive::Extensions;
use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::Production::DnaFragChunk;
use Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
use Bio::EnsEMBL::Compara::Production::DnaCollection;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::Hive::Process;
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
# fetch_input
#
# Reads the job parameters and resolves the objects the run phase needs.
#
# Parameters may be supplied via $self->parameters and/or $self->input_id;
# both are handed to get_params, the input_id last. Example parameter string:
#   {'logic_name'=>'<name of analysis>','collection_name'=>'rat','region'=>'chromosome:11'}
#
# On success the following slots are set on $self:
#   comparaDBA                 - Production DBAdaptor sharing this process' DBConnection
#   filter_duplicates_analysis - analysis fetched by 'logic_name'
#   collection                 - DnaCollection fetched by 'collection_name'
#
# Throws if 'logic_name' or 'collection_name' is missing, or if the
# corresponding analysis / DnaCollection cannot be found.
# Returns 1.
sub fetch_input {
  my $self = shift;

  # reset anything left over from a previous job before parsing parameters
  $self->{'collection'}                 = undef;
  $self->{'filter_duplicates_analysis'} = undef;

  $self->get_params($self->parameters);
  $self->get_params($self->input_id);

  # create a Compara::DBAdaptor which shares my DBConnection
  $self->{'comparaDBA'} =
    Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN => $self->db->dbc);

  # get the FilterDuplicates analysis; the parameter key that is actually
  # read is 'logic_name', so that is what the error message must name
  throw("must specify 'logic_name' to identify the FilterDuplicates analysis")
    unless (defined($self->{'logic_name'}));
  $self->{'filter_duplicates_analysis'} =
    $self->db->get_AnalysisAdaptor->fetch_by_logic_name($self->{'logic_name'});
  throw("unable to find analysis with logic_name ". $self->{'logic_name'})
    unless (defined($self->{'filter_duplicates_analysis'}));

  # get DnaCollection
  throw("must specify 'collection_name' to identify DnaCollection")
    unless (defined($self->{'collection_name'}));
  $self->{'collection'} =
    $self->{'comparaDBA'}->get_DnaCollectionAdaptor->
      fetch_by_set_description($self->{'collection_name'});
  throw("unable to find DnaCollection with name : ". $self->{'collection_name'})
    unless (defined($self->{'collection'}));

  $self->print_params;

  return 1;
}
# run
#
# Work phase of this runnable: delegates to createFilterDuplicatesJobs.
# Returns 1.
sub run {
  my ($self) = @_;

  $self->createFilterDuplicatesJobs;

  return 1;
}
# write_output
#
# Output phase of this runnable. This method writes nothing itself and
# simply reports success.
# Returns 1.
sub write_output {
  my ($self) = @_;

  return 1;
}
##################################
#
# subroutines
#
##################################
# get_params
#
# Parses a parameter string (a Perl hash literal) and copies the keys this
# module understands onto $self. Example parameter string:
#   {'logic_name'=>'<name of analysis>','collection_name'=>'rat','region'=>'chromosome:11'}
#
# Arg [1] : string $param_string - Perl source that evaluates to a hash ref
#
# Recognised keys: 'logic_name', 'collection_name', 'region'. Keys that are
# absent (or undef) leave the existing value on $self untouched, so this can
# be called several times with later calls overriding earlier ones.
#
# An empty/false string is ignored. A string that fails to evaluate, or that
# evaluates to something other than a hash reference, produces a warning and
# is ignored rather than being silently dropped or dying.
#
# Returns nothing.
sub get_params {
  my $self         = shift;
  my $param_string = shift;

  return unless ($param_string);
  print("parsing parameter string : ",$param_string,"\n");

  # NOTE(review): string eval runs arbitrary Perl contained in the parameter
  # string; this is only acceptable while parameters come from a trusted source.
  my $params = eval($param_string);
  if ($@) {
    # previously a syntax error was indistinguishable from "no parameters"
    warn("get_params : unable to evaluate parameter string : $@");
    return;
  }
  return unless ($params);

  unless (ref($params) eq 'HASH') {
    # dereferencing a non-hash below would be fatal under 'use strict'
    warn("get_params : parameter string did not evaluate to a hash reference, ignoring\n");
    return;
  }

  foreach my $key (keys %$params) {
    print(" $key : ", $params->{$key}, "\n");
  }

  foreach my $known_key ('logic_name', 'collection_name', 'region') {
    $self->{$known_key} = $params->{$known_key} if (defined($params->{$known_key}));
  }

  return;
}
# print_params
#
# Prints the resolved parameters to the currently selected output handle:
# the logic_name, the DnaCollection (dbID and description) and, when one was
# supplied, the region. Expects $self->{'collection'} to be an object that
# provides dbID and description.
sub print_params {
  my ($self) = @_;

  my $collection = $self->{'collection'};
  my $region     = $self->{'region'};

  print(" params:\n");
  printf(" logic_name : %s\n", $self->{'logic_name'});
  printf(" collection : (%d) %s\n", $collection->dbID, $collection->description);
  printf(" region : %s\n", $region) if (defined $region);
}
# createFilterDuplicatesJobs
#
# Creates one job for the FilterDuplicates analysis per distinct dnafrag id
# in the DnaCollection held in $self->{'collection'}.
#
# Each job's input_id is a hash encoded with main::encode_hash (which must be
# provided by the calling program) containing:
#   dnafrag_id       - always
#   seq_region_start - only if the 'region' parameter has a third field
#   seq_region_end   - only if the 'region' parameter has a fourth field
#
# The optional region has the colon-separated form
#   coord_system_name:seq_region_name:seq_region_start:seq_region_end
#
# Prints the number of jobs created. The return value is not meaningful.
sub createFilterDuplicatesJobs
{
  my $self = shift;

  my $dna_collection = $self->{'collection'};
  my $analysis       = $self->{'filter_duplicates_analysis'};
  my $region         = $self->{'region'};

  # The original guard was "$region =~ //". An empty pattern does not mean
  # "match anything": Perl re-uses the last successfully matched regex, so
  # whether the region was honoured depended on unrelated earlier matches.
  # Only definedness is tested now.
  #
  # NOTE(review): the coord system name and seq_region name (first two fields)
  # are parsed but never used to restrict which dnafrags get a job - confirm
  # that applying start/end to every dnafrag is intended.
  my ($seq_region_start, $seq_region_end);
  if (defined $region) {
    (undef, undef, $seq_region_start, $seq_region_end) = split(/:/, $region);
  }

  my $dnafrag_id_list = $dna_collection->get_all_dnafrag_ids;

  my $count = 0;
  my %already_seen_dnafrag_ids;
  foreach my $dnafrag_id (@{$dnafrag_id_list}) {
    # the list may contain the same dnafrag more than once; one job each
    next if (defined $already_seen_dnafrag_ids{$dnafrag_id});

    my $input_hash = {};
    $input_hash->{'dnafrag_id'}       = $dnafrag_id;
    $input_hash->{'seq_region_start'} = $seq_region_start if (defined $seq_region_start);
    $input_hash->{'seq_region_end'}   = $seq_region_end   if (defined $seq_region_end);

    my $input_id = main::encode_hash($input_hash);

    Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor->CreateNewJob
      (-input_id     => $input_id,
       -analysis     => $analysis,
       -input_job_id => 0);

    $already_seen_dnafrag_ids{$dnafrag_id} = 1;
    $count++;
  }

  printf("created %d jobs for analysis logic_name %s\n", $count, $analysis->logic_name);
}
1;