Raw content of Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::FilterStack # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::FilterStack =cut =head1 SYNOPSIS my $db = Bio::EnsEMBL::Compara::DBAdaptor->new($locator); my $runnable = Bio::EnsEMBL::Pipeline::RunnableDB::FilterStack->new ( -db => $db, -input_id => $input_id -analysis => $analysis ); $runnable->fetch_input(); #reads from DB $runnable->run(); $runnable->write_output(); #writes to DB =cut =head1 DESCRIPTION This is designed to run after Chaining and Netting has been done on a PairWise analysis. It will perform a sort of pseudo softmasking on regions of large numbers of overlapping genomic_aligns on the non-reference species. It marks regions where there are more than "threshold" genomic_aligns and will remove all genomic_aligns and genomic_align_blocks that are entirely within this region. It takes as input (on the input_id string). =cut =head1 CONTACT Describe contact details here =cut =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::FilterStack; use strict; use Time::HiRes qw(time gettimeofday tv_interval); use Bio::EnsEMBL::Utils::Exception qw( throw warning verbose ); use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::DBLoader; use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor; use Bio::EnsEMBL::Compara::Production::DnaFragChunk; use Bio::EnsEMBL::Compara::Production::DnaFragChunkSet; use Bio::EnsEMBL::Compara::GenomicAlignBlock; use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor; use Bio::EnsEMBL::Pipeline::RunnableDB; our @ISA = qw(Bio::EnsEMBL::Pipeline::RunnableDB); =head2 fetch_input Title : fetch_input Usage : $self->fetch_input Function: prepares global variables and DB connections Returns : none Args : none =cut sub fetch_input { my( $self) = @_; # # parameters which can be set either via # $self->parameters OR # $self->input_id # $self->{'threshold'} = undef; $self->{'method_link_species_set_id'} = undef; $self->{'options'} = undef; #default height. Only prune stacks that are above this height initially $self->{'height'} = 40; $self->debug(0); $self->get_params($self->parameters); $self->get_params($self->input_id); $self->print_params; #create a Compara::DBAdaptor which shares the same DBI handle #with the Pipeline::DBAdaptor that is based into this runnable $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN => $self->db->dbc); $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0); # get DnaCollection throw("must specify 'collection_name' to identify DnaCollection") unless(defined($self->{'collection_name'})); $self->{'collection'} = $self->{'comparaDBA'}->get_DnaCollectionAdaptor->fetch_by_set_description($self->{'collection_name'}); throw("unable to find DnaCollection with name : ". $self->{'collection_name'}) unless(defined($self->{'collection'})); return 1; } sub run { my $self = shift; $self->filter_stack; return 1; } sub write_output { my $self = shift; my $output_id = $self->input_id; print("output_id = $output_id\n"); $self->input_id($output_id); return 1; } ###################################### # # subroutines # ##################################### sub get_params { my $self = shift; my $param_string = shift; return unless($param_string); print("parsing parameter string : ",$param_string,"\n"); my $params = eval($param_string); return unless($params); foreach my $key (keys %$params) { print(" $key : ", $params->{$key}, "\n"); } # from analysis parameters $self->{'method_link'} = $params->{'method_link'} if(defined($params->{'method_link'})); $self->{'query_genome_db_id'} = $params->{'query_genome_db_id'} if(defined($params->{'query_genome_db_id'})); $self->{'target_genome_db_id'} = $params->{'target_genome_db_id'} if(defined($params->{'target_genome_db_id'})); $self->{'height'} = $params->{'height'} if(defined($params->{'height'})); # from job input_id $self->{'collection_name'} = $params->{'collection_name'} if(defined($params->{'collection_name'})); return; } sub print_params { my $self = shift; print(" params:\n"); print(" method_link : ", $self->{'method_link'},"\n"); print(" query_genome_db_id : ", $self->{'query_genome_db_id'},"\n"); print(" target_genome_db_id : ", $self->{'target_genome_db_id'},"\n"); print(" height : ", $self->{'height'},"\n"); print(" collection_name : ", $self->{'collection_name'},"\n"); } sub filter_stack { my $self = shift; my $height = $self->{'height'}; my $dna_collection = $self->{'collection'}; $self->{'delete_group'} = {}; #all the genomic_align_blocks that need to be deleted my $mlss = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor->fetch_by_method_link_type_genome_db_ids($self->{'method_link'}, [$self->{'query_genome_db_id'}, $self->{'target_genome_db_id'}]); my $GAB_DBA = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor; # create a list of dnafrag here in case we want to make this runnable working with an input_id # containing a list of dnafrag_ids #my $dnafrag_list = [$self->{'comparaDBA'}->get_DnaFragAdaptor->fetch_by_dbID($self->{'dnafrag_id'})]; #my $dnafrag_id_list = $dna_collection->get_all_dnafrag_ids; my $dna_list = $dna_collection->get_all_dna_objects; foreach my $dna_object (@{$dna_list}) { my $dnafrag_chunk_array; if ($dna_object->isa('Bio::EnsEMBL::Compara::Production::DnaFragChunkSet')) { $dnafrag_chunk_array = $dna_object->get_all_DnaFragChunks; } else { $dnafrag_chunk_array = [$dna_object]; } foreach my $new_dna_object (@$dnafrag_chunk_array) { my $delete_group = {}; my $dnafrag = $new_dna_object->dnafrag; my $genomic_align_block_list = $GAB_DBA->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss, $dnafrag); #printf("dnafrag %s %s has %d GABs\n", $dnafrag->coord_system_name, $dnafrag->name, scalar(@$genomic_align_block_list)); #print $dnafrag->name, ": ", scalar(@$genomic_align_block_list), "\n"; if ($self->debug) { foreach my $gab (@{$genomic_align_block_list}) { $self->assign_jobID_to_genomic_align_block($gab); } } @$genomic_align_block_list = sort { $a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start} @$genomic_align_block_list; my $max_end = 0; my $blocks = []; foreach my $this_genomic_align_block (@$genomic_align_block_list) { #found all overlapping blocks if ($this_genomic_align_block->reference_genomic_align->dnafrag_start > $max_end) { if (@$blocks > 0) { $delete_group = $self->find_stack_coverage($blocks, $height, $delete_group); } $blocks = []; } push(@$blocks, $this_genomic_align_block); if ($this_genomic_align_block->reference_genomic_align->dnafrag_end > $max_end) { $max_end = $this_genomic_align_block->reference_genomic_align->dnafrag_end; } } if (@$blocks > 0) { $delete_group = $self->find_stack_coverage($blocks, $height, $delete_group); } $self->delete_alignments($GAB_DBA, $mlss, $delete_group); } } } sub assign_jobID_to_genomic_align_block { my $self = shift; my $gab = shift; my $sql = "SELECT analysis_job_id FROM genomic_align_block_job_track ". "WHERE genomic_align_block_id=?"; my $sth = $self->{'comparaDBA'}->prepare($sql); $sth->execute($gab->dbID); my ($job_id) = $sth->fetchrow_array(); $sth->finish; $gab->{'analysis_job_id'} = $job_id; } sub find_stack_coverage { my ($self, $blocks, $height, $delete_group) = @_; #find coverage ie min start and max end of overlapping blocks my ($min_start, $max_end); foreach my $this_block (@$blocks) { if (!$min_start or $min_start > $this_block->reference_genomic_align->dnafrag_start) { $min_start = $this_block->reference_genomic_align->dnafrag_start; } if (!$max_end or $max_end < $this_block->reference_genomic_align->dnafrag_end) { $max_end = $this_block->reference_genomic_align->dnafrag_end; } } #print "update_coverage min_start $min_start max_end $max_end\n"; #find number of overlapping blocks for each position in coverage my @coverage_by_pos; foreach my $this_block (@$blocks) { my $start = $this_block->reference_genomic_align->dnafrag_start; my $end = $this_block->reference_genomic_align->dnafrag_end; for (my $a = $start; $a <= $end; $a++) { $coverage_by_pos[$a-$min_start]++; } } my $threshold_limits; my $threshold_limit; #loop through each position in coverage for (my $a = 0; $a < @coverage_by_pos; $a++) { $coverage_by_pos[$a] ||= 0; #if number of gabs at this position is above threshold if ($coverage_by_pos[$a] > $height) { #find start of threshold coverage if (!defined $threshold_limit->{min}) { $threshold_limit->{min} = $a+$min_start; } } else { #coverage has fallen below threshold so store max as previous value if #min has already been stored. if (defined($threshold_limit->{min})) { $threshold_limit->{max} = $a+$min_start-1; push @$threshold_limits, $threshold_limit; #unset threshold_limit undef($threshold_limit); } } } #if the last item of coverage_by_pos is above the threshold, I won't have found it before in previous loop if (defined($threshold_limit->{min}) && !defined($threshold_limit->{max} && $coverage_by_pos[-1] > $height)) { $threshold_limit->{max} = @coverage_by_pos+$min_start-1; push @$threshold_limits, $threshold_limit; undef($threshold_limit); } #remove all blocks that are entirely covered by the threshold limits. foreach my $limit (@$threshold_limits) { foreach my $this_block (@$blocks) { my $start = $this_block->reference_genomic_align->dnafrag_start; my $end = $this_block->reference_genomic_align->dnafrag_end; if ($start >= $limit->{min} && $end <= $limit->{max}) { #print "delete gab $start $end " . $this_block->dbID . "\n"; push @{$delete_group->{$this_block->group_id}}, $this_block; } } } return $delete_group; } sub delete_alignments { my ($self, $GAB_DBA, $mlss, $delete_group) = @_; my @del_list; foreach my $group_id (keys %{$delete_group}) { my $group_gabs = $GAB_DBA->fetch_all_by_MethodLinkSpeciesSet_GroupID($mlss, $group_id); #Only delete a gab if all the gabs of a group are in $delete_group if (@$group_gabs == @{$delete_group->{$group_id}}) { #check the gab->dbIDs match my $error = 0; @$group_gabs = sort {$a->dbID<=>$b->dbID} @$group_gabs; @{$delete_group->{$group_id}} = sort {$a->dbID<=>$b->dbID} @{$delete_group->{$group_id}}; for (my $i = 0; $i < @$group_gabs; $i++) { #Hopefully this should never happen if ($group_gabs->[$i]->dbID != $delete_group->{$group_id}->[$i]->dbID) { warn "Inconsistent genomic_align_blocks in group $group_id \n"; $error = 1; } } if (!$error) { push @del_list, @$group_gabs; } } } #assume not many of these if (@del_list > 0) { my @gab_ids = map {$_->dbID} @del_list; my $sql_genomic_align = "DELETE FROM genomic_align WHERE genomic_align_block_id IN (" . join(",", @gab_ids) . ");"; my $sql_genomic_align_block = "DELETE FROM genomic_align_block WHERE genomic_align_block_id in (" . join(",", @gab_ids) . ");"; #foreach my $gab (@del_list) { # print("DELETE "); # print_gab($gab); #} my $sth_genomic_align = $self->{'comparaDBA'}->prepare($sql_genomic_align); $sth_genomic_align->execute; $sth_genomic_align->finish; my $sth_genomic_align_block = $self->{'comparaDBA'}->prepare($sql_genomic_align_block); $sth_genomic_align_block->execute; $sth_genomic_align->execute; } printf("%d gabs to delete\n", scalar(@del_list)); } sub print_gab { my $gab = shift; my ($gab_1, $gab_2) = @{$gab->genomic_align_array}; printf(" id(%d) %s:(%d)%d-%d %s:(%d)%d-%d score=%d\n", $gab->dbID, $gab_1->dnafrag->name, $gab_1->dnafrag_strand, $gab_1->dnafrag_start, $gab_1->dnafrag_end, $gab_2->dnafrag->name, $gab_2->dnafrag_strand, $gab_2->dnafrag_start, $gab_2->dnafrag_end, $gab->score); } 1;