Raw content of Bio::EnsEMBL::Compara::Production::EPOanchors::FilterAnchors # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::EPOanchors::FilterAnchors =cut =head1 SYNOPSIS parameters {input_analysis_id=> ?,method_link_species_set_id=> ?,test_method_link_species_set_id=> ?} =cut =head1 DESCRIPTION There are 3 hard-coded filtering conditions for anchor removal: 1). Anchors which hit > 5 dnafrags in any one genome. 2). Anchors which hit the same dnafrag > 10 in any one genome. 3). Anchors which hit any one genome > 20 times. =cut =head1 CONTACT ensembl-compara@ebi.ac.uk =cut =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::EnsEMBL::Compara::Production::EPOanchors::FilterAnchors; use strict; use Data::Dumper; use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor; use Bio::EnsEMBL::Hive::Process; use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet; use Bio::EnsEMBL::Compara::Production::Anchors::AnchorAlign; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use constant MAX_NUMBER_OF_DNAFRAGS_HIT => 5; #max number of dnafrags hit per genome use constant MAX_NUMBER_OF_HITS_TO_SAME_DNAFRAG => 10; use constant MAX_NUMBER_OF_HITS_TO_ANY_ONE_GENOME => 20; our @ISA = qw(Bio::EnsEMBL::Hive::Process); sub configure_defaults { my $self = shift; $self->max_number_of_dnafrags_hit(MAX_NUMBER_OF_DNAFRAGS_HIT); $self->max_number_of_hits_to_same_dnafrag(MAX_NUMBER_OF_HITS_TO_SAME_DNAFRAG); $self->max_number_of_hits_to_any_one_genome(MAX_NUMBER_OF_HITS_TO_ANY_ONE_GENOME); return 0; } sub fetch_input { my ($self) = @_; $self->configure_defaults(); #create a Compara::DBAdaptor which shares the same DBI handle #with $self->db (Hive DBAdaptor) $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc); $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0); my $dnafrag_adaptor = $self->{'comparaDBA'}->get_DnaFragAdaptor(); my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor(); $self->get_params($self->parameters); $self->get_params($self->input_id); return 1; } sub run { my ($self) = @_; my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor(); my(@palindromic_anchors, $all_anchors, %anchor_hits2genomes_and_dnafrags, %anchors2remove); $self->anchor_ids_with_zero_strand( $anchor_align_adaptor->fetch_all_anchors_with_zero_strand( $self->test_method_link_species_set_id)); $all_anchors = $anchor_align_adaptor->fetch_all_anchor_ids_by_test_mlssid_and_genome_db_ids( $self->test_method_link_species_set_id, $self->genome_db_ids); foreach my $anchor (@{$all_anchors}) { my $dnafrags_and_genomedbs = $anchor_align_adaptor->fetch_dnafrag_and_genome_db_ids_by_test_mlssid( $self->test_method_link_species_set_id, $anchor->[0]); foreach my $genome_db_dnafrag(@{$dnafrags_and_genomedbs}) { $anchor_hits2genomes_and_dnafrags{$anchor->[0]}{$genome_db_dnafrag->[1]}{$genome_db_dnafrag->[0]}++; } } foreach my $anchor_id(sort keys %anchor_hits2genomes_and_dnafrags) { foreach my $genome_db_id(sort keys %{$anchor_hits2genomes_and_dnafrags{$anchor_id}}) { last if(exists($anchors2remove{$anchor_id})); my $num_hits_to_each_genome = 0; if(scalar(keys %{$anchor_hits2genomes_and_dnafrags{$anchor_id}{$genome_db_id}}) > $self->max_number_of_dnafrags_hit) { $anchors2remove{$anchor_id}++; # print "MAX_NUM_DNAFRAGS_HIT : $anchor_id\n"; last; } foreach my $dnafrag_id(%{$anchor_hits2genomes_and_dnafrags{$anchor_id}{$genome_db_id}}) { if($anchor_hits2genomes_and_dnafrags{$anchor_id}{$genome_db_id}{$dnafrag_id} > $self->max_number_of_hits_to_same_dnafrag) { $anchors2remove{$anchor_id}++; # print "MAX_NUM_HITS_TO_SAME_DNAFRAG : $anchor_id\n"; last; } else{ $num_hits_to_each_genome += $anchor_hits2genomes_and_dnafrags{$anchor_id}{$genome_db_id}{$dnafrag_id}; } } if($num_hits_to_each_genome > $self->max_number_of_hits_to_any_one_genome) { $anchors2remove{$anchor_id}++; # print "MAX_NUM_HITS_TO_ANY_ONE_GENOME : $anchor_id\n"; last; } } } $self->anchors2remove(\%anchors2remove); return 1; } sub write_output { my ($self) = @_; my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor(); print join(":", $self->input_analysis_id, $self->test_method_link_species_set_id), "\n"; $anchor_align_adaptor->update_zero_strand_anchors($self->anchor_ids_with_zero_strand, $self->input_analysis_id, $self->test_method_link_species_set_id); if(scalar (keys %{$self->anchors2remove})) { $anchor_align_adaptor->update_failed_anchor($self->anchors2remove, $self->input_analysis_id, $self->test_method_link_species_set_id); } else{ print "No anchors to remove\n"; } return 1; } sub anchor_ids_with_zero_strand { my $self = shift; if (@_) { $self->{anchor_ids_with_zero_strand} = shift; } return $self->{anchor_ids_with_zero_strand}; } sub anchors2remove { my $self = shift; if (@_) { $self->{anchors2remove} = shift; } return $self->{anchors2remove}; } sub max_number_of_dnafrags_hit { my $self = shift; if (@_) { $self->{max_number_of_dnafrags_hit} = shift; } return $self->{max_number_of_dnafrags_hit}; } sub max_number_of_hits_to_same_dnafrag { my $self = shift; if (@_) { $self->{max_number_of_hits_to_same_dnafrag} = shift; } return $self->{max_number_of_hits_to_same_dnafrag}; } sub max_number_of_hits_to_any_one_genome { my $self = shift; if (@_) { $self->{max_number_of_hits_to_any_one_genome} = shift; } return $self->{max_number_of_hits_to_any_one_genome}; } sub test_method_link_species_set_id { my $self = shift; if (@_) { $self->{test_method_link_species_set_id} = shift; } return $self->{test_method_link_species_set_id}; } sub method_link_species_set_id { my $self = shift; if (@_) { $self->{method_link_species_set_id} = shift; } return $self->{method_link_species_set_id}; } sub input_analysis_id { my $self = shift; if (@_) { $self->{input_analysis_id} = shift; } return $self->{input_analysis_id}; } sub genome_db_ids { my $self = shift; if (@_) { $self->{genome_db_ids} = shift; } return $self->{genome_db_ids}; } #sub method_link_type { # my $self = shift; # if (@_) { # $self->{method_link_type} = shift; # } # return $self->{method_link_type}; #} # #sub input_anchor_id { # my $self = shift; # if (@_) { # $self->{input_anchor_id} = shift; # } # return $self->{input_anchor_id}; #} sub get_params { my $self = shift; my $param_string = shift; return unless($param_string); print("parsing parameter string : ",$param_string,"\n"); my $params = eval($param_string); return unless($params); if(defined($params->{'test_method_link_species_set_id'})) { $self->test_method_link_species_set_id($params->{'test_method_link_species_set_id'}); } if(defined($params->{'method_link_species_set_id'})) { $self->method_link_species_set_id($params->{'method_link_species_set_id'}); } if(defined($params->{'input_analysis_id'})) { $self->input_analysis_id($params->{'input_analysis_id'}); } if(defined($params->{'genome_db_ids'})) { $self->genome_db_ids($params->{'genome_db_ids'}); } if(defined($params->{'method_link_type'})) { $self->method_link_type($params->{'method_link_type'}); } if(defined($params->{'input_anchor_id'})) { #same as anchor_id $self->input_anchor_id($params->{'input_anchor_id'}); } if(defined($params->{'anchor_id'})) { #same as input_anchor_id $self->input_anchor_id($params->{'anchor_id'}); } return 1; } #sub store_input { # my $self = shift; # # if (@_) { # $self->{store_input} = shift; # } # # return $self->{store_input}; #} 1;