Raw content of Bio::EnsEMBL::Compara::Production::EPOanchors::TrimStoreAnchors #Ensembl module for Bio::EnsEMBL::Compara::Production::EPOanchors::TrimStoreAnchors # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =head1 NAME Bio::EnsEMBL::Compara::Production::EPOanchors::TrimStoreAnchors =head1 SYNOPSIS $exonate_anchors->fetch_input(); $exonate_anchors->run(); $exonate_anchors->write_output(); writes to database =head1 DESCRIPTION Given a database with anchor sequences and a target genome. This modules exonerates the anchors against the target genome. The required information (anchor batch size, target genome file, exonerate parameters are provided by the analysis, analysis_job and analysis_data tables This modules is part of the Ensembl project Email compara@ebi.ac.uk =head1 CONTACT This modules is part of the EnsEMBL project () Questions can be posted to the ensembl-dev mailing list: ensembl-dev@ebi.ac.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # package Bio::EnsEMBL::Compara::Production::EPOanchors::TrimStoreAnchors; use strict; use Data::Dumper; use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor; use Bio::EnsEMBL::Hive::Process; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Analysis; use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor; use Bio::EnsEMBL::Registry; Bio::EnsEMBL::Registry->load_all; Bio::EnsEMBL::Registry->no_version_check(1); our @ISA = qw(Bio::EnsEMBL::Hive::Process); sub configure_defaults { my $self = shift; $self->gab_batch_size(1000); $self->N_cut_off_ratio(0.1); return 1; } sub fetch_input { my ($self) = @_; $self->configure_defaults(); $self->get_parameters($self->parameters); #create a Compara::DBAdaptor which shares the same DBI handle with $self->db (Hive DBAdaptor) $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc) or die "cant connect\n"; $self->{'comparaDBA'}->dbc->disconnect_if_idle(); $self->{'hiveDBA'} = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new(-DBCONN => $self->{'comparaDBA'}->dbc) or die "cant connect\n"; $self->{'hiveDBA'}->dbc->disconnect_if_idle(); return 1; } sub run { my ($self) = @_; #find all genomic_align_blocks with the minimum size and minimum number of seqs my %sql_statements = ( select_gerp_blocks => "SELECT genomic_align_block_id FROM genomic_align_block WHERE length >= ? AND method_link_species_set_id = ?", select_analysis_data => "SELECT data FROM analysis_data WHERE analysis_data_id = ?", select_genomic_aligns => "SELECT genomic_align_id, dnafrag_id, dnafrag_start, dnafrag_end, dnafrag_strand FROM genomic_align WHERE genomic_align_block_id = ?", select_genome_db_ids => "SELECT ss.genome_db_id, gdb.name FROM genome_db gdb INNER JOIN species_set ss ON gdb.genome_db_id = ss.genome_db_id WHERE species_set_id = (SELECT species_set_id FROM method_link_species_set WHERE method_link_species_set_id = ?)", select_dnafrag_info => "SELECT df.genome_db_id, df.dnafrag_id, df.name, df.coord_system_name FROM dnafrag df INNER JOIN genomic_align ga ON df.dnafrag_id = ga.dnafrag_id INNER JOIN genomic_align_block gab ON ga.genomic_align_block_id = gab.genomic_align_block_id WHERE gab.method_link_species_set_id = ? AND gab.length >= ? GROUP BY df.dnafrag_id", insert_anchor_seq => "INSERT INTO anchor_sequence (method_link_species_set_id, anchor_id, dnafrag_id, start, end, strand, sequence, length) VALUES (?,?,?,?,?,?,?,?)", ); foreach my$sql_statement(keys %sql_statements) {#prepare all the sql statements $sql_statements{$sql_statement} = $self->{'comparaDBA'}->dbc->prepare($sql_statements{$sql_statement}); } $sql_statements{select_genome_db_ids}->execute($self->method_link_species_set_id); my(%org_hash, $dnafrag_hashref); while(my@row = $sql_statements{select_genome_db_ids}->fetchrow_array) { $org_hash{$row[0]} = $row[1]; } foreach my$genome_db_id(sort keys %org_hash) { #get slice adaptors for all the species used to generate the anchors $org_hash{$genome_db_id} = Bio::EnsEMBL::Registry->get_adaptor("$org_hash{$genome_db_id}", "core", "Slice"); } $sql_statements{select_analysis_data}->execute($self->analysis_data_id); my $analysis_struct = eval ( ($sql_statements{select_analysis_data}->fetchrow_array)[0] ); $sql_statements{select_dnafrag_info}->execute($self->previous_mlssid, $analysis_struct->{min_anc_size}); $dnafrag_hashref = $sql_statements{select_dnafrag_info}->fetchall_hashref("dnafrag_id"); $sql_statements{select_gerp_blocks}->execute($analysis_struct->{min_anc_size}, $self->previous_mlssid); while(my@row = $sql_statements{select_gerp_blocks}->fetchrow_array) { my $genomic_align_block_id = $row[0]; $sql_statements{select_genomic_aligns}->execute($genomic_align_block_id); my $ga_ref = $sql_statements{select_genomic_aligns}->fetchall_hashref("genomic_align_id"); foreach my $ga_id(sort keys %{$ga_ref}) { my $dnafrag_id = $ga_ref->{$ga_id}->{dnafrag_id}; my $slice = $org_hash{ $dnafrag_hashref->{$dnafrag_id}->{genome_db_id} }->fetch_by_region( $dnafrag_hashref->{$dnafrag_id}->{coord_system_name}, $dnafrag_hashref->{$dnafrag_id}->{name}, $ga_ref->{$ga_id}->{dnafrag_start}, $ga_ref->{$ga_id}->{dnafrag_end}, $ga_ref->{$ga_id}->{dnafrag_strand}); next if $slice->length < $analysis_struct->{min_anc_size}; my($new_end, $new_seq); if($slice->length > $analysis_struct->{max_anc_size}) { #shorten the sequence if it's greater that the max length $new_seq = join("", (split("", $slice->seq))[0..$analysis_struct->{max_anc_size}-1]); $new_end = $ga_ref->{$ga_id}->{dnafrag_start} + $analysis_struct->{max_anc_size} - 1; } if($new_seq) { #skip if sequence has too many Ns, insert into db otherwise my $new_seq_Ns_len = length( join("", $new_seq=~/(N)/g)); next if ($new_seq_Ns_len && ( $new_seq_Ns_len / length($new_seq) > $self->N_cut_off_ratio )); $sql_statements{insert_anchor_seq}->execute($self->method_link_species_set_id, $genomic_align_block_id, $dnafrag_id, $ga_ref->{$ga_id}->{dnafrag_start}, $ga_ref->{$ga_id}->{dnafrag_end}, $ga_ref->{$ga_id}->{dnafrag_strand}, $new_seq, length($new_seq)); } else { my $slice_seq_Ns_len = length( join("", $slice->seq=~/(N)/g)); next if ($slice_seq_Ns_len && ( $slice_seq_Ns_len / length($slice->seq) > $self->N_cut_off_ratio )); $sql_statements{insert_anchor_seq}->execute($self->method_link_species_set_id, $genomic_align_block_id, $dnafrag_id, $ga_ref->{$ga_id}->{dnafrag_start}, $ga_ref->{$ga_id}->{dnafrag_end}, $ga_ref->{$ga_id}->{dnafrag_strand}, $slice->seq, length($slice->seq)); } } } return 1; } sub write_output { my ($self) = @_; return 1; } sub N_cut_off_ratio { my $self = shift; if (@_) { $self->{_N_cut_off_ratio} = shift; } return $self->{_N_cut_off_ratio}; } sub previous_mlssid { #this should be the Gerp mlssid my $self = shift; if (@_) { $self->{_previous_mlssid} = shift; } return $self->{_previous_mlssid}; } sub gab_batch_size { my $self = shift; if (@_) { $self->{_gab_batch_size} = shift; } return $self->{_gab_batch_size}; } sub analysis_data_id { my $self = shift; if (@_) { $self->{_analysis_data_id} = shift; } return $self->{_analysis_data_id}; } sub trim_and_store_analysis_id { my $self = shift; if (@_) { $self->{_trim_and_store_analysis_id} = shift; } return $self->{_trim_and_store_analysis_id}; } sub method_link_species_set_id { my $self = shift; if (@_){ $self->{_method_link_species_set_id} = shift; } return $self->{_method_link_species_set_id}; } sub get_parameters { my $self = shift; my $param_string = shift; return unless($param_string); my $params = eval($param_string); if(defined($params->{'method_link_species_set_id'})) { $self->method_link_species_set_id($params->{'method_link_species_set_id'}); } if(defined($params->{'analysis_data_id'})) { $self->analysis_data_id($params->{'analysis_data_id'}); } if(defined($params->{'previous_mlssid'})) { $self->previous_mlssid($params->{'previous_mlssid'}); } } sub get_input_id { my $self = shift; my $input_id_string = shift; return unless($input_id_string); print("parsing input_id string : ",$input_id_string,"\n"); return 1; } 1;