Raw content of Bio::EnsEMBL::Compara::Production::EPOanchors::TrimAnchorAlign # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Compara::Production::EPOanchors::TrimAnchorAlign =cut =head1 SYNOPSIS parameters {input_analysis_id=> ?,method_link_species_set_id=> ?,test_method_link_species_set_id=> ?, genome_db_ids => [?],} =cut =head1 DESCRIPTION Finds a good splitting point within the anchors. Leave the anchors as a 2bp long region. - fetch_input - run - write_output =cut =head1 CONTACT ensembl-dev@ebi.ac.uk =cut =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::EnsEMBL::Compara::Production::EPOanchors::TrimAnchorAlign; use strict; use Data::Dumper; use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor; use Bio::EnsEMBL::Hive::Process; use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet; use Bio::EnsEMBL::Compara::Production::EPOanchors::AnchorAlign; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Analysis::Runnable::Pecan; our @ISA = qw(Bio::EnsEMBL::Hive::Process); sub fetch_input { my ($self) = @_; $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc); $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0); $self->get_params($self->parameters); $self->get_params($self->input_id); if (!$self->anchor_id) { return 0; } # $self->{'fasta_files'} = []; my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor(); ## This method returns a hash at the moment, not the objects # print "Fetching AnchorAligns for Anchor ", $self->{'anchor_id'}, " and MLSS ", $self->{'input_method_link_species_set_id'}, "\n"; my $anchor_align_hash = $anchor_align_adaptor->fetch_all_by_anchor_id_and_mlss_id( $self->{'anchor_id'}, $self->{'input_method_link_species_set_id'}); die "Cannot find any anchor_align with anchor_id = ". $self->{'anchor_id'}. " and method_link_species_set_id = ". $self->{'input_method_link_species_set_id'} if (!$anchor_align_hash); foreach my $anchor_align_id (keys %$anchor_align_hash) { # print "Fetching AnchorAlign $anchor_align_id\n"; my $anchor_align = $anchor_align_adaptor->fetch_by_dbID($anchor_align_id); push(@{$self->{'anchor_aligns'}}, $anchor_align); } $self->_dump_fasta(); exit(1) if (!$self->{'fasta_files'} or !@{$self->{'fasta_files'}}); return 1; } sub run { my $self = shift; exit(1) if (!$self->{'fasta_files'} or !@{$self->{'fasta_files'}}); exit(1) if (!$self->{'tree_string'}); my $run_str = "/software/ensembl/compara/OrtheusC/bin/OrtheusC"; $run_str .= " -a " . join(" ", @{$self->{'fasta_files'}}); $run_str .= " -b '" . $self->{'tree_string'} . "'"; $run_str .= " -h"; # output leaves only $self->{'msa_string'} = qx"$run_str"; my $trim_position = $self->get_best_trimming_position($self->{'msa_string'}); $self->{'trimmed_anchor_aligns'} = $self->get_trimmed_anchor_aligns($trim_position); return 1; } sub write_output { my ($self) = @_; my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor(); foreach my $this_trimmed_anchor_align (@{$self->{'trimmed_anchor_aligns'}}) { $anchor_align_adaptor->store($this_trimmed_anchor_align); } return 1; } sub anchor_id { my $self = shift; if (@_) { $self->{anchor_id} = shift; } return $self->{anchor_id}; } sub input_method_link_species_set_id { my $self = shift; if (@_) { $self->{input_method_link_species_set_id} = shift; } return $self->{input_method_link_species_set_id}; } sub output_method_link_species_set_id { my $self = shift; if (@_) { $self->{output_method_link_species_set_id} = shift; } return $self->{output_method_link_species_set_id}; } sub get_params { my $self = shift; my $param_string = shift; return unless($param_string); print STDERR "parsing parameter string : ",$param_string,"\n"; my $params = eval($param_string); return unless($params); if(defined($params->{'anchor_id'})) { $self->anchor_id($params->{'anchor_id'}); } if(defined($params->{'input_method_link_species_set_id'})) { $self->input_method_link_species_set_id($params->{'input_method_link_species_set_id'}); } if(defined($params->{'output_method_link_species_set_id'})) { $self->output_method_link_species_set_id($params->{'output_method_link_species_set_id'}); } if(defined($params->{'method_link_species_set_id'})) { $self->method_link_species_set_id($params->{'method_link_species_set_id'}); } return 1; } =head2 _dump_fasta Arg [1] : -none- Example : $self->_dump_fasta(); Description: Dumps FASTA files in the order given by the tree string (needed by Pecan). Resulting file names are stored using the fasta_files getter/setter Returntype : 1 Exception : Warning : =cut sub _dump_fasta { my $self = shift; my $all_anchor_aligns = $self->{'anchor_aligns'}; $self->{tree_string} = "("; foreach my $anchor_align (@$all_anchor_aligns) { my $anchor_align_id = $anchor_align->dbID; $self->{tree_string} .= "aa$anchor_align_id:0.1,"; my $file = $self->worker_temp_directory . "/seq" . $anchor_align_id . ".fa"; open F, ">$file" || throw("Couldn't open $file"); print F ">AnchorAlign", $anchor_align_id, "|", $anchor_align->dnafrag->name, ".", $anchor_align->dnafrag_start, "-", $anchor_align->dnafrag_end, ":", $anchor_align->dnafrag_strand, "\n"; my $seq = $anchor_align->seq; if ($seq =~ /[^ACTGactgNnXx]/) { print STDERR "AnchorAlign $anchor_align_id contains at least one non-ACTGactgNnXx character. These have been replaced by N's\n"; $seq =~ s/[^ACTGactgNnXx]/N/g; } $seq =~ s/(.{80})/$1\n/g; chomp $seq; print F $seq,"\n"; close F; push @{$self->{'fasta_files'}}, $file; } substr($self->{tree_string}, -1, 1, ");"); return 1; } sub get_best_trimming_position { my ($self, $msa_string) = @_; my $num_of_leaves = 1; ################################ # Parse the multiple alignment ################################ my @lines = split("\n", $msa_string); my $anchor_align_id; my $seqs; my $this_seq; foreach my $this_line (@lines) { if ($this_line =~ /score/) { } elsif ($this_line =~ /^>aa(\d+)$/) { $seqs->{$anchor_align_id} = $this_seq if ($anchor_align_id); $anchor_align_id = $1; $this_seq = ""; $num_of_leaves++; } else { $this_seq .= $this_line; } } $seqs->{$anchor_align_id} = $this_seq; $self->{'aligned_sequences'} = $seqs; my $ideal_score = 4 * $num_of_leaves; ##################################### # consensus seq, gaps, conservation ##################################### my @gaps; my @consensus; my @conservation; foreach $this_seq (values %$seqs) { my @seq = split("", $this_seq); for (my $i = 0; $i < @seq; $i++) { if ($seq[$i] eq "-") { $gaps[$i]++; } elsif ($seq[$i] =~ /[ACTG]/) { $consensus[$i]->{$seq[$i]}++; } } } for (my $i=0; $i<@consensus; $i++) { my $cons_hash_ref = $consensus[$i]; my $max = 0; my $consensus_nucleotide; while (my ($nucleotide, $this_count) = each %$cons_hash_ref) { if ($this_count > $max) { $max = $this_count; $consensus_nucleotide = $nucleotide; } } push(@conservation, $max); $consensus[$i] = $consensus_nucleotide; } #################################################### # Score each position according to previous values #################################################### my @final_scores; for (my $i = 0; $i < @consensus - 3; $i++) { my @these_bases = @consensus[$i..($i+3)]; my @these_gaps = @gaps[$i..($i+3)]; my @these_scores = @conservation[$i..($i+3)]; my $total_score = 0; # Same as max score - 1 * $num_mismatches for (my $j=0; $j<4; $j++) { $total_score += $these_scores[$j]; } # 3 points for every gap for (my $j=0; $j<4; $j++) { $total_score -= 3 * $these_gaps[$j]; } my $all_bases; for (my $j=0; $j<4; $j++) { $all_bases->{$these_bases[$j]}++; } if ($these_bases[0] eq $these_bases[2] and # Avoid repetitive sequence $these_bases[1] eq $these_bases[3]) { $total_score -= $num_of_leaves; if ($these_bases[0] eq $these_bases[1]) { $total_score -= $num_of_leaves; } } elsif (scalar(keys %$all_bases) == 2) { # simple sequence (only 2 diff nucleotides in 4 bp) $total_score -= $num_of_leaves/2; } elsif (scalar(keys %$all_bases) == 4) { # Give an extra point to runs of all 4 diff nucleotides $total_score += 1; } push(@final_scores, $total_score); # print join(" : ", @these_bases, @these_gaps,@these_scores, $total_score), "\n"; } my $max_score = 0; my $best_position = 0; for (my $i = 0; $i < @final_scores; $i++) { if ($final_scores[$i] > $max_score) { $max_score = $final_scores[$i]; $best_position = $i+2; } } die "Cannot find a good position" if ($best_position == 0 or $max_score < 0.8 * $ideal_score); return $best_position; } sub get_trimmed_anchor_aligns { my ($self, $best_position) = @_; my $trimmed_anchor_aligns; my $aligned_sequences = $self->{'aligned_sequences'}; # # while (my ($align_anchor_id, $aligned_sequence) = each %$aligned_sequences) { # # print substr($aligned_sequence, 0, $best_position), " ** ", substr($aligned_sequence, $best_position), # # " :: $align_anchor_id\n"; # # } my $all_anchor_aligns = $self->{'anchor_aligns'}; foreach my $this_anchor_align (@$all_anchor_aligns) { my $this_anchor_align_id = $this_anchor_align->dbID; my $this_aligned_sequence = $aligned_sequences->{$this_anchor_align_id}; # # print substr($this_aligned_sequence, 0, ($best_position -1)), " ** ", # # substr($this_aligned_sequence, $best_position-1, 2) , " ** ", # # substr($this_aligned_sequence, ($best_position + 1)), # # " ++ $this_anchor_align_id\n"; my $seq_before = substr($this_aligned_sequence, 0, $best_position); my $seq_after = substr($this_aligned_sequence, $best_position); my $start = $this_anchor_align->dnafrag_start; my $end = $this_anchor_align->dnafrag_end; my ($count_before) = $seq_before =~ tr/ACTGactgNn/ACTGactgNn/; my ($count_after) = $seq_after =~ tr/ACTGactgNn/ACTGactgNn/; if ($count_before + $count_after != $end - $start + 1) { die "Wrong length $count_before $count_after $start $end"; } if ($this_anchor_align->dnafrag_strand == 1) { $start += $count_before - 1; $end -= $count_after - 1; } elsif ($this_anchor_align->dnafrag_strand == -1) { $start += $count_after - 1; $end -= $count_before - 1; } else { die "Wrong strand: ".$this_anchor_align->dnafrag_strand; } # Check we get a 2bp long anchor_align if ($end - $start + 1 != 2) { die "Wrong length $start $end"; } my $new_anchor_align; %$new_anchor_align = %$this_anchor_align; bless $new_anchor_align, ref($this_anchor_align); delete($new_anchor_align->{'_seq'}); delete($new_anchor_align->{'_dbID'}); $new_anchor_align->dnafrag_start($start); $new_anchor_align->dnafrag_end($end); $new_anchor_align->method_link_species_set_id($self->output_method_link_species_set_id); push(@$trimmed_anchor_aligns, $new_anchor_align); } return $trimmed_anchor_aligns; } 1;