TrimAnchorAlign documentation.

sub _dump_fasta {

  my $self = shift;

  my $all_anchor_aligns = $self->{'anchor_aligns'};
  $self->{tree_string} = "(";

  foreach my $anchor_align (@$all_anchor_aligns) {
    my $anchor_align_id = $anchor_align->dbID;
    $self->{tree_string} .= "aa$anchor_align_id:0.1,";
    my $file = $self->worker_temp_directory . "/seq" . $anchor_align_id . ".fa";

    open F, ">$file" || throw("Couldn't open $file");

    print F ">AnchorAlign", $anchor_align_id, "|", $anchor_align->dnafrag->name, ".",
        $anchor_align->dnafrag_start, "-", $anchor_align->dnafrag_end, ":",
        $anchor_align->dnafrag_strand, "\n";
    my $seq = $anchor_align->seq;
    if ($seq =~ /[^ACTGactgNnXx]/) {
      print STDERR "AnchorAlign $anchor_align_id contains at least one non-ACTGactgNnXx character. These have been replaced by N's\n";
      $seq =~ s/[^ACTGactgNnXx]/N/g;
    }
    $seq =~ s/(.{80})/$1\n/g;
    chomp $seq;
    print F $seq,"\n";

    close F;

    push @{$self->{'fasta_files'}}, $file;
  }
  substr($self->{tree_string}, -1, 1, ");");

  return 1;

}

sub fetch_input {

  my ($self) = @_;

  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
  $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
  $self->get_params($self->parameters);
  $self->get_params($self->input_id);

  if (!$self->anchor_id) {
    return 0;
  }
#   $self->{'fasta_files'} = [];
  my $anchor_align_adaptor = $self->{'comparaDBA'}->get_AnchorAlignAdaptor();
  ## This method returns a hash at the moment, not the objects
#   print "Fetching AnchorAligns for Anchor ", $self->{'anchor_id'}, " and MLSS ", $self->{'input_method_link_species_set_id'}, "\n";
  my $anchor_align_hash = $anchor_align_adaptor->fetch_all_by_anchor_id_and_mlss_id(
      $self->{'anchor_id'}, $self->{'input_method_link_species_set_id'});
  die "Cannot find any anchor_align with anchor_id = ". $self->{'anchor_id'}.
    " and method_link_species_set_id = ". $self->{'input_method_link_species_set_id'}
      if (!$anchor_align_hash);
  foreach my $anchor_align_id (keys %$anchor_align_hash) {
#     print "Fetching AnchorAlign $anchor_align_id\n";
    my $anchor_align = $anchor_align_adaptor->fetch_by_dbID($anchor_align_id);
    push(@{$self->{'anchor_aligns'}}, $anchor_align);
  }
  $self->_dump_fasta();
  exit(1) if (!$self->{'fasta_files'} or !@{$self->{'fasta_files'}});


  return 1;

}

sub get_best_trimming_position {

  my ($self, $msa_string) = @_;
  my $num_of_leaves = 1;

  ################################
  # Parse the multiple alignment
  ################################
  my @lines = split("\n", $msa_string);
  my $anchor_align_id;
  my $seqs;
  my $this_seq;
  foreach my $this_line (@lines) {
    if ($this_line =~ /score/) {
    } elsif ($this_line =~ /^>aa(\d+)$/) {
      $seqs->{$anchor_align_id} = $this_seq if ($anchor_align_id);
      $anchor_align_id = $1;
      $this_seq = "";
      $num_of_leaves++;
    } else {
      $this_seq .= $this_line;
    }
  }
  $seqs->{$anchor_align_id} = $this_seq;
  $self->{'aligned_sequences'} = $seqs;

  my $ideal_score = 4 * $num_of_leaves;

  #####################################
  # consensus seq, gaps, conservation
  #####################################
  my @gaps;
  my @consensus;
  my @conservation;
  foreach $this_seq (values %$seqs) {
    my @seq = split("", $this_seq);
    for (my $i = 0; $i < @seq; $i++) {
      if ($seq[$i] eq "-") {
        $gaps[$i]++;
      } elsif ($seq[$i] =~ /[ACTG]/) {
        $consensus[$i]->{$seq[$i]}++;
      }
    }
  }
  for (my $i=0; $i<@consensus; $i++) {
    my $cons_hash_ref = $consensus[$i];
    my $max = 0;
    my $consensus_nucleotide;
    while (my ($nucleotide, $this_count) = each %$cons_hash_ref) {
      if ($this_count > $max) {
        $max = $this_count;
        $consensus_nucleotide = $nucleotide;
      }
    }
    push(@conservation, $max);
    $consensus[$i] = $consensus_nucleotide;
  }

  ####################################################
  # Score each position according to previous values
  ####################################################
  my @final_scores;
  for (my $i = 0; $i < @consensus - 3; $i++) {
    my @these_bases = @consensus[$i..($i+3)];
    my @these_gaps = @gaps[$i..($i+3)];
    my @these_scores = @conservation[$i..($i+3)];
    my $total_score = 0;
    # Same as max score - 1 * $num_mismatches
    for (my $j=0; $j<4; $j++) {
      $total_score += $these_scores[$j];
    }
    # 3 points for every gap
    for (my $j=0; $j<4; $j++) {
      $total_score -= 3 * $these_gaps[$j];
    }
    my $all_bases;
    for (my $j=0; $j<4; $j++) {
      $all_bases->{$these_bases[$j]}++;
    }
    if ($these_bases[0] eq $these_bases[2] and
      # Avoid repetitive sequence
        $these_bases[1] eq $these_bases[3]) {
      $total_score -= $num_of_leaves;
      if ($these_bases[0] eq $these_bases[1]) {
        $total_score -= $num_of_leaves;
      }
    } elsif (scalar(keys %$all_bases) == 2) {
      # simple sequence (only 2 diff nucleotides in 4 bp)
      $total_score -= $num_of_leaves/2;
    } elsif (scalar(keys %$all_bases) == 4) {
      # Give an extra point to runs of all 4 diff nucleotides
      $total_score += 1;
    }
    push(@final_scores, $total_score);
#     print join(" : ", @these_bases, @these_gaps,@these_scores, $total_score), "\n";
  }

  my $max_score = 0;
  my $best_position = 0;
  for (my $i = 0; $i < @final_scores; $i++) {
    if ($final_scores[$i] > $max_score) {
      $max_score = $final_scores[$i];
      $best_position = $i+2;
    }
  }

  die "Cannot find a good position" if ($best_position == 0 or $max_score < 0.8 * $ideal_score);

  return $best_position;

}

sub get_params {

  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  print STDERR "parsing parameter string : ",$param_string,"\n";

  my $params = eval($param_string);
  return unless($params);

  if(defined($params->{'anchor_id'})) {
    $self->anchor_id($params->{'anchor_id'});
  }
  if(defined($params->{'input_method_link_species_set_id'})) {
    $self->input_method_link_species_set_id($params->{'input_method_link_species_set_id'});
  }
  if(defined($params->{'output_method_link_species_set_id'})) {
    $self->output_method_link_species_set_id($params->{'output_method_link_species_set_id'});
  }
  if(defined($params->{'method_link_species_set_id'})) {
    $self->method_link_species_set_id($params->{'method_link_species_set_id'});
  }

  return 1;

}

sub get_trimmed_anchor_aligns {

  my ($self, $best_position) = @_;
  my $trimmed_anchor_aligns;

  my $aligned_sequences = $self->{'aligned_sequences'};
# #   while (my ($align_anchor_id, $aligned_sequence) = each %$aligned_sequences) {
# #     print substr($aligned_sequence, 0, $best_position), " ** ", substr($aligned_sequence, $best_position),
# #         " :: $align_anchor_id\n";
# #   }
  my $all_anchor_aligns = $self->{'anchor_aligns'};
  foreach my $this_anchor_align (@$all_anchor_aligns) {
    my $this_anchor_align_id = $this_anchor_align->dbID;
    my $this_aligned_sequence = $aligned_sequences->{$this_anchor_align_id};
# #     print substr($this_aligned_sequence, 0, ($best_position -1)), " ** ",
# #         substr($this_aligned_sequence, $best_position-1, 2) , " ** ",
# #         substr($this_aligned_sequence, ($best_position + 1)),
# #         " ++ $this_anchor_align_id\n";
    my $seq_before = substr($this_aligned_sequence, 0, $best_position);
    my $seq_after = substr($this_aligned_sequence, $best_position);
    my $start = $this_anchor_align->dnafrag_start;
    my $end = $this_anchor_align->dnafrag_end;
    my ($count_before) = $seq_before =~ tr/ACTGactgNn/ACTGactgNn/;
    my ($count_after) = $seq_after =~ tr/ACTGactgNn/ACTGactgNn/;
    if ($count_before + $count_after != $end - $start + 1) {
      die "Wrong length $count_before $count_after $start $end";
    }
    if ($this_anchor_align->dnafrag_strand == 1) {
      $start += $count_before - 1;
      $end -= $count_after - 1;
    } elsif ($this_anchor_align->dnafrag_strand == -1) {
      $start += $count_after - 1;
      $end -= $count_before - 1;
    } else {
      die "Wrong strand: ".$this_anchor_align->dnafrag_strand;
    }
    # Check we get a 2bp long anchor_align
    if ($end - $start + 1 != 2) {
      die "Wrong length $start $end";
    }
    my $new_anchor_align;
    %$new_anchor_align = %$this_anchor_align;
    bless $new_anchor_align, ref($this_anchor_align);
    delete($new_anchor_align->{'_seq'});
    delete($new_anchor_align->{'_dbID'});
    $new_anchor_align->dnafrag_start($start);
    $new_anchor_align->dnafrag_end($end);
    $new_anchor_align->method_link_species_set_id($self->output_method_link_species_set_id);
    push(@$trimmed_anchor_aligns, $new_anchor_align);
  }

  return $trimmed_anchor_aligns;
}

1;

}

sub run {

  my $self = shift;
  exit(1) if (!$self->{'fasta_files'} or !@{$self->{'fasta_files'}});
  exit(1) if (!$self->{'tree_string'});

  my $run_str = "/software/ensembl/compara/OrtheusC/bin/OrtheusC";
  $run_str .= " -a " . join(" ", @{$self->{'fasta_files'}});
  $run_str .= " -b '" . $self->{'tree_string'} . "'";
  $run_str .= " -h"; # output leaves only

  $self->{'msa_string'} = qx""$run_str";

  my $trim_position = $self->get_best_trimming_position($self->{'msa_string'});
  $self->{'trimmed_anchor_aligns'} = $self->get_trimmed_anchor_aligns($trim_position);

  return 1;

}

_dump_fasta	Description	Code
anchor_id	No description	Code
fetch_input	No description	Code
get_best_trimming_position	No description	Code
get_params	No description	Code
get_trimmed_anchor_aligns	No description	Code
input_method_link_species_set_id	No description	Code
output_method_link_species_set_id	No description	Code
run	No description	Code
write_output	No description	Code