LowCoverageGenomeAlignment documentation.

Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::LowCoverageGenomeAlignment

No package variables defined.

This module acts as a layer between the Hive system and the Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment module since the ensembl-analysis API does not know about ensembl-compara
This module is the alternative module to Ortheus.pm for aligning low coverage (2X) genomes. This takes an existing high coverage alignment and maps the pairwise BlastZ-Net alignments of the low coverage genomes to the human sequence in the high coverage alignment. Any insertions in the low coverage alignments are removed, that is, no gaps are added to the human sequence. In regions where there are duplications, a phylogenetic tree is generated using either treeBest where there are more than 3 sequences in the alignment or semphy where there are 3 or less sequences. This module is still under development.

sub _add_to_cluster {

    my ($cluster, $region1) = @_;

    if (!defined $cluster) {
	 $cluster->[0]->{$region1} = 1;
     }
    return $cluster;

}

sub _add_to_different_cluster {

     my ($cluster, $region1, $region2) = @_;

     if (!defined $cluster) {
	 $cluster->[0]->{$region1} = 1;
	 $cluster->[1]->{$region2} = 1;
	 return $cluster;
     }
     my $cluster_size = @$cluster;

     my $index1 = _in_cluster($cluster, $region1);
     my $index2 = _in_cluster($cluster, $region2);

     if ($index1 == -1) {
	 $cluster->[@$cluster]->{$region1} = 1;
     } 
     if ($index2 == -1) {
	 $cluster->[@$cluster]->{$region2} = 1;
     }

     return $cluster;

}

sub _add_to_same_cluster {

     my ($cluster, $region1, $region2) = @_;

     #print "add to same cluster $region1 $region2\n";

     if (!defined $cluster) {
	 $cluster->[0]->{$region1} = 1;
	 $cluster->[0]->{$region2} = 1;
	 return $cluster;
     }

     my $cluster_size = @$cluster;

     my $index1 = _in_cluster($cluster, $region1);
     my $index2 = _in_cluster($cluster, $region2);

     if ($index1 == -1 && $index2 == -1) {
	 #neither found, add both to new cluster
	 $cluster->[$cluster_size]->{$region1} = 1;
	 $cluster->[$cluster_size]->{$region2} = 1;
     } elsif ($index1 != -1 && $index2 == -1) {
	 #id1 found, id2 not. add id2 to id1 cluster
	 $cluster->[$index1]->{$region2} = 1;
     } elsif ($index1 == -1 && $index2 != -1) {
	 #id2 found, id1 not. add id1 to id2 cluster
	 $cluster->[$index2]->{$region1} = 1;
     } else {
	 #both ids set in different clusters. Merge the clusters together
	 $cluster = _merge_clusters($cluster, $index1, $index2);
     }
     return $cluster;

}

sub _build_tree_string {

  my $self = shift;

  my $tree = $self->get_species_tree->copy;
  return if (!$tree);
  
  $tree = $self->_update_tree_2x($tree);

  return if (!$tree);

  my $tree_string = $tree->newick_simple_format;


  # Remove quotes around node labels
  $tree_string =~ s/"(seq\d+)"/$1/g;
  # Remove branch length if 0
  $tree_string =~ s/\:0\.0+(\D)/$1/g;
  $tree_string =~ s/\:0([^\.\d])/$1/g;

  $tree->release_tree;

  $self->tree_string($tree_string);

}

sub _cluster_regions {

    my ($self, $found_overlap) = @_;

    my $overlap_cnt = 0;
    my $not_overlap_cnt = 0;

    my $cluster;

    foreach my $region1 (keys %$found_overlap) {
	foreach my $region2 (keys %{$found_overlap->{$region1}}) {
	    print "FOUND OVERLAP $region1 $region2 " . $found_overlap->{$region1}{$region2} . "\n" if $self->debug;
	    if ($found_overlap->{$region1}{$region2}) {
		$overlap_cnt++;

		$cluster = _add_to_same_cluster($cluster, $region1, $region2);
	    } else {
		$not_overlap_cnt++;
		$cluster = _add_to_different_cluster($cluster, $region1, $region2);
	    }
	}
    }
    print "overlap_cnt $overlap_cnt not $not_overlap_cnt\n" if $self->debug;
    return $cluster;
}

#add single region to cluster. No overlaps found.

}

sub _create_frag_array {

    my ($self, $gab_adaptor, $pairwise_mlss, $ref_gas) = @_;

    my $ga_frag_array;

    my $ga_num_ns = 0;

    #Multiple alignment reference genomic_aligns (maybe more than 1)
    foreach my $ref_ga (@$ref_gas) {
	print "  " . $ref_ga->dnafrag->name . " " . $ref_ga->dnafrag_start . " " . $ref_ga->dnafrag_end . " " . $ref_ga->dnafrag_strand . "\n" if $self->debug;
	
	#find the slice corresponding to the ref_genome
	my $slice = $ref_ga->get_Slice;

	print "ref_seq " . $slice->start . " " . $slice->end . " " . $slice->strand . " " . substr($slice->seq,0,120) . "\n" if $self->debug;

	#find the pairwise blocks between ref_genome and the 2x genome
	my $pairwise_gabs = $gab_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice($pairwise_mlss, $slice, undef,undef,"restrict");
	
	#sort by reference_genomic_align start position (NB I sort again when parsing
	#the results if the ref strand is reverse since the fragments will be in the
	#reverse order ie A-B-C should be C-B-A). Don't do it here because I try to find
	#duplicates in load_2XGenomes.
	@$pairwise_gabs = sort {$a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start} @$pairwise_gabs;

	print "    pairwise gabs " . scalar(@$pairwise_gabs) . "\n" if $self->debug;
	#if there are no pairwise matches found to 2x genome, then escape
	#back to loop
	next if (scalar(@$pairwise_gabs) == 0);
	
	my $ga_frags;

	#need to save each match separately but still use same structure as
	#create_span_frag_array in case we change our minds back again

	foreach my $pairwise_gab (@$pairwise_gabs) {

	    #should only have 1!
	    my $ga = $pairwise_gab->get_all_non_reference_genomic_aligns->[0];

	    my $ref_start = $ga->genomic_align_block->reference_genomic_align->dnafrag_start;
	    my $ref_end = $ga->genomic_align_block->reference_genomic_align->dnafrag_end;

	    #need to store gab too otherwise it goes out of scope and I can't
	    #access it from ga
	    my $ga_fragment = {genomic_align => $ga,
			       genomic_align_block => $ga->genomic_align_block,
			       taxon_id => $ga->genome_db->taxon_id,
			       genome_db_id => $ga->dnafrag->genome_db_id,
			       ref_ga => $ref_ga,
			      };

	    print "GAB " . $ga_fragment->{genomic_align}->genome_db->name . " " . $ga_fragment->{genomic_align}->dnafrag_start . " " . $ga_fragment->{genomic_align}->dnafrag_end . " " . $ga_fragment->{genomic_align}->dnafrag_strand . " " . substr($ga_fragment->{genomic_align}->get_Slice->seq,0,120) . "\n" if $self->debug;
	    push @$ga_frags, $ga_fragment;
	}
	#add to array of fragments for each reference genomic_align
	push @$ga_frag_array, $ga_frags;
    }
    return $ga_frag_array;
}


#foreach cluster, find the longest region.

}

sub _create_mfa {

    my ($self) = @_;

    my $multi_gab_id = $self->genomic_align_block_id;
    my $multi_gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
    my $multi_gab = $multi_gaba->fetch_by_dbID($multi_gab_id);
    my $multi_gas = $multi_gab->get_all_GenomicAligns;

    my $pairwise_frags = $self->{ga_frag};

    my $species_order = $self->species_order;

    
    foreach my $ga_frag_array (@$pairwise_frags) {
	my $multi_ref_ga = $ga_frag_array->[0]->{ref_ga};
	my $multi_mapper = $multi_ref_ga->get_Mapper;
	#my $multi_gab_length = length($multi_ref_ga->aligned_sequence);
	my $multi_gab_length = $multi_ref_ga->genomic_align_block->length;

	## New aligned sequence for the 2X genome. Empty (only dashes) at creation time
	my $aligned_sequence = "-" x $multi_gab_length;
	
	foreach my $ga_frag (@$ga_frag_array) {
	    my $pairwise_gab = $ga_frag->{genomic_align_block};
	    
	    my $pairwise_non_ref_ga = $pairwise_gab->get_all_non_reference_genomic_aligns->[0];
	    my $pairwise_ref_ga = $pairwise_gab->reference_genomic_align;
	    
	    my $pairwise_fixed_seq = $pairwise_non_ref_ga->aligned_sequence("+FIX_SEQ");
	    
	    #undef($pairwise_non_ref_ga->{'aligned_sequence'});

	    print "pairwise " . $pairwise_non_ref_ga->genome_db->name . " " . substr($pairwise_fixed_seq,0,120) . "\n" if $self->debug;
	    my $depad = $pairwise_fixed_seq;
	    $depad =~ tr/-//d;
	    #print "depad length " . length($depad) . "\n";

	    #my $deletion_array = find_ref_deletions($pairwise_ref_ga);
	    my $deletion_array = find_ref_deletions($pairwise_gab);

	    my @coords = $multi_mapper->map_coordinates("sequence",
							$pairwise_ref_ga->dnafrag_start,
							$pairwise_ref_ga->dnafrag_end,
							$pairwise_ref_ga->dnafrag_strand,
							    "sequence");
	    my $length = 0;
	    my $start = undef;
	    my $end = undef;
	    foreach my $this_coord (@coords) {
		next if ($this_coord->isa("Bio::EnsEMBL::Mapper::Gap"));
		$length += $this_coord->length;
		## Extract the first N characters from $other_fixed_seq
		my $subseq = substr($pairwise_fixed_seq, 0, $this_coord->length, "");
		## Copy extracted characters into the new aligned sequence for the 2X genome.
		substr($aligned_sequence, $this_coord->start-1, $this_coord->length, $subseq);
		$start = $this_coord->start if (!defined($start) or $this_coord->start < $start);
		$end = $this_coord->end if (!defined($end) or $this_coord->end > $end);
	    }

	    $ga_frag->{deletions} = $deletion_array;
	    $ga_frag->{length} = length($depad);

	    $ga_frag_array->[0]->{cigar_line} = undef;
	    $ga_frag_array->[0]->{aligned_seq} = $aligned_sequence;
	} 
	#for (my $x = 0; $x < length($multi_ref_ga->aligned_sequence); $x += 80) {
	#   print substr($aligned_sequence, $x, 80), "\n";
	#  print substr($multi_ref_ga->aligned_sequence, $x, 80), "\n\n";
	#}
    }
}


#find deletions in reference species and store the position in slice coords
#and the length

}

sub _dump_2x_fasta {

    my ($self, $ga_frags, $file, $seq_id, $mfa_fh) = @_;

    $file .= "_" . $ga_frags->[0]->{taxon_id} . ".fa";

    open F, ">$file" || throw("Couldn't open $file");

    print F ">SeqID" . $seq_id . "\n";
    #print MFA ">SeqID" . $seq_id . "\n";
    #print MFA ">seq" . $seq_id . "\n";
    print MFA ">seq" . $seq_id . "_" . $ga_frags->[0]->{taxon_id} . "\n";
    my $aligned_seq = $ga_frags->[0]->{aligned_seq};
    my $seq = $aligned_seq;
    $seq =~ tr/-//d;
    print F "$seq\n";
    close F;

    $aligned_seq =~ s/(.{60})/$1\n/g;
    $aligned_seq =~ s/\n$//;
    print $mfa_fh $aligned_seq, "\n";

    push @{$self->fasta_files}, $file;
    
    push @{$self->species_order}, $ga_frags->[0]->{genome_db_id};

}

#create alignment from multiple genomic_align_block and 2X genomes.

}

sub _dump_fasta_and_mfa {

  my $self = shift;

  my $all_genomic_aligns = $self->genomic_aligns;


  ## Dump FASTA files in the order given by the tree string (needed by Pecan)
  my @seqs;
  if ($self->tree_string) {
    @seqs = ($self->tree_string =~ /seq(\d+)/g);
  } else {
    @seqs = (1..scalar(@$all_genomic_aligns));
  }

  my $mfa_file = $self->worker_temp_directory . "/epo_alignment.$$.mfa";
  $self->{multi_fasta_file} = $mfa_file;

  print "mfa_file $mfa_file\n" if $self->debug;
  open MFA, ">$mfa_file" || throw("Couldn't open $mfa_file");

  foreach my $seq_id (@seqs) {
    my $ga = $all_genomic_aligns->[$seq_id-1];

    my $file = $self->worker_temp_directory . "/seq" . $seq_id;

    #Check if I have a DnaFragRegion object or my 2x genome object
    #if (!UNIVERSAL::isa($dfr, 'Bio::EnsEMBL::Compara::DnaFragRegion')) {
    if (!UNIVERSAL::isa($ga, 'Bio::EnsEMBL::Compara::GenomicAlign')) {
	print "FOUND 2X GENOME\n" if $self->debug;
	print "num of frags " . @$ga . "\n" if $self->debug;
	$self->_dump_2x_fasta($ga, $file, $seq_id,\* MFA);
	next;
    }

    #add taxon_id to end of fasta files
    $file .= "_" . $ga->genome_db->taxon_id . ".fa";
    print "file $file\n" if $self->debug;
    


    open F, ">$file" || throw("Couldn't open $file");


    print F ">SeqID" . $seq_id . "\n";
    #print MFA ">SeqID" . $seq_id . "\n";

    #print MFA ">seq" . $seq_id . "\n";
    print MFA ">seq" . $seq_id . "_" . $ga->genome_db->taxon_id . "\n";

    print ">DnaFrag", $ga->dnafrag->dbID, "|", $ga->dnafrag->name, ".",
        $ga->dnafrag_start, "-", $ga->dnafrag_end, ":", $ga->dnafrag_strand,"\n" if $self->debug;

    my $slice = $ga->get_Slice;
    throw("Cannot get slice for DnaFragRegion in DnaFrag #".$ga->dnafrag->dbID) if (!$slice);
    
    my $seq = $slice->get_repeatmasked_seq(undef, 1)->seq;

    if ($seq =~ /[^ACTGactgNnXx]/) {
      print STDERR $slice->name, " contains at least one non-ACTGactgNnXx character. These have been replaced by N's\n";
      $seq =~ s/[^ACTGactgNnXx]/N/g;
    }
    $seq =~ s/(.{80})/$1\n/g;

    chomp $seq;
    print F $seq,"\n";

    close F;

    my $aligned_seq = $ga->aligned_sequence;
    $aligned_seq =~ s/(.{60})/$1\n/g;
    $aligned_seq =~ s/\n$//;
    print MFA $aligned_seq, "\n";

    push @{$self->fasta_files}, $file;
    push @{$self->species_order}, $ga->dnafrag->genome_db_id;
  }
  close MFA;

  return 1;

}

sub _extract_sequence {

    my ($seq, $aligned_start, $seq_length) = @_;
    my $curr_length = 0;
    my $aligned_length = 0;
    #my $aligned_start;
    my $aligned_end;

    #print "original_start $aligned_start length $seq_length\n";

    #create new seq starting from aligned_start to the end
    my $new_seq = substr($seq, $aligned_start);

    #find the end in alignment coords counting seq_length bases.
    foreach my $subseq (grep {$_} split /(\-+)/, $new_seq) {
	#print "subseq $subseq\n";
	my $length = length($subseq);
	if ($subseq !~ /\-/) {
	    if (!defined($aligned_end) && ($curr_length + $length >= $seq_length)) {
		$aligned_end = $aligned_length + ($seq_length - $curr_length) + $aligned_start;
		#print "aligned_end $aligned_end\n";
		last;
	    }
	    #length in bases
	    $curr_length += $length;
	}
	#length in alignment coords
	$aligned_length += $length;
    }
    
    my $subseq = substr($seq, $aligned_start, ($aligned_end-$aligned_start));
    return ($subseq, $aligned_start, $aligned_end);
}

##########################################
#
# getter/setter methods
# 
##########################################

}

sub _find_longest_region_in_cluster {

    my ($self, $cluster, $sum_lengths) = @_;

    my $max_frag = 0;
    my $final_region = -1;
    my @overlap_frag;

    my $overlap_cnt = 0;
    my $not_overlap_cnt = 0;
    my $longest_clusters;

    foreach my $this_cluster (@$cluster) {
	my $longest_frag;
	my $longest_region;

	foreach my $region (keys %{$this_cluster}) {
	    #initialise variables
	    if (!defined $longest_frag) {
		$longest_frag = $sum_lengths->[$region];
		$longest_region = $region;
	    }

 	    if ($sum_lengths->[$region] >= $longest_frag) {
 		$longest_frag = $sum_lengths->[$region];
 		$longest_region = $region;
 	    }
	}
	push @$longest_clusters, $longest_region;
    }
    print "overlap_cnt $overlap_cnt not $not_overlap_cnt\n" if $self->debug;
    return $longest_clusters;
}

#Put overlapping regions in the same cluster. If region 0 overlaps with region 
#1 and region 2, but not with region 3, create 2 clusters: (0,1,2), (3)

}

sub _in_cluster {

     my ($cluster, $region) = @_;

     for (my $i = 0; $i < @$cluster; $i++) {
 	if ($cluster->[$i]->{$region}) {
 	    return $i;
 	}
     }
     return -1;

}

sub _load_2XGenomes {

  my ($self, $genomic_align_block_id, $analysis_data_id) = @_;

  #get data from analysis_data table
  my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
  my @parameters = split (" ",$analysis_data_adaptor->fetch_by_dbID($analysis_data_id));

  #if no 2x genomes defined, return
  if (scalar(@parameters) == 0) {
      print "No 2x genomes to load\n" if $self->debug;
      return;
  }

  #Find the slice on the reference genome
  my $genome_db_adaptor = $self->{'comparaDBA'}->get_GenomeDBAdaptor;

  #DEBUG this opens up connections to all the databases
  my $ref_genome_db = $genome_db_adaptor->fetch_by_name_assembly($self->reference_species);
  my $ref_dba = $ref_genome_db->db_adaptor;
  my $ref_slice_adaptor = $ref_dba->get_SliceAdaptor();

  #Get multiple alignment genomic_align_block adaptor
  my $multi_gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;

  #Find all the dnafrag_regions for the reference genome in this synteny region
  my $ref_gas =[];
  my $multi_gab = $multi_gaba->fetch_by_dbID($genomic_align_block_id);
  my $all_gas = $multi_gab->get_all_GenomicAligns;

  foreach my $ga (@$all_gas) {
      if ($ga->genome_db->dbID == $ref_genome_db->dbID) {
	  push @$ref_gas, $ga;
      }
  }
  
  #Return if there is no reference sequence in this gab region
  if (scalar(@$ref_gas) == 0) {
      print "No " . $self->reference_species . " sequences found in genomic_align_block $genomic_align_block_id\n";
      return;
  }

  print "GAB $genomic_align_block_id num ref copies " . scalar(@$ref_gas) . "\n" if $self->debug;

  #Find the BLASTZ_NET alignments between the reference species and each
  #2X genome.
  foreach my $params (@parameters) {
      my $param = eval($params);
      my $target_species;

      #open compara database containing 2x genome vs $ref_name blastz results
      my $compara_db_url = $param->{'compara_db_url'};

      #if the database name is defined in the url, then open that
      my $compara_dba;
      my $locator;
      if ($compara_db_url =~ /mysql:\/\/.*@.*\/.+/) {
	  #open database defined in url
	  $locator = "Bio::EnsEMBL::Compara::DBSQL::DBAdaptor/url=>$compara_db_url";
      } else {
	  throw "Invalid url $compara_db_url. Should be of the form: mysql://user:pass\@host:port/db_name\n";
      }

      $compara_dba = Bio::EnsEMBL::DBLoader->new($locator);

      #need to store this to allow disconnect when call ortheus
      $self->{pairwise_compara_dba}->{$compara_dba->dbc->dbname} = $compara_dba;
      #Get pairwise genomic_align_block adaptor
      my $pairwise_gaba = $compara_dba->get_GenomicAlignBlockAdaptor;

      #Get pairwise method_link_species_set
      my $p_mlss_adaptor = $compara_dba->get_MethodLinkSpeciesSetAdaptor;
      my $pairwise_mlss = $p_mlss_adaptor->fetch_by_dbID($param->{'method_link_species_set_id'});

      #find non_reference species name in pairwise alignment
      my $species_set = $pairwise_mlss->species_set;
      foreach my $genome_db (@$species_set) {
	  if ($genome_db->name ne $self->reference_species) {
	      $target_species = $genome_db->name;
	      last;
	  }
      }
     
      my $target_genome_db = $genome_db_adaptor->fetch_by_name_assembly($target_species);
      my $target_dba = $target_genome_db->db_adaptor;
      my $target_slice_adaptor = $target_dba->get_SliceAdaptor();

      #Foreach copy of the ref_genome in the multiple alignment block, 
      #find the alignment blocks between the ref_genome and the 2x 
      #target_genome in the pairwise database

      my $ga_frag_array = $self->_create_frag_array($pairwise_gaba, $pairwise_mlss, $ref_gas);
  
      #not found 2x genome
      next if (!defined $ga_frag_array);

      #must first sort so I have a reasonable chance of finding duplicates


      #4/10/08 don't think sorting here is a good idea. The order is very
      #important for when I store the fragments and work out their offsets.
#      for (my $i = 0; $i < scalar(@$ga_frag_array); $i++) {
#	  @{$ga_frag_array->[$i]} = sort {$a->{genomic_align}->dnafrag_start <=> $b->{genomic_align}->dnafrag_start} @{$ga_frag_array->[$i]};
#      }
    
      #find the total length of all the fragments in the ref_region
      my $sum_lengths;
      for (my $i = 0; $i < scalar(@$ga_frag_array); $i++) {
	  for (my $j = 0; $j < scalar(@{$ga_frag_array->[$i]}); $j++) {
	      #print "*** gab *** " . $ga_frag_array->[$i][$j]->{genomic_align}->genome_db->name . " " . $ga_frag_array->[$i][$j]->{genomic_align}->genomic_align_block . "\n";

	      $sum_lengths->[$i] += ($ga_frag_array->[$i][$j]->{genomic_align}->dnafrag_end - $ga_frag_array->[$i][$j]->{genomic_align}->dnafrag_start + 1);
	  }
      }

      #check if there is any overlap between pairwise blocks on the ref_genomes
      #if there is an overlap, then choose ref_genome duplication which is the 
      #longest in 2x genome
      #if there is no overlap, save dnafrags on all duplications
      my $found_overlap;
      my $j = 0;
    
      #Simple case: only found one reference region containing 2x genome 
      #fragments
      if (@$ga_frag_array == 1) {
	  my $cluster;
	  $cluster = _add_to_cluster($cluster, 0);
	  _print_cluster($cluster) if $self->debug;
	  my $longest_ref_region = 0;

	  print "SIMPLE CASE: longest_region $longest_ref_region length " . $sum_lengths->[$longest_ref_region] . "\n" if $self->debug;
	  
	  #_build_2x_composite_seq($self, $compara_dba, $ref_slice_adaptor, $target_slice_adaptor, $ga_frag_array->[$longest_ref_region]);
	  
	  push @{$self->{ga_frag}}, $ga_frag_array->[$longest_ref_region];
	  push @{$self->{'2x_dnafrag_region'}}, $ga_frag_array->[$longest_ref_region]->[0]->{genomic_align};
	  next;
      }

      #Found more than one reference region in this synteny block
      for (my $region1 = 0; $region1 < scalar(@$ga_frag_array)-1; $region1++) {
	  for (my $region2 = $region1+1; $region2 <  scalar(@$ga_frag_array); $region2++) {
	      #initialise found_overlap hash
	      if (!defined $found_overlap->{$region1}{$region2}) {
		  $found_overlap->{$region1}{$region2} = 0;
	      }
	      
	      #loop through the 2x genome fragments on region1
	      for (my $j = 0; ($j < @{$ga_frag_array->[$region1]}); $j++) {
		  
		  #if I've already found an overlap, then stop
		  last if ($found_overlap->{$region1}{$region2});
		  
		  #loop through 2x genome fragments on region2
		  for (my $k = 0; ($k < @{$ga_frag_array->[$region2]}); $k++) {

		    #if I've already found an overlap, then stop
		      last if ($found_overlap->{$region1}{$region2});
		      
		      #check if 2x genome fragments have the same name
		      if ($ga_frag_array->[$region1][$j]->{seq_region_name} eq $ga_frag_array->[$region2][$k]->{seq_region_name}) {
			  
			  #check these overlap
			  if (($ga_frag_array->[$region1][$j]->{start} <= $ga_frag_array->[$region2][$k]->{end}) && ($ga_frag_array->[$region1][$j]->{end} >= $ga_frag_array->[$region2][$k]->{start})) {

			      $found_overlap->{$region1}{$region2} = 1;
			      print "found overlap $region1 $region2\n" if $self->debug;
			      #found overlap so can stop.
			      last;
			  }
		      }
		  }
	      }
	  }
      }

      #Cluster all the alignment blocks that are overlapping together
      my $cluster = $self->_cluster_regions($found_overlap);
      _print_cluster($cluster) if $self->debug;
      my $longest_regions = $self->_find_longest_region_in_cluster($cluster, $sum_lengths);

      #find the reference with the longest region
      my $slice_array;
      foreach my $longest_ref_region (@$longest_regions) {
	  print "longest_ref_region $longest_ref_region length " . $sum_lengths->[$longest_ref_region] . "\n" if $self->debug;

	  #store composite_seq in ga_frag_array->[$longest_ref_region]
	  #_build_2x_composite_seq($self, $compara_dba, $ref_slice_adaptor, $target_slice_adaptor, $ga_frag_array->[$longest_ref_region]);
	  push @{$self->{ga_frag}}, $ga_frag_array->[$longest_ref_region];

	  push @{$self->{'2x_dnafrag_region'}}, $ga_frag_array->[$longest_ref_region]->[0]->{genomic_align};

      }
  }

}

sub _load_GenomicAligns {

  my ($self, $genomic_align_block_id) = @_;
  my $genomic_aligns = [];

  # Fail if dbID has not been provided
  return $genomic_aligns if (!$genomic_align_block_id);

  my $gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
  my $gab = $gaba->fetch_by_dbID($genomic_align_block_id);

  foreach my $ga (@{$gab->get_all_GenomicAligns}) {  
    push(@{$genomic_aligns}, $ga);
  }

  $self->genomic_aligns($genomic_aligns);

}

sub _merge_clusters {

    my ($cluster, $index1, $index2) = @_;
    
    #already in same cluster
    if ($index1 != -1 && $index1 == $index2) {
	return $cluster;
    }

    #copy over keys from index2 to index1
    foreach my $region (keys %{$cluster->[$index2]}) {
	#print "region $region\n";
	$cluster->[$index1]->{$region} = 1;
    }
 
    #delete index2
    splice(@$cluster, $index2, 1);

    return $cluster;

}

sub _parse_results {

    my ($self) = @_;

    #Taken from Analysis/Runnable/LowCoverageGenomeAlignment.pm module
    print "PARSE RESULTS\n";

    ## The output file contains one fasta aligned sequence per original sequence + ancestral sequences.
    ## The first seq. corresponds to the fist leaf of the tree, the second one will be an internal
    ## node, the third is the second leaf and so on. The fasta header in the result file correspond
    ## to the names of the leaves for the leaf nodes and to the concatenation of the names of all the
    ## underlying leaves for internal nodes. For instance:
    ## ----------------------------------
    ## >0
    ## ACTTGG--CCGT
    ## >0_1
    ## ACTTGGTTCCGT
    ## >1
    ## ACTTGGTTCCGT
    ## >1_2_3
    ## ACTTGCTTCCGT
    ## >2
    ## CCTTCCTTCCGT
    ## ----------------------------------
    ## The sequence of fasta files and leaves in the tree have the same order. If Ortheus is run
    ## with a given tree, the sequences in the file follow the tree. If Ortheus estimate the tree,
    ## the tree output file contains also the right order of files:
    ## ----------------------------------
    ## ((1:0.0157,0:0.0697):0.0000,2:0.0081);
    ## /tmp/file3.fa /tmp/file1.fa /tmp/file2.fa
    ## ----------------------------------
    
    my $workdir;
    my $tree_file = $self->worker_temp_directory . "/output.$$.tree";

    my $ordered_fasta_files = $self->fasta_files;

    if (-e $tree_file) {
	## Estimated tree. Overwrite the order of the fasta files and get the tree
	open(F, $tree_file) || throw("Could not open tree file <$tree_file>");

	my ($newick, $files);
	while (<F>) {
	    $newick .= $_;
	}
	close(F);
	$newick =~ s/[\r\n]+$//;
	$self->tree_string($newick);
	
	my $this_tree =
	  Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick);
	
	#created tree via semphy or treebest
	$ordered_fasta_files = undef;
	my $all_leaves = $this_tree->get_all_leaves;
	foreach my $this_leaf (@$all_leaves) {
	    push @$ordered_fasta_files, $this_leaf->name . ".fa";
	}
	
	$self->fasta_files(@$ordered_fasta_files);
	#print STDOUT "**NEWICK: $newick\nFILES: ", join(" -- ", @$ordered_fasta_files), "\n";
    }

    my (@ordered_leaves) = $self->tree_string =~ /[(,]([^(:)]+)/g;
    #print "++NEWICK: ", $self->tree_string, "\nLEAVES: ", join(" -- ", @ordered_leaves), "\nFILES: ", join(" -- ", @{$self->fasta_files}), "\n";

    my $alignment_file;
    #read in mfa file created for input into treebest
    $alignment_file = $self->{multi_fasta_file};

    my $this_genomic_align_block = new Bio::EnsEMBL::Compara::GenomicAlignBlock;
    
    open(F, $alignment_file) || throw("Could not open $alignment_file");
    my $seq = "";
    my $this_genomic_align;

    #Create genomic_align_group object to store genomic_aligns for
    #each node. For 2x genomes, there may be several genomic_aligns
    #for a node but for other genomes there will only be one
    #genomic_align in the genomic_align_group
    my $genomic_align_group;

    print "tree_string " . $self->tree_string . "\n";
    my $tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($self->tree_string);
    $tree->print_tree(100);
    
    print $tree->newick_format("simple"), "\n";
    print join(" -- ", map {$_->name} @{$tree->get_all_leaves}), "\n";
    print "Reading $alignment_file...\n";
    my $ids;

    foreach my $this_file (@$ordered_fasta_files) {
	push(@$ids, qx""head -1 $this_file");
	#print "add ids $this_file " . $ids->[-1] . "\n";
    }
    #print join(" :: ", @$ids), "\n\n";

    my $genomic_aligns_2x_array = [];
    my @num_frag_pads;
    my $frag_limits;
    my @ga_lengths;
    my $ga_deletions;

    while (<F>) {
	next if (/^\s*$/);
	chomp;
	## FASTA headers correspond to the tree and the order of the leaves in the tree corresponds
	## to the order of the files

	if (/^>/) {
	    print "PARSING $_\n" if ($self->debug);
	    #print $tree->newick_format(), "\n" if ($self->debug);
	    my ($name) = $_ =~ /^>(.+)/;
	    if (defined($this_genomic_align) and  $seq) {
		if (@$genomic_aligns_2x_array) {
		    print "*****FOUND 2x seq " . length($seq) . "\n" if ($self->debug);
		    #starting offset
		    my $offset = $num_frag_pads[0];
		    #how many X's to add at the start of the cigar_line
		    my $start_X;

		    #how many X's to add to the end of the cigar_line
		    my $end_X;

		    my $align_offset = 0;
		    for (my $i = 0; $i < @$genomic_aligns_2x_array; $i++) {
			my $genomic_align = $genomic_aligns_2x_array->[$i];
			my $num_pads = $num_frag_pads[$i+1];
			my $ga_length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;

			#print "ga-length $ga_length " . $genomic_align->dnafrag_start . " " . $genomic_align->dnafrag_end . " " , $ga_lengths[$i] . "\n";

			
			my ($subseq, $aligned_start, $aligned_end) = _extract_sequence($seq, $align_offset, $ga_lengths[$i]);

			$align_offset = $aligned_end;
			#print "final subseq $aligned_start $aligned_end $subseq\n";
			#Add aligned sequence
			$genomic_align->aligned_sequence($subseq);

			my $cigar_line = create_2x_cigar_line($subseq, $ga_deletions->[$i]);
			$genomic_align->cigar_line($cigar_line);


			#Add X padding characters to ends of seq
			$start_X = $aligned_start;
			$end_X = length($seq) - ($start_X+length($subseq));

			print "start_X $start_X end_X $end_X subseq_length " . length($subseq) . "\n" if ($self->debug);

			#print "before cigar_line " . $genomic_align->cigar_line . "\n";

			
			$genomic_align->cigar_line($start_X . "X" .$genomic_align->cigar_line . $end_X . "X");

			#print "after cigar_line " . $genomic_align->cigar_line . "\n";

			#my $aln_seq = "." x $start_X;
			#$aln_seq .= $genomic_align->aligned_sequence();
			#$aln_seq .= "." x $end_X;
			#$genomic_align->aligned_sequence($aln_seq);

			#free aligned_sequence now that I've used it to 
			#create the cigar_line
			undef($genomic_align->{'aligned_sequence'});

			#Add genomic align to genomic align block
			$this_genomic_align_block->add_GenomicAlign($genomic_align);
			#$offset += $num_pads + $ga_length;
			$offset += $ga_length;
		    }
		    $genomic_aligns_2x_array = [];
		    undef @num_frag_pads;
		    undef @ga_lengths;
		    undef $ga_deletions;
		    undef $frag_limits;
		} else {

		    print "add aligned_sequence " . $this_genomic_align->dnafrag_id . " " . $this_genomic_align->dnafrag_start . " " . $this_genomic_align->dnafrag_end . "\n" if $self->debug;

		    $this_genomic_align->aligned_sequence($seq);

		    #need to add original sequence here because the routine
		    #remove_empty_columns can delete parts of the alignment and
		    #so the original_sequence cannot be reconstructed from the
		    #aligned_sequence
		    if ($this_genomic_align->dnafrag_id == -1) {
			$this_genomic_align->original_sequence;
		    }
		    #undef aligned_sequence now. Necessary because otherwise 
		    #when I remove_empty_columns, this
		    #modifies the cigar_line only and not the aligned_sequence
		    #so not removing it here causes the genomic_align_block
		    #length to be wrong since it finds the length of the
		    #aligned_sequence
		    $this_genomic_align->cigar_line;
		    undef($this_genomic_align->{'aligned_sequence'});

		    $this_genomic_align_block->add_GenomicAlign($this_genomic_align);

		}
	    }
	    my $header = shift(@$ids);
	    $this_genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign;

	    if (!defined($header)) {
		print "INTERNAL NODE $name\n" if ($self->debug);
		my $this_node;
		foreach my $this_leaf_name (split("_", $name)) {
		    if ($this_node) {
			my $other_node = $tree->find_node_by_name($this_leaf_name);
			if (!$other_node) {
			    throw("Cannot find node <$this_leaf_name>\n");
			}
			$this_node = $this_node->find_first_shared_ancestor($other_node);
		    } else {
			print $tree->newick_format() if ($self->debug);
			print " LEAF: $this_leaf_name\n" if ($self->debug);
			$this_node = $tree->find_node_by_name($this_leaf_name);
		    }
		}
		print join("_", map {$_->name} @{$this_node->get_all_leaves}), "\n" if ($self->debug);
		## INTERNAL NODE: dnafrag_id and dnafrag_end must be edited somewhere else

		$this_genomic_align->dnafrag_id(-1);
		$this_genomic_align->dnafrag_start(1);
		$this_genomic_align->dnafrag_end(0);
		$this_genomic_align->dnafrag_strand(1);

		bless($this_node, "Bio::EnsEMBL::Compara::GenomicAlignTree");
		#$this_node->genomic_align($this_genomic_align);
		$genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
				# -genomic_align_array => [$this_genomic_align],
				-type => "epo");
		$genomic_align_group->add_GenomicAlign($this_genomic_align);

		
		$this_node->genomic_align_group($genomic_align_group);
		$this_node->name($name);
	    } elsif ($header =~ /^>SeqID(\d+)/) {
		#print "old $name\n";
		print "leaf_name?? $name\n" if ($self->debug);

		my $this_leaf = $tree->find_node_by_name($name);
		if (!$this_leaf) {
		    print $tree->newick_format(), " ****\n" if ($self->debug);
		    die "";
		}
		#print "$this_leaf\n";
		#         print "****** $name -- $header -- ";
		#         if ($this_leaf) {
		#           $this_leaf->print_node();
		#         } else {
		#           print "[none]\n";
		#         }

		#information extracted from fasta header
		my $seq_id = ($1);

		my $all_genomic_aligns = $self->genomic_aligns;

		my $ga = $all_genomic_aligns->[$seq_id-1];

		if (!UNIVERSAL::isa($ga, 'Bio::EnsEMBL::Compara::GenomicAlign')) {
		    print "FOUND 2X GENOME\n" if $self->debug;
		    print "num of frags " . @$ga . "\n" if $self->debug;
		    print "*** NAME  " . $ga->[0]->{genomic_align}->genome_db->name . " start " . $ga->[0]->{genomic_align}->dnafrag_start . " end " . $ga->[0]->{genomic_align}->dnafrag_end . " name " . $ga->[0]->{genomic_align}->dnafrag->name . "\n" if $self->debug;
		    #reorder fragments if reference slice is on the reverse
		    #strand
		    my $first_ref_ga = $ga->[0]->{ref_ga};

		    if ($first_ref_ga->dnafrag_strand == -1) { 
		      @{$ga} = sort {$b->{genomic_align_block}->reference_genomic_align->dnafrag_start <=> $a->{genomic_align_block}->reference_genomic_align->dnafrag_start} @{$ga};
		  }

		    #first pads
		    push @num_frag_pads, $ga->[0]->{first_pads};
		    #create new genomic_align for each pairwise fragment
		    foreach my $ga_frag (@$ga) {
			my $genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign;
			my $genomic_align_block = $ga_frag->{genomic_align_block};
			my $non_ref_genomic_align = $genomic_align_block->get_all_non_reference_genomic_aligns->[0];

			$genomic_align->dnafrag_id($non_ref_genomic_align->dnafrag_id);
			$genomic_align->dnafrag_start($non_ref_genomic_align->dnafrag_start);
			$genomic_align->dnafrag_end($non_ref_genomic_align->dnafrag_end);
			$genomic_align->dnafrag_strand($non_ref_genomic_align->dnafrag_strand);

			print "store start " . $genomic_align->dnafrag_start . " end " . $genomic_align->dnafrag_end . " strand " . $genomic_align->dnafrag_strand . "\n" if $self->debug;

			#print "LENGTHS " . $ga_frag->{length} . "\n";
			push @$ga_deletions, $ga_frag->{deletions};
			push @ga_lengths, $ga_frag->{length};
			push @num_frag_pads, $ga_frag->{num_pads};
			push @$genomic_aligns_2x_array, $genomic_align;
		    }
		    #Add genomic align to genomic align group 
		    $genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
											#-genomic_align_array => $genomic_aligns_2x_array,
										        -type => "epo");
		    foreach my $this_genomic_align (@$genomic_aligns_2x_array) {
			$genomic_align_group->add_GenomicAlign($this_genomic_align);
		    }

		    bless($this_leaf, "Bio::EnsEMBL::Compara::GenomicAlignTree");
		    $this_leaf->genomic_align_group($genomic_align_group);
		    print "size of array " . @$genomic_aligns_2x_array . "\n" if $self->debug;
		    print "store gag1 $this_leaf\n" if $self->debug;

		    #$self->{$this_leaf} = $genomic_align_group;
		} else  {
		    print "normal name " . $ga->genome_db->name . "\n" if $self->debug;


		    $this_genomic_align->dnafrag_id($ga->dnafrag_id);
		    $this_genomic_align->dnafrag_start($ga->dnafrag_start);
		    $this_genomic_align->dnafrag_end($ga->dnafrag_end);
		    $this_genomic_align->dnafrag_strand($ga->dnafrag_strand);

		    $genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
											#-genomic_align_array => [$this_genomic_align],
										        -type => "epo");
		    $genomic_align_group->add_GenomicAlign($this_genomic_align);

		    bless($this_leaf, "Bio::EnsEMBL::Compara::GenomicAlignTree");
		    $this_leaf->genomic_align_group($genomic_align_group);
		    print "store gag2 $this_leaf\n" if $self->debug;
		}
	    } else {
		throw("Error while parsing the FASTA header. It must start by\" >DnaFrag#####\" where ##### is the dnafrag_id\n$_");
	    }
	    $seq = "";
	} else {
	    $seq .= $_;
	}
    }
    close F;

    #last genomic_align
    print "Last genomic align\n" if ($self->debug);
    if (@$genomic_aligns_2x_array) {
	print "*****FOUND 2x seq " . length($seq) . "\n" if ($self->debug);

	#starting offset
	my $offset = $num_frag_pads[0];

	#how many X's to add at the start and end of the cigar_line
	my ($start_X , $end_X);
	
	my $align_offset = 0;
	for (my $i = 0; $i < @$genomic_aligns_2x_array; $i++) {
	    my $genomic_align = $genomic_aligns_2x_array->[$i];

 	    my $num_pads = $num_frag_pads[$i+1];
 	    my $ga_length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;

 	    print "extract_sequence $offset " .($offset+$ga_length) . " num pads $num_pads\n" if ($self->debug); 
 	    my ($subseq, $aligned_start, $aligned_end) = _extract_sequence($seq, $align_offset, $ga_lengths[$i]);

	    $align_offset = $aligned_end;

# 	    #Add aligned sequence
 	    $genomic_align->aligned_sequence($subseq);

	    my $cigar_line = create_2x_cigar_line($subseq, $ga_deletions->[$i]);
	    $genomic_align->cigar_line($cigar_line);

# 	    #Add X padding characters to ends of seq
 	    $start_X = $aligned_start;
 	    $end_X = length($seq) - ($start_X+length($subseq));
 	    print "start_X $start_X end_X $end_X subseq_length " . length($subseq) . "\n" if ($self->debug);
	    
 	    $genomic_align->cigar_line($start_X . "X" .$genomic_align->cigar_line . $end_X . "X");
	    
	    #free aligned_sequence now that I've used it to 
	    #create the cigar_line
	    undef($genomic_align->{'aligned_sequence'});

	    #Add genomic align to genomic align block
	    $this_genomic_align_block->add_GenomicAlign($genomic_align);
	    $offset += $num_pads + $ga_length;
	}
    } else {
	
	if (defined $this_genomic_align && 
	    $this_genomic_align->dnafrag_id != -1) {
	    $this_genomic_align->aligned_sequence($seq);
	    $this_genomic_align_block->add_GenomicAlign($this_genomic_align);
	}
    }


    #Where there is no ancestral sequences ie 2X genomes
    #convert all the nodes of the tree to Bio::EnsEMBL::GenomicAlignTree objects
    foreach my $this_node (@{$tree->get_all_nodes}) {
	if (!UNIVERSAL::isa($this_node, 'Bio::EnsEMBL::Compara::GenomicAlignTree')) {
	    bless($this_node, "Bio::EnsEMBL::Compara::GenomicAlignTree");
	}
    }

    #print $tree->newick_format("simple"), "\n";
    #print join(" -- ", map {$_."+".$_->node_id."+".$_->name} (@{$tree->get_all_nodes()})), "\n";
    #$self->output([$tree]);
    $self->{'_runnable'}->output([$tree]);
}

#create cigar line for 2x genomes manually because I need to add in the
#insertions, that is "I" in the cigar_line to represent the 2x-only sequences
#that are not found in the reference species which I removed during the
#creation of the _create_mfa routine.

}

sub _print_cluster {

    my ($cluster) = @_;

    print "FINAL cluster ";
    foreach my $this_cluster (@$cluster) {
	print "(";
	foreach my $group (keys %{$this_cluster}) {
	    print "$group ";
	}
	print "), ";
    }
    print "\n";

}

sub _update_tree_2x {

  my $self = shift;
  my $tree = shift;

  my $all_genomic_aligns = $self->genomic_aligns();
  my $ordered_genomic_aligns = [];
  my $ordered_2x_genomes = [];

  my $idx = 1;
  my $all_leaves = $tree->get_all_leaves;
  foreach my $this_leaf (@$all_leaves) {

    my $these_genomic_aligns = [];
    my $these_2x_genomes = [];
    ## Look for genomic_aligns belonging to this genome_db_id
    foreach my $this_genomic_align (@$all_genomic_aligns) {
      if ($this_genomic_align->dnafrag->genome_db_id == $this_leaf->name) {
        push (@$these_genomic_aligns, $this_genomic_align);
      }
    }

    my $index = 0;
    foreach my $ga_frags (@{$self->{ga_frag}}) {
	my $first_frag = $ga_frags->[0];

	#print "update_tree first_frag " . $first_frag->{genomic_align}->genome_db->dbID . " this leaf " . $this_leaf->name . "\n";
	if ($first_frag->{genomic_align}->dnafrag->genome_db->dbID == $this_leaf->name) {
	    push(@$these_2x_genomes, $index);
	}
	$index++;
    }
    print "num " . @$these_genomic_aligns . " " . @$these_2x_genomes . "\n" if $self->debug;

    if (@$these_genomic_aligns == 1) {
      ## If only 1 has been found...
	my $taxon_id = $these_genomic_aligns->[0]->dnafrag->genome_db->taxon_id;
      print "seq$idx" . "_" . $taxon_id . " genome_db_id=" . $these_genomic_aligns->[0]->dnafrag->genome_db_id . "\n" if $self->debug;
      
      $this_leaf->name("seq".$idx++."_".$taxon_id); #.".".$these_dnafrag_regions->[0]->dnafrag_id);

      push(@$ordered_genomic_aligns, $these_genomic_aligns->[0]);

    } elsif (@$these_genomic_aligns > 1) {
      ## If more than 1 has been found, let Ortheus estimate the Tree

	#need to add on 2x genomes to genomic_aligns array
	my $ga = $self->genomic_aligns;
	foreach my $ga_frags (@{$self->{ga_frag}}) {
	    push @$ga, $ga_frags;
	}
	$self->genomic_aligns($ga);
	return undef;

   } elsif (@$these_2x_genomes == 1) {
	#See what happens...
	#Find 2x genomes
       my $ga_frags = $self->{ga_frag}->[$these_2x_genomes->[0]];
       print "number of frags " . @$ga_frags . "\n" if $self->debug;

       my $taxon_id = $ga_frags->[0]->{taxon_id};
	print "2x seq$idx" . "_" . $taxon_id . " " . $ga_frags->[0]->{genome_db_id} . "\n" if $self->debug;
	$this_leaf->name("seq".$idx++."_".$taxon_id);
	#push(@$ordered_2x_genomes, $these_2x_genomes->[0]);
	push(@$ordered_genomic_aligns, $ga_frags);
   } else {
      ## If none has been found...
      $this_leaf->disavow_parent;
      $tree = $tree->minimize_tree;
    }
  }

  $self->genomic_aligns($ordered_genomic_aligns);

  $self->{ordered_2x_genomes} = $ordered_2x_genomes;

  if ($tree->get_child_count == 1) {
    my $child = $tree->children->[0];
    $child->parent->merge_children($child);
    $child->disavow_parent;
  }

  return $tree;
}

#
# Create array of 2x fragments defined by the $pairwise_mlss found for the reference 
#genomic_aligns (may be more than one) in $ref_gas
#

}

sub _write_gerp_dataflow {

    my ($self, $gab_id, $mlss) = @_;
    
    my $species_set = "[";
    my $genome_db_set  = $mlss->species_set;
    
    foreach my $genome_db (@$genome_db_set) {
	$species_set .= $genome_db->dbID . ","; 
    }
    $species_set .= "]";
    
    my $output_id = "{genomic_align_block_id=>" . $gab_id . ",species_set=>" .  $species_set . "}";
    $self->dataflow_output_id($output_id);

}

sub add_match_elem {

    my ($firstM, $gap_len, $new_cigar_line) = @_;

    #add firstM
    if ($firstM == 1) {
	$new_cigar_line .= "M";
    } elsif($firstM > 1) {
	$new_cigar_line .= $firstM . "M";
    } 
    
    if ($gap_len == 1) {
	$new_cigar_line .= "I";
    } elsif ($gap_len > 1) {
	$new_cigar_line .= $gap_len . "I";
    } 
    return ($new_cigar_line);
}

#
# Extract the sequence corresponding to the 2X genome fragment
# extracts subsequence from seq starting from aligned_start (alignment coords)
# for $seq_length bases (not counting pads)

}

sub check_cigar_line {

    my ($genomic_align, $total_gap) = @_;

    #can't check ancestral nodes because these don't have a dnafarg_start
    #or dnafrag_end.
    return if ($genomic_align->dnafrag_id == -1);

    my $seq_pos = 0;
    my $align_len = 0;
    my $cigar_line = $genomic_align->cigar_line;
    my $length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;
    my $gab = $genomic_align->genomic_align_block;

    my @cig = ( $cigar_line =~ /(\d*[GMDXI])/g );
    for my $cigElem ( @cig ) {
	my $cigType = substr( $cigElem, -1, 1 );
	my $cigCount = substr( $cigElem, 0 ,-1 );
	$cigCount = 1 unless ($cigCount =~ /^\d+$/);

	if( $cigType eq "M" ) {
	    $seq_pos += $cigCount;
	} elsif( $cigType eq "I") {
	    $seq_pos += $cigCount;
	} elsif( $cigType eq "X") {
	} elsif( $cigType eq "G" || $cigType eq "D") {	
	}
	if ($cigType ne "I") {
	    $align_len += $cigCount;
	}
    }

    throw ("Cigar line aligned length $align_len does not match (genomic_align_block_length (" . $gab->length . ") - num of gaps ($total_gap)) " . ($gab->length - $total_gap) . " for gab_id " . $gab->dbID . "\n")
      if ($align_len != ($gab->length - $total_gap));

    throw("Cigar line ($seq_pos) does not match sequence length $length\n") 
      if ($seq_pos != $length);
}



#If a gap has been found in a cig_elem of type M, need to split it into
#firstM - I - lastM. This function adds firstM and I to new_cigar_line

}

sub cigar_element {

    my ($mode, $len) = @_;
    my $elem;
    if ($len == 1) {
	$elem = $mode;
    } elsif ($len > 1) { #length can be 0 if the sequence starts with a gap
	$elem = $len.$mode;
    }
    return $elem;
}

#check the new cigar_line is consistent ie the seq_length and number of (M+I) 
#agree and the alignment length and total of cig_elems agree.

}

sub create_2x_cigar_line {

    my ($aligned_sequence, $ga_deletions) = @_;

    my $cigar_line = "";
    my $base_pos = 0;
    my $current_deletion;
    if (defined $ga_deletions && @$ga_deletions > 0) {
	$current_deletion = shift @$ga_deletions;
    }
    
    my @pieces = grep {$_} split(/(\-+)|(\.+)/, $aligned_sequence);
    foreach my $piece (@pieces) {
	my $elem;

	#length of current piece
	my $this_len = length($piece);
	
	my $mode;
	if ($piece =~ /\-/) {
	    $mode = "D"; # D for gaps (deletions)
	    $elem = cigar_element($mode, $this_len);
	} elsif ($piece =~ /\./) {
	    $mode = "X"; # X for pads (in 2X genomes)
	    $elem = cigar_element($mode, $this_len);
	} else {
	    $mode = "M"; # M for matches/mismatches
	    my $next_pos = $base_pos + $this_len;

	    #TODO need special case if have insertion as the last base.
	    #need to have >= and < (not <=) otherwise if an insertion occurs
	    #in the same position as a - then I is added twice.

	    #check to see if next deletion occurs in this cigar element
	    if (defined $current_deletion && 
		$current_deletion->{pos} >= $base_pos && 
		$current_deletion->{pos} < $next_pos) {
		
		#find all deletions that occur in this cigar element
		my $this_del_array;
		while ($current_deletion->{pos} >= $base_pos && 
		       $current_deletion->{pos} < $next_pos) {
		    push @$this_del_array, $current_deletion;

		    last if (@$ga_deletions == 0);
		    $current_deletion = shift @$ga_deletions;
		} 
		
		#loop through all deletions, adding them instead of this cigar element
		my $prev_pos = $base_pos;
		foreach my $this_del (@$this_del_array) {
		    my $piece_len = ($this_del->{pos} - $prev_pos);
		    $elem .= cigar_element($mode, $piece_len);
		    $elem .= cigar_element("I", $this_del->{len});
		    $prev_pos = $this_del->{pos};
		    
		}
		#add final bit
		$elem .= cigar_element($mode, ($base_pos+$this_len) - $this_del_array->[-1]->{pos});
	    } else {
		$elem = cigar_element($mode, $this_len);
	    }
	    
	    $base_pos += $this_len;
	    #print "LENGTH $this_len BASE POS $base_pos\n";
	}
	$cigar_line .= $elem;
    }	
    #print "cigar $cigar_line\n";
    return $cigar_line;
}

#create cigar element from mode and length

}

sub delete_epo_alignments {

    my ($self, $gab_id) = @_;
    
    my $dbc = $self->{'comparaDBA'};
    my $mlss_id = $self->method_link_species_set_id;
    
    my $gab_adaptor = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
    my $gat_adaptor = $self->{'comparaDBA'}->get_GenomicAlignTreeAdaptor;
    my $genome_db_adaptor = $self->{'comparaDBA'}->get_GenomeDBAdaptor;

    my $ancestor_genome_db = $genome_db_adaptor->fetch_by_name_assembly("Ancestral sequences");
    my $ancestor_dba = $ancestor_genome_db->db_adaptor;
    my $compara_dba = $gab_adaptor;
    
    #get all genomic_align_blocks for mlss
    my $sql = "SELECT DISTINCT ga2.genomic_align_block_id FROM genomic_align ga1 LEFT JOIN genomic_align ga2 USING (dnafrag_id, dnafrag_start, dnafrag_end) WHERE ga1.genomic_align_block_id=$gab_id AND ga2.method_link_species_set_id=$mlss_id";

    my $sth = $compara_dba->prepare($sql);
    $sth->execute();

    my ($genomic_align_block_id);
    my @new_gabs;
    $sth->bind_columns(\$genomic_align_block_id);

    while ($sth->fetch()) {
	print "Gab " . $genomic_align_block_id . "\n";    
	push @new_gabs, $gab_adaptor->fetch_by_dbID($genomic_align_block_id);
    }
    $sth->finish();

    if (@new_gabs == 0) {
	print STDERR "Nothing to delete\n";
	return;
    }

    #need to delete any analysis_jobs created for gerp.
    


    my %undeleted_gabs;
    foreach my $genomic_align_block (@new_gabs) {
	my %gabs_to_delete;
	my @gags_to_delete;
	my @dnafrags_to_delete;
	my @seq_regions_to_delete;

	$undeleted_gabs{$genomic_align_block->dbID} = 1;

	#delete both modern and ancestral block in each iteration
	next if (!defined $genomic_align_block);
	
	if ($genomic_align_block->method_link_species_set->method_link_class ne "GenomicAlignTree.tree_alignment") {
	    warn("This script is for deleting epo alignments only\n");
	    next;
	}

	#get the tree for each block
	my $genomic_align_tree = $gat_adaptor->fetch_by_GenomicAlignBlock($genomic_align_block);
	
	#check still have a tree
	next if (!defined $genomic_align_tree);

	foreach my $this_node (@{$genomic_align_tree->get_all_nodes}) {
	    my $genomic_align_group = $this_node->genomic_align_group;
	    next if (!$genomic_align_group);
	   
	    foreach my $this_genomic_align (@{$genomic_align_group->get_all_GenomicAligns}) {
		
		#get unique genomic_align_blocks
		$gabs_to_delete{$this_genomic_align->genomic_align_block_id} = 1;

		#check all gabs end up being deleted
		delete($undeleted_gabs{$this_genomic_align->genomic_align_block_id});

		#get the ancestral dnafrags
		my $dnafrag_name = $this_genomic_align->dnafrag->name;
		if ($dnafrag_name =~ /Ancestor/) {
		    push @dnafrags_to_delete, $dnafrag_name;
		}
	    }
	    
	    #get the genomic_align_groups
	    push @gags_to_delete, $genomic_align_group->dbID;
	}
	my $root_id = $genomic_align_tree->root->node_id;

	my ($sql_gab, $sql_ga, $sql_gag, $sql_gat, $sql_dnafrag, $sql_dna, $sql_seq_region);
	
	#assume not have too many of these!
	$sql_gab = "DELETE FROM genomic_align_block WHERE genomic_align_block_id IN ";
	$sql_ga = "DELETE FROM genomic_align WHERE genomic_align_block_id IN ";
	$sql_gag = "DELETE FROM genomic_align_group WHERE group_id IN ";
	$sql_gat = "DELETE FROM genomic_align_tree WHERE root_id =$root_id";
	$sql_dnafrag = "DELETE FROM dnafrag WHERE genome_db_id = " . $ancestor_genome_db->dbID . " AND name IN ";
	
	$sql_dna = "DELETE FROM dna WHERE seq_region_id IN ";
	$sql_seq_region = "DELETE FROM seq_region WHERE seq_region_id IN ";

	#convert hash to array
	my @gab_ids;
	foreach my $gab (keys %gabs_to_delete) {
	    push @gab_ids, $gab;
	}

	#find seq_region_ids from names
	my $sql_seq_region_select = "SELECT seq_region_id FROM seq_region WHERE name IN ";
	my $sql_seq_region_select_to_exec = $sql_seq_region_select . "(\"" . join("\",\"", @dnafrags_to_delete) . "\")";
	my $sth = $ancestor_dba->dbc->prepare($sql_seq_region_select_to_exec);
	#print "SQL: $sql_seq_region_select_to_exec\n";
	$sth->execute;

	my $this_seq_region_id;
	$sth->bind_columns(\$this_seq_region_id);
	while ($sth->fetch()) {
	    push @seq_regions_to_delete, $this_seq_region_id;
	}
    
	my $sql_gab_to_exec = $sql_gab . "(" . join(",", @gab_ids) . ")";
	my $sql_ga_to_exec = $sql_ga . "(" . join(",", @gab_ids) . ")";
	my $sql_gag_to_exec = $sql_gag . "(" . join(",", @gags_to_delete) . ")";
	my $sql_dnafrag_to_exec = $sql_dnafrag . "(\"" . join("\",\"", @dnafrags_to_delete) . "\")";
	
	#my $sql_gat_to_exec = $sql_gat . "(" . join(",", @gags_to_delete) . ")";
	my $sql_gat_to_exec = "DELETE FROM genomic_align_tree WHERE root_id = $root_id";
	my $sql_dna_to_exec = $sql_dna . "(" . join(",", @seq_regions_to_delete) . ")";
	my $sql_seq_region_to_exec = $sql_seq_region . "(" . join(",", @seq_regions_to_delete) . ")";
	

	#delete genomic_align_block, genomic_aligns, genomic_align_groups, genomic_align_trees, ancestral dnafrags
	foreach my $sql ($sql_gab_to_exec,$sql_ga_to_exec,$sql_gag_to_exec,$sql_gat_to_exec,$sql_dnafrag_to_exec) {
	    my $sth = $compara_dba->dbc->prepare($sql);
	    print "SQL: $sql\n";
	    $sth->execute;
	    $sth->finish;
	}
	
	#delete dna and seq_region from the ancestral_core db
	foreach my $sql ($sql_dna_to_exec, $sql_seq_region_to_exec) {
	    my $sth = $ancestor_dba->dbc->prepare($sql);
	    print "SQL: $sql\n";
	    $sth->execute;
	    $sth->finish;
	}
	print "\n\n";
    }
    #convert hash to array
    my @gab_ids;
    foreach my $gab_id (keys %undeleted_gabs) {
	warn("*** This genomic_align_block $gab_id has no tree ***\n");
	#print "Deleting genomic_align_block and genomic_align only\n";
	push @gab_ids, $gab_id;
    }
    
    #Deleting genomic_align_block and genomic_align only
    if (@gab_ids) {
	my $sql_gab = "DELETE FROM genomic_align_block WHERE genomic_align_block_id IN ";
	my $sql_ga = "DELETE FROM genomic_align WHERE genomic_align_block_id IN ";

	my $sql_gab_to_exec = $sql_gab . "(" . join(",", @gab_ids) . ")";
	my $sql_ga_to_exec = $sql_ga . "(" . join(",", @gab_ids) . ")";
	
	foreach my $sql ($sql_gab_to_exec,$sql_ga_to_exec) {
	    my $sth = $compara_dba->dbc->prepare($sql);
	    #print "SQL: $sql\n";
	    #$sth->execute;
	    $sth->finish;
	}
    }

}

1;

}

sub fasta_files {

  my $self = shift;

  $self->{'_fasta_files'} = [] unless (defined $self->{'_fasta_files'});

  if (@_) {
    my $value = shift;
    push @{$self->{'_fasta_files'}}, $value;
  }

  return $self->{'_fasta_files'};

}

sub fetch_input {

  my( $self) = @_;

  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
  $self->{'hiveDBA'} = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new(-DBCONN => $self->{'comparaDBA'}->dbc);
  $self->get_params($self->parameters);
  $self->get_params($self->input_id);

  if (!$self->method_link_species_set_id) {
    throw("MethodLinkSpeciesSet->dbID is not defined for this Pecan job");
  }

  #delete any bits in the database left over from a previous, failed run
  if ($self->input_job->retry_count > 0) {
      print STDERR "Deleting alignments as it is a rerun\n";
      print STDERR "Not deleting anything at the minute. Need to uncomment\n";
      #$self->delete_epo_alignments($self->genomic_align_block_id);
  }

  #load from genomic_align_block ie using in 2X mode
  $self->_load_GenomicAligns($self->genomic_align_block_id);

  if ($self->genomic_aligns) {
      #load 2X genomes
      $self->_load_2XGenomes($self->genomic_align_block_id, $self->{_pairwise_analysis_data_id});

      ## Get the tree string by taking into account duplications and deletions. Resort dnafrag_regions
      ## in order to match the name of the sequences in the tree string (seq1, seq2...)
      if ($self->get_species_tree) {
	  $self->_build_tree_string;
      }
      ## Dumps fasta files for the DnaFragRegions. Fasta files order must match the entries in the
      ## newick tree. The order of the files will match the order of sequences in the tree_string.
      
      #create multi-fasta file with 2X genomes 
      $self->_create_mfa;
      
      $self->_dump_fasta_and_mfa;

  } else {
    throw("Cannot start alignment because some information is missing");
  }
  return 1;

}

sub find_ref_deletions {

    my ($gab) = @_;
    my $deletion_array;

    my $ref_ga = $gab->reference_genomic_align;
    my $non_ref_ga = $gab->get_all_non_reference_genomic_aligns->[0];

    my $ref_mapper = $ref_ga->get_Mapper;

    my $non_ref_mapper = $non_ref_ga->get_Mapper;

    my @ref_coords = $ref_mapper->map_coordinates("sequence",
						  $ref_ga->dnafrag_start,
						  $ref_ga->dnafrag_end,
						  $ref_ga->dnafrag_strand,
						  "sequence");
    #print "num coords " . @ref_coords . "\n";
    #print "non_ref start " . $non_ref_ga->dnafrag_start . " end " . $non_ref_ga->dnafrag_end . "\n";
    my $start_del;
    my $end_del;
    my $num_del = 0;
    foreach my $this_coord (@ref_coords) {
	#print "coords " . $this_coord->start . " end " . $this_coord->end . " strand " . $this_coord->strand . "\n";
	my @non_ref_coords = $non_ref_mapper->map_coordinates("alignment",
							      $this_coord->start,
							      $this_coord->end,
							      $this_coord->strand,
							      "alignment");

	#want all coords starting from left hand end
	if ($non_ref_ga->dnafrag_strand == -1) {
	    #print "start " . $non_ref_ga->dnafrag_end . " " . $non_ref_coords[0]->start . "\n";
	    $end_del = ($non_ref_ga->dnafrag_end - $non_ref_coords[0]->end +1);
	} else {
	    $end_del = $non_ref_coords[0]->start - $non_ref_ga->dnafrag_start+1;
	}
	if (defined $start_del) {
	    #print "found del $start_del $end_del\n";
	    my $deletion;
	    $deletion->{pos} = $start_del - $num_del;
	    $deletion->{len} = ($end_del-$start_del-1);
	    push @$deletion_array, $deletion;
	    $num_del += $deletion->{len};

	    #print "del pos " . $deletion->{pos} . " len " . $deletion->{len} . "\n";
	}
	if ($non_ref_ga->dnafrag_strand == -1) {
	    #print "end " . $non_ref_ga->dnafrag_end . " " . $non_ref_coords[-1]->end . "\n";
	    $start_del = ($non_ref_ga->dnafrag_end - $non_ref_coords[-1]->start +1);
	} else {	
	    $start_del = $non_ref_coords[-1]->end-$non_ref_ga->dnafrag_start+1;
	}
#  	foreach my $non_ref (@non_ref_coords) {
#  	    if ($non_ref->isa("Bio::EnsEMBL::Mapper::Gap")) {
#  		print "   found gap " . $non_ref->start . " end ". $non_ref->end . "\n";
#  	    } else {
#  		print "   non ref " . ($non_ref->start-$non_ref_ga->dnafrag_start+1) . " end " . ($non_ref->end-$non_ref_ga->dnafrag_start+1) . "\n";
#  		print "   non ref real " . $non_ref->start . " end " . $non_ref->end . "\n";
	  
#  	    }
#  	}

    }
    return $deletion_array;

}

sub genomic_align_block_id {

  my $self = shift;
  $self->{'_genomic_align_block_id'} = shift if(@_);
  return $self->{'_genomic_align_block_id'};

}

sub genomic_aligns {

  my $self = shift;
  $self->{'_genomic_aligns'} = shift if(@_);
  return $self->{'_genomic_aligns'};

}

sub get_params {

  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
  print("parsing parameter string : ",$param_string,"\n");

  my $params = eval($param_string);
  return unless($params);

  if(defined($params->{'genomic_align_block_id'})) {
    $self->genomic_align_block_id($params->{'genomic_align_block_id'});
  }
  if(defined($params->{'method_link_species_set_id'})) {
    $self->method_link_species_set_id($params->{'method_link_species_set_id'});
  }
  if(defined($params->{'tree_file'})) {
    $self->{_tree_file} = $params->{'tree_file'};
  }
  if(defined($params->{'tree_analysis_data_id'})) {
    $self->{_tree_analysis_data_id} = $params->{'tree_analysis_data_id'};
  }
  if(defined($params->{'taxon_tree_analysis_data_id'})) {
    $self->{_taxon_tree_analysis_data_id} = $params->{'taxon_tree_analysis_data_id'};
  }
  if(defined($params->{'pairwise_analysis_data_id'})) {
    $self->{_pairwise_analysis_data_id} = $params->{'pairwise_analysis_data_id'};
  }
  if(defined($params->{'reference_species'})) {
    $self->{_reference_species} = $params->{'reference_species'};
  }
  if(defined($params->{'max_block_size'})) {
    $self->{_max_block_size} = $params->{'max_block_size'};
  }

  return 1;

}

sub get_seq_length_from_cigar {

    my ($cigar_line) = @_;
    my $seq_pos;

    my @cig = ( $cigar_line =~ /(\d*[GMDXI])/g );
    for my $cigElem ( @cig ) {
	my $cigType = substr( $cigElem, -1, 1 );
	my $cigCount = substr( $cigElem, 0 ,-1 );
	$cigCount = 1 unless ($cigCount =~ /^\d+$/);

	if( $cigType eq "M" ) {
	    $seq_pos += $cigCount;
	} elsif( $cigType eq "I") {
	    $seq_pos += $cigCount;
	}
    }
    return $seq_pos;

}

sub get_species_tree {

  my $self = shift;

  my $newick_species_tree;
  if (defined($self->{_species_tree})) {
    return $self->{_species_tree};
  } elsif ($self->{_tree_analysis_data_id}) {
    my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
    $newick_species_tree = $analysis_data_adaptor->fetch_by_dbID($self->{_tree_analysis_data_id});
  } elsif ($self->{_tree_file}) {
    open(TREE_FILE, $self->{_tree_file}) or throw("Cannot open file ".$self->{_tree_file});
    $newick_species_tree = join("", <TREE_FILE>);
    close(TREE_FILE);
  }

  if (!defined($newick_species_tree)) {
    return undef;
  }

  $newick_species_tree =~ s/^\s*//;
  $newick_species_tree =~ s/\s*$//;
  $newick_species_tree =~ s/[\r\n]//g;

  $self->{'_species_tree'} =
      Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick_species_tree);

  return $self->{'_species_tree'};

}

sub get_taxon_tree {

  my $self = shift;

  my $newick_taxon_tree;
  if (defined($self->{_taxon_tree})) {
    return $self->{_taxon_tree};
  } elsif ($self->{_taxon_tree_analysis_data_id}) {
    my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
    $newick_taxon_tree = $analysis_data_adaptor->fetch_by_dbID($self->{_taxon_tree_analysis_data_id});
  } elsif ($self->{_taxon_tree_file}) {
    open(TREE_FILE, $self->{_taxon_tree_file}) or throw("Cannot open file ".$self->{_taxon_tree_file});
    $newick_taxon_tree = join("", <TREE_FILE>);
    close(TREE_FILE);
  }

  if (!defined($newick_taxon_tree)) {
    return undef;
  }

  $self->{'_taxon_tree'} = $newick_taxon_tree;

  return $self->{'_taxon_tree'};

}

sub max_block_size {

  my $self = shift;
  $self->{'_max_block_size'} = shift if(@_);
  return $self->{'_max_block_size'};

}

sub method_link_species_set_id {

  my $self = shift;
  $self->{'_method_link_species_set_id'} = shift if(@_);
  return $self->{'_method_link_species_set_id'};

}

sub reference_species {

  my $self = shift;
  $self->{'_reference_species'} = shift if(@_);
  return $self->{'_reference_species'};
}



##########################################
#
# internal methods
#
##########################################

}

sub run {

  my $self = shift;

  my $runnable = new Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment(
      -analysis => $self->analysis,
      -workdir => $self->worker_temp_directory,
      -multi_fasta_file => $self->{multi_fasta_file},
      -tree_string => $self->tree_string,
      -taxon_species_tree => $self->get_taxon_tree,
      );
  $self->{'_runnable'} = $runnable;


  #disconnect pairwise compara database
  if ($self->{pairwise_compara_dba}) {
      foreach my $dba (values %{$self->{pairwise_compara_dba}}) {
	  $dba->dbc->disconnect_if_idle;
      }
  }

  #disconnect ancestral core database
  my $ancestor_genome_db = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_name_assembly("Ancestral sequences");
  my $ancestor_dba = $ancestor_genome_db->db_adaptor;
  $ancestor_dba->dbc->disconnect_if_idle;

  #disconnect compara database
  $self->{'comparaDBA'}->dbc->disconnect_if_idle;

  $runnable->run_analysis;
  $self->_parse_results();

}

sub species_order {

  my $self = shift;

  $self->{'_species_order'} = [] unless (defined $self->{'_species_order'});

  if (@_) {
    my $value = shift;
    push @{$self->{'_species_order'}}, $value;
  }

  return $self->{'_species_order'};

}

sub taxon_tree_string {

  my $self = shift;
  $self->{'_taxon_tree_string'} = shift if(@_);
  return $self->{'_taxon_tree_string'};

}

sub tree_string {

  my $self = shift;
  $self->{'_tree_string'} = shift if(@_);
  return $self->{'_tree_string'};

}

sub write_output {

  my ($self) = @_;

  print "WRITE OUTPUT\n";
  if ($self->{'_runnable'}->{tree_to_save}) {
    my $meta_container = $self->{'comparaDBA'}->get_MetaContainer;
    $meta_container->store_key_value("synteny_region_tree_".$self->synteny_region_id,
        $self->{'_runnable'}->{tree_to_save});
  }

  my $mlssa = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor;
  my $mlss = $mlssa->fetch_by_dbID($self->method_link_species_set_id);
  my $dnafrag_adaptor = $self->{'comparaDBA'}->get_DnaFragAdaptor;
  my $gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
#   $gaba->use_autoincrement(0);
  my $gaa = $self->{'comparaDBA'}->get_GenomicAlignAdaptor;
#   $gaa->use_autoincrement(0);

  my $gaga = $self->{'comparaDBA'}->get_GenomicAlignGroupAdaptor;

  my $gata = $self->{'comparaDBA'}->get_GenomicAlignTreeAdaptor;

  my $ancestor_genome_db = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_name_assembly("Ancestral sequences");
  my $ancestor_dba = $ancestor_genome_db->db_adaptor;
  my $slice_adaptor = $ancestor_dba->get_SliceAdaptor();
  my $ancestor_coord_system_adaptor = $ancestor_dba->get_CoordSystemAdaptor();
  my $ancestor_coord_system;
  eval{
    $ancestor_coord_system = $ancestor_coord_system_adaptor->fetch_by_name("ancestralsegment");
  };
  if(!$ancestor_coord_system){
    $ancestor_coord_system = new Bio::EnsEMBL::CoordSystem(
            -name            => "ancestralsegment",
            -VERSION         => undef,
            -DEFAULT         => 1,
            -SEQUENCE_LEVEL  => 1,
            -RANK            => 1
        );
    $ancestor_coord_system_adaptor->store($ancestor_coord_system);
  }

  foreach my $genomic_align_tree (@{$self->{'_runnable'}->output}) {
      my $all_nodes;

      foreach my $genomic_align_node (@{$genomic_align_tree->get_all_nodes}) {
	   next if (!defined $genomic_align_node->genomic_align_group);
	   foreach my $genomic_align (@{$genomic_align_node->genomic_align_group->get_all_GenomicAligns}) {
 	      $genomic_align->adaptor($gaa);
 	      $genomic_align->method_link_species_set($mlss);
 	      $genomic_align->level_id(1);

 	      if ($genomic_align->dnafrag_id == -1) {
 		  ## INTERNAL NODE, i.e. an ancestral sequence

 		  my $length = length($genomic_align->original_sequence);

 		  $slice_adaptor->dbc->db_handle->do("LOCK TABLES seq_region WRITE, dna WRITE");
 		  my $last_id = $slice_adaptor->dbc->db_handle->selectrow_array("SELECT max(seq_region_id) FROM seq_region");
 		  $last_id++;
 		  my $name = "Ancestor$last_id";
 		  my $slice = new Bio::EnsEMBL::Slice(
 						      -seq_region_name   => $name,
 						      -start             => 1,
 						      -end               => $length,
 						      -seq_region_length => $length,
 						      -strand            => 1,
 						      -coord_system      => $ancestor_coord_system,
 						     );
 		  $slice_adaptor->store($slice,\$ genomic_align->original_sequence);
 		  $slice_adaptor->dbc->db_handle->do("UNLOCK TABLES");
 		  my $dnafrag = new Bio::EnsEMBL::Compara::DnaFrag(
					   -name => $name,
					   -genome_db => $ancestor_genome_db,
					   -length => $length,
					   -coord_system_name => "ancestralsegment",
								  );
		  
 		  $dnafrag_adaptor->store($dnafrag);
 		  $genomic_align->dnafrag_id($dnafrag->dbID);
 		  $genomic_align->dnafrag_end($length);
		  $genomic_align->dnafrag($dnafrag);
	      }
	  }
       }
       my $split_trees;
       #restrict genomic_align_tree if it is too long and store as groups

      #need to do it this way in case have no ancestral nodes
      my $gat_length;
      $all_nodes = $genomic_align_tree->get_all_leaves;
      $gat_length = $all_nodes->[0]->length;

       if ($self->max_block_size() && $gat_length >  $self->max_block_size()) {
  	   for (my $start = 1; $start <= $gat_length; 
  		$start += $self->max_block_size()) {
  	       my $end = $start+$self->max_block_size()-1;
  	       if ($end > $gat_length) {
  		   $end = $gat_length;
  	       }
  	       my $new_gat = $genomic_align_tree->restrict_between_alignment_positions($start, $end, "skip_empty_GenomicAligns");
	       push @$split_trees, $new_gat;
  	   }
	   $gata->store_group($split_trees);
	   foreach my $tree (@$split_trees) {
	       $self->_write_gerp_dataflow($tree->modern_genomic_align_block_id,
					   $mlss);
	       
	   }
       } else {
	   
	   $gata->store($genomic_align_tree);
	   $self->_write_gerp_dataflow(
			    $genomic_align_tree->modern_genomic_align_block_id,
			    $mlss);
       }
   }
  chdir("$self->worker_temp_directory");
  foreach(glob("*")){
      #DO NOT COMMENT THIS OUT!!! (at least not permenantly). Needed
      #to clean up after each job otherwise you get files left over from
      #the previous job.
      unlink($_);
  }
  return 1;

}

General documentation

The fetch_input method reads the parameters of the analysis (analysis.parameters) first and then
the input_id of the analysis job (analysis_job.input_id). Both are expected to be string
representing hash references like {key1 => "value1", key2 => "value2"}. Values defined in the
analysis_job.input_id column will overwrite values in the analysis.parameters.

* genomic_align_block_id (int)

This module will use the alignment of high coverage genomes defined by this dbID to map the low coverage genomes onto.

* method_link_species_set_id (int)

This module will store alignments with this method_link_species_set_id

* tree_file

FIXME

* tree_analysis_data_id (int)

The species tree using the genome_db_id as the species identifier is stored in
the analysis_data table with this analysis_data_id

* pairwise_analysis_data_id (int)

A list of database locations and method_link_species_set_id pairs for the low coverage geonome BlastZ-Net alignments. The database locations should be identified using the url format.ie mysql://user:pass\@host:port/db_name.

* reference_species

The reference species for the low coverage genome BlastZ_Net alignments

* max_block_size (int)

If an alignment is longer than this value, it will be split in several blocks in the database. All resulting blocks will share the same genomic_align_group_id.

Javier Herrero and Kathryn Beal

Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk

The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _

_add_to_cluster	No description	Code
_add_to_different_cluster	No description	Code
_add_to_same_cluster	No description	Code
_build_tree_string	Description	Code
_cluster_regions	No description	Code
_create_frag_array	No description	Code
_create_mfa	No description	Code
_dump_2x_fasta	No description	Code
_dump_fasta_and_mfa	Description	Code
_extract_sequence	No description	Code
_find_longest_region_in_cluster	No description	Code
_in_cluster	No description	Code
_load_2XGenomes	Description	Code
_load_GenomicAligns	No description	Code
_merge_clusters	No description	Code
_parse_results	Description	Code
_print_cluster	No description	Code
_update_tree_2x	Description	Code
_write_gerp_dataflow	No description	Code
add_match_elem	No description	Code
check_cigar_line	No description	Code
cigar_element	No description	Code
create_2x_cigar_line	No description	Code
delete_epo_alignments	No description	Code
fasta_files	No description	Code
fetch_input	Description	Code
find_ref_deletions	No description	Code
genomic_align_block_id	No description	Code
genomic_aligns	No description	Code
get_params	No description	Code
get_seq_length_from_cigar	No description	Code
get_species_tree	No description	Code
get_taxon_tree	No description	Code
max_block_size	No description	Code
method_link_species_set_id	No description	Code
reference_species	No description	Code
run	Description	Code
species_order	No description	Code
taxon_tree_string	No description	Code
tree_string	No description	Code
write_output	Description	Code