Raw content of Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::LowCoverageGenomeAlignment
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::LowCoverageGenomeAlignment
=head1 SYNOPSIS
=head1 DESCRIPTION
This module acts as a layer between the Hive system and the Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment module since the ensembl-analysis API does not know about ensembl-compara
This module is the alternative module to Ortheus.pm for aligning low coverage (2X) genomes. This takes an existing high coverage alignment and maps the pairwise BlastZ-Net alignments of the low coverage genomes to the human sequence in the high coverage alignment. Any insertions in the low coverage alignments are removed, that is, no gaps are added to the human sequence. In regions where there are duplications, a phylogenetic tree is generated using either treeBest where there are more than 3 sequences in the alignment or semphy where there are 3 or less sequences. This module is still under development.
=head1 PARAMETERS
The fetch_input method reads the parameters of the analysis (analysis.parameters) first and then
the input_id of the analysis job (analysis_job.input_id). Both are expected to be string
representing hash references like {key1 => "value1", key2 => "value2"}. Values defined in the
analysis_job.input_id column will overwrite values in the analysis.parameters.
=over 5
=item * genomic_align_block_id (int)
This module will use the alignment of high coverage genomes defined by this dbID to map the low coverage genomes onto.
=item * method_link_species_set_id (int)
This module will store alignments with this method_link_species_set_id
=item * tree_file
FIXME
=item * tree_analysis_data_id (int)
The species tree using the genome_db_id as the species identifier is stored in
the analysis_data table with this analysis_data_id
=item * pairwise_analysis_data_id (int)
A list of database locations and method_link_species_set_id pairs for the low coverage geonome BlastZ-Net alignments. The database locations should be identified using the url format.ie mysql://user:pass\@host:port/db_name.
=item * reference_species
The reference species for the low coverage genome BlastZ_Net alignments
=item * max_block_size (int)
If an alignment is longer than this value, it will be split in several blocks in the database. All resulting blocks will share the same genomic_align_group_id.
=back
=head1 AUTHOR
Javier Herrero and Kathryn Beal
=head1 CONTACT
Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::LowCoverageGenomeAlignment;
use strict;
use Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment;
use Bio::EnsEMBL::Compara::DnaFragRegion;
use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::Compara::Graph::NewickParser;
use Bio::EnsEMBL::Compara::NestedSet;
use Bio::EnsEMBL::Compara::GenomicAlignGroup;
use Data::Dumper;
use Bio::EnsEMBL::Hive::Process;
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
=head2 fetch_input
Arg : -none-
Example : $self->fetch_input
Description: Fetches input data for the module from the database
Returntype : none
Excptions :
Caller :
Status : At risk
=cut
sub fetch_input {
my( $self) = @_;
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
$self->{'hiveDBA'} = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new(-DBCONN => $self->{'comparaDBA'}->dbc);
$self->get_params($self->parameters);
$self->get_params($self->input_id);
if (!$self->method_link_species_set_id) {
throw("MethodLinkSpeciesSet->dbID is not defined for this Pecan job");
}
#delete any bits in the database left over from a previous, failed run
if ($self->input_job->retry_count > 0) {
print STDERR "Deleting alignments as it is a rerun\n";
print STDERR "Not deleting anything at the minute. Need to uncomment\n";
#$self->delete_epo_alignments($self->genomic_align_block_id);
}
#load from genomic_align_block ie using in 2X mode
$self->_load_GenomicAligns($self->genomic_align_block_id);
if ($self->genomic_aligns) {
#load 2X genomes
$self->_load_2XGenomes($self->genomic_align_block_id, $self->{_pairwise_analysis_data_id});
## Get the tree string by taking into account duplications and deletions. Resort dnafrag_regions
## in order to match the name of the sequences in the tree string (seq1, seq2...)
if ($self->get_species_tree) {
$self->_build_tree_string;
}
## Dumps fasta files for the DnaFragRegions. Fasta files order must match the entries in the
## newick tree. The order of the files will match the order of sequences in the tree_string.
#create multi-fasta file with 2X genomes
$self->_create_mfa;
$self->_dump_fasta_and_mfa;
} else {
throw("Cannot start alignment because some information is missing");
}
return 1;
}
=head2 run
Arg : -none-
Example : $self->run
Description: runs the LowCoverageGenomeAlignment analysis module and
parses the results
Returntype : none
Exceptions : none
Caller :
Status : At risk
=cut
sub run
{
my $self = shift;
my $runnable = new Bio::EnsEMBL::Analysis::Runnable::LowCoverageGenomeAlignment(
-analysis => $self->analysis,
-workdir => $self->worker_temp_directory,
-multi_fasta_file => $self->{multi_fasta_file},
-tree_string => $self->tree_string,
-taxon_species_tree => $self->get_taxon_tree,
);
$self->{'_runnable'} = $runnable;
#disconnect pairwise compara database
if ($self->{pairwise_compara_dba}) {
foreach my $dba (values %{$self->{pairwise_compara_dba}}) {
$dba->dbc->disconnect_if_idle;
}
}
#disconnect ancestral core database
my $ancestor_genome_db = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_name_assembly("Ancestral sequences");
my $ancestor_dba = $ancestor_genome_db->db_adaptor;
$ancestor_dba->dbc->disconnect_if_idle;
#disconnect compara database
$self->{'comparaDBA'}->dbc->disconnect_if_idle;
$runnable->run_analysis;
$self->_parse_results();
}
=head2 write_output
Arg : -none
Example : $self->write_output
Description: stores results in a database
Returntype : none
Exceptions : none
Caller :
Status : At risk
=cut
sub write_output {
my ($self) = @_;
print "WRITE OUTPUT\n";
if ($self->{'_runnable'}->{tree_to_save}) {
my $meta_container = $self->{'comparaDBA'}->get_MetaContainer;
$meta_container->store_key_value("synteny_region_tree_".$self->synteny_region_id,
$self->{'_runnable'}->{tree_to_save});
}
my $mlssa = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor;
my $mlss = $mlssa->fetch_by_dbID($self->method_link_species_set_id);
my $dnafrag_adaptor = $self->{'comparaDBA'}->get_DnaFragAdaptor;
my $gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
# $gaba->use_autoincrement(0);
my $gaa = $self->{'comparaDBA'}->get_GenomicAlignAdaptor;
# $gaa->use_autoincrement(0);
my $gaga = $self->{'comparaDBA'}->get_GenomicAlignGroupAdaptor;
my $gata = $self->{'comparaDBA'}->get_GenomicAlignTreeAdaptor;
my $ancestor_genome_db = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_name_assembly("Ancestral sequences");
my $ancestor_dba = $ancestor_genome_db->db_adaptor;
my $slice_adaptor = $ancestor_dba->get_SliceAdaptor();
my $ancestor_coord_system_adaptor = $ancestor_dba->get_CoordSystemAdaptor();
my $ancestor_coord_system;
eval{
$ancestor_coord_system = $ancestor_coord_system_adaptor->fetch_by_name("ancestralsegment");
};
if(!$ancestor_coord_system){
$ancestor_coord_system = new Bio::EnsEMBL::CoordSystem(
-name => "ancestralsegment",
-VERSION => undef,
-DEFAULT => 1,
-SEQUENCE_LEVEL => 1,
-RANK => 1
);
$ancestor_coord_system_adaptor->store($ancestor_coord_system);
}
foreach my $genomic_align_tree (@{$self->{'_runnable'}->output}) {
my $all_nodes;
foreach my $genomic_align_node (@{$genomic_align_tree->get_all_nodes}) {
next if (!defined $genomic_align_node->genomic_align_group);
foreach my $genomic_align (@{$genomic_align_node->genomic_align_group->get_all_GenomicAligns}) {
$genomic_align->adaptor($gaa);
$genomic_align->method_link_species_set($mlss);
$genomic_align->level_id(1);
if ($genomic_align->dnafrag_id == -1) {
## INTERNAL NODE, i.e. an ancestral sequence
my $length = length($genomic_align->original_sequence);
$slice_adaptor->dbc->db_handle->do("LOCK TABLES seq_region WRITE, dna WRITE");
my $last_id = $slice_adaptor->dbc->db_handle->selectrow_array("SELECT max(seq_region_id) FROM seq_region");
$last_id++;
my $name = "Ancestor$last_id";
my $slice = new Bio::EnsEMBL::Slice(
-seq_region_name => $name,
-start => 1,
-end => $length,
-seq_region_length => $length,
-strand => 1,
-coord_system => $ancestor_coord_system,
);
$slice_adaptor->store($slice, \$genomic_align->original_sequence);
$slice_adaptor->dbc->db_handle->do("UNLOCK TABLES");
my $dnafrag = new Bio::EnsEMBL::Compara::DnaFrag(
-name => $name,
-genome_db => $ancestor_genome_db,
-length => $length,
-coord_system_name => "ancestralsegment",
);
$dnafrag_adaptor->store($dnafrag);
$genomic_align->dnafrag_id($dnafrag->dbID);
$genomic_align->dnafrag_end($length);
$genomic_align->dnafrag($dnafrag);
}
}
}
my $split_trees;
#restrict genomic_align_tree if it is too long and store as groups
#need to do it this way in case have no ancestral nodes
my $gat_length;
$all_nodes = $genomic_align_tree->get_all_leaves;
$gat_length = $all_nodes->[0]->length;
if ($self->max_block_size() && $gat_length > $self->max_block_size()) {
for (my $start = 1; $start <= $gat_length;
$start += $self->max_block_size()) {
my $end = $start+$self->max_block_size()-1;
if ($end > $gat_length) {
$end = $gat_length;
}
my $new_gat = $genomic_align_tree->restrict_between_alignment_positions($start, $end, "skip_empty_GenomicAligns");
push @$split_trees, $new_gat;
}
$gata->store_group($split_trees);
foreach my $tree (@$split_trees) {
$self->_write_gerp_dataflow($tree->modern_genomic_align_block_id,
$mlss);
}
} else {
$gata->store($genomic_align_tree);
$self->_write_gerp_dataflow(
$genomic_align_tree->modern_genomic_align_block_id,
$mlss);
}
}
chdir("$self->worker_temp_directory");
foreach(glob("*")){
#DO NOT COMMENT THIS OUT!!! (at least not permenantly). Needed
#to clean up after each job otherwise you get files left over from
#the previous job.
unlink($_);
}
return 1;
}
sub _write_gerp_dataflow {
my ($self, $gab_id, $mlss) = @_;
my $species_set = "[";
my $genome_db_set = $mlss->species_set;
foreach my $genome_db (@$genome_db_set) {
$species_set .= $genome_db->dbID . ",";
}
$species_set .= "]";
my $output_id = "{genomic_align_block_id=>" . $gab_id . ",species_set=>" . $species_set . "}";
$self->dataflow_output_id($output_id);
}
=head2 _parse_results
Arg : none
Example : $self->_parse_results
Description: parse the alignment and tree files
Returntype : none
Exceptions :
Caller : run
Status : At risk
=cut
sub _parse_results {
my ($self) = @_;
#Taken from Analysis/Runnable/LowCoverageGenomeAlignment.pm module
print "PARSE RESULTS\n";
## The output file contains one fasta aligned sequence per original sequence + ancestral sequences.
## The first seq. corresponds to the fist leaf of the tree, the second one will be an internal
## node, the third is the second leaf and so on. The fasta header in the result file correspond
## to the names of the leaves for the leaf nodes and to the concatenation of the names of all the
## underlying leaves for internal nodes. For instance:
## ----------------------------------
## >0
## ACTTGG--CCGT
## >0_1
## ACTTGGTTCCGT
## >1
## ACTTGGTTCCGT
## >1_2_3
## ACTTGCTTCCGT
## >2
## CCTTCCTTCCGT
## ----------------------------------
## The sequence of fasta files and leaves in the tree have the same order. If Ortheus is run
## with a given tree, the sequences in the file follow the tree. If Ortheus estimate the tree,
## the tree output file contains also the right order of files:
## ----------------------------------
## ((1:0.0157,0:0.0697):0.0000,2:0.0081);
## /tmp/file3.fa /tmp/file1.fa /tmp/file2.fa
## ----------------------------------
my $workdir;
my $tree_file = $self->worker_temp_directory . "/output.$$.tree";
my $ordered_fasta_files = $self->fasta_files;
if (-e $tree_file) {
## Estimated tree. Overwrite the order of the fasta files and get the tree
open(F, $tree_file) || throw("Could not open tree file <$tree_file>");
my ($newick, $files);
while () {
$newick .= $_;
}
close(F);
$newick =~ s/[\r\n]+$//;
$self->tree_string($newick);
my $this_tree =
Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick);
#created tree via semphy or treebest
$ordered_fasta_files = undef;
my $all_leaves = $this_tree->get_all_leaves;
foreach my $this_leaf (@$all_leaves) {
push @$ordered_fasta_files, $this_leaf->name . ".fa";
}
$self->fasta_files(@$ordered_fasta_files);
#print STDOUT "**NEWICK: $newick\nFILES: ", join(" -- ", @$ordered_fasta_files), "\n";
}
my (@ordered_leaves) = $self->tree_string =~ /[(,]([^(:)]+)/g;
#print "++NEWICK: ", $self->tree_string, "\nLEAVES: ", join(" -- ", @ordered_leaves), "\nFILES: ", join(" -- ", @{$self->fasta_files}), "\n";
my $alignment_file;
#read in mfa file created for input into treebest
$alignment_file = $self->{multi_fasta_file};
my $this_genomic_align_block = new Bio::EnsEMBL::Compara::GenomicAlignBlock;
open(F, $alignment_file) || throw("Could not open $alignment_file");
my $seq = "";
my $this_genomic_align;
#Create genomic_align_group object to store genomic_aligns for
#each node. For 2x genomes, there may be several genomic_aligns
#for a node but for other genomes there will only be one
#genomic_align in the genomic_align_group
my $genomic_align_group;
print "tree_string " . $self->tree_string . "\n";
my $tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($self->tree_string);
$tree->print_tree(100);
print $tree->newick_format("simple"), "\n";
print join(" -- ", map {$_->name} @{$tree->get_all_leaves}), "\n";
print "Reading $alignment_file...\n";
my $ids;
foreach my $this_file (@$ordered_fasta_files) {
push(@$ids, qx"head -1 $this_file");
#print "add ids $this_file " . $ids->[-1] . "\n";
}
#print join(" :: ", @$ids), "\n\n";
my $genomic_aligns_2x_array = [];
my @num_frag_pads;
my $frag_limits;
my @ga_lengths;
my $ga_deletions;
while () {
next if (/^\s*$/);
chomp;
## FASTA headers correspond to the tree and the order of the leaves in the tree corresponds
## to the order of the files
if (/^>/) {
print "PARSING $_\n" if ($self->debug);
#print $tree->newick_format(), "\n" if ($self->debug);
my ($name) = $_ =~ /^>(.+)/;
if (defined($this_genomic_align) and $seq) {
if (@$genomic_aligns_2x_array) {
print "*****FOUND 2x seq " . length($seq) . "\n" if ($self->debug);
#starting offset
my $offset = $num_frag_pads[0];
#how many X's to add at the start of the cigar_line
my $start_X;
#how many X's to add to the end of the cigar_line
my $end_X;
my $align_offset = 0;
for (my $i = 0; $i < @$genomic_aligns_2x_array; $i++) {
my $genomic_align = $genomic_aligns_2x_array->[$i];
my $num_pads = $num_frag_pads[$i+1];
my $ga_length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;
#print "ga-length $ga_length " . $genomic_align->dnafrag_start . " " . $genomic_align->dnafrag_end . " " , $ga_lengths[$i] . "\n";
my ($subseq, $aligned_start, $aligned_end) = _extract_sequence($seq, $align_offset, $ga_lengths[$i]);
$align_offset = $aligned_end;
#print "final subseq $aligned_start $aligned_end $subseq\n";
#Add aligned sequence
$genomic_align->aligned_sequence($subseq);
my $cigar_line = create_2x_cigar_line($subseq, $ga_deletions->[$i]);
$genomic_align->cigar_line($cigar_line);
#Add X padding characters to ends of seq
$start_X = $aligned_start;
$end_X = length($seq) - ($start_X+length($subseq));
print "start_X $start_X end_X $end_X subseq_length " . length($subseq) . "\n" if ($self->debug);
#print "before cigar_line " . $genomic_align->cigar_line . "\n";
$genomic_align->cigar_line($start_X . "X" .$genomic_align->cigar_line . $end_X . "X");
#print "after cigar_line " . $genomic_align->cigar_line . "\n";
#my $aln_seq = "." x $start_X;
#$aln_seq .= $genomic_align->aligned_sequence();
#$aln_seq .= "." x $end_X;
#$genomic_align->aligned_sequence($aln_seq);
#free aligned_sequence now that I've used it to
#create the cigar_line
undef($genomic_align->{'aligned_sequence'});
#Add genomic align to genomic align block
$this_genomic_align_block->add_GenomicAlign($genomic_align);
#$offset += $num_pads + $ga_length;
$offset += $ga_length;
}
$genomic_aligns_2x_array = [];
undef @num_frag_pads;
undef @ga_lengths;
undef $ga_deletions;
undef $frag_limits;
} else {
print "add aligned_sequence " . $this_genomic_align->dnafrag_id . " " . $this_genomic_align->dnafrag_start . " " . $this_genomic_align->dnafrag_end . "\n" if $self->debug;
$this_genomic_align->aligned_sequence($seq);
#need to add original sequence here because the routine
#remove_empty_columns can delete parts of the alignment and
#so the original_sequence cannot be reconstructed from the
#aligned_sequence
if ($this_genomic_align->dnafrag_id == -1) {
$this_genomic_align->original_sequence;
}
#undef aligned_sequence now. Necessary because otherwise
#when I remove_empty_columns, this
#modifies the cigar_line only and not the aligned_sequence
#so not removing it here causes the genomic_align_block
#length to be wrong since it finds the length of the
#aligned_sequence
$this_genomic_align->cigar_line;
undef($this_genomic_align->{'aligned_sequence'});
$this_genomic_align_block->add_GenomicAlign($this_genomic_align);
}
}
my $header = shift(@$ids);
$this_genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign;
if (!defined($header)) {
print "INTERNAL NODE $name\n" if ($self->debug);
my $this_node;
foreach my $this_leaf_name (split("_", $name)) {
if ($this_node) {
my $other_node = $tree->find_node_by_name($this_leaf_name);
if (!$other_node) {
throw("Cannot find node <$this_leaf_name>\n");
}
$this_node = $this_node->find_first_shared_ancestor($other_node);
} else {
print $tree->newick_format() if ($self->debug);
print " LEAF: $this_leaf_name\n" if ($self->debug);
$this_node = $tree->find_node_by_name($this_leaf_name);
}
}
print join("_", map {$_->name} @{$this_node->get_all_leaves}), "\n" if ($self->debug);
## INTERNAL NODE: dnafrag_id and dnafrag_end must be edited somewhere else
$this_genomic_align->dnafrag_id(-1);
$this_genomic_align->dnafrag_start(1);
$this_genomic_align->dnafrag_end(0);
$this_genomic_align->dnafrag_strand(1);
bless($this_node, "Bio::EnsEMBL::Compara::GenomicAlignTree");
#$this_node->genomic_align($this_genomic_align);
$genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
# -genomic_align_array => [$this_genomic_align],
-type => "epo");
$genomic_align_group->add_GenomicAlign($this_genomic_align);
$this_node->genomic_align_group($genomic_align_group);
$this_node->name($name);
} elsif ($header =~ /^>SeqID(\d+)/) {
#print "old $name\n";
print "leaf_name?? $name\n" if ($self->debug);
my $this_leaf = $tree->find_node_by_name($name);
if (!$this_leaf) {
print $tree->newick_format(), " ****\n" if ($self->debug);
die "";
}
#print "$this_leaf\n";
# print "****** $name -- $header -- ";
# if ($this_leaf) {
# $this_leaf->print_node();
# } else {
# print "[none]\n";
# }
#information extracted from fasta header
my $seq_id = ($1);
my $all_genomic_aligns = $self->genomic_aligns;
my $ga = $all_genomic_aligns->[$seq_id-1];
if (!UNIVERSAL::isa($ga, 'Bio::EnsEMBL::Compara::GenomicAlign')) {
print "FOUND 2X GENOME\n" if $self->debug;
print "num of frags " . @$ga . "\n" if $self->debug;
print "*** NAME " . $ga->[0]->{genomic_align}->genome_db->name . " start " . $ga->[0]->{genomic_align}->dnafrag_start . " end " . $ga->[0]->{genomic_align}->dnafrag_end . " name " . $ga->[0]->{genomic_align}->dnafrag->name . "\n" if $self->debug;
#reorder fragments if reference slice is on the reverse
#strand
my $first_ref_ga = $ga->[0]->{ref_ga};
if ($first_ref_ga->dnafrag_strand == -1) {
@{$ga} = sort {$b->{genomic_align_block}->reference_genomic_align->dnafrag_start <=> $a->{genomic_align_block}->reference_genomic_align->dnafrag_start} @{$ga};
}
#first pads
push @num_frag_pads, $ga->[0]->{first_pads};
#create new genomic_align for each pairwise fragment
foreach my $ga_frag (@$ga) {
my $genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign;
my $genomic_align_block = $ga_frag->{genomic_align_block};
my $non_ref_genomic_align = $genomic_align_block->get_all_non_reference_genomic_aligns->[0];
$genomic_align->dnafrag_id($non_ref_genomic_align->dnafrag_id);
$genomic_align->dnafrag_start($non_ref_genomic_align->dnafrag_start);
$genomic_align->dnafrag_end($non_ref_genomic_align->dnafrag_end);
$genomic_align->dnafrag_strand($non_ref_genomic_align->dnafrag_strand);
print "store start " . $genomic_align->dnafrag_start . " end " . $genomic_align->dnafrag_end . " strand " . $genomic_align->dnafrag_strand . "\n" if $self->debug;
#print "LENGTHS " . $ga_frag->{length} . "\n";
push @$ga_deletions, $ga_frag->{deletions};
push @ga_lengths, $ga_frag->{length};
push @num_frag_pads, $ga_frag->{num_pads};
push @$genomic_aligns_2x_array, $genomic_align;
}
#Add genomic align to genomic align group
$genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
#-genomic_align_array => $genomic_aligns_2x_array,
-type => "epo");
foreach my $this_genomic_align (@$genomic_aligns_2x_array) {
$genomic_align_group->add_GenomicAlign($this_genomic_align);
}
bless($this_leaf, "Bio::EnsEMBL::Compara::GenomicAlignTree");
$this_leaf->genomic_align_group($genomic_align_group);
print "size of array " . @$genomic_aligns_2x_array . "\n" if $self->debug;
print "store gag1 $this_leaf\n" if $self->debug;
#$self->{$this_leaf} = $genomic_align_group;
} else {
print "normal name " . $ga->genome_db->name . "\n" if $self->debug;
$this_genomic_align->dnafrag_id($ga->dnafrag_id);
$this_genomic_align->dnafrag_start($ga->dnafrag_start);
$this_genomic_align->dnafrag_end($ga->dnafrag_end);
$this_genomic_align->dnafrag_strand($ga->dnafrag_strand);
$genomic_align_group = new Bio::EnsEMBL::Compara::GenomicAlignGroup(
#-genomic_align_array => [$this_genomic_align],
-type => "epo");
$genomic_align_group->add_GenomicAlign($this_genomic_align);
bless($this_leaf, "Bio::EnsEMBL::Compara::GenomicAlignTree");
$this_leaf->genomic_align_group($genomic_align_group);
print "store gag2 $this_leaf\n" if $self->debug;
}
} else {
throw("Error while parsing the FASTA header. It must start by \">DnaFrag#####\" where ##### is the dnafrag_id\n$_");
}
$seq = "";
} else {
$seq .= $_;
}
}
close F;
#last genomic_align
print "Last genomic align\n" if ($self->debug);
if (@$genomic_aligns_2x_array) {
print "*****FOUND 2x seq " . length($seq) . "\n" if ($self->debug);
#starting offset
my $offset = $num_frag_pads[0];
#how many X's to add at the start and end of the cigar_line
my ($start_X , $end_X);
my $align_offset = 0;
for (my $i = 0; $i < @$genomic_aligns_2x_array; $i++) {
my $genomic_align = $genomic_aligns_2x_array->[$i];
my $num_pads = $num_frag_pads[$i+1];
my $ga_length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;
print "extract_sequence $offset " .($offset+$ga_length) . " num pads $num_pads\n" if ($self->debug);
my ($subseq, $aligned_start, $aligned_end) = _extract_sequence($seq, $align_offset, $ga_lengths[$i]);
$align_offset = $aligned_end;
# #Add aligned sequence
$genomic_align->aligned_sequence($subseq);
my $cigar_line = create_2x_cigar_line($subseq, $ga_deletions->[$i]);
$genomic_align->cigar_line($cigar_line);
# #Add X padding characters to ends of seq
$start_X = $aligned_start;
$end_X = length($seq) - ($start_X+length($subseq));
print "start_X $start_X end_X $end_X subseq_length " . length($subseq) . "\n" if ($self->debug);
$genomic_align->cigar_line($start_X . "X" .$genomic_align->cigar_line . $end_X . "X");
#free aligned_sequence now that I've used it to
#create the cigar_line
undef($genomic_align->{'aligned_sequence'});
#Add genomic align to genomic align block
$this_genomic_align_block->add_GenomicAlign($genomic_align);
$offset += $num_pads + $ga_length;
}
} else {
if (defined $this_genomic_align &&
$this_genomic_align->dnafrag_id != -1) {
$this_genomic_align->aligned_sequence($seq);
$this_genomic_align_block->add_GenomicAlign($this_genomic_align);
}
}
#Where there is no ancestral sequences ie 2X genomes
#convert all the nodes of the tree to Bio::EnsEMBL::GenomicAlignTree objects
foreach my $this_node (@{$tree->get_all_nodes}) {
if (!UNIVERSAL::isa($this_node, 'Bio::EnsEMBL::Compara::GenomicAlignTree')) {
bless($this_node, "Bio::EnsEMBL::Compara::GenomicAlignTree");
}
}
#print $tree->newick_format("simple"), "\n";
#print join(" -- ", map {$_."+".$_->node_id."+".$_->name} (@{$tree->get_all_nodes()})), "\n";
#$self->output([$tree]);
$self->{'_runnable'}->output([$tree]);
}
#create cigar line for 2x genomes manually because I need to add in the
#insertions, that is "I" in the cigar_line to represent the 2x-only sequences
#that are not found in the reference species which I removed during the
#creation of the _create_mfa routine.
sub create_2x_cigar_line {
my ($aligned_sequence, $ga_deletions) = @_;
my $cigar_line = "";
my $base_pos = 0;
my $current_deletion;
if (defined $ga_deletions && @$ga_deletions > 0) {
$current_deletion = shift @$ga_deletions;
}
my @pieces = grep {$_} split(/(\-+)|(\.+)/, $aligned_sequence);
foreach my $piece (@pieces) {
my $elem;
#length of current piece
my $this_len = length($piece);
my $mode;
if ($piece =~ /\-/) {
$mode = "D"; # D for gaps (deletions)
$elem = cigar_element($mode, $this_len);
} elsif ($piece =~ /\./) {
$mode = "X"; # X for pads (in 2X genomes)
$elem = cigar_element($mode, $this_len);
} else {
$mode = "M"; # M for matches/mismatches
my $next_pos = $base_pos + $this_len;
#TODO need special case if have insertion as the last base.
#need to have >= and < (not <=) otherwise if an insertion occurs
#in the same position as a - then I is added twice.
#check to see if next deletion occurs in this cigar element
if (defined $current_deletion &&
$current_deletion->{pos} >= $base_pos &&
$current_deletion->{pos} < $next_pos) {
#find all deletions that occur in this cigar element
my $this_del_array;
while ($current_deletion->{pos} >= $base_pos &&
$current_deletion->{pos} < $next_pos) {
push @$this_del_array, $current_deletion;
last if (@$ga_deletions == 0);
$current_deletion = shift @$ga_deletions;
}
#loop through all deletions, adding them instead of this cigar element
my $prev_pos = $base_pos;
foreach my $this_del (@$this_del_array) {
my $piece_len = ($this_del->{pos} - $prev_pos);
$elem .= cigar_element($mode, $piece_len);
$elem .= cigar_element("I", $this_del->{len});
$prev_pos = $this_del->{pos};
}
#add final bit
$elem .= cigar_element($mode, ($base_pos+$this_len) - $this_del_array->[-1]->{pos});
} else {
$elem = cigar_element($mode, $this_len);
}
$base_pos += $this_len;
#print "LENGTH $this_len BASE POS $base_pos\n";
}
$cigar_line .= $elem;
}
#print "cigar $cigar_line\n";
return $cigar_line;
}
#create cigar element from mode and length
sub cigar_element {
my ($mode, $len) = @_;
my $elem;
if ($len == 1) {
$elem = $mode;
} elsif ($len > 1) { #length can be 0 if the sequence starts with a gap
$elem = $len.$mode;
}
return $elem;
}
#check the new cigar_line is consistent ie the seq_length and number of (M+I)
#agree and the alignment length and total of cig_elems agree.
sub check_cigar_line {
my ($genomic_align, $total_gap) = @_;
#can't check ancestral nodes because these don't have a dnafarg_start
#or dnafrag_end.
return if ($genomic_align->dnafrag_id == -1);
my $seq_pos = 0;
my $align_len = 0;
my $cigar_line = $genomic_align->cigar_line;
my $length = $genomic_align->dnafrag_end-$genomic_align->dnafrag_start+1;
my $gab = $genomic_align->genomic_align_block;
my @cig = ( $cigar_line =~ /(\d*[GMDXI])/g );
for my $cigElem ( @cig ) {
my $cigType = substr( $cigElem, -1, 1 );
my $cigCount = substr( $cigElem, 0 ,-1 );
$cigCount = 1 unless ($cigCount =~ /^\d+$/);
if( $cigType eq "M" ) {
$seq_pos += $cigCount;
} elsif( $cigType eq "I") {
$seq_pos += $cigCount;
} elsif( $cigType eq "X") {
} elsif( $cigType eq "G" || $cigType eq "D") {
}
if ($cigType ne "I") {
$align_len += $cigCount;
}
}
throw ("Cigar line aligned length $align_len does not match (genomic_align_block_length (" . $gab->length . ") - num of gaps ($total_gap)) " . ($gab->length - $total_gap) . " for gab_id " . $gab->dbID . "\n")
if ($align_len != ($gab->length - $total_gap));
throw("Cigar line ($seq_pos) does not match sequence length $length\n")
if ($seq_pos != $length);
}
#If a gap has been found in a cig_elem of type M, need to split it into
#firstM - I - lastM. This function adds firstM and I to new_cigar_line
sub add_match_elem {
my ($firstM, $gap_len, $new_cigar_line) = @_;
#add firstM
if ($firstM == 1) {
$new_cigar_line .= "M";
} elsif($firstM > 1) {
$new_cigar_line .= $firstM . "M";
}
if ($gap_len == 1) {
$new_cigar_line .= "I";
} elsif ($gap_len > 1) {
$new_cigar_line .= $gap_len . "I";
}
return ($new_cigar_line);
}
#
# Extract the sequence corresponding to the 2X genome fragment
# extracts subsequence from seq starting from aligned_start (alignment coords)
# for $seq_length bases (not counting pads)
sub _extract_sequence {
my ($seq, $aligned_start, $seq_length) = @_;
my $curr_length = 0;
my $aligned_length = 0;
#my $aligned_start;
my $aligned_end;
#print "original_start $aligned_start length $seq_length\n";
#create new seq starting from aligned_start to the end
my $new_seq = substr($seq, $aligned_start);
#find the end in alignment coords counting seq_length bases.
foreach my $subseq (grep {$_} split /(\-+)/, $new_seq) {
#print "subseq $subseq\n";
my $length = length($subseq);
if ($subseq !~ /\-/) {
if (!defined($aligned_end) && ($curr_length + $length >= $seq_length)) {
$aligned_end = $aligned_length + ($seq_length - $curr_length) + $aligned_start;
#print "aligned_end $aligned_end\n";
last;
}
#length in bases
$curr_length += $length;
}
#length in alignment coords
$aligned_length += $length;
}
my $subseq = substr($seq, $aligned_start, ($aligned_end-$aligned_start));
return ($subseq, $aligned_start, $aligned_end);
}
##########################################
#
# getter/setter methods
#
##########################################
sub genomic_align_block_id {
my $self = shift;
$self->{'_genomic_align_block_id'} = shift if(@_);
return $self->{'_genomic_align_block_id'};
}
sub genomic_aligns {
my $self = shift;
$self->{'_genomic_aligns'} = shift if(@_);
return $self->{'_genomic_aligns'};
}
sub fasta_files {
my $self = shift;
$self->{'_fasta_files'} = [] unless (defined $self->{'_fasta_files'});
if (@_) {
my $value = shift;
push @{$self->{'_fasta_files'}}, $value;
}
return $self->{'_fasta_files'};
}
sub species_order {
my $self = shift;
$self->{'_species_order'} = [] unless (defined $self->{'_species_order'});
if (@_) {
my $value = shift;
push @{$self->{'_species_order'}}, $value;
}
return $self->{'_species_order'};
}
sub get_species_tree {
my $self = shift;
my $newick_species_tree;
if (defined($self->{_species_tree})) {
return $self->{_species_tree};
} elsif ($self->{_tree_analysis_data_id}) {
my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
$newick_species_tree = $analysis_data_adaptor->fetch_by_dbID($self->{_tree_analysis_data_id});
} elsif ($self->{_tree_file}) {
open(TREE_FILE, $self->{_tree_file}) or throw("Cannot open file ".$self->{_tree_file});
$newick_species_tree = join("", );
close(TREE_FILE);
}
if (!defined($newick_species_tree)) {
return undef;
}
$newick_species_tree =~ s/^\s*//;
$newick_species_tree =~ s/\s*$//;
$newick_species_tree =~ s/[\r\n]//g;
$self->{'_species_tree'} =
Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick_species_tree);
return $self->{'_species_tree'};
}
sub get_taxon_tree {
my $self = shift;
my $newick_taxon_tree;
if (defined($self->{_taxon_tree})) {
return $self->{_taxon_tree};
} elsif ($self->{_taxon_tree_analysis_data_id}) {
my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
$newick_taxon_tree = $analysis_data_adaptor->fetch_by_dbID($self->{_taxon_tree_analysis_data_id});
} elsif ($self->{_taxon_tree_file}) {
open(TREE_FILE, $self->{_taxon_tree_file}) or throw("Cannot open file ".$self->{_taxon_tree_file});
$newick_taxon_tree = join("", );
close(TREE_FILE);
}
if (!defined($newick_taxon_tree)) {
return undef;
}
$self->{'_taxon_tree'} = $newick_taxon_tree;
return $self->{'_taxon_tree'};
}
sub tree_string {
my $self = shift;
$self->{'_tree_string'} = shift if(@_);
return $self->{'_tree_string'};
}
sub taxon_tree_string {
my $self = shift;
$self->{'_taxon_tree_string'} = shift if(@_);
return $self->{'_taxon_tree_string'};
}
sub method_link_species_set_id {
my $self = shift;
$self->{'_method_link_species_set_id'} = shift if(@_);
return $self->{'_method_link_species_set_id'};
}
sub max_block_size {
my $self = shift;
$self->{'_max_block_size'} = shift if(@_);
return $self->{'_max_block_size'};
}
sub reference_species {
my $self = shift;
$self->{'_reference_species'} = shift if(@_);
return $self->{'_reference_species'};
}
##########################################
#
# internal methods
#
##########################################
sub get_params {
my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n");
my $params = eval($param_string);
return unless($params);
if(defined($params->{'genomic_align_block_id'})) {
$self->genomic_align_block_id($params->{'genomic_align_block_id'});
}
if(defined($params->{'method_link_species_set_id'})) {
$self->method_link_species_set_id($params->{'method_link_species_set_id'});
}
if(defined($params->{'tree_file'})) {
$self->{_tree_file} = $params->{'tree_file'};
}
if(defined($params->{'tree_analysis_data_id'})) {
$self->{_tree_analysis_data_id} = $params->{'tree_analysis_data_id'};
}
if(defined($params->{'taxon_tree_analysis_data_id'})) {
$self->{_taxon_tree_analysis_data_id} = $params->{'taxon_tree_analysis_data_id'};
}
if(defined($params->{'pairwise_analysis_data_id'})) {
$self->{_pairwise_analysis_data_id} = $params->{'pairwise_analysis_data_id'};
}
if(defined($params->{'reference_species'})) {
$self->{_reference_species} = $params->{'reference_species'};
}
if(defined($params->{'max_block_size'})) {
$self->{_max_block_size} = $params->{'max_block_size'};
}
return 1;
}
sub _load_GenomicAligns {
my ($self, $genomic_align_block_id) = @_;
my $genomic_aligns = [];
# Fail if dbID has not been provided
return $genomic_aligns if (!$genomic_align_block_id);
my $gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
my $gab = $gaba->fetch_by_dbID($genomic_align_block_id);
foreach my $ga (@{$gab->get_all_GenomicAligns}) {
push(@{$genomic_aligns}, $ga);
}
$self->genomic_aligns($genomic_aligns);
}
=head2 _load_2XGenomes
Arg [1] : int genomic_align_block_id
Arg [2] : int analysis_data_id
Description: Creates a fake assembly for each 2X genome by stitching
together the BLASTZ_NET alignments found on this synteny_region
between the reference species and each 2X genome. The list of
the pairwise database locations and
Bio::EnsEMBL::Compara::MethodLinkSpeciesSet ids are obtained
from the analysis_data_id. Creates a listref of genomic_align
fragments
Returntype :
Exception :
Warning :
=cut
sub _load_2XGenomes {
my ($self, $genomic_align_block_id, $analysis_data_id) = @_;
#get data from analysis_data table
my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
my @parameters = split (" ",$analysis_data_adaptor->fetch_by_dbID($analysis_data_id));
#if no 2x genomes defined, return
if (scalar(@parameters) == 0) {
print "No 2x genomes to load\n" if $self->debug;
return;
}
#Find the slice on the reference genome
my $genome_db_adaptor = $self->{'comparaDBA'}->get_GenomeDBAdaptor;
#DEBUG this opens up connections to all the databases
my $ref_genome_db = $genome_db_adaptor->fetch_by_name_assembly($self->reference_species);
my $ref_dba = $ref_genome_db->db_adaptor;
my $ref_slice_adaptor = $ref_dba->get_SliceAdaptor();
#Get multiple alignment genomic_align_block adaptor
my $multi_gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
#Find all the dnafrag_regions for the reference genome in this synteny region
my $ref_gas =[];
my $multi_gab = $multi_gaba->fetch_by_dbID($genomic_align_block_id);
my $all_gas = $multi_gab->get_all_GenomicAligns;
foreach my $ga (@$all_gas) {
if ($ga->genome_db->dbID == $ref_genome_db->dbID) {
push @$ref_gas, $ga;
}
}
#Return if there is no reference sequence in this gab region
if (scalar(@$ref_gas) == 0) {
print "No " . $self->reference_species . " sequences found in genomic_align_block $genomic_align_block_id\n";
return;
}
print "GAB $genomic_align_block_id num ref copies " . scalar(@$ref_gas) . "\n" if $self->debug;
#Find the BLASTZ_NET alignments between the reference species and each
#2X genome.
foreach my $params (@parameters) {
my $param = eval($params);
my $target_species;
#open compara database containing 2x genome vs $ref_name blastz results
my $compara_db_url = $param->{'compara_db_url'};
#if the database name is defined in the url, then open that
my $compara_dba;
my $locator;
if ($compara_db_url =~ /mysql:\/\/.*@.*\/.+/) {
#open database defined in url
$locator = "Bio::EnsEMBL::Compara::DBSQL::DBAdaptor/url=>$compara_db_url";
} else {
throw "Invalid url $compara_db_url. Should be of the form: mysql://user:pass\@host:port/db_name\n";
}
$compara_dba = Bio::EnsEMBL::DBLoader->new($locator);
#need to store this to allow disconnect when call ortheus
$self->{pairwise_compara_dba}->{$compara_dba->dbc->dbname} = $compara_dba;
#Get pairwise genomic_align_block adaptor
my $pairwise_gaba = $compara_dba->get_GenomicAlignBlockAdaptor;
#Get pairwise method_link_species_set
my $p_mlss_adaptor = $compara_dba->get_MethodLinkSpeciesSetAdaptor;
my $pairwise_mlss = $p_mlss_adaptor->fetch_by_dbID($param->{'method_link_species_set_id'});
#find non_reference species name in pairwise alignment
my $species_set = $pairwise_mlss->species_set;
foreach my $genome_db (@$species_set) {
if ($genome_db->name ne $self->reference_species) {
$target_species = $genome_db->name;
last;
}
}
my $target_genome_db = $genome_db_adaptor->fetch_by_name_assembly($target_species);
my $target_dba = $target_genome_db->db_adaptor;
my $target_slice_adaptor = $target_dba->get_SliceAdaptor();
#Foreach copy of the ref_genome in the multiple alignment block,
#find the alignment blocks between the ref_genome and the 2x
#target_genome in the pairwise database
my $ga_frag_array = $self->_create_frag_array($pairwise_gaba, $pairwise_mlss, $ref_gas);
#not found 2x genome
next if (!defined $ga_frag_array);
#must first sort so I have a reasonable chance of finding duplicates
#4/10/08 don't think sorting here is a good idea. The order is very
#important for when I store the fragments and work out their offsets.
# for (my $i = 0; $i < scalar(@$ga_frag_array); $i++) {
# @{$ga_frag_array->[$i]} = sort {$a->{genomic_align}->dnafrag_start <=> $b->{genomic_align}->dnafrag_start} @{$ga_frag_array->[$i]};
# }
#find the total length of all the fragments in the ref_region
my $sum_lengths;
for (my $i = 0; $i < scalar(@$ga_frag_array); $i++) {
for (my $j = 0; $j < scalar(@{$ga_frag_array->[$i]}); $j++) {
#print "*** gab *** " . $ga_frag_array->[$i][$j]->{genomic_align}->genome_db->name . " " . $ga_frag_array->[$i][$j]->{genomic_align}->genomic_align_block . "\n";
$sum_lengths->[$i] += ($ga_frag_array->[$i][$j]->{genomic_align}->dnafrag_end - $ga_frag_array->[$i][$j]->{genomic_align}->dnafrag_start + 1);
}
}
#check if there is any overlap between pairwise blocks on the ref_genomes
#if there is an overlap, then choose ref_genome duplication which is the
#longest in 2x genome
#if there is no overlap, save dnafrags on all duplications
my $found_overlap;
my $j = 0;
#Simple case: only found one reference region containing 2x genome
#fragments
if (@$ga_frag_array == 1) {
my $cluster;
$cluster = _add_to_cluster($cluster, 0);
_print_cluster($cluster) if $self->debug;
my $longest_ref_region = 0;
print "SIMPLE CASE: longest_region $longest_ref_region length " . $sum_lengths->[$longest_ref_region] . "\n" if $self->debug;
#_build_2x_composite_seq($self, $compara_dba, $ref_slice_adaptor, $target_slice_adaptor, $ga_frag_array->[$longest_ref_region]);
push @{$self->{ga_frag}}, $ga_frag_array->[$longest_ref_region];
push @{$self->{'2x_dnafrag_region'}}, $ga_frag_array->[$longest_ref_region]->[0]->{genomic_align};
next;
}
#Found more than one reference region in this synteny block
for (my $region1 = 0; $region1 < scalar(@$ga_frag_array)-1; $region1++) {
for (my $region2 = $region1+1; $region2 < scalar(@$ga_frag_array); $region2++) {
#initialise found_overlap hash
if (!defined $found_overlap->{$region1}{$region2}) {
$found_overlap->{$region1}{$region2} = 0;
}
#loop through the 2x genome fragments on region1
for (my $j = 0; ($j < @{$ga_frag_array->[$region1]}); $j++) {
#if I've already found an overlap, then stop
last if ($found_overlap->{$region1}{$region2});
#loop through 2x genome fragments on region2
for (my $k = 0; ($k < @{$ga_frag_array->[$region2]}); $k++) {
#if I've already found an overlap, then stop
last if ($found_overlap->{$region1}{$region2});
#check if 2x genome fragments have the same name
if ($ga_frag_array->[$region1][$j]->{seq_region_name} eq $ga_frag_array->[$region2][$k]->{seq_region_name}) {
#check these overlap
if (($ga_frag_array->[$region1][$j]->{start} <= $ga_frag_array->[$region2][$k]->{end}) && ($ga_frag_array->[$region1][$j]->{end} >= $ga_frag_array->[$region2][$k]->{start})) {
$found_overlap->{$region1}{$region2} = 1;
print "found overlap $region1 $region2\n" if $self->debug;
#found overlap so can stop.
last;
}
}
}
}
}
}
#Cluster all the alignment blocks that are overlapping together
my $cluster = $self->_cluster_regions($found_overlap);
_print_cluster($cluster) if $self->debug;
my $longest_regions = $self->_find_longest_region_in_cluster($cluster, $sum_lengths);
#find the reference with the longest region
my $slice_array;
foreach my $longest_ref_region (@$longest_regions) {
print "longest_ref_region $longest_ref_region length " . $sum_lengths->[$longest_ref_region] . "\n" if $self->debug;
#store composite_seq in ga_frag_array->[$longest_ref_region]
#_build_2x_composite_seq($self, $compara_dba, $ref_slice_adaptor, $target_slice_adaptor, $ga_frag_array->[$longest_ref_region]);
push @{$self->{ga_frag}}, $ga_frag_array->[$longest_ref_region];
push @{$self->{'2x_dnafrag_region'}}, $ga_frag_array->[$longest_ref_region]->[0]->{genomic_align};
}
}
}
=head2 _dump_fasta_and_mfa
Arg [1] : -none-
Example : $self->_dump_fasta();
Description: Dumps FASTA files in the order given by the tree
string (needed by Pecan). Resulting file names are
stored using the fasta_files getter/setter
Returntype : 1
Exception :
Warning :
=cut
sub _dump_fasta_and_mfa {
my $self = shift;
my $all_genomic_aligns = $self->genomic_aligns;
## Dump FASTA files in the order given by the tree string (needed by Pecan)
my @seqs;
if ($self->tree_string) {
@seqs = ($self->tree_string =~ /seq(\d+)/g);
} else {
@seqs = (1..scalar(@$all_genomic_aligns));
}
my $mfa_file = $self->worker_temp_directory . "/epo_alignment.$$.mfa";
$self->{multi_fasta_file} = $mfa_file;
print "mfa_file $mfa_file\n" if $self->debug;
open MFA, ">$mfa_file" || throw("Couldn't open $mfa_file");
foreach my $seq_id (@seqs) {
my $ga = $all_genomic_aligns->[$seq_id-1];
my $file = $self->worker_temp_directory . "/seq" . $seq_id;
#Check if I have a DnaFragRegion object or my 2x genome object
#if (!UNIVERSAL::isa($dfr, 'Bio::EnsEMBL::Compara::DnaFragRegion')) {
if (!UNIVERSAL::isa($ga, 'Bio::EnsEMBL::Compara::GenomicAlign')) {
print "FOUND 2X GENOME\n" if $self->debug;
print "num of frags " . @$ga . "\n" if $self->debug;
$self->_dump_2x_fasta($ga, $file, $seq_id, \*MFA);
next;
}
#add taxon_id to end of fasta files
$file .= "_" . $ga->genome_db->taxon_id . ".fa";
print "file $file\n" if $self->debug;
open F, ">$file" || throw("Couldn't open $file");
print F ">SeqID" . $seq_id . "\n";
#print MFA ">SeqID" . $seq_id . "\n";
#print MFA ">seq" . $seq_id . "\n";
print MFA ">seq" . $seq_id . "_" . $ga->genome_db->taxon_id . "\n";
print ">DnaFrag", $ga->dnafrag->dbID, "|", $ga->dnafrag->name, ".",
$ga->dnafrag_start, "-", $ga->dnafrag_end, ":", $ga->dnafrag_strand,"\n" if $self->debug;
my $slice = $ga->get_Slice;
throw("Cannot get slice for DnaFragRegion in DnaFrag #".$ga->dnafrag->dbID) if (!$slice);
my $seq = $slice->get_repeatmasked_seq(undef, 1)->seq;
if ($seq =~ /[^ACTGactgNnXx]/) {
print STDERR $slice->name, " contains at least one non-ACTGactgNnXx character. These have been replaced by N's\n";
$seq =~ s/[^ACTGactgNnXx]/N/g;
}
$seq =~ s/(.{80})/$1\n/g;
chomp $seq;
print F $seq,"\n";
close F;
my $aligned_seq = $ga->aligned_sequence;
$aligned_seq =~ s/(.{60})/$1\n/g;
$aligned_seq =~ s/\n$//;
print MFA $aligned_seq, "\n";
push @{$self->fasta_files}, $file;
push @{$self->species_order}, $ga->dnafrag->genome_db_id;
}
close MFA;
return 1;
}
=head2 _build_tree_string
Arg [1] : -none-
Example : $self->_build_tree_string();
Description: This method sets the tree_string using the orginal
species tree and the set of DnaFragRegions. The
tree is edited by the _update_tree method which
resort the DnaFragRegions (see _update_tree elsewwhere
in this document)
Returntype : -none-
Exception :
Warning :
=cut
sub _build_tree_string {
my $self = shift;
my $tree = $self->get_species_tree->copy;
return if (!$tree);
$tree = $self->_update_tree_2x($tree);
return if (!$tree);
my $tree_string = $tree->newick_simple_format;
# Remove quotes around node labels
$tree_string =~ s/"(seq\d+)"/$1/g;
# Remove branch length if 0
$tree_string =~ s/\:0\.0+(\D)/$1/g;
$tree_string =~ s/\:0([^\.\d])/$1/g;
$tree->release_tree;
$self->tree_string($tree_string);
}
=head2 _update_tree_2x
Arg [1] : Bio::EnsEMBL::Compara::NestedSet $tree_root
Example : $self->_update_nodes_names($tree);
Description: This method updates the tree by removing or
duplicating the leaves according to the orginal
tree and the set of DnaFragRegions. The tree nodes
will be renamed seq1, seq2, seq3 and so on and the
DnaFragRegions will be resorted in order to match
the names of the nodes (the first DnaFragRegion will
correspond to seq1, the second to seq2 and so on).
Returntype : Bio::EnsEMBL::Compara::NestedSet (a tree)
Exception :
Warning :
=cut
sub _update_tree_2x {
my $self = shift;
my $tree = shift;
my $all_genomic_aligns = $self->genomic_aligns();
my $ordered_genomic_aligns = [];
my $ordered_2x_genomes = [];
my $idx = 1;
my $all_leaves = $tree->get_all_leaves;
foreach my $this_leaf (@$all_leaves) {
my $these_genomic_aligns = [];
my $these_2x_genomes = [];
## Look for genomic_aligns belonging to this genome_db_id
foreach my $this_genomic_align (@$all_genomic_aligns) {
if ($this_genomic_align->dnafrag->genome_db_id == $this_leaf->name) {
push (@$these_genomic_aligns, $this_genomic_align);
}
}
my $index = 0;
foreach my $ga_frags (@{$self->{ga_frag}}) {
my $first_frag = $ga_frags->[0];
#print "update_tree first_frag " . $first_frag->{genomic_align}->genome_db->dbID . " this leaf " . $this_leaf->name . "\n";
if ($first_frag->{genomic_align}->dnafrag->genome_db->dbID == $this_leaf->name) {
push(@$these_2x_genomes, $index);
}
$index++;
}
print "num " . @$these_genomic_aligns . " " . @$these_2x_genomes . "\n" if $self->debug;
if (@$these_genomic_aligns == 1) {
## If only 1 has been found...
my $taxon_id = $these_genomic_aligns->[0]->dnafrag->genome_db->taxon_id;
print "seq$idx" . "_" . $taxon_id . " genome_db_id=" . $these_genomic_aligns->[0]->dnafrag->genome_db_id . "\n" if $self->debug;
$this_leaf->name("seq".$idx++."_".$taxon_id); #.".".$these_dnafrag_regions->[0]->dnafrag_id);
push(@$ordered_genomic_aligns, $these_genomic_aligns->[0]);
} elsif (@$these_genomic_aligns > 1) {
## If more than 1 has been found, let Ortheus estimate the Tree
#need to add on 2x genomes to genomic_aligns array
my $ga = $self->genomic_aligns;
foreach my $ga_frags (@{$self->{ga_frag}}) {
push @$ga, $ga_frags;
}
$self->genomic_aligns($ga);
return undef;
} elsif (@$these_2x_genomes == 1) {
#See what happens...
#Find 2x genomes
my $ga_frags = $self->{ga_frag}->[$these_2x_genomes->[0]];
print "number of frags " . @$ga_frags . "\n" if $self->debug;
my $taxon_id = $ga_frags->[0]->{taxon_id};
print "2x seq$idx" . "_" . $taxon_id . " " . $ga_frags->[0]->{genome_db_id} . "\n" if $self->debug;
$this_leaf->name("seq".$idx++."_".$taxon_id);
#push(@$ordered_2x_genomes, $these_2x_genomes->[0]);
push(@$ordered_genomic_aligns, $ga_frags);
} else {
## If none has been found...
$this_leaf->disavow_parent;
$tree = $tree->minimize_tree;
}
}
$self->genomic_aligns($ordered_genomic_aligns);
$self->{ordered_2x_genomes} = $ordered_2x_genomes;
if ($tree->get_child_count == 1) {
my $child = $tree->children->[0];
$child->parent->merge_children($child);
$child->disavow_parent;
}
return $tree;
}
#
# Create array of 2x fragments defined by the $pairwise_mlss found for the reference
#genomic_aligns (may be more than one) in $ref_gas
#
sub _create_frag_array {
my ($self, $gab_adaptor, $pairwise_mlss, $ref_gas) = @_;
my $ga_frag_array;
my $ga_num_ns = 0;
#Multiple alignment reference genomic_aligns (maybe more than 1)
foreach my $ref_ga (@$ref_gas) {
print " " . $ref_ga->dnafrag->name . " " . $ref_ga->dnafrag_start . " " . $ref_ga->dnafrag_end . " " . $ref_ga->dnafrag_strand . "\n" if $self->debug;
#find the slice corresponding to the ref_genome
my $slice = $ref_ga->get_Slice;
print "ref_seq " . $slice->start . " " . $slice->end . " " . $slice->strand . " " . substr($slice->seq,0,120) . "\n" if $self->debug;
#find the pairwise blocks between ref_genome and the 2x genome
my $pairwise_gabs = $gab_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice($pairwise_mlss, $slice, undef,undef,"restrict");
#sort by reference_genomic_align start position (NB I sort again when parsing
#the results if the ref strand is reverse since the fragments will be in the
#reverse order ie A-B-C should be C-B-A). Don't do it here because I try to find
#duplicates in load_2XGenomes.
@$pairwise_gabs = sort {$a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start} @$pairwise_gabs;
print " pairwise gabs " . scalar(@$pairwise_gabs) . "\n" if $self->debug;
#if there are no pairwise matches found to 2x genome, then escape
#back to loop
next if (scalar(@$pairwise_gabs) == 0);
my $ga_frags;
#need to save each match separately but still use same structure as
#create_span_frag_array in case we change our minds back again
foreach my $pairwise_gab (@$pairwise_gabs) {
#should only have 1!
my $ga = $pairwise_gab->get_all_non_reference_genomic_aligns->[0];
my $ref_start = $ga->genomic_align_block->reference_genomic_align->dnafrag_start;
my $ref_end = $ga->genomic_align_block->reference_genomic_align->dnafrag_end;
#need to store gab too otherwise it goes out of scope and I can't
#access it from ga
my $ga_fragment = {genomic_align => $ga,
genomic_align_block => $ga->genomic_align_block,
taxon_id => $ga->genome_db->taxon_id,
genome_db_id => $ga->dnafrag->genome_db_id,
ref_ga => $ref_ga,
};
print "GAB " . $ga_fragment->{genomic_align}->genome_db->name . " " . $ga_fragment->{genomic_align}->dnafrag_start . " " . $ga_fragment->{genomic_align}->dnafrag_end . " " . $ga_fragment->{genomic_align}->dnafrag_strand . " " . substr($ga_fragment->{genomic_align}->get_Slice->seq,0,120) . "\n" if $self->debug;
push @$ga_frags, $ga_fragment;
}
#add to array of fragments for each reference genomic_align
push @$ga_frag_array, $ga_frags;
}
return $ga_frag_array;
}
#foreach cluster, find the longest region.
sub _find_longest_region_in_cluster {
my ($self, $cluster, $sum_lengths) = @_;
my $max_frag = 0;
my $final_region = -1;
my @overlap_frag;
my $overlap_cnt = 0;
my $not_overlap_cnt = 0;
my $longest_clusters;
foreach my $this_cluster (@$cluster) {
my $longest_frag;
my $longest_region;
foreach my $region (keys %{$this_cluster}) {
#initialise variables
if (!defined $longest_frag) {
$longest_frag = $sum_lengths->[$region];
$longest_region = $region;
}
if ($sum_lengths->[$region] >= $longest_frag) {
$longest_frag = $sum_lengths->[$region];
$longest_region = $region;
}
}
push @$longest_clusters, $longest_region;
}
print "overlap_cnt $overlap_cnt not $not_overlap_cnt\n" if $self->debug;
return $longest_clusters;
}
#Put overlapping regions in the same cluster. If region 0 overlaps with region
#1 and region 2, but not with region 3, create 2 clusters: (0,1,2), (3)
sub _cluster_regions {
my ($self, $found_overlap) = @_;
my $overlap_cnt = 0;
my $not_overlap_cnt = 0;
my $cluster;
foreach my $region1 (keys %$found_overlap) {
foreach my $region2 (keys %{$found_overlap->{$region1}}) {
print "FOUND OVERLAP $region1 $region2 " . $found_overlap->{$region1}{$region2} . "\n" if $self->debug;
if ($found_overlap->{$region1}{$region2}) {
$overlap_cnt++;
$cluster = _add_to_same_cluster($cluster, $region1, $region2);
} else {
$not_overlap_cnt++;
$cluster = _add_to_different_cluster($cluster, $region1, $region2);
}
}
}
print "overlap_cnt $overlap_cnt not $not_overlap_cnt\n" if $self->debug;
return $cluster;
}
#add single region to cluster. No overlaps found.
sub _add_to_cluster {
my ($cluster, $region1) = @_;
if (!defined $cluster) {
$cluster->[0]->{$region1} = 1;
}
return $cluster;
}
sub _add_to_same_cluster {
my ($cluster, $region1, $region2) = @_;
#print "add to same cluster $region1 $region2\n";
if (!defined $cluster) {
$cluster->[0]->{$region1} = 1;
$cluster->[0]->{$region2} = 1;
return $cluster;
}
my $cluster_size = @$cluster;
my $index1 = _in_cluster($cluster, $region1);
my $index2 = _in_cluster($cluster, $region2);
if ($index1 == -1 && $index2 == -1) {
#neither found, add both to new cluster
$cluster->[$cluster_size]->{$region1} = 1;
$cluster->[$cluster_size]->{$region2} = 1;
} elsif ($index1 != -1 && $index2 == -1) {
#id1 found, id2 not. add id2 to id1 cluster
$cluster->[$index1]->{$region2} = 1;
} elsif ($index1 == -1 && $index2 != -1) {
#id2 found, id1 not. add id1 to id2 cluster
$cluster->[$index2]->{$region1} = 1;
} else {
#both ids set in different clusters. Merge the clusters together
$cluster = _merge_clusters($cluster, $index1, $index2);
}
return $cluster;
}
sub _add_to_different_cluster {
my ($cluster, $region1, $region2) = @_;
if (!defined $cluster) {
$cluster->[0]->{$region1} = 1;
$cluster->[1]->{$region2} = 1;
return $cluster;
}
my $cluster_size = @$cluster;
my $index1 = _in_cluster($cluster, $region1);
my $index2 = _in_cluster($cluster, $region2);
if ($index1 == -1) {
$cluster->[@$cluster]->{$region1} = 1;
}
if ($index2 == -1) {
$cluster->[@$cluster]->{$region2} = 1;
}
return $cluster;
}
sub _in_cluster {
my ($cluster, $region) = @_;
for (my $i = 0; $i < @$cluster; $i++) {
if ($cluster->[$i]->{$region}) {
return $i;
}
}
return -1;
}
sub _merge_clusters {
my ($cluster, $index1, $index2) = @_;
#already in same cluster
if ($index1 != -1 && $index1 == $index2) {
return $cluster;
}
#copy over keys from index2 to index1
foreach my $region (keys %{$cluster->[$index2]}) {
#print "region $region\n";
$cluster->[$index1]->{$region} = 1;
}
#delete index2
splice(@$cluster, $index2, 1);
return $cluster;
}
sub _print_cluster {
my ($cluster) = @_;
print "FINAL cluster ";
foreach my $this_cluster (@$cluster) {
print "(";
foreach my $group (keys %{$this_cluster}) {
print "$group ";
}
print "), ";
}
print "\n";
}
sub _dump_2x_fasta {
my ($self, $ga_frags, $file, $seq_id, $mfa_fh) = @_;
$file .= "_" . $ga_frags->[0]->{taxon_id} . ".fa";
open F, ">$file" || throw("Couldn't open $file");
print F ">SeqID" . $seq_id . "\n";
#print MFA ">SeqID" . $seq_id . "\n";
#print MFA ">seq" . $seq_id . "\n";
print MFA ">seq" . $seq_id . "_" . $ga_frags->[0]->{taxon_id} . "\n";
my $aligned_seq = $ga_frags->[0]->{aligned_seq};
my $seq = $aligned_seq;
$seq =~ tr/-//d;
print F "$seq\n";
close F;
$aligned_seq =~ s/(.{60})/$1\n/g;
$aligned_seq =~ s/\n$//;
print $mfa_fh $aligned_seq, "\n";
push @{$self->fasta_files}, $file;
push @{$self->species_order}, $ga_frags->[0]->{genome_db_id};
}
#create alignment from multiple genomic_align_block and 2X genomes.
sub _create_mfa {
my ($self) = @_;
my $multi_gab_id = $self->genomic_align_block_id;
my $multi_gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
my $multi_gab = $multi_gaba->fetch_by_dbID($multi_gab_id);
my $multi_gas = $multi_gab->get_all_GenomicAligns;
my $pairwise_frags = $self->{ga_frag};
my $species_order = $self->species_order;
foreach my $ga_frag_array (@$pairwise_frags) {
my $multi_ref_ga = $ga_frag_array->[0]->{ref_ga};
my $multi_mapper = $multi_ref_ga->get_Mapper;
#my $multi_gab_length = length($multi_ref_ga->aligned_sequence);
my $multi_gab_length = $multi_ref_ga->genomic_align_block->length;
## New aligned sequence for the 2X genome. Empty (only dashes) at creation time
my $aligned_sequence = "-" x $multi_gab_length;
foreach my $ga_frag (@$ga_frag_array) {
my $pairwise_gab = $ga_frag->{genomic_align_block};
my $pairwise_non_ref_ga = $pairwise_gab->get_all_non_reference_genomic_aligns->[0];
my $pairwise_ref_ga = $pairwise_gab->reference_genomic_align;
my $pairwise_fixed_seq = $pairwise_non_ref_ga->aligned_sequence("+FIX_SEQ");
#undef($pairwise_non_ref_ga->{'aligned_sequence'});
print "pairwise " . $pairwise_non_ref_ga->genome_db->name . " " . substr($pairwise_fixed_seq,0,120) . "\n" if $self->debug;
my $depad = $pairwise_fixed_seq;
$depad =~ tr/-//d;
#print "depad length " . length($depad) . "\n";
#my $deletion_array = find_ref_deletions($pairwise_ref_ga);
my $deletion_array = find_ref_deletions($pairwise_gab);
my @coords = $multi_mapper->map_coordinates("sequence",
$pairwise_ref_ga->dnafrag_start,
$pairwise_ref_ga->dnafrag_end,
$pairwise_ref_ga->dnafrag_strand,
"sequence");
my $length = 0;
my $start = undef;
my $end = undef;
foreach my $this_coord (@coords) {
next if ($this_coord->isa("Bio::EnsEMBL::Mapper::Gap"));
$length += $this_coord->length;
## Extract the first N characters from $other_fixed_seq
my $subseq = substr($pairwise_fixed_seq, 0, $this_coord->length, "");
## Copy extracted characters into the new aligned sequence for the 2X genome.
substr($aligned_sequence, $this_coord->start-1, $this_coord->length, $subseq);
$start = $this_coord->start if (!defined($start) or $this_coord->start < $start);
$end = $this_coord->end if (!defined($end) or $this_coord->end > $end);
}
$ga_frag->{deletions} = $deletion_array;
$ga_frag->{length} = length($depad);
$ga_frag_array->[0]->{cigar_line} = undef;
$ga_frag_array->[0]->{aligned_seq} = $aligned_sequence;
}
#for (my $x = 0; $x < length($multi_ref_ga->aligned_sequence); $x += 80) {
# print substr($aligned_sequence, $x, 80), "\n";
# print substr($multi_ref_ga->aligned_sequence, $x, 80), "\n\n";
#}
}
}
#find deletions in reference species and store the position in slice coords
#and the length
sub find_ref_deletions {
my ($gab) = @_;
my $deletion_array;
my $ref_ga = $gab->reference_genomic_align;
my $non_ref_ga = $gab->get_all_non_reference_genomic_aligns->[0];
my $ref_mapper = $ref_ga->get_Mapper;
my $non_ref_mapper = $non_ref_ga->get_Mapper;
my @ref_coords = $ref_mapper->map_coordinates("sequence",
$ref_ga->dnafrag_start,
$ref_ga->dnafrag_end,
$ref_ga->dnafrag_strand,
"sequence");
#print "num coords " . @ref_coords . "\n";
#print "non_ref start " . $non_ref_ga->dnafrag_start . " end " . $non_ref_ga->dnafrag_end . "\n";
my $start_del;
my $end_del;
my $num_del = 0;
foreach my $this_coord (@ref_coords) {
#print "coords " . $this_coord->start . " end " . $this_coord->end . " strand " . $this_coord->strand . "\n";
my @non_ref_coords = $non_ref_mapper->map_coordinates("alignment",
$this_coord->start,
$this_coord->end,
$this_coord->strand,
"alignment");
#want all coords starting from left hand end
if ($non_ref_ga->dnafrag_strand == -1) {
#print "start " . $non_ref_ga->dnafrag_end . " " . $non_ref_coords[0]->start . "\n";
$end_del = ($non_ref_ga->dnafrag_end - $non_ref_coords[0]->end +1);
} else {
$end_del = $non_ref_coords[0]->start - $non_ref_ga->dnafrag_start+1;
}
if (defined $start_del) {
#print "found del $start_del $end_del\n";
my $deletion;
$deletion->{pos} = $start_del - $num_del;
$deletion->{len} = ($end_del-$start_del-1);
push @$deletion_array, $deletion;
$num_del += $deletion->{len};
#print "del pos " . $deletion->{pos} . " len " . $deletion->{len} . "\n";
}
if ($non_ref_ga->dnafrag_strand == -1) {
#print "end " . $non_ref_ga->dnafrag_end . " " . $non_ref_coords[-1]->end . "\n";
$start_del = ($non_ref_ga->dnafrag_end - $non_ref_coords[-1]->start +1);
} else {
$start_del = $non_ref_coords[-1]->end-$non_ref_ga->dnafrag_start+1;
}
# foreach my $non_ref (@non_ref_coords) {
# if ($non_ref->isa("Bio::EnsEMBL::Mapper::Gap")) {
# print " found gap " . $non_ref->start . " end ". $non_ref->end . "\n";
# } else {
# print " non ref " . ($non_ref->start-$non_ref_ga->dnafrag_start+1) . " end " . ($non_ref->end-$non_ref_ga->dnafrag_start+1) . "\n";
# print " non ref real " . $non_ref->start . " end " . $non_ref->end . "\n";
# }
# }
}
return $deletion_array;
}
sub get_seq_length_from_cigar {
my ($cigar_line) = @_;
my $seq_pos;
my @cig = ( $cigar_line =~ /(\d*[GMDXI])/g );
for my $cigElem ( @cig ) {
my $cigType = substr( $cigElem, -1, 1 );
my $cigCount = substr( $cigElem, 0 ,-1 );
$cigCount = 1 unless ($cigCount =~ /^\d+$/);
if( $cigType eq "M" ) {
$seq_pos += $cigCount;
} elsif( $cigType eq "I") {
$seq_pos += $cigCount;
}
}
return $seq_pos;
}
sub delete_epo_alignments {
my ($self, $gab_id) = @_;
my $dbc = $self->{'comparaDBA'};
my $mlss_id = $self->method_link_species_set_id;
my $gab_adaptor = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
my $gat_adaptor = $self->{'comparaDBA'}->get_GenomicAlignTreeAdaptor;
my $genome_db_adaptor = $self->{'comparaDBA'}->get_GenomeDBAdaptor;
my $ancestor_genome_db = $genome_db_adaptor->fetch_by_name_assembly("Ancestral sequences");
my $ancestor_dba = $ancestor_genome_db->db_adaptor;
my $compara_dba = $gab_adaptor;
#get all genomic_align_blocks for mlss
my $sql = "SELECT DISTINCT ga2.genomic_align_block_id FROM genomic_align ga1 LEFT JOIN genomic_align ga2 USING (dnafrag_id, dnafrag_start, dnafrag_end) WHERE ga1.genomic_align_block_id=$gab_id AND ga2.method_link_species_set_id=$mlss_id";
my $sth = $compara_dba->prepare($sql);
$sth->execute();
my ($genomic_align_block_id);
my @new_gabs;
$sth->bind_columns(\$genomic_align_block_id);
while ($sth->fetch()) {
print "Gab " . $genomic_align_block_id . "\n";
push @new_gabs, $gab_adaptor->fetch_by_dbID($genomic_align_block_id);
}
$sth->finish();
if (@new_gabs == 0) {
print STDERR "Nothing to delete\n";
return;
}
#need to delete any analysis_jobs created for gerp.
my %undeleted_gabs;
foreach my $genomic_align_block (@new_gabs) {
my %gabs_to_delete;
my @gags_to_delete;
my @dnafrags_to_delete;
my @seq_regions_to_delete;
$undeleted_gabs{$genomic_align_block->dbID} = 1;
#delete both modern and ancestral block in each iteration
next if (!defined $genomic_align_block);
if ($genomic_align_block->method_link_species_set->method_link_class ne "GenomicAlignTree.tree_alignment") {
warn("This script is for deleting epo alignments only\n");
next;
}
#get the tree for each block
my $genomic_align_tree = $gat_adaptor->fetch_by_GenomicAlignBlock($genomic_align_block);
#check still have a tree
next if (!defined $genomic_align_tree);
foreach my $this_node (@{$genomic_align_tree->get_all_nodes}) {
my $genomic_align_group = $this_node->genomic_align_group;
next if (!$genomic_align_group);
foreach my $this_genomic_align (@{$genomic_align_group->get_all_GenomicAligns}) {
#get unique genomic_align_blocks
$gabs_to_delete{$this_genomic_align->genomic_align_block_id} = 1;
#check all gabs end up being deleted
delete($undeleted_gabs{$this_genomic_align->genomic_align_block_id});
#get the ancestral dnafrags
my $dnafrag_name = $this_genomic_align->dnafrag->name;
if ($dnafrag_name =~ /Ancestor/) {
push @dnafrags_to_delete, $dnafrag_name;
}
}
#get the genomic_align_groups
push @gags_to_delete, $genomic_align_group->dbID;
}
my $root_id = $genomic_align_tree->root->node_id;
my ($sql_gab, $sql_ga, $sql_gag, $sql_gat, $sql_dnafrag, $sql_dna, $sql_seq_region);
#assume not have too many of these!
$sql_gab = "DELETE FROM genomic_align_block WHERE genomic_align_block_id IN ";
$sql_ga = "DELETE FROM genomic_align WHERE genomic_align_block_id IN ";
$sql_gag = "DELETE FROM genomic_align_group WHERE group_id IN ";
$sql_gat = "DELETE FROM genomic_align_tree WHERE root_id =$root_id";
$sql_dnafrag = "DELETE FROM dnafrag WHERE genome_db_id = " . $ancestor_genome_db->dbID . " AND name IN ";
$sql_dna = "DELETE FROM dna WHERE seq_region_id IN ";
$sql_seq_region = "DELETE FROM seq_region WHERE seq_region_id IN ";
#convert hash to array
my @gab_ids;
foreach my $gab (keys %gabs_to_delete) {
push @gab_ids, $gab;
}
#find seq_region_ids from names
my $sql_seq_region_select = "SELECT seq_region_id FROM seq_region WHERE name IN ";
my $sql_seq_region_select_to_exec = $sql_seq_region_select . "(\"" . join("\",\"", @dnafrags_to_delete) . "\")";
my $sth = $ancestor_dba->dbc->prepare($sql_seq_region_select_to_exec);
#print "SQL: $sql_seq_region_select_to_exec\n";
$sth->execute;
my $this_seq_region_id;
$sth->bind_columns(\$this_seq_region_id);
while ($sth->fetch()) {
push @seq_regions_to_delete, $this_seq_region_id;
}
my $sql_gab_to_exec = $sql_gab . "(" . join(",", @gab_ids) . ")";
my $sql_ga_to_exec = $sql_ga . "(" . join(",", @gab_ids) . ")";
my $sql_gag_to_exec = $sql_gag . "(" . join(",", @gags_to_delete) . ")";
my $sql_dnafrag_to_exec = $sql_dnafrag . "(\"" . join("\",\"", @dnafrags_to_delete) . "\")";
#my $sql_gat_to_exec = $sql_gat . "(" . join(",", @gags_to_delete) . ")";
my $sql_gat_to_exec = "DELETE FROM genomic_align_tree WHERE root_id = $root_id";
my $sql_dna_to_exec = $sql_dna . "(" . join(",", @seq_regions_to_delete) . ")";
my $sql_seq_region_to_exec = $sql_seq_region . "(" . join(",", @seq_regions_to_delete) . ")";
#delete genomic_align_block, genomic_aligns, genomic_align_groups, genomic_align_trees, ancestral dnafrags
foreach my $sql ($sql_gab_to_exec,$sql_ga_to_exec,$sql_gag_to_exec,$sql_gat_to_exec,$sql_dnafrag_to_exec) {
my $sth = $compara_dba->dbc->prepare($sql);
print "SQL: $sql\n";
$sth->execute;
$sth->finish;
}
#delete dna and seq_region from the ancestral_core db
foreach my $sql ($sql_dna_to_exec, $sql_seq_region_to_exec) {
my $sth = $ancestor_dba->dbc->prepare($sql);
print "SQL: $sql\n";
$sth->execute;
$sth->finish;
}
print "\n\n";
}
#convert hash to array
my @gab_ids;
foreach my $gab_id (keys %undeleted_gabs) {
warn("*** This genomic_align_block $gab_id has no tree ***\n");
#print "Deleting genomic_align_block and genomic_align only\n";
push @gab_ids, $gab_id;
}
#Deleting genomic_align_block and genomic_align only
if (@gab_ids) {
my $sql_gab = "DELETE FROM genomic_align_block WHERE genomic_align_block_id IN ";
my $sql_ga = "DELETE FROM genomic_align WHERE genomic_align_block_id IN ";
my $sql_gab_to_exec = $sql_gab . "(" . join(",", @gab_ids) . ")";
my $sql_ga_to_exec = $sql_ga . "(" . join(",", @gab_ids) . ")";
foreach my $sql ($sql_gab_to_exec,$sql_ga_to_exec) {
my $sth = $compara_dba->dbc->prepare($sql);
#print "SQL: $sql\n";
#$sth->execute;
$sth->finish;
}
}
}
1;