Bio::EnsEMBL::Compara::Production::GenomicAlignBlock Mlagan
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::Mlagan
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::Runnable::Mlagan
Bio::EnsEMBL::Compara::DnaFragRegion
Bio::EnsEMBL::Compara::Graph::NewickParser
Bio::EnsEMBL::Compara::NestedSet
Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor ;
Bio::EnsEMBL::Hive::Process
Bio::EnsEMBL::Utils::Exception
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Bio::EnsEMBL::Hive::Process
Synopsis
Description
Methods
_trim_gab_left
No description
Code
_trim_gab_right
No description
Code
_write_gerp_dataflow
No description
Code
build_tree_string
No description
Code
dumpFasta
No description
Code
fasta_files
No description
Code
fetch_inputDescriptionCode
get_params
No description
Code
get_species_tree
No description
Code
max_block_size
No description
Code
method_link_species_set_id
No description
Code
run
No description
Code
synteny_region_id
No description
Code
tree_string
No description
Code
update_node_names
No description
Code
write_output
No description
Code
Methods description
fetch_inputcode    nextTop
    Title   :   fetch_input
Usage : $self->fetch_input
Function: Fetches input data for repeatmasker from the database
Returns : none
Args : none
Methods code
_trim_gab_leftdescriptionprevnextTop
sub _trim_gab_left {
    my ($gab) = @_;
    
    if (!defined($gab)) {
	return undef;
    }
    my $align_length = $gab->length;
    
    my $gas = $gab->get_all_GenomicAligns();
    my $d_length;
    my $m_length;
    my $min_d_length = $align_length;
    
    my $found_min = 0;

    #take first element in cigar string for each genomic_align and if it is a
#match, it must extend to the start of the block. Find the shortest delete.
#If the shortest delete and the match are the same length, there is no
#overlap between them so restrict to the end of the delete and try again.
#If the delete is shorter than the match, there must be an overlap.
foreach my $ga (@$gas) { my ($cigLength, $cigType) = ( $ga->cigar_line =~ /^(\d*)([GMD])/ ); $cigLength = 1 unless ($cigLength =~ /^\d+$/); if ($cigType eq "D" or $cigType eq "G") { $d_length = $cigLength; if ($d_length < $min_d_length) { $min_d_length = $d_length; } } else { $m_length = $cigLength; $found_min++; } } #if more than one alignment filled to the left edge, no need to restrict
if ($found_min > 1) { return $gab; } my $new_gab = ($gab->restrict_between_alignment_positions( $min_d_length+1, $align_length, 1)); #no overlapping genomic_aligns
if ($new_gab->length == 0) { return $new_gab; } #if delete length is less than match length then must have sequence overlap
if ($min_d_length < $m_length) { return $new_gab; } #otherwise try again with restricted gab
return _trim_gab_left($new_gab); } #trim genomic align block from the right hand edge to first position having at
#least 2 genomic aligns which overlap
}
_trim_gab_rightdescriptionprevnextTop
sub _trim_gab_right {
    my ($gab) = @_;
    
    if (!defined($gab)) {
	return undef;
    }
    my $align_length = $gab->length;

    my $max_pos = 0;
    my $gas = $gab->get_all_GenomicAligns();
    
    my $found_max = 0;
    my $d_length;
    my $m_length;
    my $min_d_length = $align_length;

    #take last element in cigar string for each genomic_align and if it is a
#match, it must extend to the end of the block. Find the shortest delete.
#If the shortest delete and the match are the same length, there is no
#overlap between them so restrict to the end of the delete and try again.
#If the delete is shorter than the match, there must be an overlap.
foreach my $ga (@$gas) { my ($cigLength, $cigType) = ( $ga->cigar_line =~ /(\d*)([GMD])$/ ); $cigLength = 1 unless ($cigLength =~ /^\d+$/); if ($cigType eq "D" or $cigType eq "G") { $d_length =$cigLength; if ($d_length < $min_d_length) { $min_d_length = $d_length; } } else { $m_length = $cigLength; $found_max++; } } #if more than one alignment filled the right edge, no need to restrict
if ($found_max > 1) { return $gab; } my $new_gab = $gab->restrict_between_alignment_positions(1, $align_length - $min_d_length, 1); #no overlapping genomic_aligns
if ($new_gab->length == 0) { return $new_gab; } #if delete length is less than match length then must have sequence overlap
if ($min_d_length < $m_length) { return $new_gab; } #otherwise try again with restricted gab
return _trim_gab_right($new_gab);
}
_write_gerp_dataflowdescriptionprevnextTop
sub _write_gerp_dataflow {
    my ($self, $gab, $mlss) = @_;

    my $species_set = "[";
    my $genome_db_set  = $mlss->species_set;

    foreach my $genome_db (@$genome_db_set) {
	$species_set .= $genome_db->dbID . ","; 
    }
    $species_set .= "]";
      
    my $output_id = "{genomic_align_block_id=>" . $gab->dbID . ",species_set=>" .  $species_set . "}";
    $self->dataflow_output_id($output_id);

}

##########################################
#
# getter/setter methods
#
##########################################
#sub input_dir {
# my $self = shift;
# $self->{'_input_dir'} = shift if(@_);
# return $self->{'_input_dir'};
#
}
build_tree_stringdescriptionprevnextTop
sub build_tree_string {
  my $self = shift;

  my $tree = $self->get_species_tree;
  return if (!$tree);

  $tree = $self->update_node_names($tree);

  my $tree_string = $tree->newick_simple_format;

  $tree_string =~ s/:\d+\.\d+//g;
  $tree_string =~ s/[,;]/ /g;
  $tree_string =~ s/\"//g;

  $tree->release_tree;

  return $tree_string;
}
dumpFastadescriptionprevnextTop
sub dumpFasta {
  my $self = shift;

#  $self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);
my $sra = $self->{'comparaDBA'}->get_SyntenyRegionAdaptor; my $sr = $sra->fetch_by_dbID($self->synteny_region_id); my $idx = 1; foreach my $dfr (@{$sr->children}) { my $file = $self->worker_temp_directory . "/seq" . $idx . ".fa"; my $masked_file = $self->worker_temp_directory . "/seq" . $idx . ".fa.masked"; $idx++; open F, ">$file" || throw("Couldn't open $file"); open MF, ">$masked_file" || throw("Couldn't open $masked_file"); # WARNING this is a hack. It won't work at all on self comparisons!!!
# This will be more generic and fixed when the retain/release call will be
# cleaned from the Node/Link/NestedSet code.
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
$self->{'_dnafrag_regions'}{$dfr->dnafrag_id} = $dfr; $dfr->retain; $dfr->disavow_parent; my $slice = $dfr->slice; print F ">DnaFrag" . $dfr->dnafrag_id . ".\n"; print MF ">DnaFrag" . $dfr->dnafrag_id . ".\n"; my $seq = $slice->seq; $seq =~ s/(.{80})/$1\n/g; chomp $seq; print F $seq,"\n"; $seq = $slice->get_repeatmasked_seq->seq; $seq =~ s/(.{80})/$1\n/g; chomp $seq; print MF $seq,"\n"; close F; close MF; push @{$self->fasta_files}, $file; } # $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
$sr->release_tree; if ($self->get_species_tree) { my $tree_string = $self->build_tree_string; $self->tree_string($tree_string); } return 1;
}
fasta_filesdescriptionprevnextTop
sub fasta_files {
  my $self = shift;

  $self->{'_fasta_files'} = [] unless (defined $self->{'_fasta_files'});

  if (@_) {
    my $value = shift;
    push @{$self->{'_fasta_files'}}, $value;
  }

  return $self->{'_fasta_files'};
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_;

  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
  $self->{'hiveDBA'} = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new(-DBCONN => $self->{'comparaDBA'}->dbc);
  $self->get_params($self->parameters);
  $self->get_params($self->input_id);

  $self->dumpFasta;

  return 1;
}
get_paramsdescriptionprevnextTop
sub get_params {
  my $self         = shift;
  my $param_string = shift;

  return unless($param_string);
#   print("parsing parameter string : ",$param_string,"\n");
my $params = eval($param_string); return unless($params); if(defined($params->{'synteny_region_id'})) { $self->synteny_region_id($params->{'synteny_region_id'}); } if(defined($params->{'method_link_species_set_id'})) { $self->method_link_species_set_id($params->{'method_link_species_set_id'}); } if(defined($params->{'tree_file'})) { $self->{_tree_file} = $params->{'tree_file'}; } if(defined($params->{'tree_analysis_data_id'})) { $self->{_tree_analysis_data_id} = $params->{'tree_analysis_data_id'}; } if(defined($params->{'max_block_size'})) { $self->{_max_block_size} = $params->{'max_block_size'}; } return 1;
}
get_species_treedescriptionprevnextTop
sub get_species_tree {
  my $self = shift;

  my $newick_species_tree;
  if (defined($self->{_species_tree})) {
    return $self->{_species_tree};
  } elsif ($self->{_tree_analysis_data_id}) {
    my $analysis_data_adaptor = $self->{hiveDBA}->get_AnalysisDataAdaptor();
    $newick_species_tree = $analysis_data_adaptor->fetch_by_dbID($self->{_tree_analysis_data_id});
  } elsif ($self->{_tree_file}) {
    open(TREE_FILE, $self->{_tree_file}) or throw("Cannot open file ".$self->{_tree_file});
    $newick_species_tree = join("", <TREE_FILE>);
    close(TREE_FILE);
  }

  if (!defined($newick_species_tree)) {
    throw("Cannot get the species tree");
  }

  $newick_species_tree =~ s/^\s*//;
  $newick_species_tree =~ s/\s*$//;
  $newick_species_tree =~ s/[\r\n]//g;

  $self->{'_species_tree'} =
      Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick_species_tree);

  return $self->{'_species_tree'};
}
max_block_sizedescriptionprevnextTop
sub max_block_size {
  my $self = shift;
  $self->{'_max_block_size'} = shift if(@_);
  return $self->{'_max_block_size'};
}


##########################################
#
# internal methods
#
##########################################
}
method_link_species_set_iddescriptionprevnextTop
sub method_link_species_set_id {
  my $self = shift;
  $self->{'_method_link_species_set_id'} = shift if(@_);
  return $self->{'_method_link_species_set_id'};
}
rundescriptionprevnextTop
sub run {
  my $self = shift;

  throw("Wrong tree: ".$self->tree_string) if ($self->tree_string !~ /^\(/);
  my $runnable = new Bio::EnsEMBL::Analysis::Runnable::Mlagan
    (-workdir => $self->worker_temp_directory,
     -fasta_files => $self->fasta_files,
     -tree_string => $self->tree_string,
     -analysis => $self->analysis);
  $self->{'_runnable'} = $runnable;
  $runnable->run_analysis;
}
synteny_region_iddescriptionprevnextTop
sub synteny_region_id {
  my $self = shift;
  $self->{'_synteny_region_id'} = shift if(@_);
  return $self->{'_synteny_region_id'};
}
tree_stringdescriptionprevnextTop
sub tree_string {
  my $self = shift;
  $self->{'_tree_string'} = shift if(@_);
  return $self->{'_tree_string'};
}
update_node_namesdescriptionprevnextTop
sub update_node_names {
  my $self = shift;
  my $tree = shift;
  my %gdb_id2dfr;
  foreach my $dfr (values %{$self->{'_dnafrag_regions'}}) {
    $gdb_id2dfr{$dfr->dnafrag->genome_db->dbID} = "DnaFrag".$dfr->dnafrag_id .".";
  }

  foreach my $leaf (@{$tree->get_all_leaves}) {
    if (defined $gdb_id2dfr{$leaf->name}) {
      $leaf->name($gdb_id2dfr{$leaf->name});
    } else {
      $leaf->disavow_parent;
      $tree = $tree->minimize_tree;
    }
  }
  if (@{$tree->get_all_leaves} != scalar(values %{$self->{'_dnafrag_regions'}})) {
    throw("Tree has a wrong number of leaves after updating the node names");
  }
  if ($tree->get_child_count == 1) {
    my $child = $tree->children->[0];
    $child->parent->merge_children($child);
    $child->disavow_parent;
  }
  return $tree;
}

1;
}
write_outputdescriptionprevnextTop
sub write_output {
  my ($self) = @_;

  my $mlssa = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor;
  my $mlss = $mlssa->fetch_by_dbID($self->method_link_species_set_id);
  my $gaba = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
  my $gaga = $self->{'comparaDBA'}->get_GenomicAlignGroupAdaptor;

  foreach my $gab (@{$self->{'_runnable'}->output}) {
    foreach my $ga (@{$gab->genomic_align_array}) {
      $ga->method_link_species_set($mlss);
      my $dfr = $self->{'_dnafrag_regions'}{$ga->dnafrag_id};
      $ga->dnafrag_id($dfr->dnafrag_id);
      $ga->dnafrag($dfr->dnafrag);
      $ga->dnafrag_start($dfr->dnafrag_start);
      $ga->dnafrag_end($dfr->dnafrag_end);
      $ga->dnafrag_strand($dfr->dnafrag_strand);
      $ga->level_id(1);
      $dfr->release;
      unless (defined $gab->length) {
        $gab->length(length($ga->aligned_sequence));
      }
    }
    $gab->method_link_species_set($mlss);
    
    my $group;
    # Split block if it is too long and store as groups
if ($self->max_block_size() and $gab->length > $self->max_block_size()) { my $gab_array = undef; my $find_next = 0; for (my $start = 1; $start <= $gab->length; $start += $self->max_block_size()) { my $split_gab = $gab->restrict_between_alignment_positions( $start, $start + $self->max_block_size() - 1, 1); #less than 2 genomic_aligns
if (@{$split_gab->get_all_GenomicAligns()} < 2) { #set find_next flag to remember to trim the block to the right if it has more than 2 genomic_aligns
$find_next = 1; #trim the previous block
my $prev_gab = pop @$gab_array; my $trim_gab = _trim_gab_right($prev_gab); #check it has at least 2 genomic_aligns, otherwise try again
while (@{$trim_gab->get_all_GenomicAligns()} < 2) { $prev_gab = pop @$gab_array; $trim_gab = _trim_gab_right($prev_gab); } #add trimmed block to array
if ($trim_gab) { push @$gab_array, $trim_gab; } } else { #more than 2 genomic_aligns
push @$gab_array, $split_gab; #but may be to the right of a gab with only 1 ga and
#therefore needs to be trimmed
if ($find_next) { my $next_gab = pop @$gab_array; my $trim_gab = _trim_gab_left($next_gab); if (@{$trim_gab->get_all_GenomicAligns()} >= 2) { push @$gab_array, $trim_gab; $find_next = 0; } } } } #store the first block to get the dbID which is used to create the
#group_id.
my $first_block = shift @$gab_array; $gaba->store($first_block); my $group_id = $first_block->dbID; $gaba->store_group_id($first_block, $group_id); $self->_write_gerp_dataflow($first_block, $mlss); #store the rest of the genomic_align_blocks
foreach my $this_gab (@$gab_array) { $this_gab->group_id($group_id); $gaba->store($this_gab); $self->_write_gerp_dataflow($this_gab, $mlss); } } else { $gaba->store($gab); $self->_write_gerp_dataflow($gab, $mlss); } } return 1; } #trim genomic align block from the left hand edge to first position having at
#least 2 genomic aligns which overlap
}
General documentation
CONTACTTop
Abel Ureta-Vidal <abel@ebi.ac.uk>
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _