Bio::EnsEMBL::Compara::DBSQL DnaAlignFeatureAdaptor
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Compara::DBSQL::DnaAlignFeatureAdaptor
Package variables
Privates (from "my" definitions)
$CACHE_SIZE = 4
Included modules
Bio::EnsEMBL::DBSQL::BaseAdaptor
Bio::EnsEMBL::DnaDnaAlignFeature
Bio::EnsEMBL::Utils::Cache
Bio::EnsEMBL::Utils::Exception
Inherit
Bio::EnsEMBL::DBSQL::BaseAdaptor
Synopsis
$dafa = $compara_dbadaptor->get_DnaAlignFeatureAdaptor; @align_features = @{$dafa->fetch_by_Slice_species($slice, $qy_species)};
Description
Retrieves alignments from a compara database in the form of DnaDnaAlignFeatures
Methods
_convert_GenomicAlignBlocks_into_DnaDnaAlignFeatures
No description
Code
deleteObjDescriptionCode
fetch_all_by_SliceDescriptionCode
fetch_all_by_species_regionDescriptionCode
interpolate_best_locationDescriptionCode
newDescriptionCode
Methods description
deleteObj(2)code    nextTop
  Arg [1]    : none
Example : none
Description: Called automatically by DBConnection during object destruction
phase. Clears the cache to avoid memory leaks.
Returntype : none
Exceptions : none
Caller : none
fetch_all_by_SlicecodeprevnextTop
 Arg [1]    : Bio::EnsEMBL::Slice
Arg [2] : string $qy_species
The query species to retrieve alignments against
Arg [3] : string $qy_assembly
Arg [4] : string $$alignment_type
The type of alignments to be retrieved
e.g. WGA or WGA_HCR
Example : $gaa->fetch_all_by_Slice($slice, "Mus musculus","WGA");
Description: find matches of query_species in the region of a slice of a
subject species
Returntype : an array reference of Bio::EnsEMBL::DnaDnaAlignFeature objects
Exceptions : none
Caller : general
fetch_all_by_species_regioncodeprevnextTop
 Arg [1]    : string $cs_species
e.g. "Homo sapiens"
Arg [2] : string $cs_assembly (can be undef)
e.g. "NCBI_31" if undef assembly_default will be taken
Arg [3] : string $qy_species
e.g. "Mus musculus"
Arg [4] : string $qy_assembly (can be undef)
e.g. "MGSC_3", if undef assembly_default will be taken
Arg [5] : string $chromosome_name
the name of the chromosome to retrieve alignments from (e.g. 'X')
Arg [6] : int start
Arg [7] : int end
Arg [8] : string $alignment_type
The type of alignments to be retrieved
e.g. WGA or WGA_HCR
Example : $gaa->fetch_all_by_species_region("Homo sapiens", "NCBI_31",
"Mus musculus", "MGSC_3",
"X", 250_000, 750_000,"WGA");
Description: Retrieves alignments between the consensus and query species
from a specified region of the consensus genome.
Returntype : an array reference of Bio::EnsEMBL::DnaDnaAlignFeature objects
Exceptions : returns a ref to an empty list if requested DnaFrag or MethodLinkSpeciesSet
are not in the compara DB.
Caller : general
interpolate_best_locationcodeprevnextTop
  Arg [1]    : Bio::EnsEMBL::Slice $slice
Arg [2] : string $species
e.g. "Mus musculus"
Arg [3] : string $alignment_type
e.g. "BLASTZ_NET"
Arg [4] : string $seq_region_name
e.g. "6-COX"
Example :
Description:
Returntype : array with 3 elements
Exceptions :
Caller :
newcodeprevnextTop
  Arg [1]    : list of args to super class constructor
Example : $dafa = new Bio::EnsEMBL::Compara::Genomi
Description: Creates a new DnaAlignFeatureAdaptor. The superclass
constructor is extended to initialise an internal cache. This
class should be instantiated through the get method on the
DBAdaptor rather than calling this method directory.
Returntype : none
Exceptions : none
Caller : Bio::EnsEMBL::DBSQL::DBConnection
Methods code
_convert_GenomicAlignBlocks_into_DnaDnaAlignFeaturesdescriptionprevnextTop
sub _convert_GenomicAlignBlocks_into_DnaDnaAlignFeatures {
  my ($genomic_align_blocks) = @_;
  my $dna_dna_align_features = [];

  my $query_slice_adaptor;
  foreach my $this_genomic_align_block (@$genomic_align_blocks) {

    ## KNOWN BUG: This will ignore third and following parts of a multiple alignment...
## This adaptor cannot deal with multiple alignments. Use the new
## Bio::EnsEMBL::Compara::DBSQL::GenomicAlignBlockAdaptor instead.
my $consensus_genomic_align = $this_genomic_align_block->reference_genomic_align; my $query_genomic_align = $this_genomic_align_block->get_all_non_reference_genomic_aligns->[0]; my $top_slice; if (!defined($query_slice_adaptor)) { $query_slice_adaptor = $query_genomic_align->get_Slice->adaptor; } if ($query_slice_adaptor) { $top_slice = $query_slice_adaptor->fetch_by_region( $query_genomic_align->dnafrag->coord_system_name, $query_genomic_align->dnafrag->name ); } else { $top_slice = undef; } ## The code for transforming GenomicAlignBlocks into DnaDnaAlignFeatures assumes that
## reference_genomic_align is on the forward strand!
if ($consensus_genomic_align->dnafrag_strand == -1) { $this_genomic_align_block->reverse_complement; } my $cstart = $consensus_genomic_align->dnafrag_start; my $cend = $consensus_genomic_align->dnafrag_end; #skip features which do not overlap the requested region
#next if ($cstart > $end || $cend < $start);
my $ga_cigar_line; do { my @consensus_cigar_pieces = split(/(\d*[DIMG])/, $consensus_genomic_align->cigar_line); my @query_cigar_pieces = split(/(\d*[DIMG])/, $query_genomic_align->cigar_line); my @consensus_gapped_pieces; foreach my $piece (@consensus_cigar_pieces) { next if ($piece !~ /^(\d*)([MDIG])$/); my $num = $1; my $type = $2; $num = 1 if ($num !~ /^\d+$/); if( $type eq "M" ) { for (my $i=0; $i<$num; $i++) {push(@consensus_gapped_pieces, "N")} } else { for (my $i=0; $i<$num; $i++) {push(@consensus_gapped_pieces, '-')} } } my @query_gapped_pieces; foreach my $piece (@query_cigar_pieces) { next if ($piece !~ /^(\d*)([MDIG])$/); my $num = $1; my $type = $2; $num = 1 if ($num !~ /^\d+$/); if( $type eq "M" ) { for (my $i=0; $i<$num; $i++) {push(@query_gapped_pieces, "N")} } else { for (my $i=0; $i<$num; $i++) {push(@query_gapped_pieces, '-')} } } throw if (scalar(@consensus_gapped_pieces) != scalar(@query_gapped_pieces)); my $type = ""; my $num = 0; for (my $i=0; $i<@consensus_gapped_pieces; $i++) { if ($consensus_gapped_pieces[$i] eq "N" and $query_gapped_pieces[$i] eq "N") { if ($type ne "M") { $ga_cigar_line .= (($num==1)?"":$num).$type if ($num); $num = 0; $type = "M"; } } elsif ($consensus_gapped_pieces[$i] eq "N" and $query_gapped_pieces[$i] eq "-") { if ($type ne "I") { $ga_cigar_line .= (($num==1)?"":$num).$type if ($num); $num = 0; $type = "I"; } } elsif ($consensus_gapped_pieces[$i] eq "-" and $query_gapped_pieces[$i] eq "N") { if ($type ne "D") { $ga_cigar_line .= (($num==1)?"":$num).$type if ($num); $num = 0; $type = "D"; } } else { throw "no double gaps can occur in a pairwise alignment!"; } $num++; } $ga_cigar_line .= (($num==1)?"":$num).$type; }; my $df_name = $consensus_genomic_align->dnafrag->name; my $score = $this_genomic_align_block->score; my $perc_id = $this_genomic_align_block->perc_id; my $qdf_start = 1; my $ga_query_start = $query_genomic_align->dnafrag_start; my $ga_query_end = $query_genomic_align->dnafrag_end; my $ga_query_strand = $query_genomic_align->dnafrag_strand; my $qdf_name = $query_genomic_align->dnafrag->name; my $ga_level_id = $consensus_genomic_align->level_id; my $ga_strands_reversed = 0; if ($consensus_genomic_align->dnafrag_strand == -1) { $ga_strands_reversed = 1; $ga_query_strand = -$ga_query_strand; } #my $ga_group_id = $consensus_genomic_align->genomic_align_group_id_by_type("default");
my $ga_group_id = ($this_genomic_align_block->group_id() || $consensus_genomic_align->genomic_align_group_id_by_type("default")); my ($slice, $seq_name, $start, $end, $strand); if ($this_genomic_align_block->reference_slice) { $slice = $this_genomic_align_block->reference_slice; $seq_name = $this_genomic_align_block->reference_slice->name; $start = $this_genomic_align_block->reference_slice_start; $end = $this_genomic_align_block->reference_slice_end; $strand = $this_genomic_align_block->reference_slice_strand; } else { $slice = $consensus_genomic_align->get_Slice(); $seq_name = $consensus_genomic_align->dnafrag->name; $start = $consensus_genomic_align->dnafrag_start; $end = $consensus_genomic_align->dnafrag_end; $strand = $consensus_genomic_align->dnafrag_strand; } my $f = Bio::EnsEMBL::DnaDnaAlignFeature->new_fast ({'cigar_string' => $ga_cigar_line, 'slice' => $slice, 'seqname' => $seq_name, 'start' => $start, 'end' => $end, 'strand' => $strand, 'species' => $consensus_genomic_align->genome_db->name, 'score' => $score, 'percent_id' => $perc_id, 'hstart' => $qdf_start + $ga_query_start - 1, 'hend' => $qdf_start + $ga_query_end -1, 'hstrand' => $ga_query_strand, 'hseqname' => $qdf_name, 'hspecies' => $query_genomic_align->genome_db->name, 'hslice' => $top_slice, 'alignment_type' => $this_genomic_align_block->method_link_species_set->method_link_type, 'group_id' => $ga_group_id, 'level_id' => $ga_level_id, 'strands_reversed' => $ga_strands_reversed}); push @$dna_dna_align_features, $f; } return $dna_dna_align_features; } 1;
}
deleteObjdescriptionprevnextTop
sub deleteObj {
  my $self = shift;

  $self->SUPER::deleteObj;

  #clear the cache, removing references
%{$self->{'_cache'}} = ();
}
fetch_all_by_SlicedescriptionprevnextTop
sub fetch_all_by_Slice {
  my ($self, $orig_slice, $qy_species, $qy_assembly, $alignment_type, 
      $limit) = @_;

  unless($orig_slice && ref $orig_slice && 
         $orig_slice->isa('Bio::EnsEMBL::Slice')) {
    throw("Invalid slice argument [$orig_slice]\n");
  }

  unless($qy_species) {
    throw("Query species argument is required");
  }

  $limit = 0 unless (defined $limit);

  my $genome_db_adaptor = $self->db->get_GenomeDBAdaptor();
  my $cs_genome_db = $genome_db_adaptor->fetch_by_Slice($orig_slice);
  my $qy_genome_db = $genome_db_adaptor->fetch_by_name_assembly(
      $qy_species, $qy_assembly);
  return [] if (!$cs_genome_db or !$qy_genome_db);

  my $method_link_species_set_adaptor = $self->db->get_MethodLinkSpeciesSetAdaptor();
  my $method_link_species_set;
  if ($cs_genome_db->dbID == $qy_genome_db->dbID) {
    $method_link_species_set = $method_link_species_set_adaptor->
        fetch_by_method_link_type_GenomeDBs($alignment_type, [$cs_genome_db]);
  } else {
    $method_link_species_set = $method_link_species_set_adaptor->
        fetch_by_method_link_type_GenomeDBs($alignment_type,
            [$cs_genome_db, $qy_genome_db]);
  }

  my $key = uc(join(':', $orig_slice->name,
                    $cs_genome_db->name, $qy_genome_db->name, $qy_genome_db->assembly, $alignment_type));

  if(exists $self->{'_cache'}->{$key}) {
    return $self->{'_cache'}->{$key};
  }

  my $genomic_align_block_adaptor = $self->db->get_GenomicAlignBlockAdaptor();
  my $genomic_align_blocks = $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice(
      $method_link_species_set, $orig_slice, $limit);

  my $dafs = _convert_GenomicAlignBlocks_into_DnaDnaAlignFeatures($genomic_align_blocks);

  #update the cache
$self->{'_cache'}->{$key} = $dafs; return $dafs;
}
fetch_all_by_species_regiondescriptionprevnextTop
sub fetch_all_by_species_region {
  my ($self, $consensus_species, $consensus_assembly,
      $query_species, $query_assembly,
      $chromosome_name, $start, $end, $alignment_type, $limit,$dnafrag_type) = @_;

  $limit = 0 unless (defined $limit);

  #get the genome database for each species
my $genome_db_adaptor = $self->db->get_GenomeDBAdaptor; my $consensus_genome_db = $genome_db_adaptor->fetch_by_name_assembly($consensus_species, $consensus_assembly); my $query_genome_db = $genome_db_adaptor->fetch_by_name_assembly($query_species, $query_assembly); #retrieve dna fragments from the subjects species region of interest
my $dna_frag_adaptor = $self->db->get_DnaFragAdaptor; my $this_dnafrag = $dna_frag_adaptor->fetch_by_GenomeDB_and_name( $consensus_genome_db, $chromosome_name, ); return [] if (!$this_dnafrag); # Get the Bio::EnsEMBL::Compara::MethodLinkSpeciesSet object corresponding to the alignment_type and
# the couple of genomes
my $method_link_species_set_adaptor = $self->db->get_MethodLinkSpeciesSetAdaptor; my $method_link_species_set; if ($consensus_genome_db->dbID == $query_genome_db->dbID) { # Allow to fetch the right method_link_species_set for self-comparisons!
$method_link_species_set = $method_link_species_set_adaptor->fetch_by_method_link_type_GenomeDBs( $alignment_type, [$consensus_genome_db]); } else { # Normal, pairwise comparisons...
$method_link_species_set = $method_link_species_set_adaptor->fetch_by_method_link_type_GenomeDBs( $alignment_type, [$consensus_genome_db, $query_genome_db] ); } return [] if (!$method_link_species_set); # my $gaa = $self->db->get_GenomicAlignAdaptor;
my $genomic_align_block_adaptor = $self->db->get_GenomicAlignBlockAdaptor; my @out = (); my $consensus_slice_adaptor = $consensus_genome_db->db_adaptor->get_SliceAdaptor; my $query_slice_adaptor; eval { $query_slice_adaptor = $query_genome_db->db_adaptor->get_SliceAdaptor; }; #caclulate coords relative to start of dnafrag
## Bio::EnsEMBL::Compara::Dnafrag::start is always 1
# my $this_dnafrag_start = $start - $this_dnafrag->start + 1;
# my $this_dnafrag_end = $end - $this_dnafrag->start + 1;
my $this_dnafrag_start = $start; my $this_dnafrag_end = $end; #constrain coordinates so they are completely within the dna frag
my $this_dnafrag_length = $this_dnafrag->length; $this_dnafrag_start = 1 unless (defined $this_dnafrag_start); $this_dnafrag_start = ($this_dnafrag_start < 1) ? 1 : $this_dnafrag_start; $this_dnafrag_end = $this_dnafrag_length unless (defined $this_dnafrag_end); $this_dnafrag_end = ($this_dnafrag_end > $this_dnafrag_length) ? $this_dnafrag_length : $this_dnafrag_end; #fetch all alignments in the region we are interested in
my $genomic_align_blocks = $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_DnaFrag( $method_link_species_set, $this_dnafrag, $this_dnafrag_start, $this_dnafrag_end, $limit ); #convert genomic align blocks to dna align features
my $dafs = _convert_GenomicAlignBlocks_into_DnaDnaAlignFeatures($genomic_align_blocks); # We need to attach slices of the entire seq region to the features.
# The features come without any slices at all, but their coords are
# relative to the beginning of the seq region.
my $top_slice = $consensus_slice_adaptor->fetch_by_region($dnafrag_type, $chromosome_name); map {$_->slice($top_slice)} @$dafs; return $dafs;
}
interpolate_best_locationdescriptionprevnextTop
sub interpolate_best_location {
  my ($self,$slice,$species,$alignment_type,$seq_region_name) = @_;

#warn $slice->name,"\t$species\t$alignment_type\t$seq_region_name";
$| =1 ; my $max_distance_for_clustering = 10000; my $dafs = $self->fetch_all_by_Slice($slice, $species, undef, $alignment_type); my %name_strand_clusters; my $based_on_group_id = 1; foreach my $daf (@{$dafs}) { next if ($seq_region_name && $daf->hseqname ne $seq_region_name); if (defined $daf->group_id && $daf->group_id > 0 && $alignment_type ne "TRANSLATED_BLAT") { push @{$name_strand_clusters{$daf->group_id}}, $daf; } else { $based_on_group_id = 0 if ($based_on_group_id); push @{$name_strand_clusters{$daf->hseqname. "_" .$daf->hstrand}}, $daf; } } if ($based_on_group_id) { my @ordered_name_strands = sort {scalar @{$name_strand_clusters{$b}} <=> scalar @{$name_strand_clusters{$a}}} keys %name_strand_clusters; my @best_blocks = sort {$a->hstart <=> $b->hend} @{$name_strand_clusters{$ordered_name_strands[0]}||[]}; return undef if( !@best_blocks ); return ($best_blocks[0]->hseqname, $best_blocks[0]->hstart + int(($best_blocks[-1]->hend - $best_blocks[0]->hstart)/2),
$best_blocks[0]->hstrand * $slice->strand,
$best_blocks[0]->hstart,
$best_blocks[-1]->hend);
} else { my @refined_clusters; foreach my $name_strand (keys %name_strand_clusters) { # an array of arrayrefs
# name, strand, start, end, nb of blocks
my @sub_clusters; foreach my $block (sort {$a->hstart <=> $b->hstart} @{$name_strand_clusters{$name_strand}||[]}) { unless (scalar @sub_clusters) { push @sub_clusters, [$block->hseqname,$block->hstrand, $block->hstart, $block->hend, 1]; next; } my $block_clustered = 0; foreach my $arrayref (@sub_clusters) { my ($n,$st,$s,$e,$c) = @{$arrayref}; if ($block->hstart<=$e && $block->hend>=$s) { # then overlaps.
$arrayref->[2] = $block->hstart if ($block->hstart < $s); $arrayref->[3] = $block->hend if ($block->hend > $e); $arrayref->[4]++; $block_clustered = 1; } elsif ($block->hstart <= $e + $max_distance_for_clustering && $block->hstart > $e) { # then is downstream
$arrayref->[3] = $block->hend; $arrayref->[4]++; $block_clustered = 1; } elsif ($block->hend >= $s - $max_distance_for_clustering && $block->hend < $s) { # then is upstream
$arrayref->[2] = $block->hstart; $arrayref->[4]++; $block_clustered = 1; } } unless ($block_clustered) { # do not overlap anything already seen, so adding as new seeding cluster
push @sub_clusters, [$block->hseqname,$block->hstrand, $block->hstart, $block->hend, 1]; } } push @refined_clusters, @sub_clusters; } # sort by the max number of blocks desc
@refined_clusters = sort {$b->[-1] <=> $a->[-1]} @refined_clusters; return undef if(!@refined_clusters); return ($refined_clusters[0]->[0], #hseqname,
$refined_clusters[0]->[2] + int(($refined_clusters[0]->[3] - $refined_clusters[0]->[2])/2),
$refined_clusters[0]->[1] * $slice->strand,
$refined_clusters[0]->[2],
$refined_clusters[0]->[3]);
}
}
newdescriptionprevnextTop
sub new {
  my ($class, @args) = @_;

  my $self = $class->SUPER::new(@args);

  #initialize internal LRU cache
tie(%{$self->{'_cache'}}, 'Bio::EnsEMBL::Utils::Cache', $CACHE_SIZE); return $self;
}
General documentation
CONTACTTop
Post questions to the EnsEMBL developer list: <ensembl-dev@ebi.ac.uk>