Raw content of Bio::EnsEMBL::Compara::DBSQL::DnaAlignFeatureAdaptor.

Raw content of Bio::EnsEMBL::Compara::DBSQL::DnaAlignFeatureAdaptor # Copyright EnsEMBL 1999-2004 # # Ensembl module for Bio::EnsEMBL::Compara::DBSQL::DnaAlignFeatureAdaptor # # You may distribute this module under the same terms as perl itself =head1 NAME Bio::EnsEMBL::Compara::DBSQL::DnaAlignFeatureAdaptor =head1 SYNOPSIS $dafa = $compara_dbadaptor->get_DnaAlignFeatureAdaptor; @align_features = @{$dafa->fetch_by_Slice_species($slice, $qy_species)}; =head1 DESCRIPTION Retrieves alignments from a compara database in the form of DnaDnaAlignFeatures =head1 CONTACT Post questions to the EnsEMBL developer list: <ensembl-dev@ebi.ac.uk> =cut package Bio::EnsEMBL::Compara::DBSQL::DnaAlignFeatureAdaptor; use strict; use vars qw(@ISA); use Bio::EnsEMBL::DBSQL::BaseAdaptor; use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache use Bio::EnsEMBL::DnaDnaAlignFeature; use Bio::EnsEMBL::Utils::Exception; @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); my $CACHE_SIZE = 4; =head2 new Arg [1] : list of args to super class constructor Example : $dafa = new Bio::EnsEMBL::Compara::Genomi Description: Creates a new DnaAlignFeatureAdaptor. The superclass constructor is extended to initialise an internal cache. This class should be instantiated through the get method on the DBAdaptor rather than calling this method directory. Returntype : none Exceptions : none Caller : Bio::EnsEMBL::DBSQL::DBConnection =cut sub new { my ($class, @args) = @_; my $self = $class->SUPER::new(@args); #initialize internal LRU cache tie(%{$self->{'_cache'}}, 'Bio::EnsEMBL::Utils::Cache', $CACHE_SIZE); return $self; } =head2 fetch_all_by_species_region Arg [1] : string $cs_species e.g. "Homo sapiens" Arg [2] : string $cs_assembly (can be undef) e.g. "NCBI_31" if undef assembly_default will be taken Arg [3] : string $qy_species e.g. "Mus musculus" Arg [4] : string $qy_assembly (can be undef) e.g. "MGSC_3", if undef assembly_default will be taken Arg [5] : string $chromosome_name the name of the chromosome to retrieve alignments from (e.g. 'X') Arg [6] : int start Arg [7] : int end Arg [8] : string $alignment_type The type of alignments to be retrieved e.g. WGA or WGA_HCR Example : $gaa->fetch_all_by_species_region("Homo sapiens", "NCBI_31", "Mus musculus", "MGSC_3", "X", 250_000, 750_000,"WGA"); Description: Retrieves alignments between the consensus and query species from a specified region of the consensus genome. Returntype : an array reference of Bio::EnsEMBL::DnaDnaAlignFeature objects Exceptions : returns a ref to an empty list if requested DnaFrag or MethodLinkSpeciesSet are not in the compara DB. Caller : general =cut sub fetch_all_by_species_region { my ($self, $consensus_species, $consensus_assembly, $query_species, $query_assembly, $chromosome_name, $start, $end, $alignment_type, $limit,$dnafrag_type) = @_; $limit = 0 unless (defined $limit); #get the genome database for each species my $genome_db_adaptor = $self->db->get_GenomeDBAdaptor; my $consensus_genome_db = $genome_db_adaptor->fetch_by_name_assembly($consensus_species, $consensus_assembly); my $query_genome_db = $genome_db_adaptor->fetch_by_name_assembly($query_species, $query_assembly); #retrieve dna fragments from the subjects species region of interest my $dna_frag_adaptor = $self->db->get_DnaFragAdaptor; my $this_dnafrag = $dna_frag_adaptor->fetch_by_GenomeDB_and_name( $consensus_genome_db, $chromosome_name, ); return [] if (!$this_dnafrag); # Get the Bio::EnsEMBL::Compara::MethodLinkSpeciesSet object corresponding to the alignment_type and # the couple of genomes my $method_link_species_set_adaptor = $self->db->get_MethodLinkSpeciesSetAdaptor; my $method_link_species_set; if ($consensus_genome_db->dbID == $query_genome_db->dbID) { # Allow to fetch the right method_link_species_set for self-comparisons! $method_link_species_set = $method_link_species_set_adaptor->fetch_by_method_link_type_GenomeDBs( $alignment_type, [$consensus_genome_db]); } else { # Normal, pairwise comparisons... $method_link_species_set = $method_link_species_set_adaptor->fetch_by_method_link_type_GenomeDBs( $alignment_type, [$consensus_genome_db, $query_genome_db] ); } return [] if (!$method_link_species_set); # my $gaa = $self->db->get_GenomicAlignAdaptor; my $genomic_align_block_adaptor = $self->db->get_GenomicAlignBlockAdaptor; my @out = (); my $consensus_slice_adaptor = $consensus_genome_db->db_adaptor->get_SliceAdaptor; my $query_slice_adaptor; eval { $query_slice_adaptor = $query_genome_db->db_adaptor->get_SliceAdaptor; }; #caclulate coords relative to start of dnafrag ## Bio::EnsEMBL::Compara::Dnafrag::start is always 1 # my $this_dnafrag_start = $start - $this_dnafrag->start + 1; # my $this_dnafrag_end = $end - $this_dnafrag->start + 1; my $this_dnafrag_start = $start; my $this_dnafrag_end = $end; #constrain coordinates so they are completely within the dna frag my $this_dnafrag_length = $this_dnafrag->length; $this_dnafrag_start = 1 unless (defined $this_dnafrag_start); $this_dnafrag_start = ($this_dnafrag_start < 1) ? 1 : $this_dnafrag_start; $this_dnafrag_end = $this_dnafrag_length unless (defined $this_dnafrag_end); $this_dnafrag_end = ($this_dnafrag_end > $this_dnafrag_length) ? $this_dnafrag_length : $this_dnafrag_end; #fetch all alignments in the region we are interested in my $genomic_align_blocks = $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_DnaFrag( $method_link_species_set, $this_dnafrag, $this_dnafrag_start, $this_dnafrag_end, $limit ); #convert genomic align blocks to dna align features my $dafs = _convert_GenomicAlignBlocks_into_DnaDnaAlignFeatures($genomic_align_blocks); # We need to attach slices of the entire seq region to the features. # The features come without any slices at all, but their coords are # relative to the beginning of the seq region. my $top_slice = $consensus_slice_adaptor->fetch_by_region($dnafrag_type, $chromosome_name); map {$_->slice($top_slice)} @$dafs; return $dafs; } =head2 fetch_all_by_Slice Arg [1] : Bio::EnsEMBL::Slice Arg [2] : string $qy_species The query species to retrieve alignments against Arg [3] : string $qy_assembly Arg [4] : string $$alignment_type The type of alignments to be retrieved e.g. WGA or WGA_HCR Example : $gaa->fetch_all_by_Slice($slice, "Mus musculus","WGA"); Description: find matches of query_species in the region of a slice of a subject species Returntype : an array reference of Bio::EnsEMBL::DnaDnaAlignFeature objects Exceptions : none Caller : general =cut sub fetch_all_by_Slice { my ($self, $orig_slice, $qy_species, $qy_assembly, $alignment_type, $limit) = @_; unless($orig_slice && ref $orig_slice && $orig_slice->isa('Bio::EnsEMBL::Slice')) { throw("Invalid slice argument [$orig_slice]\n"); } unless($qy_species) { throw("Query species argument is required"); } $limit = 0 unless (defined $limit); my $genome_db_adaptor = $self->db->get_GenomeDBAdaptor(); my $cs_genome_db = $genome_db_adaptor->fetch_by_Slice($orig_slice); my $qy_genome_db = $genome_db_adaptor->fetch_by_name_assembly( $qy_species, $qy_assembly); return [] if (!$cs_genome_db or !$qy_genome_db); my $method_link_species_set_adaptor = $self->db->get_MethodLinkSpeciesSetAdaptor(); my $method_link_species_set; if ($cs_genome_db->dbID == $qy_genome_db->dbID) { $method_link_species_set = $method_link_species_set_adaptor-> fetch_by_method_link_type_GenomeDBs($alignment_type, [$cs_genome_db]); } else { $method_link_species_set = $method_link_species_set_adaptor-> fetch_by_method_link_type_GenomeDBs($alignment_type, [$cs_genome_db, $qy_genome_db]); } my $key = uc(join(':', $orig_slice->name, $cs_genome_db->name, $qy_genome_db->name, $qy_genome_db->assembly, $alignment_type)); if(exists $self->{'_cache'}->{$key}) { return $self->{'_cache'}->{$key}; } my $genomic_align_block_adaptor = $self->db->get_GenomicAlignBlockAdaptor(); my $genomic_align_blocks = $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice( $method_link_species_set, $orig_slice, $limit); my $dafs = _convert_GenomicAlignBlocks_into_DnaDnaAlignFeatures($genomic_align_blocks); #update the cache $self->{'_cache'}->{$key} = $dafs; return $dafs; } =head2 interpolate_best_location Arg [1] : Bio::EnsEMBL::Slice $slice Arg [2] : string $species e.g. "Mus musculus" Arg [3] : string $alignment_type e.g. "BLASTZ_NET" Arg [4] : string $seq_region_name e.g. "6-COX" Example : Description: Returntype : array with 3 elements Exceptions : Caller : =cut sub interpolate_best_location { my ($self,$slice,$species,$alignment_type,$seq_region_name) = @_; #warn $slice->name,"\t$species\t$alignment_type\t$seq_region_name"; $| =1 ; my $max_distance_for_clustering = 10000; my $dafs = $self->fetch_all_by_Slice($slice, $species, undef, $alignment_type); my %name_strand_clusters; my $based_on_group_id = 1; foreach my $daf (@{$dafs}) { next if ($seq_region_name && $daf->hseqname ne $seq_region_name); if (defined $daf->group_id && $daf->group_id > 0 && $alignment_type ne "TRANSLATED_BLAT") { push @{$name_strand_clusters{$daf->group_id}}, $daf; } else { $based_on_group_id = 0 if ($based_on_group_id); push @{$name_strand_clusters{$daf->hseqname. "_" .$daf->hstrand}}, $daf; } } if ($based_on_group_id) { my @ordered_name_strands = sort {scalar @{$name_strand_clusters{$b}} <=> scalar @{$name_strand_clusters{$a}}} keys %name_strand_clusters; my @best_blocks = sort {$a->hstart <=> $b->hend} @{$name_strand_clusters{$ordered_name_strands[0]}||[]}; return undef if( !@best_blocks ); return ($best_blocks[0]->hseqname, $best_blocks[0]->hstart + int(($best_blocks[-1]->hend - $best_blocks[0]->hstart)/2), $best_blocks[0]->hstrand * $slice->strand, $best_blocks[0]->hstart, $best_blocks[-1]->hend); } else { my @refined_clusters; foreach my $name_strand (keys %name_strand_clusters) { # an array of arrayrefs # name, strand, start, end, nb of blocks my @sub_clusters; foreach my $block (sort {$a->hstart <=> $b->hstart} @{$name_strand_clusters{$name_strand}||[]}) { unless (scalar @sub_clusters) { push @sub_clusters, [$block->hseqname,$block->hstrand, $block->hstart, $block->hend, 1]; next; } my $block_clustered = 0; foreach my $arrayref (@sub_clusters) { my ($n,$st,$s,$e,$c) = @{$arrayref}; if ($block->hstart<=$e && $block->hend>=$s) { # then overlaps. $arrayref->[2] = $block->hstart if ($block->hstart < $s); $arrayref->[3] = $block->hend if ($block->hend > $e); $arrayref->[4]++; $block_clustered = 1; } elsif ($block->hstart <= $e + $max_distance_for_clustering && $block->hstart > $e) { # then is downstream $arrayref->[3] = $block->hend; $arrayref->[4]++; $block_clustered = 1; } elsif ($block->hend >= $s - $max_distance_for_clustering && $block->hend < $s) { # then is upstream $arrayref->[2] = $block->hstart; $arrayref->[4]++; $block_clustered = 1; } } unless ($block_clustered) { # do not overlap anything already seen, so adding as new seeding cluster push @sub_clusters, [$block->hseqname,$block->hstrand, $block->hstart, $block->hend, 1]; } } push @refined_clusters, @sub_clusters; } # sort by the max number of blocks desc @refined_clusters = sort {$b->[-1] <=> $a->[-1]} @refined_clusters; return undef if(!@refined_clusters); return ($refined_clusters[0]->[0], #hseqname, $refined_clusters[0]->[2] + int(($refined_clusters[0]->[3] - $refined_clusters[0]->[2])/2), $refined_clusters[0]->[1] * $slice->strand, $refined_clusters[0]->[2], $refined_clusters[0]->[3]); } } =head2 deleteObj Arg [1] : none Example : none Description: Called automatically by DBConnection during object destruction phase. Clears the cache to avoid memory leaks. Returntype : none Exceptions : none Caller : none =cut sub deleteObj { my $self = shift; $self->SUPER::deleteObj; #clear the cache, removing references %{$self->{'_cache'}} = (); } =head2 deleteObj Arg [1] : none Example : none Description: Called automatically by DBConnection during object destruction phase. Clears the cache to avoid memory leaks. Returntype : none Exceptions : none Caller : none =cut sub _convert_GenomicAlignBlocks_into_DnaDnaAlignFeatures { my ($genomic_align_blocks) = @_; my $dna_dna_align_features = []; my $query_slice_adaptor; foreach my $this_genomic_align_block (@$genomic_align_blocks) { ## KNOWN BUG: This will ignore third and following parts of a multiple alignment... ## This adaptor cannot deal with multiple alignments. Use the new ## Bio::EnsEMBL::Compara::DBSQL::GenomicAlignBlockAdaptor instead. my $consensus_genomic_align = $this_genomic_align_block->reference_genomic_align; my $query_genomic_align = $this_genomic_align_block->get_all_non_reference_genomic_aligns->[0]; my $top_slice; if (!defined($query_slice_adaptor)) { $query_slice_adaptor = $query_genomic_align->get_Slice->adaptor; } if ($query_slice_adaptor) { $top_slice = $query_slice_adaptor->fetch_by_region( $query_genomic_align->dnafrag->coord_system_name, $query_genomic_align->dnafrag->name ); } else { $top_slice = undef; } ## The code for transforming GenomicAlignBlocks into DnaDnaAlignFeatures assumes that ## reference_genomic_align is on the forward strand! if ($consensus_genomic_align->dnafrag_strand == -1) { $this_genomic_align_block->reverse_complement; } my $cstart = $consensus_genomic_align->dnafrag_start; my $cend = $consensus_genomic_align->dnafrag_end; #skip features which do not overlap the requested region #next if ($cstart > $end || $cend < $start); my $ga_cigar_line; do { my @consensus_cigar_pieces = split(/(\d*[DIMG])/, $consensus_genomic_align->cigar_line); my @query_cigar_pieces = split(/(\d*[DIMG])/, $query_genomic_align->cigar_line); my @consensus_gapped_pieces; foreach my $piece (@consensus_cigar_pieces) { next if ($piece !~ /^(\d*)([MDIG])$/); my $num = $1; my $type = $2; $num = 1 if ($num !~ /^\d+$/); if( $type eq "M" ) { for (my $i=0; $i<$num; $i++) {push(@consensus_gapped_pieces, "N")} } else { for (my $i=0; $i<$num; $i++) {push(@consensus_gapped_pieces, '-')} } } my @query_gapped_pieces; foreach my $piece (@query_cigar_pieces) { next if ($piece !~ /^(\d*)([MDIG])$/); my $num = $1; my $type = $2; $num = 1 if ($num !~ /^\d+$/); if( $type eq "M" ) { for (my $i=0; $i<$num; $i++) {push(@query_gapped_pieces, "N")} } else { for (my $i=0; $i<$num; $i++) {push(@query_gapped_pieces, '-')} } } throw if (scalar(@consensus_gapped_pieces) != scalar(@query_gapped_pieces)); my $type = ""; my $num = 0; for (my $i=0; $i<@consensus_gapped_pieces; $i++) { if ($consensus_gapped_pieces[$i] eq "N" and $query_gapped_pieces[$i] eq "N") { if ($type ne "M") { $ga_cigar_line .= (($num==1)?"":$num).$type if ($num); $num = 0; $type = "M"; } } elsif ($consensus_gapped_pieces[$i] eq "N" and $query_gapped_pieces[$i] eq "-") { if ($type ne "I") { $ga_cigar_line .= (($num==1)?"":$num).$type if ($num); $num = 0; $type = "I"; } } elsif ($consensus_gapped_pieces[$i] eq "-" and $query_gapped_pieces[$i] eq "N") { if ($type ne "D") { $ga_cigar_line .= (($num==1)?"":$num).$type if ($num); $num = 0; $type = "D"; } } else { throw "no double gaps can occur in a pairwise alignment!"; } $num++; } $ga_cigar_line .= (($num==1)?"":$num).$type; }; my $df_name = $consensus_genomic_align->dnafrag->name; my $score = $this_genomic_align_block->score; my $perc_id = $this_genomic_align_block->perc_id; my $qdf_start = 1; my $ga_query_start = $query_genomic_align->dnafrag_start; my $ga_query_end = $query_genomic_align->dnafrag_end; my $ga_query_strand = $query_genomic_align->dnafrag_strand; my $qdf_name = $query_genomic_align->dnafrag->name; my $ga_level_id = $consensus_genomic_align->level_id; my $ga_strands_reversed = 0; if ($consensus_genomic_align->dnafrag_strand == -1) { $ga_strands_reversed = 1; $ga_query_strand = -$ga_query_strand; } #my $ga_group_id = $consensus_genomic_align->genomic_align_group_id_by_type("default"); my $ga_group_id = ($this_genomic_align_block->group_id() || $consensus_genomic_align->genomic_align_group_id_by_type("default")); my ($slice, $seq_name, $start, $end, $strand); if ($this_genomic_align_block->reference_slice) { $slice = $this_genomic_align_block->reference_slice; $seq_name = $this_genomic_align_block->reference_slice->name; $start = $this_genomic_align_block->reference_slice_start; $end = $this_genomic_align_block->reference_slice_end; $strand = $this_genomic_align_block->reference_slice_strand; } else { $slice = $consensus_genomic_align->get_Slice(); $seq_name = $consensus_genomic_align->dnafrag->name; $start = $consensus_genomic_align->dnafrag_start; $end = $consensus_genomic_align->dnafrag_end; $strand = $consensus_genomic_align->dnafrag_strand; } my $f = Bio::EnsEMBL::DnaDnaAlignFeature->new_fast ({'cigar_string' => $ga_cigar_line, 'slice' => $slice, 'seqname' => $seq_name, 'start' => $start, 'end' => $end, 'strand' => $strand, 'species' => $consensus_genomic_align->genome_db->name, 'score' => $score, 'percent_id' => $perc_id, 'hstart' => $qdf_start + $ga_query_start - 1, 'hend' => $qdf_start + $ga_query_end -1, 'hstrand' => $ga_query_strand, 'hseqname' => $qdf_name, 'hspecies' => $query_genomic_align->genome_db->name, 'hslice' => $top_slice, 'alignment_type' => $this_genomic_align_block->method_link_species_set->method_link_type, 'group_id' => $ga_group_id, 'level_id' => $ga_level_id, 'strands_reversed' => $ga_strands_reversed}); push @$dna_dna_align_features, $f; } return $dna_dna_align_features; } 1;