Raw content of Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor # # Ensembl module for Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor # # Copyright (c) 2004 Ensembl # # You may distribute this module under the same terms as perl itself # # =head1 NAME Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor =head1 SYNOPSIS $vdb = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(...); $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); # tell the variation database where core database information can be # be found $vdb->dnadb($db); $afa = $vdb->get_AlleleFeatureAdaptor(); $sa = $db->get_SliceAdaptor(); # Get a VariationFeature by its internal identifier $af = $afa->fetch_by_dbID(145); # get all AlleleFeatures in a region $slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6); foreach $af (@{$afa->fetch_all_by_Slice($slice)}) { print $af->start(), '-', $af->end(), ' ', $af->allele(), "\n"; } =head1 DESCRIPTION This adaptor provides database connectivity for AlleleFeature objects. Genomic locations of alleles in samples can be obtained from the database using this adaptor. See the base class BaseFeatureAdaptor for more information. =head1 AUTHOR - Daniel Rios =head1 CONTACT Post questions to the Ensembl development list ensembl-dev@ebi.ac.uk =head1 METHODS =cut use strict; use warnings; package Bio::EnsEMBL::Variation::DBSQL::AlleleFeatureAdaptor; use Bio::EnsEMBL::Variation::AlleleFeature; use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; use Bio::EnsEMBL::Utils::Exception qw(throw); use Bio::EnsEMBL::Utils::Sequence qw(expand); use Bio::EnsEMBL::Variation::Utils::Sequence qw(ambiguity_code); our @ISA = ('Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor'); =head2 from_IndividualSlice Arg[0] : (optional) int $number Example : $afa->from_IndividualSlice(1); Description : Getter/Setter to know wether the Adaptor has been called from and IndividualSlice (and must create the objects from the genotype table) or from the StrainSlice (and must create the objects from the allele table) ReturnType : int Exceptions : none Caller : general Status : At Risk =cut sub from_IndividualSlice{ my $self = shift; return $self->{'from_IndividualSlice'} = shift if(@_); return $self->{'from_IndividualSlice'}; } =head2 fetch_all_by_Slice Arg[0] : Bio::EnsEMBL::Slice $slice Arg[1] : (optional) Bio::EnsEMBL::Variation::Individual $individual Example : my $vf = $vfa->fetch_all_by_Slice($slice,$individual); Description : Gets all the VariationFeatures in a certain Slice for a given Individual (if provided) ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature Exceptions : thrown on bad arguments Caller : general Status : At Risk =cut sub fetch_all_by_Slice{ my $self = shift; my $slice = shift; my $individual = shift; if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { throw('Bio::EnsEMBL::Slice arg expected'); } if (defined $individual){ if(!ref($individual) || !$individual->isa('Bio::EnsEMBL::Variation::Individual')) { throw('Bio::EnsEMBL::Variation::Individual arg expected'); } if(!defined($individual->dbID())) { throw("Individual arg must have defined dbID"); } } %{$self->{'_slice_feature_cache'}} = (); #clean the cache to avoid caching problems my $genotype_adaptor = $self->db->get_IndividualGenotypeAdaptor; #get genotype adaptor my $genotypes = $genotype_adaptor->fetch_all_by_Slice($slice,$individual); #and get all genotype data my $afs = $self->SUPER::fetch_all_by_Slice($slice); #get all AlleleFeatures within the Slice my $last_position = 0; my $new_afs = []; my $string1; my $string2; my $alleles = {}; #hash to alleles, so we can check the genotype is within the alleles #we need to merge genotype data with AlleleFeatures to assign alleles foreach my $af (@{$afs}){ $alleles = {}; #flush the hash map {$alleles->{$_}++} split /\//,$af->{'_vf_allele_string'}; #create a hash with the alleles #both, genotypes and af should be sorted for (my $i = $last_position;$i<@{$genotypes};$i++){ #need to reverse the alleles if they are in different strand, since the genotype #table is stored always in the positive strand $string1 = $genotypes->[$i]->allele1; $string2 = $genotypes->[$i]->allele2; if ($af->seq_region_strand == -1){ $string1 =~ tr/ACGTN-/TGCAN-/; $string2 =~ tr/ACGTN-/TGCAN-/; } #we need to check the genotypes are within the alleles of the variation if ($genotypes->[$i]->start == $af->seq_region_start && (exists $alleles->{$string1} || exists $alleles->{$string2})){ $alleles = {}; #flush the alleles hash delete $af->{'_vf_allele_string'}; #and remove the attribute if ($af->seq_region_strand == -1){ $genotypes->[$i]->allele1($string1); $genotypes->[$i]->allele2($string2); } if ($genotypes->[$i]->allele2 eq 'N'){ $af->{'_half_genotype'} = 1; $af->allele_string($genotypes->[$i]->allele1); #for half genotypes } else{ $af->{'_half_genotype'} = 0; #we need to put the allele and Individual in the AlleleFeature object if($genotypes->[$i]->allele1 eq '-' and $genotypes->[$i]->allele1 eq '-'){ 1; } $af->allele_string(ambiguity_code($genotypes->[$i]->allele1 . '/' . $genotypes->[$i]->allele2)); #for heterozigous alleles } # $af->allele_string($genotypes->[$i]->allele1); #if it is strain, both alleles should be the same, might be changed in the future $af->individual($genotypes->[$i]->individual); $last_position++; push @{$new_afs},$af; } elsif ($genotypes->[$i]->start < $af->seq_region_start){ $last_position++; #this should not happen, it means the genotype has no allele feature } else{ last; } } } return $new_afs; } =head2 fetch_all_IndividualSlice Arg[0] : Bio::EnsEMBL::Slice $slice Arg[1] : Bio::EnsEMBL::Variation::Population $population Example : my $vf = $vfa->fetch_all_by_Slice($slice,$population); Description : Gets all the VariationFeatures in a certain Slice for a given Population ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature Exceptions : thrown on bad arguments Caller : general Status : At Risk =cut sub fetch_all_IndividualSlice{ my $self = shift; my $slice = shift; if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { throw('Bio::EnsEMBL::Slice arg expected'); } #indicate to select the data from the single_bp table $self->_multiple_bp(0); #call the method fetch_all_by_Slice_constraint with the population constraint for the single_bp my $allele_features = $self->fetch_all_by_Slice($slice); #remove from the cache the result of the query (would not run the second one) # my $key = uc(join(':', $slice->name, $constraint)); # delete $self->{'_slice_feature_cache'}->{$key}; #indicate to select the data from the multiple_bp table $self->_multiple_bp(1); #and add the same information for the multiple_bp table # my $allele_features_multiple = $self->fetch_all_by_Slice_constraint($slice,$constraint); # push @{$allele_features},$allele_features_multiple if (@{$allele_features_multiple} > 0); #and include then the result of both queries # $self->{'_slice_feature_cache'}->{$key} = $allele_features; return $allele_features; } sub _tables{ my $self = shift; if ($self->from_IndividualSlice){ return (['variation_feature','vf'], ['individual_genotype_single_bp','ig']) if (!$self->_multiple_bp()); return (['variation_feature','vf'], ['individual_genotype_multiple_bp','ig']) if ($self->_multiple_bp()); } else{ return (['variation_feature','vf'], ['source','s FORCE INDEX(PRIMARY)']); } } sub _columns{ my $self = shift; return ('vf.variation_id' ,'ig.sample_id', 'CONCAT(ig.allele_1,"|",ig.allele_2) as alleles', 'vf.seq_region_id', 'vf.seq_region_start', 'vf.seq_region_end', 'vf.seq_region_strand', 'vf.variation_name') if ($self->from_IndividualSlice()); return qw(vf.variation_id vf.seq_region_id vf.seq_region_start vf.seq_region_end vf.seq_region_strand vf.variation_name s.name vf.variation_feature_id vf.allele_string); } sub _default_where_clause{ my $self = shift; return "ig.variation_id = vf.variation_id" if ($self->from_IndividualSlice()); return "vf.source_id = s.source_id"; } sub _objs_from_sth{ my ($self, $sth, $mapper, $dest_slice) = @_; # # This code is ugly because an attempt has been made to remove as many # function calls as possible for speed purposes. Thus many caches and # a fair bit of gymnastics is used. # my $sa = $self->db()->dnadb()->get_SliceAdaptor(); my @features; my %slice_hash; my %sr_name_hash; my %sr_cs_hash; my ($variation_id, $seq_region_id, $seq_region_start,$seq_region_end, $seq_region_strand, $variation_name, $source_name, $variation_feature_id, $allele_string ); $sth->bind_columns(\$variation_id, \$seq_region_id,\$seq_region_start,\$seq_region_end,\$seq_region_strand, \$variation_name, \$source_name, \$variation_feature_id, \$allele_string); my $asm_cs; my $cmp_cs; my $asm_cs_vers; my $asm_cs_name; my $cmp_cs_vers; my $cmp_cs_name; if($mapper) { $asm_cs = $mapper->assembled_CoordSystem(); $cmp_cs = $mapper->component_CoordSystem(); $asm_cs_name = $asm_cs->name(); $asm_cs_vers = $asm_cs->version(); $cmp_cs_name = $cmp_cs->name(); $cmp_cs_vers = $cmp_cs->version(); } my $dest_slice_start; my $dest_slice_end; my $dest_slice_strand; my $dest_slice_length; if($dest_slice) { $dest_slice_start = $dest_slice->start(); $dest_slice_end = $dest_slice->end(); $dest_slice_strand = $dest_slice->strand(); $dest_slice_length = $dest_slice->length(); } FEATURE: while($sth->fetch()) { #get the slice object my $slice = $slice_hash{"ID:".$seq_region_id}; if(!$slice) { $slice = $sa->fetch_by_seq_region_id($seq_region_id); $slice_hash{"ID:".$seq_region_id} = $slice; $sr_name_hash{$seq_region_id} = $slice->seq_region_name(); $sr_cs_hash{$seq_region_id} = $slice->coord_system(); } # # remap the feature coordinates to another coord system # if a mapper was provided # if($mapper) { my $sr_name = $sr_name_hash{$seq_region_id}; my $sr_cs = $sr_cs_hash{$seq_region_id}; ($sr_name,$seq_region_start,$seq_region_end,$seq_region_strand) = $mapper->fastmap($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand, $sr_cs); #skip features that map to gaps or coord system boundaries next FEATURE if(!defined($sr_name)); #get a slice in the coord system we just mapped to if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) { $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} ||= $sa->fetch_by_region($cmp_cs_name, $sr_name,undef, undef, undef, $cmp_cs_vers); } else { $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||= $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef, $asm_cs_vers); } } # # If a destination slice was provided convert the coords # If the dest_slice starts at 1 and is foward strand, nothing needs doing # if($dest_slice) { if($dest_slice_start != 1 || $dest_slice_strand != 1) { if($dest_slice_strand == 1) { $seq_region_start = $seq_region_start - $dest_slice_start + 1; $seq_region_end = $seq_region_end - $dest_slice_start + 1; } else { my $tmp_seq_region_start = $seq_region_start; $seq_region_start = $dest_slice_end - $seq_region_end + 1; $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1; $seq_region_strand *= -1; } #throw away features off the end of the requested slice if($seq_region_end < 1 || $seq_region_start > $dest_slice_length) { next FEATURE; } } $slice = $dest_slice; } #allele and sample table comes from the Compressed genotype adaptor # $allele = ambiguity_code($allele) if ($self->from_IndividualSlice); push @features, Bio::EnsEMBL::Variation::AlleleFeature->new_fast( {'start' => $seq_region_start, 'end' => $seq_region_end, 'strand' => $seq_region_strand, 'slice' => $slice, 'allele_string' => '', 'variation_name' => $variation_name, 'adaptor' => $self, 'source' => $source_name, '_variation_id' => $variation_id, '_variation_feature_id' => $variation_feature_id, '_vf_allele_string' => $allele_string, '_sample_id' => ''}); } return\@features; } #internal function used to determine wether selecting data from the individual_genotype_single_bp table or the multiple_bp sub _multiple_bp{ my $self = shift; $self->{'multiple_bp'} = shift if (@_); return $self->{'multiple_bp'}; } =head2 get_all_synonym_sources Args[1] : Bio::EnsEMBL::Variation::AlleleFeature vf Example : my @sources = @{$af_adaptor->get_all_synonym_sources($af)}; Description : returns a list of all the sources for synonyms of this AlleleFeature ReturnType : reference to list of strings Exceptions : none Caller : general Status : At Risk : Variation database is under development. =cut sub get_all_synonym_sources{ my $self = shift; my $af = shift; my %sources; my @sources; if(!ref($af) || !$af->isa('Bio::EnsEMBL::Variation::AlleleFeature')) { throw("Bio::EnsEMBL::Variation::AlleleFeature argument expected"); } if (!defined($af->{'_variation_id'}) && !defined($af->{'variation'})){ warning("Not possible to get synonym sources for the AlleleFeature: you need to attach a Variation first"); return \@sources; } #get the variation_id my $variation_id; if (defined ($af->{'_variation_id'})){ $variation_id = $af->{'_variation_id'}; } else{ $variation_id = $af->variation->dbID(); } #and go to the varyation_synonym table to get the extra sources my $source_name; my $sth = $self->prepare(qq{SELECT s.name FROM variation_synonym vs, source s WHERE s.source_id = vs.source_id AND vs.variation_id = ? }); $sth->bind_param(1,$variation_id,SQL_INTEGER); $sth->execute(); $sth->bind_columns(\$source_name); while ($sth->fetch){ $sources{$source_name}++; } @sources = keys(%sources); return \@sources; } 1;