Raw content of Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor # # Ensembl module for Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor # # Copyright (c) 2004 Ensembl # # You may distribute this module under the same terms as perl itself # # =head1 NAME Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor =head1 SYNOPSIS $db = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(...); $va = $db->get_VariationAdaptor(); $vga = $db->get_VariationGroupAdaptor(); $pa = $db->get_PopulationAdaptor(); # Get a Variation by its internal identifier $var = $va->fetch_by_dbID(145); # fetch a variation by its name $var = $va->fetch_by_name('rs100'); # fetch all variations from a population $pop = $pa->fetch_by_name('PACIFIC'); @vars = {$va->fetch_all_by_Population($pop)}; =head1 DESCRIPTION This adaptor provides database connectivity for Variation objects. Variations (SNPs, etc.) may be retrieved from the Ensembl variation database by several means using this module. =head1 AUTHOR - Graham McVicker =head1 CONTACT Post questions to the Ensembl development list ensembl-dev@ebi.ac.uk =head1 METHODS =cut use strict; use warnings; package Bio::EnsEMBL::Variation::DBSQL::VariationAdaptor; use Bio::EnsEMBL::DBSQL::BaseAdaptor; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Variation::Variation; use Bio::EnsEMBL::Variation::Allele; our @ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); =head2 fetch_by_dbID Arg [1] : int $dbID Example : $var = $var_adaptor->fetch_by_dbID(5526); Description: Retrieves a Variation object via its internal identifier. If no such variation exists undef is returned. Returntype : Bio::EnsEMBL::Variation::Variation Exceptions : throw if dbID arg is not defined Caller : general, IndividualAdaptor Status : Stable =cut sub fetch_by_dbID { my $self = shift; my $dbID = shift; throw('dbID argument expected') if(!defined($dbID)); my $sth = $self->prepare (q{SELECT v.variation_id, v.name, v.validation_status, s1.name, v.ancestral_allele, a.allele_id, a.allele, a.frequency, a.sample_id, vs.moltype, vs.name, s2.name, f.description FROM (variation v, source s1) LEFT JOIN allele a ON v.variation_id = a.variation_id LEFT JOIN variation_synonym vs on v.variation_id = vs.variation_id LEFT JOIN source s2 on vs.source_id = s2.source_id LEFT JOIN failed_variation fv on v.variation_id = fv.variation_id LEFT JOIN failed_description f on fv.failed_description_id = f.failed_description_id WHERE v.source_id = s1.source_id AND v.variation_id = ?}); $sth->bind_param(1,$dbID,SQL_INTEGER); $sth->execute(); my $result = $self->_objs_from_sth($sth); $sth->finish(); return undef if(!@$result); return $result->[0]; } =head2 fetch_by_name Arg [1] : string $name Example : $var = $var_adaptor->fetch_by_name('rs1453','dbSNP'); Description: Retrieves a population object via its name Returntype : Bio::EnsEMBL::Variation::Variation Exceptions : throw if name argument is not defined Caller : general Status : Stable =cut sub fetch_by_name { my $self = shift; my $name = shift; my $source = shift; throw('name argument expected') if(!defined($name)); my $extra_sql = ""; if ( defined $source ) { $extra_sql = qq(AND s1.name = ?); } my $sth = $self->prepare (qq{SELECT v.variation_id, v.name, v.validation_status, s1.name, v.ancestral_allele, a.allele_id, a.allele, a.frequency, a.sample_id, vs.moltype, vs.name, s2.name, f.description # FROM variation v, source s1, source s2, allele a, variation_synonym vs FROM (variation v, source s1) LEFT JOIN allele a on v.variation_id = a.variation_id LEFT JOIN variation_synonym vs on v.variation_id = vs.variation_id LEFT JOIN source s2 on vs.source_id = s2.source_id LEFT JOIN failed_variation fv on v.variation_id = fv.variation_id LEFT JOIN failed_description f on fv.failed_description_id = f.failed_description_id # WHERE v.variation_id = a.variation_id # AND v.variation_id = vs.variation_id WHERE v.source_id = s1.source_id # AND vs.source_id = s2.source_id AND v.name = ? $extra_sql ORDER BY a.allele_id}); $sth->bind_param(1,$name,SQL_VARCHAR); $sth->bind_param(2,$source,SQL_VARCHAR) if defined $source; $sth->execute(); my $result = $self->_objs_from_sth($sth); $sth->finish(); if(!@$result) { # try again if nothing found, but check synonym table instead $sth = $self->prepare (qq{SELECT v.variation_id, v.name, v.validation_status, s1.name, v.ancestral_allele, a.allele_id, a.allele, a.frequency, a.sample_id, vs1.moltype, vs2.name, s2.name, NULL FROM variation v, source s1, source s2, allele a, variation_synonym vs1, variation_synonym vs2 WHERE v.variation_id = a.variation_id AND v.variation_id = vs1.variation_id AND v.variation_id = vs2.variation_id AND v.source_id = s1.source_id AND vs2.source_id = s2.source_id AND vs1.name = ? $extra_sql ORDER BY a.allele_id}); $sth->bind_param(1,$name,SQL_VARCHAR); $sth->bind_param(2,$source,SQL_VARCHAR) if defined $source; $sth->execute(); $result = $self->_objs_from_sth($sth); return undef if(!@$result); $sth->finish(); } return $result->[0]; } =head2 fetch_all_by_source Arg [1] : string $source_name Arg [2] : int $primary Example : $var = $var_adaptor->fetch_all_by_source(); Description: Retrieves all Variation objects associated with a source. By default ($primary=0) returns variations that have the source or variation_synonym that have the source. If primary set to 1, it returns only variations where the primary name is associated with the source Returntype : listref of Bio::EnsEMBL::Variation::Variation Exceptions : thrown if source_name not provided Caller : general Status : Stable =cut sub fetch_all_by_source { my $self = shift; my $source_name = shift; my $primary = shift; my @out; $primary ||= 0; #by default, returns ALL variation and variation_synonyms where source = $name throw('name argument expected') if(!defined($source_name)); my $sth = $self->prepare (qq{SELECT v.variation_id, v.name, v.validation_status, s1.name, v.ancestral_allele, a.allele_id, a.allele, a.frequency, a.sample_id, vs.moltype, vs.name, s2.name, f.description FROM (variation v, source s1) LEFT JOIN allele a on v.variation_id = a.variation_id LEFT JOIN variation_synonym vs on v.variation_id = vs.variation_id LEFT JOIN source s2 on vs.source_id = s2.source_id LEFT JOIN failed_variation fv on v.variation_id = fv.variation_id LEFT JOIN failed_description f on fv.failed_description_id = f.failed_description_id WHERE v.source_id = s1.source_id AND s1.name = ? }); $sth->bind_param(1,$source_name,SQL_VARCHAR); $sth->execute(); my $result = $self->_objs_from_sth($sth); $sth->finish(); push @out, @{$result}; #we need to include variation_synonym as well, where the variation was merged with dbSNP if (!$primary){ $sth = $self->prepare (qq{SELECT v.variation_id, v.name, v.validation_status, s1.name, v.ancestral_allele, a.allele_id, a.allele, a.frequency, a.sample_id, vs1.moltype, vs1.name, s2.name, NULL FROM (variation v, source s1, source s2, variation_synonym vs1) LEFT JOIN allele a ON v.variation_id = a.variation_id WHERE v.variation_id = vs1.variation_id AND v.source_id = s1.source_id AND vs1.source_id = s2.source_id AND s2.name = ? ORDER BY v.variation_id }); $sth->bind_param(1,$source_name,SQL_VARCHAR); $sth->execute(); $result = $self->_objs_from_sth($sth); $sth->finish(); #need to merge both lists, trying to avoid duplicates push @out, @{$result}; } return \@out; } =head2 fetch_all_by_dbID_list Arg [1] : reference to list of ints $list Example : @vars = @{$va->fetch_all_by_dbID_list([124, 56, 90])}; Description: Retrieves a set of variations via their internal identifiers. This is faster than repeatedly calling fetch_by_dbID if there are a large number of variations to retrieve Returntype : reference to list of Bio::EnsEMBL::Variation::Variation objects Exceptions : throw on bad argument Caller : general, IndividualGenotypeAdaptor, PopulationGenotypeAdaptor Status : At Risk =cut sub fetch_all_by_dbID_list { my $self = shift; my $list = shift; if(!defined($list) || ref($list) ne 'ARRAY') { throw("list reference argument is required"); } return [] if(!@$list); my @out; # mysql is faster and we ensure that we do not exceed the max query size by # splitting large queries into smaller queries of 200 ids my $max = 200; while(@$list) { my @ids = (@$list > $max) ? splice(@$list, 0, $max) : splice(@$list, 0); my $id_str = (@ids > 1) ? " IN (".join(',',@ids).")" : ' = '.$ids[0]; my $sth = $self->prepare (qq{SELECT v.variation_id, v.name, v.validation_status, s1.name, v.ancestral_allele, a.allele_id, a.allele, a.frequency, a.sample_id, vs.moltype, vs.name, s2.name, f.description FROM (variation v, source s1) LEFT JOIN allele a on v.variation_id = a.variation_id LEFT JOIN variation_synonym vs on v.variation_id = vs.variation_id LEFT JOIN source s2 on vs.source_id = s2.source_id LEFT JOIN failed_variation fv on v.variation_id = fv.variation_id LEFT JOIN failed_description f on fv.failed_description_id = f.failed_description_id WHERE v.source_id = s1.source_id AND v.variation_id $id_str}); $sth->execute(); my $result = $self->_objs_from_sth($sth); $sth->finish(); push @out, @$result if(@$result); } return \@out; } =head2 get_all_sources Args : none Example : $sources = $va->get_all_sources(); Description : Retrieves from the database all sources in the Source table ReturnType : array ref of string Exceptions : none Caller : web Status : At Risk =cut sub get_all_sources{ my $self = shift; my @sources; my $source_name; my $sth = $self->prepare(qq{SELECT name from source }); $sth->execute(); $sth->bind_columns(\$source_name); while ($sth->fetch()){ push @sources, $source_name } $sth->finish(); return \@sources; } =head2 get_default_source Args : none Example : $default_source = $va->get_default_source(); Description : Retrieves from the database the default source used for display purposes ReturnType : string Exceptions : none Caller : web Status : At Risk =cut sub get_default_source{ my $self = shift; my $source_name; my $sth = $self->prepare(qq{SELECT meta_value from meta where meta_key = ? }); $sth->bind_param(1,'source.default_source',SQL_VARCHAR); $sth->execute(); $sth->bind_columns(\$source_name); $sth->fetch(); $sth->finish(); return $source_name; } =head2 get_source_version Arg[1] : string $name Example : $version = $va->get_source_version('dbSNP'); Description : Retrieves from the database the version for the source given as an argument ReturnType : int Exceptions : none Caller : general Status : At Risk =cut sub get_source_version{ my $self = shift; my $name = shift; my $version; my $sth = $self->prepare(qq{SELECT version from source where name = ? }); $sth->bind_param(1,$name,SQL_VARCHAR); $sth->execute(); $sth->bind_columns(\$version); $sth->fetch(); $sth->finish(); return $version; } =head2 get_flanking_sequence Arg[1] : int $variationID Example : $flankinq_sequence = $va->get_flanking_sequence('652'); Description : Retrieves from the database the appropriate flanking sequence (five,three) for the variation. If the flanking sequence is not in the Flankinq_sequence table, access the core database with the coordinates ReturnType : reference to a list containing (three_flank,five_flank) Exceptions : throw when not possible to obtain sequence Caller : general, Variation Status : Stable =cut sub get_flanking_sequence{ my $self = shift; my $variationID = shift; my $flanking_sequence; #reference to an array for the three_prime and five_prime seqs my ($seq_region_id, $seq_region_strand, $up_seq, $down_seq, $up_seq_region_start, $up_seq_region_end, $down_seq_region_start, $down_seq_region_end); my $sth = $self->prepare(qq{ SELECT seq_region_id, seq_region_strand, up_seq, down_seq, up_seq_region_start, up_seq_region_end, down_seq_region_start, down_seq_region_end FROM flanking_sequence WHERE variation_id = ? }); $sth->bind_param(1,$variationID,SQL_INTEGER); $sth->execute(); #retrieve the flank from the variation database $sth->bind_columns(\($seq_region_id, $seq_region_strand, $up_seq, $down_seq, $up_seq_region_start, $up_seq_region_end, $down_seq_region_start, $down_seq_region_end)); $sth->fetch(); $sth->finish(); if (!defined $down_seq){ if( $seq_region_id){ $down_seq = $self->_get_flank_from_core($seq_region_id, $down_seq_region_start, $down_seq_region_end, $seq_region_strand); } else { warn( "*****[ERROR]: No seq_region_id for SNP with dbID: $variationID. ". "Cannot retrieve flanking region******\n" ); } } if (!defined $up_seq){ if( $seq_region_id){ $up_seq = $self->_get_flank_from_core($seq_region_id, $up_seq_region_start, $up_seq_region_end, $seq_region_strand); } else { warn( "*****[ERROR]: No seq_region_id for SNP with dbID: $variationID. ". "Cannot retrieve flanking region******\n" ); } } push @{$flanking_sequence},$down_seq,$up_seq; #add to the array the 3 and 5 prime sequences return $flanking_sequence; } =head2 fetch_all_by_Population Arg [1] : Bio::EnsEMBL::Variation::Population Example : $pop = $pop_adaptor->fetch_by_dbID(1345); @vars = @{$va_adaptor->fetch_all_by_Population($pop)}; Description: Retrieves all variations which are stored for a specified population. Returntype : listref of Bio::EnsEMBL::Variation::Variation Exceptions : throw on incorrect argument Caller : general Status : At Risk =cut sub fetch_all_by_Population { my $self = shift; my $pop = shift; if(!ref($pop) || !$pop->isa('Bio::EnsEMBL::Variation::Population')) { throw('Bio::EnsEMBL::Variation::Population argument expected'); } if(!defined($pop->dbID())) { warning("Cannot retrieve genotypes for population without set dbID"); return []; } my $sth = $self->prepare (q{SELECT v.variation_id, v.name, v.validation_status, s1.name, v.ancestral_allele, a.allele_id, a.allele, a.frequency, a.sample_id, vs.moltype, vs.name, s2.name, f.failed_description_id FROM (variation v, source s1, allele a) LEFT JOIN variation_synonym vs on v.variation_id = vs.variation_id LEFT JOIN source s2 on vs.source_id = s2.source_id LEFT JOIN failed_variation fv on v.variation_id = fv.variation_id LEFT JOIN failed_description f on fv.failed_description_id = f.failed_description_id WHERE v.variation_id = a.variation_id AND v.source_id = s1.source_id AND a.sample_id = ?}); $sth->bind_param(1,$pop->dbID,SQL_INTEGER); $sth->execute(); my $results = $self->_objs_from_sth($sth); $sth->finish(); return $results; } sub _get_flank_from_core{ my $self = shift; my $seq_region_id = shift; my $seq_region_start = shift; my $seq_region_end = shift; my $seq_region_strand = shift; my $flanking_sequence; if (defined $self->db()->dnadb()){ my $slice_adaptor = $self->db()->dnadb()->get_SliceAdaptor(); my $slice = $slice_adaptor->fetch_by_seq_region_id($seq_region_id); if (!$slice){ throw("Not possible to obtain slice for seq_region_id \"$seq_region_id\"\n"); } my $flank = $slice->subseq($seq_region_start,$seq_region_end,$seq_region_strand); return $slice->subseq($seq_region_start,$seq_region_end,$seq_region_strand); } return ''; } sub _objs_from_sth { my $self = shift; my $sth = shift; my ($var_id, $name, $vstatus, $source, $ancestral_allele, $allele_id, $allele, $allele_freq, $allele_sample_id, $moltype, $syn_name, $syn_source, $cur_allele_id, $cur_var, $cur_var_id, $failed_description); $sth->bind_columns(\$var_id, \$name, \$vstatus, \$source, \$ancestral_allele, \$allele_id, \$allele, \$allele_freq, \$allele_sample_id, \$moltype, \$syn_name, \$syn_source, \$failed_description); my @vars; my %seen_syns; my %seen_pops; my $pa = $self->db()->get_PopulationAdaptor(); while($sth->fetch()) { if(!defined($cur_var) || $cur_var_id != $var_id) { $vstatus = 0 if (!defined $vstatus); my @states = split(',',$vstatus); $cur_var = Bio::EnsEMBL::Variation::Variation->new (-dbID => $var_id, -ADAPTOR => $self, -NAME => $name, -SOURCE => $source, -ANCESTRAL_ALLELE => $ancestral_allele, -MOLTYPE => $moltype, -VALIDATION_STATES => \@states, -FAILED_DESCRIPTION => $failed_description); push @vars, $cur_var; $cur_var_id = $var_id; } if(!defined($cur_allele_id) || $cur_allele_id != $allele_id) { my $pop; if($allele_sample_id) { $pop = $seen_pops{$allele_sample_id} ||= $pa->fetch_by_dbID($allele_sample_id); } if (defined $allele_id){ my $allele = Bio::EnsEMBL::Variation::Allele->new (-dbID => $allele_id, -ALLELE => $allele, -FREQUENCY => $allele_freq, -POPULATION => $pop); $cur_var->add_Allele($allele); $cur_allele_id = $allele_id; } } if(defined ($syn_source) && !$seen_syns{"$syn_source:$syn_name"}) { $seen_syns{"$syn_source:$syn_name"} = 1; $cur_var->add_synonym($syn_source, $syn_name); } } # # this next slab of code is a hack # # it deals with the case where the same rsID and population have multiple # # allele frequencies in the allele table - it forces each variation to return # # only 1 matching (i.e. add up to 1) pair of frequencies for each population # my %by_pop; # my %seen_alleles; # my %by_allele; # my %flag; # my @chosen; # # foreach my $var(@vars) { # %by_pop = (); # %seen_alleles = (); # %flag = (); # @chosen = (); # # # get all the alleles # my @alleles = @{$var->get_all_Alleles()}; # # # arrange them in a hash by population and flag any that we want to check # foreach my $allele(@alleles) { # push @{$by_pop{$allele->population->name}}, $allele; # # if(defined $seen_alleles{$allele->population}{$allele->allele}) { # $flag{$allele->population->name} = 1; # } # else { # $seen_alleles{$allele->population}{$allele->allele} = 1; # } # } # # foreach my $pop(keys %by_pop) { # # # if this population has been flagged # if(defined $flag{$pop}) { # # # arrange in a hash by allele # %by_allele = (); # foreach my $allele(@{$by_pop{$pop}}) { # push @{$by_allele{$allele->allele}}, $allele; # } # # # since it's arbitrary which pair we choose, just get the first allele and frequency # my @allele_order = sort keys %by_allele; # my $first_allele = shift @allele_order; # my $first_freq = $by_allele{$first_allele}->[0]->frequency; # # push @chosen, $by_allele{$first_allele}->[0]; # last if $first_freq == 1; # # # now try and get an allele to match this first chosen one # # only works at the moment if the SNP has 2 alleles # if(scalar @allele_order == 1) { # my %diffs = (); # my %link = (); # # foreach my $allele(@{$by_allele{$allele_order[0]}}) { # $diffs{$allele->frequency} = abs(1 - ($first_freq + $allele->frequency)); # $link{$allele->frequency} = $allele; # } # # my $best = (sort {$diffs{$a} <=> $diffs{$b}} keys %diffs)[0]; # # push @chosen, $link{$best};# if $diffs{$best} < 0.01; # } # # # if it has more than 2 alleles, just put all of them in # else { # push @chosen, @{$by_pop{$pop}}; # } # } # # # if not flagged, add all alleles # else { # push @chosen, @{$by_pop{$pop}}; # } # } # # # replace this variation's allele list ref with the new one # $var->{'alleles'} = \@chosen; # } return \@vars; } 1;