Bio::EnsEMBL::Variation::DBSQL
LDFeatureContainerAdaptor
Toolbar
Summary
Bio::EnsEMBL::Variation::DBSQL::LDFeatureContainerAdaptor
Package variables
No package variables defined.
Included modules
Data::Dumper
FileHandle
POSIX
Inherit
Synopsis
$vdb = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(...);
$db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...);
$sa = $db->get_SliceAdaptor();
$lda = $vdb->get_LDFeatureContainerAdaptor();
$vfa = $vdb->get_VariationFeatureAdaptor();
# Get a LDFeatureContainer in a region
$slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6);
$ldContainer = $lda->fetch_by_Slice($slice);
print "Name of the ldContainer is: ", $ldContainer->name();
# fetch ld featureContainer for a particular variation feature
$vf = $vfa->fetch_by_dbID(145);
$ldContainer = $lda->fetch_by_VariationFeature($vf);
print "Name of the ldContainer: ", $ldContainer->name();
Description
This adaptor provides database connectivity for LDFeature objects.
LD Features may be retrieved from the Ensembl variation database by
several means using this module.
Methods
_convert_genotype | No description | Code |
_get_LD_populations | No description | Code |
_get_siblings | No description | Code |
_objs_from_sth | No description | Code |
_objs_from_sth_temp_file | No description | Code |
_store_genotype | No description | Code |
executable | No description | Code |
fetch_by_Slice | Description | Code |
fetch_by_VariationFeature | Description | Code |
temp_path | No description | Code |
Methods description
Arg [1] : Bio::EnsEMBL::Slice $slice The slice to fetch genes on. Assuming it is always correct (in the top level) Arg [2] : (optional) Bio::EnsEMBL::Variation::Population $population. Population where we want to select the LD information Example : $ldFeatureContainer = $ldfeaturecontainer_adaptor->fetch_by_Slice($slice); Description: Overwrites superclass method to add the name of the slice to the LDFeatureContainer. Returntype : Bio::EnsEMBL::Variation::LDFeatureContainer Exceptions : thrown on bad argument Caller : general Status : At Risk |
Arg [1] : Bio::EnsEMBL:Variation::VariationFeature $vf Arg [2] : (optional) int $population_id. Population where we want to select the LD information Example : my $ldFeatureContainer = $ldFetureContainerAdaptor->fetch_by_VariationFeature($vf); Description: Retrieves LDFeatureContainer for a given variation feature. Most variations should only hit the genome once and only a return a single variation feature. Returntype : reference to Bio::EnsEMBL::Variation::LDFeatureContainer Exceptions : throw on bad argument Caller : general Status : At Risk |
Methods code
_convert_genotype | description | prev | next | Top |
sub _convert_genotype
{ my $self = shift;
my $alleles_variation = shift; my $individual_information = shift; my @alleles_ordered;
@alleles_ordered = sort({$alleles_variation->{$b} <=> $alleles_variation->{$a}} keys %{$alleles_variation});
foreach my $individual_id (keys %{$individual_information}){
if ($individual_information->{$individual_id}{allele_1} ne $individual_information->{$individual_id}{allele_2}){
$individual_information->{$individual_id}{genotype} = 'Aa';
}
else{
if ($alleles_ordered[0] eq $individual_information->{$individual_id}{allele_1}){
$individual_information->{$individual_id}{genotype} = 'AA';
}
else{
$individual_information->{$individual_id}{genotype} = 'aa';
}
}
} } |
sub _get_LD_populations
{ my $self = shift;
my $siblings = shift;
my ($pop_id,$population_name);
my $sth = $self->db->dbc->prepare(qq{SELECT s.sample_id, s.name
FROM population p, sample s
WHERE (s.name like 'PERLEGEN:AFD%'
OR s.name like 'CSHL-HAPMAP%')
AND s.sample_id = p.sample_id});
$sth->execute();
$sth->bind_columns(\$pop_id,\$population_name);
my @pops;
while($sth->fetch){
if($population_name =~ /CEU|YRI/){
$self->_get_siblings($pop_id,$siblings);
}
push @pops, $pop_id;
}
my $in_str = " IN (" . join(',', @pops). ")";
return $in_str if (defined $pops[0]);
return '' if (!defined $pops[0]); } |
sub _get_siblings
{ my $self = shift;
my $population_id = shift;
my $siblings = shift;
my $sth_individual = $self->db->dbc->prepare(qq{SELECT i.sample_id
FROM individual i, individual_population ip
WHERE ip.individual_sample_id = i.sample_id
AND ip.population_sample_id = ?
AND i.father_individual_sample_id IS NOT NULL
AND i.mother_individual_sample_id IS NOT NULL
});
my ($individual_id);
$sth_individual->execute($population_id);
$sth_individual->bind_columns(\$individual_id);
while ($sth_individual->fetch){
$siblings->{$population_id.'-'.$individual_id}++; }
}
} |
sub _objs_from_sth
{ my $self = shift;
return $self->_objs_from_sth_temp_file( @_ );
my $sth = shift;
my $slice = shift;
my $siblings = shift;
my ($sample_id,$ld_region_id,$ld_region_start,$ld_region_end,$d_prime,$r2,$sample_count);
my ($vf_id1,$vf_id2);
my %feature_container = ();
my %vf_objects = ();
my $vfa = $self->db->get_VariationFeatureAdaptor();
my $variations = $vfa->fetch_all_by_Slice($slice); my %pos_vf = ();
my $region_Slice = $slice->seq_region_Slice();
map {$pos_vf{$_->seq_region_start} = $_->transfer($region_Slice)} @{$variations};
my %alleles_variation = (); my %individual_information = (); my %regions; my $previous_seq_region_id = 0;
my %_pop_ids;
my ($individual_id, $seq_region_id, $seq_region_start,$seq_region_end,$genotypes, $population_id);
my $bin = $self->executable;
if( ! $bin ) {
warning("Binary file calc_genotypes not found. Please, read the ensembl-variation/C_code/README.txt file if you want to use LD calculation\n");
goto OUT;
}
my $pid;
eval "require IPC::Run"; if( $@ ){
warning("IPC::Run it is not installed in you system. Please, read ensembl-variation/C_code/README.txt if you want to use the LD calculation");
goto OUT;
} else{
use IPC::Run qw(start finish);
$pid = start [$bin], '<pipe',\* IN, '>pipe',\* OUT, '2>pipe',\* ERR || die "returned $?" ;
}
my $piid = fork;
if (!defined $piid){
throw("Not possible to fork: $!\n");
}
elsif ($piid !=0){
close IN || die "Could not close writer filehandle: $!\n";
while(<OUT>){
my %ld_values = ();
chomp;
($sample_id,$ld_region_id,$ld_region_start,$ld_region_end,$r2,$d_prime,$sample_count) = split /\s/;
$ld_values{'d_prime'} = $d_prime;
$ld_values{'r2'} = $r2;
$ld_values{'sample_count'} = $sample_count;
$vf_id1 = $pos_vf{$ld_region_start}->dbID();
$vf_id2 = $pos_vf{$ld_region_end}->dbID();
$feature_container{$vf_id1 . '-' . $vf_id2}->{$sample_id} =\% ld_values;
$vf_objects{$vf_id1} = $pos_vf{$ld_region_start};
$vf_objects{$vf_id2} = $pos_vf{$ld_region_end};
$_pop_ids{$sample_id} = 1;
}
waitpid($piid,0);
close OUT || die "Could not close filehandle: $!\n";
finish $pid || die "Could not finish fork..: $!\n";
}
else{
close OUT || die "Could not close filehandle: $!\n";
$sth->bind_columns(\$individual_id,\$ seq_region_id,\$ seq_region_start,\$ seq_region_end,\$ genotypes,\$ population_id);
while($sth->fetch()) {
if (!exists $siblings->{$population_id . '-' . $individual_id}){ $self->_store_genotype(\%individual_information,\%alleles_variation, $individual_id, $seq_region_start, $genotypes, $population_id, $slice);
$previous_seq_region_id = $seq_region_id;
}
}
$sth->finish();
select(IN); $| = 1;
foreach my $snp_start (sort{$a<=>$b} keys %alleles_variation){
foreach my $population (keys %{$alleles_variation{$snp_start}}){
if (keys %{$alleles_variation{$snp_start}{$population}} == 2){
$self->_convert_genotype($alleles_variation{$snp_start}{$population},$individual_information{$population}{$snp_start});
foreach my $individual_id (keys %{$individual_information{$population}{$snp_start}}){
print IN join("\t",$previous_seq_region_id,$snp_start, $snp_start,
$population, $individual_id,
$individual_information{$population}{$snp_start}{$individual_id}{genotype})."\n" || warn $!;
}
}
}
}
close IN || die "Could not close filehandle: $! and $?\n";
POSIX:_exit(0);
}
OUT:
my $t = Bio::EnsEMBL::Variation::LDFeatureContainer->new(
'-ldContainer'=>\% feature_container,
'-name' => '',
'-variationFeatures' =>\% vf_objects
);
$t->{'_pop_ids'} =\%_pop_ids;
return $t;
}
} |
sub _objs_from_sth_temp_file
{ my $self = shift;
my $sth = shift;
my $slice = shift;
my $siblings = shift;
my ($sample_id,$ld_region_id,$ld_region_start,$ld_region_end,$d_prime,$r2,$sample_count);
my ($vf_id1,$vf_id2);
my %feature_container = ();
my %vf_objects = ();
my $vfa = $self->db->get_VariationFeatureAdaptor();
my $variations = $vfa->fetch_all_by_Slice($slice); my %pos_vf = ();
my $region_Slice = $slice->seq_region_Slice();
map {$pos_vf{$_->seq_region_start} = $_->transfer($region_Slice)} @{$variations};
my %alleles_variation = (); my %individual_information = (); my %regions; my $previous_seq_region_id = 0;
my %_pop_ids;
my ($individual_id, $seq_region_id, $seq_region_start,$seq_region_end,$genotypes, $population_id);
my @cmd = qw(calc_genotypes);
my $bin = $self->executable;
if( ! $bin ) {
warning("Binary file calc_genotypes not found. Please, read the ensembl-variation/C_code/README.txt file if you want to use LD calculation\n");
goto OUT;
}
my $pid;
my $file= $self->temp_path."/".sprintf( "ld%08x%08x%08x", $$, time, rand( 0x7fffffff) );
open IN, ">$file.in";
$sth->bind_columns(\$individual_id,\$ seq_region_id,\$ seq_region_start,\$ seq_region_end,\$ genotypes,\$ population_id);
while($sth->fetch()) {
if (!exists $siblings->{$population_id . '-' . $individual_id}){ $self->_store_genotype(\%individual_information,\%alleles_variation, $individual_id, $seq_region_start, $genotypes, $population_id, $slice);
$previous_seq_region_id = $seq_region_id;
}
}
$sth->finish();
foreach my $snp_start (sort{$a<=>$b} keys %alleles_variation){
foreach my $population (keys %{$alleles_variation{$snp_start}}){
if (keys %{$alleles_variation{$snp_start}{$population}} == 2){
$self->_convert_genotype($alleles_variation{$snp_start}{$population},$individual_information{$population}{$snp_start});
foreach my $individual_id (keys %{$individual_information{$population}{$snp_start}}){
print IN join("\t",$previous_seq_region_id,$snp_start, $snp_start,
$population, $individual_id,
$individual_information{$population}{$snp_start}{$individual_id}{genotype})."\n" || warn $!;
}
}
}
}
close IN;
`$bin <$file.in >$file.out`;
open OUT, "$file.out";
while(<OUT>){
my %ld_values = ();
chomp;
($sample_id,$ld_region_id,$ld_region_start,$ld_region_end,$r2,$d_prime,$sample_count) = split /\s/;
$ld_values{'d_prime'} = $d_prime;
$ld_values{'r2'} = $r2;
$ld_values{'sample_count'} = $sample_count;
if (!defined $pos_vf{$ld_region_start} || !defined $pos_vf{$ld_region_end}){
next; }
$vf_id1 = $pos_vf{$ld_region_start}->dbID();
$vf_id2 = $pos_vf{$ld_region_end}->dbID();
$feature_container{$vf_id1 . '-' . $vf_id2}->{$sample_id} =\% ld_values;
$vf_objects{$vf_id1} = $pos_vf{$ld_region_start};
$vf_objects{$vf_id2} = $pos_vf{$ld_region_end};
$_pop_ids{$sample_id} = 1;
}
close OUT || die "Could not close filehandle: $!\n";
unlink( "$file.in" );
unlink( "$file.out" );
OUT:
my $t = Bio::EnsEMBL::Variation::LDFeatureContainer->new(
'-ldContainer'=>\% feature_container,
'-name' => '',
'-variationFeatures' =>\% vf_objects
);
$t->{'_pop_ids'} =\%_pop_ids;
return $t;
}
1; } |
sub _store_genotype
{ my $self = shift;
my $individual_information = shift;
my $alleles_variation = shift;
my $individual_id = shift;
my $seq_region_start = shift;
my $genotype = shift;
my $population_id = shift;
my $slice = shift;
my $blob = substr($genotype,2);
my @genotypes = unpack("naa" x (length($blob)/4),$blob); unshift @genotypes, substr($genotype,1,1); unshift @genotypes, substr($genotype,0,1); unshift @genotypes, 0; my $snp_start;
my $allele_1;
my $allele_2;
for (my $i=0; $i < @genotypes -1;$i+=3){
if ($i == 0){
$snp_start = $seq_region_start; }
else{
if ($genotypes[$i] == 0){
$snp_start += 1;
next;
}
$snp_start += $genotypes[$i] +1;
}
$allele_1 = $genotypes[$i+1];
$allele_2 = $genotypes[$i+2];
if (($snp_start >= $slice->start) && ($snp_start <= $slice->end)){
if ($allele_1 ne 'N' and $allele_2 ne 'N'){
$alleles_variation->{$snp_start}->{$population_id}->{$allele_1}++;
$alleles_variation->{$snp_start}->{$population_id}->{$allele_2}++;
$individual_information->{$population_id}->{$snp_start}->{$individual_id}->{allele_1} = $allele_1;
$individual_information->{$population_id}->{$snp_start}->{$individual_id}->{allele_2} = $allele_2;
}
}
}
}
} |
sub executable
{ my $self = shift;
$BINARY_FILE = shift if @_;
unless( $BINARY_FILE ) {
my $binary_name = 'calc_genotypes';
($BINARY_FILE) = grep {-e $_} map {"$_/calc_genotypes"} split /:/,$ENV{'PATH'};
}
return $BINARY_FILE; } |
sub fetch_by_Slice
{ my $self = shift;
my $slice = shift;
my $population = shift;
if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) {
throw('Bio::EnsEMBL::Slice arg expected');
}
my $sth;
my $in_str;
my $siblings = {};
$in_str = $self->_get_LD_populations($siblings);
if ($population){
if(!ref($population) || !$population->isa('Bio::EnsEMBL::Variation::Population')) {
throw('Bio::EnsEMBL::Variation::Population arg expected');
}
my $population_id = $population->dbID;
$in_str = " = $population_id";
}
if ($in_str eq ''){
my $t = Bio::EnsEMBL::Variation::LDFeatureContainer->new(
'-ldContainer'=> {},
'-name' => $slice->name,
'-variationFeatures' => {}
);
return $t
}
$sth = $self->prepare(qq{SELECT c.sample_id,c.seq_region_id,c.seq_region_start,c.seq_region_end,c.genotypes,ip.population_sample_id
FROM compressed_genotype_single_bp c, individual_population ip
WHERE ip.individual_sample_id = c.sample_id
AND ip.population_sample_id $in_str
AND c.seq_region_id = ?
AND c.seq_region_start >= ? and c.seq_region_start <= ?
AND c.seq_region_end >= ?
ORDER BY c.seq_region_id, c.seq_region_start},{mysql_use_result => 1});
$sth->bind_param(1,$slice->get_seq_region_id,SQL_INTEGER);
$sth->bind_param(2,$slice->start - MAX_SNP_DISTANCE,SQL_INTEGER) if ($slice->start - MAX_SNP_DISTANCE >= 1);
$sth->bind_param(2,1,SQL_INTEGER) if ($slice->start - MAX_SNP_DISTANCE < 1);
$sth->bind_param(3,$slice->end,SQL_INTEGER);
$sth->bind_param(4,$slice->start,SQL_INTEGER);
$sth->execute();
my $ldFeatureContainer = $self->_objs_from_sth($sth,$slice,$siblings);
$sth->finish();
$ldFeatureContainer->name($slice->name());
return $ldFeatureContainer; } |
sub fetch_by_VariationFeature
{ my $self = shift;
my $vf = shift;
my $pop = shift;
if(!ref($vf) || !$vf->isa('Bio::EnsEMBL::Variation::VariationFeature')) {
throw('Bio::EnsEMBL::Variation::VariationFeature arg expected');
}
if(!defined($vf->dbID())) {
throw("VariationFeature arg must have defined dbID");
}
my $ldFeatureContainer = $self->fetch_by_Slice($vf->feature_Slice->expand(MAX_SNP_DISTANCE,MAX_SNP_DISTANCE),$pop);
my %feature_container = ();
my %vf_objects = ();
my %pop_ids = ();
my $vf_id = $vf->dbID;
foreach my $ld_key (keys %{$ldFeatureContainer->{'ldContainer'}}){
if ($ld_key =~ /$vf_id/){
my ($vf_id1,$vf_id2) = split /-/,$ld_key;
$feature_container{$vf_id1 . '-' . $vf_id2} = $ldFeatureContainer->{'ldContainer'}->{$ld_key};
$vf_objects{$vf_id1} = $ldFeatureContainer->{'variationFeatures'}->{$vf_id1};
$vf_objects{$vf_id2} = $ldFeatureContainer->{'variationFeatures'}->{$vf_id2};
map {$pop_ids{$_}++} keys %{$ldFeatureContainer->{'ldContainer'}->{$ld_key}};
}
}
my $new_ldFeatureContainer = Bio::EnsEMBL::Variation::LDFeatureContainer->new(
'-ldContainer'=>\% feature_container,
'-name' => $vf->dbID,
'-variationFeatures' =>\% vf_objects
);
$new_ldFeatureContainer->{'_pop_ids'} =\% pop_ids;
return $new_ldFeatureContainer;
}
} |
sub temp_path
{ my $self = shift;
$TMP_PATH = shift if @_;
$TMP_PATH ||= '/tmp';
return $TMP_PATH; } |
General documentation