# Raw content of dbSNP::Human
use strict;
use warnings;
# Object containing the specific methods to dump data when the species is human (adds HGVbase and TSC information).
package dbSNP::Human;
use dbSNP::GenericContig;
use vars qw(@ISA);
use ImportUtils qw(debug load dumpSQL create_and_load);
@ISA = ('dbSNP::GenericContig');
# Entry point for the human dbSNP dump.
#
# Only the generic dump inherited from the parent class is run. The
# human-specific steps are listed below as disabled calls.
sub dump_dbSNP{
    my ($self) = @_;

    # Human-specific extras, all currently switched off:
    #   $self->dump_HGVbaseIDs();        # HGVbase IDs from Yuan's file
    #   $self->dump_TSCIDs();            # TSC data from dbSNP
    #   $self->dump_mitocondrialSNPs();  # mitochondrial SNPs from Yuan's .tb files -- DON'T RUN THIS ANYMORE

    # Dump all dbSNP data as usual. This is the last statement, so its value
    # is also what this method returns.
    $self->SUPER::dump_dbSNP();
}
# Adds the HGVbase IDs to the variation_synonym table.
#
# The rs -> HGVbase ID mapping comes from the file dbSNP/rs_hgvbase.txt.gz
# (provided by Yuan, path relative to the current working directory). It is
# uncompressed into $self->{'tmpdir'}/$self->{'tmpfile'}, loaded into the
# temporary table tmp_rs_hgvbase, joined against variation by name, and the
# temporary table is dropped again.
#
# Dies if the file cannot be uncompressed, so that an empty or stale temp
# file is never loaded into the database.
sub dump_HGVbaseIDs{
    my $self = shift;

    #copy the file with the rs-> HGVbaseID information to the temp folder
    my $archive  = 'dbSNP/rs_hgvbase.txt.gz';
    my $tmp_path = $self->{'tmpdir'} . "/" . $self->{'tmpfile'};
    # The shell is needed for the output redirection; system() returns the
    # wait status, which is 0 only when the whole command succeeded.
    system("gunzip -c $archive > " . $tmp_path) == 0
        or die "Could not uncompress $archive to $tmp_path (gunzip exit status: $?)\n";

    debug("Loading HGVbase data");
    # presumably create_and_load reads the temp file written above -- verify against ImportUtils
    create_and_load($self->{'dbVariation'},"tmp_rs_hgvbase","rsID *","HGVbaseID");

    #add a new source to the Source table
    $self->{'dbVariation'}->do(qq{INSERT INTO source (name,version) values ('HGVbase',15)
    });

    debug("Adding HGVbaseIDs to synonym table");
    #get the last autoinc id from the database (the one from the HGVbase source)
    # NOTE(review): dump_mitocondrialSNPs reads this attribute through ->dbh(); confirm which form matches the handle type
    my $source_id = $self->{'dbVariation'}->{'mysql_insertid'};

    #add the HGVbaseIDs to the Variation_Synonym table
    $self->{'dbVariation'}->do(qq{INSERT INTO variation_synonym (variation_id,source_id,name)
        SELECT v.variation_id, $source_id, trh.HGVbaseID
        FROM variation v, tmp_rs_hgvbase trh
        WHERE v.name = trh.rsID
    });

    #and finally, remove the temporary table
    $self->{'dbVariation'}->do(qq{DROP TABLE tmp_rs_hgvbase
    });
}
# Adds the TSC IDs found in dbSNP to the variation_synonym table.
#
# Steps: register 'TSC' as a source, dump (rsID, source_id, TSC id) rows from
# the dbSNP database, load them into the temporary table tmp_rs_TSC, copy the
# rows that match a known variation name into variation_synonym, and drop the
# temporary table.
sub dump_TSCIDs{
    my ($self) = @_;
    my $variation_db = $self->{'dbVariation'};

    # Register the TSC source and pick up the auto-increment id it was given.
    $variation_db->do(<<"END_SQL");
INSERT INTO source (name,version) values ('TSC',1)
END_SQL
    my $tsc_source_id = $variation_db->{'mysql_insertid'};

    # Every submitted SNP whose local id starts with 'TSC', keyed by its rs name.
    debug("Dumping TSC information from dbSNP");
    my $select_tsc_ids = <<"END_SQL";
SELECT concat('rs',ss.snp_id), $tsc_source_id, s.loc_snp_id
FROM SubSNP s, SNPSubSNPLink ss
WHERE ss.subsnp_id = s.subsnp_id
AND s.loc_snp_id like 'TSC%'
END_SQL
    dumpSQL($self->{'dbSNP'}, $select_tsc_ids);

    debug("Loading TSC ids into temporary table");
    create_and_load($variation_db, "tmp_rs_TSC", "rsID *", "source_id", "TSCid");

    # INSERT IGNORE: rows that clash with an existing synonym are skipped by the database.
    my $insert_synonyms = <<"END_SQL";
 INSERT IGNORE INTO variation_synonym (variation_id, source_id, name)
SELECT v.variation_id, trt.source_id, trt.TSCid
FROM variation v, tmp_rs_TSC trt
WHERE v.name = trt.rsID
END_SQL
    $variation_db->do($insert_synonyms);

    # The temporary table is no longer needed.
    $variation_db->do(<<"END_SQL");
DROP TABLE tmp_rs_TSC
END_SQL
}
# Reads the mitochondrial SNPs described in the RefSNP.tb and ContigHit.tb
# files (under /ecs2/scratch4/yuan/hum/MT_35) and adds them to the tables
# source, variation, flanking_sequence, allele and variation_feature.
#
# For each SNP the validation status is computed afresh: 4 when the file
# status is 'by-other-pop', SQL NULL otherwise. A SNP therefore never
# inherits the status of the SNP processed before it, and a SNP without a
# recognised status no longer produces a malformed INSERT.
# NOTE(review): NULL assumes the validation_status columns are nullable -- confirm against the schema.
sub dump_mitocondrialSNPs{
    my $self = shift;

    my %mitoSNPs;           #all the information in the files RefSNP.tb and ContigHit.tb, keyed by snp id
    my %seq_region_id_for;  #region name => seq_region_id in the core database (filled on first use)
    my $slice_adaptor = $self->{'dbCore'}->get_SliceAdaptor();

    #first of all, add the new source of information
    $self->{'dbVariation'}->do(qq{INSERT INTO source (name) values ('mitomap.com')
    });
    #get the last autoinc id in the database (the one from the mitomap.com source)
    my $source_id = $self->{'dbVariation'}->dbh()->{'mysql_insertid'};

    #reads and loads into the hash all the information in the RefSNP file
    $self->read_RefSNP(\%mitoSNPs,'/ecs2/scratch4/yuan/hum/MT_35/RefSNP.tb');
    #reads and loads into the hash the location information in the ContigHit file
    $self->read_ContigHit(\%mitoSNPs,'/ecs2/scratch4/yuan/hum/MT_35/ContigHit.tb');

    #and finally, add the information to the database
    foreach my $snp (keys %mitoSNPs){
        my $info = $mitoSNPs{$snp};

        #status is scoped to this SNP so nothing leaks in from the previous iteration
        my $status = (defined $info->{'status'} && $info->{'status'} eq 'by-other-pop') ? 4 : 'NULL';

        #insert in the Variation table
        $self->{'dbVariation'}->do(qq{INSERT INTO variation (source_id,name,validation_status)
            VALUES ($source_id, "$info->{'name'}", $status)
        });
        #get the last autoinc id in the database (the one in the variation table)
        my $variation_id = $self->{'dbVariation'}->dbh()->{'mysql_insertid'};

        #look up the seq_region_id only once per region
        if (!exists $seq_region_id_for{$info->{'region'}}){
            my $slice = $slice_adaptor->fetch_by_region('toplevel',$info->{'region'});
            $seq_region_id_for{$info->{'region'}} = $slice_adaptor->get_seq_region_id($slice);
        }
        my $seq_region_id = $seq_region_id_for{$info->{'region'}};

        #insert in the Flanking_sequence table
        $self->{'dbVariation'}->do(qq{INSERT INTO flanking_sequence (variation_id,seq_region_id,seq_region_strand,up_seq,down_seq)
            VALUES ($variation_id,$seq_region_id,$info->{'strand'},"$info->{'up_seq'}",
            "$info->{'down_seq'}")
        });

        #insert all the alleles (stored in the file in C/T format)
        foreach my $allele (split /\//,$info->{'alleles'}){
            $self->{'dbVariation'}->do(qq{INSERT INTO allele (variation_id, allele) VALUES ($variation_id,"$allele")
            });
        }

        #and finally, insert in the variation_feature table
        $self->{'dbVariation'}->do(qq{INSERT INTO variation_feature (variation_id, seq_region_id,
            seq_region_start, seq_region_end, seq_region_strand, variation_name,source_id,validation_status)
            VALUES ($variation_id, $seq_region_id, $info->{'start'}, $info->{'end'},
            $info->{'strand'}, "$info->{'name'}",$source_id, $status)
        });
    }
}
# Reads a RefSNP.tb file (tab-separated, one SNP per line) and stores the
# fields of interest in the hash passed by reference.
#
# Arguments:
#   $snps          - hash reference, filled as $snps->{snp_id}->{field};
#                    keys already present for a SNP are left untouched
#   $file_location - path of the RefSNP.tb file (historically under Yuan's
#                    directory /ecs2/scratch4/yuan/hum/MT_35)
#
# Columns used (0-based): 0 snp id, 1 name, 5 alleles in C/T format,
# 6 upstream sequence, 7 downstream sequence, 13 validation status.
# The format and location of the file might change in each release.
#
# Dies if the file cannot be opened.
sub read_RefSNP{
    my $self = shift;
    my $snps = shift;
    my $file_location = shift;

    # 'or' (not '||') so the die is attached to open() and not to the file name
    open(my $in, '<', $file_location)
        or die "file with mitochondrial SNP information doesn't exist at $file_location ($!)\n";

    while (my $line = <$in>){
        chomp $line;                    #remove the trailing newline
        next if $line =~ /^\s*$/;       #a blank line carries no SNP

        my @fields = split /\t/, $line;
        my $snp_id = $fields[0];        #internal id of the SNP

        # hash slice: sets only these five keys and keeps anything else stored for the SNP
        @{ $snps->{$snp_id} }{qw(name alleles up_seq down_seq status)} = @fields[1, 5, 6, 7, 13];
    }
    close $in;
}
# Reads a ContigHit.tb file (tab-separated, one hit per line) and stores the
# location of each SNP in the hash passed by reference.
#
# Arguments:
#   $snps          - hash reference, filled as $snps->{snp_id}->{field};
#                    keys already present for a SNP are left untouched
#   $file_location - path of the ContigHit.tb file (historically under Yuan's
#                    directory /ecs2/scratch4/yuan/hum/MT_35)
#
# Columns used (0-based): 0 snp id, 7 region name (MT_NC_...), 9 strand,
# 10 physmapstart, 11 physmapend.
# The format and location of the file might change in each release.
#
# The stored 'region' is the region name without the leading "MT_". When the
# column does not look like MT_NC_<digits>, 'region' is set to undef instead
# of silently reusing the region captured on an earlier line.
#
# Dies if the file cannot be opened.
sub read_ContigHit{
    my $self = shift;
    my $snps = shift;
    my $file_location = shift;

    # 'or' (not '||') so the die is attached to open() and not to the file name
    open(my $in, '<', $file_location)
        or die "Could not open file with mitochondrial SNP information at $file_location ($!)\n";

    while (my $line = <$in>){
        chomp $line;                    #remove the trailing newline
        next if $line =~ /^\s*$/;       #a blank line carries no hit

        my @fields = split /\t/, $line;
        my ($snp_id, $region, $strand, $start, $end) = @fields[0, 7, 9, 10, 11];

        #get region name without the MT; only trust $1 when the match succeeded
        my $region_name;
        if (defined $region && $region =~ /MT_(NC_\d+)/){
            $region_name = $1;
        }

        $snps->{$snp_id}->{'region'} = $region_name;
        $snps->{$snp_id}->{'strand'} = $strand;
        $snps->{$snp_id}->{'start'} = $start;
        $snps->{$snp_id}->{'end'} = $end;
    }
    close $in;
}
1;