Raw content of dbSNP::Mosquito
use strict;
use warnings;
#object that contains the specific methods to dump data when there specie is a mosquito (not contigs, as usual).
package dbSNP::Mosquito;
use dbSNP::GenericContig;
use vars qw(@ISA);
use ImportUtils qw(debug load dumpSQL create_and_load);
@ISA = ('dbSNP::GenericContig');
# will be used to store the 2 different strains and the description
our %STRAINS = ( 'PEST' => 'SNPs detected by comparing mosquito pest and mopti reads against Celera Anopheles Gambiae build 2 Genomic Reference Sequence using ssahaSNP',
'MOPTI' => 'SNPs detected by comparing mosquito pest and mopti reads against Celera Anopheles Gambiae build 2 Genomic Reference Sequence using ssahaSNP');
sub dump_dbSNP{
my $self = shift;
#first, dump all dbSNP data as usual
$self->SUPER::dump_dbSNP();
#then, get strain information
$self->add_strains();
}
sub add_strains{
my $self = shift;
my $pop_MOPTI; #population_id for the mopti strain
my $pop_PEST; #population_id for the pest strain
#first of all, add to the population table the 2 strains: PEST and MOPTI
foreach my $population_name (keys %STRAINS){
$self->{'dbVariation'}->do(qq{INSERT INTO sample (name,description)
VALUES ('$population_name', '$STRAINS{$population_name}')});
#store in the same hash the id for the population
$STRAINS{$population_name} = $self->{'dbVariation'}->{'mysql_insertid'};
}
#and copy the data from the sample to the Population table
debug("Loading population table with data from sample");
$self->{'dbVariation'}->do(qq{INSERT INTO population (sample_id,is_strain)
SELECT sample_id, 1
FROM sample
});
#then, get from dbSNP the relation between ssId and mopti/pest strain
debug("Dumping strain information from dbSNP");
dumpSQL($self->{'dbSNP'}, qq{SELECT subsnp_id, loc_snp_id
FROM SubSNP
});
create_and_load($self->{'dbVariation'},'tmp_strain', "subsnp_id i*", "strain");
debug("Updating allele table with strain alleles");
#first, update the MOPTI alleles
$self->{'dbVariation'}->do(qq{UPDATE allele , variation_synonym, tmp_strain
SET allele.sample_id = $STRAINS{'MOPTI'}, allele.frequency = 1
WHERE allele.variation_id = variation_synonym.variation_id
AND SUBSTRING(variation_synonym.name,3) = tmp_strain.subsnp_id
AND tmp_strain.strain like '%mopti%'
});
#and do the same for the PEST alleles
$self->{'dbVariation'}->do(qq{UPDATE allele , variation_synonym, tmp_strain
SET allele.sample_id = $STRAINS{'PEST'}, allele.frequency = 1
WHERE allele.variation_id = variation_synonym.variation_id
AND SUBSTRING(variation_synonym.name,3) = tmp_strain.subsnp_id
AND tmp_strain.strain like '%pest%'
});
#and finally, drop the temporary table
$self->{'dbVariation'}->do(qq{DROP TABLE tmp_strain});
}
sub variation_feature{
my $self = shift;
my %scaff; #contains relation AAAB -> NW
open(IN,"gzip -dc dbSNP/scaff_NW_chr_CRA_AAAB.gz |") || die "Could not get file with scaff mapping data: $!\n";
my @line;
while (){
chomp;
@line = split /\t/;
$scaff{$line[3]} = $line[0];
}
### TBD not sure if variations with map_weight > 1 or 2 should be
### imported.
debug("Dumping seq_region data");
my $sth = $self->{'dbCore'}->dbc()->prepare(qq{SELECT sr.seq_region_id, sr.name
FROM seq_region sr, coord_system cs
WHERE sr.coord_system_id = cs.coord_system_id
AND cs.name = "scaffold"});
$sth->execute();
open(FH,">" . $self->{'tmpdir'} . '/' . $self->{'tmpfile'}); #open the file with the data dump from the core database
my ($tablename1,$tablename2,$row);
while($row = $sth->fetchrow_arrayref()) {
my @row = @$row;
$row[1] = $scaff{$row[1]} if (defined $scaff{$row[1]});
@row = map {(defined($_)) ? $_ : '\N'} @row; # convert undefined to NULL;
print FH join("\t", @row), "\n";
}
$sth->finish();
close(FH);
debug("Loading seq_region data");
create_and_load($self->{'dbVariation'}, "tmp_seq_region", "seq_region_id", "name *");
debug("Dumping SNPLoc data");
my ($assembly_version) = $self->{'assembly_version'} =~ /^[a-zA-Z]+(\d+)\.*.*$/; $assembly_version=2;
print "assembly_version again is $assembly_version\n";
my $sth1 = $self->{'dbSNP'}->prepare(qq{SHOW TABLES LIKE
'$self->{'dbSNP_version'}\_SNPContigLoc\_$assembly_version\__'});
$sth1->execute();
while($row = $sth1->fetchrow_arrayref()) {
$tablename1 = $row->[0];
}
my $sth2 = $self->{'dbSNP'}->prepare(qq{SHOW TABLES LIKE
'$self->{'dbSNP_version'}\_ContigInfo\_$assembly_version\__'});
$sth2->execute();
while($row = $sth2->fetchrow_arrayref()) {
$tablename2 = $row->[0];
}
print "table_name1 is $tablename1 table_name2 is $tablename2\n";
dumpSQL($self->{'dbSNP'}, qq{SELECT t1.snp_id, t2.contig_acc,
t1.lc_ngbr+2,t1.rc_ngbr,
IF(orientation, -1, 1)
FROM $tablename1 t1, $tablename2 t2
WHERE t1.ctg_id=t2.ctg_id
$self->{'limit'}});
debug("Loading SNPLoc data");
create_and_load($self->{'dbVariation'}, "tmp_contig_loc", "snp_id i*", "contig *", "start i",
"end i", "strand i");
#creating the temporary table with the genotyped variations
$self->{'dbVariation'}->do(qq{CREATE TABLE tmp_genotyped_var SELECT DISTINCT variation_id FROM tmp_individual_genotype_single_bp});
$self->{'dbVariation'}->do(qq{CREATE UNIQUE INDEX variation_idx ON tmp_genotyped_var (variation_id)});
$self->{'dbVariation'}->do(qq{INSERT IGNORE INTO tmp_genotyped_var SELECT DISTINCT variation_id FROM individual_genotype_multiple_bp});
debug("Creating variation_feature data");
$self->{'dbVariation'}->do(qq{INSERT INTO variation_feature
(variation_id, seq_region_id,
seq_region_start, seq_region_end, seq_region_strand,
variation_name, flags, source_id, validation_status)
SELECT v.variation_id, ts.seq_region_id, tcl.start, tcl.end,
tcl.strand, v.name, IF(tgv.variation_id,'genotyped',NULL), v.source_id, v.validation_status
FROM variation v LEFT JOIN tmp_genotyped_var tgv ON v.variation_id = tgv.variation_id, tmp_contig_loc tcl, tmp_seq_region ts
WHERE v.snp_id = tcl.snp_id
AND tcl.contig = ts.name});
$self->{'dbVariation'}->do("DROP TABLE tmp_contig_loc");
$self->{'dbVariation'}->do("DROP TABLE tmp_seq_region");
$self->{'dbVariation'}->do("DROP TABLE tmp_genotyped_var");
}
1;