Raw content of dbSNP::MappingChromosome
use strict;
use warnings;
#object that loading mapping data into the variation_feature table
package dbSNP::MappingChromosome;
use dbSNP::GenericContig;
use vars qw(@ISA);
use ImportUtils qw(debug load create_and_load dumpSQL);
@ISA = ('dbSNP::GenericContig');
sub variation_feature{
my $self = shift;
my (%rec,%source,%status,%rec_pos,%rec_seq_region,$three_mappings);
debug("Dumping Variation");
# Create hashes of variation_id's, source_id's etc keyed by variation name
my ($variation_id,$name,$source_id,$validation_status);
my $sth = $self->{'dbVar'}->prepare (qq{SELECT variation_id, name, source_id, validation_status
FROM variation});
$sth->execute();
$sth->bind_columns(\$variation_id,\$name,\$source_id,\$validation_status);
while($sth->fetch){
$rec{$name} = $variation_id;
$source{$name} = $source_id;
$status{$name} = $validation_status ? $validation_status : '\N';
}
$sth->finish();
#we need to create a seq_region table
$self->{'dbVar'}->do(qq{CREATE TABLE IF NOT EXISTS seq_region(
seq_region_id int(10),
name varchar(40),
primary key (seq_region_id),
unique key (name))
});
# Create hash of top-level seq_region_id keyed by seq_region_name
my ($seq_region_id,$seq_region_name);
my $sth1 = $self->{'dbCore'}->dbc->prepare (qq{
select sr.seq_region_id, sr.name
from seq_region sr, seq_region_attrib sra, attrib_type at
where sr.seq_region_id=sra.seq_region_id
and sra.attrib_type_id = at.attrib_type_id
and at.code="toplevel" } );
$sth1->execute();
$sth1->bind_columns(\$seq_region_id,\$seq_region_name);
while ($sth1->fetch) {
$rec_seq_region{$seq_region_name} = $seq_region_id;
$self->{'dbVar'}->do(qq{INSERT IGNORE INTO seq_region(seq_region_id,name)values($seq_region_id,"$seq_region_name")});
}
$sth1->finish();
=head
debug("Reading Mapping file");
open (FH, ">" . $self->{'tmpdir'} . "/" . $self->{'tmpfile'} );
my $mapping_file_dir = $self->{'mapping_file_dir'};
debug("Mapping file is $mapping_file_dir");
# If mapping_file_dir is a file, read from it
if (-f "$mapping_file_dir") {
open (IN, "$mapping_file_dir") or die "can't open mapping_file:$!";
my @ar = split /\//, $mapping_file_dir;
$ar[-1] = "THREE_MAPPINGS";
$three_mappings = join "/", @ar;
print "three_mappings is $three_mappings\n";
}
# Concatenate all mapping_file_N files into a single result file
elsif (-d "$mapping_file_dir") {
system("cat $mapping_file_dir/mapping_file* > $mapping_file_dir/all_mapping_file");
system("cat $mapping_file_dir/THREE_MAPPINGS > $mapping_file_dir/THREE_MAPPINGS");
$three_mappings = "$mapping_file_dir/THREE_MAPPINGS";
open (IN, "$mapping_file_dir/all_mapping_file") or die "can't open mapping_file:$!";
}
#create table three_mappings and failed_variation
my $host = $self->{'dbVar'}->host();
my $dbname = $self->{'dbVar'}->dbname();
print "host is $host and dbname is $dbname\n";
$self->{'dbVar'}->do(qq{CREATE TABLE IF NOT EXISTS THREE_MAPPINGS (name varchar(15), index name(name))});
system("mysqlimport -L -uensadmin -pensembl -h$host $dbname $three_mappings");
$self->{'dbVar'}->do(qq{insert into failed_variation (variation_id,failed_description_id) select v.variation_id,1 from variation v, THREE_MAPPINGS t where v.name=t.name});
# Process results file
while () {
chomp;
next if /^more |^PARSING/;
s/^MORE_HITS//;
my ($ref_id, $slice_name, $start, $end, $strand, $ratio) =split;
# Skip mappings where %ID < 50%
next if $ratio <0.5;
my ($coord_sys,$assembly,$seq_region_name,$seq_region_start,$seq_region_end,$version);
# Parse the slice name
if( $slice_name =~ /-/ ){
# Old style; --
($seq_region_name,
$seq_region_start,
$seq_region_end)
= split /\-/, $slice_name;
}
elsif( $slice_name =~ /\:/ ){
# New style: uses Bio::EnsEMBL::Slice->name
($coord_sys,
$assembly,
$seq_region_name,
$seq_region_start,
$seq_region_end,
$version)
= split /\:/, $slice_name
}
else{
# Asume that we have a seq_region_name only
$seq_region_name = $slice_name;
$seq_region_start = 1;
$seq_region_end = '';
}
# Get the coreDB seq_region_id from its name
my $seq_region_id = $rec_seq_region{$seq_region_name}; # Find in cache
if (! $seq_region_id) {
# Not in cache - query database
my $sth = $self->{'dbCore'}->dbc->prepare (qq{SELECT seq_region_id from seq_region where name = ?});
$sth->execute("$seq_region_name");
$seq_region_id = $sth->fetchrow_array();
}
unless( $seq_region_id ){
debug( "Seq region $seq_region_name not found in DB" );
next;
}
# Calculate the seq_region_start for the variation_feature
my $new_seq_region_start = $start;
my $new_seq_region_end = $end;
if( $seq_region_start ){ # Apply offset if available
$new_seq_region_start = $seq_region_start + $start -1;
$new_seq_region_end = $seq_region_start + $end -1;
}
$strand = ($strand eq "+") ? 1 : -1;
#print "$ref_id\n";
#next if ( $rec_pos{$ref_id}{$seq_region_id}{$new_seq_region_start}{$new_seq_region_end}); # Skip if variation_feature already processed
# Print to output file
print FH join
( "\t",
$seq_region_id, # variation_feature.seq_region_id
$new_seq_region_start, # variation_feature.seq_region_start
$new_seq_region_end, # variation_feature.seq_region_end
$strand, # variation_feature.seq_region_strand
$rec{$ref_id}, # variation_feature.variation_id
$ref_id, # variation_feature.variation_name
$source{$ref_id}, # variation_feature.source_id
$status{$ref_id}||'NULL', # variation_feature.validation_status
) . "\n";
# Flag as processed
#$rec_pos{$ref_id}{$seq_region_id}{$new_seq_region_start}{$new_seq_region_end}=1; needs too much memory to hashing
}
close IN;
close FH;
=cut
debug("Creating genotyped variations");
# Creating temporary variation_feature table
create_and_load( $self->{'dbVar'},
"tmp_variation_feature",
"seq_region_id",
"seq_region_start",
"seq_region_end",
"seq_region_strand",
"variation_id *",
"variation_name",
"source_id",
"validation_status" );
# creating the temporary table with the genotyped variations
$self->{'dbVar'}->do(qq{
CREATE TABLE tmp_genotyped_var
SELECT DISTINCT variation_id FROM tmp_individual_genotype_single_bp});
$self->{'dbVar'}->do(qq{
CREATE UNIQUE INDEX variation_idx ON tmp_genotyped_var (variation_id)});
$self->{'dbVar'}->do(qq{
INSERT IGNORE INTO tmp_genotyped_var
SELECT DISTINCT variation_id FROM individual_genotype_multiple_bp});
# Copy from tmp variation_feature to final variation_feature
$self->{'dbVar'}->do(qq{
INSERT INTO variation_feature
(variation_id,
seq_region_id,
seq_region_start,
seq_region_end,
seq_region_strand,
variation_name,
flags,
source_id,
validation_status)
SELECT tv.variation_id,
tv.seq_region_id,
tv.seq_region_start,
tv.seq_region_end,
tv.seq_region_strand,
tv.variation_name,
IF(tgv.variation_id,'genotyped',NULL),
tv.source_id,
tv.validation_status
FROM tmp_variation_feature tv LEFT JOIN
tmp_genotyped_var tgv ON tv.variation_id = tgv.variation_id });
#$self->{'dbVar'}->do(qq{DROP TABLE tmp_variation_feature});
#$self->{'dbVar'}->do(qq{DROP TABLE tmp_genotyped_var});
}
1;