Raw content of SARA::GetSARA
use strict;
use warnings;
#generic object for the dbSNP data. Contains the general methods to dump the data into the new Variation database. Any change in the methods
# will need to overload the correspondent method in the subclass for the specie
package SARA::GetSARA;
use Bio::Index::Fastq;
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use ImportUtils qw(dumpSQL debug create_and_load load);
use Time::HiRes qw(tv_interval gettimeofday);
#creates the object and assign the attributes to it (connections, basically)
sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my ($dbCore, $dbVar, $dbSara, $tmp_dir, $tmp_file, $species, $source_id) =
rearrange([qw(DBCORE DBVAR DBSARA TMPDIR TMPFILE SPECIES SOURCE_ID)],@_);
return bless {'dbSara' => $dbSara,
'dbCore' => $dbCore,
'dbVar' => $dbVar, ##this is a dbconnection
'tmpdir' => $tmp_dir,
'tmpfile' => $tmp_file,
'species' => $species,
'source_id' => $source_id,
}, $class;
}
#main and only function in the object that dumps all dbSNP data
sub get_sara{
my $self = shift;
my $var_dbname = ($self->{'dbVar'}) ? $self->{'dbVar'}->dbname : "var_dbname";
my %rec_seq_region_id;
my $sth = $self->{'dbSara'}->prepare(qq{SELECT distinct seq_region_id
FROM $var_dbname.variation_feature
});
my ($seq_region_id);
$sth->execute();
$sth->bind_columns(\$seq_region_id);
while ($sth->fetch()) {
$rec_seq_region_id{$seq_region_id}=1;
}
#can't built index_file for human in lustre file system, ask Tim? (need 2-3 days), so need to use turing for flanking_qual part of the comput
my $queue_hugemem = "-q hugemem -R'select[mem>5000] rusage[mem=5000]'";
my $queue_linux64 = "-q normal -R'select[type == LINUX64 && myens_genomics2 < 200] rusage[myens_genomics1=10]'";
my $queue_long = "-q long -M5000000 -R'select[mem>5000] rusage[mem=5000]'";
my $queue;
$queue = $queue_hugemem if $self->{'tmpdir'} =~ /turing/;
$queue = $queue_long if $self->{'tmpdir'} !~ /turing/;
#$queue = $queue_linux64 if $self->{'tmpdir'} !~ /turing/;
#my $t0 = [gettimeofday];
#debug("Time to start get_query_snp_pos $t0");
#$self->get_query_snp_pos(\%rec_seq_region_id,$var_dbname);
#my $t1 = [gettimeofday];
#debug("Time to start get_flanking_seq_qual $t1");
#$self->make_reads_file(\%rec_seq_region_id,$queue);
#$self->make_pileup_reads_file();
#$self->get_flanking_seq_qual(\%rec_seq_region_id,$var_dbname,$queue);
#print "Time to run GetSARA ", tv_interval($t0,$t1),"\n";
#$self->make_genotype_allele_table(\%rec_seq_region_id,$var_dbname,$queue);
#$self->merge_tables();
#$self->insert_allele_gtype_to_vardb(\%rec_seq_region_id,$var_dbname);
#$self->remove_faulse_variation_in_multiple_strains();
#$self->read_coverage($var_dbname);##this is been calculated ealier
#$self->remove_empty_tables();
$self->remove_tables();
#$self->merge_table_with_same_seq_region_id();
}
sub get_query_snp_pos {
my $self = shift;
my $rec_seq_region_id = shift;
my $var_dbname = shift;
my $source_id = $self->{'source_id'};
my $tmp_dir = $self->{'tmpdir'};
my $tmp_file = $self->{'tmpfile'};
my $count;
my $species = $self->{'species'};
foreach my $seq_region_id (keys %$rec_seq_region_id) {
#foreach my $seq_region_id (226032,226034) {
my $call = "bsub -q long -R'select[myens_genomics1 < 200] rusage[myens_genomics1=10]' -J $var_dbname\_ssaha_feature_job_$seq_region_id -o $tmp_dir/ssaha_feature_out\_$seq_region_id /usr/local/ensembl/bin/perl parallel_sara_zm.pl -species $species -source_id $source_id -seq_region_id $seq_region_id -job snp_pos -tmpdir $tmp_dir -tmpfile $tmp_file";
$count++;
print "call is $call $count\n";
system($call);
}
my $call1 = "bsub -q normal -K -w 'done($var_dbname\_ssaha_feature_job*)' -J waiting_process sleep 1"; #waits until all variation features have finished to continue
system($call1);
}
sub get_flanking_seq_qual {
##use farm run smaller jobs or use turing run all jobs
my $self = shift;
my $rec_seq_region_id = shift;
my $var_dbname = shift;
my $queue = shift;
my $species = $self->{'species'};
my $tmp_dir = $self->{'tmpdir'};
my $tmp_file = $self->{'tmpfile'};
my $count;
my $defined_table_row = 100000;
my $reads_dir;
#my $index_file = "/lustre/work1/ensembl/dr2/data/Watson/index_file";
#my $index_file = "/turing/mouse129_extra/yuan/human_celera/dani/reads_dir/watson/index_file";
#$reads_dir = "/lustre/scratch1/ensembl/yuan/tmp/mouse/reads_out" if $tmp_dir =~ /lustre/;
#$reads_dir = "/lustre/scratch1/ensembl/yuan/mouse/reads_dir/venter" if $tmp_dir =~ /lustre/;
$reads_dir = "/lustre/work1/ensembl/yuan/SARA/tetraodon/new_assembly/pileup_dir/reads_out";
#$reads_dir = "/turing/mouse129/yuan/mouse/sanger_mouse/input_dir/reads_dir" if $tmp_dir =~ /turing/;
#job 1 somehow failed with memory 7 GB, so sent to turing, but only maxmem 3GB
my %rec_id_name;
#note the read_file is seq_region_name rather than seq_region_id,so use this
#my $sth = $self->{'dbSara'}->prepare(qq{SELECT seq_region_id,seq_region_name
# FROM $var_dbname.tmp_seq_region
# });
#my ($seq_region_id,$seq_region_name);
#$sth->execute(); $sth->bind_columns(\$seq_region_id,\$seq_region_name);
#while ($sth->fetch()) {
# $rec_id_name{$seq_region_id}=$seq_region_name;
#}
my $call;
foreach my $seq_region_id (keys %$rec_seq_region_id) {
#foreach my $seq_region_id (226032,226034) {
my $row_count_ref = $self->{'dbSara'}->db_handle->selectall_arrayref(qq{SELECT COUNT(*) FROM snp_pos_$seq_region_id});
my $row_count = $row_count_ref->[0][0];
if ($row_count > $defined_table_row) {
#my $reads_file = "$reads_dir/reads_out_$rec_id_name{$seq_region_id}";
my $reads_file = "$reads_dir/reads_out_$seq_region_id";
$call = "bsub $queue -J $var_dbname\_ssaha_flank_job_$seq_region_id -o $tmp_dir/ssaha_flank_out\_$seq_region_id perl parallel_sara_zm.pl -species $species -seq_region_id $seq_region_id -job flank -reads_file $reads_file -tmpdir $tmp_dir -tmpfile $tmp_file";
#$call = "bsub $queue -J $var_dbname\_ssaha_flank_job_$seq_region_id -o $tmp_dir/ssaha_flank_out\_$seq_region_id perl parallel_sara_feature.pl -species $species -seq_region_id $seq_region_id -job flank -index_file $index_file -tmpdir $tmp_dir -tmpfile $tmp_file";
}
else {
my $reads_file = "$reads_dir/reads_out_small";
$call = "bsub $queue -J $var_dbname\_ssaha_flank_job_$seq_region_id -o $tmp_dir/ssaha_flank_out\_$seq_region_id perl parallel_sara_zm.pl -species $species -seq_region_id $seq_region_id -job flank -reads_file $reads_file -tmpdir $tmp_dir -tmpfile $tmp_file";
}
$count++;
print "call is $call $count\n";
system($call);
sleep(5);
}
my $call1 = "bsub -q normal -K -w 'done($var_dbname\_ssaha_flank_job*)' -J waiting_process sleep 1"; #waits until all variation features have finished to continue
system($call1);
}
sub make_reads_file {
##use turing run all seq_region_ids
##before run, run this first: /nfs/team71/psg/zn1/bin/tag.pl try.fastq > readname.tag
my $self = shift;
my $rec_seq_region_id = shift;
my $queue = shift;
my $tmp_dir = $self->{'tmpdir'};
my $tmp_file = $self->{'tmpfile'};
#my $reads_dir = "/turing/mouse129/yuan/mouse/sanger_mouse/input_dir";
#my $reads_dir = "/turing/mouse129_extra/yuan/human_celera/dani/reads_dir/venter";
my $reads_dir = "/turing/mouse129_extra/yuan/tetraodon";
my $defined_table_row = 100000;
my (@big_seq_region_id,@small_seq_region_id);
foreach my $seq_region_id (keys %$rec_seq_region_id) {
#foreach my $seq_region_id (226052,226062,226046) {
my $row_count_ref = $self->{'dbSara'}->db_handle->selectall_arrayref(qq{SELECT COUNT(*) FROM snp_pos_$seq_region_id});
my $row_count = $row_count_ref->[0][0];
if ($row_count > $defined_table_row) {
push @big_seq_region_id,$seq_region_id;
}
else {
push @small_seq_region_id, $seq_region_id;
}
}
if ($tmp_dir !~ /turing/) {
die "search_read job has to run on turing\n";
}
foreach my $seq_region_id (@big_seq_region_id) {
#foreach my $seq_region_id (226053) {
debug("Dumping reads names for $seq_region_id...");
dumpSQL($self->{'dbSara'},qq{SELECT DISTINCT query_name FROM snp_pos_$seq_region_id});
system("sort $tmp_dir/$tmp_file |uniq >$reads_dir/reads_dir/reads_name_$seq_region_id");
#system("/nfs/team71/psg/zn1/bin/tag.pl $reads_dir/*.fastq > $reads_dir/readname.tag");
#my $seq_region_id="missed";
###needs 50GB memeory to run search_read (for any size of reads_name)
#system("bsub -q hugemem -R'select[mem>50000] rusage[mem=50000]' -J reads_file_$seq_region_id -o $reads_dir/reads_dir/out_reads_file_$seq_region_id /nfs/team71/psg/zn1/bin/search_read -fastq 1 $reads_dir/reads_dir/reads_name_$seq_region_id $reads_dir/reads_dir/reads_out_$seq_region_id $reads_dir/readname.tag $reads_dir/*fastq");
system("bsub -q hugemem -R'select[mem>50000] rusage[mem=50000]' -J reads_file_$seq_region_id -o $reads_dir/reads_dir/out_reads_file_$seq_region_id /nfs/acari/yuan/ensembl/src/ensembl-variation/scripts/ssahaSNP/ssaha_pileup/get_seqreads-turing $reads_dir/reads_dir/reads_name_$seq_region_id $reads_dir/genome-reads.fastq $reads_dir/reads_dir/reads_out_$seq_region_id");
if ($? == -1) {
print "failed to execute: $!\n";
}
elsif ($? & 127) {
printf "child died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without';
}
else {
printf "child exited with value %d\n", $? >> 8;
}
}
foreach my $seq_region_id (@small_seq_region_id) {
debug("Dumping reads names for $seq_region_id...");
dumpSQL($self->{'dbSara'},qq{SELECT query_name FROM snp_pos_$seq_region_id});
system("sort $tmp_dir/$tmp_file |uniq >>$reads_dir/reads_dir/reads_name_small");
}
###needs 50GB memeory to run search_read (for any size of reads_name)
#system("bsub -q hugemem -R'select[mem>130000] rusage[mem=130000]' -J reads_file_small -o $reads_dir/reads_dir/out_reads_file_small /nfs/team71/psg/zn1/bin/search_read -fastq 1 $reads_dir/reads_dir/reads_name_small $reads_dir/reads_dir/reads_out_small $reads_dir/readname.tag $reads_dir/*fastq");
system("bsub -q hugemem -R'select[mem>50000] rusage[mem=50000]' -J reads_file_small -o $reads_dir/reads_dir/out_reads_file_small /nfs/acari/yuan/ensembl/src/ensembl-variation/scripts/ssahaSNP/ssaha_pileup/get_seqreads-turing $reads_dir/reads_dir/reads_name_small $reads_dir/genome-reads.fastq $reads_dir/reads_dir/reads_out_small");
if ($? == -1) {
print "failed to execute: $!\n";
}
elsif ($? & 127) {
printf "child died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without';
}
else {
printf "child exited with value %d\n", $? >> 8;
}
my $call1 = "bsub -q hugemem -R'select[mem>100] rusage[mem=100]' -K -w 'done(reads_file*)' -J waiting_process sleep 1"; #waits until all variation features have finished to continue
system($call1);
system("scp $reads_dir/reads_dir/reads_out_* farm-login:/lustre/scratch1/ensembl/yuan/tmp/tetraodon/reads_out");
#return ("$reads_dir/reads_dir/reads_out_$seq_region_id$t\_000.fastq");
}
sub make_pileup_reads_file {
##use turing run all chromosomes
##before run, run this first: /nfs/team71/psg/zn1/bin/tag.pl try.fastq > readname.tag
my $self = shift;
my $tmp_dir = $self->{'tmpdir'};
my $tmp_file = $self->{'tmpfile'};
my $fastq_dir = "/turing/mouse129_extra/yuan/watson/fastq";
my $output_dir = "/turing/mouse129_extra/yuan/watson/output_dir";
my $target_dir = "/turing/mouse129_extra/yuan_watson/target_dir";
opendir DIR, "$output_dir" or die "Failed to open dir : $!";
my @reads_dirs = grep /dir$/, readdir(DIR);
print "files are @reads_dirs\n";
#foreach my $read_dir (@reads_dirs) {
foreach my $read_dir("1_dir") {
my ($chr) = $read_dir =~ /(\S+)\_dir/;
debug("chr is $chr Get reads fastq for $read_dir...");
###needs 50GB memeory to run search_read (for any size of reads_name)
system("bsub -q hugemem -R'select[mem>60000] rusage[mem=60000]' -J reads_file_$read_dir -o $output_dir/$read_dir/out_reads_file_$chr /nfs/team71/psg/zn1/bin/search_read -fastq 1 $output_dir/$read_dir/$chr\_read_name $output_dir/$read_dir/reads_out_$chr $fastq_dir/readname.tag $fastq_dir/*fastq");
if ($? == -1) {
print "failed to execute: $!\n";
}
elsif ($? & 127) {
printf "child died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without';
}
else {
printf "child exited with value %d\n", $? >> 8;
}
debug("Running pileup SNP...");
if (! -e "$output_dir/$read_dir/reads_out_$chr\.fastq") {
system("cat $output_dir/$read_dir/reads_out_$chr\_*.fastq >$output_dir/$read_dir/reads_out_$chr\.fastq");
}
system("bsub -q hugemem -M20000000 -R'select[mem>20000] rusage[mem=20000]' -o $output_dir/$read_dir/out_$chr\_SNP /nfs/team71/psg/zn1/src/ssahaSNP2/parse_SNP/view_ssahaSNP/cons/ABI/ssahaSNP_cons $output_dir/$read_dir/$chr\_align $output_dir/$read_dir/$chr\_cigar $target_dir/$chr\.fa $output_dir/$read_dir/reads_out_$chr\.fastq");
}
}
sub make_genotype_allele_table {
##use farm run these jobs
my $self = shift;
my $rec_seq_region_id = shift;
my $var_dbname = shift;
my $queue = shift;
my $tmp_dir = $self->{'tmpdir'};
my $tmp_file = $self->{'tmpfile'};
my $count;
my $species = $self->{'species'};
foreach my $seq_region_id (keys %$rec_seq_region_id) {
#foreach my $seq_region_id (226055) {
my $call = "bsub -q normal -R'select[myens_genomics1 < 200] rusage[myens_genomics1=10]' -J $var_dbname\_ssaha_gtype_job_$seq_region_id -o $tmp_dir/ssaha_gtype_out\_$seq_region_id perl parallel_sara_zm.pl -species $species -seq_region_id $seq_region_id -job gtype_allele -tmpdir $tmp_dir -tmpfile $tmp_file";
$count++;
print "call is $call $count\n";
system($call);
}
my $call1 = "bsub -q normal -K -w 'done($var_dbname\_ssaha_gtype_job*)' -J waiting_process sleep 1"; #waits until all variation features have finished to continue
system($call1);
}
sub insert_allele_gtype_to_vardb {
my ($self, $rec_seq_region_id, $var_dbname) = @_;
$self->{'dbVar'}->do(qq{ALTER TABLE allele add unique index unique_allele_idx(variation_id,allele,sample_id)});
$self->{'dbVar'}->do(qq{CREATE TABLE tmp_individual_genotype_single_bp (
variation_id int not null,allele_1 varchar(255),allele_2 varchar(255),sample_id int,
key variation_idx(variation_id),
key sample_idx(sample_id)
) MAX_ROWS = 100000000}
);
$self->{'dbVar'}->do(qq{CREATE UNIQUE INDEX ind_genotype_idx ON tmp_individual_genotype_single_bp(variation_id,sample_id,allele_1,allele_2)});
debug("Insert individual allele first...");
$self->{'dbSara'}->do(qq{INSERT IGNORE INTO $var_dbname.allele (variation_id,allele,sample_id) select ga.variation_id,ga.allele,ip.population_sample_id from gtype_allele ga, $var_dbname.sample s, $var_dbname.individual_population ip where ga.sample_name = s.name and s.sample_id = ip.individual_sample_id});
debug("Then insert reference allele...");
$self->{'dbSara'}->do(qq{INSERT IGNORE INTO $var_dbname.allele (variation_id,allele,sample_id) select ga.variation_id,ga.allele,s.sample_id from gtype_allele ga, $var_dbname.sample s where ga.sample_name = s.name and s.name like "ENS%"}) ;
debug("insert into tmp_genotype table...");
$self->{'dbSara'}->do(qq{INSERT IGNORE INTO $var_dbname.tmp_individual_genotype_single_bp
(variation_id,allele_1,allele_2,sample_id) select ig.variation_id,ig.allele_1,ig.allele_2,s.sample_id from tmp_individual_genotype_single_bp ig, $var_dbname.sample s where ig.sample_name = s.name});
$self->{'dbVar'}->do(qq{DROP INDEX unique_allele_idx ON allele});
$self->{'dbVar'}->do(qq{DROP INDEX ind_genotype_idx ON tmp_individual_genotype_single_bp});
#delete entries from variation, variation_feature and flanking_sequence tables that variation_id is not in allele and tmp_genotype_single_bp table
#$self->{'dbSara'}->do(qq{CREATE TABLE variation_old SELECT * FROM $var_dbname.variation});
#$self->{'dbSara'}->do(qq{CREATE TABLE variation_feature_old SELECT * FROM $var_dbname.variation_feature});
#$self->{'dbSara'}->do(qq{CREATE TABLE flanking_sequence_old SELECT * FROM $var_dbname.flanking_sequence});
$self->{'dbVar'}->do(qq{CREATE TABLE uniq_var_id_tmp_gtype SELECT DISTINCT variation_id FROM tmp_individual_genotype_single_bp});
$self->{'dbVar'}->do(qq{ALTER TABLE uniq_var_id_tmp_gtype ADD INDEX variation_idx(variation_id)});
debug("delete from variation table...");
$self->{'dbVar'}->do(qq{DELETE FROM v USING variation v LEFT JOIN uniq_var_id_tmp_gtype u ON v.variation_id = u.variation_id
WHERE u.variation_id IS NULL
});
debug("delete from variation_feature table...");
$self->{'dbVar'}->do(qq{DELETE FROM vf USING variation_feature vf LEFT JOIN uniq_var_id_tmp_gtype u ON vf.variation_id = u.variation_id
WHERE u.variation_id IS NULL
});
debug("delete from flanking_sequence table...");
$self->{'dbVar'}->do(qq{DELETE FROM f USING flanking_sequence f LEFT JOIN uniq_var_id_tmp_gtype u ON f.variation_id = u.variation_id
WHERE u.variation_id IS NULL
});
}
sub remove_faulse_variation_in_multiple_strains {
my $self = shift;
$self->{'dbVar'}->do(qq{CREATE TABLE varid_remove
SELECT t.variation_id,group_concat(t.allele_1) as allele_1,group_concat(t.allele_2) as allele_2
FROM tmp_individual_genotype_single_bp t
GROUP BY variation_id
});
$self->{'dbVar'}->do(qq{CREATE TABLE varid_remove1
SELECT * FROM varid_remove
WHERE allele_1=allele_2
});
$self->{'dbVar'}->do(qq{CREATE TABLE varid_same_as_ref
SELECT DISTINCT v.variation_id from varid_remove1 v, variation_feature vf
WHERE substring(v.allele_1,1,1) like substring(vf.allele_string,1,1)
AND substring(v.allele_1,3,1) like substring(vf.allele_string,1,1)
AND substring(v.allele_1,5,1) like substring(vf.allele_string,1,1)
AND v.variation_id=vf.variation_id
AND length(v.allele_1)=5
});
$self->{'dbVar'}->do(qq{INSERT INTO varid_same_as_ref
SELECT DISTINCT v.variation_id from varid_remove1 v, variation_feature vf
WHERE substring(v.allele_1,1,1) like substring(vf.allele_string,1,1)
AND substring(v.allele_1,3,1) like substring(vf.allele_string,1,1)
AND v.variation_id=vf.variation_id
AND length(v.allele_1)=3
});
$self->{'dbVar'}->do(qq{INSERT INTO varid_same_as_ref
SELECT DISTINCT v.variation_id from varid_remove1 v, variation_feature vf
WHERE substring(v.allele_1,1,1) like substring(vf.allele_string,1,1)
AND v.variation_id=vf.variation_id
AND length(v.allele_1)=1
});
$self->{'dbVar'}->do(qq{ALTER TABLE varid_same_as_ref ADD INDEX variation_id(variation_id)});
#remove these variation_id from variation/variation_feature/flanking_sequence/allele/tmp_gtype tables
debug("delete from variation table...");
$self->{'dbVar'}->do(qq{DELETE FROM v USING variation v, varid_same_as_ref u
WHERE v.variation_id = u.variation_id
});
debug("delete from variation_feature table...");
$self->{'dbVar'}->do(qq{DELETE FROM vf USING variation_feature vf, varid_same_as_ref u
WHERE vf.variation_id = u.variation_id
});
debug("delete from flanking_sequence table...");
$self->{'dbVar'}->do(qq{DELETE FROM f USING flanking_sequence f, varid_same_as_ref u
WHERE f.variation_id = u.variation_id
});
debug("delete from allele table...");
$self->{'dbVar'}->do(qq{DELETE FROM v USING allele a, varid_same_as_ref u
WHERE a.variation_id = u.variation_id
});
debug("delete from tmp_individual_genotype_single_bp table...");
$self->{'dbVar'}->do(qq{DELETE FROM v USING tmp_individual_genotype_single_bp t, varid_same_as_ref u
WHERE t.variation_id = u.variation_id
});
debug("Drop tables varid_remove varid_remove1 varid_same_as_ref");
#$self->{'dbVar'}->do(qq{DROP TABLES varid_remove varid_remove1 varid_same_as_ref});
}
sub read_coverage {
#not used here
my $self = shift;
my $var_dbname = shift;
my $tmp_dir = $self->{'tmpdir'};
my $tmp_file = $self->{'tmpfile'};
my $species = $self->{'species'};
#my $alignment_file ="/turing/mouse129_extra/yuan/human_celera/analysis/chimp/ALIGNMENT_FILE";
my $alignment_file ="/lustre/work1/ensembl/yuan/rat/STAR/output_dir/ALIGNMENT_FILE";
my $sth = $self->{'dbCore'}->prepare(qq{SELECT sr.seq_region_id,sr.name
FROM seq_region_attrib sra, attrib_type at, seq_region sr
WHERE sra.attrib_type_id=at.attrib_type_id
AND at.code="toplevel"
AND sr.seq_region_id = sra.seq_region_id
});
my ($seq_region_id,$seq_region_name,%rec_seq_region);
$sth->execute();
$sth->bind_columns(\$seq_region_id,\$seq_region_name);
while ($sth->fetch()) {
$rec_seq_region{$seq_region_id}=$seq_region_name;
}
foreach my $seq_region_id (keys %rec_seq_region) {
my $seq_region_name = $rec_seq_region{$seq_region_id};
my $call = "bsub -q bigmem -R'select[mem>3000] rusage[mem=3000]' -J $var_dbname\_read_coverage_job_$seq_region_id -o /$tmp_dir/read_coverage_out\_$seq_region_id /usr/local/bin/perl /nfs/acari/yuan/ensembl/src/ensembl-variation/scripts/import/parallel_sara_feature.pl -species $species -seq_region_name $seq_region_name -job read_coverage -alignment_file $alignment_file -tmpdir $tmp_dir -tmpfile $tmp_file";
print "call is $call\n";
system($call);
}
my $call1 = "bsub -q normal -K -w 'done($var_dbname\_read_coverage_job*)' -J waiting_process sleep 1"; #waits until all variation features have finished to continue
system($call1);
}
sub merge_tables {
my ($self) = @_;
#foreach my $table ("combine_feature","snp_pos","flanking_qual","gtype_allele","failed_flanking_qual","failed_gtype","tmp_individual_genotype_single_bp") {
foreach my $table ("gtype_allele","failed_gtype","tmp_individual_genotype_single_bp") {
debug("Merge table $table...");
my $single_table_ref = $self->{'dbSara'}->db_handle->selectall_arrayref(qq{SHOW tables like "$table\_%"});
my @tables = map {$_->[0] } @$single_table_ref;
foreach my $t(@tables) {
if ($t eq $table) {
$self->{'dbSara'}->do(qq{DROP TABLE $table});
}
}
$self->{'dbSara'}->do(qq{CREATE TABLE $table like $tables[0]});
my $table_names = join ',',@tables;
#print "table_names is $table_names\n";
$self->{'dbSara'}->do(qq{ALTER TABLE $table engine = merge union($table_names) insert_method=last});
}
}
sub remove_tables {
my ($self) = @_;
#foreach my $table ("combine_feature","snp_pos","failed_flanking_qual","flanking_qual","gtype_allele","failed_gtype","tmp_individual_genotype_single_bp") {
foreach my $table ("gtype_allele","failed_gtype","tmp_individual_genotype_single_bp") {
my $single_table_ref = $self->{'dbSara'}->db_handle->selectall_arrayref(qq{SHOW tables like "$table%"});
my @tables = map {$_->[0] } @$single_table_ref;
my $table_names = join ',',@tables;
print "table_names is $table_names\n";
$self->{'dbSara'}->do(qq{DROP TABLES $table_names});
}
}
sub remove_empty_tables {
my ($self) = @_;
#foreach my $table ("combine_feature","snp_pos","failed_flanking_qual","flanking_qual","gtype_allele","failed_gtype","tmp_individual_genotype_single_bp") {
foreach my $table ("flanking_qual","failed_flanking_qual") {
my $single_table_ref = $self->{'dbSara'}->db_handle->selectall_arrayref(qq{SHOW tables like "$table%"});
my @tables = map {$_->[0] } @$single_table_ref;
my @empty_tables;
foreach my $table (@tables) {
my $table_row_ref = $self->{'dbSara'}->db_handle->selectall_arrayref(qq{SELECT COUNT(*) FROM $table});
#print "table is $table\n";
my $table_row = $table_row_ref->[0][0];
if (! $table_row) {
push @empty_tables, $table;
}
}
my $table_names = join ',',@empty_tables;
print "empty_table_names is $table_names\n";
$self->{'dbSara'}->do(qq{DROP TABLES $table_names});
}
}
sub merge_table_with_same_seq_region_id {
my ($self) = @_;
#foreach my $table ("combine_feature","snp_pos","flanking_qual","gtype_allele","failed_qual","failed_gtype","tmp_individual_genotype_single_bp") {
foreach my $table ("flanking_qual","failed_qual") {
my $single_table_ref = $self->{'dbSara'}->db_handle->selectall_arrayref(qq{SHOW tables like "$table\_%"});
my @tables = map {$_->[0] } @$single_table_ref;
my $main_tab_name;
$self->{'dbSara'}->do(qq{CREATE TABLE IF NOT EXISTS $main_tab_name like $tables[0]});
foreach my $tab (@tables) {
if ($tab =~ /(.*\_.*).*\_(\d+)(\_\d+)$/) {
$main_tab_name = $1.$3;
$self->{'dbSara'}->do(qq{CREATE TABLE IF NOT EXISTS $main_tab_name like $tab});
print "table is $tab and main table is $main_tab_name\n";
#$self->{'dbSara'}->do(qq{INSERT INTO $main_tab_name select * from $tab});
#$self->{'dbSara'}->do(qq{DROP TABLE $tab});
}
}
}
}
1;