Raw content of SeqStoreConverter::BasicConverter
# Convert release 18/19-era schemas to use new non-clone/contig schema
use strict;
use warnings;
use DBI;
package SeqStoreConverter::BasicConverter;
###############################################################################
# Constructor
###############################################################################
sub new {
my ( $class, $user, $pass, $host, $source, $target, $schema, $vega_schema, $force, $verbose, $limit ) = @_;
my $self = bless {}, $class;
my $port;
($host, $port) = split(/:/, $host);
$port ||= 3306;
my $dbh = DBI->connect( "DBI:mysql:host=$host:port=$port", $user, $pass,
{'RaiseError' => 1});
$self->verbose( $verbose );
$self->dbh( $dbh );
$self->force( $force );
$self->source( $source );
$self->target( $target );
$self->schema( $schema );
$self->vegaschema( $vega_schema);
$self->host( $host );
$self->password( $pass);
$self->user($user);
$self->port($port);
$self->limit($limit);
#check to see if the destination and source databases exist already.
my %dbs = map {$_->[0] => 1} @{$dbh->selectall_arrayref('show databases')};
if( !$dbs{$source} ) {
die ("Source db $source does not exist" );
}
if( $dbs{$target} ) {
if( $force ) {
$dbh->do( "drop database $target" );
} else {
die("Target db $target already exists. Use -force option to overwrite.");
}
}
$dbh->do( "create database ".$self->target() );
$self->debug("Building schema for $target from $schema");
die "Cannot open $schema" if (! -e $schema);
my $cmd = "/usr/local/mysql/bin/mysql -u $user -p$pass -P $port -h $host $target < $schema";
system ($cmd);
if ($vega_schema) {
$self->debug("Adding vega tables for $target");
die "Cannot open vega creation script" if (! -e $vega_schema);
my $cmd = "/usr/local/mysql/bin/mysql -u $user -p$pass -P $port -h $host $target < $vega_schema";
system ($cmd);
}
$self->debug("Creating temporary tables");
#create a temporary table to store the mapping of old ids to new ids
$dbh->do
("CREATE TEMPORARY TABLE $target.tmp_cln_map (" .
"old_id INT, new_id INT, INDEX new_idx (new_id))");
#create a temp table which will store the mapping of old chromosome
#identifiers to new identifiers
$dbh->do("CREATE TEMPORARY TABLE $target.tmp_chr_map (" .
" old_id INT, new_id INT,".
" INDEX new_idx (new_id))");
#create a temporary table to hold old supercontig name -> new id mappings
$dbh->do("CREATE TEMPORARY TABLE $target.tmp_superctg_map (" .
"name VARCHAR(255), new_id INT, ".
"INDEX new_idx (new_id))");
return $self;
}
###############################################################################
# Getter/Setters for converter properties
###############################################################################
sub force {
my $self = shift;
$self->{'force'} = shift if(@_);
return $self->{'force'};
}
sub dbh {
my $self = shift;
$self->{'dbh'} = shift if(@_);
return $self->{'dbh'};
}
sub user {
my $self = shift;
$self->{'user'} = shift if(@_);
return $self->{'user'};
}
sub host {
my $self = shift;
$self->{'host'} = shift if(@_);
return $self->{'host'};
}
sub port {
my $self = shift;
$self->{'port'} = shift if(@_);
return $self->{'port'};
}
sub password {
my $self = shift;
$self->{'password'} = shift if(@_);
return $self->{'password'};
}
sub verbose {
my $self = shift;
$self->{'verbose'} = shift if (@_);
return $self->{'verbose'};
}
sub schema {
my $self = shift;
$self->{'schema'} = shift if (@_);
return $self->{'schema'};
}
sub vegaschema {
my $self = shift;
$self->{'vega_schema'} = shift if (@_);
return $self->{'vega_schema'};
}
sub source {
my $self = shift;
$self->{'source'} = shift if(@_);
return $self->{'source'};
}
sub target {
my $self = shift;
$self->{'target'} = shift if(@_);
return $self->{'target'};
}
sub limit {
my $self = shift;
$self->{'limit'} = shift if(@_);
return $self->{'limit'};
}
###############################################################################
# Utility methods
###############################################################################
sub debug {
my $self = shift;
my $str = shift;
print STDERR $str . "\n" if $self->verbose();
return;
}
sub copy_tables {
my ($self, @tables) = @_;
foreach my $table (@tables) {
$self->debug("Copying $table");
my $source = $self->source();
my $target = $self->target();
eval {
my $sth = $self->dbh()->prepare
("INSERT INTO $target.$table SELECT * FROM $source.$table");
$sth->execute();
$sth->finish();
};
if($@) {
warn("Copy of table $table failed: $@\n");
}
}
return;
}
sub get_coord_system_id {
my $self = shift;
my $cs_name = shift;
my $cs_version = shift;
my $target = $self->target();
my @bind_vals = ($cs_name);
my $sql = "SELECT cs.coord_system_id " .
"FROM $target.coord_system cs " .
"WHERE cs.name = ?";
if($cs_version) {
push(@bind_vals, $cs_version);
$sql .= " AND cs.version = ?";
}
my $sth = $self->dbh()->prepare($sql);
$sth->execute(@bind_vals);
if($sth->rows() != 1) {
die("Id for non-existant or ambiguous coord system requested " .
"$cs_name:$cs_version");
}
my ($id) = $sth->fetchrow_array();
$sth->finish();
return $id;
}
sub get_default_assembly {
my $self = shift;
my $source = $self->source();
my $sth = $self->dbh->prepare
("SELECT meta_value FROM $source.meta WHERE meta_key='assembly.default'");
$sth->execute();
if(!$sth->rows() == 1) {
die("This species has an ambiguous or non-existant assembly.default" .
" in the meta table");
}
my ($result) = $sth->fetchrow_array();
return $result;
}
sub contig_to_seq_region {
my $self = shift;
my $target_cs_name = shift;
my $target = $self->target();
my $source = $self->source();
my $dbh = $self->dbh();
$target_cs_name ||= 'contig';
$self->debug("Transforming contigs into $target_cs_name seq_regions");
my $cs_id = $self->get_coord_system_id($target_cs_name);
my $sth = $dbh->prepare
("INSERT INTO $target.seq_region " .
"SELECT contig_id, name, $cs_id, length FROM $source.contig");
$sth->execute();
$sth->finish();
return;
}
sub clone_to_seq_region {
my $self = shift;
my $target_cs_name = shift;
my $target = $self->target();
my $source = $self->source();
my $dbh = $self->dbh();
# target coord_system will have a different ID
$target_cs_name ||= "clone";
my $cs_id = $self->get_coord_system_id($target_cs_name);
$self->debug("Transforming clones into $target_cs_name seq_regions");
my $select_sth = $dbh->prepare
("SELECT cl.clone_id,
CONCAT(cl.embl_acc, '.', cl.embl_version),
MAX(ctg.embl_offset+ctg.length-1)
FROM $source.clone cl, $source.contig ctg
WHERE cl.clone_id = ctg.clone_id GROUP BY ctg.clone_id");
$select_sth->execute();
my ($clone_id, $embl_acc, $length);
$select_sth->bind_columns(\$clone_id, \$embl_acc, \$length);
my $insert_sth = $dbh->prepare
("INSERT INTO $target.seq_region (name, coord_system_id, length) " .
"VALUES(?,?,?)");
my $tmp_insert_sth = $dbh->prepare
("INSERT INTO $target.tmp_cln_map (old_id, new_id) VALUES (?, ?)");
while ($select_sth->fetch()) {
$insert_sth->execute("$embl_acc", $cs_id, $length);
#store mapping of old -> new ids in temp table
$tmp_insert_sth->execute($clone_id, $insert_sth->{'mysql_insertid'});
}
$select_sth->finish();
$insert_sth->finish();
$tmp_insert_sth->finish();
return;
}
sub chromosome_to_seq_region {
my $self = shift;
my $target_cs_name = shift;
my $target = $self->target();
my $source = $self->source();
my $dbh = $self->dbh();
$target_cs_name ||= "chromosome";
my $cs_id = $self->get_coord_system_id($target_cs_name);
$self->debug("Transforming chromosomes into $target_cs_name seq_regions");
my $select_sth = $dbh->prepare
("SELECT chromosome_id, name, length FROM $source.chromosome");
my $insert_sth = $dbh->prepare
("INSERT INTO $target.seq_region (name, coord_system_id, length) " .
"VALUES (?,?,?)");
my $tmp_insert_sth = $dbh->prepare
("INSERT INTO $target.tmp_chr_map (old_id, new_id) VALUES (?, ?)");
$select_sth->execute();
my ($chrom_id, $name, $length);
$select_sth->bind_columns(\$chrom_id, \$name, \$length);
while ($select_sth->fetch()) {
#insert into seq_region table
$insert_sth->execute($name, $cs_id, $length);
#copy old/new mapping into temporary table
$tmp_insert_sth->execute($chrom_id, $insert_sth->{'mysql_insertid'});
}
$select_sth->finish();
$insert_sth->finish();
$tmp_insert_sth->finish();
return;
}
sub supercontig_to_seq_region {
my $self = shift;
my $target_cs_name = shift || "supercontig";
my $target = $self->target();
my $source = $self->source();
my $dbh = $self->dbh();
$self->debug("Transforming supercontigs into $target_cs_name seq_regions");
my $cs_id = $self->get_coord_system_id($target_cs_name);
my $select_sth = $dbh->prepare
("SELECT superctg_name, " .
"MAX(superctg_end) AS length " .
"FROM $source.assembly " .
"GROUP BY superctg_name");
my $insert_sth = $dbh->prepare
("INSERT INTO $target.seq_region (name, coord_system_id, length) " .
"VALUES (?,?,?)");
my $tmp_insert_sth = $dbh->prepare
("INSERT INTO $target.tmp_superctg_map (name, new_id) VALUES (?, ?)");
my ($name, $length);
$select_sth->execute();
$select_sth->bind_columns(\$name, \$length);
while ($select_sth->fetch()) {
$insert_sth->execute($name, $cs_id, $length);
$tmp_insert_sth->execute($name, $insert_sth->{'mysql_insertid'});
}
$select_sth->finish();
$insert_sth->finish();
$tmp_insert_sth->finish();
return;
}
sub assembly_contig_chromosome {
my $self = shift;
$self->debug("Building assembly table - contig/chromosome");
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
$dbh->do(
"INSERT INTO $target.assembly " .
"SELECT tcm.new_id, " . # asm_seq_region_id (old-new chromosome ID mapping)
"a.contig_id, " . # cmp_seq_region_id
"a.chr_start, " . # asm_start
"a.chr_end, " . # asm_end
"a.contig_start, " . # cmp_start
"a.contig_end, " . # cmp_end
"a.contig_ori " . # ori
"FROM $target.tmp_chr_map tcm, $source.assembly a, $source.contig c " .
"WHERE tcm.old_id = a.chromosome_id " .
"AND c.contig_id = a.contig_id "); # only copy assembly entries that
# refer to valid contigs (test db has
# superfluous assembly entries)
}
sub assembly_contig_clone {
my $self = shift;
$self->debug("Building assembly table - contig/clone");
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
$dbh->do(
"INSERT INTO $target.assembly " .
"SELECT tcm.new_id, " . # asm_seq_region_id (old-new clone ID mapping)
"ctg.contig_id, ". # cmp_seq_region_id
"ctg.embl_offset, " . # asm_start
"ctg.embl_offset+ctg.length-1, " . # asm_end
"1, " . # cmp_start
"ctg.length, " . # cmp_end
"1 " . # ori - contig always positively oriented on the clone
"FROM $target.tmp_cln_map tcm, " .
"$source.clone cln, $source.contig ctg " .
"WHERE tcm.old_id = cln.clone_id " .
"AND cln.clone_id = ctg.clone_id");
}
sub assembly_contig_supercontig {
my $self = shift;
$self->debug("Building assembly table - contig/supercontig");
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
$dbh->do(
"INSERT INTO $target.assembly " .
"SELECT tsm.new_id, " . # asm_seq_region_id (superctg name-sr_id mapping)
"a.contig_id, " . # cmp_seq_region_id
"a.superctg_start, " . # asm_start
"a.superctg_end, " . # asm_end
"a.contig_start, " . # cmp_start
"a.contig_end, " . # cmp_end
"a.contig_ori " . # ori
"FROM $target.tmp_superctg_map tsm, $source.assembly a, $source.contig c ".
"WHERE tsm.name = a.superctg_name " .
"AND c.contig_id = a.contig_id "); # only copy assembly entries that
# refer to valid contigs (test db might
# have these)
}
sub assembly_supercontig_chromosome {
my $self = shift;
$self->debug("Building assembly table - supercontig/chromosome");
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
$dbh->do(
"INSERT INTO $target.assembly " .
"SELECT tcm.new_id, " . # asm_seq_region_id (chr id)
"tsm.new_id, " . # cmp_seq_region_id (supercontig id)
"min(a.chr_start), " . # asm_start
"max(a.chr_end), " . # asm_end
"min(a.superctg_start), " . # cmp_start
"max(a.superctg_end), " . # cmp_end
"a.superctg_ori " . # ori
"FROM $target.tmp_superctg_map tsm, $target.tmp_chr_map tcm, " .
" $source.assembly a ".
"WHERE tsm.name = a.superctg_name " .
"AND tcm.old_id = a.chromosome_id " .
"GROUP BY superctg_name");
}
###############################################################################
# Base class implementations of transfer methods. Can be overridden to
# create species specific behaviour
###############################################################################
sub create_coord_systems {
my $self = shift;
my $target = $self->target();
my $dbh = $self->dbh();
my $ass_def = $self->get_default_assembly();
my @coords =
(["chromosome" , $ass_def, "default_version" ,1],
["supercontig", undef , "default_version" ,2],
["clone" , undef , "default_version" ,3],
["contig", undef , "default_version,sequence_level",4]);
my @assembly_mappings = ("chromosome:$ass_def|contig",
"clone|contig",
"supercontig|contig",
"supercontig|contig|clone",
"chromosome:$ass_def|contig|clone",
"chromosome:$ass_def|contig|supercontig");
$self->debug("Building coord_system table");
my $sth = $dbh->prepare("INSERT INTO $target.coord_system " .
"(name, version, attrib,rank) VALUES (?,?,?,?)");
my %coord_system_ids;
foreach my $cs (@coords) {
$sth->execute(@$cs);
$coord_system_ids{$cs->[0]} = $sth->{'mysql_insertid'};
}
$sth->finish();
$self->debug("Adding assembly.mapping entries to meta table");
$sth = $dbh->prepare("INSERT INTO $target.meta(meta_key, meta_value) " .
"VALUES ('assembly.mapping', ?)");
foreach my $mapping (@assembly_mappings) {
$sth->execute($mapping);
}
$sth->finish();
return;
}
#
# populates the contents of the meta_coord table
# must be executed after all of the feature tables in the target database
# have already been populated
#
sub create_meta_coord {
my $self = shift;
$self->debug("Building meta_coord table");
my $target = $self->target();
my $dbh = $self->dbh();
my @feature_tables = qw(density_feature
dna_align_feature
exon
gene
karyotype
marker_feature
misc_feature
prediction_exon
prediction_transcript
protein_align_feature
repeat_feature
simple_feature
transcript);
foreach my $ft (@feature_tables) {
$dbh->do(qq{INSERT INTO $target.meta_coord(table_name, coord_system_id,
max_length)
SELECT '$ft', sr.coord_system_id,
MAX(f.seq_region_end - f.seq_region_start + 1)
FROM $target.$ft f, $target.seq_region sr
WHERE sr.seq_region_id = f.seq_region_id
GROUP BY sr.coord_system_id});
}
# special case for assembly exception, features are created from both
# sides of table
$dbh->do(qq{INSERT INTO $target.meta_coord
SELECT 'assembly_exception', sr.coord_system_id,
MAX(IF(ae.seq_region_end - ae.seq_region_start > ae.exc_seq_region_end - ae.exc_seq_region_start, ae.seq_region_end - ae.seq_region_start + 1, ae.exc_seq_region_end - ae.exc_seq_region_start + 1))
FROM $target.assembly_exception ae, $target.seq_region sr
WHERE sr.seq_region_id = ae.seq_region_id
GROUP BY sr.coord_system_id});
}
sub create_seq_regions {
my $self = shift;
my $target = $self->target();
my $dbh = $self->dbh();
#default behaviour is to simply copy all tables as they come
$self->contig_to_seq_region('contig');
$self->chromosome_to_seq_region();
$self->supercontig_to_seq_region();
$self->clone_to_seq_region();
return;
}
sub create_assembly {
my $self = shift;
$self->assembly_contig_chromosome();
$self->assembly_contig_clone();
$self->assembly_contig_supercontig();
return;
}
sub transfer_dna {
my $self = shift;
my $target = $self->target();
my $source = $self->source();
my $dbh = $self->dbh();
$self->debug("Building dna table");
$dbh->do("INSERT INTO $target.dna " .
"SELECT c.contig_id as seq_region_id, d.sequence as sequence " .
"FROM $source.dna d, $source.contig c " .
"WHERE c.dna_id = d.dna_id");
return;
}
sub transfer_genes {
my $self = shift;
my $target = $self->target();
my $source = $self->source();
my $dbh = $self->dbh();
#
# Transfer the gene table
#
$self->debug("Building gene table");
$dbh->do
("INSERT INTO $target.gene " .
"SELECT g.gene_id, g.type, g.analysis_id, tcm.new_id, " .
"MIN(IF (a.contig_ori=1,(e.contig_start+a.chr_start-a.contig_start)," .
" (a.chr_start+a.contig_end-e.contig_end ))) as start, " .
"MAX(IF (a.contig_ori=1,(e.contig_end+a.chr_start-a.contig_start), " .
" (a.chr_start+a.contig_end-e.contig_start))) as end, " .
" a.contig_ori*e.contig_strand as strand, " .
" g.display_xref_id " .
"FROM $source.transcript t, $source.exon_transcript et, " .
" $source.exon e, $source.assembly a, $source.gene g, " .
" $target.tmp_chr_map tcm " .
"WHERE t.transcript_id = et.transcript_id " .
"AND et.exon_id = e.exon_id " .
"AND e.contig_id = a.contig_id " .
"AND g.gene_id = t.gene_id " .
"AND a.chromosome_id = tcm.old_id " .
"GROUP BY g.gene_id");
#
# Transfer the transcript table
#
$self->debug("Building transcript table");
$dbh->do
("INSERT INTO $target.transcript " .
"SELECT t.transcript_id, t.gene_id, tcm.new_id, " .
"MIN(IF (a.contig_ori=1,(e.contig_start+a.chr_start-a.contig_start)," .
" (a.chr_start+a.contig_end-e.contig_end ))) as start, " .
"MAX(IF (a.contig_ori=1,(e.contig_end+a.chr_start-a.contig_start), " .
" (a.chr_start+a.contig_end-e.contig_start))) as end, " .
" a.contig_ori*e.contig_strand as strand, " .
" t.display_xref_id " .
"FROM $source.transcript t, $source.exon_transcript et, " .
" $source.exon e, $source.assembly a, $target.tmp_chr_map tcm " .
"WHERE t.transcript_id = et.transcript_id " .
"AND et.exon_id = e.exon_id " .
"AND e.contig_id = a.contig_id " .
"AND a.chromosome_id = tcm.old_id " .
"GROUP BY t.transcript_id");
#
# Transfer the exon table
#
$self->debug("Building exon table");
$dbh->do
("INSERT INTO $target.exon " .
"SELECT e.exon_id, tcm.new_id, " .
"MIN(IF (a.contig_ori=1,(e.contig_start+a.chr_start-a.contig_start)," .
" (a.chr_start+a.contig_end-e.contig_end ))) as start, " .
"MAX(IF (a.contig_ori=1,(e.contig_end+a.chr_start-a.contig_start), " .
" (a.chr_start+a.contig_end-e.contig_start))) as end, " .
" a.contig_ori*e.contig_strand as strand, " .
" e.phase, e.end_phase " .
"FROM $source.transcript t, $source.exon_transcript et, " .
" $source.exon e, $source.assembly a, $source.gene g, " .
" $target.tmp_chr_map tcm " .
"WHERE t.transcript_id = et.transcript_id " .
"AND et.exon_id = e.exon_id " .
"AND e.contig_id = a.contig_id " .
"AND g.gene_id = t.gene_id " .
"AND a.chromosome_id = tcm.old_id " .
"GROUP BY e.exon_id");
#
# Transfer translation table
#
$self->debug("Building translation table");
$dbh->do
("INSERT INTO $target.translation " .
"SELECT tl.translation_id, ts.transcript_id, tl.seq_start, " .
" tl.start_exon_id, tl.seq_end, tl.end_exon_id " .
"FROM $source.transcript ts, $source.translation tl " .
"WHERE ts.translation_id = tl.translation_id");
return;
}
sub transfer_prediction_transcripts {
my $self = shift;
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
# prediction_transcript / prediction_exon
$self->debug( "Building prediction_exon table" );
$dbh->do
("INSERT INTO $target.prediction_exon ".
"( prediction_transcript_id, seq_region_id, seq_region_start, " .
" seq_region_end, seq_region_strand, start_phase, score, p_value," .
" exon_rank ) " .
"SELECT prediction_transcript_id, contig_id, contig_start, contig_end, " .
" contig_strand, start_phase, score, p_value, exon_rank " .
"FROM $source.prediction_transcript" );
$self->debug("Building prediction_transcript table");
$dbh->do
("INSERT INTO $target.prediction_transcript ".
"( prediction_transcript_id, seq_region_id, seq_region_start, " .
"seq_region_end, seq_region_strand, analysis_id ) " .
"SELECT prediction_transcript_id, contig_id, MIN(contig_start), " .
" MAX(contig_end), contig_strand, analysis_id ".
"FROM $source.prediction_transcript " .
"GROUP BY prediction_transcript_id ");
return;
}
sub transfer_features {
my $self = shift;
my $target = $self->target();
my $source = $self->source();
my $dbh = $self->dbh();
my $limit = '';
if($self->limit()) {
$limit = ' limit ' . $self->limit();
}
#
# Feature tables
# Note that we can just rename contig_* to set_region_* since the
# contig IDs were copied verbatim into seq_region
#
# For some reason mysql occasionally refuses to use the index on large
# tables following copies like the following.
# So: drop the indexes first and then re-add them after
# simple_feature
$self->debug("Building simple_feature table");
$dbh->do
("INSERT INTO $target.simple_feature (simple_feature_id, seq_region_id, ".
" seq_region_start, seq_region_end, seq_region_strand, " .
" display_label, analysis_id, score) " .
"SELECT simple_feature_id, contig_id, contig_start, contig_end, " .
" contig_strand, display_label, analysis_id, score " .
"FROM $source.simple_feature $limit");
# repeat_feature
$self->debug("Dropping indexes on repeat_feature");
$dbh->do("ALTER TABLE $target.repeat_feature DROP INDEX seq_region_idx");
$dbh->do("ALTER TABLE $target.repeat_feature DROP INDEX repeat_idx");
$dbh->do("ALTER TABLE $target.repeat_feature DROP INDEX analysis_idx");
$self->debug("Building repeat_feature table");
$dbh->do
("INSERT INTO $target.repeat_feature (repeat_feature_id, seq_region_id, " .
" seq_region_start, seq_region_end, seq_region_strand, analysis_id, " .
" repeat_start, repeat_end, repeat_consensus_id, score) " .
"SELECT repeat_feature_id, contig_id, contig_start, contig_end, " .
" contig_strand, analysis_id, repeat_start, repeat_end, " .
" repeat_consensus_id, score FROM $source.repeat_feature $limit");
$self->debug("Reading indexes on repeat_feature");
$dbh->do("ALTER TABLE $target.repeat_feature " .
"ADD INDEX seq_region_idx( seq_region_id, seq_region_start)");
$dbh->do("ALTER TABLE $target.repeat_feature " .
"ADD INDEX repeat_idx( repeat_consensus_id )");
$dbh->do("ALTER TABLE $target.repeat_feature " .
"ADD INDEX analysis_idx(analysis_id)");
# protein_align_feature
$self->debug("Dropping indexes on protein_align_feature");
$dbh->do("ALTER TABLE $target.protein_align_feature DROP INDEX hit_idx");
$dbh->do( "ALTER TABLE $target.protein_align_feature " .
"DROP INDEX seq_region_idx");
$self->debug("Building protein_align_feature table");
$dbh->do( "INSERT INTO $target.protein_align_feature " .
" (protein_align_feature_id, seq_region_id, seq_region_start, " .
" seq_region_end, seq_region_strand, analysis_id, hit_start, " .
" hit_end, hit_name, cigar_line, evalue, perc_ident, score) " .
"SELECT protein_align_feature_id, contig_id, contig_start, " .
" contig_end, contig_strand, analysis_id, hit_start, " .
" hit_end, hit_name, cigar_line, evalue, perc_ident, score ".
"FROM $source.protein_align_feature $limit");
$self->debug("Reading indexes on protein_align_feature");
$dbh->do( qq{ALTER TABLE $target.protein_align_feature
ADD index seq_region_idx( analysis_id, seq_region_id,
seq_region_start, score )});
$dbh->do( "ALTER TABLE $target.protein_align_feature " .
"ADD index hit_idx(hit_name)");
# dna_align_feature
$self->debug("Dropping indexes on dna_align_feature");
$dbh->do( "ALTER TABLE $target.dna_align_feature DROP INDEX seq_region_idx");
$dbh->do( "ALTER TABLE $target.dna_align_feature DROP INDEX hit_idx");
$self->debug("Building dna_align_feature table");
$dbh->do( "INSERT INTO $target.dna_align_feature " .
" (dna_align_feature_id, seq_region_id, seq_region_start, ".
" seq_region_end, seq_region_strand, analysis_id, " .
" hit_start, hit_end, hit_name, hit_strand, cigar_line, " .
" evalue, perc_ident, score) " .
"SELECT dna_align_feature_id, contig_id, contig_start, " .
" contig_end, contig_strand, analysis_id, hit_start, " .
" hit_end, hit_name, hit_strand, cigar_line, evalue, " .
" perc_ident, score FROM $source.dna_align_feature $limit");
$self->debug("Reading indexes on dna_align_feature");
$dbh->do( qq{ALTER TABLE $target.dna_align_feature
ADD INDEX seq_region_idx(seq_region_id, analysis_id,
seq_region_start, score)});
$dbh->do( "ALTER TABLE $target.dna_align_feature " .
"ADD index hit_idx(hit_name)");
# marker_feature
$self->debug("Building marker_feature table");
$dbh->do( "INSERT INTO $target.marker_feature " .
" (marker_feature_id, marker_id, seq_region_id, " .
" seq_region_start, seq_region_end, analysis_id, " .
" map_weight) " .
"SELECT marker_feature_id, marker_id, contig_id, contig_start, ".
" contig_end, analysis_id, map_weight " .
"FROM $source.marker_feature $limit");
# qtl_feature
# Note this uses chromosome coords so we have to join with tmp_chr_map to
# get the mapping
$self->debug("Building qtl_feature table");
$dbh->do
("INSERT INTO $target.qtl_feature( seq_region_id, seq_region_start, " .
" seq_region_end, qtl_id, analysis_id) " .
"SELECT tcm.new_id, " .
" q.start, q.end, q.qtl_id, q.analysis_id " .
"FROM $target.tmp_chr_map tcm, $source.qtl_feature q " .
"WHERE tcm.old_id = q.chromosome_id $limit");
# These tables now have seq_region_* instead of chromosome_*
$self->debug("Building karyotype table");
$dbh->do(
"INSERT INTO $target.karyotype " .
"SELECT null, tcm.new_id, " .
" k.chr_start, k.chr_end, k.band, k.stain " .
"FROM $target.tmp_chr_map tcm, $source.karyotype k " .
"WHERE tcm.old_id = k.chromosome_id $limit");
$self->debug("Building marker_map_location table");
$dbh->do(
"INSERT INTO $target.marker_map_location " .
"SELECT mml.marker_id, mml.map_id, " .
" c.name, " .
" mml.marker_synonym_id, mml.position, mml.lod_score " .
"FROM $source.chromosome c, $source.marker_map_location mml " .
"WHERE c.chromosome_id = mml.chromosome_id $limit");
$self->debug( "Building misc_feature table" );
$dbh->do
("INSERT INTO $target.misc_feature( misc_feature_id, seq_region_id, " .
" seq_region_start, seq_region_end, seq_region_strand ) " .
"SELECT m.mapfrag_id, sr.seq_region_id, m.seq_start, m.seq_end, " .
" m.orientation " .
"FROM $source.mapfrag m, $target.seq_region sr, $source.dnafrag d " .
"WHERE m.dnafrag_id = d.dnafrag_id " .
"AND d.name = sr.name $limit" );
$self->debug( "Building misc_set table" );
$dbh->do
("INSERT INTO $target.misc_set( misc_set_id, code, name, description, " .
" max_length ) " .
"SELECT mapset_id, code, name, description, max_length " .
"FROM $source.mapset ms" );
$self->debug( "Building misc_attrib table" );
$dbh->do
("INSERT INTO $target.misc_attrib( misc_feature_id, attrib_type_id, " .
" value ) ".
"SELECT mapfrag_id, mapannotationtype_id, value " .
"FROM $source.mapannotation" );
$dbh->do
("INSERT INTO $target.misc_attrib( misc_feature_id, attrib_type_id, " .
" value) " .
"SELECT mf.mapfrag_id, at.attrib_type_id, mf.name " .
"FROM $source.mapfrag mf, $target.attrib_type at " .
"WHERE at.code = 'name'");
$dbh->do
("INSERT INTO $target.misc_attrib( misc_feature_id, attrib_type_id, " .
" value) " .
"SELECT mf.mapfrag_id, at.attrib_type_id, mf.type " .
"FROM $source.mapfrag mf, $target.attrib_type at " .
"WHERE at.code = 'type'");
$self->debug( "Building misc_feature_misc_set table" );
$dbh->do
("INSERT INTO $target.misc_feature_misc_set(misc_feature_id, misc_set_id)".
"SELECT mapfrag_id, mapset_id ".
"FROM $source.mapfrag_mapset $limit" );
return;
}
sub transfer_stable_ids {
my $self = shift;
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
$self->debug("Building stable id tables");
#
# Copy the stable id tables
# remove the unused created and modified dates from the stable ids
#
$dbh->do
("INSERT INTO $target.exon_stable_id " .
" (exon_id, stable_id, version) " .
"SELECT exon_id, stable_id, version " .
"FROM $source.exon_stable_id" );
$dbh->do
("INSERT INTO $target.gene_stable_id " .
" (gene_id, stable_id, version) " .
"SELECT gene_id, stable_id, version " .
"FROM $source.gene_stable_id" );
$self->copy_tables
("stable_id_event","mapping_session","transcript_stable_id",
"translation_stable_id","gene_archive","peptide_archive");
return;
}
sub transfer_vega_stable_ids {
my $self = shift;
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
$self->debug("Building vega_stable id tables");
# Copy the stable id tables
# add blank columns for created and modified dates for transcript and translations
$self->debug("Copying transcript_stable_id");
$dbh->do
("INSERT INTO $target.transcript_stable_id " .
" (transcript_id, stable_id, version) " .
"SELECT * " .
"FROM $source.transcript_stable_id" );
$self->debug("Copying translation_stable_id");
$dbh->do
("INSERT INTO $target.translation_stable_id " .
" (translation_id, stable_id, version) " .
"SELECT * " .
"FROM $source.translation_stable_id" );
# copy all other tables
$self->copy_tables
("gene_stable_id","exon_stable_id","stable_id_event",
"mapping_session","gene_archive","peptide_archive");
return;
}
sub transfer_meta {
my $self = shift;
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
$dbh->do("INSERT INTO $target.meta (meta_key, meta_value) " .
"SELECT m.meta_key, m.meta_value FROM $source.meta m " .
"ORDER BY meta_id");
return;
}
sub copy_other_tables {
my $self = shift;
#xref tables
$self->copy_tables("xref",
"go_xref",
"identity_xref",
"object_xref",
"external_db",
"external_synonym",
#marker/qtl related tables
"map",
"marker",
"marker_synonym",
"qtl",
"qtl_synonym",
#misc other tables
"supporting_feature",
"analysis",
"exon_transcript",
"interpro",
"gene_description",
"protein_feature");
}
sub copy_repeat_consensus {
my $self = shift;
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
$self->debug("Converting repeat_consensus table.");
$dbh->do("INSERT INTO $target.repeat_consensus " .
"(repeat_consensus_id, repeat_name, repeat_class, repeat_type, ".
" repeat_consensus) " .
"SELECT repeat_consensus_id, repeat_name, repeat_class, " .
" '', repeat_consensus " .
"FROM $source.repeat_consensus rc" );
return;
}
sub create_attribs {
my $self = shift;
my $source = $self->source();
my $target = $self->target();
my $dbh = $self->dbh();
#copy the attrib types from the mapannotation type table
$dbh->do
("INSERT INTO $target.attrib_type( attrib_type_id, code, " .
" name, description ) " .
"SELECT mapannotationtype_id, code, name, description " .
"FROM $source.mapannotationtype " );
$dbh->do
("INSERT INTO $target.attrib_type( code, name, description ) " .
"VALUES ('name', 'Name',''), ('type', 'Type of feature','')");
return;
}
#
# The process of actually identifying toplevel seq_regions using the info in
# the database is quite slow. Make the assumption that the coordsystem with
# lowest rank value is going to have all of the toplevel seq_regions.
#
# This method must be overridden if alternate behaviour is required.
#
sub set_top_level {
my $self = shift;
my $target = $self->target();
my $dbh = $self->dbh();
my $attrib_type_id = $self->add_attrib_code();
$self->debug("Setting toplevel attributes of seq_regions");
my $sth = $dbh->prepare("DELETE FROM $target.seq_region_attrib " .
"WHERE attrib_type_id = ?");
$sth->execute($attrib_type_id);
$sth->finish();
$sth = $dbh->prepare("SELECT coord_system_id FROM $target.coord_system " .
"ORDER BY RANK ASC LIMIT 1");
$sth->execute();
my ($cs_id) = $sth->fetchrow_array();
$sth->finish();
$sth = $dbh->prepare("INSERT INTO $target.seq_region_attrib " .
'(seq_region_id, attrib_type_id, value) ' .
"SELECT sr.seq_region_id, $attrib_type_id, 1 " .
"FROM $target.seq_region sr " .
"WHERE sr.coord_system_id = $cs_id");
$sth->execute();
$sth->finish();
}
sub add_attrib_code {
my $self = shift;
my $dbh = $self->dbh();
my $target = $self->target();
# add a toplevel code to the attrib_type table if it is not there already
my $sth = $dbh->prepare("SELECT attrib_type_id " .
"FROM $target.attrib_type " .
"WHERE code = 'toplevel'");
$sth->execute();
if($sth->rows()) {
my ($attrib_type_id) = $sth->fetchrow_array();
$sth->finish();
return $attrib_type_id;
}
$sth->finish();
$sth = $dbh->prepare("INSERT INTO $target.attrib_type " .
"SET code = 'toplevel', " .
"name = 'Top Level', " .
"description = 'Top Level Non-Redundant Sequence Region'");
$sth->execute();
my $attrib_type_id = $sth->{'mysql_insertid'};
$sth->finish();
return $attrib_type_id;
}
1;