Raw content of XrefMapper::BasicMapper
package XrefMapper::BasicMapper;
use strict;
use warnings;
use Cwd;
use DBI;
use File::Basename;
use IPC::Open3;
=head2 new
Description: Constructor for BasicMapper.
Returntype : BasicMapper
Exceptions : none
Caller : general
=cut
sub new{
my($class, @args) = @_;
my $self ={};
bless $self,$class;
return $self;
}
=head2 xref
Arg [1] : (optional)
Example : $mapper->core($new_core);
Description: Getter / Setter for the core.
info for the xref database.
Returntype : XrefMapper::db
Exceptions : none
=cut
sub xref{
my ($self, $arg) = @_;
(defined $arg) &&
($self->{_xref} = $arg );
return $self->{_xref};
}
=head2 core
Arg [1] : (optional)
Example : $mapper->core($new_core);
Description: Getter / Setter for the core.
info for the ensembl core database.
Returntype : XrefMapper::db
Exceptions : none
=cut
sub core{
my ($self, $arg) = @_;
(defined $arg) &&
($self->{_core} = $arg );
return $self->{_core};
}
sub xref_latest_status {
my $self = shift;
my $verbose = shift || 0;
my $sth = $self->xref->dbc->prepare("select id, status, date from process_status order by id");
$sth->execute();
my ($xref_id, $acc);
my ($id, $status, $date);
$sth->bind_columns(\$id, \$status,\$date);
while($sth->fetch){
print "$status\t$date\n" if($verbose and $self->verbose);
}
return $status;
}
sub process_file {
my $self = shift;
my $file = shift;
my $verbose = shift;
open(FILE, $file) or die ("\nCannot open input file '$file':\n $!\n");
my $xref=undef;
my $ensembl=undef;
my $type;
my %xref_hash=();
my %species_hash=();
while( my $line = ) {
chomp($line);
next if $line =~ /^#/;
next if !$line;
# print $line."\n";
my ($key, $value) = split("=",$line);
if($key eq "species"){
$type = "species";
$species_hash{'species'} = $value;
}
elsif($key eq "xref"){
$type = "xref";
}
elsif($type eq "species"){ # processing species data
$species_hash{lc($key)} = $value;
}
elsif($type eq "xref"){ # processing xref data
$xref_hash{lc($key)} = $value;
}
}
my $value = $species_hash{'species'};
if ($value !~ /_/) {
print STDERR "\'$value\' is not a recognised species - please use full species name (e.g. homo_sapiens) in $file\n";
exit(1);
}
my $mapper;
my $module;
my $class = "XrefMapper/$value.pm";
eval {
require $class;
};
if($@) {
if ($@ =~ /Can\'t locate $class/) {
warn("Did not find a specific mapping module XrefMapper::$value - using XrefMapper::BasicMapper instead\n") if(defined($verbose) and $verbose);
require XrefMapper::BasicMapper;
$module = "BasicMapper";
} else {
die "$@";
}
} else{
$module = $value;
}
$mapper = "XrefMapper::$module"->new();
if(defined($xref_hash{host})){
my ($host, $user, $dbname, $pass, $port);
$host = $xref_hash{'host'};
$user = $xref_hash{'user'};
$dbname = $xref_hash{'dbname'};
if(defined($xref_hash{'password'})){
$pass = $xref_hash{'password'};
}
else{
$pass = '';
}
if(defined($xref_hash{'port'})){
$port = $xref_hash{'port'};
}
else{
$port = 3306;
}
$xref = new XrefMapper::db(-host => $host,
-port => $port,
-user => $user,
-pass => $pass,
-group => 'core',
-dbname => $dbname);
$mapper->xref($xref);
if(defined($xref_hash{'dir'})){
$xref->dir($xref_hash{'dir'});
if(!-d $xref_hash{'dir'}){
die "directory ".$xref_hash{'dir'}." does not exist please create this\n";
}
}
else{
die "No directory specified for the xref fasta files\n";
}
}
else{
die "No host name given for xref database\n";
}
if(defined($species_hash{'species'})){
my ($host, $port, $user, $dbname, $pass);
$host = $species_hash{'host'};
$user = $species_hash{'user'};
$dbname = $species_hash{'dbname'};
if(defined($species_hash{'password'})){
$pass = $species_hash{'password'};
}
else{
$pass = '';
}
if(defined($species_hash{'port'})){
$port = $species_hash{'port'};
}
else{
$port = '';
}
my $core = new XrefMapper::db(-host => $host,
-port => $port,
-user => $user,
-pass => $pass,
-group => 'core',
-dbname => $dbname);
$mapper->core($core);
if(defined($species_hash{'dir'})){
$core->dir($species_hash{'dir'});
if(!-d $species_hash{'dir'}){
die "directory ".$species_hash{'dir'}." does not exist please create this\n";
}
}
else{
die "No directory specified for the ensembl fasta files\n";
}
$core->species($value);
}
return $mapper;
}
=head2 dumpcheck
Arg [1] : (optional)
Example : $mapper->dumpcheck("yes");
Description: Getter / Setter for dumpcheck.
If set the mapper will not dump fasta files
if they exist already.
Returntype : scalar
Exceptions : none
=cut
sub dumpcheck {
my ($self, $arg) = @_;
(defined $arg) &&
($self->{_dumpcheck} = $arg );
return $self->{_dumpcheck};
}
sub nofarm {
my ($self, $arg) = @_;
(defined $arg) &&
($self->{_nofarm} = $arg );
return $self->{_nofarm};
}
sub verbose {
my ($self, $arg) = @_;
(defined $arg) &&
($self->{_verbose} = $arg );
return $self->{_verbose};
}
sub species_id {
my ($self, $arg) = @_;
(defined $arg) &&
($self->{_species_id} = $arg );
return $self->{_species_id};
}
sub get_id_from_species_name {
my ($self, $species_name) = @_;
my $sql = "select species_id from species where name = '".$species_name."'";
my $sth = $self->xref->dbc->prepare($sql);
$sth->execute();
my @row = $sth->fetchrow_array();
my $species_id;
if (@row) {
$species_id = $row[0];
} else {
print STDERR "Couldn't get ID for species ".$species_name."\n";
print STDERR "It must be one of :-\n";
$sql = "select name from species";
$sth = $self->dbc->prepare($sql);
$sth->execute();
while(my @row = $sth->fetchrow_array()){
print STDERR $row[0]."\n";
}
die("Please try again :-)\n");
}
$sth->finish();
return $species_id;
}
#
# official naming
#
sub get_official_name {
return undef;
}
sub official_naming {
my $self = shift;
my $dbname = $self->get_official_name();
if(!defined($dbname)){
my $sth_stat = $self->xref->dbc->prepare("insert into process_status (status, date) values('official_naming_done',now())");
$sth_stat->execute();
$sth_stat->finish;
return;
}
if($dbname eq "MGI"){ # first Copy MGI to Genes
$self->biomart_fix("MGI","Translation","Gene");
$self->biomart_fix("MGI","Transcript","Gene");
}
print "Official naming started. Copy $dbname from gene to canonical transcript\n" if($self->verbose);
my ($max_object_xref_id, $max_xref_id);
$self->species_id($self->get_id_from_species_name($self->core->species));
my $sth = $self->xref->dbc->prepare("SELECT MAX(object_xref_id) FROM object_xref");
$sth->execute();
$sth->bind_columns(\$max_object_xref_id);
$sth->fetch;
$sth = $self->xref->dbc->prepare("SELECT MAX(xref_id) FROM xref");
$sth->execute();
$sth->bind_columns(\$max_xref_id);
$sth->fetch;
########################################################
# Copy $dbname from gene to the canonical transcripts. #
########################################################
# remove the ignore later on after testing
my $sth_add_ox = $self->xref->dbc->prepare("insert ignore into object_xref (object_xref_id, xref_id, ensembl_id, ensembl_object_type, linkage_type, ox_status) values(?, ?, ?, 'Transcript', ?, ?)");
# my $object_xref_id = $max_object_xref_id + 1;
$max_object_xref_id++;
if($max_object_xref_id == 1){
die "max_object_xref_id should not be 1\n";
}
my $object_sql = (<core->dbc->prepare($sql);
$sth->execute();
my ($gene_id, $tran_id);
$sth->bind_columns(\$gene_id,\$tran_id);
my %gene_to_tran_canonical;
while ($sth->fetch){
$gene_to_tran_canonical{$gene_id} = $tran_id;
}
$sth->finish;
print "Pre copy all $dbname from gene to canonical transcripts\n" if($self->verbose);
$self->biomart_test();
my $ins_dep_ix_sth = $self->xref->dbc->prepare("insert into identity_xref (object_xref_id, query_identity, target_identity) values(?, ?, ?)");
$sth = $self->xref->dbc->prepare($object_sql);
$sth->execute();
my ($xref_id, $linkage_type, $ox_status, $q_id, $t_id);
$sth->bind_columns(\$xref_id, \$gene_id, \$linkage_type, \$ox_status, \$q_id, \$t_id);
while ($sth->fetch){
if(defined($gene_to_tran_canonical{$gene_id})){
$max_object_xref_id++;
$sth_add_ox->execute($max_object_xref_id, $xref_id, $gene_to_tran_canonical{$gene_id}, $linkage_type, $ox_status) || print STDERR "(Gene id - $gene_id) Could not add $max_object_xref_id, .".$gene_to_tran_canonical{$gene_id}.", $xref_id, $linkage_type, $ox_status to object_xref\n";
$ins_dep_ix_sth->execute($max_object_xref_id, $q_id, $t_id);
}
else{
print STDERR "Could not find canonical for gene $gene_id\n";
}
}
$sth->finish;
print "Post copy all $dbname from gene to canonical transcripts\n" if($self->verbose);
$self->biomart_test();
##########################################################################################
# HGNC/MGI synonyms are special as they are only on some prioritys so copy all the synonyms
# from the xref database to the xref database. Granted some will already be there but use
# update ignore just in case.
###########################################################################################
#get the xref synonyms store as a hash of arrays.
my $syn_hash = $self->get_official_synonyms();
$sql = "insert into synonym (xref_id, synonym) values (?, ?)";
my $add_syn_sth = $self->xref->dbc->prepare($sql);
#
# get the OFDN (HGNC/MGI) xrefs in the xrefs and add the synonyms, plus create hash to get id from the display name
# This is becouse not all MGI.HGNC's are loaded as these are priority xrefs.
#
my %display_label_to_id;
$sql = 'select x.accession, sy.synonym from synonym sy, xref x, source so where x.xref_id = sy.xref_id and so.source_id = x.source_id and so.name like "'.$dbname.'"';
$sth = $self->xref->dbc->prepare($sql);
$sth->execute();
my ($display_label, $acc, $syn,);
$sth->bind_columns(\$acc,\$display_label);
while($sth->fetch){
$display_label_to_id{$display_label} = $acc;
}
$sth->finish;
# get label to id from xref database to start with.
$sql = 'select x.accession, x.label from xref x, source s where s.source_id = x.source_id and s.name like "'.$dbname.'"';
$sth = $self->xref->dbc->prepare($sql);
$sth->execute();
$sth->bind_columns(\$acc,\$display_label);
while($sth->fetch){
$display_label_to_id{$display_label} = $acc;
}
$sth->finish;
my %synonym;
$sth = $self->xref->dbc->prepare('select es.synonym, x.label from synonym es, xref x, source s where x.xref_id = es.xref_id and x.source_id = s.source_id and
s.name = "'.$dbname.'"' );
$sth->execute();
my ($name);
$sth->bind_columns(\$syn,\$name);
while($sth->fetch){
$synonym{$syn} = $name;
}
$sth->finish;
$sth = $self->xref->dbc->prepare('select es.synonym, x.label from synonym es, xref x, source s where x.xref_id = es.xref_id and x.source_id = s.source_id and
s.name = "EntrezGene"' );
$sth->execute();
$sth->bind_columns(\$syn,\$name);
while($sth->fetch){
$synonym{$syn} = $name;
}
$sth->finish;
#######################
#Do the naming bit now.
#######################
# get the vega external_sources
my ($odn_curated_gene_id, $odn_automatic_gene_id, $clone_based_vega_gene_id, $clone_based_ensembl_gene_id);
my ($odn_curated_tran_id, $odn_automatic_tran_id, $clone_based_vega_tran_id, $clone_based_ensembl_tran_id);
my $odn_id;
$sth = $self->xref->dbc->prepare("select source_id from source where name like ?");
$sth->execute($dbname."_curated_gene");
$sth->bind_columns(\$odn_curated_gene_id);
$sth->fetch;
$sth->execute($dbname."_automatic_gene");
$sth->bind_columns(\$odn_automatic_gene_id);
$sth->fetch;
$sth->execute("Clone_based_vega_gene");
$sth->bind_columns(\$clone_based_vega_gene_id);
$sth->fetch;
$sth->execute("Clone_based_ensembl_gene");
$sth->bind_columns(\$clone_based_ensembl_gene_id);
$sth->fetch;
if(!defined($odn_curated_gene_id)){
die "Could not find external database name ".$dbname."_curated_gene\n";
}
if(!defined($odn_automatic_gene_id)){
die "Could not find external database name ".$dbname."_automatic_gene\n";
}
if(!defined($clone_based_vega_gene_id)){
die "Could not find external database name Clone_based_vega_gene\n";
}
if(!defined($clone_based_ensembl_gene_id)){
die "Could not find external database name Clone_based_ensembl_gene\n";
}
$sth->execute($dbname."_curated_transcript");
$sth->bind_columns(\$odn_curated_tran_id);
$sth->fetch;
$sth->execute($dbname."_automatic_transcript");
$sth->bind_columns(\$odn_automatic_tran_id);
$sth->fetch;
$sth->execute("Clone_based_vega_transcript");
$sth->bind_columns(\$clone_based_vega_tran_id);
$sth->fetch;
$sth->execute("Clone_based_ensembl_transcript");
$sth->bind_columns(\$clone_based_ensembl_tran_id);
$sth->fetch;
if(!defined($odn_curated_tran_id)){
die "Could not find external database name ".$dbname."_curated_transcript\n";
}
if(!defined($odn_automatic_tran_id)){
die "Could not find external database name ".$dbname."_automatic_transcript\n";
}
if(!defined($clone_based_vega_tran_id)){
die "Could not find external database name Clone_based_vega_transcript\n";
}
if(!defined($clone_based_ensembl_tran_id)){
die "Could not find external database name Clone_based_ensembl_transcript\n";
}
###########################
# Delete the old ones.
###########################
my $del_vega_sql = "delete o from object_xref o, xref x where x.xref_id = o.xref_id and x.source_id in ( $odn_curated_gene_id, $odn_automatic_gene_id, $clone_based_vega_gene_id, $clone_based_ensembl_gene_id,$odn_automatic_tran_id, $clone_based_ensembl_tran_id)";
$sth = $self->xref->dbc->prepare($del_vega_sql);
$sth->execute();
$del_vega_sql = "delete x from xref x where x.source_id in ($odn_curated_gene_id, $odn_automatic_gene_id, $clone_based_vega_gene_id, $clone_based_ensembl_gene_id,$odn_automatic_tran_id, $clone_based_ensembl_tran_id)";
$sth = $self->xref->dbc->prepare($del_vega_sql);
$sth->execute();
$del_vega_sql = "delete x from xref x where x.source_id = $clone_based_vega_tran_id and x.info_type = 'MISC'"; # original ones added have info type of "DIRECT"
$sth = $self->xref->dbc->prepare($del_vega_sql);
$sth->execute();
######################################################
# Get the current max values for xref and object_xref
######################################################
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-dbconn => $self->core->dbc);
my $ga = $db->get_GeneAdaptor();
# if(!defined($max_xref_id) or $max_xref_id == 0){
# die "Sorry i dont believe there are no xrefs max xref = 0\n";
# }
# if(!defined($max_object_xref_id) or $max_object_xref_id == 0){
# die "Sorry i dont believe there are no object_xrefs max object_xref = 0\n";
# }
###########################
# Process each Gene
###########################
my %gene_to_transcripts;
my %gene_id_to_stable_id;
my %tran_id_to_stable_id;
$sth = $self->xref->dbc->prepare("select gtt.gene_id, gtt.transcript_id, gsi.stable_id, tsi.stable_id
from gene_transcript_translation gtt, gene_stable_id gsi, transcript_stable_id tsi
where gtt.gene_id = gsi.internal_id and gtt.transcript_id = tsi.internal_id");
$sth->execute;
my $gsi;
my $tsi;
$sth->bind_columns(\$gene_id, \$tran_id, \$gsi, \$tsi);
while ($sth->fetch){
push @{$gene_to_transcripts{$gene_id}}, $tran_id;
$gene_id_to_stable_id{$gene_id} = $gsi;
$tran_id_to_stable_id{$tran_id} = $tsi;
}
my $total_gene_vega = 0;
my $total_gene = 0;
my $total_clone_name = 0;
my $odn_count = 0;
my $dbentrie_sth = $self->xref->dbc->prepare("select x.label from xref x, object_xref ox, source s where x.xref_id = ox.xref_id and
x.source_id = s.source_id and s.name = ? and ox.ox_status = 'DUMP_OUT' and ox.ensembl_id = ? and ox.ensembl_object_type = ? ");
$sql = "insert into xref (xref_id, source_id, accession, label, version, species_id, info_type) values (?, ?, ?, ?, 0, ".$self->species_id.", 'MISC' )";
my $ins_xref_sth = $self->xref->dbc->prepare($sql);
# important or will crash and burn!!!
my %xref_added; # store those added already $xref_added{$accession:$source_id} = $xref_id;
my $ins_object_xref_sth = $self->xref->dbc->prepare("insert into object_xref (object_xref_id, ensembl_id, ensembl_object_type, xref_id, linkage_type, ox_status) values (?, ?, ?, ?, 'MISC', 'DUMP_OUT')");
foreach my $gene_id (keys %gene_to_transcripts){
my @ODN=();
my @VEGA_NAME=();
my $CLONE_NAME = undef;
my $display;
$dbentrie_sth->execute($dbname, $gene_id, "Gene");
$dbentrie_sth->bind_columns(\$display);
while($dbentrie_sth->fetch){
push @ODN, $display;
}
if(scalar(@ODN)){
$odn_count++;
}
my %no_vega; # hash now as we want to sort by stable id and hence need a key value pair
my $vega_count = 0;
my $count = 0;
foreach my $tran_id ( @{$gene_to_transcripts{$gene_id}} ){
my $VEGA = undef;
$count = 0 ;
$dbentrie_sth->execute($dbname."_curated_transcript", $tran_id, "Transcript");
$dbentrie_sth->bind_columns(\$display);
while($dbentrie_sth->fetch){
my($hgnc_bit, $num) = split(/-0\d\d/,$display);
$VEGA = $hgnc_bit;
$vega_count++;
my $multiple = 1;
if(scalar(@VEGA_NAME)){
foreach my $name (@VEGA_NAME){
if($name ne $VEGA){
if(defined($synonym{$name}) and uc($synonym{$name}) eq $VEGA){
$multiple = 0;
}
if(defined($synonym{$VEGA}) and uc($synonym{$VEGA}) eq uc($name)){
$multiple = 0;
}
}
else{
$multiple = 0;
}
}
if($multiple){
push @VEGA_NAME , $hgnc_bit;
}
}
else{
push @VEGA_NAME , $hgnc_bit;
}
$count++;
}
$dbentrie_sth->execute("Clone_based_vega_transcript", $tran_id, "Transcript");
$dbentrie_sth->bind_columns(\$display);
while($dbentrie_sth->fetch){
my($hgnc_bit, $num) = split(/-0\d\d/,$display);
$CLONE_NAME = $hgnc_bit;
$count++;
}
if($count == 0){
$no_vega{$tran_id_to_stable_id{$tran_id}} = $tran_id;
}
if($count > 1){
print STDERR "Problem: ".$tran_id_to_stable_id{$tran_id}." has more than one vega_transcript\n";
}
if($count == 1){
if(defined($VEGA) and scalar(@ODN)){
my $found = 0;
foreach my $odn_name (@ODN){
if(uc($VEGA) eq uc($odn_name)){
$found = 1;
}
elsif(defined($synonym{$VEGA}) and uc($synonym{$VEGA}) eq uc($odn_name)){
$found = 1;
}
}
if(!$found){
print STDERR "Problem: ".$gene_id_to_stable_id{$gene_id}." linked to ".$dbname." (".join(', ',@ODN).") BUT ".$tran_id_to_stable_id{$tran_id}." linked to vega_transcript $VEGA????\n";
}
}
}
} # end for each transcript
####################################################################################
# if there is at least one transcript
# set vega_gene
# loop through no_vega array and set vega_transcript_like for each starting at 201
####################################################################################
if(scalar(@VEGA_NAME) > 1){
print STDERR "Warning: gene ".$gene_id_to_stable_id{$gene_id}." has more than one vega_transcript these are (".join(', ',@VEGA_NAME).")\n";
}
if($vega_count){
foreach my $name (@VEGA_NAME){
my $id = $display_label_to_id{$name};
if(!defined($id)){
$id = $name;
print STDERR "Warning Could not find id for $name\n";
}
if(!defined($xref_added{$id.":".$odn_curated_gene_id})){
$max_xref_id++;
$ins_xref_sth->execute($max_xref_id, $odn_curated_gene_id, $id, $name);
$xref_added{$id.":".$odn_curated_gene_id} = $max_xref_id;
if(defined($syn_hash->{$name})){
foreach my $syn (@{$syn_hash->{$name}}){
$add_syn_sth->execute($max_xref_id, $syn);
}
}
}
$max_object_xref_id++;
$ins_object_xref_sth->execute($max_object_xref_id, $gene_id, 'Gene', $xref_added{$id.":".$odn_curated_gene_id});
$ins_dep_ix_sth->execute($max_object_xref_id, 100, 100);
}
my $name = $VEGA_NAME[0];
my $tran_name_ext = 201;
foreach my $tran (sort keys %no_vega){
my $id = $name."-".$tran_name_ext;
if(!defined($xref_added{$id.":".$odn_automatic_tran_id})){
$max_xref_id++;
$ins_xref_sth->execute($max_xref_id, $odn_automatic_tran_id, $id, $id);
$xref_added{$id.":".$odn_automatic_tran_id} = $max_xref_id;
}
$max_object_xref_id++;
$ins_object_xref_sth->execute($max_object_xref_id, $no_vega{$tran}, 'Transcript', $xref_added{$id.":".$odn_automatic_tran_id});
$ins_dep_ix_sth->execute($max_object_xref_id, 100, 100);
$tran_name_ext++;
}
}
####################################################################################
# if no vega_transcript but hgnc/mgi
# set vega_gene_like to hgnc/mgi
# loop through both arrays and set vega_transcript_like to hgnc-101 etc
####################################################################################
elsif(scalar(@ODN)){
foreach my $name (@ODN){
my $id = $display_label_to_id{$name};
if(!defined($id)){
$id = $name;
print STDERR "Warning Could not find id for $name\n";
}
if(!defined($xref_added{$id.":".$odn_automatic_gene_id})){
$max_xref_id++;
$ins_xref_sth->execute($max_xref_id, $odn_automatic_gene_id, $id, $name);
$xref_added{$id.":".$odn_automatic_gene_id} = $max_xref_id;
if(defined($syn_hash->{$name})){
foreach my $syn (@{$syn_hash->{$name}}){
$add_syn_sth->execute($max_xref_id, $syn);
}
}
}
$max_object_xref_id++;
$ins_object_xref_sth->execute($max_object_xref_id, $gene_id, 'Gene', $xref_added{$id.":".$odn_automatic_gene_id});
$ins_dep_ix_sth->execute($max_object_xref_id, 100, 100);
}
my $name = $ODN[0];
my $tran_name_ext = 201;
foreach my $tran (sort keys %no_vega){
my $id = $name."-".$tran_name_ext;
while(defined($xref_added{$id.":".$odn_automatic_tran_id})){
$tran_name_ext++;
$id = $name."-".$tran_name_ext;
}
if(!defined($xref_added{$id.":".$odn_automatic_tran_id})){
$max_xref_id++;
$ins_xref_sth->execute($max_xref_id, $odn_automatic_tran_id, $id, $id);
$xref_added{$id.":".$odn_automatic_tran_id} = $max_xref_id;
}
else{
print "ERROR: should not get here $id already defined?\n";
}
$max_object_xref_id++;
$ins_object_xref_sth->execute($max_object_xref_id, $no_vega{$tran}, 'Transcript', $xref_added{$id.":".$odn_automatic_tran_id});
$ins_dep_ix_sth->execute($max_object_xref_id, 100, 100);
$tran_name_ext++;
}
}
####################################################################################
# if no vega_transcript and no hgnc/mgi use clone name
# set vega_gene_like to clone name
# loop through both arrays and set vega_transcript_like to clone_name-101 etc
####################################################################################
else{
if(defined($CLONE_NAME)){
my $id = $CLONE_NAME;
if(!defined($xref_added{$id.":".$clone_based_vega_gene_id})){
$max_xref_id++;
$ins_xref_sth->execute($max_xref_id, $clone_based_vega_gene_id, $id, $id);
$xref_added{$id.":".$clone_based_vega_gene_id} = $max_xref_id;
}
$max_object_xref_id++;
$ins_object_xref_sth->execute($max_object_xref_id, $gene_id, 'Gene', $xref_added{$id.":".$clone_based_vega_gene_id});
$ins_dep_ix_sth->execute($max_object_xref_id, 100 , 100);
my $tran_name_ext = 201;
foreach my $tran (sort keys %no_vega){
my $id = $CLONE_NAME."-".$tran_name_ext;
if(!defined($xref_added{$id.":".$clone_based_vega_tran_id})){
$max_xref_id++;
$ins_xref_sth->execute($max_xref_id, $clone_based_vega_tran_id, $id, $id);
$xref_added{$id.":".$clone_based_vega_tran_id} = $max_xref_id;
}
$max_object_xref_id++;
$ins_object_xref_sth->execute($max_object_xref_id, $no_vega{$tran}, 'Transcript', $xref_added{$id.":".$clone_based_vega_tran_id});
$ins_dep_ix_sth->execute($max_object_xref_id, 100, 100);
$tran_name_ext++;
}
}
else{
#get the clone name
my $gene= $ga->fetch_by_dbID($gene_id);
my $new_gene = $gene->transform('clone');
my $new_clone_name = undef;
if(defined($new_gene)){
$new_clone_name = $new_gene->slice->seq_region_name;
}
else{
# on more than one clone?? try
my $slice = $gene->slice->sub_Slice($gene->start,$gene->end,$gene->strand);
my $clone_projection = $slice->project('clone');
foreach my $seg (@$clone_projection) {
my $clone = $seg->to_Slice();
$new_clone_name = $clone->seq_region_name;
}
if(!defined($new_clone_name)){
# try going via contig
my $super_projection = $slice->project('contig');
my $super;
foreach my $seg (@$super_projection) {
$super = $seg->to_Slice();
$new_clone_name = $super->seq_region_name;
}
my $clone_projection = $super->project('clone');
foreach my $seg (@$clone_projection) {
my $clone = $seg->to_Slice();
$new_clone_name = $clone->seq_region_name;
}
if(!defined($new_clone_name)){
print STDERR "PROJECT failed for ".$gene->stable_id."\n";
next;
}
}
}
# store the data
if(!defined($xref_added{$new_clone_name.":".$clone_based_ensembl_gene_id})){
$max_xref_id++;
$ins_xref_sth->execute($max_xref_id, $clone_based_ensembl_gene_id, $new_clone_name, $new_clone_name);
$xref_added{$new_clone_name.":".$clone_based_ensembl_gene_id} = $max_xref_id;
}
$max_object_xref_id++;
$ins_object_xref_sth->execute($max_object_xref_id, $gene->dbID, 'Gene', $xref_added{$new_clone_name.":".$clone_based_ensembl_gene_id});
$ins_dep_ix_sth->execute($max_object_xref_id, 100, 100);
my $tran_name_ext = 201;
foreach my $tran (sort keys %no_vega){
my $id = $new_clone_name."-".$tran_name_ext;
if(!defined($xref_added{$id.":".$clone_based_ensembl_tran_id})){
$max_xref_id++;
$ins_xref_sth->execute($max_xref_id, $clone_based_ensembl_tran_id, $id, $id);
$xref_added{$id.":".$clone_based_ensembl_tran_id} = $max_xref_id;
}
$max_object_xref_id++;
$ins_object_xref_sth->execute($max_object_xref_id, $no_vega{$tran}, 'Transcript', $xref_added{$id.":".$clone_based_ensembl_tran_id});
$ins_dep_ix_sth->execute($max_object_xref_id, 100, 100);
$tran_name_ext++;
}
}
}
if($vega_count){
$total_gene_vega++;
}
$total_gene++;
# print "Finished Gene ".$gene->stable_id."\t$ODN\t $vega_count\n"
}
$add_syn_sth->finish;
#
# Now check for duplicate clone names as gene names.
#
my $xref_list_sql = 'select x.xref_id, x.label, count(*) as count from source s, xref x, object_xref ox where s.source_id = x.source_id and s.name like "Clone_based_ensembl_gene" and x.xref_id = ox.xref_id and ox.ensembl_object_type = "Gene" group by xref_id having count >1';
# may need to add gsi to get the order the same each time
my $object_xref_list = "select object_xref_id, ensembl_id from object_xref where xref_id = ? order by ensembl_id"; # Can only be linked to a gene so no point checking really.
my $ox_list_sth = $self->xref->dbc->prepare($object_xref_list);
my $update_first_xref = "update xref set accession = ?, label = ? where xref_id = ?"; # just rename xref;
my $update_first_sth = $self->xref->dbc->prepare($update_first_xref);
my $change_object_xref = "update object_xref set xref_id = ? where xref_id = ? and ensembl_id = ?";
my $change_ox_sth = $self->xref->dbc->prepare($change_object_xref);
$sth = $self->xref->dbc->prepare($xref_list_sql);
$sth->execute;
my ($xref, $count);
$sth->bind_columns(\$xref, \$name, \$count);
while($sth->fetch){
$ox_list_sth->execute($xref);
my ($object_xref_id, $ensembl_id);
$count = 1;
$ox_list_sth->bind_columns(\$object_xref_id, \$ensembl_id);
while($ox_list_sth->fetch){
my $new_name = $name."-".$count;
if($count == 1){
$update_first_sth->execute($new_name, $new_name, $xref);
$xref_added{$new_name.":".$clone_based_ensembl_gene_id} = $xref;
}
else{
#add new one
if(!defined($xref_added{$new_name.":".$clone_based_ensembl_gene_id})){
$max_xref_id++;
$ins_xref_sth->execute($max_xref_id, $clone_based_ensembl_gene_id, $new_name, $new_name);
$xref_added{$new_name.":".$clone_based_ensembl_gene_id} = $max_xref_id;
}
else{
print STDERR "$new_name already exists????\n";
}
$change_ox_sth->execute($max_xref_id, $xref, $ensembl_id);
}
$count++;
}
}
$change_ox_sth->finish;
$update_first_sth->finish;
$ox_list_sth->finish;
$ins_dep_ix_sth->finish;
my $sth_stat = $self->xref->dbc->prepare("insert into process_status (status, date) values('official_naming_done',now())");
$sth_stat->execute();
$sth_stat->finish;
}
sub biomart_fix{
my ($self, $db_name, $type1, $type2) = @_;
my $xref_dbc = $self->xref->dbc;
print "$db_name is associated with both $type1 and $type2 object types\n" if($self->verbose);
my $to;
my $from;
my $to_id;
my $from_id;
if($type1 eq "Gene" or $type2 eq "Gene"){
$to = "Gene";
$to_id = "gene_id";
if($type1 eq "Translation" or $type2 eq "Translation"){
$from = "Translation";
$from_id = "translation_id"
}
else{
$from = "Transcript";
$from_id = "transcript_id";
}
}
else{
$to = "Transcript";
$to_id = "transcript_id";
$from = "Translation";
$from_id = "translation_id";
}
print "Therefore moving all associations from $from to ".$to."\n" if($self->verbose);
my $sql =(<do($sql) ;
# print "\n$sql\n";
$sql =(<do($sql);
# print "\n$sql\n";
}
sub biomart_testing{
my ($self) = @_;
my $sql = 'SELECT ox.ensembl_object_type, COUNT(*), s.name FROM xref x, object_xref ox, source s WHERE x.xref_id = ox.xref_id AND s.source_id = x.source_id and ox.ox_status = "DUMP_OUT" GROUP BY s.name, ox.ensembl_object_type';
my $again = 1;
while ($again){
$again = 0;
my $sth = $self->xref->dbc->prepare($sql);
$sth->execute();
my ($type, $count, $name);
my ($last_type, $last_count, $last_name);
$sth->bind_columns(\$type,\$count,\$name);
$last_name = "DEFAULT";
while (!$again and $sth->fetch){
if($last_name eq $name){
$again = 1;
$self->biomart_fix($name,$last_type, $type);
# $again = 0; # remove this line after testing
}
$last_name = $name;
$last_type= $type;
$last_count = $count;
}
$sth->finish;
}
}
sub biomart_test{
my ($self) = @_;
my $sql = 'SELECT ox.ensembl_object_type, COUNT(*), s.name FROM xref x, object_xref ox, source s WHERE x.xref_id = ox.xref_id AND s.source_id = x.source_id and ox.ox_status = "DUMP_OUT" GROUP BY s.name, ox.ensembl_object_type';
my $sth = $self->xref->dbc->prepare($sql);
$sth->execute();
my ($type, $count, $name);
my ($last_type, $last_count, $last_name);
$sth->bind_columns(\$type,\$count,\$name);
$last_name = "ARSE";
my $first = 1;
while ($sth->fetch){
if($last_name eq $name){
if($first){
print STDERR "\nProblem Biomart test fails\n";
$first=0;
}
print STDERR "$last_name\t$last_count\t$last_type\n";
print STDERR "$name\t$count\t$type\n";
}
$last_name = $name;
$last_type= $type;
$last_count = $count;
}
$sth->finish;
}
# remove a list of patterns from a string
sub filter_by_regexp {
my ($self, $str, $regexps) = @_;
foreach my $regexp (@$regexps) {
$str =~ s/$regexp//ig;
}
return $str;
}
sub gene_description_filter_regexps {
return ();
}
sub gene_description_sources {
return ("RFAM",
"miRBase",
"IMGT/GENE_DB",
"Uniprot/SWISSPROT",
"RefSeq_peptide",
"RefSeq_dna",
"Uniprot/Varsplic",
"Uniprot/SPTREMBL");
}
sub transcript_display_xref_sources {
my @list = qw(miRBase
RFAM
HGNC_curated_gene
HGNC_automatic_gene
MGI_curated_gene
MGI_automatic_gene
Clone_based_vega_gene
Clone_based_ensembl_gene
HGNC_curated_transcript
HGNC_automatic_transcript
MGI_curated_transcript
MGI_automatic_transcript
Clone_based_vega_transcript
Clone_based_ensembl_transcript
IMGT/GENE_DB
HGNC
SGD
MGI
flybase_symbol
Anopheles_symbol
Genoscope_annotated_gene
Uniprot/SWISSPROT
Uniprot/Varsplic
RefSeq_peptide
RefSeq_dna
Uniprot/SPTREMBL
EntrezGene
IPI);
my %ignore;
$ignore{"EntrezGene"}= 'FROM:RefSeq_[pd][en][pa].*_predicted';
return [\@list,\%ignore];
}
sub get_official_synonyms{
my $self = shift;
my %hgnc_syns;
my %seen; # can be in more than one for each type of hgnc.
my $dbname = $self->get_official_name();
my $sql = (<xref->dbc->prepare($sql);
$sth->execute;
my ($acc, $label, $syn);
$sth->bind_columns(\$acc, \$label, \$syn);
my $count = 0;
while($sth->fetch){
if(!defined($seen{$acc.":".$syn})){
push @{$hgnc_syns{$acc}}, $syn;
push @{$hgnc_syns{$label}}, $syn;
$count++;
}
$seen{$acc.":".$syn} = 1;
}
$sth->finish;
return \%hgnc_syns;
}
sub get_species_id_from_species_name{
my ($self,$species) = @_;
my $sql = "select species_id from species where name = '".$species."'";
my $sth = $self->dbc->prepare($sql);
$sth->execute();
my @row = $sth->fetchrow_array();
my $species_id;
if (@row) {
$species_id = $row[0];
} else {
print STDERR "Couldn't get ID for species ".$species."\n";
print STDERR "It must be one of :-\n";
$sql = "select name from species";
$sth = $self->dbc->prepare($sql);
$sth->execute();
while(my @row = $sth->fetchrow_array()){
print STDERR $row[0]."\n";
}
die("Please try again :-)\n");
}
$sth->finish();
return $species_id;
}
1;