# Raw content of XrefParser::HGNC_CCDSParser
package XrefParser::HGNC_CCDSParser;
use strict;
use Carp;
use DBI;
use base qw( XrefParser::BaseParser );
# Fetch the HGNC <-> CCDS mapping and assign direct HGNC xrefs.
# All xrefs are attached to transcripts, located via their CCDS identifiers.
# Extract the "key=>value," settings understood by run_script from the
# argument string handed to the parser.
#
# Returns a hash holding only the keys (host, port, dbname, pass, wget)
# that were actually present; each value must be followed by a comma.
sub _hgnc_ccds_parse_args {
    my ($file) = @_;

    my %arg;
    return %arg unless defined $file;

    foreach my $key (qw( host port dbname pass wget )) {
        if ( $file =~ /\Q$key\E=>(\S+?),/ ) {
            $arg{$key} = $1;
        }
    }
    return %arg;
}

# Turn the downloaded tab-separated mapping into a CCDS -> HGNC hash ref.
#
# Column 1 is the HGNC accession, column 2 is ignored and column 3 is a
# ", "-separated list of CCDS identifiers. Lines without a third column
# (e.g. genes with no CCDS) contribute nothing.
sub _hgnc_ccds_parse_mapping {
    my ($content) = @_;

    my %ccds_to_hgnc;
    return \%ccds_to_hgnc unless defined $content;

    foreach my $line ( split( /\n/, $content ) ) {
        $line =~ s/\r\z//;    # tolerate CRLF line endings
        my ( $hgnc, undef, $ccds ) = split( /\t/, $line );
        next unless defined $hgnc && defined $ccds;

        foreach my $c ( split( /, /, $ccds ) ) {
            $ccds_to_hgnc{$c} = $hgnc;
        }
    }
    return \%ccds_to_hgnc;
}

# Create direct HGNC xrefs on transcripts by way of CCDS identifiers.
#
# Arguments:
#   $self       - the parser object (provides dbi, dbi2, add_xref, ...)
#   $file       - settings string containing "host=>...,", "port=>...,",
#                 "dbname=>...,", "pass=>...," and "wget=>...," entries
#   $source_id  - source id the new xrefs are stored under
#   $species_id - species id the new xrefs are stored under
#   $verbose    - when true, print a summary line at the end
#
# Returns 0 on success and 1 when the second database connection cannot
# be obtained. Dies when the mapping cannot be downloaded and croaks when
# one of the lookup queries fails.
sub run_script {
    my ( $self, $file, $source_id, $species_id, $verbose ) = @_;

    my $user   = "ensro";
    my %arg    = _hgnc_ccds_parse_args($file);
    my $host   = $arg{host};
    my $port   = $arg{port};
    my $dbname = $arg{dbname};
    my $pass   = $arg{pass};
    my $wget   = defined $arg{wget} ? $arg{wget} : "";

    # Download the HGNC -> CCDS mapping.
    require LWP::UserAgent;
    my $ua = LWP::UserAgent->new();
    $ua->timeout(10);
    $ua->env_proxy();

    my $response = $ua->get($wget);
    if ( !$response->is_success() ) {
        die $response->status_line;
    }
    my $ccds_to_hgnc = _hgnc_ccds_parse_mapping( $response->content );

    my $dbi2 = $self->dbi2( $host, $port, $user, $dbname, $pass );
    if ( !defined($dbi2) ) {
        return 1;
    }

    # Internal transcript id -> ENST stable id, taken from the ENST xrefs.
    my %trans_id_to_stable_id;
    my $enst_sth = $dbi2->prepare(
        'select ox.ensembl_id, x.dbprimary_acc from object_xref ox, xref x, external_db e where x.xref_id = ox.xref_id and x.external_db_id = e.external_db_id and e.db_name like "ENST" and x.dbprimary_acc like "ENST%"'
    ) or croak( $dbi2->errstr() );
    $enst_sth->execute() or croak( $dbi2->errstr() );
    while ( my @row = $enst_sth->fetchrow_array() ) {
        $trans_id_to_stable_id{ $row[0] } = $row[1];
    }
    $enst_sth->finish;

    # CCDS label -> transcript stable id, resolved through the map above.
    my %ccds_to_stable_id;
    my $ccds_sth = $dbi2->prepare(
        'select ox.ensembl_id, x.display_label from object_xref ox, xref x, external_db e where x.xref_id = ox.xref_id and x.external_db_id = e.external_db_id and e.db_name like "CCDS"'
    ) or croak( $dbi2->errstr() );
    $ccds_sth->execute() or croak( $dbi2->errstr() );
    while ( my @row = $ccds_sth->fetchrow_array() ) {
        if ( defined( $trans_id_to_stable_id{ $row[0] } ) ) {
            $ccds_to_stable_id{ $row[1] } = $trans_id_to_stable_id{ $row[0] };
        }
        else {
            print "NO transcript_stable_id for for " . $row[0] . "\n";
        }
    }
    $ccds_sth->finish;

    # Because the direct mappings carry no descriptions etc., we borrow
    # them from the xrefs already stored by the other HGNC parser. This is
    # why this parser has to run after that one. The HGNC source ids are
    # looked up by name rather than being hard-coded.
    my %label;
    my %version;
    my %description;

    my $dbi = $self->dbi();
    my $add_syn_sth =
      $dbi->prepare("insert into synonym (xref_id, synonym) values (?, ?)")
      or croak( $dbi->errstr() );
    my $syn_hash = $self->get_hgnc_synonyms();

    my @hgnc_source_ids;
    my $source_sth = $dbi->prepare(
        'select source_id, priority_description from source where name like "HGNC"'
    ) or croak( $dbi->errstr() );
    $source_sth->execute() or croak( $dbi->errstr() );
    while ( my ($id) = $source_sth->fetchrow_array() ) {
        push @hgnc_source_ids, $id;
    }
    $source_sth->finish;

    # An empty "in ()" list is invalid SQL, so only query when at least one
    # HGNC source exists; otherwise labels fall back to the accession.
    if (@hgnc_source_ids) {
        my $placeholders = join( ", ", ("?") x scalar(@hgnc_source_ids) );
        my $xref_sth = $dbi->prepare(
            "select accession, label, version, description from xref where source_id in ($placeholders)"
        ) or croak( $dbi->errstr() );
        $xref_sth->execute(@hgnc_source_ids) or croak( $dbi->errstr() );

        my ( $acc, $lab, $ver, $desc );
        $xref_sth->bind_columns( \$acc, \$lab, \$ver, \$desc );
        while ( $xref_sth->fetch() ) {
            # Only xrefs that have a description are worth copying from.
            if ( defined($desc) ) {
                $label{$acc}       = $lab;
                $version{$acc}     = $ver;
                $description{$acc} = $desc;
            }
        }
        $xref_sth->finish;
    }
    elsif ($verbose) {
        print "No HGNC sources found; labels and descriptions will not be copied\n";
    }

    my $xref_count      = 0;
    my $no_ccds_to_hgnc = 0;
    foreach my $ccds ( keys %ccds_to_stable_id ) {
        if ( defined( $ccds_to_hgnc->{$ccds} ) ) {
            my $hgnc = $ccds_to_hgnc->{$ccds};
            $hgnc =~ s/HGNC://;

            my $xref_id = $self->add_xref(
                $hgnc, $version{$hgnc}, $label{$hgnc} || $hgnc,
                $description{$hgnc}, $source_id, $species_id, "DIRECT"
            );
            $self->add_direct_xref( $xref_id, $ccds_to_stable_id{$ccds},
                "Transcript", "" );
            $xref_count++;

            # The insert result is deliberately not checked, as before.
            if ( defined( $syn_hash->{$hgnc} ) ) {
                foreach my $syn ( @{ $syn_hash->{$hgnc} } ) {
                    $add_syn_sth->execute( $xref_id, $syn );
                }
            }
        }
        else {
            $no_ccds_to_hgnc++;
        }
    }
    $add_syn_sth->finish;

    print "$no_ccds_to_hgnc missed as no hgnc for the ccds. Added $xref_count HGNC xrefs via CCDS\n" if ($verbose);
    return 0;
}
1;