Raw content of XrefParser::BaseParser package XrefParser::BaseParser; use strict; use Carp; use DBI; use Digest::MD5 qw(md5_hex); use Getopt::Long; use POSIX qw(strftime); use File::Basename; use File::Spec::Functions; use IO::File; use Net::FTP; use URI; use URI::file; use Text::Glob qw( match_glob ); use LWP::UserAgent; use Bio::EnsEMBL::Utils::Exception; my $base_dir = File::Spec->curdir(); my $add_xref_sth = undef; my %add_direct_xref_sth; my $add_dependent_xref_sth = undef; my $get_xref_sth = undef; my $add_synonym_sth = undef; my $dbi; my %dependent_sources; my %taxonomy2species_id; my %species_id2taxonomy; my %name2species_id; my %species_id2name; my %xref_dependent_mapped; my ( $host, $port, $dbname, $user, $pass, $create, $release, $cleanup, $deletedownloaded, $drop_db, $checkdownload, $dl_path, $unzip, $stats, $verbose); # -------------------------------------------------------------------------------- # Get info about files to be parsed from the database sub run { my $self = shift; ( $host, $port, $dbname, $user, $pass, my $speciesr, my $sourcesr, $checkdownload, $create, $release, $cleanup, $drop_db, $deletedownloaded, $dl_path, my $notsourcesr, $unzip, $stats, $verbose ) = @_; $base_dir = $dl_path if $dl_path; my @species = @$speciesr; my @sources = @$sourcesr; my @notsources = @$notsourcesr; my $sql_dir = dirname($0); if ($create) { create( $host, $port, $user, $pass, $dbname, $sql_dir, $drop_db ); } my $dbi = dbi(); my $sth_c = $dbi->prepare("insert into process_status (status, date) values('xref_created',now())"); $sth_c->execute; # validate species names my @species_ids = validate_species(@species); # validate source names exit(1) if ( !validate_sources(\@species_ids,@sources) ); exit(1) if ( !validate_sources(\@species_ids,@notsources) ); # build SQL my $species_sql = ""; if (@species_ids) { $species_sql .= " AND su.species_id IN ("; for ( my $i = 0 ; $i < @species_ids ; $i++ ) { $species_sql .= "," if ( $i ne 0 ); $species_sql .= $species_ids[$i]; } $species_sql .= ") "; } my $source_sql = ""; if (@sources) { $source_sql .= " AND LOWER(s.name) IN ("; for ( my $i = 0 ; $i < @sources ; $i++ ) { $source_sql .= "," if ( $i ne 0 ); $source_sql .= "\'" . lc( $sources[$i] ) . "\'"; } $source_sql .= ") "; } if (@notsources) { $source_sql .= " AND LOWER(s.name) NOT IN ("; for ( my $i = 0 ; $i < @notsources ; $i++ ) { $source_sql .= "," if ( $i ne 0 ); $source_sql .= "\'" . lc( $notsources[$i] ) . "\'"; } $source_sql .= ") "; } my $sql = "SELECT DISTINCT(s.source_id), su.source_url_id, s.name, su.url, " . "su.release_url, su.checksum, su.parser, su.species_id " . "FROM source s, source_url su, species sp " . "WHERE s.download='Y' AND su.source_id=s.source_id " . "AND su.species_id=sp.species_id " . $source_sql . $species_sql . "ORDER BY s.ordered"; #print $sql . "\n"; my $sth = $dbi->prepare("insert into process_status (status, date) values('parsing_started',now())"); $sth->execute; $sth = $dbi->prepare($sql); $sth->execute(); my ( $source_id, $source_url_id, $name, $url, $release_url, $checksum, $parser, $species_id ); $sth->bind_columns( \$source_id, \$source_url_id, \$name, \$url, \$release_url, \$checksum, \$parser, \$species_id ); my $dir; my %summary = (); my %sum_xrefs; my %sum_prim; my %sum_dep; my %sum_dir; my %sum_coord; my %sum_list; my %sum_syn; while ( my @row = $sth->fetchrow_array() ) { print '-' x 4, "{ $name }", '-' x ( 72 - length($name) ), "\n" if ($verbose); my $cs; my $file_cs = ""; my $parse = 0; my $empty = 0; my $type = $name; my $dsn; my @files = split( /\s+/, $url ); my @files_to_parse = (); $dir = catdir( $base_dir, sanitise($type) ); # For summary purposes: If 0 is returned (in # $summary{$name}->{$parser}) then it is successful. If 1 is # returned then it failed. If undef/nothing is returned the we # do not know. $summary{$name}->{$parser} = 0; @files = $self->fetch_files( $dir, @files ); if ( !@files ) { # Fetching failed. ++$summary{$name}->{$parser}; next; } if ( defined($release_url) ) { $release_url = $self->fetch_files( $dir, $release_url )->[-1]; } foreach my $file (@files) { # Database parsing if ( $file =~ /^mysql:/i ) { $dsn = $file; print "Parsing $dsn with $parser\n" if ($verbose); eval "require XrefParser::$parser"; my $new = "XrefParser::$parser"->new(); if ( $new->run( $dsn, $source_id, $species_id, $name, undef, $verbose ) ) { ++$summary{$name}->{$parser}; } next; } if ( $file =~ /^script:/i ) { print "Parsing $file with $parser\n" if ($verbose); eval "require XrefParser::$parser"; my $new = "XrefParser::$parser"->new(); if ( $new->run_script( $file, $source_id, $species_id, $verbose ) ) { ++$summary{$name}->{$parser}; } next; } if ( $unzip && ( $file =~ /\.(gz|Z)$/ ) ) { printf( "Uncompressing '%s' using 'gunzip'\n", $file ) if ($verbose); system( "gunzip", "-f", $file ); } if ($unzip) { $file =~ s/\.(gz|Z)$// } # Compare checksums and parse/upload if necessary need to # check file size as some .SPC files can be of zero length if ( !defined( $cs = md5sum($file) ) ) { printf( "Download '%s'\n", $file ) if($verbose); ++$summary{$name}->{$parser}; } else { $file_cs .= ':' . $cs; if ( !defined $checksum || index( $checksum, $file_cs ) == -1 ) { if ( -s $file ) { $parse = 1; print "Checksum for '$file' does not match, " . "will parse...\n" if ($verbose); # Files from sources "Uniprot/SWISSPROT" and # "Uniprot/SPTREMBL" are all parsed with the # same parser if ( $parser eq "Uniprot/SWISSPROT" || $parser eq "Uniprot/SPTREMBL" ) { $parser = 'UniProtParser'; } } else { $empty = 1; printf( "The file '%s' has zero length, skipping\n", $file ) if ($verbose); } } } ## end else [ if ( !defined( $cs = md5sum... # Push this file to the list of files to parsed. The files # are *actually* parsed only if $parse == 1. push @files_to_parse, $file; } ## end foreach my $file (@files) if ( $parse and @files_to_parse and defined $file_cs ) { print "Parsing '" . join( "', '", @files_to_parse ) . "' with $parser\n" if ($verbose); eval "require XrefParser::$parser"; $@ && warn( "[ERROR] Cannot require $parser: $@" ); my $new = "XrefParser::$parser"->new(); if ( defined $release_url ) { # Run with $release_url. if ( $new->run( $source_id, $species_id, \@files_to_parse, $release_url, $verbose ) ) { ++$summary{$name}->{$parser}; } } else { # Run without $release_url. if ( $new->run( $source_id, $species_id, \@files_to_parse, undef, $verbose ) ) { ++$summary{$name}->{$parser}; } } # update AFTER processing in case of crash. update_source( $dbi, $source_url_id, $file_cs, $files_to_parse[0] ); # Set release if specified if ( defined $release ) { $self->set_release( $source_id, $release ); } } elsif ( !$dsn && !$empty && @files_to_parse ) { print( "Ignoring '" . join( "', '", @files_to_parse ) . "' as checksums match\n" ) if ($verbose); } if ($cleanup) { foreach my $file (@files_to_parse) { printf( "Deleting '%s'\n", $file ) if($verbose); unlink($file); } } if($stats){ # produce summary of what has been added my %sum_line; # first the number of xrefs; my $group_sql = "SELECT count(*), s.name from source s, xref x where s.source_id = x.source_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); my ($sum_count, $sum_name); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ if(defined($sum_xrefs{$sum_name})){ if($sum_count != $sum_xrefs{$sum_name}){ my $diff = ($sum_count - $sum_xrefs{$sum_name}); $sum_line{$sum_name} = [$diff, 0, 0, 0, 0, 0]; } # else{ # $sum_line{$sum_name} = [0, 0, 0, 0, 0, 0]; # } } else{ $sum_line{$sum_name} = [$sum_count, 0, 0, 0, 0, 0]; } $sum_xrefs{$sum_name} = $sum_count; } $sum_sth->finish; # second the number of primary xrefs $group_sql = "SELECT count(*), s.name from source s, primary_xref px, xref x where s.source_id = x.source_id and px.xref_id = x.xref_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ if($sum_count != $sum_prim{$sum_name}){ my $diff = ($sum_count - $sum_prim{$sum_name}); $sum_line{$sum_name}[1] = $diff; } $sum_prim{$sum_name} = $sum_count; } $sum_sth->finish; # third the number of dependent xrefs $group_sql = "SELECT count(*), s.name from source s, dependent_xref dx, xref x where s.source_id = x.source_id and dx.dependent_xref_id = x.xref_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ if($sum_count != $sum_dep{$sum_name}){ my $diff = ($sum_count - $sum_dep{$sum_name}); $sum_line{$sum_name}[2] = $diff; } $sum_dep{$sum_name} = $sum_count; } $sum_sth->finish; # fourth the number of direct xrefs $group_sql = "SELECT count(*), s.name from source s, direct_xref dx, xref x where s.source_id = x.source_id and dx.general_xref_id = x.xref_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ if($sum_count != $sum_dir{$sum_name}){ my $diff = ($sum_count - $sum_dir{$sum_name}); $sum_line{$sum_name}[3] = $diff; } $sum_dir{$sum_name} = $sum_count; } $sum_sth->finish; # fifth the number of coordinate xrefs $group_sql = "SELECT count(*), s.name from source s, coordinate_xref cx where s.source_id = cx.source_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ if($sum_count != $sum_coord{$sum_name}){ my $diff = ($sum_count - $sum_coord{$sum_name}); $sum_line{$sum_name}[4] = $diff; } $sum_coord{$sum_name} = $sum_count; } $sum_sth->finish; # sixth the number of synonyms $group_sql = "select count(*), s.name from source s, xref x, synonym o where s.source_id = x.source_id and x.xref_id = o.xref_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ if($sum_count != $sum_syn{$sum_name}){ my $diff = ($sum_count - $sum_syn{$sum_name}); $sum_line{$sum_name}[5] = $diff; } $sum_syn{$sum_name} = $sum_count; } $sum_sth->finish; print "source xrefs\tprim\tdep\tdir\tcoord\tsynonyms\n"; foreach my $sum_name (keys %sum_line){ printf ("%-28s",$sum_name); print join("\t",@{$sum_line{$sum_name}})."\n"; } } # if ($stats) } ## end while ( my @row = $sth->fetchrow_array... print "\n", '=' x 80, "\n"; print "Summary of status\n"; print '=' x 80, "\n"; foreach my $source_name ( sort keys %summary ) { foreach my $parser_name ( keys %{ $summary{$source_name} } ) { printf( "%30s %-20s\t%s\n", $source_name, $parser_name, ( defined $summary{$source_name}->{$parser_name} && $summary{$source_name}->{$parser_name} ? 'FAILED' : 'OKAY' ) ); } } if($stats){ my %sum_line; # first the number of xrefs; my $group_sql = "SELECT count(*), s.name from source s, xref x where s.source_id = x.source_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); my ($sum_count, $sum_name); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ $sum_line{$sum_name} = [$sum_count, 0, 0, 0, 0, 0]; } $sum_sth->finish; # second the number of primary xrefs $group_sql = "SELECT count(*), s.name from source s, primary_xref px, xref x where s.source_id = x.source_id and px.xref_id = x.xref_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ $sum_line{$sum_name}[1] = $sum_count; } $sum_sth->finish; # third the number of dependent xrefs $group_sql = "SELECT count(*), s.name from source s, dependent_xref dx, xref x where s.source_id = x.source_id and dx.dependent_xref_id = x.xref_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ $sum_line{$sum_name}[2] = $sum_count; } # fourth the number of direct xrefs $group_sql = "SELECT count(*), s.name from source s, direct_xref dx, xref x where s.source_id = x.source_id and dx.general_xref_id = x.xref_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ $sum_line{$sum_name}[3] = $sum_count; } $sum_sth->finish; # fifth the number of coordinate xrefs $group_sql = "SELECT count(*), s.name from source s, coordinate_xref cx where s.source_id = cx.source_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ $sum_line{$sum_name}[4] = $sum_count; } $sum_sth->finish; # sixth the number of synonyms $group_sql = "select count(*), s.name from source s, xref x, synonym o where s.source_id = x.source_id and x.xref_id = o.xref_id group by s.name"; my $sum_sth = $dbi->prepare($group_sql); $sum_sth->execute(); $sum_sth->bind_columns(\$sum_count, \$sum_name); while($sum_sth->fetch){ $sum_line{$sum_name}[5] = $sum_count; } print "---------------------------------------------------------------------------------------\n"; print "TOTAL source xrefs\tprim\tdep\tdir\tcoord\tsynonyms\n"; foreach my $sum_name (keys %sum_line){ printf ("%-28s",$sum_name); print join("\t",@{$sum_line{$sum_name}})."\n"; } } $sth = $dbi->prepare("insert into process_status (status, date) values('parsing_finished',now())"); $sth->execute; # remove last working directory # TODO reinstate after debugging #rmtree $dir; } ## end sub run # ------------------------------------------------------------------------------ # Given one or several FTP or HTTP URIs, download them. If an URI is # for a file or MySQL connection, then these will be ignored. For # FTP, standard shell file name globbing is allowed (but not regular # expressions). HTTP does not allow file name globbing. The routine # returns a list of successfully downloaded local files or an empty list # if there was an error. sub fetch_files { my $self = shift; my ( $dest_dir, @user_uris ) = @_; my @processed_files; foreach my $user_uri (@user_uris) { # Change old-style 'LOCAL:' URIs into 'file:'. $user_uri =~ s#^LOCAL:#file:#i; my $uri = URI->new($user_uri); # print "\n*******$user_uri\n*********\n"; if ( $uri->scheme() eq 'script'){ push( @processed_files, $user_uri ); }elsif ( $uri->scheme() eq 'file' ) { # Deal with local files. my @local_files; $user_uri =~ s/file://; if ( -f $user_uri ) { push( @processed_files, $user_uri ); } else { printf( "==> Can not find file '%s'\n", $user_uri ); return (); } } elsif ( $uri->scheme() eq 'ftp' ) { # Deal with FTP files. my $file_path = catfile( $dest_dir, basename( $uri->path() ) ); if ( $deletedownloaded && -f $file_path ) { printf( "Deleting '%s'\n", $file_path ) if ($verbose); unlink($file_path); } if ( $checkdownload && -f $file_path ) { # The file is already there, no need to connect to a FTP # server. This also means no file name globbing was # used (for globbing FTP URIs, we always need to connect # to a FTP site to see what files are there). printf( "File '%s' already exists\n", $file_path ) if ($verbose); push( @processed_files, $file_path ); next; } printf( "Connecting to FTP host '%s'\n", $uri->host() ) if ($verbose); my $ftp = Net::FTP->new( $uri->host(), 'Debug' => 0 ); if ( !defined($ftp) ) { printf( "==> Can not open FTP connection: %s\n", $ftp->message() ); return (); } if ( !$ftp->login( 'anonymous', '-anonymous@' ) ) { printf( "==> Can not log in on FTP host: %s\n", $ftp->message() ); return (); } if ( !$ftp->cwd( dirname( $uri->path() ) ) ) { printf( "== Can not change directory to '%s': %s\n", dirname( $uri->path() ), $ftp->message() ); return (); } $ftp->binary(); foreach my $remote_file ( ( @{ $ftp->ls() } ) ) { if ( !match_glob( basename( $uri->path() ), $remote_file ) ) { next; } $file_path = catfile( $dest_dir, basename($remote_file) ); if ( $deletedownloaded && -f $file_path ) { printf( "Deleting '%s'\n", $file_path ) if($verbose); unlink($file_path); } if ( $checkdownload && -f $file_path ) { printf( "File '%s' already exists\n", $file_path ) if ($verbose); } else { if ( !-d dirname($file_path) ) { printf( "Creating directory '%s'\n", dirname($file_path) ) if($verbose); if ( !mkdir( dirname($file_path) ) ) { printf( "==> Can not create directory '%s': %s", dirname($file_path), $! ); return (); } } printf( "Fetching '%s' (size = %s)\n", $remote_file, $ftp->size($remote_file) || '(unknown)' ) if ($verbose); printf( "Local file is '%s'\n", $file_path ) if($verbose); if ( !$ftp->get( $remote_file, $file_path ) ) { printf( "==> Could not get '%s': %s\n", basename( $uri->path() ), $ftp->message() ); return (); } } ## end else [ if ( $checkdownload &&... push( @processed_files, $file_path ); } ## end foreach my $remote_file ( (... } elsif ( $uri->scheme() eq 'http' ) { # Deal with HTTP files. my $file_path = catfile( $dest_dir, basename( $uri->path() ) ); if ( $deletedownloaded && -f $file_path ) { printf( "Deleting '%s'\n", $file_path ) if($verbose); unlink($file_path); } if ( $checkdownload && -f $file_path ) { # The file is already there, no need to connect to a # HTTP server. printf( "File '%s' already exists\n", $file_path ) if ($verbose); push( @processed_files, $file_path ); next; } if ( !-d dirname($file_path) ) { printf( "Creating directory '%s'\n", dirname($file_path) ) if($verbose); if ( !mkdir( dirname($file_path) ) ) { printf( "==> Can not create directory '%s': %s", dirname($file_path), $! ); return (); } } printf( "Connecting to HTTP host '%s'\n", $uri->host() ) if($verbose); printf( "Fetching '%s'\n", $uri->path() ) if($verbose); if ( $checkdownload && -f $file_path ) { printf( "File '%s' already exists\n", $file_path ) if($verbose); } else { printf( "Local file is '%s'\n", $file_path ) if($verbose); my $ua = LWP::UserAgent->new(); $ua->env_proxy(); my $response = $ua->get( $uri->as_string(), ':content_file' => $file_path ); if ( !$response->is_success() ) { printf( "==> Could not get '%s': %s\n", basename( $uri->path() ), $response->content() ); return (); } } push( @processed_files, $file_path ); } elsif ( $uri->scheme() eq 'mysql' ) { # Just leave MySQL data untouched for now. push( @processed_files, $user_uri ); } else { printf( "==> Unknown URI scheme '%s' in URI '%s'\n", $uri->scheme(), $uri->as_string() ); return (); } } ## end foreach my $user_uri (@user_uris) return ( wantarray() ? @processed_files : \@processed_files ); } ## end sub fetch_files # Given a file name, returns a IO::Handle object. If the file is # gzipped, the handle will be to an unseekable stream coming out of a # zcat pipe. If the given file name doesn't correspond to an existing # file, the routine will try to add '.gz' to the file name or to remove # any .'Z' or '.gz' and try again. Returns undef on failure and will # write a warning to stderr. sub get_filehandle { my ($self, $file_name) = @_; my $io; my $alt_file_name = $file_name; $alt_file_name =~ s/\.(gz|Z)$//; if ( $alt_file_name eq $file_name ) { $alt_file_name .= '.gz'; } if ( !-f $file_name ) { carp( "File '$file_name' does not exist, " . "will try '$alt_file_name'" ); $file_name = $alt_file_name; } if ( $file_name =~ /\.(gz|Z)$/ ) { # Read from zcat pipe $io = IO::File->new("zcat $file_name |") or carp("Can not open file '$file_name' with 'zcat'"); } else { # Read file normally $io = IO::File->new($file_name) or carp("Can not open file '$file_name'"); } if ( !defined $io ) { return undef } print "Reading from '$file_name'...\n" if($verbose); return $io; } # ------------------------------------------------------------------------------ sub new { my ($proto) = @_; my $class = ref $proto || $proto; return bless {}, $class; } # -------------------------------------------------------------------------------- # Get source ID for a particular file; matches url field sub get_source_id_for_filename { my ($self, $file) = @_; print "FILE $file\n" if($verbose) ; my $sql = "SELECT s.source_id FROM source s, source_url su WHERE su.source_id=s.source_id AND su.url LIKE '%/" . $file . "%'"; my $sth = dbi()->prepare($sql); $sth->execute(); my @row = $sth->fetchrow_array(); my $source_id; if (@row) { $source_id = $row[0]; } else { if($file =~ /rna.fna/ or $file =~ /gpff/){ $source_id = 3; }else{ warn("Couldn't get source ID for file $file\n"); $source_id = -1; } } return $source_id; } sub rename_url_file{ return undef; } # Get species ID for a particular file; matches url field sub get_species_id_for_filename { my ($self, $file) = @_; my $sql = "SELECT su.species_id FROM source_url su WHERE su.url LIKE '%/" . $file . "%'"; my $sth = dbi()->prepare($sql); $sth->execute(); my @row = $sth->fetchrow_array(); my $source_id; if (@row) { $source_id = $row[0]; } else { warn("Couldn't get species ID for file $file\n"); $source_id = -1; } return $source_id; } # -------------------------------------------------------------------------------- # Get source ID for a particular source name sub get_source_id_for_source_name { my ($self, $source_name,$priority_desc) = @_; my $sql = "SELECT source_id FROM source WHERE LOWER(name)='" . lc($source_name) . "'"; if(defined($priority_desc)){ $sql .= " AND LOWER(priority_description)='".lc($priority_desc)."'"; } my $sth = dbi()->prepare($sql); $sth->execute(); my @row = $sth->fetchrow_array(); my $source_id; if (@row) { $source_id = $row[0]; } else { print STDERR "WARNING: There is no entity $source_name in the source-table of the xref database.\n" . "WARNING:. The external db name ($source_name) is hardcoded in the parser\n"; warn("WARNING: Couldn't get source ID for source name $source_name\n"); $source_id = -1; } return $source_id; } # -------------------------------------------------------------------------------- # Get a set of source IDs matching a source name pattern sub get_source_ids_for_source_name_pattern { my ($self, $source_name) = @_; my $sql = "SELECT source_id FROM source WHERE upper(name) LIKE '%".uc($source_name)."%'"; my $sth = dbi()->prepare($sql); my @sources; $sth->execute(); while(my @row = $sth->fetchrow_array()){ push @sources,$row[0]; } $sth->finish; return @sources; } sub get_source_name_for_source_id { my ($self, $source_id) = @_; my $source_name; my $sql = "SELECT name FROM source WHERE source_id= '" . $source_id. "'"; my $sth = dbi()->prepare($sql); $sth->execute(); my @row = $sth->fetchrow_array(); if (@row) { $source_name = $row[0]; } else { print STDERR "WARNING: There is no entity with source-id $source_id in the source-table of the \n" . "WARNING: xref-database. The source-id and the name of the source-id is hard-coded in populate_metadata.sql\n" . "WARNING: and in the parser\n"; warn("WARNING: Couldn't get source name for source ID $source_id\n"); $source_name = -1; } return $source_name; } sub get_valid_xrefs_for_dependencies{ my ($self, $dependent_name, @reverse_ordered_source_list) = @_; my %dependent_2_xref; my $sql = "select source_id from source where LOWER(name) =?"; my $sth = dbi()->prepare($sql); my @dependent_sources; $sth->execute(lc($dependent_name)); while(my @row = $sth->fetchrow_array()){ push @dependent_sources,$row[0]; } my @sources; foreach my $name (@reverse_ordered_source_list){ $sth->execute(lc($name)); while(my @row = $sth->fetchrow_array()){ push @sources,$row[0]; } } $sth->finish; $sql = "select d.master_xref_id, x2.accession "; $sql .= " from dependent_xref d, xref x1, xref x2 "; $sql .= " where x1.xref_id = d.master_xref_id and"; $sql .= " x1.source_id=? and "; $sql .= " x2.xref_id = d.dependent_xref_id and"; $sql .= " x2.source_id=? "; $sth = dbi()->prepare($sql); foreach my $d (@dependent_sources){ foreach my $s (@sources){ $sth->execute($s,$d); while(my @row = $sth->fetchrow_array()){ $dependent_2_xref{$row[1]} = $row[0]; } } } return \%dependent_2_xref; } sub get_valid_xrefs_for_direct_xrefs{ my ($self, $direct_name, @list) = @_; my %direct_2_xref; my $sql = "select source_id from source where name like ?"; my $sth = dbi()->prepare($sql); my @direct_sources; $sth->execute($direct_name."%"); while(my @row = $sth->fetchrow_array()){ push @direct_sources,$row[0]; } my @sources; foreach my $name (@list){ $sth->execute($name); while(my @row = $sth->fetchrow_array()){ push @sources,$row[0]; } } $sth->finish; $sql = "select d.general_xref_id, d.ensembl_stable_id, d.type, d.linkage_xref, x1.accession "; $sql .= " from direct_xref d, xref x1 "; $sql .= " where x1.xref_id = d.general_xref_id and"; $sql .= " x1.source_id=?"; $sth = dbi()->prepare($sql); foreach my $d (@direct_sources){ $sth->execute($d); while(my @row = $sth->fetchrow_array()){ $direct_2_xref{$row[4]} = $row[0]."::".$row[1]."::".$row[2]."::".$row[3]; } } return \%direct_2_xref; } sub label_to_acc{ my ($self,$source_name,$species_id) =@_; # First cache synonyms so we can quickly add them later my %synonyms; my $syn_sth = dbi()->prepare("SELECT xref_id, synonym FROM synonym"); $syn_sth->execute(); my ($xref_id, $synonym); $syn_sth->bind_columns(\$xref_id, \$synonym); while ($syn_sth->fetch()) { push @{$synonyms{$xref_id}}, $synonym; } my %valid_codes; my @sources; my $sql = "select source_id from source where upper(name) like '%".uc($source_name)."%'"; my $sth = dbi()->prepare($sql); $sth->execute(); while(my @row = $sth->fetchrow_array()){ push @sources,$row[0]; } $sth->finish; foreach my $source (@sources){ $sql = "select label, xref_id from xref where species_id = $species_id and source_id = $source"; my $sth = dbi()->prepare($sql); $sth->execute(); while(my @row = $sth->fetchrow_array()){ $valid_codes{$row[0]} =$row[1]; # add any synonyms for this xref as well foreach my $syn (@{$synonyms{$row[1]}}) { $valid_codes{$syn} = $row[1]; } } } return \%valid_codes; } sub get_valid_codes{ my ($self,$source_name,$species_id) =@_; # First cache synonyms so we can quickly add them later my %synonyms; my $syn_sth = dbi()->prepare("SELECT xref_id, synonym FROM synonym"); $syn_sth->execute(); my ($xref_id, $synonym); $syn_sth->bind_columns(\$xref_id, \$synonym); while ($syn_sth->fetch()) { push @{$synonyms{$xref_id}}, $synonym; } my %valid_codes; my @sources; my $sql = "select source_id from source where upper(name) like '%".uc($source_name)."%'"; my $sth = dbi()->prepare($sql); $sth->execute(); while(my @row = $sth->fetchrow_array()){ push @sources,$row[0]; } $sth->finish; foreach my $source (@sources){ $sql = "select accession, xref_id from xref where species_id = $species_id and source_id = $source"; my $sth = dbi()->prepare($sql); $sth->execute(); while(my @row = $sth->fetchrow_array()){ $valid_codes{$row[0]} =$row[1]; # add any synonyms for this xref as well foreach my $syn (@{$synonyms{$row[1]}}) { $valid_codes{$syn} = $row[1]; } } } return \%valid_codes; } # -------------------------------------------------------------------------------- # -------------------------------------------------------------------------------- sub get_existing_mappings { my ($self, $from_source_name, $to_source_name, $species_id) =@_; my %mappings; my $from_source = $self->get_source_id_for_source_name($from_source_name); my $to_source = $self->get_source_id_for_source_name($to_source_name); my $sql = "SELECT dx.dependent_xref_id, x1.accession as dependent, dx.master_xref_id, x2.accession as master FROM dependent_xref dx, xref x1, xref x2 WHERE x1.xref_id=dx.dependent_xref_id AND x2.xref_id=dx.master_xref_id AND x2.source_id=? AND x1.source_id=? AND x1.species_id=? AND x2.species_id=?"; my $sth = dbi()->prepare($sql); $sth->execute($to_source, $from_source, $species_id, $species_id); while(my @row = $sth->fetchrow_array()){ $mappings{$row[1]} = $row[2]; #print "mgi_to_uniprot{" . $row[1] . "} = " . $row[2] . "\n"; } print "Got " . scalar(keys(%mappings)) . " $from_source_name -> $to_source_name mappings\n" if($verbose); return \%mappings; } # -------------------------------------------------------------------------------- # Upload xrefs to the database sub upload_xref_object_graphs { my ($self, $rxrefs) = @_; my $dbi = dbi(); print "count = ".$#$rxrefs."\n" if($verbose); if ($#$rxrefs > -1) { # remove all existing xrefs with same source ID(s) # $self->delete_by_source($rxrefs); # upload new ones print "Uploading xrefs\n" if($verbose); my $xref_sth = $dbi->prepare("INSERT INTO xref (accession,version,label,description,source_id,species_id, info_type) VALUES(?,?,?,?,?,?,?)"); my $pri_insert_sth = $dbi->prepare("INSERT INTO primary_xref VALUES(?,?,?,?)"); my $pri_update_sth = $dbi->prepare("UPDATE primary_xref SET sequence=? WHERE xref_id=?"); my $syn_sth = $dbi->prepare("INSERT INTO synonym VALUES(?,?)"); my $dep_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id, dependent_xref_id, linkage_annotation, linkage_source_id) VALUES(?,?,?,?)"); my $xref_update_label_sth = $dbi->prepare("UPDATE xref SET label=? WHERE xref_id=?"); my $xref_update_descr_sth = $dbi->prepare("UPDATE xref SET description=? WHERE xref_id=?"); my $pair_sth = $dbi->prepare("INSERT INTO pairs VALUES(?,?,?)"); local $xref_sth->{RaiseError}; # disable error handling here as we'll do it ourselves local $xref_sth->{PrintError}; foreach my $xref (@{$rxrefs}) { my $xref_id=undef; if(!defined($xref->{ACCESSION})){ print "your xref does not have an accession-number,so it can't be stored in the database\n"; return undef; } # Create entry in xref table and note ID if(! $xref_sth->execute($xref->{ACCESSION}, $xref->{VERSION} || 0, $xref->{LABEL}|| $xref->{ACCESSION}, $xref->{DESCRIPTION}, $xref->{SOURCE_ID}, $xref->{SPECIES_ID}, $xref->{INFO_TYPE} || "MISC")){ if(!defined($xref->{SOURCE_ID})){ print "your xref: $xref->{ACCESSION} does not have a source-id\n"; return undef; } $xref_id = $self->insert_or_select($xref_sth, $dbi->err, $xref->{ACCESSION}, $xref->{SOURCE_ID}, $xref->{SPECIES_ID}); $xref_update_label_sth->execute($xref->{LABEL},$xref_id) if (defined($xref->{LABEL})); $xref_update_descr_sth->execute($xref->{DESCRIPTION},$xref_id,) if (defined($xref->{DESCRIPTION})); } else{ $xref_id = $self->insert_or_select($xref_sth, $dbi->err, $xref->{ACCESSION}, $xref->{SOURCE_ID}, $xref->{SPECIES_ID}); } # create entry in primary_xref table with sequence; if this is a "cumulative" # entry it may already exist, and require an UPDATE rather than an INSERT if(defined($xref->{SEQUENCE})){ if(!(defined($xref_id) and $xref_id)){ print STDERR "xref_id is not set for :\n$xref->{ACCESSION}\n$xref->{LABEL}\n$xref->{DESCRIPTION}\n$xref->{SOURCE_ID}\n$xref->{SPECIES_ID}\n"; } if ( primary_xref_id_exists($xref_id) ) { $pri_update_sth->execute( $xref->{SEQUENCE}, $xref_id ) or croak( $dbi->errstr() ); } else { $pri_insert_sth->execute( $xref_id, $xref->{SEQUENCE}, $xref->{SEQUENCE_TYPE}, $xref->{STATUS} ) or croak( $dbi->errstr() ); } } # if there are synonyms, add entries in the synonym table foreach my $syn ( @{ $xref->{SYNONYMS} } ) { $syn_sth->execute( $xref_id, $syn ) or croak( $dbi->errstr() . "\n $xref_id\n $syn\n" ); } # foreach syn # if there are dependent xrefs, add xrefs and dependent xrefs for them foreach my $depref (@{$xref->{DEPENDENT_XREFS}}) { my %dep = %$depref; $xref_sth->execute($dep{ACCESSION}, $dep{VERSION} || 0, $dep{LABEL} || $dep{ACCESSION}, $dep{DESCRIPTION} || "", $dep{SOURCE_ID}, $xref->{SPECIES_ID}, "DEPENDENT"); my $dep_xref_id = $self->insert_or_select($xref_sth, $dbi->err, $dep{ACCESSION}, $dep{SOURCE_ID}, $xref->{SPECIES_ID}); if($dbi->err){ print STDERR "dbi\t$dbi->err \n$dep{ACCESSION} \n $dep{SOURCE_ID} \n"; } if(!defined($dep_xref_id) || $dep_xref_id ==0 ){ print STDERR "acc = $dep{ACCESSION} \nlink = $dep{LINKAGE_SOURCE_ID} \n".$dbi->err."\n"; print STDERR "source = $dep{SOURCE_ID}\n"; } $dep_sth->execute( $xref_id, $dep_xref_id, $dep{LINKAGE_ANNOTATION}, $dep{LINKAGE_SOURCE_ID} ) or croak( $dbi->errstr() ); } # foreach dep if(defined($xref_id) and defined($xref->{PAIR})){ $pair_sth->execute($xref->{SOURCE_ID},$xref->{ACCESSION},$xref->{PAIR}); } $xref_sth->finish() if defined $xref_sth; $pri_insert_sth->finish() if defined $pri_insert_sth; $pri_update_sth->finish() if defined $pri_update_sth; } # foreach xref } return 1; } sub upload_direct_xrefs{ my ($self, $direct_xref) = @_; for my $dr(@$direct_xref) { # print "having now direct-XREF : ".$dr->{ENSEMBL_STABLE_ID}."\t".$dr->{SPECIES_ID}." \n" ; my $general_xref_id = get_xref($dr->{ACCESSION},$dr->{SOURCE_ID},$dr->{SPECIES_ID}); if ($general_xref_id){ # print "direct_xref:\n$general_xref_id\n$dr->{ENSEMBL_STABLE_ID}\n$dr->{ENSEMBL_TYPE}\t$dr->{LINKAGE_XREF}\n\n"; $self->add_direct_xref($general_xref_id, $dr->{ENSEMBL_STABLE_ID},$dr->{ENSEMBL_TYPE},$dr->{LINKAGE_XREF}); } } } # -------------------------------------------------------------------------------- # Get & cache a hash of all the source names for dependent xrefs (those that are # in the source table but don't have an associated URL etc) sub get_dependent_xref_sources { my $self = shift; if (!%dependent_sources) { my $dbi = dbi(); my $sth = $dbi->prepare("SELECT name,source_id FROM source"); $sth->execute() or croak( $dbi->errstr() ); while(my @row = $sth->fetchrow_array()) { my $source_name = $row[0]; my $source_id = $row[1]; $dependent_sources{$source_name} = $source_id; } } return %dependent_sources; } # -------------------------------------------------------------------------------- # Get & cache a hash of all the species IDs & taxonomy IDs. sub taxonomy2species_id { warn( "[DEPRECATED] taxonomy2species_id is a deprecated (unsafe) method. ". "Please use species_id2taxonomy instead. Called by ". join( ', ', (caller(0))[1..2] ) ); my $self = shift; if (!%taxonomy2species_id) { my $dbi = dbi(); my $sth = $dbi->prepare("SELECT species_id, taxonomy_id FROM species"); $sth->execute() or croak( $dbi->errstr() ); while(my @row = $sth->fetchrow_array()) { my $species_id = $row[0]; my $taxonomy_id = $row[1]; if( my $ori =$taxonomy2species_id{$taxonomy_id} ){ die( "Taxon $taxonomy_id already used for species $ori. ". "Cannot assign to species $species_id as well. ". "Consider using the species_id2taxonomy call instead. ". "Called by ". join( ', ', (caller(0))[1..2] ) ); } $taxonomy2species_id{$taxonomy_id} = $species_id; } } return %taxonomy2species_id; } sub species_id2taxonomy { my $self = shift; if (!%species_id2taxonomy) { my $dbi = dbi(); my $sth = $dbi->prepare("SELECT species_id, taxonomy_id FROM species"); $sth->execute() or croak( $dbi->errstr() ); while(my @row = $sth->fetchrow_array()) { my $species_id = $row[0]; my $taxonomy_id = $row[1]; if(defined($species_id2taxonomy{$species_id})){ push @{$species_id2taxonomy{$species_id}}, $taxonomy_id; } else{ $species_id2taxonomy{$species_id} = [$taxonomy_id]; } } } return %species_id2taxonomy; } # -------------------------------------------------------------------------------- # Get & cache a hash of all the species IDs & species names. sub name2species_id { warn( "[DEPRECATED] name2species_id is a deprecated (unsafe) method. ". "Please use species_id2name instead. Called by ". join( ', ', (caller(0))[1..2] ) ); my $self = shift; if ( !%name2species_id ) { my $dbi = dbi(); my $sth = $dbi->prepare("SELECT species_id, name FROM species"); $sth->execute() or croak( $dbi->errstr() ); while ( my @row = $sth->fetchrow_array() ) { my $species_id = $row[0]; my $name = $row[1]; $name2species_id{$name} = $species_id; } # Also populate the hash with all the aliases. $sth = $dbi->prepare("SELECT species_id, aliases FROM species"); $sth->execute() or croak( $dbi->errstr() ); while ( my @row = $sth->fetchrow_array() ) { my $species_id = $row[0]; foreach my $name ( split /,\s*/, $row[1] ) { if ( my $ori = $name2species_id{$name} ) { die( "Name $name already used for species $ori. ". "Cannot assign to species $species_id as well. ". "Consider using the species_id2name call instead. ". "Called by ". join( ', ', (caller(0))[1..2] ) ); } else { $name2species_id{$name} = $species_id; } } } } ## end if ( !%name2species_id) return %name2species_id; } ## end sub name2species_id sub species_id2name { my $self = shift; if ( !%species_id2name ) { my $dbi = dbi(); my $sth = $dbi->prepare("SELECT species_id, name FROM species"); $sth->execute() or croak( $dbi->errstr() ); while ( my @row = $sth->fetchrow_array() ) { my $species_id = $row[0]; my $name = $row[1]; $species_id2name{$species_id} = [ $name ]; } # Also populate the hash with all the aliases. $sth = $dbi->prepare("SELECT species_id, aliases FROM species"); $sth->execute() or croak( $dbi->errstr() ); while ( my @row = $sth->fetchrow_array() ) { my $species_id = $row[0]; foreach my $name ( split /,\s*/, $row[1] ) { $species_id2name{$species_id} ||= []; push @{$species_id2name{$species_id}}, $name; } } } ## end if ( !%species_id2name) return %species_id2name; } ## end sub species_id2name # -------------------------------------------------------------------------------- # Update a row in the source table sub update_source { my ( $dbi, $source_url_id, $checksum, $file_name ) = @_; my $file = IO::File->new($file_name) or croak("Failed to open file '$file_name'"); my $file_date = POSIX::strftime( '%Y%m%d%H%M%S', localtime( [ $file->stat() ]->[9] ) ); $file->close(); my $sql = "UPDATE source_url SET checksum='$checksum', " . "file_modified_date='$file_date', " . "upload_date=NOW() " . "WHERE source_url_id=$source_url_id"; # The release is set by the individual parser by calling the # inherited set_release() method. $dbi->prepare($sql)->execute() || croak( $dbi->errstr() ); } # -------------------------------------------------------------------------------- sub dbi2{ my $self = shift; my ($host, $port, $user, $dbname, $pass) = @_; my $dbi2 = undef; if ( !defined $dbi2 || !$dbi2->ping() ) { my $connect_string = sprintf( "dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname ); $dbi2 = DBI->connect( $connect_string, $user, $pass, # { 'RaiseError' => 1 } ) ) or warn( "Can't connect to database: " . $DBI::errstr ) and return undef; $dbi2->{'mysql_auto_reconnect'} = 1; # Reconnect on timeout } return $dbi2; } # -------------------------------------------------------------------------------- sub dbi { my $self = shift; if ( !defined $dbi || !$dbi->ping() ) { my $connect_string = sprintf( "dbi:mysql:host=%s;port=%s;database=%s", $host, $port, $dbname ); $dbi = DBI->connect( $connect_string, $user, $pass, { 'RaiseError' => 1 } ) or croak( "Can't connect to database: " . $DBI::errstr ); $dbi->{'mysql_auto_reconnect'} = 1; # Reconnect on timeout } return $dbi; } # -------------------------------------------------------------------------------- # Compute a checksum of a file. This checksum is not a straight MD5 # hex digest, but instead the file size combined with the first six # characters of the MD5 hex digest. This is to save space. sub md5sum { my $file = shift; if ( !open( FILE, $file ) ) { return undef } binmode(FILE); my $checksum = sprintf( "%s/%d", substr( Digest::MD5->new()->addfile(*FILE)->hexdigest(), 0, 6 ), [ stat FILE ]->[7] ); close(FILE); return $checksum; } # -------------------------------------------------------------------------------- sub get_xref_id_by_accession_and_source_OLD { my ($acc, $source_id, $species_id ) = @_; my $dbi = dbi(); my $sql = ' SELECT xref_id FROM xref WHERE accession=? AND source_id=?'; if( $species_id ){ $sql .= ' AND species_id=?' } my $sth = $dbi->prepare( $sql ); $sth->execute( $acc, $source_id, ( $species_id ? $species_id : () ) ) or croak( $dbi->errstr() ); my @row = $sth->fetchrow_array(); my $xref_id = $row[0]; return $xref_id; } # -------------------------------------------------------------------------------- # If there was an error, an xref with the same acc & source already exists. # If so, find its ID, otherwise get ID of xref just inserted sub insert_or_select { my ($self, $sth, $error, $acc, $source, $species) = @_; my $id; # TODO - check for specific error code rather than for just any error if ($error) { $id = $self->get_xref($acc, $source, $species); } else { $id = $sth->{'mysql_insertid'}; } return $id; } # -------------------------------------------------------------------------------- sub primary_xref_id_exists { my $xref_id = shift; my $exists = 0; my $dbi = dbi(); my $sth = $dbi->prepare("SELECT xref_id FROM primary_xref WHERE xref_id=?"); $sth->execute($xref_id) or croak( $dbi->errstr() ); my @row = $sth->fetchrow_array(); my $result = $row[0]; $exists = 1 if (defined $result); return $exists; } # -------------------------------------------------------------------------------- # delete all xrefs & related objects sub delete_by_source { my $self =shift; my $xrefs = shift; # SQL for deleting stuff # Note this SQL only works on MySQL version 4 and above #Remove direct xrefsbased on source my $direct_sth = $dbi->prepare("DELETE FROM direct_xref USING xref, direct_xref WHERE xref.xref_id=direct_xref.general_xref_id AND xref.source_id=?"); #remove Pairs fro source my $pairs_sth = $dbi->prepare("DELETE FROM pairs WHERE source_id=?"); # Remove dependent_xrefs and synonyms based on source of *xref* my $syn_sth = $dbi->prepare("DELETE FROM synonym USING xref, synonym WHERE xref.xref_id=synonym.xref_id AND xref.source_id=?"); my $dep_sth = $dbi->prepare("DELETE FROM dependent_xref USING xref, dependent_xref WHERE xref.xref_id=dependent_xref.master_xref_id AND xref.source_id=?"); # xrefs and primary_xrefs are straightforward deletes my $xref_sth = $dbi->prepare("DELETE FROM xref, primary_xref USING xref, primary_xref WHERE source_id=? AND primary_xref.xref_id = xref.xref_id"); # my $p_xref_sth = $dbi->prepare("DELETE FROM primary_xref WHERE source_id=?"); # xrefs may come from more than one source (e.g. UniProt/SP/SPtr) # so find all sources first my %source_ids; foreach my $xref (@$xrefs) { my $xref_source = $xref->{SOURCE_ID}; $source_ids{$xref_source} = 1; } # now delete them foreach my $source (keys %source_ids) { print "Deleting pairs with source ID $source \n" if($verbose); $pairs_sth->execute($source); print "Deleting direct xrefs with source ID $source \n" if($verbose); $direct_sth->execute($source); print "Deleting synonyms of xrefs with source ID $source \n" if($verbose); $syn_sth->execute($source); print "Deleting dependent xrefs of xrefs with source ID $source \n" if($verbose); $dep_sth->execute($source); print "Deleting primary xrefs with source ID $source \n" if($verbose); # $p_xref_sth->execute($source); print "Deleting xrefs with source ID $source \n" if($verbose); $xref_sth->execute($source); } $syn_sth->finish() if defined $syn_sth; $dep_sth->finish() if defined $dep_sth; $xref_sth->finish() if defined $xref_sth; # $p_xref_sth->finish() if defined $p_xref_sth; } # -------------------------------------------------------------------------------- sub validate_sources { my ($speciesref, @sources) = @_; my $dbi = dbi(); my $sth = $dbi->prepare("SELECT * FROM source WHERE LOWER(name)=?"); foreach my $source (@sources) { my $rv = $sth->execute(lc($source)); if ( $rv > 0 ) { print "Source $source is valid\n" if($verbose); } else { print "\nSource $source is not valid; valid sources are:\n"; foreach my $sp (@{$speciesref}){ show_valid_sources($sp); } return 0; } } return 1; } # -------------------------------------------------------------------------------- sub show_valid_sources() { my $species = shift; my $dbi = dbi(); my $sth = $dbi->prepare("SELECT distinct(name) FROM source s, source_url su WHERE s.download='Y' and s.source_id = su.source_id and su.species_id = $species"); $sth->execute(); while (my @row = $sth->fetchrow_array()) { print $row[0] . "\n"; } } # -------------------------------------------------------------------------------- sub validate_species { my @species = @_; my @species_ids; my $dbi = dbi(); my $sth = $dbi->prepare("SELECT species_id, name FROM species WHERE LOWER(name)=? OR LOWER(aliases) REGEXP ?"); my ($species_id, $species_name); foreach my $sp (@species) { # $sth->execute(lc($sp), "%" . lc($sp) . "%"); # no longer allow % as this generates tomany possible errors $sth->execute(lc($sp), "^".lc($sp).",|[ ]".lc($sp)."[,]|^".lc($sp)."\$|[,] ".lc($sp)."\$" ); $sth->bind_columns(\$species_id, \$species_name); if (my @row = $sth->fetchrow_array()) { print "Species $sp is valid (name = " . $species_name . ", ID = " . $species_id . ")\n" if($verbose); push @species_ids, $species_id; } else { print STDERR "Species $sp is not valid; valid species are:\n"; show_valid_species(); exit(1); } } return @species_ids; } # -------------------------------------------------------------------------------- sub show_valid_species() { my $dbi = dbi(); my $sth = $dbi->prepare("SELECT name, aliases FROM species"); $sth->execute(); while (my @row = $sth->fetchrow_array()) { print STDERR $row[0] . " (aliases: " . $row[1] . ")\n"; } } sub get_taxonomy_from_species_id{ my ($self,$species_id) = @_; my %hash; my $dbi = dbi(); my $sth = $dbi->prepare("SELECT taxonomy_id FROM species WHERE species_id = $species_id"); $sth->execute() or croak( $dbi->errstr() ); while(my @row = $sth->fetchrow_array()) { $hash{$row[0]} = 1; } $sth->finish; return \%hash; } sub get_direct_xref{ my ($self,$stable_id,$type,$link) = @_; my $direct_sth; if(!defined($direct_sth)){ my $sql = "select general_xref_id from direct_xref d where ensembl_stable_id = ? and type = ? and linkage_xref= ?"; $direct_sth = $dbi->prepare($sql); } $direct_sth->execute( $stable_id, $type, $link ) or croak( $dbi->errstr() ); if(my @row = $direct_sth->fetchrow_array()) { return $row[0]; } return undef; } sub get_xref{ my ($self,$acc,$source, $species_id) = @_; if(!defined($get_xref_sth)){ my $sql = "select xref_id from xref where accession = ? and source_id = ? and species_id = ?"; $get_xref_sth = $dbi->prepare($sql); } $get_xref_sth->execute( $acc, $source, $species_id ) or croak( $dbi->errstr() ); if(my @row = $get_xref_sth->fetchrow_array()) { return $row[0]; } return undef; } sub add_xref { my ( $self, $acc, $version, $label, $description, $source_id, $species_id, $info_type ) = @_; my $xref_id = $self->get_xref($acc,$source_id, $species_id); if(defined($xref_id)){ return $xref_id; } if ( !defined($add_xref_sth) ) { $add_xref_sth = dbi->prepare( "INSERT INTO xref " . "(accession,version,label,description,source_id,species_id, info_type) " . "VALUES(?,?,?,?,?,?,?)" ); } # If the description is more than 255 characters, chop it off and add # an indication that it has been truncated to the end of it. if ( length($description) > 255 ) { my $truncmsg = ' /.../'; substr( $description, 255 - length($truncmsg), length($truncmsg), $truncmsg ); } $add_xref_sth->execute( $acc, $version || 0, $label, $description, $source_id, $species_id, $info_type ) or croak("$acc\t$label\t\t$source_id\t$species_id\n"); return $add_xref_sth->{'mysql_insertid'}; } ## end sub add_xref sub add_to_direct_xrefs{ my ($self,$direct_xref,$type, $acc,$version,$label,$description,$linkage,$source_id,$species_id) = @_; $direct_xref || die( "Need a direct_xref on which this xref linked too" ); $acc || die( "Need an accession of this dependent xref" ); $version ||= 0; $label ||= $acc; $description ||= undef; $linkage ||= undef; $source_id || die( "Need a source_id for this dependent xref" ); $species_id || die( "Need a species_id for this dependent xref" ); if(!defined($add_xref_sth)){ $add_xref_sth = dbi->prepare(" INSERT INTO xref (accession,version,label,description,source_id,species_id, info_type) VALUES (?,?,?,?,?,?,?)"); } my $direct_id = $self->get_xref($acc, $source_id, $species_id); if(!defined($direct_id)){ $add_xref_sth->execute( $acc, $version || 0, $label, $description, $source_id, $species_id, "DIRECT" ) or croak("$acc\t$label\t\t$source_id\t$species_id\n"); } $direct_id = $self->get_xref($acc, $source_id, $species_id); $self->add_direct_xref($direct_id, $direct_xref, $type, ""); } sub add_to_xrefs{ my ($self,$master_xref,$acc,$version,$label,$description,$linkage,$source_id,$species_id) = @_; $master_xref || die( "Need a master_xref_id on which this xref depends" ); $acc || die( "Need an accession of this dependent xref" ); $version ||= 0; $label ||= $acc; $description ||= undef; $linkage ||= undef; $source_id || die( "Need a source_id for this dependent xref" ); $species_id || die( "Need a species_id for this dependent xref" ); if(!defined($add_xref_sth)){ $add_xref_sth = dbi->prepare(" INSERT INTO xref (accession,version,label,description,source_id,species_id, info_type) VALUES (?,?,?,?,?,?,?)"); } if(!defined($add_dependent_xref_sth)){ $add_dependent_xref_sth = dbi->prepare(" INSERT INTO dependent_xref (master_xref_id,dependent_xref_id,linkage_annotation,linkage_source_id) VALUES (?,?,?,?)"); } my $dependent_id = $self->get_xref($acc, $source_id, $species_id); if(!defined($dependent_id)){ $add_xref_sth->execute( $acc, $version || 0, $label, $description, $source_id, $species_id, "DEPENDENT" ) or croak("$acc\t$label\t\t$source_id\t$species_id\n"); } $dependent_id = $self->get_xref($acc, $source_id, $species_id); if(!defined($dependent_id)){ croak("$acc\t$label\t\t$source_id\t$species_id\n"); } if(!defined($xref_dependent_mapped{$master_xref."|".$dependent_id})){ $add_dependent_xref_sth->execute( $master_xref, $dependent_id, $linkage, $source_id ) or croak("$master_xref\t$dependent_id\t$linkage\t$source_id"); $xref_dependent_mapped{$master_xref."|".$dependent_id} = 1; } } sub add_to_syn_for_mult_sources{ my ($self, $acc, $sources, $syn, $species_id) = @_; if(!defined($add_synonym_sth)){ $add_synonym_sth = $dbi->prepare("INSERT INTO synonym VALUES(?,?)"); } my $found =0; foreach my $source_id (@$sources){ my $xref_id = $self->get_xref($acc, $source_id, $species_id); if(defined($xref_id)){ $add_synonym_sth->execute( $xref_id, $syn ) or croak( $dbi->errstr() . "\n $xref_id\n $syn\n" ); $found = 1; } } } sub add_to_syn{ my ($self, $acc, $source_id, $syn, $species_id) = @_; if(!defined($add_synonym_sth)){ $add_synonym_sth = $dbi->prepare("INSERT INTO synonym VALUES(?,?)"); } my $xref_id = $self->get_xref($acc, $source_id, $species_id); if(defined($xref_id)){ $add_synonym_sth->execute( $xref_id, $syn ) or croak( $dbi->errstr() . "\n $xref_id\n $syn\n" ); } else { croak( "Could not find acc $acc in " . "xref table source = $source_id of species $species_id\n" ); } } sub add_synonym{ my ($self, $xref_id, $syn) = @_; my $add_synonym_sth; if(!defined($add_synonym_sth)){ $add_synonym_sth = $dbi->prepare("INSERT INTO synonym VALUES(?,?)"); } $add_synonym_sth->execute( $xref_id, $syn ) or croak( $dbi->errstr() . "\n $xref_id\n $syn\n" ); } # -------------------------------------------------------------------------------- # Add a single record to the direct_xref table. # Note that an xref must already have been added to the xref table (xref_id passed as 1st arg) sub add_direct_xref { my ($self, $general_xref_id, $ensembl_stable_id, $ensembl_type, $linkage_type) = @_; if (!defined($add_direct_xref_sth{$ensembl_type})){ my $add_gene_direct_xref_sth = dbi->prepare("INSERT INTO gene_direct_xref VALUES(?,?,?)"); my $add_tr_direct_xref_sth = dbi->prepare("INSERT INTO transcript_direct_xref VALUES(?,?,?)"); my $add_tl_direct_xref_sth = dbi->prepare("INSERT INTO translation_direct_xref VALUES(?,?,?)"); $add_direct_xref_sth{"gene"} = $add_gene_direct_xref_sth; $add_direct_xref_sth{"transcript"} = $add_tr_direct_xref_sth; $add_direct_xref_sth{"translation"} = $add_tl_direct_xref_sth; $add_direct_xref_sth{"Gene"} = $add_gene_direct_xref_sth; $add_direct_xref_sth{"Transcript"} = $add_tr_direct_xref_sth; $add_direct_xref_sth{"Translation"} = $add_tl_direct_xref_sth; } if(!defined($add_direct_xref_sth{$ensembl_type})){ print "ERROR add_direct_xref_sth does not exist for $ensembl_type ???\n"; } else{ $add_direct_xref_sth{$ensembl_type}->execute($general_xref_id, $ensembl_stable_id, $linkage_type); } } # ------------------------------------------------------------------------------ # Remove potentially problematic characters from string used as file or # directory names. sub sanitise { my $str = shift; $str =~ tr[/:][]d; return $str; } # ------------------------------------------------------------------------------ # Create database if required. Assumes sql/table.sql and sql/populate_metadata.sql # are present. sub create { my ( $host, $port, $user, $pass, $dbname, $sql_dir, $drop_db ) = @_; my $dbh = DBI->connect( "DBI:mysql:host=$host:port=$port", $user, $pass, {'RaiseError' => 1}); my $metadata_file = catfile( $sql_dir, 'sql', 'populate_metadata.sql' ); my $ini_file = catfile( $sql_dir, 'xref_config.ini' ); $| = 1; # flush stdout # Figure out whether to run 'xref_config2sql.pl' or not by comparing # the timestamps on 'xref_config.ini' and 'sql/populate_metadata.sql'. my $ini_tm = ( stat $ini_file )[9]; my $meta_tm = ( stat $metadata_file )[9]; if ( !defined($meta_tm) || $ini_tm > $meta_tm ) { printf( "==> Your copy of 'xref_config.ini' is newer than '%s'\n", catfile( 'sql', 'populate_metadata.sql' ) ); print("==> Should I re-run 'xref_config2sql.pl' for you? [y/N]: "); my $reply = <STDIN>; chomp $reply; if ( lc( substr( $reply, 0, 1 ) ) eq 'y' ) { my $cmd = sprintf( "perl %s %s >%s", catfile( $sql_dir, 'xref_config2sql.pl' ), $ini_file, $metadata_file ); if ( system($cmd) == 0 ) { print("==> Done.\n") if($verbose); } else { if ( $? == -1 ) { croak("Failed to execute: $!\n"); } elsif ( $? & 127 ) { croak( sprintf( "Command died with signal %d, %s coredump\n", ( $? & 127 ), ( $? & 128 ) ? 'with' : 'without' ) ); } else { croak( sprintf( "Command exited with value %d\n", $? >> 8 ) ); } } } } ## end if ( !defined($meta_tm... # check to see if the database already exists my %dbs = map {$_->[0] => 1} @{$dbh->selectall_arrayref('SHOW DATABASES')}; if ($dbs{$dbname}) { if ( $drop_db ) { $dbh->do( "DROP DATABASE $dbname" ); print "Database $dbname dropped\n" if($verbose) ; } if ( $create && !$drop_db ) { print "WARNING: about to drop database $dbname on $host:$port; yes to confirm, otherwise exit: "; my $p = <STDIN>; chomp $p; if ($p eq "yes") { $dbh->do( "DROP DATABASE $dbname" ); print "Removed existing database $dbname\n" if($verbose); } else { print "$dbname NOT removed\n"; exit(1); } } elsif ( !$create) { croak( "Database $dbname already exists. " . "Use -create option to overwrite it." ); } } $dbh->do( "CREATE DATABASE " . $dbname ); my $table_file = catfile( $sql_dir, 'sql', 'table.sql' ); printf( "Creating %s from %s\n", $dbname, $table_file ) if($verbose); if ( !-e $table_file ) { croak( "Cannot open " . $table_file ); } my $cmd = "mysql -u $user -p'$pass' -P $port -h $host $dbname < $table_file"; system($cmd) == 0 or croak("Cannot execute the following command (exit $?):\n$cmd\n"); printf( "Populating metadata in %s from %s\n", $dbname, $metadata_file ) if($verbose); if ( !-e $metadata_file ) { croak( "Cannot open " . $metadata_file ); } $cmd = "mysql -u $user -p'$pass' -P $port -h $host " . "$dbname < $metadata_file"; system($cmd) == 0 or croak("Cannot execute the following command (exit $?):\n$cmd\n"); } sub get_label_to_acc{ my ($self, $name, $species_id, $prio_desc) = @_; my %hash1=(); my $dbi = dbi(); my $sql = "select xref.accession, xref.label from xref, source where source.name like '$name%' and xref.source_id = source.source_id"; if(defined($prio_desc)){ $sql .= " and source.priority_description like '".$prio_desc."'"; } if(defined($species_id)){ $sql .= " and xref.species_id = $species_id"; } my $sub_sth = dbi->prepare($sql); $sub_sth->execute(); while(my @row = $sub_sth->fetchrow_array()) { $hash1{$row[1]} = $row[0]; } # # Remember synonyms # $sql = "select xref.accession, synonym.synonym from xref, source, synonym where synonym.xref_id = xref.xref_id and source.name like '$name%' and xref.source_id = source.source_id"; if(defined($prio_desc)){ $sql .= " and source.priority_description like '".$prio_desc."'"; } if(defined($species_id)){ $sql .= " and xref.species_id = $species_id"; } my $sub_sth = dbi->prepare($sql); $sub_sth->execute(); while(my @row = $sub_sth->fetchrow_array()) { $hash1{$row[1]} = $row[0]; } return \%hash1; } sub get_label_to_desc{ my ($self, $name, $species_id, $prio_desc) = @_; my %hash1=(); my $dbi = dbi(); my $sql = "select xref.description, xref.label from xref, source where source.name like '$name%' and xref.source_id = source.source_id"; if(defined($prio_desc)){ $sql .= " and source.priority_description like '".$prio_desc."'"; } if(defined($species_id)){ $sql .= " and xref.species_id = $species_id"; } my $sub_sth = dbi->prepare($sql); $sub_sth->execute(); while(my @row = $sub_sth->fetchrow_array()) { $hash1{$row[1]} = $row[0]; } # # Also include the synonyms # $sql = "select xref.description, synonym.synonym from xref, source, synonym where synonym.xref_id = xref.xref_id and source.name like '$name%' and xref.source_id = source.source_id"; if(defined($prio_desc)){ $sql .= " and source.priority_description like '".$prio_desc."'"; } if(defined($species_id)){ $sql .= " and xref.species_id = $species_id"; } my $sub_sth = dbi->prepare($sql); $sub_sth->execute(); while(my @row = $sub_sth->fetchrow_array()) { $hash1{$row[1]} = $row[0]; } return \%hash1; } sub get_accession_from_label{ my ($self, $name) = @_; my $dbi = dbi(); my $sql = "select xref.accession from xref where xref.label like '$name'"; my $sub_sth = dbi->prepare($sql); $sub_sth->execute(); while(my @row = $sub_sth->fetchrow_array()) { return $row[0]; } return undef; } sub get_sub_list{ my ($self, $name) = @_; my @list=(); my $dbi = dbi(); my $sql = "select xref.accession from xref where xref.accession like '$name%'"; my $sub_sth = dbi->prepare($sql); $sub_sth->execute(); while(my @row = $sub_sth->fetchrow_array()) { push @list, $row[0]; } return @list; } # -------------------------------------------------------------------------------- # Set release for a source. sub set_release { my $self = shift; my ( $source_id, $release ) = @_; my $dbi = dbi(); my $sth = $dbi->prepare( "UPDATE source SET source_release=? WHERE source_id=?"); print "Setting release to '$release' for source ID '$source_id'\n" if($verbose); $sth->execute( $release, $source_id ); } sub get_dependent_mappings { my $self = shift; my $source_id = shift; my $dbi = dbi(); my $sth = $dbi->prepare( "select d.master_xref_id, d.dependent_xref_id from dependent_xref d, xref x where x.xref_id = d.dependent_xref_id and x.source_id = $source_id"); $sth->execute(); my $master_xref; my $dependent_xref; $sth->bind_columns(\$master_xref,\$dependent_xref); while($sth->fetch){ $xref_dependent_mapped{$master_xref."|".$dependent_xref}=1; } $sth->finish; } sub get_hgnc_synonyms{ my $self = shift; my %hgnc_syns; my %seen; # can be in more than once fro each type of hgnc. my $sql = (<<SYN); SELECT x.accession, x.label, sy.synonym FROM xref x, source so, synonym sy WHERE x.xref_id = sy.xref_id AND so.source_id = x.source_id AND so.name like "HGNC" SYN my $dbi = $self->dbi(); my $sth = $dbi->prepare($sql); $sth->execute; my ($acc, $label, $syn); $sth->bind_columns(\$acc, \$label, \$syn); my $count = 0; while($sth->fetch){ if(!defined($seen{$acc.":".$syn})){ push @{$hgnc_syns{$acc}}, $syn; push @{$hgnc_syns{$label}}, $syn; $count++; } $seen{$acc.":".$syn} = 1; } $sth->finish; return \%hgnc_syns; } # -------------------------------------------------------------------------------- 1;