# Raw content of XrefParser::FlybaseParser
# $Id: FlybaseParser.pm,v 1.16 2008/09/02 10:02:10 ianl Exp $
package XrefParser::FlybaseParser;
use strict;
use warnings;
use Carp;
use base qw( XrefParser::BaseParser );
# File-level verbosity flag.  It is assigned by run() and read by
# get_source_id_for_source_name() and by run() itself.
my $verbose;

# Set of GFF object types (3rd column) that run() will parse; every
# other type is skipped.  Only the existence of a key is tested.
our %object_types = map { ( $_ => 1 ) } qw(
    gene
    mRNA
    miRNA
    ncRNA
    protein
    pseudogene
    rRNA
    snRNA
    snoRNA
    tRNA
);
# These are some statistics from the 5.4 file 'dmel-all-r5.4.gff.gz',
# looking only at the FlyBase (2nd column) object types (3rd column)
# gene, mRNA, miRNA, ncRNA, protein, pseudogene, rRNA, snRNA, snoRNA,
# and tRNA:
#
# cnt Dbxref name source_name in Xref database
# 57884 FlyBase_Annotation_IDs (special, see below)
# 33441 GB_protein protein_id
# 14324 GB EMBL
# 13951 flygrid FlyGrid
# 13265 FlyBase flybase_annotation_id
# 12768 dedb dedb
# 11745 UniProt/TrEMBL Uniprot/SPTREMBL
# 10076 INTERPRO Interpro
# 8077 orthologs SKIPPED
# 2089 UniProt/Swiss-Prot Uniprot/SWISSPROT
# 1596 bdgpinsituexpr bdgpinsituexpr
# 1207 hybrigenics SKIPPED
# 787 if SKIPPED
# 290 MITODROME SKIPPED
# 153 TF SKIPPED
# 82 EPD SKIPPED
# 80 MIR SKIPPED
# 61 PDB SKIPPED
# 56 MEROPS SKIPPED
# 17 GCR SKIPPED
# 15 Rfam SKIPPED
# 12 NRL_3D SKIPPED
# 11 GO SKIPPED
#
# The Dbxref name 'FlyBase_Annotation_IDs' will be associated with the
# source_names FlyBaseCGID_{gene,transcript,translation} depending on
# the type of 'ID' of the line.
#
# Likewise, the source_names FlyBaseName_{gene,transcript,translation}
# will be associated with the 'Name' of each entry depending on the type
# of 'ID'.
#
# ... and the source_names flybase_{gene,transcript,translation}_id will
# be associated with the 'ID' of each entry depending on the type of
# 'ID'.
# This hash will translate the Dbxref names in the data file into source
# names known by the Xref system.
our %source_name_map = (
    # FlyBase's own annotation IDs.
    'FlyBase' => 'flybase_annotation_id',

    # GenBank nucleotide and protein records.
    'GB'         => 'EMBL',
    'GB_protein' => 'protein_id',

    # Protein databases; run() matches these against accessions that
    # are already stored instead of creating new xrefs.
    'UniProt/Swiss-Prot' => 'Uniprot/SWISSPROT',
    'UniProt/TrEMBL'     => 'Uniprot/SPTREMBL',
    'INTERPRO'           => 'Interpro',

    # Remaining Dbxref names.
    'bdgpinsituexpr' => 'bdgpinsituexpr',
    'dedb'           => 'dedb',
    'flygrid'        => 'FlyGrid',
);
# This is for source_ids that depend on the type of 'ID' of the line.
# For each object type (gene, transcript, translation) this gives the
# source name to use for the 'FlyBase_Annotation_IDs' Dbxref, for the
# 'Name' attribute and for the 'ID' attribute.  The three sub-hashes
# differ only in the type name embedded in each source name, so they
# are generated from the list of types.
our %special_source_name_map = map {
    (
        $_ => {
            'Dbxref' => 'FlyBaseCGID_' . $_,
            'Name'   => 'FlyBaseName_' . $_,
            'ID'     => 'flybase_' . $_ . '_id',
        }
    )
} qw( gene transcript translation );
# Cache of source_id values keyed by source name.  It starts empty and
# is filled on demand by get_source_id_for_source_name().
our %source_id;

# Return the source_id for the given source name, asking the parent
# class only the first time a name is seen and serving later requests
# from %source_id.
#
# A warning is issued through carp() on every call for which the
# resulting value is undefined or negative; the value is returned to
# the caller regardless.
sub get_source_id_for_source_name {
    my $self        = shift;
    my $source_name = $_[0];

    unless ( defined( $source_id{$source_name} ) ) {
        # Not cached yet: delegate to the parent implementation with the
        # full remaining argument list.
        $source_id{$source_name} =
          $self->SUPER::get_source_id_for_source_name(@_);

        if ($verbose) {
            printf( "source_id for source '%s' is %d\n",
                    $source_name, $source_id{$source_name} );
        }
    }

    my $cached_id = $source_id{$source_name};

    if ( !defined($cached_id) || $cached_id < 0 ) {
        carp(
            sprintf( "Can not find source_id for source '%s'", $source_name )
        );
    }

    return $cached_id;
}
# Parse a FlyBase GFF data file and store an Xref plus a Direct Xref for
# each interesting Dbxref, GO term, annotation ID, Name and ID found on
# lines whose 2nd column is 'FlyBase' and whose 3rd column is listed in
# %object_types.
#
# Arguments (after the invocant):
#   $source_id    - accepted for interface compatibility but not used;
#                   every xref looks up its own source_id by source name.
#   $species_id   - passed on to get_valid_codes() and add_xref().
#   $files        - array reference; only its first element is read.
#   $release_file - accepted but not used.
#   $verbose      - when true, progress and a summary go to STDOUT.
#
# Lines are counted as skipped when they are comments, are not
# interesting FlyBase lines, or carry no 'ID' attribute starting with
# FBgn, FBtr or FBpp.
#
# Returns 0.  Previously the sub fell off the end of a foreach loop,
# whose value Perl leaves unspecified.
# NOTE(review): confirm that callers treat a false return as success.
sub run {
    # Declare first, assign conditionally: the behaviour of
    # 'my $x = ... if COND' is undefined in Perl.
    # NOTE(review): the invocant is only removed from @_ when there is a
    # calling frame above our caller; otherwise $self stays undefined.
    my $self;
    $self = shift if ( defined( caller(1) ) );

    my $source_id    = shift;    # Unused, see above.
    my $species_id   = shift;
    my $files        = shift;
    my $release_file = shift;    # Unused, see above.
    $verbose = shift;

    my $data_file = $files->[0];

    # Fetch hashes of already stored Uniprot and Interpro accessions.
    my %pre_xref_ids = (
        'Uniprot'  => $self->get_valid_codes( 'uniprot',  $species_id ),
        'Interpro' => $self->get_valid_codes( 'interpro', $species_id )
    );

    # Per label (mostly source names): accession => xref_id.  Used both
    # to avoid storing the same xref twice and for the final summary.
    my %xref_ids;

    # The first four characters of an 'ID' decide the object type.
    my %type_for_prefix = ( 'FBgn' => 'gene',
                            'FBtr' => 'transcript',
                            'FBpp' => 'translation' );

    # Store (unless already stored) the xref cached under
    # $xref_ids{$source_name}{$label}, then link it to $id with a direct
    # xref.  $xref_accession is the accession given to a new xref; it
    # differs from $label for sources that use the ID as accession.
    my $store_xref = sub {
        my ( $source_name, $xref_source_id, $label, $xref_accession, $id,
             $type )
          = @_;

        if ( !exists( $xref_ids{$source_name}{$label} ) ) {
            $xref_ids{$source_name}{$label} =
              $self->add_xref( $xref_accession, undef, $label, '',
                               $xref_source_id, $species_id );
        }

        $self->add_direct_xref( $xref_ids{$source_name}{$label},
                                $id, $type, '' );
        return;
    };

    my $data_io = $self->get_filehandle($data_file);

    # Calling getline() on an undefined value would die anyway; fail
    # with a message that names the file instead.
    if ( !defined($data_io) ) {
        croak( sprintf( "Can not get filehandle for data file '%s'",
                        defined($data_file) ? $data_file : '(undefined)' ) );
    }

    my ( $count_read, $count_skipped, $last_count_read ) = ( 0, 0, 0 );
    my $status_interval = 30;

    # Report progress every $status_interval seconds; the handler
    # re-arms the alarm itself.
    local $SIG{ALRM} = sub {
        printf( "%d lines read, %d skipped, %d parsed; %d lines/s\n",
                $count_read, $count_skipped,
                $count_read - $count_skipped,
                ( $count_read - $last_count_read )/$status_interval )
          if ($verbose);
        $last_count_read = $count_read;
        alarm($status_interval);
    };
    alarm($status_interval);

  LINE:
    while ( defined( my $line = $data_io->getline() ) ) {
        ++$count_read;

        # Skip comment lines.
        if ( substr( $line, 0, 1 ) eq '#' ) { ++$count_skipped; next LINE }

        chomp($line);

        # Split each line into fields.
        my @fields = split( /\t/, $line );

        # Only pick out the interesting lines.
        if (
            !(    defined( $fields[1] )
               && $fields[1] eq 'FlyBase'
               && defined( $fields[2] )
               && exists( $object_types{ $fields[2] } ) ) )
        {
            ++$count_skipped;
            next LINE;
        }

        # Go through each attribute (from the 9th field), split them up
        # into key-value pairs and store them.  The limit of 2 keeps any
        # '=' inside a value; empty or value-less attributes are dropped.
        my %attributes;
        my $attribute_field = defined( $fields[8] ) ? $fields[8] : '';
        foreach my $attribute ( split( /;/, $attribute_field ) ) {
            my ( $key, $value ) = split( /=/, $attribute, 2 );
            if (    defined($key)
                 && $key ne ''
                 && defined($value)
                 && $value ne '' )
            {
                $attributes{$key} = $value;
            }
        }

        # Without a recognisable ID there is no object type, and hence
        # no source names for the Name/ID/annotation xrefs: skip.
        my $id = $attributes{'ID'};
        my $type =
          defined($id) ? $type_for_prefix{ substr( $id, 0, 4 ) } : undef;
        if ( !defined($type) ) {
            ++$count_skipped;
            next LINE;
        }

        # For the 'Dbxref' and 'Ontology_term' attributes, split them up
        # on commas, divide into key-value pairs on the first colon, and
        # replace the attribute string with the resulting hash of lists.
        foreach my $attribute_key ( 'Dbxref', 'Ontology_term' ) {
            next if ( !exists( $attributes{$attribute_key} ) );

            my %values_for;
            foreach
              my $subattribute ( split( /,/, $attributes{$attribute_key} ) )
            {
                my ( $key, $value ) = split( /:/, $subattribute, 2 );

                # Entries without a colon have no accession to store.
                next if ( !defined($key) || !defined($value) );

                push( @{ $values_for{$key} }, $value );
            }

            $attributes{$attribute_key} = \%values_for;
        }

        my $dbxref   = $attributes{'Dbxref'}        || {};
        my $ontology = $attributes{'Ontology_term'} || {};

        #---------------------------------------------------------------
        # Store Xrefs and Direct Xrefs for all the interesting Dbxref
        # entries.
        #---------------------------------------------------------------
        foreach my $dbxref_name ( keys( %{$dbxref} ) ) {
            next if ( !exists( $source_name_map{$dbxref_name} ) );

            my $source_name = $source_name_map{$dbxref_name};
            my $xref_source_id =
              $self->get_source_id_for_source_name($source_name);

            # Treat Uniprot and Interpro differently: only link to
            # accessions that are already stored, and record the rest
            # as missed.
            my ($pre_source) = ( $source_name =~ /^(Uniprot|Interpro)/ );

            if ( defined($pre_source) ) {
                foreach my $accession ( @{ $dbxref->{$dbxref_name} } ) {
                    if ( exists( $pre_xref_ids{$pre_source}{$accession} ) ) {
                        $self->add_direct_xref(
                                     $pre_xref_ids{$pre_source}{$accession},
                                     $id, $type, '' );
                        $xref_ids{$pre_source}{$accession} =
                          $pre_xref_ids{$pre_source}{$accession};
                    } else {
                        $xref_ids{ $pre_source . ' (missed)' }{$accession} =
                          -1;
                    }
                }
                next;
            }

            foreach my $accession ( @{ $dbxref->{$dbxref_name} } ) {
                # The Dbxref 'bdgpinsituexpr' needs case sensitivity,
                # just like the FlyBase Names, so use the ID as the
                # accession for this source.
                my $xref_accession =
                  ( $dbxref_name eq 'bdgpinsituexpr' ) ? $id : $accession;

                $store_xref->( $source_name, $xref_source_id, $accession,
                               $xref_accession, $id, $type );
            }
        } ## end foreach my $dbxref_name ( keys...

        #---------------------------------------------------------------
        # Store Xrefs and Direct Xrefs for the GO 'Ontology_term'
        # entries.
        # NOTE(review): the accession is the part after the first colon,
        # so it does not carry a 'GO:' prefix -- confirm this is wanted.
        #---------------------------------------------------------------
        if ( exists( $ontology->{'GO'} ) ) {
            my $source_name = 'GO';
            my $xref_source_id =
              $self->get_source_id_for_source_name($source_name);

            foreach my $accession ( @{ $ontology->{'GO'} } ) {
                $store_xref->( $source_name, $xref_source_id, $accession,
                               $accession, $id, $type );
            }
        }

        #---------------------------------------------------------------
        # Store Xrefs and Direct Xrefs for the 'FlyBase_Annotation_IDs'
        # Dbxref entry (depends on type of 'ID').
        #---------------------------------------------------------------
        if ( exists( $dbxref->{'FlyBase_Annotation_IDs'} ) ) {
            my $source_name = $special_source_name_map{$type}{'Dbxref'};
            my $xref_source_id =
              $self->get_source_id_for_source_name($source_name);

            foreach
              my $accession ( @{ $dbxref->{'FlyBase_Annotation_IDs'} } )
            {
                $store_xref->( $source_name, $xref_source_id, $accession,
                               $accession, $id, $type );
            }
        }

        #---------------------------------------------------------------
        # Store Xref and Direct Xref for the 'Name' (depends on type of
        # 'ID').  The xref uses the ID as accession and the Name as
        # label; lines without a Name get no Name xref.
        #---------------------------------------------------------------
        if ( defined( $attributes{'Name'} ) ) {
            my $source_name = $special_source_name_map{$type}{'Name'};
            my $xref_source_id =
              $self->get_source_id_for_source_name($source_name);

            $store_xref->( $source_name, $xref_source_id,
                           $attributes{'Name'}, $id, $id, $type );
        }

        #---------------------------------------------------------------
        # Store Xref and Direct Xref for the 'ID' (depends on type of
        # 'ID').
        #---------------------------------------------------------------
        {
            my $source_name = $special_source_name_map{$type}{'ID'};
            my $xref_source_id =
              $self->get_source_id_for_source_name($source_name);

            $store_xref->( $source_name, $xref_source_id, $id, $id, $id,
                           $type );
        }
    } ## end while ( defined( my $line...

    # Cancel the pending alarm.  The local $SIG{ALRM} handler is
    # restored when this sub returns, so an alarm left running would
    # later be delivered to whatever handler was in place before.
    alarm(0);

    $data_io->close();

    if ($verbose) {
        print("FlybaseParser Summary:\n");
        foreach my $label ( sort( keys(%xref_ids) ) ) {
            printf( "\t%-32s %6d\n",
                    $label, scalar( keys( %{ $xref_ids{$label} } ) ) );
        }
    }

    return 0;
} ## end sub run
1;