# Raw content of XrefParser::CeleraParser
package XrefParser::CeleraParser;
use strict;
use File::Basename;
use base qw( XrefParser::BaseParser );
# Celera database dump for anopheles - FASTA format
#
# >agCP5429,cg_name=agCG43843,ga_name=GA_x9P1GAV56A9,transcript_name=agCT42178
# MNPNSTGSSSAAGSSISTSSLPGIERLIGRENWETWKFAVQTFLELEDLWCAVKPKKNDD
# GSYESVDTAKDRKARAKIILLLEPVNYVHVKEATTAKEVWSKLEKAFDDSGLTRRVGLLH
#
# This is the parser that provides most of the functionality; the subclasses
# (CeleraProteinParser, CeleraTranscriptParser) just set the sequence type.
# Parse a Celera FASTA dump, build one xref hash per entry and hand the list
# to XrefParser::BaseParser->upload_xref_object_graphs.
#
# Arguments (after the invocant):
#   $source_id    - stored as SOURCE_ID on every xref
#   $species_id   - stored as SPECIES_ID on every xref
#   $files        - array ref of file names; only the first one is read
#   $release_file - accepted for interface compatibility, not used here
#   $verbose      - when true, a summary line is printed to STDOUT
#
# For peptide input (get_sequence_type() matches /peptide/) the cg_name found
# in the second comma-separated header field is attached as a Celera_Gene
# dependent xref.
#
# Returns 0 on success, 1 if the input file could not be opened.
sub run {

    # caller(1) is defined only when run() was itself invoked from inside
    # another sub; in that case the first argument is taken as the invocant.
    # (Written as a ternary because "my $x = ... if COND" has undefined
    # behaviour in Perl.)
    my $self = defined( caller(1) ) ? shift(@_) : undef;
    my ( $source_id, $species_id, $files, $release_file, $verbose ) = @_;

    my $file = $files->[0];

    my $celera_gene_source_id =
      $self->get_source_id_for_source_name('Celera_Gene');

    my @xrefs;

    # Read one FASTA record at a time: records are delimited by a newline
    # followed by the '>' that starts the next header.
    local $/ = "\n>";

    my $file_io = $self->get_filehandle($file);

    if ( !defined $file_io ) {
        print STDERR "Could not open $file\n";
        return 1;
    }

    # The sequence type does not change while reading, so look it up once.
    my $sequence_type = $self->get_sequence_type();
    my $is_peptide    = $sequence_type =~ /peptide/;

    # A lexical record variable is used so the caller's $_ is left alone.
    while ( my $entry = $file_io->getline() ) {

        next if $entry =~ /^File:/;    # skip header

        my ( $header, $sequence ) = $entry =~ /^>?(.+?)\n([^>]*)/s;

        # An entry that does not look like "header\nsequence" cannot yield a
        # usable xref, so report it and move on instead of storing undefs.
        if ( !defined $header ) {
            warn("Can't parse FASTA entry: $entry\n");
            next;
        }

        # deconstruct header - only the first two comma-separated fields are
        # used (accession and the cg_name=... field)
        my ( $accession, $cg ) = split /,/, $header;

        # make sequence into one long string
        $sequence =~ s/\n//g;

        # build the xref object
        my $xref = {
            ACCESSION     => $accession,
            LABEL         => $accession,
            SEQUENCE      => $sequence,
            SOURCE_ID     => $source_id,
            SPECIES_ID    => $species_id,
            SEQUENCE_TYPE => $sequence_type,
            STATUS        => 'experimental',
        };

        # pull cg_name from peptide files as well and create dependent xrefs
        if ($is_peptide) {
            my ($cg_name) = defined($cg) ? ( $cg =~ /cg_name=(.*)/ ) : ();

            if ( defined $cg_name ) {
                my %dep = (
                    SOURCE_NAME       => 'Celera_Gene',
                    LINKAGE_SOURCE_ID => $xref->{SOURCE_ID},
                    SOURCE_ID         => $celera_gene_source_id,
                    ACCESSION         => $cg_name,
                );
                push @{ $xref->{DEPENDENT_XREFS} }, \%dep;  # array of hashrefs
            }
            else {
                # Without a cg_name there is nothing to link to; a dependent
                # xref with an undefined accession would be meaningless.
                warn(   "No cg_name in header of $accession; "
                      . "no Celera_Gene dependent xref created\n" );
            }
        }

        push @xrefs, $xref;
    }

    $file_io->close();

    XrefParser::BaseParser->upload_xref_object_graphs( \@xrefs );

    print scalar(@xrefs) . " Celera xrefs succesfully parsed\n" if ($verbose);

    return 0;
}
1;