Raw content of XrefParser::RefSeqGPFFParser # Parse RefSeq GPFF files to create xrefs. package XrefParser::RefSeqGPFFParser; use strict; use File::Basename; use base qw( XrefParser::BaseParser ); # -------------------------------------------------------------------------------- # Parse command line and run if being run directly if (!defined(caller())) { if (scalar(@ARGV) != 1) { print "\nUsage: RefSeqGPFFParser.pm file.SPC <source_id>\n\n"; exit(1); } run($ARGV[0], -1); } # -------------------------------------------------------------------------------- my $verbose; sub run { my $self = shift if (defined(caller(1))); my $source_id = shift; my $species_id = shift; my $files_ref = shift; my $rel_file = shift; $verbose = shift; my @files = @{$files_ref}; my $release_file; if ( $files[-1] =~ /RefSeq-release/ ) { $release_file = pop @files; } my $peptide_source_id = $self->get_source_id_for_source_name('RefSeq_peptide'); my $dna_source_id = $self->get_source_id_for_source_name('RefSeq_dna'); print "RefSeq_peptide source ID = $peptide_source_id\n" if($verbose); print "RefSeq_dna source ID = $dna_source_id\n" if($verbose); my $pred_peptide_source_id = $self->get_source_id_for_source_name('RefSeq_peptide_predicted'); my $pred_dna_source_id = $self->get_source_id_for_source_name('RefSeq_dna_predicted'); print "RefSeq_peptide_predicted source ID = " . "$pred_peptide_source_id\n" if($verbose); print "RefSeq_dna_predicted source ID = $pred_dna_source_id\n" if($verbose); my @xrefs; foreach my $file (@files) { if ( $source_id < 1 ) { $source_id = $self->get_source_id_for_filename( basename($file) ); } if ( !defined($species_id) ) { $species_id = $self->get_species_id_for_filename($file); } my $xrefs = $self->create_xrefs( $peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id ); if ( !defined( $xrefs ) ) { return 1; #error } push @xrefs, @{$xrefs}; } if ( !defined( $self->upload_xref_object_graphs( \@xrefs ) ) ) { return 1; # error } if ( defined $release_file ) { # Parse and set release info. my $release_io = $self->get_filehandle($release_file); local $/ = "\n*"; my $release = $release_io->getline(); $release_io->close(); $release =~ s/\s{2,}/ /g; $release =~ s/.*(NCBI Reference Sequence.*) Distribution.*/$1/s; # Put a comma after the release number to make it more readable. $release =~ s/Release (\d+)/Release $1,/; print "RefSeq release: '$release'\n" if($verbose); $self->set_release( $source_id, $release ); $self->set_release( $peptide_source_id, $release ); $self->set_release( $dna_source_id, $release ); $self->set_release( $pred_peptide_source_id, $release ); $self->set_release( $pred_dna_source_id, $release ); } return 0; # successful } # -------------------------------------------------------------------------------- # Parse file into array of xref objects # There are 2 types of RefSeq files that we are interested in: # - protein sequence files *.protein.faa # - mRNA sequence files *.rna.fna # Slightly different formats sub create_xrefs { my $self = shift; my ( $peptide_source_id, $dna_source_id, $pred_peptide_source_id, $pred_dna_source_id, $file, $species_id ) = @_; # Create a hash of all valid names and taxon_ids for this species my %species2name = $self->species_id2name(); my %species2tax = $self->species_id2taxonomy(); my @names = @{$species2name{$species_id}}; my @tax_ids = @{$species2tax{$species_id}}; my %name2species_id = map{ $_=>$species_id } @names; my %taxonomy2species_id = map{ $_=>$species_id } @tax_ids; my %dependent_sources = $self->get_dependent_xref_sources(); my $refseq_io = $self->get_filehandle($file); if ( !defined $refseq_io ) { print STDERR "ERROR: Can't open RefSeqGPFF file $file\n"; return undef; } my @xrefs; local $/ = "\/\/\n"; my $type; if ($file =~ /protein/) { $type = 'peptide'; } elsif ($file =~ /rna/) { $type = 'dna'; } elsif($file =~ /RefSeq_dna/){ $type = 'dna'; } elsif($file =~ /RefSeq_protein/){ $type = 'peptide'; }else{ print STDERR "Could not work out sequence type for $file\n"; return undef; } while ( $_ = $refseq_io->getline() ) { my $xref; my $entry = $_; chomp $entry; my ($species) = $entry =~ /\s+ORGANISM\s+(.*)\n/; $species = lc $species; $species =~ s/^\s*//g; $species =~ s/\s*\(.+\)//; # Ditch anything in parens $species =~ s/\s+/_/g; $species =~ s/\n//g; my $species_id_check = $name2species_id{$species}; # Try going through the taxon ID if species check didn't work. if ( !defined $species_id_check ) { my ($taxon_id) = $entry =~ /db_xref="taxon:(\d+)"/; $species_id_check = $taxonomy2species_id{$taxon_id}; } # skip xrefs for species that aren't in the species table if ( defined $species_id && defined $species_id_check && $species_id == $species_id_check ) { my ($acc) = $entry =~ /ACCESSION\s+(\S+)/; my ($ver) = $entry =~ /VERSION\s+(\S+)/; # get the right source ID based on $type and whether this is predicted (X*) or not my $source_id; if ($type =~ /dna/) { if ($acc =~ /^XM_/) { $source_id = $pred_dna_source_id; } else { $source_id = $dna_source_id; } } elsif ($type =~ /peptide/) { if ($acc =~ /^XP_/) { $source_id = $pred_peptide_source_id; } else { $source_id = $peptide_source_id; } } print "Warning: can't get source ID for $type $acc\n" if (!$source_id); # Description - may be multi-line my ($description) = $entry =~ /DEFINITION\s+([^[]+)/s; print $entry if (length($description) == 0); $description =~ s/\nACCESSION.*//s; $description =~ s/\n//g; $description =~ s/\s+/ /g; $description = substr($description, 0, 255) if (length($description) > 255); my ($seq) = $_ =~ /ORIGIN\s+(.+)/s; # /s allows . to match newline my @seq_lines = split /\n/, $seq; my $parsed_seq = ""; foreach my $x (@seq_lines) { my ($seq_only) = $x =~ /^\s*\d+\s+(.*)$/; next if (!defined $seq_only); $parsed_seq .= $seq_only; } $parsed_seq =~ s#//##g; # remove trailing end-of-record character $parsed_seq =~ s#\s##g; # remove whitespace ( my $acc_no_ver, $ver ) = split( /\./, $ver ); $xref->{ACCESSION} = $acc; if($acc eq $acc_no_ver){ $xref->{VERSION} = $ver; } else{ print "$acc NE $acc_no_ver\n"; } $xref->{LABEL} = $acc . "\." . $ver; $xref->{DESCRIPTION} = $description; $xref->{SOURCE_ID} = $source_id; $xref->{SEQUENCE} = $parsed_seq; $xref->{SEQUENCE_TYPE} = $type; $xref->{SPECIES_ID} = $species_id; $xref->{INFO_TYPE} = "SEQUENCE_MATCH"; # TODO experimental/predicted my @EntrezGeneIDline = $entry =~ /db_xref=.GeneID:(\d+)/g; my @SGDGeneIDline = $entry =~ /db_xref=.SGD:(S\d+)/g; my @protein_id = $entry =~ /\/protein_id=.(\S+_\d+)/g; my @coded_by = $entry =~ /\/coded_by=.(\w+_\d+)/g; foreach my $cb (@coded_by){ $xref->{PAIR} = $cb; } foreach my $pi (@protein_id){ $xref->{PROTEIN} = $pi; } foreach my $ll (@EntrezGeneIDline) { my %dep; $dep{SOURCE_ID} = $dependent_sources{EntrezGene} || die( 'No source for EntrezGene!' ); $dep{LINKAGE_SOURCE_ID} = $source_id; $dep{ACCESSION} = $ll; push @{$xref->{DEPENDENT_XREFS}}, \%dep; my %dep2; $dep2{SOURCE_ID} = $dependent_sources{WikiGene} || die( 'No source for WikiGene!' ); $dep2{LINKAGE_SOURCE_ID} = $source_id; $dep2{ACCESSION} = $ll; push @{$xref->{DEPENDENT_XREFS}}, \%dep2; } foreach my $ll (@SGDGeneIDline) { my %dep; $dep{SOURCE_ID} = $dependent_sources{"SGD"} || die( 'No source for SGD!' ); $dep{LINKAGE_SOURCE_ID} = $source_id; $dep{ACCESSION} = $ll; push @{$xref->{DEPENDENT_XREFS}}, \%dep; } # Refseq's do not tell wetheer the mim is for the gene of morbid so ignore for now. push @xrefs, $xref; }# if defined species } # while <REFSEQ> $refseq_io->close(); print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose); return \@xrefs; } # -------------------------------------------------------------------------------- 1;