None available.
sub create_xrefs
{ my $self = shift;
my ( $peptide_source_id, $dna_source_id, $pred_peptide_source_id,
$pred_dna_source_id, $file, $species_id ) = @_;
my %species2name = $self->species_id2name();
my %species2tax = $self->species_id2taxonomy();
my @names = @{$species2name{$species_id}};
my @tax_ids = @{$species2tax{$species_id}};
my %name2species_id = map{ $_=>$species_id } @names;
my %taxonomy2species_id = map{ $_=>$species_id } @tax_ids;
my %dependent_sources = $self->get_dependent_xref_sources();
my $refseq_io = $self->get_filehandle($file);
if ( !defined $refseq_io ) {
print STDERR "ERROR: Can't open RefSeqGPFF file $file\n";
return undef;
}
my @xrefs;
local $/ = "\/\/\n";
my $type;
if ($file =~ /protein/) {
$type = 'peptide';
} elsif ($file =~ /rna/) {
$type = 'dna';
} elsif($file =~ /RefSeq_dna/){
$type = 'dna';
} elsif($file =~ /RefSeq_protein/){
$type = 'peptide';
}else{
print STDERR "Could not work out sequence type for $file\n";
return undef;
}
while ( $_ = $refseq_io->getline() ) {
my $xref;
my $entry = $_;
chomp $entry;
my ($species) = $entry =~ /\s+ORGANISM\s+(.*)\n/;
$species = lc $species;
$species =~ s/^\s*//g;
$species =~ s/\s*\(.+\)//; $species =~ s/\s+/_/g;
$species =~ s/\n//g;
my $species_id_check = $name2species_id{$species};
if ( !defined $species_id_check ) {
my ($taxon_id) = $entry =~ /db_xref="taxon:(\d+)"/;
$species_id_check = $taxonomy2species_id{$taxon_id};
}
if ( defined $species_id
&& defined $species_id_check
&& $species_id == $species_id_check )
{
my ($acc) = $entry =~ /ACCESSION\s+(\S+)/;
my ($ver) = $entry =~ /VERSION\s+(\S+)/;
my $source_id;
if ($type =~ /dna/) {
if ($acc =~ /^XM_/) {
$source_id = $pred_dna_source_id;
} else {
$source_id = $dna_source_id;
}
} elsif ($type =~ /peptide/) {
if ($acc =~ /^XP_/) {
$source_id = $pred_peptide_source_id;
} else {
$source_id = $peptide_source_id;
}
}
print "Warning: can't get source ID for $type $acc\n" if (!$source_id);
my ($description) = $entry =~ /DEFINITION\s+([^[]+)/s;
print $entry if (length($description) == 0);
$description =~ s/\nACCESSION.*//s;
$description =~ s/\n//g;
$description =~ s/\s+/ /g;
$description = substr($description, 0, 255) if (length($description) > 255);
my ($seq) = $_ =~ /ORIGIN\s+(.+)/s; my @seq_lines = split /\n/, $seq;
my $parsed_seq = "";
foreach my $x (@seq_lines) {
my ($seq_only) = $x =~ /^\s*\d+\s+(.*)$/;
next if (!defined $seq_only);
$parsed_seq .= $seq_only;
}
$parsed_seq =~ s#//##g; $parsed_seq =~ s#\s##g;
( my $acc_no_ver, $ver ) = split( /\./, $ver );
$xref->{ACCESSION} = $acc;
if($acc eq $acc_no_ver){
$xref->{VERSION} = $ver;
}
else{
print "$acc NE $acc_no_ver\n";
}
$xref->{LABEL} = $acc . "\." . $ver;
$xref->{DESCRIPTION} = $description;
$xref->{SOURCE_ID} = $source_id;
$xref->{SEQUENCE} = $parsed_seq;
$xref->{SEQUENCE_TYPE} = $type;
$xref->{SPECIES_ID} = $species_id;
$xref->{INFO_TYPE} = "SEQUENCE_MATCH";
my @EntrezGeneIDline = $entry =~ /db_xref=.GeneID:(\d+)/g;
my @SGDGeneIDline = $entry =~ /db_xref=.SGD:(S\d+)/g;
my @protein_id = $entry =~ /\/protein_id=.(\S+_\d+)/g;
my @coded_by = $entry =~ /\/coded_by=.(\w+_\d+)/g;
foreach my $cb (@coded_by){
$xref->{PAIR} = $cb;
}
foreach my $pi (@protein_id){
$xref->{PROTEIN} = $pi;
}
foreach my $ll (@EntrezGeneIDline) {
my %dep;
$dep{SOURCE_ID} = $dependent_sources{EntrezGene}
|| die( 'No source for EntrezGene!' );
$dep{LINKAGE_SOURCE_ID} = $source_id;
$dep{ACCESSION} = $ll;
push @{$xref->{DEPENDENT_XREFS}},\% dep;
my %dep2;
$dep2{SOURCE_ID} = $dependent_sources{WikiGene}
|| die( 'No source for WikiGene!' );
$dep2{LINKAGE_SOURCE_ID} = $source_id;
$dep2{ACCESSION} = $ll;
push @{$xref->{DEPENDENT_XREFS}},\% dep2;
}
foreach my $ll (@SGDGeneIDline) {
my %dep;
$dep{SOURCE_ID} = $dependent_sources{"SGD"}
|| die( 'No source for SGD!' );
$dep{LINKAGE_SOURCE_ID} = $source_id;
$dep{ACCESSION} = $ll;
push @{$xref->{DEPENDENT_XREFS}},\% dep;
}
push @xrefs, $xref;
}
}
$refseq_io->close();
print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose);
return\@ xrefs;
}
1; } |
sub run
{
my $self = shift if (defined(caller(1)));
my $source_id = shift;
my $species_id = shift;
my $files_ref = shift;
my $rel_file = shift;
$verbose = shift;
my @files = @{$files_ref};
my $release_file;
if ( $files[-1] =~ /RefSeq-release/ ) {
$release_file = pop @files;
}
my $peptide_source_id =
$self->get_source_id_for_source_name('RefSeq_peptide');
my $dna_source_id =
$self->get_source_id_for_source_name('RefSeq_dna');
print "RefSeq_peptide source ID = $peptide_source_id\n" if($verbose);
print "RefSeq_dna source ID = $dna_source_id\n" if($verbose);
my $pred_peptide_source_id =
$self->get_source_id_for_source_name('RefSeq_peptide_predicted');
my $pred_dna_source_id =
$self->get_source_id_for_source_name('RefSeq_dna_predicted');
print "RefSeq_peptide_predicted source ID = "
. "$pred_peptide_source_id\n" if($verbose);
print "RefSeq_dna_predicted source ID = $pred_dna_source_id\n" if($verbose);
my @xrefs;
foreach my $file (@files) {
if ( $source_id < 1 ) {
$source_id =
$self->get_source_id_for_filename( basename($file) );
}
if ( !defined($species_id) ) {
$species_id = $self->get_species_id_for_filename($file);
}
my $xrefs =
$self->create_xrefs( $peptide_source_id,
$dna_source_id,
$pred_peptide_source_id,
$pred_dna_source_id,
$file,
$species_id );
if ( !defined( $xrefs ) ) {
return 1; }
push @xrefs, @{$xrefs};
}
if ( !defined( $self->upload_xref_object_graphs(\@ xrefs ) ) ) {
return 1; }
if ( defined $release_file ) {
my $release_io = $self->get_filehandle($release_file);
local $/ = "\n*";
my $release = $release_io->getline();
$release_io->close();
$release =~ s/\s{2,}/ /g;
$release =~ s/.*(NCBI Reference Sequence.*) Distribution.*/$1/s;
$release =~ s/Release (\d+)/Release $1,/;
print "RefSeq release: '$release'\n" if($verbose);
$self->set_release( $source_id, $release );
$self->set_release( $peptide_source_id, $release );
$self->set_release( $dna_source_id, $release );
$self->set_release( $pred_peptide_source_id, $release );
$self->set_release( $pred_dna_source_id, $release );
}
return 0; }
} |