Raw content of Bio::LiveSeq::IO::SRS
# $Id: SRS.pm,v 1.7 2001/06/18 08:27:55 heikki Exp $
#
# bioperl module for Bio::LiveSeq::IO::SRS
#
# Cared for by Joseph Insana
#
# Copyright Joseph Insana
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=head1 NAME
Bio::LiveSeq::IO::SRS - Loader for LiveSeq from EMBL entries with SRS
=head1 SYNOPSIS
my $db="EMBL";
my $acc_id="M20132";
my $query="embl-acc:$acc_id";
my $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
my @translationobjects=$loader->entry2liveseq();
my $gene="AR";
my $gene=$loader->gene2liveseq("gene");
NOTE: The only -db now supported is EMBL. Hence it defaults to EMBL.
=head1 DESCRIPTION
This package uses the SRS (Sequence Retrieval System) to fetch a sequence
database entry, analyse it and create LiveSeq objects out of it.
An embl-acc ID has to be passed to this package which will return references
to all translation objects created from the EMBL entry.
References to Transcription, DNA and Exon objects can all be retrieved departing
from these.
Alternatively, a specific "gene" name can be specified, together with the
embl-acc ID. This will create a LiveSeq::Gene object with all relevant gene
features attached/created.
=head1 AUTHOR - Joseph A.L. Insana
Email: Insana@ebi.ac.uk, jinsana@gmx.net
Address:
EMBL Outstation, European Bioinformatics Institute
Wellcome Trust Genome Campus, Hinxton
Cambs. CB10 1SD, United Kingdom
=head1 APPENDIX
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _
=cut
# Let the code begin...
package Bio::LiveSeq::IO::SRS;
$VERSION=2.4;
# Version history:
# Wed Apr 5 13:06:43 BST 2000 v 1.0 restarted as a child of Loader.pm
# Wed Apr 5 15:57:22 BST 2000 v 1.1 load() created
# Thu Apr 6 02:52:11 BST 2000 v 1.2 new field "range" in hash
# Thu Apr 6 03:11:29 BST 2000 v 1.22 transition from $location to @range
# Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division
# Tue Apr 18 17:15:26 BST 2000 v 2.0 started coding swissprot2hash
# Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names
# Wed Jun 7 02:08:57 BST 2000 v 2.2 added support for joinedlocation features
# Thu Jun 29 19:07:59 BST 2000 v 2.22 Gene stripped of possible newlines (horrible characteristic of some entries!!!!)
# Fri Jun 30 14:08:21 BST 2000 v 2.3 SRS querying routines now conveniently wrapped in eval{} blocks to avoid SRS crash the whole LiveSeq
# Tue Jul 4 14:07:52 BST 2000 v 2.31 note&number added in val_qual_names
# Mon Sep 4 17:46:42 BST 2000 v 2.4 novelaasequence2gene() added
use strict;
use Carp qw(cluck croak carp);
use vars qw($VERSION @ISA);
use lib $ENV{SRSEXE};
use srsperl;
use Bio::Tools::CodonTable; # for novelaasequence2gene
use Bio::LiveSeq::IO::Loader 2.2;
@ISA=qw(Bio::LiveSeq::IO::Loader);
# This package can in the future host other databases loading subroutines.
# e.g. ensembl2hash
=head2 load
Title : load
Usage : my $acc_id="M20132";
my $query="embl-acc:$acc_id";
$loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
Function: loads an entry with SRS from a database into a hash
Returns : reference to a new object of class IO::SRS holding an entry
Errorcode 0
Args : an SRS query resulting in one fetched EMBL (by default) entry
=cut
sub load {
my ($thing, %args) = @_;
my $class = ref($thing) || $thing;
my ($obj,%loader);
my ($db,$query)=($args{-db},$args{-query});
if (defined($db)) {
unless ($db eq "EMBL") {
carp "Note: only EMBL now supported!";
return(0);
}
} else {
$db="EMBL";
}
my $hashref;
if ($db eq "EMBL") {
my $test_transl=0; # change to 0 to avoid comparison of translation
# these can be changed for future needs
my @embl_valid_feature_names=qw(CDS exon prim_transcript intron repeat_unit repeat_region mRNA);
my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
# dunno yet how to implement test_transl again....
# probably on a one-on-one basis with each translation?
if ($test_transl) {
push (@embl_valid_qual_names,"translation"); # needed for test_transl
}
# not to have the whole program die because of SRS death
eval {
$hashref=&embl2hash("$query",\@embl_valid_feature_names,\@embl_valid_qual_names);
};
my $errormsg;
if ($@) {
foreach $errormsg (split(/\n/,$@)) {
if (index($errormsg,"in cleanup") == -1) {
carp "SRS EMBL Loader: $@";
}
}
}
}
unless ($hashref) { return (0); }
%loader = (db => $db, query => $query, hash => $hashref);
$obj = \%loader;
$obj = bless $obj, $class;
return $obj;
}
=head2 embl2hash
Title : embl2hash
Function: retrieves with SRS an EMBL entry, parses it and creates
a hash that contains all the information.
Returns : a reference to a hash
Errorcode: 0
Args : an SRS query resulting in one fetched EMBL entry
i.e. a string in a form like "embl-acc:accession_number"
two array references to skip features and qualifiers (for
performance)
Example: @valid_features=qw(CDS exon prim_transcript mRNA);
@valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
$hashref=&embl2hash("$query",\@valid_features,\@valid_qualifiers);
=cut
# this has to be defined here as it is the only thing really proper to
# the /SRS/ loader. Every loader has to sport his own "entry2hash" function
# the main functions will be in the Loader.pm
# arguments: embl SRS query resulting in one fetched EMBL entry
# to skip features and qualifiers (for performance), two array
# references must be passed (this can change into string arguments to
# be passed....)
# returns: a reference to a hash containing the important features requested
sub embl2hash {
my ($i,$j);
my $query=$_[0];
my %valid_features; my %valid_names;
if ($_[1]) {
%valid_features = map {$_, 1} @{$_[1]}; # to skip features
}
if ($_[2]) {
%valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
}
# SRS API used to fetch all relevant fields
my $sess = new Session;
my $set = $sess->query("[$query]", "");
my $numEntries=$set->size();
if ($numEntries < 1) {
carp "No sequence entry found for the query $query";
return (0);
} elsif ($numEntries > 1) {
carp "Too many entries found for the input given";
return (0);
} else {
my $entry = $set->getEntry(0);
my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Division);
# Fetch what we can fetch without the loader
$entry_ID = $entry->fieldToString("id","","","");
$entry_AccNumber = $entry->fieldToString("acc","","","");
$entry_Molecule = $entry->fieldToString("mol","","","");
$entry_Division = $entry->fieldToString("div","","","");
$entry_Description = $entry->fieldToString("des","","","");
$entry_Description =~ s/\n/ /g;
$entry_Organism = $entry->fieldToString("org","","","");
$entry_SeqLength = $entry->fieldToString("sl","","","");
# Now use the loader
my $loadedentry = $entry->load("EMBL");
# Fetch the rest via the loader
$entry_Sequence = $loadedentry->attrStr("sequence");
$entry_Sequence =~ s/\n//g; # from plain format to raw string
# put into the hash
my %entryhash;
$entryhash{ID}=$entry_ID;
$entryhash{AccNumber}=$entry_AccNumber;
$entryhash{Molecule}=$entry_Molecule;
$entryhash{Division}=$entry_Division;
$entryhash{Description}=$entry_Description;
$entryhash{Organism}=$entry_Organism;
$entryhash{Sequence}=$entry_Sequence;
$entryhash{SeqLength}=$entry_SeqLength;
# create features array
my $features = $loadedentry->attrObjList("features");
my $featuresnumber= $features->size();
$entryhash{FeaturesNumber}=$featuresnumber;
my ($feature,$feature_name,$feature_location);
my ($feature_qual_names,$feature_qual_values);
my ($feature_qual_name,$feature_qual_value);
my ($feature_qual_number,$feature_qual_number2);
my @features;
for $i (0..$featuresnumber-1) {
my %feature;
$feature = $features->get($i);
$feature_name = $feature->attrStr("FtKey");
#unless ($feature_name eq "CDS") {
unless ($valid_features{$feature_name}) {
#print "not valid feature: $feature_name\n";
next;
}
#print "now processing feature: $feature_name\n";
$feature_location = $feature->attrStr("FtLocation");
$feature_location =~ s/\n//g;
$feature_qual_names= $feature->attrStrList("FtQualNames");
$feature_qual_values= $feature->attrStrList("FtQualValues");
$feature_qual_number= $feature_qual_names->size();
$feature_qual_number2= $feature_qual_values->size(); # paranoia check
if ($feature_qual_number > $feature_qual_number2) {
carp ("Warning with Feature at position $i ($feature_name): There are MORE qualifier names than qualifier values.");
# this can happen, e.g. "/partial", let's move on, just issue a warning
#return (0);
} elsif ($feature_qual_number < $feature_qual_number2) {
carp ("Error with Feature at position $i ($feature_name): There are LESS qualifier names than qualifier values. Stopped");
return (0);
}
#} else {print "NUMBER OF QUALIFIERS: $feature_qual_number \n";} # DEBUG
# Put things inside hash
$feature{position}=$i;
$feature{name}=$feature_name;
# new range field in hash
my @range;
if ($feature_name eq "CDS") {
@range=cdslocation2transcript($feature_location);
$feature{locationtype}="joined";
} else {
if (index($feature_location,"join") == -1) {
@range=location2range($feature_location);
} else { # complex location in feature different than CDS
@range=joinedlocation2range($feature_location);
$feature{locationtype}="joined";
}
}
$feature{range}=\@range;
$feature{location}="deprecated";
my %feature_qualifiers;
for $j (0..$feature_qual_number-1) {
$feature_qual_name=$feature_qual_names->get($j);
$feature_qual_name =~ s/^\///; # eliminates heading "/"
# skip all not interesting (for now) qualifiers
unless ($valid_names{$feature_qual_name}) {
#print "not valid name: $feature_qual_name\n";
next;
}
#print "now processing: $feature_qual_name\n";
$feature_qual_value=$feature_qual_values->get($j);
$feature_qual_value =~ s/^"//; # eliminates heading "
$feature_qual_value =~ s/"$//; # eliminates trailing "
$feature_qual_value =~ s/\n//g; # eliminates all new lines
$feature_qualifiers{$feature_qual_name}=$feature_qual_value;
}
$feature{qualifiers}=\%feature_qualifiers;
push (@features,\%feature); # array of features
}
$entryhash{Features}=\@features; # put this also into the hash
my @cds; # array just of CDSs
for $i (0..$#features) {
if ($features[$i]->{'name'} eq "CDS") {
push(@cds,$features[$i]);
}
}
$entryhash{CDS}=\@cds; # put this also into the hash
return (\%entryhash);
}
}
# argument: location field of an EMBL feature
# returns: array with correct $start $end and $strand to create LiveSeq obj
sub location2range {
my ($location)=@_;
my ($start,$end,$strand);
if (index($location,"complement") == -1) { # forward strand
$strand=1;
} else { # reverse strand
$location =~ s/complement\(//g;
$location =~ s/\)//g;
$strand=-1;
}
$location =~ s/\/g;
$location =~ s/\>//g;
my @range=split(/\.\./,$location);
if (scalar(@range) == 1) { # special case of range with just one position (e.g. polyA_site EMBL features
$start=$end=$range[0];
} else {
if ($strand == 1) {
($start,$end)=@range;
} else { # reverse strand
($end,$start)=@range;
}
}
return ($start,$end,$strand);
}
# argument: location field of a CDS feature
# returns: array of exons arrayref, useful to create LiveSeq Transcript obj
sub cdslocation2transcript {
my ($location)=@_;
my @exonlocs;
my $exonloc;
my @exon;
my @transcript=();
$location =~ s/^join\(//;
$location =~ s/\)$//;
@exonlocs = split (/\,/,$location);
for $exonloc (@exonlocs) {
my @exon=location2range($exonloc);
push (@transcript,\@exon);
}
return (@transcript);
}
# argument: location field of a CDS feature
# returns: array of exons arrayref, useful to create LiveSeq Transcript obj
sub joinedlocation2range {
&cdslocation2transcript;
}
=head2 get_swisshash
Title : get_swisshash
Usage : $loader->get_swisshash();
Example : $swisshash=$loader->swissprot2hash("SWISS-PROT:P10275")
Function: retrieves with SRS a SwissProt entry, parses it and creates
a hash that contains all the information.
Returns : a reference to a hash
Errorcode: 0
Args : the db_xref qualifier's value from an EMBL CDS Feature
i.e. a string in the form "SWISS-PROT:accession_number"
Note : this can be modified (adding a second argument) to retrieve
and parse SWTREMBL, SWALL... entries
=cut
# argument: db_xref qualifier's value from EMBL CDS
# errorcode: 0
# returns hashref
sub get_swisshash {
my ($self,$query)=@_;
if (index($query,"SWISS-PROT") == -1) {
return (0);
}
$query =~ s/SWISS-PROT/swissprot-acc/;
my $hashref;
eval {
$hashref=&swissprot2hash("$query");
};
my $errormsg;
if ($@) {
foreach $errormsg (split(/\n/,$@)) {
if (index($errormsg,"in cleanup") == -1) {
carp "SRS Swissprot Loader: $@";
}
}
}
unless ($hashref) {
return (0);
} else {
return ($hashref);
}
}
=head2 swissprot2hash
Title : swissprot2hash
Usage : $loader->swissprot2hash();
Example : $swisshash=$loader->swissprot2hash("swissprot-acc:P10275")
Function: retrieves with SRS a SwissProt entry, parses it and creates
a hash that contains all the information.
Returns : a reference to a hash
Errorcode: 0
Args : an SRS query resulting in one entry from SwissProt database
i.e. a string in the form "swissprot-acc:accession_number"
Note : this can be modified (adding a second argument) to retrieve
and parse SWTREMBL, SWALL... entries
=cut
# arguments: swissprot SRS query resulting in one fetched swissprot entry
# returns: a reference to a hash containing the important features requested
sub swissprot2hash {
my ($i,$j);
my $query=$_[0];
# SRS API used to fetch all relevant fields
my $sess = new Session;
my $set = $sess->query("[$query]", "");
my $numEntries = $set->size();
if ($numEntries < 1) {
carp "No sequence entry found for the query $query";
return (0);
} elsif ($numEntries > 1) {
carp "Too many entries found for the input given";
return (0);
} else {
my $entry = $set->getEntry(0);
my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Gene);
# Fetch what we can fetch without the loader
$entry_ID = $entry->fieldToString("id","","","");
$entry_AccNumber = $entry->fieldToString("acc","","","");
$entry_Gene = $entry->fieldToString("gen","","","");
$entry_Gene =~ s/\n/ /g;
$entry_Description = $entry->fieldToString("des","","","");
$entry_Description =~ s/\n/ /g;
$entry_Organism = $entry->fieldToString("org","","","");
chop $entry_Organism;
$entry_SeqLength = $entry->fieldToString("sl","","","");
# Now use the loader
my $loadedentry = $entry->load("Swissprot");
# Fetch the rest via the loader
$entry_Sequence = $loadedentry->attrStr("Sequence");
$entry_Sequence =~ s/\n//g; # from plain format to raw string
# put into the hash
my %entryhash;
$entryhash{ID}=$entry_ID;
$entryhash{AccNumber}=$entry_AccNumber;
$entryhash{Molecule}=$entry_Molecule;
$entryhash{Gene}=$entry_Gene;
$entryhash{Description}=$entry_Description;
$entryhash{Organism}=$entry_Organism;
$entryhash{Sequence}=$entry_Sequence;
$entryhash{SeqLength}=$entry_SeqLength;
# create features array
my $features = $loadedentry->attrObjList("Features");
my $featuresnumber= $features->size();
$entryhash{FeaturesNumber}=$featuresnumber;
my ($feature,$feature_name,$feature_description,$feature_location);
my @features;
for $i (0..$featuresnumber-1) {
my %feature;
$feature = $features->get($i);
$feature_name = $feature->attrStr("FtKey");
$feature_location = $feature->attrStr("FtLocation");
$feature_location =~ s/ +/ /g;
$feature_description = $feature->attrStr("FtDescription");
chop $feature_description;
$feature_description =~ s/\nFT / /g;
# Put things inside hash
$feature{position}=$i;
$feature{name}=$feature_name;
$feature{location}=$feature_location;
$feature{description}=$feature_description;
push (@features,\%feature); # array of features
}
$entryhash{Features}=\@features; # put this also into the hash
return (\%entryhash);
}
}
=head2 novelaasequence2gene
Title : novelaasequence2gene
Usage : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
: $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
-genome => "Homo sapiens");
: $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
-genome => "Mitochondrion Homo sapiens",
-gene_name => "tyr-kinase");
Function: creates LiveSeq objects from a novel amino acid sequence,
using codon usage database to choose codons according to
relative frequencies.
If a genome latin name is not specified, the default is to use
'Homo sapiens' (taxonomy ID 9606).
Returns : reference to a Gene object containing references to LiveSeq objects
Errorcode 0
Args : string containing an amino acid sequence
string (optional) with a species/genome latin name
string specifying a gene name
Note : SRS access to TAXON and CODONUSAGE databases is required
=cut
sub novelaasequence2gene {
my ($self, %args) = @_;
my ($gene_name,$species_name,$aasequence)=($args{-gene_name},$args{-genome},$args{-aasequence});
unless ($aasequence) {
carp "aasequence not given";
return (0);
}
unless ($gene_name) {
$gene_name="Novel Unknown";
}
unless ($species_name) {
$species_name="Homo sapiens";
}
my $sess = new Session;
my ($e,$numEntries,$set);
# codonusage lookup
my $codonusage_usage;
my @species_codon_usage;
$set = $sess->query("[codonusage:'$species_name']", "");
$numEntries = $set->size();
if ($numEntries > 0) {
$e = $set->getEntry(0);
$species_name = $e->fieldToString("id","","","");
$codonusage_usage = $e->fieldToString("usage","","","");
@species_codon_usage=split(/\s/,$codonusage_usage); # spaces or tabs
if ($numEntries > 1) {
carp "Search in Codon Usage DB resulted in $numEntries results. Taking the first one: $species_name";
}
} else {
carp "Genome not found in codon usage DB.";
return (0);
}
# taxonomy lookup
my $mito_flag = 0;
my $species_origin;
if ($species_name =~ /^Mitochondrion /) {
$mito_flag = 1;
$species_origin = $species_name;
$species_origin =~ s/^Mitochondrion //;
$set = $sess->query("[taxonomy-species:'$species_origin']", "");
} elsif ($species_name =~ /^Chloroplast |^Kinetoplast |^Chromoplast /) {
$species_origin = $species_name;
$species_origin =~ s/^Chromoplast //;
$species_origin =~ s/^Kinetoplast //;
$species_origin =~ s/^Chloroplast //;
$set = $sess->query("[taxonomy-species:'$species_origin']", "");
} else {
$set = $sess->query("[taxonomy-species:'$species_name']", "");
}
$numEntries = $set->size();
my ($taxonomy_id,$taxonomy_gc,$taxonomy_mgc,$taxonomy_name);
if ($numEntries > 0) {
$e = $set->getEntry(0);
$taxonomy_id = $e->fieldToString("id","","","");
$taxonomy_name = $e->fieldToString("species","","","");
$taxonomy_gc = $e->fieldToString("gc","","","");
$taxonomy_mgc = $e->fieldToString("mgc","","","");
if ($numEntries > 1) {
carp "Note! More than one entry found in taxonomy DB for the genome query given. Using the first one: $taxonomy_name ($taxonomy_id)";
}
} else {
carp "Genome not found in taxonomy DB.";
return (0);
}
# Lookup appropriate translation table
my $ttabid;
if ($mito_flag) {
$ttabid = $taxonomy_mgc;
} else {
$ttabid = $taxonomy_gc;
}
my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
return ($gene);
}
1;