=head1 NAME
Bio::EnsEMBL::ExternalData::DAS::SourceParser
=head1 SYNOPSIS
my $parser = Bio::EnsEMBL::ExternalData::DAS::SourceParser->new(
-timeout => 5,
-proxy => 'http://proxy.company.com',
);
my $sources = $parser->fetch_Sources(
-location => 'http://www.dasregistry.org/das',
-species => 'Homo_sapiens'
);
for my $source (@{ $sources }) {
printf "URL: %s, Description: %s, Coords: %s\n",
$source->full_url,
$source->description,
join '; ', @{ $source->coord_systems };
}
=head1 DESCRIPTION
Parses XML produced by the 'sources' DAS command, creating object
representations of each source.
=head1 AUTHOR
Andy Jenkinson
=cut
package Bio::EnsEMBL::ExternalData::DAS::SourceParser;
use strict;
use warnings;
use vars qw(@EXPORT_OK);
use base qw(Exporter);
@EXPORT_OK = qw(%GENE_COORDS @GENE_COORDS %PROT_COORDS @PROT_COORDS is_genomic);
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning info);
use Bio::EnsEMBL::ExternalData::DAS::CoordSystem;
use Bio::EnsEMBL::ExternalData::DAS::Source;
use Bio::Das::Lite;
use URI;
# Pattern (kept as a plain string) listing the coordinate system type names
# that count as genomic.
# NOTE(review): alternation binds more loosely than the anchors, so '^' applies
# only to 'chromosome' and '$' only to 'group'; every name in between matches
# anywhere in the tested string. If a whole-string match was intended this
# should be '^(?:...)$' - confirm against the code that applies this pattern
# before changing it.
our $GENOMIC_REGEX = '^chromosome|clone|contig|scaffold|genescaffold|supercontig|ultracontig|reftig|group$';
# Predefined gene-based coordinate systems. The hgnc entry is restricted to
# Homo_sapiens and the two MGI entries to Mus_musculus; the others are created
# without a species.
our @GENE_COORDS = (
Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'ensembl_gene', -label => 'Ensembl Gene Accession' ),
Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'entrezgene_acc', -label => 'Entrez Accession' ),
Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'hgnc', -species => 'Homo_sapiens', -label => 'HUGO Gene Accession' ),
Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'mgi_acc', -species => 'Mus_musculus', -label => 'MGI Gene Accession' ),
Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'mgi', -species => 'Mus_musculus', -label => 'MGI Gene Symbol' ),
);
# Predefined protein-based coordinate systems (none is species-specific).
our @PROT_COORDS = (
Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'ensembl_peptide', -label => 'Ensembl Protein Accession' ),
Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'uniprot_peptide', -label => 'UniProt Protein Accession' ),
Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'ipi_acc', -label => 'IPI Protein Accession' ),
Bio::EnsEMBL::ExternalData::DAS::CoordSystem->new( -name => 'ipi_id', -label => 'IPI Protein ID' ),
);
# Name => object lookups over the two lists above. The hash values are the very
# same objects held in the arrays, not copies.
our %GENE_COORDS = map { $_->name => $_ } @GENE_COORDS;
our %PROT_COORDS = map { $_->name => $_ } @PROT_COORDS;
# For compatibility with previous versions of Ensembl:
# (both legacy names are aliases for the single 'uniprot_peptide' object)
$PROT_COORDS{'uniprot/swissprot_acc'} = $PROT_COORDS{'uniprot_peptide'};
$PROT_COORDS{'uniprot/sptrembl'} = $PROT_COORDS{'uniprot_peptide'};
# Intended for occasions when assembly names don't match between DAS and Ensembl
# TODO: get these from a config file of some sort?
our %AUTHORITY_MAPPINGS = (
'NCBI m' => 'NCBIM',
'Btau' => 'Btau_',
'MMUL' => 'MMUL_',
);
# Replacement names for coordinate system types.
# NOTE(review): presumably DAS type name => Ensembl coordinate system name;
# confirm against the code that consults this table.
our %TYPE_MAPPINGS = (
'NT_Contig' => 'supercontig',
'Gene Scaffold' => 'genescaffold',
);
# Explicit overrides for whole coordinate systems, as a four-level nested hash
# ending in a colon-separated string.
# NOTE(review): the levels look like type => authority => version => species;
# verify against the lookup code, which is not in this part of the file.
our %COORD_MAPPINGS = (
'Chromosome' => {
'BROADS' => {
'1' => {
'Gasterosteus aculeatus' => 'group:BROADS1:Gasterosteus_aculeatus',
},
},
},
);
# Two-level lookup from ('Gene_ID' | 'Protein Sequence') and a second key to one
# of the predefined coordinate system objects above. It must stay below the
# %GENE_COORDS / %PROT_COORDS assignments because it reads from them.
# NOTE(review): the second-level keys appear to be DAS authority names - confirm.
our %NON_GENOMIC_COORDS = (
'Gene_ID' => {
'Ensembl' => $GENE_COORDS{'ensembl_gene'},
'HUGO_ID' => $GENE_COORDS{'hgnc'},
'MGI' => $GENE_COORDS{'mgi_acc'},
'MGI_Symbol' => $GENE_COORDS{'mgi'},
'Entrez' => $GENE_COORDS{'entrezgene_acc'},
},
'Protein Sequence' => {
'Ensembl' => $PROT_COORDS{'ensembl_peptide'},
'UniProt' => $PROT_COORDS{'uniprot_peptide'},
'IPI' => $PROT_COORDS{'ipi_acc'},
'IPI_ID' => $PROT_COORDS{'ipi_id'},
},
);
=head1 METHODS
=head2 new
Arg [..] : List of optional named arguments:
-PROXY - A URL to use as an HTTP proxy server
-NOPROXY - A list of domains/hosts not to use the proxy for
-TIMEOUT - Timeout in seconds (default is 10)
Example : my $parser = Bio::EnsEMBL::ExternalData::DAS::SourceParser->new(
-proxy => 'http://proxy.company.com',
-timeout => 10,
);
Description: Constructor
Returntype : Bio::EnsEMBL::ExternalData::DAS::SourceParser
Exceptions : none
Caller : general
Status : Stable
=cut
sub new {
  my ($class, @args) = @_;

  # Unpack the optional named arguments. Any false timeout (undef, 0, '')
  # falls back to the default of 10 seconds.
  my ($proxy, $no_proxy, $timeout)
    = rearrange(['PROXY','NOPROXY','TIMEOUT'], @args);
  $timeout = 10 unless $timeout;

  # Configure the underlying DAS client that performs all the requests.
  my $das = Bio::Das::Lite->new();
  $das->user_agent('Ensembl');
  $das->timeout($timeout);
  $das->http_proxy($proxy);

  # Proxy exclusions are only applied when the installed Bio::Das::Lite
  # provides a no_proxy method; otherwise the request is reported and ignored.
  if ($no_proxy && $das->can('no_proxy')) {
    $das->no_proxy($no_proxy);
  }
  elsif ($no_proxy) {
    warning("Installed version of Bio::Das::Lite does not support use of 'no_proxy'");
  }

  # The settings are stored next to the client object they were applied to.
  return bless {
    'daslite' => $das,
    'proxy'   => $proxy,
    'noproxy' => $no_proxy,
    'timeout' => $timeout,
  }, $class;
}
=head2 fetch_Sources
Arg [..] : List of named arguments:
-LOCATION - A URL from which to obtain a list of sources XML.
This is usually a DAS registry or server URL, but
could be a local path to a directory containing an
XML file named "sources?" or "dsn?"
-SPECIES - (optional) scalar or arrayref species name filter
-NAME - (optional) scalar or arrayref source name filter
-LOGIC_NAME - (optional) scalar or arrayref logic_name filter
Example: $arr = $parser->fetch_Sources(
-location => 'http://www.dasregistry.org/das',
-species => 'Homo_sapiens',
-name => ['asd', 'atd', 'astd'],
);
Example: $arr = $parser->fetch_Sources(
-location => 'file:///registry', # parses "/registry/sources?"
);
Description: Fetches DAS Source objects. The first call to this method
initiates lazy parsing of the XML, and the results are stored.
The different filter types supplied to this method are treated as
a logical AND. Several filters of the same type are logical OR.
Returntype : Arrayref of Bio::EnsEMBL::ExternalData::DAS::Source objects,
sorted by label.
Exceptions : If there is an error contacting the DAS registry/server.
Caller : general
Status : Stable
=cut
sub fetch_Sources {
  my ($self, @args) = @_;
  my ($location, $species_arg, $name_arg, $logic_arg)
    = rearrange([ 'LOCATION', 'SPECIES', 'NAME', 'LOGIC_NAME' ], @args);
  throw('No DAS server specified') unless $location;

  # The location string is split into the server part and an optional
  # identifier; the identifier is applied as one more filter further down.
  my $wanted_id;
  ($location, $wanted_id) = $self->parse_das_string( $location );

  # Actual parsing is lazy: a server is only contacted when there is no entry
  # for it yet.
  unless (defined $self->{'_sources'}{$location}) {
    $self->_parse_server( $location );
  }
  my @kept = values %{ $self->{'_sources'}{$location} || {} };

  # Each filter may be absent, a single scalar or an array reference; turn all
  # three forms into a plain list.
  my $to_list = sub {
    my ($filter) = @_;
    return ()           if !defined $filter;
    return @{ $filter } if ref $filter;
    return ( $filter );
  };
  my @want_species = $to_list->($species_arg);
  my @want_names   = $to_list->($name_arg);
  my @want_logic   = $to_list->($logic_arg);

  # optional species filter: a source with no coordinate systems always
  # passes, otherwise it has to match at least one of the requested species
  if (@want_species) {
    info('Filtering by species');
    my @passed;
    for my $source (@kept) {
      my $hits = grep {
        !scalar @{$source->coord_systems} || $source->matches_species( $_ )
      } @want_species;
      push @passed, $source if $hits;
    }
    @kept = @passed;
  }

  # optional name filter: keep sources matching any of the requested names
  if (@want_names) {
    info('Filtering by name');
    my @passed;
    for my $source (@kept) {
      my $hits = grep { $source->matches_name( $_ ) } @want_names;
      push @passed, $source if $hits;
    }
    @kept = @passed;
  }

  # optional logic name filter: exact string comparison
  if (@want_logic) {
    info('Filtering by logic_name');
    my @passed;
    for my $source (@kept) {
      my $hits = grep { $source->logic_name eq $_ } @want_logic;
      push @passed, $source if $hits;
    }
    @kept = @passed;
  }

  # identifier taken from the location string, compared against both the
  # logic_name and the dsn of each source
  if ($wanted_id) {
    info('Filtering by identifier (logic_name or dsn)');
    my @passed;
    for my $source (@kept) {
      if ($source->logic_name eq $wanted_id || $source->dsn eq $wanted_id) {
        push @passed, $source;
      }
    }
    @kept = @passed;
  }

  # Case-insensitive ordering by label
  my @ordered = sort { lc($a->label) cmp lc($b->label) } @kept;
  return \@ordered;
}
=head2 _parse_server
Arg [..] : List of DAS server URLs to parse
Example : $parser->_parse_server( @servers );
Description: Contacts the given DAS server(s) via the sources or dsn command
and parses the results. Populates $self->{'_sources'} as a hashref
of DAS sources, organised by server:
{
http://... => { .. => Bio::EnsEMBL::ExternalData::DAS::Source, .. },
}
Returntype : none
Exceptions : If there is an error contacting the DAS registry/server.
Caller : fetch_Sources
Status : Stable
=cut
# Queries each of the given servers with the DAS "sources" command and, for
# those that return nothing usable, with the older "dsn" command. Parsed
# results are handed to _parse_sources_output / _parse_dsn_output.
#
# Args    : list of server URLs
# Returns : nothing useful
# Throws  : if a server that had to be retried via "dsn" does not answer with
#           a 200 status. No empty cache entry is left behind for servers that
#           were not actually parsed, so a later fetch_Sources call contacts
#           them again instead of silently returning an empty list.
sub _parse_server {
  my ( $self, @servers ) = @_;

  # NOTE: this method technically supports multiple servers/locations, but
  # in practice we expect to only be parsing one at a time
  $self->{'daslite'}->dsn(\@servers);

  # Servers which don't respond to the "sources" command will be attempted via
  # the "dsn" command
  my %success = ();
  my $struct = $self->{'daslite'}->sources();

  # Iterate over each server
  while (my ($url, $set) = each %{ $struct }) {
    info("Processing $url");

    # The status has to be looked up with the full request URL, i.e. before
    # the command suffix is stripped off below.
    my $status = $self->{'daslite'}->statuscodes($url);
    $status = '' if !defined $status;

    $url =~ s|/sources\??$||;

    # Placeholder which marks the server as "seen"; fetch_Sources uses the
    # existence of this entry to decide whether parsing is still required.
    $self->{'_sources'}{$url} = {};
    $set = $set->[0]->{'source'} || [];

    # If we get data back from the sources command, parse it
    if ($status =~ /^200/ && scalar @{ $set }) {
      $self->_parse_sources_output($url, $set);
      $success{$url} = 1;
    } else {
      info("$url does not support sources command; trying dsn");
    }
  }

  my @failed = grep { !$success{$_} } @servers;

  # Run the dsn command on the remaining servers (if any)
  if (scalar @failed) {
    # Point the client at the failed servers only, then restore the full list
    $self->{'daslite'}->dsn(\@failed);
    $struct = $self->{'daslite'}->dsns();
    $self->{'daslite'}->dsn(\@servers);

    my %handled = ();
    while (my ($url, $set) = each %{ $struct }) {
      info("Processing $url");
      my $status = $self->{'daslite'}->statuscodes($url);
      $status = '' if !defined $status;
      $url =~ s|/dsn\??$||;
      $set ||= [];

      if ($status !~ /^200/) {
        # Remove the placeholder of this server and of every server not dealt
        # with yet. Leaving them in place would make fetch_Sources treat the
        # servers as already parsed and return no sources without retrying.
        delete $self->{'_sources'}{$_} for ( $url, grep { !$handled{$_} } @failed );
        throw("Error contacting DAS server '$url' : $status");
      }
      $handled{$url} = 1;

      # If we get data back from the dsn command, parse it
      if (scalar @{ $set }) {
        $self->_parse_dsn_output($url, $set);
      }
    }
  }
}
=head2 _parse_sources_output
Arg [1] : The URL of the server
Arg [2] : Arrayref of sources, each being a hashref
Example : $parser->_parse_sources_output($server_url, $sources_set);
Description: Parses the output of the sources command.
Returntype : none
Exceptions : none
Caller : _parse_server
Status : Stable
=cut
sub _parse_sources_output {
my ($self, $server_url, $set) = @_;
my $count = 0;
# Iterate over the