Raw content of Bio::EnsEMBL::Pipeline::Tools::Embl
package Bio::EnsEMBL::Pipeline::Tools::Embl;
### embl ###
use strict;
#no warnings;
sub new{
my ($proto) = shift;
my $class = ref($proto) || $proto;
my $p;
if(ref($_[0]) eq "HASH"){($p) = @_ ;}
else{ $p = {@_} ;}
my $self = {
_debug => 0,
_acc => undef,
_fh => undef,
map { "_".substr($_,1) => $p->{$_} } keys(%$p)
};
bless ($self, $class);
if($self->{'_debug'} > 2){
require 'Data/Dumper.pm';
print Data::Dumper->Dump([$self],['self']);
}
return $self;
}
sub _fh{
my $self = shift;
if(my $fh = shift){
map { delete $self->{$_} } keys(%{$self});
die "Not a filehandle [_fh]" unless ref($fh) eq "GLOB" || ref($fh) eq 'IO::String';
$self->{'_fh'} = $fh;
}
return $self->{'_fh'};
}
sub parse{
my ($self,$string) = @_;
if($string){$self->_parse_record($string)}
elsif($self->_fh){
local $/ = "//\n";
my $FH = $self->_fh();
my $single_check = 0;
while(<$FH>){
next unless $single_check < 1;
$self->_parse_record($_);
$single_check++;
}
}
}
sub _parse_record{
my ($self,$string) = @_;
$self->clean();
$string =~ s/^SQ\s{3}(.+;)(.+)/$self->sequence($2,$1)/ems;
$string =~ s/^([A-Z]{2})\s{3}(.+\n)/$self->{"_" . $1} .= $2/egm;
if($self->{'_FH'}){
chop $self->{'_FT'};
my @keys = split(/\n\b/, $self->{'_FT'});
delete $self->{'_FT'};
foreach(@keys){
my @qualifiers = split(/\s+\//, $_);
my ($key,$location) = split(/\s+/,shift(@qualifiers));
# my $hash = { map { ($a, $b) = split('=') } @qualifiers }; # unfortunately not, [non unique $a!] :(
# $self->{'_FT'}->{$key}->{$location} = $hash || {};
my $hash;
foreach(@qualifiers){
my ($qualifier, $value) = (split(/=/),'')[0,1];
$value =~ s/^\"|\"$//g;
$value =~ s/(\s?)\n\s+(\B)/$1/g;
push(@{$self->{'_FT'}->{$key}->{$location}->{$qualifier}}, $value);
}
}
}
else{
$self->_non_EMBL(1);
}
# print "found accession: <" . join(" ", @{$self->accession()}) .">\n";
# print ", sequence version: <". $self->sequence_version() . ">";
# print ", description: <" . $self->description() . ">\n";
# print " * fasta:\n" . $self->fasta() . "\n";
}
sub feature_table{
my $self = shift;
return $self->{'_FT'};
}
sub comment{
my $self = shift;
return $self->{1}->{'_CC'};
}
sub description{
my $self = shift;
$self->{'_DE'} =~ s/\n/ /g;
$self->{'_DE'} =~ s/\.\s$//;
return $self->{'_DE'};
}
sub date{
my $self = shift;
unless( ref($self->{'_DT'}) eq "ARRAY"){
$self->{'_DT'} = [ split("\n",$self->{'_DT'}) ];
}
return $self->{'_DT'};
}
sub sequence{
my ($self, $sequence, $sequence_line) = @_;
if(@_){
$self->sequence_line($sequence_line);
$self->{'_sequence'} = $sequence;
$self->{'_sequence'} =~ s/[\s0-9\/]//g;
return '';
}
return $self->{'_sequence'};
}
sub sequence_line{
my $self = shift;
$self->{'_SQ'} = shift if @_;
return $self->{'_SQ'};
}
sub _non_EMBL{
my $self = shift;
$self->{'_non_EMBL'} = shift if @_;
return $self->{'_non_EMBL'} ? 1 : 0;
}
sub accession{
my $self = shift;
$self->{'_AC'} =~ s/[;]//g;
unless(ref($self->{'_AC'}) eq "ARRAY"){
$self->{'_AC'} = [ split(/\s+/, $self->{'_AC'}) ];
}
return $self->{'_AC'};
}
sub sequence_version{
my $self = shift;
$self->{'_SV'} =~ s/\n//g if $self->{'_SV'};
return $self->{'_SV'};
}
sub ox{
# what's OX? a chop'ed OXO??
my $self = shift;
$self->{'_OX'} =~ s/[;\n]//g;
return $self->{'_OX'};
}
sub os{
my $self = shift;
$self->{'_OS'} =~ s/[\.$|\n]//g;
return $self->{'_OS'};
}
sub keywords{
my $self = shift;
unless(ref($self->{'_KW'}) eq "ARRAY"){
$self->{'_KW'} =~ s/\n|\.$//g;
$self->{'_KW'} = [ split(/;\s?/, $self->{'_KW'}) ];
}
return $self->{'_KW'};
}
sub fasta{
my $self = shift;
my ($length, $fasta) = (60,undef);
my $seq = $self->sequence();
unless($fasta = [ $seq =~ /\G.{1,$length}/g ]){
$fasta = [ $seq ];
}
return ">" . $self->description() . "\n" . join("\n",@{$fasta});
}
sub clean{
my $self = shift;
map { delete $self->{$_} } keys(%{$self});
}
sub DESTROY{## DESTRUCTOR if needed
my ($self) = @_;
## Clear/Correct Class Variables
}
# +-------------------------------------------------+
# | useful methods |
# | |
# +-------------------------------------------------+
sub taxon{
my ($self, $obj) = @_;
my $ft = $self->feature_table();
my $ret;
if(ref($ft) eq "HASH"){
foreach my $location(keys %{$ft->{'source'}}){
foreach(@{$ft->{'source'}->{$location}->{'db_xref'}}){
return $1 if /taxon:(\d+)/;
}
}
}else{
return $1 if $self->ox =~ /NCBI_TaxID=(\d+)/;
}
return undef;
}
sub seq_length{
my ($self, $obj) = @_;
my $arr = [ split(";", $self->sequence_line) ];
my $first = shift @{$arr};
my $length = $1 if $first =~ /(\d+)/;
return $length;
}
sub which_database{
my ($self, $obj, $ret) = @_;
if($self->_non_EMBL){
$ret = 'SWISSPROT';
my $date = shift( @{$self->date} );
$ret = 'TrEMBL' if $date =~ /TrEMBL/;
}else{
$ret = 'EMBL'; # default assumption
}
return $ret;
}
1;
=head1
=head2 DESCRIPTION
Designed to only read one embl entry @ a time, as they can be big.
=cut