Raw content of Bio::EnsEMBL::Analysis::Tools::IMGT::SeqIO::imgt_embl
use strict;
package Bio::EnsEMBL::Analysis::Tools::IMGT::SeqIO::imgt_embl;
use base qw(Bio::SeqIO::embl);
sub _initialize {
my($self,@args) = @_;
$self->SUPER::_initialize(@args);
$self->sequence_factory(new Bio::Seq::SeqFactory
(-verbose => $self->verbose(),
-type => 'Bio::Seq::RichSeqIMGT'));
}
sub next_seq {
my ($self,@args) = @_;
my ($pseq,$c,$line,$name,$dataclass,$desc,$acc,$seqc,$mol,$div,
$date, $comment, @date_arr);
my ($annotation, %params, @features) =
new Bio::Annotation::Collection;
$line = $self->_readline;
# This needs to be before the first eof() test
if( !defined $line ) {
return; # no throws - end of file
}
if( $line =~ /^\s+$/ ) {
while( defined ($line = $self->_readline) ) {
$line =~/^\S/ && last;
}
# return without error if the whole next sequence was just a single
# blank line and then eof
return unless $line;
}
# no ID as 1st non-blank line, need short circuit and exit routine
$self->throw("EMBL stream with no ID. Not embl in my book")
unless $line =~ /^ID\s+\S+/;
# At this point we are sure that $line contains an ID header line
my $alphabet;
# IMGT header : ID A00673 IMGT/LIGM annotation : keyword level; DNA; SYN; 45 BP.
if ( $line =~ /^ID\s+(\S+)\s+\S+\s+\S+\s+:\s+([^;]+);\s+([^;]+);\s+([^;]+);\s+\d+\s+BP\.$/) {
($name, $dataclass, $mol, $div) = ($1, $2, $3, $4);
if (defined $mol ) {
if ($mol =~ /DNA/) {
$alphabet='dna';
}
elsif ($mol =~ /RNA/) {
$alphabet='rna';
}
elsif ($mol =~ /AA/) {
$alphabet='protein';
}
}
}
unless( defined $name && length($name) ) {
$name = "unknown_id";
}
# $self->warn("not parsing upper annotation in EMBL file yet!");
my $buffer = $line;
local $_;
BEFORE_FEATURE_TABLE: until( !defined $buffer ) {
$_ = $buffer;
# Exit at start of Feature table
if( /^(F[HT]|SQ)/ ) {
$self->_pushback($_) if( $1 eq 'SQ' );
last;
}
# Description line(s)
if (/^DE\s+(\S.*\S)/) {
$desc .= $desc ? " $1" : $1;
}
#accession number
if( /^AC\s+(.*)?/ ) {
my @accs = split(/[; ]+/, $1); # allow space in addition
$params{'-accession_number'} = shift @accs
unless defined $params{'-accession_number'};
push @{$params{'-secondary_accessions'}}, @accs;
}
#version number
if( /^SV\s+\S+\.(\d+);?/ ) {
my $sv = $1;
#$sv =~ s/\;//;
$params{'-seq_version'} = $sv;
$params{'-version'} = $sv;
}
#date (NOTE: takes last date line)
if( /^DT\s+(.+)$/ ) {
my $line = $1;
my ($date, $version) = split(' ', $line, 2);
$date =~ tr/,//d; # remove comma if new version
if ($version =~ /\(Rel\. (\d+), Created\)/xms ) {
my $release = Bio::Annotation::SimpleValue->
new(
-tagname => 'creation_release',
-value => $1
);
$annotation->add_Annotation($release);
} elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/xms ) {
my $release = Bio::Annotation::SimpleValue->
new(
-tagname => 'update_release',
-value => $1
);
$annotation->add_Annotation($release);
my $update = Bio::Annotation::SimpleValue->
new(
-tagname => 'update_version',
-value => $2
);
$annotation->add_Annotation($update);
}
push @{$params{'-dates'}}, $date;
}
#keywords
if( /^KW (.*)\S*$/ ) {
my @kw = split(/\s*\;\s*/,$1);
push @{$params{'-keywords'}}, @kw;
}
# Organism name and phylogenetic information
elsif (/^O[SC]/) {
# pass the accession number so we can give an informative throw message if necessary
my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
$params{'-species'}= $species;
}
# References
elsif (/^R/) {
my @refs = $self->_read_EMBL_References(\$buffer);
foreach my $ref ( @refs ) {
$annotation->add_Annotation('reference',$ref);
}
}
# DB Xrefs
elsif (/^DR/) {
my @links = $self->_read_EMBL_DBLink(\$buffer);
foreach my $dblink ( @links ) {
$annotation->add_Annotation('dblink',$dblink);
}
}
# Comments
#elsif (/^CC\s+(.*)/) {
# $comment .= $1;
# $comment .= " ";
# while (defined ($_ = $self->_readline) ) {
# if (/^CC\s+(.*)/) {
# $comment .= $1;
# $comment .= " ";
# }
# else {
# last;
# }
# }
# my $commobj = Bio::Annotation::Comment->new();
# $commobj->text($comment);
# $annotation->add_Annotation('comment',$commobj);
# $comment = "";
#}
# Get next line.
$buffer = $self->_readline;
}
while( defined ($_ = $self->_readline) ) {
/^FT\s{3}\w/ && last;
/^SQ / && last;
/^CO / && last;
}
$buffer = $_;
if (defined($buffer) && $buffer =~ /^FT\s+\S+/) {
until( !defined ($buffer) ) {
my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
next if not defined $ftunit;
# process ftunit
my $feat =
$ftunit->_generic_seqfeature($self->location_factory(), $name);
# add taxon_id from source if available
if($params{'-species'} && ($feat->primary_tag eq 'source')
&& $feat->has_tag('db_xref')
&& (! $params{'-species'}->ncbi_taxid())) {
foreach my $tagval ($feat->get_tag_values('db_xref')) {
if(index($tagval,"taxon:") == 0) {
$params{'-species'}->ncbi_taxid(substr($tagval,6));
last;
}
}
}
# add feature to list of features
push(@features, $feat);
if( $buffer !~ /^FT/ ) {
last;
}
}
}
# skip comments
while( defined ($buffer) && $buffer =~ /^XX/ ) {
$buffer = $self->_readline();
}
if( $buffer !~ /^SQ/ ) {
while( defined ($_ = $self->_readline) ) {
/^SQ/ && last;
}
}
$seqc = "";
while( defined ($_ = $self->_readline) ) {
m{^//} && last;
$_ = uc($_);
s/[^A-Za-z]//g;
$seqc .= $_;
}
my $seq = $self->sequence_factory->create
(-verbose => $self->verbose(),
-division => $div,
-data_class => $dataclass,
-seq => $seqc,
-desc => $desc,
-display_id => $name,
-primary_id => $name,
-annotation => $annotation,
-molecule => $mol,
-alphabet => $alphabet,
-features => \@features,
%params);
return $seq;
}
=head2 _read_EMBL_Species
Title : _read_EMBL_Species
Usage :
Function: Reads the EMBL Organism species and classification
lines.
Example :
Returns : A Bio::Species object
Args : a reference to the current line buffer, accession number
Notes : Over-ridden from Bio/SeqIO/embl.pm to cope with "eccentric"
OS and OC lines in IMGT flat file
=cut
sub _read_EMBL_Species {
my( $self, $buffer, $acc ) = @_;
my $org;
$_ = $$buffer;
my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
while (defined( $_ ||= $self->_readline )) {
if (/^OS\s+(.+)/) {
$sci_name .= ($sci_name) ? ' '.$1 : $1;
}
elsif (s/^OC\s+(.+)$//) {
$class_lines .= $1;
}
elsif (/^OG\s+(.*)/) {
$org = $1;
}
else {
last;
}
$_ = undef; # Empty $_ to trigger read of next line
}
$$buffer = $_;
$sci_name || return;
# IMGT sometimes has "unclassified" for both species and taxon
# which confuses Bio::Taxon. Replace it with unidentified in species
$sci_name =~ s/unclassified/unidentified/i;
# sometimes things have common name in brackets, like
# Schizosaccharomyces pombe (fission yeast), so get rid of the common
# name bit. Probably dangerous if real scientific species name ends in
# bracketed bit.
if ($sci_name =~ /^(.+)\s+\((.+)\)$/) {
$sci_name = $1;
$common = $2;
}
if ($sci_name =~ /^(\S+)\s+(\S+)$/) {
($genus, $species) = ($1, $2);
} elsif ($sci_name =~ /(\S+)\s+(\S+)\s+(\S+)/) {
($genus, $species, $sub_species) = ($1, $2, $3);
$sci_name = $genus . " " . $species;
}
# Convert data in classification lines into classification array.
# only split on ';' or '.' so that classification that is 2 or more words
# will still get matched, use map() to remove trailing/leading/intervening
# spaces
my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /[;\.]+/, $class_lines;
@class = grep { /\S/ } @class;
# Bio::Species array needs array in Species -> Kingdom direction
unless ($class[-1] eq $species) {
push(@class, $sci_name);
}
@class = reverse @class;
my %names;
foreach my $i (0..$#class) {
my $name = $class[$i];
$names{$name}++;
if ($names{$name} > 1 && $name ne $class[$i - 1]) {
$self->throw("$acc seems to have an invalid species classification.");
}
}
my $make = Bio::Species->new();
$make->scientific_name($sci_name);
$make->classification(@class);
$make->genus($genus) if $genus;
$make->species($species) if $species;
$make->sub_species($sub_species) if $sub_species;
$make->common_name($common) if $common;
$make->organelle($org) if $org;
return $make;
}
=head2 _read_FTHelper_EMBL
Title : _read_FTHelper_EMBL
Usage : _read_FTHelper_EMBL($buffer)
Function: reads the next FT key line
Example :
Returns : Bio::SeqIO::FTHelper object
Args : filehandle and reference to a scalar
Notes : overridden from Bio::SeqIO::embl
to cope with blank FT lines in IMGT flatfile
=cut
sub _read_FTHelper_EMBL {
my ($self,$buffer) = @_;
my ($key, # The key of the feature
$loc, # The location line from the feature
@qual, # An arrray of lines making up the qualifiers
);
if ($$buffer =~ /^FT\s{3}(\S+)\s*(\S*)$/ ) {
$key = $1;
$loc = $2;
if (not $loc) {
# in some cases, there is no whitespace between
# the key and the location. Try to separate them.
if ($key =~ /^(\S+?)([\>\<\.\d]+)/) {
$key = $1;
$loc = $2;
}
}
# Read all the lines up to the next feature
while ( defined($_ = $self->_readline) ) {
if (/^FT(\s+)(.+?)\s*$/) {
# Lines inside features are preceeded by 19 spaces
# A new feature is preceeded by 3 spaces
my ($spacer, $rest) = ($1, $2);
next if $rest !~ /\S/;
if (length($spacer) > 4) {
# Add to qualifiers if we're in the qualifiers
if (@qual) {
push(@qual, $rest);
}
# Start the qualifier list if it's the first qualifier
elsif (substr($rest, 0, 1) eq '/') {
@qual = ($rest);
}
# We're still in the location line, so append to location
else {
$loc .= $rest;
}
} else {
# We've reached the start of the next feature
last;
}
} else {
# We're at the end of the feature table
last;
}
}
} else {
# No feature key
return;
}
# Put the first line of the next feature into the buffer
$$buffer = $_;
return if not $key or not $loc;
# Make the new FTHelper object
my $out = new Bio::SeqIO::FTHelper();
$out->verbose($self->verbose());
$out->key($key);
$out->loc($loc);
# Now parse and add any qualifiers. (@qual is kept
# intact to provide informative error messages.)
QUAL: for (my $i = 0; $i < @qual; $i++) {
$_ = $qual[$i];
my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?}
or $self->throw("Can't see new qualifier in: $_\nfrom:\n"
. join('', map "$_\n", @qual));
if (defined $value) {
# Do we have a quoted value?
if (substr($value, 0, 1) eq '"') {
# Keep adding to value until we find the trailing quote
# and the quotes are balanced
QUOTES: while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) { #"
$i++;
my $next = $qual[$i];
if (!defined($next)) {
$self->warn("Unbalanced quote in:\n".join("\n", @qual).
"\nAdding quote to close...".
"Check sequence quality!");
$value .= '"';
last QUOTES;
}
# Join to value with space if value or next line contains a space
$value .= (grep /\s/, ($value, $next)) ? " $next" : $next;
}
# Trim leading and trailing quotes
$value =~ s/^"|"$//g;
# Undouble internal quotes
$value =~ s/""/"/g; #"
}
} else {
$value = '_no_value';
}
# Store the qualifier
$out->field->{$qualifier} ||= [];
push(@{$out->field->{$qualifier}},$value);
}
return $out;
}
1;