sub _read_EMBL_Species
{ my( $self, $buffer, $acc ) = @_;
my $org;
$_ = $$buffer;
my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
while (defined( $_ ||= $self->_readline )) {
if (/^OS\s+(.+)/) {
$sci_name .= ($sci_name) ? ' '.$1 : $1;
}
elsif (s/^OC\s+(.+)$//) {
$class_lines .= $1;
}
elsif (/^OG\s+(.*)/) {
$org = $1;
}
else {
last;
}
$_ = undef; }
$$buffer = $_;
$sci_name || return;
$sci_name =~ s/unclassified/unidentified/i;
if ($sci_name =~ /^(.+)\s+\((.+)\)$/) {
$sci_name = $1;
$common = $2;
}
if ($sci_name =~ /^(\S+)\s+(\S+)$/) {
($genus, $species) = ($1, $2);
} elsif ($sci_name =~ /(\S+)\s+(\S+)\s+(\S+)/) {
($genus, $species, $sub_species) = ($1, $2, $3);
$sci_name = $genus . " " . $species;
}
my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /[;\.]+/, $class_lines;
@class = grep { /\S/ } @class;
unless ($class[-1] eq $species) {
push(@class, $sci_name);
}
@class = reverse @class;
my %names;
foreach my $i (0..$#class) {
my $name = $class[$i];
$names{$name}++;
if ($names{$name} > 1 && $name ne $class[$i - 1]) {
$self->throw("$acc seems to have an invalid species classification.");
}
}
my $make = Bio::Species->new();
$make->scientific_name($sci_name);
$make->classification(@class);
$make->genus($genus) if $genus;
$make->species($species) if $species;
$make->sub_species($sub_species) if $sub_species;
$make->common_name($common) if $common;
$make->organelle($org) if $org;
return $make; } |
sub _read_FTHelper_EMBL
{ my ($self,$buffer) = @_;
my ($key, $loc, @qual, );
if ($$buffer =~ /^FT\s{3}(\S+)\s*(\S*)$/ ) {
$key = $1;
$loc = $2;
if (not $loc) {
if ($key =~ /^(\S+?)([\>\<\.\d]+)/) {
$key = $1;
$loc = $2;
}
}
while ( defined($_ = $self->_readline) ) {
if (/^FT(\s+)(.+?)\s*$/) {
my ($spacer, $rest) = ($1, $2);
next if $rest !~ /\S/;
if (length($spacer) > 4) {
if (@qual) {
push(@qual, $rest);
}
elsif (substr($rest, 0, 1) eq '/') {
@qual = ($rest);
}
else {
$loc .= $rest;
}
} else {
last;
}
} else {
last;
}
}
} else {
return;
}
$$buffer = $_;
return if not $key or not $loc;
my $out = new Bio::SeqIO::FTHelper();
$out->verbose($self->verbose());
$out->key($key);
$out->loc($loc);
QUAL: for (my $i = 0; $i < @qual; $i++) {
$_ = $qual[$i];
my( $qualifier, $value ) = m{^/([^=]+)(?:=(.+))?}
or $self->throw("Can't see new qualifier in: $_\nfrom:\n"
. join('', map "$_\n", @qual));
if (defined $value) {
if (substr($value, 0, 1) eq '"') {
QUOTES: while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) { $i++;
my $next = $qual[$i];
if (!defined($next)) {
$self->warn("Unbalanced quote in:\n".join("\n", @qual).
"\nAdding quote to close...".
"Check sequence quality!");
$value .= '"';
last QUOTES;
}
$value .= (grep /\s/, ($value, $next)) ? " $next" : $next;
}
$value =~ s/^"|"$//g;
$value =~ s/""/"/g; }
} else {
$value = '_no_value';
}
$out->field->{$qualifier} ||= [];
push(@{$out->field->{$qualifier}},$value);
}
return $out;
}
1; } |
sub next_seq
{ my ($self,@args) = @_;
my ($pseq,$c,$line,$name,$dataclass,$desc,$acc,$seqc,$mol,$div,
$date, $comment, @date_arr);
my ($annotation, %params, @features) =
new Bio::Annotation::Collection;
$line = $self->_readline;
if( !defined $line ) {
return; }
if( $line =~ /^\s+$/ ) {
while( defined ($line = $self->_readline) ) {
$line =~/^\S/ && last;
}
return unless $line;
}
$self->throw("EMBL stream with no ID. Not embl in my book")
unless $line =~ /^ID\s+\S+/;
my $alphabet;
if ( $line =~ /^ID\s+(\S+)\s+\S+\s+\S+\s+:\s+([^;]+);\s+([^;]+);\s+([^;]+);\s+\d+\s+BP\.$/) {
($name, $dataclass, $mol, $div) = ($1, $2, $3, $4);
if (defined $mol ) {
if ($mol =~ /DNA/) {
$alphabet='dna';
}
elsif ($mol =~ /RNA/) {
$alphabet='rna';
}
elsif ($mol =~ /AA/) {
$alphabet='protein';
}
}
}
unless( defined $name && length($name) ) {
$name = "unknown_id";
}
my $buffer = $line;
local $_;
BEFORE_FEATURE_TABLE: until( !defined $buffer ) {
$_ = $buffer;
if( /^(F[HT]|SQ)/ ) {
$self->_pushback($_) if( $1 eq 'SQ' );
last;
}
if (/^DE\s+(\S.*\S)/) {
$desc .= $desc ? " $1" : $1;
}
if( /^AC\s+(.*)?/ ) {
my @accs = split(/[; ]+/, $1); $params{'-accession_number'} = shift @accs
unless defined $params{'-accession_number'};
push @{$params{'-secondary_accessions'}}, @accs;
}
if( /^SV\s+\S+\.(\d+);?/ ) {
my $sv = $1;
$params{'-seq_version'} = $sv;
$params{'-version'} = $sv;
}
if( /^DT\s+(.+)$/ ) {
my $line = $1;
my ($date, $version) = split(' ', $line, 2);
$date =~ tr/,//d; if ($version =~ /\(Rel\. (\d+), Created\)/xms ) {
my $release = Bio::Annotation::SimpleValue->
new(
-tagname => 'creation_release',
-value => $1
);
$annotation->add_Annotation($release);
} elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/xms ) {
my $release = Bio::Annotation::SimpleValue->
new(
-tagname => 'update_release',
-value => $1
);
$annotation->add_Annotation($release);
my $update = Bio::Annotation::SimpleValue->
new(
-tagname => 'update_version',
-value => $2
);
$annotation->add_Annotation($update);
}
push @{$params{'-dates'}}, $date;
}
if( /^KW (.*)\S*$/ ) {
my @kw = split(/\s*\;\s*/,$1);
push @{$params{'-keywords'}}, @kw;
}
elsif (/^O[SC]/) {
my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
$params{'-species'}= $species;
}
elsif (/^R/) {
my @refs = $self->_read_EMBL_References(\$buffer);
foreach my $ref ( @refs ) {
$annotation->add_Annotation('reference',$ref);
}
}
elsif (/^DR/) {
my @links = $self->_read_EMBL_DBLink(\$buffer);
foreach my $dblink ( @links ) {
$annotation->add_Annotation('dblink',$dblink);
}
}
$buffer = $self->_readline;
}
while( defined ($_ = $self->_readline) ) {
/^FT\s{3}\w/ && last;
/^SQ / && last;
/^CO / && last;
}
$buffer = $_;
if (defined($buffer) && $buffer =~ /^FT\s+\S+/) {
until( !defined ($buffer) ) {
my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
next if not defined $ftunit;
my $feat =
$ftunit->_generic_seqfeature($self->location_factory(), $name);
if($params{'-species'} && ($feat->primary_tag eq 'source')
&& $feat->has_tag('db_xref')
&& (! $params{'-species'}->ncbi_taxid())) {
foreach my $tagval ($feat->get_tag_values('db_xref')) {
if(index($tagval,"taxon:") == 0) {
$params{'-species'}->ncbi_taxid(substr($tagval,6));
last;
}
}
}
push(@features, $feat);
if( $buffer !~ /^FT/ ) {
last;
}
}
}
while( defined ($buffer) && $buffer =~ /^XX/ ) {
$buffer = $self->_readline();
}
if( $buffer !~ /^SQ/ ) {
while( defined ($_ = $self->_readline) ) {
/^SQ/ && last;
}
}
$seqc = "";
while( defined ($_ = $self->_readline) ) {
m{^//} && last;
$_ = uc($_);
s/[^A-Za-z]//g;
$seqc .= $_;
}
my $seq = $self->sequence_factory->create
(-verbose => $self->verbose(),
-division => $div,
-data_class => $dataclass,
-seq => $seqc,
-desc => $desc,
-display_id => $name,
-primary_id => $name,
-annotation => $annotation,
-molecule => $mol,
-alphabet => $alphabet,
-features =>\@ features,
%params);
return $seq; } |