Raw content of Bio::EnsEMBL::Compara::Graph::NewickParser
=head1 NAME
NewickParser - DESCRIPTION of Object
=head1 SYNOPSIS
=head1 DESCRIPTION
Module which implements a newick string parser as a finite state machine which enables it
to parse the full Newick specification. Module does not need to be instantiated, the method
can be called directly.
=head1 CONTACT
Contact Jessica Severin on implemetation/design detail: jessica@ebi.ac.uk
Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
=head1 APPENDIX
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::Graph::NewickParser;
use strict;
use Switch;
use Bio::EnsEMBL::Compara::NestedSet;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
=head2 parse_newick_into_tree
Arg 1 : string $newick_tree
Example : $tree = Bio::EnsEMBL::Compara::NestedSet::parse_newick_into_tree($newick_tree);
Description: Read the newick string and returns (the root of) a tree
Returntype : Bio::EnsEMBL::Compara::NestedSet
Exceptions : none
Caller : general
=cut
sub parse_newick_into_tree
{
my $newick = shift;
my $count=1;
my $debug = 0;
print("$newick\n") if($debug);
my $token = next_token(\$newick, "(;");
my $lastset = undef;
my $node = undef;
my $root = undef;
my $state=1;
my $bracket_level = 0;
while(defined($token)) {
if($debug) { printf("state %d : '%s'\n", $state, $token); };
switch ($state) {
case 1 { #new node
$node = new Bio::EnsEMBL::Compara::NestedSet;
$node->node_id($count++);
$lastset->add_child($node) if($lastset);
$root=$node unless($root);
if($token eq '(') { #create new set
printf(" create set\n") if($debug);
$token = next_token(\$newick, "[(:,)");
$state = 1;
$bracket_level++;
$lastset = $node;
} else {
$state = 2;
}
}
case 2 { #naming a node
if(!($token =~ /[\[\:\,\)\;]/)) {
$node->name($token);
if($debug) { print(" naming leaf"); $node->print_node; }
$token = next_token(\$newick, "[:,);");
}
$state = 3;
}
case 3 { # optional : and distance
if($token eq ':') {
$token = next_token(\$newick, "[,);");
$node->distance_to_parent($token);
if($debug) { print("set distance: $token"); $node->print_node; }
$token = next_token(\$newick, ",);"); #move to , or )
} elsif ($token eq '[') { # NHX tag without previous blength
$token .= next_token(\$newick, ",);");
}
$state = 4;
}
case 4 { # optional NHX tags
if($token =~ /\[\&\&NHX/) {
# careful: this regexp gets rid of all NHX wrapping in one step
$token =~ /\[\&\&NHX\:(\S+)\]/;
if ($1) {
# NHX may be empty, presumably at end of file, just before ";"
my @attributes = split ':', $1;
foreach my $attribute (@attributes) {
$attribute =~ s/\s+//;
my($key,$value) = split '=', $attribute;
# we assume only one value per key
# shortcut duplication mapping
if ($key eq 'D') {
$key = "Duplication";
# this is a small hack that will work for
# treefam nhx trees
$value =~ s/Y/1/;
$value =~ s/N/0/;
}
if ($key eq 'DD') {
# this is a small hack that will work for
# treefam nhx trees
$value =~ s/Y/1/;
$value =~ s/N/0/;
}
$node->add_tag("$key","$value");
}
}
# $token = next_token(\$newick, ",);");
#$node->distance_to_parent($token);
# Force a duplication = 0 for some strange treefam internal nodes
unless ($node->is_leaf) {
if (!defined($node->get_tagvalue("Duplication"))) {
$node->add_tag("Duplication",0);
}
}
if($debug) { print("NHX tags: $token"); $node->print_node; }
$token = next_token(\$newick, ",);"); #move to , or )
}
$state = 5;
}
case 5 { # end node
if($token eq ')') {
if($debug) { print("end set : "); $lastset->print_node; }
$node = $lastset;
$lastset = $lastset->parent;
$token = next_token(\$newick, "[:,);");
# it is possible to have anonymous internal nodes no name
# no blength but with NHX tags
if ($token eq '[') {
$state=1;
} else {
$state=2;
}
$bracket_level--;
} elsif($token eq ',') {
$token = next_token(\$newick, "[(:,)"); #can be un_blengthed nhx nodes
$state=1;
} elsif($token eq ';') {
#done with tree
throw("parse error: unbalanced ()\n") if($bracket_level ne 0);
$state=13;
$token = next_token(\$newick, "(");
} else {
throw("parse error: expected ; or ) or ,\n");
}
}
case 13 {
throw("parse error: nothing expected after ;");
}
}
}
return $root;
}
sub next_token {
my $string = shift;
my $delim = shift;
$$string =~ s/^(\s)+//;
return undef unless(length($$string));
#print("input =>$$string\n");
#print("delim =>$delim\n");
my $index=undef;
my @delims = split(/ */, $delim);
foreach my $dl (@delims) {
my $pos = index($$string, $dl);
if($pos>=0) {
$index = $pos unless(defined($index));
$index = $pos if($pos<$index);
}
}
unless(defined($index)) {
throw("couldn't find delimiter $delim\n");
}
my $token ='';
if($index==0) {
$token = substr($$string,0,1);
$$string = substr($$string, 1);
} else {
$token = substr($$string, 0, $index);
$$string = substr($$string, $index);
}
#print(" token =>$token\n");
#print(" outstring =>$$string\n\n");
return $token;
}
1;