Module which implements a newick string parser as a finite state machine which enables it
to parse the full Newick specification. Module does not need to be instantiated, the method
can be called directly.
sub next_token
{ my $string = shift;
my $delim = shift;
$$string =~ s/^(\s)+//;
return undef unless(length($$string));
my $index=undef;
my @delims = split(/ */, $delim);
foreach my $dl (@delims) {
my $pos = index($$string, $dl);
if($pos>=0) {
$index = $pos unless(defined($index));
$index = $pos if($pos<$index);
}
}
unless(defined($index)) {
throw("couldn't find delimiter $delim\n");
}
my $token ='';
if($index==0) {
$token = substr($$string,0,1);
$$string = substr($$string, 1);
} else {
$token = substr($$string, 0, $index);
$$string = substr($$string, $index);
}
return $token;
}
1; } |
sub parse_newick_into_tree
{
my $newick = shift;
my $count=1;
my $debug = 0;
print("$newick\n") if($debug);
my $token = next_token(\$newick, "(;");
my $lastset = undef;
my $node = undef;
my $root = undef;
my $state=1;
my $bracket_level = 0;
while(defined($token)) {
if($debug) { printf("state %d : '%s'\n", $state, $token); };
switch ($state) {
case 1 { $node = new Bio::EnsEMBL::Compara::NestedSet;
$node->node_id($count++);
$lastset->add_child($node) if($lastset);
$root=$node unless($root);
if($token eq '(') { printf(" create set\n") if($debug);
$token = next_token(\$newick, "[(:,)");
$state = 1;
$bracket_level++;
$lastset = $node;
} else {
$state = 2;
}
}
case 2 { if(!($token =~ /[\[\:\,\)\;]/)) {
$node->name($token);
if($debug) { print(" naming leaf"); $node->print_node; }
$token = next_token(\$newick, "[:,);");
}
$state = 3;
}
case 3 { if($token eq ':') {
$token = next_token(\$newick, "[,);");
$node->distance_to_parent($token);
if($debug) { print("set distance: $token"); $node->print_node; }
$token = next_token(\$newick, ",);"); } elsif ($token eq '[') { $token .= next_token(\$newick, ",);");
}
$state = 4;
}
case 4 { if($token =~ /\[\&\&NHX/) {
$token =~ /\[\&\&NHX\:(\S+)\]/;
if ($1) {
my @attributes = split ':', $1;
foreach my $attribute (@attributes) {
$attribute =~ s/\s+//;
my($key,$value) = split '=', $attribute;
if ($key eq 'D') {
$key = "Duplication";
$value =~ s/Y/1/;
$value =~ s/N/0/;
}
if ($key eq 'DD') {
$value =~ s/Y/1/;
$value =~ s/N/0/;
}
$node->add_tag("$key","$value");
}
}
unless ($node->is_leaf) {
if (!defined($node->get_tagvalue("Duplication"))) {
$node->add_tag("Duplication",0);
}
}
if($debug) { print("NHX tags: $token"); $node->print_node; }
$token = next_token(\$newick, ",);"); }
$state = 5;
}
case 5 { if($token eq ')') {
if($debug) { print("end set : "); $lastset->print_node; }
$node = $lastset;
$lastset = $lastset->parent;
$token = next_token(\$newick, "[:,);");
if ($token eq '[') {
$state=1;
} else {
$state=2;
}
$bracket_level--;
} elsif($token eq ',') {
$token = next_token(\$newick, "[(:,)"); $state=1;
} elsif($token eq ';') {
throw("parse error: unbalanced ()\n") if($bracket_level ne 0);
$state=13;
$token = next_token(\$newick, "(");
} else {
throw("parse error: expected ; or ) or ,\n");
}
}
case 13 {
throw("parse error: nothing expected after ;");
}
}
}
return $root; } |
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _