Raw content of Bio::Tree::TreeFunctionsI # $Id: TreeFunctionsI.pm,v 1.5.2.3 2003/09/14 20:18:25 jason Exp $ # # BioPerl module for Bio::Tree::TreeFunctionsI # # Cared for by Jason Stajich <jason@bioperl.org> # # Copyright Jason Stajich # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME Bio::Tree::TreeFunctionsI - Decorated Interface implementing basic Tree exploration methods =head1 SYNOPSIS use Bio::TreeIO; my $in = new Bio::TreeIO(-format => 'newick', -file => 'tree.tre'); my $tree = $in->next_tree; my @nodes = $tree->find_node('id1'); if( $tree->is_monophyletic(-clade => @nodes, -outgroup => $outnode) ){ } =head1 DESCRIPTION Describe the interface here =head1 FEEDBACK =head2 Mailing Lists User feedback is an integral part of the evolution of this and other Bioperl modules. Send your comments and suggestions preferably to the Bioperl mailing list. Your participation is much appreciated. bioperl-l@bioperl.org - General discussion http://bioperl.org/MailList.shtml - About the mailing lists =head2 Reporting Bugs Report bugs to the Bioperl bug tracking system to help us keep track of the bugs and their resolution. Bug reports can be submitted via email or the web: bioperl-bugs@bioperl.org http://bugzilla.bioperl.org/ =head1 AUTHOR - Jason Stajich, Aaron Mackey, Justin Reese Email jason-at-bioperl-dot-org Email amackey-at-virginia.edu Email jtr4v-at-virginia.edu =head1 CONTRIBUTORS Additional contributors names and emails here Rerooting code was worked on by Daniel Barker d.barker-at-reading.ac.uk Ramiro Barrantes Ramiro.Barrantes-at-uvm.edu =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::Tree::TreeFunctionsI; use vars qw(@ISA); use strict; use Bio::Tree::TreeI; @ISA = qw(Bio::Tree::TreeI); =head2 find_node Title : find_node Usage : my @nodes = $self->find_node(-id => 'node1'); Function: returns all nodes that match a specific field, by default this is id, but different branch_length, Returns : List of nodes which matched search Args : text string to search for OR -fieldname => $textstring =cut sub find_node { my ($self,$type,$field) = @_; if( ! defined $type ) { $self->warn("Must request a either a string or field and string when searching"); } # all this work for a '-' named field # is so that we could potentially # expand to other constraints in # different implementations # like 'find all nodes with boostrap < XX' if( ! defined $field ) { # only 1 argument, default to searching by id $field= $type; $type = 'id'; } else { $type =~ s/^-//; } # could actually do this by testing $rootnode->can($type) but # it is possible that a tree is implemeted with different node types # - although it is unlikely that the root node would be richer than the # leaf nodes. Can't handle NHX tags right now unless( $type eq 'id' || $type eq 'name' || $type eq 'bootstrap' || $type eq 'description' || $type eq 'internal_id') { $self->warn("unknown search type $type - will try anyways"); } my @nodes = grep { $_->can($type) && defined $_->$type() && $_->$type() eq $field } $self->get_nodes(); if ( wantarray) { return @nodes; } else { if( @nodes > 1 ) { $self->warn("More than 1 node found but caller requested scalar, only returning first node"); } return shift @nodes; } } =head2 remove_Node Title : remove_Node Usage : $tree->remove_Node($node) Function: Removes a node from the tree Returns : boolean represent status of success Args : either Bio::Tree::NodeI or string of the node id =cut sub remove_Node { my ($self,$input) = @_; my $node = undef; unless( ref($input) ) { $node = $self->find_node($input); } elsif( ! $input->isa('Bio::Tree::NodeI') ) { $self->warn("Did not provide either a valid Bio::Tree::NodeI object to remove_node or the node name"); return 0; } else { $node = $input; } if( ! $node->ancestor && $self->get_root_node->internal_id != $node->internal_id) { $self->warn("Node (".$node->to_string . ") has no ancestor, can't remove!"); } else { $node->ancestor->remove_Descendent($node); } } # Added for Justin Reese by Jason =head2 get_lca Title : get_lca Usage : get_lca(-nodes => \@nodes ) Function: given two nodes, returns the lowest common ancestor Returns : node object Args : -nodes => arrayref of nodes to test =cut sub get_lca { my ($self,@args) = @_; my ($nodes) = $self->_rearrange([qw(NODES)],@args); if( ! defined $nodes ) { $self->warn("Must supply -nodes parameter to get_lca() method"); return undef; } my ($node1,$node2) = $self->_check_two_nodes($nodes); return undef unless $node1 && $node2; # algorithm: Start with first node, find and save every node from it to # root. Then start with second node; for it and each of its ancestor # nodes, check to see if it's in the first node's ancestor list - if # so it is the lca. # # This is very slow and naive, but I somehow doubt the overhead # of mapping the tree to a complete binary tree and doing the linear # lca search would be worth the overhead, especially for small trees. # Maybe someday I'll write a linear get_lca and find out. # find and save every ancestor of node1 (including itself) my %node1_ancestors; # keys are internal ids, values are objects my $place = $node1; # start at node1 while ( $place ){ $node1_ancestors{$place->internal_id} = $place; $place = $place->ancestor; } # now climb up node2, for each node checking whether # it's in node1_ancestors $place = $node2; # start at node2 while ( $place ){ foreach my $key ( keys %node1_ancestors ){ # ugh if ( $place->internal_id == $key){ return $node1_ancestors{$key}; } } $place = $place->ancestor; } $self->warn("Could not find lca!"); # should never execute, # if so, there's a problem return undef; } # Added for Justin Reese by Jason =head2 distance Title : distance Usage : distance(-nodes => \@nodes ) Function: returns the distance between two given nodes Returns : numerical distance Args : -nodes => arrayref of nodes to test =cut sub distance { my ($self,@args) = @_; my ($nodes) = $self->_rearrange([qw(NODES)],@args); if( ! defined $nodes ) { $self->warn("Must supply -nodes parameter to distance() method"); return undef; } my ($node1,$node2) = $self->_check_two_nodes($nodes); # algorithm: # Find lca: Start with first node, find and save every node from it # to root, saving cumulative distance. Then start with second node; # for it and each of its ancestor nodes, check to see if it's in # the first node's ancestor list - if so it is the lca. Return sum # of (cumul. distance from node1 to lca) and (cumul. distance from # node2 to lca) # find and save every ancestor of node1 (including itself) my %node1_ancestors; # keys are internal ids, values are objects my %node1_cumul_dist; # keys are internal ids, values # are cumulative distance from node1 to given node my $place = $node1; # start at node1 my $cumul_dist = 0; while ( $place ){ $node1_ancestors{$place->internal_id} = $place; $node1_cumul_dist{$place->internal_id} = $cumul_dist; if ($place->branch_length) { $cumul_dist += $place->branch_length; # include current branch # length in next iteration } $place = $place->ancestor; } # now climb up node2, for each node checking whether # it's in node1_ancestors $place = $node2; # start at node2 $cumul_dist = 0; while ( $place ){ foreach my $key ( keys %node1_ancestors ){ # ugh if ( $place->internal_id == $key){ # we're at lca return $node1_cumul_dist{$key} + $cumul_dist; } } # include current branch length in next iteration $cumul_dist += $place->branch_length; $place = $place->ancestor; } $self->warn("Could not find distance!"); # should never execute, # if so, there's a problem return undef; } # helper function to check lca and distance arguments sub _check_two_nodes { my ($self, $nodes) = @_; if( ref($nodes) !~ /ARRAY/i || !ref($nodes->[0]) || !ref($nodes->[1]) ) { $self->warn("Must provide a valid array reference for -nodes"); return undef; } elsif( scalar(@$nodes) > 2 ){ $self->warn("More than two nodes given, using first two"); } elsif( scalar(@$nodes) < 2 ){ $self->warn("-nodes parameter does not contain reference to two nodes"); return undef; } unless( $nodes->[0]->isa('Bio::Tree::NodeI') && $nodes->[1]->isa('Bio::Tree::NodeI') ) { $self->warn("Did not provide valid Bio::Tree::NodeI objects as nodes\n"); return undef; } return @$nodes; } =head2 is_monophyletic Title : is_monophyletic Usage : if( $tree->is_monophyletic(-nodes => \@nodes, -outgroup => $outgroup) Function: Will do a test of monophyly for the nodes specified in comparison to a chosen outgroup Returns : boolean Args : -nodes => arrayref of nodes to test -outgroup => outgroup to serve as a reference =cut sub is_monophyletic{ my ($self,@args) = @_; my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args); if( ! defined $nodes || ! defined $outgroup ) { $self->warn("Must supply -nodes and -outgroup parameters to the method is_monophyletic"); return undef; } if( ref($nodes) !~ /ARRAY/i ) { $self->warn("Must provide a valid array reference for -nodes"); } my $clade_root; # this is to combine multiple tests into a single node # order doesn't really matter as long as get_lca does its job right while( @$nodes > 2 ) { my ($a,$b) = ( shift @$nodes, shift @$nodes); $clade_root = $self->get_lca(-nodes => [$a,$b] ); unshift @$nodes, $clade_root; } $clade_root = $self->get_lca(-nodes => $nodes ); my $og_ancestor = $outgroup->ancestor; while( defined ($og_ancestor ) ) { if( $og_ancestor->internal_id == $clade_root->internal_id ) { # monophyly is violated return 0; } $og_ancestor = $og_ancestor->ancestor; } return 1; } =head2 is_paraphyletic Title : is_paraphyletic Usage : if( $tree->is_paraphyletic(-nodes =>\@nodes, -outgroup => $node) ){ } Function: Tests whether or not a given set of nodes are paraphyletic (representing the full clade) given an outgroup Returns : [-1,0,1] , -1 if the group is not monophyletic 0 if the group is not paraphyletic 1 if the group is paraphyletic Args : -nodes => Array of Bio::Tree::NodeI objects which are in the tree -outgroup => a Bio::Tree::NodeI to compare the nodes to =cut sub is_paraphyletic{ my ($self,@args) = @_; my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args); if( ! defined $nodes || ! defined $outgroup ) { $self->warn("Must suply -nodes and -outgroup parameters to the method is_paraphyletic"); return undef; } if( ref($nodes) !~ /ARRAY/i ) { $self->warn("Must provide a valid array reference for -nodes"); return undef; } # Algorithm # Find the lca # Find all the nodes beneath the lca # Test to see that none are missing from the nodes list my %nodehash; foreach my $n ( @$nodes ) { $nodehash{$n->internal_id} = $n; } while( @$nodes > 2 ) { unshift @$nodes, $self->get_lca(-nodes => [( shift @$nodes, shift @$nodes)] ); } my $clade_root = $self->get_lca(-nodes => $nodes ); unless( defined $clade_root ) { $self->warn("could not find clade root via lca"); return undef; } my $og_ancestor = $outgroup->ancestor; # Is this necessary/correct for paraphyly test? while( defined ($og_ancestor ) ) { if( $og_ancestor->internal_id == $clade_root->internal_id ) { # monophyly is violated, could be paraphyletic return -1; } $og_ancestor = $og_ancestor->ancestor; } my $tree = new Bio::Tree::Tree(-root => $clade_root, -nodelete => 1); foreach my $n ( $tree->get_nodes() ) { next unless $n->is_Leaf(); # if any leaf node is not in the list # then it is part of the clade and so the list # must be paraphyletic return 1 unless ( $nodehash{$n->internal_id} ); } return 0; } =head2 reroot Title : reroot_tree Usage : $tree->reroot($node); Function: Reroots a tree either making a new node the root Returns : 1 on success, 0 on failure Args : Bio::Tree::NodeI that is in the tree, but is not the current root =cut sub reroot { my ($self,$new_root) = @_; unless (defined $new_root && $new_root->isa("Bio::Tree::NodeI")) { $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting"); return 0; } if( $new_root->is_Leaf() ) { $self->warn("Asking to root with a leaf, will use the leaf's ancestor"); $new_root = $new_root->ancestor; } my $old_root = $self->get_root_node; if( $new_root == $old_root ) { $self->warn("Node requested for reroot is already the root node!"); return 0; } my @path = (); # along tree, from newroot to oldroot my $node = $new_root; while ($node) { push @path, $node; $node = $node->ancestor; } my @path_from_oldroot = reverse @path; for (my $i = 0; $i < @path_from_oldroot - 1; $i++) { my $current = $path_from_oldroot[$i]; my $next = $path_from_oldroot[$i + 1]; $current->remove_Descendent($next); $current->branch_length($next->branch_length); $next->add_Descendent($current); } $new_root->branch_length(undef); $self->set_root_node($new_root); return 1; } =head2 reverse_edge Title : reverse_edge Usage : $node->reverse_edge(child); Function: makes child be a parent of node Requires: child must be a direct descendent of node Returns : nothing Args : Bio::Tree::NodeI that is in the tree =cut sub reverse_edge { my ($self,$node) = @_; delete_edge($self, $node); $node->add_Descendent($self); 1; } =head2 delete_edge Title : delete_edge Usage : $node->reverse_edge(child); Function: makes child be a parent of node Requires: child must be a direct descendent of node Returns : nothing Args : Bio::Tree::NodeI that is in the tree =cut sub delete_edge { my ($self,$node) = @_; unless (defined $self && $self->isa("Bio::Tree::NodeI")) { $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting"); return 1; } unless (defined $node && $node->isa("Bio::Tree::NodeI")) { $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting"); return 1; } if( $self->{'_desc'}->{$node->internal_id} ) { $node->ancestor(undef); $self->{'_desc'}->{$node->internal_id}->ancestor(undef); delete $self->{'_desc'}->{$node->internal_id}; } else { $self->warn("First argument must be direct parent of node"); return 1; } 1; } sub findnode_by_id { my $tree = shift; my $id = shift; my $rootnode = $tree->get_root_node; if ( ($rootnode->id) and ($rootnode->id eq $id) ) { return $rootnode; } # process all the children foreach my $node ( $rootnode->get_Descendents ) { if ( ($node->id) and ($node->id eq $id ) ) { return $node; } } } 1;