Raw content of Bio::EnsEMBL::Compara::NestedSet =head1 NAME NestedSet - DESCRIPTION of Object =head1 SYNOPSIS =head1 DESCRIPTION Abstract superclass to encapsulate the process of storing and manipulating a nested-set representation tree. Also implements a 'reference count' system based on the ObjectiveC retain/release design. Designed to be used as the Root class for all Compara 'proxy' classes (Member, GenomeDB, DnaFrag, NCBITaxon) to allow them to be made into sets and trees. =head1 CONTACT Contact Albert Vilella on implementation detail: avilella@ebi.ac.uk Contact Jessica Severin on implementation/design detail: jessica@ebi.ac.uk Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::EnsEMBL::Compara::NestedSet; use strict; use Bio::EnsEMBL::Utils::Exception; use Bio::EnsEMBL::Utils::Argument; use Bio::TreeIO; use Bio::EnsEMBL::Compara::Graph::Node; our @ISA = qw(Bio::EnsEMBL::Compara::Graph::Node); ################################################# # Factory methods ################################################# sub init { my $self = shift; $self->SUPER::init; return $self; } sub dealloc { my $self = shift; #printf("DEALLOC NestedSet refcount:%d ", $self->refcount); $self->print_node; #$self->release_children; return $self->SUPER::dealloc; } =head2 copy Overview : creates copy of tree starting at this node going down Example : my $clone = $self->copy; Returntype : Bio::EnsEMBL::Compara::NestedSet Exceptions : none Caller : general =cut sub copy { my $self = shift; my $mycopy = $self->SUPER::copy; bless $mycopy, ref $self; $mycopy->distance_to_parent($self->distance_to_parent); $mycopy->left_index($self->left_index); $mycopy->right_index($self->right_index); foreach my $child (@{$self->children}) { $mycopy->add_child($child->copy); } return $mycopy; } =head2 release_tree Overview : deletes and frees the memory used by this tree and all the underlying nodes. Example : $self->release_tree; Returntype : undef Exceptions : none Caller : general =cut sub release_tree { my $self = shift; my $child_count = $self->get_child_count; $self->disavow_parent; $self->cascade_unlink if($child_count); return undef; } ################################################# # Object variable methods ################################################# sub left_index { my $self = shift; $self->{'_left_index'} = shift if(@_); $self->{'_left_index'} = 0 unless(defined($self->{'_left_index'})); return $self->{'_left_index'}; } sub right_index { my $self = shift; $self->{'_right_index'} = shift if(@_); $self->{'_right_index'} = 0 unless(defined($self->{'_right_index'})); return $self->{'_right_index'}; } ####################################### # Set manipulation methods ####################################### =head2 add_child Overview : attaches child nestedset node to this nested set Arg [1] : Bio::EnsEMBL::Compara::NestedSet $child Arg [2] : (opt.) distance between this node and child Example : $self->add_child($child); Returntype : undef Exceptions : if child is undef or not a NestedSet subclass Caller : general =cut sub add_child { my $self = shift; my $child = shift; my $dist = shift; throw("child not defined") unless(defined($child)); throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$child]") unless($child->isa('Bio::EnsEMBL::Compara::NestedSet')); #printf("add_child: parent(%s) <-> child(%s)\n", $self->node_id, $child->node_id); unless(defined($dist)) { $dist = $child->_distance; } $child->disavow_parent; #create_link_to_node is a safe method which checks if connection exists my $link = $self->create_link_to_node($child); $child->_set_parent_link($link); $self->{'_children_loaded'} = 1; $link->distance_between($dist); return $link; } sub store_child { my ($self, $child) = @_; throw("store_child has been deprecated. Highly inefficient.". "Use add_child, build in memory and store in one go\n"); } =head2 disavow_parent Overview : unlink and release self from its parent might cause self to delete if refcount reaches Zero. Example : $self->disavow_parent Returntype : undef Caller : general =cut sub disavow_parent { my $self = shift; if($self->{'_parent_link'}) { my $link = $self->{'_parent_link'}; #print("DISAVOW parent : "); $parent->print_node; #print(" child : "); $self->print_node; $link->dealloc; } $self->_set_parent_link(undef); return undef; } =head2 release_children Overview : recursive releases all children will cause potential deletion of children if refcount reaches Zero. Example : $self->release_children Returntype : $self Exceptions : none Caller : general =cut sub release_children { my $self = shift; # by calling with parent, this preserved the link to the parent # and thus doesn't unlink self foreach my $child (@{$self->children}) { $child->disavow_parent; $child->release_children; } #$self->cascade_unlink($self->{'_parent_node'}); return $self; } =head2 parent Overview : returns the parent NestedSet object for this node Example : my $my_parent = $object->parent(); Returntype : undef or Bio::EnsEMBL::Compara::NestedSet Exceptions : none Caller : general =cut sub parent { my $self = shift; if(!defined($self->{'_parent_link'}) and $self->adaptor and $self->_parent_id) { my $parent = $self->adaptor->fetch_parent_for_node($self); #print("fetched parent : "); $parent->print_node; # if (!defined($parent)) { # $DB::single=1;1; # } $parent->add_child($self); } return undef unless($self->{'_parent_link'}); return $self->{'_parent_link'}->get_neighbor($self); } sub parent_link { my $self = shift; return $self->{'_parent_link'}; } sub has_parent { my $self = shift; return 1 if($self->{'_parent_link'} or $self->_parent_id); return 0; } sub has_ancestor { my $self = shift; my $ancestor = shift; throw "[$ancestor] must be a Bio::EnsEMBL::Compara::NestedSet object" unless ($ancestor and $ancestor->isa("Bio::EnsEMBL::Compara::NestedSet")); my $node = $self->parent; while($node) { return 1 if($node->equals($ancestor)); $node = $node->parent; } return 0; } =head2 root Overview : returns the root NestedSet object for this node returns $self if node has no parent (this is the root) Example : my $root = $object->root(); Description: Returns the root of the tree for this node with links to all the intermediate nodes. Sister nodes are not included in the result. Returntype : undef or Bio::EnsEMBL::Compara::NestedSet Exceptions : none Caller : general =cut sub root { my $self = shift; # Only if we don't have it cached # Only if we have left and right and it's not a leaf # Only if it's for release clusterset (1 genetrees - 0 genomic align trees) if (!defined($self->{'_parent_link'}) and $self->adaptor and ($self->right_index-$self->left_index)>1 and (1==$self->{'_parent_id'}) ) { return $self->adaptor->fetch_root_by_node($self); } # Otherwise, go through the step-by-step method return $self unless(defined($self->parent)); # return $self if($self->node_id eq $self->parent->node_id); return $self->parent->root; } sub subroot { my $self = shift; return undef unless($self->parent); return $self unless(defined($self->parent->parent)); return $self->parent->subroot; } =head2 children Overview : returns a list of NestedSet nodes directly under this parent node Example : my @children = @{$object->children()}; Returntype : array reference of Bio::EnsEMBL::Compara::NestedSet objects (could be empty) Exceptions : none Caller : general Algorithm : new algorithm for fetching children: for each link connected to this NestedsSet node, a child is defined if old: the link is not my parent_link new: the link's neighbors' parent_link is the link This allows one (with a carefully coded algorithm) to overlay a tree on top of a fully connected graph and use the parent/children methods of NestedSet to walk the 'tree' substructure of the graph. Trees that are really just trees are still trees. =cut sub children { my $self = shift; $self->load_children_if_needed; my @kids; foreach my $link (@{$self->links}) { next unless(defined($link)); my $neighbor = $link->get_neighbor($self); next unless($neighbor->parent_link); next unless($neighbor->parent_link->equals($link)); push @kids, $neighbor; } return \@kids; } sub each_child { my $self = shift; # Return an iterator over the children (most effective when children list is LONG) my $count = -1; $self->load_children_if_needed; my @links = @{$self->links}; return sub { while ($count < scalar(@links)) { $count++; my $link = $links[$count]; next unless(defined $link); my $neighbor = $link->get_neighbor($self); next unless($neighbor->parent_link); next unless($neighbor->parent_link->equals($link)); return $neighbor; } return undef; }; } =head2 sorted_children Overview : returns a sorted list of NestedSet nodes directly under this parent node sort so that internal nodes<leaves and then on distance Example : my @kids = @{$object->ordered_children()}; Returntype : array reference of Bio::EnsEMBL::Compara::NestedSet objects (could be empty) Exceptions : none Caller : general =cut sub sorted_children { my $self = shift; my @sortedkids = sort { $a->is_leaf <=> $b->is_leaf || $a->get_child_count <=> $b->get_child_count || $a->distance_to_parent <=> $b->distance_to_parent } @{$self->children;}; return \@sortedkids; } =head2 get_all_nodes Arg 1 : hashref $node_hash [used for recursivity, do not use it!] Example : my $all_nodes = $root->get_all_nodes(); Description : Returns this and all underlying sub nodes ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects Exceptions : none Caller : general Status : Stable =cut sub get_all_nodes { my $self = shift; my $node_hash = shift; my $toplevel = 0; unless($node_hash) { $node_hash = {}; $toplevel =1; } $node_hash->{$self->obj_id} = $self; foreach my $child (@{$self->children}) { $child->get_all_nodes($node_hash); } if ($toplevel) { return [values(%$node_hash)]; } return undef; } =head2 get_all_nodes_by_tagvalue Arg 1 : tag_name Arg 2 : tag_value (optional) Example : my $all_nodes = $root->get_all_nodes_by_tagvalue('taxon_name'=>'Mamalia'); Description : Returns all underlying nodes that have a tag of the given name, and optionally a value of the given value. ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects Exceptions : none Caller : general Status : =cut sub get_all_nodes_by_tag_value{ my $self = shift; my $tag = shift || die( "Need a tag name" ); my $value = shift; my @found; foreach my $node( @{$self->get_all_nodes} ){ my $tval = $node->get_tagvalue($tag); if( defined $tval and $value ? $tval eq $value : 1 ){ push @found, $node; } } return [@found]; } =head2 get_all_subnodes Arg 1 : hashref $node_hash [used for recursivity, do not use it!] Example : my $all_nodes = $root->get_all_nodes(); Description : Returns all underlying sub nodes ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects Exceptions : none Caller : general Status : Stable =cut sub get_all_subnodes { my $self = shift; my $node_hash = shift; my $toplevel = 0; unless($node_hash) { $node_hash = {}; $toplevel =1; } foreach my $child (@{$self->children}) { $node_hash->{$child->obj_id} = $child; $child->get_all_subnodes($node_hash); } return values(%$node_hash) if($toplevel); return undef; } =head2 get_all_ancestors Arg 1 : Example : my @ancestors = @{$node->get_all_ancestors}; Description : Returns all ancestor nodes for a given node ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects Exceptions : none Caller : general Status : =cut sub get_all_ancestors { my $self = shift; my $this = $self; my @ancestors; while( $this = $this->parent ){ push @ancestors, $this; } return [@ancestors] } =head2 get_all_adjacent_subtrees Arg 1 : Example : my @subtrees = @{$node->get_all_adjacent_subtrees}; Description : Returns subtree 'root' nodes where the subtree is adjacent to this node. Used e.g. by the web code for the 'collapse other nodes' action ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects Exceptions : none Caller : EnsEMBL::Web::Component::Gene::ComparaTree Status : =cut sub get_all_adjacent_subtrees { my $self = shift; my $node_id = $self->node_id; my @node_path_to_root = ($self, @{$self->get_all_ancestors} ); my %path_node_ids = map{ $_->node_id => 1 } @node_path_to_root; my $this = $self->subroot; # Start at the root node my @adjacent_subtrees; while( $this ){ last if $this->node_id == $node_id; # Stop on reaching current node my $next; foreach my $child (@{$this->children}){ next if $child->is_leaf; # Leaves cannot be subtrees if( $path_node_ids{$child->node_id} ){ # Ancestor node $next = $child; } else { push @adjacent_subtrees, $child; } } $this = $next || undef; } return [@adjacent_subtrees] } =head2 num_leaves Example : my $num_leaves = $node->num_leaves Description : Returns the number of leaves underlying the node ReturnType : integer Exceptions : none Caller : general Status : At risk (relies on left and right indexes) =cut #' sub num_leaves{ my $self = shift; my $left = $self->left_index; my $right = $self->right_index; return undef unless (defined($left) && defined($right)); my $num = $right - $left + 1; my $num_leaves = ( ($num/2) + 1 ) / 2; return $num_leaves; } sub get_child_count { my $self = shift; $self->load_children_if_needed; return scalar @{$self->children}; # my $count = $self->link_count; # $count-- if($self->has_parent); # return $count; } sub load_children_if_needed { my $self = shift; if($self->adaptor and !defined($self->{'_children_loaded'})) { #define _children_id_hash thereby signally that I've tried to load my children $self->{'_children_loaded'} = 1; #print("load_children_if_needed : "); $self->print_node; $self->adaptor->fetch_all_children_for_node($self); } return $self; } sub no_autoload_children { my $self = shift; return if($self->{'_children_loaded'}); $self->{'_children_loaded'} = 1; } =head2 distance_to_parent Arg [1] : (opt.) <int or double> distance Example : my $dist = $object->distance_to_parent(); Example : $object->distance_to_parent(1.618); Description: Getter/Setter for the distance between this child and its parent Returntype : integer node_id Exceptions : none Caller : general =cut sub distance_to_parent { my $self = shift; my $dist = shift; if($self->{'_parent_link'}) { if(defined($dist)) { $self->{'_parent_link'}->distance_between($dist); } else { $dist = $self->{'_parent_link'}->distance_between; } } else { if(defined($dist)) { $self->_distance($dist); } else { $dist = $self->_distance; } } return $dist; } sub _distance { my $self = shift; $self->{'_distance_to_parent'} = shift if(@_); $self->{'_distance_to_parent'} = 0.0 unless(defined($self->{'_distance_to_parent'})); return $self->{'_distance_to_parent'}; } sub distance_to_root { my $self = shift; my $dist = $self->distance_to_parent; $dist += $self->parent->distance_to_root if($self->parent); return $dist; } =head2 distance_to_ancestor Arg [1] : Bio::EnsEMBL::Compara::NestedSet $ancestor Example : my $distance = $this_node->distance_to_ancestor($ancestor); Description : Calculates the distance in the tree between this node and its ancestor $ancestor Returntype : float Exceptions : throws if $ancestor is not an ancestor of this node. Caller : general Status : Stable =cut sub distance_to_ancestor { my $self = shift; my $ancestor = shift; if ($ancestor->node_id eq $self->node_id) { return 0; } unless (defined $self->parent) { throw("Ancestor not found\n"); } return $self->distance_to_parent + $self->parent->distance_to_ancestor($ancestor); } =head2 distance_to_node Arg [1] : Bio::EnsEMBL::Compara::NestedSet $node Example : my $distance = $this_node->distance_to_node($other_node); Description : Calculates the distance in the tree between these two nodes. Returntype : float Exceptions : returns undef if no ancestor can be found, no distances are defined in the tree, etc. Caller : general Status : Stable =cut sub distance_to_node { my $self = shift; my $node = shift; my $ancestor = $self->find_first_shared_ancestor($node); if (!$ancestor) { return undef; } my $distance = $self->distance_to_ancestor($ancestor); $distance += $node->distance_to_ancestor($ancestor); return $distance; } # Returns a TreeI-compliant object based on this NestedSet. sub get_TreeI { my $self = shift; my $newick = $self->newick_format(); open(my $fake_fh, "+<", \$newick); my $treein = new Bio::TreeIO (-fh => $fake_fh, #-verbose => 1, -format => 'newick'); my $treeI = $treein->next_tree; $treein->close; return $treeI; } sub new_from_newick { my $class = shift; my $file = shift; my $treein = new Bio::TreeIO (-file => $file, -format => 'newick'); my $treeI = $treein->next_tree; $treein->close; my $new_tree = $class->new_from_TreeI($treeI); return bless($new_tree,$class); } sub new_from_TreeI { my $class = shift; my $treeI = shift; my $rootI = $treeI->get_root_node; my $node = new $class; # Kick off the recursive, parallel node adding. _add_nodeI_to_node($node,$rootI); return bless($node,$class); } # Recursive helper for new_from_TreeI. sub _add_nodeI_to_node { my $node = shift; # Our node object (Compara) my $nodeI = shift; # Our nodeI object (BioPerl) $node->node_id(-1); foreach my $c ($nodeI->each_Descendent) { my $child = ref($node)->new; my $name = $c->id; $name =~ s/^\s+//; $name =~ s/\s+$//; if ($c->is_Leaf) { $child = Bio::EnsEMBL::Compara::Member->new(); $child->stable_id($name); $child->source_name("NestedSet.pm"); } # Set name. $child->name($name); $child->store_tag("name",$name); # Set branch length. $node->add_child($child,$c->branch_length); # Recurse. _add_nodeI_to_node($child,$c); } } =head2 print_tree Arg [1] : int $scale Example : $this_node->print_tree(100); Description : Prints this tree in ASCII format. The scale is used to define the width of the tree in the output Returntype : undef Exceptions : Caller : general Status : At risk (as the output might change) =cut sub print_tree { my $self = shift; my $scale = shift; $scale = 100 unless($scale); $self->_internal_print_tree(undef, 0, $scale); } sub string_tree { my $self = shift; my $scale = shift; $scale ||= 100; my $buffer = ''; $self->_internal_string_tree(undef, 0, $scale, \$buffer); return $buffer; } sub _internal_string_tree { my $self = shift; my $indent = shift; my $lastone = shift; my $scale = shift; my $buffer = shift; if(defined($indent)) { $$buffer .= $indent; for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { $$buffer .= '-'; } } $$buffer .= $self->string_node($indent); if(defined($indent)) { if($lastone) { chop($indent); $indent .= " "; } for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { $indent .= ' '; } } $indent = '' unless(defined($indent)); $indent .= "|"; my $children = $self->sorted_children; my $count=0; $lastone = 0; foreach my $child_node (@$children) { $count++; $lastone = 1 if($count == scalar(@$children)); $child_node->_internal_string_tree($indent,$lastone,$scale,$buffer); } } sub _internal_print_tree { my $self = shift; my $indent = shift; my $lastone = shift; my $scale = shift; if(defined($indent)) { print($indent); for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { print('-'); } } $self->print_node($indent); if(defined($indent)) { if($lastone) { chop($indent); $indent .= " "; } for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { $indent .= ' '; } } $indent = '' unless(defined($indent)); $indent .= "|"; my $children = $self->sorted_children; my $count=0; $lastone = 0; foreach my $child_node (@$children) { $count++; $lastone = 1 if($count == scalar(@$children)); $child_node->_internal_print_tree($indent,$lastone,$scale); } } sub print_node { my $self = shift; print $self->string_node; } sub string_node { my $self = shift; my $str = '('; if(defined $self->get_tagvalue("Duplication") && $self->get_tagvalue("Duplication") ne '' && $self->get_tagvalue("Duplication") > 0 && $self->get_tagvalue("dubious_duplication") ne '1') { $str .= "DUP "; } elsif (defined $self->get_tagvalue("Duplication") && $self->get_tagvalue("Duplication") ne '' && $self->get_tagvalue("Duplication") > 0 && $self->get_tagvalue("dubious_duplication") eq '1' ) { $str .= "DD "; } if(defined $self->get_tagvalue("Bootstrap") && $self->get_tagvalue("Bootstrap") ne '') { my $bootstrap_value = $self->get_tagvalue("Bootstrap"); $str .= "B=$bootstrap_value "; } if(defined $self->get_tagvalue("taxon_name") && $self->get_tagvalue("taxon_name") ne '') { my $taxon_name_value = $self->get_tagvalue("taxon_name"); $str .="T=$taxon_name_value "; } $str .= sprintf("%s %d,%d)", $self->node_id, $self->left_index, $self->right_index); $str .= sprintf("%s\n", $self->name); return $str; } sub nhx_format { my $self = shift; my $format_mode = shift; $format_mode="protein_id" unless(defined($format_mode)); my $nhx = $self->_internal_nhx_format($format_mode); $nhx .= ";"; return $nhx; } sub _internal_nhx_format { my $self = shift; my $format_mode = shift; my $nhx = ""; if($self->get_child_count() > 0) { $nhx .= "("; my $first_child=1; foreach my $child (@{$self->sorted_children}) { $nhx .= "," unless($first_child); $nhx .= $child->_internal_nhx_format($format_mode); $first_child = 0; } $nhx .= ")"; } if($format_mode eq "full" || $format_mode eq "full_web" || $format_mode eq "display_label" || $format_mode eq "display_label_composite" || $format_mode eq "treebest_ortho" || $format_mode eq "transcript_id" || $format_mode eq "gene_id" || $format_mode eq "protein_id") { #full: name and distance on all nodes if($self->isa('Bio::EnsEMBL::Compara::AlignedMember')) { if ($format_mode eq "transcript_id") { $self->description =~ /Transcript:(\w+)/; my $transcript_stable_id = $1; $nhx .= sprintf("%s", $transcript_stable_id); } elsif ($format_mode eq "gene_id") { my $gene_stable_id = $self->gene_member->stable_id; $nhx .= sprintf("%s", $gene_stable_id); } elsif ($format_mode eq "display_label_composite") { my $display_label = $self->gene_member->display_label; if (!defined($display_label) || $display_label eq '') { my $display_xref = $self->gene_member->gene->display_xref; $display_label = $display_xref->display_id if (defined($display_xref)); } if (defined($display_label)) { $nhx .= $display_label . "_"; } $nhx .= $self->gene_member->stable_id; $nhx .= "_" . $self->genome_db->short_name; } elsif ($format_mode eq "display_label") { my $display_label = $self->gene_member->display_label; if (!defined($display_label) || $display_label eq '') { my $display_xref = $self->gene_member->gene->display_xref; $display_label = $display_xref->display_id if (defined($display_xref)); } if (defined($display_label) && $display_label =~ /^\w+$/) { $nhx .= $display_label . "_"; } else { $nhx .= $self->gene_member->stable_id . "_"; } # $nhx .= $self->gene_member->stable_id; $nhx .= $self->genome_db->short_name; } elsif ($format_mode eq "protein_id") { $nhx .= sprintf("%s", $self->name); } elsif ($format_mode eq "treebest_ortho") { $nhx .= $self->member_id . "_" . $self->taxon_id; } } else { my $name = sprintf("%s", $self->name); $name = sprintf("%s", $self->get_tagvalue("taxon_name")) if ($name eq ''); $name =~ s/\ /\_/g; $nhx .= $name; } $nhx .= sprintf(":%1.4f", $self->distance_to_parent); $nhx .= "[&&NHX"; if(defined $self->get_tagvalue("Duplication") && $self->get_tagvalue("Duplication") ne '' && $self->get_tagvalue("Duplication") > 0) { # mark as duplication $nhx .= ":D=Y"; } else { # this only applies to internal nodes, hence the name of the # method and the reason we can safely add this here $nhx .= ":D=N"; } if(defined $self->get_tagvalue("Bootstrap") && $self->get_tagvalue("Bootstrap") ne '') { my $bootstrap_value = $self->get_tagvalue("Bootstrap"); # mark as duplication $nhx .= ":B=$bootstrap_value"; } my $taxon_id; if($self->isa('Bio::EnsEMBL::Compara::AlignedMember')) { my $gene_stable_id = $self->gene_member->stable_id; if (defined $gene_stable_id && $format_mode eq "transcript_id") { $nhx .= ":G=$gene_stable_id"; } elsif (defined $gene_stable_id && $format_mode eq "gene_id") { $self->description =~ /Transcript:(\w+)/; my $transcript_stable_id = $1; $nhx .= ":G=$transcript_stable_id"; } elsif (defined $gene_stable_id && $format_mode eq "protein_id") { $nhx .= ":G=$gene_stable_id"; } elsif (defined $gene_stable_id && $format_mode eq "treebest_ortho") { # # $nhx .= ":O=" . $self->stable_id; # # $nhx .= ":G=$gene_stable_id"; } $taxon_id = $self->taxon_id; } else { $taxon_id = $self->get_tagvalue("taxon_id"); } if(defined ($taxon_id) && (!($taxon_id eq ''))) { $nhx .= ":T=$taxon_id"; $nhx .= ":S=$taxon_id" if ($format_mode eq "treebest_ortho"); } $nhx .= "]"; } if($format_mode eq 'simple') { #simplified: name only on leaves, dist only if has parent if($self->parent) { if($self->is_leaf) { $nhx .= sprintf("%s", $self->name); } $nhx .= sprintf(":%1.4f", $self->distance_to_parent); } } if($format_mode eq 'phylip') { #phylip: restrict names to 21 characters if($self->parent) { if($self->is_leaf) { my $name = $self->name; $name =~ s/[,(:)]//g; $name = substr($name, 0 , 21); $nhx .= sprintf("%s", $name); } $nhx .= sprintf(":%1.4f", $self->distance_to_parent); } } return $nhx; } =head2 newick_format Arg [1] : string $format_mode Example : $this_node->newick_format("full"); Description : Prints this tree in Newick format. Several modes are available: full, display_label_composite, simple, species, species_short_name, ncbi_taxon, ncbi_name, njtree and phylip Returntype : undef Exceptions : Caller : general Status : Stable =cut sub newick_format { my $self = shift; my $format_mode = shift; $format_mode="full" unless(defined($format_mode)); my $newick = $self->_internal_newick_format($format_mode); $newick .= ";"; return $newick; } sub _internal_newick_format { my $self = shift; my $format_mode = shift; my $newick = ""; if($self->get_child_count() > 0) { $newick .= "("; my $first_child=1; foreach my $child (@{$self->sorted_children}) { $newick .= "," unless($first_child); $newick .= $child->_internal_newick_format($format_mode); $first_child = 0; } $newick .= ")"; } if($format_mode eq "full") { #full: name and distance on all nodes $newick .= sprintf("%s", $self->name); $newick .= sprintf(":%1.4f", $self->distance_to_parent); } if($format_mode eq "full_common") { #full: name and distance on all nodes my $name = $self->name; my $full_common_name = $name; if ($self->is_leaf) { my $common = uc($self->get_tagvalue('genbank common name')); $common = uc($self->get_tagvalue('ensembl common name')) if (1 > length($common)); $common =~ s/\,//g; $common =~ s/\ /\./g; $common =~ s/\'//g; $full_common_name .= " " . $common if (1 < length($common)); my $gdb_id = $self->adaptor->db->get_GenomeDBAdaptor->fetch_by_taxon_id($self->taxon_id)->dbID; $full_common_name .= ".$gdb_id"; } $newick .= sprintf("%s", $full_common_name); $newick .= sprintf(":%1.4f", $self->distance_to_parent); } if($format_mode eq "int_node_id") { #full: name and distance on all nodes $newick .= sprintf("%s", $self->name) if ($self->is_leaf); $newick .= sprintf("%s", $self->node_id) if (!$self->is_leaf); $newick .= sprintf(":%1.4f", $self->distance_to_parent); } if($format_mode eq "display_label_composite") { #display_label: external name and distance on all nodes my $display_label; if($self->is_leaf) { $display_label = $self->gene_member->display_label; } if (defined($display_label)) { $newick .= $display_label . "_"; } $newick .= $self->name; if ($self->is_leaf) { $newick .= "_" . $self->genome_db->short_name; } $newick .= sprintf(":%1.4f", $self->distance_to_parent); } if($format_mode eq "full_web") { #display_label: external name and distance on all nodes my $display_label = $self->name; if($self->is_leaf) { my $prot_member = $self->get_longest_peptide_Member; #my $gene_member = $self->gene_member; my $short_name = $prot_member->genome_db->short_name; $display_label = $prot_member->stable_id; $display_label = $display_label . '_' . $short_name . '_' ; } $newick .= $display_label; $newick .= sprintf(":%1.4f", $self->distance_to_parent); } if($format_mode eq "gene_stable_id") { #display_label: external name and distance on all nodes my $display_label; if($self->is_leaf) { $display_label = $self->gene_member->stable_id; } if (defined($display_label)) { $newick .= $display_label; } $newick .= sprintf(":%1.4f", $self->distance_to_parent); } if($format_mode eq "otu_id") { #out_id: species name then "|" then external name my $display_label; if($self->is_leaf) { $display_label = $self->gene_member->display_label; } if ($self->is_leaf) { $newick .= $self->genome_db->short_name . "|"; } if (defined($display_label)) { $newick .= $display_label; } $newick .= $self->name; $newick .= sprintf(":%1.4f", $self->distance_to_parent); } if($format_mode eq 'simple') { #simplified: name only on leaves, dist only if has parent if($self->parent) { if($self->is_leaf) { $newick .= sprintf("%s", $self->name); } $newick .= sprintf(":%1.4f", $self->distance_to_parent); } } if($format_mode eq 'species') { #simplified: name only on leaves, dist only if has parent if($self->parent) { if($self->is_leaf) { my $species_name; if ($self->isa('Bio::EnsEMBL::Compara::AlignedMember')) { $species_name = $self->genome_db->name; } else { $species_name = $self->name; } $species_name =~ s/\ /\_/g; $newick .= sprintf("%s", $species_name); } } } if($format_mode eq 'species_short_name') { #simplified: name only on leaves, dist only if has parent if($self->parent) { if($self->is_leaf) { my $species_name; if ($self->isa('Bio::EnsEMBL::Compara::AlignedMember')) { $species_name = $self->genome_db->short_name; } else { $species_name = $self->short_name; } $newick .= sprintf("%s", $species_name); } } } if($format_mode eq 'ncbi_taxon') { #name leaves an internal nodes by ncbi taxon_id if($self->parent) { my $ncbi_taxon_id = $self->node_id; $newick .= sprintf("%s", $ncbi_taxon_id); } } if($format_mode eq 'ncbi_name') { #name leaves an internal nodes by ncbi taxon_id if($self->parent) { my $ncbi_name = $self->name; $newick .= sprintf("%s", $ncbi_name); } } if($format_mode eq 'njtree') { #name leaves an internal nodes by ncbi taxon_id #add * for leaves my $ncbi_taxon_id = $self->node_id; if($self->is_leaf) { my $is_incomplete = $self->get_tagvalue("is_incomplete"); $ncbi_taxon_id .= "*" unless ('1' eq $is_incomplete); } $newick .= sprintf("%s", $ncbi_taxon_id); } if($format_mode eq 'phylip') { #phylip: restrict names to 21 characters if($self->parent) { if($self->is_leaf) { my $name = $self->name; $name =~ s/[,(:)]//g; $name = substr($name, 0 , 21); $newick .= sprintf("%s", $name); } $newick .= sprintf(":%1.4f", $self->distance_to_parent); } } return $newick; } =head2 newick_simple_format Arg [1] : -none- Example : $this_node->newick_simple_format(); Description : Prints this tree in simple Newick format. This is an alias for $this_node->newick_format("simple"); Returntype : undef Exceptions : Caller : general Status : Stable =cut sub newick_simple_format { my $self = shift; return $self->newick_format('simple'); } ################################## # # Set theory methods # ################################## #sub equals { # my $self = shift; # my $other = shift; # throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$other]") # unless($other->isa('Bio::EnsEMBL::Compara::NestedSet')); # return 1 if($self->node_id eq $other->node_id); # foreach my $child (@{$self->children}) { # return 0 unless($other->has_child($child)); # } # return 1; #} sub has_child { my $self = shift; my $child = shift; throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$child]") unless($child->isa('Bio::EnsEMBL::Compara::NestedSet')); $self->load_children_if_needed; my $link = $self->link_for_neighbor($child); return 0 unless($link); return 0 if($self->{'_parent_link'} and ($self->{'_parent_link'}->equals($link))); return 1; } sub is_member_of { my $A = shift; my $B = shift; return 1 if($B->has_child($A)); return 0; } sub is_subset_of { my $A = shift; my $B = shift; foreach my $child (@{$A->children}) { return 0 unless($B->has_child($child)); } return 1; } sub is_leaf { my $self = shift; return 1 unless($self->get_child_count); return 0; } sub merge_children { my $self = shift; my $nset = shift; throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$nset]") unless($nset->isa('Bio::EnsEMBL::Compara::NestedSet')); foreach my $child_node (@{$nset->children}) { $self->add_child($child_node, $child_node->distance_to_parent); } return $self; } sub merge_node_via_shared_ancestor { my $self = shift; my $node = shift; my $node_dup = $self->find_node_by_node_id($node->node_id); if($node_dup) { #warn("trying to merge in a node with already exists\n"); return $node_dup; } return undef unless($node->parent); my $ancestor = $self->find_node_by_node_id($node->parent->node_id); if($ancestor) { $ancestor->add_child($node); #print("common ancestor at : "); $ancestor->print_node; return $ancestor; } return $self->merge_node_via_shared_ancestor($node->parent); } sub extract_subtree_from_leaves { my $self = shift; my $copy = $self->copy; my $node_ids = shift; # Array ref of node_ids. my @keepers = @{$node_ids}; my @all = @{$copy->get_all_nodes}; # Add all ancestors of kept nodes to the keep list. my @all_keepers = (); foreach my $keeper (@keepers) { my $node = $copy->find_node_by_node_id($keeper); push @all_keepers, $keeper; my $parent = $node->parent; while (defined $parent) { push @all_keepers, $parent->node_id; $parent = $parent->parent; } } my @remove_me = (); foreach my $node (@all) { push @remove_me, $node unless (grep {$node->node_id == $_} @all_keepers); } $copy->remove_nodes(\@remove_me); return $copy; } ################################## # # nested_set manipulations # ################################## =head2 flatten_tree Overview : Removes all internal nodes and attaches leaves to the tree root, creating a "flattened" star tree structure. Example : $node->flatten_tree(); Returntype : undef or Bio::EnsEMBL::Compara::NestedSet Exceptions : none Caller : general =cut sub flatten_tree { my $self = shift; my $leaves = $self->get_all_leaves; foreach my $leaf (@{$leaves}) { $leaf->disavow_parent; } $self->release_children; foreach my $leaf (@{$leaves}) { $self->add_child($leaf, 0.0); } return $self; } =head2 re_root Overview : rearranges the tree structure so that the root is moved to beetween this node and its parent. If the old root was more than bifurcated (2 children) a new node is created where it was to hold the multiple children that arises from the re-rooting. The old root is returned. Example : $node->re_root(); Returntype : undef or Bio::EnsEMBL::Compara::NestedSet Exceptions : none Caller : general =cut sub re_root { my $self = shift; return $self unless($self->parent); #I'm root so just return self my $root = $self->root; my $tmp_root = new Bio::EnsEMBL::Compara::NestedSet; $tmp_root->merge_children($root); my $parent = $self->parent; my $dist = $self->distance_to_parent; $self->disavow_parent; my $old_root = $parent->_invert_tree_above; $old_root->minimize_node; $root->add_child($parent, $dist / 2.0); $root->add_child($self, $dist / 2.0); return $root; } sub _invert_tree_above { my $self = shift; return $self unless($self->parent); my $old_root = $self->parent->_invert_tree_above; #now my parent has been inverted so it is the new root #flip the direction of the link between myself and my parent $self->parent->_set_parent_link($self->{'_parent_link'}); $self->_set_parent_link(undef); #now I'm the new root and the old root might need to be modified return $old_root; } sub build_leftright_indexing { my $self = shift; my $counter = shift; $counter = 1 unless($counter); $self->left_index($counter++); foreach my $child_node (@{$self->sorted_children}) { $counter = $child_node->build_leftright_indexing($counter); } $self->right_index($counter++); return $counter; } =head2 remove_nodes Arg [1] : arrayref Bio::EnsEMBL::Compara::NestedSet $nodes Example : my $ret_tree = $tree->remove_nodes($nodes); Description : Returns the tree with removed nodes in list. Nodes should be in the tree. Returntype : Bio::EnsEMBL::Compara::NestedSet object Exceptions : Caller : general Status : At risk (behaviour on exceptions could change) =cut sub remove_nodes { my $self = shift; my $nodes = shift; foreach my $node (@$nodes) { if ($node->is_leaf) { $node->disavow_parent; $self = $self->minimize_tree; } else { my $node_children = $node->children; foreach my $child (@$node_children) { $node->parent->add_child($child); } $node->disavow_parent; } # Delete dangling one-child trees (help memory manager) if ($self->get_child_count == 1) { my $child = $self->children->[0]; $child->parent->merge_children($child); $child->disavow_parent; return undef; } # Could be zero if all asked to delete, so return undef instead of # fake one-node tree. if ($self->get_child_count < 2) { return undef; } } return $self; } =head2 minimize_tree Arg [1] : -none- Example : $leaf->disavow_parent(); $tree = $tree->minimize_tree(); Description : Returns the tree after removing internal nodes that do not represent an multi- or bi-furcation anymore. This is typically required after disavowing a node. Please ensure you use the object returned by the method and not the original object anymore! Returntype : Bio::EnsEMBL::Compara::NestedSet object Exceptions : Caller : general Status : Stable =cut sub minimize_tree { my $self = shift; return $self if($self->is_leaf); foreach my $child (@{$self->children}) { $child->minimize_tree; } return $self->minimize_node; } sub minimize_node { my $self = shift; return $self unless($self->get_child_count() == 1); my $child = $self->children->[0]; my $dist = $child->distance_to_parent + $self->distance_to_parent; if ($self->parent) { $self->parent->add_child($child, $dist); $self->disavow_parent; } else { $child->disavow_parent; } return $child } ################################## # # search methods # ################################## sub find_node_by_name { my $self = shift; my $name = shift; return $self if((defined $self->name) && $name eq $self->name); my $children = $self->children; foreach my $child_node (@$children) { my $found = $child_node->find_node_by_name($name); return $found if(defined($found)); } return undef; } sub find_node_by_node_id { my $self = shift; my $node_id = shift; return $self if($node_id eq $self->node_id); my $children = $self->children; foreach my $child_node (@$children) { my $found = $child_node->find_node_by_node_id($node_id); return $found if(defined($found)); } return undef; } sub find_leaf_by_name { my $self = shift; my $name = shift; return $self if($name eq $self->name); my $leaves = $self->get_all_leaves; foreach my $leaf (@$leaves) { return $leaf if($name eq $leaf->name); } return undef; } sub find_leaf_by_node_id { my $self = shift; my $node_id = shift; return $self if($node_id eq $self->node_id); my $leaves = $self->get_all_leaves; foreach my $leaf (@$leaves) { return $leaf if($node_id eq $leaf->node_id); } return undef; } =head2 get_all_leaves Title : get_all_leaves Usage : my @leaves = @{$tree->get_all_leaves}; Function: searching from the given starting node, searches and creates list of all leaves in this subtree and returns by reference Example : Returns : reference to list of NestedSet objects (all leaves) Args : none =cut sub get_all_leaves { my $self = shift; my $leaves = {}; $self->_recursive_get_all_leaves($leaves); my @leaf_list = sort {$a->node_id <=> $b->node_id} values(%{$leaves}); return \@leaf_list; } sub _recursive_get_all_leaves { my $self = shift; my $leaves = shift; $leaves->{$self->obj_id} = $self if($self->is_leaf); foreach my $child (@{$self->children}) { $child->_recursive_get_all_leaves($leaves); } return undef; } sub get_all_leaves_indexed { my $self = shift; my @leaf_list = @{$self->adaptor->fetch_all_leaves_indexed($self)}; return \@leaf_list; } =head2 max_distance Title : max_distance Args : none Usage : $tree_node->max_distance; Function: searching from the given starting node, calculates the maximum distance to a leaf Returns : int =cut sub max_distance { my $self = shift; my $max_distance = 0; foreach my $child (@{$self->children}) { my $distance = $child->max_distance; $max_distance = $distance if($distance>$max_distance); } return ($self->distance_to_parent + $max_distance); } =head2 max_depth Title : max_depth Args : none Usage : $tree_node->max_depth; Function: searching from the given starting node, calculates the maximum depth to a leaf Returns : int =cut sub max_depth { my $self = shift; my $max_depth = 0; foreach my $child (@{$self->children}) { my $depth = $child->max_depth + 1; $max_depth=$depth if($depth>$max_depth); } return $max_depth; } =head2 find_first_shared_ancestor Arg [1] : Bio::EnsEMBL::Compara::NestedSet $node Example : my $ancestor = $this_node->find_first_shared_ancestor($other_node); Description : Gets the first common ancestor between this node and the other one. Returntype : Bio::EnsEMBL::Compara::NestedSet object Exceptions : Caller : general Status : Stable =cut sub find_first_shared_ancestor { my $self = shift; my $node = shift; return $self if($self->equals($node)); return $node if($self->has_ancestor($node)); return $self->find_first_shared_ancestor($node->parent); } ################################## # # developer/adaptor API methods # ################################## # used for building tree from a DB fetch, want to restrict users to create trees # by only -add_child method sub _set_parent_link { my ($self, $link) = @_; $self->{'_parent_id'} = 0; $self->{'_parent_link'} = $link; $self->{'_parent_id'} = $link->get_neighbor($self)->node_id if($link); return $self; } # used for building tree from a DB fetch until all the objects are in memory sub _parent_id { my $self = shift; $self->{'_parent_id'} = shift if(@_); return $self->{'_parent_id'}; } # used for building tree from a DB fetch until all the objects are in memory sub _root_id { my $self = shift; $self->{'_root_id'} = shift if(@_); return $self->{'_root_id'}; } 1;