# Raw content of Bio::EnsEMBL::IdMapping::ScoredMappingMatrix

=head1 LICENSE

  Copyright (c) 1999-2009 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    /info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

Bio::EnsEMBL::IdMapping::ScoredMappingMatrix - object holding a list of
scored Entries

=head1 SYNOPSIS

  # create a new ScoredMappingMatrix
  my $gene_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
    -DUMP_PATH  => $dump_path,
    -CACHE_FILE => 'gene_scores.ser',
  );

  # add entries
  $gene_scores->add_Entry($entry1);

  # serialise to file
  $gene_scores->write_to_file;

  # later, read these gene_scores from file
  my $gene_scores1 = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
    -DUMP_PATH  => $dump_path,
    -CACHE_FILE => 'gene_scores.ser',
  );
  $gene_scores1->read_from_file;

=head1 DESCRIPTION

This object represents a collection of scores between source and target
objects. It holds a list of Bio::EnsEMBL::IdMapping::Entry objects and
has methods to retrieve individual or all Entries, as well as derived
data such as the number of unique sources or targets, or various counts
and averages.

It is the main collection for dealing with scored relationships in the
stable Id mapping application.

=head1 METHODS

  new
  flush
  sub_matrix
  add_Entry
  update_Entry
  remove_Entry
  add_score
  set_score
  get_Entry
  get_score
  get_targets_for_source
  get_Entries_for_source
  get_sources_for_target
  get_Entries_for_target
  get_all_Entries
  get_all_sources
  get_all_targets
  get_entry_count
  size
  get_source_count
  get_target_count
  get_min_max_scores
  get_average_score
  merge
  log
  to_string

=cut

package Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;

use strict;
use warnings;
no warnings 'uninitialized';

use Bio::EnsEMBL::IdMapping::Serialisable;
our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable);

use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
use Bio::EnsEMBL::IdMapping::Entry;


=head2 new

  Arg[1-N]    : see superclass
  Example     : my $gene_scores =
                  Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
                    -DUMP_PATH  => $dump_path,
                    -CACHE_FILE => 'gene_scores.ser',
                  );
  Description : Constructor. Passes all arguments on to the superclass
                constructor and, unless the resulting object reports
                itself as loaded, gives it empty 'matrix', 'source_list'
                and 'target_list' caches.
  Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub new {
  my ($caller, @args) = @_;

  # allow both Class->new(...) and $object->new(...)
  my $class = ref($caller) || $caller;
  my $self  = $class->SUPER::new(@args);

  # only create the internal data structures when loaded() is false
  if (!$self->loaded) {
    foreach my $slot (qw(matrix source_list target_list)) {
      $self->{'cache'}->{$slot} = {};
    }
  }

  return $self;
}


=head2 flush

  Example     : $gene_scores->flush;
  Description : Flushes (empties) the scoring matrix.
Return type : none
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub flush {
  my ($self) = @_;

  # reset caches; the target list is replaced last
  $self->{'cache'}{$_} = {} for qw(matrix source_list);
  $self->{'cache'}{'target_list'} = {};
}


=head2 sub_matrix

  Arg[1]      : Int $start - start index (inclusive)
  Arg[2]      : Int $end - end index (inclusive)
  Example     : # get the first 1000 elements in the matrix
                my $sub_matrix = $gene_scores->sub_matrix(1, 1000);
  Description : Builds a new ScoredMappingMatrix from a slice of this one.
                Elements are numbered from 1 (not 0) in the string-sorted
                order of their "source:target" keys; $start and $end are
                both inclusive. A false $start defaults to 1, a false $end
                to the size of the matrix.
  Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub sub_matrix {
  my ($self, $start, $end) = @_;

  # without explicit bounds the full matrix is returned
  $start = 1           unless $start;
  $end   = $self->size unless $end;

  my $sub_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
    -DUMP_PATH  => $self->dump_path,
    -CACHE_FILE => $self->cache_file_name,
  );

  my $position = 0;

  PAIR:
  foreach my $pair (sort keys %{ $self->{'cache'}->{'matrix'} }) {
    ++$position;
    next PAIR if ($position < $start);
    last PAIR if ($position > $end);

    my ($source, $target) = split(/:/, $pair);
    $sub_matrix->add_score($source, $target,
      $self->{'cache'}->{'matrix'}->{$pair});
  }

  return $sub_matrix;
}


=head2 add_Entry

  Arg[1]      : Bio::EnsEMBL::IdMapping::Entry $entry - Entry to add
  Example     : $gene_scores->add_Entry($entry);
  Description : Adds an Entry to the scoring matrix.
Return type : Float - the Entry's score
  Exceptions  : thrown on wrong or missing argument
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub add_Entry {
  my ($self, $entry) = @_;

  my $is_entry = ($entry and
                  $entry->isa('Bio::EnsEMBL::IdMapping::Entry'));
  throw("Need a Bio::EnsEMBL::IdMapping::Entry") unless ($is_entry);

  # the matrix only stores the (source, target, score) triple
  return $self->add_score($entry->source, $entry->target, $entry->score);
}


=head2 update_Entry

  Arg[1]      : Bio::EnsEMBL::IdMapping::Entry $entry - Entry to update
  Example     : $gene_scores->update_Entry($entry);
  Description : Updates the score stored for an Entry. This is only an
                intuitively named wrapper; the work is done by
                add_Entry().
  Return type : Float - the Entry's score
  Exceptions  : thrown on wrong or missing argument
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub update_Entry {
  my ($self, $entry) = @_;
  return $self->add_Entry($entry);
}


#
# not needed in the current application, so not implemented
#
sub remove_Entry {
  return warning('Method ScoredMappingMatrix->remove_Entry not implemented (yet).');
}


=head2 add_score

  Arg[1]      : Int $source - source object's internal Id ("dbID")
  Arg[2]      : Int $target - target object's internal Id ("dbID")
  Arg[3]      : Float $score - score for source/target pair
  Example     : $gene_scores->add_score(1234, 5678, 0.997);
  Description : Adds a score for a source/target pair to the scoring matrix.
                This is a low-level version of add_Entry().
Return type : Float - the score
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub add_score {
  my ($self, $source, $target, $score) = @_;

  my $pair = "$source:$target";

  # a pair is put on the source and target lists only the first time it
  # is seen, so the lists never hold duplicates
  if (!exists($self->{'cache'}{'matrix'}{$pair})) {
    push @{ $self->{'cache'}{'source_list'}{$source} }, $target;
    push @{ $self->{'cache'}{'target_list'}{$target} }, $source;
  }

  return ($self->{'cache'}{'matrix'}{$pair} = $score);
}


=head2 set_score

  Arg[1]      : Int $source - source object's internal Id ("dbID")
  Arg[2]      : Int $target - target object's internal Id ("dbID")
  Arg[3]      : Float $score - score for source/target pair
  Example     : $gene_scores->set_score(1234, 5678, 0.997);
  Description : Stores the score for a source/target pair. Unlike
                add_score(), this method never touches the source and
                target lists; it assumes the pair was added before.
  Return type : Float - the score
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub set_score {
  my ($self, $source, $target, $score) = @_;

  my $pair = "$source:$target";

  return ($self->{'cache'}{'matrix'}{$pair} = $score);
}


=head2 get_Entry

  Arg[1]      : Int $source - source object's internal Id ("dbID")
  Arg[2]      : Int $target - target object's internal Id ("dbID")
  Example     : my $entry = $gene_scores->get_Entry($source_gene->id,
                  $target_gene->id);
  Description : Gets an Entry from the scoring matrix for a given source and
                target object.
Return type : Bio::EnsEMBL::IdMapping::Entry or undef
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_Entry {
  my ($self, $source, $target) = @_;

  my $pair = "$source:$target";

  # pairs without a score yield undef instead of an Entry
  return undef unless (exists($self->{'cache'}{'matrix'}{$pair}));

  return Bio::EnsEMBL::IdMapping::Entry->new_fast(
    [$source, $target, $self->{'cache'}{'matrix'}{$pair}]
  );
}


=head2 get_score

  Arg[1]      : Int $source - source object's internal Id ("dbID")
  Arg[2]      : Int $target - target object's internal Id ("dbID")
  Example     : my $score = $gene_scores->get_score($source_gene->id,
                  $target_gene->id);
  Description : Looks up the score stored for a source/target pair.
                Returns undef when the matrix has no such pair.
  Return type : Float or undef
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_score {
  my ($self, $source, $target) = @_;

  my $pair = "$source:$target";

  return exists($self->{'cache'}{'matrix'}{$pair})
    ? $self->{'cache'}{'matrix'}{$pair}
    : undef;
}


=head2 get_targets_for_source

  Arg[1]      : Int $source - source object's internal Id ("dbID")
  Example     : my @targets = @{ $gene_scores->get_targets_for_source(1234) };
  Description : Returns the targets which have a score against the given
                source object. The returned arrayref is the list held
                inside the matrix (not a copy); an unknown source yields
                a new empty arrayref.
  Return type : Arrayref of Int (target objects' internal Ids)
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_targets_for_source {
  my ($self, $source) = @_;

  my $targets = $self->{'cache'}{'source_list'}{$source};

  return $targets ? $targets : [];
}


=head2 get_Entries_for_source

  Arg[1]      : Int $source - source object's internal Id ("dbID")
  Example     : my @entries = $gene_scores->get_Entries_for_source(1234);
  Description : Returns a list of all Entries in the scoring matrix for a given
                source object.
Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry objects
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_Entries_for_source {
  my ($self, $source) = @_;

  my @entries;

  # one Entry per target on this source's list, in list order
  foreach my $target (@{ $self->{'cache'}{'source_list'}{$source} || [] }) {
    push @entries, $self->get_Entry($source, $target);
  }

  return \@entries;
}


=head2 get_sources_for_target

  Arg[1]      : Int $target - target object's internal Id ("dbID")
  Example     : my @sources = @{ $gene_scores->get_sources_for_target(5678) };
  Description : Returns the sources which have a score against the given
                target object. The returned arrayref is the list held
                inside the matrix (not a copy); an unknown target yields
                a new empty arrayref.
  Return type : Arrayref of Int (source objects' internal Ids)
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_sources_for_target {
  my ($self, $target) = @_;

  my $sources = $self->{'cache'}{'target_list'}{$target};

  return $sources ? $sources : [];
}


=head2 get_Entries_for_target

  Arg[1]      : Int $target - target object's internal Id ("dbID")
  Example     : my @entries = @{ $gene_scores->get_Entries_for_target(5678) };
  Description : Builds one Entry (via get_Entry()) for every source on the
                given target's list, in list order.
  Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry objects
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_Entries_for_target {
  my ($self, $target) = @_;

  my @entries;

  foreach my $source (@{ $self->{'cache'}{'target_list'}{$target} || [] }) {
    push @entries, $self->get_Entry($source, $target);
  }

  return \@entries;
}


=head2 get_all_Entries

  Example     : foreach my $entry (@{ $gene_scores->get_all_Entries }) {
                  # do something with the entry
                }
  Description : Returns a list of all Entries in the scoring matrix.
Return type : Arrayref of Bio::EnsEMBL::IdMapping::Entry objects
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_all_Entries {
  my ($self) = @_;

  # one Entry per "source:target" key, in hash iteration order
  my @entries = map {
    my ($source, $target) = split(/:/, $_);
    Bio::EnsEMBL::IdMapping::Entry->new_fast(
      [$source, $target, $self->{'cache'}->{'matrix'}->{$_}]
    );
  } keys %{ $self->{'cache'}->{'matrix'} };

  return \@entries;
}


=head2 get_all_sources

  Example     : my @sources = @{ $gene_scores->get_all_sources };
  Description : Returns the Ids of all sources, i.e. the keys of the
                internal source list, in no particular order.
  Return type : Arrayref of Int (source objects' internal Ids)
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_all_sources {
  my ($self) = @_;

  my @sources = keys %{ $self->{'cache'}->{'source_list'} };

  return \@sources;
}


=head2 get_all_targets

  Example     : my @targets = @{ $gene_scores->get_all_targets };
  Description : Returns the Ids of all targets, i.e. the keys of the
                internal target list, in no particular order.
  Return type : Arrayref of Int (target objects' internal Ids)
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_all_targets {
  my ($self) = @_;

  my @targets = keys %{ $self->{'cache'}->{'target_list'} };

  return \@targets;
}


=head2 get_entry_count

  Example     : my $num_entries = $gene_scores->get_entry_count;
  Description : Counts the source/target pairs stored in the scoring
                matrix.
  Return type : Int
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_entry_count {
  my ($self) = @_;

  my $count = keys %{ $self->{'cache'}->{'matrix'} };

  return $count;
}


=head2 size

  Example     : my $size = $gene_scores->size;
  Description : Alias for get_entry_count().
  Return type : Int
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub size {
  my ($self) = @_;
  return $self->get_entry_count;
}


=head2 get_source_count

  Example     : my $num_sources = $gene_scores->get_source_count;
  Description : Returns the number of distinct sources in the scoring matrix.
Return type : Int
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_source_count {
  my ($self) = @_;

  my $count = keys %{ $self->{'cache'}->{'source_list'} };

  return $count;
}


=head2 get_target_count

  Example     : my $num_targets = $gene_scores->get_target_count;
  Description : Counts the distinct targets, i.e. the keys of the
                internal target list.
  Return type : Int
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_target_count {
  my ($self) = @_;

  my $count = keys %{ $self->{'cache'}->{'target_list'} };

  return $count;
}


=head2 get_min_max_scores

  Example     : my ($min_score, $max_score) =
                  @{ $gene_scores->get_min_max_scores };
  Description : Finds the smallest and largest score in the scoring
                matrix, comparing numerically. For an empty matrix both
                values are undef.
  Return type : Arrayref of Float [min_score, max_score]
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_min_max_scores {
  my ($self) = @_;

  my @scores = values %{ $self->{'cache'}->{'matrix'} };

  return [undef, undef] if (!@scores);

  # seed both extremes with the first score, then scan all of them
  my ($min, $max) = @scores[0, 0];

  foreach my $score (@scores) {
    $min = $score if ($score < $min);
    $max = $score if ($score > $max);
  }

  return [$min, $max];
}


=head2 get_average_score

  Example     : my $avg_score = $gene_scores->get_average_score;
  Description : Returns the average (mean) score in the matrix.
Return type : Float
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub get_average_score {
  my ($self) = @_;

  my @scores = values %{ $self->{'cache'}->{'matrix'} };

  # an empty matrix has no mean
  return undef unless (@scores);

  my $total = 0;
  $total += $_ foreach (@scores);

  return $total/scalar(@scores);
}


=head2 merge

  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - another
                matrix to merge with
  Example     : my $update_count = $gene_scores->merge($more_gene_scores);
  Description : Merges another scoring matrix into this one. If there's an
                Entry for a source/target pair in both matrices, the higher
                score is retained. The source and target lists are merged
                without duplicates, keeping this matrix's existing order
                and appending new Ids in the other matrix's order. The
                merged lists are always new arrays, so the two matrices
                never share list references and later changes to one do
                not leak into the other. The argument is not modified.
  Return type : Int - number of Entries added or updated
  Exceptions  : thrown on wrong or missing argument
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub merge {
  my ($self, $matrix) = @_;

  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('You must provide a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix');
  }

  my $own_scores   = ($self->{'cache'}->{'matrix'} ||= {});
  my $other_scores = $matrix->{'cache'}->{'matrix'} || {};

  my $c = 0;

  # merge the matrices: take over pairs we don't have (or have without a
  # defined score) and pairs for which the other matrix scores higher
  foreach my $key (keys %$other_scores) {
    my $other_score = $other_scores->{$key};

    if (!defined($own_scores->{$key}) or $own_scores->{$key} < $other_score) {
      $own_scores->{$key} = $other_score;
      $c++;
    }
  }

  # merge source and target lists; both are handled identically
  foreach my $list_name (qw(source_list target_list)) {
    my $own_lists   = ($self->{'cache'}->{$list_name} ||= {});
    my $other_lists = $matrix->{'cache'}->{$list_name} || {};

    foreach my $id (keys %$other_lists) {
      my %seen;

      # always build a fresh array: storing the other matrix's arrayref
      # directly would make both objects share (and mutate) one list
      $own_lists->{$id} = [
        grep { !$seen{$_}++ }
          @{ $own_lists->{$id} || [] }, @{ $other_lists->{$id} || [] }
      ];
    }
  }

  return $c;
}


=head2 log

  Arg[1]      : String $type - object type (e.g. 'gene')
  Arg[2]      : String $dump_path - path for writing output
  Example     : $gene_scores->log('gene', $conf->param('basedir'));
  Description : Logs all Entries in the scoring matrix to the file
                <debug path>/<type>_scores.txt, one stringified Entry per
                line, where <debug path> is what path_append() returns for
                $dump_path and 'debug'. Used for debugging.
  Return type : none
  Exceptions  : thrown on I/O error (opening or closing the logfile)
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub log {
  my ($self, $type, $dump_path) = @_;

  my $debug_path = path_append($dump_path, 'debug');
  my $logfile = "$debug_path/${type}_scores.txt";

  open(my $fh, '>', $logfile) or
    throw("Unable to open $logfile for writing: $!");

  foreach my $entry (@{ $self->get_all_Entries }) {
    print {$fh} $entry->to_string."\n";
  }

  # errors from buffered writes only surface when the handle is closed
  close($fh) or
    throw("Unable to close $logfile after writing: $!");
}


=head2 to_string

  Example     : print LOG $gene_scores->to_string, "\n";
  Description : Returns a string representation of the scoring matrix. This
                is a multi-line string with one stringified Entry per
                line, each terminated by a newline; an empty matrix gives
                the empty string. Useful for debugging and logging.
  Return type : String
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub to_string {
  my ($self) = @_;

  return join('', map { $_->to_string."\n" } @{ $self->get_all_Entries });
}


1;