Raw content of Bio::EnsEMBL::IdMapping::ScoreBuilder
=head1 LICENSE
Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at .
Questions may also be sent to the Ensembl help desk at
.
=cut
=head1 NAME
Bio::EnsEMBL::IdMapping::ScoreBuilder - score builder base class
=head1 SYNOPSIS
This class is not instantiated. Please see subclasses for usage examples
(e.g. GeneScoreBuilder).
=head1 DESCRIPTION
This is the base class for the score builders used in the stable Id
mapping application. It contains methods which are used by more than one
ScoreBuilder.
=head1 METHODS
create_shrinked_matrix
internal_id_rescore
log_matrix_stats
=cut
package Bio::EnsEMBL::IdMapping::ScoreBuilder;
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::IdMapping::BaseObject;
our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
=head2 create_shrinked_matrix
Arg[1] : Bio::EnsEMBL::Idmapping::ScoredMappingMatrix $matrix - a scoring
matrix
Arg[2] : Bio::EnsEMBL::Idmapping::MappingList $mappings - mappings
Arg[3] : String $cache_file - base name of a cache file (extension '.ser'
will be added automatically) for the returned matrix
Example : my $new_scores = $score_builder->create_shrinked_matrix(
$gene_scores, $mappings, "gene_matrix1");
Description : Create a shrinked scoring matrix which doesn't contain entries
which were already mapped. It also logs how many new mappings
were added in this process.
Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
Exceptions : thrown on wrong or missing arguments
Caller : InternalIdMapper plugin
Status : At Risk
: under development
=cut
#
sub create_shrinked_matrix {
my $self = shift;
my $matrix = shift;
my $mappings = shift;
my $cache_file = shift; # base name, extension '.ser' will be added
# argument checks
unless ($matrix and
$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
}
unless ($mappings and
$mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
}
throw('Need a cache file name.') unless ($cache_file);
my $dump_path = path_append($self->conf->param('basedir'), 'matrix');
$cache_file .= '.ser';
my $shrinked_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
-DUMP_PATH => $dump_path,
-CACHE_FILE => $cache_file,
-AUTO_LOAD => 1,
);
# if we already found a saved matrix, just return it
if ($shrinked_matrix->loaded) {
$self->logger->info("Read existing scoring matrix from $cache_file.\n");
} else {
# create lookup hashes for sources and targets in the MappingList
my %sources = ();
my %targets = ();
foreach my $entry (@{ $mappings->get_all_Entries }) {
$sources{$entry->source} = 1;
$targets{$entry->target} = 1;
}
# add all entries to shrinked matrix which are not in the MappingList
foreach my $entry (@{ $matrix->get_all_Entries }) {
unless ($sources{$entry->source} or $targets{$entry->target}) {
$shrinked_matrix->add_Entry($entry);
}
}
}
# log shrinking stats
$self->logger->info('Sources '.$matrix->get_source_count.' --> '.
$shrinked_matrix->get_source_count."\n");
$self->logger->info('Targets '.$matrix->get_target_count.' --> '.
$shrinked_matrix->get_target_count."\n");
$self->logger->info('Entries '.$matrix->get_entry_count.' --> '.
$shrinked_matrix->get_entry_count."\n");
$self->logger->info('New mappings: '.$mappings->get_entry_count."\n\n");
return $shrinked_matrix;
}
=head2 internal_id_rescore
Arg[1] : Bio::EnsEMBL::Idmapping::ScoredMappingMatrix $matrix - a scoring
matrix
Example : $score_builder->internal_id_rescore($gene_scores);
Description : Rescore ambiguous mappings based on internal Ids. This is the
last disambiguation step and is only useful if objects with the
same internal Id were used in source and target dbs (e.g. in
patch builds or if objects were copied from source to target).
If a source and target gene have the same internal Id and there
are mappings to other target genes then these *other* mappings
are penalised.
Return type : none
Exceptions : thrown on wrong or missing argument
Caller : InternalIdMapper plugins
Status : At Risk
: under development
=cut
sub internal_id_rescore {
my $self = shift;
my $matrix = shift;
unless ($matrix and
$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
}
my $i = 0;
foreach my $source (@{ $matrix->get_all_sources }) {
my @entries = sort { $b <=> $a }
@{ $matrix->get_Entries_for_source($source) };
# nothing to do if we only have one mapping
next unless (scalar(@entries) > 1);
# only penalise if mappings are ambiguous
next unless ($entries[0]->score == $entries[1]->score);
# only penalise if one source id == target id where score == best score
my $ambiguous = 0;
foreach my $e (@entries) {
if ($e->target == $source and $e->score == $entries[0]) {
$ambiguous = 1;
}
}
next unless ($ambiguous);
# now penalise those where source id != target id and score == best score
foreach my $e (@entries) {
if ($e->target != $source and $e->score == $entries[0]) {
$matrix->set_score($source, $e->target, ($e->score * 0.8));
$i++;
}
}
}
$self->logger->debug("Scored entries with internal ID mismatch: $i\n", 1);
}
=head2 log_matrix_stats
Arg[1] : Bio::EnsEMBL::Idmapping::ScoredMappingMatrix $matrix - a scoring
matrix
Example : $score_builder->log_matrix_stats;
Description : Logs scoring matrix statistics (number of entries, min/max/avg
scores).
Return type : none
Exceptions : thrown on wrong or missing argument
Caller : general
Status : At Risk
: under development
=cut
sub log_matrix_stats {
my $self = shift;
my $matrix = shift;
unless ($matrix and
$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
throw('You must provide a ScoredMappingMatrix.');
}
my $fmt1 = "%-40s%10.0f\n";
my $fmt2 = "%-40s%10.5f\n";
$self->logger->info(sprintf($fmt1, "Scoring matrix entries:",
$matrix->get_entry_count), 1);
$self->logger->info(sprintf($fmt1, "Scoring matrix sources:",
$matrix->get_source_count), 1);
$self->logger->info(sprintf($fmt1, "Scoring matrix targets:",
$matrix->get_target_count), 1);
$self->logger->info(sprintf($fmt2, "Average score:",
$matrix->get_average_score), 1);
my ($min, $max) = @{ $matrix->get_min_max_scores };
$self->logger->info(sprintf($fmt2, "Min. score:", $min), 1);
$self->logger->info(sprintf($fmt2, "Max. score:", $max), 1);
}
1;