Raw content of Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper =head1 LICENSE Copyright (c) 1999-2009 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see /info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <ensembl-dev@ebi.ac.uk>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =cut =head1 NAME =head1 SYNOPSIS =head1 DESCRIPTION =head1 METHODS =cut package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; use strict; use warnings; no warnings 'uninitialized'; use Bio::EnsEMBL::IdMapping::BaseObject; our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); use Bio::EnsEMBL::IdMapping::MappingList; # scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this use constant SIMILAR_SCORE_RATIO => 0.01; # # find the highest unambiguous score for all sources and targets in a scoring # matrix # sub basic_mapping { my $self = shift; my $matrix = shift; my $mapping_name = shift; # argument checks unless ($matrix and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } throw('Need a name for serialising the mapping.') unless ($mapping_name); # Create a new MappingList object. Specify AUTO_LOAD to load serialised # existing mappings if found my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( -DUMP_PATH => $dump_path, -CACHE_FILE => "${mapping_name}.ser", -AUTO_LOAD => 1, ); # checkpoint test: return a previously stored MappingList if ($mappings->loaded) { $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n"); return $mappings; } my $sources_done = {}; my $targets_done = {}; # sort scoring matrix entries by descending score my @sorted_entries = sort { $b->score <=> $a->score } @{ $matrix->get_all_Entries }; # debug #my $idx = substr($mapping_name, -1); while (my $entry = shift(@sorted_entries)) { #$self->logger->debug("\nxxx$idx ".$entry->to_string." "); # we already found a mapping for either source or target next if ($sources_done->{$entry->source} or $targets_done->{$entry->target}); #$self->logger->debug('d'); # there's a better mapping for either source or target next if ($self->higher_score_exists($entry, $matrix, $sources_done, $targets_done)); #$self->logger->debug('h'); # check for ambiguous mappings; they are dealt with later my $other_sources = []; my $other_targets = []; if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) { #$self->logger->debug('a'); $other_sources = $self->filter_sources($other_sources, $sources_done); $other_targets = $self->filter_targets($other_targets, $targets_done); next if (scalar(@$other_sources) or scalar(@$other_targets)); } #$self->logger->debug('A'); # this is the best mapping, add it $mappings->add_Entry($entry); $sources_done->{$entry->source} = 1; $targets_done->{$entry->target} = 1; } # create checkpoint $mappings->write_to_file; return $mappings; } sub higher_score_exists { my ($self, $entry, $matrix, $sources_done, $targets_done) = @_; my $source = $entry->source; my $target = $entry->target; my $score = $entry->score; foreach my $other_source (@{ $matrix->get_sources_for_target($target) }) { if ($other_source != $source and !$sources_done->{$other_source} and $score < $matrix->get_score($other_source, $target)) { return 1; } } foreach my $other_target (@{ $matrix->get_targets_for_source($source) }) { if ($other_target != $target and !$targets_done->{$other_target} and $score < $matrix->get_score($source, $other_target)) { return 1; } } return 0; } # # find ambiguous mappings (see scores_similar() for definition) # sub ambiguous_mapping { my ($self, $entry, $matrix, $other_sources, $other_targets) = @_; my $source = $entry->source; my $target = $entry->target; my $score = $entry->score; my $retval = 0; foreach my $other_source (@{ $matrix->get_sources_for_target($target) }) { my $other_score = $matrix->get_score($other_source, $target); if ($other_source != $source and ($self->scores_similar($score, $other_score) or $score < $other_score)) { $retval = 1; push @{ $other_sources }, $other_source; } } foreach my $other_target (@{ $matrix->get_targets_for_source($source) }) { my $other_score = $matrix->get_score($source, $other_target); if ($other_target != $target and ($self->scores_similar($score, $other_score) or $score < $other_score)) { $retval = 1; push @{ $other_targets }, $other_target; } } return $retval; } # # rule for similarity taken from java code... # sub scores_similar { my ($self, $s1, $s2) = @_; # always give priority to exact matches over very similar ones return 0 if ($s1 == 1 and $s2 < 1); my $diff = $s1 -$s2; $diff = -$diff if ($diff < 0); my $pc = 2 * $diff / ($s1 + $s2); return ($pc < SIMILAR_SCORE_RATIO); } sub filter_sources { my ($self, $other_sources, $sources_done) = @_; unless (scalar(@$other_sources) and scalar(keys %$sources_done)) { return $other_sources; } my @tmp = (); foreach my $e (@{ $other_sources }) { push @tmp, $e unless ($sources_done->{$e}); } return \@tmp; } sub filter_targets { my ($self, $other_targets, $targets_done) = @_; unless (scalar(@{ $other_targets }) and scalar(keys %$targets_done)) { return $other_targets; } my @tmp = (); foreach my $e (@{ $other_targets }) { push @tmp, $e unless ($targets_done->{$e}); } return \@tmp; } 1;