Raw content of Bio::EnsEMBL::IdMapping::StableIdMapper =head1 LICENSE Copyright (c) 1999-2009 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see /info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <ensembl-dev@ebi.ac.uk>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =cut =head1 NAME =head1 SYNOPSIS =head1 DESCRIPTION =head1 METHODS =cut package Bio::EnsEMBL::IdMapping::StableIdMapper; use strict; use warnings; no warnings 'uninitialized'; use Bio::EnsEMBL::IdMapping::BaseObject; our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append); use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; use POSIX qw(strftime); # instance variables my %debug_mappings; sub new { my $caller = shift; my $class = ref($caller) || $caller; my $self = $class->SUPER::new(@_); # inject a StableIdGenerator # # If you write your own generators, make sure they extend # Bio::EnsEMBL::Idmapping::BaseObject and additionally implement these three # methods: initial_stable_id(), increment_stable_id() and calculate_version(). my $stable_id_generator = $self->conf->param('plugin_stable_id_generator') || 'Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric'; $self->logger->debug("Using $stable_id_generator to generate stable Ids.\n"); inject($stable_id_generator); # create a new StableIdGenerator object my $generator_instance = $stable_id_generator->new( -LOGGER => $self->logger, -CONF => $self->conf, -CACHE => $self->cache ); $self->stable_id_generator($generator_instance); return $self; } sub generate_mapping_session { my $self = shift; # only run this method once return if ($self->mapping_session_date); $self->logger->info("Generating new mapping_session...\n"); $self->mapping_session_date(time); $self->mapping_session_date_fmt(strftime("%Y-%m-%d %T", localtime($self->mapping_session_date))); my $s_dba = $self->cache->get_DBAdaptor('source'); my $s_dbh = $s_dba->dbc->db_handle; my $t_dba = $self->cache->get_DBAdaptor('target'); my $t_dbh = $t_dba->dbc->db_handle; # check if mapping_session_id was manually set by the configuration my $mapping_session_id = $self->conf->param('mapping_session_id'); if ($mapping_session_id) { $self->logger->debug("Using manually configured mapping_session_id $mapping_session_id\n", 1); } else { # calculate mapping_session_id from db my $sql = qq(SELECT MAX(mapping_session_id) FROM mapping_session); $mapping_session_id = $self->fetch_value_from_db($s_dbh, $sql); unless ($mapping_session_id) { $self->logger->debug("No previous mapping_session found.\n", 1); } # increment last mapping_session_id $mapping_session_id++; $self->logger->debug("Using mapping_session_id $mapping_session_id\n", 1); } $self->mapping_session_id($mapping_session_id); # write old mapping_session table to a file my $i; my $fh = $self->get_filehandle('mapping_session.txt', 'tables'); my $sth1 = $s_dbh->prepare("SELECT * FROM mapping_session"); $sth1->execute; while (my @row = $sth1->fetchrow_array) { $i++; print $fh join("\t", @row); print $fh "\n"; } $sth1->finish; # append the new mapping_session to the file my $release_sql = qq( SELECT meta_value FROM meta WHERE meta_key = 'schema_version' ); my $old_release = $self->fetch_value_from_db($s_dbh, $release_sql); my $new_release = $self->fetch_value_from_db($t_dbh, $release_sql); my $assembly_sql = qq( SELECT meta_value FROM meta WHERE meta_key = 'assembly.default' ); my $old_assembly = $self->fetch_value_from_db($s_dbh, $assembly_sql); my $new_assembly = $self->fetch_value_from_db($t_dbh, $assembly_sql); unless ($old_release and $new_release and $old_assembly and $new_assembly) { $self->logger->warning("Not all data for new mapping_session found:\n", 1); $self->logger->info("old_release: $old_release, new_release: $new_release"); $self->logger->info("old_assembly: $old_assembly, new_assembly $new_assembly\n", 2); } print $fh join("\t", $mapping_session_id, $self->conf->param('sourcedbname'), $self->conf->param('targetdbname'), $old_release, $new_release, $old_assembly, $new_assembly, $self->mapping_session_date_fmt); print $fh "\n"; close($fh); $self->logger->info("Done writing ".++$i." mapping_session entries.\n\n"); } sub map_stable_ids { my $self = shift; my $mappings = shift; my $type = shift; unless ($mappings and $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { throw("Need a Bio::EnsEMBL::IdMapping::MappingList of ${type}s."); } # generate a new mapping_session and write all mapping_session data to a file $self->generate_mapping_session; $self->logger->info("== Stable ID mapping for $type...\n\n", 0, 'stamped'); # check if there are any objects of this type at all my %all_sources = %{ $self->cache->get_by_name("${type}s_by_id", 'source') }; my %all_targets = %{ $self->cache->get_by_name("${type}s_by_id", 'target') }; unless (scalar(keys %all_sources)) { $self->logger->info("No cached ${type}s found.\n\n"); return; } my %stats = map { $_ => 0 } qw(mapped_known mapped_novel new lost_known lost_novel); # create some lookup hashes from the mappings my %sources_mapped = (); my %targets_mapped = (); my %scores_by_target = (); foreach my $e (@{ $mappings->get_all_Entries }) { $sources_mapped{$e->source} = $e->target; $targets_mapped{$e->target} = $e->source; $scores_by_target{$e->target} = $e->score; } # determine starting stable ID for new assignments my $new_stable_id = $self->stable_id_generator->initial_stable_id($type); # # assign mapped and new stable IDs # foreach my $tid (keys %all_targets) { my $t_obj = $all_targets{$tid}; # a mapping exists, assign stable ID accordingly if (my $sid = $targets_mapped{$tid}) { my $s_obj = $all_sources{$sid}; # set target's stable ID and created_date $t_obj->stable_id($s_obj->stable_id); $t_obj->created_date($s_obj->created_date); # calculate and set version $t_obj->version($self->stable_id_generator->calculate_version( $s_obj, $t_obj)); # change modified_date if version changed if ($s_obj->version == $t_obj->version) { $t_obj->modified_date($s_obj->modified_date); } else { $t_obj->modified_date($self->mapping_session_date); } # create a stable_id_event entry (not for exons) unless ($type eq 'exon') { my $key = join("\t", $s_obj->stable_id, $s_obj->version, $t_obj->stable_id, $t_obj->version, $self->mapping_session_id, $type, $scores_by_target{$tid} ); $self->add_stable_id_event('new', $key); } # add to debug hash push @{ $debug_mappings{$type} }, [ $sid, $tid, $t_obj->stable_id ]; # stats if ($s_obj->is_known) { $stats{'mapped_known'}++; } else { $stats{'mapped_novel'}++; } # no mapping was found, assign a new stable ID } else { $t_obj->stable_id($new_stable_id); $t_obj->version(1); $t_obj->created_date($self->mapping_session_date); $t_obj->modified_date($self->mapping_session_date); # create a stable_id_event entry (not for exons) unless ($type eq 'exon') { my $key = join("\t", '\N', 0, $t_obj->stable_id, $t_obj->version, $self->mapping_session_id, $type, 0 ); $self->add_stable_id_event('new', $key); } # increment the stable Id (to be assigned to the next unmapped object) $new_stable_id = $self->stable_id_generator->increment_stable_id( $new_stable_id); # stats $stats{'new'}++; } } # # deletion events for lost sources # my $fh; if ($type eq 'gene' or $type eq 'transcript') { $fh = $self->get_filehandle("${type}s_lost.txt", 'debug'); } foreach my $sid (keys %all_sources) { my $s_obj = $all_sources{$sid}; # no mapping exists, add deletion event unless ($sources_mapped{$sid}) { unless ($type eq 'exon') { my $key = join("\t", $s_obj->stable_id, $s_obj->version, '\N', 0, $self->mapping_session_id, $type, 0 ); $self->add_stable_id_event('new', $key); } # stats my $status; if ($s_obj->is_known) { $stats{'lost_known'}++; $status = 'known'; } else { $stats{'lost_novel'}++; $status = 'novel'; } # log lost genes and transcripts (for debug purposes) # # The Java app did this with a separate method # (StableIdMapper.dumpLostGeneAndTranscripts()) which also claims to log # losses due to merge. Since at that point this data isn't available yet # the logging can be done much more efficient here if ($type eq 'gene' or $type eq 'transcript') { print $fh $s_obj->stable_id, "\t$status\n"; } } } close($fh) if (defined($fh)); # # write stable IDs to file # $self->write_stable_ids_to_file($type, \%all_targets); # also generate and write stats to file $self->generate_mapping_stats($type, \%stats); $self->logger->info("Done.\n\n"); } sub generate_similarity_events { my $self = shift; my $mappings = shift; my $scores = shift; my $type = shift; # argument checks unless ($mappings and $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); } unless ($scores and $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } throw("Need a type (gene|transcript|translation).") unless ($type); my $mapped; # # add similarities for mapped entries # foreach my $e (@{ $mappings->get_all_Entries }) { # create lookup hash for mapped sources and targets; we'll need this later $mapped->{'source'}->{$e->source} = 1; $mapped->{'target'}->{$e->target} = 1; # loop over all other entries which contain either source or target; # add similarity if score is within 2% of this entry (which is the top # scorer) my @others = @{ $scores->get_Entries_for_target($e->target) }; push @others, @{ $scores->get_Entries_for_source($e->source) }; while (my $e2 = shift(@others)) { # skip self next if (($e->source eq $e2->source) and ($e->target eq $e2->target)); if ($e2->score > ($e->score * 0.98)) { my $s_obj = $self->cache->get_by_key("${type}s_by_id", 'source', $e2->source); my $t_obj = $self->cache->get_by_key("${type}s_by_id", 'target', $e2->target); my $key = join("\t", $s_obj->stable_id, $s_obj->version, $t_obj->stable_id, $t_obj->version, $self->mapping_session_id, $type, $e2->score ); $self->add_stable_id_event('similarity', $key); } # [todo] add overlap hack here? (see Java code) # probably better solution: let synteny rescoring affect this decision } } # # similarities for other entries # foreach my $dbtype (keys %$mapped) { # note: $dbtype will be either 'source' or 'target' my $m1 = "get_all_${dbtype}s"; my $m2 = "get_Entries_for_${dbtype}"; foreach my $id (@{ $scores->$m1 }) { # skip if this is a mapped source/target next if ($mapped->{$dbtype}->{$id}); my @entries = sort { $b->score <=> $a->score } @{ $scores->$m2($id) }; next unless (@entries); # skip if top score < 0.7 my $top_score = $entries[0]->score; next if ($top_score < 0.7); # add similarities for all entries within 5% of top scorer while (my $e = shift(@entries)) { if ($e->score > ($top_score * 0.95)) { my $s_obj = $self->cache->get_by_key("${type}s_by_id", 'source', $e->source); my $t_obj = $self->cache->get_by_key("${type}s_by_id", 'target', $e->target); my $key = join("\t", $s_obj->stable_id, $s_obj->version, $t_obj->stable_id, $t_obj->version, $self->mapping_session_id, $type, $e->score ); $self->add_stable_id_event('similarity', $key); } } } } } sub filter_same_gene_transcript_similarities { my $self = shift; my $transcript_scores = shift; # argument checks unless ($transcript_scores and $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of transcripts.'); } # create a new matrix for the filtered entries my $filtered_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( -DUMP_PATH => path_append($self->conf->param('basedir'), 'matrix'), -CACHE_FILE => 'filtered_transcript_scores.ser', ); # lookup hash for all target transcripts my %all_targets = map { $_->stable_id => 1 } values %{ $self->cache->get_by_name("transcripts_by_id", 'target') }; my $i = 0; foreach my $e (@{ $transcript_scores->get_all_Entries }) { my $s_tr = $self->cache->get_by_key('transcripts_by_id', 'source', $e->source); my $s_gene = $self->cache->get_by_key('genes_by_transcript_id', 'source', $e->source); my $t_gene = $self->cache->get_by_key('genes_by_transcript_id', 'target', $e->target); # workaround for caching issue: only gene objects in 'genes_by_id' cache # have a stable ID assigned #$t_gene = $self->cache->get_by_key('genes_by_id', 'target', $t_gene->id); #$self->logger->debug("xxx ".join(":", $s_tr->stable_id, $s_gene->stable_id, # $t_gene->stable_id)."\n"); # skip if source and target transcript are in same gene, BUT keep events for # deleted transcripts if (($s_gene->stable_id eq $t_gene->stable_id) and $all_targets{$s_tr->stable_id}) { $i++; next; } $filtered_scores->add_Entry($e); } $self->logger->debug("Skipped $i same gene transcript mappings.\n"); return $filtered_scores; } sub generate_translation_similarity_events { my $self = shift; my $mappings = shift; my $transcript_scores = shift; # argument checks unless ($mappings and $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); } unless ($transcript_scores and $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } # create a fake translation scoring matrix my $translation_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( -DUMP_PATH => path_append($self->conf->param('basedir'), 'matrix'), -CACHE_FILE => 'translation_scores.ser', ); foreach my $e (@{ $transcript_scores->get_all_Entries }) { my $s_tl = $self->cache->get_by_key('transcripts_by_id', 'source', $e->source)->translation; my $t_tl = $self->cache->get_by_key('transcripts_by_id', 'target', $e->target)->translation; # add an entry to the translation scoring matrix using the score of the # corresponding transcripts if ($s_tl and $t_tl) { $translation_scores->add_score($s_tl->id, $t_tl->id, $e->score); } } # now generate similarity events using this fake scoring matrix $self->generate_similarity_events($mappings, $translation_scores, 'translation'); } sub write_stable_ids_to_file { my $self = shift; my $type = shift; my $all_targets = shift; $self->logger->info("Writing ${type} stable IDs to file...\n"); my $fh = $self->get_filehandle("${type}_stable_id.txt", 'tables'); my @sorted_targets = map { $all_targets->{$_} } sort { $a <=> $b } keys %$all_targets; foreach my $obj (@sorted_targets) { # check for missing created and modified dates my $created_date = $obj->created_date; unless ($created_date) { #$self->logger->debug("Missing created_date for target ". # $obj->to_string."\n", 1); $created_date = $self->mapping_session_date; } my $modified_date = $obj->modified_date; unless ($modified_date) { #$self->logger->debug("Missing modified_date for target ". # $obj->to_string."\n", 1); $modified_date = $self->mapping_session_date; } my $row = join("\t", $obj->id, $obj->stable_id, $obj->version, strftime("%Y-%m-%d %T", localtime($created_date)), strftime("%Y-%m-%d %T", localtime($modified_date)), ); print $fh "$row\n"; } close($fh); $self->logger->info("Done writing ".scalar(@sorted_targets)." entries.\n\n"); } sub generate_mapping_stats { my $self = shift; my $type = shift; my $stats = shift; my $result = ucfirst($type)." mapping results:\n\n"; my $fmt1 = "%-10s%-10s%-10s%-10s\n"; my $fmt2 = "%-10s%6.0f %6.0f %4.2f%%\n"; $result .= sprintf($fmt1, qw(TYPE MAPPED LOST PERCENTAGE)); $result .= ('-'x40)."\n"; my $mapped_total = $stats->{'mapped_known'} + $stats->{'mapped_novel'}; my $lost_total = $stats->{'lost_known'} + $stats->{'lost_novel'}; my $known_total = $stats->{'mapped_known'} + $stats->{'lost_known'}; my $novel_total = $stats->{'mapped_novel'} + $stats->{'lost_novel'}; # no split into known and novel for exons unless ( $type eq 'exon' ) { $result .= sprintf( $fmt2, 'known', $stats->{'mapped_known'}, $stats->{'lost_known'}, ($known_total ? $stats->{'mapped_known'}/$known_total*100 : 0) ); $result .= sprintf( $fmt2, 'novel', $stats->{'mapped_novel'}, $stats->{'lost_novel'}, ($novel_total ? $stats->{'mapped_novel'}/$novel_total*100 : 0) ); } ## end unless ( $type eq 'exon' ) $result .= sprintf($fmt2, 'total', $mapped_total, $lost_total, $mapped_total/($known_total + $novel_total)*100); # log result $self->logger->info($result."\n"); # write result to file my $fh = $self->get_filehandle("${type}_mapping_stats.txt", 'stats'); print $fh $result; close($fh); } sub dump_debug_mappings { my $self = shift; foreach my $type (keys %debug_mappings) { $self->logger->debug("Writing $type mappings to debug/${type}_mappings.txt...\n"); my $fh = $self->get_filehandle("${type}_mappings.txt", 'debug'); foreach my $row (@{ $debug_mappings{$type} }) { print $fh join("\t", @$row); print $fh "\n"; } close($fh); $self->logger->debug("Done.\n"); } } sub write_stable_id_events { my $self = shift; my $event_type = shift; throw("Need an event type (new|similarity).") unless ($event_type); $self->logger->debug("Writing $event_type stable_id_events to file...\n"); my $fh = $self->get_filehandle("stable_id_event_${event_type}.txt", 'tables'); my $i = 0; foreach my $event (@{ $self->get_all_stable_id_events($event_type) }) { print $fh "$event\n"; $i++; } close($fh); $self->logger->debug("Done writing $i entries.\n"); } sub add_stable_id_event { my ($self, $type, $event) = @_; # argument check throw("Need an event type (new|similarity).") unless ($type); $self->{'stable_id_events'}->{$type}->{$event} = 1; } sub get_all_stable_id_events { my ($self, $type) = @_; # argument check throw("Need an event type (new|similarity).") unless ($type); return [ keys %{ $self->{'stable_id_events'}->{$type} } ]; } sub mapping_session_id { my $self = shift; $self->{'_mapping_session_id'} = shift if (@_); return $self->{'_mapping_session_id'}; } sub mapping_session_date { my $self = shift; $self->{'_mapping_session_date'} = shift if (@_); return $self->{'_mapping_session_date'}; } sub mapping_session_date_fmt { my $self = shift; $self->{'_mapping_session_date_fmt'} = shift if (@_); return $self->{'_mapping_session_date_fmt'}; } sub stable_id_generator { my $self = shift; $self->{'_stable_id_generator'} = shift if (@_); return $self->{'_stable_id_generator'}; } 1;