Raw content of Bio::EnsEMBL::IdMapping::GeneScoreBuilder =head1 LICENSE Copyright (c) 1999-2009 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see /info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <ensembl-dev@ebi.ac.uk>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =cut =head1 NAME =head1 SYNOPSIS =head1 DESCRIPTION Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from Java application. =head1 METHODS =cut package Bio::EnsEMBL::IdMapping::GeneScoreBuilder; use strict; use warnings; no warnings 'uninitialized'; use Bio::EnsEMBL::IdMapping::ScoreBuilder; our @ISA = qw(Bio::EnsEMBL::IdMapping::ScoreBuilder); use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; sub score_genes { my $self = shift; my $transcript_matrix = shift; unless ($transcript_matrix and $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } $self->logger->info("-- Scoring genes...\n\n", 0, 'stamped'); # build scores based on transcript scores my $matrix = $self->scores_from_transcript_scores($transcript_matrix); # debug logging if ($self->logger->loglevel eq 'debug') { $matrix->log('gene', $self->conf->param('basedir')); } # log stats of combined matrix my $fmt = "%-40s%10.0f\n"; $self->logger->info("Scoring matrix:\n"); $self->logger->info(sprintf($fmt, "Total source genes:", $self->cache->get_count_by_name('genes_by_id', 'source')), 1); $self->logger->info(sprintf($fmt, "Total target genes:", $self->cache->get_count_by_name('genes_by_id', 'target')), 1); $self->log_matrix_stats($matrix); $self->logger->info("\nDone with gene scoring.\n\n"); return $matrix; } sub scores_from_transcript_scores { my $self = shift; my $transcript_matrix = shift; unless ($transcript_matrix and $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } my $dump_path = path_append($self->conf->param('basedir'), 'matrix'); my $matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( -DUMP_PATH => $dump_path, -CACHE_FILE => 'gene_matrix.ser', ); my $gene_cache = $matrix->cache_file; if (-s $gene_cache) { # read from file $self->logger->info("Reading gene scoring matrix from file...\n", 0, 'stamped'); $self->logger->debug("Cache file $gene_cache.\n", 1); $matrix->read_from_file; $self->logger->info("Done.\n\n", 0, 'stamped'); } else { # build scoring matrix $self->logger->info("No gene scoring matrix found. Will build new one.\n"); $self->logger->info("Gene scoring...\n", 0, 'stamped'); $matrix = $self->build_scores($matrix, $transcript_matrix); $self->logger->info("Done.\n\n", 0, 'stamped'); # write scoring matrix to file $matrix->write_to_file; } return $matrix; } sub build_scores { my $self = shift; my $matrix = shift; my $transcript_matrix = shift; unless ($matrix and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } unless ($transcript_matrix and $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } # first find out which source and target genes have scoring transcripts and # build a "flag" matrix for these genes (all scores are 1) $self->flag_matrix_from_transcript_scores($matrix, $transcript_matrix); # now calculate the actual scores for the genes in the flag matrix my $final_matrix = $self->score_matrix_from_flag_matrix($matrix, $transcript_matrix); return $final_matrix; } sub flag_matrix_from_transcript_scores { my $self = shift; my $matrix = shift; my $transcript_matrix = shift; unless ($matrix and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } unless ($transcript_matrix and $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } # initialise progress logger my $i; my $num_entries = $transcript_matrix->get_entry_count; my $progress_id = $self->logger->init_progress($num_entries, 100); $self->logger->info("Creating flag matrix...\n", 1); # for every transcript scoring matrix entry, make an entry in the gene flag # matrix. foreach my $entry (@{ $transcript_matrix->get_all_Entries }) { $self->logger->log_progress($progress_id, ++$i, 1); my $source_gene = $self->cache->get_by_key('genes_by_transcript_id', 'source', $entry->source); my $target_gene = $self->cache->get_by_key('genes_by_transcript_id', 'target', $entry->target); $matrix->add_score($source_gene->id, $target_gene->id, 1); } $self->logger->info("\n"); return $matrix; } sub score_matrix_from_flag_matrix { my $self = shift; my $flag_matrix = shift; my $transcript_matrix = shift; my $simple_scoring = shift; unless ($flag_matrix and $flag_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } unless ($transcript_matrix and $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } # create a new scoring matrix which will replace the flag matrix my $matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( -DUMP_PATH => $flag_matrix->dump_path, -CACHE_FILE => $flag_matrix->cache_file_name, ); # initialise progress logger my $i; my $num_entries = $flag_matrix->get_entry_count; my $progress_id = $self->logger->init_progress($num_entries, 100); $self->logger->info("Creating score matrix from flag matrix...\n", 1); # loop over flag matrix and do proper scoring for each entry foreach my $entry (@{ $flag_matrix->get_all_Entries }) { $self->logger->log_progress($progress_id, ++$i, 1); my $score = 0; my $source_gene = $self->cache->get_by_key('genes_by_id', 'source', $entry->source); my $target_gene = $self->cache->get_by_key('genes_by_id', 'target', $entry->target); if ($simple_scoring) { # simple scoring (used for rescoring purposes) $score = $self->simple_gene_gene_score($source_gene, $target_gene, $transcript_matrix); } else { # full scoring $score = $self->complex_gene_gene_score($source_gene, $target_gene, $transcript_matrix); } $matrix->add_score($entry->source, $entry->target, $score); } $self->logger->info("\n"); return $matrix; } sub complex_gene_gene_score { my $self = shift; my $source_gene = shift; my $target_gene = shift; my $transcript_matrix = shift; my $source_gene_score = 0; my $target_gene_score = 0; my $source_gene_accum_length = 0; # sum of all transcript lengths my $target_gene_accum_length = 0; # sum of all transcript lengths # We are only interested in scoring with transcripts that are in the target # gene. The scored mapping matrix may contain scores for transcripts that # aren't in this gene so create a hash of the target genes's transcripts my %target_transcripts = map { $_->id => 1 } @{ $target_gene->get_all_Transcripts }; # loop over source transcripts foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) { # now loop over target transcripts and find the highest scoring target # transcript belonging to the target gene my $max_source_score = -1; foreach my $target_transcript_id (@{ $transcript_matrix->get_targets_for_source($source_transcript->id) }) { next unless ($target_transcripts{$target_transcript_id}); my $score = $transcript_matrix->get_score( $source_transcript->id, $target_transcript_id); $max_source_score = $score if ($score > $max_source_score); } if ($max_source_score > 0) { $source_gene_score += ($max_source_score * $source_transcript->length); } $source_gene_accum_length += $source_transcript->length; } # now do the same for target genes my %source_transcripts = map { $_->id => 1 } @{ $source_gene->get_all_Transcripts }; # loop over target transcripts foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) { # now loop over source transcripts and find the highest scoring source # transcript belonging to the source gene my $max_target_score = -1; foreach my $source_transcript_id (@{ $transcript_matrix->get_sources_for_target($target_transcript->id) }) { next unless ($source_transcripts{$source_transcript_id}); my $score = $transcript_matrix->get_score( $source_transcript_id, $target_transcript->id); $max_target_score = $score if ($score > $max_target_score); } if ($max_target_score > 0) { $target_gene_score += ($max_target_score * $target_transcript->length); } $target_gene_accum_length += $target_transcript->length; } # calculate overall score for this gene my $gene_score = 0; if (($source_gene_accum_length + $target_gene_accum_length) > 0) { $gene_score = ($source_gene_score + $target_gene_score) / ($source_gene_accum_length + $target_gene_accum_length); } else { $self->logger->warning("Combined transcript length of source (".$source_gene->id.") and target (".$target_gene->id.") gene is zero!\n", 1); } if ($gene_score > 1) { $self->logger->warning("Illegal gene score: $gene_score (". join("|", $source_gene_score, $target_gene_score, $source_gene_accum_length, $target_gene_accum_length).")\n", 1); } return $gene_score; } # # Simplified scoring for genes. Score is best scoring transcript pair. # This is used when the more elaborate gene representing score does not # distinguish very well. # sub simple_gene_gene_score { my $self = shift; my $source_gene = shift; my $target_gene = shift; my $transcript_matrix = shift; my $gene_score = 0; foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) { foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) { my $score = $transcript_matrix->get_score($source_transcript->id, $target_transcript->id); $gene_score = $score if ($score > $gene_score); } } return $gene_score; } sub simple_gene_rescore { my $self = shift; my $gene_matrix = shift; my $transcript_matrix = shift; $gene_matrix = $self->score_matrix_from_flag_matrix($gene_matrix, $transcript_matrix, 1); } # # penalise scores between genes with different biotypes. # entries are modified in place # sub biotype_gene_rescore { my $self = shift; my $matrix = shift; unless ($matrix and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } my $i = 0; foreach my $entry (@{ $matrix->get_all_Entries }) { my $source_gene = $self->cache->get_by_key('genes_by_id', 'source', $entry->source); my $target_gene = $self->cache->get_by_key('genes_by_id', 'target', $entry->target); if ($source_gene->biotype ne $target_gene->biotype) { #$self->logger->debug("biotype ".$entry->to_string."\n"); $matrix->set_score($entry->source, $entry->target, ($entry->score * 0.8)); $i++; } } $self->logger->debug("Scored genes with biotype mismatch: $i\n", 1); } 1;