Raw content of Bio::EnsEMBL::IdMapping::ResultAnalyser
=head1 LICENSE
Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at .
Questions may also be sent to the Ensembl help desk at
.
=cut
=head1 NAME
Bio::EnsEMBL::IdMapping::ResultAnalyser - analyse stable Id mapping results
=head1 SYNOPSIS
# get a result analyser
my $analyser = Bio::EnsEMBL::IdMapping::ResultAnalyser->new(
-LOGGER => $logger,
-CONF => $conf,
-CACHE => $cache
);
# analyse results
$analyser->analyse( $gene_mappings,
$stable_id_mapper->get_all_stable_id_events('similarity') );
# write results to file
$analyser->write_results_to_file;
# create click lists
$analyser->create_clicklist;
# summary email
$analyser->create_summary_email;
=head1 DESCRIPTION
This is a utility module which analyses the stable Id mapping results
by providing various sorts of mapping statistics. It also creates
clicklists and a summary email.
=head1 METHODS
analyse
analyse_db
classify_source_genes_by_type
classify_genes_by_mapping_simple
classify_genes_by_mapping
add
get
get_all_by_subclass
get_all_by_class
get_count_by_subclass
get_count_by_class
get_all_classes
class_key
write_results_to_file
create_clicklist
create_summary_email
read_from_file
=cut
package Bio::EnsEMBL::IdMapping::ResultAnalyser;
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::IdMapping::BaseObject;
our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
=head2 analyse
Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $gene_mappings - the gene
mappings to analyse
Arg[2] : Arrayref of Strings - similarity events
Example : $analyser->analyse($gene_mappings,
$stable_id_mapper->get_all_stable_id_events('similarity'));
Description : Analyses the results of a stable Id mapping run.
Return type : none
Exceptions : thrown on wrong or missing arguments
Caller : general
Status : At Risk
: under development
=cut
sub analyse {
my $self = shift;
my $gene_mappings = shift;
my $similarity_events = shift;
# argument check
unless ($gene_mappings and
$gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
throw("Need a Bio::EnsEMBL::IdMapping::MappingList of genes.");
}
unless ($similarity_events and ref($similarity_events) eq 'ARRAY') {
throw("Need a list of similarity events.");
}
# classify source genes by type (status-logic_name-biotype)
$self->classify_source_genes_by_type;
# classify source genes by mapping status
$self->classify_genes_by_mapping($gene_mappings, $similarity_events);
}
=head2 classify_source_genes_by_type
Example : $analyser->classify_source_genes_by_type;
Description : Classifies source genes by type and adds them to the internal
datastructure. For the format of the classification string see
class_key().
Return type : none
Exceptions : none
Caller : internal
Status : At Risk
: under development
=cut
sub classify_source_genes_by_type {
my $self = shift;
foreach my $s_gene (values %{ $self->cache->get_by_name('genes_by_id', 'source') }) {
$self->add('source', $self->class_key($s_gene), 'all', $s_gene->stable_id);
}
}
=head2 classify_genes_by_mapping_simple
Arg[1] : Bio::EnsEMBL::IdMapping::MapppingList $gene_mappings - gene
mappings to classify
Example : $analyser->classify_genes_by_mapping_simple;
Description : Classifies target genes by mapping ('mapped' or 'unmapped').
Return type : none
Exceptions : thrown on wrong or missing argument
Caller : This method is not in use at the momen.
Status : At Risk
: under development
=cut
sub classify_genes_by_mapping_simple {
my $self = shift;
my $gene_mappings = shift;
# argument check
unless ($gene_mappings and
$gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
throw("Need a Bio::EnsEMBL::IdMapping::MappingList of genes.");
}
my %result = ();
# firrst, create a lookup hash of source genes by target internal ID
my %source_genes_by_target = ();
foreach my $e (@{ $gene_mappings->get_all_Entries }) {
my $s_gene = $self->cache->get_by_key('genes_by_id', 'source', $e->source);
my $t_gene = $self->cache->get_by_key('genes_by_id', 'target', $e->target);
$source_genes_by_target{$t_gene->id} = $s_gene;
}
# now loop over target genes
foreach my $t_gene (values %{ $self->cache->get_by_name('genes_by_id', 'target') }) {
# check if target gene has all required properties set
unless ($t_gene->status and $t_gene->logic_name and $t_gene->biotype) {
$self->logger->warning("Missing data for target gene: ".
$t_gene->to_string."\n", 1);
}
my $class = $self->class_key($t_gene);
# classify as '1' if mapped (using source gene's stable ID), otherwise '0'
if (my $s_gene = $source_genes_by_target{$t_gene->id}) {
$self->add('target', $class, 'mapped', $s_gene->stable_id);
} else {
$self->add('target', $class, 'unmapped', $t_gene->stable_id);
}
}
}
=head2 classify_genes_by_mapping
Arg[1] : Bio::EnsEMBL::IdMapping::MapppingList $gene_mappings - gene
mappings to classify
Arg[2] : Arrayref of Strings - similarity events
Example : $analyser->classify_genes_by_mapping;
Description : Classifies genes by mapping. Status is
'mapped' => stable Id was mapped
'lost_similar' => stable Id not mapped, but there is a
similarity entry for the source Id
'lost_definite' => not mapped and no similarity
Return type : none
Exceptions : thrown on wrong or missing argument
Caller : This method is not in use at the momen.
Status : At Risk
: under development
=cut
sub classify_genes_by_mapping {
my $self = shift;
my $gene_mappings = shift;
my $similarity_events = shift;
# argument check
unless ($gene_mappings and
$gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
throw("Need a Bio::EnsEMBL::IdMapping::MappingList of genes.");
}
unless ($similarity_events and ref($similarity_events) eq 'ARRAY') {
throw("Need a list of similarity events.");
}
# mapped genes
foreach my $e (@{ $gene_mappings->get_all_Entries }) {
my $s_gene = $self->cache->get_by_key('genes_by_id', 'source', $e->source);
$self->add('source', $self->class_key($s_gene), 'mapped',
$s_gene->stable_id);
}
# lookup hash for similarities
my %similar = ();
foreach my $event (@{ $similarity_events }) {
my ($stable_id) = split("\t", $event);
$similar{$stable_id} = 1;
}
# deleted genes
foreach my $s_gene (values %{ $self->cache->get_by_name('genes_by_id', 'source') }) {
my $stable_id = $s_gene->stable_id;
my $class = $self->class_key($s_gene);
unless ($self->get('source', $class, 'mapped', $stable_id)) {
# sub-classify as 'lost_similar' or 'lost_definite'
if ($similar{$stable_id}) {
$self->add('source', $class, 'lost_similar', $stable_id);
} else {
$self->add('source', $class, 'lost_definite', $stable_id);
}
}
}
}
=head2 add
Arg[1] : String $dbtype - db type ('source' or 'target')
Arg[2] : String $class - key identifying a gene type (see class_key())
Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost')
Arg[4] : String $stable_id - gene stable Id
Arg[5] : String $val - value (usually 0 or 1)
Example : $analyser->add('source', 'KNOWN-ensembl-protein_coding',
'mapped', 'ENSG00002342', 1);
Description : Add a stable Id / property pair to a name/dbtype lookup hash.
The datastructure is a bit of a bloat, but is general enough to
be used as a lookup hash and to generate statistics (counts by
type) and debug lists (dump by type).
Return type : String - the added value
Exceptions : none
Caller : internal
Status : At Risk
: under development
=cut
sub add {
my ($self, $dbtype, $class, $subclass, $stable_id, $val) = @_;
# private method, so no argument check done for performance reasons
# default to a value of '1'
$val = 1 unless (defined($val));
$self->{$dbtype}->{$class}->{$subclass}->{$stable_id} = $val;
}
=head2 get
Arg[1] : String $dbtype - db type ('source' or 'target')
Arg[2] : String $class - key identifying a gene type (see class_key())
Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost')
Arg[4] : String $stable_id - gene stable Id
Example : my $mapping_status = $analyser->get('source',
'KNOWN-ensembl-protein_coding', 'mapped', 'ENSG00002342');
Description : Gets a stable Id mapping status from the internal datastructure.
Return type : String
Exceptions : none
Caller : internal
Status : At Risk
: under development
=cut
sub get {
my ($self, $dbtype, $class, $subclass, $stable_id) = @_;
# private method, so no argument check done for performance reasons
return $self->{$dbtype}->{$class}->{$subclass}->{$stable_id};
}
=head2 get_all_by_subclass
Arg[1] : String $dbtype - db type ('source' or 'target')
Arg[2] : String $class - key identifying a gene type (see class_key())
Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost')
Example : my @mapped_stable_ids = $analyser->get_all_by_subclass('source',
'KNOWN-ensembl-protein_coding', 'mapped');
Description : Gets a list of stable Id for a given subclass.
Return type : Arrayref of String (stable Ids)
Exceptions : thrown on missing arguments
Caller : internal
Status : At Risk
: under development
=cut
sub get_all_by_subclass {
my ($self, $dbtype, $class, $subclass) = @_;
# argument check
throw("Need a dbtype (source|target).") unless ($dbtype);
throw("Need a class.") unless ($class);
throw("Need a subclass.") unless ($subclass);
return [ keys %{ $self->{$dbtype}->{$class}->{$subclass} || {} } ];
}
=head2 get_all_by_class
Arg[1] : String $dbtype - db type ('source' or 'target')
Arg[2] : String $class - key identifying a gene type (see class_key())
Example : my @stable_ids = $analyser->get_all_by_class('source',
'KNOWN-ensembl-protein_coding');
Description : Gets a list of stable Id for a given class.
Return type : Arrayref of String (stable Ids)
Exceptions : thrown on missing arguments
Caller : internal
Status : At Risk
: under development
=cut
sub get_all_by_class {
my ($self, $dbtype, $class) = @_;
# argument check
throw("Need a dbtype (source|target).") unless ($dbtype);
throw("Need a class.") unless ($class);
my %merged = ();
foreach my $subclass (keys %{ $self->{$dbtype}->{$class} || {} }) {
while (my ($key, $val) = each(%{ $self->{$dbtype}->{$class}->{$subclass} })) {
$merged{$key} = $val;
}
}
return [ keys %merged ];
}
=head2 get_count_by_subclass
Arg[1] : String $dbtype - db type ('source' or 'target')
Arg[2] : String $class - key identifying a gene type (see class_key())
Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost')
Example : my $num_mapped = $analyser->get_count_by_subclass('source',
'KNOWN-ensembl-protein_coding', 'mapped');
Description : Gets the number of stable Ids for a given subclass.
Return type : Int
Exceptions : thrown on missing arguments
Caller : internal
Status : At Risk
: under development
=cut
sub get_count_by_subclass {
my ($self, $dbtype, $class, $subclass) = @_;
# argument check
throw("Need a dbtype (source|target).") unless ($dbtype);
throw("Need a class.") unless ($class);
throw("Need a subclass.") unless ($subclass);
return scalar(keys %{ $self->{$dbtype}->{$class}->{$subclass} || {} });
}
=head2 get_count_by_class
Arg[1] : String $dbtype - db type ('source' or 'target')
Arg[2] : String $class - key identifying a gene type (see class_key())
Example : my $num_mapped = $analyser->get_count_by_class('source',
'KNOWN-ensembl-protein_coding');
Description : Gets the number of stable Ids for a given class.
Return type : Int
Exceptions : thrown on missing arguments
Caller : internal
Status : At Risk
: under development
=cut
sub get_count_by_class {
my ($self, $dbtype, $class) = @_;
# argument check
throw("Need a dbtype (source|target).") unless ($dbtype);
throw("Need a class.") unless ($class);
return scalar(@{ $self->get_all_by_class($dbtype, $class) });
}
=head2 get_all_classes
Arg[1] : String $dbtype - db type ('source' or 'target')
Example : foreach my $class (@{ $analyser->get_all_classes('source') }) {
print "$class\n";
}
Description : Gets a list of classes in the ResultAnalyser.
Return type : Arrayref of String
Exceptions : thrown on missing argument
Caller : internal
Status : At Risk
: under development
=cut
sub get_all_classes {
my ($self, $dbtype) = @_;
# argument check
throw("Need a dbtype (source|target).") unless ($dbtype);
return [ sort keys %{ $self->{$dbtype} || {} } ];
}
=head2 class_key
Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - a gene object
Example : my $class = $analyser->class_key($gene);
Description : Generates a key identifying a gene class. This identifier is
composed from the gene's status, logic naame, and biotye.
Return type : String
Exceptions : none
Caller : internal
Status : At Risk
: under development
=cut
sub class_key {
my ($self, $gene) = @_;
return join('-', map { $gene->$_ } qw(status logic_name biotype));
}
=head2 write_results_to_file
Example : $analyser->write_results_to_file;
Description : Writes the results of the result analysis to a file. This is a
human-readable text detailing the mapping statistics.
Return type : none
Exceptions : none
Caller : general
Status : At Risk
: under development
=cut
sub write_results_to_file {
my $self = shift;
my $fh = $self->get_filehandle('gene_detailed_mapping_stats.txt', 'stats');
my $fmt1 = "%-60s%-16s%-16s%-16s\n";
my $fmt2 = "%-60s%5.0f (%7s) %5.0f (%7s) %5.0f (%7s)\n";
my $fmt3 = "%3.2f%%";
print $fh "Gene detailed mapping results:\n\n";
print $fh sprintf($fmt1, "Gene type", "mapped", "lost (similar)",
"lost (definite)");
print $fh ('-'x108), "\n";
foreach my $class (@{ $self->get_all_classes('source') }) {
next if ($class eq 'all');
my $total = $self->get_count_by_class('source', $class);
# avoid division by zero error
unless ($total) {
$self->logger->warning("No count found for $class.\n", 1);
next;
}
my $mapped = $self->get_count_by_subclass('source', $class, 'mapped');
my $similar = $self->get_count_by_subclass('source', $class,
'lost_similar');
my $lost = $self->get_count_by_subclass('source', $class, 'lost_definite');
print $fh sprintf($fmt2,
$class,
$mapped, sprintf($fmt3, $mapped/$total*100),
$similar, sprintf($fmt3, $similar/$total*100),
$lost, sprintf($fmt3, $lost/$total*100));
}
close($fh);
}
=head2 create_clicklist
Example : $analyser->create_clicklist;
Description : Writes an html file which contains a list of all lost genes,
with hyperlinks to the appropriate archive website. This is to
manually check lost genes.
Return type : none
Exceptions : none
Caller : general
Status : At Risk
: under development
=cut
sub create_clicklist {
my $self = shift;
my $fh = $self->get_filehandle('genes_lost.html', 'stats');
# start html output
print $fh qq(\n);
print $fh qq();
print $fh "\n";
print $fh "Lost genes ";
print $fh $self->conf->param('sourcedbname'), ' -> ',
$self->conf->param('targetdbname');
print $fh "\n";
print $fh "\n\n";
my $prefix = $self->conf->param('urlprefix');
unless ($prefix) {
$self->logger->warning("No urlprefix set, clicklists might not be useable.\n", 1);
}
my $navigation;
my $clicklist;
foreach my $class (@{ $self->get_all_classes('source') }) {
next if ($class eq 'all');
$navigation .= "$class ";
$clicklist .= "$class
\n";
foreach my $subclass (qw(lost_similar lost_definite)) {
# navigation
$navigation .= qq($subclass );
# clicklist
$clicklist .= "$subclass
\n";
foreach my $stable_id (@{ $self->get_all_by_subclass('source', $class, $subclass) }) {
$clicklist .= qq($stable_id
\n);
}
}
$navigation .= "
\n";
}
# print navigation and clicklist
print $fh "$navigation\n\n";
print $fh "$clicklist\n\n";
# html footer
print $fh "\n";
close($fh);
}
=head2 create_summary_email
Example : $analyser->create_summary_email;
Description : Writes a text file containing a summary of the mapping stats.
This will be emailed to the genebuilder for evaluation (you will
have to manually send the email, using the text in
summary_email.txt as the template).
Return type : none
Exceptions : none
Caller : general
Status : At Risk
: under development
=cut
sub create_summary_email {
my $self = shift;
my $fh = $self->get_filehandle('summary_email.txt');
#
# title
#
print $fh qq(Stable ID mapping results\n);
print $fh qq(=========================\n\n);
#
# timing
#
print $fh "Run at: ".localtime()."\n";
print $fh "Runtime: ";
print $fh $self->logger->runtime, "\n\n";
#
# parameters used for this run
#
print $fh $self->conf->list_param_values;
print $fh "\n";
#
# mapping stats
#
foreach my $type (qw(exon transcript translation gene gene_detailed)) {
my $filename = "${type}_mapping_stats.txt";
if ($self->file_exists($filename, 'stats')) {
print $fh $self->read_from_file($filename, 'stats');
print $fh "\n\n";
} else {
print $fh "No mapping stats found for $type.\n\n";
}
}
#
# db uploads
#
my @uploads = (
['stable_ids' => 'Stable IDs'],
['events' => 'Stable ID events and mapping session'],
['archive' => 'Gene and peptide archive'],
);
my $fmt1 = "%-40s%-20s\n";
print $fh qq(Data uploaded to db:\n);
print $fh qq(====================\n\n);
if ($self->conf->param('dry_run')) {
print $fh "None (dry run).\n";
} else {
foreach my $u (@uploads) {
my $uploaded = 'no';
$uploaded = 'yes' if ($self->conf->is_true("upload_".$u->[0]));
print $fh sprintf($fmt1, $u->[1], $uploaded);
}
}
print $fh "\n";
#
# stats and clicklist
#
my @output = (
['stats' => 'statistics (including clicklists of deleted IDs)'],
['debug' => 'detailed mapping output for debugging'],
['tables' => 'data files for db upload'],
);
my $fmt2 = "%-20s%-50s\n";
print $fh qq(\nOutput directories:\n);
print $fh qq(===================\n\n);
print $fh sprintf($fmt2, qw(DIRECTORY DESCRIPTION));
print $fh ('-'x72), "\n";
print $fh sprintf($fmt2, 'basedir', $self->conf->param('basedir'));
foreach my $o (@output) {
print $fh sprintf($fmt2, '$basedir/'.$o->[0], $o->[1]);
}
print $fh "\n";
#
# clicklist of first 10 deleted genes
#
print $fh qq(\nFirst 10 deleted known genes:\n);
print $fh qq(=============================\n\n);
my $in_fh = $self->get_filehandle('genes_lost.txt', 'debug', '<');
my $prefix = $self->conf->param('urlprefix');
my $i;
while (<$in_fh>) {
last if (++$i > 10);
chomp;
my ($stable_id, $type) = split(/\s+/);
next unless ($type eq 'known');
print $fh sprintf($fmt2, $stable_id, "${prefix}$stable_id");
}
close($in_fh);
close($fh);
}
=head2 read_from_file
Arg[1] : String $filename - name of file to read
Arg[2] : (optional) String $append - directory name to append to basedir
Example : my $stats_text = $analyser->read_from_file('gene_mapping_stats',
'stats');
Description : Reads mapping stats from a file.
Return type : String
Exceptions : none
Caller : internal
Status : At Risk
: under development
=cut
sub read_from_file {
my $self = shift;
my $filename = shift;
my $append = shift;
my $in_fh = $self->get_filehandle($filename, $append, '<');
my $txt;
while (<$in_fh>) {
$txt .= $_;
}
return $txt;
}
1;