Raw content of Bio::EnsEMBL::IdMapping::SyntenyFramework =head1 LICENSE Copyright (c) 1999-2009 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see /info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <ensembl-dev@ebi.ac.uk>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =cut =head1 NAME Bio::EnsEMBL::IdMapping::SyntenyFramework - framework representing syntenic regions across the genome =head1 SYNOPSIS # build the SyntenyFramework from unambiguous gene mappings my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( -DUMP_PATH => $dump_path, -CACHE_FILE => 'synteny_framework.ser', -LOGGER => $self->logger, -CONF => $self->conf, -CACHE => $self->cache, ); $sf->build_synteny($gene_mappings); # use it to rescore the genes $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores); =head1 DESCRIPTION The SyntenyFramework is a set of SyntenyRegions. These are pairs of locations very analoguous to the information in the assembly table (the locations dont have to be the same length though). They are built from genes that map uniquely between source and target. Once built, the SyntenyFramework is used to score source and target gene pairs to determine whether they are similar. This process is slow (it involves testing all gene pairs against all SyntenyRegions), this module therefor has built-in support to run the process in parallel via LSF. =head1 METHODS new build_synteny _by_overlap add_SyntenyRegion get_all_SyntenyRegions rescore_gene_matrix_lsf rescore_gene_matrix logger conf cache =cut package Bio::EnsEMBL::IdMapping::SyntenyFramework; use strict; use warnings; no warnings 'uninitialized'; use Bio::EnsEMBL::IdMapping::Serialisable; our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable); use Bio::EnsEMBL::Utils::Argument qw(rearrange); use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); use Bio::EnsEMBL::IdMapping::SyntenyRegion; use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; use FindBin qw($Bin); FindBin->again; =head2 new Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object Arg [DUMP_PATH] : String - path for object serialisation Arg [CACHE_FILE] : String - filename of serialised object Example : my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( -DUMP_PATH => $dump_path, -CACHE_FILE => 'synteny_framework.ser', -LOGGER => $self->logger, -CONF => $self->conf, -CACHE => $self->cache, ); Description : Constructor. Return type : Bio::EnsEMBL::IdMapping::SyntenyFramework Exceptions : thrown on wrong or missing arguments Caller : InternalIdMapper plugins Status : At Risk : under development =cut sub new { my $caller = shift; my $class = ref($caller) || $caller; my $self = $class->SUPER::new(@_); my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_); unless ($logger and ref($logger) and $logger->isa('Bio::EnsEMBL::Utils::Logger')) { throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging."); } unless ($conf and ref($conf) and $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) { throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object."); } unless ($cache and ref($cache) and $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) { throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object."); } # initialise $self->logger($logger); $self->conf($conf); $self->cache($cache); $self->{'cache'} = []; return $self; } =head2 build_synteny Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $mappings - gene mappings to build the SyntenyFramework from Example : $synteny_framework->build_synteny($gene_mappings); Description : Builds the SyntenyFramework from unambiguous gene mappings. SyntenyRegions are allowed to overlap. At most two overlapping SyntenyRegions are merged (otherwise we'd get too large SyntenyRegions with little information content). Return type : none Exceptions : thrown on wrong or missing argument Caller : InternalIdMapper plugins Status : At Risk : under development =cut sub build_synteny { my $self = shift; my $mappings = shift; unless ($mappings and $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); } # create a synteny region for each mapping my @synteny_regions = (); foreach my $entry (@{ $mappings->get_all_Entries }) { my $source_gene = $self->cache->get_by_key('genes_by_id', 'source', $entry->source); my $target_gene = $self->cache->get_by_key('genes_by_id', 'target', $entry->target); my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([ $source_gene->start, $source_gene->end, $source_gene->strand, $source_gene->seq_region_name, $target_gene->start, $target_gene->end, $target_gene->strand, $target_gene->seq_region_name, $entry->score, ]); push @synteny_regions, $sr; } unless (@synteny_regions) { $self->logger->warning("No synteny regions could be identified.\n"); return; } # sort synteny regions #my @sorted = sort _by_overlap @synteny_regions; my @sorted = reverse sort { $a->source_seq_region_name cmp $b->source_seq_region_name || $a->source_start <=> $b->source_start || $a->source_end <=> $b->source_end } @synteny_regions; $self->logger->info("SyntenyRegions before merging: ".scalar(@sorted)."\n"); # now create merged regions from overlapping syntenies, but only merge a # maximum of 2 regions (otherwise you end up with large synteny blocks which # won't contain much information in this context) my $last_merged = 0; my $last_sr = shift(@sorted); while (my $sr = shift(@sorted)) { #$self->logger->debug("this ".$sr->to_string."\n"); my $merged_sr = $last_sr->merge($sr); if (! $merged_sr) { unless ($last_merged) { $self->add_SyntenyRegion($last_sr->stretch(2)); #$self->logger->debug("nnn ".$last_sr->to_string."\n"); } $last_merged = 0; } else { $self->add_SyntenyRegion($merged_sr->stretch(2)); #$self->logger->debug("mmm ".$merged_sr->to_string."\n"); $last_merged = 1; } $last_sr = $sr; } # deal with last synteny region in @sorted unless ($last_merged) { $self->add_SyntenyRegion($last_sr->stretch(2)); $last_merged = 0; } #foreach my $sr (@{ $self->get_all_SyntenyRegions }) { # $self->logger->debug("SRs ".$sr->to_string."\n"); #} $self->logger->info("SyntenyRegions after merging: ".scalar(@{ $self->get_all_SyntenyRegions })."\n"); } # # sort SyntenyRegions by overlap # sub _by_overlap { # first sort by seq_region my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name); return $retval if ($retval); # then sort by overlap: # return -1 if $a is downstream, 1 if it's upstream, 0 if they overlap if ($a->source_end < $b->source_start) { return 1; } if ($a->source_start < $b->source_end) { return -1; } return 0; } =head2 add_SyntenyRegion Arg[1] : Bio::EnsEMBL::IdMaping::SyntenyRegion - SyntenyRegion to add Example : $synteny_framework->add_SyntenyRegion($synteny_region); Description : Adds a SyntenyRegion to the framework. For speed reasons (and since this is an internal method), no argument check is done. Return type : none Exceptions : none Caller : internal Status : At Risk : under development =cut sub add_SyntenyRegion { push @{ $_[0]->{'cache'} }, $_[1]; } =head2 get_all_SyntenyRegions Example : foreach my $sr (@{ $sf->get_all_SyntenyRegions }) { # do something with the SyntenyRegion } Description : Get a list of all SyntenyRegions in the framework. Return type : Arrayref of Bio::EnsEMBL::IdMapping::SyntenyRegion Exceptions : none Caller : general Status : At Risk : under development =cut sub get_all_SyntenyRegions { return $_[0]->{'cache'}; } =head2 rescore_gene_matrix_lsf Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene scores to rescore Example : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores); Description : This method runs rescore_gene_matrix() (via the synteny_resocre.pl script) in parallel with lsf, then combines the results to return a single rescored scoring matrix. Parallelisation is done by chunking the scoring matrix into several pieces (determined by the --synteny_rescore_jobs configuration option). Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix Exceptions : thrown on wrong or missing argument thrown on filesystem I/O error thrown on failure of one or mor lsf jobs Caller : InternalIdMapper plugins Status : At Risk : under development =cut sub rescore_gene_matrix_lsf { my $self = shift; my $matrix = shift; unless ($matrix and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } # serialise SyntenyFramework to disk $self->logger->debug("Serialising SyntenyFramework...\n", 0, 'stamped'); $self->write_to_file; $self->logger->debug("Done.\n", 0, 'stamped'); # split the ScoredMappingMatrix into chunks and write to disk my $matrix_size = $matrix->size; $self->logger->debug("Scores before rescoring: $matrix_size.\n"); my $num_jobs = $self->conf->param('synteny_rescore_jobs') || 20; $num_jobs++; my $dump_path = path_append($self->conf->param('basedir'), 'matrix/synteny_rescore'); $self->logger->debug("Creating sub-matrices...\n", 0, 'stamped'); foreach my $i (1..$num_jobs) { my $start = (int($matrix_size/($num_jobs-1)) * ($i - 1)) + 1; my $end = int($matrix_size/($num_jobs-1)) * $i; $self->logger->debug("$start-$end\n", 1); my $sub_matrix = $matrix->sub_matrix($start, $end); $sub_matrix->cache_file_name("gene_matrix_synteny$i.ser"); $sub_matrix->dump_path($dump_path); $sub_matrix->write_to_file; } $self->logger->debug("Done.\n", 0, 'stamped'); # create an empty lsf log directory my $logpath = path_append($self->logger->logpath, 'synteny_rescore'); system("rm -rf $logpath") == 0 or $self->logger->error("Unable to delete lsf log dir $logpath: $!\n"); system("mkdir -p $logpath") == 0 or $self->logger->error("Can't create lsf log dir $logpath: $!\n"); # build lsf command my $lsf_name = 'idmapping_synteny_rescore_'.time; my $options = $self->conf->create_commandline_options( logauto => 1, logautobase => "synteny_rescore", logpath => $logpath, interactive => 0, is_component => 1, ); my $cmd = qq{$Bin/synteny_rescore.pl $options --index \$LSB_JOBINDEX}; my $pipe = qq{|bsub -J$lsf_name\[1-$num_jobs\] } . qq{-o $logpath/synteny_rescore.\%I.out } . qq{-e $logpath/synteny_rescore.\%I.err } . $self->conf->param('lsf_opt_synteny_rescore'); # run lsf job array $self->logger->info("Submitting $num_jobs jobs to lsf.\n"); $self->logger->debug("$cmd\n\n"); local *BSUB; open BSUB, $pipe or $self->logger->error("Could not open open pipe to bsub: $!\n"); print BSUB $cmd; $self->logger->error("Error submitting synteny rescoring jobs: $!\n") unless ($? == 0); close BSUB; # submit dependent job to monitor finishing of jobs $self->logger->info("Waiting for jobs to finish...\n", 0, 'stamped'); my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } . qq{-o $logpath/synteny_rescore_depend.out /bin/true}; system($dependent_job) == 0 or $self->logger->error("Error submitting dependent job: $!\n"); $self->logger->info("All jobs finished.\n", 0, 'stamped'); # check for lsf errors sleep(5); my $err; foreach my $i (1..$num_jobs) { $err++ unless (-e "$logpath/synteny_rescore.$i.success"); } if ($err) { $self->logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n"); } # merge and return matrix $self->logger->debug("Merging rescored matrices...\n"); $matrix->flush; foreach my $i (1..$num_jobs) { # read partial matrix created by lsf job from file my $sub_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( -DUMP_PATH => $dump_path, -CACHE_FILE => "gene_matrix_synteny$i.ser", ); $sub_matrix->read_from_file; # merge with main matrix $matrix->merge($sub_matrix); } $self->logger->debug("Done.\n"); $self->logger->debug("Scores after rescoring: ".$matrix->size.".\n"); return $matrix; } # # =head2 rescore_gene_matrix Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene scores to rescore Example : my $new_scores = $sf->rescore_gene_matrix($gene_scores); Description : Rescores a gene matrix. Retains 70% of old score and builds other 30% from the synteny match. Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix Exceptions : thrown on wrong or missing argument Caller : InternalIdMapper plugins Status : At Risk : under development =cut sub rescore_gene_matrix { my $self = shift; my $matrix = shift; unless ($matrix and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } my $retain_factor = 0.7; foreach my $entry (@{ $matrix->get_all_Entries }) { my $source_gene = $self->cache->get_by_key('genes_by_id', 'source', $entry->source); my $target_gene = $self->cache->get_by_key('genes_by_id', 'target', $entry->target); my $highest_score = 0; foreach my $sr (@{ $self->get_all_SyntenyRegions }) { my $score = $sr->score_location_relationship($source_gene, $target_gene); $highest_score = $score if ($score > $highest_score); } #$self->logger->debug("highscore ".$entry->to_string." ". # sprintf("%.6f\n", $highest_score)); $matrix->set_score($entry->source, $entry->target, ($entry->score * 0.7 + $highest_score * 0.3)); } return $matrix; } =head2 logger Arg[1] : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set Example : $object->logger->info("Starting ID mapping.\n"); Description : Getter/setter for logger object Return type : Bio::EnsEMBL::Utils::Logger Exceptions : none Caller : constructor Status : At Risk : under development =cut sub logger { my $self = shift; $self->{'_logger'} = shift if (@_); return $self->{'_logger'}; } =head2 conf Arg[1] : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration to set Example : my $basedir = $object->conf->param('basedir'); Description : Getter/setter for configuration object Return type : Bio::EnsEMBL::Utils::ConfParser Exceptions : none Caller : constructor Status : At Risk : under development =cut sub conf { my $self = shift; $self->{'_conf'} = shift if (@_); return $self->{'_conf'}; } =head2 cache Arg[1] : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set Example : $object->cache->read_from_file('source'); Description : Getter/setter for cache object Return type : Bio::EnsEMBL::IdMapping::Cache Exceptions : none Caller : constructor Status : At Risk : under development =cut sub cache { my $self = shift; $self->{'_cache'} = shift if (@_); return $self->{'_cache'}; } 1;