Bio::EnsEMBL::IdMapping
ExonScoreBuilder
Toolbar
Summary
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
Description
Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from
Java application.
Methods
build_overlap_scores | No description | Code |
calc_overlap_score | No description | Code |
compare_exon_containers | No description | Code |
dump_filtered_exons | No description | Code |
exon_fasta_file | No description | Code |
exonerate_score | No description | Code |
non_mapped_transcript_rescore | No description | Code |
overlap_score | No description | Code |
parse_exonerate_results | No description | Code |
run_exonerate | No description | Code |
score_exons | No description | Code |
sort_exons | No description | Code |
write_filtered_exons | No description | Code |
Methods description
None available.
Methods code
build_overlap_scores | description | prev | next | Top |
sub build_overlap_scores
{ my $self = shift;
my $matrix = shift;
unless ($matrix and
$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
}
$self->logger->info("Reading sorted exons from cache...\n", 1, 'stamped');
my @source_exons = $self->sort_exons(
[values %{ $self->cache->get_by_name('exons_by_id', 'source') }]
);
my @target_exons = $self->sort_exons(
[values %{ $self->cache->get_by_name('exons_by_id', 'target') }]
);
$self->logger->info("Done.\n", 1, 'stamped');
my $source_ec = shift(@source_exons);
my $target_ec = shift(@target_exons);
my %source_overlap = ();
my %target_overlap = ();
$self->logger->info("Scoring...\n", 1, 'stamped');
while ($source_ec or $target_ec) {
my $add_source = 0;
my $add_target = 0;
if ($source_ec and $target_ec) {
my $cmp = $self->compare_exon_containers($source_ec, $target_ec);
$add_source = 1 if ($cmp <= 0);
$add_target = 1 if ($cmp >= 0);
} elsif ($source_ec) {
$add_source = 1;
} else {
$add_target = 1;
}
if ($add_source) {
if ($source_overlap{$source_ec->[0]}) {
delete $source_overlap{$source_ec->[0]};
} else {
$source_overlap{$source_ec->[0]} = $source_ec->[0];
foreach my $target_exon (values %target_overlap) {
next if (defined($matrix->get_score(
$source_ec->[0]->id, $target_exon->id)));
$self->calc_overlap_score($source_ec->[0], $target_exon,
$matrix);
}
}
$source_ec = shift(@source_exons);
}
if ($add_target) {
if ($target_overlap{$target_ec->[0]}) {
delete $target_overlap{$target_ec->[0]};
} else {
$target_overlap{$target_ec->[0]} = $target_ec->[0];
foreach my $source_exon (values %source_overlap) {
next if (defined($matrix->get_score(
$source_exon->id, $target_ec->[0]->id)));
$self->calc_overlap_score($source_exon, $target_ec->[0], $matrix);
}
}
$target_ec = shift(@target_exons);
}
}
$self->logger->info("Done.\n", 1, 'stamped');
return $matrix;
}
} |
sub calc_overlap_score
{ my $self = shift;
my $source_exon = shift;
my $target_exon = shift;
my $matrix = shift;
my ($start, $end);
return unless ($source_exon->strand == $target_exon->strand);
if ($source_exon->start > $target_exon->start) {
$start = $source_exon->start;
} else {
$start = $target_exon->start;
}
if ($source_exon->end < $target_exon->end) {
$end = $source_exon->end;
} else {
$end = $target_exon->end;
}
my $overlap = $end - $start + 1;
my $source_length = $source_exon->end - $source_exon->start + 1;
my $target_length = $target_exon->end - $target_exon->start + 1;
my $score = ($overlap/$source_length + $overlap/$target_length)/2;
$score *= 0.9 if ($source_exon->phase != $target_exon->phase);
if ($score >= 0.5) {
$matrix->add_score($source_exon->id, $target_exon->id, $score);
} } |
sub compare_exon_containers
{ my $self = shift;
my $e1 = shift;
my $e2 = shift;
return ( ($e1->[0]->common_sr_name cmp $e2->[0]->common_sr_name) ||
($e1->[1] <=> $e2->[1]) );
}
} |
sub dump_filtered_exons
{ my $self = shift;
my $matrix = shift;
unless ($matrix and
$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
throw('You must provide a ScoredMappingMatrix.');
}
my $source_count = $self->write_filtered_exons('source', $matrix);
my $target_count = $self->write_filtered_exons('target', $matrix);
return (($source_count > 0) and ($target_count > 0)); } |
sub exon_fasta_file
{ my $self = shift;
my $type = shift;
throw("You must provide a type.") unless $type;
return $self->cache->dump_path."/$type.exons.fasta"; } |
sub exonerate_score
{ my $self = shift;
my $matrix = shift;
unless ($matrix and
$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
}
my $dump_path = path_append($self->conf->param('basedir'), 'matrix');
my $exonerate_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
-DUMP_PATH => $dump_path,
-CACHE_FILE => 'exon_exonerate_matrix.ser',
);
my $exonerate_cache = $exonerate_matrix->cache_file;
if (-s $exonerate_cache) {
$self->logger->info("Reading exonerate matrix from file...\n", 0, 'stamped');
$self->logger->debug("Cache file $exonerate_cache.\n", 1);
$exonerate_matrix->read_from_file;
$self->logger->info("Done.\n", 0, 'stamped');
} else {
$self->logger->info("No exonerate matrix found. Will build new one.\n");
my $dump_count = $self->dump_filtered_exons($matrix);
if ($dump_count) {
$self->run_exonerate;
$self->parse_exonerate_results($exonerate_matrix);
} else {
$self->logger->info("No source and/or target exons dumped, so don't need to run exonerate.\n");
}
$exonerate_matrix->write_to_file;
}
return $exonerate_matrix;
}
} |
sub non_mapped_transcript_rescore
{ my $self = shift;
my $matrix = shift;
my $transcript_mappings = shift;
unless ($matrix and
$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
}
unless ($transcript_mappings and
$transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
}
my %transcript_lookup = map { $_->source => $_->target }
@{ $transcript_mappings->get_all_Entries };
my $i = 0;
foreach my $entry (@{ $matrix->get_all_Entries }) {
my @source_transcripts = @{ $self->cache->get_by_key(
'transcripts_by_exon_id', 'source', $entry->source) };
my @target_transcripts = @{ $self->cache->get_by_key(
'transcripts_by_exon_id', 'target', $entry->target) };
my $found_mapped = 0;
TR:
foreach my $source_tr (@source_transcripts) {
foreach my $target_tr (@target_transcripts) {
my $mapped_target = $transcript_lookup{$source_tr->id};
if ($mapped_target and ($mapped_target == $target_tr->id)) {
$found_mapped = 1;
last TR;
}
}
}
unless ($found_mapped) {
$matrix->set_score($entry->source, $entry->target, ($entry->score * 0.8));
$i++;
}
}
$self->logger->debug("Scored exons in non-mapped transcripts: $i\n", 1);
}
1; } |
sub overlap_score
{ my $self = shift;
my $dump_path = path_append($self->conf->param('basedir'), 'matrix');
my $matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
-DUMP_PATH => $dump_path,
-CACHE_FILE => 'exon_overlap_matrix.ser',
);
my $overlap_cache = $matrix->cache_file;
if (-s $overlap_cache) {
$self->logger->info("Reading exon overlap scoring matrix from file...\n", 0, 'stamped');
$self->logger->debug("Cache file $overlap_cache.\n", 1);
$matrix->read_from_file;
$self->logger->info("Done.\n", 0, 'stamped');
} else {
$self->logger->info("No exon overlap scoring matrix found. Will build new one.\n");
if ($self->cache->highest_common_cs) {
$self->logger->info("Overlap scoring...\n", 0, 'stamped');
$matrix = $self->build_overlap_scores($matrix);
$self->logger->info("Done.\n", 0, 'stamped');
}
$matrix->write_to_file;
}
return $matrix;
}
} |
sub parse_exonerate_results
{ my $self = shift;
my $exonerate_matrix = shift;
unless ($exonerate_matrix and
$exonerate_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
throw('You must provide a ScoredMappingMatrix.');
}
$self->logger->info("Parsing exonerate results...\n", 0, 'stamped');
my $dump_path = $self->cache->dump_path;
my $num_files = 0;
my $num_lines = 0;
opendir(DUMPDIR, $dump_path) or
$self->logger->error("Can't open $dump_path for reading: $!");
while (defined(my $file = readdir(DUMPDIR))) {
next unless $file =~ /exonerate_map\.\d+/;
$num_files++;
open(F, '<', "$dump_path/$file");
while (<F>) {
$num_lines++;
chomp;
my (undef, $source_id, $target_id, $match_length, $source_length,
$target_length) = split;
my $score = 0;
if ($source_length == 0 or $target_length == 0) {
$self->logger->warning("Alignment length is 0 for $source_id/$target_id.\n");
} else {
$score = 2 * $match_length / ($source_length + $target_length); }
$exonerate_matrix->add_score($source_id, $target_id, $score);
}
close(F);
}
closedir(DUMPDIR);
$self->logger->info("Done parsing $num_lines lines from $num_files result files.\n", 0, 'stamped');
return $exonerate_matrix; } |
sub run_exonerate
{ my $self = shift;
my $source_file = $self->exon_fasta_file('source');
my $target_file = $self->exon_fasta_file('target');
my $source_size = -s $source_file;
my $target_size = -s $target_file;
unless ($source_size and $target_size) {
throw("Can't find exon fasta files.");
}
my $logpath = path_append($self->logger->logpath, 'exonerate');
system("rm -rf $logpath") == 0 or
$self->logger->error("Unable to delete lsf log dir $logpath: $!\n");
system("mkdir -p $logpath") == 0 or
$self->logger->error("Can't create lsf log dir $logpath: $!\n");
my $dump_path = $self->cache->dump_path;
opendir(DUMPDIR, $dump_path) or
$self->logger->error("Can't open $dump_path for reading: $!");
while (defined(my $file = readdir(DUMPDIR))) {
next unless /exonerate_map\.\d+/;
unlink("$dump_path/$file") or
$self->logger->error("Can't delete $dump_path/$file: $!");
}
closedir(DUMPDIR);
my $bytes_per_job = $self->conf->param('exonerate_bytes_per_job') || 250000;
my $num_jobs = $self->conf->param('exonerate_jobs');
$num_jobs ||= int($source_size/$bytes_per_job + 1);
my $percent = ($self->conf->param('exonerate_threshold') || 0.5) * 100;
my $lsf_name = 'idmapping_exonerate_'.time;
my $exonerate_path = $self->conf->param('exonerate_path');
my $exonerate_extra_params = $self->conf->param('exonerate_extra_params');
my $exonerate_job = qq{$exonerate_path } .
qq{--query $source_file --target $target_file } .
q{--querychunkid $LSB_JOBINDEX } .
qq{--querychunktotal $num_jobs } .
q{--model affine:local -M 900 --showalignment FALSE --subopt no } .
qq{--percent $percent } .
$self->conf->param('exonerate_extra_params') . " " .
q{--ryo 'myinfo: %qi %ti %et %ql %tl\n' } .
qq{| grep '^myinfo:' > $dump_path/exonerate_map.\$LSB_JOBINDEX} . "\n";
$self->logger->info("Submitting $num_jobs exonerate jobs to lsf.\n");
$self->logger->debug("$exonerate_job\n\n");
local *BSUB;
open BSUB, "|bsub -J$lsf_name\[1-$num_jobs\] -o $logpath/exonerate.\%I.out"
or $self->logger->error("Could not open open pipe to bsub: $!\n");
print BSUB $exonerate_job;
$self->logger->error("Error submitting exonerate jobs: $!\n")
unless ($? == 0);
close BSUB;
$self->logger->info("Waiting for exonerate jobs to finish...\n", 0, 'stamped');
my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
qq{-o $logpath/exonerate_depend.out /bin/true};
system($dependent_job) == 0 or
$self->logger->error("Error submitting dependent job: $!\n");
$self->logger->info("All exonerate jobs finished.\n", 0, 'stamped');
my @missing;
my @error;
for (my $i = 1; $i <= $num_jobs; $i++) {
my $outfile = "$dump_path/exonerate_map.$i";
push @missing, $outfile unless (-s "$outfile");
my $errfile = "$logpath/exonerate.$i.err";
push @error, $errfile if (-s "$errfile");
}
if (@missing) {
$self->logger->info("Couldn't find all exonerate output files. These are missing:\n");
foreach (@missing) {
$self->logger->info("$_\n", 1);
}
exit(1);
}
if (@error) {
$self->logger->info("One or more exonerate jobs failed. Check these error files:\n");
foreach (@error) {
$self->logger->info("$_\n", 1);
}
exit(1);
} } |
sub score_exons
{ my $self = shift;
$self->logger->info("-- Scoring exons...\n\n", 0, 'stamped');
my $matrix = $self->overlap_score;
my $exonerate_matrix = $self->exonerate_score($matrix);
$self->logger->info("\nOverlap scoring matrix:\n");
$self->log_matrix_stats($matrix);
$self->logger->info("\nExonerate scoring matrix:\n");
$self->log_matrix_stats($exonerate_matrix);
$self->logger->info("\nMerging scoring matrices...\n", 0, 'stamped');
$matrix->merge($exonerate_matrix);
$self->logger->info("Done.\n\n", 0, 'stamped');
if ($self->logger->loglevel eq 'debug') {
$matrix->log('exon', $self->conf->param('basedir'));
}
$self->logger->info("Combined scoring matrix:\n");
$self->log_matrix_stats($matrix);
$self->logger->info("\nDone with exon scoring.\n\n", 0, 'stamped');
return $matrix;
}
} |
sub sort_exons
{ my $self = shift;
my $exons = shift;
return
sort { ($a->[0]->common_sr_name cmp $b->[0]->common_sr_name)
|| ($a->[1] <=> $b->[1]) }
(map { [$_, $_->common_start - 1] } @$exons),
(map { [$_, $_->common_end] } @$exons); } |
write_filtered_exons | description | prev | next | Top |
sub write_filtered_exons
{ my $self = shift;
my $type = shift;
my $matrix = shift;
throw("You must provide a type.") unless $type;
unless ($matrix and
$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
throw('You must provide a ScoredMappingMatrix.');
}
$self->logger->info("\nDumping $type exons to fasta file...\n", 0, 'stamped');
my $min_exon_length = $self->conf->param('min_exon_length') || 15;
my $total_exons = 0;
my $dumped_exons = 0;
my $fh;
my $file = $self->exon_fasta_file($type);
open($fh, '>', $file) or throw("Unable to open $file for writing: $!");
EXON:
foreach my $eid (sort { $b <=> $a }
keys %{ $self->cache->get_by_name('exons_by_id', $type) }) {
my $exon = $self->cache->get_by_key('exons_by_id', $type, $eid);
$total_exons++;
next EXON if ($exon->length < $min_exon_length);
if ($type eq 'source') {
foreach my $target (@{ $matrix->get_targets_for_source($eid) }) {
next EXON if ($matrix->get_score($eid, $target) > 0.9999);
}
} else {
foreach my $source (@{ $matrix->get_sources_for_target($eid) }) {
next EXON if ($matrix->get_score($source, $eid) > 0.9999);
}
}
print $fh '>', $eid, "\n", $exon->seq, "\n";
$dumped_exons++;
}
close($fh);
my $fmt = "%-30s%10s\n";
my $size = -s $file;
$self->logger->info(sprintf($fmt, 'Total exons:', $total_exons), 1);
$self->logger->info(sprintf($fmt, 'Dumped exons:', $dumped_exons), 1);
$self->logger->info(sprintf($fmt, 'Dump file size:', parse_bytes($size)), 1);
$self->logger->info("Done.\n\n", 0, 'stamped');
return $dumped_exons; } |
General documentation
Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
/info/about/code_licence.html