Raw content of Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes
# cARED For by Ensembl
#
# Copyright GRL & EBI
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::DBAdaptor->new($locator);
my $genscan = Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes
->new (-db => $pipelinedb,
-input_id => $input_id
-analysis => $analysis );
$genscan->fetch_input();
$genscan->run();
$genscan->write_output(); #writes to stdout
=head1 DESCRIPTION
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes;
require Exporter;
use vars qw(@ISA @EXPORT);
use strict;
use warnings;
use Data::Dumper;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Analysis::Config::General;
use Bio::EnsEMBL::Analysis::Config::WGA2Genes;
use Bio::EnsEMBL::Analysis::Tools::Logger;
use Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold;
use Bio::EnsEMBL::Analysis::Tools::WGA2Genes::CoordUtils;
use Bio::EnsEMBL::Analysis::Tools::WGA2Genes::ChainUtils;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils
qw(replace_stops_with_introns);
use IO::String;
use Bio::SeqIO;
use DBI qw(:sql_types);
use DBD::mysql;
@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB
Exporter);
@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
############################################################
=head2 new
Title : fetch_input
=cut
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
$self->read_and_check_config($WGA2GENES_CONFIG_BY_LOGIC);
return $self;
}
############################################################
=head2 fetch_input
Title : fetch_input
=cut
sub fetch_input {
my( $self) = @_;
my $input_id = $self->input_id;
throw("No input id") unless defined($input_id);
my $q_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
new(%{$self->QUERY_CORE_DB});
my $t_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
new(%{$self->TARGET_CORE_DB});
my $compara_dbh = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->
new(%{$self->COMPARA_DB});
my $query_species =
$q_dbh->get_MetaContainerAdaptor->get_Species->binomial;
my $target_species =
$t_dbh->get_MetaContainerAdaptor->get_Species->binomial;
my $gdb_adap = $compara_dbh->get_GenomeDBAdaptor;
my $q_gdb = $gdb_adap->fetch_by_name_assembly($query_species);
my $t_gdb = $gdb_adap->fetch_by_name_assembly($target_species);
########
# check that the default assembly for the query and target agrees
# with that for the method_link_species_set GenomeDBs
########
my ($q_assembly_version, $t_assembly_version);
eval {
$q_assembly_version = $q_dbh->get_CoordSystemAdaptor->
fetch_by_name('toplevel',
$q_gdb->assembly);
$t_assembly_version = $t_dbh->get_CoordSystemAdaptor->
fetch_by_name('toplevel',
$t_gdb->assembly);
};
$@ and do {
throw("Had trouble fetching coord systems for ".
$q_gdb->assembly . " and " .
$t_gdb->assembly . " from core dbs: $@");
};
##############
# populate a list of genes and transcripts to ignore
##############
my $kill_list = {};
if ($self->KILL_LIST) {
open(KILL, $self->KILL_LIST) or
throw("Could not open kill list " . $self->KILL_LIST . " for reading");
while() {
/^\#/ and next;
/^(\S+)/ and $kill_list->{$1} = 1;
}
close(KILL);
}
########
# fetch the genes; need to work in the coordinate space of the
# top-level slice to be consistent with compara
########
my $sa = $q_dbh->get_SliceAdaptor;
my (@gene_records, $reg_start, $reg_end);
if ($input_id =~ /:/) {
# assume slice name
my $slice = $sa->fetch_by_name($input_id);
$reg_start = $slice->start;
$reg_end = $slice->end;
$self->query_slice($sa->fetch_by_region('toplevel',
$slice->seq_region_name));
foreach my $g (@{$slice->get_all_Genes}) {
next if $g->biotype ne 'protein_coding';
next if exists $kill_list->{$g->stable_id};
$g = $g->transfer($self->query_slice);
my @good_trans;
foreach my $t (@{$g->get_all_Transcripts}) {
next if $t->coding_region_start < $slice->start;
next if $t->coding_region_end > $slice->end;
next if exists $kill_list->{$t->stable_id};
push @good_trans, $t;
}
if (@good_trans) {
if ($self->LONGEST_SOURCE_TRANSCRIPT) {
# load seq up front. Otherwise get warnings in the sort
map { $_->translateable_seq } @good_trans;
@good_trans = sort {
CORE::length($b->translateable_seq) <=> CORE::length($a->translateable_seq);
} @good_trans;
@good_trans = ($good_trans[0]);
}
push @gene_records, Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes::GeneRecord
->new(-gene => $g,
-source_transcripts => \@good_trans);
}
}
} else {
# assume iid is a gene stable id; ignore the kill-list
my ($gene, $tran);
eval {
$gene = $q_dbh->get_GeneAdaptor->fetch_by_stable_id($input_id);
};
eval {
$tran = $q_dbh->get_TranscriptAdaptor->fetch_by_stable_id($input_id);
};
if (not defined $gene and not defined $tran) {
throw("Could not find gene or transcript with '$input_id' in query database");
}
if (not defined $gene) {
$gene = Bio::EnsEMBL::Gene->new;
$gene->stable_id("None");
$gene->add_Transcript($tran);
}
$self->query_slice($sa->fetch_by_region('toplevel',
$gene->seq_region_name));
$gene = $gene->transfer($self->query_slice);
$reg_start = $gene->start;
$reg_end = $gene->end;
my @good_trans;
foreach my $t (@{$gene->get_all_Transcripts}) {
if (not exists($kill_list->{$t->stable_id})) {
push @good_trans, $t;
}
}
if (@good_trans) {
if ($self->LONGEST_SOURCE_TRANSCRIPT) {
# load seq up front. Otherwise get warnings in the sort
map { $_->translateable_seq } @good_trans;
@good_trans = sort {
CORE::length($b->translateable_seq) <=> CORE::length($a->translateable_seq);
} @good_trans;
@good_trans = ($good_trans[0]);
}
push @gene_records, Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes::GeneRecord
->new(-gene => $gene,
-source_transcripts => \@good_trans);
}
}
$self->gene_records( \@gene_records );
#########
# get the compara data: MethodLinkSpeciesSet, reference DnaFrag,
# and all GenomicAlignBlocks
#########
my $mlss = $compara_dbh->get_MethodLinkSpeciesSetAdaptor
->fetch_by_method_link_type_GenomeDBs($self->INPUT_METHOD_LINK_TYPE,
[$q_gdb, $t_gdb]);
throw("No MethodLinkSpeciesSet for :\n" .
$self->INPUT_METHOD_LINK_TYPE . "\n" .
$query_species . "\n" .
$target_species)
if not $mlss;
my $dnafrag = $compara_dbh->get_DnaFragAdaptor->
fetch_by_GenomeDB_and_name($q_gdb,
$self->query_slice->seq_region_name);
my $gaba = $compara_dbh->get_GenomicAlignBlockAdaptor;
my $gen_al_blocks =
$gaba->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss,
$dnafrag,
$reg_start,
$reg_end);
my (%chains, @chains);
foreach my $block (@$gen_al_blocks) {
my $qga = $block->reference_genomic_align;
my ($tga) = @{$block->get_all_non_reference_genomic_aligns};
# fetch the target slice for later reference
if (not exists $self->target_slices->{$tga->dnafrag->name}) {
$self->target_slices->{$tga->dnafrag->name} =
$t_dbh->get_SliceAdaptor->fetch_by_region('toplevel',
$tga->dnafrag->name);
}
if ($block->reference_genomic_align->dnafrag_strand < 0) {
$block->reverse_complement;
}
push @{$chains{$block->group_id}}, $block;
}
foreach my $chain_id (keys %chains) {
push @chains, [
sort {
$a->reference_genomic_align->dnafrag_start <=>
$b->reference_genomic_align->dnafrag_start;
} @{$chains{$chain_id}}
];
}
@chains = sort { $b->[0]->score <=> $a->[0]->score } @chains;
$self->genomic_align_block_chains(\@chains);
}
############################################################
=head2 run
Title : run
=cut
sub run {
my ($self) = @_;
my (@results, @original_genes, @chains_generecs_pairs);
my $external_db_id = $self->get_external_db_id();
logger_info("INITIAL GENES: " . join(" ", map {$_->gene->stable_id}
@{$self->gene_records}) . "\n");
logger_info("ORIGINAL_CHAINS\n" .
stringify_chains($self->genomic_align_block_chains));
@chains_generecs_pairs =
$self->segregate_chains_and_generecords($self->genomic_align_block_chains,
$self->gene_records,
$self->MIN_CHAIN_ANNOTATION_OVERLAP);
foreach my $chains_generecs (@chains_generecs_pairs) {
my ($alignment_chains, $generecs) = @$chains_generecs;
my $pair_id = join(":",
map { $_->gene->stable_id } @$generecs);
logger_info("CHAINS_FOR_GENESET $pair_id\n" .
stringify_chains($alignment_chains));
my %blocks_from_used_chains;
my @these_generecs = map { $_->clone } @$generecs;
# Repeat transcript contruction until we had no rejected transcripts.
# This is because if a transcript is rejected, some of the underlying
# GeneScaffold components may be unnecessary
for(;;) {
logger_info("Working with genes " . join(" ", map { $_->gene->stable_id} @these_generecs));
my @cds_feats = map { @{$_->transcript_cds_feats} } @these_generecs;
my $filtered_chains =
filter_irrelevant_chains($alignment_chains,
\@cds_feats);
logger_info("RELEVANT_CHAINS\n" . stringify_chains($filtered_chains));
$filtered_chains = filter_inconsistent_chains($filtered_chains,
$self->OVERLAP_CHAIN_FILTER);
logger_info("CONSISTENT_CHAINS\n" . stringify_chains($filtered_chains));
$filtered_chains = $self->remove_contig_split_chains($filtered_chains);
logger_info("NON_CONTIG_SPLIT_CHAINS\n" . stringify_chains($filtered_chains));
$filtered_chains = $self->trim_exon_split_chains($filtered_chains,
\@cds_feats);
logger_info("TRIMMED_CHAINS\n" . stringify_chains($filtered_chains));
my @these_pairs = $self->segregate_chains_and_generecords($filtered_chains,
\@these_generecs,
$self->MIN_CHAIN_ANNOTATION_OVERLAP);
my @these_results;
foreach my $pair (@these_pairs) {
my ($subset_chains, $subset_generecs) = @$pair;
my $net_blocks = flatten_chains($subset_chains, 1);
my $gs_name = $subset_generecs->[0]->gene->stable_id . "-0";
my $gene_scaffold = $self->make_gene_scaffold_and_project_genes($net_blocks,
$subset_generecs,
$gs_name,
$self->MAX_EDITABLE_STOPS_PRIMARY,
$self->MIN_COVERAGE,
$self->MIN_NON_GAP,
$self->FULLY_GAP_EXONS,
$self->PARTIAL_GAP_EXONS,
$external_db_id);
push @these_results, [$gene_scaffold, @$subset_generecs];
}
# check that all transcripts were successfully projected. If not,
# the gene scaffolds may contain unnecessary pieces, in which case
# we remove the unsuccesful transcript and repeat.
my @kept_generecs;
my $need_to_repeat = 0;
foreach my $res (@these_results) {
my ($gs, @res_genes) = @$res;
foreach my $res_gene (@res_genes) {
if (scalar(@{$res_gene->source_transcripts}) ==
scalar(@{$res_gene->projected_transcripts})) {
push @kept_generecs, $res_gene;
} else {
if (@{$res_gene->good_source_transcripts}) {
push @kept_generecs, $res_gene;
}
$need_to_repeat = 1;
}
}
}
if ($need_to_repeat) {
if (@kept_generecs) {
@these_generecs = @kept_generecs;
foreach my $grec (@these_generecs) {
$grec->source_transcripts($grec->good_source_transcripts);
$grec->projected_transcripts([]);
$grec->good_source_transcripts([]);
}
next;
} else {
last;
}
} else {
if (@these_results) {
push @results, @these_results;
foreach my $pair (@these_pairs) {
my ($these_chains) = @$pair;
foreach my $chain (@$these_chains) {
foreach my $block (@$chain) {
$blocks_from_used_chains{$block} = $block;
}
}
}
}
last;
}
}
my $filtered_chains = filter_block_overlap_chains($alignment_chains,
[values %blocks_from_used_chains]);
logger_info("ITERATING over remaining single chains\n");
foreach my $chain (@$filtered_chains) {
my ($single_pair) = $self->segregate_chains_and_generecords([$chain],
$generecs,
$self->MIN_CHAIN_ANNOTATION_OVERLAP);
if (defined $single_pair) {
logger_info("CHAIN_WITH_GENES\n" . stringify_chains([$chain]));
my @grs = map { $_->clone } @{$single_pair->[1]};;
my ($tg_al) = @{$chain->[0]->get_all_non_reference_genomic_aligns};
my $gs_name = $tg_al->dnafrag->name . "-" . $tg_al->dnafrag_start;
my $gene_scaffold = $self->make_gene_scaffold_and_project_genes($chain,
\@grs,
$gs_name,
$self->MAX_EDITABLE_STOPS_NON_PRIMARY,
$self->MIN_COVERAGE,
$self->MIN_NON_GAP,
$self->FULLY_GAP_EXONS,
$self->PARTIAL_GAP_EXONS,
$external_db_id);
my @kept;
foreach my $gr (@grs) {
if (scalar(@{$gr->projected_transcripts})) {
push @kept, $gr;
}
}
if (@kept) {
push @results, [$gene_scaffold, @kept];
}
}
}
}
$self->output(\@results);
}
############################################################
=head2 write_output
Title : write_output
=cut
sub write_output {
my ($self) = @_;
my $outfile = $self->create_output_dir;
# This is just to make sure that the output files written properly
# and contain all the information that they should. This is done by
# counting the number of lines in each file and comparing them to
# what is actually printed out.
open(FH, ">$outfile") or throw("Failed to open $outfile!\n$!");
my $fh = \*FH;
my $line_numbers = 0;
print ($fh "#\n");
$line_numbers++;
printf($fh "# WGA2Genes output for %s\n", $self->input_id);
$line_numbers++;
printf($fh "# Genes are:");
$line_numbers++;
foreach my $grec (@{$self->gene_records}) {
print ($fh " " . $grec->gene->stable_id);
}
print ($fh "\n#\n");
$line_numbers++;
#print "OUTFILE ".$outfile." used\n";
foreach my $obj (@{$self->output}) {
my ($gs, @res_genes) = @$obj;
$line_numbers += $self->write_agp($gs, $fh);
foreach my $g (@res_genes) {
$line_numbers += $self->write_gene($gs,
$g,
$fh);
}
}
if (!-e $outfile) { throw("The $outfile was not created.\n"); }
#print $line_numbers, "<-- counted number of lines\n";
my $cmd = "wc -l $outfile";
# Run the command and capture the output
my $cmd_string = `$cmd`;
my @wc = (split(/ /, $cmd_string));
#print $wc[0], "<-- what should have been written\n";
if ($wc[0] !~ $line_numbers) {
throw("The number of lines in the output files differ to what should have
been saved. You might need to rerun the pipeline\n");
}
close($fh);
return;
}
######################################
# internal methods
#####################################
#################################################################
# FUNCTION : make_gene_scaffold_and_project_genes
#
# Description:
# Obvious
#################################################################
sub make_gene_scaffold_and_project_genes {
my ($self,
$blocks,
$res_genes,
$name,
$max_stops,
$min_coverage,
$min_non_gap,
$add_gaps,
$extend_into_gaps,
$external_db_id) = @_;
my $gene_scaffold = Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold->
new(-name => $name,
-genomic_align_blocks => $blocks,
-from_slice => $self->query_slice,
-to_slices => $self->target_slices,
-transcripts => [map {@{$_->source_transcripts}} @$res_genes],
-max_readthrough_dist => $self->MAX_EXON_READTHROUGH_DIST,
-add_gaps => $add_gaps,
-extend_into_gaps => $extend_into_gaps,
);
foreach my $res_gene (@$res_genes) {
$res_gene->name($name . "." . $res_gene->gene->stable_id);
foreach my $tran (@{$res_gene->source_transcripts}) {
my $proj_trans = $gene_scaffold->place_transcript($tran, 1, $external_db_id);
if ( $tran->analysis) {
$proj_trans->analysis($tran->analysis);
for my $e( @{$proj_trans->get_all_Exons } ) {
for my $sf ( @{ $e->get_all_supporting_features } ) {
$sf->analysis($tran->analysis);
}
}
}
$proj_trans =
$self->process_transcript($proj_trans,
$max_stops,
$min_coverage,
$min_non_gap,
$tran->stable_id);
if ($proj_trans) {
push @{$res_gene->good_source_transcripts}, $tran;
push @{$res_gene->projected_transcripts}, $proj_trans;
}
}
}
return $gene_scaffold;
}
#################################################################
# FUNCTION : process_transcript
#
# Description:
# Subjects the given transcript to a few tests, returning
# the transcript if they succeed, undef if not. If the
# transcript contains less than $max_stops stops, these
# are "spliced out"; otherwise the transcripts is rejected
#################################################################
sub process_transcript {
my ($self,
$tran,
$max_stops,
$min_coverage,
$min_non_gap,
$source_id) = @_;
return 0 if not $tran;
my (@processed_transcripts, $num_stops, $prop_non_gap);
my @exons = @{$tran->get_all_Exons};
my ($tsf) = @{$tran->get_all_supporting_features};
my $pep = $tran->translate->seq;
##################
# reject transcripts that have:
# zero length
# less than minimum coverage of parent peptide
# higher that maximum proportion of gap residues
##################
if (CORE::length($pep) == 0) {
logger_info("Rejecting proj of $source_id because was just a stop codon");
return 0;
}
if ($tsf->hcoverage < $min_coverage) {
logger_info("Rejecting proj of $source_id because coverage (" .
$tsf->hcoverage .
") is below threshold");
return 0;
}
foreach my $attr (@{$tran->get_all_Attributes}) {
if ($attr->code eq "PropNonGap") {
$prop_non_gap = $attr->value;
} elsif ($attr->code eq "NumStops") {
$num_stops = $attr->value;
}
}
if ($prop_non_gap < $min_non_gap) {
logger_info("Rejecting proj of $source_id due to too many Ns ($prop_non_gap)");
return 0;
}
if ($num_stops > $max_stops) {
logger_info("Rejecting proj of $source_id due to too many stops ($num_stops)");
return 0;
} elsif ($num_stops == 0) {
return $tran;
}
##################
# number of stops is non-zero but acceptable. Need to
# operate on the transcript to jump over the stops
$tran = replace_stops_with_introns($tran);
return $tran;
}
###################################################################
# FUNCTION: remove_contig_split_chains
#
# Decription:
# This function removes chains that (a) interlock with higher
# scoring chains, and (b) this interlocking cannot be explained
# by contig mis-ordering within the involved scaffolds
###################################################################
sub remove_contig_split_chains {
my ($self, $input_chains) = @_;
my @kept_chains;
foreach my $c (@$input_chains) {
my (%coords_by_tid, @all_coords, @merged_coords);
my $consistent = 1;
my @these_chains = (@kept_chains, $c);
my $blocks = flatten_chains(\@these_chains, 1);
foreach my $bl (@$blocks) {
my ($tga) = @{$bl->get_all_non_reference_genomic_aligns};
my $tgac = Bio::EnsEMBL::Mapper::Coordinate->new
($tga->dnafrag->name,
$tga->dnafrag_start,
$tga->dnafrag_end,
$tga->dnafrag_strand);
push @{$coords_by_tid{$tgac->id}}, $tgac;
push @all_coords, $tgac;
}
foreach my $tgac (@all_coords) {
my $merged = 0;
if (@merged_coords and
$merged_coords[-1]->id eq $tgac->id and
$merged_coords[-1]->strand eq $tgac->strand) {
if ($tgac->strand > 0) {
if ($tgac->start > $merged_coords[-1]->end) {
my $can_merge = 1;
foreach my $o_c (@{$coords_by_tid{$tgac->id}}) {
if ($o_c->start > $merged_coords[-1]->end and
$o_c->end < $tgac->start) {
$can_merge = 0;
last;
}
}
if ($can_merge) {
$merged_coords[-1]->end($tgac->end);
$merged = 1;
}
}
} else {
if ($tgac->end < $merged_coords[-1]->start) {
my $can_merge = 1;
foreach my $o_c (@{$coords_by_tid{$tgac->id}}) {
if ($o_c->start > $tgac->end and
$o_c->end < $merged_coords[-1]->start) {
$can_merge = 0;
last;
}
}
if ($can_merge) {
$merged_coords[-1]->start($tgac->start);
$merged = 1;
}
}
}
}
if (not $merged) {
push @merged_coords, $tgac;
}
}
%coords_by_tid = ();
foreach my $c (@merged_coords) {
push @{$coords_by_tid{$c->id}}, $c;
}
TID: foreach my $id (keys %coords_by_tid) {
my @chain = @{$coords_by_tid{$id}};
@chain = sort { $a->start <=> $b->start } @chain;
# each chain should be separable from the last at contig
# level
for(my $i=1; $i < @chain; $i++) {
my $prev = $chain[$i-1];
my $this = $chain[$i];
my ($ex_prev) = extend_coord($prev,
$self->target_slices->{$id});
my ($ex_this) = extend_coord($this,
$self->target_slices->{$id});
if ($ex_prev->end >= $ex_this->start) {
$consistent = 0;
last TID;
}
}
}
if ($consistent) {
push @kept_chains, $c;
}
}
return \@kept_chains;
}
###################################################################
# FUNCTION: trim_exon_split_chains
#
# Decription:
# When a single exon overlaps blocks in more than one chain, the
# minority chains are trimmed back.
# This prevents the situation of a single-exon gene in the
# reference mapping down to a multi-scaffold GeneScaffold, and
# generally gives cleaner results.
###################################################################
sub trim_exon_split_chains {
my ($self, $chains, $feats) = @_;
# make nr list of coding regions
my (@nr_cds);
foreach my $cds (sort { $a->start <=> $b->start } @$feats) {
if (not @nr_cds or $cds->start > $nr_cds[-1]->end + 1) {
push @nr_cds, $cds;
} elsif ($cds->end > $nr_cds[-1]->end) {
$nr_cds[-1]->end($cds->end);
}
}
foreach my $cds (@nr_cds) {
my (@ov_chains, @other_chains);
CH: foreach my $ch (@$chains) {
my $ov_bps = 0;
foreach my $bl (@$ch) {
my $ga = $bl->reference_genomic_align;
if ($ga->dnafrag_start <= $cds->end and
$ga->dnafrag_end >= $cds->start) {
my ($st, $en) = ($cds->start, $cds->end);
$st = $ga->dnafrag_start if $ga->dnafrag_start > $st;
$en = $ga->dnafrag_end if $ga->dnafrag_end < $en;
$ov_bps += ($en - $st + 1);
}
}
if ($ov_bps) {
push @ov_chains, {
chain => $ch,
ov_bps => $ov_bps,
};
} else {
push @other_chains, $ch;
}
}
if (scalar(@ov_chains) > 1) {
# more than one chain implicated. Find the "dominant" chain
# for this exon and trim back the blocks for the other ones
@ov_chains = sort { $b->{ov_bps} <=> $a->{ov_bps} } @ov_chains;
my @new_chains = (@other_chains, $ov_chains[0]->{chain});
for (my $i=1; $i < @ov_chains; $i++) {
my @retained_blocks;
foreach my $bl (@{$ov_chains[$i]->{chain}}) {
my $ga = $bl->reference_genomic_align;
if ($ga->dnafrag_end < $cds->start or $ga->dnafrag_start > $cds->end) {
push @retained_blocks, $bl;
} else {
if ($ga->dnafrag_start < $cds->start) {
my $cut_bl = $bl->restrict_between_reference_positions($ga->dnafrag_start,
$cds->start - 1);
$cut_bl->score($bl->score);
push @retained_blocks, $cut_bl;
}
if ($ga->dnafrag_end > $cds->end) {
my $cut_bl = $bl->restrict_between_reference_positions($cds->end + 1,
$ga->dnafrag_end);
$cut_bl->score($bl->score);
push @retained_blocks, $cut_bl;
}
}
}
if (@retained_blocks) {
push @new_chains, \@retained_blocks;
}
}
$chains = \@new_chains;
}
}
return $chains;
}
###################################################################
# FUNCTION: segregate_chains_and_generecords
#
# Decription:
# partition the chains and genes into (chain_list, gene_list)
# pairs, where each chain_list contains chains that touch
# one or more genes in gene_list
###################################################################
sub segregate_chains_and_generecords {
my ($self, $chains, $generecs, $min_overlap) = @_;
my (%generecs_by_id, %chains_by_id);
foreach my $grec (@$generecs) {
$generecs_by_id{$grec} = $grec;
}
my (%genes_per_chain);
foreach my $c (@$chains) {
$chains_by_id{$c} = $c;
my @ref_gas = map { $_->reference_genomic_align } @$c;
@ref_gas = sort { $a->dnafrag_start <=> $b->dnafrag_start } @ref_gas;
my %overlapping;
foreach my $grec (@$generecs) {
my $overlap_bps = 0;
BLOCK:
foreach my $bl (@ref_gas) {
#printf " looking at block %s %d %d\n", $bl->dnafrag->name, $bl->dnafrag_start, $bl->dnafrag_end;
FEAT:
foreach my $f (@{$grec->transcript_cds_feats}) {
#printf " looking at cds feature %d %d\n", $f->start, $f->end;
if ($f->start > $bl->dnafrag_end) {
# no exons in this transcript will overlap the block
next BLOCK;
} elsif ($f->end < $bl->dnafrag_start) {
next FEAT;
} else {
my $ov_start = $f->start;
my $ov_end = $f->end;
$ov_start = $bl->dnafrag_start if $bl->dnafrag_start > $ov_start;
$ov_end = $bl->dnafrag_end if $bl->dnafrag_end < $ov_end;
$overlap_bps += ($ov_end - $ov_start + 1);
}
}
}
if ($overlap_bps >= $min_overlap) {
$overlapping{$grec} = 1;
}
}
foreach my $grecid (keys %overlapping) {
#print "FOUND A GENE OVERLAP\n";
$genes_per_chain{$c}->{$grecid} = 1;
}
}
my (@clusters, @name_clusters, @res_clusters);
foreach my $cref (keys %genes_per_chain) {
my $clust = {
chains => { $cref => 1 },
genes => {},
};
foreach my $grecid (keys %{$genes_per_chain{$cref}}) {
$clust->{genes}->{$grecid} = 1;
}
push @clusters, $clust;
}
while(@clusters) {
my $this_clust = shift @clusters;
my @merged;
do {
@merged = ();
for (my $i=0; $i < @clusters; $i++) {
# possibly merge with this clust
# if merge, remove from @clusters and re-search
my $other_clust = $clusters[$i];
my $in_common = 0;
foreach my $this_g (keys %{$this_clust->{genes}}) {
if (exists $other_clust->{genes}->{$this_g}) {
$in_common = 1;
last;
}
}
if ($in_common) {
foreach my $cref (keys %{$other_clust->{chains}}) {
$this_clust->{chains}->{$cref} = 1;
}
foreach my $gref (keys %{$other_clust->{genes}}) {
$this_clust->{genes}->{$gref} = 1;
}
push @merged, $i;
}
}
if (@merged) {
foreach my $i (reverse @merged) {
splice (@clusters, $i, 1);
}
}
} while @merged;
push @name_clusters, $this_clust;
}
# finally: form the clusters
foreach my $c (@name_clusters) {
my (@chains, @genes);
foreach my $cref (keys %{$c->{chains}}) {
push @chains, $chains_by_id{$cref};
}
foreach my $gref (keys %{$c->{genes}}) {
push @genes, $generecs_by_id{$gref};
}
@chains = sort { $b->[0]->score <=> $a->[0]->score } @chains;
@genes = sort { $a->gene->start <=> $b->gene->start } @genes;
push @res_clusters, [\@chains, \@genes];
}
return @res_clusters;
}
###########################################################
# write methods
###########################################################
sub write_agp {
my ($self,
$gscaf,
$fh) = @_;
$fh = \*STDOUT if(!$fh);
my $prefix = "##-AGP";
my $line_count = 0;
my @tsegs = $gscaf->project_down;
my @qsegs = $gscaf->project_up;
print $fh "$prefix \#\n";
$line_count++;
printf($fh "$prefix \#\# AGP for %s source-region=%s/%d-%d (iid=%s)\n",
$gscaf->seq_region_name,
$qsegs[0]->to_Slice->seq_region_name,
$qsegs[0]->to_Slice->start,
$qsegs[-1]->to_Slice->end,
$self->input_id);
$line_count++;
print $fh "$prefix \#\n";
$line_count++;
my $piece_count = 1;
for(my $i=0; $i < @tsegs; $i++) {
my $seg = $tsegs[$i];
printf($fh "$prefix %s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\t%s\n",
$gscaf->seq_region_name,
$seg->from_start,
$seg->from_end,
$piece_count++,
"W",
$seg->to_Slice->seq_region_name,
$seg->to_Slice->start,
$seg->to_Slice->end,
$seg->to_Slice->strand < 0 ? "-" : "+"
);
$line_count++;
if ($i < @tsegs - 1) {
printf($fh "$prefix %s\t%d\t%d\t%s\t%s\t%d\n",
$gscaf->seq_region_name,
$seg->from_end + 1,
$tsegs[$i+1]->from_start - 1,
$piece_count++,
"N",
$tsegs[$i+1]->from_start - $seg->from_end - 1,
);
$line_count++;
}
}
return $line_count;
}
sub write_gene {
my ($self,
$gene_scaf,
$g,
$fh ) = @_;
$fh = \*STDOUT if(!$fh);
my $prefix = "##-GENES ";
my $line_count = 0;
my $seq_id = $gene_scaf->seq_region_name;
my $gene_id = $g->name;
printf $fh "$prefix \# Gene report for $seq_id\n";
$line_count++;
foreach my $tran (@{$g->projected_transcripts}) {
my ($sf) = @{$tran->get_all_supporting_features};
my $tran_id = $gene_id . "_" . $sf->hseqname;
foreach my $attr (@{$tran->get_all_Attributes}) {
printf($fh "$prefix \##\-ATTRIBUTE transcript=$tran_id code=%s value=%s\n",
$attr->code,
$attr->value);
$line_count++;
}
my @exons = @{$tran->get_all_Exons};
for (my $i=0; $i < @exons; $i++) {
my $exon = $exons[$i];
my $exon_id = $tran_id . "_" . $i;
printf($fh "%s %s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%s=%s; %s=%s; %s=%s\n",
$prefix,
$seq_id,
"WGA2Genes",
"Exon",
$exon->start,
$exon->end,
$exon->strand,
$exon->phase,
$exon->end_phase,
"exon",
$exon_id,
"transcript",
$tran_id,
"gene",
$gene_id);
$line_count++;
foreach my $sup_feat (@{$exon->get_all_supporting_features}) {
foreach my $ug ($sup_feat->ungapped_features) {
printf($fh "%s %s\t%s\t%s\t%d\t%d\t%d\t%s=%s; %s=%s; %s=%s; %s=%s; %s=%s; %s=%s\n",
$prefix,
$seq_id,
"WGA2Genes",
"Supporting",
$ug->start,
$ug->end,
$ug->strand,
"exon",
$exon_id,
"hname",
$ug->hseqname,
"hstart",
$ug->hstart,
"hend",
$ug->hend,
"external_db_id",
$ug->external_db_id,
"hcoverage",
$ug->hcoverage);
$line_count++;
}
}
}
my $pep = $tran->translate;
$pep->id($tran_id);
my $fasta_string;
my $stringio = IO::String->new($fasta_string);
my $seqio = Bio::SeqIO->new(-format => 'fasta',
-fh => $stringio);
$seqio->write_seq($pep);
logger_info("PROJECTED-PEPTIDE:\n$fasta_string\n");
if ($pep->seq =~ /^X/ or $pep->seq =~ /X$/) {
logger_info("Transcript $tran_id has Xs at the ends");
}
}
return $line_count;
}
sub create_output_dir {
my ($self, $output_dir_number) = @_;
$output_dir_number = 10 if(!$output_dir_number);
my $num = int(rand($output_dir_number));
my $output_dir = $self->OUTPUT_DIR;
if (! -e $output_dir) {
warning("Your output directory '" . $output_dir . "' does not exist");
warning("- it will be created now\n");
eval{
system("mkdir -p " . $output_dir);
};
if($@){
throw("Failed to make output directory " . $output_dir . "$@" );
}
}
my $dir = $output_dir."/".$num;
if(! -e $dir){
my $command = "mkdir $dir";
eval{
system($command);
};
if($@){
throw("Failed to make $dir $@");
}
}
my $filename = $dir. "/" . $self->input_id."_".$self->analysis->logic_name."_";
$filename .= int(rand(1000));
$filename .= ".out";
return $filename;
}
###########################
# gets/sets
###########################
sub genomic_align_block_chains {
my ($self, $val) = @_;
if (defined $val) {
$self->{_gen_al_chains} = $val;
}
return $self->{_gen_al_chains};
}
sub gene_records {
my ($self, $val) = @_;
if (defined $val) {
$self->{_gene_recs} = $val;
}
return $self->{_gene_recs};
}
sub query_slice {
my ($self, $val) = @_;
if (defined $val) {
$self->{_query_tl_slice} = $val;
}
return $self->{_query_tl_slice};
}
sub target_slices {
my ($self, $val) = @_;
if (defined $val) {
$self->{_target_slices} = $val;
}
if (not exists $self->{_target_slices}) {
$self->{_target_slices} = {};
}
return $self->{_target_slices};
}
####################################
# config variable holders
####################################
sub read_and_check_config {
my ($self, $hash) = @_;
$self->SUPER::read_and_check_config($hash);
my $logic = $self->analysis->logic_name;
foreach my $var (qw(INPUT_METHOD_LINK_TYPE
QUERY_CORE_DB
TARGET_CORE_DB
COMPARA_DB
TARGET_SPECIES_EXTERNAL_DBNAME)) {
throw("You must define $var in config for logic '$logic'" .
" or in the DEFAULT entry")
if not $self->$var;
}
}
#
# core options
#
sub INPUT_METHOD_LINK_TYPE {
my ($self, $type) = @_;
if (defined $type) {
$self->{_input_method_link_type} = $type;
}
return $self->{_input_method_link_type};
}
sub COMPARA_DB {
my ($self, $db) = @_;
if (defined $db) {
$self->{_compara_db} = $db;
}
return $self->{_compara_db};
}
sub QUERY_CORE_DB {
my ($self, $db) = @_;
if (defined $db) {
$self->{_query_core_db} = $db;
}
return $self->{_query_core_db};
}
sub TARGET_CORE_DB {
my ($self, $db) = @_;
if (defined $db) {
$self->{_target_core_db} = $db;
}
return $self->{_target_core_db};
}
sub TARGET_SPECIES_EXTERNAL_DBNAME {
my ($self, $target_species_external_dbname) = @_;
if (defined $target_species_external_dbname) {
$self->{_target_species_external_dbname} = $target_species_external_dbname;
}
return $self->{_target_species_external_dbname};
}
#
# chain filtering
#
sub MIN_CHAIN_ANNOTATION_OVERLAP {
my ($self, $val) = @_;
if (defined $val) {
$self->{_min_chain_annotation_overlap} = $val;
}
if (not exists($self->{_min_chain_annotation_overlap})) {
$self->{_min_chain_annotation_overlap} = 1;
}
return $self->{_min_chain_annotation_overlap};
}
sub OVERLAP_CHAIN_FILTER {
my ($self, $val) = @_;
if (defined $val) {
$self->{_overlap_chain_filter} = $val;
}
if (not exists($self->{_overlap_chain_filter})) {
$self->{_overlap_chain_filter} = 0;
}
return $self->{_overlap_chain_filter};
}
#
# transcript editing and filtering
#
sub MAX_EXON_READTHROUGH_DIST {
my ($self, $val) = @_;
if (defined $val) {
$self->{_max_exon_readthrough_dist} = $val;
}
return $self->{_max_exon_readthrough_dist};
}
sub MAX_EDITABLE_STOPS_PRIMARY {
my ($self, $val) = @_;
if (defined $val) {
$self->{_max_editable_stops} = $val;
}
return $self->{_max_editable_stops};
}
sub MAX_EDITABLE_STOPS_NON_PRIMARY {
my ($self, $val) = @_;
if (defined $val) {
$self->{_max_editable_stops_non_primary} = $val;
}
return $self->{_max_editable_stops_non_primary};
}
sub FULLY_GAP_EXONS {
my ($self, $val) = @_;
if (defined $val) {
$self->{_fully_gap_exons} = $val;
}
return $self->{_fully_gap_exons};
}
sub PARTIAL_GAP_EXONS {
my ($self, $val) = @_;
if (defined $val) {
$self->{_partial_gap_exons} = $val;
}
return $self->{_partial_gap_exons};
}
sub MIN_COVERAGE {
my ($self, $val) = @_;
if (defined $val) {
$self->{_min_coverage} = $val;
}
return $self->{_min_coverage};
}
sub MIN_NON_GAP {
my ($self, $val) = @_;
if (defined $val) {
$self->{_min_non_gap} = $val;
}
return $self->{_min_non_gap};
}
sub LONGEST_SOURCE_TRANSCRIPT {
my ($self, $val) = @_;
if (defined $val) {
$self->{_longest_source} = $val;
}
return $self->{_longest_source};
}
sub OUTPUT_DIR {
my ($self, $val) = @_;
if (defined $val) {
$self->{_output_dir} = $val;
}
return $self->{_output_dir};
}
sub KILL_LIST {
my ($self, $val) = @_;
if (defined $val) {
$self->{_kill_list} = $val;
}
return $self->{_kill_list};
}
sub get_external_db_id {
my ($self) = @_;
my $query_db = Bio::EnsEMBL::DBSQL::DBAdaptor->
new(%{$self->QUERY_CORE_DB});
my $external_dbname = $self->TARGET_SPECIES_EXTERNAL_DBNAME;
my $sth = $query_db->prepare(
"SELECT external_db_id ".
"FROM external_db ".
"WHERE db_name = ?"
);
$sth->bind_param(1, $external_dbname, SQL_VARCHAR);
$sth->execute();
my ($external_db_id) = $sth->fetchrow();
$sth->finish();
return $external_db_id;
}
###########################################
# Local class Result to encapsulate results
############################################
package Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes::GeneRecord;
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
sub new {
my ($class, @args) = @_;
my $self = bless {}, $class;
my ($name, $gene, $source_trans) = rearrange
(['NAME', 'GENE', 'SOURCE_TRANSCRIPTS'], @args);
$self->name($name) if defined $name;
$self->gene($gene) if defined $gene;
$self->source_transcripts($source_trans) if defined $source_trans;
$self->good_source_transcripts([]);
$self->projected_transcripts([]);
return $self;
}
sub clone {
my ($self) = @_;
my $new_one = Bio::EnsEMBL::Analysis::RunnableDB::WGA2Genes::GeneRecord
->new(-source_transcripts => $self->source_transcripts,
-gene => $self->gene);
return $new_one;
}
#################################################################
# FUNCTION : get_all_transcript_cds_features
#
# Description :
# Returns a list of Bio::EnsEMBL::Features representing the
# projection of the coding regions of the given transcripts
# onto the query
#################################################################
sub _get_all_transcript_cds_features {
my ($self) = @_;
my (@orig_cds_feats, @merged_cds_feats);
foreach my $tran (@{$self->source_transcripts}) {
foreach my $exon (@{$tran->get_all_translateable_Exons}) {
push @orig_cds_feats, Bio::EnsEMBL::Feature->
new(-start => $exon->start,
-end => $exon->end,
-slice => $exon->slice);
}
}
foreach my $feat (sort {$a->start <=> $b->start} @orig_cds_feats) {
if (not @merged_cds_feats or
$feat->start > $merged_cds_feats[-1]->end) {
push @merged_cds_feats, $feat;
} else {
if ($feat->end > $merged_cds_feats[-1]->end) {
$merged_cds_feats[-1]->end($feat->end);
}
}
}
$self->transcript_cds_feats(\@merged_cds_feats);
}
##################
# get/sets
#################
sub gene {
my ($self, $val) = @_;
if (defined $val) {
$self->{_gene} = $val;
}
return $self->{_gene};
}
sub name {
my ($self, $val) = @_;
if (defined $val) {
$self->{_name} = $val;
}
return $self->{_name};
}
sub source_transcripts {
my ($self, $val) = @_;
if (defined $val) {
$self->{_source_transcripts} = $val;
$self->_get_all_transcript_cds_features;
}
return $self->{_source_transcripts};
}
sub transcript_cds_feats {
my ($self, $val) = @_;
if (defined $val) {
$self->{_transcript_cds} = $val;
}
return $self->{_transcript_cds};
}
sub good_source_transcripts {
my ($self, $val) = @_;
if (defined $val) {
$self->{_good_source_transcripts} = $val;
}
return $self->{_good_source_transcripts};
}
sub projected_transcripts {
my ($self, $val) = @_;
if (defined $val) {
$self->{_projected_transcripts} = $val;
}
return $self->{_projected_transcripts};
}
1;