Raw content of Bio::EnsEMBL::Analysis::RunnableDB::WGA2GenesDirect
# Cared for by Ensembl
#
# Copyright GRL & EBI
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::WGA2GenesDirect
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::DBAdaptor->new($locator);
my $genscan = Bio::EnsEMBL::Analysis::RunnableDB::WGA2GenesDirect
->new (-db => $pipelinedb,
-input_id => $input_id
-analysis => $analysis );
$genscan->fetch_input();
$genscan->run();
$genscan->write_output(); #writes to stdout
=head1 DESCRIPTION
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::WGA2GenesDirect;
use vars qw(@ISA);
use strict;
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Analysis::Config::General;
use Bio::EnsEMBL::Analysis::Config::WGA2GenesDirect;
#use Bio::EnsEMBL::Analysis::Tools::Logger;
use Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold;
use Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffoldDirect;
use Bio::EnsEMBL::Analysis::Tools::ClusterFilter;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils
qw(replace_stops_with_introns);
@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB);
############################################################
=head2 new
Title : fetch_input
=cut
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
$self->read_and_check_config($WGA2GENES_CONFIG_BY_LOGIC);
return $self;
}
############################################################
=head2 fetch_input
Title : fetch_input
=cut
sub fetch_input {
my( $self) = @_;
my $input_id = $self->input_id;
throw("No input id") unless defined($input_id);
print "YOUR INPUT ID:",$input_id,"\n";
my $q_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
new(%{$self->QUERY_CORE_DB});
my $t_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
new(%{$self->TARGET_CORE_DB});
my $compara_dbh = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->
new(%{$self->COMPARA_DB});
my $query_species =
$q_dbh->get_MetaContainerAdaptor->get_Species->binomial;
my $target_species =
$t_dbh->get_MetaContainerAdaptor->get_Species->binomial;
my $gdb_adap = $compara_dbh->get_GenomeDBAdaptor;
my $q_gdb = $gdb_adap->fetch_by_name_assembly($query_species);
my $t_gdb = $gdb_adap->fetch_by_name_assembly($target_species);
########
# check that the default assembly for the query and target agrees
# with that for the method_link_species_set GenomeDBs
########
my ($q_assembly_version, $t_assembly_version);
eval {
$q_assembly_version = $q_dbh->get_CoordSystemAdaptor->
fetch_by_name('toplevel',
$q_gdb->assembly);
$t_assembly_version = $t_dbh->get_CoordSystemAdaptor->
fetch_by_name('toplevel',
$t_gdb->assembly);
};
$@ and do {
throw("Had trouble fetching coord systems for ".
$q_gdb->assembly . " and " .
$t_gdb->assembly . " from core dbs: $@");
};
########
# fetch the genes; need to work in the coordinate space of the
# top-level slice to be consistent with compara
########
my $gene = $q_dbh->get_GeneAdaptor->fetch_by_stable_id($input_id);
$self->_check_gene($gene);
$self->gene($gene);
#########
# get the compara data: MethodLinkSpeciesSet, reference DnaFrag,
# and all GenomicAlignBlocks
#########
my $mlss = $compara_dbh->get_MethodLinkSpeciesSetAdaptor
->fetch_by_method_link_type_GenomeDBs($self->INPUT_METHOD_LINK_TYPE,
[$q_gdb, $t_gdb]);
throw("No MethodLinkSpeciesSet for :\n" .
$self->INPUT_METHOD_LINK_TYPE . "\n" .
$query_species . "\n" .
$target_species)
if not $mlss;
my $dnafrag = $compara_dbh->get_DnaFragAdaptor->
fetch_by_GenomeDB_and_name($q_gdb,
$self->gene->slice->seq_region_name);
my $gaba = $compara_dbh->get_GenomicAlignBlockAdaptor;
my $gen_al_blocks =
$gaba->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss,
$dnafrag,
$gene->start,
$gene->end);
my (%chains, @chains);
foreach my $block (@$gen_al_blocks) {
my $qga = $block->reference_genomic_align;
my ($tga) = @{$block->get_all_non_reference_genomic_aligns};
###########################################################
###### INVESTIGATE: do we want to just use level 1 chains?
###########################################################
#next if $qga->level_id != 1 or $tga->level_id != 1;
# fetch the target slice for later reference
if (not exists $self->target_slices->{$tga->dnafrag->name}) {
$self->target_slices->{$tga->dnafrag->name} =
$t_dbh->get_SliceAdaptor->fetch_by_region('toplevel',
$tga->dnafrag->name);
}
if ($block->reference_genomic_align->dnafrag_strand < 0) {
$block->reverse_complement;
}
push @{$chains{$block->group_id}}, $block;
}
foreach my $chain_id (keys %chains) {
push @chains, [
sort {
$a->reference_genomic_align->dnafrag_start <=>
$b->reference_genomic_align->dnafrag_start;
} @{$chains{$chain_id}}
];
}
$self->genomic_align_block_chains(\@chains);
}
############################################################
=head2 run
Title : run
=cut
sub run {
my ($self) = @_;
my @res_tran;
my $tran_stable_id;
my @final_tran;
print scalar(@{$self->genomic_align_block_chains}),"\n";
foreach my $chain (@{$self->genomic_align_block_chains}) {
my $gene_scaffold = Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold->new(
# my $gene_scaffold = Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffoldDirect->new(
-genomic_align_blocks => $chain,
-from_slice => $self->gene->slice,
-to_slices => $self->target_slices,
-transcripts => $self->good_transcripts,
-max_readthrough_dist => $self->MAX_EXON_READTHROUGH_DIST,
-direct_target_coords => 1,
);
foreach my $tran (@{$self->good_transcripts}) {
$tran_stable_id = $tran->stable_id;
my $proj_trans =
# Remember to remove the 1 in place transcripts as right now this is only used for testing purposes
#$gene_scaffold->place_transcript($tran,1);
$gene_scaffold->place_transcript($tran);
if ($proj_trans) {
push @res_tran, $proj_trans;
}
}
}
if ($self->TRANSCRIPT_FILTER){
@res_tran = @{$self->filter->filter_results(\@res_tran)};
}
foreach my $res_tran (@res_tran){
$res_tran = $self->process_transcript($res_tran,
$tran_stable_id);
push @final_tran, $res_tran;
}
# create new gene object for each transcript
print "At the end of RUN, we had ", scalar(@final_tran), " transcripts\n";
$self->output(\@final_tran);
}
############################################################
=head2 write_output
Title : write_output
=cut
sub write_output {
my ($self) = @_;
my $trans_count = 0;
my $t_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
new(%{$self->TARGET_CORE_DB});
my $t_gene_adaptor = $t_dbh->get_GeneAdaptor();
foreach my $t (@{$self->output}) {
$t->analysis($self->analysis);
my $gene = Bio::EnsEMBL::Gene->new( -analysis => $self->analysis,
-biotype => 'protein_coding',);
foreach my $tsf ( @{ $t->get_all_supporting_features }){
$tsf->analysis($self->analysis);
}
foreach my $exon (@{$t->get_all_Exons()}){
$exon->analysis($self->analysis);
foreach my $esf (@{$exon->get_all_supporting_features()}){
$esf->analysis($self->analysis);
}
}
$gene->add_Transcript($t);
$t_gene_adaptor->store($gene);
$trans_count++;
print "TRANSCRIPT:\n";
foreach my $e (@{$t->get_all_Exons}) {
printf("%s\tEnsembl\tExon\t%d\t%d\t%d\t%d\t%d\n", $e->slice->seq_region_name, $e->start, $e->end, $e->strand, $e->phase, $e->end_phase);
}
my $seqio = Bio::SeqIO->new(-format => 'fasta',
-fh => \*STDOUT);
$seqio->write_seq($t->translate);
}
print "For gene " . $self->input_id . " you stored ", $trans_count, " transcripts\n";
# to do: write gene back to core target database
}
######################################
# internal methods
#####################################
sub _check_gene {
my ($self, $gene) = @_;
my @good_transcripts;
foreach my $t (@{$gene->get_all_Transcripts}){
#print "Transcript Start: ",$t->start, " END: ", $t->end,"\n";
#print "translation: ",$t->translateable_seq,"\n";
if ((length($t->translateable_seq) % 3) == 0){
push @good_transcripts, $t;
}
else{
warn ("Gene ", $gene->display_id(), " contains no valid transcripts");
#push @good_transcripts, $t;
#print "Done\n";
#exit(1);
}
}
warn ("Gene ", $gene->display_id(), " contains no valid transcripts") if (scalar(@good_transcripts) == 0);
$self->good_transcripts(\@good_transcripts);
# throw an exception here if:
# 1. gene is not protein-coding (translations in all transcripts)
# 2. gene contains at least one transcript with a coding region
# that is not multiple-of-three in length
}
#################################################################
# FUNCTION : process_transcript
#
# Description:
# Subjects the given transcript to a few tests, returning
# the transcript if they succeed, undef if not. If the
# transcript contains less than $max_stops stops, these
# are "spliced out"; otherwise the transcripts is rejected
#################################################################
sub process_transcript {
my ($self,
$tran,
$source_id
) = @_;
return 0 if not $tran;
my (@processed_transcripts);
my @exons = @{$tran->get_all_Exons};
my ($tsf) = @{$tran->get_all_supporting_features};
my $pep = $tran->translate->seq;
if (CORE::length($pep) == 0) {
logger_info("Rejecting proj of $source_id because was just a stop codon");
return 0;
}
##################
# number of stops is non-zero but acceptable. Need to
# operate on the transcript to jump over the stops
$tran = replace_stops_with_introns($tran);
return $tran;
}
###########################
# gets/sets
###########################
sub genomic_align_block_chains {
my ($self, $val) = @_;
if (defined $val) {
$self->{_gen_al_chains} = $val;
}
return $self->{_gen_al_chains};
}
sub gene {
my ($self, $val) = @_;
if (defined $val) {
$self->{_gene_recs} = $val;
}
return $self->{_gene_recs};
}
sub good_transcripts {
my ($self, $val) = @_;
if (defined $val) {
$self->{_good_transcripts} = $val;
}
return $self->{_good_transcripts};
}
sub target_slices {
my ($self, $val) = @_;
if (defined $val) {
$self->{_target_slices} = $val;
}
if (not exists $self->{_target_slices}) {
$self->{_target_slices} = {};
}
return $self->{_target_slices};
}
####################################
# config variable holders
####################################
sub read_and_check_config {
my ($self, $hash) = @_;
$self->SUPER::read_and_check_config($hash);
my $logic = $self->analysis->logic_name;
foreach my $var (qw(INPUT_METHOD_LINK_TYPE
QUERY_CORE_DB
TARGET_CORE_DB
COMPARA_DB)) {
throw("You must define $var in config for logic '$logic'" .
" or in the DEFAULT entry")
if not $self->$var;
}
# filter does not have to be defined, but if it is, it should
# give details of an object and its parameters
if ($self->TRANSCRIPT_FILTER) {
if (not ref($self->TRANSCRIPT_FILTER) eq "HASH" or
not exists($self->TRANSCRIPT_FILTER->{OBJECT}) or
not exists($self->TRANSCRIPT_FILTER->{PARAMETERS})) {
throw("FILTER in config foR '$logic' must be a hash ref with elements:\n" .
" OBJECT : qualified name of the filter module;\n" .
" PARAMETERS : anonymous hash of parameters to pass to the filter");
} else {
my $module = $self->TRANSCRIPT_FILTER->{OBJECT};
my $pars = $self->TRANSCRIPT_FILTER->{PARAMETERS};
(my $class = $module) =~ s/::/\//g;
eval{
require "$class.pm";
};
throw("Couldn't require ".$class." Exonerate2Genes:require_module $@") if($@);
$self->filter($module->new(%{$pars}));
}
}
}
#
# core options
#
sub INPUT_METHOD_LINK_TYPE {
my ($self, $type) = @_;
if (defined $type) {
$self->{_input_method_link_type} = $type;
}
return $self->{_input_method_link_type};
}
sub COMPARA_DB {
my ($self, $db) = @_;
if (defined $db) {
$self->{_compara_db} = $db;
}
return $self->{_compara_db};
}
sub QUERY_CORE_DB {
my ($self, $db) = @_;
if (defined $db) {
$self->{_query_core_db} = $db;
}
return $self->{_query_core_db};
}
sub TARGET_CORE_DB {
my ($self, $db) = @_;
if (defined $db) {
$self->{_target_core_db} = $db;
}
return $self->{_target_core_db};
}
#
# transcript editing and filtering
#
sub TRANSCRIPT_FILTER {
my ($self, $val) = @_;
if (defined $val) {
$self->{_transcript_filter} = $val;
}
return $self->{_transcript_filter};
}
sub filter {
my ($self, $val) = @_;
if ($val) {
$self->{_filter} = $val;
}
return $self->{_filter};
}
sub MAX_EXON_READTHROUGH_DIST {
my ($self, $val) = @_;
if (defined $val) {
$self->{_max_ex_rt_dist} = $val;
}
return $self->{_max_ex_rt_dist};
}
sub MIN_COVERAGE {
my ($self, $val) = @_;
if (defined $val) {
$self->{_min_coverage} = $val;
}
return $self->{_min_coverage};
}
1;