Bio::EnsEMBL::Analysis::RunnableDB WGA2GenesDirect
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Analysis::RunnableDB::WGA2GenesDirect
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::Config::General
Bio::EnsEMBL::Analysis::Config::WGA2GenesDirect
Bio::EnsEMBL::Analysis::RunnableDB
Bio::EnsEMBL::Analysis::Tools::ClusterFilter
Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw ( replace_stops_with_introns )
Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold
Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffoldDirect
Bio::EnsEMBL::Compara::DBSQL::DBAdaptor
Bio::EnsEMBL::DBSQL::DBAdaptor
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Bio::SeqIO
Inherit
Bio::EnsEMBL::Analysis::RunnableDB
Synopsis
  my $db      = Bio::EnsEMBL::DBAdaptor->new($locator);
my $genscan = Bio::EnsEMBL::Analysis::RunnableDB::WGA2GenesDirect
->new (-db => $pipelinedb,
-input_id => $input_id
-analysis => $analysis );
$genscan->fetch_input();
$genscan->run();
$genscan->write_output(); #writes to stdout
Description
Methods
COMPARA_DB
No description
Code
INPUT_METHOD_LINK_TYPE
No description
Code
MAX_EXON_READTHROUGH_DIST
No description
Code
MIN_COVERAGE
No description
Code
QUERY_CORE_DB
No description
Code
TARGET_CORE_DB
No description
Code
TRANSCRIPT_FILTER
No description
Code
_check_gene
No description
Code
fetch_inputDescriptionCode
filter
No description
Code
gene
No description
Code
genomic_align_block_chains
No description
Code
good_transcripts
No description
Code
newDescriptionCode
process_transcript
No description
Code
read_and_check_config
No description
Code
runDescriptionCode
target_slices
No description
Code
write_outputDescriptionCode
Methods description
fetch_inputcode    nextTop
    Title   :   fetch_input
newcodeprevnextTop
    Title   :   fetch_input
runcodeprevnextTop
    Title   :   run
write_outputcodeprevnextTop
    Title   :   write_output
Methods code
COMPARA_DBdescriptionprevnextTop
sub COMPARA_DB {
  my ($self, $db) = @_;

  if (defined $db) {
    $self->{_compara_db} = $db;
  }

  return $self->{_compara_db};
}
INPUT_METHOD_LINK_TYPEdescriptionprevnextTop
sub INPUT_METHOD_LINK_TYPE {
  my ($self, $type) = @_;

  if (defined $type) {
    $self->{_input_method_link_type} = $type;
  }

  return $self->{_input_method_link_type};
}
MAX_EXON_READTHROUGH_DISTdescriptionprevnextTop
sub MAX_EXON_READTHROUGH_DIST {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_max_ex_rt_dist} = $val; 
  }

  return $self->{_max_ex_rt_dist};
}
MIN_COVERAGEdescriptionprevnextTop
sub MIN_COVERAGE {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_min_coverage} = $val;
  }

  return $self->{_min_coverage};
}



1;
}
QUERY_CORE_DBdescriptionprevnextTop
sub QUERY_CORE_DB {
  my ($self, $db) = @_;

  if (defined $db) {
    $self->{_query_core_db} = $db;
  }

  return $self->{_query_core_db};
}
TARGET_CORE_DBdescriptionprevnextTop
sub TARGET_CORE_DB {
  my ($self, $db) = @_;

  if (defined $db) {
    $self->{_target_core_db} = $db;
  }

  return $self->{_target_core_db};
}


#
# transcript editing and filtering
#
}
TRANSCRIPT_FILTERdescriptionprevnextTop
sub TRANSCRIPT_FILTER {
   my ($self, $val) = @_;

  if (defined $val) {
    $self->{_transcript_filter} = $val; 
  }

  return $self->{_transcript_filter};
}
_check_genedescriptionprevnextTop
sub _check_gene {
  my ($self, $gene) = @_; 

  my @good_transcripts;  

  foreach my $t (@{$gene->get_all_Transcripts}){
    #print "Transcript Start: ",$t->start, "  END: ", $t->end,"\n";
#print "translation: ",$t->translateable_seq,"\n";
if ((length($t->translateable_seq) % 3) == 0){ push @good_transcripts, $t; } else{ warn ("Gene ", $gene->display_id(), " contains no valid transcripts"); #push @good_transcripts, $t;
#print "Done\n";
#exit(1);
} } warn ("Gene ", $gene->display_id(), " contains no valid transcripts") if (scalar(@good_transcripts) == 0); $self->good_transcripts(\@good_transcripts); # throw an exception here if:
# 1. gene is not protein-coding (translations in all transcripts)
# 2. gene contains at least one transcript with a coding region
# that is not multiple-of-three in length
} #################################################################
# FUNCTION : process_transcript
#
# Description:
# Subjects the given transcript to a few tests, returning
# the transcript if they succeed, undef if not. If the
# transcript contains less than $max_stops stops, these
# are "spliced out"; otherwise the transcripts is rejected
#################################################################
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my( $self) = @_; 
  
  my $input_id = $self->input_id;  
  throw("No input id") unless defined($input_id);

  print "YOUR INPUT ID:",$input_id,"\n";

  my $q_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
      new(%{$self->QUERY_CORE_DB});
  my $t_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
      new(%{$self->TARGET_CORE_DB});
  my $compara_dbh = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->
      new(%{$self->COMPARA_DB});
  
  my $query_species = 
      $q_dbh->get_MetaContainerAdaptor->get_Species->binomial;
  my $target_species = 
      $t_dbh->get_MetaContainerAdaptor->get_Species->binomial;
  
  my $gdb_adap = $compara_dbh->get_GenomeDBAdaptor;
  my $q_gdb = $gdb_adap->fetch_by_name_assembly($query_species);
  my $t_gdb = $gdb_adap->fetch_by_name_assembly($target_species);

  ########
# check that the default assembly for the query and target agrees
# with that for the method_link_species_set GenomeDBs
########
my ($q_assembly_version, $t_assembly_version); eval { $q_assembly_version = $q_dbh->get_CoordSystemAdaptor-> fetch_by_name('toplevel', $q_gdb->assembly); $t_assembly_version = $t_dbh->get_CoordSystemAdaptor-> fetch_by_name('toplevel', $t_gdb->assembly); }; $@ and do { throw("Had trouble fetching coord systems for ". $q_gdb->assembly . " and " . $t_gdb->assembly . " from core dbs: $@"); }; ########
# fetch the genes; need to work in the coordinate space of the
# top-level slice to be consistent with compara
########
my $gene = $q_dbh->get_GeneAdaptor->fetch_by_stable_id($input_id); $self->_check_gene($gene); $self->gene($gene); #########
# get the compara data: MethodLinkSpeciesSet, reference DnaFrag,
# and all GenomicAlignBlocks
#########
my $mlss = $compara_dbh->get_MethodLinkSpeciesSetAdaptor ->fetch_by_method_link_type_GenomeDBs($self->INPUT_METHOD_LINK_TYPE, [$q_gdb, $t_gdb]); throw("No MethodLinkSpeciesSet for :\n" . $self->INPUT_METHOD_LINK_TYPE . "\n" . $query_species . "\n" . $target_species) if not $mlss; my $dnafrag = $compara_dbh->get_DnaFragAdaptor-> fetch_by_GenomeDB_and_name($q_gdb, $self->gene->slice->seq_region_name); my $gaba = $compara_dbh->get_GenomicAlignBlockAdaptor; my $gen_al_blocks = $gaba->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss, $dnafrag, $gene->start, $gene->end); my (%chains, @chains); foreach my $block (@$gen_al_blocks) { my $qga = $block->reference_genomic_align; my ($tga) = @{$block->get_all_non_reference_genomic_aligns}; ###########################################################
###### INVESTIGATE: do we want to just use level 1 chains?
###########################################################
#next if $qga->level_id != 1 or $tga->level_id != 1;
# fetch the target slice for later reference
if (not exists $self->target_slices->{$tga->dnafrag->name}) { $self->target_slices->{$tga->dnafrag->name} = $t_dbh->get_SliceAdaptor->fetch_by_region('toplevel', $tga->dnafrag->name); } if ($block->reference_genomic_align->dnafrag_strand < 0) { $block->reverse_complement; } push @{$chains{$block->group_id}}, $block; } foreach my $chain_id (keys %chains) { push @chains, [ sort { $a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start; } @{$chains{$chain_id}} ]; } $self->genomic_align_block_chains(\@chains); } ############################################################
}
filterdescriptionprevnextTop
sub filter {
  my ($self, $val) = @_;
  if ($val) {
    $self->{_filter} = $val;
  }
  return $self->{_filter};
}
genedescriptionprevnextTop
sub gene {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_gene_recs} = $val;
  }

  return $self->{_gene_recs};
}
genomic_align_block_chainsdescriptionprevnextTop
sub genomic_align_block_chains {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_gen_al_chains} = $val;
  }

  return $self->{_gen_al_chains};
}
good_transcriptsdescriptionprevnextTop
sub good_transcripts {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_good_transcripts} = $val;
  }

  return $self->{_good_transcripts};
}
newdescriptionprevnextTop
sub new {
  my ($class,@args) = @_;
  my $self = $class->SUPER::new(@args);
  
  $self->read_and_check_config($WGA2GENES_CONFIG_BY_LOGIC);

  return $self;
}

############################################################
}
process_transcriptdescriptionprevnextTop
sub process_transcript {
  my ($self, 
      $tran,
      $source_id
) = @_;
  
  return 0 if not $tran;

  my (@processed_transcripts);

  my @exons = @{$tran->get_all_Exons};
  my ($tsf) = @{$tran->get_all_supporting_features};
  my $pep = $tran->translate->seq;


  if (CORE::length($pep) == 0) {
    logger_info("Rejecting proj of $source_id because was just a stop codon");
    return 0;
  }

  ##################
# number of stops is non-zero but acceptable. Need to
# operate on the transcript to jump over the stops
$tran = replace_stops_with_introns($tran); return $tran; } ###########################
# gets/sets
###########################
}
read_and_check_configdescriptionprevnextTop
sub read_and_check_config {
  my ($self, $hash) = @_;

  $self->SUPER::read_and_check_config($hash);
 
  my $logic = $self->analysis->logic_name;

  foreach my $var (qw(INPUT_METHOD_LINK_TYPE
                      QUERY_CORE_DB
                      TARGET_CORE_DB
                      COMPARA_DB)) {

    throw("You must define $var in config for logic '$logic'" . 
          " or in the DEFAULT entry")
        if not $self->$var;
  }

 # filter does not have to be defined, but if it is, it should
# give details of an object and its parameters
if ($self->TRANSCRIPT_FILTER) { if (not ref($self->TRANSCRIPT_FILTER) eq "HASH" or not exists($self->TRANSCRIPT_FILTER->{OBJECT}) or not exists($self->TRANSCRIPT_FILTER->{PARAMETERS})) { throw("FILTER in config foR '$logic' must be a hash ref with elements:\n" . " OBJECT : qualified name of the filter module;\n" . " PARAMETERS : anonymous hash of parameters to pass to the filter"); } else { my $module = $self->TRANSCRIPT_FILTER->{OBJECT}; my $pars = $self->TRANSCRIPT_FILTER->{PARAMETERS}; (my $class = $module) =~ s/::/\//g; eval{ require "$class.pm"; }; throw("Couldn't require ".$class." Exonerate2Genes:require_module $@") if($@); $self->filter($module->new(%{$pars})); } } } #
# core options
#
}
rundescriptionprevnextTop
sub run {
  my ($self) = @_;


  my @res_tran;
  my $tran_stable_id;
  my @final_tran;
  print scalar(@{$self->genomic_align_block_chains}),"\n";
  foreach my $chain (@{$self->genomic_align_block_chains}) {
    
    my $gene_scaffold = Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffold->new(
#    my $gene_scaffold = Bio::EnsEMBL::Analysis::Tools::WGA2Genes::GeneScaffoldDirect->new(
-genomic_align_blocks => $chain, -from_slice => $self->gene->slice, -to_slices => $self->target_slices, -transcripts => $self->good_transcripts, -max_readthrough_dist => $self->MAX_EXON_READTHROUGH_DIST, -direct_target_coords => 1, ); foreach my $tran (@{$self->good_transcripts}) { $tran_stable_id = $tran->stable_id; my $proj_trans = # Remember to remove the 1 in place transcripts as right now this is only used for testing purposes
#$gene_scaffold->place_transcript($tran,1);
$gene_scaffold->place_transcript($tran); if ($proj_trans) { push @res_tran, $proj_trans; } } } if ($self->TRANSCRIPT_FILTER){ @res_tran = @{$self->filter->filter_results(\@res_tran)}; } foreach my $res_tran (@res_tran){ $res_tran = $self->process_transcript($res_tran, $tran_stable_id); push @final_tran, $res_tran; } # create new gene object for each transcript
print "At the end of RUN, we had ", scalar(@final_tran), " transcripts\n"; $self->output(\@final_tran); } ############################################################
}
target_slicesdescriptionprevnextTop
sub target_slices {
  my ($self, $val) = @_;

  if (defined $val) {
    $self->{_target_slices} = $val;
  }
  
  if (not exists $self->{_target_slices}) {
    $self->{_target_slices} = {};
  }

  return $self->{_target_slices};
}



####################################
# config variable holders
####################################
}
write_outputdescriptionprevnextTop
sub write_output {
  my ($self) = @_;
  
  my $trans_count = 0;

  my $t_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->
      new(%{$self->TARGET_CORE_DB});

  my $t_gene_adaptor = $t_dbh->get_GeneAdaptor();
 
 foreach my $t (@{$self->output}) {
   $t->analysis($self->analysis);
   
    my $gene = Bio::EnsEMBL::Gene->new( -analysis => $self->analysis,
                                        -biotype  => 'protein_coding',);
    
    foreach my $tsf ( @{ $t->get_all_supporting_features }){
      $tsf->analysis($self->analysis);
    }

    foreach my $exon (@{$t->get_all_Exons()}){
      $exon->analysis($self->analysis);
      
      foreach my $esf (@{$exon->get_all_supporting_features()}){
        $esf->analysis($self->analysis);
      }
    }

    $gene->add_Transcript($t);

    $t_gene_adaptor->store($gene);
    
    $trans_count++;

    print "TRANSCRIPT:\n";
    foreach my $e (@{$t->get_all_Exons}) {
      printf("%s\tEnsembl\tExon\t%d\t%d\t%d\t%d\t%d\n", $e->slice->seq_region_name, $e->start, $e->end, $e->strand, $e->phase, $e->end_phase);
    }
    my $seqio = Bio::SeqIO->new(-format => 'fasta',
                                -fh =>\* STDOUT);
    $seqio->write_seq($t->translate);
  }

  print "For gene " . $self->input_id . " you stored ", $trans_count, " transcripts\n";
  # to do: write gene back to core target database
} ######################################
# internal methods
#####################################
}
General documentation
No general documentation available.