Raw content of Bio::EnsEMBL::Analysis::RunnableDB::GeneBuilder
package Bio::EnsEMBL::Analysis::RunnableDB::GeneBuilder;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild;
use Bio::EnsEMBL::Analysis::Config::GeneBuild::GeneBuilder
qw(GENEBUILDER_CONFIG_BY_LOGIC);
use Bio::EnsEMBL::Analysis::Runnable::GeneBuilder;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw (rearrange);
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils qw(id coord_string lies_inside_of_slice);
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::GeneUtils qw(Gene_info attach_Analysis_to_Gene_no_support);
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils
qw(are_strands_consistent are_phases_consistent
is_not_folded all_exons_are_valid intron_lengths_all_less_than_maximum);
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils qw(exon_length_less_than_maximum);
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranslationUtils
qw(validate_Translation_coords contains_internal_stops print_Translation print_peptide);
use Bio::EnsEMBL::Analysis::Tools::Logger;
@ISA = qw (
Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild
);
=head2 new
Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::GeneBuilder
Function : instatiates a GeneBuilder object and reads and checks the config
file
Returntype: Bio::EnsEMBL::Analysis::RunnableDB::GeneBuilder
Exceptions:
Example :
=cut
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
$self->read_and_check_config($GENEBUILDER_CONFIG_BY_LOGIC);
return $self;
}
sub fetch_input{
my ($self) = @_;
#fetch sequence
$self->query($self->fetch_sequence);
#fetch genes
$self->get_Genes;
#print "Have ".@{$self->input_genes}." genes to cluster\n";
#filter genes
my @filtered_genes = @{$self->filter_genes($self->input_genes)};
#print "Have ".@filtered_genes." filtered genes\n";
#create genebuilder runnable
my $runnable = Bio::EnsEMBL::Analysis::Runnable::GeneBuilder
->new(
-query => $self->query,
-analysis => $self->analysis,
-genes => \@filtered_genes,
-output_biotype => $self->OUTPUT_BIOTYPE,
-max_transcripts_per_cluster => $self->MAX_TRANSCRIPTS_PER_CLUSTER,
-min_short_intron_len => $self->MIN_SHORT_INTRON_LEN,
-max_short_intron_len => $self->MAX_SHORT_INTRON_LEN,
-blessed_biotypes => $self->BLESSED_BIOTYPES,
);
$self->runnable($runnable);
};
sub write_output{
my ($self) = @_;
my $ga = $self->get_adaptor;
my $sucessful_count = 0;
logger_info("WRITE OUTPUT have ".@{$self->output}." genes to write");
foreach my $gene(@{$self->output}){
my $attach = 0;
if(!$gene->analysis){
my $attach = 1;
attach_Analysis_to_Gene_no_support($gene, $self->analysis);
}
if($attach == 0){
TRANSCRIPT:foreach my $transcript(@{$gene->get_all_Transcripts}){
if(!$transcript->analysis){
attach_Analysis_to_Gene_no_support($gene, $self->analysis);
last TRANSCRIPT;
}
}
}
eval{
$ga->store($gene);
};
if($@){
warning("Failed to write gene ".id($gene)." ".coord_string($gene)." $@");
}else{
$sucessful_count++;
logger_info("STORED GENE ".$gene->dbID);
}
}
if($sucessful_count != @{$self->output}){
throw("Failed to write some genes");
}
}
sub output_db{
my ($self, $db) = @_;
if($db){
$self->{output_db} = $db;
}
if(!$self->{output_db}){
my $db = $self->get_dbadaptor($self->OUTPUT_DB);
$self->{output_db} = $db;
}
return $self->{output_db};
}
sub get_adaptor{
my ($self) = @_;
return $self->output_db->get_GeneAdaptor;
}
sub get_Genes{
my ($self) = @_;
my @genes;
foreach my $db_name(keys(%{$self->INPUT_GENES})){
my $gene_db = $self->get_dbadaptor($db_name);
my $slice = $self->fetch_sequence($self->input_id, $gene_db);
my $biotypes = $self->INPUT_GENES->{$db_name};
foreach my $biotype(@$biotypes){
my $genes = $slice->get_all_Genes_by_type($biotype);
print "Retrieved ".@$genes." of type ".$biotype."\n";
push(@genes, @$genes);
}
}
$self->input_genes(\@genes);
}
sub input_genes{
my ($self, $arg) = @_;
if($arg){
throw("Need to pass input genes an arrayref not ".$arg)
if(!ref($arg) || ref($arg) ne 'ARRAY');
push(@{$self->{input_genes}}, @$arg);
}
return $self->{input_genes};
}
sub filter_genes{
my ($self, $genes) = @_;
$genes = $self->input_genes if(!$genes);
print "Have ".@$genes." to filter\n";
my @filtered;
GENE:foreach my $gene(@$genes){
#throw("Genebuilder only works with one gene one transcript structures")
# if(@{$gene->get_all_Transcripts} >= 2);
foreach my $transcript(@{$gene->get_all_Transcripts}){
if($self->validate_Transcript($transcript)){
push(@filtered, $gene);
next GENE;
}else{
print Gene_info($gene)." is invalid skipping\n";
next GENE;
}
}
}
return \@filtered;
}
sub validate_Transcript{
my ($self, $transcript) = @_;
my $slice = $self->query;
$slice = $transcript->slice if(!$slice);
my $is_valid = 0;
#basic transcript validation
unless(are_strands_consistent($transcript)){
$is_valid++;
}
unless(are_phases_consistent($transcript)){
$is_valid++;
}
unless(is_not_folded($transcript)){
$is_valid++;
}
EXON:foreach my $exon(@{$transcript->get_all_Exons}){
if(exon_length_less_than_maximum($exon, $self->MAX_EXON_LENGTH)){
next EXON;
}else{
$is_valid++;
last EXON;
}
}
if(contains_internal_stops($transcript)){
$is_valid++;
}
unless(validate_Translation_coords($transcript)){
$is_valid++;
}
return 0 if($is_valid >= 1);
return 1;
}
#CONFIG METHODS
=head2 read_and_check_config
Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::GeneBuilder
Arg [2] : hashref from config file
Function : call the superclass method to set all the varibles and carry
out some sanity checking
Returntype: N/A
Exceptions: throws if certain variables arent set properlu
Example :
=cut
sub read_and_check_config{
my ($self, $hash) = @_;
$self->SUPER::read_and_check_config($hash);
#######
#CHECKS
#######
foreach my $var(qw(INPUT_GENES
OUTPUT_DB)){
throw("RunnableDB::GeneBuilder $var config variable is not defined")
unless($self->$var);
}
my @keys = keys(%{$self->INPUT_GENES});
throw("RunnableDB::GeneBuilder INPUT_GENES has needs to contain values") if(!@keys);
my %unique;
foreach my $key(@keys){
my $biotypes = $self->INPUT_GENES->{$key};
foreach my $biotype(@$biotypes){
if(!$unique{$biotype}){
$unique{$biotype} = $key;
}else{
if($self->BLESSED_BIOTYPES->{$biotype}){
throw($biotype." is defined for both ".$key." and ".$unique{$biotype}.
" and is found in the blessed biotype hash\n".
"This is likely to cause problems for the filtering done in ".
"the genebuilder code");
}else{
warning($biotype." appears twice in your listing, make sure this ".
"isn't for the same database otherwise it will cause issue");
}
}
}
}
$self->OUTPUT_BIOTYPE($self->analysis->logic_name) if(!$self->OUTPUT_BIOTYPE);
}
=head2 CONFIG_ACCESSOR_METHODS
Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::GeneBuilder
Arg [2] : Varies, tends to be boolean, a string, a arrayref or a hashref
Function : Getter/Setter for config variables
Returntype: again varies
Exceptions:
Example :
=cut
#Note the function of these variables is better described in the
#config file itself Bio::EnsEMBL::Analysis::Config::GeneBuild::GeneBuilder
sub INPUT_GENES{
my ($self, $arg) = @_;
if($arg){
$self->{'INPUT_GENES'} = $arg;
}
return $self->{'INPUT_GENES'};
}
sub OUTPUT_DB{
my ($self, $arg) = @_;
if($arg){
$self->{'OUTPUT_DB'} = $arg;
}
return $self->{'OUTPUT_DB'};
}
sub OUTPUT_BIOTYPE{
my ($self, $arg) = @_;
if($arg){
$self->{'OUTPUT_BIOTYPE'} = $arg;
}
return $self->{'OUTPUT_BIOTYPE'};
}
sub MAX_TRANSCRIPTS_PER_CLUSTER{
my ($self, $arg) = @_;
if($arg){
$self->{'MAX_TRANSCRIPTS_PER_CLUSTER'} = $arg;
}
return $self->{'MAX_TRANSCRIPTS_PER_CLUSTER'};
}
sub MIN_SHORT_INTRON_LEN{
my ($self, $arg) = @_;
if($arg){
$self->{'MIN_SHORT_INTRON_LEN'} = $arg;
}
return $self->{'MIN_SHORT_INTRON_LEN'};
}
sub MAX_SHORT_INTRON_LEN{
my ($self, $arg) = @_;
if($arg){
$self->{'MAX_SHORT_INTRON_LEN'} = $arg;
}
return $self->{'MAX_SHORT_INTRON_LEN'};
}
sub BLESSED_BIOTYPES{
my ($self, $arg) = @_;
if($arg){
$self->{'BLESSED_BIOTYPES'} = $arg;
}
return $self->{'BLESSED_BIOTYPES'};
}
sub MAX_EXON_LENGTH{
my ($self, $arg) = @_;
if($arg){
$self->{'MAX_EXON_LENGTH'} = $arg;
}
return $self->{'MAX_EXON_LENGTH'};
}
1;