# Raw content of Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator;
=head1 SYNOPSIS
my $orthologueanalysis = Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator->new(
-analysis => $analysis_obj,
);
$orthologueanalysis->fetch_input();
$orthologueanalysis->run();
$orthologueanalysis->output();
$orthologueanalysis->write_output();
=head1 DESCRIPTION
This object wraps Bio::EnsEMBL::Analysis::Runnable::OrthologueEvaluator and is
used to fetch the input from different databases as well as writing
the results to the database.
=head1 CONTACT
Post general queries to B<ensembl-dev@ebi.ac.uk>
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a '_'
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator;
use strict;
use Bio::SeqIO;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::Analysis::Config::GeneBuild::OrthologueEvaluator;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Pipeline::Utils::InputIDFactory;
use Bio::EnsEMBL::Pipeline::Utils::InputIDFactory;
use Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Analysis::Config::Exonerate2Genes;
use Bio::EnsEMBL::Pipeline::DBSQL::RuleAdaptor;
use Bio::EnsEMBL::Pipeline::Rule;
use vars qw(@ISA);
@ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB);
# Constructor.
#
# Delegates object creation to the parent class, then validates the
# compara registry configuration via read_and_check_config() and switches
# verbose mode on.
#
# Args    : passed through unchanged to the parent constructor
# Returns : the newly created object
sub new {
    my $class = shift;

    my $self = $class->SUPER::new(@_);

    $self->read_and_check_config;
    $self->verbose(1);

    return $self;
}
# Fetches the genes of the requested biotypes that lie on this job's slice
# and appends them to the gene list held by genes().
#
# Arg [1] : species alias used to look up the 'core' DBAdaptor in the Registry
# Arg [2] : arrayref of biotype strings
# Returns : nothing; the fetched genes are available through $self->genes
# Throws  : if no DBAdaptor is registered for the species alias, or if the
#           slice named by slice_name() cannot be fetched from that database
sub get_initial_geneset {
    my ( $self, $species_alias, $biotypes ) = @_;

    my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor( $species_alias, 'core' );
    unless ($dba) {
        throw("could not get database adaptor for species : $species_alias ");
    }

    my $sa         = $dba->get_SliceAdaptor();
    my $slice_name = $self->slice_name;

    print "Fetching genes out of ". $dba->dbname . " @ " . $dba->host ." : " . $dba->port ." : " . $slice_name . "\n";

    # Fail with a readable message here rather than with "Can't call method
    # on an undefined value" further down when the slice does not exist.
    my $qy_slice = $sa->fetch_by_name($slice_name);
    unless ($qy_slice) {
        throw( "could not fetch slice $slice_name from database " . $dba->dbname );
    }

    for my $bt (@$biotypes) {
        my $genes_on_slice = $qy_slice->get_all_Genes_by_type($bt);
        print scalar(@$genes_on_slice) . " genes of biotype $bt found on slice ( $species_alias ) \n";
        $self->genes($genes_on_slice);
    }

    return;
}
# Stores input ids for the follow-up analysis in the pipeline database.
#
# The target analysis is "Submit_" followed by post_logic_name(), and the
# input id type is "file_" followed by post_logic_name(). Ids that are
# already listed for that analysis are skipped.
#
# Arg [1] : arrayref of input id strings
# Throws  : if the InputIDFactory cannot find the submit analysis
sub upload_input_ids {
    my ( $self, $input_ids ) = @_;

    my $next_logic_name = $self->post_logic_name;
    my $submit_analysis = "Submit_" . $next_logic_name;

    print "Submit-analysis for next analysis is : $submit_analysis\n";
    print "Logic-name for next analysis is : " . $next_logic_name . "\n";

    # A dedicated pipeline adaptor is built from $self->db; the original
    # author noted that the Registry would otherwise override the adaptor.
    $self->pipeline_adaptor( $self->db );
    my $pipeline_db = $self->pipeline_adaptor;

    my $input_id_type = "file_" . $next_logic_name;

    my $id_factory = Bio::EnsEMBL::Pipeline::Utils::InputIDFactory->new(
        -db            => $pipeline_db,
        -logic_name    => $submit_analysis,
        -file          => 1,
        -input_id_type => $input_id_type,
    );

    throw( "Cant find analysis $submit_analysis in db ")
        unless $id_factory->get_analysis( $submit_analysis, $input_id_type, 1 );

    # Look up which input ids the database already holds for this analysis.
    my $analysis =
        $pipeline_db->get_AnalysisAdaptor->fetch_by_logic_name($submit_analysis);
    my $stored_ids =
        $pipeline_db->get_StateInfoContainer->list_input_id_by_Analysis($analysis);

    # Only key existence matters for the lookup below.
    my %is_stored;
    @is_stored{ @$stored_ids } = ();

    my @input_ids_not_stored;
    for my $candidate (@$input_ids) {
        next if exists $is_stored{$candidate};
        push @input_ids_not_stored, $candidate;
    }

    my $already_stored = scalar(@$input_ids) - scalar(@input_ids_not_stored);
    print $already_stored . " input ids already stored in db " . $pipeline_db->dbname . "\n";

    $id_factory->input_ids( \@input_ids_not_stored );
    $id_factory->store_input_ids;

    print scalar(@input_ids_not_stored) . " input-ids uploaded into " . $pipeline_db->dbname . "\n";
}
# Writes a list of sequence objects to FASTA files holding at most
# $chunk_size sequences each, and returns the names of the files written.
#
# Arg [1] : arrayref of sequence objects; each one is passed to the
#           write_seq method of a Bio::SeqIO stream
# Arg [2] : (optional) output directory; when omitted, the QUERYSEQS entry
#           configured for post_logic_name() in $EXONERATE_CONFIG_BY_LOGIC
# Arg [3] : (optional) file name prefix; defaults to slice_name(), then to
#           input_id()
# Arg [4] : (optional) file name suffix; defaults to ".fa"
# Arg [5] : (optional) number of sequences per file; defaults to 5
# Returns : arrayref of file names (without the directory part); returns
#           nothing when the sequence list is empty
# Throws  : if no output directory is configured, if the directory cannot
#           be created, or if no file name prefix can be derived
#
# NOTE(review): the loop header was reconstructed from a garbled source
# line. It starts a new file whenever the sequence index is a multiple of
# $chunk_size, matching the surviving loop body - confirm against the
# upstream module.
sub chunk_and_write_fasta_sequences {
    my ( $self, $tref, $base_dir, $file_prefix, $file_suffix, $chunk_size ) = @_;

    $chunk_size = 5 unless $chunk_size;

    print scalar(@$tref) . " sequences to write \n";
    return if scalar(@$tref) == 0;

    unless ($base_dir) {
        my $conf = $$EXONERATE_CONFIG_BY_LOGIC{ $self->post_logic_name }{QUERYSEQS};
        if ($conf) {
            $base_dir = $conf;
        }
        else {
            throw( "There's no output-dir configured in Exonerate2Genes.pm " .
                   "for analysis " . $self->post_logic_name . " check the config\n" );
        }
    }

    $base_dir = $base_dir . "/" unless $base_dir =~ m/\/$/;

    unless ( -e $base_dir ) {
        # The built-in mkdir keeps the path away from a shell. Another job
        # may create the directory between the test above and this call, so
        # only give up if the directory still does not exist afterwards.
        unless ( mkdir $base_dir ) {
            my $mkdir_error = $!;
            throw("Could not create directory $base_dir : $mkdir_error\n")
                unless -d $base_dir;
        }
    }

    unless ($file_prefix) {
        if ( $self->slice_name ) {
            $file_prefix = $self->slice_name;
        }
        elsif ( $self->input_id ) {
            $file_prefix = $self->input_id;
        }
        else {
            throw("Error - can't create file name because of missing slice-name or input_id\n");
        }
    }

    $file_suffix = ".fa" unless $file_suffix;

    print "writing squences to $base_dir\n";

    my @filenames;
    my $seq_file;
    my $fcnt = 0;

    for my $i ( 0 .. $#{$tref} ) {

        # Start a new output file at every chunk boundary.
        if ( $i % $chunk_size == 0 ) {
            $seq_file->close() if defined $seq_file;
            $fcnt++;

            # these filenames are stored as input_ids in refdb
            my $file_name = $file_prefix . "_" . $fcnt . $file_suffix;
            push @filenames, $file_name;

            my $path = $base_dir . $file_name;
            $seq_file = Bio::SeqIO->new(
                -file   => ">$path",
                -format => 'fasta'
            );
        }

        $seq_file->write_seq( $tref->[$i] );
    }

    # The list is non-empty here, so a stream was opened for index 0.
    $seq_file->close();

    return \@filenames;
}
# Accumulating accessor for the gene list.
#
# Arg [1] : (optional) arrayref of genes; its elements are appended to the
#           list already stored on the object
# Returns : the stored arrayref, or undef when nothing has been added yet
sub genes {
    my $self      = shift;
    my $new_genes = shift;

    if ($new_genes) {
        push @{ $self->{_genes} }, @{$new_genes};
    }

    return $self->{_genes};
}
# Getter/setter for the first species.
#
# Arg [1] : (optional) new value; only stored when it is a true value
# Returns : the currently stored value
sub species_1 {
    my $self = shift;
    my ($species) = @_;

    if ($species) {
        $self->{_species_1} = $species;
    }

    return $self->{_species_1};
}
# Getter/setter for the second species.
#
# Arg [1] : (optional) new value; only stored when it is a true value
# Returns : the currently stored value
sub species_2 {
    my $self = shift;
    my ($species) = @_;

    if ($species) {
        $self->{_species_2} = $species;
    }

    return $self->{_species_2};
}
# Derives the logic name of the follow-up analysis from the input id.
#
# Returns : the part of input_id() before its first ':'; the whole
#           input id when it contains no ':'
sub post_logic_name {
    my $self = shift;

    # Work on a copy of the input id and cut it at the first colon.
    ( my $logic_name = $self->input_id ) =~ s/(^.*?)(\:.*)/$1/;

    return $logic_name;
}
# Derives the slice name from the input id.
#
# Returns : the part of input_id() after its first ':'; the whole
#           input id when it contains no ':'
sub slice_name {
    my $self = shift;

    # Work on a copy of the input id and drop everything up to and
    # including the first colon.
    ( my $name = $self->input_id ) =~ s/(^.*?\:)(.*)/$2/;

    return $name;
}
# Placeholder run step: unpacks the object and performs no work.
# NOTE(review): presumably meant to be overridden by subclasses - confirm.
sub run {
my ($self) = @_;
}
# Placeholder output step: unpacks its arguments and writes nothing.
# The second argument ($missing_seq) is accepted but not used here.
# NOTE(review): presumably meant to be overridden by subclasses - confirm.
sub write_output{
my ($self,$missing_seq) = @_;
}
# Loads the compara registry file and checks the registered databases.
#
# The file location is read from LOCATION_OF_COMPARA_REGISTRY_FILE in
# $MAIN_CONFIG. After loading it into the Registry, the schema version of
# every registered DBAdaptor is collected; if more than one version is in
# use, a warning is issued and the database names are printed per version.
#
# Throws : if the configured registry file does not exist
sub read_and_check_config {
    my ($self) = @_;

    my $registry_file = $$MAIN_CONFIG{LOCATION_OF_COMPARA_REGISTRY_FILE};

    # check if config file exists
    unless ( -e $registry_file ) {
        throw("Your compara-registry-file LOCATION_OF_COMPARA_REGISTRY_FILE does not exist !!".
              "\nCheck your config !");
    }

    print STDERR "reading compara-config file : $registry_file\n";

    # check compara configuration file and schema-versions of dbs
    Bio::EnsEMBL::Registry->load_all( $registry_file, 1, 1 );

    # Group the database names by the schema version they report.
    my %dbnames_for_schema;
    for my $adaptor ( @{ Bio::EnsEMBL::Registry->get_all_DBAdaptors() } ) {
        my $schema_version = $adaptor->get_MetaContainer->get_schema_version;
        push @{ $dbnames_for_schema{$schema_version} }, $adaptor->dbname;
    }

    if ( keys %dbnames_for_schema > 1 ) {
        warning("You're using databases with different schemas - this can cause problems ...\n" );
        for my $schema_version ( keys %dbnames_for_schema ) {
            print "schema $schema_version:\n". join("\n" , @{ $dbnames_for_schema{$schema_version} } ) . "\n\n";
        }
    }
}
# Getter/setter for the pipeline database adaptor.
#
# Arg [1] : (optional) database adaptor; when given, a new
#           Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor is created from its
#           host, dbname, username, password and port and stored on the
#           object
# Returns : the stored pipeline adaptor, or undef if none has been set
sub pipeline_adaptor {
    my $self = shift;
    my $db   = shift;

    # Plain getter when no database adaptor is handed in.
    return $self->{_PA} unless $db;

    my @connection_args = (
        -host   => $db->host,
        -dbname => $db->dbname,
        -user   => $db->username,
        -pass   => $db->password,
        -port   => $db->port,
    );

    $self->{_PA} =
        Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor->new(@connection_args);

    return $self->{_PA};
}
1;