Raw content of Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::SWEmbl
# Ensembl module for Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::SWEmbl
#
# Copyright (c) 2008 Ensembl
#
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::SWEmbl
=head1 SYNOPSIS
my $runnable = Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::SWEmbl->new
(
-db => $db,
-input_id => 'chromosome::20:1:100000:1',
-analysis => $analysis,
);
$runnable->fetch_input;
$runnable->run;
$runnable->write_output;
=head1 DESCRIPTION
This module provides an interface between the ensembl functional genomics
database and the Runnable SWEmbl which wraps the ChIP-Seq peak caller SWEmbl.
=head1 AUTHOR
Stefan Graf, Ensembl Functional Genomics - /
=head1 CONTACT
Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::SWEmbl;
use strict;
use warnings;
use Data::Dumper;
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Analysis::RunnableDB::Funcgen;
use Bio::EnsEMBL::Analysis::Runnable::Funcgen::SWEmbl;
use Bio::EnsEMBL::Analysis::Config::General;
use Bio::EnsEMBL::Analysis::Config::Funcgen::SWEmbl;
use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::Funcgen);
=head2 new
Arg [1] :
Arg [2] :
Description : Instantiates new SWEmbl runnabledb
Returntype : Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::SWEmbl object
Exceptions :
Example :
=cut
sub new {
print "Analysis::RunnableDB::Funcgen::SWEmbl::new\n";
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
$self->read_and_check_config($CONFIG);
# make sure we have the correct analysis object
$self->check_Analysis();
# make sure we can store the correct feature_set, data_sets, and result_sets
$self->check_Sets();
return $self;
}
sub check_InputId {
warn("Analysis::RunnableDB::Funcgen::SWEmbl::check_InputId");
my ($self) = @_;
my ($ename, $file) = split(':', $self->input_id);
# set experiment
my $ea = $self->efgdb->get_ExperimentAdaptor;
my $e = $ea->fetch_by_name($ename);
my @date = (localtime)[5,4,3];
$date[0] += 1900; $date[1]++;
#print Dumper @date;
if (! defined $e) {
warn("Experiment NOT defined");
my $exp = Bio::EnsEMBL::Funcgen::Experiment->new
(
-NAME => $ename,
-GROUP => "$ENV{EFG_GROUP}",
-DATE => join('-', @date),
-PRIMARY_DESIGN_TYPE => 'binding_site_identification',
-ADAPTOR => $ea,
);
my ($g_dbid) = $self->efgdb->fetch_group_details($exp->group());
if (! $g_dbid) {
warn("Group specified does, not exist. Importing (group, location, contact)");
my $sql = "INSERT INTO experimental_group (name, location, contact) ".
"values ('$ENV{EFG_GROUP}','$ENV{EFG_LOCATION}','$ENV{EFG_CONTACT}')";
eval {
$self->efgdb->dbc->do($sql);
};
throw("Couldn't import group information. Double-check that the environment variables ".
"EFG_GROUP, EFG_LOCATION, and EFG_CONTACT are set.") if ($@);
}
($e) = @{$ea->store($exp)};
}
$self->experiment($e)
or throw("Can't fetch experiment with name ".$ename);
#print Dumper $self->experiment;
my $infile = $self->ANALYSIS_WORK_DIR.'/infiles/'.$self->input_id;
warn('INFILE: '.$infile);
$self->query($infile);
}
sub check_Sets {
warn("Analysis::RunnableDB::Funcgen::SWEmbl::check_Experiment");
my ($self) = @_;
my $set_name = $self->analysis->logic_name.'_'.$self->experiment->name();
warn("SetName: $set_name");
my ($ct_name, $ft_name) = split(/_/, $self->experiment->name());
my $feature_type = $self->efgdb->get_FeatureTypeAdaptor()->fetch_by_name($ft_name)
or throw("Feature type '$ft_name' does not exist");
$self->feature_type($feature_type);
my $cell_type = $self->efgdb->get_CellTypeAdaptor()->fetch_by_name($ct_name)
or throw("Cell type '$ct_name' does not exist");
$self->cell_type($cell_type);
my $esa = $self->efgdb->get_ExperimentalSetAdaptor();
my $eset = $esa->fetch_by_name($set_name);
if (! defined $eset){
warn("ExperimentalSet NOT defined");
$eset = Bio::EnsEMBL::Funcgen::ExperimentalSet->new
(
-name => $set_name,
-experiment => $self->experiment(),
-feature_type => $feature_type,
-cell_type => $cell_type,
-vendor => 'SOLEXA',
-format => 'SEQUENCING',
#-analysis => $self->feature_analysis,
);
warn("Storing new experimental set \'$set_name\'");
eval {
($eset) = @{$esa->store($eset)};
};
throw("Coudn't store experimental set \'$set_name\': $!") if ($@);
}
my $fsa = $self->efgdb->get_FeatureSetAdaptor();
my $fset = $fsa->fetch_by_name($set_name);
#print Dumper $fset;
if ( ! defined $fset ) {
$fset = Bio::EnsEMBL::Funcgen::FeatureSet->new
(
-analysis => $self->efg_analysis,
-feature_type => $self->feature_type,
-cell_type => $self->cell_type,
-name => $set_name,
-type => 'annotated'
);
#print Dumper $fset;
warn("Storing new feature set \'$set_name\'");
eval {
($fset) = @{$fsa->store($fset)};
};
throw("Coudn't store feature set \'$set_name\': $!") if ($@);
} else {
warn("Feature set with name $set_name already exists.");
}
# save FeatureSet
$self->feature_set($fset);
my $dsa = $self->efgdb->get_DataSetAdaptor;
my $dset = $dsa->fetch_by_name($set_name);
#print Dumper $dset;
if ( ! defined $dset ) {
$dset = Bio::EnsEMBL::Funcgen::DataSet->new
(
-SUPPORTING_SETS => [$eset],
-FEATURE_SET => $fset,
-DISPLAYABLE => 1,
-NAME => $set_name,
-SUPPORTING_SET_TYPE => 'experimental',
);
#print Dumper $dset;
warn("Storing new data set \'$set_name\'");
eval {
($dset) = @{$dsa->store($dset)}
};
throw("Coudn't store data set \'$set_name\': $!") if ($@);
} else {
warn("Data set with name $set_name already exists.");
# need to check whether ExperimentalSets and supporting_sets are the same and
# possibly add ExperimentalSet to supporting_sets
my $ssets = $dset->get_supporting_sets();
my %ssets_dbIDs = ();
map { $ssets_dbIDs{$_->dbID}='' } (@{$ssets});
$dset->add_supporting_sets([ $eset ]) if (! exists $ssets_dbIDs{$eset->dbID});
}
# save DataSet
$self->data_set($dset);
}
sub fetch_input {
my ($self) = @_;
#warn ("Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::SWEmbl::fetch_input");
# make sure the cache directory exists
my $cachedir = $self->ANALYSIS_WORK_DIR.'/cache';
if ( ! -d $cachedir) {
my $retval = system("mkdir -p $cachedir");
throw("Couldn't create cache directory $cachedir") unless ($retval == 0);
}
warn('infile: '.$self->query);
### Checking for gzip and bed format has been moved to create_input_id stage!!!
#my $open = ($self->is_gzip($self->query))? 'gzip -dc' : 'cat';
#my $cmd = $open.' '.$self->query.' |';
#warn $cmd;
### bed file format ?
#open(INFILE, "$cmd")
# or throw("Can't open gzipped infile ".$self->query);
#while () {
#
# my @line = split("\t");
# throw("Infile does not have 6 or more columns. We expect bed format: CHROM START END NAME SCORE STRAND.")
# if scalar @line < 6;
#
# throw ("1st column must contain name of seq_region (e.g. chr1 or 1)")
# unless ($line[0] =~ m/^(chr[MTXY\d]+)$/);
# throw ("2nd and 3rd column must contain start and end respectively")
# unless ($line[1] =~ m/^\d+$/ && $line[2] =~ m/^\d+$/);
# throw ("6th column must define strand (either '+' or '-')")
# unless ($line[5] =~ m/^[+-]$/);
# last;
#}
#close INFILE;
(my $cachefile = $self->query) =~ s,.*/([^/]+)$,$1,;
my $datfile = $cachedir.'/'.$cachefile;
warn('datafile: '.$datfile);
unless (-e $datfile) { # dat file already dumped sorted; skipping dump
# make sure reads are dumped in sorted order
my $sort = "gzip -dc ".$self->query.
" | sort -k1,1 -k2,2n -k3,3n | gzip -c > $datfile |";
warn("Executing $sort");
open(SORT, "$sort")
or throw("Can't open and sort gzipped file ".$self->query);
while () {warn($_)}
close SORT;
}
$self->query($datfile);
my %parameters_hash = %{$self->parameters_hash($self->efg_analysis->parameters)};
my $runnable = 'Bio::EnsEMBL::Analysis::Runnable::Funcgen::'
.$self->efg_analysis->module;
$runnable = $runnable->new
(
-query => $self->query,
-program => $self->efg_analysis->program_file,
-analysis => $self->efg_analysis,
-workdir => $self->ANALYSIS_WORK_DIR,
%parameters_hash
);
$self->runnable($runnable);
return 1;
}
sub query {
my $self = shift;
$self->{'query'} = shift if(@_);
throw("file ".$self->{'query'}. " doesn't exist")
unless ( -e $self->{'query'});
return $self->{'query'};
}
sub write_output{
print "Bio::EnsEMBL::Analysis::RunnableDB::Funcgen::SWEmbl::write_output\n";
my ($self) = @_;
if (scalar(@{$self->output}) == 0) {
warn("No features to annotate on slice ".$self->query.
" for experiment ".$self->experiment->name()."!");
return 1;
}
# store analysis, feature set and data set
# analysis was alredy been stored while checking config in read_and_check_config
#$self->adaptor('Analysis')->store($self->analysis());
if (! defined $self->feature_set->dbID) {
$self->efgdb->get_FeatureSetAdaptor->store($self->feature_set());
}
if (! defined $self->data_set->dbID) {
$self->efgdb->get_DataSetAdaptor->store($self->data_set());
}
### annotated features
my $fset = $self->feature_set;
my $fs_id = $fset->dbID();
my $af = $self->efgdb->get_AnnotatedFeatureAdaptor->fetch_all_by_FeatureSets([$fset]);
warn('No. of annotated features already stored: '.scalar(@$af).' ('.$self->query.' '.$fset->name.')');
warn('No. of annotated features to be stored: '.scalar(@{$self->output}).' ('.$self->query.')');
if (@$af) {
warn("NOT IMPORTING ".scalar(@{$self->output})." annotated features! File ".
$self->query." already has been processed; contains ".scalar(@$af).
" annotated features of feature set ".$fset->dbID.".");
return 1;
} else {
my @af;
my $sa = $self->efgdb->get_SliceAdaptor();
my %slice;
foreach my $ft (@{$self->output}){
#print Dumper $ft;
my ($seqid, $start, $end, $score) = @{$ft};
#print Dumper ($seqid, $start, $end, $score);
# skip mito calls
next if ($seqid =~ m/^M/);
unless (exists $slice{"$seqid"}) {
$slice{"$seqid"} = $sa->fetch_by_region('chromosome', $seqid);
}
my $af = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new
(
-slice => $slice{"$seqid"},
-start => $start,
-end => $end,
-strand => 0,
-display_label => $self->efg_analysis->logic_name,
-score => $score,
-feature_set => $fset,
);
push(@af, $af);
}
$self->efgdb->get_AnnotatedFeatureAdaptor->store(@af);
}
return 1;
}
1;