Raw content of Bio::EnsEMBL::Analysis::RunnableDB::ExonerateTags
# package Bio::EnsEMBL::Analysis::RunnableDB::ExonerateTags
#
# Copyright EMBL-EBI/Wellcome Trust Sanger Center 2006
#
# You may distribute this module under the same terms as perl itself
#
# Cared for by EnsEMBL (ensembl-dev@ebi.ac.uk)
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::ExonerateTags
=head1 SYNOPSIS
my $analysis_obj = $db->get_AnalysisAdaptor->fetch_by_logic_name("Exonerate_Tags");
my $runnabledb =
Bio::EnsEMBL::Analysis::RunnableDB::ExonerateTags->new( -db => $db,
-analysis => $analysis_obj};
$runnabledb->fetch_input();
$runnabledb->run();
$runnabledb->write_output();
=head1 DESCRIPTION
RunnnableDB module for the Ditag analysis. Fetches Ditag sequences from database,
tries to align them to the genome with the Runnable and stores DitagFeatures
after filtering the hits.
=head1 CONTACT
ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::ExonerateTags;
use strict;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Analysis::Runnable::ExonerateTags;
use Bio::SeqIO;
use Bio::Seq;
use Bio::EnsEMBL::Analysis::Config::ExonerateTags;
use vars qw(@ISA);
@ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB);
=head2 new
Arg : various
Function : create a runnableDB for the Ditag analysis
Returntype: Bio::EnsEMBL::Analysis::RunnableDB::ExonerateTags
Caller : general
=cut
sub new {
my ( $class, @args ) = @_;
my $self = $class->SUPER::new(@args);
$self->read_and_check_config($DITAG_CONFIG);
return $self;
}
=head2 fetch_input
Args : none
Function : fetch data from the database and create runnable
Returntype: none
Caller : general
=cut
sub fetch_input{
my ($self) = @_;
#define target
my $target = $self->GENOMICSEQS;
if ( -e $target ){
if(!(-d $target)and !(-s $target)) {
throw("'$target' isn't a file or a directory?");
}
} else {
throw("'$target' could not be found");
}
#define query: batch of ditags, split in two
my @fasta_entries = ();
my $tmp_dir = $self->TMPDIR;
#expected format of input ids: ditag..
$self->input_id =~ /ditag\.(\S+)\.([0-9]+)/;
my $tagtype = $1;
my $input_index = $2;
throw("Couldn t get index number from input-id ".$self->input_id)
unless($input_index);
throw("Couldn t get tag-type from input-id ".$self->input_id)
unless($tagtype);
#fetch all desired ditags
my $batch_size = $self->BATCHSIZE();
#my $tagtype = $self->TAGTYPE();
my $ditags = $self->db->get_ditagAdaptor->fetch_all_with_limit(
$tagtype, $batch_size, ($batch_size * ($input_index - 1)) );
my $chop_first = $self->CHOPFIRST();
my $chop_last = $self->CHOPLAST();
foreach my $ditag (@$ditags) {
#avoid polyA sequences
$ditag = $self->_check_seq($ditag);
if($ditag) {
my $ditag_fasta_entry;
if($self->SPLITSEQS()) {
#split the seq and store as fasta line, remove specific bases if necessary
$ditag_fasta_entry = $self->_split_seq($ditag, $chop_first, $chop_last);
} else {
my $tagseq = $ditag->sequence;
#remove leading bases? (e.g. G from GIS tags)
if($chop_first){
$tagseq = $self->_chop_first($tagseq, $chop_first);
}
#remove trailing bases? (e.g. AA)
if($chop_last){
$tagseq = $self->_chop_last($tagseq, $chop_last);
}
$ditag_fasta_entry = ">".$ditag->dbID."_F"."\n".$tagseq."\n";
}
push(@fasta_entries, $ditag_fasta_entry);
}
}
#write fasta lines to temp-file for exonerate
my $tmp_file = $tmp_dir."/".$input_index.".tmp";
open(OUTFILE, ">$tmp_file") or throw "couldnt create fasta chunk file $tmp_file";
foreach my $ditag_fasta_entry (@fasta_entries) {
print OUTFILE $ditag_fasta_entry;
}
close(OUTFILE) or throw "couldnt close fasta chunk file $tmp_file";
#define other parameters
my %parameters = %{ $self->parameters_hash };
if (
not exists( $parameters{-options} )
and defined $self->OPTIONS
){
$parameters{-options} = $self->OPTIONS;
}
if($self->analysis->db_file){
$parameters{'-STS_FILE'} = $self->analysis->db_file
unless($parameters{'-STS_FILE'});
}
my $program = ($self->PROGRAM()) ? $self->PROGRAM() : $self->analysis->program_file;
if(!$program){ throw "Cant decide what program to use for analysis!\n" }
#create runnable
my $runnable = Bio::EnsEMBL::Analysis::Runnable::ExonerateTags->new
(
-program => $program,
-analysis => $self->analysis,
-target_file => $target,
-query_file => $tmp_file,
-deletequeryfile => 0,
-maxmismatch => $self->MAXMISMATCH(),
-specoptions => $self->SPECOPTIONS(),
-splitseqs => $self->SPLITSEQS(),
-maxdistance => $self->MAXDISTANCE(),
-keeporder => $self->KEEPORDER(),
%parameters
);
$self->runnable($runnable);
}
=head2 _chop_first
Arg [1] : sequence string of tag
Arg [2] : sequence string of bases to remove
Function : Remove undesired sequences at the start of a tag,
eg. G from GIS tags if this specified for the tag lib.
Returntype: sequence string
Caller : private
=cut
sub _chop_first {
my ($self, $ditagseq, $chop_first) = @_;
if($ditagseq =~ /^$chop_first.+$/){
$ditagseq = substr($ditagseq, length($chop_first), length($ditagseq)-length($chop_first));
}
return $ditagseq;
}
=head2 _chop_last
Arg [1] : sequence string of tag
Arg [2] : sequence string of bases to remove
Function : Remove undesired sequences from the end of a tag,
eg. AA from GIS tags.
Returntype: sequence string
Caller : private
=cut
sub _chop_last {
my ($self, $ditagseq, $chop_last) = @_;
if($ditagseq =~ /^$chop_last.+$/){
$ditagseq = substr($ditagseq, 0, length($ditagseq)-length($chop_last));
}
return $ditagseq;
}
=head2 _check_seq
Arg [1] : Bio::EnsEMBL::Map::Ditag
Function : Remove undesired sequences, eg. repetative seqs.
Returntype: Bio::EnsEMBL::Map::Ditag or undef
Caller : private
=cut
sub _check_seq {
my ($self, $ditag) = @_;
my $repeat_number = $self->REPEATNUMBER();
#remove repetetive sequences
if( ($ditag->sequence =~ /A{$repeat_number}/) or ($ditag->sequence =~ /T{$repeat_number}/) or
($ditag->sequence =~ /C{$repeat_number}/) or ($ditag->sequence =~ /G{$repeat_number}/) ){
# if( $ditag->sequence =~ /A{$repeat_number} | T{$repeat_number} | C{$repeat_number} | G{$repeat_number}/ ){
print "removing repetitive ditag ".($ditag->dbID)." ".($ditag->tag_count)."\n";
$ditag = undef;
}
#remove too short or too long sequences
elsif(defined($self->MINSEQLENGTH) and (length($ditag->sequence) < $self->MINSEQLENGTH)){
print "removing short ditag ".($ditag->dbID)." ".($ditag->tag_count)."\n";
$ditag = undef;
}
elsif(defined($self->MAXSEQLENGTH) and (length($ditag->sequence) > $self->MAXSEQLENGTH)){
print "removing long ditag ".($ditag->dbID)." ".($ditag->tag_count)."\n";
$ditag = undef;
}
#remove sequences with Ns
elsif($ditag->sequence =~ /N/){
print "removing ditag with 'N' ".($ditag->dbID)." ".($ditag->tag_count)."\n";
$ditag = undef;
}
return $ditag;
}
=head2 _split_seq
Arg [1] : Bio::EnsEMBL::Map::Ditag
Function : split a ditag into two parts to save to file
FOR NOW THESE ARE JUST 2 HALVES
Returntype: string with sequence header (_L & _R) and sequence
for the parts (FASTA format)
Caller : private
=cut
sub _split_seq {
my ($self, $ditag, $chop_first, $chop_last) = @_;
my $addone = 0;
my $motherseq = $ditag->sequence;
my $id = $ditag->dbID;
#remove leading bases? (e.g. G from GIS tags)
if($chop_first){
$motherseq = $self->_chop_first($motherseq, $chop_first);
}
#remove trailing bases? (e.g. AA)
if($chop_last){
$motherseq = $self->_chop_last($motherseq, $chop_last);
}
#split seq in two parts, TODO: adjust for other types
#define border
my $cutoffpoint = (length($motherseq))/2;
if($cutoffpoint =~ /\./){
$addone = 1;
}
$cutoffpoint = int($cutoffpoint);
#first part of seq
my $start_A = 0;
my $end_A = $cutoffpoint;
if($addone){ $end_A++; }
my $seqpart_A = ">".$id."_L_".$start_A."\n";
$seqpart_A .= (substr($motherseq, 0, $end_A))."\n";
#second part of seq
my $start_B = $cutoffpoint;
my $seqpart_B = ">".$id."_R_".($start_B)."\n";
$seqpart_B .= (substr($motherseq, $start_B))."\n";
return($seqpart_A.$seqpart_B);
}
=head2 run
Args : none
Function : run the Ditag runnable
Returntype: none
Caller : general
=cut
sub run {
my ($self) = @_;
my @out_features;
throw("Can't run - no runnable objects") unless ( $self->runnable );
my $runnable = @{ $self->runnable }[0];
$runnable->run;
@out_features = @{$runnable->output};
$self->output(\@out_features);
}
=head2 write_output
Arg [1] : array ref of Bio::EnsEMBL::Map::DitagFeatures
Function : get ditag feature adaptor with new features,
clean and store them.
Returntype: none
Caller : general
=cut
sub write_output {
my ( $self, $ditagfeatures ) = @_;
my $ditagfeature_adaptor;
if($self->OUTDB) {
my $outdb = $self->create_output_db;
$ditagfeature_adaptor = $outdb->get_DitagFeatureAdaptor;
} else {
$ditagfeature_adaptor = $self->db->get_DitagFeatureAdaptor;
}
if(!$ditagfeatures or !(scalar @$ditagfeatures)){
$ditagfeatures = $self->output();
}
print "\nhave ".(scalar @$ditagfeatures)." features to store.\n";
if(!$ditagfeatures or !(scalar @$ditagfeatures)){
warn "no features to store.";
}
else {
$ditagfeatures = $self->clean_features($ditagfeatures);
#eval { $ditagfeature_adaptor->store($ditagfeatures) };
eval { $ditagfeature_adaptor->batch_store($ditagfeatures) };
if ($@) {
$self->throw("Unable to store ditagFeatures!\n $@");
}
}
}
=head2 clean_features
Arg [1] : array ref of Bio::EnsEMBL::Map::DitagFeatures
Function : re-set analysis, attach slice
Returntype: array ref of Bio::EnsEMBL::Map::DitagFeatures
Caller : general
=cut
sub clean_features {
my ( $self, $ditagfeatures ) = @_;
my %genome_slices;
my @cleanfeatures;
my $sa = $self->db->get_SliceAdaptor;
foreach my $ditagfeature (@$ditagfeatures) {
#(re-)set analysis
$ditagfeature->analysis($self->analysis);
#attach slice
my $slice_id = $ditagfeature->slice;
if ( not exists $genome_slices{$slice_id} ) {
print "\nTRYING TO FETCH $slice_id.";
$genome_slices{$slice_id} = $sa->fetch_by_name($slice_id);
}
my $slice = $genome_slices{$slice_id};
$ditagfeature->slice($slice);
push(@cleanfeatures, $ditagfeature);
}
return \@cleanfeatures;
}
=head2 create_output_db
Args : none
Function : get DBAdaptor for output db
Returntype: Bio::EnsEMBL::DBSQL::DBAdaptor
Caller : general
=cut
sub create_output_db {
my ($self) = @_;
my $outdb;
my $dnadb;
print "\nCreating OUTDB.";
if ( $self->OUTDB && $self->DNADB) {
$dnadb = new Bio::EnsEMBL::DBSQL::DBAdaptor(%{ $self->OUTDB });
$outdb = new Bio::EnsEMBL::DBSQL::DBAdaptor(
%{ $self->OUTDB },
-dnadb => $dnadb
);
} elsif( $self->OUTDB) {
$outdb = new Bio::EnsEMBL::DBSQL::DBAdaptor(%{ $self->OUTDB });
} else {
$outdb = $self->db;
}
return $outdb;
}
=head2 runnable_path
Args : none
Function : GETTER functions for path to Runnable
Returntype: string
Caller : general
=cut
sub runnable_path{
my ($self);
return "Bio::EnsEMBL::Analysis::Runnable::DitagAlign";
}
=head2 helper functions
Arg [1] : (optional) new value
Function : GETTER/SETTER functions for Config values
Returntype: string or undef
Caller : private
=cut
sub GENOMICSEQS {
my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_GENOMICSEQS'} = $value;
}
return $self->{'_GENOMICSEQS'};
}
sub QUERYSEQS {
my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_QUERYSEQS'} = $value;
}
return $self->{'_QUERYSEQS'};
}
sub TAGTYPE {
my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_TAGTYPE'} = $value;
}
return $self->{'_TAGTYPE'};
}
sub IIDREGEXP {
my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_IIDREGEXP'} = $value;
}
return $self->{'_IIDREGEXP'};
}
sub OUTDB {
my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_OUTDB'} = $value;
}
return $self->{'_OUTDB'};
}
sub DNADB {
my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_DNADB'} = $value;
}
return $self->{'_DNADB'};
}
sub OPTIONS {
my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_OPTIONS'} = $value;
}
return $self->{'_OPTIONS'};
}
sub PROGRAM {
my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_PROGRAM'} = $value;
}
return $self->{'_PROGRAM'};
}
sub TMPDIR {
my ( $self, $value ) = @_;
if ( defined $value ) {
$self->{'_TMPDIR'} = $value;
}
return $self->{'_TMPDIR'};
}
sub BATCHSIZE {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_BATCHSIZE'} = $value;
}
return $self->{'_BATCHSIZE'};
}
sub QUERYFILES {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_QUERYFILE'} = $value;
}
return $self->{'_QUERYFILE'};
}
sub SPLITSEQS {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_SPLITSEQS'} = $value;
}
return $self->{'_SPLITSEQS'};
}
sub CHOPFIRST {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_CHOPFIRST'} = $value;
}
return $self->{'_CHOPFIRST'};
}
sub CHOPLAST {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_CHOPLAST'} = $value;
}
return $self->{'_CHOPLAST'};
}
sub MAXDISTANCE {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_MAXDISTANCE'} = $value;
}
return $self->{'_MAXDISTANCE'};
}
sub MAXMISMATCH {
my $self = shift;
$self->{'_MAXMISMATCH'} = shift if (@_);
return $self->{'_MAXMISMATCH'};
}
sub SPECOPTIONS {
my $self = shift;
$self->{'_SPECOPTIONS'} = shift if (@_);
return $self->{'_SPECOPTIONS'};
}
sub MINSEQLENGTH {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_MINSEQLENGTH'} = $value;
}
return $self->{'_MINSEQLENGTH'};
}
sub MAXSEQLENGTH {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_MAXSEQLENGTH'} = $value;
}
return $self->{'_MAXSEQLENGTH'};
}
sub REPEATNUMBER {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_REPEATNUMBER'} = $value;
}
return $self->{'_REPEATNUMBER'};
}
sub KEEPORDER {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_KEEPORDER'} = $value;
}
return $self->{'_KEEPORDER'};
}
sub TAGCOUNT {
my ( $self, $value ) = @_;
if ($value) {
$self->{'_TAGCOUNT'} = $value;
}
return $self->{'_TAGCOUNT'};
}
1;