ExonerateTags documentation.

Bio::EnsEMBL::Analysis::RunnableDB ExonerateTags

Included libraries

Package variables

General documentation

Toolbar

Summary

Bio::EnsEMBL::Analysis::RunnableDB::ExonerateTags

Package variables

No package variables defined.

Included modules

Bio::EnsEMBL::Analysis::Config::ExonerateTags

Bio::EnsEMBL::Analysis::Runnable::ExonerateTags

Bio::EnsEMBL::Analysis::RunnableDB

Bio::EnsEMBL::Utils::Argument qw ( rearrange )

Bio::EnsEMBL::Utils::Exception qw ( throw warning )

Inherit

Bio::EnsEMBL::Analysis::RunnableDB

Synopsis

my $analysis_obj = $db->get_AnalysisAdaptor->fetch_by_logic_name("Exonerate_Tags");
my $runnabledb =
Bio::EnsEMBL::Analysis::RunnableDB::ExonerateTags->new( -db => $db,
-analysis => $analysis_obj};
$runnabledb->fetch_input();
$runnabledb->run();
$runnabledb->write_output();

Description

RunnnableDB module for the Ditag analysis. Fetches Ditag sequences from database,
tries to align them to the genome with the Runnable and stores DitagFeatures
after filtering the hits.

Methods

BATCHSIZE	No description	Code
CHOPFIRST	No description	Code
CHOPLAST	No description	Code
DNADB	No description	Code
GENOMICSEQS	No description	Code
IIDREGEXP	No description	Code
KEEPORDER	No description	Code
MAXDISTANCE	No description	Code
MAXMISMATCH	No description	Code
MAXSEQLENGTH	No description	Code
MINSEQLENGTH	No description	Code
OPTIONS	No description	Code
OUTDB	No description	Code
PROGRAM	No description	Code
QUERYFILES	No description	Code
QUERYSEQS	No description	Code
REPEATNUMBER	No description	Code
SPECOPTIONS	No description	Code
SPLITSEQS	No description	Code
TAGCOUNT	No description	Code
TAGTYPE	No description	Code
TMPDIR	No description	Code
_check_seq	Description	Code
_chop_first	Description	Code
_chop_last	Description	Code
_split_seq	Description	Code
clean_features	Description	Code
create_output_db	Description	Code
fetch_input	Description	Code
new	Description	Code
run	Description	Code
runnable_path	Description	Code
write_output	Description	Code

Methods description

_check_seq

  Arg [1]   : Bio::EnsEMBL::Map::Ditag
  Function  : Remove undesired sequences, eg. repetative seqs.
  Returntype: Bio::EnsEMBL::Map::Ditag or undef
  Caller    : private

_chop_first

  Arg [1]   : sequence string of tag
  Arg [2]   : sequence string of bases to remove
  Function  : Remove undesired sequences at the start of a tag,
              eg. G from GIS tags if this specified for the tag lib.
  Returntype: sequence string
  Caller    : private

_chop_last

  Arg [1]   : sequence string of tag
  Arg [2]   : sequence string of bases to remove
  Function  : Remove undesired sequences from the end of a tag,
              eg. AA from GIS tags.
  Returntype: sequence string
  Caller    : private

_split_seq

  Arg [1]   : Bio::EnsEMBL::Map::Ditag
  Function  : split a ditag into two parts to save to file
              FOR NOW THESE ARE JUST 2 HALVES
  Returntype: string with sequence header (_L & _R) and sequence 
              for the parts (FASTA format)
  Caller    : private

clean_features

  Arg [1]   : array ref of Bio::EnsEMBL::Map::DitagFeatures
  Function  : re-set analysis, attach slice
  Returntype: array ref of Bio::EnsEMBL::Map::DitagFeatures
  Caller    : general

create_output_db

  Args      : none
  Function  : get DBAdaptor for output db
  Returntype: Bio::EnsEMBL::DBSQL::DBAdaptor
  Caller    : general

fetch_input

  Args      : none
  Function  : fetch data from the database and create runnable
  Returntype: none
  Caller    : general

new

  Arg       : various
  Function  : create a runnableDB for the Ditag analysis
  Returntype: Bio::EnsEMBL::Analysis::RunnableDB::ExonerateTags
  Caller    : general

run

  Args      : none
  Function  : run the Ditag runnable
  Returntype: none
  Caller    : general

runnable_path

  Args      : none
  Function  : GETTER functions for path to Runnable
  Returntype: string
  Caller    : general

write_output

  Arg [1]   : array ref of Bio::EnsEMBL::Map::DitagFeatures
  Function  : get ditag feature adaptor with new features,
              clean and store them.
  Returntype: none
  Caller    : general

Methods code

BATCHSIZE

description

prev

sub BATCHSIZE {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_BATCHSIZE'} = $value;
  }

  return $self->{'_BATCHSIZE'};

}

CHOPFIRST

description

sub CHOPFIRST {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_CHOPFIRST'} = $value;
  }

  return $self->{'_CHOPFIRST'};

}

CHOPLAST

description

sub CHOPLAST {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_CHOPLAST'} = $value;
  }

  return $self->{'_CHOPLAST'};

}

DNADB

description

sub DNADB {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_DNADB'} = $value;
  }

  return $self->{'_DNADB'};

}

GENOMICSEQS

description

sub GENOMICSEQS {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_GENOMICSEQS'} = $value;
  }

  return $self->{'_GENOMICSEQS'};

}

IIDREGEXP

description

sub IIDREGEXP {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_IIDREGEXP'} = $value;
  }

  return $self->{'_IIDREGEXP'};

}

KEEPORDER

description

sub KEEPORDER {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_KEEPORDER'} = $value;
  }

  return $self->{'_KEEPORDER'};

}

MAXDISTANCE

description

sub MAXDISTANCE {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_MAXDISTANCE'} = $value;
  }

  return $self->{'_MAXDISTANCE'};

}

MAXMISMATCH

description

sub MAXMISMATCH {

  my $self = shift;
  $self->{'_MAXMISMATCH'} = shift if (@_);

  return $self->{'_MAXMISMATCH'};

}

MAXSEQLENGTH

description

sub MAXSEQLENGTH {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_MAXSEQLENGTH'} = $value;
  }

  return $self->{'_MAXSEQLENGTH'};

}

MINSEQLENGTH

description

sub MINSEQLENGTH {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_MINSEQLENGTH'} = $value;
  }

  return $self->{'_MINSEQLENGTH'};

}

OPTIONS

description

sub OPTIONS {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_OPTIONS'} = $value;
  }

  return $self->{'_OPTIONS'};

}

OUTDB

description

sub OUTDB {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_OUTDB'} = $value;
  }

  return $self->{'_OUTDB'};

}

PROGRAM

description

sub PROGRAM {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_PROGRAM'} = $value;
  }

  return $self->{'_PROGRAM'};

}

QUERYFILES

description

sub QUERYFILES {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_QUERYFILE'} = $value;
  }

  return $self->{'_QUERYFILE'};

}

QUERYSEQS

description

sub QUERYSEQS {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_QUERYSEQS'} = $value;
  }

  return $self->{'_QUERYSEQS'};

}

REPEATNUMBER

description

sub REPEATNUMBER {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_REPEATNUMBER'} = $value;
  }

  return $self->{'_REPEATNUMBER'};

}

SPECOPTIONS

description

sub SPECOPTIONS {

  my $self = shift;
  $self->{'_SPECOPTIONS'} = shift if (@_);

  return $self->{'_SPECOPTIONS'};

}

SPLITSEQS

description

sub SPLITSEQS {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_SPLITSEQS'} = $value;
  }

  return $self->{'_SPLITSEQS'};

}

TAGCOUNT

description

sub TAGCOUNT {

  my ( $self, $value ) = @_;
  if ($value) {
    $self->{'_TAGCOUNT'} = $value;
  }

  return $self->{'_TAGCOUNT'};
}

1;

}

TAGTYPE

description

sub TAGTYPE {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_TAGTYPE'} = $value;
  }

  return $self->{'_TAGTYPE'};

}

TMPDIR

description

sub TMPDIR {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_TMPDIR'} = $value;
  }

  return $self->{'_TMPDIR'};

}

_check_seq

sub _check_seq {

  my ($self, $ditag) = @_;

  my $repeat_number = $self->REPEATNUMBER();
  #remove repetetive sequences
  if( ($ditag->sequence =~ /A{$repeat_number}/) or ($ditag->sequence =~ /T{$repeat_number}/) or
      ($ditag->sequence =~ /C{$repeat_number}/) or ($ditag->sequence =~ /G{$repeat_number}/) ){
#  if( $ditag->sequence =~ /A{$repeat_number} | T{$repeat_number} | C{$repeat_number} | G{$repeat_number}/ ){
    print "removing repetitive ditag ".($ditag->dbID)." ".($ditag->tag_count)."\n";
    $ditag = undef;
  }
  #remove too short or too long sequences
  elsif(defined($self->MINSEQLENGTH) and (length($ditag->sequence) < $self->MINSEQLENGTH)){
    print "removing short ditag ".($ditag->dbID)." ".($ditag->tag_count)."\n";
    $ditag = undef;
  }
  elsif(defined($self->MAXSEQLENGTH) and (length($ditag->sequence) > $self->MAXSEQLENGTH)){
    print "removing long ditag ".($ditag->dbID)." ".($ditag->tag_count)."\n";
    $ditag = undef;
  }
  #remove sequences with Ns
  elsif($ditag->sequence =~ /N/){
    print "removing ditag with 'N' ".($ditag->dbID)." ".($ditag->tag_count)."\n";
    $ditag = undef;
  }

  return $ditag;

}

_chop_first

sub _chop_first {

  my ($self, $ditagseq, $chop_first) = @_;

  if($ditagseq =~ /^$chop_first.+$/){
    $ditagseq =  substr($ditagseq, length($chop_first), length($ditagseq)-length($chop_first));
  }

  return $ditagseq;

}

_chop_last

sub _chop_last {

  my ($self, $ditagseq, $chop_last) = @_;

  if($ditagseq =~ /^$chop_last.+$/){
    $ditagseq =  substr($ditagseq, 0, length($ditagseq)-length($chop_last));
  }

  return $ditagseq;

}

_split_seq

sub _split_seq {

  my ($self, $ditag, $chop_first, $chop_last) = @_;
  my $addone = 0;

  my $motherseq = $ditag->sequence;
  my $id = $ditag->dbID;

  #remove leading bases? (e.g. G from GIS tags)
  if($chop_first){
    $motherseq = $self->_chop_first($motherseq, $chop_first);
  }
  #remove trailing bases? (e.g. AA)
  if($chop_last){
    $motherseq = $self->_chop_last($motherseq, $chop_last);
  }

  #split seq in two parts, TODO: adjust for other types
  #define border
  my $cutoffpoint = (length($motherseq))/2;
  if($cutoffpoint =~ /\./){
    $addone = 1;
  }
  $cutoffpoint = int($cutoffpoint);

  #first part of seq
  my $start_A    = 0;
  my $end_A      = $cutoffpoint;
  if($addone){ $end_A++; }
  my $seqpart_A  = ">".$id."_L_".$start_A."\n";
  $seqpart_A    .= (substr($motherseq, 0, $end_A))."\n";

  #second part of seq
  my $start_B    = $cutoffpoint;
  my $seqpart_B  = ">".$id."_R_".($start_B)."\n";
  $seqpart_B    .= (substr($motherseq, $start_B))."\n";

  return($seqpart_A.$seqpart_B);

}

clean_features

sub clean_features {

  my ( $self, $ditagfeatures ) = @_;
  my %genome_slices;
  my @cleanfeatures;
  my $sa = $self->db->get_SliceAdaptor;

  foreach my $ditagfeature (@$ditagfeatures) {

    #(re-)set analysis
    $ditagfeature->analysis($self->analysis);

    #attach slice
    my $slice_id = $ditagfeature->slice;
    if ( not exists $genome_slices{$slice_id} ) {
      print "\nTRYING TO FETCH $slice_id.";
      $genome_slices{$slice_id} = $sa->fetch_by_name($slice_id);
    }
    my $slice = $genome_slices{$slice_id};
    $ditagfeature->slice($slice);
    push(@cleanfeatures, $ditagfeature);

  }

  return\@ cleanfeatures;

}

create_output_db

sub create_output_db {

  my ($self) = @_;

  my $outdb;
  my $dnadb;

  print "\nCreating OUTDB.";
  if ( $self->OUTDB && $self->DNADB) {
    $dnadb =  new Bio::EnsEMBL::DBSQL::DBAdaptor(%{ $self->OUTDB }); 

    $outdb = new Bio::EnsEMBL::DBSQL::DBAdaptor(
              %{ $self->OUTDB }, 
              -dnadb => $dnadb 
             );

  } elsif( $self->OUTDB) {
    $outdb = new Bio::EnsEMBL::DBSQL::DBAdaptor(%{ $self->OUTDB }); 
  } else {
    $outdb = $self->db;
  }

  return $outdb;

}

fetch_input

sub fetch_input {

  my ($self) = @_;

  #define target
  my $target = $self->GENOMICSEQS;
  if ( -e $target ){
    if(!(-d $target)and !(-s $target)) {
      throw("'$target' isn't a file or a directory?");
    }
  } else {
    throw("'$target' could not be found");
  }

  #define query: batch of ditags, split in two
  my @fasta_entries = ();
  my $tmp_dir       = $self->TMPDIR;
  #expected format of input ids: ditag.<TYPE>.<NUMBER>
  $self->input_id   =~ /ditag\.(\S+)\.([0-9]+)/;
  my $tagtype       = $1;
  my $input_index   = $2;
  throw("Couldn t get index number from input-id ".$self->input_id)
      unless($input_index);
  throw("Couldn t get tag-type from input-id ".$self->input_id)
      unless($tagtype);

  #fetch all desired ditags
  my $batch_size = $self->BATCHSIZE();
  #my $tagtype    = $self->TAGTYPE();
  my $ditags     = $self->db->get_ditagAdaptor->fetch_all_with_limit(
                   $tagtype, $batch_size, ($batch_size * ($input_index - 1)) );
  my $chop_first = $self->CHOPFIRST();

  my $chop_last  = $self->CHOPLAST();

  foreach my $ditag (@$ditags) {
    #avoid polyA sequences
    $ditag = $self->_check_seq($ditag);
    if($ditag) {

      my $ditag_fasta_entry;
      if($self->SPLITSEQS()) {
        #split the seq and store as fasta line, remove specific bases if necessary
        $ditag_fasta_entry = $self->_split_seq($ditag, $chop_first, $chop_last);
      } else {
        my $tagseq = $ditag->sequence;
        #remove leading bases? (e.g. G from GIS tags)
        if($chop_first){
          $tagseq = $self->_chop_first($tagseq, $chop_first);
        }
        #remove trailing bases? (e.g. AA)
        if($chop_last){
          $tagseq = $self->_chop_last($tagseq, $chop_last);
        }
        $ditag_fasta_entry = ">".$ditag->dbID."_F"."\n".$tagseq."\n";
      }
      push(@fasta_entries, $ditag_fasta_entry);
    }
  }

  #write fasta lines to temp-file for exonerate
  my $tmp_file = $tmp_dir."/".$input_index.".tmp";
  open(OUTFILE, ">$tmp_file") or throw "couldnt create fasta chunk file $tmp_file";
  foreach my $ditag_fasta_entry (@fasta_entries) {
    print OUTFILE $ditag_fasta_entry;
  }
  close(OUTFILE) or throw "couldnt close fasta chunk file $tmp_file";

  #define other parameters
  my %parameters = %{ $self->parameters_hash };

  if (
    not exists( $parameters{-options} )
    and defined $self->OPTIONS
  ){
    $parameters{-options} = $self->OPTIONS;
  }

  if($self->analysis->db_file){
    $parameters{'-STS_FILE'} = $self->analysis->db_file 
      unless($parameters{'-STS_FILE'});
  }

  my $program = ($self->PROGRAM()) ? $self->PROGRAM() : $self->analysis->program_file;
  if(!$program){ throw "Cant decide what program to use for analysis!\n" }

  #create runnable
  my $runnable = Bio::EnsEMBL::Analysis::Runnable::ExonerateTags->new
    (
     -program         => $program,
     -analysis        => $self->analysis,
     -target_file     => $target,
     -query_file      => $tmp_file,
     -deletequeryfile => 0,
     -maxmismatch     => $self->MAXMISMATCH(),
     -specoptions     => $self->SPECOPTIONS(),
     -splitseqs       => $self->SPLITSEQS(),
     -maxdistance     => $self->MAXDISTANCE(),
     -keeporder       => $self->KEEPORDER(),
     %parameters
    );
  $self->runnable($runnable);

}

new

sub new {

  my ( $class, @args ) = @_;
  my $self = $class->SUPER::new(@args);

  $self->read_and_check_config($DITAG_CONFIG);

  return $self;

}

run

sub run {

  my ($self) = @_;
  my @out_features;

  throw("Can't run - no runnable objects") unless ( $self->runnable );

  my $runnable = @{ $self->runnable }[0];

  $runnable->run;

  @out_features = @{$runnable->output};
  $self->output(\@out_features);

}

runnable_path

sub runnable_path {

  my ($self);
  return "Bio::EnsEMBL::Analysis::Runnable::DitagAlign";

}

write_output

next

sub write_output {

  my ( $self, $ditagfeatures ) = @_;

  my $ditagfeature_adaptor;
  if($self->OUTDB) {
    my $outdb             = $self->create_output_db;
    $ditagfeature_adaptor = $outdb->get_DitagFeatureAdaptor;
  } else {
    $ditagfeature_adaptor = $self->db->get_DitagFeatureAdaptor;
  }

  if(!$ditagfeatures or !(scalar @$ditagfeatures)){
    $ditagfeatures = $self->output();
  }
  print "\nhave ".(scalar @$ditagfeatures)." features to store.\n";
  if(!$ditagfeatures or !(scalar @$ditagfeatures)){
    warn "no features to store.";
  }
  else {
    $ditagfeatures = $self->clean_features($ditagfeatures);
    #eval { $ditagfeature_adaptor->store($ditagfeatures) };
    eval { $ditagfeature_adaptor->batch_store($ditagfeatures) };
    if ($@) {
      $self->throw("Unable to store ditagFeatures!\n $@");
    }
  }

}

General documentation

CONTACT

ensembl-dev@ebi.ac.uk

helper functions

  Arg [1]   : (optional) new value
  Function  : GETTER/SETTER functions for Config values
  Returntype: string or undef
  Caller    : private