ImportArrays documentation.

Bio::EnsEMBL::Analysis::RunnableDB ImportArrays

Included libraries

Package variables

General documentation

Toolbar

Summary

Bio::EnsEMBL::Analysis::RunnableDB::ImportArrays;

Package variables

No package variables defined.

Included modules

Bio::EnsEMBL::Analysis::Config::ImportArrays

Bio::EnsEMBL::Analysis::RunnableDB

Bio::EnsEMBL::DBSQL::DBAdaptor

Bio::EnsEMBL::Funcgen::Array

Bio::EnsEMBL::Funcgen::ArrayChip

Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor

Bio::EnsEMBL::Funcgen::Probe

Bio::EnsEMBL::Funcgen::ProbeSet

Bio::EnsEMBL::Funcgen::Utils::Helper

Bio::EnsEMBL::Utils::Exception qw ( throw warning )

Inherit

Bio::EnsEMBL::Analysis::RunnableDB

Synopsis

my $importer =
Bio::EnsEMBL::Analysis::RunnableDB::ImportArrays->new(

);

$importer->fetch_input();
$importer->run();
$importer->output();
$importer->write_output(); #writes to DB and a big fasta file

Description

This object imports array fasta files of a common array format and collapses redundant
probes into unique records based on their probeset name and sequence identity. The
probes supplied are redundant - a single probe (characterised by a unique sequence) may
occur in different positions of different arrays (i.e. for AFFY the chip coord is the
probe name). Parsing of the fasta file is facilitated by dynamic configuration of regular
expressions and array parameters based on the array format and array being imported. To
enable multiple instances of ImportArrays to run at the same time using different
configurations, the correct config type is read from an ImportArrays.config file,
using the input_id as the key.
A non-redundant array format specific output file is written, using the probe dbIDs in
the fasta headers. This can then be cat'd with out no-redundant array format fasta file
for use in the mapping carried out by ProbeAlign and ProbeTranscriptAlign.
Note that probes are defined as redundant when they share the same
- sequence and
- probeset

Methods

ARRAY_FORMAT	No description	Code
ARRAY_PARAMS	No description	Code
DNADB	No description	Code
IFIELDORDER	No description	Code
IIDREGEXP	No description	Code
INPUT_FORMAT	No description	Code
NAMES_FILE	No description	Code
NON_REDUNDANT_PROBE_SEQS	No description	Code
OUTDB	No description	Code
OUTPUT_DIR	No description	Code
QUERYSEQS	No description	Code
add_array_chip_to_existing_probe	No description	Code
create_new_array_chip	No description	Code
create_new_probe	No description	Code
fetch_input	No description	Code
get_ARRAY_PARAMS_by_array_name	No description	Code
get_IFIELDORDER	No description	Code
get_IIDREGEXP	No description	Code
helper	No description	Code
new	No description	Code
outdb	No description	Code
probes	No description	Code
query_file	No description	Code
read_and_check_config	No description	Code
run	No description	Code
run_FASTA	No description	Code
write_output	No description	Code

Methods description

None available.

Methods code

ARRAY_FORMAT

description

prev

sub ARRAY_FORMAT {

  my ( $self, $value ) = @_;

  $self->{'_ARRAY_FORMAT'} = $value if defined $value;
  return $self->{'_ARRAY_FORMAT'};

}

ARRAY_PARAMS

description

sub ARRAY_PARAMS {

  my ( $self, $value ) = @_;

  $self->{'_CONFIG_ARRAY_PARAMS'} = $value  if defined $value;
  return $self->{'_CONFIG_ARRAY_PARAMS'};
}


#Don't really need this unless we see probe records not in consecutive array blocks
#This accessor will slow down import
#Can we implement this as a local cache in the caller?
#sub get_array_by_name{
#  my ($self, $array_name)
#
#

}

DNADB

description

sub DNADB {

  my ( $self, $value ) = @_;


  $self->{'_CONFIG_DNADB'} = $value  if defined $value;
  return $self->{'_CONFIG_DNADB'};
}



###############################################
###     end of config
###############################################

1;

}

IFIELDORDER

description

sub IFIELDORDER {

  my ( $self, $value ) = @_;

  $self->{'_CONFIG_IFIELDORDER'} = $value if defined $value;
  return $self->{'_CONFIG_IFIELDORDER'};

}

IIDREGEXP

description

sub IIDREGEXP {

  my ( $self, $value ) = @_;

  $self->{'_CONFIG_IIDREGEXP'} = $value  if defined $value;
  return $self->{'_CONFIG_IIDREGEXP'};

}

INPUT_FORMAT

description

sub INPUT_FORMAT {

  my ( $self, $value ) = @_;

  $self->{'_CONFIG_INPUT_FORMAT'} = $value  if defined $value ;
  
  return $self->{'_CONFIG_INPUT_FORMAT'};

}

NAMES_FILE

description

sub NAMES_FILE {

  my ( $self, $value ) = @_;

  $self->{'_NAMES_FILE'} = $value if defined $value;
  return $self->{'_NAMES_FILE'};
}

### Config from ImportArrays ##

}

NON_REDUNDANT_PROBE_SEQS

description

sub NON_REDUNDANT_PROBE_SEQS {

  my ( $self, $value ) = @_;

  $self->{'_NON_REDUNDANT_PROBE_SEQS'} = $value if defined $value;
  return $self->{'_NON_REDUNDANT_PROBE_SEQS'};

}

OUTDB

description

sub OUTDB {

  my ( $self, $value ) = @_;


  $self->{'_CONFIG_OUTDB'} = $value  if defined $value;
  return $self->{'_CONFIG_OUTDB'};

}

OUTPUT_DIR

description

sub OUTPUT_DIR {

  my ( $self, $value ) = @_;

  $self->{'_CONFIG_OUTPUT_DIR'} = $value  if defined $value;
  return $self->{'_CONFIG_OUTPUT_DIR'};
}

#Only used to define dnadb if not on ensembldb
#else dnadb autoguessing will fail

}

QUERYSEQS

description

sub QUERYSEQS {

  my ( $self, $value ) = @_;

  $self->{'_QUERYSEQS'} = $value if defined $value;
  return $self->{'_QUERYSEQS'};

}

add_array_chip_to_existing_probe

description

sub add_array_chip_to_existing_probe {

  my ($self, $probe, $array_chip, $probeset, $probename) = @_;

  #We can have non-unique probes on a non-probe_set array with different IDs
  #This should "never" happen, but it does. On plate duplicates or simply bas probe design

  #We really want to test to see if we already have a name for this probe on this array
  #my @all_probenames = @{$probe->get_all_probenames};

  #Is this not all done by Probe::add_array_chip_probename?

  #my $name = $probe->get_probename($array_chip->get_Array->name);

  #if($name){

#	if($name eq $probename){
#	  throw("Found duplicate fasta records for:\y".$array_chip->get_Array->name.':'.$probeset.':'.$name);
#	}
#	else{
#	  throw('Found probeset('.$probeset.")  on Array(".$array_chip->get_Array->name.
#			") with duplicate probe sequence for probes $probename and $name\n".
#			'Need to alter Probe/Adaptor to allow probe duplicates or create separate probes in ImportArrays?');
#	}
#  }
# #MSG: Found probeset(AF012129_at)  on Array(Mu11LsubA) with duplicate probe sequence for probes 251:249; and 241:249;
  $probe->add_array_chip_probename($array_chip->dbID, $probename, $array_chip->get_Array);
}

#can we replace these var with $_[n] for speed?

}

create_new_array_chip

description

sub create_new_array_chip {

  my($self, $array_name, $design_id) = @_;

  if(! ($array_name && $design_id)){
	throw('Need to pass an Array name and an ArrayChip design ID');
  }

  
  #Let's make the assumption that related arrays will have the same format header
  #Therefore we don't have to do any heuristics to get the parse regex and hence the name of the array
  #This would assume that array names are unique across vendors
  #Very probably, and can just comment out duplicate names on running
  #Need to check this in validate? Will two keys of the same value be returned, pointing to noe set of data?
  #Should also check that all arrays in arrays.list are present in config
  #Or can we just do this in a text file which we parse and populate the config with?
  #This way we don't have to keep editing the code
  #And we can test for duplications in names easily
  #Would need to check in the config file separately for each run
  #Can we automate this, with a comment which is the pid or something to link it to the the log?
  #Or maybe we can just append the config to the log
  #Implement Helper.pm!
  #Is this not reinventing the wheel slightly?
  
  #We also need to alter config to set arrays which are to be imported together
  #We can validate each one, by just taking the first record and testing the parse regex works


  my $array_adaptor = $self->outdb->get_ArrayAdaptor;
  my $achip_adaptor = $self->outdb->get_ArrayChipAdaptor;
  my $array_params = $self->get_ARRAY_PARAMS_by_array_name($array_name);
  my $array = $array_adaptor->fetch_by_name_vendor($array_name, $array_params->{'-vendor'});
  my $array_chip;

  if(! defined $array){
	$array = Bio::EnsEMBL::Funcgen::Array->new(%{$array_params});
	($array) = @{$self->outdb->get_ArrayAdaptor->store($array)};
  }
  
  
  if($array_chip = $achip_adaptor->fetch_by_array_design_ids($array->dbID, $design_id)){

	if($array_chip->has_status('IMPORTED')){
	  throw("$array_name ArrayChip has already been IMPORTED. Please rollback_ArrayChip or recreate your arrays_nr".$self->ARRAY_FORMAT.'.fasta file for alignment');
	}
	else{#Rollback
	  $self->helper->rollback_ArrayChip($array_chip, 'probe');#, 'force');
	  #should we force roll back here?
	  #If not we may have to manually remove probe2transcript xrefs first
	  #Forcing here will mean that we are silently deleting the probe2transcript xrefs
	  #before we have a chance to back up
	  #should we env var this?
	}
  }
  else{
	$array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new
	  (
	   -name => $array->name,
	   -design_id => $design_id,#This is a placeholder as we don't have the real ArrayChip details
	   -array_id  => $array->dbID,
	  );
	
	($array_chip) = @{$self->outdb->get_ArrayChipAdaptor->store($array_chip)};
  }

  if(! exists $self->{'_array_names'}->{$array_name}){
	$self->{'_array_names'}->{$array->name} = $array_chip;
  }



  return $array_chip;
}
############################################################

}

create_new_probe

description

sub create_new_probe {

  my($self, $array_chip, $probe_hash, $length) = @_;
  
  #can we set a template hash with the array and array chip id in?
  $probe_hash->{'-length'} = $length;
  $probe_hash->{'-class'}  = 'EXPERIMENTAL';#Will they all be experimental?
  $probe_hash->{'-array_chip_id'} = $array_chip->dbID;
  $probe_hash->{'-array'}  = $array_chip->get_Array;
  #remove probeset as this is not ProbeSet yet
  delete $probe_hash->{-probe_set};
 
  return Bio::EnsEMBL::Funcgen::Probe->new(%{$probe_hash});

}

fetch_input

description

sub fetch_input {

  my ($self) = @_;

  my $logic = $self->analysis->logic_name;

  my ($query_file, $chunk_number, $chunk_total);

  my $query = $self->QUERYSEQS;

 
  if ( -e $query and -d $query ) {
    throw "I need to have all affy probes input in one big file\n";
  } elsif ( -e $query and -s $query ) {
    #store this for reference later
    $self->query_file($query);
  } else {
    throw("'$query' refers to something that could not be made sense of\n");
  }


  #Why do we bother setting this?  Why don't we just use the config methods directly?
  #$self->non_redundant_probe_seqs($self->NON_REDUNDANT_PROBE_SEQS);

}

get_ARRAY_PARAMS_by_array_name

description

sub get_ARRAY_PARAMS_by_array_name {

  my ( $self, $array_name ) = @_;

  #Should cache the 
  #Need to test for array name here

  if(! exists $self->{'_CONFIG_ARRAY_PARAMS'}{$array_name}){
	throw("No ARRAY_PARAMS config available for $array_name.  You must add this to the ImportArrays config before importing");
  }


  return $self->{'_CONFIG_ARRAY_PARAMS'}{$array_name};

}

get_IFIELDORDER

description

sub get_IFIELDORDER {

  my $self = shift;

  return $self->{'_CONFIG_IFIELDORDER'};

}

get_IIDREGEXP

description

sub get_IIDREGEXP {

  my $self = shift;

  return $self->{'_CONFIG_IIDREGEXP'};

}

helper

description

sub helper {

  return $_[0]->{'_helper'};

}

new

description

sub new {

  my ( $class, @args ) = @_;
  my $self = $class->SUPER::new(@args);

  $self->read_and_check_config;
  
  #Set up and check DBs before we do the mapping
  $self->outdb->dbc->db_handle;

  #array names hash so we can print out a list of the actual names
  #i.e. not the names in the filename which can be formated differently.
  $self->{'_array_names'} = {};

  #Have to set Helper here as it doesn't use the same param names
  #This is for checking the import status of the arrays and rolling back if required
  #This should enable failed imports to be picked

  $self->{'_helper'} = Bio::EnsEMBL::Funcgen::Utils::Helper->new( no_log => 1 );


  my $db = $self->db;

  return $self;

}

outdb

description

sub outdb {

  my ($self) = @_;

  #Don't need DNADB for collapse
  #But need to define as will try and find a default on ensembldb which may not exist?

  my ($outdb);



  #Where are db and dnadb methods coming from?
  #Is this defaulting to the pipeline DB?
 
  if(! defined $self->{'_outdb'}){

	#This DNADB testing is a work around to avoid having to edit Config::ProbeAlign
	my $dnadb;

	#not defined as an empty env var will give the defined null string
	if($self->DNADB->{-dbname}){
	  $dnadb =  new Bio::EnsEMBL::DBSQL::DBAdaptor(%{ $self->DNADB });
	}

	$self->{'_outdb'} = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new
	  (
	   %{ $self->OUTDB }, 
	   -dnadb => $dnadb,
	  );
   
	if(! $self->DNADB->{-dbname}){
	  print "WARNING: Using default DNADB ". $self->{'_outdb'}->dnadb->dbname."\n";
	}
	
  }

  return $self->{'_outdb'};
}

############################################################
#
# The QUERYSEQS config variable.
#

}

probes

description

sub probes {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_probes'} = $value;
  }

  if ( exists( $self->{'_probes'} ) ) {
    return $self->{'_probes'};
  } else {
    return undef;
  }
}


#############################################################
# Declare and set up config variables
#############################################################

}

query_file

description

sub query_file {

  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_query_file'} = $value;
  }

  if ( exists( $self->{'_query_file'} ) ) {
    return $self->{'_query_file'};
  } else {
    return undef;
  }
}

#############################################################

}

read_and_check_config

description

sub read_and_check_config {

  my $self = shift;

  #This global variable is defined in the ImportArrays config
  $self->SUPER::read_and_check_config($ARRAY_CONFIG);

  ##########
  # CHECKS
  ##########
  my $logic = $self->analysis->logic_name;

  my ($array_format) = split/\:/, $self->input_id;
  $self->ARRAY_FORMAT($array_format);#Do we even need this now?


  # check that compulsory options have values
  #removed  ARRAY_FORMAT_FILE
  foreach my $config_var (
						  qw(
							 OUTDB
							 OUTPUT_DIR
							 IIDREGEXP
							 INPUT_FORMAT
							 IFIELDORDER
							 IIDREGEXP
							 IFIELDORDER
							 INPUT_FORMAT
							)
						 ){

	if ( ! defined $self->$config_var ){
      throw("You must define $config_var in config for logic '$logic'");
    }
  }

  
  $self->QUERYSEQS($self->OUTPUT_DIR."/arrays.${array_format}.fasta");
  $self->NON_REDUNDANT_PROBE_SEQS($self->OUTPUT_DIR."/arrays_nr.${array_format}.fasta");
  $self->NAMES_FILE($self->OUTPUT_DIR."/arrays.${array_format}.names");
 


  #Now defined as separate logic name config in ImportArrays config
  #now check for ARRAY_FORMAT defined config
  #we could set this here too
  #foreach my $config_var ('IIDREGEXP', 'IFIELDORDER', 'INPUT_FORMAT'){
	
#	if ( ! defined $self->{"_CONFIG_${config_var}"}{$self->ARRAY_FORMAT}){
#	  throw('You must define a '.$self->ARRAY_FORMAT." entry in the $config_var config for logic '$logic'");
#	}
#  }


  #Why does ouput DB not have to be defined? We are actually checking for it above
  #What about DNADB?
  
  #remove?
  # output db does not have to be defined, but if it is, it should be a hash
  if ( ref( $self->OUTDB ) ne "HASH"  || ! defined $self->OUTDB->{-dbname}) {
    throw("OUTDB in config for '$logic' must be a hash ref of db connection pars.");
  }


}

### Dynamic config
#Now built from input id

}

run

description

sub run {

  my $self = shift;

  #If we are dealing with fasta then just the one run method should do
  #If not then we need to write a different parse method for the input format
  #Need INPUT_FORMAT in config?

  my $method = 'run_'.$self->INPUT_FORMAT;#remove get from this method

  $self->$method;
}


#The idea is that we could handle other formats here.
#This overlaps with the idea of Parsers, so need to merge these at some point
#Fastas are not design files, parsers handle design files.

}

run_FASTA

description

sub run_FASTA {

  my ($self) = @_;
 
  my (%probes_by_sequence, %array_chips, %probe_attrs, $probe_set);
  my ($current_array_chip, $existing_probe, $current_sequence, $sequence_fragment);
  
  #These get_CONFIGFIELD methods use the ARRAY_FORMAT to retrieve the correct config
  my $header_regex = $self->get_IIDREGEXP;
  my %valid_fields = (
				-probe_set => undef,
				-name       => undef,
				-array      => undef,
				-array_chip => undef,
			   );
  my %field_order = %{$self->get_IFIELDORDER};

  #Validate field
  foreach my $config_field(keys(%field_order)){

	if(! exists $valid_fields{$config_field}){
	  throw("Found invalid field on ImportArrays.pm config:\t$config_field\n".
			"IFIELDORDER must only contain keys:\t".join("\t", keys %valid_fields));
	}
  }

  #This is done so we can dynamically assign fields using their match order
  #Don't actually use 4 and 5 here, but here just in case config regexps are extended
  my @match_refs = (\$1,\$ 2,\$ 3,\$ 4,\$ 5);
  #or is there a special array to hold these?

#  warn "We need to count and report the number of probes imported, else throw if none imported\n".
  #"or if it doesn't match half the wc of the input file";
  
  open( PROBES, "<".$self->query_file);
 
  while(<PROBES>){
    chomp;
	
    if(/$header_regex$/){
	
      if($current_sequence){
		
        if(! $current_array_chip){
          throw ("Have sequence $current_sequence but no current array chip!\n");
        }

		#You can only place the probe into the hash at this point, because
        #it's only at this point that you know you have the full sequence
		#Set probeset here as we delete from hash when creating probe
		$probe_set = (exists $probe_attrs{'-probe_set'}) ?  $probe_attrs{'-probe_set'} : undef;
        $existing_probe = $probes_by_sequence{$probe_set}{$current_sequence};

        if(! $existing_probe){

          $existing_probe = $self->create_new_probe(
													$current_array_chip,\% 
													probe_attrs,
													length($current_sequence),
												   );

          $probes_by_sequence{$probe_set}{$current_sequence} = $existing_probe;
        }
		else{
        
		  $self->add_array_chip_to_existing_probe
			(
			 $existing_probe, 
			 $current_array_chip, 
			 $probe_set,
			 $probe_attrs{'-name'},
			);
        }

        $current_sequence = undef;
      }


	  #Set probe attrs using field order config
	  #Do not use map as this is slow
	  foreach my $field(keys %field_order){
		$probe_attrs{$field} = ${$match_refs[$field_order{$field}]};
	  }

	  #Set current array chip
      $current_array_chip = $array_chips{$probe_attrs{-array_chip}};

	  #Create new array chip
      if(! $current_array_chip){		
        $current_array_chip = $self->create_new_array_chip($probe_attrs{-array}, $probe_attrs{-array_chip});
        $array_chips{$probe_attrs{-array_chip}} = $current_array_chip;
      }

    }
	elsif(/^[atgcuATGCU]+$/){#build sequence
      $sequence_fragment = $_;

      if($current_sequence){
        $current_sequence = $current_sequence.$sequence_fragment;
      }
	  else{
        $current_sequence = $sequence_fragment;
      }
    }
	else{
	  throw('Found header which does not match '.$self->INPUT_FORMAT." regex($header_regex):\n$_");
	}
  }

  #deal with last entry
  $array_chips{$probe_attrs{-array_chip}} = $current_array_chip;
$probe_set = (exists $probe_attrs{'-probe_set'}) ? $probe_attrs{'-probe_set'} : undef;
  $existing_probe = $probes_by_sequence{$probe_set}{$current_sequence};

  if(! $existing_probe){
    $existing_probe =  $self->create_new_probe(
											   $current_array_chip,\% 
											   probe_attrs,
											   length($current_sequence),
											  );

    $probes_by_sequence{$probe_set}{$current_sequence} = $existing_probe;
  }
  else{
    $self->add_array_chip_to_existing_probe
	  (
	   $existing_probe, 
	   $current_array_chip, 
	   $probe_set,
	   $probe_attrs{'-name'},
	  );
  }  

  $self->probes(\%probes_by_sequence);

  return;

}

write_output

description

next

sub write_output {

  my ( $self, @output ) = @_;

  my $outdb = $self->outdb;
  my $outfile = $self->NON_REDUNDANT_PROBE_SEQS;
  my $probe_adaptor = $outdb->get_ProbeAdaptor;
  my $probeset_adaptor = $outdb->get_ProbeSetAdaptor;
  
  open (OUTFILE, ">".$outfile) || throw("Failed to open ouput file:\t".$outfile);
    
  #Store all probes
  foreach my $probeset(keys %{$self->probes}){	
	my %probes = %{$self->probes->{$probeset}};

	if($probeset){
	  $probeset = Bio::EnsEMBL::Funcgen::ProbeSet->new
		(
		 -name => $probeset,
		 -size => scalar(values %probes),
		 #-array_chip => #do we need to add array_chip_id to the probeset table?
		 #-family => ?,
		);
	  ($probeset) = @{$probeset_adaptor->store($probeset)};
	}

	foreach my $sequence(keys %probes){
	  my $probe = $probes{$sequence};
	  $probe->probeset($probeset) if $probeset;
	  
	  ($probe) = @{$probe_adaptor->store($probe)};

	  print OUTFILE ">".$probe->dbID."\n".$sequence."\n";
	}
  }

  close(OUTFILE);


  #No we record imported as we need the nr_fasta dump?
  #Or are we going to allow this to run if already import
  #and just create the nr_fasta by querying the db for probe dbID
  #and using the sequence from the input file?


  #And now the array names and states
  $outfile = $self->NAMES_FILE;
  open (OUTFILE, ">".$outfile) || throw("Failed to open ouput file:\t".$outfile);

  foreach my $aname(keys %{$self->{'_array_names'}}){
	print OUTFILE $aname."\n";
	$self->{'_array_names'}->{$aname}->add_status('IMPORTED');
	$self->{'_array_names'}->{$aname}->adaptor->store_states($self->{'_array_names'}->{$aname});
  }

  close(OUTFILE);

  return;
}

############################################################

#problem here is that the super new is callign this before we get a chance to define it here
#rename method
#is db is super pipeline db?

}

General documentation

AUTHOR

This module was written by Nathan Johnson, based on the CollapseAffy/Oligo code.

CONTACT

Post general queries to ensembl-dev@ebi.ac.uk