Bio::EnsEMBL::Analysis::RunnableDB CollapseAffyProbes
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Analysis::RunnableDB::Exonerate2Genes;
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::AffyArray
Bio::EnsEMBL::AffyProbe
Bio::EnsEMBL::Analysis::Config::CollapseAffyProbes
Bio::EnsEMBL::Analysis::RunnableDB
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Inherit
Bio::EnsEMBL::Analysis::RunnableDB
Synopsis
my $affy =
Bio::EnsEMBL::Analysis::RunnableDB::CollapseAffyProbes->new(
-db => $refdb,
-analysis => $analysis_obj,
-database => $EST_GENOMIC,
-query_seqs => \@sequences,
);
$affy->fetch_input();
$affy->run();
$affy->output();
$affy->write_output(); #writes to DB and a big fasta file
Description
This object runs the first step in the process for mapping affymetrix probes to a genome.
The probes supplied are redundant - a single probe (characterised by a unique sequence)
may occur in different positions of different arrays. SO we will first collect all
redundant sequences, find a non-redudant subset, and then simultaneously
-- write the probes to our db, and
-- write an output file of non-redundant sequences to a flat file, so that
exonerate can map them against the genome in the next step.
Note that probes are defined as redundant when they share the same
- sequence and
- probeset
Methods
NON_REDUNDANT_PROBE_SEQS
No description
Code
OUTDB
No description
Code
QUERYSEQS
No description
Code
add_array_to_existing_probe
No description
Code
affy_arrays
No description
Code
affy_probes
No description
Code
clean_affy_features
No description
Code
create_new_array
No description
Code
create_new_probe
No description
Code
fetch_input
No description
Code
get_output_db
No description
Code
new
No description
Code
non_redundant_probe_seqs
No description
Code
populate_affy_arrays_and_probes
No description
Code
query_file
No description
Code
read_and_check_config
No description
Code
run
No description
Code
write_output
No description
Code
Methods description
None available.
Methods code
NON_REDUNDANT_PROBE_SEQSdescriptionprevnextTop
sub NON_REDUNDANT_PROBE_SEQS {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_NON_REDUNDANT_PROBE_SEQS'} = $value;
  }

  if ( exists( $self->{'_NON_REDUNDANT_PROBE_SEQS'} ) ) {
    return $self->{'_NON_REDUNDANT_PROBE_SEQS'};
  } else {
    return undef;
  }
}
OUTDBdescriptionprevnextTop
sub OUTDB {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_OUTDB'} = $value;
  }

  if ( exists( $self->{'_CONFIG_OUTDB'} ) ) {
    return $self->{'_CONFIG_OUTDB'};
  } else {
    return undef;
  }
}

###############################################
### end of config
###############################################
1;
}
QUERYSEQSdescriptionprevnextTop
sub QUERYSEQS {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_CONFIG_QUERYSEQS'} = $value;
  }

  if ( exists( $self->{'_CONFIG_QUERYSEQS'} ) ) {
    return $self->{'_CONFIG_QUERYSEQS'};
  } else {
    return undef;
  }
}
add_array_to_existing_probedescriptionprevnextTop
sub add_array_to_existing_probe {
  my ($self, $probe, $array, $probeset, $probename) = @_;
  my @all_probenames = @{$probe->get_all_probenames};

      
  if(!($probe->probeset eq $probeset)){
    throw (
      "Inconsistency: have found a probe ".$array->name.":$probeset:$probename with ".
      "identical sequence but different probeset to another one: ".$probe->probeset."\n"
    );
  }
  $probe->add_Array_probename($array, $probename);
}
affy_arraysdescriptionprevnextTop
sub affy_arrays {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_affy_arrays'} = $value;
  }

  if ( exists( $self->{'_affy_arrays'} ) ) {
    return $self->{'_affy_arrays'};
  } else {
    return undef;
  }
}

#############################################################
}
affy_probesdescriptionprevnextTop
sub affy_probes {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_affy_probes'} = $value;
  }

  if ( exists( $self->{'_affy_probes'} ) ) {
    return $self->{'_affy_probes'};
  } else {
    return undef;
  }
}


#############################################################
# Declare and set up config variables
#############################################################
}
clean_affy_featuresdescriptionprevnextTop
sub clean_affy_features {
  my ( $self, @affy_features ) = @_;

  my $slice_adaptor = $self->db->get_SliceAdaptor;

  my %genome_slices;

  foreach my $affy_feature (@affy_features) {

    $affy_feature->analysis( $self->analysis );

    # get the slice based on the seqname stamped on in the runnable
my $slice_id = $affy_feature->seqname; if ( not exists $genome_slices{$slice_id} ) { # assumes genome seqs were named in the Ensembl API Slice naming
# convention, i.e. coord_syst:version:seq_reg_id:start:end:strand
$genome_slices{$slice_id} = $slice_adaptor->fetch_by_name($slice_id); } my $slice = $genome_slices{$slice_id}; $affy_feature->slice($slice); #This is a hack: a probe might sit on two arrays, but I don't
# know how to resolve this ambiguity now, or whether it's significant
my $array_name = $affy_feature->probe->get_all_AffyArrays->[0]->name; my $probe_name = $array_name.":".$affy_feature->probeset.":".$affy_feature->probe->get_all_probenames->[0]; my $real_probe = $self->affy_probes->{$probe_name}; if(!$real_probe){ throw "Inconsistency! I can't find an affy probe corresponding to $probe_name\n"; } #print "setting real probe: ".$real_probe->get_all_AffyArrays->[0]->name.":".
$affy_feature->probe($real_probe); } } ############################################################
}
create_new_arraydescriptionprevnextTop
sub create_new_array {
  my($self, $array_name) = @_;
  my $affy_array =
     new Bio::EnsEMBL::AffyArray(
       -name => $array_name,
       -setsize => 0,
     );
}
############################################################
}
create_new_probedescriptionprevnextTop
sub create_new_probe {
  my($self, $affy_array, $probeset, $probe_name, $current_sequence) = @_;

  my $affy_probe =
     new Bio::EnsEMBL::AffyProbe(
       -probeset => $probeset,
       -name => $probe_name,
       -array => $affy_array
     );

  return $affy_probe;
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
  my ($self) = @_;

  my $logic = $self->analysis->logic_name;

  my ($query_file, $chunk_number, $chunk_total);

  my $query = $self->QUERYSEQS;

  if ( -e $query and -d $query ) {
    throw "I need to have all affy probes input in one big file\n";
  } elsif ( -e $query and -s $query ) {
    #store this for reference later
$self->query_file($query); } else { throw("'$query' refers to something that could not be made sense of\n"); } $self->non_redundant_probe_seqs($self->NON_REDUNDANT_PROBE_SEQS);
}
get_output_dbdescriptionprevnextTop
sub get_output_db {
  my ($self) = @_;

  my $outdb;

  if ( $self->OUTDB ) {
    $outdb = new Bio::EnsEMBL::DBSQL::DBAdaptor( %{ $self->OUTDB }, -dnadb => $self->db );
  } else {
    $outdb = $self->db;
  }

  return $outdb;
}

############################################################
#
# The QUERYSEQS config variable.
#
}
newdescriptionprevnextTop
sub new {
  my ( $class, @args ) = @_;
  my $self = $class->SUPER::new(@args);

  $self->read_and_check_config($AFFY_CONFIG);
  return $self;
}
non_redundant_probe_seqsdescriptionprevnextTop
sub non_redundant_probe_seqs {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_non_redudant_probe_seqs'} = $value;
  }

  if ( exists( $self->{'_non_redudant_probe_seqs'} ) ) {
    return $self->{'_non_redudant_probe_seqs'};
  } else {
    return undef;
  }
}
#############################################################
}
populate_affy_arrays_and_probesdescriptionprevnextTop
sub populate_affy_arrays_and_probes {
  my($self, @args) = @_;
  
  my $query_file = $self->query_file;

  my $outdb        = $self->get_output_db;
  my $affy_array_adaptor = $outdb->get_AffyArrayAdaptor;
  my $affy_probe_adaptor = $outdb->get_AffyProbeAdaptor;
  
  #
#make a pass through each fasta header in the query file,
#and sift out the array and probe id.
#If they don't exist in our %affy_array and %affy_probe hashes,
# store an object for them.
open(PROBES, "<".$self->query_file); while(<PROBES>){ chomp; /^>probe:(\S+):(\S+):(\S+:\S+;).*$/; my $array_name = $1; my $probeset = $2; my $probe_name = $3; if (!$array_name){ throw "array name could not be deduced from: ".$_."\n"; } if( !$probeset){ throw "probeset could not be deduced from: ".$_."\n"; } if(!$probe_name){ throw "probename could not be deduced from: ".$_."\n"; } my $affy_array = $affy_array_adaptor->fetch_by_name($array_name); if(!$affy_array){ $affy_array = new Bio::EnsEMBL::AffyArray( -name => $array_name ); } $self->affy_arrays->{$array_name} = $affy_array; my $affy_probe = $affy_probe_adaptor->fetch_by_array_probeset_probe( $array_name, $probeset, $probe_name ); if(!$affy_probe){ $affy_probe = new Bio::EnsEMBL::AffyProbe( -probeset => $probeset, -name => $probe_name, -array => $affy_array ); } $self->affy_probes->{$array_name.":".$probeset.":".$probe_name} = $affy_probe; } } ############################################################
}
query_filedescriptionprevnextTop
sub query_file {
  my ( $self, $value ) = @_;

  if ( defined $value ) {
    $self->{'_query_file'} = $value;
  }

  if ( exists( $self->{'_query_file'} ) ) {
    return $self->{'_query_file'};
  } else {
    return undef;
  }
}

#############################################################
}
read_and_check_configdescriptionprevnextTop
sub read_and_check_config {
  my $self = shift;

  $self->SUPER::read_and_check_config($AFFY_CONFIG);

  ##########
# CHECKS
##########
my $logic = $self->analysis->logic_name; # check that compulsory options have values
foreach my $config_var ( qw( QUERYSEQS NON_REDUNDANT_PROBE_SEQS OUTDB ) ){ if ( not defined $self->$config_var ){ throw("You must define $config_var in config for logic '$logic'"); } } # output db does not have to be defined, but if it is, it should be a hash
if ( $self->OUTDB and ref( $self->OUTDB ) ne "HASH" ) { throw("OUTDB in config for '$logic' must be a hash ref of db connection pars."); }
}
rundescriptionprevnextTop
sub run {
  my ($self) = @_;
  my %probes_by_sequence;
  my %affy_arrays;
  
  open( PROBES, "<".$self->query_file);

  my $current_sequence = undef;
  my $sequence_fragment = undef;

  #
# Dont forget to keep a set of arrays for writing as well!
#
my $array_name = undef; my $probeset = undef; my $probe_name = undef; my $current_affy_array = undef; my $existing_probe = undef; while(<PROBES>){ chomp; if(/^>probe:(\S+):(\S+):(\S+:\S+;).*$/){ if($current_sequence){ if(!$current_affy_array){ throw ("Have sequence $current_sequence but no current array !\n"); } #
#You can only place the probe into the hash at this point, because
#it's only at this point that you know you have the full sequence
$existing_probe = $probes_by_sequence{$probeset.":---:".$current_sequence}; if(!$existing_probe){ $existing_probe = $self->create_new_probe( $current_affy_array, $probeset, $probe_name, $current_sequence ); $probes_by_sequence{$probeset.":---:".$current_sequence} = $existing_probe; }else{ $self->add_array_to_existing_probe( $existing_probe, $current_affy_array, $probeset, $probe_name ); } $current_sequence = undef; } $array_name = $1; $probeset = $2; $probe_name = $3; $current_affy_array = $affy_arrays{$array_name}; if(!$current_affy_array){ $current_affy_array = $self->create_new_array($array_name); $affy_arrays{$array_name} = $current_affy_array; } }else{ $sequence_fragment = $_; if($current_sequence){ $current_sequence = $current_sequence . $sequence_fragment; }else{ $current_sequence = $sequence_fragment; } } } $affy_arrays{$array_name} = $current_affy_array; $existing_probe = $probes_by_sequence{$probeset.":---:".$current_sequence}; if(!$existing_probe){ $existing_probe = $self->create_new_probe( $current_affy_array, $probeset, $probe_name, $current_sequence ); $probes_by_sequence{$probeset.":---:".$current_sequence} = $existing_probe; }else{ $self->add_array_to_existing_probe( $existing_probe, $current_affy_array, $probeset, $probe_name ); } $self->affy_probes(\%probes_by_sequence); $self->affy_arrays(\%affy_arrays);
}
write_outputdescriptionprevnextTop
sub write_output {
  my ( $self, @output ) = @_;

  my $outdb        = $self->get_output_db;
  my $outfile = $self->non_redundant_probe_seqs;
  open (OUTFILE, ">".$outfile);
  my $affy_array_adaptor = $outdb->get_AffyArrayAdaptor;
  my $affy_probe_adaptor = $outdb->get_AffyProbeAdaptor;

  foreach my $affy_array (values %{$self->affy_arrays}){
    my $existing_affy_array = $affy_array_adaptor->fetch_by_name($affy_array->name);
    if(!$affy_array->dbID){
      eval{ $affy_array_adaptor->store($affy_array) };
      if ($@) {
        $self->throw("Unable to store affy array!\n $@");
      }
    }
  }

  foreach my $probeset_sequence_key (keys %{$self->affy_probes}){
    my $affy_probe = $self->affy_probes->{$probeset_sequence_key};
    if(!$affy_probe->dbID){
      eval{ $affy_probe_adaptor->store($affy_probe) };
      if ($@) {
        $self->throw("Unable to store affy probe!\n $@");
      }
    }
    
    #Now that you've stored it, you have the probe's dbID, so you can write out the NEW
#fasta file, with that dbID as a header...
$probeset_sequence_key =~ /.*:---:(.*)/; my $sequence = $1; print OUTFILE ">".$affy_probe->dbID."\n"; print OUTFILE $sequence."\n"; } close(OUTFILE) or throw("Failed top close ".$outfile); } ############################################################
}
General documentation
CONTACTTop
Post general queries to ensembl-dev@ebi.ac.uk
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _