Raw content of Bio::EnsEMBL::Funcgen::Probe
#
# Ensembl module for Bio::EnsEMBL::Funcgen::Probe
#
# You may distribute this module under the same terms as Perl itself
=head1 NAME
Bio::EnsEMBL::Funcgen::Probe - A module to represent a nucleotide probe.
=head1 SYNOPSIS
use Bio::EnsEMBL::Funcgen::Probe;
#
my $probe = Bio::EnsEMBL::Funcgen::Probe->new(
-PROBE_SET => $probe_set,
-NAME => 'Probe-1',
-ARRAY => $array,
-ARRAY_CHIP_ID => $ac_dbid,
-CLASS => "EXPERIMENTAL",
);
=head1 DESCRIPTION
An Probe object represents an probe on a microarray. The data (currently the
name, probe_set_id, length, pair_index and class) are stored
in the oligo_probe table.
For Affy arrays, a probe can be part of more than one array, but only part of
one probeset. On each Affy array the probe has a slightly different name. For
example, two different complete names for the same probe might be
DrosGenome1:AFFX-LysX-5_at:535:35; and Drosophila_2:AFFX-LysX-5_at:460:51;. In
the database, these two probes will have the same oligo_probe_id. Thus the same
Affy probe can have a number of different names and complete names depending on
which array it is on.
=head1 AUTHOR
This module was created by Nathan Johnson, but is almost entirely based on the
Affy/OligoProbe module written by Arne Stabenau and Ian Sealy.
This module is part of the Ensembl project: /
=head1 CONTACT
Post comments or questions to the Ensembl development list: ensembl-dev@ebi.ac.uk
=head1 METHODS
=cut
use strict;
use warnings;
package Bio::EnsEMBL::Funcgen::Probe;
use Bio::EnsEMBL::Utils::Argument qw( rearrange ) ;
use Bio::EnsEMBL::Utils::Exception qw( throw warning );
use Bio::EnsEMBL::Storable;
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Storable);
=head2 new
Arg [-NAME] : string - probe name
Used when the probe is on one array.
Arg [-NAMES] : Listref of strings - probe names
Used when the probe is on multiple arrays.
Arg [-ARRAY] : Bio::EnsEMBL::Funcgen::Array
Used when the probe is on one array.
Arg [-ARRAYS] : Listref of Bio::EnsEMBL::Funcgen::Array
Used when the probe is on multiple arrays.
Arg [-ARRAY_CHIP_ID] : int - array_chip db ID
Used when the probe is on one array.
Arg [-NAMES] : Listref of ints - arary_chip db IDs
Used when the probe is on multiple arrays.
Arg [-PROBE_SET] : Bio::EnsEMBL::ProbeSet
Each probe is part of one(and only one) probeset, if not probe set
then probeset = probe i.e. probe_set size = 1
Arg [-LENGTH] : int - probe length
Will obviously be the same for all probes if same probe
is on multiple arrays.
Arg [-CLASS] : string - probe class e.g. CONTROL, EXPERIMENTAL
Will be the same for all probes if same probe is on
multiple arrays.
Example : my $probe = Bio::EnsEMBL::Probe->new(
-NAME => 'Probe-1',
-PROBE_SET => $probe_set,
-ARRAY => $array,
-ARRAY_CHIP_ID => $array_chip_id,
-LENGTH => 25,
-CLASS => 'EXPERIMENTAL',
);
Description: Creates a new Bio::EnsEMBL::Probe object.
Returntype : Bio::EnsEMBL::Probe
Exceptions : Throws if not supplied with probe name(s) and array(s)
Caller : General
Status : Medium Risk
=cut
sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my $self = $class->SUPER::new(@_);
my (
$names, $name,
$array_chip_ids, $array_chip_id,
$arrays, $array,
$probeset, $aclass,
$length
) = rearrange([
'NAMES', 'NAME',
'ARRAY_CHIP_IDS', 'ARRAY_CHIP_ID',
'ARRAYS', 'ARRAY',
'PROBE_SET', 'CLASS',
'LENGTH'
], @_);
@$names = ($name) if(ref($names) ne "ARRAY");
@$array_chip_ids = ($array_chip_id) if (ref($array_chip_ids) ne "ARRAY");
@$arrays = ($array) if (ref($arrays) ne "ARRAY");
#We need to record duplicates for each probe_set i.e. each array.
#the relationship is really array_chip to name, as everything else stays the same
#can't have same probe_set_id as this wouldn't maintain relationship
#need unique ps id's or array_chip_id in probe table?
#Then we can miss probeset id's out totally if required
#or should we just duplicate everything with unique db IDs
if (defined $$names[0]) {
if(scalar(@$names) != scalar(@$array_chip_ids)){
throw("You have not specified valid name:array_chip_id pairs\nYou need a probe name for each Array");
}
if(defined $$arrays[0]){
if(scalar(@$names) != scalar(@$arrays)){
throw("You have not specified valid name:Array pairs\nYou need a probe name for each Array\n");
}
}
else{
warn("You have not specified and Array objects, this will result in multiple/redundant queries based on the array_chip_id\nYou should pass Array objects to speed up this process");
#Is this true? We should cache this in the ArrayChip and make sure we're caching it in the caller.
}
# Probe(s) have been specified
# Different names reflect different array
for my $i(0..$#{$names}){
$self->add_array_chip_probename($$array_chip_ids[$i], $$names[$i], $$arrays[$i]);
}
} else {
throw('You need to provide a probe name (or names) to create an Probe');
}
$self->probeset($probeset) if defined $probeset;
$self->class($aclass) if defined $aclass;
$self->length($length) if defined $length;
return $self;
}
#only takes single values for array and array_chip
#as we're shortcuting the constructor and simply blessing the hash
#therefore attr keys should not be lc and not prefix with '-'
=head2 new_fast
Args : Hashref with all internal attributes set
Example : none
Description: Quick and dirty version of new. Only works if the code is very
disciplined. Cannot add array chip probe names unless we recreate
the data structure in the caller.
Returntype : Bio::EnsEMBL::Funcgen::Probe
Exceptions : None
Caller : General
Status : At Risk
=cut
sub new_fast {
bless ($_[1], $_[0]);
}
=head2 add_array_chip_probename
Arg [1] : int - db ID of array_chip
Arg [2] : string - probe name
Arg [3] : Bio::EnsEMBL::Funcgen::Array
Example : $probe->add_array_chip_probename($ac_dbid, $probename, $array);
Description: Adds a probe name / array pair to a probe, allowing incremental
generation of a probe.
Returntype : None
Exceptions : None
Caller : General,
Probe->new(),
ProbeAdaptor->_obj_from_sth(),
AffyProbeAdaptor->_obj_from_sth()
Status : Medium Risk - Change to take ArrayChip object.
=cut
sub add_array_chip_probename {
my $self = shift;
my ($ac_dbid, $probename, $array) = @_;
$self->{ 'arrays' } ||= {};
$self->{ 'probenames' } ||= {};
#mass redundancy here, possibility of fetching same array over and over!!!!!!!!!!!!!!
#Need to implement cache in caller i.e. adaptor
#Made mandatory to force creation of cache
#we need access to adaptor before we can test is valid and stored
#let's no test each time for adaptor as this would slow down
#Just test here instead.
if(! (ref($array) && $array->isa('Bio::EnsEMBL::Funcgen::Array') && $array->dbID)){
#$array = $self->adaptor()->db()->get_ArrayAdaptor()->fetch_by_array_chip_dbID($ac_dbid);
throw('You must pass a valid Bio::EnsEMBL::Funcgen::Array. Maybe you want to generate a cache in the caller?');
}
#mapping between probename and ac_dbid is conserved through array name between hashes
#only easily linked from arrays to probenames,as would have to do foreach on array name
#Can we change the implementation of this so we're only storing the array once, reverse
#the cache? But we want access to the array and using an object reference as a key is ????
#How would this impact on method functionality?
#We now handle multiple names per probe/array
#This will not capture the relationship between
#probe name and position on array!
#Not a problem for affy as name is position
#Currently not a problem for nimblegen as probes never have more than 1 name???
$self->{ 'arrays' }->{$ac_dbid} = $array;
$self->{ 'probenames' }->{$array->name()} ||= [];
push @{$self->{ 'probenames' }->{$array->name()}}, $probename;
return;
}
=head2 get_all_ProbeFeatures
Args : None
Example : my $features = $probe->get_all_ProbeFeatures();
Description: Get all features produced by this probe. The probe needs to be
database persistent.
Returntype : Listref of Bio::EnsEMBL:Funcgen::ProbeFeature objects
Exceptions : None
Caller : General
Status : Medium Risk
=cut
sub get_all_ProbeFeatures {
my $self = shift;
if ( $self->adaptor() && $self->dbID() ) {
return $self->adaptor()->db()->get_ProbeFeatureAdaptor()->fetch_all_by_Probe($self);
} else {
warning('Need database connection to retrieve Features');
return [];
}
}
=head2 get_all_Arrays
Args : None
Example : my $arrays = $probe->get_all_Arrays();
Description: Returns all arrays that this probe is part of. Only works if the
probe was retrieved from the database or created using
add_Array_probename (rather than add_arrayname_probename).
Returntype : Listref of Bio::EnsEMBL::Funcgen::Array objects
Exceptions : None
Caller : General
Status : Medium Risk
=cut
sub get_all_Arrays {
my $self = shift;
# Do we have Array objects for this probe?
if (defined $self->{'arrays'}) {
return [ values %{$self->{'arrays'}} ];
} elsif ( $self->adaptor() && $self->dbID() ) {
# Only have names for arrays, so need to retrieve arrays from database
warning('Not yet implemented');
return [];
} else {
warning('Need database connection to get Arrays by name');
return [];
}
}
=head2 get_all_probenames
Arg [1] : Optional - list of array names, defaults to all available
Example : my @probenames = @{$probe->get_all_probenames()};
Description: Retrieves all names for this probe. Only makes sense for probes
that are part of a probeset (i.e. Affy probes), in which case
get_all_complete_names() would be more appropriate.
Returntype : Listref of strings
Exceptions : None
Caller : General
Status : Medium Risk
=cut
sub get_all_probenames {
my ($self, @array_names) = @_;
my @names;
@array_names = keys %{$self->{'probenames'}} if ! @array_names;
foreach my $array(@array_names){
push @names, @{$self->{'probenames'}->{$array}};
}
return \@names;
}
=head2 get_probename
Arg [1] : string - array name
Example : my $probename = $probe->get_probename('Array-1');
Description: For a given array, retrieve the name for this probe.
Returntype : string
Exceptions : Throws if the array name is required but not specified
Warns if probe has more than one name for the given array.
Caller : General
Status : Medium Risk
=cut
#we can have dulplicate probes on same array for Nimblegen
#what defines and unique probe?
#If we have a duplicate on the same array or even on the same array_chip, then we can still return the same name
#Needs more work
sub get_probename {
my ($self, $arrayname) = @_;
my $probename;
if (! $arrayname){
#Sanity check that there is only one non-AFFY array
my @ac_ids = keys %{$self->{'arrays'}};
if((scalar @ac_ids == 1) && ($self->get_all_Arrays()->[0]->vendor() ne "AFFY")){
$arrayname = $self->get_all_Arrays()->[0]->name();
}
else{
throw('Cannot retrieve name for Probe('.$self->dbID.") without arrayname if more than 1 array chip(@ac_ids) and not NIMBELGEN(".$self->get_all_Arrays()->[0]->vendor().")\n");
}
}
my @names = @{$self->{'probenames'}->{$arrayname}};
if(scalar(@names) > 1){
warn "This probe has more than one probe name for probeset $arrayname:".$self->probeset->name.":\t @names";
}
else{
($probename) = @{$self->{'probenames'}->{$arrayname}};
}
return $probename;
}
=head2 get_all_complete_names
Args : None
Example : my @compnames = @{$probe->get_all_complete_names()};
Description: Retrieves all complete names for this probe. The complete name
is a concatenation of the array name, the probeset name and the
probe name.
Returntype : Listref of strings
Exceptions : None
Caller : General
Status : Medium Risk
=cut
sub get_all_complete_names {
my $self = shift;
my @result = ();
my $probeset = $self->probeset()->name();
$probeset .= ':' if $probeset;
warn "For Nimblegen this need to be Container:Seqid::probeid?";
while ( my (undef, $array) = each %{$self->{'arrays'}} ) {
#would have to put test in here for $self->arrays()->vendor()
#if($array->vendor() eq "AFFY"){
foreach my $name(@{$self->{'probenames'}{$array->name()}}){
push @result, $array->name().":$probeset".$name;
}
}
return \@result;
}
#For affy this matters as name will be different, but not for Nimblegen
#Need to consolidate this
#have get name method which throws if there is more than one array
#detects array vendor and does appropriate method
=head2 get_complete_name
Arg [1] : string - array name
Example : my $compname = $probe->get_complete_name('Array-1');
Description: For a given array, retrieve the complete name for this probe.
Returntype : string
Exceptions : Throws if the array name is not known for this probe
Caller : General
Status : Medium Risk
=cut
sub get_complete_name {
my $self = shift;
my $arrayname = shift;
my $probename = $self->{'probenames'}->{$arrayname};
if (!defined $probename) {
throw('Unknown array name');
}
my $probeset = $self->probeset()->name();
$probeset .= ':' if $probeset;
return "$arrayname:$probeset$probename";
}
=head2 probeset
Arg [1] : (optional) Bio::EnsEMBL::Funcgen::ProbeSet
Example : my $probe_set = $probe->probeset();
Description: Getter and setter of probe_set attribute for Probe objects.
Returntype : string
Exceptions : None
Caller : General
Status : Medium Risk
=cut
sub probeset {
my $self = shift;
$self->{'probe_set'} = shift if @_;
return $self->{'probe_set'};
}
=head2 class
Arg [1] : (optional) string - class
Example : my $class = $probe->class();
Description: Getter and setter of class attribute for Probe
objects e.g. CONTROL, EXPERIMENTAL
Returntype : string
Exceptions : None
Caller : General
Status : Medium Risk
=cut
sub class {
my $self = shift;
$self->{'class'} = shift if @_;
return $self->{'class'};
}
=head2 length
Arg [1] : (optional) int - probe length
Example : my $probelength = $probe->length();
Description: Getter and setter of length attribute for Probe
objects.
Returntype : int
Exceptions : None
Caller : General
Status : Medium Risk
=cut
sub length {
my $self = shift;
$self->{'length'} = shift if @_;
return $self->{'length'};
}
### ARRAY DESIGN SPECIFIC METHODS
### Sould really split this into BaseProbeDesign and eFGProbeDesign wrapper class?
### Quick implementation for now
### These analyses are just the core Analysis classes representing
### the output from a RunnableDB ensembl-analysis.
### Note scores added to previously stored probes may never make it into the DB?
### May also get duplicate record for previously stored probes?
### This will only happen if we're Collapsing or we fully implemnt the nr probe id to seq theory
=head2 add_Analysis_score
Arg [1] : Bio::EnsEMBL::Analysis
Arg [2] : string - analysis score (as string a precision may differ between analyses)??
Example : $probe->add_Analysis_score($analysis, $score);
Description: Setter for probe analysis attributes from an array design
Returntype : None
Exceptions : throws if args are not met or valid
Caller : General
Status : at risk
=cut
sub add_Analysis_score{
my ($self, $anal, $score) = @_;
if(! ($anal && $anal->dbID() && $anal->isa("Bio::EnsEMBL::Analysis"))){
throw("Must provide a valid stored Bio::EnsEMBL::Analysis");
}
throw("Must provide a score to add to the probe") if ! defined $score;
$self->{'analysis'}{$anal->dbID()} = $score;
return;
}
=head2 add_Analysis_CoordSystem_score
Arg [1] : Bio::EnsEMBL::Analysis
Arg [2] : Bio::EnsEMBL::CoordSystem
Arg [3] : string - analysis score (as string a precision may differ between analyses)??
Example : $probe->add_Analysis_CoordSystem_score($analysis, $coord_sys, $score);
Description: Setter for coord system dependant probe analysis attributes from an array design
Returntype : None
Exceptions : throws if args are not met or valid
Caller : General
Status : at risk
=cut
sub add_Analysis_CoordSystem_score{
my ($self, $anal, $cs, $score) = @_;
if(! ($anal && $anal->dbID() && $anal->isa("Bio::EnsEMBL::Analysis"))){
throw("Must provide a valid stored Bio::EnsEMBL::Analysis");
}
if(! ($cs && $cs->dbID() && $cs->isa("Bio::EnsEMBL::Funcgen::CoordSystem"))){
throw("Must provide a valid stored Bio::EnsEMBL::Funcgen::CoordSystem");
}
throw("Must provide a score to add to the probe") if ! defined $score;
$self->{'analysis_coord_system'}{$anal->dbID()}{$cs->dbID()} = $score;
return;
}
=head2 get_score_by_Analysis
Arg [1] : Bio::EnsEMBL::Analysis
Example : my $anal_score = $probe->get_analysis_score($analysis);
Description: Setter for probe analysis attributes from an array design
Returntype : string
Exceptions : throws if args are not met or valid
Caller : General
Status : at risk
=cut
sub get_score_by_Analysis{
my ($self, $anal) = @_;
$self->get_all_design_scores() if ! defined $self->{'analysis'};
if(! ($anal && $anal->dbID() && $anal->isa("Bio::EnsEMBL::Analysis"))){
throw("Must provide a valid stored Bio::EnsEMBL::Analysis");
}
return (exists $self->{'analysis'}{$anal->dbID()}) ? $self->{'analysis'}{$anal->dbID()} : undef;
}
=head2 get_score_by_Analysis_CoordSystem
Arg [1] : Bio::EnsEMBL::Analysis
Arg [2] : Bio::EnsEMBL::CoordSystem
Arg [3] : string - analysis score (as string a precision may differ between analyses)??
Example : $probe->add_analysis($analysis, $coord_sys, $score);
Description: Setter for coord system dependant probe analysis attributes from an array design
Returntype : None
Exceptions : throws if args are not met or valid
Caller : General
Status : at risk
=cut
sub get_score_by_Analysis_CoordSystem{
my ($self, $anal, $cs) = @_;
$self->get_all_design_scores() if ! defined $self->{'analysis_coord_system'};
if(! ($anal && $anal->dbID() && $anal->isa("Bio::EnsEMBL::Analysis"))){
throw("Must provide a valid stored Bio::EnsEMBL::Analysis");
}
if(! ($cs && $cs->dbID() && $cs->isa("Bio::EnsEMBL::Funcgen::CoordSystem"))){
throw("Must provide a valid stored Bio::EnsEMBL::Funcgen::CoordSystem");
}
my $score = undef;
if(exists $self->{'analysis_coord_system'}{$anal->dbID()} &&
exists $self->{'analysis_coord_system'}{$anal->dbID()}{$cs->dbID()}){
$score = $self->{'analysis_coord_system'}{$anal->dbID()}{$cs->dbID()};
}
return $score;
}
=head2 get_all_design_scores
Arg [1] : Boolean - No fetch flag, to fetch design scores from DB, used in adaptor
To avoid testing DB for each probe when no design scores have been added.
Example : my @anal_score_coordsets = @{$probe->get_all_design_scores()};
Description: Gets all design scores as analysis_id, score and optionally coord_system_id
Returntype : ARRAYREF
Exceptions : throws if no fetch flag is not defined and adaptor or probe is not defined and or stored.
Caller : General
Status : at risk
=cut
#not named get_all_Analysis_scores as this would imply only non-cs dependent scores
#hence named after table, as this returns simple table records
sub get_all_design_scores{
my ($self, $no_fetch) = @_;
my ($analysis_id, $cs_id, $score, @design_scores);
if(! $no_fetch){#can assume we have none stored already due to implementation of add methods
throw("Probe must have and adaptor to fetch design scores from the DB") if(! $self->adaptor());
foreach my $probe_analysis(@{$self->adaptor->fetch_all_design_records($self)}){
#we can't use the add methods here as would be cyclical
#nor do we need extra validation
($analysis_id, $cs_id, $score) = @$probe_analysis;
if($cs_id){
$self->{'analysis_coord_system'}{$analysis_id}{$cs_id} = $score;
}else{
$self->{'analysis'}{$analysis_id} = $score;
}
}
}
#populate array from attrs
if(exists $self->{'analysis_coord_system'}){
foreach $analysis_id(keys %{$self->{'analysis_coord_system'}}){
foreach $cs_id(keys %{$self->{'analysis_coord_system'}{$analysis_id}}){
push @design_scores, [$analysis_id, $self->{'analysis_coord_system'}{$analysis_id}{$cs_id}, $cs_id];
}
}
}
if(exists $self->{'analysis'}){
foreach $analysis_id(keys %{$self->{'analysis'}}){
push @design_scores, [$analysis_id, $self->{'analysis'}{$analysis_id}];
}
}
return \@design_scores;
}
#do we need get_all methods for Analysis and Analysis_CoordSystem?
#maybe if we split into another Class and Adaptor
1;