Summary | Included libraries | Package variables | Synopsis | Description | General documentation | Methods |
WebCvs | Raw content |
_columns | Description | Code |
_default_where_clause | Description | Code |
_objs_from_sth | Description | Code |
_remap | No description | Code |
_tables | Description | Code |
fetch_all_by_Slice_ResultSet | Description | Code |
fetch_results_by_probe_id_ResultSet | Description | Code |
list_dbIDs | Description | Code |
resolve_replicates_by_ResultSet | Description | Code |
store | Description | Code |
window_sizes | Description | Code |
_columns | code | next | Top |
Args : None |
_default_where_clause | code | prev | next | Top |
Args : None |
_objs_from_sth | code | prev | next | Top |
Arg [1] : DBI statement handle object |
_tables | code | prev | next | Top |
Args : None |
fetch_all_by_Slice_ResultSet | code | prev | next | Top |
Arg[0] : Bio::EnsEMBL::Slice - Slice to retrieve results from |
fetch_results_by_probe_id_ResultSet | code | prev | next | Top |
Arg [1] : int - probe dbID |
list_dbIDs | code | prev | next | Top |
Args : None |
resolve_replicates_by_ResultSet | code | prev | next | Top |
Arg[0] : HASHREF - chip_channel_id => @scores pairs |
store | code | prev | next | Top |
Args : List of Bio::EnsEMBL::Funcgen::ResultFeature objects |
window_sizes | code | prev | next | Top |
Args : None |
sub _columns {

=pod

  Args       : None
  Example    : None
  Description: PROTECTED implementation of superclass abstract method.
               Returns the columns to select for the current query mode:
               result_feature table columns when the package-level
               $_result_feature_set flag is true, otherwise the
               result/probe_feature/chip_channel columns, optionally
               extended with probe columns when $_probe_extend is set.
  Returntype : List of strings - SQL column aliases
  Exceptions : None
  Caller     : generic_fetch
  Status     : At Risk

=cut

  my $self = shift;

  # NOTE(review): this needs to be dynamic, dependent on whether the
  # ResultSet has the 'result_feature' status.
  # 'window' was removed as we don't want to return it.
  # Could remove seq_region_id, but this may cause problems with attempts to map.
  # Should add strand to this to enable stranded features.
  my @result_columns = qw (r.score pf.seq_region_start pf.seq_region_end pf.seq_region_strand cc.chip_channel_id);

  if($_probe_extend){
    # We can get this directly from the ProbeAdaptor.
    # Then we can use split/commoditised methods for generation of probe
    # objects external to the ProbeAdaptor.
    push @result_columns, qw(p.probe_id p.probe_set_id p.name p.length p.array_chip_id p.class);
  }

  # Can we remove result_feature_id, result_set_id and seq_region_id?
  # Need to dynamically add strand dependent on whether feature is stranded?
  # May need to add rf.seq_region_id if we want to create Collections.
  # Have also left out window, as we don't need to know this.
  return $_result_feature_set ? qw(rf.seq_region_start rf.seq_region_end rf.seq_region_strand rf.score rf.seq_region_id) : @result_columns;
}
sub _default_where_clause {

=pod

  Args       : None
  Example    : None
  Description: PROTECTED implementation of superclass abstract method.
               Returns the probe join clause when probe extension is
               requested for a non-result_feature query; otherwise an
               empty string.
  Returntype : String - SQL where clause fragment
  Exceptions : None
  Caller     : generic_fetch
  Status     : At Risk

=cut

  my $self = shift;

  return 'r.probe_id = p.probe_id' if $_probe_extend && ! $_result_feature_set;

  # Explicit empty clause rather than an implicit undef return, which would
  # trigger uninitialized-value warnings when concatenated into SQL.
  return '';
}
sub _objs_from_sth {

=pod

  Arg [1]    : DBI statement handle object
  Example    : None
  Description: PROTECTED implementation of superclass abstract method.
               Creates lightweight ResultFeature objects from an executed
               statement handle. This is only used for result_feature table
               queries (standard/new queries); extended probe queries never
               reach this method.
  Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::ResultFeature objects
  Exceptions : None
  Caller     : generic_fetch
  Status     : At Risk

=cut

  my ($self, $sth) = @_;

  my (@rfeats, $sr_start, $sr_end, $sr_strand, $score, $sr_id);

  # Column order must match _columns for the result_feature query mode.
  $sth->bind_columns(\$sr_start, \$sr_end, \$sr_strand, \$score, \$sr_id);

  my %slice_hash;
  my $slice_adaptor = $self->db->get_SliceAdaptor;

  while ( $sth->fetch() ) {

    # Translate the eFG seq_region_id to the core seq_region_id so we can
    # fetch the slice from the dna DB.
    my $csr_id = $self->get_core_seq_region_id($sr_id);

    if(! $csr_id){
      warn "Cannot get slice for eFG seq_region_id $sr_id\n".
        "The region you are using is not present in the current dna DB";
      next;
    }

    # Cache slices per seq_region to avoid refetching inside the loop.
    $slice_hash{$csr_id} ||= $slice_adaptor->fetch_by_seq_region_id($csr_id);

    # We never want the heavy object from the result_feature table method;
    # the heavy object is only created from scratch in order to store it,
    # so _create_feature is deliberately not used here.
    # new_fast arg order: start, end, strand, score, probe, result_set_id,
    # window_size, slice.
    push @rfeats, Bio::EnsEMBL::Funcgen::ResultFeature->new_fast
      ($sr_start, $sr_end, $sr_strand, $score, undef, undef, undef, $slice_hash{$csr_id});
  }

  # Reset for safety, although this will be reset in the fetch method.
  $_result_feature_set = 1;

  return \@rfeats;
}
sub _remap {

=pod

  Args       : None
  Example    : None
  Description: Intended override of the BaseFeatureAdaptor remap behaviour
               to avoid using a slice. NOTE(review): this cannot actually
               override the superclass _remap, as that is called as a plain
               sub rather than a class method — confirm before relying on it.
  Returntype : None
  Exceptions : None
  Caller     : Internal
  Status     : At Risk

=cut

  my $self = shift;

  warn "Overriding remap method to avoid using slice";

  return;
}
sub _tables {

=pod

  Args       : None
  Example    : None
  Description: PROTECTED implementation of superclass abstract method.
               Returns the table/synonym pairs for the current query mode:
               the result_feature table when $_result_feature_set is true,
               otherwise result/probe_feature (plus probe when
               $_probe_extend is set).
  Returntype : List of ARRAYREFs - [table_name, synonym]
  Exceptions : None
  Caller     : generic_fetch
  Status     : At Risk

=cut

  my $self = shift;

  my @result_tables = ( ['result', 'r'], ['probe_feature', 'pf'] );
  push @result_tables, ['probe', 'p'] if $_probe_extend;

  return $_result_feature_set ? ( ['result_feature', 'rf'] ) : @result_tables;
}
sub fetch_all_by_Slice_ResultSet {

=pod

  Arg [1]    : Bio::EnsEMBL::Slice - Slice to retrieve results from
  Arg [2]    : Bio::EnsEMBL::Funcgen::ResultSet - ResultSet to retrieve results from
  Arg [3]    : (optional) string - ExperimentalChip status filter e.g. 'DISPLAYABLE'
  Arg [4]    : (optional) boolean - return Probe information with each feature
  Arg [5]    : (optional) int - display width used to pick a window size (default 700)
  Arg [6]    : (optional) int - explicit window size, must be one of window_sizes
  Arg [7]    : (optional) string - additional SQL constraint
  Example    : my @rfeats = @{$rfa->fetch_all_by_Slice_ResultSet($slice, $rset)};
  Description: Fetches ResultFeatures for a Slice/ResultSet. If the ResultSet
               has the RESULT_FEATURE_SET status, the pre-computed
               result_feature table is queried via the standard slice fetch;
               otherwise the legacy result/probe_feature query is used and
               replicate scores are resolved per locus.
  Returntype : ARRAYREF of Bio::EnsEMBL::Funcgen::ResultFeature objects
               (undef if the ResultSet is channel level)
  Exceptions : Throws if the Slice is not valid, if the ResultSet is not
               stored and valid, if probe information is requested for a
               result_feature set, or if an invalid window size is given.
  Caller     : General
  Status     : At Risk

=cut

  # NOTE(review): consider changing this to optionally take a hashref of all params.
  my ($self, $slice, $rset, $ec_status, $with_probe, $display_size, $window_size, $constraint) = @_;

  if(! (ref($slice) && $slice->isa('Bio::EnsEMBL::Slice'))){
    throw('You must pass a valid Bio::EnsEMBL::Slice');
  }

  $self->db->is_stored_and_valid('Bio::EnsEMBL::Funcgen::ResultSet', $rset);

  if($rset->table_name eq 'channel'){
    warn('Can only get ResultFeatures for an ExperimentalChip level ResultSet');
    return;
  }

  # Package-level flags drive _tables/_columns/_default_where_clause.
  $_result_feature_set = $rset->has_status('RESULT_FEATURE_SET');
  $_probe_extend       = $with_probe if defined $with_probe;

  if($_result_feature_set){

    if($_probe_extend){
      throw("Cannot retrieve Probe information with a result_feature_set query, try using ???");
    }

    # Work out the window size based on the Slice length and display width.
    $display_size ||= 700;
    my $bucket_size  = ($slice->length)/$display_size;
    my @window_sizes = $self->window_sizes;

    if(! defined $window_size){
      # Default to the largest window, then find the smallest window whose
      # size still covers the optimum bucket size.
      $window_size = $window_sizes[$#window_sizes];

      for (my $i = 0; $i <= $#window_sizes; $i++) {
        if ($bucket_size < $window_sizes[$i]){
          $window_size = $window_sizes[$i-1];
          last;
        }
      }
    }
    else{
      # Validate a caller-supplied window size.
      throw("Invalid window size $window_size, must be one of:\t".
            join(', ', @window_sizes)) if ! grep(/^${window_size}$/, @window_sizes);
    }

    $constraint .= ' AND ' if defined $constraint;
    $constraint .= 'rf.result_set_id='.$rset->dbID.' and rf.window_size='.$window_size;

    return $self->fetch_all_by_Slice_constraint($slice, $constraint);
  }

  # --- Legacy method inherited from ResultSetAdaptor below this point ---

  my (@rfeatures, %biol_reps, %rep_scores, @filtered_ids);
  my ($biol_rep, $score, $start, $end, $strand, $cc_id, $old_start, $old_end, $old_strand);
  my ($padaptor, $probe, $array);
  my ($probe_id, $probe_set_id, $pname, $plength, $arraychip_id, $pclass, $probeset);
  my (%array_cache, %probe_set_cache, $ps_adaptor, $array_adaptor);
  my $ptable_syn = '';
  my $pjoin      = '';
  my $pfields    = '';

  # NOTE(review): the with_probe function should be limited to the lowest window size.
  if($with_probe){
    $padaptor      = $self->db->get_ProbeAdaptor;
    $ps_adaptor    = $self->db->get_ProbeSetAdaptor;
    $array_adaptor = $self->db->get_ArrayAdaptor;

    # Extra table, fields and join clause for the probe extension.
    $ptable_syn = ', probe p';
    $pfields    = ', p.probe_id, p.probe_set_id, p.name, p.length, p.array_chip_id, p.class ';
    $pjoin      = ' AND r.probe_id = p.probe_id ';
  }

  my @ids = @{$rset->table_ids()};

  if($ec_status){
    @filtered_ids = @{$self->status_filter($ec_status, 'experimental_chip', @ids)};

    if(! @filtered_ids){
      warn("No ExperimentalChips have the $ec_status status, No ResultFeatures retrieved");
      return \@rfeatures;
    }
  }

  # Build a hash of chip_channel_id => biological_replicate; the biolrep is
  # then used as a key to push all techrep values, which are resolved by
  # resolve_replicates_by_ResultSet.
  my $sql = "SELECT ec.biological_replicate, cc.chip_channel_id from experimental_chip ec, chip_channel cc WHERE cc.table_name='experimental_chip' AND ec.experimental_chip_id=cc.table_id AND cc.table_id IN(".join(', ', (map $_->dbID(), @{$rset->get_ExperimentalChips()})).")";

  my $sth = $self->prepare($sql);
  $sth->execute();
  $sth->bind_columns(\$biol_rep, \$cc_id);

  while($sth->fetch()){
    $biol_rep ||= 'NO_REP_SET';
    $biol_reps{$cc_id} = $biol_rep;
  }

  if(grep(/^NO_REP_SET$/, values %biol_reps)){
    warn 'You have ExperimentalChips with no biological_replicate information, these will be all treated as one biological replicate';
  }

  # NOTE(review): this does not currently handle multiple coord systems,
  # i.e. level mapping.
  my $mcc   = $self->db->get_MetaCoordContainer();
  my $fg_cs = $self->db->get_FGCoordSystemAdaptor->fetch_by_name(
    $slice->coord_system->name(),
    $slice->coord_system->version()
  );
  my $max_len = $mcc->fetch_max_length_by_CoordSystem_feature_type($fg_cs, 'probe_feature');

  # This bypasses fetch_all_by_Slice_constraint, so we need to build the
  # seq_region cache here explicitly.
  $self->build_seq_region_cache($slice);
  my $seq_region_id = $self->get_seq_region_id_by_Slice($slice);

  $sql = 'SELECT STRAIGHT_JOIN r.score, pf.seq_region_start, pf.seq_region_end, pf.seq_region_strand, cc.chip_channel_id '.$pfields.
    ' FROM probe_feature pf, '.$rset->get_result_table().' r, chip_channel cc '.$ptable_syn.
    ' WHERE cc.result_set_id = '. $rset->dbID();

  $sql .= ' AND cc.table_id IN ('.join(' ,', @filtered_ids).')' if ((@filtered_ids != @ids) && $ec_status);

  $sql .= ' AND cc.chip_channel_id = r.chip_channel_id'.
    ' AND r.probe_id=pf.probe_id'.$pjoin.
    ' AND pf.seq_region_id='.$seq_region_id.
    ' AND pf.seq_region_start<='.$slice->end();

  if($max_len){
    # Extend the start boundary by the max feature length so features
    # overlapping the slice start are captured.
    my $start = $slice->start - $max_len;
    $start = 0 if $start < 0;
    $sql .= ' AND pf.seq_region_start >= '.$start;
  }

  $sql .= ' AND pf.seq_region_end>='.$slice->start().
    ' ORDER by pf.seq_region_start';

  $sth = $self->prepare($sql);
  $sth->execute();

  if($with_probe){
    $sth->bind_columns(\$score, \$start, \$end, \$strand, \$cc_id,
                       \$probe_id, \$probe_set_id, \$pname, \$plength, \$arraychip_id, \$pclass);
  }
  else{
    $sth->bind_columns(\$score, \$start, \$end, \$strand, \$cc_id);
  }

  my $position_mod = $slice->start() - 1;
  my $new_probe    = 0;
  my $first_record = 1;

  while ( $sth->fetch() ) {

    # Seed start/end/strand from the first result.
    $old_start  ||= $start;
    $old_end    ||= $end;
    $old_strand ||= $strand;

    # This assumes a feature at the exact same locus is from the same probe.
    # This may not be correct and will collapse probes with the same sequence
    # into one ResultFeature — okay for analysis, but it obscures the probe
    # to result relationship.
    if(($start == $old_start) && ($end == $old_end)){ # Same locus

      # probe will never be defined without with_probe, so we can omit that
      # from the test. A differing probe dbID means an extra probe record
      # from another Array/ArrayChip rather than an on-plate replicate.
      if(defined $probe && ($probe->dbID != $probe_id)){
        $new_probe = 1;
      }
      else{
        push @{$rep_scores{$biol_reps{$cc_id}}}, $score;
      }
    }
    else{ # Found new location
      $new_probe = 1;
    }

    if($new_probe){
      # Store the previous feature with the resolved score.
      # Do not change arg order, this is an array object!!
      push @rfeatures, Bio::EnsEMBL::Funcgen::ResultFeature->new_fast
        ( ($old_start - $position_mod),
          ($old_end   - $position_mod),
          $old_strand,
          $self->resolve_replicates_by_ResultSet(\%rep_scores, $rset),
          $probe,
        );

      undef %rep_scores;
      $old_start  = $start;
      $old_end    = $end;
      # BUGFIX: also reset the strand, otherwise the previous feature's
      # strand was carried over to all subsequent features.
      $old_strand = $strand;

      # Record the new score.
      @{$rep_scores{$biol_reps{$cc_id}}} = ($score);
    }

    if($with_probe){
      # This recreates Probe->_obj_from_sth with local Array/ProbeSet caches,
      # avoiding a fetch per object over the loop.
      $array = $array_cache{$arraychip_id} || $self->db->get_ArrayAdaptor()->fetch_by_array_chip_dbID($arraychip_id);

      if($probe_set_id){
        $probeset = $probe_set_cache{$probe_set_id} || $self->db->get_ProbeSetAdaptor()->fetch_by_dbID($probe_set_id);
      }

      if($first_record || $new_probe){
        $probe = Bio::EnsEMBL::Funcgen::Probe->new
          (
           -dbID          => $probe_id,
           -name          => $pname,
           -array_chip_id => $arraychip_id,
           -array         => $array,
           -probe_set     => $probeset,
           -length        => $plength,
           -class         => $pclass,
           -adaptor       => $padaptor,
          );
      }
      else {
        # Extend the existing probe with another array chip/name record.
        $probe->add_array_chip_probename($arraychip_id, $pname, $array);
      }
    }

    $new_probe    = 0;
    $first_record = 0;
  }

  # Store the last feature, but only if we found previous results.
  # Do not change arg order, this is an array object!!
  if($old_start){
    push @rfeatures, Bio::EnsEMBL::Funcgen::ResultFeature->new_fast
      ( ($old_start - $position_mod),
        ($old_end   - $position_mod),
        $old_strand,
        $self->resolve_replicates_by_ResultSet(\%rep_scores, $rset),
        $probe
      );
  }

  return \@rfeatures;
}
sub fetch_results_by_probe_id_ResultSet {

=pod

  Arg [1]    : int - probe dbID
  Arg [2]    : Bio::EnsEMBL::Funcgen::ResultSet - ResultSet to retrieve results from
  Example    : my @scores = @{$rfa->fetch_results_by_probe_id_ResultSet($pid, $rset)};
  Description: Fetches all result scores for a given probe within a ResultSet,
               ordered by score.
  Returntype : ARRAYREF of score strings
  Exceptions : Throws if no valid stored ResultSet or no probe dbID is passed.
  Caller     : General
  Status     : At Risk

=cut

  my ($self, $probe_id, $rset) = @_;

  throw("Need to pass a valid stored Bio::EnsEMBL::Funcgen::ResultSet")
    if (! ($rset && $rset->isa("Bio::EnsEMBL::Funcgen::ResultSet") && $rset->dbID()));
  throw('Must pass a probe dbID') if ! $probe_id;

  my $cc_ids = join(',', @{$rset->chip_channel_ids()});

  my $query = "SELECT r.score from result r where r.probe_id ='${probe_id}'".
    " AND r.chip_channel_id IN (${cc_ids}) order by r.score;";

  # NOTE(review): without a left join this will return empty results for any
  # probes which may have been remapped to the chromosome, but for which no
  # result exists for a given set, due to only importing a subset of a
  # vendor specified mapping.

  # Stringify each single-column row; non-mutating map block rather than
  # the original alias-modifying "map $_ = ...".
  my @results = map { "@$_" } @{$self->dbc->db_handle->selectall_arrayref($query)};

  return \@results;
}
sub list_dbIDs {

=pod

  Args       : None
  Example    : my @ids = @{$rfa->list_dbIDs()};
  Description: Gets an array of internal IDs for all result_feature records.
  Returntype : ARRAYREF of ints
  Exceptions : None
  Caller     : General
  Status     : At Risk

=cut

  my $self = shift;

  return $self->_list_dbIDs('result_feature');
}
sub resolve_replicates_by_ResultSet {

=pod

  Arg [1]    : HASHREF - biological_replicate => ARRAYREF of scores
  Example    : my $score = $self->resolve_replicates_by_ResultSet(\%rep_scores, $rset);
  Description: Resolves technical and biological replicate scores to a single
               value: the mean of the technical replicates within each
               biological replicate, then the median across biological
               replicates.
  Returntype : float - resolved score
  Exceptions : None
  Caller     : fetch_all_by_Slice_ResultSet
  Status     : At Risk

=cut

  # NOTE(review): $rset arg currently unused; could front-load with replicate
  # set info to skip this testing and just take a mean when there is only one.
  my ($self, $rep_ref) = @_;

  my ($score, @scores, $biol_rep);

  # Simplest (and fastest) case first: a single biological replicate.
  if(scalar(keys %{$rep_ref}) == 1){
    ($biol_rep) = keys %{$rep_ref};
    @scores = @{$rep_ref->{$biol_rep}};

    if (scalar(@scores) == 1){
      $score = $scores[0];
    }
    else{
      $score = mean(\@scores);
    }
  }
  else{ # Multiple biological replicates: median of per-replicate means.
    foreach $biol_rep (keys %{$rep_ref}){
      push @scores, mean($rep_ref->{$biol_rep});
    }

    @scores = sort { $a <=> $b } @scores;
    $score  = median(\@scores);
  }

  return $score;
}
sub store {

=pod

  Args       : List of Bio::EnsEMBL::Funcgen::ResultFeature objects
  Example    : $rfa->store(@rfeats);
  Description: Stores the given ResultFeature objects in the result_feature
               table. Validation is minimal as we are simply dealing with
               ints; callers are responsible for validating values.
  Returntype : ARRAYREF of stored ResultFeature objects
  Exceptions : Throws if no ResultFeatures are passed or if any element is
               not a Bio::EnsEMBL::Funcgen::ResultFeature.
  Caller     : General
  Status     : At Risk

=cut

  my ($self, @rfeats) = @_;

  # Fixed misplaced paren: original was scalar(@rfeats == 0), which happens
  # to give the same truth value but obscures the intent.
  throw("Must provide a list of ResultFeature objects") if(scalar(@rfeats) == 0);

  # Columns are in the order of the ResultFeature attr array (excluding
  # probe_id, which is a result/probe_feature query only attr).
  my $sth = $self->prepare('INSERT INTO result_feature (result_set_id, seq_region_id, seq_region_start, seq_region_end, seq_region_strand, window_size, score) VALUES (?, ?, ?, ?, ?, ?, ?)');

  my $db = $self->db();

 FEATURE: foreach my $rfeat (@rfeats) {

    if( ! ref $rfeat || ! $rfeat->isa('Bio::EnsEMBL::Funcgen::ResultFeature') ) {
      throw('Must be an ResultFeature object to store');
    }

    my $seq_region_id;
    ($rfeat, $seq_region_id) = $self->_pre_store($rfeat);

    $sth->bind_param(1, $rfeat->result_set_id,             SQL_INTEGER);
    $sth->bind_param(2, $seq_region_id,                    SQL_INTEGER);
    $sth->bind_param(3, $rfeat->start,                     SQL_INTEGER);
    $sth->bind_param(4, $rfeat->end,                       SQL_INTEGER);
    $sth->bind_param(5, $rfeat->strand,                    SQL_INTEGER);
    $sth->bind_param(6, $rfeat->window_size,               SQL_INTEGER);
    # NOTE(review): consider changing score storage to a packed BLOB.
    $sth->bind_param(7, sprintf("%.3f", $rfeat->score),    SQL_DOUBLE);

    $sth->execute();
  }

  return \@rfeats;
}
sub window_sizes {

=pod

  Args       : None
  Example    : my @sizes = $rfa->window_sizes;
  Description: Returns the supported result_feature window sizes, smallest
               first. 0 means the probes themselves. Standard design is a
               50 bp probe every 100 bp, so 150 (two probes) is the first
               averaged bin size. Note we're not guaranteed to get 2 probes
               in 150 bp (one may sit in the middle); extending by the probe
               length to capture overlapping probes may be needed.
  Returntype : List of ints
  Exceptions : None
  Caller     : fetch_all_by_Slice_ResultSet
  Status     : At Risk

=cut

  my $self = shift;

  # Max slice length is 1MB, so the max default bucket size is ~1429 bp
  # (~14.29 probes) — not sensible to average over, as we lose too much
  # resolution; the track should be turned off at a sensible limit.
  # NOTE(review): maybe omit window_size entirely and let the Collection
  # handle binning, reducing the amount of data stored.
  return (0, 150, 250, 350, 450, 550, 650);
}
AUTHOR | Top |
CONTACT | Top |
_final_clause | Top |
Args : None
Example : None
Description: PROTECTED implementation of superclass abstract method.
Returns an ORDER BY clause. Sorting by oligo_feature_id would be
enough to eliminate duplicates, but sorting by location might
make fetching features on a slice faster.
Returntype : String
Exceptions : None
Caller : generic_fetch
Status : At Risk