# $Id: ResultFeature.pm,v 1.9 2008/11/10 14:27:01 nj1 Exp $
package Bio::EnsEMBL::Funcgen::Collection::ResultFeature;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Argument ('rearrange');
use Bio::EnsEMBL::Utils::Exception ('throw');
use Bio::EnsEMBL::Funcgen::ResultFeature;
use base( 'Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor',
'Bio::EnsEMBL::Collection');
#Had to put the adaptor first as it wasn't finding new?
#Wasn't traversing the package tree?
#Using standard object generation methods for ResultFeature
#We never want the heavy object from the result_feature table method
#In fact we never want the heavy object unless we create it from scratch to store it
#Therefore we don't need to use create_feature at all, unless we ever want to use
#the collection to generate bins on the fly (rather than for storing)
#We would only want this if we don't want to implement the object, just the array
#Now we're back to considering the base collection array:
#'DBID', 'START', 'END', 'STRAND', 'SLICE'
#We don't need dbID or slice for ResultFeature, but we do need a score
#Stick with the object implementation for now, i.e. don't use the create_feature methods
#TO DO
#Had to change ResultFeature to inherit from Feature, so we can use Feature methods
#Is this going to cause problems, mixing an array-based object with a hash-based object?
#Not if we implement the mandatory methods here.
#seq_region_start etc?
#Issues
#As we define the windows by the start of the features
#we get some skew in the data to the right, as we generally have
#overhanging 3' features and no overhanging 5' features.
#Also, when dealing with window sets we are not resetting the start
#of the bins when we reach a gap, hence we will then get 5' skew,
#but only for gaps within a chunk, not at the start of it.
#Solution: set the start and end values for each bin dynamically
#to the start or end of a feature if there are no boundary-spanning features.
#This should remove the skew completely (see the sketch below).
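#A minimal sketch (kept in POD so it does not affect the module) of the clipping
#idea above; _clip_bin_bounds is a hypothetical helper and assumes local,
#slice-relative coordinates:
=pod
sub _clip_bin_bounds {
  my ( $bin_start, $bin_end, $first_feature_start, $last_feature_end ) = @_;
  #Only shrink the bin towards the observed features; never extend it
  $bin_start = $first_feature_start if $first_feature_start > $bin_start;
  $bin_end   = $last_feature_end    if $last_feature_end    < $bin_end;
  return ( $bin_start, $bin_end );
}
=cut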
=pod
sub _create_feature {
my ( $this, $feature_type, $args ) = @_;
my $feature = $this->SUPER::_create_feature( $feature_type, $args );
if ( !$this->_lightweight() ) {
my ( $phase, $end_phase, $stable_id, $version, $created_date,
$modified_date, $is_current )
= rearrange( [ 'PHASE', 'END_PHASE',
'STABLE_ID', 'VERSION',
'CREATED_DATE', 'MODIFIED_DATE',
'IS_CURRENT'
],
%{$args} );
push( @{$feature},
$phase, $end_phase, $stable_id, $version, $created_date,
$modified_date, $is_current );
}
return $feature;
}
sub _create_feature_fast {
my ( $this, $feature_type, $args ) = @_;
my $feature =
$this->SUPER::_create_feature_fast( $feature_type, $args );
return $feature;
}
#This might not be sensible for Features which are split across tables
sub _tables {
my ($this) = @_;
my @tables = $this->SUPER::_tables();
if ( $this->_lightweight() ) {
return ( $tables[0] );
}
return @tables;
}
sub _columns {
my ($this) = @_;
my @columns = $this->SUPER::_columns();
if ( $this->_lightweight() ) {
#What is this doing?
#Probably not sensible for ResultFeature
@columns[ 5 .. $#columns ] = map( 1, 5 .. $#columns );
}
return @columns;
}
#Also not sensible for objects spread across several tables
sub _default_where_clause {
my ($this) = @_;
if ( $this->_lightweight() ) {
return '';
}
return $this->SUPER::_default_where_clause();
}
=cut
sub store_window_bins_by_Slice_ResultSet {
my $this = shift;
my ( $slice, $rset, $window_sizes, $logic_name, $rollback) =
rearrange( [ 'SLICE', 'RESULT_SET', 'WINDOW_SIZES', 'LOGIC_NAME', 'ROLLBACK'], @_ );
my $slice_adaptor = $this->db->dnadb->get_SliceAdaptor;
#my $method='average_score';#average
my $method='max_magnitude';#i.e. greatest positive or negative score
#Method is currently hardcoded. What about those probes which cross chunk boundaries?
#Will they be returned?
if(! defined $window_sizes){
if($this->can('window_sizes')){
@{$window_sizes} = $this->window_sizes;
print 'No window sizes provided, running with defaults: '.join(' ', @$window_sizes)."\n";
}
else{
throw('No default window sizes available. Must supply at least one window size as an arrayref');
}
}
elsif(scalar(@$window_sizes) == 0){
throw('Must supply at least one window size as an arrayref, or leave undef for defaults');
}
#Slice methods return overhanging features, so we could do this externally.
#If we are not returning overhanging features we should really handle whole chromosomes,
#as we will need persistent data to handle the overhangs.
#Do it as whole chromosomes anyway and chunk the Slices into windows which are divisible by all window sizes?
#Nope, too much memory; we either need to store here, or generate the slice chunks in the caller and handle storing there.
#This is an adaptor after all, so let's do it here.
#Test for the ResultSet here and whether we have the status?
#Currently just result_feature_set.
#Need to implement a rollback method in Helper
#fetch this first to perform validation on slice and ResultSet??
#my $rfs = $this->fetch_all_by_Slice_ResultSet($slice, $rset)
#maybe want to test for RESULT_FEATURE_SET before we do the query?
#Need to add the coord_system here.
#Could rollback by slice (for all window sizes)
if($rset->has_status('RESULT_FEATURE_SET')){
throw('ResultSet('.$rset->name.') already has precomputed ResultFeatures stored, please rollback ResultFeature first');
#Retrieving ResultFeatures here would end up retrieving them from the result_feature table
#which is where we want to store them
#They are only retrieved from the probe/result/probe_feature tables if they do not have this status.
#REMEMBER TO SET THIS IN THE CALLING SCRIPT!
}
#Calculate a sensible slice length based on the window sizes
my @wsizes = sort {$a <=> $b} @$window_sizes;
#Current limit is 500KB, so let's start there;
#we should really start with a nice round number, so:
my $multiplier = int(500000/$wsizes[$#wsizes]);
my $chunk_length = $multiplier * $wsizes[$#wsizes];
my $not_divisible = 1;
my %chunk_windows;#Registry of chunk lengths to run with windows
my %workable_chunks = map {$_ => {}} @wsizes;
delete $workable_chunks{'0'};#get rid of natural resolution as this will always work
while($not_divisible && $chunk_length != 0){
$not_divisible = 0;
foreach my $wsize(@wsizes){
next if $wsize == 0;#Special wsize for normal data
#Set not divisible if modulus is true
if($chunk_length % $wsize){
$not_divisible = 1;
}
else{
#No need to listref here?
$workable_chunks{$wsize}{$chunk_length} = [];
}
#warn "chunk length is $chunk_length and not_divisible is $not_divisible";
}
#Gradually shrink the length until we find a workable slice length for all windows
$chunk_length -= $wsizes[$#wsizes] if $not_divisible;
}
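#Worked example (hypothetical window sizes 150, 300 and 450):
#  $multiplier   = int(500000/450) = 1111, so $chunk_length starts at 1111 * 450 = 499950
#  499950 % 300 != 0, so the loop shrinks the length by 450 to 499500
#  499500 is divisible by 150, 300 and 450, so all three windows can share one chunk length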
my %chunk_sets;
if($chunk_length == 0){
print "Could not find chunk length for all window sizes, attempting to subset windows using alternate slice length\n";
#my %sorted_windows
foreach my $wsize(keys %workable_chunks){
#Loop through windows, seeing if they are workable in the other windows
foreach my $chunk(keys %{$workable_chunks{$wsize}}){
foreach my $other_wsize(keys %workable_chunks){
next if $wsize == $other_wsize;
if(exists $workable_chunks{$other_wsize}{$chunk}){
#only push it onto the other wsize, as we will do the reverse later
$chunk_sets{$chunk}{$wsize} = undef;
}
}
}
}
#Now we have a register of co-occurrence of wsizes with respect to chunks
#Loop through finding the least amount of sets with the longest chunk length?
#There is no way to decide which is best?
#we could calculate the number of loops? Factored by the chunk length?
#Let's just print out and see what we get
#warn "chunk sets are :\n".Data::Dumper::Dumper(\%chunk_sets);
#For now let's just take the one which has the most windows and the longest chunk
#Then we just get the largest which handles the rest.
#define possible set lengths
my $i = 0;
my %set_lengths;
map {$set_lengths{$i} = []; $i++} @wsizes;
delete $set_lengths{'0'};#get rid of natural resolution as this will always work
#store chunks lengths for each set size
foreach my $chunk(keys %chunk_sets){
my $set_size = scalar(values %{$chunk_sets{$chunk}});
push @{$set_lengths{$set_size}}, $chunk;
}
#Now we get the biggest set with the longest length;
my $largest_size = scalar(@wsizes);#scalar here as we are disregarding natural resolution of 0 in loop
my $found_largest_set = 0;
while(! $found_largest_set){
$largest_size--;
if(scalar(@{$set_lengths{$largest_size}}) > 0){
$found_largest_set = 1;
}
}
#We should be able to loop this bit, to find all the biggest sets.
my ($largest_chunk) = sort {$b<=>$a} @{$set_lengths{$largest_size}};
#we could even be selective here, but let's just take the first one for now
my @largest_windows = keys %{$chunk_sets{$largest_chunk}};
@{$chunk_windows{$largest_chunk}} = @largest_windows;
print "Largest chunk $largest_chunk($largest_size) contains windows: @largest_windows\n";
my %remaining_windows = map {$_ => {}} @wsizes;
delete $remaining_windows{'0'};#get rid of natural resolution as this will always work
map { delete $remaining_windows{$_} } @largest_windows;
my $remaining_set_size = scalar(keys %remaining_windows);
#swapping to array here for practicality, would need to maintain hash if we need to iterate
my @rwindows = keys %remaining_windows;
#This could just be one window, but that will not be in the co-occurrence hash %chunk_sets,
#hence the normal approach will not work and we just want to find a suitably large chunk for this one window.
my $next_chunk;
if(scalar(@rwindows) == 1){
#we just want to find a suitably large chunk for this one window.
my ($last_window) = @rwindows;
$multiplier = int(500000/$last_window);
$next_chunk = $multiplier * $last_window;
}
else{
#Now we are doing something very similar to above,
#populating a set_size to chunk length registry
#my %seen_hash;
foreach my $chunk(sort {$b<=>$a} @{$set_lengths{$remaining_set_size}}){
my $seen_count = 0;
foreach my $rwindow(@rwindows){
$seen_count++ if grep { $_ == $rwindow } keys %{$chunk_sets{$chunk}};
}
if ($seen_count == $remaining_set_size){
$next_chunk = $chunk;
last;
}
}
}
if($next_chunk){
@{$chunk_windows{$next_chunk}} = @rwindows;
print "Found next chunk length $next_chunk containing the remaining windows:\t@rwindows\n";
#Now we want to cycle through all the set lengths which could contain the windows not in the first set
}
else{
warn "Need to write iterative sub for set definition";
throw('Could not find workable slice length for remaining windows: '.
join(', ', @rwindows));
}
}
else{
@{$chunk_windows{$chunk_length}} = keys(%workable_chunks);
print "Found workable chunk length($chunk_length) for all window sizes:\t".
join(' ', @{$chunk_windows{$chunk_length}})."\n";
}
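#With the hypothetical window sizes used in the example above (150, 300 and 450),
#%chunk_windows would now hold ( 499500 => [150, 300, 450] ).
#With incompatible window sizes it would instead map two or more chunk lengths,
#each to the subset of windows which divides it exactly.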
#Not lightweight as we will be storing them
# Temporarily set the collection to be lightweight??? #Why?
#my $old_value = $this->_lightweight();
#if ( defined($lightweight) ) { $this->_lightweight($lightweight) }
#else { $this->_lightweight(1) }
my (%counts, $store_natural);
$store_natural = 1 if(grep { $_ == 0 } @$window_sizes);
#Set natural res count to 0
$counts{0}=0;
my $store_slice = $slice;
my $slice_adj = 0;
my $slice_end = $slice->end;
my $orig_slice = $slice;
my $orig_start = $slice->start;
my $region = $slice->coord_system_name;
my $seq_region_name = $slice->seq_region_name;
my $strand = $slice->strand;
my $rset_id = $rset->dbID;
if($store_slice->start != 1){
$store_slice = $slice->adaptor->fetch_by_region('chromosome', $slice->seq_region_name);
$slice_adj = ($slice->start - 1);
}
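#e.g. (hypothetical) a query slice starting at 1,000,001 gives $slice_adj = 1,000,000
#and $store_slice is re-fetched as the whole chromosome, so that the generated
#ResultFeatures are stored against a chromosome-level slice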
#Can probably tidy this up and just alter start adjust directly
foreach my $chunk_length(sort keys %chunk_windows){
print "Processing windows ".join(', ', @{$chunk_windows{$chunk_length}})." with chunk length $chunk_length\n";
#Set window counts to 0
foreach my $window(@{$chunk_windows{$chunk_length}}){
$counts{$window}=0;
}
#Now walk through slice using slice length chunks and build all windows in each chunk
my $in_slice = 1;
my $start_adj = 0;
my $slice_start = $orig_start;
#my $end_adj = $slice->start + $chunk_length - 1;
#my $end_adj = $chunk_length;
my ($start, $end, $features, $bins);
#$chunk_slice,
while($in_slice){
$start = $slice_start + $start_adj;
$end = $start + $chunk_length - 1;
#The last chunk might not be the correct window length,
#hence we should do this on whole chromosomes.
#Force this or just warn? Check the seq_region slice
#if($start > $slice_end);
if($end >= $slice_end){
#or equal as we want to catch end of slice and set in_slice
$end = $slice_end;
$in_slice = 0;
}
$slice = $slice_adaptor->fetch_by_region($region, $seq_region_name, $start, $end, $strand);
$features = $this->fetch_all_by_Slice_ResultSet($slice, $rset);
#Shift chunk coords
$start_adj = $chunk_length if($in_slice);
$slice_start = $slice->start;
#$start_adj += $chunk_length;
#$end_adj += $chunk_length;
#}
next if scalar(@$features) == 0;
#This should return a hash of window size => bin array pairs
$bins = $this->_bin_features_by_window_sizes
(
-slice => $slice,
-window_sizes => $chunk_windows{$chunk_length},
-method => $method,
-features =>
$features,
);
my $bin_start = $slice->start;
#my $bin_end = $slice->start;#???? wtf?
#We need to handle strandedness of slice!?
#Store all normal features in result_feature
if($store_natural){
foreach my $feature(@$features){
$counts{0}++;
#warn "storing ".join(', ', ($feature->start, $feature->end, $feature->strand, $feature->score, 'undef', $rset->dbID, 0, $slice));
$this->store(Bio::EnsEMBL::Funcgen::ResultFeature->new_fast
(
($feature->start + $bin_start + $slice_adj),
($feature->end + $bin_start + $slice_adj),
$feature->strand,
$feature->score,
undef,#absent probe info
$rset->dbID,
0, #window size
$store_slice
));
}
print "Window size 0 (natural resolution) has ".scalar(@{$features})." feature bins\n";
}
my ($bin_end, $bin_score);
foreach my $wsize(sort keys %{$bins}){
my $count = 0;
$bin_start = $slice->start;
$bin_end = $slice->start;
foreach my $bin_index(0..$#{$bins->{$wsize}}){
$bin_score = $bins->{$wsize}->[$bin_index];
#next if ! $bin_score;#No, we're not inc'ing the start/ends for bins with no scores
$bin_end += $wsize;
if($bin_score){
#$counts{$wsize}++;
$count++;
#This is a little backwards as we are generating the object just to store it
#If we are aiming for speed then maybe we could also commoditise the store method
#store by args or hash? store_fast?
#Speed not essential for storing
#Note: list ref passed
#warn "storing $bin_start, $bin_end, $strand, $bin_score, undef, $rset->dbID, $wsize, $slice";
$this->store(Bio::EnsEMBL::Funcgen::ResultFeature->new_fast
(
($bin_start + $slice_adj),
($bin_end + $slice_adj),
$strand,
$bin_score,
undef,#absent probe info
$rset->dbID,
$wsize,
$store_slice
));
}
$bin_start += $wsize;
}
print "Window size $wsize has ".scalar(@{$bins->{$wsize}})." bins, storing $count\n";
$counts{$wsize}+= $count;
}
}
#Turn off storing of natural resolution for next chunk length sets
$store_natural = 0;
}
#print some counts here
foreach my $wsize(sort (keys %counts)){
print "Stored ".$counts{$wsize}." for window size $wsize for ".$orig_slice->name."\n";
}
#Return this counts hash so we can print/log from the caller, hence we don't print in here?
return;
}
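#A hedged usage sketch for the method above, kept in POD so it does not affect the
#module. $rf_collection is assumed to be an instance of this class obtained via the
#DBAdaptor; $slice and $result_set are assumed to exist already, and the window
#sizes shown are illustrative only:
=pod
$rf_collection->store_window_bins_by_Slice_ResultSet
  (
   -slice        => $slice,               #e.g. a whole chromosome Slice
   -result_set   => $result_set,          #must not yet have the RESULT_FEATURE_SET status
   -window_sizes => [ 0, 150, 300, 450 ], #0 also stores the natural resolution features
  );
#Remember to set the RESULT_FEATURE_SET status on the ResultSet afterwards,
#as noted in the comments above.
=cut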
#This is a slight rewrite to calculate bins for a list of all window sizes
#Should only be called by store_window_bins_by_Slice_ResultSet
#This could be put in Collection and _bin_feature could call this to maintain interface
sub _bin_features_by_window_sizes{
my $this = shift;
my ( $slice, $window_sizes, $method, $features ) =
rearrange( [ 'SLICE', 'WINDOW_SIZES', 'METHOD', 'FEATURES' ], @_ );
if ( !defined($features) || !@{$features} ) { return {} }
#No need to validate window sizes as done in the store_window_bins?
#Do here anyway?
warn "Processing window sizes ".join(', ',@$window_sizes).' for slice '.$slice->name;
#Set default to ResultFeature implementation?
$method ||= 'average_score';
#Should sub this to allow different implementations to use it
$method = $this->validate_bin_method($method);
my $slice_start = $slice->start();
#my $bin_length = ( $slice->end() - $slice_start + 1 )/$nbins;
#my @bins;
#Set up some hashes to store data by window_size
my (%bins, %nbins, %bin_counts);
#my($first_bin);
#if ( $method == 0 || # 'count' or 'density'
# $method == 3 || # 'fractional_count' or 'weight'
# $method == 4 # 'coverage'
# ){
# # For binning methods where each bin contain numerical values.
# $first_bin = 0;
# }
# else {
# # For binning methods where each bin does not contain numerical
# # values.
#
# #Remove this
# $first_bin = undef;
# }
#Set up some bin data for the windows
foreach my $wsize (@$window_sizes) {
$nbins{$wsize} = int($slice->length / $wsize);#should this be -1 as this will be an index?
#no as int always rounds down
#So nbins is actually the index of the bin not the 'number'
#@{$bins{$wsize}} = map( $first_bin, 1 .. $nbins{$wsize});
#Create default bins with 0
@{$bins{$wsize}} = ();
map {$bins{$wsize}->[$_] = 0} (0 .. $nbins{$wsize});
#Set bin counts to 0 for each bin index
#Note: a count of 0 simply means 'no feature seen yet'; the average_score
#post-processing only divides where the count is true, so we do not need
#undef place-holders here
@{$bin_counts{$wsize}} = ();
map { $bin_counts{$wsize}->[$_] = 0 } (0 .. $nbins{$wsize});
}
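#e.g. (hypothetical) a 1000bp slice with a 300bp window gives $nbins{300} = int(1000/300) = 3,
#i.e. bin indices 0..3, so @{$bins{300}} and @{$bin_counts{300}} each start as four zeros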
#warn "bin_counts are :\n".Data::Dumper::Dumper(\%bin_counts);
#This fails for slices which are smaller than the chunk length;
#Just do a little sanity test here to make sure the bin_lengths == wsizes
#Then remove
#foreach my $wsize (keys %bin_lengths) {
# warn "wsize is $wsize with bin_length ".$bin_lengths{$wsize};
# throw('Oh no, these should be the same') if($wsize != $bin_lengths{$wsize});
# }
my $feature_index = 0;
#What is this?
my @bin_masks;
#my %start_bins;
#my %end_bins;
#Also store local starts and ends as we currently calc this several times
my $lfeature_start;
my $lfeature_end;
#To remove skew from bins we need to clip the start and end
#if they do not have overlapping features.
#So we need to store features in each bin and post process.
#This would be easiest, but 2nd loop would slow it down.
#Not so bad for storage, but let's try and keep it quick
#So in-line bin clipping requires awareness of when we have moved to a new bin
#between features. This holds for overlapping and non-overlapping features.
#Once we have observed a gap we need to clip the end of the last bin and clip the start of the new bin.
#This requires knowing the greatest end value of the last bin's features.
#What if two overlapping features had the same start and different ends, would we see the longest last?
#Check default slice_fetch sort
foreach my $feature ( @{$features} ) {
#Set up the bins for each window size
foreach my $wsize (@$window_sizes) {
#We have already hijacked the object creation by here
#This is done in core BaseFeatureAdaptor
#We probably don't want to do this for ResultFeatures as we don't use the
#standard feature implementation
#we already use an array and we don't store the slice
#as this is already known by the caller
#and we always build on top level so we don't need to remap
#We do however need the slice to store?
#Do we?
#Yes, as we only store local starts when generating
#We need a store by Slice method?
#This will remove the need to inherit from Feature.
#These will need to be regenerated every time we import a new build,
#as we do with the probe_features themselves.
#This also means the result_feature status has to be associated with a coord_system_id
#Which bins do the start and end lie in for this feature?
#Already dealing with local starts, so no slice subtraction
my $start_bin = int(($feature->start ) / $wsize);
my $end_bin = int(($feature->end) / $wsize );
#Don't need slice start as we are dealing with local coords
#my $start_bin =
# int( ( $feature->[FEATURE_START] - $slice_start )/$bin_length );
# my $end_bin =
# int( ( $feature->[FEATURE_END] - $slice_start )/$bin_length );
#This might happen for last feature
#if ( $end_bins{$wsize} >= $nbins{$wsize} ) {
# $end_bins{$wsize} = $nbins{$wsize};
# }
$end_bin = $nbins{$wsize} if $end_bin > $nbins{$wsize};
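#e.g. (hypothetical) a feature with local start 450 and end 620 in a 300bp window
#gives $start_bin = int(450/300) = 1 and $end_bin = int(620/300) = 2,
#so it contributes to bins 1 and 2 (with $end_bin clamped to $nbins if it overhangs)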
#Slightly obfuscated code here,
#although this should speed up generation
#by avoiding string comparisons.
#We should do default count processing for all methods as this is required for all no?
#No, just weight, coverage and average_score
=pod
if ( $method == 0 ) {
throw('Not implemented for method for count/density');
# ----------------------------------------------------------------
# For 'count' and 'density'.
for ( my $bin_index = $start_bin ;
$bin_index <= $end_bin ;
++$bin_index ) {
++$bins[$bin_index];
}
} elsif ( $method == 1 ) {
# ----------------------------------------------------------------
# For 'indices' and 'index'
#How is this useful?
#Is this not just count per bin?
#No this is a list of the feature indices
#So forms a distribution?
throw('Not implemented for method for index');
for ( my $bin_index = $start_bin ;
$bin_index <= $end_bin ;
++$bin_index ) {
push( @{ $bins[$bin_index] }, $feature_index );
}
++$feature_index;
} elsif ( $method == 2 ) {
# ----------------------------------------------------------------
# For 'features' and 'feature'.
throw('Not implemented for method for feature/features');
for ( my $bin_index = $start_bin ;
$bin_index <= $end_bin ;
++$bin_index ) {
push( @{ $bins[$bin_index] }, $feature );
}
} elsif ( $method == 3 ) {
# ----------------------------------------------------------------
# For 'fractional_count' and 'weight'.
throw('Not implemented for method for fractional_count/weight');
if ( $start_bin == $end_bin ) {
++$bins[$start_bin];
} else {
my $feature_length =
$feature->[FEATURE_END] - $feature->[FEATURE_START] + 1;
# The first bin...
$bins[$start_bin] +=
( ( $start_bin + 1 )*$bin_length -
( $feature->[FEATURE_START] - $slice_start ) )/
$feature_length;
# The intermediate bins (if there are any)...
for ( my $bin_index = $start_bin + 1 ;
$bin_index <= $end_bin - 1 ;
++$bin_index ) {
$bins[$bin_index] += $bin_length/$feature_length;
}
# The last bin...
$bins[$end_bin] +=
( ( $feature->[FEATURE_END] - $slice_start ) -
$end_bin*$bin_length +
1 )/$feature_length;
} ## end else [ if ( $start_bin == $end_bin)
} elsif ( $method == 4 ) {
# ----------------------------------------------------------------
# For 'coverage'.
#What exactly is this doing?
#This is coverage of bin
#Rather than coverage of feature as in fractional_count
my $feature_start = $feature->[FEATURE_START] - $slice_start;
my $feature_end = $feature->[FEATURE_END] - $slice_start;
if ( !defined( $bin_masks[$start_bin] )
|| ( defined( $bin_masks[$start_bin] )
&& $bin_masks[$start_bin] != 1 ) ) {
# Mask the $start_bin from the start of the feature to the end
# of the bin, or to the end of the feature (whichever occurs
# first).
my $bin_start = int( $start_bin*$bin_length );
my $bin_end = int( ( $start_bin + 1 )*$bin_length - 1 );
for ( my $pos = $feature_start ;
$pos <= $bin_end && $pos <= $feature_end ;
++$pos ) {
$bin_masks[$start_bin][ $pos - $bin_start ] = 1;
}
}
for ( my $bin_index = $start_bin + 1 ;
$bin_index <= $end_bin - 1 ;
++$bin_index ) {
# Mark the middle bins between $start_bin and $end_bin as fully
# masked out.
$bin_masks[$bin_index] = 1;
}
if ( $end_bin != $start_bin ) {
if ( !defined( $bin_masks[$end_bin] )
|| ( defined( $bin_masks[$end_bin] )
&& $bin_masks[$end_bin] != 1 ) ) {
# Mask the $end_bin from the start of the bin to the end of
# the feature, or to the end of the bin (whichever occurs
# first).
my $bin_start = int( $end_bin*$bin_length );
my $bin_end = int( ( $end_bin + 1 )*$bin_length - 1 );
for ( my $pos = $bin_start ;
$pos <= $feature_end && $pos <= $bin_end ;
++$pos ) {
$bin_masks[$end_bin][ $pos - $bin_start ] = 1;
}
}
}
} ## end elsif ( $method == 4 )
=cut
if ( $method == 5 ) {
#average score
#This is simply an average of all the scores for features which overlap this bin
#No weighting with respect to the bin or the feature
for ( my $bin_index = $start_bin ;
$bin_index <= $end_bin ;
++$bin_index ) {
#we should really push onto an array here so we can have the median or the mean.
$bins{$wsize}->[$bin_index] += $feature->score;
$bin_counts{$wsize}->[$bin_index]++;
}
}
elsif( $method == 6){
#Max magnitude
#Take the highest value +ve or -ve score
for ( my $bin_index = $start_bin ;
$bin_index <= $end_bin ;
++$bin_index ) {
#we really need to capture the lowest -ve and highest +ve scores here and post-process
#To pick between them
my $score = $feature->score;
$bins{$wsize}->[$bin_index] ||= [0,0]; #-ve, +ve
if($score < $bins{$wsize}->[$bin_index]->[0]){
$bins{$wsize}->[$bin_index]->[0] = $score;
}
elsif($score > $bins{$wsize}->[$bin_index][1]){
$bins{$wsize}->[$bin_index]->[1] = $score;
}
}
}
else {
throw("Binning method '$method' is not implemented; only average_score and max_magnitude are supported");
}
}
} ## end foreach my $feature ( @{$features...
#Now do post processing of bins
=pod
if ( $method == 4 ) {
# ------------------------------------------------------------------
# For the 'coverage' method: Finish up by going through @bin_masks
# and sum up the arrays.
for ( my $bin_index = 0 ; $bin_index < $nbins ; ++$bin_index ) {
if ( defined( $bin_masks[$bin_index] ) ) {
if ( !ref( $bin_masks[$bin_index] ) ) {
$bins[$bin_index] = 1;
} else {
$bins[$bin_index] =
scalar( grep ( defined($_), @{ $bin_masks[$bin_index] } ) )/
$bin_length;
}
}
}
}
=cut
if( $method == 5){
#For average score, need to divide bins by bin_counts
foreach my $wsize(keys %bins){
foreach my $bin_index(0..$#{$bins{$wsize}}){
if($bin_counts{$wsize}->[$bin_index]){
$bins{$wsize}->[$bin_index] /= $bin_counts{$wsize}->[$bin_index];
}
#warn "bin_index $wsize:$bin_index has score ".$bins{$wsize}->[$bin_index];
}
}
}
elsif( $method == 6){
#Max magnitude
#Take the highest value +ve or -ve score
foreach my $wsize(keys %bins){
foreach my $bin_index(0..$#{$bins{$wsize}}){
#So we have the potential that we have no listref in a given bin
#default value if we haven't seen anything is 0
#we actually want an array of -ve +ve values
#warn "Are we storing 0 values for absent data?";
#Not for max_magnitude, but maybe for others?
if($bins{$wsize}->[$bin_index]){
#warn $wsize.':'.$bin_index;
#warn $bins{$wsize}->[$bin_index];
#warn $bins{$wsize}->[$bin_index]->[0];
my $tmp_minus = $bins{$wsize}->[$bin_index]->[0] * -1;
if($tmp_minus > $bins{$wsize}->[$bin_index]->[1]){
$bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[0];
}
else{
$bins{$wsize}->[$bin_index] = $bins{$wsize}->[$bin_index]->[1];
}
}
}
}
}
else{
throw("Binning method '$method' is not implemented; only average_score and max_magnitude are supported");
}
#Could also return bin_counts for summary reporting in the zmenu
#Could also do counting of specific type
return \%bins;
} ## end sub _bin_features_by_window_sizes
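#Worked example (hypothetical numbers, 100bp window): two features overlap bin 0
#with scores 2.0 and 4.0
#  average_score: $bins{100}->[0] = 6.0 and $bin_counts{100}->[0] = 2 after the
#                 feature loop, then 6.0/2 = 3.0 after the divide above
#  max_magnitude: the same bin holds [0, 4.0] during the loop (or e.g. [-5.0, 4.0]
#                 if a -5.0 score were also seen) and the post-processing keeps the
#                 value with the larger magnitude, i.e. 4.0 or -5.0 respectively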
#separated to allow addition of non-standard methods
#Could potentially add these in new
#and put this back in _bin_features
sub validate_bin_method{
my ($self, $method) = @_;
#Add average_score and max_magnitude here to avoid changing Collection.pm
#VALID_BINNING_METHODS is a package hash, so we access it via a symbolic
#reference to this class, which requires no strict 'refs'
my $class = ref($self);
no strict 'refs';
${"${class}::VALID_BINNING_METHODS"}{'average_score'} = 5;
${"${class}::VALID_BINNING_METHODS"}{'max_magnitude'} = 6;
if ( ! exists ${"${class}::VALID_BINNING_METHODS"}{$method} ) {
throw(
sprintf(
"Invalid binning method '%s', valid methods are:\n\t%s\n",
$method,
join( "\n\t", sort( keys %{"${class}::VALID_BINNING_METHODS"} ) ) ) );
}
return ${"${class}::VALID_BINNING_METHODS"}{$method};
}
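#e.g. $self->validate_bin_method('max_magnitude') returns the index 6,
#while an unrecognised method name throws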
1;