ResultFeature documentation.

sub _bin_features_by_window_sizes {

  my $this = shift;
  my ( $slice, $window_sizes, $method, $features ) =
    rearrange( [ 'SLICE', 'WINDOW_SIZES', 'METHOD', 'FEATURES' ], @_ );


  if ( !defined($features) || !@{$features} ) { return {} }

  #No need to validate window sizes as done in the store_window_bins?
  #Do here anyway?
  
  warn "Processing window sizes ".join(', ',@$window_sizes).' for slice '.$slice->name;


  #Set default to ResultFeature implementation?
  $method ||= 'average_score';

  #Should sub this to allow different implementations to use it
  $method = $this->validate_bin_method($method);


  my $slice_start = $slice->start();

  #my $bin_length = ( $slice->end() - $slice_start + 1 )/$nbins;
  #my @bins;

  #Set up some hashes to store data by window_size
  my (%bins, %nbins, %bin_counts);

  #my($first_bin);
  #if ( $method == 0 ||    # 'count' or 'density'
  #     $method == 3 ||    # 'fractional_count' or 'weight'
  #     $method == 4       # 'coverage'
  #  ){
  #  # For binning methods where each bin contain numerical values.
#	$first_bin = 0;
#  } 
#  else {
#	# For binning methods where each bin does not contain numerical
 #   # values.
#
#	#Remove this
#	$first_bin = undef;
#  }

  #Set up some bin data for the windows
  foreach my $wsize (@$window_sizes) {
	$nbins{$wsize}         = int($slice->length / $wsize);#should this be -1 as this will be an index?
	#no as int always rounds down
	#So nbins is actually the index of the bin not the 'number'

	#@{$bins{$wsize}}       = map( $first_bin, 1 .. $nbins{$wsize});
	#Create default bins with 0
	@{$bins{$wsize}} = ();
	map {$bins{$wsize}->[$_] = 0} (0 .. $nbins{$wsize});		
	
	#Set bin counts to 0 for each bin
	@{$bin_counts{$wsize}}    = ();



	#Huh this is doing the same thing?
	#We don't want to do this as this implies we have seen a 0 value
	#When we have not seen any value at all!
	#Need to set to undef? Or not set at all?

	#This is adding an undef to the start of the array!?
    map { $bin_counts{$wsize}->[($_)] = 0 } @{$bins{$wsize}};

	foreach my $bin(@{$bins{$wsize}}){
	  $bin_counts{$wsize}->[$bin] = 0;
	}	
  }

  #warn "bin_counts are :\n".Data::Dumper::Dumper(\%bin_counts);
    
  #This failes for slices which are smaller than the chunk length;
  #Just do a little sanity test here to make sure the bin_lengths == wsizes
  #Then remove
  #foreach my $wsize (keys %bin_lengths) {
#	warn "wsize is $wsize with bin_length ".$bin_lengths{$wsize};

#	throw('Oh no, these should be the same') if($wsize != $bin_lengths{$wsize});

#  }
 
  my $feature_index = 0;

  #What is this?
  my @bin_masks;
  #my %start_bins;
  #my %end_bins;

  #Also store local starts and ends as we currently calc this several times
  my $lfeature_start;
  my $lfeature_end;


  #To remove skew from bins we need to clip the start and end 
  #if they do not have overlapping features.
  #So we need to store features in each bin and post process.
  #This would be easiest, but 2nd loop would slow it down.
  #Not so bad for storage, but let's try and keep it quick
  #So in line bin clipping requires awareness of when we have moved to a new bin
  #between features.  This holds for overlapping and non-overlapping features.
  #Once we have observed a gap we need to clip the end of the last bin and clip the start of the new bin.
  #This requires knowing the greatest end values from the last bin's feature.
  #what if two overlapping features had the same start and different end, would we see the longest last?
  #Check default slice_fetch sort


  foreach my $feature ( @{$features} ) {
	#Set up the bins for each window size

	foreach my $wsize (@$window_sizes) {
	
	  #We have already highjacked the object creation by here
	  #This is done in core BaseFeatureAdaptor
	  #We probably don't want to do this for ResultFeatures as we don't use the
	  #standard feature implementation
	  #we already use an array and we don't store the slice
	  #as this is already known by the caller
	  #and we always build on top level so we don't need to remap

	  #We do however need the slice to store?
	  #Do we?
	  #Yes, as we only store local starts when generating
	  #We need a store by Slice method?
	  #This will remove the need to inherit from Feature.
	  #These will need to be regenerated everytime we import a new build
	  #As we do with the probe_features themselves
	  #This also mean the result_feature status has to be associated with a coord_system_id
	  




	  #Which bins do the start and end lie in for this feature?
	  #Already dealing with local starts, so no slice subtraction


	  my $start_bin =  int(($feature->start ) / $wsize);
	  my $end_bin   =  int(($feature->end) / $wsize );
	

	  #Don't need slice start as we are dealing with local coords

	
	  #my $start_bin =
	  #	int( ( $feature->[FEATURE_START] - $slice_start )/$bin_length );
	  
	  #  my $end_bin =
	  #	int( ( $feature->[FEATURE_END] - $slice_start )/$bin_length );
	  

	  #This might happen for last feature
	  #if (   $end_bins{$wsize} >= 	$nbins{$wsize} ) {
	  #	$end_bins{$wsize} =  $nbins{$wsize};

	  # }

	  $end_bin = $nbins{$wsize} if $end_bin > $nbins{$wsize};



	  #Slightly obfuscated code here
	  #Altho this should speed up generation
	  #by avoiding string comparisons.
	
	  #We should do default count processing for all methods as this is required for all no?
	  #No, just weight, coverage and average_score

}

sub store_window_bins_by_Slice_ResultSet {

  my $this = shift;
  my ( $slice, $rset, $window_sizes, $logic_name, $rollback) =
    rearrange( [ 'SLICE', 'RESULT_SET', 'WINDOW_SIZES', 'LOGIC_NAME', 'ROLLBACK'], @_ );


  my $slice_adaptor = $this->db->dnadb->get_SliceAdaptor;
  #my $method='average_score';#average
  my $method='max_magnitude';#i.e. greatest positive or negative score
  #Hardcoded method to average.  What about those probe which cross boundaries?
  #Will this be returned?

  if(! defined $window_sizes){

	if($this->can('window_sizes')){
	  @{$window_sizes} = $this->window_sizes;
	  print 'No window sizes provided running with defaults: '.join(' ', @$window_sizes)."\n";
	}
	else{
	  throw('No default window sizes available. Must supply at least one window size as an arrayref');
	}
  }
  elsif(scalar(@$window_sizes) == 0){
	throw('Must supply at least one window size as an arrayref, or leave undef for defaults');
  }

 
  #Slice methods return over hanging feature, so we could do this externally
  #If we are not returning over hanging features we should really handle whole chromosome
  #As we will need persistant data to handle over hangs
  #Do as whole chromosome anyway and chunk the Slices into windows which are divisible by all windows sizes
  #Nope, too much memmory, we either need to store here or generate the slice chunks in the caller and handle storing there
  #This is an adaptor after all let's do it here


  #Test for Rset here and whether we have status?
  #Currently just result_feature_set.
  #Need to implement roll back method in Helper

  #fetch this first to perform validation on slice and ResultSet??
  #my $rfs = $this->fetch_all_by_Slice_ResultSet($slice, $rset)
  #maybe want to tst if RESULT_FEATURE_SET before we do the query?


  #Need to as coord_system here.
  #Could rollback by slice(for all window sizes)


  if($rset->has_status('RESULT_FEATURE_SET')){
	throw('ResultSet('.$rset->name.') already has precomputed ResultFeatures stored, please rollback ResultFeature first');
	#Retrieving ResultFeatures here would end up retrieving them from the result_feature table
	#which is where we want to store them
	#They are only retrived from the probe/result/probe_feature table if they do not have this status.
	#REMEMBER TO SET THIS IN THE CALLING SCRIPT!

  }
  
  #Calulate sensible slice length based on window sizes
  my @wsizes = sort {$a <=> $b} @$window_sizes;
  
  #Current limit is 500KB, so let's start there
  #we should really start with a nice round number
  #so 


  my $multiplier = int(500000/$wsizes[$#wsizes]);
  my $chunk_length = $multiplier * $wsizes[$#wsizes];
  my $not_divisible = 1;
  my %chunk_windows;#Registry of chunk lengths to run with windows
  my %workable_chunks = map {$_ => {}} @wsizes;
  delete $workable_chunks{'0'};#get rid of natural resolution as this will always work


  while($not_divisible && $chunk_length != 0){
	$not_divisible = 0;
	
	foreach my $wsize(@wsizes){
	  next if $wsize == 0;#Special wsize for normal data
	  
	  #Set not divisible if modulus is true
	  if($chunk_length % $wsize){
		$not_divisible = 1;
	  }
	  else{
		#No need to listref here?
		$workable_chunks{$wsize}{$chunk_length} = [];
	  }
	  
	  #warn "chunk length is $chunk_length and not_divisible is $not_divisible";
	}
	
	#Gradually shrink the length until we find a workable slice length for all windows
	$chunk_length -= $wsizes[$#wsizes] if $not_divisible;
  }
  
  my %chunk_sets;
  
  
  if($chunk_length == 0){
	
	print "Could not find chunk length for all window sizes, attempting to subset windows using alternate slice length\n";
	
	#my %sorted_windows
	
	foreach my $wsize(keys %workable_chunks){
	  #Loop through windows, seeing if they are workable in the other windows
	  
	  foreach my $chunk(keys %{$workable_chunks{$wsize}}){
		
		foreach my $other_wsize(keys %workable_chunks){
		  next if $wsize == $other_wsize;
		  
		  if(exists $workable_chunks{$other_wsize}{$chunk}){
			#only push it onto the other wsize, as we will do the reverse later			
			$chunk_sets{$chunk}{$wsize} = undef;
		  }
		}
	  }
	}
	
	#Now we have a register of co-occurence of wsizes with repect to chunks
	#Loop through finding the least amount of sets with the longest chunk length?
	#There is no way to decide which is best?
	#we could calculate the number of loops? Factored by the chunk length?
	#Let's just print out and see what we get
	
	#warn "chunk sets are :\n".Data::Dumper::Dumper(\%chunk_sets);


	#For now let's just take the one which has the most windows and the longest chunk
	#Then we just get the largest which handles the rest.

	#define possible set lengths
	my $i = 0;
	my %set_lengths;
	map {$set_lengths{$i} = []; $i++} @wsizes;
	delete $set_lengths{'0'};#get rid of natural resolution as this will always work


	#store chunks lengths for each set size
	foreach my $chunk(keys %chunk_sets){
	  my $set_size = scalar(values %{$chunk_sets{$chunk}});
	  push @{$set_lengths{$set_size}}, $chunk;
	}		
  
	#Now we get the biggest set with the longest length;
	my $largest_size = scalar(@wsizes);#scalar here as we are disregarding natural resolution of 0 in loop
	my $found_largest_set = 0;

	while(! $found_largest_set){
	  $largest_size--;

	  if(scalar(@{$set_lengths{$largest_size}}>0)){
		$found_largest_set = 1;
	  }
	}


	#We should be able to loop this bit, to find all the biggest sets.

	my ($largest_chunk) = sort {$b<=>$a} @{$set_lengths{$largest_size}};
	#we could even be selective here, but let's just take the first one for now


	my @largest_windows = keys %{$chunk_sets{$largest_chunk}};
	@{$chunk_windows{$largest_chunk}} = @largest_windows;

	print "Largest chunk $largest_chunk($largest_size) contains windows: @largest_windows\n";

	my %remaining_windows = map {$_ => {}} @wsizes;
	delete $remaining_windows{'0'};#get rid of natural resolution as this will always work
	map { delete $remaining_windows{$_} } @largest_windows;	
	my $remaining_set_size = scalar(keys %remaining_windows);

	#swapping to array here for practicality, would need to maintain hash if we need to iterate
	my @rwindows = keys %remaining_windows;


	#This could just be one window, but this will not be inthe co-occurence hash %chunk_sets
	#Hence the normal approach will not work. and we just want to find a suitably large chunk for this one window.
	my $next_chunk;

	if(scalar(@rwindows) == 1){
	  #we just want to find a suitably large chunk for this one window.
	  my ($last_window) = @rwindows;

	  $multiplier = int(500000/$last_window);
	  $next_chunk = $multiplier * $last_window;
	}
	else{
	  #Now were are doing something very similar to above
	  #populating a set_size chunk length registry
	  #my %seen_hash;
	 	  

	  foreach my $chunk(sort {$b<=>$a} @{$set_lengths{$remaining_set_size}}){
	  	my $seen_count = 0;
		
		foreach my $rwindow(@rwindows){
		  
		  $seen_count++ if grep/$rwindow/, (values %{$chunk_sets{$chunk}});
		}
		
		if ($seen_count == $remaining_set_size){
		  
		  $next_chunk = $chunk;
		  last;
		}
	  }
	}

	@{$chunk_windows{$next_chunk}} = @rwindows;


	if($next_chunk){

	  print "Found next chunk length $next_chunk contains remaining windows:\t@rwindows\n";

	#Now we want to cycle through all the set lengths which could contain the ones not in the first
	#so we need to
	}
	else{
	  warn "Need to write iterative sub for set definition";
	  throw('Could not find workable slice length for remaining windows: '.
			join(', ', @rwindows));

	}
  }
  else{
	@{$chunk_windows{$chunk_length}} = keys(%workable_chunks);
   	print "Found workable chunk length($chunk_length) for all window sizes:\t".
	  join(' ', @{$chunk_windows{$chunk_length}})."\n";
  }


  #Not lightweight as we will be storing them
  # Temporarily set the collection to be lightweight??? #Why?
  #my $old_value = $this->_lightweight();

  #if   ( defined($lightweight) ) { $this->_lightweight($lightweight) }
  #else                           { $this->_lightweight(1) }

  my (%counts, $store_natural);
  $store_natural = 1 if(grep/^0/, @$window_sizes);

  #Set natural res count to 0
  $counts{0}=0;
  

  my $store_slice = $slice;
  my $slice_adj = 0;
  my $slice_end   = $slice->end;
  my $orig_slice  = $slice;
  my $orig_start = $slice->start;
  my $region       = $slice->coord_system_name;
  my $seq_region_name  = $slice->seq_region_name;
  my $strand       = $slice->strand;
  my $rset_id      = $rset->dbID;
  
  if($store_slice->start != 1){
	$store_slice = $slice->adaptor->fetch_by_region('chromosome', $slice->seq_region_name);
	$slice_adj = ($slice->start - 1);
  }
 
  #Can probably tidy this up and just alter start adjust directly
 
 
  foreach my $chunk_length(sort keys %chunk_windows){

	print "Processing windows ".join(', ', @{$chunk_windows{$chunk_length}})." with chunk length $chunk_length\n";
	
	#Set window counts to 0
	foreach my $window(@{$chunk_windows{$chunk_length}}){
	  $counts{$window}=0;
	}



	#Now walk through slice using slice length chunks and build all windows in each chunk
	my $in_slice     = 1;
	my $start_adj    = 0;
	my $slice_start  = $orig_start;
	
	#my $end_adj      = $slice->start + $chunk_length - 1;
	#my $end_adj = $chunk_length;

	
	
	my ($start, $end, $features, $bins);
	#$chunk_slice, 
	

	while($in_slice){

	  $start = $slice_start + $start_adj;
	  $end   = $start + $chunk_length - 1;
	  

	  #Last chunk might not be the correct window length
	  #Hence why we should do this on whole chromosomes
	  #Force this or just warn, check seq_region slice
	  
	  
	  #if($start > $slice_end);

	  if($end >= $slice_end){
		#or equal as we want to catch end of slice and set in_slice
		$end = $slice_end;
		$in_slice = 0;
	  }
	  

	  $slice = $slice_adaptor->fetch_by_region($region, $seq_region_name, $start, $end, $strand);
	  $features = $this->fetch_all_by_Slice_ResultSet($slice, $rset);

	  #Shift chunk coords
	  $start_adj = $chunk_length if($in_slice);
	  $slice_start = $slice->start;
	  #$start_adj += $chunk_length;
		#$end_adj   += $chunk_length;
	  #}
	
	
	  next if scalar(@$features) == 0;

	  
	  #This should return a hash of window size => bin array pairs
	  $bins = $this->_bin_features_by_window_sizes
				  ( 
				   -slice  => $slice,
				   -window_sizes  => $chunk_windows{$chunk_length},
				   -method => $method,
				   -features =>
				   $features,
				  );


	  my $bin_start = $slice->start;
	  #my $bin_end   = $slice->start;#???? wtf?

	  #We need to handle strandedness of slice!?	  
	  
	  #Store all normal features in result_feature
	  if($store_natural){
		
		foreach my $feature(@$features){
		  $counts{0}++;
		  
		  #warn "storing ".join(', ',	($feature->start, $feature->end, $feature->strand, $feature->score, 'undef', $rset->dbID, 0, $slice));
		  
		  
		  $this->store(Bio::EnsEMBL::Funcgen::ResultFeature->new_fast
					   (
						($feature->start + $bin_start + $slice_adj), 
						($feature->end   + $bin_start + $slice_adj), 
						$feature->strand, 
						$feature->score, 
						undef,#absent probe info
						$rset->dbID, 
						0, #window size
						$store_slice
					   )); 
		}
		print "Window size 0 (natural resolution) has ".scalar(@{$features})." feature bins\n";	
	  }
	  
	  my ($bin_end, $bin_score);

	  foreach my $wsize(sort keys %{$bins}){
		my $count = 0;

		$bin_start = $slice->start;
		$bin_end   = $slice->start;


		foreach my $bin_index(0..$#{$bins->{$wsize}}){
		  $bin_score = $bins->{$wsize}->[$bin_index];

		  
		  #next if ! $bin_score;#No we're no inc'ing the start ends for bins with no scores

		  $bin_end += $wsize;
	
		  if($bin_score){
			#$counts{$wsize}++;	
			$count++;

			#This is a little backwards as we are generating the object to store it
			#If we are aiming for speed the maybe we could also commodotise the store method
			#store by args or hash? store_fast?
			#Speed not essential for storing
			
			#Note: list ref passed
			
			#warn "storing $bin_start, $bin_end, $strand, $bin_score, undef, $rset->dbID, $wsize, $slice";
			
			$this->store(Bio::EnsEMBL::Funcgen::ResultFeature->new_fast
						 (
						  ($bin_start + $slice_adj), 
						  ($bin_end   + $slice_adj), 
						  $strand, 
						  $bin_score, 
						  undef,#absent probe info
						  $rset->dbID, 
						  $wsize, 
						  $store_slice
						 ));
		  }
		  
		  $bin_start += $wsize;
		}

		print "Window size $wsize has ".scalar(@{$bins->{$wsize}})." bins, storing $count\n";
		$counts{$wsize}+= $count;	
	  }
	}



	#Turn off storing of natural resolution for next chunk length sets
	$store_natural = 0;
  }
  

  

  #print some counts here
 
  foreach my $wsize(sort (keys %counts)){
	print "Stored ".$counts{$wsize}." for window size $wsize for ".$orig_slice->name."\n";
  }

  #Return this counts hash so we can print/log from the caller, hence we don't print in here?
  
  return;
}


#This is a slight rewrite to calculate bins for a list of all window sizes
#Should only be called by store_window_bins_by_Slice_ResultSet
#This could be put in Collection and _bin_feature could call this to maintain interface

}

_bin_features_by_window_sizes	No description	Code
_columns	No description	Code
_create_feature	No description	Code
_create_feature_fast	No description	Code
_default_where_clause	No description	Code
_tables	No description	Code
store_window_bins_by_Slice_ResultSet	No description	Code
validate_bin_method	No description	Code