Package variables | General documentation | Methods |
WebCvs | Raw content |
_bin_features_by_window_sizes | No description | Code |
_columns | No description | Code |
_create_feature | No description | Code |
_create_feature_fast | No description | Code |
_default_where_clause | No description | Code |
_tables | No description | Code |
store_window_bins_by_Slice_ResultSet | No description | Code |
validate_bin_method | No description | Code |
_bin_features_by_window_sizes | description | prev | next | Top |
# _bin_features_by_window_sizes
#
# Bins the passed features into score bins, one bin array per requested
# window size, across the passed Slice.  Called with named args:
#   -SLICE        : Bio::EnsEMBL::Slice the features lie on (local coords)
#   -WINDOW_SIZES : arrayref of window (bin) sizes in bp
#   -METHOD       : binning method name, defaults to 'average_score'
#   -FEATURES     : arrayref of features to bin
# Returns an empty hashref immediately when no features are passed.
#
# NOTE(review): this definition is truncated in this view — the code that
# actually fills the bins per binning method (and the final return of the
# wsize => bins hash) is not visible here.
my $this = shift; my ( $slice, $window_sizes, $method, $features ) = rearrange( [ 'SLICE', 'WINDOW_SIZES', 'METHOD', 'FEATURES' ], @_ ); if ( !defined($features) || !@{$features} ) { return {} } #No need to validate window sizes as done in the store_window_bins?}
#Do here anyway?
warn "Processing window sizes ".join(', ',@$window_sizes).' for slice '.$slice->name; #Set default to ResultFeature implementation?
$method ||= 'average_score'; #Should sub this to allow different implementations to use it
# validate_bin_method translates the method name into its numeric index,
# which is what the (not visible) binning code switches on.
$method = $this->validate_bin_method($method); my $slice_start = $slice->start(); #my $bin_length = ( $slice->end() - $slice_start + 1 )/$nbins;
#my @bins;
#Set up some hashes to store data by window_size
# %bins       : wsize => arrayref of bin scores (initialised to 0)
# %nbins      : wsize => highest bin index for this slice/window size
# %bin_counts : wsize => arrayref of per-bin feature counts
my (%bins, %nbins, %bin_counts); #my($first_bin);
#if ( $method == 0 || # 'count' or 'density'
# $method == 3 || # 'fractional_count' or 'weight'
# $method == 4 # 'coverage'
# ){
# # For binning methods where each bin contain numerical values.
# $first_bin = 0;
# }
# else {
# # For binning methods where each bin does not contain numerical
# # values.
#
# #Remove this
# $first_bin = undef;
# }
#Set up some bin data for the windows
foreach my $wsize (@$window_sizes) { $nbins{$wsize} = int($slice->length / $wsize);#should this be -1 as this will be an index?
#no as int always rounds down
#So nbins is actually the index of the bin not the 'number'
#@{$bins{$wsize}} = map( $first_bin, 1 .. $nbins{$wsize});
#Create default bins with 0
@{$bins{$wsize}} = (); map {$bins{$wsize}->[$_] = 0} (0 .. $nbins{$wsize}); #Set bin counts to 0 for each bin
@{$bin_counts{$wsize}} = (); #Huh this is doing the same thing?
#We don't want to do this as this implies we have seen a 0 value
#When we have not seen any value at all!
#Need to set to undef? Or not set at all?
#This is adding an undef to the start of the array!?
# NOTE(review): the map and the foreach below are redundant — both zero
# $bin_counts{$wsize} using the (all-zero) bin scores as indices, so only
# element [0] is actually set here.  TODO confirm intent and remove one.
map { $bin_counts{$wsize}->[($_)] = 0 } @{$bins{$wsize}}; foreach my $bin(@{$bins{$wsize}}){ $bin_counts{$wsize}->[$bin] = 0; } } #warn "bin_counts are :\n".Data::Dumper::Dumper(\%bin_counts);
#This fails for slices which are smaller than the chunk length;
#Just do a little sanity test here to make sure the bin_lengths == wsizes
#Then remove
#foreach my $wsize (keys %bin_lengths) {
# warn "wsize is $wsize with bin_length ".$bin_lengths{$wsize};
# throw('Oh no, these should be the same') if($wsize != $bin_lengths{$wsize});
# }
my $feature_index = 0; #What is this?
my @bin_masks; #my %start_bins;
#my %end_bins;
#Also store local starts and ends as we currently calc this several times
my $lfeature_start; my $lfeature_end; #To remove skew from bins we need to clip the start and end
#if they do not have overlapping features.
#So we need to store features in each bin and post process.
#This would be easiest, but 2nd loop would slow it down.
#Not so bad for storage, but let's try and keep it quick
#So in line bin clipping requires awareness of when we have moved to a new bin
#between features. This holds for overlapping and non-overlapping features.
#Once we have observed a gap we need to clip the end of the last bin and clip the start of the new bin.
#This requires knowing the greatest end values from the last bin's feature.
#what if two overlapping features had the same start and different end, would we see the longest last?
#Check default slice_fetch sort
# Main loop: for every feature, work out which bin its start and end fall
# into for each window size.  Features are assumed to carry slice-local
# coordinates (see comments below) — TODO confirm against the caller.
foreach my $feature ( @{$features} ) { #Set up the bins for each window size
foreach my $wsize (@$window_sizes) { #We have already highjacked the object creation by here
#This is done in core BaseFeatureAdaptor
#We probably don't want to do this for ResultFeatures as we don't use the
#standard feature implementation
#we already use an array and we don't store the slice
#as this is already known by the caller
#and we always build on top level so we don't need to remap
#We do however need the slice to store?
#Do we?
#Yes, as we only store local starts when generating
#We need a store by Slice method?
#This will remove the need to inherit from Feature.
#These will need to be regenerated everytime we import a new build
#As we do with the probe_features themselves
#This also mean the result_feature status has to be associated with a coord_system_id
#Which bins do the start and end lie in for this feature?
#Already dealing with local starts, so no slice subtraction
my $start_bin = int(($feature->start ) / $wsize);
my $end_bin = int(($feature->end) / $wsize );
#Don't need slice start as we are dealing with local coords
#my $start_bin =
# int( ( $feature->[FEATURE_START] - $slice_start )/$bin_length );
# my $end_bin =
# int( ( $feature->[FEATURE_END] - $slice_start )/$bin_length );
#This might happen for last feature
#if ( $end_bins{$wsize} >= $nbins{$wsize} ) {
# $end_bins{$wsize} = $nbins{$wsize};
# }
# Clamp overhanging features into the last bin.
$end_bin = $nbins{$wsize} if $end_bin > $nbins{$wsize}; #Slightly obfuscated code here
#Altho this should speed up generation
#by avoiding string comparisons.
#We should do default count processing for all methods as this is required for all no?
#No, just weight, coverage and average_score
# NOTE(review): truncated here — the per-method bin accumulation and the
# return statement are outside this view.
_columns | description | prev | next | Top |
# Returns the column list used in the SELECT statement.
# In lightweight mode every column after the first five is replaced by
# the literal 1, so the statement still selects the expected number of
# columns without fetching the unused data.
# NOTE(review): this masking is probably not sensible for ResultFeature,
# nor for objects spread across several tables.
my ($this) = @_;
my @cols = $this->SUPER::_columns();
if ( $this->_lightweight() ) {
  # Blank out everything past the core feature columns.
  for my $idx ( 5 .. $#cols ) {
    $cols[$idx] = 1;
  }
}
return @cols;
}
_create_feature | description | prev | next | Top |
# Builds a feature via the parent implementation; unless the adaptor is
# in lightweight mode, the extended attributes (phase, end_phase,
# stable_id, version, created/modified dates, is_current) are unpacked
# from $args and appended to the feature arrayref.
my ( $this, $feature_type, $args ) = @_;
my $feature = $this->SUPER::_create_feature( $feature_type, $args );
unless ( $this->_lightweight() ) {
  # rearrange returns one value per requested key, in order.
  my @extra_attrs = rearrange(
    [ 'PHASE',        'END_PHASE',     'STABLE_ID', 'VERSION',
      'CREATED_DATE', 'MODIFIED_DATE', 'IS_CURRENT' ],
    %{$args} );
  push @{$feature}, @extra_attrs;
}
return $feature;
}
_create_feature_fast | description | prev | next | Top |
# Thin pass-through to the parent fast feature constructor.
# NOTE(review): this might not be sensible for Features which are split
# across several tables.
my ( $this, $feature_type, $args ) = @_;
my $new_feature = $this->SUPER::_create_feature_fast( $feature_type, $args );
return $new_feature;
}
_default_where_clause | description | prev | next | Top |
# Lightweight mode performs no joins, so no default where clause is
# needed; otherwise defer to the parent adaptor.
my ($this) = @_;
return '' if $this->_lightweight();
return $this->SUPER::_default_where_clause();
}
_tables | description | prev | next | Top |
# In lightweight mode only the primary table is queried (no joins);
# otherwise the full parent table list is returned.
my ($this) = @_;
my @table_defs = $this->SUPER::_tables();
return $table_defs[0] if $this->_lightweight();
return @table_defs;
}
store_window_bins_by_Slice_ResultSet | description | prev | next | Top |
# store_window_bins_by_Slice_ResultSet
#
# Pre-computes and stores binned ResultFeatures for a ResultSet across a
# Slice, for one or more window (bin) sizes.  Named args:
#   -SLICE        : Bio::EnsEMBL::Slice to process
#   -RESULT_SET   : Bio::EnsEMBL::Funcgen::ResultSet to bin and store
#   -WINDOW_SIZES : arrayref of window sizes in bp; undef uses the
#                   adaptor defaults (window size 0 = natural resolution)
#   -LOGIC_NAME   : accepted but not currently used below
#   -ROLLBACK     : accepted but not currently used below
# Throws if no window sizes are available, or if the ResultSet already
# has the RESULT_FEATURE_SET status (i.e. bins already stored).
# Returns nothing; progress/counts are printed as a side effect.
#
# Strategy: find a chunk length divisible by all window sizes (or
# partition the window sizes into sets each with a workable chunk
# length), then walk the slice chunk by chunk, binning each chunk's
# features via _bin_features_by_window_sizes and storing the results.
my $this = shift; my ( $slice, $rset, $window_sizes, $logic_name, $rollback) = rearrange( [ 'SLICE', 'RESULT_SET', 'WINDOW_SIZES', 'LOGIC_NAME', 'ROLLBACK'], @_ ); my $slice_adaptor = $this->db->dnadb->get_SliceAdaptor; #my $method='average_score';#average}
my $method='max_magnitude';#i.e. greatest positive or negative score
#Hardcoded method to average. What about those probe which cross boundaries?
#Will this be returned?
# Window-size validation: default from $this->window_sizes when possible.
if(! defined $window_sizes){ if($this->can('window_sizes')){ @{$window_sizes} = $this->window_sizes; print 'No window sizes provided running with defaults: '.join(' ', @$window_sizes)."\n"; } else{ throw('No default window sizes available. Must supply at least one window size as an arrayref'); } } elsif(scalar(@$window_sizes) == 0){ throw('Must supply at least one window size as an arrayref, or leave undef for defaults'); } #Slice methods return over hanging feature, so we could do this externally
#If we are not returning over hanging features we should really handle whole chromosome
#As we will need persistant data to handle over hangs
#Do as whole chromosome anyway and chunk the Slices into windows which are divisible by all windows sizes
#Nope, too much memmory, we either need to store here or generate the slice chunks in the caller and handle storing there
#This is an adaptor after all let's do it here
#Test for Rset here and whether we have status?
#Currently just result_feature_set.
#Need to implement roll back method in Helper
#fetch this first to perform validation on slice and ResultSet??
#my $rfs = $this->fetch_all_by_Slice_ResultSet($slice, $rset)
#maybe want to tst if RESULT_FEATURE_SET before we do the query?
#Need to as coord_system here.
#Could rollback by slice(for all window sizes)
if($rset->has_status('RESULT_FEATURE_SET')){ throw('ResultSet('.$rset->name.') already has precomputed ResultFeatures stored, please rollback ResultFeature first'); #Retrieving ResultFeatures here would end up retrieving them from the result_feature table
#which is where we want to store them
#They are only retrived from the probe/result/probe_feature table if they do not have this status.
#REMEMBER TO SET THIS IN THE CALLING SCRIPT!
} #Calulate sensible slice length based on window sizes
my @wsizes = sort {$a <=> $b} @$window_sizes; #Current limit is 500KB, so let's start there
#we should really start with a nice round number
#so
# Start from the largest multiple of the biggest window size <= 500kb,
# then shrink by one biggest-window until divisible by every window size.
my $multiplier = int(500000/$wsizes[$#wsizes]);
my $chunk_length = $multiplier * $wsizes[$#wsizes]; my $not_divisible = 1; my %chunk_windows;#Registry of chunk lengths to run with windows
my %workable_chunks = map {$_ => {}} @wsizes; delete $workable_chunks{'0'};#get rid of natural resolution as this will always work
while($not_divisible && $chunk_length != 0){ $not_divisible = 0; foreach my $wsize(@wsizes){ next if $wsize == 0;#Special wsize for normal data
#Set not divisible if modulus is true
if($chunk_length % $wsize){ $not_divisible = 1; } else{ #No need to listref here?
$workable_chunks{$wsize}{$chunk_length} = []; } #warn "chunk length is $chunk_length and not_divisible is $not_divisible";
} #Gradually shrink the length until we find a workable slice length for all windows
$chunk_length -= $wsizes[$#wsizes] if $not_divisible; } my %chunk_sets; if($chunk_length == 0){ print "Could not find chunk length for all window sizes, attempting to subset windows using alternate slice length\n"; #my %sorted_windows
# Fallback: no single chunk length works for every window size, so build
# a co-occurrence registry (%chunk_sets: chunk => {wsize => undef}) and
# pick the largest set of window sizes sharing the longest chunk length.
foreach my $wsize(keys %workable_chunks){ #Loop through windows, seeing if they are workable in the other windows
foreach my $chunk(keys %{$workable_chunks{$wsize}}){ foreach my $other_wsize(keys %workable_chunks){ next if $wsize == $other_wsize; if(exists $workable_chunks{$other_wsize}{$chunk}){ #only push it onto the other wsize, as we will do the reverse later
$chunk_sets{$chunk}{$wsize} = undef; } } } } #Now we have a register of co-occurence of wsizes with repect to chunks
#Loop through finding the least amount of sets with the longest chunk length?
#There is no way to decide which is best?
#we could calculate the number of loops? Factored by the chunk length?
#Let's just print out and see what we get
#warn "chunk sets are :\n".Data::Dumper::Dumper(\%chunk_sets);
#For now let's just take the one which has the most windows and the longest chunk
#Then we just get the largest which handles the rest.
#define possible set lengths
my $i = 0; my %set_lengths; map {$set_lengths{$i} = []; $i++} @wsizes; delete $set_lengths{'0'};#get rid of natural resolution as this will always work
#store chunks lengths for each set size
foreach my $chunk(keys %chunk_sets){ my $set_size = scalar(values %{$chunk_sets{$chunk}}); push @{$set_lengths{$set_size}}, $chunk; } #Now we get the biggest set with the longest length;
my $largest_size = scalar(@wsizes);#scalar here as we are disregarding natural resolution of 0 in loop
my $found_largest_set = 0; while(! $found_largest_set){ $largest_size--; if(scalar(@{$set_lengths{$largest_size}}>0)){ $found_largest_set = 1; } } #We should be able to loop this bit, to find all the biggest sets.
my ($largest_chunk) = sort {$b<=>$a} @{$set_lengths{$largest_size}}; #we could even be selective here, but let's just take the first one for now
my @largest_windows = keys %{$chunk_sets{$largest_chunk}}; @{$chunk_windows{$largest_chunk}} = @largest_windows; print "Largest chunk $largest_chunk($largest_size) contains windows: @largest_windows\n"; my %remaining_windows = map {$_ => {}} @wsizes; delete $remaining_windows{'0'};#get rid of natural resolution as this will always work
map { delete $remaining_windows{$_} } @largest_windows; my $remaining_set_size = scalar(keys %remaining_windows); #swapping to array here for practicality, would need to maintain hash if we need to iterate
my @rwindows = keys %remaining_windows; #This could just be one window, but this will not be inthe co-occurence hash %chunk_sets
#Hence the normal approach will not work. and we just want to find a suitably large chunk for this one window.
my $next_chunk; if(scalar(@rwindows) == 1){ #we just want to find a suitably large chunk for this one window.
my ($last_window) = @rwindows; $multiplier = int(500000/$last_window);
$next_chunk = $multiplier * $last_window; } else{ #Now were are doing something very similar to above
#populating a set_size chunk length registry
#my %seen_hash;
foreach my $chunk(sort {$b<=>$a} @{$set_lengths{$remaining_set_size}}){ my $seen_count = 0; foreach my $rwindow(@rwindows){ $seen_count++ if grep/$rwindow/, (values %{$chunk_sets{$chunk}}); } if ($seen_count == $remaining_set_size){ $next_chunk = $chunk; last; } } } @{$chunk_windows{$next_chunk}} = @rwindows; if($next_chunk){ print "Found next chunk length $next_chunk contains remaining windows:\t@rwindows\n"; #Now we want to cycle through all the set lengths which could contain the ones not in the first
#so we need to
} else{ warn "Need to write iterative sub for set definition"; throw('Could not find workable slice length for remaining windows: '. join(', ', @rwindows)); } } else{ @{$chunk_windows{$chunk_length}} = keys(%workable_chunks); print "Found workable chunk length($chunk_length) for all window sizes:\t". join(' ', @{$chunk_windows{$chunk_length}})."\n"; } #Not lightweight as we will be storing them
# Temporarily set the collection to be lightweight??? #Why?
#my $old_value = $this->_lightweight();
#if ( defined($lightweight) ) { $this->_lightweight($lightweight) }
#else { $this->_lightweight(1) }
# Natural-resolution (window size 0) features are only stored when
# requested, and only for the first chunk-length set (see the flag reset
# at the bottom of the chunk loop).
my (%counts, $store_natural); $store_natural = 1 if(grep/^0/, @$window_sizes); #Set natural res count to 0
$counts{0}=0; my $store_slice = $slice; my $slice_adj = 0; my $slice_end = $slice->end; my $orig_slice = $slice; my $orig_start = $slice->start; my $region = $slice->coord_system_name; my $seq_region_name = $slice->seq_region_name; my $strand = $slice->strand; my $rset_id = $rset->dbID; if($store_slice->start != 1){ $store_slice = $slice->adaptor->fetch_by_region('chromosome', $slice->seq_region_name); $slice_adj = ($slice->start - 1); } #Can probably tidy this up and just alter start adjust directly
# Main storage loop: one pass per workable chunk length, walking the
# slice in chunk-sized steps and binning/storing each chunk's features.
foreach my $chunk_length(sort keys %chunk_windows){ print "Processing windows ".join(', ', @{$chunk_windows{$chunk_length}})." with chunk length $chunk_length\n"; #Set window counts to 0
foreach my $window(@{$chunk_windows{$chunk_length}}){ $counts{$window}=0; } #Now walk through slice using slice length chunks and build all windows in each chunk
my $in_slice = 1; my $start_adj = 0; my $slice_start = $orig_start; #my $end_adj = $slice->start + $chunk_length - 1;
#my $end_adj = $chunk_length;
my ($start, $end, $features, $bins); #$chunk_slice,
while($in_slice){ $start = $slice_start + $start_adj; $end = $start + $chunk_length - 1; #Last chunk might not be the correct window length
#Hence why we should do this on whole chromosomes
#Force this or just warn, check seq_region slice
#if($start > $slice_end);
if($end >= $slice_end){ #or equal as we want to catch end of slice and set in_slice
$end = $slice_end; $in_slice = 0; } $slice = $slice_adaptor->fetch_by_region($region, $seq_region_name, $start, $end, $strand); $features = $this->fetch_all_by_Slice_ResultSet($slice, $rset); #Shift chunk coords
$start_adj = $chunk_length if($in_slice); $slice_start = $slice->start; #$start_adj += $chunk_length;
#$end_adj += $chunk_length;
#}
next if scalar(@$features) == 0; #This should return a hash of window size => bin array pairs
$bins = $this->_bin_features_by_window_sizes ( -slice => $slice, -window_sizes => $chunk_windows{$chunk_length}, -method => $method, -features => $features, ); my $bin_start = $slice->start; #my $bin_end = $slice->start;#???? wtf?
#We need to handle strandedness of slice!?
#Store all normal features in result_feature
if($store_natural){ foreach my $feature(@$features){ $counts{0}++; #warn "storing ".join(', ', ($feature->start, $feature->end, $feature->strand, $feature->score, 'undef', $rset->dbID, 0, $slice));
$this->store(Bio::EnsEMBL::Funcgen::ResultFeature->new_fast ( ($feature->start + $bin_start + $slice_adj), ($feature->end + $bin_start + $slice_adj), $feature->strand, $feature->score, undef,#absent probe info
$rset->dbID, 0, #window size
$store_slice )); } print "Window size 0 (natural resolution) has ".scalar(@{$features})." feature bins\n"; } my ($bin_end, $bin_score); foreach my $wsize(sort keys %{$bins}){ my $count = 0; $bin_start = $slice->start; $bin_end = $slice->start; foreach my $bin_index(0..$#{$bins->{$wsize}}){ $bin_score = $bins->{$wsize}->[$bin_index]; #next if ! $bin_score;#No we're no inc'ing the start ends for bins with no scores
$bin_end += $wsize; if($bin_score){ #$counts{$wsize}++;
$count++; #This is a little backwards as we are generating the object to store it
#If we are aiming for speed the maybe we could also commodotise the store method
#store by args or hash? store_fast?
#Speed not essential for storing
#Note: list ref passed
#warn "storing $bin_start, $bin_end, $strand, $bin_score, undef, $rset->dbID, $wsize, $slice";
$this->store(Bio::EnsEMBL::Funcgen::ResultFeature->new_fast ( ($bin_start + $slice_adj), ($bin_end + $slice_adj), $strand, $bin_score, undef,#absent probe info
$rset->dbID, $wsize, $store_slice )); } $bin_start += $wsize; } print "Window size $wsize has ".scalar(@{$bins->{$wsize}})." bins, storing $count\n"; $counts{$wsize}+= $count; } } #Turn off storing of natural resolution for next chunk length sets
$store_natural = 0; } #print some counts here
foreach my $wsize(sort (keys %counts)){ print "Stored ".$counts{$wsize}." for window size $wsize for ".$orig_slice->name."\n"; } #Return this counts hash so we can print/log from the caller, hence we don't print in here?
return; } #This is a slight rewrite to calculate bins for a list of all window sizes
#Should only be called by store_window_bins_by_Slice_ResultSet
#This could be put in Collection and _bin_feature could call this to maintain interface
validate_bin_method | description | prev | next | Top |
# Validates a binning method name and returns its numeric index.
#
# Bug fix: the original wrote ${$class::VALID_BINNING_METHODS}{...}, which
# Perl parses as the scalar $VALID_BINNING_METHODS in the *literal* package
# "class" — the $class lexical is never interpolated — and the error path
# inconsistently used package "self", so the "valid methods" listing was
# always empty.  Both are replaced with one symbolic reference into the
# object's actual class package, used consistently throughout.
# NOTE(review): package variables are not inherited, so methods registered
# by a parent class (e.g. Collection.pm) in its own package are still not
# visible here — confirm whether that was ever intended.
#
# Arguments: $method - binning method name to validate.
# Returns:   the integer code registered for $method.
# Throws:    via throw() when $method is not a registered binning method.
my ($self, $method) = @_;
my $class = ref($self) || $self;

# Symbolic access to the class package hash needs refs to be unstrict.
no strict 'refs';
my $valid_methods = \%{"${class}::VALID_BINNING_METHODS"};
use strict 'refs';

# Register the two methods this adaptor supports, without editing
# Collection.pm (indices follow on from the methods defined there).
$valid_methods->{'average_score'} = 5;
$valid_methods->{'max_magnitude'} = 6; # i.e. greatest positive or negative score

if ( ! exists $valid_methods->{$method} ) {
  throw( sprintf( "Invalid binning method '%s', valid methods are:\n\t%s\n",
                  $method,
                  join( "\n\t", sort keys %{$valid_methods} ) ) );
}

return $valid_methods->{$method};
}

1;