Raw content of Bio::EnsEMBL::Variation::DBSQL::ReadCoverageCollectionAdaptor # # Ensembl module for Bio::EnsEMBL::Variation::DBSQL::ReadCoverageCollectionAdaptor # # Copyright (c) 2005 Ensembl # # You may distribute this module under the same terms as perl itself # # =head1 NAME Bio::EnsEMBL::Variation::DBSQL::ReadCoverageCollectionAdaptor =head1 SYNOPSIS $vdb = Bio::EnsEMBL::Variation::DBSQL::DBAdaptor->new(...); $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); # tell the variation database where core database information can be # be found $vdb->dnadb($db); $rca = $vdb->get_ReadCoverageCollectionAdaptor(); $sa = $db->get_SliceAdaptor(); $pa = $vdb->get_PopulationAdaptor(); # get read coverage in a region for a certain population in in a certain level $slice = $sa->fetch_by_region('chromosome', 'X', 1e6, 2e6); $population = $pa->fetch_by_name("UNKNOWN"); $level = 1; foreach $rc (@{$vfa->fetch_all_by_Slice_Sample($slice,$population,$level)}) { print $rc->start(), '-', $rc->end(), "\n"; } =head1 DESCRIPTION This adaptor provides database connectivity for ReadCoverageCollection objects. Coverage information for reads can be obtained from the database using this adaptor. See the base class BaseFeatureAdaptor for more information. =head1 AUTHOR - Yuan Chen =head1 CONTACT Post questions to the Ensembl development list ensembl-dev@ebi.ac.uk =head1 METHODS =cut use strict; use warnings; package Bio::EnsEMBL::Variation::DBSQL::ReadCoverageCollectionAdaptor; use Bio::EnsEMBL::Variation::ReadCoverageCollection; use Bio::EnsEMBL::Mapper::RangeRegistry; use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor; use Bio::EnsEMBL::Utils::Exception qw(throw warning); our @ISA = ('Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor'); my %Printable = ( "\\"=>'\\', "\r"=>'r', "\n"=>'n', "\t"=>'t', "\""=>'"' ); #my $_pack_type = "n"; =head2 fetch_all_by_Slice_Sample Arg[0] : Bio::EnsEMBL::Slice $slice Arg[1] : (optional) $sample_id Arg[2] : (optional) int $display_size (default 700) Arg[3] : (optional) int $display_type (one of "AVERAGE" or "MAX","MIN") (default "AVERAGE") Arg[4] : (optional) int $window_size Example : my $reads_coverages = $rcca->fetch_all_by_Slice_SampleId($slice,$sample_id,$display_size,$display_type,$window_size); Window_size defines which set of pre-averaged scores to use. Valid values are 50, 500 or 5000. There is no need to define the window_size because the program will select the most appropriate window_size to use based on the slice_length and the display_size. Description : Gets all the read coverage collections for a given sample(strain or individual) in a given slice ReturnType : listref of Bio::EnsEMBL::Variation::ReadCoverageCollection Exceptions : thrown on bad arguments Caller : general Status : At Risk =cut sub fetch_all_by_Slice_SampleId{ my $self = shift; my ($slice,$sample_id,$display_size,$display_type,$window_size) = @_; my $rcc = []; if(!ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { throw('Bio::EnsEMBL::Slice arg expected'); } # if (defined $sample) { # if (!ref($sample) || !$sample->isa('Bio::EnsEMBL::Variation::Sample')) { # throw('Bio::EnsEMBL::Variation::Sample arg expected'); # } # if (! defined($sample->dbID())) { # throw("Sample arg must have defined dbID"); # } # } #default display_size is 700 if (!defined $display_size) { $display_size = 700; } #default display_mode is AVERAGE if (!defined $display_type) { $display_type = "AVERAGE"; } #set up bucket object for storing bucket_size number of scores my $bucket_size = ($slice->length)/$display_size; #default window size is the largest bucket that gives at least #display_size values ie get speed but reasonable resolution my @window_sizes = (50,500,5000); #my $num_windows = 100; #check if valid window_size my $found = 0; if (defined $window_size) { foreach my $win_size (@window_sizes) { if ($win_size == $window_size) { $found = 1; last; } } if (!$found) { warning("Invalid window_size $window_size"); return $rcc; } } if (!defined $window_size) { #set window_size to be the largest for when for loop fails $window_size = $window_sizes[scalar(@window_sizes)-1]; for (my $i = 1; $i < scalar(@window_sizes); $i++) { if ($bucket_size < $window_sizes[$i]) { $window_size = $window_sizes[$i-1]; last; } } } #print "window_size is $window_size\n"; $rcc = $self->_fetch_all_by_Slice_WindowSize_SampleId($slice,$window_size,$sample_id) if defined $sample_id; $rcc = $self->_fetch_all_by_Slice_WindowSize_SampleId($slice,$window_size) if !$sample_id; if (scalar(@$rcc ==0)) { return $rcc; } #Find the min and max scores for y axis scaling. Save in first #read_coverage_collection object my ($min_y_axis, $max_y_axis) = _find_min_max_score($rcc); #add min and max scores to the first read_coverage_collection object if ((scalar @$rcc) > 0) { $rcc->[0]->y_axis_min($min_y_axis); $rcc->[0]->y_axis_max($max_y_axis); } return ($rcc); } sub _fetch_all_by_Slice_WindowSize_SampleId { my ($self,$slice,$window_size,$sample_id) = @_; my $rcc = []; my $rcs = []; my $reads_string_avg=[]; my $reads_string_min=[]; my $reads_string_max=[]; my ($window_start,$window_end); my $last_sql = (defined $sample_id) ? "AND sample_id = ?" : ''; my $sql = qq{SELECT window_size,window_start,window_end,read_coverage_string_avg,read_coverage_string_min,read_coverage_string_max,sample_id FROM read_coverage_collection WHERE seq_region_id = ? AND window_size = ? AND window_end > ? AND window_start < ? $last_sql }; my $sth = $self->prepare($sql); $sth->execute($slice->get_seq_region_id,$window_size,$slice->start,$slice->end,$sample_id) if defined $sample_id; $sth->execute($slice->get_seq_region_id,$window_size,$slice->start,$slice->end) if !defined $sample_id; #these bind_param are not working here #$sth->bind_param(1,$slice->get_seq_region_id,SQL_INTEGER); #$sth->bind_param(2,$window_size,SQL_INTEGER); #$sth->bind_param(3,$slice->start,SQL_INTEGER); #$sth->bind_param(4,$slice->end,SQL_INTEGER); while (my @arrays = $sth->fetchrow_array()) { $window_size = $arrays[0]; $window_start = $arrays[1]; $window_end = $arrays[2]; $reads_string_avg = _unpack_strings($arrays[3]); $reads_string_min = _unpack_strings($arrays[4]); $reads_string_max = _unpack_strings($arrays[5]); $sample_id = $arrays[6] if $arrays[6]; #need to find the start_offset for slice_start and end_offset for slice_end #in the read coverage collection row #print "window_size is $window_size,window_start is $window_start,window_end is $window_end,avg is @$reads_string_avg,min is @$reads_string_min,max is @$reads_string_max\n"; for (my $j = 0; $j < @$reads_string_avg; $j++) { my $rc = {}; my $seq_region_start = $window_start + $window_size * $j ; my $seq_region_end = $seq_region_start + $window_size -1; next if ($seq_region_start > $slice->end or $seq_region_end < $slice->start); #covert seq_region_start and seq_region_end tp a start and end relative to the slice my ($start,$end); if ($slice->strand == -1) { $start = $slice->end - $seq_region_end + 1; $end = $slice->end - $seq_region_start + 1; } else { $start = $seq_region_start - $slice->start + 1; $end = $seq_region_end - $slice->start +1; } $rc->{'start'} = $start; $rc->{'end'} = $end; $rc->{'seq_region_start'} = $seq_region_start; $rc->{'seq_region_end'} = $seq_region_end; $rc->{'strand'} = $slice->strand; $rc->{'read_coverage_avg'} = $reads_string_avg->[$j]; $rc->{'read_coverage_min'} = $reads_string_min->[$j]; $rc->{'read_coverage_max'} = $reads_string_max->[$j]; push @{$rcs}, $rc; } }#end while loop foreach my $rc (@$rcs) { push @$rcc,$self-> _create_feature_fast('Bio::EnsEMBL::Variation::ReadCoverageCollection', {'adaptor' => $self, 'slice' => $slice, 'start' => $rc->{'start'}, 'end' => $rc->{'end'}, 'strand' => $rc->{'strand'}, 'window_size' => $window_size, 'window_start' => $window_start, 'window_end' => $window_end, 'seq_region_start' => $rc->{'seq_region_start'}, 'seq_region_end' => $rc->{'seq_region_end'}, 'seq_region_strand' => 1, 'sample_id' => (defined $sample_id) ? $sample_id : '', 'read_coverage_avg' => $rc->{'read_coverage_avg'}, 'read_coverage_min' => $rc->{'read_coverage_min'}, 'read_coverage_max' => $rc->{'read_coverage_max'}, }); } #sort into numerical order based on position my @sorted_features = sort {$a->{start} <=> $b->{start}} @$rcc; return\@sorted_features; } sub _unpack_strings { my $string = shift; my $pack_type = "n" x (length($string)/2); my @arrays = unpack($pack_type,$string); return \@arrays; } sub escape ($) { local $_ = ( defined $_[0] ? $_[0] : '' ); s/([\r\n\t\\\"])/\\$Printable{$1}/sg; return $_; } sub _find_min_max_score { my ($scores) = @_; my $min; my $max; foreach my $score (@$scores) { #find min and max of diff scores if (defined $score->read_coverage_min) { #if min hasn't been defined yet, then define min and max unless (defined $min and defined $max) { $min = $score->read_coverage_min; $max = $score->read_coverage_max; } if ($min > $score->read_coverage_min) { $min = $score->read_coverage_min; } if ($max < $score->read_coverage_max) { $max = $score->read_coverage_max; } } } return ($min, $max); } 1;