Raw content of Bio::EnsEMBL::Funcgen::ProbeFeature.

Raw content of Bio::EnsEMBL::Funcgen::ProbeFeature # # Ensembl module for Bio::EnsEMBL::Funcgen::ProbeFeature # # You may distribute this module under the same terms as Perl itself =head1 NAME Bio::EnsEMBL::ProbeFeature - A module to represent an nucleotide probe genomic mapping. =head1 SYNOPSIS use Bio::EnsEMBL::Funcgen::ProbeFeature; my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( -PROBE => $probe, -MISMATCHCOUNT => 0, -SLICE => $chr_1_slice, -START => 1_000_000, -END => 1_000_024, -STRAND => -1, -ANALYSIS => $analysis, ); #build_id/version, seq_region_id, seq_region_strand - from slice? #cigar_line? =head1 DESCRIPTION An ProbeFeature object represents the genomic placement of an Probe object. The data are stored in the probe_feature table. =head1 AUTHOR This module was created by Nathan Johnson, but is almost entirely based on the core OligoFeature module written by Ian Sealy. This module is part of the Ensembl project: / =head1 CONTACT Post comments or questions to the Ensembl development list: ensembl-dev@ebi.ac.uk =head1 METHODS =cut use strict; use warnings; package Bio::EnsEMBL::Funcgen::ProbeFeature; use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use Bio::EnsEMBL::Utils::Exception qw( throw ); use Bio::EnsEMBL::Feature; use Bio::EnsEMBL::Funcgen::Storable; use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(median); use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable); =head2 new Arg [-PROBE] : Bio::EnsEMBL::Funcgen::Probe - probe A ProbeFeature must have a probe. This probe must already be stored if you plan to store the feature. Arg [-MISMATCHCOUNT]: int Number of mismatches over the length of the probe. Arg [-SLICE] : Bio::EnsEMBL::Slice The slice on which this feature is. Arg [-START] : int The start coordinate of this feature relative to the start of the slice it is sitting on. Coordinates start at 1 and are inclusive. Arg [-END] : int The end coordinate of this feature relative to the start of the slice it is sitting on. Coordinates start at 1 and are inclusive. Arg [-STRAND] : int The orientation of this feature. Valid values are 1, -1 and 0. Arg [-dbID] : (optional) int Internal database ID. Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor Database adaptor. Example : my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( -PROBE => $probe, -MISMATCHCOUNT => 0, -SLICE => $chr_1_slice, -START => 1_000_000, -END => 1_000_024, -STRAND => -1, -ANALYSIS => $analysis, -CIGARLINE => '15M2m3d4M', #Can represent transcript alignment as gapped genomic alignments #D(eletions) representing introns #Lowercase m's showing sequence mismatches ); Description: Constructor for ProbeFeature objects. Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature Exceptions : None Caller : General Status : At risk =cut sub new { my $caller = shift; my $class = ref($caller) || $caller; my $self = $class->SUPER::new(@_); my ($probe, $mismatchcount, $pid, $cig_line) = rearrange(['PROBE', 'MISMATCHCOUNT', 'PROBE_ID', 'CIGAR_LINE'], @_); #remove mismatch? #mandatory args? #warn "creating probe feature with $pid"; $self->{'probe_id'} = $pid if $pid; $self->probe($probe) if $probe; $self->mismatchcount($mismatchcount) if defined $mismatchcount;#do not remove until probe mapping pipeline fixed $self->cigar_line($cig_line) if defined $cig_line; #do we need to validate this against the db? Grab from slice and create new if not present? Will this be from the dnadb? #do we need this coordsys id if we're passing a slice? We should have the method but not in here? return $self; } =head2 new_fast Args : Hashref with all internal attributes set Example : none Description: Quick and dirty version of new. Only works if the code is very disciplined. Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature Exceptions : None Caller : General Status : At Risk =cut sub new_fast { bless ($_[1], $_[0]); } =head2 probeset Arg [1] : (optional) string - probeset Example : my $probeset = $feature->probeset(); Description: Getter and setter for the probeset for this feature. Shortcut for $feature->probe->probeset(), which should be used instead. Probeset is not persisted if set with this method. Returntype : string Exceptions : None Caller : General Status : Medium Risk : Use $feature->probe->probeset() because this may be removed =cut sub probeset { my $self = shift; $self->{'probeset'} = shift if @_; if ($self->{'probe'}) { $self->{'probeset'} = $self->probe()->probeset(); } return $self->{'probeset'}; } =head2 mismatchcount Arg [1] : int - number of mismatches Example : my $mismatches = $feature->mismatchcount(); Description: Getter and setter for number of mismatches for this feature. Returntype : int Exceptions : None Caller : General Status : High Risk =cut sub mismatchcount { my $self = shift; #replace with dynamic check of cigarline? $self->{'mismatchcount'} = shift if @_; return $self->{'mismatchcount'}; } =head2 cigar_line Arg [1] : str - Cigar line alignment annotation Example : my $cg = $feature->cigar_line(); Description: Getter and setter for number of the cigar line attribute for this feature. Returntype : str Exceptions : None Caller : General Status : High Risk =cut sub cigar_line { my $self = shift; $self->{'cigar_line'} = shift if @_; return $self->{'cigar_line'}; } =head2 probelength Args : None Example : my $probelength = $feature->probelength(); Description: Getter for the length of the probe. Shortcut for $feature->probe->length(), which should be used instead. Originally, this method returned the length of the feature, which was often, but not always, the same as the length of the probe. Returntype : int Exceptions : None Caller : General Status : Medium Risk : Use $feature->probe->length() because this may be removed =cut sub probelength { my $self = shift; return $self->probe->length(); } =head2 probe Arg [1] : Bio::EnsEMBL::Funcgen::Probe - probe Example : my $probe = $feature->probe(); Description: Getter, setter and lazy loader of probe attribute for ProbeFeature objects. Features are retrieved from the database without attached probes, so retrieving probe information for a feature will involve another query. Returntype : Bio::EnsEMBL::Funcgen::Probe Exceptions : None Caller : General Status : at risk =cut sub probe { my $self = shift; my $probe = shift; #can we not use _probe_id here? #why is probe_id not set sometimes? #warn "in pf and probe is ".$self->{'probe_id'}; if ($probe) { #warn "Probe defined and is ".$probe. "and probe id is".$self->{'probe_id'}; if ( !ref $probe || !$probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) { throw('Probe must be a Bio::EnsEMBL::Funcgen::Probe object'); } $self->{'probe'} = $probe; } if ( ! defined $self->{'probe'}){ # && $self->dbID() && $self->adaptor() ) { #$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_ProbeFeature($self); #warn "fetching probe with dbID ".$self->probe_id(); $self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_dbID($self->probe_id()); } return $self->{'probe'}; } =head2 probe_id Example : my $probe_id = $pfeature->probe_id(); Description: Getter for the probe db id of the ProbeFeature Returntype : int Exceptions : None Caller : General Status : at risk =cut sub probe_id{ my $self = shift; return $self->{'probe_id'} || $self->probe->dbID(); } =head2 get_results_by_channel_id Arg [1] : int - channel_id (mandatory) Arg [2] : string - Analysis name e.g. RawValue, VSN (optional) Example : my @results = $feature->results(); Description: Getter, setter and lazy loader of results attribute for ProbeFeature objects. Returntype : List ref to arrays containing ('score', 'Analysis logic_name'); Exceptions : None Caller : General Status : Medium Risk =cut sub get_results_by_channel_id { my $self = shift; my $channel_id = shift; my $anal_name = shift; warn("This method not fully implemented, remove/deprecate?"); #$self->{'results'} ||= {}; $self->{'results_complete'} ||= 0; if(! $self->{'results'} || ($anal_name && ! exists $self->{'results'}{$anal_name})){ #fetch all, set complete set flag $self->{'results_complete'} ||= 1 if(! $anal_name); foreach my $results_ref(@{$self->adaptor->fetch_results_by_channel_analysis($self->probe->dbID(), $channel_id, $anal_name)}){ $self->{'results'}{$$results_ref[1]} = $$results_ref[0]; } } return $self->{'results'} } #The experiment/al chip specificity has already been done by the ofa->fetch_all_by_Slice_Experiment #This may be called with no preceding Experiment specificity #this would return results for all experiments #do we need to set a default Experiment? #THis should return both Chip and Channel based results #just Chip for now #maybe retrieve and hash all if not Analysis object passed? Then return what? =head2 get_result_by_Analysis_ExperimentalChips Arg [1] : Bio::EnsEMBL::Analysis Arg [2] : listref - Bio::EnsEMBL::Funcgen::ExperimentalChip Example : my $result = $feature->get_result_by_Analysis_ExperimentalChips($anal, \@echips); Description: Getter of results attribute for a given Analysis and set of ExperimentalChips Returntype : float Exceptions : Throws is no Analysis or ExperimentalChips are not passed? Caller : General Status : High Risk =cut #make ExperimentalChips optional? #or have ResultSetAdaptor? Do we need a ResultSet? #may not have ExperimentalChip, so would need to return ec dbID aswell ######This will break/return anomalous if #ECs are passed from different experiments #ECs are passed from different Arrays sub get_result_by_Analysis_ExperimentalChips{ my ($self, $anal, $exp_chips) = @_; throw("Need to pass listref of ExperiemntalChips") if(scalar(@$exp_chips) == 0); throw("Need to pass a valid Bio::EnsEMBL::Analysis") if ! $anal->isa("Bio::EnsEMBL::Analysis"); my (%query_ids, %all_ids, %ac_ids); my $anal_name = $anal->logic_name(); foreach my $ec(@$exp_chips){ throw("Need to pass a listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects") if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip"); #my $tmp_id = $self->adaptor->db->get_ArrayAdaptor->fetch_by_array_chip_dbID($ec->array_chip_id())->dbID(); #$array_id ||= $tmp_id; #throw("You have passed ExperimentalChips from different if($array_id != $tmp_id) #if(exists $ac_ids{$ec->array_chip_id()}){ # throw("Multiple chip query only works with contiguous chips within an array, rather than duplicates"); # } $ac_ids{$ec->array_chip_id()} = 1; $all_ids{$ec->dbID()} = 1; $query_ids{$ec->dbID()} = 1 if(! exists $self->{'results'}{$anal_name}{$ec->dbID()}); } my @ec_ids = keys %query_ids; my @all_ids = keys %all_ids; #warn "ec ids @ec_ids\n"; #warn "all ids @all_ids\n"; #$self->{'results'} ||= {}; #$self->{'results_complete'} ||= 0;#do we need this now? if((scalar(@all_ids) - scalar(@ec_ids))> 1){ throw("DATA ERROR - There is more than one result stored for the following ExperimentalChip ids: @all_ids"); } elsif(! $self->{'results'} || (($anal_name && scalar(@ec_ids) > 0) && scalar(@all_ids) == scalar(@ec_ids))){ #fetch all, set complete set flag #$self->{'results_complete'} ||= 1 if(! $anal_name); #would need to look up chip and channel analyses here and call relevant fetch #or pass the chip and then build the query as = or IN dependent on context of logic name #if there are multiple results, last one will overwrite others #could do foreach here to deal with retrieving all i.e. no logic name #Can supply mutliple chips, but probe ids "should" be unique(in the DB at least) amongst contiguous array_chips #build the cache based on logic name and table_id #cahce key?? should we cat the ec_ids together? my @result_refs = @{$self->adaptor->fetch_results_by_Probe_Analysis_experimental_chip_ids($self->probe(), $anal, \@ec_ids)}; #Remove lines with no result while(@result_refs && (! $result_refs[0]->[0])){ shift @result_refs; } my $num_results = scalar(@result_refs); my ($result, $mpos); #throw("Fetched more than one result for this ProbeFeature, Analysis and ExperimentalChips") if (scalar(@result_refs) >1); #No sort needed as we sort in the query if($num_results == 1){ $result = $result_refs[0]->[0]; } elsif($num_results == 2){#mean $result = ($result_refs[0]->[0] + $result_refs[1]->[0])/2; } elsif($num_results > 2){#median or mean of median flanks $mpos = $num_results/2; if($mpos =~ /\./){#true median $mpos =~ s/\..*//; $mpos ++; $result = $result_refs[$mpos]->[0]; }else{ $result = ($result_refs[$mpos]->[0] + $result_refs[($mpos+1)]->[0])/2 ; } } $self->{'results'}{$anal_name}{":".join(":", @ec_ids).":"} = $result; } #do we return the ec ids here to, or do we trust that the user will know to only pass contiguous rather than duplicate chips #how are we going to retrieve the result for one of many possible ec id keys? #options, cat ec dbids as key, and grep them to find full key, then return result #this may hide the duplicate chip problem #If a query has already been made and cached,another query with one differing ID(duplicate result) may never be queried as we already have a cahced result #We shoulld pick up duplicates before this happens #If we try and mix ExperimentalChips from different experiments, then this would also cause multiple results, and hence hide some data my @keys; foreach my $id(@all_ids){ my @tmp = grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); #Hacky needs sorting, quick fix for release!! if(@tmp){ push @keys, grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); last; } } throw("Got more than one key for the results cache") if scalar(@keys) > 1; return $self->{'results'}{$anal_name}{$keys[0]}; } #Will this be too slow, can we not do one query across all tables sub get_result_by_ResultSet{ my ($self, $rset) = @_; my $results = $rset->adaptor->fetch_results_by_probe_id_ResultSet($self->probe_id(), $rset); return median($results); } 1;