Raw content of Bio::EnsEMBL::Analysis::RunnableDB::ProbeAlign.

Raw content of Bio::EnsEMBL::Analysis::RunnableDB::ProbeAlign =pod =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::ProbeAlign; =head1 SYNOPSIS my $affy = Bio::EnsEMBL::Analysis::RunnableDB::ProbeAlign->new (-db => $refdb, -analysis => $analysis_obj, -database => $EST_GENOMIC, -query_seqs => \@sequences, ); $affy->fetch_input(); $affy->run(); $affy->write_output(); #writes to DB =head1 DESCRIPTION This object maps probes to a genomic and transcript sequence, writing the results as Bio::EnsEMBL::Funcgen::ProbeFeatures. UnmappedObjects are also written for those probes which either do not map at all or exceed the maximum mapping threshold defined by HIT_SATURATION_LEVEL. You must FIRST have created and imported all the necessary Arrays and Probe objects using the ImportArrays module. =head1 CONTACT Post general queries to B<ensembl-dev@ebi.ac.uk> =head1 METHODS =cut package Bio::EnsEMBL::Analysis::RunnableDB::ProbeAlign; use strict; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Analysis::RunnableDB; use Bio::EnsEMBL::Analysis::Runnable::ExonerateProbe; use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::UnmappedObject; use Bio::EnsEMBL::DBEntry; use Bio::EnsEMBL::Analysis::Config::ProbeAlign; use vars qw(@ISA); @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB); ############################################################ #TO DO # 1. Account for 5'/3' hanging alignment mismatches # These may be a sequence match or mismatch, but are currently being reported as an alignment 'm' # Which implies a sequence mis-match. This is simple as we don't generally have long overhangs # due to restrictive mismatch rules in ExonerateProbe # But would get more complicated (i.e. performing alignment) with longer overhangs # Can we not just report the shorter alignment? This will cause problems when trying # to identify which bp of the probe a particular mismatch is on i.e.13th for AFFY MM probe? # Can we substitute M for E(nd) or U(nknown)? This would have to be done in ExonerateProbe # Is potentially a source for problems if a probe were to overhand the 3' end and this were not # caught by the genomic mapping due to an intron being present at the very end of the transcript. # e.g. ENSMUST00000097969 # To capture this we must perform 5'/3' extension on the alignment against the genome! # Would need to get the query sequence from ExonerateProbe or directly from the fasta # Change to U(nknown not exonerate standard nomenclature) or N(on-equivalenced region), the # later is normally used for large regions on low conservation in protein alignments e.g. loops # THIS IS NOW SET TO U IN set_probe_and_slice # # 2. Delay writes of unmapped objects so we don't write anything unless -w is specified when testing # Also prevents having to rollback if job failed hlafway through filter # # 3. Enable rollback by chunk. This should skip the run sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); #Because we dont know whether the sort of adapter (compara, hive, core) #we'll be passed, it's better to just remake the core dbadaptor and #keep it on ourselves as the dna db - this has to be the intent of the #dataadaptor input through the rulemanager / beekeeper / whatever $self->read_and_check_config; #Set up and check DBs before we do the mapping $self->outdb->dbc->db_handle; $self->outdb->dnadb->dbc->db_handle; $self->arrays({}); $self->probes({}); #$self->features({});#?? #$self->{'_unmapped_objects'} = []; my $logic = $self->analysis->logic_name; my $schema_build = $self->outdb->_get_schema_build($self->outdb->dnadb); my $species = $self->outdb->species; throw('Must provide a -species to the OUTDB in Bio::EnsEMBL::Analysis::Config::ProbeAlign') if ! $species; my ($db_name, $display_name); if($logic =~ /Transcript/){ $self->{'mapping_type'} = 'transcript'; #We need to check for ensembl_Core_Transcript external db here #This should also be linked to the release #Should we store here? May over-write each other? #No problem if it fails once as the pipeline will resubmit the job #Will this cause problems with maintaining a master external_dbs file #As this may will grow *3 for every release. #But once we allow GENE, TRANSCRIPT, PROTEIN in xref.info_type #Then this would only be one extra db per release #Would be DBEntry::ADaptor->get_all_external_dbs_by_name #This is tricky as we may be aligning against an old DB #i.e. if there was a new array but not a new build #Current DBEntry queries are naive to external_db version $db_name = $species.'_core_Transcript'; $display_name = 'EnsemblTranscript'; } else{ $self->{'mapping_type'} = 'genomic'; $db_name = $species.'_core_Genome'; $display_name = 'EnsemblGenome'; #This is really just a place holder for UnmappedObjects against the entire genome #Should we redo this as EnsemblSpecies and actually have xref entries for each species name? } my $sql = "select external_db_id from external_db where db_name='$db_name' and db_release='$schema_build'"; my ($extdb_id) = $self->outdb->db_handle->selectrow_array($sql); if(! $extdb_id){ warn 'No external_db found for '.$self->{'mapping_type'}." mapping, inserting $db_name $schema_build"; #status is dubious here as this should really be on object_xref my $insert_sql = 'INSERT into external_db(db_name, db_release, status, dbprimary_acc_linkable, priority, db_display_name, type)'. " values('$db_name', '$schema_build', 'KNOWNXREF', 1, 5, '$display_name', 'MISC')"; $self->outdb->db_handle->do($insert_sql); ($extdb_id) = $self->outdb->db_handle->selectrow_array($sql); #Now test again just to make sure the db_name fits! ($extdb_id) = $self->outdb->db_handle->selectrow_array($sql); if(! $extdb_id){ throw("Failed to store external_db properly:\t$db_name $schema_build"); } } $self->{'_external_db_id'} = $extdb_id; return $self; } sub mapping_type{ return $_[0]->{'mapping_type'}; } sub fetch_input { my ($self) = @_; ########################################## # set up the target (genome) ########################################## my $target = $self->TARGETSEQS; if ( -e $target ){ if(-d $target ) { warn ("Target $target is a directory of files\n"); }elsif (-s $target){ warn ("Target $target is a whole-genome file\n"); }else{ throw("'$target' isn't a file or a directory?"); } } else { throw("'$target' could not be found"); } ########################################## # set up the query probe seq ########################################## my ($query_file, $chunk_number, $chunk_total); my $query = $self->QUERYSEQS; if ( -e $query and -s $query ) { # query seqs is a single file; input id will correspond to a chunk number $query_file = $query; my $iid_regexp = $self->IIDREGEXP; if (not defined $iid_regexp){ throw("You must define IIDREGEXP in config to enable inference of chunk number and total from your single fasta file" ) } ( $chunk_number, $chunk_total ) = $self->input_id =~ /$iid_regexp/; if(!$chunk_number || !$chunk_total){ throw "I can't make sense of your input id using the IIDREGEXP in the config!\n"; } #store this for reference later $self->query_file($query_file); } else { throw("'$query' must refer to a single fasta file with all probe sequences referenced by affy_probe_id\n"); } ########################################## # setup the runnable ########################################## my %parameters = %{ $self->parameters_hash }; #Parameter hash is picked up from the DB analysis table #else the config -options value is used if (not exists $parameters{-options}) { if ($self->OPTIONS) { $parameters{-options} = $self->OPTIONS; } else { throw('You have not options stored in the DB or accessible from the analysis config hash for '.$self->analysis->logic_name); #$parameters{-options} = ""; } } print STDERR "PROGRAM FILE: ".$self->analysis->program_file."\n"; my $runnable = Bio::EnsEMBL::Analysis::Runnable::ExonerateProbe->new ( -program => $self->analysis->program_file, -analysis => $self->analysis, -target_file => $target, -query_type => $self->QUERYTYPE,#dna -query_file => $query_file, -query_chunk_number => $chunk_number, -query_chunk_total => $chunk_total, -max_mismatches => $self->MAX_MISMATCHES, -mapping_type => $self->mapping_type, #-filter_method => $self->FILTER_METHOD, %parameters, ); $self->runnable($runnable); } ############################################################ #Overrides RunnableDB::run sub run { my ($self) = @_; throw("Can't run - no runnable objects") unless ( $self->runnable ); my $runnable = @{ $self->runnable }[0]; $runnable->run; #Can we rollback individual chunk output here #so we never duplicate output #If job fails we currently don't know whether it failed before storing any unmapped objects #Can we do this optionally by setting an env var? #We would set this in SubmitAlign via a parameter #We need to set this in the config to maintain decoupling between the environment and the RunnableDB #Could we record a status in the eFG DB to track this? #Don't need both? #does not fit with efg status as is not a table, but could be hijacked quite easily? #Let's just set this manually. You will have to reset the jobs, so we could add a warning here to use the appropriate flag? #These must be used in the rule_manager.pl #Which is where write_output must also be called ffrom #$self->output($features);#Moved this after filter/set_probe so we get the write output reported $self->features($self->filter_features($runnable->output)); } ############################################################ #overrides RunnableDB::write_output sub write_output { my ( $self, @output ) = @_; my $outdb = $self->outdb; my $probe_adaptor = $outdb->get_ProbeAdaptor; my $feature_adaptor = $outdb->get_ProbeFeatureAdaptor; my $dbe_adaptor = $outdb->get_DBEntryAdaptor; #Add analysis, slices to features, and make #sure they're pointing at the persistent array / probe instances #instead of the fake arrays & probes they were created with my $features = $self->set_probe_and_slice($self->features); #Now set correct output #This is only used to report the number of features $self->output($features);#$self->features); warn "Writing ProbeFeatures\n"; foreach my $feature_xref(@{$features}){ my ($feature, $xref) = @$feature_xref; if(! defined $feature->slice()){ warn "Skipping feature storage for Probe, no slice available for ".$feature->seqname(); next; } eval{ $feature_adaptor->store($feature)}; if ($@) { warn $feature->slice->name; $self->throw('Unable to store ProbeFeature for probe '.$feature->probe_id." !\n $@"); } if($xref){ #Remove ignore release flag so we store on the correct schema build eval{ $dbe_adaptor->store($xref, $feature->dbID, 'ProbeFeature') }; if ($@) { $self->throw('Unable to store ProbeFeature DBEntry for probe '.$feature->dbID." !\n $@"); } } } } ############################################################ sub filter_features { my ($self, $features) = @_; my (%hits_by_probe, @kept_hits); my $analysis = $self->analysis; my $logic_name = $analysis->logic_name; my $mapping_type = $self->mapping_type; my $max_hits = $self->HIT_SATURATION_LEVEL;#default is 100 my $uo_adaptor = $self->outdb->get_UnmappedObjectAdaptor; #We don't really want one for every logic name, #so let's simplify this to just ProbeAlign and ProbeTranscriptAlign my $uo_type; ($uo_type = $logic_name) =~ s/^.*_//; #This is tricky #We may map something >100 times to the genome but only a few times to transcripts. #Do we need a clean up step which removes all transcript mapping if we have an #UnmappedObject due to exceeding the genomic hit saturation??? #Or keep the transcript mapping and warn in the annotation #Do we need a HIT_SATURATION_LEVEL explcitly for transcripts? #I think yes and yes, yes we keep and warn and yes we need a more stringent saturation level, #probably needs to be number of genes mapped rather than simply number of mappings as we will always #get multiple mappings across alt trans #In fact this is why we ignored it for transcript mapping. #But we are throwing away non-gapped alignments for transcripts! #So it is very unlikely that we will ever reach this level! #We are also not throwing ungapped away before we do this test #So must not filter here as we may throw away a probe which matches many times ungapped #but only once gapped...but we're stil matching transcripts rather than the whole genome #here so this is still valid. foreach my $hit (@$features) { #we need to capture first an dlast transcript ID's to #enable cleaning of potential duplicates push @{$hits_by_probe{$hit->probe_id}}, $hit; } foreach my $probe_id (keys %hits_by_probe) { my @hits = @{$hits_by_probe{$probe_id}}; my $num_hits = scalar(@hits); my $all_hits = $num_hits; if ($num_hits > $max_hits) { @hits = grep { $_->mismatchcount == 0 } @hits; $num_hits = scalar(@hits); if ($num_hits <= $max_hits) { warn "Keeping only perfect matches($num_hits/$all_hits) to $probe_id. Do we need to keep this in an UnmappedObject?\n"; } else { warn "UnmappedObject:\tToo many hits($num_hits|$all_hits/$max_hits) to $probe_id so rejecting all hits\n"; #So we have issues with what we consider for transcript mapping. #If it has failed genomic mapping then we currently do not consider if for xrefing. #But it may only map to one transcript, but all over the genome #We don't store the genomic mappings, but maybe we should store the transcript mapping which we can then warn against using the unmapped object info from the genomic mapping. #Okay, so now we don't have interdepedancy of the mapping steps #We need to grab the external_db_id here if there is only one #Leave blank if this exists in multiple DBs e.g. affy 3' UTR designs #This assumes that the same ID in two different DBs are the same entity #This is true for affy but maybe not for other arrays #The simple way to do this is to pull back the array names using the probe_id and direct sql #This means yet another call for each unmapped probe #push @{$self->unmapped_objects}, $uo_adaptor->store(Bio::EnsEMBL::UnmappedObject->new ( -type => $uo_type,#Currently get's set to NULL as can only have xref or probe2transcript -analysis => $analysis, -ensembl_id => $probe_id, -ensembl_object_type => 'Probe', -external_db_id => $self->{'_external_db_id'}, -identifier => $mapping_type, -summary => 'Promiscuous probe', -full_desc => "Probe exceeded maximum allowed number of $mapping_type mappings(${num_hits}/${max_hits})" )); $num_hits = 0; } } elsif($num_hits == 0){#Both ProbeAlign and ProbeTranscriptAlign #Very unlikely, and probably only ProbeTranscriptAlign #push @{$self->unmapped_objects}, #We want to an external_db_id here for if this is Trancsript #What about genomic mapping? #Yes we need to reimplement species DB? warn "UnmappedObject:\t$probe_id has no $mapping_type mappings\n"; $uo_adaptor->store(Bio::EnsEMBL::UnmappedObject->new ( -type => $uo_type,#Currently get's set to NULL as can only have xref or probe2transcript -analysis => $analysis, -ensembl_id => $probe_id, -ensembl_object_type => 'Probe', -external_db_id => $self->{'_external_db_id'}, -identifier => $mapping_type, -summary => 'Unmapped probe', -full_desc => "Probe has no $mapping_type mappings", )); } if($num_hits) { push @kept_hits, @hits; } } return \@kept_hits; } ############################################################ =head2 get_display_name_by_stable_id Args [0] : stable ID from core DB. Args [1] : stable feature type e.g. gene, transcript, translation Example : $self->validate_and_store_feature_types; Description: Builds a cache of stable ID to display names. Returntype : string - display name Exceptions : Throws is type is not valid. Caller : General Status : At risk =cut # -------------------------------------------------------------------------------- # Build a cache of ensembl stable ID -> display_name # Return hashref keyed on {$type}{$stable_id} #Need to update cache if we're doing more than one 'type' at a time # as it will never get loaded for the new type! sub get_display_name_by_stable_id{ my ($self, $stable_id, $type) = @_; $type = lc($type); if($type !~ /(gene|transcript|translation)/){ throw("Cannot get display_name for stable_id $stable_id with type $type"); } if(! exists $self->{'display_name_cache'}->{$stable_id}){ #warn "Generating $type display_name cache\n"; ($self->{'display_name_cache'}->{$stable_id}) = $self->outdb->dnadb->dbc->db_handle->selectrow_array("SELECT x.display_label FROM ${type}_stable_id s, $type t, xref x where t.display_xref_id=x.xref_id and s.${type}_id=t.gene_id and s.stable_id='${stable_id}'"); } return $self->{'display_name_cache'}->{$stable_id}; } sub set_probe_and_slice { my ( $self, $features ) = @_; my $db = $self->outdb; my $slice_adaptor = $db->get_SliceAdaptor; my $probe_adaptor = $db->get_ProbeAdaptor; my $trans_adaptor = $db->dnadb->get_TranscriptAdaptor; my $mapping_type = $self->mapping_type; my $gene_adaptor = $db->dnadb->get_GeneAdaptor; #my $dbe_adaptor = $db->get_DBEntryAdaptor; my $analysis = $self->analysis; my $schema_build = $self->outdb->_get_schema_build($self->outdb->dnadb); my $edb_name = $self->outdb->species.'_core_Transcript'; my (%slices, $slice_id, $trans_mapper, $align_type, $align_length, @tmp, $gap_length); my (@trans_cigar_line, @genomic_blocks, $cigar_line, $gap_start, $block_end); my ($block_start, $slice, @gaps, @features, %gene_hits, $gene, @stranded_cigar_line, $gene_sid); my ($query_perc, $display_name, $gene_hit_key, %transcript_cache); foreach my $feature (@$features) { # get the slice based on the seqname stamped on in the runnable my $seq_id = $feature->seqname; my $probe_id = $feature->probe_id; my $load_feature = 1; my $xref; my ($genomic_start, $genomic_end); #Get the slice if($mapping_type eq 'transcript'){ if(! exists $transcript_cache{$seq_id}){ $transcript_cache{$seq_id} = $trans_adaptor->fetch_by_stable_id($seq_id); } $slice_id = $transcript_cache{$seq_id}->seq_region_name; if ( not exists $slices{$slice_id} ) { $slices{$slice_id} = $transcript_cache{$seq_id}->slice; } } else{ $slice_id= $seq_id; if ( not exists $slices{$slice_id} ) { # assumes genome seqs were named in the Ensembl API Slice naming # convention, i.e. coord_syst:version:seq_reg_id:start:end:strand $slices{$slice_id} = $slice_adaptor->fetch_by_name($slice_id); } } $slice = $slices{$slice_id}; #Now project onto genomic coords if we are on cdna if($mapping_type eq 'transcript'){ #This will basically need to change the start and stops accordingly #and insert D's the size of introns #reject id no D's found as we will already have this as genomic mapping. #The cigar line should always be displayed with reference to the +ve strand of the #target feature i.e. the -ve strand if the transcript is on the -ve strand #These are local stranded start and ends with respect to transcript strand my $transcript_start = $feature->start; my $transcript_end = $feature->end; my $transcript_strand = $transcript_cache{$seq_id}->feature_Slice->strand; #warn "trans($seq_id) start end strand $transcript_start $transcript_end $transcript_strand" if $warn; $trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript_cache{$seq_id}); @genomic_blocks = $trans_mapper->cdna2genomic($transcript_start, $transcript_end); #Next feature is this is an ungapped alignment (1 block) #or just representing a flank seq overhang #which will have been caught by the genomic mapping (2 blocks) #print "Found genomic blocks @genomic_blocks\n"; next if(! (scalar(@genomic_blocks) >2)); #Coordinate object returned are genomic coords #Introns(ProbeFeature deletions) are only represented by absent Coordindate blocks #5'/3' overhangs(Genomic deletions) are represented by Gap objects #We are not accounting for 5'/3' hanging alignment mismatches #Which may actually be a sequence match or mismatch to the genomic sequence! #else alter the start stop values and rebuild the cigarline my $cigar_line = ''; @trans_cigar_line = split/:/, $feature->cigar_line; if($transcript_strand == -1){ #Always report start end and cigar line wrt the +ve strand of the target seq. #Even if the match is against the -ve strand. This is what we do in compara. #This is always the genome even for transcript mapping, as we a re projecting back. #Need to flip genomic blocks around so that we're still dealing with +ve order #This is counterintuitive to how the rest of the API deals with stranded features #Normally returns lowest start first wrt to +ve strand even if you use a -ve strand slice, no? @genomic_blocks = reverse(@genomic_blocks); @stranded_cigar_line = reverse(@trans_cigar_line); #This only makes a difference if there are any m's (seq mismatches) #print "Transcript is -ve strand so reverse cigar is @stranded_cigar_line\n"; } else{ @stranded_cigar_line = @trans_cigar_line; } #Account for 5'/3' overhanging gaps here my %gap_lengths = ( 5 => undef, 3 => undef, ); foreach my $block(@genomic_blocks){ #warn $block.' '.$block->start.' '.$block->end;#.' '.$block->strand; if(! $genomic_start){ if($block->isa('Bio::EnsEMBL::Mapper::Coordinate')){#Set genomic_start if($gap_lengths{5}){#We have seen a gap $genomic_start = $block->start - $gap_length; } else{#Just set to first start $genomic_start = $block->start; } } else{#Must be 5' Gap $gap_lengths{5} = $block->length; } } elsif($block->isa('Bio::EnsEMBL::Mapper::Coordinate')){ $genomic_end = $block->end; } else{#Must be 3' Gap $gap_lengths{3} = $block->length; $genomic_end += $gap_lengths{3}; } #So calculate gap coordinates #We want to skip and Gap objects #as these will have already been inserted into the cigarline by ExonerateProbe next if $block->isa('Bio::EnsEMBL::Mapper::Gap'); if(@gaps){ push @gaps, ($block->start - $gaps[$#gaps]); } push @gaps, ($block->end + 1); } #remove last value as this is the end of the match and not a gap start pop @gaps; #Insert intron gaps into cDNA cigarline $gap_start = shift @gaps; $gap_length = shift @gaps; #Set this to a hypothetical previous block end #So the initial block_start calculation will be valid $block_end = ($genomic_start - 1); my $last_gap = 0; my $match_count = 0; my $mismatch_count = 0; my $score = 0; my $q_length = 0; #Account for 5'/3' overhangs here by simple bp matching #We need to account for Gap length as we may get 2m #Which might actually be 1m1M, where the 1M is an overhang Gap my ($gap_block); foreach my $end(keys %gap_lengths){ if($gap_lengths{$end}){ if($end == 5){ $gap_block = shift @stranded_cigar_line; } else{ $gap_block = pop @stranded_cigar_line; } @tmp = split//, $gap_block; $align_type = pop @tmp; ($align_length = $gap_block) =~ s/$align_type//; if($align_type ne 'm' || $align_length > $gap_lengths{$end}){ throw("Found unexpected alignment block($gap_block) when handling ${end}' overhanging Gap"); } #Now match against genomic sequence??? #No way of doing this as we don't have the probe sequence!!!! #Change this to U? #Can we get the query seq from ExonerateProbe? $gap_block = $gap_lengths{$end}.'U'; $align_length -= $gap_lengths{$end}; if($align_length){ if($end == 5 ){ $gap_block .= $align_length.$align_type; }else{ $gap_block = $align_length.$align_type.$gap_block; } } if($end == 5){ unshift @stranded_cigar_line, $gap_block; }else{ push @stranded_cigar_line, $gap_block; } } } foreach my $block(@stranded_cigar_line){ @tmp = split//, $block; $align_type = pop @tmp; ($align_length = $block) =~ s/$align_type//; $q_length += $align_length; if($align_type eq 'M'){ $match_count += $align_length; $score += ($align_length * 5); } else{# 'm' or U mismatch $mismatch_count += $align_length; $score -= ($align_length * 4); } #We need to calculate the genomic end of the current block #given the previous block end or the genomic_start $block_start = ($block_end + 1); $block_end += $align_length; if($block_end >= $gap_start){ #Could have multiple deletions while(($block_end >= $gap_start) && ! $last_gap){ #Insert the match first $align_length = ($gap_start - $block_start); $cigar_line .= $align_length.$align_type if $align_length; #Deletion wrt probe i.e. intron $cigar_line .= $gap_length.'D'; #Now redefine start and end values $block_start += $align_length + $gap_length; $block_end += $gap_length; #Now grab the next gap if(@gaps){ $gap_start = shift @gaps; $gap_length = shift @gaps; } else{ $last_gap = 1; } } #We have reached the end of the gaps in this block #so just redefine block here $align_length = ($block_end - $block_start + 1); $block = ($align_length) ? $align_length.$align_type : ''; } $cigar_line .= $block; } warn "$genomic_start $genomic_end $cigar_line"; #We could assign the start end directly $feature->start($genomic_start); $feature->end($genomic_end); $feature->strand($transcript_strand); $feature->cigar_line($cigar_line); #warn "Final Feature $genomic_start $genomic_end $cigar_line" if $warn; #Test if we have already seen this alignment $gene = $gene_adaptor->fetch_by_transcript_stable_id($seq_id); #The only way of doing this is to test the genomic_start/end and the genomic cigarline with the gene_stable_id and the probe_id $gene_sid = $gene->stable_id; $gene_hit_key = "${gene_sid}:${probe_id}:${genomic_start}:${genomic_end}:${cigar_line}"; if(exists $gene_hits{$gene_hit_key}){ $load_feature = 0; }else{ #No need to count hits here $gene_hits{$gene_hit_key} = undef; } #Now store the IDXref for this probe transcript hit #This will mean we don't have recalculate this during the probe2transcript mapping step #% ID over aligned region #Which can be different if we have I|Ds #As these will give different seq lengths #But this is the ungapped alignment #So ignore Target ID? #cigar_line is reported wrt to strand of target #we filter out all -ve hits by this point so don't need to account for here. $query_perc = ($match_count/($match_count + $mismatch_count)) * 100; $display_name = $self->get_display_name_by_stable_id($seq_id, 'transcript'); #$id_xref = Bio::EnsEMBL::IdentityXref->new $xref = Bio::EnsEMBL::DBEntry->new ( #-XREF_IDENTITY => $query_perc, #-TARGET_IDENTITY => 90.1, #-SCORE => $score, #-EVALUE => 12, #-CIGAR_LINE => $cigar_line, #-XREF_START => 1,#We are currently padding with mismatches to full length of query #-XREF_END => $q_length, #-ENSEMBL_START => $transcript_start,#target/hit_start #-ENSEMBL_END => $transcript_end,#target/hit_end #-ANALYSIS => $analysis, -PRIMARY_ID => $seq_id, -DISPLAY_ID => $display_name, -DBNAME => $edb_name, -release => $schema_build, -info_type => 'MISC', -info_text => 'TRANSCRIPT', -linkage_annotation => "ProbeTranscriptAlign",#Add query_perc here when we have analysis #-info_text => , #? What is this for? Is used in unique key so we get duplicated if null!!! #-version => , #version of transcript sid? ); #No strand here! Always +ve?! #$dbe_adaptor->store($id_xref, $probe_id, 'Probe', 1);#Do we need ignore release flag here? ###This cannot be done until we have ox.analysis_id in place v54? #Yes we can, just use the linkage_annotation for now! #No store this as a ProbeFeature DBEntry in line with how the individual probe xrefs #are stored in probe2transcript #we don't really need this extra translation and score info here #and we are storing the cigar_line as part of the probe_feature #so we can reconstitute the alignment if required #This will mean we can separate the individual feature xrefs from the Probe/ProbeSet level full xref annotation #Will mean some duplication for single probe arrays #But will allow different sets of rules between ProbeAlign and probe2transcript #i.e. we could relax the mapping rules but keep the xreffin rules more stringent #We don't know what the ProbeFeature dbID is yet so we will have to pass this to store with the feature #$dbe_adaptor->store($xref, $probe_id, 'Probe', 1);#Do we need ignore release flag here? } else{#No introns - reformat cigar line $feature->cigar_line(join('', (split/:/, $feature->cigar_line))); } if($load_feature){ #Reset start ends for non-ref slices if($slice->start != 1){ #Need to reslice, but we don't want to affect #the slice in the cahse as this will screw up #further feature start/end tranforms my ($level, undef, $name) = split /:/, $slice->name; #warn "Resetting slice for ".$slice->name; my $start = $feature->start + $slice->start - 1; my $end = $feature->end + $slice->start - 1; $feature->start($start); $feature->end($end); $slice = $slice_adaptor->fetch_by_region($level, $name, 1, $end); } $feature->slice($slice); # Recover the probe from the cache or DB my $real_probe = $self->probes->{$probe_id}; if(!$real_probe){ $real_probe = $probe_adaptor->fetch_by_dbID($probe_id); if (!$real_probe){ throw "Trying to clean up features for persistence: cant find probe with dbID $probe_id in database!\n"; } $self->probes->{$probe_id} = $real_probe; } $feature->probe($real_probe); $feature->analysis($analysis); push @features, [ $feature, $xref ]; } } return $self->features(\@features); } sub outdb { my ($self) = @_; my $outdb; my $dnadb; #Do we need to alter this??????????????????????????????????????????????????????????????????????????????????????????????????? #Look at ImportArrays #There is method duplication here which we could move to a ProbeDB.pm? if(! defined $self->{'efg_db'}){ #This DNADB testing is a work around to avoid having to edit Config::ProbeAlign my $dnadb; #not defined as an empty env var will give the defined null string if($self->DNADB->{-dbname}){ $dnadb = new Bio::EnsEMBL::DBSQL::DBAdaptor(%{ $self->DNADB }); } $self->{'efg_db'} = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new ( %{ $self->OUTDB }, -dnadb => $dnadb, ); if(! $self->DNADB->{-dbname}){ print "WARNING: Using default DNADB ". $self->{'efg_db'}->dnadb->dbname."\n"; } #We are now forcing use of OUTDB/DNADB config #else { # #Historical, but sensible? # $self->{'efg_db'} = $self->SUPER::db; #} } return $self->{'efg_db'}; } sub query_file { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_query_file'} = $value; } if ( exists( $self->{'_query_file'} ) ) { return $self->{'_query_file'}; } else { return undef; } } ############################################################# sub unmapped_objects{ my ( $self, $value ) = @_; $self->{'_unmnapped_objects'} = $value if defined $value; return $self->{'_unmapped_objects'}; } ############################################################# ############################################################# sub arrays { my ( $self, $value ) = @_; $self->{'_arrays'} = $value if defined $value; return $self->{'_arrays'}; } ############################################################# sub probes { my ( $self, $value ) = @_; $self->{'_probes'} = $value if defined $value; return $self->{'_probes'}; } ############################################################# sub features { my ( $self, $value ) = @_; $self->{'_features'} = $value if defined $value; return $self->{'_features'};#do we need to test and return undef? } ############################################################# # Declare and set up config variables ############################################################# sub read_and_check_config { my $self = shift; $self->SUPER::read_and_check_config($PROBE_CONFIG); ########## # CHECKS ########## my $logic = $self->analysis->logic_name; #We need to set the QUERYSEQS here dependant on the config #We are running all array formats from one cat'd file so we don't need to know any special format specific config # check that compulsory options have values # Remove DNADB?? # Removed ARRAY_DESIGN foreach my $config_var ( qw( QUERYSEQS QUERYTYPE TARGETSEQS OUTDB DNADB OPTIONS HIT_SATURATION_LEVEL MAX_MISMATCHES ) ){ #We could make MAX_MISMATCHES optional and have % ID here instead? # FILTER_METHOD if ( not defined $self->$config_var ){ throw("You must define $config_var in config for logic '$logic'"); } } #Test DB config hashes for my $db(qw(OUTDB DNADB)){ if (ref( $self->$db ) ne "HASH" || ! defined $self->$db->{-dbname}) { throw("$db in config for '$logic' must be a hash ref of db connection parameters."); } } } #sub ARRAY_DESIGN{ # my ( $self, $value ) = @_; # $self->{'ARRAY_DESIGN'} = $value if defined $value; # return $self->{'ARRAY_DESIGN'}; #} #sub FILTER_METHOD{ # my ( $self, $value ) = @_; # $self->{'_CONFIG_FILTER_METHOD'} = $value if defined $value; # return $self->{'_CONFIG_FILTER_METHOD'} || undef; #} sub QUERYSEQS { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_QUERYSEQS'} = $value; } if ( exists( $self->{'_CONFIG_QUERYSEQS'} ) ) { return $self->{'_CONFIG_QUERYSEQS'}; } else { return undef; } } sub QUERYTYPE { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_QUERYTYPE'} = $value; } if ( exists( $self->{'_CONFIG_QUERYTYPE'} ) ) { return $self->{'_CONFIG_QUERYTYPE'}; } else { return undef; } } sub TARGETSEQS { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_TARGETSEQS'} = $value; } if ( exists( $self->{'_CONFIG_TARGETSEQS'} ) ) { return $self->{'_CONFIG_TARGETSEQS'}; } else { return undef; } } sub MAX_MISMATCHES { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_MAX_MISMATCHES'} = $value; } if ( exists( $self->{'_CONFIG_MAX_MISMATCHES'} ) ) { return $self->{'_CONFIG_MAX_MISMATCHES'}; } else { return undef; } } sub IIDREGEXP { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_IIDREGEXP'} = $value; } if ( exists( $self->{'_CONFIG_IIDREGEXP'} ) ) { return $self->{'_CONFIG_IIDREGEXP'}; } else { return undef; } } sub OUTDB { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_OUTDB'} = $value; } if ( exists( $self->{'_CONFIG_OUTDB'} ) ) { return $self->{'_CONFIG_OUTDB'}; } else { return undef; } } sub DNADB { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_DNADB'} = $value; } if ( exists( $self->{'_CONFIG_DNADB'} ) ) { return $self->{'_CONFIG_DNADB'}; } else { return undef; } } sub OPTIONS { my ( $self, $value ) = @_; if ( defined $value ) { $self->{'_CONFIG_OPTIONS'} = $value; } if ( exists( $self->{'_CONFIG_OPTIONS'} ) ) { return $self->{'_CONFIG_OPTIONS'}; } else { return undef; } } sub HIT_SATURATION_LEVEL { my ($self, $val) = @_; if (defined $val) { $self->{'_CONFIG_HIT_SATURATION_LEVEL'} = $val; } if (exists $self->{'_CONFIG_HIT_SATURATION_LEVEL'}) { return $self->{'_CONFIG_HIT_SATURATION_LEVEL'}; } else { # default to 100 # This is not used for ProbeTranscriptAlign return 100; } } ############################################### ### end of config ############################################### 1;