# Bio::EnsEMBL::Analysis::Config::ProbeAlign
#
# package Bio::EnsEMBL::Analysis::Config::ProbeAlign
#
# Cared for by EnsEMBL (ensembl-dev@ebi.ac.uk)
#
# Copyright GRL & EBI
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::EnsEMBL::Analysis::Config::ProbeAlign

=head1 SYNOPSIS

  use Bio::EnsEMBL::Analysis::Config::ProbeAlign;

=head1 DESCRIPTION

This contains the configuration for step 2 of the process which maps
probes to a Genome. This step is an alignment of probes (dna) against a
genome (dna) using exonerate. So this config looks very similar to that
of any other exonerate-driving config.

The layout of the configuration is a set of hashes, each one keyed by
logic name. There is also a DEFAULT hash, which is used as the default
for all logic names.

There are genomic and transcript based logic names and config hashes for
each discrete format of array.

=head1 CONTACT

EnsEMBL (ensembl-dev@ebi.ac.uk)

=cut

package Bio::EnsEMBL::Analysis::Config::ProbeAlign;

use strict;
use warnings;
use vars qw( %Config );

# Directory holding the non-redundant probe fasta files. Falling back to ''
# keeps the generated paths identical to plain concatenation with an unset
# WORK_DIR, without raising an "uninitialized value" warning at load time.
my $work_dir = defined $ENV{'WORK_DIR'} ? $ENV{'WORK_DIR'} : '';

%Config = (

    #This entire hash is exported as the global $PROBE_CONFIG var
    #each key will be exported as $PROBE_CONFIG->{'_CONFIG_'.$key}
    #Dependent on logic name of RunnableDB
    PROBE_CONFIG => {

        DEFAULT => {

            # path to softmasked, dusted genomic sequence or transcript seqs on the farm
            #
            #'/data/blastdb/Ensembl/Rmacaque/MMUL_2/genome/softmasked_dusted.fa',
            #/data/blastdb/Ensembl/Human/NCBI35/softmasked_dusted/', #allowed to be a dir.
            TARGETSEQS => $ENV{'GENOMICSEQS'},    #or $ENV{'TRANSCRIPTSEQS'}
            QUERYTYPE  => 'dna',

            # must be a single file containing all (non-redundant) probes indexed by affy_probe_id
            # QUERYSEQS refers to the value of the parameter NON_REDUNDANT_PROBE_SEQS
            # in the config-file ensembl-analysis/Config/CollapseAffyProbes.pm
            #QUERYSEQS => $ENV{'NR_FASTA'},
            #Removed this now as we want to run different analyses at the same time so we have to hardcode below
            # must supply one, since the queryseqs MUST be a single file

            #InputIDREGEXP this is used to infer chunk number from the headers of a single fasta file
            #Therefore we cannot have mixed type in the same file, must be in a different array set
            #If not related, or reformatted prior to Import if they are related
            IIDREGEXP => '(\d+):(\d+)',

            #DNADB is not essential, but we need this if we are going to define a DNADB not on ensembldb
            #e.g. new release on staging
            #Add species and group here?
            DNADB => {
                -dbname          => $ENV{'DNADB_NAME'},
                -host            => $ENV{'DNADB_HOST'},
                -port            => $ENV{'DNADB_PORT'},
                -user            => $ENV{'DNADB_USER'},
                -pass            => $ENV{'DNADB_PASS'},
                -species         => $ENV{'SPECIES'},
                -multispecies_db => $ENV{'DNADB_MULTISPECIES_DB'},
                -species_id      => $ENV{'DNADB_SPECIES_ID'}
            },

            OUTDB => {
                -dbname          => $ENV{'DB_NAME'},
                -host            => $ENV{'DB_HOST'},
                -port            => $ENV{'DB_PORT'},
                -user            => $ENV{'DB_USER'},
                -pass            => $ENV{'DB_PASS'},
                -species         => $ENV{'SPECIES'},    #required for auto generation of DNADB
                -multispecies_db => $ENV{'MULTISPECIES_DB'},
                -species_id      => $ENV{'SPECIES_ID'}
            },

            #25 mers
            OPTIONS => ' --bestn 100 --dnahspthreshold 116 --fsmmemory 256 --dnawordlen 25 --dnawordlimit 11 ',

            #50 mers
            #OPTIONS => ' --bestn 100 --dnahspthreshold 116 --fsmmemory 256 --dnawordlen 50 --dnawordthreshold 11 ',

            # if the number of hits reaches or exceeds the figure below, we reject
            # all hits to that probe
            HIT_SATURATION_LEVEL => 100,    #2 for unique tiling arrays mappings
            MAX_MISMATCHES       => 1,

            #No way to dynamically test method prerequisite config vars without setting a FILTER_METHOD hash?
            #This would not be bullet proof as the hash could then be edited
            #Would need to add another method to Runnable::ExonerateProbe to change this
            #FILTER_METHOD => 'filter_mismatches',
            #Or we can pass a code ref here to allow easy extension without editing Runnables etc.
            #Can't name code ref subs!
            #This is used in ExonerateProbe.pm
            #e.g.
            #FILTER_METHOD => sub {
            #    my ($self, $query_match_length, $q_length, $score) = @_;
            #    my $mismatch;
            #    my $full_score = $q_length * 5;
            #
            #    if($query_match_length == $q_length){
            #        if($score == $full_score){
            #            $mismatch = 0;
            #        }
            #    }
            #
            #    if(! defined $mismatch){
            #        my $max_mismatch = $self->allowed_mismatches;
            #
            #        for my $i(1..$max_mismatch){
            #            my $mismatch_length = $q_length - $i;
            #            my $mismatch_score  = $mismatch_length * 5;
            #
            #            if($query_match_length == $q_length){
            #                if ($score == ($full_score - ($i*9))) {
            #                    $mismatch = $i;
            #                }
            #            }
            #            elsif($query_match_length == $mismatch_length){
            #                $mismatch = $i if ($score == $mismatch_score);
            #            }
            #        }
            #    }
            #    return $mismatch;
            #},
        },    #end of DEFAULT

        #IIDREGEXP, DNADB, OUTDB and QUERYSEQS and QUERYTYPE should be same for all these configs

        #Need to add ILLUMINA_PROBE_ALIGN, ILLUMINA_PROBE_TRANSCRIPT_ALIGN, CODELINK, AGILENT etc

        #There is no point in using a % threshold here as bestn value will most likely cause only high quality
        #hits to be returned. However there is a possibility with longer sequences that we may get a duff alignment.
        #We could set -percent to a conservative 95%, but this entirely depends on the length and number of allowed
        #mismatches.

        #Define QUERYSEQS here instead of in Runnable to prevent convolution
        #of RunnableDB and environment,
        #i.e. we can still run ProbeAlign so long as we change this config
        #The only downfall is lack of validation of QUERYSEQS files
        #WARNING CHECK YOUR QUERYSEQS!

        AFFY_UTR_PROBEALIGN => {
            MAX_MISMATCHES => 1,
            TARGETSEQS     => $ENV{'GENOMICSEQS'},
            QUERYSEQS      => $work_dir.'/arrays_nr.AFFY_UTR.fasta',

            #at least 25 mers allowing 1bp mismatch
            #this will still work in the worst case where the mismatch is at the centre of a 25bp probe
            OPTIONS => ' --bestn 101 --fsmmemory 256 --dnawordlen 12 --seedrepeat 2 --dnahspthreshold 118 --dnawordlimit 0',

            #OPTIONS => ' --bestn 101 --dnahspthreshold 116 --fsmmemory 256 --dnawordlen 14 --dnawordlimit 11 ',
            HIT_SATURATION_LEVEL => 100,
        },

        #Essentially same as AFFY but with different NR_FASTA
        AFFY_ST_PROBEALIGN => {
            TARGETSEQS => $ENV{'GENOMICSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.AFFY_ST.fasta',

            #25 mers
            #OPTIONS => ' --bestn 101 --dnahspthreshold 116 --fsmmemory 256 --dnawordlen 14 --dnawordlimit 11 ',
            OPTIONS => ' --bestn 101 --fsmmemory 256 --dnawordlen 12 --seedrepeat 2 --dnahspthreshold 118 --dnawordlimit 0',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,
        },

        # NOTE(review): no QUERYSEQS is set here and DEFAULT does not supply
        # one either - confirm where the NimbleGen query file is meant to come from.
        NIMBLEGEN_PROBEALIGN => {
            TARGETSEQS => $ENV{'GENOMICSEQS'},

            #Need to define this dynamically based on oligo length (40-60mers)
            #50mers
            #Can we up the dnaword limit to 50 here if we only want unique matches?
            OPTIONS => ' --bestn 2 --dnahspthreshold 116 --fsmmemory 256 --dnawordlen 50 --dnawordlimit 11 ',
            HIT_SATURATION_LEVEL => 2,    #We only want unique mappings for tiling probes
            MAX_MISMATCHES       => 0,    #Unique mappings for tiling probes
        },

        #ILLUMINA_WG are 51mers. These settings allow for at least 1bp mismatch
        ILLUMINA_WG_PROBEALIGN => {
            TARGETSEQS => $ENV{'GENOMICSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.ILLUMINA_WG.fasta',

            #Need to define this dynamically based on oligo length (40-60mers)
            #50mers
            OPTIONS => ' --bestn 101 --dnahspthreshold 246 --fsmmemory 256 --dnawordlen 25 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,    #Allow a single mismatch
        },

        ILLUMINA_WG_PROBETRANSCRIPTALIGN => {
            TARGETSEQS => $ENV{'TRANSCRIPTSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.ILLUMINA_WG.fasta',

            #Need to define this dynamically based on oligo length (40-60mers)
            #50mers
            OPTIONS => ' --bestn 101 --dnahspthreshold 246 --fsmmemory 256 --dnawordlen 25 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,    #Allow a single mismatch
        },

        #CODELINK are 30mers
        #A 30mer scores at most 30 * 5 = 150 (5 per matched base, 9 lost per
        #mismatch - see the FILTER_METHOD example in DEFAULT), so the 246
        #threshold used for the 50/60mers could never be reached here.
        #141 = 150 - 9 admits one internal mismatch, matching MAX_MISMATCHES.
        CODELINK_WG_PROBEALIGN => {
            TARGETSEQS => $ENV{'GENOMICSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.CODELINK.fasta',
            OPTIONS => ' --bestn 101 --dnahspthreshold 141 --fsmmemory 256 --dnawordlen 15 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,    #Allow a single mismatch
        },

        CODELINK_WG_PROBETRANSCRIPTALIGN => {
            TARGETSEQS => $ENV{'TRANSCRIPTSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.CODELINK.fasta',
            OPTIONS => ' --bestn 101 --dnahspthreshold 141 --fsmmemory 256 --dnawordlen 15 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,    #Allow a single mismatch
        },

        #PHALANX are 60mers
        PHALANX_WG_PROBEALIGN => {
            TARGETSEQS => $ENV{'GENOMICSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.PHALANX.fasta',
            OPTIONS => ' --bestn 101 --dnahspthreshold 246 --fsmmemory 256 --dnawordlen 30 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,    #Allow a single mismatch
        },

        PHALANX_WG_PROBETRANSCRIPTALIGN => {
            TARGETSEQS => $ENV{'TRANSCRIPTSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.PHALANX.fasta',
            OPTIONS => ' --bestn 101 --dnahspthreshold 246 --fsmmemory 256 --dnawordlen 30 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,    #Allow a single mismatch
        },

        AFFY_UTR_PROBETRANSCRIPTALIGN => {
            TARGETSEQS => $ENV{'TRANSCRIPTSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.AFFY_UTR.fasta',

            #25 mers
            OPTIONS => ' --bestn 101 --fsmmemory 256 --dnawordlen 12 --seedrepeat 2 --dnahspthreshold 118 --dnawordlimit 0',

            #OPTIONS => ' --bestn 101 --dnahspthreshold 116 --fsmmemory 256 --dnawordlen 14 --dnawordlimit 11 ',
            #HIT_SATURATION_LEVEL => 100,#I don't think we want this for the transcript mappings
            #Defaults to 100 anyway, but not used
            #FILTER_METHOD => 'filter_mismatches',#Would need to add another method to Runnable::Exonerate
            MAX_MISMATCHES => 1,
        },

        #Essentially same as AFFY but with different NR_FASTA
        AFFY_ST_PROBETRANSCRIPTALIGN => {
            TARGETSEQS => $ENV{'TRANSCRIPTSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.AFFY_ST.fasta',

            #25 mers
            OPTIONS => ' --bestn 101 --fsmmemory 256 --dnawordlen 12 --seedrepeat 2 --dnahspthreshold 118 --dnawordlimit 0',

            #OPTIONS => ' --bestn 101 --dnahspthreshold 116 --fsmmemory 256 --dnawordlen 14 --dnawordlimit 11 ',
            #HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES => 1,
        },

        #AGILENT 60 mers
        AGILENT_PROBEALIGN => {
            TARGETSEQS => $ENV{'GENOMICSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.AGILENT.fasta',
            OPTIONS => ' --bestn 101 --dnahspthreshold 246 --fsmmemory 256 --dnawordlen 30 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,
        },

        AGILENT_PROBETRANSCRIPTALIGN => {
            TARGETSEQS => $ENV{'TRANSCRIPTSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.AGILENT.fasta',
            OPTIONS => ' --bestn 101 --dnahspthreshold 246 --fsmmemory 256 --dnawordlen 30 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,
        },

        #LEIDEN 50 mers
        LEIDEN_PROBEALIGN => {
            TARGETSEQS => $ENV{'GENOMICSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.LEIDEN.fasta',
            OPTIONS => ' --bestn 101 --dnahspthreshold 246 --fsmmemory 256 --dnawordlen 25 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,
        },

        LEIDEN_PROBETRANSCRIPTALIGN => {
            TARGETSEQS => $ENV{'TRANSCRIPTSEQS'},
            QUERYSEQS  => $work_dir.'/arrays_nr.LEIDEN.fasta',
            OPTIONS => ' --bestn 101 --dnahspthreshold 246 --fsmmemory 256 --dnawordlen 25 --seedrepeat 2 --dnawordlimit 0 ',
            HIT_SATURATION_LEVEL => 100,
            MAX_MISMATCHES       => 1,
        },
    }
);

# import
#
# Called implicitly by "use Bio::EnsEMBL::Analysis::Config::ProbeAlign ...".
# Aliases the requested top-level %Config entries into the calling package as
# scalar variables of the same name (e.g. $PROBE_CONFIG).
#
#   Args   : optional list of %Config keys; with no list every key is exported
#   Returns: nothing
#   Dies   : if predeclaring the variables fails, or with
#            "Error: Config: <name> not known" for a name that has no
#            defined entry in %Config
sub import {
    my ($callpack) = caller(0);    # Name of the calling package
    shift;                         # Discard our own package name from @_

    # Get list of variables supplied, or else everything
    my @vars = @_ ? @_ : keys(%Config);
    return unless @vars;

    # Predeclare global variables in calling package. This has to be a
    # string eval: the target package is only known at run time.
    my $declared = join( ' ', map { '$' . $_ } @vars );
    eval "package $callpack; use vars qw($declared)";
    die $@ if $@;

    foreach my $var (@vars) {
        die "Error: Config: $var not known\n"
            unless defined $Config{$var};

        # Exporter does a similar job to the following statement, but for
        # function names, not scalar variables: the caller's scalar becomes
        # an alias of the %Config slot rather than a copy.
        no strict 'refs';
        *{"${callpack}::$var"} = \$Config{$var};
    }

    return;
}

1;