Raw content of Bio::EnsEMBL::Pipeline::GeneDuplication::Finder.

Raw content of Bio::EnsEMBL::Pipeline::GeneDuplication::Finder package Bio::EnsEMBL::Pipeline::GeneDuplication::Finder; use vars qw(@ISA); use strict; use Bio::EnsEMBL::Root; use Bio::Tools::Run::Alignment::Clustalw; use Bio::EnsEMBL::Pipeline::Runnable::BlastDB; use Bio::EnsEMBL::Pipeline::Runnable::MinimalBlast; use Bio::EnsEMBL::Pipeline::SeqFetcher::FetchFromBlastDB; use Bio::EnsEMBL::Pipeline::GeneDuplication::PAML; use Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment; use Bio::EnsEMBL::Pipeline::GeneDuplication::Result; use Bio::EnsEMBL::Utils::Exception qw(warning throw); use Bio::EnsEMBL::Utils::Argument qw(rearrange); my $DEFAULT_DISTANCE_METHOD = 'NeiGojobori'; my $DEFAULT_BLAST_PROGRAM = 'wublastn'; my $DEFAULT_DISTANCE_CUTOFF = 1.000; @ISA = qw(Bio::EnsEMBL::Root Bio::EnsEMBL::Pipeline::RunnableI); ### Constructor ### =head2 new Args : -dbfile - (string) Full path to a fasta formatted file of nucleotide sequences. -blastdb - A Bio::EnsEMBL::Pipeline::Runnable::BlastDB object for which the run method has been invoked. -query_seq - A Bio::Seq which is comprised of nucleotides. -blast_program - Manually set the blast program (and full path) that should be used. Make sure the program matches the index type chosen. Defaults to wublastn. -blast_index_type - The distribution of blast to use. See Bio::EnsEMBL::Pipeline::Runnable::BlastDB->index_type documentation. Defaults to 'wu_new'. -hit_identity - (optional) Hit identity. Defaults to 0.80 -hit_coverage - (optional) Hit coverage. Defults to 0.80 -work_dir - (optional) Dir where working files are placed. Defaults to /tmp. -codeml - Full path to codeml executable. -genetic_code - 0 for Universal, 1 for Mitochondrial. See the Bio::Seq->translate method for a full list of options. -regex_query_species - A regular expression that will parse some portion of the ids of the query species (and not the ids of outgroup species). Eg. 'ENSG'. -regex_outgroup_species - A ref to an array of regular expressions that will parse the ids of the various outgroup species (but not the ids of the query species). E.g. ['ENSMUSG', 'ENSRNOG'] -distance_cutoff - A genetic distance cutoff to apply for occasions where an outgroup species is not set, or an outgroup species match was not found. -distance_method - The method used to calculate the genetic distance and Ka/Ks ratios. Options are 'NeiGojobori' and 'ML', ML being the maximum likelihood method of Yang. Example : none Description: Constructs new object Returntype : Bio::EnsEMBL::Pipeline::GeneDuplication::Finder Exceptions : Throws if database file not specified or does not exist. Caller : General =cut sub new { my ($class, @args) = @_; my $self = bless {}, $class; my ($query, $blastdb, $blast_program, $blast_index_type, $work_dir, $codeml, $genetic_code, $regex_query_species, $regex_outgroup_species, $identity_cutoff, $coverage_cutoff, $distance_cutoff, $distance_method) = rearrange([qw(QUERY BLASTDB BLAST_PROGRAM BLAST_INDEX_TYPE WORK_DIR CODEML_EXECUTABLE GENETIC_CODE REGEX_QUERY_SPECIES REGEX_OUTGROUP_SPECIES HIT_IDENTITY HIT_COVERAGE DISTANCE_CUTOFF DISTANCE_METHOD )],@args); $self->_work_dir($work_dir) if $work_dir; if ($blastdb && $blastdb->isa("Bio::EnsEMBL::Pipeline::Runnable::BlastDB")){ $self->_blastdb($blastdb); } else { throw ("Need a Bio::EnsEMBL::Pipeline::Runnable::BlastDB object."); } $self->_query_seq($query) if $query; $self->_codeml($codeml) if $codeml; $self->_genetic_code($genetic_code); $self->_regex_query_species($regex_query_species) if $regex_query_species; $self->_regex_outgroup_species($regex_outgroup_species) if $regex_outgroup_species; $self->_identity_cutoff($identity_cutoff) if $identity_cutoff; $self->_coverage_cutoff($coverage_cutoff) if $coverage_cutoff; $self->_blast_program($blast_program) if $blast_program; $self->_identity_cutoff(80) unless $self->_identity_cutoff; $self->_coverage_cutoff(80) unless $self->_coverage_cutoff; $self->_distance_method($distance_method) if $distance_method; $self->_distance_cutoff($distance_cutoff) if $distance_cutoff; return $self } ### Public methods ### =head2 run Args[1] : [optional] Input sequence (Bio::Seq) Example : none Description: Top level method that executes the gene duplicate finding algorithm. Returntype : Bio::EnsEMBL::Pipeline::GeneDuplication::Result Exceptions : Warns if PAML run fails. Throws if PAML returns multiple result sets (unlikely). Caller : General =cut sub run { my ($self, $input_seq) = @_; $self->_query_seq($input_seq) if $input_seq; # Derive a list of sequences that look like duplications of # the query sequence. my $accepted_hits = $self->_find_recent_duplications($self->_query_seq); unless (scalar @$accepted_hits > 1) { print "No homologous matches were found that satisfied the match criteria.\n"; return 0 } return $accepted_hits; } ### Hit Chooser Methods ### =head2 _find_recent_duplications Args[1] : Bio::Seq query sequence Example : none Description: This is the implementation of the main algorithm of this module. Returntype : Exceptions : Caller : General =cut sub _find_recent_duplications { my ($self, $query) = @_; # Perform blast search with this query sequence. my $bplite_report = $self->_run_blast($query); # Take blast report and filter for hits with good coverage # and sufficiently high identity. my $good_hits = $self->_preliminary_filter($bplite_report); # Check that good hits have been found. unless (scalar(keys %$good_hits)){ print "Did not find hits to query sequence.\n"; return [] } # From the set of promising hits, which will possibly include # hits to other species, filter using outgroup sequences (if # any) and the genetic distance between each hit and the query # sequence. my $recent_duplications = $self->_phylogenetic_filter($good_hits); # Return a complex data object that contains the pairwise # match details for each final accepted hit. return $recent_duplications; } =head2 _run_blast Args[1] : Bio::Seq query sequence Example : none Description: Returntype : Exceptions : Caller : General =cut sub _run_blast { my ($self, $query) = @_; # We are looking for duplicates of the query gene # in our blast database. $self->_query_seq($query) if $query; my $bplite_report; eval{ $bplite_report = $self->_blast_obj->run; }; throw ("Blast did not run successfully. Blast program was [". $self->_blast_program."]. Index type was [". $self->_blast_index_type."].") if $@; throw ("Blast process did not return a report.") unless ($bplite_report->isa("Bio::Tools::BPlite")); return $bplite_report } =head2 _preliminary_filter Args[1] : Example : none Description: Returntype : Exceptions : Caller : General =cut sub _preliminary_filter { my ($self, $bplite_report) = @_; # Process our blast report. # # For each blast hit: # * throw away self matches (if any) # * filter hits by identity and coverage # * align WHOLE genes of promising hits, re-filter by # identity and coverage # * calculate genetic distance between each subject and the query my %good_hits; DISTINCT_HIT: while (my $sbjct = $bplite_report->nextSbjct){ # Mangle the BPLite::Sbjct object for its own good. Quite # often the hit ids parsed by BPLite include the whole # Fasta header description line. This is problematic if # sequence ids need to be compared or a hash is keyed on # this sequence id. Here we simply lop the description # from each Sbjct->name, if there is one. $sbjct = $self->_fix_sbjct($sbjct); # It appears that the BPLite::Sbjct object only allows # HSPs to be accessed once (as this process is closely # tied to the parsing of the Blast report). Hence, here # we loop through them all here and store them in an # array. my @hsps; while (my $hsp = $sbjct->nextHSP) { push (@hsps, $hsp); } # Skip hit if it is a match to self. next DISTINCT_HIT if ($self->_query_seq->display_id eq $sbjct->name); # First, filter hits by identity my $hit_identity = $self->_hit_identity(\@hsps); next DISTINCT_HIT unless ($hit_identity >= $self->_identity_cutoff); # Second, filter hits by coverage next DISTINCT_HIT unless ($self->_appraise_hit_coverage($sbjct, \@hsps)); # Third, this has become a serious hit. Make a pairwise # alignment of the query and hit. my @seqs = ($self->_fetch_seq($self->_query_seq->display_id), $self->_fetch_seq($sbjct->name)); throw ("Didnt correctly obtain two sequences for alignment.") unless scalar @seqs == 2; my $align = $self->_pairwise_align(\@seqs); # and filter by coverage and identity for the whole # aligned sequences. my ($min_coverage, $identity) = $self->_calculate_coverage_and_identity($align); unless ($min_coverage >= $self->_coverage_cutoff && $identity >= $self->_identity_cutoff) { next DISTINCT_HIT; } # Forth, calculate the genetic distance while we are here. my ($ka, $ks, $n, $s) = $self->_run_pairwise_paml($align); # Finally, store all this useful information. Perhaps a proper # object would make this easier to access later? die "undefined N and S" unless defined $n and defined $s; $good_hits{$sbjct->name} = {'Ka' => $ka, 'Ks' => $ks, 'N' => $n, 'S' => $s, 'coverage' => $min_coverage, 'identity' => $identity, 'alignment'=> $align}; } return \%good_hits } =head2 _phylogenetic_filter Args[1] : Example : none Description: Returntype : Exceptions : Caller : General =cut sub _phylogenetic_filter { my ($self, $good_hits) = @_; my %hits_by_species; HIT: foreach my $hit_id (keys %$good_hits) { # Now, partition hits according to their species. The species # from which the subject is derived is determined by a # regular expression match to the sequence id. my $query_regex = $self->_regex_query_species; if ($hit_id =~ /$query_regex/) { push(@{$hits_by_species{$self->_regex_query_species}}, $hit_id); next HIT; } else { foreach my $regex (@{$self->{_regex_outgroup_species}}) { if ($hit_id =~ /$regex/){ push (@{$hits_by_species{$regex}}, $hit_id); next HIT; } } } warning("Didnt match hit id to any regex [". $hit_id ."]."); } # Return now if there are no intra-species hits. unless (defined @{$hits_by_species{$self->_regex_query_species}} && scalar @{$hits_by_species{$self->_regex_query_species}}) { return [] } # Sort our hits by their distance to the query sequence. my %sorted_hits_by_species; foreach my $species (keys %hits_by_species) { my @sorted_hits = sort {$good_hits->{$a}->{Ks} <=> $good_hits->{$b}->{Ks}} @{$hits_by_species{$species}}; $sorted_hits_by_species{$species} = \@sorted_hits; } # Accept all query species hits with a distance less than # the distance to the most related outgroup species. $self->outgroup_distance($self->_distance_cutoff); my $closest_outgroup_distance = $self->_distance_cutoff; foreach my $regex (@{$self->_regex_outgroup_species}){ next unless $sorted_hits_by_species{$regex}; my $closest_hit_id = $sorted_hits_by_species{$regex}->[0]; throw("Missing distance value") unless defined $good_hits->{$closest_hit_id}->{Ks} || $good_hits->{$closest_hit_id}->{Ks} == 0; my $closest_hit_distance = $good_hits->{$closest_hit_id}->{Ks}; if (($closest_hit_distance < $closest_outgroup_distance) && ($closest_hit_distance > 0)) { $closest_outgroup_distance = $closest_hit_distance; } } $self->outgroup_distance($closest_outgroup_distance); my @accepted_hits; my @query_species_hits = @{$sorted_hits_by_species{$self->_regex_query_species}}; foreach my $hit_id (@query_species_hits) { if ($good_hits->{$hit_id}->{Ks} <= $closest_outgroup_distance && $good_hits->{$hit_id}->{Ks} >= 0) { push (@accepted_hits, $good_hits->{$hit_id}); } elsif ($good_hits->{$hit_id}->{Ks} >= 0) { last; } } return \@accepted_hits } ### Utility Methods ### =head2 _run_pairwise_paml Args : An arrayref to a set of aligned Bio::Seq objects. Example : none Description: Uses an array of aligned Bio::Seq objects to execute PAML in pairwise mode. Returntype : Bio::Tools::Phylo::PAML Exceptions : none Caller : $self->run =cut sub _run_pairwise_paml { my ($self, $aligned_seqs) = @_; # Cope with a bloody PAML quirk. Identical sequences # cause a fatal error. if ($aligned_seqs->[0]->seq eq $aligned_seqs->[1]->seq) { return (0, 0, 0, 0) } # The business of running codeml. my $retries = 0; my $result; my $paml; # Object must stay in scope until parsing is complete. while (! defined $result && $retries < 3) { $paml = Bio::EnsEMBL::Pipeline::GeneDuplication::PAML->new( '-work_dir' => $self->_work_dir, '-executable' => $self->_codeml, '-aligned_seqs' => $aligned_seqs, '-runmode' => '-2', '-seqtype' => '1', '-model' => '0', '-nssites' => '0', '-icode' => ($self->_genetic_code) - 1 ); my $parser = $paml->run_codeml; # Often a codeml run might look successful, but the Bioperl parser # will not be able to read the output created. This is usually # due to a silent codeml failure and a lack of output. Hence, # here an attempt is made to check the success of the parser. eval { $result = $parser->next_result(); }; if ($@ || ! defined $result) { warning("Problem deriving PAML result object.\n$@") } $retries++ } unless (defined $result) { throw ("Result has become undefined. This is probably a problem " . "with Bio::Tools::Phylo::PAML.") } my $matrix; eval { if ($self->_distance_method eq 'NeiGojobori') { $matrix = $result->get_NGmatrix() } else { $matrix = $result->get_MLmatrix() } }; if ($@){ throw ("PAML failed to give a file that could be parsed.\n" . "This is a PAML/codeml error.\n" . "The offending aligned sequences are :\n>" . $aligned_seqs->[0]->display_id . "\n" . $aligned_seqs->[0]->seq . "\n>" . $aligned_seqs->[1]->display_id . "\n" . $aligned_seqs->[1]->seq . "\n$@"); } $matrix->[0]->[1]->{N} = 0 unless defined $matrix->[0]->[1]->{N}; $matrix->[0]->[1]->{S} = 0 unless defined $matrix->[0]->[1]->{S}; return ($matrix->[0]->[1]->{dN}, $matrix->[0]->[1]->{dS}, $matrix->[0]->[1]->{N}, $matrix->[0]->[1]->{S}) } =head2 _pairwise_align Args[1] : Example : Description: Returntype : Exceptions : Caller : =cut sub _pairwise_align { my ($self, $seqs) = @_; throw ("Pairwise alignment was only expecting two sequences.") unless ((scalar @$seqs) == 2); my $cba = Bio::EnsEMBL::Pipeline::GeneDuplication::CodonBasedAlignment->new( -genetic_code => 1); $cba->sequences($seqs); my $aligned_seqs = $cba->run_alignment; return $aligned_seqs } =head2 _calculate_coverage_and_identity Args[1] : Example : Description: Returntype : Array (min_coverage, identity) Exceptions : Caller : =cut sub _calculate_coverage_and_identity { my ($self, $align) = @_; my $seq1 = $align->[0]->seq; my $seq2 = $align->[1]->seq; my $align_len = length($seq1); my $seq_length1 = 0; my $seq_length2 = 0; my $matches = 0; my $identical = 0; for (my $i = 0; $i < $align_len; $i++) { my $base1 = substr($seq1, $i, 1); my $base2 = substr($seq2, $i, 1); if ($base1 ne '-') { $seq_length1++; } if ($base2 ne '-') { $seq_length2++; } if ($base1 ne '-' && $base2 ne '-'){ $matches++; } if ($base1 eq $base2 && $base1 ne '-'){ $identical++; } } my $coverage1 = $matches/$seq_length1; my $coverage2 = $matches/$seq_length2; my ($min_coverage) = sort {$a <=> $b} ($coverage1, $coverage2); my $identity = $identical/$matches; return ($min_coverage, $identity) } =head2 _appraise_hit_coverage Args[1] : Example : Description: Returntype : 0 or 1 Exceptions : Caller : =cut sub _appraise_hit_coverage{ my ($self, $sbjct, $hsps) = @_; # Select our sequence length as being that of the # longer sequence. my $sbjct_length = $self->_fetch_seq($sbjct->name)->length; my ($longest_length) = sort {$a <=> $b} ($self->_query_seq->length, $sbjct_length); # Look at all the hits along the length of the # query and tally the collective coverage of the hits. my @query_coverage; foreach my $hsp (@$hsps) { for (my $base_position = $hsp->query->start; $base_position <= $hsp->query->end; $base_position++){ $query_coverage[$base_position]++; } } my $covered_bases = 0; foreach my $base (@query_coverage){ $covered_bases++ if $base; } # Return true if good coverage exists. return 1 if ($covered_bases >= $self->_coverage_cutoff * $longest_length); # Otherwise return false. return 0; } =head2 _fix_sbjct Args[1] : Example : Description: A work-around for a BPLite::Sbjct annoyance. The sbjct->name object returns the whole fasta description line for a subject hit. If the input fasta sequence file includes more than an id on the description line, this will be passed back every time the name method is called. This is a real pest is you are trying to match ids via a regex or use the ids as hash keys. Returntype : Exceptions : Caller : =cut sub _fix_sbjct { my ($self, $sbjct) = @_; my $sbjct_name = $sbjct->name; $sbjct_name =~ s/\W*(\S+).*/$1/; # BAD! $sbjct->{NAME} = $sbjct_name; return $sbjct; } =head2 _hit_identity Args[1] : Example : Description: Returntype : Exceptions : Caller : =cut sub _hit_identity{ my ($self, $hsps) = @_; my $tally_query_length = 0; my $tally_matched_bases = 0; foreach my $hsp (@$hsps) { $tally_query_length += length($hsp->querySeq); $tally_matched_bases += $hsp->positive; } if ($tally_matched_bases && $tally_query_length){ return $tally_matched_bases/$tally_query_length; } return 0 } =head2 outgroup_distance Args[1] : Example : Description: Returntype : Exceptions : Caller : =cut sub outgroup_distance { my $self = shift; if (@_) { $self->{_outgroup_distance} = shift; } return $self->{_outgroup_distance}; } ### Sequence fetching ### =head2 _fetch_seq Args[1] : Example : Description: Returntype : Exceptions : Caller : =cut sub _fetch_seq { my ($self, $id) = @_; throw ("Cant fetch sequence without an id.") unless $id; $self->{_cache} = {} unless $self->{_cache}; if ($self->{_cache}->{$id}){ return $self->{_cache}->{$id} } my $seq = $self->_seq_fetcher->fetch($id); throw ("Sequence fetch failed for id [$id].") unless ($seq && $seq->isa("Bio::Seq")); $self->{_cache}->{$seq->display_id} = $seq; return $seq } =head2 _fetch_seqs Args : An arrayref of string sequence ids. Example : none Description: An alias to $self->_fetch_seq, but handles multiple sequences. Returntype : Arrayref of Bio::Seq Exceptions : none Caller : $self->run =cut sub _fetch_seqs { my ($self, $seq_ids) = @_; my @seqs; foreach my $seq_id (@$seq_ids){ push (@seqs, $self->_fetch_seq($seq_id)); } return \@seqs; } =head2 _force_cache Args[1] : Bio::Seq Example : $self->_force_cache($seq); Description: Allows a sequence to be manually added to the seqfetcher cache. This is useful for coping with user supplied sequences (eg. passed as a query sequence) that dont exist in any database. Returntype : 1 Exceptions : Warns if sequence already exists in cache. Throws if a defined sequence isnt supplied. Caller : =cut sub _force_cache { my ($self, $seq) = @_; throw ("Trying to add something odd to the sequence cache [$seq].") unless (defined $seq); if ($self->{_cache}->{$seq->display_id}){ warning('Sequence [' . $seq->display_id . '] already exists in cache, but will replace.'); } $self->{_cache}->{$seq->display_id} = $seq; return 1 } =head2 _seq_fetcher Args : (optional) A seqfetcher of any variety, as long as it has a 'fetch' method. Example : none Description: Holds SeqFetcher object. Returntype : Bio::EnsEMBL::Pipeline::SeqFetcher::xxx Exceptions : none Caller : $self->_candidate_hits, $self->_fetch_seqs =cut sub _seq_fetcher { my $self = shift; $self->{_seq_fetcher} = shift if @_; if (! $self->{_seq_fetcher}){ $self->{_seq_fetcher} = Bio::EnsEMBL::Pipeline::SeqFetcher::FetchFromBlastDB->new( -db => $self->_blastdb); } return $self->{_seq_fetcher}; } ### Blast-related Getter/Setters ### =head2 _dbfile Args : (optional) String Example : none Description: Holds the filename of the blast database. Returntype : String. Exceptions : none Caller : $self->new, $self->_hit_chooser. =cut sub _dbfile { my $self = shift; return $self->_blastdb->dbfile } =head2 _blast_obj Args[1] : Example : Description: Returntype : Exceptions : Caller : =cut sub _blast_obj { my $self = shift; if (@_){ $self->{_blast_obj} = shift; return } unless ($self->{_blast_obj}){ # Create a new blast object with our mixed species input database. $self->{_blast_obj} = Bio::EnsEMBL::Pipeline::Runnable::MinimalBlast->new( -program => $self->_blast_program, -blastdb => $self->_blastdb, -queryseq => $self->_query_seq, -options => '-hspmax 200', -workdir => $self->_work_dir, -identity_cutoff => $self->_identity_cutoff); } return $self->{_blast_obj}; } =head2 _blastdb Args[1] : Example : Description: Returntype : Exceptions : Caller : =cut sub _blastdb { my $self = shift; if (@_) { $self->{_blastdb} = shift; throw ("Blast database must be a Bio::EnsEMBL::Pipeline::Runnable::BlastDB.") unless ($self->{_blastdb}->isa("Bio::EnsEMBL::Pipeline::Runnable::BlastDB")); throw ("Blast database has not been formatted.") unless $self->{_blastdb}->db_formatted; throw ("Blast database has been built without the " . "make_fetchable_index flag set (and this is " . "a problem because the database can not be " . "used for sequence fetching).") unless $self->{_blastdb}->make_fetchable_index } throw ("Blast database object not set.") unless ($self->{_blastdb}); return $self->{_blastdb}; } =head2 _blast_program Args[1] : Example : Description: Returntype : Exceptions : Caller : =cut sub _blast_program { my $self = shift; if (@_){ $self->{_blast_program} = shift; return } $self->{_blast_program} = $DEFAULT_BLAST_PROGRAM unless $self->{_blast_program}; return $self->{_blast_program}; } =head2 _blast_index_type Args[1] : Example : Description: Returntype : Exceptions : Caller : =cut sub _blast_index_type { my $self = shift; return $self->_blastdb->index_type; } ### Getter/Setters ### =head2 _query_seq Args : (optional) Bio::Seq Example : none Description: Holds the query sequence for which we are searching for duplicates. Returntype : Bio::Seq Exceptions : none Caller : $self->run =cut sub _query_seq { my $self = shift; if (@_) { $self->{_query_seq} = shift; throw ("Query sequence is not a Bio::Seq object [" . $self->{_query_seq} . "]") unless $self->{_query_seq}->isa("Bio::Seq"); throw ("Cant add query sequence to sequence cache manually.") unless $self->_force_cache($self->{_query_seq}); return } throw ("Query sequence has not been set.") unless $self->{_query_seq}; return $self->{_query_seq}; } =head2 _work_dir Args : (optional) String. Example : none Description: Holds the path to the working directory. Returntype : String. Exceptions : none Caller : $self->new, $self->_hit_chooser, $self->_run_pairwise_paml. =cut sub _work_dir { my $self = shift; if (@_) { $self->{_work_dir} = shift; return } throw ("Work directory not set.") unless $self->{_work_dir}; return $self->{_work_dir}; } =head2 _identity_cutoff Args : (optional) an int or a float - a percentage value anyways. Example : none Description: Holds identity cutoff percentage value. Returntype : A float value Exceptions : none Caller : $self->new, $self->_hit_chooser. =cut sub _identity_cutoff { my $self = shift; if (@_) { $self->{_identity_cutoff} = shift; $self->{_identity_cutoff} /= 100 if $self->{_identity_cutoff} > 1; return } throw ("Blast match identity cutoff has not been set.") unless $self->{_identity_cutoff}; return $self->{_identity_cutoff}; } =head2 _coverage_cutoff Args : (optional) an int or a float - a percentage value anyways. Example : none Description: Holds coverage cutoff percentage value. Returntype : A float value. Exceptions : none Caller : $self->new, $self->_hit_chooser. =cut sub _coverage_cutoff { my $self = shift; if (@_) { $self->{_coverage_cutoff} = shift; $self->{_coverage_cutoff} /= 100 if $self->{_coverage_cutoff} > 1; return } throw ("The coverage cutoff has not been set.") unless $self->{_coverage_cutoff}; return $self->{_coverage_cutoff}; } =head2 _distance_cutoff Args[1] : Example : Description: Returntype : Exceptions : Caller : =cut sub _distance_cutoff { my $self = shift; if (@_) { $self->{_distance_cutoff} = shift; return } $self->{_distance_cutoff} = $DEFAULT_DISTANCE_CUTOFF unless $self->{_distance_cutoff}; return $self->{_distance_cutoff}; } =head2 _regex_query_species Args : String Example : $self->_regex_query_species('ENSG') Description: Holds a regex string that will match the id of any sequence from the query species. Returntype : String or 1 Exceptions : Throws when regex is not set. Caller : $self->new, $self->_hit_chooser =cut sub _regex_query_species { my $self = shift; if (@_) { $self->{_regex_query_species} = shift; return } throw ("The regular expression used to match the sequence" ." ids from the query species has not been set.") unless $self->{_regex_query_species}; return $self->{_regex_query_species}; } =head2 _regex_outgroup_species Args : ref to an array of strings Example : $self->_regex_outgroup_species(['ENSRNO', 'ENSMUS']); Description: Holds an array of regexs that will allow the sequence id of all non-query species sequences to be matched. Returntype : arrayref Exceptions : Warns if called while unset. Caller : $self->new, $self->_hit_chooser =cut sub _regex_outgroup_species { my $self = shift; if (@_) { $self->{_regex_outgroup_species} = shift; return } warning('No outgroup species regex provided. ' . 'This may or may not be what you intend.') unless $self->{_regex_outgroup_species}; return $self->{_regex_outgroup_species}; } =head2 _genetic_code Args : int Example : $self->_genetic_code(1); Description: Holds an integer representing the genetic code. To choose the correct integer consult the documentation used by the Bio::Seq->translate method. 1 is universal, 2 is vertebrate mitochondria. Returntype : int Exceptions : Warns if called while unset. Caller : $self->new, $self->run, $self->_run_pairwise_paml =cut sub _genetic_code { my $self = shift; if (@_) { $self->{_genetic_code} = shift; return } unless (defined $self->{_genetic_code}) { throw ('Genetic code unset.') } return $self->{_genetic_code}; } =head2 _distance_method Args : String Example : Description: Returntype : Exceptions : Throws if set to an unrecognised string. Caller : =cut sub _distance_method { my $self = shift; if (@_) { $self->{_distance_method} = shift; unless ($self->{_distance_method} =~ /NeiGojobori/i | $self->{_distance_method} =~ /ML/i){ throw ("Distance method must be set to either " . "NeiGojobori or ML, not [". $self->{_distance_method}."]"); } } $self->{_distance_method} = $DEFAULT_DISTANCE_METHOD unless $self->{_distance_method}; return $self->{_distance_method}; } =head2 _codeml Args : [optional] String Example : $self->_codeml('/path/to/codeml') Description: Holds the path to the codeml executable Returntype : String Exceptions : Throws if a full path is included, but the file is not executable. Caller : $self->new, $self->_run_pairwise_paml =cut sub _codeml { my $self = shift; if (@_) { $self->{_codeml} = shift; return } $self->{_codeml} = 'codeml' unless $self->{_codeml}; # If it looks like our executable comes with a full # path, check that it will work. throw ("codeml executable not found or not " . "executable. Trying to execute path " . "[". $self->{_codeml} ."]") if ($self->{_codeml} =~ /^\// & !-x $self->{_codeml}); return $self->{_codeml}; } return 1