ExonerateProbe documentation.

sub make_feature {

  my ($self, @args) = @_;
 
  #huh? Isn't this assign self to $tag?
 
  my (
    $tag, $q_id, $q_start, $q_end, $q_strand, 
    $t_id, $t_start, $t_end, $t_strand, $score, 
    $q_length, $query_match_length
  ) = @_;


  #because of the 'in-between' coordinates.???
  $t_start += 1;
  #What about t_end and q_start/end?


  my $probe_internal_id = $q_id;
  my $mismatch_count;
  
  if(!($probe_internal_id =~ /\d+/)){
    throw "Probe headers MUST be the internal db ids of the Probes for this parser to work!\n";
  }
  


  #we need to full unmatched query length to calculat the mismatches!
  #Can't assume will always be

  #=pod

  #If we miss by more than one, don't create the feature.
  #This is based on 25mers
  #we need to be able to adapt this for a % hit

  #This should be calculated based on % id?
  #score is + 5 for a match -4 for mismatch: hence -9 from total score for one mismatch.

  #if($query_match_length == $q_length){
 

    #if($score == 125){#25mers
#    if($score == 250){ #50mers
  #    $mismatch_count = 0;
    #}elsif ($score == 116) {
      #}elsif ($score == 241) {#50mer 98%
    #  $mismatch_count = 1;
      #}elsif ($score == 232) {#50mer 96%
      #  $mismatch_count = 2;  
    #} else {
 #     return undef;
    #}
  #}elsif($query_match_length == $q_length -1){
  #  if ($score == 120) {
      #if ($score == 245) {#50mer 100% over 49 bases
  #    $mismatch_count = 1;
      #}elsif ($score == 236) {#50mer just under 98% over 49 bases
      #  $mismatch_count = 2;  
  #  }
  #  else {
  #    return undef;
  #  }
  #}
    #elsif($query_match_length == $q_length -2){#50mers only!
    #  if($score == 240){
    #    $mismatch_count=2;
    #  }
    #}else{
    #  return undef;
    #}

  #=cut

  ##NATH HACK
    #100% ID for tiling arrays
    if(($query_match_length == $q_length) && $score == 250){
      $mismatch_count = 0;
    } else {
      return undef;
    }
  #98 % =  241 for 50 mers
  
  
  
  #Create a dummy probe object to pass out. It must be checked and
  #replaced by the parent runnabledb before being stored.

  #my $probe = 
  #  new Bio::EnsEMBL::Funcgen::Probe(
  #    -dbID =>$q_id,
  #    -name => "replace this probe name",
  #    -arrayname => "replace this array name",
  #  );
   
  # Everything is 'flipped' into the forward strand of the probe -
  # so a hit on the reverse strand of the probe (q_strand = '-1')
  # is altered:  q_strand = '+1' and t_strand => -1 x t_strand. 
  if($q_strand eq '+'){
    if($t_strand eq '+'){
      $t_strand = 1;
    }elsif($t_strand eq '-'){
      $t_strand = -1;
    }else{
      throw "unrecognised target strand symbol: $t_strand\n";
    }
  }elsif($q_strand eq '-'){
    if($t_strand eq '-'){
      $t_strand = 1;
    }elsif($t_strand eq '+'){
      $t_strand = -1;
    }else{
      throw "unrecognised target strand symbol: $t_strand\n";
    }
  }else{    
      throw "unrecognised query strand symbol: $q_strand\n";
  }
  

  my $feature =
    new Bio::EnsEMBL::Funcgen::ProbeFeature(
											#-probe => $probe,
											-probe_id => $q_id,
											-start => $t_start,
											-end => $t_end,
											-strand => $t_strand,
											-mismatchcount => $mismatch_count,
										   );

  # attach the slice name onto the feature: let the runnabledb
  # sort out whether it's valid.
  $feature->seqname($t_id);
  
  return $feature;
}


1;

}

sub new {

  my ( $class, @args ) = @_;
  #my $self = $class->SUPER::new(@args);

  #slightly rearranged order as we want to pass some different defaults to BaseExonerate

  my ($max_mismatches, $mapping_type, $basic_options) = rearrange(['max_mismatches', 'mapping_type', 'basic_options'], @args);
  
  my %basic_opts;

  #We could change mapping_type to same_strand hits filter
  #As this is the only thing we're using it for here


  if(! defined $basic_options){
	#parse result depends on the output format options
	#only override if you intend overload or rewrite the parse_results method.
	#Now let's reset the default BaseExonerate options to remove vulgar and add scores
	#RESULT: 3020922 0 50 + ENSMUST00000111559 964 1014 + 250 100.00 50 3184 0 scores:0:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:0:

	$basic_opts{'-basic_options'} = "--showsugar false --showvulgar false --showalignment false --ryo\" RESULT: %S %pi %ql %tl %em scores:{%Ps:}\\n\" ";
  }

  my $self = $class->SUPER::new(@args, %basic_opts);

  if(! defined $max_mismatches){
	throw("Must provide a -max_mismatches parameter e.g. 0, 1 or 2");
  }

  if(! (defined $mapping_type && $mapping_type =~ /genomic|transcript/)){
	throw("Must provide a valid -mapping_type parameter e.g. genomic or transcript");
  }

  #reset to the new default exonerate
  if (not $self->program) {
	#This is an architecture specific build path
    $self->program('/lustre/work1/ensembl/gs2/local/x86_64/bin/exonerate');
  }
 


  #if(! (defined $filter_method && (ref($filter_method) ne 'CODE' || $self->can($filter_method)))){
#	throw('You must pass a -filter_method name or CODEREF to filter the ProbeFeatures');
#  }

  ##Set code ref or pointer to internal method
  #We will always have explicitly pass self to the coderef
  

  #$self->{'filter_method'} = (ref($filter_method) eq 'CODE') ? $filter_method : $self->can($filter_method);
  $self->{'max_mismatches'} = $max_mismatches;
  $self->{'mapping_type'}   = $mapping_type;
  


  return $self;

}

sub parse_results {

  my ( $self, $fh ) = @_;
  
  my @features;
  #my $filter_method = $self->{'match_rules'};
  #my $filter_method = $self->filter_method;
  #No, now uses code ref to allow definition in config

  #print "Parsing results from fh ".Data::Dumper::Dumper($fh)."\n";

  
  my ($tag, $probe_id, $q_start, $q_end, $q_strand, 
	 $t_id, $t_start, $t_end, $t_strand, $score, $tscore,
	 $perc_id, $q_length, $t_length, $mismatch_count, $scores,
	$match_length, $align_mismatch, $total_mismatches, $tmp);

  my $max_mismatches = $self->max_mismatches;
  my $mapping_type   = $self->mapping_type;


  while (<$fh>){
    #print STDERR $_ if $self->_verbose;
    next unless /^RESULT:/;
    chomp;
    
	#Vulgar blocks are also report in in-between coords! So need to add 1(only to start???)
	#Shows  the  alignments  in "vulgar" format.  Vulgar is Verbose Useful Labelled Gapped Alignment Report, This format also starts with the same 9 fields as sugar output (see above), and is followed by a series of <label, query_length, target_length> triplets.  The label may be one of the following:

#              M      Match
#              G      Gap
#              N      Non-equivalenced region
#              5      5' splice site
#              3      3' splice site
#              I      Intron
#              S      Split codon
#              F      Frameshift

	#Should always be M probe_length target_length(which would be equal to probe length for ungapped alignment))

	#Can we have a way of dynamically linking the ryo string to a particular method?
	#We shouldn't need different formats!!!
	#RESULT: %S %pi %ql %tl %em scores:{%Ps:}\n
	#RESULT: 3020922 0 50 + ENSMUST00000111559 964 1014 + 250 100.00 50 3184 0 scores:0:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:0:
	
	#So we use %em to figure out if we don't have a perfect match, then we use %em the info string to figure out where the mismatches are
	#< 25M effectively means mismatches at the start of end of a query sequence
	
	print $_."\n";

    (undef, $probe_id, $q_start, $q_end, $q_strand, 
	 $t_id, $t_start, $t_end, $t_strand, $score, 
	 $perc_id, $q_length, $t_length, $mismatch_count, $scores) = split;

	#Let's validate output formats somehow?

	


	#$gene_orientation always .?
	#, $match_type, $query_match_length, $target_match_length, @rest_of_vulgar #vulgar blocks	
	#query strand should always be +?
	
	#Do we really need vulgar? 
	#We never get anything than M N N for ungapped alignments?
	
	#if(@rest_of_vulgar){
	#  throw ("There is more than a simple match ('M') vulgar output:\n".
	#		 "$match_type $query_match_length $target_match_length @rest_of_vulgar\n".
	#		 "for affy tag $probe_id mapped to region $t_id \n");
	#}
	
	#if(!($match_type eq 'M')){
	#  throw "I have received a starting Vulgar symbol $match_type which is not a match!"; 
	#}
  


	#1 25 + ENSMUST00000115314 1593 1617 + 120 100.00 25 3173 . 0 M 24 24
	#0 25 + ENSMUST00000109902 2318 2343 + 116 96.00 25 2449 . 1 M 25 25 

	#because of the 'in-between' coordinates.???
	$t_start += 1;	
	#What about t_end?
	#Not all seem to be inbetween:
	#0 25 -represents 1-25 of a 25mer
	#Only seems to affect start!
	#what about query? We also need to add to query if we were using it.
	
	
	if(!($probe_id =~ /\d+/)){
	  throw "Probe headers MUST be the internal db ids of the Probes for this parser to work!\n";
	}
  


	#We could filter here for -ve strand on transcripts, unless we want to capture this infor for antisense transcription?
	#But this may not follow cdna, so genomic alignment is fine for this.
	#Would -ve to -ve alignments be valid? Yes, but they will never be report over +ve to +ve?
	#SHouldn't this be filtered in the RunnableDB ProbeAlign and leave this to be generic?

	if($mapping_type eq 'transcript' && $t_strand eq '-'){

	  if($q_strand eq '-'){
		warn "Found -ve to -ve mapping for probe $probe_id, need to account for this";
	  }
	  else{
		next;
	  }
	}


	#RESULT: 9379131 0 25 + chromosome:NCBIM37:16:1:98319150:1 33212985 33213010 + 116 96.00 25 98319150 1 scores:0:5:5:5:5:5:5:5:5:5:5:-4:5:5:5:5:5:5:5:5:5:5:5:5:5:5:0: at /nfs/acari/nj1/src/ensembl-analysi


	#filter_emthod is FILTER_METHOD coderef set in the RunnableDB analysis config hash

	#We don't need this filter method any more as we can simply us the query length - match_length + mismatch count!!!
	#Will query start end always have full query length?
	#How will we know full query length if we have different sized probes?

	#my $mismatch_count = $self->filter_method($query_match_length, $q_length, $score);#, $cigar_line);


	$match_length = $q_end - $q_start;
	$align_mismatch = $q_length - $match_length;

	#We also need to alter $t_end or $t_start if we have an align_mismatch!?
	#Should we be reporting these align mismatches at the end?
	#We simply need to know them for filtering
	#We have no way of knowing the full query length if we don't pass the full length
	#cigar line to the Runnable DB
	#Storing the hanging mismatches is not incorrect, just non-sandard
	#We would have to pass the probe length as part of the ProbeFeature
	#to enable storage of query % ID
	#Leave for now as we have no way of passing the query start stop to ProbeAlign
	#hence we would lose where the flanking mismatch was.
	$total_mismatches = $mismatch_count + $align_mismatch;
	my @soft_cigar_line;
	
	#5' unaligned
	if($align_mismatch){
	  push @soft_cigar_line, $q_start.'m' if $q_start;#set this to the value of start if not 0
	  #We want to subtract from start if +ve hit
	  #else we want to add to end if -ve strand, end is actually start in ensembl terms
	  $t_start = ($t_strand eq '+') ? ($t_start - $q_start) : ($t_end + $q_start);
	}


	#mismatches
	#if($mismatch_count){
	if($total_mismatches){
	  my @scores = split/:/, $scores;
	  #remove scores:0
	  shift @scores;
	  shift @scores;
	  #remove last :0
	  pop @scores;
	  #Is this quicker that a splice?

	  my $cnt        = 0;
	  my $prev_score = 5;#always starts with a match otherwise it wouldn't have been reported.

	  while($tscore = shift @scores){
		
		if($tscore == $prev_score){
		  $cnt += 1;
		}else{
		  $tmp = ($prev_score == 5) ?  $cnt.'M' :  $cnt.'m';
		  push @soft_cigar_line, $tmp;
		  $cnt = 1;
		  $prev_score       = $tscore;
		}
	  }

	  #handle last align length
	  $tmp = ($prev_score == 5) ? $cnt.'M' : $cnt.'m';
	  push @soft_cigar_line, $tmp;
	}



	#3' unaligned
	if($align_mismatch != $q_start){
	  #Add end mismatch if
	  #not accounted for by 5' mismatch
	  my $three_mismatch = ($q_length - $q_end);
	  push @soft_cigar_line, $three_mismatch.'m';

	  #Either add to end for +ve or subtract from ensembl start(which is end) for -ve
	  $t_end = ($t_strand eq '+') ? ($t_end + $three_mismatch) : ($t_start - $three_mismatch);
	}




	#Do we need to account for this?!!
	#i.e. do we need to swap the t_start t_end?
	#9381894 0 25 + ENSMUST00000039541 2799 2774 - 
	#We don't want -ve strand matches from transcripts?!
	#Only valid -ve strand matches will be not cross introns
	#So we are not interested in them for transcript mapping.
	#Only for genomic mapping

	if($total_mismatches <= $max_mismatches){

	  if($q_strand eq '+'){
		if($t_strand eq '+'){
		  $t_strand = 1;
		  #Alter start/end if we have flanking mismatches here or above?
		}
		elsif($t_strand eq '-'){
		  $t_strand = -1;

		  #Do we really want to do this?
		  #Are we interested in -ve strand exon hits?
		  #Isn't -ve strand transcription normally intronic?

		  #Yes but we may have an ST type array so we don't want to through these away.

		  ($t_start, $t_end) = reverse($t_start, $t_end);

		  #Reverse the cigar line
		  #As match scores are reported wrt to strand of hit
		  #And we want cigar line wrt to +ve strand of hit
		  @soft_cigar_line = reverse(@soft_cigar_line);
		}
		else{
		  throw "Unrecognised target strand symbol: $t_strand\n";
		}
	  }
	  elsif($q_strand eq '-'){

		throw('We have found a -ve query match');
		#Exonerate only reports +ve query strand matches??? Really?
		#Are we even interested in -ve query strand matches?

		#if($t_strand eq '-'){
		#  $t_strand = 1;
		#}elsif($t_strand eq '+'){
		#  $t_strand = -1;
		#}else{
		#  throw "unrecognised target strand symbol: $t_strand\n";
		#}
	  }else{    
		throw "unrecognised query strand symbol: $q_strand\n";
	  }

	  #print "Creating ProbeFeature \n";

	  push @features, new Bio::EnsEMBL::Funcgen::ProbeFeature
		(
		 #-probe => $probe,
		 -probe_id => $probe_id,
		 -start => $t_start,
		 -end => $t_end,
		 -strand => $t_strand,
		 -mismatchcount => $total_mismatches,
		 -cigar_line => join(':', @soft_cigar_line) || $match_length.'M',
		 -seqname => $t_id,
		);

	  # attach the slice name onto the feature: let the runnabledb
	  # sort out whether it's valid.
	  #$feature->seqname($t_id);
  

	}
	else{
      warn "Feature from probe :$probe_id doesnt match well enough\n";
    }
  }

  return\@ features;
}

#we need the full unmatched query length to calculat the mismatches!
#Can't assume will always be 25 or 50
#This should be calculated based on % id?

#This can now coderef'd in the analysis config hash
#Altho this obfuscates this module slighlty, it gather all element which need to be 
#configured/added into one place, making this truly generic/extendable

}

filter_method	No description	Code
filter_mismatches	No description	Code
make_feature	No description	Code
mapping_type	No description	Code
max_mismatches	No description	Code
new	No description	Code
parse_results	No description	Code