CdnaUpdateTranscriptFilter documentation.

sub filter_results {
  my ($self, $transcripts) = @_;
  # results are Bio::EnsEMBL::Transcripts with exons and supp_features
  my @good_matches;  
  my %matches;
  my $printing = $self->verbosity;

TRAN:
  
  foreach my $transcript (@$transcripts ){
    my $coverage  = $self->_get_transcript_coverage($transcript);
    my $percent_id  = $self->_get_transcript_percent_id($transcript);

	#identify the longest intron
	my @exons = @{$transcript->get_all_Exons()};
	my $num_exons = scalar @exons;
	my @sorted_exons = sort {$a->start <=> $b->start} @exons;
	my $max_intron = 0;
	my $short_intron = 0; #flag to check that not all introns are long
	
	for my $a (0 .. $#sorted_exons - 1){
		my $intron_len = ($sorted_exons[$a+1]->start -  $sorted_exons[$a]->end) - 1;
		if ($intron_len > $max_intron){
			$max_intron = $intron_len;
		}
		if ($intron_len < 250000){
			$short_intron = 1;
		}
	}

    my $id = $self->_get_transcript_evidence_id($transcript);
    push @{$matches{$id}}, {
      transcript  => $transcript,
      coverage    => $coverage,
      percent_id  => $percent_id,
      num_exons   => $num_exons,
      is_spliced  => $self->_transcript_is_spliced($transcript),
	  max_intron  => $max_intron,
	  short_intron => $short_intron,
    };
  }
  
  my %matches_sorted_by_coverage;
  my %selected_matches;
 
 QUERY:
  foreach my $query_id ( keys( %matches ) ){
	@{$matches_sorted_by_coverage{$query_id}} = 
        sort { $b->{coverage}   <=> $a->{coverage} or
               $b->{num_exons}  <=> $a->{num_exons} or
               $b->{percent_id} <=> $a->{percent_id} } @{$matches{$query_id}};

    
    my $max_coverage;
    my $perc_id_of_best;
    my $count = 0;
    my $splices_elsewhere = 0;
    my $best_has_been_seen = 0;
    
    #print STDERR "####################\n";
    #print STDERR "Matches for $query_id:\n";
    
  TRANSCRIPT:
    
    foreach my $hit ( @{$matches_sorted_by_coverage{$query_id}} ){
	  my $coverage = $hit->{coverage};
      my $percent_id = $hit->{percent_id};
      my $is_spliced = $hit->{is_spliced};
	
      unless ($max_coverage){
        $max_coverage = $coverage;
      }
      unless ( $perc_id_of_best ){
		$perc_id_of_best = $percent_id;
      }
	
      #sd3
      #single exon genes (ie mouse olfactory receptors) being thrown out in favour of 
      #low percentage id multi-exon genes of equivalent coverage
      #this loop checks for high quality matches in multi-exon hits before the splices_elsewhere flag is set
      
      #so if have good coverage & percentage id:
      if ( (($coverage >= $self->min_coverage && $percent_id >= $self->min_percent) 
      
        #or have high coverage & a slightly lower percentage id
        || ($coverage   >= (1 + 5/100) * $self->min_coverage &&
            $percent_id >= (1 - 3/100) * $self->min_percent)) 
	    && $is_spliced){
        
	$splices_elsewhere = 1;
	
	last;
      }
      
    }	
    
    foreach my $hit ( @{$matches_sorted_by_coverage{$query_id}} ){
      $count++;

      my ($accept, $label);      
      my $transcript = $hit->{transcript};
      my $strand = $transcript->strand;
      my $coverage = $hit->{coverage};
      my $percent_id = $hit->{percent_id};
      my $is_spliced = $hit->{is_spliced};
	  my $max_intron = $hit->{max_intron};
	  my $short_intron = $hit->{short_intron};
	  my $num_exons = $hit->{num_exons};
     
      if ( $count == 1 ){
		$label = 'best_match';
      } elsif ( $count > 1 && 
        	$splices_elsewhere && 
            ! $is_spliced) {
		$label = 'potential_processed_pseudogene';
      } else{
		$label = $count;
      }
      
#      if ( $count == 1 && $is_spliced ){ #old way of doing it
#	     $splices_elsewhere = 1;
#      }
	
     
  if ( $self->best_in_genome ){
    # we keep the hit with the best coverage...
	  if ($coverage == $max_coverage &&
              # as long as it has coverage/percent_id above limits or...
              (($coverage >= $self->min_coverage && 
            	$percent_id >= $self->min_percent)
               ||
               # ...if coverage is significantly greater than the
               # specified minimum, then we are willing to accept
               # hits that have a percent_id just below the specified
               # minimum
               ($coverage   >= (1 + 5/100) * $self->min_coverage &&
            	$percent_id >= (1 - 3/100) * $self->min_percent))) { 

		if ( $self->reject_processed_pseudos
	    	 && $count > 1 
	    	 && $splices_elsewhere 
	    	 && ! $is_spliced) {
	      $accept = 'NO';
	      if ($printing){
		  	 print  "rpp $query_id\n"; 
		  }
		}
		elsif (($short_intron == 0) && ($num_exons > 1)){
			#all long introns
			$accept = 'NO';
			if ($printing){
			  print "only long introns $query_id\n";
			}
		}
		#Only accept long intron hits with very high coverage and percent_id
		elsif($max_intron > 250000 ){ 
			if (($coverage >= 98) && ($percent_id >= 98)){
				$accept = 'YES';
				push( @good_matches, $transcript);
				
				#print "accept: intron $max_intron coverage $coverage \%id $percent_id $query_id\n"; 
				
				#find out which introns are long - ie are they the first and last introns?
#				my @exons = @{$transcript->get_all_Exons()};
#				my @sorted_exons = sort {$a->start <=> $b->start} @exons;
#				my $num_introns = scalar @exons - 1;
#
#				for my $a (0 .. $#sorted_exons - 1){
#					my $intron_len = ($sorted_exons[$a+1]->start -  $sorted_exons[$a]->end) - 1;
#					if ($intron_len > 250000){
#						print "intron position ".($a + 1)." out of $num_introns\n"; 		
#					}
#				}
				
			}else{
				$accept = 'NO';
				if ($printing){
				  print "reject: intron $max_intron coverage $coverage\% id $percent_id $query_id\n";
				}
			}	
		}else{
	      $accept = 'YES';
	      push( @good_matches, $transcript);

		}
	  }
	  else{
		$accept = 'NO';
 		if ($printing){
		  print "max_coverage $max_coverage coverage $coverage\% id $percent_id $query_id\n"; 
		}

	  }

  }
  else{
        # we keep anything which is within the 2% of the best score...
	if ($coverage >= (0.98 * $max_coverage) && 
            # as long as it has coverage/percent_id above limits or...
            (($coverage >= $self->min_coverage && 
              $percent_id >= $self->min_percent)
             ||              
             # ...if coverage is significantly greater than the
             # specified minimum, then we are willing to accept
             # hits that have a percent_id just below the specified
             # minimum
             ($coverage   >= (1 + 5/100) * $self->min_coverage &&
              $percent_id >= (1 - 3/100) * $self->min_percent))) {
          
          
	  ############################################################
	  # non-best matches are kept only if they are not unspliced with the
	  # best match being spliced - otherwise they could be processed pseudogenes
	  if ( $self->reject_processed_pseudos &&
	       $count > 1 &&
	       $splices_elsewhere &&
	       ! $is_spliced) {
	    $accept = 'NO';
	  }
	  else{
	    $accept = 'YES';
	    push( @good_matches, $transcript);
	  }
	}
	else{
	  $accept = 'NO';
	}
      }
    }
  }
  return\@ good_matches;

}

############################################################
}
_get_transcript_coverage	No description	Code
_get_transcript_evidence_id	No description	Code
_get_transcript_percent_id	No description	Code
_transcript_is_spliced	No description	Code
best_in_genome	No description	Code
filter_results	Description	Code
min_coverage	No description	Code
min_percent	No description	Code
new	Description	Code
reject_processed_pseudos	No description	Code
verbosity	No description	Code