ExonerateTranscriptFilter documentation.

  Bio::EnsEMBL::Analysis::Tools::ExonerateTranscriptFilter

No package variables defined.

Bio::EnsEMBL::Root

Bio::EnsEMBL::Utils::Argument qw ( rearrange )

Bio::EnsEMBL::Utils::Exception qw ( verbose throw warning )

Data::Dumper

  my $filter = new Bio::EnsEMBL::Analysis::Tools::ExonerateTranscriptFilter
  new->(
        -best_in_genome => 1,
        -reject_processed_pseudos => 1,
        -coverage => 80,
        -percent_id => 90,
       );

  my @filtered_results = @{$filter->filter_results(\@results)};

This is the standard module used for filtering Exonerate transcripts

sub _get_transcript_coverage {

  my ($self,$tran) = @_;

  if (@{$tran->get_all_supporting_features} and
      defined $tran->get_all_supporting_features->[0]->hcoverage) {
    my ($evi) = @{$tran->get_all_supporting_features};
    return $evi->hcoverage;
  } else {
    my @exons = @{$tran->get_all_Exons};
    my ($evi) = @{$exons[0]->get_all_supporting_features};
    return $evi->score;
  }
}

############################################################

}

sub _get_transcript_evidence_id {

  my ($self,$tran) = @_;

  my ($sf);

  if (@{$tran->get_all_supporting_features}) {
    ($sf) = @{$tran->get_all_supporting_features};
  } else {
    my @exons = @{$tran->get_all_Exons};
    ($sf) = @{$exons[0]->get_all_supporting_features};    
  }
  
  return $sf->hseqname;
}

############################################################

}

sub _get_transcript_percent_id {

  my ($self,$tran) = @_;

  my ($sf);

  if (@{$tran->get_all_supporting_features}) {
    ($sf) = @{$tran->get_all_supporting_features};
  } else {
    my @exons = @{$tran->get_all_Exons};
    ($sf) = @{$exons[0]->get_all_supporting_features};    
  }

  return $sf->percent_id;
}

############################################################

}

sub _transcript_is_spliced {

  my ($self, $tran) = @_;

  my @exons = sort { $a->start <=> $b->start } @{$tran->get_all_Exons};

  if ( scalar (@exons) > 1 ){    
    # check that there are non "frameshift" introns
    for(my $i=0; $i < @exons - 1; $i++){
      my $intron_len = $exons[$i+1]->start - $exons[$i]->end - 1;
      if ( $intron_len > 9 ){
        return 1;
      }
    }
  }

  return 0;
}


# containers

}

sub best_in_genome {

  my $self = shift;
  $self->{'_best_in_genome'} = shift if(@_);

  return exists($self->{'_best_in_genome'}) ? $self->{'_best_in_genome'} : 0;

}

sub filter_results {

  my ($self, $transcripts) = @_;
  
  # results are Bio::EnsEMBL::Transcripts with exons and supp_features
  
  my @good_matches;

  my %matches;

TRAN:
  foreach my $transcript (@$transcripts ){
    my $coverage  = $self->_get_transcript_coverage($transcript);
    my $percent_id  = $self->_get_transcript_percent_id($transcript);


    my $id = $self->_get_transcript_evidence_id($transcript);
    push @{$matches{$id}}, {
      transcript  => $transcript,
      coverage    => $coverage,
      percent_id  => $percent_id,
      num_exons   => scalar(@{$transcript->get_all_Exons}),
      is_spliced  => $self->_transcript_is_spliced($transcript),
    };
  }
  
  my %matches_sorted_by_coverage;
  my %selected_matches;
 
 QUERY:
  foreach my $query_id ( keys( %matches ) ){
    @{$matches_sorted_by_coverage{$query_id}} = 
        sort { $b->{coverage}   <=> $a->{coverage} or
               $b->{num_exons}  <=> $a->{num_exons} or
               $b->{percent_id} <=> $a->{percent_id} } @{$matches{$query_id}};

    my $max_coverage;
    my $perc_id_of_best;
    my $count = 0;
    my $splices_elsewhere = 0;
    my $best_has_been_seen = 0;
    
    #print STDERR "####################\n";
    #print STDERR "Matches for $query_id:\n";

  #get the slice_name, start, and end of the best hit and store them to be used
  #in checking of any other good hit may be overlaping the best prediction
  # This is to avoid a match that spans over a region where two or more proteins of
  # a same family are close together and they get merged by a wrong "good quallity" alignment
    my $best_transcript = ${$matches_sorted_by_coverage{$query_id}}[0]->{transcript};
    my $best_start = $best_transcript->start;
    my $best_end = $best_transcript->end;
    my $best_slice;
    if (!defined $best_transcript->slice) {
      $best_slice = $best_transcript->start_Exon->seqname;
    } else {
      $best_slice = $best_transcript->slice;
    }
    
  TRANSCRIPT:
    foreach my $hit ( @{$matches_sorted_by_coverage{$query_id}} ){
      $count++;

      my ($accept, $label);      
      my $transcript = $hit->{transcript};
      my $strand = $transcript->strand;
      my $coverage = $hit->{coverage};
      my $percent_id = $hit->{percent_id};
      my $is_spliced = $hit->{is_spliced};

      my $transcript_slice;
      if (!defined $transcript->slice) {
        $transcript_slice = $transcript->start_Exon->seqname;
      } else {
        $transcript_slice = $transcript->slice; 
      }

      unless ($max_coverage){
        $max_coverage = $coverage;
      }
      unless ( $perc_id_of_best ){
	$perc_id_of_best = $percent_id;
      }

      if ( $count == 1 ){
	$label = 'best_match';
      } elsif ( $count > 1 && 
                $splices_elsewhere && 
                ! $is_spliced) {
	$label = 'potential_processed_pseudogene';
      } else{
	$label = $count;
      }

      if ( $count == 1 && $is_spliced ){
	$splices_elsewhere = 1;
      }

      if ( $self->best_in_genome ){
        # we keep the hit with the best coverage...
	if ($coverage == $max_coverage &&
            # as long as it has coverage/percent_id above limits or...
            (($coverage >= $self->min_coverage && 
              $percent_id >= $self->min_percent)
             ||
             # ...if coverage is significanly greater than the
             # specified minimum, then we are willing to accept
             # hits that have a percent_id just below the specified
             # minimum
             ($coverage   >= (1 + 5/100) * $self->min_coverage &&
              $percent_id >= (1 - 3/100) * $self->min_percent))) { 
	  if ( $self->reject_processed_pseudos
	       && $count > 1 
	       && $splices_elsewhere 
	       && ! $is_spliced) {
	    $accept = 'NO';
	  }
          # ... if one transcript with lower quality completely overlaps
          # the best one don't accept the lower quality one.
          elsif ($best_slice eq $transcript_slice &&
                 $best_start > $transcript->start &&
                 $best_end   < $transcript->end){
            $accept = 'NO';
          }
	  else {
	    $accept = 'YES';
	    push( @good_matches, $transcript);
	  }
	}
	else{
	  $accept = 'NO';
	}
	
      }
      else{
        # we keep anything which is within the 2% of the best score...
	if ($coverage >= (0.98 * $max_coverage) && 
            # as long as it has coverage/percent_id above limits or...
            (($coverage >= $self->min_coverage && 
              $percent_id >= $self->min_percent)
             ||              
             # ...if coverage is significanly greater than the
             # specified minimum, then we are willing to accept
             # hits that have a percent_id just below the specified
             # minimum
             ($coverage   >= (1 + 5/100) * $self->min_coverage &&
              $percent_id >= (1 - 3/100) * $self->min_percent))) {
          
          
	  ############################################################
	  # non-best matches are kept only if they are not unspliced with the
	  # best match being spliced - otherwise they could be processed pseudogenes
	  if ( $self->reject_processed_pseudos &&
	       $count > 1 &&
	       $splices_elsewhere &&
	       ! $is_spliced) {
	    $accept = 'NO';
	  }
	  else{
	    $accept = 'YES';
	    push( @good_matches, $transcript);
	  }
	}
	else{
	  $accept = 'NO';
	}
      }
    }
  }
  
  return\@ good_matches;

}
 
############################################################

}

sub min_coverage {

  my $self = shift;
  $self->{'_min_coverage'} = shift if(@_);

  return exists($self->{'_min_coverage'}) ? $self->{'_min_coverage'} : undef;

}

sub min_percent {

  my $self = shift;
  $self->{'_min_percent'} = shift if(@_);

  return exists($self->{'_min_percent'}) ? $self->{'_min_percent'} : undef;

}

sub new {

  my ($class,@args) = @_;
  my $self = $class->SUPER::new(@args);
  &verbose('WARNING');
  my ($min_coverage,
      $min_percent,
      $best_in_genome,
      $rpp) = 
        rearrange([
                   'COVERAGE',
                   'PERCENT_ID', 
                   'BEST_IN_GENOME',
                   'REJECT_PROCESSED_PSEUDOS',], @args); 

  ######################
  #SETTING THE DEFAULTS#
  ######################

  if (defined ($min_coverage)) {
    $self->min_coverage($min_coverage);
  } elsif ( !defined($self->min_coverage) ) {
    warn("\n\tmin_coverage not set, setting it to zero (0)!\n\n$!");
    $self->min_coverage(0);
  }
  if (defined ($min_percent)) {
    $self->min_percent($min_percent);
  } elsif (!defined($self->min_percent)) {
    warn("\n\tmin_percent not set, setting it to zero (0)!\n\n$!");
    $self->min_percent(0);
  }
  if (defined ($best_in_genome)) {
    $self->best_in_genome($best_in_genome);
  } elsif (!defined($self->best_in_genome)) {
    warn("\n\tbest_in_genome not set, setting it to one (1)!\n\n$!");
    $self->best_in_genome(1);
  }
  if (defined ($rpp)) {
    $self->reject_processed_pseudos($rpp);
  } elsif (!defined($self->reject_processed_pseudos)) {
    warn("\n\treject_processed_pseudos, setting it to one (1)!\n\n$!");
    $self->reject_processed_pseudos(1);
  }

  return $self;
}


#filter methods

}

sub reject_processed_pseudos {

  my $self = shift;
  $self->{'_reject_processed_pseudos'} = shift if(@_);

  return exists($self->{'_reject_processed_pseudos'}) ? $self->{'_reject_processed_pseudos'} : 0;
}



1;

}

General documentation

_get_transcript_coverage	No description	Code
_get_transcript_evidence_id	No description	Code
_get_transcript_percent_id	No description	Code
_transcript_is_spliced	No description	Code
best_in_genome	No description	Code
filter_results	Description	Code
min_coverage	No description	Code
min_percent	No description	Code
new	Description	Code
reject_processed_pseudos	No description	Code