Bio::EnsEMBL::Analysis::Tools CdnaUpdateTranscriptFilter
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
  Bio::EnsEMBL::Analysis::Tools::CdnaUpdateTranscriptFilter
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Root
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( verbose throw warning )
Inherit
Bio::EnsEMBL::Root
Synopsis
  my $filter = new Bio::EnsEMBL::Analysis::Tools::CdnaUpdateTranscriptFilter
new->(
-best_in_genome => 1,
-reject_processed_pseudos => 1,
-coverage => 80,
-percent_id => 90,
-verbosity => 0,
);
my @filtered_results = @{$filter->filter_results(\@results)};
Description
This is a module used for filtering Exonerate transcripts
Methods
_get_transcript_coverage
No description
Code
_get_transcript_evidence_id
No description
Code
_get_transcript_percent_id
No description
Code
_transcript_is_spliced
No description
Code
best_in_genome
No description
Code
filter_resultsDescriptionCode
min_coverage
No description
Code
min_percent
No description
Code
newDescriptionCode
reject_processed_pseudos
No description
Code
verbosity
No description
Code
Methods description
filter_resultscode    nextTop
  Arg [1]   : Bio::EnsEMBL::Analysis::Tools::DefaultExonerateFilter
Arg [2] : arrayref of Trancripts
Function : filter the given Transcripts in the tried and trusted manner
Returntype: arrayref
Exceptions: throws if passed nothing or not an arrayref
Example :
newcodeprevnextTop
  Returntype: Bio::EnsEMBL::Analysis::Tools::CdnaUpdateTranscriptFilter
Exceptions: none
Example :
Methods code
_get_transcript_coveragedescriptionprevnextTop
sub _get_transcript_coverage {
  my ($self,$tran) = @_;

  if (@{$tran->get_all_supporting_features} and
      defined $tran->get_all_supporting_features->[0]->hcoverage) {
    my ($evi) = @{$tran->get_all_supporting_features};
    return $evi->hcoverage;
  } else {
    my @exons = @{$tran->get_all_Exons};
    my ($evi) = @{$exons[0]->get_all_supporting_features};
    return $evi->score;
  }
}

############################################################
}
_get_transcript_evidence_iddescriptionprevnextTop
sub _get_transcript_evidence_id {
  my ($self,$tran) = @_;

  my ($sf);

  if (@{$tran->get_all_supporting_features}) {
    ($sf) = @{$tran->get_all_supporting_features};
  } else {
    my @exons = @{$tran->get_all_Exons};
    ($sf) = @{$exons[0]->get_all_supporting_features};    
  }
  
  return $sf->hseqname;
}

############################################################
}
_get_transcript_percent_iddescriptionprevnextTop
sub _get_transcript_percent_id {
  my ($self,$tran) = @_;

  my ($sf);

  if (@{$tran->get_all_supporting_features}) {
    ($sf) = @{$tran->get_all_supporting_features};
  } else {
    my @exons = @{$tran->get_all_Exons};
    ($sf) = @{$exons[0]->get_all_supporting_features};    
  }

  return $sf->percent_id;
}

############################################################
}
_transcript_is_spliceddescriptionprevnextTop
sub _transcript_is_spliced {
  my ($self, $tran) = @_;

  my @exons = sort { $a->start <=> $b->start } @{$tran->get_all_Exons};

  if ( scalar (@exons) > 1 ){    
    # check that there are non "frameshift" introns
for(my $i=0; $i < @exons - 1; $i++){ my $intron_len = $exons[$i+1]->start - $exons[$i]->end - 1; if ( $intron_len > 9 ){ return 1; } } } return 0; } # containers
}
best_in_genomedescriptionprevnextTop
sub best_in_genome {
  my $self = shift;
  $self->{'_best_in_genome'} = shift if(@_);

  return exists($self->{'_best_in_genome'}) ? $self->{'_best_in_genome'} : 0;
}
filter_resultsdescriptionprevnextTop
sub filter_results {
  my ($self, $transcripts) = @_;
  # results are Bio::EnsEMBL::Transcripts with exons and supp_features
my @good_matches; my %matches; my $printing = $self->verbosity; TRAN: foreach my $transcript (@$transcripts ){ my $coverage = $self->_get_transcript_coverage($transcript); my $percent_id = $self->_get_transcript_percent_id($transcript); #identify the longest intron
my @exons = @{$transcript->get_all_Exons()}; my $num_exons = scalar @exons; my @sorted_exons = sort {$a->start <=> $b->start} @exons; my $max_intron = 0; my $short_intron = 0; #flag to check that not all introns are long
for my $a (0 .. $#sorted_exons - 1){ my $intron_len = ($sorted_exons[$a+1]->start - $sorted_exons[$a]->end) - 1; if ($intron_len > $max_intron){ $max_intron = $intron_len; } if ($intron_len < 250000){ $short_intron = 1; } } my $id = $self->_get_transcript_evidence_id($transcript); push @{$matches{$id}}, { transcript => $transcript, coverage => $coverage, percent_id => $percent_id, num_exons => $num_exons, is_spliced => $self->_transcript_is_spliced($transcript), max_intron => $max_intron, short_intron => $short_intron, }; } my %matches_sorted_by_coverage; my %selected_matches; QUERY: foreach my $query_id ( keys( %matches ) ){ @{$matches_sorted_by_coverage{$query_id}} = sort { $b->{coverage} <=> $a->{coverage} or $b->{num_exons} <=> $a->{num_exons} or $b->{percent_id} <=> $a->{percent_id} } @{$matches{$query_id}}; my $max_coverage; my $perc_id_of_best; my $count = 0; my $splices_elsewhere = 0; my $best_has_been_seen = 0; #print STDERR "####################\n";
#print STDERR "Matches for $query_id:\n";
TRANSCRIPT: foreach my $hit ( @{$matches_sorted_by_coverage{$query_id}} ){ my $coverage = $hit->{coverage}; my $percent_id = $hit->{percent_id}; my $is_spliced = $hit->{is_spliced}; unless ($max_coverage){ $max_coverage = $coverage; } unless ( $perc_id_of_best ){ $perc_id_of_best = $percent_id; } #sd3
#single exon genes (ie mouse olfactory receptors) being thrown out in favour of
#low percentage id multi-exon genes of equivalent coverage
#this loop checks for high quality matches in multi-exon hits before the splices_elsewhere flag is set
#so if have good coverage & percentage id:
if ( (($coverage >= $self->min_coverage && $percent_id >= $self->min_percent) #or have high coverage & a slightly lower percentage id
|| ($coverage >= (1 + 5/100) * $self->min_coverage &&
$percent_id >= (1 - 3/100) * $self->min_percent)) && $is_spliced){ $splices_elsewhere = 1; last; } } foreach my $hit ( @{$matches_sorted_by_coverage{$query_id}} ){ $count++; my ($accept, $label); my $transcript = $hit->{transcript}; my $strand = $transcript->strand; my $coverage = $hit->{coverage}; my $percent_id = $hit->{percent_id}; my $is_spliced = $hit->{is_spliced}; my $max_intron = $hit->{max_intron}; my $short_intron = $hit->{short_intron}; my $num_exons = $hit->{num_exons}; if ( $count == 1 ){ $label = 'best_match'; } elsif ( $count > 1 && $splices_elsewhere && ! $is_spliced) { $label = 'potential_processed_pseudogene'; } else{ $label = $count; } # if ( $count == 1 && $is_spliced ){ #old way of doing it
# $splices_elsewhere = 1;
# }
if ( $self->best_in_genome ){ # we keep the hit with the best coverage...
if ($coverage == $max_coverage && # as long as it has coverage/percent_id above limits or...
(($coverage >= $self->min_coverage && $percent_id >= $self->min_percent) || # ...if coverage is significantly greater than the
# specified minimum, then we are willing to accept
# hits that have a percent_id just below the specified
# minimum
($coverage >= (1 + 5/100) * $self->min_coverage &&
$percent_id >= (1 - 3/100) * $self->min_percent))) { if ( $self->reject_processed_pseudos && $count > 1 && $splices_elsewhere && ! $is_spliced) { $accept = 'NO'; if ($printing){ print "rpp $query_id\n"; } } elsif (($short_intron == 0) && ($num_exons > 1)){ #all long introns
$accept = 'NO'; if ($printing){ print "only long introns $query_id\n"; } } #Only accept long intron hits with very high coverage and percent_id
elsif($max_intron > 250000 ){ if (($coverage >= 98) && ($percent_id >= 98)){ $accept = 'YES'; push( @good_matches, $transcript); #print "accept: intron $max_intron coverage $coverage \%id $percent_id $query_id\n";
#find out which introns are long - ie are they the first and last introns?
# my @exons = @{$transcript->get_all_Exons()};
# my @sorted_exons = sort {$a->start <=> $b->start} @exons;
# my $num_introns = scalar @exons - 1;
#
# for my $a (0 .. $#sorted_exons - 1){
# my $intron_len = ($sorted_exons[$a+1]->start - $sorted_exons[$a]->end) - 1;
# if ($intron_len > 250000){
# print "intron position ".($a + 1)." out of $num_introns\n";
# }
# }
}else{ $accept = 'NO'; if ($printing){ print "reject: intron $max_intron coverage $coverage\% id $percent_id $query_id\n"; } } }else{ $accept = 'YES'; push( @good_matches, $transcript); } } else{ $accept = 'NO'; if ($printing){ print "max_coverage $max_coverage coverage $coverage\% id $percent_id $query_id\n"; } } } else{ # we keep anything which is within the 2% of the best score...
if ($coverage >= (0.98 * $max_coverage) && # as long as it has coverage/percent_id above limits or...
(($coverage >= $self->min_coverage && $percent_id >= $self->min_percent) || # ...if coverage is significantly greater than the
# specified minimum, then we are willing to accept
# hits that have a percent_id just below the specified
# minimum
($coverage >= (1 + 5/100) * $self->min_coverage &&
$percent_id >= (1 - 3/100) * $self->min_percent))) { ############################################################
# non-best matches are kept only if they are not unspliced with the
# best match being spliced - otherwise they could be processed pseudogenes
if ( $self->reject_processed_pseudos && $count > 1 && $splices_elsewhere && ! $is_spliced) { $accept = 'NO'; } else{ $accept = 'YES'; push( @good_matches, $transcript); } } else{ $accept = 'NO'; } } } } return\@ good_matches; } ############################################################
}
min_coveragedescriptionprevnextTop
sub min_coverage {
  my $self = shift;
  $self->{'_min_coverage'} = shift if(@_);

  return exists($self->{'_min_coverage'}) ? $self->{'_min_coverage'} : undef;
}
min_percentdescriptionprevnextTop
sub min_percent {
  my $self = shift;
  $self->{'_min_percent'} = shift if(@_);

  return exists($self->{'_min_percent'}) ? $self->{'_min_percent'} : undef;
}
newdescriptionprevnextTop
sub new {
  my ($class,@args) = @_;
  my $self = $class->SUPER::new(@args);
  &verbose('WARNING');

  my ($min_coverage,
      $min_percent,
      $best_in_genome,
      $rpp,
	  $verbosity) = 
        rearrange([
                   'COVERAGE',
                   'PERCENT_ID', 
                   'BEST_IN_GENOME',
                   'REJECT_PROCESSED_PSEUDOS',
				   'VERBOSITY',], @args); 

  ######################
#SETTING THE DEFAULTS#
######################
$self->min_coverage($min_coverage) if defined $min_coverage; $self->min_percent($min_percent) if defined $min_percent; $self->best_in_genome($best_in_genome) if defined $best_in_genome; $self->reject_processed_pseudos($rpp) if defined $rpp; $self->verbosity($verbosity) if defined $verbosity; return $self; } #filter methods
}
reject_processed_pseudosdescriptionprevnextTop
sub reject_processed_pseudos {
  my $self = shift;
  $self->{'_reject_processed_pseudos'} = shift if(@_);

  return exists($self->{'_reject_processed_pseudos'}) ? $self->{'_reject_processed_pseudos'} : 0;
}
verbositydescriptionprevnextTop
sub verbosity {
  my $self = shift;
  $self->{'_verbosity'} = shift if(@_);

  return exists($self->{'_verbosity'}) ? $self->{'_verbosity'} : 0;
}


1;
}
General documentation
No general documentation available.