Raw content of Bio::EnsEMBL::Analysis::Tools::ClusterFilter.

Raw content of Bio::EnsEMBL::Analysis::Tools::ClusterFilter # Ensembl module for Bio::EnsEMBL::Analysis::Tools::ExonerateTranscriptFilter # # Copyright (c) 2004 Ensembl # =head1 NAME Bio::EnsEMBL::Analysis::Tools::ClusterFilter =head1 SYNOPSIS my $filter = new Bio::EnsEMBL::Analysis::Tools::ClusterFilter new->( -coverage => 80, -percent_id => 90, -best_in_genome => 1, ); my @filtered_results = @{$filter->filter_results(\@results)}; =head1 DESCRIPTION This is the standard module used for filtering WGA2GenesDirect transcripts. It takes all the transcript alignments riginated from one gene and cluster them together by location, it then selects the best cluster based in coverage and percent id of the transcripts within each cluster. It was originally used to avoid transcript from one gene to be allocated in different chromosomes in the projection process (which would have no sense). =cut package Bio::EnsEMBL::Analysis::Tools::ClusterFilter; use strict; use warnings; use Bio::EnsEMBL::Root; use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use vars qw (@ISA); @ISA = qw(Bio::EnsEMBL::Root); =head2 new Returntype: Bio::EnsEMBL::Analysis::Tools::ClusterFilter Exceptions: none Example : =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); &verbose('WARNING'); my ($min_coverage, $min_percent, $max_stops, $best_in_genome, ) = rearrange(['coverage', 'percent_id', 'max_stops', 'best_in_genome'], @args); ###################### #SETTING THE DEFAULTS# ###################### $self->min_coverage($min_coverage) if defined $min_coverage; $self->min_percent($min_percent) if defined $min_percent; $self->max_stops($max_stops) if defined $max_stops; $self->best_in_genome($best_in_genome) if $best_in_genome; return $self; } #filter methods =head2 filter_results Arg [1] : Bio::EnsEMBL::Analysis::Tools::ClusterFilter Arg [2] : arrayref of Trancripts Function : filter the given Transcruipts in the tried and trusted manner Returntype: arrayref Exceptions: throws if passed nothing or not an arrayref Example : =cut sub filter_results{ my ($self, $transcripts) = @_; # results are Bio::EnsEMBL::Transcripts with exons and supp_features my @good_matches; TRANSCRIPT: foreach my $transcript (@$transcripts ){ my $coverage = $self->_get_transcript_coverage($transcript); my $percent_id = $self->_get_transcript_percent_id($transcript); my $pep = $transcript->translate->seq; my $num_stops = $pep =~ tr/\*/\*/; if (defined $self->min_percent and $percent_id < $self->min_percent) { print "Transcript REJECTED with: perc_ident: $percent_id\n"; next TRANSCRIPT; } if (defined $self->min_coverage and $coverage < $self->min_coverage){ print "Transcript REJECTED with: coverage: $coverage\n"; next TRANSCRIPT; } if (defined $self->max_stops and $num_stops > $self->max_stop){ print "Rejecting proj due to too many STOPS ",$num_stops,"\n"; next TRANSCRIPT; } print "Transcripts passed filter (cov=$coverage, per=$percent_id, stops=$num_stops\n"; push @good_matches, $transcript; } my @best_matches; if ($self->best_in_genome == 1){ @good_matches = sort { $a->slice->seq_region_name cmp $b->slice->seq_region_name or $a->start <=> $b->start } @good_matches; my @c; foreach my $t (@good_matches) { if (not @c or $c[-1]->{name} ne $t->slice->seq_region_name or $c[-1]->{end} < $t->start) { push @c, { name => $t->slice->seq_region_name, start => $t->start, end => $t->end, trans => [$t], }; } else { push @{$c[-1]->{trans}}, $t; if ($t->{end} > $c[-1]->{end}) { $c[-1]->{end} = $t->end; } } } ##################### my ($best_c, $best_cov); foreach my $c (@c) { my @trans = @{$c->{trans}}; my ($total_cov, $total_pid); foreach my $t (@trans) { my ($sf) = @{$t->get_all_supporting_features}; my $cov = $sf->score; my $pid = $sf->percent_id; $total_cov += $cov; $total_pid += $pid; } $total_cov /= scalar(@trans); $total_pid /= scalar(@trans); $c->{average_coverage} = $total_cov; $c->{average_percent_id} = $total_pid; } @c = sort { $b->{average_coverage} <=> $a->{average_coverage} or $b->{average_percent_id} <=> $a->{average_percent_id} } @c; if (@c && @{$c[0]->{trans}}){ my $best_c = shift @c; push @best_matches, @{$best_c->{trans}}; foreach my $oc (@c) { if ($oc->{average_coverage} >= $best_c->{average_coverage} and $oc->{average_percent_id} >= $best_c->{average_percent_id}) { push @best_matches, @{$oc->{trans}}; } } } } return \@best_matches; } ############################################################ sub _get_transcript_coverage{ my ($self,$tran) = @_; if (@{$tran->get_all_supporting_features} and defined $tran->get_all_supporting_features->[0]->hcoverage) { my ($evi) = @{$tran->get_all_supporting_features}; return $evi->hcoverage; } else { my @exons = @{$tran->get_all_Exons}; my ($evi) = @{$exons[0]->get_all_supporting_features}; return $evi->score; } } ############################################################ sub _get_transcript_percent_id{ my ($self,$tran) = @_; my ($sf); if (@{$tran->get_all_supporting_features}) { ($sf) = @{$tran->get_all_supporting_features}; } else { my @exons = @{$tran->get_all_Exons}; ($sf) = @{$exons[0]->get_all_supporting_features}; } return $sf->percent_id; } ############################################################ sub _get_transcript_evidence_id{ my ($self,$tran) = @_; my ($sf); if (@{$tran->get_all_supporting_features}) { ($sf) = @{$tran->get_all_supporting_features}; } else { my @exons = @{$tran->get_all_Exons}; ($sf) = @{$exons[0]->get_all_supporting_features}; } return $sf->hseqname; } ############################################################ # containers sub min_coverage{ my $self = shift; $self->{'_min_coverage'} = shift if(@_); return exists($self->{'_min_coverage'}) ? $self->{'_min_coverage'} : undef; } sub min_percent{ my $self = shift; $self->{'_min_percent'} = shift if(@_); return exists($self->{'_min_percent'}) ? $self->{'_min_percent'} : undef; } sub max_stops{ my $self = shift; $self->{'_max_stops'} = shift if(@_); return exists($self->{'_max_stops'}) ? $self->{'_max_stops'} : undef; } sub best_in_genome{ my $self = shift; $self->{'_best_in_genome'} = shift if(@_); return exists($self->{'_best_in_genome'}) ? $self->{'_best_in_genome'} : undef; } 1;