Raw content of Bio::EnsEMBL::Analysis::Tools::Filter::BlastMiniGenewise ## # # Cared for by Ensembl <ensembl-dev@ebi.ac.uk> # # Copyright GRL & EBI # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Analysis::Tools::Filter::BlastMiniGenewise =head1 SYNOPSIS my $obj = Bio::EnsEMBL::Analysis::Tools::Filter::BlastMiniGenewise-> new( -max_exon_length => '20000', -multi_exon_min_coverage => '25', -single_exon_min_coverage => '80', -max_intron_length => '200000', -min_split_coverage => 95, -max_low_complexity => 101, ); my ($accepted, $rejected) = $obj->filter_genes($genes); =head1 DESCRIPTION This module is designed to assess gene structures on several factors including coverage of evidence, size of introns and presence of complete ORF The limits of these variables are defined in the constructure The filter method then return 2 sets of genes, the first is the accepted set the second is the rejected set =head1 CONTACT ensembl-dev@ebi.ac.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::EnsEMBL::Analysis::Tools::Filter::BlastMiniGenewise; use strict; use warnings; use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw(convert_to_genes are_strands_consistent are_phases_consistent is_not_folded has_no_unwanted_evidence all_exons_are_valid intron_lengths_all_less_than_maximum list_evidence evidence_coverage_greater_than_minimum Transcript_info split_Transcript tidy_split_transcripts evidence_coverage); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ConfigDependent::TranscriptUtils qw(low_complexity_less_than_maximum) ; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils qw(exon_length_less_than_maximum); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::GeneUtils qw(clone_Gene); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils qw(id coord_string lies_inside_of_slice); use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranslationUtils qw(validate_Translation_coords contains_internal_stops print_Translation print_peptide); use Bio::EnsEMBL::Analysis::Tools::Logger qw(logger_info); use Bio::EnsEMBL::Attribute; use vars qw (@ISA); @ISA = qw(); =head2 new Arg [1] : Bio::EnsEMBL::Analysis::Tools::Filter::BlastMiniGenewise Function : to create a BlastMiniGenewise filter object Returntype: Bio::EnsEMBL::Analysis::Tools::Filter::BlastMiniGenewise Exceptions: there are several args which musst be defined otherwise it will throw Example : =cut sub new{ my ($class,@args) = @_; my $self = bless {},$class; my ($max_exon_length, $split_genes, $slice, $unwanted_evidence, $max_intron_length, $min_coverage, $max_low_complexity, $seqfetcher, $single_exon_min_coverage, $min_split_coverage) = rearrange([qw(MAX_EXON_LENGTH SPLIT_GENES SLICE UNWANTED_EVIDENCE MAX_INTRON_LENGTH MULTI_EXON_MIN_COVERAGE MAX_LOW_COMPLEXITY SEQFETCHER SINGLE_EXON_MIN_COVERAGE MIN_SPLIT_COVERAGE)], @args); #Setting defaults #$self->max_exon_length(20000); # $self->split_multi_transcript_genes(0); # $self->max_intron_length(200000); # $self->min_coverage(25); # $self->min_split_coverage(95); # $self->single_exon_min_coverage(80); # $self->max_low_complexity(101); #decided against setting defaults to avoid accidentally using the wrong #value ################# $self->max_exon_length($max_exon_length); $self->split_multi_transcript_genes($split_genes); $self->slice($slice); $self->unwanted_evidence($unwanted_evidence); $self->min_coverage($min_coverage); $self->single_exon_min_coverage($single_exon_min_coverage); $self->max_low_complexity($max_low_complexity); $self->min_split_coverage($min_split_coverage); $self->seqfetcher($seqfetcher); $self->max_intron_length($max_intron_length); throw("Filter::BlastMiniGenewise needs a seqfetcher object to run") if(!$self->seqfetcher); throw("Filter::BlastMiniGenewise needs a max exon length ") if(! defined($self->max_exon_length)); throw("Filter::BlastMiniGenewise needs a max intron length ") if(! defined($self->max_intron_length)); throw("Filter::BlastMiniGenewise needs a multi exon min coverage") if(! defined($self->min_coverage)); throw("Filter::BlastMiniGenewise needs a single exon min coverage") if(!defined($self->single_exon_min_coverage)); throw("Filter::BlastMiniGenewise needs a max low complexity value ") if(!defined($self->max_low_complexity)); throw("Filter::BlastMiniGenewise needs a min split coverage") if(! defined($self->min_split_coverage)); return $self; } #accessor methods =head2 accessor methods Arg [1] : Bio::EnsEMBL::Analysis::Tools::Filter::BlastMiniGenewise Arg [2] : int/hash ref/array ref/object Function : to store the given value Returntype: the give value Exceptions: some with throw if given the wrong thing Example : =cut sub max_exon_length{ my ($self, $arg) = @_; if(defined $arg){ $self->{max_exon_length} = $arg; } return $self->{max_exon_length}; } sub max_intron_length{ my ($self, $arg) = @_; if(defined $arg){ $self->{max_intron_length} = $arg; } return $self->{max_intron_length}; } sub max_low_complexity{ my ($self, $arg) = @_; if(defined $arg){ $self->{max_low_complexity} = $arg; } return $self->{max_low_complexity}; } sub min_coverage{ my ($self, $arg) = @_; if(defined $arg){ $self->{min_coverage} = $arg; } return $self->{min_coverage}; } sub min_split_coverage{ my ($self, $arg) = @_; if(defined $arg){ $self->{min_split_coverage} = $arg; } return $self->{min_split_coverage}; } sub single_exon_min_coverage{ my ($self, $arg) = @_; if(defined $arg){ $self->{single_exon_min_coverage} = $arg; } return $self->{single_exon_min_coverage}; } sub split_multi_transcript_genes{ my ($self, $arg) = @_; if(defined $arg){ $self->{split_multi_transcript_gene} = $arg; } return $self->{split_multi_transcript_gene}; } sub slice{ my ($self, $arg) = @_; if(defined $arg){ throw($arg." should be a Bio::EnsEMBL::Slice") unless($arg->isa("Bio::EnsEMBL::Slice")); $self->{slice} = $arg; } return $self->{slice}; } sub unwanted_evidence{ my ($self, $arg) = @_; if(defined $arg){ throw($arg." should be a hashref") unless(ref($arg) eq "HASH"); $self->{unwanted_evidence} = $arg; } return $self->{unwanted_evidence}; } sub seqfetcher{ my ($self, $arg) = @_; if($arg){ throw("Filter::BlastMiniGenewise ". $arg." must have a method get_Seq_by_acc") unless($arg->can("get_Seq_by_acc")); $self->{seqfetcher} = $arg; } return $self->{seqfetcher}; } sub accepted_genes{ my ($self, $arg) = @_; $self->{accepted_genes} = [] if(!$self->{accepted_genes}); if($arg){ my $temp; if(ref($arg) ne "ARRAY"){ throw($arg." must be a Bio::EnsEMBL::Gene object") unless($arg->isa("Bio::EnsEMBL::Gene")); $temp = [$arg]; }else{ $temp = $arg; } push(@{$self->{accepted_genes}}, @$temp); } return $self->{accepted_genes}; } sub rejected_genes{ my ($self, $arg) = @_; $self->{rejected_genes} = [] if(!$self->{rejected_genes}); if($arg){ my $temp; if(ref($arg) ne "ARRAY"){ throw($arg." must be a Bio::EnsEMBL::Gene object") unless($arg->isa("Bio::EnsEMBL::Gene")); $temp = [$arg]; }else{ $temp = $arg; } push(@{$self->{rejected_genes}}, @$temp); } return $self->{rejected_genes}; } sub split_genes{ my ($self, $arg) = @_; $self->{split_genes} = [] if(!$self->{split_genes}); if($arg){ my $temp; if(ref($arg) ne "ARRAY"){ throw($arg." must be a Bio::EnsEMBL::Gene object") unless($arg->isa("Bio::EnsEMBL::Gene")); $temp = [$arg]; }else{ throw($arg->[0]." must be a Bio::EnsEMBL::Gene object") unless($arg->[0]->isa("Bio::EnsEMBL::Gene")); $temp = $arg; } push(@{$self->{split_genes}}, @$temp); } return $self->{split_genes}; } =head2 filter_genes Arg [1] : Bio::EnsEMBL::Analysis::Tools::Filter::BlastMiniGenewise Arg [2] : Arrayref of Bio::EnsEMBL::Gene objects Function : filter genes on defined characteristics Returntype: 2 arrayrefs, first accepted genes, then rejected Exceptions: Example : =cut sub filter_genes{ my ($self, $genes) = @_; my @accepted; GENE:foreach my $gene(@$genes, @{$self->split_genes}){ my @transcripts = @{$gene->get_all_Transcripts}; if(@transcripts > 1){ my $cloned_gene = clone_gene($gene); warning("Filter::BlastMiniGenewise only works on single transcript ". "structures. Will convert ".$gene." into to one gene one ". "transcript structures"); my $split_genes = convert_to_genes($cloned_gene->get_all_Transcripts); push(@$genes, @$split_genes) if($self->split_multi_transcript_genes); $self->split_genes($gene); next GENE; } my $is_valid = $self->validate_Transcript($transcripts[0]); if($is_valid){ logger_info("Have accepted ".id($gene)." ".coord_string($gene)); push(@accepted, $gene); }else{ logger_info("Have rejected ".id($gene)." ".coord_string($gene)); $self->rejected_genes($gene); } } ACCEPTED:foreach my $accepted(@accepted){ my @transcripts = @{$accepted->get_all_Transcripts}; my $evidence = $self->get_Transcript_supporting_evidence($transcripts[0]); if(evidence_coverage_greater_than_minimum($transcripts[0], $evidence, ($self->min_split_coverage -1))){ $self->accepted_genes($accepted); next ACCEPTED; } if(@{$transcripts[0]->get_all_Exons} == 1){ $self->accepted_genes($accepted); next ACCEPTED; } my $split_transcripts = split_Transcript($transcripts[0], $self->max_intron_length); my $acceptable_transcripts = tidy_split_transcripts($transcripts[0], $split_transcripts); my $genes = convert_to_genes($acceptable_transcripts, $accepted->analysis, $accepted->biotype); foreach my $gene(@$genes){ my $attrib = $self->get_Attribute("split_tscript"); foreach my $transcript(@{$gene->get_all_Transcripts}){ $transcript->add_Attributes($attrib); } $self->accepted_genes($gene); } } return $self->accepted_genes, $self->rejected_genes; } =head2 validate_Transcript Arg [1] : Bio::EnsEMBL::Analysis::Tools::Filter::BlastMiniGenewise Arg [2] : Bio::EnsEMBL::Transcript Function : validate the transcript on the basis of several factors Returntype: boolean, 1 if valid 0 if not Exceptions: Example : =cut sub validate_Transcript{ my ($self, $transcript) = @_; #print "VALIDATING ".Transcript_info($transcript)."\n"; my $slice = $self->slice; $slice = $transcript->slice if(!$slice); my $is_valid = 0; #basic transcript validation unless(are_strands_consistent($transcript)){ $is_valid++; my $attrib = $self->get_Attribute("incons_strands"); $transcript->add_Attributes($attrib); } #print "IS VALID is ".$is_valid." after strand consistency\n"; unless(are_phases_consistent($transcript)){ $is_valid++; my $attrib = $self->get_Attribute("incons_phases"); $transcript->add_Attributes($attrib); } #print "IS VALID is ".$is_valid." phase consistency\n"; unless(is_not_folded($transcript)){ $is_valid++; my $attrib = $self->get_Attribute("is_folded"); $transcript->add_Attributes($attrib); } #print "IS VALID is ".$is_valid." folded \n"; unless(has_no_unwanted_evidence($transcript, $self->unwanted_evidence)){ $is_valid++; my $attrib = $self->get_Attribute("unwanted_evidence"); $transcript->add_Attributes($attrib); } #print "IS VALID is ".$is_valid." after unwanted evidence\n"; #$is_valid++ unless(all_exons_are_valid($transcript, $self->max_exon_length)); EXON:foreach my $exon(@{$transcript->get_all_Exons}){ if(exon_length_less_than_maximum($exon, $self->max_exon_length)){ next EXON; }else{ $is_valid++; last EXON; my $attrib = $self->get_Attribute("exon_too_long"); $transcript->add_Attributes($attrib); } } #print "IS VALID is ".$is_valid." after exon validation\n"; #basic translation validation #print_peptide($transcript); if(contains_internal_stops($transcript)){ warning(Transcript_info($transcript)." contains internal stop codons"); $is_valid++; my $attrib = $self->get_Attribute("contains_stops"); $transcript->add_Attributes($attrib); } #print "IS VALID is ".$is_valid." after contains internal stops\n"; unless(validate_Translation_coords($transcript)){ $is_valid++; my $attrib = $self->get_Attribute("borked_coords"); $transcript->add_Attributes($attrib); } #print "IS VALID is ".$is_valid." after translation coord validation\n"; #intron checks #$is_valid++ # unless(intron_lengths_all_less_than_maximum($transcript, $self->max_intron_length)); #print "IS VALID is ".$is_valid." intron length check\n"; #checking low complexity unless(low_complexity_less_than_maximum($transcript, $self->max_low_complexity)){ $is_valid++; my $attrib = $self->get_Attribute("low_complex"); $transcript->add_Attributes($attrib) } #print "IS VALID is ".$is_valid." low complexity check\n"; #evidence coverage #note the coverage passed into the method is min - 1 as the method returns true if #the coverage is greater than the given value to allow the utils conventions to #stand but in the old code the reverse was true it returned false if coverage #is less than the given so in order for this to produce the same results I needed #to add the min - 1 convention my $evidence = $self->get_Transcript_supporting_evidence($transcript); if(@{$transcript->get_all_Exons} >= 2){ my $coverage = evidence_coverage($transcript, $evidence); unless(evidence_coverage_greater_than_minimum($transcript, $evidence, ($self->min_coverage - 1))){ $is_valid++; my $attrib = $self->get_Attribute("evi_coverage"); $transcript->add_Attributes($attrib); } }else{ unless(evidence_coverage_greater_than_minimum($transcript, $evidence, ($self->single_exon_min_coverage -1))){ $is_valid++; my $attrib = $self->get_Attribute("evi_coverage"); $transcript->add_Attributes($attrib); } } #print "IS VALID is ".$is_valid." after evidence coverage check\n"; warning(Transcript_info($transcript)." failed ".$is_valid." tests out of 9 ". "returning 0") if($is_valid >= 1); return 0 if($is_valid >= 1); return 1; } sub get_Transcript_supporting_evidence{ my ($self, $transcript, $seqfetcher) = @_; $seqfetcher = $self->seqfetcher if(!$seqfetcher); throw("Can't get ".$transcript." transcrpts supporting features without ". "a seqfetcher object") if(!$seqfetcher); my $ids = list_evidence($transcript); my $sequence; foreach my $id(@$ids){ $sequence = $seqfetcher->get_Seq_by_acc($id); last if($sequence); } return $sequence; } sub create_Attribute{ my ($self, $reason, $description) = @_; $description = $reason if(!$description); my $attrib = Bio::EnsEMBL::Attribute->new( -code => $reason, -name => $reason, -description => $description, -value => $description, ); return $attrib; } sub get_Attribute{ my ($self, $code, $description) = @_; if(!$self->{'attribute_hash'}){ $self->{'attribute_hash'} = {}; } my $attrib = $self->{'attribute_hash'}->{$code}; if(!$attrib){ $attrib = $self->create_Attribute($code, $description); $self->{'attribute_hash'}->{$code} = $attrib; } return $attrib; } 1;