Raw content of Bio::EnsEMBL::Pipeline::Tools::Pmatch::Second_PMF # holds the object that does the second round of filtering package Bio::EnsEMBL::Pipeline::Tools::Pmatch::Second_PMF; use Bio::Root::Object; @ISA = qw(Bio::Root::Object); sub new { my ($class, @args) = @_; my $self = bless {}, $class; my ($phits) = $self->_rearrange(['PHITS'], @args); $self->throw("No pmatch hits data") unless defined $phits; $self->phits($phits); #print STDERR "SECOND PMF\n"; #foreach my $hit(@$phits){ #print STDERR $hit->chr_name . ":" .$hit->qstart . "," .$hit->qend . ":" . $hit->protein_id . ":" .$hit->coverage . "\n"; #} #print STDERR "SECOND PMF\n"; my %proteins = (); # hash of arrays of MergedHits, indexed by protin name $self->{_proteins} = \%proteins; return $self; } sub phits { my ($self, $phits) = @_; # $phits is an array reference if(defined($phits)){ if (ref($phits) eq "ARRAY") { $self->{_phits} = $phits; } else { $self->throw("[$phits] is not an array ref."); } } return $self->{_phits}; } sub run { my ($self) = @_; # group hits by protein my %prots = %{$self->{_proteins}}; # just makes it a bit easier to follow foreach my $hit(@{$self->phits}){ # print the details of all the constituent coord pairs separated by white space push (@{$prots{$hit->protein_id}}, $hit); } $self->{_proteins} = \%prots; # prune and store the hits $self->prune_hits; } sub prune_hits { my ($self) = @_; my %prots = %{$self->{_proteins}}; # just makes it a bit easier to follow PROTEIN: foreach my $p(keys %prots){ my @chosen = (); my @allhits = @{$prots{$p}}; # sort by descending order of coverage @allhits = sort {$b->coverage <=> $a->coverage} @allhits; my $first = shift(@allhits); # don't select any hits that have coverage less than 2% below that of the first hit, be it 100 or 99.9 or ... my $curr_pc = $first->coverage() - 2; # lower bound threshold - reject anything with < 25% coverage my $lower_threshold = 25; next PROTEIN if $first->coverage < $lower_threshold; push (@chosen,$first) unless $first->coverage < $lower_threshold; PRUNE: foreach my $hit(@allhits) { last PRUNE if $hit->coverage() < $curr_pc; last PRUNE if $hit->coverage() < $lower_threshold; push (@chosen,$hit); } push(@{$self->{_output}},@chosen); } } sub output { my ($self) = @_; if (!defined($self->{_output})) { $self->{_output} = []; } return @{$self->{_output}}; } 1;