Raw content of Bio::EnsEMBL::Analysis::Tools::Pmatch::First_PMF
package Bio::EnsEMBL::Analysis::Tools::Pmatch::First_PMF;
use Bio::Root::Object;
use Bio::EnsEMBL::Analysis::Tools::Pmatch::ContigHit;
use Bio::EnsEMBL::Analysis::Tools::Pmatch::ProteinHit;
use Bio::EnsEMBL::Analysis::Tools::Pmatch::CoordPair;
use Bio::EnsEMBL::Analysis::Tools::Pmatch::MergedHit;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
@ISA = qw(Bio::Root::Object);
sub new {
my ($class, @args) = @_;
my $self = bless {}, $class;
my ($plengths, $maxintronlen, $min_coverage) =
rearrange(['PLENGTHS','MAXINTRONLEN', 'MIN_COVERAGE'], @args);
#Setting defaults
$self->min_coverage(25);
$self->maxintronlen(50000);
#################
$self->plengths($plengths);
$self->maxintronlen($maxintronlen);
$self->min_coverage($min_coverage);
throw("No protlengths data") unless $self->plengths;
throw("No maximum intron length") unless defined $self->maxintronlen;
return $self;
}
=head2 make_coord_pair
Title : make_coord_pair
Usage :
Function:
Example :
Returns :
Args :
=cut
sub make_coord_pair {
my ($self) = @_;
#print STDERR "Making coord pair with ".$_."\n";
my @cols = split();
# clean up the refseq IDs
my $protid = $cols[5];
# alter this as necessary
if($protid =~ /\w+\|\w+\|\w+\|([\w\.]+)\|/) {
$protid = $1;
}
# sort out strand hmmm if strand = -1 then shouldn't we switch qstart & qend? Currently don't ...
my $strand = 1;
if ( $cols[3] < $cols[2] ) { $strand=-1; }
my $cp = new Bio::EnsEMBL::Analysis::Tools::Pmatch::CoordPair(
-query => $cols[1],
-target => $protid,
-qstart => $cols[2],
-qend => $cols[3],
-tstart => $cols[6],
-tend => $cols[7],
-percent=> $cols[8],
-strand => $strand,
);
# where to put the CoordPair?
# find the relevant ProteinHit, or make a new one
my $proteins = $self->proteins;
my $ph = $$proteins{$protid};
if (!defined $ph) {
#make a new one and add it into %proteins
$ph = new Bio::EnsEMBL::Analysis::Tools::Pmatch::ProteinHit(-id=>$protid);
$$proteins{$protid} = $ph;
}
$self->proteins($proteins);
# now find the relevant ContigHit, or make a new one
my $ch = $ph->get_ContigHit($cols[1]);
if (!defined $ch) {
# make a new one and add it into $ph
$ch = Bio::EnsEMBL::Analysis::Tools::Pmatch::ContigHit->new(-id => $cols[1]);
$ph->add_ContigHit($ch);
}
# now add the CoordPair
$ch->add_CoordPair($cp);
}
=head2 merge_hits
Title : merge_hits
Usage :
Function:
Example :
Returns :
Args :
=cut
sub merge_hits {
my ($self) = @_;
my @merged;
# merge the hits together.
my $protref = $self->proteins;
foreach my $p_hit(values %$protref) {
my @allhits = @{$self->make_mergelist($p_hit)};
my @chosen = @{$self->prune(\@allhits)};
#print STDERR "\nNo hits good enough for " . $p_hit->id() . "\n"
# unless scalar(@chosen);
push(@merged,@chosen);
}
#print STDERR "PMF have ".@merged." ".$merged[0]." results\n";
return \@merged;
}
=head2 make_mergelist
Title : make_mergelist
Usage :
Function:
Example :
Returns :
Args :
=cut
sub make_mergelist {
my ($self, $ph) = @_;
my @hits = ();
foreach my $ch($ph->each_ContigHit()) {
# forward strand
my @cps = @{$ch->each_ForwardPair()};
# sort!
@cps = sort {$a->qstart <=> $b->qstart} @cps;
#reverse strand
my @mps = @{$ch->each_ReversePair()};
@mps = sort {$b->qstart <=> $a->qstart} @mps;
push (@cps,@mps);
# deal with forward & reverse separately?
my $first = shift(@cps);
my $mh = $self->new_merged_hit($first);
push(@hits,$mh);
my $prev = $hits[$#hits];
my $prev_cp = $first;
CP:
foreach my $cp(@cps) {
# need to compare with the last entry in @merged
# first the strand
my $strand = 1;
if ($cp->qend() < $cp->qstart()) { $strand = -1; }
# does this CoordPair extend the current hit?
# need a fudge factor - pmatch could overlap them by 1 or 2 ... or 3
if( $strand == $prev->strand &&
( ($cp->tstart >= $prev_cp->tend) ||
(($prev_cp->tend - $cp->tstart) <= 3)) &&
# Steve's fix - Added qstart/qend condition (*strand should allow it to work on
# either strand)
# no overlap currently allowed
$cp->qstart*$strand >= $prev_cp->qend*$strand
)
{
#extend existing MergedHit
my $coverage = $cp->tend - $cp->tstart + 1;
$coverage += $prev->coverage();
# compensate for any overlap
my $diff = $prev_cp->tend - $cp->tstart;
if($diff >=0) {
$diff++;
$coverage -= $diff;
}
$prev->coverage($coverage);
$prev->add_CoordPair($cp);
$prev_cp = $cp;
}
else {
# make a new MergedHit
my $mh = $self->new_merged_hit($cp);
push(@hits,$mh);
$prev = $hits[$#hits];
$prev_cp = $cp;
}
}
}
# my $times = join " ", times;
# print "Before extend " . $times . "\n";
# extend those merged hits
# print "Number of hits = " . scalar(@hits) . "\n";
@hits = @{$self->extend_hits(\@hits)};
# $times = join " ", times;
# print "After extend " . $times . "\n";
# print "Number of hits = " . scalar(@hits) . "\n";
# sort out coverage
my $plengths = $self->plengths;
my $protlen = $$plengths{$ph->id};
warn "No length for " . $ph->id . "\n" unless $protlen;
return unless $protlen;
foreach my $hit(@hits) {
my $percent = $hit->coverage;
$percent *= 100;
$percent /= $protlen;
$percent=sprintf("%.1f",$percent);
$hit->coverage($percent);
}
return \@hits;
}
=head2 new_merged_hit
Title : new_merged_hit
Usage :
Function:
Example :
Returns :
Args :
=cut
sub new_merged_hit {
my ($self,$cp) = @_;
my $coverage = $cp->tend - $cp->tstart + 1;
my $mh = new Bio::EnsEMBL::Analysis::Tools::Pmatch::MergedHit( -query => $cp->query(),
-target => $cp->target(),
-strand => $cp->strand(),
-coverage => $coverage,
);
$mh->add_CoordPair($cp);
return $mh;
}
sub plengths {
my ($self, $plengths) = @_;
# $plengths is a hash reference
if(defined($plengths)){
if (ref($plengths) eq "HASH") {
$self->{_plengths} = $plengths;
}
else {
throw("[$plengths] is not a hash ref.");
}
}
return $self->{_plengths};
}
sub proteins {
my ($self, $proteins) = @_;
if(!$self->{_proteins}){
$self->{_proteins} = {};
}
# $proteins is a hash reference
if(defined($proteins)){
if (ref($proteins) eq "HASH") {
$self->{_proteins} = $proteins;
}
else {
throw("[$proteins] is not a hash ref.");
}
}
return $self->{_proteins};
}
sub maxintronlen {
my ($self, $maxintronlen) = @_;
if(defined($maxintronlen)){
if ($maxintronlen =~ /\d+/) {
$self->{_maxintronlen} = $maxintronlen;
}
else {
throw("[$maxintronlen] is not numeric.");
}
}
return $self->{_maxintronlen};
}
sub min_coverage {
my ($self, $min_coverage) = @_;
if(defined($min_coverage)){
if ($min_coverage =~ /\d+/) {
$self->{_min_coverage} = $min_coverage;
}
else {
throw("[$min_coverage] is not numeric.");
}
}
return $self->{_min_coverage};
}
sub extend_hits {
my ($self, $hits) = @_;
# we want to do essentially what we did to create the merged hits but we can skip over
# intervening hits if we need to.
my @fhits;
my @rhits;
my @newhits;
foreach my $mh(@$hits){
if($mh->strand == 1){
push (@fhits, $mh);
}
else {
push (@rhits, $mh);
}
}
@fhits = sort {$a->qstart <=> $b->qstart} @fhits;
@rhits = sort {$b->qstart <=> $a->qstart} @rhits;
while(scalar(@fhits)){
my $hit = shift(@fhits);
push (@newhits, $hit);
# can we link it up to a subsequent hit?
foreach my $sh(@fhits){
die ("Argh!") unless $hit->strand == $sh->strand;
last if ($hit->qend+$self->{_maxintronlen} < $sh->qstart);
if ($sh->tstart > $hit->tend &&
$sh->tend > $hit->tend &&
abs($sh->tstart - $hit->tend) <= 3 &&
# qstart/qend condition - no overlap currently allowed
$sh->qstart >= $hit->qend
) {
# add the coord pairs from $sh into $hit
$hit->subsume_MergedHit($sh);
}
}
}
# same for rhits
while(scalar(@rhits)){
my $hit = shift(@rhits);
push (@newhits, $hit);
# can we link it up to a subsequent hit?
foreach my $sh(@rhits){
die ("Argh!") unless $hit->strand == $sh->strand;
# hmmmm On minus strand qstart is currently > qend
# ie $sh->qend <= $sh->qstart <= $hit->qend <= $hit->qstart
# last if ($hit->qstart-$self->{_maxintronlen} > $sh->qend);
last if ($hit->qend-$self->{_maxintronlen} > $sh->qstart);
if ($sh->tstart > $hit->tend &&
$sh->tend > $hit->tend &&
abs($sh->tstart - $hit->tend) <= 3 &&
# qstart/qend condition - no overlap currently allowed
$hit->qend >= $sh->qstart
) {
# add the coord pairs from $sh into $hit
$hit->subsume_MergedHit($sh);
}
}
}
# return extended hits
return (\@newhits);
}
=head2 prune
Title : prune
Usage :
Function:
Example :
Returns :
Args :
=cut
sub prune {
my ($self, $all) = @_;
my @chosen = ();
# reject hits with < 25% coverage
my $lower_threshold = $self->min_coverage;
# sort by descending order of coverage
my @all = sort {$b->coverage <=> $a->coverage} @$all;
my $first = shift(@all);
if($first->coverage < $lower_threshold){
# we're done
return \@chosen;
}
push (@chosen,$first);
# don't select any hits that have coverage less than 2% below that of the first hit, be it 100 or 99.9 or ...
my $curr_pc = $first->coverage() - 2;
PRUNE:
foreach my $hit(@all) {
last PRUNE if $hit->coverage < $lower_threshold;
last PRUNE if $hit->coverage < $curr_pc;
push (@chosen,$hit);
}
return \@chosen;
}
1;