Bio::EnsEMBL::Analysis::Tools::Pmatch First_PMF
Included librariesPackage variablesGeneral documentationMethods
Toolbar
WebCvsRaw content
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::Tools::Pmatch::ContigHit
Bio::EnsEMBL::Analysis::Tools::Pmatch::CoordPair
Bio::EnsEMBL::Analysis::Tools::Pmatch::MergedHit
Bio::EnsEMBL::Analysis::Tools::Pmatch::ProteinHit
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Bio::Root::Object
Inherit
Bio::Root::Object
Synopsis
No synopsis!
Description
No description!
Methods
extend_hits
No description
Code
make_coord_pairDescriptionCode
make_mergelistDescriptionCode
maxintronlen
No description
Code
merge_hitsDescriptionCode
min_coverage
No description
Code
new
No description
Code
new_merged_hitDescriptionCode
plengths
No description
Code
proteins
No description
Code
pruneDescriptionCode
Methods description
make_coord_paircode    nextTop
 Title   : make_coord_pair
Usage :
Function:
Example :
Returns :
Args :
make_mergelistcodeprevnextTop
 Title   : make_mergelist
Usage :
Function:
Example :
Returns :
Args :
merge_hitscodeprevnextTop
 Title   : merge_hits
Usage :
Function:
Example :
Returns :
Args :
new_merged_hitcodeprevnextTop
 Title   : new_merged_hit
Usage :
Function:
Example :
Returns :
Args :
prunecodeprevnextTop
 Title   : prune
Usage :
Function:
Example :
Returns :
Args :
Methods code
extend_hitsdescriptionprevnextTop
sub extend_hits {
  my ($self, $hits) = @_;

  # we want to do essentially what we did to create the merged hits but we can skip over
# intervening hits if we need to.
my @fhits; my @rhits; my @newhits; foreach my $mh(@$hits){ if($mh->strand == 1){ push (@fhits, $mh); } else { push (@rhits, $mh); } } @fhits = sort {$a->qstart <=> $b->qstart} @fhits; @rhits = sort {$b->qstart <=> $a->qstart} @rhits; while(scalar(@fhits)){ my $hit = shift(@fhits); push (@newhits, $hit); # can we link it up to a subsequent hit?
foreach my $sh(@fhits){ die ("Argh!") unless $hit->strand == $sh->strand; last if ($hit->qend+$self->{_maxintronlen} < $sh->qstart); if ($sh->tstart > $hit->tend && $sh->tend > $hit->tend && abs($sh->tstart - $hit->tend) <= 3 && # qstart/qend condition - no overlap currently allowed
$sh->qstart >= $hit->qend ) { # add the coord pairs from $sh into $hit
$hit->subsume_MergedHit($sh); } } } # same for rhits
while(scalar(@rhits)){ my $hit = shift(@rhits); push (@newhits, $hit); # can we link it up to a subsequent hit?
foreach my $sh(@rhits){ die ("Argh!") unless $hit->strand == $sh->strand; # hmmmm On minus strand qstart is currently > qend
# ie $sh->qend <= $sh->qstart <= $hit->qend <= $hit->qstart
# last if ($hit->qstart-$self->{_maxintronlen} > $sh->qend);
last if ($hit->qend-$self->{_maxintronlen} > $sh->qstart); if ($sh->tstart > $hit->tend && $sh->tend > $hit->tend && abs($sh->tstart - $hit->tend) <= 3 && # qstart/qend condition - no overlap currently allowed
$hit->qend >= $sh->qstart ) { # add the coord pairs from $sh into $hit
$hit->subsume_MergedHit($sh); } } } # return extended hits
return (\@newhits);
}
make_coord_pairdescriptionprevnextTop
sub make_coord_pair {
  my ($self) = @_;
  #print STDERR "Making coord pair with ".$_."\n";
my @cols = split(); # clean up the refseq IDs
my $protid = $cols[5]; # alter this as necessary
if($protid =~ /\w+\|\w+\|\w+\|([\w\.]+)\|/) { $protid = $1; } # sort out strand hmmm if strand = -1 then shouldn't we switch qstart & qend? Currently don't ...
my $strand = 1; if ( $cols[3] < $cols[2] ) { $strand=-1; } my $cp = new Bio::EnsEMBL::Analysis::Tools::Pmatch::CoordPair( -query => $cols[1], -target => $protid, -qstart => $cols[2], -qend => $cols[3], -tstart => $cols[6], -tend => $cols[7], -percent=> $cols[8], -strand => $strand, ); # where to put the CoordPair?
# find the relevant ProteinHit, or make a new one
my $proteins = $self->proteins; my $ph = $$proteins{$protid}; if (!defined $ph) { #make a new one and add it into %proteins
$ph = new Bio::EnsEMBL::Analysis::Tools::Pmatch::ProteinHit(-id=>$protid); $$proteins{$protid} = $ph; } $self->proteins($proteins); # now find the relevant ContigHit, or make a new one
my $ch = $ph->get_ContigHit($cols[1]); if (!defined $ch) { # make a new one and add it into $ph
$ch = Bio::EnsEMBL::Analysis::Tools::Pmatch::ContigHit->new(-id => $cols[1]); $ph->add_ContigHit($ch); } # now add the CoordPair
$ch->add_CoordPair($cp);
}
make_mergelistdescriptionprevnextTop
sub make_mergelist {
  my ($self, $ph) = @_;

  my @hits = ();
  foreach my $ch($ph->each_ContigHit()) {
    # forward strand
my @cps = @{$ch->each_ForwardPair()}; # sort!
@cps = sort {$a->qstart <=> $b->qstart} @cps; #reverse strand
my @mps = @{$ch->each_ReversePair()}; @mps = sort {$b->qstart <=> $a->qstart} @mps; push (@cps,@mps); # deal with forward & reverse separately?
my $first = shift(@cps); my $mh = $self->new_merged_hit($first); push(@hits,$mh); my $prev = $hits[$#hits]; my $prev_cp = $first; CP: foreach my $cp(@cps) { # need to compare with the last entry in @merged
# first the strand
my $strand = 1; if ($cp->qend() < $cp->qstart()) { $strand = -1; } # does this CoordPair extend the current hit?
# need a fudge factor - pmatch could overlap them by 1 or 2 ... or 3
if( $strand == $prev->strand && ( ($cp->tstart >= $prev_cp->tend) || (($prev_cp->tend - $cp->tstart) <= 3)) && # Steve's fix - Added qstart/qend condition (*strand should allow it to work on
# either strand)
# no overlap currently allowed
$cp->qstart*$strand >= $prev_cp->qend*$strand ) { #extend existing MergedHit
my $coverage = $cp->tend - $cp->tstart + 1; $coverage += $prev->coverage(); # compensate for any overlap
my $diff = $prev_cp->tend - $cp->tstart; if($diff >=0) { $diff++; $coverage -= $diff; } $prev->coverage($coverage); $prev->add_CoordPair($cp); $prev_cp = $cp; } else { # make a new MergedHit
my $mh = $self->new_merged_hit($cp); push(@hits,$mh); $prev = $hits[$#hits]; $prev_cp = $cp; } } } # my $times = join " ", times;
# print "Before extend " . $times . "\n";
# extend those merged hits
# print "Number of hits = " . scalar(@hits) . "\n";
@hits = @{$self->extend_hits(\@hits)}; # $times = join " ", times;
# print "After extend " . $times . "\n";
# print "Number of hits = " . scalar(@hits) . "\n";
# sort out coverage
my $plengths = $self->plengths; my $protlen = $$plengths{$ph->id}; warn "No length for " . $ph->id . "\n" unless $protlen; return unless $protlen; foreach my $hit(@hits) { my $percent = $hit->coverage; $percent *= 100; $percent /= $protlen;
$percent=sprintf("%.1f",$percent); $hit->coverage($percent); } return\@ hits;
}
maxintronlendescriptionprevnextTop
sub maxintronlen {
  my ($self, $maxintronlen) = @_;
  
  if(defined($maxintronlen)){
    if ($maxintronlen =~ /\d+/) {
      $self->{_maxintronlen} = $maxintronlen;
    } 
    else {
      throw("[$maxintronlen] is not numeric.");
    }
  }
  return $self->{_maxintronlen};
}
merge_hitsdescriptionprevnextTop
sub merge_hits {
  my ($self) = @_;
  my @merged;

  # merge the hits together.
my $protref = $self->proteins; foreach my $p_hit(values %$protref) { my @allhits = @{$self->make_mergelist($p_hit)}; my @chosen = @{$self->prune(\@allhits)}; #print STDERR "\nNo hits good enough for " . $p_hit->id() . "\n"
# unless scalar(@chosen);
push(@merged,@chosen); } #print STDERR "PMF have ".@merged." ".$merged[0]." results\n";
return\@ merged;
}
min_coveragedescriptionprevnextTop
sub min_coverage {
  my ($self, $min_coverage) = @_;
  
  if(defined($min_coverage)){
    if ($min_coverage =~ /\d+/) {
      $self->{_min_coverage} = $min_coverage;
    } 
    else {
      throw("[$min_coverage] is not numeric.");
    }
  }
  return $self->{_min_coverage};
}
newdescriptionprevnextTop
sub new {
  my ($class, @args) = @_;
  my $self = bless {}, $class;

  my ($plengths, $maxintronlen, $min_coverage) = 
                 rearrange(['PLENGTHS','MAXINTRONLEN', 'MIN_COVERAGE'], @args);

  #Setting defaults
$self->min_coverage(25); $self->maxintronlen(50000); #################
$self->plengths($plengths); $self->maxintronlen($maxintronlen); $self->min_coverage($min_coverage); throw("No protlengths data") unless $self->plengths; throw("No maximum intron length") unless defined $self->maxintronlen; return $self;
}
new_merged_hitdescriptionprevnextTop
sub new_merged_hit {
  my ($self,$cp) = @_;
  my $coverage = $cp->tend - $cp->tstart + 1;
  my $mh = new Bio::EnsEMBL::Analysis::Tools::Pmatch::MergedHit( -query    =>  $cp->query(),
			  -target   =>  $cp->target(),
			  -strand   =>  $cp->strand(),
			  -coverage =>  $coverage,
			);
  $mh->add_CoordPair($cp);
  return $mh;
}
plengthsdescriptionprevnextTop
sub plengths {
  my ($self, $plengths) = @_;
  
  # $plengths is a hash reference
if(defined($plengths)){ if (ref($plengths) eq "HASH") { $self->{_plengths} = $plengths; } else { throw("[$plengths] is not a hash ref."); } } return $self->{_plengths};
}
proteinsdescriptionprevnextTop
sub proteins {
  my ($self, $proteins) = @_;
  
  if(!$self->{_proteins}){
    $self->{_proteins} = {};
  }
  # $proteins is a hash reference
if(defined($proteins)){ if (ref($proteins) eq "HASH") { $self->{_proteins} = $proteins; } else { throw("[$proteins] is not a hash ref."); } } return $self->{_proteins};
}
prunedescriptionprevnextTop
sub prune {
  my ($self, $all) = @_;
  my @chosen = ();
  # reject hits with < 25% coverage
my $lower_threshold = $self->min_coverage; # sort by descending order of coverage
my @all = sort {$b->coverage <=> $a->coverage} @$all; my $first = shift(@all); if($first->coverage < $lower_threshold){ # we're done
return\@ chosen; } push (@chosen,$first); # don't select any hits that have coverage less than 2% below that of the first hit, be it 100 or 99.9 or ...
my $curr_pc = $first->coverage() - 2; PRUNE: foreach my $hit(@all) { last PRUNE if $hit->coverage < $lower_threshold; last PRUNE if $hit->coverage < $curr_pc; push (@chosen,$hit); } return\@ chosen; } 1;
}
General documentation
No general documentation available.