Raw content of Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Prints_wormbase
package Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Prints_wormbase;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation;
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation);
###################
# analysis methods
###################
=head2 run_program
Title : run_program
Usage : $self->program
Function : makes the system call to program
Example :
Returns :
Args :
Throws :
=cut
###########################
### The E-value calculation
###########################
# The reported P-value of any fingerprint result, is the product of the
# p-values for each motif. The motif p-values represent the probabilty
# that a comparison between the motif and a random sequence would
# achieve a score greater than or equal to the score attributed to the
# match between your query sequence and the motif.
#
# However, when a database search is performed this value must be
# adjusted for the multiple comparisons made therein. An E-value can be
# computed which represents the Expected numberof occurences of
# sequences scoring greater than or equal to the query's score.
#
# There are a number of estimates of this parameter;
#
# E = pD, which states that the probability multiplied by the sample
# database size (D equals the number of sequences), provides the number
# of occurrences, the assumption made here is that all sequences in a
# database are equally likely to be related to a particular motif.
#
# E = pN/n, is the same calculation, but the size of the database is
# calculated by taking the total length of the database in residues and
# dividing it by the width of the scoring entity (in this case the motif
# or fingerprint). This calculation assumes that the chance of a
# sequence being related to a motif (or fingerprint) is proportional to
# length (rather than being equally likely). Incidentally this latter
# approach is the method of choice for calculating BLAST database
# E-values (NCBI).
#
# FingerPRINTScan has now been modified to conform to the E = pN/n
# method of database E-value calculation, however P-value calculations
# remain unchanged
#
# The database from which D and N are calculated is (for example):
#
# SWISS-PROT 37 (77,977 sequences and 28,777,979 residues) and
# TrEMBL 9 (179,066 sequences and 55,577,465 residues)
#
# Therefore in all calculations D = 257,043 and N = 84,355,444.
#
# The current default values used in the /usr/local/ensembl/bin/FingerPRINTScan
# script are:
# -E #1 #2 E-value calculation parameters.
# (where #1 is the number of sequences in the primary database (default 80000))
# (where #2 is the number of resides in the primary database (default 2.96103e+07))
# ( the default values are based on SWISS-PROT 38)
#
# The database may get updated out of synch of the scripts, so you may
# need to set '-E #1 #2' explicitly in the parameters= line of the config file.
sub run_analysis {
my ($self) = @_;
# run program
print STDERR "running ".$self->program." against ".$self->database."\n";
print STDERR "FILENAME: ".$self->queryfile."\n";
my $cmd = $self->program .' '.
$self->database .' '.
$self->queryfile. ' '.
$self->analysis->parameters .' '.
'> ' . $self->resultsfile;
print STDERR "$cmd\n";
$self->throw ("Error running Prints_wormbase ".$self->program." on ".$self->filename)
unless ((system ($cmd)) == 0);
}
=head2 parse_results
Title : parse_results
Usage : $self->parse_results ($filename)
Function : parses program output to give a set of features
Example :
Returns :
Args : filename (optional, can be filename, filehandle or pipe, not implemented)
Throws :
=cut
sub parse_results {
my ($self) = @_;
my $filehandle;
my $resfile = $self->resultsfile();
my @fps;
if (-e $resfile) {
if (-z $resfile) {
print STDERR "Prints didn't find any hits\n";
return;
} else {
open (CPGOUT, "<$resfile") or $self->throw("Error opening ", $resfile, " \n"); #
}
}
my $id;
my $line;
my $sequenceId;
my @features;
my %evalue;
my %printsac;
while () {
$line = $_;
print STDERR "Next line: $line";
chomp $line;
# Pattern match the Sn; field which should contain the SequenceId and Accession
if ($line =~ s/^Sn;//) { # We have identified a Sn; line so there should be the following:
#ENSP00000003603 Gene:ENSG00000000003 Query:AL035608 Contig:AL035608.00001 Chr:chrX basepair:97227305
($sequenceId) = $line =~ /^\s*(\w+)/;
}
if ($line =~ s/^1TBH//) {
my ($id) = $line =~ /^\s*(\w+)/;
my ($ac) = $line =~ /(PR\w+)[;\.]*\s*$/;
$printsac{$id} = $ac;
print STDERR "1TBH line = $line\n";
print STDERR "In 1TBH data, printsac{$id} = $ac\n";
}
# get the evalues of each of the fingerprint names
if ($line =~ /^2TBH/) {
print STDERR "got a 2TBH line: $line\n";
my @line = split /\s+/, $line;
$evalue{$line[1]} = $line[9];
print STDERR "hash evalue of $line[1] = $line[9]\n";
}
if ($line =~ s/^3TB//) {
if ($line =~ s/^[HN]//) {
my ($num,$temp1,$tot1) = "";
# Grab these lines
# 1433ZETA 1 of 6 88.19 1328 1.00e-16 ELTVEERNLLSVAYKNVIGARRASWRIITS 30 35 36 48
# split line on space, hence strip off all leading spaces first.
$line =~ s/^\s+//;
print STDERR "line = $line\n";
# Place all elements of list into an array
my @elements = split /\s+/, $line;
# Name each of the elements in the array
my ($fingerprintName,$motifNumber,$temp,$tot,$percentageIdentity,$profileScore,$pvalue,$subsequence,$motifLength,$lowestMotifPosition,$matchPosition,$highestMotifPosition) = @elements;
print STDERR "fingerprintName=$fingerprintName\n";
my $start = $matchPosition;
#
# If the match to the pattern lies at the end of the protein we might get padding of the subsequence with #'s, and the
# end position will be bigger than the actual end of the protein. So we'll strip the #'s off the end, adjust the
# motif length accordingly, and only then derive the match end.
my $hash_substring;
my $end;
if($subsequence =~ /(\#+)$/){
$hash_substring = $1;
$end = $matchPosition + $motifLength - 1 - length($hash_substring);
} else {
$end = $matchPosition + $motifLength - 1;
}
# if we don't have a valid hit for this PRINTS ID, then ignore teh rest
if (! exists $printsac{$fingerprintName}) {next;}
my $print = $printsac{$fingerprintName};
my $feat = "$print,$start,$end,$percentageIdentity,$profileScore,$evalue{$fingerprintName}";
print STDERR "features= $feat\n";
print "matched\n";
my $hstart = 0;
my $hend = 0;
my $evalue = $evalue{$fingerprintName};
print STDERR "writing to database\n";
my $fp= $self->create_protein_feature($start, $end, $profileScore, $sequenceId, $hstart, $hend, $print, $self->analysis, $evalue, $percentageIdentity);
push @fps, $fp;
}
}
}
close (CPGOUT);
$self->output(\@fps);
}
1;