Raw content of Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Prints
=head1 NAME
Prints - DESCRIPTION of Object
=head1 SYNOPSIS
Give standard usage here
=head1 DESCRIPTION
Describe the object here
=cut
package Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation::Prints;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation;
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable::ProteinAnnotation);
sub multiprotein{
my ($self) = @_;
return 1;
}
sub run_analysis {
my ($self) = @_;
# because Prints reports matches off the end of the sequence,
# we need the sequence lengths to be able to trim them back
my $command = $self->program ." " .
$self->database . " " .
$self->queryfile . " " .
"-fjR > " .
$self->resultsfile;
throw("Failed during prints run $!\n") unless
system($command) == 0 ;
}
sub parse_results {
my ($self) = @_;
my %seqlengths;
my $seqio = Bio::SeqIO->new(-format => 'fasta',
-file => $self->queryfile);
while (my $seq = $seqio->next_seq) {
$seqlengths{$seq->id} = $seq->length;
}
$seqio->close;
my ($fh);
my $resfile = $self->resultsfile;
if (-e $resfile) {
if (-z $resfile) {
# No hits found
return;
}
else {
open ($fh, "<$resfile") or $self->throw("Error opening ", $resfile, " \n");
}
}
my (@fps, %printsac, $seq_id);
while (<$fh>) {
my $line = $_;
chomp $line;
# Pattern match the Sn; field which should contain the SequenceId and Accession
if ($line =~ s/^Sn;//) { # We have identified a Sn; line so there should be the following:
#ENSP00000003603 Gene:ENSG00000000003 Query:AL035608 Contig:AL035608.00001 Chr:chrX basepair:97227305
($seq_id) = $line =~ /^\s*(\w+)/;
}
if ($line =~ s/^1TBH//) {
my ($id) = $line =~ /^\s*(\w+)/;
my ($ac) = $line =~ /(PR\w+);?\s*$/;
$printsac{$id} = $ac;
}
if ($line =~ s/^3TB//) {
if ($line =~ s/^[HN]//) {
my ($num,$temp1,$tot1);
# Grab these lines
# 1433ZETA 1 of 6 88.19 1328 1.00e-16 ELTVEERNLLSVAYKNVIGARRASWRIITS 30 35 36 48
$line =~ s/^\s+//;
my @elements = split /\s+/, $line;
# Name each of the elements in the array
my ($fingerprintName,
$motifNumber,
$temp,
$tot,
$percentageIdentity,
$profileScore,
$pvalue,
$subsequence,
$motifLength,
$lowestMotifPosition,
$matchPosition,
$highestMotifPosition) = @elements;
# If protein is 10,000+ residues (i.e. titin), then last two elements are merged, e.g.:
# VEGFRECEPTOR 5 of 6 39.20 406 1.05e-05 LIVRNARKENAGKYTLVL 18 374 13564392
if (!defined $highestMotifPosition) {
# First five characters of $matchPosition is actual $matchPosition
$highestMotifPosition = $matchPosition;
$matchPosition = substr($highestMotifPosition, 0, 5, '');
}
my $start = $matchPosition;
my $end = $matchPosition + $motifLength - 1;
# Prints sometimes reports -1 for the motif position start;
$lowestMotifPosition = 1 if $lowestMotifPosition == -1;
$highestMotifPosition = $lowestMotifPosition + $motifLength -1
if $highestMotifPosition == -1;
# It's possible that near the ends of the sequence,
# fragment matches are found. These can result in a
# start < 1 or end > seq_length. Make a policy decision
# prune these back in the sequence, but not the hit
$start = 1 if $start < 1;
$end = $seqlengths{$seq_id} if $end > $seqlengths{$seq_id};
my $fp = $self->create_protein_feature($start, $end,
$profileScore,
$seq_id,
$lowestMotifPosition,
$highestMotifPosition,
$printsac{$fingerprintName},
$self->analysis,
$pvalue,
$percentageIdentity);
push @fps, $fp;
}
}
}
close($fh);
$self->output(\@fps);
}
1;