Raw content of Bio::EnsEMBL::Analysis::Runnable::Pmatch
package Bio::EnsEMBL::Analysis::Runnable::Pmatch;
use strict;
use warnings;
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Analysis::Tools::Logger qw(logger_info);
use Bio::EnsEMBL::Analysis::Tools::Pmatch::First_PMF;
use Bio::SeqIO;
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($protein_file, $protein_length, $max_intron_length, $min_coverage) =
rearrange(['PROTEIN_FILE', 'PROTEIN_LENGTHS', 'MAX_INTRON_LENGTH',
'MIN_COVERAGE'], @args);
###SETTING DEFAULTS###
$self->program('pmatch') if(!$self->program);
$self->max_intron_length(50000);
$self->min_coverage(25);
#######################
$self->protein_file($protein_file);
throw("Need to pass in a defined readable protein file not ".$protein_file)
unless($self->protein_file && (-s $self->protein_file));
$self->protein_lengths($protein_length);
$self->max_intron_length($max_intron_length);
$self->min_coverage($min_coverage);
return $self;
}
sub protein_file{
my ($self, $arg) = @_;
if($arg){
$self->{'protein_file'} = $arg;
}
return $self->{'protein_file'};
}
sub protein_lengths{
my ($self, $arg) = @_;
if($arg){
throw("Runnable::Pmatch::protein_lengths must be passed a hashref not a ".$arg)
unless(ref($arg) eq 'HASH');
$self->{'protein_lengths'} = $arg;
}
if(!$self->{'protein_lengths'}){
my $hash = $self->make_protein_length_hash;
$self->{'protein_lengths'} = $hash;
}
return $self->{'protein_lengths'};
}
sub max_intron_length{
my ($self, $arg) = @_;
if($arg){
$self->{'max_intron_length'} = $arg;
}
return $self->{'max_intron_length'};
}
sub min_coverage{
my ($self, $arg) = @_;
if($arg){
$self->{'min_coverage'} = $arg;
}
return $self->{'min_coverage'};
}
sub make_protein_length_hash{
my ($self) = @_;
my %hash;
my $seqio = Bio::SeqIO->new(-format => 'Fasta',
-file => $self->protein_file);
while(my $seq = $seqio->next_seq){
$hash{$seq->id} = $seq->length;
}
return \%hash;
}
sub run_analysis{
my ($self, $program) = @_;
if(!$program){
$program = $self->program;
}
throw($program." is not executable Pmatch::run_analysis ")
unless($program && -x $program);
my $command = $self->program." -D ".$self->options." ".$self->protein_file." ".
$self->queryfile." > ".$self->resultsfile;
print "Running analysis ".$command."\n";
logger_info("Running analysis ".$command);
system($command) == 0 or throw("FAILED to run ".$command);
}
sub parse_results {
my ($self) = @_;
my (%prot_ids, @all_merged_hits);
open RES, $self->resultsfile or $self->throw("Could not open " . $self->resultsfile . " results file\n");
while() {
my @cols = split;
$prot_ids{$cols[5]}++;
}
close(RES);
my @idlist = sort keys %prot_ids;
my @lists;
# chop list into group ensuring that the total number of features
# in each group does not exceed 1M
foreach my $id (sort keys %prot_ids) {
my $size = $prot_ids{$id};
if (not @lists or $lists[-1]->{size} + $size > 1000000) {
push @lists, {
size => 0,
ids => [],
};
}
push @{$lists[-1]->{ids}}, $id;
$lists[-1]->{size} += $size;
}
foreach my $list (@lists) {
my %these_ids = map { $_ => 1 } @{$list->{ids}};
my (%hits);
open RES, $self->resultsfile or $self->throw("Could not re-open " . $self->resultsfile . " results file\n");
while() {
my @cols = split;
if (exists $these_ids{$cols[5]}) {
push @{$hits{$cols[5]}}, [$cols[2], $_];
}
}
close(RES);
foreach my $id (keys %hits) {
my $pmf = new Bio::EnsEMBL::Analysis::Tools::Pmatch::First_PMF(
-plengths => $self->protein_lengths,
-maxintronlen => $self->max_intron_length,
-min_coverage => $self->min_coverage,
);
foreach my $el (sort { $a->[0] <=> $b->[0] } @{$hits{$id}}) {
$_ = $el->[1];
$pmf->make_coord_pair($_);
}
my @merged_hits = @{$pmf->merge_hits};
push @all_merged_hits, @merged_hits;
}
}
my $ff = $self->feature_factory;
foreach my $hit (@all_merged_hits) {
my $start = $hit->qstart;
my $end = $hit->qend;
if($start > $end){
($start, $end) = ($end, $start);
}
my $hstart = 1;
my $hlength = ($end - $start +1)/3;
my $hend = sprintf("%.0f", $hlength);
my $fp = $ff->create_feature_pair($start,
$end,
$hit->strand,
$hit->coverage,
$hstart,
$hend,
1,
$hit->target,
100,
0,
$hit->query,
$self->query,
$self->analysis);
my $paf = Bio::EnsEMBL::DnaPepAlignFeature->new(-features => [$fp]);
$self->output([$paf]);
}
}