Raw content of Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio
package Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio;
use strict;
use warnings;
use Bio::EnsEMBL::Analysis::Runnable;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable);
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($matrix) = rearrange(['MATRIX'], @args);
$self->matrix($matrix);
return $self;
}
#containters
=head2 matrix
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio
Arg [2] : string, matrix file
Function : container for path to matrix file, when passed a filename it
will use the find_file method to check for its existance
Returntype: string
Exceptions:
Example :
=cut
sub matrix{
my ($self, $matrix) = @_;
if($matrix){
my $file = $self->find_file($matrix);
$self->{'matrix'} = $file;
}
return $self->{'matrix'};
}
=head2 peptides
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio
Arg [2] : string, peptide sequence from genscan results
Function : container for the peptides sequences from genscan results
Returntype: arrayref
Exceptions:
Example :
=cut
sub peptides{
my ($self, $peptides) = @_;
if($peptides){
$self->{'peptides'} = $peptides;
}
return $self->{'peptides'};
}
=head2 exon_groups
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio
Arg [2] : string, group name
Arg [3] : Bio::EnsEMBL::PredictionExon
Function : stores exons in arrayrefs in a hash keyed on group names
Returntype: hashref
Exceptions: throws if passed an exon which isnt a
Bio::EnsEMBL::PredictionExon
Example :
=cut
sub exon_groups{
my ($self, $group, $exon) = @_;
if(!$self->{'exon_groups'}){
$self->{'exon_groups'} = {};
}
if($group && $exon){
if(!$exon->isa('Bio::EnsEMBL::PredictionExon')){
throw("must be passed an exon object ".
"Bio::EnsEMBL::PredictionExon not an $exon ".
"BaseAbInitio:exon_groups");
}
if(!$self->{'exon_groups'}->{$group}){
$self->{'exon_groups'}->{$group} = [];
}
push(@{$self->{'exon_groups'}->{$group}}, $exon);
}
return $self->{'exon_groups'};
}
#utility methods
=head2 run_analysis
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio
Arg [2] : string, program name
Function : create and open a commandline for one
of the ab initio gene finding programs
Returntype: none
Exceptions: throws if the program in not executable or the system
command fails to execute
Example :
=cut
sub run_analysis{
my ($self, $program) = @_;
if(!$program){
$program = $self->program;
}
throw($program." is not executable BaseAbInitio::run_analysis ")
unless($program && -x $program);
my $command = $program." ".$self->matrix." ".$self->queryfile." > ".
$self->resultsfile;
print "Running analysis ".$command."\n";
system($command) == 0 or throw("FAILED to run ".$command);
}
=head2 create_transcripts
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio
Function : use the groups of exons to create prediction transcripts
Returntype: none
Exceptions: none
Example :
=cut
sub create_transcripts{
my ($self) = @_;
my @transcripts;
my $ff = $self->feature_factory;
my %exon_groups = %{$self->exon_groups};
foreach my $group(keys(%exon_groups)){
my @exons = @{$exon_groups{$group}};
my $transcript = $ff->create_prediction_transcript(\@exons, $self->query,
$self->analysis);
$transcript->seqname($group);
push(@transcripts, $transcript);
}
$self->output(\@transcripts);
}
=head2 calculate_phases
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::BaseAbInitio
Function : works out which phase to make the exons to get
a complete cds
Returntype: none
Exceptions: throws if it cant find a translation
for a transcript
Example :
=cut
sub calculate_phases{
my ($self) = @_;
my @phases = (0, 1, 2);
my @output;
my $ff = $self->feature_factory;
my $peptides = $self->peptides;
TRANS:foreach my $trans(@{$self->output}){
my @exons = @{$trans->get_all_Exons};
foreach my $phase(@phases){
my @temp_exons = @{$self->set_phases($phase, \@exons)};
my $new = $ff->create_prediction_transcript(\@temp_exons,
$self->query,
$self->analysis);
my $pep = $new->translate->seq;
my $peptide = $peptides->{$trans->seqname};
my ($ensembl, $genscan) = $self->subsitute_x_codes($pep,
$peptide);
$ensembl =~ s/^x//i;
$ensembl =~ s/x$//i;
$genscan =~ s/^x//i;
$genscan =~ s/x$//i;
if ($ensembl =~ /$genscan/){
push(@output, $new);
next TRANS;
}
}
throw("Failed to find translation for ".$trans." ".$exons[0]->seqname)
}
$self->clean_output;
$self->output(\@output);
}
=head2 set_phases
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Genscan
Arg [2] : number, start phase,
Arg [3] : arrayref of Bio::EnsEMBL::PredictionExons
Function : starting with the given phase sets of the phase of
each exon on the basis of the end phase of the last. This is done
after ordering them on the basis of there strand
Returntype: arrayref of
Exceptions:
Example :
=cut
sub set_phases{
my ($self, $start_phase, $exons) = @_;
if(@$exons == 0){
throw("Can't set phases if have no exons ".$exons." ".@$exons);
}
if ($exons->[0]->strand == 1) {
@$exons = sort {$a->start <=> $b->start} @$exons;
} else {
@$exons = sort {$b->start <=> $a->start} @$exons;
}
foreach my $e(@$exons){
$e->phase($start_phase);
$start_phase = ($e->phase + $e->length)%3;
}
return $exons;
}
=head2 subsitute_x_codes
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::Genscan
Arg [2] : string, ensembl produced peptides
Arg [3] : string, genscan predicted peptide
Function : makes sure x's and length's of peps are the same
to allow fair comparison
Returntype: string, string
Exceptions:
Example :
=cut
sub subsitute_x_codes{
my ($self, $ensembl_pep, $genscan_pep) = @_;
$genscan_pep =~ s/\*$//;
my $ens_len = length($ensembl_pep);
my $gen_len = length($genscan_pep);
if($ens_len == ($gen_len+1)){
chop($ensembl_pep);
}
if($gen_len == ($ens_len+1)){
chop($genscan_pep);
}
$ens_len = length($ensembl_pep);
$gen_len = length($genscan_pep);
my $x = 0;
while (($x = index($ensembl_pep, 'X', $x)) != -1) {
substr($genscan_pep, $x, 1) = 'X'
if length($genscan_pep) >= length($ensembl_pep);
$x++;
}
$x = 0;
while (($x = index($genscan_pep, 'X', $x)) != -1) {
substr($ensembl_pep, $x, 1) = 'X'
if length($ensembl_pep) >= length($genscan_pep);
$x++;
}
return $ensembl_pep, $genscan_pep;
}
1;