Raw content of Bio::EnsEMBL::Analysis::RunnableDB::Finished::HalfwiseHMM
#
#
# Cared for by EnsEMBL
# Copyright GRL & EBI
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::Finished::HalfwiseHMM
=head1 SYNOPSIS
my $obj = Bio::EnsEMBL::Analysis::RunnableDB::Finished::HalfwiseHMM->new(
-db => $db,
-input_id => $id
);
$obj->fetch_input
$obj->run
my @newfeatures = $obj->output;
=head1 DESCRIPTION
runs HalfwiseHMM runnable and converts it output into genes which can be stored in an ensembl database
=head1 CONTACT
lec@sanger.ac.uk
refactored by Sindhu K. Pillai sp1@sanger.ac.uk
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
# Let the code begin...
package Bio::EnsEMBL::Analysis::RunnableDB::Finished::HalfwiseHMM;
use Bio::EnsEMBL::Analysis::RunnableDB::Finished;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Analysis::Runnable::Finished::HalfwiseHMM;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Translation;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::DBSQL::DBConnection;
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning);
use Bio::EnsEMBL::Analysis::Config::General;
use Bio::EnsEMBL::DBEntry;
use Bio::EnsEMBL::Analysis::Config::GeneBuild::Similarity qw (
GB_SIMILARITY_DATABASES
);
use Data::Dumper;
use BlastableVersion;
@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::Finished);
=head2 new
Arg : all those inherited from RunnableDB
Function : Make a new HalfwiseHMM object defining the above variables
Exception: thrown if no input id is provided, this is found in RunnableDB
Caller :
Example : $runnable = Bio::EnsEMBL::Analysis::RunnableDB::HalfwiseHMM new->(-db => $db
-INPUT_ID => $id
-ANALYSIS => $analysis);
=cut
sub new {
my ( $new, @args ) = @_;
my $self = $new->SUPER::new(@args);
# db, input_id, seqfetcher, and analysis objects are all set in
# in superclass constructor (RunnableDB.pm)
my ( $type, $threshold ) = rearrange( [qw(TYPE THRESHOLD)], @args );
$self->{' '} = []; #create key to an array of feature pairs
return $self;
}
sub type {
my ( $self, $type ) = @_;
if ( defined($type) ) {
$self->{_type} = $type;
}
return $self->{_type};
}
sub threshold {
my ( $self, $threshold ) = @_;
if ( defined($threshold) ) {
$self->{_threshold} = $threshold;
}
return $self->{_threshold};
}
=head2 fetch_input
Arg : none
Function : fetches the repeatmasked sequence and the uniprot features for the slice being run on and creates the HalfwiseHMM Runnable
Exception: throws if no input_id has been provided
Caller :
Example :
=cut
sub fetch_input {
my ($self) = @_;
my %ests;
my @estseqs;
throw("No input id") unless defined( $self->input_id );
my $sliceid = $self->input_id;
my $sa = $self->db->get_SliceAdaptor();
my $slice = $sa->fetch_by_name($sliceid);
$slice->{'seq'} = $slice->seq();
$self->query($slice);
my $maskedslice =
$slice->get_repeatmasked_seq( $ANALYSIS_REPEAT_MASKING, $SOFT_MASKING )
or throw("Unable to fetch contig");
my $alignAdaptor = $self->db->get_ProteinAlignFeatureAdaptor();
my $stateinfocontainer = $self->db->get_StateInfoContainer;
my $analysis_adaptor = $self->db->get_AnalysisAdaptor();
foreach my $database ( @{$GB_SIMILARITY_DATABASES} ) {
my $analysis_ln = $database->{'type'};
my $threshold = $database->{'threshold'};
my $fps = [];
my $features =
$alignAdaptor->fetch_all_by_Slice_and_pid( $slice,
$threshold,
$analysis_ln );
print STDERR
"Number of features matching threshold $database->{'threshold'} = "
. scalar(@$features) . "\n";
foreach my $f (@$features) {
if ( UNIVERSAL::isa( $f, "Bio::EnsEMBL::FeaturePair" )
&& defined( $f->hseqname ) )
{
push( @$fps, $f );
}
}
my $runnable =
Bio::EnsEMBL::Analysis::Runnable::Finished::HalfwiseHMM->new(
'-query' => $maskedslice,
'-features' => $fps,
'-pfamdb' => $self->getPfamDB(),
'-options' => $self->analysis->parameters(),
'-program' => $self->analysis->program(),
'-analysis' => $self->analysis
);
$self->runnable($runnable);
# set the db version searched which is a concatenation of Uniprot and Pfam db versions
my $ana = $analysis_adaptor->fetch_by_logic_name($analysis_ln);
my $uniprot_db_version = $stateinfocontainer->fetch_db_version($self->input_id,$ana);
my $pfam_db_version = $self->get_db_version();
$self->db_version_searched(join('_',$uniprot_db_version,$pfam_db_version));
}
}
sub make_hash_from_meta_value {
my ( $self, $string ) = @_;
if ($string) {
my $hash = { eval $string };
if($@) {
die "error evaluating $string [$@]";
} else {
return $hash || {};
}
}
return {};
}
sub getPfamDB {
my ($self) = @_;
unless ( $self->{'_pfam_db'} ) {
my $pfam_meta = $self->db->get_MetaContainer();
my $value = $pfam_meta->list_value_by_key('pfam_db')
|| throw("please enter pfam_db key - value into meta table\n");
my $pfam_db_conn = $self->make_hash_from_meta_value( $value->[0] );
# Use the Blast tracking system to set the correct Pfam DB name
my $db_file_path = $self->analysis->db_file;
my $db_file_version = $self->get_db_version($db_file_path);
if ( $db_file_version =~ /^(\d+).(\d+)$/ ) {
$pfam_db_conn->{'-dbname'} = "pfam_$1_$2";
}
elsif ( $db_file_version =~ /^(\d+)$/ ) {
$pfam_db_conn->{'-dbname'} = "pfam_$1_0";
}
$self->{'_pfam_db'} =
Bio::EnsEMBL::DBSQL::DBConnection->new(%$pfam_db_conn);
}
return $self->{'_pfam_db'};
}
=head2 get_db_version
Title : get_db_version
[ distinguished from RunnableDB::*::db_version_searched() ]
Useage : $self->get_db_version('/data/base/path')
$obj->get_db_version()
Function: Set a blast database version from the supplied path
Get a blast database version from previously supplied path
Uses tjrc''s BlastableVersion module.
Returns : String
Args : String (should be a full database path)
Caller : $self::fetch_databases()
RunnableDB::Finished_EST::db_version_searched()
=cut
sub get_db_version {
my ( $self, $db ) = @_;
my $debug_this = 1; # this just shows debug info.
my $force_dbi = 1; # this will force a dbi call SLOW!!!!!!
unless ( $self->{'_pfam_db_version'} ) {
if ($db) {
$BlastableVersion::debug = $debug_this;
warning
"BlastableVersion is cvs revision $BlastableVersion::revision \n"
if $debug_this;
my $ver = eval {
my $blast_ver = BlastableVersion->new();
$blast_ver->force_dbi($force_dbi); # if set will be SLOW.
$blast_ver->get_version($db);
$blast_ver;
};
throw("I failed to get a BlastableVersion for $db [$@]") if $@;
my $dbv = $ver->version();
my $sgv = $ver->sanger_version();
my $name = $ver->name();
my $date = $ver->date();
unless ($dbv) {
throw( "I know nothing about $db I tried to find out:\n"
. " - name <"
. $name . ">\n"
. " - date <"
. $date . ">\n"
. " - version <"
. $dbv . ">\n"
. " - sanger_version <"
. $sgv
. ">\n" );
}
$self->{'_pfam_db_version'} = $dbv;
}
else {
throw("Must provide a db file name as argument");
}
}
return $self->{'_pfam_db_version'};
}
sub db_version_searched {
my ( $self, $arg ) = @_;
$self->{'_db_version_searched'} = $arg if $arg;
return $self->{'_db_version_searched'};
}
=head2 runnable
Arg : a Bio::EnsEMBL::Analysis::Runnable
Function : Gets/sets the runnable
Exception: throws if argument passed isn't a runnable
Caller :
Example :'
=cut
sub runnable {
my ( $self, $arg ) = @_;
if ( !defined( $self->{'_seqfetchers'} ) ) {
$self->{'_seqfetchers'} = [];
}
if ( defined($arg) ) {
throw("[$arg] is not a Bio::EnsEMBL::Analysis::Runnable")
unless $arg->isa("Bio::EnsEMBL::Analysis::Runnable");
push( @{ $self->{_runnable} }, $arg );
}
return @{ $self->{_runnable} };
}
sub run {
my ($self) = @_;
foreach my $runnable ( $self->runnable ) {
$runnable || throw("Can't run - no runnable object");
print STDERR "using " . $runnable . "\n";
eval { $runnable->run; };
if ( my $err = $@ ) {
chomp $err;
$self->failing_job_status($1)
if $err =~ /^\"([A-Z_]{1,40})\"$/i; # only match '"ABC_DEFGH"'
throw("$@");
}
}
$self->_convert_output();
}
=head2 write_output
Arg : none
Function : writes the converted output to the database as genes
Exception: none
Caller :
Example :
=cut
sub write_output {
my ($self) = @_;
my @times = times;
print STDERR "started writing @times \n";
my @genes = $self->output();
my $db = $self->db();
my $gene_adaptor = $db->get_GeneAdaptor;
my $dbh = $db->dbc->db_handle;
$dbh->begin_work;
eval {
GENE: foreach my $gene (@genes)
{
$gene_adaptor->store($gene,0);
}
$dbh->commit;
};
if ($@) {
$dbh->rollback;
throw("UNABLE TO WRITE GENES IN DATABASE\n[$@]\n");
}
@times = times;
print STDERR "finished writing @times \n";
return 1;
}
=head2 _convert_output
Arg : none
Function : takes the features from the halfwise runnable and runs _make_genes to convert them into Bio::EnsEMBL::Genes with appropriately attached exons and supporting evidence
Exception: thows if there are no analysis types
Caller :
Example :
=cut
sub _convert_output {
my ($self) = @_;
my @genes;
my $analysis = $self->analysis();
my $genetype = 'Halfwise';
# make an array of genes for each runnable
my @out;
foreach my $runnable ( $self->runnable ) {
push( @out, $runnable->output );
$self->pfam_lookup( $runnable->pfam_lookup )
if $runnable->can('pfam_lookup');
}
my @g = $self->_make_genes( $genetype, $analysis, \@out );
push( @genes, @g );
$self->output(@genes);
}
# get/set for lookup multi-valued hash { pfam_id => [pfam_acc, pfam_desc], ... }
# can append multiple to the lookup (see { %{$self->{'_pfam_lookup'}}, %{$hash_ref} })
sub pfam_lookup {
my ( $self, $hash_ref ) = @_;
if ( ref($hash_ref) eq 'HASH' ) {
$self->{'_pfam_lookup'} ||= {};
$self->{'_pfam_lookup'} =
{ %{ $self->{'_pfam_lookup'} }, %{$hash_ref} };
}
return $self->{'_pfam_lookup'};
}
=head2 _make_genes
Arg : runnable being run and analysis object being used
Function : converts the seqfeatures outputed by the runnable and actually converts them into Bio::EnsEMBL::Genes
Exception: none
Caller :
Example :
=cut
=head2 _make_genes
Title : make_genes
Usage : $self->make_genes
Function: makes Bio::EnsEMBL::Genes out of the output from runnables
Returns : array of Bio::EnsEMBL::Gene
Args : $genetype: string
$analysis_obj: Bio::EnsEMBL::Analysis
$results: reference to an array of FeaturePairs
=cut
sub _make_genes {
my ( $self, $genetype, $analysis_obj, $results ) = @_;
my $sliceid = $self->input_id;
my $sa = $self->db->get_SliceAdaptor();
my $slice = $sa->fetch_by_name($sliceid);
my @genes;
my $info_type = 'MISC';
my $info_text = 'Relationship generated from genewisedb search';
# fetch lookup multi-valued hash { pfam_id => [pfam_acc, pfam_desc], ... }
my $pfam_lookup = $self->pfam_lookup();
my $pfam_release = $self->get_db_version();
$self->_check_that_external_db_table_populated( $pfam_release, 'PFAM',
'XREF' );
foreach my $tmp_gene (@$results) {
my $pfam_id = $tmp_gene->seqname();
my $acc_ver = $pfam_lookup->{$pfam_id}->[0];
my @pfamacc_ver = split /\./, $acc_ver;
my $dbentry = Bio::EnsEMBL::DBEntry->new(
-primary_id => $pfamacc_ver[0],
-display_id => $pfam_id,
-version => $pfamacc_ver[1],
-release => $pfam_release,
-dbname => "PFAM",
-description => $pfam_lookup->{$pfam_id}->[1],
-info_type => $info_type,
-info_text => $info_text
);
$dbentry->status('XREF');
my $gene = Bio::EnsEMBL::Gene->new();
my $transcript =
$self->_make_transcript( $tmp_gene, $slice, $genetype,
$analysis_obj );
$gene->biotype($genetype);
$gene->analysis($analysis_obj);
$gene->add_Transcript($transcript);
$gene->add_DBEntry($dbentry);
$gene->display_xref();
push( @genes, $gene );
}
return @genes;
}
sub _check_that_external_db_table_populated {
my ( $self, $release, $name, $status ) = @_;
$status ||= 'XREF';
my $db = $self->db();
my $find_tuple_sql = qq(SELECT count(*) AS tuple_exists
FROM external_db
WHERE db_name = ?
&& db_release = ?);
my $sth = $db->prepare($find_tuple_sql);
$sth->execute( $name, $release );
my $tuple = $sth->fetchrow_hashref() || {};
$sth->finish();
# if there is one return do nothing and the job can get on as normal
return if $tuple->{'tuple_exists'};
# else lock table external_db write
$sth = $db->prepare("LOCK TABLES external_db WRITE");
$sth->execute();
$sth->finish();
# get the next external_db_id
my $max_db_id_sql =
q`SELECT MAX(external_db_id) + 1 AS next_db_id from external_db`;
$sth = $db->prepare($max_db_id_sql);
$sth->execute();
my $row = $sth->fetchrow_hashref || {};
my $max_db_id = $row->{'next_db_id'} || warning "Error";
# insert the row
my $insert_sql =
q`INSERT INTO external_db (external_db_id, db_name, db_release, status) VALUES(?, ?, ?, ?)`;
$sth = $db->prepare($insert_sql);
$sth->execute( $max_db_id, $name, $release, $status );
$sth->finish();
# unlock tables;
$sth = $db->prepare("UNLOCK TABLES");
$sth->execute();
return $max_db_id;
}
=head2 _make_transcript
Title : make_transcript
Usage : $self->make_transcript($gene, $slice, $genetype)
Function: makes a Bio::EnsEMBL::Transcript from a SeqFeature representing a gene,
with sub_SeqFeatures representing exons.
Example :
Returns : Bio::EnsEMBL::Transcript with Bio::EnsEMBL:Exons(with supporting feature
data), and a Bio::EnsEMBL::translation
Args : $gene: Bio::EnsEMBL::SeqFeatureI, $contig: Bio::EnsEMBL::RawContig,
$genetype: string, $analysis_obj: Bio::EnsEMBL::Analysis
=cut
sub _make_transcript {
my ( $self, $gene, $slice, $genetype, $analysis_obj ) = @_;
$genetype = 'unspecified' unless defined($genetype);
unless ( $gene->isa("Bio::EnsEMBL::SeqFeatureI") ) {
print "$gene must be Bio::EnsEMBL::SeqFeatureI\n";
}
my $transcript = Bio::EnsEMBL::Transcript->new();
my $translation = Bio::EnsEMBL::Translation->new();
$transcript->translation($translation);
$transcript->analysis($analysis_obj);
my $excount = 1;
my @exons;
foreach my $exon_pred ( $gene->sub_SeqFeature ) {
# make an exon
my $exon = Bio::EnsEMBL::Exon->new();
my $sa = $slice->adaptor();
$exon->display_id( $sa->get_seq_region_id($slice) );
$exon->start( $exon_pred->start );
$exon->end( $exon_pred->end );
$exon->strand( $exon_pred->strand );
$exon->phase( $exon_pred->phase || 0 );
$exon->end_phase(0);
$exon->slice($slice);
# sort out supporting evidence for this exon prediction
foreach my $subf ( $exon_pred->sub_SeqFeature ) {
$subf->feature1->seqname( $slice->get_seq_region_id );
$subf->feature1->score(100);
$subf->feature1->analysis($analysis_obj);
$subf->feature2->score(100);
$subf->feature2->analysis($analysis_obj);
$exon->add_Supporting_Feature($subf);
}
push( @exons, $exon );
$excount++;
}
if ( @exons < 0 ) {
# printSTDERR "Odd. No exons found\n";
}
else {
if ( $exons[0]->strand == -1 ) {
@exons = sort { $b->start <=> $a->start } @exons;
}
else {
@exons = sort { $a->start <=> $b->start } @exons;
}
foreach my $exon (@exons) {
$transcript->add_Exon($exon);
}
$translation->start_Exon( $exons[0] );
$translation->end_Exon( $exons[$#exons] );
if ( $exons[0]->phase == 0 ) {
$translation->start(1);
}
elsif ( $exons[0]->phase == 1 ) {
$translation->start(3);
}
elsif ( $exons[0]->phase == 2 ) {
$translation->start(2);
}
$translation->end( $exons[$#exons]->end - $exons[$#exons]->start + 1 );
}
return $transcript;
}
=head2 output
Title : output
Usage :
Function: get/set for output array
Example :
Returns : array of Bio::EnsEMBL::Gene
Args :
=cut
sub output {
my ( $self, @genes ) = @_;
if ( !defined( $self->{'_output'} ) ) {
$self->{'_output'} = [];
}
if ( scalar(@genes) ) {
push( @{ $self->{'_output'} }, @genes );
}
return @{ $self->{'_output'} };
}
1;