Raw content of Bio::EnsEMBL::Analysis::Runnable::Finished::HalfwiseHMM =pod =head1 NAME Bio::EnsEMBL::Analysis::Runnable::Finished::HalfwiseHMM =head1 SYNOPSIS my $obj = Bio::EnsEMBL::Analysis::Runnable::Finished::HalfwiseHMM->new( -genomic => $seq, -features => \@features, ); $obj->run my @newfeatures = $obj->output; or something similar???? =head1 DESCRIPTION Finds which pfam domains a particular swissprot hit matches, finds the related hmms and runs genewiseHmm =head1 CONTACT rds@sanger.ac.uk refactored by Sindhu K. Pillai B<sp1@sanger.ac.uk> =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::EnsEMBL::Analysis::Runnable::Finished::HalfwiseHMM; use vars qw(@ISA); use strict; # Object preamble - inherits from Bio::EnsEMBL::Root; use Bio::EnsEMBL::Analysis::Runnable; use Bio::EnsEMBL::FeaturePair; use Bio::EnsEMBL::SeqFeature; use Bio::EnsEMBL::Analysis; use Bio::EnsEMBL::Analysis::Tools::FeatureFilter; use Bio::PrimarySeq; use Bio::Seq; use Bio::SeqIO; use Bio::EnsEMBL::Root; use Bio::Tools::BPlite; use Bio::EnsEMBL::Analysis::Runnable::Blast; use Bio::EnsEMBL::Analysis::Runnable::Finished::GenewiseHmm; use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use Bio::EnsEMBL::Utils::Exception qw(throw warning); use BlastableVersion; #use DB_File; use Fcntl; @ISA = qw(Bio::EnsEMBL::Analysis::Runnable); =head2 new Arg : Bio:Seq obj, reference to array of uniprot features, location of hmmfetch program, location of hmmdb, location of dbm index Function : Make a new HalfwiseHMM object defining the above variables Exception: Will throw an exception if no genomic sequence is defined or no features are passed Caller : Example : $halfwise = Bio::EnsEMBL::Analysis::Runnable::HalfwiseHMM->new(genomic => $seq features => \@features); =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->{'_query'} = undef; $self->{'_input_features'} = []; $self->{'_output_features'} = []; $self->{'_hmmfetch'} = undef; $self->{'_hmmdb'} = undef; $self->{'_hmmfilename'} = undef; #file to store protein profile hmms $self->{'_errorfile'} = undef; #file to store any errors hmmfetch throw $self->{'_dbm_file'} = undef; #print STDERR "args = @args\n"; my ($genomic, $features, $hmmfetch, $hmmdb, $pfamdb, $memory, $options, $analysis,$program) = rearrange([qw(QUERY FEATURES HMMFETCH HMMDB PFAMDB MEMORY OPTIONS ANALYSIS PROGRAM)], @args); throw("No genomic sequence input") unless defined($genomic); $self->query($genomic); throw("No features input") unless defined($features); $self->add_input_features($features); throw("No pfam database") unless $pfamdb; $self->pfamDB($pfamdb); $self->options($options); $self->program($program); throw("No genomic sequence input") unless defined($analysis); $self->analysis($analysis); if ($hmmfetch){ $self->hmmfetch($hmmfetch); } else { $self->hmmfetch('/usr/local/ensembl/bin/hmmfetch'); } if($hmmdb){ $self->hmmdb($hmmdb); } else { $self->hmmdb('/data/blastdb/Finished/Pfam_ls'); } $self->memory ($memory) if (defined($memory)); return $self; # success - we hope! } ################# #GET/SET METHODS# ################# sub analysis { my ( $self, $analysis ) = @_; if ($analysis) { $self->{'_analysis'} = $analysis; } return $self->{'_analysis'}; } sub pfamDB{ my ($self, $dbobj) = @_; $self->{'_pfamDB'} = $dbobj if $dbobj; throw("Not a Bio::EnsEMBL::DBSQL::DBAdaptor") unless $self->{'_pfamDB'}->isa("Bio::EnsEMBL::DBSQL::DBConnection"); return $self->{'_pfamDB'}; } sub pfam_db_version{ my ($self) = @_; unless($self->{'_pfam_db_version'}){ my $db = $self->pfamDB(); $self->{'_pfam_db_version'} = $db->get_meta_value_by_key('version'); } return $self->{'_pfam_db_version'}; } sub pfam_ls_version{ my ($self) = @_; unless($self->{'_pfam_ls_version'}){ my $ver = BlastableVersion->new($self->hmmdb); # $self->{'_pfam_ls_version'} = $ver->version(); $self->{'_pfam_ls_version'} = $ver->get_version($self->hmmdb); } return $self->{'_pfam_ls_version'}; } sub program{ my ($self, $program) = @_; $self->{'_program'} = $program if $program; return $self->{'_program'}; } =head2 query Arg : Bio:Seq object Function : get/set method for query Exception: throws an exception if not passed a Bio:PrimarySeq Caller : =cut sub query { my( $self, $value ) = @_; if ($value) { #need to check if passed sequence is Bio::Seq object $value->isa("Bio::PrimarySeqI") || throw("Input isn't a Bio::PrimarySeqI"); $self->{'_query'} = $value; } return $self->{'_query'}; } =head2 add_input_features Arg : reference to array of features Function : get/set method for input features Exception: throws an exception if not passed an array ref Caller : =cut sub add_input_features{ my ($self, $features) = @_; if (ref($features) eq "ARRAY") { push(@{$self->{'_input_features'}},@$features); } else { throw("[$features] is not an array ref."); } } =head2 all_input_features Arg : none Function : returns all input features Exception: none Caller : =cut sub all_input_features{ my ($self) = @_; return @{$self->{'_input_features'}}; } #methods that probably won't be needed =head2 hmmfetch Arg : location of hmmfetch program Function : gets/sets location of hmmfetch program Exception: none Caller : =cut sub hmmfetch { my ($self, $args) =@_; if (defined($args)){ $self->{'_hmmfetch'} = $args; } return $self->{'_hmmfetch'}; } =head2 hmmdb Arg : location of hmmdb Function : gets/sets location of hmmdb Exception: none Caller : =cut sub hmmdb { my ($self, $args) =@_; if (defined($args)){ $self->{'_hmmdb'} = $args; } return $self->{'_hmmdb'}; } =head2 hmmfilename Arg : hmm filename Function : gets/sets hmmfilename Exception: none Caller : =cut sub hmmfilename { my ($self, $args) = @_; if (defined ($args)){ $self->{'_hmmfilename'} = $args; } return $self->{'_hmmfilename'}; } =head2 dbm_file Arg : dbm filename Function : gets/sets dbm filename Exception: none Caller : =cut sub dbm_file { my ($self, $args) = @_; if (defined ($args)){ $self->{'_dbm_file'} = $args; } return $self->{'_dbm_file'}; } =head2 memory Arg : value memory to be set to, this is an option of how much memory genewise can use when it is run Function : gets/sets memory Exception: none Caller : =cut sub memory { my ($self,$arg) = @_; if (defined($arg)) { $self->{'_memory'} = $arg; } return $self->{'_memory'} || 400000; } ########## #ANALYSIS# ########## =head2 run Arg : directory if to be set to anything other than tmp Function : runs functions which run genewisehmm Exception: none Caller : =cut sub run { my ($self, $dir) = @_; my $swissprot_ids = $self->get_swissprot_ids(); my $pfam_ids = $self->get_pfam_hmm($swissprot_ids,$dir); #$self->create_genewisehmm($pfam_ids, $dir); #$self->create_genewisehmm_individually($pfam_ids, $dir); $self->create_genewisehmm_complete($pfam_ids); } =head2 get_swissprot_ids Arg : none Function : gets swissprot ids and strands from input features and returns a hash of these Exception: none Caller : =cut ### 1, -1 and 0 are used to repesent strand, 1 and -1 have the same meaning as in teh ensembl databases 1 is the forward strand and -1 is the reverse ### 0 is used if a particular id has hit on both strands sub get_swissprot_ids{ my ($self) = @_; my @features = $self->all_input_features(); my $swissprot_ids; #{} foreach my $f(@features){ my $hname = $f->hseqname(); my $strand = $f->strand(); #print "swissprot id = ".$hname."\n"; if(!defined($swissprot_ids->{$hname})){ $swissprot_ids->{$hname} = $strand; }else{ $swissprot_ids->{$hname} = ($strand eq $swissprot_ids->{$hname} ? $strand : 0); } } return $swissprot_ids; } =head2 get_pfam_ids Arg : reference to a hash of swissprotids and stands Function : gets all pfam ids for a particular swissprot id and puts them in a hash along with the strand the swissprot id is found on and returns its Exception: warns if swissprot id has no pfam domains Caller : =cut sub get_pfam_hmm { my ($self, $uniprot_ids, $dir) = @_; my $pfam_accs; my $pfam_lookup; my $db = $self->pfamDB(); $self->workdir('/tmp') unless($self->workdir($dir)); $self->checkdir(); my $sql = "SELECT DISTINCT(CONCAT(a.pfamA_acc,'.',a.version)), a.pfamA_id, a.description, ls.hmm_ls FROM pfamA_reg_full_significant f, pfamseq p, pfamA a, pfamA_HMM_ls ls WHERE f.auto_pfamseq = p.auto_pfamseq AND f.in_full = 1 AND a.auto_pfamA = f.auto_pfamA AND ls.auto_pfamA = f.auto_pfamA AND p.pfamseq_acc IN "; my @accessions = keys(%$uniprot_ids); map ( s/(\w*(-\d+)?)(\.\d+)?/'$1'/,@accessions); # deal with empty @accessions next unless defined @accessions; $sql .= "(".join(',',@accessions).")"; #print STDOUT $sql."\n"; # create the hmm file $self->hmmfilename("halfwise.$$.hmmdb"); open (HMMFILE, ">".$self->hmmfilename) or die "couldn't create hmm file ".$self->hmmfilename." $!"; #print STDOUT "HMM file is ".$self->hmmfilename."\n"; my $sth = $db->prepare($sql); $sth->execute(); my ($pfam_acc, $pfam_id, $description, $hmm_ls); $sth->bind_columns(\($pfam_acc, $pfam_id, $description, $hmm_ls)); while(my $row = $sth->fetchrow_arrayref()){ $pfam_lookup->{$pfam_id} = [$pfam_acc, $description]; $pfam_accs->{$pfam_acc} = 1; print HMMFILE $hmm_ls; } $sth->finish(); close HMMFILE; $self->pfam_lookup($pfam_lookup); return $pfam_accs; } sub get_pfam_ids{ my ($self, $swissprot_ids) = @_; my $pfam_accs; my $pfam_lookup; # CREATE TEMPORARY TABLE my $tbl_name = "pipeline_tmp_$$"; my $create_table = qq{CREATE TEMPORARY TABLE $tbl_name( pfamseq_id varchar(12) NOT NULL PRIMARY KEY, strand enum('1','0','-1') DEFAULT '0' )TYPE = HEAP }; # should this be HEAP? # There's never gonna be that many matches # to exceed tmp_table_size = 10048576 ??? my $db = $self->pfamDB(); my $sth = $db->prepare($create_table); $sth->execute(); $sth->finish(); # INSERT my (@binds, @values); my $sql = qq{INSERT IGNORE INTO $tbl_name (pfamseq_id, strand) VALUES }; while (my ($swiss_id, $strand) = each(%$swissprot_ids)){ #print STDERR "$swiss_id : $strand\n"; # strip off sequence version from uniprot ID (introduced in Uniprot release 7.0) $swiss_id =~ s/\.\d+//; push(@binds, $swiss_id, $strand); push(@values, qq{ (?, ?)}); } if(scalar(@values)){ $sql .= join(", ", @values); #warning $sql; $sth = $db->prepare($sql); $sth->execute(@binds); $sth->finish(); } # SELECT # my $select = qq{SELECT a.pfamA_acc, t.strand, t.pfamseq_id, a.pfamA_id, a.description # FROM pfamA a, pfamA_reg_full f, pfamseq p, $tbl_name t # WHERE p.auto_pfamseq = f.auto_pfamseq # && a.auto_pfamA = f.auto_pfamA # && p.pfamseq_id = t.pfamseq_id # && f.in_full = 1}; my $select = qq{SELECT CONCAT(a.pfamA_acc,".",a.version), t.strand, t.pfamseq_id, a.pfamA_id, a.description FROM pfamA_reg_full f, pfamseq p, $tbl_name t, pfamA a WHERE f.auto_pfamseq = p.auto_pfamseq && p.pfamseq_acc = t.pfamseq_id && f.in_full = 1 && a.auto_pfamA = f.auto_pfamA;}; $sth = $db->prepare($select); $sth->execute(); my ($pfam_acc, $strand, $swall, $pfam_id, $description); $sth->bind_columns(\($pfam_acc, $strand, $swall, $pfam_id, $description)); while(my $row = $sth->fetchrow_arrayref()){ print STDERR "$swall : $strand : $pfam_acc : $pfam_id\n"; # making pfam_lookup for later ... $pfam_lookup->{$pfam_id} = [$pfam_acc, $description]; if(defined($pfam_accs->{$pfam_acc})){ $pfam_accs->{$pfam_acc} = ($strand eq $pfam_accs->{$pfam_acc} ? $strand : 0); }else{ $pfam_accs->{$pfam_acc} = $strand; } print "\n pfam_acc: $pfam_acc, strand: $strand, uniprot: $swall,pfamid: $pfam_id,description: $description"; } $self->pfam_lookup($pfam_lookup); # store the lookup somewhere useful. return $pfam_accs; } # get/set for lookup multi-valued hash { pfam_id => [pfam_acc, pfam_desc], ... } # can append multiple to the lookup (see { %{$self->{'_pfam_lookup'}}, %{$hash_ref} }) sub pfam_lookup{ my ($self, $hash_ref) = @_; if(ref($hash_ref) eq 'HASH'){ $self->{'_pfam_lookup'} ||= {}; $self->{'_pfam_lookup'} = { %{$self->{'_pfam_lookup'}}, %{$hash_ref} }; } return $self->{'_pfam_lookup'}; } =head2 create_genewisehmm_individually Arg : reference to pfam_id hash and directory if it is to be set to anything other than temp Function : runs through each pfam id runs, get_hmm and creates and runs the GenewiseHmm runnable Exception: throws an exception if anything other than 1, -1 or 0 is found for strand Caller : =cut sub create_genewisehmm_individually{ my ($self, $pfam_ids, $dir) = @_; print STDERR "getting hmms\n"; ########## $self->workdir('/tmp') unless ($self->workdir($dir)); $self->checkdir(); my $memory = $self->memory; print STDERR "there are ".scalar(keys(%$pfam_ids))." pfam ids to run\n"; ########## while(my ($pfam_id, $strand) = each(%$pfam_ids)){ print STDERR "doing the hmm for id: $pfam_id\n"; ########## $self->get_hmm($pfam_id); if (-z $self->hmmfilename){ warning("hmm file not created :$!"); next; } my @strands = (); if($strand == 1){ push(@strands, $strand); }elsif($strand == -1){ push(@strands, $strand); }elsif($strand == 0){ push(@strands, -1); push(@strands, 1); }else{ throw("strand = ".$strand." something funnies going on : $!\n"); } foreach my $s (@strands){ $self->run_genewisehmm($s); #print STDERR "running on strand: $s\n"; ########## #my $genewisehmm = $self->get_GenewiseHMM($s, $memory); #$genewisehmm->run(); #my @features = $genewisehmm->output(); #$self->display_genes(\@features); ########## #print STDERR "adding ".scalar(@features)." to output\n"; ########## #$self->add_output_features(\@features); } print STDERR "removing hmmfile: ".$self->hmmfilename()."\n"; ########## unlink $self->hmmfilename(); } throw("finished this bit"); return undef; } =head2 create_genewisehmm_individually Arg : reference to pfam_id hash and directory if it is to be set to anything other than temp Function : creates a hmm databases of pfam ids, then creates and runs the GenewiseHmm runnable Exception: Caller : =cut sub create_genewisehmm_complete{ my ($self, $pfam_ids) = @_; print STDERR "getting hmms\n"; ########## print STDERR "\nthere are ".scalar(keys(%$pfam_ids))." pfam ids in database\n"; ########## return unless scalar(keys(%$pfam_ids)); print STDERR "doing the hmm for ids: ". join(" ", keys(%$pfam_ids)) . "\n"; ########## $self->run_genewisehmm(0); print STDERR "removing hmmfile: ".$self->hmmfilename()."\n"; ########## unlink $self->hmmfilename(); return undef; } =head2 run_genewisehmm Arg : Function : Exception : Caller : $self->create_genewisehmm_complete, $self->create_genewisehmm_individually =cut sub run_genewisehmm{ my ($self, $strand) = @_; my $memory = $self->memory; print STDERR "running on strand: $strand\n"; ########## my $genewisehmm = $self->get_GenewiseHMM($strand, $memory); $genewisehmm->run(); my @features = $genewisehmm->output(); $self->display_genes(\@features); ########## print STDERR "adding ".scalar(@features)." to output\n"; ########## $self->add_output_features(\@features); } =head2 get_GenewiseHMM Arg : the strand of the hit and the memory to be used by genewisehmm Function : runs the new method of GenewiseHmm and returns the runnable Exception: none Caller : =cut sub get_GenewiseHMM{ my ($self, $strand, $memory) = @_; my $reverse = ($strand == -1 ? 1 : undef); print STDERR "creating genewisehmm strand $strand reverse $reverse\n"; ########## print STDERR "OPTIONS To Genewise: ".$self->options()."\n"; ########## # $genewisehmm->set_environment("/usr/local/ensembl/data/wisecfg/"); $ENV{WISECONFIGDIR} = "/usr/local/ensembl/data/wisecfg/"; my $genewisehmm = Bio::EnsEMBL::Analysis::Runnable::Finished::GenewiseHmm->new('-query' => $self->query(), '-memory' => $memory, '-hmmfile' => $self->hmmfilename(), '-reverse' => $reverse, '-genewise' => $self->program(), '-options' => $self->options(), '-analysis' => $self->analysis(), ); return $genewisehmm; } =head2 get_hmm Arg : id of hmm to be fetched normally a pfam id Function : runs hmmfetch on given id Exception: thows if no hmmfile is created Caller : =cut sub get_hmm{ my ($self, $id) = @_; #print "getting hmms\n"; $self->hmmfilename($id.".".$$.".hmm"); #print "getting hmm for id ".$id."\n"; my $command = $self->hmmfetch." ".$self->hmmdb." ".$id." > ".$self->hmmfilename; #print "command = ".$command."\n"; #system ('pwd'); eval{ system($command); }; if($@){ warning("hmmfetch threw error : $@\n"); } if (-z $self->hmmfilename){ warning("hmm file not created :$!"); }elsif(-e $self->hmmfilename){ open(ERROR, $self->hmmfilename) or die "couldn't open error file : $!"; while(<ERROR>){ if(/no such hmm/i){ print STDERR "error message : ".$_."\n"; } } close(ERROR) or die "couldn't close error file : $!"; } } sub get_hmmdb{ my ($self, $pfam_ids) = @_; my @pfamIds = keys(%$pfam_ids); print STDERR "getting the hmms for ids: ". join(" ", @pfamIds) . "\n"; ########## my $filename = substr(join("_", @pfamIds),0,20); $self->hmmfilename("$filename.$$.hmmdb"); foreach my $id(@pfamIds){ print "getting hmm for id ".$id."\n"; my $command = $self->hmmfetch." ".$self->hmmdb." ".$id." >> ".$self->hmmfilename; eval{ system($command); }; if($@){ warning("hmmfetch threw error : $@\n"); } if (-z $self->hmmfilename){ warning("hmm file not created :$!"); }elsif(-e $self->hmmfilename){ open(ERROR, $self->hmmfilename) or die "couldn't open error file : $!"; while(<ERROR>){ if(/no such hmm/i){ print STDERR "error message : ".$_."\n"; } } close(ERROR) or die "couldn't close error file : $!"; } } } =head2 add_output_features Arg : array ref to array of output features Function : adds the array of features to the output feature array Exception: throws an exception if not passed an array ref Caller : =cut sub add_output_features{ my ($self, $features) = @_; #print "adding ".scalar(@$features)." to output\n"; if (ref($features) eq "ARRAY") { push(@{$self->{'_output_features'}},@$features); } else { throw("[$features] is not an array ref."); } #print "total feature no = ".scalar(@{$self->{'_output_features'}})."\n"; } ################ #OUTPUT METHODS# ################ =head2 output Arg : none Function : returns the array of output features Exception: none Caller : =cut sub output{ my ($self) = @_; #print "outputing data\n"; #print "returning ".scalar(@{$self->{'_output_features'}})." to output\n"; return @{$self->{'_output_features'}}; } sub display_genes { my ($self, $result) = @_; #Display output my @results = @$result; foreach my $obj (@results) { print STDERR ("gene: ".$obj->gffstring. "\n"); if ($obj->sub_SeqFeature) { foreach my $exon ($obj->sub_SeqFeature) { print STDERR "Exon: ".$exon->gffstring."\n"; if ($exon->sub_SeqFeature) { foreach my $sub ($exon->sub_SeqFeature){ print STDERR "supporting features: ".$sub->gffstring."\n"; } } } } } } 1;