Raw content of Bio::EnsEMBL::Analysis::Runnable::PSILC # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Analysis::Runnable::PSILC; =head1 SYNOPSIS my $PSILC = Bio::EnsEMBL::Analysis::Runnable::PSILC->new ( '-trans' => $transcript, '-homologs' => $homologs, '-analysis' => $analysis, '-domain' => $domains, '-input_id' => $self->input_id, ); $runnabledb->fetch_input(); $runnabledb->run(); my @array = @{$runnabledb->output}; $runnabledb->write_output(); =head1 DESCRIPTION Runnable for PSILC =head1 CONTACT Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk =cut package Bio::EnsEMBL::Analysis::Runnable::PSILC; use strict; use warnings; use Bio::EnsEMBL::Analysis::Runnable; use Bio::EnsEMBL::Analysis::Config::Pseudogene; use Bio::Tools::Run::Alignment::Clustalw; use Bio::Align::Utilities qw(aa_to_dna_aln); use Bio::AlignIO; use Bio::SimpleAlign; use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Analysis::Runnable); =head2 new Arg [1] : none Description: Creates runnable Returntype : none Exceptions : none Caller : general =cut sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->{'_trans'} = {}; # transcript to test; $self->{'_homologs'} = (); # array of holmologs to cluster with transcript; $self->{'_domains'} = (); # array ref of protein feature ids; my($trans,$domains,$homologs,$input_id) = $self->_rearrange([qw( TRANS DOMAIN HOMOLOGS INPUT_ID )], @args); if ($trans) { $self->trans($trans); } if ($homologs) { $self->homologs($homologs); } if ($domains) { $self->domains($domains); } $self->output_dir($input_id); $self->program('/nfs/acari/sw4/Pseudogenes/PSILC/psilc1.21/psilc.jar'); return $self; } =head2 run Arg [1] : none Description: Makes alignment out of transcripts and runs PSILC Returntype : none Exceptions : none Caller : general =cut sub run{ my ($self) = @_; $self->checkdir(); $self->write_pfam_ids; my $filename = $self->create_filename('PSILC','fasta'); $self->make_codon_based_alignment; $self->run_analysis(); $self->files_to_be_deleted($filename); $self->parse_results; # $self->delete_files; } =head2 write_pfam_ids Arg [1] : none Description: Writes identifiers for PFAM domains that PSILC will search for in the protein alignment Returntype : none Exceptions : throws if it cannot open the file to write to Caller : general =cut sub write_pfam_ids{ my ($self)=@_; my $dir = $self->workdir; my $domains = $self->domains; print STDERR "writing pfam ids to $dir/pfamA\n"; open (IDS,">$dir/pfamA") or $self->throw("Cannot open file for writing pfam ids $dir/pfamA\n"); print IDS "$domains"; close IDS; return 1; } =head2 run_analysis Arg [1] : scalar - program name Description: runs the PSILC executable Returntype : none Exceptions : throws if program is not executable Caller : general =cut sub run_analysis{ my ($self, $program) = @_; if(!$program){ $program = $self->program; } throw($program." is not executable Runnable::run_analysis ") unless($program && -x $program); my $file = $self->queryfile; my $dir = $self->workdir; $file =~ s/^$dir\///; my $command = "java -Xmx400m -jar $program"; $command .= " --align $file.aln"; $command .= " --seq ".$self->id; $command .= " --restrict 1"; # $command .= " --alignMethod 1"; $command .= " --max_nodes 10:6"; $command .= " --repository /ecs2/work2/sw4/PFAM"; $command .= " > ".$self->resultsfile; print "Running analysis ".$command."\n"; system($command) == 0 or $self->throw("FAILED to run ".$command); } =head2 parse_results Arg [1] : scalar - filename Description: parses PSILC results Returntype : none Exceptions : warns if the results are unreadable Caller : general =cut sub parse_results{ my ($self, $filename) =@_; my ($nuc_dom,$prot_dom,$postPMax,$postNmax) = 0; my $dir = $self->workdir; my $trans = $self->trans; my $id = $self->id; open (PSILC,"$dir/PSILC_WAG_HKY/summary") or $self->warn("Cannot open results file $dir/PSILC_WAG_HKY/summary\n$@\n"); while(<PSILC>){ chomp; next unless $_ =~ /^$id\/\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/; $nuc_dom = $1; $prot_dom = $2; $postPMax = $3; $postNmax = $4; last; } close PSILC; unless ($nuc_dom){ $self->warn("ERROR unable to parse results file $dir/PSILC_WAG_HKY/summary\n"); $self->warn("Failed for $id"); return 0; } my %results = ( 'id' => $id, 'prot_dom' => $prot_dom, 'nuc_dom' => $nuc_dom, 'postPMax' => $postPMax, 'postNMax' => $postNmax, ); $self->output(\%results); return 1; } =head2 files_to_be_deleted Arg [1] : scalar - filename Description: marks the PSILC output files for deletion Returntype : none Exceptions : none Caller : general =cut sub files_to_be_deleted{ my ($self,$filename) = @_; my $dir = $self->workdir; my $domains = $self->domains; $domains =~ s/PF.+\t//; chomp $domains; # Break filename into its component parts my @fnc = split /\./,$filename; # $self->files_to_delete($filename.".aln"); $self->files_to_delete($filename.".out"); $self->files_to_delete($fnc[0].".nhx"); $self->files_to_delete("$dir/pfamA"); $self->files_to_delete("$dir/PSILC_WAG_HKY/domdom"); $self->files_to_delete("$dir/PSILC_WAG_HKY/posteriorN"); $self->files_to_delete("$dir/PSILC_WAG_HKY/posteriorP"); $self->files_to_delete("$dir/PSILC_WAG_HKY/psilcN"); $self->files_to_delete("$dir/PSILC_WAG_HKY/psilcP"); # $self->files_to_delete("$dir/PSILC_WAG_HKY/summary"); $self->files_to_delete("$dir/PSILC_WAG_HKY/$domains"); return 1; } =head2 output_dir Arg [1] : scalar - input id of the anlysis Description: Creates the output directory for the PSILC files Returntype : none Exceptions : none Caller : general =cut sub output_dir{ my ($self,$input_id) = @_; my $output_dir; $output_dir = $self->id; if ($PSILC_WORK_DIR){ system("mkdir $PSILC_WORK_DIR/$$input_id/$output_dir"); $self->workdir("$PSILC_WORK_DIR/$$input_id/$output_dir"); } else{ $self->throw("Cannot make output directory\n"); } return 1; } sub id { my ($self) = @_; if ($self->trans->stable_id){ return $self->trans->stable_id; } return $self->trans->dbID; } =head2 make_codon_based_alignment Arg [1] : none Description: Makes a dna alignment based on a protein alignment using Bio::Align::Utilities DNA alignment is for the translateable part of the transcript only Returntype : none Exceptions : none Caller : general =cut sub make_codon_based_alignment{ my ($self) = @_; my @aaseqs; my $aaseq; my %dnaseqs; # make alignment file for output my $filename = $self->queryfile.".aln"; my $alignIO = Bio::AlignIO->new ( -file => ">$filename", -format => "clustalw", ); my $pep_alignIO = Bio::AlignIO->new ( -file => ">$filename.pep", -format => "clustalw", ); # run clustal my $clustalw = Bio::Tools::Run::Alignment::Clustalw->new(); # prepare sequences, aaseqs has the proteins used for the alignment # dnaseqs is a hash of the tranlateable nucleotide sequence # Both sets need to use transcript ids which as also used as the hash keys $aaseq = $self->trans->translate; # change id of translation to match transcript $aaseq->display_id($self->id); push @aaseqs,$aaseq; $dnaseqs{$self->id} = Bio::Seq->new ( -display_id => $self->id, -seq => $self->trans->translateable_seq, ); foreach my $homolog(@{$self->homologs}){ next if ($homolog->stable_id eq $self->id); $aaseq = $homolog->translate; # change id of translation to match transcript $aaseq->display_id($homolog->stable_id); push @aaseqs,$aaseq; $dnaseqs{$homolog->stable_id} = Bio::Seq->new ( -display_id => $homolog->stable_id, -seq => $homolog->translateable_seq, ); } foreach my $seq (@aaseqs){ print $seq->display_id."\n"; } my $alignment = $clustalw->align(\@aaseqs); my $dna_aln = aa_to_dna_aln($alignment,\%dnaseqs); foreach my $seq ($dna_aln->each_seq){ print $seq->display_id."\n"; } $pep_alignIO->write_aln($alignment); $alignIO->write_aln($dna_aln); return 1; } ####################################### # Containers =head2 trans Arg [1] : array ref Description: get/set trans set to run over Returntype : array ref to Bio::EnsEMBL::Transcript objects Exceptions : throws if not a Bio::EnsEMBL::Transcript Caller : general =cut sub trans { my ($self, $trans) = @_; if ($trans) { unless ($trans->isa("Bio::EnsEMBL::Transcript")){ $self->throw("Input isn't a Bio::EnsEMBL::Transcript, it is a $trans\n$@"); } $self->{'_trans'} = $trans; } return $self->{'_trans'}; } =head2 homologs Arg [1] : array ref Description: get/set gene set to run over Returntype : array ref to Bio::EnsEMBL::Gene objects Exceptions : throws if not a Bio::EnsEMBL::Gene Caller : general =cut sub homologs { my ($self, $homologs) = @_; if ($homologs) { foreach my $hom (@{$homologs}){ unless ($hom->isa("Bio::EnsEMBL::Transcript")){ $self->throw("Input isn't a Bio::EnsEMBL::Transcript, it is a $hom\n$@"); } } $self->{'_homologs'} = $homologs; } return $self->{'_homologs'}; } =head2 domains Arg [1] : array ref Description: get/set pfam domain identifiers Returntype : string of pfam identifiers Exceptions : none Caller : general =cut sub domains { my ($self, $domains) = @_; if ($domains) { $self->{'_domains'} = $$domains; } return $self->{'_domains'}; } =head2 output Arg [1] : hash_ref Description: overrides output array Returntype : hash_ref Exceptions : none Caller : general =cut sub output { my ($self, $hash_ref) = @_; if ($hash_ref) { $self->{'_output'} = $hash_ref; } return $self->{'_output'}; } 1;