Raw content of Bio::EnsEMBL::Pipeline::Utils::SliceDump package Bio::EnsEMBL::Pipeline::Utils::SliceDump; use strict; use warnings; use Exporter; use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Root); use Bio::EnsEMBL::Utils::Exception qw(throw warning verbose); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); &verbose('WARNING'); my ($db, $slice, $output_dir ) = rearrange ([ 'DB', 'SLICE', 'OUTPUT_DIR'], @args); if(!$db){ throw("Can't run SliceDump without a database adaptor"); } $self->db($db); $self->slice($slice); $self->output_dir($output_dir); return $self; } ################### #container methods# ################### =head2 db Arg [1] : Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor Function : stores the DBadaptor for the object Returntype: Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor Exceptions: throws if argument passed in isn't a Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor' Example : my $rules_adaptor = $self->db->get_RulesAdaptor; =cut sub db{ my ($self, $db) = @_; if($db){ if(!$db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')){ throw("Can't run the RuleManager with $db you need a ". "Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor"); } $self->{'dbadaptor'} = $db; } return $self->{'dbadaptor'}; } =head2 slice Arg [1] : (optional) Bio::EnsEMBL::Slice $slice Example : $seqname = $feature->slice()->name(); Description: Getter/Setter for the Slice that data is desired from Returntype : Bio::EnsEMBL::Slice Exceptions : thrown if an invalid argument is passed Caller : general =cut sub slice { my $self = shift; if(@_) { my $sl = shift; if(defined($sl) && (!ref($sl) || !$sl->isa('Bio::EnsEMBL::Slice'))) { throw('slice argument must be a Bio::EnsEMBL::Slice'); } $self->{'slice'} = $sl; } return $self->{'slice'}; } =head2 output_dir Arg [1] : string, path to directory Function : container for path to directoy Returntype: string, Exceptions: throws if directory doesn't exist' Example : my $filename = $self->output_dir'/filename' =cut sub output_dir{ my ($self, $output_dir) = @_; if($output_dir){ throw("Output dir ".$output_dir." must exist or we can't use it") unless (-d $output_dir); $self->{'output_dir'} = $output_dir; } return $self->{'output_dir'}; } ################# #Utility methods# ################# #these are methods which provide generic functionality for the #object =head2 dump_table Arg [1] : string, name of table Arg [2] : string, name of file to dump to Arg [3] : string, where clause for sql statement if wanted Arg [4] : string, select clause for sql if wanted Arg [5] : string, from clause of sql if wanted Arg [6] : string, into outfile clause of wanted Function : construction the sql to dump a particular tables contents for a particular slice if desire. The last four arguments are options to allow for various different statements to be generated as not all tables fit the select * from table where seq_region_id = $id model Returntype: filename Exceptions: throws if not passed a table name or if the filename already exists as mysql won't dump to an existing file' Example : my $filename = $slice_dump->dump_table() =cut sub dump_table{ my ($self, $table_name, $filename, $where, $select, $from, $out) = @_; if(!$table_name){ throw("Can't dump table without tablename"); } if(!$filename){ $filename = $self->output_dir."/".$table_name; } if(-e $filename){ throw($filename." exists mysql can't dump into an existing file "); } $select = "select * " unless($select); $from = "from ".$table_name." " unless($from); $out = " into outfile '".$filename."'" unless($out); my $sql = $select; $sql .= $from; $sql .= $where if($where); $sql .= $out; print $sql."\n"; my $sth = $self->db->prepare($sql); $sth->execute; return $filename; } =head2 genereate_where_clause Arg [1] : int, seq_region_id Arg [2] : int, seq_region_start Arg [3] : int, seq_region_end Arg [4] : int, boolean toggle Function : This method will construct a standard where clause working on the assumption that the column names are seq_region_id, seq_region_start and seq_region_end the last toggle is to whether to get features which overlap the specified boundaries Returntype: string, the where clause Exceptions: throws if not passed a seq_region_id Example : =cut sub generate_where_clause{ my ($self, $id, $start, $end, $overlaps_boundaries) = @_; if(!$id){ throw("Can't generate a where clause without seq_region_id"); } my $where_clause = "where seq_region_id = ".$id; if($overlaps_boundaries){ $where_clause .= " and seq_region_start <= ".$end if($end); $where_clause .= " and seq_region_end >= ".$start if($start); }else{ $where_clause .= " and seq_region_start >= ".$start if($start); $where_clause .= " and seq_region_end <= ".$end if($end); } return $where_clause; } =head2 can_dump Arg [1] : string table_name Arg [2] : int seq_region_id Arg [3] : string column name Function : This checks if the id passed in has any entries in the table specified for the column name specified, the default column name is seq_region_id Returntype: int, count Exceptions: throws if not passed either a table name or a seq_region_id Example : if($self->can_dump('seq_region', 1)) =cut sub can_dump{ my ($self, $table_name, $id, $column_name) = @_; if(!$table_name || !$id){ throw("Can't check if can dump for a table with a specific seq_region_id ". " if table_name $table_name or id $id isn't defined"); } if(!$column_name){ $column_name = 'seq_region_id'; } my $sql = "select count(*) from ".$table_name." where ".$column_name. " = ".$id; my $sth = $self->db->prepare($sql); $sth->execute; my ($count) = $sth->fetchrow; return $count; } =head2 get_filename Arg [1] : string table name Arg [2] : Bio::EnsEMBL::Slice Function : generate a filename useable by mysql import on the basis of the tablename and info from the slice, can't use Slice::name as the format upsets mysqlimport Returntype: string, filename Exceptions: none Example : my $filename = $self->get_filename(seq_region', $slice) =cut sub get_filename{ my ($self, $table_name, $slice) = @_; return $self->output_dir."/".$table_name.".".$slice->seq_region_name. ".".$slice->start."-".$slice->end; } # #Dump methods for tables which require partial dumps #All these dumps will require a slice object and will take either the #slice held by the SliceDump object or a slice which is passed in # #the following tables have structures which fit a standard model #they all have columns seq_region_id, seq_region_start and seq_region_end #all these methods have the name which follows the structure #dump_table_name_table to allow for on the fly calling of methods #for appropriate tables =head2 dump_name_table Arg [1] : Bio::EnsEMBL::Slice Function : check if any data can be dumped from the defined table and if it can call the dump table method with the appropriate arguments Returntype: returns 1 if a dump was made 0 if not Exceptions: throws if no slice is defined either in the method args or SliceDump::Slice or if that slice isn't a slice object' Example : $SliceDump->dump_seq_region_table =cut sub dump_seq_region_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('seq_region', $id)){ my $filename = $self->get_filename('seq_region', $slice); my $where_clause = $self->generate_where_clause($id); $self->dump_table('seq_region', $filename, $where_clause); return 1; } return 0; } sub dump_dna_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } if(!$slice->coord_system->is_sequence_level){ return 1; } my $id = $slice->get_seq_region_id; if($self->can_dump('dna', $id)){ my $filename = $self->get_filename('dna', $slice); my $where_clause = $self->generate_where_clause($id); $self->dump_table('dna', $filename, $where_clause); return 1; } return 0; } sub dump_seq_region_attrib_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('seq_region_attrib', $id)){ my $filename = $self->get_filename('seq_region_attrib', $slice); my $where_clause = $self->generate_where_clause($id); $self->dump_table('seq_region_attrib', $filename, $where_clause); return 1; } return 0; } sub dump_assembly_exception_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('assembly_exception', $id)){ my $filename = $self->get_filename('assembly_exception', $slice); my $where_clause = $self->generate_where_clause($id); $self->dump_table('assembly_exception', $filename, $where_clause); return 1; } return 0; } sub dump_repeat_feature_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('repeat_feature', $id)){ my $filename = $self->get_filename('repeat_feature', $slice); my $where_clause = $self->generate_where_clause($id, $slice->start, $slice->end); $self->dump_table('repeat_feature', $filename, $where_clause); return 1; } return 0; } sub dump_simple_feature_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('simple_feature', $id)){ my $filename = $self->get_filename('simple_feature', $slice); my $where_clause = $self->generate_where_clause($id, $slice->start, $slice->end); $self->dump_table('simple_feature', $filename, $where_clause); return 1; } return 0; } sub dump_protein_align_feature_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('protein_align_feature', $id)){ my $filename = $self->get_filename('protein_align_feature', $slice); my $where_clause = $self->generate_where_clause($id, $slice->start, $slice->end); $self->dump_table('protein_align_feature', $filename, $where_clause); return 1; } return 0; } sub dump_dna_align_feature_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('dna_align_feature', $id)){ my $filename = $self->get_filename('dna_align_feature', $slice); my $where_clause = $self->generate_where_clause($id, $slice->start, $slice->end); $self->dump_table('dna_align_feature', $filename, $where_clause); return 1; } return 0; } sub dump_prediction_exon_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('prediction_exon', $id)){ my $filename = $self->get_filename('prediction_exon', $slice); my $where_clause = $self->generate_where_clause($id, $slice->start, $slice->end); $self->dump_table('prediction_exon', $filename, $where_clause); return 1; } return 0; } sub dump_prediction_transcript_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('prediction_transcript', $id)){ my $filename = $self->get_filename('prediction_transcript', $slice); my $where_clause = $self->generate_where_clause($id, $slice->start, $slice->end); $self->dump_table('prediction_transcript', $filename, $where_clause); return 1; } return 0; } sub dump_gene_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('gene', $id)){ my $filename = $self->get_filename('gene', $slice); my $where_clause = $self->generate_where_clause($id, $slice->start, $slice->end); $self->dump_table('gene', $filename, $where_clause); return 1; } return 0; } sub dump_exon_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('exon', $id)){ my $filename = $self->get_filename('exon', $slice); my $where_clause = $self->generate_where_clause($id, $slice->start, $slice->end); $self->dump_table('exon', $filename, $where_clause); return 1; } return 0; } sub dump_transcript_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('transcript', $id)){ my $filename = $self->get_filename('transcript', $slice); my $where_clause = $self->generate_where_clause($id, $slice->start, $slice->end); $self->dump_table('transcript', $filename, $where_clause); return 1; } return 0; } ## ##methods for tables which don't contain straight ref to seq_region_id ## #dumps entries from the assembly table #can use standard model as must take entries whose column labelled #asm_seq_region_id match the given id and the start and end are asm_start #and asm_end respectively sub dump_assembly_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('assembly', $id, 'asm_seq_region_id')){ my $filename = $self->get_filename('assembly', $slice); my $where_clause = "where asm_seq_region_id = ".$id. " and asm_start <= ".$slice->end. " and asm_end >= ".$slice->start; $self->dump_table('assembly', $filename, $where_clause); return 1; } return 0; } #dumps entries from the repeat_consensus table #this must generate sql which joins to the repeat feature table # in order to dump its entries sub dump_repeat_consensus_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('repeat_feature', $id)){ my $filename = $self->get_filename('repeat_consensus', $slice); my $select = "select rc.* "; my $from = "from repeat_consensus rc, repeat_feature rf "; my $where = "where rf.repeat_consensus_id = rc.repeat_consensus_id ". "and rf.seq_region_id = ".$id." ". "and rf.seq_region_start = ".$slice->start." ". "and rf.seq_region_end = ".$slice->end." "; my $out = "into outfile '".$filename."'"; $self->dump_table('repeat_consensus', $filename, $where, $select, $from, $out); return 1; } return 0; } #dumps entries from the translation table #must join to the transcript table to do this sub dump_translation_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('transcript', $id)){ my $filename = $self->get_filename('translation', $slice); my $select = "select translation.* "; my $from = "from translation, transcript "; my $where = "where ". "translation.transcript_id = transcript.transcript_id and ". "seq_region_id = ".$id." and ". "seq_region_start >= ".$slice->start." and ". "seq_region_end <= ".$slice->end." "; my $out = "into outfile '".$filename."'"; $self->dump_table('translation', $filename, $where, $select, $from, $out); return 1; } return 0; } #dumps entries from the exon_transcript table #must join to the transcript table to do this sub dump_exon_transcript_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('transcript', $id)){ my $filename = $self->get_filename('exon_transcript', $slice); my $select = "select exon_transcript.* "; my $from = "from exon_transcript, transcript "; my $where = "where ". "exon_transcript.transcript_id = transcript.transcript_id and ". "seq_region_id = ".$id." and ". "seq_region_start >= ".$slice->start." and ". "seq_region_end <= ".$slice->end." "; my $out = "into outfile '".$filename."'"; $self->dump_table('exon_transcript', $filename, $where, $select, $from, $out); return 1; } return 0; } #dumps entries from the supporting_feature table #must join to the exon table to do this sub dump_supporting_feature_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('exon', $id)){ my $filename = $self->get_filename('supporting_feature', $slice); my $select = "select supporting_feature.* "; my $from = "from supporting_feature, exon "; my $where = "where ". "supporting_feature.exon_id = exon.exon_id and ". "seq_region_id = ".$id." and ". "seq_region_start >= ".$slice->start." and ". "seq_region_end <= ".$slice->end." "; my $out = "into outfile '".$filename."'"; $self->dump_table('supporting_feature', $filename, $where, $select, $from, $out); return 1; } return 0; } #dumps entries from the transcript_stable_id table #must join to the transcript table to do this sub dump_transcript_stable_id_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('transcript', $id)){ my $filename = $self->get_filename('transcript_stable_id', $slice); my $select = "select transcript_stable_id.* "; my $from = "from transcript_stable_id, transcript "; my $where = "where ". "transcript_stable_id.transcript_id = transcript.transcript_id ". " and seq_region_id = ".$id." and ". "seq_region_start >= ".$slice->start." and ". "seq_region_end <= ".$slice->end." "; my $out = "into outfile '".$filename."'"; $self->dump_table('transcript_stable_id', $filename, $where, $select, $from, $out); return 1; } return 0; } #dumps entries from the exon_stable_id table #must join to the exon table to do this sub dump_exon_stable_id_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('exon', $id)){ my $filename = $self->get_filename('exon_stable_id', $slice); my $select = "select exon_stable_id.* "; my $from = "from exon_stable_id, exon "; my $where = "where ". "exon_stable_id.exon_id = exon.exon_id and ". "seq_region_id = ".$id." and ". "seq_region_start >= ".$slice->start." and ". "seq_region_end <= ".$slice->end." "; my $out = "into outfile '".$filename."'"; $self->dump_table('exon_stable_id', $filename, $where, $select, $from, $out); return 1; } return 0; } #dumps entries from the gene_stable_id table #must join to the gene table to do this sub dump_gene_stable_id_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('gene', $id)){ my $filename = $self->get_filename('gene_stable_id', $slice); my $select = "select gene_stable_id.* "; my $from = "from gene_stable_id, gene "; my $where = "where ". "gene_stable_id.gene_id = gene.gene_id and ". "seq_region_id = ".$id." and ". "seq_region_start >= ".$slice->start." and ". "seq_region_end <= ".$slice->end." "; my $out = "into outfile '".$filename."'"; $self->dump_table('gene_stable_id', $filename, $where, $select, $from, $out); return 1; } return 0; } #dumps entries from the protein_feature table #must join to the translation and transcript tables to do this sub dump_protein_feature_table{ my ($self, $slice) = @_; if(!$slice){ $slice = $self->slice; } if(!$slice || !$slice->isa("Bio::EnsEMBL::Slice")){ throw("Can't dump partial table without slice information"); } my $id = $slice->get_seq_region_id; if($self->can_dump('transcript', $id)){ my $filename = $self->get_filename('protein_feature', $slice); if(-e $filename){ throw($filename." exists mysql can't dump into an existing file "); } my $sql = "select protein_feature.* from protein_feature, translation, ". "transcript where translation.transcript_id = transcript.transcript_id ". "and seq_region_id = ".$id." and seq_region_start >= ". $slice->start." and seq_region_end <= ".$slice->end." ". "and protein_feature.translation_id = translation.translation_id ". "into outfile '".$filename."'"; my $sth = $self->db->prepare($sql); $sth->execute; return 1; } return 0; } 1;