# Raw content of Bio::EnsEMBL::Pipeline::Utils::InputIDFactory

package Bio::EnsEMBL::Pipeline::Utils::InputIDFactory;

use strict;
use warnings;

use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning info);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Root;
use Bio::EnsEMBL::Pipeline::Analysis;
use vars qw(@ISA);

@ISA = ('Bio::EnsEMBL::Root');

use Bio::EnsEMBL::Utils::Slice qw(split_Slices);


=head2 new

  Arg [DB]                   : Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor
                               (required)
  Arg [SLICE]                : toggle for slice based input_ids
  Arg [SINGLE]               : toggle for a single input_id
  Arg [FILE]                 : toggle for filename based input_ids
  Arg [TRANSLATION_ID]       : toggle for translation dbID based input_ids
  Arg [HAP_PAIR]             : toggle for haplotype pair input_ids
  Arg [LOGIC_NAME]           : string, logic_name of the analysis the
                               input_ids are stored under (required)
  Arg [INPUT_ID_TYPE]        : string, required when INSERT_ANALYSIS is set
  Arg [INSERT_ANALYSIS]      : toggle, create and store the analysis
  Arg [COORD_SYSTEM]         : string, required for SLICE unless TOP_LEVEL
                               or SEQ_LEVEL is set
  Arg [COORD_SYSTEM_VERSION] : string
  Arg [TOP_LEVEL]            : toggle, sets the coord system to 'toplevel'
  Arg [SEQ_LEVEL]            : toggle, sets the coord system to 'seqlevel'
  Arg [SEQ_REGION_NAME]      : string, restrict slices to one seq_region
  Arg [SEQ_REGION_START]     : int
  Arg [SEQ_REGION_END]       : int
  Arg [SLICE_SIZE]           : int, size the slices are split into
  Arg [SLICE_OVERLAPS]       : int, overlap between the split slices
  Arg [INCLUDE_NON_REFERENCE]: toggle, passed on to SliceAdaptor::fetch_all
  Arg [DIR]                  : string, directory read for FILE input_ids
  Arg [REGEX]                : string, pattern filenames must match
  Arg [SINGLE_NAME]          : string, the id used for SINGLE
  Arg [VERBOSE]              : toggle, print the selected mode to STDOUT
  Function  : creates an InputIDFactory object
  Returntype: Bio::EnsEMBL::Pipeline::Utils::InputIDFactory
  Exceptions: throws if no DB or LOGIC_NAME is given, if none or more than
              one of SLICE, FILE, SINGLE, TRANSLATION_ID, HAP_PAIR is
              selected, if SLICE is selected without a coordinate system
              or if INSERT_ANALYSIS is set without an INPUT_ID_TYPE
  Caller    :
  Example   :

=cut

sub new {
  my $caller = shift;

  my $class = ref($caller) || $caller;
  my $self  = bless({}, $class);

  my ($db, $slice, $single, $file, $translation_id, $slice_size,
      $slice_overlaps, $seq_level, $top_level, $dir, $regex,
      $single_name, $verbose, $logic_name, $input_id_type,
      $insert_analysis, $coord_system, $coord_system_version,
      $seq_region_name, $seq_region_start, $seq_region_end,
      $hap_pair, $include_non_reference) =
        rearrange([qw(DB SLICE SINGLE FILE TRANSLATION_ID SLICE_SIZE
                      SLICE_OVERLAPS SEQ_LEVEL TOP_LEVEL DIR REGEX
                      SINGLE_NAME VERBOSE LOGIC_NAME INPUT_ID_TYPE
                      INSERT_ANALYSIS COORD_SYSTEM COORD_SYSTEM_VERSION
                      SEQ_REGION_NAME SEQ_REGION_START SEQ_REGION_END
                      HAP_PAIR INCLUDE_NON_REFERENCE )], @_);

  $slice                 = 0 unless ($slice);
  $single                = 0 unless ($single);
  $file                  = 0 unless ($file);
  $hap_pair              = 0 unless ($hap_pair);
  $include_non_reference = 0 unless ($include_non_reference);
  $translation_id        = 0 unless ($translation_id);

  if (!$db) {
    throw("You can't create and store input_ids without a dbadaptor\n");
  }
  $self->db($db);

  # Count the selected modes by truth rather than adding the raw values:
  # a toggle passed as something other than 1 must still count exactly once.
  my @selected = grep { $_ }
    ($slice, $file, $translation_id, $single, $hap_pair);

  if (!@selected) {
    throw("You must define one of these options SLICE, FILE, SINGLE ".
          "TRANSLATION_ID, HAP_PAIR for the input id factory to work");
  }

  # Diagnostic output is only produced when VERBOSE is requested.
  if ($verbose) {
    print "slice: ", $slice, "\n";
    print "file: ", $file, "\n";
    print "translation_id: ", $translation_id, "\n";
    print "single: ", $single, "\n";
    print "hap_pair: ", $hap_pair, "\n";
    print "SUM: ", scalar(@selected), "\n";
  }

  if (@selected > 1) {
    throw("You must only specify one of these options SLICE, FILE, SINGLE ".
          "TRANSLATION_ID, HAP_PAIR for the input id factory to work");
  }

  # Order matters here: top_level/seq_level set the coord system as a side
  # effect, and an explicit COORD_SYSTEM then overrides it.
  $self->slice($slice) if ($slice);
  $self->top_level($top_level) if ($top_level);
  $self->seq_level($seq_level) if ($seq_level);
  $self->coord_system($coord_system) if ($coord_system);

  if ($slice && !$self->coord_system) {
    throw("You must specify a coordinate system if you want slice ".
          "input ids created\n");
  }
  # if($hap_pair && !$self->coord_system){
  #   throw("You must specify a coordinate system if you want slice ".
  #         "input ids created\n");
  # }

  $self->coord_system_version($coord_system_version)
    if ($coord_system_version);
  $self->seq_region_name($seq_region_name) if defined $seq_region_name;
  $self->seq_region_start($seq_region_start) if defined $seq_region_start;
  $self->seq_region_end($seq_region_end) if defined $seq_region_end;
  $self->file($file) if ($file);
  $self->single($single) if ($single);
  $self->include_non_reference($include_non_reference)
    if ($include_non_reference);
  $self->translation_id($translation_id) if ($translation_id);
  $self->hap_pair($hap_pair) if ($hap_pair);

  if (!$logic_name) {
    throw("Must have a logic_name otherwise don't know which analysis to ".
          "store the input ids under");
  }
  if ($insert_analysis && !$input_id_type) {
    throw("if you want your analysis object to be inserted into the ".
          "database you must also provide an input_id_type");
  }

  # Fetches (or creates and stores) the analysis and caches it on $self.
  $self->get_analysis($logic_name, $input_id_type, $insert_analysis);
  $self->logic_name($logic_name);

  $self->slice_size($slice_size) if ($slice_size);
  $self->slice_overlaps($slice_overlaps) if ($slice_overlaps);
  $self->dir($dir) if ($dir);
  $self->regex($regex) if ($regex);
  $self->single_name($single_name) if ($single_name);

  return $self;
}


#container methods

=head2 db

  Arg [1]   : Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor (optional)
  Function  : getter/setter for the pipeline database adaptor
  Returntype: Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor
  Exceptions: throws if the argument is not a
              Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor

=cut

sub db {
  my ($self, $db) = @_;

  if ($db) {
    # The eval keeps a non-object argument from dying inside ->isa with
    # an unrelated message.
    my $is_adaptor = ref($db)
      && eval { $db->isa('Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor') };
    if (!$is_adaptor) {
      throw("Can't create an InputIDFactory with $db you need a ".
            "Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor");
    }
    $self->{'dbadaptor'} = $db;
  }
  return $self->{'dbadaptor'};
}


=head2 stateinfocontainer

  Arg [1]   : adaptor object (optional)
  Function  : getter/setter for the StateInfoContainer; when none has been
              set it is obtained from the db adaptor
  Returntype: whatever db->get_StateInfoContainer returns

=cut

sub stateinfocontainer {
  my ($self, $adaptor) = @_;

  if ($adaptor) {
    $self->{'stateinfocontainer'} = $adaptor;
  }
  if (!$self->{'stateinfocontainer'}) {
    $self->{'stateinfocontainer'} = $self->db->get_StateInfoContainer;
  }
  return $self->{'stateinfocontainer'};
}


=head2 analysis_adaptor

  Arg [1]   : adaptor object (optional)
  Function  : getter/setter for the analysis adaptor; when none has been
              set it is obtained from the db adaptor
  Returntype: whatever db->get_AnalysisAdaptor returns

=cut

sub analysis_adaptor {
  my ($self, $adaptor) = @_;

  # Only overwrite the cached adaptor when one is actually supplied, so a
  # plain getter call does not discard a previously set adaptor.
  if ($adaptor) {
    $self->{'analysis_adaptor'} = $adaptor;
  }
  if (!$self->{'analysis_adaptor'}) {
    $self->{'analysis_adaptor'} = $self->db->get_AnalysisAdaptor;
  }
  return $self->{'analysis_adaptor'};
}


=head2 simple accessors

  Arg [1]   : new value (optional)
  Function  : getter/setters for the values given to the constructor:
              slice, file, translation_id, single, hap_pair,
              include_non_reference, slice_size, slice_overlaps, dir,
              regex, coord_system, coord_system_version, seq_region_name,
              seq_region_start, seq_region_end. single_name returns
              'genome' when no true value is set. top_level and seq_level
              additionally set coord_system to 'toplevel' / 'seqlevel'
              whenever their stored value is true.
  Returntype: the stored value

=cut

sub slice {
  my $self = shift;
  $self->{'slice'} = shift if (@_);
  return $self->{'slice'};
}

sub file {
  my $self = shift;
  $self->{'file'} = shift if (@_);
  return $self->{'file'};
}

sub translation_id {
  my $self = shift;
  $self->{'translation_id'} = shift if (@_);
  return $self->{'translation_id'};
}

sub single {
  my $self = shift;
  $self->{'single'} = shift if (@_);
  return $self->{'single'};
}

sub hap_pair {
  my $self = shift;
  $self->{'hap_pair'} = shift if (@_);
  return $self->{'hap_pair'};
}

sub include_non_reference {
  my $self = shift;
  $self->{'include_non_reference'} = shift if (@_);
  return $self->{'include_non_reference'};
}

sub slice_size {
  my $self = shift;
  $self->{'slice_size'} = shift if (@_);
  return $self->{'slice_size'};
}

sub slice_overlaps {
  my $self = shift;
  $self->{'slice_overlaps'} = shift if (@_);
  return $self->{'slice_overlaps'};
}

sub dir {
  my $self = shift;
  $self->{'dir'} = shift if (@_);
  return $self->{'dir'};
}

sub regex {
  my $self = shift;
  $self->{'regex'} = shift if (@_);
  return $self->{'regex'};
}

sub single_name {
  my $self = shift;
  $self->{'single_name'} = shift if (@_);
  return $self->{'single_name'} || 'genome';
}

sub coord_system {
  my $self = shift;
  $self->{'coord_system'} = shift if (@_);
  return $self->{'coord_system'};
}

sub coord_system_version {
  my $self = shift;
  $self->{'coord_system_version'} = shift if (@_);
  return $self->{'coord_system_version'};
}

sub seq_region_name {
  my $self = shift;
  $self->{'seq_region_name'} = shift if (@_);
  return $self->{'seq_region_name'};
}

sub seq_region_start {
  my $self = shift;
  $self->{'seq_region_start'} = shift if (@_);
  return $self->{'seq_region_start'};
}

sub seq_region_end {
  my $self = shift;
  $self->{'seq_region_end'} = shift if (@_);
  return $self->{'seq_region_end'};
}

sub top_level {
  my $self = shift;
  $self->{'top_level'} = shift if (@_);
  if ($self->{'top_level'}) {
    $self->coord_system('toplevel');
  }
  return $self->{'top_level'};
}

sub seq_level {
  my $self = shift;
  $self->{'seq_level'} = shift if (@_);
  if ($self->{'seq_level'}) {
    $self->coord_system('seqlevel');
  }
  return $self->{'seq_level'};
}


=head2 get_analysis

  Arg [1]   : string, logic_name (optional)
  Arg [2]   : string, input_id_type (optional)
  Arg [3]   : toggle, whether to create and store a new analysis
  Function  : with logic_name, input_id_type and the insert toggle a new
              Bio::EnsEMBL::Pipeline::Analysis is created and stored; with
              a logic_name and no insert toggle the analysis is fetched by
              logic_name. A resulting analysis is cached. Called without
              arguments it returns the cached analysis.
  Returntype: the cached analysis object, or undef if there is none

=cut

sub get_analysis {
  my ($self, $logic_name, $input_id_type, $insert) = @_;

  my $analysis;
  if ($logic_name && $input_id_type && $insert) {
    $analysis = Bio::EnsEMBL::Pipeline::Analysis->new;
    $analysis->logic_name($logic_name);
    $analysis->input_id_type($input_id_type);
    $self->analysis_adaptor->store($analysis);
  }
  elsif ($logic_name && !$insert) {
    $analysis = $self->analysis_adaptor->fetch_by_logic_name($logic_name);
  }

  if ($analysis) {
    $self->{'analysis'} = $analysis;
  }
  return $self->{'analysis'};
}


# Returns the cached analysis, or throws a descriptive error instead of
# letting a method call on undef fail later.
sub _require_analysis {
  my ($self) = @_;

  my $analysis = $self->get_analysis;
  if (!$analysis) {
    throw("No analysis object is available for this InputIDFactory; ".
          "check that the logic_name exists in the database or ask for ".
          "the analysis to be inserted");
  }
  return $analysis;
}


=head2 logic_name

  Arg [1]   : string (optional)
  Function  : getter/setter for the logic_name; when unset it is taken
              from the cached analysis
  Returntype: string
  Exceptions: throws if unset and no analysis is cached

=cut

sub logic_name {
  my $self = shift;

  $self->{'logic_name'} = shift if (@_);
  if (!$self->{'logic_name'}) {
    $self->{'logic_name'} = $self->_require_analysis->logic_name;
  }
  return $self->{'logic_name'};
}


=head2 input_id_type

  Arg [1]   : string (optional)
  Function  : getter/setter for the input_id_type; when unset it is taken
              from the cached analysis
  Returntype: string
  Exceptions: throws if unset and no analysis is cached

=cut

sub input_id_type {
  my $self = shift;

  $self->{'input_id_type'} = shift if (@_);
  if (!$self->{'input_id_type'}) {
    $self->{'input_id_type'} = $self->_require_analysis->input_id_type;
  }
  return $self->{'input_id_type'};
}


=head2 input_ids

  Arg [1]   : array ref of input_ids (optional)
  Function  : getter/setter for the list of input_ids
  Returntype: array ref, or undef when nothing has been set
  Exceptions: throws if the argument is not an array ref

=cut

sub input_ids {
  my ($self, $input_ids) = @_;

  if ($input_ids) {
    throw("Must has an array ref of input_ids not a $input_ids ")
      unless (ref($input_ids) eq 'ARRAY');
    $self->{'input_ids'} = $input_ids;
  }
  return $self->{'input_ids'};
}


=head2 generate_input_ids

  Arg [1]   : none
  Function  : on the basis of the mode selected at construction decides
              which method to call to generate the input_ids, and caches
              the result via input_ids
  Returntype: array ref of input_ids
  Exceptions: throws if no mode is selected
  Caller    :
  Example   :

=cut

sub generate_input_ids {
  my ($self) = @_;

  my $ids;
  if ($self->slice) {
    $ids = $self->get_slice_names;
  }
  elsif ($self->file) {
    $ids = $self->get_filenames;
  }
  elsif ($self->single) {
    $ids = $self->get_single;
  }
  elsif ($self->translation_id) {
    $ids = $self->get_translation_id;
  }
  elsif ($self->hap_pair) {
    $ids = $self->get_hap_pairs;
  }
  else {
    throw("Reaching this point means you haven't created InputIDFactory ".
          "without selecting what type of input_id to create this won't ".
          "work");
  }

  $self->input_ids($ids);
  return $ids;
}


=head2 get_slice_names

  Arg [1]   : none
  Function  : fetches either the single slice described by seq_region_name
              (with optional start/end) or all slices of the configured
              coordinate system, splits them when slice_size is greater
              than 0, and collects the slice names
  Returntype: array ref of slice names
  Exceptions: throws if slice_size is negative or if the requested
              seq_region could not be fetched

=cut

sub get_slice_names {
  my ($self) = @_;

  $self->slice_size(0) if (!$self->slice_size);
  $self->slice_overlaps(0) if (!$self->slice_overlaps);
  $self->coord_system_version('') if (!$self->coord_system_version);

  if ($self->slice_size && $self->slice_size < 0) {
    throw("Slice size must be >= 0. Currently " . $self->slice_size);
  }

  my $sa = $self->db->get_SliceAdaptor();

  my $slices;
  my $region_name = $self->seq_region_name;
  if (defined $region_name && $region_name ne '') {
    # Start and end are optional; use empty fields rather than
    # interpolating undef into the slice name.
    my $start = $self->seq_region_start;
    my $end   = $self->seq_region_end;
    $start = '' if (!defined $start);
    $end   = '' if (!defined $end);

    my $sname = sprintf("%s:%s:%s:%s:%s:",
                        $self->coord_system,
                        $self->coord_system_version,
                        $region_name,
                        $start,
                        $end);
    my $slice = $sa->fetch_by_name($sname);
    if (!$slice) {
      throw("Could not fetch a slice with the name $sname");
    }
    $slices = [$slice];
  }
  else {
    $slices = $sa->fetch_all($self->coord_system,
                             $self->coord_system_version,
                             $self->include_non_reference);
  }

  if ($self->slice_size > 0) {
    $slices = split_Slices($slices, $self->slice_size,
                           $self->slice_overlaps);
  }

  my @ids = map { $_->name } @$slices;
  return \@ids;
}


=head2 get_hap_pairs

  Arg [1]   : none
  Function  : builds one id for every assembly exception feature of type
              'HAP', made of the coord system name, version, seq_region
              name, start and end of the feature's slice followed by the
              same five fields of its alternate slice, colon separated
  Returntype: array ref of strings

=cut

sub get_hap_pairs {
  my ($self) = @_;

  my $aef_adaptor = $self->db->get_AssemblyExceptionFeatureAdaptor();
  my @features    = @{$aef_adaptor->fetch_all};

  # Kept for the object state it leaves behind; the version is not used
  # to build the ids below.
  $self->coord_system_version('') if (!$self->coord_system_version);

  my @ids;
  foreach my $aef (@features) {
    next unless ($aef->type eq 'HAP');

    my $sname = sprintf("%s:%s:%s:%s:%s:%s:%s:%s:%s:%s:",
                        $aef->slice->coord_system->name,
                        $aef->slice->coord_system->version,
                        $aef->slice->seq_region_name,
                        $aef->start,
                        $aef->end,
                        $aef->alternate_slice->coord_system->name,
                        $aef->alternate_slice->coord_system->version,
                        $aef->alternate_slice->seq_region_name,
                        $aef->alternate_slice->start,
                        $aef->alternate_slice->end);
    push(@ids, $sname);
  }
  return \@ids;
}


=head2 get_filenames

  Arg [1]   : none
  Function  : lists the entries of dir, skipping '.', '..' and
              subdirectories, and keeps those matching regex when a regex
              is set
  Returntype: array ref of filenames (without the directory)
  Exceptions: throws if no dir is set or the directory cannot be opened

=cut

sub get_filenames {
  my ($self) = @_;

  my $dir = $self->dir;
  if (!$dir) {
    throw("need a directory inorder to fetch the filenames to be used ".
          "as input_ids");
  }

  opendir(my $dh, $dir)
    or throw("Could not open directory $dir to read filenames: $!");
  my @allfiles = readdir($dh);
  closedir($dh);

  my $regexp = $self->regex();

  my @input_ids;
  foreach my $f (@allfiles) {
    next if ($f eq '.' || $f eq '..');
    # readdir returns bare names, so the directory test has to be made
    # against the path inside $dir, not the current working directory.
    next if (-d "$dir/$f");
    next if ($regexp && $f !~ /$regexp/);
    push(@input_ids, $f);
  }
  return \@input_ids;
}


=head2 get_translation_id

  Arg [1]   : none
  Function  : returns what TranslationAdaptor::list_dbIDs returns
  Returntype: presumably an array ref of translation dbIDs

=cut

sub get_translation_id {
  my ($self) = @_;

  return $self->db->get_TranslationAdaptor->list_dbIDs;
}


=head2 get_single

  Arg [1]   : none
  Function  : returns a one element list holding single_name
  Returntype: array ref

=cut

sub get_single {
  my ($self) = @_;

  return [$self->single_name];
}


=head2 store_input_ids

  Arg [1]   : none
  Function  : stores every id held in input_ids against the cached
              analysis through the StateInfoContainer
  Returntype: 1
  Exceptions: throws if there are ids to store but no analysis is cached,
              or if storing an id fails

=cut

sub store_input_ids {
  my ($self) = @_;

  my $ids = $self->input_ids || [];
  return 1 if (!@$ids);

  my $analysis = $self->_require_analysis;

  foreach my $id (@$ids) {
    eval {
      $self->stateinfocontainer->store_input_id_analysis($id, $analysis,
                                                         '');
    };
    if ($@) {
      throw("Error storing input_id $id for analysis ".
            $analysis->logic_name."\n".$@);
    }
  }
  return 1;
}


=head2 get_id_hash

  Arg [1]   : none
  Function  : builds a two level hash, input_id_type => input_id => 1,
              from the ids held in input_ids
  Returntype: hash ref
  Exceptions: throws if input_id_type is unset and no analysis is cached

=cut

sub get_id_hash {
  my ($self) = @_;

  my $ids  = $self->input_ids || [];
  my $type = $self->input_id_type;

  my %ids_of_type = map { $_ => 1 } @$ids;
  return { $type => \%ids_of_type };
}

1;