Raw content of Bio::EnsEMBL::Funcgen::DesignDefs # # EnsEMBL module for Bio::EnsEMBL::Funcgen::DesignDefs # =head1 NAME Bio::EnsEMBL::Funcgen::DesignDefs =head1 SYNOPSIS my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params); $imp->set_defs(); =head1 DESCRIPTION This is a definitions class which should not be instatiated directly, it normally inherited from the Importer. ArrayDefs contains meta data and methods specific to NimbleGen arrays to aid parsing and importing of experimental data. This module is currently NimbleGen specific but is likely to be genericised even more, extracting vendor specific data and methods to a separate VendorDefs module e.g. NimbleGenDefs. =head1 AUTHOR This module was written by Nathan Johnson. =head1 CONTACT Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk =head1 METHODS =cut package Bio::EnsEMBL::Funcgen::DesignDefs; use Bio::EnsEMBL::Funcgen::Array; use Bio::EnsEMBL::Funcgen::ProbeSet; use Bio::EnsEMBL::Funcgen::Probe; use Bio::EnsEMBL::Funcgen::ProbeFeature; use Bio::EnsEMBL::Funcgen::FeatureType; use Bio::EnsEMBL::Funcgen::ExperimentalChip; use Bio::EnsEMBL::Funcgen::ArrayChip; use Bio::EnsEMBL::Funcgen::Channel; use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file); use strict; my $reg = "Bio::EnsEMBL::Registry"; #Create internal object and pass ref to Importer #So importer calls generic methods on Defs object which then uses set ref to access #Do we need to "set" this or can we just have all of this in $self? =head2 set_defs Example : $imp->set_defs(); Description: Sets a definitions hash for the vendor. Returntype : none Exceptions : Throws if not called by Importer Caller : Importer Status : At risk - likely to be replaced/moved with/to VendorDefs =cut sub set_defs{ my ($self) = @_; warn "Change all this to new method and check for mandatory params"; #set defs as self attributes not defs #Need to separate this into vendor defs and experiment defs? #these cause undefs if we're just setting sanger defs without exp context my %array_defs = ( DESIGN => { #order of these data arrays is important! #array_data => ['experiment'], probe_data => ["probe"], #results_data => ["results"], #sample_key_fields => ['DESIGN_ID', 'CHIP_ID', 'DYE', 'PROMOT_SAMPLE_TYPE'],# 'SAMPLE_DESCRIPTION removed due to naming disparities prb_fields => ['SEQ_ID', 'POSITION', 'LENGTH', 'PROBE_SEQUENCE', 'PROBE_ID', 'UNIQUENESS_SCORE', 'TM', 'MAS_CYCLES'], #pos_fields => ['CHROMOSOME', 'PROBE_ID', 'POSITION', 'COUNT'], #result_fields => ['PROBE_ID', 'PM'], notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'], #import_methods => [], #data paths here? #but this is vendor specific and remains an array_def #design_dir => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/DesignFiles", #chip_file => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/SampleKey.txt", notes_file => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/DesignNotes.txt", #results_file => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/PairData/All_Pair.txt", #results_dir => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/PairData", #norm_method => 'VSN_GLOG', #dye_freqs => {( # Cy5 => 635, # Cy3 => 532, # )}, }, ); $self->{'array_defs'} = $array_defs{$self->vendor()}; return; } =head2 read_array_data Example : $imp->read_array_data(); Description: Parses NimbleGen DesignNotes.txt files to create and store new Arrays Returntype : none Exceptions : None Caller : general Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats? =cut #this is currently OLIGO specific. sub read_array_data{ my ($self, $design_notes) = @_; $self->log("Reading and importing array data"); $self->{'design_notes'} = $design_notes if $design_notes; my ($line, $array, $array_chip, @data, %hpos); my $oa_adaptor = $self->db->get_ArrayAdaptor(); my $ac_adaptor = $self->db->get_ArrayChipAdaptor(); #Slurp file to string, sets local delimtter to null and subs new lines my $fh = open_file("<", $self->{'design_notes'}); while ($line = <$fh>){ $line =~ s/\r*\n//;#chump @data = split/\t/o, $line; #We need to have a DESIGN vendor type? #also need to be able to set file path independently of defs if($. == 1){ %hpos = %{$self->set_header_hash(\@data, $self->get_def('notes_fields'))}; next; } ### CREATE AND STORE Array and ArrayChips if(! defined $array ){ #This is treating each array chip as a separate array, unless arrayset is defined #AT present we have no way of differentiating between different array_chips on same array???!!! #Need to add functionality afterwards to collate array_chips into single array #This will use a stored array if present $array = Bio::EnsEMBL::Funcgen::Array->new ( -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}], -FORMAT => uc($self->format()), -VENDOR => uc($self->vendor()), -TYPE => 'OLIGO', -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here ); ($array) = @{$oa_adaptor->store($array)}; $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( -ARRAY_ID => $array->dbID(), -NAME => $data[$hpos{'DESIGN_NAME'}], -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], #add description? ); #This will use a stored array_chip if present ($array_chip) = @{$ac_adaptor->store($array_chip)}; $array->add_ArrayChip($array_chip); } elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){ $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].". for same Array ".$array->name()."\n"); $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( -ARRAY_ID => $array->dbID(), -NAME => $data[$hpos{'DESIGN_NAME'}], -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], ); ($array_chip) = @{$ac_adaptor->store($array_chip)}; $array->add_ArrayChip($array_chip); } elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){ throw("Found experiment with more than one design without -array_set"); } } $self->add_array($array); close($fh); return; } =head2 read_probe_data Example : $imp->read_probe_data(); Description: Parses and imports probes, probe sets and features for a given array design Returntype : none Exceptions : throws is not tiling format Caller : Importer Status : at risk =cut #Assumes one chip_design per experimental set. sub read_probe_data{ my ($self, $array_file) = @_; $self->log("Reading and importing probe data"); my ($fh, $line, @data, @log, %hpos, %probe_pos);#, %duplicate_probes); my $aa = $self->db->get_AnalysisAdaptor(); my $manal = $aa->fetch_by_logic_name('MASCycles'); my $uanal = $aa->fetch_by_logic_name('UScore'); my $tmanal= $aa->fetch_by_logic_name('NimblegenTM'); $array_file ||= $self->array_file(); $self->log("Parsing ".$self->vendor()." probe data (".localtime().")"); throw("DesignDefs only accomodates a tiling design with no feature/probesets") if ($self->format() ne 'TILED'); ### Read in # eFG prb file, not chiip info yet so only one ArrayChip per design # potential to have pos file here for probes built on generic slices of genome #We need to handle different coord systems and possibly different assmemblies my $slice_a = $self->db->get_SliceAdaptor(); my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name_schema_build_version( 'chromosome', $self->db->_get_schema_build($self->db->dnadb()) ); #sanity check we're only dealing with one array/chip my @arrays = @{$self->arrays()}; if(scalar(@arrays) != 1){ throw("Array DESIGN imports only accomodate one Array per import, please check ".$self->{'design_notes'}); } my @achips = @{$arrays[0]->get_ArrayChips()}; if(scalar(@achips) != 1){ throw("Array DESIGN imports only accomodates one ArrayChip per import, please check ".$self->{'design_notes'}); } my $achip = $achips[0]; #foreach my $array(@{$self->arrays()}){ # foreach my $achip(@{$array->get_ArrayChips()}){ $self->log("Importing array design(".$achip->name().") from ".$array_file); if($achip->has_status('IMPORTED')){ $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id()); return; }elsif($self->recovery()){ $self->log("Rolling back partially imported ArrayChip:\t".$achip->design_id()); $self->db->rollback_ArrayChip($achip); } $self->log("Importing ArrayChip:".$achip->design_id()); #OPEN PROBE IN/OUT FILES $fh = open_file("<", $array_file); my $f_out = open_file(">", $self->get_dir("output")."/probe.".$achip->name()."fasta") if($self->{'_dump_fasta'}); my ($op, $of, %pfs); #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("TileMap");##??? my $strand = 0; #default for TileMap, should be defs hash? my $fasta = ""; while($line = <$fh>){ $line =~ s/\r*\n//; @data = split/\t/o, $line; my $loc = ""; #SEQ_ID WINDOW_START WINDOW_END POSITION LENGTH PROBE_SEQUENCE TM UNIQUENESS_SCORE MAS_CYCLES #X 3000001 3000100 3000041 56 TGACATCTTCAGTTCTTTACATAGTTTTCATATTAGTCCTCTATCAGATGTGGAGT 73.09 132 15 if ($. == 1){ %hpos = %{$self->set_header_hash(\@data, $self->get_def('prb_fields'))}; next; } #This assumes tiling format with no feature/probe sets if(%pfs){ $self->store_set_probes_features($achip->dbID(), \%pfs); undef %pfs; } #PROBE $op = Bio::EnsEMBL::Funcgen::Probe->new( -NAME => $data[$hpos{'PROBE_ID'}], -LENGTH => $data[$hpos{'LENGTH'}], -ARRAY => $arrays[0], -ARRAY_CHIP_ID => $achip->dbID(), -CLASS => 'DESIGN', ); $op->add_Analysis_score($manal, $data[$hpos{'MAS_CYCLES'}]); $op->add_Analysis_score($tmanal, $data[$hpos{'TM'}]); $op->add_Analysis_CoordSystem_score($uanal, $cs, $data[$hpos{'UNIQUENESS_SCORE'}]); #would need to pass cs to store USCORE, CYCLES and TM are seq/anal dependent not cs #options: #associate cs dependent scores with features #we are duplicating cs in probe_design table #do we need another object? ProbeDesign #-cs would be empty for all but uscore #-mas_cycles analysis_id #-uscore analysis_id cs_id #-tm analysis_id (anal most likely wont change so highly redundant) #this would produce 3 records for each probe, with cs being empty for two and anal being redundant for the other #would however provide for extensible design attributes #would only need one tm and mas_cycles for each probe irrespective of cs #could have separate table probe_design_feature? #mmm doesn't have location, just cs #just have empty cs fields, calls by cs would have to be in ('cs_id', 'NULL') #or just make uscore method dependent on cs. #or split table? #ProbeDesign::add_analysis_score #ProbeDesign::add_coord_sys_analysis_score #can we add this directly to the probe? #separate retrieval in ProbeAdaptor so we're not joining everytime #this would require generic get_analysis/analysis_coord_system_attribute method %{$pfs{$data[$hpos{'PROBE_ID'}]}} = ( probe => $op, features => [], ); #PROBE FEATURE if(! $self->cache_slice($data[$hpos{'SEQ_ID'}])){ warn("Skipping non-standard probe chromosome"); undef %pfs; next; } my $end = ($data[$hpos{'POSITION'}] + $data[$hpos{'LENGTH'}]); if ($self->{'_dump_fasta'}){ $loc .= $data[$hpos{'SEQ_ID'}].":".$data[$hpos{'POSITION'}]."-${end};"; } $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new ( -START => $data[$hpos{'POSITION'}], -END => $end, -STRAND => $strand, -SLICE => $self->cache_slice($data[$hpos{'SEQ_ID'}]), -ANALYSIS => $anal, -MISMATCHCOUNT => 0, -CIGAR_LINE => $data[$hpos{'LENGTH'}].'M', -PROBE => undef,#Need to update this in the store method ); push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of; if($self->{'_dump_fasta'}){ #filter controls/randoms? Or would it be sensible to see where they map #wrap seq here? $fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}]. "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; } } #need to store last data here $self->store_set_probes_features($achip->dbID(), \%pfs); $self->log(join("\n", @log)); $achip->adaptor->set_status("IMPORTED", $achip); $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED"); if ($self->{'_dump_fasta'}){ print $f_out $fasta if($self->{'_dump_fasta'}); close($f_out); } $self->log("Finished parsing probe data"); #Total probe_sets:\t$psid\n". # "Total probes:\t$pid\nTotal probe_features:\t$fid"); return; } 1;