Bio::EnsEMBL::Funcgen DesignDefs
SummaryPackage variablesSynopsisDescriptionGeneral documentationMethods
WebCvsRaw content
Package variables
Privates (from "my" definitions)
$reg = "Bio::EnsEMBL::Registry"
Included modules
Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw ( species_chr_num open_file )
Bio::EnsEMBL::Utils::Exception qw ( throw warning deprecate )
  my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params);
This is a definitions class which should not be instatiated directly, it
normally inherited from the Importer. ArrayDefs contains meta data and methods specific
to NimbleGen arrays to aid parsing and importing of experimental data.
This module is currently NimbleGen specific but is likely to be genericised even more,
extracting vendor specific data and methods to a separate VendorDefs module e.g. NimbleGenDefs.
Methods description
read_array_datacode    nextTop
  Example    : $imp->read_array_data();
Description: Parses NimbleGen DesignNotes.txt files to create and store new Arrays
Returntype : none
Exceptions : None
Caller : general
Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats?
  Example    : $imp->read_probe_data();
Description: Parses and imports probes, probe sets and features for a given array design
Returntype : none
Exceptions : throws is not tiling format
Caller : Importer
Status : at risk
  Example    : $imp->set_defs();
Description: Sets a definitions hash for the vendor.
Returntype : none
Exceptions : Throws if not called by Importer
Caller : Importer
Status : At risk - likely to be replaced/moved with/to VendorDefs
Methods code
sub read_array_data {
  my ($self, $design_notes) = @_;

  $self->log("Reading and importing array data");

  $self->{'design_notes'} = $design_notes if $design_notes;

  my ($line, $array, $array_chip, @data, %hpos);
  my $oa_adaptor = $self->db->get_ArrayAdaptor();
  my $ac_adaptor = $self->db->get_ArrayChipAdaptor();


  #Slurp file to string, sets local delimtter to null and subs new lines
my $fh = open_file("<", $self->{'design_notes'}); while ($line = <$fh>){ $line =~ s/\r*\n//;#chump
@data = split/\t/o, $line; #We need to have a DESIGN vendor type?
#also need to be able to set file path independently of defs
if($. == 1){ %hpos = %{$self->set_header_hash(\@data, $self->get_def('notes_fields'))}; next; } ### CREATE AND STORE Array and ArrayChips
if(! defined $array ){ #This is treating each array chip as a separate array, unless arrayset is defined
#AT present we have no way of differentiating between different array_chips on same array???!!!
#Need to add functionality afterwards to collate array_chips into single array
#This will use a stored array if present
$array = Bio::EnsEMBL::Funcgen::Array->new ( -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}], -FORMAT => uc($self->format()), -VENDOR => uc($self->vendor()), -TYPE => 'OLIGO', -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here
); ($array) = @{$oa_adaptor->store($array)}; $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( -ARRAY_ID => $array->dbID(), -NAME => $data[$hpos{'DESIGN_NAME'}], -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], #add description?
); #This will use a stored array_chip if present
($array_chip) = @{$ac_adaptor->store($array_chip)}; $array->add_ArrayChip($array_chip); } elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){ $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].". for same Array ".$array->name()."\n"); $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( -ARRAY_ID => $array->dbID(), -NAME => $data[$hpos{'DESIGN_NAME'}], -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], ); ($array_chip) = @{$ac_adaptor->store($array_chip)}; $array->add_ArrayChip($array_chip); } elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){ throw("Found experiment with more than one design without -array_set"); } } $self->add_array($array); close($fh); return;
sub read_probe_data {
  my ($self, $array_file) = @_;
  $self->log("Reading and importing probe data");

  my ($fh, $line, @data, @log, %hpos, %probe_pos);#, %duplicate_probes);
my $aa = $self->db->get_AnalysisAdaptor(); my $manal = $aa->fetch_by_logic_name('MASCycles'); my $uanal = $aa->fetch_by_logic_name('UScore'); my $tmanal= $aa->fetch_by_logic_name('NimblegenTM'); $array_file ||= $self->array_file(); $self->log("Parsing ".$self->vendor()." probe data (".localtime().")"); throw("DesignDefs only accomodates a tiling design with no feature/probesets") if ($self->format() ne 'TILED'); ### Read in
# eFG prb file, not chiip info yet so only one ArrayChip per design
# potential to have pos file here for probes built on generic slices of genome
#We need to handle different coord systems and possibly different assmemblies
my $slice_a = $self->db->get_SliceAdaptor(); my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name_schema_build_version( 'chromosome', $self->db->_get_schema_build($self->db->dnadb()) ); #sanity check we're only dealing with one array/chip
my @arrays = @{$self->arrays()}; if(scalar(@arrays) != 1){ throw("Array DESIGN imports only accomodate one Array per import, please check ".$self->{'design_notes'}); } my @achips = @{$arrays[0]->get_ArrayChips()}; if(scalar(@achips) != 1){ throw("Array DESIGN imports only accomodates one ArrayChip per import, please check ".$self->{'design_notes'}); } my $achip = $achips[0]; #foreach my $array(@{$self->arrays()}){
# foreach my $achip(@{$array->get_ArrayChips()}){
$self->log("Importing array design(".$achip->name().") from ".$array_file); if($achip->has_status('IMPORTED')){ $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id()); return; }elsif($self->recovery()){ $self->log("Rolling back partially imported ArrayChip:\t".$achip->design_id()); $self->db->rollback_ArrayChip($achip); } $self->log("Importing ArrayChip:".$achip->design_id()); #OPEN PROBE IN/OUT FILES
$fh = open_file("<", $array_file); my $f_out = open_file(">", $self->get_dir("output")."/probe.".$achip->name()."fasta") if($self->{'_dump_fasta'}); my ($op, $of, %pfs); #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap
my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("TileMap");##???
my $strand = 0; #default for TileMap, should be defs hash?
my $fasta = ""; while($line = <$fh>){ $line =~ s/\r*\n//; @data = split/\t/o, $line; my $loc = ""; #SEQ_ID WINDOW_START WINDOW_END POSITION LENGTH PROBE_SEQUENCE TM UNIQUENESS_SCORE MAS_CYCLES
if ($. == 1){ %hpos = %{$self->set_header_hash(\@data, $self->get_def('prb_fields'))}; next; } #This assumes tiling format with no feature/probe sets
if(%pfs){ $self->store_set_probes_features($achip->dbID(),\% pfs); undef %pfs; } #PROBE
$op = Bio::EnsEMBL::Funcgen::Probe->new( -NAME => $data[$hpos{'PROBE_ID'}], -LENGTH => $data[$hpos{'LENGTH'}], -ARRAY => $arrays[0], -ARRAY_CHIP_ID => $achip->dbID(), -CLASS => 'DESIGN', ); $op->add_Analysis_score($manal, $data[$hpos{'MAS_CYCLES'}]); $op->add_Analysis_score($tmanal, $data[$hpos{'TM'}]); $op->add_Analysis_CoordSystem_score($uanal, $cs, $data[$hpos{'UNIQUENESS_SCORE'}]); #would need to pass cs to store USCORE, CYCLES and TM are seq/anal dependent not cs
#associate cs dependent scores with features
#we are duplicating cs in probe_design table
#do we need another object? ProbeDesign
#-cs would be empty for all but uscore
#-mas_cycles analysis_id
#-uscore analysis_id cs_id
#-tm analysis_id (anal most likely wont change so highly redundant)
#this would produce 3 records for each probe, with cs being empty for two and anal being redundant for the other
#would however provide for extensible design attributes
#would only need one tm and mas_cycles for each probe irrespective of cs
#could have separate table probe_design_feature?
#mmm doesn't have location, just cs
#just have empty cs fields, calls by cs would have to be in ('cs_id', 'NULL')
#or just make uscore method dependent on cs.
#or split table?
#can we add this directly to the probe?
#separate retrieval in ProbeAdaptor so we're not joining everytime
#this would require generic get_analysis/analysis_coord_system_attribute method
%{$pfs{$data[$hpos{'PROBE_ID'}]}} = ( probe => $op, features => [], ); #PROBE FEATURE
if(! $self->cache_slice($data[$hpos{'SEQ_ID'}])){ warn("Skipping non-standard probe chromosome"); undef %pfs; next; } my $end = ($data[$hpos{'POSITION'}] + $data[$hpos{'LENGTH'}]); if ($self->{'_dump_fasta'}){ $loc .= $data[$hpos{'SEQ_ID'}].":".$data[$hpos{'POSITION'}]."-${end};"; } $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new ( -START => $data[$hpos{'POSITION'}], -END => $end, -STRAND => $strand, -SLICE => $self->cache_slice($data[$hpos{'SEQ_ID'}]), -ANALYSIS => $anal, -MISMATCHCOUNT => 0, -CIGAR_LINE => $data[$hpos{'LENGTH'}].'M', -PROBE => undef,#Need to update this in the store method
); push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of; if($self->{'_dump_fasta'}){ #filter controls/randoms? Or would it be sensible to see where they map
#wrap seq here?
$fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}]. "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; } } #need to store last data here
$self->store_set_probes_features($achip->dbID(),\% pfs); $self->log(join("\n", @log)); $achip->adaptor->set_status("IMPORTED", $achip); $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED"); if ($self->{'_dump_fasta'}){ print $f_out $fasta if($self->{'_dump_fasta'}); close($f_out); } $self->log("Finished parsing probe data"); #Total probe_sets:\t$psid\n".
# "Total probes:\t$pid\nTotal probe_features:\t$fid");
return; } 1;
sub set_defs {
	my ($self) = @_;

	warn "Change all this to new method and check for mandatory params";
	#set defs as self attributes not defs
#Need to separate this into vendor defs and experiment defs?
#these cause undefs if we're just setting sanger defs without exp context
my %array_defs = ( DESIGN => { #order of these data arrays is important!
#array_data => ['experiment'],
probe_data => ["probe"], #results_data => ["results"],
#sample_key_fields => ['DESIGN_ID', 'CHIP_ID', 'DYE', 'PROMOT_SAMPLE_TYPE'],# 'SAMPLE_DESCRIPTION removed due to naming disparities
#result_fields => ['PROBE_ID', 'PM'],
notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'], #import_methods => [],
#data paths here?
#but this is vendor specific and remains an array_def
#design_dir => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/DesignFiles",
#chip_file => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/SampleKey.txt",
notes_file => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/DesignNotes.txt", #results_file => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/PairData/All_Pair.txt",
#results_dir => $self->get_dir("data")."/input/".$self->vendor()."/".$self->name()."/PairData",
#norm_method => 'VSN_GLOG',
#dye_freqs => {(
# Cy5 => 635,
# Cy3 => 532,
# )},
}, ); $self->{'array_defs'} = $array_defs{$self->vendor()}; return;
General documentation
This module was written by Nathan Johnson.
Post questions to the EnsEMBL development list