Bio::EnsEMBL::Funcgen::Parsers ArrayDesign
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Funcgen::Array
Bio::EnsEMBL::Funcgen::ArrayChip
Bio::EnsEMBL::Funcgen::Channel
Bio::EnsEMBL::Funcgen::ExperimentalChip
Bio::EnsEMBL::Funcgen::FeatureType
Bio::EnsEMBL::Funcgen::Probe
Bio::EnsEMBL::Funcgen::ProbeFeature
Bio::EnsEMBL::Funcgen::ProbeSet
Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw ( species_chr_num open_file )
Bio::EnsEMBL::Funcgen::Utils::Helper
Bio::EnsEMBL::Utils::Exception qw ( throw warning deprecate )
strict
Inherit
Bio::EnsEMBL::Funcgen::Utils::Helper
Synopsis
  my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign";
push @INC, $parser_type;
my $imp = $class->SUPER::new(@_); my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params);
$imp->set_config();
Description
This is a definitions class which should not be instatiated directly, it
normally inherited from the Importer. ArrayDesign contains meta data and methods
specific to handling array designs only (i.e. no experimental data), which have
been produced from the eFG array design software.
Methods
newDescriptionCode
read_array_dataDescriptionCode
read_probe_dataDescriptionCode
set_configDescriptionCode
Methods description
newcode    nextTop
  Example    : my $self = $class->SUPER::new(@_);
Description: Constructor method for ArrayDesign class
Returntype : Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
Exceptions : throws if Experiment name not defined or if caller is not Importer
Caller : Bio::EnsEMBL::Funcgen::Importer
Status : at risk
read_array_datacodeprevnextTop
  Example    : $imp->read_array_data();
Description: Parses NimbleGen style DesignNotes.txt format files to create and store new Arrays
Returntype : none
Exceptions : None
Caller : general
Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats?
read_probe_datacodeprevnextTop
  Example    : $imp->read_probe_data();
Description: Parses and imports probes, probe sets and features for a given array design
Returntype : none
Exceptions : throws is not tiling format
Caller : Importer
Status : at risk
set_configcodeprevnextTop
  Example    : my $self->set_config;
Description: Sets attribute dependent config
Returntype : None
Exceptions : None
Caller : Bio::EnsEMBL::Funcgen::Importer
Status : at risk
Methods code
newdescriptionprevnextTop
sub new {
  my $caller = shift;

  my $class = ref($caller) || $caller;
  my $self = $class->SUPER::new();

  throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));


  $self->{'config'} =  
    {(     
      probe_data   => ["probe"],
      prb_fields   => ['SEQ_ID',  'POSITION', 'LENGTH', 'PROBE_SEQUENCE', 'PROBE_ID', 'UNIQUENESS_SCORE', 'TM', 'MAS_CYCLES'],
      notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'],
     )};

  return $self;
}
read_array_datadescriptionprevnextTop
sub read_array_data {
  my ($self, $design_notes) = @_;

  $self->log("Reading and importing array data");

  throw('You need to pass the path to a DesignNotes.txt file') if ! defined $design_notes;
  $self->{'design_notes'} = $design_notes;

  my ($line, $array, $array_chip, @data, %hpos);
  my $oa_adaptor = $self->db->get_ArrayAdaptor();
  my $ac_adaptor = $self->db->get_ArrayChipAdaptor();
  my $fh = open_file("<", $self->{'design_notes'});
  
  while ($line = <$fh>){
    
    $line =~ s/\r*\n//;#chump
@data = split/\t/o, $line; #We need to have a DESIGN vendor type?
#also need to be able to set file path independently of config
if($. == 1){ %hpos = %{$self->set_header_hash(\@data, $self->get_config('notes_fields'))}; next; } ### CREATE AND STORE Array and ArrayChips
if(! defined $array ){ #This is treating each array chip as a separate array, unless arrayset is defined
#AT present we have no way of differentiating between different array_chips on same array???!!!
#Need to add functionality afterwards to collate array_chips into single array
#This will use a stored array if present
$array = Bio::EnsEMBL::Funcgen::Array->new ( -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}], -FORMAT => uc($self->format()), -VENDOR => uc($self->vendor()), -TYPE => 'OLIGO', -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here
); ($array) = @{$oa_adaptor->store($array)}; $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( -ARRAY_ID => $array->dbID(), -NAME => $data[$hpos{'DESIGN_NAME'}], -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], #add description?
); #This will use a stored array_chip if present
($array_chip) = @{$ac_adaptor->store($array_chip)}; $array->add_ArrayChip($array_chip); } elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){ $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].". for same Array ".$array->name()."\n"); $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( -ARRAY_ID => $array->dbID(), -NAME => $data[$hpos{'DESIGN_NAME'}], -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], ); ($array_chip) = @{$ac_adaptor->store($array_chip)}; $array->add_ArrayChip($array_chip); } elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){ throw("Found experiment with more than one design without -array_set"); } } $self->add_Array($array); close($fh); return;
}
read_probe_datadescriptionprevnextTop
sub read_probe_data {
  my ($self, $array_file) = @_;
  
  $self->log("Reading and importing probe data");


  my ($fh, $line, @data, @log, %hpos, %probe_pos);#, %duplicate_probes);
my $aa = $self->db->get_AnalysisAdaptor(); my $manal = $aa->fetch_by_logic_name('MASCycles'); my $uanal = $aa->fetch_by_logic_name('UScore'); my $tmanal= $aa->fetch_by_logic_name('NimblegenTM'); $array_file ||= $self->array_file(); $self->log("Parsing ".$self->vendor()." probe data (".localtime().")"); throw("ArrayDesign only accomodates a tiling design with no feature/probesets") if ($self->format() ne 'TILED'); ### Read in
# eFG prb file, not chiip info yet so only one ArrayChip per design
# potential to have pos file here for probes built on generic slices of genome
#We need to handle different coord systems and possibly different assmemblies
my $slice_a = $self->db->get_SliceAdaptor(); my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name_schema_build_version( 'chromosome', $self->db->_get_schema_build($self->db->dnadb()) ); #sanity check we're only dealing with one array/chip
my @arrays = @{$self->arrays()}; if(scalar(@arrays) != 1){ throw("Array DESIGN imports only accomodate one Array per import, please check ".$self->{'design_notes'}); } my @achips = @{$arrays[0]->get_ArrayChips()}; if(scalar(@achips) != 1){ throw("Array DESIGN imports only accomodates one ArrayChip per import, please check ".$self->{'design_notes'}); } my $achip = $achips[0]; #foreach my $array(@{$self->arrays()}){
# foreach my $achip(@{$array->get_ArrayChips()}){
$self->log("Importing array design(".$achip->name().") from ".$array_file); if($achip->has_status('IMPORTED')){ $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id()); return; }elsif($self->recovery()){ $self->log("Rolling back partially imported ArrayChip:\t".$achip->design_id()); $self->db->rollback_ArrayChip($achip); } $self->log("Importing ArrayChip:".$achip->design_id()); #OPEN PROBE IN/OUT FILES
$fh = open_file("<", $array_file); my $f_out = open_file(">", $self->get_dir("output")."/probe.".$achip->name()."fasta") if($self->{'_dump_fasta'}); my ($op, $of, %pfs); #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap
my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("TileMap");##???
my $strand = 0; #default for TileMap, should be config hash?
my $fasta = ""; while($line = <$fh>){ $line =~ s/\r*\n//; @data = split/\t/o, $line; my $loc = ""; #SEQ_ID WINDOW_START WINDOW_END POSITION LENGTH PROBE_SEQUENCE TM UNIQUENESS_SCORE MAS_CYCLES
#X 3000001 3000100 3000041 56 TGACATCTTCAGTTCTTTACATAGTTTTCATATTAGTCCTCTATCAGATGTGGAGT 73.09 132 15
if ($. == 1){ %hpos = %{$self->set_header_hash(\@data, $self->get_config('prb_fields'))}; next; } #This assumes tiling format with no feature/probe sets
if(%pfs){ $self->store_set_probes_features($achip->dbID(),\% pfs); undef %pfs; } #PROBE
$op = Bio::EnsEMBL::Funcgen::Probe->new( -NAME => $data[$hpos{'PROBE_ID'}], -LENGTH => $data[$hpos{'LENGTH'}], -ARRAY => $arrays[0], -ARRAY_CHIP_ID => $achip->dbID(), -CLASS => 'DESIGN', ); $op->add_Analysis_score($manal, $data[$hpos{'MAS_CYCLES'}]); $op->add_Analysis_score($tmanal, $data[$hpos{'TM'}]); $op->add_Analysis_CoordSystem_score($uanal, $cs, $data[$hpos{'UNIQUENESS_SCORE'}]); #would need to pass cs to store USCORE, CYCLES and TM are seq/anal dependent not cs
#options:
#associate cs dependent scores with features
#we are duplicating cs in probe_design table
#do we need another object? ProbeDesign
#-cs would be empty for all but uscore
#-mas_cycles analysis_id
#-uscore analysis_id cs_id
#-tm analysis_id (anal most likely wont change so highly redundant)
#this would produce 3 records for each probe, with cs being empty for two and anal being redundant for the other
#would however provide for extensible design attributes
#would only need one tm and mas_cycles for each probe irrespective of cs
#could have separate table probe_design_feature?
#mmm doesn't have location, just cs
#just have empty cs fields, calls by cs would have to be in ('cs_id', 'NULL')
#or just make uscore method dependent on cs.
#or split table?
#ProbeDesign::add_analysis_score
#ProbeDesign::add_coord_sys_analysis_score
#can we add this directly to the probe?
#separate retrieval in ProbeAdaptor so we're not joining everytime
#this would require generic get_analysis/analysis_coord_system_attribute method
%{$pfs{$data[$hpos{'PROBE_ID'}]}} = ( probe => $op, features => [], ); #PROBE FEATURE
if(! $self->cache_slice($data[$hpos{'SEQ_ID'}])){ warn("Skipping non-standard probe chromosome"); undef %pfs; next; } my $end = ($data[$hpos{'POSITION'}] + $data[$hpos{'LENGTH'}]); if ($self->{'_dump_fasta'}){ $loc .= $data[$hpos{'SEQ_ID'}].":".$data[$hpos{'POSITION'}]."-${end};"; } $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new ( -START => $data[$hpos{'POSITION'}], -END => $end, -STRAND => $strand, -SLICE => $self->cache_slice($data[$hpos{'SEQ_ID'}]), -ANALYSIS => $anal, -MISMATCHCOUNT => 0, -CIGAR_LINE => $data[$hpos{'LENGTH'}].'M', -PROBE => undef,#Need to update this in the store method
); push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of; if($self->{'_dump_fasta'}){ #filter controls/randoms? Or would it be sensible to see where they map
#wrap seq here?
$fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}]. "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; } } #need to store last data here
$self->store_set_probes_features($achip->dbID(),\% pfs); $self->log(join("\n", @log)); $achip->adaptor->set_status("IMPORTED", $achip); $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED"); if ($self->{'_dump_fasta'}){ print $f_out $fasta if($self->{'_dump_fasta'}); close($f_out); } $self->log("Finished parsing probe data"); #Total probe_sets:\t$psid\n".
# "Total probes:\t$pid\nTotal probe_features:\t$fid");
return; } 1;
}
set_configdescriptionprevnextTop
sub set_config {
  my ($self) = @_;
  
  #placeholder method
#set paths
return;
}
General documentation
AUTHORTop
This module was written by Nathan Johnson.
CONTACTTop
Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk