Bio::EnsEMBL::Funcgen::Parsers::Simple
Summary
Bio::EnsEMBL::Funcgen::Parsers::Simple
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Funcgen::AnnotatedFeature
Bio::EnsEMBL::Funcgen::ExperimentalSet
Bio::EnsEMBL::Funcgen::FeatureSet
Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw ( species_chr_num open_file )
Bio::EnsEMBL::Funcgen::Utils::Helper
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( throw warning deprecate )
strict
Inherit
Bio::EnsEMBL::Funcgen::Utils::Helper
Synopsis
  use vars qw(@ISA);
  @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::ExperimentalSet);
Description
This is a base class to support simple file format parsers. For simple imports the vendor is
set to the parser type, i.e. the file format. The generic read_and_import_simple_data method
assumes a one-line-per-feature format; other formats need their own read_and_import_format_data
method, which will need defining in the result_data config element.
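As a hedged illustration of that pattern (not part of this module), a minimal one-line-per-feature subclass might look like the sketch below. The package name, column layout and values are invented; feature_params, data_set, load_feature_and_xrefs and the parse_line/process_params hooks are the methods used in the code further down this page, and cache_slice is assumed to be provided by the Importer.

  # Hypothetical subclass sketch; names and file layout are examples only.
  package Bio::EnsEMBL::Funcgen::Parsers::MyFormat;

  use strict;
  use vars qw(@ISA);
  @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::Simple);

  # Called by read_and_import_data for every non-comment, non-empty line
  sub parse_line {
    my ($self, $line) = @_;

    my ($chr, $start, $end, $label) = split /\t/, $line;  # assumed tab-delimited columns

    # Build the parameter cache for one AnnotatedFeature, then store it plus any queued xrefs
    $self->feature_params->{-start}         = $start;
    $self->feature_params->{-end}           = $end;
    $self->feature_params->{-strand}        = 0;
    $self->feature_params->{-display_label} = $label;
    $self->feature_params->{-slice}         = $self->cache_slice($chr);  # assumed Importer helper
    $self->feature_params->{-feature_set}   = $self->data_set->product_FeatureSet;

    $self->load_feature_and_xrefs;
    return;
  }

  # Nothing is held back between lines in this simple case
  sub process_params { return; }

  1;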
Methods
annotated_feature_adaptor
No description
Code
count
No description
Code
counts
No description
Code
data_set
No description
Code
dbentry_adaptor
No description
Code
dbentry_params
No description
Code
define_sets
No description
Code
experimental_set_adaptor
No description
Code
feature_params
No description
Code
feature_separator
No description
Code
load_feature_and_xrefs
No description
Code
new
Description
Code
read_and_import_data
No description
Code
set_config
Description
Code
set_feature_separator
No description
Code
validate_files
No description
Code
Methods description
new
  Example    : my $self = $class->SUPER::new(@_);
Description: Constructor method for Simple class
Returntype : Bio::EnsEMBL::Funcgen::Parsers::Simple
Exceptions : throws if caller is not Importer
Caller : Bio::EnsEMBL::Funcgen::Parsers::Simple
Status : at risk
set_config
  Example    : my $self->set_config;
Description: Sets attribute dependent config
Returntype : None
Exceptions : None
Caller : Bio::EnsEMBL::Funcgen::Importer
Status : at risk
Methods code
annotated_feature_adaptor
sub annotated_feature_adaptor {
  my $self = shift;
  return $self->{'annotated_feature_adaptor'};
}
count
sub count {
  my ($self, $count_type) = @_;

  $self->{'_counts'}{$count_type} ||=0;
  $self->{'_counts'}{$count_type}++;
  return;
}
counts
sub counts {
  my $self = shift;
  return $self->{'_counts'};
}
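For example (illustrative only, the category names are arbitrary), a parser method can accumulate and report totals like this:

  # Hypothetical usage of the count/counts cache from within a parser
  $self->count('features');        # increment the 'features' total
  $self->count('skipped_lines');   # a second, independent total

  my $counts = $self->counts;      # hashref of everything counted so far
  $self->log('Imported '.($counts->{'features'} || 0).' features');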
data_set
sub data_set {
  my $self = shift;
  return $self->{'_data_set'};
}
dbentry_adaptor
sub dbentry_adaptor {
  my $self = shift;
  return $self->{'dbentry_adaptor'};
}
dbentry_params
sub dbentry_params {
  my $self = shift;
  return $self->{'_dbentry_params'};
}
define_sets
sub define_sets {
  my ($self) = @_;

  my $eset = $self->experimental_set_adaptor->fetch_by_name($self->experimental_set_name);
  
  if(! defined $eset){
	$eset = Bio::EnsEMBL::Funcgen::ExperimentalSet->new
	  (
	   -name         => $self->experimental_set_name(),
	   -experiment   => $self->experiment(),
	   -feature_type => $self->feature_type(),
	   -cell_type    => $self->cell_type(),
	   -vendor       => $self->vendor(),
	   -format       => $self->format(),
	   -analysis     => $self->feature_analysis,
	  );
	($eset)  = @{$self->experimental_set_adaptor->store($eset)};

  }

  #Use define_and_validate with fetch/append as we may have a pre-existing set
  my $dset = $self->define_and_validate_sets
	(
	 -dbadaptor       => $self->db,
	 -name            => $self->experimental_set_name,
	 -feature_type    => $self->feature_type,
	 -cell_type       => $self->cell_type,
	 -analysis        => $self->feature_analysis,
	 -type            => 'annotated',
	 -description     => $self->feature_set_description,
	 #-append         => 1, #Omit append to ensure we only have this eset
	 -recovery        => $self->recovery,
	 -supporting_sets => [$eset],
	 #Can't set rollback here, as we don't know until we've validated the files.
	 #Can't validate the files until we have the sets.
	 #So we're doing this manually in validate_files
	);

  #We are now using IMPORTED to define whether a FeatureSet has been imported successfully.
  #However we already have IMPORTED on the ExperimentalSubSet;
  #we should add it to FeatureSet to remain consistent.
  #See Helper::define_and_validate_sets for more notes on
  #potential problems with FeatureSet IMPORTED status

  $self->{'_data_set'} = $dset;

  return $self->{'_data_set'};
}
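The DataSet cached here is what read_and_import_data and validate_files below pull apart; a minimal fragment mirroring that usage:

  # Illustrative fragment mirroring read_and_import_data below
  my $dset   = $self->define_sets;                # also cached as $self->{'_data_set'}
  my $fset   = $dset->product_FeatureSet;         # target FeatureSet for the AnnotatedFeatures
  my ($eset) = @{$dset->get_supporting_sets};     # ExperimentalSet tracking the input files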
experimental_set_adaptor
sub experimental_set_adaptor {
  my $self = shift;
  return $self->{'experimental_set_adaptor'};
}
feature_params
sub feature_params {
  my $self = shift;
  return $self->{'_feature_params'};
}

#getter only
feature_separator
sub feature_separator {
  my $self = shift;
  return $self->{'_feature_separator'};
}

#getter only
load_feature_and_xrefs
sub load_feature_and_xrefs {
  my $self = shift;

  my $seq;

  #grab seq if dump fasta and available
  if($self->dump_fasta){

    if(exists $self->feature_params->{'sequence'}){
      $seq = $self->feature_params->{'sequence'};
      delete $self->feature_params->{'sequence'};
    }
    else{
      $self->log('No fasta sequence available for '.$self->feature_params->{'display_label'});
    }
  }

  my $feature = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new(%{$self->feature_params});
  ($feature)  = @{$self->annotated_feature_adaptor->store($feature)};
  $self->count('stored_features');

  ##This needs to be handled in caller as we are validating loci
  #if($self->ucsc_coords){
  #  $start += 1;
  #  $end   += 1;
  #}

  #This needs to be put in a separate sub and called by the caller
  #if(! $self->cache_slice($chr)){
  #  warn "Skipping AnnotatedFeature import, found non standard chromosome: $chr";
  #}
  #else{

  #dump fasta here
  if ($self->dump_fasta){
    $self->{'_fasta'} .= $self->generate_fasta_header($feature)."\n$seq\n";
    #$fasta .= '>'.$pid."\n".$self->cache_slice($chr)->sub_Slice($start, $end, 1)->seq()."\n";
  }

  #Now we need to store the xrefs
  #my $edb_ref = $self->db->dbc->db_handle->selectrow_arrayref('select db_name from external_db where db_name="ensembl_variation_Variation"');
  #if(! defined $edb_ref){
  #  throw('Could not find external_db ensembl_variation_Variation');
  #}
  #my ($edbname) = @{$edb_ref};

  foreach my $dbentry_hash(@{$self->{'_dbentry_params'}}){
    my $ftype = $dbentry_hash->{feature_type};
    delete $dbentry_hash->{feature_type};

    my $dbentry = Bio::EnsEMBL::DBEntry->new(%{$dbentry_hash});
    $self->dbentry_adaptor->store($dbentry, $feature->dbID, $ftype, 1);#1 is ignore release flag
    #count here? no count in caller
  }

  #Clean data cache
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  return $feature;
}

1;
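A hedged sketch of how a subclass line parser might queue an xref alongside the cached feature before calling this method; the external DB name and identifiers are invented, only the caching pattern follows the code above:

  # Hypothetical fragment from a subclass parse_line method
  my ($external_id, $external_name) = ('XYZ123', 'XYZ');   # invented values

  push @{$self->dbentry_params}, {
    -dbname      => 'SomeExternalDB',       # invented external_db name
    -primary_id  => $external_id,
    -display_id  => $external_name,
    feature_type => 'AnnotatedFeature',     # removed and passed separately to the DBEntryAdaptor
  };

  # Stores the AnnotatedFeature built from feature_params, stores the queued DBEntries,
  # then clears both caches ready for the next line
  my $feature = $self->load_feature_and_xrefs;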
new
sub new {
  my $caller = shift;
  
  my $class = ref($caller) || $caller;
  my $self  = $class->SUPER::new(@_);
  

  #No rollback flag yet to avoid losing old data which we are not reimporting
  #Could potentially take fields params directly to define a custom format
  #Take direct field mappings, plus special fields which need parsing differently
  #i.e. default is tab delimited, and GFF would define Attrs field as compound field and provide special parsing and field mapping

  throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly")
    if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));

  $self->{'config'} =
    {(
      #can we omit these?
      array_data     => [],#['experiment'],
      probe_data     => [],#["probe"],
      norm_method    => undef,
      #protocols     => {()},
      'results_data' => ["and_import"],
     )};

  #set up feature params
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];
  $self->{'counts'}          = {};

  return $self;
}

#we surely use these only once in the code
#is it faster to cache like this over the reg method?
#or should we just use reg directly?
read_and_import_data
sub read_and_import_data {
  my $self = shift;
    
  $self->log("Reading and importing ".$self->vendor()." data");
  my ($filename, $fh, $f_out, $fasta_file,  %feature_params, @lines);
  my $dset   = $self->define_sets;
  my $fset   = $dset->product_FeatureSet;

  #If we can do these the other way around we can get define_sets to rollback the FeatureSet
  #Cyclical dependency for the sets :|
  my ($eset)     = @{$dset->get_supporting_sets};
  my ($new_data) = $self->validate_files;
  #should remove IMPORTED status from FeatureSet before commencing load?

  ### READ AND IMPORT FILES ###
  foreach my $filepath(@{$self->result_files()}) {
    chomp $filepath;
    ($filename = $filepath) =~ s/.*\///;

    #We're checking for recover here, as we have to reload all if just one has been screwed up.
    if( $new_data->{$filepath} ){
      $filepath = $self->pre_process_file($filepath) if $self->can('pre_process_file');
      $self->log_header('Reading '.$self->vendor." file:\t".$filepath);
      $fh = open_file($filepath);
      my @lines = <$fh>;
      close($fh);

      $self->{'_fasta'} = '';
      #warn "we need to either dump the pid rather than the dbID or dump the fasta in the DB dir";
      #make this use get_fit
      $fasta_file = $ENV{'EFG_DATA'}."/fastas/".$self->experiment->name().'.'.$filename.'.fasta';

      if($self->dump_fasta){
        ###NEW we need to check whether we have a seq field accessible here
        $self->backup_file($fasta_file);
        $f_out = open_file($fasta_file, '>');
      }

      foreach my $line (@lines) {
        #Generic line processing
        #Move these to parse_line?
        $line =~ s/\r*\n//o;
        next if $line =~ /^\#/;
        next if $line =~ /^$/;

        #This has now been simplified to process_line method
        #my @fields = split/\t/o, $line;
        #start building parameters hash
        #foreach my $field_index(@{$self->get_field_indices}){
        #  my $field = $self->get_field_by_index($field_index);
        #  $feature_params = ($field =~ /^-/) ? $fields[$field_index] : $self->$field($fields[$field_index]);
        #}

        #We also need to enable different parse line methods if we have different files
        #e.g. cisRED
        #Code refs?
        $self->parse_line($line);
      }

      #Now we need to deal with anything left in the read cache
      $self->process_params;

      #This may get a little large, probably need to print periodically. Use $.?
      if ($self->dump_fasta()){
        print $f_out $self->{'_fasta'};
        close($f_out);
      }

      $self->log('Finished importing '.$self->counts->{'features'}.' '.
                 $fset->name." features from:\t$filepath");
      #warn "Need to handle other counts in caller here?";
      $self->log("Counts:\n".Data::Dumper::Dumper($self->{'_counts'}));
      #foreach my $key (%{$self->counts}){
      #  $self->log("Count $key:\t".$self->counts->{$key}."\n");
      #}

      my $sub_set = $eset->get_subset_by_name($filename);
      $sub_set->adaptor->store_status('IMPORTED', $sub_set);
    }
  }

  #Here we should set IMPORTED on the FeatureSet.
  #We could also log the first dbID of each feature in a subset to facilitate subset rollback
  #in the feature table; this would be sketchy at best:
  #delete from annotated_feature where annotated_feature_id >= $first_subset_feature_id and feature_set_id=$feature_set_id

  #This may already have IMPORTED status as we don't revoke the status whilst
  #updating to protect the feature set due to lack of supporting set tracking;
  #see Helper::define_and_validate_sets for more notes.
  #Is there any point in setting it if we don't revoke it?
  #To allow consistent status handling across sets. Just need to be aware of the fset status caveat.

  if(! $fset->has_status('IMPORTED')){
    $fset->adaptor->store_status('IMPORTED', $fset);
  }

  $self->log("No new data, skipping result parse") if ! grep /1/, values %{$new_data};
  $self->log("Finished parsing and importing results");

  return;
}
set_config
sub set_config {
  my $self = shift;

  throw('Must provide an ExperimentalSet name for a '.uc($self->vendor).' import') if ! defined $self->experimental_set_name();

  #Mandatory checks
  if(! defined $self->feature_analysis){
    throw('Must define a -feature_analysis parameter for '.uc($self->vendor).' imports');
  }

  #We need to undef norm method as it has been set to the env var
  $self->{'config'}{'norm_method'} = undef;

  #dirs are not set in config to enable generic get_dir method access

  #some convenience methods
  $self->{'annotated_feature_adaptor'} = $self->db->get_AnnotatedFeatureAdaptor;
  $self->{'dbentry_adaptor'}           = $self->db->get_DBEntryAdaptor;
  $self->{'experimental_set_adaptor'}  = $self->db->get_ExperimentalSetAdaptor();

  return;
}
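A subclass typically layers its own format-specific settings on top of this method; a hedged sketch (in a hypothetical subclass package, with an example separator value):

  # Hypothetical subclass override
  sub set_config {
    my $self = shift;

    $self->SUPER::set_config;            # mandatory checks and adaptor caching (this method)
    $self->set_feature_separator(',');   # format-specific multi-value field separator (example)
    return;
  }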
set_feature_separator
sub set_feature_separator {
  my ($self, $separator) = @_;

  #How do we test if something undefined was passed?
  #Rather than nothing passed at all?
  #Can't do this as this is the accessor
  #Need to split method

  throw('Must provide a valid feature separator') if ( (! defined $separator) || ($separator eq '') );

  $self->{'_feature_separator'} = $separator;
}
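Illustrative use of the separator once it has been set; the field content is an invented example and feature_separator is the getter defined above:

  # Hypothetical splitting of a multi-value column on the configured separator
  $self->set_feature_separator(',');

  my $sep    = $self->feature_separator;
  my $field  = 'BRCA2,TP53';                  # invented multi-value field
  my @values = split /\Q$sep\E/, $field;      # ('BRCA2', 'TP53')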
validate_files
sub validate_files {
  my $self = shift;


  #Get file
  if (! @{$self->result_files()}) {
    my $list = "ls ".$self->get_dir('input').'/'.$self->name().'*.'.lc($self->vendor);#could use vendor here?
    my @rfiles = `$list`;
    $self->result_files(\@rfiles);
  }

  if (scalar(@{$self->result_files()}) >1) {
    warn('Found more than one '.$self->vendor." file:\n".
         join("\n", @{$self->result_files()})."\nThe Simple parser does not yet handle replicates.".
         " We need to resolve how we are going to handle replicates with random cluster IDs");
    #do we even need to?
  }

  #Here we are tracking the import of individual files by adding them as ExperimentalSubSets.
  #Recovery would never know what to delete,
  #so would need to delete all, hence no point in setting status?
  #We do not rollback IMPORTED data here. This is done via separate scripts
  #to reduce the risk of accidentally deleting/overwriting data by leaving a stray -rollback
  #flag in the run script

  ### VALIDATE FILES ###
  #We need to validate all the files first, so the import doesn't fall over half way through,
  #or if we come across a rollback halfway through
  my (%new_data);
  my $recover_unimported = 0;
  my ($eset) = @{$self->data_set->get_supporting_sets};

  foreach my $filepath( @{$self->result_files} ) {
    chomp $filepath;
    my $filename;
    ($filename = $filepath) =~ s/.*\///;
    my $sub_set;
    $self->log('Found '.$self->vendor." file\t$filename");

    if( $sub_set = $eset->get_subset_by_name($filename) ){

      if($recover_unimported){
        $new_data{$filepath} = 1;
        next;
      }

      if( $sub_set->has_status('IMPORTED') ){
        $new_data{$filepath} = 0;
        #if(! $self->rollback){
        $self->log("ExperimentalSubset(${filename}) has already been imported");
        #}
        #else{
        #  remove IMPORTED status and rollback, set recover_unimported?
        #}
      }
      else{
        $self->log("Found partially imported ExperimentalSubset(${filename})");
        $recover_unimported = 1;
        $new_data{$filepath} = 1;

        if ( $self->recovery && $recover_unimported ) {
          $self->log("Rolling back results for ExperimentalSubset:\t".$filename);
          warn "Cannot yet rollback for just an ExperimentalSubset, rolling back entire set\n";
          warn "WARNING:: This may be deleting previously imported data which you are not re-importing..list?!!!\n";
          $self->rollback_FeatureSet($self->data_set->product_FeatureSet);
          $self->rollback_ExperimentalSet($eset);
          last;
        }
        elsif( $recover_unimported ){
          throw("Found partially imported ExperimentalSubSet:\t$filepath\n".
                "You must specify -recover to perform a full roll back for this ExperimentalSet:\t".$eset->name);
        }
      }
    }
    else{
      $self->log("Found new ExperimentalSubset(${filename})");
      $new_data{$filepath} = 1;
      $sub_set = $eset->add_new_subset($filename);
      $self->experimental_set_adaptor->store_ExperimentalSubsets([$sub_set]);
    }
  }

  #Set all as new if we have rolled back due to a recovery.
  if ($recover_unimported){

    foreach my $esset(@{$eset->get_subsets}){
      #map $new_data{$_} = 1, keys %new_data if $recover_unimported;
      $new_data{$esset->name} = 1;
      $eset->adaptor->revoke_states($esset);
    }
  }

  return (\%new_data);
}
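For context, the hashref returned here is keyed on file path with a true value for files that still need (re)importing; a fragment mirroring how read_and_import_data above consumes it:

  # Illustrative consumption of the validate_files return value
  my ($new_data) = $self->validate_files;

  foreach my $filepath (@{$self->result_files}) {
    next if ! $new_data->{$filepath};   # 0 => ExperimentalSubset already IMPORTED, skip
    # ...read and parse $filepath...
  }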
General documentation
AUTHOR
This module was created by Nathan Johnson.
CONTACT
Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk