Bio::EnsEMBL::Funcgen::Parsers Bed
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Funcgen::Parsers::Bed
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Funcgen::AnnotatedFeature
Bio::EnsEMBL::Funcgen::ExperimentalSet
Bio::EnsEMBL::Funcgen::FeatureSet
Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw ( species_chr_num open_file )
Bio::EnsEMBL::Funcgen::Utils::Helper
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( throw warning deprecate )
Data::Dumper
strict
Inherit
Bio::EnsEMBL::Funcgen::Utils::Helper
Synopsis
  my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Bed";
push @INC, $parser_type;
my $imp = $class->SUPER::new(@_);
Description
This is a definitions class which should not be instatiated directly, it
normally set by the Importer as the parent class. Bed contains meta
data and methods specific to data in bed format, to aid
parsing and importing of experimental data.
Methods
newDescriptionCode
read_and_import_bed_data
No description
Code
set_configDescriptionCode
Methods description
newcode    nextTop
  Example    : my $self = $class->SUPER::new(@_);
Description: Constructor method for Bed class
Returntype : Bio::EnsEMBL::Funcgen::Parsers::Bed
Exceptions : throws if Experiment name not defined or if caller is not Importer
Caller : Bio::EnsEMBL::Funcgen::Importer
Status : at risk
set_configcodeprevnextTop
  Example    : my $self->set_config;
Description: Sets attribute dependent config
Returntype : None
Exceptions : None
Caller : Bio::EnsEMBL::Funcgen::Importer
Status : at risk
Methods code
newdescriptionprevnextTop
sub new {
  my $caller = shift;
  
  my $class = ref($caller) || $caller;
  my $self  = $class->SUPER::new();
  
  throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") 
	if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
  
  $self->{'config'} =  
	{(
      #order of these method arrays is important!
#remove method arrays and execute serially?
array_data => [],#['experiment'],
probe_data => [],#["probe"],
results_data => ["and_import_bed"], norm_method => undef, #Need to make these definable?
#have protocolfile arg and just parse tab2mage protocol section format
#SEE NIMBLEGEN FOR EXAMPLE
protocols => {()}, )}; return $self;
}
read_and_import_bed_datadescriptionprevnextTop
sub read_and_import_bed_data {
    my $self = shift;
    
    $self->log("Reading and importing ".$self->vendor()." data");
    my (@header, @data, @design_ids, @lines);
    my ($anal, $fh, $file);
    
    my $eset_adaptor = $self->db->get_ExperimentalSetAdaptor();
    my $af_adaptor = $self->db->get_AnnotatedFeatureAdaptor();
    my $fset_adaptor = $self->db->get_FeatureSetAdaptor();
    my $dset_adaptor = $self->db->get_DataSetAdaptor();
	my $new_data = 0;
	my $eset = $eset_adaptor->fetch_by_name($self->experimental_set_name());
    
    if(! defined $eset){
        $eset = Bio::EnsEMBL::Funcgen::ExperimentalSet->new(
                                                            -name         => $self->experimental_set_name(),
                                                            -experiment   => $self->experiment(),
                                                            -feature_type => $self->feature_type(),
                                                            -cell_type    => $self->cell_type(),
                                                            -vendor       => $self->vendor(),
                                                            -format       => $self->format(),
															-analysis     => $self->feature_analysis,
                                                            );
        ($eset)  = @{$eset_adaptor->store($eset)};
    }

    #we need a way to define replicates on a file basis when we have no meta file!
#can we make this generic for application to array imports?
#currently we have to do a separate import for each replicate, specifying the result files each time
#we need to add a experimental_set_name option
#Actually ExperimentalSet is a little redundant as we can do the roll back which is exactly what this is designed to facilitate
#It does however allow incremental addition of new subsets
#Now define FeatureSet
#shouldn't we be using the exp name?
my $fset = $fset_adaptor->fetch_by_name($self->experiment->name()); if(! defined $fset){ #currently hardcoded, but we should probably add feature_analysis_name
$fset = Bio::EnsEMBL::Funcgen::FeatureSet->new( -name => $self->experiment->name(), -feature_type => $self->feature_type(), -cell_type => $self->cell_type(), -type => 'annotated', -analysis => $self->feature_analysis, ); ($fset) = @{$fset_adaptor->store($fset)}; } #Define DataSet
my $dset = $dset_adaptor->fetch_by_name($self->experiment->name()); if(! defined $dset){ $dset = Bio::EnsEMBL::Funcgen::DataSet->new( -name => $self->experiment->name(), -supporting_sets => [$eset], -feature_set => $fset, -displayable => 1, -supporting_set_type => 'experimental', ); ($dset) = @{$dset_adaptor->store($dset)}; } #Get file
if (! @{$self->result_files()}) { my $list = "ls ".$self->input_dir().'/'.$self->name().'*.bed'; my @rfiles = `$list`; $self->result_files(\@rfiles); } if (scalar(@{$self->result_files()}) >1) { warn("Found more than one bed file:\n". join("\n", @{$self->result_files()})."\nBed does not yet handle replicates.". " We need to resolve how we are going handle replicates with random cluster IDs"); #do we even need to?
} #Here were are tracking the import of individual bed files by adding them as ExperimentalSubSets
#Recovery would never know what to delete
#So would need to delete all, Hence no point in setting status?
### VALIDATE FILES ###
#We need validate all the files first, so the import doesn't fall over half way through
#Or if we come across a rollback halfway through
my %new_data; my $roll_back = 0; foreach my $filepath(@{$self->result_files()}) { chomp $filepath; my $filename; ($filename = $filepath) =~ s/.*\///; my $sub_set; $self->log("Found bed file\t$filename"); if($sub_set = $eset->get_subset_by_name($filename)){ if ($sub_set->has_status('IMPORTED')){ $self->log("ExperimentalSubset(${filename}) has already been imported"); } else { $self->log("Found partially imported ExperimentalSubset(${filename})"); $roll_back = 1; if ($self->recovery() && $roll_back) { $self->log("Rolling back results for ExperimentalSubset:\t".$filename); warn "Cannot yet rollback for just an ExperimentalSubset, rolling back entire set\n"; $self->rollback_FeatureSet($fset); $self->rollback_ExperimentalSet($eset); last; } elsif($roll_back){ throw("Found partially imported ExperimentalSubSet:\t$filepath\n". "You must specify -recover to perform a full roll back for this ExperimentalSet:\t".$eset->name); } } } else{ $self->log("Found new ExperimentalSubset(${filename})"); $new_data{$filepath} = undef; $sub_set = $eset->add_new_subset($filename); $eset_adaptor->store_ExperimentalSubsets([$sub_set]); } } ### READ AND IMPORT FILES ###
foreach my $filepath(@{$self->result_files()}) { chomp $filepath; my $filename; my $count = 0; ($filename = $filepath) =~ s/.*\///; if($roll_back || exists $new_data{$filepath}){ $self->log("Reading bed file:\t".$filename); my $fh = open_file($filepath); my @lines = <$fh>; close($fh); my ($line, $f_out); my $fasta = ''; #warn "we need to either dump the pid rather than the dbID or dump the fasta in the DB dir";
my $fasta_file = $ENV{'EFG_DATA'}."/fastas/".$self->experiment->name().'.'.$filename.'.fasta'; if($self->dump_fasta()){ $self->backup_file($fasta_file); $f_out = open_file($fasta_file, '>'); } foreach my $line (@lines) { $line =~ s/\r*\n//o; next if $line =~ /^\#/; next if $line =~ /^$/; next unless $line =~ /^chr/i; #next if $line =~ /^chr/i;#Mikkelson hack
my ($chr, $start, $end, $pid, $score) = split/\t/o, $line; #my ($chr, $start, $end, $score) = split/\t/o, $line;#Mikkelson hack
if($self->ucsc_coords){ $start +=1; $end +=1; } if(! $self->cache_slice($chr)){ warn "Skipping AnnotatedFeature import, cound non standard chromosome: $chr"; } else{ #this is throwing away the encode region which could be used for the probeset/family?
my $feature = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new ( -START => $start, -END => $end, -STRAND => 1, -SLICE => $self->cache_slice($chr), -ANALYSIS => $fset->analysis, -SCORE => $score, -DISPLAY_LABEL => $pid, -FEATURE_SET => $fset, ); $af_adaptor->store($feature); $count++; #dump fasta here
if ($self->dump_fasta()){ $fasta .= '>'.$pid."\n".$self->cache_slice($chr)->sub_Slice($start, $end, 1)->seq()."\n"; } } } if ($self->dump_fasta()){ print $f_out $fasta; close($f_out); } $self->log("Finished importing $count features:\t$filepath"); my $sub_set = $eset->get_subset_by_name($filename); $sub_set->adaptor->store_status('IMPORTED', $sub_set); } } $self->log("No new data, skipping result parse") if ! keys %new_data && ! $roll_back; $self->log("Finished parsing and importing results"); #Retrieve DataSet here as sanity check it is valid
#We had one which had a mismatched feature_type between the feature_set and the experimental_set
#How was this possible, surely this should have failed on generation/storage?
return; } 1;
}
set_configdescriptionprevnextTop
sub set_config {
  my $self = shift;

  throw('Must provide an ExperimentalSet name for a Bed import') if ! defined $self->experimental_set_name();

  #Mandatory checks
if(! defined $self->feature_analysis){ throw('Must define a -feature_analysis parameter for Bed imports'); } #We need to undef norm emthod as it has been set to the env var
$self->{'norm_method'} = undef; #dir are not set in config to enable generic get_dir method access
return;
}
General documentation
AUTHORTop
This module was created by Stefan Graf.
CONTACTTop
Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk