Raw content of Bio::EnsEMBL::Funcgen::Parsers::redfly
package Bio::EnsEMBL::Funcgen::Parsers::redfly;
use strict;
use File::Basename;
# To get files for REDfly, download the following 2 GFF3 files (e.g. via wget):
#
# http://redfly.ccr.buffalo.edu/datadumps/tbfs_dump.gff
# http://redfly.ccr.buffalo.edu/datadumps/crm_dump.gff
#contact?
#TFBS
#2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp";
#2L REDfly regulatory_region 2456352 2456369 . . . ID="dl_dpp:REDFLY:TF000069"; Dbxref="Flybase:FBgn0000490", "PMID:8458580", "REDfly:645, "FlyBase:FBgn0000463"; Evidence="footprint/binding assay"; Factor="dl"; Target="dpp";
#CRMs
#2L REDfly regulatory_region 2455781 2457764 . . . ID="dpp_intron2"; Dbxref="Flybase:FBgn0000490", "PMID:8167377", "REDfly:247; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005304";
#2L REDfly regulatory_region 2445769 2446581 . . . ID="dpp_dpp813"; Dbxref="Flybase:FBgn0000490", "PMID:7821226", "REDfly:246; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005653","FBbt:00001051";
use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
use Bio::EnsEMBL::DBEntry;
use Bio::EnsEMBL::Funcgen::ExternalFeature;
use Bio::EnsEMBL::Utils::Exception qw( throw );
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my $self = $class->SUPER::new(@_);
#Set default feature_type and feature_set config
#We need to capture version/release/data of external feature sets.
#This can be nested in the description? Need to add description to feature_set?
$self->{'feature_types'} = {
'REDfly TFBS' => {
name => 'REDfly TFBS',
class => 'Transcription Factor',
description => 'REDfly transciption factor binding site',
},
'REDfly CRM' => {
name => 'REDfly CRM',
class => 'Regulatory Motif',
description => 'REDfly cis regulatory motif',
},
};
$self->{feature_sets} = {
'REDfly TFBSs' => {
feature_type => \$self->{'feature_types'}{'REDfly TFBS'},
#display_label => 'REDfly TFBSs',#defaults to name
analysis =>
{
-logic_name => 'REDfly TFBS',
-description => 'REDfly transcription factor binding sites (http://redfly.ccr.buffalo.edu/)',
-display_label => 'REDfly TFBS',
-displayable => 1,
},
xrefs => 1,
},
'REDfly CRMs' => {
feature_type => \$self->{'feature_types'}{'REDfly CRM'},
analysis =>
{
-logic_name => 'REDfly CRM',
-description => 'REDfly cis regulatory motif (http://redfly.ccr.buffalo.edu/)',
-display_label => 'REDfly CRM',
-displayable => 1,
},
xrefs => 1,
},
};
#Move xref flag here?
$self->{config} = {
'REDfly CRMs' => {
file => $ENV{'EFG_DATA'}.'/input/REDFLY/crm_dump.gff',
gff_attrs => {
'ID' => 1,
},
},
'REDfly TFBSs' => {
file => $ENV{'EFG_DATA'}.'/input/REDFLY/tbfs_dump.gff',
gff_attrs => {
'ID' => 1,
'Factor' => 1,
'Target' => 1,
},
desc_suffix => ' binding site',
}
};
#Default feature_set names
if(! defined $self->import_sets){
@{$self->{'import_sets'}} = keys %{$self->{'feature_sets'}};
}
else{#validate
foreach my $import_fset(@{$self->import_sets}){
if(! exists $self->{'feature_sets'}{$import_fset}){
throw("$import_fset is not a valid import feature set. Maybe you need to add this to the config in:\t".ref($self));
}
}
}
#Need to change this so we can just (re)load the one set.
#Change this so we only call it from parse_and_load?
#Should we validate all first, so we fail at the earliest possible moment?
#Or serially?
$self->validate_and_store_feature_types;
$self->set_feature_sets;
return $self;
}
# Parse file and return hashref containing:
#
# - arrayref of features
# - arrayref of factors
sub parse_and_load {
my $self = shift;
my ($fset_name, $old_assembly, $new_assembly, $file) = rearrange(['FEATURE_SET', 'OLD_ASSEMBLY', 'NEW_ASSEMBLY', 'FILE'], @_);
warn "params not yet fully implemented, loading defaults import sets";
#if(! defined $fset_name && defined $file){
# throw("Cannot specify a file to parse if no -feature_set parameter provided");
# }
#Use default file path? Make importer inherit from this?
#init_external import
#just do for each in import_sets here for now?
my $analysis_adaptor = $self->db->get_AnalysisAdaptor();
my %slice_cache;
my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor;
my $dbentry_adaptor = $self->db->get_DBEntryAdaptor;
my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor;
# this object is only used for projection
my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'REDflyProjection');#do we need this?
my $species = $self->db->species;
if(! $species){
throw('Must define a species to define the external_db');
}
#Just to make sure we hav homo_sapiens and not Homo Sapiens
($species = lc($species)) =~ s/ /_/;
foreach my $import_set(@{$self->import_sets}){
$self->log_header("Parsing $import_set data");
my %factor_cache; # name -> factor_id
my %target_cache;
my $config = $self->{'config'}{$import_set};
my $fset = $self->{'feature_sets'}{$import_set};
my %gff_attrs = %{$config->{'gff_attrs'}};
# Parse motifs.txt file
my $file = $config->{'file'};
my $skipped = 0;
my $factor_cnt = 0;
my $factor_xref_cnt = 0;
my $feature_cnt = 0;
my $feature_target_cnt = 0;
open (FILE, "<$file") || die "Can't open $file";
; # skip header
LINE: while (my $line = ) {
next if ($line =~ /^\s*\#/o || $line =~ /^\s*$/o);
chomp $line;
my %attr_cache;#Can we move this outside the loop and rely on it being reset each time?
#GFF3
#Is this format valid, missing " after REDfly xref
#2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp";
#seq_name, source, feature, start, end, score, strand, frame, [attrs]
my ($chromosome, undef, $feature, $start, $end, undef, undef, undef, $attrs) = split /\t/o, $line;
my @attrs = split/\;\s+/o, $attrs;
#UCSC coords
$start ++;
$end ++;
foreach my $gff_attr(keys %gff_attrs){
if(($attr_cache{$gff_attr}) = grep {/^${gff_attr}\=/} @attrs){
$attr_cache{$gff_attr} =~ s/(^${gff_attr}\=\")(.*)(\")/$2/;
#warn "attr cache is $attr_cache{$gff_attr} ";
}
else{
warn "Skipping import, unable to find mandatory $gff_attr attribute in:\t$line";
next LINE;
}
}
#For TFBS
#Factor = coding gene name display_label
#Target = Target gene?
#Ignore other xrefs for name, just put ID in feature as display_label
#These are mixed up! and where not getting any coding xrefs!
#For CRM
#Can we split the ID and have Reguatory XREF?
#e.g. ID="dpp_dpp813"; => dpp
#This can be moved to the BaseExternalParser
if(! exists $slice_cache{$chromosome}){
if($old_assembly){
$slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome',
$chromosome,
undef,
undef,
undef,
$old_assembly);
}else{
$slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
}
}
if(! defined $slice_cache{$chromosome}){
warn "Can't get slice $chromosome for motif $attr_cache{'ID'};\n";
$skipped++;
next;
}
#get feature_type first
#we are not maintaining this link in the DB!
#Do we need another xref for this or a different table?
my $feature_type;
#TFBSs
if(exists $attr_cache{'Factor'}){
if(! exists $factor_cache{$attr_cache{'Factor'}}){
$factor_cache{$attr_cache{'Factor'}} = $ftype_adaptor->fetch_by_name($attr_cache{'Factor'});
if(! defined $factor_cache{$attr_cache{'Factor'}}){
#Would need to add CODING DBEntry here!
#Will this work on a scalar ref to a hash?
my $desc = (exists $config->{'desc_suffix'}) ? $attr_cache{'Factor'}.$config->{'desc_suffix'} : undef;
($factor_cache{$attr_cache{'Factor'}}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
(
-name => $attr_cache{'Factor'},
-class => $fset->feature_type->class,
-description => $desc,
))};
$feature_type = $factor_cache{$attr_cache{'Factor'}};
$factor_cnt ++;
my $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $attr_cache{'Factor'});
#Handle release/version in xref version as stable_id version?
if(! defined $stable_id){
warn "Could not generate CODING xref for feature_type:\t". $attr_cache{'Factor'};
}else{
#warn "got $stable_id for ".$attr_cache{'Factor'};
my $dbentry = Bio::EnsEMBL::DBEntry->new(
-dbname => $species.'_core_Gene',
#-release => $self->db->dnadb->dbc->dbname,
-status => 'KNOWNXREF',#This is for the external DB
#-display_label_linkable => 1,
-#db_display_name => $self->db->dnadb->dbc->dbname,
-db_display_name => 'EnsemblGene',
-type => 'MISC',#Is for the external_db
-primary_id => $stable_id,
-display_id => $attr_cache{'Factor'},
-info_type => 'MISC',
-into_text => 'GENE',
-linkage_annotation => 'REDfly Coding'
#-description => 'cisRED motif gene xref',#This is now generic and no longer resitricted to REDfly
#could have version here if we use the correct dnadb to build the cache
);
$dbentry_adaptor->store($dbentry, $factor_cache{$attr_cache{'Factor'}}->dbID, 'FeatureType', 1);#1 is ignore release flag
$factor_xref_cnt ++;
}
}
}
}
else{
#CRMs
$feature_type = $fset->feature_type;
}
#Now build actual feature
$feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
(
-display_label => $attr_cache{'ID'},
-start => $start,
-end => $end,
-strand => 0,
-feature_type => $feature_type,
-feature_set => $fset,
-slice => $slice_cache{$chromosome},
);
# project if necessary
if ($new_assembly) {
$feature = $self->project_feature($feature, $new_assembly);
if(! defined $feature){
$skipped ++;
next;
}
}
($feature) = @{$extf_adaptor->store($feature)};
$feature_cnt++;
my $target = (exists $attr_cache{'Target'}) ? $attr_cache{'Target'} : (split/_/, $attr_cache{'ID'})[0];
my $stable_id;
if($target ne 'Unspecified'){
$stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $target);
}
if(! defined $stable_id){
warn "Could not generate TARGET xref for feature:\t". $attr_cache{'ID'} if $target ne 'Unspecified';
}
else{
#Handle release/version in xref version as stable_id version?
my $dbentry = Bio::EnsEMBL::DBEntry->new(
-dbname => $species.'_core_Gene',
#-release => $self->db->dnadb->dbc->dbname,
-status => 'KNOWNXREF',
#-display_label_linkable => 1,
-#db_display_name => $self->db->dnadb->dbc->dbname,
-db_display_name => 'EnsemblGene',
-type => 'MISC',#
-primary_id => $stable_id,
-display_id => $target,
-info_type => 'MISC',
-info_text => 'GENE',
-linkage_annotation => $fset->feature_type->name.' Target',
#could have version here if we use the correct dnadb to build the cache
);
$dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag
$feature_target_cnt ++;
}
}
close FILE;
$self->log("Loaded ".$fset->name);
$self->log("$factor_cnt feature types");
$self->log("$factor_xref_cnt feature type coding xrefs");
$self->log("$feature_cnt features");
$self->log("$feature_target_cnt feature target xrefs");
$self->log("Skipped $skipped features");
}
return;
}
1;