Raw content of Bio::EnsEMBL::Funcgen::Parsers::redfly package Bio::EnsEMBL::Funcgen::Parsers::redfly; use strict; use File::Basename; # To get files for REDfly, download the following 2 GFF3 files (e.g. via wget): # # http://redfly.ccr.buffalo.edu/datadumps/tbfs_dump.gff # http://redfly.ccr.buffalo.edu/datadumps/crm_dump.gff #contact? #TFBS #2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp"; #2L REDfly regulatory_region 2456352 2456369 . . . ID="dl_dpp:REDFLY:TF000069"; Dbxref="Flybase:FBgn0000490", "PMID:8458580", "REDfly:645, "FlyBase:FBgn0000463"; Evidence="footprint/binding assay"; Factor="dl"; Target="dpp"; #CRMs #2L REDfly regulatory_region 2455781 2457764 . . . ID="dpp_intron2"; Dbxref="Flybase:FBgn0000490", "PMID:8167377", "REDfly:247; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005304"; #2L REDfly regulatory_region 2445769 2446581 . . . ID="dpp_dpp813"; Dbxref="Flybase:FBgn0000490", "PMID:7821226", "REDfly:246; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005653","FBbt:00001051"; use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; use Bio::EnsEMBL::DBEntry; use Bio::EnsEMBL::Funcgen::ExternalFeature; use Bio::EnsEMBL::Utils::Exception qw( throw ); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); sub new { my $caller = shift; my $class = ref($caller) || $caller; my $self = $class->SUPER::new(@_); #Set default feature_type and feature_set config #We need to capture version/release/data of external feature sets. #This can be nested in the description? Need to add description to feature_set? $self->{'feature_types'} = { 'REDfly TFBS' => { name => 'REDfly TFBS', class => 'Transcription Factor', description => 'REDfly transciption factor binding site', }, 'REDfly CRM' => { name => 'REDfly CRM', class => 'Regulatory Motif', description => 'REDfly cis regulatory motif', }, }; $self->{feature_sets} = { 'REDfly TFBSs' => { feature_type => \$self->{'feature_types'}{'REDfly TFBS'}, #display_label => 'REDfly TFBSs',#defaults to name analysis => { -logic_name => 'REDfly TFBS', -description => 'REDfly transcription factor binding sites (http://redfly.ccr.buffalo.edu/)', -display_label => 'REDfly TFBS', -displayable => 1, }, xrefs => 1, }, 'REDfly CRMs' => { feature_type => \$self->{'feature_types'}{'REDfly CRM'}, analysis => { -logic_name => 'REDfly CRM', -description => 'REDfly cis regulatory motif (http://redfly.ccr.buffalo.edu/)', -display_label => 'REDfly CRM', -displayable => 1, }, xrefs => 1, }, }; #Move xref flag here? $self->{config} = { 'REDfly CRMs' => { file => $ENV{'EFG_DATA'}.'/input/REDFLY/crm_dump.gff', gff_attrs => { 'ID' => 1, }, }, 'REDfly TFBSs' => { file => $ENV{'EFG_DATA'}.'/input/REDFLY/tbfs_dump.gff', gff_attrs => { 'ID' => 1, 'Factor' => 1, 'Target' => 1, }, desc_suffix => ' binding site', } }; #Default feature_set names if(! defined $self->import_sets){ @{$self->{'import_sets'}} = keys %{$self->{'feature_sets'}}; } else{#validate foreach my $import_fset(@{$self->import_sets}){ if(! exists $self->{'feature_sets'}{$import_fset}){ throw("$import_fset is not a valid import feature set. Maybe you need to add this to the config in:\t".ref($self)); } } } #Need to change this so we can just (re)load the one set. #Change this so we only call it from parse_and_load? #Should we validate all first, so we fail at the earliest possible moment? #Or serially? $self->validate_and_store_feature_types; $self->set_feature_sets; return $self; } # Parse file and return hashref containing: # # - arrayref of features # - arrayref of factors sub parse_and_load { my $self = shift; my ($fset_name, $old_assembly, $new_assembly, $file) = rearrange(['FEATURE_SET', 'OLD_ASSEMBLY', 'NEW_ASSEMBLY', 'FILE'], @_); warn "params not yet fully implemented, loading defaults import sets"; #if(! defined $fset_name && defined $file){ # throw("Cannot specify a file to parse if no -feature_set parameter provided"); # } #Use default file path? Make importer inherit from this? #init_external import #just do for each in import_sets here for now? my $analysis_adaptor = $self->db->get_AnalysisAdaptor(); my %slice_cache; my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; # this object is only used for projection my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'REDflyProjection');#do we need this? my $species = $self->db->species; if(! $species){ throw('Must define a species to define the external_db'); } #Just to make sure we hav homo_sapiens and not Homo Sapiens ($species = lc($species)) =~ s/ /_/; foreach my $import_set(@{$self->import_sets}){ $self->log_header("Parsing $import_set data"); my %factor_cache; # name -> factor_id my %target_cache; my $config = $self->{'config'}{$import_set}; my $fset = $self->{'feature_sets'}{$import_set}; my %gff_attrs = %{$config->{'gff_attrs'}}; # Parse motifs.txt file my $file = $config->{'file'}; my $skipped = 0; my $factor_cnt = 0; my $factor_xref_cnt = 0; my $feature_cnt = 0; my $feature_target_cnt = 0; open (FILE, "<$file") || die "Can't open $file"; <FILE>; # skip header LINE: while (my $line = <FILE>) { next if ($line =~ /^\s*\#/o || $line =~ /^\s*$/o); chomp $line; my %attr_cache;#Can we move this outside the loop and rely on it being reset each time? #GFF3 #Is this format valid, missing " after REDfly xref #2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp"; #seq_name, source, feature, start, end, score, strand, frame, [attrs] my ($chromosome, undef, $feature, $start, $end, undef, undef, undef, $attrs) = split /\t/o, $line; my @attrs = split/\;\s+/o, $attrs; #UCSC coords $start ++; $end ++; foreach my $gff_attr(keys %gff_attrs){ if(($attr_cache{$gff_attr}) = grep {/^${gff_attr}\=/} @attrs){ $attr_cache{$gff_attr} =~ s/(^${gff_attr}\=\")(.*)(\")/$2/; #warn "attr cache is $attr_cache{$gff_attr} "; } else{ warn "Skipping import, unable to find mandatory $gff_attr attribute in:\t$line"; next LINE; } } #For TFBS #Factor = coding gene name display_label #Target = Target gene? #Ignore other xrefs for name, just put ID in feature as display_label #These are mixed up! and where not getting any coding xrefs! #For CRM #Can we split the ID and have Reguatory XREF? #e.g. ID="dpp_dpp813"; => dpp #This can be moved to the BaseExternalParser if(! exists $slice_cache{$chromosome}){ if($old_assembly){ $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome, undef, undef, undef, $old_assembly); }else{ $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); } } if(! defined $slice_cache{$chromosome}){ warn "Can't get slice $chromosome for motif $attr_cache{'ID'};\n"; $skipped++; next; } #get feature_type first #we are not maintaining this link in the DB! #Do we need another xref for this or a different table? my $feature_type; #TFBSs if(exists $attr_cache{'Factor'}){ if(! exists $factor_cache{$attr_cache{'Factor'}}){ $factor_cache{$attr_cache{'Factor'}} = $ftype_adaptor->fetch_by_name($attr_cache{'Factor'}); if(! defined $factor_cache{$attr_cache{'Factor'}}){ #Would need to add CODING DBEntry here! #Will this work on a scalar ref to a hash? my $desc = (exists $config->{'desc_suffix'}) ? $attr_cache{'Factor'}.$config->{'desc_suffix'} : undef; ($factor_cache{$attr_cache{'Factor'}}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new ( -name => $attr_cache{'Factor'}, -class => $fset->feature_type->class, -description => $desc, ))}; $feature_type = $factor_cache{$attr_cache{'Factor'}}; $factor_cnt ++; my $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $attr_cache{'Factor'}); #Handle release/version in xref version as stable_id version? if(! defined $stable_id){ warn "Could not generate CODING xref for feature_type:\t". $attr_cache{'Factor'}; }else{ #warn "got $stable_id for ".$attr_cache{'Factor'}; my $dbentry = Bio::EnsEMBL::DBEntry->new( -dbname => $species.'_core_Gene', #-release => $self->db->dnadb->dbc->dbname, -status => 'KNOWNXREF',#This is for the external DB #-display_label_linkable => 1, -#db_display_name => $self->db->dnadb->dbc->dbname, -db_display_name => 'EnsemblGene', -type => 'MISC',#Is for the external_db -primary_id => $stable_id, -display_id => $attr_cache{'Factor'}, -info_type => 'MISC', -into_text => 'GENE', -linkage_annotation => 'REDfly Coding' #-description => 'cisRED motif gene xref',#This is now generic and no longer resitricted to REDfly #could have version here if we use the correct dnadb to build the cache ); $dbentry_adaptor->store($dbentry, $factor_cache{$attr_cache{'Factor'}}->dbID, 'FeatureType', 1);#1 is ignore release flag $factor_xref_cnt ++; } } } } else{ #CRMs $feature_type = $fset->feature_type; } #Now build actual feature $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new ( -display_label => $attr_cache{'ID'}, -start => $start, -end => $end, -strand => 0, -feature_type => $feature_type, -feature_set => $fset, -slice => $slice_cache{$chromosome}, ); # project if necessary if ($new_assembly) { $feature = $self->project_feature($feature, $new_assembly); if(! defined $feature){ $skipped ++; next; } } ($feature) = @{$extf_adaptor->store($feature)}; $feature_cnt++; my $target = (exists $attr_cache{'Target'}) ? $attr_cache{'Target'} : (split/_/, $attr_cache{'ID'})[0]; my $stable_id; if($target ne 'Unspecified'){ $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $target); } if(! defined $stable_id){ warn "Could not generate TARGET xref for feature:\t". $attr_cache{'ID'} if $target ne 'Unspecified'; } else{ #Handle release/version in xref version as stable_id version? my $dbentry = Bio::EnsEMBL::DBEntry->new( -dbname => $species.'_core_Gene', #-release => $self->db->dnadb->dbc->dbname, -status => 'KNOWNXREF', #-display_label_linkable => 1, -#db_display_name => $self->db->dnadb->dbc->dbname, -db_display_name => 'EnsemblGene', -type => 'MISC',# -primary_id => $stable_id, -display_id => $target, -info_type => 'MISC', -info_text => 'GENE', -linkage_annotation => $fset->feature_type->name.' Target', #could have version here if we use the correct dnadb to build the cache ); $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag $feature_target_cnt ++; } } close FILE; $self->log("Loaded ".$fset->name); $self->log("$factor_cnt feature types"); $self->log("$factor_xref_cnt feature type coding xrefs"); $self->log("$feature_cnt features"); $self->log("$feature_target_cnt feature target xrefs"); $self->log("Skipped $skipped features"); } return; } 1;