Raw content of Bio::EnsEMBL::Funcgen::Parsers::vista
package Bio::EnsEMBL::Funcgen::Parsers::vista;
use strict;
# Parse data from LBL enhancers, see http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1
# e.g.
#
# >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12]
# AACTGAAGGGACCCCGTTAGCATAtaaacaaaaggtggggggtagccccgagcctcttct
# ctgacagccagtggcggcagtgatgaatttgtgaagttatctaattttccactgttttaa
# ttagagacttgggctctgaggcctcgcagctggcttctttgtgctgtattctgttgcctg
# acagag
use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my $self = $class->SUPER::new(@_);
#Set default feature_type and feature_set config
$self->{'feature_types'} = {(
'VISTA Target' => {
name => 'VISTA Target',
class => 'Region',
description => 'VISTA target region',
},
'VISTA Enhancer' => {
name => 'VISTA Enhancer',
class => 'Enhancer',
description => 'Enhancer identified by positive VISTA assay',
},
'VISTA Target - Negative' => {
name => 'VISTA Target - Negative',
class => 'Region',
description => 'Enhancer negative region identified by VISTA assay',
},
)};
$self->{feature_sets} = {
'VISTA enhancer set' => {
feature_type => \$self->{'feature_types'}{'VISTA Target'},
display_label => 'VISTA Enhancers',
analysis =>
{
-logic_name => 'VISTA',
-description => 'VISTA Enhancer Assay (http://enhancer.lbl.gov/)',
-display_label => 'VISTA',
-displayable => 1,
},
},
};
$self->validate_and_store_feature_types;
$self->set_feature_sets;
return $self;
}
# Parse file and return hashref containing:
#
# - arrayref of features
# - arrayref of factors
sub parse_and_load{
my ($self, $file, $old_assembly, $new_assembly) = @_;
my %result;
#we want to set up the new external_feature set here or in the caller?
$self->log_header("Parsing and loading LBNL VISTA enhancer data from:\t$file");
#my $feature_internal_id = ($self->find_max_id("external_feature")) + 1;
#This is now feature_type
#but we don't want to just import a flat file for this, as we'd have to remove all the previous
#entries. Either check each one, either via the API or by dumping to file and checking the sorted file?
#my $highest_factor_id = ($self->find_max_id("feature_type")) + 1;
my $extfeat_adaptor = $self->db->get_ExternalFeatureAdaptor;
my @features;
my %feature_objects;
my %anal;
# this object is only used for projection
my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'EnhancerProjection');
my $feature_positive = $self->{'feature_types'}{'VISTA Enhancer'};
my $feature_negative = $self->{'feature_types'}{'VISTA Target - Negative'};
#we should separate this into rna_feature_type for cisRED?
#The adaptor should then conditionally look up the rna_feature_type table instead of the feature_type table
# read file
open (FILE, "<$file") || die "Can't open $file";
my $cnt = 0;
my $skipped = 0;
while () {
next if ($_ !~ /^>/o); # only read headers
my %feature;
# >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12]
my ($coords, $element, $posneg, @stuff) = split /\s+\|\s+/o;
# parse co-ordinates
my ($chr, $start, $end) = $coords =~ /chr([^:]+):(\d+)-(\d+)/o;
# parse element name
my ($element_number) = $element =~ /\s*element\s*(\d+)/o;
# ----------------------------------------
# Feature name
#$feature{NAME} = "LBNL-$element_number";
# ----------------------------------------
# Analysis
#$feature{ANALYSIS_ID} = $posneg eq 'positive' ? $analysis_positive : $analysis_negative;
#Feature type
#$feature{FEATURE_TYPE_ID} = $posneg eq 'positive' ? $feature_positive_id : $feature_negative_id;
# ----------------------------------------
# Seq_region ID and co-ordinates
my $chr_slice;
if ($old_assembly) {
$chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, undef, $old_assembly);
} else {
$chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr);
}
if (!$chr_slice) {
warn "Can't get slice for chromosme $chr\n";
next;
}
my $seq_region_id = $chr_slice->get_seq_region_id;
throw("Can't get seq_region_id for chromosome $chr") if (!$seq_region_id);
#$feature{SEQ_REGION_ID} = $seq_region_id;
# Assume these are all on the positive strand? Is this correct?
my $strand = 1;
#build feature here, but don't store?
my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
(
#-dbID => $feature_internal_id++,
#-analysis => $self->analysis,
-start => $start,#is this in UCSC coords?
-end => $end, #is this in UCSC coords?
-strand => $strand,
-feature_type => $posneg eq 'positive' ? $feature_positive : $feature_negative,
-slice => $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, $strand, $old_assembly),
-display_label => "LBNL-$element_number",
-feature_set => $self->{'feature_sets'}{'VISTA enhancer set'},
);
# project if necessary
if ($new_assembly) {
$feature = $self->project_feature($feature, $dummy_analysis, $new_assembly);
if(! defined $feature){
$skipped ++;
next;
}
}
$cnt ++;
$extfeat_adaptor->store($feature);
}
close FILE;
#$result{FEATURES} = \@features;
$self->log('Parsed '.($cnt+$skipped).' features');
$self->log("Loaded $cnt features");
$self->log("Skipped $skipped features");
return;
}
1;