None available.
sub parse
{
my ($self, $db_adaptor, $file, $old_assembly, $new_assembly) = @_;
my %result;
print "Parsing $file with miranda parser\n";
my $feature_internal_id = ($self->find_max_id($db_adaptor, "regulatory_feature")) + 1;
my $highest_factor_id = ($self->find_max_id($db_adaptor, "regulatory_factor")) + 1;
my $analysis_adaptor = $db_adaptor->get_AnalysisAdaptor();
my $slice_adaptor = $db_adaptor->get_SliceAdaptor();
my @features;
my @factors;
my %factor_ids_by_name; my %feature_objects;
my %anal;
my $stable_id_to_internal_id = $self->build_stable_id_cache($db_adaptor);
my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisRedProjection');
open (FILE, "<$file") || die "Can't open $file";
while (<FILE>) {
next if ($_ =~ /^\s*\#/ || $_ =~ /^\s*$/);
my %feature;
my ($group, $seq, $method, $feature, $chr, $start, $end, $str, $phase, $score, $pvalue, $type, $id_ignore, $id) = split;
my $strand = ($str =~ /\+/ ? 1 : -1);
$id =~ s/[\"\']//g;
$feature{NAME} = $id .":" . $seq;
$feature{INFLUENCE} = "negative";
my $factor_id = $factor_ids_by_name{$seq};
if (!$factor_id) {
my %factor;
$factor_id = $highest_factor_id + 1;
$factor{INTERNAL_ID} = $factor_id;
$factor{NAME} = $seq;
$factor{TYPE} = $feature; push @factors,\% factor;
$factor_ids_by_name{$seq} = $factor_id;
$highest_factor_id = $factor_id;
}
$feature{FACTOR_ID} = $factor_id;
my $analysis = $anal{$method};
if (!$analysis) {
$analysis = $analysis_adaptor->fetch_by_logic_name($method);
$anal{$method} = $analysis;
}
if (!$analysis) {
print STDERR "Can't get analysis for $method, skipping\n";
next;
}
$feature{ANALYSIS_ID} = $analysis->dbID();
my $chr_slice = $slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, undef, $old_assembly);
if (!$chr_slice) {
print STDERR "Can't get slice for $chr:$start:$end:$strand\n";
next;
}
my $seq_region_id = $slice_adaptor->get_seq_region_id($chr_slice);
if (!$seq_region_id) {
print STDERR "Can't get seq_region_id for chromosome $chr\n";
next;
}
$feature{SEQ_REGION_ID} = $seq_region_id;
if ($new_assembly) {
my $projected_feature = $self->project_feature($start, $end, $strand, $chr, $chr_slice, $dummy_analysis, $new_assembly, $slice_adaptor, $feature{NAME});
$start = $projected_feature->start();
$end = $projected_feature->end();
$strand = $projected_feature->strand();
$feature{SEQ_REGION_ID} = $slice_adaptor->get_seq_region_id($projected_feature->feature_Slice);
}
$feature{START} = $start;
$feature{END} = $end;
$feature{STRAND} = $strand;
my ($ensembl_type) = $type =~ /(gene|transcript|translation)/;
if (!$ensembl_type) {
print STDERR "Can't get ensembl type from $type, skipping\n";
next;
}
my $ensembl_id = $stable_id_to_internal_id->{$ensembl_type}->{$id};
$ensembl_type = ucfirst(lc($ensembl_type));
if (!$ensembl_id) {
print STDERR "Can't get ensembl internal ID for $id, skipping\n";
next;
}
$feature{ENSEMBL_TYPE} = $ensembl_type;
$feature{ENSEMBL_ID} = $ensembl_id;
$feature{INTERNAL_ID} = $feature_internal_id++;
$feature{EVIDENCE} = "";
push @features,\% feature;
}
close FILE;
$result{FEATURES} =\@ features;
$result{FACTORS} =\@ factors;
print "Parsed " . scalar(@{$result{FEATURES}}) . " features and " . scalar(@{$result{FACTORS}}) . " factors\n";
return\% result; } |