None available.
sub parse
{
my ($self, $db_adaptor, $file, $old_assembly, $new_assembly) = @_;
my %result;
my $analysis_adaptor = $db_adaptor->get_AnalysisAdaptor();
my $slice_adaptor = $db_adaptor->get_SliceAdaptor();
my $stable_id_to_internal_id = $self->build_stable_id_cache($db_adaptor);
print "Parsing $file with cisred parser\n";
my $blank_factor_id = $self->get_blank_factor_id($db_adaptor);
my $feature_internal_id = ($self->find_max_id($db_adaptor, "regulatory_feature")) + 1;
my $highest_factor_id = ($self->find_max_id($db_adaptor, "regulatory_factor")) + 1;
my @features;
my @factors;
my %factor_ids_by_name; my %feature_objects;
my %analysis;
foreach my $anal ("cisRed", "cisred_search") {
my $analysis_obj = $analysis_adaptor->fetch_by_logic_name($anal);
die "Can't get analysis for $anal, skipping" if (!$analysis_obj);
$analysis{$anal} = $analysis_obj->dbID();
print "Analysis ID for $anal is " . $analysis{$anal} . "\n";
}
my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisRedProjection');
print "Parsing features from $file\n";
my $skipped = 0;
my $coords_changed = 0;
open (FILE, "<$file") || die "Can't open $file";
<FILE>; while (<FILE>) {
next if ($_ =~ /^\s*\#/ || $_ =~ /^\s*$/);
my %feature;
my ($motif_name, $chromosome, $start, $end, $strand, $group_name, $gene_id) = split (/\t/);
($gene_id) = $gene_id =~ /(ENS.*G\d{11})/;
$feature{NAME} = $motif_name;
$feature{INFLUENCE} = "unknown"; $feature{ANALYSIS_ID} = $analysis{cisRed};
$feature{FACTOR_ID} = $blank_factor_id;
if ($group_name && $group_name ne '' && $group_name !~ /\s/) {
my $factor_id = $factor_ids_by_name{$group_name};
if (!$factor_id) { my %factor;
$factor_id = $highest_factor_id + 1;
$factor{INTERNAL_ID} = $factor_id;
$factor{NAME} = $group_name;
$factor{TYPE} = 'NULL';
push @factors,\% factor;
$factor_ids_by_name{$factor{NAME}} = $factor_id;
$highest_factor_id = $factor_id;
}
$feature{FACTOR_ID} = $factor_id;
}
my $chr_slice = $slice_adaptor->fetch_by_region('chromosome', $chromosome, undef, undef, undef, $old_assembly);
if (!$chr_slice) {
print STDERR "Can't get slice for $chromosome:$start:$end\n";
next;
}
my $seq_region_id = $slice_adaptor->get_seq_region_id($chr_slice);
if (!$seq_region_id) {
print STDERR "Can't get seq_region_id for chromosome $chromosome\n";
next;
}
$feature{SEQ_REGION_ID} = $seq_region_id;
if ($new_assembly) {
my $projected_feature = $self->project_feature($start, $end, $strand, $chromosome, $chr_slice, $dummy_analysis, $new_assembly, $slice_adaptor, $motif_name);
$coords_changed++ if ($projected_feature->start() != $start || $projected_feature->end() != $end);
$start = $projected_feature->start();
$end = $projected_feature->end();
$strand = $projected_feature->strand();
$feature{SEQ_REGION_ID} = $slice_adaptor->get_seq_region_id($projected_feature->feature_Slice);
}
$feature{START} = $start;
$feature{END} = $end;
$feature{STRAND} = $strand;
my $ensembl_id = $stable_id_to_internal_id->{gene}->{$gene_id};
if (!$ensembl_id) {
print STDERR "Can't get ensembl internal ID for $gene_id, skipping\n";
print join("-", $motif_name, $chromosome, $start, $end, $strand, $group_name, $gene_id, "\n");
$skipped++;
next;
}
$feature{ENSEMBL_TYPE} = "Gene";
$feature{ENSEMBL_ID} = $ensembl_id;
$feature{INTERNAL_ID} = $feature_internal_id++;
$feature{EVIDENCE} = "";
push @features,\% feature;
}
close FILE;
my $search_regions_file = dirname($file) . "/search_regions.txt";
my $skipped_sr = 0;
my @search_regions;
print "Parsing search regions from $search_regions_file\n";
open (SEARCH_REGIONS, "<$search_regions_file") || die "Can't open $search_regions_file";
<SEARCH_REGIONS>; while (<SEARCH_REGIONS>) {
chomp;
my ($id, $chromosome, $start, $end, $strand, $ensembl_gene_id) = split;
my $gene_id = $stable_id_to_internal_id->{gene}->{$ensembl_gene_id};
if (!$gene_id) {
warn("Can't get internal ID for $ensembl_gene_id\n");
$skipped_sr++;
next;
}
my $sr_chr_slice = $slice_adaptor->fetch_by_region('chromosome', $chromosome, undef, undef, undef, $old_assembly);
if (!$sr_chr_slice) {
print STDERR "Can't get slice for $chromosome:$start:$end\n";
next;
}
my $sr_seq_region_id = $slice_adaptor->get_seq_region_id($sr_chr_slice);
if (!$sr_seq_region_id) {
print STDERR "Can't get seq_region_id for chromosome $chromosome\n";
next;
}
my $name = "CisRed_Search_$id";
if ($new_assembly) {
my $projected_region = $self->project_feature($start, $end, $strand, $chromosome, $sr_chr_slice, $dummy_analysis, $new_assembly, $slice_adaptor, "CisRed_Search_$id");
$start = $projected_region->start();
$end = $projected_region->end();
$strand = $projected_region->strand();
}
my %search_region;
$search_region{NAME} = $name;
$search_region{SEQ_REGION_ID} = $sr_seq_region_id;
$search_region{START} = $start;
$search_region{END} = $end;
$search_region{STRAND} = ($strand =~ /\+/ ? 1 : -1);
$search_region{ENSEMBL_OBJECT_TYPE} = 'Gene';
$search_region{ENSEMBL_OBJECT_ID} = $gene_id;
$search_region{ANALYSIS_ID} = $analysis{cisred_search};
push @search_regions,\% search_region;
}
close(SEARCH_REGIONS);
$result{FEATURES} =\@ features;
$result{FACTORS} =\@ factors;
$result{SEARCH_REGIONS} =\@ search_regions;
print "Parsed " . scalar(@{$result{FEATURES}}) . " features, " . scalar(@{$result{FACTORS}}) . " factors and " . scalar(@{$result{SEARCH_REGIONS}}) . " search regions\n";
print "Skipped $skipped features and $skipped_sr search regions due to missing stable ID - internal ID mappings\n";
print "$coords_changed features had their co-ordinates changed as a result of assembly mapping.\n" if ($new_assembly);
return\% result; } |