package EnsEMBL::Web::Component::Export;
use strict;
use base qw(Exporter);
our @EXPORT = our @EXPORT_OK = qw(export export_file);
use EnsEMBL::Web::SeqDumper;
use EnsEMBL::Web::Document::SpreadSheet;
sub export {
my $class = shift;
my $custom_outputs = shift || {};
my $object = $class->object;
my $slice = $object->can('slice') ? $object->slice : $object->get_Slice;
my @inputs = @_;
my $o = $object->param('output');
my $flank5 = $object->param('flank5_display');
my $flank3 = $object->param('flank3_display');
my $strand = $object->param('strand');
$strand = undef unless ($strand == 1 || $strand == -1); # Feature strand will be correct automatically
$slice = $slice->invert if ($strand && $strand != $slice->strand);
$slice = $slice->expand($flank5, $flank3) if ($flank5 || $flank3);
my $format = $object->param('_format');
my $params = { 'html_format' => (!$format || $format eq 'HTML') };
map { $params->{$_} = 1 } $object->param('st');
map { $params->{'misc_set'}->{$_} = 1 if $_ } $object->param('miscset');
my $outputs = {
'fasta' => sub { return fasta($object, $slice, $params, @inputs); },
'csv' => sub { return features($object, $slice, $params, 'csv'); },
'gff' => sub { return features($object, $slice, $params, 'gff'); },
'tab' => sub { return features($object, $slice, $params, 'tab'); },
'embl' => sub { return flat($object, $slice, $params, 'embl'); },
'genbank' => sub { return flat($object, $slice, $params, 'genbank'); },
%$custom_outputs
};
return $outputs->{$o}() if $outputs->{$o};
}
sub fasta {
my ($object, $slice, $params, $trans_objects, $object_id) = @_;
my $html;
foreach (@$trans_objects) {
my $transcript = $_->Obj;
my $id_type = $transcript->isa('Bio::EnsEMBL::PredictionTranscript') ? $transcript->analysis->logic_name : $transcript->status . '_' . $transcript->biotype;
my $id = ($object_id ? "$object_id:" : "") . $transcript->stable_id;
my $intron_id = 1;
my $output = {
'cdna' => [[ "$id cdna:$id_type", $transcript->spliced_seq ]],
'coding' => eval { [[ "$id cds:$id_type", $transcript->translateable_seq ]] },
'peptide' => eval { [[ "$id peptide:@{[$transcript->translation->stable_id]} pep:$id_type", $transcript->translate->seq ]] },
'utr3' => eval { [[ "$id utr3:$id_type", $transcript->three_prime_utr->seq ]] },
'utr5' => eval { [[ "$id utr5:$id_type", $transcript->five_prime_utr->seq ]] },
'exons' => eval { [ map {[ "$id " . $_->id . " exon:$id_type", $_->seq->seq ]} @{$transcript->get_all_Exons} ] },
'introns' => eval { [ map {[ "$id intron " . $intron_id++ . ":$id_type", $_->seq ]} @{$transcript->get_all_Introns} ] }
};
foreach (sort keys %$params) {
next unless ref $output->{$_} eq 'ARRAY';
foreach (@{$output->{$_}}) {
$_->[1] =~ s/(.{60})/$1\r\n/g;
$_->[1] =~ s/\r\n$//g;
$html .= ">$_->[0]\r\n$_->[1]\r\n";
}
}
$html .= "\r\n";
}
my $genomic = $object->param('genomic');
if (defined $genomic && $genomic ne 'off') {
my $masking = $genomic eq 'soft_masked' ? 1 : $genomic eq 'hard_masked' ? 0 : undef;
my $seq;
if ($genomic =~ /flanking/) {
if ($genomic =~ /5/) {
my $flank5_slice = $slice->sub_Slice(1, $object->param('flank5_display'), $slice->strand);
if ($flank5_slice) {
$seq = $flank5_slice->seq;
$seq =~ s/(.{60})/$1\r\n/g;
$html .= ">5' Flanking sequence @{[$flank5_slice->name]}\r\n$seq\r\n";
}
}
if ($genomic =~ /3/) {
my $flank3_slice = $slice->sub_Slice($slice->length - $object->param('flank3_display'), $slice->length, $slice->strand);
if ($flank3_slice) {
$seq = $flank3_slice->seq;
$seq =~ s/(.{60})/$1\r\n/g;
$html .= ">3' Flanking sequence @{[$flank3_slice->name]}\r\n$seq\r\n";
}
}
} else {
$seq = defined $masking ? $slice->get_repeatmasked_seq(undef, $masking)->seq : $slice->seq;
$seq =~ s/(.{60})/$1\r\n/g;
$html .= ">@{[$object->seq_region_name]} dna:@{[$object->seq_region_type]} @{[$slice->name]}\r\n$seq\r\n";
}
}
$html = "<pre>$html</pre>" if $html && $params->{'html_format'};
return $html || "No data available";
}
sub features {
my ($object, $slice, $params, $format) = @_;
my $html;
my @common_fields = qw( seqname source feature start end score strand frame );
my @other_fields = qw( hid hstart hend genscan gene_id transcript_id exon_id gene_type variation_name );
my $delim = { 'gff' => "\t", 'csv' => ",", 'tab' => "\t" };
my $options = {
'other' => \@other_fields,
'format' => $format,
'delim' => $delim->{$format}
};
my @features = ();
my $header = join ($delim->{$format}, @common_fields, @other_fields) . "\r\n" if ($format ne 'gff');
if ($params->{'similarity'}) {
foreach (@{$slice->get_all_SimilarityFeatures}) {
$html .= feature('similarity', $options, $_, { 'hid' => $_->hseqname, 'hstart' => $_->hstart, 'hend' => $_->hend });
}
}
if ($params->{'repeat'}) {
foreach (@{$slice->get_all_RepeatFeatures}) {
$html .= feature('repeat', $options, $_, { 'hid' => $_->repeat_consensus->name, 'hstart' => $_->hstart, 'hend' => $_->hend });
}
}
if ($params->{'genscan'}) {
foreach my $t (@{$slice->get_all_PredictionTranscripts}) {
foreach my $f (@{$t->get_all_Exons}) {
$html .= feature('pred.trans.', $options, $f, { 'genscan' => $t->stable_id });
}
}
}
if ($params->{'variation'}) {
foreach (@{$slice->get_all_VariationFeatures}) {
$html .= feature('variation', $options, $_, { 'variation_name' => $_->variation_name });
}
}
if ($params->{'gene'}) {
my @dbs = ('core');
push (@dbs, 'vega') if $object->species_defs->databases->{'DATABASE_VEGA'};
push (@dbs, 'otherfeatures') if $object->species_defs->databases->{'DATABASE_OTHERFEATURES'};
foreach my $db (@dbs) {
foreach my $g (@{$slice->get_all_Genes(undef, $db)}) {
foreach my $t (@{$g->get_all_Transcripts}) {
foreach my $f (@{$t->get_all_Exons}) {
$html .= feature('gene', $options, $f, {
'exon_id' => $f->stable_id,
'transcript_id' => $t->stable_id,
'gene_id' => $g->stable_id,
'gene_type' => $g->status . '_' . $g->biotype
}, $db eq 'vega' ? 'Vega' : 'Ensembl');
}
}
}
}
}
if ($html) {
$html = "$header$html";
$html = "<pre>$html</pre>" if $params->{'html_format'};
}
if ($params->{'misc_set'}) {
my $misc_sets = $object->species_defs->databases->{'DATABASE_CORE'}{'tables'}{'misc_feature'}{'sets'};
my $table;
$options->{'seq_region'} = $object->seq_region_name;
$options->{'start'} = $object->seq_region_start;
$options->{'end'} = $object->seq_region_end;
$html .= "\r\n";
foreach (sort { $misc_sets->{$a}->{'name'} cmp $misc_sets->{$b}->{'name'} } keys %{$params->{'misc_set'}}) {
my $tmp = {
'misc_set' => $_,
'name' => $misc_sets->{$_}->{'name'}
};
$table = $params->{'html_format'} ? new EnsEMBL::Web::Document::SpreadSheet : undef;
$html .= misc_set($object, $slice, { %$options, %$tmp }, $table);
}
$table = $params->{'html_format'} ? new EnsEMBL::Web::Document::SpreadSheet : undef;
$html .= misc_set_genes($object, $slice, $options, $table);
}
return $html || "No data available";
}
sub feature {
my ($type, $options, $feature, $extra, $def_source) = @_;
my $score = $feature->can('score') ? $feature->score : '.';
my $source = $feature->can('source_tag') ? $feature->source_tag : ($def_source || 'Ensembl');
my $tag = $feature->can('primary_tag') ? $feature->primary_tag : (ucfirst(lc $type) || '.');
$source =~ s/\s/_/g;
$tag =~ s/\s/_/g;
my ($name, $strand, $start, $end, $phase);
if ($feature->can('seq_region_name')) {
$strand = $feature->seq_region_strand;
$name = $feature->seq_region_name;
$start = $feature->seq_region_start;
$end = $feature->seq_region_end;
} else {
$strand = $feature->can('strand') ? $feature->strand : undef;
$name = $feature->can('entire_seq') && $feature->entire_seq ? $feature->entire_seq->name : undef;
$name = $feature->seqname if !$name && $feature->can('seqname');
$start = $feature->can('start') ? $feature->start : undef;
$end = $feature->can('end') ? $feature->end : undef;
}
$name ||= 'SEQ';
$name =~ s/\s/_/g;
if ($strand == 1) {
$strand = '+';
$phase = $feature->can('phase') ? $feature->phase : '.';
} elsif ($strand == -1) {
$strand = '-';
$phase = $feature->can('end_phase') ? $feature->end_phase : '.';
}
$phase = '.' if $phase == -1 || !defined $phase;
$strand ||= '.';
my @results = ($name, $source, $tag, $start, $end, $score, $strand, $phase);
if ($options->{'format'} eq 'gff') {
push (@results, join ("; ", map { defined $extra->{$_} ? "$_=$extra->{$_}" : () } @{$options->{'other'}}));
} else {
push (@results, map { $extra->{$_} } @{$options->{'other'}});
}
return join ($options->{'delim'}, @results) . "\r\n";
}
sub misc_set {
my ($object, $slice, $options, $table) = @_;
my @fields = ( 'SeqRegion', 'Start', 'End', 'Name', 'Well name', 'Sanger', 'EMBL Acc', 'FISH', 'Centre', 'State' );
my $header = "Features in set $options->{'name'} in Chromosome $options->{'seq_region'} $options->{'start'} - $options->{'end'}";
$header = "<h2>$header</h2>" if $table;
my $db = $object->database('core');
my @regions;
my $adaptor;
my $row;
my $results;
my $i = 0;
eval {
$adaptor = $db->get_MiscSetAdaptor->fetch_by_code($options->{'misc_set'});
};
if ($adaptor) {
if ($table) {
$table->add_columns(map {{ 'title' => $_, 'align' => 'left' }} @fields);
} else {
$header .= "\r\n" . join ($options->{'delim'}, @fields) . "\r\n";
}
push (@regions, $slice);
foreach my $r (@regions) {
foreach (sort { $a->start <=> $b->start } @{$db->get_MiscFeatureAdaptor->fetch_all_by_Slice_and_set_code($r, $adaptor->code)}) {
$row = [
$_->seq_region_name,
$_->seq_region_start,
$_->seq_region_end,
join (';', @{$_->get_all_attribute_values('clone_name')}, @{$_->get_all_attribute_values('name')}),
join (';', @{$_->get_all_attribute_values('well_name')}),
join (';', @{$_->get_all_attribute_values('synonym')}, @{$_->get_all_attribute_values('sanger_project')}),
join (';', @{$_->get_all_attribute_values('embl_acc')}),
$_->get_scalar_attribute('fish'),
$_->get_scalar_attribute('org'),
$_->get_scalar_attribute('state')
];
if ($table) {
$table->add_row($row);
} else {
$results .= join ($options->{'delim'}, @$row) . "\r\n";
}
$i++;
}
}
}
return $header . ($i ? ($table ? $table->render : $results) : "No data available\r\n") . ($table ? "<br /><br />" : "\r\n");
}
sub misc_set_genes {
my ($object, $slice, $options, $table) = @_;
my @gene_fields = ( 'SeqRegion', 'Start', 'End', 'Ensembl ID', 'DB', 'Name' );
my $header = "Genes in Chromosome $options->{'seq_region'} $options->{'start'} - $options->{'end'}";
$header = "<h2>$header</h2>" if $table;
my $row;
my $results;
my $i = 0;
if ($table) {
$table->add_columns(map {{ 'title' => $_, 'align' => 'left' }} @gene_fields);
} else {
$header .= "\r\n" . join ($options->{'delim'}, @gene_fields) . "\r\n";
}
foreach (sort { $a->seq_region_start <=> $b->seq_region_start } map { @{$slice->get_all_Genes($_)||[]} } qw( ensembl havana ensembl_havana_gene )) {
$row = [
$_->seq_region_name,
$_->seq_region_start,
$_->seq_region_end,
$_->stable_id,
$_->external_db || '-',
$_->external_name || '-novel-'
];
if ($table) {
$table->add_row($row);
} else {
$results .= join ($options->{'delim'}, @$row) . "\r\n";
}
$i++;
}
return $header . ($i ? ($table ? $table->render : $results) : "No data available");
}
sub flat {
my ($object, $slice, $params, $format) = @_;
my $seq_dumper = EnsEMBL::Web::SeqDumper->new;
foreach (qw( genscan similarity gene repeat variation contig marker )) {
$seq_dumper->disable_feature_type($_) unless $params->{$_};
}
my $vega_db = $object->database('vega');
my $estgene_db = $object->database('otherfeatures');
if ($params->{'vegagene'} && $vega_db) {
$seq_dumper->enable_feature_type('vegagene');
$seq_dumper->attach_database('vega', $vega_db);
}
if ($params->{'estgene'} && $estgene_db) {
$seq_dumper->enable_feature_type('estgene');
$seq_dumper->attach_database('estgene', $estgene_db);
}
my $rtn = $seq_dumper->dump($slice, $format);
$rtn = "<pre>$rtn</pre>" if $params->{'html_format'};
return $rtn;
}
sub export_file {
my ($file, $object, $o) = @_;
my $slice = $object->can('slice') ? $object->slice : $object->get_Slice unless $o =~ /haploview|ld_excel/;
my $outputs = {
'seq' => sub { pip_seq_file($file, $object, $slice); },
'pipmaker' => sub { pip_anno_file($file, $object, $slice, $o); },
'vista' => sub { pip_anno_file($file, $object, $slice, $o); }
};
if (!$outputs->{$o}) {
warn "Invalid file format $o";
return;
}
$outputs->{$o}();
}
sub pip_seq_file {
my ($file, $object, $slice) = @_;
(my $seq = $slice->seq) =~ s/(.{60})/$1\r\n/g;
my $fh;
if (ref $file) {
$fh = $file;
} else {
open $fh, ">$file";
}
print $fh ">@{[$slice->name]}\r\n$seq";
close $fh unless ref $file;
}
sub pip_anno_file {
my ($file, $object, $slice, $o) = @_;
my $slice_length = $slice->length;
my $outputs = {
'pipmaker' => sub { return pip_anno_file_pipmaker(@_); },
'vista' => sub { return pip_anno_file_vista(@_); }
};
my $fh;
if (ref $file) {
$fh = $file;
} else {
open $fh, ">$file";
}
foreach my $gene (@{$slice->get_all_Genes(undef, undef, 1) || []}) {
# only include genes that don't overlap slice boundaries
next if ($gene->start < 1 or $gene->end > $slice_length);
my $gene_header = join(" ", ($gene->strand == 1 ? ">" : "<"), $gene->start, $gene->end, $gene->external_name || $gene->stable_id);
$gene_header .= "\r\n";
foreach my $transcript (@{$gene->get_all_Transcripts}) {
# get UTR/exon lines
my @exons = @{$transcript->get_all_Exons};
@exons = reverse @exons if ($gene->strand == -1);
my $out = $outputs->{$o}($transcript, \@exons);
# write output to file if there are exons in the exported region
print $fh "$gene_header$out" if $out;
}
}
close $fh unless ref $file;
}
sub pip_anno_file_vista {
my ($transcript, $exons) = @_;
my $coding_start = $transcript->coding_region_start;
my $coding_end = $transcript->coding_region_end;
my $out;
foreach my $exon (@$exons) {
if (!$coding_start) { # no coding region at all
$out .= join(" ", $exon->start, $exon->end, "UTR\r\n");
} elsif ($exon->start < $coding_start) { # we begin with an UTR
if ($coding_start < $exon->end) { # coding region begins in this exon
$out .= join(" ", $exon->start, $coding_start - 1, "UTR\r\n");
$out .= join(" ", $coding_start, $exon->end, "exon\r\n");
} else { # UTR until end of exon
$out .= join(" ", $exon->start, $exon->end, "UTR\r\n");
}
} elsif ($coding_end < $exon->end) { # we begin with an exon
if ($exon->start < $coding_end) { # coding region ends in this exon
$out .= join(" ", $exon->start, $coding_end, "exon\r\n");
$out .= join(" ", $coding_end + 1, $exon->end, "UTR\r\n");
} else { # UTR (coding region has ended in previous exon)
$out .= join(" ", $exon->start, $exon->end, "UTR\r\n");
}
} else { # coding exon
$out .= join(" ", $exon->start, $exon->end, "exon\r\n");
}
}
return "$out\r\n";
}
sub pip_anno_file_pipmaker {
my ($transcript, $exons) = @_;
my $coding_start = $transcript->coding_region_start;
my $coding_end = $transcript->coding_region_end;
my $out;
# do nothing for non-coding transcripts
return unless ($coding_start);
# add UTR line
if ($transcript->start < $coding_start or $transcript->end > $coding_end) {
$out .= join(" ", "+", $coding_start, $coding_end, "\r\n");
}
# add exon lines
foreach my $exon (@$exons) {
$out .= join(" ", $exon->start, $exon->end, "\r\n");
}
return "$out\r\n";
}
1;