Raw content of BioMart::Formatter::GFF
# $Id: GFF.pm,v 1.3.2.1 2008/07/19 22:13:37 syed Exp $
#
# BioMart module for BioMart::Formatter::GFF
#
# You may distribute this module under the same terms as perl
# itself.
# POD documentation - main docs before the code.
=head1 NAME
BioMart::Formatter::GFF
=head1 SYNOPSIS
The GFF Formatter returns GFF Formatter data for a BioMart query
=head1 DESCRIPTION
The GFF Formatter first of all removes any user chosen attributes from
the BioMart::Query object and adds the appropiate attributes required
for GFF data calculation. These attributes are defined in 'gtf' exportables
for the Dataset being processed. After this initial processing the query is
run and the ResultTable is processed row by row to calculate the correct
structural data for GFF output
=head1 AUTHOR - Syed Haider, Damian Smedley
=head1 CONTACT
This module is part of the BioMart project
http://www.biomart.org
Questions can be posted to the mart-dev mailing list:
mart-dev@ebi.ac.uk
=head1 METHODS
=cut
package BioMart::Formatter::GFF;
use strict;
use warnings;
use Data::Dumper;
# Extends BioMart::FormatterI
use base qw(BioMart::FormatterI);
sub _new {
my ($self) = @_;
$self->SUPER::_new();
$self->attr('outRow', undef);
$self->attr('lastPkey', undef);
$self->attr('exon_idHash', undef);
}
# overrides the same method from FormatterI
sub processQuery {
my ($self, $query) = @_;
$self->set('original_attributes',[@{$query->getAllAttributes()}])
if ($query->getAllAttributes());
# get exportable for terminal dataset from registry and set (attributes)
# on it and remove existing attribute - then set the list - may want a
# general method on FormatterI for doing all the rigid ones
my $final_dataset_order = $query->finalDatasetOrder();
my $registry = $query->getRegistry();
# remove all attributes from query
$query->removeAllAttributes();
foreach my $dataset_name(reverse @$final_dataset_order){
my $dataset = $registry->getDatasetByName($query->virtualSchema, $dataset_name);
if($dataset->visible)
{
if ($dataset->getExportables('gtf', $query->getInterfaceForDataset($dataset_name))){
$query->setDataset($dataset_name);
my $attribute_list = $dataset->getExportables('gtf', $query->getInterfaceForDataset($dataset_name));
my $temp_atts = $attribute_list->getAllAttributes;
$query->addAttributes($attribute_list->getAllAttributes);
}
}
}
$self->set('query',$query);
return $query;
}
sub nextRow {
my $self = shift;
my $rtable = $self->get('result_table');
my $formatted_rows;
my $attr_tmpl = 'gene_id "%s"; transcript_id "%s"; exon_id "%s"';
my $exon_tmpl = "%s\tEnsEMBL\texon\t%s\t%s\t.\t%s\t.\t%s";
my $cds_tmpl = "%s\tEnsEMBL\tCDS\t%s\t%s\t.\t%s\t%s\t%s";
my $start_tmpl = "%s\tEnsEMBL\tstart_codon\t%s\t%s\t.\t%s\t.\t%s";
my $stop_tmpl = "%s\tEnsEMBL\tstop_codon\t%s\t%s\t.\t%s\t.\t%s";
my $curRow = $rtable->nextRow;
#print "\n","@$curRow";
if( !$curRow && !$self->get('lastPkey')){
return;
}
my $pkey = ($curRow) ? $curRow->[0].$curRow->[1] : undef ;
my $lastPkey = $self->get('lastPkey') || $pkey;
my $outRow = $self->get('outRow');
if(($lastPkey && !$pkey) || ($pkey ne $lastPkey)) {
# print "\n\n [$lastPkey]\n\n";
# print Dumper($outRow);
# process saved rows and send some results back
my ($previous_frame, $previous_cds_start, $previous_cds_end) = 0;
foreach my $true_rank (sort { $a <=> $b } keys %$outRow ) {
no warnings 'uninitialized';
my $rank = $outRow->{$true_rank}->{'exon_id'};# dodgy
my $start_rank = $outRow->{$true_rank}->{'start_exon_id'};
my $end_rank = $outRow->{$true_rank}->{'end_exon_id'};
my $exon_start = $outRow->{$true_rank}->{'exon_chrom_start'};
my $exon_end = $outRow->{$true_rank}->{'exon_chrom_end'};
my $strand = $outRow->{$true_rank}->{'strand'};
my $coding_start_offset = $outRow->{$true_rank}->{'coding_start_offset'};
my $coding_end_offset = $outRow->{$true_rank}->{'coding_end_offset'};
my ($coding_start, $coding_end);
my ($attributes, $exon_line, $cds_line, $start_codon_line, $stop_codon_line);
#print "rank: ", $outRow->{$true_rank}->{'rank'}," : " , $outRow->{$true_rank}->{'geneId'} ,
# " : ", $outRow->{$true_rank}->{'transcriptId'} , "[$rank:$start_rank:$end_rank]", "\n";
my $exon_idHash = $self->get('exon_idHash');
if ($start_rank && $end_rank) {
# this includes sequence types : coding_gene_flank, coding_transcript_flank, coding, peptide
if ($rank == $start_rank && $rank == $end_rank) {
# coding start and finish on the same exon
if ($strand == 1) {
$coding_start = $exon_start + $coding_start_offset - 1;
$coding_end = $exon_start + $coding_end_offset - 1;
}
if ($strand == -1) {
$coding_start = $exon_end - $coding_end_offset + 1;
$coding_end = $exon_end - $coding_start_offset + 1;
}
}
elsif ($rank == $start_rank) {
if ($strand == 1) {
$coding_start = $exon_start + $coding_start_offset - 1;
$coding_end = $exon_end;
}
if ($strand == -1) {
$coding_start = $exon_start;
$coding_end = $exon_end - $coding_start_offset + 1;
}
# encountered coding START exon. storing as transcriptId.start_exonId to make it unique
$self->_setCodingExonFlag($pkey.$start_rank, 1);
}
elsif ($rank == $end_rank) {
if ($strand == 1) {
$coding_start = $exon_start;
$coding_end = $exon_start + $coding_end_offset - 1;
}
if ($strand == -1) {
$coding_start = $exon_end - $coding_end_offset + 1;
$coding_end = $exon_end;
}
# encountered coding END exon. clear ranscriptId.start_exonId stored earlier
$self->_setCodingExonFlag($pkey.$start_rank, 0);
}
else {
if ($exon_idHash->{$pkey.$start_rank} == 1) {
$coding_start = $exon_start;
$coding_end = $exon_end;
}
else {
$coding_start = undef;
$coding_end = undef;
}
}
# sanity check
if ($coding_start > $coding_end) {
$coding_start = undef;
$coding_end = undef;
}
# update the coordinates to be returned by this function - now similar to old coordinates
$outRow->{$true_rank}->{'coding_start'} = $coding_start;
$outRow->{$true_rank}->{'coding_end'} = $coding_end;
}
else { # no coding start/end set the vals to NULL
$outRow->{$true_rank}->{'coding_start'} = undef;
$outRow->{$true_rank}->{'coding_end'} = undef;
}
#$attributes = $outRow->{$true_rank}->{'transcriptId'};
$attributes = sprintf($attr_tmpl, $outRow->{$true_rank}->{'geneId'}, $outRow->{$true_rank}->{'transcriptId'}, $outRow->{$true_rank}->{'exonId'});
# exon line
$exon_line = sprintf($exon_tmpl,
$outRow->{$true_rank}->{'chr'},
$outRow->{$true_rank}->{'exon_chrom_start'},
$outRow->{$true_rank}->{'exon_chrom_end'},
($outRow->{$true_rank}->{'strand'} == -1)? '-':'+');
# CDS line
if ($outRow->{$true_rank}->{'coding_start'} && $outRow->{$true_rank}->{'coding_end'}) {
if ($outRow->{$true_rank}->{'strand'} == 1) {
if ($rank == $end_rank
&& $outRow->{$true_rank}->{'exon_chrom_end'} == $outRow->{$true_rank}->{'coding_end'}) {
$outRow->{$true_rank}->{'coding_end'} -= 3;
}
}
else{ # -1
if ($rank == $end_rank
&& $outRow->{$true_rank}->{'exon_chrom_start'} == $outRow->{$true_rank}->{'coding_start'}) {
$outRow->{$true_rank}->{'coding_start'} += 3;
}
}
# adding frame info, this goes only in CDS line
if ($rank == $start_rank) {
$outRow->{$true_rank}->{'frame'} = 0;
}
else {
# NEW/THIS frame = (previous_frame - length) % 3
my $length = ($outRow->{$true_rank}->{'strand'} == 1) ?
($previous_cds_end - $previous_cds_start)
: ($previous_cds_start - $previous_cds_end);
$length += 1;
$outRow->{$true_rank}->{'frame'} = ($previous_frame - $length) % 3;
}
$previous_frame = $outRow->{$true_rank}->{'frame'};
$previous_cds_start = $outRow->{$true_rank}->{'coding_start'};
$previous_cds_end = $outRow->{$true_rank}->{'coding_end'};
$cds_line = sprintf($cds_tmpl,
$outRow->{$true_rank}->{'chr'},
$outRow->{$true_rank}->{'coding_start'},
$outRow->{$true_rank}->{'coding_end'},
($outRow->{$true_rank}->{'strand'} == -1)? '-':'+',
$outRow->{$true_rank}->{'frame'});
}
# start_codon line
if ($rank == $start_rank) {
if ($outRow->{$true_rank}->{'strand'} == 1) {
$start_codon_line = sprintf($start_tmpl,
$outRow->{$true_rank}->{'chr'},
$outRow->{$true_rank}->{'coding_start'},
$outRow->{$true_rank}->{'coding_start'} + 2,
($outRow->{$true_rank}->{'strand'} == -1)? '-':'+');
}
else { # -1
$start_codon_line = sprintf($start_tmpl,
$outRow->{$true_rank}->{'chr'},
$outRow->{$true_rank}->{'coding_end'} - 2 ,
$outRow->{$true_rank}->{'coding_end'},
($outRow->{$true_rank}->{'strand'} == -1)? '-':'+');
}
}
# stop_codon line
if ($rank == $end_rank) {
if ($outRow->{$true_rank}->{'strand'} == 1) {
$stop_codon_line = sprintf($stop_tmpl,
$outRow->{$true_rank}->{'chr'},
$outRow->{$true_rank}->{'coding_end'} + 1,
$outRow->{$true_rank}->{'coding_end'} + 3,
($outRow->{$true_rank}->{'strand'} == -1)? '-':'+');
}
else { # -1
$stop_codon_line = sprintf($stop_tmpl,
$outRow->{$true_rank}->{'chr'},
$outRow->{$true_rank}->{'coding_start'} - 3,
$outRow->{$true_rank}->{'coding_start'} - 1,
($outRow->{$true_rank}->{'strand'} == -1)? '-':'+');
}
}
#$formatted_rows .= $outRow->{$true_rank}->{'has_stop_codon'}.' > ';
$formatted_rows .= $exon_line.$attributes."\n";
$formatted_rows .= $cds_line.$attributes."\n" if ($cds_line);
$formatted_rows .= $start_codon_line.$attributes."\n" if ($start_codon_line);
$formatted_rows .= $stop_codon_line.$attributes."\n" if ($stop_codon_line);
}
undef $outRow;
# lets release the last set of rows. when $pkey is NULL and we have lastPkey
if (!$pkey) {
$self->set('outRow', undef);
$self->set('lastPkey', undef);
return $formatted_rows;
}
}
#########################################################
my $rowHash = ();
my $rowIndex = 0;
# atts from gtf exportable
foreach my $name ( "geneId","transcriptId","exonId","chr", "exon_chrom_start","exon_chrom_end", "coding_start_offset", "coding_end_offset",
"strand", "exon_id", "rank", "start_exon_id", "end_exon_id", "transcript_count", "has_m_start", "has_stop_codon") {
$rowHash->{$name} = $curRow->[$rowIndex++];
}
foreach my $name ( "coding_start", "coding_end", "frame") {
$rowHash->{$name} = "";
}
$outRow->{$rowHash->{'rank'}} = $rowHash;
#push @$outRow, $rowHash;
# now save the current row
$self->set('outRow', $outRow);
$self->set('lastPkey', $pkey);
return $formatted_rows || "\n";
}
sub _setCodingExonFlag {
my ($self, $key, $val) = @_;
# the key value is pkey.start_exonId OR pkey.end_exonId
# doesnt matter if its start or end exon id as the corresponding
# value is always the same for each exon.
my $exon_idHash = $self->get('exon_idHash');
$exon_idHash->{$key} = $val;
$self->set('exon_idHash', $exon_idHash);
}
sub getDisplayNames {
my $self = shift;
return '';# no header required for GFF
}
sub isSpecial {
return 1;
}
1;