BioMart::Formatter GFF
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
BioMart::Formatter::GFF
Package variables
No package variables defined.
Included modules
Data::Dumper
Inherit
BioMart::FormatterI
Synopsis
The GFF Formatter returns GFF-formatted data for a BioMart query.
Description
The GFF Formatter first removes any user-chosen attributes from
the BioMart::Query object and adds the appropriate attributes required
for GFF data calculation. These attributes are defined in the 'gtf' exportables
for the Dataset being processed. After this initial processing, the query is
run and the ResultTable is processed row by row to calculate the correct
structural data for GFF output.
Methods
_new
No description
Code
_setCodingExonFlag
No description
Code
getDisplayNames
No description
Code
isSpecial
No description
Code
nextRow
No description
Code
processQuery
No description
Code
Methods description
None available.
Methods code
_newdescriptionprevnextTop
sub _new {
    my $self = shift;

    # Let the parent class run its own initialisation first.
    $self->SUPER::_new();

    # Register the state slots that nextRow reads and writes between
    # calls; every slot starts out undefined.
    foreach my $slot (qw(outRow lastPkey)) {
        $self->attr($slot, undef);
    }
    return $self->attr('exon_idHash', undef);
}

 # overrides the same method from FormatterI
}
_setCodingExonFlagdescriptionprevnextTop
sub _setCodingExonFlag {
    my $self = shift;
    my ($key, $val) = @_;

    # The key is built by the caller from a primary key plus the id of the
    # coding start exon. Whether the start or the end exon id is used does
    # not matter, as the corresponding value is always the same for each exon.
    my $flag_for = $self->get('exon_idHash');
    $flag_for->{$key} = $val;

    # Store the (possibly freshly created) hash back on the object.
    return $self->set('exon_idHash', $flag_for);
}
getDisplayNamesdescriptionprevnextTop
sub getDisplayNames {
    my ($self) = @_;

    # GFF output carries no header line, so the display-name string is empty.
    my $header = '';
    return $header;
}
isSpecialdescriptionprevnextTop
sub isSpecial {
    # Always reports a true value (1) for this formatter.
    my $special = 1;
    return $special;
}


1;
}
nextRowdescriptionprevnextTop
# Reads one row from the result table and buffers it. When the primary key
# (first column . second column) changes, or the table is exhausted, the
# buffered rows of the previous key are turned into GFF lines (exon, CDS,
# start_codon, stop_codon) and returned as one string.
#
# Returns:
#   - nothing (bare return) once the table is exhausted and no rows are buffered
#   - the formatted lines for the previous key when a key boundary is reached
#   - "\n" when the row was only buffered and nothing was formatted
sub nextRow {
    my $self = shift;

    my $rtable = $self->get('result_table');

    my $formatted_rows;
    # Each line template ends in a %s that receives the attribute column.
    my $attr_tmpl  = 'gene_id "%s"; transcript_id "%s"; exon_id "%s"';
    my $exon_tmpl  = "%s\tEnsEMBL\texon\t%s\t%s\t.\t%s\t.\t%s";
    my $cds_tmpl   = "%s\tEnsEMBL\tCDS\t%s\t%s\t.\t%s\t%s\t%s";
    my $start_tmpl = "%s\tEnsEMBL\tstart_codon\t%s\t%s\t.\t%s\t.\t%s";
    my $stop_tmpl  = "%s\tEnsEMBL\tstop_codon\t%s\t%s\t.\t%s\t.\t%s";

    my $curRow = $rtable->nextRow;

    # Table exhausted and nothing buffered: we are done.
    if (!$curRow && !$self->get('lastPkey')) {
        return;
    }

    my $pkey     = ($curRow) ? $curRow->[0] . $curRow->[1] : undef;
    my $lastPkey = $self->get('lastPkey') || $pkey;
    my $outRow   = $self->get('outRow');

    if (($lastPkey && !$pkey) || ($pkey ne $lastPkey)) {
        # Process the saved rows and send some results back.
        my ($previous_frame, $previous_cds_start, $previous_cds_end) = 0;

        foreach my $true_rank (sort { $a <=> $b } keys %$outRow) {
            no warnings 'uninitialized';

            my $row = $outRow->{$true_rank};

            # The exon ids are compared against the start/end exon ids to
            # decide where coding begins and ends (marked "dodgy" upstream).
            my $rank                = $row->{'exon_id'};
            my $start_rank          = $row->{'start_exon_id'};
            my $end_rank            = $row->{'end_exon_id'};
            my $exon_start          = $row->{'exon_chrom_start'};
            my $exon_end            = $row->{'exon_chrom_end'};
            my $strand              = $row->{'strand'};
            my $coding_start_offset = $row->{'coding_start_offset'};
            my $coding_end_offset   = $row->{'coding_end_offset'};
            my $strand_sign         = ($strand == -1) ? '-' : '+';

            my ($coding_start, $coding_end);
            my ($attributes, $exon_line, $cds_line, $start_codon_line, $stop_codon_line);

            my $exon_idHash = $self->get('exon_idHash');

            # Flag key for the transcript being flushed. $lastPkey is used
            # because $pkey already belongs to the next group (or is undef
            # on the final flush).
            my $flag_key = $lastPkey . $start_rank;

            if ($start_rank && $end_rank) {
                if ($rank == $start_rank && $rank == $end_rank) {
                    # Coding starts and finishes on the same exon.
                    if ($strand == 1) {
                        $coding_start = $exon_start + $coding_start_offset - 1;
                        $coding_end   = $exon_start + $coding_end_offset - 1;
                    }
                    if ($strand == -1) {
                        $coding_start = $exon_end - $coding_end_offset + 1;
                        $coding_end   = $exon_end - $coding_start_offset + 1;
                    }
                }
                elsif ($rank == $start_rank) {
                    if ($strand == 1) {
                        $coding_start = $exon_start + $coding_start_offset - 1;
                        $coding_end   = $exon_end;
                    }
                    if ($strand == -1) {
                        $coding_start = $exon_start;
                        $coding_end   = $exon_end - $coding_start_offset + 1;
                    }
                    # Encountered the coding START exon: raise the flag so the
                    # exons that follow are treated as fully coding.
                    $self->_setCodingExonFlag($flag_key, 1);
                }
                elsif ($rank == $end_rank) {
                    if ($strand == 1) {
                        $coding_start = $exon_start;
                        $coding_end   = $exon_start + $coding_end_offset - 1;
                    }
                    if ($strand == -1) {
                        $coding_start = $exon_end - $coding_end_offset + 1;
                        $coding_end   = $exon_end;
                    }
                    # Encountered the coding END exon: clear the flag stored
                    # at the start exon.
                    $self->_setCodingExonFlag($flag_key, 0);
                }
                else {
                    if ($exon_idHash->{$flag_key} == 1) {
                        # Between the start and end exon: whole exon is coding.
                        $coding_start = $exon_start;
                        $coding_end   = $exon_end;
                    }
                    else {
                        $coding_start = undef;
                        $coding_end   = undef;
                    }
                }

                # Sanity check: discard an inverted range.
                if ($coding_start > $coding_end) {
                    $coding_start = undef;
                    $coding_end   = undef;
                }

                $row->{'coding_start'} = $coding_start;
                $row->{'coding_end'}   = $coding_end;
            }
            else {
                # No coding start/end exon: no coding coordinates.
                $row->{'coding_start'} = undef;
                $row->{'coding_end'}   = undef;
            }

            $attributes = sprintf($attr_tmpl,
                $row->{'geneId'}, $row->{'transcriptId'}, $row->{'exonId'});

            # exon line
            $exon_line = sprintf($exon_tmpl,
                $row->{'chr'}, $row->{'exon_chrom_start'},
                $row->{'exon_chrom_end'}, $strand_sign, $attributes);

            # CDS line
            if ($row->{'coding_start'} && $row->{'coding_end'}) {
                if ($strand == 1) {
                    if ($rank == $end_rank
                        && $row->{'exon_chrom_end'} == $row->{'coding_end'}) {
                        $row->{'coding_end'} -= 3;
                    }
                }
                else {    # -1
                    if ($rank == $end_rank
                        && $row->{'exon_chrom_start'} == $row->{'coding_start'}) {
                        $row->{'coding_start'} += 3;
                    }
                }

                # Frame info goes only in the CDS line.
                if ($rank == $start_rank) {
                    $row->{'frame'} = 0;
                }
                else {
                    # NEW/THIS frame = (previous_frame - length) % 3.
                    # coding_start <= coding_end on both strands (see the
                    # sanity check above), so the length is end - start + 1
                    # regardless of strand.
                    my $length = $previous_cds_end - $previous_cds_start + 1;
                    $row->{'frame'} = ($previous_frame - $length) % 3;
                }
                $previous_frame     = $row->{'frame'};
                $previous_cds_start = $row->{'coding_start'};
                $previous_cds_end   = $row->{'coding_end'};

                $cds_line = sprintf($cds_tmpl,
                    $row->{'chr'}, $row->{'coding_start'}, $row->{'coding_end'},
                    $strand_sign, $row->{'frame'}, $attributes);
            }

            # start_codon line
            if ($rank == $start_rank) {
                if ($strand == 1) {
                    $start_codon_line = sprintf($start_tmpl,
                        $row->{'chr'}, $row->{'coding_start'},
                        $row->{'coding_start'} + 2, $strand_sign, $attributes);
                }
                else {    # -1
                    $start_codon_line = sprintf($start_tmpl,
                        $row->{'chr'}, $row->{'coding_end'} - 2,
                        $row->{'coding_end'}, $strand_sign, $attributes);
                }
            }

            # stop_codon line
            if ($rank == $end_rank) {
                if ($strand == 1) {
                    $stop_codon_line = sprintf($stop_tmpl,
                        $row->{'chr'}, $row->{'coding_end'} + 1,
                        $row->{'coding_end'} + 3, $strand_sign, $attributes);
                }
                else {    # -1
                    $stop_codon_line = sprintf($stop_tmpl,
                        $row->{'chr'}, $row->{'coding_start'} - 3,
                        $row->{'coding_start'} - 1, $strand_sign, $attributes);
                }
            }

            $formatted_rows .= $exon_line . "\n";
            $formatted_rows .= $cds_line . "\n"         if ($cds_line);
            $formatted_rows .= $start_codon_line . "\n" if ($start_codon_line);
            $formatted_rows .= $stop_codon_line . "\n"  if ($stop_codon_line);
        }
        undef $outRow;

        # Release the last set of rows: no current row, but a buffered key.
        if (!$pkey) {
            $self->set('outRow',   undef);
            $self->set('lastPkey', undef);
            return $formatted_rows;
        }
    }

    # Buffer the current row, naming the columns in the order in which the
    # attributes of the 'gtf' exportable arrive.
    my $rowHash  = {};
    my $rowIndex = 0;
    foreach my $name (qw(
        geneId transcriptId exonId chr
        exon_chrom_start exon_chrom_end
        coding_start_offset coding_end_offset
        strand exon_id rank start_exon_id end_exon_id
        transcript_count has_m_start has_stop_codon
    )) {
        $rowHash->{$name} = $curRow->[$rowIndex++];
    }
    foreach my $name (qw(coding_start coding_end frame)) {
        $rowHash->{$name} = "";
    }
    $outRow->{ $rowHash->{'rank'} } = $rowHash;

    # Now save the current row.
    $self->set('outRow',   $outRow);
    $self->set('lastPkey', $pkey);

    # "\n" is returned when nothing was formatted on this call.
    return $formatted_rows || "\n";
}
processQuerydescriptionprevnextTop
# Replaces the attributes of the given query with those of the 'gtf'
# exportable of each visible dataset that defines one.
#
# The attributes the query carried on entry are saved on the formatter as
# 'original_attributes', and the modified query is stored as 'query'.
#
# Params:  $query - the query object to rewrite (modified in place)
# Returns: the same, modified, query object
sub processQuery {
    my ($self, $query) = @_;

    # Keep a copy of the attributes the query came in with.
    $self->set('original_attributes', [ @{ $query->getAllAttributes() } ])
        if ($query->getAllAttributes());

    # Get the exportable for each dataset from the registry and set its
    # attributes on the query after removing the existing ones. A general
    # method on FormatterI for doing all the rigid ones may be wanted.
    my $final_dataset_order = $query->finalDatasetOrder();
    my $registry            = $query->getRegistry();

    # Remove all attributes from the query.
    $query->removeAllAttributes();

    foreach my $dataset_name (reverse @$final_dataset_order) {
        my $dataset = $registry->getDatasetByName($query->virtualSchema,
                                                  $dataset_name);
        next unless $dataset->visible;

        # Look the 'gtf' exportable up once and reuse the result.
        my $attribute_list = $dataset->getExportables('gtf',
            $query->getInterfaceForDataset($dataset_name));
        next unless $attribute_list;

        $query->setDataset($dataset_name);
        $query->addAttributes($attribute_list->getAllAttributes);
    }

    $self->set('query', $query);
    return $query;
}
General documentation
AUTHOR - Syed Haider, Damian SmedleyTop
CONTACTTop
This module is part of the BioMart project
http://www.biomart.org
Questions can be posted to the mart-dev mailing list: mart-dev@ebi.ac.uk