Raw content of FlyBaseGff
#!/usr/local/ensembl/bin/perl -w
# This module currently contains a parser does not parse all information possible out of the file.
package FlyBaseGff;
use strict;
use warnings;
use FlyBaseConf;
use FlyBaseToolbox;
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate verbose);
use Bio::EnsEMBL::SimpleFeature;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::DBEntry;
use Data::Dumper;
$|=1;
=pod
=head2 new
Title : new
Usage :
Function : new GFF object
Arguments :
Return-Val :
=cut
# construct newFlyBaseGff-object (GFF Version 3) of flybase gff
# desc of gff3: http://song.sourceforge.net/gff3.shtml
sub new {
my ($class,@args) = @_;
my $self={};
bless $self,$class;
my ($db,$gff,$slice) = rearrange(
[qw(
DB
GFF
SLICE
)],@args);
$self->db($db);
$self->gff($gff);
$self->slice($slice);
$self->{_gene}=[];
$self->{exons}={};
$self->{db_entry}={};
$self->parse_gff();
print "\n* * *\ngff $gff parsed\n* * * \n";
return $self;
}
=pod
=head2 db_entry
Title : db_entry
Usage :
Function : for xrefs
Arguments :
Return-Val :
=cut
sub db_entry{
my ($self, $tr,$db_entry) = @_;
if($db_entry){
$ {$self->{db_entry}} {$tr}=$db_entry;
}
return $ { $self->{db_entry}}{$tr};
}
=pod
=head2 get_all_attributes_by_id
Title : get_all_attributes_by_id
Usage : $obj->get_all_attributes_by_id('ID')
Function : access all stored attributes of a feature-id
Arguments : ID
Return-Val : arrayref to arrayrefs of attributes of a given ID
=cut
sub get_childrenID_from_parentID {
my ($self,$id) = @_;
my $children_ref = {};
if (${$self->parent2child}{$id}) {
$children_ref = ${$self->parent2child}{$id};
}
return $children_ref;
}
sub get_features_from_ID {
my ($self,$id) = @_;
return ${$self->featurehash}{$id};
}
#
# Methods used for storing attributes, genes, SimpleFeatures
################################################################################
=pod
=head3 storing attributes, genes, SimpleFeatures
=head2 store_as_gene_object
Title : store_as_gene_object
Usage : store_as_gene_object ('gene', 'mRNA','flybase_gene')
Function : looks in gff for entity of type 'gene' which has childs 'mRNA' and 'exon' and stores them
as gene of type 'flybase_gene' in the database
Decsription : Given a slice, parent type (eg. Gene), child type (eg. mRNA, tRNA, pseudogene)
this goes through the gff hash and finds all parents of the given type and
makes them into Genes. If they have associates Transcripts and Exons then
these are added too. They are all stored in the database.
Arguments :
Return-Val :
=cut
sub store_as_gene_object {
my ($self, $parent_type, $child_type, $ensembl_gene_type) = @_;
my %all_processed_exons;
print "Parent feature_type=$parent_type and Child_feature_type=$child_type\n";
# test if $parent_type is annotated in gff
my @gene_ids = @{$self->get_ids_by_type($parent_type)};
if (!@gene_ids || scalar(@gene_ids) < 1) {
warning("No parent type '$parent_type' found in gff file ".$self->gff()." (child type $child_type)\n");
return;
} else {
# Get all gene ids for this type (gene)
print "Have ".scalar(@gene_ids)." genes of parent type $parent_type\n";
}
# want to timestamp the stable_ids
my $time = time();
GENE: foreach my $gene_id ( @gene_ids ) {
print "Looking at gene $gene_id\n";
# note that genes with no transcripts are not stored
# eg. mt:ori gene
# # #
# Transcripts
# # #
# start looking for transcripts of this parent gene
my $all_new_transcripts;
# get a list of child ids for the parent
# ie. get a list of transcript IDs for the Gene
my @transcript_ids = keys %{$self->get_childrenID_from_parentID($gene_id)};
my @keep;
if (scalar(@transcript_ids) == 0) {
# there are no children of this type for the parent
print " x WARNING: No child of type $child_type for $parent_type with id $gene_id\n";
# make a gene without transcripts?
next GENE;
} else {
# the parent has children of type $child_type
# Loop thru the alternate transcripts
foreach my $t (@transcript_ids) {
if (${$self->get_childrenID_from_parentID($gene_id)}{$t}[1] eq $child_type) {
push @keep, $t;
}
}
if (scalar(@keep) == 0) {
print " x WARNING: No child of type $child_type for $parent_type with id $gene_id\n";
next GENE;
} else {
print "\n* * *\n>> Doing gene $gene_id \n* * *\n";
$all_new_transcripts = make_Transcripts($self, \@keep, $child_type, $time);
}
} # if (scalar(@transcript_ids) == 0) {
print "Renaming exons\n";
# Rename exons where necessary
my ($rt) = rename_exons($gene_id, $all_new_transcripts);
my @renamed_transcripts = @{$rt};
# # #
# Make Genes
# # #
# got all alternative transcripts and make genes
my $gene = Bio::EnsEMBL::Gene->new(
-START => ${$self->get_features_from_ID($gene_id)}{'start'},
-END => ${$self->get_features_from_ID($gene_id)}{'end'},
-STRAND => ${$self->get_features_from_ID($gene_id)}{'strand'},
-SLICE => $self->slice,
-ANALYSIS => $self->create_analysis($LOGIC_NAME_EXON),
-STABLE_ID => $gene_id,
-VERSION => $ATTRIBUTE_VERSION,
-TYPE => $ensembl_gene_type,
-CREATED_DATE => $time,
-MODIFIED_DATE => $time,
-SOURCE => ${$self->get_features_from_ID($gene_id)}{'source'},
);
# add Transcripts to Gene
foreach my $tr (@renamed_transcripts) {
$gene->add_Transcript($tr);
}
# # #
# Store
# # #
# check if we already have an exon-stable-id/gene-stable-id/translation-stable-id/transcript-stable-id for this gene
print "\nNow trying to store gene $gene_id....." ;
my $gene_DB_id = $self->db->get_GeneAdaptor->store($gene);
print "STORED gene $gene_DB_id \n\n" ;
# # Xrefs
# foreach my $dbe (@{$self->get_dbxrefs($gene_id, $parent_type)}) {
# $self->db->get_DBEntryAdaptor->store($dbe,$gene_DB_id, 'Gene');
# print " *stored xref with dbname ".$dbe->dbname." for ".ref($gene)." with id $gene_DB_id\n";
# }
# reprocess the db_entry for transcripts
#for my $tr (@renamed_transcripts){
# for my $tr (@{$gene->get_all_Transcripts}) {
# foreach my $dbe (@{$self->get_dbxrefs($tr->stable_id, $child_type)}) {
# # my $dbe = $self->db_entry($tr);
# if($dbe){
# $self->db->get_DBEntryAdaptor->store($dbe, $tr->dbID,'Transcript');
# print " *stored xref with dbname ".$dbe->dbname." for ".ref($tr)." with id ".$tr->dbID."\n";
# }
# }
# if ( $tr->translation() ) {
# my $tln = $tr->translation() ;
# # we have a protein-coding gene and can gets xrefs for the protein
# foreach my $dbe (@{$self->get_dbxrefs($tln->stable_id, 'protein')}) {
# if ($dbe) {
# $self->db->get_DBEntryAdaptor->store($dbe, $tln->dbID,'Translation');
# print " *stored xref with dbname ".$dbe->dbname." for ".ref($tln)." with id ".$tln->dbID."\n";
# }
# }
# }
# }
} # for my $gene_id ( @{$self->get_ids_by_type($parent_type)} ) {
} # end sub store_as_gene_object {
sub make_Transcripts {
my ($self, $tref, $biotype, $time) = @_;
my @transcript_ids = @{$tref};
my @transcripts;
for my $transcript_id (@transcript_ids) {
# make an EnsEMBL Transcript
my $new_transcript = new Bio::EnsEMBL::Transcript(
-STABLE_ID => $transcript_id,
-VERSION => $ATTRIBUTE_VERSION,
-TYPE => $biotype,
-CREATED_DATE => $time,
-MODIFIED_DATE => $time,
-ANALYSIS => $self->create_analysis($LOGIC_NAME_EXON),
);
# print " Have new Transcript $transcript_id of biotype $biotype. Getting xrefs...\n";
#
# # It takes the first Dbxref entry and stores it
# # my @dbx = @{${$self->featurehash}{$transcript_id}{'Dbxref'}};
# my @dbxrefs = @{$self->get_features_from_ID($transcript_id)->{'Dbxref'}};
# if (scalar @dbxrefs) {
# print "Found xrefs @dbxrefs";
# # not sure if we should use all of them or just the first
# $self->db_entry($new_transcript, ${$self->get_features_from_ID($transcript_id)}{'Dbxref'}[0]);
# } else {
# print "WARNING: No db entry transcript $transcript_id\n";
# }
# # #
# Exons
# # #
my @exon_ids = keys %{$self->get_childrenID_from_parentID($transcript_id)};
if (scalar(@exon_ids) == 0) {
print "\n x No children for transcript $transcript_id, must be non-coding\n";
} else {
print "\nTranscript $transcript_id has ".@exon_ids." possible exons\n";
my $transcript = make_Exons($self, $new_transcript, \@exon_ids, $time);
my $transcript_with_tln = add_Translation($self, $transcript);
push @transcripts, $transcript_with_tln;
}
} # for my $transcript_id (@transcript_ids) {
return \@transcripts;
}
sub make_Exons {
my ($self, $transcript, $exon_ids, $time ) = @_;
my @exons;
# Loop thru exons and Make EnsEMBL objects
EXON: foreach my $exon_id (@$exon_ids) {
if (${$self->get_features_from_ID($exon_id)}{'type'} ne 'exon') {
next EXON;
}
my $ex = Bio::EnsEMBL::Exon->new(
-START => ${$self->get_features_from_ID($exon_id)}{'start'},
-END => ${$self->get_features_from_ID($exon_id)}{'end'},
-STRAND => ${$self->get_features_from_ID($exon_id)}{'strand'},
-PHASE => 0,
-END_PHASE => 0,
-SLICE => $self->slice,
-ANALYSIS => $self->create_analysis($LOGIC_NAME_EXON),
-STABLE_ID => $exon_id,
-CREATED_DATE => $time,
-MODIFIED_DATE => $time,
-VERSION => $ATTRIBUTE_VERSION
);
print " For Transcript ".$transcript->stable_id.", have Exon ".$ex->stable_id." with ".
"start ".$ex->start." end ".$ex->end." strand ".$ex->strand."\n" ;
# add Exon to Transcript
$transcript->add_Exon($ex);
}
return $transcript;
}
sub add_Translation {
my ($self, $transcript) = @_;
my @protein_ids;
foreach my $child_id (keys %{$self->get_childrenID_from_parentID($transcript->stable_id)}) {
if (${$self->get_features_from_ID($child_id)}{'type'} eq 'protein') {
push @protein_ids, $child_id;
print " Found protein for transcript ".$transcript->stable_id." (strand ".$transcript->strand.") : ";
}
}
if (scalar(@protein_ids) > 1) {
print "WARNING: More than one protein for transcript ".$transcript->stable_id." @protein_ids";
}
if (defined $protein_ids[0]) {
print $protein_ids[0]."\n";
# get 3,4 and 6th attribute of gff-line
my $cds_start = ${$self->get_features_from_ID($protein_ids[0])}{'start'};
my $cds_end = ${$self->get_features_from_ID($protein_ids[0])}{'end'};
my $cds_strand = ${$self->get_features_from_ID($protein_ids[0])}{'strand'};
# add Translatio to transcript
print "Adding tln with protein ".$protein_ids[0]." having start $cds_start end $cds_end strand $cds_strand\n";
$transcript = add_TranslationObject($protein_ids[0], $transcript, $cds_start,$cds_end,$cds_strand);
$transcript = $self->setExonPhases($transcript);
# set phases of exons
} else {
print "No protein for transcript ".$transcript->stable_id." and phases not set\n";
}
return $transcript;
}
sub rename_exons {
my ($gene_id, $all_new_transcripts) = @_;
my @transcripts = @$all_new_transcripts;
my %all_processed_exons;
# renaming of exons if the have the same coordiantes but different phases
my (%hk_name,%exoncheck);
# check for exons which have same name but different phases
# by getting all exon-hashkeys of one flybase-exon-identifer CG100: hk1, hk2,..
foreach my $trans ( @transcripts ) {
print "Finding exons for transcript $trans with stable_id ".$trans->stable_id."\n";
foreach my $e ( @{$trans->get_all_Exons()} ) {
if (exists $exoncheck{$e->stable_id}) {
${$exoncheck{$e->stable_id}}{$e->hashkey} = $e->stable_id;
# $e->hashkey returns a unique hashkey that can be used to uniquely identify
# this exon. Exons are considered to be identical if they share
# the same seq_region, start, end, strand, phase, end_phase.
# Note that this will consider two exons on different slices
# to be different, even if they actually are not.
# Returntype : string formatted as slice_name-start-end-strand-phase-end_phase
}else{
#warning("No stable_id for exon ".$e->hashkey."\n");
${$exoncheck{$e->stable_id}}{$e->hashkey} = $e->stable_id;
}
$hk_name{$e->hashkey}= $e->stable_id;
# Get slice_name, start, end, strand, phase, end_phase
my @line = split /\-/,$e->hashkey;
${$all_processed_exons{"$line[0]-$line[1]-$line[2]"}}{$gene_id}=();
}
}
# find exons which have the same id (CG923:1) but different hashkeys
my @multiple_hashkey;
for (keys %exoncheck) {
if (scalar (keys %{$exoncheck{$_}} ) > 1) {
# more than one hashkey for this exons_stable_id
for my $hashkey (keys %{$exoncheck{$_}}) {
push @multiple_hashkey, $hashkey;
}
}
}
# make new names for exon with same bounds but different phases
my %same_bounds;
for my $mh(@multiple_hashkey) {
my @line = split /\-/,$mh;
${$same_bounds{"$line[0]-$line[1]-$line[2]"}}{$mh}=();
}
# construct new name for each of the diffrent hahskeys
for(keys %same_bounds) {
my %exon_hk = %{$same_bounds{$_}};
my $cnt = "A";
for my $ehk(keys %exon_hk) {
$hk_name{$ehk} = $hk_name{$ehk}."-$cnt";
$cnt++;
}
}
# rename the exons with diff. hashkeys
for my $mh(@multiple_hashkey) {
foreach my $trans ( @transcripts ) {
foreach my $e ( @{$trans->get_all_Exons} ) {
# change stable_id
if ($e->hashkey eq $mh) {
$e->stable_id($hk_name{$mh});
}
}
}
}
# renaming exons which have different phases and which are shared by different genes
foreach my $trans ( @transcripts ) {
foreach my $e ( @{$trans->get_all_Exons} ) {
# rename exons which are shared by different genes
my @line = split /\-/,$e->hashkey; # get "root" of exon-hashkey without phase and strand-information
# get all unique genes which share this exon
my %genes_of_exon = %{$all_processed_exons{"$line[0]-$line[1]-$line[2]"} };
for my $gid (keys %genes_of_exon) {
if ($gid ne $gene_id) {
# if the exon is shared by another gene than the actual one rename the exon
# processed_gene : CG233
# name of exon CG100:4
# new name of exon: CG100:4
my $new_name = $e->stable_id . "--" . $gene_id ;
$e->stable_id ($new_name);
}
}
}
}
return \@transcripts;
}
=head2 prune_Exons
Arg [1] : Bio::EnsEMBL::Gene
Function : remove duplicate exons between two transcripts
Returntype: Bio::EnsEMBL::Gene
=cut
sub prune_Exons {
my ($gene) = @_;
my @unique_Exons;
# keep track of all unique exons found so far to avoid making duplicates
# need to be very careful about translation->start_Exon and translation->end_Exon
foreach my $tran (@{$gene->get_all_Transcripts}) {
my @newexons;
foreach my $exon (@{$tran->get_all_Exons}) {
my $found;
#always empty
UNI:foreach my $uni (@unique_Exons) {
if ($uni->start == $exon->start &&
$uni->end == $exon->end &&
$uni->strand == $exon->strand &&
$uni->phase == $exon->phase &&
$uni->end_phase == $exon->end_phase
) {
$found = $uni;
last UNI;
}
}
if (defined($found)) {
push(@newexons,$found);
if ($exon == $tran->translation->start_Exon) {
$tran->translation->start_Exon($found);
}
if ($exon == $tran->translation->end_Exon) {
$tran->translation->end_Exon($found);
}
} else {
push(@newexons,$exon);
push(@unique_Exons, $exon);
}
}
$tran->flush_Exons;
foreach my $exon (@newexons) {
$tran->add_Exon($exon);
}
}
return $gene;
}
=pod
=head2 store_as_simple_feature
Title : store_as_simple_feature
Usage : $obj->store_as_simple_feature($analysis_adaptor, $type, $logic_name)
Function : 1st creates an analysis-object using the submitted analysis-adaptor and
stores all features of gff of class $type (indicated by 3rd column in gff) as SimpleFeature of a given Analysis
Arguments : AnalysisAdaptor-Object, $type, $logic_name
Return-Val : none
=cut
sub store_as_simple_feature{
my ($self, $sf_adaptor, $type) = @_;
# check if there is such a feature in gff
if ($self->get_ids_by_type($type)) {
# get all ids of simplefeaturs of a given type
my @ids_of_given_type = @{$self->get_ids_by_type($type)};
print "\n* * *\n>> Doing type $type\n* * *\n";
print "Found ".scalar(@ids_of_given_type)." ids of type $type\n";
# check that these ids are unique... they should be!
my %unique_ids;
for (@ids_of_given_type){
if (!exists $unique_ids{$_}) {
$unique_ids{$_} = 1;
} else {
warning("Non-unique entry of type $type : ID ".$_);
}
}
print "Found ".scalar(keys %unique_ids)." UNIQUE ids of type $type\n";
# ok , now we get the simple features
my @simple_features;
my %analyses;
SIMPLE: foreach my $unique_id (keys %unique_ids) { # process each simple_feature
if (${$self->get_features_from_ID($unique_id)}{'type'} eq $type) {
# make an analysis, if it does not yet exists, from the type
# it would be nice to use the source too, but i'm not quite sure how to bring it in to our current db schema
my $analysis_logic = ${$self->get_features_from_ID($unique_id)}{'type'};
if (length($analysis_logic) > 40) {
my $old = $analysis_logic;
$analysis_logic = substr($analysis_logic,0,40);
warning("Shortened analysis logic_name from $old to $analysis_logic");
}
if (!exists $analyses{$analysis_logic}) {
$analyses{$analysis_logic} = $self->create_analysis($analysis_logic);
}
# my $cds_start = ${$self->get_features_from_ID($protein_ids[0])}{'start'};
my $score = ${$self->get_features_from_ID($unique_id)}{'score'};
my $display = ${$self->get_features_from_ID($unique_id)}{'ID'}[0];
if (${$self->get_features_from_ID($unique_id)}{'Name'}[0]) {
$display = ${$self->get_features_from_ID($unique_id)}{'Name'}[0];
}
# we could also use the name field as a display name but for now the ID is unique and more descriptive
my $sf = Bio::EnsEMBL::SimpleFeature->new(
-start => ${$self->get_features_from_ID($unique_id)}{'start'},
-end => ${$self->get_features_from_ID($unique_id)}{'end'},
-strand => ${$self->get_features_from_ID($unique_id)}{'strand'},
-slice => $self->slice,
-analysis => $analyses{$analysis_logic},
-score => ($score eq '.' ? '0.0' : $score),
-display_label => $display,
-dbID => undef,
-adaptor => undef
);
push @simple_features, $sf;
} else {
next SIMPLE;
}
} # end for
if (scalar(@simple_features > 0)) {
$sf_adaptor->store(@simple_features);
return scalar @simple_features;
} else {
return 0;
}
} else {
warning("Can't find any simple features of type $type");
return 0;
}
} # eos
=pod
=head2 create_analysis
Title : create_analysis
Usage : $obj->create_analysis ( $analysis_adaptor, $logic_name)
Function : 1st creates an analysis-object of the values in the FlyBaseConf-file and
stores the analysis by using the submitted AnalysisAdaptor (normally created out of the registry-file)
Arguments : AnalysisAdaptor-Object, $type, $logic_name
Return-Val : none
=cut
sub create_analysis{
my ($self,$logic_name, $db_file) = @_;
my $ana_obj =Bio::EnsEMBL::Analysis->new(
-logic_name =>$logic_name,
-db => $ANA_DB,
-db_version => $ANA_DB_VERSION,
-db_file => $ANA_DB_FILE,
-program => $ANA_PROGRAM,
-program_version => $ANA_PROGRAM_VERSION,
-program_file => $ANA_PROGRAM_FILE,
-gff_source => $ANA_GFF_SOURCE,
-gff_feature => $ANA_GFF_FEATURE,
-module => $ANA_MODULE,
-module_version => $ANA_MODULE_VERSION,
-parameters => $ANA_PARAMETERS,
-created => $ANA_CREATED
);
if ($db_file) {
$ana_obj->db_file($db_file);
}
return $ana_obj;
}
sub parse_gff {
my ($self) = @_;
my %featurehash;
my %parent2child;
my %m_type2id;
open(GFF,$self->gff)|| die "Could not open ".$self->gff."\n";
LINE: while() {
# read in the gff file, line by line
chomp;
last if /^\#\#FASTA/;
next if /^\#/;
my @line = split/\t/; # line consists of 9 tab-delimited fields in GFF Version v3.0
# split the line up into the tab-delimited fields and store them
my ($seqid,$source,$type,$start,$end,$score,$strand,$phase,$attrib) = @line;
# # #
# check the strand
# # #
if ($strand eq '-') {
$strand = $line[6] = "-1";
}
if ($strand eq '+') {
$strand = $line[6] = "+1";
}
if ($strand eq '.') {
$strand = $line[6] = "0";
}
if ($strand eq '?') {
$strand = $line[6] = "0";
}
# # #
# attributes
# # #
my $unique_id;
my @attributes = split/;/,$attrib if $attrib;
my $seenID;
# possible tags are: ID, Parent, Name, Dbxref, gbunit, cyto_range etc etc
for (@attributes) { #process all values of an attribute Parent=ID1,ID2,ID3 and write them in an attribute_hash
my ($attrib_name, $attrib_values ) = split /=/;
my @values = split /,/,$attrib_values;
# we expect the ID to come first
if ($attrib_name eq 'ID') {
$unique_id = $values[0];
$seenID = 1;
if (exists($featurehash{$unique_id})) {
warning("Already seen ID $unique_id in line @line");
}
} elsif (!$seenID) {
throw("parsing problem, need ID @line");
}
# add to our featurehash as an array
foreach my $value (@values) {
push @{$featurehash{$unique_id}{$attrib_name}}, $value;
if ($attrib_name eq 'Parent' || $attrib_name eq 'Derives_from') {
# parent --> child
my @parents = @values;
foreach my $parent (@parents) {
push @{$parent2child{$parent}{$unique_id}}, ($attrib_name, $type);
# as in, a protein derives from a transcript
}
}
}
}# for (@attributes) {
push @{$m_type2id{$type}}, $unique_id;
# # #
# hash for this feature
# # #
# add to our featurehash as a scalar
$featurehash{$unique_id}{'seqid'} = $seqid;
$featurehash{$unique_id}{'source'} = $source;
$featurehash{$unique_id}{'type'} = $type;
$featurehash{$unique_id}{'start'} = $start;
$featurehash{$unique_id}{'end'} = $end;
$featurehash{$unique_id}{'score'} = $score;
$featurehash{$unique_id}{'strand'} = $strand;
$featurehash{$unique_id}{'phase'} = $phase;
if ($featurehash{$unique_id}{'type'} eq 'orthologous_to') {
my $new_unique_id = $unique_id."_".$featurehash{$unique_id}{'to_id'}[0];
if (exists $featurehash{$unique_id}{'to_id'}[0]) {
# change the featurehash
foreach my $old_attrib_name (keys %{$featurehash{$unique_id}}) {
$featurehash{$new_unique_id}{$old_attrib_name} = $featurehash{$unique_id}{$old_attrib_name};
} # foreach my $old_attrib_name (keys %{$featurehash{$unique_id}}) {
delete $featurehash{$unique_id};
# change the typeid hash
for (my $x=0; $x < scalar(@{$m_type2id{$type}}); $x++) {
if (${$m_type2id{$type}}[$x] eq $unique_id) {
${$m_type2id{$type}}[$x] = $new_unique_id;
next;
}
}
# change the parent2childhash
foreach my $p (keys %parent2child) {
if (exists $parent2child{$p}{$unique_id}) {
@{$parent2child{$p}{$new_unique_id}} = @{$parent2child{$p}{$unique_id}};
#push @{$parent2child{$p}{$new_unique_id}}, @{$parent2child{$p}{$unique_id}};
delete $parent2child{$p}{$unique_id};
}
}
} else {
throw("Unable to make ID ".$featurehash{$unique_id}{'ID'}[0]." unique");
} # if (exists $featurehash{$unique_id}{'to_id'}[0]) {
} # if ($featurehash{$unique_id}{'type'} eq 'orthologous_to') {
} # LINE: while() {
$self->featurehash(\%featurehash);
$self->parent2child(\%parent2child);
$self->type2id(\%m_type2id);
close GFF;
}
sub parent2child {
my ($self,$hashref) = @_;
$self->{parent2child} = $hashref if $hashref;
return $self->{parent2child};
}
sub featurehash {
my ($self,$hashref) = @_;
$self->{featurehash} = $hashref if $hashref;
return $self->{featurehash};
}
#
# alternative getter/setters
#
################################################################################
=pod
=head3 alternative getter/setters
=head2 slice()
Usage : $obj->slice || $obj->slice($slice)
Function : sets/gets the actual slice on which the features/genes are stored
=cut
sub slice{
my ($self,$slice) = @_;
$self->{'slice'} = $slice if $slice;
return $self->{'slice'};
}
=pod
=head2 db()
Usage : $obj->db() || $obj->db($database_adaptor)
Function : sets/gets the DatabaseAdaptor
=cut
sub db{
my ($self,$db) = @_;
$self->{'db'} = $db if $db;
return $self->{'db'};
}
=pod
=head2 gff()
Usage : $obj->gff() || $obj->gff( $gff_filename)
Function : sets/gets the filename of the current gff-file
=cut
sub gff{
my ($self,$gff)=@_;
if ($gff) {
$self->{gff}=$gff;
$self->throw("gff not found $gff: $!\n") unless (-e $gff);
}
return $self->{gff};
}
=pod
=head2 warn_inconsistency( $warning )
Usage : warn_inconsistency($warning)
Function : prints out warning-statement
=cut
sub warn_inconsistency{
my $error = shift;
print STDERR "\tWARNING: UNEXPECTED INCONSISTENCY IN GFF-FILE !!\t\t$error";
return;
}
=pod
=head2 get_dbxrefs_($id, $xref_type )
Usage : $obj->get_field_attributes($id, $descriptor )
Function : returns the Dbxref-Identifier of a given ID like 'FBgn0033692' for type 'gene'
Returnvalue : Arrayref. to all DBxrefs of a given type
=cut
sub get_dbxrefs {
my ($self,$id, $xref_type) = @_;
# eg. where $id is a Transcript_id and $xref_type is the child_type eg. mRNA, miRNA, pseudogene
my @dbes;
if (exists ${$self->get_features_from_ID($id)}{'Dbxref'}) {
DBXREF: foreach my $dbx (@{$self->get_features_from_ID($id)->{'Dbxref'}}) {
my ($db_name, @primary_id_string) = split /\:/,$dbx;
# split up into db_name and primary_id
my $primary_id = join(':',@primary_id_string);
if ($db_name eq 'if' && $primary_id !~ m/www/) {
# Interactive Fly = IF
$primary_id = "http://www.sdbonline.org/fly".$primary_id;
# eg. http://www.sdbonline.org/fly/cytoskel/lethl2g1.htm
}
if ($db_name && $primary_id) {
print "got xref $db_name with pid $primary_id\n";
# make a db_entry
my $db_entry = Bio::EnsEMBL::DBEntry->new(
-DBNAME => $db_name,
-RELEASE => 1,
-PRIMARY_ID => $primary_id,
-DISPLAY_ID => $primary_id
);
push @dbes, $db_entry;
} else {
print "No xref db_name or primary_id for id $id ";
} # if ($db_name && $primary_id) {
} # DBXREF
} else {
warning("No xrefs for id $id of type $xref_type\n");
@dbes = ();
}
if (exists ${$self->get_features_from_ID($id)}{'Name'}) {
foreach my $syn (@{${$self->get_features_from_ID($id)}{'Name'}}) {
$dbes[0]->add_synonym($syn);
print "Added synonym name $syn to ".$dbes[0]->primary_id."\n";
}
}
if (exists ${$self->get_features_from_ID($id)}{'Alias'}) {
foreach my $syn (@{${$self->get_features_from_ID($id)}{'Alias'}}) {
$dbes[0]->add_synonym($syn);
print "Added synonym alias $syn to ".$dbes[0]->primary_id."\n";
}
}
# at the moment, we could parse out SO and GO too, but we don't
# eg. if (exists ${$self->get_features_from_ID($id)}{'Ontolgy_term'}) {
# get the SO and Go things...
# }
return \@dbes;
}
sub type2id{
my ($self,$type2id_hash) = @_;
$self->{_type2id} = $type2id_hash if $type2id_hash;
return $self->{_type2id};
}
=pod
=head3 getting attributes by their type ('gene')
=head2 get_ids_by_type
Title : get_ids_by_type
Usage : $obj->get_ids_by_type('gene')
Function : returns an Arrayref to all known IDs of a given type (in the scope of the gff)
Arguments : ID
Return-Val : Arrayref
=cut
sub get_ids_by_type{
my ($self,$type) = @_;
# get hash in which TYPES of the features are stored as keys return array-ref to all feature-identifiers of this TYPE
my $idref = [];
if (${$self->type2id}{$type}) {
$idref = ${$self->type2id}{$type}
}
return $idref;
}
1;