FlyBaseGff documentation.

  Title       : create_analysis
  Usage       : $obj->create_analysis ( $analysis_adaptor, $logic_name)
  Function    : 1st creates an analysis-object of the values in the FlyBaseConf-file and
                stores the analysis by using the submitted AnalysisAdaptor (normally created out of the registry-file)
  Arguments   : AnalysisAdaptor-Object, $type, $logic_name
  Return-Val  : none

db()

  Usage       : $obj->db() || $obj->db($database_adaptor)
  Function    : sets/gets the DatabaseAdaptor

db_entry

  Title       : db_entry 
  Usage       :
  Function    : for xrefs 
  Arguments   :
  Return-Val  :

get_ids_by_type

  Title       : get_ids_by_type
  Usage       : $obj->get_ids_by_type('gene')
  Function    : returns an Arrayref to all known IDs of a given type (in the scope of the gff)
  Arguments   : ID
  Return-Val  : Arrayref

gff()

  Usage       : $obj->gff() || $obj->gff( $gff_filename)
  Function    : sets/gets the filename of the current gff-file

new

  Title       : new 
  Usage       : 
  Function    : new GFF object 
  Arguments   : 
  Return-Val  :

prune_Exons

  Arg [1]   : Bio::EnsEMBL::Gene
  Function  : remove duplicate exons between two transcripts
  Returntype: Bio::EnsEMBL::Gene

slice()

  Usage       : $obj->slice || $obj->slice($slice)
  Function    : sets/gets the actual slice on which the features/genes are stored

store_as_gene_object

  Title       :  store_as_gene_object
  Usage       :  store_as_gene_object ('gene', 'mRNA','flybase_gene')
  Function    :  looks in gff for entity of type 'gene' which has childs 'mRNA' and 'exon' and stores them 
                 as gene of type 'flybase_gene' in the database
  Decsription :  Given a slice, parent type (eg. Gene), child type (eg. mRNA, tRNA, pseudogene) 
                 this goes through the gff hash and finds all parents of the given type and 
                 makes them into Genes. If they have associates Transcripts and Exons then 
                 these are added too. They are all stored in the database.
  Arguments   :
  Return-Val  :

store_as_simple_feature

  Title       : store_as_simple_feature
  Usage       : $obj->store_as_simple_feature($analysis_adaptor, $type, $logic_name)
  Function    : 1st creates an analysis-object using the submitted analysis-adaptor and 
                stores all features of gff of class $type (indicated by 3rd column in gff) as SimpleFeature of a given Analysis
  Arguments   : AnalysisAdaptor-Object, $type, $logic_name
  Return-Val  : none

warn_inconsistency( $warning )

  Usage       : warn_inconsistency($warning)
  Function    : prints out warning-statement

Methods code

add_Translation

description

sub add_Translation {

  my ($self, $transcript) = @_;
  my @protein_ids;

  foreach my $child_id (keys %{$self->get_childrenID_from_parentID($transcript->stable_id)}) {
    if (${$self->get_features_from_ID($child_id)}{'type'} eq 'protein') {
      push @protein_ids, $child_id;
      print "   Found protein for transcript ".$transcript->stable_id." (strand ".$transcript->strand.") : ";
    }
  }
  if (scalar(@protein_ids) > 1) {
    print "WARNING: More than one protein for transcript ".$transcript->stable_id." @protein_ids";
  }
  if (defined $protein_ids[0]) {
    print $protein_ids[0]."\n";

    # get 3,4 and 6th attribute of gff-line
    my $cds_start = ${$self->get_features_from_ID($protein_ids[0])}{'start'};
    my $cds_end = ${$self->get_features_from_ID($protein_ids[0])}{'end'};
    my $cds_strand = ${$self->get_features_from_ID($protein_ids[0])}{'strand'};
    # add Translatio to transcript
    print "Adding tln with protein ".$protein_ids[0]." having start $cds_start end $cds_end strand $cds_strand\n";
    $transcript = add_TranslationObject($protein_ids[0], $transcript, $cds_start,$cds_end,$cds_strand);
    $transcript =  $self->setExonPhases($transcript);
    # set phases of exons
  } else {
    print "No protein for transcript ".$transcript->stable_id." and phases not set\n";
  }
  return $transcript;

}

create_analysis

sub create_analysis {

  my ($self,$logic_name, $db_file) = @_; 

  my $ana_obj =Bio::EnsEMBL::Analysis->new(
                                           -logic_name =>$logic_name,
                                           -db => $ANA_DB,
                                           -db_version => $ANA_DB_VERSION,
                                           -db_file => $ANA_DB_FILE,
                                           -program => $ANA_PROGRAM,
                                           -program_version => $ANA_PROGRAM_VERSION,
                                           -program_file => $ANA_PROGRAM_FILE,
                                           -gff_source => $ANA_GFF_SOURCE,
                                           -gff_feature => $ANA_GFF_FEATURE,
                                           -module => $ANA_MODULE,
                                           -module_version => $ANA_MODULE_VERSION,
                                           -parameters => $ANA_PARAMETERS,
                                           -created => $ANA_CREATED
                                          );
  if ($db_file) {
    $ana_obj->db_file($db_file);
  }
  return $ana_obj;

}

sub db {

  my ($self,$db) = @_;
  $self->{'db'} = $db if $db;
  return $self->{'db'};

}

db_entry

sub db_entry {

  my ($self, $tr,$db_entry) = @_;
  if($db_entry){
    $ {$self->{db_entry}} {$tr}=$db_entry;
  }
  return $ { $self->{db_entry}}{$tr};

}

featurehash

description

sub featurehash {

  my ($self,$hashref) = @_;
  $self->{featurehash} = $hashref if $hashref;
  return $self->{featurehash};
}


#
#  alternative getter/setters
#
 ################################################################################

}

get_childrenID_from_parentID

description

sub get_childrenID_from_parentID {

   my ($self,$id) = @_;
   my $children_ref = {}; 
   if (${$self->parent2child}{$id}) {
     $children_ref = ${$self->parent2child}{$id};
   }
   return $children_ref;

}

get_dbxrefs

description

sub get_dbxrefs {

  my ($self,$id, $xref_type) = @_;
  # eg. where $id is a Transcript_id and $xref_type is the child_type eg. mRNA, miRNA, pseudogene
  my @dbes; 

  if (exists ${$self->get_features_from_ID($id)}{'Dbxref'}) {
    DBXREF: foreach my $dbx (@{$self->get_features_from_ID($id)->{'Dbxref'}}) {
      my ($db_name, @primary_id_string) = split /\:/,$dbx;
      # split up into db_name and primary_id
      my $primary_id = join(':',@primary_id_string);
      if ($db_name eq 'if' && $primary_id !~ m/www/) {
        # Interactive Fly = IF
        $primary_id = "http://www.sdbonline.org/fly".$primary_id;
        # eg. http://www.sdbonline.org/fly/cytoskel/lethl2g1.htm
      }
      if ($db_name && $primary_id) {
        print "got xref $db_name with pid $primary_id\n";
        # make a db_entry
        my $db_entry = Bio::EnsEMBL::DBEntry->new(
                                              -DBNAME  => $db_name,
                                              -RELEASE => 1,
                                              -PRIMARY_ID => $primary_id,
                                              -DISPLAY_ID => $primary_id
                                             );

        push @dbes, $db_entry;
      } else {
        print "No xref db_name or primary_id for id $id ";
      } # if ($db_name && $primary_id) {
    } # DBXREF
  } else {
    warning("No xrefs for id $id of type $xref_type\n");
    @dbes = ();
  }
  if (exists ${$self->get_features_from_ID($id)}{'Name'}) { 
    foreach my $syn (@{${$self->get_features_from_ID($id)}{'Name'}}) {
      $dbes[0]->add_synonym($syn);
      print "Added synonym name $syn to ".$dbes[0]->primary_id."\n";
    }
  }
  if (exists ${$self->get_features_from_ID($id)}{'Alias'}) {
    foreach my $syn (@{${$self->get_features_from_ID($id)}{'Alias'}}) {
      $dbes[0]->add_synonym($syn);
      print "Added synonym alias $syn to ".$dbes[0]->primary_id."\n";
    }
  }

  # at the moment, we could parse out SO and GO too, but we don't 
  # eg. if (exists ${$self->get_features_from_ID($id)}{'Ontolgy_term'}) {
  # get the SO and Go things...
  # }
  return\@ dbes;

}

get_features_from_ID

description

sub get_features_from_ID {

   my ($self,$id) = @_;
   return ${$self->featurehash}{$id};
}



#
# Methods used for storing attributes, genes, SimpleFeatures
 ################################################################################

}

get_ids_by_type

sub get_ids_by_type {

  my ($self,$type) = @_;
  # get hash in which TYPES of the features are stored as keys return array-ref to all feature-identifiers of this TYPE
  my $idref = [];
  if (${$self->type2id}{$type}) {
    $idref = ${$self->type2id}{$type}
  }
  return $idref;
}
1;

}

gff

sub gff {

  my ($self,$gff)=@_;
  if ($gff) {
    $self->{gff}=$gff;
    $self->throw("gff not found $gff: $!\n") unless (-e $gff);
  }
  return $self->{gff};

}

make_Exons

description

sub make_Exons {

  my ($self, $transcript, $exon_ids, $time ) = @_;
  my @exons;

  # Loop thru exons and Make EnsEMBL objects
  EXON: foreach my $exon_id (@$exon_ids) {
    if (${$self->get_features_from_ID($exon_id)}{'type'} ne 'exon') {
      next EXON;
    }
    my $ex = Bio::EnsEMBL::Exon->new(
                                 -START     => ${$self->get_features_from_ID($exon_id)}{'start'}, 
                                 -END       => ${$self->get_features_from_ID($exon_id)}{'end'}, 
                                 -STRAND    => ${$self->get_features_from_ID($exon_id)}{'strand'}, 
                                 -PHASE     => 0,
                                 -END_PHASE => 0,
                                 -SLICE     => $self->slice,
                                 -ANALYSIS  => $self->create_analysis($LOGIC_NAME_EXON),
                                 -STABLE_ID => $exon_id, 
                                 -CREATED_DATE => $time,
                                 -MODIFIED_DATE => $time,
                                 -VERSION   => $ATTRIBUTE_VERSION
                                );
    print "  For Transcript ".$transcript->stable_id.", have Exon ".$ex->stable_id." with ".
          "start ".$ex->start." end ".$ex->end." strand ".$ex->strand."\n" ;
    # add Exon to Transcript
    $transcript->add_Exon($ex);
  }
  return $transcript;

}

make_Transcripts

description

sub make_Transcripts {

  my ($self, $tref, $biotype, $time) = @_;
  my @transcript_ids = @{$tref};
  my @transcripts;

  for my $transcript_id (@transcript_ids) {
    # make an EnsEMBL Transcript
    my $new_transcript = new Bio::EnsEMBL::Transcript(
                                                       -STABLE_ID => $transcript_id,
                                                       -VERSION => $ATTRIBUTE_VERSION,
                                                       -TYPE => $biotype, 
                                                       -CREATED_DATE => $time,
                                                       -MODIFIED_DATE => $time,
                                                       -ANALYSIS  => $self->create_analysis($LOGIC_NAME_EXON),
                                                      );
#    print "   Have new Transcript $transcript_id of biotype $biotype. Getting xrefs...\n";
#
#    # It takes the first Dbxref entry and stores it
#    # my @dbx = @{${$self->featurehash}{$transcript_id}{'Dbxref'}};
#    my @dbxrefs = @{$self->get_features_from_ID($transcript_id)->{'Dbxref'}};
#    if (scalar @dbxrefs) {
#      print "Found xrefs @dbxrefs";
#      # not sure if we should use all of them or just the first
#      $self->db_entry($new_transcript, ${$self->get_features_from_ID($transcript_id)}{'Dbxref'}[0]);
#    } else {
#      print "WARNING: No db entry transcript $transcript_id\n";
#    }

    # # #
    # Exons
    # # #
    my @exon_ids = keys %{$self->get_childrenID_from_parentID($transcript_id)};
    if (scalar(@exon_ids) == 0) {
      print "\n   x No children for transcript $transcript_id, must be non-coding\n";
    } else {
      print "\nTranscript $transcript_id has ".@exon_ids." possible  exons\n";
      my $transcript = make_Exons($self, $new_transcript,\@ exon_ids, $time);
      my $transcript_with_tln = add_Translation($self, $transcript);  
      push @transcripts, $transcript_with_tln;
    }
  } # for my $transcript_id (@transcript_ids) {
  return\@ transcripts;

}

new

sub new {

  my ($class,@args) = @_;
  my $self={};
  bless $self,$class;

  my ($db,$gff,$slice) = rearrange(
                            [qw(
                                DB
                                GFF
                                SLICE
                               )],@args);

  $self->db($db);
  $self->gff($gff);
  $self->slice($slice);
  $self->{_gene}=[];
  $self->{exons}={};
  $self->{db_entry}={};
  $self->parse_gff();
  print "\n* * *\ngff $gff parsed\n* * *\n ";
  return $self;

}

parent2child

description

sub parent2child {

  my ($self,$hashref) = @_;
  $self->{parent2child} = $hashref if $hashref;
  return $self->{parent2child};

}

parse_gff

description

sub parse_gff {

  my ($self) = @_;
  my %featurehash;
  my %parent2child;
  my %m_type2id;

  open(GFF,$self->gff)|| die "Could not open ".$self->gff."\n";
  LINE: while(<GFF>) {
    # read in the gff file, line by line
    chomp;
    last if /^\#\#FASTA/;
    next if /^\#/;
    my @line = split/\t/;    # line consists of 9 tab-delimited fields in GFF Version v3.0

    # split the line up into the tab-delimited fields and store them
    my ($seqid,$source,$type,$start,$end,$score,$strand,$phase,$attrib) = @line;
         
    # # #
    # check the strand
    # # #
    if ($strand eq '-') {
      $strand = $line[6] = "-1";
    }
    if ($strand eq '+') {
      $strand = $line[6] = "+1";
    }
    if ($strand eq '.') {
      $strand = $line[6] = "0";
    }
    if ($strand eq '?') {
      $strand = $line[6] = "0";
    }

    # # #
    # attributes
    # # #
    my $unique_id;
    my @attributes = split/;/,$attrib if $attrib;
    my $seenID;
    # possible tags are: ID, Parent, Name, Dbxref, gbunit, cyto_range etc etc
    for (@attributes) {     #process all values of an attribute Parent=ID1,ID2,ID3 and write them in an attribute_hash
      my ($attrib_name, $attrib_values ) = split /=/;
      my @values = split /,/,$attrib_values;
      # we expect the ID to come first
      if ($attrib_name eq 'ID') {
        $unique_id = $values[0];
        $seenID = 1;
        if (exists($featurehash{$unique_id})) {
          warning("Already seen ID $unique_id in line @line");
        }
      } elsif (!$seenID) {
        throw("parsing problem, need ID @line");
      }
      
      # add to our featurehash as an array
      foreach my $value (@values) {
        push @{$featurehash{$unique_id}{$attrib_name}}, $value;
        if ($attrib_name eq 'Parent' || $attrib_name eq 'Derives_from') {
          # parent --> child
          my @parents = @values;
          foreach my $parent (@parents) {
            push @{$parent2child{$parent}{$unique_id}}, ($attrib_name, $type);
            # as in, a protein derives from a transcript
          }
        }
      }
    }# for (@attributes) { 
    push @{$m_type2id{$type}}, $unique_id;
    # # # 
    # hash for this feature
    # # #
    # add to our featurehash as a scalar
    $featurehash{$unique_id}{'seqid'} = $seqid;
    $featurehash{$unique_id}{'source'} = $source;
    $featurehash{$unique_id}{'type'} = $type;
    $featurehash{$unique_id}{'start'} = $start;
    $featurehash{$unique_id}{'end'} = $end;
    $featurehash{$unique_id}{'score'} = $score;
    $featurehash{$unique_id}{'strand'} = $strand;
    $featurehash{$unique_id}{'phase'} = $phase;

    if ($featurehash{$unique_id}{'type'} eq 'orthologous_to') {
      my $new_unique_id = $unique_id."_".$featurehash{$unique_id}{'to_id'}[0];
      if (exists $featurehash{$unique_id}{'to_id'}[0]) {
        # change the featurehash
        foreach my $old_attrib_name (keys %{$featurehash{$unique_id}}) {
            $featurehash{$new_unique_id}{$old_attrib_name} = $featurehash{$unique_id}{$old_attrib_name};
        } # foreach my $old_attrib_name (keys %{$featurehash{$unique_id}}) {
        delete $featurehash{$unique_id};
        # change the typeid hash
        for (my $x=0; $x < scalar(@{$m_type2id{$type}}); $x++) {
          if (${$m_type2id{$type}}[$x] eq $unique_id) {
            ${$m_type2id{$type}}[$x] = $new_unique_id;
            next;
          }
        } 
        # change the parent2childhash
        foreach my $p (keys %parent2child) {
          if (exists $parent2child{$p}{$unique_id}) {
            @{$parent2child{$p}{$new_unique_id}} = @{$parent2child{$p}{$unique_id}};
            #push @{$parent2child{$p}{$new_unique_id}}, @{$parent2child{$p}{$unique_id}};
            delete $parent2child{$p}{$unique_id};
          }
        }
      } else {
        throw("Unable to make ID ".$featurehash{$unique_id}{'ID'}[0]." unique");
      } # if (exists $featurehash{$unique_id}{'to_id'}[0]) {
    } # if ($featurehash{$unique_id}{'type'} eq 'orthologous_to') {
  } # LINE: while(<GFF>) {

  $self->featurehash(\%featurehash);
  $self->parent2child(\%parent2child);
  $self->type2id(\%m_type2id);
  close GFF;

}

prune_Exons

sub prune_Exons {

  my ($gene) = @_;
  my @unique_Exons;

  # keep track of all unique exons found so far to avoid making duplicates
  # need to be very careful about translation->start_Exon and translation->end_Exon

  foreach my $tran (@{$gene->get_all_Transcripts}) {
    my @newexons;
    foreach my $exon (@{$tran->get_all_Exons}) {
      my $found;
      #always empty
    UNI:foreach my $uni (@unique_Exons) {
        if ($uni->start  == $exon->start  &&
            $uni->end    == $exon->end    &&
            $uni->strand == $exon->strand &&
            $uni->phase  == $exon->phase  &&
            $uni->end_phase == $exon->end_phase
           ) {
          $found = $uni;
          last UNI;
        }
      }
      if (defined($found)) {
        push(@newexons,$found);
        if ($exon == $tran->translation->start_Exon) {
          $tran->translation->start_Exon($found);
        }
        if ($exon == $tran->translation->end_Exon) {
          $tran->translation->end_Exon($found);
        }
      } else {
        push(@newexons,$exon);
        push(@unique_Exons, $exon);
      }
    }
    $tran->flush_Exons;
    foreach my $exon (@newexons) {
      $tran->add_Exon($exon);
    }
  }
  return $gene;

}

rename_exons

description

sub rename_exons {

  my ($gene_id, $all_new_transcripts) = @_;
  my @transcripts = @$all_new_transcripts;
  my %all_processed_exons;

  # renaming of exons if the have the same coordiantes but different phases
  my (%hk_name,%exoncheck);
  # check for exons which have same name but different phases 
  # by getting all exon-hashkeys of one flybase-exon-identifer CG100: hk1, hk2,..
  foreach my $trans ( @transcripts ) {
    print "Finding exons for transcript $trans with stable_id ".$trans->stable_id."\n";
    foreach my $e ( @{$trans->get_all_Exons()} ) {
      if (exists $exoncheck{$e->stable_id}) {
        ${$exoncheck{$e->stable_id}}{$e->hashkey} = $e->stable_id;
        # $e->hashkey returns a unique hashkey that can be used to uniquely identify
        # this exon.  Exons are considered to be identical if they share
        # the same seq_region, start, end, strand, phase, end_phase.
        # Note that this will consider two exons on different slices
        # to be different, even if they actually are not. 
        # Returntype : string formatted as slice_name-start-end-strand-phase-end_phase
      }else{
        #warning("No stable_id for exon ".$e->hashkey."\n");
        ${$exoncheck{$e->stable_id}}{$e->hashkey} = $e->stable_id;
      }
      $hk_name{$e->hashkey}= $e->stable_id;
      # Get slice_name, start, end, strand, phase, end_phase
      my @line = split /\-/,$e->hashkey;
      ${$all_processed_exons{"$line[0]-$line[1]-$line[2]"}}{$gene_id}=();
    }
  }
  
  # find exons which have the same id (CG923:1) but different hashkeys
  my @multiple_hashkey;
  for (keys %exoncheck) {
    if (scalar (keys %{$exoncheck{$_}} )  > 1) {
      # more than one hashkey for this exons_stable_id
      for my $hashkey (keys %{$exoncheck{$_}}) {
        push @multiple_hashkey, $hashkey;
      }
    }
  }
  
  # make new names for exon with same bounds but different phases
  my %same_bounds;
  for my $mh(@multiple_hashkey) {
    my @line = split /\-/,$mh;
    ${$same_bounds{"$line[0]-$line[1]-$line[2]"}}{$mh}=();
  }
  
  # construct new name for each of the diffrent hahskeys
  for(keys %same_bounds) {
    my %exon_hk = %{$same_bounds{$_}};
    my $cnt = "A";
    for my $ehk(keys %exon_hk) {
      $hk_name{$ehk} = $hk_name{$ehk}."-$cnt";
      $cnt++;
    }
  }
  
  # rename the exons with diff. hashkeys
  for my $mh(@multiple_hashkey) {
    foreach my $trans ( @transcripts ) {
      foreach my $e ( @{$trans->get_all_Exons} ) {
        # change stable_id
        if ($e->hashkey eq $mh) {
          $e->stable_id($hk_name{$mh});
        }
      }
    }
  }
  
  # renaming exons which have different phases and which are shared by different genes
  foreach my $trans ( @transcripts ) {
    foreach my $e ( @{$trans->get_all_Exons} ) {
      # rename exons which are shared by different genes
      my @line = split /\-/,$e->hashkey; # get "root" of exon-hashkey without phase and strand-information
      # get all unique genes which share this exon
      my %genes_of_exon = %{$all_processed_exons{"$line[0]-$line[1]-$line[2]"} };
      for my $gid (keys %genes_of_exon) {
        if ($gid ne $gene_id) {
          # if the exon is shared by another gene than the actual one rename the exon
          # processed_gene : CG233
          # name of exon CG100:4
          # new name of exon: CG100:4
          my $new_name = $e->stable_id . "--" . $gene_id ;
          $e->stable_id ($new_name);
        }
      }
    }
  }
  return\@ transcripts;

}

slice

sub slice {

  my ($self,$slice) = @_;
  $self->{'slice'} = $slice if $slice;
  return $self->{'slice'};

}

store_as_gene_object

sub store_as_gene_object {

  my ($self, $parent_type, $child_type, $ensembl_gene_type) = @_;
  my %all_processed_exons;

  print "Parent feature_type=$parent_type and Child_feature_type=$child_type\n";

  # test if $parent_type is annotated in gff
  my @gene_ids = @{$self->get_ids_by_type($parent_type)};
  if (!@gene_ids || scalar(@gene_ids) < 1) {
    warning("No parent type '$parent_type' found in gff file ".$self->gff()." (child type $child_type)\n");
    return;
  } else {   
  # Get all gene ids for this type (gene) 
    print "Have ".scalar(@gene_ids)." genes of parent type $parent_type\n";
  }

  # want to timestamp the stable_ids
  my $time = time();

 GENE:  foreach my $gene_id ( @gene_ids ) {
    print "Looking at gene $gene_id\n";
    # note that genes with no transcripts are not stored
    # eg. mt:ori gene     

    # # #
    # Transcripts
    # # #
    # start looking for transcripts of this parent gene
    my $all_new_transcripts;
    # get a list of child ids for the parent
    # ie. get a list of transcript IDs for the Gene
    my @transcript_ids = keys %{$self->get_childrenID_from_parentID($gene_id)};
    my @keep;
    if (scalar(@transcript_ids) == 0) {
      # there are no children of this type for the parent
      print "   x WARNING: No child of type $child_type for $parent_type with id $gene_id\n";
      # make a gene without transcripts?
      next GENE;
    } else {
      # the parent has children of type $child_type
      # Loop thru the alternate transcripts
      foreach my $t (@transcript_ids) {
        if (${$self->get_childrenID_from_parentID($gene_id)}{$t}[1] eq $child_type) {
          push @keep, $t;
        }
      }
      if (scalar(@keep) == 0) {
        print "   x WARNING: No child of type $child_type for $parent_type with id $gene_id\n";
        next GENE;
      } else {
        print "\n* * *\n>> Doing gene $gene_id\n * * *\n";
        $all_new_transcripts = make_Transcripts($self,\@ keep, $child_type, $time);      
      }
    } # if (scalar(@transcript_ids) == 0) {  

    print "Renaming exons\n";
    # Rename exons where necessary
    my ($rt) = rename_exons($gene_id, $all_new_transcripts);
    my @renamed_transcripts = @{$rt};
      
    # # #
    # Make Genes
    # # #
    # got all alternative transcripts and make genes
    my $gene = Bio::EnsEMBL::Gene->new(
                                     -START     => ${$self->get_features_from_ID($gene_id)}{'start'}, 
                                     -END       => ${$self->get_features_from_ID($gene_id)}{'end'}, 
                                     -STRAND    => ${$self->get_features_from_ID($gene_id)}{'strand'}, 
                                     -SLICE     => $self->slice,
                                     -ANALYSIS  => $self->create_analysis($LOGIC_NAME_EXON),
                                     -STABLE_ID => $gene_id,
                                     -VERSION   => $ATTRIBUTE_VERSION,
	                             -TYPE      => $ensembl_gene_type,
                                     -CREATED_DATE => $time,
                                     -MODIFIED_DATE => $time,
                                     -SOURCE    => ${$self->get_features_from_ID($gene_id)}{'source'}, 
                                    );

    # add Transcripts to Gene
    foreach my $tr (@renamed_transcripts) {
      $gene->add_Transcript($tr);
    }

    # # #
    # Store
    # # #
    # check if we already have an exon-stable-id/gene-stable-id/translation-stable-id/transcript-stable-id for this gene
    print "\nNow trying to store gene $gene_id....." ;
    my $gene_DB_id = $self->db->get_GeneAdaptor->store($gene);
    print "STORED gene $gene_DB_id\n\n " ;
    
#    # Xrefs 
#    foreach my $dbe (@{$self->get_dbxrefs($gene_id, $parent_type)}) {
#      $self->db->get_DBEntryAdaptor->store($dbe,$gene_DB_id, 'Gene');
#      print "  *stored xref with dbname ".$dbe->dbname." for ".ref($gene)." with id $gene_DB_id\n";
#    }

    # reprocess the db_entry for transcripts
    #for my $tr (@renamed_transcripts){
#    for my $tr (@{$gene->get_all_Transcripts}) {
#      foreach my $dbe (@{$self->get_dbxrefs($tr->stable_id, $child_type)}) {
#       # my $dbe = $self->db_entry($tr);
#        if($dbe){
#          $self->db->get_DBEntryAdaptor->store($dbe, $tr->dbID,'Transcript');
#          print "  *stored xref with dbname ".$dbe->dbname." for ".ref($tr)." with id ".$tr->dbID."\n";
#        }
#      }
#      if ( $tr->translation() ) {
#        my $tln = $tr->translation() ;
#        # we have a protein-coding gene and can gets xrefs for the protein
#        foreach my $dbe (@{$self->get_dbxrefs($tln->stable_id, 'protein')}) {
#          if ($dbe) {
#            $self->db->get_DBEntryAdaptor->store($dbe, $tln->dbID,'Translation');
#            print "  *stored xref with dbname ".$dbe->dbname." for ".ref($tln)." with id ".$tln->dbID."\n";
#          }
#        }
#      }
#    }

  } # for my $gene_id ( @{$self->get_ids_by_type($parent_type)} ) {
} # end sub store_as_gene_object {

}

store_as_simple_feature

sub store_as_simple_feature {

  my ($self, $sf_adaptor, $type) = @_;

  # check if there is such a feature in gff
  if ($self->get_ids_by_type($type)) {
    # get all ids of simplefeaturs of a given type
    my @ids_of_given_type = @{$self->get_ids_by_type($type)}; 
    print "\n* * *\n>> Doing type $type\n* * *\n";
    print "Found ".scalar(@ids_of_given_type)." ids of type $type\n";
    # check that these ids are unique... they should be!
    my %unique_ids;
    for (@ids_of_given_type){
      if (!exists $unique_ids{$_}) {
        $unique_ids{$_} = 1;
      } else {
        warning("Non-unique entry of type $type : ID ".$_);
      }
    }
    print "Found ".scalar(keys %unique_ids)." UNIQUE ids of type $type\n";

    # ok , now we get the simple features
    my @simple_features;
    my %analyses;

    SIMPLE: foreach my $unique_id (keys %unique_ids) {                                 # process each simple_feature
      if (${$self->get_features_from_ID($unique_id)}{'type'} eq $type) {
        # make an analysis, if it does not yet exists, from the type 
        # it would be nice to use the source too, but i'm not quite sure how to bring it in to our current db schema
        my $analysis_logic = ${$self->get_features_from_ID($unique_id)}{'type'};
        if (length($analysis_logic) > 40) {
          my $old = $analysis_logic;
          $analysis_logic = substr($analysis_logic,0,40);
          warning("Shortened analysis logic_name from $old to $analysis_logic");
        }

        if (!exists $analyses{$analysis_logic}) {
          $analyses{$analysis_logic} = $self->create_analysis($analysis_logic);
        }

      # my $cds_start = ${$self->get_features_from_ID($protein_ids[0])}{'start'};
        my $score = ${$self->get_features_from_ID($unique_id)}{'score'};
        my $display = ${$self->get_features_from_ID($unique_id)}{'ID'}[0];
        if (${$self->get_features_from_ID($unique_id)}{'Name'}[0]) {
          $display = ${$self->get_features_from_ID($unique_id)}{'Name'}[0];
        }
        # we could also use the name field as a display name but for now the ID is unique and more descriptive
        my $sf = Bio::EnsEMBL::SimpleFeature->new(
                                                    -start    => ${$self->get_features_from_ID($unique_id)}{'start'}, 
                                                    -end      => ${$self->get_features_from_ID($unique_id)}{'end'}, 
                                                    -strand   => ${$self->get_features_from_ID($unique_id)}{'strand'}, 
                                                    -slice    => $self->slice,
                                                    -analysis => $analyses{$analysis_logic},
                                                    -score    => ($score eq '.' ? '0.0' : $score),
                                                    -display_label => $display,
                                                    -dbID     => undef,
                                                    -adaptor  => undef
                                                   );
        push @simple_features, $sf;
      } else {
        next SIMPLE;
      }
    } # end for
    if (scalar(@simple_features > 0)) {    
      $sf_adaptor->store(@simple_features);
      return scalar @simple_features;
    } else {
      return 0; 
    }
  } else {
    warning("Can't find any simple features of type $type");
    return 0;
  }
} # eos

}

type2id

description

sub type2id {

  my ($self,$type2id_hash) = @_;

  $self->{_type2id} = $type2id_hash if $type2id_hash;
  return $self->{_type2id};

}

warn_inconsistency

sub warn_inconsistency {

  my $error = shift;
  print STDERR "\tWARNING: UNEXPECTED INCONSISTENCY IN GFF-FILE !!\t\t$error";
  return;

}

General documentation

get_all_attributes_by_id

  Title       : get_all_attributes_by_id
  Usage       : $obj->get_all_attributes_by_id('ID')
  Function    :  access all stored attributes of a feature-id
  Arguments   : ID
  Return-Val  : arrayref to arrayrefs of attributes of a given ID

storing attributes, genes, SimpleFeatures

alternative getter/setters

get_dbxrefs_($id, $xref_type )

  Usage       : $obj->get_field_attributes($id, $descriptor )
  Function    : returns the Dbxref-Identifier of a given ID like 'FBgn0033692' for type 'gene'
  Returnvalue : Arrayref. to all DBxrefs of a given type

getting attributes by their type ('gene')