Gene documentation.

sub cluster_transcripts {

  my $transcripts = shift;

  my @clusters;

  info("Clustering transcripts");

  foreach my $tr (@$transcripts) {
    my $cl = undef;

    foreach my $c (@clusters) {
      if(is_near($tr->start(), $tr->end(), $tr->strand(), $tr->slice(),
                 $c->{'start'}, $c->{'end'}, $c->{'strand'}, $c->{'slice'})) {

        $cl = $c;
      }
    }

    if($cl) {
      push @{$cl->{'transcripts'}}, $tr;

      $cl->{'end'} =
        ( $tr->end > $cl->{'end'} )     ? $tr->{'end'}   : $cl->{'end'};
      $cl->{'start'} =
        ( $tr->start < $cl->{'start'} ) ? $tr->{'start'} : $cl->{'start'};

    } else {
      $cl = {'start'  => $tr->start(),
             'end'    => $tr->end(),
             'strand' => $tr->strand(),
             'slice'  => $tr->slice(),
             'transcripts' => [$tr]};

      push @clusters, $cl;
    }
  }

  # now cluster clusters

  for(my $i = 0; $i < @clusters; $i++) {
    for(my $j = $i+1; $j < @clusters; $j++) {
      my $c1 = $clusters[$i];
      my $c2 = $clusters[$j];
      if(is_near($c1->{'start'}, $c1->{'end'}, $c1->{'strand'}, $c1->{'slice'},
              $c2->{'start'}, $c2->{'end'}, $c2->{'strand'}, $c2->{'slice'})) {
        # merge clusters and take one of them out of the list
        splice(@clusters, $j, 1);

        $c1->{'start'} =
          ($c1->{'start'} < $c2->{'start'}) ? $c1->{'start'} : $c2->{'start'};

        $c1->{'end'} =
          ($c1->{'end'} > $c2->{'end'}) ? $c1->{'end'} : $c2->{'end'};

        push @{$c1->{'transcripts'}}, @{$c2->{'transcripts'}};

        # start over again
        $i = -1;
        $j = -1;
      }
    }
  }

  # sum the nucleotide length and amino acid lengths of each of the
  # transcripts in a given cluster
  foreach my $cl (@clusters) {
    $cl->{'nt_len'} = 0;
    $cl->{'aa_len'} = 0;

    foreach my $tr (@{$cl->{'transcripts'}}) {
      $cl->{'nt_len'} += length($tr->spliced_seq());
      if($tr->translation) {
        $cl->{'aa_len'} += length($tr->translate->seq());
      }
    }
  }


  return\@ clusters;

}

sub compact_transcripts {

  my $transcripts = shift;

  my %unique_hash;
  my @unique_list;

  info("Compacting transcripts");

  foreach my $transcript (@$transcripts) {
    my $hashkey = 'exons:';

    foreach my $exon (@{$transcript->get_all_Exons}) {
      $hashkey .= '('.$exon->hashkey.')';
    }

    # include the translation in the hashkey because sometimes the
    # same transcripts may end up with different translations
    # (this is especially possible when split transcripts become partially the
    # same)
    if($transcript->translation) {
      $hashkey .= 'translation:' .
        $transcript->translation->start() . '-' .
        $transcript->translation->end() . '(' .
        $transcript->translation->start_Exon->hashkey() . ')('.
        $transcript->translation->end_Exon->hashkey() . ')';
    }


    $unique_hash{$hashkey} ||= [];
    push @{$unique_hash{$hashkey}}, $transcript;
  }

  # merge xrefs from duplicated transcripts

  foreach my $key (keys %unique_hash) {
    # choose one of the duplicates arbitrarily
    my $duplicates = $unique_hash{$key};
    my $transcript = pop(@$duplicates);

    # merge all of the xrefs and add them to the one transcript that will
    # be kept
    merge_xrefs($transcript, $duplicates);
    push @unique_list, $transcript;
  }

  return\@ unique_list;

}

sub create_ensembl_xrefs {

  my $chimp_transcripts = shift;

  foreach my $transcript (@$chimp_transcripts) {
    my $dbe = Bio::EnsEMBL::DBEntry->new
      (-primary_id => $transcript->stable_id(),
       -version    => $transcript->version(),
       -dbname     => 'Ens_Hs_transcript',
       -release    => 1,
       -display_id => $transcript->stable_id());
    $transcript->add_DBEntry($dbe);

    if($transcript->translation()) {
      $dbe = Bio::EnsEMBL::DBEntry->new
        (-primary_id => $transcript->translation->stable_id(),
         -version    => $transcript->translation->version(),
         -dbname     => 'Ens_Hs_translation',
         -release    => 1,
         -display_id => $transcript->translation->stable_id());
      $transcript->translation->add_DBEntry($dbe);
    }
  }
}

1;

}

sub generate_stable_id {

  my $object = shift;

  my $SPECIES_PREFIX = 'PTR';
  my $PAD            = 18;

  my $type_prefix;
  my $num;

  if($object->isa('Bio::EnsEMBL::Exon')) {
    $type_prefix = 'E';
    $EXON_NUM       ||= 0;
    $num = ++$EXON_NUM;
  } elsif($object->isa('Bio::EnsEMBL::Transcript')) {
    $type_prefix = 'T';
    $TRANSCRIPT_NUM ||= 0;
    $num = ++$TRANSCRIPT_NUM;
  } elsif($object->isa('Bio::EnsEMBL::Gene')) {
    $type_prefix = 'G';
    $GENE_NUM       ||= 0;
    $num = ++$GENE_NUM;
  } elsif($object->isa('Bio::EnsEMBL::Translation')) {
    $type_prefix = 'P';
    $TRANSLATION_NUM ||= 0;
    $num = ++$TRANSLATION_NUM;
  } else {
    throw('Unknown object type '.ref($object).'. Cannot create stable_id.');
  }

  my $prefix = "ENS${SPECIES_PREFIX}${type_prefix}";

  my $pad = $PAD - length($prefix) - length($num);

  $object->version(1);
  $object->stable_id($prefix . ('0'x$pad) . $num);
}





###############################################################################
# compact_transcripts
#
# Given a set of transcripts this function removes duplicate transcripts and
# returns the unique set
#
###############################################################################

}

sub merge_xrefs {

  my $kept_transcript = shift;
  my $duplicates      = shift;

  return if(!@$duplicates);

  info('Merging xrefs from duplicate transcripts');

  # construct a set of xrefs the transcript to keep already has

  my %existing_tl_xrefs;
  my %existing_tr_xrefs;
  foreach my $xref (@{$kept_transcript->get_all_DBEntries()}) {
    $existing_tr_xrefs{$xref->dbname().':'.$xref->primary_id()} = 1;
  }

  if($kept_transcript->translation()) {
    foreach my $xref (@{$kept_transcript->translation->get_all_DBEntries()}) {
      $existing_tl_xrefs{$xref->dbname().':'.$xref->primary_id()} = 1;
    }
  }

  # collect a list of additional xrefs which should be kept

  my %tl_xrefs;
  my %tr_xrefs;
  foreach my $dup (@$duplicates) {
    foreach my $xref (@{$dup->get_all_DBEntries()}) {
      my $key = $xref->dbname().':'.$xref->primary_id();
      $tr_xrefs{$key} = $xref if(!$existing_tr_xrefs{$key});
    }

    if($dup->translation()) {
      foreach my $xref (@{$dup->translation->get_all_DBEntries()}) {
        my $key = $xref->dbname().':'.$xref->primary_id();
        $tl_xrefs{$key} = $xref if(!$existing_tl_xrefs{$key});
      }
    }
  }

  # add any new xrefs which were found to the kept translation

  foreach my $xref (values %tr_xrefs) {
    $kept_transcript->add_DBEntry($xref);
  }

  my @new_tl_xrefs = values(%tl_xrefs);
  my $tl = $kept_transcript->translation();

  if(@new_tl_xrefs && !$tl) {
    throw("Some duplicate transcripts have translations, and others do not?");
    return;
  }

  foreach my $xref (@new_tl_xrefs) {
    $tl->add_DBEntry($xref);
  }

  return;
}


###############################################################################
# cluster_transcripts
#
# Given a set of transcripts this function will cluster them into groups
# which are near to each other. Note this may be more efficient to use
# a sorted list, but for the small size of the clusters it should not matter.
#
##############################################################################

}

sub store_gene {

  my $db = shift;
  my $hum_gene = shift; # human gene
  my $ctranscripts = shift; # chimp transcripts

  my $MIN_AA_LEN = 15;
  my $MIN_NT_LEN = 600;

  my $analysis = $db->get_AnalysisAdaptor->fetch_by_logic_name('ensembl');

  # Look at the translations and convert any transcripts with stop codons
  # into pseudogenes
  foreach my $ct (@$ctranscripts) {
    if($ct->translation && $ct->translate->seq() =~ /\*/) {
      $ct->translation(undef);
    }
  }

  # create xrefs to reference the human transcripts and translations
  create_ensembl_xrefs($ctranscripts);

  # transfer xrefs from human transcripts/translations
  transfer_xrefs($hum_gene, $ctranscripts);

  # compact duplicate transcripts into the same transcripts
  $ctranscripts = compact_transcripts($ctranscripts);

  # group all close together transcripts on the same strand into clusters
  my $clusters = cluster_transcripts($ctranscripts);

  my $gene_adaptor = $db->get_GeneAdaptor();

  foreach my $cluster (@$clusters) {

    # keep genes only if there is a minimum amount of nucleotide
    # OR amino acid sequence in transcripts in the same region
    if($cluster->{'nt_len'} < $MIN_NT_LEN &&
       $cluster->{'aa_len'} < $MIN_AA_LEN) {
      next;
    }

    # one gene for each cluster
    my $cgene = Bio::EnsEMBL::Gene->new();

    # add reference to the original human gene
    $cgene->add_DBEntry(Bio::EnsEMBL::DBEntry->new
                        (-primary_id => $hum_gene->stable_id(),
                         -version    => $hum_gene->version(),
                         -dbname     => 'Ens_Hs_gene',
                         -release    => 1,
                         -display_id => $hum_gene->stable_id()));

    generate_stable_id($cgene);

    # rename transcripts and add to gene
    foreach my $ctrans (@{$cluster->{'transcripts'}}) {
      generate_stable_id($ctrans);

      # rename translation
      if($ctrans->translation) {
        generate_stable_id($ctrans->translation);
      }

      $cgene->add_Transcript($ctrans);
    }

    # rename all of the exons
    # but watch out because duplicate exons will be merged and we do not
    # want to generate multiple names
    my %ex_stable_ids;
    foreach my $ex (@{$cgene->get_all_Exons()}) {
      if($ex_stable_ids{$ex->hashkey()}) {
        $ex->stable_id($ex_stable_ids{$ex->hashkey()});
      } else {
        generate_stable_id($ex);
        $ex_stable_ids{$ex->hashkey()} = $ex->stable_id();
      }
    }


    foreach my $gx (@{$hum_gene->get_all_DBEntries}) {
      $cgene->add_DBEntry($gx) if($KEEP_XREF{uc($gx->dbname())});
    }

    if($hum_gene->display_xref &&
       $KEEP_XREF{uc($hum_gene->display_xref->dbname)}){
      $cgene->display_xref($hum_gene->display_xref);
    }

    # set the analysis on the gene object
    $cgene->analysis($analysis);

    my $name = $cgene->stable_id();

    $name .= '/'.$cgene->display_xref->display_id() if($cgene->display_xref());

    $cgene->type('ensembl');

    # store the bloody thing
    print STDERR "Storing gene: $name\n";
    $gene_adaptor->store($cgene);
  }

  return;
}



###############################################################################
# generate_stable_id
#
# Generates a stable_id for a gene, transcript, translation or exon and sets
# it on the object.
#
###############################################################################


my ($TRANSCRIPT_NUM, $GENE_NUM, $EXON_NUM, $TRANSLATION_NUM);

}

sub transfer_xrefs {

  my $hum_gene = shift;
  my $chimp_transcripts = shift;

  my %chimp_transcripts;
  my %chimp_translations;

  foreach my $tr (@$chimp_transcripts) {
    $chimp_transcripts{$tr->stable_id()} ||= [];
    push @{$chimp_transcripts{$tr->stable_id()}}, $tr;

    my $tl = $tr->translation();

    if($tl) {
      $chimp_translations{$tl->stable_id()} ||= [];
      push @{$chimp_translations{$tl->stable_id()}}, $tl;
    }
  }

  foreach my $tr (@{$hum_gene->get_all_Transcripts()}) {
    foreach my $chimp_tr (@{$chimp_transcripts{$tr->stable_id}}) {
      foreach my $xref (@{$tr->get_all_DBEntries}) {
        $chimp_tr->add_DBEntry($xref) if($KEEP_XREF{uc($xref->dbname())});
      }

      if($tr->display_xref() && $KEEP_XREF{uc($tr->display_xref->dbname)}) {
        $chimp_tr->display_xref($tr->display_xref);
      }
    }

    my $tl = $tr->translation();
    if($tl) {
      foreach my $xref (@{$tl->get_all_DBEntries}) {
        foreach my $chimp_tl (@{$chimp_translations{$tl->stable_id()}}) {
          $chimp_tl->add_DBEntry($xref) if($KEEP_XREF{uc($xref->dbname())});
        }
      }
    }
  }

  return;

}

cluster_transcripts	No description	Code
compact_transcripts	No description	Code
create_ensembl_xrefs	No description	Code
generate_stable_id	No description	Code
is_near	No description	Code
merge_xrefs	No description	Code
store_gene	No description	Code
transfer_xrefs	No description	Code