Raw content of Gene use strict; use warnings; # # Utility methods for the storing of chimp genes # package Gene; use Bio::EnsEMBL::Utils::Exception qw(info throw warning); use Bio::EnsEMBL::DBEntry; use constant NEAR => 1.5e6; our %KEEP_XREF = ('SWISSPROT' => 1, 'SPTREMBL' => 1, 'HUGO' => 1); ############################################################################### # store gene # # Builds Ensembl genes from the generated chimp transcripts and stores them # in the database. # ############################################################################### sub store_gene { my $db = shift; my $hum_gene = shift; # human gene my $ctranscripts = shift; # chimp transcripts my $MIN_AA_LEN = 15; my $MIN_NT_LEN = 600; my $analysis = $db->get_AnalysisAdaptor->fetch_by_logic_name('ensembl'); # Look at the translations and convert any transcripts with stop codons # into pseudogenes foreach my $ct (@$ctranscripts) { if($ct->translation && $ct->translate->seq() =~ /\*/) { $ct->translation(undef); } } # create xrefs to reference the human transcripts and translations create_ensembl_xrefs($ctranscripts); # transfer xrefs from human transcripts/translations transfer_xrefs($hum_gene, $ctranscripts); # compact duplicate transcripts into the same transcripts $ctranscripts = compact_transcripts($ctranscripts); # group all close together transcripts on the same strand into clusters my $clusters = cluster_transcripts($ctranscripts); my $gene_adaptor = $db->get_GeneAdaptor(); foreach my $cluster (@$clusters) { # keep genes only if there is a minimum amount of nucleotide # OR amino acid sequence in transcripts in the same region if($cluster->{'nt_len'} < $MIN_NT_LEN && $cluster->{'aa_len'} < $MIN_AA_LEN) { next; } # one gene for each cluster my $cgene = Bio::EnsEMBL::Gene->new(); # add reference to the original human gene $cgene->add_DBEntry(Bio::EnsEMBL::DBEntry->new (-primary_id => $hum_gene->stable_id(), -version => $hum_gene->version(), -dbname => 'Ens_Hs_gene', -release => 1, -display_id => $hum_gene->stable_id())); generate_stable_id($cgene); # rename transcripts and add to gene foreach my $ctrans (@{$cluster->{'transcripts'}}) { generate_stable_id($ctrans); # rename translation if($ctrans->translation) { generate_stable_id($ctrans->translation); } $cgene->add_Transcript($ctrans); } # rename all of the exons # but watch out because duplicate exons will be merged and we do not # want to generate multiple names my %ex_stable_ids; foreach my $ex (@{$cgene->get_all_Exons()}) { if($ex_stable_ids{$ex->hashkey()}) { $ex->stable_id($ex_stable_ids{$ex->hashkey()}); } else { generate_stable_id($ex); $ex_stable_ids{$ex->hashkey()} = $ex->stable_id(); } } foreach my $gx (@{$hum_gene->get_all_DBEntries}) { $cgene->add_DBEntry($gx) if($KEEP_XREF{uc($gx->dbname())}); } if($hum_gene->display_xref && $KEEP_XREF{uc($hum_gene->display_xref->dbname)}){ $cgene->display_xref($hum_gene->display_xref); } # set the analysis on the gene object $cgene->analysis($analysis); my $name = $cgene->stable_id(); $name .= '/'.$cgene->display_xref->display_id() if($cgene->display_xref()); $cgene->type('ensembl'); # store the bloody thing print STDERR "Storing gene: $name\n"; $gene_adaptor->store($cgene); } return; } ############################################################################### # generate_stable_id # # Generates a stable_id for a gene, transcript, translation or exon and sets # it on the object. # ############################################################################### my ($TRANSCRIPT_NUM, $GENE_NUM, $EXON_NUM, $TRANSLATION_NUM); sub generate_stable_id { my $object = shift; my $SPECIES_PREFIX = 'PTR'; my $PAD = 18; my $type_prefix; my $num; if($object->isa('Bio::EnsEMBL::Exon')) { $type_prefix = 'E'; $EXON_NUM ||= 0; $num = ++$EXON_NUM; } elsif($object->isa('Bio::EnsEMBL::Transcript')) { $type_prefix = 'T'; $TRANSCRIPT_NUM ||= 0; $num = ++$TRANSCRIPT_NUM; } elsif($object->isa('Bio::EnsEMBL::Gene')) { $type_prefix = 'G'; $GENE_NUM ||= 0; $num = ++$GENE_NUM; } elsif($object->isa('Bio::EnsEMBL::Translation')) { $type_prefix = 'P'; $TRANSLATION_NUM ||= 0; $num = ++$TRANSLATION_NUM; } else { throw('Unknown object type '.ref($object).'. Cannot create stable_id.'); } my $prefix = "ENS${SPECIES_PREFIX}${type_prefix}"; my $pad = $PAD - length($prefix) - length($num); $object->version(1); $object->stable_id($prefix . ('0'x$pad) . $num); } ############################################################################### # compact_transcripts # # Given a set of transcripts this function removes duplicate transcripts and # returns the unique set # ############################################################################### sub compact_transcripts { my $transcripts = shift; my %unique_hash; my @unique_list; info("Compacting transcripts"); foreach my $transcript (@$transcripts) { my $hashkey = 'exons:'; foreach my $exon (@{$transcript->get_all_Exons}) { $hashkey .= '('.$exon->hashkey.')'; } # include the translation in the hashkey because sometimes the # same transcripts may end up with different translations # (this is especially possible when split transcripts become partially the # same) if($transcript->translation) { $hashkey .= 'translation:' . $transcript->translation->start() . '-' . $transcript->translation->end() . '(' . $transcript->translation->start_Exon->hashkey() . ')('. $transcript->translation->end_Exon->hashkey() . ')'; } $unique_hash{$hashkey} ||= []; push @{$unique_hash{$hashkey}}, $transcript; } # merge xrefs from duplicated transcripts foreach my $key (keys %unique_hash) { # choose one of the duplicates arbitrarily my $duplicates = $unique_hash{$key}; my $transcript = pop(@$duplicates); # merge all of the xrefs and add them to the one transcript that will # be kept merge_xrefs($transcript, $duplicates); push @unique_list, $transcript; } return \@unique_list; } sub merge_xrefs { my $kept_transcript = shift; my $duplicates = shift; return if(!@$duplicates); info('Merging xrefs from duplicate transcripts'); # construct a set of xrefs the transcript to keep already has my %existing_tl_xrefs; my %existing_tr_xrefs; foreach my $xref (@{$kept_transcript->get_all_DBEntries()}) { $existing_tr_xrefs{$xref->dbname().':'.$xref->primary_id()} = 1; } if($kept_transcript->translation()) { foreach my $xref (@{$kept_transcript->translation->get_all_DBEntries()}) { $existing_tl_xrefs{$xref->dbname().':'.$xref->primary_id()} = 1; } } # collect a list of additional xrefs which should be kept my %tl_xrefs; my %tr_xrefs; foreach my $dup (@$duplicates) { foreach my $xref (@{$dup->get_all_DBEntries()}) { my $key = $xref->dbname().':'.$xref->primary_id(); $tr_xrefs{$key} = $xref if(!$existing_tr_xrefs{$key}); } if($dup->translation()) { foreach my $xref (@{$dup->translation->get_all_DBEntries()}) { my $key = $xref->dbname().':'.$xref->primary_id(); $tl_xrefs{$key} = $xref if(!$existing_tl_xrefs{$key}); } } } # add any new xrefs which were found to the kept translation foreach my $xref (values %tr_xrefs) { $kept_transcript->add_DBEntry($xref); } my @new_tl_xrefs = values(%tl_xrefs); my $tl = $kept_transcript->translation(); if(@new_tl_xrefs && !$tl) { throw("Some duplicate transcripts have translations, and others do not?"); return; } foreach my $xref (@new_tl_xrefs) { $tl->add_DBEntry($xref); } return; } ############################################################################### # cluster_transcripts # # Given a set of transcripts this function will cluster them into groups # which are near to each other. Note this may be more efficient to use # a sorted list, but for the small size of the clusters it should not matter. # ############################################################################## sub cluster_transcripts { my $transcripts = shift; my @clusters; info("Clustering transcripts"); foreach my $tr (@$transcripts) { my $cl = undef; foreach my $c (@clusters) { if(is_near($tr->start(), $tr->end(), $tr->strand(), $tr->slice(), $c->{'start'}, $c->{'end'}, $c->{'strand'}, $c->{'slice'})) { $cl = $c; } } if($cl) { push @{$cl->{'transcripts'}}, $tr; $cl->{'end'} = ( $tr->end > $cl->{'end'} ) ? $tr->{'end'} : $cl->{'end'}; $cl->{'start'} = ( $tr->start < $cl->{'start'} ) ? $tr->{'start'} : $cl->{'start'}; } else { $cl = {'start' => $tr->start(), 'end' => $tr->end(), 'strand' => $tr->strand(), 'slice' => $tr->slice(), 'transcripts' => [$tr]}; push @clusters, $cl; } } # now cluster clusters for(my $i = 0; $i < @clusters; $i++) { for(my $j = $i+1; $j < @clusters; $j++) { my $c1 = $clusters[$i]; my $c2 = $clusters[$j]; if(is_near($c1->{'start'}, $c1->{'end'}, $c1->{'strand'}, $c1->{'slice'}, $c2->{'start'}, $c2->{'end'}, $c2->{'strand'}, $c2->{'slice'})) { # merge clusters and take one of them out of the list splice(@clusters, $j, 1); $c1->{'start'} = ($c1->{'start'} < $c2->{'start'}) ? $c1->{'start'} : $c2->{'start'}; $c1->{'end'} = ($c1->{'end'} > $c2->{'end'}) ? $c1->{'end'} : $c2->{'end'}; push @{$c1->{'transcripts'}}, @{$c2->{'transcripts'}}; # start over again $i = -1; $j = -1; } } } # sum the nucleotide length and amino acid lengths of each of the # transcripts in a given cluster foreach my $cl (@clusters) { $cl->{'nt_len'} = 0; $cl->{'aa_len'} = 0; foreach my $tr (@{$cl->{'transcripts'}}) { $cl->{'nt_len'} += length($tr->spliced_seq()); if($tr->translation) { $cl->{'aa_len'} += length($tr->translate->seq()); } } } return \@clusters; } sub is_near { my ($start1,$end1,$strand1, $slice1, $start2, $end2, $strand2, $slice2) = @_; # cannot be clustered if not on same strand if($strand1 != $strand2) { return 0; } # cannot be clustered if not on same slice if($slice1->name() ne $slice2->name()) { return 0; } # check if the regions overlap if($end1 >= $start2 && $start1 <= $end2) { return 1; } if($start1 > $end2) { return (($start1 - $end2) < NEAR) ? 1 : 0; } return (($start2 - $end1) < NEAR) ? 1 : 0; } sub transfer_xrefs { my $hum_gene = shift; my $chimp_transcripts = shift; my %chimp_transcripts; my %chimp_translations; foreach my $tr (@$chimp_transcripts) { $chimp_transcripts{$tr->stable_id()} ||= []; push @{$chimp_transcripts{$tr->stable_id()}}, $tr; my $tl = $tr->translation(); if($tl) { $chimp_translations{$tl->stable_id()} ||= []; push @{$chimp_translations{$tl->stable_id()}}, $tl; } } foreach my $tr (@{$hum_gene->get_all_Transcripts()}) { foreach my $chimp_tr (@{$chimp_transcripts{$tr->stable_id}}) { foreach my $xref (@{$tr->get_all_DBEntries}) { $chimp_tr->add_DBEntry($xref) if($KEEP_XREF{uc($xref->dbname())}); } if($tr->display_xref() && $KEEP_XREF{uc($tr->display_xref->dbname)}) { $chimp_tr->display_xref($tr->display_xref); } } my $tl = $tr->translation(); if($tl) { foreach my $xref (@{$tl->get_all_DBEntries}) { foreach my $chimp_tl (@{$chimp_translations{$tl->stable_id()}}) { $chimp_tl->add_DBEntry($xref) if($KEEP_XREF{uc($xref->dbname())}); } } } } return; } sub create_ensembl_xrefs { my $chimp_transcripts = shift; foreach my $transcript (@$chimp_transcripts) { my $dbe = Bio::EnsEMBL::DBEntry->new (-primary_id => $transcript->stable_id(), -version => $transcript->version(), -dbname => 'Ens_Hs_transcript', -release => 1, -display_id => $transcript->stable_id()); $transcript->add_DBEntry($dbe); if($transcript->translation()) { $dbe = Bio::EnsEMBL::DBEntry->new (-primary_id => $transcript->translation->stable_id(), -version => $transcript->translation->version(), -dbname => 'Ens_Hs_translation', -release => 1, -display_id => $transcript->translation->stable_id()); $transcript->translation->add_DBEntry($dbe); } } } 1;