Bio::EnsEMBL::IdMapping Archiver
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::IdMapping::Archiver - create gene_archive and peptide_archive
Package variables
Privates (from "my" definitions)
$pa_id;
Included modules
Bio::EnsEMBL::IdMapping::BaseObject
Bio::EnsEMBL::Utils::Exception qw ( throw warning )
Bio::EnsEMBL::Utils::ScriptUtils qw ( path_append )
Digest::MD5 qw ( md5_hex )
Inherit
Bio::EnsEMBL::IdMapping::BaseObject
Synopsis
  my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new(
-LOGGER => $logger,
-CONF => $conf,
-CACHE => $cache
);
# create gene and peptide archive
$archiver->create_archive($mapping_session_id);

# dump existing archive tables to file
my $num_entries = $archiver->dump_table_to_file( 'source', 'gene_archive', 'gene_archive_existing.txt', 1 );
Description
This module creates the gene_archive and peptide_archive
tables. Data is written to a file as tab-delimited text for
loading into a MySQL database (this can be done manually, or using
StableIdmapper->upload_file_into_table()).
An archive entry for a given source gene is created if no target
gene exists, or if any of its transcripts or their translations
changed. Non-coding transcripts only have an entry in gene_archive (i.e.
without a corresponding peptide_archive entry).
Methods
create_archiveDescriptionCode
dump_geneDescriptionCode
dump_nc_rowDescriptionCode
dump_tupleDescriptionCode
mapping_session_idDescriptionCode
Methods description
create_archivecode    nextTop
  Arg[1]      : Int $mapping_session_id - the mapping_session_id for this run
Example : $archiver->create_archive($stable_id_mapper->mapping_session_id);
Description : Creates the gene_archive and peptide_archive tables and writes
the data to a tab-delimited file. The decision as to what to
archive is deferred to dump_gene(), see documentation there for
details.
Return type : none
Exceptions : A missing argument is not fatal; it only triggers a logger warning.
Caller : id_mapping.pl
Status : At Risk
: under development
dump_genecodeprevnextTop
  Arg[1]      : Bio::EnsEMBL::IdMapping::TinyGene $s_gene - source gene
Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $t_gene - target gene
Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data
Arg[4] : Filehandle $pa_fh - filehandle for writing peptide_archive data
Example : my $target_gene = $gene_mappings{$source_gene->stable_id};
$archiver->dump_gene($source_gene, $target_gene, $ga_fh, $pa_fh);
Description : Given a source gene, it will write a gene_archive and
peptide_archive entry for it if no target gene exists, or if any
of its transcripts or their translations changed.
Return type : none
Exceptions : none
Caller : create_archive()
Status : At Risk
: under development
dump_nc_rowcodeprevnextTop
  Arg[1]      : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive
Arg[2] : Bio::EnsEMBL::IdMapping::TinyTranscript $tr - its transcript
Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data
Example : $archive->dump_nc_row($s_gene, $s_tr, $ga_fh);
Description : Writes an entry line for gene_archive for non-coding
transcripts.
Return type : none
Exceptions : none
Caller : dump_gene()
Status : At Risk
: under development
dump_tuplecodeprevnextTop
  Arg[1]      : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive
Arg[2] : Bio::EnsEMBL::IdMapping::TinyTranscript $tr - its transcript
Arg[3] : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - its translation
Arg[4] : Filehandle $ga_fh - filehandle for writing gene_archive data
Arg[5] : Filehandle $pa_fh - filehandle for writing peptide_archive data
Example : $archive->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
Description : Writes entry lines for gene_archive and peptide_archive.
Return type : none
Exceptions : none
Caller : dump_gene()
Status : At Risk
: under development
mapping_session_idcodeprevnextTop
  Arg[1]      : (optional) Int - mapping_session_id to set
Example : my $msi = $archiver->mapping_session_id;
Description : Getter/setter for mapping_session_id.
Return type : Int
Exceptions : none
Caller : create_archive()
Status : At Risk
: under development
Methods code
create_archivedescriptionprevnextTop
# Write the gene_archive and peptide_archive dump files for one mapping run.
#
# Arg[1]  : Int $mapping_session_id - stored on the object and written on
#           every gene_archive row. A false value is only logged as a
#           warning; processing continues.
# Returns : true once both output files have been closed successfully.
# Throws  : if either output file cannot be closed cleanly. Buffered write
#           errors (e.g. a full disk) only surface at close(), and a
#           silently truncated file must not be mistaken for a complete dump.
sub create_archive {
  my $self = shift;
  my $mapping_session_id = shift;

  # argument check
  unless ($mapping_session_id) {
    $self->logger->warning("No mapping_session_id set.");
  }

  $self->mapping_session_id($mapping_session_id);

  # get filehandles to write gene and peptide archive
  my $ga_fh = $self->get_filehandle('gene_archive_new.txt', 'tables');
  my $pa_fh = $self->get_filehandle('peptide_archive_new.txt', 'tables');

  # get the currently highest peptide_archive_id from the source db;
  # $pa_id is the file-level counter that dump_tuple() consumes
  my $s_dba = $self->cache->get_DBAdaptor('source');
  my $s_dbh = $s_dba->dbc->db_handle;
  my $sql = qq(SELECT MAX(peptide_archive_id) FROM peptide_archive);
  $pa_id = $self->fetch_value_from_db($s_dbh, $sql);

  unless ($pa_id) {
    $self->logger->warning("No max(peptide_archive_id) found in db.\n", 1);
    $self->logger->info("That's ok if this is the first stable ID mapping for this species.\n", 1);
  }

  # first id to hand out is one past the current maximum (1 if none found)
  $pa_id++;
  $self->logger->debug("Starting with peptide_archive_id $pa_id.\n");

  # lookup hash of target gene stable IDs
  my %target_genes = map { $_->stable_id => $_ }
    values %{ $self->cache->get_by_name("genes_by_id", 'target') };

  # loop over source genes and dump to archive (dump_gene() will decide whether
  # to do full or partial dump)
  foreach my $source_gene (values %{ $self->cache->get_by_name("genes_by_id", 'source') }) {
    $self->dump_gene($source_gene, $target_genes{$source_gene->stable_id},
      $ga_fh, $pa_fh);
  }

  # check close() on both write handles so that flush failures are not lost
  close($ga_fh)
    or throw("Unable to close gene_archive_new.txt after writing: $!");
  close($pa_fh)
    or throw("Unable to close peptide_archive_new.txt after writing: $!");
}
dump_genedescriptionprevnextTop
# Decide, transcript by transcript, whether a source gene needs archiving and
# delegate the actual writing to dump_tuple() / dump_nc_row().
#
# Arg[1] : source gene
# Arg[2] : target gene (false if the source gene has no target counterpart)
# Arg[3] : filehandle for gene_archive data
# Arg[4] : filehandle for peptide_archive data
#
# A coding transcript is dumped when there is no target gene, or when no
# target transcript carries the same transcript stable ID, translation stable
# ID and translation sequence. A transcript without translation is only
# considered when the gene biotype matches /RNA/; it is dumped when there is
# no target gene or no target transcript shares its stable ID.
sub dump_gene {
  my ($self, $s_gene, $t_gene, $ga_fh, $pa_fh) = @_;

  # private method, so no argument check done for performance reasons

  # deal with ncRNA differently
  # hope this simple biotype regex is accurate enough...
  my $is_ncRNA = ($s_gene->biotype =~ /RNA/) ? 1 : 0;

  foreach my $src_tr (@{ $s_gene->get_all_Transcripts }) {
    my $src_tl = $src_tr->translation;

    if ($src_tl) {
      # coding transcript: count target transcripts that are identical
      my $unchanged = 0;

      if ($t_gene) {
        $unchanged = grep {
          my $tgt_tl = $_->translation;
          $tgt_tl
            && $src_tr->stable_id eq $_->stable_id
            && $src_tl->stable_id eq $tgt_tl->stable_id
            && $src_tl->seq eq $tgt_tl->seq;
        } @{ $t_gene->get_all_Transcripts };
      }

      # full dump if there is no target gene or nothing identical was found
      $self->dump_tuple($s_gene, $src_tr, $src_tl, $ga_fh, $pa_fh)
        unless $unchanged;

      next;
    }

    # non-translating transcripts are only archived for ncRNA genes
    next unless $is_ncRNA;

    my $still_present = 0;

    if ($t_gene) {
      $still_present = grep { $src_tr->stable_id eq $_->stable_id }
        @{ $t_gene->get_all_Transcripts };
    }

    $self->dump_nc_row($s_gene, $src_tr, $ga_fh) unless $still_present;
  }
}
dump_nc_rowdescriptionprevnextTop
# Write one gene_archive line for a transcript that has no translation.
#
# Arg[1] : gene to archive
# Arg[2] : its transcript
# Arg[3] : filehandle for gene_archive data
#
# The three columns that dump_tuple() fills with translation stable ID,
# translation version and peptide_archive_id are written as the literal
# two-character string \N.
sub dump_nc_row {
  my ($self, $gene, $tr, $ga_fh) = @_;

  # private method, so no argument check done for performance reasons

  # gene archive columns, in output order
  my @columns = (
    $gene->stable_id,
    $gene->version,
    $tr->stable_id,
    $tr->version,
    '\N',
    '\N',
    '\N',
    $self->mapping_session_id,
  );

  my $row = join("\t", @columns);

  print {$ga_fh} $row;
  print {$ga_fh} "\n";
}
dump_tupledescriptionprevnextTop
# Write one gene_archive line and the matching peptide_archive line for a
# coding transcript, then advance the file-level peptide_archive_id counter.
#
# Arg[1] : gene to archive
# Arg[2] : its transcript
# Arg[3] : its translation
# Arg[4] : filehandle for gene_archive data
# Arg[5] : filehandle for peptide_archive data
sub dump_tuple {
  my ($self, $gene, $tr, $tl, $ga_fh, $pa_fh) = @_;

  # private method, so no argument check done for performance reasons

  # gene archive: both rows share the current value of $pa_id
  my $gene_row = join("\t",
    $gene->stable_id,
    $gene->version,
    $tr->stable_id,
    $tr->version,
    $tl->stable_id,
    $tl->version,
    $pa_id,
    $self->mapping_session_id,
  );
  print {$ga_fh} $gene_row;
  print {$ga_fh} "\n";

  # peptide archive: id, MD5 hex digest of the sequence, the sequence itself
  my $pep_seq = $tl->seq;
  my $peptide_row = join("\t", $pa_id, md5_hex($pep_seq), $pep_seq);
  print {$pa_fh} $peptide_row;
  print {$pa_fh} "\n";

  # increment peptide_archive_id
  $pa_id++;
}
mapping_session_iddescriptionprevnextTop
# Getter/setter for the mapping_session_id stored on the object.
#
# Arg[1]  : (optional) value to store; any passed argument, even undef,
#           overwrites the stored value
# Returns : the currently stored mapping_session_id
sub mapping_session_id {
  my ($self, @new_value) = @_;

  if (@new_value) {
    $self->{'_mapping_session_id'} = $new_value[0];
  }

  return $self->{'_mapping_session_id'};
}


1;
}
General documentation
LICENSETop
  Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license. For license details, please see /info/about/code_licence.html
CONTACTTop
  Please email comments or questions to the public Ensembl
developers list at <ensembl-dev@ebi.ac.uk>.
Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>.