Raw content of Bio::Index::AbstractSeq
# $Id: AbstractSeq.pm,v 1.16 2002/10/22 07:38:33 lapp Exp $
#
# BioPerl module for Bio::DB::AbstractSeq
#
# Cared for by Ewan Birney
#
# Copyright Ewan Birney
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
Bio::Index::AbstractSeq - Base class for AbstractSeq s
=head1 SYNOPSIS
# Make a new sequence file indexing package
package MyShinyNewIndexer;
use Bio::Index::AbstractSeq;
@ISA = ('Bio::Index::AbstractSeq');
# Now provide the necessary methods...
=head1 DESCRIPTION
Provides a common base class for multiple
sequence files built using the
Bio::Index::Abstract system, and provides a
Bio::DB::SeqI interface.
=head1 FEEDBACK
=head2 Mailing Lists
User feedback is an integral part of the evolution of this
and other Bioperl modules. Send your comments and suggestions preferably
to one of the Bioperl mailing lists.
Your participation is much appreciated.
bioperl-l@bioperl.org - General discussion
http://bioperl.org/MailList.shtml - About the mailing lists
=head2 Reporting Bugs
Report bugs to the Bioperl bug tracking system to help us keep track
the bugs and their resolution.
Bug reports can be submitted via email or the web:
bioperl-bugs@bio.perl.org
http://bugzilla.bioperl.org/
=head1 AUTHOR - Ewan Birney
Email birney@ebi.ac.uk
Describe contact details here
=head1 APPENDIX
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
=head1 SEE ALSO
Bio::Index::Abstract - Module which
Bio::Index::AbstractSeq inherits off, which
provides dbm indexing for flat files (which are
not necessarily sequence files).
=cut
# Let's begin the code ...
package Bio::Index::AbstractSeq;
use vars qw(@ISA);
use strict;
use Bio::SeqIO::MultiFile;
use Bio::Index::Abstract;
use Bio::DB::SeqI;
@ISA = qw(Bio::Index::Abstract Bio::DB::SeqI);
sub new {
my ($class, @args) = @_;
my $self = $class->SUPER::new(@args);
$self->{'_seqio_cache'} = [];
return $self;
}
=head2 _file_format
Title : _file_format
Usage : $self->_file_format
Function: Derived classes should override this
method (it throws an exception here)
to give the file format of the files used
Example :
Returns :
Args :
=cut
sub _file_format {
my ($self,@args) = @_;
my $pkg = ref($self);
$self->throw("Class '$pkg' must provide a file format method correctly");
}
=head2 fetch
Title : fetch
Usage : $index->fetch( $id )
Function: Returns a Bio::Seq object from the index
Example : $seq = $index->fetch( 'dJ67B12' )
Returns : Bio::Seq object
Args : ID
=cut
sub fetch {
my( $self, $id ) = @_;
my $db = $self->db();
my $seq;
if (my $rec = $db->{ $id }) {
my ($file, $begin) = $self->unpack_record( $rec );
# Get the (possibly cached) SeqIO object
my $seqio = $self->_get_SeqIO_object( $file );
my $fh = $seqio->_fh();
# move to start of record
$begin-- if( $^O =~ /mswin/i); # workaround for Win DB_File bug
seek($fh, $begin, 0);
$seq = $seqio->next_seq();
}
# we essentially assumme that the primary_id for the database
# is the display_id
$seq->primary_id($seq->display_id()) if( defined $seq && ref($seq) &&
$seq->isa('Bio::PrimarySeqI') );
return $seq;
}
=head2 _get_SeqIO_object
Title : _get_SeqIO_object
Usage : $index->_get_SeqIO_object( $file )
Function: Returns a Bio::SeqIO object for the file
Example : $seq = $index->_get_SeqIO_object( 0 )
Returns : Bio::SeqIO object
Args : File number (an integer)
=cut
sub _get_SeqIO_object {
my( $self, $i ) = @_;
unless ($self->{'_seqio_cache'}[$i]) {
my $fh = $self->_file_handle($i);
# make a new SeqIO object
my $seqio = Bio::SeqIO->new( -Format => $self->_file_format,
-fh => $fh);
$self->{'_seqio_cache'}[$i] = $seqio;
}
return $self->{'_seqio_cache'}[$i];
}
=head2 get_Seq_by_id
Title : get_Seq_by_id
Usage : $seq = $db->get_Seq_by_id()
Function: retrieves a sequence object, identically to
->fetch, but here behaving as a Bio::DB::BioSeqI
Returns : new Bio::Seq object
Args : string represents the id
=cut
sub get_Seq_by_id {
my ($self,$id) = @_;
return $self->fetch($id);
}
=head2 get_Seq_by_acc
Title : get_Seq_by_acc
Usage : $seq = $db->get_Seq_by_acc()
Function: retrieves a sequence object, identically to
->fetch, but here behaving as a Bio::DB::BioSeqI
Returns : new Bio::Seq object
Args : string represents the accession number
=cut
sub get_Seq_by_acc {
my ($self,$id) = @_;
return $self->fetch($id);
}
=head2 get_PrimarySeq_stream
Title : get_PrimarySeq_stream
Usage : $stream = get_PrimarySeq_stream
Function: Makes a Bio::DB::SeqStreamI compliant object
which provides a single method, next_primary_seq
Returns : Bio::DB::SeqStreamI
Args : none
=cut
sub get_PrimarySeq_stream {
my $self = shift;
my $num = $self->_file_count() || 0;
my @file;
for (my $i = 0; $i < $num; $i++) {
my( $file, $stored_size ) = $self->unpack_record( $self->db->{"__FILE_$i"} );
push(@file,$file);
}
my $out = Bio::SeqIO::MultiFile->new( '-format' => $self->_file_format , -files => \@file);
return $out;
}
=head2 get_all_primary_ids
Title : get_all_primary_ids
Usage : @ids = $seqdb->get_all_primary_ids()
Function: gives an array of all the primary_ids of the
sequence objects in the database. These
maybe ids (display style) or accession numbers
or something else completely different - they
*are not* meaningful outside of this database
implementation.
Example :
Returns : an array of strings
Args : none
=cut
sub get_all_primary_ids {
my ($self,@args) = @_;
my $db = $self->db;
# the problem is here that we have indexed things both on
# accession number and name.
# We could take two options
# here - loop over the database, returning only one copy of each
# id that points to the same byte position, or we rely on semantics
# of accession numbers.
# someone is going to index a database with no accession numbers.
# doh!. We have to uniquify the index...
my( %bytepos );
while (my($id, $rec) = each %$db) {
if( $id =~ /^__/ ) {
# internal info
next;
}
my ($file, $begin) = $self->unpack_record( $rec );
$bytepos{"$file:$begin"} = $id;
}
return values %bytepos;
}
=head2 get_Seq_by_primary_id
Title : get_Seq_by_primary_id
Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string);
Function: Gets a Bio::Seq object by the primary id. The primary
id in these cases has to come from $db->get_all_primary_ids.
There is no other way to get (or guess) the primary_ids
in a database.
The other possibility is to get Bio::PrimarySeqI objects
via the get_PrimarySeq_stream and the primary_id field
on these objects are specified as the ids to use here.
Returns : A Bio::Seq object
Args : primary id (as a string)
Throws : "acc does not exist" exception
=cut
sub get_Seq_by_primary_id {
my ($self,$id) = @_;
return $self->fetch($id);
}
1;