Bio::EnsEMBL::Pipeline::SeqFetcher
DBIndex
Toolbar
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
No synopsis!
Description
No description!
Methods
_add_id_position | No description | Code |
_index_file | No description | Code |
_make_BIOINDEX_file | No description | Code |
_make_config_file | No description | Code |
_make_fileid_file | No description | Code |
_make_indexdir | No description | Code |
database | No description | Code |
find_entry | No description | Code |
get_Seq_by_id | No description | Code |
get_Seqs_by_id_array | No description | Code |
get_entry | No description | Code |
get_filehandle_by_fileid | No description | Code |
get_fileid_by_filename | No description | Code |
make_index | No description | Code |
namespace | No description | Code |
new | No description | Code |
primary_index_file | No description | Code |
primary_index_filehandle | No description | Code |
read_config | No description | Code |
read_fileids | No description | Code |
read_header | No description | Code |
read_record | No description | Code |
record_size | No description | Code |
write_primary_index | No description | Code |
Methods description
None available.
Methods code
_add_id_position | description | prev | next | Top |
sub _add_id_position
{ my ($self,$id,$pos,$fileid,$length) = @_;
if (!defined($id)) {
$self->throw("No id defined. Can't add id position");
}
if (!defined($pos)) {
$self->throw("No position defined. Can't add id position");
}
if (!defined($fileid)) {
$self->throw("No fileid defined. Can't add id position");
}
if (!defined($length) || $length <= 0) {
$self->throw("No length defined or <= 0 [$length]. Can't add id position");
}
$self->{_id}{$id}{_pos} = $pos;
$self->{_id}{$id}{_length} = $length;
$self->{_id}{$id}{_fileid} = $fileid;
if (length($id) >= $self->{_maxidlength}) {
$self->{_maxidlength} = length($id);
}
if (length($pos) >= $self->{_maxposlength}) {
$self->{_maxposlength} = length($pos);
}
if (length($length >= $self->{_maxlengthlength})) {
$self->{_maxlengthlength} = length($length);
} } |
sub _index_file
{ my ($self,$file) = @_;
open(FILE,"<$file") || $self->throw("Can't open file [$file]");
my $recstart = 0;
my $fileid = $self->get_fileid_by_filename($file);
my $found = 0;
my $id;
my $count;
LINE: while (<FILE>) {
if (/^>/) {
$count++;
my $newid = $_;
$newid =~ s/^(\S+).*/$1/;
chomp($newid);
my $begin = tell(FILE) - length( $_ );
my $length = ($begin - $recstart);
if ($found) {
if (!defined($id)) {
$self->throw("No id defined for sequence");
}
if (!defined($fileid)) {
$self->throw("No fileid defined for file $file");
}
if (!defined($recstart)) {
$self->throw("No position defined for " . $id . "\n");
}
if (!defined($length)) {
$self->throw("No length defined for " . $id . "\n");
}
$self->_add_id_position($id,$recstart,$fileid,$length);
} else {
$found = 1;
}
$recstart = $begin;
$id = $newid;
if ($count%1000 == 0) {
print "Indexed $count ids. Time " . time . "\n";
}
}
}
close(FILE); } |
sub _make_BIOINDEX_file
{ my ($self) = @_; } |
sub _make_config_file
{ my ($self) = @_; } |
sub _make_fileid_file
{ my ($self,@files) = @_;
my $dir = $self->database;
if (! -d $dir) {
$self->throw("[$dir] is not a directory. Can't write fileid file");
}
my $fileid_name = $dir . "/fileid";
open(FILEID,">$fileid_name") || $self->throw("Can't write fileid file [$fileid_name]");
my $count = 1;
foreach my $file (@files) {
print FILEID $count . "\t" . $file . "\n";
$count++;
}
close(FILEID);
$self->read_fileids; } |
sub _make_indexdir
{ my ($self) = @_;
if (! -e $self->database) {
mkdir $self->database,0755;
} else {
$self->throw("Index directory " . $self->database . " already exists. Exiting\n");
} } |
sub database
{ my ($self,$arg) = @_;
if (defined($arg)) {
$self->{_database} = $arg;
}
return $self->{_database}; } |
sub find_entry
{ my ($self,$fh,$record_size,$start,$end,$id) = @_;
my $mid = int(($end+1+$start)/2);
my $pos = ($mid -1)* $record_size;
sysseek($fh,$self->{_start_pos}+ $pos ,0);
my @field_width = @{$self->{_field_widths}};
my $entryid;
sysread($fh,$entryid,$field_width[0]);
$entryid =~ s/ //g;
my ($first,$second) = sort { $a cmp $b} ($id,$entryid);
if ($id eq $entryid) {
my $pos;
my $length;
my $fileid;
sysread($fh,$fileid,$field_width[1]);
sysread($fh,$pos,$field_width[2]);
sysread($fh,$length,$field_width[3]);
return ($id,$fileid,$pos,$length);
} elsif ($first eq $id) {
if ($end-$start <= 1) {
return;
}
my $end = $mid;
$self->find_entry($fh,$record_size,$start,$end,$id);
} elsif ($second eq $id ) {
if ($end-$start <= 1) {
return;
}
$start = $mid;
$self->find_entry($fh,$record_size,$start,$end,$id);
} else {
print "No match\n";
} } |
sub get_Seq_by_id
{ my ($self,$id) = @_;
my $indexfh = $self->primary_index_filehandle;
my $recsize = $self->record_size;
sysseek ($indexfh,0,2);
my $filesize = (tell $indexfh);
my $end = ($filesize-$self->{_start_pos})/$recsize;
print "File size is $filesize $end " . $self->{_start_pos} . "\n";
my ($id,$fileid,$fpos,$length) = $self->find_entry($indexfh,$recsize,0,$end,$id);
$id =~ s/ //g;
$fileid =~ s/ //g;
$fpos =~ s/ //g;
$length =~ s/ //g;
$length =~ s/ +$//;
if (!defined($id)) {
return;
}
my $fh = $self->get_filehandle_by_fileid($fileid);
return $self->get_entry($fh,$fpos,$length); } |
sub get_Seqs_by_id_array
{ my ($self,@ids) = @_;
my %entry;
foreach my $id (@ids) {
$entry{$id} = $self->get_Seq_by_id($id);
}
return %entry; } |
sub get_entry
{ my ($self,$fh,$pos,$length) = @_;
my $entry;
sysseek ($fh,$pos,0);
sysread($fh,$entry,$length);
return $entry; } |
sub get_filehandle_by_fileid
{ my ($self,$fileid) = @_;
if (!defined($self->{_fileid}{$fileid})) {
$self->throw("ERROR: undefined fileid in index [$fileid]");
}
return $self->{_fileid}{$fileid}; } |
sub get_fileid_by_filename
{ my ($self,$file) = @_;
if (!defined($self->{_dbfile})) {
$self->throw("No file to fileid mapping present. Has the fileid file been read?");
}
return $self->{_dbfile}{$file}; } |
sub make_index
{
my ($self,$dbname,$format,@files) = @_;;
if (!defined(@files)) {
$self->throw("Must enter an array of filenames to index");
}
if (!defined($dbname)) {
$self->throw("Must enter an index name for your files");
}
foreach my $file (@files) {
if (! -e $file) {
$self->throw("Can't index file [$file] as it doesn't exist");
}
}
print "Database is $dbname\n";
$self->database($dbname);
$self->_make_indexdir;
$self->_make_BIOINDEX_file;
$self->_make_config_file;
$self->_make_fileid_file(@files);
foreach my $file (@files) {
$self->_index_file($file);
}
$self->write_primary_index;
} |
sub namespace
{ my ($self,$arg) = @_;
if (defined($arg)) {
$self->{_namespace} = $arg;
}
return $self->{_namespace};
}
1; } |
sub new
{ my($class, @args) = @_;
my $self = $class->SUPER::new(@args);
my( $database ) = $self->_rearrange([qw(DATABASE)], @args);
$self->database($database);
if ($self->database) {
$self->read_config;
$self->read_fileids;
$self->read_header;
}
return $self; } |
sub primary_index_file
{ my ($self) = @_;
return $self->database . "/key_" . $self->namespace . ".key"; } |
sub primary_index_filehandle
{ my ($self) = @_;
if (!defined ($self->{_primary_index_handle})) {
$self->{_primary_index_handle} = new FileHandle("<" . $self->primary_index_file);
}
return $self->{_primary_index_handle}; } |
sub read_config
{ my ($self) = @_;
if (!defined($self->database)) {
$self->throw("Database not set in index. Can't read config.dat");
}
my $configfile = $self->database . "/config.dat";
if (! -e $configfile) {
$self->throw("No config file [$configfile]. Can't read namespace");
}
open(IN,"<$configfile") || $self->throw("Can't open configfile [$configfile]");
while (<IN>) {
chomp;
my ($key,$value) = split(/\t/,$_);
if ($key eq "namespace") {
$self->namespace($value);
} else {
$self->throw("Unrecognised key value pair in config.dat [$key][$value]");
}
}
close(IN); } |
sub read_fileids
{ my ($self) = @_;
if (!defined($self->database)) {
$self->throw("Database not set in index. Can't read fileids");
}
my $fileid_file = $self->database . "/fileids.dat";
if (! -e $fileid_file) {
$self->throw("Fileid file [$fileid_file] doesn't exist for database [" .
$self->database . "\n");
}
open(FILEID,"<$fileid_file") || $self->throw("Can't open fileid file [$fileid_file");
while (<FILEID>) {
chomp;
my ($id,$name,$size) = split(/\t/);
if (!defined($self->{_fileid}{$id})) {
my $fh = new FileHandle("<$name");
$self->{_fileid}{$id} = $fh;
$self->{_dbfile}{$name} = $id;
$self->{_size}{$id} = $size;
} else {
$self->throw("Something's wrong - there are two identical file ids $id");
}
}
close(FILEID); } |
sub read_header
{ my ($self) = @_;
my $file = $self->primary_index_file;;
open (IN,"<$file") || $self->throw("Can't read index [$file]");
my $fh =\* IN;
my $num_fields;
my $record_width;
my @field_width;
my $i = 0;
sysread($fh,$record_width,4);
sysread($fh,$num_fields,4);
print "Width and fields $record_width:$num_fields:\n";
$record_width =~ s/ //g;
$num_fields =~ s/ //g;
$record_width = $record_width * 1;
$num_fields = $num_fields * 1;
$self->{_num_fields} = $num_fields;
$self->record_size($record_width);
while ($i < $num_fields) {
sysread($fh,$field_width[$i],5);
print "Field width for $i is $field_width[$i]\n";
$i++;
}
$self->{_start_pos} = tell($fh);
$self->{_field_widths} = [];
my $tmp;
sysseek($fh,$self->{_start_pos},0);
sysread($fh,$tmp,$field_width[0]);
print "Tmp $tmp\n";
sysseek($fh,$self->{_start_pos} + 1000*$self->record_size,0);
sysread($fh,$tmp,$field_width[0]);
print "Tmp $tmp\n";
push(@{$self->{_field_widths}},@field_width); } |
sub read_record
{ my ($self,$fh,$pos) = @_;
sysseek($fh,$pos,0);
my @field_width = @{$self->{_field_widths}};
my $id;
my $fileid;
my $pos;
my $length;
sysread($fh,$id, $field_width[0]);
sysread($fh,$fileid,$field_width[1]);
sysread($fh,$pos, $field_width[2]);
sysread($fh,$length,$field_width[3]);
return ($id,$fileid,$pos,$length); } |
sub record_size
{ my ($self,$arg) = @_;
if (defined($arg)) {
$self->{_record_size} = $arg;
}
return $self->{_record_size}; } |
write_primary_index | description | prev | next | Top |
sub write_primary_index
{ my ($self) = @_;
my @ids = sort keys %{$self->{_id}};
print "Number of ids = " . scalar(@ids) . "\n";
open (INDEX,">" . $self->primary_index_file);
my $recordlength = $self->{_maxidlength} +
$self->{_maxfileidlength} +
$self->{_maxposlength} +
$self->{_maxlengthlength} + 6;
foreach my $id (@ids) {
if (!defined($self->{_id}{$id}{_fileid})) {
$self->throw("No fileid for $id\n");
}
if (!defined($self->{_id}{$id}{_pos})) {
$self->throw("No position for $id\n");
}
if (!defined($self->{_id}{$id}{_length})) {
$self->throw("No length for $id");
}
my $record = $id . "\t" .
$self->{_id}{$id}{_fileid} . "\t" .
$self->{_id}{$id}{_pos} . "\t" .
$self->{_id}{$id}{_length};
if (length($record) < $recordlength) {
my $pad = ' ' x ($recordlength - length($record));
$record = $record . $pad;
}
print INDEX $record . "\n";
}
close(INDEX); } |
General documentation
No general documentation available.