NCBIHelper documentation.

Title : get_Stream_by_acc
Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
Function: Gets a series of Seq objects by accession numbers
Returns : a Bio::SeqIO stream object
Args : $ref : a reference to an array of accession numbers for
the desired sequence entries
Note : For GenBank, this just calls the same code for get_Stream_by_id()

Title : get_Stream_by_batch
Usage : $seq = $db->get_Stream_by_batch($ref);
Function: Retrieves Seq objects from Entrez 'en masse', rather than one
at a time. For large numbers of sequences, this is far superior
to get_Stream_by_[id/acc]().
Example :
Returns : a Bio::SeqIO stream object
Args : $ref : either an array reference, a filename, or a filehandle
from which to get the list of unique ids/accession numbers.

Title : get_Stream_by_query
Usage : $seq = $db->get_Stream_by_query($query);
Function: Retrieves Seq objects from Entrez 'en masse', rather than one
at a time. For large numbers of sequences, this is far superior
to get_Stream_by_[id/acc]().
Example :
Returns : a Bio::SeqIO stream object
Args : $query : An Entrez query string or a
Bio::DB::Query::GenBank object. It is suggested that you
create a Bio::DB::Query::GenBank object and get the entry
count before you fetch a potentially large stream.

Title : get_params
Usage : my %params = $self->get_params($mode)
Function: Returns key,value pairs to be passed to NCBI database
for either 'batch' or 'single' sequence retrieval method
Returns : a key,value pair hash
Args : 'single' or 'batch' mode for retrieval

Title : postprocess_data
Usage : $self->postprocess_data ( 'type' => 'string',
'location' => \$datastr);
Function: process downloaded data before loading into a Bio::SeqIO
Returns : void
Args : hash with two keys - 'type' can be 'string' or 'file'
- 'location' either file location or string
reference containing data

Title : request_format
Usage : my ($req_format, $ioformat) = $self->request_format;
$self->request_format("genbank");
$self->request_format("fasta");
Function: Get/Set sequence format retrieval. The get-form will normally not
be used outside of this and derived modules.
Returns : Array of two strings, the first representing the format for
retrieval, and the second specifying the corresponding SeqIO format.
Args : $format = sequence format

BEGIN {

    # Package-level configuration, initialised at compile time.

    # NOTE(review): presumably the cap on entries fetched per request;
    # confirm against the code that reads $MAX_ENTRIES.
    $MAX_ENTRIES = 19000;

    # Base URL of the NCBI E-utilities server.  NCBI serves E-utilities
    # over HTTPS, so the secure scheme is used here.
    $HOSTBASE = 'https://eutils.ncbi.nlm.nih.gov';

    # Every retrieval mode currently goes through the same efetch CGI;
    # only the HTTP method differs.
    my $efetch = '/entrez/eutils/efetch.fcgi';

    # Retrieval mode => [ HTTP method, CGI path appended to $HOSTBASE ].
    %CGILOCATION = (
        'batch'   => [ 'post' => $efetch ],
        'query'   => [ 'get'  => $efetch ],
        'single'  => [ 'get'  => $efetch ],
        'version' => [ 'get'  => $efetch ],
        'gi'      => [ 'get'  => $efetch ],
    );

    # Request format (as sent to NCBI) => corresponding SeqIO format name.
    %FORMATMAP = (
        'gb'    => 'genbank',
        'gp'    => 'genbank',
        'fasta' => 'fasta',
    );

    # Request format used when none has been set explicitly.
    $DEFAULTFORMAT = 'gb';

}

# Inspect an identifier before it is fetched.
#
# Args    : $ids - the identifier(s) to check; matched here as a string.
# Returns : a new Bio::DB::RefSeq object when the identifier looks like a
#           RefSeq accession (matches /N._/), so the caller can redirect
#           the request; nothing (undef in scalar context) otherwise.
# Throws  : via $self->throw when the identifier contains 'NT_'.
sub _check_id {

    my ($self, $ids) = @_;

    # NT contigs can not be retrieved
    $self->throw("NT_ contigs are whole chromosome files which are not part of regular ".
                 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
        if $ids =~ /NT_/;

    # Asking for a RefSeq from EMBL/GenBank
    if ($ids =~ /N._/) {
        $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.".
                    " Redirecting the request.\n")
            if $self->verbose >= 0;
        return Bio::DB::RefSeq->new;
    }

    # Ordinary identifier: no redirection needed.
    return;
}

# Body of get_Stream_by_acc (the enclosing sub header is not part of this
# listing).  _check_id() may hand back a Bio::DB::RefSeq object; when it
# does, the request is served by that object, otherwise by $self.
my ($self, $ids) = @_;
my $redirect  = $self->_check_id($ids);
my $is_refseq = defined $redirect
    && ref($redirect)
    && $redirect->isa('Bio::DB::RefSeq');
my $source = $is_refseq ? $redirect : $self;
return $source->get_seq_stream('-uids' => $ids, '-mode' => 'single');

# Body of get_Stream_by_query (the enclosing sub header is not part of
# this listing).  Anything that is not already an object offering a
# query() method is wrapped in a Bio::DB::Query::GenBank object first.
my ($self, $query) = @_;
if (!ref($query) || !$query->can('query')) {
    $query = Bio::DB::Query::GenBank->new($query);
}
return $self->get_seq_stream('-query' => $query, '-mode'=>'query');

# Build the HTTP request used to fetch sequence data.
#
# Args    : named qualifiers, unpacked with _rearrange:
#             MODE   - retrieval mode; defaults to 'single' when missing
#                      or empty
#             UIDS   - a single id or an array reference of ids
#             FORMAT - request format; defaults to the first element
#                      returned by request_format()
#             QUERY  - a query object, used when no UIDS are given
# Returns : the request object produced by POST() or GET(), depending on
#           the HTTP method registered for the mode in %CGILOCATION.
#           NOTE(review): POST/GET are presumably imported from
#           HTTP::Request::Common elsewhere in the file - confirm.
# Throws  : via $self->throw when get_params() yields nothing for the
#           mode, or when neither UIDS nor QUERY is supplied.
sub get_request {

    my ($self, @qualifiers) = @_;
    my ($mode, $uids, $format, $query) = $self->_rearrange([qw(MODE UIDS
                                                               FORMAT QUERY)],
                                                           @qualifiers);

    # Normalise the mode only once we know it is defined, so an omitted
    # mode does not trigger an "uninitialized value" warning in lc().
    $mode = (defined $mode && $mode ne '') ? lc $mode : 'single';
    ($format) = $self->request_format() unless defined $format;

    my %params = $self->get_params($mode);
    if( ! %params ) {
        $self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'");
    }
    my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]);

    unless( defined $uids or defined $query) {
        $self->throw("Must specify a query or list of uids to fetch");
    }

    if ($uids) {
        # A list of ids is sent as one comma-separated value.
        if( ref($uids) =~ /array/i ) {
            $uids = join(",", @$uids);
        }
        $params{'id'} = $uids;
    }
    elsif ($query && $query->can('cookie')) {
        # The query object supplies the WebEnv/query_key pair and the
        # database name itself.
        @params{'WebEnv','query_key'} = $query->cookie;
        $params{'db'}                 = $query->db;
    }
    elsif ($query) {
        # No cookie available: fall back to the explicit id list.
        $params{'id'} = join ',',$query->ids;
    }

    $params{'rettype'} = $format;
    if ($CGILOCATION{$mode}[0] eq 'post') {
        return POST($url, [%params]);
    }

    $url->query_form(%params);
    $self->debug("url is $url\n ");
    return GET($url);
}

# Clean up downloaded data before it is loaded into a sequence parser.
#
# Args    : hash with two keys
#             'type'     - 'string' or 'file' (case-insensitive)
#             'location' - a reference to the string holding the data, or
#                          the name of the file holding it
# Returns : nothing.  The data is rewritten in place (string reference or
#           file).  Nothing is written when the arguments are incomplete,
#           the type is unknown, or a CONTIG record cannot be assembled.
# Throws  : via $self->throw when the file cannot be read or rewritten.
#
# Two transformations are applied:
#   * a record carrying a "CONTIG join(...)" line has the referenced
#     sequences fetched through get_Stream_by_acc() and spliced together;
#     the CONTIG section is then replaced by BASE COUNT / ORIGIN lines.
#     Otherwise HTML anchors are reduced to their link text.
#   * the entities &gt; and &lt; are converted to > and <.
sub postprocess_data {

    my ($self, %args) = @_;

    my $type     = defined $args{'type'} ? uc $args{'type'} : '';
    my $location = $args{'location'};
    return unless defined $location
        && ($type eq 'STRING' || $type eq 'FILE');

    my $data;
    if( $type eq 'STRING' ) {
        $data = $$location;
    } else {
        # Three-argument open: the file name is never interpreted as a mode.
        open(my $in, '<', $location)
            or $self->throw("could not open file $location");
        $data = do { local $/; <$in> };
        close $in;
    }

    # transform links to appropriate descriptions
    if ($data =~ /\nCONTIG\s+/) {
        $self->warn("CONTIG found. GenBank get_Stream_by_acc about to run.");
        my (@accession, @location, %accessions);

        # process GenBank CONTIG join(...) into two parallel arrays
        unless ($data =~ /(?:CONTIG\s+join\()((?:.+\n)+)(?:\/\/)/) {
            # Not the join(...) layout handled here; leave the data alone
            # rather than working from a stale capture.
            $self->warn("unable to parse CONTIG line\n");
            return;
        }
        my $join = $1;
        $join =~ s/\n|\)//g;
        foreach my $part (split /\s*,\s*/, $join) {
            my ($acc, $range);
            if ($part =~ />(.+)<.+>:(.+)/) {
                # accession wrapped in an HTML anchor
                ($acc, $range) = ($1, $2);
            } elsif ($part =~ /([\w\.]+):(.+)/) {
                ($acc, $range) = ($1, $2);
            } else {
                next;
            }
            my ($id) = split /\./, $acc;    # drop the version suffix
            push @accession, $id;
            push @location, $range;
            $accessions{$id}->{'count'}++;
        }

        # grab multiple sequences by batch and join based on location
        my @unique_accessions = keys %accessions;
        my $stream = $self->get_Stream_by_acc(\@unique_accessions);
        my $ct = 0;
        while( my $seq = $stream->next_seq() ) {
            # Ignore anything beyond what was asked for.
            last if $ct > $#unique_accessions;
            my $expected = $unique_accessions[$ct++];
            if( $seq->accession_number !~ /\Q$expected\E/ ) {
                printf STDERR "warning, %s does not match %s\n",
                    $seq->accession_number, $expected;
            }
            $accessions{$expected}->{'seq'} = $seq;
        }

        my $contig = "";
        for my $i (0 .. $#accession) {
            my $seq = $accessions{$accession[$i]}->{'seq'};
            unless( defined $seq ) {
                $self->warn("unable to find sequence $accession[$i]\n");
                return;
            }
            my ($start, $end) = split(/\.\./, $location[$i]);
            $end = $start unless defined $end;    # single-position location
            # subseq() takes start and end coordinates, not a length.
            $contig .= $seq->subseq($start, $end);
        }

        # count number of each letter in sequence
        my $aCount = ($contig =~ tr/aA//);
        my $cCount = ($contig =~ tr/cC//);
        my $gCount = ($contig =~ tr/gG//);
        my $tCount = ($contig =~ tr/tT//);

        # Remove everything from the CONTIG line onwards.  Anchored to the
        # start of a line and case-sensitive, so the word "contig" inside
        # e.g. a DEFINITION line does not truncate the record.
        $data =~ s/(?<=\n)CONTIG\s[\s\S]*\z//;

        # build ORIGIN part of data file using sequence and counts
        $data .= "BASE COUNT     $aCount a   $cCount c   $gCount g   $tCount t\n";
        $data .= "ORIGIN\n      ";
        $data .= "$contig\n//";
    }
    else {
        $data =~ s/<a\s+href\s*=.+>\s*(\S+)\s*<\s*\/a\s*\>/$1/ig;
    }

    # fix gt and lt
    $data =~ s/&gt;/>/ig;
    $data =~ s/&lt;/</ig;

    if( $type eq 'FILE' ) {
        open(my $out, '>', $location)
            or $self->throw("couldn't overwrite file $location");
        print {$out} $data;
        # Buffered write errors only surface on close.
        close $out
            or $self->throw("couldn't overwrite file $location");
    } else {
        $$location = $data;
    }
    $self->debug("format is ". join(',',$self->request_format()).
                 " data is\n$data\n");

    return;
}

# Body of request_format (the enclosing sub header is not part of this
# listing).  With an argument, stores the pair [request format, SeqIO
# format]; always returns the stored pair as a list.
my ($self, $value) = @_;
if( defined $value ) {
    $value = lc $value;
    # Formats missing from %FORMATMAP fall back to being used verbatim as
    # the SeqIO format.  Alternatively, we could throw an exception.
    my $ioformat = defined $FORMATMAP{$value} ? $FORMATMAP{$value} : $value;
    $self->{'_format'} = [ $value, $ioformat ];
}
return @{$self->{'_format'}};

BEGIN		Code
_check_id	Description	Code
default_format	Description	Code
delay_policy	Description	Code
get_Stream_by_acc	Description	Code
get_Stream_by_batch (reference)	Description	Code
get_Stream_by_query	Description	Code
get_params	Description	Code
get_request	Description	Code
new	No description	Code
postprocess_data	Description	Code
request_format	Description	Code