Raw content of BioMart::Dataset::GenomicSequence::DNAAdaptor # # BioMart module for BioMart::Dataset::GenomicRegion::DNAAdaptor # # You may distribute this module under the same terms as perl itself # POD documentation - main docs before the code =head1 NAME BioMart::Dataset::GenomicSequence::DNAAdaptor.pm =head1 SYNOPSIS =head1 DESCRIPTION =head1 AUTHOR Darin London, Damian Smedley =head1 CONTACT This module is part of the BioMart project http://www.biomart.org Questions can be posted to the mart-dev mailing list: mart-dev@ebi.ac.uk =head1 METHODS =cut package BioMart::Dataset::GenomicSequence::DNAAdaptor; use strict; use warnings; use DBI; use base qw(BioMart::Root); use Log::Log4perl; my $logger=Log::Log4perl->get_logger(__PACKAGE__); use constant SEQNAME => "seq_name"; use constant SEQTABLENAME => "dna_tablename"; use constant SEQFIELDNAME => "seq_fieldname"; use constant CHUNKNAMEFIELD => "chunk_name_fieldname"; use constant CHUNKSTARTFIELD => "chunk_start_fieldname"; use constant CONF => "configurator"; use constant CHUNKSIZE => "chunk_size"; use constant TITLES => [ SEQNAME, SEQTABLENAME, SEQFIELDNAME, CHUNKNAMEFIELD, CHUNKSTARTFIELD, CONF, CHUNKSIZE ]; use constant SQLFULL => "select %s from %s where %s = ? and %s = ?"; use constant SQLSUB => "select substring(%s, ?, ?) from %s where %s = ? and %s = ?"; use constant SQLSUBORACLE => "select substr(%s, ?, ?) from %s where %s = ? and %s = ?"; =head2 _new Usage : my $dna = BioMart::Dataset::GenomicSequence::DNAAdaptor-> new("seq_name" => $name, "dna_tablename" => $dnatablename, "seq_fieldname" => $seqfieldname, "chunk_name_fieldname" => $chunkfieldname, "chunk_start_fieldname" => $chunk_start_fieldname, "configurator" => $conf, "chunk_size" => 10 ); Description : creates a DNAAdaptor, for retrieving dna sequence. Requires a name, which maps to a mart sequence database, the name of the dna table within the database(eg, dna_chunks in ensembl) , and the names of that tables sequence, chunkname, and chunk_start fields (eg, sequence, chr_name, chr_start in ensmbl), a BioMart::Configurator object, and the dna chunk size. Returntype : BioMart::Dataset::GenomicSequence::DNAAdaptor object. Meant for use by the BioMart::Dataset::GenomicSequence object. Exceptions : Missing or invalid params Caller : BioMart::Dataset::GenomicSequence =cut sub _new { my ($self, @param) = @_; $self->SUPER::_new(@param); $self->addParams(TITLES, @param); $self->checkRequiredParams(TITLES); $self->attr('fullSth', undef); $self->attr('subSth', undef); $self->attr('dbh', undef); $self->_initialize; } #private methods sub _initialize { my $self = shift; #connect to the sequence mart my $dbname = $self->getParam(SEQNAME); my $location=$self->getParam(CONF)->get('location'); $location->openConnection(); my $dbh = $location->dbh(); if ($self->getParam('configurator')->get('location')->databaseType eq 'oracle'){ $dbh->{LongReadLen} = 2**25; } $self->throw("Could not connect to sequence db $dbname ".DBI::errstr."!\n") unless ($dbh); my $fullSQL = sprintf(SQLFULL, $self->getParam(SEQFIELDNAME), $self->getParam(SEQTABLENAME), $self->getParam(CHUNKSTARTFIELD), $self->getParam(CHUNKNAMEFIELD)); my $subSQL; if ($self->getParam('configurator')->get('location')->databaseType eq 'oracle'){ $subSQL = sprintf(SQLSUBORACLE, $self->getParam(SEQFIELDNAME), $self->getParam(SEQTABLENAME), $self->getParam(CHUNKSTARTFIELD), $self->getParam(CHUNKNAMEFIELD)); } else{ $subSQL = sprintf(SQLSUB, $self->getParam(SEQFIELDNAME), $self->getParam(SEQTABLENAME), $self->getParam(CHUNKSTARTFIELD), $self->getParam(CHUNKNAMEFIELD)); } my $fullSth = $dbh->prepare($fullSQL) or $self->throw("Couldnt prepare fullSQL ".$dbh->errstr."!\n"); my $subSth = $dbh->prepare($subSQL) or $self->throw("Couldnt prepare subSQL ".$dbh->errstr."!\n"); #my $logger=Log::Log4perl->get_logger(__PACKAGE__); #$logger->info("QUERY FULL SQL: $fullSQL"); #$logger->info("QUERY SUB SQL: $subSQL"); $self->set('fullSth', $fullSth); $self->set('subSth', $subSth); $self->set('dbh', $dbh); } sub _Npad{ my ($self, $num)= @_; my $ret = ''; my $i = 0; while ($i < $num) { $ret .= 'N'; $i++; } return $ret; } sub _fetchSequence { my ($self, $chr, $start, $len) = @_; my $chunkSize = $self->getParam(CHUNKSIZE); my $chunkStart = $start - ( ( $start - 1 ) % $chunkSize ); if ($start == $chunkStart && $len == $chunkSize) { return $self->_fetchFullChunk($chr, $chunkStart); } return $self->_fetchChunkSubstring($chr, $start, $chunkStart, $len); } sub _fetchFullChunk { my ($self, $chr, $chunkStart) = @_; my $sth = $self->get('fullSth'); my $sql_statement = $sth->{Statement}; $sql_statement =~ s/\?/$_/ foreach ("\"$chunkStart\"", "\"$chr\""); $logger->info("QUERY FULL SQL: $sql_statement\;"); $sth->execute($chunkStart, $chr); my $ret = $sth->fetchrow; $sth->finish; $self->set('fullSth', $sth); return $ret; } sub _fetchChunkSubstring { my ($self, $chr, $start, $chunkStart, $len) = @_; my $coord = $start - $chunkStart + 1; my $sth = $self->get('subSth'); my $sql_statement = $sth->{Statement}; $sql_statement =~ s/\?/$_/ foreach ("\"$coord\"", "\"$len\"", "\"$chunkStart\"", "\"$chr\""); $logger->info("QUERY SUBSTRING SQL: $sql_statement\;"); $sth->execute($coord, $len, $chunkStart, $chr); my $ret = $sth->fetchrow; $sth->finish; $self->set('subSth', $sth); return $ret; } sub _fetchResidualSequence { my ($self, $chr, $start, $len, $initialSeq) = @_; my $currentLength = length(${$initialSeq}); my $currentStart = $start + $currentLength; while ($currentLength < $len) { my $residual = $len - $currentLength; my $curr = $self->_fetchSequence($chr, $currentStart, $residual); my $currLength = length($curr); last if ($currLength < 1); ${$initialSeq} .= $curr; $currentLength += $currLength; $currentStart = $start + $currentLength; } } # interface methods #override Root::throw to close before throw sub throw { my ($self, $message) = @_; $self->close; $self->SUPER::throw($message); } #public methods =head2 getSequence Usage : my $seq = $dna->getSequence($chunk_name, $start, $end); Description : gets the dna sequence at a particular genomic location chunk_name corresponds to the value that would occur in the chunk_name_fieldname field passed to the constructor for DNAAdaptor (eg. the value of chr_name in ensembl). start corresonds to the value in the chunk_start_fieldname field passed to the constructor (eg. the value of chr_start in ensembl). Returntype : scalar $seq Exceptions : none Caller : BioMart::SubSequence::GenomicSequence =cut sub getSequence { my ($self, $chr, $start, $end) = @_; my $len = ($end - $start) + 1; my $ret = $self->_fetchSequence($chr, $start, $len); my $seqLen = 0; if ($ret) { $seqLen = length($ret); } unless ($seqLen) { $logger->info("Padding with Ns"); return $self->_Npad($len); } if ($seqLen < $len) { #in place modification of reference $ret $self->_fetchResidualSequence($chr, $start, $len, \$ret); } return $ret; } =head2 close Usage : $dna->close; Description : closes all DBI resources involved with fetching sequences. To be called by the sequence parser when it has exhausted its resultSet for the given query. Returntype : none Exceptions : none Caller : sequence parser module =cut sub close { my $self = shift; my $dbh = $self->get('dbh'); if ($dbh) { my $fullSth = $self->get('fullSth'); my $subSth = $self->get('subSth'); if ($fullSth) { $fullSth->finish; } if ($subSth) { $subSth->finish; } $dbh->disconnect; } } sub DESTROY { my $self = shift; $self->close; } 1;