Raw content of Bio::EnsEMBL::Compara::Production::DnaFragChunk
=head1 NAME - Bio::EnsEMBL::Compara::Production::DnaFragChunk
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 CONTACT
Jessica Severin
=head1 APPENDIX
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::Production::DnaFragChunk;
use strict;
use Bio::EnsEMBL::Compara::DnaFrag;
use Bio::EnsEMBL::Compara::DBSQL::SequenceAdaptor;
use Bio::EnsEMBL::Utils::Exception;
use Time::HiRes qw(time gettimeofday tv_interval);
sub new {
my ($class, $dnafrag, $start, $end) = @_;
my $self = {};
bless $self,$class;
$self->dnafrag($dnafrag) if($dnafrag);
$self->seq_start($start) if($start);
$self->seq_end($end) if($end);
return $self;
}
=head2 slice
Arg : none
Example : $slice = $chunk->slice;
Description: Meta method which uses the dnafrag of this chunk to get the genomeDB
to connect to corresponding core database, and then to uses the core
SliceAdaptor to get a slice associated with the dnafrag type and name and
the this chunks start,end.
Returntype : Bio::EnsEMBL::Slice object
Exceptions : none
Caller : general, self->fetch_masked_sequence()
=cut
sub slice {
my ($self) = @_;
return $self->{'_slice'} if(defined($self->{'_slice'}));
return undef unless($self->dnafrag);
return undef unless($self->dnafrag->genome_db);
return undef unless(my $dba = $self->dnafrag->genome_db->db_adaptor);
my $sliceDBA = $dba->get_SliceAdaptor;
#if ($self->seq_end > $self->seq_start) {
#Should be >= to since end can equal start and the slice be one base long
#If seq_start and seq_end are both 0, set the slice to be whole dnafrag
if ($self->seq_end >= $self->seq_start && $self->seq_end != 0) {
$self->{'_slice'} = $sliceDBA->fetch_by_region($self->dnafrag->coord_system_name,
$self->dnafrag->name,
$self->seq_start, $self->seq_end);
} else {
$self->{'_slice'} = $sliceDBA->fetch_by_region($self->dnafrag->coord_system_name,
$self->dnafrag->name);
}
return $self->{'_slice'};
}
=head2 fetch_masked_sequence
Description: Meta method which uses the slice associated with this chunk
and from the external core database associated with the slice
it extracts the masked DNA sequence.
Returns as Bio::Seq object. does not cache sequence internally
Arg [1] : (int or string) masked status of the sequence [optional]
0 or '' = unmasked (default)
1 or 'hard' = masked
2 or 'soft' = soft-masked
Arg[2] : (ref to hash) hash of masking options [optional]
Example : $bioseq = $chunk->get_sequence(1);
Returntype : Bio::Seq or undef if a problem
Exceptions : none
Caller : general
=cut
sub fetch_masked_sequence {
my $self = shift;
return undef unless(my $slice = $self->slice());
my $dcs = $slice->adaptor->db->dbc->disconnect_when_inactive();
#print("fetch_masked_sequence disconnect=$dcs\n");
$slice->adaptor->db->dbc->disconnect_when_inactive(0);
#printf("fetch_masked_sequence disconnect=%d\n", $slice->adaptor->db->dbc->disconnect_when_inactive());
my $seq;
my $id = $self->display_id;
my $masking_options;
my $starttime = time();
if(defined($self->masking_options)) {
$masking_options = eval($self->masking_options);
my $logic_names = $masking_options->{'logic_names'};
if(defined($masking_options->{'default_soft_masking'}) and
$masking_options->{'default_soft_masking'} == 0)
{
#print "getting HARD masked sequence...\n";
$seq = $slice->get_repeatmasked_seq($logic_names,0,$masking_options);
} else {
#print "getting SOFT masked sequence...\n";
$seq = $slice->get_repeatmasked_seq($logic_names,1,$masking_options);
}
}
else { # no masking options set, so get unmasked sequence
#print "getting UNMASKED sequence...\n";
$seq = Bio::PrimarySeq->new( -id => $id, -seq => $slice->seq);
}
unless($seq->isa('Bio::PrimarySeq')) {
#print("seq is a [$seq] not a [Bio::PrimarySeq]\n");
my $oldseq = $seq;
$seq = Bio::PrimarySeq->new( -id => $id, -seq => $oldseq->seq);
}
#print ((time()-$starttime), " secs\n");
$slice->adaptor->db->dbc->disconnect_when_inactive($dcs);
#printf("fetch_masked_sequence disconnect=%d\n", $slice->adaptor->db->dbc->disconnect_when_inactive());
#print STDERR "sequence length : ",$seq->length,"\n";
$seq = $seq->seq;
if (defined $masking_options) {
foreach my $ae (@{$slice->get_all_AssemblyExceptionFeatures}) {
next unless (defined $masking_options->{"assembly_exception_type_" . $ae->type});
my $length = $ae->end - $ae->start + 1;
if ($masking_options->{"assembly_exception_type_" . $ae->type} == 0) {
my $padstr = 'N' x $length;
substr ($seq, $ae->start, $length) = $padstr;
} elsif ($masking_options->{"assembly_exception_type_" . $ae->type} == 1) {
my $padstr = lc substr ($seq, $ae->start, $length);
substr ($seq, $ae->start, $length) = $padstr;
}
}
}
$self->sequence($seq);
return $seq;
}
=head2 display_id
Args : none
Example : my $id = $chunk->display_id;
Description: returns string describing this chunk which can be used
as display_id of a Bio::Seq object or in a fasta file.
Uses dnafrag information in addition to start and end.
Returntype : string
Exceptions : none
Caller : general
=cut
sub display_id {
my $self = shift;
my $id = "";
if($self->dbID) {
$id .= 'chunkID'.$self->dbID.":";
} elsif($self->dnafrag) {
$id .= $self->dnafrag->display_id.":";
}
$id .= $self->seq_start.".".$self->seq_end;
return $id;
}
=head2 bioseq
Description: returns sequence of this chunk as a Bio::Seq object
will fetch from core if not previously cached in compara
Args : none
Example : my $bioseq = $chunk->bioseq;
Returntype : Bio::Seq object
Exceptions : none
Caller : general
=cut
sub bioseq {
my $self = shift;
my $seq = undef;
if(not defined($self->sequence())) {
my $starttime = time();
$seq = $self->fetch_masked_sequence;
my $fetch_time = time()-$starttime;
$self->sequence($seq);
}
$seq = Bio::Seq->new(-seq => $self->sequence(),
-display_id => $self->display_id(),
-primary_id => $self->sequence_id(),
);
return $seq;
}
##########################
#
# getter/setter methods of data which is stored in database
#
##########################
sub adaptor {
my $self = shift;
$self->{'_adaptor'} = shift if(@_);
return $self->{'_adaptor'};
}
sub dbID {
my $self = shift;
$self->{'_dbID'} = shift if(@_);
return $self->{'_dbID'};
}
sub dnafrag {
my ($self,$dnafrag) = @_;
if (defined($dnafrag)) {
throw("arg must be a [Bio::EnsEMBL::Compara::DnaFrag] not a [$dnafrag]")
unless($dnafrag->isa('Bio::EnsEMBL::Compara::DnaFrag'));
$self->{'_dnafrag'} = $dnafrag;
$self->dnafrag_id($dnafrag->dbID);
}
#lazy load the DnaFrag
if(!defined($self->{'_dnafrag'}) and defined($self->dnafrag_id) and $self->adaptor) {
$self->{'_dnafrag'} = $self->adaptor->_fetch_DnaFrag_by_dbID($self->dnafrag_id);
}
return $self->{'_dnafrag'};
}
sub dnafrag_id {
my $self = shift;
return $self->{'dnafrag_id'} = shift if(@_);
return $self->{'dnafrag_id'};
}
sub seq_start {
my $self = shift;
return $self->{'seq_start'} = shift if(@_);
$self->{'seq_start'}=0 unless(defined($self->{'seq_start'}));
return $self->{'seq_start'};
}
sub seq_end {
my $self = shift;
my $end = shift;
if($end) {
$end=$self->dnafrag->length if($self->dnafrag and ($end > $self->dnafrag->length));
$self->{'seq_end'} = $end;
}
$self->{'seq_end'}=0 unless(defined($self->{'seq_end'}));
return $self->{'seq_end'};
}
sub length {
my $self = shift;
return $self->{'seq_end'} - $self->{'seq_start'} + 1;
}
sub sequence_id {
my $self = shift;
return $self->{'sequence_id'} = shift if(@_);
return $self->{'sequence_id'};
}
sub sequence {
my $self = shift;
if(@_) {
$self->{'_sequence'} = shift;
$self->sequence_id(0);
}
return $self->{'_sequence'} if(defined($self->{'_sequence'}));
#lazy load the sequence if sequence_id is set
if(defined($self->sequence_id()) and defined($self->adaptor())) {
$self->{'_sequence'} = $self->adaptor->db->get_SequenceAdaptor->fetch_by_dbID($self->sequence_id);
}
return $self->{'_sequence'};
}
sub masking_options {
my $self = shift;
if(@_) {
$self->{'_masking_options'} = shift;
$self->masking_analysis_data_id(0);
}
return $self->{'_masking_options'};
}
#method for passing previously known and stored analysis_data_id reference around
#so that there is no need to store it again
sub masking_analysis_data_id {
my $self = shift;
$self->{'masking_analysis_data_id'} = shift if(@_);
$self->{'masking_analysis_data_id'}=0 unless(defined($self->{'masking_analysis_data_id'}));
return $self->{'masking_analysis_data_id'};
}
sub dump_to_fasta_file
{
my $self = shift;
my $fastafile = shift;
my $bioseq = $self->bioseq;
#printf(" writing chunk %s\n", $self->display_id);
open(OUTSEQ, ">$fastafile")
or $self->throw("Error opening $fastafile for write");
my $output_seq = Bio::SeqIO->new( -fh =>\*OUTSEQ, -format => 'Fasta');
$output_seq->write_seq($bioseq);
close OUTSEQ;
return $self;
}
#Version of dump_to_fasta_file that appends chunk_size portions of the
#sequence to fastafile. Useful if the sequence is very long eg opossum chr1 & 2
#Must remember that fastafile should not exist beforehand otherwise it will be
#appended to so check if it exists and delete it
sub dump_chunks_to_fasta_file
{
my $self = shift;
my $fastafile = shift;
#choosen because it is divisible by 60 (fasta line width used)
my $chunk_size = 99999960;
my $start = 1;
my $end = $chunk_size;
my $total_length = $self->length;
#check to see if the fastafile already exists and delete it if it does.
if (-e $fastafile) {
unlink $fastafile;
}
open(OUTSEQ, ">>$fastafile")
or $self->throw("Error opening $fastafile for write");
print OUTSEQ ">" . $self->display_id . "\n";
while ($start <= $total_length) {
$self->{'_slice'} = undef;
$self->{'_sequence'} = undef;
$self->seq_start($start);
$self->seq_end($end);
my $bioseq = $self->bioseq;
#increment start and end to do next chunk
$start = $end + 1;
if ($end+$chunk_size < $total_length) {
$end += $chunk_size;
} else {
$end = $total_length;
}
#write out sequence to fasta file
my $seq = $bioseq->seq;
$seq =~ s/(.{60})/$1\n/g;
$seq =~ s/\n$//;
print OUTSEQ $seq, "\n";
}
close OUTSEQ;
return $self;
}
sub cache_sequence
{
my $self = shift;
# $self->sequence will load from compara if available, if not go fetch it
unless($self->sequence) {
$self->fetch_masked_sequence;
}
# fetching sequence will set $self->sequence but not sequence_id so store it
# fetching may have occurred in a previous method call, so keep this logic
# separate to make sure it will get stored
if($self->sequence_id==0) {
$self->adaptor->update_sequence($self);
}
return $self;
}
1;