Bio::EnsEMBL::Compara::Production::GenomicAlignBlock
PairAligner
Toolbar
Summary
Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::PairAligner
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Compara::GenomicAlign(1)
Bio::EnsEMBL::Compara::GenomicAlign(2)
File::Basename
Time::HiRes qw ( time gettimeofday tv_interval )
Inherit
Synopsis
Description
This object is an abstract superclass which must be inherited from.
It uses a runnable which takes sequence as input and returns
FeaturePair objects as output (like Bio::EnsEMBL::Analysis::Runnable::Blastz)
It adds functionality to read and write to a compara databases.
It takes as input (via input_id or analysis->parameters) DnaFragChunk or DnaFragChunkSet
objects (via dbID reference) and stores GenomicAlignBlock entries.
The appropriate Bio::EnsEMBL::Analysis object must be passed for
extraction of appropriate parameters.
Methods
compact_cigar_line | No description | Code |
configure_defaults | No description | Code |
configure_runnable | No description | Code |
db_DnaFragChunkSet | No description | Code |
delete_fasta_dumps_but_these | No description | Code |
dumpChunkSetToWorkdir | No description | Code |
dumpChunkToWorkdir | No description | Code |
dump_loc | No description | Code |
fetch_input | Description | Code |
get_params | No description | Code |
max_alignments | No description | Code |
method_link_type | No description | Code |
options | No description | Code |
print_simple_align | No description | Code |
query_DnaFragChunkSet | No description | Code |
run | No description | Code |
store_featurePair_as_genomicAlignBlock | No description | Code |
write_output | No description | Code |
Methods description
Title : fetch_input Usage : $self->fetch_input Function: Fetches input data for repeatmasker from the database Returns : none Args : none |
Methods code
compact_cigar_line | description | prev | next | Top |
sub compact_cigar_line
{
my $cigar_line = shift;
my @pieces = ( $cigar_line =~ /(\d*[MDI])/g );
my @new_pieces = ();
foreach my $piece (@pieces) {
$piece =~ s/I/M/;
if (! scalar @new_pieces || $piece =~ /D/) {
push @new_pieces, $piece;
next;
}
if ($piece =~ /\d*M/ && $new_pieces[-1] =~ /\d*M/) {
my ($matches1) = ($piece =~ /(\d*)M/);
my ($matches2) = ($new_pieces[-1] =~ /(\d*)M/);
if (! defined $matches1 || $matches1 eq "") {
$matches1 = 1;
}
if (! defined $matches2 || $matches2 eq "") {
$matches2 = 1;
}
$new_pieces[-1] = $matches1 + $matches2 . "M";
} else {
push @new_pieces, $piece;
}
}
my $new_cigar_line = join("", @new_pieces);
return $new_cigar_line; } |
sub configure_defaults
{ my $self = shift;
return 0; } |
sub configure_runnable
{ my $self = shift;
throw("subclass must implement configure_runnable method\n"); } |
sub db_DnaFragChunkSet
{ my $self = shift;
$self->{'_db_DnaFragChunkSet'} = shift if(@_);
unless(defined($self->{'_db_DnaFragChunkSet'})) {
$self->{'_db_DnaFragChunkSet'} = new Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
}
return $self->{'_db_DnaFragChunkSet'};
}
} |
sub delete_fasta_dumps_but_these
{ my $self = shift;
my $fasta_files_not_to_delete = shift;
my $work_dir = $self->worker_temp_directory;
open F, "ls $work_dir|";
while (my $file = <F>) {
chomp $file;
next unless ($file =~ /\.fasta$/);
my $delete = 1;
foreach my $fasta_file (@{$fasta_files_not_to_delete}) {
if ($file eq basename($fasta_file)) {
$delete = 0;
last;
}
}
unlink "$work_dir/$file" if ($delete);
}
close F; } |
sub dumpChunkSetToWorkdir
{
my $self = shift;
my $chunkSet = shift;
my $starttime = time();
my $fastafile = $self->worker_temp_directory. "chunk_set_". $chunkSet->dbID .".fasta";
$fastafile =~ s/\/\//\//g; return $fastafile if(-e $fastafile);
open(OUTSEQ, ">$fastafile")
or $self->throw("Error opening $fastafile for write");
my $output_seq = Bio::SeqIO->new( -fh =>\*OUTSEQ, -format => 'Fasta');
my $chunk_array = $chunkSet->get_all_DnaFragChunks;
if($self->debug){printf("dumpChunkSetToWorkdir : %s : %d chunks\n", $fastafile, $chunkSet->count());}
foreach my $chunk (@$chunk_array) {
my $bioseq = $chunk->bioseq;
if($chunk->sequence_id==0) {
$self->{'comparaDBA'}->get_DnaFragChunkAdaptor->update_sequence($chunk);
}
$output_seq->write_seq($bioseq);
}
close OUTSEQ;
if($self->debug){printf(" %1.3f secs to dump\n", (time()-$starttime));}
return $fastafile } |
sub dumpChunkToWorkdir
{
my $self = shift;
my $chunk = shift;
my $starttime = time();
my $fastafile = $self->worker_temp_directory .
"chunk_" . $chunk->dbID . ".fasta";
$fastafile =~ s/\/\//\//g; return $fastafile if(-e $fastafile);
if($self->debug){print("dumpChunkToWorkdir : $fastafile\n");}
$chunk->cache_sequence;
$chunk->dump_to_fasta_file($fastafile);
if($self->debug){printf(" %1.3f secs to dump\n", (time()-$starttime));}
return $fastafile } |
sub dump_loc
{ my $self = shift;
$self->{'_dump_loc'} = shift if(@_);
return $self->{'_dump_loc'}; } |
sub fetch_input
{ my( $self) = @_;
$self->configure_defaults();
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
$self->get_params($self->parameters);
$self->get_params($self->input_id);
throw("Missing qyChunk(s)") unless($self->query_DnaFragChunkSet->count > 0);
throw("Missing dbChunk") unless($self->db_DnaFragChunkSet->count > 0);
throw("Missing method_link_type") unless($self->method_link_type);
my ($first_qy_chunk) = @{$self->query_DnaFragChunkSet->get_all_DnaFragChunks};
my ($first_db_chunk) = @{$self->db_DnaFragChunkSet->get_all_DnaFragChunks};
my $mlss = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
$mlss->method_link_type($self->method_link_type);
if ($first_qy_chunk->dnafrag->genome_db->dbID == $first_db_chunk->dnafrag->genome_db->dbID) {
$mlss->species_set([$first_qy_chunk->dnafrag->genome_db]);
} else {
$mlss->species_set([$first_qy_chunk->dnafrag->genome_db,
$first_db_chunk->dnafrag->genome_db]);
}
$self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor->store($mlss);
$self->{'method_link_species_set'} = $mlss;
if ($self->max_alignments) {
my $sth = $self->{'comparaDBA'}->dbc->prepare("SELECT count(*) FROM genomic_align_block".
" WHERE method_link_species_set_id = ".$mlss->dbID);
$sth->execute();
my ($num_alignments) = $sth->fetchrow_array();
$sth->finish();
if ($num_alignments >= $self->max_alignments) {
throw("Too many alignments ($num_alignments) have been stored already for MLSS ".$mlss->dbID."\n".
" Try changing the parameters or increase the max_alignments option if you think\n".
" your system can cope with so many alignments.");
}
}
$self->configure_runnable();
return 1; } |
sub get_params
{ my $self = shift;
my $param_string = shift;
return unless($param_string);
my $params = eval($param_string);
return unless($params);
if(defined($params->{'qyChunkSetID'})) {
my $chunkset = $self->{'comparaDBA'}->get_DnaFragChunkSetAdaptor->fetch_by_dbID($params->{'qyChunkSetID'});
$self->query_DnaFragChunkSet($chunkset);
}
if(defined($params->{'qyChunk'})) {
my $qy_chunk = $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->fetch_by_dbID($params->{'qyChunk'});
$self->query_DnaFragChunkSet->add_DnaFragChunk($qy_chunk);
}
if(defined($params->{'dbChunkSetID'})) {
my $chunkset = $self->{'comparaDBA'}->get_DnaFragChunkSetAdaptor->fetch_by_dbID($params->{'dbChunkSetID'});
$self->db_DnaFragChunkSet($chunkset);
}
if(defined($params->{'dbChunk'})) {
my $db_chunk = $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->fetch_by_dbID($params->{'dbChunk'});
$self->db_DnaFragChunkSet->add_DnaFragChunk($db_chunk);
}
$self->options($params->{'options'}) if(defined($params->{'options'}));
$self->method_link_type($params->{'method_link'}) if(defined($params->{'method_link'}));
$self->max_alignments($params->{'max_alignments'}) if(defined($params->{'max_alignments'}));
$self->dump_loc($params->{'dump_loc'}) if(defined($params->{'dump_loc'}));
return;
}
} |
sub max_alignments
{ my $self = shift;
$self->{'_max_alignments'} = shift if(@_);
return $self->{'_max_alignments'}; } |
sub method_link_type
{ my $self = shift;
$self->{'_method_link_type'} = shift if(@_);
return $self->{'_method_link_type'}; } |
sub options
{ my $self = shift;
$self->{'_options'} = shift if(@_);
return $self->{'_options'}; } |
sub print_simple_align
{
my $alignment = shift;
my $aaPerLine = shift;
$aaPerLine=40 unless($aaPerLine and $aaPerLine > 0);
my ($seq1, $seq2) = $alignment->each_seq;
my $seqStr1 = "|".$seq1->seq().'|';
my $seqStr2 = "|".$seq2->seq().'|';
my $enddiff = length($seqStr1) - length($seqStr2);
while($enddiff>0) { $seqStr2 .= " "; $enddiff--; }
while($enddiff<0) { $seqStr1 .= " "; $enddiff++; }
my $label1 = sprintf("%40s : ", $seq1->id);
my $label2 = sprintf("%40s : ", "");
my $label3 = sprintf("%40s : ", $seq2->id);
my $line2 = "";
for(my $x=0; $x<length($seqStr1); $x++) {
if(substr($seqStr1,$x,1) eq substr($seqStr2, $x,1)) { $line2.='|'; } else { $line2.=' '; }
}
my $offset=0;
my $numLines = (length($seqStr1) / $aaPerLine); while($numLines>0) {
printf("$label1 %s\n", substr($seqStr1,$offset,$aaPerLine));
printf("$label2 %s\n", substr($line2,$offset,$aaPerLine));
printf("$label3 %s\n", substr($seqStr2,$offset,$aaPerLine));
print("\n\n");
$offset+=$aaPerLine;
$numLines--;
}
}
1; } |
sub query_DnaFragChunkSet
{ my $self = shift;
$self->{'_query_DnaFragChunkSet'} = shift if(@_);
unless(defined($self->{'_query_DnaFragChunkSet'})) {
$self->{'_query_DnaFragChunkSet'} = new Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
}
return $self->{'_query_DnaFragChunkSet'}; } |
sub run
{
my $self = shift;
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);
my $starttime = time();
foreach my $runnable (@{$self->runnable}) {
throw("Runnable module not set") unless($runnable);
$runnable->run();
}
if($self->debug){printf("%1.3f secs to run %s pairwise\n", (time()-$starttime), $self->method_link_type);}
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
return 1; } |
store_featurePair_as_genomicAlignBlock | description | prev | next | Top |
sub store_featurePair_as_genomicAlignBlock
{
my $self = shift;
my $fp = shift;
my $qyChunk = undef;
my $dbChunk = undef;
if($fp->seqname =~ /chunkID(\d*):/) {
my $chunk_id = $1;
$qyChunk = $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->
fetch_by_dbID($chunk_id);
}
if($fp->hseqname =~ /chunkID(\d*):/) {
my $chunk_id = $1;
$dbChunk = $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->
fetch_by_dbID($chunk_id);
}
unless($qyChunk and $dbChunk) {
warn("unable to determine DnaFragChunk objects from FeaturePair");
return undef;
}
if($self->debug > 1) {
print("qyChunk : ",$qyChunk->display_id,"\n");
print("dbChunk : ",$dbChunk->display_id,"\n");
print STDOUT $fp->seqname."\t".
$fp->start."\t".
$fp->end."\t".
$fp->strand."\t".
$fp->hseqname."\t".
$fp->hstart."\t".
$fp->hend."\t".
$fp->hstrand."\t".
$fp->score."\t".
$fp->percent_id."\t".
$fp->cigar_string."\n";
}
$fp->slice($qyChunk->slice);
$fp->hslice($dbChunk->slice);
if($self->debug > 2) {
print_simple_align($fp->get_SimpleAlign, 80);
my $testChunk = new Bio::EnsEMBL::Compara::Production::DnaFragChunk();
$testChunk->dnafrag($qyChunk->dnafrag);
$testChunk->seq_start($qyChunk->seq_start+$fp->start-1);
$testChunk->seq_end($qyChunk->seq_start+$fp->end-1);
my $bioseq = $testChunk->bioseq;
print($bioseq->seq, "\n");
}
my $genomic_align1 = new Bio::EnsEMBL::Compara::GenomicAlign;
$genomic_align1->method_link_species_set($self->{'method_link_species_set'});
$genomic_align1->dnafrag($qyChunk->dnafrag);
$genomic_align1->dnafrag_start($qyChunk->seq_start + $fp->start -1);
$genomic_align1->dnafrag_end($qyChunk->seq_start + $fp->end -1);
$genomic_align1->dnafrag_strand($fp->strand);
$genomic_align1->level_id(1);
my $cigar1 = $fp->cigar_string;
$cigar1 =~ s/I/M/g;
$cigar1 = compact_cigar_line($cigar1);
$cigar1 =~ s/D/G/g;
$genomic_align1->cigar_line($cigar1);
my $genomic_align2 = new Bio::EnsEMBL::Compara::GenomicAlign;
$genomic_align2->method_link_species_set($self->{'method_link_species_set'});
$genomic_align2->dnafrag($dbChunk->dnafrag);
$genomic_align2->dnafrag_start($dbChunk->seq_start + $fp->hstart -1);
$genomic_align2->dnafrag_end($dbChunk->seq_start + $fp->hend -1);
$genomic_align2->dnafrag_strand($fp->hstrand);
$genomic_align2->level_id(1);
my $cigar2 = $fp->cigar_string;
$cigar2 =~ s/D/M/g;
$cigar2 =~ s/I/D/g;
$cigar2 = compact_cigar_line($cigar2);
$cigar2 =~ s/D/G/g;
$genomic_align2->cigar_line($cigar2);
if($self->debug > 1) {
print("original cigar_line ",$fp->cigar_string,"\n");
print(" $cigar1\n");
print(" $cigar2\n");
}
my $GAB = new Bio::EnsEMBL::Compara::GenomicAlignBlock;
$GAB->method_link_species_set($self->{'method_link_species_set'});
$GAB->genomic_align_array([$genomic_align1, $genomic_align2]);
$GAB->score($fp->score);
$GAB->perc_id($fp->percent_id);
$GAB->length($fp->alignment_length);
$self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor->store($GAB);
if ($self->debug) {
my $track_sql = "INSERT IGNORE INTO genomic_align_block_job_track ".
"(genomic_align_block_id, analysis_job_id) ".
"VALUES (".$GAB->dbID.",".$self->input_job->dbID.")";
print("$track_sql\n") if($self->debug);
$self->{'comparaDBA'}->dbc->do($track_sql);
}
if($self->debug > 2) { print_simple_align($GAB->get_SimpleAlign, 80);}
return $GAB; } |
sub write_output
{ my( $self) = @_;
my $starttime = time();
foreach my $fp ($self->output) {
if($fp->isa('Bio::EnsEMBL::FeaturePair')) {
$fp->analysis($self->analysis);
$self->store_featurePair_as_genomicAlignBlock($fp);
}
}
if($self->debug){printf("%d FeaturePairs found\n", scalar($self->output));}
}
} |
General documentation
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _