sub fetch_input
{ my( $self) = @_;
throw("No input id") unless defined($self->input_id);
my ($seq_name, $target_chunk_total, $target_chunk_id);
if ($self->input_id =~ /^([^:]+):(\d+):(\d+)$/) {
($seq_name, $target_chunk_total, $target_chunk_id) = ($1, $2, $3);
} elsif ($self->input_id =~ /(\S+)/) {
($seq_name, $target_chunk_total, $target_chunk_id) = ($1, 1, 1);
} else {
throw("Input id could not be parsed: ", $self->input_id);
}
$self->OUTPUT_GROUP_TYPE("chain") unless (defined $self->OUTPUT_GROUP_TYPE);
my $q_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->new(%{$self->QUERY_CORE_DB});
my $t_dbh = Bio::EnsEMBL::DBSQL::DBAdaptor->new(%{$self->TARGET_CORE_DB});
my $compara_dbh = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(%{$self->COMPARA_DB});
my $query_species = $q_dbh->get_MetaContainerAdaptor->get_Species->binomial;
my $target_species = $t_dbh->get_MetaContainerAdaptor->get_Species->binomial;
my $q_gdb = $compara_dbh->get_GenomeDBAdaptor->fetch_by_name_assembly($query_species);
my $t_gdb = $compara_dbh->get_GenomeDBAdaptor->fetch_by_name_assembly($target_species);
my ($q_assembly_version, $t_assembly_version);
eval {
$q_assembly_version = $q_dbh->get_CoordSystemAdaptor->fetch_by_name
('toplevel',
$q_gdb->assembly);
$t_assembly_version = $t_dbh->get_CoordSystemAdaptor->fetch_by_name
('toplevel',
$t_gdb->assembly);
};
$@ and do {
throw("Had trouble fetching coord systems for ".
$q_gdb->assembly . " and " .
$t_gdb->assembly . " from core dbs: $@");
};
my $ref_slice = $q_dbh->get_SliceAdaptor->fetch_by_region('toplevel',
$seq_name,
undef,
undef,
undef,
$q_assembly_version);
throw("Could not fetch top level query slice for '$seq_name'") if not defined $ref_slice;
my $mlss = $compara_dbh->get_MethodLinkSpeciesSetAdaptor
->fetch_by_method_link_type_GenomeDBs($self->INPUT_METHOD_LINK_TYPE,
[$q_gdb, $t_gdb]);
throw("No MethodLinkSpeciesSet for :\n" .
$self->INPUT_METHOD_LINK_TYPE . "\n" .
$query_species . "\n" .
$target_species)
if not $mlss;
my $out_mlss = $compara_dbh->get_MethodLinkSpeciesSetAdaptor
->fetch_by_method_link_type_GenomeDBs($self->OUTPUT_METHOD_LINK_TYPE,
[$q_gdb, $t_gdb]);
throw("No MethodLinkSpeciesSet for :\n" .
$self->OUTPUT_METHOD_LINK_TYPE . "\n" .
$query_species . "\n" .
$target_species)
if not $out_mlss;
$self->output_MethodLinkSpeciesSet($out_mlss);
my $ref_dnafrag = $compara_dbh->get_DnaFragAdaptor->fetch_by_GenomeDB_and_name($q_gdb,
$seq_name);
print STDERR "Fetching all GenomicAlignBlocks and sorting them by target...\n";
my %blocks_by_target;
foreach my $block (@{$compara_dbh->get_GenomicAlignBlockAdaptor
->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss, $ref_dnafrag)}) {
my ($tg_al) = @{$block->get_all_non_reference_genomic_aligns};
my $tg_name = $tg_al->dnafrag->name;
$block->{'genomic_align_array'} = undef;
push @{$blocks_by_target{$tg_name}}, $block;
}
print STDERR "Gathering sequence converting features for id group $target_chunk_id of $target_chunk_total...\n";
my @all_target_names = sort keys %blocks_by_target;
my (%target_slices, %features_by_target);
for(my $i = $target_chunk_id - 1;
$i < @all_target_names;
$i += $target_chunk_total) {
my $target_name = $all_target_names[$i];
$target_slices{$target_name} =
$t_dbh->get_SliceAdaptor->fetch_by_region('toplevel',
$target_name,
undef,
undef,
undef,
$t_assembly_version);
foreach my $target_block (@{$blocks_by_target{$target_name}}) {
my ($qy_al) = $target_block->reference_genomic_align;
my ($tg_al) = @{$target_block->get_all_non_reference_genomic_aligns};
if (not exists($self->query_DnaFrag_hash->{$qy_al->dnafrag->name})) {
$self->query_DnaFrag_hash->{$qy_al->dnafrag->name} = $qy_al->dnafrag;
}
if (not exists($self->target_DnaFrag_hash->{$target_name})) {
$self->target_DnaFrag_hash->{$tg_al->dnafrag->name} = $tg_al->dnafrag;
}
my $daf_cigar = $self->daf_cigar_from_compara_cigars($qy_al->cigar_line,
$tg_al->cigar_line);
if (defined $daf_cigar) {
my $daf = Bio::EnsEMBL::DnaDnaAlignFeature->new
(-seqname => $qy_al->dnafrag->name,
-start => $qy_al->dnafrag_start,
-end => $qy_al->dnafrag_end,
-strand => $qy_al->dnafrag_strand,
-hseqname => $tg_al->dnafrag->name,
-hstart => $tg_al->dnafrag_start,
-hend => $tg_al->dnafrag_end,
-hstrand => $tg_al->dnafrag_strand,
-cigar_string => $daf_cigar);
push @{$features_by_target{$tg_al->dnafrag->name}}, $daf;
}
}
}
print STDERR "Sorting resulting features into batches...\n";
my $target_batches = $self->form_target_batches(\%target_slices);
foreach my $targets (@$target_batches) {
my (%these_target_slices, @features);
foreach my $t_id (@$targets) {
$these_target_slices{$t_id} = $target_slices{$t_id};
push @features, @{$features_by_target{$t_id}};
}
printf(STDERR "Making runnable with %d targets (%d features)\n", scalar(@$targets), scalar(@features));
my %parameters = (-analysis => $self->analysis,
-query_slice => $ref_slice,
-target_slices =>\% these_target_slices,
-query_nib_dir => $self->QUERY_NIB_DIR,
-target_nib_dir => $self->TARGET_NIB_DIR,
-min_chain_score => $self->MIN_CHAIN_SCORE,
-features =>\@ features);
foreach my $program (qw(faToNib lavToAxt axtChain)) {
$parameters{'-' . $program} = $BIN_DIR . "/" . $program;
}
my $runnable = Bio::EnsEMBL::Analysis::Runnable::AlignmentChains->new(%parameters);
$self->runnable($runnable);
}
}
} |
sub form_target_batches
{ my ($self, $t_slices) = @_;
my @batches;
my $batch_index = 0;
my ($total_len, $total_count) = (0,0);
foreach my $hname (keys %{$t_slices}) {
push @{$batches[$batch_index]}, $hname;
$total_count++;
if ($total_count >= 1000) {
$total_count = 0;
$batch_index++;
}
}
return\@ batches;
}
} |