Raw content of Bio::EnsEMBL::Analysis::RunnableDB::MakeSimilarityInputIDs
package Bio::EnsEMBL::Analysis::RunnableDB::MakeSimilarityInputIDs;
use strict;
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild;
use Bio::EnsEMBL::Analysis::Config::GeneBuild::BlastMiniGenewise qw(GENEWISE_CONFIG_BY_LOGIC);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Analysis::Tools::Utilities qw (parse_config);
use Bio::EnsEMBL::Analysis::Tools::Logger qw(logger_info);
use Bio::EnsEMBL::KillList::KillList;
use vars qw (@ISA);
@ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild);
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ($output_logicname, $protein_count, $bmg_logicname) = rearrange
(['OUTPUT_LOGICNAME', 'PROTEIN_COUNT', 'BMG_LOGICNAME'], @args);
#output logicname
#protein count
#bmg logicname
#####
#setting a default
#####
$self->protein_count(20);
#####
### Defaults are over-ridden by parameters given in analysis table...
my $ph = $self->parameters_hash;
$self->protein_count($ph->{-protein_count});
$self->output_logicname($ph->{-output_logicname});
$self->bmg_logicname($ph->{-bmg_logicname});
### ...which are over-ridden by constructor arguments.
$self->protein_count($protein_count);
$self->output_logicname($output_logicname);
$self->bmg_logicname($bmg_logicname);
throw("Need an output logicname ".$self->output_logicname." and a bmg logicname ".
$self->bmg_logicname." defined") if(!$self->output_logicname ||
!$self->bmg_logicname);
parse_config($self, $GENEWISE_CONFIG_BY_LOGIC, $self->bmg_logicname);
return $self;
}
sub fetch_input{
my ($self) = @_;
print "\n\n***Fetching sequence from ".$self->db->dbname."***\n\n";
$self->query($self->fetch_sequence($self->input_id, $self->db, $self->REPEATMASKING));
my $output_analysis = $self->db->get_AnalysisAdaptor->
fetch_by_logic_name($self->output_logicname);
throw("Failed to find an analysis with the logic_name ".
$self->output_logicname." Cannot continue")
if(!$output_analysis);
$self->output_analysis($output_analysis);
}
sub run{
my ($self) = @_;
my %kill_list = %{$self->kill_list} if($self->USE_KILL_LIST);
my @mask_exons;
my @iids;
# remove masked and killed hits as will be done in the build itself
foreach my $type ( @{$self->BIOTYPES_TO_MASK} ) {
foreach my $mask_genes ( @{ $self->gene_slice->get_all_Genes_by_type($type) } ) {
foreach my $mask_exon ( @{ $mask_genes->get_all_Exons } ) {
if ( $mask_exon->seqname eq $self->gene_slice->id ) {
push @mask_exons, $mask_exon;
}
}
}
}
# make the mask list non-redundant. Much faster when checking against features
my @mask_regions;
foreach my $mask_exon ( sort { $a->start <=> $b->start } @mask_exons ) {
if ( @mask_regions and $mask_regions[-1]->{'end'} > $mask_exon->start ) {
if ( $mask_exon->end > $mask_regions[-1]->{'end'} ) {
$mask_regions[-1]->{'end'} = $mask_exon->end;
}
} else {
push @mask_regions, { start => $mask_exon->start, end => $mask_exon->end }
}
}
my $num_seeds = 0;
foreach my $logicname(@{$self->PAF_LOGICNAMES}) {
my %features;
print "FETCHING FEATURES FOR ".$logicname."\n";
my @features = @{$self->paf_slice->get_all_ProteinAlignFeatures($logicname)};
print "HAVE ".@features." features\n";
FEATURE:foreach my $f(@features){
next FEATURE if($self->PAF_MIN_SCORE_THRESHOLD && $f->score < $self->PAF_MIN_SCORE_THRESHOLD);
next FEATURE if($self->PAF_UPPER_SCORE_THRESHOLD && $f->score > $self->PAF_UPPER_SCORE_THRESHOLD);
push(@{$features{$f->hseqname}}, $f);
}
my @ids_to_ignore;
SEQID: foreach my $sid ( keys %features ) {
my $ex_idx = 0;
my $count = 0;
#print STDERR "Looking at $sid\n";
FEAT: foreach my $f ( sort { $a->start <=> $b->start } @{ $features{$sid} } ) {
#printf STDERR "Feature: %d %d\n", $f->start, $f->end;
for ( ; $ex_idx < @mask_regions ; ) {
my $mask_exon = $mask_regions[$ex_idx];
#printf STDERR " Mask exon %d %d\n", $mask_exon->{'start'}, $mask_exon->{'end'};
if ( $mask_exon->{'start'} > $f->end ) {
# no exons will overlap this feature
next FEAT;
} elsif ( $mask_exon->{'end'} >= $f->start ) {
# overlap
push @ids_to_ignore, $f->hseqname;
#printf STDERR "Ignoring %s\n", $f->hseqname;
next SEQID;
} else {
$ex_idx++;
}
}
}
}
print "Ignoring ".@ids_to_ignore." features\n";
foreach my $dud_id ( @ids_to_ignore, keys %kill_list ) {
if ( exists $features{$dud_id} ) {
delete $features{$dud_id};
}
}
$num_seeds += scalar( keys %features );
}
# rule of thumb; split data so that each job constitutes one piece of
# genomic DNA against ~20 proteins.
#
return () if($num_seeds == 0);
my $num_chunks = int( $num_seeds / $self->protein_count )
+ 1;
for ( my $x = 1 ; $x <= $num_chunks ; $x++ ) {
#generate input id : $chr_name.1-$chr_length:$num_chunks:$x
my $new_iid = $self->query->name . ":$num_chunks:$x";
push @iids, $new_iid;
}
print "HAVE ".@iids." to write to the ref database\n";
$self->output(\@iids);
}
sub write_output{
my ($self) = @_;
my $sic = $self->db->get_StateInfoContainer;
foreach my $iid(@{$self->output}){
eval{
$sic->store_input_id_analysis($iid,
$self->output_analysis,
'');
};
throw("Failed to store ".$iid." $@") if($@);
logger_info("Stored ".$iid);
}
}
sub kill_list{
my ($self, $arg) = @_;
if($arg){
$self->{kill_list} = $arg;
}
if(!$self->{kill_list}){
my $kill_list_object = Bio::EnsEMBL::KillList::KillList
->new(-TYPE => 'protein');
$self->{kill_list} = $kill_list_object->get_kill_list;
}
return $self->{kill_list};
}
sub output_analysis{
my ($self, $value) = @_;
if($value &&
$value->isa('Bio::EnsEMBL::Pipeline::Analysis')){
$self->{output_analysis} = $value;
}
return $self->{'output_analysis'};
}
sub protein_count{
my ($self, $value) = @_;
if($value){
$self->{'protein_count'} = $value;
}
return $self->{'protein_count'};
}
sub output_logicname{
my ($self, $value) = @_;
if($value){
$self->{'output_logicname'} = $value;
}
return $self->{'output_logicname'};
}
sub bmg_logicname{
my ($self, $value) = @_;
if($value){
$self->{'bmg_logicname'} = $value;
}
return $self->{'bmg_logicname'};
}
sub paf_slice{
my ($self, $slice) = @_;
if($slice){
$self->{paf_slice} = $slice;
}
if(!$self->{paf_slice}){
print "FETCHING SEQUENCE FROM ".$self->paf_source_db->dbc->dbname."\n";
my $slice = $self->fetch_sequence($self->input_id, $self->paf_source_db,
$self->REPEATMASKING);
$self->{paf_slice} = $slice;
}
return $self->{paf_slice};
}
sub gene_slice{
my ($self, $slice) = @_;
if($slice){
$self->{gene_slice} = $slice;
}
if(!$self->{gene_slice}){
my $slice = $self->fetch_sequence($self->input_id, $self->gene_source_db,
$self->REPEATMASKING);
$self->{gene_slice} = $slice;
}
return $self->{gene_slice};
}
sub paf_source_db{
my ($self, $db) = @_;
if($db){
$self->{paf_source_db} = $db;
}
if(!$self->{paf_source_db}){
my $db = $self->get_dbadaptor($self->PAF_SOURCE_DB);
$self->{paf_source_db} = $db;
}
return $self->{paf_source_db};
}
sub gene_source_db{
my ($self, $db) = @_;
if($db){
$self->{gene_source_db} = $db;
}
if(!$self->{gene_source_db}){
my $db = $self->get_dbadaptor($self->GENE_SOURCE_DB);
$self->{gene_source_db} = $db;
}
return $self->{gene_source_db};
}
=head2 CONFIG_ACCESSOR_METHODS
Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::BlastMiniGenewise
Arg [2] : Varies, tends to be boolean, a string, a arrayref or a hashref
Function : Getter/Setter for config variables
Returntype: again varies
Exceptions:
Example :
=cut
#Note the function of these variables is better described in the
#config file itself Bio::EnsEMBL::Analysis::Config::GeneBuild::BlastMiniGenewise
sub PAF_LOGICNAMES{
my ($self, $arg) = @_;
if($arg){
$self->{PAF_LOGICNAMES} = $arg;
}
return $self->{PAF_LOGICNAMES}
}
sub PAF_MIN_SCORE_THRESHOLD{
my ($self, $arg) = @_;
if($arg){
$self->{PAF_MIN_SCORE_THRESHOLD} = $arg;
}
return $self->{PAF_MIN_SCORE_THRESHOLD}
}
sub PAF_UPPER_SCORE_THRESHOLD{
my ($self, $arg) = @_;
if($arg){
$self->{PAF_UPPER_SCORE_THRESHOLD} = $arg;
}
return $self->{PAF_UPPER_SCORE_THRESHOLD}
}
sub PAF_SOURCE_DB{
my ($self, $arg) = @_;
if($arg){
$self->{PAF_SOURCE_DB} = $arg;
}
return $self->{PAF_SOURCE_DB}
}
sub GENE_SOURCE_DB{
my ($self, $arg) = @_;
if($arg){
$self->{GENE_SOURCE_DB} = $arg;
}
return $self->{GENE_SOURCE_DB}
}
sub OUTPUT_DB{
my ($self, $arg) = @_;
if($arg){
$self->{OUTPUT_DB} = $arg;
}
return $self->{OUTPUT_DB}
}
sub OUTPUT_BIOTYPE{
my ($self, $arg) = @_;
if($arg){
$self->{OUTPUT_BIOTYPE} = $arg;
}
return $self->{OUTPUT_BIOTYPE}
}
sub GENEWISE_PARAMETERS{
my ($self, $arg) = @_;
if($arg){
$self->{GENEWISE_PARAMETERS} = $arg;
}
return $self->{GENEWISE_PARAMETERS}
}
sub MINIGENEWISE_PARAMETERS{
my ($self, $arg) = @_;
if($arg){
$self->{MINIGENEWISE_PARAMETERS} = $arg;
}
return $self->{MINIGENEWISE_PARAMETERS}
}
sub MULTIMINIGENEWISE_PARAMETERS{
my ($self, $arg) = @_;
if($arg){
$self->{MULTIMINIGENEWISE_PARAMETERS} = $arg;
}
return $self->{MULTIMINIGENEWISE_PARAMETERS}
}
sub BLASTMINIGENEWISE_PARAMETERS{
my ($self, $arg) = @_;
if($arg){
$self->{BLASTMINIGENEWISE_PARAMETERS} = $arg;
}
return $self->{BLASTMINIGENEWISE_PARAMETERS}
}
sub FILTER_PARAMS{
my ($self, $arg) = @_;
if($arg){
$self->{FILTER_PARAMETERS} = $arg;
}
return $self->{FILTER_PARAMETERS}
}
sub FILTER_OBJECT{
my ($self, $arg) = @_;
if($arg){
$self->{FILTER_OBJECT} = $arg;
}
return $self->{FILTER_OBJECT}
}
sub BIOTYPES_TO_MASK{
my ($self, $arg) = @_;
if($arg){
$self->{BIOTYPES_TO_MASK} = $arg;
}
return $self->{BIOTYPES_TO_MASK}
}
sub EXON_BASED_MASKING{
my ($self, $arg) = @_;
if($arg){
$self->{EXON_BASED_MASKING} = $arg;
}
return $self->{EXON_BASED_MASKING}
}
sub GENE_BASED_MASKING{
my ($self, $arg) = @_;
if($arg){
$self->{GENE_BASED_MASKING} = $arg;
}
return $self->{GENE_BASED_MASKING}
}
sub POST_GENEWISE_MASK{
my ($self, $arg) = @_;
if($arg){
$self->{POST_GENEWISE_MASK} = $arg;
}
return $self->{POST_GENEWISE_MASK}
}
sub PRE_GENEWISE_MASK{
my ($self, $arg) = @_;
if($arg){
$self->{PRE_GENEWISE_MASK} = $arg;
}
return $self->{PRE_GENEWISE_MASK}
}
sub REPEATMASKING{
my ($self, $arg) = @_;
if($arg){
$self->{REPEATMASKING} = $arg;
}
return $self->{REPEATMASKING}
}
sub SEQFETCHER_OBJECT{
my ($self, $arg) = @_;
if($arg){
$self->{SEQFETCHER_OBJECT} = $arg;
}
return $self->{SEQFETCHER_OBJECT}
}
sub SEQFETCHER_PARAMS{
my ($self, $arg) = @_;
if($arg){
$self->{SEQFETCHER_PARAMS} = $arg;
}
return $self->{SEQFETCHER_PARAMS}
}
sub USE_KILL_LIST{
my ($self, $arg) = @_;
if($arg){
$self->{USE_KILL_LIST} = $arg;
}
return $self->{USE_KILL_LIST}
}
sub LIMIT_TO_FEATURE_RANGE{
my ($self, $arg) = @_;
if($arg){
$self->{LIMIT_TO_FEATURE_RANGE} = $arg;
}
return $self->{LIMIT_TO_FEATURE_RANGE}
}
sub FEATURE_RANGE_PADDING{
my ($self, $arg) = @_;
if($arg){
$self->{FEATURE_RANGE_PADDING} = $arg;
}
return $self->{FEATURE_RANGE_PADDING}
}
sub WRITE_REJECTED{
my ($self, $arg) = @_;
if(defined($arg)){
$self->{WRITE_REJECTED} = $arg;
}
return $self->{WRITE_REJECTED};
}
sub REJECTED_BIOTYPE{
my ($self, $arg) = @_;
if($arg){
$self->{REJECTED_BIOTYPE} = $arg;
}
return $self->{REJECTED_BIOTYPE};
}
sub SOFTMASKING{
my ($self, $arg) = @_;
if($arg){
$self->{SOFTMASKING} = $arg;
}
return $self->{SOFTMASKING}
}
sub EXONERATE_PARAMETERS{
my ($self, $arg) = @_;
if($arg){
$self->{EXONERATE_PARAMETERS} = $arg;
}
return $self->{EXONERATE_PARAMETERS}
}
1;