Raw content of Bio::EnsEMBL::Analysis::RunnableDB::Dust # Ensembl module for Bio::EnsEMBL::Analysis::RunnableDB::Dust # # Copyright (c) 2004 Ensembl # =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::Dust =head1 SYNOPSIS my $dust = Bio::EnsEMBL::Analysis::RunnableDB::Dust-> new( -input_id => 'contig::AL805961.22.1.166258:1:166258:1', -db => $db, -analysis => $analysis, ); $dust->fetch_input; $dust->run; $dust->write_output; =head1 DESCRIPTION This module provides an interface between the ensembl database and the Runnable Dust which wraps the program tcdust This module can fetch appropriate input from the database pass it to the runnable then write the results back to the database in the repeat_feature and repeat_consensus tables =head1 CONTACT Post questions to the Ensembl development list: ensembl-dev@ebi.ac.uk =cut package Bio::EnsEMBL::Analysis::RunnableDB::Dust; use strict; use warnings; use Bio::EnsEMBL::Analysis::RunnableDB; use Bio::EnsEMBL::Analysis::Runnable::Dust; use Bio::EnsEMBL::Utils::Exception qw(verbose throw warning); use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB); =head2 fetch_input Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::Dust Function : fetch data out of database and create runnable Returntype: 1 Exceptions: none Example : =cut sub fetch_input{ my ($self) = @_; my $slice = $self->fetch_sequence; $self->query($slice); my %parameters; if($self->parameters_hash){ %parameters = %{$self->parameters_hash}; } my $runnable = Bio::EnsEMBL::Analysis::Runnable::Dust->new ( -query => $self->query, -program => $self->analysis->program_file, -analysis => $self->analysis, %parameters, ); $self->runnable($runnable); return 1; } =head2 get_adaptor Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::Dust Function : get repeatfeature adaptor Returntype: Bio::EnsEMBL::DBSQL::RepeatFeatureAdaptor Exceptions: none Example : =cut sub get_adaptor{ my ($self) = @_; return $self->db->get_RepeatFeatureAdaptor; } =head2 write_output Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB Function : calculate which hits are mostly gap, and set analysis and slice on each wanted feature and store it Returntype: 1 Exceptions: none Example : =cut #NOTE: convert features is an important method see its docs for more info sub write_output{ my ($self) = @_; my $adaptor = $self->get_adaptor; my @transformed; foreach my $feature (@{$self->output}) { if ( $feature->length > 50000 ) { my $transformed_features = $self->convert_feature($feature); push(@transformed, @$transformed_features) if($transformed_features); }else{ push(@transformed, $feature); } } foreach my $feature (@transformed) { #print "Have feature ".$feature."\n"; $feature->analysis($self->analysis); $feature->slice($self->query) if(!$feature->slice); $self->feature_factory->validate($feature); eval{ $adaptor->store($feature); }; if ($@){ throw("RunnableDB:store failed, failed to write ".$feature." to ". "the database $@"); } } } =head2 convert_feature Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::Dust Arg [2] : Bio::EnsEMBL::RepeatFeature Function : This method takes a repeat feature projects it onto the sequence level coord_system calculates what percentage of gaps make up the low complexity sequence. If the sequence is more than 25% low complexity it is throw away. Only features which are more than 50k are checked for speed reasons. Once this has been calculated the features are converted by to the coordinate system they came from but they may now be in more pieces. The main reason for doing this was because features which lied across long gaps were causing problems for the core api but it also means we dont store a lot of features which simply mask out Ns Returntype: Bio::EnsEMBL::RepeatFeature Exceptions: throws if it cant transform the feature Example : =cut sub convert_feature{ my ($self, $rf) = @_; print "Converting ".$rf->start." ".$rf->end." ". $rf->slice->seq_region_name."\n"; my $ff = $self->feature_factory; my $projections = $rf->project('seqlevel'); my @converted; my $feature_length = $rf->length; my $projected_length = 0; PROJECT:foreach my $projection(@$projections){ $projected_length += ($projection->from_end - $projection->from_start) +1; } my $percentage = 100; if($projected_length != 0){ $percentage = ($projected_length / $feature_length)*100; } if($percentage <= 75){ return; } REPEAT:foreach my $projection(@$projections){ my $start = 1; my $end = $projection->to_Slice->length; my $slice = $projection->to_Slice; my $rc = $ff->create_repeat_consensus('dust', 'dust', 'simple', 'N'); my $rf = $ff->create_repeat_feature($start, $end, 0, 0, $start, $end, $rc, $slice->name, $slice); my $transformed = $rf->transform($self->query->coord_system->name, $self->query->coord_system->version); if(!$transformed){ throw("Failed to transform ".$rf." ".$rf->start." ". $rf->end." ".$rf->seq_region_name." skipping \n"); } push(@converted, $transformed); } return \@converted; } 1;