Bio::EnsEMBL::Analysis::RunnableDB::Finished DepthFilter
Included librariesPackage variablesGeneral documentationMethods
Toolbar
WebCvsRaw content
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::Config::General
Bio::EnsEMBL::Pipeline::Tools::MM_Taxonomy
Bio::EnsEMBL::SimpleFeature
Bio::EnsEMBL::Utils::Exception qw ( verbose throw warning )
Inherit
Bio::EnsEMBL::Analysis::RunnableDB::Finished
Synopsis
No synopsis!
Description
No description!
Methods
db_version_searchedDescriptionCode
depth_filter
No description
Code
fetch_input
No description
Code
get_hit_description
No description
Code
get_original_features
No description
Code
get_taxon_id_child
No description
Code
run
No description
Code
Methods description
db_version_searchedcode    nextTop
    Title   :  db_version_searched
[ distinguished from Runnable::*::get_db_version() ]
Useage : $self->db_version_searched('version string')
$obj->db_version_searched()
Function: Get/Set a blast database version that was searched
The actual look up is done in Runnable::Finished::Blast
This is just a holding place for the string in this
module
Returns : String or undef
Args : String
Caller : $self::run()
Job::run_module()
Methods code
db_version_searcheddescriptionprevnextTop
sub db_version_searched {
	my ( $self, $arg ) = @_;
	$self->{'_db_version_searched'} = $arg if $arg;
	return $self->{'_db_version_searched'};
}
depth_filterdescriptionprevnextTop
sub depth_filter {
	my ($self, $orig_features, $slice, $max_coverage, $percentid_cutoff, $hits_to_keep,
	$no_filter, $max_hits_per_meta_cluster, $hit_db) = @_;

	print STDERR "DepthFilter: MaxCoverage=$max_coverage\n";
	print STDERR "DepthFilter: PercentIdCutoff=$percentid_cutoff\n";
	print STDERR "DepthFilter: ".scalar(@$orig_features)." features before filtering\n";

    my %grouped_byname = ();
    my %is_kept_by_name = ();
	my %is_kept_by_acc = ();

    for my $af (@$orig_features) {
        my ($score, $percentid) = ($af->score(), $af->percent_id());
        if($percentid < $percentid_cutoff) {
            next;
        }

        my $node = $grouped_byname{$af->hseqname()} ||= {};
        if(%$node) { # nonempty
$node->{max_score} = $score if $score>$node->{max_score}; $node->{max_percentid} = $percentid if $percentid>$node->{max_percentid}; } else { $node->{max_score} = $score; $node->{max_percentid} = $percentid; } if($self->get_hit_description($af->hseqname())) { $node->{taxon_id} = $self->get_hit_description($af->hseqname())->taxon_id; } push @{$node->{features}}, $af; } print STDERR "DepthFilter: ".scalar(keys %grouped_byname)." unique hitnames\n"; my @bisorted = sort { ($b->{max_score} <=> $a->{max_score}) || ($b->{max_percentid} <=> $a->{max_percentid}) } values %grouped_byname; my @coverage_map = (); my @filtered_features = (); for my $node (@bisorted) { my $keep_node = 0; my $hit_name; for my $af (sort {$a->start() <=> $b->start()} @{$node->{features}}) { for my $position ($af->start()..$af->end()) { my $depth = $coverage_map[$position] ||= 0; if($depth < $max_coverage) { $keep_node = 1; } $coverage_map[$position]++; } $hit_name = $af->hseqname; } # add code to keep the hits with "no_filter" taxon_id
$keep_node = 1 if $no_filter && grep( /^$node->{taxon_id}$/, @{$self->get_taxon_id_child($no_filter)},$no_filter ); if($keep_node) { for my $af (@{$node->{features}}) { $af->analysis( $self->analysis ); $af->dbID(0); $af->{adaptor} = undef; push @filtered_features, $af; } $is_kept_by_name{$hit_name} = 1; # trim off the splice variant number (if any) and version number
$hit_name =~ s/(-\d+)?\.\d+$//; $is_kept_by_acc{$hit_name} = 1; } } # free up some memory ?
%grouped_byname = undef; # recover discarded Swissprot variants that should be shown
if( $hit_db eq 'Swissprot' ) { foreach my $f (@$orig_features) { my $hit_name = $f->hseqname(); next unless !$is_kept_by_name{$hit_name}; $hit_name =~ s/(-\d+)?\.\d+$//; if($is_kept_by_acc{$hit_name}){ print STDERR "DepthFilter: recover SW hit ".$f->hseqname."\n"; $f->analysis( $self->analysis ); $f->dbID(0); $f->{adaptor} = undef; push @filtered_features, $f; } } } print STDERR "DepthFilter: ".scalar(@filtered_features)." features after filtering\n"; my @saturated_zones = (); my $zone_start = undef; my $zone_score = 0; my $slice_length = $slice->length(); for(my $i=1; $i<=$slice_length; $i++) { my $n = $coverage_map[$i] || 0; if ($zone_start) { $zone_score += $n; if ($n < $max_coverage) { my $new_zone = Bio::EnsEMBL::SimpleFeature->new( -start => $zone_start, -end => $i - 1, -strand => 0, # we mix both strands here
-score => $zone_score, -display_label => sprintf("avg depth = %.2f", $zone_score/($i-$zone_start)),
-analysis =>
$self->analysis(),
);
push(@saturated_zones, $new_zone); $zone_start = undef; $zone_score = 0; } } elsif ($n >= $max_coverage) { $zone_start = $i; $zone_score = $n; } } if ($zone_start) { # Are saturated up to end of contig:
my $new_zone = Bio::EnsEMBL::SimpleFeature->new( -start => $zone_start, -end => $slice_length, -strand => 0, # we mix both strands here
-score => $zone_score, -display_label => sprintf("avg depth = %.2f", $zone_score/($slice_length-$zone_start+1)),
-analysis =>
$self->analysis(),
);
push(@saturated_zones, $new_zone); } print STDERR "DepthFilter: ".scalar(@saturated_zones)." saturated zones found\n"; return (\@filtered_features,\@ saturated_zones);
}
fetch_inputdescriptionprevnextTop
sub fetch_input {
	my ($self) = @_;
	my $slice =
	  $self->fetch_sequence( $self->input_id, $self->db,
		$ANALYSIS_REPEAT_MASKING, $SOFT_MASKING );
	$self->query($slice);
}
get_hit_descriptiondescriptionprevnextTop
sub get_hit_description {
	my ($self,$hit) = @_;
	if(ref($hit) eq 'HASH') {
		$self->{_hit_desc} = $hit;
		my $hit_desc_a = $self->db->get_HitDescriptionAdaptor;
		$hit_desc_a->fetch_HitDescriptions_into_hash($hit);

		return 1;
	}

	return $self->{_hit_desc}->{$hit};
}
get_original_featuresdescriptionprevnextTop
sub get_original_features {
	my ($self,$slice,$analysis,$hit_db,$mode,$taxon_id, $no_filter) = @_;
	my $hit_db_features = [];
    my $prot_feat_a = $self->db->get_ProteinAlignFeatureAdaptor;
    my $dna_feat_a  = $self->db->get_DnaAlignFeatureAdaptor;

    my $orig_features = $dna_feat_a->fetch_all_by_Slice( $slice, $analysis );
	$orig_features = $prot_feat_a->fetch_all_by_Slice( $slice, $analysis ) if (!(@$orig_features));

	# The block of code below discard old sequence versions from the original set
# of features (should add "mode => single" in analysis parameters)
if($mode && $mode eq "single") { print STDERR "DepthFilter: $mode mode\n"; my $single_hash; my $old_seq = []; foreach my $feature (@$orig_features) { my $hit_name = $feature->hseqname; my ($acc,$ver) = $hit_name =~ /(\w+-?\w+)\.(\d+)/; my $key = $acc; if($single_hash->{$key}) { my $stored_feature = $single_hash->{$key}->[0]; my ($sacc,$sver) = $stored_feature->hseqname =~ /(\w+-?\w+)\.(\d+)/; if($ver > $sver) { $single_hash->{$key} = [$feature]; #print STDERR "DepthFilter: drop ".$stored_feature->hseqname." (".$feature->hseqname.")\n";
push @$old_seq, $stored_feature; } elsif($ver == $sver) { push @{$single_hash->{$key}}, $feature; } else { #print STDERR "DepthFilter: drop ".$feature->hseqname." (".$stored_feature->hseqname.")\n";
push @$old_seq, $feature; } } else { $single_hash->{$key} = [$feature]; } } print STDERR "DepthFilter: drop ".scalar(@$old_seq)." old sequences\n" if @$old_seq; $orig_features = []; map ( push(@$orig_features,@$_) , values %$single_hash); } if($hit_db || $taxon_id || $no_filter){ print STDERR "DepthFilter: hit db is $hit_db\n" if $hit_db; my $plus_taxon_ids = []; my $minus_taxon_ids = []; if ($taxon_id){ my @ti = split(/\|/,$taxon_id); print STDERR "DepthFilter: taxonomy ids are ".join(" ",@ti)."\n"; for my $t (@ti) { if($t < 0) { $t = abs($t); push @$minus_taxon_ids, @{$self->get_taxon_id_child($t)}; push @$minus_taxon_ids, $t; } else { push @$plus_taxon_ids, @{$self->get_taxon_id_child($t)}; push @$plus_taxon_ids, $t; } } } my $hit_hash = {map {$_->hseqname, undef} @$orig_features}; $self->get_hit_description($hit_hash); my $error; foreach my $feat (@$orig_features) { my $tag = 1; if (my $desc = $self->get_hit_description($feat->hseqname)) { my $hit_taxon_id = $desc->taxon_id; if($hit_db) { $tag = 0 if $desc->db_name ne $hit_db; } if(@$plus_taxon_ids) { $tag *= 0 unless grep(/^$hit_taxon_id$/,@$plus_taxon_ids); } if(@$minus_taxon_ids) { $tag *= 0 if grep(/^$hit_taxon_id$/,@$minus_taxon_ids); } push(@$hit_db_features, $feat) if $tag; } else { $error->{$feat->hseqname} = 1; } } throw("Missing hit_description entry for the following sequence(s):\n".join(',',keys %$error)) if keys %$error; print STDERR "DepthFilter: use ".scalar(@$hit_db_features)." features out of ".scalar(@$orig_features)."\n"; } else { return $orig_features; } return $hit_db_features
}
get_taxon_id_childdescriptionprevnextTop
sub get_taxon_id_child {
	my ( $self, $taxon_id ) = @_;
	$self->{taxonomy} ||= Bio::EnsEMBL::Pipeline::Tools::MM_Taxonomy->new();
	if(!$self->{_taxon}->{$taxon_id}) {
		$self->{_taxon}->{$taxon_id} = $self->{taxonomy}->get_all_children_id($taxon_id);
	}

	return $self->{_taxon}->{$taxon_id};
}

1;
}
rundescriptionprevnextTop
sub run {
	my ($self) = @_;

    my %params = %{ $self->parameters_hash() };
    my $max_coverage     = $params{max_coverage} || 10;
    my $hits_to_keep     = $params{hits_to_keep} || 3;
    my $max_hits_per_meta_cluster = $params{max_hits_per_meta_cluster} || 40;
    my $percentid_cutoff = $params{percentid_cutoff} || 0.0;
	my $orig_analysis_name = $params{ori_analysis};
	my $hit_db = $params{hit_db};
	my $mode = $params{mode};
	my $taxon_id = $params{taxon};
	my $no_filter = $params{no_filter};

	# Get the blast db version from the raw analysis and save it
my $analysis_adaptor = $self->db->get_AnalysisAdaptor(); my $ana = $analysis_adaptor->fetch_by_logic_name($orig_analysis_name); my $stateinfocontainer = $self->db->get_StateInfoContainer; my $db_version = $stateinfocontainer->fetch_db_version($self->input_id,$ana); $self->db_version_searched($db_version); my $slice = $self->query(); my $orig_features = $self->get_original_features($slice,$orig_analysis_name,$hit_db,$mode,$taxon_id, $no_filter); my ( $filtered_features, $saturated_zones) = $self->depth_filter($orig_features, $slice, $max_coverage, $percentid_cutoff, $hits_to_keep, $no_filter, $max_hits_per_meta_cluster, $hit_db); $self->output($filtered_features, $saturated_zones);
}
General documentation
NAME - Bio::EnsEMBL::Analysis::RunnableDB::Finished::DepthFilterTop
AUTHOR(1)Top
James Gilbert email jgrg@sanger.ac.uk - original implementation
AUTHOR(2)Top
Mustapha Larbaoui email ml6@sanger.ac.uk - new pipeline
AUTHOR(3)Top
Leo Gordon email lg4@sanger.ac.uk - porting