sub depth_filter
{
my ($self, $orig_features, $slice, $max_coverage, $percentid_cutoff, $hits_to_keep,
$no_filter, $max_hits_per_meta_cluster, $hit_db) = @_;
print STDERR "DepthFilter: MaxCoverage=$max_coverage\n";
print STDERR "DepthFilter: PercentIdCutoff=$percentid_cutoff\n";
print STDERR "DepthFilter: ".scalar(@$orig_features)." features before filtering\n";
my %grouped_byname = ();
my %is_kept_by_name = ();
my %is_kept_by_acc = ();
for my $af (@$orig_features) {
my ($score, $percentid) = ($af->score(), $af->percent_id());
if($percentid < $percentid_cutoff) {
next;
}
my $node = $grouped_byname{$af->hseqname()} ||= {};
if(%$node) { $node->{max_score} = $score if $score>$node->{max_score};
$node->{max_percentid} = $percentid if $percentid>$node->{max_percentid};
} else {
$node->{max_score} = $score;
$node->{max_percentid} = $percentid;
}
if($self->get_hit_description($af->hseqname())) {
$node->{taxon_id} = $self->get_hit_description($af->hseqname())->taxon_id;
}
push @{$node->{features}}, $af;
}
print STDERR "DepthFilter: ".scalar(keys %grouped_byname)." unique hitnames\n";
my @bisorted =
sort { ($b->{max_score} <=> $a->{max_score})
|| ($b->{max_percentid} <=> $a->{max_percentid}) }
values %grouped_byname;
my @coverage_map = ();
my @filtered_features = ();
for my $node (@bisorted) {
my $keep_node = 0;
my $hit_name;
for my $af (sort {$a->start() <=> $b->start()} @{$node->{features}}) {
for my $position ($af->start()..$af->end()) {
my $depth = $coverage_map[$position] ||= 0;
if($depth < $max_coverage) {
$keep_node = 1;
}
$coverage_map[$position]++;
}
$hit_name = $af->hseqname;
}
$keep_node = 1 if $no_filter && grep(
/^$node->{taxon_id}$/,
@{$self->get_taxon_id_child($no_filter)},$no_filter
);
if($keep_node) {
for my $af (@{$node->{features}}) {
$af->analysis( $self->analysis );
$af->dbID(0);
$af->{adaptor} = undef;
push @filtered_features, $af;
}
$is_kept_by_name{$hit_name} = 1;
$hit_name =~ s/(-\d+)?\.\d+$//;
$is_kept_by_acc{$hit_name} = 1;
}
}
%grouped_byname = undef;
if( $hit_db eq 'Swissprot' ) {
foreach my $f (@$orig_features) {
my $hit_name = $f->hseqname();
next unless !$is_kept_by_name{$hit_name};
$hit_name =~ s/(-\d+)?\.\d+$//;
if($is_kept_by_acc{$hit_name}){
print STDERR "DepthFilter: recover SW hit ".$f->hseqname."\n";
$f->analysis( $self->analysis );
$f->dbID(0);
$f->{adaptor} = undef;
push @filtered_features, $f;
}
}
}
print STDERR "DepthFilter: ".scalar(@filtered_features)." features after filtering\n";
my @saturated_zones = ();
my $zone_start = undef;
my $zone_score = 0;
my $slice_length = $slice->length();
for(my $i=1; $i<=$slice_length; $i++) {
my $n = $coverage_map[$i] || 0;
if ($zone_start) {
$zone_score += $n;
if ($n < $max_coverage) {
my $new_zone = Bio::EnsEMBL::SimpleFeature->new(
-start => $zone_start,
-end => $i - 1,
-strand => 0, -score => $zone_score,
-display_label => sprintf("avg depth = %.2f", $zone_score/($i-$zone_start)), -analysis => $self->analysis(), ); push(@saturated_zones, $new_zone);
$zone_start = undef;
$zone_score = 0;
}
}
elsif ($n >= $max_coverage) {
$zone_start = $i;
$zone_score = $n;
}
}
if ($zone_start) { my $new_zone = Bio::EnsEMBL::SimpleFeature->new(
-start => $zone_start,
-end => $slice_length,
-strand => 0, -score => $zone_score,
-display_label => sprintf("avg depth = %.2f", $zone_score/($slice_length-$zone_start+1)), -analysis => $self->analysis(), ); push(@saturated_zones, $new_zone);
}
print STDERR "DepthFilter: ".scalar(@saturated_zones)." saturated zones found\n";
return (\@filtered_features,\@ saturated_zones); } |
sub fetch_input
{ my ($self) = @_;
my $slice =
$self->fetch_sequence( $self->input_id, $self->db,
$ANALYSIS_REPEAT_MASKING, $SOFT_MASKING );
$self->query($slice); } |
sub get_hit_description
{ my ($self,$hit) = @_;
if(ref($hit) eq 'HASH') {
$self->{_hit_desc} = $hit;
my $hit_desc_a = $self->db->get_HitDescriptionAdaptor;
$hit_desc_a->fetch_HitDescriptions_into_hash($hit);
return 1;
}
return $self->{_hit_desc}->{$hit}; } |
sub get_original_features
{ my ($self,$slice,$analysis,$hit_db,$mode,$taxon_id, $no_filter) = @_;
my $hit_db_features = [];
my $prot_feat_a = $self->db->get_ProteinAlignFeatureAdaptor;
my $dna_feat_a = $self->db->get_DnaAlignFeatureAdaptor;
my $orig_features = $dna_feat_a->fetch_all_by_Slice( $slice, $analysis );
$orig_features = $prot_feat_a->fetch_all_by_Slice( $slice, $analysis ) if (!(@$orig_features));
if($mode && $mode eq "single") {
print STDERR "DepthFilter: $mode mode\n";
my $single_hash;
my $old_seq = [];
foreach my $feature (@$orig_features) {
my $hit_name = $feature->hseqname;
my ($acc,$ver) = $hit_name =~ /(\w+-?\w+)\.(\d+)/;
my $key = $acc;
if($single_hash->{$key}) {
my $stored_feature = $single_hash->{$key}->[0];
my ($sacc,$sver) = $stored_feature->hseqname =~ /(\w+-?\w+)\.(\d+)/;
if($ver > $sver) {
$single_hash->{$key} = [$feature];
push @$old_seq, $stored_feature;
} elsif($ver == $sver) {
push @{$single_hash->{$key}}, $feature;
} else {
push @$old_seq, $feature;
}
} else {
$single_hash->{$key} = [$feature];
}
}
print STDERR "DepthFilter: drop ".scalar(@$old_seq)." old sequences\n" if @$old_seq;
$orig_features = [];
map ( push(@$orig_features,@$_) , values %$single_hash);
}
if($hit_db || $taxon_id || $no_filter){
print STDERR "DepthFilter: hit db is $hit_db\n" if $hit_db;
my $plus_taxon_ids = [];
my $minus_taxon_ids = [];
if ($taxon_id){
my @ti = split(/\|/,$taxon_id);
print STDERR "DepthFilter: taxonomy ids are ".join(" ",@ti)."\n";
for my $t (@ti) {
if($t < 0) {
$t = abs($t);
push @$minus_taxon_ids, @{$self->get_taxon_id_child($t)};
push @$minus_taxon_ids, $t;
} else {
push @$plus_taxon_ids, @{$self->get_taxon_id_child($t)};
push @$plus_taxon_ids, $t;
}
}
}
my $hit_hash = {map {$_->hseqname, undef} @$orig_features};
$self->get_hit_description($hit_hash);
my $error;
foreach my $feat (@$orig_features) {
my $tag = 1;
if (my $desc = $self->get_hit_description($feat->hseqname)) {
my $hit_taxon_id = $desc->taxon_id;
if($hit_db) {
$tag = 0 if $desc->db_name ne $hit_db;
}
if(@$plus_taxon_ids) {
$tag *= 0 unless grep(/^$hit_taxon_id$/,@$plus_taxon_ids);
}
if(@$minus_taxon_ids) {
$tag *= 0 if grep(/^$hit_taxon_id$/,@$minus_taxon_ids);
}
push(@$hit_db_features, $feat) if $tag;
} else {
$error->{$feat->hseqname} = 1;
}
}
throw("Missing hit_description entry for the following sequence(s):\n".join(',',keys %$error)) if keys %$error;
print STDERR "DepthFilter: use ".scalar(@$hit_db_features)." features out of ".scalar(@$orig_features)."\n";
} else {
return $orig_features;
}
return $hit_db_features } |
sub run
{ my ($self) = @_;
my %params = %{ $self->parameters_hash() };
my $max_coverage = $params{max_coverage} || 10;
my $hits_to_keep = $params{hits_to_keep} || 3;
my $max_hits_per_meta_cluster = $params{max_hits_per_meta_cluster} || 40;
my $percentid_cutoff = $params{percentid_cutoff} || 0.0;
my $orig_analysis_name = $params{ori_analysis};
my $hit_db = $params{hit_db};
my $mode = $params{mode};
my $taxon_id = $params{taxon};
my $no_filter = $params{no_filter};
my $analysis_adaptor = $self->db->get_AnalysisAdaptor();
my $ana = $analysis_adaptor->fetch_by_logic_name($orig_analysis_name);
my $stateinfocontainer = $self->db->get_StateInfoContainer;
my $db_version = $stateinfocontainer->fetch_db_version($self->input_id,$ana);
$self->db_version_searched($db_version);
my $slice = $self->query();
my $orig_features = $self->get_original_features($slice,$orig_analysis_name,$hit_db,$mode,$taxon_id, $no_filter);
my ( $filtered_features, $saturated_zones) =
$self->depth_filter($orig_features, $slice, $max_coverage,
$percentid_cutoff, $hits_to_keep, $no_filter,
$max_hits_per_meta_cluster, $hit_db);
$self->output($filtered_features, $saturated_zones); } |