Bio::EnsEMBL::Analysis::RunnableDB::Finished
ClusterDepthFilter
Toolbar
Package variables
Privates (from "my" definitions)
$SANITY_CHECK_STRANDS = 0
$DEBUG = 0
Included modules
Data::Dumper
Scalar::Util ' weaken '
Inherit
Synopsis
No synopsis!
Description
No description!
Methods
break_discontinuities | No description | Code |
build_summary_features | No description | Code |
cluster_clusters | No description | Code |
cluster_hits | No description | Code |
cluster_str | No description | Code |
depth_filter | No description | Code |
filter_features | No description | Code |
find_new_introns | No description | Code |
group_features_by_name | No description | Code |
hit_str | No description | Code |
is_consistent | No description | Code |
meta_cluster_str | No description | Code |
overlap | No description | Code |
Methods description
None available.
Methods code
break_discontinuities | description | prev | next | Top |
sub break_discontinuities
{
my ( $self, $hits_by_name, $no_filter, $slice ) = @_;
my @hits = ();
for my $hit ( values %$hits_by_name ) {
my @features = sort { $a->start <=> $b->start } @{ $hit->{features} };
my $last = $features[0];
my $new_hit = {};
$new_hit->{features} = [$last];
$new_hit->{tot_score} = $last->score;
$new_hit->{avg_percentid} = $last->percent_id;
$new_hit->{strand} = $last->hstrand;
$new_hit->{taxon_id} = $hit->{taxon_id} if $no_filter;
print STDERR "ClusterDepthFilter: Processing hit: ", $last->hseqname,
"\n"
if $DEBUG;
for my $af ( @features[ 1 .. $#features ] ) {
if ( $last->hstart == $af->hstart && $last->hend == $af->hend ) {
print STDERR "ClusterDepthFilter: Found duplicate align "
. "feature for hit "
. $af->hseqname
. " (coords: "
. $af->hstart . "-"
. $af->hend
. ", slice: "
. $slice->name . ")\n";
next;
}
my $delta =
$af->hstrand == -1
? ( $last->hstart - $af->hend )
: ( $af->hstart - $last->hend );
if ( $delta != 1 ) {
print STDERR "ClusterDepthFilter: creating new hit: hstrand: ",
$af->hstrand, " last: ", $last->hstart, "-", $last->hend,
" (", $last->start, "-", $last->end, ") af: ", $af->hstart,
"-", $af->hend, " (", $af->start, "-", $af->end, ")\n"
if $DEBUG;
$new_hit->{avg_percentid} /= scalar( @{ $new_hit->{features} } ); push @hits, $new_hit;
$new_hit = {};
$new_hit->{features} = [$af];
$new_hit->{tot_score} = $af->score;
$new_hit->{avg_percentid} = $af->percent_id;
$new_hit->{strand} = $af->hstrand;
$new_hit->{taxon_id} = $hit->{taxon_id} if $no_filter;
}
else {
print STDERR "ClusterDepthFilter: continuing hit: hstrand: ",
$af->hstrand, " last: ", $last->hstart, "-", $last->hend,
" (", $last->start, "-", $last->end, ") af: ", $af->hstart,
"-", $af->hend, " (", $af->start, "-", $af->end, ")\n"
if $DEBUG;
push @{ $new_hit->{features} }, $af;
$new_hit->{tot_score} += $af->score;
$new_hit->{avg_percentid} += $af->percent_id;
}
$last = $af;
}
$new_hit->{avg_percentid} /= scalar( @{ $new_hit->{features} } ); push @hits, $new_hit;
}
return\@ hits; } |
sub build_summary_features
{ my ( $self, $meta_clusters ) = @_;
my @summary_features = ();
my $tot_filtered = 0;
for my $meta_cluster (@$meta_clusters) {
for my $cluster ( @{ $meta_cluster->{clusters} } ) {
my ( $start, $end, $score, $introns );
my $filtered = 0;
for my $hit ( @{ $cluster->{hits} } ) {
if ( $hit->{features}->[0]->analysis == $self->analysis ) {
print STDERR
"ClusterDepthFilter: ignoring previously added feature\n"
if $DEBUG;
next;
}
if ( defined $start ) {
my $s = $hit->{features}->[0]->start;
$start = $s if $s < $start;
my $e = $hit->{features}->[-1]->end;
$end = $e if $e > $end;
my $ms = $hit->{tot_score};
$score = $ms if $ms > $score;
my $new_introns =
find_new_introns( $introns, $hit->{introns} );
push @$introns, map { [@$_] } @$new_introns;
$introns = [ sort { $a->[0] <=> $b->[0] } @$introns ];
}
else {
$start = $hit->{features}->[0]->start;
$end = $hit->{features}->[-1]->end;
$score = $hit->{tot_score};
$introns = $hit->{introns};
}
$filtered++;
}
if ( defined $start ) {
for my $intron (@$introns) {
my $summary = Bio::EnsEMBL::SimpleFeature->new(
-start => $start,
-end => $intron->[0],
-strand => $cluster->{strand},
-score => $score,
-display_label =>
sprintf( "summary feature from %d redundant hits",
$filtered ),
-analysis => $self->analysis()
);
push @summary_features, $summary;
$start = $intron->[1];
}
my $summary = Bio::EnsEMBL::SimpleFeature->new(
-start => $start,
-end => $end,
-strand => $cluster->{strand},
-score => $score,
-display_label =>
sprintf( "summary feature from %d redundant hits",
$filtered ),
-analysis => $self->analysis()
);
push @summary_features, $summary;
}
$tot_filtered += $filtered;
}
}
print STDERR
"ClusterDepthFilter: built summary features from $tot_filtered filtered hits\n";
return\@ summary_features; } |
sub cluster_clusters
{
my ( $self, $clusters ) = @_;
my @meta_clusters = ();
for my $cluster (@$clusters) {
my $added = 0;
for my $meta (@meta_clusters) {
if (
overlap(
$cluster->{start}, $cluster->{end},
$meta->{start}, $meta->{end}
)
&& $cluster->{strand} == $meta->{strand}
)
{
push @{ $meta->{clusters} }, $cluster;
$added = 1;
last;
}
}
if ( !$added ) {
my $new_meta = {
start => $cluster->{start},
end => $cluster->{end},
strand => $cluster->{strand},
clusters => [$cluster],
hits_added => 0
};
push @meta_clusters, $new_meta;
}
}
return\@ meta_clusters; } |
sub cluster_hits
{
my ( $self, $hits ) = @_;
my @hits = sort {
( $a->{features}->[0]->start <=> $b->{features}->[0]->start )
|| ( $b->{features}->[-1]->end <=> $a->{features}->[-1]->end )
} @$hits;
my @clusters = ();
for my $hit (@hits) {
my @afs = @{ $hit->{features} };
$hit->{start} = $afs[0]->start;
$hit->{end} = $afs[-1]->end;
print STDERR "ClusterDepthFilter: looking at hit: ", hit_str($hit), "\n"
if $DEBUG;
my @introns = ();
if ( @afs > 1 ) {
my $start = $afs[0]->end;
for my $af ( @afs[ 1 .. $#afs - 1 ] ) {
my $end = $af->start;
push @introns, [ $start, $end ];
$start = $af->end;
}
my $end = $afs[-1]->start;
push @introns, [ $start, $end ];
}
$hit->{introns} = [@introns];
my $added = 0;
for my $cluster (@clusters) {
if ( $hit->{strand} == $cluster->{strand}
&& $hit->{start} >= $cluster->{start}
&& $hit->{end} <= $cluster->{end}
&& is_consistent( $hit, $cluster ) )
{
print STDERR "ClusterDepthFilter: adding hit to cluster: ",
cluster_str($cluster), "\n"
if $DEBUG;
push @{ $cluster->{hits} }, $hit;
$added = 1;
last;
}
else {
print STDERR "ClusterDepthFilter: can't add hit to cluster: ",
cluster_str($cluster), "\n"
if $DEBUG;
}
}
if ( !$added ) {
my $cluster = {
start => $hit->{start},
end => $hit->{end},
introns => [ map { [@$_] } @introns ],
hits => [$hit],
strand => $hit->{strand}
};
push @clusters, $cluster;
print STDERR "ClusterDepthFilter: creating new cluster: ",
cluster_str($cluster), "\n"
if $DEBUG;
}
}
return\@ clusters; } |
sub cluster_str
{ my $cluster = shift;
my $s = $cluster->{start} . "-";
map { $s .= $_->[0] . "," . $_->[1] . "-" } @{ $cluster->{introns} };
$s .= $cluster->{end};
return $s;
}
} |
sub depth_filter
{
my $self = shift;
my ( $orig_features, $slice, $max_coverage, $percentid_cutoff,
$hits_to_keep, $no_filter, $max_hits_per_meta_cluster, $hit_db )
= @_;
print STDERR "ClusterDepthFilter: HitsToKeep=$hits_to_keep\n";
print STDERR "ClusterDepthFilter: PercentIdCutoff=$percentid_cutoff\n";
print STDERR "ClusterDepthFilter: NoFilter=$no_filter\n" if $no_filter;
print STDERR
"ClusterDepthFilter: MaxHitsPerMetaCluster=$max_hits_per_meta_cluster\n";
print STDERR "ClusterDepthFilter: "
. scalar(@$orig_features)
. " features before filtering\n";
my @matching_taxa = ();
if ($no_filter) {
@matching_taxa = @{ $self->get_taxon_id_child($no_filter) };
push @matching_taxa, $no_filter;
}
my $grouped_by_name =
$self->group_features_by_name( $orig_features, $percentid_cutoff,
$no_filter );
print STDERR "ClusterDepthFilter: "
. scalar( keys %$grouped_by_name )
. " unique hitnames\n";
my $hits =
$self->break_discontinuities( $grouped_by_name, $no_filter, $slice );
if ($SANITY_CHECK_STRANDS) {
for my $hit (@$hits) {
for my $af ( @{ $hit->{features} } ) {
if ( $af->hstrand != $hit->{strand} ) {
die "mismatching strands in hit groups\n";
}
}
}
}
print STDERR "ClusterDepthFilter: "
. scalar(@$hits)
. " hits after processing discontinuities\n";
my $clusters = $self->cluster_hits($hits);
print STDERR "ClusterDepthFilter: grouped "
. scalar(@$hits)
. " hits into "
. scalar(@$clusters)
. " clusters";
if ($SANITY_CHECK_STRANDS) {
for my $cluster ( @$clusters ) {
for my $hit ( @{ $cluster->{hits} } ) {
for my $af ( @{ $hit->{features} } ) {
if ( $af->hstrand != $cluster->{strand} ) {
die "mismatching strands in clusters\n";
}
}
}
}
}
print STDERR sprintf( " (average of %.2f hits per cluster)",
( scalar(@$hits) / scalar(@$clusters) ) ) if @$clusters > 0;
print STDERR "\n";
map { print STDERR cluster_str($_), "\n" } @$clusters if $DEBUG;
my $meta_clusters = $self->cluster_clusters($clusters);
print STDERR "ClusterDepthFilter: grouped "
. scalar(@$clusters)
. " clusters into "
. scalar(@$meta_clusters)
. " meta clusters\n";
if ($SANITY_CHECK_STRANDS) {
for my $meta (@$meta_clusters) {
for my $cluster ( @{ $meta->{clusters} } ) {
for my $hit ( @{ $cluster->{hits} } ) {
for my $af ( @{ $hit->{features} } ) {
if ( $af->hstrand != $meta->{strand} ) {
die "mismatching strands in meta clusters";
}
}
}
}
}
}
my ( $filtered_features, $summary_features ) =
$self->filter_features( $meta_clusters, $hits_to_keep, $no_filter,\@
matching_taxa, $max_hits_per_meta_cluster );
print STDERR "ClusterDepthFilter: "
. scalar(@$summary_features)
. " summary features\n";
print STDERR "ClusterDepthFilter: "
. scalar(@$filtered_features)
. " features after filtering\n";
return ( $filtered_features, $summary_features );
}
1; } |
sub filter_features
{
my ( $self, $meta_clusters, $hits_to_keep, $no_filter, $matching_taxa,
$max_hits_per_meta_cluster )
= @_;
my @retained_features = ();
my @summary_features = ();
my $retained = 0;
for my $meta_cluster (@$meta_clusters) {
my @clusters = @{ $meta_cluster->{clusters} };
for my $cluster (@clusters) {
$cluster->{hits} = [
sort {
( $b->{tot_score} <=> $a->{tot_score} )
|| ( $b->{avg_percentid} <=> $a->{avg_percentid} )
} @{ $cluster->{hits} }
];
my $added = 0;
while ( $added < $hits_to_keep && @{ $cluster->{hits} } ) {
my $hit_to_keep = shift @{ $cluster->{hits} };
$added++;
print STDERR "ClusterDepthFilter: adding features of cluster: ",
$cluster->{start}, "-", $cluster->{end}, "\n"
if $DEBUG;
for my $af ( @{ $hit_to_keep->{features} } ) {
print STDERR "ClusterDepthFilter: adding feature: ",
$af->start, "-", $af->end, "\n"
if $DEBUG;
$af->analysis( $self->analysis );
$af->dbID(0);
$af->{adaptor} = undef;
push @retained_features, $af;
}
}
$meta_cluster->{hits_added} += $hits_to_keep;
}
if ( $no_filter
&& $max_hits_per_meta_cluster
&& ( $meta_cluster->{hits_added} < $max_hits_per_meta_cluster ) )
{
my @candidates = ();
for my $cluster (@clusters) {
for my $hit ( @{ $cluster->{hits} } ) {
if ( grep { /^$hit->{taxon_id}$/ } @$matching_taxa ) {
push @candidates, $hit;
}
}
}
@candidates = sort {
( $b->{tot_score} <=> $a->{tot_score} )
|| ( $b->{avg_percentid} <=> $a->{avg_percentid} )
} @candidates;
while ($meta_cluster->{hits_added} < $max_hits_per_meta_cluster
&& @candidates )
{
my $hit_to_keep = shift @candidates;
$meta_cluster->{hits_added}++;
$retained++;
for my $af ( @{ $hit_to_keep->{features} } ) {
print STDERR "ClusterDepthFilter: adding extra feature: ",
$af->start, "-", $af->end, "\n"
if $DEBUG;
$af->analysis( $self->analysis );
$af->dbID(0);
$af->{adaptor} = undef;
push @retained_features, $af;
}
}
}
}
print STDERR
"ClusterDepthFilter: $retained features retained from matching taxa\n";
my $summary_features = $self->build_summary_features($meta_clusters);
return (\@ retained_features, $summary_features ); } |
sub find_new_introns
{
my ( $set1, $set2 ) = @_;
my %h1 = map { $_->[0] . "-" . $_->[1] => $_ } @$set1;
my %h2 = map { $_->[0] . "-" . $_->[1] => $_ } @$set2;
my @new_keys = grep { !( exists $h1{$_} ) } keys %h2;
return [ map { $h2{$_} } @new_keys ];
}
} |
sub group_features_by_name
{ my ( $self, $orig_features, $percentid_cutoff, $no_filter ) = @_;
my %grouped_by_name = ();
for my $af (@$orig_features) {
my ( $score, $percentid ) = ( $af->score(), $af->percent_id() );
if ( $percentid < $percentid_cutoff ) {
next;
}
my $hit = $grouped_by_name{ $af->hseqname() } ||= {};
push @{ $hit->{features} }, $af;
if ( $no_filter && $self->get_hit_description( $af->hseqname() ) ) {
$hit->{taxon_id} =
$self->get_hit_description( $af->hseqname() )->taxon_id;
}
}
return\% grouped_by_name; } |
sub hit_str
{ my $hit = shift;
my $s = '';
map { $s .= $_->start . "-" . $_->end . "," } @{ $hit->{features} };
return $s;
}
} |
sub is_consistent
{
my $hit = shift;
my $cluster = shift;
for my $af ( @{ $hit->{features} } ) {
for my $intron ( @{ $cluster->{introns} } ) {
if ( overlap( $af->start, $af->end, $intron->[0], $intron->[1] ) ) {
return 0;
}
}
}
my $start = $cluster->{start};
for my $cluster_intron ( @{ $cluster->{introns} } ) {
my $end = $cluster_intron->[0];
for my $intron ( @{ $hit->{introns} } ) {
if ( overlap( $start, $end, $intron->[0], $intron->[1] ) ) {
return 0;
}
}
$start = $cluster_intron->[1];
}
my $end = $cluster->{end};
for my $intron ( @{ $hit->{introns} } ) {
if ( overlap( $start, $end, $intron->[0], $intron->[1] ) ) {
return 0;
}
}
return 1;
}
} |
sub meta_cluster_str
{ my $meta = shift;
my $s = sprintf "%d-%d (%d)", $meta->{start}, $meta->{end}, $meta->{strand};
return $s; } |
sub overlap
{
my ( $f1_start, $f1_end, $f2_start, $f2_end ) = @_;
return ( ( $f1_start > $f2_start && $f1_start < $f2_end )
|| ( $f1_end > $f2_start && $f1_end < $f2_end )
|| ( $f1_start <= $f2_start && $f1_end >= $f2_end ) );
}
} |
General documentation
NAME - Bio::EnsEMBL::Analysis::RunnableDB::Finished::ClusterDepthFilter | Top |