Raw content of FeatureComparison package FeatureComparison; use strict; use warnings; use Bio::EnsEMBL::Utils::Exception qw(throw warning verbose); use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use Bio::EnsEMBL::Root; use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Root); =head2 new Arg [1] : FeatureComparison Arg [2] : arrayref, array of features Arg [3] : arrayref, array of features Arg [4] : int, binary switch for verbosity Function : create a FeatureComparison module Returntype: FeatureComparison Exceptions: Example : my $feature_comp = FeatureComparison-> ( -QUERY => $query, -TARGET => $target, -VERBOSE => $verbose, ); =cut sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); &verbose('WARNING'); my ($query, $target, $verbose) = rearrange( ['QUERY', 'TARGET', 'VERBOSE'], @args); $self->query($query); $self->target($target); $self->verbosity($verbose); return $self; } =head2 verbosity Arg [1] : FeatureComparison Arg [2] : int Function : toggle as to be verbose or not Returntype: int Exceptions: Example : =cut sub verbosity{ my $self = shift; $self->{'verbose'} = shift if(@_); return $self->{'verbose'}; } =head2 query/target Arg [1] : FeatureComparison Arg [2] : arrayref, array of features Function : container for the arrayref of features Returntype: arrayref Exceptions: throws if not passed an arrayref Example : =cut sub query{ my ($self, $query) = @_; if($query){ throw("query must be an array ref not a ".$query) unless(ref($query) eq 'ARRAY'); $self->{'query'} = $query; } return $self->{'query'}; } sub target{ my ($self, $target) = @_; if($target){ throw("target must be an array ref not a ".$target) unless(ref($target) eq 'ARRAY'); $self->{'target'} = $target; } return $self->{'target'}; } sub tables_to_compare{ my ($self, $target) = @_; if($target){ throw("tables to compare must be an array ref not a ".$target) unless(ref($target) eq 'ARRAY'); $self->{'tables'} = $target; } return $self->{'tables'}; } =head2 fast_sort Arg [1] : FeatureComparison Arg [2] : arrayref array of features Function : sorts the features based on their start coord very quickly Returntype: arrayref Exceptions: none Example : =cut #this complex map sort map is used as the feature list could potetially #be very long and this provides a significant speed up from just doing #sort{$a->start <=> $b->start} as it means the start method is called #significantly fewer times sub fast_sort{ my ($self, $array) = @_; my @array = map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$_->start, $_] } @$array; return \@array; } =head2 compare Arg [1] : FeatureComparison Function : Compares the to array of featues noting which features are not the same. If the verbose flag is on it will also print out the coordinates of the features which are the same Returntype: none Exceptions: none Example : =cut sub compare { my ($self) = @_; my @query = @{$self->query}; my @target = @{$self->target}; my @q_not_matched; @query = @{$self->fast_sort(\@query)}; @target = @{$self->fast_sort(\@target)}; print "\nThere are ".@query." query features\n"; print "There are ".@target." target features\n\n"; my $num_t_feats = scalar(@target); my $start_ind = 0; QUERY:foreach my $feat(@query){ if($start_ind > $num_t_feats){ #as the query features and target feature have been push(@q_not_matched, $feat); #sorted, if there are more query features than target next QUERY; #features, the query will surely fail to find a match. } TARGET:for (my $i = $start_ind; $i< $num_t_feats; $i++){ my $t_feat = $target[$i]; next TARGET if(!$t_feat); if($t_feat->start == $feat->start && $t_feat->end == $feat->end && $t_feat->strand == $feat->strand){ if($self->verbosity){ print "Got match\n"; print "t_feat = " . $t_feat->start . " " . $t_feat->end . " ". $t_feat->strand."\n"; print "feat = " . $feat->start . " " . $feat->end . " ". $feat->strand."\n"; } $target[$i] = undef; #those targets which have been matched with query were "wiped out" next QUERY; }elsif($t_feat->start > $feat->start){ push(@q_not_matched, $feat); next QUERY; }elsif($t_feat->end < $feat->start){ $start_ind++; } } } if (scalar(@q_not_matched)>0) { print "Unmatched query features\n"; $self->feature_info(\@q_not_matched); } else { print "No unmatched query features were found.\n"; print "If the analysis has run successfully, this indicates that ". "all query features had matching target features.\n"; } my @unmatched_targets; for (my $i = 0; $i < scalar(@target); $i++) { if (defined $target[$i]) { push (@unmatched_targets, $target[$i]); } } if (scalar(@unmatched_targets) > 0) { print "\nUnmatched target features\n"; $self->feature_info(\@target); } else { print "All target features had matching query features.\n"; } } =head2 feature_info Arg [1] : FeatureComparison Arg [2] : arrayref of features Function : prints the start, end and strand of each feature passed in Returntype: none Exceptions: none Example : =cut sub feature_info{ my ($self, $array) = @_; my $name; foreach my $f(@$array){ if(!$f){ next; } $name = $f->analysis->logic_name if(!$name); print $name." ".$f->slice->seq_region_name." ".$f->start." ".$f->end. " ".$f->strand."\n"; } } sub pipeline_compare{ my($self, $table_names, $query_db, $ref_db) = @_; if(!$table_names){ $table_names = $self->tables_to_compare; } $self->db_type_check($query_db); $self->db_type_check($ref_db); if(@$table_names == 0){ print "FeatureComparison:pipeline_compare --- you passed in no tables ". "so we can't make any comparisons\n"; return; } my $query_numbers = {}; my $ref_numbers = {}; foreach my $table(@$table_names){ my $sql = "select logic_name, count(*) from analysis, "; $sql .= $table." where analysis.analysis_id=".$table.".analysis_id ". "group by logic_name\n"; $query_numbers = $self->run_query($sql, $query_db, $query_numbers); $ref_numbers = $self->run_query($sql, $ref_db, $ref_numbers); } print "\nThere are ".keys(%$query_numbers)." analysis in the query database ". $query_db->dbname." and ".keys(%$ref_numbers)." analysis in the reference ". "database ".$ref_db->dbname.", all from ".@$table_names." tables\n\n"; printf("%-15s %-15s %-15s\n", 'logic_name', 'query_count', 'ref_count'); printf("%-15s %-15s %-15s\n", '----------', '-----------', '---------'); foreach my $logic_name(keys(%$query_numbers)){ printf("%-15s %-15s %-15s\n", $logic_name, $query_numbers->{$logic_name}, $ref_numbers->{$logic_name}); } print "\n"; } sub db_type_check{ my ($self, $db) = @_; if(!$db->isa('Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor')){ throw("Can't run the RuleManager with $db you need a ". "Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor"); } return; } sub run_query{ my ($self, $sql, $db, $hash) = @_; my $sth = $db->prepare($sql); $sth->execute; while(my($logic_name, $count) = $sth->fetchrow){ if(!$hash->{$logic_name}){ $hash->{$logic_name} = $count; }else{ warning($logic_name." already appears in ".$hash." with count ". $hash->{$logic_name}." from database ".$db->dbname); } } return $hash; }