Raw content of Bio::EnsEMBL::Compara::RunnableDB::GenomeCalcStats.

# Raw content of Bio::EnsEMBL::Compara::RunnableDB::GenomeCalcStats
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code

=pod

=head1 NAME

Bio::EnsEMBL::Compara::RunnableDB::GenomeCalcStats

=cut

=head1 SYNOPSIS

  my $db       = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
  my $runnable = Bio::EnsEMBL::Compara::RunnableDB::GenomeCalcStats->new (
                                                    -db       => $db,
                                                    -input_id => $input_id,
                                                    -analysis => $analysis );
  $runnable->fetch_input();  #reads from DB
  $runnable->run();          #calculates the statistics and writes them to DB
  $runnable->output();
  $runnable->write_output(); #nothing left to write, always succeeds

=cut

=head1 DESCRIPTION

This RunnableDB calculates descriptive statistics for one genome.
The input_id is a stringified perl hash whose 'gdb' key holds the
genome_db_id, e.g. "{'gdb'=>1}".

All 'ENSEMBLGENE' members of the genome's taxon are fetched ordered by
chromosome name and start.  Two data sets are collected with
Statistics::Descriptive:

  intergenic - distance between consecutive genes on the same chromosome
               (overlapping genes count as distance 0 and are tallied
               separately as the overlap count)
  gene_size  - chr_end - chr_start of every gene

Each data set is stored as one row in the genome_db_stats table.

=cut

=head1 CONTACT

  Contact Jessica Severin on module implementation/design detail: jessica@ebi.ac.uk
  Contact Abel Ureta-Vidal on EnsEMBL/Compara: abel@ebi.ac.uk
  Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk

=cut

=head1 APPENDIX

The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _

=cut

package Bio::EnsEMBL::Compara::RunnableDB::GenomeCalcStats;

use strict;
use warnings;

use Statistics::Descriptive;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::DBLoader;

use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::Member;
use Bio::EnsEMBL::Compara::Homology;
use Bio::EnsEMBL::Compara::Member;
use Bio::EnsEMBL::Compara::Subset;

use Bio::EnsEMBL::Hive::Process;
our @ISA = qw(Bio::EnsEMBL::Hive::Process);


=head2 batch_size

  Title      : batch_size
  Usage      : $value = $self->batch_size;
  Description: Defines the number of jobs the RunnableDB subclasses should run
               in batch before querying the database for the next job batch.
               Used by the Hive system to manage the number of workers needed
               to complete a particular job type.
  Returntype : integer scalar

=cut

sub batch_size { return 1; }


=head2 carrying_capacity

  Title      : carrying_capacity
  Usage      : $value = $self->carrying_capacity;
  Description: Defines the total number of Workers of this RunnableDB for a
               particular analysis_id that can be created in the hive.
               Used by Queen to manage creation of Workers.
  Returntype : integer scalar

=cut

sub carrying_capacity { return 20; }


=head2 fetch_input

  Title    : fetch_input
  Usage    : $self->fetch_input
  Function : prepares global variables and DB connections:
             decodes the input_id, creates the Compara DBAdaptor and
             fetches the GenomeDB object named by the 'gdb' key
  Returns  : 1
  Args     : none
  Throws   : if the input_id is missing, cannot be decoded into a hash
             reference, or has no 'gdb' key

=cut

sub fetch_input {
  my ($self) = @_;

  $self->throw("No input_id") unless defined($self->input_id);
  print("input_id = ".$self->input_id."\n");
  $self->throw("Improper formated input_id") unless ($self->input_id =~ /\{/);

  # NOTE(review): the input_id is decoded with a string eval, which executes
  # whatever perl code the string contains.  It must only ever come from the
  # pipeline's own job table, never from an untrusted source.
  my $input_hash = eval($self->input_id);
  if($@ or ref($input_hash) ne 'HASH') {
    # without this check a failed eval was silently ignored and a non-hash
    # result died later with an unrelated "not a HASH reference" error
    $self->throw("Improper formated input_id" . ($@ ? " ($@)" : ""));
  }

  my $genome_db_id = $input_hash->{'gdb'};
  $self->throw("No genome_db_id in input_id") unless defined($genome_db_id);

  #create a Compara::DBAdaptor which shares the same DBI handle
  #with the pipeline DBAdaptor that is passed into this runnable
  $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN => $self->db->dbc);

  #get the Compara::GenomeDB object for the genome_db_id
  $self->{'genome_db'} = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_dbID($genome_db_id);

  return 1;
}


=head2 run

  Title    : run
  Usage    : $self->run
  Function : keeps the database connection open while the statistics are
             calculated and stored, then re-enables disconnect_when_inactive
             (also when the calculation dies)
  Returns  : 1
  Args     : none

=cut

sub run {
  my $self = shift;

  my $dbc = $self->{'comparaDBA'}->dbc;
  $dbc->disconnect_when_inactive(0);

  # trap any exception so the connection setting is always restored;
  # the exception is re-thrown unchanged afterwards
  my $ok  = eval { $self->calc_intergenic_stats(); 1; };
  my $err = $@;

  $dbc->disconnect_when_inactive(1);
  die $err unless($ok);

  return 1;
}


=head2 write_output

  Title    : write_output
  Usage    : $self->write_output
  Function : nothing to do, the statistics are already stored by run()
  Returns  : 1
  Args     : none

=cut

sub write_output {
  my $self = shift;
  #need to subclass otherwise it defaults to a version that fails
  #just return 1 so success
  return 1;
}


######################################
#
# subroutines
#
#####################################

=head2 calc_intergenic_stats

  Title    : calc_intergenic_stats
  Usage    : $self->calc_intergenic_stats
  Function : collects the intergenic distances and the gene sizes of all
             'ENSEMBLGENE' members of the genome and stores both data sets
             via insert_statistics().  Does nothing if no genome_db was
             fetched.
  Returns  : none
  Args     : none

=cut

sub calc_intergenic_stats {
  my $self = shift;

  return unless(defined($self->{'genome_db'}));

  print("calc_inter_genic_distance for '", $self->{'genome_db'}->name(), "'\n");

  my $memberDBA = $self->{'comparaDBA'}->get_MemberAdaptor();
  # NOTE(review): the clause is set on the adaptor and not reset here;
  # presumably it stays in effect for later fetches through the same
  # adaptor - confirm against MemberAdaptor.
  $memberDBA->_final_clause("ORDER BY m.chr_name, m.chr_start");

  my $sortedMembers = $memberDBA->fetch_by_source_taxon('ENSEMBLGENE', $self->{'genome_db'}->taxon_id) || [];
  print(scalar(@{$sortedMembers}) . " members to process\n");

  my $intergenic_stats = Statistics::Descriptive::Full->new();
  my $genesize_stats   = Statistics::Descriptive::Full->new();
  my $overlapCount     = 0;

  my $lastMember;
  foreach my $member (@{$sortedMembers}) {
    #$member->print_member;

    # every gene contributes its size, including the first one (which used
    # to be shifted off the list before the loop and was never counted)
    $genesize_stats->add_data($member->chr_end - $member->chr_start);

    if(defined($lastMember)) {
      if($lastMember->chr_name ne $member->chr_name) {
        # chromosome changed: no intergenic distance between these two
        print("broken syntenty from \n");
        $lastMember->print_member;
        $member->print_member;
      }
      else {
        my $dist = ($member->chr_start - $lastMember->chr_end);
        # overlapping genes are recorded as distance 0 and tallied
        if($dist < 0) { $dist=0; $overlapCount++; }
        $intergenic_stats->add_data($dist);
      }
    }
    $lastMember = $member;
  }

  # the overlap count travels with the stats object so that
  # insert_statistics() can store it alongside the other values
  $intergenic_stats->{'overlapCount'} = $overlapCount;
  print("intergenic overlap = ",$intergenic_stats->{'overlapCount'},"\n");

  $self->insert_statistics($intergenic_stats, 'intergenic');
  $self->insert_statistics($genesize_stats, 'gene_size');
}


=head2 insert_statistics

  Title    : insert_statistics
  Usage    : $self->insert_statistics($stats, 'intergenic')
  Function : stores count, mean, median, mode, stddev, variance, min and max
             of a data set as one row of genome_db_stats (INSERT ignore).
             overlap_count is stored only when $stats->{'overlapCount'} is
             true.
  Returns  : none
  Args     : $stats    - Statistics::Descriptive::Full object
             $dataType - string stored in the data_type column

=cut

sub insert_statistics {
  my $self     = shift;
  my $stats    = shift;
  my $dataType = shift;

  # scalar() keeps every value a single element: the original evaluated
  # these calls in string concatenation, i.e. in scalar context
  my @columns = (
    [ 'data_type',    $dataType ],
    [ 'genome_db_id', scalar($self->{'genome_db'}->dbID) ],
    [ 'count',        scalar($stats->count()) ],
    [ 'mean',         scalar($stats->mean()) ],
    [ 'median',       scalar($stats->median()) ],
    [ 'mode',         scalar($stats->mode()) ],
    [ 'stddev',       scalar($stats->standard_deviation()) ],
    [ 'variance',     scalar($stats->variance()) ],
    [ 'min',          scalar($stats->min()) ],
    [ 'max',          scalar($stats->max()) ],
  );
  push(@columns, [ 'overlap_count', $stats->{'overlapCount'} ])
    if($stats->{'overlapCount'});

  # placeholders instead of values pasted into the statement, so that no
  # value can break the SQL
  my $sql = "INSERT ignore INTO genome_db_stats SET "
          . join(" ,", map { $_->[0]."=?" } @columns);

  # an undefined statistic is sent as an empty string, which is what the
  # original string-built statement sent as well
  my @values = map { defined($_->[1]) ? $_->[1] : '' } @columns;

  print("$sql (", join(", ", @values), ")\n");
  my $sth = $self->db->dbc->prepare($sql);
  $sth->execute(@values);
  $sth->finish;
}

1;