Raw content of Bio::EnsEMBL::Compara::RunnableDB::GenomeLoadExonMembers
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Compara::RunnableDB::GenomeLoadExonMembers
=cut
=head1 SYNOPSIS
=cut
=head1 DESCRIPTION
=cut
=head1 CONTACT
Describe contact details here
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Compara::RunnableDB::GenomeLoadExonMembers;
use strict;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::Member;
use Bio::EnsEMBL::Compara::Subset;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::Hive::Process;
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
=head2 fetch_input
Title : fetch_input
Usage : $self->fetch_input
Function: prepares global variables and DB connections
Returns : none
Args : none
=cut
sub fetch_input {
my( $self) = @_;
throw("No input_id") unless defined($self->input_id);
print("input_id = ".$self->input_id."\n");
throw("Improper formated input_id") unless ($self->input_id =~ /^\s*\{/);
my $input_hash = eval($self->input_id);
my $genome_db_id = $input_hash->{'gdb'};
print("gdb = $genome_db_id\n");
throw("No genome_db_id in input_id") unless defined($genome_db_id);
throw("Improper formated analysis parameters") unless ($self->analysis->parameters =~ /^\s*\{/);
$input_hash = eval($self->analysis->parameters);
my $min_length = $input_hash->{'min_length'};
$min_length = 0 unless (defined $min_length);
$self->min_length($min_length);
#create a Compara::DBAdaptor which shares the same DBI handle
#with the pipeline DBAdaptor that is based into this runnable
$self->{'comparaDBA'} = Bio::EnsEMBL::Compara::DBSQL::DBAdaptor->new(-DBCONN => $self->db->dbc);
#get the Compara::GenomeDB object for the genome_db_id
$self->{'genome_db'} = $self->{'comparaDBA'}->get_GenomeDBAdaptor->fetch_by_dbID($genome_db_id);
#using genome_db_id, connect to external core database
$self->{'coreDBA'} = $self->{'genome_db'}->db_adaptor();
throw("Can't connect to genome database for id=$genome_db_id") unless($self->{'coreDBA'});
#global boolean control value (whether the genes are also stored as members)
$self->{'store_genes'} = 1;
$self->{'verbose'} = 0;
#variables for tracking success of process
$self->{'sliceCount'} = 0;
$self->{'geneCount'} = 0;
$self->{'realGeneCount'} = 0;
$self->{'transcriptCount'} = 0;
$self->{'exonCount'} = 0;
return 1;
}
sub run
{
my $self = shift;
$self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
$self->{'coreDBA'}->dbc->disconnect_when_inactive(0);
# main routine which takes a genome_db_id (from input_id) and
# access the ensembl_core database, useing the SliceAdaptor
# it will load all slices, all genes, and all transscripts
# and convert them into members to be stored into compara
$self->loadCodingExonMembersFromCoreSlices();
return 1;
}
sub write_output
{
my $self = shift;
#need to subclass otherwise it defaults to a version that fails
#just return 1 so success
my $output_id = "{gdb=>" . $self->{'genome_db'}->dbID .
",ss=>" . $self->{'exonSubset'}->dbID . "}";
$self->dataflow_output_id($output_id);
return 1;
}
######################################
#
# subroutines
#
#####################################
sub loadCodingExonMembersFromCoreSlices
{
my $self = shift;
#create subsets for the gene members, and the longest peptide members
$self->{'exonSubset'} = Bio::EnsEMBL::Compara::Subset->new(
-name=>"gdb:".$self->{'genome_db'}->dbID ." ". $self->{'genome_db'}->name . ' coding exons');
$self->{'comparaDBA'}->get_SubsetAdaptor->store($self->{'exonSubset'});
my $dfa = $self->{'comparaDBA'}->get_DnafragAdaptor;
#from core database, get all slices, and then all genes in slice
#and then all transcripts in gene to store as members in compara
my @slices = @{$self->{'coreDBA'}->get_SliceAdaptor->fetch_all('toplevel')};
print("fetched ",scalar(@slices), " slices to load from\n");
foreach my $slice (@slices) {
my $df = new Bio::EnsEMBL::Compara::DnaFrag
(-name => $slice->seq_region_name,
-length => $slice->length,
-coord_system_name => $slice->coord_system->name,
-genome_db => $self->{'genome_db'});
$dfa->store_if_needed($df);
$self->{'sliceCount'}++;
#print("slice " . $slice->name . "\n");
my @genes = ();
my $current_end;
foreach my $gene (sort {$a->start <=> $b->start ||
$a->end <=> $b->end} @{$slice->get_all_Genes}) {
$current_end = $gene->end unless (defined $current_end);
$self->{'geneCount'}++;
if((lc($gene->biotype) eq 'protein_coding')) {
$self->{'realGeneCount'}++;
if ($gene->start <= $current_end) {
push @genes, $gene;
$current_end = $gene->end if ($gene->end > $current_end);
} else {
$self->store_all_coding_exons(\@genes);
@genes = ();
$current_end = $gene->end;
push @genes, $gene;
}
}
}
$self->store_all_coding_exons(\@genes);
}
print("loaded ".$self->{'sliceCount'}." slices\n");
print(" ".$self->{'geneCount'}." genes\n");
print(" ".$self->{'realGeneCount'}." real genes\n");
print(" ".$self->{'transcriptCount'}." transcripts\n");
print(" ".$self->{'exonCount'}." exons\n");
print(" ".$self->{'exonSubset'}->count()." in Subset\n");
}
sub store_all_coding_exons
{
my $self = shift;
my $genes = shift;
return 1 if (scalar @{$genes} == 0);
my $MemberAdaptor = $self->{'comparaDBA'}->get_MemberAdaptor();
my $genome_db = $self->{'genome_db'};
my @exon_members = ();
foreach my $gene (@{$genes}) {
foreach my $transcript (@{$gene->get_all_Transcripts}) {
$self->{'transcriptCount'}++;
print(" transcript " . $transcript->stable_id ) if($self->{'verbose'});
foreach my $exon (@{$transcript->get_all_translateable_Exons}) {
unless (defined $exon->stable_id) {
warn("COREDB error: does not contain exon stable id for translation_id ".$exon->dbID."\n");
next;
}
my $description = $self->fasta_description($exon, $transcript);
my $exon_member = new Bio::EnsEMBL::Compara::Member;
$exon_member->taxon_id($genome_db->taxon_id);
if(defined $description ) {
$exon_member->description($description);
} else {
$exon_member->description("NULL");
}
$exon_member->genome_db_id($genome_db->dbID);
$exon_member->chr_name($exon->seq_region_name);
$exon_member->chr_start($exon->seq_region_start);
$exon_member->chr_end($exon->seq_region_end);
$exon_member->chr_strand($exon->seq_region_strand);
$exon_member->version($exon->version);
$exon_member->stable_id($exon->stable_id);
$exon_member->source_name("ENSEMBLEXON");
my $seq_string = $exon->peptide($transcript)->seq;
## a star or a U (selenocysteine) in the seq breaks the pipe to the cast filter for Blast
$seq_string =~ tr/\*U/XX/;
if ($seq_string =~ /^X+$/) {
warn("X+ in sequence from exon " . $exon->stable_id."\n");
}
else {
$exon_member->sequence($seq_string);
}
print(" => member " . $exon_member->stable_id) if($self->{'verbose'});
unless($exon_member->sequence) {
print(" => NO SEQUENCE!\n") if($self->{'verbose'});
next;
}
print(" len=",$exon_member->seq_length ) if($self->{'verbose'});
next if ($exon_member->seq_length < $self->min_length);
push @exon_members, $exon_member;
}
}
}
@exon_members = sort {$b->seq_length <=> $a->seq_length} @exon_members;
my @exon_members_stored = ();
while (my $exon_member = shift @exon_members) {
my $not_to_store = 0;
foreach my $stored_exons (@exon_members_stored) {
if ($exon_member->chr_start <=$stored_exons->chr_end &&
$exon_member->chr_end >= $stored_exons->chr_start) {
$not_to_store = 1;
last;
}
}
next if ($not_to_store);
push @exon_members_stored, $exon_member;
$MemberAdaptor->store($exon_member);
print(" : stored\n") if($self->{'verbose'});
$self->{'exonSubset'}->add_member($exon_member);
$self->{'exonCount'}++;
}
}
sub fasta_description {
my ($self, $exon, $transcript) = @_;
my $description = "Exon:" . $exon->stable_id .
" Chr:" . $exon->seq_region_name .
" Start:" . $exon->seq_region_start .
" End:" . $exon->seq_region_end .
" Transcript:" . $transcript->stable_id;
return $description;
}
sub min_length {
my ($self, $value) = @_;
if (defined $value) {
$self->{_min_length} = $value;
}
return $self->{_min_length};
}
1;