# Raw content of Bio::EnsEMBL::Analysis::RunnableDB::FindSplitGenes
package Bio::EnsEMBL::Analysis::RunnableDB::FindSplitGenes;
use strict;
use Bio::SeqIO;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils;
#use Bio::EnsEMBL::Analysis::Tools::Utilities;
use Bio::EnsEMBL::Analysis::Tools::Utilities qw ( shuffle ) ;
use Bio::EnsEMBL::Analysis::Config::GeneBuild::OrthologueEvaluator;
use Bio::EnsEMBL::Analysis::Config::Exonerate2Genes;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Pipeline::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Pipeline::DBSQL::RuleAdaptor;
use Bio::EnsEMBL::Pipeline::Rule;
use vars qw(@ISA);
use Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator;
@ISA = qw ( Bio::EnsEMBL::Analysis::RunnableDB::OrthologueEvaluator ) ;
# Constructor.
#
# Arguments : the class name followed by any further arguments; these are
#             handed unchanged to the parent class constructor.
# Returns   : the object produced by the parent constructor.
#
# No state of its own is initialised here; the sub exists only to delegate
# to SUPER::new.
sub new {
  my ($class, @args) = @_;
  my $self = $class->SUPER::new(@args);
  return $self;
}
# Fetches the genes out of the QUERY database
# (the db which contains the predictions of the species we're working on).
# The module FindSplitGenes queries the compara database with raw SQL.
# It accepts any arbitrary string as input id; it does not run on slices etc.
# Queries the Compara database with two raw SQL statements to find candidate
# split genes, prints a report of the candidates, and stores the collected
# informant stable ids through $self->split_genes (final statement).
#
# NOTE(review): this copy of the method looks damaged. No closing braces are
# present, two statements end in a dangling comma, and no bind_columns calls
# are visible for either statement handle. Compare with the upstream module
# before relying on the exact control flow shown here.
sub fetch_input {
my( $self ) = @_;
# Registry name under which the compara adaptors are looked up.
my $dbname = 'Compara' ;
# Informant stable ids gathered below; handed to split_genes() at the end.
my @potential_stable_ids_for_recovery ;
# INSERT INTO `analysis` VALUES (100,'2006-11-22 16:39:35','FindSplitGenes',NULL,NULL,NULL,NULL,NULL,NULL,NULL,'FindSplitGenes',NULL,NULL,NULL);
my $dbc = Bio::EnsEMBL::Registry->get_DBAdaptor($dbname ,'compara') ;
my $gdba = Bio::EnsEMBL::Registry->get_adaptor($dbname,'compara','GenomeDB');
my $mlssa = Bio::EnsEMBL::Registry->get_adaptor($dbname,'compara','MethodLinkSpeciesSet');
# Abort when the registry returned no compara DBAdaptor. Only $dbc is
# checked; $gdba and $mlssa are used below without a check.
unless ($dbc){
throw ("Can't get Adaptor for comara ") ;
my $method_link_type = "ENSEMBL_ORTHOLOGUES";
# this should be the same so REVOCER_SPLIT_GENES ... can be removed ...
# NOTE(review): $qy_species is read here but not used again in the visible code.
my $qy_species = $$MAIN_CONFIG{QUERY_SPECIES};
# Per-analysis settings, keyed by $self->post_logic_name in the
# FIND_SPLIT_GENES configuration hash.
my $targeted_species = $$FIND_SPLIT_GENES{ANALYSIS_SETS}{$self->post_logic_name}{TARGETTED};
my $informant_species = $$FIND_SPLIT_GENES{ANALYSIS_SETS}{$self->post_logic_name}{INFORMANT};
#my $tg2_species = $$FIND_PARTIAL_GENES{ANALYSIS_SETS}{$self->post_logic_name}{TARGETED};
my $informant_perc_cov = $$FIND_SPLIT_GENES{ANALYSIS_SETS}{$self->post_logic_name}{INFORMANT_PERC_COV} ;
my $targeted_perc_cov = $$FIND_SPLIT_GENES{ANALYSIS_SETS}{$self->post_logic_name}{TARGETTED_PERC_COV} ;
# Resolve both species to GenomeDB objects and look up the orthologue
# method-link-species-set that joins them.
my $informant_gdb = $gdba->fetch_by_name_assembly($informant_species);
my $targeted_gdb = $gdba->fetch_by_name_assembly($targeted_species);
my $mlss = $mlssa->fetch_by_method_link_type_GenomeDBs($method_link_type,[$informant_gdb, $targeted_gdb]);
# Hard-coded to 0, so the chromosome comparison further down is always active.
my $no_chr_constraint = 0;
# First query: informant stable ids (grouped) for homologies in the chosen
# method-link-species-set where the informant member's perc_cov is below one
# threshold and the targeted member's perc_cov is above another.
my $sql = "SELECT im.stable_id FROM
homology h, homology_member thm, member tm, homology_member ihm, member im
WHERE h.homology_id=thm.homology_id AND
thm.member_id=tm.member_id AND
h.homology_id=ihm.homology_id AND
ihm.member_id=im.member_id AND
h.method_link_species_set_id = ? AND
tm.genome_db_id = ? AND im.genome_db_id = ? AND
ihm.perc_cov < ? AND thm.perc_cov > ?
group by im.stable_id;";
my $sth = $dbc->prepare($sql);
# Placeholder order: mlss id, targeted genome_db id, informant genome_db id,
# informant coverage threshold, targeted coverage threshold.
$sth->execute($mlss->dbID, $targeted_gdb->dbID, $informant_gdb->dbID,$informant_perc_cov, $targeted_perc_cov);
# Second query: description, location and coverage of the informant member
# and of each targeted member paired with one given informant stable id.
$sql = "SELECT h.description,im.stable_id,im.chr_name,im.chr_start,im.chr_end,im.chr_strand,
ihm.perc_cov,tm.stable_id,tm.chr_name,tm.chr_start,tm.chr_end,tm.chr_strand,thm.perc_cov FROM
homology h, homology_member thm, member tm, homology_member ihm, member im
WHERE h.homology_id=thm.homology_id AND
thm.member_id=tm.member_id AND
h.homology_id=ihm.homology_id AND
ihm.member_id=im.member_id AND
h.method_link_species_set_id = ? AND
tm.genome_db_id = ? AND im.genome_db_id = ? AND
im.stable_id = ?";
# NOTE(review): $stable_id is passed to $sth1->execute below, but no visible
# statement binds it to a column of $sth - presumably a bind_columns call is
# missing from this copy; confirm.
my $stable_id;
my $sth1 = $dbc->prepare($sql);
# NOTE(review): initialised to 0 and printed in the summary at the end, but
# never incremented in the visible code.
my $targeted_split_genes = 0;
my %stable_ids;
my ($min_start, $max_end);
print "#Possible split genes in $targeted_species #using $informant_species as informant\n";
# One iteration per row returned by the first query.
while ($sth->fetch) {
$sth1->execute($mlss->dbID, $targeted_gdb->dbID, $informant_gdb->dbID, $stable_id);
# Skip this informant gene unless the detail query reports at least two rows.
next if ($sth1->rows < 2);
# NOTE(review): this declaration list is cut off after $im_chr_strand (it
# ends in a comma with no closing parenthesis); the remaining column
# variables used below ($ihm_perc_cov, $tm_*, ...) are not declared in the
# visible code.
my ($description,$im_stable_id,$im_chr_name,$im_chr_start,$im_chr_end,$im_chr_strand,
my @split_gene_data = ();
my $split_gene_chr;
while ($sth1->fetch) {
# Remember the chromosome of the first targeted member seen.
unless (defined $split_gene_chr) {
$split_gene_chr = $tm_chr_name;
# Discard what was collected when the informant coverage exceeds the
# targeted coverage threshold.
if ($ihm_perc_cov >$targeted_perc_cov) {# || $split_gene_chr ne $tm_chr_name)
@split_gene_data = ();
$split_gene_chr = undef;
# Discard what was collected when a targeted member lies on a different
# chromosome from the remembered one (unless the constraint is disabled).
if (!$no_chr_constraint && $split_gene_chr ne $tm_chr_name) {
@split_gene_data = ();
$split_gene_chr = undef;
# NOTE(review): the array-ref literal pushed here is cut off after
# $im_chr_start (dangling comma, no closing bracket).
push @split_gene_data, [$description,$im_stable_id,$im_chr_name,$im_chr_start,
# Track the smallest start and largest end among the targeted members.
$min_start = $tm_chr_start unless (defined $min_start);
$min_start = $tm_chr_start if ($min_start > $tm_chr_start);
$max_end = $tm_chr_end unless (defined $max_end);
$max_end = $tm_chr_end if ($max_end < $tm_chr_end);
# Record both the informant and the targeted stable id as seen.
$stable_ids{$im_stable_id} =1;
$stable_ids{$tm_stable_id} =1;
push @potential_stable_ids_for_recovery, $im_stable_id ;
# Print a header plus one formatted row per collected gene piece.
if (scalar @split_gene_data) {
printf "#%-5s %20s %5s %9s %9s %6s %3s %20s %5s %9s %9s %6s %3s\n", qw(desc stable_id chr start end strand cov stable_id chr start end strand cov);
foreach my $gene_piece (@split_gene_data) {
printf " %-5s %20s %5s %9d %9d %6s %3d %20s %5s %9d %9d %6s %3d\n",@{$gene_piece};
print "#----\n";
# Reset the bookkeeping that is kept across iterations.
%stable_ids = ();
$min_start = undef;
$max_end = undef;
print "#Potentially $targeted_split_genes $targeted_species split genes\n";
# Store the collected informant stable ids for later retrieval via split_genes().
$self->split_genes(\@potential_stable_ids_for_recovery) ;
# Turns the stable ids stored via split_genes() into translated sequences.
#
# The ids are looked up through the core gene adaptor of the configured
# INFORMANT species, passed to get_longest_transcripts(), and the result is
# stored with $self->output. A one-line count of the stored sequences is
# printed afterwards.
#
# Arguments : none besides the invocant.
# Returns   : the value of the final print statement (not meaningful).
sub run {
  my ($self) = @_;

  # split_genes() returns undef when nothing was stored; dereferencing undef
  # would die under 'use strict', so fall back to an empty list instead.
  my @identified_stable_ids = @{ $self->split_genes || [] };

  my $informant_species = $$FIND_SPLIT_GENES{ANALYSIS_SETS}{$self->post_logic_name}{INFORMANT};
  my $ga = Bio::EnsEMBL::Registry->get_adaptor($informant_species, 'core', 'gene');

  $self->output(get_longest_transcripts($ga, \@identified_stable_ids));
  print " have " . scalar (@{$self->output}) . " sequences to write\n" ;
}
# Writes the collected sequences out and registers the resulting files.
#
# The stored output is shuffled, passed to chunk_and_write_fasta_sequences(),
# and whatever that call returns is handed to upload_input_ids().
# NOTE(review): both methods are defined outside this file - presumably in
# the OrthologueEvaluator parent class; confirm.
#
# Arguments : none besides the invocant.
# Returns   : whatever upload_input_ids() returns (last statement).
sub write_output {
  my ($self) = @_;

  # sequences need to be proteins so let's translate them and check for stop
  # codon before
  my $written_files = $self->chunk_and_write_fasta_sequences( shuffle($self->output) );
  $self->upload_input_ids($written_files);
}
# Configuration hook that performs no checks in this module.
#
# The body only unpacks the invocant.
# NOTE(review): presumably this overrides a config check in a parent class
# so that it becomes a no-op for this analysis; confirm against the parent.
#
# Arguments : none besides the invocant.
sub read_and_check_config {
  (my $self) = @_ ;
}
# Getter/setter for the value kept under the 'split_genes' key of the object.
#
# Arguments : $arg (optional) - new value to store. It is stored only when
#             true, so a false value (undef, 0, '') never overwrites or
#             clears what is already there.
# Returns   : the currently stored value, or undef when none has been set.
sub split_genes {
  my ($self, $arg) = @_;
  $self->{split_genes} = $arg if $arg;
  return $self->{split_genes};
}
# TODO: this method should move into a shared Utils module - it is used by MissingOrthologues as well.
# For each stable id, picks the gene's longest translatable transcript and
# collects its translated sequence.
#
# "Longest" means the largest summed length of the transcript's translateable
# exons; transcripts without a translation are ignored. A gene is skipped,
# with a message on STDOUT, when
#   - no gene object is returned for the stable id,
#   - none of its transcripts has a translation, or
#   - the translated sequence contains a '*' (stop codon).
#
# Arguments : $ga         - adaptor offering fetch_by_stable_id()
#             $stable_ids - array ref of gene stable ids
# Returns   : the result of shuffle() applied to an array ref holding the
#             collected translation objects, each with its display_id set to
#             the stable id of the transcript's translation.
sub get_longest_transcripts {
  my ($ga, $stable_ids) = @_;
  my @cds;

  GENES: for my $sid (@$stable_ids) {
    my $gene = $ga->fetch_by_stable_id($sid);

    # Calling a method on an undefined gene would abort the whole run, so
    # skip ids for which the adaptor returned nothing.
    unless (defined $gene) {
      print "No gene found for stable id " . $sid . "\n";
      next GENES;
    }

    my $maxlen = 0;
    my $longest_transcript;

    foreach my $trans (@{ $gene->get_all_Transcripts }) {
      next unless $trans->translation;

      my $len = 0;
      foreach my $exon (@{ $trans->get_all_translateable_Exons() }) {
        $len += $exon->length;
      }

      # Strictly greater: on a tie the first transcript seen is kept.
      if ($len > $maxlen) {
        $maxlen             = $len;
        $longest_transcript = $trans;
      }
    }

    if (!defined($longest_transcript)) {
      print "No longest_transcript transcript found for " . $gene->stable_id . "\n" ;
      next GENES;
    }

    # Translate once; the same object supplies both the sequence that is
    # checked and the entry that is collected.
    my $cds = $longest_transcript->translate;

    # Wrap at 60 characters per line; used only for the stop-codon check
    # and the message below.
    my $transstr = $cds->seq;
    $transstr =~ s/(.{1,60})/$1\n/g;

    if ($transstr =~ m/\*/) {
      print "SKIPPING " . $longest_transcript->translation->stable_id .
      " : translation contain stop codon\n$transstr\n\n";
      next GENES;
    }

    $cds->display_id($longest_transcript->translation->stable_id);
    push @cds, $cds;
  }

  # randomize
  return shuffle(\@cds);
}