sub _compare_Genes
{ my ($gene1,$gene2,$translate, $ignore_strand) = @_;
if ($gene1->end < $gene2->start || $gene1->start > $gene2->end) {
print "Gene 1 " . $gene1->start . " " . $gene1->end . "\n ";
print "Gene 2 " . $gene1->start . " " . $gene1->end . "\n ";
print "Failed extents check - returning 0\n";
return 0;
}
if ($translate) {
my $exons1 = get_coding_exons_for_gene($gene1);
my $exons2 = get_coding_exons_for_gene($gene2);
foreach my $exon1 (@$exons1) {
foreach my $exon2 (@$exons2) {
if (!$ignore_strand) {
if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){
return 1;
}
} else {
if ($exon1->overlaps($exon2)){
return 1;
}
}
}
}
} else {
foreach my $exon1 (@{$gene1->get_all_Exons}){
foreach my $exon2 (@{$gene2->get_all_Exons}){
if (!$ignore_strand) {
if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){
return 1;
}
} else {
if ($exon1->overlaps($exon2)){
return 1;
}
}
}
}
}
return 0; } |
sub cluster_Genes
{ my ($genes, $types_hash, $check_coding_overlap, $ignore_strand) = @_ ;
return ([],[]) if (!scalar(@$genes));
my @sorted_genes =
sort { $a->start <=> $b->start ? $a->start <=> $b->start : $b->end <=> $a->end } @$genes;
print STDERR "Clustering ".scalar( @sorted_genes )." genes on slice\n" ;
for ( @sorted_genes ) {
my $tr = scalar ( @{$_->get_all_Transcripts} ) ;
if ( $tr == 0 ) {
throw("data error - gene with gene_id " . $_->dbID ." biotype " . $_->biotype . " does not have any associated transcripts\n ") ;
}
}
my $count = 0;
my @active_clusters;
my @inactive_clusters;
TRANSCRIPT: foreach my $gene (@sorted_genes) {
$count++;
if (!($count%50)) {
my @still_active_clusters;
my $gene_start = $gene->start;
foreach my $cluster (@active_clusters) {
if ($cluster->end < $gene_start) {
push @inactive_clusters,$cluster;
} else {
push @still_active_clusters,$cluster;
}
}
@active_clusters = @still_active_clusters;
}
my @matching_clusters;
CLUSTER: foreach my $cluster (@active_clusters) {
if ($gene->end >= $cluster->start && $gene->start <= $cluster->end) {
foreach my $cluster_gene ($cluster->get_Genes){
if ($gene->end >= $cluster_gene->start && $gene->start <= $cluster_gene->end) {
if (_compare_Genes( $gene, $cluster_gene, $check_coding_overlap, $ignore_strand)) {
push (@matching_clusters, $cluster);
next CLUSTER;
}
}
}
}
}
if (scalar(@matching_clusters) == 0) {
my $newcluster = Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster->new($ignore_strand);
foreach my $set_name (keys %$types_hash) {
$newcluster->gene_Types($set_name,$types_hash->{$set_name});
}
$newcluster->put_Genes($ignore_strand, $gene);
push(@active_clusters,$newcluster);
} elsif (scalar(@matching_clusters) == 1) {
$matching_clusters[0]->put_Genes($ignore_strand, $gene);
} else {
my @new_clusters;
my $merged_cluster = Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster->new($ignore_strand);
foreach my $set_name (keys %$types_hash) {
$merged_cluster->gene_Types($set_name,$types_hash->{$set_name});
}
my %match_cluster_hash;
foreach my $clust (@matching_clusters) {
$merged_cluster->put_Genes($ignore_strand, $clust->get_Genes);
$match_cluster_hash{$clust} = $clust;
}
$merged_cluster->put_Genes($ignore_strand, $gene);
push @new_clusters,$merged_cluster;
foreach my $clust (@active_clusters) {
if (!exists($match_cluster_hash{$clust})) {
push @new_clusters,$clust;
}
}
@active_clusters = @new_clusters;
}
}
my @clusters = (@active_clusters,@inactive_clusters);
my (@new_clusters, @unclustered);
foreach my $cl (@clusters){
if ( $cl->get_Gene_Count == 1 ){
push @unclustered, $cl;
} else{
push( @new_clusters, $cl );
}
}
print STDERR "All Genes clustered\nGot " . scalar(@new_clusters) . " new Clusters\n" ;
return (\@new_clusters,\@ unclustered); } |
sub get_coding_exons_for_gene
{ my ($gene) = @_;
my @coding;
foreach my $trans (@{$gene->get_all_Transcripts}) {
next if (!$trans->translation);
foreach my $exon (@{$trans->get_all_translateable_Exons}) {
push @coding, $exon;
}
}
return\@ coding; } |