Raw content of Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster
=head1 NAME
GeneCluster
=head1 SYNOPSIS
=head1 DESCRIPTION
This object holds one or more genes which has been clustered according to
comparison criteria external to this class (for instance, in the
methods compare and _compare_Genes methods of the class GeneComparison).
Each GeneCluster object holds the IDs of the genes clustered and the beginning and end coordinates
of each one (taken from the start and end coordinates of the first and last exon in the correspondig
get_all_Exons array)
=head1 CONTACT
eae@sanger.ac.uk
=cut
# Let the code begin ...
package Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::Root;
use Bio::EnsEMBL::Utils::Exception qw(throw warning );
#use Bio::EnsEMBL::Analysis::Tools::Algorithms::ClusterUtils qw( genes_to_Transcript_Cluster ) ;
@ISA = qw(Bio::EnsEMBL::Root);
=head1 METHODS
=cut
#########################################################################
=head2 new()
new() initializes the attributes:
$self->{'_benchmark_types'}
$self->{'_prediction_types'}
$self->{'_benchmark_genes'}
$self->{'_prediction_genes'}
=cut
sub new {
my ($class,$ignore_strand, $whatever)=@_;
if (ref($class)){
$class = ref($class);
}
my $self = {};
bless($self,$class);
$self->{_ignore_strand} = $ignore_strand;
if ($whatever){
throw( "Can't pass an object to new() method. Use put_Genes() to include Bio::EnsEMBL::Gene in cluster");
}
$self->{_cached_start} = undef;
$self->{_cached_end} = undef;
$self->{_cached_strand} = undef;
$self->{v} = 0 ; # verbosity
return $self;
}
#########################################################################
=head2 put_Genes()
function to include one or more genes in the cluster.
Useful when creating a cluster. It takes as argument an array of genes, it returns nothing.
=cut
sub put_Genes {
my ($self, $ignore_strand, @new_genes)= @_;
if ( !defined( $self->{'_types_sets'} ) ){
throw( "Cluster lacks references to gene-types, unable to put the gene");
}
GENE:
foreach my $gene (@new_genes){
throw("undef for gene. Cannot put_Genes") if (!$gene);
my $gene_biotype = $gene->biotype;
foreach my $set_name ( keys %{$self->{'_types_sets'}}) {
my $set = $self->{'_types_sets'}{$set_name};
foreach my $type ( @{$set} ){
if ($gene_biotype eq $type) {
push ( @{ $self->{'_gene_sets'}{$set_name} }, $gene );
if (defined($self->{_cached_start})) {
my $gene_start = $gene->start;
if ($gene_start < $self->{_cached_start}) {
$self->{_cached_start} = $gene_start;
}
}
if (defined($self->{_cached_end})) {
my $gene_end = $gene->end;
if ($gene_end > $self->{_cached_end}) {
$self->{_cached_end} = $gene_end;
}
}
if (!$self->{_ignore_strand}) {
#if (!$ignore_strand) {
if (defined($self->{_cached_strand})) {
my $gene_strand = $gene->start;
if ($gene_strand != $self->{_cached_strand}) {
throw("You have a cluster with genes on opposite strands");
}
}
}
next GENE;
}
}
}
throw("Failed putting gene of type " . $gene->biotype . "\n");
}
}
sub get_sets_included {
my $self = shift;
my @included_sets;
foreach my $set_name ( keys %{$self->{'_types_sets'}}) {
if (defined( $self->{'_gene_sets'}{$set_name})) {
push @included_sets,$set_name;
}
}
return \@included_sets;
}
#########################################################################
=head2 get_Genes()
it returns the array of genes in the GeneCluster object
=cut
sub get_Genes {
my $self = shift @_;
my @genes;
if (!defined( $self->{'_gene_sets'} ) ) {
$self->warning("The gene array you try to retrieve is empty");
@genes = ();
}
foreach my $set_name (keys %{$self->{'_gene_sets'}}) {
push( @genes, @{ $self->{'_gene_sets'}{$set_name} } );
}
return @genes;
}
sub get_Genes_ref {
my $self = shift @_;
my @genes;
if (!defined( $self->{'_gene_sets'} ) ) {
$self->warning("The gene array you try to retrieve is empty");
@genes = ();
}
foreach my $set_name (keys %{$self->{'_gene_sets'}}) {
push( @genes, @{ $self->{'_gene_sets'}{$set_name} } );
}
return \@genes;
}
sub get_all_Exons {
my $self = shift @_;
my @exons;
if (!defined( $self->{'_gene_sets'} ) ) {
$self->warning("The gene array you try to retrieve exons for is empty");
@exons = ();
}
foreach my $set_name (keys %{$self->{'_gene_sets'}}) {
foreach my $gene (@{ $self->{'_gene_sets'}{$set_name} }) {
push @exons, @{$gene->get_all_Exons};
}
}
return \@exons;
}
############################################################
sub strand{
my $self = shift;
if ($self->{_ignore_strand}) {
print "Strand called, but ignoring\n";
return 0;
}
if (!defined($self->{_cached_strand})) {
my @genes = $self->get_Genes;
unless (@genes){
$self->warning("cannot retrieve the strand in a cluster with no genes");
}
my $strand;
foreach my $gene (@genes){
if (ref($gene)=~m/Gene/) {
foreach my $transcript (@{$gene->get_all_Transcripts}){
unless (defined($strand)) {
$strand = $transcript->start_Exon->strand;
next;
}
if ( $transcript->start_Exon->strand != $strand ){
throw("You have a cluster with genes on opposite strands");
}
}
}else {
# prediction transcript
unless (defined($strand)) {
$strand = $gene->start_Exon->strand;
}
if ( $gene->start_Exon->strand != $strand ){
throw("You have a cluster with genes on opposite strands");
}
}
}
$self->{_cached_strand} = $strand;
}
return $self->{_cached_strand};
}
#########################################################################
=head2 get_Gene_Count()
it returns the number of genes in the GeneCluster object
=cut
sub get_Gene_Count {
my $self = shift @_;
my @genes = $self->get_Genes;
return scalar(@genes);
}
#########################################################################
sub gene_Types {
my ($self, $set_name, $types) = @_;
$self->{'_types_sets'}{$set_name} = $types;
return $self->{'_types_sets'}{$set_name};
}
#########################################################################
=head2 get_Genes_of_Type()
We can get the genes in each cluster of one type.
We pass one string identifying the genetype.
The difference with get_Genes_by_Type is that that as an arrayref as argument.
=cut
sub get_Genes_of_Type() {
my ($self,$type) = @_;
unless ($type){
throw( "must provide a type");
}
my @genes = $self->get_Genes; # this should give them in order, but we check anyway
my @selected_genes;
push ( @selected_genes, grep { $_->biotype eq $type } @genes );
return @selected_genes;
}
sub get_Genes_by_Set() {
my ($self,$set) = @_;
unless ($set){
throw( "must provide a set");
}
my @selected_genes;
if ($self->{v}){
for (keys %{ $self->{_gene_sets} } ) {
print " i know the following sets : $_\n" ;
}
}
if (!defined($self->{'_gene_sets'}{$set})) {
# throw("No genes of set name $set");
#warning("No genes of set name $set in cluster");
}else{
push @selected_genes, @{$self->{'_gene_sets'}{$set}};
}
return @selected_genes;
}
#########################################################################
=head2 get_Genes_by_Type()
We can get the genes in each cluster of a given type.
We pass an arrayref containing the types we want to retrieve.
=cut
sub get_Genes_by_Type() {
my ($self,$types) = @_;
unless ($types){
throw( "must provide a type");
}
my @genes = $self->get_Genes; # this should give them in order, but we check anyway
my @selected_genes;
foreach my $type ( @{ $types } ){
push ( @selected_genes, grep { $_->biotype eq $type } @genes );
}
return @selected_genes;
}
#########################################################################
=head2 to_String()
it returns a string containing the information about the genes contained in the
GeneCluster object
=cut
sub to_String {
my $self = shift @_;
my $data='';
foreach my $gene ( $self->get_Genes ){
my @exons = @{ $gene->get_all_Exons };
$data .= sprintf "Id: %-16s" , $gene->stable_id;
$data .= sprintf "Contig: %-20s" , $exons[0]->contig->id;
$data .= sprintf "Exons: %-3d" , scalar(@exons);
$data .= sprintf "Start: %-9d" , $self->_get_start($gene);
$data .= sprintf "End: %-9d" , $self->_get_end ($gene);
$data .= sprintf "Strand: %-2d\n" , $exons[0]->strand;
}
return $data;
}
#########################################################################
=head2 _get_start()
function to get the start position of a gene - it reads the gene object and it returns
the start position of the first exon
=cut
sub _get_start {
my ($self,$gene) = @_;
my @exons = @{ $gene->get_all_Exons };
my $st;
if ($exons[0]->strand == 1) {
@exons = sort {$a->start <=> $b->start} @exons;
$st = $exons[0]->start;
} else {
@exons = sort {$b->start <=> $a->start} @exons; # they're read in opposite direction (from right to left)
$st = $exons[0]->end; # the start is the end coordinate of the right-most exon
} # which is here the first of the list of sorted @exons
return $st;
}
#########################################################################
=head2 _get_end()
function to get the end position of a gene - it reads the gene object and it returns
the end position of the last exon
=cut
sub _get_end {
my ($self,$gene) = @_;
my @exons = @{ $gene->get_all_Exons };
my $end;
if ($exons[0]->strand == 1) {
@exons = sort {$a->start <=> $b->start} @exons;
$end = $exons[$#exons]->end;
} else {
@exons = sort {$b->start <=> $a->start} @exons; # they're read in opposite direction (from right to left)
$end = $exons[$#exons]->start; # the end is the start coordinate of the left-most exon
} # which is here the last of the list @exons
return $end;
}
=head2 _translateable_exon_length()
internal function that returns the length of the translateable exons
=cut
sub _translateable_exon_length {
my ($trans)= @_;
my @exons = $trans->translateable_exons;
my $length = 0;
foreach my $ex (@exons) {
$length += $ex->length;
}
return $length;
}
# method to get the start of the cluster, which we take to be the left_most exon_coordinate
# i.e. the start coordinate of the first exon ordered as { $a->start <=> $b->start }, regardless of the strand
sub start{
my ($self) = @_;
if (!defined($self->{_cached_start})) {
my $start;
foreach my $gene ($self->get_Genes) {
my $this_start = $gene->start;
unless ( $start ){
$start = $this_start;
}
if ( $this_start < $start ){
$start = $this_start;
}
}
$self->{_cached_start} = $start;
}
return $self->{_cached_start};
}
# method to get the end of the cluster, which we take to be the right_most exon_coordinate
# this being the end coordinate of the first exon ordered as { $b->end <=> $a->end }, regardless of the strand
sub end{
my ($self) = @_;
if (!defined($self->{_cached_end})) {
my $end;
foreach my $gene ($self->get_Genes) {
my $this_end = $gene->end;
unless ( $end ){
$end = $this_end;
}
if ( $this_end > $end ){
$end = $this_end;
}
}
$self->{_cached_end} = $end;
}
return $self->{_cached_end};
}
=head2 get_exon_clustering_from_gene_cluster
Name : $self->get_exon_clustering_from_gene_cluster()
Arg[0] : Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster;
Function : gets a GeneCluster and converts it by building a TranscriptCluster, than
clusters the exons of all Transcripts and returns an array-ref to
Bio::EnsEMBL::ExonCluster-objects
Returnval : Aref of Bio::EnsEMBL::Analysis::Tools::Algorithms::ExonCluster objects
=cut
sub get_exon_clustering_from_gene_cluster {
my ($self, $ignore_strand) = @_ ;
my @clg = sort {$a->start <=> $b->start} $self->get_Genes ;
# building Transcript-Cluster
#my $tc = genes_to_Transcript_Cluster(\@clg);
my $tc = $self->get_TranscriptCluster ($ignore_strand);
my @exon_clusters = $tc->get_ExonCluster_using_all_Exons($ignore_strand) ;
if (defined $tc->strand && $tc->strand eq '1') {
@exon_clusters = sort { $a->start <=> $b->start } @exon_clusters ;
} elsif (defined $tc->strand && $tc->strand eq '-1') {
@exon_clusters = sort { $b->start <=> $a->start } @exon_clusters ;
} else {
# for cases where TranscriptCluster does not have strand defined
# then cluster as for (+) strand
warning("TranscriptCluster's strand is not defined. Are you ignoring strand?");
@exon_clusters = sort { $a->start <=> $b->start } @exon_clusters ;
}
return \@exon_clusters ;
}
=head2 get_coding_exon_clustering_from_gene_cluster
Name : $self->get_coding_exon_clustering_from_gene_cluster()
Arg[0] : Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster;
Function : gets a GeneCluster and converts it by building a TranscriptCluster,
using only coding exons for the clustering. It then clusters the
coding exons of all Transcripts and returns an array-ref to
Bio::EnsEMBL::ExonCluster-objects
Returnval : Aref of Bio::EnsEMBL::Analysis::Tools::Algorithms::ExonCluster objects
=cut
sub get_coding_exon_clustering_from_gene_cluster {
my ($self, $ignore_strand) = @_ ;
my @clg = sort {$a->start <=> $b->start} $self->get_Genes ;
# building Transcript-Cluster
#my $tc = genes_to_Transcript_Cluster(\@clg);
my $tc = $self->get_coding_TranscriptCluster ($ignore_strand);
my @exon_clusters = $tc->get_coding_ExonCluster($ignore_strand) ;
if (defined $tc->strand && $tc->strand eq '1') {
@exon_clusters = sort { $a->start <=> $b->start } @exon_clusters ;
} elsif (defined $tc->strand && $tc->strand eq '-1') {
@exon_clusters = sort { $b->start <=> $a->start } @exon_clusters ;
} else {
# for cases where TranscriptCluster does not have strand defined
# then cluster as for (+) strand
warning("TranscriptCluster's strand is not defined. Are you ignoring strand?");
@exon_clusters = sort { $a->start <=> $b->start } @exon_clusters ;
}
return \@exon_clusters ;
}
=head2 get_TranscriptCluster
Name : $self->get_TranscriptCluster()
Arg[0] : Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster
Function : gets a GeneCluster and converts it by building a TranscriptCluster, than
clusters the exons of all Transcripts and returns an array-ref to
Bio::EnsEMBL::ExonCluster-objects
Returnval : Aref of Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster objects
=cut
sub get_TranscriptCluster {
my ($self, $ignore_strand) = @_;
#my ($genes_or_predTrans) = @_;
my $tc = Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new() ;
print "building new TranscriptCluster\n" if $self->{v};
foreach my $gene ( $self->get_Genes ) {
#foreach my $gene (@$genes_or_predTrans) {
if( ref($gene)=~m/Gene/){
# is a Bio::EnsEMBL::Gene
foreach my $trans (@{$gene->get_all_Transcripts}) {
if ($gene->strand ne $trans->strand ) {
throw("Weird - gene is on other strand than transcript\n") ;
}
for (@{ $trans->get_all_Exons} ) {
if ($_->strand ne $trans->strand ) {
print $trans->biotype . " " . $trans->seq_region_start . " "
. $trans->seq_region_end . " " . $trans->seq_region_strand ."\n" ;
print $_->biotype . " " . $_->seq_region_start . " " . $_->seq_region_end . " " .$_->seq_region_strand . "\n";
throw("Weird - exon is on other strand than transcript\n") ;
}
}
# assure that transcript has same biotype as gene
$trans->biotype($gene->biotype) ;
#$trans->sort;
print "Adding transcript " . $trans->stable_id . "\n" if $self->{v};
$tc->put_Transcripts($ignore_strand, $trans);
$tc->register_biotype($gene->biotype) ;
}
} else {
# is not a Bio::EnsEMBL::Gene
warning("Not having a Bio::EnsEMBL::Gene-object : clustering $gene\n") ;
$gene->sort ;
$tc->put_Transcripts($ignore_strand, $gene) ;
}
}
return $tc;
}
sub get_coding_TranscriptCluster {
my ($self, $ignore_strand) = @_;
#my ($genes_or_predTrans) = @_;
my $tc = Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new() ;
print "building new TranscriptCluster using only coding Exons\n" if $self->{v};
foreach my $gene ( $self->get_Genes ) {
if( ref($gene)=~m/Gene/){
# is a Bio::EnsEMBL::Gene
# loop though transcripts to get clusters
foreach my $trans (@{$gene->get_all_Transcripts}) {
if ($gene->strand ne $trans->strand ) {
throw("Weird - gene is on other strand than transcript\n") ;
}
for (@{ $trans->get_all_translateable_Exons} ) {
if ($_->strand ne $trans->strand ) {
print $trans->biotype . " " . $trans->seq_region_start . " "
. $trans->seq_region_end . " " . $trans->seq_region_strand ."\n" ;
print $_->biotype . " " . $_->seq_region_start . " " . $_->seq_region_end . " " .$_->seq_region_strand . "\n";
throw("Weird - exon is on other strand than transcript\n") ;
}
}
# assure that transcript has same biotype as gene
$trans->biotype($gene->biotype) ;
$trans->sort;
print "Adding transcript " . $trans->stable_id . "\n" if $self->{v};
# keep the type sets
$tc->{'_types_sets'} = $self->{'_types_sets'};
$tc->put_Transcripts($ignore_strand, $trans);
$tc->register_biotype($gene->biotype) ;
}
} else {
# is not a Bio::EnsEMBL::Gene
warning("Not having a Bio::EnsEMBL::Gene-object : clustering $gene\n") ;
$gene->sort ;
$tc->put_Transcripts($ignore_strand, $gene) ;
}
}
return $tc;
}
1;