Raw content of Bio::EnsEMBL::Analysis::Tools::Algorithms::ExonCluster
package Bio::EnsEMBL::Analysis::Tools::Algorithms::ExonCluster;
use Bio::EnsEMBL::Transcript ;
use vars qw(@ISA);
use Carp;
use strict;
use Bio::EnsEMBL::Utils::Exception qw (warning throw) ;
sub new {
my ($class,@args) = @_;
my $self = bless {},ref($class) || $class;
$self->{_start} = undef;
$self->{_end} = undef;
$self->{_exonhash} = undef;
$self->{_exonidhash} = undef;
$self->{_transcripthash} = undef;
$self->{_transcriptidhash} = undef;
$self->{_internal_index} = 0;
$self->{_exon_2_biotype} = undef ; # holds biotype of exon
$self->{_all_exons_in_cluster} = {} ; # holds all exons ( key is exon itself )
if (@args) {
$self->throw("Constructor does not expect any parameters");
}
return $self;
}
sub start{
my ($self,$start) = @_;
if ($start){
$self->throw( "$start is not an integer") unless $start =~/^[-+]?\d+$/;
$self->{'_start'} = $start;
}
return $self->{'_start'};
}
sub end{
my ($self,$end) = @_;
if ($end){
$self->throw( "$end is not an integer") unless $end =~/^[-+]?\d+$/;
$self->{'_end'} = $end;
}
return $self->{'_end'};
}
sub length{
my $self = shift @_;
if (@_){
$self->confess( ref($self)."->length() is read-only");
}
return ( $self->{'_end'} - $self->{'_start'} + 1 );
}
sub strand{
my ($self,$strand) = @_;
if ($strand){
$self->{'_strand'} = $strand;
}
return $self->{'_strand'};
}
sub type {
my ($self,$ignore_strand, @args) = @_;
if (@args){
$self->throw("type is a get only method");
}
if (!defined($self->{'_type'})) {
$self->_determine_type($ignore_strand);
}
return $self->{'_type'};
}
sub merge {
my ($self,$cluster, $ignore_strand) = @_;
my %transhash = $cluster->each_transcripts_exons;
foreach my $transref (keys %transhash) {
foreach my $exon (@{$transhash{$transref}}) {
$self->add_exon($exon,$cluster->transcript_from_ref($transref), $ignore_strand);
}
}
$self->{'_type'} = undef;
}
sub merge_new_exon {
my ($self,$cluster, $ignore_strand) = @_;
my %transhash = $cluster->each_transcripts_exons;
foreach my $transref (keys %transhash) {
foreach my $exon (@{$transhash{$transref}}) {
$self->add_exon_if_not_present($exon,$cluster->transcript_from_ref($transref), $ignore_strand);
}
}
$self->{'_type'} = undef;
}
sub add_exon {
my ($self,$exon,$transcript, $ignore_strand) = @_;
if (!$self->contains_exon($exon)) {
$self->_add_new_exon($exon, $ignore_strand);
# print "Added exon " . $exon->dbID . "\n";
}
$self->_add_transcript_reference($exon,$transcript);
$self->_add_exon_biotype($exon,$transcript) ;
if ($self->{_internal_index} != 1) {
#print "Got more than one exon in cluster\n";
}
$self->{'_type'} = undef;
}
# this is a variation on the method above.
# calls contains_exon_with_dbid_and_dbname
# instead of contains_exon
sub add_exon_if_not_present {
my ($self,$exon,$transcript, $ignore_strand) = @_;
if (!$self->contains_exon_with_dbid_and_dbname($exon)) {
$self->_add_new_exon($exon,$ignore_strand);
# print "Added exon " . $exon->dbID . "\n";
} else {
# do not return here or you will not add
# the transcript_reference to _transcripthash
}
$self->_add_transcript_reference($exon,$transcript);
$self->_add_exon_biotype($exon,$transcript) ;
if ($self->{_internal_index} != 1) {
#print "Got more than one exon in cluster\n";
}
$self->{'_type'} = undef;
}
sub _add_new_exon {
my ($self,$exon, $ignore_strand) = @_;
if (!defined($self->start) || $exon->start < $self->start) {
$self->start($exon->start);
}
if (!defined($self->end) || $exon->end > $self->end) {
$self->end($exon->end);
}
if (!$ignore_strand) {
if (!defined($self->strand)) {
$self->strand($exon->strand);
} elsif ($self->strand != $exon->strand) {
# confess("Trying to add exon with strand ". $exon->strand . " to cluster with strand " . $self->strand);
carp("Trying to add exon with strand ". $exon->strand . " to cluster with strand " . $self->strand);
}
}
$self->{_exonhash}{"$exon"} = $self->{_internal_index}++;
#$self->{_exonidhash}{$exon->stable_id} = $exon;
if (exists $self->{_exonidhash}{$exon->dbID.$exon->adaptor->db->dbc->dbname}) {
$self->throw("Error : there seem to be exons with the same dbID and dbname in the databases ".$exon->dbID." ".$exon->adaptor->db->dbname) ;
}
$self->{_exonidhash}{$exon->dbID.$exon->adaptor->db->dbc->dbname} = $exon;
$self->{_all_exons_in_cluster}{$exon}=$exon ;
}
=head 2
Name : _add_transcript_reference($exon, $transcript)
Arg[0] : Bio::EnsEMBL::Exon
Arg[0] : Bio::EnsEMBL::Transcript
Function : called by add_exon , buids relation href{$transcript}=\@exons
=cut
sub _add_transcript_reference {
my ($self,$exon,$transcript) = @_;
# if there's not already a reference to transcript stored make an arrayref
if (!$self->contains_transcript($transcript)) {
$self->{_transcripthash}{"$transcript"} = [];
}
# store exons of transcript (key: transcript)
push @{$self->{_transcripthash}{"$transcript"}}, $exon;
$self->{_transcriptrefhash}{"$transcript"} = $transcript;
}
=head2
Name transcript_from_ref
Arg : String "Bio::EnsEMBL::Transcript(HASHXXXXXX)" used as key in _transcriptrefhash
Function : Returns for a given STRING the reference to a Bio::EnsEMBL::Transcript-Object
Returnval: Bio::EnsEMBL::Transcript
=cut
sub transcript_from_ref {
my ($self,$ref) = @_;
return $self->{_transcriptrefhash}{$ref}
}
sub contains_exon {
my ($self,$exon) = @_;
return (exists $self->{_exonhash}{"$exon"});
}
# this is a variation on the method above. contains_exon looks in the
# _exonhash which keys on the memory location of the Exon objects. This means
# that, when we fetch the same exon from the same db at two different times,
# the _exonhash will not recognise that they are the same because their location
# in memory is different. contains_exon only works when the Exon object has not
# gone out of scope.
sub contains_exon_with_dbid_and_dbname {
my ($self,$exon) = @_;
my $exon_duplicate = 0;
foreach my $ex (keys %{$self->{_exonidhash}}) {
if ($ex eq "".$exon->dbID.$exon->adaptor->db->dbc->dbname) {
$exon_duplicate = 1;
}
}
return $exon_duplicate;
}
=head2
Name : get_all_Exons_in_ExonCluster ()
Arg[0] : Bio::EnsEMBL::Analysis::Runnable::Condense_EST::ExonCluster
Function : returns an Array of all Exons (all types) which belong to an ExonCluster
if Arg[1] is supplied the list of returned Bio::EnsEMBL::Exon Objects will
be filtered and only exons >= the supplied rank will be returned
Returnval : Arrayref to Array of Bio::EnsEMBL::Exon Objects
=cut
sub get_all_Exons_in_ExonCluster{
my ($self ) = @_ ;
my @all_exons_in_cluster = values %{$self->{_all_exons_in_cluster}} ;
return \@all_exons_in_cluster ;
}
=head2
Name : get_all_Exons_of_EvidenceSet
Arg : String describing Name of evidence_set
Function : returns an Array of all Exons (all types) which belong to an ExonCluster
and belong to the specified Evidence_set.
be filtered and only exons >= the supplied rank will be returned
Returnval : Arrayref to Array of Bio::EnsEMBL::Exon Objects
=cut
sub get_all_Exons_of_EvidenceSet{
my ($self, $setname) = @_ ;
my @result ;
for my $e (@{ $self->get_all_Exons_in_ExonCluster }) {
if ($e->ev_set =~m/$setname/){
push @result, $e ;
}
}
@result = sort { $b->seq_region_length <=> $a->seq_region_length} @result ;
return \@result ;
}
sub get_all_Exons {
my $self =shift;
# return array of exons in cluster stored by their id
return values %{$self->{_exonidhash}};
}
sub contains_transcript {
my ($self,$transcript) = @_;
return (exists $self->{_transcripthash}{"$transcript"});
}
# returns transcsripts which exactly hit the exon
sub transcripts_containing_exon {
my ($self,$exon) = @_;
my @transcripts;
my %transhash = $self->each_transcripts_exons;
TRANS:
foreach my $trans (keys %transhash) {
foreach my $e (@{$transhash{$trans}}) {
if ($e == $exon) {
push @transcripts, $self->transcript_from_ref($trans);
next TRANS;
}
}
}
return @transcripts;
}
sub get_transcripts_having_this_Exon_in_ExonCluster {
my ($self,$exon) = @_;
# self = exoncluster, exon = exon
my @transcripts;
my %transhash = $self->each_transcripts_exons;
#print "\n\nEC-start: ".$self->start."\tEC-end: " .$self->end . "\n" ;
TRANS:foreach my $trans (keys %transhash) {
#print "New Transcript:\n";
unless ( $trans =~m/PredictionTranscript/ ) {
# we only processs "real" transcripts, no predictionTranscripts from ab-initio
foreach my $ex_to_test (@{$transhash{$trans}}) {
#print " testing exon: start: " . $ex_to_test->start . "\t" . $ex_to_test->end ."\n";
if($ex_to_test->stable_id eq $exon->stable_id){
#print " pushing \t".$self->transcript_from_ref($trans)->dbID."\n";
push @transcripts, $self->transcript_from_ref($trans);
next TRANS;
}
}
}
}
return @transcripts;
}
sub get_prediction_transcripts_which_have_exon_in_ExonCluster {
my ($self,$exon) = @_;
# self = exoncluster, exon = exon
my @transcripts;
my %transhash = $self->each_transcripts_exons;
#print "\n\nEC-start: ".$self->start."\tEC-end: " .$self->end . "\n" ;
TRANS:foreach my $trans (keys %transhash) {
#print "New Transcript:\n";
if ( $trans=~m/PredictionTranscript/) {
# we only processs "real" transcripts, no predictionTranscripts from ab-initio
foreach my $ex_to_test (@{$transhash{$trans}}) {
#print " testing exon: start: " . $ex_to_test->start . "\t" . $ex_to_test->end ."\n";
if($ex_to_test->start >= $self->start && $ex_to_test->end <= $self->end){
#print " pushing \t".$self->transcript_from_ref($trans)->dbID."\n";
push @transcripts, $self->transcript_from_ref($trans);
next TRANS;
}
}
}
}
return @transcripts;
}
sub check_if_ExonCluster_has_est_evidence {
my ( $self,$ev_sets ) = @_ ;
my @est_bioytpes = @{ $$ev_sets{'est'} };
my %est_bt ;
@est_bt{@est_bioytpes} = () ;
my %transhash = $self->each_transcripts_exons;
my $cluster_has_real_evidence = 0 ;
my @rank_of_trans ;
TRANS: for my $trans_refname (keys %transhash) {
my $trans = $self->transcript_from_ref( $trans_refname );
my $trans_biotype = $trans->biotype() ;
if (exists $est_bt{$trans->biotype}) {
#print "ec holds exon which belongs to ev_set \'est\' ".$trans->biotype."\n" ;
return 1 ;
}
}
# print "ec holds no gene which belong to ev-set \'est\' \n" ;
return 0 ;
}
# returns array of all transcripts which share the same exon
#
sub unique_exon_combinations {
my $self = shift;
my $ignore_strand = shift;
my %unique_combs;
my %transhash = $self->each_transcripts_exons;
foreach my $trans (keys %transhash) {
my $keystr;
if (!$ignore_strand) {
@{$transhash{$trans}} = sort _by_stranded_start @{$transhash{$trans}};
} else {
@{$transhash{$trans}} = sort _sort_by_forward_start @{$transhash{$trans}};
}
foreach my $exon (@{$transhash{$trans}}) {
$keystr .= ":" . $self->{_exonhash}{$exon};
}
if (! exists($unique_combs{$keystr})) {
$unique_combs{$keystr}{exons} = [@{$transhash{$trans}}];
}
push @{$unique_combs{$keystr}{transcripts}}, $self->transcript_from_ref($trans);
}
return values %unique_combs;
}
sub each_transcripts_exons {
my $self = shift;
return %{$self->{_transcripthash}};
}
sub _determine_type {
my ($self, $ignore_strand) = @_;
my @combs = $self->unique_exon_combinations($ignore_strand);
if (scalar(@combs) == 1) {
$self->{'_type'} = 0;
return 0;
}
my $maxexoncount = 0;
foreach my $comb (@combs) {
if (scalar(@{$comb->{exons}} > $maxexoncount)) {
$maxexoncount = scalar(@{$comb->{exons}});
}
}
# All clusters have one exon which all have the same coordinates
# In this case the difference should be in the phase or the
# translation start position - if there's no difference in these then
# this is an exon duplication.
if ($maxexoncount == 1) {
my $failed = 0;
my $first_start;
my $first_end;
my $first_phase;
my $subtype;
foreach my $comb (@combs) {
if (!defined($first_start)) {
$first_start = $comb->{exons}[0]->start;
$first_end = $comb->{exons}[0]->end;
$first_phase = $comb->{exons}[0]->phase;
}
if ($comb->{exons}[0]->start != $first_start ||
$comb->{exons}[0]->end != $first_end) {
$failed = 1;
last;
} else {
if ($comb->{exons}[0]->phase != $first_phase) {
$subtype .= "p";
} else {
my $countterm = 0;
foreach my $trans (@{$comb->{transcripts}}) {
if ($trans->translation->start_Exon == $comb->{exons}[0] ||
$trans->translation->end_Exon == $comb->{exons}[0]) {
$countterm++;
}
}
if ($countterm == scalar(@{$comb->{transcripts}})) {
$subtype .= "t";
} elsif ($countterm) {
print "WARNING: Exon which isn't always the terminal exon in a translation\n";
}
}
}
}
if (!$failed) {
#print "Type 1 $subtype ";
#print_clust_summary(@combs);
$self->{'_type'} = 1;
return 1;
}
}
# One or more clusters have ONLY one exon in all its transcripts
# Other clusters have longer exon and can have surrounding exons
if ($maxexoncount == 1) {
# Foreach comb determine exon extents and number of exons in
# transcripts
my @allsingle;
my @notallsingle;
foreach my $comb (@combs) {
my $nsingle = 0;
foreach my $trans (@{$comb->{transcripts}}) {
if (scalar(@{$trans->get_all_Exons}) == 1) {
$nsingle++;
}
}
if ($nsingle == scalar(@{$comb->{transcripts}})) {
push @allsingle,$comb;
} else {
push @notallsingle,$comb;
}
}
my $failed = 0;
if (@allsingle) {
my $notsingle_start = undef;
my $notsingle_end = undef;
SINGLE:
foreach my $comb (@allsingle) {
foreach my $comp_comb (@notallsingle) {
if ($comb->{exons}[0]->start < $comp_comb->{exons}[0]->start ||
$comb->{exons}[0]->end > $comp_comb->{exons}[0]->end) {
$failed = 1;
last SINGLE;
} elsif (!defined($notsingle_start)) {
$notsingle_start = $comp_comb->{exons}[0]->start;
$notsingle_end = $comp_comb->{exons}[0]->end;
} elsif ($notsingle_start != $comp_comb->{exons}[0]->start ||
$notsingle_end != $comp_comb->{exons}[0]->end) {
$failed = 1;
last SINGLE;
}
}
}
} else {
$failed = 1;
}
if (!$failed) {
#print "Type 2 ";
#print_clust_summary(@combs);
$self->{'_type'} = 2;
return 2;
}
}
# One cluster has a single exon which is the terminal exon in all its
# transcripts
# Other clusters has single (possibly) longer exon which has splice side
# boundary conserved. It may have extra exons to non spliced side
if ($maxexoncount == 1) {
if ($ignore_strand) {
throw("This method is not designed to handle cases where we ignore strand");
}
my @allterminal;
my @notallterminal;
my $failed = 0;
my $first_end = undef;
COMB:
foreach my $comb (@combs) {
my $nterminal = 0;
my $end = undef;
foreach my $trans (@{$comb->{transcripts}}) {
if (scalar(@{$trans->get_all_Exons}) == 1) {
$nterminal++;
print "Single exon transcript\n";
} elsif ($comb->{exons}[0]->strand == 1) {
if ($comb->{exons}[0] == $trans->start_Exon &&
(!defined($end) || ($end == 0))) {
$end = 0;
if (!defined($first_end)) {
$first_end = $end;
}
$nterminal++;
} elsif ($comb->{exons}[0] == $trans->end_Exon &&
(!defined($end) || $end == 1)) {
$end = 1;
if (!defined($first_end)) {
$first_end = $end;
}
$nterminal++;
} else {
print "Failed forward strand condition\n";
}
} else {
if ($comb->{exons}[0] == $trans->start_Exon &&
(!defined($end) || ($end == 1))) {
$end = 1;
if (!defined($first_end)) {
$first_end = $end;
}
$nterminal++;
} elsif ($comb->{exons}[0] == $trans->end_Exon &&
(!defined($end) || $end == 0)) {
$end = 0;
if (!defined($first_end)) {
$first_end = $end;
}
$nterminal++;
} else {
print "Failed reverse strand condition\n";
}
}
if (defined($end) && $end != $first_end) {
$failed = 1;
last COMB;
}
}
if ($nterminal == scalar(@{$comb->{transcripts}})) {
push @allterminal,$comb;
} else {
push @notallterminal,$comb;
}
}
if (!defined($first_end)) {
$failed = 1;
}
if (!$failed) {
if (scalar(@allterminal)) {
# first find (possibly) conserved end and max extent of
# non conserved end
my $conspos;
if ($first_end == 0) {
$conspos = $allterminal[0]->{exons}[0]->end;
} else {
$conspos = $allterminal[0]->{exons}[0]->start;
}
my $nonconspos = undef;
foreach my $comb (@allterminal) {
if ($first_end == 0) {
if (!defined($nonconspos) || $comb->{exons}[0]->start < $nonconspos) {
$nonconspos = $comb->{exons}[0]->start;
}
} else {
if (!defined($nonconspos) || $comb->{exons}[0]->end > $nonconspos) {
$nonconspos = $comb->{exons}[0]->end;
}
}
}
# now check that non of the combs with extra exons on the non conserved side
#
TERMINAL:
foreach my $comb (@combs) {
if ($first_end == 0 && $comb->{exons}[0]->end != $conspos ) {
$failed = 1;
last TERMINAL;
} elsif ($first_end == 1 && $comb->{exons}[0]->start != $conspos) {
$failed = 1;
last TERMINAL;
}
}
} else {
$failed = 1;
}
}
if (!$failed) {
#print "Type 3 ";
#print_clust_summary(@combs);
$self->{'_type'} = 3;
return 3;
}
}
# All have a single exon but do not match any of above cases
if ($maxexoncount == 1) {
#print "Type 4 ";
#print_clust_summary(@combs);
$self->{'_type'} = 4;
return 4;
} else {
# Clusters with multiple exons
#Ones which have very short introns (probably frameshift introns)
foreach my $comb (@combs) {
if (scalar(@{$comb->{exons}}) > 1) {
my $prev_exon = undef;
my @exons = sort {$a->start <=> $b->start} @{$comb->{exons}};
foreach my $exon (@exons) {
if (defined($prev_exon)) {
if ($exon->start - $prev_exon->end > 8) {
#print "Type 6 (has multiple exon clusters) ";
#print_clust_summary(@combs);
$self->{'_type'} = 6;
return 6;
}
}
$prev_exon = $exon;
}
}
}
#print "Type 5 (has multiple frameshift exons) ";
#print_clust_summary(@combs);
$self->{'_type'} = 5;
return 5;
}
}
sub print_clust_summary {
my (@combs) = @_;
foreach my $comb (@combs) {
print ": ";
foreach my $exon (@{$comb->{exons}}) {
print $exon->dbID. " ";
}
print ": ";
}
print "\n";
}
sub _sort_by_forward_start {
my $alow;
my $blow;
my $ahigh;
my $bhigh;
# we ignore strand
$alow = $a->start;
$ahigh = $a->end;
$blow = $b->start;
$bhigh = $b->end;
if ($alow != $blow) {
return $alow <=> $blow;
} else {
return $ahigh <=> $bhigh;
}
}
sub _by_stranded_start {
my $alow;
my $blow;
my $ahigh;
my $bhigh;
if ($a->strand != $b->strand) {
confess("Mixed strand in sort comparison routine - Bad!");
}
if ($a->strand == 1) {
$alow = $a->start;
$ahigh = $a->end;
} else {
$blow = $a->start;
$bhigh = $a->end;
}
if ($b->strand == 1) {
$blow = $b->start;
$bhigh = $b->end;
} else {
$alow = $b->start;
$ahigh = $b->end;
}
if ($a->strand == 1) {
if ($alow != $blow) {
return $alow <=> $blow;
} else {
return $ahigh <=> $bhigh;
}
} else {
if ($ahigh != $bhigh) {
return $ahigh <=> $bhigh;
} else {
return $alow <=> $blow;
}
}
}
sub _add_exon_biotype{
my ( $self, $exon, $transcript ) = @_ ;
unless (exists $self->{_exon_2_biotype}{$exon}) {
$self->{_exon_2_biotype}{$exon}=$transcript->biotype ;
} else {
my $old_bt = $self->{_exon_2_biotype}{$exon};
my $new_bt = $transcript->biotype ;
if ($old_bt eq $new_bt) {
$self->{_exon_2_biotype}{$exon}=$transcript->biotype ;
} else {
$self->{_exon_2_biotype}{$exon}=$transcript->biotype ;
warning("Changing biotype of exon : $old_bt changed to $new_bt\n" ) ; # jhv
}
}
}
# could also do this with my $ex = ${$ec->get_all_Exons_in_ExonCluster}{$exon}}
# print $ex->biotype() ;
#
sub get_biotype_of_Exon{
my ($self,$exon) = @_ ;
return $self->{_exon_2_biotype}{$exon};
}
1;