ensembl-analysis
ContigGenesChecker
Toolbar
Summary
Bio::EnsEMBL::Test::ContigGenesChecker
Package variables
No package variables defined.
Included modules
Checker
TranscriptCluster
strict
Inherit
Checker
Synopsis
Module to check the validity of a transcript
Description
Performs various checks on the Genes in a Bio::EnsEMBL::Contig object. These
should only be checks which are not internal to a particular Gene but instead
dependant on the arrangement of genes on the Contig. These include:
1. Genes have been clustered correctly
2. Interlocking Genes
3. Genes on both strands with overlapping exons.
4. Transcripts on both strands with all overlapping exons
This class does not use stable_ids but instead dbIDs because stable_ids are
not set until after the gene build.
Methods
_all_exons_overlap | No description | Code |
adaptor | No description | Code |
by_transcript_high | No description | Code |
by_transcript_low | No description | Code |
check | No description | Code |
check_Clustering | No description | Code |
check_for_interlock | No description | Code |
find_Interlocks | No description | Code |
find_StrandOverlap_Transcripts | No description | Code |
find_StrandOverlaps | No description | Code |
find_enclosing_Exons | No description | Code |
genes | No description | Code |
get_cluster_Exons | No description | Code |
get_gene_id | No description | Code |
get_transcript_id | No description | Code |
new | No description | Code |
output | No description | Code |
slice | No description | Code |
Methods description
None available.
Methods code
_all_exons_overlap | description | prev | next | Top |
sub _all_exons_overlap
{ my ($self,$exons1, $exons2, $direction) = @_;
my $nexon2 = scalar(@$exons2);
my $exon2_ind = 0;
EXON1: foreach my $exon1 (@$exons1) {
my $found = 0;
for (; $exon2_ind < $nexon2; $exon2_ind++) {
if ($exon1->overlaps($exons2->[$exon2_ind],'ignore')) {
$found = 1;
next EXON1;
}
}
if (!$found) {
return 0;
}
}
return 1; } |
sub adaptor
{ my ( $self, $arg ) = @_;
if( defined $arg ) {
$self->{_adaptor} = $arg;
}
return $self->{_adaptor}; } |
sub by_transcript_high
{ my $alow;
my $blow;
my $ahigh;
my $bhigh;
$alow = $a->start;
$ahigh = $a->end;
$blow = $b->start;
$bhigh = $b->end;
if ($ahigh != $bhigh) {
return $ahigh <=> $bhigh;
} else {
return $alow <=> $blow;
} } |
sub by_transcript_low
{ my $alow;
my $blow;
my $ahigh;
my $bhigh;
$alow = $a->start;
$ahigh = $a->end;
$blow = $b->start;
$bhigh = $b->end;
if ($alow != $blow) {
return $alow <=> $blow;
} else {
return $bhigh <=> $ahigh;
} } |
sub check
{ my $self = shift;
print "Checking clustering\n";
my $corrected_clusters = $self->check_Clustering();
print "Looking for strand overlaps\n";
$self->find_StrandOverlaps($corrected_clusters);
print "Looking for interlocks\n";
$self->find_Interlocks($corrected_clusters); } |
sub check_Clustering
{ my $self = shift;
my @transcripts_unsorted;
my $genes = $self->{'_genes'};
foreach my $gene (@$genes) {
foreach my $tran (@{$gene->get_all_Transcripts}) {
push(@transcripts_unsorted, $tran);
}
}
my @transcripts = sort by_transcript_high @transcripts_unsorted;
my $ga = $self->adaptor->get_adaptor("Gene");
my @clusters;
foreach my $tran (@transcripts) {
my @matching_clusters;
my ($trans_start, $trans_end) = ($tran->start,$tran->end);
CLUSTER: foreach my $cluster (@clusters) {
if (!($trans_start > $cluster->end ||
$trans_end < $cluster->start)) {
foreach my $cluster_transcript (@{$cluster->get_Transcripts()}) {
foreach my $exon1 (@{$tran->get_all_Exons}) {
foreach my $cluster_exon (@{$cluster_transcript->get_all_Exons}) {
if ($exon1->overlaps($cluster_exon) &&
$exon1->strand == $cluster_exon->strand) {
push (@matching_clusters, $cluster);
next CLUSTER;
}
}
}
}
}
}
if (scalar(@matching_clusters) == 0) {
my $newcluster = new TranscriptCluster;
$newcluster->put_Transcripts($tran);
push(@clusters,$newcluster);
} elsif (scalar(@matching_clusters) == 1) {
$matching_clusters[0]->put_Transcripts($tran);
} else {
my @new_clusters;
my $merged_cluster = new TranscriptCluster;
foreach my $clust (@matching_clusters) {
$merged_cluster->put_Transcripts(@{$clust->get_Transcripts});
}
$merged_cluster->put_Transcripts($tran);
push @new_clusters,$merged_cluster;
foreach my $clust (@clusters) {
my $found = 0;
MATCHING: foreach my $m_clust (@matching_clusters) {
if ($clust == $m_clust) {
$found = 1;
last MATCHING;
}
}
if (!$found) {
push @new_clusters,$clust;
}
}
@clusters = @new_clusters;
}
}
my $ntrans = 0;
my %trans_check_hash;
foreach my $cluster (@clusters) {
my %gene_ids = ();
$ntrans += scalar(@{$cluster->get_Transcripts});
foreach my $trans (@{$cluster->get_Transcripts}) {
if (defined($trans_check_hash{"$trans"})) {
$self->throw("Transcript " . $trans->dbID . " added twice to clusters\n");
}
$trans_check_hash{"$trans"} = 1;
$gene_ids{$ga->fetch_by_transcript_id($trans->dbID)->dbID} = 1;
}
if (!scalar(@{$cluster->get_Transcripts})) {
$self->throw("Empty cluster");
}
if(scalar keys %gene_ids > 1){
print STDERR "Multiple gene cluster: ".(join(", ", keys %gene_ids))."\n";
}
}
if ($ntrans != scalar(@transcripts)) {
$self->throw("Not all transcripts have been added into clusters $ntrans and " . scalar(@transcripts). "\n ");
}
if (scalar(@clusters) < scalar(@$genes)) {
$self->add_Error("Reclustering reduced number of genes from " .
scalar(@$genes) . " to " . scalar(@clusters). "\n");
} elsif (scalar(@clusters) > scalar(@$genes)) {
$self->add_Error("Reclustering increased number of genes from " .
scalar(@$genes) . " to " . scalar(@clusters). "\n");
}
return\@ clusters; } |
sub check_for_interlock
{ my $self = shift;
my $cluster1 = shift;
my $cluster2 = shift;
my $strand_name = shift;
if ($cluster1->overlaps($cluster2)) {
if (($cluster1->start < $cluster2->start && $cluster1->end < $cluster2->end) ||
($cluster1->start > $cluster2->start && $cluster1->end > $cluster2->end)) {
$self->add_Error("Interlocking genes on the $strand_name strand (bounds ".
$cluster1->start . "-" . $cluster1->end . " and ".
$cluster2->start . "-" . $cluster2->end . ")\n");
} else {
if ($cluster1->start > $cluster2->start) {
my $tmp = $cluster1;
$cluster1 = $cluster2;
$cluster2 = $tmp;
}
my @containing_exons_unsort = $self->get_cluster_Exons($cluster1);
my @contained_exons_unsort = $self->get_cluster_Exons($cluster2);
my @containing_exons = sort {$a->start <=> $b->start}
@containing_exons_unsort;
my @contained_exons = sort {$a->start <=> $b->start}
@contained_exons_unsort;
my ($left_exon_first, $right_exon_first) =
$self->find_enclosing_Exons(\@containing_exons, $contained_exons[0]);
my ($left_exon_last, $right_exon_last) =
$self->find_enclosing_Exons(\@containing_exons,
$contained_exons[$#contained_exons]);
if ($left_exon_first != $left_exon_last ||
$right_exon_first != $right_exon_last) {
$self->add_Error("Interlocking enclosed gene on the $strand_name strand (bounds ".
$cluster1->start . "-" . $cluster1->end . " and ".
$cluster2->start . "-" . $cluster2->end . ")\n");
} else {
my $warnstr= "Enclosed gene on the $strand_name strand (bounds ".
$cluster1->start . "-" . $cluster1->end . " and ".
$cluster2->start . "-" . $cluster2->end . ")\n";
$warnstr .= " first cluster contains :";
foreach my $trans (@{$cluster1->get_Transcripts}) {
$warnstr .= " " . get_transcript_id($trans);
}
$warnstr .= "\n";
$warnstr .= " second cluster contains:";
foreach my $trans (@{$cluster2->get_Transcripts}) {
$warnstr .= " " . get_transcript_id($trans);
}
$warnstr .= "\n";
$self->add_Warning($warnstr);
}
}
} } |
sub find_Interlocks
{ my $self = shift;
my $clusters = shift;
my @forward_clusters;
my @reverse_clusters;
foreach my $cluster (@$clusters) {
if ($cluster->strand == 1) {
push (@forward_clusters,$cluster);
} else {
push (@reverse_clusters,$cluster);
}
}
for (my $i = 0; $i < scalar(@forward_clusters)-1; $i++) {
my $cluster1 = $forward_clusters[$i];
for (my $j = $i+1; $j < scalar(@forward_clusters); $j++) {
my $cluster2 = $forward_clusters[$j];
$self->check_for_interlock($cluster1,$cluster2,"forward");
}
}
for (my $i = 0; $i < scalar(@reverse_clusters)-1; $i++) {
my $cluster1 = $reverse_clusters[$i];
for (my $j = $i+1; $j < scalar(@reverse_clusters); $j++) {
my $cluster2 = $reverse_clusters[$j];
$self->check_for_interlock($cluster1,$cluster2,"reverse");
}
} } |
sub find_StrandOverlap_Transcripts
{ my ($self, $clusters) = @_;
my @forward_clusters;
my @reverse_clusters;
foreach my $cluster (@$clusters) {
if ($cluster->strand == 1) {
push (@forward_clusters,$cluster);
} else {
push (@reverse_clusters,$cluster);
}
}
foreach my $f_cluster (@forward_clusters) {
foreach my $r_cluster (@reverse_clusters) {
if ($f_cluster->overlaps($r_cluster,'ignore')) {
foreach my $f_trans (@{$f_cluster->get_Transcripts}) {
foreach my $r_trans (@{$r_cluster->get_Transcripts}) {
my @f_exons = @{$f_trans->get_all_Exons};
my @r_exons = reverse @{$r_trans->get_all_Exons};
if ($self->_all_exons_overlap(\@f_exons,\@r_exons,"forward")) {
$self->add_Error("Transcript with all (". scalar(@f_exons) .
") overlapping exons on other strand for " .
get_transcript_id($f_trans) . " and " . get_transcript_id($r_trans) .
" (forward)\n");
}
if ($self->_all_exons_overlap(\@r_exons,\@f_exons,"reverse")) {
$self->add_Error("Transcript with all (". scalar(@r_exons) .
") overlapping exons on other strand for " .
get_transcript_id($f_trans) . " and " . get_transcript_id($r_trans) .
" (reverse)\n");
}
}
}
}
}
} } |
sub find_StrandOverlaps
{ my ($self, $clusters) = @_;
my @forward_exons;
my @reverse_exons;
my $genes = $self->{'_genes'};
my %exon_gene_map;
foreach my $gene (@$genes) {
my @gene_exons = @{$gene->get_all_Exons};
if (scalar(@gene_exons)) {
my $strand = $gene_exons[0]->strand;
foreach my $exon (@gene_exons) {
if ($strand != $exon->strand) {
$self->add_Error("Gene ". $gene->dbID .
" has exons on different strands\n");
}
if ($exon->strand == 1) {
push (@forward_exons,$exon);
} elsif ($exon->strand == -1) {
push (@reverse_exons,$exon);
} else {
$self->add_Error("Gene ". $gene->dbID .
" has an unstranded exon (" . $exon->dbID . ")\n");
}
if (exists($exon_gene_map{$exon->dbID})) {
$self->add_Error("Seen exon " . $exon->dbID .
" more than once (in gene " . $gene->dbID . " and gene " .
$exon_gene_map{$exon->dbID}->dbID . ")\n");
}
$exon_gene_map{$exon->dbID} = $gene;
}
} else {
$self->add_Error("Gene ". $gene->dbID . " has no exons\n");
}
}
my @sorted_forward_exons = map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$_->start, $_] } @forward_exons;
my @sorted_reverse_exons = map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$_->start, $_] } @reverse_exons;
my $has_overlaps = 0;
my $n_r_exon = scalar(@sorted_reverse_exons);
FEXON: foreach my $f_exon (@sorted_forward_exons) {
for (my $i=0; $i<$n_r_exon; $i++) {
my $r_exon = $sorted_reverse_exons[$i];
next if (!defined($r_exon));
if ($r_exon->end < $f_exon->start) {
$sorted_reverse_exons[$i] = undef;
}
if ($r_exon->overlaps($f_exon)) {
$self->add_Error("Overlapping exons on two strands for exons ".
$f_exon->dbID . " (gene = " . get_gene_id($exon_gene_map{$f_exon->dbID}) .
") and " . $r_exon->dbID . " (gene = " . get_gene_id($exon_gene_map{$r_exon->dbID}) . ")\n");
$has_overlaps = 1;
}
if ($r_exon->start > $f_exon->end) {
next FEXON;
}
}
}
if ($has_overlaps) {
print " doing detail check\n";
$self->find_StrandOverlap_Transcripts($clusters);
} } |
sub find_enclosing_Exons
{ my $self = shift;
my $containing_exons = shift;
my $contained_exon = shift;
my $left_exon;
my $right_exon;
my $left_exon_rank;
my $right_exon_rank;
my $rank = 0;
CEXON: foreach my $exon (@$containing_exons) {
if ($exon->end < $contained_exon->start) {
$left_exon = $exon;
$left_exon_rank = $rank;
} elsif ($exon->start > $contained_exon->end) {
$right_exon = $exon;
$right_exon_rank = $rank;
last CEXON;
}
$rank++;
}
if (!defined($left_exon) || !defined($right_exon)) {
$self->throw("Didn't find enclosing exons - should not be possible\n");
}
if ($right_exon_rank != $left_exon_rank+1) {
$self->throw("Left and right ranks not consecutive - should not be possible\n");
}
return ($left_exon, $right_exon); } |
sub genes
{ my ( $self, $arg ) = @_;
if( defined $arg ) {
$self->{_genes} = $arg;
}
return $self->{_genes}; } |
sub get_cluster_Exons
{ my $self = shift;
my $cluster = shift;
my %h;
foreach my $trans (@{$cluster->get_Transcripts}) {
foreach my $exon ( @{$trans->get_all_Exons} ) {
$h{"$exon"} = $exon;
}
}
return values %h; } |
sub get_gene_id
{ my ($gene) = @_;
if (defined($gene->stable_id)) {
return $gene->stable_id;
} elsif (defined($gene->dbID)) {
return $gene->dbID;
} else {
return "no id";
} } |
sub get_transcript_id
{ my ($trans) = @_;
if (defined($trans->stable_id)) {
return $trans->stable_id;
} elsif (defined($trans->dbID)) {
return $trans->dbID;
} else {
return "no id";
} } |
sub new
{ my ($class, @args) = @_;
my $self = bless {},$class;
my ( $genes, $ignorewarnings, $adaptor,
$slice ) = rearrange
( [ qw ( GENES
IGNOREWARNINGS
ADAPTOR
SLICE
)], @args );
if( !defined $genes ) {
$self->throw("Genes must be set in new for ContigGenesChecker")
}
$self->genes($genes);
if( defined $ignorewarnings) { $self->ignorewarnings($ignorewarnings); }
if( defined $slice) { $self->slice($slice); }
if( defined $adaptor ) { $self->adaptor( $adaptor )}
$self->{_errors} = [];
$self->{_warnings} = [];
return $self; } |
sub output
{ my $self = shift;
$self->SUPER::output; } |
sub slice
{ my $self = shift;
if( @_ ) {
my $value = shift;
if (!($value->isa('Bio::EnsEMBL::Slice'))) {
$self->throw("slice passed a non Bio::EnsEMBL::Slice object\n");
}
$self->{_slice} = $value;
}
return $self->{_slice}; } |
General documentation
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _