Bio::EnsEMBL::Analysis::Runnable
GeneBuilder
Toolbar
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
No synopsis!
Description
No description!
Methods
bin_sort_transcripts | No description | Code |
blessed_biotypes | No description | Code |
check_Clusters | No description | Code |
cluster_Transcripts | No description | Code |
cluster_Transcripts_by_genomic_range | No description | Code |
cluster_into_Genes | No description | Code |
get_Transcripts | No description | Code |
input_genes | No description | Code |
make_shared_exons_unique | No description | Code |
max_short_intron_len | No description | Code |
max_transcript_number | No description | Code |
min_short_intron_len | No description | Code |
new | No description | Code |
output_biotype | No description | Code |
prune_Exons | No description | Code |
prune_Transcripts | No description | Code |
prune_redundant_CDS | No description | Code |
prune_redundant_transcripts | No description | Code |
remove_transcript_from_gene | No description | Code |
run | No description | Code |
select_best_transcripts | No description | Code |
Methods description
None available.
Methods code
bin_sort_transcripts | description | prev | next | Top |
sub bin_sort_transcripts
{
my ($self,$transcripts) = @_;
my %lengths;
my $max_num_exons = 0;
my %sizehash;
my %orfhash;
my %tran2orf;
my %tran2length;
my %exon2transcript;
foreach my $tran (@$transcripts) {
my @exons = @{ $tran->get_all_Exons };
if(scalar(@exons) > $max_num_exons){
$max_num_exons = scalar(@exons);
}
my $length = 0;
foreach my $e ( @exons ){
$length += $e->end - $e->start + 1;
push ( @{ $exon2transcript{ $e } }, $tran );
}
$sizehash{$tran} = $length;
$tran2length{ $tran } = $length;
$length = 0;
foreach my $e(@{$tran->get_all_translateable_Exons}){
$length += $e->end - $e->start + 1;
}
$tran2orf{ $tran } = $length;
push(@{$orfhash{$length}}, $tran);
}
my @transcripts = ();
my @orflengths = sort {$b <=> $a} (keys %orfhash);
my %orflength_bin;
my $numbins = 4;
my $currbin = 1;
foreach my $orflength(@orflengths){
last if $currbin > $numbins;
my $percid = ($orflength*100)/$orflengths[0]; if ($percid > 100) {
$percid = 100;
}
my $currthreshold = $currbin * (100/$numbins); $currthreshold = 100 - $currthreshold;
if($percid <$currthreshold) {
$currbin++;
}
my @tmp = @{$orfhash{$orflength}};
push(@{$orflength_bin{$currbin}}, @{$orfhash{$orflength}});
}
$currbin = 1;
EXONLENGTH_SORT:
while( $currbin <= $numbins){
if(!defined $orflength_bin{$currbin} ){
$currbin++;
next EXONLENGTH_SORT;
}
my @sorted_transcripts = sort { $sizehash{$b} <=> $sizehash{$a} }
@{$orflength_bin{$currbin}};
push(@transcripts, @sorted_transcripts);
$currbin++;
}
return\@ transcripts; } |
sub blessed_biotypes
{ my ($self, $arg) = @_;
if($arg){
$self->{'blessed_biotypes'} = $arg;
}
return $self->{'blessed_biotypes'}; } |
sub check_Clusters
{ my ($self, $num_transcripts, $clusters) = @_;
my $ntrans = 0;
my $cluster_num = 0;
my %trans_check_hash;
foreach my $cluster (@$clusters) {
$ntrans += scalar(@$cluster);
foreach my $trans (@$cluster) {
if (defined($trans_check_hash{$trans})) {
throw("Transcript " . $trans->dbID ." ".$trans->adaptor->dbc->dbname.
" added twice to clusters\n");
}
$trans_check_hash{$trans} = 1;
}
if (!scalar(@$cluster)) {
throw("Empty cluster");
}
}
if ($ntrans != $num_transcripts) {
throw("Not all transcripts have been added into clusters $ntrans and " . $num_transcripts. "\n ");
}
return; } |
sub cluster_Transcripts
{ my ($self, $transcripts) = @_;
my @forward;
my @reverse;
foreach my $transcript(@{$transcripts}){
if($transcript->strand == 1){
push(@forward, $transcript);
}else{
push(@reverse, $transcript);
}
}
my $forward_clusters = $self->cluster_Transcripts_by_genomic_range(\@forward);
my $reverse_clusters = $self->cluster_Transcripts_by_genomic_range(\@reverse);
my @clusters;
push(@clusters, @{$forward_clusters}) if($forward_clusters);
push(@clusters, @{$reverse_clusters}) if($reverse_clusters);
return\@ clusters; } |
cluster_Transcripts_by_genomic_range | description | prev | next | Top |
sub cluster_Transcripts_by_genomic_range
{ my ($self, $transcripts) = @_;
if(!$transcripts || @$transcripts == 0){
return undef;
}
my @transcripts = sort { $a->start <=> $b->start ? $a->start <=> $b->start : $b->end <=> $a->end } @$transcripts;
my @clusters;
my $cluster_count = 0;
my @cluster_starts;
my @cluster_ends;
my $cluster = Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new();
$cluster->put_Transcripts(0, $transcripts[0]);
$cluster_starts[$cluster_count] = $transcripts[0]->start;
$cluster_ends[$cluster_count] = $transcripts[0]->end;
push(@clusters, $cluster);
for (my $c=1; $c<=$#transcripts; $c++){
if ( !( $transcripts[$c]->end < $cluster_starts[$cluster_count] ||
$transcripts[$c]->start > $cluster_ends[$cluster_count] ) ){
$cluster->put_Transcripts(0, $transcripts[$c] );
if ($transcripts[$c]->start < $cluster_starts[$cluster_count]) {
$cluster_starts[$cluster_count] = $transcripts[$c]->start;
}
if ( $transcripts[$c]->end > $cluster_ends[$cluster_count]) {
$cluster_ends[$cluster_count] = $transcripts[$c]->end;
}
}else{
$cluster_count++;
$cluster = Bio::EnsEMBL::Analysis::Tools::Algorithms::TranscriptCluster->new();
$cluster->put_Transcripts(0, $transcripts[$c] );
$cluster_starts[$cluster_count] = $transcripts[$c]->start;
$cluster_ends[$cluster_count] = $transcripts[$c]->end;
push(@clusters,$cluster);
}
}
return\@ clusters; } |
sub cluster_into_Genes
{ my ($self, $transcripts_unsorted) = @_;
my $num_trans = scalar(@$transcripts_unsorted);
my @transcripts = sort { $a->start <=> $b->start ? $a->start <=> $b->start : $b->end <=> $a->end } @$transcripts_unsorted;
my @clusters;
foreach my $tran (@transcripts) {
my @matching_clusters;
CLUSTER:
foreach my $cluster (@clusters) {
foreach my $cluster_transcript (@$cluster) {
if ($tran->end >= $cluster_transcript->start &&
$tran->start <= $cluster_transcript->end) {
foreach my $exon1 (@{$tran->get_all_Exons}) {
foreach my $cluster_exon (@{$cluster_transcript->get_all_Exons}) {
if ($exon1->overlaps($cluster_exon)
&& $exon1->strand == $cluster_exon->strand) {
push (@matching_clusters, $cluster);
next CLUSTER;
}
}
}
}
}
}
if (scalar(@matching_clusters) == 0) {
my @newcluster;
push(@newcluster,$tran);
push(@clusters,\@newcluster);
} elsif (scalar(@matching_clusters) == 1) {
push @{$matching_clusters[0]}, $tran;
} else {
my @new_clusters;
my @merged_cluster;
foreach my $clust (@matching_clusters) {
push @merged_cluster, @$clust;
}
push @merged_cluster, $tran;
push @new_clusters,\@merged_cluster;
foreach my $clust (@clusters) {
my $found = 0;
MATCHING:
foreach my $m_clust (@matching_clusters) {
if ($clust == $m_clust) {
$found = 1;
last MATCHING;
}
}
if (!$found) {
push @new_clusters,$clust;
}
}
@clusters = @new_clusters;
}
}
$self->check_Clusters(scalar(@transcripts),\@ clusters);
my @genes;
foreach my $cluster(@clusters){
my $count = 0;
my $gene = new Bio::EnsEMBL::Gene;
$gene->biotype($self->output_biotype);
foreach my $transcript (@$cluster){
$gene->add_Transcript($transcript);
}
push( @genes, $gene );
}
return\@ genes; } |
sub get_Transcripts
{ my ($self, $transcripts) = @_;
if($transcripts){
$self->{'transcripts'} = $transcripts;
}
if(!$self->{'transcripts'}){
foreach my $gene(@{$self->input_genes}){
foreach my $transcript(@{$gene->get_all_Transcripts}){
push(@{$self->{'transcripts'}}, $transcript);
}
}
}
return $self->{'transcripts'}; } |
sub input_genes
{ my ($self, $arg) = @_;
if($arg){
$self->{'input_genes'} = $arg;
}
return $self->{'input_genes'}; } |
sub make_shared_exons_unique
{ my ($self, $genes) = @_;
my @pruned;
foreach my $gene(@$genes){
my $new_gene = $self->prune_Exons($gene);
push(@pruned, $new_gene);
}
return\@ pruned; } |
sub max_short_intron_len
{ my ($self, $arg) = @_;
if(defined $arg){
$self->{'max_short_intron'} = $arg;
}
return $self->{'max_short_intron'}; } |
sub max_transcript_number
{ my ($self, $arg) = @_;
if(defined $arg){
$self->{'max_transcript_number'} = $arg;
}
return $self->{'max_transcript_number'}; } |
sub min_short_intron_len
{ my ($self, $arg) = @_;
if(defined $arg){
$self->{'min_short_intron'} = $arg;
}
return $self->{'min_short_intron'}; } |
sub new
{ my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my($genes, $blessed_biotypes, $max_transcript_number,
$min_short_intron_len, $max_short_intron_len,$output_biotype) =
rearrange([qw(GENES BLESSED_BIOTYPES MAX_TRANSCRIPT_PER_CLUSTER
MIN_SHORT_INTRON_LEN MAX_SHORT_INTRON_LEN OUTPUT_BIOTYPE)], @args);
print "HERE ARE THE ARGS: ", join(" ",@args),"\n";
print "HERE I AM: ", $min_short_intron_len,"\t", $max_short_intron_len,"\n";
$self->max_transcript_number(10);
$self->output_biotype($self->analysis->logic_name);
$self->input_genes($genes);
$self->blessed_biotypes($blessed_biotypes);
$self->max_transcript_number($max_transcript_number);
$self->min_short_intron_len($min_short_intron_len);
$self->max_short_intron_len($max_short_intron_len);
$self->output_biotype($output_biotype);
warning("Strange running the Genebuilder without any genes")
if(!$genes || scalar(@$genes) == 0);
return $self; } |
sub output_biotype
{ my ($self, $arg) = @_;
if($arg){
$self->{'output_biotype'} = $arg;
}
return $self->{'output_biotype'}; } |
sub prune_Exons
{ my ($self, $gene) = @_;
my @unique_Exons;
foreach my $tran (@{$gene->get_all_Transcripts}) {
my @newexons;
foreach my $exon (@{$tran->get_all_Exons}) {
my $found;
UNI:foreach my $uni (@unique_Exons) {
if ($uni->start == $exon->start &&
$uni->end == $exon->end &&
$uni->strand == $exon->strand &&
$uni->phase == $exon->phase &&
$uni->end_phase == $exon->end_phase
) {
$found = $uni;
last UNI;
}
}
if (defined($found)) {
push(@newexons,$found);
if ($exon == $tran->translation->start_Exon){
$tran->translation->start_Exon($found);
}
if ($exon == $tran->translation->end_Exon){
$tran->translation->end_Exon($found);
}
} else {
push(@newexons,$exon);
push(@unique_Exons, $exon);
}
}
$tran->flush_Exons;
foreach my $exon (@newexons) {
$tran->add_Exon($exon);
}
}
return $gene;
}
1; } |
sub prune_Transcripts
{ my ($self, $transcript_clusters) = @_;
my @newtran;
my %blessed_genetypes = %{$self->blessed_biotypes};;
my $cluster_count = 0;
CLUSTER:
foreach my $transcript_cluster ( @$transcript_clusters ){
$cluster_count++;
my $mytranscripts = $transcript_cluster->get_Transcripts;
my @transcripts = @{$self->bin_sort_transcripts( $mytranscripts )};
my $maxexon_number = 0;
foreach my $t (@transcripts){
if ( scalar(@{$t->get_all_Exons}) > $maxexon_number ){
$maxexon_number = scalar(@{$t->get_all_Exons});
}
}
if ($maxexon_number == 1){
@transcripts = map { $_->[1] } sort { $b->[0]->length <=> $a->[0]->length } map{ [ $_->start_Exon, $_ ] } @transcripts;
my $tran = shift( @transcripts );
push (@newtran, $tran);
my @es = @{$tran->get_all_Exons};
my $e = $es[0];
foreach my $transcript (@transcripts){
if(exists $blessed_genetypes{$transcript->type}){
push(@newtran, $transcript);
}
else{
foreach my $exon ( @{$transcript->get_all_Exons} ){
transfer_supporting_evidence($exon, $e);
}
}
}
next CLUSTER;
}
my %pairhash;
my %exonhash;
my %single_exon_rejects;
TRANSCRIPT:
foreach my $tran (@transcripts) {
my @exons = @{$tran->get_all_Exons};
my $i = 0;
my $found = 1;
my @evidence_pairs;
EXONS:
for ($i = 0; $i < $#exons; $i++) {
my $foundpair = 0;
my $exon1 = $exons[$i];
my $exon2 = $exons[$i+1];
my $intron;
if ($exon1->strand == 1) {
$intron = abs($exon2->start - $exon1->end - 1);
}
else {
$intron = abs($exon1->start - $exon2->end - 1);
}
if ($intron < $self->max_short_intron_len &&
$intron > $self->min_short_intron_len ) {
print STDERR "Intron too short: $intron bp. Transcript will be rejected\n";
$foundpair = 1;
} else {
foreach my $first_exon_id (keys %pairhash) {
my $first_exon = $exonhash{$first_exon_id};
foreach my $second_exon_id (keys %{$pairhash{$first_exon}}) {
my $second_exon = $exonhash{$second_exon_id};
if ( $exon1->overlaps($first_exon) && $exon2->overlaps($second_exon) ) {
$foundpair = 1;
push( @evidence_pairs, [ $exon1 , $first_exon ] );
push( @evidence_pairs, [ $exon2 , $second_exon ] );
transfer_supporting_evidence($exon1, $first_exon);
transfer_supporting_evidence($first_exon, $exon1);
transfer_supporting_evidence($exon2, $second_exon);
transfer_supporting_evidence($second_exon, $exon2);
}
}
}
}
if ($foundpair == 0) {
$found = 0;
$exonhash{$exon1} = $exon1;
$exonhash{$exon2} = $exon2;
$pairhash{$exon1}{$exon2} = 1;
}
}
if(exists $blessed_genetypes{$tran->type}){
push (@newtran, $tran);
}
elsif ($found == 0) {
push(@newtran,$tran);
@evidence_pairs = ();
} elsif ($found == 1 && $#exons == 0){
$single_exon_rejects{$tran} = $tran;
} else {
if ( $tran == $transcripts[0] ){
print STDERR "Strange, this is the first transcript in the cluster!\n";
}
foreach my $pair ( @evidence_pairs ){
my @pair = @$pair;
my $source_exon = $pair[0];
my $target_exon = $pair[1];
transfer_supporting_evidence($source_exon, $target_exon)
}
}
}
foreach my $reject(values %single_exon_rejects){
my @exons = @{$reject->get_all_Exons};
my @sf = @{$exons[0]->get_all_supporting_features};
foreach my $stored_exon(values %exonhash){
if($exons[0]->overlaps($stored_exon)){
transfer_supporting_evidence($exons[0], $stored_exon);
}
}
}
} return\@ newtran; } |
sub prune_redundant_CDS
{ my ( $self, $genes ) = @_;
my $nremoved = 0;
my %blessed_genetypes = %{$self->blessed_biotypes};
foreach my $gene (@$genes) {
my @trans_with_utrs;
my @trans_without_utrs;
foreach my $trans (@{$gene->get_all_Transcripts}) {
$trans->sort;
my @exons = @{$trans->get_all_Exons};
if ($trans->translation) {
if ($trans->translation->start_Exon == $exons[0] &&
$trans->translation->start == 1 &&
$trans->translation->end_Exon == $exons[$#exons] &&
$trans->translation->end == $exons[$#exons]->length) {
push @trans_without_utrs, $trans;
} else {
push @trans_with_utrs, $trans;
}
} else {
$self->warn("No translation for transcript");
}
}
my %cds_exon_hash;
foreach my $trans (@{$gene->get_all_Transcripts}) {
if ($trans->translation) {
my @cds_exons = @{$trans->get_all_translateable_Exons};
$cds_exon_hash{$trans} =\@ cds_exons;
}
}
foreach my $utr_trans (@trans_with_utrs) {
my $utr_trans_cds_exons = $cds_exon_hash{$utr_trans};
CDS_TRANS: foreach my $cds_trans (@trans_without_utrs) {
my $cds_trans_cds_exons = $cds_exon_hash{$cds_trans};
if (scalar(@$cds_trans_cds_exons) != scalar(@$utr_trans_cds_exons)) {
next CDS_TRANS;
}
my @cds_exons = @$cds_trans_cds_exons;
foreach my $utr_trans_exon (@$utr_trans_cds_exons) {
my $cds_trans_exon = shift @cds_exons;
if ($cds_trans_exon->start != $utr_trans_exon->start ||
$cds_trans_exon->end != $utr_trans_exon->end ||
$cds_trans_exon->strand != $utr_trans_exon->strand) {
next CDS_TRANS;
}
}
if ( exists $blessed_genetypes{$cds_trans->biotype} &&
! exists $blessed_genetypes{$utr_trans->biotype}) {
$utr_trans->biotype($cds_trans->biotype);
}
$nremoved++;
$self->remove_transcript_from_gene($gene,$cds_trans);
}
}
my $num_of_utr_trans = scalar(@trans_with_utrs);
} } |
sub prune_redundant_transcripts
{ my ($self,$genes) = @_;
my $nremoved = 0;
my %blessed_genetypes = %{$self->blessed_biotypes};
foreach my $gene (@$genes) {
my @sorted_transcripts = map { $_->[1] } sort { $b->[0] <=> $a->[0] } map { [$_->length, $_] } @{$gene->get_all_Transcripts};
my %sorted_exon_hash;
my %cds_start_hash;
my %cds_end_hash;
foreach my $trans (@sorted_transcripts) {
my @exons = sort {$a->start <=> $b->start} @{$trans->get_all_Exons};
$sorted_exon_hash{$trans} =\@ exons;
if ($trans->translation) {
$cds_start_hash{$trans} = $trans->coding_region_start;
$cds_end_hash{$trans} = $trans->coding_region_end;
}
}
my @trans_to_remove;
for (my $i=0; $i<scalar(@sorted_transcripts); $i++) {
my $trans = $sorted_transcripts[$i];
next if (!defined($trans));
next if (!$trans->translation);
my @exons = @{$sorted_exon_hash{$trans}};
COMPTRANS:
for (my $j=$i+1; $j<scalar(@sorted_transcripts); $j++) {
my $comp_trans = $sorted_transcripts[$j];
next if (!defined($comp_trans));
next if (!$comp_trans->translation);
next if ($trans == $comp_trans);
next if ($cds_start_hash{$trans} !=
$cds_start_hash{$comp_trans});
next if ($cds_end_hash{$trans} != $cds_end_hash{$comp_trans});
my @comp_exons = @{$sorted_exon_hash{$comp_trans}};
if (scalar(@comp_exons) != scalar(@exons)){
next;
}
if (scalar(@exons) == 1) {
if ( exists $blessed_genetypes{$comp_trans->biotype} &&
! exists $blessed_genetypes{$trans->biotype}) {
$trans->biotype($comp_trans->biotype);
}
$sorted_transcripts[$j] = undef;
push @trans_to_remove, $comp_trans;
} else {
for (my $k=0; $k<scalar(@exons) - 1; $k++) {
if ($exons[$k]->end != $comp_exons[$k]->end ||
$exons[$k+1]->start != $comp_exons[$k+1]->start) {
next COMPTRANS;
}
}
if ( exists $blessed_genetypes{$comp_trans->biotype} &&
! exists $blessed_genetypes{$trans->biotype}) {
$trans->biotype($comp_trans->biotype);
}
$sorted_transcripts[$j] = undef;
push @trans_to_remove, $comp_trans;
}
}
}
foreach my $trans (@trans_to_remove) {
$nremoved++;
$self->remove_transcript_from_gene($gene,$trans);
}
} } |
sub remove_transcript_from_gene
{ my ($self, $gene, $trans_to_del) = @_;
my @newtrans;
foreach my $trans (@{$gene->get_all_Transcripts}) {
if ($trans != $trans_to_del) {
push @newtrans,$trans;
}
}
$gene->{_transcript_array} = [];
foreach my $trans (@newtrans) {
$gene->add_Transcript($trans);
}
return scalar(@newtrans); } |
sub run
{ my ($self) = @_;
my $transcripts = $self->get_Transcripts;
if(!$transcripts || @$transcripts == 0){
print "GeneBuilder seems to have no transcripts to cluster\n";
return;
}
my $transcript_clusters = $self->cluster_Transcripts($transcripts);
my $pruned_transcripts = $self->prune_Transcripts($transcript_clusters);
my $initial_genes = $self->cluster_into_Genes($pruned_transcripts);
$self->prune_redundant_CDS($initial_genes);
$self->prune_redundant_transcripts($initial_genes);
my $best_transcripts = $self->select_best_transcripts($initial_genes);
my $final_genes = $self->cluster_into_Genes($best_transcripts);
my $pruned_genes = $self->make_shared_exons_unique($final_genes);
$self->output($pruned_genes);
} |
select_best_transcripts | description | prev | next | Top |
sub select_best_transcripts
{ my ( $self, $genes ) = @_;
my @selected_transcripts;
my %blessed_genetypes = %{$self->blessed_biotypes};
GENE:
foreach my $gene ( @$genes ){
my $sorted_transcripts =
$self->bin_sort_transcripts( $gene->get_all_Transcripts );
my $count = 0;
TRAN:
foreach my $transcript( @$sorted_transcripts ){
$count++;
unless (exists $blessed_genetypes{$transcript->biotype}){
next TRAN if ($count > $self->max_transcript_number);
}
push ( @selected_transcripts, $transcript );
}
}
return\@ selected_transcripts; } |
General documentation
No general documentation available.