Bio::EnsEMBL::Analysis::Runnable
TranscriptConsensus
Toolbar
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
my $runnable = Bio::EnsEMBL::Analysis::Runnable::TranscriptConsensus->new
(
-query => $query,
-analysis => $analysis,
-all_genes => \%biotypes_to_genes , # ref to $hash{biotype_of_gene} = @all_genes_of_this_biotype
-evidence_sets => $evidence_sets ,
-dnadb => $db ,
)
Description
TranscriptConsensus is an extension of TranscriptCoalescer that combines protein coding and est
transcripts to identify the best supported transcript models.
The initial gene sets are clustered, then collapsed into a non-redundant set of exons
and introns which are assigned scores according to the amount of supporting evidence they have.
The similarity genes have UTR added using the est transcripts and the resulting models are assigned
a score by summing the individual exon and intron scores.
The transcripts are sorted by score and the highest scoring models are made into
gene objets and written to the TranscriptCoalescer database.
Methods
Methods description
Name : add_single_exon_genes Arg[0] : Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster Arg[1] : Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster Function : Assigns scores to single exon genes Exceptions : Throws if the exon has not been scored Returnval : Array ref of Bio::EnsEMBL::Gene |
Name : collapse_cluster Arg[0] : Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster Arg[1] : Array ref of Bio::EnsEMBL::Gene objects Function : Cretaes a Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster object and : populates it with exons and introns from the GeneCluster Exceptions : none Returnval : Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster |
Name : compare_translation Arg[0] : Bio::EnsEMBL::Analysis:Tools::GeneBuildUtils::TranscriptExtended Arg[1] : Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster Function : Post processing to penalise transcripts made from fragments of similarity genes : that have been extended with UTR Exceptions : Throws if no ExonCluster is stored w.r.t the exon in question. Returnval : Scalar score |
Name : filter_genes Arg[0] : Array ref of Bio::EnsEMBL::Gene objects Function : Pre-filters the gene set on the basis of number of consensus splice sites or unsuported exons Exceptions : Throws unless ARG[0] is a Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonExtended object Returnval : Array ref of Bio::EnsEMBL::Gene objects |
Name : make_genes Arg[0] : Array ref of Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended Arg[1] : Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster Arg[2] : Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster Function : Takes an array ref of transcript objects and makes them into gene objects. : Deals with translations and orders the genes by score stores a subset : of the highest scoring genes on the output array. Exceptions : Throws if it cannot work out the translation start or end exons Returnval : none |
Name : make_transcripts Arg[0] : Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster Arg[1] : Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster Function : Combines est and similarity trascripts to make coding genes with UTR. : Joins transcripts if they match the internal boundary of the first or last : exon of the similarity gene. Exceptions : none Returnval : Array ref of Bio::EnsEMBL::Transcript |
Name : run Args : none Function : Pre filters genes and clusters them, then collapses each cluster to make non-redundant : sets of exons and introns, builds transcripts and adds UTR and assigns them scores, : makes genes out of the highest scoring transcripts Exceptions : none Returnval : none |
Name : score Arg[0] : Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster Arg[1] : Bio::EnsEMBL::Analysis:Tools::GeneBuildUtils::ExonExtended Arg[2] : Array ref of Bio::EnsEMBL::Gene objects Arg[3] : String 'exon' or 'intron' Function : Assigns scores to introns and exons essentially the number of identical : features divided by the number of overlapping non-identical features. : Additional weights are added for end exons and est exons. Exceptions : none Returnval : Scalar score |
Name : transcript_from_exons Arg[0] : Bio::EnsEMBL::Analysis:Tools::GeneBuildUtils::ExonExtended Arg[1] : (optional) Bio::EnsEMBL::Analysis:Tools::GeneBuildUtils::TranscriptExtended Arg[2] : Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster Function : Makes a transcript object from an array of exons or extends an existing transcript : Assigns the new transcript a score by summing all the intron and exon scores Exceptions : Throws unless ARG[1] is a Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended object : Throws if an exon or intron has not been previously assigned a score Returnval : Bio::EnsEMBL::Analysis:Tools::GeneBuildUtils::TranscriptExtended |
Methods code
sub add_single_exon_genes
{ my ($self,$cluster,$collapsed_cluster) = @_;
my @single_exon_genes;
my @genes = $cluster->get_Genes;
foreach my $gene (@genes){
next unless (scalar(@{$gene->get_all_Exons}) == 1);
my $trans = $gene->get_all_Transcripts->[0];
next unless $trans->ev_set eq 'simgw';
my $exon = $trans->get_all_Exons->[0];
my $score = $collapsed_cluster->exon_score($exon);
$self->throw("Score not found for exon ".$collapsed_cluster->exon_info($exon)."\n")
unless defined($score);
$trans->score($score);
push @single_exon_genes, $self->clone_gene($gene);
}
return\@ single_exon_genes; } |
sub clone_exon_extended
{ my ($self,$exon) = @_;
$self->throw("Feature must be a Bio::EnsEMBL::Analysis:Tools::GeneBuildUtils::ExonExtended not a ".ref($exon)."\n")
unless $exon->isa("Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonExtended");
my $newexon = Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonExtended->new();
foreach my $sf(@{$exon->get_all_supporting_features}){
my $newsf = clone_Evidence($sf);
$newsf->dbID(undef);
$newexon->add_supporting_features($newsf);
}
$newexon->start ($exon->start);
$newexon->end ($exon->end);
$newexon->phase ($exon->phase);
$newexon->end_phase ($exon->end_phase);
$newexon->strand ($exon->strand);
$newexon->slice ($exon->slice);
$newexon->stable_id ($exon->stable_id);
$newexon->analysis ($exon->analysis);
$newexon->biotype ($exon->biotype);
$newexon->transcript ($exon->transcript);
$newexon->ev_set ($exon->ev_set);
$newexon->cluster ($exon->cluster);
return $newexon; } |
sub clone_gene
{ my ($self,$gene) =@_;
my $new_gene = Bio::EnsEMBL::Gene->new();
foreach my $transcript ( @{$gene->get_all_Transcripts} ){
my $new_transcript = $self->clone_transcript_extended( $transcript );
$new_gene->analysis( $new_transcript->analysis );
$new_gene->slice( $new_transcript->slice );
$new_gene->add_Transcript( $new_transcript );
}
$new_gene->biotype( $gene->biotype );
return $new_gene; } |
sub clone_transcript_extended
{ my ($self,$transcript) =@_;
$self->throw("Feature must be a Bio::EnsEMBL::Analysis:Tools::GeneBuildUtils::TranscriptExtended not a ".ref($transcript)."\n")
unless $transcript->isa("Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended");
my $newtranscript = Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended->new();
foreach my $exon ( @{$transcript->get_all_Exons}){
my $new_exon = $self->clone_exon_extended($exon);
$newtranscript->add_Exon($new_exon);
}
foreach my $sf(@{$transcript->get_all_supporting_features}){
my $newsf = clone_Evidence($sf);
$newsf->dbID(undef);
$newtranscript->add_supporting_features($newsf);
}
my $attribs = $transcript->get_all_Attributes();
if ( $transcript->translation ) {
$newtranscript->translation( clone_Translation( $transcript,$newtranscript ));
}
$newtranscript->add_Attributes(@$attribs);
$newtranscript->slice($transcript->slice);
$newtranscript->biotype($transcript->biotype);
$newtranscript->score($transcript->score);
$newtranscript->ev_set($transcript->ev_set);
$newtranscript->similarity($transcript->similarity);
$newtranscript->analysis($transcript->analysis);
return $newtranscript; } |
sub collapse_cluster
{ my ($self,$cluster,$genes_by_strand) = @_;
my @genes = $cluster->get_Genes;
my @exon_clusters = @{$cluster->get_exon_clustering_from_gene_cluster};
my $collapsed_cluster = Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster->new();
foreach my $gene (@genes){
foreach my $trans (@{$gene->get_all_Transcripts}){
my @exons = sort {$a->start <=> $b->start } @{$trans->get_all_Exons};
for (my $i =0 ; $i < scalar(@exons) ; $i++){
my $exon_cluster;
my $exon = $exons[$i];
unless ($collapsed_cluster->contains_exon($exon)){
EC: foreach my $ec (@exon_clusters){
if ($ec->contains_exon($exon)){
$exon_cluster = $ec;
last EC;
}
}
}
$collapsed_cluster->add_exon($exon,$exon_cluster);
next if $i == 0;
my $intron = $collapsed_cluster->make_intron_from_exons($exons[$i-1],$exons[$i]);
$intron->ev_set($exon->ev_set);
$intron->biotype('intron');
$collapsed_cluster->add_intron($intron);
}
}
}
foreach my $exon (@{$collapsed_cluster->get_all_exons}){
my $score = $self->score($collapsed_cluster,$exon,\@genes,'exon');
$collapsed_cluster->exon_score($exon,$score);
}
foreach my $intron (@{$collapsed_cluster->get_all_introns}){
my $score = $self->score($collapsed_cluster,$intron,$genes_by_strand->{$intron->strand},'intron');
$collapsed_cluster->intron_score($intron,$score);
}
return $collapsed_cluster; } |
sub compare_translation
{ my ($self,$transcript,$collapsed_cluster) = @_;
my $score = $transcript->score;
EXON: foreach my $exon (@{$transcript->get_all_Exons}){
if ($exon->end > $transcript->coding_region_start &&
$exon->start < $transcript->coding_region_end ){
next;
}
my $exon_cluster = $collapsed_cluster->get_exon_cluster($exon);
$self->throw("Exon cluster not found for exon ".$collapsed_cluster->exon_info($exon)."\n")
unless ($exon_cluster);
my $coding_exons = 0;
foreach my $overlapping_exon (@{$exon_cluster->get_all_Exons_of_EvidenceSet('simgw')}){
$coding_exons++;
}
if ($coding_exons > 0){
$collapsed_cluster->exon_info($exon) if $VERBOSE;
print " overlaps $coding_exons coding exons penalty -.5\n" if $VERBOSE;
$score -= 0.5;
}
}
return $score; } |
sub filter_genes
{ my ( $self, $genes) = @_;
my @genes_to_filter;
my %singletons;
my %non_con;
my @filtered_genes;
my %delete;
my %exon_hash;
my $gene_count = 0;
for ( my $g = 0 ; $g < scalar(@$genes) ; $g++ ){
unless ($FILTER_ESTS){
next if ($genes->[$g]->get_all_Transcripts->[0]->ev_set eq 'est');
}
$gene_count++;
foreach my $trans (@{$genes->[$g]->get_all_Transcripts}){
$non_con{$g} = count_non_canonical_splice_sites($trans);
my @exons = @{$trans->get_all_Exons};
foreach (my $i =0 ; $i < scalar(@exons) ; $i++){
my $exon = $exons[$i];
my $unique_key = $exon->start.":".$exon->end.":".$exon->strand;
push @{$exon_hash{$unique_key}} , $g;
}
}
}
if ($gene_count <= 2){
return $genes;
}
foreach my $key (sort keys %exon_hash){
if (scalar(@{$exon_hash{$key}}) == 1){
$singletons{$exon_hash{$key}[0]}++;
$delete{$exon_hash{$key}[0]} = 1;
}
}
if ( scalar (keys %delete) >= $gene_count - 2 ){
print "All transcripts look dodgy, not filtering\n" if $VERBOSE;
return $genes;
} else {
for ( my $g = 0 ; $g < scalar(@$genes) ; $g++ ){
if ( $FILTER_SINGLETONS && $singletons{$g} && $singletons{$g}>= $FILTER_SINGLETONS ){
print "Filtering out ".$genes->[$g]->dbID." ".$genes->[$g]->biotype." ".$genes->[$g]->start.":".$genes->[$g]->end.":".$genes->[$g]->strand." " if $VERBOSE;
print $singletons{$g}." singleton exons\n" if $VERBOSE;
next;
}
if ( $FILTER_NON_CONSENSUS && $non_con{$g} && $non_con{$g} >= $FILTER_NON_CONSENSUS ){
print "Filtering out ".$genes->[$g]->dbID." ".$genes->[$g]->biotype." ".$genes->[$g]->start.":".$genes->[$g]->end.":".$genes->[$g]->strand." " if $VERBOSE;
print $non_con{$g}." non - consensus\n" if $VERBOSE;
next;
}
push @filtered_genes , $genes->[$g];
}
}
return\@ filtered_genes; } |
sub make_genes
{ my ($self,$transcripts,$cluster,$collapsed_cluster)= @_;
my @genes;
my @final_genes;
$self->throw("No Transcripts to use\n") unless defined($transcripts);
SCORE: foreach my $t (sort {$b->score <=> $a->score } @$transcripts){
my $transcript = $self->clone_transcript_extended($t);
my $cds_start_exon;
my $cds_end_exon;
my $similarity_tran = $transcript->similarity;
my $translation = Bio::EnsEMBL::Translation->new();
my $strand= 1;
my $slice = $similarity_tran->slice;
my $weighted_score;
if ($ADD_UTR){
print "Transcript ".$transcript->start .":".$transcript->end .":".$transcript->strand." ".$transcript->biotype." " if $VERBOSE;
print scalar(@{$transcript->get_all_Exons})." Exons\n" if $VERBOSE;
my $last_exon;
foreach my $exon (@{$transcript->get_all_Exons}){
if ($last_exon){
my $intron = $collapsed_cluster->make_intron_from_exons($last_exon,$exon);
}
$last_exon = $exon;
}
if ($transcript->strand == -1){
my $invertedslice = $similarity_tran->slice->invert;
$strand = -1;
$similarity_tran = $similarity_tran->transfer($invertedslice);
$transcript = $transcript->transfer($invertedslice);
}
my $cds_start = $similarity_tran->coding_region_start;
my $cds_end = $similarity_tran->coding_region_end;
my @exon_array = @{$transcript->get_all_Exons};
my @similarity_exons = sort { $a->start <=> $b->start } @{$similarity_tran->get_all_Exons};
my $number = scalar(@exon_array);
for ( my $i = 0 ; $i < scalar(@exon_array) ; $i ++ ){
my $exon = $exon_array[$i];
$exon->phase( -1 );
$exon->end_phase( -1 );
foreach my $se (@similarity_exons){
if ($exon->start == $se->start){
$exon->phase($se->phase);
}
if ( $exon->end == $se->end ){
$exon->end_phase($se->end_phase);
}
}
if ($exon->start <= $cds_start && $exon->end >= $cds_start){
$translation->start_Exon($exon);
$translation->start($cds_start - $exon->start+1);
$exon->phase( -1 ) if $exon->start < $cds_start;
}
if ($exon->start <= $cds_end && $exon->end >= $cds_end){
$translation->end_Exon($exon);
$translation->end($cds_end - $exon->start+1);
$exon->end_phase( -1 ) if $exon->end > $cds_end;
}
if ($exon->end == $similarity_exons[0]->end){
$cds_start_exon = $exon;
}
if ($exon->start == $similarity_exons[-1]->start){
$cds_end_exon = $exon;
}
}
unless ( $translation->start_Exon && $translation->start_Exon eq $cds_start_exon ) {
throw("Cannot find the coding sequence start exon\n") unless $cds_start_exon;
$translation->start_Exon($cds_start_exon);
my $trim = $cds_start_exon->start - $cds_start;
if ( $trim%3 == 0 ){
$translation->start(1);
} else {
$translation->start(1+(3-$trim%3));
}
$cds_start_exon->phase( -1 ) if $cds_start_exon->start < $cds_start;
}
unless ($translation->end_Exon && $translation->end_Exon eq $cds_end_exon ){
throw("Cannot find the coding sequence end exon\n") unless $cds_end_exon;
$translation->end_Exon($cds_end_exon);
my $trim = $cds_end - $cds_end_exon->end;
if ( $trim%3 == 0 ){
$translation->end($cds_end_exon->length);
} else {
$translation->end($cds_end_exon->length-(3-$trim%3));
}
$cds_end_exon->end_phase( -1 ) if $cds_end_exon->end > $cds_end;
}
$transcript->translation($translation);
if ($strand == -1){
my $slice = $transcript->slice->invert;
$transcript = $transcript->transfer($slice);
$similarity_tran = $similarity_tran->transfer($slice);
}
} else {
my $newtranslation = clone_Translation($similarity_tran,$transcript);
$transcript->translation($newtranslation);
my @exon_array = @{$transcript->get_all_Exons};
my @similarity_exons = sort { $a->start <=> $b->start } @{$similarity_tran->get_all_Exons};
for ( my $i = 0 ; $i < scalar(@exon_array) ; $i ++ ){
my $exon = $exon_array[$i];
foreach my $se (@similarity_exons){
if ($exon->start == $se->start){
$exon->phase($se->phase);
}
if ( $exon->end == $se->end ){
$exon->end_phase($se->end_phase);
}
}
}
}
$weighted_score = $self->compare_translation($transcript,$collapsed_cluster);
print $transcript->score."\t" if $VERBOSE;
print "\npeptide weighted score $weighted_score\n\n" if $VERBOSE;
$transcript->score( $weighted_score );
$transcript->analysis($similarity_tran->analysis);
$transcript->slice($similarity_tran->slice);
my $gene = Bio::EnsEMBL::Gene->new;
$gene->slice($transcript->slice);
$gene->add_Transcript($transcript);
$gene->analysis($transcript->analysis);
push @genes,$gene;
}
push @genes, @{$self->add_single_exon_genes($cluster,$collapsed_cluster)};
@genes = sort { $b->get_all_Transcripts->[0]->score <=> $a->get_all_Transcripts->[0]->score } @genes;
my $top_score = $genes[0]->get_all_Transcripts->[0]->score;
my $bottom_score = $genes[-1]->get_all_Transcripts->[0]->score;
$bottom_score-= 0.0001 if $bottom_score == 0;
foreach my $gene (@genes){
foreach my $transcript ( @{$gene->get_all_Transcripts}){
my $biotype = $BAD_BIOTYPE;
if ($GOOD_PERCENT){
my $top = ($transcript->score - $bottom_score );
my $bottom = ($top_score - $bottom_score);
if ($bottom && (($top/$bottom) * 100 >= 100 - $GOOD_PERCENT)){ $biotype = $GOOD_BIOTYPE; }
} else {
if ($transcript->score == $top_score){
$biotype = $GOOD_BIOTYPE;
}
}
$biotype = $SMALL_BIOTYPE if (scalar(@genes) < $MIN_CONSENSUS);
$gene->biotype($biotype);
push @final_genes, $gene if $biotype;
}
}
foreach my $final(@final_genes){
foreach my $transcript(@{$final->get_all_Transcripts}){
throw(Transcript_info($transcript)." has inconsistent phases")
unless(are_phases_consistent($transcript));
my $final_score = $transcript->score / (scalar(@{$transcript->get_all_Exons})*2-1); foreach my $tsf ( @{$transcript->get_all_supporting_features}) {
$tsf->hcoverage($tsf->score);
$tsf->score(sprintf("%.4f", $final_score));
}
print "Transcript ". ($transcript->start + $self->solexa_slice->start -1) .":".
($transcript->end + $self->solexa_slice->start-1) . ":".
$transcript->strand." ".
$final->biotype." " if $VERBOSE && $SOLEXA;
print "Final score $final_score\n";
}
}
$self->output(\@final_genes);
return; } |
sub make_transcripts
{ my ($self,$cluster,$collapsed_cluster) = @_;
my @genes = $cluster->get_Genes;
my @est_genes;
my @similarity_genes;
my $longest_similarity = 0;
my @all_transcripts;
foreach my $gene (@genes){
push @est_genes, $gene if $gene->get_all_Transcripts->[0]->ev_set eq 'est';
push @similarity_genes, $gene if $gene->get_all_Transcripts->[0]->ev_set eq 'simgw';
}
foreach my $similarity (@similarity_genes) {
foreach my $trans (@{$similarity->get_all_Transcripts}) {
my @partial_transcripts;
my @possible_transcripts;
my @possible_start_utr;
my @possible_end_utr;
my @transcript_exons = sort {$a->start <=> $b->start } @{$trans->get_all_Exons};
next if (scalar(@transcript_exons) <= 1);
my $exon_count = $#transcript_exons;
my $start_splice;
my $end_splice;
my @similarity_exons;
my $almost_complete;
my @start_utr;
my @end_utr;
$start_splice = $collapsed_cluster->make_intron_from_exons($transcript_exons[0],$transcript_exons[1]);
$self->throw("Intron was not previously defined in the cluster\n")
unless $collapsed_cluster->contains_intron($start_splice);
my @joining_exons = @{$collapsed_cluster->last_exons_by_intron($start_splice)};
foreach my $exon (@joining_exons) {
my @transcripts = @{$collapsed_cluster->transcript_by_feature($exon)};
foreach my $trans (@transcripts) {
next unless $trans->ev_set eq 'est' ;
push @start_utr, $trans if $ADD_UTR;
}
}
$end_splice = $collapsed_cluster->make_intron_from_exons($transcript_exons[$exon_count-1],$transcript_exons[$exon_count]);
$self->throw("Intron was not previously defined in the cluster\n")
unless $collapsed_cluster->contains_intron($end_splice);
@joining_exons = @{$collapsed_cluster->next_exons_by_intron($end_splice)};
foreach my $exon (@joining_exons) {
my @transcripts = @{$collapsed_cluster->transcript_by_feature($exon)};
foreach my $trans (@transcripts) {
next unless $trans->ev_set eq 'est' ;
push @end_utr, $trans if $ADD_UTR;
}
}
if ( scalar(@start_utr) > 0 ) {
foreach my $est (@start_utr) {
my @exon_array;
foreach my $exon (@{$est->get_all_Exons}){
if ($exon->end <= $start_splice->start){
push @exon_array, $exon;
}
}
my $new_transcript = $self->transcript_from_exons(\@exon_array,undef,$collapsed_cluster)
if scalar(@exon_array > 0);
if ($new_transcript){
$new_transcript->add_supporting_features(@{$est->get_all_supporting_features});
push @possible_start_utr, $new_transcript;
}
}
@possible_start_utr = sort { $a->score <=> $b->score } @possible_start_utr if @possible_start_utr;
push @partial_transcripts, $possible_start_utr[-1] if @possible_start_utr;
}
my @exon_array = ( $transcript_exons[0] );
my $new_transcript = $self->transcript_from_exons(\@exon_array,undef,$collapsed_cluster);
$self->throw("Unable to make transcript using first exon of similarity gene\n")
unless $new_transcript;
push @partial_transcripts, $new_transcript if $new_transcript;
@similarity_exons = @{$trans->get_all_Exons};
pop @similarity_exons;
shift @similarity_exons;
foreach my $transcript (@partial_transcripts) {
my $score;
$almost_complete = $self->transcript_from_exons(\@similarity_exons,$transcript,$collapsed_cluster);
if ( scalar(@end_utr) > 0 ) {
foreach my $est (@end_utr) {
my @exon_array;
foreach my $exon (@{$est->get_all_Exons}){
if ($exon->start >= $end_splice->end){
push @exon_array, $exon;
}
}
my $new_transcript = $self->transcript_from_exons(\@exon_array,$almost_complete,$collapsed_cluster)
if scalar(@exon_array > 0);
if ($new_transcript){
$new_transcript->add_supporting_features(@{$est->get_all_supporting_features});
$new_transcript->similarity($trans);
push @possible_end_utr, $new_transcript;
}
}
@possible_end_utr = sort { $a->score <=> $b->score } @possible_end_utr;
push @possible_transcripts, $possible_end_utr[-1];
}
my @exon_array = ( $transcript_exons[-1] );
my $new_transcript = $self->transcript_from_exons(\@exon_array,$almost_complete,$collapsed_cluster);
$new_transcript->similarity($trans);
$self->throw("Unable to make transcript using last exon of similarity gene\n")
unless $new_transcript;
push @possible_transcripts, $new_transcript if $new_transcript;
}
@possible_transcripts = sort { $a->score <=> $b->score } @possible_transcripts if @possible_transcripts;
my $chosen_transcript = $possible_transcripts[-1];
$chosen_transcript->add_supporting_features(@{$trans->get_all_supporting_features});
$chosen_transcript->biotype($trans->biotype);
push @all_transcripts, $chosen_transcript ;
}
}
return\@ all_transcripts; } |
sub run
{ my ($self) = @_ ;
my $count =1;
my $genes_by_strand;
my @allgenes = @{ $self->get_genes_by_evidence_set('simgw') } ;
push @allgenes , @{ $self->get_genes_by_evidence_set('est') } ;
if ($FILTER_SINGLETONS or $FILTER_NON_CONSENSUS){
my @tmp;
my ($clusters, $non_clusters) = cluster_Genes(\@allgenes, $self->get_all_evidence_sets ) ;
foreach my $cluster(@$clusters, @$non_clusters){
my @genes = $cluster->get_Genes;
@genes = @{$self->filter_genes(\@genes)};
push(@tmp, @genes);
}
@allgenes = @tmp;
}
my ($clusters, $non_clusters) = cluster_Genes(\@allgenes, $self->get_all_evidence_sets ) ;
foreach my $gene (@allgenes){
push @{$genes_by_strand->{$gene->strand}}, $gene;
}
push @$clusters, @$non_clusters if (scalar(@$non_clusters) > 0 ) ;
my $found;
$count = 0;
foreach my $cluster (@$clusters){
my $simgw;
my @genes = $cluster->get_Genes;
next unless $cluster->get_Genes_by_Set('simgw');
print "\nCluster $count\n" if $VERBOSE;
print $cluster->start." ".$cluster->end." ".$cluster->strand."\n" if $VERBOSE;
$count++;
my $collapsed_cluster = $self->collapse_cluster($cluster,$genes_by_strand);
my $transcripts = $self->make_transcripts($cluster,$collapsed_cluster);
$self->make_genes($transcripts,$cluster,$collapsed_cluster);
} } |
sub score
{ my ($self,$collapsed_cluster,$feature,$genes,$type) = @_;
my $total_score;
my $solexa_score = 0;
$total_score = $collapsed_cluster->get_intron_count($feature) if $type eq 'intron';
$total_score = $collapsed_cluster->get_exon_count($feature) if $type eq 'exon';
$collapsed_cluster->exon_info($feature) if $VERBOSE;
print " " if $VERBOSE;
my $score = 0 ;
my $overlap = $MIN_CONSENSUS ;
my $est_trans = $MIN_CONSENSUS;
if ($SOLEXA && defined($self->solexa_slice) && $type eq 'exon'){
my $sub_slice = $self->solexa_slice->adaptor->fetch_by_region
(
'toplevel',
$self->solexa_slice->seq_region_name,
$feature->start + $self->solexa_slice->start,
$feature->end + $self->solexa_slice->start,
1);
$self->throw("Sub slice not found " . $feature->start . " " . $feature->end . " " . $feature->strand . "\n")
unless $sub_slice;
$solexa_score = scalar(@{$sub_slice->get_all_DnaAlignFeatures
(
$$TRANSCRIPT_CONSENSUS_DB_CONFIG{"SOLEXA_DB"}->{"BIOTYPE"}[0],
$SOLEXA_SCORE_CUTOFF)}
);
print "Solexa reads = $solexa_score\n" if $VERBOSE;
}
foreach my $gene (@$genes){
foreach my $tran (@{$gene->get_all_Transcripts}){
if ($type eq 'exon'){
$est_trans++ if $tran->ev_set eq 'est';
$overlap++;
}
next if scalar(@{$tran->get_all_Exons}) == 1;
if ($type eq 'intron'){
if ($feature->start < $tran->end() && $feature->end > $tran->start()){
$est_trans++ if $tran->ev_set eq 'est';
$overlap++;
}
}
}
}
my $est_exons = $collapsed_cluster->ev_count_by_feature($feature,'est');
if ($type eq 'exon' && $feature->is_terminal_exon && $feature->number_exons > 1){
print "End exon $est_exons " if $VERBOSE;
$est_exons = $collapsed_cluster->ev_count_by_terminal_feature($feature,'est');
$total_score = $collapsed_cluster->count_end_exon($feature);
$self->throw("No evidence found for exon " . $collapsed_cluster->exon_info($feature) . "\n")
unless defined($est_exons);
}
print "$type " if $VERBOSE;
print "TOTAL $total_score OVERLAP $overlap " if $VERBOSE;
if ($est_exons){
print "WEIGHTED GENES $est_exons EST TRANS $est_trans " if $VERBOSE;
my $weighted_score = 0;
my $unweighted_score = 0;
if ($est_trans){
$weighted_score = ($est_exons + $solexa_score) / ($est_trans + $solexa_score) ; print "weighted score $weighted_score " if $VERBOSE;
}
$unweighted_score = ($total_score + $solexa_score) / ($overlap + $solexa_score) ; print "unweighted score $unweighted_score\t " if $VERBOSE;
$score = ($weighted_score + $unweighted_score) / 2; } else {
$score += (($total_score + $solexa_score) / ($overlap + $solexa_score)) ; if ($est_trans && $est_trans > 1.5 ){
$score -= $EST_OVERLAP_PENALTY;
}
}
if ($feature->is_terminal_exon && $type eq 'exon' && $feature->number_exons > 1){
my $longest_end_exon = $collapsed_cluster->get_end_exon($feature)->length;
my $multiplyer = $feature->length / $longest_end_exon; print "End exon penalty ".$feature->length." / ". $longest_end_exon." -.5\t" if $VERBOSE;
if ($score >= 0){
$score *= $multiplyer;
} else {
$score /= $multiplyer; }
$score = $score - $END_EXON_PENALTY;
print " ENDEXON " if $VERBOSE;
} else {
$score-= 0.5 if $feature->length < $SHORT_INTRON_PENALTY && $type eq 'intron';
$score-= 0.5 if $feature->length < $SHORT_EXON_PENALTY && $type eq 'exon';
}
print "$score\n " if $VERBOSE;
return $score; } |
sub solexa_slice
{ my ($self,$slice) = @_;
if (defined($slice)) {
$self->{_solexa} = $slice;
}
return $self->{_solexa};
}
1; } |
sub transcript_from_exons
{ my ($self, $exons, $transcript, $collapsed_cluster) = @_;
my $new_transcript = Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended->new();
my $prev_exon;
my $score;
@$exons = sort { $a->start <=> $b->start } @$exons;
if (defined($transcript)){
$self->throw("Transcript needs to be a Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended not a $transcript\n")
unless $transcript->isa("Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended");
$score = $transcript->score;
foreach my $exon (sort { $a->start <=> $b->start } @{$transcript->get_all_Exons}){
$new_transcript->add_Exon($exon);
$prev_exon = $exon;
}
}
foreach my $exon (@$exons){
my $collapsed_exon = $collapsed_cluster->get_exon($exon);
unless ( $exon ){
$collapsed_cluster->exon_info($collapsed_exon) if $VERBOSE;
$self->throw("No score assigned to this exon\n");
}
if ($prev_exon){
my $intron = $collapsed_cluster->make_intron_from_exons($prev_exon,$exon);
unless ( $intron ){
$collapsed_cluster->exon_info($intron) if $VERBOSE;
$self->throw("No score assigned to this intron\n");
}
$score += $collapsed_cluster->intron_score($intron);
$score -= $UTR_PENALTY if $exon->ev_set eq 'est';
}
$score += $collapsed_cluster->exon_score($collapsed_exon);
$score -= $UTR_PENALTY if $exon->ev_set eq 'est';
$prev_exon = $exon;
$new_transcript->add_Exon($collapsed_exon);
}
$new_transcript->score($score);
return $new_transcript; } |
General documentation
NAME Bio::EnsEMBL::Analysis::Runnable::TranscriptConsensus | Top |