Bio::EnsEMBL::Analysis::Runnable
BestTargetted
Toolbar
Package variables
No package variables defined.
Included modules
Inherit
Synopsis
my $runnable = Bio::EnsEMBL::Analysis::Runnable::BestTargetted->new(
-query => $slice,
);
$runnable->run;
my @genes = @{$runnable->output};
Description
BestTargetted combines gene-structures from any number of different targetted runs
generating a new set containing the best genes from the sets.
NOTES
- Each gene has only a single transcript
- First, Genes are clustered into overlapping clusters. This does not mean that
every Transcript in the cluster will overlap with every other Transcript in
the cluster. It is more like TranscriptCoalescer where each Transcript in the
cluster overlaps with at least one other Transcript.
- Unclustered Transcripts (those that do not overlap with any other Transcript)
are automatically kept
- If Genes in a cluster consist of those from only one analysis (biotype),
then all the Genes in this cluster will be stored.
- The tricky bit. Now we have a cluster that contains Genes from more than one
analysis. The Transcripts in the this cluster are hashed by:
Protein -> Block -> list of genes in block
where Protein is the protein from which the Transcript is made and Block is a
location where all the Transcripts in this location overlap with every other
Transcript, and all Transcripts are made from the same protein.
- Foreach block, we want to find the best protein. This is achieved by ordering
the Genes in the block by biotype, with Genes from the preferred analysis
being first in the list. The Transcript translations are compared against the
actual source protein. The first Transcript to match exactly is stored. If no
Transcripts match exactly, but all Transcripts have the same Tranlasion, then
store the first Transcript (Gene). If no Transcripts match exactly and they
have different translation sequences, then use exonerate to find the best Gene.
If there is a choice between 2 short genes from one analysis vs 1 long gene
from a second analysis, then choose the 2 short genes (provided they have a
good enough score/percent_id according to Exonerate).
- also, whene there is only one gene in a protein cluster, we might get suspicious.
In these cases, exonerate the gene to check it's score and percent_id to the
original protein. (Often a bad alignment has been produced.) Flag if the
exonerate results indicate that we may have a dodgy alignmennt.
Methods
Methods description
Arg [0] : ref to a Bio::EnsEMBL::Analysis Arg [1] : ref to a Bio::EnsEMBL::Gene Function : makes a gene and sets biotype and analysis Returntype: new Bio::EnsEMBL::Gene |
Arg [0] : runnable Arg [1] : ref to a hash Function : Returntype: 2 hashrefs |
Arg [0] : ref to a Bio::EnsEMBL::Analysis Arg [1] : ref to a Bio::EnsEMBL::Gene Arg [2] : ref to a Bio::EnsEMBL::Transcript (optional) Function : Clones the gene object and assigns the new analysis to it Returntype: new Bio::EnsEMBL::Gene Example : |
Arg : none Function : Runs BestTargetted - compares overlapping transcripts from both sets, constructed from the same evidence and stores the best model Returnval : none |
Arg [0] : query Bio::Seq object Arg [1] : target Bio::Seq object Function : calls exonerate->run Returntype: score - integer |
Methods code
sub all_genes
{ my ($self, $val) = @_;
if (defined $val) {
$self->{_all_genes} = $val;
}
return $self->{_all_genes}; } |
sub biotypes
{ my ($self, $val) = @_;
if (defined $val) {
$self->{_bt_biotypes} = $val;
}
return $self->{_bt_biotypes}; } |
sub check_for_pairs
{ my ($genes) = @_;
my %logics;
print "we have ". scalar @$genes ." genes to compare\n";
foreach my $outer (@$genes) {
foreach my $inner (@$genes) {
if (($outer->biotype eq $inner->biotype) && ($outer->start > $inner->end || $outer->end < $inner->start)) {
$logics{$outer->biotype} += 1;
}
}
}
foreach my $k (keys %logics){
print "Double count $k ".($logics{$k} / 2)."\n"; #divided by 2 because each pair counted twice }
#print if a biotype has >2 transcripts in that cluster foreach my $k (keys %logics){ if ($logics{$k} > 2){ print "Eek - more than 1 pair of doubles for a single biotype\n"; last;
}
}
return keys %logics; } |
sub check_gene
{ my ($self, $gene) = @_;
my $pass_checks = 1;
my $hit_name;
my @transcripts = @{$gene->get_all_Transcripts};
my $protein_acc;
if (scalar(@transcripts) != 1) {
print STDERR "Gene with dbID ".$gene->dbID." has ".scalar(@transcripts)." transcripts\n";
$pass_checks = 0;
}
foreach my $transcript (@transcripts) {
my @tsfs = @{$transcript->get_all_supporting_features};
if (scalar(@tsfs) == 1) {
$hit_name = $tsfs[0]->hseqname;
my $entry_obj1 = $self->seqfetcher->get_entry_by_acc($hit_name);
if ($entry_obj1 =~ m/^\>(\S+)\n/) { $protein_acc = $1; } elsif ($entry_obj1 =~ m/^\>(\S+)\s+(\S+)\n/) { # $secondary_header_id = $2; $protein_acc = $2;
} else {
throw("Entry has unusual header :\" $entry_obj1\" - hit_name\" $hit_name\"");
}
} else {
print STDERR "Transcript with dbID ".$transcript->dbID.
" has ".scalar(@tsfs)." tsfs. ".
"Should only have one transcript_supporting_feature\n";
$pass_checks = 0;
}
if (!defined($transcript->translation)) {
print STDERR "Transcript with dbID ".$transcript->dbID." does not translate.\n";
$pass_checks = 0;
}
}
return $pass_checks, $protein_acc; } |
sub cluster_by_protein
{ my ($self, $cluster) = @_;
my %protein_hash;
GENE: foreach my $gene ($cluster->get_Genes()) {
my ($gene_ok, $protein) = check_gene($self, $gene);
if (!$gene_ok) {
warning("Gene check failed for gene ".$gene->dbID." (in cluster with 2 or more Sets) - removed from analysis");
next GENE;
}
my $duplicate;
if (!exists $protein_hash{$protein}{$gene->biotype}) {
push @{$protein_hash{$protein}{$gene->biotype}}, $gene;
} else {
GOT: foreach my $got_gene (@{$protein_hash{$protein}{$gene->biotype}}){
if ($gene->biotype ne $got_gene->biotype) {
next GOT;
}
my @transcripts1 = @{$gene->get_all_Transcripts()};
my @transcripts2 = @{$got_gene->get_all_Transcripts()};
if (identical_Transcripts($transcripts1[0], $transcripts2[0])) {
$duplicate = 1;
}
}
if (!$duplicate) {
push @{$protein_hash{$protein}{$gene->biotype}}, $gene;
}
}
}
return\% protein_hash; } |
sub copy_empty_gene
{ my ($analysis, $gene) = @_;
my $newgene = new Bio::EnsEMBL::Gene;
$newgene->biotype( $gene->biotype);
$newgene->analysis($analysis);
return $newgene; } |
sub filter_transcripts
{ my ($h) = @_;
my %hash = %{$h};
my %hash2;
foreach my $protein (keys %hash) {
print "Filtering protein $protein\n";
foreach my $biotype (keys %{$hash{$protein}}) {
print "Filtering biotypes in random order $biotype\n";
my @genes = @{$hash{$protein}{$biotype}};
print " started off with ".scalar(@genes)." genes\n";
foreach my $g (@genes) {
print "... gene ".$g->dbID."\n";
}
if (scalar(@genes) > 1) {
print "We have doubles or overlapping\n";
my @genes2 = @{tidy_up($hash{$protein}{$biotype})};
print " now have ".scalar(@genes2)." genes left\n";
push @{$hash2{$protein}{$biotype}}, @{tidy_up($hash{$protein}{$biotype})};
} else {
push @{$hash2{$protein}{$biotype}}, @genes;
}
}
}
return\% hash2; } |
sub get_alternate_transcripts
{ my ($self, $cluster) = @_;
my %alternates;
my %seen;
GENE: foreach my $gene ($cluster->get_Genes()) {
if (exists $seen{$gene->dbID}) {
next GENE;
}
my ($gene_ok, $protein) = check_gene($self, $gene);
if (!$gene_ok) {
throw("Gene check failed for gene ".$gene->dbID."");
}
push @{$alternates{$protein}{$gene->dbID}}, $gene;
$seen{$gene->dbID} = 1;
my $exon_string = make_exon_string(@{$gene->get_all_Transcripts}[0]);
INNER: foreach my $inner_gene ($cluster->get_Genes()) {
if (exists $seen{$inner_gene->dbID}) {
next INNER;
}
my ($inner_gene_ok, $inner_protein) = check_gene($self, $inner_gene);
if (!$inner_gene_ok) {
throw("Gene check failed for gene ".$inner_gene->dbID."");
}
if ($protein ne $inner_protein) {
next INNER;
}
my $inner_exon_string = make_exon_string(@{$inner_gene->get_all_Transcripts}[0]);
if ($exon_string eq $inner_exon_string) {
$seen{$inner_gene->dbID} = 1;
next INNER;
} elsif ($gene->start <= $inner_gene->end && $gene->end >= $inner_gene->start) {
if (overlaps_all($alternates{$protein}{$gene->dbID}, $inner_gene) ) {
push @{$alternates{$protein}{$gene->dbID}}, $inner_gene;
$seen{$inner_gene->dbID} = 1;
}
} else {
print "Genes ".$gene->dbID." (exon string ".$exon_string.") and ".
$inner_gene->dbID." (exon string $inner_exon_string) do not overlap.\n";
} } } return\% alternates; } |
sub get_best_gene
{ my ($self, $protein, $biotypes, $hsh) = @_;
my %hash = %{$hsh};
my $biotypes_represented;
my $token_gene;
foreach my $biotype (keys %{$hash{$protein}}) {
if (scalar(@{$hash{$protein}{$biotype}}) > 0) {
$biotypes_represented++;
$token_gene = $hash{$protein}{$biotype}->[0];
}
}
if ($biotypes_represented == 1 && scalar(@$biotypes) > 2) {
print "FLAG_NULL1: For $protein, only one biotype represented. See gene ".$token_gene->dbID." (".$token_gene->biotype.")\n";
if (!defined $self->keep_single_analysis) {
print "FLAG_NULL2: For $protein, am returning no genes.\n";
return [];
}
}
my @doubles;
my $num_genes = 0;
foreach my $biotype (keys %{$hash{$protein}}) {
print "biotype $biotype has ".scalar(@{$hash{$protein}{$biotype}})." genes\n ";
$num_genes += scalar(@{$hash{$protein}{$biotype}});
if (scalar(@{$hash{$protein}{$biotype}}) > 1) {
print "*** Have ".$biotype." doubles so need to be careful ***\n";
push @doubles, $biotype;
}
}
my $pep = $self->seqfetcher->get_Seq_by_acc($protein)->seq;
if (!$pep) {
throw("Unable to fetch peptide sequence");
}
my @best;
my $exonerate; print "Finding best gene for protein $protein...\n";
if (scalar(@doubles) > 0) {
print "We ave doubles so let's just use them\n";
OUTER: foreach my $biotype (@$biotypes) {
INNER: foreach my $type (@doubles) {
if ($biotype eq $type) {
print "The most favoured set of doubles is $biotype so we takes them\n";
foreach my $gene (@{$hash{$protein}{$biotype}}) {
print "Keeping gene ".$gene->dbID."\n";
}
push @best, @{$hash{$protein}{$biotype}};
return\@ best;
}
}
}
}
my @match_pep;
my %tln_groups;
my $tln_seq;
OUTER: foreach my $type (@$biotypes) {
INNER: foreach my $biotype (keys %{$hash{$protein}}) {
if ($biotype ne $type) {
next INNER;
}
if (scalar(@{$hash{$protein}{$biotype}}) > 0) {
foreach my $gene (@{$hash{$protein}{$biotype}}) {
print "Got gene dbid ".$gene->dbID." ".$gene->biotype."\n";
foreach my $transcript (@{$gene->get_all_Transcripts}) {
$tln_seq = $transcript->translate->seq;
if ($tln_seq eq $pep) {
push @match_pep, $gene;
}
push @{$tln_groups{$tln_seq}}, $gene;
}
}
} else {
print "No genes for biotype $biotype\n";
}
}
}
my $num_tln_groups = scalar(keys %tln_groups);
print "Have $num_tln_groups different possible translations represented.\n";
foreach my $s (keys %tln_groups) {
print "option ".$tln_groups{$s}[0]->biotype.": $s\n";
}
if (scalar(@match_pep == 1)) {
print "Only one exact match to protein.\n Keeping only gene ".$match_pep[0]->dbID." (".$match_pep[0]->biotype.")\n";
@best = ( $match_pep[0] );
} elsif (scalar(@match_pep > 1)) {
print "More than one exact match to protein.\n";
if (scalar(@doubles) > 0) {
print "Have doubles and more than one match to the protein.\n";
$exonerate = 1;
} else {
print "Keeping first gene ".$match_pep[0]->dbID.
" from array as we prefer it's analysis (".$match_pep[0]->biotype.")\n";
@best = ( $match_pep[0] );
}
} else {
print "No exact match to protein. ";
if (($num_genes > 1) && (scalar(keys %tln_groups) == 1) && (scalar(@doubles < 1))) {
print "All transcripts have the same translated sequence.\n Keeping first gene ".$tln_groups{$tln_seq}->[0]->dbID.
" from array as we prefer it's analysis (".$tln_groups{$tln_seq}->[0]->biotype.")\n";
print "*** Maybe we should exonerate or look at exon structure to decide what to choose?\n";
@best = ( $tln_groups{$tln_seq}->[0] );
} else {
print "We have $num_genes genes, ".scalar(keys %tln_groups)." different sequences ".
" and ".scalar(@doubles)." doubles. Let's dig deeper.\n";
$exonerate = 1;
}
}
if (defined $exonerate) {
print "Exonerate...\n";
my $gene_tln_comp;
my $max_gene_score = 0;
my $max_perc_id = 0;
my $best;
my $first_best;
my %scores;
my $target = Bio::Seq->new( -display_id => $protein, -seq => $pep);
foreach my $biotype (@$biotypes) {
if (defined $hash{$protein}{$biotype}) {
foreach my $gene (@{$hash{$protein}{$biotype}}) {
if (!defined $best) {
print "First gene is ".$gene->dbID."\n";
$best = $gene;
$first_best = $gene;
}
foreach my $transcript (@{$gene->get_all_Transcripts}) {
print "Running transcript ".$transcript->dbID."\n";
my $query = Bio::Seq->new( -display_id => $transcript->dbID, -seq => $transcript->translate->seq);
my ($gene_score, $perc_id) = @{$self->run_exonerate($query, $target)};
print "Gene ".$gene->dbID." (".$gene->biotype.") has score $gene_score and percent_id $perc_id (ratio ".
($transcript->translation->length)/($transcript->end - $transcript->start).")\n"; $scores{$gene->biotype}{$gene->dbID}{'score'} = $gene_score;
$scores{$gene->biotype}{$gene->dbID}{'percent_id'} = $perc_id;
$scores{$gene->biotype}{$gene->dbID}{'ratio'} = ($transcript->translation->length)/($transcript->end - $transcript->start); $scores{$gene->biotype}{$gene->dbID}{'gene'} = $gene;
if ($gene_score > $max_gene_score) {
$max_gene_score = $gene_score;
$best = $gene;
print "Gene with max score $max_gene_score is now ".$gene->dbID." (with percent_id $perc_id)\n";
} if ($perc_id > $max_perc_id) {
$max_perc_id = $perc_id;
}
} } }
}
my %allowed_doubles;
if (scalar(@doubles) == 0) {
print "No doubles, Checking the ratio\n";
if ($scores{$best->biotype}{$best->dbID}{'ratio'} >= 0.8*($scores{$first_best->biotype}{$first_best->dbID}{'ratio'})) {
@best = ($best);
print "ratio OK. Keeping gene ".$best->dbID." biotype ".$best->biotype.
" , ratio ".$scores{$best->biotype}{$best->dbID}{'ratio'}." vs ".
$scores{$first_best->biotype}{$first_best->dbID}{'ratio'}."\n";
} else {
print "Ratio too low. Befault to 1st biotype. Keeping gene ".$first_best->dbID." biotype ".$first_best->biotype."\n";
@best = ($first_best);
}
} elsif (scalar(@doubles) >= 1) {
print "We have ".scalar(@doubles)." sets of doubles\n";
foreach my $dbl_biotype (@doubles) {
$allowed_doubles{$dbl_biotype} = 0;
foreach my $gid (keys %{$scores{$dbl_biotype}}) {
if ($scores{$dbl_biotype}{$gid}{'score'} >= 0.9*$max_gene_score) {
$allowed_doubles{$dbl_biotype} += $scores{$dbl_biotype}{$gid}{'score'};
} else {
print " Not acceptable: gene $gid has score ".$scores{$dbl_biotype}{$gid}{'score'}." which is too low\n";
} } }
my $best_biotype;
my $max_dbl_score = 0;
foreach my $dbl_biotype (@$biotypes) {
if (defined $dbl_biotype && ($allowed_doubles{$dbl_biotype} > $max_dbl_score)) {
$max_dbl_score = $allowed_doubles{$dbl_biotype};
$best_biotype = $dbl_biotype;
}
}
if ($max_dbl_score > 0) {
foreach my $gene (@{$hash{$protein}{$best_biotype}}) {
if (($gene->biotype eq $best_biotype) && ($scores{$gene->biotype}{$gene->dbID}{'score'} >= 0.9*$max_gene_score)) {
print "Keeping gene ".$gene->dbID." biotype ".$gene->biotype."\n";
push @best, $gene;
if ($scores{$gene->biotype}{$gene->dbID}{'percent_id'} < 95) {
print "WARNING_FLAG: percent_id is less than max\n";
}
} else {
print "(Not keeping ".$gene->dbID." as score too low)\n";
}
}
}
}
foreach my $b (@best) {
if ($scores{$b->biotype}{$b->dbID}{'percent_id'} < 90) {
print "FLAG: best_gene ".$b->dbID." has percent_id ".$scores{$b->biotype}{$b->dbID}{'percent_id'}.
"<90 (start ".$b->start." end ".$b->end.")\n";
}
}
} return\@ best; } |
sub get_ordered_genes
{ my ($self, $genes) = @_;
my @ordered;
my %seen;
print "block has ".scalar(@$genes)." genes\n";
foreach my $logic (@{$self->biotypes}) {
foreach my $gene (@$genes) {
if (!exists $seen{$gene} && ($logic eq $gene->biotype)) {
print " Got in cluster: gene ".$gene->dbID." of biotype ".$gene->biotype."\n";
push @ordered, $gene;
$seen{$gene} = 1;
}
}
}
return\@ ordered; } |
sub logic_names
{ my ($self, $val) = @_;
if (defined $val) {
$self->{_bt_logic_names} = $val;
}
return $self->{_bt_logic_names}; } |
sub make_exon_string
{ my ($transcript) = @_;
my $exon_string = "E";
my @exons = sort {$a->start <=> $b->start} @{$transcript->get_all_translateable_Exons};
foreach my $exon (@exons) {
$exon_string .= ":".$exon->start.":".$exon->end;
}
return $exon_string; } |
sub make_outgenes
{ my ($analysis, $gene, $trans) = @_;
my $outgene = copy_empty_gene($analysis, $gene);
if (not defined $trans){
my @trans = @{$gene->get_all_Transcripts};
if (scalar @trans > 1){
print STDERR "gene ".$$gene->dbID." has >1 transcript\n";
}else{
$trans = $trans[0];
}
}
my $outtrans = clone_Transcript($trans);
$outtrans->analysis($analysis);
$outtrans->dbID(undef);
foreach my $sf(@{$outtrans->get_all_supporting_features}){
$sf->dbID(undef);
$sf->analysis($analysis);
}
foreach my $exon (@{$outtrans->get_all_Exons}){
$exon->dbID(undef);
foreach my $sf (@{$exon->get_all_supporting_features}){
$sf->dbID(undef);
$sf->analysis($analysis);
}
}
$outgene->add_Transcript($outtrans);
return $outgene; } |
sub new
{ my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my( $biotypes, $seqfetcher, $verbose, $genes, $keep_single_analysis ) =
rearrange([qw(
BIOTYPES
SEQFETCHER
VERBOSE
GENES
KEEP_SINGLE_ANALYSIS
)], @args);
$self->seqfetcher($seqfetcher) if defined $seqfetcher;
$self->biotypes($biotypes) if defined $biotypes;
$self->verbose($verbose) if defined $verbose;
$self->all_genes($genes) if defined $genes;
$self->keep_single_analysis($keep_single_analysis) if defined $keep_single_analysis;
return $self ; } |
sub overlaps_all
{ my ($gene_array, $gene) = @_;
my $num_overlapped;
my $overlaps_all;
foreach my $g (@$gene_array) {
if ($g->start <= $gene->end && $g->end >= $gene->start) {
$num_overlapped++;
}
}
if ($num_overlapped == scalar(@$gene_array)) {
$overlaps_all = 1;
} else {
print "does not overlap all genes\n";
}
return $overlaps_all; } |
sub run
{ my ($self) = @_ ;
my %outgenes;
my %logic_hash;
foreach my $logic (@{$self->biotypes}) {
print STDERR "got $logic\n";
$logic_hash{$logic} = [$logic];
}
my @allgenes = @{$self->all_genes};
my ($clusters, $non_clusters) = cluster_Genes(\@allgenes,\% logic_hash ) ;
if ($self->verbose){
print @$non_clusters ." non_clusters and ".@$clusters ." clusters\n";
}
foreach my $non_cluster ( @$non_clusters ) {
my @genes = $non_cluster->get_Genes();
GENE: foreach my $gene (@genes){
my ($gene_ok, $protein) = check_gene($self, $gene);
if (!$gene_ok) {
warning("Gene check failed for unclustered gene ".$gene->dbID." - removed from analysis");
next GENE;
}
if ($self->verbose){
print "storing non_cluster_gene ".$gene->dbID."\n";
}
$outgenes{$gene} = make_outgenes($self->analysis, $gene);
}
}
my @twoways;
foreach my $cluster (@$clusters) {
my @inc_sets = @{$cluster->get_sets_included};
if (scalar @inc_sets >= 2) {
push @twoways, $cluster;
} elsif (scalar @inc_sets == 1) {
GENE: foreach my $gene ($cluster->get_Genes_by_Set($inc_sets[0])) {
my ($gene_ok, $protein) = check_gene($self, $gene);
if (!$gene_ok) {
warning("Gene check failed for gene ".$gene->dbID." (in cluster of only one Set) - removed from analysis");
next GENE;
}
if ($self->verbose){
my $gene_id = $gene->dbID ? $gene->dbID : "" ;
print "storing single_set_cluster_gene ".$gene_id ." ".$inc_sets[0]."\n";
}
$outgenes{$gene} = make_outgenes($self->analysis, $gene);
} } }
print "Got " . scalar(@twoways) . " twoways\n";
foreach my $cluster (@twoways) {
my $protein_clusters = cluster_by_protein($self, $cluster);
my $filtered = filter_transcripts($protein_clusters);
foreach my $protein (keys %{$protein_clusters}) {
print "fetching protein from index : $protein\n " ;
my $fetched_protein;
eval {
$fetched_protein = $self->seqfetcher->get_Seq_by_acc($protein) ;
};
if ($@) {
throw ( " Could not get sequence for protein $protein\n from index ". $self->seqfetcher->index_name .
"\n If you use OBDA Index : Maybe your keys are redundant ?\n\n $@ " ) ;
}
unless ( $fetched_protein ) {
throw("SeqFetcher cannot get_Seq_by_acc for protein '$protein' in indexfile : ". $self->seqfetcher->index_name );
}
my $pep = $fetched_protein->seq;
print "Comparing transcripts made from protein $protein with seq $pep\n";
if (!$pep) {
throw("Unable to fetch peptide sequence for protein $protein");
}
my @best_genes = @{$self->get_best_gene($protein, $self->biotypes, $filtered)};
foreach my $best_gene (@best_genes) {
if (! exists $outgenes{$best_gene}){
$outgenes{$best_gene} = make_outgenes($self->analysis, $best_gene);
}
}
} } my @outgenes;
foreach my $gene ( keys %outgenes ) {
push @outgenes, $outgenes{$gene};
}
$self->output(\@outgenes);
} |
sub run_exonerate
{ my ($self, $query, $target) = @_;
my $exonerate = Bio::EnsEMBL::Analysis::Runnable::BestTargettedExonerate->new
(
-program => $self->program,
-analysis => $self->analysis,
-query_seqs => [$query],
-target_seqs => [$target],
-options => "--model affine:local",
);
$exonerate->run;
my ($score, $perc_id) = @{$exonerate->output} ;
return [$score, $perc_id];
}
} |
sub seqfetcher
{ my ($self, $val) = @_;
if (defined $val) {
$self->{_bt_seqfetcher} = $val;
}
return $self->{_bt_seqfetcher}; } |
sub tidy_up
{ my ($genes) = @_;
print "In tidy up...have ".scalar(@$genes)." genes\n";
my @ordered = sort {($a->end - $a->start) <=> ($b->end - $b->start)} @$genes;
my @non_overlapping;
push @non_overlapping, $ordered[0];
my $overlaps = 0;
OUTER: foreach my $outer (@ordered) {
INNER: foreach my $inner (@non_overlapping) {
if ($outer->dbID == $inner->dbID) {
next OUTER;
}
if (($outer->start <= $inner->end && $outer->end >= $inner->start)
|| ($inner->start <= $outer->end && $inner->end >= $outer->start)) {
print "overlaps... checking if it extends the smaller gene... ";
my $outer_transc = $outer->get_all_Transcripts->[0];
my $inner_transc = $inner->get_all_Transcripts->[0];
my $exact_match;
for (my $i=0; $i< scalar(@{$outer_transc->get_all_Exons}); $i++) {
for (my $j=0; $j<scalar(@{$inner_transc->get_all_Exons}); $j++) {
if ($outer_transc->get_all_Exons->[$i]->start == $inner_transc->get_all_Exons->[$j]->start &&
$outer_transc->get_all_Exons->[$i]->end == $inner_transc->get_all_Exons->[$j]->end) {
$exact_match++;
}
}
}
if ($exact_match == scalar(@{$inner_transc->get_all_Exons})) {
print "EXTENDS\n";
} else {
print "OVERLAP\n";
$overlaps++;
next OUTER;
}
}
}
if ($overlaps < 1) {
print "ADDING A DOUBLE\n";
push @non_overlapping, $outer;
}
}
return\@ non_overlapping; } |
sub verbose
{ my ($self, $val) = @_;
if (defined $val) {
$self->{_bt_verbose} = $val;
}
return $self->{_bt_verbose}; } |
General documentation
Bio::EnsEMBL::Analysis::Runnable::BestTargetted