Raw content of Bio::EnsEMBL::Analysis::RunnableDB::UTR_Builder
#
# Ensembl module for Bio::EnsEMBL::Analysis::RunnableDB::UTR_Builder
#
# Cared for by EnsEMBL
#
# Copyright GRL and EBI
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::UTR_Builder
=head1 SYNOPSIS
my $utrbuilder_runnable = new Bio::EnsEMBL::Analysis::RunnableDB::UTR_Builder(
-db => $db,
-input_id => $input_id
);
$utrbuilder_runnable->fetch_input();
$utrbuilder_runnable->run();
$utrbuilder_runnable->output();
$utrbuilder_runnable->write_output(); #writes to DB
=head1 DESCRIPTION
This is the new version of the UTR-addition procedure.
It combines predictions made from proteins with cDNA alignments to add UTR regions to the
gene models. It can also inlcude ESTs and ditags. It uses code from Coalescer/Consensus to produce
score for the alternative models and chose the best option.
It also includes ("look-for-both") code to correct the phases of the transcripts unless they are "blessed"
and inculdes the option to check for predefined protein/cDNA pairing as a first step,
looking for NM/NPentries in a GeneBank file.
Config files to set-up are
Bio::EnsEMBL::Analysis::Config::GeneBuild::UTR_Builder
Bio::EnsEMBL::Analysis::Config::Databases
Bio::EnsEMBL::Analysis::Config::GeneBuild::TranscriptConsensus (just a copy of the example file)
Bio::EnsEMBL::Analysis::Config::GeneBuild::KillListFilter
=head1 CONTACT
Ensembl - ensembl-dev@ebi.ac.uk
=head1 APPENDIX
The rest of the documentation details each of the object methods. Internal methods are
usually preceded with a '_'
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::UTR_Builder;
use vars qw(@ISA);
use strict;
use Bio::SeqIO;
use Bio::EnsEMBL::Utils::Exception qw( throw warning verbose);
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Gene;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Translation;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils qw(
clone_Exon
transfer_supporting_evidence
validate_Exon_coords
);
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw(
is_Transcript_sane
all_exons_are_valid
intron_lengths_all_less_than_maximum
set_start_codon
set_stop_codon
clone_Transcript
has_no_unwanted_evidence
);
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranslationUtils qw(
validate_Translation_coords
compute_translation
contains_internal_stops
print_Translation
);
use Bio::EnsEMBL::Analysis::Tools::Algorithms::ClusterUtils;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonExtended;
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::CollapsedCluster;
use Bio::EnsEMBL::Analysis::Tools::Utilities;
use Bio::EnsEMBL::Analysis::Runnable::TranscriptConsensus;
use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild;
use Bio::EnsEMBL::KillList::KillList;
use Bio::EnsEMBL::KillList::DBSQL::DBAdaptor;
#CONFIG FILES
use Bio::EnsEMBL::Analysis::Config::GeneBuild::UTR_Builder;
use Bio::EnsEMBL::Analysis::Config::GeneBuild::TranscriptConsensus;
use Bio::EnsEMBL::Analysis::Config::GeneBuild::KillListFilter;
@ISA = qw ( Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild );
=head2 new
Arg [1] : Bio::EnsEMBL::Analysis::RunnableDB::UTR_Builder
Function : instatiates object & check config file
Returntype: Bio::EnsEMBL::Analysis::RunnableDB::UTR_Builder
=cut
sub new {
my ($class, @args) = @_;
my $self = $class->SUPER::new(@args);
$self->read_and_check_config($UTR_BUILDER_CONFIG_BY_LOGIC);
return $self;
}
my $totalgenes = 0;
=head2 fetch_input
Arg [1] : none
Description: Get all raw data needed from the databases
Returntype : none
=cut
sub fetch_input{
my ($self) = @_;
my $slice = $self->fetch_sequence(undef, $self->INPUT_DB);
if(!$slice){ throw "can't fetch input slice!\n" }
$self->query($slice);
# fetch all general input genes
foreach my $input_genetype (@{$self->INPUT_GENETYPES}) {
my $input_genes = $self->query->get_all_Genes_by_type($input_genetype);
print STDERR "got " . scalar(@{$input_genes}) . " $input_genetype genes [ ".
$self->INPUT_DB->dbname() . "@" . $self->INPUT_DB->host ." ]\n";
$self->gw_genes( $input_genes );
}
# get blessed genes
my $blessed_slice;
my @blessed_genes;
my $blessed_type;
if($self->BLESSED_DB and scalar(@{$self->BLESSED_GENETYPES})){
if ($self->BLESSED_DB){
#fetch blessed genes from blessed db
$blessed_slice = $self->BLESSED_DB->get_SliceAdaptor->fetch_by_name($self->input_id);
}
else{
#fetch blessed genes from gw db
$blessed_slice = $self->query;
}
foreach my $bgt( @{$self->BLESSED_GENETYPES} ){
my $blessed_genes = $blessed_slice->get_all_Genes_by_type($bgt);
print STDERR "got " . scalar(@{$blessed_genes}) . " $bgt genes [ ".
$self->BLESSED_DB->dbname() . "@" . $self->BLESSED_DB->host ." ]\n";
$self->blessed_genes( $blessed_slice, $blessed_genes );
$blessed_type .= $bgt."";
}
# store all blessed type names for VIP treatment
$self->{'blessed_type'} = ($blessed_type.$self->BLESSED_UTR_GENETYPE);
}
# are there any genes here at all?
if(!scalar @{$self->gw_genes} and !scalar @{$self->blessed_genes}){
print STDERR "\nNo genes found here.\n\n";
return 0;
}
# get cdnas
if(!(defined($self->cDNA_GENETYPE) and $self->CDNA_DB)){
warn("NOT using any cDNAs!?\n");
}
else{
my $cdna_vc = $self->CDNA_DB->get_SliceAdaptor->fetch_by_name($self->input_id);
$self->_cdna_slice($cdna_vc);
foreach my $cdna_type (@{$self->cDNA_GENETYPE}){
#my @cdna_genes = @{$cdna_vc->get_all_Genes_by_type($cdna_type)};
my @cdna_genes = @{$self->_cdna_slice->get_all_Genes_by_type($cdna_type)};
print STDERR "got ".scalar(@cdna_genes)." ".$cdna_type." cDNAs.\n" if $self->VERBOSE;
$self->CDNA_DB->dbc->disconnect_when_inactive(1);
# filter cdnas
my $filtered_cdna = $self->_filter_cdnas(\@cdna_genes, 0);
$self->cdna_genes($filtered_cdna);
print STDERR "got " . scalar(@{$filtered_cdna}) . " cdnas after filtering.\n" if $self->VERBOSE;
}
}
# get ESTs
if(defined($self->EST_GENETYPE) && $self->EST_DB){
my $est_vc = $self->EST_DB->get_SliceAdaptor->fetch_by_name($self->input_id);
my @est_genes = @{$est_vc->get_all_Genes_by_type($self->EST_GENETYPE)};
print STDERR "got " . scalar(@est_genes) . " ".$self->EST_GENETYPE." ESTs.\n";
if($self->FILTER_ESTS){
my $filtered_ests = $self->_filter_cdnas(\@est_genes, 1);
$self->ests($filtered_ests);
}
else{
$self->ests(\@est_genes);
}
print STDERR "got " . scalar(@{$self->ests}) . " ESTs after filtering.\n" if $self->VERBOSE;
}
# get ditags
my ($dfa, $ditag_slice);
my @ditags;
if((scalar @{$self->DITAG_TYPE_NAMES}) && $self->DITAG_DB){
$dfa = $self->DITAG_DB->get_DitagFeatureAdaptor;
$ditag_slice = $self->DITAG_DB->get_SliceAdaptor->fetch_by_name($self->input_id);
foreach my $ditag_type (@{$self->DITAG_TYPE_NAMES}) {
my @type_ditags = @{$dfa->fetch_pairs_by_Slice($ditag_slice, $ditag_type)};
print STDERR "got " . scalar(@type_ditags) . " ".$ditag_type." ditags.\n" if $self->VERBOSE;
push(@ditags, @type_ditags);
}
}
if(scalar @ditags){
@ditags = sort {($a->{'start'} <=> $b->{'start'}) or ($a->{'end'} <=> $b->{'end'})} @ditags;
$self->ditags(\@ditags);
print STDERR "got " . scalar(@ditags) ." ditags.\n";
}
else{
print STDERR "not using Ditags.\n";
}
# db disconnections
$self->INPUT_DB->dbc->disconnect_when_inactive(1);
$self->CDNA_DB->dbc->disconnect_when_inactive(1)
if($self->CDNA_DB);
$self->EST_DB->dbc->disconnect_when_inactive(1)
if($self->EST_DB);
$self->BLESSED_DB->dbc->disconnect_when_inactive(1)
if($self->BLESSED_DB);
$self->DITAG_DB->dbc->disconnect_when_inactive(1)
if($self->DITAG_DB);
# set evidence sets for Coalescer code
my (@est_biotypes, @cdna_logicnames);
push(@est_biotypes, $self->EST_GENETYPE);
my @simgw_biotypes = @{ $self->INPUT_GENETYPES };
push(@cdna_logicnames, @{$self->cDNA_GENETYPE});
$self->{evidence_sets} = {
'est' => \@est_biotypes,
'simgw' => \@simgw_biotypes,
'cdna' => \@cdna_logicnames,
};
# prune genes during filtering?
# (usually not used, as too many good things are thrown out.)
$self->prune($self->PRUNE_GENES);
# get rid of identical models?
# (usually not used, as we want the same number of output as input genes)
$self->{'remove_redundant'} = 0;
#get killlist entries for cDNAs to ignore
my $kill_list_type = "cDNA";
my $kill_list = $self->populate_kill_list($kill_list_type);
$self->kill_list($kill_list);
if($self->LOOK_FOR_KNOWN){
#read KnowUTR pairing into hash
$self->create_predefined_pairing($self->KNOWNUTR_FILE);
}
}
=head2 run
Description: general run method
Returntype : none
=cut
sub run {
my ($self) = @_;
# filter the genewise predictions to remove fragments.
# Make sure we keep the fragments to let the
# genebuilder do the final sorting out, but don't add UTRs to them.
print "TOTAL GENES IN: ".(scalar @{$self->gw_genes} + scalar @{$self->blessed_genes})."\n";
my $ininumber = scalar @{$self->gw_genes};
my $filtered_genes = $self->filter_genes($self->gw_genes);
print STDERR "filtered from $ininumber to ".(scalar @$filtered_genes). " genes.\n" if $self->VERBOSE && $filtered_genes;
#for blessed genes, only check region and existing UTR
$ininumber = scalar @{$self->blessed_genes};
my $filtered_blessed_genes = $self->filter_genes($self->blessed_genes, 1);
print STDERR "filtered blessed from $ininumber to ".(scalar @$filtered_blessed_genes). " genes.\n"
if $self->VERBOSE && $filtered_blessed_genes;
# match gene-model to cDNAs/ESTs for blessed & other genes
$self->run_matching($filtered_blessed_genes, $self->BLESSED_UTR_GENETYPE, 1);
$self->run_matching($filtered_genes, $self->UTR_GENETYPE, 0);
print STDERR "Have ".(scalar @{$self->combined_genes})." combined_genes & ".
(scalar @{$self->unmatched_genes})." unmatched_genes.\n" if $self->VERBOSE;
# check translation, start and stop
my @remapped;
push( @remapped, @{$self->remap_genes( $self->combined_genes, undef )} );
push( @remapped, @{$self->remap_genes( $self->unmatched_genes, $self->EXTEND_BIOTYPE_OF_UNCHANGED_GENES )} );
#results are stored in "output"
$self->output(@remapped);
print "TOTAL GENES OUT: ".(scalar @remapped)."\n";
if($self->VERBOSE){
foreach ( @remapped ) {
print "final biotype : " . $_->biotype . "\n";
}
}
}
=head2 filter_genes
Arg [1] : ref to array of Bio::EnsEMBL::Gene
Description: We do not want to add UTRs to fragments of genes
otherwise they get boosted in importance during the final gene build
and produce rubbish alternative transcripts and in the worst cases
can misjoin clusters especially if there is some
misassembled/duplicated sequence. One solution is to cluster the
genewises at this stage in much the same way as we do during the
GeneBuilder run, and attempt to add UTRs ony to the best ones eg
fullest genomic extent.
Also checks for existing UTR, these genes are just passed on
Returntype : ref to array of Bio::EnsEMBL::Gene
=cut
sub filter_genes {
my ($self, $genesref, $blessed) = @_;
my @tested_genes;
my $pruned_CDS;
#run tests: get rid of long-intron genes, etc.
GENES:
foreach my $test_gene (@$genesref){
if( $test_gene ){
my $test_transcript = $test_gene->get_all_Transcripts->[0];
#would like to avoid slow sorting call - but the "get_all_Transcripts" does not do this job properly...
$test_transcript->sort;
if($test_transcript->three_prime_utr or $test_transcript->five_prime_utr){
print STDERR "Gene has UTR already. Skipping.\n" if $self->VERBOSE;
$self->unmatched_genes($test_gene);
next GENES;
}
if($blessed){
#for blessed genes only check the region
if($test_transcript->start < 1 && $test_transcript->end > 1){
print STDERR "ignoring blessed gene: falls off the slice by its lower end\n" if $self->VERBOSE;
}
else{
push(@tested_genes, $test_gene);
}
}
else{
if( is_Transcript_sane($test_transcript)
&& all_exons_are_valid($test_transcript, $self->MAX_EXON_LENGTH)
&& intron_lengths_all_less_than_maximum($test_transcript, $self->MAX_INTRON_LENGTH)
&& ($test_transcript->start < $self->query->length)
&& ($test_transcript->end > 1)
&& has_no_unwanted_evidence($test_transcript) ) {
push(@tested_genes, $test_gene);
}
else{
#test if it falls off the slice on the left, these will be picked up by another job
if($test_transcript->start < 1 && $test_transcript->end > 1){
print STDERR "ignoring gene: falls off the slice by its lower end\n" if $self->VERBOSE;
}
else{
#keep them in the set
$self->unmatched_genes($test_gene);
}
}
}
}
else{
throw "\nERROR: Cant check genes!\n\n";
}
}
$genesref = \@tested_genes;
#cluster genes (stores internally)
if(!$blessed){
# all the genes are single transcript at this stage, cluster them
my $clustered_genes = $self->cluster_CDS($genesref);
#pruning if desired
if($self->prune){
$genesref = $self->prune_CDS($clustered_genes);
}
}
return $genesref;
}
=head2 cluster_CDS
Arg [1] : ref to array of Bio::EnsEMBL::Gene
Description: Separates CDSs according to strand and then clusters
each set by calling Bio::EnsEMBL::Analysis::Tools::Algorithms::ClusterUtils:cluster_Genes
Returntype : ref to array of Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster
=cut
sub cluster_CDS {
my ($self, $CDSref) = @_;
my @forward_CDS;
my @reverse_CDS;
foreach my $gene (@$CDSref){
my @exons = @{ $gene->get_all_Exons };
if ( $exons[0]->strand == 1 ){
push( @forward_CDS, $gene );
}
else{
push( @reverse_CDS, $gene );
}
}
my (@clusters, @forward_clusters, @reverse_clusters);
my ($forward_clusters, $forward_clusters_rest);
my ($reverse_clusters, $reverse_clusters_rest);
if ( @forward_CDS ){
($forward_clusters, $forward_clusters_rest) = cluster_Genes( \@forward_CDS, $self->{evidence_sets} );
push(@forward_clusters, @$forward_clusters);
push(@forward_clusters, @$forward_clusters_rest);
}
if ( @reverse_CDS ){
($reverse_clusters, $reverse_clusters_rest) = cluster_Genes( \@reverse_CDS, $self->{evidence_sets} );
push(@reverse_clusters, @$reverse_clusters);
push(@reverse_clusters, @$reverse_clusters_rest);
}
#store the two cluster sets and return combined set
if ( scalar @forward_clusters ){
#store for later
$self->forward_genewise_clusters(\@forward_clusters);
push( @clusters, @forward_clusters);
}
if ( scalar @reverse_clusters ){
#store for later
$self->reverse_genewise_clusters(\@reverse_clusters);
push( @clusters, @reverse_clusters);
}
return \@clusters;
}
=head2 prune_CDS
Arg [1] : ref to array of GeneClusters
Description: rejects duplicate CDS, makes sure they are kept on
unmatched_genes or they (and their supporting evidence)
will be lost for the rest of the build
Note: pruning will not be used unless specified otherwise
Returntype : ref to array of Bio::EnsEMBL::Gene
=cut
sub prune_CDS {
my ($self, $gene_cluster_ref) = @_;
my $cluster_count = 0;
my @pruned_transcripts;
my @pruned_genes;
CLUSTER:
foreach my $gene_cluster ( @$gene_cluster_ref ){
$cluster_count++;
my @unpruned_genes;
#check for multi-transcript genes
foreach my $unpruned_gene ($gene_cluster->get_Genes){
if(scalar(@{$unpruned_gene->get_all_Transcripts}) > 1){
throw($unpruned_gene->dbID . " has >1 transcript - can't handle it yet \n");
}
push(@unpruned_genes, $unpruned_gene);
}
# sort the unpruned_genes by total exon length of their single transcripts - there are no UTRs to worry about yet
@unpruned_genes = sort{$b->get_all_Transcripts->[0]->length <=> $a->get_all_Transcripts->[0]->length} @unpruned_genes;
# do we really just want to take the first transcript only?
# If there's a very long single exon gene we will lose any underlying multi-exon transcripts
# this may increase problems with the loss of valid single exon genes as mentioned below.
# it's a balance between keeping multi exon transcripts and losing single exon ones
my $maxexon_number = 0;
foreach my $gene (@unpruned_genes){
my $exon_number = scalar(@{$gene->get_all_Transcripts->[0]->get_all_Exons});
if ( $exon_number > $maxexon_number ){
$maxexon_number = $exon_number;
}
}
if ($maxexon_number == 1){ # ie the longest transcript is a single exon one
# take the longest:
push (@pruned_genes, $unpruned_genes[0]);
print STDERR "VAC: found single_exon_transcript: " .$unpruned_genes[0]->dbID. "\n" if $self->VERBOSE;
shift @unpruned_genes;
$self->unmatched_genes(@unpruned_genes);
next CLUSTER;
}
# otherwise we need to deal with multi exon transcripts and reject duplicates.
# links each exon in the transcripts of this cluster with a hash of other exons it is paired with
my %pairhash;
# allows retrieval of exon objects by exon->id - convenience
my %exonhash;
# prune redundant transcripts
GENE:
foreach my $gene (@unpruned_genes) {
my @exons = @{$gene->get_all_Transcripts->[0]->get_all_Exons};
my $i = 0;
my $found = 1;
# 10.1.2002 VAC we know there's a potential problem here - single exon transcripts which are in a
# cluster where the longest transcript has > 1 exon are not going to be considered in
# this loop, so they'll always be marked "transcript already seen"
# How to sort them out? If the single exon overlaps an exon in a multi exon transcript then
# by our rules it probably ought to be rejected the same way transcripts with shared exon-pairs are.
# Tough one.
EXONS:
for ($i = 0; $i < $#exons; $i++) {
my $foundpair = 0;
my $exon1 = $exons[$i];
my $exon2 = $exons[$i+1];
# go through the exon pairs already stored in %pairhash.
# If there is a pair whose exon1 overlaps this exon1, and
# whose exon2 overlaps this exon2, then these two transcripts are paired
EXONHASH:
foreach my $first_exon_id (keys %pairhash) {
my $first_exon = $exonhash{$first_exon_id};
foreach my $second_exon_id (keys %{$pairhash{$first_exon}}) {
my $second_exon = $exonhash{$second_exon_id};
if ( $exon1->overlaps($first_exon) && $exon2->overlaps($second_exon) ) {
$foundpair = 1;
last EXONHASH;
# this method allows a transcript to be covered by exon pairs
# from different transcripts, rejecting possible
# splicing variants. Needs rethinking
}
}
}
if ($foundpair == 0) { # ie this exon pair does not overlap with a pair yet found in another transcript
$found = 0; # ie currently this transcript is not paired with another
# store the exons so they can be retrieved by id
$exonhash{$exon1} = $exon1;
$exonhash{$exon2} = $exon2;
# store the pairing between these 2 exons
$pairhash{$exon1}{$exon2} = 1;
}
} # end of EXONS
# decide whether this is a new transcript or whether it has already been seen
if ($found == 0) {
push(@pruned_genes, $gene);
}
elsif ($found == 1 && $#exons == 0){
$self->unmatched_genes($gene);
}
else {
# dont know what this is about...
$self->unmatched_genes($gene);
if ( $gene == $unpruned_genes[0] ){
}
}
} # end of this gene
} #end CLUSTER
return \@pruned_genes;
}
=head2 run_matching
Arg [1] : ref to array of Bio::EnsEMBL::Gene
Arg [2] : string representing genetype to be associated with UTR-modified genes
Arg [3] : bool indicating that we are NOT trying to map to predefined cDNA (blessed genes)
Description: main function which matches CDS (genewise) predictions to cDNA alignments
considering only terminal CDS exons, looking for "knowns" first, optionally
using ESTs and ditags.
Returntype : none
=cut
sub run_matching{
my ($self, $genesref, $combined_genetype, $blessed) = @_;
# merge exons with frameshifts into a big exon
my @merged_genes = ();
#maybe we should not do merging with blessed genes at all?!
@merged_genes = $self->_merge_genes($genesref);
print STDERR "\n --- \ngot " . scalar(@merged_genes) . " merged " . $combined_genetype . " genes\n" if $self->VERBOSE;
# sort genewises by exonic length and genomic length
@merged_genes = sort {
my $result = ( ($b->get_all_Transcripts->[0]->end - $b->get_all_Transcripts->[0]->start + 1) <=>
($a->get_all_Transcripts->[0]->end - $b->get_all_Transcripts->[0]->start + 1) );
unless ($result){
return ( ($b->get_all_Transcripts->[0]->length) <=>
($a->get_all_Transcripts->[0]->length) );
}
return $result;
} @merged_genes;
# find one-2-one matching between proteins and cdnas
CDS:
foreach my $cds (@merged_genes){
print STDERR "--next cds: ".$cds->seq_region_start."-".$cds->seq_region_end." --\n" if $self->VERBOSE;
# should be only 1 transcript
my @cds_tran = @{$cds->get_all_Transcripts};
my @cds_exons = @{$cds_tran[0]->get_all_Exons}; # ordered array of exons
my $strand = $cds_exons[0]->strand;
my $cdna_match;
my $usingKnown = 0;
if($cds_exons[$#cds_exons]->strand != $strand){
warn("first and last cds exons have different strands - can't make a sensible combined gene\n");
# get and store unmerged version of cds
my $unmerged_cds = $self->retrieve_unmerged_gene($cds);
$self->unmatched_genes($unmerged_cds);
next CDS;
}
#look for a pre-defined pairing between a protein and a cDNA the gene was build from
my $predef_match = undef;
my $combined_transcript = undef;
if($self->LOOK_FOR_KNOWN){
$predef_match = $self->check_for_predefined_pairing($cds, $blessed);
if(defined $predef_match){
$combined_transcript = $self->combine_genes($cds, $predef_match);
$combined_transcript->sort;
# just check combined transcript works before throwing away the original transcript
if ( $combined_transcript
&& is_Transcript_sane($combined_transcript)
&& all_exons_are_valid($combined_transcript, $self->MAX_EXON_LENGTH, 1)
&& intron_lengths_all_less_than_maximum($combined_transcript, $self->MAX_INTRON_LENGTH)
&& validate_Translation_coords($combined_transcript, 1)
&& !contains_internal_stops($combined_transcript)
&& $combined_transcript->translate
&& has_no_unwanted_evidence($combined_transcript) ){
# make sure combined transcript doesn't misjoin any genewise clusters
if($self->find_cluster_joiners($combined_transcript)){
print STDERR "Found a cluster_joiner!\n" if $self->VERBOSE;
print STDERR $combined_transcript->seq_region_start."/".$combined_transcript->seq_region_end."\n" if $self->VERBOSE;
#dont use this one
$combined_transcript = undef;
}
}
else{
print SDERR "Did not pass tests.\n" if $self->VERBOSE;
$combined_transcript = undef;
}
}
}
if($predef_match && $combined_transcript){
#use this cDNA
print STDERR "Using predefined cDNA!\n" if $self->VERBOSE;
$cdna_match = $predef_match;
$usingKnown = 1;
}
else{
# find matching cdnas using scoring of all evidence available
my ($matching_cdnas, $utr_length_hash, $UTR_side_indicator_hash) = $self->match_protein_to_cdna($cds, 0);
#we probably dont need any of this UTR stuff anymore!?
my %utr_length_hash = %$utr_length_hash;
if(!(scalar @$matching_cdnas)){
warn("Could not identify any matching cDNA!\n");
#try to use ESTs instead
#we could use coalescer code to produce joined ESTs (find longest path, etc.)?
($matching_cdnas, $utr_length_hash, $UTR_side_indicator_hash) = $self->match_protein_to_cdna($cds, 1);
%utr_length_hash = %$utr_length_hash;
if(!(scalar @$matching_cdnas)){
my $unmerged_cds = $self->retrieve_unmerged_gene($cds);
warn("Could not identify any matching ESTs either!\n");
$self->unmatched_genes($unmerged_cds);
next CDS;
}
}
## scoring code...
#convert genes to extended objects
my $matching_extended_cdnas = $self->convert_to_extended_genes($matching_cdnas);
#cluster matching cDNA or EST genes
#call cluster method from ClusterUtils
# print "EXTENDED : ",join(' ',@{$matching_extended_cdnas}),"\n";
#foreach my $keys (keys %{$self->{evidence_sets}}){
# print "Evidence: ",@{$self->{evidence_sets}{$keys}},"\n";
#}
my ($clusters, $non_clusters) = cluster_Genes( $matching_extended_cdnas, $self->{evidence_sets} ) ;
#store genes seperated by strand #USAGE??
my $genes_by_strand;
my @possible_transcripts;
foreach my $gene (@$matching_extended_cdnas){
push @{$genes_by_strand->{$gene->strand}}, $gene;
}
#apply collapsing method from TranscriptConsensus (creates the scores)
my @cluster_to_use = ();
if(scalar @$clusters){
push(@cluster_to_use, @$clusters);
}
if(scalar @$non_clusters){
push(@cluster_to_use, @$non_clusters);
}
foreach my $cluster (@cluster_to_use){
print STDERR "CLUSTER: ".$cluster->start." ".$cluster->end." ".$cluster->strand."\n" if $self->VERBOSE;
my $collapsed_cluster = Bio::EnsEMBL::Analysis::Runnable::TranscriptConsensus->collapse_cluster($cluster, $genes_by_strand);
my @potential_genes = $cluster->get_Genes;
foreach my $potential_gene (@potential_genes){
foreach my $potential_trans (@{$potential_gene->get_all_Transcripts}) {
#create an extended transcript (with scores)
my @exon_array = ( @{$potential_trans->get_all_Exons} );
my $new_transcript = Bio::EnsEMBL::Analysis::Runnable::TranscriptConsensus->transcript_from_exons(
\@exon_array, undef, $collapsed_cluster);
#add supporting features
$new_transcript->add_supporting_features(@{$potential_trans->get_all_supporting_features});
#add a ditag score
print STDERR "new_transcript score :".$new_transcript->score()."\n" if $self->VERBOSE;
my ($ditag_score, $index) = $self->score_ditags($self->ditags, $new_transcript, 0);
$new_transcript->score($new_transcript->score() + $ditag_score);
#add a UTR length score
my $UTR_score = $self->calculate_UTR_score($cds_tran[0], $new_transcript);
$new_transcript->score($new_transcript->score() + $UTR_score);
push(@possible_transcripts, $new_transcript);
}
}
} #clusters
#get the highest scoring transcript, that survives tests
@possible_transcripts = sort { $a->score <=> $b->score } @possible_transcripts if @possible_transcripts;
my ($cdnaname, $genename);
my $round = 0;
POS:
while(my $chosen_transcript = pop @possible_transcripts){
my $chosen_feats = $chosen_transcript->get_all_supporting_features;
#make a gene from this
$cdna_match = Bio::EnsEMBL::Gene->new;
$cdna_match->slice($chosen_transcript->slice);
$cdna_match->add_Transcript($chosen_transcript);
$cdna_match->analysis($chosen_transcript->analysis);
$cdna_match->get_all_Transcripts->[0]->add_supporting_features(@$chosen_feats);
#combine it with the cds gene
$combined_transcript = $self->combine_genes($cds, $cdna_match);
# just check combined transcript works before throwing away the original transcript
if ( defined($combined_transcript)){
$combined_transcript->sort;
if(is_Transcript_sane($combined_transcript)
&& all_exons_are_valid($combined_transcript, $self->MAX_EXON_LENGTH, 1)
&& intron_lengths_all_less_than_maximum($combined_transcript, $self->MAX_INTRON_LENGTH)
&& has_no_unwanted_evidence($combined_transcript)
){
# make sure combined transcript doesn't misjoin any genewise clusters
if($self->find_cluster_joiners($combined_transcript)){
print STDERR "Found a cluster_joiner!\n" if $self->VERBOSE;
print STDERR $combined_transcript->seq_region_start."/".$combined_transcript->seq_region_end."\n" if $self->VERBOSE;
#dont use this one
$combined_transcript = undef;
}
}
else{
print STDERR "Didn't pass checks.\n" if $self->VERBOSE;
$combined_transcript = undef;
}
}
else{
$combined_transcript = undef;
}
if(defined $combined_transcript){
#leave the loop
last POS;
}
}
} #find match
if (defined $combined_transcript){
#transfer evidence
$combined_transcript = $self->_transfer_evidence($combined_transcript, $cdna_match);
# #make the new gene with UTR
# my $genetype;
# if($usingKnown){
# $genetype = $self->KNOWN_UTR_GENETYPE;
# } else{
# if ( $self->EXTEND_ORIGINAL_BIOTYPE && (length($self->EXTEND_ORIGINAL_BIOTYPE)>0)) {
# my $sep = "";
# $sep = "_" unless ( $self->UTR_GENETYPE =~ m/^_/ );
# $genetype = $combined_transcript->biotype. $sep. $self->UTR_GENETYPE ;
# } else {
# $genetype = $combined_genetype;
# }
# }
#set biotype
my $genetype;
if ( $self->EXTEND_ORIGINAL_BIOTYPE && (length($self->EXTEND_ORIGINAL_BIOTYPE)>0)) {
my $sep = "";
if($usingKnown){
$sep = "_" unless ( $self->KNOWN_UTR_GENETYPE =~ m/^_/ );
$genetype = $combined_transcript->biotype. $sep. $self->KNOWN_UTR_GENETYPE;
}
else{
$sep = "_" unless ( $self->UTR_GENETYPE =~ m/^_/ );
$genetype = $combined_transcript->biotype. $sep. $self->UTR_GENETYPE;
}
}
else {
if($usingKnown){
$genetype = $self->KNOWN_UTR_GENETYPE;
}
else{
$genetype = $combined_genetype;
}
}
#print STDERR "MAKING_GENE FROM ".." AND ".$cdna_match->hit_name."\n";
my $combined_genes = $self->make_gene($genetype, $combined_transcript);
#check phases, etc.- if it is not a blessed gene
my $combined_gene;
if(!$blessed){
$combined_gene = $self->look_for_both($combined_genes->[0]);
my $combined_transcript = $combined_gene->get_all_Transcripts->[0];
$combined_transcript->sort;
if(! (is_Transcript_sane($combined_transcript)
&& all_exons_are_valid($combined_transcript, $self->MAX_EXON_LENGTH, 1)
&& intron_lengths_all_less_than_maximum($combined_transcript, $self->MAX_INTRON_LENGTH)
&& validate_Translation_coords($combined_transcript, 1)
&& !contains_internal_stops($combined_transcript)
&& $combined_transcript->translate
&& has_no_unwanted_evidence($combined_transcript) )
){
#revert to previous version if tests failed
$combined_gene = $combined_genes->[0];
}
}
else{
#use blessed genes without modifications
$combined_gene = $combined_genes->[0];
}
#store as combined
$self->combined_genes($combined_gene);
print STDERR "RESULT-CombinedGene: ".$combined_gene->seq_region_name." ".$combined_gene->seq_region_start.
"-".$combined_gene->seq_region_end."\n";
}
else{
#retrieving unmerged if no UTR could be defined
my $unmerged_cds = $self->retrieve_unmerged_gene($cds);
$self->unmatched_genes($unmerged_cds);
print STDERR "RESULT-UnmergedGene: ".$unmerged_cds->seq_region_name." ".$unmerged_cds->seq_region_start.
"-".$unmerged_cds->seq_region_end."\n";
next CDS;
}
}
print STDERR "At the end of the matching I have ".(scalar @{$self->combined_genes})." combined_genes genes".
" and ".(scalar @{$self->unmatched_genes})." unmatched_genes\n" if $self->VERBOSE;
if(!$blessed && ($self->{'remove_redundant'})){
#remove redundatant models from the umatched group
my $unique_unmatched_genes = $self->remove_redundant_models($self->combined_genes, $self->unmatched_genes);
#replace all unmatched genes
$self->{'_unmatched_genes'} = [];
$self->unmatched_genes(@{$unique_unmatched_genes});
print STDERR "without redundant models, we have ".(scalar @{$self->combined_genes})." combined_genes genes".
" and ".(scalar @{$self->unmatched_genes})." unmatched_genes\n" if $self->VERBOSE;
}
}
=head2 calculate_UTR_score
Arg [1] : genewise transcript
Arg [2] : transcript adding UTR region
Description: calculate a score favouring transcripts that add UTR on both sides
Could also favour longer UTRs if desired
Returntype : int score
=cut
sub calculate_UTR_score {
my ($self, $gw_gene, $matching_gene) = @_;
my $BONUS_FOR_BOTH_SIDES = 1;
my $UTR_score = 0;
if(($gw_gene->start - $matching_gene->start) && ($matching_gene->end - $gw_gene->end)){
$UTR_score += $BONUS_FOR_BOTH_SIDES;
}
#...to be extended if needed
return $UTR_score;
}
=head2 score_ditags
Arg [1] : array ref with mapped ditags to analyse
Arg [2] : transcript to analyse
Arg [3] : (optional) int index, where to start looking in the ditag-array
Description: score ditags in the region of a exon, transcripts or est
that might support it
Give a higher score to those features that match well
to ditags and have a high tag_count.
Returntype : int score: high value indicates many/well matching ditags
int index: updated array index
Exceptions : none
=cut
sub score_ditags{
my ($self, $ditags, $feature, $index) = @_;
my $ditag_score = 0;
if(!$index){ $index = 0 }
#check start & end seperately, use best matching
for(my $i = $index; $i < (scalar @$ditags); $i++){
my $ditag = $ditags->[$i];
if(($ditag->{'start'} < $feature->end) && ($ditag->{'end'} > $feature->start)){
my $start_distance = abs($ditag->{'start'} - $feature->start);
my $end_distance = abs($ditag->{'end'} - $feature->end);
if(($start_distance < $self->DITAG_WINDOW) && ($end_distance < $self->DITAG_WINDOW)){
#matching ditag; produce a score favoring those transcripts,
#that have a high ditag count &/| perfectly positioned ditags.
#magic equation to generate a score
my $score = 0;
$score += ($ditag->{'tag_count'} / (($start_distance +1) * 0.9));
$score += ($ditag->{'tag_count'} / (($end_distance +1) * 0.9));
if($score > 0){ $ditag_score += $score; }
}
}
if($ditag->{'end'} < $feature->start){
$index++;
}
elsif($ditag->{'start'} > $feature->start){
last;
}
}
print STDERR " returning ditagscore $ditag_score.\n" if $self->VERBOSE;
return($ditag_score, $index);
}
=head2 check_for_predefined_pairing
Arg [1] : Bio::EnsEMBL::Gene CDS gene
Arg [2] : Boolean to indicate blessed gene
Description: check whether there is a specific cDNA assigned to the given gene model
Returntype : Bio::EnsEMBL::Gene cDNA gene
=cut
sub check_for_predefined_pairing {
my ($self, $gene, $blessed) = @_;
my $cdna = undef;
my $protein_id;
my $cdna_id;
# get hash with cDNAs: $cdna_evidence{$evidence->hseqname()} = $cdna-gene;
my $cdna_evidence = $self->_cdna_evidence();
# get the protein id, the gene was built from;
# do this for blessed genes also, as this should be most reliable
EXON:
foreach my $exon(@{$gene->get_all_Exons}){
my @feat = @{$exon->get_all_supporting_features};
foreach my $feat(@feat){
$protein_id = $feat->hseqname;
last EXON if (defined $protein_id);
}
}
if ((!defined $protein_id || $protein_id eq '') && $blessed){
#try to use RefSeq xrefs for the blessed genes
#The CCDS genes are stored with NM-entries in the xref table,
#so these can be used directly as cDNA-ids.
#Might have to be adjusted if this changes.
$protein_id = 'blessed';
my @xrefs = @{$gene->get_all_DBLinks()};
if(scalar @xrefs){
foreach my $xref (@xrefs){
if($xref->display_id =~ "^NM_"){
$cdna_id = $xref->display_id;
$cdna_id =~ s/\.\S+$//;
print STDERR "have xref $cdna_id\n" if $self->VERBOSE;
last;
}
}
}
else{ print STDERR "no suitable xref\n" if $self->VERBOSE; }
}
else{
if (!defined $protein_id || $protein_id eq ''){
print STDERR "no protein_id for gene.\n" if $self->VERBOSE;
return undef;
}
# Using the protein id of the targetted gene, determine the
# corresponding cDNA id from the pre-loaded hash.
#remove version info
$protein_id =~ s/\.\S+$//;
$cdna_id = $self->get_cdna_id_from_protein_id($protein_id);
}
if (!defined $cdna_id || $cdna_id eq ''){
print STDERR "no predefined cDNA found.\n" if $self->VERBOSE;
return undef;
}
else{
print STDERR "found predefined cDNA $cdna_id for $protein_id\n" if $self->VERBOSE;
}
#make sure it's not on the kill list
if(defined ($self->kill_list()->{$cdna_id})){
print STDERR "skipping " . $cdna_id . " present in kill list\n";
return undef;
}
#get the gene for this cDNA
my $cdna_gene = $cdna_evidence->{$cdna_id};
#check if they really are overlapping to avoid disappointment when combining
if($cdna_gene){
if($self->VERBOSE){ print STDERR "found cdna gene $cdna_gene / $cdna_id.\n" }
if( !(($gene->seq_region_name eq $cdna_gene->seq_region_name)
&& ($gene->strand == $cdna_gene->strand)
&& ($gene->seq_region_start < $cdna_gene->seq_region_end)
&& ($cdna_gene->seq_region_start < $gene->seq_region_end)
&& ($gene->get_all_Transcripts->[0]->get_all_Exons->[0]->strand
== $cdna_gene->get_all_Transcripts->[0]->get_all_Exons->[0]->strand)) ){
print STDERR "not overlapping properly, not using." if $self->VERBOSE;
$cdna_gene = undef;
}
}
else {
print STDERR "Unable to fetch cDNA gene [$cdna_id].\n";
return undef;
}
return $cdna_gene;
}
=head2 get_cdna_id_from_protein_id
Arg [1] : String protein-id
Description: Find the cDNA-id a given protein-id is linked to as a "known" evidence
Returntype : String cDNA-id
=cut
sub get_cdna_id_from_protein_id {
my ($self, $protein_id) = @_;
my $cdna_id = undef;
if(defined($self->_known_pairs()->{$protein_id})){
$cdna_id = $self->_known_pairs()->{$protein_id};
}
return $cdna_id;
}
=head2 create_predefined_pairing
Arg [1] : none
Description: Read GenBank sequence file linking NP proteins to a specific NM cDNA
to use as "known" UTR evidence
Returns : nothing
=cut
sub create_predefined_pairing {
my ($self, $known_utr_file) = @_;
print STDERR "Parsing GenBank file $known_utr_file for KnowUTR pairing.\n" if $self->VERBOSE;
open(REFSEQ, "<$known_utr_file") or die "Can't open ".$known_utr_file.": $@\n";
my $cdna_id;
my $protein_id;
my %predefined_pairing = ();
while(){
#use only these 2 fields of the file:
next unless /^VERSION|DBSOURCE/;
if(/VERSION/){
next unless /(NP\S+)/;
if(defined $protein_id){
die("previous protein_id [$protein_id] has not been cleared out\n");
}
if(defined $cdna_id){
die("previous cdna_id [$cdna_id] has not been cleared out ...\n");
}
$protein_id = $1;
}
if(/DBSOURCE/){
# don't want NCs or NGs, etc.
if(/(NC\_\S+)|(NG\_\S+)|(XM\S+)|(AC\_\S+)/){
$cdna_id = undef;
$protein_id = undef;
next;
}
if(!defined $protein_id){
die("something very wrong - no protein_id for $_\n");
}
if (defined $cdna_id){
die("previous cdna_id [$cdna_id] has not been cleared out ...\n");
}
next unless /(NM\S+)/;
$cdna_id = $1;
#remove version info
$cdna_id =~ s/\.\S+$//;
$protein_id =~ s/\.\S+$//;
if(exists($predefined_pairing{$protein_id})){ print STDERR $protein_id." allready exists!\n"; }
$predefined_pairing{$protein_id} = $cdna_id;
$cdna_id = undef;
$protein_id = undef;
}
}
close REFSEQ;
$self->_known_pairs(\%predefined_pairing);
}
=head2 find_cluster_joiners
Arg [1] : ref to array of GeneClusters
Description: screens UTR modified transcripts for those that
potentially misjoin genewise clusters, and remove them
Returntype : ref to array of Bio::EnsEMBL::Gene
=cut
sub find_cluster_joiners{
my ($self, $transcript) = @_;
my @clusters;
my $transcript_start;
my $transcript_end;
my $overlaps_previous_cluster = 0;
my $matching_clusters = 0;
if ($transcript->get_all_Exons->[0]->strand == 1){
@clusters = @{$self->forward_genewise_clusters};
$transcript_start = $transcript->start_Exon->start;
$transcript_end = $transcript->end_Exon->end;
}
else{
@clusters = @{$self->reverse_genewise_clusters};
$transcript_start = $transcript->end_Exon->start;
$transcript_end = $transcript->start_Exon->end;
}
print STDERR "transcript spans $transcript_start - $transcript_end ".
$transcript->seq_region_start." - ".$transcript->seq_region_end."\n".
"clusters span: \n" if $self->VERBOSE;
if(!scalar(@clusters)){
print STDERR "Odd, no clusters\n" if $self->VERBOSE;
return 0;
}
#NEW CODE: CLUSTERING WITH EXON COORDINATES
foreach my $cluster(@clusters){
if( _overlapping_genes($transcript, $cluster) ){
$matching_clusters++;
}
if($matching_clusters>1){
print STDERRR "transcript joins clusters - discard it\n" if $self->VERBOSE;
return 1;
}
}
return 0;
}
=head2 _overlapping_genes
Arg [1] : Bio::EnsEMBL::Transcript
Arg [2] : Bio::EnsEMBL::Analysis::Tools::Algorithms::GeneCluster
Description: Check whether any of the exons of the transcript
overlap any of the exons of the cluster.
Returntype : boolean, true if overlap exists
=cut
sub _overlapping_genes {
my ($gene1, $cluster) = @_;
# quit if genes do not have genomic overlap
if ($gene1->end < $cluster->start || $gene1->start > $cluster->end) {
return 0;
}
# overlap check based on all (noncoding + coding) Exons
foreach my $exon1 (@{$gene1->get_all_Exons}){
foreach my $exon2 (@{$cluster->get_all_Exons}){
if ( ($exon1->overlaps($exon2)) && ($exon1->strand == $exon2->strand) ){
return 1;
}
}
}
return 0;
}
=head2 _filter_cdnas
Arg [1] : ref to array of Bio::EnsEMBL::Gene
Arg [2] : flag whether these genes are EST genes rather than cDNA genes
Description: This method checks the cDNA and EST genes
translation is not checked as these genescome without
translation
Returntype : ref to array of filtered Bio::EnsEMBL::Gene
=cut
sub _filter_cdnas{
my ($self, $cdna_arrayref, $ests) = @_;
my @newcdna;
my %cdna_evidence;
cDNA_GENE:
foreach my $cdna (@{$cdna_arrayref}) {
cDNA_TRANSCRIPT:
foreach my $tran (@{$cdna->get_all_Transcripts}) {
$tran->sort;
if(!$ests){
#store cDNA gene evidence for later
foreach my $evidence (@{ $tran->get_all_supporting_features }){
#remove version number?!
my $evidence_name = $evidence->hseqname();
$evidence_name =~ s/\.\S+$//;
#print STDERR "evidence: ".$evidence->hseqname()." => ".$evidence_name."\n";
$cdna_evidence{$evidence_name} = $cdna;
}
}
# rejecting on basis of intron length may not be valid here
# - it may not be that simple in the same way as it isn;t that simple in Targetted & Similarity builds
next cDNA_TRANSCRIPT unless ( is_Transcript_sane($tran)
&& all_exons_are_valid($tran, $self->MAX_EXON_LENGTH, 1)
&& intron_lengths_all_less_than_maximum($tran, $self->MAX_INTRON_LENGTH)
&& has_no_unwanted_evidence($tran) );
push(@newcdna, $cdna);
}
}
if(!$ests){
#store cDNAs for later use
$self->_cdna_evidence(\%cdna_evidence);
}
return \@newcdna;
}
=head2 write_output
Arg [1] : none
Description: Do some cleaning up uf the result and store the genes to the database
Returntype : none
=cut
sub write_output {
my($self) = @_;
# write genes in the output database
my $gene_adaptor = $self->OUTPUT_DB->get_GeneAdaptor;
print STDERR "Have ".scalar (@{$self->output})."(".$totalgenes.") genes to write\n";
GENE:
foreach my $gene (@{$self->output}) {
if(!$gene->analysis ||
$gene->analysis->logic_name ne $self->analysis->logic_name){
$gene->analysis($self->analysis);
}
# double check gene coordinates
$gene->recalculate_coordinates;
#As all genes/exons are stored as new in the target db
#it's save to remove the old adaptor & old dbID here,
#to avoid warnings from the store function.
foreach my $exon (@{$gene->get_all_Exons}) {
$exon->adaptor(undef);
$exon->dbID(undef);
}
$gene->dbID(undef) ;
$gene->adaptor(undef);
for ( @{$gene->get_all_Transcripts} ) {
for ( @{$_->get_all_supporting_features}){
$_->dbID(undef);
$_->adaptor(undef);
}
$_->dbID(undef);
$_->adaptor(undef);
}
eval {
$gene_adaptor->store($gene);
print STDERR "wrote gene dbID " . $gene->dbID . "\t".$gene->biotype . "\n" ;
};
if( $@ ) {
print STDERR "UNABLE TO WRITE GENE " . $gene->dbID. "of type " . $gene->type . "\n\n$@\n\nSkipping this gene\n";
}
}
}
=head2 convert_to_extended_genes
Arg [1] : ref to array of Bio::EnsEMBL::Gene
Description: converts transcripts to Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended
Returntype : ref to array of TranscriptExtended
Exceptions : throws if gene has multiple transcripts
=cut
sub convert_to_extended_genes {
my ($self, $genes) =@_;
my @new_genes;
for my $g (@$genes){
#my $st = _get_evidence_set( $g->biotype );
#my $set = $g->biotype;
my ( $set ) = $self->_get_evidence_set ( $g->biotype ) ;
for my $t (@{$g->get_all_Transcripts}){
throw ("gene has more than one trancript - only processing 1-gene-1-transcript-genes")
if (@{$g->get_all_Transcripts}>1);
# conversion
my $gene_from_pt = Bio::EnsEMBL::Gene->new(
-start => $t->start ,
-end => $t->end ,
-strand => $t->strand ,
-slice => $t->slice ,
-biotype => $g->biotype,
-analysis => $t->analysis,
);
my $new_tr = Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptExtended->new(
-BIOTYPE => $g->biotype ,
-ANALYSIS => $t->analysis ,
);
$new_tr->ev_set($set);
$new_tr->add_supporting_features(@{$t->get_all_supporting_features});
my @pt_exons = @{$t->get_all_Exons} ;
for (my $i=0 ; $ibiotype($g->biotype);
$pte->ev_set($set);
$pte->end_phase(0);
$pte->phase(0);
$pte->next_exon($pt_exons[$i+1]) ;
if ($i == 0 ) {
$pte->prev_exon(0);
} else {
$pte->prev_exon($pt_exons[$i-1]);
}
$pte->transcript($new_tr) ;
$pte->analysis($t->analysis) ;
};
# Extending the Bio::EnsEMBL::Transcript object by ev_set methods
for (@pt_exons) {
$new_tr->add_Exon($_);
}
$new_tr->sort;
$gene_from_pt->add_Transcript($new_tr);
push @new_genes , $gene_from_pt;
}
}
return \@new_genes;
}
=head2 make_gene
Arg [1] : string representing genetyoe to be associated with genes
Arg [2] : array of Bio::EnsEMBL::Transcript
Description: Constructs Bio::EnsEMBL::Gene objects from UTR-modified transcripts
Returntype : none; new genes are stored in self->combined_genes
Exceptions : throws when missing genetype or analysis
=cut
sub make_gene{
my ($self, $genetype, @transcripts) = @_;
unless ( $genetype ){
throw("You must define UTR_GENETYPE in Bio::EnsEMBL::Analysis::Conf::GeneBuild::UTR_Builder");
}
# an analysis should be passed in via the RunnableDB.m parent class:
my $analysis = $self->analysis;
unless ($analysis){
throw("You have to pass an analysis to this RunnableDB through new()");
}
my @genes;
my $count=0;
foreach my $trans(@transcripts){
$trans->sort;
unless ( is_Transcript_sane($trans)
&& intron_lengths_all_less_than_maximum($trans, $self->MAX_INTRON_LENGTH)
&& all_exons_are_valid($trans, $self->MAX_EXON_LENGTH, 1)
&& has_no_unwanted_evidence($trans) ){
print STDERR "\nrejecting transcript\n";
return;
}
my $gene = new Bio::EnsEMBL::Gene;
$gene->biotype($genetype);
$trans->biotype($genetype);
$trans->analysis($analysis);
$gene->add_Transcript($trans);
$gene->analysis($analysis);
# do not modify the analysis of the supporting features
# they should be the original ones: cdna, targetted_genewise or similarity_genewise
if($self->validate_gene($gene)){
push (@genes,$gene);
$count++;
}
else{
print STDERR "\nCould not validate gene!\n";
}
}
return(\@genes);
}
=head2 match_protein_to_cdna
Arg [1] : Bio::EnsEMBL::Gene
Arg [2] : int is_est
Description: this method tries to find the cdnas that can be merged with the genewise genes.
Basically it checks for exact internal splice matching in the 5' and 3' exons of the genewise gene.
In order to match a starting genewise exon with an cDNA exon, we need to have
a. exactly coinciding exon boundaries
b. either cdna exon has start <= genewise exon start,
OR cdna exon start lies within $exon_slop bp of genewise exon start AND
the cdna transcript will add exra UTR exons.
Substitute "end" for "start" for 3prime ends of transcripts
BUT do not allow through any cDNA that will result just in a
shortened peptide without additional UTR exons.
Returntype : ref to array of Bio::EnsEMBL::Gene, ref to hash relating Bio::EnsEMBL::Gene to UTR length
Exceptions :
=cut
sub match_protein_to_cdna{
my ($self, $gw, $is_est) = @_;
print STDERR "Looking at ESTs.\n" if $is_est;
my (%UTR_hash, %UTR_side_indicator_hash);
my @other_genes;
my @matching_e2g;
my @gw_tran = @{$gw->get_all_Transcripts};
my @gw_exons = @{$gw_tran[0]->get_all_Exons};
my $strand = $gw_exons[0]->strand;
if($gw_exons[$#gw_exons]->strand != $strand){
warn("first and last gw exons have different strands ".
"- can't make a sensible combined gene\n with ".$gw_tran[0]->dbId );
return undef;
}
if ( @gw_exons ){
if ($strand == 1 ){
@gw_exons = sort { $a->start <=> $b->start } @gw_exons;
}
else{
@gw_exons = sort { $b->start <=> $a->start } @gw_exons;
}
}
else{
warn("gw gene without exons: ".$gw->dbID.", skipping it");
return undef;
}
my $exon_slop = 20;
my $cds_length = length($gw_tran[0]->translateable_seq());
my @genes;
if($is_est){
@genes = @{$self->ests};
}
else{
@genes = @{$self->cdna_genes};
}
cDNA:
foreach my $e2g(@genes){
my @egtran = @{$e2g->get_all_Transcripts};
my @eg_exons = @{$egtran[0]->get_all_Exons};
$strand = $eg_exons[0]->strand;
if($eg_exons[$#eg_exons]->strand != $strand){
warn("first and last e2g exons have different strands - skipping transcript ".$egtran[0]->dbID);
next cDNA;
}
if ($strand == 1 ){
@eg_exons = sort { $a->start <=> $b->start } @eg_exons;
}
else{
@eg_exons = sort { $b->start <=> $a->start } @eg_exons;
}
my $fiveprime_match = 0;
my $threeprime_match = 0;
my $left_exon;
my $right_exon;
my $left_diff = 0;
my $right_diff = 0;
# Lets deal with single exon genes first
if ($#gw_exons == 0) {
foreach my $current_exon (@eg_exons) {
if($current_exon->strand != $gw_exons[0]->strand){
next cDNA;
}
# don't yet deal with genewise leakage for single exon genes
if ($gw_exons[0]->end <= $current_exon->end &&
$gw_exons[0]->start >= $current_exon->start){
$fiveprime_match = 1;
$threeprime_match = 1;
$left_exon = $current_exon;
$right_exon = $current_exon;
$left_diff = $gw_exons[0]->start - $current_exon->start;
$right_diff = $current_exon->end - $gw_exons[0]->end;
}
}
}
else { # Now the multi exon genewises
cDNA_EXONS:
foreach my $current_exon (@eg_exons) {
if($current_exon->strand != $gw_exons[0]->strand){
next cDNA;
}
if($gw_exons[0]->strand == 1){
#FORWARD:
# 5prime
if ($gw_exons[0]->end == $current_exon->end &&
# either e2g exon starts before genewise exon
($current_exon->start <= $gw_exons[0]->start ||
# or e2g exon is a bit shorter but there are spliced UTR exons as well
(abs($current_exon->start - $gw_exons[0]->start) <= $exon_slop && $current_exon != $eg_exons[0]))){
$fiveprime_match = 1;
$left_exon = $current_exon;
$left_diff = $gw_exons[0]->start - $current_exon->start;
}
# 3prime
elsif($gw_exons[$#gw_exons]->start == $current_exon->start &&
# either e2g exon ends after genewise exon
($current_exon->end >= $gw_exons[$#gw_exons]->end ||
# or there are UTR exons to be added
(abs($current_exon->end - $gw_exons[$#gw_exons]->end) <= $exon_slop &&
$current_exon != $eg_exons[$#eg_exons]))){
$threeprime_match = 1;
$right_exon = $current_exon;
$right_diff = $current_exon->end - $gw_exons[0]->end;
}
}
elsif($gw_exons[0]->strand == -1){
#REVERSE:
# 5prime
if ($gw_exons[0]->start == $current_exon->start &&
# either e2g exon ends after gw exon
($current_exon->end >= $gw_exons[0]->end ||
# or there are UTR exons to be added
(abs($current_exon->end - $gw_exons[0]->end) <= $exon_slop &&
$current_exon != $eg_exons[0]))){
#print STDERR "fiveprime reverse match\n";
$fiveprime_match = 1;
$right_exon = $current_exon;
$right_diff = $current_exon->end - $gw_exons[0]->end;
}
#3prime
elsif ($gw_exons[$#gw_exons]->end == $current_exon->end &&
# either e2g exon starts before gw exon
($current_exon->start <= $gw_exons[$#gw_exons]->start ||
# or there are UTR exons to be added
(abs($current_exon->start - $gw_exons[$#gw_exons]->start) <= $exon_slop &&
$current_exon != $eg_exons[$#eg_exons]))){
#print STDERR "threeprime reverse match\n";
$threeprime_match = 1;
$left_exon = $current_exon;
$left_diff = $gw_exons[0]->start - $current_exon->start;
}
}
}
}
# can match either end, or both
if($fiveprime_match || $threeprime_match){
my ($UTR_length, $left_UTR_length, $right_UTR_length) =
$self->_compute_UTRlength($egtran[0], $left_exon, $left_diff, $right_exon, $right_diff);
my $UTR_diff = $egtran[0]->length;
#make sure CDS is not much smaller than UTR
if(($cds_length * 10) > $UTR_diff){
$UTR_hash{$e2g} = $UTR_length;
$UTR_side_indicator_hash{$e2g} = 1;
print STDERR "considering cDNA ".$e2g->seq_region_start."-".$e2g->seq_region_end."[".($cds_length * 10)." > ".$UTR_diff."]\n";
push(@matching_e2g, $e2g);
}
else{
print STDERR "didnt pass UTR length check [".($cds_length * 10)." > ".$UTR_diff."]: ".$e2g->seq_region_start."-".$e2g->seq_region_end."\n" if $self->VERBOSE;
}
}
}
print STDERR "returning ".(scalar @matching_e2g)." candidates.\n";
return (\@matching_e2g,\%UTR_hash, \%UTR_side_indicator_hash);
}
=head2 _compute_UTRlength
Arg [1] : eg-transcript
Arg [2] : first matching exon
Arg [3] : (int) UTR bases of first matching exon
Arg [4] : last matching exon
Arg [5] : (int) UTR bases of first matching exon
Description: add up genomic extend of UTR regions
Returntype : int (basepairs) total UTR-length, left UTR-length, right UTR-lenght
Exceptions : none
=cut
sub _compute_UTRlength{
my ($self, $transcript, $left_exon, $left_diff, $right_exon, $right_diff) = @_;
my $strand = $transcript->start_Exon->strand;
my @exons = sort { $a->start <=> $b->start } @{ $transcript->get_all_Exons };
my $UTRlength = 0;
my $in_UTR = 1;
my $start_flag = 0;
my $left_UTR = $left_diff;
my $right_UTR = $right_diff;
foreach my $exon ( @exons ){
if ( defined $left_exon && $exon == $left_exon ){
$UTRlength += $left_diff;
$in_UTR = 0;
}
elsif( defined $right_exon && $exon == $right_exon ){
$UTRlength += $right_diff;
$in_UTR = 1;
}
elsif( $in_UTR == 1 ){
$UTRlength += $exon->length;
if(!$start_flag){
$left_UTR += $UTRlength;
$start_flag = 1;
}
else{
$right_UTR += $UTRlength;
}
}
}
return($UTRlength, $left_UTR, $right_UTR);
}
=head2 _merge_genes
Arg [1] : ref to array of Bio::EnsEMBL::Gene
Description: merges adjacent exons if they are frameshifted; stores
component exons as subSeqFeatures of the merged exon
Returntype : ref to array of Bio::EnsEMBL::Gene
=cut
sub _merge_genes {
my ($self, $genesref) = @_;
my @merged;
my $count = 1;
UNMERGED_GENE:
foreach my $unmerged (@{$genesref}){
my $gene = new Bio::EnsEMBL::Gene;
$gene->dbID($unmerged->dbID);
my @pred_exons;
my $ecount = 0;
# order is crucial
my @trans = @{$unmerged->get_all_Transcripts};
if(scalar(@trans) != 1) {
throw("Gene with dbID " . $unmerged->dbID .
" has NO or more than one related transcript, where a 1-gene-to-1-transcript-relation is assumed.".
" Check preceding analysis \n");
}
$trans[0]->sort;
# check the sanity of the transcript
if( !is_Transcript_sane($trans[0])
|| !all_exons_are_valid($trans[0], $self->MAX_EXON_LENGTH, 1)
|| !has_no_unwanted_evidence($trans[0])
|| !intron_lengths_all_less_than_maximum($trans[0], $self->MAX_INTRON_LENGTH) ){
print STDERR "transcript ".$unmerged->dbID."did NOT pass sanity check!\n";
print STDERR "found at ".$unmerged->seq_region_name.", ".$unmerged->seq_region_start.", ".$unmerged->seq_region_end."\n";
next UNMERGED_GENE;
}
### we follow here 5' -> 3' orientation ###
# $trans[0]->sort;
my $cloned_translation = new Bio::EnsEMBL::Translation;
my @unmerged_exons = @{$trans[0]->get_all_Exons};
my $strand = $unmerged_exons[0]->strand;
my $previous_exon;
EXON:
foreach my $exon(@unmerged_exons){
## genewise frameshift? we merge here two exons separated by max 10 bases into a single exon
#if ($ecount && $pred_exons[$ecount-1]){
# $previous_exon = $pred_exons[$ecount-1];
#}
$ecount++;
my $separation = 0;
my $merge_it = 0;
## we put every exon following a frameshift into the first exon before the frameshift
## following the ordering 5' -> 3'
if (defined($previous_exon)){
#print STDERR "previous exon: ".$previous_exon->start."-".$previous_exon->end."\n";
#print STDERR "current exon : ".$exon->start."-".$exon->end."\n";
if ($strand == 1){
$separation = $exon->start - $previous_exon->end - 1;
}
elsif( $strand == -1 ){
$separation = $previous_exon->end - $exon->start - 1;
}
if ($separation <=10){
$merge_it = 1;
}
}
if ( defined($previous_exon) && $merge_it == 1){
# combine the two
# the first exon (5'->3' orientation always) is the containing exon,
# which gets expanded and the other exons are added into it
#print STDERR "merging $exon into $previous_exon\n";
#print STDERR $exon->start."-".$exon->end." into ".$previous_exon->start."-".$previous_exon->end."\n";
if ($strand == 1) {
$previous_exon->end($exon->end);
} else {
$previous_exon->start($exon->start);
}
$previous_exon->add_sub_SeqFeature($exon,'');
# if this is end of translation, keep that inf:
if ( $exon == $trans[0]->translation->end_Exon ){
$cloned_translation->end_Exon( $previous_exon );
$cloned_translation->end($trans[0]->translation->end);
}
my %evidence_hash;
foreach my $sf( @{$exon->get_all_supporting_features}){
if ( $evidence_hash{$sf->hseqname}{$sf->hstart}{$sf->hend}{$sf->start}{$sf->end} ){
next;
}
#print STDERR $sf->start."-".$sf->end." ".$sf->hstart."-".$sf->hend." ".$sf->hseqname."\n";
$evidence_hash{$sf->hseqname}{$sf->hstart}{$sf->hend}{$sf->start}{$sf->end} = 1;
$previous_exon->add_supporting_features($sf);
}
next EXON;
}
else{
# make a new Exon - clone using Bio::EnsEMBL::Analysis::Tools::GeneB uildUtils::ExonUtils
my $cloned_exon = clone_Exon($exon);
$cloned_exon->add_sub_SeqFeature($exon,'');
# if this is start/end of translation, keep that info:
if ( $exon == $trans[0]->translation->start_Exon ){
$cloned_translation->start_Exon( $cloned_exon );
$cloned_translation->start($trans[0]->translation->start);
}
if ( $exon == $trans[0]->translation->end_Exon ){
$cloned_translation->end_Exon( $cloned_exon );
$cloned_translation->end($trans[0]->translation->end);
}
$previous_exon = $cloned_exon;
push(@pred_exons, $cloned_exon);
}
}
# transcript
my $merged_transcript = new Bio::EnsEMBL::Transcript;
$merged_transcript->dbID($trans[0]->dbID);
foreach my $pe(@pred_exons){
$merged_transcript->add_Exon($pe);
}
$merged_transcript->sort;
$merged_transcript->translation($cloned_translation);
my @seqeds = @{$trans[0]->translation->get_all_SeqEdits};
if (scalar(@seqeds)) {
print "Copying sequence edits\n" if $self->VERBOSE;
foreach my $se (@seqeds) {
my $new_se =
Bio::EnsEMBL::SeqEdit->new(
-CODE => $se->code,
-NAME => $se->name,
-DESC => $se->description,
-START => $se->start,
-END => $se->end,
-ALT_SEQ => $se->alt_seq
);
my $attribute = $new_se->get_Attribute();
$cloned_translation->add_Attributes($attribute);
}
}
my @support = @{$trans[0]->get_all_supporting_features};
if (scalar(@support)) {
$merged_transcript->add_supporting_features(@support);
}
$merged_transcript->add_Attributes(@{$trans[0]->get_all_Attributes});
# and gene
$gene->add_Transcript($merged_transcript);
$gene->biotype($unmerged->biotype);
push(@merged, $gene);
$count++;
# store match between merged and original gene so we can easily retrieve the latter if we need to
$self->merged_unmerged_pairs($gene,$unmerged);
} # end UNMERGED_GENE
return @merged;
}
=head2 combine_genes
Arg [1] : genewise gene
Arg [2] : cDNA gene
Description: combine gene with matching cDNA
Returntype : Bio::EnsEMBL::Transcript
=cut
sub combine_genes{
my ($self, $gw, $e2g) = @_;
my $modified_peptide = 0;
my @combined_transcripts = ();
# should be only 1 transcript
my @gw_tran = @{$gw->get_all_Transcripts};
$gw_tran[0]->sort;
my @gw_exons = @{$gw_tran[0]->get_all_Exons}; # ordered array of exons
my @egtran = @{$e2g->get_all_Transcripts};
$egtran[0]->sort;
my @e2g_exons = @{$egtran[0]->get_all_Exons}; # ordered array of exons
# clone transcript
my $newtranscript = new Bio::EnsEMBL::Transcript;
if ( $self->EXTEND_ORIGINAL_BIOTYPE && (length($self->EXTEND_ORIGINAL_BIOTYPE)>0) ) {
# original biotype of gene / transcript will only be extended, not overwritten
$newtranscript->biotype($gw->biotype) ;
}
foreach my $exon(@gw_exons){
my $new_exon = clone_Exon($exon);
$newtranscript->add_Exon($new_exon);
}
my @support = @{$gw_tran[0]->get_all_supporting_features};
if (scalar(@support)) {
$newtranscript->add_supporting_features(@support);
}
$newtranscript->add_Attributes(@{$gw_tran[0]->get_all_Attributes});
my $translation = new Bio::EnsEMBL::Translation;
$translation->start($gw_tran[0]->translation->start);
$translation->end($gw_tran[0]->translation->end);
$translation->start_Exon($gw_tran[0]->translation->start_Exon);
$translation->end_Exon($gw_tran[0]->translation->end_Exon);
my @seqeds = @{$gw_tran[0]->translation->get_all_SeqEdits};
if (scalar(@seqeds)) {
print "Copying sequence edits\n" if $self->VERBOSE;
foreach my $se (@seqeds) {
my $new_se =
Bio::EnsEMBL::SeqEdit->new(
-CODE => $se->code,
-NAME => $se->name,
-DESC => $se->description,
-START => $se->start,
-END => $se->end,
-ALT_SEQ => $se->alt_seq
);
my $attribute = $new_se->get_Attribute();
$translation->add_Attributes($attribute);
}
}
$newtranscript->translation($translation);
$newtranscript->translation->start_Exon($newtranscript->start_Exon);
$newtranscript->translation->end_Exon($newtranscript->end_Exon);
my $eecount = 0;
my $modified_peptide_flag = 0;
EACH_E2G_EXON:
foreach my $ee (@e2g_exons){
# check strands are consistent
if ($ee->strand != $gw_exons[0]->strand){
warn("gw and e2g exons have different strands - can't combine genes\n") ;
return undef;
}
# single exon genewise prediction?
if(scalar(@gw_exons) == 1) {
($newtranscript, $modified_peptide_flag) = $self->transcript_from_single_exon_genewise( $ee,
$gw_exons[0],
$newtranscript,
$translation,
$eecount,
@e2g_exons);
}
else {
($newtranscript, $modified_peptide_flag) = $self->transcript_from_multi_exon_genewise($ee,
$newtranscript,
$translation,
$eecount,
$gw,
$e2g)
}
if ( $modified_peptide_flag ){
$modified_peptide = 1;
}
# increment the exon
$eecount++;
} # end of EACH_E2G_EXON
#don't modify translation of blessed genes
my $biotype = $gw->biotype;
if(defined($newtranscript) && $modified_peptide && (($self->{'blessed_type'}) =~ m/$biotype/)){
print STDERR "translation of blessed gene would need to be modified - not using combined gene.\n";
return undef;
}
##############################
# expand merged exons
##############################
# the new transcript is made from a merged genewise gene
# check the transcript and expand frameshifts in all but original 3' gw_exon
# (the sub_SeqFeatures have been flushed for this exon)
if (defined($newtranscript)){
$newtranscript->sort;
foreach my $ex (@{$newtranscript->get_all_Exons}){
if($ex->sub_SeqFeature && scalar($ex->sub_SeqFeature) > 1 ){
my @sf = sort {$a->start <=> $b->start} $ex->sub_SeqFeature;
my $first = shift(@sf);
$ex->end($first->end);
# add back the remaining component exons
foreach my $s(@sf){
$newtranscript->add_Exon($s);
$newtranscript->sort;
}
# flush the sub_SeqFeatures
$ex->flush_sub_SeqFeature;
}
}
# check that the resulting transcript
unless( is_Transcript_sane($newtranscript)
&& all_exons_are_valid($newtranscript, $self->MAX_EXON_LENGTH, 1)
&& intron_lengths_all_less_than_maximum($newtranscript, $self->MAX_INTRON_LENGTH)
&& has_no_unwanted_evidence($newtranscript) ){
print STDERR "problems with this combined transcript, return undef\n";
return undef;
}
#check the translation
unless( validate_Translation_coords($newtranscript, 1)
&& validate_Translation_coords($newtranscript, 1)
&& !contains_internal_stops($newtranscript)
&& $newtranscript->translate){
print STDERR "problems with this combined translation, return undef\n";
return undef;
}
# check translation is the same as for the genewise gene we built from
my $foundtrans = 0;
# the genewise translation can be modified due to a disagreement in a
# splice site with cdnas. This can happen as neither blast nor genewise can
# always find very tiny exons.
# we then recalculate the translation using genomewise:
my $newtrans;
if ( $modified_peptide ){
my $strand = $newtranscript->start_Exon->strand;
$newtrans = $self->_recalculate_translation($newtranscript, $strand);
unless( validate_Translation_coords($newtrans, 1) && is_Transcript_sane($newtrans)
&& all_exons_are_valid($newtrans, $self->MAX_EXON_LENGTH, 1)
&& intron_lengths_all_less_than_maximum($newtrans, $self->MAX_INTRON_LENGTH) ){
print STDERR "problems with this genomewise alternative model, returning original transript.\n";
$newtrans = $newtranscript
}
}
else{
$newtrans = $newtranscript;
}
return $newtrans;
}
else{
warn("No combination could be built\n");
return undef;
}
}
=head2 transcript_from_single_exon_genewise
Arg [1] :
Description: This method will actually do the combination of both
cdna and genewise gene. Note that if there is a match
on one end but not on the other, the code will extend
one end, but will leave the other as it is in the
genewise genes. This will explit cdna matches that look
fine on one end and we disregard the mismatching part.
Returntype :
=cut
sub transcript_from_single_exon_genewise {
my ($self, $eg_exon, $gw_exon, $transcript, $translation, $exoncount, @e2g_exons) = @_;
# save out current translation end - we will need this if we have to unmerge frameshifted exons later
my $orig_tend = $translation->end;
# stay with being strict about gw vs e2g coords - may change this later ...
# the overlapping e2g exon must at least cover the entire gw_exon
if ($gw_exon->start >= $eg_exon->start && $gw_exon->end <= $eg_exon->end){
my $egstart = $eg_exon->start;
my $egend = $eg_exon->end;
my $gwstart = $gw_exon->start;
my $gwend = $gw_exon->end;
# modify the coordinates of the first exon in $newtranscript
my $ex = $transcript->start_Exon;
$ex->start($eg_exon->start);
$ex->end($eg_exon->end);
# need to explicitly set the translation start & end exons here.
$translation->start_Exon($ex);
# end_exon may be adjusted by 3' coding exon frameshift expansion. Ouch.
$translation->end_Exon($ex);
# need to deal with translation start and end this time - varies depending on strand
#FORWARD:
if($gw_exon->strand == 1){
my $diff = $gwstart - $egstart;
my $tstart = $translation->start;
my $tend = $translation->end;
#print STDERR "diff: ".$diff." translation start : ".$tstart." end: ".$tend."\n";
#print STDERR "setting new translation to start: ".($tstart+$diff)." end: ".($tend+$diff)."\n";
$translation->start($tstart + $diff);
$translation->end($tend + $diff);
if($translation->start < 0){
warn("Forward strand: setting very dodgy translation start: " . $translation->start. "\n");
return(undef, 1);
}
if($translation->end > $translation->end_Exon->length){
warn("Forward strand: setting dodgy translation end: " . $translation->end .
" exon_length: " . $translation->end_Exon->length . "\n");
return(undef, 1);
}
}
#REVERSE:
elsif($gw_exon->strand == -1){
my $diff = $egend - $gwend;
my $tstart = $translation->start;
my $tend = $translation->end;
$translation->start($tstart+$diff);
$translation->end($tend + $diff);
if($translation->start < 0){
warn("Forward strand: setting very dodgy translation start: " . $translation->start. "\n");
return(undef, 1);
}
if($translation->end > $translation->end_Exon->length){
warn("Forward strand: setting dodgy translation end: " . $translation->end .
" exon_length: " . $translation->end_Exon->length . "\n");
return(undef, 1);
}
}
# expand frameshifted single exon genewises back from one exon to multiple exons
if(defined($ex->sub_SeqFeature) && (scalar($ex->sub_SeqFeature) > 1)){
#print STDERR "frameshift in a single exon genewise\n";
my @sf = $ex->sub_SeqFeature;
# save current start and end of modified exon
my $cstart = $ex->start;
my $cend = $ex->end;
my $exlength = $ex->length;
# get first exon - this has same id as $ex
my $first = shift(@sf);
$ex->end($first->end); # NB end has changed!!!
# don't touch start.
# no need to modify translation start
# get last exon
my $last = pop(@sf);
$last->end($cend);
$transcript->add_Exon($last);
# and adjust translation end - the end is still relative to the merged gw exon
$translation->end_Exon($last);
# put back the original end translation
$translation->end($orig_tend);
# get any remaining exons
foreach my $s(@sf){
$transcript->add_Exon($s);
}
$transcript->sort;
# flush the sub_SeqFeatures
$ex->flush_sub_SeqFeature;
}
# need to add back exons, both 5' and 3'
$self->add_5prime_exons($transcript, $exoncount, @e2g_exons);
$self->add_3prime_exons($transcript, $exoncount, @e2g_exons);
}
return ($transcript,0);
}
=head2 transcript_from_multi_exon_genewise
Arg [1] :
Description: This method will actually do the combination of both
cdna and genewise gene. Note that if there is a match
on one end but not on the other, the code will extend
one end, but will leave the other as it is in the
genewise genes. This will explit cdna matches that look
fine on one end and we disregard the mismatching part.
Returntype :
=cut
sub transcript_from_multi_exon_genewise {
my ($self, $current_exon, $transcript, $translation, $exoncount, $gw_gene, $eg_gene) = @_;
# $current_exon is the exon one the e2g_transcript we are in at the moment
# $exoncount is the position of the e2g exon in the array
# save out current translation->end - we'll need it if we have to expand 3prime exon later
# my $orig_tend = $translation->end;
my @gwtran = @{$gw_gene->get_all_Transcripts};
$gwtran[0]->sort;
my @gwexons = @{$gwtran[0]->get_all_Exons};
my @egtran = @{$eg_gene->get_all_Transcripts};
$egtran[0]->sort;
my @egexons = @{$egtran[0]->get_all_Exons};
# in order to match a starting genewise exon with an e2g exon, we need to have
# a. exactly coinciding exon ends
# b. exon starts lying within $exon_slop bp of each other.
# previously we had required e2g start to be strictly <= gw start, but this will lose us some valid UTRs
# substitute "end" for "start" for 3' ends of transcripts
# compare to the first genewise exon
if($gwexons[0]->strand == 1){
return $self->transcript_from_multi_exon_genewise_forward($current_exon, $transcript, $translation,
$exoncount, $gw_gene, $eg_gene);
}
elsif( $gwexons[0]->strand == -1 ){
return $self->transcript_from_multi_exon_genewise_reverse($current_exon, $transcript, $translation,
$exoncount, $gw_gene, $eg_gene);
}
}
=head2 transcript_from_multi_exon_genewise_forward
Arg [1] :
Description:
Returntype :
=cut
sub transcript_from_multi_exon_genewise_forward {
my ($self, $current_exon, $transcript, $translation, $exoncount, $gw_gene, $eg_gene) = @_;
my $modified_peptide = 0;
my @gwtran = @{$gw_gene->get_all_Transcripts};
$gwtran[0]->sort;
my @gwexons = @{$gwtran[0]->get_all_Exons};
my @egtran = @{$eg_gene->get_all_Transcripts};
$egtran[0]->sort;
my @egexons = @{$egtran[0]->get_all_Exons};
# save out current translation->end - we'll need it if we have to expand 3prime exon later
my $orig_tend = $translation->end;
my $exon_slop = 20;
#5_PRIME:
if (#they have a coincident end
$gwexons[0]->end == $current_exon->end &&
# either e2g exon starts before genewise exon
($current_exon->start <= $gwexons[0]->start ||
# or e2g exon is a bit shorter but there are spliced UTR exons as well
(abs($current_exon->start - $gwexons[0]->start) <= $exon_slop &&
$current_exon != $egexons[0]))){
my $current_start = $current_exon->start;
my $gwstart = $gwexons[0]->start;
# this exon will be the start of translation, convention: phase = -1
my $ex = $transcript->start_Exon;
$ex->phase(-1);
# modify the coordinates of the first exon in $newtranscript if
# e2g is larger on this end than gw.
if ( $current_exon->start < $gwexons[0]->start ){
$ex->start($current_exon->start);
}
elsif( $current_exon->start == $gwexons[0]->start ){
$ex->start($gwstart);
$ex->phase($gwexons[0]->phase);
}
# if the e2g exon starts after the gw exon,
# modify the start only if this e2g exon is not the first of the transcript
elsif( $current_start > $gwstart && $exoncount != 0 ) {
$ex->start($current_exon->start);
}
# add all the exons from the est2genome transcript, previous to this one
transfer_supporting_evidence($current_exon, $ex);
$self->add_5prime_exons($transcript, $exoncount, @egexons);
# fix translation start
if($gwstart >= $current_start){
# take what it was for the gw gene, and add on the extra
my $tstart = $translation->start;
print STDERR "Forward 5': original translation start: $tstart "; ##
$tstart += ($gwstart - $current_start);
$translation->start($tstart);
print STDERR "re-setting translation start to: $tstart\n"; ##
}
# only trust a smaller cdna exon if it is not the first of the transcript
# (it could be a truncated cdna)
elsif($gwstart < $current_start && $exoncount != 0){
$modified_peptide = 1;
#print STDERR "SHORTENING GENEWISE TRANSLATION\n";
# genewise has leaked over the start. Tougher call - we need to take into account the
# frame here as well
#print STDERR "gw exon starts: $gwstart < new start: $current_start\n";
#print STDERR "modifying exon, as cdna exon is not the first of transcript-> exoncount = $exoncount\n";
# $diff is the number of bases we chop from the genewise exon
my $diff = $current_start - $gwstart;
my $tstart = $translation->start;
warn("this is a case where gw translation starts at $tstart > 1") if ($tstart>1);
print STDERR "gw translation start: ".$tstart."\n";
#print STDERR "start_exon: ".$translation->start_Exon->start.
#"-".$translation->start_Exon->end.
#" length: ".($translation->start_Exon->end - $translation->start_Exon->start + 1).
#" phase: ".$translation->start_Exon->phase.
#" end_phase: ".$translation->start_Exon->end_phase."\n";
if($diff % 3 == 0) {
# we chop exactily N codons from the beginning of translation
$translation->start(1);
}
elsif ($diff % 3 == 1) {
# we chop N codons plus one base
$translation->start(3);
}
elsif ($diff % 3 == 2) {
# we chop N codons plus 2 bases
$translation->start(2);
}
else {
$translation->start(1);
warn("very odd - $diff mod 3 = " . $diff % 3 . "\n");
}
}
else{
#print STDERR "gw exon starts: $gwstart > new start: $current_start";
#print STDERR "but cdna exon is the first of transcript-> exoncount = $exoncount, so we don't modify it\n";
}
throw("setting very dodgy translation start: " . $translation->start. "\n")
unless $translation->start > 0;
} # end 5' exon
# 3_PRIME:
elsif (# they have coincident start
$gwexons[$#gwexons]->start == $current_exon->start &&
# either e2g exon ends after genewise exon
($current_exon->end >= $gwexons[$#gwexons]->end ||
# or we allow to end before if there are UTR exons to be added
(abs($current_exon->end - $gwexons[$#gwexons]->end) <= $exon_slop &&
$current_exon != $egexons[$#egexons]))){
my $end_translation_shift = 0;
# modify the coordinates of the last exon in $newtranscript
# e2g is larger on this end than gw.
my $ex = $transcript->end_Exon;
# this exon is the end of translation, convention: end_phase = -1
$ex->end_phase(-1);
if ( $current_exon->end > $gwexons[$#gwexons]->end ){
$ex->end($current_exon->end);
}
elsif( $current_exon->end == $gwexons[$#gwexons]->end ){
$ex->end($gwexons[$#gwexons]->end);
$ex->end_phase($gwexons[$#gwexons]->end_phase);
}
# if the e2g exon ends before the gw exon,
# modify the end only if this e2g exon is not the last of the transcript
elsif ( $current_exon->end < $gwexons[$#gwexons]->end && $exoncount != $#egexons ){
$modified_peptide = 1;
#print STDERR "SHORTENING GENEWISE TRANSLATION\n";
## fix translation end iff genewise has leaked over - will need truncating
my $diff = $gwexons[$#gwexons]->end - $current_exon->end;
#print STDERR "diff: $diff\n";
my $tend = $translation->end;
my $gw_exon_length = $gwexons[$#gwexons]->end - $gwexons[$#gwexons]->start + 1;
my $cdna_exon_length = $current_exon->end - $current_exon->start + 1;
#print STDERR "gw exon length : $gw_exon_length\n";
#print STDERR "cdna exon length: $cdna_exon_length\n";
my $length_diff = $gw_exon_length - $cdna_exon_length;
#print STDERR "length diff: ".$length_diff."\n"; # should be == diff
$ex->end($current_exon->end);
if($diff % 3 == 0) {
# we chop exactily N codons from the end of the translation
# so it can end where the cdna exon ends
$translation->end($cdna_exon_length);
$end_translation_shift = $length_diff;
}
elsif ($diff % 3 == 1) {
# we chop N codons plus one base
# it should end on a full codon, so we need to end translation 2 bases earlier:
$translation->end($cdna_exon_length - 2);
$end_translation_shift = $length_diff + 2;
}
elsif ($diff % 3 == 2) {
# we chop N codons plus 2 bases
# it should end on a full codon, so we need to end translation 1 bases earlier:
$translation->end($cdna_exon_length - 1);
$end_translation_shift = $length_diff + 1;
}
else {
# absolute genebuild paranoia 8-)
$translation->end($cdna_exon_length);
warn("very odd - $diff mod 3 = " . $diff % 3 . "\n");
}
#print STDERR "Forward: translation end set to : ".$translation->end."\n";
}
# need to explicitly set the translation end exon for translation to work out
my $end_ex = $transcript->end_Exon;
$translation->end_Exon($end_ex);
# strand = 1
my $expanded = $self->expand_3prime_exon($ex, $transcript, 1);
if($expanded){
# set translation end to what it originally was in the unmerged genewise gene
# taking into account the diff
#print STDERR "Forward: expanded 3' exon, re-setting end of translation from ".$translation->end." to orig_end ($orig_tend)- ( length_diff + shift_due_to_phases ) ($end_translation_shift)".($orig_tend - $end_translation_shift)."\n";
$translation->end($orig_tend - $end_translation_shift);
}
# finally add any 3 prime e2g exons
transfer_supporting_evidence($current_exon, $ex);
$self->add_3prime_exons($transcript, $exoncount, @egexons);
} # end 3' exon
return ($transcript,$modified_peptide);
}
=head2 transcript_from_multi_exon_genewise_reverse
Arg [1] :
Description:
Returntype :
Exceptions :
Example :
=cut
sub transcript_from_multi_exon_genewise_reverse {
my ($self, $current_exon, $transcript, $translation, $exoncount, $gw_gene, $eg_gene) = @_;
my $modified_peptide = 0;
my @gwtran = @{$gw_gene->get_all_Transcripts};
$gwtran[0]->sort;
my @gwexons = @{$gwtran[0]->get_all_Exons};
my @egtran = @{$eg_gene->get_all_Transcripts};
$egtran[0]->sort;
my @egexons = @{$egtran[0]->get_all_Exons};
# save out current translation->end - we'll need it if we have to expand 3prime exon later
my $orig_tend = $translation->end;
my $exon_slop = 20;
# 5_PRIME:
if ($gwexons[0]->start == $current_exon->start &&
# either e2g exon ends after gw exon
($current_exon->end >= $gwexons[0]->end ||
# or there are UTR exons to be added
(abs($current_exon->end - $gwexons[0]->end) <= $exon_slop &&
$current_exon != $egexons[0]))){
# sort out translation start
my $tstart = $translation->start;
if($current_exon->end >= $gwexons[0]->end){
# take what it was for the gw gene, and add on the extra
$tstart += $current_exon->end - $gwexons[0]->end;
$translation->start($tstart);
}
elsif( $current_exon->end < $gwexons[0]->end && $current_exon != $egexons[0] ){
# genewise has leaked over the start. Tougher call - we need to take into account the
# frame here as well
$modified_peptide = 1;
#print STDERR "SHORTENING GENEWISE TRANSLATION\n";
#print STDERR "In Reverse strand. gw exon ends: ".$gwexons[0]->end." > cdna exon end: ".$current_exon->end."\n";
#print STDERR "modifying exon, as cdna exon is not the first of transcript-> exoncount = $exoncount\n";
my $diff = $gwexons[0]->end - $current_exon->end;
my $gwstart = $gwexons[0]->end;
my $current_start = $current_exon->end;
my $tstart = $translation->start;
#print STDERR "start_exon: ".$translation->start_Exon->start.
# "-".$translation->start_Exon->end.
# " length: ".($translation->start_Exon->end - $translation->start_Exon->start + 1).
# " phase: ".$translation->start_Exon->phase.
# " end_phase: ".$translation->start_Exon->end_phase."\n";
if ($diff % 3 == 0) { $translation->start(1); }
elsif ($diff % 3 == 1) { $translation->start(3); }
elsif ($diff % 3 == 2) { $translation->start(2); }
else {
$translation->start(1);
warn("very odd - $diff mod 3 = " . $diff % 3 . "\n");}
}
throw("setting very dodgy translation start: " . $translation->start. "\n")
unless $translation->start > 0;
# this exon is the start of translation, convention: phase = -1
my $ex = $transcript->start_Exon;
$ex->phase(-1);
# modify the coordinates of the first exon in $newtranscript
if ( $current_exon->end > $gwexons[0]->end){
## HERE WAS THE PROBLEM WHICH CHANGED THE SOURCE COORDINATES! ##
$ex->end($current_exon->end);
$ex->phase(-1);
}
elsif ( $current_exon->end == $gwexons[0]->end){
$ex->end($gwexons[0]->end);
$ex->phase($gwexons[0]->phase);
}
elsif ( ($current_exon->end < $gwexons[0]->end) && ($current_exon != $egexons[0]) ){
$ex->end($current_exon->end);
}
# need to explicitly set the translation start exon for translation to work out
$translation->start_Exon($ex);
transfer_supporting_evidence($current_exon, $ex);
$self->add_5prime_exons($transcript, $exoncount, @egexons);
}
# end 5' exon
# 3_PRIME:
elsif ($gwexons[$#gwexons]->end == $current_exon->end &&
# either e2g exon starts before gw exon
($current_exon->start <= $gwexons[$#gwexons]->start ||
# or there are UTR exons to be added
(abs($current_exon->start - $gwexons[$#gwexons]->start) <= $exon_slop &&
$current_exon != $egexons[$#egexons]))){
my $end_translation_shift = 0;
# this exon is the end of translation, convention: end_phase = -1
my $ex = $transcript->end_Exon;
$ex->end_phase(-1);
# modify the coordinates of the last exon in $newtranscript
if ( $current_exon->start < $gwexons[$#gwexons]->start ){
# no need to modify translation->end as the 'end' of this exon has not changed
$ex->start($current_exon->start);
$ex->end_phase(-1);
}
elsif( $current_exon->start == $gwexons[$#gwexons]->start){
$ex->start($gwexons[$#gwexons]->start);
$ex->end_phase($gwexons[$#gwexons]->end_phase);
}
# if the e2g exon starts after the gw exon,
# modify the end only if this e2g exon is not the last of the transcript
elsif ( $current_exon->start > $gwexons[$#gwexons]->start && $exoncount != $#egexons ){
$modified_peptide = 1;
#print STDERR "SHORTENING GENEWISE TRANSLATION\n";
#print STDERR "In Reverse strand: gw exon start: ".$gwexons[$#gwexons]->start." < cdna exon start: ".$current_exon->start."\n";
#print STDERR "modifying exon, as cdna exon is not the last of transcript-> exoncount = $exoncount, and #egexons = $#egexons\n";
## adjust translation
my $diff = $current_exon->start - $gwexons[$#gwexons]->start;
#print STDERR "diff: $diff\n";
my $tend = $translation->end;
my $gw_exon_length = $gwexons[$#gwexons]->end - $gwexons[$#gwexons]->start + 1;
my $cdna_exon_length = $current_exon->end - $current_exon->start + 1;
#print STDERR "gw exon length : $gw_exon_length\n";
#print STDERR "cdna exon length: $cdna_exon_length\n";
my $length_diff = $gw_exon_length - $cdna_exon_length;
# modify the combined exon coordinate to be that of the cdna
$ex->start($current_exon->start);
if($diff % 3 == 0) {
# we chop exactily N codons from the end of the translation
# so it can end where the cdna exon ends
$translation->end($cdna_exon_length);
$end_translation_shift = $length_diff;
}
elsif ($diff % 3 == 1) {
# we chop N codons plus one base
# it should end on a full codon, so we need to end translation 2 bases earlier:
$translation->end($cdna_exon_length - 2);
$end_translation_shift = $length_diff + 2;
}
elsif ($diff % 3 == 2) {
# we chop N codons plus 2 bases
# it should end on a full codon, so we need to end translation 1 bases earlier:
$translation->end($cdna_exon_length - 1);
$end_translation_shift = $length_diff + 1;
}
else {
# absolute genebuild paranoia 8-)
$translation->end($cdna_exon_length);
warn("very odd - $diff mod 3 = " . $diff % 3 . "\n");
}
}
# strand = -1
my $expanded = $self->expand_3prime_exon($ex, $transcript, -1);
# need to explicitly set the translation end exon for translation to work out
my $end_ex = $transcript->end_Exon;
$translation->end_Exon($end_ex);
if($expanded){
# set translation end to what it originally was in the unmerged genewise gene
#print STDERR "Reverse: expanded 3' exon, re-setting translation exon ".$translation->end.
# " to original end( $orig_tend ) - shifts_due_to_phases_etc ( $end_translation_shift ) :".
# ($orig_tend - $end_translation_shift)."\n";
$translation->end($orig_tend - $end_translation_shift);
}
transfer_supporting_evidence($current_exon, $ex);
$self->add_3prime_exons($transcript, $exoncount, @egexons);
} # end 3' exon
return ($transcript, $modified_peptide);
}
=head2 add_5prime_exons
Arg [1] :
Description:
Returntype :
=cut
sub add_5prime_exons {
my ($self, $transcript, $exoncount, @e2g_exons) = @_;
# add all the exons from the est2genome transcript, previous to this one
# db handle will be screwed up, need to mak new exons from these
my $c = 0;
my $modified = 0;
while($c < $exoncount){
my $newexon = new Bio::EnsEMBL::Exon;
my $oldexon = $e2g_exons[$c];
$newexon->start($oldexon->start);
$newexon->end($oldexon->end);
$newexon->strand($oldexon->strand);
# these are all 5prime UTR exons
$newexon->phase(-1);
$newexon->end_phase(-1);
$newexon->slice($oldexon->slice);
my %evidence_hash;
#print STDERR "adding evidence at 5':\n";
foreach my $sf( @{$oldexon->get_all_supporting_features} ){
if ( $evidence_hash{$sf->hseqname}{$sf->hstart}{$sf->hend}{$sf->start}{$sf->end} ){
next;
}
$evidence_hash{$sf->hseqname}{$sf->hstart}{$sf->hend}{$sf->start}{$sf->end} = 1;
#print STDERR $sf->start."-".$sf->end." ".$sf->hstart."-".$sf->hend." ".$sf->hseqname."\n";
$newexon->add_supporting_features($sf);
}
# print STDERR "Adding 5prime exon " . $newexon->start . " " . $newexon->end . "\n";
$transcript->add_Exon($newexon);
$modified = 1;
$c++;
}
if ($modified == 1){
$transcript->translation->start_Exon->phase(-1);
}
}
# $exon is the terminal exon in the genewise transcript, $transcript. We need
# to expand any frameshifts we merged in the terminal genewise exon.
# The expansion is made by putting $exon to be the last (3' end) component, so we modify its
# start but not its end. The rest of the components are added. The translation end will have to be modified,
# this happens in the method _transcript_from_multi_exon....
=head2 expand_3prime_exon
Arg [1] :
Description: $exon is the terminal exon in the genewise transcript,
$transcript. We need to expand any frameshifts we
merged in the terminal genewise exon. The expansion is
made by putting $exon to be the last (3 end)
component, so we modify its start but not its end. The
rest of the components are added. The translation end
will have to be modified, this happens in the method
_transcript_from_multi_exon....
Returntype :
=cut
sub expand_3prime_exon{
my ($self, $exon, $transcript, $strand) = @_;
if(defined($exon->sub_SeqFeature) && (scalar($exon->sub_SeqFeature) > 1)){
#print STDERR "expanding 3'prime frameshifted exon $exon in strand $strand: ".
#$exon->start."-".$exon->end." phase: ".$exon->phase." end_phase: ".$exon->end_phase."\n";
my @sf = $exon->sub_SeqFeature;
my $last = pop(@sf);
#print STDERR "last component: ".$last->start."-".$last->end." phase ".$last->phase." end_phase ".$last->end_phase."\n";
#print STDERR "setting exon $exon start: ".$last->start." phase: ".$last->phase."\n";
$exon->start($last->start); # but don't you dare touch the end!
$exon->dbID($last->dbID);
$exon->phase($last->phase);
# add back the remaining component exons
foreach my $s(@sf){
#print STDERR "adding exon: ".$s->start."-".$s->end."\n";
$transcript->add_Exon($s);
$transcript->sort;
}
# flush the sub_SeqFeatures so we don't try to re-expand later
$exon->flush_sub_SeqFeature;
return 1;
}
# else, no expansion
return 0;
}
=head2 add_3prime_exons
Arg [1] :
Description: $exoncount tells us which position in the array of e2g
exons corresponds to the end of the genewise transcript
so we add back exons 3 to that position. $exon and
$transcript are references to Exon and Transcript
objects.
Returntype :
=cut
sub add_3prime_exons {
my ($self, $transcript, $exoncount, @e2g_exons) = @_;
# need to deal with frameshifts - 3' exon is a special case as its end might have changed
# add all the exons from the est2genome transcript, subsequent to this one
my $c = $#e2g_exons;
my $modified = 0;
while($c > $exoncount){
my $newexon = new Bio::EnsEMBL::Exon;
my $oldexon = $e2g_exons[$c];
$newexon->start($oldexon->start);
$newexon->end($oldexon->end);
$newexon->strand($oldexon->strand);
# these are all exons with UTR:
$newexon->phase(-1);
$newexon->end_phase(-1);
$newexon->slice($oldexon->slice);
#print STDERR "adding evidence in 3':\n";
my %evidence_hash;
foreach my $sf( @{$oldexon->get_all_supporting_features }){
if ( $evidence_hash{$sf->hseqname}{$sf->hstart}{$sf->hend}{$sf->start}{$sf->end} ){
next;
}
$evidence_hash{$sf->hseqname}{$sf->hstart}{$sf->hend}{$sf->start}{$sf->end} = 1;
#print STDERR $sf->start."-".$sf->end." ".$sf->hstart."-".$sf->hend." ".$sf->hseqname."\n";
$newexon->add_supporting_features($sf);
}
# print STDERR "Adding 3prime exon " . $newexon->start . " " . $newexon->end . "\n";
$transcript->add_Exon($newexon);
$modified = 1;
$c--;
}
if ($modified == 1){
$transcript->translation->end_Exon->end_phase(-1);
}
}
=head2 remap_genes
Description: strictly speaking this is no longer remapping anything...
checks translation, set start/stop, can set biotype
Returntype : ref to array of Bio::EnsEMBL::Gene
=cut
sub remap_genes {
my ($self, $genes, $biotype_suffix) = @_;
my @remapped_genes = ();
my $blessed_type = $self->{'blessed_type'};
print STDERR "remapping ".scalar @$genes." genes.\n" if $self->VERBOSE;
GENE:
foreach my $gene (@$genes) {
#leave the blessed genes alone
my $biotype = $gene->biotype;
#force a centain biotype?
if($biotype_suffix && (length($biotype_suffix) > 0)){
$gene->biotype($gene->biotype().$biotype_suffix);
}
if(defined $blessed_type && $blessed_type =~ m/$biotype/){
print STDERR "not remapping ".$biotype."\n" if $self->VERBOSE;
push(@remapped_genes, $gene);
next GENE;
}
print STDERR "remapping ".$gene->biotype."\n" if $self->VERBOSE;
my @t = @{$gene->get_all_Transcripts};
my $tran = $t[0];
# check that it translates
unless(validate_Translation_coords($tran, 1)
&& !contains_internal_stops($tran)
&& $tran->translate){
print STDERR "\nERROR: Gene at ".$gene->seq_region_name." ".$gene->seq_region_start."-".$gene->seq_region_end." ".
$gene->seq_region_strand." doesn't translate!\n\n";
push(@remapped_genes, $gene); ##added
next GENE;
}
foreach my $transcript ( @{$gene->get_all_Transcripts} ){
if($biotype){
$transcript->biotype($gene->biotype);
$transcript->analysis($gene->analysis);
}
# set start and stop codons
set_start_codon($transcript);
set_stop_codon($transcript);
}
push(@remapped_genes, $gene);
}
return \@remapped_genes;
}
=head2 validate_gene
Arg [1] : Bio::EnsEMBL::Gene
Description: checks start and end coordinates of each exon of each transcript are sane
Returntype : 1 if gene is valid, otherwise zero
=cut
sub validate_gene {
my ($self, $gene) = @_;
# should be only a single transcript
my @transcripts = @{$gene->get_all_Transcripts};
if(scalar(@transcripts) != 1) {
print STDERR "Rejecting gene - should have one transcript, not " . scalar(@transcripts) . "\n";
return 0;
}
foreach my $transcript(@transcripts){
foreach my $exon(@{$transcript->get_all_Exons}){
unless( validate_Exon_coords($exon, 1) ){
print STDERR "Rejecting gene because of invalid exon\n";
return 0;
}
}
}
return 1;
}
=head2 _recalculate_translation
Arg[1] : Bio::EnsEMBL::Transcript
Arg[2] : strand of the transcript
Description : a transcript is used as evidence for genomewise
to recalculate the ORF. The idea is to use this when
the peptide has been shortened, due to a genewise model
being incompatible with the cdna splicing. This can happen when
genewise cannot find very short exons
and attaches them to one of the flanking exons.
We tell genomewise to keep the splice boundaries pretty much
static, so that we preserve the original splicing structure.
Returntype : Bio::EnsEMBL::Transcript
=cut
sub _recalculate_translation {
my ($self, $mytranscript, $strand) = @_;
my $this_is_my_transcript = clone_Transcript($mytranscript);
compute_translation($mytranscript);
# check that everything is sane:
unless(validate_Translation_coords($mytranscript, 1)
&& contains_internal_stops($mytranscript)
&& $mytranscript->translate){
print STDERR "problem with the translation. Returning the original transcript\n";
return $this_is_my_transcript;
}
return $mytranscript;
}
=head2 _transfer_evidence
Arg [1] : reference to Bio::EnsEMBL::Tanscript $combined_transcript
Arg [2] : reference to Bio::EnsEMBL::Transcript $cdna_transcript
Description: transfers cdna evidence to combined transcript
Returntype: Bio::EnsEMBL::Transcript
=cut
sub _transfer_evidence {
my ($self, $combined_transcript, $cdna_transcript) = @_;
my $first_support_id;
foreach my $combined_exon(@{$combined_transcript->get_all_Exons}){
foreach my $cdna_exon(@{$cdna_transcript->get_all_Exons}){
# exact match or overlap?
# exact match
# if($combined_exon->start == $cdna_exon->start &&
# $combined_exon->end == $cdna_exon->end &&
# $combined_exon->strand == $cdna_exon->strand){
# print STDERR "exact match " . $combined_exon->dbID . " with " . $cdna_exon->dbID . "; transferring evidence\n";
# transfer_supporting_evidence($cdna_exon, $combined_exon);
# }
# overlap - feature boundaries may well be wonky
if($combined_exon->overlaps($cdna_exon)){
if($combined_exon->strand != $cdna_exon->strand){
print STDERR "OVERLAPPING-BUT-DIFFERENT_STRANDS!\n";
}
else{
transfer_supporting_evidence($cdna_exon, $combined_exon);
}
}
}
}
my $cdna_trans = $cdna_transcript->get_all_Transcripts()->[0];
foreach my $tsf (@{$cdna_trans->get_all_supporting_features}) {
print STDERR "adding supporting feature: ".$tsf->hseqname."\n" if $self->VERBOSE;
$combined_transcript->add_supporting_features($tsf);
}
return $combined_transcript;
}
=head2 look_for_both
Arg [1] : Bio:EnsEMBL:Gene
Description: a copy of Steve's look_for_both-script,
checks phases, etc.
Returntype : Bio:EnsEMBL:Gene
=cut
sub look_for_both {
my ($self, $gene) = @_;
my $time = time;
my $nupdated_start = 0;
my $nupdated_end = 0;
my $metcnt = 1;
my $maxterdist = 150;
foreach my $trans (@{$gene->get_all_Transcripts}) {
if ($trans->translation) {
my $tln = $trans->translation;
my $coding_start = $trans->cdna_coding_start;
my $orig_coding_start = $coding_start;
$trans->sort;
my $cdna_seq = uc($trans->spliced_seq);
my @pepgencoords = $trans->pep2genomic(1,1);
if(scalar(@pepgencoords) > 2) {
print STDERR "pep start does not map cleanly\n";
goto TLNEND; # I swore I'd never use this - this code desperately needs a rewrite
}
my $pepgenstart = $pepgencoords[0]->start;
my $pepgenend = $pepgencoords[$#pepgencoords]->end;
unless($pepgencoords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
print STDERR "pep start maps to gap\n";
goto TLNEND; # I swore I'd never use this - this code desperately needs a rewrite
}
unless($pepgencoords[$#pepgencoords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
print STDERR "pep start (end of) maps to gap\n";
goto TLNEND; # I swore I'd never use this - this code desperately needs a rewrite
}
print STDERR "Pep genomic location = " . $pepgenstart . " " . $pepgenend . "\n" if $self->VERBOSE;
my $startseq= substr($cdna_seq,$coding_start-1,3);
print "cdna seq for pep start = " . $startseq . "\n";
if ($startseq ne "ATG") {
if ($coding_start > 3) {
my $had_stop = 0;
while ($coding_start > 3 && !$had_stop) {
my $testseq = substr($cdna_seq,$coding_start-4,3);
if ($testseq eq "ATG") {
print_Translation($trans) if $self->VERBOSE;
my @coords = $trans->cdna2genomic($coding_start-3,$coding_start-1,$gene->strand);
my $new_start;
my $new_end;
if(scalar(@coords) > 2) {
print STDERR "Shouldn't happen - new start does not map cleanly - I'm out of here\n";
next;
} elsif (scalar(@coords) == 2) {
print STDERR "WOW ISN'T NATURE HORRIBLE: new start crosses intron\n";
print STDERR "coord[0] = " . $coords[0]->start . " " . $coords[0]->end ."\n";
print STDERR "coord[1] = " . $coords[1]->start . " " . $coords[1]->end ."\n";
if ($gene->strand == 1) {
$new_start = $coords[0]->start;
$new_end = $coords[$#coords]->end;
} else {
$new_start = $coords[0]->end;
$new_end = $coords[$#coords]->start;
}
} else {
$new_start = $coords[0]->start;
$new_end = $coords[0]->end;
}
unless($coords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
print STDERR "Shouldn't happen - new start maps to gap - I'm out of here\n";
next;
}
unless($coords[$#coords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
print STDERR "Shouldn't happen - new start (end of) maps to gap - I'm out of here\n";
next;
}
print STDERR "genomic pos for new start = " . $new_start . " " . $new_end . "\n" if $self->VERBOSE;
if ($new_end - $new_start == 2) {
$nupdated_start++;
my $newstartexon;
foreach my $exon (@{$trans->get_all_Exons}) {
if ($exon->end >= $new_start && $exon->start <= $new_start) {
$newstartexon = $exon;
last;
}
}
if ($newstartexon == $tln->start_Exon) {
if ($tln->start_Exon->strand == 1) {
$tln->start($new_start - $tln->start_Exon->start + 1);
} else {
$tln->start($tln->start_Exon->end - $new_end + 1);
}
# NAUGHTY, but hey I should have to do this - I've changed the translation after all
$trans->{'transcript_mapper'} = undef;
$trans->{'coding_region_start'} = undef;
$trans->{'coding_region_end'} = undef;
$trans->{'cdna_coding_start'} = undef;
$trans->{'cdna_coding_end'} = undef;
} else {
# find exon
if (!defined($newstartexon)) {
print STDERR "Failed finding new start exon - how can this be?\n";
next;
}
# create a copy of if and of current start exon (because of phase change)
my $copyexon = new Bio::EnsEMBL::Exon(
-start => $tln->start_Exon->start,
-end => $tln->start_Exon->end,
-strand => $gene->strand,
);
my $copynewstartexon = new Bio::EnsEMBL::Exon(
-start => $newstartexon->start,
-end => $newstartexon->end,
-strand => $gene->strand,
);
# $copyexon->phase(0);
$copyexon->end_phase($tln->start_Exon->end_phase);
$copyexon->contig($tln->start_Exon->contig);
if ($tln->start_Exon->stable_id) {
$copyexon->stable_id($tln->start_Exon->stable_id . "MET" . $metcnt++);
$copyexon->created($time);
$copyexon->modified($time);
$copyexon->version(1);
}
$copynewstartexon->phase($newstartexon->phase);
# $copynewstartexon->end_phase(0);
$copynewstartexon->contig($newstartexon->contig);
if ($newstartexon->stable_id) {
$copynewstartexon->stable_id($newstartexon->stable_id . "MET" . $metcnt++);
$copynewstartexon->created($time);
$copynewstartexon->modified($time);
$copynewstartexon->version(1);
}
# TODO evidence
if ($copynewstartexon->strand == 1) {
$tln->start($new_start - $copynewstartexon->start + 1);
} else {
$tln->start($copynewstartexon->end - $new_end + 1);
}
# Replace exons in transcript, and fix phases
my @newexons;
my $inrange = 0;
foreach my $exon (@{$trans->get_all_Exons}) {
if ($inrange) {
$exon->phase( $newexons[$#newexons]->end_phase );
$exon->end_phase(($exon->length + $exon->phase) % 3);
}
if ($exon == $tln->start_Exon) {
$copyexon->phase( $newexons[$#newexons]->end_phase );
push @newexons,$copyexon;
$inrange = 0;
} elsif ($exon == $newstartexon) {
push @newexons,$copynewstartexon;
$copynewstartexon->end_phase(($exon->length - $tln->start + 1)%3);
print STDERR "Setting end_phase on new start exon to " . $copynewstartexon->end_phase .
" l = " . $exon->length . " ts = " . $tln->start . "\n" if $self->VERBOSE;
$inrange = 1;
} else {
push @newexons,$exon;
}
}
$trans->flush_Exons;
foreach my $exon (@newexons) {
$trans->add_Exon($exon);
}
# Reset translation start exon
if ($tln->end_Exon == $tln->start_Exon) {
$tln->end_Exon($copyexon);
}
$tln->start_Exon($copynewstartexon);
}
print_Translation($trans);
} else {
print STDERR "Across exons - not handling this\n";
}
last;
} else {
if ($testseq =~ /TAA/ or $testseq =~ /TGA/ or $testseq =~ /TAG/) {
$had_stop = 1;
} else {
$coding_start -= 3;
}
}
}
} else {
print STDERR "Not enough bases upstream - NOT looking into genomic\n"if $self->VERBOSE;
}
}
TLNEND:
{
my $coding_end = $trans->cdna_coding_end;
my $orig_coding_end = $coding_end;
$trans->sort;
my $peplen = $trans->translate->length;
my @pepgencoords = $trans->pep2genomic($peplen,$peplen);
if(scalar(@pepgencoords) > 2) {
print STDERR "pep end does not map cleanly\n";
next;
}
my $pepgenstart = $pepgencoords[0]->start;
my $pepgenend = $pepgencoords[$#pepgencoords]->end;
unless($pepgencoords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
print STDERR "pep end maps to gap\n";
next;
}
unless($pepgencoords[$#pepgencoords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
print STDERR "pep start (end of) maps to gap\n";
next;
}
#print "End Pep genomic location = " . $pepgenstart . " " . $pepgenend . "\n";
my $endseq= substr($cdna_seq,$coding_end-3,3);
my $cdnalen = length($cdna_seq);
#print "cdna seq for pep end = " . $endseq . "\n";
my $longendseq= substr($cdna_seq,$coding_end-6,12);
#print "long end seq (-3 to len 12) = $longendseq\n";
# if (!($endseq ne "TGA" and $endseq ne "TAA" and $endseq ne "TAG")) {
# print "Has end " . $trans->translateable_seq . "\n";
# }
if ($endseq ne "TGA" and $endseq ne "TAA" and $endseq ne "TAG") {
if (($cdnalen-$coding_end) > 3) {
while (($cdnalen-$coding_end) > 3 && ($coding_end-$orig_coding_end) <= $maxterdist) {
my $testseq = substr($cdna_seq,$coding_end,3);
#print "Test seq = $testseq\n" if $self->VERBOSE ;
if ($testseq eq "TGA" or $testseq eq "TAA" or $testseq eq "TAG") {
my @coords = $trans->cdna2genomic($coding_end+1,$coding_end+3,$gene->strand);
my $new_start;
my $new_end;
if(scalar(@coords) > 2) {
throw("new end does not map cleanly\n");
next;
} elsif (scalar(@coords) == 2) {
print STDERR "WOW ISN'T NATURE HORRIBLE: new end crosses intron\n";
print STDERR "coord[0] = " . $coords[0]->start . " " . $coords[0]->end ."\n";
print STDERR "coord[1] = " . $coords[1]->start . " " . $coords[1]->end ."\n";
if ($gene->strand == 1) {
$new_start = $coords[0]->start;
$new_end = $coords[$#coords]->end;
} else {
$new_start = $coords[0]->end;
$new_end = $coords[$#coords]->start;
}
} else {
$new_start = $coords[0]->start;
$new_end = $coords[0]->end;
}
unless($coords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
print STDERR "new start maps to gap\n";
next;
}
unless($coords[$#coords]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
print STDERR "new start (end of) maps to gap\n";
next;
}
if ($new_end - $new_start == 2) {
#print "Sequence of genomic pos of new end = " . $slice->subseq($new_start,$new_end,$gene->strand) . "\n";
$nupdated_end++;
my $newendexon;
foreach my $exon (@{$trans->get_all_Exons}) {
if ($exon->end >= $new_start && $exon->start <= $new_start) {
$newendexon = $exon;
last;
}
}
if ($newendexon == $tln->end_Exon) {
if ($tln->end_Exon->strand == 1) {
$tln->end($new_end - $tln->end_Exon->start + 1);
} else {
$tln->end($tln->end_Exon->end - $new_start + 1);
}
# NAUGHTY, but hey I should have to do this - I've changed the translation after all
$trans->{'transcript_mapper'} = undef;
$trans->{'coding_region_start'} = undef;
$trans->{'coding_region_end'} = undef;
$trans->{'cdna_coding_start'} = undef;
$trans->{'cdna_coding_end'} = undef;
} else {
# find exon
if (!defined($newendexon)) {
print STDERR "Failed finding new end exon - how can this be?\n";
next;
}
# create a copy of if and of current end exon (because of phase change)
my $copyexon = new Bio::EnsEMBL::Exon(
-start => $tln->end_Exon->start,
-end => $tln->end_Exon->end,
-strand => $gene->strand,
);
my $copynewendexon = new Bio::EnsEMBL::Exon(
-start => $newendexon->start,
-end => $newendexon->end,
-strand => $gene->strand,
);
$copyexon->phase($tln->end_Exon->phase);
$copyexon->end_phase($tln->end_Exon->end_phase);
$copyexon->contig($tln->end_Exon->contig);
if ($tln->end_Exon->stable_id) {
$copyexon->stable_id($tln->end_Exon->stable_id . "TER" . $metcnt++);
$copyexon->created($time);
$copyexon->modified($time);
$copyexon->version(1);
}
$copynewendexon->phase($newendexon->phase);
# $copynewendexon->end_phase(0);
$copynewendexon->contig($newendexon->contig);
if ($newendexon->stable_id) {
$copynewendexon->stable_id($newendexon->stable_id . "TER" . $metcnt++);
$copynewendexon->created($time);
$copynewendexon->modified($time);
$copynewendexon->version(1);
}
# TODO evidence
if ($copynewendexon->strand == 1) {
$tln->end($new_end - $copynewendexon->start + 1);
} else {
$tln->end($copynewendexon->end - $new_start + 1 );
my $tercodon = $copynewendexon->seq->subseq($copynewendexon->end - $new_start-1, $copynewendexon->end - $new_start +1);
#reverse($tercodon);
#$tercodon =~ tr /ACGT/TGCA/;
}
# Replace exons in transcript, and fix phases
my @newexons;
my $inrange = 0;
foreach my $exon (@{$trans->get_all_Exons}) {
if ($inrange) {
print STDERR "in range exon before phase = " . $exon->phase . " endphase " . $exon->end_phase . "\n" if $self->VERBOSE;
$exon->phase( $newexons[$#newexons]->end_phase );
$exon->end_phase(($exon->length + $exon->phase) % 3);
print STDERR "in range exon after phase = " . $exon->phase . " endphase " . $exon->end_phase . "\n" if $self->VERBOSE;
}
if ($exon == $tln->end_Exon) {
my $phase = $exon->phase;
if ($phase == -1) {
$phase = 0;
}
if ($exon == $tln->start_Exon) {
$copyexon->end_phase(($exon->length - $tln->start + 1)%3);
} else {
$copyexon->end_phase(($exon->length + $exon->phase)%3);
}
print STDERR "Setting end_phase on old end exon to " . $copyexon->end_phase . " l = " . $exon->length . "\n" if $self->VERBOSE;
push @newexons,$copyexon;
$inrange = 1;
} elsif ($exon == $newendexon) {
$copynewendexon->phase( $newexons[$#newexons]->end_phase );
$copynewendexon->end_phase( -1);
push @newexons,$copynewendexon;
$inrange = 0;
} else {
push @newexons,$exon;
}
}
$trans->flush_Exons;
foreach my $exon (@newexons) {
$trans->add_Exon($exon);
}
# Reset translation start exon
if ($tln->end_Exon == $tln->start_Exon) {
$tln->start_Exon($copyexon);
}
$tln->end_Exon($copynewendexon);
}
print_Translation($trans);
# print "translateable seq = \n";
# print $trans->translateable_seq . "\n";
} else {
print STDERR "Across exons - not handling this\n" if $self->VERBOSE;
}
last;
}
$coding_end += 3;
}
} else {
print STDERR "Not enough bases downstream - NOT looking into genomic\n" if $self->VERBOSE;
}
}
}
}
# These lines force loads from the database to stop attempted lazy
# loading during the write (which fail because they are to the wrong
# db)
$trans->get_all_supporting_features();
my @exons= @{$trans->get_all_Exons};
my $get = $trans->translation;
$trans->_translation_id(undef);
foreach my $exon (@exons) {
$exon->stable_id;
$exon->contig($gene->slice);
$exon->get_all_supporting_features;
}
}
return($gene);
}
=head2 forward_genewise_clusters
Arg [1] :
Description: get/set for genewise clusters
Returntype :
=cut
sub forward_genewise_clusters{
my ($self, $cluster_ref) = @_;
if (!defined($self->{'_forward_genewise_clusters'})) {
$self->{'_forward_genewise_clusters'} = [];
}
if (defined $cluster_ref && scalar(@{$cluster_ref})) {
push(@{$self->{'_forward_genewise_clusters'}},@{$cluster_ref});
# store them sorted
@{$self->{'_forward_genewise_clusters'}} = sort { $a->start <=> $b->start } @{$self->{'_forward_genewise_clusters'}};
}
return $self->{'_forward_genewise_clusters'};
}
=head2 reverse_genewise_clusters
Arg [1] :
Description: get/set for genewise clusters
Returntype :
Exceptions :
Example :
=cut
sub reverse_genewise_clusters{
my ($self, $cluster_ref) = @_;
if (!defined($self->{'_reverse_genewise_clusters'})) {
$self->{'_reverse_genewise_clusters'} = [];
}
if (defined $cluster_ref && scalar(@{$cluster_ref})) {
push(@{$self->{'_reverse_genewise_clusters'}},@{$cluster_ref});
# store them sorted
@{$self->{'_reverse_genewise_clusters'}} = sort { $a->start <=> $b->start } @{$self->{'_reverse_genewise_clusters'}};
}
return $self->{'_reverse_genewise_clusters'};
}
=head2 cdna_genes
Arg [1] :
Description: get/set for e2g gene array
Returntype :
=cut
sub cdna_genes {
my ($self, $genes) = @_;
if (!defined($self->{'_cdna_genes'})) {
$self->{'_cdna_genes'} = [];
}
if (defined $genes && scalar(@{$genes})) {
push(@{$self->{'_cdna_genes'}},@{$genes});
}
return ($self->{'_cdna_genes'});
}
=head2 prune
Arg [1] : (optional) bool
Description: get/set for option to prune genes
=cut
sub prune {
my ($self, $bool) = @_;
if (!defined($self->{'_prune_genes'})) {
$self->{'_prune_genes'} = undef;
}
if (defined $bool) {
$self->{'_prune_genes'} = $bool;
}
return ($self->{'_prune_genes'});
}
=head2 ests
Arg [1] : (optional) ref to array with ests
Description: get/set for ests
Returntype : array ref with ST objects
=cut
sub ests {
my ($self, $ests) = @_;
if (!defined($self->{'_ests'})) {
$self->{'_ests'} = [];
}
if (defined $ests && scalar(@{$ests})) {
push(@{$self->{'_ests'}},@{$ests});
}
return($self->{'_ests'});
}
=head2 ditags
Arg [1] : (optional) ref to array with ditags
Description: get/set ditags
Returntype : array ref with ditag objects
Exceptions : none
=cut
sub ditags {
my ($self, $ditags) = @_;
if (!defined($self->{'_ditags'})) {
$self->{'_ditags'} = [];
}
if (defined $ditags && scalar(@{$ditags})) {
push(@{$self->{'_ditags'}}, @{$ditags});
}
return($self->{'_ditags'});
}
=head2 gw_genes
Arg [1] :
Description: get/set for genewise gene array
Returntype :
Exceptions :
Example :
=cut
sub gw_genes {
my ($self, $genes) = @_;
if (!defined($self->{'_gw_genes'})) {
$self->{'_gw_genes'} = [];
}
if (defined $genes && scalar(@{$genes})) {
push(@{$self->{'_gw_genes'}},@{$genes});
}
return $self->{'_gw_genes'};
}
=head2 blessed_genes
Arg [1] :
Description: get/set for blessed gene array
clones the genes
Returntype :
=cut
sub blessed_genes {
my ($self, $slice, $genes) = @_;
if (!defined($self->{'_blessed_genes'})) {
$self->{'_blessed_genes'} = [];
}
if (defined $slice && defined $genes && scalar(@{$genes})) {
# split input genes into one transcript per gene; keep type the same
OLDGENE:
foreach my $gene(@{$genes}){
foreach my $transcript(@{$gene->get_all_Transcripts}){
# make a new gene
my $newgene = new Bio::EnsEMBL::Gene;
$newgene->biotype($gene->biotype);
#preserve the xref of the blessed genes!
foreach my $dblink (@{$gene->get_all_DBLinks()}){
#print STDERR "Adding xref ".$dblink->display_id."\n";
$newgene->add_DBEntry($dblink);
}
# clone transcript
my $newtranscript = new Bio::EnsEMBL::Transcript;
$newtranscript->slice($slice);
# clone translation
my $newtranslation = new Bio::EnsEMBL::Translation;
my @seqeds = @{$transcript->translation->get_all_SeqEdits};
if (scalar(@seqeds)) {
print "Copying sequence edits\n" if $self->VERBOSE;
foreach my $se (@seqeds) {
my $new_se =
Bio::EnsEMBL::SeqEdit->new(
-CODE => $se->code,
-NAME => $se->name,
-DESC => $se->description,
-START => $se->start,
-END => $se->end,
-ALT_SEQ => $se->alt_seq
);
my $attribute = $new_se->get_Attribute();
$newtranslation->add_Attributes($attribute);
}
}
my @support = @{$transcript->get_all_supporting_features};
if (scalar(@support)) {
$newtranscript->add_supporting_features(@support);
}
$newtranscript->add_Attributes(@{$transcript->get_all_Attributes});
$newtranscript->translation($newtranslation);
foreach my $exon(@{$transcript->get_all_Exons}){
# clone the exon
my $newexon = clone_Exon($exon);
# get rid of stable ids
$newexon->stable_id('');
# if this is start/end of translation, keep that info:
if ( $exon == $transcript->translation->start_Exon ){
$newtranslation->start_Exon( $newexon );
$newtranslation->start($transcript->translation->start);
}
if ( $exon == $transcript->translation->end_Exon ){
$newtranslation->end_Exon( $newexon );
$newtranslation->end($transcript->translation->end);
}
$newtranscript->add_Exon($newexon);
#add sf
$newexon->add_supporting_features(@{$exon->get_all_supporting_features});
}
$newgene->add_Transcript($newtranscript);
push(@{$self->{'_blessed_genes'}}, $newgene);
}
}
}
return $self->{'_blessed_genes'};
}
=head2 combined_genes
Arg [1] : ref to Bio::EnsEMBL::Gene
Description: get/set for combined gene array
Returntype :
=cut
sub combined_genes {
my ($self, $genesref) = @_;
if (!defined($self->{'_combined_genes'})) {
$self->{'_combined_genes'} = [];
}
if (defined $genesref) {
push(@{$self->{'_combined_genes'}}, $genesref);
}
# if (defined $genesref && scalar(@{$genesref})) {
# push(@{$self->{'_combined_genes'}},@{$genesref});
# }
return $self->{'_combined_genes'};
}
=head2 unmatched_genes
Arg [1] : array of genes
Description: get/set for unmatched gene array
Returntype : none
=cut
sub unmatched_genes {
my ($self, @genes) = @_;
if (!defined($self->{'_unmatched_genes'})) {
$self->{'_unmatched_genes'} = [];
}
if(@genes){
push(@{$self->{'_unmatched_genes'}},@genes);
}
return $self->{'_unmatched_genes'};
}
=head2 populate_kill_list
Arg [1] : String mol-type
Description: read ids to ignore from KillList db
Returntype : ref to hash
=cut
sub populate_kill_list {
my ($self, $type) = @_;
my $kill_list_object = Bio::EnsEMBL::KillList::KillList->new(-TYPE => $type);
my %kill_list = %{$kill_list_object->get_kill_list()};
return \%kill_list;
}
=head2 kill_list
Arg [1] : optional hash ref with kill list
Description: get/set kill list object
Returntype : ref to hash
=cut
sub kill_list {
my ($self, $kill_list) = @_;
if($kill_list){
$self->{'_kill_list'} = $kill_list;
}
return $self->{'_kill_list'};
}
=head2 retrieve_unmerged_gene
Arg [1] : Bio::EnsEMBL::Gene
Description: Returns unmeregd, frameshifted version of input gene
Returntype : Bio::EnsEMBL::Gene
=cut
sub retrieve_unmerged_gene{
my ($self, $merged_gene) = @_;
my %pairs = %{$self->merged_unmerged_pairs()};
if(!exists $pairs{$merged_gene}){
print STDERR "Can't retrieve unmerged \n";
print STDERR "for gene ".$merged_gene->dbID." \n";
}
return $pairs{$merged_gene};
}
=head2 merged_unmerged_pairs
Arg [1] :
Description: get/set for pairs of frameshift merged and unmerged
genes. Key is merged gene, value is unmerged
Returntype :
=cut
sub merged_unmerged_pairs {
my ($self, $merged_gene, $unmerged_gene) = @_;
if (!defined($self->{'_merged_unmerged_pairs'})) {
$self->{'_merged_unmerged_pairs'} = {};
}
if ($unmerged_gene && $merged_gene) {
$self->{'_merged_unmerged_pairs'}{$merged_gene}= $unmerged_gene;
}
# hash ref
return $self->{'_merged_unmerged_pairs'};
}
=head2 modified_unmodified_pairs
Arg [1] :
Description: get/set for pairs of UTR modified and unmodified genes.
Key is modified gene, value is unmodified
Returntype :
=cut
sub modified_unmodified_pairs {
my ($self, $modified_gene, $unmodified_gene) = @_;
if (!defined($self->{'_modified_unmodified_pairs'})) {
$self->{'_modified_unmodified_pairs'} = {};
}
if ($unmodified_gene && $modified_gene) {
$self->{'_modified_unmodified_pairs'}{$modified_gene}= $unmodified_gene;
}
# hash ref
return $self->{'_modified_unmodified_pairs'};
}
=head2 output
Arg [1] : optional RunnableDB output
Description: get/set for result of Analysis
Returntype : array ref with genes
=cut
sub output{
my ($self,@genes) = @_;
if (!defined($self->{'_output'})) {
$self->{'_output'} = [];
}
if(@genes){
push(@{$self->{'_output'}},@genes);
}
return $self->{'_output'};
}
###############################################
=head2 _get_evidence_set ($logic_name_or_biotype)
Name : get_evidence_set( $logic_name_or_biotype )
Arg : String
Func : returns the name of the evidence_set of a genee / PredictionTranscript
Returnval: String describing evidence_set_name
=cut
sub _get_evidence_set {
my ($self, $logic_name_or_biotype) = @_ ;
my %ev_sets = %{ $self->{evidence_sets} } ;
my $result_set_name;
for my $set_name (keys %ev_sets){
my @logic_names = @{$ev_sets{$set_name}} ;
for my $ln (@logic_names ) {
if ($ln eq $logic_name_or_biotype){
$result_set_name = $set_name ;
}
}
}
return $result_set_name;
}
#=head2 make_seqfetcher
# Title : make_seqfetcher
# Usage :
# Description: for the KnownUTR module
# Example :
# Returns : Bio::DB::RandomAccessI
# Args :
#=cut
#sub make_seqfetcher{
# my ( $self, $index, $seqfetcher_class ) = @_;
# my $seqfetcher;
# (my $class = $seqfetcher_class) =~ s/::/\//g;
# eval{
# require "$class.pm";
# };
# if($@){ warn("Could not require seqfetcher class.\n") }
# print "using index $index \n" if $self->VERBOSE;
# if(defined $index && $index ne ''){
# my @db = ( $index );
# # make sure that your class is compatible with the index type
# $seqfetcher = "$seqfetcher_class"->new('-db' => \@db, );
# }
# else{
# throw("can't make seqfetcher\n");
# }
# return $seqfetcher;
#}
#=head2 _cdna_seqfetcher
# Arg [1] :
# Description:
# Returntype :
#=cut
#sub _cdna_seqfetcher{
# my ($self, $seqfetcher) = @_;
# if($seqfetcher){
# $self->{'_cdna_seqfetcher'} = $seqfetcher;
# }
# return $self->{'_cdna_seqfetcher'};
#}
=head2 _cdna_slice
Arg [1] :
Description:
Returntype :
=cut
sub _cdna_slice {
my ($self, $slice) = @_;
if($slice){
$self->{'_cdna_slice'} = $slice;
}
return $self->{'_cdna_slice'};
}
=head2 _cdna_evidence
Arg [1] :
Description:
Returntype :
=cut
sub _cdna_evidence {
my ($self, $cdna_evidence) = @_;
if($cdna_evidence){
$self->{'_cdna_evidence'} = $cdna_evidence;
}
return $self->{'_cdna_evidence'};
}
=head2 _known_pairs
Arg [1] : optional hash ref
Description: store the links between NM and NP entries
Returntype : has ref
=cut
sub _known_pairs {
my ($self, $hashref) = @_;
if (defined $hashref) {
$self->{_known_pairs} = $hashref;
}
return $self->{_known_pairs};
}
######## database access functions ###########
=head2 CDNA_DB
Arg [1] : optional Bio::EnsEMBL::DBSQL::DBAdaptor
Description: get/set for db storing exonerate alignments of cDNAs
Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
Exceptions : Throws if input isn't DBAdaptor or db parameters haven't been defined
=cut
sub CDNA_DB {
my( $self, $cdna_db ) = @_;
if ($cdna_db){
$self->{_cdna_db} = get_db_adaptor_by_string($cdna_db,1);
}
return $self->{_cdna_db};
}
=head2 EST_DB
Arg [1] : optional Bio::EnsEMBL::DBSQL::DBAdaptor
Description: get/set for db storing exonerate alignments of ESTs
Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
Exceptions : Throws if input isn't DBAdaptor or db parameters haven't been defined
=cut
sub EST_DB {
my( $self, $est_db ) = @_;
if ($est_db){
$self->{_est_db} = get_db_adaptor_by_string($est_db,1);
}
return $self->{_est_db};
}
=head2 DITAG_DB
Arg [1] : optional Bio::EnsEMBL::DBSQL::DBAdaptor
Description: get/set for db storing exonerate alignments of ditags
Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
Exceptions : Throws if input isn't DBAdaptor or db parameters haven't been defined
=cut
sub DITAG_DB {
my( $self, $ditag_db ) = @_;
if ($ditag_db){
$self->{_ditag_db} = get_db_adaptor_by_string($ditag_db,1);
}
if((defined $self->{_ditag_db}) && (!$self->{_ditag_db}) && (scalar @{$self->DITAG_TYPE_NAMES})){
throw("You have defined DITAG_TYPE_NAMES, but no DITAG_DB parameters.\n");
}
return $self->{_ditag_db};
}
=head2 INPUT_DB
Arg [1] : optional Bio::EnsEMBL::DBSQL::DBAdaptor
Description: get/set for db to read input genes from
Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
Exeptions : Throws if input isn't DBAdaptor or db parameters haven't been defined
=cut
sub INPUT_DB {
my( $self, $input_db ) = @_;
if ($input_db){
$self->{_input_db} = get_db_adaptor_by_string($input_db,1);
}
if(!$self->{_input_db}){
throw("Please define database parameters for input db.\n");
}
return $self->{_input_db};
}
=head2 blessed_db
Arg [1] : optional Bio::EnsEMBL::DBSQL::DBAdaptor
Description: get/set for db storing blessed gene structures
Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
Exceptions : Throws if input isn't DBAdaptor or db parameters haven't been defined
=cut
sub BLESSED_DB {
my( $self, $blessed_db ) = @_;
if ($blessed_db){
$self->{_blessed_db} = get_db_adaptor_by_string($blessed_db,1);
}
return $self->{_blessed_db};
}
=head2 OUTPUT_DB
Arg [1] : optional Bio::EnsEMBL::DBSQL::DBAdaptor
Description: get/set for db in which UTR modified genes should be stored
Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
Exceptions : Throws if input isn't DBAdaptor or db parameters haven't been defined
=cut
sub OUTPUT_DB {
my( $self, $output_db ) = @_;
if ($output_db){
$self->{_output_db} = get_db_adaptor_by_string($output_db,1);
}
if(!$self->{_output_db}){
throw("Please define database parameters for output db.\n");
}
return $self->{_output_db};
}
#### other config variable get/set methods ######
=head2 various_config_variable_methods
Arg [1] : optional parameter
Description: setter / getter for config vars
Returntype : config value
=cut
sub INPUT_GENETYPES {
my ($self, $input_genetypes) = @_;
if($input_genetypes){
$self->{'_input_genetypes'} = $input_genetypes;
}
return $self->{'_input_genetypes'};
}
sub BLESSED_GENETYPES {
my ($self,$value) = @_;
if (defined $value) {
$self->{_blessed_genetypes} = $value;
}
return $self->{_blessed_genetypes};
}
sub VERBOSE {
my ($self,$value) = @_;
if (defined $value) {
$self->{_verbose} = $value;
}
return $self->{_verbose};
}
sub MAX_EXON_LENGTH {
my ($self,$value) = @_;
if (defined $value) {
$self->{_max_exon_length} = $value;
}
return $self->{_max_exon_length};
}
sub DITAG_TYPE_NAMES {
my ($self,$value) = @_;
if (defined $value) {
$self->{_ditag_type_names} = $value;
}
return $self->{_ditag_type_names};
}
sub cDNA_GENETYPE {
my ($self,$value) = @_;
if (defined $value) {
$self->{_cdna_genetype} = $value;
}
return $self->{_cdna_genetype};
}
sub EXTEND_ORIGINAL_BIOTYPE {
my ($self,$value) = @_;
if (defined $value) {
$self->{_extend_original_biotype} = $value;
}
return $self->{_extend_original_biotype};
}
sub EXTEND_BIOTYPE_OF_UNCHANGED_GENES {
my ($self,$value) = @_;
if (defined $value) {
$self->{_extend_biotype_of_unchanged_genes} = $value;
}
return $self->{_extend_biotype_of_unchanged_genes};
}
sub KNOWN_UTR_GENETYPE {
my ($self,$value) = @_;
if (defined $value) {
$self->{_known_utr_genetype} = $value;
}
return $self->{_known_utr_genetype};
}
sub UTR_GENETYPE {
my ($self,$value) = @_;
if (defined $value) {
$self->{_utr_genetype} = $value;
}
return $self->{_utr_genetype};
}
sub PRUNE_GENES {
my ($self,$value) = @_;
if (defined $value) {
$self->{_prune_genes} = $value;
}
return $self->{_prune_genes};
}
sub FILTER_ESTS {
my ($self,$value) = @_;
if (defined $value) {
$self->{_filter_ests} = $value;
}
return $self->{_filter_ests};
}
sub MAX_INTRON_LENGTH {
my ($self,$value) = @_;
if (defined $value) {
$self->{_max_intron} = $value;
}
return $self->{_max_intron};
}
sub EST_GENETYPE {
my ($self,$value) = @_;
if (defined $value) {
$self->{_est_genetype} = $value;
}
return $self->{_est_genetype};
}
sub BLESSED_UTR_GENETYPE {
my ($self,$value) = @_;
if (defined $value) {
$self->{_blessed_utr_genetype} = $value;
}
return $self->{_blessed_utr_genetype};
}
sub LOOK_FOR_KNOWN {
my ($self,$value) = @_;
if (defined $value) {
$self->{_look_for_known} = $value;
}
return $self->{_look_for_known};
}
sub DITAG_WINDOW {
my ($self,$value) = @_;
if (defined $value) {
$self->{_ditag_window} = $value;
}
return $self->{_ditag_window};
}
sub KNOWNUTR_FILE {
my ($self,$value) = @_;
if (defined $value) {
$self->{_knownutr_file} = $value;
}
return $self->{_knownutr_file};
}
1;