Raw content of Bio::EnsEMBL::Analysis::RunnableDB::RefineSolexaGenes.

Raw content of Bio::EnsEMBL::Analysis::RunnableDB::RefineSolexaGenes # Cared for by Ensembl # # Copyright GRL & EBI # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::RefineSolexaGenes =head1 SYNOPSIS my $db = Bio::EnsEMBL::DBAdaptor->new($locator); my $refine_genes = Bio::EnsEMBL::Analysis::RunnableDB::RefineSolexaGenes->new ( -db => $db, -input_id => $input_id -analysis => $analysis ); $refine_genes->fetch_input(); $refine_genes->run(); $refine_genes->write_output(); #writes to DB =head1 DESCRIPTION This module takes intron spanning dna_align_features and combines them with rough transcript models to build refined genes with CDS. The module produces transcripts representing all possible combinations of introns and exons which are then filtered according to the specifications in the config. The databases containing the various features to combine is defined in Bio::EnsEMBL::Analysis::Config::Databases and the configuration for the module is defined in Bio::EnsEMBL::Analysis::Config::GeneBuild::RefineSolexaGenes =cut package Bio::EnsEMBL::Analysis::RunnableDB::RefineSolexaGenes; use vars qw(@ISA); use strict; use Bio::EnsEMBL::Analysis::RunnableDB; use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild; use Bio::EnsEMBL::SimpleFeature; use Bio::EnsEMBL::FeaturePair; use Bio::EnsEMBL::DnaDnaAlignFeature; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils ; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils ; use Bio::EnsEMBL::Transcript; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranslationUtils; use Bio::EnsEMBL::Analysis::Config::GeneBuild::RefineSolexaGenes; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Utils::Argument qw (rearrange); @ISA = qw(Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild); my $limit = 0; sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->read_and_check_config($REFINESOLEXAGENES_CONFIG_BY_LOGIC); # Hard limit to the number of possible paths to explore $self->recursive_limit(50000); # initialise intron feature cash my %feature_hash; $self->feature_cash(\%feature_hash); return $self; } =head2 fetch_input Title : fetch_input Usage : $self->fetch_input Returns : nothing Args : none =cut sub fetch_input { my( $self) = @_; my $logic = $self->analysis->logic_name; # fetch adaptors and store them for use later $self->intron_slice_adaptor($self->get_dbadaptor($self->INTRON_DB)->get_SliceAdaptor); $self->prediction_exon_adaptor($self->db->get_PredictionExonAdaptor); $self->repeat_feature_adaptor($self->db->get_RepeatFeatureAdaptor); $self->gene_slice_adaptor($self->get_dbadaptor($self->MODEL_DB)->get_SliceAdaptor); # fetch splice matricies for making ab-initio introns where we have no intron support $self->donor_matrix($self->load_matrix($self->DONOR_MATRIX)); $self->acceptor_matrix($self->load_matrix($self->ACCEPTOR_MATRIX)); # want a slice and a full chromsome to keep everything on in the same coords my $slice = $self->fetch_sequence($self->input_id); my $gene_slice = $self->gene_slice_adaptor->fetch_by_region ( 'toplevel', $slice->seq_region_name, $slice->start, $slice->end, 1 ); my $repeat_slice = $self->intron_slice_adaptor->fetch_by_region ('toplevel', $slice->seq_region_name, $slice->start, $slice->end, 1 ); my $chr_slice = $self->db->get_SliceAdaptor->fetch_by_region ( 'toplevel', $slice->seq_region_name ); $self->chr_slice($chr_slice); my @repeats = sort { $a->start <=> $b->start } @{$self->repeat_feature_adaptor->fetch_all_by_Slice($repeat_slice)} ; # put on chromosome coords foreach my $repeat ( @repeats ) { $repeat = $repeat->transfer($chr_slice); } $self->repeats($self->make_repeat_blocks(\@repeats)); # we want to fetch and store all the rough transcripts at the beginning - speeds things up # also we want to take out tiny introns - merge them into longer structures my @prelim_genes; foreach my $gene ( @{$gene_slice->get_all_Genes} ) { # put them on the chromosome $gene = $gene->transfer($chr_slice); push @prelim_genes,$gene; } print STDERR scalar(@prelim_genes) . " genes found \n"; # deterine strandedness ( including splitting merged genes ) my @genes = @{$self->strand_separation(\@prelim_genes)}; print STDERR scalar(@genes) . " genes once they have been separated by strand \n"; $self->prelim_genes(\@genes); } sub run { my ($self) = @_; $self->refine_genes; } =head2 refine_genes Title : refine_genes Usage : $self->refine_genes Returns : nothing Args : none Description : Combines exons with introns in all possible combinations to : Make a series of transcript models =cut sub refine_genes { my ($self) = @_; GENE: foreach my $gene ( @{$self->prelim_genes} ) { my %intron_count; my %intron_hash; my @exon_intron; my @exon_prev_intron; my %intron_exon; my $variants; my $strand = $gene->strand; my @trans; my $most_real_introns = 0; my $highest_score = 0; print STDERR $gene->stable_id. " : " . $gene->start . " " . $gene->end . ":\n"; # merge exons to remove little artifactual introns my @exons = @{$self->merge_exons($gene)}; my $exon_count = $#exons; my @fake_introns; EXON: for ( my $i = 0 ; $i <= $exon_count ; $i ++ ) { my $exon = $exons[$i]; my $retained_intron; my $left_introns = 0; my $right_introns = 0; $exon->{'left_mask'} = 0; $exon->{'right_mask'} = $exon->length; print STDERR "$i : " . $exon->start . " " . $exon->end . ":\n"; # make intron features by collapsing the dna_align_features my @introns = @{$self->dna_2_simple_features($exon->seq_region_start,$exon->seq_region_end)}; my @left_c_introns; my @right_c_introns; my @left_nc_introns; my @right_nc_introns; my @filtered_introns; my $intron_overlap; INTRON: foreach my $intron ( @introns ){ next unless $intron->strand == $strand; next unless $intron->length > $self->MIN_INTRON_SIZE; # disguard introns that splice over our exon if ( $intron->start< $exon->start && $intron->end > $exon->end ) { $intron_overlap++; next; } # check to see if this exon contains a retained intron if ( $intron->start > $exon->start && $intron->end < $exon->end ) { $retained_intron = 1; $exon->{'retained'} =1; } # we need to know how many consensus introns we have to the # left and right in order to determine whether to put in # a non consensus intron if ( $intron->end <= $exon->end ) { if ( $intron->display_label =~ /non-consensus/ ) { push @left_nc_introns, $intron; } else { push @left_c_introns, $intron; } } if ( $intron->start >= $exon->start ) { if ( $intron->display_label =~ /non-consensus/ ) { push @right_nc_introns, $intron; } else { push @right_c_introns, $intron; } } } # add non consensus introns only where there are no consensus introns push @filtered_introns, @left_c_introns; push @filtered_introns, @right_c_introns; push @filtered_introns, @left_nc_introns if scalar(@left_c_introns) == 0; push @filtered_introns, @right_nc_introns if scalar(@right_c_introns) == 0; ; print "Got " , scalar(@left_nc_introns ) + scalar(@right_nc_introns ) . " non consensus introns \n"; if ( scalar(@left_c_introns) == 0 && scalar(@left_nc_introns) > 0) { print "using " . scalar(@left_nc_introns) . " NC left \n"; } if ( scalar(@right_c_introns) == 0 && scalar(@right_nc_introns) > 0 ) { print "using " . scalar(@right_nc_introns) . " NC right \n"; } INTRON: foreach my $intron ( @filtered_introns ) { print STDERR "\t" . $intron->start . " " . $intron->end . " " . $intron->strand . " " . $intron->display_label . "\n"; # becasue we make a new exons where we have a reatained intron to # stop circular references we need to allow the final # intron splicing out of the exon to be used more than once # by each new exon in fact $intron_count{$intron->display_label}++ unless $retained_intron; $intron_hash{$intron->display_label} = $intron; # only use each intron twice once at the end and once at the start of # an exon $self->exon_mask($exon,$intron); next if $intron_count{$intron->display_label} && $intron_count{$intron->display_label} > 2; #print STDERR "USING intron " . $intron->display_label . "\n"; #print STDERR "E-I\n" if $intron->end > $exon->end; #print STDERR "I-E\n" if $intron->start < $exon->start; # exon_intron links exons to the intron on their right ignoring strand push @{ $exon_intron[$i]} , $intron if $intron->end > $exon->end; # intron exon links introns to exons on their right ignoring strand if ( $intron->start < $exon->start ) { push @{$intron_exon{$intron->display_label}} , $i; # exon_prev_intron links exons to introns on their left ignoring strand push @{ $exon_prev_intron[$i]} , $intron ; } # intron is within the exon - this is not a true exon but a retained intron if ( $intron->start > $exon->start && $intron->end < $exon->end && $intron->length > 50 ) { # dont use NC introns here - dont trust them enough next if $intron->display_label =~ /non-consensus/; #print STDERR "retained " . $intron->start ." " . $exon->start ." ". $intron->end ." ". $exon->end ."\n"; # dont have circular references to exons or the paths # will be infinite so clone this exon instead my $new_exon = clone_Exon( $exon ); # chop it up a bit so it no longer overlaps the other introns #print STDERR "TRIMMING EXON \n"; $exon->end($intron->start + 10 ); $new_exon->start( $intron->end -10 ); # split the exon into 2 new exons and replace the old retained exon with the 2 new ones my @replacements = ( $exon, $new_exon); splice( @exons,$i,1,@replacements) ; $exon_count++; $i--; next EXON; } } } # if we have no intron support we can guess at introns if ( $self->ABINITIO_INTRONS ) { EXON: for ( my $i = 1 ; $i <= $exon_count ; $i ++ ) { my $left_exon = $exons[$i-1]; my $exon = $exons[$i]; my $right_exon = $exon; my $intron_start = 0; my $intron_end = 0; # we may already know the splice sites for the left exon even if the intron # that splices it doesnt join onto this gene... if ($exon_intron[$i-1]){ my $prev_intron = $exon_intron[$i-1][0]; $intron_start = $prev_intron->start; } # likewise right intron if ($exon_prev_intron[$i]){ # just pick the first one, lets not get carried away my $prev_intron = $exon_prev_intron[$i][0]; $intron_end = $prev_intron->end; } next if $intron_start && $intron_end; #print STDERR "FOUND intron start $intron_start and intron end $intron_end \n"; # otherwise one is missing lets fake the intron # lets find the splice sites using a variety of cunning techniques if ($strand == 1) { $intron_start = $self->find_splice_sites( $left_exon, $strand,'donor') unless $intron_start; $intron_end = $self->find_splice_sites( $right_exon, $strand,'acceptor') unless $intron_end; } else { $intron_start = $self->find_splice_sites( $left_exon, $strand,'acceptor') unless $intron_start; $intron_end = $self->find_splice_sites( $right_exon, $strand,'donor') unless $intron_end; } # check for problems with introns if ( $intron_end <= $intron_start ) { warn("coudnt get splices for this intron\n"); $intron_start = $left_exon->end; $intron_end = $right_exon->start; } # catch other problems if ( $intron_start <= $left_exon->start or $intron_end >= $right_exon->end ) { warn("This intron would result with an exon start > end $intron_start $intron_end \n"); $intron_start = $left_exon->end; $intron_end = $right_exon->start; } # use existing values if you cannot find a splice site $intron_start = $left_exon->end unless $intron_start; $intron_end = $right_exon->start unless $intron_end; # make the fake intron my $fake_intron = Bio::EnsEMBL::SimpleFeature->new (-start => $intron_start , -end => $intron_end , -strand => $strand, -slice => $self->chr_slice, -analysis => $self->analysis, -score => 0 ); # add the fake intron into the hash structure my $key = $fake_intron->start . ":" . $fake_intron->end . ":" . $fake_intron->strand; #print STDERR "INTRON $key\n"; $fake_intron->display_label("INTRON-FAKE-$key"); $intron_hash{$fake_intron->display_label} = $fake_intron; push @{ $exon_intron[$i-1]} , $fake_intron if $fake_intron->end > $left_exon->end; push @{$intron_exon{$fake_intron->display_label}} , $i if $fake_intron->start < $exon->start; $self->exon_mask($left_exon,$fake_intron); $self->exon_mask($right_exon,$fake_intron); if ( $exons[$i]->{'left_mask'} >= $exons[$i]->{'right_mask'} ) { print STDERR "Problem gene " . $gene->stable_id ." Exon $i has overlapping introns skipping this gene\n"; next GENE; } } } next unless @exon_intron; # now lets make a hash of hashes holding which exons connect to which for ( my $i = 0 ; $i < scalar(@exons) ; $i ++ ) { if ( $exon_intron[$i] ) { foreach my $intron ( @{$exon_intron[$i]} ) { # only allow each intron to connect to 1 exon foreach my $exon ( @{$intron_exon{$intron->display_label}} ) { if ( $intron->end > $exons[$exon]->end ) { next ; } # store the possible paths as a hash (splice)variants $variants->{$i}->{$intron->display_label} = 1; $variants->{$intron->display_label}->{$exon} = 1; # check if ( $intron->end > $exons[$exon]->end ) { throw(" exon $i start end " . $intron->end ." - " . $exons[$exon]->start ); } } } } } # work out all the possible paths given the features we have my @paths; CLUSTER: for ( my $i = 0 ; $i < scalar(@exons) ; $i ++ ) { $limit = 0; my $result; $result = $self->ProcessTree($variants,$i); if ($result eq "ERROR"){ print STDERR ("Could not process cluster\n"); next GENE; } push( @paths, (keys %$result)); } @paths = sort { length($a) <=> length($b) } @paths; # lets collapse redundant paths foreach ( my $i = scalar(@paths)-1 ; $i >= 0 ; $i-- ) { foreach my $pathb ( @paths ) { next if $pathb eq $paths[$i] ; next unless $paths[$i]; if ( $pathb =~ /$paths[$i]/ ) { splice(@paths,$i,1); } } } # paths are stored as text - turn them into arrays of features "models" my @models; foreach my $path ( @paths ) { my @model; foreach my $feature ( split(/\./,$path ) ) { if ( $feature =~ /INTRON/ ) { push @model, $intron_hash{$feature}; } else { push @model, $exons[$feature]; } } push @models,\@model; } # Now we cycle through all the models and turn them into genes MODEL: foreach my $model ( @models ) { my $intron_count = 0; my $intron_score = 0; my $exon_score = 0; my $fake_introns = 0; my @new_exons; # make an array containing cloned exons for ( my $i = 0 ; $i < scalar(@$model) ; $i++ ) { unless ( $model->[$i]->isa("Bio::EnsEMBL::SimpleFeature") ) { my $new_exon = clone_Exon($model->[$i]); # add in exon coverage scores from supporting features foreach my $daf ( @{$model->[$i]->get_all_supporting_features} ) { $exon_score += $daf->score; } $new_exon->strand($strand); push @new_exons,$new_exon; } else { push @new_exons, $model->[$i]; } } # trim the exons using the intron features to get the splice sites correct for ( my $i = 0 ; $i < scalar(@$model) ; $i++ ) { if ( $model->[$i]->isa("Bio::EnsEMBL::SimpleFeature") ) { my $intron = $model->[$i]; next unless $intron->strand == $strand; next unless $new_exons[$i-1] && $new_exons[$i+1]; # its an intron trim the exons accordingly $new_exons[$i-1]->end( $intron->start ); $new_exons[$i+1]->start( $intron->end ); $intron_count++; $intron_score+= $intron->score; # keep tabs on how many fake introns we have $fake_introns++ if $intron->display_label =~ /FAKE/; } } next MODEL unless $intron_count; # trim padding from the start and end exons $new_exons[0]->start($new_exons[0]->start + 20) if $new_exons[0]->length > 20; $new_exons[-1]->end ($new_exons[-1]->end - 20) if $new_exons[-1]->length > 20; # make it into a gene my @modified_exons; foreach my $exon ( @new_exons ) { next if $exon->isa("Bio::EnsEMBL::SimpleFeature"); # check it works if ( $exon->start > $exon->end ) { warn("Exons start > exon end " . $exon->start . " " . $exon->end ."\n"); next MODEL; } push @modified_exons, clone_Exon($exon); } if ( $strand == 1 ) { @modified_exons = sort { $a->start <=> $b->start } @modified_exons; } else { @modified_exons = sort { $b->start <=> $a->start } @modified_exons; } # make it into a gene my $t = new Bio::EnsEMBL::Transcript(-EXONS => \@modified_exons); # add a translation my $tran = compute_translation(clone_Transcript($t)); # keep track of the scores for this transcript $tran->analysis($self->analysis); $tran->version(1); $tran->biotype('modified'); $tran->{'_score'} = $intron_score + $exon_score; $tran->{'_fake_introns'} = $fake_introns ; unless ($self->ABINITIO_INTRONS) { # if we are not using fake exons compare the introns to # how many there are in the (merged) rough model $tran->{'_fake_introns'} = $exon_count - $intron_count; $intron_count = $exon_count ; } #print STDERR " EXON count $exon_count\n"; $tran->{'_intron_count'} = $intron_count; $tran->{'_proportion_real_introns'} = int((($tran->{'_intron_count'} - $tran->{'_fake_introns'}) /$tran->{'_intron_count'}) *100); # make note of best scores and best introns if ( $tran->{'_proportion_real_introns'} > $most_real_introns ) { $most_real_introns = $tran->{'_proportion_real_introns'} } if ( $tran->{'_score'} > $highest_score ) { $highest_score = $tran->{'_score'} } push @trans, $tran; } # cluster on peptides to get the trans with the least exons where the # peptides are identical - ie less UTR exons my @cluster = @{$self->peptide_cluster(\@trans)}; next unless scalar(@cluster) > 0; # label by score / introns foreach my $tran ( @cluster ) { my ( $new_gene ) = @{convert_to_genes(($tran),$gene->analysis)}; $new_gene->biotype('disguard'); $new_gene->stable_id($gene->stable_id . "-" . $tran->{'_score'} ."-" . $tran->{'_fake_introns'} . "/" . $tran->{'_intron_count'} . "-" . $tran->{'_proportion_real_introns'}); # add the biotypes depending on the scores $new_gene->biotype($self->OTHER_ISOFORMS) if $self->OTHER_ISOFORMS; if ( $self->BEST_INTRONS && $tran->{'_proportion_real_introns'} == $most_real_introns) { $new_gene->biotype($self->BEST_INTRONS); } if ( $self->BEST_SCORE && $tran->{'_score'} == $highest_score) { $new_gene->biotype($self->BEST_SCORE); } push@{$self->output} , $new_gene unless $new_gene->biotype eq 'disguard'; } } } sub write_output{ my ($self) = @_; my $outdb = $self->get_dbadaptor($self->OUTPUT_DB); my $gene_adaptor = $outdb->get_GeneAdaptor; my @output = @{$self->output}; my $fails = 0; my $total = 0; foreach my $gene (@output){ $gene->analysis($self->analysis); $gene->source($self->analysis->logic_name); eval { $gene_adaptor->store($gene); }; if ($@){ warning("Unable to store gene!!\n$@"); $fails++; } $total++; } if ($fails > 0) { throw("Not all genes could be written successfully " . "($fails fails out of $total)"); } } =head2 ProcessTree Title : ProcessTree Usage : $self->ProcessTree Returns : String containing paths through the gene Args : A hash reference contianing the possible intron exons : Integer key for the hashref : String containing keys used up to this point : String containing the paths under construction Description : Recursive method that creates paths that explore all possible : routes through a hashref, has a hard limit of 500,000 recursions : to prevent it running out of memory if the paths cannot be solved : or are too large to be practical =cut sub ProcessTree { my ($self,$hashref,$index,$sofar,$paths) = @_; # dont let it go on for ever eating memory if ($limit > $self->recursive_limit){ print STDERR "Too many recursive possibilities\n"; return "ERROR"; } my @node = keys %{$hashref->{$index}} ; $sofar.= "$index."; foreach my $child (@node){ $limit++; $paths = $self->ProcessTree($hashref,$child,$sofar,$paths); if ( $paths eq "ERROR" ) { $limit = 0; return $paths; } } push @{$paths->{$sofar}}, split(/\./,$sofar) if scalar(@node == 0); return $paths; } =head2 peptide_cluster Title : peptide_cluster Usage : $self->peptide_cluster(\@array); Returns : Array ref of Bio::EnsEMBL::Transcript Args : Array ref of Bio::EnsEMBL::Transcript Description : Collapses transcripts that have identical coding : sequences and exons, picks the one with the fewest UTR exons =cut sub peptide_cluster { my ($self,$array_ref) = @_; my @clusters; my @chosen; my %exon_hash; foreach my $tran ( @$array_ref ) { my $key = ''; my @exons = @{$tran->get_all_translateable_Exons}; unless ( scalar(@exons > 0 )) { print ($tran->start." ". $tran->end ." has no translateable exons \n"); return \@chosen; } for ( my $i = 1 ; $i <=$#exons ; $i++ ) { $key .= $exons[$i-1]->end . ":" . $exons[$i]->start . ":" ; } push @{$exon_hash{$key}} , $tran; } foreach my $key ( keys %exon_hash ) { my @trans = sort { scalar(@{$a->get_all_Exons}) <=> (@{$b->get_all_Exons}) } @{$exon_hash{$key}}; foreach my $tran ( @{$exon_hash{$key}} ) { } push @chosen, @trans; } return \@chosen; } =head2 find_splice_sites Title : find_splice_sites Usage : $self->find_splice_sites($exon,$strand,$matrix_type); Returns : None Args : Bio::EnsEMBL::Exon : Integer ( strand of the gene ) : String descibing the type of matrix to use (acceptor or donor) Description : Attempts to define splice sites for exons when no other data is : availibale. Uses a position specific weight matrix to score : potential splice sites, also looks for any overlapping genscan : exons that might have correct splice sites =cut sub find_splice_sites { my ( $self, $exon, $strand, $matrix_type ) = @_; my $pea = $self->prediction_exon_adaptor; my $matrix; if ( $matrix_type eq 'donor' ) { $matrix = $self->donor_matrix; } else { $matrix = $self->acceptor_matrix; } # which end of the exon do we wish to trim - just do one print "GOT EXON " . $exon->seq_region_name . " " . $exon->start . " " . $exon->end ." $strand \n"; print " looking for a splice - $matrix_type \n"; my $genomic_slice; if ($strand == 1 ) { $genomic_slice = $self->db->get_SliceAdaptor->fetch_by_region('toplevel', $exon->seq_region_name, $exon->start, $exon->end, 1); } else { $genomic_slice = $self->db->get_SliceAdaptor->fetch_by_region('toplevel', $exon->seq_region_name, $exon->start, $exon->end, -1); } my $highest_donor; my $highest_acceptor; # first work out the coverage and the max coverage # lets see if we have any genscan exons to help us with our splice sites my @genscan_exons = @{$pea->fetch_all_by_Slice_constraint($genomic_slice,"seq_region_strand = $strand") }; my $genscan_exon; if ( scalar(@genscan_exons) == 1) { $genscan_exon = $genscan_exons[0]; } # get the dna sequence so we can score it against the splice matrix my $dna = $genomic_slice->seq; my @dna = split(//,$dna); # donor my $scores = $self->score_pssm($exon,$matrix,\@dna,$matrix_type); my %best_scores; if ( $genscan_exon ) { if ( $matrix_type eq 'donor') { } else { $scores->[$genscan_exon->start - 2] += .9 if $genscan_exon->start > 0; } } for ( my $i = 0 ; $i <= $#dna ; $i++ ) { #store them # how about we weight scores by virtue of their closeness to the expected position ? # adding 3 to the ends of the masking to prevent <=0 bp exons if ($strand == 1){ $scores->[$i] = -1 if $i < ($exon->{'left_mask'}+1) or $i > ($exon->{'right_mask'}-1); } else { $scores->[$i] = -1 if $i < ($exon->length - ($exon->{'right_mask'}-1)-1) or $i > ( $exon->length - ($exon->{'left_mask'}+1) -1); } my $factor; if ( $matrix_type eq 'acceptor' ) { # thats .25 off the score for every base 4 bases = 1 poimt $factor = abs($i -20) / 100 ; } else { $factor = abs(($#dna - $i) -20 ) / 100; } $factor = .99 if $factor > .99; $scores->[$i] -= $factor if $scores->[$i] && $scores->[$i] > -1; $best_scores{$i} = $scores->[$i] if $scores->[$i] && $scores->[$i] > -1; } my $count; my $pick; # pick the best scoring splice site foreach my $position ( keys %best_scores ) { if ( $pick ) { $pick = $position if $best_scores{$position} > $best_scores{$pick}; } else { $pick = $position; } } unless ($pick ) { return; } # the rough models are always -ve strand rememer if ( $strand == -1 && $matrix_type eq 'donor' ) { return $exon->end - $pick +1; } if ( $strand == 1 && $matrix_type eq 'donor' ) { return $exon->start+ $pick-1 ; } if ( $strand == 1 && $matrix_type eq 'acceptor' ) { return $exon->start+$pick+1 ; } if ( $strand == -1 && $matrix_type eq 'acceptor' ) { return $exon->end-$pick-1 ; } return; } =head2 score_pssm Title : score_pssm Usage : $self->score_pssm($exon,$matrix,$dna,$type) Returns : Array ref of scores Args : Bio::EnsEMBL::Exon : Array ref containing scoring matrix : Array ref containing dna : String describing the type of matrix ( acceptor or donor ) Description : Scores a DNA sequence against a PSWM to identify possible : splice sites. Returns an array of scores =cut sub score_pssm { my ( $self,$exon,$matrix,$dna,$type) = @_; my $scores; my $adjust; $adjust = 3 if $type eq 'donor'; $adjust = 13 if $type eq 'acceptor'; throw("Matrix not recognised\n") unless $matrix; # calculate the score for all the points in the exon for ( my $i = 0 ; $i <= $exon->length - ( scalar(keys %{$matrix} ) ) ; $i++ ) { my $offset = 0; my $index = $i+$adjust; foreach my $key (sort { $a <=> $b } keys %{$matrix} ) { $scores->[$index] += ($matrix->{$key}->{$dna->[$i+$offset]} / 100 ); $offset++; } # adjust for length of matrix $scores->[$index] /= scalar(keys %{$matrix} ); # make sure it has a GT if its a donor or an AG if its an acceptor if ($type eq 'donor' ) { $scores->[$index] = 0 unless $dna->[$index] eq 'G'; $scores->[$index] = 0 unless $dna->[$index+1] eq 'T'; } if ($type eq 'acceptor' ) { $scores->[$index] = 0 unless $dna->[$index-1] eq 'A'; $scores->[$index] = 0 unless $dna->[$index] eq 'G'; } } return $scores; } =head2 load_matrix Title : load_matrix Usage : $self->load_matrix($file) Returns : Array ref - score matrix Args : File handle Description : Parses a tet file containing a scoring matrix and : loads it into a multidimensional array ref =cut sub load_matrix { my ( $self,$file ) = @_; my $matrix; my @columns; open(FILE,$file) or $self->throw("cannot find matrix file $file \n"); while (<FILE>) { chomp; next if $_ =~ /^#/; $_ =~ s/ //g; my @array = split(/\t/,$_); unless ( scalar(@columns) > 0 ) { @columns = @array; } else { for ( my $i = 1 ; $i <= $#columns ; $i++ ) { $matrix->{$array[0]}->{$columns[$i]} = $array[$i]; } } } return $matrix; } =head2 merge_exons Title : merge_exons Usage : $self->merge_exons($gene) Returns : Array ref of Bio::EnsEMBL::Exon Args : Bio::EnsEMBL::Gene Description : Merges adjacent exons where the intron is covered by repeats or : is very small =cut # lets us merge exons with tiny introns between them unless they contain an intron sub merge_exons { my ( $self, $gene) = @_; my @exons; next unless $gene->get_all_Transcripts->[0]; foreach my $exon ( @{$gene->get_all_Transcripts->[0]->get_all_Exons} ) { push @exons, clone_Exon($exon); } @exons = sort { $a->start <=> $b->start } @exons; for ( my $i = 1 ; $i <= $#exons ; $i ++ ) { my $exon = $exons[$i]; my $prev_exon = $exons[$i-1]; my $intron_count = 0; # is the intron tiny? my @introns = @{$self->dna_2_simple_features($prev_exon->start,$exon->end)}; # it must lie across the boundary and be within the two exons foreach my $intron ( @introns ) { if ( $intron->start > 0 && $intron->end < ( $exon->end - $prev_exon->start) && $intron->start < $prev_exon->length && $intron->end > $prev_exon->length && $intron->strand == $gene->strand) { $intron_count++; } } # is it covered by repeats? # remove very small introns if there is no direct evidence for them if ( $exon->start - $prev_exon->end <= 20 && $intron_count == 0) { $exon->start($prev_exon->start); splice(@exons,$i-1,1); $i--; next; } # dont merge long introns even if they are covered with repeats # next if $repeat_slice->length > 200 ; # dont merge introns if there is evidence for splicing next if $intron_count; # is the intron covered by a repeat? my @repeats = @{$self->repeats}; my $repeat_coverage = 0; # so the repeats are now non-overlapping blocks ( if there is more than one ) foreach my $repeat ( @repeats ) { next unless $repeat->start <= $exon->start && $repeat->end >= $prev_exon->end; last if $repeat->start > $exon->start; $repeat->start($prev_exon->end) if $repeat->start < $prev_exon->end; $repeat->end($exon->start) if $repeat->end > $exon->start; $repeat_coverage+= $repeat->end - $repeat->start; # print $exon->start ." " . $prev_exon->end . " " .$repeat->start ." " . $repeat->end . # " $repeat_coverage\n"; } $repeat_coverage /= ($exon->start - $prev_exon->end ) ; # print " Intron " . $exon->start ." " . $prev_exon->end . " coverage $repeat_coverage \n"; # splice the exons together where repeat coverage is > 95% if ($repeat_coverage > 0.95 ) { print "MERGING EXONS Intron " . $exon->start ." " . $prev_exon->end . " coverage $repeat_coverage \n"; $exon->start($prev_exon->start); # combine the exon scores when we merge the exons $exon->get_all_supporting_features->[0]->score ($exon->get_all_supporting_features->[0]->score + $prev_exon->get_all_supporting_features->[0]->score) if $exon->get_all_supporting_features->[0] && $prev_exon->get_all_supporting_features->[0]; splice(@exons,$i-1,1); $i--; } } return \@exons; } =head2 strand_separation Title : strand_separation Usage : $self->strand_separation(\@genes) Returns : Array ref of Bio::EnsEMBL::Gene Args : Array ref of Bio::EnsEMBL::Gene Description : Examines a gene model with repect to overlapping intron alignments. : Looks for places where the intron alignments switch strand : indicating that the model is in fact 2 genes on opposite : strands with overlapping UTR. It then splits the gene into 2 =cut sub strand_separation { my ( $self,$genes ) = @_; my @separated_genes; my $strand; GENE: foreach ( my $i = 0 ; $i< scalar(@$genes) ; $i++ ) { my $total_strand; my $gene = $genes->[$i]; my @intron_strand; # examine strand on an exon by exon basis my @exons = @{$self->merge_exons($gene)}; for ( my $e =1 ; $e <= $#exons ; $e ++ ) { my $exon = $exons[$e]; $intron_strand[$e] = 0; my $prev_exon = $exons[$e-1]; # do we have an intron between these exons; my @introns = @{$self->dna_2_simple_features($prev_exon->start,$exon->end)}; # here is where we figure out how to split the joined genes... foreach my $intron ( @introns ) { # intron has to actually join the 2 exons next unless ( ( $intron->start > $prev_exon->start && $intron->start <= $prev_exon->end) or ( $intron->end >= $exon->start && $intron->end < $exon->end)); # add the intron scores if ( $intron->score ) { $intron_strand[$e] += $intron->strand * $intron->score; $total_strand+= $intron->strand * $intron->score; } } } # do we have *any* introns at all - only continue if we do unless ( $total_strand ) { print STDERR "No strand data to go on - skipping " . $gene->display_id ."\n"; next GENE; } my $last_intron_score = 0; my $last_intron_position = 0; my %switch_strand; # ok do we have a clearly defined strandedness or do we have a split? for ( my $e = 1 ; $e <= $#exons ; $e ++ ) { if ($last_intron_score && ( $intron_strand[$e] > 1 && $last_intron_score < -1 ) or ( $intron_strand[$e] < -1 && $last_intron_score > 1 ) ) { $switch_strand{'score'}++; $switch_strand{'start'} = $last_intron_position; $switch_strand{'end'} = $e; $switch_strand{'strand'} = 1 if $last_intron_score > 0; $switch_strand{'strand'} = -1 if $last_intron_score < 0; } if ( $intron_strand[$e] ) { $last_intron_score = $intron_strand[$e]; $last_intron_position = $e; } } if ($switch_strand{'score'} && $switch_strand{'score'} == 1 ) { print STDERR "We have a strand switch between " . $switch_strand{'start'} ." and " . $switch_strand{'end'} ."\n"; # lets make 2 copies of this transcript # one with introns 1 to switch_strand end -1 # one with switch_strand start +1 to end # then delete the merged one my @modified_exons; my $strand = $switch_strand{'strand'} ; # left half for ( my $e = 0; $e <= $switch_strand{'start'} ; $e ++ ) { my $exon = $exons[$e]; push @modified_exons, clone_Exon($exon); } my $t = new Bio::EnsEMBL::Transcript(-EXONS => \@modified_exons); my ( $new_gene ) = @{convert_to_genes(($t),$gene->analysis)}; $new_gene->strand($strand); $new_gene->stable_id($gene->stable_id."A"); push @separated_genes,$new_gene; my @modified_right_exons; # right half for ( my $e = $switch_strand{'end'} -1 ; $e <= $#exons ; $e ++ ) { my $exon = $exons[$e]; push @modified_right_exons, clone_Exon($exon); } $t = new Bio::EnsEMBL::Transcript(-EXONS => \@modified_right_exons); my ( $new_gene2 ) = @{convert_to_genes(($t),$gene->analysis)}; $new_gene2->strand(-$strand); $new_gene2->stable_id($gene->stable_id."B"); push @separated_genes,$new_gene2; } if ( $switch_strand{'score'} && $switch_strand{'score'} > 1 ) { print STDERR "Strand is ambiguous, lets go with majority rule\n"; $gene->strand(1) if $total_strand > 0; $gene->strand(-1) if $total_strand < 0; push @separated_genes,$gene; } unless ( $switch_strand{'score'} ) { $gene->strand(1) if $total_strand > 0; $gene->strand(-1) if $total_strand < 0; push @separated_genes,$gene; } } return \@separated_genes; } =head2 exon_mask Title : exon_mask Usage : $self->exon_mask($exon,$intron) Returns : None Args : Bio::EnsEMBL::Exon : Bio::EnsEMBL::SimpleFeature Description : records positions of the slice donor and acceptor to : prevent overlapping introns resulting in an exon with : start < end =cut sub exon_mask { my ( $self,$exon,$intron ) = @_; if ( $intron->end >= $exon->start && $intron->start < $exon->start ) { $exon->{'left_mask'} = $intron->end - $exon->start if $exon->{'left_mask'} < $intron->end - $exon->start; } if ( $intron->end > $exon->end && $intron->start <= $exon->end ) { $exon->{'right_mask'} = $intron->start - $exon->start if $exon->{'right_mask'} > $intron->start - $exon->start; } return ; } =head2 dna_2_simple_features Title : dna_2_simple_features Usage : $self->dna_2_simple_features($start,$end) Returns : Array ref of Bio::EnsEMBL::SimpleFeature Args : Int start : Int end Description : Fetches all dna_align_features from the intron db that lie within : the range determined by start and end, collapses them down into a : non redundant set and builds a Bio::EnsEMBL::SimpleFeature to : represent it =cut sub dna_2_simple_features { my ($self,$start,$end) = @_; my @sfs; my %id_list; my $intron_slice = $self->intron_slice_adaptor->fetch_by_region ('toplevel', $self->chr_slice->seq_region_name, $start, $end, ); # featch all the dna_align_features for this slice by logic name my @reads; foreach my $logic_name ( @{$self->LOGICNAME} ) { push @reads, @{$intron_slice->get_all_DnaAlignFeatures($logic_name)}; } foreach my $read ( @reads ) { my $type = 'consensus'; $type = 'non-consensus' if ( $read->hseqname =~ /\:NC$/ ) ; $read = $read->transfer($self->chr_slice); my @ugfs = $read->ungapped_features; next unless ( scalar(@ugfs) == 2 ); @ugfs = sort { $a->start <=> $b->start } @ugfs; # cache them by internal boundaries my $unique_id = $read->seq_region_name . ":" . $ugfs[1]->start . ":" . $ugfs[0]->end . ":" . $read->strand .":" . $self->LOGICNAME .":$type"; $id_list{$unique_id} ++; } # collapse them down and make them into simple features foreach my $key ( keys %id_list ) { my $sf = $self->feature_cash->{$key}; unless($sf) { my @data = split(/:/,$key) ; $sf = Bio::EnsEMBL::SimpleFeature->new (-start => $data[2], -end => $data[1], -strand => $data[3], -slice => $self->chr_slice, -analysis => $self->analysis, -score => 0, -display_label => "INTRON-" .$key . "-". scalar( keys %{$self->feature_cash} ), ); $self->feature_cash->{$key} = $sf; } $sf->score($id_list{$key}); push @sfs , $sf; } return \@sfs; } sub make_repeat_blocks { my ($self,$repeats_ref) = @_; my @repeats = sort { $a->start <=> $b->start }@$repeats_ref; # print " got " . scalar(@repeats) . " repeat blocks initially\n"; # merge repeat blocks for ( my $j = 1 ; $j <= $#repeats ; $j ++ ) { if ( $repeats[$j]->start <= $repeats[$j-1]->end+1 ){ # print "merging repeat $j " . $repeats[$j]->start . "-" . $repeats[$j]->end. " " . $repeats[$j-1]->start ."-" . $repeats[$j-1]->end ."\n"; $repeats[$j-1]->end($repeats[$j]->end) if $repeats[$j]->end > $repeats[$j-1]->end ; splice(@repeats,$j,1); $j--; } # print "REPEAT $j " . $repeats[$j]->start . "-" . $repeats[$j]->end. " " . $repeats[$j]->display_id ."\n"; } print " got " . scalar(@repeats) . " repeat blocks after merging\n"; return \@repeats; } ################################################################## # Containers sub recursive_limit { my ($self, $val) = @_; if (defined $val) { $self->{_recursive_limit} = $val; } return $self->{_recursive_limit}; } sub donor_matrix { my ($self, $val) = @_; if (defined $val) { $self->{_donor_matrix} = $val; } return $self->{_donor_matrix}; } sub acceptor_matrix { my ($self, $val) = @_; if (defined $val) { $self->{_acceptor_matrix} = $val; } return $self->{_acceptor_matrix}; } sub prediction_exon_adaptor { my ($self, $val) = @_; if (defined $val) { $self->{_pea} = $val; } return $self->{_pea}; } sub repeat_feature_adaptor { my ($self, $val) = @_; if (defined $val) { $self->{_rfa} = $val; } return $self->{_rfa}; } sub gene_slice_adaptor { my ($self, $val) = @_; if (defined $val) { $self->{_gsa} = $val; } return $self->{_gsa}; } sub intron_slice_adaptor { my ($self, $val) = @_; if (defined $val) { $self->{_isa} = $val; } return $self->{_isa}; } sub feature_cash { my ($self, $val) = @_; if (defined $val) { $self->{_feature_cash} = $val; } return $self->{_feature_cash}; } sub prelim_genes { my ($self, $val) = @_; if (defined $val) { $self->{_prelim_genes} = $val; } return $self->{_prelim_genes}; } sub repeats { my ($self, $val) = @_; if (defined $val) { $self->{_repeats} = $val; } return $self->{_repeats}; } sub chr_slice { my ($self, $val) = @_; if (defined $val) { $self->{_chr_slice} = $val; } return $self->{_chr_slice}; } #################################### # config variable holders #################################### sub INTRON_DB { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_INTRON_DB'} = $value; } if (exists($self->{'_CONFIG_INTRON_DB'})) { return $self->{'_CONFIG_INTRON_DB'}; } else { return undef; } } sub OUTPUT_DB { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_OUTPUT_DB'} = $value; } if (exists($self->{'_CONFIG_OUTPUT_DB'})) { return $self->{'_CONFIG_OUTPUT_DB'}; } else { return undef; } } sub MODEL_DB { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_MODEL_DB'} = $value; } if (exists($self->{'_CONFIG_MODEL_DB'})) { return $self->{'_CONFIG_MODEL_DB'}; } else { return undef; } } sub DONOR_MATRIX { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_DONOR_MATRIX'} = $value; } if (exists($self->{'_CONFIG_DONOR_MATRIX'})) { return $self->{'_CONFIG_DONOR_MATRIX'}; } else { return undef; } } sub ACCEPTOR_MATRIX { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_ACCEPTOR_MATRIX'} = $value; } if (exists($self->{'_CONFIG_ACCEPTOR_MATRIX'})) { return $self->{'_CONFIG_ACCEPTOR_MATRIX'}; } else { return undef; } } sub LOGICNAME { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_LOGICNAME'} = $value; } if (exists($self->{'_CONFIG_LOGICNAME'})) { return $self->{'_CONFIG_LOGICNAME'}; } else { return undef; } } sub ABINITIO_INTRONS { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_ABINITIO_INTRONS'} = $value; } if (exists($self->{'_CONFIG_ABINITIO_INTRONS'})) { return $self->{'_CONFIG_ABINITIO_INTRONS'}; } else { return undef; } } sub MIN_INTRON_SIZE { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_MIN_INTRON_SIZE'} = $value; } if (exists($self->{'_CONFIG_MIN_INTRON_SIZE'})) { return $self->{'_CONFIG_MIN_INTRON_SIZE'}; } else { return undef; } } sub BEST_SCORE { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_BEST_SCORE'} = $value; } if (exists($self->{'_CONFIG_BEST_SCORE'})) { return $self->{'_CONFIG_BEST_SCORE'}; } else { return undef; } } sub BEST_INTRONS { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_BEST_INTRONS'} = $value; } if (exists($self->{'_CONFIG_BEST_INTRONS'})) { return $self->{'_CONFIG_BEST_INTRONS'}; } else { return undef; } } sub OTHER_ISOFORMS { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_OTHER_ISOFORMS'} = $value; } if (exists($self->{'_CONFIG_OTHER_ISOFORMS'})) { return $self->{'_CONFIG_OTHER_ISOFORMS'}; } else { return undef; } } 1;