Raw content of Bio::EnsEMBL::Analysis::RunnableDB::Solexa2Genes.

Raw content of Bio::EnsEMBL::Analysis::RunnableDB::Solexa2Genes # Cared for by Ensembl # # Copyright GRL & EBI # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::Solexa2Genes =head1 SYNOPSIS my $db = Bio::EnsEMBL::DBAdaptor->new($locator); my $refine_genes = Bio::EnsEMBL::Analysis::RunnableDB::Solexa2Genes->new ( -db => $db, -input_id => $input_id -analysis => $analysis ); $refine_genes->fetch_input(); $refine_genes->run(); $refine_genes->write_output(); #writes to DB =head1 DESCRIPTION The databases containing the various features to combine is defined in Bio::EnsEMBL::Analysis::Config::Databases and the configuration for the module is defined in Bio::EnsEMBL::Analysis::Config::GeneBuild::Solexa2Genes =cut package Bio::EnsEMBL::Analysis::RunnableDB::Solexa2Genes; use strict; use Bio::EnsEMBL::Analysis::RunnableDB; use Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild; use Bio::EnsEMBL::Analysis::Config::GeneBuild::Solexa2Genes; use Bio::EnsEMBL::SimpleFeature; use Bio::EnsEMBL::FeaturePair; use Bio::EnsEMBL::DnaDnaAlignFeature; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils ; use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils ; use Bio::EnsEMBL::Transcript; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use vars qw(@ISA); @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB::BaseGeneBuild); sub new{ my ($class,@args) = @_; my $self = $class->SUPER::new(@args); $self->read_and_check_config($SOLEXA2GENES_CONFIG_BY_LOGIC); return $self; } =head2 fetch_input Title : fetch_input Usage : $self->fetch_input Returns : nothing Args : none =cut sub fetch_input { my ($self) = @_; # store some adaptors $self->repeat_feature_adaptor($self->db->get_RepeatFeatureAdaptor); $self->feature_slice_adaptor($self->get_dbadaptor($self->ALIGNMENT_DB)->get_SliceAdaptor); $self->repeat_slice_adaptor($self->db->get_SliceAdaptor); my $id = $self->input_id; my $slice = $self->fetch_sequence($id); my $feature_slice = $self->feature_slice_adaptor->fetch_by_region ( 'toplevel', $slice->seq_region_name, $slice->start, $slice->end, ); my $chr_slice = $self->feature_slice_adaptor->fetch_by_region('toplevel', $slice->seq_region_name, ); $self->chr_slice($chr_slice); my @features = @{$feature_slice->get_all_DnaAlignFeatures}; my %reads; while ( scalar(@features) > 0 ) { my $read = pop(@features); $reads{$read->hseqname} = $read->transfer($chr_slice); } # store the reads internally $self->reads(\%reads); } sub run { my ($self) =@_; my %reads = %{$self->reads}; my @genes; print STDERR "Found " . scalar(keys %reads) . " reads in total\nRunning initial clustering..."; # Do an intial clustering to group together the read pairs # should roughly correspond to genes my $gene_clusters = $self->gene_cluster; print STDERR "Found " . scalar(@$gene_clusters) . " clusters\n"; # take one cluster of reads at a time foreach my $gene_cluster ( @$gene_clusters ) { # break the gene clusters down into exon clusters # linked by the pair information my ($exon_clusters) = $self->exon_cluster($gene_cluster); next unless ( $exon_clusters ); # Now we have collapsed our reads we need to make sure we keep the connections between # them so we can make our fake transcripts print STDERR "Found " . scalar(@$exon_clusters) . " transcripts \n"; foreach my $exon_cluster ( @$exon_clusters ) { #print STDERR scalar(@$exon_cluster) ." exon clusters\n"; next unless scalar(@$exon_cluster) > 0; # make the dna align feature my $padded_exons = $self->pad_exons($exon_cluster); push @genes , $self->make_gene($padded_exons) if $padded_exons ; } } # merge genes separated by long repeats if ( $self->MERGE_GENES ) { my $merged_genes = $self->merge_genes(\@genes); $self->output($merged_genes); } else { $self->output(\@genes); } } sub write_output{ my ($self) = @_; my @genes = @{$self->output}; my $outdb = $self->get_dbadaptor($self->OUTPUT_DB); my $gene_adaptor = $outdb->get_GeneAdaptor; my $fails = 0; my $total = 0; foreach my $gene ( @genes ) { $gene->analysis($self->analysis); $gene->source($self->analysis->logic_name); $gene->biotype('rough'); my $tran = $gene->get_all_Transcripts->[0]; print "FILTERING " . $tran->start ." " , $tran->end ." "; # Filter models before writing them if ( scalar(@{$tran->get_all_Exons}) < $self->MIN_EXONS ) { print "Rejecting because of exon count " . scalar(@{$tran->get_all_Exons}) ."\n"; next; } if( ( $tran->end - $tran->start ) / $tran->length <= $self->MIN_SPAN ) { print "Rejecting because of span " . ( $tran->end - $tran->start ) / $tran->length ."\n"; next; } if ( $tran->length > $self->MIN_LENGTH ){ print "Rejecting because of length " . $tran->length ."\n"; } eval { $gene_adaptor->store($gene); }; if ($@){ warning("Unable to store gene!!\n$@"); $fails++; } $total++; } if ($fails > 0) { throw("Not all genes could be written successfully " . "($fails fails out of $total)"); } print STDERR "$total genes written after filtering\n"; } =head2 exon_cluster Title : exon_cluster Usage : $self->exon_cluster($ugfs) Returns : Array ref Bio::EnsEMBL::Exon Args : Array ref Bio::EnsEMBL::Exon Description : clusters individual reads into blocks representing exons : uses pair information to link blocks into transcripts : filters out poorly supported blocks =cut sub exon_cluster { my ($self,$gene_cluster) = @_; my @gapped_reads = @{$gene_cluster->{'features'}}; my @ugfs; # get the ungapped features ( corresponds to individual reads foreach my $read ( @gapped_reads ) { foreach my $ugf ($read->ungapped_features) { push @ugfs, $ugf; } } my $reads = $self->reads; my @exon_clusters; my $exon_cluster_num = 0; my $clean_exon_clusters; my @final_exon_clusters; my @transcripts; my @ungapped_features = sort { $a->start <=> $b->start } @ugfs; my $cluster_data; my $cluster_hash; my $pairs; # make exon clusters and store the names of the reads and associated cluster number foreach my $ugf ( @ungapped_features ) { my $clustered = 0; foreach (my $i = 0 ; $i < scalar(@exon_clusters) ; $i++ ) { my $exon_cluster = $exon_clusters[$i]; # do they overlap? if ( $ugf->start <= $exon_cluster->end+1 && $ugf->end >= $exon_cluster->start-1 ) { # Expand the exon_cluster $exon_cluster->start($ugf->start) if $ugf->start < $exon_cluster->start; $exon_cluster->end($ugf->end) if $ugf->end > $exon_cluster->end; $exon_cluster->score($exon_cluster->score + 1); $clustered = 1; $cluster_data->{$ugf->hseqname}->{$i} = 1; } } # start a new cluster if there is no overlap unless ( $clustered ) { $cluster_data->{$ugf->hseqname}->{$exon_cluster_num} = 1 ; $ugf->score(1); push @exon_clusters, $ugf; $exon_cluster_num++; } } if ( scalar(@exon_clusters) == 1 ) { # just keep single exon clusters for now - might be useful later my $exon = $exon_clusters[0]; push @transcripts, [$exon]; return \@transcripts; } print STDERR scalar(@exon_clusters) . " Exons\n"; # make the exon pairings store them in cluster hash foreach my $read ( keys %{$cluster_data} ) { my @clusters = keys %{$cluster_data->{$read}}; for (my $i = 0; $i < @clusters ; $i ++ ) { my $cluster = $clusters[$i]; # adjust for compressed reads my $read_depth = 1; $read_depth = $reads->{$read}->p_value if $reads->{$read}->p_value; $cluster_hash->{$clusters[0]}->{$cluster} += $read_depth unless $clusters[0] == $cluster; } } # join the exon_clusters so that terminal exons get penalised # but put in all the other exons my $clean_cluster_hash; my $average_read_depth = 0; foreach my $key ( sort keys %{$cluster_hash} ) { # which exon_clusters is it connected to ? # is it an end exon? ie only connects to exon_clusters on one side? foreach my $key2 ( keys %{$cluster_hash->{$key}} ){ # how many reads join these clusters to the right and the left? # initialise read count first - stop error messages my $read_count = $cluster_hash->{$key}->{$key2} ; $clean_cluster_hash->{$key}->{'right'}+= $read_count ; $clean_cluster_hash->{$key2}->{'left'}+= $read_count ; $average_read_depth += $read_count; print "READ DEPTH $key $key2 $read_count\n"; # dont count single coverage pairs when # working out averages $pairs++ if $read_count >1; } } # whats the average coverage of the exons? $average_read_depth /= $pairs if $pairs; print STDERR "Average READ DEPTH $average_read_depth \n"; # Filter out pairings with very litte support foreach my $key ( sort keys %{$cluster_hash} ) { foreach my $key2 ( keys %{$cluster_hash->{$key}} ){ if ($cluster_hash->{$key}->{$key2} < ($average_read_depth/100)*$self->EXCLUDE_EXONS ) { print "Removing pair $key $key2 with a read_depth of ".$cluster_hash->{$key}->{$key2} ."\n"; $cluster_hash->{$key}->{$key2} = 0; } } } # now lets cycle through our exon_clusters removing end exons # with little support all other exons go forward to the next stage # add in single exon genes might be useful later foreach my $key ( sort keys %{$clean_cluster_hash} ) { # keep them if they are supported on both sides if ( $clean_cluster_hash->{$key}->{'left'} && $clean_cluster_hash->{$key}->{'right'} ) { $clean_exon_clusters->{$key} = 1; next; } # is it a well supported left end exon? if ( $clean_cluster_hash->{$key}->{'left'} && $clean_cluster_hash->{$key}->{'left'} >= $self->END_EXON_COVERAGE) { $clean_exon_clusters->{$key} = 1 ; next; } # is it a well supported right end exon? if ( $clean_cluster_hash->{$key}->{'right'} && $clean_cluster_hash->{$key}->{'right'} >= $self->END_EXON_COVERAGE ) { $clean_exon_clusters->{$key} = 1 ; next; } # othwise ignore it } # now need to find little clusters sitting in introns that are not connected to the transcript # do a reclustering based on which exons are connected to each other my @clean_clusters = keys %$clean_exon_clusters; return unless ( scalar(@clean_clusters) > 0) ; # put one exon into the first cluster my @temp; push @temp, pop(@clean_clusters); push @final_exon_clusters, \@temp; my $trans_count = 0; LOOP: while ( scalar(@clean_clusters) > 0 ) { my $clustered; my $final_exon_cluster = $final_exon_clusters[$trans_count]; foreach my $cluster_num ( @{$final_exon_cluster} ) { $clustered = 0; # do ANY of our exons join to this exon? for ( my $i =0 ; $i <= $#clean_clusters; $i++ ) { my $index = $clean_clusters[$i]; # is the current exon attached to any exon in our cluster? if ( $cluster_hash->{$index}->{$cluster_num} or $cluster_hash->{$cluster_num}->{$index}) { push @{$final_exon_cluster}, $index; # chop it out splice(@clean_clusters,$i,1); $i--; $clustered = 1; } } } unless ($clustered) { next unless scalar(@clean_clusters) > 0; my @temp; push @temp, pop(@clean_clusters); # start another cluster push @final_exon_clusters, \@temp; $trans_count++; } } # So far we have just dealt with array indecies # now store the actual features foreach my $exon_cluster ( @final_exon_clusters ) { my @transcript; foreach my $exon ( @$exon_cluster ) { push @transcript, $exon_clusters[$exon]; } @transcript = sort { $a->start <=> $b->start} @transcript; push @transcripts, \@transcript; } return \@transcripts; } =head2 pad_exons Title : pad_exons Usage : $self->($exon_clusters) Returns : Array ref of Bio::EnsEMBL::Exon Args : Array ref of Bio::EnsEMBL::Exon Description : Takes an array of Exons, pads them and builds a : DnaAlignFeature from it that represents a transcript =cut sub pad_exons { my ($self,$exon_cluster_ref) = @_; my @padded_exons; my @exon_clusters = sort { $a->start <=> $b->start } @$exon_cluster_ref; # make a padded exon array foreach my $exon ( @exon_clusters ){ my $padded_exon = create_Exon ( $exon->start - 20, $exon->end + 20 , -1, -1, -1, $exon->analysis, undef, undef, $self->chr_slice, ); # dont let it fall of the slice because of padding $padded_exon->start(1) if $padded_exon->start <= 0; $padded_exon->end($self->chr_slice->length - 1) if $padded_exon->end >= $self->chr_slice->length; my $feat = new Bio::EnsEMBL::DnaDnaAlignFeature (-slice => $exon->slice, -start => $padded_exon->start, -end => $padded_exon->end, -strand => -1, -hseqname => $exon->display_id, -hstart => 1, -hstrand => 1, -hend => $padded_exon->length, -analysis => $exon->analysis, -score => $exon->score, -cigar_string => $padded_exon->length.'M'); my @feats; push @feats,$feat; $padded_exon->add_supporting_features(@feats); push @padded_exons, $padded_exon; } # dont let adjacent exons overlap for ( my $i = 1 ; $i <= $#padded_exons; $i++ ) { my $exon = $padded_exons[$i]; my $last_exon = $padded_exons[$i-1]; if ( $last_exon->end >= $exon->start ) { # trim the exons so they dont overlap my $trim = int(($exon->start - $last_exon->end) /2); $last_exon->end( $last_exon->end + ($trim) -1 ); $exon->start( $exon->start - ($trim)+1 ); } } return \@padded_exons } =head2 simple_cluster Title : simple_cluster Usage : $self->($reads) Returns : Hash ref of clustered reads Args : None Description : Does a simple clustering of read pairs to identify : groups of pairs that correspond to genes =cut sub gene_cluster { my ($self) = @_; my $reads = $self->reads; my @clusters; my @features = sort { $a->start <=> $b->start } values %$reads ; foreach my $f ( @features ) { my $clustered = 0; foreach my $cluster ( @clusters ) { # do they overlap? if ( $f->start <= $cluster->{'end'} && $f->end >= $cluster->{'start'} ) { # Expand the cluster $cluster->{'start'} = $f->start if $f->start < $cluster->{'start'}; $cluster->{'end'} = $f->end if $f->end > $cluster->{'end'}; push @{$cluster->{'features'}}, $f; $clustered = 1; } } unless ($clustered) { my $cluster; push @{$cluster->{'features'}}, $f; $cluster->{'start'} = $f->start; $cluster->{'end'} = $f->end; push @clusters, $cluster; } } return \@clusters; } =head2 make_gene Title : make_gene Usage : $self->make_gene($exons) Returns : Array ref of Bio::EnsEMBL::Gene objects Args : Array ref of Bio::EnsEMBL::Exon objects Description : Builds gene models from an array of exons =cut sub make_gene { my ($self,$exon_ref) = @_; # we are making reverse strand genes so the exons need to be in the reverse order my @exons = sort { $b->start <=> $a->start } @$exon_ref; my $tran = new Bio::EnsEMBL::Transcript(-EXONS => \@exons); $tran->analysis($self->analysis); return @{convert_to_genes(($tran),$self->analysis)}; } =head2 merge_genes Title : merge_genes Usage : $self->make_genes($genes) Returns : Array ref of Bio::EnsEMBL::Gene objects Args : Array ref of Bio::EnsEMBL::Gene objects Description : Merges gene models when they are separated : by a gap of <= MERGE_GENES and the gap is 100% : covered by repeats =cut sub merge_genes { my ($self, $genes_ref) = @_; my @merged_genes; # join together neighbouring genes separated by a gap of X # where the gap is 100% filled by a repeat # get rid of nested genes first my @genes = sort {$a->start <=> $b->start} @$genes_ref; print " Got ". scalar(@genes) ."\n"; for ( my $i = 1 ; $i <= $#genes ; $i++ ) { if ( $genes[$i-1]->start < $genes[$i]->start && $genes[$i-1]->end > $genes[$i]->end ) { push @merged_genes, splice (@genes,$i,1); $i--; } } # is the gap between the genes short enough? GENE: for ( my $i = 1 ; $i <= $#genes ; $i++ ) { my $left_gene = $genes[$i-1]; my $right_gene = $genes[$i]; my $gap = $right_gene->start - $left_gene->end; if ($gap <= $self->MERGE_GENES && $left_gene->end < $right_gene->start) { # is it covered by repeats? my $repeat_slice = $self->repeat_slice_adaptor->fetch_by_region ('toplevel', $left_gene->slice->seq_region_name, $left_gene->end, $right_gene->start, ); #print "looking at " . $repeat_slice->name ."\n"; # is the gap covered by a repeat? my @repeats = sort { $a->start <=> $b->start } @{$self->repeat_feature_adaptor->fetch_all_by_Slice($repeat_slice)} ; # merge repeat blocks #print scalar(@repeats) . " repeats found \n"; for ( my $j = 1 ; $j <= $#repeats ; $j ++ ) { if ( $repeats[$j]->start <= $repeats[$j-1]->end+1 ){ $repeats[$j-1]->end($repeats[$j]->end) if $repeats[$j]->end > $repeats[$j-1]->end ; splice(@repeats,$j,1); $j--; } } my $repeat_coverage = 0; # so the repeats are now non-overlapping blocks ( if there is more than one ) foreach my $repeat ( @repeats ) { $repeat->start(0) if $repeat->start < 0; $repeat->end($repeat_slice->length) if $repeat->end > $repeat_slice->length; $repeat_coverage+= $repeat->end - $repeat->start; } $repeat_coverage /= $repeat_slice->length; # merge the genes together where repeat coverage is 100% if ($repeat_coverage >= 0.95 ) { print "Do some merging\n"; print "Repeat Coverage = $repeat_coverage \n"; print $repeat_slice->name ."\n"; # to do the merge do we just link the genes or do we join them with a long exon? # how about we keep it as 2 exons but bring them together so they abut each other # if there is evidene of splicing they will get separated in the refine genes code # otherwise they will stay merged, given that we are looking at areas covered by repeat # it seems likely that the exons should read right through. # so we want to extend the last exon of the left gene to finish at the end # of the first exon of the right gene, we also want to lose the first right # gene exon my @merged_exons; my @left_exons = sort { $a->start <=> $b->start } @{$left_gene->get_all_Exons}; my @right_exons = sort { $a->start <=> $b->start } @{$right_gene->get_all_Exons}; my $merged_exon = pop(@left_exons); my $disgarded_exon = shift(@right_exons); $merged_exon->end($disgarded_exon->end); push @merged_exons,@left_exons; push @merged_exons,$merged_exon; push @merged_exons,@right_exons; my @new_genes = $self->make_gene(\@merged_exons); my $new_gene = $new_genes[0]; print "NEW GENE " . $new_gene->start ." ". $new_gene->end ."\n"; # take them out of the array and carry on splice (@genes,$i-1,2,$new_gene); $i-= 1; print "Merged 1 successfully\n"; } } } # add whatever is left to the return array push @merged_genes, @genes; print "Returning " . scalar (@merged_genes) . "\n"; return \@merged_genes; } ########################################### # Containers sub reads { my ($self, $value) = @_; if ($value ) { $self->{'_reads'} = $value; } return $self->{'_reads'}; } sub feature_slice_adaptor { my ($self, $val) = @_; if (defined $val) { $self->{_fsa} = $val; } return $self->{_fsa}; } sub repeat_slice_adaptor { my ($self, $val) = @_; if (defined $val) { $self->{_rsa} = $val; } return $self->{_rsa}; } sub repeat_feature_adaptor { my ($self, $val) = @_; if (defined $val) { $self->{_rfa} = $val; } return $self->{_rfa}; } sub chr_slice { my ($self, $val) = @_; if (defined $val) { $self->{_chr_slice} = $val; } return $self->{_chr_slice}; } ########################################## # Config variables sub ALIGNMENT_DB { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_ALIGNMENT_DB'} = $value; } if (exists($self->{'_CONFIG_ALIGNMENT_DB'})) { return $self->{'_CONFIG_ALIGNMENT_DB'}; } else { return undef; } } sub MIN_LENGTH{ my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_MIN_LENGTH'} = $value; } if (exists($self->{'_CONFIG_MIN_LENGTH'})) { return $self->{'_CONFIG_MIN_LENGTH'}; } else { return 0; } } sub MIN_EXONS { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_MIN_EXONS'} = $value; } if (exists($self->{'_CONFIG_MIN_EXONS'})) { return $self->{'_CONFIG_MIN_EXONS'}; } else { return 0; } } sub MIN_SPAN { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_MIN_SPAN'} = $value; } if (exists($self->{'_CONFIG_MIN_SPAN'})) { return $self->{'_CONFIG_MIN_SPAN'}; } else { return 0; } } sub END_EXON_COVERAGE { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_END_EXON_COVERAGE'} = $value; } if (exists($self->{'_CONFIG_END_EXON_COVERAGE'})) { return $self->{'_CONFIG_END_EXON_COVERAGE'}; } else { return 0; } } sub EXCLUDE_EXONS { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_EXCLUDE_EXONS'} = $value; } if (exists($self->{'_CONFIG_EXCLUDE_EXONS'})) { return $self->{'_CONFIG_EXCLUDE_EXONS'}; } else { return 0; } } sub OUTPUT_DB { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_OUTPUT_DB'} = $value; } if (exists($self->{'_CONFIG_OUTPUT_DB'})) { return $self->{'_CONFIG_OUTPUT_DB'}; } else { return undef; } } sub MERGE_GENES { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_MERGE_GENES'} = $value; } if (exists($self->{'_CONFIG_MERGE_GENES'})) { return $self->{'_CONFIG_MERGE_GENES'}; } else { return 0; } } 1;