Raw content of Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa.

Raw content of Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa =pod =head1 NAME Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa =head1 SYNOPSIS my $runnableDB = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa->new( -db => $refdb, -analysis => $analysis_obj, ); $runnableDB->fetch_input(); $runnableDB->run(); $runnableDB->write_output(); #writes to DB =head1 DESCRIPTION Extends Bio::EnsEMBL::Analysis::RunnableDB::ExonerateAlignFeature to allow use of compressed dna align features, useful when aligning millions of short Solexa reads =head1 CONTACT Post general queries to B<ensembl-dev@ebi.ac.uk> =head1 APPENDIX =cut package Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa; use strict; use Bio::EnsEMBL::Utils::Exception qw(throw warning); use Bio::EnsEMBL::Analysis::RunnableDB; use Bio::EnsEMBL::Analysis::RunnableDB::ExonerateAlignFeature; use Bio::EnsEMBL::Analysis::Config::GeneBuild::ExonerateSolexa; use Bio::EnsEMBL::Analysis::Tools::Utilities; use vars qw(@ISA); @ISA = qw (Bio::EnsEMBL::Analysis::RunnableDB::ExonerateAlignFeature); sub new { my ( $class, @args ) = @_; my $self = $class->SUPER::new(@args); $self->read_and_check_config($EXONERATE_SOLEXA_CONFIG_BY_LOGIC); return $self; } sub run { my ($self) = @_; # do all the normal stuff $self->SUPER::run(); # filter the results carefully to only allow strict matches my $filtered_features = $self->filter_solexa($self->output); # Pair features together if they come from paired end reads if ( $self->PAIREDEND ) { my $paired_features = $self->pair_features($filtered_features); # clear the output array to load in the modified features $self->{'output'} = []; $self->output($paired_features); } } sub filter_solexa { my ($self,$features_ref) = @_; my @features = @$features_ref; my @filtered_features; # allow no more than MISSMATCH missmatches and # dont allow gapping... foreach my $feat ( @features ) { # dont allow gapping if ( $feat->cigar_string =~ /[ID]/ ) { # gapped reject it next } # check missmatches if ( $self->MISSMATCH ) { my $aligned_length = abs($feat->hend - $feat->hstart) +1; # print $feat->hseqname ." "; # print $feat->percent_id . " " ; # print " aligned length = $aligned_length "; my $matches = $aligned_length * $feat->percent_id / 100; # print " matches $matches "; my $missmatches = ( $aligned_length - $matches) / $aligned_length * 100; # print " missmatches $missmatches \n"; next if $missmatches > $self->MISSMATCH; # print "accepting\n"; } push @filtered_features, $feat; } return \@filtered_features; } sub pair_features { my ($self,$features_ref) = @_; my @features = @$features_ref; my $paired_features; my @selected_reads; foreach my $feat ( @features ) { if ( $feat->hseqname =~ /(\S+):([a,b,A,B])$/ ) { my $suffix = lc($2); push @{$paired_features->{$1}->{$suffix}},$feat; } else { $self->throw("Read name not recognised " . $feat->hseqname . "\n"); } } foreach my $pair ( keys %$paired_features ) { my ( @as, @bs ) ; @as = @{$paired_features->{$pair}->{'a'}} if $paired_features->{$pair}->{'a'}; @bs = @{$paired_features->{$pair}->{'b'}} if $paired_features->{$pair}->{'b'}; my @potential_pairs; # Pick the potential paired reads bases on chr positon foreach my $afeat ( @as ) { READ: foreach my $bfeat ( @bs ) { if ( $afeat->seqname eq $bfeat->seqname ) { # They lie on the same chr within PAIREDGAP bases of each other # but they don't overlap if ( ( $afeat->start > $bfeat->end && $afeat->start - $bfeat->end <= $self->PAIREDGAP ) or ( $bfeat->start > $afeat->end && $bfeat->start - $afeat->end <= $self->PAIREDGAP ) ) { # they should be oriented pointing towards each other on # opposite strands... if ( $afeat->strand == 1 && $bfeat->strand == 1 && $afeat->hstrand != $bfeat->hstrand ) { # are they in the right order # on the forward strand a should come before b # and vice versa next READ unless ( ($afeat->hstrand == 1 && $afeat->end < $bfeat->start) or ($afeat->hstrand == -1 && $bfeat->end < $afeat->start) ); } # storing the pair using the score as the index # so I can easily pull out the highest scoring pair # if they score the SAME then choose the pair that is # closest together my $combined_score = $afeat->score + $bfeat->score; # work out the lengths the pairs are separated by if ( $potential_pairs[$combined_score] ) { my $new_length; my $old_length; my $old_afeat = $potential_pairs[$combined_score]->[0]; my $old_bfeat = $potential_pairs[$combined_score]->[1]; if ( $afeat->start > $bfeat->end ) { $new_length = $afeat->start - $bfeat->end; } else { $new_length = $bfeat->start - $afeat->end; } if ( $old_afeat->start > $old_bfeat->end ) { $old_length = $old_afeat->start - $old_bfeat->end; } else { $old_length = $old_bfeat->start - $old_afeat->end; } # only replace the pair if the new pair with the same combined score # are closer together ( hopefully this might stop clusters getting # tangled up if ( $old_length > $new_length ) { $potential_pairs[$combined_score] = [($afeat,$bfeat)] ; } } else { $potential_pairs[$combined_score] = [($afeat,$bfeat)] ; } } } } } if ( scalar(@potential_pairs) >= 1 ){ # lets join together the highest scoring pair of reads to make one new feature push @selected_reads, @{$self->merge_pair(pop @potential_pairs)} if scalar(@potential_pairs) >= 1; } else { if ( scalar(@as) >= 1 ) { @as = sort { $a->score <=> $b->score } @as ; my $selected = pop @as; if ( $selected->hseqname =~ /(\S+):([A])$/ ) { $selected->hseqname($1 .":a3p"); } push @selected_reads, $selected; } if ( scalar(@bs) >= 1 ) { @bs = sort { $a->score <=> $b->score } @bs ; my $selected = pop @bs; if ( $selected->hseqname =~ /(\S+):([B])$/ ) { $selected->hseqname($1 .":b3p"); } push @selected_reads, $selected; } } } return \@selected_reads; } sub merge_pair { my ( $self, $read_pair) = @_; my @ugfs; my @features; if ( $self->PAIR ) { # Use the combined scores and identity of the reads for the new read my $score = $read_pair->[0]->score + $read_pair->[1]->score; my $pid = ( $read_pair->[0]->percent_id + $read_pair->[1]->percent_id ) /2; # Because the features are split you can get the second half of # the read aligning before the first which can screw up the # hit start ends as you might expect # I am going to flip a and b over if ( $read_pair->[1]->end <= $read_pair->[0]->start ) { my $tmp = $read_pair->[0]; $read_pair->[0] = $read_pair->[1]; $read_pair->[1] = $tmp; } for ( my $i = 0 ; $i <=1 ; $i++ ) { my $read = $read_pair->[$i]; if ( $read->hseqname =~ /(\S+):([a,b])$/ ) { $read->hseqname($1); } if ( $read->hseqname =~ /(\S+):([A,B])$/ ) { $read->hseqname($1 .":3p"); } foreach my $ugf ( $read->ungapped_features ) { # need to modify the 'b' read hit starts ends to make it # appear to come from one long read # neeed to flip it onto the forward strand or else # the dna align feature will fall over my $hstart = $ugf->hstart ; my $hend = $ugf->hend ; if ( $i == 1 ) { if ( $ugf->hstrand == -1 ) { $ugf->hstrand(1); } } else { $ugf->hstrand(1) if $ugf->hstrand == -1; } $ugf->score($score); $ugf->percent_id($pid); push @ugfs, $ugf; } } my $feat = new Bio::EnsEMBL::DnaDnaAlignFeature(-features => \@ugfs); $feat->analysis($self->analysis); $feat->hstart($read_pair->[0]->hstart); $feat->hend($read_pair->[0]->hend+$read_pair->[1]->hend); push @features,$feat; } else { $read_pair->[0]->hseqname($read_pair->[0]->hseqname . ":aa"); $read_pair->[1]->hseqname($read_pair->[1]->hseqname . ":bb"); push @features,($read_pair->[0],$read_pair->[1]); } # Otherwise leave them as they are but change the a and b suffix to indictate that they have # been paired but are not joined together. return \@features; } =head2 write_output Arg [1] : array ref of Bio::EnsEMBL::DnaDnaAlignFeature Function : Overrides the write_output method from the superclass to allow use of compressed DnaAlignFeatures Returntype: 1 Exceptions: Throws if the feature cannot be stored =cut sub write_output { my ( $self, @output ) = @_; # Flag set to 1 = return a pipeline adaptor my $outdb; my $fa; if ( $self->COMPRESSION ) { # Flag set to 1 = return a pipeline adaptor # Remember you need to have the pipeline_tables in your OUT_DB if you like to use compression $outdb = $self->get_dbadaptor($self->OUT_DB,1); $fa = $outdb->get_CompressedDnaAlignFeatureAdaptor; } else { # return a NORMAL adaptor NOT a pipeline adaptor ... $outdb = $self->get_dbadaptor($self->OUT_DB); $fa = $outdb->get_DnaAlignFeatureAdaptor; } foreach my $f (@{$self->output}){ # calculate the hcoverage and use the evalue feild to store the # depth of the features $f->p_value(1) if $self->COMPRESSION; eval{ $fa->store($f); }; if ($@) { $self->throw("Unable to store DnaAlignFeature\n $@"); } } } ########################################################### # containers sub INTRON_MODELS { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_INTRON_MODELS'} = $value; } if (exists($self->{'_CONFIG_INTRON_MODELS'})) { return $self->{'_CONFIG_INTRON_MODELS'}; } else { return undef; } } sub INTRON_OVERLAP { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_INTRON_OVERLAP'} = $value; } if (exists($self->{'_CONFIG_INTRON_OVERLAP'})) { return $self->{'_CONFIG_INTRON_OVERLAP'}; } else { return undef; } } sub BIOTYPE { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_BIOTYPE'} = $value; } if (exists($self->{'_CONFIG_BIOTYPE'})) { return $self->{'_CONFIG_BIOTYPE'}; } else { return undef; } } sub TRANSDB { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_TRANSDB'} = $value; } if (exists($self->{'_CONFIG_TRANSDB'})) { return $self->{'_CONFIG_TRANSDB'}; } else { return undef; } } sub OUT_DB { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_OUT_DB'} = $value; } if (exists($self->{'_CONFIG_OUT_DB'})) { return $self->{'_CONFIG_OUT_DB'}; } else { return undef; } } sub COMPRESSION { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_COMPRESSION'} = $value; } if (exists($self->{'_CONFIG_COMPRESSION'})) { return $self->{'_CONFIG_COMPRESSION'}; } else { return undef; } } sub PAIREDEND { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_PAIREDEND'} = $value; } if (exists($self->{'_CONFIG_PAIREDEND'})) { return $self->{'_CONFIG_PAIREDEND'}; } else { return undef; } } sub PAIR { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_PAIR'} = $value; } if (exists($self->{'_CONFIG_PAIR'})) { return $self->{'_CONFIG_PAIR'}; } else { return undef; } } sub PROJECT { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_PROJECT'} = $value; } if (exists($self->{'_CONFIG_PROJECT'})) { return $self->{'_CONFIG_PROJECT'}; } else { return undef; } } sub PAIREDGAP { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_PAIREDGAP'} = $value; } if (exists($self->{'_CONFIG_PAIREDGAP'})) { return $self->{'_CONFIG_PAIREDGAP'}; } else { return undef; } } sub MISSMATCH { my ($self,$value) = @_; if (defined $value) { $self->{'_CONFIG_MISSMATCH'} = $value; } if (exists($self->{'_CONFIG_MISSMATCH'})) { return $self->{'_CONFIG_MISSMATCH'}; } else { return undef; } } 1;