ExonerateSolexa documentation.

Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa

No package variables defined.

my $runnableDB = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa->new(
-db => $refdb,
-analysis => $analysis_obj,
);
$runnableDB->fetch_input();
$runnableDB->run();
$runnableDB->write_output(); #writes to DB

Extends Bio::EnsEMBL::Analysis::RunnableDB::ExonerateAlignFeature to allow
use of compressed dna align features, useful when aligning millions of short
Solexa reads

sub BIOTYPE {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_BIOTYPE'} = $value;
  }
  
  if (exists($self->{'_CONFIG_BIOTYPE'})) {
    return $self->{'_CONFIG_BIOTYPE'};
  } else {
    return undef;
  }

}

sub COMPRESSION {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_COMPRESSION'} = $value;
  }
  
  if (exists($self->{'_CONFIG_COMPRESSION'})) {
    return $self->{'_CONFIG_COMPRESSION'};
  } else {
    return undef;
  }

}

sub INTRON_MODELS {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_INTRON_MODELS'} = $value;
  }
  
  if (exists($self->{'_CONFIG_INTRON_MODELS'})) {
    return $self->{'_CONFIG_INTRON_MODELS'};
  } else {
    return undef;
  }

}

sub INTRON_OVERLAP {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_INTRON_OVERLAP'} = $value;
  }
  
  if (exists($self->{'_CONFIG_INTRON_OVERLAP'})) {
    return $self->{'_CONFIG_INTRON_OVERLAP'};
  } else {
    return undef;
  }

}

sub MISSMATCH {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_MISSMATCH'} = $value;
  }
  
  if (exists($self->{'_CONFIG_MISSMATCH'})) {
    return $self->{'_CONFIG_MISSMATCH'};
  } else {
    return undef;
  }
}

1;

}

sub OUT_DB {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_OUT_DB'} = $value;
  }
  
  if (exists($self->{'_CONFIG_OUT_DB'})) {
    return $self->{'_CONFIG_OUT_DB'};
  } else {
    return undef;
  }

}

sub PAIR {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_PAIR'} = $value;
  }
  
  if (exists($self->{'_CONFIG_PAIR'})) {
    return $self->{'_CONFIG_PAIR'};
  } else {
    return undef;
  }

}

sub PAIREDEND {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_PAIREDEND'} = $value;
  }
  
  if (exists($self->{'_CONFIG_PAIREDEND'})) {
    return $self->{'_CONFIG_PAIREDEND'};
  } else {
    return undef;
  }

}

sub PAIREDGAP {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_PAIREDGAP'} = $value;
  }
  
  if (exists($self->{'_CONFIG_PAIREDGAP'})) {
    return $self->{'_CONFIG_PAIREDGAP'};
  } else {
    return undef;
  }

}

sub PROJECT {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_PROJECT'} = $value;
  }
  
  if (exists($self->{'_CONFIG_PROJECT'})) {
    return $self->{'_CONFIG_PROJECT'};
  } else {
    return undef;
  }

}

sub TRANSDB {

  my ($self,$value) = @_;

  if (defined $value) {
    $self->{'_CONFIG_TRANSDB'} = $value;
  }
  
  if (exists($self->{'_CONFIG_TRANSDB'})) {
    return $self->{'_CONFIG_TRANSDB'};
  } else {
    return undef;
  }

}

sub filter_solexa {

  my ($self,$features_ref) = @_;
  my @features = @$features_ref;
  my @filtered_features;
  #�allow no more than MISSMATCH missmatches and
  # dont allow gapping...
  foreach my $feat ( @features ) {
    #�dont allow gapping
    if ( $feat->cigar_string =~ /[ID]/ ) {
      # gapped reject it
      next
    }
   # check missmatches
    if ( $self->MISSMATCH ) {
      my $aligned_length = abs($feat->hend - $feat->hstart) +1;
     # print $feat->hseqname ." ";
     # print $feat->percent_id . " " ;
     # print " aligned length = $aligned_length ";
      my $matches = $aligned_length *  $feat->percent_id / 100;
     # print " matches $matches  ";
      my $missmatches = ( $aligned_length - $matches) / $aligned_length * 100;
     # print " missmatches $missmatches \n";
      next if $missmatches > $self->MISSMATCH;
     # print "accepting\n";
    }
   push @filtered_features, $feat;
  }
  return\@ filtered_features;

}

sub merge_pair {

  my ( $self, $read_pair) = @_;
  my @ugfs;
  my @features;
  if ( $self->PAIR ) {
    # Use the combined scores and identity  of the reads for the new read
    my $score = $read_pair->[0]->score +  $read_pair->[1]->score;
    my $pid = ( $read_pair->[0]->percent_id + $read_pair->[1]->percent_id ) /2;
    
    # Because the features are split you can get the second half of
    # the read aligning before the first which can screw up the 
    # hit start ends as you might expect
    # I am �going to flip a and b over
    if ( $read_pair->[1]->end <= $read_pair->[0]->start ) {
      my $tmp = $read_pair->[0];
      $read_pair->[0] = $read_pair->[1];
      $read_pair->[1] = $tmp;
      
    }
    
    for ( my $i = 0 ; $i <=1 ; $i++ ) {
      my $read = $read_pair->[$i];
     if ( $read->hseqname =~ /(\S+):([a,b])$/ ) {
     	$read->hseqname($1);
     }
      if ( $read->hseqname =~ /(\S+):([A,B])$/ ) {
     	$read->hseqname($1 .":3p");
     }    
      foreach my $ugf ( $read->ungapped_features ) {
	# need to modify the 'b' read hit starts ends to make it
	# appear to come from one long read
	# neeed to flip it onto the forward strand or else 
	# the dna align feature will fall over
	my $hstart = $ugf->hstart ;
	my $hend   = $ugf->hend ;
	if ( $i == 1 ) {
	  if ( $ugf->hstrand == -1 ) {
	    $ugf->hstrand(1);
	  }
	} else {
	  $ugf->hstrand(1) if $ugf->hstrand == -1;
	}
	$ugf->score($score);
	$ugf->percent_id($pid);
	push @ugfs, $ugf;
      }
    }
    my $feat = new Bio::EnsEMBL::DnaDnaAlignFeature(-features =>\@ ugfs);
    $feat->analysis($self->analysis);
    $feat->hstart($read_pair->[0]->hstart);
    $feat->hend($read_pair->[0]->hend+$read_pair->[1]->hend);
    push @features,$feat;
  } else {  
    $read_pair->[0]->hseqname($read_pair->[0]->hseqname . ":aa");
    $read_pair->[1]->hseqname($read_pair->[1]->hseqname . ":bb");
    push @features,($read_pair->[0],$read_pair->[1]);
  }
  # Otherwise leave them as they are but change the a and b suffix to indictate that they have
  # been paired but are not joined together.
  return\@ features;

}

sub new {

  my ( $class, @args ) = @_;
  my $self = $class->SUPER::new(@args);

  $self->read_and_check_config($EXONERATE_SOLEXA_CONFIG_BY_LOGIC);

  return $self;

}

sub pair_features {

  my ($self,$features_ref) = @_;
  my @features = @$features_ref;
  my $paired_features;
  my @selected_reads;
  foreach my $feat ( @features ) {
    if ( $feat->hseqname =~ /(\S+):([a,b,A,B])$/ ) {
      my $suffix = lc($2);
      push @{$paired_features->{$1}->{$suffix}},$feat;
    } else {
      $self->throw("Read name not recognised " . $feat->hseqname . "\n");
    }
  }

  foreach my $pair ( keys %$paired_features ) {
    my ( @as, @bs ) ;
    @as =  @{$paired_features->{$pair}->{'a'}} if $paired_features->{$pair}->{'a'};
    @bs =  @{$paired_features->{$pair}->{'b'}} if $paired_features->{$pair}->{'b'};
    my @potential_pairs;
    # Pick the potential paired reads bases on chr positon
    foreach my $afeat ( @as ) {
    READ:  foreach my $bfeat ( @bs ) {
	if ( $afeat->seqname eq $bfeat->seqname ) {
	  # They lie on the same chr within PAIREDGAP bases of each other
	  # but they don't overlap
	  if ( ( $afeat->start > $bfeat->end && 
		 $afeat->start - $bfeat->end <= $self->PAIREDGAP ) or 
	       ( $bfeat->start > $afeat->end && 
		 $bfeat->start - $afeat->end <= $self->PAIREDGAP ) ) {
	    # they should be oriented pointing towards each other on
	    # opposite strands...
	    if ( $afeat->strand == 1 && $bfeat->strand == 1
	       && $afeat->hstrand != $bfeat->hstrand ) {
	      #�are they in the right order
	      #�on the forward strand a should come before b
	      # and vice versa
	      next READ unless 
		( 
		 ($afeat->hstrand == 1 && $afeat->end < $bfeat->start) or 
		 ($afeat->hstrand == -1 && $bfeat->end < $afeat->start) 
		);
	    }
	    # storing the pair using the score as the index
	    # so I can easily pull out the highest scoring pair
	    # if they score the SAME then choose the pair that is
	    # closest together 
	    my $combined_score = $afeat->score + $bfeat->score;
	    # work out the lengths the pairs are separated by
	    if ( $potential_pairs[$combined_score] ) {
	      my $new_length;
	      my $old_length;
	      my $old_afeat = $potential_pairs[$combined_score]->[0];
	      my $old_bfeat = $potential_pairs[$combined_score]->[1];
	      if ( $afeat->start > $bfeat->end ) {
		$new_length = $afeat->start - $bfeat->end;
	      } else {
		$new_length = $bfeat->start - $afeat->end;
	      }
	      if ( $old_afeat->start > $old_bfeat->end ) {
		$old_length = $old_afeat->start - $old_bfeat->end;
	      } else {
		$old_length = $old_bfeat->start - $old_afeat->end;
	      }
	      # only replace the pair if the new pair with the same combined score
	      # are closer together ( hopefully this might stop clusters getting
	      # tangled up
	      if ( $old_length > $new_length ) {
		$potential_pairs[$combined_score] = [($afeat,$bfeat)] ;
	      }
	    } else {
	      $potential_pairs[$combined_score] = [($afeat,$bfeat)] ;
	    }
	  }
	}
      }
    }
    if ( scalar(@potential_pairs) >= 1 ){
      # lets join together the highest scoring pair of reads to make one new feature
      push @selected_reads, @{$self->merge_pair(pop @potential_pairs)} if scalar(@potential_pairs) >= 1;
    } else {
      if ( scalar(@as) >= 1 ) {
	@as = sort { $a->score <=> $b->score } @as ;
	my $selected = pop @as;
	if ( $selected->hseqname =~ /(\S+):([A])$/ ) {
	  $selected->hseqname($1 .":a3p");
	}
	push @selected_reads, $selected;
      }
      if ( scalar(@bs) >= 1 ) {    
	@bs = sort { $a->score <=> $b->score } @bs ;
	my $selected = pop @bs;
	if ( $selected->hseqname =~ /(\S+):([B])$/ ) {
	  $selected->hseqname($1 .":b3p");
	}
	push @selected_reads, $selected;
      }
    }
  }
  return\@ selected_reads;

}

sub run {

  my ($self) = @_;
 # do all the normal stuff
  $self->SUPER::run();
  # filter the results carefully to only allow strict matches
  my $filtered_features = $self->filter_solexa($self->output);
  # Pair features together if they come from paired end reads
  if ( $self->PAIREDEND ) {
    my $paired_features = $self->pair_features($filtered_features);
    # clear the output array to load in the modified features
    $self->{'output'} = [];
    $self->output($paired_features);
  }

}

sub write_output {

  my ( $self, @output ) = @_;
  # Flag set to 1 = return a pipeline adaptor
  my $outdb; 

  my $fa;
  if ( $self->COMPRESSION ) {
    # Flag set to 1 = return a pipeline adaptor 
    # Remember you need to have the pipeline_tables in your OUT_DB if you like to use compression 
    $outdb = $self->get_dbadaptor($self->OUT_DB,1);
    $fa = $outdb->get_CompressedDnaAlignFeatureAdaptor;
  } else { 
    # return a NORMAL adaptor NOT a pipeline adaptor ...
    $outdb = $self->get_dbadaptor($self->OUT_DB);
    $fa = $outdb->get_DnaAlignFeatureAdaptor;
  } 

  
  foreach my $f (@{$self->output}){
    # calculate the hcoverage and use the evalue feild to store the
    # depth of the features
    
    $f->p_value(1) if $self->COMPRESSION;

    eval{
      $fa->store($f);
    };
    if ($@) {
      $self->throw("Unable to store DnaAlignFeature\n $@");
    }
  }
}

###########################################################
# containers

}

General documentation

Post general queries to ensembl-dev@ebi.ac.uk

BIOTYPE	No description	Code
COMPRESSION	No description	Code
INTRON_MODELS	No description	Code
INTRON_OVERLAP	No description	Code
MISSMATCH	No description	Code
OUT_DB	No description	Code
PAIR	No description	Code
PAIREDEND	No description	Code
PAIREDGAP	No description	Code
PROJECT	No description	Code
TRANSDB	No description	Code
filter_solexa	No description	Code
merge_pair	No description	Code
new	No description	Code
pair_features	No description	Code
run	No description	Code
write_output	Description	Code