Raw content of Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexaTranscript
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexaTranscript
=head1 SYNOPSIS
my $runnableDB = Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexaTranscript->new(
-db => $refdb,
-analysis => $analysis_obj,
);
$runnableDB->fetch_input();
$runnableDB->run();
$runnableDB->write_output(); #writes to DB
=head1 DESCRIPTION
Extends Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa to allow
reads to be aligned against transcript sequences and then projected
onto the genome
=head1 CONTACT
Post general queries to B
=head1 APPENDIX
=cut
package Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexaTranscript;
use strict;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Analysis::RunnableDB;
use Bio::EnsEMBL::Analysis::RunnableDB::ExonerateAlignFeature;
use Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa;
use Bio::EnsEMBL::Analysis::Config::GeneBuild::ExonerateSolexa;
use Bio::EnsEMBL::FeaturePair;
use vars qw(@ISA);
@ISA = ("Bio::EnsEMBL::Analysis::RunnableDB::ExonerateSolexa");
sub new {
my ( $class, @args ) = @_;
my $self = $class->SUPER::new(@args);
return $self;
}
sub fetch_input {
my ($self) = @_;
# do all the normal stuff
$self->SUPER::fetch_input();
# then get all the transcripts and exons
my $trans_db = $self->get_dbadaptor($self->TRANSDB);
my $trans_adaptor = $trans_db->get_TranscriptAdaptor;
my %trans_by_id;
my $biotype = $self->BIOTYPE;
my @trans;
# fetch genes, transcripts
if ( $biotype ){
push @trans , @{$trans_adaptor->generic_fetch("biotype = \"$biotype\"")};
} else {
push @trans , @{$trans_adaptor->fetch_all(undef,undef,undef)};
}
foreach my $trans ( @trans ) {
$trans_by_id{$trans->display_id} = $trans;
}
$self->transcripts(\%trans_by_id);
return ;
}
sub filter_solexa {
my ($self,$features_ref) = @_;
my @features = @$features_ref;
my @filtered_features;
# allow no more than MISSMATCH missmatches and
foreach my $feat ( @features ) {
# check missmatches
if ( $self->MISSMATCH ) {
my $aligned_length = abs($feat->hend - $feat->hstart) +1;
#print $feat->hseqname ." ";
#print $feat->percent_id . " " ;
#print " aligned length = $aligned_length ";
my $matches = $aligned_length * $feat->percent_id / 100;
#print " matches $matches ";
my $missmatches = ( $aligned_length - $matches) / $aligned_length * 100;
#print " missmatches $missmatches \n";
next if $missmatches > $self->MISSMATCH;
#print "accepting\n";
}
push @filtered_features, $feat;
}
return \@filtered_features;
}
sub run {
my ($self) = @_;
throw("Can't run - no runnable objects") unless ( $self->runnable );
my ($runnable) = @{$self->runnable};
$runnable->run;
my $features = $runnable->output;
if ($self->MISSMATCH) {
$features = $self->filter_solexa($features);
}
# Pair features together if they come from paired end reads
if ( $self->PAIREDEND ) {
$features = $self->pair_features($features);
}
my $genomic_features = $self->process_features($features);
$self->output($genomic_features);
}
=head2 process_features
Arg [1] : array ref of Bio::EnsEMBL::DnaDnaAlignFeature
Function : Uses cdna2genome to convert the ungapped align
features into genomic coords
Returntype: 1
Exceptions: Throws if the transcript the read has aligned to is
not stored in the transcript hash
=cut
sub process_features {
my ( $self, $flist ) = @_;
# first do all the standard processing, adding a slice and analysis etc.
# unless we are projecting in which case we dont really nead a slice
unless ($self->PROJECT) {
$self->SUPER::process_features($flist);
}
my $slice_adaptor = $self->db->get_SliceAdaptor;
my @dafs;
FEATURE: foreach my $f (@$flist) {
my $trans_id;
my $accept = 0;
$accept = 1 unless $self->INTRON_OVERLAP or $self->INTRON_MODELS ;
if ($f->seqname =~ /\S+\:\S+\:(\S+)\:\d+\:\d+:\d+/ ) {
$trans_id = $1;
} else {
$trans_id = $f->seqname;
}
if ( not exists $self->transcripts->{$trans_id} ) {
$self->throw("Transcript $trans_id not found\n");
}
my $trans = $self->transcripts->{$trans_id};
if ( $self->INTRON_OVERLAP ) {
# only consider introns overlapping exons by at least x base pairs
# on both sides of the intron
# make a hash with the positions of the exon boundaries
# check first if you have already stored it though, no need to
# make more than is needed
unless ( $self->exon_starts($trans->display_id) && $self->exon_ends($trans->display_id) ){
my %exon_starts;
my %exon_ends;
foreach my $e ( @{$trans->get_all_Exons} ) {
$exon_starts{$e->cdna_start($trans)} = 1;
$exon_ends{$e->cdna_end($trans)} = 1;
}
$self->exon_starts($trans->display_id,\%exon_starts);
$self->exon_ends($trans->display_id,\%exon_ends);
}
my $ee = undef;
my $es = undef;
# scan along the read and look for overlaps with the exon boundaries
for ( my $i = $f->start ; $i <= $f->end ; $i++ ){
# exon end position within the feature;
$ee = $i - $f->start if $self->exon_ends($trans->display_id)->{$i};
# next exon start position within the feature
$es = $i - $f->start if $self->exon_starts($trans->display_id)->{$i};
}
# allow reads covering introns
$accept = 1 if ( $es && $ee && $ee >= $self->INTRON_OVERLAP
&& $es <= $f->length - $self->INTRON_OVERLAP );
# disallow unspliced reads spanning introns unless the intron is size 0
# then allow the unspliced alignment to bridge the gap
# also allow any spliced modles even if they dont lie in an intron
if ( $self->INTRON_MODELS ) {
unless ( $f->{"_intron"} or ( $es && $ee && ($es - $ee == 1) ) ) {
$accept = 0;
}
}
}
# restrict just to splice models where read is within an exon
if ( $self->INTRON_MODELS ) {
$accept = 1 if $f->{"_intron"};
}
next FEATURE unless $accept;
if ( $self->PROJECT ) {
my @mapper_objs;
my @features;
my $start = 1;
my $cannonical = 1;
my $end = $f->length;
my $out_slice = $slice_adaptor->fetch_by_name($trans->slice->name);
# get as ungapped features
foreach my $ugf ( $f->ungapped_features ) {
# Project onto the genome
foreach my $obj ($trans->cdna2genomic($ugf->start, $ugf->end)){
if( $obj->isa("Bio::EnsEMBL::Mapper::Coordinate")){
# make into feature pairs?
my $qstrand = $f->strand * $trans->strand;
my $hstrand = $f->hstrand * $trans->strand;
my $fp;
$fp = Bio::EnsEMBL::FeaturePair->new
(-start => $obj->start,
-end => $obj->end,
-strand => $qstrand,
-slice => $trans->slice,
-hstart => 1,
-hend => $obj->length,
-hstrand => $hstrand,
-percent_id => $f->percent_id,
-score => $f->score,
-hseqname => $f->hseqname,
-hcoverage => $f->hcoverage,
-p_value => $f->p_value,
);
push @features, $fp->transfer($out_slice);
}
}
}
@features = sort { $a->start <=> $b->start } @features;
# if we have a spliced alignment check to see if it's non-cannonical
# if so tag it so we can tell later on
if ( $f->{'_intron'} ) {
print $f->hseqname . " ";
if ( scalar(@features) == 2 ) {
my $left_splice = $slice_adaptor->fetch_by_region('toplevel',
$out_slice->seq_region_name,
$features[0]->end+1,
$features[0]->end+2,
$features[0]->strand
);
my $right_splice = $slice_adaptor->fetch_by_region('toplevel',
$out_slice->seq_region_name,
$features[1]->start-2,
$features[1]->start-1,
$features[0]->strand
);
if ( $left_splice->seq eq 'NN' && $right_splice->seq eq 'NN' ) {
warn("Cannot find dna sequence for " . $f->display_id .
" this is used in detetcting non cannonical splices\n");
} else {
# is it cannonical
if ( $features[0]->strand == 1 ) {
print "Splice type " . $left_splice->seq ."- ". $right_splice->seq ." ";
# is it GTAG?
unless ( $left_splice->seq eq 'GT' && $right_splice->seq eq 'AG' ) {
$cannonical = 0;
}
} else {
print "Splice type " . $right_splice->seq ."- ". $left_splice->seq ." ";
# is it GTAG?
unless ( $right_splice->seq eq 'GT' && $left_splice->seq eq 'AG' ) {
$cannonical = 0;
}
}
}
}
}
print "Making feat " . $f->hseqname . "\n";
my $feat = new Bio::EnsEMBL::DnaDnaAlignFeature(-features => \@features);
# corect for hstart end bug
$feat->hstart($f->hstart);
$feat->hend($f->hend);
$feat->analysis($self->analysis);
unless ( $cannonical ) {
print "Non cannonical \n";
# mark it as non cannonical splice
$feat->hseqname($feat->hseqname.":NC");
} else {
print "Cannonical \n";
}
# dont store the same feature twice because it aligns to a different transcript in the same gene.
my $unique_id = $feat->seq_region_name .":" .
$feat->start .":" .
$feat->end .":" .
$feat->strand .":" .
$feat->hseqname;
unless ($self->stored_features($unique_id)){
push @dafs,$feat;
# keep tabs on it so you don't store it again.
$self->stored_features($unique_id,1);
}
} else {
# just write the features on the transcript coord system
# assming there is one ( useful for writing reads that support introns
# on the transcrtipt coord system
push @dafs,$f;
}
}
return \@dafs;
}
##########################################################################
sub transcripts {
my ($self,$value) = @_;
if (defined $value) {
$self->{'_transcripts'} = $value;
}
if (exists($self->{'_transcripts'})) {
return $self->{'_transcripts'};
} else {
return undef;
}
}
sub exon_starts {
my ($self,$key,$value) = @_;
return undef unless defined ($key);
if (defined $value) {
$self->{'_exon_starts'}{$key} = $value;
}
if (exists($self->{'_exon_starts'}{$key})) {
return $self->{'_exon_starts'}{$key};
} else {
return undef;
}
}
sub exon_ends {
my ($self,$key,$value) = @_;
return undef unless defined ($key);
if (defined $value) {
$self->{'_exon_ends'}{$key} = $value;
}
if (exists($self->{'_exon_ends'}{$key})) {
return $self->{'_exon_ends'}{$key};
} else {
return undef;
}
}
sub stored_features {
my ($self,$key,$value) = @_;
return undef unless defined ($key);
if (defined $value) {
$self->{'_stored_features'}{$key} = $value;
}
if (exists($self->{'_stored_features'}{$key})) {
return $self->{'_stored_features'}{$key};
} else {
return undef;
}
}
1;