ExonerateSolexaTranscript documentation.

sub process_features {
  my ( $self, $flist ) = @_;

  # first do all the standard processing, adding a slice and analysis etc.
  # unless we are projecting in which case we dont really nead a slice 
  unless ($self->PROJECT) {
    $self->SUPER::process_features($flist);
  }

  my $slice_adaptor = $self->db->get_SliceAdaptor;
  my @dafs;

FEATURE:  foreach my $f (@$flist) {
    
    my $trans_id;
    my $accept = 0;
    
    $accept = 1 unless  $self->INTRON_OVERLAP or $self->INTRON_MODELS ;

    if ($f->seqname =~ /\S+\:\S+\:(\S+)\:\d+\:\d+:\d+/ ) {
      $trans_id = $1;
    } else {
      $trans_id = $f->seqname;
    }

    if ( not exists $self->transcripts->{$trans_id} ) {
      $self->throw("Transcript $trans_id not found\n");
    }

    my $trans = $self->transcripts->{$trans_id};

    if ( $self->INTRON_OVERLAP ) {
      # only consider introns overlapping exons by at least x base pairs 
      # on both sides of the intron
      # make a hash with the positions of the exon boundaries
      # check first if you have already stored it though, no need to 
      # make more than is needed
	unless ( $self->exon_starts($trans->display_id) && $self->exon_ends($trans->display_id) ){
	my %exon_starts;
	my %exon_ends;
	foreach my $e ( @{$trans->get_all_Exons} ) {
	  $exon_starts{$e->cdna_start($trans)} = 1;
	  $exon_ends{$e->cdna_end($trans)} = 1;
	}
	$self->exon_starts($trans->display_id,\%exon_starts);
	$self->exon_ends($trans->display_id,\%exon_ends);
      }
      my $ee = undef;
      my $es = undef;
      # scan along the read and look for overlaps with the exon boundaries
      for ( my $i = $f->start ; $i <= $f->end ; $i++ ){
	# exon end position within the feature;
	$ee = $i - $f->start if $self->exon_ends($trans->display_id)->{$i};
	# next exon start position within the feature
	$es = $i - $f->start if $self->exon_starts($trans->display_id)->{$i};
      }
      # allow reads covering introns
      $accept = 1 if ( $es && $ee && $ee >= $self->INTRON_OVERLAP 
		       && $es <= $f->length - $self->INTRON_OVERLAP );

      # disallow unspliced reads spanning introns  unless the intron is size 0
      # then allow the unspliced alignment to  bridge the gap
      # also allow any spliced modles even if they dont lie in an intron
      
      if ( $self->INTRON_MODELS ) {
	unless ( $f->{"_intron"} or ( $es && $ee && ($es - $ee == 1) ) ) {
	  $accept = 0;
	}
      }
    }

    # restrict just to splice models where read is within an exon
    if ( $self->INTRON_MODELS ) {
      $accept = 1 if $f->{"_intron"};
    }

    next FEATURE unless $accept;

    if ( $self->PROJECT ) {
      my @mapper_objs;
      my @features;
      my $start = 1;
      my $cannonical = 1;
      my $end = $f->length;
      my $out_slice = $slice_adaptor->fetch_by_name($trans->slice->name);
      # get as ungapped features
      foreach my $ugf ( $f->ungapped_features ) {
	# Project onto the genome
	foreach my $obj ($trans->cdna2genomic($ugf->start, $ugf->end)){
	  if( $obj->isa("Bio::EnsEMBL::Mapper::Coordinate")){
	    # make into feature pairs?
	    my $qstrand = $f->strand  * $trans->strand;
	    my $hstrand = $f->hstrand * $trans->strand;
	    my $fp;
	    $fp = Bio::EnsEMBL::FeaturePair->new
	      (-start    => $obj->start,
	       -end      => $obj->end,
	       -strand   => $qstrand,
	       -slice    => $trans->slice,
	       -hstart   => 1,
	       -hend     => $obj->length,
	       -hstrand  => $hstrand,
	       -percent_id => $f->percent_id,
	       -score    => $f->score,
	       -hseqname => $f->hseqname,
	       -hcoverage => $f->hcoverage,
	       -p_value   => $f->p_value,
	      );
	    push @features, $fp->transfer($out_slice);
	  }
	}
      }
      @features = sort { $a->start <=> $b->start } @features;
      # if we have a spliced alignment check to see if it's non-cannonical
      # if so tag it so we can tell later on

      if ( $f->{'_intron'} ) {
	print $f->hseqname . "  ";
	if ( scalar(@features) == 2 ) {
	  my $left_splice = $slice_adaptor->fetch_by_region('toplevel',
							    $out_slice->seq_region_name,
							    $features[0]->end+1,
							    $features[0]->end+2,
							    $features[0]->strand
							   );
	  my $right_splice = $slice_adaptor->fetch_by_region('toplevel',
							     $out_slice->seq_region_name,
							     $features[1]->start-2,
							     $features[1]->start-1,
							     $features[0]->strand
							    );	
	  if ( $left_splice->seq eq 'NN' && $right_splice->seq eq 'NN' ) {
	    warn("Cannot find dna sequence for " . $f->display_id .
		 " this is used in detetcting non cannonical splices\n");
	  } else {
	    # is it cannonical
	    if ( $features[0]->strand  == 1 ) {
	      print "Splice type " . $left_splice->seq ."- ".  $right_splice->seq ." ";
	      # is it GTAG?
	      unless ( $left_splice->seq eq 'GT' && $right_splice->seq eq 'AG' ) {
		$cannonical = 0;
	      }
	    } else {
	      print "Splice type " . $right_splice->seq ."- ".  $left_splice->seq ." ";
	      # is it GTAG?
	      unless ( $right_splice->seq eq 'GT' && $left_splice->seq eq 'AG' ) {
		$cannonical = 0;
	      }
	    }
	  }
	}
      }
      print "Making feat " . $f->hseqname . "\n";
      my $feat = new Bio::EnsEMBL::DnaDnaAlignFeature(-features =>\@ features);
      # corect for hstart end bug
      $feat->hstart($f->hstart);
      $feat->hend($f->hend);
      $feat->analysis($self->analysis);
      unless ( $cannonical ) {
	print "Non cannonical\n ";
	# mark it as non cannonical splice
	$feat->hseqname($feat->hseqname.":NC");
      } else {
	print "Cannonical\n ";
      }
      # dont store the same feature twice because it aligns to a different transcript in the same gene.
      my $unique_id = $feat->seq_region_name .":" .
	$feat->start .":" .
	  $feat->end .":" .	
	    $feat->strand .":" .
	      $feat->hseqname;
      unless ($self->stored_features($unique_id)){
	push @dafs,$feat;
	# keep tabs on it so you don't store it again.
	$self->stored_features($unique_id,1);
      }
    } else {
      # just write the features on the transcript coord system 
      # assming there is one ( useful for writing reads that support introns 
      # on the transcrtipt coord system
      push @dafs,$f;
    }
  }
  return\@ dafs;
}


##########################################################################
}
exon_ends	No description	Code
exon_starts	No description	Code
fetch_input	No description	Code
filter_solexa	No description	Code
new	No description	Code
process_features	Description	Code
run	No description	Code
stored_features	No description	Code
transcripts	No description	Code