Raw content of Bio::EnsEMBL::Analysis::Runnable::ExonerateTranscript
=pod
=head1 NAME
Bio::EnsEMBL::Analysis::Runnable::ExonerateTranscript
=head1 SYNOPSIS
my $runnable = Bio::EnsEMBL::Analysis::Runnable::ExonerateTranscript->new(
-query_seqs => \@q_seqs,
[or -query_file => $q_file]
-query_type => 'dna',
-target_seqs => \@t_seqs,
[or -target_file => $t_file]
-program => $exonerate,
-options => $options,
);
$runnable->run; #create and fill Bio::Seq object
my @transcripts = @{$runnable->output};
=head1 DESCRIPTION
This module handles a specific use of the Exonerate (G. Slater) program, namely
the prediction of the transcript structure in a piece of genomic DNA by the alignment
of a 'transcribed' sequence (EST, cDNA or protein). The results is a set of
Bio::EnsEMBL::Transcript objects
=head1 CONTACT
ensembl-dev@ebi.ac.uk
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Analysis::Runnable::ExonerateTranscript;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::Analysis::Runnable::BaseExonerate;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Translation;
use Bio::EnsEMBL::Exon;
use Bio::EnsEMBL::DnaDnaAlignFeature;
use Bio::EnsEMBL::DnaPepAlignFeature;
use Bio::EnsEMBL::FeaturePair;
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils qw(calculate_exon_phases);
@ISA = qw(Bio::EnsEMBL::Analysis::Runnable::BaseExonerate);
sub new {
my ($class,@args) = @_;
my $self = $class->SUPER::new(@args);
my ( $coverage_aligned ) =
rearrange([qw(
COVERAGE_BY_ALIGNED
)
], @args);
if (defined($coverage_aligned)) {
$self->coverage_as_proportion_of_aligned_residues($coverage_aligned);
} else {
$self->coverage_as_proportion_of_aligned_residues(1);
}
return $self;
}
############################################################
#
# Analysis methods
#
############################################################
sub parse_results {
my ($self, $fh) = @_;
my %strand_lookup = ('+' => 1, '-' => -1, '.' => 1);
# Each alignment will be stored as a transcript with
# exons and supporting features. Initialise our
# transcript.
my @transcripts;
# Parse output looking for lines beginning with 'RESULT:'.
# Each line represents a distinct match to one sequence
# containing multiple 'exons'.
TRANSCRIPT:
while (<$fh>){
print STDERR $_ if $self->_verbose;
next unless /^RESULT:/;
chomp;
my ($tag, $q_id, $q_start, $q_end, $q_strand, $t_id, $t_start, $t_end,
$t_strand, $score, $perc_id, $q_length, $t_length, $gene_orientation,
@align_components) = split;
$t_strand = $strand_lookup{$t_strand};
$q_strand = $strand_lookup{$q_strand};
$gene_orientation = $strand_lookup{$gene_orientation};
# Read vulgar information and extract exon regions.
my $exons = $self->_parse_vulgar_block($t_start,
$t_end,
$t_strand,
$t_length,
$q_start,
$q_end,
$q_strand,
$q_length,
\@align_components);
# now we have extracted the exons and the coordinates are with
# reference to the forward strand of the query and target, we can
# use the gene_orienation to flip the strands if necessary
if ($gene_orientation == -1 and $t_strand == 1) {
$t_strand *= -1;
$q_strand *= -1;
}
my $covered_count = 0;
if ($self->coverage_as_proportion_of_aligned_residues) {
foreach my $exon (@$exons) {
foreach my $sf (@{$exon->{sf}}) {
$covered_count += $sf->{query_end} - $sf->{query_start} + 1;
}
}
} else {
$covered_count = abs($q_end - $q_start);
}
my $coverage = sprintf("%.2f", 100 * $covered_count / $q_length);
# Build FeaturePairs for each region of query aligned to a single
# Exon. Create a DnaDnaAlignFeature from these FeaturePairs and then
# attach this to our Exon.
my $transcript = Bio::EnsEMBL::Transcript->new();
my (@tran_feature_pairs,
$cds_start_exon, $cds_start,
$cds_end_exon, $cds_end);
foreach my $proto_exon (@$exons){
# Build our exon and set its key values.
my $exon = Bio::EnsEMBL::Exon->new();
$exon->seqname($t_id);
$exon->start($proto_exon->{exon_start});
$exon->end($proto_exon->{exon_end});
$exon->phase($proto_exon->{phase});
$exon->end_phase($proto_exon->{end_phase});
$exon->strand($t_strand);
my @exon_feature_pairs;
foreach my $sf (@{$proto_exon->{sf}}){
my $feature_pair = Bio::EnsEMBL::FeaturePair->new(-seqname => $t_id,
-start => $sf->{target_start},
-end => $sf->{target_end},
-strand => $t_strand,
-hseqname => $q_id,
-hstart => $sf->{query_start},
-hend => $sf->{query_end},
-hstrand => $q_strand,
-hcoverage => $coverage,
-score => $coverage,
-percent_id => $perc_id);
push @exon_feature_pairs, $feature_pair;
push @tran_feature_pairs, $feature_pair;
}
if (@exon_feature_pairs) {
# Use our feature pairs for this exon to create a single
# supporting feature (with cigar line).
my $supp_feature;
eval{
if ($self->query_type eq 'protein') {
$supp_feature =
Bio::EnsEMBL::DnaPepAlignFeature->new(-features => \@exon_feature_pairs);
} else {
$supp_feature =
Bio::EnsEMBL::DnaDnaAlignFeature->new(-features => \@exon_feature_pairs);
}
};
if ($@){
warning($@);
next TRANSCRIPT;
}
$exon->add_supporting_features($supp_feature);
}
if (exists $proto_exon->{coding_start}) {
if (not defined $cds_start_exon) {
$cds_start_exon = $exon;
$cds_start = $proto_exon->{coding_start};
}
$cds_end_exon = $exon;
$cds_end = $proto_exon->{coding_end};
}
$transcript->add_Exon($exon);
}
# Create a single supporting feature for the whole transcript
my $t_supp_feat;
eval{
if ($self->query_type eq 'protein') {
$t_supp_feat =
Bio::EnsEMBL::DnaPepAlignFeature->new(-features => \@tran_feature_pairs);
} else {
$t_supp_feat =
Bio::EnsEMBL::DnaDnaAlignFeature->new(-features => \@tran_feature_pairs);
}
};
if ($@) {
warning("Could not create Transcript supporting feature");
} else {
$transcript->add_supporting_features($t_supp_feat);
}
my @exons = @{$transcript->get_all_Exons};
if (scalar(@exons)) {
if (defined $cds_start_exon) {
my $translation = Bio::EnsEMBL::Translation->new();
$translation->start_Exon($cds_start_exon);
$translation->start($cds_start);
$translation->end_Exon($cds_end_exon);
$translation->end($cds_end);
$transcript->translation($translation);
}
calculate_exon_phases($transcript, 0);
push @transcripts, [$score, $transcript];
}
}
@transcripts = sort { $b->[0] <=> $a->[0] } @transcripts;
@transcripts = map { $_->[1] } @transcripts;
return \@transcripts;
}
sub _parse_vulgar_block {
my ($self,
$target_start, $target_end, $target_strand, $target_length,
$query_start, $query_end, $query_strand, $query_length,
$vulgar_components) = @_;
# This method works along the length of a vulgar line
# exon-by-exon. Matches that comprise an exon are
# grouped and an array of 'proto-exons' is returned.
# Coordinates from the vulgar line are extrapolated
# to actual genomic/query coordinates.
my @exons;
my $exon_number = 0;
# We sometimes need to increment all our start coordinates. Exonerate
# has a coordinate scheme that counts _between_ nucleotides at the start.
# However, for reverse strand matches
my ($query_in_forward_coords, $target_in_forward_coords);
my ($cumulative_query_coord, $cumulative_target_coord);
if ($target_start > $target_end) {
warn("For target, start and end are in thew wrong order for a reverse strand match")
if $target_strand != -1;
$cumulative_target_coord = $target_start;
$target_in_forward_coords = 1;
} else {
$cumulative_target_coord = $target_start + 1;
$target_in_forward_coords = 0;
}
if ($query_start > $query_end) {
warn("For query, start and end are in thew wrong order for a reverse strand match")
if $query_strand != -1;
$cumulative_query_coord = $query_start;
$query_in_forward_coords = 1;
} else {
$cumulative_query_coord = $query_start + 1;
$query_in_forward_coords = 0;
}
while (@$vulgar_components){
throw("Something funny has happened to the input vulgar string." .
" Expecting components in multiples of three, but only have [" .
scalar @$vulgar_components . "] items left to process.")
unless scalar @$vulgar_components >= 3;
my $type = shift @$vulgar_components;
my $query_match_length = shift @$vulgar_components;
my $target_match_length = shift @$vulgar_components;
throw("Vulgar string does not start with a match. Was not " .
"expecting this. (Have type $type)")
if (scalar @exons == 0) && $type ne 'M' && $type ne 'S' && $type ne 'C';
if ($type eq 'M' or $type eq 'S' or $type eq 'C'){
my %hash = (type => $type);
if ($target_strand == -1) {
if ($target_in_forward_coords) {
$hash{target_start} = $cumulative_target_coord - ($target_match_length - 1);
$hash{target_end} = $cumulative_target_coord;
} else {
$hash{target_end} = $target_length - ($cumulative_target_coord - 1);
$hash{target_start} = $hash{target_end} - ($target_match_length - 1);
}
} else {
$hash{target_start} = $cumulative_target_coord;
$hash{target_end} = $cumulative_target_coord + ($target_match_length - 1);
}
if ($query_strand == -1) {
if ($query_in_forward_coords) {
$hash{query_start} = $cumulative_query_coord - ($query_match_length - 1);
$hash{query_end} = $cumulative_query_coord;
} else {
$hash{query_end} = $query_length - ($cumulative_query_coord - 1);
$hash{query_start} = $hash{query_end} - ($query_match_length - 1);
}
} else {
$hash{query_start} = $cumulative_query_coord;
$hash{query_end} = $cumulative_query_coord + ($query_match_length - 1);
}
# there is nothing to add if this is the last state of the exon
$exons[$exon_number]->{gap_end} = 0;
push @{$exons[$exon_number]->{sf}}, \%hash;
}
elsif ($type eq "G") {
if (exists($exons[$exon_number]->{sf})) {
# this is the gap in the middle of an exon, or at the end. Assume it is
# at the end, and then reset if we see another match state in this exon
$exons[$exon_number]->{gap_end} = $target_match_length;
} else {
# this is a gap at the start of an exon;
$exons[$exon_number]->{gap_start} = $target_match_length;
}
}
elsif ($type eq "I" or
$type eq "F") {
# in protein mode, any insertion on the genomic side should be treated as
# an intron to ensure that the result translates. However, we allow for
# codon insertions in the genomic sequence with respect to the protein.
# This introduces the possibility of in-frame stops, but I don't
# think "introning over" these insertions is appropriate here.
# if we see a gap/intron immediately after an intron, the current exon is "empty"
if ($exons[$exon_number]) {
$exon_number++;
}
}
if ($target_in_forward_coords and $target_strand == -1) {
$cumulative_target_coord -= $target_match_length;
} else {
$cumulative_target_coord += $target_match_length;
}
if ($query_in_forward_coords and $query_strand == -1) {
$cumulative_query_coord -= $query_match_length;
}
else {
$cumulative_query_coord += $query_match_length;
}
}
for(my $i = 0; $i < @exons; $i++) {
my $ex = $exons[$i];
my @ex_sf = @{$ex->{sf}};
if ($target_strand == -1) {
$ex->{exon_start} = $ex_sf[-1]->{target_start};
$ex->{exon_end} = $ex_sf[0]->{target_end};
if (exists $ex->{gap_start}) {
$ex->{exon_end} += $ex->{gap_start};
}
if (exists $ex->{gap_end}) {
$ex->{exon_start} -= $ex->{gap_end};
}
} else {
$ex->{exon_start} = $ex_sf[0]->{target_start};
$ex->{exon_end} = $ex_sf[-1]->{target_end};
if (exists $ex->{gap_start}) {
$ex->{exon_start} -= $ex->{gap_start};
}
if (exists $ex->{gap_end}) {
$ex->{exon_end} += $ex->{gap_end};
}
}
# split codons are a pain. If the query is dna, we must be in the
# cdna2genome model so they must be part of the supporting feature.
# If query is protein, they need to be removed from the supporting feature
if ($self->query_type eq 'dna') {
map { $_->{type} = 'C' if $_->{type} eq 'S' } @ex_sf;
if (my @cod = grep { $_->{type} eq 'C' } @ex_sf) {
# at least part of this exon is coding
@cod = sort { $a->{target_start} <=> $b->{target_start} } @cod;
my $cod_start = $cod[0]->{target_start};
my $cod_end = $cod[-1]->{target_end};
my $cod_len = $cod_end - $cod_start + 1;
if ($target_strand == -1) {
$ex->{coding_start} = $ex->{exon_end} - $cod_end + 1;
} else {
$ex->{coding_start} = $cod_start - $ex->{exon_start} + 1;
}
$ex->{coding_end} = $ex->{coding_start} + $cod_len - 1;
}
# merge together abutting ungapped features
my @nr_sf;
foreach my $sf (@ex_sf) {
my $merged = 0;
if (@nr_sf and
$sf->{type} eq 'C' and
$nr_sf[-1]->{type} eq 'C' and
$sf->{query_start} == $nr_sf[-1]->{query_end} + 1) {
if ($target_strand == -1) {
if ($sf->{target_end} == $nr_sf[-1]->{target_start} - 1) {
$nr_sf[-1]->{target_start} = $sf->{target_start};
$nr_sf[-1]->{query_end} = $sf->{query_end};
$merged = 1;
}
} else {
if ($sf->{target_start} == $nr_sf[-1]->{target_end} + 1) {
$nr_sf[-1]->{target_end} = $sf->{target_end};
$nr_sf[-1]->{query_end} = $sf->{query_end};
$merged = 1;
}
}
}
if (not $merged) {
push @nr_sf, $sf;
}
}
@ex_sf = @nr_sf;
} else {
# query type is protein
@ex_sf = grep { $_->{type} ne 'S' } @ex_sf;
$ex->{coding_start} = 1;
$ex->{coding_end} = $ex->{exon_end} - $ex->{exon_start} + 1;
}
$ex->{sf} = \@ex_sf;
}
return \@exons;
}
############################################################
#
# get/set methods
#
############################################################
sub coverage_as_proportion_of_aligned_residues {
my ($self, $val) = @_;
if (defined $val) {
$self->{_coverage_aligned} = $val;
}
return $self->{_coverage_aligned};
}
1;