Bio::EnsEMBL::Analysis::Runnable::Finished Exonerate
Included librariesPackage variablesGeneral documentationMethods
Toolbar
WebCvsRaw content
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( verbose throw warning )
BlastableVersion
Fcntl
Tie::File
Inherit
Bio::EnsEMBL::Analysis::Runnable::BaseExonerate
Synopsis
No synopsis!
Description
No description!
Methods
_parse_vulgar_block
No description
Code
exo_options
No description
Code
fetch_databases
No description
Code
get_db_version
No description
Code
new
No description
Code
parse_results
No description
Code
query_db
No description
Code
run
No description
Code
target
No description
Code
Methods description
None available.
Methods code
_parse_vulgar_blockdescriptionprevnextTop
sub _parse_vulgar_block {
	my (
		$self,          $target_start,  $target_end,
		$target_strand, $target_length, $query_start,
		$query_end,     $query_strand,  $query_length,
		$vulgar_components
	  )
	  = @_;

	# This method works along the length of a vulgar line
# exon-by-exon. Matches that comprise an exon are
# grouped and an array of 'proto-exons' is returned.
# Coordinates from the vulgar line are extrapolated
# to actual genomic/query coordinates.
my @exons; my $exon_number = 0; # We sometimes need to increment all our start coordinates. Exonerate
# has a coordinate scheme that counts _between_ nucleotides at the start.
# However, for reverse strand matches
my ( $query_in_forward_coords, $target_in_forward_coords ); my ( $cumulative_query_coord, $cumulative_target_coord ); if ( $target_start > $target_end ) { warn( "For target, start and end are in thew wrong order for a reverse strand match" ) if $target_strand != -1; $cumulative_target_coord = $target_start; $target_in_forward_coords = 1; } else { $cumulative_target_coord = $target_start + 1; $target_in_forward_coords = 0; } if ( $query_start > $query_end ) { warn( "For query, start and end are in thew wrong order for a reverse strand match" ) if $query_strand != -1; $cumulative_query_coord = $query_start; $query_in_forward_coords = 1; } else { $cumulative_query_coord = $query_start + 1; $query_in_forward_coords = 0; } while (@$vulgar_components) { throw( "Something funny has happened to the input vulgar string." . " Expecting components in multiples of three, but only have [" . scalar @$vulgar_components . "] items left to process." ) unless scalar @$vulgar_components >= 3; my $type = shift @$vulgar_components; my $query_match_length = shift @$vulgar_components; my $target_match_length = shift @$vulgar_components; throw( "Vulgar string does not start with a match. Was not " . "expecting this." ) if ( scalar @exons == 0 ) && $type ne 'M' && $type ne 'S'; if ( $type eq 'M' or $type eq 'S' ) { my %hash; if ( $target_strand == -1 ) { if ($target_in_forward_coords) { $hash{target_start} = $cumulative_target_coord - ( $target_match_length - 1 ); $hash{target_end} = $cumulative_target_coord; } else { $hash{target_end} = $target_length - ( $cumulative_target_coord - 1 ); $hash{target_start} = $hash{target_end} - ( $target_match_length - 1 ); } } else { $hash{target_start} = $cumulative_target_coord; $hash{target_end} = $cumulative_target_coord + ( $target_match_length - 1 ); } if ( $query_strand == -1 ) { if ($query_in_forward_coords) { $hash{query_start} = $cumulative_query_coord - ( $query_match_length - 1 ); $hash{query_end} = $cumulative_query_coord; } else { $hash{query_end} = $query_length - ( $cumulative_query_coord - 1 ); $hash{query_start} = $hash{query_end} - ( $query_match_length - 1 ); } } else { $hash{query_start} = $cumulative_query_coord; $hash{query_end} = $cumulative_query_coord + ( $query_match_length - 1 ); } if ( $type eq 'S' ) { $hash{incomplete_codon} = $target_match_length; } # there is nothing to add if this is the last state of the exon
$exons[$exon_number]->{gap_end} = 0; push @{ $exons[$exon_number]->{sf} },\% hash; } elsif ( $type eq "G" ) { if ( exists( $exons[$exon_number]->{sf} ) ) { # this is the gap in the middle of an exon, or at the end. Assume it is
# at the end, and then reset if we see another match state in this exon
$exons[$exon_number]->{gap_end} = $target_match_length; } else { # this is a gap at the start of an exon;
$exons[$exon_number]->{gap_start} = $target_match_length; } } elsif ($type eq "I" or $type eq "F" ) { # in protein mode, any insertion on the genomic side should be treated as
# an intron to ensure that the result translates. However, we allow for
# codon insertions in the genomic sequence with respect to the protein.
# This introduces the possibility of in-frame stops, but I don't
# think "introning over" these insertions is appropriate here.
# if we see a gap/intron immediately after an intron, the current exon is "empty"
if ( $exons[$exon_number] ) { $exon_number++; } } if ( $target_in_forward_coords and $target_strand == -1 ) { $cumulative_target_coord -= $target_match_length; } else { $cumulative_target_coord += $target_match_length; } if ( $query_in_forward_coords and $query_strand == -1 ) { $cumulative_query_coord -= $query_match_length; } else { $cumulative_query_coord += $query_match_length; } } for ( my $i = 0 ; $i < @exons ; $i++ ) { my $ex = $exons[$i]; my $ex_sf = $ex->{sf}; if ( $target_strand == -1 ) { $ex->{exon_start} = $ex_sf->[-1]->{target_start}; $ex->{exon_end} = $ex_sf->[0]->{target_end}; if ( exists $ex->{gap_start} ) { $ex->{exon_end} += $ex->{gap_start}; } if ( exists $ex->{gap_end} ) { $ex->{exon_start} -= $ex->{gap_end}; } } else { $ex->{exon_start} = $ex_sf->[0]->{target_start}; $ex->{exon_end} = $ex_sf->[-1]->{target_end}; if ( exists $ex->{gap_start} ) { $ex->{exon_start} -= $ex->{gap_start}; } if ( exists $ex->{gap_end} ) { $ex->{exon_end} += $ex->{gap_end}; } } # filter out the incomplete supporting features
my @sfs; foreach my $f (@$ex_sf) { if ( not exists( $f->{incomplete_codon} ) ) { push @sfs, $f; } } $ex->{sf} =\@ sfs; } return\@ exons; } 1;
}
exo_optionsdescriptionprevnextTop
sub exo_options {
  my $self = shift;
  $self->{'exo_options'} = shift if(@_);
  return $self->{'exo_options'} || '';
}
fetch_databasesdescriptionprevnextTop
sub fetch_databases {
	my ($self) = @_;
	my @databases;
	my $db_names = $self->query_db;
	$db_names =~ s/\s//g;

	if ( not exists $self->{databases} ) {
		$self->{databases} = [];
	}

	foreach my $dbname ( split( ",", $db_names ) )
	{    # allows the use of a comma separated list in $self->database
# prepend the environment variable $BLASTDB if
# database name is not an absoloute path
unless ( $dbname =~ m!^/! ) {
$dbname = $ENV{BLASTDB} . "/" . $dbname;
} # If the expanded database name exists put this in
# the database array.
#
# If it doesn't exist then see if $database-1,$database-2 exist
# and put them in the database array
if ( -f $dbname ) { push( @databases, $dbname ); } else { my $count = 1; my $db_filename; while ( -f ( $db_filename = "${dbname}-${count}" ) ) { push( @databases, $db_filename ); $count++; } $! = undef ; # to stop pollution as it will be "No such file or directory" after while loop above.
} } if ( scalar(@databases) == 0 ) { throw( "No databases exist for " . $db_names ); } else { foreach my $db_name (@databases){ $self->get_db_version($db_name) if $db_name =~ /emnew_/; } $self->get_db_version($databases[0]); push @{$self->{databases}}, @databases; } return $self->{databases};
}
get_db_versiondescriptionprevnextTop
sub get_db_version {
	my ( $self, $db ) = @_;
	my $debug_this = 0;    # this just shows debug info.
my $force_dbi = 0; # this will force a dbi call SLOW!!!!!!
unless ( $self->{'_db_version_searched'} ) { if ($db) { $BlastableVersion::debug = $debug_this; warning "BlastableVersion is cvs revision $BlastableVersion::revision\n " if $debug_this; my $ver = eval { my $blast_ver = BlastableVersion->new(); $blast_ver->force_dbi($force_dbi); # if set will be SLOW.
$blast_ver->get_version($db); $blast_ver; }; throw("I failed to get a BlastableVersion for $db") if $@; my $dbv = $ver->version(); my $sgv = $ver->sanger_version(); my $name = $ver->name(); my $date = $ver->date(); unless ($dbv) { throw( "I know nothing about $db I tried to find out:\n" . " - name <" . $name . ">\n" . " - date <" . $date . ">\n" . " - version <" . $dbv . ">\n" . " - sanger_version <" . $sgv . ">\n" ); } $self->{'_db_version_searched'} = $dbv; } else { throw( "You've asked about what I searched, but I don't know." . " It's not set. I need to be called with a database filename first" ); # The code probably got here because of a problem with the MLDBM
# cache file on the machine this was running on.
# the cache file is stored @ /var/tmp/blast_versions
# try <rm -f /var/tmp/blast_versions>
} } return $self->{'_db_version_searched'};
}
newdescriptionprevnextTop
sub new {
	my ( $class, @args ) = @_;
	my $self = $class->SUPER::new(@args);

	my ( $target, $query_db, $exo_options ) = rearrange(
		[
			qw(
			  TARGET
			  QUERY_DB
			  EXO_OPTIONS
			  )
		],
		@args
	);
	if ( defined($target) ) {
		$self->target($target);
	}
	if ( defined($query_db) ) {
		$self->query_db($query_db);
	}
	if ( defined($exo_options) ) {
		$self->exo_options($exo_options);
	}
	return $self;
}
parse_resultsdescriptionprevnextTop
sub parse_results {
	my ($self, $fh) = @_;
	my %strand_lookup = ( '+' => 1, '-' => -1, '.' => 1 );

	my @exon_sup_features;

	TRANSCRIPT:
	while (<$fh>){
		print STDERR $_ if $self->_verbose;
		next unless /^RESULT:/;
		chomp;

		my (
			$tag,      $q_id,             $q_start,
			$q_end,    $q_strand,         $t_id,
			$t_start,  $t_end,            $t_strand,
			$score,    $perc_id,          $q_length,
			$t_length, $gene_orientation, @align_components
		  )
		  = split ;

		$t_strand         = $strand_lookup{$t_strand};
		$q_strand         = $strand_lookup{$q_strand};
		$gene_orientation = $strand_lookup{$gene_orientation};

		# Read vulgar information and extract exon regions.
my $exons = $self->_parse_vulgar_block( $t_start, $t_end, $t_strand, $t_length, $q_start, $q_end, $q_strand, $q_length,\@ align_components ); # now we have extracted the exons and the coordinates are with
# reference to the forward strand of the query and target, we can
# use the gene_orienation to flip the strands if necessary
if ( $gene_orientation == -1 ) { $t_strand *= -1; $q_strand *= -1; } my $covered_count = 0; my $coverage = 1; if ($coverage) { foreach my $exon (@$exons) { foreach my $sf ( @{ $exon->{sf} } ) { $covered_count += $sf->{query_end} - $sf->{query_start} + 1; } } } else { $covered_count = abs( $q_end - $q_start ); } $coverage = sprintf( "%.2f", 100 * $covered_count / $q_length );
# Build FeaturePairs for each region of query aligned to a single
# Exon. Create a DnaDnaAlignFeature from these FeaturePairs and then
# attach this to our Exon.
my $transcript = Bio::EnsEMBL::Transcript->new(); my (@tran_feature_pairs); foreach my $proto_exon (@$exons) { # Build our exon and set its key values.
my $exon = Bio::EnsEMBL::Exon->new(); $exon->seqname($t_id); $exon->start( $proto_exon->{exon_start} ); $exon->end( $proto_exon->{exon_end} ); $exon->phase( $proto_exon->{phase} ); $exon->end_phase( $proto_exon->{end_phase} ); $exon->strand($t_strand); my @exon_feature_pairs; foreach my $sf ( @{ $proto_exon->{sf} } ) { my $feature_pair = Bio::EnsEMBL::FeaturePair->new( -seqname => $t_id, -start => $sf->{target_start}, -end => $sf->{target_end}, -strand => $t_strand, -hseqname => $q_id, -hstart => $sf->{query_start}, -hend => $sf->{query_end}, -hstrand => $q_strand, -hcoverage => $coverage, -score => $score, -percent_id => $perc_id ); push @exon_feature_pairs, $feature_pair; push @tran_feature_pairs, $feature_pair; } if (@exon_feature_pairs) { # Use our feature pairs for this exon to create a single
# supporting feature (with cigar line).
my $supp_feature; eval { if ( $self->query_type eq 'protein' ) { $supp_feature = Bio::EnsEMBL::DnaPepAlignFeature->new( -features =>\@ exon_feature_pairs ); } else { $supp_feature = Bio::EnsEMBL::DnaDnaAlignFeature->new( -features =>\@ exon_feature_pairs ); } }; if ($@) { warning($@); return 0; } $exon->add_supporting_features($supp_feature); push @exon_sup_features,$supp_feature; } $transcript->add_Exon($exon); } # Create a single supporting feature for the whole transcript
my $t_supp_feat; eval { if ( $self->query_type eq 'protein' ) { $t_supp_feat = Bio::EnsEMBL::DnaPepAlignFeature->new( -features =>\@ tran_feature_pairs ); } else { $t_supp_feat = Bio::EnsEMBL::DnaDnaAlignFeature->new( -features =>\@ tran_feature_pairs ); } }; if ($@) { warning("Could not create Transcript supporting feature"); } else { $transcript->add_supporting_features($t_supp_feat); } } return\@ exon_sup_features;
}
query_dbdescriptionprevnextTop
sub query_db {
	my ( $self, $db ) = @_;
	if ($db) {
		$self->{_query_db} = $db;
	}
	return $self->{_query_db};
}
rundescriptionprevnextTop
sub run {
	my ($self) = @_;
	my @query;

	if ( $self->query || $self->query_seqs ) {

		my $seq =  $self->query ? $self->query : $self->query_seqs;

		# Write query sequence(s) to file if necessary
my $query_file = $self->write_seq_file( $seq ); # register the file for deletion
$self->files_to_delete($query_file); push @query, $query_file; } elsif ( $self->query_db ) { push @query, @{ $self->fetch_databases }; } else { throw('No query provided'); } # Write target sequence to file
my $target_file = $self->create_filename( 'target_seq', 'fa' ); $self->write_seq_file( $self->target, $target_file ); # register the file for deletion
$self->files_to_delete($target_file); foreach my $q (@query) { # Build exonerate command
my $command = join( ' ', ( $self->program, $self->options, $self->exo_options, ' --querytype ', $self->query_type, ' --targettype ', $self->target_type, ' --query ', $q, ' --target ', $target_file ) ); # Execute command and parse results
print STDOUT "Running exonerate : $command\n"; my $exo_fh; open( $exo_fh, "$command |" ) or throw("Error opening exonerate command <$command>\n $? $!"); $self->output($self->parse_results( $exo_fh )); close($exo_fh) or throw("Error closing exonerate command: $? $!"); } $self->delete_files; return 1;
}
targetdescriptionprevnextTop
sub target {
	my ( $self, $seq ) = @_;
	if ($seq) {
		unless ( $seq->isa("Bio::PrimarySeqI") || $seq->isa("Bio::SeqI") ) {
			throw("target seq must be a Bio::SeqI or Bio::PrimarySeqI");
		}
		$self->{_target} = $seq;
	}
	return $self->{_target};
}
General documentation
NAME - Bio::EnsEMBL::Analysis::Runnable::Finished::ExonerateTop
AUTHORTop
Mustapha Larbaoui email ml6@sanger.ac.uk