Bio::EnsEMBL::Analysis::Runnable Blat
SummaryIncluded librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Analysis::Runnable::Blat
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis
Bio::EnsEMBL::Analysis::Runnable
Bio::EnsEMBL::FeaturePair
Bio::EnsEMBL::Root
Bio::EnsEMBL::SeqFeature
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( throw warning info )
Bio::PrimarySeqI
Bio::SeqI
Inherit
Bio::EnsEMBL::Analysis::Runnable
Synopsis
  my $runnable = Bio::EnsEMBL::Analysis::Runnable::Blat->new(
-database => $self->database,
-query => $self->query,
-program => $self->program,
-options => $self->options,
);
$runnable->run; #create and fill Bio::Seq object my @results = $runnable->output; where @results is an array of SeqFeatures, each one representing an aligment (e.g. a transcript), and each feature contains a list of alignment blocks (e.g. exons) as sub_SeqFeatures, which are in fact feature pairs.
Description
Blat takes a Bio::Seq (or Bio::PrimarySeq) object and runs Blat
against a set of sequences. The resulting output file is parsed
to produce a set of features.
Methods
blat
No description
Code
databaseDescriptionCode
get_best_score_in_all_frames
No description
Code
new
No description
Code
options
No description
Code
parse
No description
Code
parse_results
No description
Code
queryDescriptionCode
query_type
No description
Code
runDescriptionCode
target_type
No description
Code
Methods description
databasecode    nextTop
  
Title : database
Usage : $self->database($seq)
Function: Get/set method for database. If set with a Bio::Seq object it
will get written to the local tmp directory
Returns : filename
Args : Bio::PrimarySeqI, or filename
querycodeprevnextTop
    Title   :   query
Usage : $self->query($seq)
Function: Get/set method for query. If set with a Bio::Seq object it
will get written to the local tmp directory
Returns : filename
Args : Bio::PrimarySeqI, or filename
runcodeprevnextTop
Usage : $obj->run($workdir, $args)
Function: Runs blat script and puts the results into the file $self->results
It calls $self->parse_results, and results are stored in $self->output
Methods code
blatdescriptionprevnextTop
sub blat {
    my ($self, $location) = @_;
    if ($location) {
	throw("Blat not found at $location: $!\n") unless (-e $location);
	$self->{_blat} = $location ;
    }
    return $self->{_blat};
}

############################################################
}
databasedescriptionprevnextTop
sub database {
    my ($self, $val) = @_;
    
    if (defined $val) {
	if (not ref($val)) {   
	    throw("[$val] : file does not exist\n") unless -e $val;
	} else {
	    if (ref($val) eq 'ARRAY') {
		foreach my $el (@$val) {
		    throw("All elements of given database array should be Bio::PrimarySeqs")
		      if not ref($el) or not $el->isa("Bio::PrimarySeq");
		}
	    } elsif (not $val->isa("Bio::PrimarySeq")) {
		throw("[$val] is neither a file nor array of Bio::Seq");
	    } else {
		$val = [$val];
	    }
	}
	$self->{_database} = $val;
    }
    
    return $self->{_database};
}

############################################################
}
get_best_score_in_all_framesdescriptionprevnextTop
sub get_best_score_in_all_frames {
    my ($seq1, $seq2, $matrix) = @_;
    
    my @aa_seq1_6fr = Bio::SeqUtils->translate_6frames($seq1);
    my @aa_seq2_6fr = Bio::SeqUtils->translate_6frames($seq2);
    
    my $score;
    my $perc_id = 0;
    my $frame = 0;
    ##  my $seqs;
for (my $i=0; $i<6; $i++) { my $this_score = 0; my $this_perc_id = 0; my $this_seq1 = $aa_seq1_6fr[$i]->seq; my $this_seq2 = $aa_seq2_6fr[$i]->seq; my $length = length($this_seq1); $length = length($this_seq2) if (length($this_seq2) < $length); my @this_seq1 = split("", $this_seq1); my @this_seq2 = split("", $this_seq2); if (defined($matrix)) { for (my $j=0; $j<$length; $j++) { my $aa1 = $this_seq1[$j]; my $aa2 = $this_seq2[$j]; $this_score += $matrix->{$aa1}->{$aa2}; $this_perc_id++ if ($aa1 eq $aa2); } } else { for (my $j=0; $j<$length; $j++) { my $aa1 = $this_seq1[$j]; my $aa2 = $this_seq2[$j]; if ($aa1 eq $aa2) { $this_score += 2; $this_perc_id++; } else { $this_score--; } } } if (!defined($score) or ($this_score > $score)) { $score = $this_score; if ($length) { $perc_id = int(100 * $this_perc_id / $length);
} else { $perc_id = 0; } $frame = $i; ## $seqs = $this_seq1."\n".$this_seq2;
} } return ($score, $perc_id, $frame); } ############################################################
#
# get/set methods
#
############################################################
}
newdescriptionprevnextTop
sub new {
  my ($class,@args) = @_;
  my $self = $class->SUPER::new(@args);

  my ($database) = rearrange([qw(DATABASE)], @args);
  $self->database($database) if defined $database;

  throw("You must supply a database") if not $self->database; 
  throw("You must supply a query") if not $self->query;

  $self->program("blat-32") if not $self->program;

  return $self;
}

############################################################
#
# Analysis methods
#
############################################################
}
optionsdescriptionprevnextTop
sub options {
    my ($self, $options) = @_;
    if ($options) {
	$self->{_options} = $options ;
    }
    return $self->{_options};
}

############################################################
}
parsedescriptionprevnextTop
sub parse {
    my ($self, $parse) = @_;
    if ($parse) {
	$self->{_parse} = $parse;
    }
    return $self->{_parse};
}

############################################################
1;
}
parse_resultsdescriptionprevnextTop
sub parse_results {
    my ($blat_output_pipe) = @_;

    my @alignments;

    while (<$blat_output_pipe>) {
	#print STDERR "$_\n";
############################################################
# PSL lines represent alignments and are typically taken from files generated
# by BLAT or psLayout. See the BLAT documentation for more details.
#
# 1.matches - Number of bases that match that aren't repeats
# 2.misMatches - Number of bases that don't match
# 3.repMatches - Number of bases that match but are part of repeats
# 4.nCount - Number of 'N' bases
# 5.qNumInsert - Number of inserts in query
# 6.qBaseInsert - Number of bases inserted in query
# 7.tNumInsert - Number of inserts in target
# 8.tBaseInsert - Number of bases inserted in target
# 9.strand - '+' or '-' for query strand. In mouse, second '+'or '-' is for genomic strand
#10.qName - Query sequence name
#11.qSize - Query sequence size
#12.qStart - Alignment start position in query
#13.qEnd - Alignment end position in query
#14.tName - Target sequence name
#15.tSize - Target sequence size
#16.tStart - Alignment start position in target
#17.tEnd - Alignment end position in target
#18.blockCount - Number of blocks in the alignment
#19.blockSizes - Comma-separated list of sizes of each block
#20.qStarts - Comma-separated list of starting positions of each block in query
#21.tStarts - Comma-separated list of starting positions of each block in target
############################################################
# first split on spaces:
chomp; my ( $matches, $mismatches, $rep_matches, $n_count, $q_num_insert, $q_base_insert, $t_num_insert, $t_base_insert, $strand, $q_name, $q_length, $q_start, $q_end, $t_name, $t_length, $t_start, $t_end, $block_count, $block_sizes, $q_starts, $t_starts, $q_seqs, $t_seqs ) = split; #print STDERR "$matches, $mismatches, $rep_matches, $n_count, $q_num_insert, $q_base_insert,
# $t_num_insert, $t_base_insert, $strand, $q_name, $q_length, $q_start,
# $q_end, $t_name, $t_length, $t_start, $t_end, $block_count,
# $block_sizes, $q_starts, $t_starts\n";
# ignore any preceeding text
#unless ( defined($matches) and $matches =~/^\d+$/ ){
# next;
#}
# create as many features as blocks there are in each output line
my (%feat1, %feat2); $feat1{name} = $t_name; $feat2{name} = $q_name; ################
#Added strand splitter as strand represented by ++ or +- etc
if (length($strand)>1){ ($feat2{strand},$feat1{strand}) = split //,$strand; } else { $feat2{strand}=$strand; $feat1{strand}=1; } # all the block sizes add up to $matches + $mismatches + $rep_matches
# percentage identity = ( matches not in repeats + matches in repeats ) / ( alignment length )
#print STDERR "calculating percent_id and score:\n";
#print STDERR "matches: $matches, rep_matches: $rep_matches, mismatches: $mismatches, q_length: $q_length\n";
#print STDERR "percent_id = 100x".($matches + $rep_matches)."/".( $matches + $mismatches + $rep_matches )."\n";
#my $percent_id = sprintf "%.2f", ( 100 * ($matches + $rep_matches)/( $matches + $mismatches + $rep_matches ) );
# or is it ...?
## percentage identity = ( matches not in repeats + matches in repeats ) / query length
#my $percent_id = sprintf "%.2d", (100 * ($matches + $rep_matches)/$q_length );
# we put basically score = coverage = ( $matches + $mismatches + $rep_matches ) / $q_length
#print STDERR "score = 100x".($matches + $mismatches + $rep_matches)."/".( $q_length )."\n";
unless ( $q_length ){ warning("length of query is zero, something is wrong!"); next; } my $score = sprintf "%.2f", ( 100 * ( $matches + $mismatches + $rep_matches ) / $q_length );
# size of each block of alignment (inclusive)
my @block_sizes = split ",",$block_sizes; # start position of each block (you must add 1 as psl output is off by one in the start coordinate)
my @q_start_positions = split ",",$q_starts; my @t_start_positions = split ",",$t_starts; my @q_sequences = split ",",$q_seqs; my @t_sequences = split ",",$t_seqs; # $superfeature->seqname($q_name);
# $superfeature->score( $score );
# $superfeature->percent_id( $percent_id );
# each line of output represents one possible entire aligment of the query (feat1) and the target(feat2)
#### working out the coordinates: #########################
#
# s e
# ========== EST
# <----------------------------------------------------| (reversed) genomic of length L
#
# we would store this as a hit in the reverse strand, with coordinates:
#
# |---------------------------------------------------->
# s' e'
# ========== EST
# where e' = L - s
# s' = e' - ( e - s + 1 ) + 1
#
# Also, hstrand will be always +1
############################################################
############################################################
#NB strand may =++ or +- etc rather than just + or -
#
#if qstrand negative reverse qstarts and blocks and calculate the correct co-ordinates
#
#if Tstrand(genomic strand) negative reverse the tstarts and then cal the co-ords
#
#if both strands are negative then reverse everything ie blocks and starts before calculating using:
#
# newqstart=length - (qstart + blocklength)
# newqend = length - qstart
#NB not sure about when to add the plus 1 -- as psl starts at 0 for start but the end is correct
#NB add +1 to the plus strand starts
my @query_starts; my @target_starts; my @query_ends; my @target_ends; my @reversed_block_sizes; my @reversed_q_starts; my @reversed_t_starts; if ($feat2{strand} eq '+') { # query in the forward strand
@query_starts = map {my $val=$_; $val+=1} (@q_start_positions); # use inclusive coord
if ($feat1{strand} eq '-') { # target in the reverse strand
for (my $i=0; $i<$block_count; $i++) { $query_ends[$i] = $q_start_positions[$i] + $block_sizes[$i]; $target_ends[$i] = $t_length - $t_start_positions[$i]; $target_starts[$i] = ($target_ends[$i] - $block_sizes[$i]) + 1; } } else { # target in the forward strand
@target_starts = map {my $val=$_; $val+=1} (@t_start_positions); # use inclusive coord
for (my $i=0; $i<$block_count; $i++) { $query_ends[$i] = $q_start_positions[$i] + $block_sizes[$i]; $target_ends[$i] = $t_start_positions[$i] + $block_sizes[$i]; } } } else { # query in the reverse strand
if ($feat1{strand} eq '-') { for (my $i=0; $i<$block_count; $i++ ) { # target in the reverse strand
$query_ends[$i] = $q_length - $q_start_positions[$i]; $query_starts[$i] = ($query_ends[$i] - $block_sizes[$i]) + 1; $target_ends[$i] = $t_length - $t_start_positions[$i]; $target_starts[$i] = ($target_ends[$i] - $block_sizes[$i]) + 1; } } else { # target in forward strand
@target_starts= map {my $val=$_; $val+=1} (@t_start_positions); # use inclusive coord
for (my $i=0; $i<$block_count; $i++ ) { $query_ends[$i] = $q_length - $q_start_positions[$i]; $query_starts[$i] = ($query_ends[$i] - $block_sizes[$i]) + 1; $target_ends[$i] = $t_start_positions[$i] + $block_sizes[$i]; } } } for (my $i=0; $i<$block_count; $i++ ) { next if ($block_sizes[$i] < 15); $feat2 {start} = $query_starts[$i]; $feat2 {end} = $query_ends[$i]; if ( $query_ends[$i] < $query_starts[$i]) { warning("dodgy feature coordinates: end = $query_ends[$i], start = $query_starts[$i]. Reversing..."); $feat2 {end} = $query_starts[$i]; $feat2 {start} = $query_ends[$i]; } $feat1 {start} = $target_starts[$i]; $feat1 {end} = $target_ends[$i]; my $this_q_bioseq = Bio::Seq->new( -seq => $q_sequences[$i], -moltype => "dna", -alphabet => 'dna', -id => "q_seq"); my $this_t_bioseq = Bio::Seq->new( -seq => $t_sequences[$i], -moltype => "dna", -alphabet => 'dna', -id => "t_seq"); my ($score, $percent_id, $frame) = get_best_score_in_all_frames($this_q_bioseq, $this_t_bioseq); # we put all the features with the same score and percent_id
$feat2 {score} = $score; $feat1 {score} = $feat2 {score}; $feat2 {percent} = $percent_id; $feat1 {percent} = $feat2 {percent}; # other stuff:
$feat1 {db} = undef; $feat1 {db_version} = undef; $feat1 {program} = 'blat'; $feat1 {p_version} = '1'; $feat1 {source} = 'blat'; $feat1 {primary} = 'similarity'; $feat2 {source} = 'blat'; $feat2 {primary} = 'similarity'; ## make strand -1 or 1 rather than - or +
my $t_strand = ( $feat1{strand}eq '-')?"-1":"1"; my $q_strand = ($feat2{strand} eq '-')?"-1":"1"; ## make FeaturePair object
my $feature_pair = new Bio::EnsEMBL::FeaturePair; $feature_pair->seqname($feat2{name}); $feature_pair->start($query_starts[$i]); $feature_pair->end($query_ends[$i]); $feature_pair->strand($q_strand); $feature_pair->hseqname($feat1{name}); $feature_pair->hstart($target_starts[$i]); $feature_pair->hend($target_ends[$i]); $feature_pair->hstrand($t_strand); $feature_pair->score($score); $feature_pair->percent_id($percent_id); # note that the cigar_string is created in Bio::EnsEMBL::BaseAlignFeature
my $alignment = new Bio::EnsEMBL::DnaDnaAlignFeature(-features => [$feature_pair]); push @alignments, $alignment; } } #foreach my $out (@alignments) {
# print STDERR $out->seqname."\t"."BLAT"."\tsimilarity\t".$out->start."\t".$out->end."\t".$out->hseqname."\t".$out->hstart."\t".$out->hend."\t".
# $out->score."\t".$out->p_value."\t".$out->hstrand."\t". $out->strand."\t".$out->identical_matches."\t".$out->positive_matches."\t".$out->cigar_string."\n"; #this line is the start of a gff line for display
#}
return\@ alignments;
}
querydescriptionprevnextTop
sub query {
    my ($self, $val) = @_;
    
    if (defined $val) {
	if (not ref($val)) {   
	    throw("[$val] : file does not exist\n") unless -e $val;
	} elsif (not $val->isa("Bio::PrimarySeqI")) {
	    throw("[$val] is neither a Bio::Seq not a file");
	}
	$self->{_query} = $val;
    }
    
    return $self->{_query}
}
query_typedescriptionprevnextTop
sub query_type {
    my ($self, $mytype) = @_;
    if (defined($mytype) ){
	my $type = lc($mytype);
	unless( $type eq 'dna' || $type eq 'rna' || $type eq 'prot' || $type eq 'dnax' || $type eq 'rnax' ){
	    throw("not the right query type: $type");
	}
	$self->{_query_type} = $type;
    }
    return $self->{_query_type};
}

############################################################
}
rundescriptionprevnextTop
sub run {
  my ($self) = @_;
    
  my $cmd = $self->program  ." ".
            $self->database ." ".
            $self->query ." ".
	    $self->options;

  my $blat_output_pipe = undef;

  $cmd .=  " -out=pslx -noHead stdout";

  info("Running blat to pipe...\n$cmd\n");
  print("Running blat to pipe...\n$cmd\n");

  open($blat_output_pipe, "$cmd |") ||
    throw("Error opening Blat cmd <$cmd>." .
	  " Returned error $? BLAT EXIT: '" .
	  ($? >> 8) . "'," ." SIGNAL '" . ($? & 127) .
	  "', There was " . ($? & 128 ? 'a' : 'no') .
	  " core dump");

  my $results = parse_results($blat_output_pipe);
  close $blat_output_pipe;
  $self->output($results);
}

#
#much of this code is taken from
#ensembl-pipeline/modules/Bio/EnsEMBL/Pipeline/Runnable/Blat.pm
#
}
target_typedescriptionprevnextTop
sub target_type {
    my ($self, $mytype) = @_;
    if (defined($mytype) ){
	my $type = lc($mytype);
	unless( $type eq 'dna' || $type eq 'prot' || $type eq 'dnax' ){
	    throw("not the right target type: $type");
	}
	$self->{_target_type} = $type ;
    }
    return $self->{_target_type};
}

############################################################
}
General documentation
CONTACTTop
ensembl-dev@ebi.ac.uk
APPENDIXTop
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _