Raw content of Bio::EnsEMBL::Analysis::Runnable::Blat # # Written by Kathryn Beal # Based on the module in Bio::EnsEMBL::Pipeline::Runnable::Blat # # Copyright GRL/EBI 2007 # # You may distribute this module under the same terms as perl itself # # POD documentation - main docs before the code =pod =head1 NAME Bio::EnsEMBL::Analysis::Runnable::Blat =head1 SYNOPSIS my $runnable = Bio::EnsEMBL::Analysis::Runnable::Blat->new( -database => $self->database, -query => $self->query, -program => $self->program, -options => $self->options, ); $runnable->run; #create and fill Bio::Seq object my @results = $runnable->output; where @results is an array of SeqFeatures, each one representing an aligment (e.g. a transcript), and each feature contains a list of alignment blocks (e.g. exons) as sub_SeqFeatures, which are in fact feature pairs. =head1 DESCRIPTION Blat takes a Bio::Seq (or Bio::PrimarySeq) object and runs Blat against a set of sequences. The resulting output file is parsed to produce a set of features. =head1 CONTACT ensembl-dev@ebi.ac.uk =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut package Bio::EnsEMBL::Analysis::Runnable::Blat; use vars qw(@ISA); use strict; use Bio::EnsEMBL::Utils::Exception qw(throw warning info); use Bio::EnsEMBL::Utils::Argument qw(rearrange); use Bio::EnsEMBL::Analysis::Runnable; use Bio::EnsEMBL::SeqFeature; use Bio::EnsEMBL::FeaturePair; use Bio::EnsEMBL::Analysis; use Bio::EnsEMBL::Root; use Bio::PrimarySeqI; use Bio::SeqI; @ISA = qw(Bio::EnsEMBL::Analysis::Runnable); sub new { my ($class,@args) = @_; my $self = $class->SUPER::new(@args); my ($database) = rearrange([qw(DATABASE)], @args); $self->database($database) if defined $database; throw("You must supply a database") if not $self->database; throw("You must supply a query") if not $self->query; $self->program("blat-32") if not $self->program; return $self; } ############################################################ # # Analysis methods # ############################################################ =head2 run Usage : $obj->run($workdir, $args) Function: Runs blat script and puts the results into the file $self->results It calls $self->parse_results, and results are stored in $self->output =cut sub run { my ($self) = @_; my $cmd = $self->program ." ". $self->database ." ". $self->query ." ". $self->options; my $blat_output_pipe = undef; $cmd .= " -out=pslx -noHead stdout"; info("Running blat to pipe...\n$cmd\n"); print("Running blat to pipe...\n$cmd\n"); open($blat_output_pipe, "$cmd |") || throw("Error opening Blat cmd <$cmd>." . " Returned error $? BLAT EXIT: '" . ($? >> 8) . "'," ." SIGNAL '" . ($? & 127) . "', There was " . ($? & 128 ? 'a' : 'no') . " core dump"); my $results = parse_results($blat_output_pipe); close $blat_output_pipe; $self->output($results); } # #much of this code is taken from #ensembl-pipeline/modules/Bio/EnsEMBL/Pipeline/Runnable/Blat.pm # sub parse_results { my ($blat_output_pipe) = @_; my @alignments; while (<$blat_output_pipe>) { #print STDERR "$_\n"; ############################################################ # PSL lines represent alignments and are typically taken from files generated # by BLAT or psLayout. See the BLAT documentation for more details. # # 1.matches - Number of bases that match that aren't repeats # 2.misMatches - Number of bases that don't match # 3.repMatches - Number of bases that match but are part of repeats # 4.nCount - Number of 'N' bases # 5.qNumInsert - Number of inserts in query # 6.qBaseInsert - Number of bases inserted in query # 7.tNumInsert - Number of inserts in target # 8.tBaseInsert - Number of bases inserted in target # 9.strand - '+' or '-' for query strand. In mouse, second '+'or '-' is for genomic strand #10.qName - Query sequence name #11.qSize - Query sequence size #12.qStart - Alignment start position in query #13.qEnd - Alignment end position in query #14.tName - Target sequence name #15.tSize - Target sequence size #16.tStart - Alignment start position in target #17.tEnd - Alignment end position in target #18.blockCount - Number of blocks in the alignment #19.blockSizes - Comma-separated list of sizes of each block #20.qStarts - Comma-separated list of starting positions of each block in query #21.tStarts - Comma-separated list of starting positions of each block in target ############################################################ # first split on spaces: chomp; my ( $matches, $mismatches, $rep_matches, $n_count, $q_num_insert, $q_base_insert, $t_num_insert, $t_base_insert, $strand, $q_name, $q_length, $q_start, $q_end, $t_name, $t_length, $t_start, $t_end, $block_count, $block_sizes, $q_starts, $t_starts, $q_seqs, $t_seqs ) = split; #print STDERR "$matches, $mismatches, $rep_matches, $n_count, $q_num_insert, $q_base_insert, # $t_num_insert, $t_base_insert, $strand, $q_name, $q_length, $q_start, # $q_end, $t_name, $t_length, $t_start, $t_end, $block_count, # $block_sizes, $q_starts, $t_starts\n"; # ignore any preceeding text #unless ( defined($matches) and $matches =~/^\d+$/ ){ # next; #} # create as many features as blocks there are in each output line my (%feat1, %feat2); $feat1{name} = $t_name; $feat2{name} = $q_name; ################ #Added strand splitter as strand represented by ++ or +- etc if (length($strand)>1){ ($feat2{strand},$feat1{strand}) = split //,$strand; } else { $feat2{strand}=$strand; $feat1{strand}=1; } # all the block sizes add up to $matches + $mismatches + $rep_matches # percentage identity = ( matches not in repeats + matches in repeats ) / ( alignment length ) #print STDERR "calculating percent_id and score:\n"; #print STDERR "matches: $matches, rep_matches: $rep_matches, mismatches: $mismatches, q_length: $q_length\n"; #print STDERR "percent_id = 100x".($matches + $rep_matches)."/".( $matches + $mismatches + $rep_matches )."\n"; #my $percent_id = sprintf "%.2f", ( 100 * ($matches + $rep_matches)/( $matches + $mismatches + $rep_matches ) ); # or is it ...? ## percentage identity = ( matches not in repeats + matches in repeats ) / query length #my $percent_id = sprintf "%.2d", (100 * ($matches + $rep_matches)/$q_length ); # we put basically score = coverage = ( $matches + $mismatches + $rep_matches ) / $q_length #print STDERR "score = 100x".($matches + $mismatches + $rep_matches)."/".( $q_length )."\n"; unless ( $q_length ){ warning("length of query is zero, something is wrong!"); next; } my $score = sprintf "%.2f", ( 100 * ( $matches + $mismatches + $rep_matches ) / $q_length ); # size of each block of alignment (inclusive) my @block_sizes = split ",",$block_sizes; # start position of each block (you must add 1 as psl output is off by one in the start coordinate) my @q_start_positions = split ",",$q_starts; my @t_start_positions = split ",",$t_starts; my @q_sequences = split ",",$q_seqs; my @t_sequences = split ",",$t_seqs; # $superfeature->seqname($q_name); # $superfeature->score( $score ); # $superfeature->percent_id( $percent_id ); # each line of output represents one possible entire aligment of the query (feat1) and the target(feat2) #### working out the coordinates: ######################### # # s e # ========== EST # <----------------------------------------------------| (reversed) genomic of length L # # we would store this as a hit in the reverse strand, with coordinates: # # |----------------------------------------------------> # s' e' # ========== EST # where e' = L - s # s' = e' - ( e - s + 1 ) + 1 # # Also, hstrand will be always +1 ############################################################ ############################################################ #NB strand may =++ or +- etc rather than just + or - # #if qstrand negative reverse qstarts and blocks and calculate the correct co-ordinates # #if Tstrand(genomic strand) negative reverse the tstarts and then cal the co-ords # #if both strands are negative then reverse everything ie blocks and starts before calculating using: # # newqstart=length - (qstart + blocklength) # newqend = length - qstart #NB not sure about when to add the plus 1 -- as psl starts at 0 for start but the end is correct #NB add +1 to the plus strand starts my @query_starts; my @target_starts; my @query_ends; my @target_ends; my @reversed_block_sizes; my @reversed_q_starts; my @reversed_t_starts; if ($feat2{strand} eq '+') { # query in the forward strand @query_starts = map {my $val=$_; $val+=1} (@q_start_positions); # use inclusive coord if ($feat1{strand} eq '-') { # target in the reverse strand for (my $i=0; $i<$block_count; $i++) { $query_ends[$i] = $q_start_positions[$i] + $block_sizes[$i]; $target_ends[$i] = $t_length - $t_start_positions[$i]; $target_starts[$i] = ($target_ends[$i] - $block_sizes[$i]) + 1; } } else { # target in the forward strand @target_starts = map {my $val=$_; $val+=1} (@t_start_positions); # use inclusive coord for (my $i=0; $i<$block_count; $i++) { $query_ends[$i] = $q_start_positions[$i] + $block_sizes[$i]; $target_ends[$i] = $t_start_positions[$i] + $block_sizes[$i]; } } } else { # query in the reverse strand if ($feat1{strand} eq '-') { for (my $i=0; $i<$block_count; $i++ ) { # target in the reverse strand $query_ends[$i] = $q_length - $q_start_positions[$i]; $query_starts[$i] = ($query_ends[$i] - $block_sizes[$i]) + 1; $target_ends[$i] = $t_length - $t_start_positions[$i]; $target_starts[$i] = ($target_ends[$i] - $block_sizes[$i]) + 1; } } else { # target in forward strand @target_starts= map {my $val=$_; $val+=1} (@t_start_positions); # use inclusive coord for (my $i=0; $i<$block_count; $i++ ) { $query_ends[$i] = $q_length - $q_start_positions[$i]; $query_starts[$i] = ($query_ends[$i] - $block_sizes[$i]) + 1; $target_ends[$i] = $t_start_positions[$i] + $block_sizes[$i]; } } } for (my $i=0; $i<$block_count; $i++ ) { next if ($block_sizes[$i] < 15); $feat2 {start} = $query_starts[$i]; $feat2 {end} = $query_ends[$i]; if ( $query_ends[$i] < $query_starts[$i]) { warning("dodgy feature coordinates: end = $query_ends[$i], start = $query_starts[$i]. Reversing..."); $feat2 {end} = $query_starts[$i]; $feat2 {start} = $query_ends[$i]; } $feat1 {start} = $target_starts[$i]; $feat1 {end} = $target_ends[$i]; my $this_q_bioseq = Bio::Seq->new( -seq => $q_sequences[$i], -moltype => "dna", -alphabet => 'dna', -id => "q_seq"); my $this_t_bioseq = Bio::Seq->new( -seq => $t_sequences[$i], -moltype => "dna", -alphabet => 'dna', -id => "t_seq"); my ($score, $percent_id, $frame) = get_best_score_in_all_frames($this_q_bioseq, $this_t_bioseq); # we put all the features with the same score and percent_id $feat2 {score} = $score; $feat1 {score} = $feat2 {score}; $feat2 {percent} = $percent_id; $feat1 {percent} = $feat2 {percent}; # other stuff: $feat1 {db} = undef; $feat1 {db_version} = undef; $feat1 {program} = 'blat'; $feat1 {p_version} = '1'; $feat1 {source} = 'blat'; $feat1 {primary} = 'similarity'; $feat2 {source} = 'blat'; $feat2 {primary} = 'similarity'; ## make strand -1 or 1 rather than - or + my $t_strand = ( $feat1{strand}eq '-')?"-1":"1"; my $q_strand = ($feat2{strand} eq '-')?"-1":"1"; ## make FeaturePair object my $feature_pair = new Bio::EnsEMBL::FeaturePair; $feature_pair->seqname($feat2{name}); $feature_pair->start($query_starts[$i]); $feature_pair->end($query_ends[$i]); $feature_pair->strand($q_strand); $feature_pair->hseqname($feat1{name}); $feature_pair->hstart($target_starts[$i]); $feature_pair->hend($target_ends[$i]); $feature_pair->hstrand($t_strand); $feature_pair->score($score); $feature_pair->percent_id($percent_id); # note that the cigar_string is created in Bio::EnsEMBL::BaseAlignFeature my $alignment = new Bio::EnsEMBL::DnaDnaAlignFeature(-features => [$feature_pair]); push @alignments, $alignment; } } #foreach my $out (@alignments) { # print STDERR $out->seqname."\t"."BLAT"."\tsimilarity\t".$out->start."\t".$out->end."\t".$out->hseqname."\t".$out->hstart."\t".$out->hend."\t". # $out->score."\t".$out->p_value."\t".$out->hstrand."\t". $out->strand."\t".$out->identical_matches."\t".$out->positive_matches."\t".$out->cigar_string."\n"; #this line is the start of a gff line for display #} return \@alignments; } sub get_best_score_in_all_frames { my ($seq1, $seq2, $matrix) = @_; my @aa_seq1_6fr = Bio::SeqUtils->translate_6frames($seq1); my @aa_seq2_6fr = Bio::SeqUtils->translate_6frames($seq2); my $score; my $perc_id = 0; my $frame = 0; ## my $seqs; for (my $i=0; $i<6; $i++) { my $this_score = 0; my $this_perc_id = 0; my $this_seq1 = $aa_seq1_6fr[$i]->seq; my $this_seq2 = $aa_seq2_6fr[$i]->seq; my $length = length($this_seq1); $length = length($this_seq2) if (length($this_seq2) < $length); my @this_seq1 = split("", $this_seq1); my @this_seq2 = split("", $this_seq2); if (defined($matrix)) { for (my $j=0; $j<$length; $j++) { my $aa1 = $this_seq1[$j]; my $aa2 = $this_seq2[$j]; $this_score += $matrix->{$aa1}->{$aa2}; $this_perc_id++ if ($aa1 eq $aa2); } } else { for (my $j=0; $j<$length; $j++) { my $aa1 = $this_seq1[$j]; my $aa2 = $this_seq2[$j]; if ($aa1 eq $aa2) { $this_score += 2; $this_perc_id++; } else { $this_score--; } } } if (!defined($score) or ($this_score > $score)) { $score = $this_score; if ($length) { $perc_id = int(100 * $this_perc_id / $length); } else { $perc_id = 0; } $frame = $i; ## $seqs = $this_seq1."\n".$this_seq2; } } return ($score, $perc_id, $frame); } ############################################################ # # get/set methods # ############################################################ =head2 query Title : query Usage : $self->query($seq) Function: Get/set method for query. If set with a Bio::Seq object it will get written to the local tmp directory Returns : filename Args : Bio::PrimarySeqI, or filename =cut sub query { my ($self, $val) = @_; if (defined $val) { if (not ref($val)) { throw("[$val] : file does not exist\n") unless -e $val; } elsif (not $val->isa("Bio::PrimarySeqI")) { throw("[$val] is neither a Bio::Seq not a file"); } $self->{_query} = $val; } return $self->{_query} } =head2 database Title : database Usage : $self->database($seq) Function: Get/set method for database. If set with a Bio::Seq object it will get written to the local tmp directory Returns : filename Args : Bio::PrimarySeqI, or filename =cut sub database { my ($self, $val) = @_; if (defined $val) { if (not ref($val)) { throw("[$val] : file does not exist\n") unless -e $val; } else { if (ref($val) eq 'ARRAY') { foreach my $el (@$val) { throw("All elements of given database array should be Bio::PrimarySeqs") if not ref($el) or not $el->isa("Bio::PrimarySeq"); } } elsif (not $val->isa("Bio::PrimarySeq")) { throw("[$val] is neither a file nor array of Bio::Seq"); } else { $val = [$val]; } } $self->{_database} = $val; } return $self->{_database}; } ############################################################ sub blat { my ($self, $location) = @_; if ($location) { throw("Blat not found at $location: $!\n") unless (-e $location); $self->{_blat} = $location ; } return $self->{_blat}; } ############################################################ sub query_type { my ($self, $mytype) = @_; if (defined($mytype) ){ my $type = lc($mytype); unless( $type eq 'dna' || $type eq 'rna' || $type eq 'prot' || $type eq 'dnax' || $type eq 'rnax' ){ throw("not the right query type: $type"); } $self->{_query_type} = $type; } return $self->{_query_type}; } ############################################################ sub target_type { my ($self, $mytype) = @_; if (defined($mytype) ){ my $type = lc($mytype); unless( $type eq 'dna' || $type eq 'prot' || $type eq 'dnax' ){ throw("not the right target type: $type"); } $self->{_target_type} = $type ; } return $self->{_target_type}; } ############################################################ sub options { my ($self, $options) = @_; if ($options) { $self->{_options} = $options ; } return $self->{_options}; } ############################################################ sub parse { my ($self, $parse) = @_; if ($parse) { $self->{_parse} = $parse; } return $self->{_parse}; } ############################################################ 1;