Raw content of Bio::EnsEMBL::Analysis::Tools::Blastz # EnsEMBL module for parsin blastz output # # Cared for by Abel Ureta-Vidal <abel@ebi.ac.uk> # # You may distribute this module under the same terms as perl itself # =pod =head1 NAME Bio::EnsEMBL::Analysis::Tools::Blastz - Ensembl specific blastz output parser =head1 SYNOPSIS open F,"blastz_output_file"; my $fh = \*F; my $BlastzParser = new Bio::EnsEMBL::Analysis::Tools::Blastz(-fh => $fh); or my $blastz_output_file = "blastz_output_file"; my $BlastzParser = new Bio::EnsEMBL::Analysis::Tools::Blastz(-file => $blastz_output_file); while (defined (my $alignment = $BlastzParser->nextAlignment)) { print $alignment->percent_id," ",$alignment->score," ",$alignment->cigar_string,"\n"; print $alignment->start," ",$alignment->end," ",$alignment->strand,"\n"; print $alignment->hstart," ",$alignment->hend," ",$alignment->hstrand,"\n"; } close F; The constructor only need a filehandle opened on a blastz output file. nextAlignment method return a Bio::EnsEMBL::DnaDnaAlignFeature object corresponding to the next HSP-like alignment. =head1 CONTACT Ensembl development mailing list <ensembl-dev@ebi.ac.uk> Abel Ureta-Vidal <abel@ebi.ac.uk> =head1 APPENDIX The rest of the documentation deals wtih each of the object methods. Internal methods are usually preceded by a _ =cut package Bio::EnsEMBL::Analysis::Tools::Blastz; use strict; use Bio::EnsEMBL::DnaDnaAlignFeature; use Bio::EnsEMBL::Utils::Argument qw(rearrange); our @ISA = qw(Bio::EnsEMBL::Root); sub new { my ($class,@args) = @_; my $self = bless {}, $class; $self->{'_fh'} = undef; # filehandle on results file $self->{'_file'} = undef; # path for a results file $self->{'_eof'} = 0; # indicate if end of file and fh closed $self->{'_parsing_initialized'} = 0; $self->{'_command_line'} = ""; $self->{'_matrix'} = ""; $self->{'_options'} = ""; $self->{'_alignment_reported_before'} = {}; my ($fh,$file) = rearrange([qw(FH FILE)], @args); if ((defined $fh && defined $file) || !(defined $fh || defined $file)){ $self->throw("Must pass in either fh or file argument"); } if (defined $fh) { $self->{'_fh'} = $fh; } else { $self->file($file); open F, $self->file; $self->{'_fh'} = \*F; } $self->_initialize; return $self; } sub _initialize { my ($self) = @_; return undef if ($self->eof); my $fh = $self->fh; while (defined (my $line = <$fh>)) { next if ($line =~ /^\#:lav$/); last if ($line =~ /^\}$/); # d stanza if ($line =~ /^d\s\{$/) { my $command_line = <$fh>; chomp $line; $command_line =~ s/^\s+\"//; $self->command_line($command_line); next; } if ($line =~ /^.*,.*$/) { $line =~ s/\"//g; $self->options($self->options.$line); } else { $self->matrix($self->matrix.$line); } } return $self->_parsing_initialized(1); } =head2 nextAlignmemt Args : none Example : $alignment = $Blastz->nextAligment Descritpion : return the next HSP-like alignment Returntype : array of Bio::EnsEMBL::DnaDnaAlignFeature Exceptions : none Caller : general =cut sub nextAlignment { my ($self) = @_; return undef if ($self->eof); my $fh = $self->fh; my $l_line_fault = 0; while (defined (my $line = <$fh>)) { next if ($line =~ /^\#:lav$/); if ($line =~ /^\#:eof$/) { close $self->fh; $self->eof(1); return undef; } # s stanza : get there sequence length and strand if ($line =~ /^s\s+\{$/) { # on query $line = <$fh>; if ($line =~ /^\s*\"\S+\"\s+(\d+)\s+(\d+)\s+(\d+)\s+\d+$/) { my($start,$end,$strand) = ($1,$2,$3); $self->length($end-$start+1); $self->strand(1) if ($strand == 0); $self->strand(-1) if ($strand == 1); } # on database $line = <$fh>; if ($line =~ /^\s*\"\S+\"\s+(\d+)\s+(\d+)\s+(\d+)\s+\d+$/) { my($hstart,$hend,$hstrand) = ($1,$2,$3); $self->hlength($hend-$hstart+1); $self->hstrand(1) if ($hstrand == 0); $self->hstrand(-1) if ($hstrand == 1); } <$fh>; # skip } line next; } # h stanza : get there seqname and hseqname if ($line =~ /^h\s+\{$/) { # on query $line = <$fh>; if ($line =~ /^\s+\">(\S+)\s*.*\"$/) { my $seqname = $1; $self->seqname($seqname); } # on database $line = <$fh>; if ($line =~ /^\s+\">(\S+)\s*.*\"$/) { my $hseqname = $1; $self->hseqname($hseqname); } <$fh>; # skip } line next; } # a stanza : get there a alignment, with score, percent_id and positions if ($line =~ /^a\s+\{$/) { my ($score,$sum_match_bases,$sum_block_length) ; my @feature_pairs; while (defined ($line = <$fh>)) { last if ($line =~ /^\}$/); if ($line =~ /^\s+s\s+(\d+)$/) { $score = $1; next; } next if ($line =~ /^\s+b\s+\d+\s+\d+$/); next if ($line =~ /^\s+e\s+\d+\s+\d+$/); if ($line =~ /^\s+l\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)$/) { my ($start,$hstart,$end,$hend,$percid) = ($1,$2,$3,$4,$5); if ($start > $end || $hstart > $hend) { # this is a blastz bug that has been reported to the author. No bug fix yet, and # probably not for a long time # so in the meantime, a code hack tries to recover a well format alignment # see below $l_line_fault = 1; next; }; my $block_length = $end - $start + 1; $sum_match_bases += $percid*$block_length/100; $sum_block_length += $block_length; if ($self->strand == -1) { $start = $self->length - $end + 1; $end = $start + $block_length - 1; } if ($self->hstrand == -1) { $hstart = $self->hlength - $hend + 1; $hend = $hstart + $block_length - 1; } if (scalar @feature_pairs == 0 || $l_line_fault == 0) { $l_line_fault = 0; } if ($l_line_fault) { warn("Dealing with a faulty l line\n"); # code hack to fix faulty l lines where start>end or hstart>hend that are ignored. # We extend the previous gap-free piece until it hits the closest # query or target sequence in the next piece # The perc_id and score for this arranged featurepair will not be consistent # but then again, it is a bug in blastz not in the parser... my $f = pop @feature_pairs; my $diff; if ($self->strand == 1) { $diff = $start - $f->end - 1; } else { $diff = $f->start - $end - 1; } if ($self->hstrand == 1 && $diff > $hstart - $f->hend - 1) { $diff = $hstart - $f->hend - 1; } elsif ($diff > $f->hstart - $hend - 1) { $diff = $f->hstart - $hend - 1; } $f->end($f->end + $diff) if ($self->strand == 1); $f->start($f->start - $diff) if ($self->strand == -1); $f->hend($f->hend + $diff) if ($self->hstrand == 1); $f->hstart($f->hstart - $diff) if ($self->hstrand == -1); push @feature_pairs, $f; $l_line_fault = 0; } my $feature_pair = new Bio::EnsEMBL::FeaturePair; $feature_pair->seqname($self->seqname); $feature_pair->start($start); $feature_pair->end($end); $feature_pair->strand($self->strand); $feature_pair->hseqname($self->hseqname); $feature_pair->hstart($hstart); $feature_pair->hend($hend); $feature_pair->hstrand($self->hstrand); $feature_pair->score($score); push @feature_pairs,$feature_pair; } } # calculating the average of percentage identity over the whole HSP-like # not including indels,it probably should... my $average_pecent_id = int($sum_match_bases/$sum_block_length*100); foreach my $feature_pair (@feature_pairs) { $feature_pair->percent_id($average_pecent_id); } my $alignment = new Bio::EnsEMBL::DnaDnaAlignFeature(-features => \@feature_pairs); my $key = ""; map {$key .= $alignment->$_ . "_"} qw(seqname start end strand hseqname hstart hend hstrand score cigar_string); if (defined $self->{'_alignment_reported_before'}{$key}) { next; } $self->{'_alignment_reported_before'}{$key} = 1; return $alignment; } } } =head2 fh Arg [1] : filehandle $filehandle (optional) Example : $Blastz->fh($filehandle) Descritpion : get/set the filehandle value and return it Returntype : filehandle Exceptions : thrown if $filehandle is not the GLOB reference Caller : general =cut sub fh { my ($self,$value) = @_; if(defined $value) { if (ref($value) eq "GLOB") { $self->{'_fh'} = $value; } else { $self->throw("value for fh method should be a filehandle\n"); } } return $self->{'_fh'}; } =head2 file Arg [1] : string $filename_path (optional) Example : $Blastz->file($filename_path) Descritpion : get/set the filename_path value and return it Returntype : string Exceptions : thrown if $filename_path is not found Caller : general =cut sub file { my ($self,$value) = @_; if(defined $value) { if (-e $value) { $self->{'_file'} = $value; } else { $self->throw("file $value not found\n"); } } return $self->{'_file'}; } sub eof { my ($self,$value) = @_; if(defined $value) { $self->{'_eof'} = $value; } return $self->{'_eof'}; } =head2 file Arg [1] : string $commandline (optional) command line used to obtain the blastz output which is parsed Example : $Blastz->commandline($commandline) Descritpion : get/set the commandline value and return it Returntype : string Exceptions : none Caller : general =cut sub command_line { my ($self,$value) = @_; if (defined $value) { $self->{'_command_line'} = $value; } return $self->{'_command_line'}; } =head2 matrix Arg [1] : string $matrix (optional) matrix used to obtain the blastz output which is parsed Example : $Blastz->matrix($matrix) Descritpion : get/set the matrix value and return it Returntype : string Exceptions : none Caller : general =cut sub matrix { my ($self,$value) = @_; if (defined $value) { $self->{'_matrix'} = $value; } return $self->{'_matrix'}; } =head2 options Arg [1] : string $options (optional) options used to obtain the blastz output which is parsed Example : $Blastz->options($options) Descritpion : get/set the options value and return it Returntype : string Exceptions : none Caller : general =cut sub options { my ($self,$value) = @_; if (defined $value) { $self->{'_options'} = $value; } return $self->{'_options'}; } sub _parsing_initialized { my ($self,$value) = @_; if (defined $value) { $self->{'_parsing_initialized'} = $value; } return $self->{'_parsing_initialized'}; } =head2 seqname Arg [1] : string $seqname (optional) name of the query sequence Example : $Blastz->seqname($seqname) Descritpion : get/set the seqname value and return it Returntype : string Exceptions : none Caller : general =cut sub seqname { my ($self,$value) = @_; if (defined $value) { $self->{'_seqname'} = $value; } return $self->{'_seqname'}; } =head2 hseqname Arg [1] : string $hseqname (optional) name of the database sequence Example : $Blastz->hseqname($hseqname) Descritpion : get/set the hseqname value and return it Returntype : string Exceptions : none Caller : general =cut sub hseqname { my ($self,$value) = @_; if (defined $value) { $self->{'_hseqname'} = $value; } return $self->{'_hseqname'}; } =head2 length Arg [1] : int $length (optional) sequence length of the query sequence Example : $Blastz->length($length) Descritpion : get/set the length value and return it Returntype : int Exceptions : none Caller : general =cut sub length { my ($self,$value) = @_; if (defined $value) { $self->{'_length'} = $value; } return $self->{'_length'}; } =head2 hlength Arg [1] : int $length (optional) sequence length of the database sequence Example : $Blastz->hlength($length) Descritpion : get/set the hlength value and return it Returntype : int Exceptions : none Caller : general =cut sub hlength { my ($self,$value) = @_; if (defined $value) { $self->{'_hlength'} = $value; } return $self->{'_hlength'}; } =head2 strand Arg [1] : int $strand (optional) strand of the query sequence Example : $Blastz->strand($strand) Descritpion : get/set the strand value and return it Returntype : int Exceptions : none Caller : general =cut sub strand { my ($self,$value) = @_; if (defined $value) { $self->{'_strand'} = $value; } return $self->{'_strand'}; } =head2 hstrand Arg [1] : int $strand (optional) strand of the query sequence Example : $Blastz->hstrand($strand) Descritpion : get/set the hstrand value and return it Returntype : int Exceptions : none Caller : general =cut sub hstrand { my ($self,$value) = @_; if (defined $value) { $self->{'_hstrand'} = $value; } return $self->{'_hstrand'}; } 1;