Bio::EnsEMBL::Utils CigarString
SummaryIncluded librariesPackage variablesDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Summary
Bio::EnsEMBL::Utils::CigarString, a utilites module to generate cigar
strings
Package variables
Privates (from "my" definitions)
($count, $state);
Included modules
Bio::EnsEMBL::Root
Inherit
Bio::EnsEMBL::Root
Synopsis
No synopsis!
Description
Sequence alignment hits were previously stored within the core database
as ungapped alignments. This imposed 2 major constraints on alignments:
a) alignments for a single hit record would require multiple rows in the
database, and
b) it was not possible to accurately retrieve the exact original
alignment.
Therefore, in the new branch sequence alignments are now stored as
ungapped alignments in the cigar line format (where CIGAR stands for
Concise Idiosyncratic Gapped Alignment Report).
In the cigar line format alignments are sotred as follows:
  M: Match
D: Deletino
I: Insertion
An example of an alignment for a hypthetical protein match is shown
below:
  Query:   42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
PG P G GP R PLGP
Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
protein_align_feature table as the following cigar line:
  7M4D12M2I2MD7M
Methods
_findIncrements
No description
Code
_findStrands
No description
Code
_findTypes
No description
Code
_sub_cigar_string
No description
Code
generate_cigar_stringDescriptionCode
generate_cigar_string_by_hspDescriptionCode
split_hspDescriptionCode
Methods description
generate_cigar_stringcode    nextTop
  Name : generate_cigar_string
Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars);
Function: generate the cigar string for a piece of alignment.
Args: 2 array references. The lengths of 2 arrays are the same
Return: a cigar string
generate_cigar_string_by_hspcodeprevnextTop
  
Name : generate_cigar_string_by_hsp
Usage : my $hsp; # a ready GenericHSP object
my $cigar_string = $self->generate_cigar_string_by_hsp($hsp);
Function: generate a cigar string by given HSP object.
Args : a GenericHSP object
Returns: a text string of cigar string
split_hspcodeprevnextTop
    Name  : split_hsp (this name is derived from the original sub in BlastWorn)
Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP object.
my $factory = new Bio::EnsEMBL::Utils::CigarString;
my $cigar_string = $factory->split_hsp($hsp);

Function: generate cigar string.
Argument: a HSP object.
Returns : a text string.
Methods code
_findIncrementsdescriptionprevnextTop
sub _findIncrements {
    my ($self,$qstrand,$hstrand,$qtype,$htype) = @_;

    my $qinc   = 1 * $qstrand;
    my $hinc   = 1 * $hstrand;

    if ($qtype eq 'dna' && $htype eq 'pep') {
    $qinc = 3 * $qinc;
    }
    if ($qtype eq 'pep' && $htype eq 'dna') {
    $hinc = 3 * $hinc;
    }

    return ($qinc,$hinc);
}

# This is a core logic of cigar string. The finite state machine theory is 
# apply. See the below table, x-axis represents the input, with 3 options:
# (+/+) -- Both current query and subject bases are non-gap. Match
# (-/+) -- The current query base is gap, but subject not. Deletion
# (+/-) -- The current subject base is gap, but query not. Insertion
# While the y-axis means the current state with letter 'M', 'D', 'I'
#
# The content of this table is the action taken in response of the input and
# the current state.
# R remain the state, counter increment.
# G;X generate the cigar line based on the current state and counter;
# clear the counter to zero and change to the state X
#
# || +/+ | -/+ | +/- |
# -------+----------------------+
# M || R | G;D | G;I |
# ------------------------------+
# D || G;M | R | G;I |
# ------------------------------+
# I || G;M | G;D | R |
#
}
_findStrandsdescriptionprevnextTop
sub _findStrands {
    my ($self,$hsp) = @_;
    
    my $qstrand = $hsp->query->strand;
    unless($qstrand == 1 || $qstrand == -1){
        $self->warn("query's strand value is neither 1 or -1");
        $qstrand = 1;
    }
    
    my $hstrand = $hsp->subject->strand;
    unless($hstrand == 1 || $hstrand == -1){
        $self->warn("subject's strand value is neither 1 or -1");
        $hstrand = 1;
    }
    
    return ( $qstrand, $hstrand);
}
_findTypesdescriptionprevnextTop
sub _findTypes {
    my ($self,$hsp) = @_;

    my $type1;
    my $type2;
    my $len1 = $hsp->query->length();
    my $len2 = $hsp->subject->length();

    if ($len1/$len2 > 2) {
$type1 = 'dna';
$type2 = 'pep'; } elsif ($len2/$len1 > 2) {
$type1 = 'pep';
$type2 = 'dna'; } else { $type1 = 'dna'; $type2 = 'dna'; } return ($type1,$type2);
}
_sub_cigar_stringdescriptionprevnextTop
sub _sub_cigar_string {
    my ($self, $new_state) = @_;
    my $sub_cigar_string = '';
    if($state eq $new_state){
        $count++; # Remain the state and increase the counter
}else{ $sub_cigar_string .= $count unless $count == 1; $sub_cigar_string .= $state; $count = 1; $state = $new_state; } return $sub_cigar_string;
}
generate_cigar_stringdescriptionprevnextTop
sub generate_cigar_string {
#   my ($self, $qstart, $hstart, $qinc, $hinc, $qchars_ref, $hchars_ref) = @_;
my ($self, $qchars_ref, $hchars_ref) = @_; my @qchars = @{$qchars_ref}; my @hchars = @{$hchars_ref}; unless(scalar(@qchars) == scalar(@hchars)){ $self->throw("two sequences are not equal in lengths"); } $count = 0; $state = 'M'; # the current state of gaps, (M, D, I)
my $cigar_string = ''; for(my $i=0; $i <= $#qchars; $i++){ my $qchar = $qchars[$i]; my $hchar = $hchars[$i]; if($qchar ne '-' && $hchar ne '-'){ # Match
$cigar_string .= $self->_sub_cigar_string('M'); }elsif($qchar eq '-'){ # Deletion
$cigar_string .= $self->_sub_cigar_string('D'); }elsif($hchar eq '-'){ # Insertion
$cigar_string .= $self->_sub_cigar_string('I'); }else{ $self->throw("Impossible state that 2 gaps on each seq aligned"); } } $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
return $cigar_string;
}
generate_cigar_string_by_hspdescriptionprevnextTop
sub generate_cigar_string_by_hsp {
    my ($self, $hsp) = @_;

    unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
        $self->throw("A GenericHSP object needed");
    }

    my @qchars = split(//, $hsp->query_string);
    my @hchars = split(//, $hsp->hit_string);

    return $self->generate_cigar_string(\@qchars,\@ hchars);
}

1;
}
split_hspdescriptionprevnextTop
sub split_hsp {
    my ($self, $hsp) = @_;

    $self->throw("a defined object needed") unless($hsp && defined($hsp));
    unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
        $self->throw("a HSP object needed");
    }

    my ($qtype, $htype) = $self->_findTypes($hsp);
    my ($qstrand, $hstrand) = $self->_findStrands($hsp);
    my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype);

    my @gaps = ();
    my @qchars = split(//, $hsp->query_string);
    my @hchars = split(//, $hsp->hit_string);
    my $qstart;
    if($qstrand == 1){
        $qstart = $hsp->query->start;
    }elsif($qstart == -1){
        $qstart = $hsp->query->end;
    }else{
        $self->warn("[$qstart], invalid strand value on query");
        $qstart = $hsp->query->start; 
        # Is this a SearchIO's bug???
} my $hstart; if($hstrand == 1){ $hstart = $hsp->subject->start; }elsif($hstrand != -1){ $hstart = $hsp->subject->end; }else{ $self->throw("[$hstart], invalid strand value on subject"); } my $qend = $qstart; my $hend = $hstart; my $count = 0; my $found = 0; my @align_coordinates = (); while($count <= $#qchars){ if($qchars[$count] ne '-' && $hchars[$count] ne '-') { $qend += $qinc; $hend += $hinc; $found = 1; }else{ # gapped region
push(@align_coordinates, [$qstart, $hstart]) if($found == 1); $qstart = $qend; $qstart += $qinc if($qchars[$count] ne '-'); $hstart = $hend; $hstart += $hinc if($hchars[$count] ne '-'); $qend = $qstart; $hend = $hstart; $found = 0; } $count++; } if($found){ push(@align_coordinates, [$qstart, $hstart]); } my $cigar_string = ""; my $last = $#align_coordinates; if($last >= 0){ for(my $i=0; $i<$last; $i++){ my $q_this_start = $align_coordinates[$i]->[0]; my $q_next_start = $align_coordinates[$i+1]->[0]; my $q_length = ($q_next_start-$q_this_start-1)*$qinc; $q_length = abs($q_length); my $h_this_start = $align_coordinates[$i]->[1]; my $h_next_start = $align_coordinates[$i+1]->[1]; my $h_length = ($h_next_start-$h_this_start-1)*$hinc; $h_length = abs($h_length); my $diff = $q_length - $h_length; if($diff > 0){ # Insertion
$cigar_string .= $diff unless($diff == 1); $cigar_string .= 'I'; }elsif($diff < 0){ # Deletion
$cigar_string .= -$diff unless($diff == -1); $cigar_string .= 'D'; }else{ # e.g $diff == 0, Match
$cigar_string .= $q_length unless($q_length == 1); $cigar_string .= 'M'; } } # for
} # if
return $cigar_string;
}
General documentation
LICENSETop
  Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license. For license details, please see /info/about/code_licence.html
CONTACTTop
  Please email comments or questions to the public Ensembl
developers list at <ensembl-dev@ebi.ac.uk>.
Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>.
AUTHORTop
Juguang Xiao <juguang@tll.org.sg>