Bio::EnsEMBL::Utils
CigarString
Toolbar
Summary
Bio::EnsEMBL::Utils::CigarString, a utilites module to generate cigar
strings
Package variables
Privates (from "my" definitions)
($count, $state);
Included modules
Inherit
Synopsis
No synopsis!
Description
Sequence alignment hits were previously stored within the core database
as ungapped alignments. This imposed 2 major constraints on alignments:
a) alignments for a single hit record would require multiple rows in the
database, and
b) it was not possible to accurately retrieve the exact original
alignment.
Therefore, in the new branch sequence alignments are now stored as
ungapped alignments in the cigar line format (where CIGAR stands for
Concise Idiosyncratic Gapped Alignment Report).
In the cigar line format alignments are sotred as follows:
M: Match
D: Deletino
I: Insertion
An example of an alignment for a hypthetical protein match is shown
below:
Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
PG P G GP R PLGP
Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
protein_align_feature table as the following cigar line:
7M4D12M2I2MD7M
Methods
Methods description
Name : generate_cigar_string Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars); Function: generate the cigar string for a piece of alignment. Args: 2 array references. The lengths of 2 arrays are the same Return: a cigar string |
Name : generate_cigar_string_by_hsp Usage : my $hsp; # a ready GenericHSP object my $cigar_string = $self->generate_cigar_string_by_hsp($hsp); Function: generate a cigar string by given HSP object. Args : a GenericHSP object Returns: a text string of cigar string |
Name : split_hsp (this name is derived from the original sub in BlastWorn) Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP object. my $factory = new Bio::EnsEMBL::Utils::CigarString; my $cigar_string = $factory->split_hsp($hsp); Function: generate cigar string. Argument: a HSP object. Returns : a text string. |
Methods code
_findIncrements | description | prev | next | Top |
sub _findIncrements
{ my ($self,$qstrand,$hstrand,$qtype,$htype) = @_;
my $qinc = 1 * $qstrand;
my $hinc = 1 * $hstrand;
if ($qtype eq 'dna' && $htype eq 'pep') {
$qinc = 3 * $qinc;
}
if ($qtype eq 'pep' && $htype eq 'dna') {
$hinc = 3 * $hinc;
}
return ($qinc,$hinc);
}
} |
sub _findStrands
{ my ($self,$hsp) = @_;
my $qstrand = $hsp->query->strand;
unless($qstrand == 1 || $qstrand == -1){
$self->warn("query's strand value is neither 1 or -1");
$qstrand = 1;
}
my $hstrand = $hsp->subject->strand;
unless($hstrand == 1 || $hstrand == -1){
$self->warn("subject's strand value is neither 1 or -1");
$hstrand = 1;
}
return ( $qstrand, $hstrand); } |
sub _findTypes
{ my ($self,$hsp) = @_;
my $type1;
my $type2;
my $len1 = $hsp->query->length();
my $len2 = $hsp->subject->length();
if ($len1/$len2 > 2) { $type1 = 'dna'; $type2 = 'pep';
} elsif ($len2/$len1 > 2) { $type1 = 'pep'; $type2 = 'dna';
} else {
$type1 = 'dna';
$type2 = 'dna';
}
return ($type1,$type2); } |
sub _sub_cigar_string
{ my ($self, $new_state) = @_;
my $sub_cigar_string = '';
if($state eq $new_state){
$count++; }else{
$sub_cigar_string .= $count unless $count == 1;
$sub_cigar_string .= $state;
$count = 1;
$state = $new_state;
}
return $sub_cigar_string; } |
sub generate_cigar_string
{
my ($self, $qchars_ref, $hchars_ref) = @_;
my @qchars = @{$qchars_ref};
my @hchars = @{$hchars_ref};
unless(scalar(@qchars) == scalar(@hchars)){
$self->throw("two sequences are not equal in lengths");
}
$count = 0;
$state = 'M';
my $cigar_string = '';
for(my $i=0; $i <= $#qchars; $i++){
my $qchar = $qchars[$i];
my $hchar = $hchars[$i];
if($qchar ne '-' && $hchar ne '-'){ $cigar_string .= $self->_sub_cigar_string('M');
}elsif($qchar eq '-'){ $cigar_string .= $self->_sub_cigar_string('D');
}elsif($hchar eq '-'){ $cigar_string .= $self->_sub_cigar_string('I');
}else{
$self->throw("Impossible state that 2 gaps on each seq aligned");
}
}
$cigar_string .= $self->_sub_cigar_string('X'); return $cigar_string; } |
sub generate_cigar_string_by_hsp
{ my ($self, $hsp) = @_;
unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
$self->throw("A GenericHSP object needed");
}
my @qchars = split(//, $hsp->query_string);
my @hchars = split(//, $hsp->hit_string);
return $self->generate_cigar_string(\@qchars,\@ hchars);
}
1; } |
sub split_hsp
{ my ($self, $hsp) = @_;
$self->throw("a defined object needed") unless($hsp && defined($hsp));
unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
$self->throw("a HSP object needed");
}
my ($qtype, $htype) = $self->_findTypes($hsp);
my ($qstrand, $hstrand) = $self->_findStrands($hsp);
my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype);
my @gaps = ();
my @qchars = split(//, $hsp->query_string);
my @hchars = split(//, $hsp->hit_string);
my $qstart;
if($qstrand == 1){
$qstart = $hsp->query->start;
}elsif($qstart == -1){
$qstart = $hsp->query->end;
}else{
$self->warn("[$qstart], invalid strand value on query");
$qstart = $hsp->query->start;
}
my $hstart;
if($hstrand == 1){
$hstart = $hsp->subject->start;
}elsif($hstrand != -1){
$hstart = $hsp->subject->end;
}else{
$self->throw("[$hstart], invalid strand value on subject");
}
my $qend = $qstart;
my $hend = $hstart;
my $count = 0;
my $found = 0;
my @align_coordinates = ();
while($count <= $#qchars){
if($qchars[$count] ne '-' && $hchars[$count] ne '-') {
$qend += $qinc;
$hend += $hinc;
$found = 1;
}else{ push(@align_coordinates, [$qstart, $hstart]) if($found == 1);
$qstart = $qend;
$qstart += $qinc if($qchars[$count] ne '-');
$hstart = $hend;
$hstart += $hinc if($hchars[$count] ne '-');
$qend = $qstart;
$hend = $hstart;
$found = 0;
}
$count++;
}
if($found){
push(@align_coordinates, [$qstart, $hstart]);
}
my $cigar_string = "";
my $last = $#align_coordinates;
if($last >= 0){
for(my $i=0; $i<$last; $i++){
my $q_this_start = $align_coordinates[$i]->[0];
my $q_next_start = $align_coordinates[$i+1]->[0];
my $q_length = ($q_next_start-$q_this_start-1)*$qinc;
$q_length = abs($q_length);
my $h_this_start = $align_coordinates[$i]->[1];
my $h_next_start = $align_coordinates[$i+1]->[1];
my $h_length = ($h_next_start-$h_this_start-1)*$hinc;
$h_length = abs($h_length);
my $diff = $q_length - $h_length;
if($diff > 0){ $cigar_string .= $diff unless($diff == 1);
$cigar_string .= 'I';
}elsif($diff < 0){ $cigar_string .= -$diff unless($diff == -1);
$cigar_string .= 'D';
}else{ $cigar_string .= $q_length unless($q_length == 1);
$cigar_string .= 'M';
}
} }
return $cigar_string; } |
General documentation
Copyright (c) 1999-2009 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
/info/about/code_licence.html