Bio::EnsEMBL::Analysis::Runnable CrossMatch
Included librariesPackage variablesSynopsisDescriptionGeneral documentationMethods
Toolbar
WebCvsRaw content
Package variables
No package variables defined.
Included modules
Bio::EnsEMBL::Analysis::Runnable
Bio::EnsEMBL::FeaturePair
Bio::EnsEMBL::Utils::Argument qw ( rearrange )
Bio::EnsEMBL::Utils::Exception qw ( info verbose throw warning )
strict
Inherit
Bio::EnsEMBL::Analysis::Runnable
Synopsis
	use CrossMatch;

# Create a factory object
$matcher = CrossMatch::Factory->new( '/nfs/disk2001/this_dir',
'/home/jgrg/that_dir' );
$matcher->minMatch( 80 );
# process full crossmatch alignments $matcher->alignments; # Match two fasta fomat sequence files, generating # a match object $cm = $matcher->crossMatch( 'dJ334P19.seq', 'cB49C12.aa' );
Description
Module to provide object-oriented access to Phil Green's cross_match
Smith-Waterman alignment program.
Methods
_save_hit
No description
Code
_trans_fp
No description
Code
add_fpDescriptionCode
fp
No description
Code
hit
No description
Code
masklevelDescriptionCode
minmatchDescriptionCode
minscore
No description
Code
new
No description
Code
outputDescriptionCode
parse_resultsDescriptionCode
run
No description
Code
run_analysisDescriptionCode
Methods description
add_fpcode    nextTop
Title : add_fp
Usage :
Function:
Example :
Returns :
Args :
masklevelcodeprevnextTop
  Title   : masklevel
Usage : $obj->masklevel($newval)
Function: could set and return the masklevel value
Example :
Returns : value of masklevel option used by crossmatch
Args : newvalue (optional)
minmatchcodeprevnextTop
  Title   : minmatch
Usage : $obj->minmatch($newval)
Function: could set and return the minmatch value
Example :
Returns : value of minmatch option used by crossmatch
Args : newvalue (optional)
outputcodeprevnextTop
Title : output
Usage :
Function:
Example :
Returns :
Args :
parse_resultscodeprevnextTop
  Arg [1]   : Bio::EnsEMBL::Analysis::Runnable::ExonerateArray
Arg [2] : string, filename
Function : open and parse the results file into misc_features
features
Returntype: none
Exceptions: throws on failure to open or close output file
Example :
run_analysiscodeprevnextTop
Arg [1] : Bio::EnsEMBL::Analysis::Runnable::ExonerateArray
Arg [2] : string, program name
Function : constructs a commandline and runs the program passed
in, the generic method in Runnable isnt used as ExonerateArray doesnt
fit this module
Returntype: none
Exceptions: throws if run failed because sysetm doesnt
return 0 or the output file doesnt exist
Example :
Methods code
_save_hitdescriptionprevnextTop
sub _save_hit {
  my($self,$rares,$raalign)=@_;

  # only save if something to save
if (@$rares) { # parsing to deal different numbers of items
# if 13 then second sequence is complement
# if 12 then its not, so insert 'W'
my @nres; if (@$rares == 13) { @nres=( @$rares[0..9, 12, 11, 10] ); } elsif (@$rares == 12) { @nres=( @$rares[0..7], 'W', @$rares[8..11] ); } # alignment is stored in a directional fashion: x->y maps to a->b where y>x
my $raw_align; if (@$raalign) { # reverse if required
my $st1=$nres[5]; my $en1=$nres[6]; my $st2; my $en2; my $dirl; if ($$raalign[2] eq 'C') { $$raalign[3]='C'; $$raalign[0]=reverse($$raalign[0]); $$raalign[1]=reverse($$raalign[1]); $dirl='C'; $st2=$nres[11]; $en2=$nres[10]; } else { $st2=$nres[10]; $en2=$nres[11]; } # check length
my $l1=length($$raalign[0]); my $l2=length($$raalign[1]); if ($l1!=$l2) { print "Lengths of alignment are different [$l1,$l2]\n$$raalign[0]\n$$raalign[1]\n"; print join(',',@nres)."\n"; throw( "failed at CrossMatch :_save_hit"); } # walk along sequence in blocks
$raw_align="$st1:$dirl$st2"; my $seq1=$$raalign[0]; my $seq2=$$raalign[1]; { my($s1a,$s1b,$s1c,$s2a,$s2b,$s2c); if ($seq1=~/^([^\-]+)(\-+)(\S+)$/) { ($s1a,$s1b,$s1c)=($1,$2,$3); } else { $s1a=$seq1; $s1b=$s1c=''; } if ($seq2=~/^([^\-]+)(\-+)(\S+)$/) { ($s2a,$s2b,$s2c)=($1,$2,$3); } else { $s2a=$seq2; $s2b=$s2c=''; } #print STDERR "heads are ",substr($s1a,0,10),";",substr($s1b,0,10),":",substr($s2a,0,10),";",substr($s2b,0,10),"\n";
# escape if no more gaps
next if(length($s1c)==0 && length($s2c)==0); # do shortest first
my $lab; if ( length($s1a.$s1b) == 0 || length($s2a.$s2b) == 0 ) { print STDERR "Dodgy alignment processing catch! Bugging out\n"; next; } if (length($s1a.$s1b)<length($s2a.$s2b)) { # update seq1
$lab=length($s1a.$s1b); $st1+=length($s1a); #print STDERR "st1 is $st1 with $lab\n";
$seq1=$s1c; #print STDERR "New head is ",substr($seq1,0,10),"\n";
# update seq2
$seq2=~s/^\S{$lab}//;
#print STDERR "new $lab.. seq2 head is ",substr($seq2,0,10),"\n";
} else { # update seq2
$lab=length($s2a); $seq2=$s2c; # update seq1
my $l2ab=length($s2a.$s2b); $seq1=~s/^\S{$l2ab}//;
$st1+=$l2ab; } if ($dirl eq 'C') { $st2-=$lab; } else { $st2+=$lab; } $raw_align.=",$st1:$dirl$st2"; redo; } $raw_align.=",$en1:$dirl$en2"; } $self->hit(@nres,$raw_align); # clear data
@$rares=(); @$raalign=(); } } # Store data from a hit
}
_trans_fpdescriptionprevnextTop
sub _trans_fp {
  my($align)=@_;
  my($st1p,$st2p);
  $st1p=$st2p=0;
  my @hits;
  foreach my $pair (split(',',$align)) {
    my $dirl;
    my($st1,$st2)=split(':',$pair);
    if ($st2=~/C(\d+)/) {
      $st2=$1;
      $dirl='C';
    }

    # only output if previous saved
if ($st1p) { # calculate ends
my $l1=$st1-$st1p; my $l; if ($dirl) { $l=$st2p-$st2; } else { $l=$st2-$st2p; } # if exact match, must be end, so need full length
if ($l1==$l) { $l++; } elsif ($l1<$l) { $l=$l1; } $l1=$l-1; my $ed1p=$st1p+$l1; my $ed2p; if ($dirl) { $ed2p=$st2p-$l1; } else { $ed2p=$st2p+$l1; } push(@hits,[$st1p,$ed1p,$st2p,$ed2p]); } ($st1p,$st2p)=($st1,$st2); } # if higher than end then no match
return @hits; } 1; __END__
}
add_fpdescriptionprevnextTop
sub add_fp {
  my ($self,@args) = @_;
  
  push(@{$self->{'_fp_array'}},@args);
}
fpdescriptionprevnextTop
sub fp {
  my( $self, $minscore ) = @_;
  # loop over all rows
my @hits; foreach my $row (@{$self->{'active'}}) { my $score=$self->score($row); next if($score<$minscore); my $aname=$self->aName($row); my $bname=$self->bName($row); my @sted=&_trans_fp($self->raw_align($row)); foreach my $sted2 (@sted) { push(@hits,join(":",$aname,$bname,$score,@$sted2)); } } return @hits;
}
hitdescriptionprevnextTop
sub hit {
  my $self = shift;
  
  if (@_ == 14) {
    push( @{$self->{'hit'}}, [@_]);
    push( @{$self->{'active'}}, $#{$self->{'hit'}} );
  } else {
    warning( "Bad number of elements (", scalar @_, ") in '@_'");
  }
}

# Create access methods to access the data from the matches
BEGIN { my( %fields ); { my $i = 0; %fields = map {$_, $i++} qw( score pSub pDel pIns aName aStart aEnd aRemain strand bName bStart bEnd bRemain raw_align); } foreach my $func (keys %fields) { no strict 'refs'; my $i = $fields{ $func }; *$func = sub { my( $match, @rows ) = @_; if (wantarray) { # Extract the requested values
unless (@rows) { # Get a list of all the row indices
@rows = @{$match->{'active'}}; } # Make a vertical slice through the hits
return map $match->{'hit'}[$_][$i], @rows; } else { # Return just one value in scalar context
if (defined $rows[0]) { # Get field from row requested
return $match->{'hit'}[$rows[0]][$i]; } else { # Just get value from first row in active list
my $row = $match->{'active'}[0]; return $match->{'hit'}[$row][$i]; } } } } } # output's full featurepair list
}
maskleveldescriptionprevnextTop
sub masklevel {
  my ($obj,$value) = @_;
  if ( defined $value) {
    $obj->{'masklevel'} = $value;
  }
  return $obj->{'masklevel'};
}
minmatchdescriptionprevnextTop
sub minmatch {
  my ($obj,$value) = @_;
  if ( defined $value) {
    $obj->{'minmatch'} = $value;
  }
  return $obj->{'minmatch'};
}
minscoredescriptionprevnextTop
sub minscore {
  my ($obj,$value) = @_;
  if ( defined $value) {
    $obj->{'minscore'} = $value;
  }
  return $obj->{'minscore'};
}
newdescriptionprevnextTop
sub new {
  my($class,@args) = @_;
  my $self = $class->SUPER::new(@args);
  bless $self,$class;
  
  $self->{'_fp_array'} =[];
  
  my ($seq1,
      $seq2,
      $program,
      $workdir,
      $minscore,
      $minmatch,
      $masklevel) = rearrange([qw(
				  SEQ1 
				  SEQ2 
				  PROGRAM
				  WORKDIR 
				  MINSCORE
				  MINMATCH 
				  MASKLEVEL)],@args);
  
  if( !defined $seq1 || !defined $seq2 ) {
    $self->throw("Must pass in both seq1 and seq1 args");
  }
  
  my $queryfile = $self->queryfile();
  my $targetfile = $queryfile."target";
  
  $self->write_seq_file($seq1,$queryfile);
  $self->write_seq_file($seq2,$targetfile); print "$seq2 and $targetfile\n";
  
  if( $workdir) { 
    $self->workdir($workdir); 
  } else {
    $self->workdir("/tmp");
  }

  $minmatch = 30 unless (defined $minmatch); # $minmatch = $minscore compatatible with cvs version 1.7
$minscore = $minmatch unless (defined $minscore); $masklevel = 101 unless (defined $masklevel); if( defined $minmatch ) { $self->minmatch($minmatch); } if( defined $minscore ) { $self->minmatch($minscore); } if( defined $masklevel ) { $self->masklevel($masklevel); } #my $path = $self->locate_executable("cross_match");
#$self->program($path);
my $options = "-minmatch $minmatch -minscore $minscore -masklevel $masklevel -alignments $queryfile $targetfile "; if ($options){ $self->options($options); } # set stuff in self from @args
return $self;
}
outputdescriptionprevnextTop
sub output {
  my ($self,@args) = @_;
  
  return @{$self->{'_fp_array'}};
}
parse_resultsdescriptionprevnextTop
sub parse_results {
  my ($self, $results) =  @_;
  if(!$results){
    $results = $self->resultsfile;
  }

  my (@res,$ialign,@align);

  open( CROSS_MATCH, $results ) || throw("FAILED to open ".$results." CrossMatch::parse_results");
  
  while (<CROSS_MATCH>){
    
    info ($_) ;
    
    # process alignment lines if requested
# print STDERR "Processing....$_";
if(/^(\w*)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)$/){ ###this is alignment line
if($2 eq $res[4] && $ialign==1){ $align[0].=$4; $align[2]=$1; $ialign=2; }elsif(($2 eq $res[8] || $2 eq $res[9]) && $ialign==2){ $align[1].=$4; $align[3]=$1; $ialign=1; }else{ die "alignment parsing error in Crossmatch.pm\n $_"; } } # this is used to exclude all except alignment summary lines
next unless /\(\d+\).*\(\d+\)/; # Remove parentheses and asterisks
tr/()\*//d; # save previous hit, with alignments
&_save_hit($self,\@res,\@align); @res = split; $ialign=1; } #print STDERR "About to process...\n";
&_save_hit($self,\@res,\@align); #print STDERR "saved hits \n";
undef @res; undef @align; # Check exit status of cross_match
unless (close CROSS_MATCH) { my $error = $! ? "Error from cross_match: $!" : "Error: cross_match exited status $?"; warning( "$error\nCommand: 'cross_command'"); } # Pre-rearranged fields
# 0 1 2 3 4 5 6 7 8 9 10 11 12
# 98 0.00 0.00 0.00 130N4 1 104 84065 W 92M18 68800 68903 0
# 98 0.00 0.00 0.00 92M18 68800 68903 0 W 130N4 1 104 84065
# 8251 0.00 0.00 0.00 130N4 1 84169 0 W 130N4 1 84169 0
# 103 0.00 0.00 0.00 CFAT5 20771 20874 (0) W A1280 1 104 (22149) | W Bs
# 103 0.00 0.00 0.00 CFAT5 20771 20874 (0) C A1280.RC (0) 22253 22150 | C Ar Br
# 103 0.00 0.00 0.00 CFAT5.RC 1 104 (20770) C A1280 (22149) 104 1 | C As Bs
# 26355 1.37 0.42 0.36 Em:AP000350 133977 162328 (0) C Em:AP000352 (125738) 28369 1
# 120 16.53 0.00 0.00 bK363A12 32474 32715 (50) C cE129H9 (4343) 32827 32586
# 100 0.00 0.00 0.00 bK363A12 32666 32765 (0) cE129H9 1 100 (37070) *
# * indicates that there is a higher-scoring match whose domain partly includes the domain of this match.
#process alignments above score
foreach my $fp ( $self->fp($self->minmatch) ) { my ($seq1,$seq2,$score,$start,$end,$hstart,$hend) = split(/:/,$fp); my ($strand,$hstrand,$swap); if ( $start > $end ) { $strand = -1; $swap = $start; $start = $end; $end = $swap; } else { $strand = 1; } if ( $hstart > $hend ) { $hstrand = -1; $swap = $hstart; $hstart = $hend; $hend = $swap; } else { $hstrand = 1; } $fp = Bio::EnsEMBL::FeaturePair->new(); #print STDERR "Processing FP with $start-$end to $hstart-$hend\n";
$fp->start($start); $fp->end($end); $fp->strand($strand); $fp->seqname($seq1); $fp->hstart($hstart); $fp->hend($hend); $fp->hstrand($hstrand); $fp->hseqname($seq2); $fp->score($score); $self->add_fp($fp); }
}
rundescriptionprevnextTop
sub run {
  my ($self, $dir) = @_;

  if (!$dir) {
    $dir = $self->workdir;
  }
  $self->checkdir($dir);
  my $filename = $self->write_seq_file();
  $self->files_to_delete($filename);
  $self->files_to_delete($filename."target");
  $self->files_to_delete($filename.".log");
  $self->files_to_delete($self->resultsfile);
  $self->run_analysis();
  $self->parse_results;
  $self->delete_files;
  return 1;
}
run_analysisdescriptionprevnextTop
sub run_analysis {
  my ($self, $program) = @_;
  
  if(!$program){
    $program = $self->program;
  }
  throw($program." is not executable CrossMatch::run_analysis ")
    unless($program && -x $program);
  my $cmd = $self->program." ";
  $cmd .= $self->options." " if($self->options);
  $cmd .= ">".$self->resultsfile;
  print "Running analysis ".$cmd."\n";
  
  system($cmd) == 0 or throw("FAILED to run ".$cmd." CrossMatch::run_analysis ");

  if(! -e $self->resultsfile){
    throw("FAILED to run CrossMatch on ".$self->queryfile." ".
          $self->resultsfile." has not been produced ".
          "CrossMatch:run_analysis");
  }
}
General documentation
scoreTop
  Title   : score
Usage : $obj->score($newval)
Function: could set and return the score value
Example :
Returns : value of minscore option used by crossmatch
Args : newvalue (optional)
NAME - CrossMatch.pmTop
FUNCTIONSTop
    new
    Create a new CrossMatch factory object. Optional arguments to new is a
list of directories to search (see dir below).
    crossMatch
    Do a cross_match on the two fasta formatted sequence files supplied
as agruments, returning a CrossMatch object. The CrossMatch
object records the parameters used in the match, and the match data
returned my cross_match (if any).
Note: The two fasta files may each contain multiple sequences.
    dir
    Change the list of directories in which crossMatch searches for
files. The list of directories supplied to dir completely replaces
the existing list. Directories are searched in the order supplied.
    This list defaults to the current working directory when the
CrossMatch::Factory object is created.
    extn
    A convenience function which allows you to supply a list of filename
extensions to be considered by crossMatch when finding a file to
pass to cross_match. For example:
	$matcher->extn( 'seq', 'aa' );
$cm = $matcher->crossMatch( 'dJ334P19', 'cB49C12' );
crossMatch will look for files "dJ334P19.seq", "dJ334P19.aa". If
no extns had been set, then only "dJ334P19" would have been
searched for.
    minMatch
    Set the minmatch parameter passed to cross_match. Defaults to 50.
    minScore
    Set the minscore parameter passed to cross_match. Defaults to 50.
    maskLevel
    Set the masklevel parameter passed to
cross_match. Defaults to 101, which displays all
overlapping matches.
    alignments
    Causes the full crossmatch alignments to be parsed and stored in the
Crossmatch object. These can be accessed by the a2b and fp methods.
CrossMatchTop
Data stored in CrossMatch objects, generated by a
CrossMatch::Factory, can be retrieved with a variety of queries.
All the matches are stored internally in an array, in the order in
which they were generated by cross_match. You can apply filters and
sorts (sorts are not yet implemented) to the object, which re-order or
hide matches. These operations save their changes by altering the
active list, which is simply an array of indices which refer to
elements in the array of matches. The data access funtions all
operate through this active list, and the active list can be reset to
show all the matches found by calling the unfilter method.
    SYNOPSIS
    
$firstName = $cm->aName;
@allscores = $cm->score();
@someSores = $cm->score(0,3,4);

    FUNCTIONS
        count
        Returns a count of the number of hits in the active list
        list
        Returns a list of array indices for the current list.
        filter
        
$cm->filter(\&CrossMatch::endHits);

        Takes a reference to a subroutine (or an anonymous subroutine), and
alters the active list to contain only those hits for which the
subroutine returns true. The subroutine should expect to be passed a
CrossMatch object, and an integer corresponding to the index of a
match. Applying a series of filters to the same object removes
successively more objects from the active list.
        unfilter
        Resets the active list to include all the hits, in the order in which
they were generated by cross_match. Returns a list of the active
indices.
        FILTERS
            endHits
            A filter which returns true if one of the sequnces in a hit has its
end in the hit.
    PARAMETERS
    The parameters used by cross_match when doing the match can be
retrieved from the resulting Match object. The two sequences matched
are labelled a and b, where a is the sequence from the first
file passed to crossMatch, and b, the sequence from the second
file. For example:
	$path_to_a = $cm->aFile;
    retrieves the path of the file supplied as the first argument to
cross_match.
        aFile bFile
        The full paths to the files used for sequences a and b.
        minM
        The minmatch parameter used.
        minS
        The minscore parameter used.
        mask
        The masklevel parameter used.
    DATA FIELDS
    Syntax:
	$field = $match->FIELD();
$field = $match->FIELD(INDEX); @fields = $match->FIELD(); @fields = $match->FIELD(LIST);
    Examples:
	$firstScore = $match->score(0);
	@aEnds = $match->aEnd();
    These methods provide access to all the data fields for each hit found
by cross_match. Each of the fields is described below. In scalar
context, a single data field is returned, which is the field from the
first row in the active list if no INDEX is given. In array
context it returns a list with the FIELD from all the hits in the
active list when called without arguments, or from the hits specified
by a LIST of indices.
        score
        The Smith-Waterman score of a hit, adjusted for the complexity of the
matching sequence.
        pSub
        The percent substitutions in the hit.
        pDel
        The percent deletions in the hit (sequence a relative to sequence b).
        pIns
        The percent insertions in the hit (sequence a relative to sequence b).
        aName bName
        ID of the first sequence (sequence a) and second seqeunce (b) in
the match, respecitvely.
        aStart bStart
        Start of hit in a, or b, respectively.
        aEnd bEnd
        End of hit in a, or b, respectively.
        aRemain
        Number of bases in a after the end of the hit. ("0" if the hit
extends to the end of a.)
        bRemain
        The equivalent for sequence b of aRemain, but note that if the
strand of b which matches is the reverse strand, then this is the
number of bases left in b after the hit, back towards the beginning
of b.
        strand
        The strand on sequence b which was matched. This is "W" for the
forward (or parallel or Watson) strand, and "C" for the reverse
(or anti-parallel or Crick) strand.
        a2b
        Returns coordinate in sequence b equivalent to value in sequence
a passed to method. If no corresponding base, returns -1.
        fp
        Returns an array of strings contining full list of ungapped alignment
fragment coordinates, filtered by score value passed to method.
    QUERYING BY SEQUENCE NAME
    These methods allow access to the start, end, and remain
fields for those occasions where you know the name of your sequence,
but don't necessarily know if it is sequence a or b. Syntax and
behaviour is the same as for the DATA FIELDS functions, but the
first argument to the function is the name of the sequence you want to
get data about.
Note: These methods perform a filtering operation, reducing the
active list to only those hits which match the name given (in either
the a or b columns). You'll therefore need to unfilter your
match if you need data from hits which have now become hidden.
    If the sequence name is found in both columns a and b, then a
warning is printed, and the a column is chosen.
    For example, suppose you have matched two fasta files containing only
the sequences cN75H12 and bK1109B5 (in that order), then the following
calls retrieve the same data:
@ends
 = $match->aEnd();
	@ends = $match->end('cN75H12');
$start = $match->bStart(0); $start = $match->start('bK1109B5', 0);
A warning is printed to STDERR if a match contains hits from the name
supplied in both columns, and only hits from the a column are
returned.
        start end remain
        Access the aStart or bStart, aEnd or bEnd, and aRemain or bRemain
fields, depending upon wether the name supplied matches the aName or
bName fields respectively.
BUGSTop
AUTHORTop
James Gilbert email jgrg@sanger.ac.uk
Tim Hubbard email th@sanger.ac.uk (alignment processing extensions)