diff options
Diffstat (limited to 'differ.scm')
-rw-r--r-- | differ.scm | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/differ.scm b/differ.scm new file mode 100644 index 0000000..53e0eaf --- /dev/null +++ b/differ.scm @@ -0,0 +1,222 @@ +;;;; "differ.scm" O(NP) Sequence Comparison Algorithm. +;;; Copyright (C) 2001 Aubrey Jaffer +; +;Permission to copy this software, to modify it, to redistribute it, +;to distribute modified versions, and to use it for any purpose is +;granted, subject to the following restrictions and understandings. +; +;1. Any copy made of this software must include this copyright notice +;in full. +; +;2. I have made no warrantee or representation that the operation of +;this software will be error-free, and I am under no obligation to +;provide any services, by way of maintenance, update, or otherwise. +; +;3. In conjunction with products arising from the use of this +;material, there shall be no use of my name in any advertising, +;promotional, or sales literature without prior written consent in +;each case. + +;;@noindent +;;This package implements the algorithm: +;; +;;@ifinfo +;;@example +;;S. Wu, E. Myers, U. Manber, and W. Miller, +;; "An O(NP) Sequence Comparison Algorithm," +;; Information Processing Letters 35, 6 (1990), 317-323. +;; @url{http://www.cs.arizona.edu/people/gene/vita.html} +;;@end example +;;@end ifinfo +;;@ifset html +;;S. Wu, <A HREF="http://www.cs.arizona.edu/people/gene/vita.html"> +;;E. Myers,</A> U. Manber, and W. Miller, +;;<A HREF="http://www.cs.arizona.edu/people/gene/PAPERS/np_diff.ps"> +;;"An O(NP) Sequence Comparison Algorithm,"</A> +;;Information Processing Letters 35, 6 (1990), 317-323. +;;@end ifset +;; +;;@noindent +;;If the items being sequenced are text lines, then the computed +;;edit-list is equivalent to the output of the @dfn{diff} utility +;;program. If the items being sequenced are words, then it is like the +;;lesser known @dfn{spiff} program. +;; +;;@noindent +;;The values returned by @code{diff:edit-length} can be used to gauge +;;the degree of match between two sequences. +;; +;;@noindent +;;I believe that this algorithm is currently the fastest for these +;;tasks, but genome sequencing applications fuel extensive research in +;;this area. + +(require 'array) + +(define (fp:compare fp Delta snake len2) + (let loop ((p 0)) + (do ((k (- p) (+ 1 k))) + ((> k (+ -1 Delta))) + (array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k))) + (array-ref fp (+ 1 k)))) + k)) + (do ((k (+ Delta p) (+ -1 k))) + ((< k (+ 1 Delta))) + (array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k))) + (array-ref fp (+ 1 k)))) + k)) + (array-set! fp (snake Delta (max (+ 1 (array-ref fp (+ -1 Delta))) + (array-ref fp (+ 1 Delta)))) + Delta) + (if (= (array-ref fp Delta) len2) + (+ Delta (* 2 p)) + (loop (+ 1 p))))) + +(define (fp->edits fp Delta) + (let loop ((idx (+ -1 Delta)) + (ddx (+ 1 Delta)) + (edits '())) + (define ivl (array-ref fp idx)) + (define dvl (array-ref fp ddx)) + (if (not (= -1 dvl)) (set! dvl (- dvl ddx))) + ;;(print idx '-> ivl ddx '-> dvl) + (cond ((= ivl -1) edits) + ((= dvl -1) (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits))) + ((> dvl ivl) (loop idx (+ 1 ddx) (cons (list dvl 'delete) edits))) + (else (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits)))))) + +(define (fp->lcs fp Delta array1 len) + (define len1 (car (array-dimensions array1))) + (define lcs (make-array #f len)) + (define (subarray-copy! array1 start1 end1 array2 start2) + (do ((i start1 (+ i 1)) + (j start2 (+ j 1)) + (l (- end1 start1) (- l 1))) + ((<= l 0)) + (array-set! array2 (array-ref array1 i) j))) + (let loop ((ddx (+ 1 Delta)) + (pos len1) + (dpos len)) + (let* ((dvl (array-ref fp ddx)) + (sublen (- pos (- dvl ddx -1)))) + (cond ((= dvl -1) + (subarray-copy! array1 0 pos lcs 0) + lcs) + (else + (subarray-copy! array1 (- dvl ddx -1) pos lcs (- dpos sublen)) + (loop (+ 1 ddx) (- dvl ddx) (- dpos sublen))))))) + +;;@args array1 array2 =? +;;@args array1 array2 +;;@1 and @2 are one-dimensional arrays. The procedure @3 is used +;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}. +;;@0 returns a one-dimensional array of length @code{(quotient (- (+ +;;len1 len2) (fp:edit-length @1 @2)) 2)} holding the longest sequence +;;common to both @var{array}s. +(define (diff:longest-common-subsequence array1 array2 . =?) + (define len1 (car (array-dimensions array1))) + (define len2 (car (array-dimensions array2))) + (define (snake k y) + (let snloop ((x (- y k)) + (y y)) + (if (and (< x len1) (< y len2) (=? (array-ref array1 x) + (array-ref array2 y))) + (snloop (+ 1 x) (+ 1 y)) + y))) + (set! =? (if (null? =?) eqv? (car =?))) + (if (> len1 len2) + (diff:longest-common-subsequence array2 array1) + (let ((Delta (- len2 len1)) + (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2))))) + (fp->lcs fp Delta array1 + (quotient (- (+ len1 len2) (fp:compare fp Delta snake len2)) + 2))))) + +;;@args array1 array2 =? +;;@args array1 array2 +;;@1 and @2 are one-dimensional arrays. The procedure @3 is used +;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}. +;;@0 returns a list of length @code{(fp:edit-length @1 @2)} composed of +;;a shortest sequence of edits transformaing @1 to @2. +;; +;;Each edit is a list of an integer and a symbol: +;;@table @asis +;;@item (@var{j} insert) +;;Inserts @code{(array-ref @1 @var{j})} into the sequence. +;;@item (@var{k} delete) +;;Deletes @code{(array-ref @2 @var{k})} from the sequence. +;;@end table +(define (diff:edits array1 array2 . =?) + (define len1 (car (array-dimensions array1))) + (define len2 (car (array-dimensions array2))) + (define (snake k y) + (let snloop ((x (- y k)) + (y y)) + (if (and (< x len1) (< y len2) (=? (array-ref array1 x) + (array-ref array2 y))) + (snloop (+ 1 x) (+ 1 y)) y))) + (set! =? (if (null? =?) eqv? (car =?))) + (if (> len1 len2) + (diff:reverse-edits (diff:edits array2 array1)) + (let ((Delta (- len2 len1)) + (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2))))) + (fp:compare fp Delta snake len2) + ;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" idx)) + ;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" (array-ref fp idx))) + (fp->edits fp Delta)))) + +(define (diff:reverse-edits edits) + (map (lambda (edit) + (list (car edit) + (case (cadr edit) + ((delete) 'insert) + ((insert) 'delete)))) + edits)) + +;;@args array1 array2 =? +;;@args array1 array2 +;;@1 and @2 are one-dimensional arrays. The procedure @3 is used +;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}. +;;@0 returns the length of the shortest sequence of edits transformaing +;;@1 to @2. +(define (diff:edit-length array1 array2 . =?) + (define len1 (car (array-dimensions array1))) + (define len2 (car (array-dimensions array2))) + (define (snake k y) + (let snloop ((x (- y k)) + (y y)) + (if (and (< x len1) (< y len2) (=? (array-ref array1 x) + (array-ref array2 y))) + (snloop (+ 1 x) (+ 1 y)) + y))) + (set! =? (if (null? =?) eqv? (car =?))) + (if (> len1 len2) + (diff:edit-length array2 array1) + (let ((Delta (- len2 len1)) + (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2))))) + (fp:compare fp Delta snake len2)))) + +;;@example +;;(diff:longest-common-subsequence '#(f g h i e j c k l m) +;; '#(f g e h i j k p q r l m)) +;; @result{} #(f g h i j k l m) +;; +;;(diff:edit-length '#(f g h i e j c k l m) +;; '#(f g e h i j k p q r l m)) +;;@result{} 6 +;; +;;(pretty-print (diff:edits '#(f g h i e j c k l m) +;; '#(f g e h i j k p q r l m))) +;;@print{} +;;((3 insert) ; e +;; (4 delete) ; c +;; (6 delete) ; h +;; (7 insert) ; p +;; (8 insert) ; q +;; (9 insert)) ; r +;;@end example + +;; 12 - 10 = 2 +;; -11-10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 +;; -1 -1 -1 -1 -1 -1 -1 -1 -1 3 7 8 9 12 9 8 -1 -1 -1 -1 -1 -1 -1 -1 +;; edit-distance = 6 |