;;;; "differ.scm" O(NP) Sequence Comparison Algorithm. ;;; Copyright (C) 2001, 2002, 2003 Aubrey Jaffer ; ;Permission to copy this software, to modify it, to redistribute it, ;to distribute modified versions, and to use it for any purpose is ;granted, subject to the following restrictions and understandings. ; ;1. Any copy made of this software must include this copyright notice ;in full. ; ;2. I have made no warranty or representation that the operation of ;this software will be error-free, and I am under no obligation to ;provide any services, by way of maintenance, update, or otherwise. ; ;3. In conjunction with products arising from the use of this ;material, there shall be no use of my name in any advertising, ;promotional, or sales literature without prior written consent in ;each case. ;;@noindent ;;@code{diff:edit-length} implements the algorithm: ;; ;;@ifinfo ;;@example ;;S. Wu, E. Myers, U. Manber, and W. Miller, ;; "An O(NP) Sequence Comparison Algorithm," ;; Information Processing Letters 35, 6 (1990), 317-323. ;; @url{http://www.cs.arizona.edu/people/gene/vita.html} ;;@end example ;;@end ifinfo ;;@ifset html ;;S. Wu, ;;E. Myers, U. Manber, and W. Miller, ;; ;;"An O(NP) Sequence Comparison Algorithm," ;;Information Processing Letters 35, 6 (1990), 317-323. ;;@end ifset ;; ;;@noindent ;;The values returned by @code{diff:edit-length} can be used to gauge ;;the degree of match between two sequences. ;; ;;@noindent ;;Surprisingly, "An O(NP) Sequence Comparison Algorithm" does not ;;derive the edit sequence; only the sequence length. Developing this ;;linear-space sub-quadratic-time algorithm for computing the edit ;;sequence required hundreds of hours of work. I have submitted a ;;paper describing the algorithm to the Journal of Computational ;;Biology. ;; ;;@noindent ;;If the items being sequenced are text lines, then the computed ;;edit-list is equivalent to the output of the @dfn{diff} utility ;;program. If the items being sequenced are words, then it is like the ;;lesser known @dfn{spiff} program. (require 'array) (require 'sort) ;;; p-lim is half the number of gratuitous edits for strings of given ;;; lengths. ;;; When passed #f CC, fp:compare returns edit-distance if successful; ;;; #f otherwise (p > p-lim). When passed CC, fp:compare returns #f. (define (fp:compare fp CC A M B N =? p-lim) (define Delta (- N M)) ;;(if (negative? Delta) (slib:error 'fp:compare (fp:subarray A 0 M) '> (fp:subarray B 0 N))) ;;(set! compares (+ 1 compares)) ;(print 'fp:compare M N p-lim) (let loop ((p 0)) (do ((k (- p) (+ 1 k))) ((>= k Delta)) (fp:run fp k A M B N =? CC p)) (do ((k (+ Delta p) (+ -1 k))) ((<= k Delta)) (fp:run fp k A M B N =? CC p)) (let ((fpval (fp:run fp Delta A M B N =? CC p))) ;; At this point, the cost to (fpval-Delta, fpval) is Delta + 2*p (cond ((and (not CC) (<= N fpval)) (+ Delta (* 2 p))) ((and (not (negative? p-lim)) (>= p p-lim)) #f) (else (loop (+ 1 p))))))) ;;; Traces runs of matches until they end; then set fp[k]=y. ;;; If CC is supplied, set each CC[y] = min(CC[y], cost) for run. ;;; Returns furthest y reached. (define (fp:run fp k A M B N =? CC p) (define y (max (+ 1 (array-ref fp (+ -1 k))) (array-ref fp (+ 1 k)))) (define cost (+ k p p)) (let snloop ((x (- y k)) (y y)) (and CC (<= y N) (let ((xcst (- M x))) (cond ((negative? xcst)) (else (array-set! CC (min (+ xcst cost) (array-ref CC y)) y))))) ;;(set! tick (+ 1 tick)) (cond ((and (< x M) (< y N) (=? (array-ref A x) (array-ref B y))) (snloop (+ 1 x) (+ 1 y))) (else (array-set! fp y k) y)))) ;;; Check that only 1 and -1 steps between adjacent CC entries. ;;(define (fp:step-check A M B N CC) ;; (do ((cdx (+ -1 N) (+ -1 cdx))) ;; ((negative? cdx)) ;; (case (- (array-ref CC cdx) (array-ref CC (+ 1 cdx))) ;; ((1 -1) #t) ;; (else (cond ((> 30 (car (array-dimensions CC))) ;; (display "A: ") (print A) ;; (display "B: ") (print B))) ;; (slib:warn ;; "CC" (append (list (max 0 (+ -5 cdx)) ': (min (+ 1 N) (+ 5 cdx)) ;; 'of) ;; (array-dimensions CC)) ;; (fp:subarray CC (max 0 (+ -5 cdx)) (min (+ 1 N) (+ 5 cdx)))))))) ;;; Correct cost jumps left by fp:compare [which visits only a few (x,y)]. ;;(define (smooth-costs CC N) ;; (do ((cdx (+ -1 N) (+ -1 cdx))) ; smooth from end ;; ((negative? cdx)) ;; (array-set! CC (min (array-ref CC cdx) (+ 1 (array-ref CC (+ 1 cdx)))) ;; cdx)) ;; (do ((cdx 1 (+ 1 cdx))) ; smooth toward end ;; ((> cdx N)) ;; (array-set! CC (min (array-ref CC cdx) (+ 1 (array-ref CC (+ -1 cdx)))) ;; cdx)) ;; CC) (define (diff:mid-split M N RR CC cost) (define b-splt N) ;Default (define bestrun 0) (define thisrun 0) ;; RR is not longer than CC. So do for each element of RR. (let loop ((cdx (+ 1 (quotient N 2))) (rdx (quotient N 2))) ;;(if (negative? rdx) (slib:error 'negative? 'rdx)) (cond ((eqv? cost (+ (array-ref CC rdx) (array-ref RR (- N rdx)))) rdx) ((eqv? cost (+ (array-ref CC cdx) (array-ref RR (- N cdx)))) cdx) (else (loop (+ 1 cdx) (+ -1 rdx)))))) ;;; Return 0-based shared array. ;;; Reverse RA if END < START. (define (fp:subarray RA start end) (define n-len (abs (- end start))) (if (< end start) (make-shared-array RA (lambda (idx) (list (- start 1 idx))) n-len) (make-shared-array RA (lambda (idx) (list (+ start idx))) n-len))) (define (fp:init! fp fill mindx maxdx) (do ((idx maxdx (+ -1 idx))) ((< idx mindx)) (array-set! fp fill idx))) ;;; Split A[start-a..end-a] (shorter array) into smaller and smaller chunks. ;;; EDX is index into EDITS. ;;; EPO is insert/delete polarity (+1 or -1) (define (diff:divide-and-conquer fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim) (define mid-a (quotient (+ start-a end-a) 2)) (define len-b (- end-b start-b)) (define len-a (- end-a start-a)) (let ((tcst (+ p-lim p-lim (- len-b len-a)))) (define CC (fp:subarray CCRR 0 (+ len-b 1))) (define RR (fp:subarray CCRR (+ len-b 1) (* 2 (+ len-b 1)))) (define M2 (- end-a mid-a)) (define M1 (- mid-a start-a)) (fp:init! CC (+ len-a len-b) 0 len-b) (fp:init! fp -1 (- (+ 1 p-lim)) (+ 1 p-lim (- len-b M1))) (fp:compare fp CC (fp:subarray A start-a mid-a) M1 (fp:subarray B start-b end-b) len-b =? (min p-lim len-a)) (fp:init! RR (+ len-a len-b) 0 len-b) (fp:init! fp -1 (- (+ 1 p-lim)) (+ 1 p-lim (- len-b M2))) (fp:compare fp RR (fp:subarray A end-a mid-a) M2 (fp:subarray B end-b start-b) len-b =? (min p-lim len-a)) ;;(smooth-costs CC len-b) (smooth-costs RR len-b) (let ((b-splt (diff:mid-split len-a len-b RR CC tcst))) (define est-c (array-ref CC b-splt)) (define est-r (array-ref RR (- len-b b-splt))) ;;(set! splts (cons (/ b-splt (max .1 len-b)) splts)) ;;(display "A: ") (array-for-each display (fp:subarray A start-a mid-a)) (display " + ") (array-for-each display (fp:subarray A mid-a end-a)) (newline) ;;(display "B: ") (array-for-each display (fp:subarray B start-b end-b)) (newline) ;;(print 'cc cc) (print 'rr (fp:subarray RR (+ 1 len-b) 0)) ;;(print (make-string (+ 7 (* 2 b-splt)) #\-) '^ (list b-splt)) (check-cost! 'CC est-c (diff2et fp CCRR A start-a mid-a B start-b (+ start-b b-splt) edits edx epo =? (quotient (- est-c (- b-splt (- mid-a start-a))) 2))) (check-cost! 'RR est-r (diff2et fp CCRR A mid-a end-a B (+ start-b b-splt) end-b edits (+ est-c edx) epo =? (quotient (- est-r (- (- len-b b-splt) (- end-a mid-a))) 2))) (+ est-c est-r)))) ;;; Trim; then diff sub-arrays; either one longer. Returns edit-length (define (diff2et fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim) ;; (if (< (- end-a start-a) p-lim) (slib:warn 'diff2et 'len-a (- end-a start-a) 'len-b (- end-b start-b) 'p-lim p-lim)) (do ((bdx (+ -1 end-b) (+ -1 bdx)) (adx (+ -1 end-a) (+ -1 adx))) ((not (and (<= start-b bdx) (<= start-a adx) (=? (array-ref A adx) (array-ref B bdx)))) (do ((bsx start-b (+ 1 bsx)) (asx start-a (+ 1 asx))) ((not (and (< bsx bdx) (< asx adx) (=? (array-ref A asx) (array-ref B bsx)))) ;;(print 'trim-et (- asx start-a) '+ (- end-a adx)) (let ((delta (- (- bdx bsx) (- adx asx)))) (if (negative? delta) (diff2ez fp CCRR B bsx (+ 1 bdx) A asx (+ 1 adx) edits edx (- epo) =? (+ delta p-lim)) (diff2ez fp CCRR A asx (+ 1 adx) B bsx (+ 1 bdx) edits edx epo =? p-lim)))) ;;(set! tick (+ 1 tick)) )) ;;(set! tick (+ 1 tick)) )) ;;; Diff sub-arrays, A not longer than B. Returns edit-length (define (diff2ez fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim) (define len-a (- end-a start-a)) (define len-b (- end-b start-b)) ;;(if (> len-a len-b) (slib:error 'diff2ez len-a '> len-b)) (cond ((zero? p-lim) ; B inserts only (if (= len-b len-a) 0 ; A = B; no edits (let loop ((adx start-a) (bdx start-b) (edx edx)) (cond ((>= bdx end-b) (- len-b len-a)) ((>= adx end-a) (do ((idx bdx (+ 1 idx)) (edx edx (+ 1 edx))) ((>= idx end-b) (- len-b len-a)) (array-set! edits (* epo (+ 1 idx)) edx))) ((=? (array-ref A adx) (array-ref B bdx)) ;;(set! tick (+ 1 tick)) (loop (+ 1 adx) (+ 1 bdx) edx)) (else (array-set! edits (* epo (+ 1 bdx)) edx) ;;(set! tick (+ 1 tick)) (loop adx (+ 1 bdx) (+ 1 edx))))))) ((<= len-a p-lim) ; delete all A; insert all B ;;(if (< len-a p-lim) (slib:error 'diff2ez len-a len-b 'p-lim p-lim)) (do ((idx start-a (+ 1 idx)) (edx edx (+ 1 edx))) ((>= idx end-a) (do ((jdx start-b (+ 1 jdx)) (edx edx (+ 1 edx))) ((>= jdx end-b)) (array-set! edits (* epo (+ 1 jdx)) edx))) (array-set! edits (* epo (- -1 idx)) edx)) (+ len-a len-b)) (else (diff:divide-and-conquer fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim)))) ;;;Return new vector of edits in correct sequence (define (diff:order-edits edits cost sign) (if (negative? sign) (do ((idx (+ -1 cost) (+ -1 idx))) ((negative? idx)) (array-set! edits (- (array-ref edits idx)) idx))) (if (zero? cost) edits (let ((sedits (sort! edits <)) (nedits (create-array (As32) cost))) ;; Find -/+ boundary (define len-a (max 0 (- (array-ref sedits 0)))) (define len-b (array-ref sedits (+ -1 cost))) (do ((idx 0 (+ 1 idx))) ((or (>= idx cost) (positive? (array-ref sedits idx))) (let loop ((ddx (+ -1 idx)) (idx idx) (ndx 0) (adx 0) (bdx 0)) (define del (if (negative? ddx) 0 (array-ref sedits ddx))) (define ins (if (>= idx cost) 0 (array-ref sedits idx))) (cond ((and (>= bdx len-b) (>= adx len-a)) nedits) ((and (negative? del) (>= adx (- -1 del)) (positive? ins) (>= bdx (+ -1 ins))) (array-set! nedits del ndx) (array-set! nedits ins (+ 1 ndx)) (loop (+ -1 ddx) (+ 1 idx) (+ 2 ndx) (+ 1 adx) (+ 1 bdx))) ((and (negative? del) (>= adx (- -1 del))) (array-set! nedits del ndx) (loop (+ -1 ddx) idx (+ 1 ndx) (+ 1 adx) bdx)) ((and (positive? ins) (>= bdx (+ -1 ins))) (array-set! nedits ins ndx) (loop ddx (+ 1 idx) (+ 1 ndx) adx (+ 1 bdx))) (else (loop ddx idx ndx (+ 1 adx) (+ 1 bdx)))))))))) ;;; len-a < len-b (define (edits2lcs lcs edits cost A len-a len-b) (let loop ((edx 0) (sdx 0) (adx 0)) (let ((edit (if (< edx cost) (array-ref edits edx) 0))) (cond ((>= adx len-a) lcs) ((positive? edit) (loop (+ 1 edx) sdx adx)) ((zero? edit) (array-set! lcs (array-ref A adx) sdx) (loop edx (+ 1 sdx) (+ 1 adx))) ((>= adx (- -1 edit)) (loop (+ 1 edx) sdx (+ 1 adx))) (else (array-set! lcs (array-ref A adx) sdx) (loop edx (+ 1 sdx) (+ 1 adx))))))) ;; A not longer than B (M <= N) (define (diff2edits A M B N =? p-lim) (define maxdx (if (negative? p-lim) (+ 2 N) (+ 1 p-lim (- N M)))) (define mindx (if (negative? p-lim) (- (+ 1 M)) (- (+ 1 p-lim)))) ;;(if (> M N) (slib:error 'diff2edits M '> N)) (let ((fp (create-array (As32) (list mindx maxdx))) (CCRR (create-array (As32) (* 2 (+ N 1))))) (fp:init! fp -1 mindx maxdx) (let ((est (fp:compare fp #f A M B N =? p-lim))) (and est (let ((edits (create-array (As32) est))) (check-cost! 'diff2edits est (diff2et fp CCRR A 0 M B 0 N edits 0 1 =? (quotient (- est (- N M)) 2))) edits))))) ;; A not longer than B (M <= N) (define (diff2editlen A M B N =? p-lim) (define maxdx (if (negative? p-lim) (+ 1 N) (+ 1 p-lim (- N M)))) (define mindx (if (negative? p-lim) (- (+ 1 M)) (- (+ 1 p-lim)))) (let ((fp (create-array (As32) (list mindx maxdx)))) (fp:init! fp -1 mindx maxdx) (fp:compare fp #f A M B N =? p-lim))) (define (check-cost! name est cost) (if (not (eqv? est cost)) (slib:warn "%s: cost check failed %d != %d\\n" name est cost))) ;;@args array1 array2 =? p-lim ;;@args array1 array2 =? ;;@1 and @2 are one-dimensional arrays. The procedure @3 is used ;;to compare sequence tokens for equality. ;; ;;The non-negative integer @4, if provided, is maximum number of ;;deletions of the shorter sequence to allow. @0 will return @code{#f} ;;if more deletions would be necessary. ;; ;;@0 returns a one-dimensional array of length @code{(quotient (- (+ ;;len1 len2) (diff:edit-length @1 @2)) 2)} holding the longest sequence ;;common to both @var{array}s. (define (diff:longest-common-subsequence A B =? . p-lim) (define len-a (car (array-dimensions a))) (define len-b (car (array-dimensions b))) (set! p-lim (if (null? p-lim) -1 (car p-lim))) (let ((edits (if (< len-b len-a) (diff2edits B len-b A len-a =? p-lim) (diff2edits A len-a B len-b =? p-lim)))) (and edits (let* ((cost (car (array-dimensions edits))) (sedit (diff:order-edits edits cost (if (< len-b len-a) -1 1))) (lcs (create-array A (/ (- (+ len-b len-a) cost) 2)))) (if (< len-b len-a) (edits2lcs lcs sedit cost B len-b len-a) (edits2lcs lcs sedit cost A len-a len-b)))))) ;;@args array1 array2 =? p-lim ;;@args array1 array2 =? ;;@1 and @2 are one-dimensional arrays. The procedure @3 is used ;;to compare sequence tokens for equality. ;; ;;The non-negative integer @4, if provided, is maximum number of ;;deletions of the shorter sequence to allow. @0 will return @code{#f} ;;if more deletions would be necessary. ;; ;;@0 returns a vector of length @code{(diff:edit-length @1 @2)} composed ;;of a shortest sequence of edits transformaing @1 to @2. ;; ;;Each edit is an integer: ;;@table @asis ;;@item @var{k} > 0 ;;Inserts @code{(array-ref @1 (+ -1 @var{j}))} into the sequence. ;;@item @var{k} < 0 ;;Deletes @code{(array-ref @2 (- -1 @var{k}))} from the sequence. ;;@end table (define (diff:edits A B =? . p-lim) (define len-a (car (array-dimensions a))) (define len-b (car (array-dimensions b))) (set! p-lim (if (null? p-lim) -1 (car p-lim))) (let ((edits (if (< len-b len-a) (diff2edits B len-b A len-a =? p-lim) (diff2edits A len-a B len-b =? p-lim)))) (and edits (diff:order-edits edits (car (array-dimensions edits)) (if (< len-b len-a) -1 1))))) ;;@args array1 array2 =? p-lim ;;@args array1 array2 =? ;;@1 and @2 are one-dimensional arrays. The procedure @3 is used ;;to compare sequence tokens for equality. ;; ;;The non-negative integer @4, if provided, is maximum number of ;;deletions of the shorter sequence to allow. @0 will return @code{#f} ;;if more deletions would be necessary. ;; ;;@0 returns the length of the shortest sequence of edits transformaing ;;@1 to @2. (define (diff:edit-length A B =? . p-lim) (define M (car (array-dimensions a))) (define N (car (array-dimensions b))) (set! p-lim (if (null? p-lim) -1 (car p-lim))) (if (< N M) (diff2editlen B N A M =? p-lim) (diff2editlen A M B N =? p-lim))) ;;@example ;;(diff:longest-common-subsequence "fghiejcklm" "fgehijkpqrlm" eqv?) ;;@result{} "fghijklm" ;; ;;(diff:edit-length "fghiejcklm" "fgehijkpqrlm" eqv?) ;;@result{} 6 ;; ;;(diff:edits "fghiejcklm" "fgehijkpqrlm" eqv?) ;;@result{} #As32(3 -5 -7 8 9 10) ;; ; e c h p q r ;;@end example ;;(trace-all "/home/jaffer/slib/differ.scm")(set! *qp-width* 333)(untrace fp:run fp:subarray)