summaryrefslogtreecommitdiffstats
path: root/differ.scm
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2017-02-20 00:05:29 -0800
committerBryan Newbold <bnewbold@robocracy.org>2017-02-20 00:05:29 -0800
commit8466d8cfa486fb30d1755c4261b781135083787b (patch)
treec8c12c67246f543c3cc4f64d1c07e003cb1d45ae /differ.scm
parent87b82b5822ca54228cfa6df29be3ad9d4bc47d16 (diff)
downloadslib-8466d8cfa486fb30d1755c4261b781135083787b.tar.gz
slib-8466d8cfa486fb30d1755c4261b781135083787b.zip
Import Upstream version 3a1upstream/3a1
Diffstat (limited to 'differ.scm')
-rw-r--r--differ.scm521
1 files changed, 369 insertions, 152 deletions
diff --git a/differ.scm b/differ.scm
index 53e0eaf..23b0e91 100644
--- a/differ.scm
+++ b/differ.scm
@@ -1,5 +1,5 @@
;;;; "differ.scm" O(NP) Sequence Comparison Algorithm.
-;;; Copyright (C) 2001 Aubrey Jaffer
+;;; Copyright (C) 2001, 2002, 2003 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
@@ -8,7 +8,7 @@
;1. Any copy made of this software must include this copyright notice
;in full.
;
-;2. I have made no warrantee or representation that the operation of
+;2. I have made no warranty or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
@@ -18,7 +18,7 @@
;each case.
;;@noindent
-;;This package implements the algorithm:
+;;@code{diff:edit-length} implements the algorithm:
;;
;;@ifinfo
;;@example
@@ -37,186 +37,403 @@
;;@end ifset
;;
;;@noindent
-;;If the items being sequenced are text lines, then the computed
-;;edit-list is equivalent to the output of the @dfn{diff} utility
-;;program. If the items being sequenced are words, then it is like the
-;;lesser known @dfn{spiff} program.
-;;
-;;@noindent
;;The values returned by @code{diff:edit-length} can be used to gauge
;;the degree of match between two sequences.
;;
;;@noindent
-;;I believe that this algorithm is currently the fastest for these
-;;tasks, but genome sequencing applications fuel extensive research in
-;;this area.
+;;Surprisingly, "An O(NP) Sequence Comparison Algorithm" does not
+;;derive the edit sequence; only the sequence length. Developing this
+;;linear-space sub-quadratic-time algorithm for computing the edit
+;;sequence required hundreds of hours of work. I have submitted a
+;;paper describing the algorithm to the Journal of Computational
+;;Biology.
+;;
+;;@noindent
+;;If the items being sequenced are text lines, then the computed
+;;edit-list is equivalent to the output of the @dfn{diff} utility
+;;program. If the items being sequenced are words, then it is like the
+;;lesser known @dfn{spiff} program.
(require 'array)
+(require 'sort)
-(define (fp:compare fp Delta snake len2)
+;;; p-lim is half the number of gratuitous edits for strings of given
+;;; lengths.
+;;; When passed #f CC, fp:compare returns edit-distance if successful;
+;;; #f otherwise (p > p-lim). When passed CC, fp:compare returns #f.
+(define (fp:compare fp CC A M B N =? p-lim)
+ (define Delta (- N M))
+ ;;(if (negative? Delta) (slib:error 'fp:compare (fp:subarray A 0 M) '> (fp:subarray B 0 N)))
+ ;;(set! compares (+ 1 compares)) ;(print 'fp:compare M N p-lim)
(let loop ((p 0))
(do ((k (- p) (+ 1 k)))
- ((> k (+ -1 Delta)))
- (array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k)))
- (array-ref fp (+ 1 k))))
- k))
+ ((>= k Delta))
+ (fp:run fp k A M B N =? CC p))
(do ((k (+ Delta p) (+ -1 k)))
- ((< k (+ 1 Delta)))
- (array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k)))
- (array-ref fp (+ 1 k))))
- k))
- (array-set! fp (snake Delta (max (+ 1 (array-ref fp (+ -1 Delta)))
- (array-ref fp (+ 1 Delta))))
- Delta)
- (if (= (array-ref fp Delta) len2)
- (+ Delta (* 2 p))
- (loop (+ 1 p)))))
-
-(define (fp->edits fp Delta)
- (let loop ((idx (+ -1 Delta))
- (ddx (+ 1 Delta))
- (edits '()))
- (define ivl (array-ref fp idx))
- (define dvl (array-ref fp ddx))
- (if (not (= -1 dvl)) (set! dvl (- dvl ddx)))
- ;;(print idx '-> ivl ddx '-> dvl)
- (cond ((= ivl -1) edits)
- ((= dvl -1) (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits)))
- ((> dvl ivl) (loop idx (+ 1 ddx) (cons (list dvl 'delete) edits)))
- (else (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits))))))
-
-(define (fp->lcs fp Delta array1 len)
- (define len1 (car (array-dimensions array1)))
- (define lcs (make-array #f len))
- (define (subarray-copy! array1 start1 end1 array2 start2)
- (do ((i start1 (+ i 1))
- (j start2 (+ j 1))
- (l (- end1 start1) (- l 1)))
- ((<= l 0))
- (array-set! array2 (array-ref array1 i) j)))
- (let loop ((ddx (+ 1 Delta))
- (pos len1)
- (dpos len))
- (let* ((dvl (array-ref fp ddx))
- (sublen (- pos (- dvl ddx -1))))
- (cond ((= dvl -1)
- (subarray-copy! array1 0 pos lcs 0)
- lcs)
+ ((<= k Delta))
+ (fp:run fp k A M B N =? CC p))
+ (let ((fpval (fp:run fp Delta A M B N =? CC p)))
+ ;; At this point, the cost to (fpval-Delta, fpval) is Delta + 2*p
+ (cond ((and (not CC) (<= N fpval)) (+ Delta (* 2 p)))
+ ((and (not (negative? p-lim)) (>= p p-lim)) #f)
+ (else (loop (+ 1 p)))))))
+
+;;; Traces runs of matches until they end; then set fp[k]=y.
+;;; If CC is supplied, set each CC[y] = min(CC[y], cost) for run.
+;;; Returns furthest y reached.
+(define (fp:run fp k A M B N =? CC p)
+ (define y (max (+ 1 (array-ref fp (+ -1 k))) (array-ref fp (+ 1 k))))
+ (define cost (+ k p p))
+ (let snloop ((x (- y k))
+ (y y))
+ (and CC (<= y N)
+ (let ((xcst (- M x)))
+ (cond ((negative? xcst))
+ (else (array-set! CC
+ (min (+ xcst cost) (array-ref CC y))
+ y)))))
+ ;;(set! tick (+ 1 tick))
+ (cond ((and (< x M) (< y N)
+ (=? (array-ref A x) (array-ref B y)))
+ (snloop (+ 1 x) (+ 1 y)))
+ (else (array-set! fp y k)
+ y))))
+
+;;; Check that only 1 and -1 steps between adjacent CC entries.
+;;(define (fp:step-check A M B N CC)
+;; (do ((cdx (+ -1 N) (+ -1 cdx)))
+;; ((negative? cdx))
+;; (case (- (array-ref CC cdx) (array-ref CC (+ 1 cdx)))
+;; ((1 -1) #t)
+;; (else (cond ((> 30 (car (array-dimensions CC)))
+;; (display "A: ") (print A)
+;; (display "B: ") (print B)))
+;; (slib:warn
+;; "CC" (append (list (max 0 (+ -5 cdx)) ': (min (+ 1 N) (+ 5 cdx))
+;; 'of)
+;; (array-dimensions CC))
+;; (fp:subarray CC (max 0 (+ -5 cdx)) (min (+ 1 N) (+ 5 cdx))))))))
+
+;;; Correct cost jumps left by fp:compare [which visits only a few (x,y)].
+;;(define (smooth-costs CC N)
+;; (do ((cdx (+ -1 N) (+ -1 cdx))) ; smooth from end
+;; ((negative? cdx))
+;; (array-set! CC (min (array-ref CC cdx) (+ 1 (array-ref CC (+ 1 cdx))))
+;; cdx))
+;; (do ((cdx 1 (+ 1 cdx))) ; smooth toward end
+;; ((> cdx N))
+;; (array-set! CC (min (array-ref CC cdx) (+ 1 (array-ref CC (+ -1 cdx))))
+;; cdx))
+;; CC)
+
+(define (diff:mid-split M N RR CC cost)
+ (define b-splt N) ;Default
+ (define bestrun 0)
+ (define thisrun 0)
+ ;; RR is not longer than CC. So do for each element of RR.
+ (let loop ((cdx (+ 1 (quotient N 2)))
+ (rdx (quotient N 2)))
+ ;;(if (negative? rdx) (slib:error 'negative? 'rdx))
+ (cond ((eqv? cost (+ (array-ref CC rdx) (array-ref RR (- N rdx)))) rdx)
+ ((eqv? cost (+ (array-ref CC cdx) (array-ref RR (- N cdx)))) cdx)
+ (else (loop (+ 1 cdx) (+ -1 rdx))))))
+
+;;; Return 0-based shared array.
+;;; Reverse RA if END < START.
+(define (fp:subarray RA start end)
+ (define n-len (abs (- end start)))
+ (if (< end start)
+ (make-shared-array RA (lambda (idx) (list (- start 1 idx))) n-len)
+ (make-shared-array RA (lambda (idx) (list (+ start idx))) n-len)))
+
+(define (fp:init! fp fill mindx maxdx)
+ (do ((idx maxdx (+ -1 idx)))
+ ((< idx mindx))
+ (array-set! fp fill idx)))
+
+;;; Split A[start-a..end-a] (shorter array) into smaller and smaller chunks.
+;;; EDX is index into EDITS.
+;;; EPO is insert/delete polarity (+1 or -1)
+(define (diff:divide-and-conquer fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim)
+ (define mid-a (quotient (+ start-a end-a) 2))
+ (define len-b (- end-b start-b))
+ (define len-a (- end-a start-a))
+ (let ((tcst (+ p-lim p-lim (- len-b len-a))))
+ (define CC (fp:subarray CCRR 0 (+ len-b 1)))
+ (define RR (fp:subarray CCRR (+ len-b 1) (* 2 (+ len-b 1))))
+ (define M2 (- end-a mid-a))
+ (define M1 (- mid-a start-a))
+ (fp:init! CC (+ len-a len-b) 0 len-b)
+ (fp:init! fp -1 (- (+ 1 p-lim)) (+ 1 p-lim (- len-b M1)))
+ (fp:compare fp CC
+ (fp:subarray A start-a mid-a) M1
+ (fp:subarray B start-b end-b) len-b =? (min p-lim len-a))
+ (fp:init! RR (+ len-a len-b) 0 len-b)
+ (fp:init! fp -1 (- (+ 1 p-lim)) (+ 1 p-lim (- len-b M2)))
+ (fp:compare fp RR
+ (fp:subarray A end-a mid-a) M2
+ (fp:subarray B end-b start-b) len-b =? (min p-lim len-a))
+ ;;(smooth-costs CC len-b) (smooth-costs RR len-b)
+ (let ((b-splt (diff:mid-split len-a len-b RR CC tcst)))
+ (define est-c (array-ref CC b-splt))
+ (define est-r (array-ref RR (- len-b b-splt)))
+ ;;(set! splts (cons (/ b-splt (max .1 len-b)) splts))
+ ;;(display "A: ") (array-for-each display (fp:subarray A start-a mid-a)) (display " + ") (array-for-each display (fp:subarray A mid-a end-a)) (newline)
+ ;;(display "B: ") (array-for-each display (fp:subarray B start-b end-b)) (newline)
+ ;;(print 'cc cc) (print 'rr (fp:subarray RR (+ 1 len-b) 0))
+ ;;(print (make-string (+ 7 (* 2 b-splt)) #\-) '^ (list b-splt))
+ (check-cost! 'CC est-c
+ (diff2et fp CCRR
+ A start-a mid-a
+ B start-b (+ start-b b-splt)
+ edits edx epo =?
+ (quotient (- est-c (- b-splt (- mid-a start-a)))
+ 2)))
+ (check-cost! 'RR est-r
+ (diff2et fp CCRR
+ A mid-a end-a
+ B (+ start-b b-splt) end-b
+ edits (+ est-c edx) epo =?
+ (quotient (- est-r (- (- len-b b-splt)
+ (- end-a mid-a)))
+ 2)))
+ (+ est-c est-r))))
+
+;;; Trim; then diff sub-arrays; either one longer. Returns edit-length
+(define (diff2et fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim)
+ ;; (if (< (- end-a start-a) p-lim) (slib:warn 'diff2et 'len-a (- end-a start-a) 'len-b (- end-b start-b) 'p-lim p-lim))
+ (do ((bdx (+ -1 end-b) (+ -1 bdx))
+ (adx (+ -1 end-a) (+ -1 adx)))
+ ((not (and (<= start-b bdx)
+ (<= start-a adx)
+ (=? (array-ref A adx) (array-ref B bdx))))
+ (do ((bsx start-b (+ 1 bsx))
+ (asx start-a (+ 1 asx)))
+ ((not (and (< bsx bdx)
+ (< asx adx)
+ (=? (array-ref A asx) (array-ref B bsx))))
+ ;;(print 'trim-et (- asx start-a) '+ (- end-a adx))
+ (let ((delta (- (- bdx bsx) (- adx asx))))
+ (if (negative? delta)
+ (diff2ez fp CCRR B bsx (+ 1 bdx) A asx (+ 1 adx)
+ edits edx (- epo) =? (+ delta p-lim))
+ (diff2ez fp CCRR A asx (+ 1 adx) B bsx (+ 1 bdx)
+ edits edx epo =? p-lim))))
+ ;;(set! tick (+ 1 tick))
+ ))
+ ;;(set! tick (+ 1 tick))
+ ))
+
+;;; Diff sub-arrays, A not longer than B. Returns edit-length
+(define (diff2ez fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim)
+ (define len-a (- end-a start-a))
+ (define len-b (- end-b start-b))
+ ;;(if (> len-a len-b) (slib:error 'diff2ez len-a '> len-b))
+ (cond ((zero? p-lim) ; B inserts only
+ (if (= len-b len-a)
+ 0 ; A = B; no edits
+ (let loop ((adx start-a)
+ (bdx start-b)
+ (edx edx))
+ (cond ((>= bdx end-b) (- len-b len-a))
+ ((>= adx end-a)
+ (do ((idx bdx (+ 1 idx))
+ (edx edx (+ 1 edx)))
+ ((>= idx end-b) (- len-b len-a))
+ (array-set! edits (* epo (+ 1 idx)) edx)))
+ ((=? (array-ref A adx) (array-ref B bdx))
+ ;;(set! tick (+ 1 tick))
+ (loop (+ 1 adx) (+ 1 bdx) edx))
+ (else (array-set! edits (* epo (+ 1 bdx)) edx)
+ ;;(set! tick (+ 1 tick))
+ (loop adx (+ 1 bdx) (+ 1 edx)))))))
+ ((<= len-a p-lim) ; delete all A; insert all B
+ ;;(if (< len-a p-lim) (slib:error 'diff2ez len-a len-b 'p-lim p-lim))
+ (do ((idx start-a (+ 1 idx))
+ (edx edx (+ 1 edx)))
+ ((>= idx end-a)
+ (do ((jdx start-b (+ 1 jdx))
+ (edx edx (+ 1 edx)))
+ ((>= jdx end-b))
+ (array-set! edits (* epo (+ 1 jdx)) edx)))
+ (array-set! edits (* epo (- -1 idx)) edx))
+ (+ len-a len-b))
+ (else (diff:divide-and-conquer
+ fp CCRR A start-a end-a B start-b end-b
+ edits edx epo =? p-lim))))
+
+;;;Return new vector of edits in correct sequence
+(define (diff:order-edits edits cost sign)
+ (if (negative? sign)
+ (do ((idx (+ -1 cost) (+ -1 idx)))
+ ((negative? idx))
+ (array-set! edits (- (array-ref edits idx)) idx)))
+ (if (zero? cost)
+ edits
+ (let ((sedits (sort! edits <))
+ (nedits (create-array (As32) cost)))
+ ;; Find -/+ boundary
+ (define len-a (max 0 (- (array-ref sedits 0))))
+ (define len-b (array-ref sedits (+ -1 cost)))
+ (do ((idx 0 (+ 1 idx)))
+ ((or (>= idx cost) (positive? (array-ref sedits idx)))
+ (let loop ((ddx (+ -1 idx))
+ (idx idx)
+ (ndx 0)
+ (adx 0)
+ (bdx 0))
+ (define del (if (negative? ddx) 0 (array-ref sedits ddx)))
+ (define ins (if (>= idx cost) 0 (array-ref sedits idx)))
+ (cond ((and (>= bdx len-b) (>= adx len-a)) nedits)
+ ((and (negative? del) (>= adx (- -1 del))
+ (positive? ins) (>= bdx (+ -1 ins)))
+ (array-set! nedits del ndx)
+ (array-set! nedits ins (+ 1 ndx))
+ (loop (+ -1 ddx) (+ 1 idx) (+ 2 ndx)
+ (+ 1 adx) (+ 1 bdx)))
+ ((and (negative? del) (>= adx (- -1 del)))
+ (array-set! nedits del ndx)
+ (loop (+ -1 ddx) idx (+ 1 ndx) (+ 1 adx) bdx))
+ ((and (positive? ins) (>= bdx (+ -1 ins)))
+ (array-set! nedits ins ndx)
+ (loop ddx (+ 1 idx) (+ 1 ndx) adx (+ 1 bdx)))
+ (else
+ (loop ddx idx ndx (+ 1 adx) (+ 1 bdx))))))))))
+
+;;; len-a < len-b
+(define (edits2lcs lcs edits cost A len-a len-b)
+ (let loop ((edx 0)
+ (sdx 0)
+ (adx 0))
+ (let ((edit (if (< edx cost)
+ (array-ref edits edx)
+ 0)))
+ (cond ((>= adx len-a) lcs)
+ ((positive? edit)
+ (loop (+ 1 edx) sdx adx))
+ ((zero? edit)
+ (array-set! lcs (array-ref A adx) sdx)
+ (loop edx (+ 1 sdx) (+ 1 adx)))
+ ((>= adx (- -1 edit))
+ (loop (+ 1 edx) sdx (+ 1 adx)))
(else
- (subarray-copy! array1 (- dvl ddx -1) pos lcs (- dpos sublen))
- (loop (+ 1 ddx) (- dvl ddx) (- dpos sublen)))))))
+ (array-set! lcs (array-ref A adx) sdx)
+ (loop edx (+ 1 sdx) (+ 1 adx)))))))
+
+;; A not longer than B (M <= N)
+(define (diff2edits A M B N =? p-lim)
+ (define maxdx (if (negative? p-lim) (+ 2 N) (+ 1 p-lim (- N M))))
+ (define mindx (if (negative? p-lim) (- (+ 1 M)) (- (+ 1 p-lim))))
+ ;;(if (> M N) (slib:error 'diff2edits M '> N))
+ (let ((fp (create-array (As32) (list mindx maxdx)))
+ (CCRR (create-array (As32) (* 2 (+ N 1)))))
+ (fp:init! fp -1 mindx maxdx)
+ (let ((est (fp:compare fp #f A M B N =? p-lim)))
+ (and est
+ (let ((edits (create-array (As32) est)))
+ (check-cost! 'diff2edits
+ est
+ (diff2et fp CCRR A 0 M B 0 N edits 0 1 =?
+ (quotient (- est (- N M)) 2)))
+ edits)))))
+;; A not longer than B (M <= N)
+(define (diff2editlen A M B N =? p-lim)
+ (define maxdx (if (negative? p-lim) (+ 1 N) (+ 1 p-lim (- N M))))
+ (define mindx (if (negative? p-lim) (- (+ 1 M)) (- (+ 1 p-lim))))
+ (let ((fp (create-array (As32) (list mindx maxdx))))
+ (fp:init! fp -1 mindx maxdx)
+ (fp:compare fp #f A M B N =? p-lim)))
+
+(define (check-cost! name est cost)
+ (if (not (eqv? est cost))
+ (slib:warn "%s: cost check failed %d != %d\\n" name est cost)))
+
+;;@args array1 array2 =? p-lim
;;@args array1 array2 =?
-;;@args array1 array2
;;@1 and @2 are one-dimensional arrays. The procedure @3 is used
-;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}.
+;;to compare sequence tokens for equality.
+;;
+;;The non-negative integer @4, if provided, is maximum number of
+;;deletions of the shorter sequence to allow. @0 will return @code{#f}
+;;if more deletions would be necessary.
+;;
;;@0 returns a one-dimensional array of length @code{(quotient (- (+
-;;len1 len2) (fp:edit-length @1 @2)) 2)} holding the longest sequence
+;;len1 len2) (diff:edit-length @1 @2)) 2)} holding the longest sequence
;;common to both @var{array}s.
-(define (diff:longest-common-subsequence array1 array2 . =?)
- (define len1 (car (array-dimensions array1)))
- (define len2 (car (array-dimensions array2)))
- (define (snake k y)
- (let snloop ((x (- y k))
- (y y))
- (if (and (< x len1) (< y len2) (=? (array-ref array1 x)
- (array-ref array2 y)))
- (snloop (+ 1 x) (+ 1 y))
- y)))
- (set! =? (if (null? =?) eqv? (car =?)))
- (if (> len1 len2)
- (diff:longest-common-subsequence array2 array1)
- (let ((Delta (- len2 len1))
- (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
- (fp->lcs fp Delta array1
- (quotient (- (+ len1 len2) (fp:compare fp Delta snake len2))
- 2)))))
+(define (diff:longest-common-subsequence A B =? . p-lim)
+ (define len-a (car (array-dimensions a)))
+ (define len-b (car (array-dimensions b)))
+ (set! p-lim (if (null? p-lim) -1 (car p-lim)))
+ (let ((edits (if (< len-b len-a)
+ (diff2edits B len-b A len-a =? p-lim)
+ (diff2edits A len-a B len-b =? p-lim))))
+ (and edits
+ (let* ((cost (car (array-dimensions edits)))
+ (sedit (diff:order-edits edits cost (if (< len-b len-a) -1 1)))
+ (lcs (create-array A (/ (- (+ len-b len-a) cost) 2))))
+ (if (< len-b len-a)
+ (edits2lcs lcs sedit cost B len-b len-a)
+ (edits2lcs lcs sedit cost A len-a len-b))))))
+;;@args array1 array2 =? p-lim
;;@args array1 array2 =?
-;;@args array1 array2
;;@1 and @2 are one-dimensional arrays. The procedure @3 is used
-;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}.
-;;@0 returns a list of length @code{(fp:edit-length @1 @2)} composed of
-;;a shortest sequence of edits transformaing @1 to @2.
+;;to compare sequence tokens for equality.
;;
-;;Each edit is a list of an integer and a symbol:
+;;The non-negative integer @4, if provided, is maximum number of
+;;deletions of the shorter sequence to allow. @0 will return @code{#f}
+;;if more deletions would be necessary.
+;;
+;;@0 returns a vector of length @code{(diff:edit-length @1 @2)} composed
+;;of a shortest sequence of edits transformaing @1 to @2.
+;;
+;;Each edit is an integer:
;;@table @asis
-;;@item (@var{j} insert)
-;;Inserts @code{(array-ref @1 @var{j})} into the sequence.
-;;@item (@var{k} delete)
-;;Deletes @code{(array-ref @2 @var{k})} from the sequence.
+;;@item @var{k} > 0
+;;Inserts @code{(array-ref @1 (+ -1 @var{j}))} into the sequence.
+;;@item @var{k} < 0
+;;Deletes @code{(array-ref @2 (- -1 @var{k}))} from the sequence.
;;@end table
-(define (diff:edits array1 array2 . =?)
- (define len1 (car (array-dimensions array1)))
- (define len2 (car (array-dimensions array2)))
- (define (snake k y)
- (let snloop ((x (- y k))
- (y y))
- (if (and (< x len1) (< y len2) (=? (array-ref array1 x)
- (array-ref array2 y)))
- (snloop (+ 1 x) (+ 1 y)) y)))
- (set! =? (if (null? =?) eqv? (car =?)))
- (if (> len1 len2)
- (diff:reverse-edits (diff:edits array2 array1))
- (let ((Delta (- len2 len1))
- (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
- (fp:compare fp Delta snake len2)
- ;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" idx))
- ;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" (array-ref fp idx)))
- (fp->edits fp Delta))))
-
-(define (diff:reverse-edits edits)
- (map (lambda (edit)
- (list (car edit)
- (case (cadr edit)
- ((delete) 'insert)
- ((insert) 'delete))))
- edits))
+(define (diff:edits A B =? . p-lim)
+ (define len-a (car (array-dimensions a)))
+ (define len-b (car (array-dimensions b)))
+ (set! p-lim (if (null? p-lim) -1 (car p-lim)))
+ (let ((edits (if (< len-b len-a)
+ (diff2edits B len-b A len-a =? p-lim)
+ (diff2edits A len-a B len-b =? p-lim))))
+ (and edits (diff:order-edits edits (car (array-dimensions edits))
+ (if (< len-b len-a) -1 1)))))
+;;@args array1 array2 =? p-lim
;;@args array1 array2 =?
-;;@args array1 array2
;;@1 and @2 are one-dimensional arrays. The procedure @3 is used
-;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}.
+;;to compare sequence tokens for equality.
+;;
+;;The non-negative integer @4, if provided, is maximum number of
+;;deletions of the shorter sequence to allow. @0 will return @code{#f}
+;;if more deletions would be necessary.
+;;
;;@0 returns the length of the shortest sequence of edits transformaing
;;@1 to @2.
-(define (diff:edit-length array1 array2 . =?)
- (define len1 (car (array-dimensions array1)))
- (define len2 (car (array-dimensions array2)))
- (define (snake k y)
- (let snloop ((x (- y k))
- (y y))
- (if (and (< x len1) (< y len2) (=? (array-ref array1 x)
- (array-ref array2 y)))
- (snloop (+ 1 x) (+ 1 y))
- y)))
- (set! =? (if (null? =?) eqv? (car =?)))
- (if (> len1 len2)
- (diff:edit-length array2 array1)
- (let ((Delta (- len2 len1))
- (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
- (fp:compare fp Delta snake len2))))
+(define (diff:edit-length A B =? . p-lim)
+ (define M (car (array-dimensions a)))
+ (define N (car (array-dimensions b)))
+ (set! p-lim (if (null? p-lim) -1 (car p-lim)))
+ (if (< N M)
+ (diff2editlen B N A M =? p-lim)
+ (diff2editlen A M B N =? p-lim)))
;;@example
-;;(diff:longest-common-subsequence '#(f g h i e j c k l m)
-;; '#(f g e h i j k p q r l m))
-;; @result{} #(f g h i j k l m)
+;;(diff:longest-common-subsequence "fghiejcklm" "fgehijkpqrlm" eqv?)
+;;@result{} "fghijklm"
;;
-;;(diff:edit-length '#(f g h i e j c k l m)
-;; '#(f g e h i j k p q r l m))
+;;(diff:edit-length "fghiejcklm" "fgehijkpqrlm" eqv?)
;;@result{} 6
;;
-;;(pretty-print (diff:edits '#(f g h i e j c k l m)
-;; '#(f g e h i j k p q r l m)))
-;;@print{}
-;;((3 insert) ; e
-;; (4 delete) ; c
-;; (6 delete) ; h
-;; (7 insert) ; p
-;; (8 insert) ; q
-;; (9 insert)) ; r
+;;(diff:edits "fghiejcklm" "fgehijkpqrlm" eqv?)
+;;@result{} #As32(3 -5 -7 8 9 10)
+;; ; e c h p q r
;;@end example
-;; 12 - 10 = 2
-;; -11-10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12
-;; -1 -1 -1 -1 -1 -1 -1 -1 -1 3 7 8 9 12 9 8 -1 -1 -1 -1 -1 -1 -1 -1
-;; edit-distance = 6
+;;(trace-all "/home/jaffer/slib/differ.scm")(set! *qp-width* 333)(untrace fp:run fp:subarray)