diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2017-02-20 00:05:29 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2017-02-20 00:05:29 -0800 |
commit | 8466d8cfa486fb30d1755c4261b781135083787b (patch) | |
tree | c8c12c67246f543c3cc4f64d1c07e003cb1d45ae /differ.scm | |
parent | 87b82b5822ca54228cfa6df29be3ad9d4bc47d16 (diff) | |
download | slib-8466d8cfa486fb30d1755c4261b781135083787b.tar.gz slib-8466d8cfa486fb30d1755c4261b781135083787b.zip |
Import Upstream version 3a1upstream/3a1
Diffstat (limited to 'differ.scm')
-rw-r--r-- | differ.scm | 521 |
1 files changed, 369 insertions, 152 deletions
@@ -1,5 +1,5 @@ ;;;; "differ.scm" O(NP) Sequence Comparison Algorithm. -;;; Copyright (C) 2001 Aubrey Jaffer +;;; Copyright (C) 2001, 2002, 2003 Aubrey Jaffer ; ;Permission to copy this software, to modify it, to redistribute it, ;to distribute modified versions, and to use it for any purpose is @@ -8,7 +8,7 @@ ;1. Any copy made of this software must include this copyright notice ;in full. ; -;2. I have made no warrantee or representation that the operation of +;2. I have made no warranty or representation that the operation of ;this software will be error-free, and I am under no obligation to ;provide any services, by way of maintenance, update, or otherwise. ; @@ -18,7 +18,7 @@ ;each case. ;;@noindent -;;This package implements the algorithm: +;;@code{diff:edit-length} implements the algorithm: ;; ;;@ifinfo ;;@example @@ -37,186 +37,403 @@ ;;@end ifset ;; ;;@noindent -;;If the items being sequenced are text lines, then the computed -;;edit-list is equivalent to the output of the @dfn{diff} utility -;;program. If the items being sequenced are words, then it is like the -;;lesser known @dfn{spiff} program. -;; -;;@noindent ;;The values returned by @code{diff:edit-length} can be used to gauge ;;the degree of match between two sequences. ;; ;;@noindent -;;I believe that this algorithm is currently the fastest for these -;;tasks, but genome sequencing applications fuel extensive research in -;;this area. +;;Surprisingly, "An O(NP) Sequence Comparison Algorithm" does not +;;derive the edit sequence; only the sequence length. Developing this +;;linear-space sub-quadratic-time algorithm for computing the edit +;;sequence required hundreds of hours of work. I have submitted a +;;paper describing the algorithm to the Journal of Computational +;;Biology. +;; +;;@noindent +;;If the items being sequenced are text lines, then the computed +;;edit-list is equivalent to the output of the @dfn{diff} utility +;;program. If the items being sequenced are words, then it is like the +;;lesser known @dfn{spiff} program. (require 'array) +(require 'sort) -(define (fp:compare fp Delta snake len2) +;;; p-lim is half the number of gratuitous edits for strings of given +;;; lengths. +;;; When passed #f CC, fp:compare returns edit-distance if successful; +;;; #f otherwise (p > p-lim). When passed CC, fp:compare returns #f. +(define (fp:compare fp CC A M B N =? p-lim) + (define Delta (- N M)) + ;;(if (negative? Delta) (slib:error 'fp:compare (fp:subarray A 0 M) '> (fp:subarray B 0 N))) + ;;(set! compares (+ 1 compares)) ;(print 'fp:compare M N p-lim) (let loop ((p 0)) (do ((k (- p) (+ 1 k))) - ((> k (+ -1 Delta))) - (array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k))) - (array-ref fp (+ 1 k)))) - k)) + ((>= k Delta)) + (fp:run fp k A M B N =? CC p)) (do ((k (+ Delta p) (+ -1 k))) - ((< k (+ 1 Delta))) - (array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k))) - (array-ref fp (+ 1 k)))) - k)) - (array-set! fp (snake Delta (max (+ 1 (array-ref fp (+ -1 Delta))) - (array-ref fp (+ 1 Delta)))) - Delta) - (if (= (array-ref fp Delta) len2) - (+ Delta (* 2 p)) - (loop (+ 1 p))))) - -(define (fp->edits fp Delta) - (let loop ((idx (+ -1 Delta)) - (ddx (+ 1 Delta)) - (edits '())) - (define ivl (array-ref fp idx)) - (define dvl (array-ref fp ddx)) - (if (not (= -1 dvl)) (set! dvl (- dvl ddx))) - ;;(print idx '-> ivl ddx '-> dvl) - (cond ((= ivl -1) edits) - ((= dvl -1) (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits))) - ((> dvl ivl) (loop idx (+ 1 ddx) (cons (list dvl 'delete) edits))) - (else (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits)))))) - -(define (fp->lcs fp Delta array1 len) - (define len1 (car (array-dimensions array1))) - (define lcs (make-array #f len)) - (define (subarray-copy! array1 start1 end1 array2 start2) - (do ((i start1 (+ i 1)) - (j start2 (+ j 1)) - (l (- end1 start1) (- l 1))) - ((<= l 0)) - (array-set! array2 (array-ref array1 i) j))) - (let loop ((ddx (+ 1 Delta)) - (pos len1) - (dpos len)) - (let* ((dvl (array-ref fp ddx)) - (sublen (- pos (- dvl ddx -1)))) - (cond ((= dvl -1) - (subarray-copy! array1 0 pos lcs 0) - lcs) + ((<= k Delta)) + (fp:run fp k A M B N =? CC p)) + (let ((fpval (fp:run fp Delta A M B N =? CC p))) + ;; At this point, the cost to (fpval-Delta, fpval) is Delta + 2*p + (cond ((and (not CC) (<= N fpval)) (+ Delta (* 2 p))) + ((and (not (negative? p-lim)) (>= p p-lim)) #f) + (else (loop (+ 1 p))))))) + +;;; Traces runs of matches until they end; then set fp[k]=y. +;;; If CC is supplied, set each CC[y] = min(CC[y], cost) for run. +;;; Returns furthest y reached. +(define (fp:run fp k A M B N =? CC p) + (define y (max (+ 1 (array-ref fp (+ -1 k))) (array-ref fp (+ 1 k)))) + (define cost (+ k p p)) + (let snloop ((x (- y k)) + (y y)) + (and CC (<= y N) + (let ((xcst (- M x))) + (cond ((negative? xcst)) + (else (array-set! CC + (min (+ xcst cost) (array-ref CC y)) + y))))) + ;;(set! tick (+ 1 tick)) + (cond ((and (< x M) (< y N) + (=? (array-ref A x) (array-ref B y))) + (snloop (+ 1 x) (+ 1 y))) + (else (array-set! fp y k) + y)))) + +;;; Check that only 1 and -1 steps between adjacent CC entries. +;;(define (fp:step-check A M B N CC) +;; (do ((cdx (+ -1 N) (+ -1 cdx))) +;; ((negative? cdx)) +;; (case (- (array-ref CC cdx) (array-ref CC (+ 1 cdx))) +;; ((1 -1) #t) +;; (else (cond ((> 30 (car (array-dimensions CC))) +;; (display "A: ") (print A) +;; (display "B: ") (print B))) +;; (slib:warn +;; "CC" (append (list (max 0 (+ -5 cdx)) ': (min (+ 1 N) (+ 5 cdx)) +;; 'of) +;; (array-dimensions CC)) +;; (fp:subarray CC (max 0 (+ -5 cdx)) (min (+ 1 N) (+ 5 cdx)))))))) + +;;; Correct cost jumps left by fp:compare [which visits only a few (x,y)]. +;;(define (smooth-costs CC N) +;; (do ((cdx (+ -1 N) (+ -1 cdx))) ; smooth from end +;; ((negative? cdx)) +;; (array-set! CC (min (array-ref CC cdx) (+ 1 (array-ref CC (+ 1 cdx)))) +;; cdx)) +;; (do ((cdx 1 (+ 1 cdx))) ; smooth toward end +;; ((> cdx N)) +;; (array-set! CC (min (array-ref CC cdx) (+ 1 (array-ref CC (+ -1 cdx)))) +;; cdx)) +;; CC) + +(define (diff:mid-split M N RR CC cost) + (define b-splt N) ;Default + (define bestrun 0) + (define thisrun 0) + ;; RR is not longer than CC. So do for each element of RR. + (let loop ((cdx (+ 1 (quotient N 2))) + (rdx (quotient N 2))) + ;;(if (negative? rdx) (slib:error 'negative? 'rdx)) + (cond ((eqv? cost (+ (array-ref CC rdx) (array-ref RR (- N rdx)))) rdx) + ((eqv? cost (+ (array-ref CC cdx) (array-ref RR (- N cdx)))) cdx) + (else (loop (+ 1 cdx) (+ -1 rdx)))))) + +;;; Return 0-based shared array. +;;; Reverse RA if END < START. +(define (fp:subarray RA start end) + (define n-len (abs (- end start))) + (if (< end start) + (make-shared-array RA (lambda (idx) (list (- start 1 idx))) n-len) + (make-shared-array RA (lambda (idx) (list (+ start idx))) n-len))) + +(define (fp:init! fp fill mindx maxdx) + (do ((idx maxdx (+ -1 idx))) + ((< idx mindx)) + (array-set! fp fill idx))) + +;;; Split A[start-a..end-a] (shorter array) into smaller and smaller chunks. +;;; EDX is index into EDITS. +;;; EPO is insert/delete polarity (+1 or -1) +(define (diff:divide-and-conquer fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim) + (define mid-a (quotient (+ start-a end-a) 2)) + (define len-b (- end-b start-b)) + (define len-a (- end-a start-a)) + (let ((tcst (+ p-lim p-lim (- len-b len-a)))) + (define CC (fp:subarray CCRR 0 (+ len-b 1))) + (define RR (fp:subarray CCRR (+ len-b 1) (* 2 (+ len-b 1)))) + (define M2 (- end-a mid-a)) + (define M1 (- mid-a start-a)) + (fp:init! CC (+ len-a len-b) 0 len-b) + (fp:init! fp -1 (- (+ 1 p-lim)) (+ 1 p-lim (- len-b M1))) + (fp:compare fp CC + (fp:subarray A start-a mid-a) M1 + (fp:subarray B start-b end-b) len-b =? (min p-lim len-a)) + (fp:init! RR (+ len-a len-b) 0 len-b) + (fp:init! fp -1 (- (+ 1 p-lim)) (+ 1 p-lim (- len-b M2))) + (fp:compare fp RR + (fp:subarray A end-a mid-a) M2 + (fp:subarray B end-b start-b) len-b =? (min p-lim len-a)) + ;;(smooth-costs CC len-b) (smooth-costs RR len-b) + (let ((b-splt (diff:mid-split len-a len-b RR CC tcst))) + (define est-c (array-ref CC b-splt)) + (define est-r (array-ref RR (- len-b b-splt))) + ;;(set! splts (cons (/ b-splt (max .1 len-b)) splts)) + ;;(display "A: ") (array-for-each display (fp:subarray A start-a mid-a)) (display " + ") (array-for-each display (fp:subarray A mid-a end-a)) (newline) + ;;(display "B: ") (array-for-each display (fp:subarray B start-b end-b)) (newline) + ;;(print 'cc cc) (print 'rr (fp:subarray RR (+ 1 len-b) 0)) + ;;(print (make-string (+ 7 (* 2 b-splt)) #\-) '^ (list b-splt)) + (check-cost! 'CC est-c + (diff2et fp CCRR + A start-a mid-a + B start-b (+ start-b b-splt) + edits edx epo =? + (quotient (- est-c (- b-splt (- mid-a start-a))) + 2))) + (check-cost! 'RR est-r + (diff2et fp CCRR + A mid-a end-a + B (+ start-b b-splt) end-b + edits (+ est-c edx) epo =? + (quotient (- est-r (- (- len-b b-splt) + (- end-a mid-a))) + 2))) + (+ est-c est-r)))) + +;;; Trim; then diff sub-arrays; either one longer. Returns edit-length +(define (diff2et fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim) + ;; (if (< (- end-a start-a) p-lim) (slib:warn 'diff2et 'len-a (- end-a start-a) 'len-b (- end-b start-b) 'p-lim p-lim)) + (do ((bdx (+ -1 end-b) (+ -1 bdx)) + (adx (+ -1 end-a) (+ -1 adx))) + ((not (and (<= start-b bdx) + (<= start-a adx) + (=? (array-ref A adx) (array-ref B bdx)))) + (do ((bsx start-b (+ 1 bsx)) + (asx start-a (+ 1 asx))) + ((not (and (< bsx bdx) + (< asx adx) + (=? (array-ref A asx) (array-ref B bsx)))) + ;;(print 'trim-et (- asx start-a) '+ (- end-a adx)) + (let ((delta (- (- bdx bsx) (- adx asx)))) + (if (negative? delta) + (diff2ez fp CCRR B bsx (+ 1 bdx) A asx (+ 1 adx) + edits edx (- epo) =? (+ delta p-lim)) + (diff2ez fp CCRR A asx (+ 1 adx) B bsx (+ 1 bdx) + edits edx epo =? p-lim)))) + ;;(set! tick (+ 1 tick)) + )) + ;;(set! tick (+ 1 tick)) + )) + +;;; Diff sub-arrays, A not longer than B. Returns edit-length +(define (diff2ez fp CCRR A start-a end-a B start-b end-b edits edx epo =? p-lim) + (define len-a (- end-a start-a)) + (define len-b (- end-b start-b)) + ;;(if (> len-a len-b) (slib:error 'diff2ez len-a '> len-b)) + (cond ((zero? p-lim) ; B inserts only + (if (= len-b len-a) + 0 ; A = B; no edits + (let loop ((adx start-a) + (bdx start-b) + (edx edx)) + (cond ((>= bdx end-b) (- len-b len-a)) + ((>= adx end-a) + (do ((idx bdx (+ 1 idx)) + (edx edx (+ 1 edx))) + ((>= idx end-b) (- len-b len-a)) + (array-set! edits (* epo (+ 1 idx)) edx))) + ((=? (array-ref A adx) (array-ref B bdx)) + ;;(set! tick (+ 1 tick)) + (loop (+ 1 adx) (+ 1 bdx) edx)) + (else (array-set! edits (* epo (+ 1 bdx)) edx) + ;;(set! tick (+ 1 tick)) + (loop adx (+ 1 bdx) (+ 1 edx))))))) + ((<= len-a p-lim) ; delete all A; insert all B + ;;(if (< len-a p-lim) (slib:error 'diff2ez len-a len-b 'p-lim p-lim)) + (do ((idx start-a (+ 1 idx)) + (edx edx (+ 1 edx))) + ((>= idx end-a) + (do ((jdx start-b (+ 1 jdx)) + (edx edx (+ 1 edx))) + ((>= jdx end-b)) + (array-set! edits (* epo (+ 1 jdx)) edx))) + (array-set! edits (* epo (- -1 idx)) edx)) + (+ len-a len-b)) + (else (diff:divide-and-conquer + fp CCRR A start-a end-a B start-b end-b + edits edx epo =? p-lim)))) + +;;;Return new vector of edits in correct sequence +(define (diff:order-edits edits cost sign) + (if (negative? sign) + (do ((idx (+ -1 cost) (+ -1 idx))) + ((negative? idx)) + (array-set! edits (- (array-ref edits idx)) idx))) + (if (zero? cost) + edits + (let ((sedits (sort! edits <)) + (nedits (create-array (As32) cost))) + ;; Find -/+ boundary + (define len-a (max 0 (- (array-ref sedits 0)))) + (define len-b (array-ref sedits (+ -1 cost))) + (do ((idx 0 (+ 1 idx))) + ((or (>= idx cost) (positive? (array-ref sedits idx))) + (let loop ((ddx (+ -1 idx)) + (idx idx) + (ndx 0) + (adx 0) + (bdx 0)) + (define del (if (negative? ddx) 0 (array-ref sedits ddx))) + (define ins (if (>= idx cost) 0 (array-ref sedits idx))) + (cond ((and (>= bdx len-b) (>= adx len-a)) nedits) + ((and (negative? del) (>= adx (- -1 del)) + (positive? ins) (>= bdx (+ -1 ins))) + (array-set! nedits del ndx) + (array-set! nedits ins (+ 1 ndx)) + (loop (+ -1 ddx) (+ 1 idx) (+ 2 ndx) + (+ 1 adx) (+ 1 bdx))) + ((and (negative? del) (>= adx (- -1 del))) + (array-set! nedits del ndx) + (loop (+ -1 ddx) idx (+ 1 ndx) (+ 1 adx) bdx)) + ((and (positive? ins) (>= bdx (+ -1 ins))) + (array-set! nedits ins ndx) + (loop ddx (+ 1 idx) (+ 1 ndx) adx (+ 1 bdx))) + (else + (loop ddx idx ndx (+ 1 adx) (+ 1 bdx)))))))))) + +;;; len-a < len-b +(define (edits2lcs lcs edits cost A len-a len-b) + (let loop ((edx 0) + (sdx 0) + (adx 0)) + (let ((edit (if (< edx cost) + (array-ref edits edx) + 0))) + (cond ((>= adx len-a) lcs) + ((positive? edit) + (loop (+ 1 edx) sdx adx)) + ((zero? edit) + (array-set! lcs (array-ref A adx) sdx) + (loop edx (+ 1 sdx) (+ 1 adx))) + ((>= adx (- -1 edit)) + (loop (+ 1 edx) sdx (+ 1 adx))) (else - (subarray-copy! array1 (- dvl ddx -1) pos lcs (- dpos sublen)) - (loop (+ 1 ddx) (- dvl ddx) (- dpos sublen))))))) + (array-set! lcs (array-ref A adx) sdx) + (loop edx (+ 1 sdx) (+ 1 adx))))))) + +;; A not longer than B (M <= N) +(define (diff2edits A M B N =? p-lim) + (define maxdx (if (negative? p-lim) (+ 2 N) (+ 1 p-lim (- N M)))) + (define mindx (if (negative? p-lim) (- (+ 1 M)) (- (+ 1 p-lim)))) + ;;(if (> M N) (slib:error 'diff2edits M '> N)) + (let ((fp (create-array (As32) (list mindx maxdx))) + (CCRR (create-array (As32) (* 2 (+ N 1))))) + (fp:init! fp -1 mindx maxdx) + (let ((est (fp:compare fp #f A M B N =? p-lim))) + (and est + (let ((edits (create-array (As32) est))) + (check-cost! 'diff2edits + est + (diff2et fp CCRR A 0 M B 0 N edits 0 1 =? + (quotient (- est (- N M)) 2))) + edits))))) +;; A not longer than B (M <= N) +(define (diff2editlen A M B N =? p-lim) + (define maxdx (if (negative? p-lim) (+ 1 N) (+ 1 p-lim (- N M)))) + (define mindx (if (negative? p-lim) (- (+ 1 M)) (- (+ 1 p-lim)))) + (let ((fp (create-array (As32) (list mindx maxdx)))) + (fp:init! fp -1 mindx maxdx) + (fp:compare fp #f A M B N =? p-lim))) + +(define (check-cost! name est cost) + (if (not (eqv? est cost)) + (slib:warn "%s: cost check failed %d != %d\\n" name est cost))) + +;;@args array1 array2 =? p-lim ;;@args array1 array2 =? -;;@args array1 array2 ;;@1 and @2 are one-dimensional arrays. The procedure @3 is used -;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}. +;;to compare sequence tokens for equality. +;; +;;The non-negative integer @4, if provided, is maximum number of +;;deletions of the shorter sequence to allow. @0 will return @code{#f} +;;if more deletions would be necessary. +;; ;;@0 returns a one-dimensional array of length @code{(quotient (- (+ -;;len1 len2) (fp:edit-length @1 @2)) 2)} holding the longest sequence +;;len1 len2) (diff:edit-length @1 @2)) 2)} holding the longest sequence ;;common to both @var{array}s. -(define (diff:longest-common-subsequence array1 array2 . =?) - (define len1 (car (array-dimensions array1))) - (define len2 (car (array-dimensions array2))) - (define (snake k y) - (let snloop ((x (- y k)) - (y y)) - (if (and (< x len1) (< y len2) (=? (array-ref array1 x) - (array-ref array2 y))) - (snloop (+ 1 x) (+ 1 y)) - y))) - (set! =? (if (null? =?) eqv? (car =?))) - (if (> len1 len2) - (diff:longest-common-subsequence array2 array1) - (let ((Delta (- len2 len1)) - (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2))))) - (fp->lcs fp Delta array1 - (quotient (- (+ len1 len2) (fp:compare fp Delta snake len2)) - 2))))) +(define (diff:longest-common-subsequence A B =? . p-lim) + (define len-a (car (array-dimensions a))) + (define len-b (car (array-dimensions b))) + (set! p-lim (if (null? p-lim) -1 (car p-lim))) + (let ((edits (if (< len-b len-a) + (diff2edits B len-b A len-a =? p-lim) + (diff2edits A len-a B len-b =? p-lim)))) + (and edits + (let* ((cost (car (array-dimensions edits))) + (sedit (diff:order-edits edits cost (if (< len-b len-a) -1 1))) + (lcs (create-array A (/ (- (+ len-b len-a) cost) 2)))) + (if (< len-b len-a) + (edits2lcs lcs sedit cost B len-b len-a) + (edits2lcs lcs sedit cost A len-a len-b)))))) +;;@args array1 array2 =? p-lim ;;@args array1 array2 =? -;;@args array1 array2 ;;@1 and @2 are one-dimensional arrays. The procedure @3 is used -;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}. -;;@0 returns a list of length @code{(fp:edit-length @1 @2)} composed of -;;a shortest sequence of edits transformaing @1 to @2. +;;to compare sequence tokens for equality. ;; -;;Each edit is a list of an integer and a symbol: +;;The non-negative integer @4, if provided, is maximum number of +;;deletions of the shorter sequence to allow. @0 will return @code{#f} +;;if more deletions would be necessary. +;; +;;@0 returns a vector of length @code{(diff:edit-length @1 @2)} composed +;;of a shortest sequence of edits transformaing @1 to @2. +;; +;;Each edit is an integer: ;;@table @asis -;;@item (@var{j} insert) -;;Inserts @code{(array-ref @1 @var{j})} into the sequence. -;;@item (@var{k} delete) -;;Deletes @code{(array-ref @2 @var{k})} from the sequence. +;;@item @var{k} > 0 +;;Inserts @code{(array-ref @1 (+ -1 @var{j}))} into the sequence. +;;@item @var{k} < 0 +;;Deletes @code{(array-ref @2 (- -1 @var{k}))} from the sequence. ;;@end table -(define (diff:edits array1 array2 . =?) - (define len1 (car (array-dimensions array1))) - (define len2 (car (array-dimensions array2))) - (define (snake k y) - (let snloop ((x (- y k)) - (y y)) - (if (and (< x len1) (< y len2) (=? (array-ref array1 x) - (array-ref array2 y))) - (snloop (+ 1 x) (+ 1 y)) y))) - (set! =? (if (null? =?) eqv? (car =?))) - (if (> len1 len2) - (diff:reverse-edits (diff:edits array2 array1)) - (let ((Delta (- len2 len1)) - (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2))))) - (fp:compare fp Delta snake len2) - ;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" idx)) - ;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" (array-ref fp idx))) - (fp->edits fp Delta)))) - -(define (diff:reverse-edits edits) - (map (lambda (edit) - (list (car edit) - (case (cadr edit) - ((delete) 'insert) - ((insert) 'delete)))) - edits)) +(define (diff:edits A B =? . p-lim) + (define len-a (car (array-dimensions a))) + (define len-b (car (array-dimensions b))) + (set! p-lim (if (null? p-lim) -1 (car p-lim))) + (let ((edits (if (< len-b len-a) + (diff2edits B len-b A len-a =? p-lim) + (diff2edits A len-a B len-b =? p-lim)))) + (and edits (diff:order-edits edits (car (array-dimensions edits)) + (if (< len-b len-a) -1 1))))) +;;@args array1 array2 =? p-lim ;;@args array1 array2 =? -;;@args array1 array2 ;;@1 and @2 are one-dimensional arrays. The procedure @3 is used -;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}. +;;to compare sequence tokens for equality. +;; +;;The non-negative integer @4, if provided, is maximum number of +;;deletions of the shorter sequence to allow. @0 will return @code{#f} +;;if more deletions would be necessary. +;; ;;@0 returns the length of the shortest sequence of edits transformaing ;;@1 to @2. -(define (diff:edit-length array1 array2 . =?) - (define len1 (car (array-dimensions array1))) - (define len2 (car (array-dimensions array2))) - (define (snake k y) - (let snloop ((x (- y k)) - (y y)) - (if (and (< x len1) (< y len2) (=? (array-ref array1 x) - (array-ref array2 y))) - (snloop (+ 1 x) (+ 1 y)) - y))) - (set! =? (if (null? =?) eqv? (car =?))) - (if (> len1 len2) - (diff:edit-length array2 array1) - (let ((Delta (- len2 len1)) - (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2))))) - (fp:compare fp Delta snake len2)))) +(define (diff:edit-length A B =? . p-lim) + (define M (car (array-dimensions a))) + (define N (car (array-dimensions b))) + (set! p-lim (if (null? p-lim) -1 (car p-lim))) + (if (< N M) + (diff2editlen B N A M =? p-lim) + (diff2editlen A M B N =? p-lim))) ;;@example -;;(diff:longest-common-subsequence '#(f g h i e j c k l m) -;; '#(f g e h i j k p q r l m)) -;; @result{} #(f g h i j k l m) +;;(diff:longest-common-subsequence "fghiejcklm" "fgehijkpqrlm" eqv?) +;;@result{} "fghijklm" ;; -;;(diff:edit-length '#(f g h i e j c k l m) -;; '#(f g e h i j k p q r l m)) +;;(diff:edit-length "fghiejcklm" "fgehijkpqrlm" eqv?) ;;@result{} 6 ;; -;;(pretty-print (diff:edits '#(f g h i e j c k l m) -;; '#(f g e h i j k p q r l m))) -;;@print{} -;;((3 insert) ; e -;; (4 delete) ; c -;; (6 delete) ; h -;; (7 insert) ; p -;; (8 insert) ; q -;; (9 insert)) ; r +;;(diff:edits "fghiejcklm" "fgehijkpqrlm" eqv?) +;;@result{} #As32(3 -5 -7 8 9 10) +;; ; e c h p q r ;;@end example -;; 12 - 10 = 2 -;; -11-10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 -;; -1 -1 -1 -1 -1 -1 -1 -1 -1 3 7 8 9 12 9 8 -1 -1 -1 -1 -1 -1 -1 -1 -;; edit-distance = 6 +;;(trace-all "/home/jaffer/slib/differ.scm")(set! *qp-width* 333)(untrace fp:run fp:subarray) |