;;;; "differ.scm" O(NP) Sequence Comparison Algorithm.
;;; Copyright (C) 2001 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1. Any copy made of this software must include this copyright notice
;in full.
;
;2. I have made no warrantee or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3. In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.
;;@noindent
;;This package implements the algorithm:
;;
;;@ifinfo
;;@example
;;S. Wu, E. Myers, U. Manber, and W. Miller,
;; "An O(NP) Sequence Comparison Algorithm,"
;; Information Processing Letters 35, 6 (1990), 317-323.
;; @url{http://www.cs.arizona.edu/people/gene/vita.html}
;;@end example
;;@end ifinfo
;;@ifset html
;;S. Wu,
;;E. Myers, U. Manber, and W. Miller,
;;
;;"An O(NP) Sequence Comparison Algorithm,"
;;Information Processing Letters 35, 6 (1990), 317-323.
;;@end ifset
;;
;;@noindent
;;If the items being sequenced are text lines, then the computed
;;edit-list is equivalent to the output of the @dfn{diff} utility
;;program. If the items being sequenced are words, then it is like the
;;lesser known @dfn{spiff} program.
;;
;;@noindent
;;The values returned by @code{diff:edit-length} can be used to gauge
;;the degree of match between two sequences.
;;
;;@noindent
;;I believe that this algorithm is currently the fastest for these
;;tasks, but genome sequencing applications fuel extensive research in
;;this area.
(require 'array)
(define (fp:compare fp Delta snake len2)
(let loop ((p 0))
(do ((k (- p) (+ 1 k)))
((> k (+ -1 Delta)))
(array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k)))
(array-ref fp (+ 1 k))))
k))
(do ((k (+ Delta p) (+ -1 k)))
((< k (+ 1 Delta)))
(array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k)))
(array-ref fp (+ 1 k))))
k))
(array-set! fp (snake Delta (max (+ 1 (array-ref fp (+ -1 Delta)))
(array-ref fp (+ 1 Delta))))
Delta)
(if (= (array-ref fp Delta) len2)
(+ Delta (* 2 p))
(loop (+ 1 p)))))
(define (fp->edits fp Delta)
(let loop ((idx (+ -1 Delta))
(ddx (+ 1 Delta))
(edits '()))
(define ivl (array-ref fp idx))
(define dvl (array-ref fp ddx))
(if (not (= -1 dvl)) (set! dvl (- dvl ddx)))
;;(print idx '-> ivl ddx '-> dvl)
(cond ((= ivl -1) edits)
((= dvl -1) (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits)))
((> dvl ivl) (loop idx (+ 1 ddx) (cons (list dvl 'delete) edits)))
(else (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits))))))
(define (fp->lcs fp Delta array1 len)
(define len1 (car (array-dimensions array1)))
(define lcs (make-array #f len))
(define (subarray-copy! array1 start1 end1 array2 start2)
(do ((i start1 (+ i 1))
(j start2 (+ j 1))
(l (- end1 start1) (- l 1)))
((<= l 0))
(array-set! array2 (array-ref array1 i) j)))
(let loop ((ddx (+ 1 Delta))
(pos len1)
(dpos len))
(let* ((dvl (array-ref fp ddx))
(sublen (- pos (- dvl ddx -1))))
(cond ((= dvl -1)
(subarray-copy! array1 0 pos lcs 0)
lcs)
(else
(subarray-copy! array1 (- dvl ddx -1) pos lcs (- dpos sublen))
(loop (+ 1 ddx) (- dvl ddx) (- dpos sublen)))))))
;;@args array1 array2 =?
;;@args array1 array2
;;@1 and @2 are one-dimensional arrays. The procedure @3 is used
;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}.
;;@0 returns a one-dimensional array of length @code{(quotient (- (+
;;len1 len2) (fp:edit-length @1 @2)) 2)} holding the longest sequence
;;common to both @var{array}s.
(define (diff:longest-common-subsequence array1 array2 . =?)
(define len1 (car (array-dimensions array1)))
(define len2 (car (array-dimensions array2)))
(define (snake k y)
(let snloop ((x (- y k))
(y y))
(if (and (< x len1) (< y len2) (=? (array-ref array1 x)
(array-ref array2 y)))
(snloop (+ 1 x) (+ 1 y))
y)))
(set! =? (if (null? =?) eqv? (car =?)))
(if (> len1 len2)
(diff:longest-common-subsequence array2 array1)
(let ((Delta (- len2 len1))
(fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
(fp->lcs fp Delta array1
(quotient (- (+ len1 len2) (fp:compare fp Delta snake len2))
2)))))
;;@args array1 array2 =?
;;@args array1 array2
;;@1 and @2 are one-dimensional arrays. The procedure @3 is used
;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}.
;;@0 returns a list of length @code{(fp:edit-length @1 @2)} composed of
;;a shortest sequence of edits transformaing @1 to @2.
;;
;;Each edit is a list of an integer and a symbol:
;;@table @asis
;;@item (@var{j} insert)
;;Inserts @code{(array-ref @1 @var{j})} into the sequence.
;;@item (@var{k} delete)
;;Deletes @code{(array-ref @2 @var{k})} from the sequence.
;;@end table
(define (diff:edits array1 array2 . =?)
(define len1 (car (array-dimensions array1)))
(define len2 (car (array-dimensions array2)))
(define (snake k y)
(let snloop ((x (- y k))
(y y))
(if (and (< x len1) (< y len2) (=? (array-ref array1 x)
(array-ref array2 y)))
(snloop (+ 1 x) (+ 1 y)) y)))
(set! =? (if (null? =?) eqv? (car =?)))
(if (> len1 len2)
(diff:reverse-edits (diff:edits array2 array1))
(let ((Delta (- len2 len1))
(fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
(fp:compare fp Delta snake len2)
;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" idx))
;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" (array-ref fp idx)))
(fp->edits fp Delta))))
(define (diff:reverse-edits edits)
(map (lambda (edit)
(list (car edit)
(case (cadr edit)
((delete) 'insert)
((insert) 'delete))))
edits))
;;@args array1 array2 =?
;;@args array1 array2
;;@1 and @2 are one-dimensional arrays. The procedure @3 is used
;;to compare sequence tokens for equality. @3 defaults to @code{eqv?}.
;;@0 returns the length of the shortest sequence of edits transformaing
;;@1 to @2.
(define (diff:edit-length array1 array2 . =?)
(define len1 (car (array-dimensions array1)))
(define len2 (car (array-dimensions array2)))
(define (snake k y)
(let snloop ((x (- y k))
(y y))
(if (and (< x len1) (< y len2) (=? (array-ref array1 x)
(array-ref array2 y)))
(snloop (+ 1 x) (+ 1 y))
y)))
(set! =? (if (null? =?) eqv? (car =?)))
(if (> len1 len2)
(diff:edit-length array2 array1)
(let ((Delta (- len2 len1))
(fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
(fp:compare fp Delta snake len2))))
;;@example
;;(diff:longest-common-subsequence '#(f g h i e j c k l m)
;; '#(f g e h i j k p q r l m))
;; @result{} #(f g h i j k l m)
;;
;;(diff:edit-length '#(f g h i e j c k l m)
;; '#(f g e h i j k p q r l m))
;;@result{} 6
;;
;;(pretty-print (diff:edits '#(f g h i e j c k l m)
;; '#(f g e h i j k p q r l m)))
;;@print{}
;;((3 insert) ; e
;; (4 delete) ; c
;; (6 delete) ; h
;; (7 insert) ; p
;; (8 insert) ; q
;; (9 insert)) ; r
;;@end example
;; 12 - 10 = 2
;; -11-10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12
;; -1 -1 -1 -1 -1 -1 -1 -1 -1 3 7 8 9 12 9 8 -1 -1 -1 -1 -1 -1 -1 -1
;; edit-distance = 6