summaryrefslogtreecommitdiffstats
path: root/differ.scm
blob: 53e0eaf5e53c2848f517c44b7c893cb5c50ae8b5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
;;;; "differ.scm" O(NP) Sequence Comparison Algorithm.
;;; Copyright (C) 2001 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1.  Any copy made of this software must include this copyright notice
;in full.
;
;2.  I have made no warrantee or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3.  In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.

;;@noindent
;;This package implements the algorithm:
;;
;;@ifinfo
;;@example
;;S. Wu, E. Myers, U. Manber, and W. Miller,
;;   "An O(NP) Sequence Comparison Algorithm,"
;;   Information Processing Letters 35, 6 (1990), 317-323.
;;   @url{http://www.cs.arizona.edu/people/gene/vita.html}
;;@end example
;;@end ifinfo
;;@ifset html
;;S. Wu, <A HREF="http://www.cs.arizona.edu/people/gene/vita.html">
;;E. Myers,</A> U. Manber, and W. Miller,
;;<A HREF="http://www.cs.arizona.edu/people/gene/PAPERS/np_diff.ps">
;;"An O(NP) Sequence Comparison Algorithm,"</A>
;;Information Processing Letters 35, 6 (1990), 317-323.
;;@end ifset
;;
;;@noindent
;;If the items being sequenced are text lines, then the computed
;;edit-list is equivalent to the output of the @dfn{diff} utility
;;program.  If the items being sequenced are words, then it is like the
;;lesser known @dfn{spiff} program.
;;
;;@noindent
;;The values returned by @code{diff:edit-length} can be used to gauge
;;the degree of match between two sequences.
;;
;;@noindent
;;I believe that this algorithm is currently the fastest for these
;;tasks, but genome sequencing applications fuel extensive research in
;;this area.

(require 'array)

(define (fp:compare fp Delta snake len2)
  (let loop ((p 0))
    (do ((k (- p) (+ 1 k)))
	((> k (+ -1 Delta)))
      (array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k)))
				   (array-ref fp (+ 1 k))))
		  k))
    (do ((k (+ Delta p) (+ -1 k)))
	((< k (+ 1 Delta)))
      (array-set! fp (snake k (max (+ 1 (array-ref fp (+ -1 k)))
				   (array-ref fp (+ 1 k))))
		  k))
    (array-set! fp (snake Delta (max (+ 1 (array-ref fp (+ -1 Delta)))
				     (array-ref fp (+ 1 Delta))))
		Delta)
    (if (= (array-ref fp Delta) len2)
	(+ Delta (* 2 p))
	(loop (+ 1 p)))))

(define (fp->edits fp Delta)
  (let loop ((idx (+ -1 Delta))
	     (ddx (+ 1 Delta))
	     (edits '()))
    (define ivl (array-ref fp idx))
    (define dvl (array-ref fp ddx))
    (if (not (= -1 dvl)) (set! dvl (- dvl ddx)))
    ;;(print idx '-> ivl ddx '-> dvl)
    (cond ((= ivl -1) edits)
	  ((= dvl -1) (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits)))
	  ((> dvl ivl) (loop idx (+ 1 ddx) (cons (list dvl 'delete) edits)))
	  (else       (loop (+ -1 idx) ddx (cons (list ivl 'insert) edits))))))

(define (fp->lcs fp Delta array1 len)
  (define len1 (car (array-dimensions array1)))
  (define lcs (make-array #f len))
  (define (subarray-copy! array1 start1 end1 array2 start2)
    (do ((i start1 (+ i 1))
	 (j start2 (+ j 1))
	 (l (- end1 start1) (- l 1)))
	((<= l 0))
      (array-set! array2 (array-ref array1 i) j)))
  (let loop ((ddx (+ 1 Delta))
	     (pos len1)
	     (dpos len))
    (let* ((dvl (array-ref fp ddx))
	   (sublen (- pos (- dvl ddx -1))))
      (cond ((= dvl -1)
	     (subarray-copy! array1 0 pos lcs 0)
	     lcs)
	    (else
	     (subarray-copy! array1 (- dvl ddx -1) pos lcs (- dpos sublen))
	     (loop (+ 1 ddx) (- dvl ddx) (- dpos sublen)))))))

;;@args array1 array2 =?
;;@args array1 array2
;;@1 and @2 are one-dimensional arrays.  The procedure @3 is used
;;to compare sequence tokens for equality.  @3 defaults to @code{eqv?}.
;;@0 returns a one-dimensional array of length @code{(quotient (- (+
;;len1 len2) (fp:edit-length @1 @2)) 2)} holding the longest sequence
;;common to both @var{array}s.
(define (diff:longest-common-subsequence array1 array2 . =?)
  (define len1 (car (array-dimensions array1)))
  (define len2 (car (array-dimensions array2)))
  (define (snake k y)
    (let snloop ((x (- y k))
		 (y y))
      (if (and (< x len1) (< y len2) (=? (array-ref array1 x)
					 (array-ref array2 y)))
	  (snloop (+ 1 x) (+ 1 y))
	  y)))
  (set! =? (if (null? =?) eqv? (car =?)))
  (if (> len1 len2)
      (diff:longest-common-subsequence array2 array1)
      (let ((Delta (- len2 len1))
	    (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
	(fp->lcs fp Delta array1
		 (quotient (- (+ len1 len2) (fp:compare fp Delta snake len2))
			   2)))))

;;@args array1 array2 =?
;;@args array1 array2
;;@1 and @2 are one-dimensional arrays.  The procedure @3 is used
;;to compare sequence tokens for equality.  @3 defaults to @code{eqv?}.
;;@0 returns a list of length @code{(fp:edit-length @1 @2)} composed of
;;a shortest sequence of edits transformaing @1 to @2.
;;
;;Each edit is a list of an integer and a symbol:
;;@table @asis
;;@item (@var{j} insert)
;;Inserts @code{(array-ref @1 @var{j})} into the sequence.
;;@item (@var{k} delete)
;;Deletes @code{(array-ref @2 @var{k})} from the sequence.
;;@end table
(define (diff:edits array1 array2 . =?)
  (define len1 (car (array-dimensions array1)))
  (define len2 (car (array-dimensions array2)))
  (define (snake k y)
    (let snloop ((x (- y k))
		 (y y))
      (if (and (< x len1) (< y len2) (=? (array-ref array1 x)
					 (array-ref array2 y)))
	  (snloop (+ 1 x) (+ 1 y)) y)))
  (set! =? (if (null? =?) eqv? (car =?)))
  (if (> len1 len2)
      (diff:reverse-edits (diff:edits array2 array1))
      (let ((Delta (- len2 len1))
	    (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
	(fp:compare fp Delta snake len2)
	;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" idx))
	;;(do ((idx (- -1 len1) (+ 1 idx))) ((>= idx (+ 1 len2)) (newline)) (printf "%3d" (array-ref fp idx)))
	(fp->edits fp Delta))))

(define (diff:reverse-edits edits)
  (map (lambda (edit)
	 (list (car edit)
	       (case (cadr edit)
		 ((delete) 'insert)
		 ((insert) 'delete))))
       edits))

;;@args array1 array2 =?
;;@args array1 array2
;;@1 and @2 are one-dimensional arrays.  The procedure @3 is used
;;to compare sequence tokens for equality.  @3 defaults to @code{eqv?}.
;;@0 returns the length of the shortest sequence of edits transformaing
;;@1 to @2.
(define (diff:edit-length array1 array2 . =?)
  (define len1 (car (array-dimensions array1)))
  (define len2 (car (array-dimensions array2)))
  (define (snake k y)
    (let snloop ((x (- y k))
		 (y y))
      (if (and (< x len1) (< y len2) (=? (array-ref array1 x)
					 (array-ref array2 y)))
	  (snloop (+ 1 x) (+ 1 y))
	  y)))
  (set! =? (if (null? =?) eqv? (car =?)))
  (if (> len1 len2)
      (diff:edit-length array2 array1)
      (let ((Delta (- len2 len1))
	    (fp (make-array -1 (list (- (+ 1 len1)) (+ 1 len2)))))
	(fp:compare fp Delta snake len2))))

;;@example
;;(diff:longest-common-subsequence '#(f g h i e j c k l m)
;;                                 '#(f g e h i j k p q r l m))
;;                                 @result{} #(f g h i j k l m)
;;
;;(diff:edit-length '#(f g h i e j c k l m)
;;                  '#(f g e h i j k p q r l m))
;;@result{} 6
;;
;;(pretty-print (diff:edits '#(f g h i e j c k l m)
;;                          '#(f g e h i j k p q r l m)))
;;@print{}
;;((3 insert)                           ; e
;; (4 delete)                           ; c
;; (6 delete)                           ; h
;; (7 insert)                           ; p
;; (8 insert)                           ; q
;; (9 insert))                          ; r
;;@end example

;; 12 - 10 = 2
;; -11-10 -9 -8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8  9 10 11 12
;;  -1 -1 -1 -1 -1 -1 -1 -1 -1  3  7  8  9 12  9  8 -1 -1 -1 -1 -1 -1 -1 -1
;; edit-distance = 6