summaryrefslogtreecommitdiffstats
path: root/ps01_grep/ps01_bnewbold.txt
blob: 52979e7d126684b5cb54593922a7028f0a31ed49 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
;;; 6.945 Problem Set #1 
;;; 02/10/2009
;;; Bryan Newbold <bnewbold@mit.edu>


Problem 1.1: Warmup
---------------

(define (r:+ pattern)
  (r:repeat 1 #f pattern))

(define (r:* pattern)
  (r:repeat 0 #f pattern))

(pp (r:grep (r:seq (r:quote "dog")
                   (r:+ (r:quote "cat")))
            "tests.txt"))

;("[09]. catdogcat" "[11]. dogdogcatdogdog" "[13]. acatdogdogcats")

(pp (r:grep (r:seq (r:quote "cat")
                   (r:+ (r:char-from (string->char-set "ogd")))
                   (r:quote "cat"))
            "tests.txt"))

;("[09]. catdogcat" "[13]. acatdogdogcats")

(pp (r:grep (r:seq " "
                   (r:+ (r:quote "cat"))
                   (r:eol))
            "tests.txt"))
; #f

(pp (r:grep (r:* (r:quote "something that isn't there"))
            "tests.txt"))
; matches all

(pp (r:grep (r:repeat 2 3 (r:* (r:char-from (string->char-set "tca"))))
            "tests.txt"))
; matches all: nothing repeated several times

(pp (r:grep (r:seq (r:quote "dog")
                   (r:+ (r:quote "cat"))
                   (r:quote "dog"))
            "tests.txt"))

Problem 1.2: The Proposals
---------------

a) This would be the same as asking Louis Reasoner to explain why his
suggestion is wrong; the problem lies within r:repeat, so re-calling r:repeat
does not sidestep the problem.

b) Bonnie's method is computationally more efficient because checking up to the
minimum number of repetitions is only executed once; after that successive
checks are made for additional repetitions. From a straight-forward
perspective, Bonnie's approach also uses significantly fewer characters of
final regex code: this could potentially (but unlikely) become important if
large expressions were searched for with a large range of possible repetitions.
Most importantly, Bonnie's method is more readable and brings the
implementation more inline with the underlying philosophy of regular
expressions (in my opinion).

c) Ben's solution is the one that an experienced regex-er would think of first
and is the closest conceptual analog to r:repeat in the actual specification.
The code is compact and should be the most readable and explicit, though we'll
see what kind of syntaxual issues crop up...

d)

(define (r:repeat min max expr)
  (if (not (exact-nonnegative-integer? min))
      (error "Min must be non-negative integer:" min))
  (if max
      (begin
        (if (not (exact-nonnegative-integer? max))
            (error "Max must be non-negative integer (or null):" max))
        (if (not (<= min max))
            (error "Min not less than max:" min max))))
  (cond 
   ((not max) (r:seq expr (string-append "\\{" (number->string min) ",\\}")))
   ((= max min) (r:seq expr (string-append "\\{" (number->string min) "\\}")))
   (else (r:seq expr (string-append "\\{" 
                                    (number->string min) 
                                    "," 
                                    (number->string max) 
                                    "\\}")))))
#| tests:
(pp (r:grep (r:repeat 2 3 (r:quote "cat"))
            "tests.txt"))
(pp (r:grep (r:repeat 2 #f (r:quote "cat"))
            "tests.txt"))
(pp (r:grep (r:repeat 1 1 (r:quote "cat"))
            "tests.txt"))
(pp (r:grep (r:repeat 4 3 (r:quote "cat"))
            "tests.txt"))
(r:repeat 2 3 (r:quote "cat"))
|#


Problem 1.3: Optimization
--------------
I simply removed the default parentheses around all expressions and implemented
a new r:subexp wrapper which puts parentheses around expressions. Then I had
r:alt and r:repeat wrap their expressions in r:subexp. This results in /some/
redundant nesting but is a huge reduction.

(define (r:subexp . exprs)
  (string-append "\\(" (apply string-append exprs) "\\)"))

(define (r:seq . exprs)
  (apply string-append exprs))

(r:seq " " (r:subexp "sdf") "dddd")

(define (r:repeat min max expr)
  (if (not (exact-nonnegative-integer? min))
      (error "Min must be non-negative integer:" min))
  (if max
      (begin
        (if (not (exact-nonnegative-integer? max))
            (error "Max must be non-negative integer (or null):" max))
        (if (not (<= min max))
            (error "Min not less than max:" min max))))
  (cond 
   ((not max) (r:seq (r:subexp expr) 
                     (string-append "\\{" (number->string min) ",\\}")))
   ((= max min) (r:seq (r:subexp expr) 
                       (string-append "\\{" (number->string min) "\\}")))
   (else (r:seq (r:subexp expr) 
                (string-append "\\{" 
                               (number->string min) 
                               "," 
                               (number->string max) 
                               "\\}")))))


Problem 1.4: Back-references
--------------
I only implemented the most crude form of back-expressions. A better
implementation would include a method of naming subexpressions and refering 
back to them by name: the implementation would do the appropriate counting to
find the correct backreference number for a given named subexpression.

(define (r:backref n)
  (string-append "\\" (number->string n)))

Problem 1.5: Ugh!
-------------
a) Differences between BREs and EREs (I referenced the table at 
http://www.greenend.org.uk/rjk/2002/06/regexp.html):

* Some characters do not need escaping backslashes: mutiple matching brackets
    ('\{' becomes '{'), subexpressions ('\(' becomes '(')
* The alternation operator '|' is added
* The '+' operator is added meaning one or more matches
* The '?' operator is added meaning one or zero matches

b) The basic method I chose is to have each r: procedure return a list 
containing two lambda procedures: the first will return the regular grep
version of the expression, while the second will return the extended grep
version. The r:grep procedure selects the first lambda and r:egrep selects the
second, and this decision cascades down recursively to the atomic expressions.

My implementation is crude; the r-grep: and r-egrep: procedures should be 
internal to the primary r: procedures (using let), and the whole process could
be implemented as a funtion.

c) (not, uh, fully tested...)

; the following are only the expressions which had to be rewritten...

(define (r-grep:seq . exprs)
  (let ((choose-grep (lambda (x)
                      (cond
                       ((string? x) x)
                       ((list? x) (car x))))))
  (apply string-append (map choose-grep exprs))))

(r-grep:seq "a" "b" '("grep" "egrep"))
; abgrep

(define (r-egrep:seq . exprs)
  (let ((choose-egrep (lambda (x)
                      (cond
                       ((string? x) x)
                       ((list? x) (cadr x))))))
  (apply string-append (map choose-egrep exprs))))

#| test:
(r-grep:seq "a" "b" '("grep" "egrep"))
; "abgrep"
(r-egrep:seq "a" "b" '("grep" "egrep"))
; "abegrep"
|#

(define (r:seq . exprs)
  (cons (lambda () (apply r-grep:seq exprs)) 
        (cons (lambda () (apply r-egrep:seq exprs)) '())))

#| test:
(r:seq "a" "b" '("grep" "egrep"))
;Value: (#[compound-procedure 14] #[compound-procedure 15])
((car (r:seq "a" "b" '("grep" "egrep"))))
; "abgrep"
((cadr (r:seq "a" "b" '("grep" "egrep"))))
; "abegrep"
|#

(define (r-egrep:alt . exprs)
  (let ((choose-egrep (lambda (x)
                      (cond
                       ((string? x) x)
                       ((list? x) ((cadr x)))))))
  (if (pair? exprs)
      (apply r-egrep:seq
             (cons (choose-egrep (car exprs))
                   (append-map (lambda (expr)
                                 (list "|" (choose-egrep expr)))
                               (cdr exprs))))
      (r-egrep:seq))))

(define (r-grep:alt . exprs)
  (let ((choose-grep (lambda (x)
                      (cond
                       ((string? x) x)
                       ((list? x) ((car x)))))))
    (if (pair? exprs)
        (apply r-grep:seq
               (cons (choose-grep (car exprs))
                     (append-map (lambda (expr)
                                   (list "\\|" (choose-grep expr)))
                                 (cdr exprs))))
      (r-grep:seq))))

(define (r:alt . exprs)
  (cons (lambda () (apply r-grep:alt exprs)) 
        (cons (lambda () (apply r-egrep:alt exprs)) '())))

#| test:
(r-egrep:alt "a" (r:dot) "c")
;Value: "a|.|c"
(r:alt "asdf" (r:dot))
;Value: (#[compound-procedure 16] #[compound-procedure 17])
((car (r:alt "asdf" (r:dot))))
;Value: "asdf\\|."
((cadr (r:alt "asdf" (r:dot))))
;Value: "asdf|."
|#

(define (r-grep:repeat min max expr)
  (let ((choose-grep (lambda (x)
                      (cond
                       ((string? x) x)
                       ((list? x) ((car x)))))))
    (if (not (exact-nonnegative-integer? min))
        (error "Min must be non-negative integer:" min))
    (if max
        (begin
          (if (not (exact-nonnegative-integer? max))
              (error "Max must be non-negative integer (or null):" max))
          (if (not (<= min max))
              (error "Min not less than max:" min max))))
    (cond 
     ((not max) (r-grep:seq (r-grep:subexp (choose-grep expr)) 
                       (string-append "\\{" (number->string min) ",\\}")))
   ((= max min) (r-grep:seq (r-grep:subexp (choose-grep expr)) 
                       (string-append "\\{" (number->string min) "\\}")))
   (else (r-grep:seq (r-grep:subexp (choose-grep expr)) 
                (string-append "\\{" 
                               (number->string min) 
                               "," 
                               (number->string max) 
                               "\\}"))))))

(define (r-egrep:repeat min max expr)
  (let ((choose-egrep (lambda (x)
                      (cond
                       ((string? x) x)
                       ((list? x) ((cadr x)))))))
    (if (not (exact-nonnegative-integer? min))
        (error "Min must be non-negative integer:" min))
    (if max
        (begin
          (if (not (exact-nonnegative-integer? max))
              (error "Max must be non-negative integer (or null):" max))
          (if (not (<= min max))
              (error "Min not less than max:" min max))))
    (cond 
     ((not max) (r-egrep:seq (r-egrep:subexp (choose-egrep expr)) 
                       (string-append "{" (number->string min) ",}")))
   ((= max min) (r-egrep:seq (r-egrep:subexp (choose-egrep expr)) 
                       (string-append "{" (number->string min) "}")))
   (else (r-egrep:seq (r-egrep:subexp (choose-egrep expr)) 
                (string-append "{" 
                               (number->string min) 
                               "," 
                               (number->string max) 
                               "}"))))))

(define (r:repeat min max expr)
  (cons (lambda () (r-grep:repeat min max expr)) 
        (cons (lambda () (r-egrep:repeat min max expr)) '())))

(define (r-grep:subexp . exprs)
  (string-append "\\(" (apply string-append 
                            (map (lambda (x)
                                   (cond
                                    ((string? x) x)
                                    ((pair? x) ((car x)))))
                                 exprs)) "\\)"))

(define (r-egrep:subexp . exprs)
  (string-append "(" (apply string-append 
                            (map (lambda (x)
                                   (cond
                                    ((string? x) x)
                                    ((pair? x) ((cadr x)))))
                                 exprs)) ")"))

(define (r:subexp . exprs)
  (cons (lambda () (apply r-grep:subexp exprs)) 
        (cons (lambda () (apply r-egrep:subexp exprs)) '())))


#| test:
(r:subexp "a" "b" (r:dot))
;Value: (#[compound-procedure 25] #[compound-procedure 26])
((cadr (r:subexp "a" "b" (r:dot))))
;Value: "(ab.)"
((car (r:subexp "a" "b" (r:dot))))
;Value: "\\(ab.\\)"
|#

(define (r:grep expr filename)
  (r:grep-like "grep" '() ((car expr)) filename))

(define (r:egrep expr filename)
  (if (eq? microcode-id/operating-system 'nt)
      (r:grep-like "grep" '("-E") ((cadr expr)) filename)
      (r:grep-like "egrep" '() ((cadr expr)) filename)))

(pp (r:grep (r:seq (r:repeat 2 3 (r:quote "cat")) (r:+ (r:quote "dog")))
            "tests.txt"))