;;; 6.945 Problem Set #1 ;;; 02/10/2009 ;;; Bryan Newbold Problem 1.1: Warmup --------------- (define (r:+ pattern) (r:repeat 1 #f pattern)) (define (r:* pattern) (r:repeat 0 #f pattern)) (pp (r:grep (r:seq (r:quote "dog") (r:+ (r:quote "cat"))) "tests.txt")) ;("[09]. catdogcat" "[11]. dogdogcatdogdog" "[13]. acatdogdogcats") (pp (r:grep (r:seq (r:quote "cat") (r:+ (r:char-from (string->char-set "ogd"))) (r:quote "cat")) "tests.txt")) ;("[09]. catdogcat" "[13]. acatdogdogcats") (pp (r:grep (r:seq " " (r:+ (r:quote "cat")) (r:eol)) "tests.txt")) ; #f (pp (r:grep (r:* (r:quote "something that isn't there")) "tests.txt")) ; matches all (pp (r:grep (r:repeat 2 3 (r:* (r:char-from (string->char-set "tca")))) "tests.txt")) ; matches all: nothing repeated several times (pp (r:grep (r:seq (r:quote "dog") (r:+ (r:quote "cat")) (r:quote "dog")) "tests.txt")) Problem 1.2: The Proposals --------------- a) This would be the same as asking Louis Reasoner to explain why his suggestion is wrong; the problem lies within r:repeat, so re-calling r:repeat does not sidestep the problem. b) Bonnie's method is computationally more efficient because checking up to the minimum number of repetitions is only executed once; after that successive checks are made for additional repetitions. From a straight-forward perspective, Bonnie's approach also uses significantly fewer characters of final regex code: this could potentially (but unlikely) become important if large expressions were searched for with a large range of possible repetitions. Most importantly, Bonnie's method is more readable and brings the implementation more inline with the underlying philosophy of regular expressions (in my opinion). c) Ben's solution is the one that an experienced regex-er would think of first and is the closest conceptual analog to r:repeat in the actual specification. The code is compact and should be the most readable and explicit, though we'll see what kind of syntaxual issues crop up... d) (define (r:repeat min max expr) (if (not (exact-nonnegative-integer? min)) (error "Min must be non-negative integer:" min)) (if max (begin (if (not (exact-nonnegative-integer? max)) (error "Max must be non-negative integer (or null):" max)) (if (not (<= min max)) (error "Min not less than max:" min max)))) (cond ((not max) (r:seq expr (string-append "\\{" (number->string min) ",\\}"))) ((= max min) (r:seq expr (string-append "\\{" (number->string min) "\\}"))) (else (r:seq expr (string-append "\\{" (number->string min) "," (number->string max) "\\}"))))) #| tests: (pp (r:grep (r:repeat 2 3 (r:quote "cat")) "tests.txt")) (pp (r:grep (r:repeat 2 #f (r:quote "cat")) "tests.txt")) (pp (r:grep (r:repeat 1 1 (r:quote "cat")) "tests.txt")) (pp (r:grep (r:repeat 4 3 (r:quote "cat")) "tests.txt")) (r:repeat 2 3 (r:quote "cat")) |# Problem 1.3: Optimization -------------- I simply removed the default parentheses around all expressions and implemented a new r:subexp wrapper which puts parentheses around expressions. Then I had r:alt and r:repeat wrap their expressions in r:subexp. This results in /some/ redundant nesting but is a huge reduction. (define (r:subexp . exprs) (string-append "\\(" (apply string-append exprs) "\\)")) (define (r:seq . exprs) (apply string-append exprs)) (r:seq " " (r:subexp "sdf") "dddd") (define (r:repeat min max expr) (if (not (exact-nonnegative-integer? min)) (error "Min must be non-negative integer:" min)) (if max (begin (if (not (exact-nonnegative-integer? max)) (error "Max must be non-negative integer (or null):" max)) (if (not (<= min max)) (error "Min not less than max:" min max)))) (cond ((not max) (r:seq (r:subexp expr) (string-append "\\{" (number->string min) ",\\}"))) ((= max min) (r:seq (r:subexp expr) (string-append "\\{" (number->string min) "\\}"))) (else (r:seq (r:subexp expr) (string-append "\\{" (number->string min) "," (number->string max) "\\}"))))) Problem 1.4: Back-references -------------- I only implemented the most crude form of back-expressions. A better implementation would include a method of naming subexpressions and refering back to them by name: the implementation would do the appropriate counting to find the correct backreference number for a given named subexpression. (define (r:backref n) (string-append "\\" (number->string n))) Problem 1.5: Ugh! ------------- a) Differences between BREs and EREs (I referenced the table at http://www.greenend.org.uk/rjk/2002/06/regexp.html): * Some characters do not need escaping backslashes: mutiple matching brackets ('\{' becomes '{'), subexpressions ('\(' becomes '(') * The alternation operator '|' is added * The '+' operator is added meaning one or more matches * The '?' operator is added meaning one or zero matches b) The basic method I chose is to have each r: procedure return a list containing two lambda procedures: the first will return the regular grep version of the expression, while the second will return the extended grep version. The r:grep procedure selects the first lambda and r:egrep selects the second, and this decision cascades down recursively to the atomic expressions. My implementation is crude; the r-grep: and r-egrep: procedures should be internal to the primary r: procedures (using let), and the whole process could be implemented as a funtion. c) (not, uh, fully tested...) ; the following are only the expressions which had to be rewritten... (define (r-grep:seq . exprs) (let ((choose-grep (lambda (x) (cond ((string? x) x) ((list? x) (car x)))))) (apply string-append (map choose-grep exprs)))) (r-grep:seq "a" "b" '("grep" "egrep")) ; abgrep (define (r-egrep:seq . exprs) (let ((choose-egrep (lambda (x) (cond ((string? x) x) ((list? x) (cadr x)))))) (apply string-append (map choose-egrep exprs)))) #| test: (r-grep:seq "a" "b" '("grep" "egrep")) ; "abgrep" (r-egrep:seq "a" "b" '("grep" "egrep")) ; "abegrep" |# (define (r:seq . exprs) (cons (lambda () (apply r-grep:seq exprs)) (cons (lambda () (apply r-egrep:seq exprs)) '()))) #| test: (r:seq "a" "b" '("grep" "egrep")) ;Value: (#[compound-procedure 14] #[compound-procedure 15]) ((car (r:seq "a" "b" '("grep" "egrep")))) ; "abgrep" ((cadr (r:seq "a" "b" '("grep" "egrep")))) ; "abegrep" |# (define (r-egrep:alt . exprs) (let ((choose-egrep (lambda (x) (cond ((string? x) x) ((list? x) ((cadr x))))))) (if (pair? exprs) (apply r-egrep:seq (cons (choose-egrep (car exprs)) (append-map (lambda (expr) (list "|" (choose-egrep expr))) (cdr exprs)))) (r-egrep:seq)))) (define (r-grep:alt . exprs) (let ((choose-grep (lambda (x) (cond ((string? x) x) ((list? x) ((car x))))))) (if (pair? exprs) (apply r-grep:seq (cons (choose-grep (car exprs)) (append-map (lambda (expr) (list "\\|" (choose-grep expr))) (cdr exprs)))) (r-grep:seq)))) (define (r:alt . exprs) (cons (lambda () (apply r-grep:alt exprs)) (cons (lambda () (apply r-egrep:alt exprs)) '()))) #| test: (r-egrep:alt "a" (r:dot) "c") ;Value: "a|.|c" (r:alt "asdf" (r:dot)) ;Value: (#[compound-procedure 16] #[compound-procedure 17]) ((car (r:alt "asdf" (r:dot)))) ;Value: "asdf\\|." ((cadr (r:alt "asdf" (r:dot)))) ;Value: "asdf|." |# (define (r-grep:repeat min max expr) (let ((choose-grep (lambda (x) (cond ((string? x) x) ((list? x) ((car x))))))) (if (not (exact-nonnegative-integer? min)) (error "Min must be non-negative integer:" min)) (if max (begin (if (not (exact-nonnegative-integer? max)) (error "Max must be non-negative integer (or null):" max)) (if (not (<= min max)) (error "Min not less than max:" min max)))) (cond ((not max) (r-grep:seq (r-grep:subexp (choose-grep expr)) (string-append "\\{" (number->string min) ",\\}"))) ((= max min) (r-grep:seq (r-grep:subexp (choose-grep expr)) (string-append "\\{" (number->string min) "\\}"))) (else (r-grep:seq (r-grep:subexp (choose-grep expr)) (string-append "\\{" (number->string min) "," (number->string max) "\\}")))))) (define (r-egrep:repeat min max expr) (let ((choose-egrep (lambda (x) (cond ((string? x) x) ((list? x) ((cadr x))))))) (if (not (exact-nonnegative-integer? min)) (error "Min must be non-negative integer:" min)) (if max (begin (if (not (exact-nonnegative-integer? max)) (error "Max must be non-negative integer (or null):" max)) (if (not (<= min max)) (error "Min not less than max:" min max)))) (cond ((not max) (r-egrep:seq (r-egrep:subexp (choose-egrep expr)) (string-append "{" (number->string min) ",}"))) ((= max min) (r-egrep:seq (r-egrep:subexp (choose-egrep expr)) (string-append "{" (number->string min) "}"))) (else (r-egrep:seq (r-egrep:subexp (choose-egrep expr)) (string-append "{" (number->string min) "," (number->string max) "}")))))) (define (r:repeat min max expr) (cons (lambda () (r-grep:repeat min max expr)) (cons (lambda () (r-egrep:repeat min max expr)) '()))) (define (r-grep:subexp . exprs) (string-append "\\(" (apply string-append (map (lambda (x) (cond ((string? x) x) ((pair? x) ((car x))))) exprs)) "\\)")) (define (r-egrep:subexp . exprs) (string-append "(" (apply string-append (map (lambda (x) (cond ((string? x) x) ((pair? x) ((cadr x))))) exprs)) ")")) (define (r:subexp . exprs) (cons (lambda () (apply r-grep:subexp exprs)) (cons (lambda () (apply r-egrep:subexp exprs)) '()))) #| test: (r:subexp "a" "b" (r:dot)) ;Value: (#[compound-procedure 25] #[compound-procedure 26]) ((cadr (r:subexp "a" "b" (r:dot)))) ;Value: "(ab.)" ((car (r:subexp "a" "b" (r:dot)))) ;Value: "\\(ab.\\)" |# (define (r:grep expr filename) (r:grep-like "grep" '() ((car expr)) filename)) (define (r:egrep expr filename) (if (eq? microcode-id/operating-system 'nt) (r:grep-like "grep" '("-E") ((cadr expr)) filename) (r:grep-like "egrep" '() ((cadr expr)) filename))) (pp (r:grep (r:seq (r:repeat 2 3 (r:quote "cat")) (r:+ (r:quote "dog"))) "tests.txt"))