diff options
Diffstat (limited to 'prec.scm')
-rw-r--r-- | prec.scm | 438 |
1 files changed, 438 insertions, 0 deletions
diff --git a/prec.scm b/prec.scm new file mode 100644 index 0000000..bb66763 --- /dev/null +++ b/prec.scm @@ -0,0 +1,438 @@ +; "prec.scm", dynamically extensible parser/tokenizer -*-scheme-*- +; Copyright 1989, 1990, 1991, 1992, 1993, 1995, 1997 Aubrey Jaffer. +; +;Permission to copy this software, to redistribute it, and to use it +;for any purpose is granted, subject to the following restrictions and +;understandings. +; +;1. Any copy made of this software must include this copyright notice +;in full. +; +;2. I have made no warrantee or representation that the operation of +;this software will be error-free, and I am under no obligation to +;provide any services, by way of maintenance, update, or otherwise. +; +;3. In conjunction with products arising from the use of this +;material, there shall be no use of my name in any advertising, +;promotional, or sales literature without prior written consent in +;each case. + +; This file implements: +; * a Pratt style parser. +; * a tokenizer which congeals tokens according to assigned classes of +; constituent characters. +; +; This module is a significant improvement because grammar can be +; changed dynamically from rulesets which don't need compilation. +; Theoretically, all possibilities of bad input are handled and return +; as much structure as was parsed when the error occured; The symbol +; `?' is substituted for missing input. + +; References for the parser are: + +; Pratt, V. R. +; Top Down Operator Precendence. +; SIGACT/SIGPLAN +; Symposium on Principles of Programming Languages, +; Boston, 1973, 41-51 + +; WORKING PAPER 121 +; CGOL - an Alternative External Representation For LISP users +; Vaughan R. Pratt +; MIT Artificial Intelligence Lab. +; March 1976 + +; Mathlab Group, +; MACSYMA Reference Manual, Version Ten, +; Laboratory for Computer Science, MIT, 1983 + +(require 'fluid-let) +(require 'string-search) +(require 'string-port) +(require 'delay) + +(define *syn-defs* #f) +(define *syn-rules* #f) ;Dynamically bound +(define *prec:port* #f) ;Dynamically bound + +;; keeps track of input column so we can generate useful error displays. +(define tok:column 0) +(define (tok:peek-char) (peek-char *prec:port*)) +(define (tok:read-char) + (let ((c (read-char *prec:port*))) + (if (or (eqv? c #\newline) (eof-object? c)) + (set! tok:column 0) + (set! tok:column (+ 1 tok:column))) + c)) +(define (tok:bump-column pos . ports) + ((lambda (thunk) + (cond ((null? ports) (thunk)) + (else (fluid-let ((*prec:port* (car ports))) (thunk))))) + (lambda () + (cond ((eqv? #\newline (tok:peek-char)) + (tok:read-char))) ;to do newline + (set! tok:column (+ tok:column pos))))) +(define (prec:warn msg) + (do ((j (+ -1 tok:column) (+ -8 j))) + ((> 8 j) + (do ((i j (+ -1 i))) + ((>= 0 i)) + (display #\ ))) + (display slib:tab)) + (display "^ ") + (display msg) + (newline)) + +;; Structure of lexical records. +(define tok:make-rec cons) +(define tok:cc car) +(define tok:sfp cdr) + +(define (tok:lookup alist char) + (if (eof-object? char) + #f + (let ((pair (assv char alist))) + (and pair (cdr pair))))) + +(define (tok:char-group group chars chars-proc) + (map (lambda (token) +;;; (let ((oldlexrec (tok:lookup *syn-defs* token))) +;;; (cond ((or (not oldlexrec) (eqv? (tok:cc oldlexrec) group))) +;;; (else (math:warn 'cc-of token 'redefined-to- group)))) + (cons token (tok:make-rec group chars-proc))) + (cond ((string? chars) (string->list chars)) + ((char? chars) (list chars)) + (else chars)))) + +(define (tokenize) + (let* ((char (tok:read-char)) + (rec (tok:lookup *syn-rules* char)) + (proc (and rec (tok:cc rec))) + (clist (list char))) + (cond + ((not proc) char) + ((procedure? proc) + (do ((cl clist (begin (set-cdr! cl (list (tok:read-char))) (cdr cl)))) + ((proc (tok:peek-char)) + ((or (tok:sfp rec) list->string) clist)))) + ((eqv? 0 proc) (tokenize)) + (else + (do ((cl clist (begin (set-cdr! cl (list (tok:read-char))) (cdr cl)))) + ((not (let* ((prec (tok:lookup *syn-rules* (tok:peek-char))) + (cclass (and prec (tok:cc prec)))) + (or (eqv? cclass proc) + (eqv? cclass (+ -1 proc))))) + ((tok:sfp rec) clist))))))) + +;;; PREC:NUD is the null denotation (function and arguments to call when no +;;; unclaimed tokens). +;;; PREC:LED is the left denotation (function and arguments to call when +;;; unclaimed token is on left). +;;; PREC:LBP is the left binding power of this LED. It is the first +;;; argument position of PREC:LED + +(define (prec:nudf alist self) + (let ((pair (assoc (cons 'nud self) alist))) + (and pair (cdr pair)))) +(define (prec:ledf alist self) + (let ((pair (assoc (cons 'led self) alist))) + (and pair (cdr pair)))) +(define (prec:lbp alist self) + (let ((pair (assoc (cons 'led self) alist))) + (and pair (cadr pair)))) + +(define (prec:call-or-list proc . args) + (prec:apply-or-cons proc args)) +(define (prec:apply-or-cons proc args) + (if (procedure? proc) (apply proc args) (cons (or proc '?) args))) + +;;; PREC:SYMBOLFY and PREC:DE-SYMBOLFY are not exact inverses. +(define (prec:symbolfy obj) + (cond ((symbol? obj) obj) + ((string? obj) (string->symbol obj)) + ((char? obj) (string->symbol (string obj))) + (else obj))) + +(define (prec:de-symbolfy obj) + (cond ((symbol? obj) (symbol->string obj)) + (else obj))) + +;;;Calls to set up tables. + +(define (prec:define-grammar . synlsts) + (set! *syn-defs* (append (apply append synlsts) *syn-defs*))) + +(define (prec:make-led toks . args) + (map (lambda (tok) + (cons (cons 'led (prec:de-symbolfy tok)) + args)) + (if (pair? toks) toks (list toks)))) +(define (prec:make-nud toks . args) + (map (lambda (tok) + (cons (cons 'nud (prec:de-symbolfy tok)) + args)) + (if (pair? toks) toks (list toks)))) + +;;; Produce dynamically augmented grammars. +(define (prec:process-binds binds rules) + (if (and #f (not (null? binds)) (eq? #t (car binds))) + (cdr binds) + (append binds rules))) + +;;(define (prec:replace-rules) some-sort-of-magic-cookie) + +;;; Here are the procedures to define high-level grammar, along with +;;; utility functions called during parsing. The utility functions +;;; (prec:parse-*) could be incorportated into the defining commands, +;;; but tracing these functions is useful for debugging. + +(define (prec:delim tk) + (prec:make-led tk 0 #f)) + +(define (prec:nofix tk sop) + (prec:make-nud tk prec:parse-nofix sop)) +(define (prec:parse-nofix self sop) + (prec:call-or-list (or sop (prec:symbolfy self)))) + +(define (prec:prefix tk sop bp . binds) + (prec:make-nud tk prec:parse-prefix sop bp (apply append binds))) +(define (prec:parse-prefix self sop bp binds) + (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*))) + (prec:call-or-list (or sop (prec:symbolfy self)) (prec:parse1 bp)))) + +(define (prec:infix tk sop lbp bp . binds) + (prec:make-led tk lbp prec:parse-infix sop bp (apply append binds))) +(define (prec:parse-infix left self lbp sop bp binds) + (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*))) + (prec:call-or-list (or sop (prec:symbolfy self)) left (prec:parse1 bp)))) + +(define (prec:nary tk sop bp) + (prec:make-led tk bp prec:parse-nary sop bp)) +(define (prec:parse-nary left self lbp sop bp) + (prec:apply-or-cons (or sop (prec:symbolfy self)) + (cons left (prec:parse-list self bp)))) + +(define (prec:postfix tk sop lbp) + (prec:make-led tk lbp prec:parse-postfix sop)) +(define (prec:parse-postfix left self lbp sop) + (prec:call-or-list (or sop (prec:symbolfy self)) left)) + +(define (prec:prestfix tk sop bp . binds) + (prec:make-nud tk prec:parse-rest sop bp (apply append binds))) +(define (prec:parse-rest self sop bp binds) + (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*))) + (prec:apply-or-cons (or sop (prec:symbolfy self)) (prec:parse-list #f bp)))) + +(define (prec:commentfix tk stp match . binds) + (append + (prec:make-nud tk prec:parse-nudcomment stp match (apply append binds)) + (prec:make-led tk 220 prec:parse-ledcomment stp match (apply append binds)))) +(define (prec:parse-nudcomment self stp match binds) + (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*))) + (tok:read-through-comment stp match) + (prec:advance) + (cond ((prec:delim? (force prec:token)) #f) + (else (prec:parse1 prec:bp))))) +(define (prec:parse-ledcomment left lbp self stp match binds) + (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*))) + (tok:read-through-comment stp match) + (prec:advance) + left)) +(define (tok:read-through-comment stp match) + (set! match (if (char? match) + (string match) + (prec:de-symbolfy match))) + (cond ((procedure? stp) + (let* ((len #f) + (str (call-with-output-string + (lambda (sp) + (set! len (find-string-from-port? + match *prec:port* + (lambda (c) (display c sp) #f))))))) + (stp (and len (substring str 0 (- len (string-length match))))))) + (else (find-string-from-port? match *prec:port*)))) + +(define (prec:matchfix tk sop sep match . binds) + (define sep-lbp 0) + (prec:make-nud tk prec:parse-matchfix + sop sep-lbp sep match + (apply append (prec:delim match) binds))) +(define (prec:parse-matchfix self sop sep-lbp sep match binds) + (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*))) + (cond (sop (prec:apply-or-cons + sop (prec:parse-delimited sep sep-lbp match))) + ((equal? (force prec:token) match) + (prec:warn 'expression-missing) + (prec:advance) + '?) + (else (let ((ans (prec:parse1 0))) ;just parenthesized expression + (cond ((equal? (force prec:token) match) + (prec:advance)) + ((prec:delim? (force prec:token)) + (prec:warn 'mismatched-delimiter) + (prec:advance)) + (else (prec:warn 'delimiter-expected--ignoring-rest) + (do () ((prec:delim? (force prec:token))) + (prec:parse1 0)))) + ans))))) + +(define (prec:inmatchfix tk sop sep match lbp . binds) + (define sep-lbp 0) + (prec:make-led tk lbp prec:parse-inmatchfix + sop sep-lbp sep match + (apply append (prec:delim match) binds))) +(define (prec:parse-inmatchfix left self lbp sop sep-lbp sep match binds) + (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*))) + (prec:apply-or-cons + sop (cons left (prec:parse-delimited sep sep-lbp match))))) + +;;;; Here is the code which actually parses. + +(define prec:bp #f) ;dynamically bound +(define prec:token #f) +(define (prec:advance) + (set! prec:token (delay (tokenize)))) +(define (prec:advance-return-last) + (let ((last (and prec:token (force prec:token)))) + (prec:advance) + last)) + +(define (prec:nudcall self) + (let ((pob (prec:nudf *syn-rules* self))) + (cond + (pob (let ((proc (car pob))) + (cond ((procedure? proc) (apply proc self (cdr pob))) + (proc (cons proc (cdr pob))) + (else '?)))) + ((char? self) (prec:warn 'extra-separator) + (prec:advance) + (prec:nudcall (force prec:token))) + ((string? self) (string->symbol self)) + (else self)))) + +(define (prec:ledcall left self) + (let* ((pob (prec:ledf *syn-rules* self))) + (apply (cadr pob) left self (cdr pob)))) + +;;; PREC:PARSE1 is the heart. +(define (prec:parse1 bp) + (fluid-let ((prec:bp bp)) + (do ((left (prec:nudcall (prec:advance-return-last)) + (prec:ledcall left (prec:advance-return-last)))) + ((or (>= bp 200) ;to avoid unneccesary lookahead + (>= bp (or (prec:lbp *syn-rules* (force prec:token)) 0)) + (not left)) + left)))) + +(define (prec:delim? token) + (or (eof-object? token) (<= (or (prec:lbp *syn-rules* token) 220) 0))) + +(define (prec:parse-list sep bp) + (cond ((prec:delim? (force prec:token)) + (prec:warn 'expression-missing) + '(?)) + (else + (let ((f (prec:parse1 bp))) + (cons f (cond ((equal? (force prec:token) sep) + (prec:advance) + (cond ((equal? (force prec:token) sep) + (prec:warn 'expression-missing) + (prec:advance) + (cons '? (prec:parse-list sep bp))) + ((prec:delim? (force prec:token)) + (prec:warn 'expression-missing) + '(?)) + (else (prec:parse-list sep bp)))) + ((prec:delim? (force prec:token)) '()) + ((not sep) (prec:parse-list sep bp)) + ((prec:delim? sep) (prec:warn 'separator-missing) + (prec:parse-list sep bp)) + (else '()))))))) + +(define (prec:parse-delimited sep bp delim) + (cond ((equal? (force prec:token) sep) + (prec:warn 'expression-missing) + (prec:advance) + (cons '? (prec:parse-delimited sep delim))) + ((prec:delim? (force prec:token)) + (if (not (equal? (force prec:token) delim)) + (prec:warn 'mismatched-delimiter)) + (if (not sep) (prec:warn 'expression-missing)) + (prec:advance) + (if sep '() '(?))) + (else (let ((ans (prec:parse-list sep bp))) + (cond ((equal? (force prec:token) delim)) + ((prec:delim? (force prec:token)) + (prec:warn 'mismatched-delimiter)) + (else (prec:warn 'delimiter-expected--ignoring-rest) + (do () ((prec:delim? (force prec:token))) + (prec:parse1 bp)))) + (prec:advance) + ans)))) + +(define (prec:parse grammar delim . port) + (set! delim (prec:de-symbolfy delim)) + (fluid-let ((*syn-rules* (append (prec:delim delim) grammar)) + (*prec:port* (if (null? port) (current-input-port) (car port)))) + (prec:advance) ; setup prec:token with first token + (cond ((eof-object? (force prec:token)) (force prec:token)) + ((equal? (force prec:token) delim) #f) + (else + (let ((ans (prec:parse1 0))) + (cond ((eof-object? (force prec:token))) + ((equal? (force prec:token) delim)) + (else (prec:warn 'delimiter-expected--ignoring-rest) + (do () ((or (equal? (force prec:token) delim) + (eof-object? (force prec:token)))) + (prec:advance)))) + ans))))) + +(define tok:decimal-digits "0123456789") +(define tok:upper-case "ABCDEFGHIJKLMNOPQRSTUVWXYZ") +(define tok:lower-case "abcdefghijklmnopqrstuvwxyz") +(define tok:whitespaces + (do ((i (+ -1 (min 256 char-code-limit)) (+ -1 i)) + (ws "" (if (char-whitespace? (integer->char i)) + (string-append ws (string (integer->char i))) + ws))) + ((negative? i) ws))) + +;;;;The parse tables. +;;; Definitions accumulate in top-level variable *SYN-DEFS*. +(set! *syn-defs* '()) ;Make sure *SYN-DEFS* is empty. + +;;; Ignore Whitespace characters. +(prec:define-grammar (tok:char-group 0 tok:whitespaces #f)) + +;;; On MS-DOS systems, <ctrl>-Z (26) needs to be ignored in order to +;;; avoid problems at end of files. +(case (software-type) + ((MSDOS) + (if (not (char-whitespace? (integer->char 26))) + (prec:define-grammar (tok:char-group 0 (integer->char 26) #f)) + ))) + +;;; Save these convenient definitions. +(define *syn-ignore-whitespace* *syn-defs*) +(set! *syn-defs* '()) + +(define (prec:trace) + (require 'trace) + (trace prec:parse prec:parse1 + prec:parse-delimited prec:parse-list + prec:call-or-list prec:apply-or-cons + ;;tokenize prec:advance-return-last prec:advance + prec:nudcall prec:ledcall + prec:parse-nudcomment prec:parse-ledcomment + prec:parse-delimited prec:parse-list + prec:parse-nary prec:parse-rest + prec:parse-matchfix prec:parse-inmatchfix + prec:parse-prefix prec:parse-infix prec:parse-postfix + ;;prec:delim? + ;;prec:ledf prec:nudf prec:lbp + ) + (set! *qp-width* 333)) + +;;(begin (trace-all "prec.scm") (set! *qp-width* 333)) +;;(pretty-print (grammar-read-tab (get-grammar 'standard))) +;;(prec:trace) |