From 64f037d91e0c9296dcaef9a0ff3eb33b19a2ed34 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 20 Feb 2017 00:05:38 -0800 Subject: Import Upstream version 3a5 --- xml-parse.scm | 2025 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2025 insertions(+) create mode 100644 xml-parse.scm (limited to 'xml-parse.scm') diff --git a/xml-parse.scm b/xml-parse.scm new file mode 100644 index 0000000..6056220 --- /dev/null +++ b/xml-parse.scm @@ -0,0 +1,2025 @@ +;;;;"xml-parse" XML parsing and conversion to SXML (Scheme-XML) +;;; Copyright (C) 2007 Aubrey Jaffer +;;; 2007-04 jaffer: demacrofied from public-domain SSAX 5.1 +; +;Permission to copy this software, to modify it, to redistribute it, +;to distribute modified versions, and to use it for any purpose is +;granted, subject to the following restrictions and understandings. +; +;1. Any copy made of this software must include this copyright notice +;in full. +; +;2. I have made no warranty or representation that the operation of +;this software will be error-free, and I am under no obligation to +;provide any services, by way of maintenance, update, or otherwise. +; +;3. In conjunction with products arising from the use of this +;material, there shall be no use of my name in any advertising, +;promotional, or sales literature without prior written consent in +;each case. + +;;@code{(require 'xml-parse)} or @code{(require 'ssax)} +;; +;;@noindent +;;The XML standard document referred to in this module is@* +;;@url{http://www.w3.org/TR/1998/REC-xml-19980210.html}. +;; +;;@noindent +;;The present frameworks fully supports the XML Namespaces +;;Recommendation@* +;;@url{http://www.w3.org/TR/REC-xml-names}. + +(require 'rev2-procedures) ; for substring-move-left! +(require 'string-search) +(require 'let-values) +(require 'values) +(require 'srfi-1) ; for fold-right, fold, cons* + +;;@subsection String Glue + +;;;; Three functions from SRFI-13 +; procedure string-concatenate-reverse STRINGS [FINAL END] +(define (ssax:string-concatenate-reverse strs final end) + (if (null? strs) (substring final 0 end) + (let* + ((total-len + (let loop ((len end) (lst strs)) + (if (null? lst) len + (loop (+ len (string-length (car lst))) (cdr lst))))) + (result (make-string total-len))) + (let loop ((len end) (j total-len) (str final) (lst strs)) + (substring-move-left! str 0 len result (- j len)) + (if (null? lst) result + (loop (string-length (car lst)) (- j len) + (car lst) (cdr lst))))))) +; string-concatenate/shared STRING-LIST -> STRING +(define (ssax:string-concatenate/shared strs) + (cond ((null? strs) "") ; Test for the fast path first + ((null? (cdr strs)) (car strs)) + (else + (let* + ((total-len + (let loop ((len (string-length (car strs))) (lst (cdr strs))) + (if (null? lst) len + (loop (+ len (string-length (car lst))) (cdr lst))))) + (result (make-string total-len))) + (let loop ((j 0) (str (car strs)) (lst (cdr strs))) + (substring-move-left! str 0 (string-length str) result j) + (if (null? lst) result + (loop (+ j (string-length str)) + (car lst) (cdr lst)))))))) +; string-concatenate-reverse/shared STRING-LIST [FINAL-STRING END] -> STRING +; We do not use the optional arguments of this procedure. Therefore, +; we do not implement them. See SRFI-13 for the complete +; implementation. +(define (ssax:string-concatenate-reverse/shared strs) + (cond ((null? strs) "") ; Test for the fast path first + ((null? (cdr strs)) (car strs)) + (else + (ssax:string-concatenate-reverse (cdr strs) + (car strs) + (string-length (car strs)))))) + + +;;@args list-of-frags +;; +;;Given the list of fragments (some of which are text strings), +;;reverse the list and concatenate adjacent text strings. If +;;LIST-OF-FRAGS has zero or one element, the result of the procedure +;;is @code{equal?} to its argument. +(define (ssax:reverse-collect-str fragments) + (cond + ((null? fragments) '()) ; a shortcut + ((null? (cdr fragments)) fragments) ; see the comment above + (else + (let loop ((fragments fragments) (result '()) (strs '())) + (cond + ((null? fragments) + (if (null? strs) + result + (cons (ssax:string-concatenate/shared strs) result))) + ((string? (car fragments)) + (loop (cdr fragments) result (cons (car fragments) strs))) + (else + (loop (cdr fragments) + (cons (car fragments) + (if (null? strs) + result + (cons (ssax:string-concatenate/shared strs) result))) + '()))))))) + +;;@args list-of-frags +;; +;;Given the list of fragments (some of which are text strings), +;;reverse the list and concatenate adjacent text strings while +;;dropping "unsignificant" whitespace, that is, whitespace in front, +;;behind and between elements. The whitespace that is included in +;;character data is not affected. +;; +;;Use this procedure to "intelligently" drop "insignificant" +;;whitespace in the parsed SXML. If the strict compliance with the +;;XML Recommendation regarding the whitespace is desired, use the +;;@code{ssax:reverse-collect-str} procedure instead. +(define (ssax:reverse-collect-str-drop-ws fragments) + ;; Test if a string is made of only whitespace. + ;; An empty string is considered made of whitespace as well + (define (string-whitespace? str) + (let ((len (string-length str))) + (cond ((zero? len) #t) + ((= 1 len) (char-whitespace? (string-ref str 0))) + (else + (let loop ((i 0)) + (or (>= i len) + (and (char-whitespace? (string-ref str i)) + (loop (+ 1 i))))))))) + (cond + ((null? fragments) '()) ; a shortcut + ((null? (cdr fragments)) ; another shortcut + (if (and (string? (car fragments)) (string-whitespace? (car fragments))) + '() ; remove trailing ws + fragments)) + (else + (let loop ((fragments fragments) (result '()) (strs '()) + (all-whitespace? #t)) + (cond + ((null? fragments) + (if all-whitespace? + result ; remove leading ws + (cons (ssax:string-concatenate/shared strs) result))) + ((string? (car fragments)) + (loop (cdr fragments) + result + (cons (car fragments) strs) + (and all-whitespace? (string-whitespace? (car fragments))))) + (else + (loop (cdr fragments) + (cons (car fragments) + (if all-whitespace? + result + (cons (ssax:string-concatenate/shared strs) result))) + '() + #t))))))) + +;;@subsection Character and Token Functions +;; +;;The following functions either skip, or build and return tokens, +;;according to inclusion or delimiting semantics. The list of +;;characters to expect, include, or to break at may vary from one +;;invocation of a function to another. This allows the functions to +;;easily parse even context-sensitive languages. +;; +;;Exceptions are mentioned specifically. The list of expected +;;characters (characters to skip until, or break-characters) may +;;include an EOF "character", which is coded as symbol *eof* +;; +;;The input stream to parse is specified as a PORT, which is the last +;;argument. + +;;@args char-list string port +;; +;;Reads a character from the @3 and looks it up in the +;;@1 of expected characters. If the read character was +;;found among expected, it is returned. Otherwise, the +;;procedure writes a message using @2 as a comment +;;and quits. +(define (ssax:assert-current-char expected-chars comment port) + (let ((c (read-char port))) + (if (memv c expected-chars) c + (slib:error port "Wrong character " c + " (0x" (if (eof-object? c) + "*eof*" + (number->string (char->integer c) 16)) ") " + comment ". " expected-chars " expected")))) + +;;@args char-list port +;; +;;Reads characters from the @2 and disregards them, as long as they +;;are mentioned in the @1. The first character (which may be EOF) +;;peeked from the stream that is @emph{not} a member of the @1 is +;;returned. +(define (ssax:skip-while skip-chars port) + (do ((c (peek-char port) (peek-char port))) + ((not (memv c skip-chars)) c) + (read-char port))) + +;;; Stream tokenizers +;; +;;Note: since we can't tell offhand how large the token being read is +;;going to be, we make a guess, pre-allocate a string, and grow it by +;;quanta if necessary. The quantum is always the length of the string +;;before it was extended the last time. Thus the algorithm does a +;;Fibonacci-type extension, which has been proven optimal. +;; +;;Size 32 turns out to be fairly good, on average. That policy is +;;good only when a Scheme system is multi-threaded with preemptive +;;scheduling, or when a Scheme system supports shared substrings. In +;;all the other cases, it's better for ssax:init-buffer to return the +;;same static buffer. ssax:next-token* functions return a copy (a +;;substring) of accumulated data, so the same buffer can be reused. +;;We shouldn't worry about an incoming token being too large: +;;ssax:next-token will use another chunk automatically. Still, the +;;best size for the static buffer is to allow most of the tokens to +;;fit in. Using a static buffer _dramatically_ reduces the amount of +;;produced garbage (e.g., during XML parsing). + +;;@body +;; +;;Returns an initial buffer for @code{ssax:next-token*} procedures. +;;@0 may allocate a new buffer at each invocation. +(define (ssax:init-buffer) (make-string 32)) + +;;;(define ssax:init-buffer +;;; (let ((buffer (make-string 512))) +;;; (lambda () buffer))) + +;;@args prefix-char-list break-char-list comment-string port +;; +;;Skips any number of the prefix characters (members of the @1), if +;;any, and reads the sequence of characters up to (but not including) +;;a break character, one of the @2. +;; +;;The string of characters thus read is returned. The break character +;;is left on the input stream. @2 may include the symbol @code{*eof*}; +;;otherwise, EOF is fatal, generating an error message including a +;;specified @3. +(define (ssax:next-token prefix-skipped-chars break-chars comment port) + (let outer ((buffer (ssax:init-buffer)) (filled-buffer-l '()) + (c (ssax:skip-while prefix-skipped-chars port))) + (let ((curr-buf-len (string-length buffer))) + (let loop ((i 0) (c c)) + (cond + ((memv c break-chars) + (if (null? filled-buffer-l) (substring buffer 0 i) + (ssax:string-concatenate-reverse filled-buffer-l buffer i))) + ((eof-object? c) + (if (memq '*eof* break-chars) ; was EOF expected? + (if (null? filled-buffer-l) (substring buffer 0 i) + (ssax:string-concatenate-reverse filled-buffer-l buffer i)) + (slib:error port "EOF while reading a token " comment))) + ((>= i curr-buf-len) + (outer (make-string curr-buf-len) + (cons buffer filled-buffer-l) c)) + (else + (string-set! buffer i c) + (read-char port) ; move to the next char + (loop (+ 1 i) (peek-char port)))))))) + +;;@noindent +;;@code{ssax:next-token-of} is similar to @code{ssax:next-token} +;;except that it implements an inclusion rather than delimiting +;;semantics. + +;;@args inc-charset port +;; +;;Reads characters from the @2 that belong to the list of characters +;;@1. The reading stops at the first character which is not a member +;;of the set. This character is left on the stream. All the read +;;characters are returned in a string. +;; +;;@args pred port +;; +;;Reads characters from the @2 for which @var{pred} (a procedure of +;;one argument) returns non-#f. The reading stops at the first +;;character for which @var{pred} returns #f. That character is left +;;on the stream. All the results of evaluating of @var{pred} up to #f +;;are returned in a string. +;; +;;@var{pred} is a procedure that takes one argument (a character or +;;the EOF object) and returns a character or #f. The returned +;;character does not have to be the same as the input argument to the +;;@var{pred}. For example, +;; +;;@example +;;(ssax:next-token-of (lambda (c) +;; (cond ((eof-object? c) #f) +;; ((char-alphabetic? c) (char-downcase c)) +;; (else #f))) +;; (current-input-port)) +;;@end example +;; +;;will try to read an alphabetic token from the current input port, +;;and return it in lower case. +(define (ssax:next-token-of incl-list/pred port) + (let* ((buffer (ssax:init-buffer)) + (curr-buf-len (string-length buffer))) + (if (procedure? incl-list/pred) + (let outer ((buffer buffer) (filled-buffer-l '())) + (let loop ((i 0)) + (if (>= i curr-buf-len) ; make sure we have space + (outer (make-string curr-buf-len) (cons buffer filled-buffer-l)) + (let ((c (incl-list/pred (peek-char port)))) + (if c + (begin + (string-set! buffer i c) + (read-char port) ; move to the next char + (loop (+ 1 i))) + ;; incl-list/pred decided it had had enough + (if (null? filled-buffer-l) (substring buffer 0 i) + (ssax:string-concatenate-reverse filled-buffer-l buffer i))))))) + + ;; incl-list/pred is a list of allowed characters + (let outer ((buffer buffer) (filled-buffer-l '())) + (let loop ((i 0)) + (if (>= i curr-buf-len) ; make sure we have space + (outer (make-string curr-buf-len) (cons buffer filled-buffer-l)) + (let ((c (peek-char port))) + (cond + ((not (memv c incl-list/pred)) + (if (null? filled-buffer-l) (substring buffer 0 i) + (ssax:string-concatenate-reverse filled-buffer-l buffer i))) + (else + (string-set! buffer i c) + (read-char port) ; move to the next char + (loop (+ 1 i)))))))) + ))) + +;;@body +;; +;;Reads @1 characters from the @2, and returns them in a string. If +;;EOF is encountered before @1 characters are read, a shorter string +;;will be returned. +(define (ssax:read-string len port) + (define buffer (make-string len)) + (do ((idx 0 (+ 1 idx))) + ((>= idx len) idx) + (let ((chr (read-char port))) + (cond ((eof-object? byt) + (set! idx (+ -1 idx)) + (set! len idx)) + (else (string-set! buffer idx chr)))))) + +;;@subsection Data Types +;; +;;@table @code +;; +;;@item TAG-KIND +;; +;;A symbol @samp{START}, @samp{END}, @samp{PI}, @samp{DECL}, +;;@samp{COMMENT}, @samp{CDSECT}, or @samp{ENTITY-REF} that identifies +;;a markup token +;; +;;@item UNRES-NAME +;; +;;a name (called GI in the XML Recommendation) as given in an XML +;;document for a markup token: start-tag, PI target, attribute name. +;;If a GI is an NCName, UNRES-NAME is this NCName converted into a +;;Scheme symbol. If a GI is a QName, @samp{UNRES-NAME} is a pair of +;;symbols: @code{(@var{PREFIX} . @var{LOCALPART})}. +;; +;;@item RES-NAME +;; +;;An expanded name, a resolved version of an @samp{UNRES-NAME}. For +;;an element or an attribute name with a non-empty namespace URI, +;;@samp{RES-NAME} is a pair of symbols, +;;@code{(@var{URI-SYMB} . @var{LOCALPART})}. +;;Otherwise, it's a single symbol. +;; +;;@item ELEM-CONTENT-MODEL +;; +;;A symbol: +;;@table @samp +;;@item ANY +;;anything goes, expect an END tag. +;;@item EMPTY-TAG +;;no content, and no END-tag is coming +;;@item EMPTY +;;no content, expect the END-tag as the next token +;;@item PCDATA +;;expect character data only, and no children elements +;;@item MIXED +;;@item ELEM-CONTENT +;;@end table +;; +;;@item URI-SYMB +;; +;;A symbol representing a namespace URI -- or other symbol chosen by +;;the user to represent URI. In the former case, @code{URI-SYMB} is +;;created by %-quoting of bad URI characters and converting the +;;resulting string into a symbol. +;; +;;@item NAMESPACES +;; +;;A list representing namespaces in effect. An element of the list +;;has one of the following forms: +;; +;;@table @code +;; +;;@item (@var{prefix} @var{uri-symb} . @var{uri-symb}) or +;; +;;@item (@var{prefix} @var{user-prefix} . @var{uri-symb}) +;;@var{user-prefix} is a symbol chosen by the user to represent the URI. +;; +;;@item (#f @var{user-prefix} . @var{uri-symb}) +;;Specification of the user-chosen prefix and a URI-SYMBOL. +;; +;;@item (*DEFAULT* @var{user-prefix} . @var{uri-symb}) +;;Declaration of the default namespace +;; +;;@item (*DEFAULT* #f . #f) +;;Un-declaration of the default namespace. This notation +;;represents overriding of the previous declaration +;; +;;@end table +;; +;;A NAMESPACES list may contain several elements for the same @var{prefix}. +;;The one closest to the beginning of the list takes effect. +;; +;;@item ATTLIST +;; +;;An ordered collection of (@var{NAME} . @var{VALUE}) pairs, where +;;@var{NAME} is a RES-NAME or an UNRES-NAME. The collection is an ADT. +;; +;;@item STR-HANDLER +;; +;;A procedure of three arguments: @var{string1} @var{string2} +;;@var{seed} returning a new @var{seed}. The procedure is supposed to +;;handle a chunk of character data @var{string1} followed by a chunk +;;of character data @var{string2}. @var{string2} is a short string, +;;often @samp{"\n"} and even @samp{""}. +;; +;;@item ENTITIES +;;An assoc list of pairs: +;;@lisp +;; (@var{named-entity-name} . @var{named-entity-body}) +;;@end lisp +;; +;;where @var{named-entity-name} is a symbol under which the entity was +;;declared, @var{named-entity-body} is either a string, or (for an +;;external entity) a thunk that will return an input port (from which +;;the entity can be read). @var{named-entity-body} may also be #f. +;;This is an indication that a @var{named-entity-name} is currently +;;being expanded. A reference to this @var{named-entity-name} will be +;;an error: violation of the WFC nonrecursion. +;; +;;@item XML-TOKEN +;; +;;This record represents a markup, which is, according to the XML +;;Recommendation, "takes the form of start-tags, end-tags, +;;empty-element tags, entity references, character references, +;;comments, CDATA section delimiters, document type declarations, and +;;processing instructions." +;; +;;@table @asis +;;@item kind +;;a TAG-KIND +;;@item head +;;an UNRES-NAME. For XML-TOKENs of kinds 'COMMENT and 'CDSECT, the +;;head is #f. +;;@end table +;; +;;For example, +;;@example +;;

=> kind=START, head=P +;;

=> kind=END, head=P +;;
=> kind=EMPTY-EL, head=BR +;; => kind=DECL, head=DOCTYPE +;; => kind=PI, head=xml +;;&my-ent; => kind=ENTITY-REF, head=my-ent +;;@end example +;; +;;Character references are not represented by xml-tokens as these +;;references are transparently resolved into the corresponding +;;characters. +;; +;;@item XML-DECL +;; +;;The record represents a datatype of an XML document: the list of +;;declared elements and their attributes, declared notations, list of +;;replacement strings or loading procedures for parsed general +;;entities, etc. Normally an XML-DECL record is created from a DTD or +;;an XML Schema, although it can be created and filled in in many +;;other ways (e.g., loaded from a file). +;; +;;@table @var +;;@item elems +;;an (assoc) list of decl-elem or #f. The latter instructs +;;the parser to do no validation of elements and attributes. +;; +;;@item decl-elem +;;declaration of one element: +;; +;;@code{(@var{elem-name} @var{elem-content} @var{decl-attrs})} +;; +;;@var{elem-name} is an UNRES-NAME for the element. +;; +;;@var{elem-content} is an ELEM-CONTENT-MODEL. +;; +;;@var{decl-attrs} is an @code{ATTLIST}, of +;;@code{(@var{attr-name} . @var{value})} associations. +;; +;;This element can declare a user procedure to handle parsing of an +;;element (e.g., to do a custom validation, or to build a hash of IDs +;;as they're encountered). +;; +;;@item decl-attr +;;an element of an @code{ATTLIST}, declaration of one attribute: +;; +;;@code{(@var{attr-name} @var{content-type} @var{use-type} @var{default-value})} +;; +;;@var{attr-name} is an UNRES-NAME for the declared attribute. +;; +;;@var{content-type} is a symbol: @code{CDATA}, @code{NMTOKEN}, +;;@code{NMTOKENS}, @dots{} or a list of strings for the enumerated +;;type. +;; +;;@var{use-type} is a symbol: @code{REQUIRED}, @code{IMPLIED}, or +;;@code{FIXED}. +;; +;;@var{default-value} is a string for the default value, or #f if not +;;given. +;; +;;@end table +;; +;;@end table + +;;see a function make-empty-xml-decl to make a XML declaration entry +;;suitable for a non-validating parsing. + +;;We define xml-token simply as a pair. +(define (make-xml-token kind head) (cons kind head)) +(define xml-token? pair?) +(define xml-token-kind car) +(define xml-token-head cdr) + +;;@subsection Low-Level Parsers and Scanners +;; +;;@noindent +;;These procedures deal with primitive lexical units (Names, +;;whitespaces, tags) and with pieces of more generic productions. +;;Most of these parsers must be called in appropriate context. For +;;example, @code{ssax:complete-start-tag} must be called only when the +;;start-tag has been detected and its GI has been read. + +(define char-return (integer->char 13)) +(define ssax:S-chars (map integer->char '(32 10 9 13))) + +;;@body +;; +;;Skip the S (whitespace) production as defined by +;;@example +;;[3] S ::= (#x20 | #x09 | #x0D | #x0A) +;;@end example +;; +;;@0 returns the first not-whitespace character it encounters while +;;scanning the @1. This character is left on the input stream. +(define (ssax:skip-S port) + (ssax:skip-while ssax:S-chars port)) + +;;Check to see if a-char may start a NCName +(define (ssax:ncname-starting-char? a-char) + (and (char? a-char) + (or (char-alphabetic? a-char) + (char=? #\_ a-char)))) + +;;@body +;; +;;Read a NCName starting from the current position in the @1 and +;;return it as a symbol. +;; +;;@example +;;[4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' +;; | CombiningChar | Extender +;;[5] Name ::= (Letter | '_' | ':') (NameChar)* +;;@end example +;; +;;This code supports the XML Namespace Recommendation REC-xml-names, +;;which modifies the above productions as follows: +;; +;;@example +;;[4] NCNameChar ::= Letter | Digit | '.' | '-' | '_' +;; | CombiningChar | Extender +;;[5] NCName ::= (Letter | '_') (NCNameChar)* +;;@end example +;; +;;As the Rec-xml-names says, +;; +;;@quotation +;;"An XML document conforms to this specification if all other tokens +;;[other than element types and attribute names] in the document which +;;are required, for XML conformance, to match the XML production for +;;Name, match this specification's production for NCName." +;;@end quotation +;; +;;Element types and attribute names must match the production QName, +;;defined below. +(define (ssax:read-NCName port) + (let ((first-char (peek-char port))) + (or (ssax:ncname-starting-char? first-char) + (slib:error port "XMLNS [4] for '" first-char "'"))) + (string->symbol (ssax:next-token-of (lambda (c) + (cond + ((eof-object? c) #f) + ((char-alphabetic? c) c) + ((string-index "0123456789.-_" c) c) + (else #f))) + port))) + +;;@body +;; +;;Read a (namespace-) Qualified Name, QName, from the current position +;;in @1; and return an UNRES-NAME. +;; +;;From REC-xml-names: +;;@example +;;[6] QName ::= (Prefix ':')? LocalPart +;;[7] Prefix ::= NCName +;;[8] LocalPart ::= NCName +;;@end example +(define (ssax:read-QName port) + (let ((prefix-or-localpart (ssax:read-NCName port))) + (case (peek-char port) + ((#\:) ; prefix was given after all + (read-char port) ; consume the colon + (cons prefix-or-localpart (ssax:read-NCName port))) + (else prefix-or-localpart) ; Prefix was omitted + ))) + +;;The prefix of the pre-defined XML namespace +(define ssax:Prefix-XML (string->symbol "xml")) + +;;An UNRES-NAME that is postulated to be larger than anything that can +;;occur in a well-formed XML document. ssax:name-compare enforces +;;this postulate. +(define ssax:largest-unres-name (cons (string->symbol "#LARGEST-SYMBOL") + (string->symbol "#LARGEST-SYMBOL"))) + +;;Compare one RES-NAME or an UNRES-NAME with the other. +;;Return a symbol '<, '>, or '= depending on the result of +;;the comparison. +;;Names without @var{prefix} are always smaller than those with the @var{prefix}. +(define ssax:name-compare + (letrec ((symbol-compare + (lambda (symb1 symb2) + (cond + ((eq? symb1 symb2) '=) + ((stringstring symb1) (symbol->string symb2)) + '<) + (else '>))))) + (lambda (name1 name2) + (cond + ((symbol? name1) (if (symbol? name2) (symbol-compare name1 name2) + '<)) + ((symbol? name2) '>) + ((eq? name2 ssax:largest-unres-name) '<) + ((eq? name1 ssax:largest-unres-name) '>) + ((eq? (car name1) (car name2)) ; prefixes the same + (symbol-compare (cdr name1) (cdr name2))) + (else (symbol-compare (car name1) (car name2))))))) + +;;@args port +;; +;;This procedure starts parsing of a markup token. The current +;;position in the stream must be @samp{<}. This procedure scans +;;enough of the input stream to figure out what kind of a markup token +;;it is seeing. The procedure returns an XML-TOKEN structure +;;describing the token. Note, generally reading of the current markup +;;is not finished! In particular, no attributes of the start-tag +;;token are scanned. +;; +;;Here's a detailed break out of the return values and the position in +;;the PORT when that particular value is returned: +;; +;;@table @asis +;; +;;@item PI-token +;; +;;only PI-target is read. To finish the Processing-Instruction and +;;disregard it, call @code{ssax:skip-pi}. @code{ssax:read-attributes} +;;may be useful as well (for PIs whose content is attribute-value +;;pairs). +;; +;;@item END-token +;; +;;The end tag is read completely; the current position is right after +;;the terminating @samp{>} character. +;; +;;@item COMMENT +;; +;;is read and skipped completely. The current position is right after +;;@samp{-->} that terminates the comment. +;; +;;@item CDSECT +;; +;;The current position is right after @samp{" port)) + (slib:error port "XML [15], no -->")) + (make-xml-token 'COMMENT #f)) + ;; we have read ") "XML [42]" port) + val)) + ((#\?) (read-char port) (make-xml-token 'PI (ssax:read-NCName port))) + ((#\!) + (read-char port) + (case (peek-char port) + ((#\-) (read-char port) (skip-comment port)) + ((#\[) (read-char port) (read-cdata port)) + (else (make-xml-token 'DECL (ssax:read-NCName port))))) + (else (make-xml-token 'START (ssax:read-QName port))))))) + +;;@body +;; +;;The current position is inside a PI. Skip till the rest of the PI +(define (ssax:skip-pi port) + (if (not (find-string-from-port? "?>" port)) + (slib:error port "Failed to find ?> terminating the PI"))) + +;;@body +;; +;;The current position is right after reading the PITarget. We read +;;the body of PI and return is as a string. The port will point to +;;the character right after @samp{?>} combination that terminates PI. +;; +;;@example +;;[16] PI ::= '' Char*)))? '?>' +;;@end example +(define (ssax:read-pi-body-as-string port) + (ssax:skip-S port) ; skip WS after the PI target name + (ssax:string-concatenate/shared + (let loop () + (let ((pi-fragment + (ssax:next-token '() '(#\?) "reading PI content" port))) + (read-char port) + (if (eqv? #\> (peek-char port)) + (begin + (read-char port) + (cons pi-fragment '())) + (cons* pi-fragment "?" (loop))))))) + +;;@body +;; +;;The current pos in the port is inside an internal DTD subset (e.g., +;;after reading @samp{#\[} that begins an internal DTD subset) Skip +;;until the @samp{]>} combination that terminates this DTD. +(define (ssax:skip-internal-dtd port) + (slib:warn port "Internal DTD subset is not currently handled ") + (if (not (find-string-from-port? "]>" port)) + (slib:error port + "Failed to find ]> terminating the internal DTD subset"))) + +;;@args port str-handler seed +;; +;;This procedure must be called after we have read a string +;;@samp{} combination is the end of the CDATA section. +;;@samp{>} is treated as an embedded @samp{>} character. +;; +;;@item +;;@samp{<} and @samp{&} are not specially recognized (and are +;;not expanded)! +;; +;;@end itemize +(define ssax:read-cdata-body + (let ((cdata-delimiters (list char-return #\newline #\] #\&))) + (lambda (port str-handler seed) + (let loop ((seed seed)) + (let ((fragment (ssax:next-token '() cdata-delimiters "reading CDATA" port))) + ;; that is, we're reading the char after the 'fragment' + (case (read-char port) + ((#\newline) (loop (str-handler fragment #\newline seed))) + ((#\]) + (if (not (eqv? (peek-char port) #\])) + (loop (str-handler fragment "]" seed)) + (let check-after-second-braket + ((seed (if (string-null? fragment) seed + (str-handler fragment "" seed)))) + (read-char port) + (case (peek-char port) ; after the second bracket + ((#\>) (read-char port) seed) ; we have read "]]>" + ((#\]) (check-after-second-braket + (str-handler "]" "" seed))) + (else (loop (str-handler "]]" "" seed))))))) + ((#\&) ; Note that #\& within CDATA may stand for itself + (let ((ent-ref ; it does not have to start an entity ref + (ssax:next-token-of + (lambda (c) + (and (not (eof-object? c)) (char-alphabetic? c) c)) + port))) + (cond ; replace ">" with #\> + ((and (string=? "gt" ent-ref) (eqv? (peek-char port) #\;)) + (read-char port) + (loop (str-handler fragment ">" seed))) + (else + (loop + (str-handler ent-ref "" + (str-handler fragment "&" seed))))))) + (else ; Must be CR: if the next char is #\newline, skip it + (if (eqv? (peek-char port) #\newline) (read-char port)) + (loop (str-handler fragment #\newline seed))) + )))))) + +;;@body +;; +;;@example +;;[66] CharRef ::= '&#' [0-9]+ ';' +;; | '&#x' [0-9a-fA-F]+ ';' +;;@end example +;; +;;This procedure must be called after we we have read @samp{&#} that +;;introduces a char reference. The procedure reads this reference and +;;returns the corresponding char. The current position in PORT will +;;be after the @samp{;} that terminates the char reference. +;; +;;Faults detected:@* +;;WFC: XML-Spec.html#wf-Legalchar +;; +;;According to Section @cite{4.1 Character and Entity References} +;;of the XML Recommendation: +;; +;;@quotation +;;"[Definition: A character reference refers to a specific character +;;in the ISO/IEC 10646 character set, for example one not directly +;;accessible from available input devices.]" +;;@end quotation +;; +;;@c Therefore, we use a @code{ucscode->char} function to convert a +;;@c character code into the character -- *regardless* of the current +;;@c character encoding of the input stream. +(define (ssax:read-char-ref port) + (let* ((base (cond ((eqv? (peek-char port) #\x) (read-char port) 16) + (else 10))) + (name (ssax:next-token '() '(#\;) "XML [66]" port)) + (char-code (string->number name base))) + (read-char port) ; read the terminating #\; char + (if (integer? char-code) (integer->char char-code) + (slib:error port "[wf-Legalchar] broken for '" name "'")))) + +(define ssax:predefined-parsed-entities + `( + (,(string->symbol "amp") . "&") + (,(string->symbol "lt") . "<") + (,(string->symbol "gt") . ">") + (,(string->symbol "apos") . "'") + (,(string->symbol "quot") . "\"") + )) + +;;@body +;; +;;Expands and handles a parsed-entity reference. +;; +;;@2 is a symbol, the name of the parsed entity to expand. +;;@c entities - see ENTITIES +;;@4 is a procedure of arguments @var{port}, @var{entities}, and +;;@var{seed} that returns a seed. +;;@5 is called if the entity in question is a pre-declared entity. +;; +;;@0 returns the result returned by @4 or @5. +;; +;;Faults detected:@* +;;WFC: XML-Spec.html#wf-entdeclared@* +;;WFC: XML-Spec.html#norecursion +(define (ssax:handle-parsed-entity port name entities content-handler str-handler seed) + (cond ; First we check the list of the declared entities + ((assq name entities) => + (lambda (decl-entity) + (let ((ent-body (cdr decl-entity)) ; mark the list to prevent recursion + (new-entities (cons (cons name #f) entities))) + (cond + ((string? ent-body) + (call-with-input-string ent-body + (lambda (port) (content-handler port new-entities seed)))) + ((procedure? ent-body) + (let ((port (ent-body))) + (define val (content-handler port new-entities seed)) + (close-input-port port) + val)) + (else + (slib:error port "[norecursion] broken for " name)))))) + ((assq name ssax:predefined-parsed-entities) + => (lambda (decl-entity) + (str-handler (cdr decl-entity) "" seed))) + (else (slib:error port "[wf-entdeclared] broken for " name)))) + +;;;; The ATTLIST Abstract Data Type + +;;; Currently is implemented as an assoc list sorted in the ascending +;;; order of NAMES. + +(define attlist-fold fold) +(define attlist-null? null?) +(define attlist->alist identity) +(define (make-empty-attlist) '()) + +;;@body +;; +;;Add a @2 pair to the existing @1, preserving its sorted ascending +;;order; and return the new list. Return #f if a pair with the same +;;name already exists in @1 +(define (attlist-add attlist name-value) + (if (null? attlist) (cons name-value attlist) + (case (ssax:name-compare (car name-value) (caar attlist)) + ((=) #f) + ((<) (cons name-value attlist)) + (else (cons (car attlist) (attlist-add (cdr attlist) name-value))) + ))) + +;;@body +;; +;;Given an non-null @1, return a pair of values: the top and the rest. +(define (attlist-remove-top attlist) + (values (car attlist) (cdr attlist))) + +;;@args port entities +;; +;;This procedure reads and parses a production @dfn{Attribute}. +;; +;;@example +;;[41] Attribute ::= Name Eq AttValue +;;[10] AttValue ::= '"' ([^<&"] | Reference)* '"' +;; | "'" ([^<&'] | Reference)* "'" +;;[25] Eq ::= S? '=' S? +;;@end example +;; +;;The procedure returns an ATTLIST, of Name (as UNRES-NAME), Value (as +;;string) pairs. The current character on the @1 is a non-whitespace +;;character that is not an NCName-starting character. +;; +;;Note the following rules to keep in mind when reading an +;;@dfn{AttValue}: +;;@quotation +;;Before the value of an attribute is passed to the application or +;;checked for validity, the XML processor must normalize it as +;;follows: +;; +;;@itemize @bullet +;;@item +;;A character reference is processed by appending the referenced +;;character to the attribute value. +;; +;;@item +;;An entity reference is processed by recursively processing the +;;replacement text of the entity. The named entities @samp{amp}, +;;@samp{lt}, @samp{gt}, @samp{quot}, and @samp{apos} are pre-declared. +;; +;;@item +;;A whitespace character (#x20, #x0D, #x0A, #x09) is processed by +;;appending #x20 to the normalized value, except that only a single +;;#x20 is appended for a "#x0D#x0A" sequence that is part of an +;;external parsed entity or the literal entity value of an internal +;;parsed entity. +;; +;;@item +;;Other characters are processed by appending them to the normalized +;;value. +;; +;;@end itemize +;; +;;@end quotation +;; +;;Faults detected:@* +;;WFC: XML-Spec.html#CleanAttrVals@* +;;WFC: XML-Spec.html#uniqattspec +(define ssax:read-attributes ; ssax:read-attributes port entities + (let ((value-delimeters (append '(#\< #\&) ssax:S-chars))) + ;; Read the AttValue from the PORT up to the delimiter (which can + ;; be a single or double-quote character, or even a symbol *eof*). + ;; 'prev-fragments' is the list of string fragments, accumulated + ;; so far, in reverse order. Return the list of fragments with + ;; newly read fragments prepended. + (define (read-attrib-value delimiter port entities prev-fragments) + (let* ((new-fragments + (cons (ssax:next-token '() (cons delimiter value-delimeters) + "XML [10]" port) + prev-fragments)) + (cterm (read-char port))) + (cond + ((or (eof-object? cterm) (eqv? cterm delimiter)) + new-fragments) + ((eqv? cterm char-return) ; treat a CR and CRLF as a LF + (if (eqv? (peek-char port) #\newline) (read-char port)) + (read-attrib-value delimiter port entities + (cons " " new-fragments))) + ((memv cterm ssax:S-chars) + (read-attrib-value delimiter port entities + (cons " " new-fragments))) + ((eqv? cterm #\&) + (cond + ((eqv? (peek-char port) #\#) + (read-char port) + (read-attrib-value delimiter port entities + (cons (string (ssax:read-char-ref port)) new-fragments))) + (else + (read-attrib-value delimiter port entities + (read-named-entity port entities new-fragments))))) + (else (slib:error port "[CleanAttrVals] broken"))))) + + ;; we have read "&" that introduces a named entity reference. + ;; read this reference and return the result of normalizing of the + ;; corresponding string (that is, read-attrib-value is applied to + ;; the replacement text of the entity). The current position will + ;; be after ";" that terminates the entity reference + (define (read-named-entity port entities fragments) + (let ((name (ssax:read-NCName port))) + (ssax:assert-current-char '(#\;) "XML [68]" port) + (ssax:handle-parsed-entity port name entities + (lambda (port entities fragments) + (read-attrib-value '*eof* port entities fragments)) + (lambda (str1 str2 fragments) + (if (equal? "" str2) (cons str1 fragments) + (cons* str2 str1 fragments))) + fragments))) + + (lambda (port entities) + (let loop ((attr-list (make-empty-attlist))) + (if (not (ssax:ncname-starting-char? (ssax:skip-S port))) attr-list + (let ((name (ssax:read-QName port))) + (ssax:skip-S port) + (ssax:assert-current-char '(#\=) "XML [25]" port) + (ssax:skip-S port) + (let ((delimiter + (ssax:assert-current-char '(#\' #\" ) "XML [10]" port))) + (loop + (or (attlist-add attr-list + (cons name + (ssax:string-concatenate-reverse/shared + (read-attrib-value delimiter port entities + '())))) + (slib:error port "[uniqattspec] broken for " name)))))))) + )) + +;;@body +;; +;;Convert an @2 to a RES-NAME, given the appropriate @3 declarations. +;;The last parameter, @4, determines if the default namespace applies +;;(for instance, it does not for attribute names). +;; +;;Per REC-xml-names/#nsc-NSDeclared, the "xml" prefix is considered +;;pre-declared and bound to the namespace name +;;"http://www.w3.org/XML/1998/namespace". +;; +;;@0 tests for the namespace constraints:@* +;;@url{http://www.w3.org/TR/REC-xml-names/#nsc-NSDeclared} +(define (ssax:resolve-name port unres-name namespaces apply-default-ns?) + (cond + ((pair? unres-name) ; it's a QNAME + (cons + (cond + ((assq (car unres-name) namespaces) => cadr) + ((eq? (car unres-name) ssax:Prefix-XML) ssax:Prefix-XML) + (else + (slib:error port "[nsc-NSDeclared] broken; prefix " (car unres-name)))) + (cdr unres-name))) + (apply-default-ns? ; Do apply the default namespace, if any + (let ((default-ns (assq '*DEFAULT* namespaces))) + (if (and default-ns (cadr default-ns)) + (cons (cadr default-ns) unres-name) + unres-name))) ; no default namespace declared + (else unres-name))) ; no prefix, don't apply the default-ns + + +;;Procedure: ssax:uri-string->symbol URI-STR +;;Convert a URI-STR to an appropriate symbol +(define ssax:uri-string->symbol string->symbol) + + +;;@args tag port elems entities namespaces +;; +;;Complete parsing of a start-tag markup. @0 must be called after the +;;start tag token has been read. @1 is an UNRES-NAME. @3 is an +;;instance of the ELEMS slot of XML-DECL; it can be #f to tell the +;;function to do @emph{no} validation of elements and their +;;attributes. +;; +;;@0 returns several values: +;;@itemize @bullet +;;@item ELEM-GI: +;;a RES-NAME. +;;@item ATTRIBUTES: +;;element's attributes, an ATTLIST of (RES-NAME . STRING) pairs. +;;The list does NOT include xmlns attributes. +;;@item NAMESPACES: +;;the input list of namespaces amended with namespace +;;(re-)declarations contained within the start-tag under parsing +;;@item ELEM-CONTENT-MODEL +;;@end itemize +;; +;;On exit, the current position in @2 will be the first character +;;after @samp{>} that terminates the start-tag markup. +;; +;;Faults detected:@* +;;VC: XML-Spec.html#enum@* +;;VC: XML-Spec.html#RequiredAttr@* +;;VC: XML-Spec.html#FixedAttr@* +;;VC: XML-Spec.html#ValueType@* +;;WFC: XML-Spec.html#uniqattspec (after namespaces prefixes are resolved)@* +;;VC: XML-Spec.html#elementvalid@* +;;WFC: REC-xml-names/#dt-NSName +;; +;;@emph{Note}: although XML Recommendation does not explicitly say it, +;;xmlns and xmlns: attributes don't have to be declared (although they +;;can be declared, to specify their default value). +(define ssax:complete-start-tag + + (let ((xmlns (string->symbol "xmlns")) + (largest-dummy-decl-attr (list ssax:largest-unres-name #f #f #f))) + + ;; Scan through the attlist and validate it, against decl-attrs + ;; Return an assoc list with added fixed or implied attrs. + ;; Note that both attlist and decl-attrs are ATTLISTs, and therefore, + ;; sorted + (define (validate-attrs port attlist decl-attrs) + + ;; Check to see decl-attr is not of use type REQUIRED. Add + ;; the association with the default value, if any declared + (define (add-default-decl decl-attr result) + (let*-values + (((attr-name content-type use-type default-value) + (apply values decl-attr))) + (and (eq? use-type 'REQUIRED) + (slib:error port "[RequiredAttr] broken for" attr-name)) + (if default-value + (cons (cons attr-name default-value) result) + result))) + + (let loop ((attlist attlist) (decl-attrs decl-attrs) (result '())) + (if (attlist-null? attlist) + (attlist-fold add-default-decl result decl-attrs) + (let*-values + (((attr attr-others) + (attlist-remove-top attlist)) + ((decl-attr other-decls) + (if (attlist-null? decl-attrs) + (values largest-dummy-decl-attr decl-attrs) + (attlist-remove-top decl-attrs))) + ) + (case (ssax:name-compare (car attr) (car decl-attr)) + ((<) + (if (or (eq? xmlns (car attr)) + (and (pair? (car attr)) (eq? xmlns (caar attr)))) + (loop attr-others decl-attrs (cons attr result)) + (slib:error port "[ValueType] broken for " attr))) + ((>) + (loop attlist other-decls + (add-default-decl decl-attr result))) + (else ; matched occurrence of an attr with its declaration + (let*-values + (((attr-name content-type use-type default-value) + (apply values decl-attr))) + ;; Run some tests on the content of the attribute + (cond + ((eq? use-type 'FIXED) + (or (equal? (cdr attr) default-value) + (slib:error port "[FixedAttr] broken for " attr-name))) + ((eq? content-type 'CDATA) #t) ; everything goes + ((pair? content-type) + (or (member (cdr attr) content-type) + (slib:error port "[enum] broken for " attr-name "=" + (cdr attr)))) + (else + (slib:warn port "declared content type " content-type + " not verified yet"))) + (loop attr-others other-decls (cons attr result))))) + )))) + + + ;; Add a new namespace declaration to namespaces. + ;; First we convert the uri-str to a uri-symbol and search namespaces for + ;; an association (_ user-prefix . uri-symbol). + ;; If found, we return the argument namespaces with an association + ;; (prefix user-prefix . uri-symbol) prepended. + ;; Otherwise, we prepend (prefix uri-symbol . uri-symbol) + (define (add-ns port prefix uri-str namespaces) + (and (equal? "" uri-str) + (slib:error port "[dt-NSName] broken for " prefix)) + (let ((uri-symbol (ssax:uri-string->symbol uri-str))) + (let loop ((nss namespaces)) + (cond + ((null? nss) + (cons (cons* prefix uri-symbol uri-symbol) namespaces)) + ((eq? uri-symbol (cddar nss)) + (cons (cons* prefix (cadar nss) uri-symbol) namespaces)) + (else (loop (cdr nss))))))) + + ;; partition attrs into proper attrs and new namespace declarations + ;; return two values: proper attrs and the updated namespace declarations + (define (adjust-namespace-decl port attrs namespaces) + (let loop ((attrs attrs) (proper-attrs '()) (namespaces namespaces)) + (cond + ((null? attrs) (values proper-attrs namespaces)) + ((eq? xmlns (caar attrs)) ; re-decl of the default namespace + (loop (cdr attrs) proper-attrs + (if (equal? "" (cdar attrs)) ; un-decl of the default ns + (cons (cons* '*DEFAULT* #f #f) namespaces) + (add-ns port '*DEFAULT* (cdar attrs) namespaces)))) + ((and (pair? (caar attrs)) (eq? xmlns (caaar attrs))) + (loop (cdr attrs) proper-attrs + (add-ns port (cdaar attrs) (cdar attrs) namespaces))) + (else + (loop (cdr attrs) (cons (car attrs) proper-attrs) namespaces))))) + + ;; The body of the function + (lambda (tag-head port elems entities namespaces) + (let*-values + (((attlist) (ssax:read-attributes port entities)) + ((empty-el-tag?) + (begin + (ssax:skip-S port) + (and + (eqv? #\/ + (ssax:assert-current-char '(#\> #\/) "XML [40], XML [44], no '>'" port)) + (ssax:assert-current-char '(#\>) "XML [44], no '>'" port)))) + ((elem-content decl-attrs) ; see xml-decl for their type + (if elems ; elements declared: validate! + (cond + ((assoc tag-head elems) => + (lambda (decl-elem) ; of type xml-decl::decl-elem + (values + (if empty-el-tag? 'EMPTY-TAG (cadr decl-elem)) + (caddr decl-elem)))) + (else + (slib:error port "[elementvalid] broken, no decl for " tag-head))) + (values ; non-validating parsing + (if empty-el-tag? 'EMPTY-TAG 'ANY) + #f) ; no attributes declared + )) + ((merged-attrs) (if decl-attrs (validate-attrs port attlist decl-attrs) + (attlist->alist attlist))) + ((proper-attrs namespaces) + (adjust-namespace-decl port merged-attrs namespaces)) + ) + ;; build the return value + (values + (ssax:resolve-name port tag-head namespaces #t) + (fold-right + (lambda (name-value attlist) + (or + (attlist-add attlist + (cons (ssax:resolve-name port (car name-value) namespaces #f) + (cdr name-value))) + (slib:error port "[uniqattspec] after NS expansion broken for " + name-value))) + (make-empty-attlist) + proper-attrs) + namespaces + elem-content))))) + +;;@body +;; +;;Parses an ExternalID production: +;; +;;@example +;;[75] ExternalID ::= 'SYSTEM' S SystemLiteral +;; | 'PUBLIC' S PubidLiteral S SystemLiteral +;;[11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") +;;[12] PubidLiteral ::= '"' PubidChar* '"' +;; | "'" (PubidChar - "'")* "'" +;;[13] PubidChar ::= #x20 | #x0D | #x0A | [a-zA-Z0-9] +;; | [-'()+,./:=?;!*#@@$_%] +;;@end example +;; +;;Call @0 when an ExternalID is expected; that is, the current +;;character must be either #\S or #\P that starts correspondingly a +;;SYSTEM or PUBLIC token. @0 returns the @var{SystemLiteral} as a +;;string. A @var{PubidLiteral} is disregarded if present. +(define (ssax:read-external-id port) + (let ((discriminator (ssax:read-NCName port))) + (ssax:assert-current-char ssax:S-chars "space after SYSTEM or PUBLIC" port) + (ssax:skip-S port) + (let ((delimiter + (ssax:assert-current-char '(#\' #\" ) "XML [11], XML [12]" port))) + (cond + ((eq? discriminator (string->symbol "SYSTEM")) + (let ((val (ssax:next-token '() (list delimiter) "XML [11]" port))) + (read-char port) ; reading the closing delim + val)) + ((eq? discriminator (string->symbol "PUBLIC")) + (let loop ((c (read-char port))) + (cond + ((eqv? c delimiter) c) + ((eof-object? c) + (slib:error port "Unexpected EOF while skipping until " delimiter)) + (else (loop (read-char port))))) + (ssax:assert-current-char ssax:S-chars "space after PubidLiteral" port) + (ssax:skip-S port) + (let* ((delimiter + (ssax:assert-current-char '(#\' #\" ) "XML [11]" port)) + (systemid + (ssax:next-token '() (list delimiter) "XML [11]" port))) + (read-char port) ; reading the closing delim + systemid)) + (else + (slib:error port "XML [75], " discriminator + " rather than SYSTEM or PUBLIC")))))) + + +;;@subsection Mid-Level Parsers and Scanners +;; +;;@noindent +;;These procedures parse productions corresponding to the whole +;;(document) entity or its higher-level pieces (prolog, root element, +;;etc). + +;;@body +;; +;;Scan the Misc production in the context: +;; +;;@example +;;[1] document ::= prolog element Misc* +;;[22] prolog ::= XMLDecl? Misc* (doctypedec l Misc*)? +;;[27] Misc ::= Comment | PI | S +;;@end example +;; +;;Call @0 in the prolog or epilog contexts. In these contexts, +;;whitespaces are completely ignored. The return value from @0 is +;;either a PI-token, a DECL-token, a START token, or *EOF*. Comments +;;are ignored and not reported. +(define (ssax:scan-misc port) + (let loop ((c (ssax:skip-S port))) + (cond + ((eof-object? c) c) + ((not (char=? c #\<)) + (slib:error port "XML [22], char '" c "' unexpected")) + (else + (let ((token (ssax:read-markup-token port))) + (case (xml-token-kind token) + ((COMMENT) (loop (ssax:skip-S port))) + ((PI DECL START) token) + (else + (slib:error port "XML [22], unexpected token of kind " + (xml-token-kind token) + )))))))) + +;;@args port expect-eof? str-handler iseed +;; +;;Read the character content of an XML document or an XML element. +;; +;;@example +;;[43] content ::= +;;(element | CharData | Reference | CDSect | PI | Comment)* +;;@end example +;; +;;To be more precise, @0 reads CharData, expands CDSect and character +;;entities, and skips comments. @0 stops at a named reference, EOF, +;;at the beginning of a PI, or a start/end tag. +;; +;;@2 is a boolean indicating if EOF is normal; i.e., the character +;;data may be terminated by the EOF. EOF is normal while processing a +;;parsed entity. +;; +;;@4 is an argument passed to the first invocation of @3. +;; +;;@0 returns two results: @var{seed} and @var{token}. The @var{seed} +;;is the result of the last invocation of @3, or the original @4 if @3 +;;was never called. +;; +;;@var{token} can be either an eof-object (this can happen only if @2 +;;was #t), or: +;;@itemize @bullet +;; +;;@item +;;an xml-token describing a START tag or an END-tag; +;;For a start token, the caller has to finish reading it. +;; +;;@item +;;an xml-token describing the beginning of a PI. It's up to an +;;application to read or skip through the rest of this PI; +;; +;;@item +;;an xml-token describing a named entity reference. +;; +;;@end itemize +;; +;;CDATA sections and character references are expanded inline and +;;never returned. Comments are silently disregarded. +;; +;;As the XML Recommendation requires, all whitespace in character data +;;must be preserved. However, a CR character (#x0D) must be +;;disregarded if it appears before a LF character (#x0A), or replaced +;;by a #x0A character otherwise. See Secs. 2.10 and 2.11 of the XML +;;Recommendation. See also the canonical XML Recommendation. +(define ssax:read-char-data + (let ((terminators-usual (list #\< #\& char-return)) + (terminators-usual-eof (list #\< '*eof* #\& char-return)) + (handle-fragment + (lambda (fragment str-handler seed) + (if (string-null? fragment) seed + (str-handler fragment "" seed))))) + (lambda (port expect-eof? str-handler seed) + ;; Very often, the first character we encounter is #\< + ;; Therefore, we handle this case in a special, fast path + (if (eqv? #\< (peek-char port)) + ;; The fast path + (let ((token (ssax:read-markup-token port))) + (case (xml-token-kind token) + ((START END) ; The most common case + (values seed token)) + ((CDSECT) + (let ((seed (ssax:read-cdata-body port str-handler seed))) + (ssax:read-char-data port expect-eof? str-handler seed))) + ((COMMENT) + (ssax:read-char-data port expect-eof? str-handler seed)) + (else + (values seed token)))) + ;; The slow path + (let ((char-data-terminators + (if expect-eof? terminators-usual-eof terminators-usual))) + (let loop ((seed seed)) + (let* ((fragment + (ssax:next-token '() char-data-terminators + "reading char data" port)) + (term-char (peek-char port)) ; one of char-data-terminators + ) + (if (eof-object? term-char) + (values + (handle-fragment fragment str-handler seed) + term-char) + (case term-char + ((#\<) + (let ((token (ssax:read-markup-token port))) + (case (xml-token-kind token) + ((CDSECT) + (loop + (ssax:read-cdata-body port str-handler + (handle-fragment fragment str-handler seed)))) + ((COMMENT) + (loop (handle-fragment fragment str-handler seed))) + (else + (values + (handle-fragment fragment str-handler seed) + token))))) + ((#\&) + (read-char port) + (case (peek-char port) + ((#\#) (read-char port) + (loop (str-handler fragment + (string (ssax:read-char-ref port)) + seed))) + (else + (let ((name (ssax:read-NCName port))) + (ssax:assert-current-char '(#\;) "XML [68]" port) + (values + (handle-fragment fragment str-handler seed) + (make-xml-token 'ENTITY-REF name)))))) + (else ; This must be a CR character + (read-char port) + (if (eqv? (peek-char port) #\newline) + (read-char port)) + (loop (str-handler fragment (string #\newline) seed)))) + )))))))) + +;;@body +;; +;;Make sure that @1 is of anticipated @2 and has anticipated @3. Note +;;that the @3 argument may actually be a pair of two symbols, +;;Namespace-URI or the prefix, and of the localname. If the assertion +;;fails, @4 is evaluated by passing it three arguments: @1 @2 @3. The +;;result of @4 is returned. +(define (ssax:assert-token token kind gi error-cont) + (or (and (xml-token? token) + (eq? kind (xml-token-kind token)) + (equal? gi (xml-token-head token))) + (error-cont token kind gi))) + +;;@subsection High-level Parsers +;; +;;These procedures are to instantiate a SSAX parser. A user can +;;instantiate the parser to do the full validation, or no validation, +;;or any particular validation. The user specifies which PI he wants +;;to be notified about. The user tells what to do with the parsed +;;character and element data. The latter handlers determine if the +;;parsing follows a SAX or a DOM model. + +;;@args my-pi-handlers +;; +;;Create a parser to parse and process one Processing Element (PI). +;; +;;@1 is an association list of pairs +;;@code{(@var{pi-tag} . @var{pi-handler})} where @var{pi-tag} is an +;;NCName symbol, the PI target; and @var{pi-handler} is a procedure +;;taking arguments @var{port}, @var{pi-tag}, and @var{seed}. +;; +;;@var{pi-handler} should read the rest of the PI up to and including +;;the combination @samp{?>} that terminates the PI. The handler +;;should return a new seed. One of the @var{pi-tag}s may be the +;;symbol @code{*DEFAULT*}. The corresponding handler will handle PIs +;;that no other handler will. If the *DEFAULT* @var{pi-tag} is not +;;specified, @0 will assume the default handler that skips the body of +;;the PI. +;; +;;@0 returns a procedure of arguments @var{port}, @var{pi-tag}, and +;;@var{seed}; that will parse the current PI according to @1. +(define (ssax:make-pi-parser handlers) + (lambda (port target seed) + (define pair (assv target handlers)) + (or pair (set! pair (assv '*DEFAULT* handlers))) + (cond ((not pair) + (slib:warn port "Skipping PI: " target #\newline) + (ssax:skip-pi port) + seed) + (else ((cdr pair) port target seed))))) + +;;syntax: ssax:make-elem-parser +;; my-new-level-seed my-finish-element my-char-data-handler my-pi-handlers + +;;@body +;; +;;Create a parser to parse and process one element, including its +;;character content or children elements. The parser is typically +;;applied to the root element of a document. +;; +;;@table @asis +;; +;;@item @1 +;;is a procedure taking arguments: +;; +;;@var{elem-gi} @var{attributes} @var{namespaces} @var{expected-content} @var{seed} +;; +;;where @var{elem-gi} is a RES-NAME of the element about to be +;;processed. +;; +;;@1 is to generate the seed to be passed to handlers that process the +;;content of the element. +;; +;;@item @2 +;;is a procedure taking arguments: +;; +;;@var{elem-gi} @var{attributes} @var{namespaces} @var{parent-seed} @var{seed} +;; +;;@2 is called when parsing of @var{elem-gi} is finished. +;;The @var{seed} is the result from the last content parser (or +;;from @1 if the element has the empty content). +;;@var{parent-seed} is the same seed as was passed to @1. +;;@2 is to generate a seed that will be the result +;;of the element parser. +;; +;;@item @3 +;;is a STR-HANDLER as described in Data Types above. +;; +;;@item @4 +;;is as described for @code{ssax:make-pi-handler} above. +;; +;;@end table +;; +;;The generated parser is a procedure taking arguments: +;; +;;@var{start-tag-head} @var{port} @var{elems} @var{entities} @var{namespaces} @var{preserve-ws?} @var{seed} +;; +;;The procedure must be called after the start tag token has been +;;read. @var{start-tag-head} is an UNRES-NAME from the start-element +;;tag. ELEMS is an instance of ELEMS slot of XML-DECL. +;; +;;Faults detected:@* +;;VC: XML-Spec.html#elementvalid@* +;;WFC: XML-Spec.html#GIMatch +(define (ssax:make-elem-parser my-new-level-seed my-finish-element + my-char-data-handler my-pi-handlers) + (lambda (start-tag-head port elems entities namespaces preserve-ws? seed) + (define xml-space-gi (cons ssax:Prefix-XML + (string->symbol "space"))) + (let handle-start-tag ((start-tag-head start-tag-head) + (port port) (entities entities) + (namespaces namespaces) + (preserve-ws? preserve-ws?) (parent-seed seed)) + (let*-values + (((elem-gi attributes namespaces expected-content) + (ssax:complete-start-tag start-tag-head port elems + entities namespaces)) + ((seed) + (my-new-level-seed elem-gi attributes + namespaces expected-content parent-seed))) + (case expected-content + ((EMPTY-TAG) + (my-finish-element + elem-gi attributes namespaces parent-seed seed)) + ((EMPTY) ; The end tag must immediately follow + (ssax:assert-token (and (eqv? #\< (ssax:skip-S port)) + (ssax:read-markup-token port)) + 'END + start-tag-head + (lambda (token exp-kind exp-head) + (slib:error port "[elementvalid] broken for " token + " while expecting " + exp-kind exp-head))) + (my-finish-element + elem-gi attributes namespaces parent-seed seed)) + (else ; reading the content... + (let ((preserve-ws? ; inherit or set the preserve-ws? flag + (cond ((assoc xml-space-gi attributes) => + (lambda (name-value) + (equal? "preserve" (cdr name-value)))) + (else preserve-ws?)))) + (let loop ((port port) (entities entities) + (expect-eof? #f) (seed seed)) + (let*-values + (((seed term-token) + (ssax:read-char-data port expect-eof? + my-char-data-handler seed))) + (if (eof-object? term-token) + seed + (case (xml-token-kind term-token) + ((END) + (ssax:assert-token term-token 'END start-tag-head + (lambda (token exp-kind exp-head) + (slib:error port "[GIMatch] broken for " + term-token " while expecting " + exp-kind exp-head))) + (my-finish-element + elem-gi attributes namespaces parent-seed seed)) + ((PI) + (let ((seed + ((ssax:make-pi-parser my-pi-handlers) + port (xml-token-head term-token) seed))) + (loop port entities expect-eof? seed))) + ((ENTITY-REF) + (let ((seed + (ssax:handle-parsed-entity + port (xml-token-head term-token) + entities + (lambda (port entities seed) + (loop port entities #t seed)) + my-char-data-handler + seed))) ; keep on reading the content after ent + (loop port entities expect-eof? seed))) + ((START) ; Start of a child element + (if (eq? expected-content 'PCDATA) + (slib:error port "[elementvalid] broken for " + elem-gi + " with char content only; unexpected token " + term-token)) + ;; Do other validation of the element content + (let ((seed + (handle-start-tag + (xml-token-head term-token) + port entities namespaces + preserve-ws? seed))) + (loop port entities expect-eof? seed))) + (else + (slib:error port "XML [43] broken for " + term-token)))))))) + ))) + )) + + +;;This is ssax:make-parser with all the (specialization) handlers given +;;as positional arguments. It is called by ssax:make-parser, see below +(define (ssax:make-parser/positional-args + *handler-DOCTYPE + *handler-UNDECL-ROOT + *handler-DECL-ROOT + *handler-NEW-LEVEL-SEED + *handler-FINISH-ELEMENT + *handler-CHAR-DATA-HANDLER + *handler-PROCESSING-INSTRUCTIONS) + (lambda (port seed) + ;; We must've just scanned the DOCTYPE token. Handle the + ;; doctype declaration and exit to + ;; scan-for-significant-prolog-token-2, and eventually, to the + ;; element parser. + (define (handle-decl port token-head seed) + (or (eq? (string->symbol "DOCTYPE") token-head) + (slib:error port "XML [22], expected DOCTYPE declaration, found " + token-head)) + (ssax:assert-current-char ssax:S-chars "XML [28], space after DOCTYPE" port) + (ssax:skip-S port) + (let*-values + (((docname) (ssax:read-QName port)) + ((systemid) + (and (ssax:ncname-starting-char? (ssax:skip-S port)) + (ssax:read-external-id port))) + ((internal-subset?) + (begin + (ssax:skip-S port) + (eqv? #\[ + (ssax:assert-current-char '(#\> #\[) + "XML [28], end-of-DOCTYPE" port)))) + ((elems entities namespaces seed) + (*handler-DOCTYPE port docname systemid internal-subset? seed))) + (scan-for-significant-prolog-token-2 port elems entities namespaces + seed))) + ;; Scan the leading PIs until we encounter either a doctype + ;; declaration or a start token (of the root element). In the + ;; latter two cases, we exit to the appropriate continuation + (define (scan-for-significant-prolog-token-1 port seed) + (let ((token (ssax:scan-misc port))) + (if (eof-object? token) + (slib:error port "XML [22], unexpected EOF") + (case (xml-token-kind token) + ((PI) + (let ((seed + ((ssax:make-pi-parser *handler-PROCESSING-INSTRUCTIONS) + port (xml-token-head token) seed))) + (scan-for-significant-prolog-token-1 port seed))) + ((DECL) (handle-decl port (xml-token-head token) seed)) + ((START) + (let*-values + (((elems entities namespaces seed) + (*handler-UNDECL-ROOT (xml-token-head token) seed))) + (element-parser (xml-token-head token) port elems + entities namespaces #f seed))) + (else (slib:error port "XML [22], unexpected markup " + token)))))) + ;; Scan PIs after the doctype declaration, till we encounter + ;; the start tag of the root element. After that we exit + ;; to the element parser + (define (scan-for-significant-prolog-token-2 port elems entities namespaces seed) + (let ((token (ssax:scan-misc port))) + (if (eof-object? token) + (slib:error port "XML [22], unexpected EOF") + (case (xml-token-kind token) + ((PI) + (let ((seed ((ssax:make-pi-parser *handler-PROCESSING-INSTRUCTIONS) + port (xml-token-head token) seed))) + (scan-for-significant-prolog-token-2 port elems entities + namespaces seed))) + ((START) + (element-parser (xml-token-head token) port elems + entities namespaces #f + (*handler-DECL-ROOT (xml-token-head token) seed))) + (else (slib:error port "XML [22], unexpected markup " + token)))))) + ;; A procedure start-tag-head port elems entities namespaces + ;; preserve-ws? seed + (define element-parser + (ssax:make-elem-parser *handler-NEW-LEVEL-SEED + *handler-FINISH-ELEMENT + *handler-CHAR-DATA-HANDLER + *handler-PROCESSING-INSTRUCTIONS)) + + ;; Get the ball rolling ... + (scan-for-significant-prolog-token-1 port seed) + )) + +(define DOCTYPE 'DOCTYPE) +(define UNDECL-ROOT 'UNDECL-ROOT) +(define DECL-ROOT 'DECL-ROOT) +(define NEW-LEVEL-SEED 'NEW-LEVEL-SEED) +(define FINISH-ELEMENT 'FINISH-ELEMENT) +(define CHAR-DATA-HANDLER 'CHAR-DATA-HANDLER) +(define PROCESSING-INSTRUCTIONS 'PROCESSING-INSTRUCTIONS) + +;;@args user-handler-tag user-handler ... +;; +;;Create an XML parser, an instance of the XML parsing framework. +;;This will be a SAX, a DOM, or a specialized parser depending on the +;;supplied user-handlers. +;; +;;@0 takes an even number of arguments; @1 is a symbol that identifies +;;a procedure (or association list for @code{PROCESSING-INSTRUCTIONS}) +;;(@2) that follows the tag. Given below are tags and signatures of +;;the corresponding procedures. Not all tags have to be specified. +;;If some are omitted, reasonable defaults will apply. +;; +;;@table @samp +;; +;;@item DOCTYPE +;;handler-procedure: @var{port} @var{docname} @var{systemid} @var{internal-subset?} @var{seed} +;; +;;If @var{internal-subset?} is #t, the current position in the port is +;;right after we have read @samp{[} that begins the internal DTD +;;subset. We must finish reading of this subset before we return (or +;;must call @code{skip-internal-dtd} if we aren't interested in +;;reading it). @var{port} at exit must be at the first symbol after +;;the whole DOCTYPE declaration. +;; +;;The handler-procedure must generate four values: +;;@quotation +;;@var{elems} @var{entities} @var{namespaces} @var{seed} +;;@end quotation +;; +;;@var{elems} is as defined for the ELEMS slot of XML-DECL. It may be +;;#f to switch off validation. @var{namespaces} will typically +;;contain @var{user-prefix}es for selected @var{uri-symb}s. The +;;default handler-procedure skips the internal subset, if any, and +;;returns @code{(values #f '() '() seed)}. +;; +;;@item UNDECL-ROOT +;;procedure: @var{elem-gi} @var{seed} +;; +;;where @var{elem-gi} is an UNRES-NAME of the root element. This +;;procedure is called when an XML document under parsing contains +;;@emph{no} DOCTYPE declaration. +;; +;;The handler-procedure, as a DOCTYPE handler procedure above, +;;must generate four values: +;;@quotation +;;@var{elems} @var{entities} @var{namespaces} @var{seed} +;;@end quotation +;; +;;The default handler-procedure returns (values #f '() '() seed) +;; +;;@item DECL-ROOT +;;procedure: @var{elem-gi} @var{seed} +;; +;;where @var{elem-gi} is an UNRES-NAME of the root element. This +;;procedure is called when an XML document under parsing does contains +;;the DOCTYPE declaration. The handler-procedure must generate a new +;;@var{seed} (and verify that the name of the root element matches the +;;doctype, if the handler so wishes). The default handler-procedure +;;is the identity function. +;; +;;@item NEW-LEVEL-SEED +;;procedure: see ssax:make-elem-parser, my-new-level-seed +;; +;;@item FINISH-ELEMENT +;;procedure: see ssax:make-elem-parser, my-finish-element +;; +;;@item CHAR-DATA-HANDLER +;;procedure: see ssax:make-elem-parser, my-char-data-handler +;; +;;@item PROCESSING-INSTRUCTIONS +;;association list as is passed to @code{ssax:make-pi-parser}. +;;The default value is '() +;; +;;@end table +;; +;;The generated parser is a procedure of arguments @var{port} and +;;@var{seed}. +;; +;;This procedure parses the document prolog and then exits to an +;;element parser (created by @code{ssax:make-elem-parser}) to handle +;;the rest. +;; +;;@example +;;[1] document ::= prolog element Misc* +;;[22] prolog ::= XMLDecl? Misc* (doctypedec | Misc*)? +;;[27] Misc ::= Comment | PI | S +;;[28] doctypedecl ::= '' +;;[29] markupdecl ::= elementdecl | AttlistDecl +;; | EntityDecl +;; | NotationDecl | PI +;; | Comment +;;@end example +(define ssax:make-parser + (let ((descriptors + `((DOCTYPE + ,(lambda (port docname systemid internal-subset? seed) + (cond (internal-subset? + (ssax:skip-internal-dtd port))) + (slib:warn port "DOCTYPE DECL " docname " " + systemid " found and skipped") + (values #f '() '() seed) + )) + (UNDECL-ROOT + ,(lambda (elem-gi seed) (values #f '() '() seed))) + (DECL-ROOT + ,(lambda (elem-gi seed) seed)) + (NEW-LEVEL-SEED) ; required + (FINISH-ELEMENT) ; required + (CHAR-DATA-HANDLER) ; required + (PROCESSING-INSTRUCTIONS ()) + ))) + (lambda proplist + (define count 0) + (if (odd? (length proplist)) + (slib:error 'ssax:make-parser "takes even number of arguments" + proplist)) + (let ((posititional-args + (map (lambda (spec) + (define ptail (member (car spec) proplist)) + (cond ((and ptail (odd? (length ptail))) + (slib:error 'ssax:make-parser 'bad 'argument ptail)) + (ptail + (set! count (+ 1 count)) + (cadr ptail)) + ((not (null? (cdr spec))) + (cadr spec)) + (else + (slib:error + 'ssax:make-parser 'missing (car spec) 'property)))) + descriptors))) + (if (= count (quotient (length proplist) 2)) + (apply ssax:make-parser/positional-args posititional-args) + (slib:error 'ssax:make-parser 'extra 'arguments proplist)))))) + +;;@subsection Parsing XML to SXML + +;;@body +;; +;;This is an instance of the SSAX parser that returns an SXML +;;representation of the XML document to be read from @1. @2 is a list +;;of @code{(@var{user-prefix} . @var{uri-string})} that assigns +;;@var{user-prefix}es to certain namespaces identified by particular +;;@var{uri-string}s. It may be an empty list. @0 returns an SXML +;;tree. The port points out to the first character after the root +;;element. +(define (ssax:xml->sxml port namespace-prefix-assig) + (define namespaces + (map (lambda (el) (cons* #f (car el) (ssax:uri-string->symbol (cdr el)))) + namespace-prefix-assig)) + (define (RES-NAME->SXML res-name) + (string->symbol + (string-append + (symbol->string (car res-name)) + ":" + (symbol->string (cdr res-name))))) + (let ((result + (reverse + ((ssax:make-parser + + 'DOCTYPE + (lambda (port docname systemid internal-subset? seed) + (cond (internal-subset? + (ssax:skip-internal-dtd port))) + (slib:warn port "DOCTYPE DECL " docname " " + systemid " found and skipped") + (values #f '() namespaces seed)) + + 'NEW-LEVEL-SEED + (lambda (elem-gi attributes namespaces expected-content seed) + '()) + + 'FINISH-ELEMENT + (lambda (elem-gi attributes namespaces parent-seed seed) + (define nseed (ssax:reverse-collect-str-drop-ws seed)) + (define attrs + (attlist-fold + (lambda (attr accum) + (cons (list (if (symbol? (car attr)) + (car attr) + (RES-NAME->SXML (car attr))) + (cdr attr)) + accum)) + '() attributes)) + (cons (cons (if (symbol? elem-gi) + elem-gi + (RES-NAME->SXML elem-gi)) + (if (null? attrs) + nseed + (cons (cons '@ attrs) nseed))) + parent-seed)) + + 'CHAR-DATA-HANDLER + (lambda (string1 string2 seed) + (if (string-null? string2) + (cons string1 seed) + (cons* string2 string1 seed))) + + 'UNDECL-ROOT + (lambda (elem-gi seed) + (values #f '() namespaces seed)) + + 'PROCESSING-INSTRUCTIONS + (list + (cons '*DEFAULT* + (lambda (port pi-tag seed) + (cons (list '*PROCESSING-INSTRUCTIONS* + pi-tag + (ssax:read-pi-body-as-string port)) + seed)))) + ) + port + '())))) + (cons '*TOP* + (if (null? namespace-prefix-assig) + result + (cons + (list '@ (cons '*NAMESPACES* + (map (lambda (ns) (list (car ns) (cdr ns))) + namespace-prefix-assig))) + result))))) -- cgit v1.2.3