;;;;"xml-parse.scm" XML parsing and conversion to SXML (Scheme-XML)
;;; Copyright (C) 2007 Aubrey Jaffer
;;; 2007-04 jaffer: demacrofied from public-domain SSAX 5.1
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1. Any copy made of this software must include this copyright notice
;in full.
;
;2. I have made no warranty or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3. In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.
;;@code{(require 'xml-parse)} or @code{(require 'ssax)}
;;
;;@noindent
;;The XML standard document referred to in this module is@*
;;@url{http://www.w3.org/TR/1998/REC-xml-19980210.html}.
;;
;;@noindent
;;The present frameworks fully supports the XML Namespaces
;;Recommendation@*
;;@url{http://www.w3.org/TR/REC-xml-names}.
(require 'rev2-procedures) ; for substring-move-left!
(require 'string-search)
(require 'let-values)
(require 'values)
(require 'srfi-1) ; for fold-right, fold, cons*
;;@subsection String Glue
;;;; Three functions from SRFI-13
; procedure string-concatenate-reverse STRINGS [FINAL END]
(define (ssax:string-concatenate-reverse strs final end)
(if (null? strs) (substring final 0 end)
(let*
((total-len
(let loop ((len end) (lst strs))
(if (null? lst) len
(loop (+ len (string-length (car lst))) (cdr lst)))))
(result (make-string total-len)))
(let loop ((len end) (j total-len) (str final) (lst strs))
(substring-move-left! str 0 len result (- j len))
(if (null? lst) result
(loop (string-length (car lst)) (- j len)
(car lst) (cdr lst)))))))
; string-concatenate/shared STRING-LIST -> STRING
(define (ssax:string-concatenate/shared strs)
(cond ((null? strs) "") ; Test for the fast path first
((null? (cdr strs)) (car strs))
(else
(let*
((total-len
(let loop ((len (string-length (car strs))) (lst (cdr strs)))
(if (null? lst) len
(loop (+ len (string-length (car lst))) (cdr lst)))))
(result (make-string total-len)))
(let loop ((j 0) (str (car strs)) (lst (cdr strs)))
(substring-move-left! str 0 (string-length str) result j)
(if (null? lst) result
(loop (+ j (string-length str))
(car lst) (cdr lst))))))))
; string-concatenate-reverse/shared STRING-LIST [FINAL-STRING END] -> STRING
; We do not use the optional arguments of this procedure. Therefore,
; we do not implement them. See SRFI-13 for the complete
; implementation.
(define (ssax:string-concatenate-reverse/shared strs)
(cond ((null? strs) "") ; Test for the fast path first
((null? (cdr strs)) (car strs))
(else
(ssax:string-concatenate-reverse (cdr strs)
(car strs)
(string-length (car strs))))))
;;@args list-of-frags
;;
;;Given the list of fragments (some of which are text strings),
;;reverse the list and concatenate adjacent text strings. If
;;LIST-OF-FRAGS has zero or one element, the result of the procedure
;;is @code{equal?} to its argument.
(define (ssax:reverse-collect-str fragments)
(cond
((null? fragments) '()) ; a shortcut
((null? (cdr fragments)) fragments) ; see the comment above
(else
(let loop ((fragments fragments) (result '()) (strs '()))
(cond
((null? fragments)
(if (null? strs)
result
(cons (ssax:string-concatenate/shared strs) result)))
((string? (car fragments))
(loop (cdr fragments) result (cons (car fragments) strs)))
(else
(loop (cdr fragments)
(cons (car fragments)
(if (null? strs)
result
(cons (ssax:string-concatenate/shared strs) result)))
'())))))))
;;@args list-of-frags
;;
;;Given the list of fragments (some of which are text strings),
;;reverse the list and concatenate adjacent text strings while
;;dropping "unsignificant" whitespace, that is, whitespace in front,
;;behind and between elements. The whitespace that is included in
;;character data is not affected.
;;
;;Use this procedure to "intelligently" drop "insignificant"
;;whitespace in the parsed SXML. If the strict compliance with the
;;XML Recommendation regarding the whitespace is desired, use the
;;@code{ssax:reverse-collect-str} procedure instead.
(define (ssax:reverse-collect-str-drop-ws fragments)
;; Test if a string is made of only whitespace.
;; An empty string is considered made of whitespace as well
(define (string-whitespace? str)
(let ((len (string-length str)))
(cond ((zero? len) #t)
((= 1 len) (char-whitespace? (string-ref str 0)))
(else
(let loop ((i 0))
(or (>= i len)
(and (char-whitespace? (string-ref str i))
(loop (+ 1 i)))))))))
(cond
((null? fragments) '()) ; a shortcut
((null? (cdr fragments)) ; another shortcut
(if (and (string? (car fragments)) (string-whitespace? (car fragments)))
'() ; remove trailing ws
fragments))
(else
(let loop ((fragments fragments) (result '()) (strs '())
(all-whitespace? #t))
(cond
((null? fragments)
(if all-whitespace?
result ; remove leading ws
(cons (ssax:string-concatenate/shared strs) result)))
((string? (car fragments))
(loop (cdr fragments)
result
(cons (car fragments) strs)
(and all-whitespace? (string-whitespace? (car fragments)))))
(else
(loop (cdr fragments)
(cons (car fragments)
(if all-whitespace?
result
(cons (ssax:string-concatenate/shared strs) result)))
'()
#t)))))))
;;@subsection Character and Token Functions
;;
;;The following functions either skip, or build and return tokens,
;;according to inclusion or delimiting semantics. The list of
;;characters to expect, include, or to break at may vary from one
;;invocation of a function to another. This allows the functions to
;;easily parse even context-sensitive languages.
;;
;;Exceptions are mentioned specifically. The list of expected
;;characters (characters to skip until, or break-characters) may
;;include an EOF "character", which is coded as symbol *eof*
;;
;;The input stream to parse is specified as a PORT, which is the last
;;argument.
;;@args char-list string port
;;
;;Reads a character from the @3 and looks it up in the
;;@1 of expected characters. If the read character was
;;found among expected, it is returned. Otherwise, the
;;procedure writes a message using @2 as a comment
;;and quits.
(define (ssax:assert-current-char expected-chars comment port)
(let ((c (read-char port)))
(if (memv c expected-chars) c
(slib:error port "Wrong character " c
" (0x" (if (eof-object? c)
"*eof*"
(number->string (char->integer c) 16)) ") "
comment ". " expected-chars " expected"))))
;;@args char-list port
;;
;;Reads characters from the @2 and disregards them, as long as they
;;are mentioned in the @1. The first character (which may be EOF)
;;peeked from the stream that is @emph{not} a member of the @1 is
;;returned.
(define (ssax:skip-while skip-chars port)
(do ((c (peek-char port) (peek-char port)))
((not (memv c skip-chars)) c)
(read-char port)))
;;; Stream tokenizers
;;
;;Note: since we can't tell offhand how large the token being read is
;;going to be, we make a guess, pre-allocate a string, and grow it by
;;quanta if necessary. The quantum is always the length of the string
;;before it was extended the last time. Thus the algorithm does a
;;Fibonacci-type extension, which has been proven optimal.
;;
;;Size 32 turns out to be fairly good, on average. That policy is
;;good only when a Scheme system is multi-threaded with preemptive
;;scheduling, or when a Scheme system supports shared substrings. In
;;all the other cases, it's better for ssax:init-buffer to return the
;;same static buffer. ssax:next-token* functions return a copy (a
;;substring) of accumulated data, so the same buffer can be reused.
;;We shouldn't worry about an incoming token being too large:
;;ssax:next-token will use another chunk automatically. Still, the
;;best size for the static buffer is to allow most of the tokens to
;;fit in. Using a static buffer _dramatically_ reduces the amount of
;;produced garbage (e.g., during XML parsing).
;;@body
;;
;;Returns an initial buffer for @code{ssax:next-token*} procedures.
;;@0 may allocate a new buffer at each invocation.
(define (ssax:init-buffer) (make-string 32))
;;;(define ssax:init-buffer
;;; (let ((buffer (make-string 512)))
;;; (lambda () buffer)))
;;@args prefix-char-list break-char-list comment-string port
;;
;;Skips any number of the prefix characters (members of the @1), if
;;any, and reads the sequence of characters up to (but not including)
;;a break character, one of the @2.
;;
;;The string of characters thus read is returned. The break character
;;is left on the input stream. @2 may include the symbol @code{*eof*};
;;otherwise, EOF is fatal, generating an error message including a
;;specified @3.
(define (ssax:next-token prefix-skipped-chars break-chars comment port)
(let outer ((buffer (ssax:init-buffer)) (filled-buffer-l '())
(c (ssax:skip-while prefix-skipped-chars port)))
(let ((curr-buf-len (string-length buffer)))
(let loop ((i 0) (c c))
(cond
((memv c break-chars)
(if (null? filled-buffer-l) (substring buffer 0 i)
(ssax:string-concatenate-reverse filled-buffer-l buffer i)))
((eof-object? c)
(if (memq '*eof* break-chars) ; was EOF expected?
(if (null? filled-buffer-l) (substring buffer 0 i)
(ssax:string-concatenate-reverse filled-buffer-l buffer i))
(slib:error port "EOF while reading a token " comment)))
((>= i curr-buf-len)
(outer (make-string curr-buf-len)
(cons buffer filled-buffer-l) c))
(else
(string-set! buffer i c)
(read-char port) ; move to the next char
(loop (+ 1 i) (peek-char port))))))))
;;@noindent
;;@code{ssax:next-token-of} is similar to @code{ssax:next-token}
;;except that it implements an inclusion rather than delimiting
;;semantics.
;;@args inc-charset port
;;
;;Reads characters from the @2 that belong to the list of characters
;;@1. The reading stops at the first character which is not a member
;;of the set. This character is left on the stream. All the read
;;characters are returned in a string.
;;
;;@args pred port
;;
;;Reads characters from the @2 for which @var{pred} (a procedure of
;;one argument) returns non-#f. The reading stops at the first
;;character for which @var{pred} returns #f. That character is left
;;on the stream. All the results of evaluating of @var{pred} up to #f
;;are returned in a string.
;;
;;@var{pred} is a procedure that takes one argument (a character or
;;the EOF object) and returns a character or #f. The returned
;;character does not have to be the same as the input argument to the
;;@var{pred}. For example,
;;
;;@example
;;(ssax:next-token-of (lambda (c)
;; (cond ((eof-object? c) #f)
;; ((char-alphabetic? c) (char-downcase c))
;; (else #f)))
;; (current-input-port))
;;@end example
;;
;;will try to read an alphabetic token from the current input port,
;;and return it in lower case.
(define (ssax:next-token-of incl-list/pred port)
(let* ((buffer (ssax:init-buffer))
(curr-buf-len (string-length buffer)))
(if (procedure? incl-list/pred)
(let outer ((buffer buffer) (filled-buffer-l '()))
(let loop ((i 0))
(if (>= i curr-buf-len) ; make sure we have space
(outer (make-string curr-buf-len) (cons buffer filled-buffer-l))
(let ((c (incl-list/pred (peek-char port))))
(if c
(begin
(string-set! buffer i c)
(read-char port) ; move to the next char
(loop (+ 1 i)))
;; incl-list/pred decided it had had enough
(if (null? filled-buffer-l) (substring buffer 0 i)
(ssax:string-concatenate-reverse filled-buffer-l buffer i)))))))
;; incl-list/pred is a list of allowed characters
(let outer ((buffer buffer) (filled-buffer-l '()))
(let loop ((i 0))
(if (>= i curr-buf-len) ; make sure we have space
(outer (make-string curr-buf-len) (cons buffer filled-buffer-l))
(let ((c (peek-char port)))
(cond
((not (memv c incl-list/pred))
(if (null? filled-buffer-l) (substring buffer 0 i)
(ssax:string-concatenate-reverse filled-buffer-l buffer i)))
(else
(string-set! buffer i c)
(read-char port) ; move to the next char
(loop (+ 1 i))))))))
)))
;;@body
;;
;;Reads @1 characters from the @2, and returns them in a string. If
;;EOF is encountered before @1 characters are read, a shorter string
;;will be returned.
(define (ssax:read-string len port)
(define buffer (make-string len))
(do ((idx 0 (+ 1 idx)))
((>= idx len) idx)
(let ((chr (read-char port)))
(cond ((eof-object? byt)
(set! idx (+ -1 idx))
(set! len idx))
(else (string-set! buffer idx chr))))))
;;@subsection Data Types
;;
;;@table @code
;;
;;@item TAG-KIND
;;
;;A symbol @samp{START}, @samp{END}, @samp{PI}, @samp{DECL},
;;@samp{COMMENT}, @samp{CDSECT}, or @samp{ENTITY-REF} that identifies
;;a markup token
;;
;;@item UNRES-NAME
;;
;;a name (called GI in the XML Recommendation) as given in an XML
;;document for a markup token: start-tag, PI target, attribute name.
;;If a GI is an NCName, UNRES-NAME is this NCName converted into a
;;Scheme symbol. If a GI is a QName, @samp{UNRES-NAME} is a pair of
;;symbols: @code{(@var{PREFIX} . @var{LOCALPART})}.
;;
;;@item RES-NAME
;;
;;An expanded name, a resolved version of an @samp{UNRES-NAME}. For
;;an element or an attribute name with a non-empty namespace URI,
;;@samp{RES-NAME} is a pair of symbols,
;;@code{(@var{URI-SYMB} . @var{LOCALPART})}.
;;Otherwise, it's a single symbol.
;;
;;@item ELEM-CONTENT-MODEL
;;
;;A symbol:
;;@table @samp
;;@item ANY
;;anything goes, expect an END tag.
;;@item EMPTY-TAG
;;no content, and no END-tag is coming
;;@item EMPTY
;;no content, expect the END-tag as the next token
;;@item PCDATA
;;expect character data only, and no children elements
;;@item MIXED
;;@item ELEM-CONTENT
;;@end table
;;
;;@item URI-SYMB
;;
;;A symbol representing a namespace URI -- or other symbol chosen by
;;the user to represent URI. In the former case, @code{URI-SYMB} is
;;created by %-quoting of bad URI characters and converting the
;;resulting string into a symbol.
;;
;;@item NAMESPACES
;;
;;A list representing namespaces in effect. An element of the list
;;has one of the following forms:
;;
;;@table @code
;;
;;@item (@var{prefix} @var{uri-symb} . @var{uri-symb}) or
;;
;;@item (@var{prefix} @var{user-prefix} . @var{uri-symb})
;;@var{user-prefix} is a symbol chosen by the user to represent the URI.
;;
;;@item (#f @var{user-prefix} . @var{uri-symb})
;;Specification of the user-chosen prefix and a URI-SYMBOL.
;;
;;@item (*DEFAULT* @var{user-prefix} . @var{uri-symb})
;;Declaration of the default namespace
;;
;;@item (*DEFAULT* #f . #f)
;;Un-declaration of the default namespace. This notation
;;represents overriding of the previous declaration
;;
;;@end table
;;
;;A NAMESPACES list may contain several elements for the same @var{prefix}.
;;The one closest to the beginning of the list takes effect.
;;
;;@item ATTLIST
;;
;;An ordered collection of (@var{NAME} . @var{VALUE}) pairs, where
;;@var{NAME} is a RES-NAME or an UNRES-NAME. The collection is an ADT.
;;
;;@item STR-HANDLER
;;
;;A procedure of three arguments: @var{string1} @var{string2}
;;@var{seed} returning a new @var{seed}. The procedure is supposed to
;;handle a chunk of character data @var{string1} followed by a chunk
;;of character data @var{string2}. @var{string2} is a short string,
;;often @samp{"\n"} and even @samp{""}.
;;
;;@item ENTITIES
;;An assoc list of pairs:
;;@lisp
;; (@var{named-entity-name} . @var{named-entity-body})
;;@end lisp
;;
;;where @var{named-entity-name} is a symbol under which the entity was
;;declared, @var{named-entity-body} is either a string, or (for an
;;external entity) a thunk that will return an input port (from which
;;the entity can be read). @var{named-entity-body} may also be #f.
;;This is an indication that a @var{named-entity-name} is currently
;;being expanded. A reference to this @var{named-entity-name} will be
;;an error: violation of the WFC nonrecursion.
;;
;;@item XML-TOKEN
;;
;;This record represents a markup, which is, according to the XML
;;Recommendation, "takes the form of start-tags, end-tags,
;;empty-element tags, entity references, character references,
;;comments, CDATA section delimiters, document type declarations, and
;;processing instructions."
;;
;;@table @asis
;;@item kind
;;a TAG-KIND
;;@item head
;;an UNRES-NAME. For XML-TOKENs of kinds 'COMMENT and 'CDSECT, the
;;head is #f.
;;@end table
;;
;;For example,
;;@example
;;
=> kind=START, head=P
;;
=> kind=END, head=P
;;
=> kind=EMPTY-EL, head=BR
;; => kind=DECL, head=DOCTYPE
;; => kind=PI, head=xml
;;&my-ent; => kind=ENTITY-REF, head=my-ent
;;@end example
;;
;;Character references are not represented by xml-tokens as these
;;references are transparently resolved into the corresponding
;;characters.
;;
;;@item XML-DECL
;;
;;The record represents a datatype of an XML document: the list of
;;declared elements and their attributes, declared notations, list of
;;replacement strings or loading procedures for parsed general
;;entities, etc. Normally an XML-DECL record is created from a DTD or
;;an XML Schema, although it can be created and filled in in many
;;other ways (e.g., loaded from a file).
;;
;;@table @var
;;@item elems
;;an (assoc) list of decl-elem or #f. The latter instructs
;;the parser to do no validation of elements and attributes.
;;
;;@item decl-elem
;;declaration of one element:
;;
;;@code{(@var{elem-name} @var{elem-content} @var{decl-attrs})}
;;
;;@var{elem-name} is an UNRES-NAME for the element.
;;
;;@var{elem-content} is an ELEM-CONTENT-MODEL.
;;
;;@var{decl-attrs} is an @code{ATTLIST}, of
;;@code{(@var{attr-name} . @var{value})} associations.
;;
;;This element can declare a user procedure to handle parsing of an
;;element (e.g., to do a custom validation, or to build a hash of IDs
;;as they're encountered).
;;
;;@item decl-attr
;;an element of an @code{ATTLIST}, declaration of one attribute:
;;
;;@code{(@var{attr-name} @var{content-type} @var{use-type} @var{default-value})}
;;
;;@var{attr-name} is an UNRES-NAME for the declared attribute.
;;
;;@var{content-type} is a symbol: @code{CDATA}, @code{NMTOKEN},
;;@code{NMTOKENS}, @dots{} or a list of strings for the enumerated
;;type.
;;
;;@var{use-type} is a symbol: @code{REQUIRED}, @code{IMPLIED}, or
;;@code{FIXED}.
;;
;;@var{default-value} is a string for the default value, or #f if not
;;given.
;;
;;@end table
;;
;;@end table
;;see a function make-empty-xml-decl to make a XML declaration entry
;;suitable for a non-validating parsing.
;;We define xml-token simply as a pair.
(define (make-xml-token kind head) (cons kind head))
(define xml-token? pair?)
(define xml-token-kind car)
(define xml-token-head cdr)
;;@subsection Low-Level Parsers and Scanners
;;
;;@noindent
;;These procedures deal with primitive lexical units (Names,
;;whitespaces, tags) and with pieces of more generic productions.
;;Most of these parsers must be called in appropriate context. For
;;example, @code{ssax:complete-start-tag} must be called only when the
;;start-tag has been detected and its GI has been read.
(define char-return (integer->char 13))
(define ssax:S-chars (map integer->char '(32 10 9 13)))
;;@body
;;
;;Skip the S (whitespace) production as defined by
;;@example
;;[3] S ::= (#x20 | #x09 | #x0D | #x0A)
;;@end example
;;
;;@0 returns the first not-whitespace character it encounters while
;;scanning the @1. This character is left on the input stream.
(define (ssax:skip-S port)
(ssax:skip-while ssax:S-chars port))
;;Check to see if a-char may start a NCName
(define (ssax:ncname-starting-char? a-char)
(and (char? a-char)
(or (char-alphabetic? a-char)
(char=? #\_ a-char))))
;;@body
;;
;;Read a NCName starting from the current position in the @1 and
;;return it as a symbol.
;;
;;@example
;;[4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
;; | CombiningChar | Extender
;;[5] Name ::= (Letter | '_' | ':') (NameChar)*
;;@end example
;;
;;This code supports the XML Namespace Recommendation REC-xml-names,
;;which modifies the above productions as follows:
;;
;;@example
;;[4] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
;; | CombiningChar | Extender
;;[5] NCName ::= (Letter | '_') (NCNameChar)*
;;@end example
;;
;;As the Rec-xml-names says,
;;
;;@quotation
;;"An XML document conforms to this specification if all other tokens
;;[other than element types and attribute names] in the document which
;;are required, for XML conformance, to match the XML production for
;;Name, match this specification's production for NCName."
;;@end quotation
;;
;;Element types and attribute names must match the production QName,
;;defined below.
(define (ssax:read-NCName port)
(let ((first-char (peek-char port)))
(or (ssax:ncname-starting-char? first-char)
(slib:error port "XMLNS [4] for '" first-char "'")))
(string->symbol (ssax:next-token-of (lambda (c)
(cond
((eof-object? c) #f)
((char-alphabetic? c) c)
((string-index "0123456789.-_" c) c)
(else #f)))
port)))
;;@body
;;
;;Read a (namespace-) Qualified Name, QName, from the current position
;;in @1; and return an UNRES-NAME.
;;
;;From REC-xml-names:
;;@example
;;[6] QName ::= (Prefix ':')? LocalPart
;;[7] Prefix ::= NCName
;;[8] LocalPart ::= NCName
;;@end example
(define (ssax:read-QName port)
(let ((prefix-or-localpart (ssax:read-NCName port)))
(case (peek-char port)
((#\:) ; prefix was given after all
(read-char port) ; consume the colon
(cons prefix-or-localpart (ssax:read-NCName port)))
(else prefix-or-localpart) ; Prefix was omitted
)))
;;The prefix of the pre-defined XML namespace
(define ssax:Prefix-XML (string->symbol "xml"))
;;An UNRES-NAME that is postulated to be larger than anything that can
;;occur in a well-formed XML document. ssax:name-compare enforces
;;this postulate.
(define ssax:largest-unres-name (cons (string->symbol "#LARGEST-SYMBOL")
(string->symbol "#LARGEST-SYMBOL")))
;;Compare one RES-NAME or an UNRES-NAME with the other.
;;Return a symbol '<, '>, or '= depending on the result of
;;the comparison.
;;Names without @var{prefix} are always smaller than those with the @var{prefix}.
(define ssax:name-compare
(letrec ((symbol-compare
(lambda (symb1 symb2)
(cond
((eq? symb1 symb2) '=)
((string (symbol->string symb1) (symbol->string symb2))
'<)
(else '>)))))
(lambda (name1 name2)
(cond
((symbol? name1) (if (symbol? name2) (symbol-compare name1 name2)
'<))
((symbol? name2) '>)
((eq? name2 ssax:largest-unres-name) '<)
((eq? name1 ssax:largest-unres-name) '>)
((eq? (car name1) (car name2)) ; prefixes the same
(symbol-compare (cdr name1) (cdr name2)))
(else (symbol-compare (car name1) (car name2)))))))
;;@args port
;;
;;This procedure starts parsing of a markup token. The current
;;position in the stream must be @samp{<}. This procedure scans
;;enough of the input stream to figure out what kind of a markup token
;;it is seeing. The procedure returns an XML-TOKEN structure
;;describing the token. Note, generally reading of the current markup
;;is not finished! In particular, no attributes of the start-tag
;;token are scanned.
;;
;;Here's a detailed break out of the return values and the position in
;;the PORT when that particular value is returned:
;;
;;@table @asis
;;
;;@item PI-token
;;
;;only PI-target is read. To finish the Processing-Instruction and
;;disregard it, call @code{ssax:skip-pi}. @code{ssax:read-attributes}
;;may be useful as well (for PIs whose content is attribute-value
;;pairs).
;;
;;@item END-token
;;
;;The end tag is read completely; the current position is right after
;;the terminating @samp{>} character.
;;
;;@item COMMENT
;;
;;is read and skipped completely. The current position is right after
;;@samp{-->} that terminates the comment.
;;
;;@item CDSECT
;;
;;The current position is right after @samp{" port))
(slib:error port "XML [15], no -->"))
(make-xml-token 'COMMENT #f))
;; we have read ") "XML [42]" port)
val))
((#\?) (read-char port) (make-xml-token 'PI (ssax:read-NCName port)))
((#\!)
(read-char port)
(case (peek-char port)
((#\-) (read-char port) (skip-comment port))
((#\[) (read-char port) (read-cdata port))
(else (make-xml-token 'DECL (ssax:read-NCName port)))))
(else (make-xml-token 'START (ssax:read-QName port)))))))
;;@body
;;
;;The current position is inside a PI. Skip till the rest of the PI
(define (ssax:skip-pi port)
(if (not (find-string-from-port? "?>" port))
(slib:error port "Failed to find ?> terminating the PI")))
;;@body
;;
;;The current position is right after reading the PITarget. We read
;;the body of PI and return is as a string. The port will point to
;;the character right after @samp{?>} combination that terminates PI.
;;
;;@example
;;[16] PI ::= '' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
;;@end example
(define (ssax:read-pi-body-as-string port)
(ssax:skip-S port) ; skip WS after the PI target name
(ssax:string-concatenate/shared
(let loop ()
(let ((pi-fragment
(ssax:next-token '() '(#\?) "reading PI content" port)))
(read-char port)
(if (eqv? #\> (peek-char port))
(begin
(read-char port)
(cons pi-fragment '()))
(cons* pi-fragment "?" (loop)))))))
;;@body
;;
;;The current pos in the port is inside an internal DTD subset (e.g.,
;;after reading @samp{#\[} that begins an internal DTD subset) Skip
;;until the @samp{]>} combination that terminates this DTD.
(define (ssax:skip-internal-dtd port)
(slib:warn port "Internal DTD subset is not currently handled ")
(if (not (find-string-from-port? "]>" port))
(slib:error port
"Failed to find ]> terminating the internal DTD subset")))
;;@args port str-handler seed
;;
;;This procedure must be called after we have read a string
;;@samp{} combination is the end of the CDATA section.
;;@samp{>} is treated as an embedded @samp{>} character.
;;
;;@item
;;@samp{<} and @samp{&} are not specially recognized (and are
;;not expanded)!
;;
;;@end itemize
(define ssax:read-cdata-body
(let ((cdata-delimiters (list char-return #\newline #\] #\&)))
(lambda (port str-handler seed)
(let loop ((seed seed))
(let ((fragment (ssax:next-token '() cdata-delimiters "reading CDATA" port)))
;; that is, we're reading the char after the 'fragment'
(case (read-char port)
((#\newline) (loop (str-handler fragment #\newline seed)))
((#\])
(if (not (eqv? (peek-char port) #\]))
(loop (str-handler fragment "]" seed))
(let check-after-second-braket
((seed (if (string-null? fragment) seed
(str-handler fragment "" seed))))
(read-char port)
(case (peek-char port) ; after the second bracket
((#\>) (read-char port) seed) ; we have read "]]>"
((#\]) (check-after-second-braket
(str-handler "]" "" seed)))
(else (loop (str-handler "]]" "" seed)))))))
((#\&) ; Note that #\& within CDATA may stand for itself
(let ((ent-ref ; it does not have to start an entity ref
(ssax:next-token-of
(lambda (c)
(and (not (eof-object? c)) (char-alphabetic? c) c))
port)))
(cond ; replace ">" with #\>
((and (string=? "gt" ent-ref) (eqv? (peek-char port) #\;))
(read-char port)
(loop (str-handler fragment ">" seed)))
(else
(loop
(str-handler ent-ref ""
(str-handler fragment "&" seed)))))))
(else ; Must be CR: if the next char is #\newline, skip it
(if (eqv? (peek-char port) #\newline) (read-char port))
(loop (str-handler fragment #\newline seed)))
))))))
;;@body
;;
;;@example
;;[66] CharRef ::= '' [0-9]+ ';'
;; | '' [0-9a-fA-F]+ ';'
;;@end example
;;
;;This procedure must be called after we we have read @samp{} that
;;introduces a char reference. The procedure reads this reference and
;;returns the corresponding char. The current position in PORT will
;;be after the @samp{;} that terminates the char reference.
;;
;;Faults detected:@*
;;WFC: XML-Spec.html#wf-Legalchar
;;
;;According to Section @cite{4.1 Character and Entity References}
;;of the XML Recommendation:
;;
;;@quotation
;;"[Definition: A character reference refers to a specific character
;;in the ISO/IEC 10646 character set, for example one not directly
;;accessible from available input devices.]"
;;@end quotation
;;
;;@c Therefore, we use a @code{ucscode->char} function to convert a
;;@c character code into the character -- *regardless* of the current
;;@c character encoding of the input stream.
(define (ssax:read-char-ref port)
(let* ((base (cond ((eqv? (peek-char port) #\x) (read-char port) 16)
(else 10)))
(name (ssax:next-token '() '(#\;) "XML [66]" port))
(char-code (string->number name base)))
(read-char port) ; read the terminating #\; char
(if (integer? char-code) (integer->char char-code)
(slib:error port "[wf-Legalchar] broken for '" name "'"))))
(define ssax:predefined-parsed-entities
`(
(,(string->symbol "amp") . "&")
(,(string->symbol "lt") . "<")
(,(string->symbol "gt") . ">")
(,(string->symbol "apos") . "'")
(,(string->symbol "quot") . "\"")
))
;;@body
;;
;;Expands and handles a parsed-entity reference.
;;
;;@2 is a symbol, the name of the parsed entity to expand.
;;@c entities - see ENTITIES
;;@4 is a procedure of arguments @var{port}, @var{entities}, and
;;@var{seed} that returns a seed.
;;@5 is called if the entity in question is a pre-declared entity.
;;
;;@0 returns the result returned by @4 or @5.
;;
;;Faults detected:@*
;;WFC: XML-Spec.html#wf-entdeclared@*
;;WFC: XML-Spec.html#norecursion
(define (ssax:handle-parsed-entity port name entities content-handler str-handler seed)
(cond ; First we check the list of the declared entities
((assq name entities) =>
(lambda (decl-entity)
(let ((ent-body (cdr decl-entity)) ; mark the list to prevent recursion
(new-entities (cons (cons name #f) entities)))
(cond
((string? ent-body)
(call-with-input-string ent-body
(lambda (port) (content-handler port new-entities seed))))
((procedure? ent-body)
(let ((port (ent-body)))
(define val (content-handler port new-entities seed))
(close-input-port port)
val))
(else
(slib:error port "[norecursion] broken for " name))))))
((assq name ssax:predefined-parsed-entities)
=> (lambda (decl-entity)
(str-handler (cdr decl-entity) "" seed)))
(else (slib:error port "[wf-entdeclared] broken for " name))))
;;;; The ATTLIST Abstract Data Type
;;; Currently is implemented as an assoc list sorted in the ascending
;;; order of NAMES.
(define attlist-fold fold)
(define attlist-null? null?)
(define attlist->alist identity)
(define (make-empty-attlist) '())
;;@body
;;
;;Add a @2 pair to the existing @1, preserving its sorted ascending
;;order; and return the new list. Return #f if a pair with the same
;;name already exists in @1
(define (attlist-add attlist name-value)
(if (null? attlist) (cons name-value attlist)
(case (ssax:name-compare (car name-value) (caar attlist))
((=) #f)
((<) (cons name-value attlist))
(else (cons (car attlist) (attlist-add (cdr attlist) name-value)))
)))
;;@body
;;
;;Given an non-null @1, return a pair of values: the top and the rest.
(define (attlist-remove-top attlist)
(values (car attlist) (cdr attlist)))
;;@args port entities
;;
;;This procedure reads and parses a production @dfn{Attribute}.
;;
;;@example
;;[41] Attribute ::= Name Eq AttValue
;;[10] AttValue ::= '"' ([^<&"] | Reference)* '"'
;; | "'" ([^<&'] | Reference)* "'"
;;[25] Eq ::= S? '=' S?
;;@end example
;;
;;The procedure returns an ATTLIST, of Name (as UNRES-NAME), Value (as
;;string) pairs. The current character on the @1 is a non-whitespace
;;character that is not an NCName-starting character.
;;
;;Note the following rules to keep in mind when reading an
;;@dfn{AttValue}:
;;@quotation
;;Before the value of an attribute is passed to the application or
;;checked for validity, the XML processor must normalize it as
;;follows:
;;
;;@itemize @bullet
;;@item
;;A character reference is processed by appending the referenced
;;character to the attribute value.
;;
;;@item
;;An entity reference is processed by recursively processing the
;;replacement text of the entity. The named entities @samp{amp},
;;@samp{lt}, @samp{gt}, @samp{quot}, and @samp{apos} are pre-declared.
;;
;;@item
;;A whitespace character (#x20, #x0D, #x0A, #x09) is processed by
;;appending #x20 to the normalized value, except that only a single
;;#x20 is appended for a "#x0D#x0A" sequence that is part of an
;;external parsed entity or the literal entity value of an internal
;;parsed entity.
;;
;;@item
;;Other characters are processed by appending them to the normalized
;;value.
;;
;;@end itemize
;;
;;@end quotation
;;
;;Faults detected:@*
;;WFC: XML-Spec.html#CleanAttrVals@*
;;WFC: XML-Spec.html#uniqattspec
(define ssax:read-attributes ; ssax:read-attributes port entities
(let ((value-delimeters (append '(#\< #\&) ssax:S-chars)))
;; Read the AttValue from the PORT up to the delimiter (which can
;; be a single or double-quote character, or even a symbol *eof*).
;; 'prev-fragments' is the list of string fragments, accumulated
;; so far, in reverse order. Return the list of fragments with
;; newly read fragments prepended.
(define (read-attrib-value delimiter port entities prev-fragments)
(let* ((new-fragments
(cons (ssax:next-token '() (cons delimiter value-delimeters)
"XML [10]" port)
prev-fragments))
(cterm (read-char port)))
(cond
((or (eof-object? cterm) (eqv? cterm delimiter))
new-fragments)
((eqv? cterm char-return) ; treat a CR and CRLF as a LF
(if (eqv? (peek-char port) #\newline) (read-char port))
(read-attrib-value delimiter port entities
(cons " " new-fragments)))
((memv cterm ssax:S-chars)
(read-attrib-value delimiter port entities
(cons " " new-fragments)))
((eqv? cterm #\&)
(cond
((eqv? (peek-char port) #\#)
(read-char port)
(read-attrib-value delimiter port entities
(cons (string (ssax:read-char-ref port)) new-fragments)))
(else
(read-attrib-value delimiter port entities
(read-named-entity port entities new-fragments)))))
(else (slib:error port "[CleanAttrVals] broken")))))
;; we have read "&" that introduces a named entity reference.
;; read this reference and return the result of normalizing of the
;; corresponding string (that is, read-attrib-value is applied to
;; the replacement text of the entity). The current position will
;; be after ";" that terminates the entity reference
(define (read-named-entity port entities fragments)
(let ((name (ssax:read-NCName port)))
(ssax:assert-current-char '(#\;) "XML [68]" port)
(ssax:handle-parsed-entity port name entities
(lambda (port entities fragments)
(read-attrib-value '*eof* port entities fragments))
(lambda (str1 str2 fragments)
(if (equal? "" str2) (cons str1 fragments)
(cons* str2 str1 fragments)))
fragments)))
(lambda (port entities)
(let loop ((attr-list (make-empty-attlist)))
(if (not (ssax:ncname-starting-char? (ssax:skip-S port))) attr-list
(let ((name (ssax:read-QName port)))
(ssax:skip-S port)
(ssax:assert-current-char '(#\=) "XML [25]" port)
(ssax:skip-S port)
(let ((delimiter
(ssax:assert-current-char '(#\' #\" ) "XML [10]" port)))
(loop
(or (attlist-add attr-list
(cons name
(ssax:string-concatenate-reverse/shared
(read-attrib-value delimiter port entities
'()))))
(slib:error port "[uniqattspec] broken for " name))))))))
))
;;@body
;;
;;Convert an @2 to a RES-NAME, given the appropriate @3 declarations.
;;The last parameter, @4, determines if the default namespace applies
;;(for instance, it does not for attribute names).
;;
;;Per REC-xml-names/#nsc-NSDeclared, the "xml" prefix is considered
;;pre-declared and bound to the namespace name
;;"http://www.w3.org/XML/1998/namespace".
;;
;;@0 tests for the namespace constraints:@*
;;@url{http://www.w3.org/TR/REC-xml-names/#nsc-NSDeclared}
(define (ssax:resolve-name port unres-name namespaces apply-default-ns?)
(cond
((pair? unres-name) ; it's a QNAME
(cons
(cond
((assq (car unres-name) namespaces) => cadr)
((eq? (car unres-name) ssax:Prefix-XML) ssax:Prefix-XML)
(else
(slib:error port "[nsc-NSDeclared] broken; prefix " (car unres-name))))
(cdr unres-name)))
(apply-default-ns? ; Do apply the default namespace, if any
(let ((default-ns (assq '*DEFAULT* namespaces)))
(if (and default-ns (cadr default-ns))
(cons (cadr default-ns) unres-name)
unres-name))) ; no default namespace declared
(else unres-name))) ; no prefix, don't apply the default-ns
;;Procedure: ssax:uri-string->symbol URI-STR
;;Convert a URI-STR to an appropriate symbol
(define ssax:uri-string->symbol string->symbol)
;;@args tag port elems entities namespaces
;;
;;Complete parsing of a start-tag markup. @0 must be called after the
;;start tag token has been read. @1 is an UNRES-NAME. @3 is an
;;instance of the ELEMS slot of XML-DECL; it can be #f to tell the
;;function to do @emph{no} validation of elements and their
;;attributes.
;;
;;@0 returns several values:
;;@itemize @bullet
;;@item ELEM-GI:
;;a RES-NAME.
;;@item ATTRIBUTES:
;;element's attributes, an ATTLIST of (RES-NAME . STRING) pairs.
;;The list does NOT include xmlns attributes.
;;@item NAMESPACES:
;;the input list of namespaces amended with namespace
;;(re-)declarations contained within the start-tag under parsing
;;@item ELEM-CONTENT-MODEL
;;@end itemize
;;
;;On exit, the current position in @2 will be the first character
;;after @samp{>} that terminates the start-tag markup.
;;
;;Faults detected:@*
;;VC: XML-Spec.html#enum@*
;;VC: XML-Spec.html#RequiredAttr@*
;;VC: XML-Spec.html#FixedAttr@*
;;VC: XML-Spec.html#ValueType@*
;;WFC: XML-Spec.html#uniqattspec (after namespaces prefixes are resolved)@*
;;VC: XML-Spec.html#elementvalid@*
;;WFC: REC-xml-names/#dt-NSName
;;
;;@emph{Note}: although XML Recommendation does not explicitly say it,
;;xmlns and xmlns: attributes don't have to be declared (although they
;;can be declared, to specify their default value).
(define ssax:complete-start-tag
(let ((xmlns (string->symbol "xmlns"))
(largest-dummy-decl-attr (list ssax:largest-unres-name #f #f #f)))
;; Scan through the attlist and validate it, against decl-attrs
;; Return an assoc list with added fixed or implied attrs.
;; Note that both attlist and decl-attrs are ATTLISTs, and therefore,
;; sorted
(define (validate-attrs port attlist decl-attrs)
;; Check to see decl-attr is not of use type REQUIRED. Add
;; the association with the default value, if any declared
(define (add-default-decl decl-attr result)
(let*-values
(((attr-name content-type use-type default-value)
(apply values decl-attr)))
(and (eq? use-type 'REQUIRED)
(slib:error port "[RequiredAttr] broken for" attr-name))
(if default-value
(cons (cons attr-name default-value) result)
result)))
(let loop ((attlist attlist) (decl-attrs decl-attrs) (result '()))
(if (attlist-null? attlist)
(attlist-fold add-default-decl result decl-attrs)
(let*-values
(((attr attr-others)
(attlist-remove-top attlist))
((decl-attr other-decls)
(if (attlist-null? decl-attrs)
(values largest-dummy-decl-attr decl-attrs)
(attlist-remove-top decl-attrs)))
)
(case (ssax:name-compare (car attr) (car decl-attr))
((<)
(if (or (eq? xmlns (car attr))
(and (pair? (car attr)) (eq? xmlns (caar attr))))
(loop attr-others decl-attrs (cons attr result))
(slib:error port "[ValueType] broken for " attr)))
((>)
(loop attlist other-decls
(add-default-decl decl-attr result)))
(else ; matched occurrence of an attr with its declaration
(let*-values
(((attr-name content-type use-type default-value)
(apply values decl-attr)))
;; Run some tests on the content of the attribute
(cond
((eq? use-type 'FIXED)
(or (equal? (cdr attr) default-value)
(slib:error port "[FixedAttr] broken for " attr-name)))
((eq? content-type 'CDATA) #t) ; everything goes
((pair? content-type)
(or (member (cdr attr) content-type)
(slib:error port "[enum] broken for " attr-name "="
(cdr attr))))
(else
(slib:warn port "declared content type " content-type
" not verified yet")))
(loop attr-others other-decls (cons attr result)))))
))))
;; Add a new namespace declaration to namespaces.
;; First we convert the uri-str to a uri-symbol and search namespaces for
;; an association (_ user-prefix . uri-symbol).
;; If found, we return the argument namespaces with an association
;; (prefix user-prefix . uri-symbol) prepended.
;; Otherwise, we prepend (prefix uri-symbol . uri-symbol)
(define (add-ns port prefix uri-str namespaces)
(and (equal? "" uri-str)
(slib:error port "[dt-NSName] broken for " prefix))
(let ((uri-symbol (ssax:uri-string->symbol uri-str)))
(let loop ((nss namespaces))
(cond
((null? nss)
(cons (cons* prefix uri-symbol uri-symbol) namespaces))
((eq? uri-symbol (cddar nss))
(cons (cons* prefix (cadar nss) uri-symbol) namespaces))
(else (loop (cdr nss)))))))
;; partition attrs into proper attrs and new namespace declarations
;; return two values: proper attrs and the updated namespace declarations
(define (adjust-namespace-decl port attrs namespaces)
(let loop ((attrs attrs) (proper-attrs '()) (namespaces namespaces))
(cond
((null? attrs) (values proper-attrs namespaces))
((eq? xmlns (caar attrs)) ; re-decl of the default namespace
(loop (cdr attrs) proper-attrs
(if (equal? "" (cdar attrs)) ; un-decl of the default ns
(cons (cons* '*DEFAULT* #f #f) namespaces)
(add-ns port '*DEFAULT* (cdar attrs) namespaces))))
((and (pair? (caar attrs)) (eq? xmlns (caaar attrs)))
(loop (cdr attrs) proper-attrs
(add-ns port (cdaar attrs) (cdar attrs) namespaces)))
(else
(loop (cdr attrs) (cons (car attrs) proper-attrs) namespaces)))))
;; The body of the function
(lambda (tag-head port elems entities namespaces)
(let*-values
(((attlist) (ssax:read-attributes port entities))
((empty-el-tag?)
(begin
(ssax:skip-S port)
(and
(eqv? #\/
(ssax:assert-current-char '(#\> #\/) "XML [40], XML [44], no '>'" port))
(ssax:assert-current-char '(#\>) "XML [44], no '>'" port))))
((elem-content decl-attrs) ; see xml-decl for their type
(if elems ; elements declared: validate!
(cond
((assoc tag-head elems) =>
(lambda (decl-elem) ; of type xml-decl::decl-elem
(values
(if empty-el-tag? 'EMPTY-TAG (cadr decl-elem))
(caddr decl-elem))))
(else
(slib:error port "[elementvalid] broken, no decl for " tag-head)))
(values ; non-validating parsing
(if empty-el-tag? 'EMPTY-TAG 'ANY)
#f) ; no attributes declared
))
((merged-attrs) (if decl-attrs (validate-attrs port attlist decl-attrs)
(attlist->alist attlist)))
((proper-attrs namespaces)
(adjust-namespace-decl port merged-attrs namespaces))
)
;; build the return value
(values
(ssax:resolve-name port tag-head namespaces #t)
(fold-right
(lambda (name-value attlist)
(or
(attlist-add attlist
(cons (ssax:resolve-name port (car name-value) namespaces #f)
(cdr name-value)))
(slib:error port "[uniqattspec] after NS expansion broken for "
name-value)))
(make-empty-attlist)
proper-attrs)
namespaces
elem-content)))))
;;@body
;;
;;Parses an ExternalID production:
;;
;;@example
;;[75] ExternalID ::= 'SYSTEM' S SystemLiteral
;; | 'PUBLIC' S PubidLiteral S SystemLiteral
;;[11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
;;[12] PubidLiteral ::= '"' PubidChar* '"'
;; | "'" (PubidChar - "'")* "'"
;;[13] PubidChar ::= #x20 | #x0D | #x0A | [a-zA-Z0-9]
;; | [-'()+,./:=?;!*#@@$_%]
;;@end example
;;
;;Call @0 when an ExternalID is expected; that is, the current
;;character must be either #\S or #\P that starts correspondingly a
;;SYSTEM or PUBLIC token. @0 returns the @var{SystemLiteral} as a
;;string. A @var{PubidLiteral} is disregarded if present.
(define (ssax:read-external-id port)
(let ((discriminator (ssax:read-NCName port)))
(ssax:assert-current-char ssax:S-chars "space after SYSTEM or PUBLIC" port)
(ssax:skip-S port)
(let ((delimiter
(ssax:assert-current-char '(#\' #\" ) "XML [11], XML [12]" port)))
(cond
((eq? discriminator (string->symbol "SYSTEM"))
(let ((val (ssax:next-token '() (list delimiter) "XML [11]" port)))
(read-char port) ; reading the closing delim
val))
((eq? discriminator (string->symbol "PUBLIC"))
(let loop ((c (read-char port)))
(cond
((eqv? c delimiter) c)
((eof-object? c)
(slib:error port "Unexpected EOF while skipping until " delimiter))
(else (loop (read-char port)))))
(ssax:assert-current-char ssax:S-chars "space after PubidLiteral" port)
(ssax:skip-S port)
(let* ((delimiter
(ssax:assert-current-char '(#\' #\" ) "XML [11]" port))
(systemid
(ssax:next-token '() (list delimiter) "XML [11]" port)))
(read-char port) ; reading the closing delim
systemid))
(else
(slib:error port "XML [75], " discriminator
" rather than SYSTEM or PUBLIC"))))))
;;@subsection Mid-Level Parsers and Scanners
;;
;;@noindent
;;These procedures parse productions corresponding to the whole
;;(document) entity or its higher-level pieces (prolog, root element,
;;etc).
;;@body
;;
;;Scan the Misc production in the context:
;;
;;@example
;;[1] document ::= prolog element Misc*
;;[22] prolog ::= XMLDecl? Misc* (doctypedec l Misc*)?
;;[27] Misc ::= Comment | PI | S
;;@end example
;;
;;Call @0 in the prolog or epilog contexts. In these contexts,
;;whitespaces are completely ignored. The return value from @0 is
;;either a PI-token, a DECL-token, a START token, or *EOF*. Comments
;;are ignored and not reported.
(define (ssax:scan-misc port)
(let loop ((c (ssax:skip-S port)))
(cond
((eof-object? c) c)
((not (char=? c #\<))
(slib:error port "XML [22], char '" c "' unexpected"))
(else
(let ((token (ssax:read-markup-token port)))
(case (xml-token-kind token)
((COMMENT) (loop (ssax:skip-S port)))
((PI DECL START) token)
(else
(slib:error port "XML [22], unexpected token of kind "
(xml-token-kind token)
))))))))
;;@args port expect-eof? str-handler iseed
;;
;;Read the character content of an XML document or an XML element.
;;
;;@example
;;[43] content ::=
;;(element | CharData | Reference | CDSect | PI | Comment)*
;;@end example
;;
;;To be more precise, @0 reads CharData, expands CDSect and character
;;entities, and skips comments. @0 stops at a named reference, EOF,
;;at the beginning of a PI, or a start/end tag.
;;
;;@2 is a boolean indicating if EOF is normal; i.e., the character
;;data may be terminated by the EOF. EOF is normal while processing a
;;parsed entity.
;;
;;@4 is an argument passed to the first invocation of @3.
;;
;;@0 returns two results: @var{seed} and @var{token}. The @var{seed}
;;is the result of the last invocation of @3, or the original @4 if @3
;;was never called.
;;
;;@var{token} can be either an eof-object (this can happen only if @2
;;was #t), or:
;;@itemize @bullet
;;
;;@item
;;an xml-token describing a START tag or an END-tag;
;;For a start token, the caller has to finish reading it.
;;
;;@item
;;an xml-token describing the beginning of a PI. It's up to an
;;application to read or skip through the rest of this PI;
;;
;;@item
;;an xml-token describing a named entity reference.
;;
;;@end itemize
;;
;;CDATA sections and character references are expanded inline and
;;never returned. Comments are silently disregarded.
;;
;;As the XML Recommendation requires, all whitespace in character data
;;must be preserved. However, a CR character (#x0D) must be
;;disregarded if it appears before a LF character (#x0A), or replaced
;;by a #x0A character otherwise. See Secs. 2.10 and 2.11 of the XML
;;Recommendation. See also the canonical XML Recommendation.
(define ssax:read-char-data
(let ((terminators-usual (list #\< #\& char-return))
(terminators-usual-eof (list #\< '*eof* #\& char-return))
(handle-fragment
(lambda (fragment str-handler seed)
(if (string-null? fragment) seed
(str-handler fragment "" seed)))))
(lambda (port expect-eof? str-handler seed)
;; Very often, the first character we encounter is #\<
;; Therefore, we handle this case in a special, fast path
(if (eqv? #\< (peek-char port))
;; The fast path
(let ((token (ssax:read-markup-token port)))
(case (xml-token-kind token)
((START END) ; The most common case
(values seed token))
((CDSECT)
(let ((seed (ssax:read-cdata-body port str-handler seed)))
(ssax:read-char-data port expect-eof? str-handler seed)))
((COMMENT)
(ssax:read-char-data port expect-eof? str-handler seed))
(else
(values seed token))))
;; The slow path
(let ((char-data-terminators
(if expect-eof? terminators-usual-eof terminators-usual)))
(let loop ((seed seed))
(let* ((fragment
(ssax:next-token '() char-data-terminators
"reading char data" port))
(term-char (peek-char port)) ; one of char-data-terminators
)
(if (eof-object? term-char)
(values
(handle-fragment fragment str-handler seed)
term-char)
(case term-char
((#\<)
(let ((token (ssax:read-markup-token port)))
(case (xml-token-kind token)
((CDSECT)
(loop
(ssax:read-cdata-body port str-handler
(handle-fragment fragment str-handler seed))))
((COMMENT)
(loop (handle-fragment fragment str-handler seed)))
(else
(values
(handle-fragment fragment str-handler seed)
token)))))
((#\&)
(read-char port)
(case (peek-char port)
((#\#) (read-char port)
(loop (str-handler fragment
(string (ssax:read-char-ref port))
seed)))
(else
(let ((name (ssax:read-NCName port)))
(ssax:assert-current-char '(#\;) "XML [68]" port)
(values
(handle-fragment fragment str-handler seed)
(make-xml-token 'ENTITY-REF name))))))
(else ; This must be a CR character
(read-char port)
(if (eqv? (peek-char port) #\newline)
(read-char port))
(loop (str-handler fragment (string #\newline) seed))))
))))))))
;;@body
;;
;;Make sure that @1 is of anticipated @2 and has anticipated @3. Note
;;that the @3 argument may actually be a pair of two symbols,
;;Namespace-URI or the prefix, and of the localname. If the assertion
;;fails, @4 is evaluated by passing it three arguments: @1 @2 @3. The
;;result of @4 is returned.
(define (ssax:assert-token token kind gi error-cont)
(or (and (xml-token? token)
(eq? kind (xml-token-kind token))
(equal? gi (xml-token-head token)))
(error-cont token kind gi)))
;;@subsection High-level Parsers
;;
;;These procedures are to instantiate a SSAX parser. A user can
;;instantiate the parser to do the full validation, or no validation,
;;or any particular validation. The user specifies which PI he wants
;;to be notified about. The user tells what to do with the parsed
;;character and element data. The latter handlers determine if the
;;parsing follows a SAX or a DOM model.
;;@args my-pi-handlers
;;
;;Create a parser to parse and process one Processing Element (PI).
;;
;;@1 is an association list of pairs
;;@code{(@var{pi-tag} . @var{pi-handler})} where @var{pi-tag} is an
;;NCName symbol, the PI target; and @var{pi-handler} is a procedure
;;taking arguments @var{port}, @var{pi-tag}, and @var{seed}.
;;
;;@var{pi-handler} should read the rest of the PI up to and including
;;the combination @samp{?>} that terminates the PI. The handler
;;should return a new seed. One of the @var{pi-tag}s may be the
;;symbol @code{*DEFAULT*}. The corresponding handler will handle PIs
;;that no other handler will. If the *DEFAULT* @var{pi-tag} is not
;;specified, @0 will assume the default handler that skips the body of
;;the PI.
;;
;;@0 returns a procedure of arguments @var{port}, @var{pi-tag}, and
;;@var{seed}; that will parse the current PI according to @1.
(define (ssax:make-pi-parser handlers)
(lambda (port target seed)
(define pair (assv target handlers))
(or pair (set! pair (assv '*DEFAULT* handlers)))
(cond ((not pair)
(slib:warn port "Skipping PI: " target #\newline)
(ssax:skip-pi port)
seed)
(else ((cdr pair) port target seed)))))
;;syntax: ssax:make-elem-parser
;; my-new-level-seed my-finish-element my-char-data-handler my-pi-handlers
;;@body
;;
;;Create a parser to parse and process one element, including its
;;character content or children elements. The parser is typically
;;applied to the root element of a document.
;;
;;@table @asis
;;
;;@item @1
;;is a procedure taking arguments:
;;
;;@var{elem-gi} @var{attributes} @var{namespaces} @var{expected-content} @var{seed}
;;
;;where @var{elem-gi} is a RES-NAME of the element about to be
;;processed.
;;
;;@1 is to generate the seed to be passed to handlers that process the
;;content of the element.
;;
;;@item @2
;;is a procedure taking arguments:
;;
;;@var{elem-gi} @var{attributes} @var{namespaces} @var{parent-seed} @var{seed}
;;
;;@2 is called when parsing of @var{elem-gi} is finished.
;;The @var{seed} is the result from the last content parser (or
;;from @1 if the element has the empty content).
;;@var{parent-seed} is the same seed as was passed to @1.
;;@2 is to generate a seed that will be the result
;;of the element parser.
;;
;;@item @3
;;is a STR-HANDLER as described in Data Types above.
;;
;;@item @4
;;is as described for @code{ssax:make-pi-handler} above.
;;
;;@end table
;;
;;The generated parser is a procedure taking arguments:
;;
;;@var{start-tag-head} @var{port} @var{elems} @var{entities} @var{namespaces} @var{preserve-ws?} @var{seed}
;;
;;The procedure must be called after the start tag token has been
;;read. @var{start-tag-head} is an UNRES-NAME from the start-element
;;tag. ELEMS is an instance of ELEMS slot of XML-DECL.
;;
;;Faults detected:@*
;;VC: XML-Spec.html#elementvalid@*
;;WFC: XML-Spec.html#GIMatch
(define (ssax:make-elem-parser my-new-level-seed my-finish-element
my-char-data-handler my-pi-handlers)
(lambda (start-tag-head port elems entities namespaces preserve-ws? seed)
(define xml-space-gi (cons ssax:Prefix-XML
(string->symbol "space")))
(let handle-start-tag ((start-tag-head start-tag-head)
(port port) (entities entities)
(namespaces namespaces)
(preserve-ws? preserve-ws?) (parent-seed seed))
(let*-values
(((elem-gi attributes namespaces expected-content)
(ssax:complete-start-tag start-tag-head port elems
entities namespaces))
((seed)
(my-new-level-seed elem-gi attributes
namespaces expected-content parent-seed)))
(case expected-content
((EMPTY-TAG)
(my-finish-element
elem-gi attributes namespaces parent-seed seed))
((EMPTY) ; The end tag must immediately follow
(ssax:assert-token (and (eqv? #\< (ssax:skip-S port))
(ssax:read-markup-token port))
'END
start-tag-head
(lambda (token exp-kind exp-head)
(slib:error port "[elementvalid] broken for " token
" while expecting "
exp-kind exp-head)))
(my-finish-element
elem-gi attributes namespaces parent-seed seed))
(else ; reading the content...
(let ((preserve-ws? ; inherit or set the preserve-ws? flag
(cond ((assoc xml-space-gi attributes) =>
(lambda (name-value)
(equal? "preserve" (cdr name-value))))
(else preserve-ws?))))
(let loop ((port port) (entities entities)
(expect-eof? #f) (seed seed))
(let*-values
(((seed term-token)
(ssax:read-char-data port expect-eof?
my-char-data-handler seed)))
(if (eof-object? term-token)
seed
(case (xml-token-kind term-token)
((END)
(ssax:assert-token term-token 'END start-tag-head
(lambda (token exp-kind exp-head)
(slib:error port "[GIMatch] broken for "
term-token " while expecting "
exp-kind exp-head)))
(my-finish-element
elem-gi attributes namespaces parent-seed seed))
((PI)
(let ((seed
((ssax:make-pi-parser my-pi-handlers)
port (xml-token-head term-token) seed)))
(loop port entities expect-eof? seed)))
((ENTITY-REF)
(let ((seed
(ssax:handle-parsed-entity
port (xml-token-head term-token)
entities
(lambda (port entities seed)
(loop port entities #t seed))
my-char-data-handler
seed))) ; keep on reading the content after ent
(loop port entities expect-eof? seed)))
((START) ; Start of a child element
(if (eq? expected-content 'PCDATA)
(slib:error port "[elementvalid] broken for "
elem-gi
" with char content only; unexpected token "
term-token))
;; Do other validation of the element content
(let ((seed
(handle-start-tag
(xml-token-head term-token)
port entities namespaces
preserve-ws? seed)))
(loop port entities expect-eof? seed)))
(else
(slib:error port "XML [43] broken for "
term-token))))))))
)))
))
;;This is ssax:make-parser with all the (specialization) handlers given
;;as positional arguments. It is called by ssax:make-parser, see below
(define (ssax:make-parser/positional-args
*handler-DOCTYPE
*handler-UNDECL-ROOT
*handler-DECL-ROOT
*handler-NEW-LEVEL-SEED
*handler-FINISH-ELEMENT
*handler-CHAR-DATA-HANDLER
*handler-PROCESSING-INSTRUCTIONS)
(lambda (port seed)
;; We must've just scanned the DOCTYPE token. Handle the
;; doctype declaration and exit to
;; scan-for-significant-prolog-token-2, and eventually, to the
;; element parser.
(define (handle-decl port token-head seed)
(or (eq? (string->symbol "DOCTYPE") token-head)
(slib:error port "XML [22], expected DOCTYPE declaration, found "
token-head))
(ssax:assert-current-char ssax:S-chars "XML [28], space after DOCTYPE" port)
(ssax:skip-S port)
(let*-values
(((docname) (ssax:read-QName port))
((systemid)
(and (ssax:ncname-starting-char? (ssax:skip-S port))
(ssax:read-external-id port)))
((internal-subset?)
(begin
(ssax:skip-S port)
(eqv? #\[
(ssax:assert-current-char '(#\> #\[)
"XML [28], end-of-DOCTYPE" port))))
((elems entities namespaces seed)
(*handler-DOCTYPE port docname systemid internal-subset? seed)))
(scan-for-significant-prolog-token-2 port elems entities namespaces
seed)))
;; Scan the leading PIs until we encounter either a doctype
;; declaration or a start token (of the root element). In the
;; latter two cases, we exit to the appropriate continuation
(define (scan-for-significant-prolog-token-1 port seed)
(let ((token (ssax:scan-misc port)))
(if (eof-object? token)
(slib:error port "XML [22], unexpected EOF")
(case (xml-token-kind token)
((PI)
(let ((seed
((ssax:make-pi-parser *handler-PROCESSING-INSTRUCTIONS)
port (xml-token-head token) seed)))
(scan-for-significant-prolog-token-1 port seed)))
((DECL) (handle-decl port (xml-token-head token) seed))
((START)
(let*-values
(((elems entities namespaces seed)
(*handler-UNDECL-ROOT (xml-token-head token) seed)))
(element-parser (xml-token-head token) port elems
entities namespaces #f seed)))
(else (slib:error port "XML [22], unexpected markup "
token))))))
;; Scan PIs after the doctype declaration, till we encounter
;; the start tag of the root element. After that we exit
;; to the element parser
(define (scan-for-significant-prolog-token-2 port elems entities namespaces seed)
(let ((token (ssax:scan-misc port)))
(if (eof-object? token)
(slib:error port "XML [22], unexpected EOF")
(case (xml-token-kind token)
((PI)
(let ((seed ((ssax:make-pi-parser *handler-PROCESSING-INSTRUCTIONS)
port (xml-token-head token) seed)))
(scan-for-significant-prolog-token-2 port elems entities
namespaces seed)))
((START)
(element-parser (xml-token-head token) port elems
entities namespaces #f
(*handler-DECL-ROOT (xml-token-head token) seed)))
(else (slib:error port "XML [22], unexpected markup "
token))))))
;; A procedure start-tag-head port elems entities namespaces
;; preserve-ws? seed
(define element-parser
(ssax:make-elem-parser *handler-NEW-LEVEL-SEED
*handler-FINISH-ELEMENT
*handler-CHAR-DATA-HANDLER
*handler-PROCESSING-INSTRUCTIONS))
;; Get the ball rolling ...
(scan-for-significant-prolog-token-1 port seed)
))
(define DOCTYPE 'DOCTYPE)
(define UNDECL-ROOT 'UNDECL-ROOT)
(define DECL-ROOT 'DECL-ROOT)
(define NEW-LEVEL-SEED 'NEW-LEVEL-SEED)
(define FINISH-ELEMENT 'FINISH-ELEMENT)
(define CHAR-DATA-HANDLER 'CHAR-DATA-HANDLER)
(define PROCESSING-INSTRUCTIONS 'PROCESSING-INSTRUCTIONS)
;;@args user-handler-tag user-handler ...
;;
;;Create an XML parser, an instance of the XML parsing framework.
;;This will be a SAX, a DOM, or a specialized parser depending on the
;;supplied user-handlers.
;;
;;@0 takes an even number of arguments; @1 is a symbol that identifies
;;a procedure (or association list for @code{PROCESSING-INSTRUCTIONS})
;;(@2) that follows the tag. Given below are tags and signatures of
;;the corresponding procedures. Not all tags have to be specified.
;;If some are omitted, reasonable defaults will apply.
;;
;;@table @samp
;;
;;@item DOCTYPE
;;handler-procedure: @var{port} @var{docname} @var{systemid} @var{internal-subset?} @var{seed}
;;
;;If @var{internal-subset?} is #t, the current position in the port is
;;right after we have read @samp{[} that begins the internal DTD
;;subset. We must finish reading of this subset before we return (or
;;must call @code{skip-internal-dtd} if we aren't interested in
;;reading it). @var{port} at exit must be at the first symbol after
;;the whole DOCTYPE declaration.
;;
;;The handler-procedure must generate four values:
;;@quotation
;;@var{elems} @var{entities} @var{namespaces} @var{seed}
;;@end quotation
;;
;;@var{elems} is as defined for the ELEMS slot of XML-DECL. It may be
;;#f to switch off validation. @var{namespaces} will typically
;;contain @var{user-prefix}es for selected @var{uri-symb}s. The
;;default handler-procedure skips the internal subset, if any, and
;;returns @code{(values #f '() '() seed)}.
;;
;;@item UNDECL-ROOT
;;procedure: @var{elem-gi} @var{seed}
;;
;;where @var{elem-gi} is an UNRES-NAME of the root element. This
;;procedure is called when an XML document under parsing contains
;;@emph{no} DOCTYPE declaration.
;;
;;The handler-procedure, as a DOCTYPE handler procedure above,
;;must generate four values:
;;@quotation
;;@var{elems} @var{entities} @var{namespaces} @var{seed}
;;@end quotation
;;
;;The default handler-procedure returns (values #f '() '() seed)
;;
;;@item DECL-ROOT
;;procedure: @var{elem-gi} @var{seed}
;;
;;where @var{elem-gi} is an UNRES-NAME of the root element. This
;;procedure is called when an XML document under parsing does contains
;;the DOCTYPE declaration. The handler-procedure must generate a new
;;@var{seed} (and verify that the name of the root element matches the
;;doctype, if the handler so wishes). The default handler-procedure
;;is the identity function.
;;
;;@item NEW-LEVEL-SEED
;;procedure: see ssax:make-elem-parser, my-new-level-seed
;;
;;@item FINISH-ELEMENT
;;procedure: see ssax:make-elem-parser, my-finish-element
;;
;;@item CHAR-DATA-HANDLER
;;procedure: see ssax:make-elem-parser, my-char-data-handler
;;
;;@item PROCESSING-INSTRUCTIONS
;;association list as is passed to @code{ssax:make-pi-parser}.
;;The default value is '()
;;
;;@end table
;;
;;The generated parser is a procedure of arguments @var{port} and
;;@var{seed}.
;;
;;This procedure parses the document prolog and then exits to an
;;element parser (created by @code{ssax:make-elem-parser}) to handle
;;the rest.
;;
;;@example
;;[1] document ::= prolog element Misc*
;;[22] prolog ::= XMLDecl? Misc* (doctypedec | Misc*)?
;;[27] Misc ::= Comment | PI | S
;;[28] doctypedecl ::= ''
;;[29] markupdecl ::= elementdecl | AttlistDecl
;; | EntityDecl
;; | NotationDecl | PI
;; | Comment
;;@end example
(define ssax:make-parser
(let ((descriptors
`((DOCTYPE
,(lambda (port docname systemid internal-subset? seed)
(cond (internal-subset?
(ssax:skip-internal-dtd port)))
(slib:warn port "DOCTYPE DECL " docname " "
systemid " found and skipped")
(values #f '() '() seed)
))
(UNDECL-ROOT
,(lambda (elem-gi seed) (values #f '() '() seed)))
(DECL-ROOT
,(lambda (elem-gi seed) seed))
(NEW-LEVEL-SEED) ; required
(FINISH-ELEMENT) ; required
(CHAR-DATA-HANDLER) ; required
(PROCESSING-INSTRUCTIONS ())
)))
(lambda proplist
(define count 0)
(if (odd? (length proplist))
(slib:error 'ssax:make-parser "takes even number of arguments"
proplist))
(let ((posititional-args
(map (lambda (spec)
(define ptail (member (car spec) proplist))
(cond ((and ptail (odd? (length ptail)))
(slib:error 'ssax:make-parser 'bad 'argument ptail))
(ptail
(set! count (+ 1 count))
(cadr ptail))
((not (null? (cdr spec)))
(cadr spec))
(else
(slib:error
'ssax:make-parser 'missing (car spec) 'property))))
descriptors)))
(if (= count (quotient (length proplist) 2))
(apply ssax:make-parser/positional-args posititional-args)
(slib:error 'ssax:make-parser 'extra 'arguments proplist))))))
;;@subsection Parsing XML to SXML
;;@body
;;
;;This is an instance of the SSAX parser that returns an SXML
;;representation of the XML document to be read from @1. @2 is a list
;;of @code{(@var{user-prefix} . @var{uri-string})} that assigns
;;@var{user-prefix}es to certain namespaces identified by particular
;;@var{uri-string}s. It may be an empty list. @0 returns an SXML
;;tree. The port points out to the first character after the root
;;element.
(define (ssax:xml->sxml port namespace-prefix-assig)
(define namespaces
(map (lambda (el) (cons* #f (car el) (ssax:uri-string->symbol (cdr el))))
namespace-prefix-assig))
(define (RES-NAME->SXML res-name)
(string->symbol
(string-append
(symbol->string (car res-name))
":"
(symbol->string (cdr res-name)))))
(let ((result
(reverse
((ssax:make-parser
'DOCTYPE
(lambda (port docname systemid internal-subset? seed)
(cond (internal-subset?
(ssax:skip-internal-dtd port)))
(slib:warn port "DOCTYPE DECL " docname " "
systemid " found and skipped")
(values #f '() namespaces seed))
'NEW-LEVEL-SEED
(lambda (elem-gi attributes namespaces expected-content seed)
'())
'FINISH-ELEMENT
(lambda (elem-gi attributes namespaces parent-seed seed)
(define nseed (ssax:reverse-collect-str-drop-ws seed))
(define attrs
(attlist-fold
(lambda (attr accum)
(cons (list (if (symbol? (car attr))
(car attr)
(RES-NAME->SXML (car attr)))
(cdr attr))
accum))
'() attributes))
(cons (cons (if (symbol? elem-gi)
elem-gi
(RES-NAME->SXML elem-gi))
(if (null? attrs)
nseed
(cons (cons '@ attrs) nseed)))
parent-seed))
'CHAR-DATA-HANDLER
(lambda (string1 string2 seed)
(if (string-null? string2)
(cons string1 seed)
(cons* string2 string1 seed)))
'UNDECL-ROOT
(lambda (elem-gi seed)
(values #f '() namespaces seed))
'PROCESSING-INSTRUCTIONS
(list
(cons '*DEFAULT*
(lambda (port pi-tag seed)
(cons (list '*PROCESSING-INSTRUCTIONS*
pi-tag
(ssax:read-pi-body-as-string port))
seed))))
)
port
'()))))
(cons '*TOP*
(if (null? namespace-prefix-assig)
result
(cons
(list '@ (cons '*NAMESPACES*
(map (lambda (ns) (list (car ns) (cdr ns)))
namespace-prefix-assig)))
result)))))