diff options
Diffstat (limited to 'html4each.scm')
-rw-r--r-- | html4each.scm | 240 |
1 files changed, 240 insertions, 0 deletions
diff --git a/html4each.scm b/html4each.scm new file mode 100644 index 0000000..02e666e --- /dev/null +++ b/html4each.scm @@ -0,0 +1,240 @@ +;;;; HTML scan calls procedures for word, tag, whitespac, and newline. +;;; Copyright 2002 Aubrey Jaffer +; +;Permission to copy this software, to modify it, to redistribute it, +;to distribute modified versions, and to use it for any purpose is +;granted, subject to the following restrictions and understandings. +; +;1. Any copy made of this software must include this copyright notice +;in full. +; +;2. I have made no warranty or representation that the operation of +;this software will be error-free, and I am under no obligation to +;provide any services, by way of maintenance, update, or otherwise. +; +;3. In conjunction with products arising from the use of this +;material, there shall be no use of my name in any advertising, +;promotional, or sales literature without prior written consent in +;each case. + +(require 'line-i/o) +(require 'string-port) +(require 'scanf) +(require-if 'compiling 'string-case) + +;;@code{(require 'html-for-each)} +;;@ftindex html-for-each + +;;@body +;;@1 is an input port or a string naming an existing file containing +;;HTML text. +;;@2 is a procedure of one argument or #f. +;;@3 is a procedure of one argument or #f. +;;@4 is a procedure of one argument or #f. +;;@5 is a procedure of no arguments or #f. +;; +;;@0 opens and reads characters from port @1 or the file named by +;;string @1. Sequential groups of characters are assembled into +;;strings which are either +;; +;;@itemize @bullet +;;@item +;;enclosed by @samp{<} and @samp{>} (hypertext markups or comments); +;;@item +;;end-of-line; +;;@item +;;whitespace; or +;;@item +;;none of the above (words). +;;@end itemize +;; +;;Procedures are called according to these distinctions in order of +;;the string's occurrence in @1. +;; +;;@5 is called with no arguments for end-of-line @emph{not within a +;;markup or comment}. +;; +;;@4 is called with strings of non-newline whitespace. +;; +;;@3 is called with hypertext markup strings (including @samp{<} and +;;@samp{>}). +;; +;;@2 is called with the remaining strings. +;; +;;@0 returns an unspecified value. +(define (html-for-each file word-proc markup-proc white-proc newline-proc) + (define nl (string #\newline)) + (define (string-index str . chrs) + (define len (string-length str)) + (do ((pos 0 (+ 1 pos))) + ((or (>= pos len) (memv (string-ref str pos) chrs)) + (and (< pos len) pos)))) + (define (proc-words line edx) + (let loop ((idx 0)) + (define ldx idx) + (do ((idx idx (+ 1 idx))) + ((or (>= idx edx) + (not (char-whitespace? (string-ref line idx)))) + (do ((jdx idx (+ 1 jdx))) + ((or (>= jdx edx) + (char-whitespace? (string-ref line jdx))) + (and white-proc (not (= ldx idx)) + (white-proc (substring line ldx idx))) + (and word-proc (not (= idx jdx)) + (word-proc (substring line idx jdx))) + (if (< jdx edx) (loop jdx)))))))) + ((if (input-port? file) call-with-open-ports call-with-input-file) + file + (lambda (iport) + (do ((line (read-line iport) (read-line iport))) + ((eof-object? line)) + (do ((idx (string-index line #\<) (string-index line #\<))) + ((not idx) (proc-words line (string-length line))) + ; seen '<' + (proc-words line idx) + (let ((trm (if (and (<= (+ 4 idx) (string-length line)) + (string=? "<!--" (substring line idx (+ 4 idx)))) + "-->" #\>))) + (let loop ((lne (substring line idx (string-length line))) + (tag "") + (quot #f)) + (define edx (or (eof-object? lne) + (if quot + (string-index lne quot) + (if (char? trm) + (string-index lne #\" #\' #\>) + (string-index lne #\>))))) + (cond + ((not edx) ; still inside tag + ;;(print quot trm 'within-tag lne) + (loop (read-line iport) + (and markup-proc (string-append tag lne nl)) + quot)) + ((eqv? #t edx) ; EOF + ;;(print quot trm 'eof lne) + (slib:error 'unterminated 'HTML 'entity file) + (and markup-proc (markup-proc tag))) + ((eqv? quot (string-ref lne edx)) ; end of quoted string + ;;(print quot trm 'end-quote lne) + (set! edx (+ 1 edx)) + (loop (substring lne edx (string-length lne)) + (and markup-proc + (string-append tag (substring lne 0 edx))) + #f)) + ((not (eqv? #\> (string-ref lne edx))) ; start of quoted + ;;(print quot trm 'start-quote lne) + (set! edx (+ 1 edx)) + (loop (substring lne edx (string-length lne)) + (and markup-proc + (string-append tag (substring lne 0 edx))) + (string-ref lne (+ -1 edx)))) + ((or (and (string? trm) ; found matching '>' or '-->' + (<= 2 edx) + (equal? trm (substring lne (+ -2 edx) (+ 1 edx)))) + (eqv? (string-ref lne edx) trm)) + ;;(print quot trm 'end-> lne) + (set! edx (+ 1 edx)) + (and markup-proc + (markup-proc (string-append tag (substring lne 0 edx)))) + ; process words after '>' + (set! line (substring lne edx (string-length lne)))) + (else + ;;(print quot trm 'within-comment lne) + (set! edx (+ 1 edx)) + (loop (substring lne edx (string-length lne)) + (and markup-proc + (string-append tag (substring lne 0 edx))) + #f)))))) + (and newline-proc (newline-proc)))))) + +;;@args file limit +;;@args file +;;@1 is an input port or a string naming an existing file containing +;;HTML text. If supplied, @2 must be an integer. @2 defaults to +;;1000. +;; +;;@0 opens and reads HTML from port @1 or the file named by string @1, +;;until reaching the (mandatory) @samp{TITLE} field. @0 returns the +;;title string with adjacent whitespaces collapsed to one space. @0 +;;returns #f if the title field is empty, absent, if the first +;;character read from @1 is not @samp{#\<}, or if the end of title is +;;not found within the first (approximately) @2 words. +(define (html:read-title file . limit) + (set! limit (if (null? limit) 1000 (* 2 (car limit)))) + ((if (input-port? file) call-with-open-ports call-with-input-file) + file + (lambda (port) + (and (eqv? #\< (peek-char port)) + (call-with-current-continuation + (lambda (return) + (define (cnt . args) + (if (negative? limit) + (return #f) + (set! limit (+ -1 limit)))) + (define capturing? #f) + (define text '()) + (html-for-each + port + (lambda (str) + (cnt) + (if capturing? (set! text (cons " " (cons str text))))) + (lambda (str) + (cnt) + (cond ((prefix-ci? "<title" str) + (set! capturing? #t)) + ((prefix-ci? "</title" str) + (return (and (not (null? text)) + (apply string-append + (reverse (cdr text)))))) + ((or (prefix-ci? "</head" str) + (prefix-ci? "<body" str)) + (return #f)))) + cnt + cnt) + #f)))))) + +(define (prefix-ci? pre str) + (define prelen (string-length pre)) + (and (< prelen (string-length str)) + (string-ci=? pre (substring str 0 prelen)))) + +;;@body +;;@1 is a hypertext markup string. +;; +;;If @1 is a (hypertext) comment, then @0 returns #f. +;;Otherwise @0 returns the hypertext element symbol (created by +;;@code{string-ci->symbol}) consed onto an association list of the +;;attribute name-symbols and values. Each value is a number or +;;string; or #t if the name had no value assigned within the markup. +(define (htm-fields htm) + (require 'string-case) + (and + (not (and (> (string-length htm) 4) (equal? "<!--" (substring htm 0 4)))) + (call-with-input-string htm + (lambda (port) + (define element #f) + (define fields '()) + (cond ((not (eqv? 1 (fscanf port "<%s" element))) + (slib:error 'htm-fields 'strange htm))) + (let loop ((chr (peek-char port))) + (define name #f) + (define junk #f) + (define value #t) + (cond + ((eof-object? chr) (slib:warn 'htm-fields 'missing '> htm) + (reverse fields)) + ((eqv? #\> chr) (cons element (reverse fields))) + ((char-whitespace? chr) (read-char port) (loop (peek-char port))) + ((case (fscanf port "%[a-zA-Z0-9]%[=]%[-.a-zA-Z0-9]" name junk value) + ((3 1) #t) + ((2) + (case (peek-char port) + ((#\") (eqv? 1 (fscanf port "\"%[^\"]\"" value))) + ((#\') (eqv? 1 (fscanf port "'%[^']'" value))) + (else #f))) + (else #f)) + (set! fields (cons (cons (string-ci->symbol name) + (or (string->number value) value)) + fields)) + (loop (peek-char port))) + (else (slib:warn 'htm-fields 'bad 'field htm) (reverse fields)))))))) |