;;;; HTML scan calls procedures for word, tag, whitespac, and newline. ;;; Copyright 2002 Aubrey Jaffer ; ;Permission to copy this software, to modify it, to redistribute it, ;to distribute modified versions, and to use it for any purpose is ;granted, subject to the following restrictions and understandings. ; ;1. Any copy made of this software must include this copyright notice ;in full. ; ;2. I have made no warranty or representation that the operation of ;this software will be error-free, and I am under no obligation to ;provide any services, by way of maintenance, update, or otherwise. ; ;3. In conjunction with products arising from the use of this ;material, there shall be no use of my name in any advertising, ;promotional, or sales literature without prior written consent in ;each case. (require 'line-i/o) (require 'string-port) (require 'scanf) (require-if 'compiling 'string-case) ;;@code{(require 'html-for-each)} ;;@ftindex html-for-each ;;@body ;;@1 is an input port or a string naming an existing file containing ;;HTML text. ;;@2 is a procedure of one argument or #f. ;;@3 is a procedure of one argument or #f. ;;@4 is a procedure of one argument or #f. ;;@5 is a procedure of no arguments or #f. ;; ;;@0 opens and reads characters from port @1 or the file named by ;;string @1. Sequential groups of characters are assembled into ;;strings which are either ;; ;;@itemize @bullet ;;@item ;;enclosed by @samp{<} and @samp{>} (hypertext markups or comments); ;;@item ;;end-of-line; ;;@item ;;whitespace; or ;;@item ;;none of the above (words). ;;@end itemize ;; ;;Procedures are called according to these distinctions in order of ;;the string's occurrence in @1. ;; ;;@5 is called with no arguments for end-of-line @emph{not within a ;;markup or comment}. ;; ;;@4 is called with strings of non-newline whitespace. ;; ;;@3 is called with hypertext markup strings (including @samp{<} and ;;@samp{>}). ;; ;;@2 is called with the remaining strings. ;; ;;@0 returns an unspecified value. (define (html-for-each file word-proc markup-proc white-proc newline-proc) (define nl (string #\newline)) (define (string-index str . chrs) (define len (string-length str)) (do ((pos 0 (+ 1 pos))) ((or (>= pos len) (memv (string-ref str pos) chrs)) (and (< pos len) pos)))) (define (proc-words line edx) (let loop ((idx 0)) (define ldx idx) (do ((idx idx (+ 1 idx))) ((or (>= idx edx) (not (char-whitespace? (string-ref line idx)))) (do ((jdx idx (+ 1 jdx))) ((or (>= jdx edx) (char-whitespace? (string-ref line jdx))) (and white-proc (not (= ldx idx)) (white-proc (substring line ldx idx))) (and word-proc (not (= idx jdx)) (word-proc (substring line idx jdx))) (if (< jdx edx) (loop jdx)))))))) ((if (input-port? file) call-with-open-ports call-with-input-file) file (lambda (iport) (do ((line (read-line iport) (read-line iport))) ((eof-object? line)) (do ((idx (string-index line #\<) (string-index line #\<))) ((not idx) (proc-words line (string-length line))) ; seen '<' (proc-words line idx) (let ((trm (if (and (<= (+ 4 idx) (string-length line)) (string=? "" #\>))) (let loop ((lne (substring line idx (string-length line))) (tag "") (quot #f)) (define edx (or (eof-object? lne) (if quot (string-index lne quot) (if (char? trm) (string-index lne #\" #\' #\>) (string-index lne #\>))))) (cond ((not edx) ; still inside tag ;;(print quot trm 'within-tag lne) (loop (read-line iport) (and markup-proc (string-append tag lne nl)) quot)) ((eqv? #t edx) ; EOF ;;(print quot trm 'eof lne) (slib:error 'unterminated 'HTML 'entity file) (and markup-proc (markup-proc tag))) ((eqv? quot (string-ref lne edx)) ; end of quoted string ;;(print quot trm 'end-quote lne) (set! edx (+ 1 edx)) (loop (substring lne edx (string-length lne)) (and markup-proc (string-append tag (substring lne 0 edx))) #f)) ((not (eqv? #\> (string-ref lne edx))) ; start of quoted ;;(print quot trm 'start-quote lne) (set! edx (+ 1 edx)) (loop (substring lne edx (string-length lne)) (and markup-proc (string-append tag (substring lne 0 edx))) (string-ref lne (+ -1 edx)))) ((or (and (string? trm) ; found matching '>' or '-->' (<= 2 edx) (equal? trm (substring lne (+ -2 edx) (+ 1 edx)))) (eqv? (string-ref lne edx) trm)) ;;(print quot trm 'end-> lne) (set! edx (+ 1 edx)) (and markup-proc (markup-proc (string-append tag (substring lne 0 edx)))) ; process words after '>' (set! line (substring lne edx (string-length lne)))) (else ;;(print quot trm 'within-comment lne) (set! edx (+ 1 edx)) (loop (substring lne edx (string-length lne)) (and markup-proc (string-append tag (substring lne 0 edx))) #f)))))) (and newline-proc (newline-proc)))))) ;;@args file limit ;;@args file ;;@1 is an input port or a string naming an existing file containing ;;HTML text. If supplied, @2 must be an integer. @2 defaults to ;;1000. ;; ;;@0 opens and reads HTML from port @1 or the file named by string @1, ;;until reaching the (mandatory) @samp{TITLE} field. @0 returns the ;;title string with adjacent whitespaces collapsed to one space. @0 ;;returns #f if the title field is empty, absent, if the first ;;character read from @1 is not @samp{#\<}, or if the end of title is ;;not found within the first (approximately) @2 words. (define (html:read-title file . limit) (set! limit (if (null? limit) 1000 (* 2 (car limit)))) ((if (input-port? file) call-with-open-ports call-with-input-file) file (lambda (port) (and (eqv? #\< (peek-char port)) (call-with-current-continuation (lambda (return) (define (cnt . args) (if (negative? limit) (return #f) (set! limit (+ -1 limit)))) (define capturing? #f) (define text '()) (html-for-each port (lambda (str) (cnt) (if capturing? (set! text (cons " " (cons str text))))) (lambda (str) (cnt) (cond ((prefix-ci? "