aboutsummaryrefslogtreecommitdiffstats
path: root/html4each.scm
diff options
context:
space:
mode:
Diffstat (limited to 'html4each.scm')
-rw-r--r--html4each.scm240
1 files changed, 240 insertions, 0 deletions
diff --git a/html4each.scm b/html4each.scm
new file mode 100644
index 0000000..02e666e
--- /dev/null
+++ b/html4each.scm
@@ -0,0 +1,240 @@
+;;;; HTML scan calls procedures for word, tag, whitespace, and newline.
+;;; Copyright 2002 Aubrey Jaffer
+;
+;Permission to copy this software, to modify it, to redistribute it,
+;to distribute modified versions, and to use it for any purpose is
+;granted, subject to the following restrictions and understandings.
+;
+;1. Any copy made of this software must include this copyright notice
+;in full.
+;
+;2. I have made no warranty or representation that the operation of
+;this software will be error-free, and I am under no obligation to
+;provide any services, by way of maintenance, update, or otherwise.
+;
+;3. In conjunction with products arising from the use of this
+;material, there shall be no use of my name in any advertising,
+;promotional, or sales literature without prior written consent in
+;each case.
+
+(require 'line-i/o)
+(require 'string-port)
+(require 'scanf)
+(require-if 'compiling 'string-case)
+
+;;@code{(require 'html-for-each)}
+;;@ftindex html-for-each
+
+;;@body
+;;@1 is an input port or a string naming an existing file containing
+;;HTML text.
+;;@2 is a procedure of one argument or #f.
+;;@3 is a procedure of one argument or #f.
+;;@4 is a procedure of one argument or #f.
+;;@5 is a procedure of no arguments or #f.
+;;
+;;@0 opens and reads characters from port @1 or the file named by
+;;string @1. Sequential groups of characters are assembled into
+;;strings which are either
+;;
+;;@itemize @bullet
+;;@item
+;;enclosed by @samp{<} and @samp{>} (hypertext markups or comments);
+;;@item
+;;end-of-line;
+;;@item
+;;whitespace; or
+;;@item
+;;none of the above (words).
+;;@end itemize
+;;
+;;Procedures are called according to these distinctions in order of
+;;the string's occurrence in @1.
+;;
+;;@5 is called with no arguments for end-of-line @emph{not within a
+;;markup or comment}.
+;;
+;;@4 is called with strings of non-newline whitespace.
+;;
+;;@3 is called with hypertext markup strings (including @samp{<} and
+;;@samp{>}).
+;;
+;;@2 is called with the remaining strings.
+;;
+;;@0 returns an unspecified value.
+(define (html-for-each file word-proc markup-proc white-proc newline-proc) ; scan HTML from FILE, handing each piece to one of four callbacks; every callback may be #f
+ (define nl (string #\newline)) ; re-inserted between the lines of a markup that spans several input lines
+ (define (string-index str . chrs) ; index of the first char of STR that is one of CHRS, or #f when there is none
+ (define len (string-length str))
+ (do ((pos 0 (+ 1 pos)))
+ ((or (>= pos len) (memv (string-ref str pos) chrs))
+ (and (< pos len) pos))))
+ (define (proc-words line edx) ; report the text LINE[0, EDX) as alternating whitespace runs and words
+ (let loop ((idx 0))
+ (define ldx idx) ; start of the current whitespace run
+ (do ((idx idx (+ 1 idx))) ; advance IDX past the whitespace run
+ ((or (>= idx edx)
+ (not (char-whitespace? (string-ref line idx))))
+ (do ((jdx idx (+ 1 jdx))) ; advance JDX past the word that starts at IDX
+ ((or (>= jdx edx)
+ (char-whitespace? (string-ref line jdx)))
+ (and white-proc (not (= ldx idx)) ; an empty whitespace run is not reported
+ (white-proc (substring line ldx idx)))
+ (and word-proc (not (= idx jdx)) ; an empty word is not reported
+ (word-proc (substring line idx jdx)))
+ (if (< jdx edx) (loop jdx)))))))) ; text remains before EDX: scan the next run/word pair
+ ((if (input-port? file) call-with-open-ports call-with-input-file) ; an input port goes to call-with-open-ports, anything else to call-with-input-file
+ file
+ (lambda (iport)
+ (do ((line (read-line iport) (read-line iport))) ; one iteration per line read here
+ ((eof-object? line))
+ (do ((idx (string-index line #\<) (string-index line #\<))) ; one iteration per '<' in LINE; LINE is replaced below by the text after each finished markup
+ ((not idx) (proc-words line (string-length line))) ; no '<' left: the rest of LINE is plain text
+ ; seen '<'
+ (proc-words line idx) ; plain text in front of the '<'
+ (let ((trm (if (and (<= (+ 4 idx) (string-length line))
+ (string=? "<!--" (substring line idx (+ 4 idx))))
+ "-->" #\>))) ; TRM is the terminator: the string "-->" for a comment, the char '>' for any other markup
+ (let loop ((lne (substring line idx (string-length line))) ; LNE: text not yet scanned (or the EOF object)
+ (tag "") ; TAG: markup text gathered so far; becomes #f when MARKUP-PROC is #f
+ (quot #f)) ; QUOT: the quote char of the quoted string being scanned, or #f
+ (define edx (or (eof-object? lne) ; EDX is #t at EOF, else the index of the next char of interest in LNE, else #f
+ (if quot
+ (string-index lne quot) ; inside quotes only the matching quote char matters
+ (if (char? trm)
+ (string-index lne #\" #\' #\>) ; ordinary markup: a quote or the closing '>'
+ (string-index lne #\>))))) ; comment: quote chars are not searched for
+ (cond
+ ((not edx) ; still inside tag
+ ;;(print quot trm 'within-tag lne)
+ (loop (read-line iport) ; continue on the next input line, keeping the quote state
+ (and markup-proc (string-append tag lne nl))
+ quot))
+ ((eqv? #t edx) ; EOF
+ ;;(print quot trm 'eof lne)
+ (slib:error 'unterminated 'HTML 'entity file)
+ (and markup-proc (markup-proc tag))) ; NOTE(review): presumably slib:error does not return, so this line would not run -- confirm
+ ((eqv? quot (string-ref lne edx)) ; end of quoted string
+ ;;(print quot trm 'end-quote lne)
+ (set! edx (+ 1 edx)) ; step past the closing quote
+ (loop (substring lne edx (string-length lne))
+ (and markup-proc
+ (string-append tag (substring lne 0 edx)))
+ #f))
+ ((not (eqv? #\> (string-ref lne edx))) ; start of quoted
+ ;;(print quot trm 'start-quote lne)
+ (set! edx (+ 1 edx)) ; step past the opening quote
+ (loop (substring lne edx (string-length lne))
+ (and markup-proc
+ (string-append tag (substring lne 0 edx)))
+ (string-ref lne (+ -1 edx)))) ; the quote char just passed becomes QUOT
+ ((or (and (string? trm) ; found matching '>' or '-->'
+ (<= 2 edx)
+ (equal? trm (substring lne (+ -2 edx) (+ 1 edx)))) ; a comment ends only when the '>' is preceded by "--"
+ (eqv? (string-ref lne edx) trm))
+ ;;(print quot trm 'end-> lne)
+ (set! edx (+ 1 edx)) ; include the '>' in the markup string
+ (and markup-proc
+ (markup-proc (string-append tag (substring lne 0 edx))))
+ ; process words after '>'
+ (set! line (substring lne edx (string-length lne)))) ; the enclosing do rescans this remainder for the next '<'
+ (else ; a '>' inside a comment that is not preceded by "--"
+ ;;(print quot trm 'within-comment lne)
+ (set! edx (+ 1 edx))
+ (loop (substring lne edx (string-length lne))
+ (and markup-proc
+ (string-append tag (substring lne 0 edx)))
+ #f))))))
+ (and newline-proc (newline-proc)))))) ; once per iteration of the outer line loop; lines read inside a multi-line markup get no call of their own
+
+;;@args file limit
+;;@args file
+;;@1 is an input port or a string naming an existing file containing
+;;HTML text. If supplied, @2 must be an integer. @2 defaults to
+;;1000.
+;;
+;;@0 opens and reads HTML from port @1 or the file named by string @1,
+;;until reaching the (mandatory) @samp{TITLE} field. @0 returns the
+;;title string with adjacent whitespaces collapsed to one space. @0
+;;returns #f if the title field is empty, absent, if the first
+;;character read from @1 is not @samp{#\<}, or if the end of title is
+;;not found within the first (approximately) @2 words.
+(define (html:read-title file . limit)
+  ;; Returns the text between the <title ...> and </title ...> markups of
+  ;; FILE (an input port or a file name) with its words joined by single
+  ;; spaces, or #f when no title text is obtained.
+  ;;
+  ;; BUDGET bounds the number of callback invocations made by
+  ;; html-for-each: a caller-supplied limit is doubled, the default of
+  ;; 1000 is used as is.
+  (let ((budget (if (pair? limit) (* 2 (car limit)) 1000))
+        (with-port (if (input-port? file)
+                       call-with-open-ports
+                       call-with-input-file)))
+    (with-port
+     file
+     (lambda (port)
+       ;; Input whose first character is not `<' is rejected outright.
+       (if (eqv? #\< (peek-char port))
+           (call-with-current-continuation
+            (lambda (return)
+              (define in-title? #f)     ; set once a <title markup is seen
+              (define pieces '())       ; title words, newest first, each
+                                        ; preceded by a " " separator
+              ;; Charges one unit against BUDGET; escapes with #f once
+              ;; the budget has gone negative.  Accepts and ignores any
+              ;; arguments so it can also serve as the whitespace and
+              ;; newline callbacks.
+              (define (tick . ignored)
+                (if (negative? budget)
+                    (return #f)
+                    (set! budget (- budget 1))))
+              (define (on-word str)
+                (tick)
+                (if in-title?
+                    (set! pieces (cons " " (cons str pieces)))))
+              (define (on-markup str)
+                (tick)
+                (cond ((prefix-ci? "<title" str)
+                       (set! in-title? #t))
+                      ((prefix-ci? "</title" str)
+                       ;; (cdr pieces) drops the separator pushed after
+                       ;; the last word.
+                       (return (if (null? pieces)
+                                   #f
+                                   (apply string-append
+                                          (reverse (cdr pieces))))))
+                      ((prefix-ci? "</head" str) (return #f))
+                      ((prefix-ci? "<body" str) (return #f))))
+              (html-for-each port on-word on-markup tick tick)
+              ;; End of input reached without a closing title markup.
+              #f))
+           #f)))))
+
+;;; Returns #t if the string PRE is a prefix of the string STR, ignoring
+;;; case; otherwise returns #f.  A string is a prefix of itself, so the
+;;; length test is `<=' (a strict `<' rejected the equal-length case).
+(define (prefix-ci? pre str)
+  (define prelen (string-length pre))
+  ;; The length guard keeps SUBSTRING from running past the end of STR.
+  (and (<= prelen (string-length str))
+       (string-ci=? pre (substring str 0 prelen))))
+
+;;@body
+;;@1 is a hypertext markup string.
+;;
+;;If @1 is a (hypertext) comment, then @0 returns #f.
+;;Otherwise @0 returns the hypertext element symbol (created by
+;;@code{string-ci->symbol}) consed onto an association list of the
+;;attribute name-symbols and values. Each value is a number or
+;;string; or #t if the name had no value assigned within the markup.
+(define (htm-fields htm)
+  ;; Parses the markup string HTM.  Returns #f for a comment; otherwise
+  ;; returns the element symbol consed onto an association list of
+  ;; (name-symbol . value) pairs in order of appearance.  A value is a
+  ;; number, a string, or #t when the name has no value assigned.
+  ;; Malformed input produces a warning and the fields parsed so far
+  ;; (still headed by the element symbol).
+  (require 'string-case)
+  (and
+   ;; `>=' so that the bare string "<!--" is also treated as a comment.
+   (not (and (>= (string-length htm) 4) (equal? "<!--" (substring htm 0 4))))
+   (call-with-input-string htm
+     (lambda (port)
+       (define element #f)
+       (define fields '())
+       ;; Every exit returns the same shape: element symbol, then the
+       ;; attribute alist.
+       (define (finish)
+         (cons (string-ci->symbol element) (reverse fields)))
+       ;; Skips leading whitespace, then collects characters up to (not
+       ;; including) whitespace, `>' or the end of the string.  A scanf
+       ;; "%s" directive would also swallow the closing `>' of a markup
+       ;; without attributes, such as "<br>".
+       (define (read-element)
+         (let skip ()
+           (let ((chr (peek-char port)))
+             (if (and (char? chr) (char-whitespace? chr))
+                 (begin (read-char port) (skip)))))
+         (let collect ((chars '()))
+           (let ((chr (peek-char port)))
+             (if (or (eof-object? chr)
+                     (eqv? #\> chr)
+                     (char-whitespace? chr))
+                 (list->string (reverse chars))
+                 (collect (cons (read-char port) chars))))))
+       (if (eqv? #\< (peek-char port))
+           (begin (read-char port)
+                  (set! element (read-element))))
+       (if (or (not element) (string=? "" element))
+           (slib:error 'htm-fields 'strange htm))
+       (let loop ((chr (peek-char port)))
+         (define name #f)
+         (define junk #f)
+         (define value #t)              ; stays #t when no value is scanned
+         (cond
+          ((eof-object? chr) (slib:warn 'htm-fields 'missing '> htm)
+           (finish))
+          ((eqv? #\> chr) (finish))
+          ((char-whitespace? chr) (read-char port) (loop (peek-char port)))
+          ;; 3 items: name=value; 1 item: bare name; 2 items: name= followed
+          ;; by something that is not a bare token, so try a quoted value.
+          ((case (fscanf port "%[a-zA-Z0-9]%[=]%[-.a-zA-Z0-9]" name junk value)
+             ((3 1) #t)
+             ((2)
+              (case (peek-char port)
+                ((#\") (eqv? 1 (fscanf port "\"%[^\"]\"" value)))
+                ((#\') (eqv? 1 (fscanf port "'%[^']'" value)))
+                (else #f)))
+             (else #f))
+           ;; string->number is applied only to strings; a valueless
+           ;; attribute keeps its #t.
+           (set! fields (cons (cons (string-ci->symbol name)
+                                    (or (and (string? value)
+                                             (string->number value))
+                                        value))
+                              fields))
+           (loop (peek-char port)))
+          (else (slib:warn 'htm-fields 'bad 'field htm) (finish))))))))