aboutsummaryrefslogtreecommitdiffstats
path: root/html4each.scm
blob: f1156165ddf9a4404c0aa1325647c9e6a98f6fa5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
;;;; HTML scan calls procedures for word, tag, whitespac, and newline.
;;; Copyright 2002 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1.  Any copy made of this software must include this copyright notice
;in full.
;
;2.  I have made no warranty or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3.  In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.

(require 'line-i/o)
(require 'string-port)
(require 'scanf)
(require-if 'compiling 'string-case)

;;@code{(require 'html-for-each)}
;;@ftindex html-for-each

;;@body
;;@1 is an input port or a string naming an existing file containing
;;HTML text.
;;@2 is a procedure of one argument or #f.
;;@3 is a procedure of one argument or #f.
;;@4 is a procedure of one argument or #f.
;;@5 is a procedure of no arguments or #f.
;;
;;@0 opens and reads characters from port @1 or the file named by
;;string @1.  Sequential groups of characters are assembled into
;;strings which are either
;;
;;@itemize @bullet
;;@item
;;enclosed by @samp{<} and @samp{>} (hypertext markups or comments);
;;@item
;;end-of-line;
;;@item
;;whitespace; or
;;@item
;;none of the above (words).
;;@end itemize
;;
;;Procedures are called according to these distinctions in order of
;;the string's occurrence in @1.
;;
;;@5 is called with no arguments for end-of-line @emph{not within a
;;markup or comment}.
;;
;;@4 is called with strings of non-newline whitespace.
;;
;;@3 is called with hypertext markup strings (including @samp{<} and
;;@samp{>}).
;;
;;@2 is called with the remaining strings.
;;
;;@0 returns an unspecified value.
(define (html-for-each file word-proc markup-proc white-proc newline-proc)
  (define nl (string #\newline))
  (define (string-index str . chrs)
    (define len (string-length str))
    (do ((pos 0 (+ 1 pos)))
	((or (>= pos len) (memv (string-ref str pos) chrs))
	 (and (< pos len) pos))))
  (define (proc-words line edx)
    (let loop ((idx 0))
      (define ldx idx)
      (do ((idx idx (+ 1 idx)))
	  ((or (>= idx edx)
	       (not (char-whitespace? (string-ref line idx))))
	   (do ((jdx idx (+ 1 jdx)))
	       ((or (>= jdx edx)
		    (char-whitespace? (string-ref line jdx)))
		(and white-proc (not (= ldx idx))
		     (white-proc (substring line ldx idx)))
		(and word-proc (not (= idx jdx))
		     (word-proc (substring line idx jdx)))
		(if (< jdx edx) (loop jdx))))))))
  ((if (input-port? file) call-with-open-ports call-with-input-file)
   file
   (lambda (iport)
     (do ((line (read-line iport) (read-line iport)))
	 ((eof-object? line))
       (do ((idx (string-index line #\<) (string-index line #\<)))
	   ((not idx) (proc-words line (string-length line)))
					; seen '<'
	 (proc-words line idx)
	 (let ((trm (if (and (<= (+ 4 idx) (string-length line))
			     (string=? "<!--" (substring line idx (+ 4 idx))))
			"-->" #\>)))
	   (let loop ((lne (substring line idx (string-length line)))
		      (tag "")
		      (quot #f))
	     (define edx (or (eof-object? lne)
			     (if quot
				 (string-index lne quot)
				 (if (char? trm)
				     (string-index lne #\" #\' #\>)
				     (string-index lne #\>)))))
	     (cond
	      ((not edx)		; still inside tag
	       ;;(print quot trm 'within-tag lne)
	       (loop (read-line iport)
		     (and markup-proc (string-append tag lne nl))
		     quot))
	      ((eqv? #t edx)		; EOF
	       ;;(print quot trm 'eof lne)
	       (slib:error 'unterminated 'HTML 'entity file)
	       (and markup-proc (markup-proc tag)))
	      ((eqv? quot (string-ref lne edx))	; end of quoted string
	       ;;(print quot trm 'end-quote lne)
	       (set! edx (+ 1 edx))
	       (loop (substring lne edx (string-length lne))
		     (and markup-proc
			  (string-append tag (substring lne 0 edx)))
		     #f))
	      ((not (eqv? #\> (string-ref lne edx))) ; start of quoted
	       ;;(print quot trm 'start-quote lne)
	       (set! edx (+ 1 edx))
	       (loop (substring lne edx (string-length lne))
		     (and markup-proc
			  (string-append tag (substring lne 0 edx)))
		     (string-ref lne (+ -1 edx))))
	      ((or (and (string? trm)	; found matching '>' or '-->'
			(<= 2 edx)
			(equal? trm (substring lne (+ -2 edx) (+ 1 edx))))
		   (eqv? (string-ref lne edx) trm))
	       ;;(print quot trm 'end-> lne)
	       (set! edx (+ 1 edx))
	       (and markup-proc
		    (markup-proc (string-append tag (substring lne 0 edx))))
					; process words after '>'
	       (set! line (substring lne edx (string-length lne))))
	      (else
	       ;;(print quot trm 'within-comment lne)
	       (set! edx (+ 1 edx))
	       (loop (substring lne edx (string-length lne))
		     (and markup-proc
			  (string-append tag (substring lne 0 edx)))
		     #f))))))
       (and newline-proc (newline-proc))))))

;;@args file limit
;;@args file
;;@1 is an input port or a string naming an existing file containing
;;HTML text.  If supplied, @2 must be an integer.  @2 defaults to
;;1000.
;;
;;@0 opens and reads HTML from port @1 or the file named by string @1,
;;until reaching the (mandatory) @samp{TITLE} field.  @0 returns the
;;title string with adjacent whitespaces collapsed to one space.  @0
;;returns #f if the title field is empty, absent, if the first
;;character read from @1 is not @samp{#\<}, or if the end of title is
;;not found within the first (approximately) @2 words.
(define (html:read-title file . limit)
  (set! limit (if (null? limit) 1000 (* 2 (car limit))))
  ((if (input-port? file) call-with-open-ports call-with-input-file)
   file
   (lambda (port)
     (and (eqv? #\< (peek-char port))
	  (call-with-current-continuation
	   (lambda (return)
	     (define (cnt . args)
	       (if (negative? limit)
		   (return #f)
		   (set! limit (+ -1 limit))))
	     (define capturing? #f)
	     (define text '())
	     (html-for-each
	      port
	      (lambda (str)
		(cnt)
		(if capturing? (set! text (cons " " (cons str text)))))
	      (lambda (str)
		(cnt)
		(cond ((prefix-ci? "<title" str)
		       (set! capturing? #t))
		      ((prefix-ci? "</title" str)
		       (return (and (not (null? text))
				    (apply string-append
					   (reverse (cdr text))))))
		      ((or (prefix-ci? "</head" str)
			   (prefix-ci? "<body" str))
		       (return #f))))
	      cnt
	      cnt)
	     #f))))))

(define (prefix-ci? pre str)
  (define prelen (string-length pre))
  (and (< prelen (string-length str))
       (string-ci=? pre (substring str 0 prelen))))

;;@body
;;@1 is a hypertext markup string.
;;
;;If @1 is a (hypertext) comment, then @0 returns #f.
;;Otherwise @0 returns the hypertext element symbol (created by
;;@code{string-ci->symbol}) consed onto an association list of the
;;attribute name-symbols and values.  Each value is a number or
;;string; or #t if the name had no value assigned within the markup.
(define (htm-fields htm)
  (require 'string-case)
  (and
   (not (and (> (string-length htm) 4) (equal? "<!--" (substring htm 0 4))))
   (call-with-input-string htm
     (lambda (port)
       (define element #f)
       (define fields '())
       (cond ((not (eqv? 1 (fscanf port "<%s" element)))
	      (slib:error 'htm-fields 'strange htm)))
       (let loop ((chr (peek-char port)))
	 (define name #f)
	 (define junk #f)
	 (define value #t)
	 (cond
	  ((eof-object? chr) (slib:warn 'htm-fields 'missing '> htm)
	   (reverse fields))
	  ((eqv? #\> chr) (cons element (reverse fields)))
	  ((char-whitespace? chr) (read-char port) (loop (peek-char port)))
	  ((case (fscanf port "%[a-zA-Z0-9]%[=]%[-.a-zA-Z0-9]"
			 name junk value)
	     ((3 1) #t)
	     ((2)
	      (case (peek-char port)
		((#\") (cond ((eqv? 1 (fscanf port "\"%[^\"]\"" value)))
			     ((eqv? #\" (peek-char port))
			      (read-char port)
			      (set! value ""))
			     (else #f)))
		((#\') (cond ((eqv? 1 (fscanf port "'%[^']'" value)))
			     ((eqv? #\' (peek-char port))
			      (read-char port)
			      (set! value ""))
			     (else #f)))
		(else #f)))
	     (else #f))
	   (set! fields (cons (cons (string-ci->symbol name)
				    (if (string? value)
					(or (string->number value) value)
					value))
			      fields))
	   (loop (peek-char port)))
	  (else (slib:warn 'htm-fields 'bad 'field htm)
		(reverse fields))))))))