aboutsummaryrefslogtreecommitdiffstats
path: root/strsrch.scm
blob: a73023492816d0b6a756aff061caca99783b359f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
;;; "strsrch.scm" Search for string from port.
; Written 1995, 1996 by Oleg Kiselyov (oleg@acm.org)
; Modified 1996, 1997, 1998, 2001 by A. Jaffer (agj@alum.mit.edu)
; Modified 2003 by Steve VanDevender (stevev@hexadecimal.uoregon.edu)
;
; This code is in the public domain.

;;;@ Return the index of the first occurence of chr in str, or #f
(define (string-index str chr)
  (define len (string-length str))
  (do ((pos 0 (+ 1 pos)))
      ((or (>= pos len) (char=? chr (string-ref str pos)))
       (and (< pos len) pos))))
;@
(define (string-index-ci str chr)
  (define len (string-length str))
  (do ((pos 0 (+ 1 pos)))
      ((or (>= pos len) (char-ci=? chr (string-ref str pos)))
       (and (< pos len) pos))))
;@
(define (string-reverse-index str chr)
  (do ((pos (+ -1 (string-length str)) (+ -1 pos)))
      ((or (negative? pos) (char=? (string-ref str pos) chr))
       (and (not (negative? pos)) pos))))
;@
(define (string-reverse-index-ci str chr)
  (do ((pos (+ -1 (string-length str)) (+ -1 pos)))
      ((or (negative? pos) (char-ci=? (string-ref str pos) chr))
       (and (not (negative? pos)) pos))))
;@
(define (substring? pat str)
  (define patlen (string-length pat))
  (define strlen (string-length str))
  (cond ((zero? patlen) 0)		; trivial match
	((>= patlen strlen) (and (= patlen strlen) (string=? pat str) 0))
	;; use faster string-index to match a single-character pattern
	((= 1 patlen) (string-index str (string-ref pat 0)))
	((or (<= strlen (+ patlen patlen (quotient char-code-limit 2)))
	     (<= patlen 4))
	 (subloop pat patlen str strlen char=?))
	(else
	 ;; compute skip values for search pattern characters
	 ;; for all c not in pat, skip[c] = patlen + 1
	 ;; for c in pat, skip[c] is distance of rightmost occurrence
	 ;;  of c from end of str
	 (let ((skip (make-vector char-code-limit (+ patlen 1))))
	   (do ((i 0 (+ i 1)))
	       ((= i patlen))
	     (vector-set! skip (char->integer (string-ref pat i))
			  (- patlen i)))
	   (subskip skip pat patlen str strlen char=?)))))
;@
(define (substring-ci? pat str)
  (define patlen (string-length pat))
  (define strlen (string-length str))
  (cond ((zero? patlen) 0)		; trivial match
	((>= patlen strlen) (and (= patlen strlen) (string-ci=? pat str) 0))
	((= 1 patlen) (string-index-ci str (string-ref pat 0)))
	((or (<= strlen (+ patlen patlen (quotient char-code-limit 2)))
	     (<= patlen 4))
	 (subloop pat patlen str strlen char-ci=?))
	(else
	 (let ((skip (make-vector char-code-limit (+ patlen 1))))
	   (do ((i 0 (+ i 1)))
	       ((= i patlen))
	     (let ((c (string-ref pat i))
		   (d (- patlen i)))
	       ;; use same skip value for both upper- and lowercase characters
	       (vector-set! skip (char->integer (char-upcase c)) d)
	       (vector-set! skip (char->integer (char-downcase c)) d)))
	   (subskip skip pat patlen str strlen char-ci=?)))))

(define (subskip skip pat patlen str strlen char=)
  (do ((k patlen (if (< k strlen)
		     (+ k (vector-ref skip (char->integer (string-ref str k))))
		     (+ strlen 1))))
      ((or (> k strlen)
	   (do ((i 0 (+ i 1))
		(j (- k patlen) (+ j 1)))
	       ((or (= i patlen)
		    (not (char= (string-ref pat i) (string-ref str j))))
		(= i patlen))))
       (and (<= k strlen) (- k patlen)))))

;;; Assumes that PATLEN > 1
(define (subloop pat patlen str strlen char=)
  (define span (- strlen patlen))
  (define c1 (string-ref pat 0))
  (define c2 (string-ref pat 1))
  (let outer ((pos 0))
    (cond
     ((> pos span) #f)		; nothing was found thru the whole str
     ((not (char= c1 (string-ref str pos)))
      (outer (+ 1 pos)))	; keep looking for the right beginning
     ((not (char= c2 (string-ref str (+ 1 pos))))
      (outer (+ 1 pos)))	 ; could've done pos+2 if c1 == c2....
     (else			  ; two char matched: high probability
					; the rest will match too
      (let inner ((pdx 2) (sdx (+ 2 pos)))
	(if (>= pdx patlen) pos	; the whole pat matched
	    (if (char= (string-ref pat pdx)
		       (string-ref str sdx))
		(inner (+ 1 pdx) (+ 1 sdx))
		;; mismatch after partial match
		(outer (+ 1 pos)))))))))
;@
(define (find-string-from-port? str <input-port> . max-no-char)
  (set! max-no-char (if (null? max-no-char) #f (car max-no-char)))
  (letrec
      ((no-chars-read 0)
       (peeked? #f)
       (my-peek-char			; Return a peeked char or #f
	(lambda () (and (or (not (number? max-no-char))
			    (< no-chars-read max-no-char))
			(let ((c (peek-char <input-port>)))
			  (cond (peeked? c)
				((eof-object? c) #f)
				((procedure? max-no-char)
				 (set! peeked? #t)
				 (if (max-no-char c) #f c))
				((eqv? max-no-char c) #f)
				(else c))))))
       (next-char (lambda () (set! peeked? #f) (read-char <input-port>)
			  (set! no-chars-read  (+ 1 no-chars-read))))
       (match-1st-char			; of the string str
	(lambda ()
	  (let ((c (my-peek-char)))
	    (and c
		 (begin (next-char)
			(if (char=? c (string-ref str 0))
			    (match-other-chars 1)
			    (match-1st-char)))))))
       ;; There has been a partial match, up to the point pos-to-match
       ;; (for example, str[0] has been found in the stream)
       ;; Now look to see if str[pos-to-match] for would be found, too
       (match-other-chars
	(lambda (pos-to-match)
	  (if (>= pos-to-match (string-length str))
	      no-chars-read	       ; the entire string has matched
	      (let ((c (my-peek-char)))
		(and c
		     (if (not (char=? c (string-ref str pos-to-match)))
			 (backtrack 1 pos-to-match)
			 (begin (next-char)
				(match-other-chars (+ 1 pos-to-match)))))))))

       ;; There had been a partial match, but then a wrong char showed up.
       ;; Before discarding previously read (and matched) characters, we check
       ;; to see if there was some smaller partial match. Note, characters read
       ;; so far (which matter) are those of str[0..matched-substr-len - 1]
       ;; In other words, we will check to see if there is such i>0 that
       ;; substr(str,0,j) = substr(str,i,matched-substr-len)
       ;; where j=matched-substr-len - i
       (backtrack
	(lambda (i matched-substr-len)
	  (let ((j (- matched-substr-len i)))
	    (if (<= j 0)
		;; backed off completely to the begining of str
		(match-1st-char)
		(let loop ((k 0))
		  (if (>= k j)
		      (match-other-chars j) ; there was indeed a shorter match
		      (if (char=? (string-ref str k)
				  (string-ref str (+ i k)))
			  (loop (+ 1 k))
			  (backtrack (+ 1 i) matched-substr-len))))))))
       )
    (match-1st-char)))
;@
(define (string-subst text old new . rest)
  (define sub
    (lambda (text)
      (set! text
	    (cond ((equal? "" text) text)
		  ((substring? old text)
		   => (lambda (idx)
			(string-append
			 (substring text 0 idx)
			 new
			 (sub (substring
			       text (+ idx (string-length old))
			       (string-length text))))))
		  (else text)))
      (if (null? rest)
	  text
	  (apply string-subst text rest))))
  (sub text))
;@
(define (count-newlines str)
  (do ((idx (+ -1 (string-length str)) (+ -1 idx))
       (cnt 0 (+ (if (eqv? #\newline (string-ref str idx)) 1 0) cnt)))
      ((<= idx 0) cnt)))