summaryrefslogtreecommitdiffstats
path: root/strsrch.scm
blob: a08510ed39a7bade382f1be06c5ceab32397ffbc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
;;; "MISCIO" Search for string from port.
; Written 1995, 1996 by Oleg Kiselyov (oleg@ponder.csci.unt.edu)
; Modified 1996 by A. Jaffer (jaffer@ai.mit.edu)
;
; This code is in the public domain.

;;; Return the index of the first occurence of a-char in str, or #f
(define (string-index str a-char)
  (let loop ((pos 0))
    (cond
     ;; whole string has been searched, in vain
     ((>= pos (string-length str)) #f)
     ((char=? a-char (string-ref str pos)) pos)
     (else (loop (+ 1 pos))))))

(define (substring? pattern str)
  (let* ((pat-len (string-length pattern))
	 (search-span (- (string-length str) pat-len))
	 (c1 (if (zero? pat-len) #f (string-ref pattern 0)))
	 (c2 (if (<= pat-len 1) #f (string-ref pattern 1))))
    (cond
     ((not c1) 0)			; empty pattern, matches upfront
     ((not c2) (string-index str c1))	; one-char pattern
     (else				; matching pattern of > two chars
      (let outer ((pos 0))
	(cond
	 ((> pos search-span) #f)	; nothing was found thru the whole str
	 ((not (char=? c1 (string-ref str pos)))
	  (outer (+ 1 pos)))		; keep looking for the right beginning
	 ((not (char=? c2 (string-ref str (+ 1 pos))))
	  (outer (+ 1 pos)))		; could've done pos+2 if c1 == c2....
	 (else				; two char matched: high probability
					; the rest will match too
	  (let inner ((i-pat 2) (i-str (+ 2 pos)))
	    (if (>= i-pat pat-len) pos	; the whole pattern matched
		(if (char=? (string-ref pattern i-pat)
			    (string-ref str i-str))
		    (inner (+ 1 i-pat) (+ 1 i-str))
		    ;; mismatch after partial match
		    (outer (+ 1 pos))))))))))))

(define (find-string-from-port? str <input-port> . max-no-char)
  (set! max-no-char (if (null? max-no-char) #f (car max-no-char)))
  (letrec
      ((no-chars-read 0)
       (my-peek-char			; Return a peeked char or #f
	(lambda () (and (or (not max-no-char) (< no-chars-read max-no-char))
			(let ((c (peek-char <input-port>)))
			  (if (eof-object? c) #f c)))))
       (next-char (lambda () (read-char <input-port>)
			  (set! no-chars-read  (+ 1 no-chars-read))))
       (match-1st-char			; of the string str
	(lambda ()
	  (let ((c (my-peek-char)))
	    (if (not c) #f
		(begin (next-char)
		       (if (char=? c (string-ref str 0))
			   (match-other-chars 1)
			   (match-1st-char)))))))
       ;; There has been a partial match, up to the point pos-to-match
       ;; (for example, str[0] has been found in the stream)
       ;; Now look to see if str[pos-to-match] for would be found, too
       (match-other-chars
	(lambda (pos-to-match)
	  (if (>= pos-to-match (string-length str))
	      no-chars-read		; the entire string has matched
	      (let ((c (my-peek-char)))
		(and c
		     (if (not (char=? c (string-ref str pos-to-match)))
			 (backtrack 1 pos-to-match)
			 (begin (next-char)
				(match-other-chars (+ 1 pos-to-match)))))))))

       ;; There had been a partial match, but then a wrong char showed up.
       ;; Before discarding previously read (and matched) characters, we check
       ;; to see if there was some smaller partial match. Note, characters read
       ;; so far (which matter) are those of str[0..matched-substr-len - 1]
       ;; In other words, we will check to see if there is such i>0 that
       ;; substr(str,0,j) = substr(str,i,matched-substr-len)
       ;; where j=matched-substr-len - i
       (backtrack
	(lambda (i matched-substr-len)
	  (let ((j (- matched-substr-len i)))
	    (if (<= j 0)
		;; backed off completely to the begining of str
		(match-1st-char)
		(let loop ((k 0))
		  (if (>= k j)
		      (match-other-chars j) ; there was indeed a shorter match
		      (if (char=? (string-ref str k)
				  (string-ref str (+ i k)))
			  (loop (+ 1 k))
			  (backtrack (+ 1 i) matched-substr-len))))))))
       )
    (match-1st-char)))