aboutsummaryrefslogtreecommitdiff
path: root/sys/fmtio/patmatch.x
blob: 9972060f0a5d73cf88333f074143fcd1d2352cfc (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
# Copyright(c) 1986 Association of Universities for Research in Astronomy Inc.

include	<pattern.h>
include	<ctype.h>
include	<chars.h>

# PATMATCH.X -- Routines for matching regular expressions (general pattern
# matching).  Adapted from Software Tools.
# 
#	      patsize = patmake (patstr, patbuf, sz_patbuf)
#	   next_char = patmatch (str, patbuf)
#	  next_char = gpatmatch (str, patbuf, first_char, last_char)
#	          ip = patindex (patbuf, index_number)
#
# The pattern string must be encoded with PATMAKE before use.  See also
# STRMATCH, STRNCMP, etc.

# Pattern codes (for encoded patterns).

define	EOP		-1		# end of encoded pattern
define	CHAR		-2		# match char
define	UCHAR		-3		# match either case
define	LCHAR		-4		# match either case
define	BOL		-5		# match at beginning of line
define	EOL		-6		# match at end of line
define	ANY		-7		# "?"
define	WHITESPACE	-8		# "#"
define	CCL		-9		# [...
define	NCCL		-10		# [^...
define	CLOSURE		-11		# "*"
define	INDEX		-12		# % (mark index of ^ in pattern)

define	CH_INDEX	'%'		# move to <chars.h> after a while

# Definitions for the closure structure.

define	CLOSIZE		4		# size of closure structure
define	COUNT		1		# repeat count for matches
define	PREVCL		2		# index of previous closure in pat
define	START		3		# index in str where match starts


# PATMATCH -- Match pattern anywhere on line.  Returns the index of the
# first character AFTER the match, or zero if no match.

int procedure patmatch (str, pat)

char	str[ARB]		# string to be scanned
char	pat[ARB]		# encoded pattern

int	first_char, last_char
int	gpatmatch()

begin
	return (gpatmatch (str, pat, first_char, last_char))
end


# GPATMATCH -- Generalized pattern match.  Matches pattern anywhere on
# line (the first such pattern matched terminates the search).  Function
# return same as for PATMATCH, but also returns indices of the first and
# last characters in the matched substring.

int procedure gpatmatch (str, pat, first_char, last_char)

char	str[ARB]		# string to be scanned
char	pat[ARB]		# encoded pattern
int	first_char		# index of first char matched (output)
int	last_char		# index of last  char matched (output)

int	ip, nchars_matched
int	pat_amatch()				# anchored match

begin
	nchars_matched = 0

	if (pat[1] == BOL) {
	    ip = 1
	    nchars_matched = pat_amatch (str, ip, pat)
	} else {
	    for (ip=1;  str[ip] != EOS;  ip=ip+1) {
		nchars_matched = pat_amatch (str, ip, pat)
		if (nchars_matched > 0)
		    break
	    }
	}

	if (nchars_matched > 0) {
	    first_char = ip
	    last_char = ip + nchars_matched - 1
	    return (last_char + 1)
	} else
	    return (0)
end


# PATINDEX -- Return the index of a marked position in the pattern.  Inclusion
# of the character % in the pattern causes the index of the character following
# the % to be saved in the encoded pattern at patmatch time.  We are called
# after a patmatch operation to scan the pattern and recall the Nth saved index.
# Zero is returned if N is larger than the number of saved index points.

int procedure patindex (pat, n)

char	pat[ARB]		# encoded pattern
int	n			# number of index to be returned

int	pp, ix
int	pat_gsize()

begin
	ix = 1
	for (pp=1;  pat[pp] != EOP;  pp=pp+pat_gsize(pat,pp))
	    if (pat[pp] == INDEX)
		if (ix >= n)
		    return (pat[pp+1])
		else
		    ix = ix + 1

	return (0)
end


# PAT_AMATCH -- Anchored match.  Look for match starting at the given
# offset.  Return the number of characters matched.

int procedure pat_amatch (str, from, pat)

char	str[ARB]		# string to be matched
int	from			# starting at this index
char	pat[ARB]		# encoded pattern

int	ip, pp, offset, stack
int	pat_omatch(), pat_gsize()

begin
	stack = 0
	offset = from				# next unexamined input char

	for (pp=1;  pat[pp] != EOP;  pp = pp + pat_gsize(pat,pp)) {
	    if (pat[pp] == CLOSURE) {		# a closure entry
		stack = pp
		pp = pp + CLOSIZE	
		# Match as many characters as possible, save results
		for (ip=offset;  str[ip] != EOS;  )
		    if (pat_omatch (str, ip, pat, pp) == NO)
			break
		pat[stack+COUNT] = ip - offset
		pat[stack+START] = offset
		offset = ip			# character that made us fail

	    } else if (pat_omatch (str, offset, pat, pp) == NO) {
		for (;  stack > 0;  stack = pat[stack+PREVCL])
		    if (pat[stack+COUNT] > 0)
			break
		if (stack <= 0)			# stack is empty
		    return (0)			# return failure

		pat[stack+COUNT] = pat[stack+COUNT] - 1
		pp = stack + CLOSIZE
		offset = pat[stack+START] + pat[stack+COUNT]
	    }
	}

	return (offset-from)			# successful match
end


# PAT_GSIZE -- Returns size of pattern entry at pat[n].

int procedure pat_gsize (pat, n)

char	pat[ARB]		# encoded pattern
int	n			# pointer into pattern
int	pattern_size

begin
	switch (pat[n]) {
	case CHAR, UCHAR, LCHAR, INDEX:
	    pattern_size = 2
	case BOL, EOL, ANY, WHITESPACE:
	    pattern_size = 1
	case CCL, NCCL:
	    pattern_size = pat[n+1] + 2
	case CLOSURE:					# not used
	    pattern_size = CLOSIZE
	default:
	    call error (0, "In patsize: can't happen.")
	}

	return (pattern_size)
end


# PAT_OMATCH -- Try to match a single pattern at pat[pp].  If match, bump IP
# to point to the next unmatched character.  Return OK if match.

int procedure pat_omatch (str, ip, pat, pp)

char	str[ARB]		# string to be scanned
int	ip			# starting index in string (may be changed)
char	pat[ARB]		# encoded pattern
int	pp			# pointer to next pattern element

char	str_ch
int	bump, pat_locate()

begin
	if (str[ip] == EOS)
	    if (pat[pp] == INDEX) {
		pat[pp+1] = ip
		return (YES)
	    } else if (pat[pp] == EOL) {
		return (YES)
	    } else
		return (NO)

	# Treat CHAR (simple character match) as a special case to speed
	# things up a bit.

	if (pat[pp] == CHAR)
	    if (str[ip] == pat[pp+1]) {
		ip = ip + 1
		return (YES)
	    } else
		return (NO)

	# Compare as indicated by encoded pattern opcode.
	bump = -1

	switch (pat[pp]) {
	case UCHAR:					# match either case
	    str_ch = str[ip]
	    if (IS_LOWER (str_ch))
		str_ch = TO_UPPER (str_ch)
	    if (str_ch == pat[pp+1])
		bump = 1
	case LCHAR:					# match either case
	    str_ch = str[ip]
	    if (IS_UPPER (str_ch))
		str_ch = TO_LOWER (str_ch)
	    if (str_ch == pat[pp+1])
		bump = 1
	case BOL:					# beg. of line
	    if (ip == 1)
		bump = 0
	case EOL:					# end of line
	    if (str[ip] == '\n')
		bump = 0
	case ANY:					# match any char
	    if (str[ip] != '\n')
		bump = 1
	case WHITESPACE:
	    for (bump=0;  IS_WHITE (str[ip+bump]);  bump=bump+1)
		;
	case CCL:					# char class
	    if (pat_locate (str[ip], pat, pp + 1) == YES)
		bump = 1
	case NCCL:					# not in char class
	    if (str[ip] != '\n' && pat_locate (str[ip], pat, pp + 1) == NO)
		bump = 1
	case INDEX:
	    pat[pp+1] = ip
	    bump = 0
	default:
	    call error (0, "In omatch: can't happen.")
	}

	if (bump >= 0) {
	    ip = ip + bump
	    return (YES)
	} else
	    return (NO)
end


# PAT_LOCATE -- Look for c in char class at pat[offset].

int procedure pat_locate (ch, pat, offset)

char	ch			# char to search for
char	pat[ARB]		# encoded pattern
int	offset			# offset of character class in pattern

int	nchars, i

begin
	# Size of class is at pat[offset], characters follow.
	nchars = pat[offset]
	do i = 1, nchars
	    if (ch == pat[offset+i])
		return (YES)

	return (NO)
end


# PATMAKE -- Encode pattern specification string.  Returns the size of
# the encoded pattern string.

int procedure patmake (str, pat, sz_pat)

char	str[ARB]		# pattern to be encoded
char	pat[ARB]		# encoded pattern (output)
int	sz_pat			# max size of the pattern string
int	gpatmake()

begin
	return (gpatmake (str, 1, EOS, pat, sz_pat))
end


# GPATMAKE -- Make pattern from str[from], terminate at delim.

int procedure gpatmake (patstr, from, delim, patbuf, sz_pat)

char	patstr[ARB]		# pattern to be encoded
int	from			# starting index
int	delim			# delimiter character
char	patbuf[ARB]		# put encoded pattern here
int	sz_pat			# max chars in encoded pattern

int	ip, op, last_closure, last_op, l_op
char	cval
bool	ignore_case
int	cctoc(), pat_getccl(), pat_stclos()

begin
	op = 1					      # pat index
	last_op = 1
	last_closure = 0
	ignore_case = false

	for (ip=from;  patstr[ip] != delim && patstr[ip] != EOS;  ip=ip+1) {
	    l_op = op

	    # If CVAL gets set to nonzero it will be deposited in the output
	    # buffer at end of switch.

	    cval = 0

	    switch (patstr[ip]) {
	    case CH_ANY:
		cval = ANY
	    case CH_WHITESPACE:
		cval = WHITESPACE

	    case CH_BOL:
		if (ip == from)
		    cval = BOL
		else {
		    cval = CHAR
		    call chdeposit (cval, patbuf, sz_pat, op)
		    cval = CH_BOL
		}

	    case CH_EOL:
		if (patstr[ip+1] == delim)
		    cval = EOL
		else {
		    cval = CHAR
		    call chdeposit (cval, patbuf, sz_pat, op)
		    cval = CH_EOL
		}

	    case CH_IGNORECASE:
		ignore_case = true
	    case CH_MATCHCASE:
		ignore_case = false

	    case CH_CCL:
		if (pat_getccl (patstr, patbuf, sz_pat, ip, op) == ERR)
		    return (ERR)

	    case CH_CLOSURE:
		# The "closure" of a pattern, e.g., "..*".

		l_op = last_op
		# Convert a pattern such as "*..." into "?*...".
		if (ip == from)				# closure of nothing
		    cval = ANY
		else {
		    switch (patbuf[l_op]) {
		    case BOL, EOL, CLOSURE:
			cval = ANY
		    }
		}

		if (cval != 0)
		    call chdeposit (cval, patbuf, sz_pat, op)
		cval = 0

		last_closure = pat_stclos (patbuf, sz_pat, op, last_op,
		    last_closure)

	    case CH_INDEX:
		# This metacharacter does not match anything, but rather is
		# used to record the index of the marked position in the
		# matched pattern.  The index is recorded in the pattern
		# buffer at match time, to be later recovered with patindex.

		cval = INDEX
		call chdeposit (cval, patbuf, sz_pat, op)
		cval = 0
		call chdeposit (cval, patbuf, sz_pat, op)

	    default:
		# Ordinary character.

		# Deposit command code.
		if (ignore_case) {
		    if (IS_UPPER (patstr[ip]))
			cval = UCHAR
		    else
			cval = LCHAR
		} else
		    cval = CHAR
		call chdeposit (cval, patbuf, sz_pat, op)

		# Set CVAL to actual character value.
		if (patstr[ip] == CH_ESCAPE) {
		    if (cctoc (patstr, ip, cval) == 1)
			cval = patstr[ip]
		    else
			ip = ip - 1
		} else
		    cval = patstr[ip]
	    }

	    # Deposit the character left in CVAL by the code above.
	    if (cval != 0)
		call chdeposit (cval, patbuf, sz_pat, op)

	    last_op = l_op
	}

	# Terminate the pattern.
	cval = EOP
	call chdeposit (cval, patbuf, sz_pat, op)

	if (patstr[ip] != delim || op >= sz_pat)
	    return (ERR)
	else
	    return (op - 1)				# return size patbuf
end


# PAT_GETCCL -- Expand character class at patstr[i] into patbuf[op].

int procedure pat_getccl (patstr, patbuf, sz_pat, ip, op)

char	patstr[ARB], patbuf[ARB]
int	sz_pat, ip, op
char	cval
int	op_start

begin
	ip = ip + 1					# skip over [
	if (patstr[ip] == CH_NOT) {
	    cval = NCCL
	    ip = ip + 1
	} else
	    cval = CCL

	call chdeposit (cval, patbuf, sz_pat, op)

	op_start = op
	cval = 0
	call chdeposit (cval, patbuf, sz_pat, op)	# leave room for count
	call pat_filset (CH_CCLEND, patstr, ip, patbuf, sz_pat, op)
	patbuf[op_start] = op - op_start - 1		# fix up count

	if (patstr[ip] == CH_CCLEND)
	    return (OK)
	else
	    return (ERR)
end


# PAT_STCLOS -- Insert closure entry at patbuf[op].

int procedure pat_stclos (patbuf, sz_pat, op, last_op, last_closure)

char	patbuf[ARB]
int	sz_pat
int	op
int	last_op
int	last_closure

char	cvals[4]
int	next_closure, jp, jt, i

begin
	for (jp=op-1;  jp >= last_op;  jp=jp-1) {	# make a hole
	    jt = min (sz_pat, jp + CLOSIZE)
	    patbuf[jt] = patbuf[jp]
	}

	op = op + CLOSIZE
	next_closure = last_op

	cvals[1] = CLOSURE
	cvals[2] = 0				# COUNT
	cvals[3] = last_closure			# PREVCL
	cvals[4] = 0				# START

	do i = 1, 4
	    call chdeposit (cvals[i], patbuf, sz_pat, last_op)

	return (next_closure)
end


# PAT_FILSET -- Process a character class into a simple list of characters.

procedure pat_filset (delim, patstr, ip, patbuf, sz_pat, op)

int	delim			# character class delimiter character
char	patstr[ARB]		# character class characters
int	ip			# index where they start
char	patbuf[ARB]		# encode character class in this string
int	sz_pat			# max chars out
int	op			# offset into patbuf

char	ch, ch1, ch2
int	cctoc()

begin
	for (;  patstr[ip] != delim && patstr[ip] != EOS;  ip=ip+1) {
	    if (patstr[ip] == ESCAPE) {				# escape seq.
		if (cctoc (patstr, ip, ch) == 1)
		    ch = patstr[ip]
		else
		    ip = ip - 1
		call chdeposit (ch, patbuf, sz_pat, op)

	    } else if (patstr[ip] != CH_RANGE) {
		call chdeposit (patstr[ip], patbuf, sz_pat, op)

	    } else if (op <= 1 || patstr[ip+1] == EOS) {	# literal "-"
		ch = CH_RANGE
		call chdeposit (ch, patbuf, sz_pat, op)

	    } else {
		# Here if char is CH_RANGE, denoting a range of characters to be
		# included in the character class.  Range is valid only if limit
		# chars are both digits, both lower case, or both upper case.

		ch1 = patbuf[op-1]		# not same as patstr[ip-1]
		ch2 = patstr[ip+1]

		if ((IS_DIGIT (ch1) && IS_DIGIT (ch2)) ||
		    (IS_LOWER (ch1) && IS_LOWER (ch2)) ||
		    (IS_UPPER (ch1) && IS_UPPER (ch2))) {
			if (ch1 <= ch2)
			    for (ch=ch1+1;  ch <= ch2;  ch=ch+1)
				call chdeposit (ch, patbuf, sz_pat, op)
			else
			    for (ch=ch1-1;  ch >= ch2;  ch=ch-1)
				call chdeposit (ch, patbuf, sz_pat, op)
			ip = ip + 1
		} else {
		    ch = CH_RANGE
		    call chdeposit (ch, patbuf, sz_pat, op)
		}
	    }
	}
end