1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
|
# Copyright(c) 1986 Association of Universities for Research in Astronomy Inc.
include <pattern.h>
include <ctype.h>
include <chars.h>
# PATMATCH.X -- Routines for matching regular expressions (general pattern
# matching). Adapted from Software Tools.
#
# patsize = patmake (patstr, patbuf, sz_patbuf)
# next_char = patmatch (str, patbuf)
# next_char = gpatmatch (str, patbuf, first_char, last_char)
# ip = patindex (patbuf, index_number)
#
# The pattern string must be encoded with PATMAKE before use. See also
# STRMATCH, STRNCMP, etc.
# Pattern codes (for encoded patterns).
define EOP -1 # end of encoded pattern
define CHAR -2 # match char
define UCHAR -3 # match either case
define LCHAR -4 # match either case
define BOL -5 # match at beginning of line
define EOL -6 # match at end of line
define ANY -7 # "?"
define WHITESPACE -8 # "#"
define CCL -9 # [...
define NCCL -10 # [^...
define CLOSURE -11 # "*"
define INDEX -12 # % (mark index of ^ in pattern)
define CH_INDEX '%' # move to <chars.h> after a while
# Definitions for the closure structure.
define CLOSIZE 4 # size of closure structure
define COUNT 1 # repeat count for matches
define PREVCL 2 # index of previous closure in pat
define START 3 # index in str where match starts
# PATMATCH -- Match pattern anywhere on line. Returns the index of the
# first character AFTER the match, or zero if no match.
int procedure patmatch (str, pat)
char str[ARB] # string to be scanned
char pat[ARB] # encoded pattern
int first_char, last_char
int gpatmatch()
begin
return (gpatmatch (str, pat, first_char, last_char))
end
# GPATMATCH -- Generalized pattern match. Matches pattern anywhere on
# line (the first such pattern matched terminates the search). Function
# return same as for PATMATCH, but also returns indices of the first and
# last characters in the matched substring.
int procedure gpatmatch (str, pat, first_char, last_char)
char str[ARB] # string to be scanned
char pat[ARB] # encoded pattern
int first_char # index of first char matched (output)
int last_char # index of last char matched (output)
int ip, nchars_matched
int pat_amatch() # anchored match
begin
nchars_matched = 0
if (pat[1] == BOL) {
ip = 1
nchars_matched = pat_amatch (str, ip, pat)
} else {
for (ip=1; str[ip] != EOS; ip=ip+1) {
nchars_matched = pat_amatch (str, ip, pat)
if (nchars_matched > 0)
break
}
}
if (nchars_matched > 0) {
first_char = ip
last_char = ip + nchars_matched - 1
return (last_char + 1)
} else
return (0)
end
# PATINDEX -- Return the index of a marked position in the pattern. Inclusion
# of the character % in the pattern causes the index of the character following
# the % to be saved in the encoded pattern at patmatch time. We are called
# after a patmatch operation to scan the pattern and recall the Nth saved index.
# Zero is returned if N is larger than the number of saved index points.
int procedure patindex (pat, n)
char pat[ARB] # encoded pattern
int n # number of index to be returned
int pp, ix
int pat_gsize()
begin
ix = 1
for (pp=1; pat[pp] != EOP; pp=pp+pat_gsize(pat,pp))
if (pat[pp] == INDEX)
if (ix >= n)
return (pat[pp+1])
else
ix = ix + 1
return (0)
end
# PAT_AMATCH -- Anchored match. Look for match starting at the given
# offset. Return the number of characters matched.
int procedure pat_amatch (str, from, pat)
char str[ARB] # string to be matched
int from # starting at this index
char pat[ARB] # encoded pattern
int ip, pp, offset, stack
int pat_omatch(), pat_gsize()
begin
stack = 0
offset = from # next unexamined input char
for (pp=1; pat[pp] != EOP; pp = pp + pat_gsize(pat,pp)) {
if (pat[pp] == CLOSURE) { # a closure entry
stack = pp
pp = pp + CLOSIZE
# Match as many characters as possible, save results
for (ip=offset; str[ip] != EOS; )
if (pat_omatch (str, ip, pat, pp) == NO)
break
pat[stack+COUNT] = ip - offset
pat[stack+START] = offset
offset = ip # character that made us fail
} else if (pat_omatch (str, offset, pat, pp) == NO) {
for (; stack > 0; stack = pat[stack+PREVCL])
if (pat[stack+COUNT] > 0)
break
if (stack <= 0) # stack is empty
return (0) # return failure
pat[stack+COUNT] = pat[stack+COUNT] - 1
pp = stack + CLOSIZE
offset = pat[stack+START] + pat[stack+COUNT]
}
}
return (offset-from) # successful match
end
# PAT_GSIZE -- Returns size of pattern entry at pat[n].
int procedure pat_gsize (pat, n)
char pat[ARB] # encoded pattern
int n # pointer into pattern
int pattern_size
begin
switch (pat[n]) {
case CHAR, UCHAR, LCHAR, INDEX:
pattern_size = 2
case BOL, EOL, ANY, WHITESPACE:
pattern_size = 1
case CCL, NCCL:
pattern_size = pat[n+1] + 2
case CLOSURE: # not used
pattern_size = CLOSIZE
default:
call error (0, "In patsize: can't happen.")
}
return (pattern_size)
end
# PAT_OMATCH -- Try to match a single pattern at pat[pp]. If match, bump IP
# to point to the next unmatched character. Return OK if match.
int procedure pat_omatch (str, ip, pat, pp)
char str[ARB] # string to be scanned
int ip # starting index in string (may be changed)
char pat[ARB] # encoded pattern
int pp # pointer to next pattern element
char str_ch
int bump, pat_locate()
begin
if (str[ip] == EOS)
if (pat[pp] == INDEX) {
pat[pp+1] = ip
return (YES)
} else if (pat[pp] == EOL) {
return (YES)
} else
return (NO)
# Treat CHAR (simple character match) as a special case to speed
# things up a bit.
if (pat[pp] == CHAR)
if (str[ip] == pat[pp+1]) {
ip = ip + 1
return (YES)
} else
return (NO)
# Compare as indicated by encoded pattern opcode.
bump = -1
switch (pat[pp]) {
case UCHAR: # match either case
str_ch = str[ip]
if (IS_LOWER (str_ch))
str_ch = TO_UPPER (str_ch)
if (str_ch == pat[pp+1])
bump = 1
case LCHAR: # match either case
str_ch = str[ip]
if (IS_UPPER (str_ch))
str_ch = TO_LOWER (str_ch)
if (str_ch == pat[pp+1])
bump = 1
case BOL: # beg. of line
if (ip == 1)
bump = 0
case EOL: # end of line
if (str[ip] == '\n')
bump = 0
case ANY: # match any char
if (str[ip] != '\n')
bump = 1
case WHITESPACE:
for (bump=0; IS_WHITE (str[ip+bump]); bump=bump+1)
;
case CCL: # char class
if (pat_locate (str[ip], pat, pp + 1) == YES)
bump = 1
case NCCL: # not in char class
if (str[ip] != '\n' && pat_locate (str[ip], pat, pp + 1) == NO)
bump = 1
case INDEX:
pat[pp+1] = ip
bump = 0
default:
call error (0, "In omatch: can't happen.")
}
if (bump >= 0) {
ip = ip + bump
return (YES)
} else
return (NO)
end
# PAT_LOCATE -- Look for c in char class at pat[offset].
int procedure pat_locate (ch, pat, offset)
char ch # char to search for
char pat[ARB] # encoded pattern
int offset # offset of character class in pattern
int nchars, i
begin
# Size of class is at pat[offset], characters follow.
nchars = pat[offset]
do i = 1, nchars
if (ch == pat[offset+i])
return (YES)
return (NO)
end
# PATMAKE -- Encode pattern specification string. Returns the size of
# the encoded pattern string.
int procedure patmake (str, pat, sz_pat)
char str[ARB] # pattern to be encoded
char pat[ARB] # encoded pattern (output)
int sz_pat # max size of the pattern string
int gpatmake()
begin
return (gpatmake (str, 1, EOS, pat, sz_pat))
end
# GPATMAKE -- Make pattern from str[from], terminate at delim.
int procedure gpatmake (patstr, from, delim, patbuf, sz_pat)
char patstr[ARB] # pattern to be encoded
int from # starting index
int delim # delimiter character
char patbuf[ARB] # put encoded pattern here
int sz_pat # max chars in encoded pattern
int ip, op, last_closure, last_op, l_op
char cval
bool ignore_case
int cctoc(), pat_getccl(), pat_stclos()
begin
op = 1 # pat index
last_op = 1
last_closure = 0
ignore_case = false
for (ip=from; patstr[ip] != delim && patstr[ip] != EOS; ip=ip+1) {
l_op = op
# If CVAL gets set to nonzero it will be deposited in the output
# buffer at end of switch.
cval = 0
switch (patstr[ip]) {
case CH_ANY:
cval = ANY
case CH_WHITESPACE:
cval = WHITESPACE
case CH_BOL:
if (ip == from)
cval = BOL
else {
cval = CHAR
call chdeposit (cval, patbuf, sz_pat, op)
cval = CH_BOL
}
case CH_EOL:
if (patstr[ip+1] == delim)
cval = EOL
else {
cval = CHAR
call chdeposit (cval, patbuf, sz_pat, op)
cval = CH_EOL
}
case CH_IGNORECASE:
ignore_case = true
case CH_MATCHCASE:
ignore_case = false
case CH_CCL:
if (pat_getccl (patstr, patbuf, sz_pat, ip, op) == ERR)
return (ERR)
case CH_CLOSURE:
# The "closure" of a pattern, e.g., "..*".
l_op = last_op
# Convert a pattern such as "*..." into "?*...".
if (ip == from) # closure of nothing
cval = ANY
else {
switch (patbuf[l_op]) {
case BOL, EOL, CLOSURE:
cval = ANY
}
}
if (cval != 0)
call chdeposit (cval, patbuf, sz_pat, op)
cval = 0
last_closure = pat_stclos (patbuf, sz_pat, op, last_op,
last_closure)
case CH_INDEX:
# This metacharacter does not match anything, but rather is
# used to record the index of the marked position in the
# matched pattern. The index is recorded in the pattern
# buffer at match time, to be later recovered with patindex.
cval = INDEX
call chdeposit (cval, patbuf, sz_pat, op)
cval = 0
call chdeposit (cval, patbuf, sz_pat, op)
default:
# Ordinary character.
# Deposit command code.
if (ignore_case) {
if (IS_UPPER (patstr[ip]))
cval = UCHAR
else
cval = LCHAR
} else
cval = CHAR
call chdeposit (cval, patbuf, sz_pat, op)
# Set CVAL to actual character value.
if (patstr[ip] == CH_ESCAPE) {
if (cctoc (patstr, ip, cval) == 1)
cval = patstr[ip]
else
ip = ip - 1
} else
cval = patstr[ip]
}
# Deposit the character left in CVAL by the code above.
if (cval != 0)
call chdeposit (cval, patbuf, sz_pat, op)
last_op = l_op
}
# Terminate the pattern.
cval = EOP
call chdeposit (cval, patbuf, sz_pat, op)
if (patstr[ip] != delim || op >= sz_pat)
return (ERR)
else
return (op - 1) # return size patbuf
end
# PAT_GETCCL -- Expand character class at patstr[i] into patbuf[op].
int procedure pat_getccl (patstr, patbuf, sz_pat, ip, op)
char patstr[ARB], patbuf[ARB]
int sz_pat, ip, op
char cval
int op_start
begin
ip = ip + 1 # skip over [
if (patstr[ip] == CH_NOT) {
cval = NCCL
ip = ip + 1
} else
cval = CCL
call chdeposit (cval, patbuf, sz_pat, op)
op_start = op
cval = 0
call chdeposit (cval, patbuf, sz_pat, op) # leave room for count
call pat_filset (CH_CCLEND, patstr, ip, patbuf, sz_pat, op)
patbuf[op_start] = op - op_start - 1 # fix up count
if (patstr[ip] == CH_CCLEND)
return (OK)
else
return (ERR)
end
# PAT_STCLOS -- Insert closure entry at patbuf[op].
int procedure pat_stclos (patbuf, sz_pat, op, last_op, last_closure)
char patbuf[ARB]
int sz_pat
int op
int last_op
int last_closure
char cvals[4]
int next_closure, jp, jt, i
begin
for (jp=op-1; jp >= last_op; jp=jp-1) { # make a hole
jt = min (sz_pat, jp + CLOSIZE)
patbuf[jt] = patbuf[jp]
}
op = op + CLOSIZE
next_closure = last_op
cvals[1] = CLOSURE
cvals[2] = 0 # COUNT
cvals[3] = last_closure # PREVCL
cvals[4] = 0 # START
do i = 1, 4
call chdeposit (cvals[i], patbuf, sz_pat, last_op)
return (next_closure)
end
# PAT_FILSET -- Process a character class into a simple list of characters.
procedure pat_filset (delim, patstr, ip, patbuf, sz_pat, op)
int delim # character class delimiter character
char patstr[ARB] # character class characters
int ip # index where they start
char patbuf[ARB] # encode character class in this string
int sz_pat # max chars out
int op # offset into patbuf
char ch, ch1, ch2
int cctoc()
begin
for (; patstr[ip] != delim && patstr[ip] != EOS; ip=ip+1) {
if (patstr[ip] == ESCAPE) { # escape seq.
if (cctoc (patstr, ip, ch) == 1)
ch = patstr[ip]
else
ip = ip - 1
call chdeposit (ch, patbuf, sz_pat, op)
} else if (patstr[ip] != CH_RANGE) {
call chdeposit (patstr[ip], patbuf, sz_pat, op)
} else if (op <= 1 || patstr[ip+1] == EOS) { # literal "-"
ch = CH_RANGE
call chdeposit (ch, patbuf, sz_pat, op)
} else {
# Here if char is CH_RANGE, denoting a range of characters to be
# included in the character class. Range is valid only if limit
# chars are both digits, both lower case, or both upper case.
ch1 = patbuf[op-1] # not same as patstr[ip-1]
ch2 = patstr[ip+1]
if ((IS_DIGIT (ch1) && IS_DIGIT (ch2)) ||
(IS_LOWER (ch1) && IS_LOWER (ch2)) ||
(IS_UPPER (ch1) && IS_UPPER (ch2))) {
if (ch1 <= ch2)
for (ch=ch1+1; ch <= ch2; ch=ch+1)
call chdeposit (ch, patbuf, sz_pat, op)
else
for (ch=ch1-1; ch >= ch2; ch=ch-1)
call chdeposit (ch, patbuf, sz_pat, op)
ip = ip + 1
} else {
ch = CH_RANGE
call chdeposit (ch, patbuf, sz_pat, op)
}
}
}
end
|