aboutsummaryrefslogtreecommitdiff
path: root/sys/fmtio/patmatch.x
diff options
context:
space:
mode:
Diffstat (limited to 'sys/fmtio/patmatch.x')
-rw-r--r--sys/fmtio/patmatch.x568
1 files changed, 568 insertions, 0 deletions
diff --git a/sys/fmtio/patmatch.x b/sys/fmtio/patmatch.x
new file mode 100644
index 00000000..9972060f
--- /dev/null
+++ b/sys/fmtio/patmatch.x
@@ -0,0 +1,568 @@
+# Copyright(c) 1986 Association of Universities for Research in Astronomy Inc.
+
+include <pattern.h>
+include <ctype.h>
+include <chars.h>
+
+# PATMATCH.X -- Routines for matching regular expressions (general pattern
+# matching). Adapted from Software Tools.
+#
+# patsize = patmake (patstr, patbuf, sz_patbuf)
+# next_char = patmatch (str, patbuf)
+# next_char = gpatmatch (str, patbuf, first_char, last_char)
+# ip = patindex (patbuf, index_number)
+#
+# The pattern string must be encoded with PATMAKE before use. See also
+# STRMATCH, STRNCMP, etc.
+
+# Pattern codes (for encoded patterns).
+
+define EOP -1 # end of encoded pattern
+define CHAR -2 # match char
+define UCHAR -3 # match either case
+define LCHAR -4 # match either case
+define BOL -5 # match at beginning of line
+define EOL -6 # match at end of line
+define ANY -7 # "?"
+define WHITESPACE -8 # "#"
+define CCL -9 # [...
+define NCCL -10 # [^...
+define CLOSURE -11 # "*"
+define INDEX -12 # % (mark index of ^ in pattern)
+
+define CH_INDEX '%' # move to <chars.h> after a while
+
+# Definitions for the closure structure.
+
+define CLOSIZE 4 # size of closure structure
+define COUNT 1 # repeat count for matches
+define PREVCL 2 # index of previous closure in pat
+define START 3 # index in str where match starts
+
+
+# PATMATCH -- Match pattern anywhere on line. Returns the index of the
+# first character AFTER the match, or zero if no match.
+
+int procedure patmatch (str, pat)
+
+char str[ARB] # string to be scanned
+char pat[ARB] # encoded pattern
+
+int first_char, last_char
+int gpatmatch()
+
+begin
+ return (gpatmatch (str, pat, first_char, last_char))
+end
+
+
+# GPATMATCH -- Generalized pattern match. Matches pattern anywhere on
+# line (the first such pattern matched terminates the search). Function
+# return same as for PATMATCH, but also returns indices of the first and
+# last characters in the matched substring.
+
+int procedure gpatmatch (str, pat, first_char, last_char)
+
+char str[ARB] # string to be scanned
+char pat[ARB] # encoded pattern
+int first_char # index of first char matched (output)
+int last_char # index of last char matched (output)
+
+int ip, nchars_matched
+int pat_amatch() # anchored match
+
+begin
+ nchars_matched = 0
+
+ if (pat[1] == BOL) {
+ ip = 1
+ nchars_matched = pat_amatch (str, ip, pat)
+ } else {
+ for (ip=1; str[ip] != EOS; ip=ip+1) {
+ nchars_matched = pat_amatch (str, ip, pat)
+ if (nchars_matched > 0)
+ break
+ }
+ }
+
+ if (nchars_matched > 0) {
+ first_char = ip
+ last_char = ip + nchars_matched - 1
+ return (last_char + 1)
+ } else
+ return (0)
+end
+
+
+# PATINDEX -- Return the index of a marked position in the pattern. Inclusion
+# of the character % in the pattern causes the index of the character following
+# the % to be saved in the encoded pattern at patmatch time. We are called
+# after a patmatch operation to scan the pattern and recall the Nth saved index.
+# Zero is returned if N is larger than the number of saved index points.
+
+int procedure patindex (pat, n)
+
+char pat[ARB] # encoded pattern
+int n # number of index to be returned
+
+int pp, ix
+int pat_gsize()
+
+begin
+ ix = 1
+ for (pp=1; pat[pp] != EOP; pp=pp+pat_gsize(pat,pp))
+ if (pat[pp] == INDEX)
+ if (ix >= n)
+ return (pat[pp+1])
+ else
+ ix = ix + 1
+
+ return (0)
+end
+
+
+# PAT_AMATCH -- Anchored match. Look for match starting at the given
+# offset. Return the number of characters matched.
+
+int procedure pat_amatch (str, from, pat)
+
+char str[ARB] # string to be matched
+int from # starting at this index
+char pat[ARB] # encoded pattern
+
+int ip, pp, offset, stack
+int pat_omatch(), pat_gsize()
+
+begin
+ stack = 0
+ offset = from # next unexamined input char
+
+ for (pp=1; pat[pp] != EOP; pp = pp + pat_gsize(pat,pp)) {
+ if (pat[pp] == CLOSURE) { # a closure entry
+ stack = pp
+ pp = pp + CLOSIZE
+ # Match as many characters as possible, save results
+ for (ip=offset; str[ip] != EOS; )
+ if (pat_omatch (str, ip, pat, pp) == NO)
+ break
+ pat[stack+COUNT] = ip - offset
+ pat[stack+START] = offset
+ offset = ip # character that made us fail
+
+ } else if (pat_omatch (str, offset, pat, pp) == NO) {
+ for (; stack > 0; stack = pat[stack+PREVCL])
+ if (pat[stack+COUNT] > 0)
+ break
+ if (stack <= 0) # stack is empty
+ return (0) # return failure
+
+ pat[stack+COUNT] = pat[stack+COUNT] - 1
+ pp = stack + CLOSIZE
+ offset = pat[stack+START] + pat[stack+COUNT]
+ }
+ }
+
+ return (offset-from) # successful match
+end
+
+
+# PAT_GSIZE -- Returns size of pattern entry at pat[n].
+
+int procedure pat_gsize (pat, n)
+
+char pat[ARB] # encoded pattern
+int n # pointer into pattern
+int pattern_size
+
+begin
+ switch (pat[n]) {
+ case CHAR, UCHAR, LCHAR, INDEX:
+ pattern_size = 2
+ case BOL, EOL, ANY, WHITESPACE:
+ pattern_size = 1
+ case CCL, NCCL:
+ pattern_size = pat[n+1] + 2
+ case CLOSURE: # not used
+ pattern_size = CLOSIZE
+ default:
+ call error (0, "In patsize: can't happen.")
+ }
+
+ return (pattern_size)
+end
+
+
+# PAT_OMATCH -- Try to match a single pattern at pat[pp]. If match, bump IP
+# to point to the next unmatched character. Return OK if match.
+
+int procedure pat_omatch (str, ip, pat, pp)
+
+char str[ARB] # string to be scanned
+int ip # starting index in string (may be changed)
+char pat[ARB] # encoded pattern
+int pp # pointer to next pattern element
+
+char str_ch
+int bump, pat_locate()
+
+begin
+ if (str[ip] == EOS)
+ if (pat[pp] == INDEX) {
+ pat[pp+1] = ip
+ return (YES)
+ } else if (pat[pp] == EOL) {
+ return (YES)
+ } else
+ return (NO)
+
+ # Treat CHAR (simple character match) as a special case to speed
+ # things up a bit.
+
+ if (pat[pp] == CHAR)
+ if (str[ip] == pat[pp+1]) {
+ ip = ip + 1
+ return (YES)
+ } else
+ return (NO)
+
+ # Compare as indicated by encoded pattern opcode.
+ bump = -1
+
+ switch (pat[pp]) {
+ case UCHAR: # match either case
+ str_ch = str[ip]
+ if (IS_LOWER (str_ch))
+ str_ch = TO_UPPER (str_ch)
+ if (str_ch == pat[pp+1])
+ bump = 1
+ case LCHAR: # match either case
+ str_ch = str[ip]
+ if (IS_UPPER (str_ch))
+ str_ch = TO_LOWER (str_ch)
+ if (str_ch == pat[pp+1])
+ bump = 1
+ case BOL: # beg. of line
+ if (ip == 1)
+ bump = 0
+ case EOL: # end of line
+ if (str[ip] == '\n')
+ bump = 0
+ case ANY: # match any char
+ if (str[ip] != '\n')
+ bump = 1
+ case WHITESPACE:
+ for (bump=0; IS_WHITE (str[ip+bump]); bump=bump+1)
+ ;
+ case CCL: # char class
+ if (pat_locate (str[ip], pat, pp + 1) == YES)
+ bump = 1
+ case NCCL: # not in char class
+ if (str[ip] != '\n' && pat_locate (str[ip], pat, pp + 1) == NO)
+ bump = 1
+ case INDEX:
+ pat[pp+1] = ip
+ bump = 0
+ default:
+ call error (0, "In omatch: can't happen.")
+ }
+
+ if (bump >= 0) {
+ ip = ip + bump
+ return (YES)
+ } else
+ return (NO)
+end
+
+
+# PAT_LOCATE -- Look for c in char class at pat[offset].
+
+int procedure pat_locate (ch, pat, offset)
+
+char ch # char to search for
+char pat[ARB] # encoded pattern
+int offset # offset of character class in pattern
+
+int nchars, i
+
+begin
+ # Size of class is at pat[offset], characters follow.
+ nchars = pat[offset]
+ do i = 1, nchars
+ if (ch == pat[offset+i])
+ return (YES)
+
+ return (NO)
+end
+
+
+# PATMAKE -- Encode pattern specification string. Returns the size of
+# the encoded pattern string.
+
+int procedure patmake (str, pat, sz_pat)
+
+char str[ARB] # pattern to be encoded
+char pat[ARB] # encoded pattern (output)
+int sz_pat # max size of the pattern string
+int gpatmake()
+
+begin
+ return (gpatmake (str, 1, EOS, pat, sz_pat))
+end
+
+
+# GPATMAKE -- Make pattern from str[from], terminate at delim.
+
+int procedure gpatmake (patstr, from, delim, patbuf, sz_pat)
+
+char patstr[ARB] # pattern to be encoded
+int from # starting index
+int delim # delimiter character
+char patbuf[ARB] # put encoded pattern here
+int sz_pat # max chars in encoded pattern
+
+int ip, op, last_closure, last_op, l_op
+char cval
+bool ignore_case
+int cctoc(), pat_getccl(), pat_stclos()
+
+begin
+ op = 1 # pat index
+ last_op = 1
+ last_closure = 0
+ ignore_case = false
+
+ for (ip=from; patstr[ip] != delim && patstr[ip] != EOS; ip=ip+1) {
+ l_op = op
+
+ # If CVAL gets set to nonzero it will be deposited in the output
+ # buffer at end of switch.
+
+ cval = 0
+
+ switch (patstr[ip]) {
+ case CH_ANY:
+ cval = ANY
+ case CH_WHITESPACE:
+ cval = WHITESPACE
+
+ case CH_BOL:
+ if (ip == from)
+ cval = BOL
+ else {
+ cval = CHAR
+ call chdeposit (cval, patbuf, sz_pat, op)
+ cval = CH_BOL
+ }
+
+ case CH_EOL:
+ if (patstr[ip+1] == delim)
+ cval = EOL
+ else {
+ cval = CHAR
+ call chdeposit (cval, patbuf, sz_pat, op)
+ cval = CH_EOL
+ }
+
+ case CH_IGNORECASE:
+ ignore_case = true
+ case CH_MATCHCASE:
+ ignore_case = false
+
+ case CH_CCL:
+ if (pat_getccl (patstr, patbuf, sz_pat, ip, op) == ERR)
+ return (ERR)
+
+ case CH_CLOSURE:
+ # The "closure" of a pattern, e.g., "..*".
+
+ l_op = last_op
+ # Convert a pattern such as "*..." into "?*...".
+ if (ip == from) # closure of nothing
+ cval = ANY
+ else {
+ switch (patbuf[l_op]) {
+ case BOL, EOL, CLOSURE:
+ cval = ANY
+ }
+ }
+
+ if (cval != 0)
+ call chdeposit (cval, patbuf, sz_pat, op)
+ cval = 0
+
+ last_closure = pat_stclos (patbuf, sz_pat, op, last_op,
+ last_closure)
+
+ case CH_INDEX:
+ # This metacharacter does not match anything, but rather is
+ # used to record the index of the marked position in the
+ # matched pattern. The index is recorded in the pattern
+ # buffer at match time, to be later recovered with patindex.
+
+ cval = INDEX
+ call chdeposit (cval, patbuf, sz_pat, op)
+ cval = 0
+ call chdeposit (cval, patbuf, sz_pat, op)
+
+ default:
+ # Ordinary character.
+
+ # Deposit command code.
+ if (ignore_case) {
+ if (IS_UPPER (patstr[ip]))
+ cval = UCHAR
+ else
+ cval = LCHAR
+ } else
+ cval = CHAR
+ call chdeposit (cval, patbuf, sz_pat, op)
+
+ # Set CVAL to actual character value.
+ if (patstr[ip] == CH_ESCAPE) {
+ if (cctoc (patstr, ip, cval) == 1)
+ cval = patstr[ip]
+ else
+ ip = ip - 1
+ } else
+ cval = patstr[ip]
+ }
+
+ # Deposit the character left in CVAL by the code above.
+ if (cval != 0)
+ call chdeposit (cval, patbuf, sz_pat, op)
+
+ last_op = l_op
+ }
+
+ # Terminate the pattern.
+ cval = EOP
+ call chdeposit (cval, patbuf, sz_pat, op)
+
+ if (patstr[ip] != delim || op >= sz_pat)
+ return (ERR)
+ else
+ return (op - 1) # return size patbuf
+end
+
+
+# PAT_GETCCL -- Expand character class at patstr[i] into patbuf[op].
+
+int procedure pat_getccl (patstr, patbuf, sz_pat, ip, op)
+
+char patstr[ARB], patbuf[ARB]
+int sz_pat, ip, op
+char cval
+int op_start
+
+begin
+ ip = ip + 1 # skip over [
+ if (patstr[ip] == CH_NOT) {
+ cval = NCCL
+ ip = ip + 1
+ } else
+ cval = CCL
+
+ call chdeposit (cval, patbuf, sz_pat, op)
+
+ op_start = op
+ cval = 0
+ call chdeposit (cval, patbuf, sz_pat, op) # leave room for count
+ call pat_filset (CH_CCLEND, patstr, ip, patbuf, sz_pat, op)
+ patbuf[op_start] = op - op_start - 1 # fix up count
+
+ if (patstr[ip] == CH_CCLEND)
+ return (OK)
+ else
+ return (ERR)
+end
+
+
+# PAT_STCLOS -- Insert closure entry at patbuf[op].
+
+int procedure pat_stclos (patbuf, sz_pat, op, last_op, last_closure)
+
+char patbuf[ARB]
+int sz_pat
+int op
+int last_op
+int last_closure
+
+char cvals[4]
+int next_closure, jp, jt, i
+
+begin
+ for (jp=op-1; jp >= last_op; jp=jp-1) { # make a hole
+ jt = min (sz_pat, jp + CLOSIZE)
+ patbuf[jt] = patbuf[jp]
+ }
+
+ op = op + CLOSIZE
+ next_closure = last_op
+
+ cvals[1] = CLOSURE
+ cvals[2] = 0 # COUNT
+ cvals[3] = last_closure # PREVCL
+ cvals[4] = 0 # START
+
+ do i = 1, 4
+ call chdeposit (cvals[i], patbuf, sz_pat, last_op)
+
+ return (next_closure)
+end
+
+
+# PAT_FILSET -- Process a character class into a simple list of characters.
+
+procedure pat_filset (delim, patstr, ip, patbuf, sz_pat, op)
+
+int delim # character class delimiter character
+char patstr[ARB] # character class characters
+int ip # index where they start
+char patbuf[ARB] # encode character class in this string
+int sz_pat # max chars out
+int op # offset into patbuf
+
+char ch, ch1, ch2
+int cctoc()
+
+begin
+ for (; patstr[ip] != delim && patstr[ip] != EOS; ip=ip+1) {
+ if (patstr[ip] == ESCAPE) { # escape seq.
+ if (cctoc (patstr, ip, ch) == 1)
+ ch = patstr[ip]
+ else
+ ip = ip - 1
+ call chdeposit (ch, patbuf, sz_pat, op)
+
+ } else if (patstr[ip] != CH_RANGE) {
+ call chdeposit (patstr[ip], patbuf, sz_pat, op)
+
+ } else if (op <= 1 || patstr[ip+1] == EOS) { # literal "-"
+ ch = CH_RANGE
+ call chdeposit (ch, patbuf, sz_pat, op)
+
+ } else {
+ # Here if char is CH_RANGE, denoting a range of characters to be
+ # included in the character class. Range is valid only if limit
+ # chars are both digits, both lower case, or both upper case.
+
+ ch1 = patbuf[op-1] # not same as patstr[ip-1]
+ ch2 = patstr[ip+1]
+
+ if ((IS_DIGIT (ch1) && IS_DIGIT (ch2)) ||
+ (IS_LOWER (ch1) && IS_LOWER (ch2)) ||
+ (IS_UPPER (ch1) && IS_UPPER (ch2))) {
+ if (ch1 <= ch2)
+ for (ch=ch1+1; ch <= ch2; ch=ch+1)
+ call chdeposit (ch, patbuf, sz_pat, op)
+ else
+ for (ch=ch1-1; ch >= ch2; ch=ch-1)
+ call chdeposit (ch, patbuf, sz_pat, op)
+ ip = ip + 1
+ } else {
+ ch = CH_RANGE
+ call chdeposit (ch, patbuf, sz_pat, op)
+ }
+ }
+ }
+end