aboutsummaryrefslogtreecommitdiff
path: root/pkg/utilities/nttools/stxtools/similar.x
blob: a7b1a644fd97ddf663aa6ef2308dde6d3d11f86a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
define	SZ_STACK	25

# SIMILAR -- Return a score base on the similarity between two strings

# This procedure returns a number representing the similarity between 
# two strings. The number is computed by finding the combined length of 
# all the common substrings between the two strings, normalized to a value
# between zero and one hundred.
#
# B.Simon	13-Mar-89	Original

int procedure similar (str1, str2)

char	str1[ARB]	# i: First string
char	str2[ARB]	# i: Second string
#--
int	score, istack, len1, len2, maxch, ic, nc
pointer	stack[4,SZ_STACK]
pointer	sp, word1, word2, s1, s2
pointer	start1, end1, start2, end2, newstart1, newend1, newstart2, newend2

string	overflow  "Stack overflow in procedure similar"

int	strlen()

begin
	# If either string is zero length, return zero as score

	score = 0
	len1 = strlen (str1)
	len2 = strlen (str2)
	if (len1 == 0 || len2 == 0)
	    return (score)

	# Compare the lower case version of the strings

	call smark (sp)
	call salloc (word1, len1, TY_CHAR)
	call salloc (word2, len2, TY_CHAR)

	call strcpy (str1, Memc[word1], len1)
	call strlwr (Memc[word1])

	call strcpy (str2, Memc[word2], len2)
	call strlwr (Memc[word2])

	# The first substrings to compare are the entire strings

	istack = 1
	stack[1,istack] = word1
	stack[2,istack] = word1 + len1 - 1
	stack[3,istack] = word2
	stack[4,istack] = word2 + len2 - 1
	
	# While there are more substrings on the stack

	while (istack > 0) {

	    # Find the longest match between the substrings

	    maxch = 0
	    start1 = stack[1,istack]
	    end1 = stack[2,istack]
	    start2 = stack[3,istack]
	    end2 = stack[4,istack]

	    for (s1 = start1; s1 <= end1 - maxch; s1 = s1 + 1) {

		nc = end1 - s1

		for (s2 = start2; s2 <= end2 - maxch; s2 = s2 + 1) {

		    if (Memc[s1] == Memc[s2]) {

			# Compute the length of the match

			for (ic = 1; 
			     ic <= nc && Memc[s1+ic] == Memc[s2+ic]; 
			     ic = ic + 1)
			    ;

			# If this is the longest match so far, save
			# the length and start and end points

			if (ic > maxch) {
			   maxch = ic
			   newstart1 = s1
			   newstart2 = s2
			   newend1 = s1 + ic - 1
			   newend2 = s2 + ic - 1
			}
			s2 = s2 + ic - 1
		    }
		}
	    }

	    # Pop the stack and push the new substings on the stack

	    istack = istack - 1
	    if (maxch > 0) {
		score = score + 2 * maxch

		if (start1 != newstart1 && start2 != newstart2) {
		    if (istack == SZ_STACK)
			call error (1, overflow)
		    istack = istack + 1
		    stack[1,istack] = start1
		    stack[2,istack] = newstart1 - 1
		    stack[3,istack] = start2
		    stack[4,istack] = newstart2 - 1
		}

		if (end1 != newend1 && end2 != newend2) {
		    if (istack == SZ_STACK)
			call error (1, overflow)
		    istack = istack + 1
		    stack[1,istack] = newend1 + 1
		    stack[2,istack] = end1
		    stack[3,istack] = newend2 + 1
		    stack[4,istack] = end2
		}
	    }
	}

	call sfree (sp)
	return (100 * score / (len1 + len2))
end