License Public Domain
Lines 153
Keywords
match (1) pattern (1) word (1)
Permissions
Owner: jlugocp
Viewable by Everyone
Editable by All Siafoo Users
Hide
Need a quick chart or graph for your blog? Try our reStructured Text renderer. Join Siafoo Now or Learn More

Compare a potentially mis-spelled word against a known word Atom Feed 0

In Brief Compares a potentially mis-spelled word against a known, dictionary word. Returns a score indicating how many fixes needed to make the input word match the dictionary word.
# 's
  1import java.util.HashSet;
2import java.util.Set;
3
4public class WordMatch {
5
6 /**
7 * @param args
8 */
9 public static void main(String[] args) {
10 WordMatch wm = new WordMatch();
11 System.out.println(wm.getNumFixes("pahternm", "pattern"));
12 System.out.println(wm.getNumFixes("pattern", "pattern"));
13 System.out.println(wm.getNumFixes("ptrn", "pattern"));
14 System.out.println(wm.getNumFixes("azxpqrtm", "school"));
15 }
16
17 /**
18 * A perfect match between <code>input</code> and <code>dword</code> returns a numfix of 0. A 1 is added to the
19 * numfix value for each replace, addition and subtraction of a character.
20 *
21 * @param input
22 * The word to get a fix count for
23 * @param dword
24 * The dictionary word to compare with <code>input</code>
25 * @return the numfix score, 0 and higher.
26 */
27 private int getNumFixes(String input, String dword) {
28
29 int inputchars[] = new int[input.length()];
30 int dwordchars[] = new int[dword.length()];
31 set(dwordchars, 0, dwordchars.length, -1);
32
33 // Start with the largest chunks, then get smaller
34 for (int chunkLength = input.length(); chunkLength > 0; chunkLength--) {
35 // try each chunk of length chunkLength
36 for (int chunkStartIndex = 0; chunkStartIndex + chunkLength <= input.length(); chunkStartIndex++) {
37 if (isThereOverlap(inputchars, chunkStartIndex, chunkStartIndex + chunkLength, 0)) {
38 // Have we already found a match on a portion of this chunk?
39 continue;
40 }
41 String chunk = input.substring(chunkStartIndex, chunkStartIndex + chunkLength);
42 int index = dword.indexOf(chunk/* , nextStartIndex */);
43 if (index > -1) {
44 // We found a match
45 // Does this chunk overlap with previous matches?
46 boolean noOverlapAndRelative = !isThereOverlap(dwordchars, index, index + chunkLength, -1);
47 // Is this chunk in the proper position relative to previous matches?
48 noOverlapAndRelative &= isRelative(dwordchars, index, chunkStartIndex, chunkLength);
49 if (!noOverlapAndRelative) {
50 continue;
51 }
52
53 // nextStartIndex = index + chunk.length();
54 set(inputchars, chunkStartIndex, chunkStartIndex + chunkLength, 1);
55 // Store the input character indexes in the matching buckets in dwordchars
56 setIncrementingInt(dwordchars, index, index + chunkLength, chunkStartIndex);
57
58 // skip to the next block of characters outside this block
59 chunkStartIndex = index + chunkLength - 1;// note that the loop will increment chunkStartIndex,
60 // hence the '- 1'
61 }
62 }
63 }
64 return getScore(inputchars, dwordchars);
65 }
66
67 /**
68 * Removes any characters in <code>input</code> which are not in <code>dword</code>
69 *
70 * @param input
71 * The word to remove extra characters from
72 * @param dword
73 * The word to compare <code>input</code> against
74 * @return
75 */
76 private String removeExtraChars(String input, String dword) {
77 Set<String> extraChars = new HashSet<String>();
78 // find the extra characters in input which aren't in dword
79 for (int i = 0; i < input.length(); i++) {
80 String c = input.substring(i, i + 1);
81 if (dword.indexOf(c) == -1) {
82 extraChars.add(c);
83 }
84 }
85 // remove the extra characters
86 for (String c : extraChars) {
87 input = input.replace(c, "");
88 }
89 return input;
90 }
91
92 private int getScore(int[] inputchars, int[] dwordchars) {
93 int score = 0;
94 int inputindex = 0;
95 int dwordindex = 0;
96 while (inputindex < inputchars.length || dwordindex < dwordchars.length) {
97 int dval = -1;
98 if (dwordindex < dwordchars.length) {
99 dval = dwordchars[dwordindex];
100 }
101 int ival = 0;
102 if (inputindex < inputchars.length) {
103 ival = inputchars[inputindex];
104 }
105 if (dval > -1) {
106 if (dval == inputindex) {
107 dwordindex++;
108 inputindex++;
109 continue;
110 } else {
111 // remove the char from the input word
112 inputindex++;
113 score++;
114 }
115 } else {
116 dwordindex++;
117 if (ival == 0) {
118 inputindex++;
119 }
120 score++;
121 }
122 }
123 return score;
124 }
125
126 private boolean isRelative(int[] dwordchars, int index, int chunkStartIndex, int chunkLength) {
127 int chunkEndIndex = chunkStartIndex + chunkLength;
128 for (int i = index + chunkLength; i < dwordchars.length; i++) {
129 if (dwordchars[i] != -1 && dwordchars[i] < chunkEndIndex) {
130 return false;
131 }
132 }
133 return true;
134 }
135
136 /**
137 *
138 * @param intarray
139 * The boolean array to check
140 * @param startIndex
141 * Start index, inclusive
142 * @param endIndex
143 * End index, exclusive
144 * @return boolean true or false
145 */
146 private boolean isThereOverlap(int[] intarray, int startIndex, int endIndex, int initValue) {
147 for (int i = startIndex; i < endIndex; i++) {
148 if (intarray[i] > initValue) {
149 return true;
150 }
151 }
152 return false;
153 }
154
155 private void setIncrementingInt(int[] intarray, int startIndex, int endIndex, int startingInt) {
156 for (int i = startIndex; i < endIndex; i++) {
157 intarray[i] = startingInt++;
158 }
159 }
160
161 private void set(int[] intarray, int startIndex, int endIndex, int value) {
162 for (int i = startIndex; i < endIndex; i++) {
163 intarray[i] = value;
164 }
165 }
166}

Compares a potentially mis-spelled word against a known, dictionary word. Returns a score indicating how many fixes needed to make the input word match the dictionary word.