Wed, 22 Jan 2014 12:37:28 -0800
Added tag jdk8u5-b05 for changeset b90de55aca30
1 /**
2 * @test
3 * @bug 7070134
4 * @summary Hotspot crashes with sigsegv from PorterStemmer
5 *
6 * @run shell Test7070134.sh
7 */
9 /*
11 Porter stemmer in Java. The original paper is in
13 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
14 no. 3, pp 130-137,
16 http://www.tartarus.org/~martin/PorterStemmer
18 The software is completely free for any purpose, unless notes at the head
19 of the program text indicates otherwise (which is rare). In any case,
20 the notes about licensing are never more restrictive than the BSD License.
22 In every case where the software is not written by me (Martin Porter),
23 this licensing arrangement has been endorsed by the contributor, and it is
24 therefore unnecessary to ask the contributor again to confirm it.
26 I have not asked any contributors (or their employers, if they have them)
27 for proofs that they have the right to distribute their software in this way.
29 History:
31 Release 1
33 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
34 The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
35 is then out outside the bounds of b.
37 Release 2
39 Similarly,
41 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
42 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
43 b[j] is then outside the bounds of b.
45 Release 3
47 Considerably revised 4/9/00 in the light of many helpful suggestions
48 from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
50 Release 4
52 */
54 import java.io.*;
56 /**
57 * Stemmer, implementing the Porter Stemming Algorithm
58 *
59 * The Stemmer class transforms a word into its root form. The input
60 * word can be provided a character at time (by calling add()), or at once
61 * by calling one of the various stem(something) methods.
62 */
64 class Stemmer
65 { private char[] b;
66 private int i, /* offset into b */
67 i_end, /* offset to end of stemmed word */
68 j, k;
69 private static final int INC = 50;
70 /* unit of size whereby b is increased */
71 public Stemmer()
72 { b = new char[INC];
73 i = 0;
74 i_end = 0;
75 }
77 /**
78 * Add a character to the word being stemmed. When you are finished
79 * adding characters, you can call stem(void) to stem the word.
80 */
82 public void add(char ch)
83 { if (i == b.length)
84 { char[] new_b = new char[i+INC];
85 for (int c = 0; c < i; c++) new_b[c] = b[c];
86 b = new_b;
87 }
88 b[i++] = ch;
89 }
92 /** Adds wLen characters to the word being stemmed contained in a portion
93 * of a char[] array. This is like repeated calls of add(char ch), but
94 * faster.
95 */
97 public void add(char[] w, int wLen)
98 { if (i+wLen >= b.length)
99 { char[] new_b = new char[i+wLen+INC];
100 for (int c = 0; c < i; c++) new_b[c] = b[c];
101 b = new_b;
102 }
103 for (int c = 0; c < wLen; c++) b[i++] = w[c];
104 }
106 /**
107 * After a word has been stemmed, it can be retrieved by toString(),
108 * or a reference to the internal buffer can be retrieved by getResultBuffer
109 * and getResultLength (which is generally more efficient.)
110 */
111 public String toString() { return new String(b,0,i_end); }
113 /**
114 * Returns the length of the word resulting from the stemming process.
115 */
116 public int getResultLength() { return i_end; }
118 /**
119 * Returns a reference to a character buffer containing the results of
120 * the stemming process. You also need to consult getResultLength()
121 * to determine the length of the result.
122 */
123 public char[] getResultBuffer() { return b; }
125 /* cons(i) is true <=> b[i] is a consonant. */
127 private final boolean cons(int i)
128 { switch (b[i])
129 { case 'a': case 'e': case 'i': case 'o': case 'u': return false;
130 case 'y': return (i==0) ? true : !cons(i-1);
131 default: return true;
132 }
133 }
135 /* m() measures the number of consonant sequences between 0 and j. if c is
136 a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
137 presence,
139 <c><v> gives 0
140 <c>vc<v> gives 1
141 <c>vcvc<v> gives 2
142 <c>vcvcvc<v> gives 3
143 ....
144 */
146 private final int m()
147 { int n = 0;
148 int i = 0;
149 while(true)
150 { if (i > j) return n;
151 if (! cons(i)) break; i++;
152 }
153 i++;
154 while(true)
155 { while(true)
156 { if (i > j) return n;
157 if (cons(i)) break;
158 i++;
159 }
160 i++;
161 n++;
162 while(true)
163 { if (i > j) return n;
164 if (! cons(i)) break;
165 i++;
166 }
167 i++;
168 }
169 }
171 /* vowelinstem() is true <=> 0,...j contains a vowel */
173 private final boolean vowelinstem()
174 { int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
175 return false;
176 }
178 /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
180 private final boolean doublec(int j)
181 { if (j < 1) return false;
182 if (b[j] != b[j-1]) return false;
183 return cons(j);
184 }
186 /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
187 and also if the second c is not w,x or y. this is used when trying to
188 restore an e at the end of a short word. e.g.
190 cav(e), lov(e), hop(e), crim(e), but
191 snow, box, tray.
193 */
195 private final boolean cvc(int i)
196 { if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
197 { int ch = b[i];
198 if (ch == 'w' || ch == 'x' || ch == 'y') return false;
199 }
200 return true;
201 }
203 private final boolean ends(String s)
204 { int l = s.length();
205 int o = k-l+1;
206 if (o < 0) return false;
207 for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
208 j = k-l;
209 return true;
210 }
212 /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
213 k. */
215 private final void setto(String s)
216 { int l = s.length();
217 int o = j+1;
218 for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
219 k = j+l;
220 }
222 /* r(s) is used further down. */
224 private final void r(String s) { if (m() > 0) setto(s); }
226 /* step1() gets rid of plurals and -ed or -ing. e.g.
228 caresses -> caress
229 ponies -> poni
230 ties -> ti
231 caress -> caress
232 cats -> cat
234 feed -> feed
235 agreed -> agree
236 disabled -> disable
238 matting -> mat
239 mating -> mate
240 meeting -> meet
241 milling -> mill
242 messing -> mess
244 meetings -> meet
246 */
248 private final void step1()
249 { if (b[k] == 's')
250 { if (ends("sses")) k -= 2; else
251 if (ends("ies")) setto("i"); else
252 if (b[k-1] != 's') k--;
253 }
254 if (ends("eed")) { if (m() > 0) k--; } else
255 if ((ends("ed") || ends("ing")) && vowelinstem())
256 { k = j;
257 if (ends("at")) setto("ate"); else
258 if (ends("bl")) setto("ble"); else
259 if (ends("iz")) setto("ize"); else
260 if (doublec(k))
261 { k--;
262 { int ch = b[k];
263 if (ch == 'l' || ch == 's' || ch == 'z') k++;
264 }
265 }
266 else if (m() == 1 && cvc(k)) setto("e");
267 }
268 }
270 /* step2() turns terminal y to i when there is another vowel in the stem. */
272 private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }
274 /* step3() maps double suffices to single ones. so -ization ( = -ize plus
275 -ation) maps to -ize etc. note that the string before the suffix must give
276 m() > 0. */
278 private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1])
279 {
280 case 'a': if (ends("ational")) { r("ate"); break; }
281 if (ends("tional")) { r("tion"); break; }
282 break;
283 case 'c': if (ends("enci")) { r("ence"); break; }
284 if (ends("anci")) { r("ance"); break; }
285 break;
286 case 'e': if (ends("izer")) { r("ize"); break; }
287 break;
288 case 'l': if (ends("bli")) { r("ble"); break; }
289 if (ends("alli")) { r("al"); break; }
290 if (ends("entli")) { r("ent"); break; }
291 if (ends("eli")) { r("e"); break; }
292 if (ends("ousli")) { r("ous"); break; }
293 break;
294 case 'o': if (ends("ization")) { r("ize"); break; }
295 if (ends("ation")) { r("ate"); break; }
296 if (ends("ator")) { r("ate"); break; }
297 break;
298 case 's': if (ends("alism")) { r("al"); break; }
299 if (ends("iveness")) { r("ive"); break; }
300 if (ends("fulness")) { r("ful"); break; }
301 if (ends("ousness")) { r("ous"); break; }
302 break;
303 case 't': if (ends("aliti")) { r("al"); break; }
304 if (ends("iviti")) { r("ive"); break; }
305 if (ends("biliti")) { r("ble"); break; }
306 break;
307 case 'g': if (ends("logi")) { r("log"); break; }
308 } }
310 /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
312 private final void step4() { switch (b[k])
313 {
314 case 'e': if (ends("icate")) { r("ic"); break; }
315 if (ends("ative")) { r(""); break; }
316 if (ends("alize")) { r("al"); break; }
317 break;
318 case 'i': if (ends("iciti")) { r("ic"); break; }
319 break;
320 case 'l': if (ends("ical")) { r("ic"); break; }
321 if (ends("ful")) { r(""); break; }
322 break;
323 case 's': if (ends("ness")) { r(""); break; }
324 break;
325 } }
327 /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
329 private final void step5()
330 { if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
331 { case 'a': if (ends("al")) break; return;
332 case 'c': if (ends("ance")) break;
333 if (ends("ence")) break; return;
334 case 'e': if (ends("er")) break; return;
335 case 'i': if (ends("ic")) break; return;
336 case 'l': if (ends("able")) break;
337 if (ends("ible")) break; return;
338 case 'n': if (ends("ant")) break;
339 if (ends("ement")) break;
340 if (ends("ment")) break;
341 /* element etc. not stripped before the m */
342 if (ends("ent")) break; return;
343 case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
344 /* j >= 0 fixes Bug 2 */
345 if (ends("ou")) break; return;
346 /* takes care of -ous */
347 case 's': if (ends("ism")) break; return;
348 case 't': if (ends("ate")) break;
349 if (ends("iti")) break; return;
350 case 'u': if (ends("ous")) break; return;
351 case 'v': if (ends("ive")) break; return;
352 case 'z': if (ends("ize")) break; return;
353 default: return;
354 }
355 if (m() > 1) k = j;
356 }
358 /* step6() removes a final -e if m() > 1. */
360 private final void step6()
361 { j = k;
362 if (b[k] == 'e')
363 { int a = m();
364 if (a > 1 || a == 1 && !cvc(k-1)) k--;
365 }
366 if (b[k] == 'l' && doublec(k) && m() > 1) k--;
367 }
369 /** Stem the word placed into the Stemmer buffer through calls to add().
370 * Returns true if the stemming process resulted in a word different
371 * from the input. You can retrieve the result with
372 * getResultLength()/getResultBuffer() or toString().
373 */
374 public void stem()
375 { k = i - 1;
376 if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
377 i_end = k+1; i = 0;
378 }
380 /** Test program for demonstrating the Stemmer. It reads text from a
381 * a list of files, stems each word, and writes the result to standard
382 * output. Note that the word stemmed is expected to be in lower case:
383 * forcing lower case must be done outside the Stemmer class.
384 * Usage: Stemmer file-name file-name ...
385 */
386 public static void main(String[] args)
387 {
388 char[] w = new char[501];
389 Stemmer s = new Stemmer();
390 for (int i = 0; i < args.length; i++)
391 try
392 {
393 FileInputStream in = new FileInputStream(args[i]);
395 try
396 { while(true)
398 { int ch = in.read();
399 if (Character.isLetter((char) ch))
400 {
401 int j = 0;
402 while(true)
403 { ch = Character.toLowerCase((char) ch);
404 w[j] = (char) ch;
405 if (j < 500) j++;
406 ch = in.read();
407 if (!Character.isLetter((char) ch))
408 {
409 /* to test add(char ch) */
410 for (int c = 0; c < j; c++) s.add(w[c]);
412 /* or, to test add(char[] w, int j) */
413 /* s.add(w, j); */
415 s.stem();
416 { String u;
418 /* and now, to test toString() : */
419 u = s.toString();
421 /* to test getResultBuffer(), getResultLength() : */
422 /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */
424 System.out.print(u);
425 }
426 break;
427 }
428 }
429 }
430 if (ch < 0) break;
431 System.out.print((char)ch);
432 }
433 }
434 catch (IOException e)
435 { System.out.println("error reading " + args[i]);
436 break;
437 }
438 }
439 catch (FileNotFoundException e)
440 { System.out.println("file " + args[i] + " not found");
441 break;
442 }
443 }
444 }