jdk8-mips64-public/hotspot: test/compiler/7070134/Stemmer.java@2bac854670c0

     1 /**

     2  * @test

     3  * @bug 7070134

     4  * @summary Hotspot crashes with sigsegv from PorterStemmer

     5  *

     6  * @run shell Test7070134.sh

     7  */

     9 /*

    11    Porter stemmer in Java. The original paper is in

    13        Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,

    14        no. 3, pp 130-137,

    16    http://www.tartarus.org/~martin/PorterStemmer

    18    The software is completely free for any purpose, unless notes at the head

    19    of the program text indicates otherwise (which is rare). In any case,

    20    the notes about licensing are never more restrictive than the BSD License.

    22    In every case where the software is not written by me (Martin Porter),

    23    this licensing arrangement has been endorsed by the contributor, and it is

    24    therefore unnecessary to ask the contributor again to confirm it.

    26    I have not asked any contributors (or their employers, if they have them)

    27    for proofs that they have the right to distribute their software in this way.

    29    History:

    31    Release 1

    33    Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.

    34    The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]

    35    is then out outside the bounds of b.

    37    Release 2

    39    Similarly,

    41    Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.

    42    'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and

    43    b[j] is then outside the bounds of b.

    45    Release 3

    47    Considerably revised 4/9/00 in the light of many helpful suggestions

    48    from Brian Goetz of Quiotix Corporation (brian@quiotix.com).

    50    Release 4

    52 */

    54 import java.io.*;

    56 /**

    57   * Stemmer, implementing the Porter Stemming Algorithm

    58   *

    59   * The Stemmer class transforms a word into its root form.  The input

    60   * word can be provided a character at time (by calling add()), or at once

    61   * by calling one of the various stem(something) methods.

    62   */

    64 class Stemmer

    65 {  private char[] b;

    66    private int i,     /* offset into b */

    67                i_end, /* offset to end of stemmed word */

    68                j, k;

    69    private static final int INC = 50;

    70                      /* unit of size whereby b is increased */

    71    public Stemmer()

    72    {  b = new char[INC];

    73       i = 0;

    74       i_end = 0;

    75    }

    77    /**

    78     * Add a character to the word being stemmed.  When you are finished

    79     * adding characters, you can call stem(void) to stem the word.

    80     */

    82    public void add(char ch)

    83    {  if (i == b.length)

    84       {  char[] new_b = new char[i+INC];

    85          for (int c = 0; c < i; c++) new_b[c] = b[c];

    86          b = new_b;

    87       }

    88       b[i++] = ch;

    89    }

    92    /** Adds wLen characters to the word being stemmed contained in a portion

    93     * of a char[] array. This is like repeated calls of add(char ch), but

    94     * faster.

    95     */

    97    public void add(char[] w, int wLen)

    98    {  if (i+wLen >= b.length)

    99       {  char[] new_b = new char[i+wLen+INC];

   100          for (int c = 0; c < i; c++) new_b[c] = b[c];

   101          b = new_b;

   102       }

   103       for (int c = 0; c < wLen; c++) b[i++] = w[c];

   104    }

   106    /**

   107     * After a word has been stemmed, it can be retrieved by toString(),

   108     * or a reference to the internal buffer can be retrieved by getResultBuffer

   109     * and getResultLength (which is generally more efficient.)

   110     */

   111    public String toString() { return new String(b,0,i_end); }

   113    /**

   114     * Returns the length of the word resulting from the stemming process.

   115     */

   116    public int getResultLength() { return i_end; }

   118    /**

   119     * Returns a reference to a character buffer containing the results of

   120     * the stemming process.  You also need to consult getResultLength()

   121     * to determine the length of the result.

   122     */

   123    public char[] getResultBuffer() { return b; }

   125    /* cons(i) is true <=> b[i] is a consonant. */

   127    private final boolean cons(int i)

   128    {  switch (b[i])

   129       {  case 'a': case 'e': case 'i': case 'o': case 'u': return false;

   130          case 'y': return (i==0) ? true : !cons(i-1);

   131          default: return true;

   132       }

   133    }

   135    /* m() measures the number of consonant sequences between 0 and j. if c is

   136       a consonant sequence and v a vowel sequence, and <..> indicates arbitrary

   137       presence,

   139          <c><v>       gives 0

   140          <c>vc<v>     gives 1

   141          <c>vcvc<v>   gives 2

   142          <c>vcvcvc<v> gives 3

   143          ....

   144    */

   146    private final int m()

   147    {  int n = 0;

   148       int i = 0;

   149       while(true)

   150       {  if (i > j) return n;

   151          if (! cons(i)) break; i++;

   152       }

   153       i++;

   154       while(true)

   155       {  while(true)

   156          {  if (i > j) return n;

   157                if (cons(i)) break;

   158                i++;

   159          }

   160          i++;

   161          n++;

   162          while(true)

   163          {  if (i > j) return n;

   164             if (! cons(i)) break;

   165             i++;

   166          }

   167          i++;

   168        }

   169    }

   171    /* vowelinstem() is true <=> 0,...j contains a vowel */

   173    private final boolean vowelinstem()

   174    {  int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;

   175       return false;

   176    }

   178    /* doublec(j) is true <=> j,(j-1) contain a double consonant. */

   180    private final boolean doublec(int j)

   181    {  if (j < 1) return false;

   182       if (b[j] != b[j-1]) return false;

   183       return cons(j);

   184    }

   186    /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant

   187       and also if the second c is not w,x or y. this is used when trying to

   188       restore an e at the end of a short word. e.g.

   190          cav(e), lov(e), hop(e), crim(e), but

   191          snow, box, tray.

   193    */

   195    private final boolean cvc(int i)

   196    {  if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;

   197       {  int ch = b[i];

   198          if (ch == 'w' || ch == 'x' || ch == 'y') return false;

   199       }

   200       return true;

   201    }

   203    private final boolean ends(String s)

   204    {  int l = s.length();

   205       int o = k-l+1;

   206       if (o < 0) return false;

   207       for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;

   208       j = k-l;

   209       return true;

   210    }

   212    /* setto(s) sets (j+1),...k to the characters in the string s, readjusting

   213       k. */

   215    private final void setto(String s)

   216    {  int l = s.length();

   217       int o = j+1;

   218       for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);

   219       k = j+l;

   220    }

   222    /* r(s) is used further down. */

   224    private final void r(String s) { if (m() > 0) setto(s); }

   226    /* step1() gets rid of plurals and -ed or -ing. e.g.

   228           caresses  ->  caress

   229           ponies    ->  poni

   230           ties      ->  ti

   231           caress    ->  caress

   232           cats      ->  cat

   234           feed      ->  feed

   235           agreed    ->  agree

   236           disabled  ->  disable

   238           matting   ->  mat

   239           mating    ->  mate

   240           meeting   ->  meet

   241           milling   ->  mill

   242           messing   ->  mess

   244           meetings  ->  meet

   246    */

   248    private final void step1()

   249    {  if (b[k] == 's')

   250       {  if (ends("sses")) k -= 2; else

   251          if (ends("ies")) setto("i"); else

   252          if (b[k-1] != 's') k--;

   253       }

   254       if (ends("eed")) { if (m() > 0) k--; } else

   255       if ((ends("ed") || ends("ing")) && vowelinstem())

   256       {  k = j;

   257          if (ends("at")) setto("ate"); else

   258          if (ends("bl")) setto("ble"); else

   259          if (ends("iz")) setto("ize"); else

   260          if (doublec(k))

   261          {  k--;

   262             {  int ch = b[k];

   263                if (ch == 'l' || ch == 's' || ch == 'z') k++;

   264             }

   265          }

   266          else if (m() == 1 && cvc(k)) setto("e");

   267      }

   268    }

   270    /* step2() turns terminal y to i when there is another vowel in the stem. */

   272    private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }

   274    /* step3() maps double suffices to single ones. so -ization ( = -ize plus

   275       -ation) maps to -ize etc. note that the string before the suffix must give

   276       m() > 0. */

   278    private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1])

   279    {

   280        case 'a': if (ends("ational")) { r("ate"); break; }

   281                  if (ends("tional")) { r("tion"); break; }

   282                  break;

   283        case 'c': if (ends("enci")) { r("ence"); break; }

   284                  if (ends("anci")) { r("ance"); break; }

   285                  break;

   286        case 'e': if (ends("izer")) { r("ize"); break; }

   287                  break;

   288        case 'l': if (ends("bli")) { r("ble"); break; }

   289                  if (ends("alli")) { r("al"); break; }

   290                  if (ends("entli")) { r("ent"); break; }

   291                  if (ends("eli")) { r("e"); break; }

   292                  if (ends("ousli")) { r("ous"); break; }

   293                  break;

   294        case 'o': if (ends("ization")) { r("ize"); break; }

   295                  if (ends("ation")) { r("ate"); break; }

   296                  if (ends("ator")) { r("ate"); break; }

   297                  break;

   298        case 's': if (ends("alism")) { r("al"); break; }

   299                  if (ends("iveness")) { r("ive"); break; }

   300                  if (ends("fulness")) { r("ful"); break; }

   301                  if (ends("ousness")) { r("ous"); break; }

   302                  break;

   303        case 't': if (ends("aliti")) { r("al"); break; }

   304                  if (ends("iviti")) { r("ive"); break; }

   305                  if (ends("biliti")) { r("ble"); break; }

   306                  break;

   307        case 'g': if (ends("logi")) { r("log"); break; }

   308    } }

   310    /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */

   312    private final void step4() { switch (b[k])

   313    {

   314        case 'e': if (ends("icate")) { r("ic"); break; }

   315                  if (ends("ative")) { r(""); break; }

   316                  if (ends("alize")) { r("al"); break; }

   317                  break;

   318        case 'i': if (ends("iciti")) { r("ic"); break; }

   319                  break;

   320        case 'l': if (ends("ical")) { r("ic"); break; }

   321                  if (ends("ful")) { r(""); break; }

   322                  break;

   323        case 's': if (ends("ness")) { r(""); break; }

   324                  break;

   325    } }

   327    /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */

   329    private final void step5()

   330    {   if (k == 0) return; /* for Bug 1 */ switch (b[k-1])

   331        {  case 'a': if (ends("al")) break; return;

   332           case 'c': if (ends("ance")) break;

   333                     if (ends("ence")) break; return;

   334           case 'e': if (ends("er")) break; return;

   335           case 'i': if (ends("ic")) break; return;

   336           case 'l': if (ends("able")) break;

   337                     if (ends("ible")) break; return;

   338           case 'n': if (ends("ant")) break;

   339                     if (ends("ement")) break;

   340                     if (ends("ment")) break;

   341                     /* element etc. not stripped before the m */

   342                     if (ends("ent")) break; return;

   343           case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;

   344                                     /* j >= 0 fixes Bug 2 */

   345                     if (ends("ou")) break; return;

   346                     /* takes care of -ous */

   347           case 's': if (ends("ism")) break; return;

   348           case 't': if (ends("ate")) break;

   349                     if (ends("iti")) break; return;

   350           case 'u': if (ends("ous")) break; return;

   351           case 'v': if (ends("ive")) break; return;

   352           case 'z': if (ends("ize")) break; return;

   353           default: return;

   354        }

   355        if (m() > 1) k = j;

   356    }

   358    /* step6() removes a final -e if m() > 1. */

   360    private final void step6()

   361    {  j = k;

   362       if (b[k] == 'e')

   363       {  int a = m();

   364          if (a > 1 || a == 1 && !cvc(k-1)) k--;

   365       }

   366       if (b[k] == 'l' && doublec(k) && m() > 1) k--;

   367    }

   369    /** Stem the word placed into the Stemmer buffer through calls to add().

   370     * Returns true if the stemming process resulted in a word different

   371     * from the input.  You can retrieve the result with

   372     * getResultLength()/getResultBuffer() or toString().

   373     */

   374    public void stem()

   375    {  k = i - 1;

   376       if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }

   377       i_end = k+1; i = 0;

   378    }

   380    /** Test program for demonstrating the Stemmer.  It reads text from a

   381     * a list of files, stems each word, and writes the result to standard

   382     * output. Note that the word stemmed is expected to be in lower case:

   383     * forcing lower case must be done outside the Stemmer class.

   384     * Usage: Stemmer file-name file-name ...

   385     */

   386    public static void main(String[] args)

   387    {

   388       char[] w = new char[501];

   389       Stemmer s = new Stemmer();

   390       for (int i = 0; i < args.length; i++)

   391       try

   392       {

   393          FileInputStream in = new FileInputStream(args[i]);

   395          try

   396          { while(true)

   398            {  int ch = in.read();

   399               if (Character.isLetter((char) ch))

   400               {

   401                  int j = 0;

   402                  while(true)

   403                  {  ch = Character.toLowerCase((char) ch);

   404                     w[j] = (char) ch;

   405                     if (j < 500) j++;

   406                     ch = in.read();

   407                     if (!Character.isLetter((char) ch))

   408                     {

   409                        /* to test add(char ch) */

   410                        for (int c = 0; c < j; c++) s.add(w[c]);

   412                        /* or, to test add(char[] w, int j) */

   413                        /* s.add(w, j); */

   415                        s.stem();

   416                        {  String u;

   418                           /* and now, to test toString() : */

   419                           u = s.toString();

   421                           /* to test getResultBuffer(), getResultLength() : */

   422                           /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */

   424                           System.out.print(u);

   425                        }

   426                        break;

   427                     }

   428                  }

   429               }

   430               if (ch < 0) break;

   431               System.out.print((char)ch);

   432            }

   433          }

   434          catch (IOException e)

   435          {  System.out.println("error reading " + args[i]);

   436             break;

   437          }

   438       }

   439       catch (FileNotFoundException e)

   440       {  System.out.println("file " + args[i] + " not found");

   441          break;

   442       }

   443    }

   444 }

Mercurial > jdk8-mips64-public > hotspot / file revision

test/compiler/7070134/Stemmer.java@2bac854670c0

test/compiler/7070134/Stemmer.java