test/compiler/7070134/Stemmer.java

Thu, 25 Apr 2013 09:24:00 -0700

author
katleman
date
Thu, 25 Apr 2013 09:24:00 -0700
changeset 4925
d080f5168deb
parent 3749
3a97daec1b34
child 6876
710a3c8b516e
permissions
-rw-r--r--

Added tag jdk8-b87 for changeset d4c266784660

     1 /**
     2  * @test
     3  * @bug 7070134
     4  * @summary Hotspot crashes with sigsegv from PorterStemmer
     5  *
     6  * @run shell Test7070134.sh
     7  */
     9 /*
    11    Porter stemmer in Java. The original paper is in
    13        Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
    14        no. 3, pp 130-137,
    16    http://www.tartarus.org/~martin/PorterStemmer
    18    The software is completely free for any purpose, unless notes at the head
    19    of the program text indicates otherwise (which is rare). In any case,
    20    the notes about licensing are never more restrictive than the BSD License.
    22    In every case where the software is not written by me (Martin Porter),
    23    this licensing arrangement has been endorsed by the contributor, and it is
    24    therefore unnecessary to ask the contributor again to confirm it.
    26    I have not asked any contributors (or their employers, if they have them)
    27    for proofs that they have the right to distribute their software in this way.
    29    History:
    31    Release 1
    33    Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
    34    The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
    35    is then out outside the bounds of b.
    37    Release 2
    39    Similarly,
    41    Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
    42    'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
    43    b[j] is then outside the bounds of b.
    45    Release 3
    47    Considerably revised 4/9/00 in the light of many helpful suggestions
    48    from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
    50    Release 4
    52 */
    54 import java.io.*;
    56 /**
    57   * Stemmer, implementing the Porter Stemming Algorithm
    58   *
    59   * The Stemmer class transforms a word into its root form.  The input
    60   * word can be provided a character at time (by calling add()), or at once
    61   * by calling one of the various stem(something) methods.
    62   */
    64 class Stemmer
    65 {  private char[] b;
    66    private int i,     /* offset into b */
    67                i_end, /* offset to end of stemmed word */
    68                j, k;
    69    private static final int INC = 50;
    70                      /* unit of size whereby b is increased */
    71    public Stemmer()
    72    {  b = new char[INC];
    73       i = 0;
    74       i_end = 0;
    75    }
    77    /**
    78     * Add a character to the word being stemmed.  When you are finished
    79     * adding characters, you can call stem(void) to stem the word.
    80     */
    82    public void add(char ch)
    83    {  if (i == b.length)
    84       {  char[] new_b = new char[i+INC];
    85          for (int c = 0; c < i; c++) new_b[c] = b[c];
    86          b = new_b;
    87       }
    88       b[i++] = ch;
    89    }
    92    /** Adds wLen characters to the word being stemmed contained in a portion
    93     * of a char[] array. This is like repeated calls of add(char ch), but
    94     * faster.
    95     */
    97    public void add(char[] w, int wLen)
    98    {  if (i+wLen >= b.length)
    99       {  char[] new_b = new char[i+wLen+INC];
   100          for (int c = 0; c < i; c++) new_b[c] = b[c];
   101          b = new_b;
   102       }
   103       for (int c = 0; c < wLen; c++) b[i++] = w[c];
   104    }
   106    /**
   107     * After a word has been stemmed, it can be retrieved by toString(),
   108     * or a reference to the internal buffer can be retrieved by getResultBuffer
   109     * and getResultLength (which is generally more efficient.)
   110     */
   111    public String toString() { return new String(b,0,i_end); }
   113    /**
   114     * Returns the length of the word resulting from the stemming process.
   115     */
   116    public int getResultLength() { return i_end; }
   118    /**
   119     * Returns a reference to a character buffer containing the results of
   120     * the stemming process.  You also need to consult getResultLength()
   121     * to determine the length of the result.
   122     */
   123    public char[] getResultBuffer() { return b; }
   125    /* cons(i) is true <=> b[i] is a consonant. */
   127    private final boolean cons(int i)
   128    {  switch (b[i])
   129       {  case 'a': case 'e': case 'i': case 'o': case 'u': return false;
   130          case 'y': return (i==0) ? true : !cons(i-1);
   131          default: return true;
   132       }
   133    }
   135    /* m() measures the number of consonant sequences between 0 and j. if c is
   136       a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
   137       presence,
   139          <c><v>       gives 0
   140          <c>vc<v>     gives 1
   141          <c>vcvc<v>   gives 2
   142          <c>vcvcvc<v> gives 3
   143          ....
   144    */
   146    private final int m()
   147    {  int n = 0;
   148       int i = 0;
   149       while(true)
   150       {  if (i > j) return n;
   151          if (! cons(i)) break; i++;
   152       }
   153       i++;
   154       while(true)
   155       {  while(true)
   156          {  if (i > j) return n;
   157                if (cons(i)) break;
   158                i++;
   159          }
   160          i++;
   161          n++;
   162          while(true)
   163          {  if (i > j) return n;
   164             if (! cons(i)) break;
   165             i++;
   166          }
   167          i++;
   168        }
   169    }
   171    /* vowelinstem() is true <=> 0,...j contains a vowel */
   173    private final boolean vowelinstem()
   174    {  int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
   175       return false;
   176    }
   178    /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
   180    private final boolean doublec(int j)
   181    {  if (j < 1) return false;
   182       if (b[j] != b[j-1]) return false;
   183       return cons(j);
   184    }
   186    /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
   187       and also if the second c is not w,x or y. this is used when trying to
   188       restore an e at the end of a short word. e.g.
   190          cav(e), lov(e), hop(e), crim(e), but
   191          snow, box, tray.
   193    */
   195    private final boolean cvc(int i)
   196    {  if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
   197       {  int ch = b[i];
   198          if (ch == 'w' || ch == 'x' || ch == 'y') return false;
   199       }
   200       return true;
   201    }
   203    private final boolean ends(String s)
   204    {  int l = s.length();
   205       int o = k-l+1;
   206       if (o < 0) return false;
   207       for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
   208       j = k-l;
   209       return true;
   210    }
   212    /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
   213       k. */
   215    private final void setto(String s)
   216    {  int l = s.length();
   217       int o = j+1;
   218       for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
   219       k = j+l;
   220    }
   222    /* r(s) is used further down. */
   224    private final void r(String s) { if (m() > 0) setto(s); }
   226    /* step1() gets rid of plurals and -ed or -ing. e.g.
   228           caresses  ->  caress
   229           ponies    ->  poni
   230           ties      ->  ti
   231           caress    ->  caress
   232           cats      ->  cat
   234           feed      ->  feed
   235           agreed    ->  agree
   236           disabled  ->  disable
   238           matting   ->  mat
   239           mating    ->  mate
   240           meeting   ->  meet
   241           milling   ->  mill
   242           messing   ->  mess
   244           meetings  ->  meet
   246    */
   248    private final void step1()
   249    {  if (b[k] == 's')
   250       {  if (ends("sses")) k -= 2; else
   251          if (ends("ies")) setto("i"); else
   252          if (b[k-1] != 's') k--;
   253       }
   254       if (ends("eed")) { if (m() > 0) k--; } else
   255       if ((ends("ed") || ends("ing")) && vowelinstem())
   256       {  k = j;
   257          if (ends("at")) setto("ate"); else
   258          if (ends("bl")) setto("ble"); else
   259          if (ends("iz")) setto("ize"); else
   260          if (doublec(k))
   261          {  k--;
   262             {  int ch = b[k];
   263                if (ch == 'l' || ch == 's' || ch == 'z') k++;
   264             }
   265          }
   266          else if (m() == 1 && cvc(k)) setto("e");
   267      }
   268    }
   270    /* step2() turns terminal y to i when there is another vowel in the stem. */
   272    private final void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; }
   274    /* step3() maps double suffices to single ones. so -ization ( = -ize plus
   275       -ation) maps to -ize etc. note that the string before the suffix must give
   276       m() > 0. */
   278    private final void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1])
   279    {
   280        case 'a': if (ends("ational")) { r("ate"); break; }
   281                  if (ends("tional")) { r("tion"); break; }
   282                  break;
   283        case 'c': if (ends("enci")) { r("ence"); break; }
   284                  if (ends("anci")) { r("ance"); break; }
   285                  break;
   286        case 'e': if (ends("izer")) { r("ize"); break; }
   287                  break;
   288        case 'l': if (ends("bli")) { r("ble"); break; }
   289                  if (ends("alli")) { r("al"); break; }
   290                  if (ends("entli")) { r("ent"); break; }
   291                  if (ends("eli")) { r("e"); break; }
   292                  if (ends("ousli")) { r("ous"); break; }
   293                  break;
   294        case 'o': if (ends("ization")) { r("ize"); break; }
   295                  if (ends("ation")) { r("ate"); break; }
   296                  if (ends("ator")) { r("ate"); break; }
   297                  break;
   298        case 's': if (ends("alism")) { r("al"); break; }
   299                  if (ends("iveness")) { r("ive"); break; }
   300                  if (ends("fulness")) { r("ful"); break; }
   301                  if (ends("ousness")) { r("ous"); break; }
   302                  break;
   303        case 't': if (ends("aliti")) { r("al"); break; }
   304                  if (ends("iviti")) { r("ive"); break; }
   305                  if (ends("biliti")) { r("ble"); break; }
   306                  break;
   307        case 'g': if (ends("logi")) { r("log"); break; }
   308    } }
   310    /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
   312    private final void step4() { switch (b[k])
   313    {
   314        case 'e': if (ends("icate")) { r("ic"); break; }
   315                  if (ends("ative")) { r(""); break; }
   316                  if (ends("alize")) { r("al"); break; }
   317                  break;
   318        case 'i': if (ends("iciti")) { r("ic"); break; }
   319                  break;
   320        case 'l': if (ends("ical")) { r("ic"); break; }
   321                  if (ends("ful")) { r(""); break; }
   322                  break;
   323        case 's': if (ends("ness")) { r(""); break; }
   324                  break;
   325    } }
   327    /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
   329    private final void step5()
   330    {   if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
   331        {  case 'a': if (ends("al")) break; return;
   332           case 'c': if (ends("ance")) break;
   333                     if (ends("ence")) break; return;
   334           case 'e': if (ends("er")) break; return;
   335           case 'i': if (ends("ic")) break; return;
   336           case 'l': if (ends("able")) break;
   337                     if (ends("ible")) break; return;
   338           case 'n': if (ends("ant")) break;
   339                     if (ends("ement")) break;
   340                     if (ends("ment")) break;
   341                     /* element etc. not stripped before the m */
   342                     if (ends("ent")) break; return;
   343           case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
   344                                     /* j >= 0 fixes Bug 2 */
   345                     if (ends("ou")) break; return;
   346                     /* takes care of -ous */
   347           case 's': if (ends("ism")) break; return;
   348           case 't': if (ends("ate")) break;
   349                     if (ends("iti")) break; return;
   350           case 'u': if (ends("ous")) break; return;
   351           case 'v': if (ends("ive")) break; return;
   352           case 'z': if (ends("ize")) break; return;
   353           default: return;
   354        }
   355        if (m() > 1) k = j;
   356    }
   358    /* step6() removes a final -e if m() > 1. */
   360    private final void step6()
   361    {  j = k;
   362       if (b[k] == 'e')
   363       {  int a = m();
   364          if (a > 1 || a == 1 && !cvc(k-1)) k--;
   365       }
   366       if (b[k] == 'l' && doublec(k) && m() > 1) k--;
   367    }
   369    /** Stem the word placed into the Stemmer buffer through calls to add().
   370     * Returns true if the stemming process resulted in a word different
   371     * from the input.  You can retrieve the result with
   372     * getResultLength()/getResultBuffer() or toString().
   373     */
   374    public void stem()
   375    {  k = i - 1;
   376       if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
   377       i_end = k+1; i = 0;
   378    }
   380    /** Test program for demonstrating the Stemmer.  It reads text from a
   381     * a list of files, stems each word, and writes the result to standard
   382     * output. Note that the word stemmed is expected to be in lower case:
   383     * forcing lower case must be done outside the Stemmer class.
   384     * Usage: Stemmer file-name file-name ...
   385     */
   386    public static void main(String[] args)
   387    {
   388       char[] w = new char[501];
   389       Stemmer s = new Stemmer();
   390       for (int i = 0; i < args.length; i++)
   391       try
   392       {
   393          FileInputStream in = new FileInputStream(args[i]);
   395          try
   396          { while(true)
   398            {  int ch = in.read();
   399               if (Character.isLetter((char) ch))
   400               {
   401                  int j = 0;
   402                  while(true)
   403                  {  ch = Character.toLowerCase((char) ch);
   404                     w[j] = (char) ch;
   405                     if (j < 500) j++;
   406                     ch = in.read();
   407                     if (!Character.isLetter((char) ch))
   408                     {
   409                        /* to test add(char ch) */
   410                        for (int c = 0; c < j; c++) s.add(w[c]);
   412                        /* or, to test add(char[] w, int j) */
   413                        /* s.add(w, j); */
   415                        s.stem();
   416                        {  String u;
   418                           /* and now, to test toString() : */
   419                           u = s.toString();
   421                           /* to test getResultBuffer(), getResultLength() : */
   422                           /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */
   424                           System.out.print(u);
   425                        }
   426                        break;
   427                     }
   428                  }
   429               }
   430               if (ch < 0) break;
   431               System.out.print((char)ch);
   432            }
   433          }
   434          catch (IOException e)
   435          {  System.out.println("error reading " + args[i]);
   436             break;
   437          }
   438       }
   439       catch (FileNotFoundException e)
   440       {  System.out.println("file " + args[i] + " not found");
   441          break;
   442       }
   443    }
   444 }

mercurial