src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java

Wed, 10 Apr 2013 14:05:11 +0200

author
hannesw
date
Wed, 10 Apr 2013 14:05:11 +0200
changeset 188
b4ea8678bf15
parent 187
635a93b61d34
child 189
8ae9ed1ac1e2
permissions
-rw-r--r--

8011749: Bugs with empty character class handling
Reviewed-by: lagergren, attila

     1 /*
     2  * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.  Oracle designates this
     8  * particular file as subject to the "Classpath" exception as provided
     9  * by Oracle in the LICENSE file that accompanied this code.
    10  *
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    14  * version 2 for more details (a copy is included in the LICENSE file that
    15  * accompanied this code).
    16  *
    17  * You should have received a copy of the GNU General Public License version
    18  * 2 along with this work; if not, write to the Free Software Foundation,
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    20  *
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    22  * or visit www.oracle.com if you need additional information or have any
    23  * questions.
    24  */
    26 package jdk.nashorn.internal.runtime.regexp;
    28 import java.util.HashMap;
    29 import java.util.Iterator;
    30 import java.util.LinkedList;
    31 import java.util.List;
    32 import java.util.Map;
    33 import java.util.regex.PatternSyntaxException;
    35 import jdk.nashorn.internal.parser.Lexer;
    36 import jdk.nashorn.internal.parser.Scanner;
    37 import jdk.nashorn.internal.runtime.BitVector;
    39 /**
    40  * Scan a JavaScript regexp, converting to Java regex if necessary.
    41  *
    42  */
    43 final class RegExpScanner extends Scanner {
    45     /**
    46      * String builder used to rewrite the pattern for the currently used regexp factory.
    47      */
    48     private final StringBuilder sb;
    50     /** Expected token table */
    51     private final Map<Character, Integer> expected = new HashMap<>();
    53     /** Capturing parenthesis that have been found so far. */
    54     private final List<Capture> caps = new LinkedList<>();
    56     /** Forward references to capturing parenthesis to be resolved later.*/
    57     private final LinkedList<Integer> forwardReferences = new LinkedList<>();
    59     /** Current level of zero-width negative lookahead assertions. */
    60     private int negativeLookaheadLevel;
    62     /** Are we currently inside a character class? */
    63     private boolean inCharClass = false;
    65     /** Are we currently inside a negated character class? */
    66     private boolean inNegativeClass = false;
    68     private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
    70     private static class Capture {
    71         /**
    72          * Zero-width negative lookaheads enclosing the capture.
    73          */
    74         private final int negativeLookaheadLevel;
    76         Capture(final int negativeLookaheadLevel) {
    77             this.negativeLookaheadLevel = negativeLookaheadLevel;
    78         }
    80         public int getNegativeLookaheadLevel() {
    81             return negativeLookaheadLevel;
    82         }
    84     }
    86     /**
    87      * Constructor
    88      * @param string the JavaScript regexp to parse
    89      */
    90     private RegExpScanner(final String string) {
    91         super(string);
    92         sb = new StringBuilder(limit);
    93         reset(0);
    94         expected.put(']', 0);
    95         expected.put('}', 0);
    96     }
    98     private void processForwardReferences() {
   100         Iterator<Integer> iterator = forwardReferences.descendingIterator();
   101         while (iterator.hasNext()) {
   102             final int pos = iterator.next();
   103             final int num = iterator.next();
   104             if (num > caps.size()) {
   105                 // Non-existing backreference. If the number begins with a valid octal convert it to
   106                 // Unicode escape and append the rest to a literal character sequence.
   107                 final StringBuilder buffer = new StringBuilder();
   108                 octalOrLiteral(Integer.toString(num), buffer);
   109                 sb.insert(pos, buffer);
   110             }
   111         }
   113         forwardReferences.clear();
   114     }
   116     /**
   117      * Scan a JavaScript regexp string returning a Java safe regex string.
   118      *
   119      * @param string
   120      *            JavaScript regexp string.
   121      * @return Java safe regex string.
   122      */
   123     public static RegExpScanner scan(final String string) {
   124         final RegExpScanner scanner = new RegExpScanner(string);
   126         try {
   127             scanner.disjunction();
   128         } catch (final Exception e) {
   129             throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
   130         }
   132         scanner.processForwardReferences();
   134         // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
   135         if (scanner.position != string.length()) {
   136             final String p = scanner.getStringBuilder().toString();
   137             throw new PatternSyntaxException(string, p, p.length() + 1);
   138         }
   140         return scanner;
   141     }
   143     final StringBuilder getStringBuilder() {
   144         return sb;
   145     }
   147     String getJavaPattern() {
   148         return sb.toString();
   149     }
   151     BitVector getGroupsInNegativeLookahead() {
   152         BitVector vec = null;
   153         for (int i = 0; i < caps.size(); i++) {
   154             final Capture cap = caps.get(i);
   155             if (cap.getNegativeLookaheadLevel() > 0) {
   156                 if (vec == null) {
   157                     vec = new BitVector(caps.size() + 1);
   158                 }
   159                 vec.set(i + 1);
   160             }
   161         }
   162         return vec;
   163     }
   165     /**
   166      * Commit n characters to the builder and to a given token
   167      * @param n     Number of characters.
   168      * @return Committed token
   169      */
   170     private boolean commit(final int n) {
   171         switch (n) {
   172         case 1:
   173             sb.append(ch0);
   174             skip(1);
   175             break;
   176         case 2:
   177             sb.append(ch0);
   178             sb.append(ch1);
   179             skip(2);
   180             break;
   181         case 3:
   182             sb.append(ch0);
   183             sb.append(ch1);
   184             sb.append(ch2);
   185             skip(3);
   186             break;
   187         default:
   188             assert false : "Should not reach here";
   189         }
   191         return true;
   192     }
   194     /**
   195      * Restart the buffers back at an earlier position.
   196      *
   197      * @param startIn
   198      *            Position in the input stream.
   199      * @param startOut
   200      *            Position in the output stream.
   201      */
   202     private void restart(final int startIn, final int startOut) {
   203         reset(startIn);
   204         sb.setLength(startOut);
   205     }
   207     private void push(final char ch) {
   208         expected.put(ch, expected.get(ch) + 1);
   209     }
   211     private void pop(final char ch) {
   212         expected.put(ch, Math.min(0, expected.get(ch) - 1));
   213     }
   215     /*
   216      * Recursive descent tokenizer starts below.
   217      */
   219     /*
   220      * Disjunction ::
   221      *      Alternative
   222      *      Alternative | Disjunction
   223      */
   224     private void disjunction() {
   225         while (true) {
   226             alternative();
   228             if (ch0 == '|') {
   229                 commit(1);
   230             } else {
   231                 break;
   232             }
   233         }
   234     }
   236     /*
   237      * Alternative ::
   238      *      [empty]
   239      *      Alternative Term
   240      */
   241     private void alternative() {
   242         while (term()) {
   243             // do nothing
   244         }
   245     }
   247     /*
   248      * Term ::
   249      *      Assertion
   250      *      Atom
   251      *      Atom Quantifier
   252      */
   253     private boolean term() {
   254         final int startIn  = position;
   255         final int startOut = sb.length();
   257         if (assertion()) {
   258             return true;
   259         }
   261         if (atom()) {
   262             // Check for character classes that never or always match
   263             if (sb.toString().endsWith("[]")) {
   264                 sb.setLength(sb.length() - 1);
   265                 sb.append("^\\s\\S]");
   266             } else if (sb.toString().endsWith("[^]")) {
   267                 sb.setLength(sb.length() - 2);
   268                 sb.append("\\s\\S]");
   269             }
   271             quantifier();
   272             return true;
   273         }
   275         restart(startIn, startOut);
   276         return false;
   277     }
   279     /*
   280      * Assertion ::
   281      *      ^
   282      *      $
   283      *      \b
   284      *      \B
   285      *      ( ? = Disjunction )
   286      *      ( ? ! Disjunction )
   287      */
   288     private boolean assertion() {
   289         final int startIn  = position;
   290         final int startOut = sb.length();
   292         switch (ch0) {
   293         case '^':
   294         case '$':
   295             return commit(1);
   297         case '\\':
   298             if (ch1 == 'b' || ch1 == 'B') {
   299                 return commit(2);
   300             }
   301             break;
   303         case '(':
   304             if (ch1 != '?') {
   305                 break;
   306             }
   307             if (ch2 != '=' && ch2 != '!') {
   308                 break;
   309             }
   310             final boolean isNegativeLookahead = (ch2 == '!');
   311             commit(3);
   313             if (isNegativeLookahead) {
   314                 negativeLookaheadLevel++;
   315             }
   316             disjunction();
   317             if (isNegativeLookahead) {
   318                 negativeLookaheadLevel--;
   319             }
   321             if (ch0 == ')') {
   322                 return commit(1);
   323             }
   324             break;
   326         default:
   327             break;
   328         }
   330         restart(startIn, startOut);
   331         return false;
   332     }
   334     /*
   335      * Quantifier ::
   336      *      QuantifierPrefix
   337      *      QuantifierPrefix ?
   338      */
   339     private boolean quantifier() {
   340         if (quantifierPrefix()) {
   341             if (ch0 == '?') {
   342                 commit(1);
   343             }
   344             return true;
   345         }
   346         return false;
   347     }
   349     /*
   350      * QuantifierPrefix ::
   351      *      *
   352      *      +
   353      *      ?
   354      *      { DecimalDigits }
   355      *      { DecimalDigits , }
   356      *      { DecimalDigits , DecimalDigits }
   357      */
   358     private boolean quantifierPrefix() {
   359         final int startIn  = position;
   360         final int startOut = sb.length();
   362         switch (ch0) {
   363         case '*':
   364         case '+':
   365         case '?':
   366             return commit(1);
   368         case '{':
   369             commit(1);
   371             if (!decimalDigits()) {
   372                 break; // not a quantifier - back out
   373             }
   374             push('}');
   376             if (ch0 == ',') {
   377                 commit(1);
   378                 decimalDigits();
   379             }
   381             if (ch0 == '}') {
   382                 pop('}');
   383                 commit(1);
   384             } else {
   385                 // Bad quantifier should be rejected but is accepted by all major engines
   386                 restart(startIn, startOut);
   387                 return false;
   388             }
   390             return true;
   392         default:
   393             break;
   394         }
   396         restart(startIn, startOut);
   397         return false;
   398     }
   400     /*
   401      * Atom ::
   402      *      PatternCharacter
   403      *      .
   404      *      \ AtomEscape
   405      *      CharacterClass
   406      *      ( Disjunction )
   407      *      ( ? : Disjunction )
   408      *
   409      */
   410     private boolean atom() {
   411         final int startIn  = position;
   412         final int startOut = sb.length();
   414         if (patternCharacter()) {
   415             return true;
   416         }
   418         if (ch0 == '.') {
   419             return commit(1);
   420         }
   422         if (ch0 == '\\') {
   423             commit(1);
   425             if (atomEscape()) {
   426                 return true;
   427             }
   428         }
   430         if (characterClass()) {
   431             return true;
   432         }
   434         if (ch0 == '(') {
   435             boolean capturingParens = true;
   436             commit(1);
   437             if (ch0 == '?' && ch1 == ':') {
   438                 capturingParens = false;
   439                 commit(2);
   440             }
   442             disjunction();
   444             if (ch0 == ')') {
   445                 commit(1);
   446                 if (capturingParens) {
   447                     caps.add(new Capture(negativeLookaheadLevel));
   448                 }
   449                 return true;
   450             }
   451         }
   453         restart(startIn, startOut);
   454         return false;
   455     }
   457     /*
   458      * PatternCharacter ::
   459      *      SourceCharacter but not any of: ^$\.*+?()[]{}|
   460      */
   461     @SuppressWarnings("fallthrough")
   462     private boolean patternCharacter() {
   463         if (atEOF()) {
   464             return false;
   465         }
   467         switch (ch0) {
   468         case '^':
   469         case '$':
   470         case '\\':
   471         case '.':
   472         case '*':
   473         case '+':
   474         case '?':
   475         case '(':
   476         case ')':
   477         case '[':
   478         case '|':
   479             return false;
   481         case '}':
   482         case ']':
   483             final int n = expected.get(ch0);
   484             if (n != 0) {
   485                 return false;
   486             }
   488        case '{':
   489            // if not a valid quantifier escape curly brace to match itself
   490            // this ensures compatibility with other JS implementations
   491            if (!quantifierPrefix()) {
   492                sb.append('\\');
   493                return commit(1);
   494            }
   495            return false;
   497         default:
   498             return commit(1); // SOURCECHARACTER
   499         }
   500     }
   502     /*
   503      * AtomEscape ::
   504      *      DecimalEscape
   505      *      CharacterEscape
   506      *      CharacterClassEscape
   507      */
   508     private boolean atomEscape() {
   509         // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
   510         return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
   511     }
   513     /*
   514      * CharacterEscape ::
   515      *      ControlEscape
   516      *      c ControlLetter
   517      *      HexEscapeSequence
   518      *      UnicodeEscapeSequence
   519      *      IdentityEscape
   520      */
   521     private boolean characterEscape() {
   522         final int startIn  = position;
   523         final int startOut = sb.length();
   525         if (controlEscape()) {
   526             return true;
   527         }
   529         if (ch0 == 'c') {
   530             commit(1);
   531             if (controlLetter()) {
   532                 return true;
   533             }
   534             restart(startIn, startOut);
   535         }
   537         if (hexEscapeSequence() || unicodeEscapeSequence()) {
   538             return true;
   539         }
   541         restart(startIn, startOut);
   542         return false;
   543     }
   545     private boolean scanEscapeSequence(final char leader, final int length) {
   546         final int startIn  = position;
   547         final int startOut = sb.length();
   549         if (ch0 != leader) {
   550             return false;
   551         }
   553         commit(1);
   554         for (int i = 0; i < length; i++) {
   555             final char ch0l = Character.toLowerCase(ch0);
   556             if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
   557                 commit(1);
   558             } else {
   559                 restart(startIn, startOut);
   560                 return false;
   561             }
   562         }
   564         return true;
   565     }
   567     private boolean hexEscapeSequence() {
   568         return scanEscapeSequence('x', 2);
   569     }
   571     private boolean unicodeEscapeSequence() {
   572         return scanEscapeSequence('u', 4);
   573     }
   575     /*
   576      * ControlEscape ::
   577      *      one of fnrtv
   578      */
   579     private boolean controlEscape() {
   580         switch (ch0) {
   581         case 'f':
   582         case 'n':
   583         case 'r':
   584         case 't':
   585         case 'v':
   586             return commit(1);
   588         default:
   589             return false;
   590         }
   591     }
   593     /*
   594      * ControlLetter ::
   595      *      one of abcdefghijklmnopqrstuvwxyz
   596      *      ABCDEFGHIJKLMNOPQRSTUVWXYZ
   597      */
   598     private boolean controlLetter() {
   599         final char c = Character.toUpperCase(ch0);
   600         if (c >= 'A' && c <= 'Z') {
   601             // for some reason java regexps don't like control characters on the
   602             // form "\\ca".match([string with ascii 1 at char0]). Translating
   603             // them to unicode does it though.
   604             sb.setLength(sb.length() - 1);
   605             unicode(c - 'A' + 1, sb);
   606             skip(1);
   607             return true;
   608         }
   609         return false;
   610     }
   612     /*
   613      * IdentityEscape ::
   614      *      SourceCharacter but not IdentifierPart
   615      *      <ZWJ>  (200c)
   616      *      <ZWNJ> (200d)
   617      */
   618     private boolean identityEscape() {
   619         if (atEOF()) {
   620             throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
   621         }
   622         // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
   623         if (ch0 == 'c') {
   624             // Ignore invalid control letter escape if within a character class
   625             if (inCharClass && ch1 != ']') {
   626                 sb.setLength(sb.length() - 1);
   627                 skip(2);
   628                 return true;
   629             } else {
   630                 sb.append('\\'); // Treat invalid \c control sequence as \\c
   631             }
   632         } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
   633             sb.setLength(sb.length() - 1);
   634         }
   635         return commit(1);
   636     }
   638     /*
   639      * DecimalEscape ::
   640      *      DecimalIntegerLiteral [lookahead DecimalDigit]
   641      */
   642     private boolean decimalEscape() {
   643         final int startIn  = position;
   644         final int startOut = sb.length();
   646         if (ch0 == '0' && !isOctalDigit(ch1)) {
   647             skip(1);
   648             //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
   649             sb.append("\u0000");
   650             return true;
   651         }
   653         if (isDecimalDigit(ch0)) {
   655             if (ch0 == '0') {
   656                 // We know this is an octal escape.
   657                 if (inCharClass) {
   658                     // Convert octal escape to unicode escape if inside character class.
   659                     int octalValue = 0;
   660                     while (isOctalDigit(ch0)) {
   661                         octalValue = octalValue * 8 + ch0 - '0';
   662                         skip(1);
   663                     }
   665                     unicode(octalValue, sb);
   667                 } else {
   668                     // Copy decimal escape as-is
   669                     decimalDigits();
   670                 }
   671             } else {
   672                 // This should be a backreference, but could also be an octal escape or even a literal string.
   673                 int decimalValue = 0;
   674                 while (isDecimalDigit(ch0)) {
   675                     decimalValue = decimalValue * 10 + ch0 - '0';
   676                     skip(1);
   677                 }
   679                 if (inCharClass) {
   680                     // No backreferences in character classes. Encode as unicode escape or literal char sequence
   681                     sb.setLength(sb.length() - 1);
   682                     octalOrLiteral(Integer.toString(decimalValue), sb);
   684                 } else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) {
   685                     //  Captures that live inside a negative lookahead are dead after the
   686                     //  lookahead and will be undefined if referenced from outside.
   687                     if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
   688                         sb.setLength(sb.length() - 1);
   689                     } else {
   690                         sb.append(decimalValue);
   691                     }
   692                 } else if (decimalValue > caps.size()) {
   693                     // Forward reference to a capture group. Forward references are always undefined so we can omit
   694                     // it from the output buffer. However, if the target capture does not exist, we need to rewrite
   695                     // the reference as hex escape or literal string, so register the reference for later processing.
   696                     sb.setLength(sb.length() - 1);
   697                     forwardReferences.add(decimalValue);
   698                     forwardReferences.add(sb.length());
   699                 } else {
   700                     // Append as backreference
   701                     sb.append(decimalValue);
   702                 }
   704             }
   705             return true;
   706         }
   708         restart(startIn, startOut);
   709         return false;
   710     }
   712     /*
   713      * CharacterClassEscape ::
   714      *  one of dDsSwW
   715      */
   716     private boolean characterClassEscape() {
   717         switch (ch0) {
   718         // java.util.regex requires translation of \s and \S to explicit character list
   719         case 's':
   720             if (RegExpFactory.usesJavaUtilRegex()) {
   721                 sb.setLength(sb.length() - 1);
   722                 // No nested class required if we already are inside a character class
   723                 if (inCharClass) {
   724                     sb.append(Lexer.getWhitespaceRegExp());
   725                 } else {
   726                     sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
   727                 }
   728                 skip(1);
   729                 return true;
   730             }
   731             return commit(1);
   732         case 'S':
   733             if (RegExpFactory.usesJavaUtilRegex()) {
   734                 sb.setLength(sb.length() - 1);
   735                 // In negative class we must use intersection to get double negation ("not anything else than space")
   736                 sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']');
   737                 skip(1);
   738                 return true;
   739             }
   740             return commit(1);
   741         case 'd':
   742         case 'D':
   743         case 'w':
   744         case 'W':
   745             return commit(1);
   747         default:
   748             return false;
   749         }
   750     }
   752     /*
   753      * CharacterClass ::
   754      *      [ [lookahead {^}] ClassRanges ]
   755      *      [ ^ ClassRanges ]
   756      */
   757     private boolean characterClass() {
   758         final int startIn  = position;
   759         final int startOut = sb.length();
   761         if (ch0 == '[') {
   762             try {
   763                 inCharClass = true;
   764                 push(']');
   765                 commit(1);
   767                 if (ch0 == '^') {
   768                     inNegativeClass = true;
   769                     commit(1);
   770                 }
   772                 if (classRanges() && ch0 == ']') {
   773                     pop(']');
   774                     return commit(1);
   775                 }
   776             } finally {
   777                 inCharClass = false;  // no nested character classes in JavaScript
   778                 inNegativeClass = false;
   779             }
   780         }
   782         restart(startIn, startOut);
   783         return false;
   784     }
   786     /*
   787      * ClassRanges ::
   788      *      [empty]
   789      *      NonemptyClassRanges
   790      */
   791     private boolean classRanges() {
   792         nonemptyClassRanges();
   793         return true;
   794     }
   796     /*
   797      * NonemptyClassRanges ::
   798      *      ClassAtom
   799      *      ClassAtom NonemptyClassRangesNoDash
   800      *      ClassAtom - ClassAtom ClassRanges
   801      */
   802     private boolean nonemptyClassRanges() {
   803         final int startIn  = position;
   804         final int startOut = sb.length();
   806         if (classAtom()) {
   808             if (ch0 == '-') {
   809                 commit(1);
   811                 if (classAtom() && classRanges()) {
   812                     return true;
   813                 }
   814             }
   816             nonemptyClassRangesNoDash();
   818             return true;
   819         }
   821         restart(startIn, startOut);
   822         return false;
   823     }
   825     /*
   826      * NonemptyClassRangesNoDash ::
   827      *      ClassAtom
   828      *      ClassAtomNoDash NonemptyClassRangesNoDash
   829      *      ClassAtomNoDash - ClassAtom ClassRanges
   830      */
   831     private boolean nonemptyClassRangesNoDash() {
   832         final int startIn  = position;
   833         final int startOut = sb.length();
   835         if (classAtomNoDash()) {
   837             // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
   838             if (ch0 == '-') {
   839                commit(1);
   841                if (classAtom() && classRanges()) {
   842                    return true;
   843                }
   844                //fallthru
   845            }
   847             nonemptyClassRangesNoDash();
   848             return true; // still a class atom
   849         }
   851         if (classAtom()) {
   852             return true;
   853         }
   855         restart(startIn, startOut);
   856         return false;
   857     }
   859     /*
   860      * ClassAtom : - ClassAtomNoDash
   861      */
   862     private boolean classAtom() {
   864         if (ch0 == '-') {
   865             return commit(1);
   866         }
   868         return classAtomNoDash();
   869     }
   871     /*
   872      * ClassAtomNoDash ::
   873      *      SourceCharacter but not one of \ or ] or -
   874      *      \ ClassEscape
   875      */
   876     private boolean classAtomNoDash() {
   877         final int startIn  = position;
   878         final int startOut = sb.length();
   880         switch (ch0) {
   881         case ']':
   882         case '-':
   883         case '\0':
   884             return false;
   886         case '[':
   887             // unescaped left square bracket - add escape
   888             sb.append('\\');
   889             return commit(1);
   891         case '\\':
   892             commit(1);
   893             if (classEscape()) {
   894                 return true;
   895             }
   897             restart(startIn, startOut);
   898             return false;
   900         default:
   901             return commit(1);
   902         }
   903     }
   905     /*
   906      * ClassEscape ::
   907      *      DecimalEscape
   908      *      b
   909      *      CharacterEscape
   910      *      CharacterClassEscape
   911      */
   912     private boolean classEscape() {
   914         if (decimalEscape()) {
   915             return true;
   916         }
   918         if (ch0 == 'b') {
   919             sb.setLength(sb.length() - 1);
   920             sb.append('\b');
   921             skip(1);
   922             return true;
   923         }
   925         // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
   926         return characterEscape() || characterClassEscape() || identityEscape();
   927     }
   929     /*
   930      * DecimalDigits
   931      */
   932     private boolean decimalDigits() {
   933         if (!isDecimalDigit(ch0)) {
   934             return false;
   935         }
   937         while (isDecimalDigit(ch0)) {
   938             commit(1);
   939         }
   941         return true;
   942     }
   944     private void unicode(final int value, final StringBuilder buffer) {
   945         final String hex = Integer.toHexString(value);
   946         buffer.append('u');
   947         for (int i = 0; i < 4 - hex.length(); i++) {
   948             buffer.append('0');
   949         }
   950         buffer.append(hex);
   951     }
   953     // Convert what would have been a backreference into a unicode escape, or a number literal, or both.
   954     private void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
   955         final int length = numberLiteral.length();
   956         int octalValue = 0;
   957         int pos = 0;
   958         // Maximum value for octal escape is 0377 (255) so we stop the loop at 32
   959         while (pos < length && octalValue < 0x20) {
   960             final char ch = numberLiteral.charAt(pos);
   961             if (isOctalDigit(ch)) {
   962                 octalValue = octalValue * 8 + ch - '0';
   963             } else {
   964                 break;
   965             }
   966             pos++;
   967         }
   968         if (octalValue > 0) {
   969             buffer.append('\\');
   970             unicode(octalValue, buffer);
   971             buffer.append(numberLiteral.substring(pos));
   972         } else {
   973             buffer.append(numberLiteral);
   974         }
   975     }
   977     private static boolean isOctalDigit(final char ch) {
   978         return ch >= '0' && ch <= '7';
   979     }
   981     private static boolean isDecimalDigit(final char ch) {
   982         return ch >= '0' && ch <= '9';
   983     }
   984 }

mercurial