Thu, 11 Apr 2013 12:16:39 +0200
8011980: Allow NUL character in character class
Reviewed-by: sundar, lagergren
jlaskey@3 | 1 | /* |
jlaskey@7 | 2 | * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. |
jlaskey@3 | 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
jlaskey@3 | 4 | * |
jlaskey@3 | 5 | * This code is free software; you can redistribute it and/or modify it |
jlaskey@3 | 6 | * under the terms of the GNU General Public License version 2 only, as |
jlaskey@3 | 7 | * published by the Free Software Foundation. Oracle designates this |
jlaskey@3 | 8 | * particular file as subject to the "Classpath" exception as provided |
jlaskey@3 | 9 | * by Oracle in the LICENSE file that accompanied this code. |
jlaskey@3 | 10 | * |
jlaskey@3 | 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
jlaskey@3 | 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
jlaskey@3 | 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
jlaskey@3 | 14 | * version 2 for more details (a copy is included in the LICENSE file that |
jlaskey@3 | 15 | * accompanied this code). |
jlaskey@3 | 16 | * |
jlaskey@3 | 17 | * You should have received a copy of the GNU General Public License version |
jlaskey@3 | 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
jlaskey@3 | 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
jlaskey@3 | 20 | * |
jlaskey@3 | 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
jlaskey@3 | 22 | * or visit www.oracle.com if you need additional information or have any |
jlaskey@3 | 23 | * questions. |
jlaskey@3 | 24 | */ |
jlaskey@3 | 25 | |
hannesw@114 | 26 | package jdk.nashorn.internal.runtime.regexp; |
jlaskey@3 | 27 | |
jlaskey@3 | 28 | import java.util.HashMap; |
hannesw@179 | 29 | import java.util.Iterator; |
jlaskey@3 | 30 | import java.util.LinkedList; |
jlaskey@3 | 31 | import java.util.List; |
jlaskey@3 | 32 | import java.util.Map; |
jlaskey@3 | 33 | import java.util.regex.PatternSyntaxException; |
hannesw@114 | 34 | |
hannesw@127 | 35 | import jdk.nashorn.internal.parser.Lexer; |
sundar@82 | 36 | import jdk.nashorn.internal.parser.Scanner; |
hannesw@114 | 37 | import jdk.nashorn.internal.runtime.BitVector; |
jlaskey@3 | 38 | |
jlaskey@3 | 39 | /** |
jlaskey@3 | 40 | * Scan a JavaScript regexp, converting to Java regex if necessary. |
jlaskey@3 | 41 | * |
jlaskey@3 | 42 | */ |
sundar@82 | 43 | final class RegExpScanner extends Scanner { |
jlaskey@3 | 44 | |
jlaskey@3 | 45 | /** |
hannesw@127 | 46 | * String builder used to rewrite the pattern for the currently used regexp factory. |
jlaskey@3 | 47 | */ |
jlaskey@3 | 48 | private final StringBuilder sb; |
jlaskey@3 | 49 | |
jlaskey@3 | 50 | /** Expected token table */ |
jlaskey@3 | 51 | private final Map<Character, Integer> expected = new HashMap<>(); |
jlaskey@3 | 52 | |
jlaskey@3 | 53 | /** Capturing parenthesis that have been found so far. */ |
jlaskey@3 | 54 | private final List<Capture> caps = new LinkedList<>(); |
jlaskey@3 | 55 | |
jlaskey@3 | 56 | /** Forward references to capturing parenthesis to be resolved later.*/ |
hannesw@179 | 57 | private final LinkedList<Integer> forwardReferences = new LinkedList<>(); |
jlaskey@3 | 58 | |
jlaskey@3 | 59 | /** Current level of zero-width negative lookahead assertions. */ |
jlaskey@3 | 60 | private int negativeLookaheadLevel; |
jlaskey@3 | 61 | |
hannesw@127 | 62 | /** Are we currently inside a character class? */ |
hannesw@127 | 63 | private boolean inCharClass = false; |
hannesw@127 | 64 | |
hannesw@130 | 65 | /** Are we currently inside a negated character class? */ |
hannesw@130 | 66 | private boolean inNegativeClass = false; |
hannesw@130 | 67 | |
jlaskey@3 | 68 | private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?"; |
jlaskey@3 | 69 | |
jlaskey@3 | 70 | private static class Capture { |
jlaskey@3 | 71 | /** |
jlaskey@3 | 72 | * Zero-width negative lookaheads enclosing the capture. |
jlaskey@3 | 73 | */ |
jlaskey@3 | 74 | private final int negativeLookaheadLevel; |
jlaskey@3 | 75 | |
jlaskey@3 | 76 | Capture(final int negativeLookaheadLevel) { |
jlaskey@3 | 77 | this.negativeLookaheadLevel = negativeLookaheadLevel; |
jlaskey@3 | 78 | } |
jlaskey@3 | 79 | |
jlaskey@3 | 80 | public int getNegativeLookaheadLevel() { |
jlaskey@3 | 81 | return negativeLookaheadLevel; |
jlaskey@3 | 82 | } |
jlaskey@3 | 83 | |
jlaskey@3 | 84 | } |
jlaskey@3 | 85 | |
jlaskey@3 | 86 | /** |
jlaskey@3 | 87 | * Constructor |
jlaskey@3 | 88 | * @param string the JavaScript regexp to parse |
jlaskey@3 | 89 | */ |
jlaskey@3 | 90 | private RegExpScanner(final String string) { |
jlaskey@3 | 91 | super(string); |
jlaskey@3 | 92 | sb = new StringBuilder(limit); |
jlaskey@3 | 93 | reset(0); |
jlaskey@3 | 94 | expected.put(']', 0); |
jlaskey@3 | 95 | expected.put('}', 0); |
jlaskey@3 | 96 | } |
jlaskey@3 | 97 | |
jlaskey@3 | 98 | private void processForwardReferences() { |
jlaskey@3 | 99 | |
hannesw@179 | 100 | Iterator<Integer> iterator = forwardReferences.descendingIterator(); |
hannesw@179 | 101 | while (iterator.hasNext()) { |
hannesw@179 | 102 | final int pos = iterator.next(); |
hannesw@179 | 103 | final int num = iterator.next(); |
hannesw@179 | 104 | if (num > caps.size()) { |
hannesw@187 | 105 | // Non-existing backreference. If the number begins with a valid octal convert it to |
hannesw@187 | 106 | // Unicode escape and append the rest to a literal character sequence. |
hannesw@187 | 107 | final StringBuilder buffer = new StringBuilder(); |
hannesw@187 | 108 | octalOrLiteral(Integer.toString(num), buffer); |
hannesw@187 | 109 | sb.insert(pos, buffer); |
jlaskey@3 | 110 | } |
jlaskey@3 | 111 | } |
jlaskey@3 | 112 | |
jlaskey@3 | 113 | forwardReferences.clear(); |
jlaskey@3 | 114 | } |
jlaskey@3 | 115 | |
jlaskey@3 | 116 | /** |
jlaskey@3 | 117 | * Scan a JavaScript regexp string returning a Java safe regex string. |
jlaskey@3 | 118 | * |
jlaskey@3 | 119 | * @param string |
jlaskey@3 | 120 | * JavaScript regexp string. |
jlaskey@3 | 121 | * @return Java safe regex string. |
jlaskey@3 | 122 | */ |
jlaskey@3 | 123 | public static RegExpScanner scan(final String string) { |
jlaskey@3 | 124 | final RegExpScanner scanner = new RegExpScanner(string); |
jlaskey@3 | 125 | |
jlaskey@3 | 126 | try { |
hannesw@127 | 127 | scanner.disjunction(); |
jlaskey@3 | 128 | } catch (final Exception e) { |
hannesw@127 | 129 | throw new PatternSyntaxException(e.getMessage(), string, scanner.position); |
jlaskey@3 | 130 | } |
jlaskey@3 | 131 | |
jlaskey@3 | 132 | scanner.processForwardReferences(); |
jlaskey@3 | 133 | |
hannesw@127 | 134 | // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors |
hannesw@127 | 135 | if (scanner.position != string.length()) { |
hannesw@127 | 136 | final String p = scanner.getStringBuilder().toString(); |
jlaskey@3 | 137 | throw new PatternSyntaxException(string, p, p.length() + 1); |
jlaskey@3 | 138 | } |
jlaskey@3 | 139 | |
jlaskey@3 | 140 | return scanner; |
jlaskey@3 | 141 | } |
jlaskey@3 | 142 | |
jlaskey@3 | 143 | final StringBuilder getStringBuilder() { |
jlaskey@3 | 144 | return sb; |
jlaskey@3 | 145 | } |
jlaskey@3 | 146 | |
jlaskey@3 | 147 | String getJavaPattern() { |
hannesw@127 | 148 | return sb.toString(); |
jlaskey@3 | 149 | } |
jlaskey@3 | 150 | |
jlaskey@3 | 151 | BitVector getGroupsInNegativeLookahead() { |
jlaskey@3 | 152 | BitVector vec = null; |
jlaskey@3 | 153 | for (int i = 0; i < caps.size(); i++) { |
jlaskey@3 | 154 | final Capture cap = caps.get(i); |
jlaskey@3 | 155 | if (cap.getNegativeLookaheadLevel() > 0) { |
jlaskey@3 | 156 | if (vec == null) { |
jlaskey@3 | 157 | vec = new BitVector(caps.size() + 1); |
jlaskey@3 | 158 | } |
jlaskey@3 | 159 | vec.set(i + 1); |
jlaskey@3 | 160 | } |
jlaskey@3 | 161 | } |
jlaskey@3 | 162 | return vec; |
jlaskey@3 | 163 | } |
jlaskey@3 | 164 | |
jlaskey@3 | 165 | /** |
jlaskey@3 | 166 | * Commit n characters to the builder and to a given token |
jlaskey@3 | 167 | * @param n Number of characters. |
jlaskey@3 | 168 | * @return Committed token |
jlaskey@3 | 169 | */ |
hannesw@127 | 170 | private boolean commit(final int n) { |
jlaskey@3 | 171 | switch (n) { |
jlaskey@3 | 172 | case 1: |
jlaskey@3 | 173 | sb.append(ch0); |
jlaskey@3 | 174 | skip(1); |
jlaskey@3 | 175 | break; |
jlaskey@3 | 176 | case 2: |
jlaskey@3 | 177 | sb.append(ch0); |
jlaskey@3 | 178 | sb.append(ch1); |
jlaskey@3 | 179 | skip(2); |
jlaskey@3 | 180 | break; |
jlaskey@3 | 181 | case 3: |
jlaskey@3 | 182 | sb.append(ch0); |
jlaskey@3 | 183 | sb.append(ch1); |
jlaskey@3 | 184 | sb.append(ch2); |
jlaskey@3 | 185 | skip(3); |
jlaskey@3 | 186 | break; |
jlaskey@3 | 187 | default: |
jlaskey@3 | 188 | assert false : "Should not reach here"; |
jlaskey@3 | 189 | } |
jlaskey@3 | 190 | |
hannesw@127 | 191 | return true; |
jlaskey@3 | 192 | } |
jlaskey@3 | 193 | |
jlaskey@3 | 194 | /** |
jlaskey@3 | 195 | * Restart the buffers back at an earlier position. |
jlaskey@3 | 196 | * |
jlaskey@3 | 197 | * @param startIn |
jlaskey@3 | 198 | * Position in the input stream. |
jlaskey@3 | 199 | * @param startOut |
jlaskey@3 | 200 | * Position in the output stream. |
jlaskey@3 | 201 | */ |
jlaskey@3 | 202 | private void restart(final int startIn, final int startOut) { |
jlaskey@3 | 203 | reset(startIn); |
jlaskey@3 | 204 | sb.setLength(startOut); |
jlaskey@3 | 205 | } |
jlaskey@3 | 206 | |
jlaskey@3 | 207 | private void push(final char ch) { |
jlaskey@3 | 208 | expected.put(ch, expected.get(ch) + 1); |
jlaskey@3 | 209 | } |
jlaskey@3 | 210 | |
jlaskey@3 | 211 | private void pop(final char ch) { |
jlaskey@3 | 212 | expected.put(ch, Math.min(0, expected.get(ch) - 1)); |
jlaskey@3 | 213 | } |
jlaskey@3 | 214 | |
jlaskey@3 | 215 | /* |
jlaskey@3 | 216 | * Recursive descent tokenizer starts below. |
jlaskey@3 | 217 | */ |
jlaskey@3 | 218 | |
jlaskey@3 | 219 | /* |
jlaskey@3 | 220 | * Disjunction :: |
jlaskey@3 | 221 | * Alternative |
jlaskey@3 | 222 | * Alternative | Disjunction |
jlaskey@3 | 223 | */ |
hannesw@127 | 224 | private void disjunction() { |
jlaskey@3 | 225 | while (true) { |
hannesw@127 | 226 | alternative(); |
jlaskey@3 | 227 | |
jlaskey@3 | 228 | if (ch0 == '|') { |
hannesw@127 | 229 | commit(1); |
jlaskey@3 | 230 | } else { |
jlaskey@3 | 231 | break; |
jlaskey@3 | 232 | } |
jlaskey@3 | 233 | } |
jlaskey@3 | 234 | } |
jlaskey@3 | 235 | |
jlaskey@3 | 236 | /* |
jlaskey@3 | 237 | * Alternative :: |
jlaskey@3 | 238 | * [empty] |
jlaskey@3 | 239 | * Alternative Term |
jlaskey@3 | 240 | */ |
hannesw@127 | 241 | private void alternative() { |
hannesw@127 | 242 | while (term()) { |
hannesw@127 | 243 | // do nothing |
jlaskey@3 | 244 | } |
jlaskey@3 | 245 | } |
jlaskey@3 | 246 | |
jlaskey@3 | 247 | /* |
jlaskey@3 | 248 | * Term :: |
jlaskey@3 | 249 | * Assertion |
jlaskey@3 | 250 | * Atom |
jlaskey@3 | 251 | * Atom Quantifier |
jlaskey@3 | 252 | */ |
hannesw@127 | 253 | private boolean term() { |
jlaskey@3 | 254 | final int startIn = position; |
jlaskey@3 | 255 | final int startOut = sb.length(); |
jlaskey@3 | 256 | |
hannesw@127 | 257 | if (assertion()) { |
hannesw@127 | 258 | return true; |
jlaskey@3 | 259 | } |
jlaskey@3 | 260 | |
hannesw@127 | 261 | if (atom()) { |
hannesw@188 | 262 | // Check for character classes that never or always match |
hannesw@127 | 263 | if (sb.toString().endsWith("[]")) { |
hannesw@188 | 264 | sb.setLength(sb.length() - 1); |
hannesw@188 | 265 | sb.append("^\\s\\S]"); |
hannesw@127 | 266 | } else if (sb.toString().endsWith("[^]")) { |
hannesw@127 | 267 | sb.setLength(sb.length() - 2); |
hannesw@127 | 268 | sb.append("\\s\\S]"); |
jlaskey@3 | 269 | } |
jlaskey@3 | 270 | |
hannesw@188 | 271 | quantifier(); |
hannesw@127 | 272 | return true; |
jlaskey@3 | 273 | } |
jlaskey@3 | 274 | |
jlaskey@3 | 275 | restart(startIn, startOut); |
hannesw@127 | 276 | return false; |
jlaskey@3 | 277 | } |
jlaskey@3 | 278 | |
jlaskey@3 | 279 | /* |
jlaskey@3 | 280 | * Assertion :: |
jlaskey@3 | 281 | * ^ |
jlaskey@3 | 282 | * $ |
jlaskey@3 | 283 | * \b |
jlaskey@3 | 284 | * \B |
jlaskey@3 | 285 | * ( ? = Disjunction ) |
jlaskey@3 | 286 | * ( ? ! Disjunction ) |
jlaskey@3 | 287 | */ |
hannesw@127 | 288 | private boolean assertion() { |
jlaskey@3 | 289 | final int startIn = position; |
jlaskey@3 | 290 | final int startOut = sb.length(); |
jlaskey@3 | 291 | |
jlaskey@3 | 292 | switch (ch0) { |
jlaskey@3 | 293 | case '^': |
jlaskey@3 | 294 | case '$': |
hannesw@127 | 295 | return commit(1); |
jlaskey@3 | 296 | |
jlaskey@3 | 297 | case '\\': |
jlaskey@3 | 298 | if (ch1 == 'b' || ch1 == 'B') { |
hannesw@127 | 299 | return commit(2); |
jlaskey@3 | 300 | } |
jlaskey@3 | 301 | break; |
jlaskey@3 | 302 | |
jlaskey@3 | 303 | case '(': |
jlaskey@3 | 304 | if (ch1 != '?') { |
jlaskey@3 | 305 | break; |
jlaskey@3 | 306 | } |
jlaskey@3 | 307 | if (ch2 != '=' && ch2 != '!') { |
jlaskey@3 | 308 | break; |
jlaskey@3 | 309 | } |
jlaskey@3 | 310 | final boolean isNegativeLookahead = (ch2 == '!'); |
hannesw@127 | 311 | commit(3); |
jlaskey@3 | 312 | |
jlaskey@3 | 313 | if (isNegativeLookahead) { |
jlaskey@3 | 314 | negativeLookaheadLevel++; |
jlaskey@3 | 315 | } |
hannesw@127 | 316 | disjunction(); |
jlaskey@3 | 317 | if (isNegativeLookahead) { |
jlaskey@3 | 318 | negativeLookaheadLevel--; |
jlaskey@3 | 319 | } |
jlaskey@3 | 320 | |
hannesw@127 | 321 | if (ch0 == ')') { |
hannesw@127 | 322 | return commit(1); |
jlaskey@3 | 323 | } |
jlaskey@3 | 324 | break; |
jlaskey@3 | 325 | |
jlaskey@3 | 326 | default: |
jlaskey@3 | 327 | break; |
jlaskey@3 | 328 | } |
jlaskey@3 | 329 | |
jlaskey@3 | 330 | restart(startIn, startOut); |
hannesw@127 | 331 | return false; |
jlaskey@3 | 332 | } |
jlaskey@3 | 333 | |
jlaskey@3 | 334 | /* |
jlaskey@3 | 335 | * Quantifier :: |
jlaskey@3 | 336 | * QuantifierPrefix |
jlaskey@3 | 337 | * QuantifierPrefix ? |
jlaskey@3 | 338 | */ |
hannesw@127 | 339 | private boolean quantifier() { |
hannesw@127 | 340 | if (quantifierPrefix()) { |
jlaskey@3 | 341 | if (ch0 == '?') { |
hannesw@127 | 342 | commit(1); |
jlaskey@3 | 343 | } |
hannesw@127 | 344 | return true; |
jlaskey@3 | 345 | } |
hannesw@127 | 346 | return false; |
jlaskey@3 | 347 | } |
jlaskey@3 | 348 | |
jlaskey@3 | 349 | /* |
jlaskey@3 | 350 | * QuantifierPrefix :: |
jlaskey@3 | 351 | * * |
jlaskey@3 | 352 | * + |
jlaskey@3 | 353 | * ? |
jlaskey@3 | 354 | * { DecimalDigits } |
jlaskey@3 | 355 | * { DecimalDigits , } |
jlaskey@3 | 356 | * { DecimalDigits , DecimalDigits } |
jlaskey@3 | 357 | */ |
hannesw@127 | 358 | private boolean quantifierPrefix() { |
jlaskey@3 | 359 | final int startIn = position; |
jlaskey@3 | 360 | final int startOut = sb.length(); |
jlaskey@3 | 361 | |
jlaskey@3 | 362 | switch (ch0) { |
jlaskey@3 | 363 | case '*': |
jlaskey@3 | 364 | case '+': |
jlaskey@3 | 365 | case '?': |
hannesw@127 | 366 | return commit(1); |
jlaskey@3 | 367 | |
jlaskey@3 | 368 | case '{': |
hannesw@127 | 369 | commit(1); |
jlaskey@3 | 370 | |
hannesw@127 | 371 | if (!decimalDigits()) { |
jlaskey@3 | 372 | break; // not a quantifier - back out |
jlaskey@3 | 373 | } |
jlaskey@3 | 374 | push('}'); |
jlaskey@3 | 375 | |
jlaskey@3 | 376 | if (ch0 == ',') { |
hannesw@127 | 377 | commit(1); |
hannesw@127 | 378 | decimalDigits(); |
jlaskey@3 | 379 | } |
jlaskey@3 | 380 | |
jlaskey@3 | 381 | if (ch0 == '}') { |
jlaskey@3 | 382 | pop('}'); |
hannesw@127 | 383 | commit(1); |
hannesw@179 | 384 | } else { |
hannesw@179 | 385 | // Bad quantifier should be rejected but is accepted by all major engines |
hannesw@179 | 386 | restart(startIn, startOut); |
hannesw@179 | 387 | return false; |
jlaskey@3 | 388 | } |
jlaskey@3 | 389 | |
hannesw@127 | 390 | return true; |
jlaskey@3 | 391 | |
jlaskey@3 | 392 | default: |
jlaskey@3 | 393 | break; |
jlaskey@3 | 394 | } |
jlaskey@3 | 395 | |
jlaskey@3 | 396 | restart(startIn, startOut); |
hannesw@127 | 397 | return false; |
jlaskey@3 | 398 | } |
jlaskey@3 | 399 | |
jlaskey@3 | 400 | /* |
jlaskey@3 | 401 | * Atom :: |
jlaskey@3 | 402 | * PatternCharacter |
jlaskey@3 | 403 | * . |
jlaskey@3 | 404 | * \ AtomEscape |
jlaskey@3 | 405 | * CharacterClass |
jlaskey@3 | 406 | * ( Disjunction ) |
jlaskey@3 | 407 | * ( ? : Disjunction ) |
jlaskey@3 | 408 | * |
jlaskey@3 | 409 | */ |
hannesw@127 | 410 | private boolean atom() { |
jlaskey@3 | 411 | final int startIn = position; |
jlaskey@3 | 412 | final int startOut = sb.length(); |
jlaskey@3 | 413 | |
hannesw@127 | 414 | if (patternCharacter()) { |
hannesw@127 | 415 | return true; |
jlaskey@3 | 416 | } |
jlaskey@3 | 417 | |
jlaskey@3 | 418 | if (ch0 == '.') { |
hannesw@127 | 419 | return commit(1); |
jlaskey@3 | 420 | } |
jlaskey@3 | 421 | |
jlaskey@3 | 422 | if (ch0 == '\\') { |
hannesw@127 | 423 | commit(1); |
jlaskey@3 | 424 | |
hannesw@127 | 425 | if (atomEscape()) { |
hannesw@127 | 426 | return true; |
jlaskey@3 | 427 | } |
jlaskey@3 | 428 | } |
jlaskey@3 | 429 | |
hannesw@127 | 430 | if (characterClass()) { |
hannesw@127 | 431 | return true; |
jlaskey@3 | 432 | } |
jlaskey@3 | 433 | |
jlaskey@3 | 434 | if (ch0 == '(') { |
jlaskey@3 | 435 | boolean capturingParens = true; |
hannesw@127 | 436 | commit(1); |
jlaskey@3 | 437 | if (ch0 == '?' && ch1 == ':') { |
jlaskey@3 | 438 | capturingParens = false; |
hannesw@127 | 439 | commit(2); |
jlaskey@3 | 440 | } |
jlaskey@3 | 441 | |
hannesw@127 | 442 | disjunction(); |
hannesw@127 | 443 | |
hannesw@127 | 444 | if (ch0 == ')') { |
hannesw@127 | 445 | commit(1); |
hannesw@127 | 446 | if (capturingParens) { |
hannesw@127 | 447 | caps.add(new Capture(negativeLookaheadLevel)); |
jlaskey@3 | 448 | } |
hannesw@127 | 449 | return true; |
jlaskey@3 | 450 | } |
jlaskey@3 | 451 | } |
jlaskey@3 | 452 | |
jlaskey@3 | 453 | restart(startIn, startOut); |
hannesw@127 | 454 | return false; |
jlaskey@3 | 455 | } |
jlaskey@3 | 456 | |
jlaskey@3 | 457 | /* |
jlaskey@3 | 458 | * PatternCharacter :: |
jlaskey@3 | 459 | * SourceCharacter but not any of: ^$\.*+?()[]{}| |
jlaskey@3 | 460 | */ |
jlaskey@3 | 461 | @SuppressWarnings("fallthrough") |
hannesw@127 | 462 | private boolean patternCharacter() { |
jlaskey@3 | 463 | if (atEOF()) { |
hannesw@127 | 464 | return false; |
jlaskey@3 | 465 | } |
jlaskey@3 | 466 | |
jlaskey@3 | 467 | switch (ch0) { |
jlaskey@3 | 468 | case '^': |
jlaskey@3 | 469 | case '$': |
jlaskey@3 | 470 | case '\\': |
jlaskey@3 | 471 | case '.': |
jlaskey@3 | 472 | case '*': |
jlaskey@3 | 473 | case '+': |
jlaskey@3 | 474 | case '?': |
jlaskey@3 | 475 | case '(': |
jlaskey@3 | 476 | case ')': |
jlaskey@3 | 477 | case '[': |
jlaskey@3 | 478 | case '|': |
hannesw@127 | 479 | return false; |
jlaskey@3 | 480 | |
jlaskey@3 | 481 | case '}': |
jlaskey@3 | 482 | case ']': |
jlaskey@3 | 483 | final int n = expected.get(ch0); |
jlaskey@3 | 484 | if (n != 0) { |
hannesw@127 | 485 | return false; |
jlaskey@3 | 486 | } |
jlaskey@3 | 487 | |
jlaskey@3 | 488 | case '{': |
jlaskey@3 | 489 | // if not a valid quantifier escape curly brace to match itself |
jlaskey@3 | 490 | // this ensures compatibility with other JS implementations |
hannesw@127 | 491 | if (!quantifierPrefix()) { |
hannesw@127 | 492 | sb.append('\\'); |
hannesw@127 | 493 | return commit(1); |
hannesw@127 | 494 | } |
hannesw@127 | 495 | return false; |
jlaskey@3 | 496 | |
jlaskey@3 | 497 | default: |
hannesw@127 | 498 | return commit(1); // SOURCECHARACTER |
jlaskey@3 | 499 | } |
jlaskey@3 | 500 | } |
jlaskey@3 | 501 | |
jlaskey@3 | 502 | /* |
jlaskey@3 | 503 | * AtomEscape :: |
jlaskey@3 | 504 | * DecimalEscape |
jlaskey@3 | 505 | * CharacterEscape |
jlaskey@3 | 506 | * CharacterClassEscape |
jlaskey@3 | 507 | */ |
hannesw@127 | 508 | private boolean atomEscape() { |
hannesw@127 | 509 | // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all |
hannesw@127 | 510 | return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape(); |
jlaskey@3 | 511 | } |
jlaskey@3 | 512 | |
jlaskey@3 | 513 | /* |
jlaskey@3 | 514 | * CharacterEscape :: |
jlaskey@3 | 515 | * ControlEscape |
jlaskey@3 | 516 | * c ControlLetter |
jlaskey@3 | 517 | * HexEscapeSequence |
jlaskey@3 | 518 | * UnicodeEscapeSequence |
jlaskey@3 | 519 | * IdentityEscape |
jlaskey@3 | 520 | */ |
hannesw@127 | 521 | private boolean characterEscape() { |
jlaskey@3 | 522 | final int startIn = position; |
jlaskey@3 | 523 | final int startOut = sb.length(); |
jlaskey@3 | 524 | |
hannesw@127 | 525 | if (controlEscape()) { |
hannesw@127 | 526 | return true; |
jlaskey@3 | 527 | } |
jlaskey@3 | 528 | |
jlaskey@3 | 529 | if (ch0 == 'c') { |
hannesw@127 | 530 | commit(1); |
hannesw@127 | 531 | if (controlLetter()) { |
hannesw@127 | 532 | return true; |
jlaskey@3 | 533 | } |
jlaskey@3 | 534 | restart(startIn, startOut); |
jlaskey@3 | 535 | } |
jlaskey@3 | 536 | |
hannesw@127 | 537 | if (hexEscapeSequence() || unicodeEscapeSequence()) { |
hannesw@127 | 538 | return true; |
jlaskey@3 | 539 | } |
jlaskey@3 | 540 | |
jlaskey@3 | 541 | restart(startIn, startOut); |
hannesw@127 | 542 | return false; |
jlaskey@3 | 543 | } |
jlaskey@3 | 544 | |
hannesw@127 | 545 | private boolean scanEscapeSequence(final char leader, final int length) { |
jlaskey@3 | 546 | final int startIn = position; |
jlaskey@3 | 547 | final int startOut = sb.length(); |
jlaskey@3 | 548 | |
jlaskey@3 | 549 | if (ch0 != leader) { |
jlaskey@3 | 550 | return false; |
jlaskey@3 | 551 | } |
jlaskey@3 | 552 | |
hannesw@127 | 553 | commit(1); |
jlaskey@3 | 554 | for (int i = 0; i < length; i++) { |
jlaskey@3 | 555 | final char ch0l = Character.toLowerCase(ch0); |
jlaskey@3 | 556 | if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { |
hannesw@127 | 557 | commit(1); |
jlaskey@3 | 558 | } else { |
jlaskey@3 | 559 | restart(startIn, startOut); |
jlaskey@3 | 560 | return false; |
jlaskey@3 | 561 | } |
jlaskey@3 | 562 | } |
jlaskey@3 | 563 | |
jlaskey@3 | 564 | return true; |
jlaskey@3 | 565 | } |
jlaskey@3 | 566 | |
hannesw@127 | 567 | private boolean hexEscapeSequence() { |
hannesw@127 | 568 | return scanEscapeSequence('x', 2); |
jlaskey@3 | 569 | } |
jlaskey@3 | 570 | |
hannesw@127 | 571 | private boolean unicodeEscapeSequence() { |
hannesw@127 | 572 | return scanEscapeSequence('u', 4); |
jlaskey@3 | 573 | } |
jlaskey@3 | 574 | |
jlaskey@3 | 575 | /* |
jlaskey@3 | 576 | * ControlEscape :: |
jlaskey@3 | 577 | * one of fnrtv |
jlaskey@3 | 578 | */ |
hannesw@127 | 579 | private boolean controlEscape() { |
jlaskey@3 | 580 | switch (ch0) { |
jlaskey@3 | 581 | case 'f': |
jlaskey@3 | 582 | case 'n': |
jlaskey@3 | 583 | case 'r': |
jlaskey@3 | 584 | case 't': |
jlaskey@3 | 585 | case 'v': |
hannesw@127 | 586 | return commit(1); |
jlaskey@3 | 587 | |
jlaskey@3 | 588 | default: |
hannesw@127 | 589 | return false; |
jlaskey@3 | 590 | } |
jlaskey@3 | 591 | } |
jlaskey@3 | 592 | |
jlaskey@3 | 593 | /* |
jlaskey@3 | 594 | * ControlLetter :: |
jlaskey@3 | 595 | * one of abcdefghijklmnopqrstuvwxyz |
jlaskey@3 | 596 | * ABCDEFGHIJKLMNOPQRSTUVWXYZ |
jlaskey@3 | 597 | */ |
hannesw@127 | 598 | private boolean controlLetter() { |
hannesw@189 | 599 | // To match other engines we also accept '0'..'9' and '_' as control letters inside a character class. |
hannesw@189 | 600 | if ((ch0 >= 'A' && ch0 <= 'Z') || (ch0 >= 'a' && ch0 <= 'z') |
hannesw@189 | 601 | || (inCharClass && (isDecimalDigit(ch0) || ch0 == '_'))) { |
hannesw@127 | 602 | // for some reason java regexps don't like control characters on the |
hannesw@127 | 603 | // form "\\ca".match([string with ascii 1 at char0]). Translating |
hannesw@127 | 604 | // them to unicode does it though. |
hannesw@127 | 605 | sb.setLength(sb.length() - 1); |
hannesw@189 | 606 | unicode(ch0 % 32, sb); |
hannesw@127 | 607 | skip(1); |
hannesw@127 | 608 | return true; |
jlaskey@3 | 609 | } |
hannesw@127 | 610 | return false; |
jlaskey@3 | 611 | } |
jlaskey@3 | 612 | |
jlaskey@3 | 613 | /* |
jlaskey@3 | 614 | * IdentityEscape :: |
jlaskey@3 | 615 | * SourceCharacter but not IdentifierPart |
jlaskey@3 | 616 | * <ZWJ> (200c) |
jlaskey@3 | 617 | * <ZWNJ> (200d) |
jlaskey@3 | 618 | */ |
hannesw@127 | 619 | private boolean identityEscape() { |
hannesw@127 | 620 | if (atEOF()) { |
hannesw@127 | 621 | throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException |
hannesw@127 | 622 | } |
hannesw@127 | 623 | // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here. |
hannesw@179 | 624 | if (ch0 == 'c') { |
hannesw@189 | 625 | sb.append('\\'); // Treat invalid \c control sequence as \\c |
hannesw@179 | 626 | } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) { |
hannesw@127 | 627 | sb.setLength(sb.length() - 1); |
hannesw@127 | 628 | } |
hannesw@127 | 629 | return commit(1); |
jlaskey@3 | 630 | } |
jlaskey@3 | 631 | |
jlaskey@3 | 632 | /* |
jlaskey@3 | 633 | * DecimalEscape :: |
jlaskey@3 | 634 | * DecimalIntegerLiteral [lookahead DecimalDigit] |
jlaskey@3 | 635 | */ |
hannesw@127 | 636 | private boolean decimalEscape() { |
jlaskey@3 | 637 | final int startIn = position; |
jlaskey@3 | 638 | final int startOut = sb.length(); |
jlaskey@3 | 639 | |
hannesw@187 | 640 | if (ch0 == '0' && !isOctalDigit(ch1)) { |
hannesw@127 | 641 | skip(1); |
jlaskey@3 | 642 | // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000); |
hannesw@127 | 643 | sb.append("\u0000"); |
hannesw@127 | 644 | return true; |
jlaskey@3 | 645 | } |
jlaskey@3 | 646 | |
jlaskey@3 | 647 | if (isDecimalDigit(ch0)) { |
hannesw@127 | 648 | |
hannesw@187 | 649 | if (ch0 == '0') { |
hannesw@187 | 650 | // We know this is an octal escape. |
hannesw@187 | 651 | if (inCharClass) { |
hannesw@187 | 652 | // Convert octal escape to unicode escape if inside character class. |
hannesw@187 | 653 | int octalValue = 0; |
hannesw@187 | 654 | while (isOctalDigit(ch0)) { |
hannesw@187 | 655 | octalValue = octalValue * 8 + ch0 - '0'; |
hannesw@187 | 656 | skip(1); |
hannesw@127 | 657 | } |
hannesw@187 | 658 | |
hannesw@187 | 659 | unicode(octalValue, sb); |
hannesw@187 | 660 | |
hannesw@187 | 661 | } else { |
hannesw@187 | 662 | // Copy decimal escape as-is |
hannesw@187 | 663 | decimalDigits(); |
hannesw@127 | 664 | } |
hannesw@187 | 665 | } else { |
hannesw@187 | 666 | // This should be a backreference, but could also be an octal escape or even a literal string. |
hannesw@187 | 667 | int decimalValue = 0; |
hannesw@127 | 668 | while (isDecimalDigit(ch0)) { |
hannesw@187 | 669 | decimalValue = decimalValue * 10 + ch0 - '0'; |
hannesw@127 | 670 | skip(1); |
hannesw@127 | 671 | } |
hannesw@127 | 672 | |
hannesw@187 | 673 | if (inCharClass) { |
hannesw@187 | 674 | // No backreferences in character classes. Encode as unicode escape or literal char sequence |
hannesw@187 | 675 | sb.setLength(sb.length() - 1); |
hannesw@187 | 676 | octalOrLiteral(Integer.toString(decimalValue), sb); |
hannesw@187 | 677 | |
hannesw@187 | 678 | } else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) { |
hannesw@187 | 679 | // Captures that live inside a negative lookahead are dead after the |
hannesw@187 | 680 | // lookahead and will be undefined if referenced from outside. |
hannesw@187 | 681 | if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) { |
hannesw@187 | 682 | sb.setLength(sb.length() - 1); |
hannesw@187 | 683 | } else { |
hannesw@187 | 684 | sb.append(decimalValue); |
hannesw@187 | 685 | } |
hannesw@187 | 686 | } else if (decimalValue > caps.size()) { |
hannesw@187 | 687 | // Forward reference to a capture group. Forward references are always undefined so we can omit |
hannesw@187 | 688 | // it from the output buffer. However, if the target capture does not exist, we need to rewrite |
hannesw@187 | 689 | // the reference as hex escape or literal string, so register the reference for later processing. |
hannesw@187 | 690 | sb.setLength(sb.length() - 1); |
hannesw@187 | 691 | forwardReferences.add(decimalValue); |
hannesw@187 | 692 | forwardReferences.add(sb.length()); |
hannesw@187 | 693 | } else { |
hannesw@187 | 694 | // Append as backreference |
hannesw@187 | 695 | sb.append(decimalValue); |
hannesw@127 | 696 | } |
hannesw@127 | 697 | |
hannesw@127 | 698 | } |
hannesw@127 | 699 | return true; |
jlaskey@3 | 700 | } |
jlaskey@3 | 701 | |
jlaskey@3 | 702 | restart(startIn, startOut); |
hannesw@127 | 703 | return false; |
jlaskey@3 | 704 | } |
jlaskey@3 | 705 | |
jlaskey@3 | 706 | /* |
jlaskey@3 | 707 | * CharacterClassEscape :: |
jlaskey@3 | 708 | * one of dDsSwW |
jlaskey@3 | 709 | */ |
hannesw@127 | 710 | private boolean characterClassEscape() { |
jlaskey@3 | 711 | switch (ch0) { |
hannesw@127 | 712 | // java.util.regex requires translation of \s and \S to explicit character list |
jlaskey@3 | 713 | case 's': |
hannesw@127 | 714 | if (RegExpFactory.usesJavaUtilRegex()) { |
hannesw@127 | 715 | sb.setLength(sb.length() - 1); |
hannesw@130 | 716 | // No nested class required if we already are inside a character class |
hannesw@130 | 717 | if (inCharClass) { |
hannesw@130 | 718 | sb.append(Lexer.getWhitespaceRegExp()); |
hannesw@130 | 719 | } else { |
hannesw@130 | 720 | sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']'); |
hannesw@130 | 721 | } |
hannesw@127 | 722 | skip(1); |
hannesw@127 | 723 | return true; |
hannesw@127 | 724 | } |
hannesw@127 | 725 | return commit(1); |
jlaskey@3 | 726 | case 'S': |
hannesw@127 | 727 | if (RegExpFactory.usesJavaUtilRegex()) { |
hannesw@127 | 728 | sb.setLength(sb.length() - 1); |
hannesw@130 | 729 | // In negative class we must use intersection to get double negation ("not anything else than space") |
hannesw@130 | 730 | sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']'); |
hannesw@127 | 731 | skip(1); |
hannesw@127 | 732 | return true; |
hannesw@127 | 733 | } |
hannesw@127 | 734 | return commit(1); |
jlaskey@3 | 735 | case 'd': |
jlaskey@3 | 736 | case 'D': |
jlaskey@3 | 737 | case 'w': |
jlaskey@3 | 738 | case 'W': |
hannesw@127 | 739 | return commit(1); |
jlaskey@3 | 740 | |
jlaskey@3 | 741 | default: |
hannesw@127 | 742 | return false; |
jlaskey@3 | 743 | } |
jlaskey@3 | 744 | } |
jlaskey@3 | 745 | |
jlaskey@3 | 746 | /* |
jlaskey@3 | 747 | * CharacterClass :: |
jlaskey@3 | 748 | * [ [lookahead {^}] ClassRanges ] |
jlaskey@3 | 749 | * [ ^ ClassRanges ] |
jlaskey@3 | 750 | */ |
hannesw@127 | 751 | private boolean characterClass() { |
jlaskey@3 | 752 | final int startIn = position; |
jlaskey@3 | 753 | final int startOut = sb.length(); |
jlaskey@3 | 754 | |
jlaskey@3 | 755 | if (ch0 == '[') { |
hannesw@127 | 756 | try { |
hannesw@127 | 757 | inCharClass = true; |
hannesw@127 | 758 | push(']'); |
hannesw@127 | 759 | commit(1); |
jlaskey@3 | 760 | |
hannesw@127 | 761 | if (ch0 == '^') { |
hannesw@130 | 762 | inNegativeClass = true; |
hannesw@127 | 763 | commit(1); |
hannesw@127 | 764 | } |
jlaskey@3 | 765 | |
hannesw@127 | 766 | if (classRanges() && ch0 == ']') { |
hannesw@127 | 767 | pop(']'); |
hannesw@127 | 768 | return commit(1); |
hannesw@127 | 769 | } |
hannesw@127 | 770 | } finally { |
hannesw@127 | 771 | inCharClass = false; // no nested character classes in JavaScript |
hannesw@130 | 772 | inNegativeClass = false; |
jlaskey@3 | 773 | } |
jlaskey@3 | 774 | } |
jlaskey@3 | 775 | |
jlaskey@3 | 776 | restart(startIn, startOut); |
hannesw@127 | 777 | return false; |
jlaskey@3 | 778 | } |
jlaskey@3 | 779 | |
jlaskey@3 | 780 | /* |
jlaskey@3 | 781 | * ClassRanges :: |
jlaskey@3 | 782 | * [empty] |
jlaskey@3 | 783 | * NonemptyClassRanges |
jlaskey@3 | 784 | */ |
hannesw@127 | 785 | private boolean classRanges() { |
hannesw@127 | 786 | nonemptyClassRanges(); |
hannesw@127 | 787 | return true; |
jlaskey@3 | 788 | } |
jlaskey@3 | 789 | |
jlaskey@3 | 790 | /* |
jlaskey@3 | 791 | * NonemptyClassRanges :: |
jlaskey@3 | 792 | * ClassAtom |
jlaskey@3 | 793 | * ClassAtom NonemptyClassRangesNoDash |
jlaskey@3 | 794 | * ClassAtom - ClassAtom ClassRanges |
jlaskey@3 | 795 | */ |
hannesw@127 | 796 | private boolean nonemptyClassRanges() { |
jlaskey@3 | 797 | final int startIn = position; |
jlaskey@3 | 798 | final int startOut = sb.length(); |
jlaskey@3 | 799 | |
hannesw@127 | 800 | if (classAtom()) { |
jlaskey@3 | 801 | |
jlaskey@3 | 802 | if (ch0 == '-') { |
hannesw@127 | 803 | commit(1); |
jlaskey@3 | 804 | |
hannesw@127 | 805 | if (classAtom() && classRanges()) { |
hannesw@127 | 806 | return true; |
jlaskey@3 | 807 | } |
jlaskey@3 | 808 | } |
jlaskey@3 | 809 | |
hannesw@127 | 810 | nonemptyClassRangesNoDash(); |
jlaskey@3 | 811 | |
hannesw@127 | 812 | return true; |
jlaskey@3 | 813 | } |
jlaskey@3 | 814 | |
jlaskey@3 | 815 | restart(startIn, startOut); |
hannesw@127 | 816 | return false; |
jlaskey@3 | 817 | } |
jlaskey@3 | 818 | |
jlaskey@3 | 819 | /* |
jlaskey@3 | 820 | * NonemptyClassRangesNoDash :: |
jlaskey@3 | 821 | * ClassAtom |
jlaskey@3 | 822 | * ClassAtomNoDash NonemptyClassRangesNoDash |
jlaskey@3 | 823 | * ClassAtomNoDash - ClassAtom ClassRanges |
jlaskey@3 | 824 | */ |
hannesw@127 | 825 | private boolean nonemptyClassRangesNoDash() { |
jlaskey@3 | 826 | final int startIn = position; |
jlaskey@3 | 827 | final int startOut = sb.length(); |
jlaskey@3 | 828 | |
hannesw@127 | 829 | if (classAtomNoDash()) { |
jlaskey@3 | 830 | |
jlaskey@3 | 831 | // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom |
jlaskey@3 | 832 | if (ch0 == '-') { |
hannesw@127 | 833 | commit(1); |
jlaskey@3 | 834 | |
hannesw@127 | 835 | if (classAtom() && classRanges()) { |
hannesw@127 | 836 | return true; |
jlaskey@3 | 837 | } |
jlaskey@3 | 838 | //fallthru |
jlaskey@3 | 839 | } |
jlaskey@3 | 840 | |
hannesw@127 | 841 | nonemptyClassRangesNoDash(); |
hannesw@127 | 842 | return true; // still a class atom |
jlaskey@3 | 843 | } |
jlaskey@3 | 844 | |
hannesw@127 | 845 | if (classAtom()) { |
hannesw@127 | 846 | return true; |
jlaskey@3 | 847 | } |
jlaskey@3 | 848 | |
jlaskey@3 | 849 | restart(startIn, startOut); |
hannesw@127 | 850 | return false; |
jlaskey@3 | 851 | } |
jlaskey@3 | 852 | |
jlaskey@3 | 853 | /* |
jlaskey@3 | 854 | * ClassAtom : - ClassAtomNoDash |
jlaskey@3 | 855 | */ |
hannesw@127 | 856 | private boolean classAtom() { |
jlaskey@3 | 857 | |
jlaskey@3 | 858 | if (ch0 == '-') { |
hannesw@127 | 859 | return commit(1); |
jlaskey@3 | 860 | } |
jlaskey@3 | 861 | |
hannesw@127 | 862 | return classAtomNoDash(); |
jlaskey@3 | 863 | } |
jlaskey@3 | 864 | |
jlaskey@3 | 865 | /* |
jlaskey@3 | 866 | * ClassAtomNoDash :: |
jlaskey@3 | 867 | * SourceCharacter but not one of \ or ] or - |
jlaskey@3 | 868 | * \ ClassEscape |
jlaskey@3 | 869 | */ |
hannesw@127 | 870 | private boolean classAtomNoDash() { |
jlaskey@3 | 871 | final int startIn = position; |
jlaskey@3 | 872 | final int startOut = sb.length(); |
jlaskey@3 | 873 | |
jlaskey@3 | 874 | switch (ch0) { |
jlaskey@3 | 875 | case ']': |
jlaskey@3 | 876 | case '-': |
hannesw@127 | 877 | return false; |
jlaskey@3 | 878 | |
jlaskey@3 | 879 | case '[': |
jlaskey@3 | 880 | // unescaped left square bracket - add escape |
hannesw@127 | 881 | sb.append('\\'); |
hannesw@127 | 882 | return commit(1); |
jlaskey@3 | 883 | |
jlaskey@3 | 884 | case '\\': |
hannesw@127 | 885 | commit(1); |
hannesw@127 | 886 | if (classEscape()) { |
hannesw@127 | 887 | return true; |
jlaskey@3 | 888 | } |
jlaskey@3 | 889 | |
jlaskey@3 | 890 | restart(startIn, startOut); |
hannesw@127 | 891 | return false; |
jlaskey@3 | 892 | |
jlaskey@3 | 893 | default: |
hannesw@127 | 894 | return commit(1); |
jlaskey@3 | 895 | } |
jlaskey@3 | 896 | } |
jlaskey@3 | 897 | |
jlaskey@3 | 898 | /* |
jlaskey@3 | 899 | * ClassEscape :: |
jlaskey@3 | 900 | * DecimalEscape |
jlaskey@3 | 901 | * b |
jlaskey@3 | 902 | * CharacterEscape |
jlaskey@3 | 903 | * CharacterClassEscape |
jlaskey@3 | 904 | */ |
hannesw@127 | 905 | private boolean classEscape() { |
jlaskey@3 | 906 | |
hannesw@127 | 907 | if (decimalEscape()) { |
hannesw@127 | 908 | return true; |
jlaskey@3 | 909 | } |
jlaskey@3 | 910 | |
jlaskey@3 | 911 | if (ch0 == 'b') { |
hannesw@127 | 912 | sb.setLength(sb.length() - 1); |
hannesw@127 | 913 | sb.append('\b'); |
hannesw@127 | 914 | skip(1); |
hannesw@127 | 915 | return true; |
jlaskey@3 | 916 | } |
jlaskey@3 | 917 | |
hannesw@127 | 918 | // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all |
hannesw@127 | 919 | return characterEscape() || characterClassEscape() || identityEscape(); |
jlaskey@3 | 920 | } |
jlaskey@3 | 921 | |
jlaskey@3 | 922 | /* |
jlaskey@3 | 923 | * DecimalDigits |
jlaskey@3 | 924 | */ |
hannesw@127 | 925 | private boolean decimalDigits() { |
jlaskey@3 | 926 | if (!isDecimalDigit(ch0)) { |
hannesw@127 | 927 | return false; |
jlaskey@3 | 928 | } |
jlaskey@3 | 929 | |
jlaskey@3 | 930 | while (isDecimalDigit(ch0)) { |
hannesw@127 | 931 | commit(1); |
jlaskey@3 | 932 | } |
jlaskey@3 | 933 | |
hannesw@127 | 934 | return true; |
hannesw@127 | 935 | } |
hannesw@127 | 936 | |
hannesw@187 | 937 | private void unicode(final int value, final StringBuilder buffer) { |
hannesw@127 | 938 | final String hex = Integer.toHexString(value); |
hannesw@187 | 939 | buffer.append('u'); |
hannesw@127 | 940 | for (int i = 0; i < 4 - hex.length(); i++) { |
hannesw@187 | 941 | buffer.append('0'); |
hannesw@127 | 942 | } |
hannesw@187 | 943 | buffer.append(hex); |
hannesw@187 | 944 | } |
hannesw@187 | 945 | |
hannesw@187 | 946 | // Convert what would have been a backreference into a unicode escape, or a number literal, or both. |
hannesw@187 | 947 | private void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) { |
hannesw@187 | 948 | final int length = numberLiteral.length(); |
hannesw@187 | 949 | int octalValue = 0; |
hannesw@187 | 950 | int pos = 0; |
hannesw@187 | 951 | // Maximum value for octal escape is 0377 (255) so we stop the loop at 32 |
hannesw@187 | 952 | while (pos < length && octalValue < 0x20) { |
hannesw@187 | 953 | final char ch = numberLiteral.charAt(pos); |
hannesw@187 | 954 | if (isOctalDigit(ch)) { |
hannesw@187 | 955 | octalValue = octalValue * 8 + ch - '0'; |
hannesw@187 | 956 | } else { |
hannesw@187 | 957 | break; |
hannesw@187 | 958 | } |
hannesw@187 | 959 | pos++; |
hannesw@187 | 960 | } |
hannesw@187 | 961 | if (octalValue > 0) { |
hannesw@187 | 962 | buffer.append('\\'); |
hannesw@187 | 963 | unicode(octalValue, buffer); |
hannesw@187 | 964 | buffer.append(numberLiteral.substring(pos)); |
hannesw@187 | 965 | } else { |
hannesw@187 | 966 | buffer.append(numberLiteral); |
hannesw@187 | 967 | } |
hannesw@187 | 968 | } |
hannesw@187 | 969 | |
hannesw@187 | 970 | private static boolean isOctalDigit(final char ch) { |
hannesw@187 | 971 | return ch >= '0' && ch <= '7'; |
jlaskey@3 | 972 | } |
jlaskey@3 | 973 | |
jlaskey@3 | 974 | private static boolean isDecimalDigit(final char ch) { |
jlaskey@3 | 975 | return ch >= '0' && ch <= '9'; |
jlaskey@3 | 976 | } |
jlaskey@3 | 977 | } |