src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java

Thu, 11 Apr 2013 12:16:39 +0200

author
hannesw
date
Thu, 11 Apr 2013 12:16:39 +0200
changeset 192
a3fc89d33072
parent 189
8ae9ed1ac1e2
child 247
5a3f7867e19c
permissions
-rw-r--r--

8011980: Allow NUL character in character class
Reviewed-by: sundar, lagergren

jlaskey@3 1 /*
jlaskey@7 2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
jlaskey@3 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
jlaskey@3 4 *
jlaskey@3 5 * This code is free software; you can redistribute it and/or modify it
jlaskey@3 6 * under the terms of the GNU General Public License version 2 only, as
jlaskey@3 7 * published by the Free Software Foundation. Oracle designates this
jlaskey@3 8 * particular file as subject to the "Classpath" exception as provided
jlaskey@3 9 * by Oracle in the LICENSE file that accompanied this code.
jlaskey@3 10 *
jlaskey@3 11 * This code is distributed in the hope that it will be useful, but WITHOUT
jlaskey@3 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
jlaskey@3 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
jlaskey@3 14 * version 2 for more details (a copy is included in the LICENSE file that
jlaskey@3 15 * accompanied this code).
jlaskey@3 16 *
jlaskey@3 17 * You should have received a copy of the GNU General Public License version
jlaskey@3 18 * 2 along with this work; if not, write to the Free Software Foundation,
jlaskey@3 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
jlaskey@3 20 *
jlaskey@3 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
jlaskey@3 22 * or visit www.oracle.com if you need additional information or have any
jlaskey@3 23 * questions.
jlaskey@3 24 */
jlaskey@3 25
hannesw@114 26 package jdk.nashorn.internal.runtime.regexp;
jlaskey@3 27
jlaskey@3 28 import java.util.HashMap;
hannesw@179 29 import java.util.Iterator;
jlaskey@3 30 import java.util.LinkedList;
jlaskey@3 31 import java.util.List;
jlaskey@3 32 import java.util.Map;
jlaskey@3 33 import java.util.regex.PatternSyntaxException;
hannesw@114 34
hannesw@127 35 import jdk.nashorn.internal.parser.Lexer;
sundar@82 36 import jdk.nashorn.internal.parser.Scanner;
hannesw@114 37 import jdk.nashorn.internal.runtime.BitVector;
jlaskey@3 38
jlaskey@3 39 /**
jlaskey@3 40 * Scan a JavaScript regexp, converting to Java regex if necessary.
jlaskey@3 41 *
jlaskey@3 42 */
sundar@82 43 final class RegExpScanner extends Scanner {
jlaskey@3 44
jlaskey@3 45 /**
hannesw@127 46 * String builder used to rewrite the pattern for the currently used regexp factory.
jlaskey@3 47 */
jlaskey@3 48 private final StringBuilder sb;
jlaskey@3 49
jlaskey@3 50 /** Expected token table */
jlaskey@3 51 private final Map<Character, Integer> expected = new HashMap<>();
jlaskey@3 52
jlaskey@3 53 /** Capturing parenthesis that have been found so far. */
jlaskey@3 54 private final List<Capture> caps = new LinkedList<>();
jlaskey@3 55
jlaskey@3 56 /** Forward references to capturing parenthesis to be resolved later.*/
hannesw@179 57 private final LinkedList<Integer> forwardReferences = new LinkedList<>();
jlaskey@3 58
jlaskey@3 59 /** Current level of zero-width negative lookahead assertions. */
jlaskey@3 60 private int negativeLookaheadLevel;
jlaskey@3 61
hannesw@127 62 /** Are we currently inside a character class? */
hannesw@127 63 private boolean inCharClass = false;
hannesw@127 64
hannesw@130 65 /** Are we currently inside a negated character class? */
hannesw@130 66 private boolean inNegativeClass = false;
hannesw@130 67
jlaskey@3 68 private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
jlaskey@3 69
jlaskey@3 70 private static class Capture {
jlaskey@3 71 /**
jlaskey@3 72 * Zero-width negative lookaheads enclosing the capture.
jlaskey@3 73 */
jlaskey@3 74 private final int negativeLookaheadLevel;
jlaskey@3 75
jlaskey@3 76 Capture(final int negativeLookaheadLevel) {
jlaskey@3 77 this.negativeLookaheadLevel = negativeLookaheadLevel;
jlaskey@3 78 }
jlaskey@3 79
jlaskey@3 80 public int getNegativeLookaheadLevel() {
jlaskey@3 81 return negativeLookaheadLevel;
jlaskey@3 82 }
jlaskey@3 83
jlaskey@3 84 }
jlaskey@3 85
jlaskey@3 86 /**
jlaskey@3 87 * Constructor
jlaskey@3 88 * @param string the JavaScript regexp to parse
jlaskey@3 89 */
jlaskey@3 90 private RegExpScanner(final String string) {
jlaskey@3 91 super(string);
jlaskey@3 92 sb = new StringBuilder(limit);
jlaskey@3 93 reset(0);
jlaskey@3 94 expected.put(']', 0);
jlaskey@3 95 expected.put('}', 0);
jlaskey@3 96 }
jlaskey@3 97
jlaskey@3 98 private void processForwardReferences() {
jlaskey@3 99
hannesw@179 100 Iterator<Integer> iterator = forwardReferences.descendingIterator();
hannesw@179 101 while (iterator.hasNext()) {
hannesw@179 102 final int pos = iterator.next();
hannesw@179 103 final int num = iterator.next();
hannesw@179 104 if (num > caps.size()) {
hannesw@187 105 // Non-existing backreference. If the number begins with a valid octal convert it to
hannesw@187 106 // Unicode escape and append the rest to a literal character sequence.
hannesw@187 107 final StringBuilder buffer = new StringBuilder();
hannesw@187 108 octalOrLiteral(Integer.toString(num), buffer);
hannesw@187 109 sb.insert(pos, buffer);
jlaskey@3 110 }
jlaskey@3 111 }
jlaskey@3 112
jlaskey@3 113 forwardReferences.clear();
jlaskey@3 114 }
jlaskey@3 115
jlaskey@3 116 /**
jlaskey@3 117 * Scan a JavaScript regexp string returning a Java safe regex string.
jlaskey@3 118 *
jlaskey@3 119 * @param string
jlaskey@3 120 * JavaScript regexp string.
jlaskey@3 121 * @return Java safe regex string.
jlaskey@3 122 */
jlaskey@3 123 public static RegExpScanner scan(final String string) {
jlaskey@3 124 final RegExpScanner scanner = new RegExpScanner(string);
jlaskey@3 125
jlaskey@3 126 try {
hannesw@127 127 scanner.disjunction();
jlaskey@3 128 } catch (final Exception e) {
hannesw@127 129 throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
jlaskey@3 130 }
jlaskey@3 131
jlaskey@3 132 scanner.processForwardReferences();
jlaskey@3 133
hannesw@127 134 // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
hannesw@127 135 if (scanner.position != string.length()) {
hannesw@127 136 final String p = scanner.getStringBuilder().toString();
jlaskey@3 137 throw new PatternSyntaxException(string, p, p.length() + 1);
jlaskey@3 138 }
jlaskey@3 139
jlaskey@3 140 return scanner;
jlaskey@3 141 }
jlaskey@3 142
jlaskey@3 143 final StringBuilder getStringBuilder() {
jlaskey@3 144 return sb;
jlaskey@3 145 }
jlaskey@3 146
jlaskey@3 147 String getJavaPattern() {
hannesw@127 148 return sb.toString();
jlaskey@3 149 }
jlaskey@3 150
jlaskey@3 151 BitVector getGroupsInNegativeLookahead() {
jlaskey@3 152 BitVector vec = null;
jlaskey@3 153 for (int i = 0; i < caps.size(); i++) {
jlaskey@3 154 final Capture cap = caps.get(i);
jlaskey@3 155 if (cap.getNegativeLookaheadLevel() > 0) {
jlaskey@3 156 if (vec == null) {
jlaskey@3 157 vec = new BitVector(caps.size() + 1);
jlaskey@3 158 }
jlaskey@3 159 vec.set(i + 1);
jlaskey@3 160 }
jlaskey@3 161 }
jlaskey@3 162 return vec;
jlaskey@3 163 }
jlaskey@3 164
jlaskey@3 165 /**
jlaskey@3 166 * Commit n characters to the builder and to a given token
jlaskey@3 167 * @param n Number of characters.
jlaskey@3 168 * @return Committed token
jlaskey@3 169 */
hannesw@127 170 private boolean commit(final int n) {
jlaskey@3 171 switch (n) {
jlaskey@3 172 case 1:
jlaskey@3 173 sb.append(ch0);
jlaskey@3 174 skip(1);
jlaskey@3 175 break;
jlaskey@3 176 case 2:
jlaskey@3 177 sb.append(ch0);
jlaskey@3 178 sb.append(ch1);
jlaskey@3 179 skip(2);
jlaskey@3 180 break;
jlaskey@3 181 case 3:
jlaskey@3 182 sb.append(ch0);
jlaskey@3 183 sb.append(ch1);
jlaskey@3 184 sb.append(ch2);
jlaskey@3 185 skip(3);
jlaskey@3 186 break;
jlaskey@3 187 default:
jlaskey@3 188 assert false : "Should not reach here";
jlaskey@3 189 }
jlaskey@3 190
hannesw@127 191 return true;
jlaskey@3 192 }
jlaskey@3 193
jlaskey@3 194 /**
jlaskey@3 195 * Restart the buffers back at an earlier position.
jlaskey@3 196 *
jlaskey@3 197 * @param startIn
jlaskey@3 198 * Position in the input stream.
jlaskey@3 199 * @param startOut
jlaskey@3 200 * Position in the output stream.
jlaskey@3 201 */
jlaskey@3 202 private void restart(final int startIn, final int startOut) {
jlaskey@3 203 reset(startIn);
jlaskey@3 204 sb.setLength(startOut);
jlaskey@3 205 }
jlaskey@3 206
jlaskey@3 207 private void push(final char ch) {
jlaskey@3 208 expected.put(ch, expected.get(ch) + 1);
jlaskey@3 209 }
jlaskey@3 210
jlaskey@3 211 private void pop(final char ch) {
jlaskey@3 212 expected.put(ch, Math.min(0, expected.get(ch) - 1));
jlaskey@3 213 }
jlaskey@3 214
jlaskey@3 215 /*
jlaskey@3 216 * Recursive descent tokenizer starts below.
jlaskey@3 217 */
jlaskey@3 218
jlaskey@3 219 /*
jlaskey@3 220 * Disjunction ::
jlaskey@3 221 * Alternative
jlaskey@3 222 * Alternative | Disjunction
jlaskey@3 223 */
hannesw@127 224 private void disjunction() {
jlaskey@3 225 while (true) {
hannesw@127 226 alternative();
jlaskey@3 227
jlaskey@3 228 if (ch0 == '|') {
hannesw@127 229 commit(1);
jlaskey@3 230 } else {
jlaskey@3 231 break;
jlaskey@3 232 }
jlaskey@3 233 }
jlaskey@3 234 }
jlaskey@3 235
jlaskey@3 236 /*
jlaskey@3 237 * Alternative ::
jlaskey@3 238 * [empty]
jlaskey@3 239 * Alternative Term
jlaskey@3 240 */
hannesw@127 241 private void alternative() {
hannesw@127 242 while (term()) {
hannesw@127 243 // do nothing
jlaskey@3 244 }
jlaskey@3 245 }
jlaskey@3 246
jlaskey@3 247 /*
jlaskey@3 248 * Term ::
jlaskey@3 249 * Assertion
jlaskey@3 250 * Atom
jlaskey@3 251 * Atom Quantifier
jlaskey@3 252 */
hannesw@127 253 private boolean term() {
jlaskey@3 254 final int startIn = position;
jlaskey@3 255 final int startOut = sb.length();
jlaskey@3 256
hannesw@127 257 if (assertion()) {
hannesw@127 258 return true;
jlaskey@3 259 }
jlaskey@3 260
hannesw@127 261 if (atom()) {
hannesw@188 262 // Check for character classes that never or always match
hannesw@127 263 if (sb.toString().endsWith("[]")) {
hannesw@188 264 sb.setLength(sb.length() - 1);
hannesw@188 265 sb.append("^\\s\\S]");
hannesw@127 266 } else if (sb.toString().endsWith("[^]")) {
hannesw@127 267 sb.setLength(sb.length() - 2);
hannesw@127 268 sb.append("\\s\\S]");
jlaskey@3 269 }
jlaskey@3 270
hannesw@188 271 quantifier();
hannesw@127 272 return true;
jlaskey@3 273 }
jlaskey@3 274
jlaskey@3 275 restart(startIn, startOut);
hannesw@127 276 return false;
jlaskey@3 277 }
jlaskey@3 278
jlaskey@3 279 /*
jlaskey@3 280 * Assertion ::
jlaskey@3 281 * ^
jlaskey@3 282 * $
jlaskey@3 283 * \b
jlaskey@3 284 * \B
jlaskey@3 285 * ( ? = Disjunction )
jlaskey@3 286 * ( ? ! Disjunction )
jlaskey@3 287 */
hannesw@127 288 private boolean assertion() {
jlaskey@3 289 final int startIn = position;
jlaskey@3 290 final int startOut = sb.length();
jlaskey@3 291
jlaskey@3 292 switch (ch0) {
jlaskey@3 293 case '^':
jlaskey@3 294 case '$':
hannesw@127 295 return commit(1);
jlaskey@3 296
jlaskey@3 297 case '\\':
jlaskey@3 298 if (ch1 == 'b' || ch1 == 'B') {
hannesw@127 299 return commit(2);
jlaskey@3 300 }
jlaskey@3 301 break;
jlaskey@3 302
jlaskey@3 303 case '(':
jlaskey@3 304 if (ch1 != '?') {
jlaskey@3 305 break;
jlaskey@3 306 }
jlaskey@3 307 if (ch2 != '=' && ch2 != '!') {
jlaskey@3 308 break;
jlaskey@3 309 }
jlaskey@3 310 final boolean isNegativeLookahead = (ch2 == '!');
hannesw@127 311 commit(3);
jlaskey@3 312
jlaskey@3 313 if (isNegativeLookahead) {
jlaskey@3 314 negativeLookaheadLevel++;
jlaskey@3 315 }
hannesw@127 316 disjunction();
jlaskey@3 317 if (isNegativeLookahead) {
jlaskey@3 318 negativeLookaheadLevel--;
jlaskey@3 319 }
jlaskey@3 320
hannesw@127 321 if (ch0 == ')') {
hannesw@127 322 return commit(1);
jlaskey@3 323 }
jlaskey@3 324 break;
jlaskey@3 325
jlaskey@3 326 default:
jlaskey@3 327 break;
jlaskey@3 328 }
jlaskey@3 329
jlaskey@3 330 restart(startIn, startOut);
hannesw@127 331 return false;
jlaskey@3 332 }
jlaskey@3 333
jlaskey@3 334 /*
jlaskey@3 335 * Quantifier ::
jlaskey@3 336 * QuantifierPrefix
jlaskey@3 337 * QuantifierPrefix ?
jlaskey@3 338 */
hannesw@127 339 private boolean quantifier() {
hannesw@127 340 if (quantifierPrefix()) {
jlaskey@3 341 if (ch0 == '?') {
hannesw@127 342 commit(1);
jlaskey@3 343 }
hannesw@127 344 return true;
jlaskey@3 345 }
hannesw@127 346 return false;
jlaskey@3 347 }
jlaskey@3 348
jlaskey@3 349 /*
jlaskey@3 350 * QuantifierPrefix ::
jlaskey@3 351 * *
jlaskey@3 352 * +
jlaskey@3 353 * ?
jlaskey@3 354 * { DecimalDigits }
jlaskey@3 355 * { DecimalDigits , }
jlaskey@3 356 * { DecimalDigits , DecimalDigits }
jlaskey@3 357 */
hannesw@127 358 private boolean quantifierPrefix() {
jlaskey@3 359 final int startIn = position;
jlaskey@3 360 final int startOut = sb.length();
jlaskey@3 361
jlaskey@3 362 switch (ch0) {
jlaskey@3 363 case '*':
jlaskey@3 364 case '+':
jlaskey@3 365 case '?':
hannesw@127 366 return commit(1);
jlaskey@3 367
jlaskey@3 368 case '{':
hannesw@127 369 commit(1);
jlaskey@3 370
hannesw@127 371 if (!decimalDigits()) {
jlaskey@3 372 break; // not a quantifier - back out
jlaskey@3 373 }
jlaskey@3 374 push('}');
jlaskey@3 375
jlaskey@3 376 if (ch0 == ',') {
hannesw@127 377 commit(1);
hannesw@127 378 decimalDigits();
jlaskey@3 379 }
jlaskey@3 380
jlaskey@3 381 if (ch0 == '}') {
jlaskey@3 382 pop('}');
hannesw@127 383 commit(1);
hannesw@179 384 } else {
hannesw@179 385 // Bad quantifier should be rejected but is accepted by all major engines
hannesw@179 386 restart(startIn, startOut);
hannesw@179 387 return false;
jlaskey@3 388 }
jlaskey@3 389
hannesw@127 390 return true;
jlaskey@3 391
jlaskey@3 392 default:
jlaskey@3 393 break;
jlaskey@3 394 }
jlaskey@3 395
jlaskey@3 396 restart(startIn, startOut);
hannesw@127 397 return false;
jlaskey@3 398 }
jlaskey@3 399
jlaskey@3 400 /*
jlaskey@3 401 * Atom ::
jlaskey@3 402 * PatternCharacter
jlaskey@3 403 * .
jlaskey@3 404 * \ AtomEscape
jlaskey@3 405 * CharacterClass
jlaskey@3 406 * ( Disjunction )
jlaskey@3 407 * ( ? : Disjunction )
jlaskey@3 408 *
jlaskey@3 409 */
hannesw@127 410 private boolean atom() {
jlaskey@3 411 final int startIn = position;
jlaskey@3 412 final int startOut = sb.length();
jlaskey@3 413
hannesw@127 414 if (patternCharacter()) {
hannesw@127 415 return true;
jlaskey@3 416 }
jlaskey@3 417
jlaskey@3 418 if (ch0 == '.') {
hannesw@127 419 return commit(1);
jlaskey@3 420 }
jlaskey@3 421
jlaskey@3 422 if (ch0 == '\\') {
hannesw@127 423 commit(1);
jlaskey@3 424
hannesw@127 425 if (atomEscape()) {
hannesw@127 426 return true;
jlaskey@3 427 }
jlaskey@3 428 }
jlaskey@3 429
hannesw@127 430 if (characterClass()) {
hannesw@127 431 return true;
jlaskey@3 432 }
jlaskey@3 433
jlaskey@3 434 if (ch0 == '(') {
jlaskey@3 435 boolean capturingParens = true;
hannesw@127 436 commit(1);
jlaskey@3 437 if (ch0 == '?' && ch1 == ':') {
jlaskey@3 438 capturingParens = false;
hannesw@127 439 commit(2);
jlaskey@3 440 }
jlaskey@3 441
hannesw@127 442 disjunction();
hannesw@127 443
hannesw@127 444 if (ch0 == ')') {
hannesw@127 445 commit(1);
hannesw@127 446 if (capturingParens) {
hannesw@127 447 caps.add(new Capture(negativeLookaheadLevel));
jlaskey@3 448 }
hannesw@127 449 return true;
jlaskey@3 450 }
jlaskey@3 451 }
jlaskey@3 452
jlaskey@3 453 restart(startIn, startOut);
hannesw@127 454 return false;
jlaskey@3 455 }
jlaskey@3 456
jlaskey@3 457 /*
jlaskey@3 458 * PatternCharacter ::
jlaskey@3 459 * SourceCharacter but not any of: ^$\.*+?()[]{}|
jlaskey@3 460 */
jlaskey@3 461 @SuppressWarnings("fallthrough")
hannesw@127 462 private boolean patternCharacter() {
jlaskey@3 463 if (atEOF()) {
hannesw@127 464 return false;
jlaskey@3 465 }
jlaskey@3 466
jlaskey@3 467 switch (ch0) {
jlaskey@3 468 case '^':
jlaskey@3 469 case '$':
jlaskey@3 470 case '\\':
jlaskey@3 471 case '.':
jlaskey@3 472 case '*':
jlaskey@3 473 case '+':
jlaskey@3 474 case '?':
jlaskey@3 475 case '(':
jlaskey@3 476 case ')':
jlaskey@3 477 case '[':
jlaskey@3 478 case '|':
hannesw@127 479 return false;
jlaskey@3 480
jlaskey@3 481 case '}':
jlaskey@3 482 case ']':
jlaskey@3 483 final int n = expected.get(ch0);
jlaskey@3 484 if (n != 0) {
hannesw@127 485 return false;
jlaskey@3 486 }
jlaskey@3 487
jlaskey@3 488 case '{':
jlaskey@3 489 // if not a valid quantifier escape curly brace to match itself
jlaskey@3 490 // this ensures compatibility with other JS implementations
hannesw@127 491 if (!quantifierPrefix()) {
hannesw@127 492 sb.append('\\');
hannesw@127 493 return commit(1);
hannesw@127 494 }
hannesw@127 495 return false;
jlaskey@3 496
jlaskey@3 497 default:
hannesw@127 498 return commit(1); // SOURCECHARACTER
jlaskey@3 499 }
jlaskey@3 500 }
jlaskey@3 501
jlaskey@3 502 /*
jlaskey@3 503 * AtomEscape ::
jlaskey@3 504 * DecimalEscape
jlaskey@3 505 * CharacterEscape
jlaskey@3 506 * CharacterClassEscape
jlaskey@3 507 */
hannesw@127 508 private boolean atomEscape() {
hannesw@127 509 // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
hannesw@127 510 return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
jlaskey@3 511 }
jlaskey@3 512
jlaskey@3 513 /*
jlaskey@3 514 * CharacterEscape ::
jlaskey@3 515 * ControlEscape
jlaskey@3 516 * c ControlLetter
jlaskey@3 517 * HexEscapeSequence
jlaskey@3 518 * UnicodeEscapeSequence
jlaskey@3 519 * IdentityEscape
jlaskey@3 520 */
hannesw@127 521 private boolean characterEscape() {
jlaskey@3 522 final int startIn = position;
jlaskey@3 523 final int startOut = sb.length();
jlaskey@3 524
hannesw@127 525 if (controlEscape()) {
hannesw@127 526 return true;
jlaskey@3 527 }
jlaskey@3 528
jlaskey@3 529 if (ch0 == 'c') {
hannesw@127 530 commit(1);
hannesw@127 531 if (controlLetter()) {
hannesw@127 532 return true;
jlaskey@3 533 }
jlaskey@3 534 restart(startIn, startOut);
jlaskey@3 535 }
jlaskey@3 536
hannesw@127 537 if (hexEscapeSequence() || unicodeEscapeSequence()) {
hannesw@127 538 return true;
jlaskey@3 539 }
jlaskey@3 540
jlaskey@3 541 restart(startIn, startOut);
hannesw@127 542 return false;
jlaskey@3 543 }
jlaskey@3 544
hannesw@127 545 private boolean scanEscapeSequence(final char leader, final int length) {
jlaskey@3 546 final int startIn = position;
jlaskey@3 547 final int startOut = sb.length();
jlaskey@3 548
jlaskey@3 549 if (ch0 != leader) {
jlaskey@3 550 return false;
jlaskey@3 551 }
jlaskey@3 552
hannesw@127 553 commit(1);
jlaskey@3 554 for (int i = 0; i < length; i++) {
jlaskey@3 555 final char ch0l = Character.toLowerCase(ch0);
jlaskey@3 556 if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
hannesw@127 557 commit(1);
jlaskey@3 558 } else {
jlaskey@3 559 restart(startIn, startOut);
jlaskey@3 560 return false;
jlaskey@3 561 }
jlaskey@3 562 }
jlaskey@3 563
jlaskey@3 564 return true;
jlaskey@3 565 }
jlaskey@3 566
hannesw@127 567 private boolean hexEscapeSequence() {
hannesw@127 568 return scanEscapeSequence('x', 2);
jlaskey@3 569 }
jlaskey@3 570
hannesw@127 571 private boolean unicodeEscapeSequence() {
hannesw@127 572 return scanEscapeSequence('u', 4);
jlaskey@3 573 }
jlaskey@3 574
jlaskey@3 575 /*
jlaskey@3 576 * ControlEscape ::
jlaskey@3 577 * one of fnrtv
jlaskey@3 578 */
hannesw@127 579 private boolean controlEscape() {
jlaskey@3 580 switch (ch0) {
jlaskey@3 581 case 'f':
jlaskey@3 582 case 'n':
jlaskey@3 583 case 'r':
jlaskey@3 584 case 't':
jlaskey@3 585 case 'v':
hannesw@127 586 return commit(1);
jlaskey@3 587
jlaskey@3 588 default:
hannesw@127 589 return false;
jlaskey@3 590 }
jlaskey@3 591 }
jlaskey@3 592
jlaskey@3 593 /*
jlaskey@3 594 * ControlLetter ::
jlaskey@3 595 * one of abcdefghijklmnopqrstuvwxyz
jlaskey@3 596 * ABCDEFGHIJKLMNOPQRSTUVWXYZ
jlaskey@3 597 */
hannesw@127 598 private boolean controlLetter() {
hannesw@189 599 // To match other engines we also accept '0'..'9' and '_' as control letters inside a character class.
hannesw@189 600 if ((ch0 >= 'A' && ch0 <= 'Z') || (ch0 >= 'a' && ch0 <= 'z')
hannesw@189 601 || (inCharClass && (isDecimalDigit(ch0) || ch0 == '_'))) {
hannesw@127 602 // for some reason java regexps don't like control characters on the
hannesw@127 603 // form "\\ca".match([string with ascii 1 at char0]). Translating
hannesw@127 604 // them to unicode does it though.
hannesw@127 605 sb.setLength(sb.length() - 1);
hannesw@189 606 unicode(ch0 % 32, sb);
hannesw@127 607 skip(1);
hannesw@127 608 return true;
jlaskey@3 609 }
hannesw@127 610 return false;
jlaskey@3 611 }
jlaskey@3 612
jlaskey@3 613 /*
jlaskey@3 614 * IdentityEscape ::
jlaskey@3 615 * SourceCharacter but not IdentifierPart
jlaskey@3 616 * <ZWJ> (200c)
jlaskey@3 617 * <ZWNJ> (200d)
jlaskey@3 618 */
hannesw@127 619 private boolean identityEscape() {
hannesw@127 620 if (atEOF()) {
hannesw@127 621 throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
hannesw@127 622 }
hannesw@127 623 // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
hannesw@179 624 if (ch0 == 'c') {
hannesw@189 625 sb.append('\\'); // Treat invalid \c control sequence as \\c
hannesw@179 626 } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
hannesw@127 627 sb.setLength(sb.length() - 1);
hannesw@127 628 }
hannesw@127 629 return commit(1);
jlaskey@3 630 }
jlaskey@3 631
jlaskey@3 632 /*
jlaskey@3 633 * DecimalEscape ::
jlaskey@3 634 * DecimalIntegerLiteral [lookahead DecimalDigit]
jlaskey@3 635 */
hannesw@127 636 private boolean decimalEscape() {
jlaskey@3 637 final int startIn = position;
jlaskey@3 638 final int startOut = sb.length();
jlaskey@3 639
hannesw@187 640 if (ch0 == '0' && !isOctalDigit(ch1)) {
hannesw@127 641 skip(1);
jlaskey@3 642 // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
hannesw@127 643 sb.append("\u0000");
hannesw@127 644 return true;
jlaskey@3 645 }
jlaskey@3 646
jlaskey@3 647 if (isDecimalDigit(ch0)) {
hannesw@127 648
hannesw@187 649 if (ch0 == '0') {
hannesw@187 650 // We know this is an octal escape.
hannesw@187 651 if (inCharClass) {
hannesw@187 652 // Convert octal escape to unicode escape if inside character class.
hannesw@187 653 int octalValue = 0;
hannesw@187 654 while (isOctalDigit(ch0)) {
hannesw@187 655 octalValue = octalValue * 8 + ch0 - '0';
hannesw@187 656 skip(1);
hannesw@127 657 }
hannesw@187 658
hannesw@187 659 unicode(octalValue, sb);
hannesw@187 660
hannesw@187 661 } else {
hannesw@187 662 // Copy decimal escape as-is
hannesw@187 663 decimalDigits();
hannesw@127 664 }
hannesw@187 665 } else {
hannesw@187 666 // This should be a backreference, but could also be an octal escape or even a literal string.
hannesw@187 667 int decimalValue = 0;
hannesw@127 668 while (isDecimalDigit(ch0)) {
hannesw@187 669 decimalValue = decimalValue * 10 + ch0 - '0';
hannesw@127 670 skip(1);
hannesw@127 671 }
hannesw@127 672
hannesw@187 673 if (inCharClass) {
hannesw@187 674 // No backreferences in character classes. Encode as unicode escape or literal char sequence
hannesw@187 675 sb.setLength(sb.length() - 1);
hannesw@187 676 octalOrLiteral(Integer.toString(decimalValue), sb);
hannesw@187 677
hannesw@187 678 } else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) {
hannesw@187 679 // Captures that live inside a negative lookahead are dead after the
hannesw@187 680 // lookahead and will be undefined if referenced from outside.
hannesw@187 681 if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
hannesw@187 682 sb.setLength(sb.length() - 1);
hannesw@187 683 } else {
hannesw@187 684 sb.append(decimalValue);
hannesw@187 685 }
hannesw@187 686 } else if (decimalValue > caps.size()) {
hannesw@187 687 // Forward reference to a capture group. Forward references are always undefined so we can omit
hannesw@187 688 // it from the output buffer. However, if the target capture does not exist, we need to rewrite
hannesw@187 689 // the reference as hex escape or literal string, so register the reference for later processing.
hannesw@187 690 sb.setLength(sb.length() - 1);
hannesw@187 691 forwardReferences.add(decimalValue);
hannesw@187 692 forwardReferences.add(sb.length());
hannesw@187 693 } else {
hannesw@187 694 // Append as backreference
hannesw@187 695 sb.append(decimalValue);
hannesw@127 696 }
hannesw@127 697
hannesw@127 698 }
hannesw@127 699 return true;
jlaskey@3 700 }
jlaskey@3 701
jlaskey@3 702 restart(startIn, startOut);
hannesw@127 703 return false;
jlaskey@3 704 }
jlaskey@3 705
jlaskey@3 706 /*
jlaskey@3 707 * CharacterClassEscape ::
jlaskey@3 708 * one of dDsSwW
jlaskey@3 709 */
hannesw@127 710 private boolean characterClassEscape() {
jlaskey@3 711 switch (ch0) {
hannesw@127 712 // java.util.regex requires translation of \s and \S to explicit character list
jlaskey@3 713 case 's':
hannesw@127 714 if (RegExpFactory.usesJavaUtilRegex()) {
hannesw@127 715 sb.setLength(sb.length() - 1);
hannesw@130 716 // No nested class required if we already are inside a character class
hannesw@130 717 if (inCharClass) {
hannesw@130 718 sb.append(Lexer.getWhitespaceRegExp());
hannesw@130 719 } else {
hannesw@130 720 sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
hannesw@130 721 }
hannesw@127 722 skip(1);
hannesw@127 723 return true;
hannesw@127 724 }
hannesw@127 725 return commit(1);
jlaskey@3 726 case 'S':
hannesw@127 727 if (RegExpFactory.usesJavaUtilRegex()) {
hannesw@127 728 sb.setLength(sb.length() - 1);
hannesw@130 729 // In negative class we must use intersection to get double negation ("not anything else than space")
hannesw@130 730 sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']');
hannesw@127 731 skip(1);
hannesw@127 732 return true;
hannesw@127 733 }
hannesw@127 734 return commit(1);
jlaskey@3 735 case 'd':
jlaskey@3 736 case 'D':
jlaskey@3 737 case 'w':
jlaskey@3 738 case 'W':
hannesw@127 739 return commit(1);
jlaskey@3 740
jlaskey@3 741 default:
hannesw@127 742 return false;
jlaskey@3 743 }
jlaskey@3 744 }
jlaskey@3 745
jlaskey@3 746 /*
jlaskey@3 747 * CharacterClass ::
jlaskey@3 748 * [ [lookahead {^}] ClassRanges ]
jlaskey@3 749 * [ ^ ClassRanges ]
jlaskey@3 750 */
hannesw@127 751 private boolean characterClass() {
jlaskey@3 752 final int startIn = position;
jlaskey@3 753 final int startOut = sb.length();
jlaskey@3 754
jlaskey@3 755 if (ch0 == '[') {
hannesw@127 756 try {
hannesw@127 757 inCharClass = true;
hannesw@127 758 push(']');
hannesw@127 759 commit(1);
jlaskey@3 760
hannesw@127 761 if (ch0 == '^') {
hannesw@130 762 inNegativeClass = true;
hannesw@127 763 commit(1);
hannesw@127 764 }
jlaskey@3 765
hannesw@127 766 if (classRanges() && ch0 == ']') {
hannesw@127 767 pop(']');
hannesw@127 768 return commit(1);
hannesw@127 769 }
hannesw@127 770 } finally {
hannesw@127 771 inCharClass = false; // no nested character classes in JavaScript
hannesw@130 772 inNegativeClass = false;
jlaskey@3 773 }
jlaskey@3 774 }
jlaskey@3 775
jlaskey@3 776 restart(startIn, startOut);
hannesw@127 777 return false;
jlaskey@3 778 }
jlaskey@3 779
jlaskey@3 780 /*
jlaskey@3 781 * ClassRanges ::
jlaskey@3 782 * [empty]
jlaskey@3 783 * NonemptyClassRanges
jlaskey@3 784 */
hannesw@127 785 private boolean classRanges() {
hannesw@127 786 nonemptyClassRanges();
hannesw@127 787 return true;
jlaskey@3 788 }
jlaskey@3 789
jlaskey@3 790 /*
jlaskey@3 791 * NonemptyClassRanges ::
jlaskey@3 792 * ClassAtom
jlaskey@3 793 * ClassAtom NonemptyClassRangesNoDash
jlaskey@3 794 * ClassAtom - ClassAtom ClassRanges
jlaskey@3 795 */
hannesw@127 796 private boolean nonemptyClassRanges() {
jlaskey@3 797 final int startIn = position;
jlaskey@3 798 final int startOut = sb.length();
jlaskey@3 799
hannesw@127 800 if (classAtom()) {
jlaskey@3 801
jlaskey@3 802 if (ch0 == '-') {
hannesw@127 803 commit(1);
jlaskey@3 804
hannesw@127 805 if (classAtom() && classRanges()) {
hannesw@127 806 return true;
jlaskey@3 807 }
jlaskey@3 808 }
jlaskey@3 809
hannesw@127 810 nonemptyClassRangesNoDash();
jlaskey@3 811
hannesw@127 812 return true;
jlaskey@3 813 }
jlaskey@3 814
jlaskey@3 815 restart(startIn, startOut);
hannesw@127 816 return false;
jlaskey@3 817 }
jlaskey@3 818
jlaskey@3 819 /*
jlaskey@3 820 * NonemptyClassRangesNoDash ::
jlaskey@3 821 * ClassAtom
jlaskey@3 822 * ClassAtomNoDash NonemptyClassRangesNoDash
jlaskey@3 823 * ClassAtomNoDash - ClassAtom ClassRanges
jlaskey@3 824 */
hannesw@127 825 private boolean nonemptyClassRangesNoDash() {
jlaskey@3 826 final int startIn = position;
jlaskey@3 827 final int startOut = sb.length();
jlaskey@3 828
hannesw@127 829 if (classAtomNoDash()) {
jlaskey@3 830
jlaskey@3 831 // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
jlaskey@3 832 if (ch0 == '-') {
hannesw@127 833 commit(1);
jlaskey@3 834
hannesw@127 835 if (classAtom() && classRanges()) {
hannesw@127 836 return true;
jlaskey@3 837 }
jlaskey@3 838 //fallthru
jlaskey@3 839 }
jlaskey@3 840
hannesw@127 841 nonemptyClassRangesNoDash();
hannesw@127 842 return true; // still a class atom
jlaskey@3 843 }
jlaskey@3 844
hannesw@127 845 if (classAtom()) {
hannesw@127 846 return true;
jlaskey@3 847 }
jlaskey@3 848
jlaskey@3 849 restart(startIn, startOut);
hannesw@127 850 return false;
jlaskey@3 851 }
jlaskey@3 852
jlaskey@3 853 /*
jlaskey@3 854 * ClassAtom : - ClassAtomNoDash
jlaskey@3 855 */
hannesw@127 856 private boolean classAtom() {
jlaskey@3 857
jlaskey@3 858 if (ch0 == '-') {
hannesw@127 859 return commit(1);
jlaskey@3 860 }
jlaskey@3 861
hannesw@127 862 return classAtomNoDash();
jlaskey@3 863 }
jlaskey@3 864
jlaskey@3 865 /*
jlaskey@3 866 * ClassAtomNoDash ::
jlaskey@3 867 * SourceCharacter but not one of \ or ] or -
jlaskey@3 868 * \ ClassEscape
jlaskey@3 869 */
hannesw@127 870 private boolean classAtomNoDash() {
jlaskey@3 871 final int startIn = position;
jlaskey@3 872 final int startOut = sb.length();
jlaskey@3 873
jlaskey@3 874 switch (ch0) {
jlaskey@3 875 case ']':
jlaskey@3 876 case '-':
hannesw@127 877 return false;
jlaskey@3 878
jlaskey@3 879 case '[':
jlaskey@3 880 // unescaped left square bracket - add escape
hannesw@127 881 sb.append('\\');
hannesw@127 882 return commit(1);
jlaskey@3 883
jlaskey@3 884 case '\\':
hannesw@127 885 commit(1);
hannesw@127 886 if (classEscape()) {
hannesw@127 887 return true;
jlaskey@3 888 }
jlaskey@3 889
jlaskey@3 890 restart(startIn, startOut);
hannesw@127 891 return false;
jlaskey@3 892
jlaskey@3 893 default:
hannesw@127 894 return commit(1);
jlaskey@3 895 }
jlaskey@3 896 }
jlaskey@3 897
jlaskey@3 898 /*
jlaskey@3 899 * ClassEscape ::
jlaskey@3 900 * DecimalEscape
jlaskey@3 901 * b
jlaskey@3 902 * CharacterEscape
jlaskey@3 903 * CharacterClassEscape
jlaskey@3 904 */
hannesw@127 905 private boolean classEscape() {
jlaskey@3 906
hannesw@127 907 if (decimalEscape()) {
hannesw@127 908 return true;
jlaskey@3 909 }
jlaskey@3 910
jlaskey@3 911 if (ch0 == 'b') {
hannesw@127 912 sb.setLength(sb.length() - 1);
hannesw@127 913 sb.append('\b');
hannesw@127 914 skip(1);
hannesw@127 915 return true;
jlaskey@3 916 }
jlaskey@3 917
hannesw@127 918 // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
hannesw@127 919 return characterEscape() || characterClassEscape() || identityEscape();
jlaskey@3 920 }
jlaskey@3 921
jlaskey@3 922 /*
jlaskey@3 923 * DecimalDigits
jlaskey@3 924 */
hannesw@127 925 private boolean decimalDigits() {
jlaskey@3 926 if (!isDecimalDigit(ch0)) {
hannesw@127 927 return false;
jlaskey@3 928 }
jlaskey@3 929
jlaskey@3 930 while (isDecimalDigit(ch0)) {
hannesw@127 931 commit(1);
jlaskey@3 932 }
jlaskey@3 933
hannesw@127 934 return true;
hannesw@127 935 }
hannesw@127 936
hannesw@187 937 private void unicode(final int value, final StringBuilder buffer) {
hannesw@127 938 final String hex = Integer.toHexString(value);
hannesw@187 939 buffer.append('u');
hannesw@127 940 for (int i = 0; i < 4 - hex.length(); i++) {
hannesw@187 941 buffer.append('0');
hannesw@127 942 }
hannesw@187 943 buffer.append(hex);
hannesw@187 944 }
hannesw@187 945
hannesw@187 946 // Convert what would have been a backreference into a unicode escape, or a number literal, or both.
hannesw@187 947 private void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
hannesw@187 948 final int length = numberLiteral.length();
hannesw@187 949 int octalValue = 0;
hannesw@187 950 int pos = 0;
hannesw@187 951 // Maximum value for octal escape is 0377 (255) so we stop the loop at 32
hannesw@187 952 while (pos < length && octalValue < 0x20) {
hannesw@187 953 final char ch = numberLiteral.charAt(pos);
hannesw@187 954 if (isOctalDigit(ch)) {
hannesw@187 955 octalValue = octalValue * 8 + ch - '0';
hannesw@187 956 } else {
hannesw@187 957 break;
hannesw@187 958 }
hannesw@187 959 pos++;
hannesw@187 960 }
hannesw@187 961 if (octalValue > 0) {
hannesw@187 962 buffer.append('\\');
hannesw@187 963 unicode(octalValue, buffer);
hannesw@187 964 buffer.append(numberLiteral.substring(pos));
hannesw@187 965 } else {
hannesw@187 966 buffer.append(numberLiteral);
hannesw@187 967 }
hannesw@187 968 }
hannesw@187 969
hannesw@187 970 private static boolean isOctalDigit(final char ch) {
hannesw@187 971 return ch >= '0' && ch <= '7';
jlaskey@3 972 }
jlaskey@3 973
jlaskey@3 974 private static boolean isDecimalDigit(final char ch) {
jlaskey@3 975 return ch >= '0' && ch <= '9';
jlaskey@3 976 }
jlaskey@3 977 }

mercurial