src/jdk/nashorn/internal/runtime/RegExpScanner.java

Mon, 11 Feb 2013 21:26:06 +0530

author
sundar
date
Mon, 11 Feb 2013 21:26:06 +0530
changeset 82
abea4ba28901
parent 7
src/jdk/nashorn/internal/parser/RegExpScanner.java@5a1b0714df0e
child 97
757a49aaad02
permissions
-rw-r--r--

8007915: Nashorn IR, codegen, parser packages and Context instance should be inaccessible to user code
Reviewed-by: lagergren, jlaskey, attila

jlaskey@3 1 /*
jlaskey@7 2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
jlaskey@3 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
jlaskey@3 4 *
jlaskey@3 5 * This code is free software; you can redistribute it and/or modify it
jlaskey@3 6 * under the terms of the GNU General Public License version 2 only, as
jlaskey@3 7 * published by the Free Software Foundation. Oracle designates this
jlaskey@3 8 * particular file as subject to the "Classpath" exception as provided
jlaskey@3 9 * by Oracle in the LICENSE file that accompanied this code.
jlaskey@3 10 *
jlaskey@3 11 * This code is distributed in the hope that it will be useful, but WITHOUT
jlaskey@3 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
jlaskey@3 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
jlaskey@3 14 * version 2 for more details (a copy is included in the LICENSE file that
jlaskey@3 15 * accompanied this code).
jlaskey@3 16 *
jlaskey@3 17 * You should have received a copy of the GNU General Public License version
jlaskey@3 18 * 2 along with this work; if not, write to the Free Software Foundation,
jlaskey@3 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
jlaskey@3 20 *
jlaskey@3 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
jlaskey@3 22 * or visit www.oracle.com if you need additional information or have any
jlaskey@3 23 * questions.
jlaskey@3 24 */
jlaskey@3 25
sundar@82 26 package jdk.nashorn.internal.runtime;
jlaskey@3 27
jlaskey@3 28 import java.util.ArrayList;
jlaskey@3 29 import java.util.HashMap;
jlaskey@3 30 import java.util.Iterator;
jlaskey@3 31 import java.util.LinkedHashMap;
jlaskey@3 32 import java.util.LinkedList;
jlaskey@3 33 import java.util.List;
jlaskey@3 34 import java.util.Map;
jlaskey@3 35 import java.util.regex.PatternSyntaxException;
sundar@82 36 import jdk.nashorn.internal.parser.Lexer;
sundar@82 37 import jdk.nashorn.internal.parser.Scanner;
jlaskey@3 38
jlaskey@3 39 /**
jlaskey@3 40 * Scan a JavaScript regexp, converting to Java regex if necessary.
jlaskey@3 41 *
jlaskey@3 42 */
sundar@82 43 final class RegExpScanner extends Scanner {
jlaskey@3 44
jlaskey@3 45 /**
jlaskey@3 46 * String builder to accumulate the result - this contains verbatim parsed JavaScript.
jlaskey@3 47 * to get the java equivalent we need to create a Pattern token and return its toString()
jlaskey@3 48 */
jlaskey@3 49 private final StringBuilder sb;
jlaskey@3 50
jlaskey@3 51 /** An optional error message if one occurred during parse. */
jlaskey@3 52 private String errorMessage;
jlaskey@3 53
jlaskey@3 54 /** Is this the special case of a regexp that never matches anything */
jlaskey@3 55 private boolean neverMatches;
jlaskey@3 56
jlaskey@3 57 /** The resulting java.util.regex pattern string. */
jlaskey@3 58 private String javaPattern;
jlaskey@3 59
jlaskey@3 60 /** Expected token table */
jlaskey@3 61 private final Map<Character, Integer> expected = new HashMap<>();
jlaskey@3 62
jlaskey@3 63 /** Capturing parenthesis that have been found so far. */
jlaskey@3 64 private final List<Capture> caps = new LinkedList<>();
jlaskey@3 65
jlaskey@3 66 /** Forward references to capturing parenthesis to be resolved later.*/
jlaskey@3 67 private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>();
jlaskey@3 68
jlaskey@3 69 /** Current level of zero-width negative lookahead assertions. */
jlaskey@3 70 private int negativeLookaheadLevel;
jlaskey@3 71
jlaskey@3 72 private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
jlaskey@3 73
jlaskey@3 74 private static class Capture {
jlaskey@3 75 /**
jlaskey@3 76 * Zero-width negative lookaheads enclosing the capture.
jlaskey@3 77 */
jlaskey@3 78 private final int negativeLookaheadLevel;
jlaskey@3 79 /**
jlaskey@3 80 * Captures that live inside a negative lookahead are dead after the
jlaskey@3 81 * lookahead and will be undefined if referenced from outside.
jlaskey@3 82 */
jlaskey@3 83 private boolean isDead;
jlaskey@3 84
jlaskey@3 85 Capture(final int negativeLookaheadLevel) {
jlaskey@3 86 this.negativeLookaheadLevel = negativeLookaheadLevel;
jlaskey@3 87 }
jlaskey@3 88
jlaskey@3 89 public int getNegativeLookaheadLevel() {
jlaskey@3 90 return negativeLookaheadLevel;
jlaskey@3 91 }
jlaskey@3 92
jlaskey@3 93 public boolean isDead() {
jlaskey@3 94 return isDead;
jlaskey@3 95 }
jlaskey@3 96
jlaskey@3 97 public void setDead() {
jlaskey@3 98 this.isDead = true;
jlaskey@3 99 }
jlaskey@3 100 }
jlaskey@3 101
jlaskey@3 102 /**
jlaskey@3 103 * This is a token - the JavaScript regexp is scanned into a token tree
jlaskey@3 104 * A token has other tokens as children as well as "atoms", i.e. Strings.
jlaskey@3 105 *
jlaskey@3 106 */
jlaskey@3 107 private static class Token {
jlaskey@3 108
jlaskey@3 109 private enum Type {
jlaskey@3 110 PATTERN,
jlaskey@3 111 DISJUNCTION,
jlaskey@3 112 ALTERNATIVE,
jlaskey@3 113 TERM,
jlaskey@3 114 ASSERTION,
jlaskey@3 115 QUANTIFIER,
jlaskey@3 116 QUANTIFIER_PREFIX,
jlaskey@3 117 ATOM,
jlaskey@3 118 PATTERN_CHARACTER,
jlaskey@3 119 ATOM_ESCAPE,
jlaskey@3 120 CHARACTER_ESCAPE,
jlaskey@3 121 CONTROL_ESCAPE,
jlaskey@3 122 CONTROL_LETTER,
jlaskey@3 123 IDENTITY_ESCAPE,
jlaskey@3 124 DECIMAL_ESCAPE,
jlaskey@3 125 CHARACTERCLASS_ESCAPE,
jlaskey@3 126 CHARACTERCLASS,
jlaskey@3 127 CLASSRANGES,
jlaskey@3 128 NON_EMPTY_CLASSRANGES,
jlaskey@3 129 NON_EMPTY_CLASSRANGES_NODASH,
jlaskey@3 130 CLASSATOM,
jlaskey@3 131 CLASSATOM_NODASH,
jlaskey@3 132 CLASS_ESCAPE,
jlaskey@3 133 DECIMALDIGITS,
jlaskey@3 134 HEX_ESCAPESEQUENCE,
jlaskey@3 135 UNICODE_ESCAPESEQUENCE,
jlaskey@3 136 }
jlaskey@3 137
jlaskey@3 138 /**
jlaskey@3 139 * Token tyoe
jlaskey@3 140 */
jlaskey@3 141 private final Token.Type type;
jlaskey@3 142
jlaskey@3 143 /**
jlaskey@3 144 * Child nodes
jlaskey@3 145 */
jlaskey@3 146 private final List<Object> children;
jlaskey@3 147
jlaskey@3 148 /**
jlaskey@3 149 * Parent node
jlaskey@3 150 */
jlaskey@3 151 private Token parent;
jlaskey@3 152
jlaskey@3 153 /**
jlaskey@3 154 * Dead code flag
jlaskey@3 155 */
jlaskey@3 156 private boolean isDead;
jlaskey@3 157
jlaskey@3 158 private static final Map<Type, ToString> toStringMap = new HashMap<>();
jlaskey@3 159 private static final ToString DEFAULT_TOSTRING = new ToString();
jlaskey@3 160
jlaskey@3 161 private static String unicode(final int value) {
jlaskey@3 162 final StringBuilder sb = new StringBuilder();
jlaskey@3 163 final String hex = Integer.toHexString(value);
jlaskey@3 164 sb.append('u');
jlaskey@3 165 for (int i = 0; i < 4 - hex.length(); i++) {
jlaskey@3 166 sb.append('0');
jlaskey@3 167 }
jlaskey@3 168 sb.append(hex);
jlaskey@3 169
jlaskey@3 170 return sb.toString();
jlaskey@3 171 }
jlaskey@3 172
jlaskey@3 173 static {
jlaskey@3 174 toStringMap.put(Type.CHARACTERCLASS, new ToString() {
jlaskey@3 175 @Override
jlaskey@3 176 public String toString(final Token token) {
jlaskey@3 177 return super.toString(token).replace("\\b", "\b");
jlaskey@3 178 }
jlaskey@3 179 });
jlaskey@3 180
jlaskey@3 181 // for some reason java regexps don't like control characters on the
jlaskey@3 182 // form "\\ca".match([string with ascii 1 at char0]). Translating
jlaskey@3 183 // them to unicode does it though.
jlaskey@3 184 toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() {
jlaskey@3 185 @Override
jlaskey@3 186 public String toString(final Token token) {
jlaskey@3 187 final String str = super.toString(token);
jlaskey@3 188 if (str.length() == 2) {
jlaskey@3 189 return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1);
jlaskey@3 190 }
jlaskey@3 191 return str;
jlaskey@3 192 }
jlaskey@3 193 });
jlaskey@3 194
jlaskey@3 195 toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() {
jlaskey@3 196 @Override
jlaskey@3 197 public String toString(final Token token) {
jlaskey@3 198 final String str = super.toString(token);
jlaskey@3 199
jlaskey@3 200 if ("\0".equals(str)) {
jlaskey@3 201 return str;
jlaskey@3 202 }
jlaskey@3 203
jlaskey@3 204 int value;
jlaskey@3 205
jlaskey@3 206 if (!token.hasParentOfType(Type.CLASSRANGES)) {
jlaskey@3 207 return str;
jlaskey@3 208 }
jlaskey@3 209
jlaskey@3 210 value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal
jlaskey@3 211 if (value > 0xff) {
jlaskey@3 212 throw new NumberFormatException(str);
jlaskey@3 213 }
jlaskey@3 214
jlaskey@3 215 return Token.unicode(value);
jlaskey@3 216 }
jlaskey@3 217 });
jlaskey@3 218
jlaskey@3 219 }
jlaskey@3 220
jlaskey@3 221 /**
jlaskey@3 222 * JavaScript Token to Java regex substring framework.
jlaskey@3 223 *
jlaskey@3 224 */
jlaskey@3 225 private static class ToString {
jlaskey@3 226 String toString(final Token token) {
jlaskey@3 227 final StringBuilder sb = new StringBuilder();
jlaskey@3 228 for (final Object child : token.getChildren()) {
jlaskey@3 229 sb.append(child);
jlaskey@3 230 }
jlaskey@3 231
jlaskey@3 232 //perform global substitutions that hold true for any evaluated form
jlaskey@3 233 String str = sb.toString();
jlaskey@3 234 switch (str) {
jlaskey@3 235 case "\\s":
jlaskey@3 236 str = "[" + Lexer.getWhitespaceRegExp() + "]";
jlaskey@3 237 break;
jlaskey@3 238 case "\\S":
jlaskey@3 239 str = "[^" + Lexer.getWhitespaceRegExp() + "]";
jlaskey@3 240 break;
jlaskey@3 241 case "[^]":
jlaskey@3 242 str = "[\\s\\S]";
jlaskey@3 243 break;
jlaskey@3 244 default:
jlaskey@3 245 break;
jlaskey@3 246 }
jlaskey@3 247 return str;
jlaskey@3 248 }
jlaskey@3 249 }
jlaskey@3 250
jlaskey@3 251 /**
jlaskey@3 252 * Token iterator. Doesn't return "atom" children. i.e. string representations,
jlaskey@3 253 * just tokens
jlaskey@3 254 *
jlaskey@3 255 */
jlaskey@3 256 private static class TokenIterator implements Iterator<Token> {
jlaskey@3 257 private final List<Token> preorder;
jlaskey@3 258
jlaskey@3 259 private void init(final Token root) {
jlaskey@3 260 preorder.add(root);
jlaskey@3 261 for (final Object child : root.getChildren()) {
jlaskey@3 262 if (child instanceof Token) {
jlaskey@3 263 init((Token)child);
jlaskey@3 264 }
jlaskey@3 265 }
jlaskey@3 266 }
jlaskey@3 267
jlaskey@3 268 TokenIterator(final Token root) {
jlaskey@3 269 preorder = new ArrayList<>();
jlaskey@3 270 init(root);
jlaskey@3 271 }
jlaskey@3 272
jlaskey@3 273 @Override
jlaskey@3 274 public boolean hasNext() {
jlaskey@3 275 return !preorder.isEmpty();
jlaskey@3 276 }
jlaskey@3 277
jlaskey@3 278 @Override
jlaskey@3 279 public Token next() {
jlaskey@3 280 return preorder.remove(0);
jlaskey@3 281 }
jlaskey@3 282
jlaskey@3 283 @Override
jlaskey@3 284 public void remove() {
jlaskey@3 285 next();
jlaskey@3 286 }
jlaskey@3 287 }
jlaskey@3 288
jlaskey@3 289 /**
jlaskey@3 290 * Constructor
jlaskey@3 291 * @param type the token type
jlaskey@3 292 */
jlaskey@3 293 Token(final Token.Type type) {
jlaskey@3 294 this.type = type;
jlaskey@3 295 children = new ArrayList<>();
jlaskey@3 296 }
jlaskey@3 297
jlaskey@3 298 /**
jlaskey@3 299 * Add a an "atom" child to a token
jlaskey@3 300 * @param child the child to add
jlaskey@3 301 * @return the token (for chaining)
jlaskey@3 302 */
jlaskey@3 303 public Token add(final String child) {
jlaskey@3 304 children.add(child);
jlaskey@3 305 return this;
jlaskey@3 306 }
jlaskey@3 307
jlaskey@3 308 /**
jlaskey@3 309 * Add a child to a token
jlaskey@3 310 * @param child the child
jlaskey@3 311 * @return the token (for chaining)
jlaskey@3 312 */
jlaskey@3 313 public Token add(final Token child) {
jlaskey@3 314 if (child != null) {
jlaskey@3 315 children.add(child);
jlaskey@3 316 child.setParent(this);
jlaskey@3 317 }
jlaskey@3 318 return this;
jlaskey@3 319 }
jlaskey@3 320
jlaskey@3 321 /**
jlaskey@3 322 * Remove a child from a token
jlaskey@3 323 * @param child the child to remove
jlaskey@3 324 * @return true if successful
jlaskey@3 325 */
jlaskey@3 326 public boolean remove(final Token child) {
jlaskey@3 327 return children.remove(child);
jlaskey@3 328 }
jlaskey@3 329
jlaskey@3 330 /**
jlaskey@3 331 * Remove the last child from a token
jlaskey@3 332 * @return the removed child
jlaskey@3 333 */
jlaskey@3 334 public Object removeLast() {
jlaskey@3 335 return children.remove(children.size() - 1);
jlaskey@3 336 }
jlaskey@3 337
jlaskey@3 338 /**
jlaskey@3 339 * Flag this token as dead code
jlaskey@3 340 * @param isDead is it dead or not
jlaskey@3 341 */
jlaskey@3 342 private void setIsDead(final boolean isDead) {
jlaskey@3 343 this.isDead = isDead;
jlaskey@3 344 }
jlaskey@3 345
jlaskey@3 346 /**
jlaskey@3 347 * Is this token dead code
jlaskey@3 348 * @return boolean
jlaskey@3 349 */
jlaskey@3 350 private boolean getIsDead() {
jlaskey@3 351 return isDead;
jlaskey@3 352 }
jlaskey@3 353
jlaskey@3 354 /**
jlaskey@3 355 * Get the parent of this token
jlaskey@3 356 * @return parent token
jlaskey@3 357 */
jlaskey@3 358 public Token getParent() {
jlaskey@3 359 return parent;
jlaskey@3 360 }
jlaskey@3 361
jlaskey@3 362 public boolean hasParentOfType(final Token.Type parentType) {
jlaskey@3 363 for (Token p = getParent(); p != null; p = p.getParent()) {
jlaskey@3 364 if (p.getType() == parentType) {
jlaskey@3 365 return true;
jlaskey@3 366 }
jlaskey@3 367 }
jlaskey@3 368 return false;
jlaskey@3 369 }
jlaskey@3 370
jlaskey@3 371 public boolean hasChildOfType(final Token.Type childType) {
jlaskey@3 372 for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) {
jlaskey@3 373 if (iter.next().getType() == childType) {
jlaskey@3 374 return true;
jlaskey@3 375 }
jlaskey@3 376 }
jlaskey@3 377 return false;
jlaskey@3 378 }
jlaskey@3 379
jlaskey@3 380 /**
jlaskey@3 381 * Set the parent of this token
jlaskey@3 382 * @param parent
jlaskey@3 383 */
jlaskey@3 384 private void setParent(final Token parent) {
jlaskey@3 385 this.parent = parent;
jlaskey@3 386 }
jlaskey@3 387
jlaskey@3 388 /**
jlaskey@3 389 * Get the children of this token
jlaskey@3 390 * @return an array of children, never null
jlaskey@3 391 */
jlaskey@3 392 public Object[] getChildren() {
jlaskey@3 393 return children.toArray();
jlaskey@3 394 }
jlaskey@3 395
jlaskey@3 396 /**
jlaskey@3 397 * Reset this token, remove all children
jlaskey@3 398 */
jlaskey@3 399 public void reset() {
jlaskey@3 400 children.clear();
jlaskey@3 401 }
jlaskey@3 402
jlaskey@3 403 /**
jlaskey@3 404 * Get a preorder token iterator with this token as root
jlaskey@3 405 * @return an iterator
jlaskey@3 406 */
jlaskey@3 407 public Iterator<Token> iterator() {
jlaskey@3 408 return new TokenIterator(this);
jlaskey@3 409 }
jlaskey@3 410
jlaskey@3 411 /**
jlaskey@3 412 * Get the type of this token
jlaskey@3 413 * @return type
jlaskey@3 414 */
jlaskey@3 415 public Type getType() {
jlaskey@3 416 return type;
jlaskey@3 417 }
jlaskey@3 418
jlaskey@3 419 /**
jlaskey@3 420 * Turn this token into Java regexp compatible text
jlaskey@3 421 * @return part of a java regexp
jlaskey@3 422 */
jlaskey@3 423 @Override
jlaskey@3 424 public String toString() {
jlaskey@3 425 ToString t = toStringMap.get(getType());
jlaskey@3 426 if (t == null) {
jlaskey@3 427 t = DEFAULT_TOSTRING;
jlaskey@3 428 }
jlaskey@3 429 return t.toString(this);
jlaskey@3 430 }
jlaskey@3 431 }
jlaskey@3 432
/**
 * Creates a scanner over the given regexp source, rewound to position 0,
 * with the counters for both closing characters (']' and '}') at zero.
 *
 * @param string the JavaScript regexp to parse
 */
private RegExpScanner(final String string) {
    super(string);
    this.sb = new StringBuilder(limit);
    reset(0);
    for (final char closer : new char[] {']', '}'}) {
        expected.put(closer, 0);
    }
}
jlaskey@3 444
jlaskey@3 445 private void processForwardReferences() {
jlaskey@3 446 if (neverMatches()) {
jlaskey@3 447 return;
jlaskey@3 448 }
jlaskey@3 449
jlaskey@3 450 for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) {
jlaskey@3 451 if (fwdRef.getKey().intValue() > caps.size()) {
jlaskey@3 452 neverMatches = true;
jlaskey@3 453 break;
jlaskey@3 454 }
jlaskey@3 455
jlaskey@3 456 fwdRef.getValue().setIsDead(true);
jlaskey@3 457 }
jlaskey@3 458
jlaskey@3 459 forwardReferences.clear();
jlaskey@3 460 }
jlaskey@3 461
jlaskey@3 462 /**
jlaskey@3 463 * Scan a JavaScript regexp string returning a Java safe regex string.
jlaskey@3 464 *
jlaskey@3 465 * @param string
jlaskey@3 466 * JavaScript regexp string.
jlaskey@3 467 * @return Java safe regex string.
jlaskey@3 468 */
jlaskey@3 469 public static RegExpScanner scan(final String string) {
jlaskey@3 470 final RegExpScanner scanner = new RegExpScanner(string);
jlaskey@3 471
jlaskey@3 472 Token pattern;
jlaskey@3 473
jlaskey@3 474 try {
jlaskey@3 475 pattern = scanner.pattern();
jlaskey@3 476 } catch (final Exception e) {
jlaskey@3 477 throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length());
jlaskey@3 478 }
jlaskey@3 479
jlaskey@3 480 scanner.processForwardReferences();
jlaskey@3 481 if (scanner.neverMatches()) {
jlaskey@3 482 return null; // never matches
jlaskey@3 483 }
jlaskey@3 484
jlaskey@3 485 // go over the code and remove dead code
jlaskey@3 486 final Iterator<Token> iter = pattern.iterator();
jlaskey@3 487 while (iter.hasNext()) {
jlaskey@3 488 final Token next = iter.next();
jlaskey@3 489 if (next.getIsDead()) {
jlaskey@3 490 next.getParent().remove(next);
jlaskey@3 491 }
jlaskey@3 492 }
jlaskey@3 493
jlaskey@3 494 // turn the pattern into a string, p, the java equivalent string for our js regexp
jlaskey@3 495 final String p = pattern.toString();
jlaskey@3 496 // if builder contains all tokens that were sent in, we know
jlaskey@3 497 // we correctly parsed the entire JavaScript regexp without syntax errors
jlaskey@3 498 if (!string.equals(scanner.getStringBuilder().toString())) {
jlaskey@3 499 throw new PatternSyntaxException(string, p, p.length() + 1);
jlaskey@3 500 }
jlaskey@3 501
jlaskey@3 502 scanner.javaPattern = p;
jlaskey@3 503 return scanner;
jlaskey@3 504 }
jlaskey@3 505
jlaskey@3 506 /**
jlaskey@3 507 * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript,
jlaskey@3 508 * is an example where we never match
jlaskey@3 509 *
jlaskey@3 510 * @return boolean
jlaskey@3 511 */
jlaskey@3 512 private boolean neverMatches() {
jlaskey@3 513 return neverMatches;
jlaskey@3 514 }
jlaskey@3 515
jlaskey@3 516 /**
jlaskey@3 517 * This is used to set better error messages that can be reused
jlaskey@3 518 * in NativeRegExp for augmenting e.g. SyntaxErrors.
jlaskey@3 519 *
jlaskey@3 520 * @return an error message or null if no extra info
jlaskey@3 521 */
jlaskey@3 522 public String getErrorMessage() {
jlaskey@3 523 return errorMessage;
jlaskey@3 524 }
jlaskey@3 525
jlaskey@3 526 final StringBuilder getStringBuilder() {
jlaskey@3 527 return sb;
jlaskey@3 528 }
jlaskey@3 529
jlaskey@3 530 String getJavaPattern() {
jlaskey@3 531 return javaPattern;
jlaskey@3 532 }
jlaskey@3 533
jlaskey@3 534 BitVector getGroupsInNegativeLookahead() {
jlaskey@3 535 BitVector vec = null;
jlaskey@3 536 for (int i = 0; i < caps.size(); i++) {
jlaskey@3 537 final Capture cap = caps.get(i);
jlaskey@3 538 if (cap.getNegativeLookaheadLevel() > 0) {
jlaskey@3 539 if (vec == null) {
jlaskey@3 540 vec = new BitVector(caps.size() + 1);
jlaskey@3 541 }
jlaskey@3 542 vec.set(i + 1);
jlaskey@3 543 }
jlaskey@3 544 }
jlaskey@3 545 return vec;
jlaskey@3 546 }
jlaskey@3 547
jlaskey@3 548 /**
jlaskey@3 549 * Commit n characters to the builder and to a given token
jlaskey@3 550 * @param token Uncommitted token.
jlaskey@3 551 * @param n Number of characters.
jlaskey@3 552 * @return Committed token
jlaskey@3 553 */
jlaskey@3 554 private Token commit(final Token token, final int n) {
jlaskey@3 555 final int startIn = position;
jlaskey@3 556
jlaskey@3 557 switch (n) {
jlaskey@3 558 case 1:
jlaskey@3 559 sb.append(ch0);
jlaskey@3 560 skip(1);
jlaskey@3 561 break;
jlaskey@3 562 case 2:
jlaskey@3 563 sb.append(ch0);
jlaskey@3 564 sb.append(ch1);
jlaskey@3 565 skip(2);
jlaskey@3 566 break;
jlaskey@3 567 case 3:
jlaskey@3 568 sb.append(ch0);
jlaskey@3 569 sb.append(ch1);
jlaskey@3 570 sb.append(ch2);
jlaskey@3 571 skip(3);
jlaskey@3 572 break;
jlaskey@3 573 default:
jlaskey@3 574 assert false : "Should not reach here";
jlaskey@3 575 }
jlaskey@3 576
jlaskey@3 577 if (token == null) {
jlaskey@3 578 return null;
jlaskey@3 579 }
jlaskey@3 580
jlaskey@3 581 return token.add(sb.substring(startIn, sb.length()));
jlaskey@3 582 }
jlaskey@3 583
jlaskey@3 584 /**
jlaskey@3 585 * Restart the buffers back at an earlier position.
jlaskey@3 586 *
jlaskey@3 587 * @param startIn
jlaskey@3 588 * Position in the input stream.
jlaskey@3 589 * @param startOut
jlaskey@3 590 * Position in the output stream.
jlaskey@3 591 */
jlaskey@3 592 private void restart(final int startIn, final int startOut) {
jlaskey@3 593 reset(startIn);
jlaskey@3 594 sb.setLength(startOut);
jlaskey@3 595 }
jlaskey@3 596
jlaskey@3 597 private void push(final char ch) {
jlaskey@3 598 expected.put(ch, expected.get(ch) + 1);
jlaskey@3 599 }
jlaskey@3 600
jlaskey@3 601 private void pop(final char ch) {
jlaskey@3 602 expected.put(ch, Math.min(0, expected.get(ch) - 1));
jlaskey@3 603 }
jlaskey@3 604
jlaskey@3 605 /*
jlaskey@3 606 * Recursive descent tokenizer starts below.
jlaskey@3 607 */
jlaskey@3 608
jlaskey@3 609 /*
jlaskey@3 610 * Pattern ::
jlaskey@3 611 * Disjunction
jlaskey@3 612 */
jlaskey@3 613 private Token pattern() {
jlaskey@3 614 final Token token = new Token(Token.Type.PATTERN);
jlaskey@3 615
jlaskey@3 616 final Token child = disjunction();
jlaskey@3 617 if (child != null) {
jlaskey@3 618 return token.add(child);
jlaskey@3 619 }
jlaskey@3 620
jlaskey@3 621 return null;
jlaskey@3 622 }
jlaskey@3 623
jlaskey@3 624 /*
jlaskey@3 625 * Disjunction ::
jlaskey@3 626 * Alternative
jlaskey@3 627 * Alternative | Disjunction
jlaskey@3 628 */
jlaskey@3 629 private Token disjunction() {
jlaskey@3 630 final Token token = new Token(Token.Type.DISJUNCTION);
jlaskey@3 631
jlaskey@3 632 while (true) {
jlaskey@3 633 token.add(alternative());
jlaskey@3 634
jlaskey@3 635 if (ch0 == '|') {
jlaskey@3 636 commit(token, 1);
jlaskey@3 637 } else {
jlaskey@3 638 break;
jlaskey@3 639 }
jlaskey@3 640 }
jlaskey@3 641
jlaskey@3 642 return token;
jlaskey@3 643 }
jlaskey@3 644
jlaskey@3 645 /*
jlaskey@3 646 * Alternative ::
jlaskey@3 647 * [empty]
jlaskey@3 648 * Alternative Term
jlaskey@3 649 */
jlaskey@3 650 private Token alternative() {
jlaskey@3 651 final Token token = new Token(Token.Type.ALTERNATIVE);
jlaskey@3 652
jlaskey@3 653 Token child;
jlaskey@3 654 while ((child = term()) != null) {
jlaskey@3 655 token.add(child);
jlaskey@3 656 }
jlaskey@3 657
jlaskey@3 658 return token;
jlaskey@3 659 }
jlaskey@3 660
jlaskey@3 661 /*
jlaskey@3 662 * Term ::
jlaskey@3 663 * Assertion
jlaskey@3 664 * Atom
jlaskey@3 665 * Atom Quantifier
jlaskey@3 666 */
jlaskey@3 667 private Token term() {
jlaskey@3 668 final int startIn = position;
jlaskey@3 669 final int startOut = sb.length();
jlaskey@3 670 final Token token = new Token(Token.Type.TERM);
jlaskey@3 671 Token child;
jlaskey@3 672
jlaskey@3 673 child = assertion();
jlaskey@3 674 if (child != null) {
jlaskey@3 675 return token.add(child);
jlaskey@3 676 }
jlaskey@3 677
jlaskey@3 678 child = atom();
jlaskey@3 679 if (child != null) {
jlaskey@3 680 boolean emptyCharacterClass = false;
jlaskey@3 681 if ("[]".equals(child.toString())) {
jlaskey@3 682 emptyCharacterClass = true;
jlaskey@3 683 }
jlaskey@3 684
jlaskey@3 685 token.add(child);
jlaskey@3 686
jlaskey@3 687 final Token quantifier = quantifier();
jlaskey@3 688 if (quantifier != null) {
jlaskey@3 689 token.add(quantifier);
jlaskey@3 690 }
jlaskey@3 691
jlaskey@3 692 if (emptyCharacterClass) {
jlaskey@3 693 if (quantifier == null) {
jlaskey@3 694 neverMatches = true; //never matches ever.
jlaskey@3 695 } else {
jlaskey@3 696 //if we can get away with max zero, remove this entire token
jlaskey@3 697 final String qs = quantifier.toString();
jlaskey@3 698 if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) {
jlaskey@3 699 token.setIsDead(true);
jlaskey@3 700 }
jlaskey@3 701 }
jlaskey@3 702 }
jlaskey@3 703
jlaskey@3 704 return token;
jlaskey@3 705 }
jlaskey@3 706
jlaskey@3 707 restart(startIn, startOut);
jlaskey@3 708 return null;
jlaskey@3 709 }
jlaskey@3 710
jlaskey@3 711 /*
jlaskey@3 712 * Assertion ::
jlaskey@3 713 * ^
jlaskey@3 714 * $
jlaskey@3 715 * \b
jlaskey@3 716 * \B
jlaskey@3 717 * ( ? = Disjunction )
jlaskey@3 718 * ( ? ! Disjunction )
jlaskey@3 719 */
jlaskey@3 720 private Token assertion() {
jlaskey@3 721 final int startIn = position;
jlaskey@3 722 final int startOut = sb.length();
jlaskey@3 723 final Token token = new Token(Token.Type.ASSERTION);
jlaskey@3 724
jlaskey@3 725 switch (ch0) {
jlaskey@3 726 case '^':
jlaskey@3 727 case '$':
jlaskey@3 728 return commit(token, 1);
jlaskey@3 729
jlaskey@3 730 case '\\':
jlaskey@3 731 if (ch1 == 'b' || ch1 == 'B') {
jlaskey@3 732 return commit(token, 2);
jlaskey@3 733 }
jlaskey@3 734 break;
jlaskey@3 735
jlaskey@3 736 case '(':
jlaskey@3 737 if (ch1 != '?') {
jlaskey@3 738 break;
jlaskey@3 739 }
jlaskey@3 740 if (ch2 != '=' && ch2 != '!') {
jlaskey@3 741 break;
jlaskey@3 742 }
jlaskey@3 743 final boolean isNegativeLookahead = (ch2 == '!');
jlaskey@3 744 commit(token, 3);
jlaskey@3 745
jlaskey@3 746 if (isNegativeLookahead) {
jlaskey@3 747 negativeLookaheadLevel++;
jlaskey@3 748 }
jlaskey@3 749 final Token disjunction = disjunction();
jlaskey@3 750 if (isNegativeLookahead) {
jlaskey@3 751 for (final Capture cap : caps) {
jlaskey@3 752 if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
jlaskey@3 753 cap.setDead();
jlaskey@3 754 }
jlaskey@3 755 }
jlaskey@3 756 negativeLookaheadLevel--;
jlaskey@3 757 }
jlaskey@3 758
jlaskey@3 759 if (disjunction != null && ch0 == ')') {
jlaskey@3 760 token.add(disjunction);
jlaskey@3 761 return commit(token, 1);
jlaskey@3 762 }
jlaskey@3 763 break;
jlaskey@3 764
jlaskey@3 765 default:
jlaskey@3 766 break;
jlaskey@3 767 }
jlaskey@3 768
jlaskey@3 769 restart(startIn, startOut);
jlaskey@3 770
jlaskey@3 771 return null;
jlaskey@3 772 }
jlaskey@3 773
jlaskey@3 774 /*
jlaskey@3 775 * Quantifier ::
jlaskey@3 776 * QuantifierPrefix
jlaskey@3 777 * QuantifierPrefix ?
jlaskey@3 778 */
jlaskey@3 779 private Token quantifier() {
jlaskey@3 780 final Token token = new Token(Token.Type.QUANTIFIER);
jlaskey@3 781 final Token child = quantifierPrefix();
jlaskey@3 782 if (child != null) {
jlaskey@3 783 token.add(child);
jlaskey@3 784 if (ch0 == '?') {
jlaskey@3 785 commit(token, 1);
jlaskey@3 786 }
jlaskey@3 787 return token;
jlaskey@3 788 }
jlaskey@3 789 return null;
jlaskey@3 790 }
jlaskey@3 791
jlaskey@3 792 /*
jlaskey@3 793 * QuantifierPrefix ::
jlaskey@3 794 * *
jlaskey@3 795 * +
jlaskey@3 796 * ?
jlaskey@3 797 * { DecimalDigits }
jlaskey@3 798 * { DecimalDigits , }
jlaskey@3 799 * { DecimalDigits , DecimalDigits }
jlaskey@3 800 */
jlaskey@3 801 private Token quantifierPrefix() {
jlaskey@3 802 final int startIn = position;
jlaskey@3 803 final int startOut = sb.length();
jlaskey@3 804 final Token token = new Token(Token.Type.QUANTIFIER_PREFIX);
jlaskey@3 805
jlaskey@3 806 switch (ch0) {
jlaskey@3 807 case '*':
jlaskey@3 808 case '+':
jlaskey@3 809 case '?':
jlaskey@3 810 return commit(token, 1);
jlaskey@3 811
jlaskey@3 812 case '{':
jlaskey@3 813 commit(token, 1);
jlaskey@3 814
jlaskey@3 815 final Token child = decimalDigits();
jlaskey@3 816 if (child == null) {
jlaskey@3 817 break; // not a quantifier - back out
jlaskey@3 818 }
jlaskey@3 819 push('}');
jlaskey@3 820 token.add(child);
jlaskey@3 821
jlaskey@3 822 if (ch0 == ',') {
jlaskey@3 823 commit(token, 1);
jlaskey@3 824 token.add(decimalDigits());
jlaskey@3 825 }
jlaskey@3 826
jlaskey@3 827 if (ch0 == '}') {
jlaskey@3 828 pop('}');
jlaskey@3 829 commit(token, 1);
jlaskey@3 830 }
jlaskey@3 831
jlaskey@3 832 return token;
jlaskey@3 833
jlaskey@3 834 default:
jlaskey@3 835 break;
jlaskey@3 836 }
jlaskey@3 837
jlaskey@3 838 restart(startIn, startOut);
jlaskey@3 839 return null;
jlaskey@3 840 }
jlaskey@3 841
jlaskey@3 842 /*
jlaskey@3 843 * Atom ::
jlaskey@3 844 * PatternCharacter
jlaskey@3 845 * .
jlaskey@3 846 * \ AtomEscape
jlaskey@3 847 * CharacterClass
jlaskey@3 848 * ( Disjunction )
jlaskey@3 849 * ( ? : Disjunction )
jlaskey@3 850 *
jlaskey@3 851 */
jlaskey@3 852 private Token atom() {
jlaskey@3 853 final int startIn = position;
jlaskey@3 854 final int startOut = sb.length();
jlaskey@3 855 final Token token = new Token(Token.Type.ATOM);
jlaskey@3 856 Token child;
jlaskey@3 857
jlaskey@3 858 child = patternCharacter();
jlaskey@3 859 if (child != null) {
jlaskey@3 860 return token.add(child);
jlaskey@3 861 }
jlaskey@3 862
jlaskey@3 863 if (ch0 == '.') {
jlaskey@3 864 return commit(token, 1);
jlaskey@3 865 }
jlaskey@3 866
jlaskey@3 867 if (ch0 == '\\') {
jlaskey@3 868 commit(token, 1);
jlaskey@3 869 child = atomEscape();
jlaskey@3 870
jlaskey@3 871 if (child != null) {
jlaskey@3 872 if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
jlaskey@3 873 final char idEscape = child.toString().charAt(0);
jlaskey@3 874 if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
jlaskey@3 875 token.reset();
jlaskey@3 876 }
jlaskey@3 877 }
jlaskey@3 878
jlaskey@3 879 token.add(child);
jlaskey@3 880
jlaskey@3 881 // forward backreferences always match empty. JavaScript != Java
jlaskey@3 882 if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
jlaskey@3 883 final int refNum = Integer.parseInt(child.toString());
jlaskey@3 884
jlaskey@3 885 if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
jlaskey@3 886 // reference to dead in-negative-lookahead capture
jlaskey@3 887 token.setIsDead(true);
jlaskey@3 888 } else if (caps.size() < refNum) {
jlaskey@3 889 // forward reference: always matches against empty string (dead token).
jlaskey@3 890 // invalid reference (non-existant capture): pattern never matches.
jlaskey@3 891 forwardReferences.put(refNum, token);
jlaskey@3 892 }
jlaskey@3 893 }
jlaskey@3 894
jlaskey@3 895 return token;
jlaskey@3 896 }
jlaskey@3 897 }
jlaskey@3 898
jlaskey@3 899 child = characterClass();
jlaskey@3 900 if (child != null) {
jlaskey@3 901 return token.add(child);
jlaskey@3 902 }
jlaskey@3 903
jlaskey@3 904 if (ch0 == '(') {
jlaskey@3 905 boolean capturingParens = true;
jlaskey@3 906 commit(token, 1);
jlaskey@3 907 if (ch0 == '?' && ch1 == ':') {
jlaskey@3 908 capturingParens = false;
jlaskey@3 909 commit(token, 2);
jlaskey@3 910 }
jlaskey@3 911
jlaskey@3 912 child = disjunction();
jlaskey@3 913 if (child != null) {
jlaskey@3 914 token.add(child);
jlaskey@3 915 if (ch0 == ')') {
jlaskey@3 916 final Token atom = commit(token, 1);
jlaskey@3 917 if (capturingParens) {
jlaskey@3 918 caps.add(new Capture(negativeLookaheadLevel));
jlaskey@3 919 }
jlaskey@3 920 return atom;
jlaskey@3 921 }
jlaskey@3 922 }
jlaskey@3 923 }
jlaskey@3 924
jlaskey@3 925 restart(startIn, startOut);
jlaskey@3 926 return null;
jlaskey@3 927 }
jlaskey@3 928
jlaskey@3 929 /*
jlaskey@3 930 * PatternCharacter ::
jlaskey@3 931 * SourceCharacter but not any of: ^$\.*+?()[]{}|
jlaskey@3 932 */
jlaskey@3 933 @SuppressWarnings("fallthrough")
jlaskey@3 934 private Token patternCharacter() {
jlaskey@3 935 if (atEOF()) {
jlaskey@3 936 return null;
jlaskey@3 937 }
jlaskey@3 938
jlaskey@3 939 switch (ch0) {
jlaskey@3 940 case '^':
jlaskey@3 941 case '$':
jlaskey@3 942 case '\\':
jlaskey@3 943 case '.':
jlaskey@3 944 case '*':
jlaskey@3 945 case '+':
jlaskey@3 946 case '?':
jlaskey@3 947 case '(':
jlaskey@3 948 case ')':
jlaskey@3 949 case '[':
jlaskey@3 950 case '|':
jlaskey@3 951 return null;
jlaskey@3 952
jlaskey@3 953 case '}':
jlaskey@3 954 case ']':
jlaskey@3 955 final int n = expected.get(ch0);
jlaskey@3 956 if (n != 0) {
jlaskey@3 957 return null;
jlaskey@3 958 }
jlaskey@3 959
jlaskey@3 960 case '{':
jlaskey@3 961 // if not a valid quantifier escape curly brace to match itself
jlaskey@3 962 // this ensures compatibility with other JS implementations
jlaskey@3 963 final Token quant = quantifierPrefix();
jlaskey@3 964 return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null;
jlaskey@3 965
jlaskey@3 966 default:
jlaskey@3 967 return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER
jlaskey@3 968 }
jlaskey@3 969 }
jlaskey@3 970
jlaskey@3 971 /*
jlaskey@3 972 * AtomEscape ::
jlaskey@3 973 * DecimalEscape
jlaskey@3 974 * CharacterEscape
jlaskey@3 975 * CharacterClassEscape
jlaskey@3 976 */
jlaskey@3 977 private Token atomEscape() {
jlaskey@3 978 final Token token = new Token(Token.Type.ATOM_ESCAPE);
jlaskey@3 979 Token child;
jlaskey@3 980
jlaskey@3 981 child = decimalEscape();
jlaskey@3 982 if (child != null) {
jlaskey@3 983 return token.add(child);
jlaskey@3 984 }
jlaskey@3 985
jlaskey@3 986 child = characterClassEscape();
jlaskey@3 987 if (child != null) {
jlaskey@3 988 return token.add(child);
jlaskey@3 989 }
jlaskey@3 990
jlaskey@3 991 child = characterEscape();
jlaskey@3 992 if (child != null) {
jlaskey@3 993 return token.add(child);
jlaskey@3 994 }
jlaskey@3 995
jlaskey@3 996
jlaskey@3 997 return null;
jlaskey@3 998 }
jlaskey@3 999
jlaskey@3 1000 /*
jlaskey@3 1001 * CharacterEscape ::
jlaskey@3 1002 * ControlEscape
jlaskey@3 1003 * c ControlLetter
jlaskey@3 1004 * HexEscapeSequence
jlaskey@3 1005 * UnicodeEscapeSequence
jlaskey@3 1006 * IdentityEscape
jlaskey@3 1007 */
jlaskey@3 1008 private Token characterEscape() {
jlaskey@3 1009 final int startIn = position;
jlaskey@3 1010 final int startOut = sb.length();
jlaskey@3 1011
jlaskey@3 1012 final Token token = new Token(Token.Type.CHARACTER_ESCAPE);
jlaskey@3 1013 Token child;
jlaskey@3 1014
jlaskey@3 1015 child = controlEscape();
jlaskey@3 1016 if (child != null) {
jlaskey@3 1017 return token.add(child);
jlaskey@3 1018 }
jlaskey@3 1019
jlaskey@3 1020 if (ch0 == 'c') {
jlaskey@3 1021 commit(token, 1);
jlaskey@3 1022 child = controlLetter();
jlaskey@3 1023 if (child != null) {
jlaskey@3 1024 return token.add(child);
jlaskey@3 1025 }
jlaskey@3 1026 restart(startIn, startOut);
jlaskey@3 1027 }
jlaskey@3 1028
jlaskey@3 1029 child = hexEscapeSequence();
jlaskey@3 1030 if (child != null) {
jlaskey@3 1031 return token.add(child);
jlaskey@3 1032 }
jlaskey@3 1033
jlaskey@3 1034 child = unicodeEscapeSequence();
jlaskey@3 1035 if (child != null) {
jlaskey@3 1036 return token.add(child);
jlaskey@3 1037 }
jlaskey@3 1038
jlaskey@3 1039 child = identityEscape();
jlaskey@3 1040 if (child != null) {
jlaskey@3 1041 return token.add(child);
jlaskey@3 1042 }
jlaskey@3 1043
jlaskey@3 1044 restart(startIn, startOut);
jlaskey@3 1045
jlaskey@3 1046 return null;
jlaskey@3 1047 }
jlaskey@3 1048
jlaskey@3 1049 private boolean scanEscapeSequence(final char leader, final int length, final Token token) {
jlaskey@3 1050 final int startIn = position;
jlaskey@3 1051 final int startOut = sb.length();
jlaskey@3 1052
jlaskey@3 1053 if (ch0 != leader) {
jlaskey@3 1054 return false;
jlaskey@3 1055 }
jlaskey@3 1056
jlaskey@3 1057 commit(token, 1);
jlaskey@3 1058 for (int i = 0; i < length; i++) {
jlaskey@3 1059 final char ch0l = Character.toLowerCase(ch0);
jlaskey@3 1060 if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
jlaskey@3 1061 commit(token, 1);
jlaskey@3 1062 } else {
jlaskey@3 1063 restart(startIn, startOut);
jlaskey@3 1064 return false;
jlaskey@3 1065 }
jlaskey@3 1066 }
jlaskey@3 1067
jlaskey@3 1068 return true;
jlaskey@3 1069 }
jlaskey@3 1070
jlaskey@3 1071 private Token hexEscapeSequence() {
jlaskey@3 1072 final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE);
jlaskey@3 1073 if (scanEscapeSequence('x', 2, token)) {
jlaskey@3 1074 return token;
jlaskey@3 1075 }
jlaskey@3 1076 return null;
jlaskey@3 1077 }
jlaskey@3 1078
jlaskey@3 1079 private Token unicodeEscapeSequence() {
jlaskey@3 1080 final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE);
jlaskey@3 1081 if (scanEscapeSequence('u', 4, token)) {
jlaskey@3 1082 return token;
jlaskey@3 1083 }
jlaskey@3 1084 return null;
jlaskey@3 1085 }
jlaskey@3 1086
jlaskey@3 1087 /*
jlaskey@3 1088 * ControlEscape ::
jlaskey@3 1089 * one of fnrtv
jlaskey@3 1090 */
jlaskey@3 1091 private Token controlEscape() {
jlaskey@3 1092 switch (ch0) {
jlaskey@3 1093 case 'f':
jlaskey@3 1094 case 'n':
jlaskey@3 1095 case 'r':
jlaskey@3 1096 case 't':
jlaskey@3 1097 case 'v':
jlaskey@3 1098 return commit(new Token(Token.Type.CONTROL_ESCAPE), 1);
jlaskey@3 1099
jlaskey@3 1100 default:
jlaskey@3 1101 return null;
jlaskey@3 1102 }
jlaskey@3 1103 }
jlaskey@3 1104
jlaskey@3 1105 /*
jlaskey@3 1106 * ControlLetter ::
jlaskey@3 1107 * one of abcdefghijklmnopqrstuvwxyz
jlaskey@3 1108 * ABCDEFGHIJKLMNOPQRSTUVWXYZ
jlaskey@3 1109 */
jlaskey@3 1110 private Token controlLetter() {
jlaskey@3 1111 final char c = Character.toUpperCase(ch0);
jlaskey@3 1112 if (c >= 'A' && c <= 'Z') {
jlaskey@3 1113 final Token token = new Token(Token.Type.CONTROL_LETTER);
jlaskey@3 1114 commit(token, 1);
jlaskey@3 1115 return token;
jlaskey@3 1116 }
jlaskey@3 1117 return null;
jlaskey@3 1118 /*
jlaskey@3 1119 Token token = new Token(Token.Type.CONTROL_LETTER);
jlaskey@3 1120 commit(null, 1);//add original char to builder not to token
jlaskey@3 1121 this.neverMatches = c < 'A' || c > 'Z';
jlaskey@3 1122 return token.add(""+c);*/
jlaskey@3 1123 }
jlaskey@3 1124
jlaskey@3 1125 /*
jlaskey@3 1126 * IdentityEscape ::
jlaskey@3 1127 * SourceCharacter but not IdentifierPart
jlaskey@3 1128 * <ZWJ> (200c)
jlaskey@3 1129 * <ZWNJ> (200d)
jlaskey@3 1130 */
jlaskey@3 1131 private Token identityEscape() {
jlaskey@3 1132 final Token token = new Token(Token.Type.IDENTITY_ESCAPE);
jlaskey@3 1133 commit(token, 1);
jlaskey@3 1134 return token;
jlaskey@3 1135 }
jlaskey@3 1136
jlaskey@3 1137 /*
jlaskey@3 1138 * DecimalEscape ::
jlaskey@3 1139 * DecimalIntegerLiteral [lookahead DecimalDigit]
jlaskey@3 1140 */
jlaskey@3 1141 private Token decimalEscape() {
jlaskey@3 1142 final Token token = new Token(Token.Type.DECIMAL_ESCAPE);
jlaskey@3 1143 final int startIn = position;
jlaskey@3 1144 final int startOut = sb.length();
jlaskey@3 1145
jlaskey@3 1146 if (ch0 == '0' && !isDecimalDigit(ch1)) {
jlaskey@3 1147 commit(token, 1);
jlaskey@3 1148 token.removeLast();
jlaskey@3 1149 // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
jlaskey@3 1150 return token.add("\u0000");
jlaskey@3 1151 }
jlaskey@3 1152
jlaskey@3 1153 if (isDecimalDigit(ch0)) {
jlaskey@3 1154 while (isDecimalDigit(ch0)) {
jlaskey@3 1155 commit(token, 1);
jlaskey@3 1156 }
jlaskey@3 1157 return token;
jlaskey@3 1158 }
jlaskey@3 1159
jlaskey@3 1160 restart(startIn, startOut);
jlaskey@3 1161
jlaskey@3 1162 return null;
jlaskey@3 1163 }
jlaskey@3 1164
jlaskey@3 1165 /*
jlaskey@3 1166 * CharacterClassEscape ::
jlaskey@3 1167 * one of dDsSwW
jlaskey@3 1168 */
jlaskey@3 1169 private Token characterClassEscape() {
jlaskey@3 1170 switch (ch0) {
jlaskey@3 1171 case 's':
jlaskey@3 1172 case 'S':
jlaskey@3 1173 case 'd':
jlaskey@3 1174 case 'D':
jlaskey@3 1175 case 'w':
jlaskey@3 1176 case 'W':
jlaskey@3 1177 return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1);
jlaskey@3 1178
jlaskey@3 1179 default:
jlaskey@3 1180 return null;
jlaskey@3 1181 }
jlaskey@3 1182 }
jlaskey@3 1183
jlaskey@3 1184 /*
jlaskey@3 1185 * CharacterClass ::
jlaskey@3 1186 * [ [lookahead {^}] ClassRanges ]
jlaskey@3 1187 * [ ^ ClassRanges ]
jlaskey@3 1188 */
jlaskey@3 1189 private Token characterClass() {
jlaskey@3 1190 final int startIn = position;
jlaskey@3 1191 final int startOut = sb.length();
jlaskey@3 1192 final Token token = new Token(Token.Type.CHARACTERCLASS);
jlaskey@3 1193
jlaskey@3 1194 if (ch0 == '[') {
jlaskey@3 1195 push(']');
jlaskey@3 1196 commit(token, 1);
jlaskey@3 1197
jlaskey@3 1198 if (ch0 == '^') {
jlaskey@3 1199 commit(token, 1);
jlaskey@3 1200 }
jlaskey@3 1201
jlaskey@3 1202 final Token child = classRanges();
jlaskey@3 1203 if (child != null && ch0 == ']') {
jlaskey@3 1204 pop(']');
jlaskey@3 1205 token.add(child);
jlaskey@3 1206 return commit(token, 1);
jlaskey@3 1207 }
jlaskey@3 1208 }
jlaskey@3 1209
jlaskey@3 1210 restart(startIn, startOut);
jlaskey@3 1211 return null;
jlaskey@3 1212 }
jlaskey@3 1213
jlaskey@3 1214 /*
jlaskey@3 1215 * ClassRanges ::
jlaskey@3 1216 * [empty]
jlaskey@3 1217 * NonemptyClassRanges
jlaskey@3 1218 */
jlaskey@3 1219 private Token classRanges() {
jlaskey@3 1220 return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges());
jlaskey@3 1221 }
jlaskey@3 1222
jlaskey@3 1223 /*
jlaskey@3 1224 * NonemptyClassRanges ::
jlaskey@3 1225 * ClassAtom
jlaskey@3 1226 * ClassAtom NonemptyClassRangesNoDash
jlaskey@3 1227 * ClassAtom - ClassAtom ClassRanges
jlaskey@3 1228 */
jlaskey@3 1229 private Token nonemptyClassRanges() {
jlaskey@3 1230 final int startIn = position;
jlaskey@3 1231 final int startOut = sb.length();
jlaskey@3 1232 final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
jlaskey@3 1233 Token child;
jlaskey@3 1234
jlaskey@3 1235 child = classAtom();
jlaskey@3 1236 if (child != null) {
jlaskey@3 1237 token.add(child);
jlaskey@3 1238
jlaskey@3 1239 if (ch0 == '-') {
jlaskey@3 1240 commit(token, 1);
jlaskey@3 1241
jlaskey@3 1242 final Token child1 = classAtom();
jlaskey@3 1243 final Token child2 = classRanges();
jlaskey@3 1244 if (child1 != null && child2 != null) {
jlaskey@3 1245 token.add(child1);
jlaskey@3 1246 token.add(child2);
jlaskey@3 1247
jlaskey@3 1248 return token;
jlaskey@3 1249 }
jlaskey@3 1250 }
jlaskey@3 1251
jlaskey@3 1252 child = nonemptyClassRangesNoDash();
jlaskey@3 1253 if (child != null) {
jlaskey@3 1254 token.add(child);
jlaskey@3 1255 return token;
jlaskey@3 1256 }
jlaskey@3 1257
jlaskey@3 1258 return token;
jlaskey@3 1259 }
jlaskey@3 1260
jlaskey@3 1261 restart(startIn, startOut);
jlaskey@3 1262 return null;
jlaskey@3 1263 }
jlaskey@3 1264
jlaskey@3 1265 /*
jlaskey@3 1266 * NonemptyClassRangesNoDash ::
jlaskey@3 1267 * ClassAtom
jlaskey@3 1268 * ClassAtomNoDash NonemptyClassRangesNoDash
jlaskey@3 1269 * ClassAtomNoDash - ClassAtom ClassRanges
jlaskey@3 1270 */
jlaskey@3 1271 private Token nonemptyClassRangesNoDash() {
jlaskey@3 1272 final int startIn = position;
jlaskey@3 1273 final int startOut = sb.length();
jlaskey@3 1274 final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH);
jlaskey@3 1275 Token child;
jlaskey@3 1276
jlaskey@3 1277 child = classAtomNoDash();
jlaskey@3 1278 if (child != null) {
jlaskey@3 1279 token.add(child);
jlaskey@3 1280
jlaskey@3 1281 // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
jlaskey@3 1282 if (ch0 == '-') {
jlaskey@3 1283 commit(token, 1);
jlaskey@3 1284
jlaskey@3 1285 final Token child1 = classAtom();
jlaskey@3 1286 final Token child2 = classRanges();
jlaskey@3 1287 if (child1 != null && child2 != null) {
jlaskey@3 1288 token.add(child1);
jlaskey@3 1289 return token.add(child2);
jlaskey@3 1290 }
jlaskey@3 1291 //fallthru
jlaskey@3 1292 }
jlaskey@3 1293
jlaskey@3 1294 child = nonemptyClassRangesNoDash();
jlaskey@3 1295 if (child != null) {
jlaskey@3 1296 token.add(child);
jlaskey@3 1297 }
jlaskey@3 1298 return token; // still a class atom
jlaskey@3 1299 }
jlaskey@3 1300
jlaskey@3 1301 child = classAtom();
jlaskey@3 1302 if (child != null) {
jlaskey@3 1303 return token.add(child);
jlaskey@3 1304 }
jlaskey@3 1305
jlaskey@3 1306 restart(startIn, startOut);
jlaskey@3 1307 return null;
jlaskey@3 1308 }
jlaskey@3 1309
jlaskey@3 1310 /*
jlaskey@3 1311 * ClassAtom : - ClassAtomNoDash
jlaskey@3 1312 */
jlaskey@3 1313 private Token classAtom() {
jlaskey@3 1314 final Token token = new Token(Token.Type.CLASSATOM);
jlaskey@3 1315
jlaskey@3 1316 if (ch0 == '-') {
jlaskey@3 1317 return commit(token, 1);
jlaskey@3 1318 }
jlaskey@3 1319
jlaskey@3 1320 final Token child = classAtomNoDash();
jlaskey@3 1321 if (child != null) {
jlaskey@3 1322 return token.add(child);
jlaskey@3 1323 }
jlaskey@3 1324
jlaskey@3 1325 return null;
jlaskey@3 1326 }
jlaskey@3 1327
jlaskey@3 1328 /*
jlaskey@3 1329 * ClassAtomNoDash ::
jlaskey@3 1330 * SourceCharacter but not one of \ or ] or -
jlaskey@3 1331 * \ ClassEscape
jlaskey@3 1332 */
jlaskey@3 1333 private Token classAtomNoDash() {
jlaskey@3 1334 final int startIn = position;
jlaskey@3 1335 final int startOut = sb.length();
jlaskey@3 1336 final Token token = new Token(Token.Type.CLASSATOM_NODASH);
jlaskey@3 1337
jlaskey@3 1338 switch (ch0) {
jlaskey@3 1339 case ']':
jlaskey@3 1340 case '-':
jlaskey@3 1341 case '\0':
jlaskey@3 1342 return null;
jlaskey@3 1343
jlaskey@3 1344 case '[':
jlaskey@3 1345 // unescaped left square bracket - add escape
jlaskey@3 1346 return commit(token.add("\\"), 1);
jlaskey@3 1347
jlaskey@3 1348 case '\\':
jlaskey@3 1349 commit(token, 1);
jlaskey@3 1350 final Token child = classEscape();
jlaskey@3 1351 if (child != null) {
jlaskey@3 1352 return token.add(child);
jlaskey@3 1353 }
jlaskey@3 1354
jlaskey@3 1355 restart(startIn, startOut);
jlaskey@3 1356 return null;
jlaskey@3 1357
jlaskey@3 1358 default:
jlaskey@3 1359 return commit(token, 1);
jlaskey@3 1360 }
jlaskey@3 1361 }
jlaskey@3 1362
jlaskey@3 1363 /*
jlaskey@3 1364 * ClassEscape ::
jlaskey@3 1365 * DecimalEscape
jlaskey@3 1366 * b
jlaskey@3 1367 * CharacterEscape
jlaskey@3 1368 * CharacterClassEscape
jlaskey@3 1369 */
jlaskey@3 1370 private Token classEscape() {
jlaskey@3 1371 final Token token = new Token(Token.Type.CLASS_ESCAPE);
jlaskey@3 1372 Token child;
jlaskey@3 1373
jlaskey@3 1374 child = decimalEscape();
jlaskey@3 1375 if (child != null) {
jlaskey@3 1376 return token.add(child);
jlaskey@3 1377 }
jlaskey@3 1378
jlaskey@3 1379 if (ch0 == 'b') {
jlaskey@3 1380 return commit(token, 1);
jlaskey@3 1381 }
jlaskey@3 1382
jlaskey@3 1383 child = characterEscape();
jlaskey@3 1384 if (child != null) {
jlaskey@3 1385 return token.add(child);
jlaskey@3 1386 }
jlaskey@3 1387
jlaskey@3 1388 child = characterClassEscape();
jlaskey@3 1389 if (child != null) {
jlaskey@3 1390 return token.add(child);
jlaskey@3 1391 }
jlaskey@3 1392
jlaskey@3 1393 return null;
jlaskey@3 1394 }
jlaskey@3 1395
jlaskey@3 1396 /*
jlaskey@3 1397 * DecimalDigits
jlaskey@3 1398 */
jlaskey@3 1399 private Token decimalDigits() {
jlaskey@3 1400 if (!isDecimalDigit(ch0)) {
jlaskey@3 1401 return null;
jlaskey@3 1402 }
jlaskey@3 1403
jlaskey@3 1404 final Token token = new Token(Token.Type.DECIMALDIGITS);
jlaskey@3 1405 while (isDecimalDigit(ch0)) {
jlaskey@3 1406 commit(token, 1);
jlaskey@3 1407 }
jlaskey@3 1408
jlaskey@3 1409 return token;
jlaskey@3 1410 }
jlaskey@3 1411
jlaskey@3 1412 private static boolean isDecimalDigit(final char ch) {
jlaskey@3 1413 return ch >= '0' && ch <= '9';
jlaskey@3 1414 }
jlaskey@3 1415 }

mercurial