Fri, 22 Feb 2013 16:31:10 +0100
8008093: Make RegExp engine pluggable
Reviewed-by: lagergren, attila
1.1 --- a/src/jdk/nashorn/internal/objects/Global.java Fri Feb 22 10:39:00 2013 -0400 1.2 +++ b/src/jdk/nashorn/internal/objects/Global.java Fri Feb 22 16:31:10 2013 +0100 1.3 @@ -50,7 +50,7 @@ 1.4 import jdk.nashorn.internal.runtime.NativeJavaPackage; 1.5 import jdk.nashorn.internal.runtime.OptionsObject; 1.6 import jdk.nashorn.internal.runtime.PropertyDescriptor; 1.7 -import jdk.nashorn.internal.runtime.RegExpMatch; 1.8 +import jdk.nashorn.internal.runtime.regexp.RegExpResult; 1.9 import jdk.nashorn.internal.runtime.Scope; 1.10 import jdk.nashorn.internal.runtime.ScriptFunction; 1.11 import jdk.nashorn.internal.runtime.ScriptObject; 1.12 @@ -339,7 +339,7 @@ 1.13 private ClassCache classCache; 1.14 1.15 // Used to store the last RegExp result to support deprecated RegExp constructor properties 1.16 - private RegExpMatch lastRegExpMatch; 1.17 + private RegExpResult lastRegExpResult; 1.18 1.19 private static final MethodHandle EVAL = findOwnMH("eval", Object.class, Object.class, Object.class); 1.20 private static final MethodHandle PRINT = findOwnMH("print", Object.class, Object.class, Object[].class); 1.21 @@ -1709,12 +1709,12 @@ 1.22 return MH.findStatic(MethodHandles.publicLookup(), Global.class, name, MH.type(rtype, types)); 1.23 } 1.24 1.25 - RegExpMatch getLastRegExpMatch() { 1.26 - return lastRegExpMatch; 1.27 + RegExpResult getLastRegExpResult() { 1.28 + return lastRegExpResult; 1.29 } 1.30 1.31 - void setLastRegExpMatch(RegExpMatch regExpMatch) { 1.32 - this.lastRegExpMatch = regExpMatch; 1.33 + void setLastRegExpResult(final RegExpResult regExpResult) { 1.34 + this.lastRegExpResult = regExpResult; 1.35 } 1.36 1.37 }
2.1 --- a/src/jdk/nashorn/internal/objects/NativeRegExp.java Fri Feb 22 10:39:00 2013 -0400 2.2 +++ b/src/jdk/nashorn/internal/objects/NativeRegExp.java Fri Feb 22 16:31:10 2013 +0100 2.3 @@ -31,8 +31,7 @@ 2.4 import java.util.ArrayList; 2.5 import java.util.Arrays; 2.6 import java.util.List; 2.7 -import java.util.regex.Matcher; 2.8 -import java.util.regex.Pattern; 2.9 + 2.10 import jdk.nashorn.internal.objects.annotations.Attribute; 2.11 import jdk.nashorn.internal.objects.annotations.Constructor; 2.12 import jdk.nashorn.internal.objects.annotations.Function; 2.13 @@ -44,8 +43,10 @@ 2.14 import jdk.nashorn.internal.runtime.BitVector; 2.15 import jdk.nashorn.internal.runtime.JSType; 2.16 import jdk.nashorn.internal.runtime.ParserException; 2.17 -import jdk.nashorn.internal.runtime.RegExp; 2.18 -import jdk.nashorn.internal.runtime.RegExpMatch; 2.19 +import jdk.nashorn.internal.runtime.regexp.RegExp; 2.20 +import jdk.nashorn.internal.runtime.regexp.RegExpFactory; 2.21 +import jdk.nashorn.internal.runtime.regexp.RegExpResult; 2.22 +import jdk.nashorn.internal.runtime.regexp.RegExpMatcher; 2.23 import jdk.nashorn.internal.runtime.ScriptFunction; 2.24 import jdk.nashorn.internal.runtime.ScriptObject; 2.25 import jdk.nashorn.internal.runtime.ScriptRuntime; 2.26 @@ -59,35 +60,15 @@ 2.27 @Property(attributes = Attribute.NOT_ENUMERABLE | Attribute.NOT_CONFIGURABLE) 2.28 public Object lastIndex; 2.29 2.30 - /** Pattern string. */ 2.31 - private String input; 2.32 - 2.33 - /** Global search flag for this regexp. */ 2.34 - private boolean global; 2.35 - 2.36 - /** Case insensitive flag for this regexp */ 2.37 - private boolean ignoreCase; 2.38 - 2.39 - /** Multi-line flag for this regexp */ 2.40 - private boolean multiline; 2.41 - 2.42 - /** Java regex pattern to use for match. 
We compile to one of these */ 2.43 - private Pattern pattern; 2.44 - 2.45 - private BitVector groupsInNegativeLookahead; 2.46 + /** Compiled regexp */ 2.47 + private RegExp regexp; 2.48 2.49 // Reference to global object needed to support static RegExp properties 2.50 private Global globalObject; 2.51 2.52 - /* 2.53 - public NativeRegExp() { 2.54 - init(); 2.55 - }*/ 2.56 - 2.57 NativeRegExp(final String input, final String flagString) { 2.58 - RegExp regExp = null; 2.59 try { 2.60 - regExp = new RegExp(input, flagString); 2.61 + this.regexp = RegExpFactory.create(input, flagString); 2.62 } catch (final ParserException e) { 2.63 // translate it as SyntaxError object and throw it 2.64 e.throwAsEcmaException(); 2.65 @@ -95,13 +76,6 @@ 2.66 } 2.67 2.68 this.setLastIndex(0); 2.69 - this.input = regExp.getInput(); 2.70 - this.global = regExp.isGlobal(); 2.71 - this.ignoreCase = regExp.isIgnoreCase(); 2.72 - this.multiline = regExp.isMultiline(); 2.73 - this.pattern = regExp.getPattern(); 2.74 - this.groupsInNegativeLookahead = regExp.getGroupsInNegativeLookahead(); 2.75 - 2.76 init(); 2.77 } 2.78 2.79 @@ -110,24 +84,8 @@ 2.80 } 2.81 2.82 NativeRegExp(final NativeRegExp regExp) { 2.83 - this.input = regExp.getInput(); 2.84 - this.global = regExp.getGlobal(); 2.85 - this.multiline = regExp.getMultiline(); 2.86 - this.ignoreCase = regExp.getIgnoreCase(); 2.87 this.lastIndex = regExp.getLastIndexObject(); 2.88 - this.pattern = regExp.getPattern(); 2.89 - this.groupsInNegativeLookahead = regExp.getGroupsInNegativeLookahead(); 2.90 - 2.91 - init(); 2.92 - } 2.93 - 2.94 - NativeRegExp(final Pattern pattern) { 2.95 - this.input = pattern.pattern(); 2.96 - this.multiline = (pattern.flags() & Pattern.MULTILINE) != 0; 2.97 - this.ignoreCase = (pattern.flags() & Pattern.CASE_INSENSITIVE) != 0; 2.98 - this.lastIndex = 0; 2.99 - this.pattern = pattern; 2.100 - 2.101 + this.regexp = regExp.getRegExp(); 2.102 init(); 2.103 } 2.104 2.105 @@ -232,16 +190,59 @@ 2.106 return new 
NativeRegExp(patternString, flagString); 2.107 } 2.108 2.109 + /** 2.110 + * Build a regexp that matches {@code string} as-is. All meta-characters will be escaped. 2.111 + * 2.112 + * @param string pattern string 2.113 + * @return flat regexp 2.114 + */ 2.115 + static NativeRegExp flatRegExp(String string) { 2.116 + // escape special characters 2.117 + StringBuilder sb = null; 2.118 + final int length = string.length(); 2.119 + 2.120 + for (int i = 0; i < length; i++) { 2.121 + final char c = string.charAt(i); 2.122 + switch (c) { 2.123 + case '^': 2.124 + case '$': 2.125 + case '\\': 2.126 + case '.': 2.127 + case '*': 2.128 + case '+': 2.129 + case '?': 2.130 + case '(': 2.131 + case ')': 2.132 + case '[': 2.133 + case '{': 2.134 + case '|': 2.135 + if (sb == null) { 2.136 + sb = new StringBuilder(length * 2); 2.137 + sb.append(string, 0, i); 2.138 + } 2.139 + sb.append('\\'); 2.140 + sb.append(c); 2.141 + break; 2.142 + default: 2.143 + if (sb != null) { 2.144 + sb.append(c); 2.145 + } 2.146 + break; 2.147 + } 2.148 + } 2.149 + return new NativeRegExp(sb == null ? 
string : sb.toString(), ""); 2.150 + } 2.151 + 2.152 private String getFlagString() { 2.153 - final StringBuilder sb = new StringBuilder(); 2.154 + final StringBuilder sb = new StringBuilder(3); 2.155 2.156 - if (global) { 2.157 + if (regexp.isGlobal()) { 2.158 sb.append('g'); 2.159 } 2.160 - if (ignoreCase) { 2.161 + if (regexp.isIgnoreCase()) { 2.162 sb.append('i'); 2.163 } 2.164 - if (multiline) { 2.165 + if (regexp.isMultiline()) { 2.166 sb.append('m'); 2.167 } 2.168 2.169 @@ -255,7 +256,7 @@ 2.170 2.171 @Override 2.172 public String toString() { 2.173 - return "/" + input + "/" + getFlagString(); 2.174 + return "/" + regexp.getSource() + "/" + getFlagString(); 2.175 } 2.176 2.177 /** 2.178 @@ -270,13 +271,8 @@ 2.179 public static Object compile(final Object self, final Object pattern, final Object flags) { 2.180 final NativeRegExp regExp = checkRegExp(self); 2.181 final NativeRegExp compiled = newRegExp(pattern, flags); 2.182 - // copy over fields to 'self' 2.183 - regExp.setInput(compiled.getInput()); 2.184 - regExp.setGlobal(compiled.getGlobal()); 2.185 - regExp.setIgnoreCase(compiled.getIgnoreCase()); 2.186 - regExp.setMultiline(compiled.getMultiline()); 2.187 - regExp.setPattern(compiled.getPattern()); 2.188 - regExp.setGroupsInNegativeLookahead(compiled.getGroupsInNegativeLookahead()); 2.189 + // copy over regexp to 'self' 2.190 + regExp.setRegExp(compiled.getRegExp()); 2.191 2.192 // Some implementations return undefined. Some return 'self'. Since return 2.193 // value is most likely be ignored, we can play safe and return 'self'. 
2.194 @@ -326,7 +322,7 @@ 2.195 */ 2.196 @Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT) 2.197 public static Object source(final Object self) { 2.198 - return checkRegExp(self).input; 2.199 + return checkRegExp(self).getRegExp().getSource(); 2.200 } 2.201 2.202 /** 2.203 @@ -337,7 +333,7 @@ 2.204 */ 2.205 @Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT) 2.206 public static Object global(final Object self) { 2.207 - return checkRegExp(self).global; 2.208 + return checkRegExp(self).getRegExp().isGlobal(); 2.209 } 2.210 2.211 /** 2.212 @@ -348,7 +344,7 @@ 2.213 */ 2.214 @Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT) 2.215 public static Object ignoreCase(final Object self) { 2.216 - return checkRegExp(self).ignoreCase; 2.217 + return checkRegExp(self).getRegExp().isIgnoreCase(); 2.218 } 2.219 2.220 /** 2.221 @@ -359,7 +355,7 @@ 2.222 */ 2.223 @Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT) 2.224 public static Object multiline(final Object self) { 2.225 - return checkRegExp(self).multiline; 2.226 + return checkRegExp(self).getRegExp().isMultiline(); 2.227 } 2.228 2.229 /** 2.230 @@ -369,7 +365,7 @@ 2.231 */ 2.232 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "input") 2.233 public static Object getLastInput(Object self) { 2.234 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.235 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.236 return match == null ? "" : match.getInput(); 2.237 } 2.238 2.239 @@ -390,7 +386,7 @@ 2.240 */ 2.241 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "lastMatch") 2.242 public static Object getLastMatch(Object self) { 2.243 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.244 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.245 return match == null ? 
"" : match.getGroup(0); 2.246 } 2.247 2.248 @@ -401,7 +397,7 @@ 2.249 */ 2.250 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "lastParen") 2.251 public static Object getLastParen(Object self) { 2.252 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.253 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.254 return match == null ? "" : match.getLastParen(); 2.255 } 2.256 2.257 @@ -412,7 +408,7 @@ 2.258 */ 2.259 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "leftContext") 2.260 public static Object getLeftContext(Object self) { 2.261 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.262 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.263 return match == null ? "" : match.getInput().substring(0, match.getIndex()); 2.264 } 2.265 2.266 @@ -423,7 +419,7 @@ 2.267 */ 2.268 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "rightContext") 2.269 public static Object getRightContext(Object self) { 2.270 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.271 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.272 return match == null ? "" : match.getInput().substring(match.getIndex() + match.length()); 2.273 } 2.274 2.275 @@ -434,7 +430,7 @@ 2.276 */ 2.277 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$1") 2.278 public static Object getGroup1(Object self) { 2.279 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.280 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.281 return match == null ? 
"" : match.getGroup(1); 2.282 } 2.283 2.284 @@ -445,7 +441,7 @@ 2.285 */ 2.286 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$2") 2.287 public static Object getGroup2(Object self) { 2.288 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.289 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.290 return match == null ? "" : match.getGroup(2); 2.291 } 2.292 2.293 @@ -456,7 +452,7 @@ 2.294 */ 2.295 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$3") 2.296 public static Object getGroup3(Object self) { 2.297 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.298 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.299 return match == null ? "" : match.getGroup(3); 2.300 } 2.301 2.302 @@ -467,7 +463,7 @@ 2.303 */ 2.304 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$4") 2.305 public static Object getGroup4(Object self) { 2.306 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.307 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.308 return match == null ? "" : match.getGroup(4); 2.309 } 2.310 2.311 @@ -478,7 +474,7 @@ 2.312 */ 2.313 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$5") 2.314 public static Object getGroup5(Object self) { 2.315 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.316 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.317 return match == null ? "" : match.getGroup(5); 2.318 } 2.319 2.320 @@ -489,7 +485,7 @@ 2.321 */ 2.322 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$6") 2.323 public static Object getGroup6(Object self) { 2.324 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.325 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.326 return match == null ? 
"" : match.getGroup(6); 2.327 } 2.328 2.329 @@ -500,7 +496,7 @@ 2.330 */ 2.331 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$7") 2.332 public static Object getGroup7(Object self) { 2.333 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.334 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.335 return match == null ? "" : match.getGroup(7); 2.336 } 2.337 2.338 @@ -511,7 +507,7 @@ 2.339 */ 2.340 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$8") 2.341 public static Object getGroup8(Object self) { 2.342 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.343 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.344 return match == null ? "" : match.getGroup(8); 2.345 } 2.346 2.347 @@ -522,34 +518,30 @@ 2.348 */ 2.349 @Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$9") 2.350 public static Object getGroup9(Object self) { 2.351 - final RegExpMatch match = Global.instance().getLastRegExpMatch(); 2.352 + final RegExpResult match = Global.instance().getLastRegExpResult(); 2.353 return match == null ? "" : match.getGroup(9); 2.354 } 2.355 2.356 - private RegExpMatch execInner(final String string) { 2.357 - if (this.pattern == null) { 2.358 - return null; // never matches or similar, e.g. a[] 2.359 - } 2.360 + private RegExpResult execInner(final String string) { 2.361 2.362 - final Matcher matcher = pattern.matcher(string); 2.363 - final int start = this.global ? getLastIndex() : 0; 2.364 - 2.365 + final int start = regexp.isGlobal() ? 
getLastIndex() : 0; 2.366 if (start < 0 || start > string.length()) { 2.367 setLastIndex(0); 2.368 return null; 2.369 } 2.370 2.371 - if (!matcher.find(start)) { 2.372 + final RegExpMatcher matcher = regexp.match(string); 2.373 + if (matcher == null || !matcher.search(start)) { 2.374 setLastIndex(0); 2.375 return null; 2.376 } 2.377 2.378 - if (global) { 2.379 + if (regexp.isGlobal()) { 2.380 setLastIndex(matcher.end()); 2.381 } 2.382 2.383 - final RegExpMatch match = new RegExpMatch(string, matcher.start(), groups(matcher)); 2.384 - globalObject.setLastRegExpMatch(match); 2.385 + final RegExpResult match = new RegExpResult(string, matcher.start(), groups(matcher)); 2.386 + globalObject.setLastRegExpResult(match); 2.387 return match; 2.388 } 2.389 2.390 @@ -557,9 +549,11 @@ 2.391 * Convert java.util.regex.Matcher groups to JavaScript groups. 2.392 * That is, replace null and groups that didn't match with undefined. 2.393 */ 2.394 - private Object[] groups(final Matcher matcher) { 2.395 + private Object[] groups(final RegExpMatcher matcher) { 2.396 final int groupCount = matcher.groupCount(); 2.397 final Object[] groups = new Object[groupCount + 1]; 2.398 + final BitVector groupsInNegativeLookahead = regexp.getGroupsInNegativeLookahead(); 2.399 + 2.400 for (int i = 0, lastGroupStart = matcher.start(); i <= groupCount; i++) { 2.401 final int groupStart = matcher.start(i); 2.402 if (lastGroupStart > groupStart 2.403 @@ -586,7 +580,7 @@ 2.404 * @return NativeArray of matches, string or null. 2.405 */ 2.406 public Object exec(final String string) { 2.407 - final RegExpMatch match = execInner(string); 2.408 + final RegExpResult match = execInner(string); 2.409 2.410 if (match == null) { 2.411 return null; 2.412 @@ -617,7 +611,12 @@ 2.413 * @return String with substitutions. 
2.414 */ 2.415 Object replace(final String string, final String replacement, final ScriptFunction function) { 2.416 - final Matcher matcher = pattern.matcher(string); 2.417 + final RegExpMatcher matcher = regexp.match(string); 2.418 + 2.419 + if (matcher == null) { 2.420 + return string; 2.421 + } 2.422 + 2.423 /* 2.424 * $$ -> $ 2.425 * $& -> the matched substring 2.426 @@ -628,8 +627,8 @@ 2.427 */ 2.428 String replace = replacement; 2.429 2.430 - if (!global) { 2.431 - if (!matcher.find()) { 2.432 + if (!regexp.isGlobal()) { 2.433 + if (!matcher.search(0)) { 2.434 return string; 2.435 } 2.436 2.437 @@ -642,45 +641,39 @@ 2.438 return sb.toString(); 2.439 } 2.440 2.441 - int end = 0; // a.k.a. lastAppendPosition 2.442 setLastIndex(0); 2.443 2.444 - boolean found; 2.445 - try { 2.446 - found = matcher.find(end); 2.447 - } catch (final IndexOutOfBoundsException e) { 2.448 - found = false; 2.449 - } 2.450 - 2.451 - if (!found) { 2.452 + if (!matcher.search(0)) { 2.453 return string; 2.454 } 2.455 2.456 + int thisIndex = 0; 2.457 int previousLastIndex = 0; 2.458 final StringBuilder sb = new StringBuilder(); 2.459 + 2.460 do { 2.461 if (function != null) { 2.462 replace = callReplaceValue(function, matcher, string); 2.463 } 2.464 - appendReplacement(matcher, string, replace, sb, end); 2.465 - end = matcher.end(); 2.466 + 2.467 + appendReplacement(matcher, string, replace, sb, thisIndex); 2.468 2.469 // ECMA 15.5.4.10 String.prototype.match(regexp) 2.470 - final int thisIndex = end; 2.471 + thisIndex = matcher.end(); 2.472 if (thisIndex == previousLastIndex) { 2.473 setLastIndex(thisIndex + 1); 2.474 previousLastIndex = thisIndex + 1; 2.475 } else { 2.476 previousLastIndex = thisIndex; 2.477 } 2.478 - } while (matcher.find()); 2.479 + } while (previousLastIndex <= string.length() && matcher.search(previousLastIndex)); 2.480 2.481 - sb.append(string, end, string.length()); 2.482 + sb.append(string, thisIndex, string.length()); 2.483 2.484 return sb.toString(); 2.485 } 
2.486 2.487 - private void appendReplacement(final Matcher matcher, final String text, final String replacement, final StringBuilder sb, final int lastAppendPosition) { 2.488 + private void appendReplacement(final RegExpMatcher matcher, final String text, final String replacement, final StringBuilder sb, final int lastAppendPosition) { 2.489 // Process substitution string to replace group references with groups 2.490 int cursor = 0; 2.491 final StringBuilder result = new StringBuilder(); 2.492 @@ -748,7 +741,7 @@ 2.493 sb.append(result); 2.494 } 2.495 2.496 - private String callReplaceValue(final ScriptFunction function, final Matcher matcher, final String string) { 2.497 + private String callReplaceValue(final ScriptFunction function, final RegExpMatcher matcher, final String string) { 2.498 final Object[] groups = groups(matcher); 2.499 final Object[] args = Arrays.copyOf(groups, groups.length + 2); 2.500 2.501 @@ -782,7 +775,7 @@ 2.502 return new NativeArray(); 2.503 } 2.504 2.505 - RegExpMatch match; 2.506 + RegExpResult match; 2.507 final int inputLength = input.length(); 2.508 int lastLength = -1; 2.509 int lastLastIndex = 0; 2.510 @@ -834,7 +827,7 @@ 2.511 * @return Index of match. 
2.512 */ 2.513 Object search(final String string) { 2.514 - final RegExpMatch match = execInner(string); 2.515 + final RegExpResult match = execInner(string); 2.516 2.517 if (match == null) { 2.518 return -1; 2.519 @@ -884,52 +877,20 @@ 2.520 } 2.521 } 2.522 2.523 - private String getInput() { 2.524 - return input; 2.525 - } 2.526 - 2.527 - private void setInput(final String input) { 2.528 - this.input = input; 2.529 + private void setGlobal(final boolean global) { 2.530 + regexp.setGlobal(global); 2.531 } 2.532 2.533 boolean getGlobal() { 2.534 - return global; 2.535 + return regexp.isGlobal(); 2.536 } 2.537 2.538 - private void setGlobal(final boolean global) { 2.539 - this.global = global; 2.540 + private RegExp getRegExp() { 2.541 + return regexp; 2.542 } 2.543 2.544 - private boolean getIgnoreCase() { 2.545 - return ignoreCase; 2.546 - } 2.547 - 2.548 - private void setIgnoreCase(final boolean ignoreCase) { 2.549 - this.ignoreCase = ignoreCase; 2.550 - } 2.551 - 2.552 - private boolean getMultiline() { 2.553 - return multiline; 2.554 - } 2.555 - 2.556 - private void setMultiline(final boolean multiline) { 2.557 - this.multiline = multiline; 2.558 - } 2.559 - 2.560 - private Pattern getPattern() { 2.561 - return pattern; 2.562 - } 2.563 - 2.564 - private void setPattern(final Pattern pattern) { 2.565 - this.pattern = pattern; 2.566 - } 2.567 - 2.568 - private BitVector getGroupsInNegativeLookahead() { 2.569 - return groupsInNegativeLookahead; 2.570 - } 2.571 - 2.572 - private void setGroupsInNegativeLookahead(final BitVector groupsInNegativeLookahead) { 2.573 - this.groupsInNegativeLookahead = groupsInNegativeLookahead; 2.574 + private void setRegExp(final RegExp regexp) { 2.575 + this.regexp = regexp; 2.576 } 2.577 2.578 }
3.1 --- a/src/jdk/nashorn/internal/objects/NativeRegExpExecResult.java Fri Feb 22 10:39:00 2013 -0400 3.2 +++ b/src/jdk/nashorn/internal/objects/NativeRegExpExecResult.java Fri Feb 22 16:31:10 2013 +0100 3.3 @@ -31,7 +31,7 @@ 3.4 import jdk.nashorn.internal.objects.annotations.ScriptClass; 3.5 import jdk.nashorn.internal.objects.annotations.Setter; 3.6 import jdk.nashorn.internal.runtime.JSType; 3.7 -import jdk.nashorn.internal.runtime.RegExpMatch; 3.8 +import jdk.nashorn.internal.runtime.regexp.RegExpResult; 3.9 import jdk.nashorn.internal.runtime.ScriptObject; 3.10 import jdk.nashorn.internal.runtime.arrays.ArrayData; 3.11 3.12 @@ -49,11 +49,11 @@ 3.13 @Property 3.14 public Object input; 3.15 3.16 - NativeRegExpExecResult(final RegExpMatch match) { 3.17 + NativeRegExpExecResult(final RegExpResult result) { 3.18 setProto(Global.instance().getArrayPrototype()); 3.19 - this.setArray(ArrayData.allocate(match.getGroups().clone())); 3.20 - this.index = match.getIndex(); 3.21 - this.input = match.getInput(); 3.22 + this.setArray(ArrayData.allocate(result.getGroups().clone())); 3.23 + this.index = result.getIndex(); 3.24 + this.input = result.getInput(); 3.25 } 3.26 3.27 /**
4.1 --- a/src/jdk/nashorn/internal/objects/NativeString.java Fri Feb 22 10:39:00 2013 -0400 4.2 +++ b/src/jdk/nashorn/internal/objects/NativeString.java Fri Feb 22 16:31:10 2013 +0100 4.3 @@ -38,7 +38,6 @@ 4.4 import java.util.Arrays; 4.5 import java.util.LinkedList; 4.6 import java.util.List; 4.7 -import java.util.regex.Pattern; 4.8 import jdk.internal.dynalink.CallSiteDescriptor; 4.9 import jdk.internal.dynalink.linker.GuardedInvocation; 4.10 import jdk.internal.dynalink.linker.LinkRequest; 4.11 @@ -712,7 +711,7 @@ 4.12 if (string instanceof NativeRegExp) { 4.13 nativeRegExp = (NativeRegExp) string; 4.14 } else { 4.15 - nativeRegExp = new NativeRegExp(Pattern.compile(JSType.toString(string), Pattern.LITERAL)); 4.16 + nativeRegExp = NativeRegExp.flatRegExp(JSType.toString(string)); 4.17 } 4.18 4.19 if (replacement instanceof ScriptFunction) {
5.1 --- a/src/jdk/nashorn/internal/parser/AbstractParser.java Fri Feb 22 10:39:00 2013 -0400 5.2 +++ b/src/jdk/nashorn/internal/parser/AbstractParser.java Fri Feb 22 16:31:10 2013 +0100 5.3 @@ -37,7 +37,7 @@ 5.4 import jdk.nashorn.internal.runtime.ErrorManager; 5.5 import jdk.nashorn.internal.runtime.JSErrorType; 5.6 import jdk.nashorn.internal.runtime.ParserException; 5.7 -import jdk.nashorn.internal.runtime.RegExp; 5.8 +import jdk.nashorn.internal.runtime.regexp.RegExpFactory; 5.9 import jdk.nashorn.internal.runtime.Source; 5.10 5.11 /** 5.12 @@ -427,7 +427,7 @@ 5.13 if (value instanceof RegexToken) { 5.14 final RegexToken regex = (RegexToken)value; 5.15 try { 5.16 - RegExp.validate(regex.getExpression(), regex.getOptions()); 5.17 + RegExpFactory.validate(regex.getExpression(), regex.getOptions()); 5.18 } catch (final ParserException e) { 5.19 error(e.getMessage()); 5.20 }
6.1 --- a/src/jdk/nashorn/internal/runtime/RegExp.java Fri Feb 22 10:39:00 2013 -0400 6.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 6.3 @@ -1,177 +0,0 @@ 6.4 -/* 6.5 - * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 6.6 - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 6.7 - * 6.8 - * This code is free software; you can redistribute it and/or modify it 6.9 - * under the terms of the GNU General Public License version 2 only, as 6.10 - * published by the Free Software Foundation. Oracle designates this 6.11 - * particular file as subject to the "Classpath" exception as provided 6.12 - * by Oracle in the LICENSE file that accompanied this code. 6.13 - * 6.14 - * This code is distributed in the hope that it will be useful, but WITHOUT 6.15 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 6.16 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 6.17 - * version 2 for more details (a copy is included in the LICENSE file that 6.18 - * accompanied this code). 6.19 - * 6.20 - * You should have received a copy of the GNU General Public License version 6.21 - * 2 along with this work; if not, write to the Free Software Foundation, 6.22 - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 6.23 - * 6.24 - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 6.25 - * or visit www.oracle.com if you need additional information or have any 6.26 - * questions. 6.27 - */ 6.28 - 6.29 -package jdk.nashorn.internal.runtime; 6.30 - 6.31 -import static java.util.regex.Pattern.CASE_INSENSITIVE; 6.32 -import static java.util.regex.Pattern.MULTILINE; 6.33 -import static java.util.regex.Pattern.UNICODE_CASE; 6.34 - 6.35 -import java.util.HashSet; 6.36 -import java.util.regex.Pattern; 6.37 -import java.util.regex.PatternSyntaxException; 6.38 - 6.39 -/** 6.40 - * This class is used to represent a parsed regular expression. 
Accepts input 6.41 - * pattern string and flagString. This is used by AbstractParser to validate 6.42 - * RegExp literals as well as by NativeRegExp to parse RegExp constructor arguments. 6.43 - */ 6.44 -public final class RegExp { 6.45 - /** Pattern string. */ 6.46 - private final String input; 6.47 - 6.48 - /** Global search flag for this regexp.*/ 6.49 - private boolean global; 6.50 - 6.51 - /** Case insensitive flag for this regexp */ 6.52 - private boolean ignoreCase; 6.53 - 6.54 - /** Multi-line flag for this regexp */ 6.55 - private boolean multiline; 6.56 - 6.57 - /** Java regexp pattern to use for match. We compile to one of these */ 6.58 - private Pattern pattern; 6.59 - 6.60 - /** BitVector that keeps track of groups in negative lookahead */ 6.61 - private BitVector groupsInNegativeLookahead; 6.62 - 6.63 - /** 6.64 - * Creates RegExpLiteral object from given input and flagString. 6.65 - * 6.66 - * @param input RegExp pattern string 6.67 - * @param flagString RegExp flags 6.68 - * @throws ParserException if flagString is invalid or input string has syntax error. 
6.69 - */ 6.70 - public RegExp(final String input, final String flagString) throws ParserException { 6.71 - this.input = input; 6.72 - final HashSet<Character> usedFlags = new HashSet<>(); 6.73 - int flags = 0; 6.74 - 6.75 - for (final char ch : flagString.toCharArray()) { 6.76 - if (usedFlags.contains(ch)) { 6.77 - throwParserException("repeated.flag", Character.toString(ch)); 6.78 - } 6.79 - 6.80 - switch (ch) { 6.81 - case 'g': 6.82 - this.global = true; 6.83 - usedFlags.add(ch); 6.84 - break; 6.85 - case 'i': 6.86 - this.ignoreCase = true; 6.87 - flags |= CASE_INSENSITIVE | UNICODE_CASE; 6.88 - usedFlags.add(ch); 6.89 - break; 6.90 - case 'm': 6.91 - this.multiline = true; 6.92 - flags |= MULTILINE; 6.93 - usedFlags.add(ch); 6.94 - break; 6.95 - default: 6.96 - throwParserException("unsupported.flag", Character.toString(ch)); 6.97 - } 6.98 - } 6.99 - 6.100 - try { 6.101 - RegExpScanner parsed; 6.102 - 6.103 - try { 6.104 - parsed = RegExpScanner.scan(input); 6.105 - } catch (final PatternSyntaxException e) { 6.106 - // refine the exception with a better syntax error, if this 6.107 - // passes, just rethrow what we have 6.108 - Pattern.compile(input, flags); 6.109 - throw e; 6.110 - } 6.111 - 6.112 - if (parsed != null) { 6.113 - this.pattern = Pattern.compile(parsed.getJavaPattern(), flags); 6.114 - this.groupsInNegativeLookahead = parsed.getGroupsInNegativeLookahead(); 6.115 - } 6.116 - } catch (final PatternSyntaxException e2) { 6.117 - throwParserException("syntax", e2.getMessage()); 6.118 - } 6.119 - 6.120 - } 6.121 - 6.122 - /** 6.123 - * @return the input 6.124 - */ 6.125 - public String getInput() { 6.126 - return input; 6.127 - } 6.128 - 6.129 - /** 6.130 - * @return the global 6.131 - */ 6.132 - public boolean isGlobal() { 6.133 - return global; 6.134 - } 6.135 - 6.136 - /** 6.137 - * @return the ignoreCase 6.138 - */ 6.139 - public boolean isIgnoreCase() { 6.140 - return ignoreCase; 6.141 - } 6.142 - 6.143 - /** 6.144 - * @return the multiline 6.145 - 
*/ 6.146 - public boolean isMultiline() { 6.147 - return multiline; 6.148 - } 6.149 - 6.150 - /** 6.151 - * @return the pattern 6.152 - */ 6.153 - public Pattern getPattern() { 6.154 - return pattern; 6.155 - } 6.156 - 6.157 - /** 6.158 - * @return the groupsInNegativeLookahead 6.159 - */ 6.160 - public BitVector getGroupsInNegativeLookahead() { 6.161 - return groupsInNegativeLookahead; 6.162 - } 6.163 - 6.164 - /** 6.165 - * Validation method for RegExp input and flagString - we don't care about the RegExp object 6.166 - * 6.167 - * @param input regexp input 6.168 - * @param flagString flag string 6.169 - * 6.170 - * @throws ParserException if invalid regexp and flags 6.171 - */ 6.172 - @SuppressWarnings({"unused", "ResultOfObjectAllocationIgnored"}) 6.173 - public static void validate(final String input, final String flagString) throws ParserException { 6.174 - new RegExp(input, flagString); 6.175 - } 6.176 - 6.177 - private static void throwParserException(final String key, final String str) throws ParserException { 6.178 - throw new ParserException(ECMAErrors.getMessage("parser.error.regex." + key, str)); 6.179 - } 6.180 -}
7.1 --- a/src/jdk/nashorn/internal/runtime/RegExpMatch.java Fri Feb 22 10:39:00 2013 -0400 7.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 7.3 @@ -1,98 +0,0 @@ 7.4 -/* 7.5 - * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 7.6 - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 7.7 - * 7.8 - * This code is free software; you can redistribute it and/or modify it 7.9 - * under the terms of the GNU General Public License version 2 only, as 7.10 - * published by the Free Software Foundation. Oracle designates this 7.11 - * particular file as subject to the "Classpath" exception as provided 7.12 - * by Oracle in the LICENSE file that accompanied this code. 7.13 - * 7.14 - * This code is distributed in the hope that it will be useful, but WITHOUT 7.15 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 7.16 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 7.17 - * version 2 for more details (a copy is included in the LICENSE file that 7.18 - * accompanied this code). 7.19 - * 7.20 - * You should have received a copy of the GNU General Public License version 7.21 - * 2 along with this work; if not, write to the Free Software Foundation, 7.22 - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 7.23 - * 7.24 - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 7.25 - * or visit www.oracle.com if you need additional information or have any 7.26 - * questions. 7.27 - */ 7.28 - 7.29 -package jdk.nashorn.internal.runtime; 7.30 - 7.31 -/** 7.32 - * Match tuple to keep track of ongoing regexp match. 
7.33 - */ 7.34 -public final class RegExpMatch { 7.35 - final Object[] groups; 7.36 - final int index; 7.37 - final String input; 7.38 - 7.39 - /** 7.40 - * Constructor 7.41 - * 7.42 - * @param input regexp input 7.43 - * @param index index of match 7.44 - * @param groups groups vector 7.45 - */ 7.46 - public RegExpMatch(final String input, final int index, final Object[] groups) { 7.47 - this.input = input; 7.48 - this.index = index; 7.49 - this.groups = groups; 7.50 - } 7.51 - 7.52 - /** 7.53 - * Get the groups for the match 7.54 - * @return group vector 7.55 - */ 7.56 - public Object[] getGroups() { 7.57 - return groups; 7.58 - } 7.59 - 7.60 - /** 7.61 - * Get the input for the map 7.62 - * @return input 7.63 - */ 7.64 - public String getInput() { 7.65 - return input; 7.66 - } 7.67 - 7.68 - /** 7.69 - * Get the index for the match 7.70 - * @return index 7.71 - */ 7.72 - public int getIndex() { 7.73 - return index; 7.74 - } 7.75 - 7.76 - /** 7.77 - * Get the length of the match 7.78 - * @return length 7.79 - */ 7.80 - public int length() { 7.81 - return ((String)groups[0]).length(); 7.82 - } 7.83 - 7.84 - /** 7.85 - * Get the group with the given index or the empty string if group index is not valid. 7.86 - * @param index the group index 7.87 - * @return the group or "" 7.88 - */ 7.89 - public Object getGroup(int index) { 7.90 - return index >= 0 && index < groups.length ? groups[index] : ""; 7.91 - } 7.92 - 7.93 - /** 7.94 - * Get the last parenthesis group, or the empty string if none exists. 7.95 - * @return the last group or "" 7.96 - */ 7.97 - public Object getLastParen() { 7.98 - return groups.length > 1 ? groups[groups.length - 1] : ""; 7.99 - } 7.100 - 7.101 -}
8.1 --- a/src/jdk/nashorn/internal/runtime/RegExpScanner.java Fri Feb 22 10:39:00 2013 -0400 8.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 8.3 @@ -1,1411 +0,0 @@ 8.4 -/* 8.5 - * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 8.6 - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 8.7 - * 8.8 - * This code is free software; you can redistribute it and/or modify it 8.9 - * under the terms of the GNU General Public License version 2 only, as 8.10 - * published by the Free Software Foundation. Oracle designates this 8.11 - * particular file as subject to the "Classpath" exception as provided 8.12 - * by Oracle in the LICENSE file that accompanied this code. 8.13 - * 8.14 - * This code is distributed in the hope that it will be useful, but WITHOUT 8.15 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 8.16 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 8.17 - * version 2 for more details (a copy is included in the LICENSE file that 8.18 - * accompanied this code). 8.19 - * 8.20 - * You should have received a copy of the GNU General Public License version 8.21 - * 2 along with this work; if not, write to the Free Software Foundation, 8.22 - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 8.23 - * 8.24 - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 8.25 - * or visit www.oracle.com if you need additional information or have any 8.26 - * questions. 
8.27 - */ 8.28 - 8.29 -package jdk.nashorn.internal.runtime; 8.30 - 8.31 -import java.util.ArrayList; 8.32 -import java.util.HashMap; 8.33 -import java.util.Iterator; 8.34 -import java.util.LinkedHashMap; 8.35 -import java.util.LinkedList; 8.36 -import java.util.List; 8.37 -import java.util.Map; 8.38 -import java.util.regex.PatternSyntaxException; 8.39 -import jdk.nashorn.internal.parser.Lexer; 8.40 -import jdk.nashorn.internal.parser.Scanner; 8.41 - 8.42 -/** 8.43 - * Scan a JavaScript regexp, converting to Java regex if necessary. 8.44 - * 8.45 - */ 8.46 -final class RegExpScanner extends Scanner { 8.47 - 8.48 - /** 8.49 - * String builder to accumulate the result - this contains verbatim parsed JavaScript. 8.50 - * to get the java equivalent we need to create a Pattern token and return its toString() 8.51 - */ 8.52 - private final StringBuilder sb; 8.53 - 8.54 - /** An optional error message if one occurred during parse. */ 8.55 - private String errorMessage; 8.56 - 8.57 - /** Is this the special case of a regexp that never matches anything */ 8.58 - private boolean neverMatches; 8.59 - 8.60 - /** The resulting java.util.regex pattern string. */ 8.61 - private String javaPattern; 8.62 - 8.63 - /** Expected token table */ 8.64 - private final Map<Character, Integer> expected = new HashMap<>(); 8.65 - 8.66 - /** Capturing parenthesis that have been found so far. */ 8.67 - private final List<Capture> caps = new LinkedList<>(); 8.68 - 8.69 - /** Forward references to capturing parenthesis to be resolved later.*/ 8.70 - private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>(); 8.71 - 8.72 - /** Current level of zero-width negative lookahead assertions. */ 8.73 - private int negativeLookaheadLevel; 8.74 - 8.75 - private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?"; 8.76 - 8.77 - private static class Capture { 8.78 - /** 8.79 - * Zero-width negative lookaheads enclosing the capture. 
8.80 - */ 8.81 - private final int negativeLookaheadLevel; 8.82 - /** 8.83 - * Captures that live inside a negative lookahead are dead after the 8.84 - * lookahead and will be undefined if referenced from outside. 8.85 - */ 8.86 - private boolean isDead; 8.87 - 8.88 - Capture(final int negativeLookaheadLevel) { 8.89 - this.negativeLookaheadLevel = negativeLookaheadLevel; 8.90 - } 8.91 - 8.92 - public int getNegativeLookaheadLevel() { 8.93 - return negativeLookaheadLevel; 8.94 - } 8.95 - 8.96 - public boolean isDead() { 8.97 - return isDead; 8.98 - } 8.99 - 8.100 - public void setDead() { 8.101 - this.isDead = true; 8.102 - } 8.103 - } 8.104 - 8.105 - /** 8.106 - * This is a token - the JavaScript regexp is scanned into a token tree 8.107 - * A token has other tokens as children as well as "atoms", i.e. Strings. 8.108 - * 8.109 - */ 8.110 - private static class Token { 8.111 - 8.112 - private enum Type { 8.113 - PATTERN, 8.114 - DISJUNCTION, 8.115 - ALTERNATIVE, 8.116 - TERM, 8.117 - ASSERTION, 8.118 - QUANTIFIER, 8.119 - QUANTIFIER_PREFIX, 8.120 - ATOM, 8.121 - PATTERN_CHARACTER, 8.122 - ATOM_ESCAPE, 8.123 - CHARACTER_ESCAPE, 8.124 - CONTROL_ESCAPE, 8.125 - CONTROL_LETTER, 8.126 - IDENTITY_ESCAPE, 8.127 - DECIMAL_ESCAPE, 8.128 - CHARACTERCLASS_ESCAPE, 8.129 - CHARACTERCLASS, 8.130 - CLASSRANGES, 8.131 - NON_EMPTY_CLASSRANGES, 8.132 - NON_EMPTY_CLASSRANGES_NODASH, 8.133 - CLASSATOM, 8.134 - CLASSATOM_NODASH, 8.135 - CLASS_ESCAPE, 8.136 - DECIMALDIGITS, 8.137 - HEX_ESCAPESEQUENCE, 8.138 - UNICODE_ESCAPESEQUENCE, 8.139 - } 8.140 - 8.141 - /** 8.142 - * Token tyoe 8.143 - */ 8.144 - private final Token.Type type; 8.145 - 8.146 - /** 8.147 - * Child nodes 8.148 - */ 8.149 - private final List<Object> children; 8.150 - 8.151 - /** 8.152 - * Parent node 8.153 - */ 8.154 - private Token parent; 8.155 - 8.156 - /** 8.157 - * Dead code flag 8.158 - */ 8.159 - private boolean isDead; 8.160 - 8.161 - private static final Map<Type, ToString> toStringMap = new HashMap<>(); 8.162 
- private static final ToString DEFAULT_TOSTRING = new ToString(); 8.163 - 8.164 - private static String unicode(final int value) { 8.165 - final StringBuilder sb = new StringBuilder(); 8.166 - final String hex = Integer.toHexString(value); 8.167 - sb.append('u'); 8.168 - for (int i = 0; i < 4 - hex.length(); i++) { 8.169 - sb.append('0'); 8.170 - } 8.171 - sb.append(hex); 8.172 - 8.173 - return sb.toString(); 8.174 - } 8.175 - 8.176 - static { 8.177 - toStringMap.put(Type.CHARACTERCLASS, new ToString() { 8.178 - @Override 8.179 - public String toString(final Token token) { 8.180 - return super.toString(token).replace("\\b", "\b"); 8.181 - } 8.182 - }); 8.183 - 8.184 - // for some reason java regexps don't like control characters on the 8.185 - // form "\\ca".match([string with ascii 1 at char0]). Translating 8.186 - // them to unicode does it though. 8.187 - toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() { 8.188 - @Override 8.189 - public String toString(final Token token) { 8.190 - final String str = super.toString(token); 8.191 - if (str.length() == 2) { 8.192 - return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1); 8.193 - } 8.194 - return str; 8.195 - } 8.196 - }); 8.197 - 8.198 - toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() { 8.199 - @Override 8.200 - public String toString(final Token token) { 8.201 - final String str = super.toString(token); 8.202 - 8.203 - if ("\0".equals(str)) { 8.204 - return str; 8.205 - } 8.206 - 8.207 - int value; 8.208 - 8.209 - if (!token.hasParentOfType(Type.CLASSRANGES)) { 8.210 - return str; 8.211 - } 8.212 - 8.213 - value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal 8.214 - if (value > 0xff) { 8.215 - throw new NumberFormatException(str); 8.216 - } 8.217 - 8.218 - return Token.unicode(value); 8.219 - } 8.220 - }); 8.221 - 8.222 - } 8.223 - 8.224 - /** 8.225 - * JavaScript Token to Java regex substring framework. 
8.226 - * 8.227 - */ 8.228 - private static class ToString { 8.229 - String toString(final Token token) { 8.230 - final StringBuilder sb = new StringBuilder(); 8.231 - for (final Object child : token.getChildren()) { 8.232 - sb.append(child); 8.233 - } 8.234 - 8.235 - //perform global substitutions that hold true for any evaluated form 8.236 - String str = sb.toString(); 8.237 - switch (str) { 8.238 - case "\\s": 8.239 - str = "[" + Lexer.getWhitespaceRegExp() + "]"; 8.240 - break; 8.241 - case "\\S": 8.242 - str = "[^" + Lexer.getWhitespaceRegExp() + "]"; 8.243 - break; 8.244 - case "[^]": 8.245 - str = "[\\s\\S]"; 8.246 - break; 8.247 - default: 8.248 - break; 8.249 - } 8.250 - return str; 8.251 - } 8.252 - } 8.253 - 8.254 - /** 8.255 - * Token iterator. Doesn't return "atom" children. i.e. string representations, 8.256 - * just tokens 8.257 - * 8.258 - */ 8.259 - private static class TokenIterator implements Iterator<Token> { 8.260 - private final List<Token> preorder; 8.261 - 8.262 - private void init(final Token root) { 8.263 - preorder.add(root); 8.264 - for (final Object child : root.getChildren()) { 8.265 - if (child instanceof Token) { 8.266 - init((Token)child); 8.267 - } 8.268 - } 8.269 - } 8.270 - 8.271 - TokenIterator(final Token root) { 8.272 - preorder = new ArrayList<>(); 8.273 - init(root); 8.274 - } 8.275 - 8.276 - @Override 8.277 - public boolean hasNext() { 8.278 - return !preorder.isEmpty(); 8.279 - } 8.280 - 8.281 - @Override 8.282 - public Token next() { 8.283 - return preorder.remove(0); 8.284 - } 8.285 - 8.286 - @Override 8.287 - public void remove() { 8.288 - next(); 8.289 - } 8.290 - } 8.291 - 8.292 - /** 8.293 - * Constructor 8.294 - * @param type the token type 8.295 - */ 8.296 - Token(final Token.Type type) { 8.297 - this.type = type; 8.298 - children = new ArrayList<>(); 8.299 - } 8.300 - 8.301 - /** 8.302 - * Add a an "atom" child to a token 8.303 - * @param child the child to add 8.304 - * @return the token (for chaining) 8.305 - */ 
8.306 - public Token add(final String child) { 8.307 - children.add(child); 8.308 - return this; 8.309 - } 8.310 - 8.311 - /** 8.312 - * Add a child to a token 8.313 - * @param child the child 8.314 - * @return the token (for chaining) 8.315 - */ 8.316 - public Token add(final Token child) { 8.317 - if (child != null) { 8.318 - children.add(child); 8.319 - child.setParent(this); 8.320 - } 8.321 - return this; 8.322 - } 8.323 - 8.324 - /** 8.325 - * Remove a child from a token 8.326 - * @param child the child to remove 8.327 - * @return true if successful 8.328 - */ 8.329 - public boolean remove(final Token child) { 8.330 - return children.remove(child); 8.331 - } 8.332 - 8.333 - /** 8.334 - * Remove the last child from a token 8.335 - * @return the removed child 8.336 - */ 8.337 - public Object removeLast() { 8.338 - return children.remove(children.size() - 1); 8.339 - } 8.340 - 8.341 - /** 8.342 - * Flag this token as dead code 8.343 - * @param isDead is it dead or not 8.344 - */ 8.345 - private void setIsDead(final boolean isDead) { 8.346 - this.isDead = isDead; 8.347 - } 8.348 - 8.349 - /** 8.350 - * Is this token dead code 8.351 - * @return boolean 8.352 - */ 8.353 - private boolean getIsDead() { 8.354 - return isDead; 8.355 - } 8.356 - 8.357 - /** 8.358 - * Get the parent of this token 8.359 - * @return parent token 8.360 - */ 8.361 - public Token getParent() { 8.362 - return parent; 8.363 - } 8.364 - 8.365 - public boolean hasParentOfType(final Token.Type parentType) { 8.366 - for (Token p = getParent(); p != null; p = p.getParent()) { 8.367 - if (p.getType() == parentType) { 8.368 - return true; 8.369 - } 8.370 - } 8.371 - return false; 8.372 - } 8.373 - 8.374 - public boolean hasChildOfType(final Token.Type childType) { 8.375 - for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) { 8.376 - if (iter.next().getType() == childType) { 8.377 - return true; 8.378 - } 8.379 - } 8.380 - return false; 8.381 - } 8.382 - 8.383 - /** 8.384 - * Set the 
parent of this token 8.385 - * @param parent 8.386 - */ 8.387 - private void setParent(final Token parent) { 8.388 - this.parent = parent; 8.389 - } 8.390 - 8.391 - /** 8.392 - * Get the children of this token 8.393 - * @return an array of children, never null 8.394 - */ 8.395 - public Object[] getChildren() { 8.396 - return children.toArray(); 8.397 - } 8.398 - 8.399 - /** 8.400 - * Reset this token, remove all children 8.401 - */ 8.402 - public void reset() { 8.403 - children.clear(); 8.404 - } 8.405 - 8.406 - /** 8.407 - * Get a preorder token iterator with this token as root 8.408 - * @return an iterator 8.409 - */ 8.410 - public Iterator<Token> iterator() { 8.411 - return new TokenIterator(this); 8.412 - } 8.413 - 8.414 - /** 8.415 - * Get the type of this token 8.416 - * @return type 8.417 - */ 8.418 - public Type getType() { 8.419 - return type; 8.420 - } 8.421 - 8.422 - /** 8.423 - * Turn this token into Java regexp compatible text 8.424 - * @return part of a java regexp 8.425 - */ 8.426 - @Override 8.427 - public String toString() { 8.428 - ToString t = toStringMap.get(getType()); 8.429 - if (t == null) { 8.430 - t = DEFAULT_TOSTRING; 8.431 - } 8.432 - return t.toString(this); 8.433 - } 8.434 - } 8.435 - 8.436 - /** 8.437 - * Constructor 8.438 - * @param string the JavaScript regexp to parse 8.439 - */ 8.440 - private RegExpScanner(final String string) { 8.441 - super(string); 8.442 - sb = new StringBuilder(limit); 8.443 - reset(0); 8.444 - expected.put(']', 0); 8.445 - expected.put('}', 0); 8.446 - } 8.447 - 8.448 - private void processForwardReferences() { 8.449 - if (neverMatches()) { 8.450 - return; 8.451 - } 8.452 - 8.453 - for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) { 8.454 - if (fwdRef.getKey().intValue() > caps.size()) { 8.455 - neverMatches = true; 8.456 - break; 8.457 - } 8.458 - 8.459 - fwdRef.getValue().setIsDead(true); 8.460 - } 8.461 - 8.462 - forwardReferences.clear(); 8.463 - } 8.464 - 8.465 - /** 8.466 - * 
Scan a JavaScript regexp string returning a Java safe regex string. 8.467 - * 8.468 - * @param string 8.469 - * JavaScript regexp string. 8.470 - * @return Java safe regex string. 8.471 - */ 8.472 - public static RegExpScanner scan(final String string) { 8.473 - final RegExpScanner scanner = new RegExpScanner(string); 8.474 - 8.475 - Token pattern; 8.476 - 8.477 - try { 8.478 - pattern = scanner.pattern(); 8.479 - } catch (final Exception e) { 8.480 - throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length()); 8.481 - } 8.482 - 8.483 - scanner.processForwardReferences(); 8.484 - if (scanner.neverMatches()) { 8.485 - return null; // never matches 8.486 - } 8.487 - 8.488 - // go over the code and remove dead code 8.489 - final Iterator<Token> iter = pattern.iterator(); 8.490 - while (iter.hasNext()) { 8.491 - final Token next = iter.next(); 8.492 - if (next.getIsDead()) { 8.493 - next.getParent().remove(next); 8.494 - } 8.495 - } 8.496 - 8.497 - // turn the pattern into a string, p, the java equivalent string for our js regexp 8.498 - final String p = pattern.toString(); 8.499 - // if builder contains all tokens that were sent in, we know 8.500 - // we correctly parsed the entire JavaScript regexp without syntax errors 8.501 - if (!string.equals(scanner.getStringBuilder().toString())) { 8.502 - throw new PatternSyntaxException(string, p, p.length() + 1); 8.503 - } 8.504 - 8.505 - scanner.javaPattern = p; 8.506 - return scanner; 8.507 - } 8.508 - 8.509 - /** 8.510 - * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript, 8.511 - * is an example where we never match 8.512 - * 8.513 - * @return boolean 8.514 - */ 8.515 - private boolean neverMatches() { 8.516 - return neverMatches; 8.517 - } 8.518 - 8.519 - /** 8.520 - * This is used to set better error messages that can be reused 8.521 - * in NativeRegExp for augmenting e.g. SyntaxErrors. 
8.522 - * 8.523 - * @return an error message or null if no extra info 8.524 - */ 8.525 - public String getErrorMessage() { 8.526 - return errorMessage; 8.527 - } 8.528 - 8.529 - final StringBuilder getStringBuilder() { 8.530 - return sb; 8.531 - } 8.532 - 8.533 - String getJavaPattern() { 8.534 - return javaPattern; 8.535 - } 8.536 - 8.537 - BitVector getGroupsInNegativeLookahead() { 8.538 - BitVector vec = null; 8.539 - for (int i = 0; i < caps.size(); i++) { 8.540 - final Capture cap = caps.get(i); 8.541 - if (cap.getNegativeLookaheadLevel() > 0) { 8.542 - if (vec == null) { 8.543 - vec = new BitVector(caps.size() + 1); 8.544 - } 8.545 - vec.set(i + 1); 8.546 - } 8.547 - } 8.548 - return vec; 8.549 - } 8.550 - 8.551 - /** 8.552 - * Commit n characters to the builder and to a given token 8.553 - * @param token Uncommitted token. 8.554 - * @param n Number of characters. 8.555 - * @return Committed token 8.556 - */ 8.557 - private Token commit(final Token token, final int n) { 8.558 - final int startIn = position; 8.559 - 8.560 - switch (n) { 8.561 - case 1: 8.562 - sb.append(ch0); 8.563 - skip(1); 8.564 - break; 8.565 - case 2: 8.566 - sb.append(ch0); 8.567 - sb.append(ch1); 8.568 - skip(2); 8.569 - break; 8.570 - case 3: 8.571 - sb.append(ch0); 8.572 - sb.append(ch1); 8.573 - sb.append(ch2); 8.574 - skip(3); 8.575 - break; 8.576 - default: 8.577 - assert false : "Should not reach here"; 8.578 - } 8.579 - 8.580 - if (token == null) { 8.581 - return null; 8.582 - } 8.583 - 8.584 - return token.add(sb.substring(startIn, sb.length())); 8.585 - } 8.586 - 8.587 - /** 8.588 - * Restart the buffers back at an earlier position. 8.589 - * 8.590 - * @param startIn 8.591 - * Position in the input stream. 8.592 - * @param startOut 8.593 - * Position in the output stream. 
8.594 - */ 8.595 - private void restart(final int startIn, final int startOut) { 8.596 - reset(startIn); 8.597 - sb.setLength(startOut); 8.598 - } 8.599 - 8.600 - private void push(final char ch) { 8.601 - expected.put(ch, expected.get(ch) + 1); 8.602 - } 8.603 - 8.604 - private void pop(final char ch) { 8.605 - expected.put(ch, Math.min(0, expected.get(ch) - 1)); 8.606 - } 8.607 - 8.608 - /* 8.609 - * Recursive descent tokenizer starts below. 8.610 - */ 8.611 - 8.612 - /* 8.613 - * Pattern :: 8.614 - * Disjunction 8.615 - */ 8.616 - private Token pattern() { 8.617 - final Token token = new Token(Token.Type.PATTERN); 8.618 - 8.619 - final Token child = disjunction(); 8.620 - return token.add(child); 8.621 - } 8.622 - 8.623 - /* 8.624 - * Disjunction :: 8.625 - * Alternative 8.626 - * Alternative | Disjunction 8.627 - */ 8.628 - private Token disjunction() { 8.629 - final Token token = new Token(Token.Type.DISJUNCTION); 8.630 - 8.631 - while (true) { 8.632 - token.add(alternative()); 8.633 - 8.634 - if (ch0 == '|') { 8.635 - commit(token, 1); 8.636 - } else { 8.637 - break; 8.638 - } 8.639 - } 8.640 - 8.641 - return token; 8.642 - } 8.643 - 8.644 - /* 8.645 - * Alternative :: 8.646 - * [empty] 8.647 - * Alternative Term 8.648 - */ 8.649 - private Token alternative() { 8.650 - final Token token = new Token(Token.Type.ALTERNATIVE); 8.651 - 8.652 - Token child; 8.653 - while ((child = term()) != null) { 8.654 - token.add(child); 8.655 - } 8.656 - 8.657 - return token; 8.658 - } 8.659 - 8.660 - /* 8.661 - * Term :: 8.662 - * Assertion 8.663 - * Atom 8.664 - * Atom Quantifier 8.665 - */ 8.666 - private Token term() { 8.667 - final int startIn = position; 8.668 - final int startOut = sb.length(); 8.669 - final Token token = new Token(Token.Type.TERM); 8.670 - Token child; 8.671 - 8.672 - child = assertion(); 8.673 - if (child != null) { 8.674 - return token.add(child); 8.675 - } 8.676 - 8.677 - child = atom(); 8.678 - if (child != null) { 8.679 - boolean 
emptyCharacterClass = false; 8.680 - if ("[]".equals(child.toString())) { 8.681 - emptyCharacterClass = true; 8.682 - } 8.683 - 8.684 - token.add(child); 8.685 - 8.686 - final Token quantifier = quantifier(); 8.687 - if (quantifier != null) { 8.688 - token.add(quantifier); 8.689 - } 8.690 - 8.691 - if (emptyCharacterClass) { 8.692 - if (quantifier == null) { 8.693 - neverMatches = true; //never matches ever. 8.694 - } else { 8.695 - //if we can get away with max zero, remove this entire token 8.696 - final String qs = quantifier.toString(); 8.697 - if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) { 8.698 - token.setIsDead(true); 8.699 - } 8.700 - } 8.701 - } 8.702 - 8.703 - return token; 8.704 - } 8.705 - 8.706 - restart(startIn, startOut); 8.707 - return null; 8.708 - } 8.709 - 8.710 - /* 8.711 - * Assertion :: 8.712 - * ^ 8.713 - * $ 8.714 - * \b 8.715 - * \B 8.716 - * ( ? = Disjunction ) 8.717 - * ( ? ! Disjunction ) 8.718 - */ 8.719 - private Token assertion() { 8.720 - final int startIn = position; 8.721 - final int startOut = sb.length(); 8.722 - final Token token = new Token(Token.Type.ASSERTION); 8.723 - 8.724 - switch (ch0) { 8.725 - case '^': 8.726 - case '$': 8.727 - return commit(token, 1); 8.728 - 8.729 - case '\\': 8.730 - if (ch1 == 'b' || ch1 == 'B') { 8.731 - return commit(token, 2); 8.732 - } 8.733 - break; 8.734 - 8.735 - case '(': 8.736 - if (ch1 != '?') { 8.737 - break; 8.738 - } 8.739 - if (ch2 != '=' && ch2 != '!') { 8.740 - break; 8.741 - } 8.742 - final boolean isNegativeLookahead = (ch2 == '!'); 8.743 - commit(token, 3); 8.744 - 8.745 - if (isNegativeLookahead) { 8.746 - negativeLookaheadLevel++; 8.747 - } 8.748 - final Token disjunction = disjunction(); 8.749 - if (isNegativeLookahead) { 8.750 - for (final Capture cap : caps) { 8.751 - if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) { 8.752 - cap.setDead(); 8.753 - } 8.754 - } 8.755 - negativeLookaheadLevel--; 8.756 - } 8.757 - 8.758 - if (disjunction != null 
&& ch0 == ')') { 8.759 - token.add(disjunction); 8.760 - return commit(token, 1); 8.761 - } 8.762 - break; 8.763 - 8.764 - default: 8.765 - break; 8.766 - } 8.767 - 8.768 - restart(startIn, startOut); 8.769 - 8.770 - return null; 8.771 - } 8.772 - 8.773 - /* 8.774 - * Quantifier :: 8.775 - * QuantifierPrefix 8.776 - * QuantifierPrefix ? 8.777 - */ 8.778 - private Token quantifier() { 8.779 - final Token token = new Token(Token.Type.QUANTIFIER); 8.780 - final Token child = quantifierPrefix(); 8.781 - if (child != null) { 8.782 - token.add(child); 8.783 - if (ch0 == '?') { 8.784 - commit(token, 1); 8.785 - } 8.786 - return token; 8.787 - } 8.788 - return null; 8.789 - } 8.790 - 8.791 - /* 8.792 - * QuantifierPrefix :: 8.793 - * * 8.794 - * + 8.795 - * ? 8.796 - * { DecimalDigits } 8.797 - * { DecimalDigits , } 8.798 - * { DecimalDigits , DecimalDigits } 8.799 - */ 8.800 - private Token quantifierPrefix() { 8.801 - final int startIn = position; 8.802 - final int startOut = sb.length(); 8.803 - final Token token = new Token(Token.Type.QUANTIFIER_PREFIX); 8.804 - 8.805 - switch (ch0) { 8.806 - case '*': 8.807 - case '+': 8.808 - case '?': 8.809 - return commit(token, 1); 8.810 - 8.811 - case '{': 8.812 - commit(token, 1); 8.813 - 8.814 - final Token child = decimalDigits(); 8.815 - if (child == null) { 8.816 - break; // not a quantifier - back out 8.817 - } 8.818 - push('}'); 8.819 - token.add(child); 8.820 - 8.821 - if (ch0 == ',') { 8.822 - commit(token, 1); 8.823 - token.add(decimalDigits()); 8.824 - } 8.825 - 8.826 - if (ch0 == '}') { 8.827 - pop('}'); 8.828 - commit(token, 1); 8.829 - } 8.830 - 8.831 - return token; 8.832 - 8.833 - default: 8.834 - break; 8.835 - } 8.836 - 8.837 - restart(startIn, startOut); 8.838 - return null; 8.839 - } 8.840 - 8.841 - /* 8.842 - * Atom :: 8.843 - * PatternCharacter 8.844 - * . 8.845 - * \ AtomEscape 8.846 - * CharacterClass 8.847 - * ( Disjunction ) 8.848 - * ( ? 
: Disjunction ) 8.849 - * 8.850 - */ 8.851 - private Token atom() { 8.852 - final int startIn = position; 8.853 - final int startOut = sb.length(); 8.854 - final Token token = new Token(Token.Type.ATOM); 8.855 - Token child; 8.856 - 8.857 - child = patternCharacter(); 8.858 - if (child != null) { 8.859 - return token.add(child); 8.860 - } 8.861 - 8.862 - if (ch0 == '.') { 8.863 - return commit(token, 1); 8.864 - } 8.865 - 8.866 - if (ch0 == '\\') { 8.867 - commit(token, 1); 8.868 - child = atomEscape(); 8.869 - 8.870 - if (child != null) { 8.871 - if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) { 8.872 - final char idEscape = child.toString().charAt(0); 8.873 - if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) { 8.874 - token.reset(); 8.875 - } 8.876 - } 8.877 - 8.878 - token.add(child); 8.879 - 8.880 - // forward backreferences always match empty. JavaScript != Java 8.881 - if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) { 8.882 - final int refNum = Integer.parseInt(child.toString()); 8.883 - 8.884 - if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) { 8.885 - // reference to dead in-negative-lookahead capture 8.886 - token.setIsDead(true); 8.887 - } else if (caps.size() < refNum) { 8.888 - // forward reference: always matches against empty string (dead token). 8.889 - // invalid reference (non-existant capture): pattern never matches. 8.890 - forwardReferences.put(refNum, token); 8.891 - } 8.892 - } 8.893 - 8.894 - return token; 8.895 - } 8.896 - } 8.897 - 8.898 - child = characterClass(); 8.899 - if (child != null) { 8.900 - return token.add(child); 8.901 - } 8.902 - 8.903 - if (ch0 == '(') { 8.904 - boolean capturingParens = true; 8.905 - commit(token, 1); 8.906 - if (ch0 == '?' 
&& ch1 == ':') { 8.907 - capturingParens = false; 8.908 - commit(token, 2); 8.909 - } 8.910 - 8.911 - child = disjunction(); 8.912 - if (child != null) { 8.913 - token.add(child); 8.914 - if (ch0 == ')') { 8.915 - final Token atom = commit(token, 1); 8.916 - if (capturingParens) { 8.917 - caps.add(new Capture(negativeLookaheadLevel)); 8.918 - } 8.919 - return atom; 8.920 - } 8.921 - } 8.922 - } 8.923 - 8.924 - restart(startIn, startOut); 8.925 - return null; 8.926 - } 8.927 - 8.928 - /* 8.929 - * PatternCharacter :: 8.930 - * SourceCharacter but not any of: ^$\.*+?()[]{}| 8.931 - */ 8.932 - @SuppressWarnings("fallthrough") 8.933 - private Token patternCharacter() { 8.934 - if (atEOF()) { 8.935 - return null; 8.936 - } 8.937 - 8.938 - switch (ch0) { 8.939 - case '^': 8.940 - case '$': 8.941 - case '\\': 8.942 - case '.': 8.943 - case '*': 8.944 - case '+': 8.945 - case '?': 8.946 - case '(': 8.947 - case ')': 8.948 - case '[': 8.949 - case '|': 8.950 - return null; 8.951 - 8.952 - case '}': 8.953 - case ']': 8.954 - final int n = expected.get(ch0); 8.955 - if (n != 0) { 8.956 - return null; 8.957 - } 8.958 - 8.959 - case '{': 8.960 - // if not a valid quantifier escape curly brace to match itself 8.961 - // this ensures compatibility with other JS implementations 8.962 - final Token quant = quantifierPrefix(); 8.963 - return (quant == null) ? 
commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null; 8.964 - 8.965 - default: 8.966 - return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER 8.967 - } 8.968 - } 8.969 - 8.970 - /* 8.971 - * AtomEscape :: 8.972 - * DecimalEscape 8.973 - * CharacterEscape 8.974 - * CharacterClassEscape 8.975 - */ 8.976 - private Token atomEscape() { 8.977 - final Token token = new Token(Token.Type.ATOM_ESCAPE); 8.978 - Token child; 8.979 - 8.980 - child = decimalEscape(); 8.981 - if (child != null) { 8.982 - return token.add(child); 8.983 - } 8.984 - 8.985 - child = characterClassEscape(); 8.986 - if (child != null) { 8.987 - return token.add(child); 8.988 - } 8.989 - 8.990 - child = characterEscape(); 8.991 - if (child != null) { 8.992 - return token.add(child); 8.993 - } 8.994 - 8.995 - 8.996 - return null; 8.997 - } 8.998 - 8.999 - /* 8.1000 - * CharacterEscape :: 8.1001 - * ControlEscape 8.1002 - * c ControlLetter 8.1003 - * HexEscapeSequence 8.1004 - * UnicodeEscapeSequence 8.1005 - * IdentityEscape 8.1006 - */ 8.1007 - private Token characterEscape() { 8.1008 - final int startIn = position; 8.1009 - final int startOut = sb.length(); 8.1010 - 8.1011 - final Token token = new Token(Token.Type.CHARACTER_ESCAPE); 8.1012 - Token child; 8.1013 - 8.1014 - child = controlEscape(); 8.1015 - if (child != null) { 8.1016 - return token.add(child); 8.1017 - } 8.1018 - 8.1019 - if (ch0 == 'c') { 8.1020 - commit(token, 1); 8.1021 - child = controlLetter(); 8.1022 - if (child != null) { 8.1023 - return token.add(child); 8.1024 - } 8.1025 - restart(startIn, startOut); 8.1026 - } 8.1027 - 8.1028 - child = hexEscapeSequence(); 8.1029 - if (child != null) { 8.1030 - return token.add(child); 8.1031 - } 8.1032 - 8.1033 - child = unicodeEscapeSequence(); 8.1034 - if (child != null) { 8.1035 - return token.add(child); 8.1036 - } 8.1037 - 8.1038 - child = identityEscape(); 8.1039 - if (child != null) { 8.1040 - return token.add(child); 8.1041 - } 8.1042 - 8.1043 - 
restart(startIn, startOut); 8.1044 - 8.1045 - return null; 8.1046 - } 8.1047 - 8.1048 - private boolean scanEscapeSequence(final char leader, final int length, final Token token) { 8.1049 - final int startIn = position; 8.1050 - final int startOut = sb.length(); 8.1051 - 8.1052 - if (ch0 != leader) { 8.1053 - return false; 8.1054 - } 8.1055 - 8.1056 - commit(token, 1); 8.1057 - for (int i = 0; i < length; i++) { 8.1058 - final char ch0l = Character.toLowerCase(ch0); 8.1059 - if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { 8.1060 - commit(token, 1); 8.1061 - } else { 8.1062 - restart(startIn, startOut); 8.1063 - return false; 8.1064 - } 8.1065 - } 8.1066 - 8.1067 - return true; 8.1068 - } 8.1069 - 8.1070 - private Token hexEscapeSequence() { 8.1071 - final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE); 8.1072 - if (scanEscapeSequence('x', 2, token)) { 8.1073 - return token; 8.1074 - } 8.1075 - return null; 8.1076 - } 8.1077 - 8.1078 - private Token unicodeEscapeSequence() { 8.1079 - final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE); 8.1080 - if (scanEscapeSequence('u', 4, token)) { 8.1081 - return token; 8.1082 - } 8.1083 - return null; 8.1084 - } 8.1085 - 8.1086 - /* 8.1087 - * ControlEscape :: 8.1088 - * one of fnrtv 8.1089 - */ 8.1090 - private Token controlEscape() { 8.1091 - switch (ch0) { 8.1092 - case 'f': 8.1093 - case 'n': 8.1094 - case 'r': 8.1095 - case 't': 8.1096 - case 'v': 8.1097 - return commit(new Token(Token.Type.CONTROL_ESCAPE), 1); 8.1098 - 8.1099 - default: 8.1100 - return null; 8.1101 - } 8.1102 - } 8.1103 - 8.1104 - /* 8.1105 - * ControlLetter :: 8.1106 - * one of abcdefghijklmnopqrstuvwxyz 8.1107 - * ABCDEFGHIJKLMNOPQRSTUVWXYZ 8.1108 - */ 8.1109 - private Token controlLetter() { 8.1110 - final char c = Character.toUpperCase(ch0); 8.1111 - if (c >= 'A' && c <= 'Z') { 8.1112 - final Token token = new Token(Token.Type.CONTROL_LETTER); 8.1113 - commit(token, 1); 8.1114 - return token; 8.1115 - } 8.1116 - return 
null; 8.1117 - /* 8.1118 - Token token = new Token(Token.Type.CONTROL_LETTER); 8.1119 - commit(null, 1);//add original char to builder not to token 8.1120 - this.neverMatches = c < 'A' || c > 'Z'; 8.1121 - return token.add(""+c);*/ 8.1122 - } 8.1123 - 8.1124 - /* 8.1125 - * IdentityEscape :: 8.1126 - * SourceCharacter but not IdentifierPart 8.1127 - * <ZWJ> (200c) 8.1128 - * <ZWNJ> (200d) 8.1129 - */ 8.1130 - private Token identityEscape() { 8.1131 - final Token token = new Token(Token.Type.IDENTITY_ESCAPE); 8.1132 - commit(token, 1); 8.1133 - return token; 8.1134 - } 8.1135 - 8.1136 - /* 8.1137 - * DecimalEscape :: 8.1138 - * DecimalIntegerLiteral [lookahead DecimalDigit] 8.1139 - */ 8.1140 - private Token decimalEscape() { 8.1141 - final Token token = new Token(Token.Type.DECIMAL_ESCAPE); 8.1142 - final int startIn = position; 8.1143 - final int startOut = sb.length(); 8.1144 - 8.1145 - if (ch0 == '0' && !isDecimalDigit(ch1)) { 8.1146 - commit(token, 1); 8.1147 - token.removeLast(); 8.1148 - // DecimalEscape :: 0. 
If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000); 8.1149 - return token.add("\u0000"); 8.1150 - } 8.1151 - 8.1152 - if (isDecimalDigit(ch0)) { 8.1153 - while (isDecimalDigit(ch0)) { 8.1154 - commit(token, 1); 8.1155 - } 8.1156 - return token; 8.1157 - } 8.1158 - 8.1159 - restart(startIn, startOut); 8.1160 - 8.1161 - return null; 8.1162 - } 8.1163 - 8.1164 - /* 8.1165 - * CharacterClassEscape :: 8.1166 - * one of dDsSwW 8.1167 - */ 8.1168 - private Token characterClassEscape() { 8.1169 - switch (ch0) { 8.1170 - case 's': 8.1171 - case 'S': 8.1172 - case 'd': 8.1173 - case 'D': 8.1174 - case 'w': 8.1175 - case 'W': 8.1176 - return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1); 8.1177 - 8.1178 - default: 8.1179 - return null; 8.1180 - } 8.1181 - } 8.1182 - 8.1183 - /* 8.1184 - * CharacterClass :: 8.1185 - * [ [lookahead {^}] ClassRanges ] 8.1186 - * [ ^ ClassRanges ] 8.1187 - */ 8.1188 - private Token characterClass() { 8.1189 - final int startIn = position; 8.1190 - final int startOut = sb.length(); 8.1191 - final Token token = new Token(Token.Type.CHARACTERCLASS); 8.1192 - 8.1193 - if (ch0 == '[') { 8.1194 - push(']'); 8.1195 - commit(token, 1); 8.1196 - 8.1197 - if (ch0 == '^') { 8.1198 - commit(token, 1); 8.1199 - } 8.1200 - 8.1201 - final Token child = classRanges(); 8.1202 - if (child != null && ch0 == ']') { 8.1203 - pop(']'); 8.1204 - token.add(child); 8.1205 - return commit(token, 1); 8.1206 - } 8.1207 - } 8.1208 - 8.1209 - restart(startIn, startOut); 8.1210 - return null; 8.1211 - } 8.1212 - 8.1213 - /* 8.1214 - * ClassRanges :: 8.1215 - * [empty] 8.1216 - * NonemptyClassRanges 8.1217 - */ 8.1218 - private Token classRanges() { 8.1219 - return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges()); 8.1220 - } 8.1221 - 8.1222 - /* 8.1223 - * NonemptyClassRanges :: 8.1224 - * ClassAtom 8.1225 - * ClassAtom NonemptyClassRangesNoDash 8.1226 - * ClassAtom - ClassAtom ClassRanges 8.1227 - */ 8.1228 - private 
Token nonemptyClassRanges() { 8.1229 - final int startIn = position; 8.1230 - final int startOut = sb.length(); 8.1231 - final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES); 8.1232 - Token child; 8.1233 - 8.1234 - child = classAtom(); 8.1235 - if (child != null) { 8.1236 - token.add(child); 8.1237 - 8.1238 - if (ch0 == '-') { 8.1239 - commit(token, 1); 8.1240 - 8.1241 - final Token child1 = classAtom(); 8.1242 - final Token child2 = classRanges(); 8.1243 - if (child1 != null && child2 != null) { 8.1244 - token.add(child1); 8.1245 - token.add(child2); 8.1246 - 8.1247 - return token; 8.1248 - } 8.1249 - } 8.1250 - 8.1251 - child = nonemptyClassRangesNoDash(); 8.1252 - if (child != null) { 8.1253 - token.add(child); 8.1254 - return token; 8.1255 - } 8.1256 - 8.1257 - return token; 8.1258 - } 8.1259 - 8.1260 - restart(startIn, startOut); 8.1261 - return null; 8.1262 - } 8.1263 - 8.1264 - /* 8.1265 - * NonemptyClassRangesNoDash :: 8.1266 - * ClassAtom 8.1267 - * ClassAtomNoDash NonemptyClassRangesNoDash 8.1268 - * ClassAtomNoDash - ClassAtom ClassRanges 8.1269 - */ 8.1270 - private Token nonemptyClassRangesNoDash() { 8.1271 - final int startIn = position; 8.1272 - final int startOut = sb.length(); 8.1273 - final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH); 8.1274 - Token child; 8.1275 - 8.1276 - child = classAtomNoDash(); 8.1277 - if (child != null) { 8.1278 - token.add(child); 8.1279 - 8.1280 - // need to check dash first, as for e.g. 
[a-b|c-d] will otherwise parse - as an atom 8.1281 - if (ch0 == '-') { 8.1282 - commit(token, 1); 8.1283 - 8.1284 - final Token child1 = classAtom(); 8.1285 - final Token child2 = classRanges(); 8.1286 - if (child1 != null && child2 != null) { 8.1287 - token.add(child1); 8.1288 - return token.add(child2); 8.1289 - } 8.1290 - //fallthru 8.1291 - } 8.1292 - 8.1293 - child = nonemptyClassRangesNoDash(); 8.1294 - if (child != null) { 8.1295 - token.add(child); 8.1296 - } 8.1297 - return token; // still a class atom 8.1298 - } 8.1299 - 8.1300 - child = classAtom(); 8.1301 - if (child != null) { 8.1302 - return token.add(child); 8.1303 - } 8.1304 - 8.1305 - restart(startIn, startOut); 8.1306 - return null; 8.1307 - } 8.1308 - 8.1309 - /* 8.1310 - * ClassAtom : - ClassAtomNoDash 8.1311 - */ 8.1312 - private Token classAtom() { 8.1313 - final Token token = new Token(Token.Type.CLASSATOM); 8.1314 - 8.1315 - if (ch0 == '-') { 8.1316 - return commit(token, 1); 8.1317 - } 8.1318 - 8.1319 - final Token child = classAtomNoDash(); 8.1320 - if (child != null) { 8.1321 - return token.add(child); 8.1322 - } 8.1323 - 8.1324 - return null; 8.1325 - } 8.1326 - 8.1327 - /* 8.1328 - * ClassAtomNoDash :: 8.1329 - * SourceCharacter but not one of \ or ] or - 8.1330 - * \ ClassEscape 8.1331 - */ 8.1332 - private Token classAtomNoDash() { 8.1333 - final int startIn = position; 8.1334 - final int startOut = sb.length(); 8.1335 - final Token token = new Token(Token.Type.CLASSATOM_NODASH); 8.1336 - 8.1337 - switch (ch0) { 8.1338 - case ']': 8.1339 - case '-': 8.1340 - case '\0': 8.1341 - return null; 8.1342 - 8.1343 - case '[': 8.1344 - // unescaped left square bracket - add escape 8.1345 - return commit(token.add("\\"), 1); 8.1346 - 8.1347 - case '\\': 8.1348 - commit(token, 1); 8.1349 - final Token child = classEscape(); 8.1350 - if (child != null) { 8.1351 - return token.add(child); 8.1352 - } 8.1353 - 8.1354 - restart(startIn, startOut); 8.1355 - return null; 8.1356 - 8.1357 - default: 
8.1358 - return commit(token, 1); 8.1359 - } 8.1360 - } 8.1361 - 8.1362 - /* 8.1363 - * ClassEscape :: 8.1364 - * DecimalEscape 8.1365 - * b 8.1366 - * CharacterEscape 8.1367 - * CharacterClassEscape 8.1368 - */ 8.1369 - private Token classEscape() { 8.1370 - final Token token = new Token(Token.Type.CLASS_ESCAPE); 8.1371 - Token child; 8.1372 - 8.1373 - child = decimalEscape(); 8.1374 - if (child != null) { 8.1375 - return token.add(child); 8.1376 - } 8.1377 - 8.1378 - if (ch0 == 'b') { 8.1379 - return commit(token, 1); 8.1380 - } 8.1381 - 8.1382 - child = characterEscape(); 8.1383 - if (child != null) { 8.1384 - return token.add(child); 8.1385 - } 8.1386 - 8.1387 - child = characterClassEscape(); 8.1388 - if (child != null) { 8.1389 - return token.add(child); 8.1390 - } 8.1391 - 8.1392 - return null; 8.1393 - } 8.1394 - 8.1395 - /* 8.1396 - * DecimalDigits 8.1397 - */ 8.1398 - private Token decimalDigits() { 8.1399 - if (!isDecimalDigit(ch0)) { 8.1400 - return null; 8.1401 - } 8.1402 - 8.1403 - final Token token = new Token(Token.Type.DECIMALDIGITS); 8.1404 - while (isDecimalDigit(ch0)) { 8.1405 - commit(token, 1); 8.1406 - } 8.1407 - 8.1408 - return token; 8.1409 - } 8.1410 - 8.1411 - private static boolean isDecimalDigit(final char ch) { 8.1412 - return ch >= '0' && ch <= '9'; 8.1413 - } 8.1414 -}
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 9.2 +++ b/src/jdk/nashorn/internal/runtime/regexp/DefaultRegExp.java Fri Feb 22 16:31:10 2013 +0100 9.3 @@ -0,0 +1,163 @@ 9.4 +/* 9.5 + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 9.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 9.7 + * 9.8 + * This code is free software; you can redistribute it and/or modify it 9.9 + * under the terms of the GNU General Public License version 2 only, as 9.10 + * published by the Free Software Foundation. Oracle designates this 9.11 + * particular file as subject to the "Classpath" exception as provided 9.12 + * by Oracle in the LICENSE file that accompanied this code. 9.13 + * 9.14 + * This code is distributed in the hope that it will be useful, but WITHOUT 9.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 9.16 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 9.17 + * version 2 for more details (a copy is included in the LICENSE file that 9.18 + * accompanied this code). 9.19 + * 9.20 + * You should have received a copy of the GNU General Public License version 9.21 + * 2 along with this work; if not, write to the Free Software Foundation, 9.22 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 9.23 + * 9.24 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 9.25 + * or visit www.oracle.com if you need additional information or have any 9.26 + * questions. 
9.27 + */ 9.28 + 9.29 +package jdk.nashorn.internal.runtime.regexp; 9.30 + 9.31 +import jdk.nashorn.internal.runtime.ParserException; 9.32 + 9.33 +import static java.util.regex.Pattern.CASE_INSENSITIVE; 9.34 +import static java.util.regex.Pattern.MULTILINE; 9.35 +import static java.util.regex.Pattern.UNICODE_CASE; 9.36 + 9.37 +import java.util.regex.Matcher; 9.38 +import java.util.regex.Pattern; 9.39 +import java.util.regex.PatternSyntaxException; 9.40 + 9.41 +/** 9.42 + * Default regular expression implementation based on java.util.regex package. 9.43 + * 9.44 + * Note that this class is not thread-safe as it stores the current match result 9.45 + * and the string being matched in instance fields. 9.46 + */ 9.47 +public class DefaultRegExp extends RegExp { 9.48 + 9.49 + /** Java regexp pattern to use for match. We compile to one of these */ 9.50 + private Pattern pattern; 9.51 + 9.52 + /** The matcher */ 9.53 + private RegExpMatcher matcher; 9.54 + 9.55 + /** 9.56 + * Construct a Regular expression from the given {@code source} and {@code flags} strings. 9.57 + * 9.58 + * @param source RegExp source string 9.59 + * @param flags RegExp flag string 9.60 + * @throws ParserException if flags is invalid or source string has syntax error. 
9.61 + */ 9.62 + public DefaultRegExp(final String source, final String flags) throws ParserException { 9.63 + super(source, flags); 9.64 + 9.65 + int intFlags = 0; 9.66 + 9.67 + if (isIgnoreCase()) { 9.68 + intFlags |= CASE_INSENSITIVE | UNICODE_CASE; 9.69 + } 9.70 + if (isMultiline()) { 9.71 + intFlags |= MULTILINE; 9.72 + } 9.73 + 9.74 + try { 9.75 + RegExpScanner parsed; 9.76 + 9.77 + try { 9.78 + parsed = RegExpScanner.scan(source); 9.79 + } catch (final PatternSyntaxException e) { 9.80 + // refine the exception with a better syntax error, if this 9.81 + // passes, just rethrow what we have 9.82 + Pattern.compile(source, intFlags); 9.83 + throw e; 9.84 + } 9.85 + 9.86 + if (parsed != null) { 9.87 + this.pattern = Pattern.compile(parsed.getJavaPattern(), intFlags); 9.88 + this.groupsInNegativeLookahead = parsed.getGroupsInNegativeLookahead(); 9.89 + } 9.90 + } catch (final PatternSyntaxException e2) { 9.91 + throwParserException("syntax", e2.getMessage()); 9.92 + } 9.93 + } 9.94 + 9.95 + @Override 9.96 + public RegExpMatcher match(final String str) { 9.97 + if (pattern == null) { 9.98 + return null; // never matches or similar, e.g. 
a[] 9.99 + } 9.100 + 9.101 + RegExpMatcher matcher = this.matcher; 9.102 + 9.103 + if (matcher == null || matcher.getInput() != str) { 9.104 + matcher = new DefaultMatcher(str); 9.105 + this.matcher = matcher; 9.106 + } 9.107 + 9.108 + return matcher; 9.109 + } 9.110 + 9.111 + class DefaultMatcher implements RegExpMatcher { 9.112 + final String input; 9.113 + final Matcher matcher; 9.114 + 9.115 + DefaultMatcher(final String input) { 9.116 + this.input = input; 9.117 + this.matcher = pattern.matcher(input); 9.118 + } 9.119 + 9.120 + @Override 9.121 + public boolean search(final int start) { 9.122 + return matcher.find(start); 9.123 + } 9.124 + 9.125 + @Override 9.126 + public String getInput() { 9.127 + return input; 9.128 + } 9.129 + 9.130 + @Override 9.131 + public int start() { 9.132 + return matcher.start(); 9.133 + } 9.134 + 9.135 + @Override 9.136 + public int start(final int group) { 9.137 + return matcher.start(group); 9.138 + } 9.139 + 9.140 + @Override 9.141 + public int end() { 9.142 + return matcher.end(); 9.143 + } 9.144 + 9.145 + @Override 9.146 + public int end(final int group) { 9.147 + return matcher.end(group); 9.148 + } 9.149 + 9.150 + @Override 9.151 + public String group() { 9.152 + return matcher.group(); 9.153 + } 9.154 + 9.155 + @Override 9.156 + public String group(final int group) { 9.157 + return matcher.group(group); 9.158 + } 9.159 + 9.160 + @Override 9.161 + public int groupCount() { 9.162 + return matcher.groupCount(); 9.163 + } 9.164 + } 9.165 + 9.166 +}
10.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 10.2 +++ b/src/jdk/nashorn/internal/runtime/regexp/RegExp.java Fri Feb 22 16:31:10 2013 +0100 10.3 @@ -0,0 +1,164 @@ 10.4 +/* 10.5 + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 10.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 10.7 + * 10.8 + * This code is free software; you can redistribute it and/or modify it 10.9 + * under the terms of the GNU General Public License version 2 only, as 10.10 + * published by the Free Software Foundation. Oracle designates this 10.11 + * particular file as subject to the "Classpath" exception as provided 10.12 + * by Oracle in the LICENSE file that accompanied this code. 10.13 + * 10.14 + * This code is distributed in the hope that it will be useful, but WITHOUT 10.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10.16 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 10.17 + * version 2 for more details (a copy is included in the LICENSE file that 10.18 + * accompanied this code). 10.19 + * 10.20 + * You should have received a copy of the GNU General Public License version 10.21 + * 2 along with this work; if not, write to the Free Software Foundation, 10.22 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 10.23 + * 10.24 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 10.25 + * or visit www.oracle.com if you need additional information or have any 10.26 + * questions. 10.27 + */ 10.28 + 10.29 +package jdk.nashorn.internal.runtime.regexp; 10.30 + 10.31 +import jdk.nashorn.internal.runtime.BitVector; 10.32 +import jdk.nashorn.internal.runtime.ECMAErrors; 10.33 +import jdk.nashorn.internal.runtime.ParserException; 10.34 + 10.35 +import java.util.regex.MatchResult; 10.36 + 10.37 +/** 10.38 + * This is the base class for representing a parsed regular expression. 
10.39 + * 10.40 + * Instances of this class are created by a {@link RegExpFactory}. 10.41 + */ 10.42 +public abstract class RegExp { 10.43 + 10.44 + /** Pattern string. */ 10.45 + private final String source; 10.46 + 10.47 + /** Global search flag for this regexp.*/ 10.48 + private boolean global; 10.49 + 10.50 + /** Case insensitive flag for this regexp */ 10.51 + private boolean ignoreCase; 10.52 + 10.53 + /** Multi-line flag for this regexp */ 10.54 + private boolean multiline; 10.55 + 10.56 + /** BitVector that keeps track of groups in negative lookahead */ 10.57 + protected BitVector groupsInNegativeLookahead; 10.58 + 10.59 + /** 10.60 + * Constructor. 10.61 + * 10.62 + * @param source the source string 10.63 + * @param flags the flags string 10.64 + */ 10.65 + protected RegExp(final String source, final String flags) { 10.66 + this.source = source; 10.67 + for (int i = 0; i < flags.length(); i++) { 10.68 + final char ch = flags.charAt(i); 10.69 + switch (ch) { 10.70 + case 'g': 10.71 + if (this.global) { 10.72 + throwParserException("repeated.flag", "g"); 10.73 + } 10.74 + this.global = true; 10.75 + break; 10.76 + case 'i': 10.77 + if (this.ignoreCase) { 10.78 + throwParserException("repeated.flag", "i"); 10.79 + } 10.80 + this.ignoreCase = true; 10.81 + break; 10.82 + case 'm': 10.83 + if (this.multiline) { 10.84 + throwParserException("repeated.flag", "m"); 10.85 + } 10.86 + this.multiline = true; 10.87 + break; 10.88 + default: 10.89 + throwParserException("unsupported.flag", Character.toString(ch)); 10.90 + } 10.91 + } 10.92 + } 10.93 + 10.94 + /** 10.95 + * Get the source pattern of this regular expression. 10.96 + * 10.97 + * @return the source string 10.98 + */ 10.99 + public String getSource() { 10.100 + return source; 10.101 + } 10.102 + 10.103 + /** 10.104 + * Set the global flag of this regular expression to {@code global}. 
10.105 + * 10.106 + * @param global the new global flag 10.107 + */ 10.108 + public void setGlobal(final boolean global) { 10.109 + this.global = global; 10.110 + } 10.111 + 10.112 + /** 10.113 + * Get the global flag of this regular expression. 10.114 + * 10.115 + * @return the global flag 10.116 + */ 10.117 + public boolean isGlobal() { 10.118 + return global; 10.119 + } 10.120 + 10.121 + /** 10.122 + * Get the ignore-case flag of this regular expression. 10.123 + * 10.124 + * @return the ignore-case flag 10.125 + */ 10.126 + public boolean isIgnoreCase() { 10.127 + return ignoreCase; 10.128 + } 10.129 + 10.130 + /** 10.131 + * Get the multiline flag of this regular expression. 10.132 + * 10.133 + * @return the multiline flag 10.134 + */ 10.135 + public boolean isMultiline() { 10.136 + return multiline; 10.137 + } 10.138 + 10.139 + /** 10.140 + * Get a bitset indicating which of the groups in this regular expression are inside a negative lookahead. 10.141 + * 10.142 + * @return the groups-in-negative-lookahead bitset 10.143 + */ 10.144 + public BitVector getGroupsInNegativeLookahead() { 10.145 + return groupsInNegativeLookahead; 10.146 + } 10.147 + 10.148 + /** 10.149 + * Match this regular expression against {@code str}, starting at index {@code start} 10.150 + * and return a {@link MatchResult} with the result. 10.151 + * 10.152 + * @param str the string 10.153 + * @return the matcher 10.154 + */ 10.155 + public abstract RegExpMatcher match(String str); 10.156 + 10.157 + /** 10.158 + * Throw a regexp parser exception. 10.159 + * 10.160 + * @param key the message key 10.161 + * @param str string argument 10.162 + * @throws jdk.nashorn.internal.runtime.ParserException 10.163 + */ 10.164 + protected static void throwParserException(final String key, final String str) throws ParserException { 10.165 + throw new ParserException(ECMAErrors.getMessage("parser.error.regex." + key, str)); 10.166 + } 10.167 +}
11.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 11.2 +++ b/src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java Fri Feb 22 16:31:10 2013 +0100 11.3 @@ -0,0 +1,103 @@ 11.4 +/* 11.5 + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 11.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 11.7 + * 11.8 + * This code is free software; you can redistribute it and/or modify it 11.9 + * under the terms of the GNU General Public License version 2 only, as 11.10 + * published by the Free Software Foundation. Oracle designates this 11.11 + * particular file as subject to the "Classpath" exception as provided 11.12 + * by Oracle in the LICENSE file that accompanied this code. 11.13 + * 11.14 + * This code is distributed in the hope that it will be useful, but WITHOUT 11.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11.16 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 11.17 + * version 2 for more details (a copy is included in the LICENSE file that 11.18 + * accompanied this code). 11.19 + * 11.20 + * You should have received a copy of the GNU General Public License version 11.21 + * 2 along with this work; if not, write to the Free Software Foundation, 11.22 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 11.23 + * 11.24 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 11.25 + * or visit www.oracle.com if you need additional information or have any 11.26 + * questions. 11.27 + */ 11.28 + 11.29 +package jdk.nashorn.internal.runtime.regexp; 11.30 + 11.31 +import jdk.nashorn.internal.parser.Lexer; 11.32 +import jdk.nashorn.internal.runtime.ParserException; 11.33 + 11.34 +/** 11.35 + * Factory class for regular expressions. This class creates instances of {@link DefaultRegExp}. 
11.36 + */ 11.37 +public class RegExpFactory { 11.38 + 11.39 + 11.40 + private final static RegExpFactory instance = new RegExpFactory(); 11.41 + 11.42 + /** 11.43 + * Creates a Regular expression from the given {@code pattern} and {@code flags} strings. 11.44 + * 11.45 + * @param pattern RegExp pattern string 11.46 + * @param flags RegExp flags string 11.47 + * @throws ParserException if flags is invalid or pattern string has syntax error. 11.48 + */ 11.49 + protected RegExp compile(final String pattern, final String flags) throws ParserException { 11.50 + return new DefaultRegExp(pattern, flags); 11.51 + } 11.52 + 11.53 + /** 11.54 + * Replace a regexp token as suitable for regexp instances created by this factory. 11.55 + * 11.56 + * @param str a regular expression token 11.57 + * @return the replacement token 11.58 + */ 11.59 + protected String replaceToken(final String str) { 11.60 + switch (str) { 11.61 + case "\\s": 11.62 + return "[" + Lexer.getWhitespaceRegExp() + "]"; 11.63 + case "\\S": 11.64 + return "[^" + Lexer.getWhitespaceRegExp() + "]"; 11.65 + case "[^]": 11.66 + return "[\\s\\S]"; 11.67 + default: 11.68 + return str; 11.69 + } 11.70 + } 11.71 + 11.72 + /** 11.73 + * Compile a regexp with the given {@code source} and {@code flags}. 11.74 + * 11.75 + * @param pattern RegExp pattern string 11.76 + * @param flags flag string 11.77 + * 11.78 + * @throws ParserException if invalid source or flags 11.79 + */ 11.80 + public static RegExp create(final String pattern, final String flags) { 11.81 + return instance.compile(pattern, flags); 11.82 + } 11.83 + 11.84 + /** 11.85 + * Replace a regexp token as needed by the currently installed factory instance. 
11.86 + * 11.87 + * @param token a regexp token 11.88 + * @return the replacement token 11.89 + */ 11.90 + public static String replace(final String token) { 11.91 + return instance.replaceToken(token); 11.92 + } 11.93 + 11.94 + /** 11.95 + * Validate a regexp with the given {@code source} and {@code flags}. 11.96 + * 11.97 + * @param pattern RegExp pattern string 11.98 + * @param flags flag string 11.99 + * 11.100 + * @throws ParserException if invalid source or flags 11.101 + */ 11.102 + // @SuppressWarnings({"unused"}) 11.103 + public static void validate(final String pattern, final String flags) throws ParserException { 11.104 + instance.compile(pattern, flags); 11.105 + } 11.106 +}
/**
 * Contract for running a regular expression over one input string and reading back
 * the outcome. The match details themselves are exposed through the inherited
 * {@link MatchResult} methods.
 */
public interface RegExpMatcher extends MatchResult {

    /**
     * Looks for the pattern beginning at position {@code start} of the input string.
     *
     * @param start the start index in the input string
     * @return {@code true} if a match was found
     */
    boolean search(int start);

    /**
     * Returns the string this matcher operates on.
     *
     * @return the input string
     */
    String getInput();

}
/**
 * Immutable-reference tuple describing one regexp match: the input string, the index
 * of the match and the array of captured groups (element 0 being the whole match).
 */
public final class RegExpResult {
    /** Captured groups; stored and returned without copying. */
    final Object[] groups;
    /** Index of the match. */
    final int index;
    /** The input string the match was found in. */
    final String input;

    /**
     * Creates a result from its three components.
     *
     * @param input  regexp input
     * @param index  index of match
     * @param groups groups vector
     */
    public RegExpResult(final String input, final int index, final Object[] groups) {
        this.groups = groups;
        this.index = index;
        this.input = input;
    }

    /**
     * Returns the groups of the match. This is the array passed to the constructor,
     * not a copy.
     *
     * @return group vector
     */
    public Object[] getGroups() {
        return this.groups;
    }

    /**
     * Returns the input string of the match.
     *
     * @return input
     */
    public String getInput() {
        return this.input;
    }

    /**
     * Returns the index of the match.
     *
     * @return index
     */
    public int getIndex() {
        return this.index;
    }

    /**
     * Returns the length of the match, i.e. the length of group 0, which is cast to
     * {@code String}.
     *
     * @return length
     */
    public int length() {
        final String wholeMatch = (String) this.groups[0];
        return wholeMatch.length();
    }

    /**
     * Returns the group with the given index, or the empty string if the index is
     * outside the groups array.
     *
     * @param index the group index
     * @return the group or ""
     */
    public Object getGroup(int index) {
        if (index < 0 || index >= this.groups.length) {
            return "";
        }
        return this.groups[index];
    }

    /**
     * Returns the last parenthesis group, or the empty string if the groups array
     * holds nothing beyond the whole match.
     *
     * @return the last group or ""
     */
    public Object getLastParen() {
        final int count = this.groups.length;
        if (count <= 1) {
            return "";
        }
        return this.groups[count - 1];
    }

}
14.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 14.2 +++ b/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Fri Feb 22 16:31:10 2013 +0100 14.3 @@ -0,0 +1,1391 @@ 14.4 +/* 14.5 + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 14.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 14.7 + * 14.8 + * This code is free software; you can redistribute it and/or modify it 14.9 + * under the terms of the GNU General Public License version 2 only, as 14.10 + * published by the Free Software Foundation. Oracle designates this 14.11 + * particular file as subject to the "Classpath" exception as provided 14.12 + * by Oracle in the LICENSE file that accompanied this code. 14.13 + * 14.14 + * This code is distributed in the hope that it will be useful, but WITHOUT 14.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14.16 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14.17 + * version 2 for more details (a copy is included in the LICENSE file that 14.18 + * accompanied this code). 14.19 + * 14.20 + * You should have received a copy of the GNU General Public License version 14.21 + * 2 along with this work; if not, write to the Free Software Foundation, 14.22 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 14.23 + * 14.24 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 14.25 + * or visit www.oracle.com if you need additional information or have any 14.26 + * questions. 
14.27 + */ 14.28 + 14.29 +package jdk.nashorn.internal.runtime.regexp; 14.30 + 14.31 +import java.util.ArrayList; 14.32 +import java.util.HashMap; 14.33 +import java.util.Iterator; 14.34 +import java.util.LinkedHashMap; 14.35 +import java.util.LinkedList; 14.36 +import java.util.List; 14.37 +import java.util.Map; 14.38 +import java.util.regex.PatternSyntaxException; 14.39 + 14.40 +import jdk.nashorn.internal.parser.Scanner; 14.41 +import jdk.nashorn.internal.runtime.BitVector; 14.42 + 14.43 +/** 14.44 + * Scan a JavaScript regexp, converting to Java regex if necessary. 14.45 + * 14.46 + */ 14.47 +final class RegExpScanner extends Scanner { 14.48 + 14.49 + /** 14.50 + * String builder to accumulate the result - this contains verbatim parsed JavaScript. 14.51 + * to get the java equivalent we need to create a Pattern token and return its toString() 14.52 + */ 14.53 + private final StringBuilder sb; 14.54 + 14.55 + /** Is this the special case of a regexp that never matches anything */ 14.56 + private boolean neverMatches; 14.57 + 14.58 + /** The resulting java.util.regex pattern string. */ 14.59 + private String javaPattern; 14.60 + 14.61 + /** Expected token table */ 14.62 + private final Map<Character, Integer> expected = new HashMap<>(); 14.63 + 14.64 + /** Capturing parenthesis that have been found so far. */ 14.65 + private final List<Capture> caps = new LinkedList<>(); 14.66 + 14.67 + /** Forward references to capturing parenthesis to be resolved later.*/ 14.68 + private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>(); 14.69 + 14.70 + /** Current level of zero-width negative lookahead assertions. */ 14.71 + private int negativeLookaheadLevel; 14.72 + 14.73 + private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?"; 14.74 + 14.75 + private static class Capture { 14.76 + /** 14.77 + * Zero-width negative lookaheads enclosing the capture. 
14.78 + */ 14.79 + private final int negativeLookaheadLevel; 14.80 + /** 14.81 + * Captures that live inside a negative lookahead are dead after the 14.82 + * lookahead and will be undefined if referenced from outside. 14.83 + */ 14.84 + private boolean isDead; 14.85 + 14.86 + Capture(final int negativeLookaheadLevel) { 14.87 + this.negativeLookaheadLevel = negativeLookaheadLevel; 14.88 + } 14.89 + 14.90 + public int getNegativeLookaheadLevel() { 14.91 + return negativeLookaheadLevel; 14.92 + } 14.93 + 14.94 + public boolean isDead() { 14.95 + return isDead; 14.96 + } 14.97 + 14.98 + public void setDead() { 14.99 + this.isDead = true; 14.100 + } 14.101 + } 14.102 + 14.103 + /** 14.104 + * This is a token - the JavaScript regexp is scanned into a token tree 14.105 + * A token has other tokens as children as well as "atoms", i.e. Strings. 14.106 + */ 14.107 + private static class Token { 14.108 + 14.109 + private enum Type { 14.110 + PATTERN, 14.111 + DISJUNCTION, 14.112 + ALTERNATIVE, 14.113 + TERM, 14.114 + ASSERTION, 14.115 + QUANTIFIER, 14.116 + QUANTIFIER_PREFIX, 14.117 + ATOM, 14.118 + PATTERN_CHARACTER, 14.119 + ATOM_ESCAPE, 14.120 + CHARACTER_ESCAPE, 14.121 + CONTROL_ESCAPE, 14.122 + CONTROL_LETTER, 14.123 + IDENTITY_ESCAPE, 14.124 + DECIMAL_ESCAPE, 14.125 + CHARACTERCLASS_ESCAPE, 14.126 + CHARACTERCLASS, 14.127 + CLASSRANGES, 14.128 + NON_EMPTY_CLASSRANGES, 14.129 + NON_EMPTY_CLASSRANGES_NODASH, 14.130 + CLASSATOM, 14.131 + CLASSATOM_NODASH, 14.132 + CLASS_ESCAPE, 14.133 + DECIMALDIGITS, 14.134 + HEX_ESCAPESEQUENCE, 14.135 + UNICODE_ESCAPESEQUENCE, 14.136 + } 14.137 + 14.138 + /** 14.139 + * Token tyoe 14.140 + */ 14.141 + private final Token.Type type; 14.142 + 14.143 + /** 14.144 + * Child nodes 14.145 + */ 14.146 + private final List<Object> children; 14.147 + 14.148 + /** 14.149 + * Parent node 14.150 + */ 14.151 + private Token parent; 14.152 + 14.153 + /** 14.154 + * Dead code flag 14.155 + */ 14.156 + private boolean isDead; 14.157 + 14.158 + private 
static final Map<Type, ToString> toStringMap = new HashMap<>(); 14.159 + private static final ToString DEFAULT_TOSTRING = new ToString(); 14.160 + 14.161 + private static String unicode(final int value) { 14.162 + final StringBuilder sb = new StringBuilder(); 14.163 + final String hex = Integer.toHexString(value); 14.164 + sb.append('u'); 14.165 + for (int i = 0; i < 4 - hex.length(); i++) { 14.166 + sb.append('0'); 14.167 + } 14.168 + sb.append(hex); 14.169 + 14.170 + return sb.toString(); 14.171 + } 14.172 + 14.173 + static { 14.174 + toStringMap.put(Type.CHARACTERCLASS, new ToString() { 14.175 + @Override 14.176 + public String toString(final Token token) { 14.177 + return super.toString(token).replace("\\b", "\b"); 14.178 + } 14.179 + }); 14.180 + 14.181 + // for some reason java regexps don't like control characters on the 14.182 + // form "\\ca".match([string with ascii 1 at char0]). Translating 14.183 + // them to unicode does it though. 14.184 + toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() { 14.185 + @Override 14.186 + public String toString(final Token token) { 14.187 + final String str = super.toString(token); 14.188 + if (str.length() == 2) { 14.189 + return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1); 14.190 + } 14.191 + return str; 14.192 + } 14.193 + }); 14.194 + 14.195 + toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() { 14.196 + @Override 14.197 + public String toString(final Token token) { 14.198 + final String str = super.toString(token); 14.199 + 14.200 + if ("\0".equals(str)) { 14.201 + return str; 14.202 + } 14.203 + 14.204 + int value; 14.205 + 14.206 + if (!token.hasParentOfType(Type.CLASSRANGES)) { 14.207 + return str; 14.208 + } 14.209 + 14.210 + value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal 14.211 + if (value > 0xff) { 14.212 + throw new NumberFormatException(str); 14.213 + } 14.214 + 14.215 + return Token.unicode(value); 14.216 + } 14.217 + }); 14.218 + 14.219 + } 
14.220 + 14.221 + /** 14.222 + * JavaScript Token to Java regex substring framework. 14.223 + */ 14.224 + private static class ToString { 14.225 + String toString(final Token token) { 14.226 + final Object[] children = token.getChildren(); 14.227 + 14.228 + // Allow the installed regexp factory to perform global substitutions. 14.229 + switch (children.length) { 14.230 + case 0: 14.231 + return ""; 14.232 + case 1: 14.233 + return RegExpFactory.replace(children[0].toString()); 14.234 + default: 14.235 + final StringBuilder sb = new StringBuilder(); 14.236 + for (final Object child : children) { 14.237 + sb.append(child); 14.238 + } 14.239 + return RegExpFactory.replace(sb.toString()); 14.240 + } 14.241 + } 14.242 + } 14.243 + 14.244 + /** 14.245 + * Token iterator. Doesn't return "atom" children. i.e. string representations, 14.246 + * just tokens 14.247 + * 14.248 + */ 14.249 + private static class TokenIterator implements Iterator<Token> { 14.250 + private final List<Token> preorder; 14.251 + 14.252 + private void init(final Token root) { 14.253 + preorder.add(root); 14.254 + for (final Object child : root.getChildren()) { 14.255 + if (child instanceof Token) { 14.256 + init((Token)child); 14.257 + } 14.258 + } 14.259 + } 14.260 + 14.261 + TokenIterator(final Token root) { 14.262 + preorder = new ArrayList<>(); 14.263 + init(root); 14.264 + } 14.265 + 14.266 + @Override 14.267 + public boolean hasNext() { 14.268 + return !preorder.isEmpty(); 14.269 + } 14.270 + 14.271 + @Override 14.272 + public Token next() { 14.273 + return preorder.remove(0); 14.274 + } 14.275 + 14.276 + @Override 14.277 + public void remove() { 14.278 + next(); 14.279 + } 14.280 + } 14.281 + 14.282 + /** 14.283 + * Constructor 14.284 + * @param type the token type 14.285 + */ 14.286 + Token(final Token.Type type) { 14.287 + this.type = type; 14.288 + children = new ArrayList<>(); 14.289 + } 14.290 + 14.291 + /** 14.292 + * Add a an "atom" child to a token 14.293 + * @param child the child to 
add 14.294 + * @return the token (for chaining) 14.295 + */ 14.296 + public Token add(final String child) { 14.297 + children.add(child); 14.298 + return this; 14.299 + } 14.300 + 14.301 + /** 14.302 + * Add a child to a token 14.303 + * @param child the child 14.304 + * @return the token (for chaining) 14.305 + */ 14.306 + public Token add(final Token child) { 14.307 + if (child != null) { 14.308 + children.add(child); 14.309 + child.setParent(this); 14.310 + } 14.311 + return this; 14.312 + } 14.313 + 14.314 + /** 14.315 + * Remove a child from a token 14.316 + * @param child the child to remove 14.317 + * @return true if successful 14.318 + */ 14.319 + public boolean remove(final Token child) { 14.320 + return children.remove(child); 14.321 + } 14.322 + 14.323 + /** 14.324 + * Remove the last child from a token 14.325 + * @return the removed child 14.326 + */ 14.327 + public Object removeLast() { 14.328 + return children.remove(children.size() - 1); 14.329 + } 14.330 + 14.331 + /** 14.332 + * Flag this token as dead code 14.333 + * @param isDead is it dead or not 14.334 + */ 14.335 + private void setIsDead(final boolean isDead) { 14.336 + this.isDead = isDead; 14.337 + } 14.338 + 14.339 + /** 14.340 + * Is this token dead code 14.341 + * @return boolean 14.342 + */ 14.343 + private boolean getIsDead() { 14.344 + return isDead; 14.345 + } 14.346 + 14.347 + /** 14.348 + * Get the parent of this token 14.349 + * @return parent token 14.350 + */ 14.351 + public Token getParent() { 14.352 + return parent; 14.353 + } 14.354 + 14.355 + public boolean hasParentOfType(final Token.Type parentType) { 14.356 + for (Token p = getParent(); p != null; p = p.getParent()) { 14.357 + if (p.getType() == parentType) { 14.358 + return true; 14.359 + } 14.360 + } 14.361 + return false; 14.362 + } 14.363 + 14.364 + public boolean hasChildOfType(final Token.Type childType) { 14.365 + for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) { 14.366 + if (iter.next().getType() 
== childType) { 14.367 + return true; 14.368 + } 14.369 + } 14.370 + return false; 14.371 + } 14.372 + 14.373 + /** 14.374 + * Set the parent of this token 14.375 + * @param parent 14.376 + */ 14.377 + private void setParent(final Token parent) { 14.378 + this.parent = parent; 14.379 + } 14.380 + 14.381 + /** 14.382 + * Get the children of this token 14.383 + * @return an array of children, never null 14.384 + */ 14.385 + public Object[] getChildren() { 14.386 + return children.toArray(); 14.387 + } 14.388 + 14.389 + /** 14.390 + * Reset this token, remove all children 14.391 + */ 14.392 + public void reset() { 14.393 + children.clear(); 14.394 + } 14.395 + 14.396 + /** 14.397 + * Get a preorder token iterator with this token as root 14.398 + * @return an iterator 14.399 + */ 14.400 + public Iterator<Token> iterator() { 14.401 + return new TokenIterator(this); 14.402 + } 14.403 + 14.404 + /** 14.405 + * Get the type of this token 14.406 + * @return type 14.407 + */ 14.408 + public Type getType() { 14.409 + return type; 14.410 + } 14.411 + 14.412 + /** 14.413 + * Turn this token into Java regexp compatible text 14.414 + * @return part of a java regexp 14.415 + */ 14.416 + @Override 14.417 + public String toString() { 14.418 + ToString t = toStringMap.get(getType()); 14.419 + if (t == null) { 14.420 + t = DEFAULT_TOSTRING; 14.421 + } 14.422 + return t.toString(this); 14.423 + } 14.424 + } 14.425 + 14.426 + /** 14.427 + * Constructor 14.428 + * @param string the JavaScript regexp to parse 14.429 + */ 14.430 + private RegExpScanner(final String string) { 14.431 + super(string); 14.432 + sb = new StringBuilder(limit); 14.433 + reset(0); 14.434 + expected.put(']', 0); 14.435 + expected.put('}', 0); 14.436 + } 14.437 + 14.438 + private void processForwardReferences() { 14.439 + if (neverMatches()) { 14.440 + return; 14.441 + } 14.442 + 14.443 + for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) { 14.444 + if (fwdRef.getKey().intValue() > 
caps.size()) { 14.445 + neverMatches = true; 14.446 + break; 14.447 + } 14.448 + 14.449 + fwdRef.getValue().setIsDead(true); 14.450 + } 14.451 + 14.452 + forwardReferences.clear(); 14.453 + } 14.454 + 14.455 + /** 14.456 + * Scan a JavaScript regexp string returning a Java safe regex string. 14.457 + * 14.458 + * @param string 14.459 + * JavaScript regexp string. 14.460 + * @return Java safe regex string. 14.461 + */ 14.462 + public static RegExpScanner scan(final String string) { 14.463 + final RegExpScanner scanner = new RegExpScanner(string); 14.464 + 14.465 + Token pattern; 14.466 + 14.467 + try { 14.468 + pattern = scanner.pattern(); 14.469 + } catch (final Exception e) { 14.470 + throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length()); 14.471 + } 14.472 + 14.473 + scanner.processForwardReferences(); 14.474 + if (scanner.neverMatches()) { 14.475 + return null; // never matches 14.476 + } 14.477 + 14.478 + // go over the code and remove dead code 14.479 + final Iterator<Token> iter = pattern.iterator(); 14.480 + while (iter.hasNext()) { 14.481 + final Token next = iter.next(); 14.482 + if (next.getIsDead()) { 14.483 + next.getParent().remove(next); 14.484 + } 14.485 + } 14.486 + 14.487 + // turn the pattern into a string, p, the java equivalent string for our js regexp 14.488 + final String p = pattern.toString(); 14.489 + // if builder contains all tokens that were sent in, we know 14.490 + // we correctly parsed the entire JavaScript regexp without syntax errors 14.491 + if (!string.equals(scanner.getStringBuilder().toString())) { 14.492 + throw new PatternSyntaxException(string, p, p.length() + 1); 14.493 + } 14.494 + 14.495 + scanner.javaPattern = p; 14.496 + return scanner; 14.497 + } 14.498 + 14.499 + /** 14.500 + * Does this regexp ever match anything? Use of e.g. 
/**
 * Commit n characters to the builder and to a given token.
 * The current lookahead characters (ch0, ch1, ch2) are appended to the output
 * builder and the input is advanced by the same amount via skip().
 *
 * @param token Uncommitted token; may be null, in which case the characters
 *              are only appended to the builder.
 * @param n     Number of characters; only 1, 2 and 3 are handled.
 * @return Committed token, or null if token was null
 */
private Token commit(final Token token, final int n) {
    // input position before consuming; reused below as an index into sb
    // NOTE(review): presumably sb mirrors the consumed input one-to-one, so the
    // input position is also the output offset - confirm against restart()/reset()
    final int startIn = position;

    switch (n) {
    case 1:
        sb.append(ch0);
        skip(1);
        break;
    case 2:
        sb.append(ch0);
        sb.append(ch1);
        skip(2);
        break;
    case 3:
        sb.append(ch0);
        sb.append(ch1);
        sb.append(ch2);
        skip(3);
        break;
    default:
        // with assertions disabled an unsupported n falls through and commits nothing
        assert false : "Should not reach here";
    }

    if (token == null) {
        return null;
    }

    // add the text that was just appended to the builder as an atom child
    return token.add(sb.substring(startIn, sb.length()));
}
14.574 + */ 14.575 + private void restart(final int startIn, final int startOut) { 14.576 + reset(startIn); 14.577 + sb.setLength(startOut); 14.578 + } 14.579 + 14.580 + private void push(final char ch) { 14.581 + expected.put(ch, expected.get(ch) + 1); 14.582 + } 14.583 + 14.584 + private void pop(final char ch) { 14.585 + expected.put(ch, Math.min(0, expected.get(ch) - 1)); 14.586 + } 14.587 + 14.588 + /* 14.589 + * Recursive descent tokenizer starts below. 14.590 + */ 14.591 + 14.592 + /* 14.593 + * Pattern :: 14.594 + * Disjunction 14.595 + */ 14.596 + private Token pattern() { 14.597 + final Token token = new Token(Token.Type.PATTERN); 14.598 + 14.599 + final Token child = disjunction(); 14.600 + return token.add(child); 14.601 + } 14.602 + 14.603 + /* 14.604 + * Disjunction :: 14.605 + * Alternative 14.606 + * Alternative | Disjunction 14.607 + */ 14.608 + private Token disjunction() { 14.609 + final Token token = new Token(Token.Type.DISJUNCTION); 14.610 + 14.611 + while (true) { 14.612 + token.add(alternative()); 14.613 + 14.614 + if (ch0 == '|') { 14.615 + commit(token, 1); 14.616 + } else { 14.617 + break; 14.618 + } 14.619 + } 14.620 + 14.621 + return token; 14.622 + } 14.623 + 14.624 + /* 14.625 + * Alternative :: 14.626 + * [empty] 14.627 + * Alternative Term 14.628 + */ 14.629 + private Token alternative() { 14.630 + final Token token = new Token(Token.Type.ALTERNATIVE); 14.631 + 14.632 + Token child; 14.633 + while ((child = term()) != null) { 14.634 + token.add(child); 14.635 + } 14.636 + 14.637 + return token; 14.638 + } 14.639 + 14.640 + /* 14.641 + * Term :: 14.642 + * Assertion 14.643 + * Atom 14.644 + * Atom Quantifier 14.645 + */ 14.646 + private Token term() { 14.647 + final int startIn = position; 14.648 + final int startOut = sb.length(); 14.649 + final Token token = new Token(Token.Type.TERM); 14.650 + Token child; 14.651 + 14.652 + child = assertion(); 14.653 + if (child != null) { 14.654 + return token.add(child); 14.655 + } 14.656 + 
14.657 + child = atom(); 14.658 + if (child != null) { 14.659 + boolean emptyCharacterClass = false; 14.660 + if ("[]".equals(child.toString())) { 14.661 + emptyCharacterClass = true; 14.662 + } 14.663 + 14.664 + token.add(child); 14.665 + 14.666 + final Token quantifier = quantifier(); 14.667 + if (quantifier != null) { 14.668 + token.add(quantifier); 14.669 + } 14.670 + 14.671 + if (emptyCharacterClass) { 14.672 + if (quantifier == null) { 14.673 + neverMatches = true; //never matches ever. 14.674 + } else { 14.675 + //if we can get away with max zero, remove this entire token 14.676 + final String qs = quantifier.toString(); 14.677 + if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) { 14.678 + token.setIsDead(true); 14.679 + } 14.680 + } 14.681 + } 14.682 + 14.683 + return token; 14.684 + } 14.685 + 14.686 + restart(startIn, startOut); 14.687 + return null; 14.688 + } 14.689 + 14.690 + /* 14.691 + * Assertion :: 14.692 + * ^ 14.693 + * $ 14.694 + * \b 14.695 + * \B 14.696 + * ( ? = Disjunction ) 14.697 + * ( ? ! 
/*
 * Assertion ::
 *      ^
 *      $
 *      \b
 *      \B
 *      ( ? = Disjunction )
 *      ( ? ! Disjunction )
 *
 * Returns the ASSERTION token, or null (with input and output buffers
 * restored) if the input at the current position is not an assertion.
 */
private Token assertion() {
    final int startIn  = position;
    final int startOut = sb.length();
    final Token token  = new Token(Token.Type.ASSERTION);

    switch (ch0) {
    case '^':
    case '$':
        return commit(token, 1);

    case '\\':
        if (ch1 == 'b' || ch1 == 'B') {
            return commit(token, 2);
        }
        break;

    case '(':
        // only "(?=" and "(?!" are assertions; other groups are handled by atom()
        if (ch1 != '?') {
            break;
        }
        if (ch2 != '=' && ch2 != '!') {
            break;
        }
        // remember the kind before commit() advances past ch2
        final boolean isNegativeLookahead = (ch2 == '!');
        commit(token, 3);

        // the level is raised while scanning the body so that captures created
        // inside record that they live in a negative lookahead
        if (isNegativeLookahead) {
            negativeLookaheadLevel++;
        }
        final Token disjunction = disjunction();
        if (isNegativeLookahead) {
            // captures recorded at this level or deeper are dead once the
            // lookahead has been left
            for (final Capture cap : caps) {
                if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
                    cap.setDead();
                }
            }
            negativeLookaheadLevel--;
        }

        if (disjunction != null && ch0 == ')') {
            token.add(disjunction);
            return commit(token, 1);
        }
        break;

    default:
        break;
    }

    // not an assertion: undo anything committed above
    restart(startIn, startOut);

    return null;
}
14.776 + * { DecimalDigits } 14.777 + * { DecimalDigits , } 14.778 + * { DecimalDigits , DecimalDigits } 14.779 + */ 14.780 + private Token quantifierPrefix() { 14.781 + final int startIn = position; 14.782 + final int startOut = sb.length(); 14.783 + final Token token = new Token(Token.Type.QUANTIFIER_PREFIX); 14.784 + 14.785 + switch (ch0) { 14.786 + case '*': 14.787 + case '+': 14.788 + case '?': 14.789 + return commit(token, 1); 14.790 + 14.791 + case '{': 14.792 + commit(token, 1); 14.793 + 14.794 + final Token child = decimalDigits(); 14.795 + if (child == null) { 14.796 + break; // not a quantifier - back out 14.797 + } 14.798 + push('}'); 14.799 + token.add(child); 14.800 + 14.801 + if (ch0 == ',') { 14.802 + commit(token, 1); 14.803 + token.add(decimalDigits()); 14.804 + } 14.805 + 14.806 + if (ch0 == '}') { 14.807 + pop('}'); 14.808 + commit(token, 1); 14.809 + } 14.810 + 14.811 + return token; 14.812 + 14.813 + default: 14.814 + break; 14.815 + } 14.816 + 14.817 + restart(startIn, startOut); 14.818 + return null; 14.819 + } 14.820 + 14.821 + /* 14.822 + * Atom :: 14.823 + * PatternCharacter 14.824 + * . 14.825 + * \ AtomEscape 14.826 + * CharacterClass 14.827 + * ( Disjunction ) 14.828 + * ( ? 
/*
 * Atom ::
 *      PatternCharacter
 *      .
 *      \ AtomEscape
 *      CharacterClass
 *      ( Disjunction )
 *      ( ? : Disjunction )
 *
 * Returns the ATOM token, or null (with input and output buffers restored)
 * if no atom could be scanned at the current position.
 */
private Token atom() {
    final int startIn  = position;
    final int startOut = sb.length();
    final Token token  = new Token(Token.Type.ATOM);
    Token child;

    child = patternCharacter();
    if (child != null) {
        return token.add(child);
    }

    if (ch0 == '.') {
        return commit(token, 1);
    }

    if (ch0 == '\\') {
        commit(token, 1);
        child = atomEscape();

        if (child != null) {
            if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
                final char idEscape = child.toString().charAt(0);
                // drop the backslash committed above unless the escaped
                // character is listed in NON_IDENT_ESCAPES
                if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
                    token.reset();
                }
            }

            token.add(child);

            // forward backreferences always match empty. JavaScript != Java
            if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
                // NOTE(review): a reference number of 0 (e.g. from "\00") would make
                // caps.get(refNum - 1) below index -1 - confirm how scan() is meant to report this
                final int refNum = Integer.parseInt(child.toString());

                if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
                    // reference to dead in-negative-lookahead capture
                    token.setIsDead(true);
                } else if (caps.size() < refNum) {
                    // forward reference: always matches against empty string (dead token).
                    // invalid reference (non-existent capture): pattern never matches.
                    forwardReferences.put(refNum, token);
                }
            }

            return token;
        }
    }

    child = characterClass();
    if (child != null) {
        return token.add(child);
    }

    if (ch0 == '(') {
        boolean capturingParens = true;
        commit(token, 1);
        if (ch0 == '?' && ch1 == ':') {
            capturingParens = false;
            commit(token, 2);
        }

        child = disjunction();
        if (child != null) {
            token.add(child);
            if (ch0 == ')') {
                final Token atom = commit(token, 1);
                // the capture is registered only once its closing parenthesis has been seen
                if (capturingParens) {
                    caps.add(new Capture(negativeLookaheadLevel));
                }
                return atom;
            }
        }
    }

    restart(startIn, startOut);
    return null;
}
commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null; 14.944 + 14.945 + default: 14.946 + return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER 14.947 + } 14.948 + } 14.949 + 14.950 + /* 14.951 + * AtomEscape :: 14.952 + * DecimalEscape 14.953 + * CharacterEscape 14.954 + * CharacterClassEscape 14.955 + */ 14.956 + private Token atomEscape() { 14.957 + final Token token = new Token(Token.Type.ATOM_ESCAPE); 14.958 + Token child; 14.959 + 14.960 + child = decimalEscape(); 14.961 + if (child != null) { 14.962 + return token.add(child); 14.963 + } 14.964 + 14.965 + child = characterClassEscape(); 14.966 + if (child != null) { 14.967 + return token.add(child); 14.968 + } 14.969 + 14.970 + child = characterEscape(); 14.971 + if (child != null) { 14.972 + return token.add(child); 14.973 + } 14.974 + 14.975 + 14.976 + return null; 14.977 + } 14.978 + 14.979 + /* 14.980 + * CharacterEscape :: 14.981 + * ControlEscape 14.982 + * c ControlLetter 14.983 + * HexEscapeSequence 14.984 + * UnicodeEscapeSequence 14.985 + * IdentityEscape 14.986 + */ 14.987 + private Token characterEscape() { 14.988 + final int startIn = position; 14.989 + final int startOut = sb.length(); 14.990 + 14.991 + final Token token = new Token(Token.Type.CHARACTER_ESCAPE); 14.992 + Token child; 14.993 + 14.994 + child = controlEscape(); 14.995 + if (child != null) { 14.996 + return token.add(child); 14.997 + } 14.998 + 14.999 + if (ch0 == 'c') { 14.1000 + commit(token, 1); 14.1001 + child = controlLetter(); 14.1002 + if (child != null) { 14.1003 + return token.add(child); 14.1004 + } 14.1005 + restart(startIn, startOut); 14.1006 + } 14.1007 + 14.1008 + child = hexEscapeSequence(); 14.1009 + if (child != null) { 14.1010 + return token.add(child); 14.1011 + } 14.1012 + 14.1013 + child = unicodeEscapeSequence(); 14.1014 + if (child != null) { 14.1015 + return token.add(child); 14.1016 + } 14.1017 + 14.1018 + child = identityEscape(); 14.1019 + if (child != null) { 
14.1020 + return token.add(child); 14.1021 + } 14.1022 + 14.1023 + restart(startIn, startOut); 14.1024 + 14.1025 + return null; 14.1026 + } 14.1027 + 14.1028 + private boolean scanEscapeSequence(final char leader, final int length, final Token token) { 14.1029 + final int startIn = position; 14.1030 + final int startOut = sb.length(); 14.1031 + 14.1032 + if (ch0 != leader) { 14.1033 + return false; 14.1034 + } 14.1035 + 14.1036 + commit(token, 1); 14.1037 + for (int i = 0; i < length; i++) { 14.1038 + final char ch0l = Character.toLowerCase(ch0); 14.1039 + if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { 14.1040 + commit(token, 1); 14.1041 + } else { 14.1042 + restart(startIn, startOut); 14.1043 + return false; 14.1044 + } 14.1045 + } 14.1046 + 14.1047 + return true; 14.1048 + } 14.1049 + 14.1050 + private Token hexEscapeSequence() { 14.1051 + final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE); 14.1052 + if (scanEscapeSequence('x', 2, token)) { 14.1053 + return token; 14.1054 + } 14.1055 + return null; 14.1056 + } 14.1057 + 14.1058 + private Token unicodeEscapeSequence() { 14.1059 + final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE); 14.1060 + if (scanEscapeSequence('u', 4, token)) { 14.1061 + return token; 14.1062 + } 14.1063 + return null; 14.1064 + } 14.1065 + 14.1066 + /* 14.1067 + * ControlEscape :: 14.1068 + * one of fnrtv 14.1069 + */ 14.1070 + private Token controlEscape() { 14.1071 + switch (ch0) { 14.1072 + case 'f': 14.1073 + case 'n': 14.1074 + case 'r': 14.1075 + case 't': 14.1076 + case 'v': 14.1077 + return commit(new Token(Token.Type.CONTROL_ESCAPE), 1); 14.1078 + 14.1079 + default: 14.1080 + return null; 14.1081 + } 14.1082 + } 14.1083 + 14.1084 + /* 14.1085 + * ControlLetter :: 14.1086 + * one of abcdefghijklmnopqrstuvwxyz 14.1087 + * ABCDEFGHIJKLMNOPQRSTUVWXYZ 14.1088 + */ 14.1089 + private Token controlLetter() { 14.1090 + final char c = Character.toUpperCase(ch0); 14.1091 + if (c >= 'A' && c <= 'Z') { 14.1092 + 
final Token token = new Token(Token.Type.CONTROL_LETTER); 14.1093 + commit(token, 1); 14.1094 + return token; 14.1095 + } 14.1096 + return null; 14.1097 + /* 14.1098 + Token token = new Token(Token.Type.CONTROL_LETTER); 14.1099 + commit(null, 1);//add original char to builder not to token 14.1100 + this.neverMatches = c < 'A' || c > 'Z'; 14.1101 + return token.add(""+c);*/ 14.1102 + } 14.1103 + 14.1104 + /* 14.1105 + * IdentityEscape :: 14.1106 + * SourceCharacter but not IdentifierPart 14.1107 + * <ZWJ> (200c) 14.1108 + * <ZWNJ> (200d) 14.1109 + */ 14.1110 + private Token identityEscape() { 14.1111 + final Token token = new Token(Token.Type.IDENTITY_ESCAPE); 14.1112 + commit(token, 1); 14.1113 + return token; 14.1114 + } 14.1115 + 14.1116 + /* 14.1117 + * DecimalEscape :: 14.1118 + * DecimalIntegerLiteral [lookahead DecimalDigit] 14.1119 + */ 14.1120 + private Token decimalEscape() { 14.1121 + final Token token = new Token(Token.Type.DECIMAL_ESCAPE); 14.1122 + final int startIn = position; 14.1123 + final int startOut = sb.length(); 14.1124 + 14.1125 + if (ch0 == '0' && !isDecimalDigit(ch1)) { 14.1126 + commit(token, 1); 14.1127 + token.removeLast(); 14.1128 + // DecimalEscape :: 0. 
If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000); 14.1129 + return token.add("\u0000"); 14.1130 + } 14.1131 + 14.1132 + if (isDecimalDigit(ch0)) { 14.1133 + while (isDecimalDigit(ch0)) { 14.1134 + commit(token, 1); 14.1135 + } 14.1136 + return token; 14.1137 + } 14.1138 + 14.1139 + restart(startIn, startOut); 14.1140 + 14.1141 + return null; 14.1142 + } 14.1143 + 14.1144 + /* 14.1145 + * CharacterClassEscape :: 14.1146 + * one of dDsSwW 14.1147 + */ 14.1148 + private Token characterClassEscape() { 14.1149 + switch (ch0) { 14.1150 + case 's': 14.1151 + case 'S': 14.1152 + case 'd': 14.1153 + case 'D': 14.1154 + case 'w': 14.1155 + case 'W': 14.1156 + return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1); 14.1157 + 14.1158 + default: 14.1159 + return null; 14.1160 + } 14.1161 + } 14.1162 + 14.1163 + /* 14.1164 + * CharacterClass :: 14.1165 + * [ [lookahead {^}] ClassRanges ] 14.1166 + * [ ^ ClassRanges ] 14.1167 + */ 14.1168 + private Token characterClass() { 14.1169 + final int startIn = position; 14.1170 + final int startOut = sb.length(); 14.1171 + final Token token = new Token(Token.Type.CHARACTERCLASS); 14.1172 + 14.1173 + if (ch0 == '[') { 14.1174 + push(']'); 14.1175 + commit(token, 1); 14.1176 + 14.1177 + if (ch0 == '^') { 14.1178 + commit(token, 1); 14.1179 + } 14.1180 + 14.1181 + final Token child = classRanges(); 14.1182 + if (child != null && ch0 == ']') { 14.1183 + pop(']'); 14.1184 + token.add(child); 14.1185 + return commit(token, 1); 14.1186 + } 14.1187 + } 14.1188 + 14.1189 + restart(startIn, startOut); 14.1190 + return null; 14.1191 + } 14.1192 + 14.1193 + /* 14.1194 + * ClassRanges :: 14.1195 + * [empty] 14.1196 + * NonemptyClassRanges 14.1197 + */ 14.1198 + private Token classRanges() { 14.1199 + return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges()); 14.1200 + } 14.1201 + 14.1202 + /* 14.1203 + * NonemptyClassRanges :: 14.1204 + * ClassAtom 14.1205 + * ClassAtom NonemptyClassRangesNoDash 
/*
 * NonemptyClassRanges ::
 *      ClassAtom
 *      ClassAtom NonemptyClassRangesNoDash
 *      ClassAtom - ClassAtom ClassRanges
 *
 * Returns the NON_EMPTY_CLASSRANGES token, or null (with input and output
 * buffers restored) if no class atom starts at the current position.
 */
private Token nonemptyClassRanges() {
    final int startIn  = position;
    final int startOut = sb.length();
    final Token token  = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
    Token child;

    child = classAtom();
    if (child != null) {
        token.add(child);

        if (ch0 == '-') {
            // the dash is committed to this token before the range end is known
            commit(token, 1);

            final Token child1 = classAtom();
            final Token child2 = classRanges();
            if (child1 != null && child2 != null) {
                token.add(child1);
                token.add(child2);

                return token;
            }
            // no range end: fall through, the dash committed above stays in the token
        }

        child = nonemptyClassRangesNoDash();
        if (child != null) {
            token.add(child);
            return token;
        }

        return token;
    }

    restart(startIn, startOut);
    return null;
}
[a-b|c-d] will otherwise parse - as an atom 14.1261 + if (ch0 == '-') { 14.1262 + commit(token, 1); 14.1263 + 14.1264 + final Token child1 = classAtom(); 14.1265 + final Token child2 = classRanges(); 14.1266 + if (child1 != null && child2 != null) { 14.1267 + token.add(child1); 14.1268 + return token.add(child2); 14.1269 + } 14.1270 + //fallthru 14.1271 + } 14.1272 + 14.1273 + child = nonemptyClassRangesNoDash(); 14.1274 + if (child != null) { 14.1275 + token.add(child); 14.1276 + } 14.1277 + return token; // still a class atom 14.1278 + } 14.1279 + 14.1280 + child = classAtom(); 14.1281 + if (child != null) { 14.1282 + return token.add(child); 14.1283 + } 14.1284 + 14.1285 + restart(startIn, startOut); 14.1286 + return null; 14.1287 + } 14.1288 + 14.1289 + /* 14.1290 + * ClassAtom : - ClassAtomNoDash 14.1291 + */ 14.1292 + private Token classAtom() { 14.1293 + final Token token = new Token(Token.Type.CLASSATOM); 14.1294 + 14.1295 + if (ch0 == '-') { 14.1296 + return commit(token, 1); 14.1297 + } 14.1298 + 14.1299 + final Token child = classAtomNoDash(); 14.1300 + if (child != null) { 14.1301 + return token.add(child); 14.1302 + } 14.1303 + 14.1304 + return null; 14.1305 + } 14.1306 + 14.1307 + /* 14.1308 + * ClassAtomNoDash :: 14.1309 + * SourceCharacter but not one of \ or ] or - 14.1310 + * \ ClassEscape 14.1311 + */ 14.1312 + private Token classAtomNoDash() { 14.1313 + final int startIn = position; 14.1314 + final int startOut = sb.length(); 14.1315 + final Token token = new Token(Token.Type.CLASSATOM_NODASH); 14.1316 + 14.1317 + switch (ch0) { 14.1318 + case ']': 14.1319 + case '-': 14.1320 + case '\0': 14.1321 + return null; 14.1322 + 14.1323 + case '[': 14.1324 + // unescaped left square bracket - add escape 14.1325 + return commit(token.add("\\"), 1); 14.1326 + 14.1327 + case '\\': 14.1328 + commit(token, 1); 14.1329 + final Token child = classEscape(); 14.1330 + if (child != null) { 14.1331 + return token.add(child); 14.1332 + } 14.1333 + 14.1334 + 
restart(startIn, startOut); 14.1335 + return null; 14.1336 + 14.1337 + default: 14.1338 + return commit(token, 1); 14.1339 + } 14.1340 + } 14.1341 + 14.1342 + /* 14.1343 + * ClassEscape :: 14.1344 + * DecimalEscape 14.1345 + * b 14.1346 + * CharacterEscape 14.1347 + * CharacterClassEscape 14.1348 + */ 14.1349 + private Token classEscape() { 14.1350 + final Token token = new Token(Token.Type.CLASS_ESCAPE); 14.1351 + Token child; 14.1352 + 14.1353 + child = decimalEscape(); 14.1354 + if (child != null) { 14.1355 + return token.add(child); 14.1356 + } 14.1357 + 14.1358 + if (ch0 == 'b') { 14.1359 + return commit(token, 1); 14.1360 + } 14.1361 + 14.1362 + child = characterEscape(); 14.1363 + if (child != null) { 14.1364 + return token.add(child); 14.1365 + } 14.1366 + 14.1367 + child = characterClassEscape(); 14.1368 + if (child != null) { 14.1369 + return token.add(child); 14.1370 + } 14.1371 + 14.1372 + return null; 14.1373 + } 14.1374 + 14.1375 + /* 14.1376 + * DecimalDigits 14.1377 + */ 14.1378 + private Token decimalDigits() { 14.1379 + if (!isDecimalDigit(ch0)) { 14.1380 + return null; 14.1381 + } 14.1382 + 14.1383 + final Token token = new Token(Token.Type.DECIMALDIGITS); 14.1384 + while (isDecimalDigit(ch0)) { 14.1385 + commit(token, 1); 14.1386 + } 14.1387 + 14.1388 + return token; 14.1389 + } 14.1390 + 14.1391 + private static boolean isDecimalDigit(final char ch) { 14.1392 + return ch >= '0' && ch <= '9'; 14.1393 + } 14.1394 +}