Wed, 10 Apr 2013 14:05:11 +0200
8011749: Bugs with empty character class handling
Reviewed-by: lagergren, attila
1 /*
2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
26 package jdk.nashorn.internal.runtime.regexp;
28 import java.util.HashMap;
29 import java.util.Iterator;
30 import java.util.LinkedList;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.regex.PatternSyntaxException;
35 import jdk.nashorn.internal.parser.Lexer;
36 import jdk.nashorn.internal.parser.Scanner;
37 import jdk.nashorn.internal.runtime.BitVector;
39 /**
40 * Scan a JavaScript regexp, converting to Java regex if necessary.
41 *
42 */
43 final class RegExpScanner extends Scanner {
45 /**
46 * String builder used to rewrite the pattern for the currently used regexp factory.
47 */
48 private final StringBuilder sb;
50 /** Expected token table */
51 private final Map<Character, Integer> expected = new HashMap<>();
53 /** Capturing parenthesis that have been found so far. */
54 private final List<Capture> caps = new LinkedList<>();
56 /** Forward references to capturing parenthesis to be resolved later.*/
57 private final LinkedList<Integer> forwardReferences = new LinkedList<>();
59 /** Current level of zero-width negative lookahead assertions. */
60 private int negativeLookaheadLevel;
62 /** Are we currently inside a character class? */
63 private boolean inCharClass = false;
65 /** Are we currently inside a negated character class? */
66 private boolean inNegativeClass = false;
68 private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
70 private static class Capture {
71 /**
72 * Zero-width negative lookaheads enclosing the capture.
73 */
74 private final int negativeLookaheadLevel;
76 Capture(final int negativeLookaheadLevel) {
77 this.negativeLookaheadLevel = negativeLookaheadLevel;
78 }
80 public int getNegativeLookaheadLevel() {
81 return negativeLookaheadLevel;
82 }
84 }
86 /**
87 * Constructor
88 * @param string the JavaScript regexp to parse
89 */
90 private RegExpScanner(final String string) {
91 super(string);
92 sb = new StringBuilder(limit);
93 reset(0);
94 expected.put(']', 0);
95 expected.put('}', 0);
96 }
98 private void processForwardReferences() {
100 Iterator<Integer> iterator = forwardReferences.descendingIterator();
101 while (iterator.hasNext()) {
102 final int pos = iterator.next();
103 final int num = iterator.next();
104 if (num > caps.size()) {
105 // Non-existing backreference. If the number begins with a valid octal convert it to
106 // Unicode escape and append the rest to a literal character sequence.
107 final StringBuilder buffer = new StringBuilder();
108 octalOrLiteral(Integer.toString(num), buffer);
109 sb.insert(pos, buffer);
110 }
111 }
113 forwardReferences.clear();
114 }
116 /**
117 * Scan a JavaScript regexp string returning a Java safe regex string.
118 *
119 * @param string
120 * JavaScript regexp string.
121 * @return Java safe regex string.
122 */
123 public static RegExpScanner scan(final String string) {
124 final RegExpScanner scanner = new RegExpScanner(string);
126 try {
127 scanner.disjunction();
128 } catch (final Exception e) {
129 throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
130 }
132 scanner.processForwardReferences();
134 // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
135 if (scanner.position != string.length()) {
136 final String p = scanner.getStringBuilder().toString();
137 throw new PatternSyntaxException(string, p, p.length() + 1);
138 }
140 return scanner;
141 }
143 final StringBuilder getStringBuilder() {
144 return sb;
145 }
147 String getJavaPattern() {
148 return sb.toString();
149 }
151 BitVector getGroupsInNegativeLookahead() {
152 BitVector vec = null;
153 for (int i = 0; i < caps.size(); i++) {
154 final Capture cap = caps.get(i);
155 if (cap.getNegativeLookaheadLevel() > 0) {
156 if (vec == null) {
157 vec = new BitVector(caps.size() + 1);
158 }
159 vec.set(i + 1);
160 }
161 }
162 return vec;
163 }
165 /**
166 * Commit n characters to the builder and to a given token
167 * @param n Number of characters.
168 * @return Committed token
169 */
170 private boolean commit(final int n) {
171 switch (n) {
172 case 1:
173 sb.append(ch0);
174 skip(1);
175 break;
176 case 2:
177 sb.append(ch0);
178 sb.append(ch1);
179 skip(2);
180 break;
181 case 3:
182 sb.append(ch0);
183 sb.append(ch1);
184 sb.append(ch2);
185 skip(3);
186 break;
187 default:
188 assert false : "Should not reach here";
189 }
191 return true;
192 }
194 /**
195 * Restart the buffers back at an earlier position.
196 *
197 * @param startIn
198 * Position in the input stream.
199 * @param startOut
200 * Position in the output stream.
201 */
202 private void restart(final int startIn, final int startOut) {
203 reset(startIn);
204 sb.setLength(startOut);
205 }
207 private void push(final char ch) {
208 expected.put(ch, expected.get(ch) + 1);
209 }
211 private void pop(final char ch) {
212 expected.put(ch, Math.min(0, expected.get(ch) - 1));
213 }
215 /*
216 * Recursive descent tokenizer starts below.
217 */
219 /*
220 * Disjunction ::
221 * Alternative
222 * Alternative | Disjunction
223 */
224 private void disjunction() {
225 while (true) {
226 alternative();
228 if (ch0 == '|') {
229 commit(1);
230 } else {
231 break;
232 }
233 }
234 }
236 /*
237 * Alternative ::
238 * [empty]
239 * Alternative Term
240 */
241 private void alternative() {
242 while (term()) {
243 // do nothing
244 }
245 }
247 /*
248 * Term ::
249 * Assertion
250 * Atom
251 * Atom Quantifier
252 */
253 private boolean term() {
254 final int startIn = position;
255 final int startOut = sb.length();
257 if (assertion()) {
258 return true;
259 }
261 if (atom()) {
262 // Check for character classes that never or always match
263 if (sb.toString().endsWith("[]")) {
264 sb.setLength(sb.length() - 1);
265 sb.append("^\\s\\S]");
266 } else if (sb.toString().endsWith("[^]")) {
267 sb.setLength(sb.length() - 2);
268 sb.append("\\s\\S]");
269 }
271 quantifier();
272 return true;
273 }
275 restart(startIn, startOut);
276 return false;
277 }
279 /*
280 * Assertion ::
281 * ^
282 * $
283 * \b
284 * \B
285 * ( ? = Disjunction )
286 * ( ? ! Disjunction )
287 */
288 private boolean assertion() {
289 final int startIn = position;
290 final int startOut = sb.length();
292 switch (ch0) {
293 case '^':
294 case '$':
295 return commit(1);
297 case '\\':
298 if (ch1 == 'b' || ch1 == 'B') {
299 return commit(2);
300 }
301 break;
303 case '(':
304 if (ch1 != '?') {
305 break;
306 }
307 if (ch2 != '=' && ch2 != '!') {
308 break;
309 }
310 final boolean isNegativeLookahead = (ch2 == '!');
311 commit(3);
313 if (isNegativeLookahead) {
314 negativeLookaheadLevel++;
315 }
316 disjunction();
317 if (isNegativeLookahead) {
318 negativeLookaheadLevel--;
319 }
321 if (ch0 == ')') {
322 return commit(1);
323 }
324 break;
326 default:
327 break;
328 }
330 restart(startIn, startOut);
331 return false;
332 }
334 /*
335 * Quantifier ::
336 * QuantifierPrefix
337 * QuantifierPrefix ?
338 */
339 private boolean quantifier() {
340 if (quantifierPrefix()) {
341 if (ch0 == '?') {
342 commit(1);
343 }
344 return true;
345 }
346 return false;
347 }
349 /*
350 * QuantifierPrefix ::
351 * *
352 * +
353 * ?
354 * { DecimalDigits }
355 * { DecimalDigits , }
356 * { DecimalDigits , DecimalDigits }
357 */
358 private boolean quantifierPrefix() {
359 final int startIn = position;
360 final int startOut = sb.length();
362 switch (ch0) {
363 case '*':
364 case '+':
365 case '?':
366 return commit(1);
368 case '{':
369 commit(1);
371 if (!decimalDigits()) {
372 break; // not a quantifier - back out
373 }
374 push('}');
376 if (ch0 == ',') {
377 commit(1);
378 decimalDigits();
379 }
381 if (ch0 == '}') {
382 pop('}');
383 commit(1);
384 } else {
385 // Bad quantifier should be rejected but is accepted by all major engines
386 restart(startIn, startOut);
387 return false;
388 }
390 return true;
392 default:
393 break;
394 }
396 restart(startIn, startOut);
397 return false;
398 }
400 /*
401 * Atom ::
402 * PatternCharacter
403 * .
404 * \ AtomEscape
405 * CharacterClass
406 * ( Disjunction )
407 * ( ? : Disjunction )
408 *
409 */
410 private boolean atom() {
411 final int startIn = position;
412 final int startOut = sb.length();
414 if (patternCharacter()) {
415 return true;
416 }
418 if (ch0 == '.') {
419 return commit(1);
420 }
422 if (ch0 == '\\') {
423 commit(1);
425 if (atomEscape()) {
426 return true;
427 }
428 }
430 if (characterClass()) {
431 return true;
432 }
434 if (ch0 == '(') {
435 boolean capturingParens = true;
436 commit(1);
437 if (ch0 == '?' && ch1 == ':') {
438 capturingParens = false;
439 commit(2);
440 }
442 disjunction();
444 if (ch0 == ')') {
445 commit(1);
446 if (capturingParens) {
447 caps.add(new Capture(negativeLookaheadLevel));
448 }
449 return true;
450 }
451 }
453 restart(startIn, startOut);
454 return false;
455 }
457 /*
458 * PatternCharacter ::
459 * SourceCharacter but not any of: ^$\.*+?()[]{}|
460 */
461 @SuppressWarnings("fallthrough")
462 private boolean patternCharacter() {
463 if (atEOF()) {
464 return false;
465 }
467 switch (ch0) {
468 case '^':
469 case '$':
470 case '\\':
471 case '.':
472 case '*':
473 case '+':
474 case '?':
475 case '(':
476 case ')':
477 case '[':
478 case '|':
479 return false;
481 case '}':
482 case ']':
483 final int n = expected.get(ch0);
484 if (n != 0) {
485 return false;
486 }
488 case '{':
489 // if not a valid quantifier escape curly brace to match itself
490 // this ensures compatibility with other JS implementations
491 if (!quantifierPrefix()) {
492 sb.append('\\');
493 return commit(1);
494 }
495 return false;
497 default:
498 return commit(1); // SOURCECHARACTER
499 }
500 }
502 /*
503 * AtomEscape ::
504 * DecimalEscape
505 * CharacterEscape
506 * CharacterClassEscape
507 */
508 private boolean atomEscape() {
509 // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
510 return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
511 }
513 /*
514 * CharacterEscape ::
515 * ControlEscape
516 * c ControlLetter
517 * HexEscapeSequence
518 * UnicodeEscapeSequence
519 * IdentityEscape
520 */
521 private boolean characterEscape() {
522 final int startIn = position;
523 final int startOut = sb.length();
525 if (controlEscape()) {
526 return true;
527 }
529 if (ch0 == 'c') {
530 commit(1);
531 if (controlLetter()) {
532 return true;
533 }
534 restart(startIn, startOut);
535 }
537 if (hexEscapeSequence() || unicodeEscapeSequence()) {
538 return true;
539 }
541 restart(startIn, startOut);
542 return false;
543 }
545 private boolean scanEscapeSequence(final char leader, final int length) {
546 final int startIn = position;
547 final int startOut = sb.length();
549 if (ch0 != leader) {
550 return false;
551 }
553 commit(1);
554 for (int i = 0; i < length; i++) {
555 final char ch0l = Character.toLowerCase(ch0);
556 if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
557 commit(1);
558 } else {
559 restart(startIn, startOut);
560 return false;
561 }
562 }
564 return true;
565 }
567 private boolean hexEscapeSequence() {
568 return scanEscapeSequence('x', 2);
569 }
571 private boolean unicodeEscapeSequence() {
572 return scanEscapeSequence('u', 4);
573 }
575 /*
576 * ControlEscape ::
577 * one of fnrtv
578 */
579 private boolean controlEscape() {
580 switch (ch0) {
581 case 'f':
582 case 'n':
583 case 'r':
584 case 't':
585 case 'v':
586 return commit(1);
588 default:
589 return false;
590 }
591 }
593 /*
594 * ControlLetter ::
595 * one of abcdefghijklmnopqrstuvwxyz
596 * ABCDEFGHIJKLMNOPQRSTUVWXYZ
597 */
598 private boolean controlLetter() {
599 final char c = Character.toUpperCase(ch0);
600 if (c >= 'A' && c <= 'Z') {
601 // for some reason java regexps don't like control characters on the
602 // form "\\ca".match([string with ascii 1 at char0]). Translating
603 // them to unicode does it though.
604 sb.setLength(sb.length() - 1);
605 unicode(c - 'A' + 1, sb);
606 skip(1);
607 return true;
608 }
609 return false;
610 }
612 /*
613 * IdentityEscape ::
614 * SourceCharacter but not IdentifierPart
615 * <ZWJ> (200c)
616 * <ZWNJ> (200d)
617 */
618 private boolean identityEscape() {
619 if (atEOF()) {
620 throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
621 }
622 // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
623 if (ch0 == 'c') {
624 // Ignore invalid control letter escape if within a character class
625 if (inCharClass && ch1 != ']') {
626 sb.setLength(sb.length() - 1);
627 skip(2);
628 return true;
629 } else {
630 sb.append('\\'); // Treat invalid \c control sequence as \\c
631 }
632 } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
633 sb.setLength(sb.length() - 1);
634 }
635 return commit(1);
636 }
638 /*
639 * DecimalEscape ::
640 * DecimalIntegerLiteral [lookahead DecimalDigit]
641 */
642 private boolean decimalEscape() {
643 final int startIn = position;
644 final int startOut = sb.length();
646 if (ch0 == '0' && !isOctalDigit(ch1)) {
647 skip(1);
648 // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
649 sb.append("\u0000");
650 return true;
651 }
653 if (isDecimalDigit(ch0)) {
655 if (ch0 == '0') {
656 // We know this is an octal escape.
657 if (inCharClass) {
658 // Convert octal escape to unicode escape if inside character class.
659 int octalValue = 0;
660 while (isOctalDigit(ch0)) {
661 octalValue = octalValue * 8 + ch0 - '0';
662 skip(1);
663 }
665 unicode(octalValue, sb);
667 } else {
668 // Copy decimal escape as-is
669 decimalDigits();
670 }
671 } else {
672 // This should be a backreference, but could also be an octal escape or even a literal string.
673 int decimalValue = 0;
674 while (isDecimalDigit(ch0)) {
675 decimalValue = decimalValue * 10 + ch0 - '0';
676 skip(1);
677 }
679 if (inCharClass) {
680 // No backreferences in character classes. Encode as unicode escape or literal char sequence
681 sb.setLength(sb.length() - 1);
682 octalOrLiteral(Integer.toString(decimalValue), sb);
684 } else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) {
685 // Captures that live inside a negative lookahead are dead after the
686 // lookahead and will be undefined if referenced from outside.
687 if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
688 sb.setLength(sb.length() - 1);
689 } else {
690 sb.append(decimalValue);
691 }
692 } else if (decimalValue > caps.size()) {
693 // Forward reference to a capture group. Forward references are always undefined so we can omit
694 // it from the output buffer. However, if the target capture does not exist, we need to rewrite
695 // the reference as hex escape or literal string, so register the reference for later processing.
696 sb.setLength(sb.length() - 1);
697 forwardReferences.add(decimalValue);
698 forwardReferences.add(sb.length());
699 } else {
700 // Append as backreference
701 sb.append(decimalValue);
702 }
704 }
705 return true;
706 }
708 restart(startIn, startOut);
709 return false;
710 }
712 /*
713 * CharacterClassEscape ::
714 * one of dDsSwW
715 */
716 private boolean characterClassEscape() {
717 switch (ch0) {
718 // java.util.regex requires translation of \s and \S to explicit character list
719 case 's':
720 if (RegExpFactory.usesJavaUtilRegex()) {
721 sb.setLength(sb.length() - 1);
722 // No nested class required if we already are inside a character class
723 if (inCharClass) {
724 sb.append(Lexer.getWhitespaceRegExp());
725 } else {
726 sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
727 }
728 skip(1);
729 return true;
730 }
731 return commit(1);
732 case 'S':
733 if (RegExpFactory.usesJavaUtilRegex()) {
734 sb.setLength(sb.length() - 1);
735 // In negative class we must use intersection to get double negation ("not anything else than space")
736 sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']');
737 skip(1);
738 return true;
739 }
740 return commit(1);
741 case 'd':
742 case 'D':
743 case 'w':
744 case 'W':
745 return commit(1);
747 default:
748 return false;
749 }
750 }
752 /*
753 * CharacterClass ::
754 * [ [lookahead {^}] ClassRanges ]
755 * [ ^ ClassRanges ]
756 */
757 private boolean characterClass() {
758 final int startIn = position;
759 final int startOut = sb.length();
761 if (ch0 == '[') {
762 try {
763 inCharClass = true;
764 push(']');
765 commit(1);
767 if (ch0 == '^') {
768 inNegativeClass = true;
769 commit(1);
770 }
772 if (classRanges() && ch0 == ']') {
773 pop(']');
774 return commit(1);
775 }
776 } finally {
777 inCharClass = false; // no nested character classes in JavaScript
778 inNegativeClass = false;
779 }
780 }
782 restart(startIn, startOut);
783 return false;
784 }
786 /*
787 * ClassRanges ::
788 * [empty]
789 * NonemptyClassRanges
790 */
791 private boolean classRanges() {
792 nonemptyClassRanges();
793 return true;
794 }
796 /*
797 * NonemptyClassRanges ::
798 * ClassAtom
799 * ClassAtom NonemptyClassRangesNoDash
800 * ClassAtom - ClassAtom ClassRanges
801 */
802 private boolean nonemptyClassRanges() {
803 final int startIn = position;
804 final int startOut = sb.length();
806 if (classAtom()) {
808 if (ch0 == '-') {
809 commit(1);
811 if (classAtom() && classRanges()) {
812 return true;
813 }
814 }
816 nonemptyClassRangesNoDash();
818 return true;
819 }
821 restart(startIn, startOut);
822 return false;
823 }
825 /*
826 * NonemptyClassRangesNoDash ::
827 * ClassAtom
828 * ClassAtomNoDash NonemptyClassRangesNoDash
829 * ClassAtomNoDash - ClassAtom ClassRanges
830 */
831 private boolean nonemptyClassRangesNoDash() {
832 final int startIn = position;
833 final int startOut = sb.length();
835 if (classAtomNoDash()) {
837 // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
838 if (ch0 == '-') {
839 commit(1);
841 if (classAtom() && classRanges()) {
842 return true;
843 }
844 //fallthru
845 }
847 nonemptyClassRangesNoDash();
848 return true; // still a class atom
849 }
851 if (classAtom()) {
852 return true;
853 }
855 restart(startIn, startOut);
856 return false;
857 }
859 /*
860 * ClassAtom : - ClassAtomNoDash
861 */
862 private boolean classAtom() {
864 if (ch0 == '-') {
865 return commit(1);
866 }
868 return classAtomNoDash();
869 }
871 /*
872 * ClassAtomNoDash ::
873 * SourceCharacter but not one of \ or ] or -
874 * \ ClassEscape
875 */
876 private boolean classAtomNoDash() {
877 final int startIn = position;
878 final int startOut = sb.length();
880 switch (ch0) {
881 case ']':
882 case '-':
883 case '\0':
884 return false;
886 case '[':
887 // unescaped left square bracket - add escape
888 sb.append('\\');
889 return commit(1);
891 case '\\':
892 commit(1);
893 if (classEscape()) {
894 return true;
895 }
897 restart(startIn, startOut);
898 return false;
900 default:
901 return commit(1);
902 }
903 }
905 /*
906 * ClassEscape ::
907 * DecimalEscape
908 * b
909 * CharacterEscape
910 * CharacterClassEscape
911 */
912 private boolean classEscape() {
914 if (decimalEscape()) {
915 return true;
916 }
918 if (ch0 == 'b') {
919 sb.setLength(sb.length() - 1);
920 sb.append('\b');
921 skip(1);
922 return true;
923 }
925 // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
926 return characterEscape() || characterClassEscape() || identityEscape();
927 }
929 /*
930 * DecimalDigits
931 */
932 private boolean decimalDigits() {
933 if (!isDecimalDigit(ch0)) {
934 return false;
935 }
937 while (isDecimalDigit(ch0)) {
938 commit(1);
939 }
941 return true;
942 }
944 private void unicode(final int value, final StringBuilder buffer) {
945 final String hex = Integer.toHexString(value);
946 buffer.append('u');
947 for (int i = 0; i < 4 - hex.length(); i++) {
948 buffer.append('0');
949 }
950 buffer.append(hex);
951 }
953 // Convert what would have been a backreference into a unicode escape, or a number literal, or both.
954 private void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
955 final int length = numberLiteral.length();
956 int octalValue = 0;
957 int pos = 0;
958 // Maximum value for octal escape is 0377 (255) so we stop the loop at 32
959 while (pos < length && octalValue < 0x20) {
960 final char ch = numberLiteral.charAt(pos);
961 if (isOctalDigit(ch)) {
962 octalValue = octalValue * 8 + ch - '0';
963 } else {
964 break;
965 }
966 pos++;
967 }
968 if (octalValue > 0) {
969 buffer.append('\\');
970 unicode(octalValue, buffer);
971 buffer.append(numberLiteral.substring(pos));
972 } else {
973 buffer.append(numberLiteral);
974 }
975 }
977 private static boolean isOctalDigit(final char ch) {
978 return ch >= '0' && ch <= '7';
979 }
981 private static boolean isDecimalDigit(final char ch) {
982 return ch >= '0' && ch <= '9';
983 }
984 }