# HG changeset patch # User aoqi # Date 1603528983 -28800 # Node ID aaee9ae4799a8af5b67c20b2215ecffe72403c79 # Parent 3b8ebb9579576cefad64c1549d6d12c127943a73# Parent ba503169016f66e4f52e75eca4b3f09d45ebef30 Merge diff -r 3b8ebb957957 -r aaee9ae4799a .hgtags --- a/.hgtags Sat Oct 24 16:18:47 2020 +0800 +++ b/.hgtags Sat Oct 24 16:43:03 2020 +0800 @@ -1069,8 +1069,23 @@ 560093f3167970da2935b745493653420fdea008 jdk8u262-b03 d054aabd2e3c09de0ff622b4fab09388d30aee02 jdk8u262-b04 976e73cfac410997160b1d3d6e14a88a324440c3 jdk8u262-b05 +976e73cfac410997160b1d3d6e14a88a324440c3 jdk8u272-b00 ddbd856338439f2d5f742040d896e27f0f104cd1 jdk8u262-b06 ebb0a284b7e75dfb741af3332eb87b37aca66875 jdk8u262-b07 0cccb32a50471fd52ecf2f697d95e7254798ab26 jdk8u262-b08 779db06fb02444e294b7c93fe3902afee615df2a jdk8u262-b09 +63884b34cac1b652cf49289199a00cb363cb93dd jdk8u262-b10 +63884b34cac1b652cf49289199a00cb363cb93dd jdk8u262-ga 3b85d4e65538af51987a00b276cde9c250615f9d mips-jdk8u262-b10 +63884b34cac1b652cf49289199a00cb363cb93dd jdk8u265-b00 +3147b24fc8b092b34599830b56d03da4731577a2 jdk8u265-b01 +3147b24fc8b092b34599830b56d03da4731577a2 jdk8u265-ga +1bc3598fbad03fa73168f64cea4d0628e75a292b jdk8u272-b01 +7694bb86e0236ba9a89326206af46da8c252aad5 jdk8u272-b02 +370157535629da61a0f0ac045d77c384b98211f6 jdk8u272-b03 +89445883ffdec61e5b32980633b67d932d602582 jdk8u272-b04 +36d18f0fd6eeffc14f311dc5ff5a18ae870fc1d0 jdk8u272-b05 +44cbebcc276cddad3ad0aa67f4da313d50af7e4b jdk8u272-b06 +bd015816ce490762772ca71c86bd90f58a90fb8c jdk8u272-b07 +9d92962b2fe312a045e5814d4604d00e04492515 jdk8u272-b08 +a5b79eebcc1f3c9afbe9927d672be64364647049 jdk8u272-b09 diff -r 3b8ebb957957 -r aaee9ae4799a THIRD_PARTY_README --- a/THIRD_PARTY_README Sat Oct 24 16:18:47 2020 +0800 +++ b/THIRD_PARTY_README Sat Oct 24 16:43:03 2020 +0800 @@ -2240,7 +2240,7 @@ ------------------------------------------------------------------------------- -%% This notice is provided with respect to PC/SC Lite v1.8.24, +%% This notice is provided with respect to PC/SC Lite v1.8.26, which may be included with JRE 8, JDK 8, and OpenJDK 8 on Linux and Solaris. --- begin of LICENSE --- @@ -3028,8 +3028,7 @@ Apache Commons Math 3.2 Apache Derby 10.11.1.2 Apache Jakarta BCEL 5.1 - Apache Jakarta Regexp 1.4 - Apache Santuario XML Security for Java 1.5.4 + Apache Santuario XML Security for Java 2.1.1 Apache Xalan-Java 2.7.2 Apache Xerces Java 2.10.0 Apache XML Resolver 1.1 @@ -3243,3 +3242,41 @@ ------------------------------------------------------------------------------- +%% This notice is provided with respect to OASIS PKCS #11 Cryptographic Token +Interface v2.40, which may be included with JRE 8, JDK 8, and OpenJDK 8. + +--- begin of LICENSE --- + +Copyright (c) OASIS Open 2016. All Rights Reserved. + +All capitalized terms in the following text have the meanings assigned to them +in the OASIS Intellectual Property Rights Policy (the "OASIS IPR Policy"). The +full Policy may be found at the OASIS website: +[http://www.oasis-open.org/policies-guidelines/ipr] + +This document and translations of it may be copied and furnished to others, and +derivative works that comment on or otherwise explain it or assist in its +implementation may be prepared, copied, published, and distributed, in whole or +in part, without restriction of any kind, provided that the above copyright +notice and this section are included on all such copies and derivative works. +However, this document itself may not be modified in any way, including by +removing the copyright notice or references to OASIS, except as needed for the +purpose of developing any document or deliverable produced by an OASIS +Technical Committee (in which case the rules applicable to copyrights, as set +forth in the OASIS IPR Policy, must be followed) or as required to translate it +into languages other than English. + +The limited permissions granted above are perpetual and will not be revoked by +OASIS or its successors or assigns. + +This document and the information contained herein is provided on an "AS IS" +basis and OASIS DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT +LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION HEREIN WILL NOT +INFRINGE ANY OWNERSHIP RIGHTS OR ANY IMPLIED WARRANTIES OF MERCHANTABILITY OR +FITNESS FOR A PARTICULAR PURPOSE. OASIS AND ITS MEMBERS WILL NOT BE LIABLE FOR +ANY DIRECT, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE +OF THIS DOCUMENT OR ANY PART THEREOF. + +--- end of LICENSE --- + +------------------------------------------------------------------------------- diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/bcel/internal/util/InstructionFinder.java --- a/src/com/sun/org/apache/bcel/internal/util/InstructionFinder.java Sat Oct 24 16:18:47 2020 +0800 +++ b/src/com/sun/org/apache/bcel/internal/util/InstructionFinder.java Sat Oct 24 16:43:03 2020 +0800 @@ -4,64 +4,29 @@ */ package com.sun.org.apache.bcel.internal.util; -/* ==================================================================== - * The Apache Software License, Version 1.1 +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. + * http://www.apache.org/licenses/LICENSE-2.0 * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache BCEL" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. Products derived from this software may not be called "Apache", - * "Apache BCEL", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . */ -import java.util.*; import com.sun.org.apache.bcel.internal.Constants; import com.sun.org.apache.bcel.internal.generic.*; -import com.sun.org.apache.regexp.internal.*; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * InstructionFinder is a tool to search for given instructions patterns, @@ -231,28 +196,22 @@ if(start == -1) throw new ClassGenException("Instruction handle " + from + " not found in instruction list."); - try { - RE regex = new RE(search); - ArrayList matches = new ArrayList(); - while(start < il_string.length() && regex.match(il_string, start)) { - int startExpr = regex.getParenStart(0); - int endExpr = regex.getParenEnd(0); - int lenExpr = regex.getParenLength(0); + Pattern regex = Pattern.compile(search); + List matches = new ArrayList<>(); + Matcher matcher = regex.matcher(il_string); + while(start < il_string.length() && matcher.find(start)) { + int startExpr = matcher.start(); + int endExpr = matcher.end(); + int lenExpr = endExpr - startExpr; + InstructionHandle[] match = getMatch(startExpr, lenExpr); - InstructionHandle[] match = getMatch(startExpr, lenExpr); - - if((constraint == null) || constraint.checkCode(match)) - matches.add(match); - start = endExpr; - } - - return matches.iterator(); - } catch(RESyntaxException e) { - System.err.println(e); + if((constraint == null) || constraint.checkCode(match)) + matches.add(match); + start = endExpr; } - return null; + return matches.iterator(); } /** diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/CharacterArrayCharacterIterator.java --- a/src/com/sun/org/apache/regexp/internal/CharacterArrayCharacterIterator.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -/** - * Encapsulates char[] as CharacterIterator - * - * @author Ales Novak - */ -public final class CharacterArrayCharacterIterator implements CharacterIterator -{ - /** encapsulated */ - private final char[] src; - /** offset in the char array */ - private final int off; - /** used portion of the array */ - private final int len; - - /** @param src - encapsulated String */ - public CharacterArrayCharacterIterator(char[] src, int off, int len) - { - this.src = src; - this.off = off; - this.len = len; - } - - /** @return a substring */ - public String substring(int beginIndex, int endIndex) - { - if (endIndex > len) { - throw new IndexOutOfBoundsException("endIndex=" + endIndex - + "; sequence size=" + len); - } - if (beginIndex < 0 || beginIndex > endIndex) { - throw new IndexOutOfBoundsException("beginIndex=" + beginIndex - + "; endIndex=" + endIndex); - } - return new String(src, off + beginIndex, endIndex - beginIndex); - } - - /** @return a substring */ - public String substring(int beginIndex) - { - return substring(beginIndex, len); - } - - /** @return a character at the specified position. */ - public char charAt(int pos) - { - return src[off + pos]; - } - - /** @return true iff if the specified index is after the end of the character stream */ - public boolean isEnd(int pos) - { - return (pos >= len); - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/CharacterIterator.java --- a/src/com/sun/org/apache/regexp/internal/CharacterIterator.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -/** - * Encapsulates different types of character sources - String, InputStream, ... - * Defines a set of common methods - * - * @author Ales Novak - */ -public interface CharacterIterator -{ - /** @return a substring */ - String substring(int beginIndex, int endIndex); - - /** @return a substring */ - String substring(int beginIndex); - - /** @return a character at the specified position. */ - char charAt(int pos); - - /** @return true iff if the specified index is after the end of the character stream */ - boolean isEnd(int pos); -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/RE.java --- a/src/com/sun/org/apache/regexp/internal/RE.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1760 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -import java.io.Serializable; -import java.util.Vector; - -/** - * RE is an efficient, lightweight regular expression evaluator/matcher - * class. Regular expressions are pattern descriptions which enable - * sophisticated matching of strings. In addition to being able to - * match a string against a pattern, you can also extract parts of the - * match. This is especially useful in text parsing! Details on the - * syntax of regular expression patterns are given below. - * - *

- * To compile a regular expression (RE), you can simply construct an RE - * matcher object from the string specification of the pattern, like this: - * - *

- *  RE r = new RE("a*b");
- * 
- * - *

- * Once you have done this, you can call either of the RE.match methods to - * perform matching on a String. For example: - * - *

- *  boolean matched = r.match("aaaab");
- * 
- * - * will cause the boolean matched to be set to true because the - * pattern "a*b" matches the string "aaaab". - * - *

- * If you were interested in the number of a's which matched the - * first part of our example expression, you could change the expression to - * "(a*)b". Then when you compiled the expression and matched it against - * something like "xaaaab", you would get results like this: - * - *

- *  RE r = new RE("(a*)b");                  // Compile expression
- *  boolean matched = r.match("xaaaab");     // Match against "xaaaab"
- *
- *  String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
- *  String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
- *
- *  int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
- *  int endWholeExpr = r.getParenEnd(0);     // endWholeExpr will be index 6
- *  int lenWholeExpr = r.getParenLength(0);  // lenWholeExpr will be 5
- *
- *  int startInside = r.getParenStart(1);    // startInside will be index 1
- *  int endInside = r.getParenEnd(1);        // endInside will be index 5
- *  int lenInside = r.getParenLength(1);     // lenInside will be 4
- * 
- * - * You can also refer to the contents of a parenthesized expression - * within a regular expression itself. This is called a - * 'backreference'. The first backreference in a regular expression is - * denoted by \1, the second by \2 and so on. So the expression: - * - *
- *  ([0-9]+)=\1
- * 
- * - * will match any string of the form n=n (like 0=0 or 2=2). - * - *

- * The full regular expression syntax accepted by RE is described here: - * - *

- *
- *  Characters
- *
- *    unicodeChar   Matches any identical unicode character
- *    \                    Used to quote a meta-character (like '*')
- *    \\                   Matches a single '\' character
- *    \0nnn                Matches a given octal character
- *    \xhh                 Matches a given 8-bit hexadecimal character
- *    \\uhhhh              Matches a given 16-bit hexadecimal character
- *    \t                   Matches an ASCII tab character
- *    \n                   Matches an ASCII newline character
- *    \r                   Matches an ASCII return character
- *    \f                   Matches an ASCII form feed character
- *
- *
- *  Character Classes
- *
- *    [abc]                Simple character class
- *    [a-zA-Z]             Character class with ranges
- *    [^abc]               Negated character class
- * 
- * - * NOTE: Incomplete ranges will be interpreted as "starts - * from zero" or "ends with last character". - *
- * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF], - * [-] means "all characters". - * - *
- *
- *  Standard POSIX Character Classes
- *
- *    [:alnum:]            Alphanumeric characters.
- *    [:alpha:]            Alphabetic characters.
- *    [:blank:]            Space and tab characters.
- *    [:cntrl:]            Control characters.
- *    [:digit:]            Numeric characters.
- *    [:graph:]            Characters that are printable and are also visible.
- *                         (A space is printable, but not visible, while an
- *                         `a' is both.)
- *    [:lower:]            Lower-case alphabetic characters.
- *    [:print:]            Printable characters (characters that are not
- *                         control characters.)
- *    [:punct:]            Punctuation characters (characters that are not letter,
- *                         digits, control characters, or space characters).
- *    [:space:]            Space characters (such as space, tab, and formfeed,
- *                         to name a few).
- *    [:upper:]            Upper-case alphabetic characters.
- *    [:xdigit:]           Characters that are hexadecimal digits.
- *
- *
- *  Non-standard POSIX-style Character Classes
- *
- *    [:javastart:]        Start of a Java identifier
- *    [:javapart:]         Part of a Java identifier
- *
- *
- *  Predefined Classes
- *
- *    .         Matches any character other than newline
- *    \w        Matches a "word" character (alphanumeric plus "_")
- *    \W        Matches a non-word character
- *    \s        Matches a whitespace character
- *    \S        Matches a non-whitespace character
- *    \d        Matches a digit character
- *    \D        Matches a non-digit character
- *
- *
- *  Boundary Matchers
- *
- *    ^         Matches only at the beginning of a line
- *    $         Matches only at the end of a line
- *    \b        Matches only at a word boundary
- *    \B        Matches only at a non-word boundary
- *
- *
- *  Greedy Closures
- *
- *    A*        Matches A 0 or more times (greedy)
- *    A+        Matches A 1 or more times (greedy)
- *    A?        Matches A 1 or 0 times (greedy)
- *    A{n}      Matches A exactly n times (greedy)
- *    A{n,}     Matches A at least n times (greedy)
- *    A{n,m}    Matches A at least n but not more than m times (greedy)
- *
- *
- *  Reluctant Closures
- *
- *    A*?       Matches A 0 or more times (reluctant)
- *    A+?       Matches A 1 or more times (reluctant)
- *    A??       Matches A 0 or 1 times (reluctant)
- *
- *
- *  Logical Operators
- *
- *    AB        Matches A followed by B
- *    A|B       Matches either A or B
- *    (A)       Used for subexpression grouping
- *   (?:A)      Used for subexpression clustering (just like grouping but
- *              no backrefs)
- *
- *
- *  Backreferences
- *
- *    \1    Backreference to 1st parenthesized subexpression
- *    \2    Backreference to 2nd parenthesized subexpression
- *    \3    Backreference to 3rd parenthesized subexpression
- *    \4    Backreference to 4th parenthesized subexpression
- *    \5    Backreference to 5th parenthesized subexpression
- *    \6    Backreference to 6th parenthesized subexpression
- *    \7    Backreference to 7th parenthesized subexpression
- *    \8    Backreference to 8th parenthesized subexpression
- *    \9    Backreference to 9th parenthesized subexpression
- * 
- * - *

- * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning - * that they match as many elements of the string as possible without - * causing the overall match to fail. If you want a closure to be - * reluctant (non-greedy), you can simply follow it with a '?'. A - * reluctant closure will match as few elements of the string as - * possible when finding matches. {m,n} closures don't currently - * support reluctancy. - * - *

- * Line terminators - *
- * A line terminator is a one- or two-character sequence that marks - * the end of a line of the input character sequence. The following - * are recognized as line terminators: - *

- * - *

- * RE runs programs compiled by the RECompiler class. But the RE - * matcher class does not include the actual regular expression compiler - * for reasons of efficiency. In fact, if you want to pre-compile one - * or more regular expressions, the 'recompile' class can be invoked - * from the command line to produce compiled output like this: - * - *

- *    // Pre-compiled regular expression "a*b"
- *    char[] re1Instructions =
- *    {
- *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
- *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
- *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
- *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
- *        0x0000,
- *    };
- *
- *
- *    REProgram re1 = new REProgram(re1Instructions);
- * 
- * - * You can then construct a regular expression matcher (RE) object from - * the pre-compiled expression re1 and thus avoid the overhead of - * compiling the expression at runtime. If you require more dynamic - * regular expressions, you can construct a single RECompiler object and - * re-use it to compile each expression. Similarly, you can change the - * program run by a given matcher object at any time. However, RE and - * RECompiler are not threadsafe (for efficiency reasons, and because - * requiring thread safety in this class is deemed to be a rare - * requirement), so you will need to construct a separate compiler or - * matcher object for each thread (unless you do thread synchronization - * yourself). Once expression compiled into the REProgram object, REProgram - * can be safely shared across multiple threads and RE objects. - * - *


- * - * - * ISSUES: - * - *

- * - * - * - * @see recompile - * @see RECompiler - * - * @author Jonathan Locke - * @author Tobias Schäfer - */ -public class RE implements Serializable -{ - /** - * Specifies normal, case-sensitive matching behaviour. - */ - public static final int MATCH_NORMAL = 0x0000; - - /** - * Flag to indicate that matching should be case-independent (folded) - */ - public static final int MATCH_CASEINDEPENDENT = 0x0001; - - /** - * Newlines should match as BOL/EOL (^ and $) - */ - public static final int MATCH_MULTILINE = 0x0002; - - /** - * Consider all input a single body of text - newlines are matched by . - */ - public static final int MATCH_SINGLELINE = 0x0004; - - /************************************************ - * * - * The format of a node in a program is: * - * * - * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] * - * * - * char OPCODE - instruction * - * char OPDATA - modifying data * - * char OPNEXT - next node (relative offset) * - * * - ************************************************/ - - // Opcode Char Opdata/Operand Meaning - // ---------- ---------- --------------- -------------------------------------------------- - static final char OP_END = 'E'; // end of program - static final char OP_BOL = '^'; // match only if at beginning of line - static final char OP_EOL = '$'; // match only if at end of line - static final char OP_ANY = '.'; // match any single character except newline - static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges - static final char OP_BRANCH = '|'; // node match this alternative or the next one - static final char OP_ATOM = 'A'; // length/string length of string followed by string itself - static final char OP_STAR = '*'; // node kleene closure - static final char OP_PLUS = '+'; // node positive closure - static final char OP_MAYBE = '?'; // node optional closure - static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code) - static final char OP_OPEN = '('; // number nth opening paren - static final char OP_OPEN_CLUSTER = '<'; // opening cluster - static final char OP_CLOSE = ')'; // number nth closing paren - static final char OP_CLOSE_CLUSTER = '>'; // closing cluster - static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string - static final char OP_GOTO = 'G'; // nothing but a (back-)pointer - static final char OP_NOTHING = 'N'; // match null string such as in '(a|)' - static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*') - static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+') - static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?') - static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes - - // Escape codes - static final char E_ALNUM = 'w'; // Alphanumeric - static final char E_NALNUM = 'W'; // Non-alphanumeric - static final char E_BOUND = 'b'; // Word boundary - static final char E_NBOUND = 'B'; // Non-word boundary - static final char E_SPACE = 's'; // Whitespace - static final char E_NSPACE = 'S'; // Non-whitespace - static final char E_DIGIT = 'd'; // Digit - static final char E_NDIGIT = 'D'; // Non-digit - - // Posix character classes - static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics - static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics - static final char POSIX_CLASS_BLANK = 'b'; // Blanks - static final char POSIX_CLASS_CNTRL = 'c'; // Control characters - static final char POSIX_CLASS_DIGIT = 'd'; // Digits - static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters - static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters - static final char POSIX_CLASS_PRINT = 'p'; // Printable characters - static final char POSIX_CLASS_PUNCT = '!'; // Punctuation - static final char POSIX_CLASS_SPACE = 's'; // Spaces - static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters - static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits - static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start - static final char POSIX_CLASS_JPART = 'k'; // Java identifier part - - // Limits - static final int maxNode = 65536; // Maximum number of nodes in a program - static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs) - - // Node layout constants - static final int offsetOpcode = 0; // Opcode offset (first character) - static final int offsetOpdata = 1; // Opdata offset (second char) - static final int offsetNext = 2; // Next index offset (third char) - static final int nodeSize = 3; // Node size (in chars) - - // State of current program - REProgram program; // Compiled regular expression 'program' - transient CharacterIterator search; // The string being matched against - int matchFlags; // Match behaviour flags - int maxParen = MAX_PAREN; - - // Parenthesized subexpressions - transient int parenCount; // Number of subexpressions matched (num open parens + 1) - transient int start0; // Cache of start[0] - transient int end0; // Cache of start[0] - transient int start1; // Cache of start[1] - transient int end1; // Cache of start[1] - transient int start2; // Cache of start[2] - transient int end2; // Cache of start[2] - transient int[] startn; // Lazy-alloced array of sub-expression starts - transient int[] endn; // Lazy-alloced array of sub-expression ends - - // Backreferences - transient int[] startBackref; // Lazy-alloced array of backref starts - transient int[] endBackref; // Lazy-alloced array of backref ends - - /** - * Constructs a regular expression matcher from a String by compiling it - * using a new instance of RECompiler. If you will be compiling many - * expressions, you may prefer to use a single RECompiler object instead. - * - * @param pattern The regular expression pattern to compile. - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - * @see RECompiler - * @see recompile - */ - public RE(String pattern) throws RESyntaxException - { - this(pattern, MATCH_NORMAL); - } - - /** - * Constructs a regular expression matcher from a String by compiling it - * using a new instance of RECompiler. If you will be compiling many - * expressions, you may prefer to use a single RECompiler object instead. - * - * @param pattern The regular expression pattern to compile. - * @param matchFlags The matching style - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - * @see RECompiler - * @see recompile - */ - public RE(String pattern, int matchFlags) throws RESyntaxException - { - this(new RECompiler().compile(pattern)); - setMatchFlags(matchFlags); - } - - /** - * Construct a matcher for a pre-compiled regular expression from program - * (bytecode) data. Permits special flags to be passed in to modify matching - * behaviour. - * - * @param program Compiled regular expression program (see RECompiler and/or recompile) - * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*): - * - *
-     *   MATCH_NORMAL              // Normal (case-sensitive) matching
-     *   MATCH_CASEINDEPENDENT     // Case folded comparisons
-     *   MATCH_MULTILINE           // Newline matches as BOL/EOL
-     * 
- * - * @see RECompiler - * @see REProgram - * @see recompile - */ - public RE(REProgram program, int matchFlags) - { - setProgram(program); - setMatchFlags(matchFlags); - } - - /** - * Construct a matcher for a pre-compiled regular expression from program - * (bytecode) data. - * - * @param program Compiled regular expression program - * @see RECompiler - * @see recompile - */ - public RE(REProgram program) - { - this(program, MATCH_NORMAL); - } - - /** - * Constructs a regular expression matcher with no initial program. - * This is likely to be an uncommon practice, but is still supported. - */ - public RE() - { - this((REProgram)null, MATCH_NORMAL); - } - - /** - * Converts a 'simplified' regular expression to a full regular expression - * - * @param pattern The pattern to convert - * @return The full regular expression - */ - public static String simplePatternToFullRegularExpression(String pattern) - { - StringBuffer buf = new StringBuffer(); - for (int i = 0; i < pattern.length(); i++) - { - char c = pattern.charAt(i); - switch (c) - { - case '*': - buf.append(".*"); - break; - - case '.': - case '[': - case ']': - case '\\': - case '+': - case '?': - case '{': - case '}': - case '$': - case '^': - case '|': - case '(': - case ')': - buf.append('\\'); - default: - buf.append(c); - break; - } - } - return buf.toString(); - } - - /** - * Sets match behaviour flags which alter the way RE does matching. - * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*): - * - *
-     *   MATCH_NORMAL              // Normal (case-sensitive) matching
-     *   MATCH_CASEINDEPENDENT     // Case folded comparisons
-     *   MATCH_MULTILINE           // Newline matches as BOL/EOL
-     * 
- */ - public void setMatchFlags(int matchFlags) - { - this.matchFlags = matchFlags; - } - - /** - * Returns the current match behaviour flags. - * @return Current match behaviour flags (RE.MATCH_*). - * - *
-     *   MATCH_NORMAL              // Normal (case-sensitive) matching
-     *   MATCH_CASEINDEPENDENT     // Case folded comparisons
-     *   MATCH_MULTILINE           // Newline matches as BOL/EOL
-     * 
- * - * @see #setMatchFlags - */ - public int getMatchFlags() - { - return matchFlags; - } - - /** - * Sets the current regular expression program used by this matcher object. - * - * @param program Regular expression program compiled by RECompiler. - * @see RECompiler - * @see REProgram - * @see recompile - */ - public void setProgram(REProgram program) - { - this.program = program; - if (program != null && program.maxParens != -1) { - this.maxParen = program.maxParens; - } else { - this.maxParen = MAX_PAREN; - } - } - - /** - * Returns the current regular expression program in use by this matcher object. - * - * @return Regular expression program - * @see #setProgram - */ - public REProgram getProgram() - { - return program; - } - - /** - * Returns the number of parenthesized subexpressions available after a successful match. - * - * @return Number of available parenthesized subexpressions - */ - public int getParenCount() - { - return parenCount; - } - - /** - * Gets the contents of a parenthesized subexpression after a successful match. - * - * @param which Nesting level of subexpression - * @return String - */ - public String getParen(int which) - { - int start; - if (which < parenCount && (start = getParenStart(which)) >= 0) - { - return search.substring(start, getParenEnd(which)); - } - return null; - } - - /** - * Returns the start index of a given paren level. - * - * @param which Nesting level of subexpression - * @return String index - */ - public final int getParenStart(int which) - { - if (which < parenCount) - { - switch (which) - { - case 0: - return start0; - - case 1: - return start1; - - case 2: - return start2; - - default: - if (startn == null) - { - allocParens(); - } - return startn[which]; - } - } - return -1; - } - - /** - * Returns the end index of a given paren level. - * - * @param which Nesting level of subexpression - * @return String index - */ - public final int getParenEnd(int which) - { - if (which < parenCount) - { - switch (which) - { - case 0: - return end0; - - case 1: - return end1; - - case 2: - return end2; - - default: - if (endn == null) - { - allocParens(); - } - return endn[which]; - } - } - return -1; - } - - /** - * Returns the length of a given paren level. - * - * @param which Nesting level of subexpression - * @return Number of characters in the parenthesized subexpression - */ - public final int getParenLength(int which) - { - if (which < parenCount) - { - return getParenEnd(which) - getParenStart(which); - } - return -1; - } - - /** - * Sets the start of a paren level - * - * @param which Which paren level - * @param i Index in input array - */ - protected final void setParenStart(int which, int i) - { - if (which < parenCount) - { - switch (which) - { - case 0: - start0 = i; - break; - - case 1: - start1 = i; - break; - - case 2: - start2 = i; - break; - - default: - if (startn == null) - { - allocParens(); - } - startn[which] = i; - break; - } - } - } - - /** - * Sets the end of a paren level - * - * @param which Which paren level - * @param i Index in input array - */ - protected final void setParenEnd(int which, int i) - { - if (which < parenCount) - { - switch (which) - { - case 0: - end0 = i; - break; - - case 1: - end1 = i; - break; - - case 2: - end2 = i; - break; - - default: - if (endn == null) - { - allocParens(); - } - endn[which] = i; - break; - } - } - } - - /** - * Throws an Error representing an internal error condition probably resulting - * from a bug in the regular expression compiler (or possibly data corruption). - * In practice, this should be very rare. - * - * @param s Error description - */ - protected void internalError(String s) throws Error - { - throw new Error("RE internal error: " + s); - } - - /** - * Performs lazy allocation of subexpression arrays - */ - private final void allocParens() - { - // Allocate arrays for subexpressions - startn = new int[maxParen]; - endn = new int[maxParen]; - - // Set sub-expression pointers to invalid values - for (int i = 0; i < maxParen; i++) - { - startn[i] = -1; - endn[i] = -1; - } - } - - /** - * Try to match a string against a subset of nodes in the program - * - * @param firstNode Node to start at in program - * @param lastNode Last valid node (used for matching a subexpression without - * matching the rest of the program as well). - * @param idxStart Starting position in character array - * @return Final input array index if match succeeded. -1 if not. - */ - protected int matchNodes(int firstNode, int lastNode, int idxStart) - { - // Our current place in the string - int idx = idxStart; - - // Loop while node is valid - int next, opcode, opdata; - int idxNew; - char[] instruction = program.instruction; - for (int node = firstNode; node < lastNode; ) - { - opcode = instruction[node + offsetOpcode]; - next = node + (short)instruction[node + offsetNext]; - opdata = instruction[node + offsetOpdata]; - - switch (opcode) - { - case OP_RELUCTANTMAYBE: - { - int once = 0; - do - { - // Try to match the rest without using the reluctant subexpr - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) - { - return idxNew; - } - } - while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1); - return -1; - } - - case OP_RELUCTANTPLUS: - while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) - { - // Try to match the rest without using the reluctant subexpr - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) - { - return idxNew; - } - } - return -1; - - case OP_RELUCTANTSTAR: - do - { - // Try to match the rest without using the reluctant subexpr - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) - { - return idxNew; - } - } - while ((idx = matchNodes(node + nodeSize, next, idx)) != -1); - return -1; - - case OP_OPEN: - - // Match subexpression - if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) - { - startBackref[opdata] = idx; - } - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) - { - // Increase valid paren count - if ((opdata + 1) > parenCount) - { - parenCount = opdata + 1; - } - - // Don't set paren if already set later on - if (getParenStart(opdata) == -1) - { - setParenStart(opdata, idx); - } - } - return idxNew; - - case OP_CLOSE: - - // Done matching subexpression - if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) - { - endBackref[opdata] = idx; - } - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) - { - // Increase valid paren count - if ((opdata + 1) > parenCount) - { - parenCount = opdata + 1; - } - - // Don't set paren if already set later on - if (getParenEnd(opdata) == -1) - { - setParenEnd(opdata, idx); - } - } - return idxNew; - - case OP_OPEN_CLUSTER: - case OP_CLOSE_CLUSTER: - // starting or ending the matching of a subexpression which has no backref. - return matchNodes( next, maxNode, idx ); - - case OP_BACKREF: - { - // Get the start and end of the backref - int s = startBackref[opdata]; - int e = endBackref[opdata]; - - // We don't know the backref yet - if (s == -1 || e == -1) - { - return -1; - } - - // The backref is empty size - if (s == e) - { - break; - } - - // Get the length of the backref - int l = e - s; - - // If there's not enough input left, give up. - if (search.isEnd(idx + l - 1)) - { - return -1; - } - - // Case fold the backref? - final boolean caseFold = - ((matchFlags & MATCH_CASEINDEPENDENT) != 0); - // Compare backref to input - for (int i = 0; i < l; i++) - { - if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0) - { - return -1; - } - } - } - break; - - case OP_BOL: - - // Fail if we're not at the start of the string - if (idx != 0) - { - // If we're multiline matching, we could still be at the start of a line - if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) - { - // If not at start of line, give up - if (idx <= 0 || !isNewline(idx - 1)) { - return -1; - } else { - break; - } - } - return -1; - } - break; - - case OP_EOL: - - // If we're not at the end of string - if (!search.isEnd(0) && !search.isEnd(idx)) - { - // If we're multi-line matching - if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) - { - // Give up if we're not at the end of a line - if (!isNewline(idx)) { - return -1; - } else { - break; - } - } - return -1; - } - break; - - case OP_ESCAPE: - - // Which escape? - switch (opdata) - { - // Word boundary match - case E_NBOUND: - case E_BOUND: - { - char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1)); - char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx)); - if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND)) - { - return -1; - } - } - break; - - // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit - case E_ALNUM: - case E_NALNUM: - case E_DIGIT: - case E_NDIGIT: - case E_SPACE: - case E_NSPACE: - - // Give up if out of input - if (search.isEnd(idx)) - { - return -1; - } - - char c = search.charAt(idx); - - // Switch on escape - switch (opdata) - { - case E_ALNUM: - case E_NALNUM: - if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM))) - { - return -1; - } - break; - - case E_DIGIT: - case E_NDIGIT: - if (!(Character.isDigit(c) == (opdata == E_DIGIT))) - { - return -1; - } - break; - - case E_SPACE: - case E_NSPACE: - if (!(Character.isWhitespace(c) == (opdata == E_SPACE))) - { - return -1; - } - break; - } - idx++; - break; - - default: - internalError("Unrecognized escape '" + opdata + "'"); - } - break; - - case OP_ANY: - - if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) { - // Match anything - if (search.isEnd(idx)) - { - return -1; - } - } - else - { - // Match anything but a newline - if (search.isEnd(idx) || isNewline(idx)) - { - return -1; - } - } - idx++; - break; - - case OP_ATOM: - { - // Match an atom value - if (search.isEnd(idx)) - { - return -1; - } - - // Get length of atom and starting index - int lenAtom = opdata; - int startAtom = node + nodeSize; - - // Give up if not enough input remains to have a match - if (search.isEnd(lenAtom + idx - 1)) - { - return -1; - } - - // Match atom differently depending on casefolding flag - final boolean caseFold = - ((matchFlags & MATCH_CASEINDEPENDENT) != 0); - - for (int i = 0; i < lenAtom; i++) - { - if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0) - { - return -1; - } - } - } - break; - - case OP_POSIXCLASS: - { - // Out of input? - if (search.isEnd(idx)) - { - return -1; - } - - switch (opdata) - { - case POSIX_CLASS_ALNUM: - if (!Character.isLetterOrDigit(search.charAt(idx))) - { - return -1; - } - break; - - case POSIX_CLASS_ALPHA: - if (!Character.isLetter(search.charAt(idx))) - { - return -1; - } - break; - - case POSIX_CLASS_DIGIT: - if (!Character.isDigit(search.charAt(idx))) - { - return -1; - } - break; - - case POSIX_CLASS_BLANK: // JWL - bugbug: is this right?? - if (!Character.isSpaceChar(search.charAt(idx))) - { - return -1; - } - break; - - case POSIX_CLASS_SPACE: - if (!Character.isWhitespace(search.charAt(idx))) - { - return -1; - } - break; - - case POSIX_CLASS_CNTRL: - if (Character.getType(search.charAt(idx)) != Character.CONTROL) - { - return -1; - } - break; - - case POSIX_CLASS_GRAPH: // JWL - bugbug??? - switch (Character.getType(search.charAt(idx))) - { - case Character.MATH_SYMBOL: - case Character.CURRENCY_SYMBOL: - case Character.MODIFIER_SYMBOL: - case Character.OTHER_SYMBOL: - break; - - default: - return -1; - } - break; - - case POSIX_CLASS_LOWER: - if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) - { - return -1; - } - break; - - case POSIX_CLASS_UPPER: - if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) - { - return -1; - } - break; - - case POSIX_CLASS_PRINT: - if (Character.getType(search.charAt(idx)) == Character.CONTROL) - { - return -1; - } - break; - - case POSIX_CLASS_PUNCT: - { - int type = Character.getType(search.charAt(idx)); - switch(type) - { - case Character.DASH_PUNCTUATION: - case Character.START_PUNCTUATION: - case Character.END_PUNCTUATION: - case Character.CONNECTOR_PUNCTUATION: - case Character.OTHER_PUNCTUATION: - break; - - default: - return -1; - } - } - break; - - case POSIX_CLASS_XDIGIT: // JWL - bugbug?? - { - boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') || - (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') || - (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F')); - if (!isXDigit) - { - return -1; - } - } - break; - - case POSIX_CLASS_JSTART: - if (!Character.isJavaIdentifierStart(search.charAt(idx))) - { - return -1; - } - break; - - case POSIX_CLASS_JPART: - if (!Character.isJavaIdentifierPart(search.charAt(idx))) - { - return -1; - } - break; - - default: - internalError("Bad posix class"); - break; - } - - // Matched. - idx++; - } - break; - - case OP_ANYOF: - { - // Out of input? - if (search.isEnd(idx)) - { - return -1; - } - - // Get character to match against character class and maybe casefold - char c = search.charAt(idx); - boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0; - // Loop through character class checking our match character - int idxRange = node + nodeSize; - int idxEnd = idxRange + (opdata * 2); - boolean match = false; - for (int i = idxRange; !match && i < idxEnd; ) - { - // Get start, end and match characters - char s = instruction[i++]; - char e = instruction[i++]; - - match = ((compareChars(c, s, caseFold) >= 0) - && (compareChars(c, e, caseFold) <= 0)); - } - - // Fail if we didn't match the character class - if (!match) - { - return -1; - } - idx++; - } - break; - - case OP_BRANCH: - { - // Check for choices - if (instruction[next + offsetOpcode] != OP_BRANCH) - { - // If there aren't any other choices, just evaluate this branch. - node += nodeSize; - continue; - } - - // Try all available branches - short nextBranch; - do - { - // Try matching the branch against the string - if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) - { - return idxNew; - } - - // Go to next branch (if any) - nextBranch = (short)instruction[node + offsetNext]; - node += nextBranch; - } - while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH)); - - // Failed to match any branch! - return -1; - } - - case OP_NOTHING: - case OP_GOTO: - - // Just advance to the next node without doing anything - break; - - case OP_END: - - // Match has succeeded! - setParenEnd(0, idx); - return idx; - - default: - - // Corrupt program - internalError("Invalid opcode '" + opcode + "'"); - } - - // Advance to the next node in the program - node = next; - } - - // We "should" never end up here - internalError("Corrupt program"); - return -1; - } - - /** - * Match the current regular expression program against the current - * input string, starting at index i of the input string. This method - * is only meant for internal use. - * - * @param i The input string index to start matching at - * @return True if the input matched the expression - */ - protected boolean matchAt(int i) - { - // Initialize start pointer, paren cache and paren count - start0 = -1; - end0 = -1; - start1 = -1; - end1 = -1; - start2 = -1; - end2 = -1; - startn = null; - endn = null; - parenCount = 1; - setParenStart(0, i); - - // Allocate backref arrays (unless optimizations indicate otherwise) - if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) - { - startBackref = new int[maxParen]; - endBackref = new int[maxParen]; - } - - // Match against string - int idx; - if ((idx = matchNodes(0, maxNode, i)) != -1) - { - setParenEnd(0, idx); - return true; - } - - // Didn't match - parenCount = 0; - return false; - } - - /** - * Matches the current regular expression program against a character array, - * starting at a given index. - * - * @param search String to match against - * @param i Index to start searching at - * @return True if string matched - */ - public boolean match(String search, int i) - { - return match(new StringCharacterIterator(search), i); - } - - /** - * Matches the current regular expression program against a character array, - * starting at a given index. - * - * @param search String to match against - * @param i Index to start searching at - * @return True if string matched - */ - public boolean match(CharacterIterator search, int i) - { - // There is no compiled program to search with! - if (program == null) - { - // This should be uncommon enough to be an error case rather - // than an exception (which would have to be handled everywhere) - internalError("No RE program to run!"); - } - - // Save string to search - this.search = search; - - // Can we optimize the search by looking for a prefix string? - if (program.prefix == null) - { - // Unprefixed matching must try for a match at each character - for ( ;! search.isEnd(i - 1); i++) - { - // Try a match at index i - if (matchAt(i)) - { - return true; - } - } - return false; - } - else - { - // Prefix-anchored matching is possible - boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0; - char[] prefix = program.prefix; - for ( ; !search.isEnd(i + prefix.length - 1); i++) - { - int j = i; - int k = 0; - - boolean match; - do { - // If there's a mismatch of any character in the prefix, give up - match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0); - } while (match && k < prefix.length); - - // See if the whole prefix string matched - if (k == prefix.length) - { - // We matched the full prefix at firstChar, so try it - if (matchAt(i)) - { - return true; - } - } - } - return false; - } - } - - /** - * Matches the current regular expression program against a String. - * - * @param search String to match against - * @return True if string matched - */ - public boolean match(String search) - { - return match(search, 0); - } - - /** - * Splits a string into an array of strings on regular expression boundaries. - * This function works the same way as the Perl function of the same name. - * Given a regular expression of "[ab]+" and a string to split of - * "xyzzyababbayyzabbbab123", the result would be the array of Strings - * "[xyzzy, yyz, 123]". - * - *

Please note that the first string in the resulting array may be an empty - * string. This happens when the very first character of input string is - * matched by the pattern. - * - * @param s String to split on this regular exression - * @return Array of strings - */ - public String[] split(String s) - { - // Create new vector - Vector v = new Vector(); - - // Start at position 0 and search the whole string - int pos = 0; - int len = s.length(); - - // Try a match at each position - while (pos < len && match(s, pos)) - { - // Get start of match - int start = getParenStart(0); - - // Get end of match - int newpos = getParenEnd(0); - - // Check if no progress was made - if (newpos == pos) - { - v.addElement(s.substring(pos, start + 1)); - newpos++; - } - else - { - v.addElement(s.substring(pos, start)); - } - - // Move to new position - pos = newpos; - } - - // Push remainder if it's not empty - String remainder = s.substring(pos); - if (remainder.length() != 0) - { - v.addElement(remainder); - } - - // Return vector as an array of strings - String[] ret = new String[v.size()]; - v.copyInto(ret); - return ret; - } - - /** - * Flag bit that indicates that subst should replace all occurrences of this - * regular expression. - */ - public static final int REPLACE_ALL = 0x0000; - - /** - * Flag bit that indicates that subst should only replace the first occurrence - * of this regular expression. - */ - public static final int REPLACE_FIRSTONLY = 0x0001; - - /** - * Flag bit that indicates that subst should replace backreferences - */ - public static final int REPLACE_BACKREFERENCES = 0x0002; - - /** - * Substitutes a string for this regular expression in another string. - * This method works like the Perl function of the same name. - * Given a regular expression of "a*b", a String to substituteIn of - * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the - * resulting String returned by subst would be "-foo-garply-wacky-". - * - * @param substituteIn String to substitute within - * @param substitution String to substitute for all matches of this regular expression. - * @return The string substituteIn with zero or more occurrences of the current - * regular expression replaced with the substitution String (if this regular - * expression object doesn't match at any position, the original String is returned - * unchanged). - */ - public String subst(String substituteIn, String substitution) - { - return subst(substituteIn, substitution, REPLACE_ALL); - } - - /** - * Substitutes a string for this regular expression in another string. - * This method works like the Perl function of the same name. - * Given a regular expression of "a*b", a String to substituteIn of - * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the - * resulting String returned by subst would be "-foo-garply-wacky-". - *

- * It is also possible to reference the contents of a parenthesized expression - * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+", - * a String to substituteIn of "visit us: http://www.apache.org!" and the - * substitution String "<a href=\"$0\">$0</a>", the resulting String - * returned by subst would be - * "visit us: <a href=\"http://www.apache.org\">http://www.apache.org</a>!". - *

- * Note: $0 represents the whole match. - * - * @param substituteIn String to substitute within - * @param substitution String to substitute for matches of this regular expression - * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY - * flag bit is set, only the first occurrence of this regular expression is replaced. - * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be - * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will - * be processed. - * @return The string substituteIn with zero or more occurrences of the current - * regular expression replaced with the substitution String (if this regular - * expression object doesn't match at any position, the original String is returned - * unchanged). - */ - public String subst(String substituteIn, String substitution, int flags) - { - // String to return - StringBuffer ret = new StringBuffer(); - - // Start at position 0 and search the whole string - int pos = 0; - int len = substituteIn.length(); - - // Try a match at each position - while (pos < len && match(substituteIn, pos)) - { - // Append string before match - ret.append(substituteIn.substring(pos, getParenStart(0))); - - if ((flags & REPLACE_BACKREFERENCES) != 0) - { - // Process backreferences - int lCurrentPosition = 0; - int lLastPosition = -2; - int lLength = substitution.length(); - boolean bAddedPrefix = false; - - while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0) - { - if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\') - && lCurrentPosition+1 < lLength) - { - char c = substitution.charAt(lCurrentPosition + 1); - if (c >= '0' && c <= '9') - { - if (bAddedPrefix == false) - { - // Append everything between the beginning of the - // substitution string and the current $ sign - ret.append(substitution.substring(0, lCurrentPosition)); - bAddedPrefix = true; - } - else - { - // Append everything between the last and the current $ sign - ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition)); - } - - // Append the parenthesized expression - // Note: if a parenthesized expression of the requested - // index is not available "null" is added to the string - ret.append(getParen(c - '0')); - lLastPosition = lCurrentPosition; - } - } - - // Move forward, skipping past match - lCurrentPosition++; - } - - // Append everything after the last $ sign - ret.append(substitution.substring(lLastPosition + 2, lLength)); - } - else - { - // Append substitution without processing backreferences - ret.append(substitution); - } - - // Move forward, skipping past match - int newpos = getParenEnd(0); - - // We always want to make progress! - if (newpos == pos) - { - newpos++; - } - - // Try new position - pos = newpos; - - // Break out if we're only supposed to replace one occurrence - if ((flags & REPLACE_FIRSTONLY) != 0) - { - break; - } - } - - // If there's remaining input, append it - if (pos < len) - { - ret.append(substituteIn.substring(pos)); - } - - // Return string buffer as string - return ret.toString(); - } - - /** - * Returns an array of Strings, whose toString representation matches a regular - * expression. This method works like the Perl function of the same name. Given - * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz, - * aaaab], the array of Strings returned by grep would be [aab, aaaab]. - * - * @param search Array of Objects to search - * @return Array of Strings whose toString() value matches this regular expression. - */ - public String[] grep(Object[] search) - { - // Create new vector to hold return items - Vector v = new Vector(); - - // Traverse array of objects - for (int i = 0; i < search.length; i++) - { - // Get next object as a string - String s = search[i].toString(); - - // If it matches this regexp, add it to the list - if (match(s)) - { - v.addElement(s); - } - } - - // Return vector as an array of strings - String[] ret = new String[v.size()]; - v.copyInto(ret); - return ret; - } - - /** - * @return true if character at i-th position in the search string is a newline - */ - private boolean isNewline(int i) - { - char nextChar = search.charAt(i); - - if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085' - || nextChar == '\u2028' || nextChar == '\u2029') - { - return true; - } - - return false; - } - - /** - * Compares two characters. - * - * @param c1 first character to compare. - * @param c2 second character to compare. - * @param caseIndependent whether comparision is case insensitive or not. - * @return negative, 0, or positive integer as the first character - * less than, equal to, or greater then the second. - */ - private int compareChars(char c1, char c2, boolean caseIndependent) - { - if (caseIndependent) - { - c1 = Character.toLowerCase(c1); - c2 = Character.toLowerCase(c2); - } - return ((int)c1 - (int)c2); - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/RECompiler.java --- a/src/com/sun/org/apache/regexp/internal/RECompiler.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1520 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -import com.sun.org.apache.regexp.internal.RE; -import java.util.Hashtable; - -/** - * A regular expression compiler class. This class compiles a pattern string into a - * regular expression program interpretable by the RE evaluator class. The 'recompile' - * command line tool uses this compiler to pre-compile regular expressions for use - * with RE. For a description of the syntax accepted by RECompiler and what you can - * do with regular expressions, see the documentation for the RE matcher class. - * - * @see RE - * @see recompile - * - * @author Jonathan Locke - * @author Michael McCallum - */ -public class RECompiler -{ - // The compiled program - char[] instruction; // The compiled RE 'program' instruction buffer - int lenInstruction; // The amount of the program buffer currently in use - - // Input state for compiling regular expression - String pattern; // Input string - int len; // Length of the pattern string - int idx; // Current input index into ac - int parens; // Total number of paren pairs - - // Node flags - static final int NODE_NORMAL = 0; // No flags (nothing special) - static final int NODE_NULLABLE = 1; // True if node is potentially null - static final int NODE_TOPLEVEL = 2; // True if top level expr - - // Special types of 'escapes' - static final int ESC_MASK = 0xffff0; // Escape complexity mask - static final int ESC_BACKREF = 0xfffff; // Escape is really a backreference - static final int ESC_COMPLEX = 0xffffe; // Escape isn't really a true character - static final int ESC_CLASS = 0xffffd; // Escape represents a whole class of characters - - // {m,n} stacks - int maxBrackets = 10; // Maximum number of bracket pairs - static final int bracketUnbounded = -1; // Unbounded value - int brackets = 0; // Number of bracket sets - int[] bracketStart = null; // Starting point - int[] bracketEnd = null; // Ending point - int[] bracketMin = null; // Minimum number of matches - int[] bracketOpt = null; // Additional optional matches - - // Lookup table for POSIX character class names - static Hashtable hashPOSIX = new Hashtable(); - static - { - hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM)); - hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA)); - hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK)); - hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL)); - hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT)); - hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH)); - hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER)); - hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT)); - hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT)); - hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE)); - hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER)); - hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT)); - hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART)); - hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART)); - } - - /** - * Constructor. Creates (initially empty) storage for a regular expression program. - */ - public RECompiler() - { - // Start off with a generous, yet reasonable, initial size - instruction = new char[128]; - lenInstruction = 0; - } - - /** - * Ensures that n more characters can fit in the program buffer. - * If n more can't fit, then the size is doubled until it can. - * @param n Number of additional characters to ensure will fit. - */ - void ensure(int n) - { - // Get current program length - int curlen = instruction.length; - - // If the current length + n more is too much - if (lenInstruction + n >= curlen) - { - // Double the size of the program array until n more will fit - while (lenInstruction + n >= curlen) - { - curlen *= 2; - } - - // Allocate new program array and move data into it - char[] newInstruction = new char[curlen]; - System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction); - instruction = newInstruction; - } - } - - /** - * Emit a single character into the program stream. - * @param c Character to add - */ - void emit(char c) - { - // Make room for character - ensure(1); - - // Add character - instruction[lenInstruction++] = c; - } - - /** - * Inserts a node with a given opcode and opdata at insertAt. The node relative next - * pointer is initialized to 0. - * @param opcode Opcode for new node - * @param opdata Opdata for new node (only the low 16 bits are currently used) - * @param insertAt Index at which to insert the new node in the program - */ - void nodeInsert(char opcode, int opdata, int insertAt) - { - // Make room for a new node - ensure(RE.nodeSize); - - // Move everything from insertAt to the end down nodeSize elements - System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt); - instruction[insertAt + RE.offsetOpcode] = opcode; - instruction[insertAt + RE.offsetOpdata] = (char)opdata; - instruction[insertAt + RE.offsetNext] = 0; - lenInstruction += RE.nodeSize; - } - - /** - * Appends a node to the end of a node chain - * @param node Start of node chain to traverse - * @param pointTo Node to have the tail of the chain point to - */ - void setNextOfEnd(int node, int pointTo) - { - // Traverse the chain until the next offset is 0 - int next = instruction[node + RE.offsetNext]; - // while the 'node' is not the last in the chain - // and the 'node' is not the last in the program. - while ( next != 0 && node < lenInstruction ) - { - // if the node we are supposed to point to is in the chain then - // point to the end of the program instead. - // Michael McCallum - // FIXME: // This is a _hack_ to stop infinite programs. - // I believe that the implementation of the reluctant matches is wrong but - // have not worked out a better way yet. - if ( node == pointTo ) { - pointTo = lenInstruction; - } - node += next; - next = instruction[node + RE.offsetNext]; - } - // if we have reached the end of the program then dont set the pointTo. - // im not sure if this will break any thing but passes all the tests. - if ( node < lenInstruction ) { - // Point the last node in the chain to pointTo. - instruction[node + RE.offsetNext] = (char)(short)(pointTo - node); - } - } - - /** - * Adds a new node - * @param opcode Opcode for node - * @param opdata Opdata for node (only the low 16 bits are currently used) - * @return Index of new node in program - */ - int node(char opcode, int opdata) - { - // Make room for a new node - ensure(RE.nodeSize); - - // Add new node at end - instruction[lenInstruction + RE.offsetOpcode] = opcode; - instruction[lenInstruction + RE.offsetOpdata] = (char)opdata; - instruction[lenInstruction + RE.offsetNext] = 0; - lenInstruction += RE.nodeSize; - - // Return index of new node - return lenInstruction - RE.nodeSize; - } - - - /** - * Throws a new internal error exception - * @exception Error Thrown in the event of an internal error. - */ - void internalError() throws Error - { - throw new Error("Internal error!"); - } - - /** - * Throws a new syntax error exception - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - */ - void syntaxError(String s) throws RESyntaxException - { - throw new RESyntaxException(s); - } - - /** - * Allocate storage for brackets only as needed - */ - void allocBrackets() - { - // Allocate bracket stacks if not already done - if (bracketStart == null) - { - // Allocate storage - bracketStart = new int[maxBrackets]; - bracketEnd = new int[maxBrackets]; - bracketMin = new int[maxBrackets]; - bracketOpt = new int[maxBrackets]; - - // Initialize to invalid values - for (int i = 0; i < maxBrackets; i++) - { - bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1; - } - } - } - - /** Enlarge storage for brackets only as needed. */ - synchronized void reallocBrackets() { - // trick the tricky - if (bracketStart == null) { - allocBrackets(); - } - - int new_size = maxBrackets * 2; - int[] new_bS = new int[new_size]; - int[] new_bE = new int[new_size]; - int[] new_bM = new int[new_size]; - int[] new_bO = new int[new_size]; - // Initialize to invalid values - for (int i=brackets; i= len || pattern.charAt(idx++) != '{') - { - internalError(); - } - - // Next char must be a digit - if (idx >= len || !Character.isDigit(pattern.charAt(idx))) - { - syntaxError("Expected digit"); - } - - // Get min ('m' of {m,n}) number - StringBuffer number = new StringBuffer(); - while (idx < len && Character.isDigit(pattern.charAt(idx))) - { - number.append(pattern.charAt(idx++)); - } - try - { - bracketMin[brackets] = Integer.parseInt(number.toString()); - } - catch (NumberFormatException e) - { - syntaxError("Expected valid number"); - } - - // If out of input, fail - if (idx >= len) - { - syntaxError("Expected comma or right bracket"); - } - - // If end of expr, optional limit is 0 - if (pattern.charAt(idx) == '}') - { - idx++; - bracketOpt[brackets] = 0; - return; - } - - // Must have at least {m,} and maybe {m,n}. - if (idx >= len || pattern.charAt(idx++) != ',') - { - syntaxError("Expected comma"); - } - - // If out of input, fail - if (idx >= len) - { - syntaxError("Expected comma or right bracket"); - } - - // If {m,} max is unlimited - if (pattern.charAt(idx) == '}') - { - idx++; - bracketOpt[brackets] = bracketUnbounded; - return; - } - - // Next char must be a digit - if (idx >= len || !Character.isDigit(pattern.charAt(idx))) - { - syntaxError("Expected digit"); - } - - // Get max number - number.setLength(0); - while (idx < len && Character.isDigit(pattern.charAt(idx))) - { - number.append(pattern.charAt(idx++)); - } - try - { - bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets]; - } - catch (NumberFormatException e) - { - syntaxError("Expected valid number"); - } - - // Optional repetitions must be >= 0 - if (bracketOpt[brackets] < 0) - { - syntaxError("Bad range"); - } - - // Must have close brace - if (idx >= len || pattern.charAt(idx++) != '}') - { - syntaxError("Missing close brace"); - } - } - - /** - * Match an escape sequence. Handles quoted chars and octal escapes as well - * as normal escape characters. Always advances the input stream by the - * right amount. This code "understands" the subtle difference between an - * octal escape and a backref. You can access the type of ESC_CLASS or - * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1]. - * @return ESC_* code or character if simple escape - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - */ - int escape() throws RESyntaxException - { - // "Shouldn't" happen - if (pattern.charAt(idx) != '\\') - { - internalError(); - } - - // Escape shouldn't occur as last character in string! - if (idx + 1 == len) - { - syntaxError("Escape terminates string"); - } - - // Switch on character after backslash - idx += 2; - char escapeChar = pattern.charAt(idx - 1); - switch (escapeChar) - { - case RE.E_BOUND: - case RE.E_NBOUND: - return ESC_COMPLEX; - - case RE.E_ALNUM: - case RE.E_NALNUM: - case RE.E_SPACE: - case RE.E_NSPACE: - case RE.E_DIGIT: - case RE.E_NDIGIT: - return ESC_CLASS; - - case 'u': - case 'x': - { - // Exact required hex digits for escape type - int hexDigits = (escapeChar == 'u' ? 4 : 2); - - // Parse up to hexDigits characters from input - int val = 0; - for ( ; idx < len && hexDigits-- > 0; idx++) - { - // Get char - char c = pattern.charAt(idx); - - // If it's a hexadecimal digit (0-9) - if (c >= '0' && c <= '9') - { - // Compute new value - val = (val << 4) + c - '0'; - } - else - { - // If it's a hexadecimal letter (a-f) - c = Character.toLowerCase(c); - if (c >= 'a' && c <= 'f') - { - // Compute new value - val = (val << 4) + (c - 'a') + 10; - } - else - { - // If it's not a valid digit or hex letter, the escape must be invalid - // because hexDigits of input have not been absorbed yet. - syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar); - } - } - } - return val; - } - - case 't': - return '\t'; - - case 'n': - return '\n'; - - case 'r': - return '\r'; - - case 'f': - return '\f'; - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - - // An octal escape starts with a 0 or has two digits in a row - if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0') - { - // Handle \nnn octal escapes - int val = escapeChar - '0'; - if (idx < len && Character.isDigit(pattern.charAt(idx))) - { - val = ((val << 3) + (pattern.charAt(idx++) - '0')); - if (idx < len && Character.isDigit(pattern.charAt(idx))) - { - val = ((val << 3) + (pattern.charAt(idx++) - '0')); - } - } - return val; - } - - // It's actually a backreference (\[1-9]), not an escape - return ESC_BACKREF; - - default: - - // Simple quoting of a character - return escapeChar; - } - } - - /** - * Compile a character class - * @return Index of class node - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - */ - int characterClass() throws RESyntaxException - { - // Check for bad calling or empty class - if (pattern.charAt(idx) != '[') - { - internalError(); - } - - // Check for unterminated or empty class - if ((idx + 1) >= len || pattern.charAt(++idx) == ']') - { - syntaxError("Empty or unterminated class"); - } - - // Check for POSIX character class - if (idx < len && pattern.charAt(idx) == ':') - { - // Skip colon - idx++; - - // POSIX character classes are denoted with lowercase ASCII strings - int idxStart = idx; - while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') - { - idx++; - } - - // Should be a ":]" to terminate the POSIX character class - if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']') - { - // Get character class - String charClass = pattern.substring(idxStart, idx); - - // Select the POSIX class id - Character i = (Character)hashPOSIX.get(charClass); - if (i != null) - { - // Move past colon and right bracket - idx += 2; - - // Return new POSIX character class node - return node(RE.OP_POSIXCLASS, i.charValue()); - } - syntaxError("Invalid POSIX character class '" + charClass + "'"); - } - syntaxError("Invalid POSIX character class syntax"); - } - - // Try to build a class. Create OP_ANYOF node - int ret = node(RE.OP_ANYOF, 0); - - // Parse class declaration - char CHAR_INVALID = Character.MAX_VALUE; - char last = CHAR_INVALID; - char simpleChar = 0; - boolean include = true; - boolean definingRange = false; - int idxFirst = idx; - char rangeStart = Character.MIN_VALUE; - char rangeEnd; - RERange range = new RERange(); - while (idx < len && pattern.charAt(idx) != ']') - { - - switchOnCharacter: - - // Switch on character - switch (pattern.charAt(idx)) - { - case '^': - include = !include; - if (idx == idxFirst) - { - range.include(Character.MIN_VALUE, Character.MAX_VALUE, true); - } - idx++; - continue; - - case '\\': - { - // Escape always advances the stream - int c; - switch (c = escape ()) - { - case ESC_COMPLEX: - case ESC_BACKREF: - - // Word boundaries and backrefs not allowed in a character class! - syntaxError("Bad character class"); - - case ESC_CLASS: - - // Classes can't be an endpoint of a range - if (definingRange) - { - syntaxError("Bad character class"); - } - - // Handle specific type of class (some are ok) - switch (pattern.charAt(idx - 1)) - { - case RE.E_NSPACE: - case RE.E_NDIGIT: - case RE.E_NALNUM: - syntaxError("Bad character class"); - - case RE.E_SPACE: - range.include('\t', include); - range.include('\r', include); - range.include('\f', include); - range.include('\n', include); - range.include('\b', include); - range.include(' ', include); - break; - - case RE.E_ALNUM: - range.include('a', 'z', include); - range.include('A', 'Z', include); - range.include('_', include); - - // Fall through! - - case RE.E_DIGIT: - range.include('0', '9', include); - break; - } - - // Make last char invalid (can't be a range start) - last = CHAR_INVALID; - break; - - default: - - // Escape is simple so treat as a simple char - simpleChar = (char) c; - break switchOnCharacter; - } - } - continue; - - case '-': - - // Start a range if one isn't already started - if (definingRange) - { - syntaxError("Bad class range"); - } - definingRange = true; - - // If no last character, start of range is 0 - rangeStart = (last == CHAR_INVALID ? 0 : last); - - // Premature end of range. define up to Character.MAX_VALUE - if ((idx + 1) < len && pattern.charAt(++idx) == ']') - { - simpleChar = Character.MAX_VALUE; - break; - } - continue; - - default: - simpleChar = pattern.charAt(idx++); - break; - } - - // Handle simple character simpleChar - if (definingRange) - { - // if we are defining a range make it now - rangeEnd = simpleChar; - - // Actually create a range if the range is ok - if (rangeStart >= rangeEnd) - { - syntaxError("Bad character class"); - } - range.include(rangeStart, rangeEnd, include); - - // We are done defining the range - last = CHAR_INVALID; - definingRange = false; - } - else - { - // If simple character and not start of range, include it - if (idx >= len || pattern.charAt(idx) != '-') - { - range.include(simpleChar, include); - } - last = simpleChar; - } - } - - // Shouldn't be out of input - if (idx == len) - { - syntaxError("Unterminated character class"); - } - - // Absorb the ']' end of class marker - idx++; - - // Emit character class definition - instruction[ret + RE.offsetOpdata] = (char)range.num; - for (int i = 0; i < range.num; i++) - { - emit((char)range.minRange[i]); - emit((char)range.maxRange[i]); - } - return ret; - } - - /** - * Absorb an atomic character string. This method is a little tricky because - * it can un-include the last character of string if a closure operator follows. - * This is correct because *+? have higher precedence than concatentation (thus - * ABC* means AB(C*) and NOT (ABC)*). - * @return Index of new atom node - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - */ - int atom() throws RESyntaxException - { - // Create a string node - int ret = node(RE.OP_ATOM, 0); - - // Length of atom - int lenAtom = 0; - - // Loop while we've got input - - atomLoop: - - while (idx < len) - { - // Is there a next char? - if ((idx + 1) < len) - { - char c = pattern.charAt(idx + 1); - - // If the next 'char' is an escape, look past the whole escape - if (pattern.charAt(idx) == '\\') - { - int idxEscape = idx; - escape(); - if (idx < len) - { - c = pattern.charAt(idx); - } - idx = idxEscape; - } - - // Switch on next char - switch (c) - { - case '{': - case '?': - case '*': - case '+': - - // If the next character is a closure operator and our atom is non-empty, the - // current character should bind to the closure operator rather than the atom - if (lenAtom != 0) - { - break atomLoop; - } - } - } - - // Switch on current char - switch (pattern.charAt(idx)) - { - case ']': - case '^': - case '$': - case '.': - case '[': - case '(': - case ')': - case '|': - break atomLoop; - - case '{': - case '?': - case '*': - case '+': - - // We should have an atom by now - if (lenAtom == 0) - { - // No atom before closure - syntaxError("Missing operand to closure"); - } - break atomLoop; - - case '\\': - - { - // Get the escaped character (advances input automatically) - int idxBeforeEscape = idx; - int c = escape(); - - // Check if it's a simple escape (as opposed to, say, a backreference) - if ((c & ESC_MASK) == ESC_MASK) - { - // Not a simple escape, so backup to where we were before the escape. - idx = idxBeforeEscape; - break atomLoop; - } - - // Add escaped char to atom - emit((char) c); - lenAtom++; - } - break; - - default: - - // Add normal character to atom - emit(pattern.charAt(idx++)); - lenAtom++; - break; - } - } - - // This "shouldn't" happen - if (lenAtom == 0) - { - internalError(); - } - - // Emit the atom length into the program - instruction[ret + RE.offsetOpdata] = (char)lenAtom; - return ret; - } - - /** - * Match a terminal node. - * @param flags Flags - * @return Index of terminal node (closeable) - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - */ - int terminal(int[] flags) throws RESyntaxException - { - switch (pattern.charAt(idx)) - { - case RE.OP_EOL: - case RE.OP_BOL: - case RE.OP_ANY: - return node(pattern.charAt(idx++), 0); - - case '[': - return characterClass(); - - case '(': - return expr(flags); - - case ')': - syntaxError("Unexpected close paren"); - - case '|': - internalError(); - - case ']': - syntaxError("Mismatched class"); - - case 0: - syntaxError("Unexpected end of input"); - - case '?': - case '+': - case '{': - case '*': - syntaxError("Missing operand to closure"); - - case '\\': - { - // Don't forget, escape() advances the input stream! - int idxBeforeEscape = idx; - - // Switch on escaped character - switch (escape()) - { - case ESC_CLASS: - case ESC_COMPLEX: - flags[0] &= ~NODE_NULLABLE; - return node(RE.OP_ESCAPE, pattern.charAt(idx - 1)); - - case ESC_BACKREF: - { - char backreference = (char)(pattern.charAt(idx - 1) - '0'); - if (parens <= backreference) - { - syntaxError("Bad backreference"); - } - flags[0] |= NODE_NULLABLE; - return node(RE.OP_BACKREF, backreference); - } - - default: - - // We had a simple escape and we want to have it end up in - // an atom, so we back up and fall though to the default handling - idx = idxBeforeEscape; - flags[0] &= ~NODE_NULLABLE; - break; - } - } - } - - // Everything above either fails or returns. - // If it wasn't one of the above, it must be the start of an atom. - flags[0] &= ~NODE_NULLABLE; - return atom(); - } - - /** - * Compile a possibly closured terminal - * @param flags Flags passed by reference - * @return Index of closured node - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - */ - int closure(int[] flags) throws RESyntaxException - { - // Before terminal - int idxBeforeTerminal = idx; - - // Values to pass by reference to terminal() - int[] terminalFlags = { NODE_NORMAL }; - - // Get terminal symbol - int ret = terminal(terminalFlags); - - // Or in flags from terminal symbol - flags[0] |= terminalFlags[0]; - - // Advance input, set NODE_NULLABLE flag and do sanity checks - if (idx >= len) - { - return ret; - } - boolean greedy = true; - char closureType = pattern.charAt(idx); - switch (closureType) - { - case '?': - case '*': - - // The current node can be null - flags[0] |= NODE_NULLABLE; - - case '+': - - // Eat closure character - idx++; - - case '{': - - // Don't allow blantant stupidity - int opcode = instruction[ret + RE.offsetOpcode]; - if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) - { - syntaxError("Bad closure operand"); - } - if ((terminalFlags[0] & NODE_NULLABLE) != 0) - { - syntaxError("Closure operand can't be nullable"); - } - break; - } - - // If the next character is a '?', make the closure non-greedy (reluctant) - if (idx < len && pattern.charAt(idx) == '?') - { - idx++; - greedy = false; - } - - if (greedy) - { - // Actually do the closure now - switch (closureType) - { - case '{': - { - // We look for our bracket in the list - boolean found = false; - int i; - allocBrackets(); - for (i = 0; i < brackets; i++) - { - if (bracketStart[i] == idx) - { - found = true; - break; - } - } - - // If its not in the list we parse the {m,n} - if (!found) - { - if (brackets >= maxBrackets) - { - reallocBrackets(); - } - bracketStart[brackets] = idx; - bracket(); - bracketEnd[brackets] = idx; - i = brackets++; - } - - // Process min first - if (bracketMin[i]-- > 0) - { - if (bracketMin[i] > 0 || bracketOpt[i] != 0) { - // Rewind stream and run it through again - more matchers coming - for (int j = 0; j < brackets; j++) { - if (j != i && bracketStart[j] < idx - && bracketStart[j] >= idxBeforeTerminal) - { - brackets--; - bracketStart[j] = bracketStart[brackets]; - bracketEnd[j] = bracketEnd[brackets]; - bracketMin[j] = bracketMin[brackets]; - bracketOpt[j] = bracketOpt[brackets]; - } - } - - idx = idxBeforeTerminal; - } else { - // Bug #1030: No optinal matches - no need to rewind - idx = bracketEnd[i]; - } - break; - } - - // Do the right thing for maximum ({m,}) - if (bracketOpt[i] == bracketUnbounded) - { - // Drop through now and closure expression. - // We are done with the {m,} expr, so skip rest - closureType = '*'; - bracketOpt[i] = 0; - idx = bracketEnd[i]; - } - else - if (bracketOpt[i]-- > 0) - { - if (bracketOpt[i] > 0) - { - // More optional matchers - 'play it again sam!' - idx = idxBeforeTerminal; - } else { - // Bug #1030: We are done - this one is last and optional - idx = bracketEnd[i]; - } - // Drop through to optionally close - closureType = '?'; - } - else - { - // Rollback terminal - neither min nor opt matchers present - lenInstruction = ret; - node(RE.OP_NOTHING, 0); - - // We are done. skip the rest of {m,n} expr - idx = bracketEnd[i]; - break; - } - } - - // Fall through! - - case '?': - case '*': - - if (!greedy) - { - break; - } - - if (closureType == '?') - { - // X? is compiled as (X|) - nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X - setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option - int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING - setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING - setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node - } - - if (closureType == '*') - { - // X* is compiled as (X{gotoX}|) - nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X - setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option - setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto - setNextOfEnd(ret + RE.nodeSize, ret); // the start again - setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is - setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING - } - break; - - case '+': - { - // X+ is compiled as X({gotoX}|) - int branch; - branch = node(RE.OP_BRANCH, 0); // a new branch - setNextOfEnd(ret, branch); // is added to the end of X - setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start - setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option - setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING - } - break; - } - } - else - { - // Add end after closured subexpr - setNextOfEnd(ret, node(RE.OP_END, 0)); - - // Actually do the closure now - switch (closureType) - { - case '?': - nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret); - break; - - case '*': - nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret); - break; - - case '+': - nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret); - break; - } - - // Point to the expr after the closure - setNextOfEnd(ret, lenInstruction); - } - return ret; - } - - /** - * Compile one branch of an or operator (implements concatenation) - * @param flags Flags passed by reference - * @return Pointer to branch node - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - */ - int branch(int[] flags) throws RESyntaxException - { - // Get each possibly closured piece and concat - int node; - int ret = node(RE.OP_BRANCH, 0); - int chain = -1; - int[] closureFlags = new int[1]; - boolean nullable = true; - while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')') - { - // Get new node - closureFlags[0] = NODE_NORMAL; - node = closure(closureFlags); - if (closureFlags[0] == NODE_NORMAL) - { - nullable = false; - } - - // If there's a chain, append to the end - if (chain != -1) - { - setNextOfEnd(chain, node); - } - - // Chain starts at current - chain = node; - } - - // If we don't run loop, make a nothing node - if (chain == -1) - { - node(RE.OP_NOTHING, 0); - } - - // Set nullable flag for this branch - if (nullable) - { - flags[0] |= NODE_NULLABLE; - } - return ret; - } - - /** - * Compile an expression with possible parens around it. Paren matching - * is done at this level so we can tie the branch tails together. - * @param flags Flag value passed by reference - * @return Node index of expression in instruction array - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - */ - int expr(int[] flags) throws RESyntaxException - { - // Create open paren node unless we were called from the top level (which has no parens) - int paren = -1; - int ret = -1; - int closeParens = parens; - if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') - { - // if its a cluster ( rather than a proper subexpression ie with backrefs ) - if ( idx + 2 < len && pattern.charAt( idx + 1 ) == '?' && pattern.charAt( idx + 2 ) == ':' ) - { - paren = 2; - idx += 3; - ret = node( RE.OP_OPEN_CLUSTER, 0 ); - } - else - { - paren = 1; - idx++; - ret = node(RE.OP_OPEN, parens++); - } - } - flags[0] &= ~NODE_TOPLEVEL; - - // Create a branch node - int branch = branch(flags); - if (ret == -1) - { - ret = branch; - } - else - { - setNextOfEnd(ret, branch); - } - - // Loop through branches - while (idx < len && pattern.charAt(idx) == '|') - { - idx++; - branch = branch(flags); - setNextOfEnd(ret, branch); - } - - // Create an ending node (either a close paren or an OP_END) - int end; - if ( paren > 0 ) - { - if (idx < len && pattern.charAt(idx) == ')') - { - idx++; - } - else - { - syntaxError("Missing close paren"); - } - if ( paren == 1 ) - { - end = node(RE.OP_CLOSE, closeParens); - } - else - { - end = node( RE.OP_CLOSE_CLUSTER, 0 ); - } - } - else - { - end = node(RE.OP_END, 0); - } - - // Append the ending node to the ret nodelist - setNextOfEnd(ret, end); - - // Hook the ends of each branch to the end node - int currentNode = ret; - int nextNodeOffset = instruction[ currentNode + RE.offsetNext ]; - // while the next node o - while ( nextNodeOffset != 0 && currentNode < lenInstruction ) - { - // If branch, make the end of the branch's operand chain point to the end node. - if ( instruction[ currentNode + RE.offsetOpcode ] == RE.OP_BRANCH ) - { - setNextOfEnd( currentNode + RE.nodeSize, end ); - } - nextNodeOffset = instruction[ currentNode + RE.offsetNext ]; - currentNode += nextNodeOffset; - } - - // Return the node list - return ret; - } - - /** - * Compiles a regular expression pattern into a program runnable by the pattern - * matcher class 'RE'. - * @param pattern Regular expression pattern to compile (see RECompiler class - * for details). - * @return A compiled regular expression program. - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. - * @see RECompiler - * @see RE - */ - public REProgram compile(String pattern) throws RESyntaxException - { - // Initialize variables for compilation - this.pattern = pattern; // Save pattern in instance variable - len = pattern.length(); // Precompute pattern length for speed - idx = 0; // Set parsing index to the first character - lenInstruction = 0; // Set emitted instruction count to zero - parens = 1; // Set paren level to 1 (the implicit outer parens) - brackets = 0; // No bracketed closures yet - - // Initialize pass by reference flags value - int[] flags = { NODE_TOPLEVEL }; - - // Parse expression - expr(flags); - - // Should be at end of input - if (idx != len) - { - if (pattern.charAt(idx) == ')') - { - syntaxError("Unmatched close paren"); - } - syntaxError("Unexpected input remains"); - } - - // Return the result - char[] ins = new char[lenInstruction]; - System.arraycopy(instruction, 0, ins, 0, lenInstruction); - return new REProgram(parens, ins); - } - - /** - * Local, nested class for maintaining character ranges for character classes. - */ - class RERange - { - int size = 16; // Capacity of current range arrays - int[] minRange = new int[size]; // Range minima - int[] maxRange = new int[size]; // Range maxima - int num = 0; // Number of range array elements in use - - /** - * Deletes the range at a given index from the range lists - * @param index Index of range to delete from minRange and maxRange arrays. - */ - void delete(int index) - { - // Return if no elements left or index is out of range - if (num == 0 || index >= num) - { - return; - } - - // Move elements down - while (++index < num) - { - if (index - 1 >= 0) - { - minRange[index-1] = minRange[index]; - maxRange[index-1] = maxRange[index]; - } - } - - // One less element now - num--; - } - - /** - * Merges a range into the range list, coalescing ranges if possible. - * @param min Minimum end of range - * @param max Maximum end of range - */ - void merge(int min, int max) - { - // Loop through ranges - for (int i = 0; i < num; i++) - { - // Min-max is subsumed by minRange[i]-maxRange[i] - if (min >= minRange[i] && max <= maxRange[i]) - { - return; - } - - // Min-max subsumes minRange[i]-maxRange[i] - else if (min <= minRange[i] && max >= maxRange[i]) - { - delete(i); - merge(min, max); - return; - } - - // Min is in the range, but max is outside - else if (min >= minRange[i] && min <= maxRange[i]) - { - delete(i); - min = minRange[i]; - merge(min, max); - return; - } - - // Max is in the range, but min is outside - else if (max >= minRange[i] && max <= maxRange[i]) - { - delete(i); - max = maxRange[i]; - merge(min, max); - return; - } - } - - // Must not overlap any other ranges - if (num >= size) - { - size *= 2; - int[] newMin = new int[size]; - int[] newMax = new int[size]; - System.arraycopy(minRange, 0, newMin, 0, num); - System.arraycopy(maxRange, 0, newMax, 0, num); - minRange = newMin; - maxRange = newMax; - } - minRange[num] = min; - maxRange[num] = max; - num++; - } - - /** - * Removes a range by deleting or shrinking all other ranges - * @param min Minimum end of range - * @param max Maximum end of range - */ - void remove(int min, int max) - { - // Loop through ranges - for (int i = 0; i < num; i++) - { - // minRange[i]-maxRange[i] is subsumed by min-max - if (minRange[i] >= min && maxRange[i] <= max) - { - delete(i); - i--; - return; - } - - // min-max is subsumed by minRange[i]-maxRange[i] - else if (min >= minRange[i] && max <= maxRange[i]) - { - int minr = minRange[i]; - int maxr = maxRange[i]; - delete(i); - if (minr < min) - { - merge(minr, min - 1); - } - if (max < maxr) - { - merge(max + 1, maxr); - } - return; - } - - // minRange is in the range, but maxRange is outside - else if (minRange[i] >= min && minRange[i] <= max) - { - minRange[i] = max + 1; - return; - } - - // maxRange is in the range, but minRange is outside - else if (maxRange[i] >= min && maxRange[i] <= max) - { - maxRange[i] = min - 1; - return; - } - } - } - - /** - * Includes (or excludes) the range from min to max, inclusive. - * @param min Minimum end of range - * @param max Maximum end of range - * @param include True if range should be included. False otherwise. - */ - void include(int min, int max, boolean include) - { - if (include) - { - merge(min, max); - } - else - { - remove(min, max); - } - } - - /** - * Includes a range with the same min and max - * @param minmax Minimum and maximum end of range (inclusive) - * @param include True if range should be included. False otherwise. - */ - void include(char minmax, boolean include) - { - include(minmax, minmax, include); - } - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/REDebugCompiler.java --- a/src/com/sun/org/apache/regexp/internal/REDebugCompiler.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,225 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -import java.io.PrintWriter; -import java.util.Hashtable; - -/** - * A subclass of RECompiler which can dump a regular expression program - * for debugging purposes. - * - * @author Jonathan Locke - */ -public class REDebugCompiler extends RECompiler -{ - /** - * Mapping from opcodes to descriptive strings - */ - static Hashtable hashOpcode = new Hashtable(); - static - { - hashOpcode.put(new Integer(RE.OP_RELUCTANTSTAR), "OP_RELUCTANTSTAR"); - hashOpcode.put(new Integer(RE.OP_RELUCTANTPLUS), "OP_RELUCTANTPLUS"); - hashOpcode.put(new Integer(RE.OP_RELUCTANTMAYBE), "OP_RELUCTANTMAYBE"); - hashOpcode.put(new Integer(RE.OP_END), "OP_END"); - hashOpcode.put(new Integer(RE.OP_BOL), "OP_BOL"); - hashOpcode.put(new Integer(RE.OP_EOL), "OP_EOL"); - hashOpcode.put(new Integer(RE.OP_ANY), "OP_ANY"); - hashOpcode.put(new Integer(RE.OP_ANYOF), "OP_ANYOF"); - hashOpcode.put(new Integer(RE.OP_BRANCH), "OP_BRANCH"); - hashOpcode.put(new Integer(RE.OP_ATOM), "OP_ATOM"); - hashOpcode.put(new Integer(RE.OP_STAR), "OP_STAR"); - hashOpcode.put(new Integer(RE.OP_PLUS), "OP_PLUS"); - hashOpcode.put(new Integer(RE.OP_MAYBE), "OP_MAYBE"); - hashOpcode.put(new Integer(RE.OP_NOTHING), "OP_NOTHING"); - hashOpcode.put(new Integer(RE.OP_GOTO), "OP_GOTO"); - hashOpcode.put(new Integer(RE.OP_ESCAPE), "OP_ESCAPE"); - hashOpcode.put(new Integer(RE.OP_OPEN), "OP_OPEN"); - hashOpcode.put(new Integer(RE.OP_CLOSE), "OP_CLOSE"); - hashOpcode.put(new Integer(RE.OP_BACKREF), "OP_BACKREF"); - hashOpcode.put(new Integer(RE.OP_POSIXCLASS), "OP_POSIXCLASS"); - hashOpcode.put(new Integer(RE.OP_OPEN_CLUSTER), "OP_OPEN_CLUSTER"); - hashOpcode.put(new Integer(RE.OP_CLOSE_CLUSTER), "OP_CLOSE_CLUSTER"); - } - - /** - * Returns a descriptive string for an opcode. - * @param opcode Opcode to convert to a string - * @return Description of opcode - */ - String opcodeToString(char opcode) - { - // Get string for opcode - String ret =(String)hashOpcode.get(new Integer(opcode)); - - // Just in case we have a corrupt program - if (ret == null) - { - ret = "OP_????"; - } - return ret; - } - - /** - * Return a string describing a (possibly unprintable) character. - * @param c Character to convert to a printable representation - * @return String representation of character - */ - String charToString(char c) - { - // If it's unprintable, convert to '\###' - if (c < ' ' || c > 127) - { - return "\\" + (int)c; - } - - // Return the character as a string - return String.valueOf(c); - } - - /** - * Returns a descriptive string for a node in a regular expression program. - * @param node Node to describe - * @return Description of node - */ - String nodeToString(int node) - { - // Get opcode and opdata for node - char opcode = instruction[node + RE.offsetOpcode]; - int opdata = (int)instruction[node + RE.offsetOpdata]; - - // Return opcode as a string and opdata value - return opcodeToString(opcode) + ", opdata = " + opdata; - } - - /** - * Inserts a node with a given opcode and opdata at insertAt. The node relative next - * pointer is initialized to 0. - * @param opcode Opcode for new node - * @param opdata Opdata for new node (only the low 16 bits are currently used) - * @param insertAt Index at which to insert the new node in the program * / - void nodeInsert(char opcode, int opdata, int insertAt) { - System.out.println( "====> " + opcode + " " + opdata + " " + insertAt ); - PrintWriter writer = new PrintWriter( System.out ); - dumpProgram( writer ); - super.nodeInsert( opcode, opdata, insertAt ); - System.out.println( "====< " ); - dumpProgram( writer ); - writer.flush(); - }/**/ - - - /** - * Appends a node to the end of a node chain - * @param node Start of node chain to traverse - * @param pointTo Node to have the tail of the chain point to * / - void setNextOfEnd(int node, int pointTo) { - System.out.println( "====> " + node + " " + pointTo ); - PrintWriter writer = new PrintWriter( System.out ); - dumpProgram( writer ); - super.setNextOfEnd( node, pointTo ); - System.out.println( "====< " ); - dumpProgram( writer ); - writer.flush(); - }/**/ - - - /** - * Dumps the current program to a PrintWriter - * @param p PrintWriter for program dump output - */ - public void dumpProgram(PrintWriter p) - { - // Loop through the whole program - for (int i = 0; i < lenInstruction; ) - { - // Get opcode, opdata and next fields of current program node - char opcode = instruction[i + RE.offsetOpcode]; - char opdata = instruction[i + RE.offsetOpdata]; - short next = (short)instruction[i + RE.offsetNext]; - - // Display the current program node - p.print(i + ". " + nodeToString(i) + ", next = "); - - // If there's no next, say 'none', otherwise give absolute index of next node - if (next == 0) - { - p.print("none"); - } - else - { - p.print(i + next); - } - - // Move past node - i += RE.nodeSize; - - // If character class - if (opcode == RE.OP_ANYOF) - { - // Opening bracket for start of char class - p.print(", ["); - - // Show each range in the char class - int rangeCount = opdata; - for (int r = 0; r < rangeCount; r++) - { - // Get first and last chars in range - char charFirst = instruction[i++]; - char charLast = instruction[i++]; - - // Print range as X-Y, unless range encompasses only one char - if (charFirst == charLast) - { - p.print(charToString(charFirst)); - } - else - { - p.print(charToString(charFirst) + "-" + charToString(charLast)); - } - } - - // Annotate the end of the char class - p.print("]"); - } - - // If atom - if (opcode == RE.OP_ATOM) - { - // Open quote - p.print(", \""); - - // Print each character in the atom - for (int len = opdata; len-- != 0; ) - { - p.print(charToString(instruction[i++])); - } - - // Close quote - p.print("\""); - } - - // Print a newline - p.println(""); - } - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/REProgram.java --- a/src/com/sun/org/apache/regexp/internal/REProgram.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,158 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -import java.io.Serializable; - -/** - * A class that holds compiled regular expressions. This is exposed mainly - * for use by the recompile utility (which helps you produce precompiled - * REProgram objects). You should not otherwise need to work directly with - * this class. -* - * @see RE - * @see RECompiler - * - * @author Jonathan Locke - */ -public class REProgram implements Serializable -{ - static final int OPT_HASBACKREFS = 1; - - char[] instruction; // The compiled regular expression 'program' - int lenInstruction; // The amount of the instruction buffer in use - char[] prefix; // Prefix string optimization - int flags; // Optimization flags (REProgram.OPT_*) - int maxParens = -1; - - /** - * Constructs a program object from a character array - * @param instruction Character array with RE opcode instructions in it - */ - public REProgram(char[] instruction) - { - this(instruction, instruction.length); - } - - /** - * Constructs a program object from a character array - * @param parens Count of parens in the program - * @param instruction Character array with RE opcode instructions in it - */ - public REProgram(int parens, char[] instruction) - { - this(instruction, instruction.length); - this.maxParens = parens; - } - - /** - * Constructs a program object from a character array - * @param instruction Character array with RE opcode instructions in it - * @param lenInstruction Amount of instruction array in use - */ - public REProgram(char[] instruction, int lenInstruction) - { - setInstructions(instruction, lenInstruction); - } - - /** - * Returns a copy of the current regular expression program in a character - * array that is exactly the right length to hold the program. If there is - * no program compiled yet, getInstructions() will return null. - * @return A copy of the current compiled RE program - */ - public char[] getInstructions() - { - // Ensure program has been compiled! - if (lenInstruction != 0) - { - // Return copy of program - char[] ret = new char[lenInstruction]; - System.arraycopy(instruction, 0, ret, 0, lenInstruction); - return ret; - } - return null; - } - - /** - * Sets a new regular expression program to run. It is this method which - * performs any special compile-time search optimizations. Currently only - * two optimizations are in place - one which checks for backreferences - * (so that they can be lazily allocated) and another which attempts to - * find an prefix anchor string so that substantial amounts of input can - * potentially be skipped without running the actual program. - * @param instruction Program instruction buffer - * @param lenInstruction Length of instruction buffer in use - */ - public void setInstructions(char[] instruction, int lenInstruction) - { - // Save reference to instruction array - this.instruction = instruction; - this.lenInstruction = lenInstruction; - - // Initialize other program-related variables - flags = 0; - prefix = null; - - // Try various compile-time optimizations if there's a program - if (instruction != null && lenInstruction != 0) - { - // If the first node is a branch - if (lenInstruction >= RE.nodeSize && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH) - { - // to the end node - int next = instruction[0 + RE.offsetNext]; - if (instruction[next + RE.offsetOpcode] == RE.OP_END) - { - // and the branch starts with an atom - if (lenInstruction >= (RE.nodeSize * 2) && instruction[RE.nodeSize + RE.offsetOpcode] == RE.OP_ATOM) - { - // then get that atom as an prefix because there's no other choice - int lenAtom = instruction[RE.nodeSize + RE.offsetOpdata]; - prefix = new char[lenAtom]; - System.arraycopy(instruction, RE.nodeSize * 2, prefix, 0, lenAtom); - } - } - } - - BackrefScanLoop: - - // Check for backreferences - for (int i = 0; i < lenInstruction; i += RE.nodeSize) - { - switch (instruction[i + RE.offsetOpcode]) - { - case RE.OP_ANYOF: - i += (instruction[i + RE.offsetOpdata] * 2); - break; - - case RE.OP_ATOM: - i += instruction[i + RE.offsetOpdata]; - break; - - case RE.OP_BACKREF: - flags |= OPT_HASBACKREFS; - break BackrefScanLoop; - } - } - } - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/RESyntaxException.java --- a/src/com/sun/org/apache/regexp/internal/RESyntaxException.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -/** - * Exception thrown to indicate a syntax error in a regular expression. - * This is a non-checked exception because you should only have problems compiling - * a regular expression during development. - * If you are making regular expresion programs dynamically then you can catch it - * if you wish. But should not be forced to. - * - * @author Jonathan Locke - * @author Jonathan Locke - * @author Jon S. Stevens - * @author Michael McCallum - */ -public class RETest -{ - // True if we want to see output from success cases - static final boolean showSuccesses = false; - - // A new line character. - static final String NEW_LINE = System.getProperty( "line.separator" ); - - // Construct a debug compiler - REDebugCompiler compiler = new REDebugCompiler(); - - /** - * Main program entrypoint. If an argument is given, it will be compiled - * and interactive matching will ensue. If no argument is given, the - * file RETest.txt will be used as automated testing input. - * @param args Command line arguments (optional regular expression) - */ - public static void main(String[] args) - { - try - { - if (!test( args )) { - System.exit(1); - } - } - catch (Exception e) - { - e.printStackTrace(); - System.exit(1); - } - } - - /** - * Testing entrypoint. - * @param args Command line arguments - * @exception Exception thrown in case of error - */ - public static boolean test( String[] args ) throws Exception - { - RETest test = new RETest(); - // Run interactive tests against a single regexp - if (args.length == 2) - { - test.runInteractiveTests(args[1]); - } - else if (args.length == 1) - { - // Run automated tests - test.runAutomatedTests(args[0]); - } - else - { - System.out.println( "Usage: RETest ([-i] [regex]) ([/path/to/testfile.txt])" ); - System.out.println( "By Default will run automated tests from file 'docs/RETest.txt' ..." ); - System.out.println(); - test.runAutomatedTests("docs/RETest.txt"); - } - return test.failures == 0; - } - - /** - * Constructor - */ - public RETest() - { - } - - /** - * Compile and test matching against a single expression - * @param expr Expression to compile and test - */ - void runInteractiveTests(String expr) - { - RE r = new RE(); - try - { - // Compile expression - r.setProgram(compiler.compile(expr)); - - // Show expression - say("" + NEW_LINE + "" + expr + "" + NEW_LINE + ""); - - // Show program for compiled expression - PrintWriter writer = new PrintWriter( System.out ); - compiler.dumpProgram( writer ); - writer.flush(); - - boolean running = true; - // Test matching against compiled expression - while ( running ) - { - // Read from keyboard - BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); - System.out.print("> "); - System.out.flush(); - String match = br.readLine(); - - if ( match != null ) - { - // Try a match against the keyboard input - if (r.match(match)) - { - say("Match successful."); - } - else - { - say("Match failed."); - } - - // Show subparen registers - showParens(r); - } - else - { - running = false; - System.out.println(); - } - } - } - catch (Exception e) - { - say("Error: " + e.toString()); - e.printStackTrace(); - } - } - - /** - * Exit with a fatal error. - * @param s Last famous words before exiting - */ - void die(String s) - { - say("FATAL ERROR: " + s); - System.exit(-1); - } - - /** - * Fail with an error. Will print a big failure message to System.out. - * - * @param log Output before failure - * @param s Failure description - */ - void fail(StringBuffer log, String s) - { - System.out.print(log.toString()); - fail(s); - } - - /** - * Fail with an error. Will print a big failure message to System.out. - * - * @param s Failure description - */ - void fail(String s) - { - failures++; - say("" + NEW_LINE + ""); - say("*******************************************************"); - say("********************* FAILURE! **********************"); - say("*******************************************************"); - say("" + NEW_LINE + ""); - say(s); - say(""); - // make sure the writer gets flushed. - if (compiler != null) { - PrintWriter writer = new PrintWriter( System.out ); - compiler.dumpProgram( writer ); - writer.flush(); - say("" + NEW_LINE + ""); - } - } - - /** - * Say something to standard out - * @param s What to say - */ - void say(String s) - { - System.out.println(s); - } - - /** - * Dump parenthesized subexpressions found by a regular expression matcher object - * @param r Matcher object with results to show - */ - void showParens(RE r) - { - // Loop through each paren - for (int i = 0; i < r.getParenCount(); i++) - { - // Show paren register - say("$" + i + " = " + r.getParen(i)); - } - } - - /* - * number in automated test - */ - int testCount = 0; - - /* - * Count of failures in automated test - */ - int failures = 0; - - /** - * Run automated tests in RETest.txt file (from Perl 4.0 test battery) - * @exception Exception thrown in case of error - */ - void runAutomatedTests(String testDocument) throws Exception - { - long ms = System.currentTimeMillis(); - - // Some unit tests - testPrecompiledRE(); - testSplitAndGrep(); - testSubst(); - testOther(); - - // Test from script file - File testInput = new File(testDocument); - if (! testInput.exists()) { - throw new Exception ("Could not find: " + testDocument); - } - - BufferedReader br = new BufferedReader(new FileReader(testInput)); - try - { - // While input is available, parse lines - while (br.ready()) - { - RETestCase testcase = getNextTestCase(br); - if (testcase != null) { - testcase.runTest(); - } - } - } - finally - { - br.close(); - } - - // Show match time - say(NEW_LINE + NEW_LINE + "Match time = " + (System.currentTimeMillis() - ms) + " ms."); - - // Print final results - if (failures > 0) { - say("*************** THERE ARE FAILURES! *******************"); - } - say("Tests complete. " + testCount + " tests, " + failures + " failure(s)."); - } - - /** - * Run automated unit test - * @exception Exception thrown in case of error - */ - void testOther() throws Exception - { - // Serialization test 1: Compile regexp and serialize/deserialize it - RE r = new RE("(a*)b"); - say("Serialized/deserialized (a*)b"); - ByteArrayOutputStream out = new ByteArrayOutputStream(128); - new ObjectOutputStream(out).writeObject(r); - ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); - r = (RE)new ObjectInputStream(in).readObject(); - if (!r.match("aaab")) - { - fail("Did not match 'aaab' with deserialized RE."); - } else { - say("aaaab = true"); - showParens(r); - } - - // Serialization test 2: serialize/deserialize used regexp - out.reset(); - say("Deserialized (a*)b"); - new ObjectOutputStream(out).writeObject(r); - in = new ByteArrayInputStream(out.toByteArray()); - r = (RE)new ObjectInputStream(in).readObject(); - if (r.getParenCount() != 0) - { - fail("Has parens after deserialization."); - } - if (!r.match("aaab")) - { - fail("Did not match 'aaab' with deserialized RE."); - } else { - say("aaaab = true"); - showParens(r); - } - - // Test MATCH_CASEINDEPENDENT - r = new RE("abc(\\w*)"); - say("MATCH_CASEINDEPENDENT abc(\\w*)"); - r.setMatchFlags(RE.MATCH_CASEINDEPENDENT); - say("abc(d*)"); - if (!r.match("abcddd")) - { - fail("Did not match 'abcddd'."); - } else { - say("abcddd = true"); - showParens(r); - } - - if (!r.match("aBcDDdd")) - { - fail("Did not match 'aBcDDdd'."); - } else { - say("aBcDDdd = true"); - showParens(r); - } - - if (!r.match("ABCDDDDD")) - { - fail("Did not match 'ABCDDDDD'."); - } else { - say("ABCDDDDD = true"); - showParens(r); - } - - r = new RE("(A*)b\\1"); - r.setMatchFlags(RE.MATCH_CASEINDEPENDENT); - if (!r.match("AaAaaaBAAAAAA")) - { - fail("Did not match 'AaAaaaBAAAAAA'."); - } else { - say("AaAaaaBAAAAAA = true"); - showParens(r); - } - - r = new RE("[A-Z]*"); - r.setMatchFlags(RE.MATCH_CASEINDEPENDENT); - if (!r.match("CaBgDe12")) - { - fail("Did not match 'CaBgDe12'."); - } else { - say("CaBgDe12 = true"); - showParens(r); - } - - // Test MATCH_MULTILINE. Test for eol/bol symbols. - r = new RE("^abc$", RE.MATCH_MULTILINE); - if (!r.match("\nabc")) { - fail("\"\\nabc\" doesn't match \"^abc$\""); - } - if (!r.match("\rabc")) { - fail("\"\\rabc\" doesn't match \"^abc$\""); - } - if (!r.match("\r\nabc")) { - fail("\"\\r\\nabc\" doesn't match \"^abc$\""); - } - if (!r.match("\u0085abc")) { - fail("\"\\u0085abc\" doesn't match \"^abc$\""); - } - if (!r.match("\u2028abc")) { - fail("\"\\u2028abc\" doesn't match \"^abc$\""); - } - if (!r.match("\u2029abc")) { - fail("\"\\u2029abc\" doesn't match \"^abc$\""); - } - - // Test MATCH_MULTILINE. Test that '.' does not matches new line. - r = new RE("^a.*b$", RE.MATCH_MULTILINE); - if (r.match("a\nb")) { - fail("\"a\\nb\" matches \"^a.*b$\""); - } - if (r.match("a\rb")) { - fail("\"a\\rb\" matches \"^a.*b$\""); - } - if (r.match("a\r\nb")) { - fail("\"a\\r\\nb\" matches \"^a.*b$\""); - } - if (r.match("a\u0085b")) { - fail("\"a\\u0085b\" matches \"^a.*b$\""); - } - if (r.match("a\u2028b")) { - fail("\"a\\u2028b\" matches \"^a.*b$\""); - } - if (r.match("a\u2029b")) { - fail("\"a\\u2029b\" matches \"^a.*b$\""); - } - } - - private void testPrecompiledRE() - { - // Pre-compiled regular expression "a*b" - char[] re1Instructions = - { - 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041, - 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047, - 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000, - 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000, - 0x0000, - }; - - REProgram re1 = new REProgram(re1Instructions); - - // Simple test of pre-compiled regular expressions - RE r = new RE(re1); - say("a*b"); - boolean result = r.match("aaab"); - say("aaab = " + result); - showParens(r); - if (!result) { - fail("\"aaab\" doesn't match to precompiled \"a*b\""); - } - - result = r.match("b"); - say("b = " + result); - showParens(r); - if (!result) { - fail("\"b\" doesn't match to precompiled \"a*b\""); - } - - result = r.match("c"); - say("c = " + result); - showParens(r); - if (result) { - fail("\"c\" matches to precompiled \"a*b\""); - } - - result = r.match("ccccaaaaab"); - say("ccccaaaaab = " + result); - showParens(r); - if (!result) { - fail("\"ccccaaaaab\" doesn't match to precompiled \"a*b\""); - } - } - - private void testSplitAndGrep() - { - String[] expected = {"xxxx", "xxxx", "yyyy", "zzz"}; - RE r = new RE("a*b"); - String[] s = r.split("xxxxaabxxxxbyyyyaaabzzz"); - for (int i = 0; i < expected.length && i < s.length; i++) { - assertEquals("Wrong splitted part", expected[i], s[i]); - } - assertEquals("Wrong number of splitted parts", expected.length, - s.length); - - r = new RE("x+"); - expected = new String[] {"xxxx", "xxxx"}; - s = r.grep(s); - for (int i = 0; i < s.length; i++) - { - say("s[" + i + "] = " + s[i]); - assertEquals("Grep fails", expected[i], s[i]); - } - assertEquals("Wrong number of string found by grep", expected.length, - s.length); - } - - private void testSubst() - { - RE r = new RE("a*b"); - String expected = "-foo-garply-wacky-"; - String actual = r.subst("aaaabfooaaabgarplyaaabwackyb", "-"); - assertEquals("Wrong result of substitution in \"a*b\"", expected, actual); - - // Test subst() with backreferences - r = new RE("http://[\\.\\w\\-\\?/~_@&=%]+"); - actual = r.subst("visit us: http://www.apache.org!", - "1234$0", RE.REPLACE_BACKREFERENCES); - assertEquals("Wrong subst() result", "visit us: 1234http://www.apache.org!", actual); - - // Test subst() with backreferences without leading characters - // before first backreference - r = new RE("(.*?)=(.*)"); - actual = r.subst("variable=value", - "$1_test_$212", RE.REPLACE_BACKREFERENCES); - assertEquals("Wrong subst() result", "variable_test_value12", actual); - - // Test subst() with NO backreferences - r = new RE("^a$"); - actual = r.subst("a", - "b", RE.REPLACE_BACKREFERENCES); - assertEquals("Wrong subst() result", "b", actual); - - // Test subst() with NO backreferences - r = new RE("^a$", RE.MATCH_MULTILINE); - actual = r.subst("\r\na\r\n", - "b", RE.REPLACE_BACKREFERENCES); - assertEquals("Wrong subst() result", "\r\nb\r\n", actual); - } - - public void assertEquals(String message, String expected, String actual) - { - if (expected != null && !expected.equals(actual) - || actual != null && !actual.equals(expected)) - { - fail(message + " (expected \"" + expected - + "\", actual \"" + actual + "\")"); - } - } - - public void assertEquals(String message, int expected, int actual) - { - if (expected != actual) { - fail(message + " (expected \"" + expected - + "\", actual \"" + actual + "\")"); - } - } - - /** - * Converts yesno string to boolean. - * @param yesno string representation of expected result - * @return true if yesno is "YES", false if yesno is "NO" - * stops program otherwise. - */ - private boolean getExpectedResult(String yesno) - { - if ("NO".equals(yesno)) - { - return false; - } - else if ("YES".equals(yesno)) - { - return true; - } - else - { - // Bad test script - die("Test script error!"); - return false; //to please javac - } - } - - /** - * Finds next test description in a given script. - * @param br BufferedReader for a script file - * @return strign tag for next test description - * @exception IOException if some io problems occured - */ - private String findNextTest(BufferedReader br) throws IOException - { - String number = ""; - - while (br.ready()) - { - number = br.readLine(); - if (number == null) - { - break; - } - number = number.trim(); - if (number.startsWith("#")) - { - break; - } - if (!number.equals("")) - { - say("Script error. Line = " + number); - System.exit(-1); - } - } - return number; - } - - /** - * Creates testcase for the next test description in the script file. - * @param br BufferedReader for script file. - * @return a new tescase or null. - * @exception IOException if some io problems occured - */ - private RETestCase getNextTestCase(BufferedReader br) throws IOException - { - // Find next re test case - final String tag = findNextTest(br); - - // Are we done? - if (!br.ready()) - { - return null; - } - - // Get expression - final String expr = br.readLine(); - - // Get test information - final String matchAgainst = br.readLine(); - final boolean badPattern = "ERR".equals(matchAgainst); - boolean shouldMatch = false; - int expectedParenCount = 0; - String[] expectedParens = null; - - if (!badPattern) { - shouldMatch = getExpectedResult(br.readLine().trim()); - if (shouldMatch) { - expectedParenCount = Integer.parseInt(br.readLine().trim()); - expectedParens = new String[expectedParenCount]; - for (int i = 0; i < expectedParenCount; i++) { - expectedParens[i] = br.readLine(); - } - } - } - - return new RETestCase(this, tag, expr, matchAgainst, badPattern, - shouldMatch, expectedParens); - } -} - -final class RETestCase -{ - final private StringBuffer log = new StringBuffer(); - final private int number; - final private String tag; // number from script file - final private String pattern; - final private String toMatch; - final private boolean badPattern; - final private boolean shouldMatch; - final private String[] parens; - final private RETest test; - private RE regexp; - - public RETestCase(RETest test, String tag, String pattern, - String toMatch, boolean badPattern, - boolean shouldMatch, String[] parens) - { - this.number = ++test.testCount; - this.test = test; - this.tag = tag; - this.pattern = pattern; - this.toMatch = toMatch; - this.badPattern = badPattern; - this.shouldMatch = shouldMatch; - if (parens != null) { - this.parens = new String[parens.length]; - for (int i = 0; i < parens.length; i++) { - this.parens[i] = parens[i]; - } - } else { - this.parens = null; - } - } - - public void runTest() - { - test.say(tag + "(" + number + "): " + pattern); - if (testCreation()) { - testMatch(); - } - } - - boolean testCreation() - { - try - { - // Compile it - regexp = new RE(); - regexp.setProgram(test.compiler.compile(pattern)); - // Expression didn't cause an expected error - if (badPattern) - { - test.fail(log, "Was expected to be an error, but wasn't."); - return false; - } - - return true; - } - // Some expressions *should* cause exceptions to be thrown - catch (Exception e) - { - // If it was supposed to be an error, report success and continue - if (badPattern) - { - log.append(" Match: ERR\n"); - success("Produces an error (" + e.toString() + "), as expected."); - return false; - } - - // Wasn't supposed to be an error - String message = (e.getMessage() == null) ? e.toString() : e.getMessage(); - test.fail(log, "Produces an unexpected exception \"" + message + "\""); - e.printStackTrace(); - } - catch (Error e) - { - // Internal error happened - test.fail(log, "Compiler threw fatal error \"" + e.getMessage() + "\""); - e.printStackTrace(); - } - - return false; - } - - private void testMatch() - { - log.append(" Match against: '" + toMatch + "'\n"); - // Try regular matching - try - { - // Match against the string - boolean result = regexp.match(toMatch); - log.append(" Matched: " + (result ? "YES" : "NO") + "\n"); - - // Check result, parens, and iterators - if (checkResult(result) && (!shouldMatch || checkParens())) - { - // test match(CharacterIterator, int) - // for every CharacterIterator implementation. - log.append(" Match using StringCharacterIterator\n"); - if (!tryMatchUsingCI(new StringCharacterIterator(toMatch))) - return; - - log.append(" Match using CharacterArrayCharacterIterator\n"); - if (!tryMatchUsingCI(new CharacterArrayCharacterIterator(toMatch.toCharArray(), 0, toMatch.length()))) - return; - - log.append(" Match using StreamCharacterIterator\n"); - if (!tryMatchUsingCI(new StreamCharacterIterator(new StringBufferInputStream(toMatch)))) - return; - - log.append(" Match using ReaderCharacterIterator\n"); - if (!tryMatchUsingCI(new ReaderCharacterIterator(new StringReader(toMatch)))) - return; - } - } - // Matcher blew it - catch(Exception e) - { - test.fail(log, "Matcher threw exception: " + e.toString()); - e.printStackTrace(); - } - // Internal error - catch(Error e) - { - test.fail(log, "Matcher threw fatal error \"" + e.getMessage() + "\""); - e.printStackTrace(); - } - } - - private boolean checkResult(boolean result) - { - // Write status - if (result == shouldMatch) { - success((shouldMatch ? "Matched" : "Did not match") - + " \"" + toMatch + "\", as expected:"); - return true; - } else { - if (shouldMatch) { - test.fail(log, "Did not match \"" + toMatch + "\", when expected to."); - } else { - test.fail(log, "Matched \"" + toMatch + "\", when not expected to."); - } - return false; - } - } - - private boolean checkParens() - { - // Show subexpression registers - if (RETest.showSuccesses) - { - test.showParens(regexp); - } - - log.append(" Paren count: " + regexp.getParenCount() + "\n"); - if (!assertEquals(log, "Wrong number of parens", parens.length, regexp.getParenCount())) - { - return false; - } - - // Check registers against expected contents - for (int p = 0; p < regexp.getParenCount(); p++) - { - log.append(" Paren " + p + ": " + regexp.getParen(p) + "\n"); - - // Compare expected result with actual - if ("null".equals(parens[p]) && regexp.getParen(p) == null) - { - // Consider "null" in test file equal to null - continue; - } - if (!assertEquals(log, "Wrong register " + p, parens[p], regexp.getParen(p))) - { - return false; - } - } - - return true; - } - - boolean tryMatchUsingCI(CharacterIterator matchAgainst) - { - try { - boolean result = regexp.match(matchAgainst, 0); - log.append(" Match: " + (result ? "YES" : "NO") + "\n"); - return checkResult(result) && (!shouldMatch || checkParens()); - } - // Matcher blew it - catch(Exception e) - { - test.fail(log, "Matcher threw exception: " + e.toString()); - e.printStackTrace(); - } - // Internal error - catch(Error e) - { - test.fail(log, "Matcher threw fatal error \"" + e.getMessage() + "\""); - e.printStackTrace(); - } - return false; - } - - public boolean assertEquals(StringBuffer log, String message, String expected, String actual) - { - if (expected != null && !expected.equals(actual) - || actual != null && !actual.equals(expected)) - { - test.fail(log, message + " (expected \"" + expected - + "\", actual \"" + actual + "\")"); - return false; - } - return true; - } - - public boolean assertEquals(StringBuffer log, String message, int expected, int actual) - { - if (expected != actual) { - test.fail(log, message + " (expected \"" + expected - + "\", actual \"" + actual + "\")"); - return false; - } - return true; - } - - /** - * Show a success - * @param s Success story - */ - void success(String s) - { - if (RETest.showSuccesses) - { - test.say("" + RETest.NEW_LINE + "-----------------------" + RETest.NEW_LINE + ""); - test.say("Expression #" + (number) + " \"" + pattern + "\" "); - test.say("Success: " + s); - } - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/REUtil.java --- a/src/com/sun/org/apache/regexp/internal/REUtil.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -/** - * This is a class that contains utility helper methods for this package. - * - * @author Jonathan Locke - */ -public class REUtil -{ - /** complex: */ - private static final String complexPrefix = "complex:"; - - /** - * Creates a regular expression, permitting simple or complex syntax - * @param expression The expression, beginning with a prefix if it's complex or - * having no prefix if it's simple - * @param matchFlags Matching style flags - * @return The regular expression object - * @exception RESyntaxException thrown in case of error - */ - public static RE createRE(String expression, int matchFlags) throws RESyntaxException - { - if (expression.startsWith(complexPrefix)) - { - return new RE(expression.substring(complexPrefix.length()), matchFlags); - } - return new RE(RE.simplePatternToFullRegularExpression(expression), matchFlags); - } - - /** - * Creates a regular expression, permitting simple or complex syntax - * @param expression The expression, beginning with a prefix if it's complex or - * having no prefix if it's simple - * @return The regular expression object - * @exception RESyntaxException thrown in case of error - */ - public static RE createRE(String expression) throws RESyntaxException - { - return createRE(expression, RE.MATCH_NORMAL); - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/ReaderCharacterIterator.java --- a/src/com/sun/org/apache/regexp/internal/ReaderCharacterIterator.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,164 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -import java.io.Reader; -import java.io.IOException; - -/** - * Encapsulates java.io.Reader as CharacterIterator - * - * @author Ales Novak - */ -public final class ReaderCharacterIterator implements CharacterIterator -{ - /** Underlying reader */ - private final Reader reader; - - /** Buffer of read chars */ - private final StringBuffer buff; - - /** read end? */ - private boolean closed; - - /** @param reader a Reader, which is parsed */ - public ReaderCharacterIterator(Reader reader) - { - this.reader = reader; - this.buff = new StringBuffer(512); - this.closed = false; - } - - /** @return a substring */ - public String substring(int beginIndex, int endIndex) - { - try - { - ensure(endIndex); - return buff.toString().substring(beginIndex, endIndex); - } - catch (IOException e) - { - throw new StringIndexOutOfBoundsException(e.getMessage()); - } - } - - /** @return a substring */ - public String substring(int beginIndex) - { - try - { - readAll(); - return buff.toString().substring(beginIndex); - } - catch (IOException e) - { - throw new StringIndexOutOfBoundsException(e.getMessage()); - } - } - - /** @return a character at the specified position. */ - public char charAt(int pos) - { - try - { - ensure(pos); - return buff.charAt(pos); - } - catch (IOException e) - { - throw new StringIndexOutOfBoundsException(e.getMessage()); - } - } - - /** @return true iff if the specified index is after the end of the character stream */ - public boolean isEnd(int pos) - { - if (buff.length() > pos) - { - return false; - } - else - { - try - { - ensure(pos); - return (buff.length() <= pos); - } - catch (IOException e) - { - throw new StringIndexOutOfBoundsException(e.getMessage()); - } - } - } - - /** Reads n characters from the stream and appends them to the buffer */ - private int read(int n) throws IOException - { - if (closed) - { - return 0; - } - - char[] c = new char[n]; - int count = 0; - int read = 0; - - do - { - read = reader.read(c); - if (read < 0) // EOF - { - closed = true; - break; - } - count += read; - buff.append(c, 0, read); - } - while (count < n); - - return count; - } - - /** Reads rest of the stream. */ - private void readAll() throws IOException - { - while(! closed) - { - read(1000); - } - } - - /** Reads chars up to the idx */ - private void ensure(int idx) throws IOException - { - if (closed) - { - return; - } - - if (idx < buff.length()) - { - return; - } - read(idx + 1 - buff.length()); - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/StreamCharacterIterator.java --- a/src/com/sun/org/apache/regexp/internal/StreamCharacterIterator.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,161 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -import java.io.InputStream; -import java.io.IOException; - -/** - * Encapsulates java.io.InputStream as CharacterIterator. - * - * @author Ales Novak - */ -public final class StreamCharacterIterator implements CharacterIterator -{ - /** Underlying is */ - private final InputStream is; - - /** Buffer of read chars */ - private final StringBuffer buff; - - /** read end? */ - private boolean closed; - - /** @param is an InputStream, which is parsed */ - public StreamCharacterIterator(InputStream is) - { - this.is = is; - this.buff = new StringBuffer(512); - this.closed = false; - } - - /** @return a substring */ - public String substring(int beginIndex, int endIndex) - { - try - { - ensure(endIndex); - return buff.toString().substring(beginIndex, endIndex); - } - catch (IOException e) - { - throw new StringIndexOutOfBoundsException(e.getMessage()); - } - } - - /** @return a substring */ - public String substring(int beginIndex) - { - try - { - readAll(); - return buff.toString().substring(beginIndex); - } - catch (IOException e) - { - throw new StringIndexOutOfBoundsException(e.getMessage()); - } - } - - - /** @return a character at the specified position. */ - public char charAt(int pos) - { - try - { - ensure(pos); - return buff.charAt(pos); - } - catch (IOException e) - { - throw new StringIndexOutOfBoundsException(e.getMessage()); - } - } - - /** @return true iff if the specified index is after the end of the character stream */ - public boolean isEnd(int pos) - { - if (buff.length() > pos) - { - return false; - } - else - { - try - { - ensure(pos); - return (buff.length() <= pos); - } - catch (IOException e) - { - throw new StringIndexOutOfBoundsException(e.getMessage()); - } - } - } - - /** Reads n characters from the stream and appends them to the buffer */ - private int read(int n) throws IOException - { - if (closed) - { - return 0; - } - - int c; - int i = n; - while (--i >= 0) - { - c = is.read(); - if (c < 0) // EOF - { - closed = true; - break; - } - buff.append((char) c); - } - return n - i; - } - - /** Reads rest of the stream. */ - private void readAll() throws IOException - { - while(! closed) - { - read(1000); - } - } - - /** Reads chars up to the idx */ - private void ensure(int idx) throws IOException - { - if (closed) - { - return; - } - - if (idx < buff.length()) - { - return; - } - - read(idx + 1 - buff.length()); - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/StringCharacterIterator.java --- a/src/com/sun/org/apache/regexp/internal/StringCharacterIterator.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -/** - * Encapsulates String as CharacterIterator. - * - * @author Ales Novak - */ -public final class StringCharacterIterator implements CharacterIterator -{ - /** encapsulated */ - private final String src; - - /** @param src - encapsulated String */ - public StringCharacterIterator(String src) - { - this.src = src; - } - - /** @return a substring */ - public String substring(int beginIndex, int endIndex) - { - return src.substring(beginIndex, endIndex); - } - - /** @return a substring */ - public String substring(int beginIndex) - { - return src.substring(beginIndex); - } - - /** @return a character at the specified position. */ - public char charAt(int pos) - { - return src.charAt(pos); - } - - /** @return true iff if the specified index is after the end of the character stream */ - public boolean isEnd(int pos) - { - return (pos >= src.length()); - } -} diff -r 3b8ebb957957 -r aaee9ae4799a src/com/sun/org/apache/regexp/internal/recompile.java --- a/src/com/sun/org/apache/regexp/internal/recompile.java Sat Oct 24 16:18:47 2020 +0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,137 +0,0 @@ -/* - * reserved comment block - * DO NOT REMOVE OR ALTER! - */ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.sun.org.apache.regexp.internal; - -import com.sun.org.apache.regexp.internal.RECompiler; -import com.sun.org.apache.regexp.internal.RESyntaxException; - -/** - * 'recompile' is a command line tool that pre-compiles one or more regular expressions - * for use with the regular expression matcher class 'RE'. For example, the command - * "java recompile a*b" produces output like this: - * - *

- *
- *    // Pre-compiled regular expression "a*b"
- *    char[] re1Instructions =
- *    {
- *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
- *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
- *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
- *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
- *        0x0000,
- *    };
- *
- *    REProgram re1 = new REProgram(re1Instructions);
- *
- * 
- * - * By pasting this output into your code, you can construct a regular expression matcher - * (RE) object directly from the pre-compiled data (the character array re1), thus avoiding - * the overhead of compiling the expression at runtime. For example: - * - *
- *
- *    RE r = new RE(re1);
- *
- * 
- * - * @see RE - * @see RECompiler - * - * @author Jonathan Locke - */ -public class recompile -{ - /** - * Main application entrypoint. - * @param arg Command line arguments - */ - static public void main(String[] arg) - { - // Create a compiler object - RECompiler r = new RECompiler(); - - // Print usage if arguments are incorrect - if (arg.length <= 0 || arg.length % 2 != 0) - { - System.out.println("Usage: recompile "); - System.exit(0); - } - - // Loop through arguments, compiling each - for (int i = 0; i < arg.length; i += 2) - { - try - { - // Compile regular expression - String name = arg[i]; - String pattern = arg[i+1]; - String instructions = name + "PatternInstructions"; - - // Output program as a nice, formatted character array - System.out.print("\n // Pre-compiled regular expression '" + pattern + "'\n" - + " private static char[] " + instructions + " = \n {"); - - // Compile program for pattern - REProgram program = r.compile(pattern); - - // Number of columns in output - int numColumns = 7; - - // Loop through program - char[] p = program.getInstructions(); - for (int j = 0; j < p.length; j++) - { - // End of column? - if ((j % numColumns) == 0) - { - System.out.print("\n "); - } - - // Print character as padded hex number - String hex = Integer.toHexString(p[j]); - while (hex.length() < 4) - { - hex = "0" + hex; - } - System.out.print("0x" + hex + ", "); - } - - // End of program block - System.out.println("\n };"); - System.out.println("\n private static RE " + name + "Pattern = new RE(new REProgram(" + instructions + "));"); - } - catch (RESyntaxException e) - { - System.out.println("Syntax error in expression \"" + arg[i] + "\": " + e.toString()); - } - catch (Exception e) - { - System.out.println("Unexpected exception: " + e.toString()); - } - catch (Error e) - { - System.out.println("Internal error: " + e.toString()); - } - } - } -}