1.1 --- a/src/com/sun/org/apache/regexp/internal/RE.java Sat Oct 24 16:18:47 2020 +0800 1.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 1.3 @@ -1,1760 +0,0 @@ 1.4 -/* 1.5 - * reserved comment block 1.6 - * DO NOT REMOVE OR ALTER! 1.7 - */ 1.8 -/* 1.9 - * Copyright 1999-2004 The Apache Software Foundation. 1.10 - * 1.11 - * Licensed under the Apache License, Version 2.0 (the "License"); 1.12 - * you may not use this file except in compliance with the License. 1.13 - * You may obtain a copy of the License at 1.14 - * 1.15 - * http://www.apache.org/licenses/LICENSE-2.0 1.16 - * 1.17 - * Unless required by applicable law or agreed to in writing, software 1.18 - * distributed under the License is distributed on an "AS IS" BASIS, 1.19 - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.20 - * See the License for the specific language governing permissions and 1.21 - * limitations under the License. 1.22 - */ 1.23 - 1.24 -package com.sun.org.apache.regexp.internal; 1.25 - 1.26 -import java.io.Serializable; 1.27 -import java.util.Vector; 1.28 - 1.29 -/** 1.30 - * RE is an efficient, lightweight regular expression evaluator/matcher 1.31 - * class. Regular expressions are pattern descriptions which enable 1.32 - * sophisticated matching of strings. In addition to being able to 1.33 - * match a string against a pattern, you can also extract parts of the 1.34 - * match. This is especially useful in text parsing! Details on the 1.35 - * syntax of regular expression patterns are given below. 1.36 - * 1.37 - * <p> 1.38 - * To compile a regular expression (RE), you can simply construct an RE 1.39 - * matcher object from the string specification of the pattern, like this: 1.40 - * 1.41 - * <pre> 1.42 - * RE r = new RE("a*b"); 1.43 - * </pre> 1.44 - * 1.45 - * <p> 1.46 - * Once you have done this, you can call either of the RE.match methods to 1.47 - * perform matching on a String. For example: 1.48 - * 1.49 - * <pre> 1.50 - * boolean matched = r.match("aaaab"); 1.51 - * </pre> 1.52 - * 1.53 - * will cause the boolean matched to be set to true because the 1.54 - * pattern "a*b" matches the string "aaaab". 1.55 - * 1.56 - * <p> 1.57 - * If you were interested in the <i>number</i> of a's which matched the 1.58 - * first part of our example expression, you could change the expression to 1.59 - * "(a*)b". Then when you compiled the expression and matched it against 1.60 - * something like "xaaaab", you would get results like this: 1.61 - * 1.62 - * <pre> 1.63 - * RE r = new RE("(a*)b"); // Compile expression 1.64 - * boolean matched = r.match("xaaaab"); // Match against "xaaaab" 1.65 - * 1.66 - * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab' 1.67 - * String insideParens = r.getParen(1); // insideParens will be 'aaaa' 1.68 - * 1.69 - * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1 1.70 - * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6 1.71 - * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5 1.72 - * 1.73 - * int startInside = r.getParenStart(1); // startInside will be index 1 1.74 - * int endInside = r.getParenEnd(1); // endInside will be index 5 1.75 - * int lenInside = r.getParenLength(1); // lenInside will be 4 1.76 - * </pre> 1.77 - * 1.78 - * You can also refer to the contents of a parenthesized expression 1.79 - * within a regular expression itself. This is called a 1.80 - * 'backreference'. The first backreference in a regular expression is 1.81 - * denoted by \1, the second by \2 and so on. So the expression: 1.82 - * 1.83 - * <pre> 1.84 - * ([0-9]+)=\1 1.85 - * </pre> 1.86 - * 1.87 - * will match any string of the form n=n (like 0=0 or 2=2). 1.88 - * 1.89 - * <p> 1.90 - * The full regular expression syntax accepted by RE is described here: 1.91 - * 1.92 - * <pre> 1.93 - * 1.94 - * <b><font face=times roman>Characters</font></b> 1.95 - * 1.96 - * <i>unicodeChar</i> Matches any identical unicode character 1.97 - * \ Used to quote a meta-character (like '*') 1.98 - * \\ Matches a single '\' character 1.99 - * \0nnn Matches a given octal character 1.100 - * \xhh Matches a given 8-bit hexadecimal character 1.101 - * \\uhhhh Matches a given 16-bit hexadecimal character 1.102 - * \t Matches an ASCII tab character 1.103 - * \n Matches an ASCII newline character 1.104 - * \r Matches an ASCII return character 1.105 - * \f Matches an ASCII form feed character 1.106 - * 1.107 - * 1.108 - * <b><font face=times roman>Character Classes</font></b> 1.109 - * 1.110 - * [abc] Simple character class 1.111 - * [a-zA-Z] Character class with ranges 1.112 - * [^abc] Negated character class 1.113 - * </pre> 1.114 - * 1.115 - * <b>NOTE:</b> Incomplete ranges will be interpreted as "starts 1.116 - * from zero" or "ends with last character". 1.117 - * <br> 1.118 - * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF], 1.119 - * [-] means "all characters". 1.120 - * 1.121 - * <pre> 1.122 - * 1.123 - * <b><font face=times roman>Standard POSIX Character Classes</font></b> 1.124 - * 1.125 - * [:alnum:] Alphanumeric characters. 1.126 - * [:alpha:] Alphabetic characters. 1.127 - * [:blank:] Space and tab characters. 1.128 - * [:cntrl:] Control characters. 1.129 - * [:digit:] Numeric characters. 1.130 - * [:graph:] Characters that are printable and are also visible. 1.131 - * (A space is printable, but not visible, while an 1.132 - * `a' is both.) 1.133 - * [:lower:] Lower-case alphabetic characters. 1.134 - * [:print:] Printable characters (characters that are not 1.135 - * control characters.) 1.136 - * [:punct:] Punctuation characters (characters that are not letter, 1.137 - * digits, control characters, or space characters). 1.138 - * [:space:] Space characters (such as space, tab, and formfeed, 1.139 - * to name a few). 1.140 - * [:upper:] Upper-case alphabetic characters. 1.141 - * [:xdigit:] Characters that are hexadecimal digits. 1.142 - * 1.143 - * 1.144 - * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b> 1.145 - * 1.146 - * [:javastart:] Start of a Java identifier 1.147 - * [:javapart:] Part of a Java identifier 1.148 - * 1.149 - * 1.150 - * <b><font face=times roman>Predefined Classes</font></b> 1.151 - * 1.152 - * . Matches any character other than newline 1.153 - * \w Matches a "word" character (alphanumeric plus "_") 1.154 - * \W Matches a non-word character 1.155 - * \s Matches a whitespace character 1.156 - * \S Matches a non-whitespace character 1.157 - * \d Matches a digit character 1.158 - * \D Matches a non-digit character 1.159 - * 1.160 - * 1.161 - * <b><font face=times roman>Boundary Matchers</font></b> 1.162 - * 1.163 - * ^ Matches only at the beginning of a line 1.164 - * $ Matches only at the end of a line 1.165 - * \b Matches only at a word boundary 1.166 - * \B Matches only at a non-word boundary 1.167 - * 1.168 - * 1.169 - * <b><font face=times roman>Greedy Closures</font></b> 1.170 - * 1.171 - * A* Matches A 0 or more times (greedy) 1.172 - * A+ Matches A 1 or more times (greedy) 1.173 - * A? Matches A 1 or 0 times (greedy) 1.174 - * A{n} Matches A exactly n times (greedy) 1.175 - * A{n,} Matches A at least n times (greedy) 1.176 - * A{n,m} Matches A at least n but not more than m times (greedy) 1.177 - * 1.178 - * 1.179 - * <b><font face=times roman>Reluctant Closures</font></b> 1.180 - * 1.181 - * A*? Matches A 0 or more times (reluctant) 1.182 - * A+? Matches A 1 or more times (reluctant) 1.183 - * A?? Matches A 0 or 1 times (reluctant) 1.184 - * 1.185 - * 1.186 - * <b><font face=times roman>Logical Operators</font></b> 1.187 - * 1.188 - * AB Matches A followed by B 1.189 - * A|B Matches either A or B 1.190 - * (A) Used for subexpression grouping 1.191 - * (?:A) Used for subexpression clustering (just like grouping but 1.192 - * no backrefs) 1.193 - * 1.194 - * 1.195 - * <b><font face=times roman>Backreferences</font></b> 1.196 - * 1.197 - * \1 Backreference to 1st parenthesized subexpression 1.198 - * \2 Backreference to 2nd parenthesized subexpression 1.199 - * \3 Backreference to 3rd parenthesized subexpression 1.200 - * \4 Backreference to 4th parenthesized subexpression 1.201 - * \5 Backreference to 5th parenthesized subexpression 1.202 - * \6 Backreference to 6th parenthesized subexpression 1.203 - * \7 Backreference to 7th parenthesized subexpression 1.204 - * \8 Backreference to 8th parenthesized subexpression 1.205 - * \9 Backreference to 9th parenthesized subexpression 1.206 - * </pre> 1.207 - * 1.208 - * <p> 1.209 - * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning 1.210 - * that they match as many elements of the string as possible without 1.211 - * causing the overall match to fail. If you want a closure to be 1.212 - * reluctant (non-greedy), you can simply follow it with a '?'. A 1.213 - * reluctant closure will match as few elements of the string as 1.214 - * possible when finding matches. {m,n} closures don't currently 1.215 - * support reluctancy. 1.216 - * 1.217 - * <p> 1.218 - * <b><font face="times roman">Line terminators</font></b> 1.219 - * <br> 1.220 - * A line terminator is a one- or two-character sequence that marks 1.221 - * the end of a line of the input character sequence. The following 1.222 - * are recognized as line terminators: 1.223 - * <ul> 1.224 - * <li>A newline (line feed) character ('\n'),</li> 1.225 - * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li> 1.226 - * <li>A standalone carriage-return character ('\r'),</li> 1.227 - * <li>A next-line character ('\u0085'),</li> 1.228 - * <li>A line-separator character ('\u2028'), or</li> 1.229 - * <li>A paragraph-separator character ('\u2029).</li> 1.230 - * </ul> 1.231 - * 1.232 - * <p> 1.233 - * RE runs programs compiled by the RECompiler class. But the RE 1.234 - * matcher class does not include the actual regular expression compiler 1.235 - * for reasons of efficiency. In fact, if you want to pre-compile one 1.236 - * or more regular expressions, the 'recompile' class can be invoked 1.237 - * from the command line to produce compiled output like this: 1.238 - * 1.239 - * <pre> 1.240 - * // Pre-compiled regular expression "a*b" 1.241 - * char[] re1Instructions = 1.242 - * { 1.243 - * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041, 1.244 - * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047, 1.245 - * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000, 1.246 - * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000, 1.247 - * 0x0000, 1.248 - * }; 1.249 - * 1.250 - * 1.251 - * REProgram re1 = new REProgram(re1Instructions); 1.252 - * </pre> 1.253 - * 1.254 - * You can then construct a regular expression matcher (RE) object from 1.255 - * the pre-compiled expression re1 and thus avoid the overhead of 1.256 - * compiling the expression at runtime. If you require more dynamic 1.257 - * regular expressions, you can construct a single RECompiler object and 1.258 - * re-use it to compile each expression. Similarly, you can change the 1.259 - * program run by a given matcher object at any time. However, RE and 1.260 - * RECompiler are not threadsafe (for efficiency reasons, and because 1.261 - * requiring thread safety in this class is deemed to be a rare 1.262 - * requirement), so you will need to construct a separate compiler or 1.263 - * matcher object for each thread (unless you do thread synchronization 1.264 - * yourself). Once expression compiled into the REProgram object, REProgram 1.265 - * can be safely shared across multiple threads and RE objects. 1.266 - * 1.267 - * <br><p><br> 1.268 - * 1.269 - * <font color="red"> 1.270 - * <i>ISSUES:</i> 1.271 - * 1.272 - * <ul> 1.273 - * <li>com.weusours.util.re is not currently compatible with all 1.274 - * standard POSIX regcomp flags</li> 1.275 - * <li>com.weusours.util.re does not support POSIX equivalence classes 1.276 - * ([=foo=] syntax) (I18N/locale issue)</li> 1.277 - * <li>com.weusours.util.re does not support nested POSIX character 1.278 - * classes (definitely should, but not completely trivial)</li> 1.279 - * <li>com.weusours.util.re Does not support POSIX character collation 1.280 - * concepts ([.foo.] syntax) (I18N/locale issue)</li> 1.281 - * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li> 1.282 - * <li>Should RE support character iterators (for backwards RE matching!)?</li> 1.283 - * <li>Should RE support reluctant {m,n} closures (does anyone care)?</li> 1.284 - * <li>Not *all* possibilities are considered for greediness when backreferences 1.285 - * are involved (as POSIX suggests should be the case). The POSIX RE 1.286 - * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match 1.287 - * of acdacaa where \1 is "a". This is not the case in this RE package, 1.288 - * and actually Perl doesn't go to this extent either! Until someone 1.289 - * actually complains about this, I'm not sure it's worth "fixing". 1.290 - * If it ever is fixed, test #137 in RETest.txt should be updated.</li> 1.291 - * </ul> 1.292 - * 1.293 - * </font> 1.294 - * 1.295 - * @see recompile 1.296 - * @see RECompiler 1.297 - * 1.298 - * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a> 1.299 - * @author <a href="mailto:ts@sch-fer.de">Tobias Schäfer</a> 1.300 - */ 1.301 -public class RE implements Serializable 1.302 -{ 1.303 - /** 1.304 - * Specifies normal, case-sensitive matching behaviour. 1.305 - */ 1.306 - public static final int MATCH_NORMAL = 0x0000; 1.307 - 1.308 - /** 1.309 - * Flag to indicate that matching should be case-independent (folded) 1.310 - */ 1.311 - public static final int MATCH_CASEINDEPENDENT = 0x0001; 1.312 - 1.313 - /** 1.314 - * Newlines should match as BOL/EOL (^ and $) 1.315 - */ 1.316 - public static final int MATCH_MULTILINE = 0x0002; 1.317 - 1.318 - /** 1.319 - * Consider all input a single body of text - newlines are matched by . 1.320 - */ 1.321 - public static final int MATCH_SINGLELINE = 0x0004; 1.322 - 1.323 - /************************************************ 1.324 - * * 1.325 - * The format of a node in a program is: * 1.326 - * * 1.327 - * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] * 1.328 - * * 1.329 - * char OPCODE - instruction * 1.330 - * char OPDATA - modifying data * 1.331 - * char OPNEXT - next node (relative offset) * 1.332 - * * 1.333 - ************************************************/ 1.334 - 1.335 - // Opcode Char Opdata/Operand Meaning 1.336 - // ---------- ---------- --------------- -------------------------------------------------- 1.337 - static final char OP_END = 'E'; // end of program 1.338 - static final char OP_BOL = '^'; // match only if at beginning of line 1.339 - static final char OP_EOL = '$'; // match only if at end of line 1.340 - static final char OP_ANY = '.'; // match any single character except newline 1.341 - static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges 1.342 - static final char OP_BRANCH = '|'; // node match this alternative or the next one 1.343 - static final char OP_ATOM = 'A'; // length/string length of string followed by string itself 1.344 - static final char OP_STAR = '*'; // node kleene closure 1.345 - static final char OP_PLUS = '+'; // node positive closure 1.346 - static final char OP_MAYBE = '?'; // node optional closure 1.347 - static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code) 1.348 - static final char OP_OPEN = '('; // number nth opening paren 1.349 - static final char OP_OPEN_CLUSTER = '<'; // opening cluster 1.350 - static final char OP_CLOSE = ')'; // number nth closing paren 1.351 - static final char OP_CLOSE_CLUSTER = '>'; // closing cluster 1.352 - static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string 1.353 - static final char OP_GOTO = 'G'; // nothing but a (back-)pointer 1.354 - static final char OP_NOTHING = 'N'; // match null string such as in '(a|)' 1.355 - static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*') 1.356 - static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+') 1.357 - static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?') 1.358 - static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes 1.359 - 1.360 - // Escape codes 1.361 - static final char E_ALNUM = 'w'; // Alphanumeric 1.362 - static final char E_NALNUM = 'W'; // Non-alphanumeric 1.363 - static final char E_BOUND = 'b'; // Word boundary 1.364 - static final char E_NBOUND = 'B'; // Non-word boundary 1.365 - static final char E_SPACE = 's'; // Whitespace 1.366 - static final char E_NSPACE = 'S'; // Non-whitespace 1.367 - static final char E_DIGIT = 'd'; // Digit 1.368 - static final char E_NDIGIT = 'D'; // Non-digit 1.369 - 1.370 - // Posix character classes 1.371 - static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics 1.372 - static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics 1.373 - static final char POSIX_CLASS_BLANK = 'b'; // Blanks 1.374 - static final char POSIX_CLASS_CNTRL = 'c'; // Control characters 1.375 - static final char POSIX_CLASS_DIGIT = 'd'; // Digits 1.376 - static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters 1.377 - static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters 1.378 - static final char POSIX_CLASS_PRINT = 'p'; // Printable characters 1.379 - static final char POSIX_CLASS_PUNCT = '!'; // Punctuation 1.380 - static final char POSIX_CLASS_SPACE = 's'; // Spaces 1.381 - static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters 1.382 - static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits 1.383 - static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start 1.384 - static final char POSIX_CLASS_JPART = 'k'; // Java identifier part 1.385 - 1.386 - // Limits 1.387 - static final int maxNode = 65536; // Maximum number of nodes in a program 1.388 - static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs) 1.389 - 1.390 - // Node layout constants 1.391 - static final int offsetOpcode = 0; // Opcode offset (first character) 1.392 - static final int offsetOpdata = 1; // Opdata offset (second char) 1.393 - static final int offsetNext = 2; // Next index offset (third char) 1.394 - static final int nodeSize = 3; // Node size (in chars) 1.395 - 1.396 - // State of current program 1.397 - REProgram program; // Compiled regular expression 'program' 1.398 - transient CharacterIterator search; // The string being matched against 1.399 - int matchFlags; // Match behaviour flags 1.400 - int maxParen = MAX_PAREN; 1.401 - 1.402 - // Parenthesized subexpressions 1.403 - transient int parenCount; // Number of subexpressions matched (num open parens + 1) 1.404 - transient int start0; // Cache of start[0] 1.405 - transient int end0; // Cache of start[0] 1.406 - transient int start1; // Cache of start[1] 1.407 - transient int end1; // Cache of start[1] 1.408 - transient int start2; // Cache of start[2] 1.409 - transient int end2; // Cache of start[2] 1.410 - transient int[] startn; // Lazy-alloced array of sub-expression starts 1.411 - transient int[] endn; // Lazy-alloced array of sub-expression ends 1.412 - 1.413 - // Backreferences 1.414 - transient int[] startBackref; // Lazy-alloced array of backref starts 1.415 - transient int[] endBackref; // Lazy-alloced array of backref ends 1.416 - 1.417 - /** 1.418 - * Constructs a regular expression matcher from a String by compiling it 1.419 - * using a new instance of RECompiler. If you will be compiling many 1.420 - * expressions, you may prefer to use a single RECompiler object instead. 1.421 - * 1.422 - * @param pattern The regular expression pattern to compile. 1.423 - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 1.424 - * @see RECompiler 1.425 - * @see recompile 1.426 - */ 1.427 - public RE(String pattern) throws RESyntaxException 1.428 - { 1.429 - this(pattern, MATCH_NORMAL); 1.430 - } 1.431 - 1.432 - /** 1.433 - * Constructs a regular expression matcher from a String by compiling it 1.434 - * using a new instance of RECompiler. If you will be compiling many 1.435 - * expressions, you may prefer to use a single RECompiler object instead. 1.436 - * 1.437 - * @param pattern The regular expression pattern to compile. 1.438 - * @param matchFlags The matching style 1.439 - * @exception RESyntaxException Thrown if the regular expression has invalid syntax. 1.440 - * @see RECompiler 1.441 - * @see recompile 1.442 - */ 1.443 - public RE(String pattern, int matchFlags) throws RESyntaxException 1.444 - { 1.445 - this(new RECompiler().compile(pattern)); 1.446 - setMatchFlags(matchFlags); 1.447 - } 1.448 - 1.449 - /** 1.450 - * Construct a matcher for a pre-compiled regular expression from program 1.451 - * (bytecode) data. Permits special flags to be passed in to modify matching 1.452 - * behaviour. 1.453 - * 1.454 - * @param program Compiled regular expression program (see RECompiler and/or recompile) 1.455 - * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*): 1.456 - * 1.457 - * <pre> 1.458 - * MATCH_NORMAL // Normal (case-sensitive) matching 1.459 - * MATCH_CASEINDEPENDENT // Case folded comparisons 1.460 - * MATCH_MULTILINE // Newline matches as BOL/EOL 1.461 - * </pre> 1.462 - * 1.463 - * @see RECompiler 1.464 - * @see REProgram 1.465 - * @see recompile 1.466 - */ 1.467 - public RE(REProgram program, int matchFlags) 1.468 - { 1.469 - setProgram(program); 1.470 - setMatchFlags(matchFlags); 1.471 - } 1.472 - 1.473 - /** 1.474 - * Construct a matcher for a pre-compiled regular expression from program 1.475 - * (bytecode) data. 1.476 - * 1.477 - * @param program Compiled regular expression program 1.478 - * @see RECompiler 1.479 - * @see recompile 1.480 - */ 1.481 - public RE(REProgram program) 1.482 - { 1.483 - this(program, MATCH_NORMAL); 1.484 - } 1.485 - 1.486 - /** 1.487 - * Constructs a regular expression matcher with no initial program. 1.488 - * This is likely to be an uncommon practice, but is still supported. 1.489 - */ 1.490 - public RE() 1.491 - { 1.492 - this((REProgram)null, MATCH_NORMAL); 1.493 - } 1.494 - 1.495 - /** 1.496 - * Converts a 'simplified' regular expression to a full regular expression 1.497 - * 1.498 - * @param pattern The pattern to convert 1.499 - * @return The full regular expression 1.500 - */ 1.501 - public static String simplePatternToFullRegularExpression(String pattern) 1.502 - { 1.503 - StringBuffer buf = new StringBuffer(); 1.504 - for (int i = 0; i < pattern.length(); i++) 1.505 - { 1.506 - char c = pattern.charAt(i); 1.507 - switch (c) 1.508 - { 1.509 - case '*': 1.510 - buf.append(".*"); 1.511 - break; 1.512 - 1.513 - case '.': 1.514 - case '[': 1.515 - case ']': 1.516 - case '\\': 1.517 - case '+': 1.518 - case '?': 1.519 - case '{': 1.520 - case '}': 1.521 - case '$': 1.522 - case '^': 1.523 - case '|': 1.524 - case '(': 1.525 - case ')': 1.526 - buf.append('\\'); 1.527 - default: 1.528 - buf.append(c); 1.529 - break; 1.530 - } 1.531 - } 1.532 - return buf.toString(); 1.533 - } 1.534 - 1.535 - /** 1.536 - * Sets match behaviour flags which alter the way RE does matching. 1.537 - * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*): 1.538 - * 1.539 - * <pre> 1.540 - * MATCH_NORMAL // Normal (case-sensitive) matching 1.541 - * MATCH_CASEINDEPENDENT // Case folded comparisons 1.542 - * MATCH_MULTILINE // Newline matches as BOL/EOL 1.543 - * </pre> 1.544 - */ 1.545 - public void setMatchFlags(int matchFlags) 1.546 - { 1.547 - this.matchFlags = matchFlags; 1.548 - } 1.549 - 1.550 - /** 1.551 - * Returns the current match behaviour flags. 1.552 - * @return Current match behaviour flags (RE.MATCH_*). 1.553 - * 1.554 - * <pre> 1.555 - * MATCH_NORMAL // Normal (case-sensitive) matching 1.556 - * MATCH_CASEINDEPENDENT // Case folded comparisons 1.557 - * MATCH_MULTILINE // Newline matches as BOL/EOL 1.558 - * </pre> 1.559 - * 1.560 - * @see #setMatchFlags 1.561 - */ 1.562 - public int getMatchFlags() 1.563 - { 1.564 - return matchFlags; 1.565 - } 1.566 - 1.567 - /** 1.568 - * Sets the current regular expression program used by this matcher object. 1.569 - * 1.570 - * @param program Regular expression program compiled by RECompiler. 1.571 - * @see RECompiler 1.572 - * @see REProgram 1.573 - * @see recompile 1.574 - */ 1.575 - public void setProgram(REProgram program) 1.576 - { 1.577 - this.program = program; 1.578 - if (program != null && program.maxParens != -1) { 1.579 - this.maxParen = program.maxParens; 1.580 - } else { 1.581 - this.maxParen = MAX_PAREN; 1.582 - } 1.583 - } 1.584 - 1.585 - /** 1.586 - * Returns the current regular expression program in use by this matcher object. 1.587 - * 1.588 - * @return Regular expression program 1.589 - * @see #setProgram 1.590 - */ 1.591 - public REProgram getProgram() 1.592 - { 1.593 - return program; 1.594 - } 1.595 - 1.596 - /** 1.597 - * Returns the number of parenthesized subexpressions available after a successful match. 1.598 - * 1.599 - * @return Number of available parenthesized subexpressions 1.600 - */ 1.601 - public int getParenCount() 1.602 - { 1.603 - return parenCount; 1.604 - } 1.605 - 1.606 - /** 1.607 - * Gets the contents of a parenthesized subexpression after a successful match. 1.608 - * 1.609 - * @param which Nesting level of subexpression 1.610 - * @return String 1.611 - */ 1.612 - public String getParen(int which) 1.613 - { 1.614 - int start; 1.615 - if (which < parenCount && (start = getParenStart(which)) >= 0) 1.616 - { 1.617 - return search.substring(start, getParenEnd(which)); 1.618 - } 1.619 - return null; 1.620 - } 1.621 - 1.622 - /** 1.623 - * Returns the start index of a given paren level. 1.624 - * 1.625 - * @param which Nesting level of subexpression 1.626 - * @return String index 1.627 - */ 1.628 - public final int getParenStart(int which) 1.629 - { 1.630 - if (which < parenCount) 1.631 - { 1.632 - switch (which) 1.633 - { 1.634 - case 0: 1.635 - return start0; 1.636 - 1.637 - case 1: 1.638 - return start1; 1.639 - 1.640 - case 2: 1.641 - return start2; 1.642 - 1.643 - default: 1.644 - if (startn == null) 1.645 - { 1.646 - allocParens(); 1.647 - } 1.648 - return startn[which]; 1.649 - } 1.650 - } 1.651 - return -1; 1.652 - } 1.653 - 1.654 - /** 1.655 - * Returns the end index of a given paren level. 1.656 - * 1.657 - * @param which Nesting level of subexpression 1.658 - * @return String index 1.659 - */ 1.660 - public final int getParenEnd(int which) 1.661 - { 1.662 - if (which < parenCount) 1.663 - { 1.664 - switch (which) 1.665 - { 1.666 - case 0: 1.667 - return end0; 1.668 - 1.669 - case 1: 1.670 - return end1; 1.671 - 1.672 - case 2: 1.673 - return end2; 1.674 - 1.675 - default: 1.676 - if (endn == null) 1.677 - { 1.678 - allocParens(); 1.679 - } 1.680 - return endn[which]; 1.681 - } 1.682 - } 1.683 - return -1; 1.684 - } 1.685 - 1.686 - /** 1.687 - * Returns the length of a given paren level. 1.688 - * 1.689 - * @param which Nesting level of subexpression 1.690 - * @return Number of characters in the parenthesized subexpression 1.691 - */ 1.692 - public final int getParenLength(int which) 1.693 - { 1.694 - if (which < parenCount) 1.695 - { 1.696 - return getParenEnd(which) - getParenStart(which); 1.697 - } 1.698 - return -1; 1.699 - } 1.700 - 1.701 - /** 1.702 - * Sets the start of a paren level 1.703 - * 1.704 - * @param which Which paren level 1.705 - * @param i Index in input array 1.706 - */ 1.707 - protected final void setParenStart(int which, int i) 1.708 - { 1.709 - if (which < parenCount) 1.710 - { 1.711 - switch (which) 1.712 - { 1.713 - case 0: 1.714 - start0 = i; 1.715 - break; 1.716 - 1.717 - case 1: 1.718 - start1 = i; 1.719 - break; 1.720 - 1.721 - case 2: 1.722 - start2 = i; 1.723 - break; 1.724 - 1.725 - default: 1.726 - if (startn == null) 1.727 - { 1.728 - allocParens(); 1.729 - } 1.730 - startn[which] = i; 1.731 - break; 1.732 - } 1.733 - } 1.734 - } 1.735 - 1.736 - /** 1.737 - * Sets the end of a paren level 1.738 - * 1.739 - * @param which Which paren level 1.740 - * @param i Index in input array 1.741 - */ 1.742 - protected final void setParenEnd(int which, int i) 1.743 - { 1.744 - if (which < parenCount) 1.745 - { 1.746 - switch (which) 1.747 - { 1.748 - case 0: 1.749 - end0 = i; 1.750 - break; 1.751 - 1.752 - case 1: 1.753 - end1 = i; 1.754 - break; 1.755 - 1.756 - case 2: 1.757 - end2 = i; 1.758 - break; 1.759 - 1.760 - default: 1.761 - if (endn == null) 1.762 - { 1.763 - allocParens(); 1.764 - } 1.765 - endn[which] = i; 1.766 - break; 1.767 - } 1.768 - } 1.769 - } 1.770 - 1.771 - /** 1.772 - * Throws an Error representing an internal error condition probably resulting 1.773 - * from a bug in the regular expression compiler (or possibly data corruption). 1.774 - * In practice, this should be very rare. 1.775 - * 1.776 - * @param s Error description 1.777 - */ 1.778 - protected void internalError(String s) throws Error 1.779 - { 1.780 - throw new Error("RE internal error: " + s); 1.781 - } 1.782 - 1.783 - /** 1.784 - * Performs lazy allocation of subexpression arrays 1.785 - */ 1.786 - private final void allocParens() 1.787 - { 1.788 - // Allocate arrays for subexpressions 1.789 - startn = new int[maxParen]; 1.790 - endn = new int[maxParen]; 1.791 - 1.792 - // Set sub-expression pointers to invalid values 1.793 - for (int i = 0; i < maxParen; i++) 1.794 - { 1.795 - startn[i] = -1; 1.796 - endn[i] = -1; 1.797 - } 1.798 - } 1.799 - 1.800 - /** 1.801 - * Try to match a string against a subset of nodes in the program 1.802 - * 1.803 - * @param firstNode Node to start at in program 1.804 - * @param lastNode Last valid node (used for matching a subexpression without 1.805 - * matching the rest of the program as well). 1.806 - * @param idxStart Starting position in character array 1.807 - * @return Final input array index if match succeeded. -1 if not. 1.808 - */ 1.809 - protected int matchNodes(int firstNode, int lastNode, int idxStart) 1.810 - { 1.811 - // Our current place in the string 1.812 - int idx = idxStart; 1.813 - 1.814 - // Loop while node is valid 1.815 - int next, opcode, opdata; 1.816 - int idxNew; 1.817 - char[] instruction = program.instruction; 1.818 - for (int node = firstNode; node < lastNode; ) 1.819 - { 1.820 - opcode = instruction[node + offsetOpcode]; 1.821 - next = node + (short)instruction[node + offsetNext]; 1.822 - opdata = instruction[node + offsetOpdata]; 1.823 - 1.824 - switch (opcode) 1.825 - { 1.826 - case OP_RELUCTANTMAYBE: 1.827 - { 1.828 - int once = 0; 1.829 - do 1.830 - { 1.831 - // Try to match the rest without using the reluctant subexpr 1.832 - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) 1.833 - { 1.834 - return idxNew; 1.835 - } 1.836 - } 1.837 - while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1); 1.838 - return -1; 1.839 - } 1.840 - 1.841 - case OP_RELUCTANTPLUS: 1.842 - while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) 1.843 - { 1.844 - // Try to match the rest without using the reluctant subexpr 1.845 - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) 1.846 - { 1.847 - return idxNew; 1.848 - } 1.849 - } 1.850 - return -1; 1.851 - 1.852 - case OP_RELUCTANTSTAR: 1.853 - do 1.854 - { 1.855 - // Try to match the rest without using the reluctant subexpr 1.856 - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) 1.857 - { 1.858 - return idxNew; 1.859 - } 1.860 - } 1.861 - while ((idx = matchNodes(node + nodeSize, next, idx)) != -1); 1.862 - return -1; 1.863 - 1.864 - case OP_OPEN: 1.865 - 1.866 - // Match subexpression 1.867 - if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) 1.868 - { 1.869 - startBackref[opdata] = idx; 1.870 - } 1.871 - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) 1.872 - { 1.873 - // Increase valid paren count 1.874 - if ((opdata + 1) > parenCount) 1.875 - { 1.876 - parenCount = opdata + 1; 1.877 - } 1.878 - 1.879 - // Don't set paren if already set later on 1.880 - if (getParenStart(opdata) == -1) 1.881 - { 1.882 - setParenStart(opdata, idx); 1.883 - } 1.884 - } 1.885 - return idxNew; 1.886 - 1.887 - case OP_CLOSE: 1.888 - 1.889 - // Done matching subexpression 1.890 - if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) 1.891 - { 1.892 - endBackref[opdata] = idx; 1.893 - } 1.894 - if ((idxNew = matchNodes(next, maxNode, idx)) != -1) 1.895 - { 1.896 - // Increase valid paren count 1.897 - if ((opdata + 1) > parenCount) 1.898 - { 1.899 - parenCount = opdata + 1; 1.900 - } 1.901 - 1.902 - // Don't set paren if already set later on 1.903 - if (getParenEnd(opdata) == -1) 1.904 - { 1.905 - setParenEnd(opdata, idx); 1.906 - } 1.907 - } 1.908 - return idxNew; 1.909 - 1.910 - case OP_OPEN_CLUSTER: 1.911 - case OP_CLOSE_CLUSTER: 1.912 - // starting or ending the matching of a subexpression which has no backref. 1.913 - return matchNodes( next, maxNode, idx ); 1.914 - 1.915 - case OP_BACKREF: 1.916 - { 1.917 - // Get the start and end of the backref 1.918 - int s = startBackref[opdata]; 1.919 - int e = endBackref[opdata]; 1.920 - 1.921 - // We don't know the backref yet 1.922 - if (s == -1 || e == -1) 1.923 - { 1.924 - return -1; 1.925 - } 1.926 - 1.927 - // The backref is empty size 1.928 - if (s == e) 1.929 - { 1.930 - break; 1.931 - } 1.932 - 1.933 - // Get the length of the backref 1.934 - int l = e - s; 1.935 - 1.936 - // If there's not enough input left, give up. 1.937 - if (search.isEnd(idx + l - 1)) 1.938 - { 1.939 - return -1; 1.940 - } 1.941 - 1.942 - // Case fold the backref? 1.943 - final boolean caseFold = 1.944 - ((matchFlags & MATCH_CASEINDEPENDENT) != 0); 1.945 - // Compare backref to input 1.946 - for (int i = 0; i < l; i++) 1.947 - { 1.948 - if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0) 1.949 - { 1.950 - return -1; 1.951 - } 1.952 - } 1.953 - } 1.954 - break; 1.955 - 1.956 - case OP_BOL: 1.957 - 1.958 - // Fail if we're not at the start of the string 1.959 - if (idx != 0) 1.960 - { 1.961 - // If we're multiline matching, we could still be at the start of a line 1.962 - if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) 1.963 - { 1.964 - // If not at start of line, give up 1.965 - if (idx <= 0 || !isNewline(idx - 1)) { 1.966 - return -1; 1.967 - } else { 1.968 - break; 1.969 - } 1.970 - } 1.971 - return -1; 1.972 - } 1.973 - break; 1.974 - 1.975 - case OP_EOL: 1.976 - 1.977 - // If we're not at the end of string 1.978 - if (!search.isEnd(0) && !search.isEnd(idx)) 1.979 - { 1.980 - // If we're multi-line matching 1.981 - if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) 1.982 - { 1.983 - // Give up if we're not at the end of a line 1.984 - if (!isNewline(idx)) { 1.985 - return -1; 1.986 - } else { 1.987 - break; 1.988 - } 1.989 - } 1.990 - return -1; 1.991 - } 1.992 - break; 1.993 - 1.994 - case OP_ESCAPE: 1.995 - 1.996 - // Which escape? 1.997 - switch (opdata) 1.998 - { 1.999 - // Word boundary match 1.1000 - case E_NBOUND: 1.1001 - case E_BOUND: 1.1002 - { 1.1003 - char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1)); 1.1004 - char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx)); 1.1005 - if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND)) 1.1006 - { 1.1007 - return -1; 1.1008 - } 1.1009 - } 1.1010 - break; 1.1011 - 1.1012 - // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit 1.1013 - case E_ALNUM: 1.1014 - case E_NALNUM: 1.1015 - case E_DIGIT: 1.1016 - case E_NDIGIT: 1.1017 - case E_SPACE: 1.1018 - case E_NSPACE: 1.1019 - 1.1020 - // Give up if out of input 1.1021 - if (search.isEnd(idx)) 1.1022 - { 1.1023 - return -1; 1.1024 - } 1.1025 - 1.1026 - char c = search.charAt(idx); 1.1027 - 1.1028 - // Switch on escape 1.1029 - switch (opdata) 1.1030 - { 1.1031 - case E_ALNUM: 1.1032 - case E_NALNUM: 1.1033 - if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM))) 1.1034 - { 1.1035 - return -1; 1.1036 - } 1.1037 - break; 1.1038 - 1.1039 - case E_DIGIT: 1.1040 - case E_NDIGIT: 1.1041 - if (!(Character.isDigit(c) == (opdata == E_DIGIT))) 1.1042 - { 1.1043 - return -1; 1.1044 - } 1.1045 - break; 1.1046 - 1.1047 - case E_SPACE: 1.1048 - case E_NSPACE: 1.1049 - if (!(Character.isWhitespace(c) == (opdata == E_SPACE))) 1.1050 - { 1.1051 - return -1; 1.1052 - } 1.1053 - break; 1.1054 - } 1.1055 - idx++; 1.1056 - break; 1.1057 - 1.1058 - default: 1.1059 - internalError("Unrecognized escape '" + opdata + "'"); 1.1060 - } 1.1061 - break; 1.1062 - 1.1063 - case OP_ANY: 1.1064 - 1.1065 - if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) { 1.1066 - // Match anything 1.1067 - if (search.isEnd(idx)) 1.1068 - { 1.1069 - return -1; 1.1070 - } 1.1071 - } 1.1072 - else 1.1073 - { 1.1074 - // Match anything but a newline 1.1075 - if (search.isEnd(idx) || isNewline(idx)) 1.1076 - { 1.1077 - return -1; 1.1078 - } 1.1079 - } 1.1080 - idx++; 1.1081 - break; 1.1082 - 1.1083 - case OP_ATOM: 1.1084 - { 1.1085 - // Match an atom value 1.1086 - if (search.isEnd(idx)) 1.1087 - { 1.1088 - return -1; 1.1089 - } 1.1090 - 1.1091 - // Get length of atom and starting index 1.1092 - int lenAtom = opdata; 1.1093 - int startAtom = node + nodeSize; 1.1094 - 1.1095 - // Give up if not enough input remains to have a match 1.1096 - if (search.isEnd(lenAtom + idx - 1)) 1.1097 - { 1.1098 - return -1; 1.1099 - } 1.1100 - 1.1101 - // Match atom differently depending on casefolding flag 1.1102 - final boolean caseFold = 1.1103 - ((matchFlags & MATCH_CASEINDEPENDENT) != 0); 1.1104 - 1.1105 - for (int i = 0; i < lenAtom; i++) 1.1106 - { 1.1107 - if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0) 1.1108 - { 1.1109 - return -1; 1.1110 - } 1.1111 - } 1.1112 - } 1.1113 - break; 1.1114 - 1.1115 - case OP_POSIXCLASS: 1.1116 - { 1.1117 - // Out of input? 1.1118 - if (search.isEnd(idx)) 1.1119 - { 1.1120 - return -1; 1.1121 - } 1.1122 - 1.1123 - switch (opdata) 1.1124 - { 1.1125 - case POSIX_CLASS_ALNUM: 1.1126 - if (!Character.isLetterOrDigit(search.charAt(idx))) 1.1127 - { 1.1128 - return -1; 1.1129 - } 1.1130 - break; 1.1131 - 1.1132 - case POSIX_CLASS_ALPHA: 1.1133 - if (!Character.isLetter(search.charAt(idx))) 1.1134 - { 1.1135 - return -1; 1.1136 - } 1.1137 - break; 1.1138 - 1.1139 - case POSIX_CLASS_DIGIT: 1.1140 - if (!Character.isDigit(search.charAt(idx))) 1.1141 - { 1.1142 - return -1; 1.1143 - } 1.1144 - break; 1.1145 - 1.1146 - case POSIX_CLASS_BLANK: // JWL - bugbug: is this right?? 1.1147 - if (!Character.isSpaceChar(search.charAt(idx))) 1.1148 - { 1.1149 - return -1; 1.1150 - } 1.1151 - break; 1.1152 - 1.1153 - case POSIX_CLASS_SPACE: 1.1154 - if (!Character.isWhitespace(search.charAt(idx))) 1.1155 - { 1.1156 - return -1; 1.1157 - } 1.1158 - break; 1.1159 - 1.1160 - case POSIX_CLASS_CNTRL: 1.1161 - if (Character.getType(search.charAt(idx)) != Character.CONTROL) 1.1162 - { 1.1163 - return -1; 1.1164 - } 1.1165 - break; 1.1166 - 1.1167 - case POSIX_CLASS_GRAPH: // JWL - bugbug??? 1.1168 - switch (Character.getType(search.charAt(idx))) 1.1169 - { 1.1170 - case Character.MATH_SYMBOL: 1.1171 - case Character.CURRENCY_SYMBOL: 1.1172 - case Character.MODIFIER_SYMBOL: 1.1173 - case Character.OTHER_SYMBOL: 1.1174 - break; 1.1175 - 1.1176 - default: 1.1177 - return -1; 1.1178 - } 1.1179 - break; 1.1180 - 1.1181 - case POSIX_CLASS_LOWER: 1.1182 - if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) 1.1183 - { 1.1184 - return -1; 1.1185 - } 1.1186 - break; 1.1187 - 1.1188 - case POSIX_CLASS_UPPER: 1.1189 - if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) 1.1190 - { 1.1191 - return -1; 1.1192 - } 1.1193 - break; 1.1194 - 1.1195 - case POSIX_CLASS_PRINT: 1.1196 - if (Character.getType(search.charAt(idx)) == Character.CONTROL) 1.1197 - { 1.1198 - return -1; 1.1199 - } 1.1200 - break; 1.1201 - 1.1202 - case POSIX_CLASS_PUNCT: 1.1203 - { 1.1204 - int type = Character.getType(search.charAt(idx)); 1.1205 - switch(type) 1.1206 - { 1.1207 - case Character.DASH_PUNCTUATION: 1.1208 - case Character.START_PUNCTUATION: 1.1209 - case Character.END_PUNCTUATION: 1.1210 - case Character.CONNECTOR_PUNCTUATION: 1.1211 - case Character.OTHER_PUNCTUATION: 1.1212 - break; 1.1213 - 1.1214 - default: 1.1215 - return -1; 1.1216 - } 1.1217 - } 1.1218 - break; 1.1219 - 1.1220 - case POSIX_CLASS_XDIGIT: // JWL - bugbug?? 1.1221 - { 1.1222 - boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') || 1.1223 - (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') || 1.1224 - (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F')); 1.1225 - if (!isXDigit) 1.1226 - { 1.1227 - return -1; 1.1228 - } 1.1229 - } 1.1230 - break; 1.1231 - 1.1232 - case POSIX_CLASS_JSTART: 1.1233 - if (!Character.isJavaIdentifierStart(search.charAt(idx))) 1.1234 - { 1.1235 - return -1; 1.1236 - } 1.1237 - break; 1.1238 - 1.1239 - case POSIX_CLASS_JPART: 1.1240 - if (!Character.isJavaIdentifierPart(search.charAt(idx))) 1.1241 - { 1.1242 - return -1; 1.1243 - } 1.1244 - break; 1.1245 - 1.1246 - default: 1.1247 - internalError("Bad posix class"); 1.1248 - break; 1.1249 - } 1.1250 - 1.1251 - // Matched. 1.1252 - idx++; 1.1253 - } 1.1254 - break; 1.1255 - 1.1256 - case OP_ANYOF: 1.1257 - { 1.1258 - // Out of input? 1.1259 - if (search.isEnd(idx)) 1.1260 - { 1.1261 - return -1; 1.1262 - } 1.1263 - 1.1264 - // Get character to match against character class and maybe casefold 1.1265 - char c = search.charAt(idx); 1.1266 - boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0; 1.1267 - // Loop through character class checking our match character 1.1268 - int idxRange = node + nodeSize; 1.1269 - int idxEnd = idxRange + (opdata * 2); 1.1270 - boolean match = false; 1.1271 - for (int i = idxRange; !match && i < idxEnd; ) 1.1272 - { 1.1273 - // Get start, end and match characters 1.1274 - char s = instruction[i++]; 1.1275 - char e = instruction[i++]; 1.1276 - 1.1277 - match = ((compareChars(c, s, caseFold) >= 0) 1.1278 - && (compareChars(c, e, caseFold) <= 0)); 1.1279 - } 1.1280 - 1.1281 - // Fail if we didn't match the character class 1.1282 - if (!match) 1.1283 - { 1.1284 - return -1; 1.1285 - } 1.1286 - idx++; 1.1287 - } 1.1288 - break; 1.1289 - 1.1290 - case OP_BRANCH: 1.1291 - { 1.1292 - // Check for choices 1.1293 - if (instruction[next + offsetOpcode] != OP_BRANCH) 1.1294 - { 1.1295 - // If there aren't any other choices, just evaluate this branch. 1.1296 - node += nodeSize; 1.1297 - continue; 1.1298 - } 1.1299 - 1.1300 - // Try all available branches 1.1301 - short nextBranch; 1.1302 - do 1.1303 - { 1.1304 - // Try matching the branch against the string 1.1305 - if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) 1.1306 - { 1.1307 - return idxNew; 1.1308 - } 1.1309 - 1.1310 - // Go to next branch (if any) 1.1311 - nextBranch = (short)instruction[node + offsetNext]; 1.1312 - node += nextBranch; 1.1313 - } 1.1314 - while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH)); 1.1315 - 1.1316 - // Failed to match any branch! 1.1317 - return -1; 1.1318 - } 1.1319 - 1.1320 - case OP_NOTHING: 1.1321 - case OP_GOTO: 1.1322 - 1.1323 - // Just advance to the next node without doing anything 1.1324 - break; 1.1325 - 1.1326 - case OP_END: 1.1327 - 1.1328 - // Match has succeeded! 1.1329 - setParenEnd(0, idx); 1.1330 - return idx; 1.1331 - 1.1332 - default: 1.1333 - 1.1334 - // Corrupt program 1.1335 - internalError("Invalid opcode '" + opcode + "'"); 1.1336 - } 1.1337 - 1.1338 - // Advance to the next node in the program 1.1339 - node = next; 1.1340 - } 1.1341 - 1.1342 - // We "should" never end up here 1.1343 - internalError("Corrupt program"); 1.1344 - return -1; 1.1345 - } 1.1346 - 1.1347 - /** 1.1348 - * Match the current regular expression program against the current 1.1349 - * input string, starting at index i of the input string. This method 1.1350 - * is only meant for internal use. 1.1351 - * 1.1352 - * @param i The input string index to start matching at 1.1353 - * @return True if the input matched the expression 1.1354 - */ 1.1355 - protected boolean matchAt(int i) 1.1356 - { 1.1357 - // Initialize start pointer, paren cache and paren count 1.1358 - start0 = -1; 1.1359 - end0 = -1; 1.1360 - start1 = -1; 1.1361 - end1 = -1; 1.1362 - start2 = -1; 1.1363 - end2 = -1; 1.1364 - startn = null; 1.1365 - endn = null; 1.1366 - parenCount = 1; 1.1367 - setParenStart(0, i); 1.1368 - 1.1369 - // Allocate backref arrays (unless optimizations indicate otherwise) 1.1370 - if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) 1.1371 - { 1.1372 - startBackref = new int[maxParen]; 1.1373 - endBackref = new int[maxParen]; 1.1374 - } 1.1375 - 1.1376 - // Match against string 1.1377 - int idx; 1.1378 - if ((idx = matchNodes(0, maxNode, i)) != -1) 1.1379 - { 1.1380 - setParenEnd(0, idx); 1.1381 - return true; 1.1382 - } 1.1383 - 1.1384 - // Didn't match 1.1385 - parenCount = 0; 1.1386 - return false; 1.1387 - } 1.1388 - 1.1389 - /** 1.1390 - * Matches the current regular expression program against a character array, 1.1391 - * starting at a given index. 1.1392 - * 1.1393 - * @param search String to match against 1.1394 - * @param i Index to start searching at 1.1395 - * @return True if string matched 1.1396 - */ 1.1397 - public boolean match(String search, int i) 1.1398 - { 1.1399 - return match(new StringCharacterIterator(search), i); 1.1400 - } 1.1401 - 1.1402 - /** 1.1403 - * Matches the current regular expression program against a character array, 1.1404 - * starting at a given index. 1.1405 - * 1.1406 - * @param search String to match against 1.1407 - * @param i Index to start searching at 1.1408 - * @return True if string matched 1.1409 - */ 1.1410 - public boolean match(CharacterIterator search, int i) 1.1411 - { 1.1412 - // There is no compiled program to search with! 1.1413 - if (program == null) 1.1414 - { 1.1415 - // This should be uncommon enough to be an error case rather 1.1416 - // than an exception (which would have to be handled everywhere) 1.1417 - internalError("No RE program to run!"); 1.1418 - } 1.1419 - 1.1420 - // Save string to search 1.1421 - this.search = search; 1.1422 - 1.1423 - // Can we optimize the search by looking for a prefix string? 1.1424 - if (program.prefix == null) 1.1425 - { 1.1426 - // Unprefixed matching must try for a match at each character 1.1427 - for ( ;! search.isEnd(i - 1); i++) 1.1428 - { 1.1429 - // Try a match at index i 1.1430 - if (matchAt(i)) 1.1431 - { 1.1432 - return true; 1.1433 - } 1.1434 - } 1.1435 - return false; 1.1436 - } 1.1437 - else 1.1438 - { 1.1439 - // Prefix-anchored matching is possible 1.1440 - boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0; 1.1441 - char[] prefix = program.prefix; 1.1442 - for ( ; !search.isEnd(i + prefix.length - 1); i++) 1.1443 - { 1.1444 - int j = i; 1.1445 - int k = 0; 1.1446 - 1.1447 - boolean match; 1.1448 - do { 1.1449 - // If there's a mismatch of any character in the prefix, give up 1.1450 - match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0); 1.1451 - } while (match && k < prefix.length); 1.1452 - 1.1453 - // See if the whole prefix string matched 1.1454 - if (k == prefix.length) 1.1455 - { 1.1456 - // We matched the full prefix at firstChar, so try it 1.1457 - if (matchAt(i)) 1.1458 - { 1.1459 - return true; 1.1460 - } 1.1461 - } 1.1462 - } 1.1463 - return false; 1.1464 - } 1.1465 - } 1.1466 - 1.1467 - /** 1.1468 - * Matches the current regular expression program against a String. 1.1469 - * 1.1470 - * @param search String to match against 1.1471 - * @return True if string matched 1.1472 - */ 1.1473 - public boolean match(String search) 1.1474 - { 1.1475 - return match(search, 0); 1.1476 - } 1.1477 - 1.1478 - /** 1.1479 - * Splits a string into an array of strings on regular expression boundaries. 1.1480 - * This function works the same way as the Perl function of the same name. 1.1481 - * Given a regular expression of "[ab]+" and a string to split of 1.1482 - * "xyzzyababbayyzabbbab123", the result would be the array of Strings 1.1483 - * "[xyzzy, yyz, 123]". 1.1484 - * 1.1485 - * <p>Please note that the first string in the resulting array may be an empty 1.1486 - * string. This happens when the very first character of input string is 1.1487 - * matched by the pattern. 1.1488 - * 1.1489 - * @param s String to split on this regular exression 1.1490 - * @return Array of strings 1.1491 - */ 1.1492 - public String[] split(String s) 1.1493 - { 1.1494 - // Create new vector 1.1495 - Vector v = new Vector(); 1.1496 - 1.1497 - // Start at position 0 and search the whole string 1.1498 - int pos = 0; 1.1499 - int len = s.length(); 1.1500 - 1.1501 - // Try a match at each position 1.1502 - while (pos < len && match(s, pos)) 1.1503 - { 1.1504 - // Get start of match 1.1505 - int start = getParenStart(0); 1.1506 - 1.1507 - // Get end of match 1.1508 - int newpos = getParenEnd(0); 1.1509 - 1.1510 - // Check if no progress was made 1.1511 - if (newpos == pos) 1.1512 - { 1.1513 - v.addElement(s.substring(pos, start + 1)); 1.1514 - newpos++; 1.1515 - } 1.1516 - else 1.1517 - { 1.1518 - v.addElement(s.substring(pos, start)); 1.1519 - } 1.1520 - 1.1521 - // Move to new position 1.1522 - pos = newpos; 1.1523 - } 1.1524 - 1.1525 - // Push remainder if it's not empty 1.1526 - String remainder = s.substring(pos); 1.1527 - if (remainder.length() != 0) 1.1528 - { 1.1529 - v.addElement(remainder); 1.1530 - } 1.1531 - 1.1532 - // Return vector as an array of strings 1.1533 - String[] ret = new String[v.size()]; 1.1534 - v.copyInto(ret); 1.1535 - return ret; 1.1536 - } 1.1537 - 1.1538 - /** 1.1539 - * Flag bit that indicates that subst should replace all occurrences of this 1.1540 - * regular expression. 1.1541 - */ 1.1542 - public static final int REPLACE_ALL = 0x0000; 1.1543 - 1.1544 - /** 1.1545 - * Flag bit that indicates that subst should only replace the first occurrence 1.1546 - * of this regular expression. 1.1547 - */ 1.1548 - public static final int REPLACE_FIRSTONLY = 0x0001; 1.1549 - 1.1550 - /** 1.1551 - * Flag bit that indicates that subst should replace backreferences 1.1552 - */ 1.1553 - public static final int REPLACE_BACKREFERENCES = 0x0002; 1.1554 - 1.1555 - /** 1.1556 - * Substitutes a string for this regular expression in another string. 1.1557 - * This method works like the Perl function of the same name. 1.1558 - * Given a regular expression of "a*b", a String to substituteIn of 1.1559 - * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the 1.1560 - * resulting String returned by subst would be "-foo-garply-wacky-". 1.1561 - * 1.1562 - * @param substituteIn String to substitute within 1.1563 - * @param substitution String to substitute for all matches of this regular expression. 1.1564 - * @return The string substituteIn with zero or more occurrences of the current 1.1565 - * regular expression replaced with the substitution String (if this regular 1.1566 - * expression object doesn't match at any position, the original String is returned 1.1567 - * unchanged). 1.1568 - */ 1.1569 - public String subst(String substituteIn, String substitution) 1.1570 - { 1.1571 - return subst(substituteIn, substitution, REPLACE_ALL); 1.1572 - } 1.1573 - 1.1574 - /** 1.1575 - * Substitutes a string for this regular expression in another string. 1.1576 - * This method works like the Perl function of the same name. 1.1577 - * Given a regular expression of "a*b", a String to substituteIn of 1.1578 - * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the 1.1579 - * resulting String returned by subst would be "-foo-garply-wacky-". 1.1580 - * <p> 1.1581 - * It is also possible to reference the contents of a parenthesized expression 1.1582 - * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+", 1.1583 - * a String to substituteIn of "visit us: http://www.apache.org!" and the 1.1584 - * substitution String "<a href=\"$0\">$0</a>", the resulting String 1.1585 - * returned by subst would be 1.1586 - * "visit us: <a href=\"http://www.apache.org\">http://www.apache.org</a>!". 1.1587 - * <p> 1.1588 - * <i>Note:</i> $0 represents the whole match. 1.1589 - * 1.1590 - * @param substituteIn String to substitute within 1.1591 - * @param substitution String to substitute for matches of this regular expression 1.1592 - * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY 1.1593 - * flag bit is set, only the first occurrence of this regular expression is replaced. 1.1594 - * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be 1.1595 - * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will 1.1596 - * be processed. 1.1597 - * @return The string substituteIn with zero or more occurrences of the current 1.1598 - * regular expression replaced with the substitution String (if this regular 1.1599 - * expression object doesn't match at any position, the original String is returned 1.1600 - * unchanged). 1.1601 - */ 1.1602 - public String subst(String substituteIn, String substitution, int flags) 1.1603 - { 1.1604 - // String to return 1.1605 - StringBuffer ret = new StringBuffer(); 1.1606 - 1.1607 - // Start at position 0 and search the whole string 1.1608 - int pos = 0; 1.1609 - int len = substituteIn.length(); 1.1610 - 1.1611 - // Try a match at each position 1.1612 - while (pos < len && match(substituteIn, pos)) 1.1613 - { 1.1614 - // Append string before match 1.1615 - ret.append(substituteIn.substring(pos, getParenStart(0))); 1.1616 - 1.1617 - if ((flags & REPLACE_BACKREFERENCES) != 0) 1.1618 - { 1.1619 - // Process backreferences 1.1620 - int lCurrentPosition = 0; 1.1621 - int lLastPosition = -2; 1.1622 - int lLength = substitution.length(); 1.1623 - boolean bAddedPrefix = false; 1.1624 - 1.1625 - while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0) 1.1626 - { 1.1627 - if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\') 1.1628 - && lCurrentPosition+1 < lLength) 1.1629 - { 1.1630 - char c = substitution.charAt(lCurrentPosition + 1); 1.1631 - if (c >= '0' && c <= '9') 1.1632 - { 1.1633 - if (bAddedPrefix == false) 1.1634 - { 1.1635 - // Append everything between the beginning of the 1.1636 - // substitution string and the current $ sign 1.1637 - ret.append(substitution.substring(0, lCurrentPosition)); 1.1638 - bAddedPrefix = true; 1.1639 - } 1.1640 - else 1.1641 - { 1.1642 - // Append everything between the last and the current $ sign 1.1643 - ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition)); 1.1644 - } 1.1645 - 1.1646 - // Append the parenthesized expression 1.1647 - // Note: if a parenthesized expression of the requested 1.1648 - // index is not available "null" is added to the string 1.1649 - ret.append(getParen(c - '0')); 1.1650 - lLastPosition = lCurrentPosition; 1.1651 - } 1.1652 - } 1.1653 - 1.1654 - // Move forward, skipping past match 1.1655 - lCurrentPosition++; 1.1656 - } 1.1657 - 1.1658 - // Append everything after the last $ sign 1.1659 - ret.append(substitution.substring(lLastPosition + 2, lLength)); 1.1660 - } 1.1661 - else 1.1662 - { 1.1663 - // Append substitution without processing backreferences 1.1664 - ret.append(substitution); 1.1665 - } 1.1666 - 1.1667 - // Move forward, skipping past match 1.1668 - int newpos = getParenEnd(0); 1.1669 - 1.1670 - // We always want to make progress! 1.1671 - if (newpos == pos) 1.1672 - { 1.1673 - newpos++; 1.1674 - } 1.1675 - 1.1676 - // Try new position 1.1677 - pos = newpos; 1.1678 - 1.1679 - // Break out if we're only supposed to replace one occurrence 1.1680 - if ((flags & REPLACE_FIRSTONLY) != 0) 1.1681 - { 1.1682 - break; 1.1683 - } 1.1684 - } 1.1685 - 1.1686 - // If there's remaining input, append it 1.1687 - if (pos < len) 1.1688 - { 1.1689 - ret.append(substituteIn.substring(pos)); 1.1690 - } 1.1691 - 1.1692 - // Return string buffer as string 1.1693 - return ret.toString(); 1.1694 - } 1.1695 - 1.1696 - /** 1.1697 - * Returns an array of Strings, whose toString representation matches a regular 1.1698 - * expression. This method works like the Perl function of the same name. Given 1.1699 - * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz, 1.1700 - * aaaab], the array of Strings returned by grep would be [aab, aaaab]. 1.1701 - * 1.1702 - * @param search Array of Objects to search 1.1703 - * @return Array of Strings whose toString() value matches this regular expression. 1.1704 - */ 1.1705 - public String[] grep(Object[] search) 1.1706 - { 1.1707 - // Create new vector to hold return items 1.1708 - Vector v = new Vector(); 1.1709 - 1.1710 - // Traverse array of objects 1.1711 - for (int i = 0; i < search.length; i++) 1.1712 - { 1.1713 - // Get next object as a string 1.1714 - String s = search[i].toString(); 1.1715 - 1.1716 - // If it matches this regexp, add it to the list 1.1717 - if (match(s)) 1.1718 - { 1.1719 - v.addElement(s); 1.1720 - } 1.1721 - } 1.1722 - 1.1723 - // Return vector as an array of strings 1.1724 - String[] ret = new String[v.size()]; 1.1725 - v.copyInto(ret); 1.1726 - return ret; 1.1727 - } 1.1728 - 1.1729 - /** 1.1730 - * @return true if character at i-th position in the <code>search</code> string is a newline 1.1731 - */ 1.1732 - private boolean isNewline(int i) 1.1733 - { 1.1734 - char nextChar = search.charAt(i); 1.1735 - 1.1736 - if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085' 1.1737 - || nextChar == '\u2028' || nextChar == '\u2029') 1.1738 - { 1.1739 - return true; 1.1740 - } 1.1741 - 1.1742 - return false; 1.1743 - } 1.1744 - 1.1745 - /** 1.1746 - * Compares two characters. 1.1747 - * 1.1748 - * @param c1 first character to compare. 1.1749 - * @param c2 second character to compare. 1.1750 - * @param caseIndependent whether comparision is case insensitive or not. 1.1751 - * @return negative, 0, or positive integer as the first character 1.1752 - * less than, equal to, or greater then the second. 1.1753 - */ 1.1754 - private int compareChars(char c1, char c2, boolean caseIndependent) 1.1755 - { 1.1756 - if (caseIndependent) 1.1757 - { 1.1758 - c1 = Character.toLowerCase(c1); 1.1759 - c2 = Character.toLowerCase(c2); 1.1760 - } 1.1761 - return ((int)c1 - (int)c2); 1.1762 - } 1.1763 -}