1 /* |
|
2 * reserved comment block |
|
3 * DO NOT REMOVE OR ALTER! |
|
4 */ |
|
5 /* |
|
6 * Copyright 1999-2004 The Apache Software Foundation. |
|
7 * |
|
8 * Licensed under the Apache License, Version 2.0 (the "License"); |
|
9 * you may not use this file except in compliance with the License. |
|
10 * You may obtain a copy of the License at |
|
11 * |
|
12 * http://www.apache.org/licenses/LICENSE-2.0 |
|
13 * |
|
14 * Unless required by applicable law or agreed to in writing, software |
|
15 * distributed under the License is distributed on an "AS IS" BASIS, |
|
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
17 * See the License for the specific language governing permissions and |
|
18 * limitations under the License. |
|
19 */ |
|
20 |
|
21 package com.sun.org.apache.regexp.internal; |
|
22 |
|
23 import java.io.Serializable; |
|
24 import java.util.Vector; |
|
25 |
|
26 /** |
|
27 * RE is an efficient, lightweight regular expression evaluator/matcher |
|
28 * class. Regular expressions are pattern descriptions which enable |
|
29 * sophisticated matching of strings. In addition to being able to |
|
30 * match a string against a pattern, you can also extract parts of the |
|
31 * match. This is especially useful in text parsing! Details on the |
|
32 * syntax of regular expression patterns are given below. |
|
33 * |
|
34 * <p> |
|
35 * To compile a regular expression (RE), you can simply construct an RE |
|
36 * matcher object from the string specification of the pattern, like this: |
|
37 * |
|
38 * <pre> |
|
39 * RE r = new RE("a*b"); |
|
40 * </pre> |
|
41 * |
|
42 * <p> |
|
43 * Once you have done this, you can call either of the RE.match methods to |
|
44 * perform matching on a String. For example: |
|
45 * |
|
46 * <pre> |
|
47 * boolean matched = r.match("aaaab"); |
|
48 * </pre> |
|
49 * |
|
50 * will cause the boolean matched to be set to true because the |
|
51 * pattern "a*b" matches the string "aaaab". |
|
52 * |
|
53 * <p> |
|
54 * If you were interested in the <i>number</i> of a's which matched the |
|
55 * first part of our example expression, you could change the expression to |
|
56 * "(a*)b". Then when you compiled the expression and matched it against |
|
57 * something like "xaaaab", you would get results like this: |
|
58 * |
|
59 * <pre> |
|
60 * RE r = new RE("(a*)b"); // Compile expression |
|
61 * boolean matched = r.match("xaaaab"); // Match against "xaaaab" |
|
62 * |
|
63 * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab' |
|
64 * String insideParens = r.getParen(1); // insideParens will be 'aaaa' |
|
65 * |
|
66 * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1 |
|
67 * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6 |
|
68 * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5 |
|
69 * |
|
70 * int startInside = r.getParenStart(1); // startInside will be index 1 |
|
71 * int endInside = r.getParenEnd(1); // endInside will be index 5 |
|
72 * int lenInside = r.getParenLength(1); // lenInside will be 4 |
|
73 * </pre> |
|
74 * |
|
75 * You can also refer to the contents of a parenthesized expression |
|
76 * within a regular expression itself. This is called a |
|
77 * 'backreference'. The first backreference in a regular expression is |
|
78 * denoted by \1, the second by \2 and so on. So the expression: |
|
79 * |
|
80 * <pre> |
|
81 * ([0-9]+)=\1 |
|
82 * </pre> |
|
83 * |
|
84 * will match any string of the form n=n (like 0=0 or 2=2). |
|
85 * |
|
86 * <p> |
|
87 * The full regular expression syntax accepted by RE is described here: |
|
88 * |
|
89 * <pre> |
|
90 * |
|
91 * <b><font face=times roman>Characters</font></b> |
|
92 * |
|
93 * <i>unicodeChar</i> Matches any identical unicode character |
|
94 * \ Used to quote a meta-character (like '*') |
|
95 * \\ Matches a single '\' character |
|
96 * \0nnn Matches a given octal character |
|
97 * \xhh Matches a given 8-bit hexadecimal character |
|
98 * \\uhhhh Matches a given 16-bit hexadecimal character |
|
99 * \t Matches an ASCII tab character |
|
100 * \n Matches an ASCII newline character |
|
101 * \r Matches an ASCII return character |
|
102 * \f Matches an ASCII form feed character |
|
103 * |
|
104 * |
|
105 * <b><font face=times roman>Character Classes</font></b> |
|
106 * |
|
107 * [abc] Simple character class |
|
108 * [a-zA-Z] Character class with ranges |
|
109 * [^abc] Negated character class |
|
110 * </pre> |
|
111 * |
|
112 * <b>NOTE:</b> Incomplete ranges will be interpreted as "starts |
|
113 * from zero" or "ends with last character". |
|
114 * <br> |
|
115 * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF], |
|
116 * [-] means "all characters". |
|
117 * |
|
118 * <pre> |
|
119 * |
|
120 * <b><font face=times roman>Standard POSIX Character Classes</font></b> |
|
121 * |
|
122 * [:alnum:] Alphanumeric characters. |
|
123 * [:alpha:] Alphabetic characters. |
|
124 * [:blank:] Space and tab characters. |
|
125 * [:cntrl:] Control characters. |
|
126 * [:digit:] Numeric characters. |
|
127 * [:graph:] Characters that are printable and are also visible. |
|
128 * (A space is printable, but not visible, while an |
|
129 * `a' is both.) |
|
130 * [:lower:] Lower-case alphabetic characters. |
|
131 * [:print:] Printable characters (characters that are not |
|
132 * control characters.) |
|
133 * [:punct:] Punctuation characters (characters that are not letter, |
|
134 * digits, control characters, or space characters). |
|
135 * [:space:] Space characters (such as space, tab, and formfeed, |
|
136 * to name a few). |
|
137 * [:upper:] Upper-case alphabetic characters. |
|
138 * [:xdigit:] Characters that are hexadecimal digits. |
|
139 * |
|
140 * |
|
141 * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b> |
|
142 * |
|
143 * [:javastart:] Start of a Java identifier |
|
144 * [:javapart:] Part of a Java identifier |
|
145 * |
|
146 * |
|
147 * <b><font face=times roman>Predefined Classes</font></b> |
|
148 * |
|
149 * . Matches any character other than newline |
|
150 * \w Matches a "word" character (alphanumeric plus "_") |
|
151 * \W Matches a non-word character |
|
152 * \s Matches a whitespace character |
|
153 * \S Matches a non-whitespace character |
|
154 * \d Matches a digit character |
|
155 * \D Matches a non-digit character |
|
156 * |
|
157 * |
|
158 * <b><font face=times roman>Boundary Matchers</font></b> |
|
159 * |
|
160 * ^ Matches only at the beginning of a line |
|
161 * $ Matches only at the end of a line |
|
162 * \b Matches only at a word boundary |
|
163 * \B Matches only at a non-word boundary |
|
164 * |
|
165 * |
|
166 * <b><font face=times roman>Greedy Closures</font></b> |
|
167 * |
|
168 * A* Matches A 0 or more times (greedy) |
|
169 * A+ Matches A 1 or more times (greedy) |
|
170 * A? Matches A 1 or 0 times (greedy) |
|
171 * A{n} Matches A exactly n times (greedy) |
|
172 * A{n,} Matches A at least n times (greedy) |
|
173 * A{n,m} Matches A at least n but not more than m times (greedy) |
|
174 * |
|
175 * |
|
176 * <b><font face=times roman>Reluctant Closures</font></b> |
|
177 * |
|
178 * A*? Matches A 0 or more times (reluctant) |
|
179 * A+? Matches A 1 or more times (reluctant) |
|
180 * A?? Matches A 0 or 1 times (reluctant) |
|
181 * |
|
182 * |
|
183 * <b><font face=times roman>Logical Operators</font></b> |
|
184 * |
|
185 * AB Matches A followed by B |
|
186 * A|B Matches either A or B |
|
187 * (A) Used for subexpression grouping |
|
188 * (?:A) Used for subexpression clustering (just like grouping but |
|
189 * no backrefs) |
|
190 * |
|
191 * |
|
192 * <b><font face=times roman>Backreferences</font></b> |
|
193 * |
|
194 * \1 Backreference to 1st parenthesized subexpression |
|
195 * \2 Backreference to 2nd parenthesized subexpression |
|
196 * \3 Backreference to 3rd parenthesized subexpression |
|
197 * \4 Backreference to 4th parenthesized subexpression |
|
198 * \5 Backreference to 5th parenthesized subexpression |
|
199 * \6 Backreference to 6th parenthesized subexpression |
|
200 * \7 Backreference to 7th parenthesized subexpression |
|
201 * \8 Backreference to 8th parenthesized subexpression |
|
202 * \9 Backreference to 9th parenthesized subexpression |
|
203 * </pre> |
|
204 * |
|
205 * <p> |
|
206 * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning |
|
207 * that they match as many elements of the string as possible without |
|
208 * causing the overall match to fail. If you want a closure to be |
|
209 * reluctant (non-greedy), you can simply follow it with a '?'. A |
|
210 * reluctant closure will match as few elements of the string as |
|
211 * possible when finding matches. {m,n} closures don't currently |
|
212 * support reluctancy. |
|
213 * |
|
214 * <p> |
|
215 * <b><font face="times roman">Line terminators</font></b> |
|
216 * <br> |
|
217 * A line terminator is a one- or two-character sequence that marks |
|
218 * the end of a line of the input character sequence. The following |
|
219 * are recognized as line terminators: |
|
220 * <ul> |
|
221 * <li>A newline (line feed) character ('\n'),</li> |
|
222 * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li> |
|
223 * <li>A standalone carriage-return character ('\r'),</li> |
|
224 * <li>A next-line character ('\u0085'),</li> |
|
225 * <li>A line-separator character ('\u2028'), or</li> |
|
226 * <li>A paragraph-separator character ('\u2029').</li> |
|
227 * </ul> |
|
228 * |
|
229 * <p> |
|
230 * RE runs programs compiled by the RECompiler class. But the RE |
|
231 * matcher class does not include the actual regular expression compiler |
|
232 * for reasons of efficiency. In fact, if you want to pre-compile one |
|
233 * or more regular expressions, the 'recompile' class can be invoked |
|
234 * from the command line to produce compiled output like this: |
|
235 * |
|
236 * <pre> |
|
237 * // Pre-compiled regular expression "a*b" |
|
238 * char[] re1Instructions = |
|
239 * { |
|
240 * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041, |
|
241 * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047, |
|
242 * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000, |
|
243 * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000, |
|
244 * 0x0000, |
|
245 * }; |
|
246 * |
|
247 * |
|
248 * REProgram re1 = new REProgram(re1Instructions); |
|
249 * </pre> |
|
250 * |
|
251 * You can then construct a regular expression matcher (RE) object from |
|
252 * the pre-compiled expression re1 and thus avoid the overhead of |
|
253 * compiling the expression at runtime. If you require more dynamic |
|
254 * regular expressions, you can construct a single RECompiler object and |
|
255 * re-use it to compile each expression. Similarly, you can change the |
|
256 * program run by a given matcher object at any time. However, RE and |
|
257 * RECompiler are not threadsafe (for efficiency reasons, and because |
|
258 * requiring thread safety in this class is deemed to be a rare |
|
259 * requirement), so you will need to construct a separate compiler or |
|
260 * matcher object for each thread (unless you do thread synchronization |
|
261 * yourself). Once an expression is compiled into an REProgram object, that REProgram |
|
262 * can be safely shared across multiple threads and RE objects. |
|
263 * |
|
264 * <br><p><br> |
|
265 * |
|
266 * <font color="red"> |
|
267 * <i>ISSUES:</i> |
|
268 * |
|
269 * <ul> |
|
270 * <li>com.weusours.util.re is not currently compatible with all |
|
271 * standard POSIX regcomp flags</li> |
|
272 * <li>com.weusours.util.re does not support POSIX equivalence classes |
|
273 * ([=foo=] syntax) (I18N/locale issue)</li> |
|
274 * <li>com.weusours.util.re does not support nested POSIX character |
|
275 * classes (definitely should, but not completely trivial)</li> |
|
276 * <li>com.weusours.util.re does not support POSIX character collation |
|
277 * concepts ([.foo.] syntax) (I18N/locale issue)</li> |
|
278 * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li> |
|
279 * <li>Should RE support character iterators (for backwards RE matching!)?</li> |
|
280 * <li>Should RE support reluctant {m,n} closures (does anyone care)?</li> |
|
281 * <li>Not *all* possibilities are considered for greediness when backreferences |
|
282 * are involved (as POSIX suggests should be the case). The POSIX RE |
|
283 * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match |
|
284 * of acdacaa where \1 is "a". This is not the case in this RE package, |
|
285 * and actually Perl doesn't go to this extent either! Until someone |
|
286 * actually complains about this, I'm not sure it's worth "fixing". |
|
287 * If it ever is fixed, test #137 in RETest.txt should be updated.</li> |
|
288 * </ul> |
|
289 * |
|
290 * </font> |
|
291 * |
|
292 * @see recompile |
|
293 * @see RECompiler |
|
294 * |
|
295 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a> |
|
296 * @author <a href="mailto:ts@sch-fer.de">Tobias Schäfer</a> |
|
297 */ |
|
298 public class RE implements Serializable |
|
299 { |
|
300 /** |
|
301 * Specifies normal, case-sensitive matching behaviour. |
|
302 */ |
|
303 public static final int MATCH_NORMAL = 0x0000; |
|
304 |
|
305 /** |
|
306 * Flag to indicate that matching should be case-independent (folded) |
|
307 */ |
|
308 public static final int MATCH_CASEINDEPENDENT = 0x0001; |
|
309 |
|
310 /** |
|
311 * Newlines should match as BOL/EOL (^ and $) |
|
312 */ |
|
313 public static final int MATCH_MULTILINE = 0x0002; |
|
314 |
|
315 /** |
|
316 * Consider all input a single body of text - newlines are matched by . |
|
317 */ |
|
318 public static final int MATCH_SINGLELINE = 0x0004; |
|
319 |
|
320 /************************************************ |
|
321 * * |
|
322 * The format of a node in a program is: * |
|
323 * * |
|
324 * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] * |
|
325 * * |
|
326 * char OPCODE - instruction * |
|
327 * char OPDATA - modifying data * |
|
328 * char OPNEXT - next node (relative offset) * |
|
329 * * |
|
330 ************************************************/ |
|
331 |
|
332 // Opcode Char Opdata/Operand Meaning |
|
333 // ---------- ---------- --------------- -------------------------------------------------- |
|
334 static final char OP_END = 'E'; // end of program |
|
335 static final char OP_BOL = '^'; // match only if at beginning of line |
|
336 static final char OP_EOL = '$'; // match only if at end of line |
|
337 static final char OP_ANY = '.'; // match any single character except newline |
|
338 static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges |
|
339 static final char OP_BRANCH = '|'; // node match this alternative or the next one |
|
340 static final char OP_ATOM = 'A'; // length/string length of string followed by string itself |
|
341 static final char OP_STAR = '*'; // node kleene closure |
|
342 static final char OP_PLUS = '+'; // node positive closure |
|
343 static final char OP_MAYBE = '?'; // node optional closure |
|
344 static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code) |
|
345 static final char OP_OPEN = '('; // number nth opening paren |
|
346 static final char OP_OPEN_CLUSTER = '<'; // opening cluster |
|
347 static final char OP_CLOSE = ')'; // number nth closing paren |
|
348 static final char OP_CLOSE_CLUSTER = '>'; // closing cluster |
|
349 static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string |
|
350 static final char OP_GOTO = 'G'; // nothing but a (back-)pointer |
|
351 static final char OP_NOTHING = 'N'; // match null string such as in '(a|)' |
|
352 static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*') |
|
353 static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+') |
|
354 static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?') |
|
355 static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes |
|
356 |
|
357 // Escape codes |
|
358 static final char E_ALNUM = 'w'; // Alphanumeric |
|
359 static final char E_NALNUM = 'W'; // Non-alphanumeric |
|
360 static final char E_BOUND = 'b'; // Word boundary |
|
361 static final char E_NBOUND = 'B'; // Non-word boundary |
|
362 static final char E_SPACE = 's'; // Whitespace |
|
363 static final char E_NSPACE = 'S'; // Non-whitespace |
|
364 static final char E_DIGIT = 'd'; // Digit |
|
365 static final char E_NDIGIT = 'D'; // Non-digit |
|
366 |
|
367 // Posix character classes |
|
368 static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics |
|
369 static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics |
|
370 static final char POSIX_CLASS_BLANK = 'b'; // Blanks |
|
371 static final char POSIX_CLASS_CNTRL = 'c'; // Control characters |
|
372 static final char POSIX_CLASS_DIGIT = 'd'; // Digits |
|
373 static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters |
|
374 static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters |
|
375 static final char POSIX_CLASS_PRINT = 'p'; // Printable characters |
|
376 static final char POSIX_CLASS_PUNCT = '!'; // Punctuation |
|
377 static final char POSIX_CLASS_SPACE = 's'; // Spaces |
|
378 static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters |
|
379 static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits |
|
380 static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start |
|
381 static final char POSIX_CLASS_JPART = 'k'; // Java identifier part |
|
382 |
|
383 // Limits |
|
384 static final int maxNode = 65536; // Maximum number of nodes in a program |
|
385 static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs) |
|
386 |
|
387 // Node layout constants |
|
388 static final int offsetOpcode = 0; // Opcode offset (first character) |
|
389 static final int offsetOpdata = 1; // Opdata offset (second char) |
|
390 static final int offsetNext = 2; // Next index offset (third char) |
|
391 static final int nodeSize = 3; // Node size (in chars) |
|
392 |
|
393 // State of the current program |
|
394 REProgram program; // Compiled regular expression 'program'; null when no program has been set |
|
395 transient CharacterIterator search; // The input being matched against |
|
396 int matchFlags; // Match behaviour flags (bitwise combination of the MATCH_* constants) |
|
397 int maxParen = MAX_PAREN; |
|
398 |
|
399 // Parenthesized subexpressions |
|
400 transient int parenCount; // Number of subexpressions matched (num open parens + 1) |
|
401 transient int start0; // Cache of start[0] |
|
402 transient int end0; // Cache of end[0] |
|
403 transient int start1; // Cache of start[1] |
|
404 transient int end1; // Cache of end[1] |
|
405 transient int start2; // Cache of start[2] |
|
406 transient int end2; // Cache of end[2] |
|
407 transient int[] startn; // Lazy-alloced array of sub-expression starts (consulted for indices other than 0, 1 and 2) |
|
408 transient int[] endn; // Lazy-alloced array of sub-expression ends (consulted for indices other than 0, 1 and 2) |
|
409 |
|
410 // Backreferences |
|
411 transient int[] startBackref; // Lazy-alloced array of backref starts |
|
412 transient int[] endBackref; // Lazy-alloced array of backref ends |
|
413 |
|
/**
 * Builds a matcher for the given pattern text using the default matching
 * style. This simply delegates to {@link #RE(String, int)} with
 * {@link #MATCH_NORMAL}, which compiles the pattern with a newly created
 * RECompiler. When many patterns have to be compiled, sharing one
 * RECompiler instance may be preferable.
 *
 * @param pattern The regular expression pattern to compile.
 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
 * @see RECompiler
 * @see recompile
 */
public RE(String pattern) throws RESyntaxException
{
    this(pattern, RE.MATCH_NORMAL);
}
|
428 |
|
/**
 * Builds a matcher for the given pattern text with an explicit matching
 * style. The pattern is compiled by a newly created RECompiler and the
 * resulting program is handed to {@link #RE(REProgram)}; afterwards the
 * requested flags are installed through {@link #setMatchFlags(int)}.
 * When many patterns have to be compiled, sharing one RECompiler instance
 * may be preferable.
 *
 * @param pattern The regular expression pattern to compile.
 * @param matchFlags The matching style (combination of the RE.MATCH_* flags)
 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
 * @see RECompiler
 * @see recompile
 */
public RE(String pattern, int matchFlags) throws RESyntaxException
{
    // Compile first (installs the program with the default flags) ...
    this(new RECompiler().compile(pattern));

    // ... then override the flags with the caller's choice.
    this.setMatchFlags(matchFlags);
}
|
445 |
|
/**
 * Builds a matcher around an already compiled regular expression program
 * and installs the given match behaviour flags.
 *
 * @param program Compiled regular expression program (see RECompiler and/or recompile)
 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
 *
 * <pre>
 *   MATCH_NORMAL              // Normal (case-sensitive) matching
 *   MATCH_CASEINDEPENDENT     // Case folded comparisons
 *   MATCH_MULTILINE           // Newline matches as BOL/EOL
 *   MATCH_SINGLELINE          // All input is a single body of text
 * </pre>
 *
 * @see RECompiler
 * @see REProgram
 * @see recompile
 */
public RE(REProgram program, int matchFlags)
{
    this.setProgram(program);
    this.setMatchFlags(matchFlags);
}
|
469 |
|
/**
 * Builds a matcher around an already compiled regular expression program,
 * using the default matching style ({@link #MATCH_NORMAL}).
 *
 * @param program Compiled regular expression program
 * @see RECompiler
 * @see recompile
 */
public RE(REProgram program)
{
    this(program, RE.MATCH_NORMAL);
}
|
482 |
|
/**
 * Builds a matcher that has no program yet: the program is passed as
 * null and the flags are {@link #MATCH_NORMAL}. A program can be supplied
 * later with {@link #setProgram(REProgram)}. This is expected to be an
 * uncommon way of creating a matcher, but it is supported.
 */
public RE()
{
    // The cast selects the (REProgram, int) constructor for the null argument.
    this((REProgram) null, RE.MATCH_NORMAL);
}
|
491 |
|
492 /** |
|
493 * Converts a 'simplified' regular expression to a full regular expression |
|
494 * |
|
495 * @param pattern The pattern to convert |
|
496 * @return The full regular expression |
|
497 */ |
|
498 public static String simplePatternToFullRegularExpression(String pattern) |
|
499 { |
|
500 StringBuffer buf = new StringBuffer(); |
|
501 for (int i = 0; i < pattern.length(); i++) |
|
502 { |
|
503 char c = pattern.charAt(i); |
|
504 switch (c) |
|
505 { |
|
506 case '*': |
|
507 buf.append(".*"); |
|
508 break; |
|
509 |
|
510 case '.': |
|
511 case '[': |
|
512 case ']': |
|
513 case '\\': |
|
514 case '+': |
|
515 case '?': |
|
516 case '{': |
|
517 case '}': |
|
518 case '$': |
|
519 case '^': |
|
520 case '|': |
|
521 case '(': |
|
522 case ')': |
|
523 buf.append('\\'); |
|
524 default: |
|
525 buf.append(c); |
|
526 break; |
|
527 } |
|
528 } |
|
529 return buf.toString(); |
|
530 } |
|
531 |
|
532 /** |
|
533 * Sets match behaviour flags which alter the way RE does matching. |
|
534 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*): |
|
535 * |
|
536 * <pre> |
|
537 * MATCH_NORMAL // Normal (case-sensitive) matching |
|
538 * MATCH_CASEINDEPENDENT // Case folded comparisons |
|
539 * MATCH_MULTILINE // Newline matches as BOL/EOL |
|
540 * </pre> |
|
541 */ |
|
542 public void setMatchFlags(int matchFlags) |
|
543 { |
|
544 this.matchFlags = matchFlags; |
|
545 } |
|
546 |
|
547 /** |
|
548 * Returns the current match behaviour flags. |
|
549 * @return Current match behaviour flags (RE.MATCH_*). |
|
550 * |
|
551 * <pre> |
|
552 * MATCH_NORMAL // Normal (case-sensitive) matching |
|
553 * MATCH_CASEINDEPENDENT // Case folded comparisons |
|
554 * MATCH_MULTILINE // Newline matches as BOL/EOL |
|
555 * </pre> |
|
556 * |
|
557 * @see #setMatchFlags |
|
558 */ |
|
559 public int getMatchFlags() |
|
560 { |
|
561 return matchFlags; |
|
562 } |
|
563 |
|
564 /** |
|
565 * Sets the current regular expression program used by this matcher object. |
|
566 * |
|
567 * @param program Regular expression program compiled by RECompiler. |
|
568 * @see RECompiler |
|
569 * @see REProgram |
|
570 * @see recompile |
|
571 */ |
|
572 public void setProgram(REProgram program) |
|
573 { |
|
574 this.program = program; |
|
575 if (program != null && program.maxParens != -1) { |
|
576 this.maxParen = program.maxParens; |
|
577 } else { |
|
578 this.maxParen = MAX_PAREN; |
|
579 } |
|
580 } |
|
581 |
|
582 /** |
|
583 * Returns the current regular expression program in use by this matcher object. |
|
584 * |
|
585 * @return Regular expression program |
|
586 * @see #setProgram |
|
587 */ |
|
588 public REProgram getProgram() |
|
589 { |
|
590 return program; |
|
591 } |
|
592 |
|
593 /** |
|
594 * Returns the number of parenthesized subexpressions available after a successful match. |
|
595 * |
|
596 * @return Number of available parenthesized subexpressions |
|
597 */ |
|
598 public int getParenCount() |
|
599 { |
|
600 return parenCount; |
|
601 } |
|
602 |
|
603 /** |
|
604 * Gets the contents of a parenthesized subexpression after a successful match. |
|
605 * |
|
606 * @param which Nesting level of subexpression |
|
607 * @return String |
|
608 */ |
|
609 public String getParen(int which) |
|
610 { |
|
611 int start; |
|
612 if (which < parenCount && (start = getParenStart(which)) >= 0) |
|
613 { |
|
614 return search.substring(start, getParenEnd(which)); |
|
615 } |
|
616 return null; |
|
617 } |
|
618 |
|
619 /** |
|
620 * Returns the start index of a given paren level. |
|
621 * |
|
622 * @param which Nesting level of subexpression |
|
623 * @return String index |
|
624 */ |
|
625 public final int getParenStart(int which) |
|
626 { |
|
627 if (which < parenCount) |
|
628 { |
|
629 switch (which) |
|
630 { |
|
631 case 0: |
|
632 return start0; |
|
633 |
|
634 case 1: |
|
635 return start1; |
|
636 |
|
637 case 2: |
|
638 return start2; |
|
639 |
|
640 default: |
|
641 if (startn == null) |
|
642 { |
|
643 allocParens(); |
|
644 } |
|
645 return startn[which]; |
|
646 } |
|
647 } |
|
648 return -1; |
|
649 } |
|
650 |
|
651 /** |
|
652 * Returns the end index of a given paren level. |
|
653 * |
|
654 * @param which Nesting level of subexpression |
|
655 * @return String index |
|
656 */ |
|
657 public final int getParenEnd(int which) |
|
658 { |
|
659 if (which < parenCount) |
|
660 { |
|
661 switch (which) |
|
662 { |
|
663 case 0: |
|
664 return end0; |
|
665 |
|
666 case 1: |
|
667 return end1; |
|
668 |
|
669 case 2: |
|
670 return end2; |
|
671 |
|
672 default: |
|
673 if (endn == null) |
|
674 { |
|
675 allocParens(); |
|
676 } |
|
677 return endn[which]; |
|
678 } |
|
679 } |
|
680 return -1; |
|
681 } |
|
682 |
|
683 /** |
|
684 * Returns the length of a given paren level. |
|
685 * |
|
686 * @param which Nesting level of subexpression |
|
687 * @return Number of characters in the parenthesized subexpression |
|
688 */ |
|
689 public final int getParenLength(int which) |
|
690 { |
|
691 if (which < parenCount) |
|
692 { |
|
693 return getParenEnd(which) - getParenStart(which); |
|
694 } |
|
695 return -1; |
|
696 } |
|
697 |
|
698 /** |
|
699 * Sets the start of a paren level |
|
700 * |
|
701 * @param which Which paren level |
|
702 * @param i Index in input array |
|
703 */ |
|
704 protected final void setParenStart(int which, int i) |
|
705 { |
|
706 if (which < parenCount) |
|
707 { |
|
708 switch (which) |
|
709 { |
|
710 case 0: |
|
711 start0 = i; |
|
712 break; |
|
713 |
|
714 case 1: |
|
715 start1 = i; |
|
716 break; |
|
717 |
|
718 case 2: |
|
719 start2 = i; |
|
720 break; |
|
721 |
|
722 default: |
|
723 if (startn == null) |
|
724 { |
|
725 allocParens(); |
|
726 } |
|
727 startn[which] = i; |
|
728 break; |
|
729 } |
|
730 } |
|
731 } |
|
732 |
|
733 /** |
|
734 * Sets the end of a paren level |
|
735 * |
|
736 * @param which Which paren level |
|
737 * @param i Index in input array |
|
738 */ |
|
739 protected final void setParenEnd(int which, int i) |
|
740 { |
|
741 if (which < parenCount) |
|
742 { |
|
743 switch (which) |
|
744 { |
|
745 case 0: |
|
746 end0 = i; |
|
747 break; |
|
748 |
|
749 case 1: |
|
750 end1 = i; |
|
751 break; |
|
752 |
|
753 case 2: |
|
754 end2 = i; |
|
755 break; |
|
756 |
|
757 default: |
|
758 if (endn == null) |
|
759 { |
|
760 allocParens(); |
|
761 } |
|
762 endn[which] = i; |
|
763 break; |
|
764 } |
|
765 } |
|
766 } |
|
767 |
|
768 /** |
|
769 * Throws an Error representing an internal error condition probably resulting |
|
770 * from a bug in the regular expression compiler (or possibly data corruption). |
|
771 * In practice, this should be very rare. |
|
772 * |
|
773 * @param s Error description |
|
774 */ |
|
775 protected void internalError(String s) throws Error |
|
776 { |
|
777 throw new Error("RE internal error: " + s); |
|
778 } |
|
779 |
|
780 /** |
|
781 * Performs lazy allocation of subexpression arrays |
|
782 */ |
|
783 private final void allocParens() |
|
784 { |
|
785 // Allocate arrays for subexpressions |
|
786 startn = new int[maxParen]; |
|
787 endn = new int[maxParen]; |
|
788 |
|
789 // Set sub-expression pointers to invalid values |
|
790 for (int i = 0; i < maxParen; i++) |
|
791 { |
|
792 startn[i] = -1; |
|
793 endn[i] = -1; |
|
794 } |
|
795 } |
|
796 |
|
797 /** |
|
798 * Try to match a string against a subset of nodes in the program |
|
799 * |
|
800 * @param firstNode Node to start at in program |
|
801 * @param lastNode Last valid node (used for matching a subexpression without |
|
802 * matching the rest of the program as well). |
|
803 * @param idxStart Starting position in character array |
|
804 * @return Final input array index if match succeeded. -1 if not. |
|
805 */ |
|
806 protected int matchNodes(int firstNode, int lastNode, int idxStart) |
|
807 { |
|
808 // Our current place in the string |
|
809 int idx = idxStart; |
|
810 |
|
811 // Loop while node is valid |
|
812 int next, opcode, opdata; |
|
813 int idxNew; |
|
814 char[] instruction = program.instruction; |
|
815 for (int node = firstNode; node < lastNode; ) |
|
816 { |
|
817 opcode = instruction[node + offsetOpcode]; |
|
818 next = node + (short)instruction[node + offsetNext]; |
|
819 opdata = instruction[node + offsetOpdata]; |
|
820 |
|
821 switch (opcode) |
|
822 { |
|
823 case OP_RELUCTANTMAYBE: |
|
824 { |
|
825 int once = 0; |
|
826 do |
|
827 { |
|
828 // Try to match the rest without using the reluctant subexpr |
|
829 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) |
|
830 { |
|
831 return idxNew; |
|
832 } |
|
833 } |
|
834 while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1); |
|
835 return -1; |
|
836 } |
|
837 |
|
838 case OP_RELUCTANTPLUS: |
|
839 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) |
|
840 { |
|
841 // Try to match the rest without using the reluctant subexpr |
|
842 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) |
|
843 { |
|
844 return idxNew; |
|
845 } |
|
846 } |
|
847 return -1; |
|
848 |
|
849 case OP_RELUCTANTSTAR: |
|
850 do |
|
851 { |
|
852 // Try to match the rest without using the reluctant subexpr |
|
853 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) |
|
854 { |
|
855 return idxNew; |
|
856 } |
|
857 } |
|
858 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1); |
|
859 return -1; |
|
860 |
|
861 case OP_OPEN: |
|
862 |
|
863 // Match subexpression |
|
864 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) |
|
865 { |
|
866 startBackref[opdata] = idx; |
|
867 } |
|
868 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) |
|
869 { |
|
870 // Increase valid paren count |
|
871 if ((opdata + 1) > parenCount) |
|
872 { |
|
873 parenCount = opdata + 1; |
|
874 } |
|
875 |
|
876 // Don't set paren if already set later on |
|
877 if (getParenStart(opdata) == -1) |
|
878 { |
|
879 setParenStart(opdata, idx); |
|
880 } |
|
881 } |
|
882 return idxNew; |
|
883 |
|
884 case OP_CLOSE: |
|
885 |
|
886 // Done matching subexpression |
|
887 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) |
|
888 { |
|
889 endBackref[opdata] = idx; |
|
890 } |
|
891 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) |
|
892 { |
|
893 // Increase valid paren count |
|
894 if ((opdata + 1) > parenCount) |
|
895 { |
|
896 parenCount = opdata + 1; |
|
897 } |
|
898 |
|
899 // Don't set paren if already set later on |
|
900 if (getParenEnd(opdata) == -1) |
|
901 { |
|
902 setParenEnd(opdata, idx); |
|
903 } |
|
904 } |
|
905 return idxNew; |
|
906 |
|
907 case OP_OPEN_CLUSTER: |
|
908 case OP_CLOSE_CLUSTER: |
|
909 // starting or ending the matching of a subexpression which has no backref. |
|
910 return matchNodes( next, maxNode, idx ); |
|
911 |
|
912 case OP_BACKREF: |
|
913 { |
|
914 // Get the start and end of the backref |
|
915 int s = startBackref[opdata]; |
|
916 int e = endBackref[opdata]; |
|
917 |
|
918 // We don't know the backref yet |
|
919 if (s == -1 || e == -1) |
|
920 { |
|
921 return -1; |
|
922 } |
|
923 |
|
924 // The backref is empty size |
|
925 if (s == e) |
|
926 { |
|
927 break; |
|
928 } |
|
929 |
|
930 // Get the length of the backref |
|
931 int l = e - s; |
|
932 |
|
933 // If there's not enough input left, give up. |
|
934 if (search.isEnd(idx + l - 1)) |
|
935 { |
|
936 return -1; |
|
937 } |
|
938 |
|
939 // Case fold the backref? |
|
940 final boolean caseFold = |
|
941 ((matchFlags & MATCH_CASEINDEPENDENT) != 0); |
|
942 // Compare backref to input |
|
943 for (int i = 0; i < l; i++) |
|
944 { |
|
945 if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0) |
|
946 { |
|
947 return -1; |
|
948 } |
|
949 } |
|
950 } |
|
951 break; |
|
952 |
|
953 case OP_BOL: |
|
954 |
|
955 // Fail if we're not at the start of the string |
|
956 if (idx != 0) |
|
957 { |
|
958 // If we're multiline matching, we could still be at the start of a line |
|
959 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) |
|
960 { |
|
961 // If not at start of line, give up |
|
962 if (idx <= 0 || !isNewline(idx - 1)) { |
|
963 return -1; |
|
964 } else { |
|
965 break; |
|
966 } |
|
967 } |
|
968 return -1; |
|
969 } |
|
970 break; |
|
971 |
|
972 case OP_EOL: |
|
973 |
|
974 // If we're not at the end of string |
|
975 if (!search.isEnd(0) && !search.isEnd(idx)) |
|
976 { |
|
977 // If we're multi-line matching |
|
978 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) |
|
979 { |
|
980 // Give up if we're not at the end of a line |
|
981 if (!isNewline(idx)) { |
|
982 return -1; |
|
983 } else { |
|
984 break; |
|
985 } |
|
986 } |
|
987 return -1; |
|
988 } |
|
989 break; |
|
990 |
|
991 case OP_ESCAPE: |
|
992 |
|
993 // Which escape? |
|
994 switch (opdata) |
|
995 { |
|
996 // Word boundary match |
|
997 case E_NBOUND: |
|
998 case E_BOUND: |
|
999 { |
|
1000 char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1)); |
|
1001 char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx)); |
|
1002 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND)) |
|
1003 { |
|
1004 return -1; |
|
1005 } |
|
1006 } |
|
1007 break; |
|
1008 |
|
1009 // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit |
|
1010 case E_ALNUM: |
|
1011 case E_NALNUM: |
|
1012 case E_DIGIT: |
|
1013 case E_NDIGIT: |
|
1014 case E_SPACE: |
|
1015 case E_NSPACE: |
|
1016 |
|
1017 // Give up if out of input |
|
1018 if (search.isEnd(idx)) |
|
1019 { |
|
1020 return -1; |
|
1021 } |
|
1022 |
|
1023 char c = search.charAt(idx); |
|
1024 |
|
1025 // Switch on escape |
|
1026 switch (opdata) |
|
1027 { |
|
1028 case E_ALNUM: |
|
1029 case E_NALNUM: |
|
1030 if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM))) |
|
1031 { |
|
1032 return -1; |
|
1033 } |
|
1034 break; |
|
1035 |
|
1036 case E_DIGIT: |
|
1037 case E_NDIGIT: |
|
1038 if (!(Character.isDigit(c) == (opdata == E_DIGIT))) |
|
1039 { |
|
1040 return -1; |
|
1041 } |
|
1042 break; |
|
1043 |
|
1044 case E_SPACE: |
|
1045 case E_NSPACE: |
|
1046 if (!(Character.isWhitespace(c) == (opdata == E_SPACE))) |
|
1047 { |
|
1048 return -1; |
|
1049 } |
|
1050 break; |
|
1051 } |
|
1052 idx++; |
|
1053 break; |
|
1054 |
|
1055 default: |
|
1056 internalError("Unrecognized escape '" + opdata + "'"); |
|
1057 } |
|
1058 break; |
|
1059 |
|
1060 case OP_ANY: |
|
1061 |
|
1062 if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) { |
|
1063 // Match anything |
|
1064 if (search.isEnd(idx)) |
|
1065 { |
|
1066 return -1; |
|
1067 } |
|
1068 } |
|
1069 else |
|
1070 { |
|
1071 // Match anything but a newline |
|
1072 if (search.isEnd(idx) || isNewline(idx)) |
|
1073 { |
|
1074 return -1; |
|
1075 } |
|
1076 } |
|
1077 idx++; |
|
1078 break; |
|
1079 |
|
1080 case OP_ATOM: |
|
1081 { |
|
1082 // Match an atom value |
|
1083 if (search.isEnd(idx)) |
|
1084 { |
|
1085 return -1; |
|
1086 } |
|
1087 |
|
1088 // Get length of atom and starting index |
|
1089 int lenAtom = opdata; |
|
1090 int startAtom = node + nodeSize; |
|
1091 |
|
1092 // Give up if not enough input remains to have a match |
|
1093 if (search.isEnd(lenAtom + idx - 1)) |
|
1094 { |
|
1095 return -1; |
|
1096 } |
|
1097 |
|
1098 // Match atom differently depending on casefolding flag |
|
1099 final boolean caseFold = |
|
1100 ((matchFlags & MATCH_CASEINDEPENDENT) != 0); |
|
1101 |
|
1102 for (int i = 0; i < lenAtom; i++) |
|
1103 { |
|
1104 if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0) |
|
1105 { |
|
1106 return -1; |
|
1107 } |
|
1108 } |
|
1109 } |
|
1110 break; |
|
1111 |
|
1112 case OP_POSIXCLASS: |
|
1113 { |
|
1114 // Out of input? |
|
1115 if (search.isEnd(idx)) |
|
1116 { |
|
1117 return -1; |
|
1118 } |
|
1119 |
|
1120 switch (opdata) |
|
1121 { |
|
1122 case POSIX_CLASS_ALNUM: |
|
1123 if (!Character.isLetterOrDigit(search.charAt(idx))) |
|
1124 { |
|
1125 return -1; |
|
1126 } |
|
1127 break; |
|
1128 |
|
1129 case POSIX_CLASS_ALPHA: |
|
1130 if (!Character.isLetter(search.charAt(idx))) |
|
1131 { |
|
1132 return -1; |
|
1133 } |
|
1134 break; |
|
1135 |
|
1136 case POSIX_CLASS_DIGIT: |
|
1137 if (!Character.isDigit(search.charAt(idx))) |
|
1138 { |
|
1139 return -1; |
|
1140 } |
|
1141 break; |
|
1142 |
|
1143 case POSIX_CLASS_BLANK: // JWL - bugbug: is this right?? |
|
1144 if (!Character.isSpaceChar(search.charAt(idx))) |
|
1145 { |
|
1146 return -1; |
|
1147 } |
|
1148 break; |
|
1149 |
|
1150 case POSIX_CLASS_SPACE: |
|
1151 if (!Character.isWhitespace(search.charAt(idx))) |
|
1152 { |
|
1153 return -1; |
|
1154 } |
|
1155 break; |
|
1156 |
|
1157 case POSIX_CLASS_CNTRL: |
|
1158 if (Character.getType(search.charAt(idx)) != Character.CONTROL) |
|
1159 { |
|
1160 return -1; |
|
1161 } |
|
1162 break; |
|
1163 |
|
1164 case POSIX_CLASS_GRAPH: // JWL - bugbug??? |
|
1165 switch (Character.getType(search.charAt(idx))) |
|
1166 { |
|
1167 case Character.MATH_SYMBOL: |
|
1168 case Character.CURRENCY_SYMBOL: |
|
1169 case Character.MODIFIER_SYMBOL: |
|
1170 case Character.OTHER_SYMBOL: |
|
1171 break; |
|
1172 |
|
1173 default: |
|
1174 return -1; |
|
1175 } |
|
1176 break; |
|
1177 |
|
1178 case POSIX_CLASS_LOWER: |
|
1179 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) |
|
1180 { |
|
1181 return -1; |
|
1182 } |
|
1183 break; |
|
1184 |
|
1185 case POSIX_CLASS_UPPER: |
|
1186 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) |
|
1187 { |
|
1188 return -1; |
|
1189 } |
|
1190 break; |
|
1191 |
|
1192 case POSIX_CLASS_PRINT: |
|
1193 if (Character.getType(search.charAt(idx)) == Character.CONTROL) |
|
1194 { |
|
1195 return -1; |
|
1196 } |
|
1197 break; |
|
1198 |
|
1199 case POSIX_CLASS_PUNCT: |
|
1200 { |
|
1201 int type = Character.getType(search.charAt(idx)); |
|
1202 switch(type) |
|
1203 { |
|
1204 case Character.DASH_PUNCTUATION: |
|
1205 case Character.START_PUNCTUATION: |
|
1206 case Character.END_PUNCTUATION: |
|
1207 case Character.CONNECTOR_PUNCTUATION: |
|
1208 case Character.OTHER_PUNCTUATION: |
|
1209 break; |
|
1210 |
|
1211 default: |
|
1212 return -1; |
|
1213 } |
|
1214 } |
|
1215 break; |
|
1216 |
|
1217 case POSIX_CLASS_XDIGIT: // JWL - bugbug?? |
|
1218 { |
|
1219 boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') || |
|
1220 (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') || |
|
1221 (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F')); |
|
1222 if (!isXDigit) |
|
1223 { |
|
1224 return -1; |
|
1225 } |
|
1226 } |
|
1227 break; |
|
1228 |
|
1229 case POSIX_CLASS_JSTART: |
|
1230 if (!Character.isJavaIdentifierStart(search.charAt(idx))) |
|
1231 { |
|
1232 return -1; |
|
1233 } |
|
1234 break; |
|
1235 |
|
1236 case POSIX_CLASS_JPART: |
|
1237 if (!Character.isJavaIdentifierPart(search.charAt(idx))) |
|
1238 { |
|
1239 return -1; |
|
1240 } |
|
1241 break; |
|
1242 |
|
1243 default: |
|
1244 internalError("Bad posix class"); |
|
1245 break; |
|
1246 } |
|
1247 |
|
1248 // Matched. |
|
1249 idx++; |
|
1250 } |
|
1251 break; |
|
1252 |
|
1253 case OP_ANYOF: |
|
1254 { |
|
1255 // Out of input? |
|
1256 if (search.isEnd(idx)) |
|
1257 { |
|
1258 return -1; |
|
1259 } |
|
1260 |
|
1261 // Get character to match against character class and maybe casefold |
|
1262 char c = search.charAt(idx); |
|
1263 boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0; |
|
1264 // Loop through character class checking our match character |
|
1265 int idxRange = node + nodeSize; |
|
1266 int idxEnd = idxRange + (opdata * 2); |
|
1267 boolean match = false; |
|
1268 for (int i = idxRange; !match && i < idxEnd; ) |
|
1269 { |
|
1270 // Get start, end and match characters |
|
1271 char s = instruction[i++]; |
|
1272 char e = instruction[i++]; |
|
1273 |
|
1274 match = ((compareChars(c, s, caseFold) >= 0) |
|
1275 && (compareChars(c, e, caseFold) <= 0)); |
|
1276 } |
|
1277 |
|
1278 // Fail if we didn't match the character class |
|
1279 if (!match) |
|
1280 { |
|
1281 return -1; |
|
1282 } |
|
1283 idx++; |
|
1284 } |
|
1285 break; |
|
1286 |
|
1287 case OP_BRANCH: |
|
1288 { |
|
1289 // Check for choices |
|
1290 if (instruction[next + offsetOpcode] != OP_BRANCH) |
|
1291 { |
|
1292 // If there aren't any other choices, just evaluate this branch. |
|
1293 node += nodeSize; |
|
1294 continue; |
|
1295 } |
|
1296 |
|
1297 // Try all available branches |
|
1298 short nextBranch; |
|
1299 do |
|
1300 { |
|
1301 // Try matching the branch against the string |
|
1302 if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) |
|
1303 { |
|
1304 return idxNew; |
|
1305 } |
|
1306 |
|
1307 // Go to next branch (if any) |
|
1308 nextBranch = (short)instruction[node + offsetNext]; |
|
1309 node += nextBranch; |
|
1310 } |
|
1311 while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH)); |
|
1312 |
|
1313 // Failed to match any branch! |
|
1314 return -1; |
|
1315 } |
|
1316 |
|
1317 case OP_NOTHING: |
|
1318 case OP_GOTO: |
|
1319 |
|
1320 // Just advance to the next node without doing anything |
|
1321 break; |
|
1322 |
|
1323 case OP_END: |
|
1324 |
|
1325 // Match has succeeded! |
|
1326 setParenEnd(0, idx); |
|
1327 return idx; |
|
1328 |
|
1329 default: |
|
1330 |
|
1331 // Corrupt program |
|
1332 internalError("Invalid opcode '" + opcode + "'"); |
|
1333 } |
|
1334 |
|
1335 // Advance to the next node in the program |
|
1336 node = next; |
|
1337 } |
|
1338 |
|
1339 // We "should" never end up here |
|
1340 internalError("Corrupt program"); |
|
1341 return -1; |
|
1342 } |
|
1343 |
|
1344 /** |
|
1345 * Match the current regular expression program against the current |
|
1346 * input string, starting at index i of the input string. This method |
|
1347 * is only meant for internal use. |
|
1348 * |
|
1349 * @param i The input string index to start matching at |
|
1350 * @return True if the input matched the expression |
|
1351 */ |
|
1352 protected boolean matchAt(int i) |
|
1353 { |
|
1354 // Initialize start pointer, paren cache and paren count |
|
1355 start0 = -1; |
|
1356 end0 = -1; |
|
1357 start1 = -1; |
|
1358 end1 = -1; |
|
1359 start2 = -1; |
|
1360 end2 = -1; |
|
1361 startn = null; |
|
1362 endn = null; |
|
1363 parenCount = 1; |
|
1364 setParenStart(0, i); |
|
1365 |
|
1366 // Allocate backref arrays (unless optimizations indicate otherwise) |
|
1367 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) |
|
1368 { |
|
1369 startBackref = new int[maxParen]; |
|
1370 endBackref = new int[maxParen]; |
|
1371 } |
|
1372 |
|
1373 // Match against string |
|
1374 int idx; |
|
1375 if ((idx = matchNodes(0, maxNode, i)) != -1) |
|
1376 { |
|
1377 setParenEnd(0, idx); |
|
1378 return true; |
|
1379 } |
|
1380 |
|
1381 // Didn't match |
|
1382 parenCount = 0; |
|
1383 return false; |
|
1384 } |
|
1385 |
|
1386 /** |
|
1387 * Matches the current regular expression program against a character array, |
|
1388 * starting at a given index. |
|
1389 * |
|
1390 * @param search String to match against |
|
1391 * @param i Index to start searching at |
|
1392 * @return True if string matched |
|
1393 */ |
|
1394 public boolean match(String search, int i) |
|
1395 { |
|
1396 return match(new StringCharacterIterator(search), i); |
|
1397 } |
|
1398 |
|
1399 /** |
|
1400 * Matches the current regular expression program against a character array, |
|
1401 * starting at a given index. |
|
1402 * |
|
1403 * @param search String to match against |
|
1404 * @param i Index to start searching at |
|
1405 * @return True if string matched |
|
1406 */ |
|
1407 public boolean match(CharacterIterator search, int i) |
|
1408 { |
|
1409 // There is no compiled program to search with! |
|
1410 if (program == null) |
|
1411 { |
|
1412 // This should be uncommon enough to be an error case rather |
|
1413 // than an exception (which would have to be handled everywhere) |
|
1414 internalError("No RE program to run!"); |
|
1415 } |
|
1416 |
|
1417 // Save string to search |
|
1418 this.search = search; |
|
1419 |
|
1420 // Can we optimize the search by looking for a prefix string? |
|
1421 if (program.prefix == null) |
|
1422 { |
|
1423 // Unprefixed matching must try for a match at each character |
|
1424 for ( ;! search.isEnd(i - 1); i++) |
|
1425 { |
|
1426 // Try a match at index i |
|
1427 if (matchAt(i)) |
|
1428 { |
|
1429 return true; |
|
1430 } |
|
1431 } |
|
1432 return false; |
|
1433 } |
|
1434 else |
|
1435 { |
|
1436 // Prefix-anchored matching is possible |
|
1437 boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0; |
|
1438 char[] prefix = program.prefix; |
|
1439 for ( ; !search.isEnd(i + prefix.length - 1); i++) |
|
1440 { |
|
1441 int j = i; |
|
1442 int k = 0; |
|
1443 |
|
1444 boolean match; |
|
1445 do { |
|
1446 // If there's a mismatch of any character in the prefix, give up |
|
1447 match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0); |
|
1448 } while (match && k < prefix.length); |
|
1449 |
|
1450 // See if the whole prefix string matched |
|
1451 if (k == prefix.length) |
|
1452 { |
|
1453 // We matched the full prefix at firstChar, so try it |
|
1454 if (matchAt(i)) |
|
1455 { |
|
1456 return true; |
|
1457 } |
|
1458 } |
|
1459 } |
|
1460 return false; |
|
1461 } |
|
1462 } |
|
1463 |
|
1464 /** |
|
1465 * Matches the current regular expression program against a String. |
|
1466 * |
|
1467 * @param search String to match against |
|
1468 * @return True if string matched |
|
1469 */ |
|
1470 public boolean match(String search) |
|
1471 { |
|
1472 return match(search, 0); |
|
1473 } |
|
1474 |
|
1475 /** |
|
1476 * Splits a string into an array of strings on regular expression boundaries. |
|
1477 * This function works the same way as the Perl function of the same name. |
|
1478 * Given a regular expression of "[ab]+" and a string to split of |
|
1479 * "xyzzyababbayyzabbbab123", the result would be the array of Strings |
|
1480 * "[xyzzy, yyz, 123]". |
|
1481 * |
|
1482 * <p>Please note that the first string in the resulting array may be an empty |
|
1483 * string. This happens when the very first character of input string is |
|
1484 * matched by the pattern. |
|
1485 * |
|
1486 * @param s String to split on this regular exression |
|
1487 * @return Array of strings |
|
1488 */ |
|
1489 public String[] split(String s) |
|
1490 { |
|
1491 // Create new vector |
|
1492 Vector v = new Vector(); |
|
1493 |
|
1494 // Start at position 0 and search the whole string |
|
1495 int pos = 0; |
|
1496 int len = s.length(); |
|
1497 |
|
1498 // Try a match at each position |
|
1499 while (pos < len && match(s, pos)) |
|
1500 { |
|
1501 // Get start of match |
|
1502 int start = getParenStart(0); |
|
1503 |
|
1504 // Get end of match |
|
1505 int newpos = getParenEnd(0); |
|
1506 |
|
1507 // Check if no progress was made |
|
1508 if (newpos == pos) |
|
1509 { |
|
1510 v.addElement(s.substring(pos, start + 1)); |
|
1511 newpos++; |
|
1512 } |
|
1513 else |
|
1514 { |
|
1515 v.addElement(s.substring(pos, start)); |
|
1516 } |
|
1517 |
|
1518 // Move to new position |
|
1519 pos = newpos; |
|
1520 } |
|
1521 |
|
1522 // Push remainder if it's not empty |
|
1523 String remainder = s.substring(pos); |
|
1524 if (remainder.length() != 0) |
|
1525 { |
|
1526 v.addElement(remainder); |
|
1527 } |
|
1528 |
|
1529 // Return vector as an array of strings |
|
1530 String[] ret = new String[v.size()]; |
|
1531 v.copyInto(ret); |
|
1532 return ret; |
|
1533 } |
|
1534 |
|
1535 /** |
|
1536 * Flag bit that indicates that subst should replace all occurrences of this |
|
1537 * regular expression. |
|
1538 */ |
|
1539 public static final int REPLACE_ALL = 0x0000; |
|
1540 |
|
1541 /** |
|
1542 * Flag bit that indicates that subst should only replace the first occurrence |
|
1543 * of this regular expression. |
|
1544 */ |
|
1545 public static final int REPLACE_FIRSTONLY = 0x0001; |
|
1546 |
|
1547 /** |
|
1548 * Flag bit that indicates that subst should replace backreferences |
|
1549 */ |
|
1550 public static final int REPLACE_BACKREFERENCES = 0x0002; |
|
1551 |
|
1552 /** |
|
1553 * Substitutes a string for this regular expression in another string. |
|
1554 * This method works like the Perl function of the same name. |
|
1555 * Given a regular expression of "a*b", a String to substituteIn of |
|
1556 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the |
|
1557 * resulting String returned by subst would be "-foo-garply-wacky-". |
|
1558 * |
|
1559 * @param substituteIn String to substitute within |
|
1560 * @param substitution String to substitute for all matches of this regular expression. |
|
1561 * @return The string substituteIn with zero or more occurrences of the current |
|
1562 * regular expression replaced with the substitution String (if this regular |
|
1563 * expression object doesn't match at any position, the original String is returned |
|
1564 * unchanged). |
|
1565 */ |
|
1566 public String subst(String substituteIn, String substitution) |
|
1567 { |
|
1568 return subst(substituteIn, substitution, REPLACE_ALL); |
|
1569 } |
|
1570 |
|
1571 /** |
|
1572 * Substitutes a string for this regular expression in another string. |
|
1573 * This method works like the Perl function of the same name. |
|
1574 * Given a regular expression of "a*b", a String to substituteIn of |
|
1575 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the |
|
1576 * resulting String returned by subst would be "-foo-garply-wacky-". |
|
1577 * <p> |
|
1578 * It is also possible to reference the contents of a parenthesized expression |
|
1579 * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+", |
|
1580 * a String to substituteIn of "visit us: http://www.apache.org!" and the |
|
1581 * substitution String "<a href=\"$0\">$0</a>", the resulting String |
|
1582 * returned by subst would be |
|
1583 * "visit us: <a href=\"http://www.apache.org\">http://www.apache.org</a>!". |
|
1584 * <p> |
|
1585 * <i>Note:</i> $0 represents the whole match. |
|
1586 * |
|
1587 * @param substituteIn String to substitute within |
|
1588 * @param substitution String to substitute for matches of this regular expression |
|
1589 * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY |
|
1590 * flag bit is set, only the first occurrence of this regular expression is replaced. |
|
1591 * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be |
|
1592 * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will |
|
1593 * be processed. |
|
1594 * @return The string substituteIn with zero or more occurrences of the current |
|
1595 * regular expression replaced with the substitution String (if this regular |
|
1596 * expression object doesn't match at any position, the original String is returned |
|
1597 * unchanged). |
|
1598 */ |
|
1599 public String subst(String substituteIn, String substitution, int flags) |
|
1600 { |
|
1601 // String to return |
|
1602 StringBuffer ret = new StringBuffer(); |
|
1603 |
|
1604 // Start at position 0 and search the whole string |
|
1605 int pos = 0; |
|
1606 int len = substituteIn.length(); |
|
1607 |
|
1608 // Try a match at each position |
|
1609 while (pos < len && match(substituteIn, pos)) |
|
1610 { |
|
1611 // Append string before match |
|
1612 ret.append(substituteIn.substring(pos, getParenStart(0))); |
|
1613 |
|
1614 if ((flags & REPLACE_BACKREFERENCES) != 0) |
|
1615 { |
|
1616 // Process backreferences |
|
1617 int lCurrentPosition = 0; |
|
1618 int lLastPosition = -2; |
|
1619 int lLength = substitution.length(); |
|
1620 boolean bAddedPrefix = false; |
|
1621 |
|
1622 while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0) |
|
1623 { |
|
1624 if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\') |
|
1625 && lCurrentPosition+1 < lLength) |
|
1626 { |
|
1627 char c = substitution.charAt(lCurrentPosition + 1); |
|
1628 if (c >= '0' && c <= '9') |
|
1629 { |
|
1630 if (bAddedPrefix == false) |
|
1631 { |
|
1632 // Append everything between the beginning of the |
|
1633 // substitution string and the current $ sign |
|
1634 ret.append(substitution.substring(0, lCurrentPosition)); |
|
1635 bAddedPrefix = true; |
|
1636 } |
|
1637 else |
|
1638 { |
|
1639 // Append everything between the last and the current $ sign |
|
1640 ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition)); |
|
1641 } |
|
1642 |
|
1643 // Append the parenthesized expression |
|
1644 // Note: if a parenthesized expression of the requested |
|
1645 // index is not available "null" is added to the string |
|
1646 ret.append(getParen(c - '0')); |
|
1647 lLastPosition = lCurrentPosition; |
|
1648 } |
|
1649 } |
|
1650 |
|
1651 // Move forward, skipping past match |
|
1652 lCurrentPosition++; |
|
1653 } |
|
1654 |
|
1655 // Append everything after the last $ sign |
|
1656 ret.append(substitution.substring(lLastPosition + 2, lLength)); |
|
1657 } |
|
1658 else |
|
1659 { |
|
1660 // Append substitution without processing backreferences |
|
1661 ret.append(substitution); |
|
1662 } |
|
1663 |
|
1664 // Move forward, skipping past match |
|
1665 int newpos = getParenEnd(0); |
|
1666 |
|
1667 // We always want to make progress! |
|
1668 if (newpos == pos) |
|
1669 { |
|
1670 newpos++; |
|
1671 } |
|
1672 |
|
1673 // Try new position |
|
1674 pos = newpos; |
|
1675 |
|
1676 // Break out if we're only supposed to replace one occurrence |
|
1677 if ((flags & REPLACE_FIRSTONLY) != 0) |
|
1678 { |
|
1679 break; |
|
1680 } |
|
1681 } |
|
1682 |
|
1683 // If there's remaining input, append it |
|
1684 if (pos < len) |
|
1685 { |
|
1686 ret.append(substituteIn.substring(pos)); |
|
1687 } |
|
1688 |
|
1689 // Return string buffer as string |
|
1690 return ret.toString(); |
|
1691 } |
|
1692 |
|
1693 /** |
|
1694 * Returns an array of Strings, whose toString representation matches a regular |
|
1695 * expression. This method works like the Perl function of the same name. Given |
|
1696 * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz, |
|
1697 * aaaab], the array of Strings returned by grep would be [aab, aaaab]. |
|
1698 * |
|
1699 * @param search Array of Objects to search |
|
1700 * @return Array of Strings whose toString() value matches this regular expression. |
|
1701 */ |
|
1702 public String[] grep(Object[] search) |
|
1703 { |
|
1704 // Create new vector to hold return items |
|
1705 Vector v = new Vector(); |
|
1706 |
|
1707 // Traverse array of objects |
|
1708 for (int i = 0; i < search.length; i++) |
|
1709 { |
|
1710 // Get next object as a string |
|
1711 String s = search[i].toString(); |
|
1712 |
|
1713 // If it matches this regexp, add it to the list |
|
1714 if (match(s)) |
|
1715 { |
|
1716 v.addElement(s); |
|
1717 } |
|
1718 } |
|
1719 |
|
1720 // Return vector as an array of strings |
|
1721 String[] ret = new String[v.size()]; |
|
1722 v.copyInto(ret); |
|
1723 return ret; |
|
1724 } |
|
1725 |
|
1726 /** |
|
1727 * @return true if character at i-th position in the <code>search</code> string is a newline |
|
1728 */ |
|
1729 private boolean isNewline(int i) |
|
1730 { |
|
1731 char nextChar = search.charAt(i); |
|
1732 |
|
1733 if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085' |
|
1734 || nextChar == '\u2028' || nextChar == '\u2029') |
|
1735 { |
|
1736 return true; |
|
1737 } |
|
1738 |
|
1739 return false; |
|
1740 } |
|
1741 |
|
1742 /** |
|
1743 * Compares two characters. |
|
1744 * |
|
1745 * @param c1 first character to compare. |
|
1746 * @param c2 second character to compare. |
|
1747 * @param caseIndependent whether comparision is case insensitive or not. |
|
1748 * @return negative, 0, or positive integer as the first character |
|
1749 * less than, equal to, or greater then the second. |
|
1750 */ |
|
1751 private int compareChars(char c1, char c2, boolean caseIndependent) |
|
1752 { |
|
1753 if (caseIndependent) |
|
1754 { |
|
1755 c1 = Character.toLowerCase(c1); |
|
1756 c2 = Character.toLowerCase(c2); |
|
1757 } |
|
1758 return ((int)c1 - (int)c2); |
|
1759 } |
|
1760 } |
|