src/com/sun/org/apache/regexp/internal/RE.java

changeset 2116
aaee9ae4799a
parent 759
7ea027fae4d8
equal deleted inserted replaced
2090:3b8ebb957957 2116:aaee9ae4799a
1 /*
2 * reserved comment block
3 * DO NOT REMOVE OR ALTER!
4 */
5 /*
6 * Copyright 1999-2004 The Apache Software Foundation.
7 *
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21 package com.sun.org.apache.regexp.internal;
22
23 import java.io.Serializable;
24 import java.util.Vector;
25
26 /**
27 * RE is an efficient, lightweight regular expression evaluator/matcher
28 * class. Regular expressions are pattern descriptions which enable
29 * sophisticated matching of strings. In addition to being able to
30 * match a string against a pattern, you can also extract parts of the
31 * match. This is especially useful in text parsing! Details on the
32 * syntax of regular expression patterns are given below.
33 *
34 * <p>
35 * To compile a regular expression (RE), you can simply construct an RE
36 * matcher object from the string specification of the pattern, like this:
37 *
38 * <pre>
39 * RE r = new RE("a*b");
40 * </pre>
41 *
42 * <p>
43 * Once you have done this, you can call either of the RE.match methods to
44 * perform matching on a String. For example:
45 *
46 * <pre>
47 * boolean matched = r.match("aaaab");
48 * </pre>
49 *
50 * will cause the boolean matched to be set to true because the
51 * pattern "a*b" matches the string "aaaab".
52 *
53 * <p>
54 * If you were interested in the <i>number</i> of a's which matched the
55 * first part of our example expression, you could change the expression to
56 * "(a*)b". Then when you compiled the expression and matched it against
57 * something like "xaaaab", you would get results like this:
58 *
59 * <pre>
60 * RE r = new RE("(a*)b"); // Compile expression
61 * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
62 *
63 * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
64 * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
65 *
66 * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
67 * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6
68 * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5
69 *
70 * int startInside = r.getParenStart(1); // startInside will be index 1
71 * int endInside = r.getParenEnd(1); // endInside will be index 5
72 * int lenInside = r.getParenLength(1); // lenInside will be 4
73 * </pre>
74 *
75 * You can also refer to the contents of a parenthesized expression
76 * within a regular expression itself. This is called a
77 * 'backreference'. The first backreference in a regular expression is
78 * denoted by \1, the second by \2 and so on. So the expression:
79 *
80 * <pre>
81 * ([0-9]+)=\1
82 * </pre>
83 *
84 * will match any string of the form n=n (like 0=0 or 2=2).
85 *
86 * <p>
87 * The full regular expression syntax accepted by RE is described here:
88 *
89 * <pre>
90 *
91 * <b><font face=times roman>Characters</font></b>
92 *
93 * <i>unicodeChar</i> Matches any identical unicode character
94 * \ Used to quote a meta-character (like '*')
95 * \\ Matches a single '\' character
96 * \0nnn Matches a given octal character
97 * \xhh Matches a given 8-bit hexadecimal character
98 * \\uhhhh Matches a given 16-bit hexadecimal character
99 * \t Matches an ASCII tab character
100 * \n Matches an ASCII newline character
101 * \r Matches an ASCII return character
102 * \f Matches an ASCII form feed character
103 *
104 *
105 * <b><font face=times roman>Character Classes</font></b>
106 *
107 * [abc] Simple character class
108 * [a-zA-Z] Character class with ranges
109 * [^abc] Negated character class
110 * </pre>
111 *
112 * <b>NOTE:</b> Incomplete ranges will be interpreted as &quot;starts
113 * from zero&quot; or &quot;ends with last character&quot;.
114 * <br>
115 * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
116 * [-] means &quot;all characters&quot;.
117 *
118 * <pre>
119 *
120 * <b><font face=times roman>Standard POSIX Character Classes</font></b>
121 *
122 * [:alnum:] Alphanumeric characters.
123 * [:alpha:] Alphabetic characters.
124 * [:blank:] Space and tab characters.
125 * [:cntrl:] Control characters.
126 * [:digit:] Numeric characters.
127 * [:graph:] Characters that are printable and are also visible.
128 * (A space is printable, but not visible, while an
129 * `a' is both.)
130 * [:lower:] Lower-case alphabetic characters.
131 * [:print:] Printable characters (characters that are not
132 * control characters.)
133 * [:punct:] Punctuation characters (characters that are not letter,
134 * digits, control characters, or space characters).
135 * [:space:] Space characters (such as space, tab, and formfeed,
136 * to name a few).
137 * [:upper:] Upper-case alphabetic characters.
138 * [:xdigit:] Characters that are hexadecimal digits.
139 *
140 *
141 * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
142 *
143 * [:javastart:] Start of a Java identifier
144 * [:javapart:] Part of a Java identifier
145 *
146 *
147 * <b><font face=times roman>Predefined Classes</font></b>
148 *
149 * . Matches any character other than newline
150 * \w Matches a "word" character (alphanumeric plus "_")
151 * \W Matches a non-word character
152 * \s Matches a whitespace character
153 * \S Matches a non-whitespace character
154 * \d Matches a digit character
155 * \D Matches a non-digit character
156 *
157 *
158 * <b><font face=times roman>Boundary Matchers</font></b>
159 *
160 * ^ Matches only at the beginning of a line
161 * $ Matches only at the end of a line
162 * \b Matches only at a word boundary
163 * \B Matches only at a non-word boundary
164 *
165 *
166 * <b><font face=times roman>Greedy Closures</font></b>
167 *
168 * A* Matches A 0 or more times (greedy)
169 * A+ Matches A 1 or more times (greedy)
170 * A? Matches A 1 or 0 times (greedy)
171 * A{n} Matches A exactly n times (greedy)
172 * A{n,} Matches A at least n times (greedy)
173 * A{n,m} Matches A at least n but not more than m times (greedy)
174 *
175 *
176 * <b><font face=times roman>Reluctant Closures</font></b>
177 *
178 * A*? Matches A 0 or more times (reluctant)
179 * A+? Matches A 1 or more times (reluctant)
180 * A?? Matches A 0 or 1 times (reluctant)
181 *
182 *
183 * <b><font face=times roman>Logical Operators</font></b>
184 *
185 * AB Matches A followed by B
186 * A|B Matches either A or B
187 * (A) Used for subexpression grouping
188 * (?:A) Used for subexpression clustering (just like grouping but
189 * no backrefs)
190 *
191 *
192 * <b><font face=times roman>Backreferences</font></b>
193 *
194 * \1 Backreference to 1st parenthesized subexpression
195 * \2 Backreference to 2nd parenthesized subexpression
196 * \3 Backreference to 3rd parenthesized subexpression
197 * \4 Backreference to 4th parenthesized subexpression
198 * \5 Backreference to 5th parenthesized subexpression
199 * \6 Backreference to 6th parenthesized subexpression
200 * \7 Backreference to 7th parenthesized subexpression
201 * \8 Backreference to 8th parenthesized subexpression
202 * \9 Backreference to 9th parenthesized subexpression
203 * </pre>
204 *
205 * <p>
206 * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
207 * that they match as many elements of the string as possible without
208 * causing the overall match to fail. If you want a closure to be
209 * reluctant (non-greedy), you can simply follow it with a '?'. A
210 * reluctant closure will match as few elements of the string as
211 * possible when finding matches. {m,n} closures don't currently
212 * support reluctancy.
213 *
214 * <p>
215 * <b><font face="times roman">Line terminators</font></b>
216 * <br>
217 * A line terminator is a one- or two-character sequence that marks
218 * the end of a line of the input character sequence. The following
219 * are recognized as line terminators:
220 * <ul>
221 * <li>A newline (line feed) character ('\n'),</li>
222 * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
223 * <li>A standalone carriage-return character ('\r'),</li>
224 * <li>A next-line character ('\u0085'),</li>
225 * <li>A line-separator character ('\u2028'), or</li>
226 * <li>A paragraph-separator character ('\u2029).</li>
227 * </ul>
228 *
229 * <p>
230 * RE runs programs compiled by the RECompiler class. But the RE
231 * matcher class does not include the actual regular expression compiler
232 * for reasons of efficiency. In fact, if you want to pre-compile one
233 * or more regular expressions, the 'recompile' class can be invoked
234 * from the command line to produce compiled output like this:
235 *
236 * <pre>
237 * // Pre-compiled regular expression "a*b"
238 * char[] re1Instructions =
239 * {
240 * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
241 * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
242 * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
243 * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
244 * 0x0000,
245 * };
246 *
247 *
248 * REProgram re1 = new REProgram(re1Instructions);
249 * </pre>
250 *
251 * You can then construct a regular expression matcher (RE) object from
252 * the pre-compiled expression re1 and thus avoid the overhead of
253 * compiling the expression at runtime. If you require more dynamic
254 * regular expressions, you can construct a single RECompiler object and
255 * re-use it to compile each expression. Similarly, you can change the
256 * program run by a given matcher object at any time. However, RE and
257 * RECompiler are not threadsafe (for efficiency reasons, and because
258 * requiring thread safety in this class is deemed to be a rare
259 * requirement), so you will need to construct a separate compiler or
260 * matcher object for each thread (unless you do thread synchronization
261 * yourself). Once expression compiled into the REProgram object, REProgram
262 * can be safely shared across multiple threads and RE objects.
263 *
264 * <br><p><br>
265 *
266 * <font color="red">
267 * <i>ISSUES:</i>
268 *
269 * <ul>
270 * <li>com.weusours.util.re is not currently compatible with all
271 * standard POSIX regcomp flags</li>
272 * <li>com.weusours.util.re does not support POSIX equivalence classes
273 * ([=foo=] syntax) (I18N/locale issue)</li>
274 * <li>com.weusours.util.re does not support nested POSIX character
275 * classes (definitely should, but not completely trivial)</li>
276 * <li>com.weusours.util.re Does not support POSIX character collation
277 * concepts ([.foo.] syntax) (I18N/locale issue)</li>
278 * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
279 * <li>Should RE support character iterators (for backwards RE matching!)?</li>
280 * <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
281 * <li>Not *all* possibilities are considered for greediness when backreferences
282 * are involved (as POSIX suggests should be the case). The POSIX RE
283 * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
284 * of acdacaa where \1 is "a". This is not the case in this RE package,
285 * and actually Perl doesn't go to this extent either! Until someone
286 * actually complains about this, I'm not sure it's worth "fixing".
287 * If it ever is fixed, test #137 in RETest.txt should be updated.</li>
288 * </ul>
289 *
290 * </font>
291 *
292 * @see recompile
293 * @see RECompiler
294 *
295 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
296 * @author <a href="mailto:ts@sch-fer.de">Tobias Sch&auml;fer</a>
297 */
298 public class RE implements Serializable
299 {
300 /**
301 * Specifies normal, case-sensitive matching behaviour.
302 */
303 public static final int MATCH_NORMAL = 0x0000;
304
305 /**
306 * Flag to indicate that matching should be case-independent (folded)
307 */
308 public static final int MATCH_CASEINDEPENDENT = 0x0001;
309
310 /**
311 * Newlines should match as BOL/EOL (^ and $)
312 */
313 public static final int MATCH_MULTILINE = 0x0002;
314
315 /**
316 * Consider all input a single body of text - newlines are matched by .
317 */
318 public static final int MATCH_SINGLELINE = 0x0004;
319
320 /************************************************
321 * *
322 * The format of a node in a program is: *
323 * *
324 * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
325 * *
326 * char OPCODE - instruction *
327 * char OPDATA - modifying data *
328 * char OPNEXT - next node (relative offset) *
329 * *
330 ************************************************/
331
332 // Opcode Char Opdata/Operand Meaning
333 // ---------- ---------- --------------- --------------------------------------------------
334 static final char OP_END = 'E'; // end of program
335 static final char OP_BOL = '^'; // match only if at beginning of line
336 static final char OP_EOL = '$'; // match only if at end of line
337 static final char OP_ANY = '.'; // match any single character except newline
338 static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
339 static final char OP_BRANCH = '|'; // node match this alternative or the next one
340 static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
341 static final char OP_STAR = '*'; // node kleene closure
342 static final char OP_PLUS = '+'; // node positive closure
343 static final char OP_MAYBE = '?'; // node optional closure
344 static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
345 static final char OP_OPEN = '('; // number nth opening paren
346 static final char OP_OPEN_CLUSTER = '<'; // opening cluster
347 static final char OP_CLOSE = ')'; // number nth closing paren
348 static final char OP_CLOSE_CLUSTER = '>'; // closing cluster
349 static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
350 static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
351 static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
352 static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
353 static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
354 static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
355 static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
356
357 // Escape codes
358 static final char E_ALNUM = 'w'; // Alphanumeric
359 static final char E_NALNUM = 'W'; // Non-alphanumeric
360 static final char E_BOUND = 'b'; // Word boundary
361 static final char E_NBOUND = 'B'; // Non-word boundary
362 static final char E_SPACE = 's'; // Whitespace
363 static final char E_NSPACE = 'S'; // Non-whitespace
364 static final char E_DIGIT = 'd'; // Digit
365 static final char E_NDIGIT = 'D'; // Non-digit
366
367 // Posix character classes
368 static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
369 static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
370 static final char POSIX_CLASS_BLANK = 'b'; // Blanks
371 static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
372 static final char POSIX_CLASS_DIGIT = 'd'; // Digits
373 static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
374 static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
375 static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
376 static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
377 static final char POSIX_CLASS_SPACE = 's'; // Spaces
378 static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
379 static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
380 static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
381 static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
382
383 // Limits
384 static final int maxNode = 65536; // Maximum number of nodes in a program
385 static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs)
386
387 // Node layout constants
388 static final int offsetOpcode = 0; // Opcode offset (first character)
389 static final int offsetOpdata = 1; // Opdata offset (second char)
390 static final int offsetNext = 2; // Next index offset (third char)
391 static final int nodeSize = 3; // Node size (in chars)
392
393 // State of current program
394 REProgram program; // Compiled regular expression 'program'
395 transient CharacterIterator search; // The string being matched against
396 int matchFlags; // Match behaviour flags
397 int maxParen = MAX_PAREN;
398
399 // Parenthesized subexpressions
400 transient int parenCount; // Number of subexpressions matched (num open parens + 1)
401 transient int start0; // Cache of start[0]
402 transient int end0; // Cache of start[0]
403 transient int start1; // Cache of start[1]
404 transient int end1; // Cache of start[1]
405 transient int start2; // Cache of start[2]
406 transient int end2; // Cache of start[2]
407 transient int[] startn; // Lazy-alloced array of sub-expression starts
408 transient int[] endn; // Lazy-alloced array of sub-expression ends
409
410 // Backreferences
411 transient int[] startBackref; // Lazy-alloced array of backref starts
412 transient int[] endBackref; // Lazy-alloced array of backref ends
413
414 /**
415 * Constructs a regular expression matcher from a String by compiling it
416 * using a new instance of RECompiler. If you will be compiling many
417 * expressions, you may prefer to use a single RECompiler object instead.
418 *
419 * @param pattern The regular expression pattern to compile.
420 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
421 * @see RECompiler
422 * @see recompile
423 */
424 public RE(String pattern) throws RESyntaxException
425 {
426 this(pattern, MATCH_NORMAL);
427 }
428
429 /**
430 * Constructs a regular expression matcher from a String by compiling it
431 * using a new instance of RECompiler. If you will be compiling many
432 * expressions, you may prefer to use a single RECompiler object instead.
433 *
434 * @param pattern The regular expression pattern to compile.
435 * @param matchFlags The matching style
436 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
437 * @see RECompiler
438 * @see recompile
439 */
440 public RE(String pattern, int matchFlags) throws RESyntaxException
441 {
442 this(new RECompiler().compile(pattern));
443 setMatchFlags(matchFlags);
444 }
445
446 /**
447 * Construct a matcher for a pre-compiled regular expression from program
448 * (bytecode) data. Permits special flags to be passed in to modify matching
449 * behaviour.
450 *
451 * @param program Compiled regular expression program (see RECompiler and/or recompile)
452 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
453 *
454 * <pre>
455 * MATCH_NORMAL // Normal (case-sensitive) matching
456 * MATCH_CASEINDEPENDENT // Case folded comparisons
457 * MATCH_MULTILINE // Newline matches as BOL/EOL
458 * </pre>
459 *
460 * @see RECompiler
461 * @see REProgram
462 * @see recompile
463 */
464 public RE(REProgram program, int matchFlags)
465 {
466 setProgram(program);
467 setMatchFlags(matchFlags);
468 }
469
470 /**
471 * Construct a matcher for a pre-compiled regular expression from program
472 * (bytecode) data.
473 *
474 * @param program Compiled regular expression program
475 * @see RECompiler
476 * @see recompile
477 */
478 public RE(REProgram program)
479 {
480 this(program, MATCH_NORMAL);
481 }
482
483 /**
484 * Constructs a regular expression matcher with no initial program.
485 * This is likely to be an uncommon practice, but is still supported.
486 */
487 public RE()
488 {
489 this((REProgram)null, MATCH_NORMAL);
490 }
491
492 /**
493 * Converts a 'simplified' regular expression to a full regular expression
494 *
495 * @param pattern The pattern to convert
496 * @return The full regular expression
497 */
498 public static String simplePatternToFullRegularExpression(String pattern)
499 {
500 StringBuffer buf = new StringBuffer();
501 for (int i = 0; i < pattern.length(); i++)
502 {
503 char c = pattern.charAt(i);
504 switch (c)
505 {
506 case '*':
507 buf.append(".*");
508 break;
509
510 case '.':
511 case '[':
512 case ']':
513 case '\\':
514 case '+':
515 case '?':
516 case '{':
517 case '}':
518 case '$':
519 case '^':
520 case '|':
521 case '(':
522 case ')':
523 buf.append('\\');
524 default:
525 buf.append(c);
526 break;
527 }
528 }
529 return buf.toString();
530 }
531
532 /**
533 * Sets match behaviour flags which alter the way RE does matching.
534 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
535 *
536 * <pre>
537 * MATCH_NORMAL // Normal (case-sensitive) matching
538 * MATCH_CASEINDEPENDENT // Case folded comparisons
539 * MATCH_MULTILINE // Newline matches as BOL/EOL
540 * </pre>
541 */
542 public void setMatchFlags(int matchFlags)
543 {
544 this.matchFlags = matchFlags;
545 }
546
547 /**
548 * Returns the current match behaviour flags.
549 * @return Current match behaviour flags (RE.MATCH_*).
550 *
551 * <pre>
552 * MATCH_NORMAL // Normal (case-sensitive) matching
553 * MATCH_CASEINDEPENDENT // Case folded comparisons
554 * MATCH_MULTILINE // Newline matches as BOL/EOL
555 * </pre>
556 *
557 * @see #setMatchFlags
558 */
559 public int getMatchFlags()
560 {
561 return matchFlags;
562 }
563
564 /**
565 * Sets the current regular expression program used by this matcher object.
566 *
567 * @param program Regular expression program compiled by RECompiler.
568 * @see RECompiler
569 * @see REProgram
570 * @see recompile
571 */
572 public void setProgram(REProgram program)
573 {
574 this.program = program;
575 if (program != null && program.maxParens != -1) {
576 this.maxParen = program.maxParens;
577 } else {
578 this.maxParen = MAX_PAREN;
579 }
580 }
581
582 /**
583 * Returns the current regular expression program in use by this matcher object.
584 *
585 * @return Regular expression program
586 * @see #setProgram
587 */
588 public REProgram getProgram()
589 {
590 return program;
591 }
592
593 /**
594 * Returns the number of parenthesized subexpressions available after a successful match.
595 *
596 * @return Number of available parenthesized subexpressions
597 */
598 public int getParenCount()
599 {
600 return parenCount;
601 }
602
603 /**
604 * Gets the contents of a parenthesized subexpression after a successful match.
605 *
606 * @param which Nesting level of subexpression
607 * @return String
608 */
609 public String getParen(int which)
610 {
611 int start;
612 if (which < parenCount && (start = getParenStart(which)) >= 0)
613 {
614 return search.substring(start, getParenEnd(which));
615 }
616 return null;
617 }
618
619 /**
620 * Returns the start index of a given paren level.
621 *
622 * @param which Nesting level of subexpression
623 * @return String index
624 */
625 public final int getParenStart(int which)
626 {
627 if (which < parenCount)
628 {
629 switch (which)
630 {
631 case 0:
632 return start0;
633
634 case 1:
635 return start1;
636
637 case 2:
638 return start2;
639
640 default:
641 if (startn == null)
642 {
643 allocParens();
644 }
645 return startn[which];
646 }
647 }
648 return -1;
649 }
650
651 /**
652 * Returns the end index of a given paren level.
653 *
654 * @param which Nesting level of subexpression
655 * @return String index
656 */
657 public final int getParenEnd(int which)
658 {
659 if (which < parenCount)
660 {
661 switch (which)
662 {
663 case 0:
664 return end0;
665
666 case 1:
667 return end1;
668
669 case 2:
670 return end2;
671
672 default:
673 if (endn == null)
674 {
675 allocParens();
676 }
677 return endn[which];
678 }
679 }
680 return -1;
681 }
682
683 /**
684 * Returns the length of a given paren level.
685 *
686 * @param which Nesting level of subexpression
687 * @return Number of characters in the parenthesized subexpression
688 */
689 public final int getParenLength(int which)
690 {
691 if (which < parenCount)
692 {
693 return getParenEnd(which) - getParenStart(which);
694 }
695 return -1;
696 }
697
698 /**
699 * Sets the start of a paren level
700 *
701 * @param which Which paren level
702 * @param i Index in input array
703 */
704 protected final void setParenStart(int which, int i)
705 {
706 if (which < parenCount)
707 {
708 switch (which)
709 {
710 case 0:
711 start0 = i;
712 break;
713
714 case 1:
715 start1 = i;
716 break;
717
718 case 2:
719 start2 = i;
720 break;
721
722 default:
723 if (startn == null)
724 {
725 allocParens();
726 }
727 startn[which] = i;
728 break;
729 }
730 }
731 }
732
733 /**
734 * Sets the end of a paren level
735 *
736 * @param which Which paren level
737 * @param i Index in input array
738 */
739 protected final void setParenEnd(int which, int i)
740 {
741 if (which < parenCount)
742 {
743 switch (which)
744 {
745 case 0:
746 end0 = i;
747 break;
748
749 case 1:
750 end1 = i;
751 break;
752
753 case 2:
754 end2 = i;
755 break;
756
757 default:
758 if (endn == null)
759 {
760 allocParens();
761 }
762 endn[which] = i;
763 break;
764 }
765 }
766 }
767
768 /**
769 * Throws an Error representing an internal error condition probably resulting
770 * from a bug in the regular expression compiler (or possibly data corruption).
771 * In practice, this should be very rare.
772 *
773 * @param s Error description
774 */
775 protected void internalError(String s) throws Error
776 {
777 throw new Error("RE internal error: " + s);
778 }
779
780 /**
781 * Performs lazy allocation of subexpression arrays
782 */
783 private final void allocParens()
784 {
785 // Allocate arrays for subexpressions
786 startn = new int[maxParen];
787 endn = new int[maxParen];
788
789 // Set sub-expression pointers to invalid values
790 for (int i = 0; i < maxParen; i++)
791 {
792 startn[i] = -1;
793 endn[i] = -1;
794 }
795 }
796
797 /**
798 * Try to match a string against a subset of nodes in the program
799 *
800 * @param firstNode Node to start at in program
801 * @param lastNode Last valid node (used for matching a subexpression without
802 * matching the rest of the program as well).
803 * @param idxStart Starting position in character array
804 * @return Final input array index if match succeeded. -1 if not.
805 */
806 protected int matchNodes(int firstNode, int lastNode, int idxStart)
807 {
808 // Our current place in the string
809 int idx = idxStart;
810
811 // Loop while node is valid
812 int next, opcode, opdata;
813 int idxNew;
814 char[] instruction = program.instruction;
815 for (int node = firstNode; node < lastNode; )
816 {
817 opcode = instruction[node + offsetOpcode];
818 next = node + (short)instruction[node + offsetNext];
819 opdata = instruction[node + offsetOpdata];
820
821 switch (opcode)
822 {
823 case OP_RELUCTANTMAYBE:
824 {
825 int once = 0;
826 do
827 {
828 // Try to match the rest without using the reluctant subexpr
829 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
830 {
831 return idxNew;
832 }
833 }
834 while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
835 return -1;
836 }
837
838 case OP_RELUCTANTPLUS:
839 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
840 {
841 // Try to match the rest without using the reluctant subexpr
842 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
843 {
844 return idxNew;
845 }
846 }
847 return -1;
848
849 case OP_RELUCTANTSTAR:
850 do
851 {
852 // Try to match the rest without using the reluctant subexpr
853 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
854 {
855 return idxNew;
856 }
857 }
858 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
859 return -1;
860
861 case OP_OPEN:
862
863 // Match subexpression
864 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
865 {
866 startBackref[opdata] = idx;
867 }
868 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
869 {
870 // Increase valid paren count
871 if ((opdata + 1) > parenCount)
872 {
873 parenCount = opdata + 1;
874 }
875
876 // Don't set paren if already set later on
877 if (getParenStart(opdata) == -1)
878 {
879 setParenStart(opdata, idx);
880 }
881 }
882 return idxNew;
883
884 case OP_CLOSE:
885
886 // Done matching subexpression
887 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
888 {
889 endBackref[opdata] = idx;
890 }
891 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
892 {
893 // Increase valid paren count
894 if ((opdata + 1) > parenCount)
895 {
896 parenCount = opdata + 1;
897 }
898
899 // Don't set paren if already set later on
900 if (getParenEnd(opdata) == -1)
901 {
902 setParenEnd(opdata, idx);
903 }
904 }
905 return idxNew;
906
907 case OP_OPEN_CLUSTER:
908 case OP_CLOSE_CLUSTER:
909 // starting or ending the matching of a subexpression which has no backref.
910 return matchNodes( next, maxNode, idx );
911
912 case OP_BACKREF:
913 {
914 // Get the start and end of the backref
915 int s = startBackref[opdata];
916 int e = endBackref[opdata];
917
918 // We don't know the backref yet
919 if (s == -1 || e == -1)
920 {
921 return -1;
922 }
923
924 // The backref is empty size
925 if (s == e)
926 {
927 break;
928 }
929
930 // Get the length of the backref
931 int l = e - s;
932
933 // If there's not enough input left, give up.
934 if (search.isEnd(idx + l - 1))
935 {
936 return -1;
937 }
938
939 // Case fold the backref?
940 final boolean caseFold =
941 ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
942 // Compare backref to input
943 for (int i = 0; i < l; i++)
944 {
945 if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
946 {
947 return -1;
948 }
949 }
950 }
951 break;
952
953 case OP_BOL:
954
955 // Fail if we're not at the start of the string
956 if (idx != 0)
957 {
958 // If we're multiline matching, we could still be at the start of a line
959 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
960 {
961 // If not at start of line, give up
962 if (idx <= 0 || !isNewline(idx - 1)) {
963 return -1;
964 } else {
965 break;
966 }
967 }
968 return -1;
969 }
970 break;
971
972 case OP_EOL:
973
974 // If we're not at the end of string
975 if (!search.isEnd(0) && !search.isEnd(idx))
976 {
977 // If we're multi-line matching
978 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
979 {
980 // Give up if we're not at the end of a line
981 if (!isNewline(idx)) {
982 return -1;
983 } else {
984 break;
985 }
986 }
987 return -1;
988 }
989 break;
990
991 case OP_ESCAPE:
992
993 // Which escape?
994 switch (opdata)
995 {
996 // Word boundary match
997 case E_NBOUND:
998 case E_BOUND:
999 {
1000 char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
1001 char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
1002 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
1003 {
1004 return -1;
1005 }
1006 }
1007 break;
1008
1009 // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1010 case E_ALNUM:
1011 case E_NALNUM:
1012 case E_DIGIT:
1013 case E_NDIGIT:
1014 case E_SPACE:
1015 case E_NSPACE:
1016
1017 // Give up if out of input
1018 if (search.isEnd(idx))
1019 {
1020 return -1;
1021 }
1022
1023 char c = search.charAt(idx);
1024
1025 // Switch on escape
1026 switch (opdata)
1027 {
1028 case E_ALNUM:
1029 case E_NALNUM:
1030 if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
1031 {
1032 return -1;
1033 }
1034 break;
1035
1036 case E_DIGIT:
1037 case E_NDIGIT:
1038 if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
1039 {
1040 return -1;
1041 }
1042 break;
1043
1044 case E_SPACE:
1045 case E_NSPACE:
1046 if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
1047 {
1048 return -1;
1049 }
1050 break;
1051 }
1052 idx++;
1053 break;
1054
1055 default:
1056 internalError("Unrecognized escape '" + opdata + "'");
1057 }
1058 break;
1059
1060 case OP_ANY:
1061
1062 if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1063 // Match anything
1064 if (search.isEnd(idx))
1065 {
1066 return -1;
1067 }
1068 }
1069 else
1070 {
1071 // Match anything but a newline
1072 if (search.isEnd(idx) || isNewline(idx))
1073 {
1074 return -1;
1075 }
1076 }
1077 idx++;
1078 break;
1079
1080 case OP_ATOM:
1081 {
1082 // Match an atom value
1083 if (search.isEnd(idx))
1084 {
1085 return -1;
1086 }
1087
1088 // Get length of atom and starting index
1089 int lenAtom = opdata;
1090 int startAtom = node + nodeSize;
1091
1092 // Give up if not enough input remains to have a match
1093 if (search.isEnd(lenAtom + idx - 1))
1094 {
1095 return -1;
1096 }
1097
1098 // Match atom differently depending on casefolding flag
1099 final boolean caseFold =
1100 ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
1101
1102 for (int i = 0; i < lenAtom; i++)
1103 {
1104 if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
1105 {
1106 return -1;
1107 }
1108 }
1109 }
1110 break;
1111
1112 case OP_POSIXCLASS:
1113 {
1114 // Out of input?
1115 if (search.isEnd(idx))
1116 {
1117 return -1;
1118 }
1119
1120 switch (opdata)
1121 {
1122 case POSIX_CLASS_ALNUM:
1123 if (!Character.isLetterOrDigit(search.charAt(idx)))
1124 {
1125 return -1;
1126 }
1127 break;
1128
1129 case POSIX_CLASS_ALPHA:
1130 if (!Character.isLetter(search.charAt(idx)))
1131 {
1132 return -1;
1133 }
1134 break;
1135
1136 case POSIX_CLASS_DIGIT:
1137 if (!Character.isDigit(search.charAt(idx)))
1138 {
1139 return -1;
1140 }
1141 break;
1142
1143 case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1144 if (!Character.isSpaceChar(search.charAt(idx)))
1145 {
1146 return -1;
1147 }
1148 break;
1149
1150 case POSIX_CLASS_SPACE:
1151 if (!Character.isWhitespace(search.charAt(idx)))
1152 {
1153 return -1;
1154 }
1155 break;
1156
1157 case POSIX_CLASS_CNTRL:
1158 if (Character.getType(search.charAt(idx)) != Character.CONTROL)
1159 {
1160 return -1;
1161 }
1162 break;
1163
1164 case POSIX_CLASS_GRAPH: // JWL - bugbug???
1165 switch (Character.getType(search.charAt(idx)))
1166 {
1167 case Character.MATH_SYMBOL:
1168 case Character.CURRENCY_SYMBOL:
1169 case Character.MODIFIER_SYMBOL:
1170 case Character.OTHER_SYMBOL:
1171 break;
1172
1173 default:
1174 return -1;
1175 }
1176 break;
1177
1178 case POSIX_CLASS_LOWER:
1179 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
1180 {
1181 return -1;
1182 }
1183 break;
1184
1185 case POSIX_CLASS_UPPER:
1186 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
1187 {
1188 return -1;
1189 }
1190 break;
1191
1192 case POSIX_CLASS_PRINT:
1193 if (Character.getType(search.charAt(idx)) == Character.CONTROL)
1194 {
1195 return -1;
1196 }
1197 break;
1198
1199 case POSIX_CLASS_PUNCT:
1200 {
1201 int type = Character.getType(search.charAt(idx));
1202 switch(type)
1203 {
1204 case Character.DASH_PUNCTUATION:
1205 case Character.START_PUNCTUATION:
1206 case Character.END_PUNCTUATION:
1207 case Character.CONNECTOR_PUNCTUATION:
1208 case Character.OTHER_PUNCTUATION:
1209 break;
1210
1211 default:
1212 return -1;
1213 }
1214 }
1215 break;
1216
1217 case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1218 {
1219 boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
1220 (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
1221 (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1222 if (!isXDigit)
1223 {
1224 return -1;
1225 }
1226 }
1227 break;
1228
1229 case POSIX_CLASS_JSTART:
1230 if (!Character.isJavaIdentifierStart(search.charAt(idx)))
1231 {
1232 return -1;
1233 }
1234 break;
1235
1236 case POSIX_CLASS_JPART:
1237 if (!Character.isJavaIdentifierPart(search.charAt(idx)))
1238 {
1239 return -1;
1240 }
1241 break;
1242
1243 default:
1244 internalError("Bad posix class");
1245 break;
1246 }
1247
1248 // Matched.
1249 idx++;
1250 }
1251 break;
1252
1253 case OP_ANYOF:
1254 {
1255 // Out of input?
1256 if (search.isEnd(idx))
1257 {
1258 return -1;
1259 }
1260
1261 // Get character to match against character class and maybe casefold
1262 char c = search.charAt(idx);
1263 boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1264 // Loop through character class checking our match character
1265 int idxRange = node + nodeSize;
1266 int idxEnd = idxRange + (opdata * 2);
1267 boolean match = false;
1268 for (int i = idxRange; !match && i < idxEnd; )
1269 {
1270 // Get start, end and match characters
1271 char s = instruction[i++];
1272 char e = instruction[i++];
1273
1274 match = ((compareChars(c, s, caseFold) >= 0)
1275 && (compareChars(c, e, caseFold) <= 0));
1276 }
1277
1278 // Fail if we didn't match the character class
1279 if (!match)
1280 {
1281 return -1;
1282 }
1283 idx++;
1284 }
1285 break;
1286
1287 case OP_BRANCH:
1288 {
1289 // Check for choices
1290 if (instruction[next + offsetOpcode] != OP_BRANCH)
1291 {
1292 // If there aren't any other choices, just evaluate this branch.
1293 node += nodeSize;
1294 continue;
1295 }
1296
1297 // Try all available branches
1298 short nextBranch;
1299 do
1300 {
1301 // Try matching the branch against the string
1302 if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
1303 {
1304 return idxNew;
1305 }
1306
1307 // Go to next branch (if any)
1308 nextBranch = (short)instruction[node + offsetNext];
1309 node += nextBranch;
1310 }
1311 while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1312
1313 // Failed to match any branch!
1314 return -1;
1315 }
1316
1317 case OP_NOTHING:
1318 case OP_GOTO:
1319
1320 // Just advance to the next node without doing anything
1321 break;
1322
1323 case OP_END:
1324
1325 // Match has succeeded!
1326 setParenEnd(0, idx);
1327 return idx;
1328
1329 default:
1330
1331 // Corrupt program
1332 internalError("Invalid opcode '" + opcode + "'");
1333 }
1334
1335 // Advance to the next node in the program
1336 node = next;
1337 }
1338
1339 // We "should" never end up here
1340 internalError("Corrupt program");
1341 return -1;
1342 }
1343
1344 /**
1345 * Match the current regular expression program against the current
1346 * input string, starting at index i of the input string. This method
1347 * is only meant for internal use.
1348 *
1349 * @param i The input string index to start matching at
1350 * @return True if the input matched the expression
1351 */
1352 protected boolean matchAt(int i)
1353 {
1354 // Initialize start pointer, paren cache and paren count
1355 start0 = -1;
1356 end0 = -1;
1357 start1 = -1;
1358 end1 = -1;
1359 start2 = -1;
1360 end2 = -1;
1361 startn = null;
1362 endn = null;
1363 parenCount = 1;
1364 setParenStart(0, i);
1365
1366 // Allocate backref arrays (unless optimizations indicate otherwise)
1367 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
1368 {
1369 startBackref = new int[maxParen];
1370 endBackref = new int[maxParen];
1371 }
1372
1373 // Match against string
1374 int idx;
1375 if ((idx = matchNodes(0, maxNode, i)) != -1)
1376 {
1377 setParenEnd(0, idx);
1378 return true;
1379 }
1380
1381 // Didn't match
1382 parenCount = 0;
1383 return false;
1384 }
1385
1386 /**
1387 * Matches the current regular expression program against a character array,
1388 * starting at a given index.
1389 *
1390 * @param search String to match against
1391 * @param i Index to start searching at
1392 * @return True if string matched
1393 */
1394 public boolean match(String search, int i)
1395 {
1396 return match(new StringCharacterIterator(search), i);
1397 }
1398
1399 /**
1400 * Matches the current regular expression program against a character array,
1401 * starting at a given index.
1402 *
1403 * @param search String to match against
1404 * @param i Index to start searching at
1405 * @return True if string matched
1406 */
1407 public boolean match(CharacterIterator search, int i)
1408 {
1409 // There is no compiled program to search with!
1410 if (program == null)
1411 {
1412 // This should be uncommon enough to be an error case rather
1413 // than an exception (which would have to be handled everywhere)
1414 internalError("No RE program to run!");
1415 }
1416
1417 // Save string to search
1418 this.search = search;
1419
1420 // Can we optimize the search by looking for a prefix string?
1421 if (program.prefix == null)
1422 {
1423 // Unprefixed matching must try for a match at each character
1424 for ( ;! search.isEnd(i - 1); i++)
1425 {
1426 // Try a match at index i
1427 if (matchAt(i))
1428 {
1429 return true;
1430 }
1431 }
1432 return false;
1433 }
1434 else
1435 {
1436 // Prefix-anchored matching is possible
1437 boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1438 char[] prefix = program.prefix;
1439 for ( ; !search.isEnd(i + prefix.length - 1); i++)
1440 {
1441 int j = i;
1442 int k = 0;
1443
1444 boolean match;
1445 do {
1446 // If there's a mismatch of any character in the prefix, give up
1447 match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
1448 } while (match && k < prefix.length);
1449
1450 // See if the whole prefix string matched
1451 if (k == prefix.length)
1452 {
1453 // We matched the full prefix at firstChar, so try it
1454 if (matchAt(i))
1455 {
1456 return true;
1457 }
1458 }
1459 }
1460 return false;
1461 }
1462 }
1463
1464 /**
1465 * Matches the current regular expression program against a String.
1466 *
1467 * @param search String to match against
1468 * @return True if string matched
1469 */
1470 public boolean match(String search)
1471 {
1472 return match(search, 0);
1473 }
1474
1475 /**
1476 * Splits a string into an array of strings on regular expression boundaries.
1477 * This function works the same way as the Perl function of the same name.
1478 * Given a regular expression of "[ab]+" and a string to split of
1479 * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1480 * "[xyzzy, yyz, 123]".
1481 *
1482 * <p>Please note that the first string in the resulting array may be an empty
1483 * string. This happens when the very first character of input string is
1484 * matched by the pattern.
1485 *
1486 * @param s String to split on this regular exression
1487 * @return Array of strings
1488 */
1489 public String[] split(String s)
1490 {
1491 // Create new vector
1492 Vector v = new Vector();
1493
1494 // Start at position 0 and search the whole string
1495 int pos = 0;
1496 int len = s.length();
1497
1498 // Try a match at each position
1499 while (pos < len && match(s, pos))
1500 {
1501 // Get start of match
1502 int start = getParenStart(0);
1503
1504 // Get end of match
1505 int newpos = getParenEnd(0);
1506
1507 // Check if no progress was made
1508 if (newpos == pos)
1509 {
1510 v.addElement(s.substring(pos, start + 1));
1511 newpos++;
1512 }
1513 else
1514 {
1515 v.addElement(s.substring(pos, start));
1516 }
1517
1518 // Move to new position
1519 pos = newpos;
1520 }
1521
1522 // Push remainder if it's not empty
1523 String remainder = s.substring(pos);
1524 if (remainder.length() != 0)
1525 {
1526 v.addElement(remainder);
1527 }
1528
1529 // Return vector as an array of strings
1530 String[] ret = new String[v.size()];
1531 v.copyInto(ret);
1532 return ret;
1533 }
1534
1535 /**
1536 * Flag bit that indicates that subst should replace all occurrences of this
1537 * regular expression.
1538 */
1539 public static final int REPLACE_ALL = 0x0000;
1540
1541 /**
1542 * Flag bit that indicates that subst should only replace the first occurrence
1543 * of this regular expression.
1544 */
1545 public static final int REPLACE_FIRSTONLY = 0x0001;
1546
1547 /**
1548 * Flag bit that indicates that subst should replace backreferences
1549 */
1550 public static final int REPLACE_BACKREFERENCES = 0x0002;
1551
1552 /**
1553 * Substitutes a string for this regular expression in another string.
1554 * This method works like the Perl function of the same name.
1555 * Given a regular expression of "a*b", a String to substituteIn of
1556 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1557 * resulting String returned by subst would be "-foo-garply-wacky-".
1558 *
1559 * @param substituteIn String to substitute within
1560 * @param substitution String to substitute for all matches of this regular expression.
1561 * @return The string substituteIn with zero or more occurrences of the current
1562 * regular expression replaced with the substitution String (if this regular
1563 * expression object doesn't match at any position, the original String is returned
1564 * unchanged).
1565 */
1566 public String subst(String substituteIn, String substitution)
1567 {
1568 return subst(substituteIn, substitution, REPLACE_ALL);
1569 }
1570
1571 /**
1572 * Substitutes a string for this regular expression in another string.
1573 * This method works like the Perl function of the same name.
1574 * Given a regular expression of "a*b", a String to substituteIn of
1575 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1576 * resulting String returned by subst would be "-foo-garply-wacky-".
1577 * <p>
1578 * It is also possible to reference the contents of a parenthesized expression
1579 * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
1580 * a String to substituteIn of "visit us: http://www.apache.org!" and the
1581 * substitution String "&lt;a href=\"$0\"&gt;$0&lt;/a&gt;", the resulting String
1582 * returned by subst would be
1583 * "visit us: &lt;a href=\"http://www.apache.org\"&gt;http://www.apache.org&lt;/a&gt;!".
1584 * <p>
1585 * <i>Note:</i> $0 represents the whole match.
1586 *
1587 * @param substituteIn String to substitute within
1588 * @param substitution String to substitute for matches of this regular expression
1589 * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
1590 * flag bit is set, only the first occurrence of this regular expression is replaced.
1591 * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1592 * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
1593 * be processed.
1594 * @return The string substituteIn with zero or more occurrences of the current
1595 * regular expression replaced with the substitution String (if this regular
1596 * expression object doesn't match at any position, the original String is returned
1597 * unchanged).
1598 */
1599 public String subst(String substituteIn, String substitution, int flags)
1600 {
1601 // String to return
1602 StringBuffer ret = new StringBuffer();
1603
1604 // Start at position 0 and search the whole string
1605 int pos = 0;
1606 int len = substituteIn.length();
1607
1608 // Try a match at each position
1609 while (pos < len && match(substituteIn, pos))
1610 {
1611 // Append string before match
1612 ret.append(substituteIn.substring(pos, getParenStart(0)));
1613
1614 if ((flags & REPLACE_BACKREFERENCES) != 0)
1615 {
1616 // Process backreferences
1617 int lCurrentPosition = 0;
1618 int lLastPosition = -2;
1619 int lLength = substitution.length();
1620 boolean bAddedPrefix = false;
1621
1622 while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0)
1623 {
1624 if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
1625 && lCurrentPosition+1 < lLength)
1626 {
1627 char c = substitution.charAt(lCurrentPosition + 1);
1628 if (c >= '0' && c <= '9')
1629 {
1630 if (bAddedPrefix == false)
1631 {
1632 // Append everything between the beginning of the
1633 // substitution string and the current $ sign
1634 ret.append(substitution.substring(0, lCurrentPosition));
1635 bAddedPrefix = true;
1636 }
1637 else
1638 {
1639 // Append everything between the last and the current $ sign
1640 ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition));
1641 }
1642
1643 // Append the parenthesized expression
1644 // Note: if a parenthesized expression of the requested
1645 // index is not available "null" is added to the string
1646 ret.append(getParen(c - '0'));
1647 lLastPosition = lCurrentPosition;
1648 }
1649 }
1650
1651 // Move forward, skipping past match
1652 lCurrentPosition++;
1653 }
1654
1655 // Append everything after the last $ sign
1656 ret.append(substitution.substring(lLastPosition + 2, lLength));
1657 }
1658 else
1659 {
1660 // Append substitution without processing backreferences
1661 ret.append(substitution);
1662 }
1663
1664 // Move forward, skipping past match
1665 int newpos = getParenEnd(0);
1666
1667 // We always want to make progress!
1668 if (newpos == pos)
1669 {
1670 newpos++;
1671 }
1672
1673 // Try new position
1674 pos = newpos;
1675
1676 // Break out if we're only supposed to replace one occurrence
1677 if ((flags & REPLACE_FIRSTONLY) != 0)
1678 {
1679 break;
1680 }
1681 }
1682
1683 // If there's remaining input, append it
1684 if (pos < len)
1685 {
1686 ret.append(substituteIn.substring(pos));
1687 }
1688
1689 // Return string buffer as string
1690 return ret.toString();
1691 }
1692
1693 /**
1694 * Returns an array of Strings, whose toString representation matches a regular
1695 * expression. This method works like the Perl function of the same name. Given
1696 * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1697 * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1698 *
1699 * @param search Array of Objects to search
1700 * @return Array of Strings whose toString() value matches this regular expression.
1701 */
1702 public String[] grep(Object[] search)
1703 {
1704 // Create new vector to hold return items
1705 Vector v = new Vector();
1706
1707 // Traverse array of objects
1708 for (int i = 0; i < search.length; i++)
1709 {
1710 // Get next object as a string
1711 String s = search[i].toString();
1712
1713 // If it matches this regexp, add it to the list
1714 if (match(s))
1715 {
1716 v.addElement(s);
1717 }
1718 }
1719
1720 // Return vector as an array of strings
1721 String[] ret = new String[v.size()];
1722 v.copyInto(ret);
1723 return ret;
1724 }
1725
1726 /**
1727 * @return true if character at i-th position in the <code>search</code> string is a newline
1728 */
1729 private boolean isNewline(int i)
1730 {
1731 char nextChar = search.charAt(i);
1732
1733 if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085'
1734 || nextChar == '\u2028' || nextChar == '\u2029')
1735 {
1736 return true;
1737 }
1738
1739 return false;
1740 }
1741
1742 /**
1743 * Compares two characters.
1744 *
1745 * @param c1 first character to compare.
1746 * @param c2 second character to compare.
1747 * @param caseIndependent whether comparision is case insensitive or not.
1748 * @return negative, 0, or positive integer as the first character
1749 * less than, equal to, or greater then the second.
1750 */
1751 private int compareChars(char c1, char c2, boolean caseIndependent)
1752 {
1753 if (caseIndependent)
1754 {
1755 c1 = Character.toLowerCase(c1);
1756 c2 = Character.toLowerCase(c2);
1757 }
1758 return ((int)c1 - (int)c2);
1759 }
1760 }

mercurial