src/com/sun/org/apache/regexp/internal/REProgram.java

changeset 2116
aaee9ae4799a
parent 759
7ea027fae4d8
equal deleted inserted replaced
2090:3b8ebb957957 2116:aaee9ae4799a
1 /*
2 * reserved comment block
3 * DO NOT REMOVE OR ALTER!
4 */
5 /*
6 * Copyright 1999-2004 The Apache Software Foundation.
7 *
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21 package com.sun.org.apache.regexp.internal;
22
23 import java.io.Serializable;
24
25 /**
26 * A class that holds compiled regular expressions. This is exposed mainly
27 * for use by the recompile utility (which helps you produce precompiled
28 * REProgram objects). You should not otherwise need to work directly with
29 * this class.
30 *
31 * @see RE
32 * @see RECompiler
33 *
34 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
35 */
36 public class REProgram implements Serializable
37 {
38 static final int OPT_HASBACKREFS = 1;
39
40 char[] instruction; // The compiled regular expression 'program'
41 int lenInstruction; // The amount of the instruction buffer in use
42 char[] prefix; // Prefix string optimization
43 int flags; // Optimization flags (REProgram.OPT_*)
44 int maxParens = -1;
45
46 /**
47 * Constructs a program object from a character array
48 * @param instruction Character array with RE opcode instructions in it
49 */
50 public REProgram(char[] instruction)
51 {
52 this(instruction, instruction.length);
53 }
54
55 /**
56 * Constructs a program object from a character array
57 * @param parens Count of parens in the program
58 * @param instruction Character array with RE opcode instructions in it
59 */
60 public REProgram(int parens, char[] instruction)
61 {
62 this(instruction, instruction.length);
63 this.maxParens = parens;
64 }
65
66 /**
67 * Constructs a program object from a character array
68 * @param instruction Character array with RE opcode instructions in it
69 * @param lenInstruction Amount of instruction array in use
70 */
71 public REProgram(char[] instruction, int lenInstruction)
72 {
73 setInstructions(instruction, lenInstruction);
74 }
75
76 /**
77 * Returns a copy of the current regular expression program in a character
78 * array that is exactly the right length to hold the program. If there is
79 * no program compiled yet, getInstructions() will return null.
80 * @return A copy of the current compiled RE program
81 */
82 public char[] getInstructions()
83 {
84 // Ensure program has been compiled!
85 if (lenInstruction != 0)
86 {
87 // Return copy of program
88 char[] ret = new char[lenInstruction];
89 System.arraycopy(instruction, 0, ret, 0, lenInstruction);
90 return ret;
91 }
92 return null;
93 }
94
95 /**
96 * Sets a new regular expression program to run. It is this method which
97 * performs any special compile-time search optimizations. Currently only
98 * two optimizations are in place - one which checks for backreferences
99 * (so that they can be lazily allocated) and another which attempts to
100 * find an prefix anchor string so that substantial amounts of input can
101 * potentially be skipped without running the actual program.
102 * @param instruction Program instruction buffer
103 * @param lenInstruction Length of instruction buffer in use
104 */
105 public void setInstructions(char[] instruction, int lenInstruction)
106 {
107 // Save reference to instruction array
108 this.instruction = instruction;
109 this.lenInstruction = lenInstruction;
110
111 // Initialize other program-related variables
112 flags = 0;
113 prefix = null;
114
115 // Try various compile-time optimizations if there's a program
116 if (instruction != null && lenInstruction != 0)
117 {
118 // If the first node is a branch
119 if (lenInstruction >= RE.nodeSize && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH)
120 {
121 // to the end node
122 int next = instruction[0 + RE.offsetNext];
123 if (instruction[next + RE.offsetOpcode] == RE.OP_END)
124 {
125 // and the branch starts with an atom
126 if (lenInstruction >= (RE.nodeSize * 2) && instruction[RE.nodeSize + RE.offsetOpcode] == RE.OP_ATOM)
127 {
128 // then get that atom as an prefix because there's no other choice
129 int lenAtom = instruction[RE.nodeSize + RE.offsetOpdata];
130 prefix = new char[lenAtom];
131 System.arraycopy(instruction, RE.nodeSize * 2, prefix, 0, lenAtom);
132 }
133 }
134 }
135
136 BackrefScanLoop:
137
138 // Check for backreferences
139 for (int i = 0; i < lenInstruction; i += RE.nodeSize)
140 {
141 switch (instruction[i + RE.offsetOpcode])
142 {
143 case RE.OP_ANYOF:
144 i += (instruction[i + RE.offsetOpdata] * 2);
145 break;
146
147 case RE.OP_ATOM:
148 i += instruction[i + RE.offsetOpdata];
149 break;
150
151 case RE.OP_BACKREF:
152 flags |= OPT_HASBACKREFS;
153 break BackrefScanLoop;
154 }
155 }
156 }
157 }
158 }

mercurial