Thu, 31 Aug 2017 15:18:52 +0800
merge
1 /*
2 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
26 /*
27 * @(#)HeaderTokenizer.java 1.9 02/03/27
28 */
32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
35 /**
36 * This class tokenizes RFC822 and MIME headers into the basic
37 * symbols specified by RFC822 and MIME. <p>
38 *
39 * This class handles folded headers (ie headers with embedded
40 * CRLF SPACE sequences). The folds are removed in the returned
41 * tokens.
42 *
43 * @version 1.9, 02/03/27
44 * @author John Mani
45 */
47 public class HeaderTokenizer {
49 /**
50 * The Token class represents tokens returned by the
51 * HeaderTokenizer.
52 */
53 public static class Token {
55 private int type;
56 private String value;
58 /**
59 * Token type indicating an ATOM.
60 */
61 public static final int ATOM = -1;
63 /**
64 * Token type indicating a quoted string. The value
65 * field contains the string without the quotes.
66 */
67 public static final int QUOTEDSTRING = -2;
69 /**
70 * Token type indicating a comment. The value field
71 * contains the comment string without the comment
72 * start and end symbols.
73 */
74 public static final int COMMENT = -3;
76 /**
77 * Token type indicating end of input.
78 */
79 public static final int EOF = -4;
81 /**
82 * Constructor.
83 * @param type Token type
84 * @param value Token value
85 */
86 public Token(int type, String value) {
87 this.type = type;
88 this.value = value;
89 }
91 /**
92 * Return the type of the token. If the token represents a
93 * delimiter or a control character, the type is that character
94 * itself, converted to an integer. Otherwise, it's value is
95 * one of the following:
96 * <ul>
97 * <li><code>ATOM</code> A sequence of ASCII characters
98 * delimited by either SPACE, CTL, "(", <"> or the
99 * specified SPECIALS
100 * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
101 * within quotes
102 * <li><code>COMMENT</code> A sequence of ASCII characters
103 * within "(" and ")".
104 * <li><code>EOF</code> End of header
105 * </ul>
106 */
107 public int getType() {
108 return type;
109 }
111 /**
112 * Returns the value of the token just read. When the current
113 * token is a quoted string, this field contains the body of the
114 * string, without the quotes. When the current token is a comment,
115 * this field contains the body of the comment.
116 *
117 * @return token value
118 */
119 public String getValue() {
120 return value;
121 }
122 }
124 private String string; // the string to be tokenized
125 private boolean skipComments; // should comments be skipped ?
126 private String delimiters; // delimiter string
127 private int currentPos; // current parse position
128 private int maxPos; // string length
129 private int nextPos; // track start of next Token for next()
130 private int peekPos; // track start of next Token for peek()
132 /**
133 * RFC822 specials
134 */
135 public final static String RFC822 = "()<>@,;:\\\"\t .[]";
137 /**
138 * MIME specials
139 */
140 public final static String MIME = "()<>@,;:\\\"\t []/?=";
142 // The EOF Token
143 private final static Token EOFToken = new Token(Token.EOF, null);
145 /**
146 * Constructor that takes a rfc822 style header.
147 *
148 * @param header The rfc822 header to be tokenized
149 * @param delimiters Set of delimiter characters
150 * to be used to delimit ATOMS. These
151 * are usually <code>RFC822</code> or
152 * <code>MIME</code>
153 * @param skipComments If true, comments are skipped and
154 * not returned as tokens
155 */
156 public HeaderTokenizer(String header, String delimiters,
157 boolean skipComments) {
158 string = (header == null) ? "" : header; // paranoia ?!
159 this.skipComments = skipComments;
160 this.delimiters = delimiters;
161 currentPos = nextPos = peekPos = 0;
162 maxPos = string.length();
163 }
165 /**
166 * Constructor. Comments are ignored and not returned as tokens
167 *
168 * @param header The header that is tokenized
169 * @param delimiters The delimiters to be used
170 */
171 public HeaderTokenizer(String header, String delimiters) {
172 this(header, delimiters, true);
173 }
175 /**
176 * Constructor. The RFC822 defined delimiters - RFC822 - are
177 * used to delimit ATOMS. Also comments are skipped and not
178 * returned as tokens
179 */
180 public HeaderTokenizer(String header) {
181 this(header, RFC822);
182 }
184 /**
185 * Parses the next token from this String. <p>
186 *
187 * Clients sit in a loop calling next() to parse successive
188 * tokens until an EOF Token is returned.
189 *
190 * @return the next Token
191 * @exception ParseException if the parse fails
192 */
193 public Token next() throws ParseException {
194 Token tk;
196 currentPos = nextPos; // setup currentPos
197 tk = getNext();
198 nextPos = peekPos = currentPos; // update currentPos and peekPos
199 return tk;
200 }
202 /**
203 * Peek at the next token, without actually removing the token
204 * from the parse stream. Invoking this method multiple times
205 * will return successive tokens, until <code>next()</code> is
206 * called. <p>
207 *
208 * @return the next Token
209 * @exception ParseException if the parse fails
210 */
211 public Token peek() throws ParseException {
212 Token tk;
214 currentPos = peekPos; // setup currentPos
215 tk = getNext();
216 peekPos = currentPos; // update peekPos
217 return tk;
218 }
220 /**
221 * Return the rest of the Header.
222 *
223 * @return String rest of header. null is returned if we are
224 * already at end of header
225 */
226 public String getRemainder() {
227 return string.substring(nextPos);
228 }
230 /*
231 * Return the next token starting from 'currentPos'. After the
232 * parse, 'currentPos' is updated to point to the start of the
233 * next token.
234 */
235 private Token getNext() throws ParseException {
236 // If we're already at end of string, return EOF
237 if (currentPos >= maxPos)
238 return EOFToken;
240 // Skip white-space, position currentPos beyond the space
241 if (skipWhiteSpace() == Token.EOF)
242 return EOFToken;
244 char c;
245 int start;
246 boolean filter = false;
248 c = string.charAt(currentPos);
250 // Check or Skip comments and position currentPos
251 // beyond the comment
252 while (c == '(') {
253 // Parsing comment ..
254 int nesting;
255 for (start = ++currentPos, nesting = 1;
256 nesting > 0 && currentPos < maxPos;
257 currentPos++) {
258 c = string.charAt(currentPos);
259 if (c == '\\') { // Escape sequence
260 currentPos++; // skip the escaped character
261 filter = true;
262 } else if (c == '\r')
263 filter = true;
264 else if (c == '(')
265 nesting++;
266 else if (c == ')')
267 nesting--;
268 }
269 if (nesting != 0)
270 throw new ParseException("Unbalanced comments");
272 if (!skipComments) {
273 // Return the comment, if we are asked to.
274 // Note that the comment start & end markers are ignored.
275 String s;
276 if (filter) // need to go thru the token again.
277 s = filterToken(string, start, currentPos-1);
278 else
279 s = string.substring(start,currentPos-1);
281 return new Token(Token.COMMENT, s);
282 }
284 // Skip any whitespace after the comment.
285 if (skipWhiteSpace() == Token.EOF)
286 return EOFToken;
287 c = string.charAt(currentPos);
288 }
290 // Check for quoted-string and position currentPos
291 // beyond the terminating quote
292 if (c == '"') {
293 for (start = ++currentPos; currentPos < maxPos; currentPos++) {
294 c = string.charAt(currentPos);
295 if (c == '\\') { // Escape sequence
296 currentPos++;
297 filter = true;
298 } else if (c == '\r')
299 filter = true;
300 else if (c == '"') {
301 currentPos++;
302 String s;
304 if (filter)
305 s = filterToken(string, start, currentPos-1);
306 else
307 s = string.substring(start,currentPos-1);
309 return new Token(Token.QUOTEDSTRING, s);
310 }
311 }
312 throw new ParseException("Unbalanced quoted string");
313 }
315 // Check for SPECIAL or CTL
316 if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
317 currentPos++; // re-position currentPos
318 char ch[] = new char[1];
319 ch[0] = c;
320 return new Token((int)c, new String(ch));
321 }
323 // Check for ATOM
324 for (start = currentPos; currentPos < maxPos; currentPos++) {
325 c = string.charAt(currentPos);
326 // ATOM is delimited by either SPACE, CTL, "(", <">
327 // or the specified SPECIALS
328 if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
329 c == '"' || delimiters.indexOf(c) >= 0)
330 break;
331 }
332 return new Token(Token.ATOM, string.substring(start, currentPos));
333 }
335 // Skip SPACE, HT, CR and NL
336 private int skipWhiteSpace() {
337 char c;
338 for (; currentPos < maxPos; currentPos++)
339 if (((c = string.charAt(currentPos)) != ' ') &&
340 (c != '\t') && (c != '\r') && (c != '\n'))
341 return currentPos;
342 return Token.EOF;
343 }
345 /* Process escape sequences and embedded LWSPs from a comment or
346 * quoted string.
347 */
348 private static String filterToken(String s, int start, int end) {
349 StringBuffer sb = new StringBuffer();
350 char c;
351 boolean gotEscape = false;
352 boolean gotCR = false;
354 for (int i = start; i < end; i++) {
355 c = s.charAt(i);
356 if (c == '\n' && gotCR) {
357 // This LF is part of an unescaped
358 // CRLF sequence (i.e, LWSP). Skip it.
359 gotCR = false;
360 continue;
361 }
363 gotCR = false;
364 if (!gotEscape) {
365 // Previous character was NOT '\'
366 if (c == '\\') // skip this character
367 gotEscape = true;
368 else if (c == '\r') // skip this character
369 gotCR = true;
370 else // append this character
371 sb.append(c);
372 } else {
373 // Previous character was '\'. So no need to
374 // bother with any special processing, just
375 // append this character
376 sb.append(c);
377 gotEscape = false;
378 }
379 }
380 return sb.toString();
381 }
382 }