src/share/jaxws_classes/com/sun/xml/internal/messaging/saaj/packaging/mime/internet/HeaderTokenizer.java

Thu, 31 Aug 2017 15:18:52 +0800

author
aoqi
date
Thu, 31 Aug 2017 15:18:52 +0800
changeset 637
9c07ef4934dd
parent 368
0989ad8c0860
parent 0
373ffda63c9a
permissions
-rw-r--r--

merge

     1 /*
     2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.  Oracle designates this
     8  * particular file as subject to the "Classpath" exception as provided
     9  * by Oracle in the LICENSE file that accompanied this code.
    10  *
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    14  * version 2 for more details (a copy is included in the LICENSE file that
    15  * accompanied this code).
    16  *
    17  * You should have received a copy of the GNU General Public License version
    18  * 2 along with this work; if not, write to the Free Software Foundation,
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    20  *
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    22  * or visit www.oracle.com if you need additional information or have any
    23  * questions.
    24  */
    26 /*
    27  * @(#)HeaderTokenizer.java   1.9 02/03/27
    28  */
    32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
    35 /**
    36  * This class tokenizes RFC822 and MIME headers into the basic
    37  * symbols specified by RFC822 and MIME. <p>
    38  *
    39  * This class handles folded headers (ie headers with embedded
    40  * CRLF SPACE sequences). The folds are removed in the returned
    41  * tokens.
    42  *
    43  * @version 1.9, 02/03/27
    44  * @author  John Mani
    45  */
    47 public class HeaderTokenizer {
    49     /**
    50      * The Token class represents tokens returned by the
    51      * HeaderTokenizer.
    52      */
    53     public static class Token {
    55         private int type;
    56         private String value;
    58         /**
    59          * Token type indicating an ATOM.
    60          */
    61         public static final int ATOM            = -1;
    63         /**
    64          * Token type indicating a quoted string. The value
    65          * field contains the string without the quotes.
    66          */
    67         public static final int QUOTEDSTRING    = -2;
    69         /**
    70          * Token type indicating a comment. The value field
    71          * contains the comment string without the comment
    72          * start and end symbols.
    73          */
    74         public static final int COMMENT         = -3;
    76         /**
    77          * Token type indicating end of input.
    78          */
    79         public static final int  EOF            = -4;
    81         /**
    82          * Constructor.
    83          * @param       type    Token type
    84          * @param       value   Token value
    85          */
    86         public Token(int type, String value) {
    87              this.type = type;
    88              this.value = value;
    89         }
    91         /**
    92          * Return the type of the token. If the token represents a
    93          * delimiter or a control character, the type is that character
    94          * itself, converted to an integer. Otherwise, it's value is
    95          * one of the following:
    96          * <ul>
    97          * <li><code>ATOM</code> A sequence of ASCII characters
    98          *      delimited by either SPACE, CTL, "(", <"> or the
    99          *      specified SPECIALS
   100          * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
   101          *      within quotes
   102          * <li><code>COMMENT</code> A sequence of ASCII characters
   103          *      within "(" and ")".
   104          * <li><code>EOF</code> End of header
   105          * </ul>
   106          */
   107         public int getType() {
   108             return type;
   109         }
   111         /**
   112          * Returns the value of the token just read. When the current
   113          * token is a quoted string, this field contains the body of the
   114          * string, without the quotes. When the current token is a comment,
   115          * this field contains the body of the comment.
   116          *
   117          * @return      token value
   118          */
   119         public String getValue() {
   120             return value;
   121         }
   122     }
   124     private String string; // the string to be tokenized
   125     private boolean skipComments; // should comments be skipped ?
   126     private String delimiters; // delimiter string
   127     private int currentPos; // current parse position
   128     private int maxPos; // string length
   129     private int nextPos; // track start of next Token for next()
   130     private int peekPos; // track start of next Token for peek()
   132     /**
   133      * RFC822 specials
   134      */
   135     public final static String RFC822 = "()<>@,;:\\\"\t .[]";
   137     /**
   138      * MIME specials
   139      */
   140     public final static String MIME = "()<>@,;:\\\"\t []/?=";
   142     // The EOF Token
   143     private final static Token EOFToken = new Token(Token.EOF, null);
   145     /**
   146      * Constructor that takes a rfc822 style header.
   147      *
   148      * @param   header  The rfc822 header to be tokenized
   149      * @param   delimiters      Set of delimiter characters
   150      *                          to be used to delimit ATOMS. These
   151      *                          are usually <code>RFC822</code> or
   152      *                          <code>MIME</code>
   153      * @param   skipComments  If true, comments are skipped and
   154      *                          not returned as tokens
   155      */
   156     public HeaderTokenizer(String header, String delimiters,
   157                            boolean skipComments) {
   158         string = (header == null) ? "" : header; // paranoia ?!
   159         this.skipComments = skipComments;
   160         this.delimiters = delimiters;
   161         currentPos = nextPos = peekPos = 0;
   162         maxPos = string.length();
   163     }
   165     /**
   166      * Constructor. Comments are ignored and not returned as tokens
   167      *
   168      * @param   header  The header that is tokenized
   169      * @param   delimiters  The delimiters to be used
   170      */
   171     public HeaderTokenizer(String header, String delimiters) {
   172         this(header, delimiters, true);
   173     }
   175     /**
   176      * Constructor. The RFC822 defined delimiters - RFC822 - are
   177      * used to delimit ATOMS. Also comments are skipped and not
   178      * returned as tokens
   179      */
   180     public HeaderTokenizer(String header)  {
   181         this(header, RFC822);
   182     }
   184     /**
   185      * Parses the next token from this String. <p>
   186      *
   187      * Clients sit in a loop calling next() to parse successive
   188      * tokens until an EOF Token is returned.
   189      *
   190      * @return          the next Token
   191      * @exception       ParseException if the parse fails
   192      */
   193     public Token next() throws ParseException {
   194         Token tk;
   196         currentPos = nextPos; // setup currentPos
   197         tk = getNext();
   198         nextPos = peekPos = currentPos; // update currentPos and peekPos
   199         return tk;
   200     }
   202     /**
   203      * Peek at the next token, without actually removing the token
   204      * from the parse stream. Invoking this method multiple times
   205      * will return successive tokens, until <code>next()</code> is
   206      * called. <p>
   207      *
   208      * @return          the next Token
   209      * @exception       ParseException if the parse fails
   210      */
   211     public Token peek() throws ParseException {
   212         Token tk;
   214         currentPos = peekPos; // setup currentPos
   215         tk = getNext();
   216         peekPos = currentPos; // update peekPos
   217         return tk;
   218     }
   220     /**
   221      * Return the rest of the Header.
   222      *
   223      * @return String   rest of header. null is returned if we are
   224      *                  already at end of header
   225      */
   226     public String getRemainder() {
   227         return string.substring(nextPos);
   228     }
   230     /*
   231      * Return the next token starting from 'currentPos'. After the
   232      * parse, 'currentPos' is updated to point to the start of the
   233      * next token.
   234      */
   235     private Token getNext() throws ParseException {
   236         // If we're already at end of string, return EOF
   237         if (currentPos >= maxPos)
   238             return EOFToken;
   240         // Skip white-space, position currentPos beyond the space
   241         if (skipWhiteSpace() == Token.EOF)
   242             return EOFToken;
   244         char c;
   245         int start;
   246         boolean filter = false;
   248         c = string.charAt(currentPos);
   250         // Check or Skip comments and position currentPos
   251         // beyond the comment
   252         while (c == '(') {
   253             // Parsing comment ..
   254             int nesting;
   255             for (start = ++currentPos, nesting = 1;
   256                  nesting > 0 && currentPos < maxPos;
   257                  currentPos++) {
   258                 c = string.charAt(currentPos);
   259                 if (c == '\\') {  // Escape sequence
   260                     currentPos++; // skip the escaped character
   261                     filter = true;
   262                 } else if (c == '\r')
   263                     filter = true;
   264                 else if (c == '(')
   265                     nesting++;
   266                 else if (c == ')')
   267                     nesting--;
   268             }
   269             if (nesting != 0)
   270                 throw new ParseException("Unbalanced comments");
   272             if (!skipComments) {
   273                 // Return the comment, if we are asked to.
   274                 // Note that the comment start & end markers are ignored.
   275                 String s;
   276                 if (filter) // need to go thru the token again.
   277                     s = filterToken(string, start, currentPos-1);
   278                 else
   279                     s = string.substring(start,currentPos-1);
   281                 return new Token(Token.COMMENT, s);
   282             }
   284             // Skip any whitespace after the comment.
   285             if (skipWhiteSpace() == Token.EOF)
   286                 return EOFToken;
   287             c = string.charAt(currentPos);
   288         }
   290         // Check for quoted-string and position currentPos
   291         //  beyond the terminating quote
   292         if (c == '"') {
   293             for (start = ++currentPos; currentPos < maxPos; currentPos++) {
   294                 c = string.charAt(currentPos);
   295                 if (c == '\\') { // Escape sequence
   296                     currentPos++;
   297                     filter = true;
   298                 } else if (c == '\r')
   299                     filter = true;
   300                 else if (c == '"') {
   301                     currentPos++;
   302                     String s;
   304                     if (filter)
   305                         s = filterToken(string, start, currentPos-1);
   306                     else
   307                         s = string.substring(start,currentPos-1);
   309                     return new Token(Token.QUOTEDSTRING, s);
   310                 }
   311             }
   312             throw new ParseException("Unbalanced quoted string");
   313         }
   315         // Check for SPECIAL or CTL
   316         if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
   317             currentPos++; // re-position currentPos
   318             char ch[] = new char[1];
   319             ch[0] = c;
   320             return new Token((int)c, new String(ch));
   321         }
   323         // Check for ATOM
   324         for (start = currentPos; currentPos < maxPos; currentPos++) {
   325             c = string.charAt(currentPos);
   326             // ATOM is delimited by either SPACE, CTL, "(", <">
   327             // or the specified SPECIALS
   328             if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
   329                 c == '"' || delimiters.indexOf(c) >= 0)
   330                 break;
   331         }
   332         return new Token(Token.ATOM, string.substring(start, currentPos));
   333     }
   335     // Skip SPACE, HT, CR and NL
   336     private int skipWhiteSpace() {
   337         char c;
   338         for (; currentPos < maxPos; currentPos++)
   339             if (((c = string.charAt(currentPos)) != ' ') &&
   340                 (c != '\t') && (c != '\r') && (c != '\n'))
   341                 return currentPos;
   342         return Token.EOF;
   343     }
   345     /* Process escape sequences and embedded LWSPs from a comment or
   346      * quoted string.
   347      */
   348     private static String filterToken(String s, int start, int end) {
   349         StringBuffer sb = new StringBuffer();
   350         char c;
   351         boolean gotEscape = false;
   352         boolean gotCR = false;
   354         for (int i = start; i < end; i++) {
   355             c = s.charAt(i);
   356             if (c == '\n' && gotCR) {
   357                 // This LF is part of an unescaped
   358                 // CRLF sequence (i.e, LWSP). Skip it.
   359                 gotCR = false;
   360                 continue;
   361             }
   363             gotCR = false;
   364             if (!gotEscape) {
   365                 // Previous character was NOT '\'
   366                 if (c == '\\') // skip this character
   367                     gotEscape = true;
   368                 else if (c == '\r') // skip this character
   369                     gotCR = true;
   370                 else // append this character
   371                     sb.append(c);
   372             } else {
   373                 // Previous character was '\'. So no need to
   374                 // bother with any special processing, just
   375                 // append this character
   376                 sb.append(c);
   377                 gotEscape = false;
   378             }
   379         }
   380         return sb.toString();
   381     }
   382 }

mercurial