src/share/jaxws_classes/com/sun/xml/internal/messaging/saaj/packaging/mime/internet/MimeUtility.java

Thu, 31 Aug 2017 15:18:52 +0800

author
aoqi
date
Thu, 31 Aug 2017 15:18:52 +0800
changeset 637
9c07ef4934dd
parent 368
0989ad8c0860
parent 0
373ffda63c9a
permissions
-rw-r--r--

merge

     1 /*
     2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.  Oracle designates this
     8  * particular file as subject to the "Classpath" exception as provided
     9  * by Oracle in the LICENSE file that accompanied this code.
    10  *
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    14  * version 2 for more details (a copy is included in the LICENSE file that
    15  * accompanied this code).
    16  *
    17  * You should have received a copy of the GNU General Public License version
    18  * 2 along with this work; if not, write to the Free Software Foundation,
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    20  *
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    22  * or visit www.oracle.com if you need additional information or have any
    23  * questions.
    24  */
    26 /*
    27  * @(#)MimeUtility.java       1.45 03/03/10
    28  */
    32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
    34 import java.io.*;
    35 import java.util.*;
    37 import javax.activation.DataHandler;
    38 import javax.activation.DataSource;
    40 import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException;
    41 import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*;
    42 import com.sun.xml.internal.messaging.saaj.util.SAAJUtil;
    44 /**
    45  * This is a utility class that provides various MIME related
    46  * functionality. <p>
    47  *
    48  * There are a set of methods to encode and decode MIME headers as
    49  * per RFC 2047. A brief description on handling such headers is
    50  * given below: <p>
    51  *
    52  * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
    53  * characters. Headers that contain non US-ASCII characters must be
    54  * encoded so that they contain only US-ASCII characters. Basically,
    55  * this process involves using either BASE64 or QP to encode certain
    56  * characters. RFC 2047 describes this in detail. <p>
    57  *
    58  * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
    59  * subset of Unicode (and occupies the range 0 - 127). A String
    60  * that contains only ASCII characters is already mail-safe. If the
    61  * String contains non US-ASCII characters, it must be encoded. An
    62  * additional complexity in this step is that since Unicode is not
    63  * yet a widely used charset, one might want to first charset-encode
    64  * the String into another charset and then do the transfer-encoding.
    65  * <p>
    66  * Note that to get the actual bytes of a mail-safe String (say,
    67  * for sending over SMTP), one must do
    68  * <p><blockquote><pre>
    69  *
    70  *      byte[] bytes = string.getBytes("iso-8859-1");
    71  *
    72  * </pre></blockquote><p>
    73  *
    74  * The <code>setHeader</code> and <code>addHeader</code> methods
    75  * on MimeMessage and MimeBodyPart assume that the given header values
    76  * are Unicode strings that contain only US-ASCII characters. Hence
    77  * the callers of those methods must ensure that the values they pass
    78  * do not contain non US-ASCII characters. The methods in this class
    79  * help do this. <p>
    80  *
    81  * The <code>getHeader</code> family of methods on MimeMessage and
    82  * MimeBodyPart return the raw header value. These might be encoded
    83  * as per RFC 2047, and if so, must be decoded into Unicode Strings.
    84  * The methods in this class help to do this. <p>
    85  *
    86  * Several System properties control strict conformance to the MIME
    87  * spec.  Note that these are not session properties but must be set
    88  * globally as System properties. <p>
    89  *
    90  * The <code>mail.mime.decodetext.strict</code> property controls
    91  * decoding of MIME encoded words.  The MIME spec requires that encoded
    92  * words start at the beginning of a whitespace separated word.  Some
    93  * mailers incorrectly include encoded words in the middle of a word.
    94  * If the <code>mail.mime.decodetext.strict</code> System property is
    95  * set to <code>"false"</code>, an attempt will be made to decode these
    96  * illegal encoded words. The default is true. <p>
    97  *
    98  * The <code>mail.mime.encodeeol.strict</code> property controls the
    99  * choice of Content-Transfer-Encoding for MIME parts that are not of
   100  * type "text".  Often such parts will contain textual data for which
   101  * an encoding that allows normal end of line conventions is appropriate.
   102  * In rare cases, such a part will appear to contain entirely textual
   103  * data, but will require an encoding that preserves CR and LF characters
   104  * without change.  If the <code>mail.mime.encodeeol.strict</code>
   105  * System property is set to <code>"true"</code>, such an encoding will
   106  * be used when necessary.  The default is false. <p>
   107  *
   108  * In addition, the <code>mail.mime.charset</code> System property can
   109  * be used to specify the default MIME charset to use for encoded words
   110  * and text parts that don't otherwise specify a charset.  Normally, the
   111  * default MIME charset is derived from the default Java charset, as
   112  * specified in the <code>file.encoding</code> System property.  Most
   113  * applications will have no need to explicitly set the default MIME
   114  * charset.  In cases where the default MIME charset to be used for
   115  * mail messages is different than the charset used for files stored on
   116  * the system, this property should be set.
   117  *
   118  * @version 1.45, 03/03/10
   119  * @author  John Mani
   120  * @author  Bill Shannon
   121  */
   123 public class MimeUtility {
   125     // This class cannot be instantiated
   126     private MimeUtility() { }
   128     public static final int ALL = -1;
   130     private static final int BUFFER_SIZE = 1024;
   131     private static boolean decodeStrict = true;
   132     private static boolean encodeEolStrict = false;
   133     private static boolean foldEncodedWords = false;
   134     private static boolean foldText = true;
   136     static {
   137         try {
   138             String s = SAAJUtil.getSystemProperty("mail.mime.decodetext.strict");
   139             // default to true
   140             decodeStrict = s == null || !s.equalsIgnoreCase("false");
   141             s = SAAJUtil.getSystemProperty("mail.mime.encodeeol.strict");
   142             // default to false
   143             encodeEolStrict = s != null && s.equalsIgnoreCase("true");
   144             s = SAAJUtil.getSystemProperty("mail.mime.foldencodedwords");
   145             // default to false
   146             foldEncodedWords = s != null && s.equalsIgnoreCase("true");
   147             s = SAAJUtil.getSystemProperty("mail.mime.foldtext");
   148             // default to true
   149             foldText = s == null || !s.equalsIgnoreCase("false");
   150         } catch (SecurityException sex) {
   151             // ignore it
   152         }
   153     }
   156     /**
   157      * Get the content-transfer-encoding that should be applied
   158      * to the input stream of this datasource, to make it mailsafe. <p>
   159      *
   160      * The algorithm used here is: <br>
   161      * <ul>
   162      * <li>
   163      * If the primary type of this datasource is "text" and if all
   164      * the bytes in its input stream are US-ASCII, then the encoding
   165      * is "7bit". If more than half of the bytes are non-US-ASCII, then
   166      * the encoding is "base64". If less than half of the bytes are
   167      * non-US-ASCII, then the encoding is "quoted-printable".
   168      * <li>
   169      * If the primary type of this datasource is not "text", then if
   170      * all the bytes of its input stream are US-ASCII, the encoding
   171      * is "7bit". If there is even one non-US-ASCII character, the
   172      * encoding is "base64".
   173      * </ul>
   174      *
   175      * @param   ds      DataSource
   176      * @return          the encoding. This is either "7bit",
   177      *                  "quoted-printable" or "base64"
   178      */
   179     public static String getEncoding(DataSource ds) {
   180         ContentType cType = null;
   181         InputStream is = null;
   182         String encoding = null;
   184         try {
   185             cType = new ContentType(ds.getContentType());
   186             is = ds.getInputStream();
   187         } catch (Exception ex) {
   188             return "base64"; // what else ?!
   189         }
   191         boolean isText = cType.match("text/*");
   192         // if not text, stop processing when we see non-ASCII
   193         int i = checkAscii(is, ALL, !isText);
   194         switch (i) {
   195         case ALL_ASCII:
   196             encoding = "7bit"; // all ascii
   197             break;
   198         case MOSTLY_ASCII:
   199             encoding = "quoted-printable"; // mostly ascii
   200             break;
   201         default:
   202             encoding = "base64"; // mostly binary
   203             break;
   204         }
   206         // Close the input stream
   207         try {
   208             is.close();
   209         } catch (IOException ioex) { }
   211         return encoding;
   212     }
   214     /**
   215      * Same as <code>getEncoding(DataSource)</code> except that instead
   216      * of reading the data from an <code>InputStream</code> it uses the
   217      * <code>writeTo</code> method to examine the data.  This is more
   218      * efficient in the common case of a <code>DataHandler</code>
   219      * created with an object and a MIME type (for example, a
   220      * "text/plain" String) because all the I/O is done in this
   221      * thread.  In the case requiring an <code>InputStream</code> the
   222      * <code>DataHandler</code> uses a thread, a pair of pipe streams,
   223      * and the <code>writeTo</code> method to produce the data. <p>
   224      *
   225      * @since   JavaMail 1.2
   226      */
   227     public static String getEncoding(DataHandler dh) {
   228         ContentType cType = null;
   229         String encoding = null;
   231         /*
   232          * Try to pick the most efficient means of determining the
   233          * encoding.  If this DataHandler was created using a DataSource,
   234          * the getEncoding(DataSource) method is typically faster.  If
   235          * the DataHandler was created with an object, this method is
   236          * much faster.  To distinguish the two cases, we use a heuristic.
   237          * A DataHandler created with an object will always have a null name.
   238          * A DataHandler created with a DataSource will usually have a
   239          * non-null name.
   240          *
   241          * XXX - This is actually quite a disgusting hack, but it makes
   242          *       a common case run over twice as fast.
   243          */
   244         if (dh.getName() != null)
   245             return getEncoding(dh.getDataSource());
   247         try {
   248             cType = new ContentType(dh.getContentType());
   249         } catch (Exception ex) {
   250             return "base64"; // what else ?!
   251         }
   253         if (cType.match("text/*")) {
   254             // Check all of the available bytes
   255             AsciiOutputStream aos = new AsciiOutputStream(false, false);
   256             try {
   257                 dh.writeTo(aos);
   258             } catch (IOException ex) { }        // ignore it
   259             switch (aos.getAscii()) {
   260             case ALL_ASCII:
   261                 encoding = "7bit"; // all ascii
   262                 break;
   263             case MOSTLY_ASCII:
   264                 encoding = "quoted-printable"; // mostly ascii
   265                 break;
   266             default:
   267                 encoding = "base64"; // mostly binary
   268                 break;
   269             }
   270         } else { // not "text"
   271             // Check all of available bytes, break out if we find
   272             // at least one non-US-ASCII character
   273             AsciiOutputStream aos =
   274                         new AsciiOutputStream(true, encodeEolStrict);
   275             try {
   276                 dh.writeTo(aos);
   277             } catch (IOException ex) { }        // ignore it
   278             if (aos.getAscii() == ALL_ASCII) // all ascii
   279                 encoding = "7bit";
   280             else // found atleast one non-ascii character, use b64
   281                 encoding = "base64";
   282         }
   284         return encoding;
   285     }
   287     /**
   288      * Decode the given input stream. The Input stream returned is
   289      * the decoded input stream. All the encodings defined in RFC 2045
   290      * are supported here. They include "base64", "quoted-printable",
   291      * "7bit", "8bit", and "binary". In addition, "uuencode" is also
   292      * supported.
   293      *
   294      * @param   is              input stream
   295      * @param   encoding        the encoding of the stream.
   296      * @return                  decoded input stream.
   297      */
   298     public static InputStream decode(InputStream is, String encoding)
   299                 throws MessagingException {
   300         if (encoding.equalsIgnoreCase("base64"))
   301             return new BASE64DecoderStream(is);
   302         else if (encoding.equalsIgnoreCase("quoted-printable"))
   303             return new QPDecoderStream(is);
   304         else if (encoding.equalsIgnoreCase("uuencode") ||
   305                  encoding.equalsIgnoreCase("x-uuencode") ||
   306                  encoding.equalsIgnoreCase("x-uue"))
   307             return new UUDecoderStream(is);
   308         else if (encoding.equalsIgnoreCase("binary") ||
   309                  encoding.equalsIgnoreCase("7bit") ||
   310                  encoding.equalsIgnoreCase("8bit"))
   311             return is;
   312         else
   313             throw new MessagingException("Unknown encoding: " + encoding);
   314     }
   316     /**
   317      * Wrap an encoder around the given output stream.
   318      * All the encodings defined in RFC 2045 are supported here.
   319      * They include "base64", "quoted-printable", "7bit", "8bit" and
   320      * "binary". In addition, "uuencode" is also supported.
   321      *
   322      * @param   os              output stream
   323      * @param   encoding        the encoding of the stream.
   324      * @return                  output stream that applies the
   325      *                          specified encoding.
   326      */
   327     public static OutputStream encode(OutputStream os, String encoding)
   328                 throws MessagingException {
   329         if (encoding == null)
   330             return os;
   331         else if (encoding.equalsIgnoreCase("base64"))
   332             return new BASE64EncoderStream(os);
   333         else if (encoding.equalsIgnoreCase("quoted-printable"))
   334             return new QPEncoderStream(os);
   335         else if (encoding.equalsIgnoreCase("uuencode") ||
   336                  encoding.equalsIgnoreCase("x-uuencode") ||
   337                  encoding.equalsIgnoreCase("x-uue"))
   338             return new UUEncoderStream(os);
   339         else if (encoding.equalsIgnoreCase("binary") ||
   340                  encoding.equalsIgnoreCase("7bit") ||
   341                  encoding.equalsIgnoreCase("8bit"))
   342             return os;
   343         else
   344             throw new MessagingException("Unknown encoding: " +encoding);
   345     }
   347     /**
   348      * Wrap an encoder around the given output stream.
   349      * All the encodings defined in RFC 2045 are supported here.
   350      * They include "base64", "quoted-printable", "7bit", "8bit" and
   351      * "binary". In addition, "uuencode" is also supported.
   352      * The <code>filename</code> parameter is used with the "uuencode"
   353      * encoding and is included in the encoded output.
   354      *
   355      * @param   os              output stream
   356      * @param   encoding        the encoding of the stream.
   357      * @param   filename        name for the file being encoded (only used
   358      *                          with uuencode)
   359      * @return                  output stream that applies the
   360      *                          specified encoding.
   361      * @since                   JavaMail 1.2
   362      */
   363     public static OutputStream encode(OutputStream os, String encoding,
   364                                       String filename)
   365                 throws MessagingException {
   366         if (encoding == null)
   367             return os;
   368         else if (encoding.equalsIgnoreCase("base64"))
   369             return new BASE64EncoderStream(os);
   370         else if (encoding.equalsIgnoreCase("quoted-printable"))
   371             return new QPEncoderStream(os);
   372         else if (encoding.equalsIgnoreCase("uuencode") ||
   373                  encoding.equalsIgnoreCase("x-uuencode") ||
   374                  encoding.equalsIgnoreCase("x-uue"))
   375             return new UUEncoderStream(os, filename);
   376         else if (encoding.equalsIgnoreCase("binary") ||
   377                  encoding.equalsIgnoreCase("7bit") ||
   378                  encoding.equalsIgnoreCase("8bit"))
   379             return os;
   380         else
   381             throw new MessagingException("Unknown encoding: " +encoding);
   382     }
   384     /**
   385      * Encode a RFC 822 "text" token into mail-safe form as per
   386      * RFC 2047. <p>
   387      *
   388      * The given Unicode string is examined for non US-ASCII
   389      * characters. If the string contains only US-ASCII characters,
   390      * it is returned as-is.  If the string contains non US-ASCII
   391      * characters, it is first character-encoded using the platform's
   392      * default charset, then transfer-encoded using either the B or
   393      * Q encoding. The resulting bytes are then returned as a Unicode
   394      * string containing only ASCII  characters. <p>
   395      *
   396      * Note that this method should be used to encode only
   397      * "unstructured" RFC 822 headers. <p>
   398      *
   399      * Example of usage:
   400      * <p><blockquote><pre>
   401      *
   402      *  MimeBodyPart part = ...
   403      *  String rawvalue = "FooBar Mailer, Japanese version 1.1"
   404      *  try {
   405      *    // If we know for sure that rawvalue contains only US-ASCII
   406      *    // characters, we can skip the encoding part
   407      *    part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
   408      *  } catch (UnsupportedEncodingException e) {
   409      *    // encoding failure
   410      *  } catch (MessagingException me) {
   411      *   // setHeader() failure
   412      *  }
   413      *
   414      * </pre></blockquote><p>
   415      *
   416      * @param   text    unicode string
   417      * @return  Unicode string containing only US-ASCII characters
   418      * @exception UnsupportedEncodingException if the encoding fails
   419      */
   420     public static String encodeText(String text)
   421                         throws UnsupportedEncodingException {
   422         return encodeText(text, null, null);
   423     }
   425     /**
   426      * Encode a RFC 822 "text" token into mail-safe form as per
   427      * RFC 2047. <p>
   428      *
   429      * The given Unicode string is examined for non US-ASCII
   430      * characters. If the string contains only US-ASCII characters,
   431      * it is returned as-is.  If the string contains non US-ASCII
   432      * characters, it is first character-encoded using the specified
   433      * charset, then transfer-encoded using either the B or Q encoding.
   434      * The resulting bytes are then returned as a Unicode string
   435      * containing only ASCII characters. <p>
   436      *
   437      * Note that this method should be used to encode only
   438      * "unstructured" RFC 822 headers.
   439      *
   440      * @param   text    the header value
   441      * @param   charset the charset. If this parameter is null, the
   442      *          platform's default chatset is used.
   443      * @param   encoding the encoding to be used. Currently supported
   444      *          values are "B" and "Q". If this parameter is null, then
   445      *          the "Q" encoding is used if most of characters to be
   446      *          encoded are in the ASCII charset, otherwise "B" encoding
   447      *          is used.
   448      * @return  Unicode string containing only US-ASCII characters
   449      */
   450     public static String encodeText(String text, String charset,
   451                                     String encoding)
   452                         throws UnsupportedEncodingException {
   453         return encodeWord(text, charset, encoding, false);
   454     }
   456     /**
   457      * Decode "unstructured" headers, that is, headers that are defined
   458      * as '*text' as per RFC 822. <p>
   459      *
   460      * The string is decoded using the algorithm specified in
   461      * RFC 2047, Section 6.1.1. If the charset-conversion fails
   462      * for any sequence, an UnsupportedEncodingException is thrown.
   463      * If the String is not an RFC 2047 style encoded header, it is
   464      * returned as-is <p>
   465      *
   466      * Example of usage:
   467      * <p><blockquote><pre>
   468      *
   469      *  MimeBodyPart part = ...
   470      *  String rawvalue = null;
   471      *  String  value = null;
   472      *  try {
   473      *    if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
   474      *      value = MimeUtility.decodeText(rawvalue);
   475      *  } catch (UnsupportedEncodingException e) {
   476      *      // Don't care
   477      *      value = rawvalue;
   478      *  } catch (MessagingException me) { }
   479      *
   480      *  return value;
   481      *
   482      * </pre></blockquote><p>
   483      *
   484      * @param   etext   the possibly encoded value
   485      * @exception       UnsupportedEncodingException if the charset
   486      *                  conversion failed.
   487      */
   488     public static String decodeText(String etext)
   489                 throws UnsupportedEncodingException {
   490         /*
   491          * We look for sequences separated by "linear-white-space".
   492          * (as per RFC 2047, Section 6.1.1)
   493          * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
   494          */
   495         String lwsp = " \t\n\r";
   496         StringTokenizer st;
   498         /*
   499          * First, lets do a quick run thru the string and check
   500          * whether the sequence "=?"  exists at all. If none exists,
   501          * we know there are no encoded-words in here and we can just
   502          * return the string as-is, without suffering thru the later
   503          * decoding logic.
   504          * This handles the most common case of unencoded headers
   505          * efficiently.
   506          */
   507         if (etext.indexOf("=?") == -1)
   508             return etext;
   510         // Encoded words found. Start decoding ...
   512         st = new StringTokenizer(etext, lwsp, true);
   513         StringBuffer sb = new StringBuffer();  // decode buffer
   514         StringBuffer wsb = new StringBuffer(); // white space buffer
   515         boolean prevWasEncoded = false;
   517         while (st.hasMoreTokens()) {
   518             char c;
   519             String s = st.nextToken();
   520             // If whitespace, append it to the whitespace buffer
   521             if (((c = s.charAt(0)) == ' ') || (c == '\t') ||
   522                 (c == '\r') || (c == '\n'))
   523                 wsb.append(c);
   524             else {
   525                 // Check if token is an 'encoded-word' ..
   526                 String word;
   527                 try {
   528                     word = decodeWord(s);
   529                     // Yes, this IS an 'encoded-word'.
   530                     if (!prevWasEncoded && wsb.length() > 0) {
   531                         // if the previous word was also encoded, we
   532                         // should ignore the collected whitespace. Else
   533                         // we include the whitespace as well.
   534                         sb.append(wsb);
   535                     }
   536                     prevWasEncoded = true;
   537                 } catch (ParseException pex) {
   538                     // This is NOT an 'encoded-word'.
   539                     word = s;
   540                     // possibly decode inner encoded words
   541                     if (!decodeStrict)
   542                         word = decodeInnerWords(word);
   543                     // include colleced whitespace ..
   544                     if (wsb.length() > 0)
   545                         sb.append(wsb);
   546                     prevWasEncoded = false;
   547                 }
   548                 sb.append(word); // append the actual word
   549                 wsb.setLength(0); // reset wsb for reuse
   550             }
   551         }
   552         return sb.toString();
   553     }
   555     /**
   556      * Encode a RFC 822 "word" token into mail-safe form as per
   557      * RFC 2047. <p>
   558      *
   559      * The given Unicode string is examined for non US-ASCII
   560      * characters. If the string contains only US-ASCII characters,
   561      * it is returned as-is.  If the string contains non US-ASCII
   562      * characters, it is first character-encoded using the platform's
   563      * default charset, then transfer-encoded using either the B or
   564      * Q encoding. The resulting bytes are then returned as a Unicode
   565      * string containing only ASCII  characters. <p>
   566      *
   567      * This method is meant to be used when creating RFC 822 "phrases".
   568      * The InternetAddress class, for example, uses this to encode
   569      * it's 'phrase' component.
   570      *
   571      * @param   text    unicode string
   572      * @return  Array of Unicode strings containing only US-ASCII
   573      *          characters.
   574      * @exception UnsupportedEncodingException if the encoding fails
   575      */
   576     public static String encodeWord(String word)
   577                         throws UnsupportedEncodingException {
   578         return encodeWord(word, null, null);
   579     }
   581     /**
   582      * Encode a RFC 822 "word" token into mail-safe form as per
   583      * RFC 2047. <p>
   584      *
   585      * The given Unicode string is examined for non US-ASCII
   586      * characters. If the string contains only US-ASCII characters,
   587      * it is returned as-is.  If the string contains non US-ASCII
   588      * characters, it is first character-encoded using the specified
   589      * charset, then transfer-encoded using either the B or Q encoding.
   590      * The resulting bytes are then returned as a Unicode string
   591      * containing only ASCII characters. <p>
   592      *
   593      * @param   text    unicode string
   594      * @param   charset the MIME charset
   595      * @param   encoding the encoding to be used. Currently supported
   596      *          values are "B" and "Q". If this parameter is null, then
   597      *          the "Q" encoding is used if most of characters to be
   598      *          encoded are in the ASCII charset, otherwise "B" encoding
   599      *          is used.
   600      * @return  Unicode string containing only US-ASCII characters
   601      * @exception UnsupportedEncodingException if the encoding fails
   602      */
   603     public static String encodeWord(String word, String charset,
   604                                     String encoding)
   605                         throws UnsupportedEncodingException {
   606         return encodeWord(word, charset, encoding, true);
   607     }
   609     /*
   610      * Encode the given string. The parameter 'encodingWord' should
   611      * be true if a RFC 822 "word" token is being encoded and false if a
   612      * RFC 822 "text" token is being encoded. This is because the
   613      * "Q" encoding defined in RFC 2047 has more restrictions when
   614      * encoding "word" tokens. (Sigh)
   615      */
   616     private static String encodeWord(String string, String charset,
   617                                      String encoding, boolean encodingWord)
   618                         throws UnsupportedEncodingException {
   620         // If 'string' contains only US-ASCII characters, just
   621         // return it.
   622         int ascii = checkAscii(string);
   623         if (ascii == ALL_ASCII)
   624             return string;
   626         // Else, apply the specified charset conversion.
   627         String jcharset;
   628         if (charset == null) { // use default charset
   629             jcharset = getDefaultJavaCharset(); // the java charset
   630             charset = getDefaultMIMECharset(); // the MIME equivalent
   631         } else // MIME charset -> java charset
   632             jcharset = javaCharset(charset);
   634         // If no transfer-encoding is specified, figure one out.
   635         if (encoding == null) {
   636             if (ascii != MOSTLY_NONASCII)
   637                 encoding = "Q";
   638             else
   639                 encoding = "B";
   640         }
   642         boolean b64;
   643         if (encoding.equalsIgnoreCase("B"))
   644             b64 = true;
   645         else if (encoding.equalsIgnoreCase("Q"))
   646             b64 = false;
   647         else
   648             throw new UnsupportedEncodingException(
   649                         "Unknown transfer encoding: " + encoding);
   651         StringBuffer outb = new StringBuffer(); // the output buffer
   652         doEncode(string, b64, jcharset,
   653                  // As per RFC 2047, size of an encoded string should not
   654                  // exceed 75 bytes.
   655                  // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
   656                  75 - 7 - charset.length(), // the available space
   657                  "=?" + charset + "?" + encoding + "?", // prefix
   658                  true, encodingWord, outb);
   660         return outb.toString();
   661     }
   663     private static void doEncode(String string, boolean b64,
   664                 String jcharset, int avail, String prefix,
   665                 boolean first, boolean encodingWord, StringBuffer buf)
   666                         throws UnsupportedEncodingException {
   668         // First find out what the length of the encoded version of
   669         // 'string' would be.
   670         byte[] bytes = string.getBytes(jcharset);
   671         int len;
   672         if (b64) // "B" encoding
   673             len = BEncoderStream.encodedLength(bytes);
   674         else // "Q"
   675             len = QEncoderStream.encodedLength(bytes, encodingWord);
   677         int size;
   678         if ((len > avail) && ((size = string.length()) > 1)) {
   679             // If the length is greater than 'avail', split 'string'
   680             // into two and recurse.
   681             doEncode(string.substring(0, size/2), b64, jcharset,
   682                      avail, prefix, first, encodingWord, buf);
   683             doEncode(string.substring(size/2, size), b64, jcharset,
   684                      avail, prefix, false, encodingWord, buf);
   685         } else {
   686             // length <= than 'avail'. Encode the given string
   687             ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
   688             OutputStream eos; // the encoder
   689             if (b64) // "B" encoding
   690                 eos = new BEncoderStream(os);
   691             else // "Q" encoding
   692                 eos = new QEncoderStream(os, encodingWord);
   694             try { // do the encoding
   695                 eos.write(bytes);
   696                 eos.close();
   697             } catch (IOException ioex) { }
   699             byte[] encodedBytes = os.toByteArray(); // the encoded stuff
   700             // Now write out the encoded (all ASCII) bytes into our
   701             // StringBuffer
   702             if (!first) // not the first line of this sequence
   703                 if (foldEncodedWords)
   704                     buf.append("\r\n "); // start a continuation line
   705                 else
   706                     buf.append(" "); // line will be folded later
   708             buf.append(prefix);
   709             for (int i = 0; i < encodedBytes.length; i++)
   710                 buf.append((char)encodedBytes[i]);
   711             buf.append("?="); // terminate the current sequence
   712         }
   713     }
   715     /**
   716      * The string is parsed using the rules in RFC 2047 for parsing
   717      * an "encoded-word". If the parse fails, a ParseException is
   718      * thrown. Otherwise, it is transfer-decoded, and then
   719      * charset-converted into Unicode. If the charset-conversion
   720      * fails, an UnsupportedEncodingException is thrown.<p>
   721      *
   722      * @param   eword   the possibly encoded value
   723      * @exception       ParseException if the string is not an
   724      *                  encoded-word as per RFC 2047.
   725      * @exception       UnsupportedEncodingException if the charset
   726      *                  conversion failed.
   727      */
   728     public static String decodeWord(String eword)
   729                 throws ParseException, UnsupportedEncodingException {
   731         if (!eword.startsWith("=?")) // not an encoded word
   732             throw new ParseException();
   734         // get charset
   735         int start = 2; int pos;
   736         if ((pos = eword.indexOf('?', start)) == -1)
   737             throw new ParseException();
   738         String charset = javaCharset(eword.substring(start, pos));
   740         // get encoding
   741         start = pos+1;
   742         if ((pos = eword.indexOf('?', start)) == -1)
   743             throw new ParseException();
   744         String encoding = eword.substring(start, pos);
   746         // get encoded-sequence
   747         start = pos+1;
   748         if ((pos = eword.indexOf("?=", start)) == -1)
   749             throw new ParseException();
   750         String word = eword.substring(start, pos);
   752         try {
   753             // Extract the bytes from word
   754             ByteArrayInputStream bis =
   755                 new ByteArrayInputStream(ASCIIUtility.getBytes(word));
   757             // Get the appropriate decoder
   758             InputStream is;
   759             if (encoding.equalsIgnoreCase("B"))
   760                 is = new BASE64DecoderStream(bis);
   761             else if (encoding.equalsIgnoreCase("Q"))
   762                 is = new QDecoderStream(bis);
   763             else
   764                 throw new UnsupportedEncodingException(
   765                                 "unknown encoding: " + encoding);
   767             // For b64 & q, size of decoded word <= size of word. So
   768             // the decoded bytes must fit into the 'bytes' array. This
   769             // is certainly more efficient than writing bytes into a
   770             // ByteArrayOutputStream and then pulling out the byte[]
   771             // from it.
   772             int count = bis.available();
   773             byte[] bytes = new byte[count];
   774             // count is set to the actual number of decoded bytes
   775             count = is.read(bytes, 0, count);
   777             // Finally, convert the decoded bytes into a String using
   778             // the specified charset
   779             String s = new String(bytes, 0, count, charset);
   780             if (pos + 2 < eword.length()) {
   781                 // there's still more text in the string
   782                 String rest = eword.substring(pos + 2);
   783                 if (!decodeStrict)
   784                     rest = decodeInnerWords(rest);
   785                 s += rest;
   786             }
   787             return s;
   788         } catch (UnsupportedEncodingException uex) {
   789             // explicitly catch and rethrow this exception, otherwise
   790             // the below IOException catch will swallow this up!
   791             throw uex;
   792         } catch (IOException ioex) {
   793             // Shouldn't happen.
   794             throw new ParseException();
   795         } catch (IllegalArgumentException iex) {
   796             /* An unknown charset of the form ISO-XXX-XXX, will cause
   797              * the JDK to throw an IllegalArgumentException ... Since the
   798              * JDK will attempt to create a classname using this string,
   799              * but valid classnames must not contain the character '-',
   800              * and this results in an IllegalArgumentException, rather than
   801              * the expected UnsupportedEncodingException. Yikes
   802              */
   803             throw new UnsupportedEncodingException();
   804         }
   805     }
   807     /**
   808      * Look for encoded words within a word.  The MIME spec doesn't
   809      * allow this, but many broken mailers, especially Japanese mailers,
   810      * produce such incorrect encodings.
   811      */
   812     private static String decodeInnerWords(String word)
   813                                 throws UnsupportedEncodingException {
   814         int start = 0, i;
   815         StringBuffer buf = new StringBuffer();
   816         while ((i = word.indexOf("=?", start)) >= 0) {
   817             buf.append(word.substring(start, i));
   818             int end = word.indexOf("?=", i);
   819             if (end < 0)
   820                 break;
   821             String s = word.substring(i, end + 2);
   822             try {
   823                 s = decodeWord(s);
   824             } catch (ParseException pex) {
   825                 // ignore it, just use the original string
   826             }
   827             buf.append(s);
   828             start = end + 2;
   829         }
   830         if (start == 0)
   831             return word;
   832         if (start < word.length())
   833             buf.append(word.substring(start));
   834         return buf.toString();
   835     }
   837     /**
   838      * A utility method to quote a word, if the word contains any
   839      * characters from the specified 'specials' list.<p>
   840      *
   841      * The <code>HeaderTokenizer</code> class defines two special
   842      * sets of delimiters - MIME and RFC 822. <p>
   843      *
   844      * This method is typically used during the generation of
   845      * RFC 822 and MIME header fields.
   846      *
   847      * @param   word    word to be quoted
   848      * @param   specials the set of special characters
   849      * @return          the possibly quoted word
   850      * @see     javax.mail.internet.HeaderTokenizer#MIME
   851      * @see     javax.mail.internet.HeaderTokenizer#RFC822
   852      */
   853     public static String quote(String word, String specials) {
   854         int len = word.length();
   856         /*
   857          * Look for any "bad" characters, Escape and
   858          *  quote the entire string if necessary.
   859          */
   860         boolean needQuoting = false;
   861         for (int i = 0; i < len; i++) {
   862             char c = word.charAt(i);
   863             if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
   864                 // need to escape them and then quote the whole string
   865                 StringBuffer sb = new StringBuffer(len + 3);
   866                 sb.append('"');
   867                 sb.append(word.substring(0, i));
   868                 int lastc = 0;
   869                 for (int j = i; j < len; j++) {
   870                     char cc = word.charAt(j);
   871                     if ((cc == '"') || (cc == '\\') ||
   872                         (cc == '\r') || (cc == '\n'))
   873                         if (cc == '\n' && lastc == '\r')
   874                             ;   // do nothing, CR was already escaped
   875                         else
   876                             sb.append('\\');    // Escape the character
   877                     sb.append(cc);
   878                     lastc = cc;
   879                 }
   880                 sb.append('"');
   881                 return sb.toString();
   882             } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
   883                 // These characters cause the string to be quoted
   884                 needQuoting = true;
   885         }
   887         if (needQuoting) {
   888             StringBuffer sb = new StringBuffer(len + 2);
   889             sb.append('"').append(word).append('"');
   890             return sb.toString();
   891         } else
   892             return word;
   893     }
   895     /**
   896      * Fold a string at linear whitespace so that each line is no longer
   897      * than 76 characters, if possible.  If there are more than 76
   898      * non-whitespace characters consecutively, the string is folded at
   899      * the first whitespace after that sequence.  The parameter
   900      * <code>used</code> indicates how many characters have been used in
   901      * the current line; it is usually the length of the header name. <p>
   902      *
   903      * Note that line breaks in the string aren't escaped; they probably
   904      * should be.
   905      *
   906      * @param   used    characters used in line so far
   907      * @param   s       the string to fold
   908      * @return          the folded string
   909      */
   910     /*public*/ static String fold(int used, String s) {
   911         if (!foldText)
   912             return s;
   914         int end;
   915         char c;
   916         // Strip trailing spaces
   917         for (end = s.length() - 1; end >= 0; end--) {
   918             c = s.charAt(end);
   919             if (c != ' ' && c != '\t')
   920                 break;
   921         }
   922         if (end != s.length() - 1)
   923             s = s.substring(0, end + 1);
   925         // if the string fits now, just return it
   926         if (used + s.length() <= 76)
   927             return s;
   929         // have to actually fold the string
   930         StringBuffer sb = new StringBuffer(s.length() + 4);
   931         char lastc = 0;
   932         while (used + s.length() > 76) {
   933             int lastspace = -1;
   934             for (int i = 0; i < s.length(); i++) {
   935                 if (lastspace != -1 && used + i > 76)
   936                     break;
   937                 c = s.charAt(i);
   938                 if (c == ' ' || c == '\t')
   939                     if (!(lastc == ' ' || lastc == '\t'))
   940                         lastspace = i;
   941                 lastc = c;
   942             }
   943             if (lastspace == -1) {
   944                 // no space, use the whole thing
   945                 sb.append(s);
   946                 s = "";
   947                 used = 0;
   948                 break;
   949             }
   950             sb.append(s.substring(0, lastspace));
   951             sb.append("\r\n");
   952             lastc = s.charAt(lastspace);
   953             sb.append(lastc);
   954             s = s.substring(lastspace + 1);
   955             used = 1;
   956         }
   957         sb.append(s);
   958         return sb.toString();
   959     }
   961     /**
   962      * Unfold a folded header.  Any line breaks that aren't escaped and
   963      * are followed by whitespace are removed.
   964      *
   965      * @param   s       the string to unfold
   966      * @return          the unfolded string
   967      */
   968     /*public*/ static String unfold(String s) {
   969         if (!foldText)
   970             return s;
   972         StringBuffer sb = null;
   973         int i;
   974         while ((i = indexOfAny(s, "\r\n")) >= 0) {
   975             int start = i;
   976             int l = s.length();
   977             i++;                // skip CR or NL
   978             if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n')
   979                 i++;    // skip LF
   980             if (start == 0 || s.charAt(start - 1) != '\\') {
   981                 char c;
   982                 // if next line starts with whitespace, skip all of it
   983                 // XXX - always has to be true?
   984                 if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) {
   985                     i++;        // skip whitespace
   986                     while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t'))
   987                         i++;
   988                     if (sb == null)
   989                         sb = new StringBuffer(s.length());
   990                     if (start != 0) {
   991                         sb.append(s.substring(0, start));
   992                         sb.append(' ');
   993                     }
   994                     s = s.substring(i);
   995                     continue;
   996                 }
   997                 // it's not a continuation line, just leave it in
   998                 if (sb == null)
   999                     sb = new StringBuffer(s.length());
  1000                 sb.append(s.substring(0, i));
  1001                 s = s.substring(i);
  1002             } else {
  1003                 // there's a backslash at "start - 1"
  1004                 // strip it out, but leave in the line break
  1005                 if (sb == null)
  1006                     sb = new StringBuffer(s.length());
  1007                 sb.append(s.substring(0, start - 1));
  1008                 sb.append(s.substring(start, i));
  1009                 s = s.substring(i);
  1012         if (sb != null) {
  1013             sb.append(s);
  1014             return sb.toString();
  1015         } else
  1016             return s;
  1019     /**
  1020      * Return the first index of any of the characters in "any" in "s",
  1021      * or -1 if none are found.
  1023      * This should be a method on String.
  1024      */
  1025     private static int indexOfAny(String s, String any) {
  1026         return indexOfAny(s, any, 0);
  1029     private static int indexOfAny(String s, String any, int start) {
  1030         try {
  1031             int len = s.length();
  1032             for (int i = start; i < len; i++) {
  1033                 if (any.indexOf(s.charAt(i)) >= 0)
  1034                     return i;
  1036             return -1;
  1037         } catch (StringIndexOutOfBoundsException e) {
  1038             return -1;
  1042     /**
  1043      * Convert a MIME charset name into a valid Java charset name. <p>
  1045      * @param charset   the MIME charset name
  1046      * @return  the Java charset equivalent. If a suitable mapping is
  1047      *          not available, the passed in charset is itself returned.
  1048      */
  1049     public static String javaCharset(String charset) {
  1050         if (mime2java == null || charset == null)
  1051             // no mapping table, or charset parameter is null
  1052             return charset;
  1054         String alias = (String)mime2java.get(charset.toLowerCase());
  1055         return alias == null ? charset : alias;
  1058     /**
  1059      * Convert a java charset into its MIME charset name. <p>
  1061      * Note that a future version of JDK (post 1.2) might provide
  1062      * this functionality, in which case, we may deprecate this
  1063      * method then.
  1065      * @param   charset    the JDK charset
  1066      * @return          the MIME/IANA equivalent. If a mapping
  1067      *                  is not possible, the passed in charset itself
  1068      *                  is returned.
  1069      * @since           JavaMail 1.1
  1070      */
  1071     public static String mimeCharset(String charset) {
  1072         if (java2mime == null || charset == null)
  1073             // no mapping table or charset param is null
  1074             return charset;
  1076         String alias = (String)java2mime.get(charset.toLowerCase());
  1077         return alias == null ? charset : alias;
  1080     private static String defaultJavaCharset; // cached result of getDefaultJavaCharset(); null until first computed
  1081     private static String defaultMIMECharset; // cached result of getDefaultMIMECharset(); null until first computed
  1083     /**
  1084      * Get the default charset corresponding to the system's current
  1085      * default locale.  If the System property <code>mail.mime.charset</code>
  1086      * is set, a system charset corresponding to this MIME charset will be
  1087      * returned. <p>
  1089      * @return  the default charset of the system's default locale,
  1090      *          as a Java charset. (NOT a MIME charset)
  1091      * @since   JavaMail 1.1
  1092      */
  1093     public static String getDefaultJavaCharset() {
  1094         if (defaultJavaCharset == null) {
  1095             /*
  1096              * If mail.mime.charset is set, it controls the default
  1097              * Java charset as well.
  1098              */
  1099             String mimecs = null;
  1101             mimecs = SAAJUtil.getSystemProperty("mail.mime.charset");
  1103             if (mimecs != null && mimecs.length() > 0) {
  1104                 defaultJavaCharset = javaCharset(mimecs);
  1105                 return defaultJavaCharset;
  1108             try {
  1109                 defaultJavaCharset = System.getProperty("file.encoding",
  1110                                                         "8859_1");
  1111             } catch (SecurityException sex) {
  1113                 class NullInputStream extends InputStream {
  1114                     public int read() {
  1115                         return 0;
  1118                 InputStreamReader reader =
  1119                         new InputStreamReader(new NullInputStream());
  1120                 defaultJavaCharset = reader.getEncoding();
  1121                 if (defaultJavaCharset == null)
  1122                     defaultJavaCharset = "8859_1";
  1126         return defaultJavaCharset;
  1129     /*
  1130      * Get the default MIME charset for this locale.
  1131      */
  1132     static String getDefaultMIMECharset() {
  1133         if (defaultMIMECharset == null) {
  1134                 defaultMIMECharset = SAAJUtil.getSystemProperty("mail.mime.charset");
  1136         if (defaultMIMECharset == null)
  1137             defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
  1138         return defaultMIMECharset;
  1141     // Tables to map MIME charset names to Java names and vice versa.
  1142     // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
  1143     private static Hashtable mime2java;
  1144     private static Hashtable java2mime;
  1146     static {
  1147         java2mime = new Hashtable(40);
  1148         mime2java = new Hashtable(10);
  1150         try {
  1151             // Use this class's classloader to load the mapping file
  1152             // XXX - we should use SecuritySupport, but it's in another package
  1153             InputStream is =
  1154                     com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream(
  1155                     "/META-INF/javamail.charset.map");
  1157             if (is != null) {
  1158                 is = new LineInputStream(is);
  1160                 // Load the JDK-to-MIME charset mapping table
  1161                 loadMappings((LineInputStream)is, java2mime);
  1163                 // Load the MIME-to-JDK charset mapping table
  1164                 loadMappings((LineInputStream)is, mime2java);
  1166         } catch (Exception ex) { }
  1168         // If we didn't load the tables, e.g., because we didn't have
  1169         // permission, load them manually.  The entries here should be
  1170         // the same as the default javamail.charset.map.
  1171         if (java2mime.isEmpty()) {
  1172             java2mime.put("8859_1", "ISO-8859-1");
  1173             java2mime.put("iso8859_1", "ISO-8859-1");
  1174             java2mime.put("ISO8859-1", "ISO-8859-1");
  1176             java2mime.put("8859_2", "ISO-8859-2");
  1177             java2mime.put("iso8859_2", "ISO-8859-2");
  1178             java2mime.put("ISO8859-2", "ISO-8859-2");
  1180             java2mime.put("8859_3", "ISO-8859-3");
  1181             java2mime.put("iso8859_3", "ISO-8859-3");
  1182             java2mime.put("ISO8859-3", "ISO-8859-3");
  1184             java2mime.put("8859_4", "ISO-8859-4");
  1185             java2mime.put("iso8859_4", "ISO-8859-4");
  1186             java2mime.put("ISO8859-4", "ISO-8859-4");
  1188             java2mime.put("8859_5", "ISO-8859-5");
  1189             java2mime.put("iso8859_5", "ISO-8859-5");
  1190             java2mime.put("ISO8859-5", "ISO-8859-5");
  1192             java2mime.put("8859_6", "ISO-8859-6");
  1193             java2mime.put("iso8859_6", "ISO-8859-6");
  1194             java2mime.put("ISO8859-6", "ISO-8859-6");
  1196             java2mime.put("8859_7", "ISO-8859-7");
  1197             java2mime.put("iso8859_7", "ISO-8859-7");
  1198             java2mime.put("ISO8859-7", "ISO-8859-7");
  1200             java2mime.put("8859_8", "ISO-8859-8");
  1201             java2mime.put("iso8859_8", "ISO-8859-8");
  1202             java2mime.put("ISO8859-8", "ISO-8859-8");
  1204             java2mime.put("8859_9", "ISO-8859-9");
  1205             java2mime.put("iso8859_9", "ISO-8859-9");
  1206             java2mime.put("ISO8859-9", "ISO-8859-9");
  1208             java2mime.put("SJIS", "Shift_JIS");
  1209             java2mime.put("MS932", "Shift_JIS");
  1210             java2mime.put("JIS", "ISO-2022-JP");
  1211             java2mime.put("ISO2022JP", "ISO-2022-JP");
  1212             java2mime.put("EUC_JP", "euc-jp");
  1213             java2mime.put("KOI8_R", "koi8-r");
  1214             java2mime.put("EUC_CN", "euc-cn");
  1215             java2mime.put("EUC_TW", "euc-tw");
  1216             java2mime.put("EUC_KR", "euc-kr");
  1218         if (mime2java.isEmpty()) {
  1219             mime2java.put("iso-2022-cn", "ISO2022CN");
  1220             mime2java.put("iso-2022-kr", "ISO2022KR");
  1221             mime2java.put("utf-8", "UTF8");
  1222             mime2java.put("utf8", "UTF8");
  1223             mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
  1224             mime2java.put("ja_jp.eucjp", "EUCJIS");
  1225             mime2java.put("euc-kr", "KSC5601");
  1226             mime2java.put("euckr", "KSC5601");
  1227             mime2java.put("us-ascii", "ISO-8859-1");
  1228             mime2java.put("x-us-ascii", "ISO-8859-1");
  1232     private static void loadMappings(LineInputStream is, Hashtable table) {
  1233         String currLine;
  1235         while (true) {
  1236             try {
  1237                 currLine = is.readLine();
  1238             } catch (IOException ioex) {
  1239                 break; // error in reading, stop
  1242             if (currLine == null) // end of file, stop
  1243                 break;
  1244             if (currLine.startsWith("--") && currLine.endsWith("--"))
  1245                 // end of this table
  1246                 break;
  1248             // ignore empty lines and comments
  1249             if (currLine.trim().length() == 0 || currLine.startsWith("#"))
  1250                 continue;
  1252             // A valid entry is of the form <key><separator><value>
  1253             // where, <separator> := SPACE | HT. Parse this
  1254             StringTokenizer tk = new StringTokenizer(currLine, " \t");
  1255             try {
  1256                 String key = tk.nextToken();
  1257                 String value = tk.nextToken();
  1258                 table.put(key.toLowerCase(), value);
  1259             } catch (NoSuchElementException nex) { }
  1263     static final int ALL_ASCII          = 1; // checkAscii result: no non-ASCII character was counted
  1264     static final int MOSTLY_ASCII       = 2; // checkAscii result: more ASCII than non-ASCII characters
  1265     static final int MOSTLY_NONASCII    = 3; // checkAscii result: at least as many non-ASCII as ASCII characters
  1267     /**
  1268      * Check if the given string contains non US-ASCII characters.
  1269      * @param   s       string
  1270      * @return          ALL_ASCII if all characters in the string
  1271      *                  belong to the US-ASCII charset. MOSTLY_ASCII
  1272      *                  if more than half of the available characters
  1273      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
  1274      */
  1275     static int checkAscii(String s) {
  1276         int ascii = 0, non_ascii = 0;
  1277         int l = s.length();
  1279         for (int i = 0; i < l; i++) {
  1280             if (nonascii((int)s.charAt(i))) // non-ascii
  1281                 non_ascii++;
  1282             else
  1283                 ascii++;
  1286         if (non_ascii == 0)
  1287             return ALL_ASCII;
  1288         if (ascii > non_ascii)
  1289             return MOSTLY_ASCII;
  1291         return MOSTLY_NONASCII;
  1294     /**
  1295      * Check if the given byte array contains non US-ASCII characters.
  1296      * @param   b       byte array
  1297      * @return          ALL_ASCII if all characters in the string
  1298      *                  belong to the US-ASCII charset. MOSTLY_ASCII
  1299      *                  if more than half of the available characters
  1300      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
  1302      * XXX - this method is no longer used
  1303      */
  1304     static int checkAscii(byte[] b) {
  1305         int ascii = 0, non_ascii = 0;
  1307         for (int i=0; i < b.length; i++) {
  1308             // The '&' operator automatically causes b[i] to be promoted
  1309             // to an int, and we mask out the higher bytes in the int
  1310             // so that the resulting value is not a negative integer.
  1311             if (nonascii(b[i] & 0xff)) // non-ascii
  1312                 non_ascii++;
  1313             else
  1314                 ascii++;
  1317         if (non_ascii == 0)
  1318             return ALL_ASCII;
  1319         if (ascii > non_ascii)
  1320             return MOSTLY_ASCII;
  1322         return MOSTLY_NONASCII;
  1325     /**
  1326      * Check if the given input stream contains non US-ASCII characters.
  1327      * Upto <code>max</code> bytes are checked. If <code>max</code> is
  1328      * set to <code>ALL</code>, then all the bytes available in this
  1329      * input stream are checked. If <code>breakOnNonAscii</code> is true
  1330      * the check terminates when the first non-US-ASCII character is
  1331      * found and MOSTLY_NONASCII is returned. Else, the check continues
  1332      * till <code>max</code> bytes or till the end of stream.
  1334      * @param   is      the input stream
  1335      * @param   max     maximum bytes to check for. The special value
  1336      *                  ALL indicates that all the bytes in this input
  1337      *                  stream must be checked.
  1338      * @param   breakOnNonAscii if <code>true</code>, then terminate the
  1339      *                  the check when the first non-US-ASCII character
  1340      *                  is found.
  1341      * @return          ALL_ASCII if all characters in the string
  1342      *                  belong to the US-ASCII charset. MOSTLY_ASCII
  1343      *                  if more than half of the available characters
  1344      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
  1345      */
  1346     static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
  1347         int ascii = 0, non_ascii = 0;
  1348         int len;
  1349         int block = 4096;
  1350         int linelen = 0;
  1351         boolean longLine = false, badEOL = false;
  1352         boolean checkEOL = encodeEolStrict && breakOnNonAscii;
  1353         byte buf[] = null;
  1354         if (max != 0) {
  1355             block = (max == ALL) ? 4096 : Math.min(max, 4096);
  1356             buf = new byte[block];
  1358         while (max != 0) {
  1359             try {
  1360                 if ((len = is.read(buf, 0, block)) == -1)
  1361                     break;
  1362                 int lastb = 0;
  1363                 for (int i = 0; i < len; i++) {
  1364                     // The '&' operator automatically causes b[i] to
  1365                     // be promoted to an int, and we mask out the higher
  1366                     // bytes in the int so that the resulting value is
  1367                     // not a negative integer.
  1368                     int b = buf[i] & 0xff;
  1369                     if (checkEOL &&
  1370                             ((lastb == '\r' && b != '\n') ||
  1371                             (lastb != '\r' && b == '\n')))
  1372                         badEOL = true;
  1373                     if (b == '\r' || b == '\n')
  1374                         linelen = 0;
  1375                     else {
  1376                         linelen++;
  1377                         if (linelen > 998)      // 1000 - CRLF
  1378                             longLine = true;
  1380                     if (nonascii(b)) {  // non-ascii
  1381                         if (breakOnNonAscii) // we are done
  1382                             return MOSTLY_NONASCII;
  1383                         else
  1384                             non_ascii++;
  1385                     } else
  1386                         ascii++;
  1387                     lastb = b;
  1389             } catch (IOException ioex) {
  1390                 break;
  1392             if (max != ALL)
  1393                 max -= len;
  1396         if (max == 0 && breakOnNonAscii)
  1397             // We have been told to break on the first non-ascii character.
  1398             // We haven't got any non-ascii character yet, but then we
  1399             // have not checked all of the available bytes either. So we
  1400             // cannot say for sure that this input stream is ALL_ASCII,
  1401             // and hence we must play safe and return MOSTLY_NONASCII
  1403             return MOSTLY_NONASCII;
  1405         if (non_ascii == 0) { // no non-us-ascii characters so far
  1406             // If we're looking at non-text data, and we saw CR without LF
  1407             // or vice versa, consider this mostly non-ASCII so that it
  1408             // will be base64 encoded (since the quoted-printable encoder
  1409             // doesn't encode this case properly).
  1410             if (badEOL)
  1411                 return MOSTLY_NONASCII;
  1412             // if we've seen a long line, we degrade to mostly ascii
  1413             else if (longLine)
  1414                 return MOSTLY_ASCII;
  1415             else
  1416                 return ALL_ASCII;
  1418         if (ascii > non_ascii) // mostly ascii
  1419             return MOSTLY_ASCII;
  1420         return MOSTLY_NONASCII;
  1423     static final boolean nonascii(int b) {
  1424         return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
  1428 /**
  1429  * An OutputStream that determines whether the data written to
  1430  * it is all ASCII, mostly ASCII, or mostly non-ASCII.
  1431  */
  1432 class AsciiOutputStream extends OutputStream {
  1433     private boolean breakOnNonAscii;
  1434     private int ascii = 0, non_ascii = 0;
  1435     private int linelen = 0;
  1436     private boolean longLine = false;
  1437     private boolean badEOL = false;
  1438     private boolean checkEOL = false;
  1439     private int lastb = 0;
  1440     private int ret = 0;
  1442     public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
  1443         this.breakOnNonAscii = breakOnNonAscii;
  1444         checkEOL = encodeEolStrict && breakOnNonAscii;
  1447     public void write(int b) throws IOException {
  1448         check(b);
  1451     public void write(byte b[]) throws IOException {
  1452         write(b, 0, b.length);
  1455     public void write(byte b[], int off, int len) throws IOException {
  1456         len += off;
  1457         for (int i = off; i < len ; i++)
  1458             check(b[i]);
  1461     private final void check(int b) throws IOException {
  1462         b &= 0xff;
  1463         if (checkEOL &&
  1464                 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
  1465             badEOL = true;
  1466         if (b == '\r' || b == '\n')
  1467             linelen = 0;
  1468         else {
  1469             linelen++;
  1470             if (linelen > 998)  // 1000 - CRLF
  1471                 longLine = true;
  1473         if (MimeUtility.nonascii(b)) { // non-ascii
  1474             non_ascii++;
  1475             if (breakOnNonAscii) {      // we are done
  1476                 ret = MimeUtility.MOSTLY_NONASCII;
  1477                 throw new EOFException();
  1479         } else
  1480             ascii++;
  1481         lastb = b;
  1484     /**
  1485      * Return ASCII-ness of data stream.
  1486      */
  1487     public int getAscii() {
  1488         if (ret != 0)
  1489             return ret;
  1490         // If we're looking at non-text data, and we saw CR without LF
  1491         // or vice versa, consider this mostly non-ASCII so that it
  1492         // will be base64 encoded (since the quoted-printable encoder
  1493         // doesn't encode this case properly).
  1494         if (badEOL)
  1495             return MimeUtility.MOSTLY_NONASCII;
  1496         else if (non_ascii == 0) { // no non-us-ascii characters so far
  1497             // if we've seen a long line, we degrade to mostly ascii
  1498             if (longLine)
  1499                 return MimeUtility.MOSTLY_ASCII;
  1500             else
  1501                 return MimeUtility.ALL_ASCII;
  1503         if (ascii > non_ascii) // mostly ascii
  1504             return MimeUtility.MOSTLY_ASCII;
  1505         return MimeUtility.MOSTLY_NONASCII;

mercurial