src/share/jaxws_classes/com/sun/xml/internal/messaging/saaj/packaging/mime/internet/MimeUtility.java

Thu, 31 Aug 2017 15:18:52 +0800

author
aoqi
date
Thu, 31 Aug 2017 15:18:52 +0800
changeset 637
9c07ef4934dd
parent 368
0989ad8c0860
parent 0
373ffda63c9a
permissions
-rw-r--r--

merge

aoqi@0 1 /*
aoqi@0 2 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
aoqi@0 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
aoqi@0 4 *
aoqi@0 5 * This code is free software; you can redistribute it and/or modify it
aoqi@0 6 * under the terms of the GNU General Public License version 2 only, as
aoqi@0 7 * published by the Free Software Foundation. Oracle designates this
aoqi@0 8 * particular file as subject to the "Classpath" exception as provided
aoqi@0 9 * by Oracle in the LICENSE file that accompanied this code.
aoqi@0 10 *
aoqi@0 11 * This code is distributed in the hope that it will be useful, but WITHOUT
aoqi@0 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
aoqi@0 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
aoqi@0 14 * version 2 for more details (a copy is included in the LICENSE file that
aoqi@0 15 * accompanied this code).
aoqi@0 16 *
aoqi@0 17 * You should have received a copy of the GNU General Public License version
aoqi@0 18 * 2 along with this work; if not, write to the Free Software Foundation,
aoqi@0 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
aoqi@0 20 *
aoqi@0 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
aoqi@0 22 * or visit www.oracle.com if you need additional information or have any
aoqi@0 23 * questions.
aoqi@0 24 */
aoqi@0 25
aoqi@0 26 /*
aoqi@0 27 * @(#)MimeUtility.java 1.45 03/03/10
aoqi@0 28 */
aoqi@0 29
aoqi@0 30
aoqi@0 31
aoqi@0 32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
aoqi@0 33
aoqi@0 34 import java.io.*;
aoqi@0 35 import java.util.*;
aoqi@0 36
aoqi@0 37 import javax.activation.DataHandler;
aoqi@0 38 import javax.activation.DataSource;
aoqi@0 39
aoqi@0 40 import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException;
aoqi@0 41 import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*;
aoqi@0 42 import com.sun.xml.internal.messaging.saaj.util.SAAJUtil;
aoqi@0 43
aoqi@0 44 /**
aoqi@0 45 * This is a utility class that provides various MIME related
aoqi@0 46 * functionality. <p>
aoqi@0 47 *
aoqi@0 48 * There are a set of methods to encode and decode MIME headers as
aoqi@0 49 * per RFC 2047. A brief description on handling such headers is
aoqi@0 50 * given below: <p>
aoqi@0 51 *
aoqi@0 52 * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
aoqi@0 53 * characters. Headers that contain non US-ASCII characters must be
aoqi@0 54 * encoded so that they contain only US-ASCII characters. Basically,
aoqi@0 55 * this process involves using either BASE64 or QP to encode certain
aoqi@0 56 * characters. RFC 2047 describes this in detail. <p>
aoqi@0 57 *
aoqi@0 58 * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
aoqi@0 59 * subset of Unicode (and occupies the range 0 - 127). A String
aoqi@0 60 * that contains only ASCII characters is already mail-safe. If the
aoqi@0 61 * String contains non US-ASCII characters, it must be encoded. An
aoqi@0 62 * additional complexity in this step is that since Unicode is not
aoqi@0 63 * yet a widely used charset, one might want to first charset-encode
aoqi@0 64 * the String into another charset and then do the transfer-encoding.
aoqi@0 65 * <p>
aoqi@0 66 * Note that to get the actual bytes of a mail-safe String (say,
aoqi@0 67 * for sending over SMTP), one must do
aoqi@0 68 * <p><blockquote><pre>
aoqi@0 69 *
aoqi@0 70 * byte[] bytes = string.getBytes("iso-8859-1");
aoqi@0 71 *
aoqi@0 72 * </pre></blockquote><p>
aoqi@0 73 *
aoqi@0 74 * The <code>setHeader</code> and <code>addHeader</code> methods
aoqi@0 75 * on MimeMessage and MimeBodyPart assume that the given header values
aoqi@0 76 * are Unicode strings that contain only US-ASCII characters. Hence
aoqi@0 77 * the callers of those methods must ensure that the values they pass
aoqi@0 78 * do not contain non US-ASCII characters. The methods in this class
aoqi@0 79 * help do this. <p>
aoqi@0 80 *
aoqi@0 81 * The <code>getHeader</code> family of methods on MimeMessage and
aoqi@0 82 * MimeBodyPart return the raw header value. These might be encoded
aoqi@0 83 * as per RFC 2047, and if so, must be decoded into Unicode Strings.
aoqi@0 84 * The methods in this class help to do this. <p>
aoqi@0 85 *
aoqi@0 86 * Several System properties control strict conformance to the MIME
aoqi@0 87 * spec. Note that these are not session properties but must be set
aoqi@0 88 * globally as System properties. <p>
aoqi@0 89 *
aoqi@0 90 * The <code>mail.mime.decodetext.strict</code> property controls
aoqi@0 91 * decoding of MIME encoded words. The MIME spec requires that encoded
aoqi@0 92 * words start at the beginning of a whitespace separated word. Some
aoqi@0 93 * mailers incorrectly include encoded words in the middle of a word.
aoqi@0 94 * If the <code>mail.mime.decodetext.strict</code> System property is
aoqi@0 95 * set to <code>"false"</code>, an attempt will be made to decode these
aoqi@0 96 * illegal encoded words. The default is true. <p>
aoqi@0 97 *
aoqi@0 98 * The <code>mail.mime.encodeeol.strict</code> property controls the
aoqi@0 99 * choice of Content-Transfer-Encoding for MIME parts that are not of
aoqi@0 100 * type "text". Often such parts will contain textual data for which
aoqi@0 101 * an encoding that allows normal end of line conventions is appropriate.
aoqi@0 102 * In rare cases, such a part will appear to contain entirely textual
aoqi@0 103 * data, but will require an encoding that preserves CR and LF characters
aoqi@0 104 * without change. If the <code>mail.mime.encodeeol.strict</code>
aoqi@0 105 * System property is set to <code>"true"</code>, such an encoding will
aoqi@0 106 * be used when necessary. The default is false. <p>
aoqi@0 107 *
aoqi@0 108 * In addition, the <code>mail.mime.charset</code> System property can
aoqi@0 109 * be used to specify the default MIME charset to use for encoded words
aoqi@0 110 * and text parts that don't otherwise specify a charset. Normally, the
aoqi@0 111 * default MIME charset is derived from the default Java charset, as
aoqi@0 112 * specified in the <code>file.encoding</code> System property. Most
aoqi@0 113 * applications will have no need to explicitly set the default MIME
aoqi@0 114 * charset. In cases where the default MIME charset to be used for
aoqi@0 115 * mail messages is different than the charset used for files stored on
aoqi@0 116 * the system, this property should be set.
aoqi@0 117 *
aoqi@0 118 * @version 1.45, 03/03/10
aoqi@0 119 * @author John Mani
aoqi@0 120 * @author Bill Shannon
aoqi@0 121 */
aoqi@0 122
aoqi@0 123 public class MimeUtility {
aoqi@0 124
// Utility holder: every member is static, so construction is disabled.
private MimeUtility() { }

// Passed by getEncoding(DataSource) as the limit argument of checkAscii.
// NOTE(review): presumably means "no limit, scan everything" -- confirm
// against checkAscii, which is defined further down in this file.
public static final int ALL = -1;

// Initial capacity of the in-memory buffer used while encoding one word.
private static final int BUFFER_SIZE = 1024;

// Behaviour switches. The initializers are the defaults; the static block
// below overrides them from System properties when those can be read.
private static boolean decodeStrict = true;
private static boolean encodeEolStrict = false;
private static boolean foldEncodedWords = false;
private static boolean foldText = true;

static {
    try {
        String value;

        // on unless the property is explicitly "false"
        value = SAAJUtil.getSystemProperty("mail.mime.decodetext.strict");
        decodeStrict = !"false".equalsIgnoreCase(value);

        // off unless the property is explicitly "true"
        value = SAAJUtil.getSystemProperty("mail.mime.encodeeol.strict");
        encodeEolStrict = "true".equalsIgnoreCase(value);

        // off unless the property is explicitly "true"
        value = SAAJUtil.getSystemProperty("mail.mime.foldencodedwords");
        foldEncodedWords = "true".equalsIgnoreCase(value);

        // on unless the property is explicitly "false"
        value = SAAJUtil.getSystemProperty("mail.mime.foldtext");
        foldText = !"false".equalsIgnoreCase(value);
    } catch (SecurityException sex) {
        // Not permitted to read System properties: keep whatever has been
        // set so far (the defaults for every property not yet processed).
    }
}
aoqi@0 154
aoqi@0 155
aoqi@0 156 /**
aoqi@0 157 * Get the content-transfer-encoding that should be applied
aoqi@0 158 * to the input stream of this datasource, to make it mailsafe. <p>
aoqi@0 159 *
aoqi@0 160 * The algorithm used here is: <br>
aoqi@0 161 * <ul>
aoqi@0 162 * <li>
aoqi@0 163 * If the primary type of this datasource is "text" and if all
aoqi@0 164 * the bytes in its input stream are US-ASCII, then the encoding
aoqi@0 165 * is "7bit". If more than half of the bytes are non-US-ASCII, then
aoqi@0 166 * the encoding is "base64". If less than half of the bytes are
aoqi@0 167 * non-US-ASCII, then the encoding is "quoted-printable".
aoqi@0 168 * <li>
aoqi@0 169 * If the primary type of this datasource is not "text", then if
aoqi@0 170 * all the bytes of its input stream are US-ASCII, the encoding
aoqi@0 171 * is "7bit". If there is even one non-US-ASCII character, the
aoqi@0 172 * encoding is "base64".
aoqi@0 173 * </ul>
aoqi@0 174 *
aoqi@0 175 * @param ds DataSource
aoqi@0 176 * @return the encoding. This is either "7bit",
aoqi@0 177 * "quoted-printable" or "base64"
aoqi@0 178 */
aoqi@0 179 public static String getEncoding(DataSource ds) {
aoqi@0 180 ContentType cType = null;
aoqi@0 181 InputStream is = null;
aoqi@0 182 String encoding = null;
aoqi@0 183
aoqi@0 184 try {
aoqi@0 185 cType = new ContentType(ds.getContentType());
aoqi@0 186 is = ds.getInputStream();
aoqi@0 187 } catch (Exception ex) {
aoqi@0 188 return "base64"; // what else ?!
aoqi@0 189 }
aoqi@0 190
aoqi@0 191 boolean isText = cType.match("text/*");
aoqi@0 192 // if not text, stop processing when we see non-ASCII
aoqi@0 193 int i = checkAscii(is, ALL, !isText);
aoqi@0 194 switch (i) {
aoqi@0 195 case ALL_ASCII:
aoqi@0 196 encoding = "7bit"; // all ascii
aoqi@0 197 break;
aoqi@0 198 case MOSTLY_ASCII:
aoqi@0 199 encoding = "quoted-printable"; // mostly ascii
aoqi@0 200 break;
aoqi@0 201 default:
aoqi@0 202 encoding = "base64"; // mostly binary
aoqi@0 203 break;
aoqi@0 204 }
aoqi@0 205
aoqi@0 206 // Close the input stream
aoqi@0 207 try {
aoqi@0 208 is.close();
aoqi@0 209 } catch (IOException ioex) { }
aoqi@0 210
aoqi@0 211 return encoding;
aoqi@0 212 }
aoqi@0 213
aoqi@0 214 /**
aoqi@0 215 * Same as <code>getEncoding(DataSource)</code> except that instead
aoqi@0 216 * of reading the data from an <code>InputStream</code> it uses the
aoqi@0 217 * <code>writeTo</code> method to examine the data. This is more
aoqi@0 218 * efficient in the common case of a <code>DataHandler</code>
aoqi@0 219 * created with an object and a MIME type (for example, a
aoqi@0 220 * "text/plain" String) because all the I/O is done in this
aoqi@0 221 * thread. In the case requiring an <code>InputStream</code> the
aoqi@0 222 * <code>DataHandler</code> uses a thread, a pair of pipe streams,
aoqi@0 223 * and the <code>writeTo</code> method to produce the data. <p>
aoqi@0 224 *
aoqi@0 225 * @since JavaMail 1.2
aoqi@0 226 */
aoqi@0 227 public static String getEncoding(DataHandler dh) {
aoqi@0 228 ContentType cType = null;
aoqi@0 229 String encoding = null;
aoqi@0 230
aoqi@0 231 /*
aoqi@0 232 * Try to pick the most efficient means of determining the
aoqi@0 233 * encoding. If this DataHandler was created using a DataSource,
aoqi@0 234 * the getEncoding(DataSource) method is typically faster. If
aoqi@0 235 * the DataHandler was created with an object, this method is
aoqi@0 236 * much faster. To distinguish the two cases, we use a heuristic.
aoqi@0 237 * A DataHandler created with an object will always have a null name.
aoqi@0 238 * A DataHandler created with a DataSource will usually have a
aoqi@0 239 * non-null name.
aoqi@0 240 *
aoqi@0 241 * XXX - This is actually quite a disgusting hack, but it makes
aoqi@0 242 * a common case run over twice as fast.
aoqi@0 243 */
aoqi@0 244 if (dh.getName() != null)
aoqi@0 245 return getEncoding(dh.getDataSource());
aoqi@0 246
aoqi@0 247 try {
aoqi@0 248 cType = new ContentType(dh.getContentType());
aoqi@0 249 } catch (Exception ex) {
aoqi@0 250 return "base64"; // what else ?!
aoqi@0 251 }
aoqi@0 252
aoqi@0 253 if (cType.match("text/*")) {
aoqi@0 254 // Check all of the available bytes
aoqi@0 255 AsciiOutputStream aos = new AsciiOutputStream(false, false);
aoqi@0 256 try {
aoqi@0 257 dh.writeTo(aos);
aoqi@0 258 } catch (IOException ex) { } // ignore it
aoqi@0 259 switch (aos.getAscii()) {
aoqi@0 260 case ALL_ASCII:
aoqi@0 261 encoding = "7bit"; // all ascii
aoqi@0 262 break;
aoqi@0 263 case MOSTLY_ASCII:
aoqi@0 264 encoding = "quoted-printable"; // mostly ascii
aoqi@0 265 break;
aoqi@0 266 default:
aoqi@0 267 encoding = "base64"; // mostly binary
aoqi@0 268 break;
aoqi@0 269 }
aoqi@0 270 } else { // not "text"
aoqi@0 271 // Check all of available bytes, break out if we find
aoqi@0 272 // at least one non-US-ASCII character
aoqi@0 273 AsciiOutputStream aos =
aoqi@0 274 new AsciiOutputStream(true, encodeEolStrict);
aoqi@0 275 try {
aoqi@0 276 dh.writeTo(aos);
aoqi@0 277 } catch (IOException ex) { } // ignore it
aoqi@0 278 if (aos.getAscii() == ALL_ASCII) // all ascii
aoqi@0 279 encoding = "7bit";
aoqi@0 280 else // found atleast one non-ascii character, use b64
aoqi@0 281 encoding = "base64";
aoqi@0 282 }
aoqi@0 283
aoqi@0 284 return encoding;
aoqi@0 285 }
aoqi@0 286
aoqi@0 287 /**
aoqi@0 288 * Decode the given input stream. The Input stream returned is
aoqi@0 289 * the decoded input stream. All the encodings defined in RFC 2045
aoqi@0 290 * are supported here. They include "base64", "quoted-printable",
aoqi@0 291 * "7bit", "8bit", and "binary". In addition, "uuencode" is also
aoqi@0 292 * supported.
aoqi@0 293 *
aoqi@0 294 * @param is input stream
aoqi@0 295 * @param encoding the encoding of the stream.
aoqi@0 296 * @return decoded input stream.
aoqi@0 297 */
aoqi@0 298 public static InputStream decode(InputStream is, String encoding)
aoqi@0 299 throws MessagingException {
aoqi@0 300 if (encoding.equalsIgnoreCase("base64"))
aoqi@0 301 return new BASE64DecoderStream(is);
aoqi@0 302 else if (encoding.equalsIgnoreCase("quoted-printable"))
aoqi@0 303 return new QPDecoderStream(is);
aoqi@0 304 else if (encoding.equalsIgnoreCase("uuencode") ||
aoqi@0 305 encoding.equalsIgnoreCase("x-uuencode") ||
aoqi@0 306 encoding.equalsIgnoreCase("x-uue"))
aoqi@0 307 return new UUDecoderStream(is);
aoqi@0 308 else if (encoding.equalsIgnoreCase("binary") ||
aoqi@0 309 encoding.equalsIgnoreCase("7bit") ||
aoqi@0 310 encoding.equalsIgnoreCase("8bit"))
aoqi@0 311 return is;
aoqi@0 312 else
aoqi@0 313 throw new MessagingException("Unknown encoding: " + encoding);
aoqi@0 314 }
aoqi@0 315
aoqi@0 316 /**
aoqi@0 317 * Wrap an encoder around the given output stream.
aoqi@0 318 * All the encodings defined in RFC 2045 are supported here.
aoqi@0 319 * They include "base64", "quoted-printable", "7bit", "8bit" and
aoqi@0 320 * "binary". In addition, "uuencode" is also supported.
aoqi@0 321 *
aoqi@0 322 * @param os output stream
aoqi@0 323 * @param encoding the encoding of the stream.
aoqi@0 324 * @return output stream that applies the
aoqi@0 325 * specified encoding.
aoqi@0 326 */
aoqi@0 327 public static OutputStream encode(OutputStream os, String encoding)
aoqi@0 328 throws MessagingException {
aoqi@0 329 if (encoding == null)
aoqi@0 330 return os;
aoqi@0 331 else if (encoding.equalsIgnoreCase("base64"))
aoqi@0 332 return new BASE64EncoderStream(os);
aoqi@0 333 else if (encoding.equalsIgnoreCase("quoted-printable"))
aoqi@0 334 return new QPEncoderStream(os);
aoqi@0 335 else if (encoding.equalsIgnoreCase("uuencode") ||
aoqi@0 336 encoding.equalsIgnoreCase("x-uuencode") ||
aoqi@0 337 encoding.equalsIgnoreCase("x-uue"))
aoqi@0 338 return new UUEncoderStream(os);
aoqi@0 339 else if (encoding.equalsIgnoreCase("binary") ||
aoqi@0 340 encoding.equalsIgnoreCase("7bit") ||
aoqi@0 341 encoding.equalsIgnoreCase("8bit"))
aoqi@0 342 return os;
aoqi@0 343 else
aoqi@0 344 throw new MessagingException("Unknown encoding: " +encoding);
aoqi@0 345 }
aoqi@0 346
aoqi@0 347 /**
aoqi@0 348 * Wrap an encoder around the given output stream.
aoqi@0 349 * All the encodings defined in RFC 2045 are supported here.
aoqi@0 350 * They include "base64", "quoted-printable", "7bit", "8bit" and
aoqi@0 351 * "binary". In addition, "uuencode" is also supported.
aoqi@0 352 * The <code>filename</code> parameter is used with the "uuencode"
aoqi@0 353 * encoding and is included in the encoded output.
aoqi@0 354 *
aoqi@0 355 * @param os output stream
aoqi@0 356 * @param encoding the encoding of the stream.
aoqi@0 357 * @param filename name for the file being encoded (only used
aoqi@0 358 * with uuencode)
aoqi@0 359 * @return output stream that applies the
aoqi@0 360 * specified encoding.
aoqi@0 361 * @since JavaMail 1.2
aoqi@0 362 */
aoqi@0 363 public static OutputStream encode(OutputStream os, String encoding,
aoqi@0 364 String filename)
aoqi@0 365 throws MessagingException {
aoqi@0 366 if (encoding == null)
aoqi@0 367 return os;
aoqi@0 368 else if (encoding.equalsIgnoreCase("base64"))
aoqi@0 369 return new BASE64EncoderStream(os);
aoqi@0 370 else if (encoding.equalsIgnoreCase("quoted-printable"))
aoqi@0 371 return new QPEncoderStream(os);
aoqi@0 372 else if (encoding.equalsIgnoreCase("uuencode") ||
aoqi@0 373 encoding.equalsIgnoreCase("x-uuencode") ||
aoqi@0 374 encoding.equalsIgnoreCase("x-uue"))
aoqi@0 375 return new UUEncoderStream(os, filename);
aoqi@0 376 else if (encoding.equalsIgnoreCase("binary") ||
aoqi@0 377 encoding.equalsIgnoreCase("7bit") ||
aoqi@0 378 encoding.equalsIgnoreCase("8bit"))
aoqi@0 379 return os;
aoqi@0 380 else
aoqi@0 381 throw new MessagingException("Unknown encoding: " +encoding);
aoqi@0 382 }
aoqi@0 383
aoqi@0 384 /**
aoqi@0 385 * Encode a RFC 822 "text" token into mail-safe form as per
aoqi@0 386 * RFC 2047. <p>
aoqi@0 387 *
aoqi@0 388 * The given Unicode string is examined for non US-ASCII
aoqi@0 389 * characters. If the string contains only US-ASCII characters,
aoqi@0 390 * it is returned as-is. If the string contains non US-ASCII
aoqi@0 391 * characters, it is first character-encoded using the platform's
aoqi@0 392 * default charset, then transfer-encoded using either the B or
aoqi@0 393 * Q encoding. The resulting bytes are then returned as a Unicode
aoqi@0 394 * string containing only ASCII characters. <p>
aoqi@0 395 *
aoqi@0 396 * Note that this method should be used to encode only
aoqi@0 397 * "unstructured" RFC 822 headers. <p>
aoqi@0 398 *
aoqi@0 399 * Example of usage:
aoqi@0 400 * <p><blockquote><pre>
aoqi@0 401 *
aoqi@0 402 * MimeBodyPart part = ...
aoqi@0 403 * String rawvalue = "FooBar Mailer, Japanese version 1.1"
aoqi@0 404 * try {
aoqi@0 405 * // If we know for sure that rawvalue contains only US-ASCII
aoqi@0 406 * // characters, we can skip the encoding part
aoqi@0 407 * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
aoqi@0 408 * } catch (UnsupportedEncodingException e) {
aoqi@0 409 * // encoding failure
aoqi@0 410 * } catch (MessagingException me) {
aoqi@0 411 * // setHeader() failure
aoqi@0 412 * }
aoqi@0 413 *
aoqi@0 414 * </pre></blockquote><p>
aoqi@0 415 *
aoqi@0 416 * @param text unicode string
aoqi@0 417 * @return Unicode string containing only US-ASCII characters
aoqi@0 418 * @exception UnsupportedEncodingException if the encoding fails
aoqi@0 419 */
aoqi@0 420 public static String encodeText(String text)
aoqi@0 421 throws UnsupportedEncodingException {
aoqi@0 422 return encodeText(text, null, null);
aoqi@0 423 }
aoqi@0 424
aoqi@0 425 /**
aoqi@0 426 * Encode a RFC 822 "text" token into mail-safe form as per
aoqi@0 427 * RFC 2047. <p>
aoqi@0 428 *
aoqi@0 429 * The given Unicode string is examined for non US-ASCII
aoqi@0 430 * characters. If the string contains only US-ASCII characters,
aoqi@0 431 * it is returned as-is. If the string contains non US-ASCII
aoqi@0 432 * characters, it is first character-encoded using the specified
aoqi@0 433 * charset, then transfer-encoded using either the B or Q encoding.
aoqi@0 434 * The resulting bytes are then returned as a Unicode string
aoqi@0 435 * containing only ASCII characters. <p>
aoqi@0 436 *
aoqi@0 437 * Note that this method should be used to encode only
aoqi@0 438 * "unstructured" RFC 822 headers.
aoqi@0 439 *
aoqi@0 440 * @param text the header value
aoqi@0 441 * @param charset the charset. If this parameter is null, the
aoqi@0 442 * platform's default chatset is used.
aoqi@0 443 * @param encoding the encoding to be used. Currently supported
aoqi@0 444 * values are "B" and "Q". If this parameter is null, then
aoqi@0 445 * the "Q" encoding is used if most of characters to be
aoqi@0 446 * encoded are in the ASCII charset, otherwise "B" encoding
aoqi@0 447 * is used.
aoqi@0 448 * @return Unicode string containing only US-ASCII characters
aoqi@0 449 */
aoqi@0 450 public static String encodeText(String text, String charset,
aoqi@0 451 String encoding)
aoqi@0 452 throws UnsupportedEncodingException {
aoqi@0 453 return encodeWord(text, charset, encoding, false);
aoqi@0 454 }
aoqi@0 455
aoqi@0 456 /**
aoqi@0 457 * Decode "unstructured" headers, that is, headers that are defined
aoqi@0 458 * as '*text' as per RFC 822. <p>
aoqi@0 459 *
aoqi@0 460 * The string is decoded using the algorithm specified in
aoqi@0 461 * RFC 2047, Section 6.1.1. If the charset-conversion fails
aoqi@0 462 * for any sequence, an UnsupportedEncodingException is thrown.
aoqi@0 463 * If the String is not an RFC 2047 style encoded header, it is
aoqi@0 464 * returned as-is <p>
aoqi@0 465 *
aoqi@0 466 * Example of usage:
aoqi@0 467 * <p><blockquote><pre>
aoqi@0 468 *
aoqi@0 469 * MimeBodyPart part = ...
aoqi@0 470 * String rawvalue = null;
aoqi@0 471 * String value = null;
aoqi@0 472 * try {
aoqi@0 473 * if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
aoqi@0 474 * value = MimeUtility.decodeText(rawvalue);
aoqi@0 475 * } catch (UnsupportedEncodingException e) {
aoqi@0 476 * // Don't care
aoqi@0 477 * value = rawvalue;
aoqi@0 478 * } catch (MessagingException me) { }
aoqi@0 479 *
aoqi@0 480 * return value;
aoqi@0 481 *
aoqi@0 482 * </pre></blockquote><p>
aoqi@0 483 *
aoqi@0 484 * @param etext the possibly encoded value
aoqi@0 485 * @exception UnsupportedEncodingException if the charset
aoqi@0 486 * conversion failed.
aoqi@0 487 */
aoqi@0 488 public static String decodeText(String etext)
aoqi@0 489 throws UnsupportedEncodingException {
aoqi@0 490 /*
aoqi@0 491 * We look for sequences separated by "linear-white-space".
aoqi@0 492 * (as per RFC 2047, Section 6.1.1)
aoqi@0 493 * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
aoqi@0 494 */
aoqi@0 495 String lwsp = " \t\n\r";
aoqi@0 496 StringTokenizer st;
aoqi@0 497
aoqi@0 498 /*
aoqi@0 499 * First, lets do a quick run thru the string and check
aoqi@0 500 * whether the sequence "=?" exists at all. If none exists,
aoqi@0 501 * we know there are no encoded-words in here and we can just
aoqi@0 502 * return the string as-is, without suffering thru the later
aoqi@0 503 * decoding logic.
aoqi@0 504 * This handles the most common case of unencoded headers
aoqi@0 505 * efficiently.
aoqi@0 506 */
aoqi@0 507 if (etext.indexOf("=?") == -1)
aoqi@0 508 return etext;
aoqi@0 509
aoqi@0 510 // Encoded words found. Start decoding ...
aoqi@0 511
aoqi@0 512 st = new StringTokenizer(etext, lwsp, true);
aoqi@0 513 StringBuffer sb = new StringBuffer(); // decode buffer
aoqi@0 514 StringBuffer wsb = new StringBuffer(); // white space buffer
aoqi@0 515 boolean prevWasEncoded = false;
aoqi@0 516
aoqi@0 517 while (st.hasMoreTokens()) {
aoqi@0 518 char c;
aoqi@0 519 String s = st.nextToken();
aoqi@0 520 // If whitespace, append it to the whitespace buffer
aoqi@0 521 if (((c = s.charAt(0)) == ' ') || (c == '\t') ||
aoqi@0 522 (c == '\r') || (c == '\n'))
aoqi@0 523 wsb.append(c);
aoqi@0 524 else {
aoqi@0 525 // Check if token is an 'encoded-word' ..
aoqi@0 526 String word;
aoqi@0 527 try {
aoqi@0 528 word = decodeWord(s);
aoqi@0 529 // Yes, this IS an 'encoded-word'.
aoqi@0 530 if (!prevWasEncoded && wsb.length() > 0) {
aoqi@0 531 // if the previous word was also encoded, we
aoqi@0 532 // should ignore the collected whitespace. Else
aoqi@0 533 // we include the whitespace as well.
aoqi@0 534 sb.append(wsb);
aoqi@0 535 }
aoqi@0 536 prevWasEncoded = true;
aoqi@0 537 } catch (ParseException pex) {
aoqi@0 538 // This is NOT an 'encoded-word'.
aoqi@0 539 word = s;
aoqi@0 540 // possibly decode inner encoded words
aoqi@0 541 if (!decodeStrict)
aoqi@0 542 word = decodeInnerWords(word);
aoqi@0 543 // include colleced whitespace ..
aoqi@0 544 if (wsb.length() > 0)
aoqi@0 545 sb.append(wsb);
aoqi@0 546 prevWasEncoded = false;
aoqi@0 547 }
aoqi@0 548 sb.append(word); // append the actual word
aoqi@0 549 wsb.setLength(0); // reset wsb for reuse
aoqi@0 550 }
aoqi@0 551 }
aoqi@0 552 return sb.toString();
aoqi@0 553 }
aoqi@0 554
aoqi@0 555 /**
aoqi@0 556 * Encode a RFC 822 "word" token into mail-safe form as per
aoqi@0 557 * RFC 2047. <p>
aoqi@0 558 *
aoqi@0 559 * The given Unicode string is examined for non US-ASCII
aoqi@0 560 * characters. If the string contains only US-ASCII characters,
aoqi@0 561 * it is returned as-is. If the string contains non US-ASCII
aoqi@0 562 * characters, it is first character-encoded using the platform's
aoqi@0 563 * default charset, then transfer-encoded using either the B or
aoqi@0 564 * Q encoding. The resulting bytes are then returned as a Unicode
aoqi@0 565 * string containing only ASCII characters. <p>
aoqi@0 566 *
aoqi@0 567 * This method is meant to be used when creating RFC 822 "phrases".
aoqi@0 568 * The InternetAddress class, for example, uses this to encode
aoqi@0 569 * it's 'phrase' component.
aoqi@0 570 *
aoqi@0 571 * @param text unicode string
aoqi@0 572 * @return Array of Unicode strings containing only US-ASCII
aoqi@0 573 * characters.
aoqi@0 574 * @exception UnsupportedEncodingException if the encoding fails
aoqi@0 575 */
aoqi@0 576 public static String encodeWord(String word)
aoqi@0 577 throws UnsupportedEncodingException {
aoqi@0 578 return encodeWord(word, null, null);
aoqi@0 579 }
aoqi@0 580
aoqi@0 581 /**
aoqi@0 582 * Encode a RFC 822 "word" token into mail-safe form as per
aoqi@0 583 * RFC 2047. <p>
aoqi@0 584 *
aoqi@0 585 * The given Unicode string is examined for non US-ASCII
aoqi@0 586 * characters. If the string contains only US-ASCII characters,
aoqi@0 587 * it is returned as-is. If the string contains non US-ASCII
aoqi@0 588 * characters, it is first character-encoded using the specified
aoqi@0 589 * charset, then transfer-encoded using either the B or Q encoding.
aoqi@0 590 * The resulting bytes are then returned as a Unicode string
aoqi@0 591 * containing only ASCII characters. <p>
aoqi@0 592 *
aoqi@0 593 * @param text unicode string
aoqi@0 594 * @param charset the MIME charset
aoqi@0 595 * @param encoding the encoding to be used. Currently supported
aoqi@0 596 * values are "B" and "Q". If this parameter is null, then
aoqi@0 597 * the "Q" encoding is used if most of characters to be
aoqi@0 598 * encoded are in the ASCII charset, otherwise "B" encoding
aoqi@0 599 * is used.
aoqi@0 600 * @return Unicode string containing only US-ASCII characters
aoqi@0 601 * @exception UnsupportedEncodingException if the encoding fails
aoqi@0 602 */
aoqi@0 603 public static String encodeWord(String word, String charset,
aoqi@0 604 String encoding)
aoqi@0 605 throws UnsupportedEncodingException {
aoqi@0 606 return encodeWord(word, charset, encoding, true);
aoqi@0 607 }
aoqi@0 608
aoqi@0 609 /*
aoqi@0 610 * Encode the given string. The parameter 'encodingWord' should
aoqi@0 611 * be true if a RFC 822 "word" token is being encoded and false if a
aoqi@0 612 * RFC 822 "text" token is being encoded. This is because the
aoqi@0 613 * "Q" encoding defined in RFC 2047 has more restrictions when
aoqi@0 614 * encoding "word" tokens. (Sigh)
aoqi@0 615 */
aoqi@0 616 private static String encodeWord(String string, String charset,
aoqi@0 617 String encoding, boolean encodingWord)
aoqi@0 618 throws UnsupportedEncodingException {
aoqi@0 619
aoqi@0 620 // If 'string' contains only US-ASCII characters, just
aoqi@0 621 // return it.
aoqi@0 622 int ascii = checkAscii(string);
aoqi@0 623 if (ascii == ALL_ASCII)
aoqi@0 624 return string;
aoqi@0 625
aoqi@0 626 // Else, apply the specified charset conversion.
aoqi@0 627 String jcharset;
aoqi@0 628 if (charset == null) { // use default charset
aoqi@0 629 jcharset = getDefaultJavaCharset(); // the java charset
aoqi@0 630 charset = getDefaultMIMECharset(); // the MIME equivalent
aoqi@0 631 } else // MIME charset -> java charset
aoqi@0 632 jcharset = javaCharset(charset);
aoqi@0 633
aoqi@0 634 // If no transfer-encoding is specified, figure one out.
aoqi@0 635 if (encoding == null) {
aoqi@0 636 if (ascii != MOSTLY_NONASCII)
aoqi@0 637 encoding = "Q";
aoqi@0 638 else
aoqi@0 639 encoding = "B";
aoqi@0 640 }
aoqi@0 641
aoqi@0 642 boolean b64;
aoqi@0 643 if (encoding.equalsIgnoreCase("B"))
aoqi@0 644 b64 = true;
aoqi@0 645 else if (encoding.equalsIgnoreCase("Q"))
aoqi@0 646 b64 = false;
aoqi@0 647 else
aoqi@0 648 throw new UnsupportedEncodingException(
aoqi@0 649 "Unknown transfer encoding: " + encoding);
aoqi@0 650
aoqi@0 651 StringBuffer outb = new StringBuffer(); // the output buffer
aoqi@0 652 doEncode(string, b64, jcharset,
aoqi@0 653 // As per RFC 2047, size of an encoded string should not
aoqi@0 654 // exceed 75 bytes.
aoqi@0 655 // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
aoqi@0 656 75 - 7 - charset.length(), // the available space
aoqi@0 657 "=?" + charset + "?" + encoding + "?", // prefix
aoqi@0 658 true, encodingWord, outb);
aoqi@0 659
aoqi@0 660 return outb.toString();
aoqi@0 661 }
aoqi@0 662
aoqi@0 663 private static void doEncode(String string, boolean b64,
aoqi@0 664 String jcharset, int avail, String prefix,
aoqi@0 665 boolean first, boolean encodingWord, StringBuffer buf)
aoqi@0 666 throws UnsupportedEncodingException {
aoqi@0 667
aoqi@0 668 // First find out what the length of the encoded version of
aoqi@0 669 // 'string' would be.
aoqi@0 670 byte[] bytes = string.getBytes(jcharset);
aoqi@0 671 int len;
aoqi@0 672 if (b64) // "B" encoding
aoqi@0 673 len = BEncoderStream.encodedLength(bytes);
aoqi@0 674 else // "Q"
aoqi@0 675 len = QEncoderStream.encodedLength(bytes, encodingWord);
aoqi@0 676
aoqi@0 677 int size;
aoqi@0 678 if ((len > avail) && ((size = string.length()) > 1)) {
aoqi@0 679 // If the length is greater than 'avail', split 'string'
aoqi@0 680 // into two and recurse.
aoqi@0 681 doEncode(string.substring(0, size/2), b64, jcharset,
aoqi@0 682 avail, prefix, first, encodingWord, buf);
aoqi@0 683 doEncode(string.substring(size/2, size), b64, jcharset,
aoqi@0 684 avail, prefix, false, encodingWord, buf);
aoqi@0 685 } else {
aoqi@0 686 // length <= than 'avail'. Encode the given string
aoqi@0 687 ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
aoqi@0 688 OutputStream eos; // the encoder
aoqi@0 689 if (b64) // "B" encoding
aoqi@0 690 eos = new BEncoderStream(os);
aoqi@0 691 else // "Q" encoding
aoqi@0 692 eos = new QEncoderStream(os, encodingWord);
aoqi@0 693
aoqi@0 694 try { // do the encoding
aoqi@0 695 eos.write(bytes);
aoqi@0 696 eos.close();
aoqi@0 697 } catch (IOException ioex) { }
aoqi@0 698
aoqi@0 699 byte[] encodedBytes = os.toByteArray(); // the encoded stuff
aoqi@0 700 // Now write out the encoded (all ASCII) bytes into our
aoqi@0 701 // StringBuffer
aoqi@0 702 if (!first) // not the first line of this sequence
aoqi@0 703 if (foldEncodedWords)
aoqi@0 704 buf.append("\r\n "); // start a continuation line
aoqi@0 705 else
aoqi@0 706 buf.append(" "); // line will be folded later
aoqi@0 707
aoqi@0 708 buf.append(prefix);
aoqi@0 709 for (int i = 0; i < encodedBytes.length; i++)
aoqi@0 710 buf.append((char)encodedBytes[i]);
aoqi@0 711 buf.append("?="); // terminate the current sequence
aoqi@0 712 }
aoqi@0 713 }
aoqi@0 714
aoqi@0 715 /**
aoqi@0 716 * The string is parsed using the rules in RFC 2047 for parsing
aoqi@0 717 * an "encoded-word". If the parse fails, a ParseException is
aoqi@0 718 * thrown. Otherwise, it is transfer-decoded, and then
aoqi@0 719 * charset-converted into Unicode. If the charset-conversion
aoqi@0 720 * fails, an UnsupportedEncodingException is thrown.<p>
aoqi@0 721 *
aoqi@0 722 * @param eword the possibly encoded value
aoqi@0 723 * @exception ParseException if the string is not an
aoqi@0 724 * encoded-word as per RFC 2047.
aoqi@0 725 * @exception UnsupportedEncodingException if the charset
aoqi@0 726 * conversion failed.
aoqi@0 727 */
aoqi@0 728 public static String decodeWord(String eword)
aoqi@0 729 throws ParseException, UnsupportedEncodingException {
aoqi@0 730
aoqi@0 731 if (!eword.startsWith("=?")) // not an encoded word
aoqi@0 732 throw new ParseException();
aoqi@0 733
aoqi@0 734 // get charset
aoqi@0 735 int start = 2; int pos;
aoqi@0 736 if ((pos = eword.indexOf('?', start)) == -1)
aoqi@0 737 throw new ParseException();
aoqi@0 738 String charset = javaCharset(eword.substring(start, pos));
aoqi@0 739
aoqi@0 740 // get encoding
aoqi@0 741 start = pos+1;
aoqi@0 742 if ((pos = eword.indexOf('?', start)) == -1)
aoqi@0 743 throw new ParseException();
aoqi@0 744 String encoding = eword.substring(start, pos);
aoqi@0 745
aoqi@0 746 // get encoded-sequence
aoqi@0 747 start = pos+1;
aoqi@0 748 if ((pos = eword.indexOf("?=", start)) == -1)
aoqi@0 749 throw new ParseException();
aoqi@0 750 String word = eword.substring(start, pos);
aoqi@0 751
aoqi@0 752 try {
aoqi@0 753 // Extract the bytes from word
aoqi@0 754 ByteArrayInputStream bis =
aoqi@0 755 new ByteArrayInputStream(ASCIIUtility.getBytes(word));
aoqi@0 756
aoqi@0 757 // Get the appropriate decoder
aoqi@0 758 InputStream is;
aoqi@0 759 if (encoding.equalsIgnoreCase("B"))
aoqi@0 760 is = new BASE64DecoderStream(bis);
aoqi@0 761 else if (encoding.equalsIgnoreCase("Q"))
aoqi@0 762 is = new QDecoderStream(bis);
aoqi@0 763 else
aoqi@0 764 throw new UnsupportedEncodingException(
aoqi@0 765 "unknown encoding: " + encoding);
aoqi@0 766
aoqi@0 767 // For b64 & q, size of decoded word <= size of word. So
aoqi@0 768 // the decoded bytes must fit into the 'bytes' array. This
aoqi@0 769 // is certainly more efficient than writing bytes into a
aoqi@0 770 // ByteArrayOutputStream and then pulling out the byte[]
aoqi@0 771 // from it.
aoqi@0 772 int count = bis.available();
aoqi@0 773 byte[] bytes = new byte[count];
aoqi@0 774 // count is set to the actual number of decoded bytes
aoqi@0 775 count = is.read(bytes, 0, count);
aoqi@0 776
aoqi@0 777 // Finally, convert the decoded bytes into a String using
aoqi@0 778 // the specified charset
aoqi@0 779 String s = new String(bytes, 0, count, charset);
aoqi@0 780 if (pos + 2 < eword.length()) {
aoqi@0 781 // there's still more text in the string
aoqi@0 782 String rest = eword.substring(pos + 2);
aoqi@0 783 if (!decodeStrict)
aoqi@0 784 rest = decodeInnerWords(rest);
aoqi@0 785 s += rest;
aoqi@0 786 }
aoqi@0 787 return s;
aoqi@0 788 } catch (UnsupportedEncodingException uex) {
aoqi@0 789 // explicitly catch and rethrow this exception, otherwise
aoqi@0 790 // the below IOException catch will swallow this up!
aoqi@0 791 throw uex;
aoqi@0 792 } catch (IOException ioex) {
aoqi@0 793 // Shouldn't happen.
aoqi@0 794 throw new ParseException();
aoqi@0 795 } catch (IllegalArgumentException iex) {
aoqi@0 796 /* An unknown charset of the form ISO-XXX-XXX, will cause
aoqi@0 797 * the JDK to throw an IllegalArgumentException ... Since the
aoqi@0 798 * JDK will attempt to create a classname using this string,
aoqi@0 799 * but valid classnames must not contain the character '-',
aoqi@0 800 * and this results in an IllegalArgumentException, rather than
aoqi@0 801 * the expected UnsupportedEncodingException. Yikes
aoqi@0 802 */
aoqi@0 803 throw new UnsupportedEncodingException();
aoqi@0 804 }
aoqi@0 805 }
aoqi@0 806
aoqi@0 807 /**
aoqi@0 808 * Look for encoded words within a word. The MIME spec doesn't
aoqi@0 809 * allow this, but many broken mailers, especially Japanese mailers,
aoqi@0 810 * produce such incorrect encodings.
aoqi@0 811 */
aoqi@0 812 private static String decodeInnerWords(String word)
aoqi@0 813 throws UnsupportedEncodingException {
aoqi@0 814 int start = 0, i;
aoqi@0 815 StringBuffer buf = new StringBuffer();
aoqi@0 816 while ((i = word.indexOf("=?", start)) >= 0) {
aoqi@0 817 buf.append(word.substring(start, i));
aoqi@0 818 int end = word.indexOf("?=", i);
aoqi@0 819 if (end < 0)
aoqi@0 820 break;
aoqi@0 821 String s = word.substring(i, end + 2);
aoqi@0 822 try {
aoqi@0 823 s = decodeWord(s);
aoqi@0 824 } catch (ParseException pex) {
aoqi@0 825 // ignore it, just use the original string
aoqi@0 826 }
aoqi@0 827 buf.append(s);
aoqi@0 828 start = end + 2;
aoqi@0 829 }
aoqi@0 830 if (start == 0)
aoqi@0 831 return word;
aoqi@0 832 if (start < word.length())
aoqi@0 833 buf.append(word.substring(start));
aoqi@0 834 return buf.toString();
aoqi@0 835 }
aoqi@0 836
aoqi@0 837 /**
aoqi@0 838 * A utility method to quote a word, if the word contains any
aoqi@0 839 * characters from the specified 'specials' list.<p>
aoqi@0 840 *
aoqi@0 841 * The <code>HeaderTokenizer</code> class defines two special
aoqi@0 842 * sets of delimiters - MIME and RFC 822. <p>
aoqi@0 843 *
aoqi@0 844 * This method is typically used during the generation of
aoqi@0 845 * RFC 822 and MIME header fields.
aoqi@0 846 *
aoqi@0 847 * @param word word to be quoted
aoqi@0 848 * @param specials the set of special characters
aoqi@0 849 * @return the possibly quoted word
aoqi@0 850 * @see javax.mail.internet.HeaderTokenizer#MIME
aoqi@0 851 * @see javax.mail.internet.HeaderTokenizer#RFC822
aoqi@0 852 */
aoqi@0 853 public static String quote(String word, String specials) {
aoqi@0 854 int len = word.length();
aoqi@0 855
aoqi@0 856 /*
aoqi@0 857 * Look for any "bad" characters, Escape and
aoqi@0 858 * quote the entire string if necessary.
aoqi@0 859 */
aoqi@0 860 boolean needQuoting = false;
aoqi@0 861 for (int i = 0; i < len; i++) {
aoqi@0 862 char c = word.charAt(i);
aoqi@0 863 if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
aoqi@0 864 // need to escape them and then quote the whole string
aoqi@0 865 StringBuffer sb = new StringBuffer(len + 3);
aoqi@0 866 sb.append('"');
aoqi@0 867 sb.append(word.substring(0, i));
aoqi@0 868 int lastc = 0;
aoqi@0 869 for (int j = i; j < len; j++) {
aoqi@0 870 char cc = word.charAt(j);
aoqi@0 871 if ((cc == '"') || (cc == '\\') ||
aoqi@0 872 (cc == '\r') || (cc == '\n'))
aoqi@0 873 if (cc == '\n' && lastc == '\r')
aoqi@0 874 ; // do nothing, CR was already escaped
aoqi@0 875 else
aoqi@0 876 sb.append('\\'); // Escape the character
aoqi@0 877 sb.append(cc);
aoqi@0 878 lastc = cc;
aoqi@0 879 }
aoqi@0 880 sb.append('"');
aoqi@0 881 return sb.toString();
aoqi@0 882 } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
aoqi@0 883 // These characters cause the string to be quoted
aoqi@0 884 needQuoting = true;
aoqi@0 885 }
aoqi@0 886
aoqi@0 887 if (needQuoting) {
aoqi@0 888 StringBuffer sb = new StringBuffer(len + 2);
aoqi@0 889 sb.append('"').append(word).append('"');
aoqi@0 890 return sb.toString();
aoqi@0 891 } else
aoqi@0 892 return word;
aoqi@0 893 }
aoqi@0 894
aoqi@0 895 /**
aoqi@0 896 * Fold a string at linear whitespace so that each line is no longer
aoqi@0 897 * than 76 characters, if possible. If there are more than 76
aoqi@0 898 * non-whitespace characters consecutively, the string is folded at
aoqi@0 899 * the first whitespace after that sequence. The parameter
aoqi@0 900 * <code>used</code> indicates how many characters have been used in
aoqi@0 901 * the current line; it is usually the length of the header name. <p>
aoqi@0 902 *
aoqi@0 903 * Note that line breaks in the string aren't escaped; they probably
aoqi@0 904 * should be.
aoqi@0 905 *
aoqi@0 906 * @param used characters used in line so far
aoqi@0 907 * @param s the string to fold
aoqi@0 908 * @return the folded string
aoqi@0 909 */
aoqi@0 910 /*public*/ static String fold(int used, String s) {
aoqi@0 911 if (!foldText)
aoqi@0 912 return s;
aoqi@0 913
aoqi@0 914 int end;
aoqi@0 915 char c;
aoqi@0 916 // Strip trailing spaces
aoqi@0 917 for (end = s.length() - 1; end >= 0; end--) {
aoqi@0 918 c = s.charAt(end);
aoqi@0 919 if (c != ' ' && c != '\t')
aoqi@0 920 break;
aoqi@0 921 }
aoqi@0 922 if (end != s.length() - 1)
aoqi@0 923 s = s.substring(0, end + 1);
aoqi@0 924
aoqi@0 925 // if the string fits now, just return it
aoqi@0 926 if (used + s.length() <= 76)
aoqi@0 927 return s;
aoqi@0 928
aoqi@0 929 // have to actually fold the string
aoqi@0 930 StringBuffer sb = new StringBuffer(s.length() + 4);
aoqi@0 931 char lastc = 0;
aoqi@0 932 while (used + s.length() > 76) {
aoqi@0 933 int lastspace = -1;
aoqi@0 934 for (int i = 0; i < s.length(); i++) {
aoqi@0 935 if (lastspace != -1 && used + i > 76)
aoqi@0 936 break;
aoqi@0 937 c = s.charAt(i);
aoqi@0 938 if (c == ' ' || c == '\t')
aoqi@0 939 if (!(lastc == ' ' || lastc == '\t'))
aoqi@0 940 lastspace = i;
aoqi@0 941 lastc = c;
aoqi@0 942 }
aoqi@0 943 if (lastspace == -1) {
aoqi@0 944 // no space, use the whole thing
aoqi@0 945 sb.append(s);
aoqi@0 946 s = "";
aoqi@0 947 used = 0;
aoqi@0 948 break;
aoqi@0 949 }
aoqi@0 950 sb.append(s.substring(0, lastspace));
aoqi@0 951 sb.append("\r\n");
aoqi@0 952 lastc = s.charAt(lastspace);
aoqi@0 953 sb.append(lastc);
aoqi@0 954 s = s.substring(lastspace + 1);
aoqi@0 955 used = 1;
aoqi@0 956 }
aoqi@0 957 sb.append(s);
aoqi@0 958 return sb.toString();
aoqi@0 959 }
aoqi@0 960
aoqi@0 961 /**
aoqi@0 962 * Unfold a folded header. Any line breaks that aren't escaped and
aoqi@0 963 * are followed by whitespace are removed.
aoqi@0 964 *
aoqi@0 965 * @param s the string to unfold
aoqi@0 966 * @return the unfolded string
aoqi@0 967 */
aoqi@0 968 /*public*/ static String unfold(String s) {
aoqi@0 969 if (!foldText)
aoqi@0 970 return s;
aoqi@0 971
aoqi@0 972 StringBuffer sb = null;
aoqi@0 973 int i;
aoqi@0 974 while ((i = indexOfAny(s, "\r\n")) >= 0) {
aoqi@0 975 int start = i;
aoqi@0 976 int l = s.length();
aoqi@0 977 i++; // skip CR or NL
aoqi@0 978 if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n')
aoqi@0 979 i++; // skip LF
aoqi@0 980 if (start == 0 || s.charAt(start - 1) != '\\') {
aoqi@0 981 char c;
aoqi@0 982 // if next line starts with whitespace, skip all of it
aoqi@0 983 // XXX - always has to be true?
aoqi@0 984 if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) {
aoqi@0 985 i++; // skip whitespace
aoqi@0 986 while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t'))
aoqi@0 987 i++;
aoqi@0 988 if (sb == null)
aoqi@0 989 sb = new StringBuffer(s.length());
aoqi@0 990 if (start != 0) {
aoqi@0 991 sb.append(s.substring(0, start));
aoqi@0 992 sb.append(' ');
aoqi@0 993 }
aoqi@0 994 s = s.substring(i);
aoqi@0 995 continue;
aoqi@0 996 }
aoqi@0 997 // it's not a continuation line, just leave it in
aoqi@0 998 if (sb == null)
aoqi@0 999 sb = new StringBuffer(s.length());
aoqi@0 1000 sb.append(s.substring(0, i));
aoqi@0 1001 s = s.substring(i);
aoqi@0 1002 } else {
aoqi@0 1003 // there's a backslash at "start - 1"
aoqi@0 1004 // strip it out, but leave in the line break
aoqi@0 1005 if (sb == null)
aoqi@0 1006 sb = new StringBuffer(s.length());
aoqi@0 1007 sb.append(s.substring(0, start - 1));
aoqi@0 1008 sb.append(s.substring(start, i));
aoqi@0 1009 s = s.substring(i);
aoqi@0 1010 }
aoqi@0 1011 }
aoqi@0 1012 if (sb != null) {
aoqi@0 1013 sb.append(s);
aoqi@0 1014 return sb.toString();
aoqi@0 1015 } else
aoqi@0 1016 return s;
aoqi@0 1017 }
aoqi@0 1018
aoqi@0 1019 /**
aoqi@0 1020 * Return the first index of any of the characters in "any" in "s",
aoqi@0 1021 * or -1 if none are found.
aoqi@0 1022 *
aoqi@0 1023 * This should be a method on String.
aoqi@0 1024 */
aoqi@0 1025 private static int indexOfAny(String s, String any) {
aoqi@0 1026 return indexOfAny(s, any, 0);
aoqi@0 1027 }
aoqi@0 1028
aoqi@0 1029 private static int indexOfAny(String s, String any, int start) {
aoqi@0 1030 try {
aoqi@0 1031 int len = s.length();
aoqi@0 1032 for (int i = start; i < len; i++) {
aoqi@0 1033 if (any.indexOf(s.charAt(i)) >= 0)
aoqi@0 1034 return i;
aoqi@0 1035 }
aoqi@0 1036 return -1;
aoqi@0 1037 } catch (StringIndexOutOfBoundsException e) {
aoqi@0 1038 return -1;
aoqi@0 1039 }
aoqi@0 1040 }
aoqi@0 1041
aoqi@0 1042 /**
aoqi@0 1043 * Convert a MIME charset name into a valid Java charset name. <p>
aoqi@0 1044 *
aoqi@0 1045 * @param charset the MIME charset name
aoqi@0 1046 * @return the Java charset equivalent. If a suitable mapping is
aoqi@0 1047 * not available, the passed in charset is itself returned.
aoqi@0 1048 */
aoqi@0 1049 public static String javaCharset(String charset) {
aoqi@0 1050 if (mime2java == null || charset == null)
aoqi@0 1051 // no mapping table, or charset parameter is null
aoqi@0 1052 return charset;
aoqi@0 1053
aoqi@0 1054 String alias = (String)mime2java.get(charset.toLowerCase());
aoqi@0 1055 return alias == null ? charset : alias;
aoqi@0 1056 }
aoqi@0 1057
aoqi@0 1058 /**
aoqi@0 1059 * Convert a java charset into its MIME charset name. <p>
aoqi@0 1060 *
aoqi@0 1061 * Note that a future version of JDK (post 1.2) might provide
aoqi@0 1062 * this functionality, in which case, we may deprecate this
aoqi@0 1063 * method then.
aoqi@0 1064 *
aoqi@0 1065 * @param charset the JDK charset
aoqi@0 1066 * @return the MIME/IANA equivalent. If a mapping
aoqi@0 1067 * is not possible, the passed in charset itself
aoqi@0 1068 * is returned.
aoqi@0 1069 * @since JavaMail 1.1
aoqi@0 1070 */
aoqi@0 1071 public static String mimeCharset(String charset) {
aoqi@0 1072 if (java2mime == null || charset == null)
aoqi@0 1073 // no mapping table or charset param is null
aoqi@0 1074 return charset;
aoqi@0 1075
aoqi@0 1076 String alias = (String)java2mime.get(charset.toLowerCase());
aoqi@0 1077 return alias == null ? charset : alias;
aoqi@0 1078 }
aoqi@0 1079
aoqi@0 1080 private static String defaultJavaCharset;
aoqi@0 1081 private static String defaultMIMECharset;
aoqi@0 1082
aoqi@0 1083 /**
aoqi@0 1084 * Get the default charset corresponding to the system's current
aoqi@0 1085 * default locale. If the System property <code>mail.mime.charset</code>
aoqi@0 1086 * is set, a system charset corresponding to this MIME charset will be
aoqi@0 1087 * returned. <p>
aoqi@0 1088 *
aoqi@0 1089 * @return the default charset of the system's default locale,
aoqi@0 1090 * as a Java charset. (NOT a MIME charset)
aoqi@0 1091 * @since JavaMail 1.1
aoqi@0 1092 */
aoqi@0 1093 public static String getDefaultJavaCharset() {
aoqi@0 1094 if (defaultJavaCharset == null) {
aoqi@0 1095 /*
aoqi@0 1096 * If mail.mime.charset is set, it controls the default
aoqi@0 1097 * Java charset as well.
aoqi@0 1098 */
aoqi@0 1099 String mimecs = null;
aoqi@0 1100
aoqi@0 1101 mimecs = SAAJUtil.getSystemProperty("mail.mime.charset");
aoqi@0 1102
aoqi@0 1103 if (mimecs != null && mimecs.length() > 0) {
aoqi@0 1104 defaultJavaCharset = javaCharset(mimecs);
aoqi@0 1105 return defaultJavaCharset;
aoqi@0 1106 }
aoqi@0 1107
aoqi@0 1108 try {
aoqi@0 1109 defaultJavaCharset = System.getProperty("file.encoding",
aoqi@0 1110 "8859_1");
aoqi@0 1111 } catch (SecurityException sex) {
aoqi@0 1112
aoqi@0 1113 class NullInputStream extends InputStream {
aoqi@0 1114 public int read() {
aoqi@0 1115 return 0;
aoqi@0 1116 }
aoqi@0 1117 }
aoqi@0 1118 InputStreamReader reader =
aoqi@0 1119 new InputStreamReader(new NullInputStream());
aoqi@0 1120 defaultJavaCharset = reader.getEncoding();
aoqi@0 1121 if (defaultJavaCharset == null)
aoqi@0 1122 defaultJavaCharset = "8859_1";
aoqi@0 1123 }
aoqi@0 1124 }
aoqi@0 1125
aoqi@0 1126 return defaultJavaCharset;
aoqi@0 1127 }
aoqi@0 1128
aoqi@0 1129 /*
aoqi@0 1130 * Get the default MIME charset for this locale.
aoqi@0 1131 */
aoqi@0 1132 static String getDefaultMIMECharset() {
aoqi@0 1133 if (defaultMIMECharset == null) {
aoqi@0 1134 defaultMIMECharset = SAAJUtil.getSystemProperty("mail.mime.charset");
aoqi@0 1135 }
aoqi@0 1136 if (defaultMIMECharset == null)
aoqi@0 1137 defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
aoqi@0 1138 return defaultMIMECharset;
aoqi@0 1139 }
aoqi@0 1140
aoqi@0 1141 // Tables to map MIME charset names to Java names and vice versa.
aoqi@0 1142 // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
aoqi@0 1143 private static Hashtable mime2java;
aoqi@0 1144 private static Hashtable java2mime;
aoqi@0 1145
aoqi@0 1146 static {
aoqi@0 1147 java2mime = new Hashtable(40);
aoqi@0 1148 mime2java = new Hashtable(10);
aoqi@0 1149
aoqi@0 1150 try {
aoqi@0 1151 // Use this class's classloader to load the mapping file
aoqi@0 1152 // XXX - we should use SecuritySupport, but it's in another package
aoqi@0 1153 InputStream is =
aoqi@0 1154 com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream(
aoqi@0 1155 "/META-INF/javamail.charset.map");
aoqi@0 1156
aoqi@0 1157 if (is != null) {
aoqi@0 1158 is = new LineInputStream(is);
aoqi@0 1159
aoqi@0 1160 // Load the JDK-to-MIME charset mapping table
aoqi@0 1161 loadMappings((LineInputStream)is, java2mime);
aoqi@0 1162
aoqi@0 1163 // Load the MIME-to-JDK charset mapping table
aoqi@0 1164 loadMappings((LineInputStream)is, mime2java);
aoqi@0 1165 }
aoqi@0 1166 } catch (Exception ex) { }
aoqi@0 1167
aoqi@0 1168 // If we didn't load the tables, e.g., because we didn't have
aoqi@0 1169 // permission, load them manually. The entries here should be
aoqi@0 1170 // the same as the default javamail.charset.map.
aoqi@0 1171 if (java2mime.isEmpty()) {
aoqi@0 1172 java2mime.put("8859_1", "ISO-8859-1");
aoqi@0 1173 java2mime.put("iso8859_1", "ISO-8859-1");
aoqi@0 1174 java2mime.put("ISO8859-1", "ISO-8859-1");
aoqi@0 1175
aoqi@0 1176 java2mime.put("8859_2", "ISO-8859-2");
aoqi@0 1177 java2mime.put("iso8859_2", "ISO-8859-2");
aoqi@0 1178 java2mime.put("ISO8859-2", "ISO-8859-2");
aoqi@0 1179
aoqi@0 1180 java2mime.put("8859_3", "ISO-8859-3");
aoqi@0 1181 java2mime.put("iso8859_3", "ISO-8859-3");
aoqi@0 1182 java2mime.put("ISO8859-3", "ISO-8859-3");
aoqi@0 1183
aoqi@0 1184 java2mime.put("8859_4", "ISO-8859-4");
aoqi@0 1185 java2mime.put("iso8859_4", "ISO-8859-4");
aoqi@0 1186 java2mime.put("ISO8859-4", "ISO-8859-4");
aoqi@0 1187
aoqi@0 1188 java2mime.put("8859_5", "ISO-8859-5");
aoqi@0 1189 java2mime.put("iso8859_5", "ISO-8859-5");
aoqi@0 1190 java2mime.put("ISO8859-5", "ISO-8859-5");
aoqi@0 1191
aoqi@0 1192 java2mime.put("8859_6", "ISO-8859-6");
aoqi@0 1193 java2mime.put("iso8859_6", "ISO-8859-6");
aoqi@0 1194 java2mime.put("ISO8859-6", "ISO-8859-6");
aoqi@0 1195
aoqi@0 1196 java2mime.put("8859_7", "ISO-8859-7");
aoqi@0 1197 java2mime.put("iso8859_7", "ISO-8859-7");
aoqi@0 1198 java2mime.put("ISO8859-7", "ISO-8859-7");
aoqi@0 1199
aoqi@0 1200 java2mime.put("8859_8", "ISO-8859-8");
aoqi@0 1201 java2mime.put("iso8859_8", "ISO-8859-8");
aoqi@0 1202 java2mime.put("ISO8859-8", "ISO-8859-8");
aoqi@0 1203
aoqi@0 1204 java2mime.put("8859_9", "ISO-8859-9");
aoqi@0 1205 java2mime.put("iso8859_9", "ISO-8859-9");
aoqi@0 1206 java2mime.put("ISO8859-9", "ISO-8859-9");
aoqi@0 1207
aoqi@0 1208 java2mime.put("SJIS", "Shift_JIS");
aoqi@0 1209 java2mime.put("MS932", "Shift_JIS");
aoqi@0 1210 java2mime.put("JIS", "ISO-2022-JP");
aoqi@0 1211 java2mime.put("ISO2022JP", "ISO-2022-JP");
aoqi@0 1212 java2mime.put("EUC_JP", "euc-jp");
aoqi@0 1213 java2mime.put("KOI8_R", "koi8-r");
aoqi@0 1214 java2mime.put("EUC_CN", "euc-cn");
aoqi@0 1215 java2mime.put("EUC_TW", "euc-tw");
aoqi@0 1216 java2mime.put("EUC_KR", "euc-kr");
aoqi@0 1217 }
aoqi@0 1218 if (mime2java.isEmpty()) {
aoqi@0 1219 mime2java.put("iso-2022-cn", "ISO2022CN");
aoqi@0 1220 mime2java.put("iso-2022-kr", "ISO2022KR");
aoqi@0 1221 mime2java.put("utf-8", "UTF8");
aoqi@0 1222 mime2java.put("utf8", "UTF8");
aoqi@0 1223 mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
aoqi@0 1224 mime2java.put("ja_jp.eucjp", "EUCJIS");
aoqi@0 1225 mime2java.put("euc-kr", "KSC5601");
aoqi@0 1226 mime2java.put("euckr", "KSC5601");
aoqi@0 1227 mime2java.put("us-ascii", "ISO-8859-1");
aoqi@0 1228 mime2java.put("x-us-ascii", "ISO-8859-1");
aoqi@0 1229 }
aoqi@0 1230 }
aoqi@0 1231
aoqi@0 1232 private static void loadMappings(LineInputStream is, Hashtable table) {
aoqi@0 1233 String currLine;
aoqi@0 1234
aoqi@0 1235 while (true) {
aoqi@0 1236 try {
aoqi@0 1237 currLine = is.readLine();
aoqi@0 1238 } catch (IOException ioex) {
aoqi@0 1239 break; // error in reading, stop
aoqi@0 1240 }
aoqi@0 1241
aoqi@0 1242 if (currLine == null) // end of file, stop
aoqi@0 1243 break;
aoqi@0 1244 if (currLine.startsWith("--") && currLine.endsWith("--"))
aoqi@0 1245 // end of this table
aoqi@0 1246 break;
aoqi@0 1247
aoqi@0 1248 // ignore empty lines and comments
aoqi@0 1249 if (currLine.trim().length() == 0 || currLine.startsWith("#"))
aoqi@0 1250 continue;
aoqi@0 1251
aoqi@0 1252 // A valid entry is of the form <key><separator><value>
aoqi@0 1253 // where, <separator> := SPACE | HT. Parse this
aoqi@0 1254 StringTokenizer tk = new StringTokenizer(currLine, " \t");
aoqi@0 1255 try {
aoqi@0 1256 String key = tk.nextToken();
aoqi@0 1257 String value = tk.nextToken();
aoqi@0 1258 table.put(key.toLowerCase(), value);
aoqi@0 1259 } catch (NoSuchElementException nex) { }
aoqi@0 1260 }
aoqi@0 1261 }
aoqi@0 1262
aoqi@0 1263 static final int ALL_ASCII = 1;
aoqi@0 1264 static final int MOSTLY_ASCII = 2;
aoqi@0 1265 static final int MOSTLY_NONASCII = 3;
aoqi@0 1266
aoqi@0 1267 /**
aoqi@0 1268 * Check if the given string contains non US-ASCII characters.
aoqi@0 1269 * @param s string
aoqi@0 1270 * @return ALL_ASCII if all characters in the string
aoqi@0 1271 * belong to the US-ASCII charset. MOSTLY_ASCII
aoqi@0 1272 * if more than half of the available characters
aoqi@0 1273 * are US-ASCII characters. Else MOSTLY_NONASCII.
aoqi@0 1274 */
aoqi@0 1275 static int checkAscii(String s) {
aoqi@0 1276 int ascii = 0, non_ascii = 0;
aoqi@0 1277 int l = s.length();
aoqi@0 1278
aoqi@0 1279 for (int i = 0; i < l; i++) {
aoqi@0 1280 if (nonascii((int)s.charAt(i))) // non-ascii
aoqi@0 1281 non_ascii++;
aoqi@0 1282 else
aoqi@0 1283 ascii++;
aoqi@0 1284 }
aoqi@0 1285
aoqi@0 1286 if (non_ascii == 0)
aoqi@0 1287 return ALL_ASCII;
aoqi@0 1288 if (ascii > non_ascii)
aoqi@0 1289 return MOSTLY_ASCII;
aoqi@0 1290
aoqi@0 1291 return MOSTLY_NONASCII;
aoqi@0 1292 }
aoqi@0 1293
aoqi@0 1294 /**
aoqi@0 1295 * Check if the given byte array contains non US-ASCII characters.
aoqi@0 1296 * @param b byte array
aoqi@0 1297 * @return ALL_ASCII if all characters in the string
aoqi@0 1298 * belong to the US-ASCII charset. MOSTLY_ASCII
aoqi@0 1299 * if more than half of the available characters
aoqi@0 1300 * are US-ASCII characters. Else MOSTLY_NONASCII.
aoqi@0 1301 *
aoqi@0 1302 * XXX - this method is no longer used
aoqi@0 1303 */
aoqi@0 1304 static int checkAscii(byte[] b) {
aoqi@0 1305 int ascii = 0, non_ascii = 0;
aoqi@0 1306
aoqi@0 1307 for (int i=0; i < b.length; i++) {
aoqi@0 1308 // The '&' operator automatically causes b[i] to be promoted
aoqi@0 1309 // to an int, and we mask out the higher bytes in the int
aoqi@0 1310 // so that the resulting value is not a negative integer.
aoqi@0 1311 if (nonascii(b[i] & 0xff)) // non-ascii
aoqi@0 1312 non_ascii++;
aoqi@0 1313 else
aoqi@0 1314 ascii++;
aoqi@0 1315 }
aoqi@0 1316
aoqi@0 1317 if (non_ascii == 0)
aoqi@0 1318 return ALL_ASCII;
aoqi@0 1319 if (ascii > non_ascii)
aoqi@0 1320 return MOSTLY_ASCII;
aoqi@0 1321
aoqi@0 1322 return MOSTLY_NONASCII;
aoqi@0 1323 }
aoqi@0 1324
aoqi@0 1325 /**
aoqi@0 1326 * Check if the given input stream contains non US-ASCII characters.
aoqi@0 1327 * Upto <code>max</code> bytes are checked. If <code>max</code> is
aoqi@0 1328 * set to <code>ALL</code>, then all the bytes available in this
aoqi@0 1329 * input stream are checked. If <code>breakOnNonAscii</code> is true
aoqi@0 1330 * the check terminates when the first non-US-ASCII character is
aoqi@0 1331 * found and MOSTLY_NONASCII is returned. Else, the check continues
aoqi@0 1332 * till <code>max</code> bytes or till the end of stream.
aoqi@0 1333 *
aoqi@0 1334 * @param is the input stream
aoqi@0 1335 * @param max maximum bytes to check for. The special value
aoqi@0 1336 * ALL indicates that all the bytes in this input
aoqi@0 1337 * stream must be checked.
aoqi@0 1338 * @param breakOnNonAscii if <code>true</code>, then terminate the
aoqi@0 1339 * the check when the first non-US-ASCII character
aoqi@0 1340 * is found.
aoqi@0 1341 * @return ALL_ASCII if all characters in the string
aoqi@0 1342 * belong to the US-ASCII charset. MOSTLY_ASCII
aoqi@0 1343 * if more than half of the available characters
aoqi@0 1344 * are US-ASCII characters. Else MOSTLY_NONASCII.
aoqi@0 1345 */
aoqi@0 1346 static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
aoqi@0 1347 int ascii = 0, non_ascii = 0;
aoqi@0 1348 int len;
aoqi@0 1349 int block = 4096;
aoqi@0 1350 int linelen = 0;
aoqi@0 1351 boolean longLine = false, badEOL = false;
aoqi@0 1352 boolean checkEOL = encodeEolStrict && breakOnNonAscii;
aoqi@0 1353 byte buf[] = null;
aoqi@0 1354 if (max != 0) {
aoqi@0 1355 block = (max == ALL) ? 4096 : Math.min(max, 4096);
aoqi@0 1356 buf = new byte[block];
aoqi@0 1357 }
aoqi@0 1358 while (max != 0) {
aoqi@0 1359 try {
aoqi@0 1360 if ((len = is.read(buf, 0, block)) == -1)
aoqi@0 1361 break;
aoqi@0 1362 int lastb = 0;
aoqi@0 1363 for (int i = 0; i < len; i++) {
aoqi@0 1364 // The '&' operator automatically causes b[i] to
aoqi@0 1365 // be promoted to an int, and we mask out the higher
aoqi@0 1366 // bytes in the int so that the resulting value is
aoqi@0 1367 // not a negative integer.
aoqi@0 1368 int b = buf[i] & 0xff;
aoqi@0 1369 if (checkEOL &&
aoqi@0 1370 ((lastb == '\r' && b != '\n') ||
aoqi@0 1371 (lastb != '\r' && b == '\n')))
aoqi@0 1372 badEOL = true;
aoqi@0 1373 if (b == '\r' || b == '\n')
aoqi@0 1374 linelen = 0;
aoqi@0 1375 else {
aoqi@0 1376 linelen++;
aoqi@0 1377 if (linelen > 998) // 1000 - CRLF
aoqi@0 1378 longLine = true;
aoqi@0 1379 }
aoqi@0 1380 if (nonascii(b)) { // non-ascii
aoqi@0 1381 if (breakOnNonAscii) // we are done
aoqi@0 1382 return MOSTLY_NONASCII;
aoqi@0 1383 else
aoqi@0 1384 non_ascii++;
aoqi@0 1385 } else
aoqi@0 1386 ascii++;
aoqi@0 1387 lastb = b;
aoqi@0 1388 }
aoqi@0 1389 } catch (IOException ioex) {
aoqi@0 1390 break;
aoqi@0 1391 }
aoqi@0 1392 if (max != ALL)
aoqi@0 1393 max -= len;
aoqi@0 1394 }
aoqi@0 1395
aoqi@0 1396 if (max == 0 && breakOnNonAscii)
aoqi@0 1397 // We have been told to break on the first non-ascii character.
aoqi@0 1398 // We haven't got any non-ascii character yet, but then we
aoqi@0 1399 // have not checked all of the available bytes either. So we
aoqi@0 1400 // cannot say for sure that this input stream is ALL_ASCII,
aoqi@0 1401 // and hence we must play safe and return MOSTLY_NONASCII
aoqi@0 1402
aoqi@0 1403 return MOSTLY_NONASCII;
aoqi@0 1404
aoqi@0 1405 if (non_ascii == 0) { // no non-us-ascii characters so far
aoqi@0 1406 // If we're looking at non-text data, and we saw CR without LF
aoqi@0 1407 // or vice versa, consider this mostly non-ASCII so that it
aoqi@0 1408 // will be base64 encoded (since the quoted-printable encoder
aoqi@0 1409 // doesn't encode this case properly).
aoqi@0 1410 if (badEOL)
aoqi@0 1411 return MOSTLY_NONASCII;
aoqi@0 1412 // if we've seen a long line, we degrade to mostly ascii
aoqi@0 1413 else if (longLine)
aoqi@0 1414 return MOSTLY_ASCII;
aoqi@0 1415 else
aoqi@0 1416 return ALL_ASCII;
aoqi@0 1417 }
aoqi@0 1418 if (ascii > non_ascii) // mostly ascii
aoqi@0 1419 return MOSTLY_ASCII;
aoqi@0 1420 return MOSTLY_NONASCII;
aoqi@0 1421 }
aoqi@0 1422
aoqi@0 1423 static final boolean nonascii(int b) {
aoqi@0 1424 return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
aoqi@0 1425 }
aoqi@0 1426 }
aoqi@0 1427
aoqi@0 1428 /**
aoqi@0 1429 * An OutputStream that determines whether the data written to
aoqi@0 1430 * it is all ASCII, mostly ASCII, or mostly non-ASCII.
aoqi@0 1431 */
aoqi@0 1432 class AsciiOutputStream extends OutputStream {
aoqi@0 1433 private boolean breakOnNonAscii;
aoqi@0 1434 private int ascii = 0, non_ascii = 0;
aoqi@0 1435 private int linelen = 0;
aoqi@0 1436 private boolean longLine = false;
aoqi@0 1437 private boolean badEOL = false;
aoqi@0 1438 private boolean checkEOL = false;
aoqi@0 1439 private int lastb = 0;
aoqi@0 1440 private int ret = 0;
aoqi@0 1441
aoqi@0 1442 public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
aoqi@0 1443 this.breakOnNonAscii = breakOnNonAscii;
aoqi@0 1444 checkEOL = encodeEolStrict && breakOnNonAscii;
aoqi@0 1445 }
aoqi@0 1446
aoqi@0 1447 public void write(int b) throws IOException {
aoqi@0 1448 check(b);
aoqi@0 1449 }
aoqi@0 1450
aoqi@0 1451 public void write(byte b[]) throws IOException {
aoqi@0 1452 write(b, 0, b.length);
aoqi@0 1453 }
aoqi@0 1454
aoqi@0 1455 public void write(byte b[], int off, int len) throws IOException {
aoqi@0 1456 len += off;
aoqi@0 1457 for (int i = off; i < len ; i++)
aoqi@0 1458 check(b[i]);
aoqi@0 1459 }
aoqi@0 1460
aoqi@0 1461 private final void check(int b) throws IOException {
aoqi@0 1462 b &= 0xff;
aoqi@0 1463 if (checkEOL &&
aoqi@0 1464 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
aoqi@0 1465 badEOL = true;
aoqi@0 1466 if (b == '\r' || b == '\n')
aoqi@0 1467 linelen = 0;
aoqi@0 1468 else {
aoqi@0 1469 linelen++;
aoqi@0 1470 if (linelen > 998) // 1000 - CRLF
aoqi@0 1471 longLine = true;
aoqi@0 1472 }
aoqi@0 1473 if (MimeUtility.nonascii(b)) { // non-ascii
aoqi@0 1474 non_ascii++;
aoqi@0 1475 if (breakOnNonAscii) { // we are done
aoqi@0 1476 ret = MimeUtility.MOSTLY_NONASCII;
aoqi@0 1477 throw new EOFException();
aoqi@0 1478 }
aoqi@0 1479 } else
aoqi@0 1480 ascii++;
aoqi@0 1481 lastb = b;
aoqi@0 1482 }
aoqi@0 1483
aoqi@0 1484 /**
aoqi@0 1485 * Return ASCII-ness of data stream.
aoqi@0 1486 */
aoqi@0 1487 public int getAscii() {
aoqi@0 1488 if (ret != 0)
aoqi@0 1489 return ret;
aoqi@0 1490 // If we're looking at non-text data, and we saw CR without LF
aoqi@0 1491 // or vice versa, consider this mostly non-ASCII so that it
aoqi@0 1492 // will be base64 encoded (since the quoted-printable encoder
aoqi@0 1493 // doesn't encode this case properly).
aoqi@0 1494 if (badEOL)
aoqi@0 1495 return MimeUtility.MOSTLY_NONASCII;
aoqi@0 1496 else if (non_ascii == 0) { // no non-us-ascii characters so far
aoqi@0 1497 // if we've seen a long line, we degrade to mostly ascii
aoqi@0 1498 if (longLine)
aoqi@0 1499 return MimeUtility.MOSTLY_ASCII;
aoqi@0 1500 else
aoqi@0 1501 return MimeUtility.ALL_ASCII;
aoqi@0 1502 }
aoqi@0 1503 if (ascii > non_ascii) // mostly ascii
aoqi@0 1504 return MimeUtility.MOSTLY_ASCII;
aoqi@0 1505 return MimeUtility.MOSTLY_NONASCII;
aoqi@0 1506 }
aoqi@0 1507 }

mercurial