src/share/jaxws_classes/com/sun/xml/internal/messaging/saaj/packaging/mime/internet/MimeUtility.java

changeset 0
373ffda63c9a
child 637
9c07ef4934dd
equal deleted inserted replaced
-1:000000000000 0:373ffda63c9a
1 /*
2 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 /*
27 * @(#)MimeUtility.java 1.45 03/03/10
28 */
29
30
31
32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
33
34 import java.io.*;
35 import java.util.*;
36
37 import javax.activation.DataHandler;
38 import javax.activation.DataSource;
39
40 import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException;
41 import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*;
42 import com.sun.xml.internal.messaging.saaj.util.SAAJUtil;
43
44 /**
45 * This is a utility class that provides various MIME related
46 * functionality. <p>
47 *
48 * There are a set of methods to encode and decode MIME headers as
49 * per RFC 2047. A brief description on handling such headers is
50 * given below: <p>
51 *
52 * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
53 * characters. Headers that contain non US-ASCII characters must be
54 * encoded so that they contain only US-ASCII characters. Basically,
55 * this process involves using either BASE64 or QP to encode certain
56 * characters. RFC 2047 describes this in detail. <p>
57 *
58 * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
59 * subset of Unicode (and occupies the range 0 - 127). A String
60 * that contains only ASCII characters is already mail-safe. If the
61 * String contains non US-ASCII characters, it must be encoded. An
62 * additional complexity in this step is that since Unicode is not
63 * yet a widely used charset, one might want to first charset-encode
64 * the String into another charset and then do the transfer-encoding.
65 * <p>
66 * Note that to get the actual bytes of a mail-safe String (say,
67 * for sending over SMTP), one must do
68 * <p><blockquote><pre>
69 *
70 * byte[] bytes = string.getBytes("iso-8859-1");
71 *
72 * </pre></blockquote><p>
73 *
74 * The <code>setHeader</code> and <code>addHeader</code> methods
75 * on MimeMessage and MimeBodyPart assume that the given header values
76 * are Unicode strings that contain only US-ASCII characters. Hence
77 * the callers of those methods must ensure that the values they pass
78 * do not contain non US-ASCII characters. The methods in this class
79 * help do this. <p>
80 *
81 * The <code>getHeader</code> family of methods on MimeMessage and
82 * MimeBodyPart return the raw header value. These might be encoded
83 * as per RFC 2047, and if so, must be decoded into Unicode Strings.
84 * The methods in this class help to do this. <p>
85 *
86 * Several System properties control strict conformance to the MIME
87 * spec. Note that these are not session properties but must be set
88 * globally as System properties. <p>
89 *
90 * The <code>mail.mime.decodetext.strict</code> property controls
91 * decoding of MIME encoded words. The MIME spec requires that encoded
92 * words start at the beginning of a whitespace separated word. Some
93 * mailers incorrectly include encoded words in the middle of a word.
94 * If the <code>mail.mime.decodetext.strict</code> System property is
95 * set to <code>"false"</code>, an attempt will be made to decode these
96 * illegal encoded words. The default is true. <p>
97 *
98 * The <code>mail.mime.encodeeol.strict</code> property controls the
99 * choice of Content-Transfer-Encoding for MIME parts that are not of
100 * type "text". Often such parts will contain textual data for which
101 * an encoding that allows normal end of line conventions is appropriate.
102 * In rare cases, such a part will appear to contain entirely textual
103 * data, but will require an encoding that preserves CR and LF characters
104 * without change. If the <code>mail.mime.encodeeol.strict</code>
105 * System property is set to <code>"true"</code>, such an encoding will
106 * be used when necessary. The default is false. <p>
107 *
108 * In addition, the <code>mail.mime.charset</code> System property can
109 * be used to specify the default MIME charset to use for encoded words
110 * and text parts that don't otherwise specify a charset. Normally, the
111 * default MIME charset is derived from the default Java charset, as
112 * specified in the <code>file.encoding</code> System property. Most
113 * applications will have no need to explicitly set the default MIME
114 * charset. In cases where the default MIME charset to be used for
115 * mail messages is different than the charset used for files stored on
116 * the system, this property should be set.
117 *
118 * @version 1.45, 03/03/10
119 * @author John Mani
120 * @author Bill Shannon
121 */
122
123 public class MimeUtility {
124
// Holder for static helpers only; the private constructor keeps callers
// from creating instances.
private MimeUtility() { }

/**
 * Value handed to <code>checkAscii</code> as its second argument by
 * <code>getEncoding(DataSource)</code>.
 * NOTE(review): presumably means "examine every byte" - confirm against
 * <code>checkAscii</code>.
 */
public static final int ALL = -1;

// Initial capacity of the scratch ByteArrayOutputStream used by doEncode
private static final int BUFFER_SIZE = 1024;

// Flag defaults; the static initializer below may override each of them
// from the matching System property.
private static boolean decodeStrict = true;       // mail.mime.decodetext.strict
private static boolean encodeEolStrict = false;   // mail.mime.encodeeol.strict
private static boolean foldEncodedWords = false;  // mail.mime.foldencodedwords
private static boolean foldText = true;           // mail.mime.foldtext

static {
    // All four lookups share one try block on purpose: if a lookup is
    // rejected with a SecurityException, the flags not yet assigned keep
    // the defaults declared above.
    try {
        String strictDecode =
            SAAJUtil.getSystemProperty("mail.mime.decodetext.strict");
        // true unless the property is explicitly "false"
        decodeStrict = !"false".equalsIgnoreCase(strictDecode);

        String strictEol =
            SAAJUtil.getSystemProperty("mail.mime.encodeeol.strict");
        // false unless the property is explicitly "true"
        encodeEolStrict = "true".equalsIgnoreCase(strictEol);

        String foldWords =
            SAAJUtil.getSystemProperty("mail.mime.foldencodedwords");
        // false unless the property is explicitly "true"
        foldEncodedWords = "true".equalsIgnoreCase(foldWords);

        String fold = SAAJUtil.getSystemProperty("mail.mime.foldtext");
        // true unless the property is explicitly "false"
        foldText = !"false".equalsIgnoreCase(fold);
    } catch (SecurityException denied) {
        // not allowed to read the properties; stay with what we have
    }
}
154
155
156 /**
157 * Get the content-transfer-encoding that should be applied
158 * to the input stream of this datasource, to make it mailsafe. <p>
159 *
160 * The algorithm used here is: <br>
161 * <ul>
162 * <li>
163 * If the primary type of this datasource is "text" and if all
164 * the bytes in its input stream are US-ASCII, then the encoding
165 * is "7bit". If more than half of the bytes are non-US-ASCII, then
166 * the encoding is "base64". If less than half of the bytes are
167 * non-US-ASCII, then the encoding is "quoted-printable".
168 * <li>
169 * If the primary type of this datasource is not "text", then if
170 * all the bytes of its input stream are US-ASCII, the encoding
171 * is "7bit". If there is even one non-US-ASCII character, the
172 * encoding is "base64".
173 * </ul>
174 *
175 * @param ds DataSource
176 * @return the encoding. This is either "7bit",
177 * "quoted-printable" or "base64"
178 */
179 public static String getEncoding(DataSource ds) {
180 ContentType cType = null;
181 InputStream is = null;
182 String encoding = null;
183
184 try {
185 cType = new ContentType(ds.getContentType());
186 is = ds.getInputStream();
187 } catch (Exception ex) {
188 return "base64"; // what else ?!
189 }
190
191 boolean isText = cType.match("text/*");
192 // if not text, stop processing when we see non-ASCII
193 int i = checkAscii(is, ALL, !isText);
194 switch (i) {
195 case ALL_ASCII:
196 encoding = "7bit"; // all ascii
197 break;
198 case MOSTLY_ASCII:
199 encoding = "quoted-printable"; // mostly ascii
200 break;
201 default:
202 encoding = "base64"; // mostly binary
203 break;
204 }
205
206 // Close the input stream
207 try {
208 is.close();
209 } catch (IOException ioex) { }
210
211 return encoding;
212 }
213
214 /**
215 * Same as <code>getEncoding(DataSource)</code> except that instead
216 * of reading the data from an <code>InputStream</code> it uses the
217 * <code>writeTo</code> method to examine the data. This is more
218 * efficient in the common case of a <code>DataHandler</code>
219 * created with an object and a MIME type (for example, a
220 * "text/plain" String) because all the I/O is done in this
221 * thread. In the case requiring an <code>InputStream</code> the
222 * <code>DataHandler</code> uses a thread, a pair of pipe streams,
223 * and the <code>writeTo</code> method to produce the data. <p>
224 *
225 * @since JavaMail 1.2
226 */
227 public static String getEncoding(DataHandler dh) {
228 ContentType cType = null;
229 String encoding = null;
230
231 /*
232 * Try to pick the most efficient means of determining the
233 * encoding. If this DataHandler was created using a DataSource,
234 * the getEncoding(DataSource) method is typically faster. If
235 * the DataHandler was created with an object, this method is
236 * much faster. To distinguish the two cases, we use a heuristic.
237 * A DataHandler created with an object will always have a null name.
238 * A DataHandler created with a DataSource will usually have a
239 * non-null name.
240 *
241 * XXX - This is actually quite a disgusting hack, but it makes
242 * a common case run over twice as fast.
243 */
244 if (dh.getName() != null)
245 return getEncoding(dh.getDataSource());
246
247 try {
248 cType = new ContentType(dh.getContentType());
249 } catch (Exception ex) {
250 return "base64"; // what else ?!
251 }
252
253 if (cType.match("text/*")) {
254 // Check all of the available bytes
255 AsciiOutputStream aos = new AsciiOutputStream(false, false);
256 try {
257 dh.writeTo(aos);
258 } catch (IOException ex) { } // ignore it
259 switch (aos.getAscii()) {
260 case ALL_ASCII:
261 encoding = "7bit"; // all ascii
262 break;
263 case MOSTLY_ASCII:
264 encoding = "quoted-printable"; // mostly ascii
265 break;
266 default:
267 encoding = "base64"; // mostly binary
268 break;
269 }
270 } else { // not "text"
271 // Check all of available bytes, break out if we find
272 // at least one non-US-ASCII character
273 AsciiOutputStream aos =
274 new AsciiOutputStream(true, encodeEolStrict);
275 try {
276 dh.writeTo(aos);
277 } catch (IOException ex) { } // ignore it
278 if (aos.getAscii() == ALL_ASCII) // all ascii
279 encoding = "7bit";
280 else // found atleast one non-ascii character, use b64
281 encoding = "base64";
282 }
283
284 return encoding;
285 }
286
287 /**
288 * Decode the given input stream. The Input stream returned is
289 * the decoded input stream. All the encodings defined in RFC 2045
290 * are supported here. They include "base64", "quoted-printable",
291 * "7bit", "8bit", and "binary". In addition, "uuencode" is also
292 * supported.
293 *
294 * @param is input stream
295 * @param encoding the encoding of the stream.
296 * @return decoded input stream.
297 */
298 public static InputStream decode(InputStream is, String encoding)
299 throws MessagingException {
300 if (encoding.equalsIgnoreCase("base64"))
301 return new BASE64DecoderStream(is);
302 else if (encoding.equalsIgnoreCase("quoted-printable"))
303 return new QPDecoderStream(is);
304 else if (encoding.equalsIgnoreCase("uuencode") ||
305 encoding.equalsIgnoreCase("x-uuencode") ||
306 encoding.equalsIgnoreCase("x-uue"))
307 return new UUDecoderStream(is);
308 else if (encoding.equalsIgnoreCase("binary") ||
309 encoding.equalsIgnoreCase("7bit") ||
310 encoding.equalsIgnoreCase("8bit"))
311 return is;
312 else
313 throw new MessagingException("Unknown encoding: " + encoding);
314 }
315
316 /**
317 * Wrap an encoder around the given output stream.
318 * All the encodings defined in RFC 2045 are supported here.
319 * They include "base64", "quoted-printable", "7bit", "8bit" and
320 * "binary". In addition, "uuencode" is also supported.
321 *
322 * @param os output stream
323 * @param encoding the encoding of the stream.
324 * @return output stream that applies the
325 * specified encoding.
326 */
327 public static OutputStream encode(OutputStream os, String encoding)
328 throws MessagingException {
329 if (encoding == null)
330 return os;
331 else if (encoding.equalsIgnoreCase("base64"))
332 return new BASE64EncoderStream(os);
333 else if (encoding.equalsIgnoreCase("quoted-printable"))
334 return new QPEncoderStream(os);
335 else if (encoding.equalsIgnoreCase("uuencode") ||
336 encoding.equalsIgnoreCase("x-uuencode") ||
337 encoding.equalsIgnoreCase("x-uue"))
338 return new UUEncoderStream(os);
339 else if (encoding.equalsIgnoreCase("binary") ||
340 encoding.equalsIgnoreCase("7bit") ||
341 encoding.equalsIgnoreCase("8bit"))
342 return os;
343 else
344 throw new MessagingException("Unknown encoding: " +encoding);
345 }
346
347 /**
348 * Wrap an encoder around the given output stream.
349 * All the encodings defined in RFC 2045 are supported here.
350 * They include "base64", "quoted-printable", "7bit", "8bit" and
351 * "binary". In addition, "uuencode" is also supported.
352 * The <code>filename</code> parameter is used with the "uuencode"
353 * encoding and is included in the encoded output.
354 *
355 * @param os output stream
356 * @param encoding the encoding of the stream.
357 * @param filename name for the file being encoded (only used
358 * with uuencode)
359 * @return output stream that applies the
360 * specified encoding.
361 * @since JavaMail 1.2
362 */
363 public static OutputStream encode(OutputStream os, String encoding,
364 String filename)
365 throws MessagingException {
366 if (encoding == null)
367 return os;
368 else if (encoding.equalsIgnoreCase("base64"))
369 return new BASE64EncoderStream(os);
370 else if (encoding.equalsIgnoreCase("quoted-printable"))
371 return new QPEncoderStream(os);
372 else if (encoding.equalsIgnoreCase("uuencode") ||
373 encoding.equalsIgnoreCase("x-uuencode") ||
374 encoding.equalsIgnoreCase("x-uue"))
375 return new UUEncoderStream(os, filename);
376 else if (encoding.equalsIgnoreCase("binary") ||
377 encoding.equalsIgnoreCase("7bit") ||
378 encoding.equalsIgnoreCase("8bit"))
379 return os;
380 else
381 throw new MessagingException("Unknown encoding: " +encoding);
382 }
383
384 /**
385 * Encode a RFC 822 "text" token into mail-safe form as per
386 * RFC 2047. <p>
387 *
388 * The given Unicode string is examined for non US-ASCII
389 * characters. If the string contains only US-ASCII characters,
390 * it is returned as-is. If the string contains non US-ASCII
391 * characters, it is first character-encoded using the platform's
392 * default charset, then transfer-encoded using either the B or
393 * Q encoding. The resulting bytes are then returned as a Unicode
394 * string containing only ASCII characters. <p>
395 *
396 * Note that this method should be used to encode only
397 * "unstructured" RFC 822 headers. <p>
398 *
399 * Example of usage:
400 * <p><blockquote><pre>
401 *
402 * MimeBodyPart part = ...
403 * String rawvalue = "FooBar Mailer, Japanese version 1.1"
404 * try {
405 * // If we know for sure that rawvalue contains only US-ASCII
406 * // characters, we can skip the encoding part
407 * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
408 * } catch (UnsupportedEncodingException e) {
409 * // encoding failure
410 * } catch (MessagingException me) {
411 * // setHeader() failure
412 * }
413 *
414 * </pre></blockquote><p>
415 *
416 * @param text unicode string
417 * @return Unicode string containing only US-ASCII characters
418 * @exception UnsupportedEncodingException if the encoding fails
419 */
420 public static String encodeText(String text)
421 throws UnsupportedEncodingException {
422 return encodeText(text, null, null);
423 }
424
425 /**
426 * Encode a RFC 822 "text" token into mail-safe form as per
427 * RFC 2047. <p>
428 *
429 * The given Unicode string is examined for non US-ASCII
430 * characters. If the string contains only US-ASCII characters,
431 * it is returned as-is. If the string contains non US-ASCII
432 * characters, it is first character-encoded using the specified
433 * charset, then transfer-encoded using either the B or Q encoding.
434 * The resulting bytes are then returned as a Unicode string
435 * containing only ASCII characters. <p>
436 *
437 * Note that this method should be used to encode only
438 * "unstructured" RFC 822 headers.
439 *
440 * @param text the header value
441 * @param charset the charset. If this parameter is null, the
442 * platform's default chatset is used.
443 * @param encoding the encoding to be used. Currently supported
444 * values are "B" and "Q". If this parameter is null, then
445 * the "Q" encoding is used if most of characters to be
446 * encoded are in the ASCII charset, otherwise "B" encoding
447 * is used.
448 * @return Unicode string containing only US-ASCII characters
449 */
450 public static String encodeText(String text, String charset,
451 String encoding)
452 throws UnsupportedEncodingException {
453 return encodeWord(text, charset, encoding, false);
454 }
455
456 /**
457 * Decode "unstructured" headers, that is, headers that are defined
458 * as '*text' as per RFC 822. <p>
459 *
460 * The string is decoded using the algorithm specified in
461 * RFC 2047, Section 6.1.1. If the charset-conversion fails
462 * for any sequence, an UnsupportedEncodingException is thrown.
463 * If the String is not an RFC 2047 style encoded header, it is
464 * returned as-is <p>
465 *
466 * Example of usage:
467 * <p><blockquote><pre>
468 *
469 * MimeBodyPart part = ...
470 * String rawvalue = null;
471 * String value = null;
472 * try {
473 * if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
474 * value = MimeUtility.decodeText(rawvalue);
475 * } catch (UnsupportedEncodingException e) {
476 * // Don't care
477 * value = rawvalue;
478 * } catch (MessagingException me) { }
479 *
480 * return value;
481 *
482 * </pre></blockquote><p>
483 *
484 * @param etext the possibly encoded value
485 * @exception UnsupportedEncodingException if the charset
486 * conversion failed.
487 */
488 public static String decodeText(String etext)
489 throws UnsupportedEncodingException {
490 /*
491 * We look for sequences separated by "linear-white-space".
492 * (as per RFC 2047, Section 6.1.1)
493 * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
494 */
495 String lwsp = " \t\n\r";
496 StringTokenizer st;
497
498 /*
499 * First, lets do a quick run thru the string and check
500 * whether the sequence "=?" exists at all. If none exists,
501 * we know there are no encoded-words in here and we can just
502 * return the string as-is, without suffering thru the later
503 * decoding logic.
504 * This handles the most common case of unencoded headers
505 * efficiently.
506 */
507 if (etext.indexOf("=?") == -1)
508 return etext;
509
510 // Encoded words found. Start decoding ...
511
512 st = new StringTokenizer(etext, lwsp, true);
513 StringBuffer sb = new StringBuffer(); // decode buffer
514 StringBuffer wsb = new StringBuffer(); // white space buffer
515 boolean prevWasEncoded = false;
516
517 while (st.hasMoreTokens()) {
518 char c;
519 String s = st.nextToken();
520 // If whitespace, append it to the whitespace buffer
521 if (((c = s.charAt(0)) == ' ') || (c == '\t') ||
522 (c == '\r') || (c == '\n'))
523 wsb.append(c);
524 else {
525 // Check if token is an 'encoded-word' ..
526 String word;
527 try {
528 word = decodeWord(s);
529 // Yes, this IS an 'encoded-word'.
530 if (!prevWasEncoded && wsb.length() > 0) {
531 // if the previous word was also encoded, we
532 // should ignore the collected whitespace. Else
533 // we include the whitespace as well.
534 sb.append(wsb);
535 }
536 prevWasEncoded = true;
537 } catch (ParseException pex) {
538 // This is NOT an 'encoded-word'.
539 word = s;
540 // possibly decode inner encoded words
541 if (!decodeStrict)
542 word = decodeInnerWords(word);
543 // include colleced whitespace ..
544 if (wsb.length() > 0)
545 sb.append(wsb);
546 prevWasEncoded = false;
547 }
548 sb.append(word); // append the actual word
549 wsb.setLength(0); // reset wsb for reuse
550 }
551 }
552 return sb.toString();
553 }
554
555 /**
556 * Encode a RFC 822 "word" token into mail-safe form as per
557 * RFC 2047. <p>
558 *
559 * The given Unicode string is examined for non US-ASCII
560 * characters. If the string contains only US-ASCII characters,
561 * it is returned as-is. If the string contains non US-ASCII
562 * characters, it is first character-encoded using the platform's
563 * default charset, then transfer-encoded using either the B or
564 * Q encoding. The resulting bytes are then returned as a Unicode
565 * string containing only ASCII characters. <p>
566 *
567 * This method is meant to be used when creating RFC 822 "phrases".
568 * The InternetAddress class, for example, uses this to encode
569 * it's 'phrase' component.
570 *
571 * @param text unicode string
572 * @return Array of Unicode strings containing only US-ASCII
573 * characters.
574 * @exception UnsupportedEncodingException if the encoding fails
575 */
576 public static String encodeWord(String word)
577 throws UnsupportedEncodingException {
578 return encodeWord(word, null, null);
579 }
580
581 /**
582 * Encode a RFC 822 "word" token into mail-safe form as per
583 * RFC 2047. <p>
584 *
585 * The given Unicode string is examined for non US-ASCII
586 * characters. If the string contains only US-ASCII characters,
587 * it is returned as-is. If the string contains non US-ASCII
588 * characters, it is first character-encoded using the specified
589 * charset, then transfer-encoded using either the B or Q encoding.
590 * The resulting bytes are then returned as a Unicode string
591 * containing only ASCII characters. <p>
592 *
593 * @param text unicode string
594 * @param charset the MIME charset
595 * @param encoding the encoding to be used. Currently supported
596 * values are "B" and "Q". If this parameter is null, then
597 * the "Q" encoding is used if most of characters to be
598 * encoded are in the ASCII charset, otherwise "B" encoding
599 * is used.
600 * @return Unicode string containing only US-ASCII characters
601 * @exception UnsupportedEncodingException if the encoding fails
602 */
603 public static String encodeWord(String word, String charset,
604 String encoding)
605 throws UnsupportedEncodingException {
606 return encodeWord(word, charset, encoding, true);
607 }
608
609 /*
610 * Encode the given string. The parameter 'encodingWord' should
611 * be true if a RFC 822 "word" token is being encoded and false if a
612 * RFC 822 "text" token is being encoded. This is because the
613 * "Q" encoding defined in RFC 2047 has more restrictions when
614 * encoding "word" tokens. (Sigh)
615 */
616 private static String encodeWord(String string, String charset,
617 String encoding, boolean encodingWord)
618 throws UnsupportedEncodingException {
619
620 // If 'string' contains only US-ASCII characters, just
621 // return it.
622 int ascii = checkAscii(string);
623 if (ascii == ALL_ASCII)
624 return string;
625
626 // Else, apply the specified charset conversion.
627 String jcharset;
628 if (charset == null) { // use default charset
629 jcharset = getDefaultJavaCharset(); // the java charset
630 charset = getDefaultMIMECharset(); // the MIME equivalent
631 } else // MIME charset -> java charset
632 jcharset = javaCharset(charset);
633
634 // If no transfer-encoding is specified, figure one out.
635 if (encoding == null) {
636 if (ascii != MOSTLY_NONASCII)
637 encoding = "Q";
638 else
639 encoding = "B";
640 }
641
642 boolean b64;
643 if (encoding.equalsIgnoreCase("B"))
644 b64 = true;
645 else if (encoding.equalsIgnoreCase("Q"))
646 b64 = false;
647 else
648 throw new UnsupportedEncodingException(
649 "Unknown transfer encoding: " + encoding);
650
651 StringBuffer outb = new StringBuffer(); // the output buffer
652 doEncode(string, b64, jcharset,
653 // As per RFC 2047, size of an encoded string should not
654 // exceed 75 bytes.
655 // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
656 75 - 7 - charset.length(), // the available space
657 "=?" + charset + "?" + encoding + "?", // prefix
658 true, encodingWord, outb);
659
660 return outb.toString();
661 }
662
663 private static void doEncode(String string, boolean b64,
664 String jcharset, int avail, String prefix,
665 boolean first, boolean encodingWord, StringBuffer buf)
666 throws UnsupportedEncodingException {
667
668 // First find out what the length of the encoded version of
669 // 'string' would be.
670 byte[] bytes = string.getBytes(jcharset);
671 int len;
672 if (b64) // "B" encoding
673 len = BEncoderStream.encodedLength(bytes);
674 else // "Q"
675 len = QEncoderStream.encodedLength(bytes, encodingWord);
676
677 int size;
678 if ((len > avail) && ((size = string.length()) > 1)) {
679 // If the length is greater than 'avail', split 'string'
680 // into two and recurse.
681 doEncode(string.substring(0, size/2), b64, jcharset,
682 avail, prefix, first, encodingWord, buf);
683 doEncode(string.substring(size/2, size), b64, jcharset,
684 avail, prefix, false, encodingWord, buf);
685 } else {
686 // length <= than 'avail'. Encode the given string
687 ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
688 OutputStream eos; // the encoder
689 if (b64) // "B" encoding
690 eos = new BEncoderStream(os);
691 else // "Q" encoding
692 eos = new QEncoderStream(os, encodingWord);
693
694 try { // do the encoding
695 eos.write(bytes);
696 eos.close();
697 } catch (IOException ioex) { }
698
699 byte[] encodedBytes = os.toByteArray(); // the encoded stuff
700 // Now write out the encoded (all ASCII) bytes into our
701 // StringBuffer
702 if (!first) // not the first line of this sequence
703 if (foldEncodedWords)
704 buf.append("\r\n "); // start a continuation line
705 else
706 buf.append(" "); // line will be folded later
707
708 buf.append(prefix);
709 for (int i = 0; i < encodedBytes.length; i++)
710 buf.append((char)encodedBytes[i]);
711 buf.append("?="); // terminate the current sequence
712 }
713 }
714
715 /**
716 * The string is parsed using the rules in RFC 2047 for parsing
717 * an "encoded-word". If the parse fails, a ParseException is
718 * thrown. Otherwise, it is transfer-decoded, and then
719 * charset-converted into Unicode. If the charset-conversion
720 * fails, an UnsupportedEncodingException is thrown.<p>
721 *
722 * @param eword the possibly encoded value
723 * @exception ParseException if the string is not an
724 * encoded-word as per RFC 2047.
725 * @exception UnsupportedEncodingException if the charset
726 * conversion failed.
727 */
728 public static String decodeWord(String eword)
729 throws ParseException, UnsupportedEncodingException {
730
731 if (!eword.startsWith("=?")) // not an encoded word
732 throw new ParseException();
733
734 // get charset
735 int start = 2; int pos;
736 if ((pos = eword.indexOf('?', start)) == -1)
737 throw new ParseException();
738 String charset = javaCharset(eword.substring(start, pos));
739
740 // get encoding
741 start = pos+1;
742 if ((pos = eword.indexOf('?', start)) == -1)
743 throw new ParseException();
744 String encoding = eword.substring(start, pos);
745
746 // get encoded-sequence
747 start = pos+1;
748 if ((pos = eword.indexOf("?=", start)) == -1)
749 throw new ParseException();
750 String word = eword.substring(start, pos);
751
752 try {
753 // Extract the bytes from word
754 ByteArrayInputStream bis =
755 new ByteArrayInputStream(ASCIIUtility.getBytes(word));
756
757 // Get the appropriate decoder
758 InputStream is;
759 if (encoding.equalsIgnoreCase("B"))
760 is = new BASE64DecoderStream(bis);
761 else if (encoding.equalsIgnoreCase("Q"))
762 is = new QDecoderStream(bis);
763 else
764 throw new UnsupportedEncodingException(
765 "unknown encoding: " + encoding);
766
767 // For b64 & q, size of decoded word <= size of word. So
768 // the decoded bytes must fit into the 'bytes' array. This
769 // is certainly more efficient than writing bytes into a
770 // ByteArrayOutputStream and then pulling out the byte[]
771 // from it.
772 int count = bis.available();
773 byte[] bytes = new byte[count];
774 // count is set to the actual number of decoded bytes
775 count = is.read(bytes, 0, count);
776
777 // Finally, convert the decoded bytes into a String using
778 // the specified charset
779 String s = new String(bytes, 0, count, charset);
780 if (pos + 2 < eword.length()) {
781 // there's still more text in the string
782 String rest = eword.substring(pos + 2);
783 if (!decodeStrict)
784 rest = decodeInnerWords(rest);
785 s += rest;
786 }
787 return s;
788 } catch (UnsupportedEncodingException uex) {
789 // explicitly catch and rethrow this exception, otherwise
790 // the below IOException catch will swallow this up!
791 throw uex;
792 } catch (IOException ioex) {
793 // Shouldn't happen.
794 throw new ParseException();
795 } catch (IllegalArgumentException iex) {
796 /* An unknown charset of the form ISO-XXX-XXX, will cause
797 * the JDK to throw an IllegalArgumentException ... Since the
798 * JDK will attempt to create a classname using this string,
799 * but valid classnames must not contain the character '-',
800 * and this results in an IllegalArgumentException, rather than
801 * the expected UnsupportedEncodingException. Yikes
802 */
803 throw new UnsupportedEncodingException();
804 }
805 }
806
807 /**
808 * Look for encoded words within a word. The MIME spec doesn't
809 * allow this, but many broken mailers, especially Japanese mailers,
810 * produce such incorrect encodings.
811 */
812 private static String decodeInnerWords(String word)
813 throws UnsupportedEncodingException {
814 int start = 0, i;
815 StringBuffer buf = new StringBuffer();
816 while ((i = word.indexOf("=?", start)) >= 0) {
817 buf.append(word.substring(start, i));
818 int end = word.indexOf("?=", i);
819 if (end < 0)
820 break;
821 String s = word.substring(i, end + 2);
822 try {
823 s = decodeWord(s);
824 } catch (ParseException pex) {
825 // ignore it, just use the original string
826 }
827 buf.append(s);
828 start = end + 2;
829 }
830 if (start == 0)
831 return word;
832 if (start < word.length())
833 buf.append(word.substring(start));
834 return buf.toString();
835 }
836
837 /**
838 * A utility method to quote a word, if the word contains any
839 * characters from the specified 'specials' list.<p>
840 *
841 * The <code>HeaderTokenizer</code> class defines two special
842 * sets of delimiters - MIME and RFC 822. <p>
843 *
844 * This method is typically used during the generation of
845 * RFC 822 and MIME header fields.
846 *
847 * @param word word to be quoted
848 * @param specials the set of special characters
849 * @return the possibly quoted word
850 * @see javax.mail.internet.HeaderTokenizer#MIME
851 * @see javax.mail.internet.HeaderTokenizer#RFC822
852 */
853 public static String quote(String word, String specials) {
854 int len = word.length();
855
856 /*
857 * Look for any "bad" characters, Escape and
858 * quote the entire string if necessary.
859 */
860 boolean needQuoting = false;
861 for (int i = 0; i < len; i++) {
862 char c = word.charAt(i);
863 if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
864 // need to escape them and then quote the whole string
865 StringBuffer sb = new StringBuffer(len + 3);
866 sb.append('"');
867 sb.append(word.substring(0, i));
868 int lastc = 0;
869 for (int j = i; j < len; j++) {
870 char cc = word.charAt(j);
871 if ((cc == '"') || (cc == '\\') ||
872 (cc == '\r') || (cc == '\n'))
873 if (cc == '\n' && lastc == '\r')
874 ; // do nothing, CR was already escaped
875 else
876 sb.append('\\'); // Escape the character
877 sb.append(cc);
878 lastc = cc;
879 }
880 sb.append('"');
881 return sb.toString();
882 } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
883 // These characters cause the string to be quoted
884 needQuoting = true;
885 }
886
887 if (needQuoting) {
888 StringBuffer sb = new StringBuffer(len + 2);
889 sb.append('"').append(word).append('"');
890 return sb.toString();
891 } else
892 return word;
893 }
894
895 /**
896 * Fold a string at linear whitespace so that each line is no longer
897 * than 76 characters, if possible. If there are more than 76
898 * non-whitespace characters consecutively, the string is folded at
899 * the first whitespace after that sequence. The parameter
900 * <code>used</code> indicates how many characters have been used in
901 * the current line; it is usually the length of the header name. <p>
902 *
903 * Note that line breaks in the string aren't escaped; they probably
904 * should be.
905 *
906 * @param used characters used in line so far
907 * @param s the string to fold
908 * @return the folded string
909 */
910 /*public*/ static String fold(int used, String s) {
911 if (!foldText)
912 return s;
913
914 int end;
915 char c;
916 // Strip trailing spaces
917 for (end = s.length() - 1; end >= 0; end--) {
918 c = s.charAt(end);
919 if (c != ' ' && c != '\t')
920 break;
921 }
922 if (end != s.length() - 1)
923 s = s.substring(0, end + 1);
924
925 // if the string fits now, just return it
926 if (used + s.length() <= 76)
927 return s;
928
929 // have to actually fold the string
930 StringBuffer sb = new StringBuffer(s.length() + 4);
931 char lastc = 0;
932 while (used + s.length() > 76) {
933 int lastspace = -1;
934 for (int i = 0; i < s.length(); i++) {
935 if (lastspace != -1 && used + i > 76)
936 break;
937 c = s.charAt(i);
938 if (c == ' ' || c == '\t')
939 if (!(lastc == ' ' || lastc == '\t'))
940 lastspace = i;
941 lastc = c;
942 }
943 if (lastspace == -1) {
944 // no space, use the whole thing
945 sb.append(s);
946 s = "";
947 used = 0;
948 break;
949 }
950 sb.append(s.substring(0, lastspace));
951 sb.append("\r\n");
952 lastc = s.charAt(lastspace);
953 sb.append(lastc);
954 s = s.substring(lastspace + 1);
955 used = 1;
956 }
957 sb.append(s);
958 return sb.toString();
959 }
960
961 /**
962 * Unfold a folded header. Any line breaks that aren't escaped and
963 * are followed by whitespace are removed.
964 *
965 * @param s the string to unfold
966 * @return the unfolded string
967 */
968 /*public*/ static String unfold(String s) {
969 if (!foldText)
970 return s;
971
972 StringBuffer sb = null;
973 int i;
974 while ((i = indexOfAny(s, "\r\n")) >= 0) {
975 int start = i;
976 int l = s.length();
977 i++; // skip CR or NL
978 if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n')
979 i++; // skip LF
980 if (start == 0 || s.charAt(start - 1) != '\\') {
981 char c;
982 // if next line starts with whitespace, skip all of it
983 // XXX - always has to be true?
984 if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) {
985 i++; // skip whitespace
986 while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t'))
987 i++;
988 if (sb == null)
989 sb = new StringBuffer(s.length());
990 if (start != 0) {
991 sb.append(s.substring(0, start));
992 sb.append(' ');
993 }
994 s = s.substring(i);
995 continue;
996 }
997 // it's not a continuation line, just leave it in
998 if (sb == null)
999 sb = new StringBuffer(s.length());
1000 sb.append(s.substring(0, i));
1001 s = s.substring(i);
1002 } else {
1003 // there's a backslash at "start - 1"
1004 // strip it out, but leave in the line break
1005 if (sb == null)
1006 sb = new StringBuffer(s.length());
1007 sb.append(s.substring(0, start - 1));
1008 sb.append(s.substring(start, i));
1009 s = s.substring(i);
1010 }
1011 }
1012 if (sb != null) {
1013 sb.append(s);
1014 return sb.toString();
1015 } else
1016 return s;
1017 }
1018
1019 /**
1020 * Return the first index of any of the characters in "any" in "s",
1021 * or -1 if none are found.
1022 *
1023 * This should be a method on String.
1024 */
1025 private static int indexOfAny(String s, String any) {
1026 return indexOfAny(s, any, 0);
1027 }
1028
1029 private static int indexOfAny(String s, String any, int start) {
1030 try {
1031 int len = s.length();
1032 for (int i = start; i < len; i++) {
1033 if (any.indexOf(s.charAt(i)) >= 0)
1034 return i;
1035 }
1036 return -1;
1037 } catch (StringIndexOutOfBoundsException e) {
1038 return -1;
1039 }
1040 }
1041
1042 /**
1043 * Convert a MIME charset name into a valid Java charset name. <p>
1044 *
1045 * @param charset the MIME charset name
1046 * @return the Java charset equivalent. If a suitable mapping is
1047 * not available, the passed in charset is itself returned.
1048 */
1049 public static String javaCharset(String charset) {
1050 if (mime2java == null || charset == null)
1051 // no mapping table, or charset parameter is null
1052 return charset;
1053
1054 String alias = (String)mime2java.get(charset.toLowerCase());
1055 return alias == null ? charset : alias;
1056 }
1057
1058 /**
1059 * Convert a java charset into its MIME charset name. <p>
1060 *
1061 * Note that a future version of JDK (post 1.2) might provide
1062 * this functionality, in which case, we may deprecate this
1063 * method then.
1064 *
1065 * @param charset the JDK charset
1066 * @return the MIME/IANA equivalent. If a mapping
1067 * is not possible, the passed in charset itself
1068 * is returned.
1069 * @since JavaMail 1.1
1070 */
1071 public static String mimeCharset(String charset) {
1072 if (java2mime == null || charset == null)
1073 // no mapping table or charset param is null
1074 return charset;
1075
1076 String alias = (String)java2mime.get(charset.toLowerCase());
1077 return alias == null ? charset : alias;
1078 }
1079
1080 private static String defaultJavaCharset;
1081 private static String defaultMIMECharset;
1082
1083 /**
1084 * Get the default charset corresponding to the system's current
1085 * default locale. If the System property <code>mail.mime.charset</code>
1086 * is set, a system charset corresponding to this MIME charset will be
1087 * returned. <p>
1088 *
1089 * @return the default charset of the system's default locale,
1090 * as a Java charset. (NOT a MIME charset)
1091 * @since JavaMail 1.1
1092 */
1093 public static String getDefaultJavaCharset() {
1094 if (defaultJavaCharset == null) {
1095 /*
1096 * If mail.mime.charset is set, it controls the default
1097 * Java charset as well.
1098 */
1099 String mimecs = null;
1100
1101 mimecs = SAAJUtil.getSystemProperty("mail.mime.charset");
1102
1103 if (mimecs != null && mimecs.length() > 0) {
1104 defaultJavaCharset = javaCharset(mimecs);
1105 return defaultJavaCharset;
1106 }
1107
1108 try {
1109 defaultJavaCharset = System.getProperty("file.encoding",
1110 "8859_1");
1111 } catch (SecurityException sex) {
1112
1113 class NullInputStream extends InputStream {
1114 public int read() {
1115 return 0;
1116 }
1117 }
1118 InputStreamReader reader =
1119 new InputStreamReader(new NullInputStream());
1120 defaultJavaCharset = reader.getEncoding();
1121 if (defaultJavaCharset == null)
1122 defaultJavaCharset = "8859_1";
1123 }
1124 }
1125
1126 return defaultJavaCharset;
1127 }
1128
1129 /*
1130 * Get the default MIME charset for this locale.
1131 */
1132 static String getDefaultMIMECharset() {
1133 if (defaultMIMECharset == null) {
1134 defaultMIMECharset = SAAJUtil.getSystemProperty("mail.mime.charset");
1135 }
1136 if (defaultMIMECharset == null)
1137 defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
1138 return defaultMIMECharset;
1139 }
1140
1141 // Tables to map MIME charset names to Java names and vice versa.
1142 // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
1143 private static Hashtable mime2java;
1144 private static Hashtable java2mime;
1145
1146 static {
1147 java2mime = new Hashtable(40);
1148 mime2java = new Hashtable(10);
1149
1150 try {
1151 // Use this class's classloader to load the mapping file
1152 // XXX - we should use SecuritySupport, but it's in another package
1153 InputStream is =
1154 com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream(
1155 "/META-INF/javamail.charset.map");
1156
1157 if (is != null) {
1158 is = new LineInputStream(is);
1159
1160 // Load the JDK-to-MIME charset mapping table
1161 loadMappings((LineInputStream)is, java2mime);
1162
1163 // Load the MIME-to-JDK charset mapping table
1164 loadMappings((LineInputStream)is, mime2java);
1165 }
1166 } catch (Exception ex) { }
1167
1168 // If we didn't load the tables, e.g., because we didn't have
1169 // permission, load them manually. The entries here should be
1170 // the same as the default javamail.charset.map.
1171 if (java2mime.isEmpty()) {
1172 java2mime.put("8859_1", "ISO-8859-1");
1173 java2mime.put("iso8859_1", "ISO-8859-1");
1174 java2mime.put("ISO8859-1", "ISO-8859-1");
1175
1176 java2mime.put("8859_2", "ISO-8859-2");
1177 java2mime.put("iso8859_2", "ISO-8859-2");
1178 java2mime.put("ISO8859-2", "ISO-8859-2");
1179
1180 java2mime.put("8859_3", "ISO-8859-3");
1181 java2mime.put("iso8859_3", "ISO-8859-3");
1182 java2mime.put("ISO8859-3", "ISO-8859-3");
1183
1184 java2mime.put("8859_4", "ISO-8859-4");
1185 java2mime.put("iso8859_4", "ISO-8859-4");
1186 java2mime.put("ISO8859-4", "ISO-8859-4");
1187
1188 java2mime.put("8859_5", "ISO-8859-5");
1189 java2mime.put("iso8859_5", "ISO-8859-5");
1190 java2mime.put("ISO8859-5", "ISO-8859-5");
1191
1192 java2mime.put("8859_6", "ISO-8859-6");
1193 java2mime.put("iso8859_6", "ISO-8859-6");
1194 java2mime.put("ISO8859-6", "ISO-8859-6");
1195
1196 java2mime.put("8859_7", "ISO-8859-7");
1197 java2mime.put("iso8859_7", "ISO-8859-7");
1198 java2mime.put("ISO8859-7", "ISO-8859-7");
1199
1200 java2mime.put("8859_8", "ISO-8859-8");
1201 java2mime.put("iso8859_8", "ISO-8859-8");
1202 java2mime.put("ISO8859-8", "ISO-8859-8");
1203
1204 java2mime.put("8859_9", "ISO-8859-9");
1205 java2mime.put("iso8859_9", "ISO-8859-9");
1206 java2mime.put("ISO8859-9", "ISO-8859-9");
1207
1208 java2mime.put("SJIS", "Shift_JIS");
1209 java2mime.put("MS932", "Shift_JIS");
1210 java2mime.put("JIS", "ISO-2022-JP");
1211 java2mime.put("ISO2022JP", "ISO-2022-JP");
1212 java2mime.put("EUC_JP", "euc-jp");
1213 java2mime.put("KOI8_R", "koi8-r");
1214 java2mime.put("EUC_CN", "euc-cn");
1215 java2mime.put("EUC_TW", "euc-tw");
1216 java2mime.put("EUC_KR", "euc-kr");
1217 }
1218 if (mime2java.isEmpty()) {
1219 mime2java.put("iso-2022-cn", "ISO2022CN");
1220 mime2java.put("iso-2022-kr", "ISO2022KR");
1221 mime2java.put("utf-8", "UTF8");
1222 mime2java.put("utf8", "UTF8");
1223 mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
1224 mime2java.put("ja_jp.eucjp", "EUCJIS");
1225 mime2java.put("euc-kr", "KSC5601");
1226 mime2java.put("euckr", "KSC5601");
1227 mime2java.put("us-ascii", "ISO-8859-1");
1228 mime2java.put("x-us-ascii", "ISO-8859-1");
1229 }
1230 }
1231
1232 private static void loadMappings(LineInputStream is, Hashtable table) {
1233 String currLine;
1234
1235 while (true) {
1236 try {
1237 currLine = is.readLine();
1238 } catch (IOException ioex) {
1239 break; // error in reading, stop
1240 }
1241
1242 if (currLine == null) // end of file, stop
1243 break;
1244 if (currLine.startsWith("--") && currLine.endsWith("--"))
1245 // end of this table
1246 break;
1247
1248 // ignore empty lines and comments
1249 if (currLine.trim().length() == 0 || currLine.startsWith("#"))
1250 continue;
1251
1252 // A valid entry is of the form <key><separator><value>
1253 // where, <separator> := SPACE | HT. Parse this
1254 StringTokenizer tk = new StringTokenizer(currLine, " \t");
1255 try {
1256 String key = tk.nextToken();
1257 String value = tk.nextToken();
1258 table.put(key.toLowerCase(), value);
1259 } catch (NoSuchElementException nex) { }
1260 }
1261 }
1262
1263 static final int ALL_ASCII = 1;
1264 static final int MOSTLY_ASCII = 2;
1265 static final int MOSTLY_NONASCII = 3;
1266
1267 /**
1268 * Check if the given string contains non US-ASCII characters.
1269 * @param s string
1270 * @return ALL_ASCII if all characters in the string
1271 * belong to the US-ASCII charset. MOSTLY_ASCII
1272 * if more than half of the available characters
1273 * are US-ASCII characters. Else MOSTLY_NONASCII.
1274 */
1275 static int checkAscii(String s) {
1276 int ascii = 0, non_ascii = 0;
1277 int l = s.length();
1278
1279 for (int i = 0; i < l; i++) {
1280 if (nonascii((int)s.charAt(i))) // non-ascii
1281 non_ascii++;
1282 else
1283 ascii++;
1284 }
1285
1286 if (non_ascii == 0)
1287 return ALL_ASCII;
1288 if (ascii > non_ascii)
1289 return MOSTLY_ASCII;
1290
1291 return MOSTLY_NONASCII;
1292 }
1293
1294 /**
1295 * Check if the given byte array contains non US-ASCII characters.
1296 * @param b byte array
1297 * @return ALL_ASCII if all characters in the string
1298 * belong to the US-ASCII charset. MOSTLY_ASCII
1299 * if more than half of the available characters
1300 * are US-ASCII characters. Else MOSTLY_NONASCII.
1301 *
1302 * XXX - this method is no longer used
1303 */
1304 static int checkAscii(byte[] b) {
1305 int ascii = 0, non_ascii = 0;
1306
1307 for (int i=0; i < b.length; i++) {
1308 // The '&' operator automatically causes b[i] to be promoted
1309 // to an int, and we mask out the higher bytes in the int
1310 // so that the resulting value is not a negative integer.
1311 if (nonascii(b[i] & 0xff)) // non-ascii
1312 non_ascii++;
1313 else
1314 ascii++;
1315 }
1316
1317 if (non_ascii == 0)
1318 return ALL_ASCII;
1319 if (ascii > non_ascii)
1320 return MOSTLY_ASCII;
1321
1322 return MOSTLY_NONASCII;
1323 }
1324
1325 /**
1326 * Check if the given input stream contains non US-ASCII characters.
1327 * Upto <code>max</code> bytes are checked. If <code>max</code> is
1328 * set to <code>ALL</code>, then all the bytes available in this
1329 * input stream are checked. If <code>breakOnNonAscii</code> is true
1330 * the check terminates when the first non-US-ASCII character is
1331 * found and MOSTLY_NONASCII is returned. Else, the check continues
1332 * till <code>max</code> bytes or till the end of stream.
1333 *
1334 * @param is the input stream
1335 * @param max maximum bytes to check for. The special value
1336 * ALL indicates that all the bytes in this input
1337 * stream must be checked.
1338 * @param breakOnNonAscii if <code>true</code>, then terminate the
1339 * the check when the first non-US-ASCII character
1340 * is found.
1341 * @return ALL_ASCII if all characters in the string
1342 * belong to the US-ASCII charset. MOSTLY_ASCII
1343 * if more than half of the available characters
1344 * are US-ASCII characters. Else MOSTLY_NONASCII.
1345 */
1346 static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
1347 int ascii = 0, non_ascii = 0;
1348 int len;
1349 int block = 4096;
1350 int linelen = 0;
1351 boolean longLine = false, badEOL = false;
1352 boolean checkEOL = encodeEolStrict && breakOnNonAscii;
1353 byte buf[] = null;
1354 if (max != 0) {
1355 block = (max == ALL) ? 4096 : Math.min(max, 4096);
1356 buf = new byte[block];
1357 }
1358 while (max != 0) {
1359 try {
1360 if ((len = is.read(buf, 0, block)) == -1)
1361 break;
1362 int lastb = 0;
1363 for (int i = 0; i < len; i++) {
1364 // The '&' operator automatically causes b[i] to
1365 // be promoted to an int, and we mask out the higher
1366 // bytes in the int so that the resulting value is
1367 // not a negative integer.
1368 int b = buf[i] & 0xff;
1369 if (checkEOL &&
1370 ((lastb == '\r' && b != '\n') ||
1371 (lastb != '\r' && b == '\n')))
1372 badEOL = true;
1373 if (b == '\r' || b == '\n')
1374 linelen = 0;
1375 else {
1376 linelen++;
1377 if (linelen > 998) // 1000 - CRLF
1378 longLine = true;
1379 }
1380 if (nonascii(b)) { // non-ascii
1381 if (breakOnNonAscii) // we are done
1382 return MOSTLY_NONASCII;
1383 else
1384 non_ascii++;
1385 } else
1386 ascii++;
1387 lastb = b;
1388 }
1389 } catch (IOException ioex) {
1390 break;
1391 }
1392 if (max != ALL)
1393 max -= len;
1394 }
1395
1396 if (max == 0 && breakOnNonAscii)
1397 // We have been told to break on the first non-ascii character.
1398 // We haven't got any non-ascii character yet, but then we
1399 // have not checked all of the available bytes either. So we
1400 // cannot say for sure that this input stream is ALL_ASCII,
1401 // and hence we must play safe and return MOSTLY_NONASCII
1402
1403 return MOSTLY_NONASCII;
1404
1405 if (non_ascii == 0) { // no non-us-ascii characters so far
1406 // If we're looking at non-text data, and we saw CR without LF
1407 // or vice versa, consider this mostly non-ASCII so that it
1408 // will be base64 encoded (since the quoted-printable encoder
1409 // doesn't encode this case properly).
1410 if (badEOL)
1411 return MOSTLY_NONASCII;
1412 // if we've seen a long line, we degrade to mostly ascii
1413 else if (longLine)
1414 return MOSTLY_ASCII;
1415 else
1416 return ALL_ASCII;
1417 }
1418 if (ascii > non_ascii) // mostly ascii
1419 return MOSTLY_ASCII;
1420 return MOSTLY_NONASCII;
1421 }
1422
1423 static final boolean nonascii(int b) {
1424 return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
1425 }
1426 }
1427
1428 /**
1429 * An OutputStream that determines whether the data written to
1430 * it is all ASCII, mostly ASCII, or mostly non-ASCII.
1431 */
1432 class AsciiOutputStream extends OutputStream {
1433 private boolean breakOnNonAscii;
1434 private int ascii = 0, non_ascii = 0;
1435 private int linelen = 0;
1436 private boolean longLine = false;
1437 private boolean badEOL = false;
1438 private boolean checkEOL = false;
1439 private int lastb = 0;
1440 private int ret = 0;
1441
1442 public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
1443 this.breakOnNonAscii = breakOnNonAscii;
1444 checkEOL = encodeEolStrict && breakOnNonAscii;
1445 }
1446
1447 public void write(int b) throws IOException {
1448 check(b);
1449 }
1450
1451 public void write(byte b[]) throws IOException {
1452 write(b, 0, b.length);
1453 }
1454
1455 public void write(byte b[], int off, int len) throws IOException {
1456 len += off;
1457 for (int i = off; i < len ; i++)
1458 check(b[i]);
1459 }
1460
1461 private final void check(int b) throws IOException {
1462 b &= 0xff;
1463 if (checkEOL &&
1464 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
1465 badEOL = true;
1466 if (b == '\r' || b == '\n')
1467 linelen = 0;
1468 else {
1469 linelen++;
1470 if (linelen > 998) // 1000 - CRLF
1471 longLine = true;
1472 }
1473 if (MimeUtility.nonascii(b)) { // non-ascii
1474 non_ascii++;
1475 if (breakOnNonAscii) { // we are done
1476 ret = MimeUtility.MOSTLY_NONASCII;
1477 throw new EOFException();
1478 }
1479 } else
1480 ascii++;
1481 lastb = b;
1482 }
1483
1484 /**
1485 * Return ASCII-ness of data stream.
1486 */
1487 public int getAscii() {
1488 if (ret != 0)
1489 return ret;
1490 // If we're looking at non-text data, and we saw CR without LF
1491 // or vice versa, consider this mostly non-ASCII so that it
1492 // will be base64 encoded (since the quoted-printable encoder
1493 // doesn't encode this case properly).
1494 if (badEOL)
1495 return MimeUtility.MOSTLY_NONASCII;
1496 else if (non_ascii == 0) { // no non-us-ascii characters so far
1497 // if we've seen a long line, we degrade to mostly ascii
1498 if (longLine)
1499 return MimeUtility.MOSTLY_ASCII;
1500 else
1501 return MimeUtility.ALL_ASCII;
1502 }
1503 if (ascii > non_ascii) // mostly ascii
1504 return MimeUtility.MOSTLY_ASCII;
1505 return MimeUtility.MOSTLY_NONASCII;
1506 }
1507 }

mercurial