src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlReader.java

changeset 0
373ffda63c9a
child 637
9c07ef4934dd
equal deleted inserted replaced
-1:000000000000 0:373ffda63c9a
1 /*
2 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package com.sun.xml.internal.dtdparser;
27
28 import java.io.ByteArrayInputStream;
29 import java.io.CharConversionException;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.io.InputStreamReader;
33 import java.io.PushbackInputStream;
34 import java.io.Reader;
35 import java.util.Hashtable;
36
37
38 // NOTE: Add I18N support to this class when JDK gets the ability to
39 // defer selection of locale for exception messages ... use the same
40 // technique for both.
41
42
/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <p>Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * <p>Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
// package private
final class XmlReader extends Reader {
    /** Size of the pushback buffer, and the most bytes scanned for an XML declaration. */
    private static final int MAXPUSHBACK = 512;

    /** Reader all I/O is delegated to; set to null by {@link #close()}. */
    private Reader in;
    /** Encoding name chosen by autodetection or by the encoding declaration. */
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got to do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering.  Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @return a reader decoding {@code in} with the detected encoding
     * @throws IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        return new XmlReader(in);
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in       the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *                 if null, autodetection is used.
     * @return a reader decoding {@code in} with the requested encoding
     * @throws IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
            throws IOException {
        if (encoding == null) {
            return new XmlReader(in);
        }
        if ("UTF-8".equalsIgnoreCase(encoding)
                || "UTF8".equalsIgnoreCase(encoding)) {
            return new Utf8Reader(in);
        }
        if ("US-ASCII".equalsIgnoreCase(encoding)
                || "ASCII".equalsIgnoreCase(encoding)) {
            return new AsciiReader(in);
        }
        if ("ISO-8859-1".equalsIgnoreCase(encoding)
                // plus numerous aliases ...
                ) {
            return new Iso8859_1Reader(in);
        }

        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example a property
        // file resource, "readers/mapping.props", holding and a set
        // of readers in that (sub)package... defaulting to this call
        // only if no better choice is available.
        //
        return new InputStreamReader(in, std2java(encoding));
    }

    //
    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).
    //
    private static final Hashtable<String, String> charsets =
            new Hashtable<String, String>(31);

    static {
        charsets.put("UTF-16", "Unicode");
        charsets.put("ISO-10646-UCS-2", "Unicode");

        // NOTE: no support for ISO-10646-UCS-4 yet.

        charsets.put("EBCDIC-CP-US", "cp037");
        charsets.put("EBCDIC-CP-CA", "cp037");
        charsets.put("EBCDIC-CP-NL", "cp037");
        charsets.put("EBCDIC-CP-WT", "cp037");

        charsets.put("EBCDIC-CP-DK", "cp277");
        charsets.put("EBCDIC-CP-NO", "cp277");
        charsets.put("EBCDIC-CP-FI", "cp278");
        charsets.put("EBCDIC-CP-SE", "cp278");

        charsets.put("EBCDIC-CP-IT", "cp280");
        charsets.put("EBCDIC-CP-ES", "cp284");
        charsets.put("EBCDIC-CP-GB", "cp285");
        charsets.put("EBCDIC-CP-FR", "cp297");

        charsets.put("EBCDIC-CP-AR1", "cp420");
        charsets.put("EBCDIC-CP-HE", "cp424");
        charsets.put("EBCDIC-CP-BE", "cp500");
        charsets.put("EBCDIC-CP-CH", "cp500");

        charsets.put("EBCDIC-CP-ROECE", "cp870");
        charsets.put("EBCDIC-CP-YU", "cp870");
        charsets.put("EBCDIC-CP-IS", "cp871");
        charsets.put("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR        --> CP423
        //  EBCDIC-CP-TR        --> CP905
    }

    /**
     * Maps a standard encoding name to the Java name registered in
     * {@link #charsets}; names without an entry are returned unchanged.
     * The lookup upper-cases with a fixed locale so that it does not
     * depend on the default locale (e.g. the Turkish dotted capital I).
     */
    private static String std2java(String encoding) {
        String mapped = charsets.get(encoding.toUpperCase(java.util.Locale.ENGLISH));
        return mapped != null ? mapped : encoding;
    }

    /**
     * Returns the name of the encoding in use, as detected from the
     * leading bytes or as written in the encoding declaration.
     */
    public String getEncoding() {
        return assignedEncoding;
    }

    /**
     * Peeks at the first four bytes of the stream, pushes them back, and
     * selects a delegate reader: directly for UTF-16 patterns, via the
     * encoding declaration for ASCII/EBCDIC "&lt;?xm" patterns, and UTF-8
     * otherwise.
     */
    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        PushbackInputStream pb;
        if (stream instanceof PushbackInputStream) {
            pb = (PushbackInputStream) stream;
        } else {
            pb = new PushbackInputStream(stream, MAXPUSHBACK);
        }

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.  A single
        // read() may legally return fewer bytes than requested, so
        // keep reading until four bytes are available or EOF is hit.
        //
        byte[] buf = new byte[4];
        int len = 0;
        while (len < buf.length) {
            int n = pb.read(buf, len, buf.length - len);
            if (n <= 0) {
                break;
            }
            len += n;
        }
        if (len > 0) {
            pb.unread(buf, 0, len);
        }

        if (len == 4) {
            switch (buf[0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == illegal UTF-16 big-endian
                if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                    setEncoding(pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<':      // 0x3c: the most common cases!
                switch (buf[1] & 0x0ff) {
                // First character is '<'; could be XML without
                // an XML directive such as "<hello>", "<!-- ...",
                // and so on.
                default:
                    break;

                // 3c 00 3f 00 == illegal UTF-16 little endian
                case 0x00:
                    if (buf[2] == 0x3f && buf[3] == 0x00) {
                        setEncoding(pb, "UnicodeLittle");
                        return;
                    }
                    // else probably UCS-4
                    break;

                // 3c 3f 78 6d == ASCII and supersets '<?xm'
                case '?':
                    if (buf[2] != 'x' || buf[3] != 'm') {
                        break;
                    }
                    //
                    // One of several encodings could be used:
                    // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                    //
                    useEncodingDecl(pb, "UTF8");
                    return;
                }
                break;

            // 4c 6f a7 94 ... some EBCDIC code page
            case 0x4c:
                if (buf[1] == 0x6f
                        && (0x0ff & buf[2]) == 0x0a7
                        && (0x0ff & buf[3]) == 0x094) {
                    useEncodingDecl(pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

            // UTF-16 big-endian
            case 0xfe:
                if ((buf[1] & 0x0ff) != 0xff) {
                    break;
                }
                setEncoding(pb, "UTF-16");
                return;

            // UTF-16 little-endian
            case 0xff:
                if ((buf[1] & 0x0ff) != 0xfe) {
                    break;
                }
                setEncoding(pb, "UTF-16");
                return;

            // default ... no XML declaration
            default:
                break;
            }
        }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding)
            throws IOException {
        byte[] buffer = new byte[MAXPUSHBACK];

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        int len = pb.read(buffer, 0, buffer.length);
        if (len > 0) {
            pb.unread(buffer, 0, len);
        }
        if (len < 4) {
            // Not expected (the caller pushed back four bytes), but
            // never index past what was actually read.
            setEncoding(pb, "UTF-8");
            return;
        }
        // The third argument is a length, not an end offset: expose
        // only the bytes that were really read after "<?xm".
        Reader r = new InputStreamReader(
                new ByteArrayInputStream(buffer, 4, len - 4), encoding);

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        int c;
        if ((c = r.read()) != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //  S version="..."     [or single quotes]
        // bit and get any subsequent
        //  S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuffer buf = new StringBuffer();
        StringBuffer keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        XmlDecl:
        for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read()) == -1) {
                break;
            }

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                continue;
            }

            // ... but require at least a little!
            if (i == 0) {
                break;
            }

            // terminate the loop ASAP
            if (c == '?') {
                sawQuestion = true;
            } else if (sawQuestion) {
                if (c == '>') {
                    break;
                }
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace((char) c)) {
                        continue;
                    }
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null) {
                        key = keyBuf.toString();
                    }
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else {
                    keyBuf.append((char) c);
                }
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c)) {
                continue;
            }
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    // opening quote: start collecting the value
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    if ("encoding".equals(key)) {
                        assignedEncoding = buf.toString();

                        // [81] Encname ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
                        for (int k = 0; k < assignedEncoding.length(); k++) {
                            char ch = assignedEncoding.charAt(k);
                            if ((ch >= 'A' && ch <= 'Z')
                                    || (ch >= 'a' && ch <= 'z')) {
                                continue;
                            }
                            if (k == 0) {
                                break XmlDecl;
                            }
                            if (ch == '-'
                                    || (ch >= '0' && ch <= '9')
                                    || ch == '.' || ch == '_') {
                                continue;
                            }
                            // map illegal names to UTF-8 default
                            break XmlDecl;
                        }

                        setEncoding(pb, assignedEncoding);
                        return;

                    } else {
                        // some other pseudo-attribute (e.g. version)
                        key = null;
                        continue;
                    }
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }

    /** Records the encoding name and creates the delegate reader for it. */
    private void setEncoding(InputStream stream, String encoding)
            throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }

    /**
     * Reads characters into the buffer and returns how many were read,
     * or -1 on EOF.  The reader closes itself when EOF is reached, and
     * returns -1 on every call after it has been closed.
     */
    @Override
    public int read(char[] buf, int off, int len) throws IOException {
        if (closed) {
            return -1;  // throw new IOException ("closed");
        }
        int val = in.read(buf, off, len);
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Reads a single character, or returns -1 on EOF (which also closes
     * this reader).
     *
     * @throws IOException if the reader is already closed, or on I/O error
     */
    @Override
    public int read() throws IOException {
        if (closed) {
            throw new IOException("closed");
        }
        int val = in.read();
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    @Override
    public boolean markSupported() {
        return in == null ? false : in.markSupported();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.
     *
     * @param value how many characters may be "peeked".
     */
    @Override
    public void mark(int value) throws IOException {
        if (in != null) {
            in.mark(value);
        }
    }

    /**
     * Resets the current position to the last marked position.
     */
    @Override
    public void reset() throws IOException {
        if (in != null) {
            in.reset();
        }
    }

    /**
     * Skips a specified number of characters.
     */
    @Override
    public long skip(long value) throws IOException {
        return in == null ? 0 : in.skip(value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    @Override
    public boolean ready() throws IOException {
        return in == null ? false : in.ready();
    }

    /**
     * Closes the reader and its delegate; later calls do nothing.
     */
    @Override
    public void close() throws IOException {
        if (closed) {
            return;
        }
        in.close();
        in = null;
        closed = true;
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    static abstract class BaseReader extends Reader {
        /** Byte source; null once the reader has been closed. */
        protected InputStream instream;
        /** Undecoded bytes; valid data lies in [start, finish). */
        protected byte[] buffer;
        protected int start, finish;

        BaseReader(InputStream stream) {
            super(stream);

            instream = stream;
            buffer = new byte[8192];
        }

        /**
         * Returns true when the reader is closed, when undecoded bytes
         * are buffered, or when the stream reports available bytes.
         */
        @Override
        public boolean ready() throws IOException {
            return instream == null
                    || (finish - start) > 0
                    || instream.available() != 0;
        }

        // caller shouldn't read again
        @Override
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // 2nd half of UTF-8 surrogate pair
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super(stream);
        }

        /**
         * Decodes UTF-8 bytes into UTF-16 chars.
         *
         * @return the number of chars stored, 0 if {@code len <= 0},
         *         or -1 if EOF was hit before any char was stored
         * @throws CharConversionException on malformed, truncated, or
         *         out-of-range UTF-8 sequences
         */
        @Override
        public int read(char[] buf, int offset, int len) throws IOException {
            int i = 0;
            boolean eof = false;

            if (len <= 0) {
                return 0;
            }

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        eof = true;
                        break;
                    }
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        eof = true;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                int lead = buffer[start] & 0x0ff;
                if ((lead & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) lead;
                    continue;
                }

                //
                // Multibyte chars -- the lead byte fixes the length.
                //
                int need;
                if ((lead & 0x0E0) == 0x0C0) {
                    need = 2;
                } else if ((lead & 0x0F0) == 0x0E0) {
                    need = 3;
                } else if ((lead & 0x0F8) == 0x0F0) {
                    need = 4;
                } else {
                    // 5 and 6 byte versions are XML WF errors, but
                    // typically come from mislabeled encodings
                    throw new CharConversionException("Unconvertible UTF-8 character"
                            + " beginning with 0x"
                            + Integer.toHexString(lead));
                }

                //
                // if the buffer held only a partial character,
                // compact it and try to read the rest of the
                // character.  worst case involves three
                // single-byte reads -- quite rare.  This must happen
                // BEFORE decoding, so stale bytes beyond 'finish'
                // are never interpreted as part of the character.
                //
                if (start + need > finish) {
                    System.arraycopy(buffer, start,
                            buffer, 0, finish - start);
                    finish -= start;
                    start = 0;
                    int more = instream.read(buffer, finish,
                            buffer.length - finish);
                    if (more < 0) {
                        this.close();
                        throw new CharConversionException("Partial UTF-8 char");
                    }
                    finish += more;
                    continue;
                }

                int c;
                char low = 0;
                if (need == 2) {
                    // 0x0080 <= c <= 0x07ff
                    c = (lead & 0x1f) << 6;
                    c += buffer[start + 1] & 0x3f;
                } else if (need == 3) {
                    // 0x0800 <= c <= 0xffff
                    c = (lead & 0x0f) << 12;
                    c += (buffer[start + 1] & 0x3f) << 6;
                    c += buffer[start + 2] & 0x3f;
                } else {
                    // 0x0001 0000  <= c  <= 0x001f ffff
                    c = (lead & 0x07) << 18;
                    c += (buffer[start + 1] & 0x3f) << 12;
                    c += (buffer[start + 2] & 0x3f) << 6;
                    c += buffer[start + 3] & 0x3f;

                    // Unicode supports c <= 0x0010 ffff ...
                    if (c > 0x0010ffff) {
                        throw new CharConversionException("UTF-8 encoding of character 0x00"
                                + Integer.toHexString(c)
                                + " can't be converted to Unicode.");
                    }

                    // Convert UCS-4 char to surrogate pair (UTF-16)
                    c -= 0x10000;
                    low = (char) (0xDC00 + (c & 0x03ff));
                    c = 0xD800 + (c >> 10);
                }

                //
                // check the format of the non-initial bytes
                //
                int end = start + need;
                for (start++; start < end; start++) {
                    if ((buffer[start] & 0xC0) != 0x80) {
                        this.close();
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }
                }

                //
                // If this needed a surrogate pair, consume ASAP;
                // otherwise hold the low half for the next call.
                //
                buf[offset + i++] = (char) c;
                if (low != 0) {
                    if (i < len) {
                        buf[offset + i++] = low;
                    } else {
                        nextChar = low;
                    }
                }
            }
            if (i > 0) {
                return i;
            }
            return eof ? -1 : 0;
        }
    }

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader {
        AsciiReader(InputStream in) {
            super(in);
        }

        /**
         * Decodes 7-bit ASCII bytes.
         *
         * @return the number of chars stored, 0 if {@code len <= 0},
         *         or -1 at EOF or after close
         * @throws CharConversionException if a byte has its high bit set
         */
        @Override
        public int read(char[] buf, int offset, int len) throws IOException {
            // A zero-length request is not EOF (matches Utf8Reader).
            if (len <= 0) {
                return 0;
            }
            if (instream == null) {
                return -1;
            }

            int i;
            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                int c = buffer[start++];
                if ((c & 0x80) != 0) {
                    throw new CharConversionException("Illegal ASCII character, 0x"
                            + Integer.toHexString(c & 0xff));
                }
                buf[offset + i] = (char) c;
            }
            if (i == 0 && finish <= 0) {
                return -1;
            }
            return i;
        }
    }

    static final class Iso8859_1Reader extends BaseReader {
        Iso8859_1Reader(InputStream in) {
            super(in);
        }

        /**
         * Decodes ISO-8859-1 bytes by widening each unsigned byte to a char.
         *
         * @return the number of chars stored, 0 if {@code len <= 0},
         *         or -1 at EOF or after close
         */
        @Override
        public int read(char[] buf, int offset, int len) throws IOException {
            // A zero-length request is not EOF (matches Utf8Reader).
            if (len <= 0) {
                return 0;
            }
            if (instream == null) {
                return -1;
            }

            int i;
            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                buf[offset + i] = (char) (0x0ff & buffer[start++]);
            }
            if (i == 0 && finish <= 0) {
                return -1;
            }
            return i;
        }
    }
}

mercurial