aoqi@0: /* aoqi@0: * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved. aoqi@0: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. aoqi@0: * aoqi@0: * This code is free software; you can redistribute it and/or modify it aoqi@0: * under the terms of the GNU General Public License version 2 only, as aoqi@0: * published by the Free Software Foundation. Oracle designates this aoqi@0: * particular file as subject to the "Classpath" exception as provided aoqi@0: * by Oracle in the LICENSE file that accompanied this code. aoqi@0: * aoqi@0: * This code is distributed in the hope that it will be useful, but WITHOUT aoqi@0: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or aoqi@0: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License aoqi@0: * version 2 for more details (a copy is included in the LICENSE file that aoqi@0: * accompanied this code). aoqi@0: * aoqi@0: * You should have received a copy of the GNU General Public License version aoqi@0: * 2 along with this work; if not, write to the Free Software Foundation, aoqi@0: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. aoqi@0: * aoqi@0: * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA aoqi@0: * or visit www.oracle.com if you need additional information or have any aoqi@0: * questions. aoqi@0: */ aoqi@0: aoqi@0: package com.sun.xml.internal.dtdparser; aoqi@0: aoqi@0: import java.io.ByteArrayInputStream; aoqi@0: import java.io.CharConversionException; aoqi@0: import java.io.IOException; aoqi@0: import java.io.InputStream; aoqi@0: import java.io.InputStreamReader; aoqi@0: import java.io.PushbackInputStream; aoqi@0: import java.io.Reader; aoqi@0: import java.util.Hashtable; aoqi@0: aoqi@0: aoqi@0: // NOTE: Add I18N support to this class when JDK gets the ability to aoqi@0: // defer selection of locale for exception messages ... use the same aoqi@0: // technique for both. 
aoqi@0: aoqi@0: aoqi@0: /** aoqi@0: * This handles several XML-related tasks that normal java.io Readers aoqi@0: * don't support, including use of IETF standard encoding names and aoqi@0: * automatic detection of most XML encodings. The former is needed aoqi@0: * for interoperability; the latter is needed to conform with the XML aoqi@0: * spec. This class also optimizes reading some common encodings by aoqi@0: * providing low-overhead unsynchronized Reader support. aoqi@0: *
aoqi@0: * <p>Note that the autodetection facility should be used only on aoqi@0: * data streams which have an unknown character encoding. For example, aoqi@0: * it should never be used on MIME text/xml entities. aoqi@0: *
aoqi@0: * <p>Note that XML processors are only required to support UTF-8 and
aoqi@0: * UTF-16 character encodings. Autodetection permits the underlying Java
aoqi@0: * implementation to provide support for many other encodings, such as
aoqi@0: * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
aoqi@0: *
aoqi@0: * @author David Brownell
aoqi@0: * @author Janet Koenig
aoqi@0: * @version 1.3 00/02/24
aoqi@0: */
aoqi@0: // package private
aoqi@0: final class XmlReader extends Reader {
aoqi@0: private static final int MAXPUSHBACK = 512;
aoqi@0:
aoqi@0: private Reader in;
aoqi@0: private String assignedEncoding;
aoqi@0: private boolean closed;
aoqi@0:
aoqi@0: //
aoqi@0: // This class always delegates I/O to a reader, which gets
aoqi@0: // its data from the very beginning of the XML text. It needs
aoqi@0: // to use a pushback stream since (a) autodetection can read
aoqi@0: // partial UTF-8 characters which need to be fully processed,
aoqi@0: // (b) the "Unicode" readers swallow characters that they think
aoqi@0: // are byte order marks, so tests fail if they don't see the
aoqi@0: // real byte order mark.
aoqi@0: //
aoqi@0: // It has to do this efficiently: character I/O is solidly on the
aoqi@0: // critical path. (So keep buffer length over 2 Kbytes to avoid
aoqi@0: // excess buffering. Many URL handlers stuff a BufferedInputStream
aoqi@0: // between here and the real data source, and larger buffers keep
aoqi@0: // that from slowing you down.)
aoqi@0: //
aoqi@0:
aoqi@0: /**
aoqi@0: * Constructs the reader from an input stream, autodetecting
aoqi@0: * the encoding to use according to the heuristic specified
aoqi@0: * in the XML 1.0 recommendation.
aoqi@0: *
aoqi@0: * @param in the input stream from which the reader is constructed
aoqi@0: * @throws IOException on error, such as unrecognized encoding
aoqi@0: */
aoqi@0: public static Reader createReader(InputStream in) throws IOException {
aoqi@0: return new XmlReader(in);
aoqi@0: }
aoqi@0:
aoqi@0: /**
aoqi@0: * Creates a reader supporting the given encoding, mapping
aoqi@0: * from standard encoding names to ones that understood by
aoqi@0: * Java where necessary.
aoqi@0: *
aoqi@0: * @param in the input stream from which the reader is constructed
aoqi@0: * @param encoding the IETF standard name of the encoding to use;
aoqi@0: * if null, autodetection is used.
aoqi@0: * @throws IOException on error, including unrecognized encoding
aoqi@0: */
aoqi@0: public static Reader createReader(InputStream in, String encoding)
aoqi@0: throws IOException {
aoqi@0: if (encoding == null)
aoqi@0: return new XmlReader(in);
aoqi@0: if ("UTF-8".equalsIgnoreCase(encoding)
aoqi@0: || "UTF8".equalsIgnoreCase(encoding))
aoqi@0: return new Utf8Reader(in);
aoqi@0: if ("US-ASCII".equalsIgnoreCase(encoding)
aoqi@0: || "ASCII".equalsIgnoreCase(encoding))
aoqi@0: return new AsciiReader(in);
aoqi@0: if ("ISO-8859-1".equalsIgnoreCase(encoding)
aoqi@0: // plus numerous aliases ...
aoqi@0: )
aoqi@0: return new Iso8859_1Reader(in);
aoqi@0:
aoqi@0: //
aoqi@0: // What we really want is an administerable resource mapping
aoqi@0: // encoding names/aliases to classnames. For example a property
aoqi@0: // file resource, "readers/mapping.props", holding and a set
aoqi@0: // of readers in that (sub)package... defaulting to this call
aoqi@0: // only if no better choice is available.
aoqi@0: //
aoqi@0: return new InputStreamReader(in, std2java(encoding));
aoqi@0: }
aoqi@0:
//
// The JDK doesn't know all of the standard encoding names, and in
// particular none of the EBCDIC ones that IANA defines (and which
// IBM encourages). This table maps such standard names, spelled in
// upper case, to the names the JDK uses for them.
//
private static final Hashtable charsets = new Hashtable(31);

static {
    // Each row is { standard name, JDK name }.
    final String[][] aliases = {
        {"UTF-16", "Unicode"},
        {"ISO-10646-UCS-2", "Unicode"},

        // NOTE: no support for ISO-10646-UCS-4 yet.

        {"EBCDIC-CP-US", "cp037"},
        {"EBCDIC-CP-CA", "cp037"},
        {"EBCDIC-CP-NL", "cp037"},
        {"EBCDIC-CP-WT", "cp037"},

        {"EBCDIC-CP-DK", "cp277"},
        {"EBCDIC-CP-NO", "cp277"},
        {"EBCDIC-CP-FI", "cp278"},
        {"EBCDIC-CP-SE", "cp278"},

        {"EBCDIC-CP-IT", "cp280"},
        {"EBCDIC-CP-ES", "cp284"},
        {"EBCDIC-CP-GB", "cp285"},
        {"EBCDIC-CP-FR", "cp297"},

        {"EBCDIC-CP-AR1", "cp420"},
        {"EBCDIC-CP-HE", "cp424"},
        {"EBCDIC-CP-BE", "cp500"},
        {"EBCDIC-CP-CH", "cp500"},

        {"EBCDIC-CP-ROECE", "cp870"},
        {"EBCDIC-CP-YU", "cp870"},
        {"EBCDIC-CP-IS", "cp871"},
        {"EBCDIC-CP-AR2", "cp918"},

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR --> CP423
        //  EBCDIC-CP-TR --> CP905
    };

    for (int i = 0; i < aliases.length; i++) {
        charsets.put(aliases[i][0], aliases[i][1]);
    }
}
aoqi@0:
aoqi@0: // returns an encoding name supported by JDK >= 1.1.6
aoqi@0: // for some cases required by the XML spec
aoqi@0: private static String std2java(String encoding) {
aoqi@0: String temp = encoding.toUpperCase();
aoqi@0: temp = (String) charsets.get(temp);
aoqi@0: return temp != null ? temp : encoding;
aoqi@0: }
aoqi@0:
aoqi@0: /**
aoqi@0: * Returns the standard name of the encoding in use
aoqi@0: */
aoqi@0: public String getEncoding() {
aoqi@0: return assignedEncoding;
aoqi@0: }
aoqi@0:
aoqi@0: private XmlReader(InputStream stream) throws IOException {
aoqi@0: super(stream);
aoqi@0:
aoqi@0: PushbackInputStream pb;
aoqi@0: byte buf [];
aoqi@0: int len;
aoqi@0:
aoqi@0: if (stream instanceof PushbackInputStream)
aoqi@0: pb = (PushbackInputStream) stream;
aoqi@0: else
aoqi@0: pb = new PushbackInputStream(stream, MAXPUSHBACK);
aoqi@0:
aoqi@0: //
aoqi@0: // See if we can figure out the character encoding used
aoqi@0: // in this file by peeking at the first few bytes.
aoqi@0: //
aoqi@0: buf = new byte[4];
aoqi@0: len = pb.read(buf);
aoqi@0: if (len > 0)
aoqi@0: pb.unread(buf, 0, len);
aoqi@0:
aoqi@0: if (len == 4)
aoqi@0: switch (buf[0] & 0x0ff) {
aoqi@0: case 0:
aoqi@0: // 00 3c 00 3f == illegal UTF-16 big-endian
aoqi@0: if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
aoqi@0: setEncoding(pb, "UnicodeBig");
aoqi@0: return;
aoqi@0: }
aoqi@0: // else it's probably UCS-4
aoqi@0: break;
aoqi@0:
aoqi@0: case '<': // 0x3c: the most common cases!
aoqi@0: switch (buf[1] & 0x0ff) {
aoqi@0: // First character is '<'; could be XML without
aoqi@0: // an XML directive such as "