ohair@286: /* mkos@397: * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved. ohair@286: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ohair@286: * ohair@286: * This code is free software; you can redistribute it and/or modify it ohair@286: * under the terms of the GNU General Public License version 2 only, as ohair@286: * published by the Free Software Foundation. Oracle designates this ohair@286: * particular file as subject to the "Classpath" exception as provided ohair@286: * by Oracle in the LICENSE file that accompanied this code. ohair@286: * ohair@286: * This code is distributed in the hope that it will be useful, but WITHOUT ohair@286: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ohair@286: * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ohair@286: * version 2 for more details (a copy is included in the LICENSE file that ohair@286: * accompanied this code). ohair@286: * ohair@286: * You should have received a copy of the GNU General Public License version ohair@286: * 2 along with this work; if not, write to the Free Software Foundation, ohair@286: * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ohair@286: * ohair@286: * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ohair@286: * or visit www.oracle.com if you need additional information or have any ohair@286: * questions. 
ohair@286: */ ohair@286: ohair@286: package com.sun.xml.internal.dtdparser; ohair@286: ohair@286: import java.io.ByteArrayInputStream; ohair@286: import java.io.CharConversionException; ohair@286: import java.io.IOException; ohair@286: import java.io.InputStream; ohair@286: import java.io.InputStreamReader; ohair@286: import java.io.PushbackInputStream; ohair@286: import java.io.Reader; ohair@286: import java.util.Hashtable; ohair@286: ohair@286: ohair@286: // NOTE: Add I18N support to this class when JDK gets the ability to ohair@286: // defer selection of locale for exception messages ... use the same ohair@286: // technique for both. ohair@286: ohair@286: ohair@286: /** ohair@286: * This handles several XML-related tasks that normal java.io Readers ohair@286: * don't support, including use of IETF standard encoding names and ohair@286: * automatic detection of most XML encodings. The former is needed ohair@286: * for interoperability; the latter is needed to conform with the XML ohair@286: * spec. This class also optimizes reading some common encodings by ohair@286: * providing low-overhead unsynchronized Reader support. ohair@286: *

ohair@286: *

Note that the autodetection facility should be used only on ohair@286: * data streams which have an unknown character encoding. For example, ohair@286: * it should never be used on MIME text/xml entities. ohair@286: *

ohair@286: *

Note that XML processors are only required to support UTF-8 and ohair@286: * UTF-16 character encodings. Autodetection permits the underlying Java ohair@286: * implementation to provide support for many other encodings, such as ohair@286: * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP. ohair@286: * ohair@286: * @author David Brownell ohair@286: * @author Janet Koenig ohair@286: * @version 1.3 00/02/24 ohair@286: */ ohair@286: // package private ohair@286: final class XmlReader extends Reader { ohair@286: private static final int MAXPUSHBACK = 512; ohair@286: ohair@286: private Reader in; ohair@286: private String assignedEncoding; ohair@286: private boolean closed; ohair@286: ohair@286: // ohair@286: // This class always delegates I/O to a reader, which gets ohair@286: // its data from the very beginning of the XML text. It needs ohair@286: // to use a pushback stream since (a) autodetection can read ohair@286: // partial UTF-8 characters which need to be fully processed, ohair@286: // (b) the "Unicode" readers swallow characters that they think ohair@286: // are byte order marks, so tests fail if they don't see the ohair@286: // real byte order mark. ohair@286: // ohair@286: // It's got to do this efficiently: character I/O is solidly on the ohair@286: // critical path. (So keep buffer length over 2 Kbytes to avoid ohair@286: // excess buffering. Many URL handlers stuff a BufferedInputStream ohair@286: // between here and the real data source, and larger buffers keep ohair@286: // that from slowing you down.) ohair@286: // ohair@286: ohair@286: /** mkos@397: * Constructs the reader from an input stream, autodetecting ohair@286: * the encoding to use according to the heuristic specified ohair@286: * in the XML 1.0 recommendation. 
ohair@286: * ohair@286: * @param in the input stream from which the reader is constructed ohair@286: * @throws IOException on error, such as unrecognized encoding ohair@286: */ ohair@286: public static Reader createReader(InputStream in) throws IOException { ohair@286: return new XmlReader(in); ohair@286: } ohair@286: ohair@286: /** ohair@286: * Creates a reader supporting the given encoding, mapping ohair@286: * from standard encoding names to ones that understood by ohair@286: * Java where necessary. ohair@286: * ohair@286: * @param in the input stream from which the reader is constructed ohair@286: * @param encoding the IETF standard name of the encoding to use; mkos@397: * if null, autodetection is used. ohair@286: * @throws IOException on error, including unrecognized encoding ohair@286: */ ohair@286: public static Reader createReader(InputStream in, String encoding) ohair@286: throws IOException { ohair@286: if (encoding == null) ohair@286: return new XmlReader(in); ohair@286: if ("UTF-8".equalsIgnoreCase(encoding) ohair@286: || "UTF8".equalsIgnoreCase(encoding)) ohair@286: return new Utf8Reader(in); ohair@286: if ("US-ASCII".equalsIgnoreCase(encoding) ohair@286: || "ASCII".equalsIgnoreCase(encoding)) ohair@286: return new AsciiReader(in); ohair@286: if ("ISO-8859-1".equalsIgnoreCase(encoding) ohair@286: // plus numerous aliases ... ohair@286: ) ohair@286: return new Iso8859_1Reader(in); ohair@286: ohair@286: // ohair@286: // What we really want is an administerable resource mapping ohair@286: // encoding names/aliases to classnames. For example a property ohair@286: // file resource, "readers/mapping.props", holding and a set ohair@286: // of readers in that (sub)package... defaulting to this call ohair@286: // only if no better choice is available. 
ohair@286: // ohair@286: return new InputStreamReader(in, std2java(encoding)); ohair@286: } ohair@286: ohair@286: // ohair@286: // JDK doesn't know all of the standard encoding names, and ohair@286: // in particular none of the EBCDIC ones IANA defines (and ohair@286: // which IBM encourages). ohair@286: // ohair@286: static private final Hashtable charsets = new Hashtable(31); ohair@286: ohair@286: static { ohair@286: charsets.put("UTF-16", "Unicode"); ohair@286: charsets.put("ISO-10646-UCS-2", "Unicode"); ohair@286: ohair@286: // NOTE: no support for ISO-10646-UCS-4 yet. ohair@286: ohair@286: charsets.put("EBCDIC-CP-US", "cp037"); ohair@286: charsets.put("EBCDIC-CP-CA", "cp037"); ohair@286: charsets.put("EBCDIC-CP-NL", "cp037"); ohair@286: charsets.put("EBCDIC-CP-WT", "cp037"); ohair@286: ohair@286: charsets.put("EBCDIC-CP-DK", "cp277"); ohair@286: charsets.put("EBCDIC-CP-NO", "cp277"); ohair@286: charsets.put("EBCDIC-CP-FI", "cp278"); ohair@286: charsets.put("EBCDIC-CP-SE", "cp278"); ohair@286: ohair@286: charsets.put("EBCDIC-CP-IT", "cp280"); ohair@286: charsets.put("EBCDIC-CP-ES", "cp284"); ohair@286: charsets.put("EBCDIC-CP-GB", "cp285"); ohair@286: charsets.put("EBCDIC-CP-FR", "cp297"); ohair@286: ohair@286: charsets.put("EBCDIC-CP-AR1", "cp420"); ohair@286: charsets.put("EBCDIC-CP-HE", "cp424"); ohair@286: charsets.put("EBCDIC-CP-BE", "cp500"); ohair@286: charsets.put("EBCDIC-CP-CH", "cp500"); ohair@286: ohair@286: charsets.put("EBCDIC-CP-ROECE", "cp870"); ohair@286: charsets.put("EBCDIC-CP-YU", "cp870"); ohair@286: charsets.put("EBCDIC-CP-IS", "cp871"); ohair@286: charsets.put("EBCDIC-CP-AR2", "cp918"); ohair@286: ohair@286: // IANA also defines two that JDK 1.2 doesn't handle: ohair@286: // EBCDIC-CP-GR --> CP423 ohair@286: // EBCDIC-CP-TR --> CP905 ohair@286: } ohair@286: ohair@286: // returns an encoding name supported by JDK >= 1.1.6 ohair@286: // for some cases required by the XML spec ohair@286: private static String std2java(String encoding) { 
ohair@286: String temp = encoding.toUpperCase(); ohair@286: temp = (String) charsets.get(temp); ohair@286: return temp != null ? temp : encoding; ohair@286: } ohair@286: ohair@286: /** ohair@286: * Returns the standard name of the encoding in use ohair@286: */ ohair@286: public String getEncoding() { ohair@286: return assignedEncoding; ohair@286: } ohair@286: ohair@286: private XmlReader(InputStream stream) throws IOException { ohair@286: super(stream); ohair@286: ohair@286: PushbackInputStream pb; ohair@286: byte buf []; ohair@286: int len; ohair@286: ohair@286: if (stream instanceof PushbackInputStream) ohair@286: pb = (PushbackInputStream) stream; ohair@286: else ohair@286: pb = new PushbackInputStream(stream, MAXPUSHBACK); ohair@286: ohair@286: // ohair@286: // See if we can figure out the character encoding used ohair@286: // in this file by peeking at the first few bytes. ohair@286: // ohair@286: buf = new byte[4]; ohair@286: len = pb.read(buf); ohair@286: if (len > 0) ohair@286: pb.unread(buf, 0, len); ohair@286: ohair@286: if (len == 4) ohair@286: switch (buf[0] & 0x0ff) { ohair@286: case 0: ohair@286: // 00 3c 00 3f == illegal UTF-16 big-endian ohair@286: if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) { ohair@286: setEncoding(pb, "UnicodeBig"); ohair@286: return; ohair@286: } ohair@286: // else it's probably UCS-4 ohair@286: break; ohair@286: ohair@286: case '<': // 0x3c: the most common cases! ohair@286: switch (buf[1] & 0x0ff) { ohair@286: // First character is '<'; could be XML without ohair@286: // an XML directive such as "", "