src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlReader.java

changeset 0
373ffda63c9a
child 637
9c07ef4934dd
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlReader.java	Wed Apr 27 01:27:09 2016 +0800
     1.3 @@ -0,0 +1,784 @@
     1.4 +/*
     1.5 + * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
     1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.7 + *
     1.8 + * This code is free software; you can redistribute it and/or modify it
     1.9 + * under the terms of the GNU General Public License version 2 only, as
    1.10 + * published by the Free Software Foundation.  Oracle designates this
    1.11 + * particular file as subject to the "Classpath" exception as provided
    1.12 + * by Oracle in the LICENSE file that accompanied this code.
    1.13 + *
    1.14 + * This code is distributed in the hope that it will be useful, but WITHOUT
    1.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    1.16 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    1.17 + * version 2 for more details (a copy is included in the LICENSE file that
    1.18 + * accompanied this code).
    1.19 + *
    1.20 + * You should have received a copy of the GNU General Public License version
    1.21 + * 2 along with this work; if not, write to the Free Software Foundation,
    1.22 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    1.23 + *
    1.24 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    1.25 + * or visit www.oracle.com if you need additional information or have any
    1.26 + * questions.
    1.27 + */
    1.28 +
    1.29 +package com.sun.xml.internal.dtdparser;
    1.30 +
    1.31 +import java.io.ByteArrayInputStream;
    1.32 +import java.io.CharConversionException;
    1.33 +import java.io.IOException;
    1.34 +import java.io.InputStream;
    1.35 +import java.io.InputStreamReader;
    1.36 +import java.io.PushbackInputStream;
    1.37 +import java.io.Reader;
    1.38 +import java.util.Hashtable;
    1.39 +
    1.40 +
    1.41 +// NOTE:  Add I18N support to this class when JDK gets the ability to
    1.42 +// defer selection of locale for exception messages ... use the same
    1.43 +// technique for both.
    1.44 +
    1.45 +
    /**
     * This handles several XML-related tasks that normal java.io Readers
     * don't support, including use of IETF standard encoding names and
     * automatic detection of most XML encodings.  The former is needed
     * for interoperability; the latter is needed to conform with the XML
     * spec.  This class also optimizes reading some common encodings by
     * providing low-overhead unsynchronized Reader support.
     *
     * <p> Note that the autodetection facility should be used only on
     * data streams which have an unknown character encoding.  For example,
     * it should never be used on MIME text/xml entities.
     *
     * <p> Note that XML processors are only required to support UTF-8 and
     * UTF-16 character encodings.  Autodetection permits the underlying Java
     * implementation to provide support for many other encodings, such as
     * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
     *
     * @author David Brownell
     * @author Janet Koenig
     * @version 1.3 00/02/24
     */
    // package private
    final class XmlReader extends Reader {
        /**
         * Number of bytes that may be examined and pushed back while
         * looking for an encoding declaration.
         */
        private static final int MAXPUSHBACK = 512;

        /** Reader that performs the actual decoding; null once closed. */
        private Reader in;
        /** Encoding name chosen by autodetection or by the declaration. */
        private String assignedEncoding;
        private boolean closed;

        //
        // This class always delegates I/O to a reader, which gets
        // its data from the very beginning of the XML text.  It needs
        // to use a pushback stream since (a) autodetection can read
        // partial UTF-8 characters which need to be fully processed,
        // (b) the "Unicode" readers swallow characters that they think
        // are byte order marks, so tests fail if they don't see the
        // real byte order mark.
        //
        // It has to do this efficiently:  character I/O is solidly on the
        // critical path.  (So keep buffer length over 2 Kbytes to avoid
        // excess buffering. Many URL handlers stuff a BufferedInputStream
        // between here and the real data source, and larger buffers keep
        // that from slowing you down.)
        //

        /**
         * Constructs the reader from an input stream, autodetecting
         * the encoding to use according to the heuristic specified
         * in the XML 1.0 recommendation.
         *
         * @param in the input stream from which the reader is constructed
         * @throws IOException on error, such as unrecognized encoding
         */
        public static Reader createReader(InputStream in) throws IOException {
            return new XmlReader(in);
        }

        /**
         * Creates a reader supporting the given encoding, mapping
         * from standard encoding names to ones that are understood by
         * Java where necessary.
         *
         * @param in       the input stream from which the reader is constructed
         * @param encoding the IETF standard name of the encoding to use;
         *                 if null, autodetection is used.
         * @throws IOException on error, including unrecognized encoding
         */
        public static Reader createReader(InputStream in, String encoding)
                throws IOException {
            if (encoding == null)
                return new XmlReader(in);
            if ("UTF-8".equalsIgnoreCase(encoding)
                    || "UTF8".equalsIgnoreCase(encoding))
                return new Utf8Reader(in);
            if ("US-ASCII".equalsIgnoreCase(encoding)
                    || "ASCII".equalsIgnoreCase(encoding))
                return new AsciiReader(in);
            if ("ISO-8859-1".equalsIgnoreCase(encoding)
            // plus numerous aliases ...
            )
                return new Iso8859_1Reader(in);

            //
            // What we really want is an administerable resource mapping
            // encoding names/aliases to classnames.  For example a property
            // file resource, "readers/mapping.props", holding and a set
            // of readers in that (sub)package... defaulting to this call
            // only if no better choice is available.
            //
            return new InputStreamReader(in, std2java(encoding));
        }

        //
        // JDK doesn't know all of the standard encoding names, and
        // in particular none of the EBCDIC ones IANA defines (and
        // which IBM encourages).  Keys are upper-case standard names,
        // values are the names handed to InputStreamReader.
        //
        private static final Hashtable<String, String> charsets =
                new Hashtable<String, String>(31);

        static {
            charsets.put("UTF-16", "Unicode");
            charsets.put("ISO-10646-UCS-2", "Unicode");

            // NOTE: no support for ISO-10646-UCS-4 yet.

            charsets.put("EBCDIC-CP-US", "cp037");
            charsets.put("EBCDIC-CP-CA", "cp037");
            charsets.put("EBCDIC-CP-NL", "cp037");
            charsets.put("EBCDIC-CP-WT", "cp037");

            charsets.put("EBCDIC-CP-DK", "cp277");
            charsets.put("EBCDIC-CP-NO", "cp277");
            charsets.put("EBCDIC-CP-FI", "cp278");
            charsets.put("EBCDIC-CP-SE", "cp278");

            charsets.put("EBCDIC-CP-IT", "cp280");
            charsets.put("EBCDIC-CP-ES", "cp284");
            charsets.put("EBCDIC-CP-GB", "cp285");
            charsets.put("EBCDIC-CP-FR", "cp297");

            charsets.put("EBCDIC-CP-AR1", "cp420");
            charsets.put("EBCDIC-CP-HE", "cp424");
            charsets.put("EBCDIC-CP-BE", "cp500");
            charsets.put("EBCDIC-CP-CH", "cp500");

            charsets.put("EBCDIC-CP-ROECE", "cp870");
            charsets.put("EBCDIC-CP-YU", "cp870");
            charsets.put("EBCDIC-CP-IS", "cp871");
            charsets.put("EBCDIC-CP-AR2", "cp918");

            // IANA also defines two that JDK 1.2 doesn't handle:
            //    EBCDIC-CP-GR        --> CP423
            //    EBCDIC-CP-TR        --> CP905
        }

        // returns an encoding name supported by JDK >= 1.1.6
        // for some cases required by the XML spec; names that are not
        // in the table are returned unchanged.
        private static String std2java(String encoding) {
            // Encoding names are ASCII, so the case mapping must not depend
            // on the default locale (e.g. Turkish maps 'i' to a dotted 'I',
            // which would make every name containing 'i' miss the table).
            String mapped = charsets.get(
                    encoding.toUpperCase(java.util.Locale.ENGLISH));
            return mapped != null ? mapped : encoding;
        }

        /**
         * Returns the name of the encoding in use: either the name chosen
         * by autodetection or the one found in the encoding declaration.
         */
        public String getEncoding() {
            return assignedEncoding;
        }

        /**
         * Peeks at the first four bytes of the stream to pick an encoding,
         * then sets up the delegate reader over the (rewound) stream.
         *
         * A caller-supplied PushbackInputStream is used as-is; it needs room
         * for MAXPUSHBACK bytes because that many may be unread again.
         */
        private XmlReader(InputStream stream) throws IOException {
            super(stream);

            PushbackInputStream pb;
            byte buf [];
            int len;

            if (stream instanceof PushbackInputStream)
                pb = (PushbackInputStream) stream;
            else
                pb = new PushbackInputStream(stream, MAXPUSHBACK);

            //
            // See if we can figure out the character encoding used
            // in this file by peeking at the first few bytes.  A single
            // read may legitimately return fewer than four bytes, so keep
            // reading until we have four or the stream ends.
            //
            buf = new byte[4];
            len = 0;
            while (len < buf.length) {
                int n = pb.read(buf, len, buf.length - len);
                if (n <= 0)
                    break;
                len += n;
            }
            if (len > 0)
                pb.unread(buf, 0, len);

            if (len == 4)
                switch (buf[0] & 0x0ff) {
                case 0:
                    // 00 3c 00 3f == illegal UTF-16 big-endian
                    if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                        setEncoding(pb, "UnicodeBig");
                        return;
                    }
                    // else it's probably UCS-4
                    break;

                case '<':      // 0x3c: the most common cases!
                    switch (buf[1] & 0x0ff) {
                    // First character is '<'; could be XML without
                    // an XML directive such as "<hello>", "<!-- ...",
                    // and so on.
                    default:
                        break;

                        // 3c 00 3f 00 == illegal UTF-16 little endian
                    case 0x00:
                        if (buf[2] == 0x3f && buf[3] == 0x00) {
                            setEncoding(pb, "UnicodeLittle");
                            return;
                        }
                        // else probably UCS-4
                        break;

                        // 3c 3f 78 6d == ASCII and supersets '<?xm'
                    case '?':
                        if (buf[2] != 'x' || buf[3] != 'm')
                            break;
                        //
                        // One of several encodings could be used:
                        // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                        //
                        useEncodingDecl(pb, "UTF8");
                        return;
                    }
                    break;

                    // 4c 6f a7 94 ... some EBCDIC code page
                case 0x4c:
                    if (buf[1] == 0x6f
                            && (0x0ff & buf[2]) == 0x0a7
                            && (0x0ff & buf[3]) == 0x094) {
                        useEncodingDecl(pb, "CP037");
                        return;
                    }
                    // whoops, treat as UTF-8
                    break;

                    // UTF-16 big-endian
                case 0xfe:
                    if ((buf[1] & 0x0ff) != 0xff)
                        break;
                    setEncoding(pb, "UTF-16");
                    return;

                    // UTF-16 little-endian
                case 0xff:
                    if ((buf[1] & 0x0ff) != 0xfe)
                        break;
                    setEncoding(pb, "UTF-16");
                    return;

                    // default ... no XML declaration
                default:
                    break;
                }

            //
            // If all else fails, assume XML without a declaration, and
            // using UTF-8 encoding.
            //
            setEncoding(pb, "UTF-8");
        }

        /*
         * Read the encoding decl on the stream, knowing that it should
         * be readable using the specified encoding (basically, ASCII or
         * EBCDIC).  The body of the document may use a wider range of
         * characters than the XML/Text decl itself, so we switch to use
         * the specified encoding as soon as we can.  (ASCII is a subset
         * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
         * has a variety of "code pages" that have these characters as
         * a common subset.)
         *
         * Falls back to UTF-8 when no legal encoding name is found.
         */
        private void useEncodingDecl(PushbackInputStream pb, String encoding)
                throws IOException {
            byte buffer [] = new byte[MAXPUSHBACK];
            int len;
            Reader r;
            int c;

            //
            // Buffer up a bunch of input, and set up to read it in
            // the specified encoding ... we can skip the first four
            // bytes since we know that "<?xm" was read to determine
            // what encoding to use!
            //
            len = pb.read(buffer, 0, buffer.length);
            pb.unread(buffer, 0, len);
            // The third argument is a byte count, not an end index: only
            // (len - 4) bytes of real data follow the four skipped ones.
            r = new InputStreamReader(
                    new ByteArrayInputStream(buffer, 4, len - 4),
                    encoding);

            //
            // Next must be "l" (and whitespace) else we conclude
            // error and choose UTF-8.
            //
            if ((c = r.read()) != 'l') {
                setEncoding(pb, "UTF-8");
                return;
            }

            //
            // Then, we'll skip any
            //     S version="..."     [or single quotes]
            // bit and get any subsequent
            //     S encoding="..."     [or single quotes]
            //
            // We put an arbitrary size limit on how far we read; lots
            // of space will break this algorithm.
            //
            StringBuilder buf = new StringBuilder();
            StringBuilder keyBuf = null;
            String key = null;
            boolean sawEq = false;
            char quoteChar = 0;
            boolean sawQuestion = false;

            XmlDecl:
            for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
                if ((c = r.read()) == -1)
                    break;

                // ignore whitespace before/between "key = 'value'"
                if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                    continue;

                // ... but require at least a little!
                if (i == 0)
                    break;

                // terminate the loop ASAP
                if (c == '?')
                    sawQuestion = true;
                else if (sawQuestion) {
                    if (c == '>')
                        break;
                    sawQuestion = false;
                }

                // did we get the "key =" bit yet?
                if (key == null || !sawEq) {
                    if (keyBuf == null) {
                        if (Character.isWhitespace((char) c))
                            continue;
                        keyBuf = buf;
                        buf.setLength(0);
                        buf.append((char) c);
                        sawEq = false;
                    } else if (Character.isWhitespace((char) c)) {
                        key = keyBuf.toString();
                    } else if (c == '=') {
                        if (key == null)
                            key = keyBuf.toString();
                        sawEq = true;
                        keyBuf = null;
                        quoteChar = 0;
                    } else
                        keyBuf.append((char) c);
                    continue;
                }

                // space before quoted value
                if (Character.isWhitespace((char) c))
                    continue;
                if (c == '"' || c == '\'') {
                    if (quoteChar == 0) {
                        // opening quote: start collecting the value
                        quoteChar = (char) c;
                        buf.setLength(0);
                        continue;
                    } else if (c == quoteChar) {
                        // closing quote: the value is complete
                        if ("encoding".equals(key)) {
                            assignedEncoding = buf.toString();

                            // map illegal names to UTF-8 default
                            if (!isLegalEncodingName(assignedEncoding))
                                break XmlDecl;

                            setEncoding(pb, assignedEncoding);
                            return;

                        } else {
                            // some other pseudo-attribute (e.g. version)
                            key = null;
                            continue;
                        }
                    }
                }
                buf.append((char) c);
            }

            setEncoding(pb, "UTF-8");
        }

        // Checks a declared name against
        //     [81] EncName ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
        // An empty name is not legal either.
        private static boolean isLegalEncodingName(String name) {
            if (name.length() == 0)
                return false;
            for (int k = 0; k < name.length(); k++) {
                char ch = name.charAt(k);
                if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
                    continue;
                if (k > 0 && (ch == '-'
                        || (ch >= '0' && ch <= '9')
                        || ch == '.' || ch == '_'))
                    continue;
                return false;
            }
            return true;
        }

        // Records the encoding name and creates the delegate reader for it.
        private void setEncoding(InputStream stream, String encoding)
                throws IOException {
            assignedEncoding = encoding;
            in = createReader(stream, encoding);
        }

        /**
         * Reads characters into the buffer.  Returns the number of
         * characters read, or -1 on EOF or when this reader is closed.
         * Reaching EOF closes this reader.
         */
        @Override
        public int read(char buf [], int off, int len) throws IOException {
            int val;

            if (closed)
                return -1;        // throw new IOException ("closed");
            val = in.read(buf, off, len);
            if (val == -1)
                close();
            return val;
        }

        /**
         * Reads a single character, or returns -1 on EOF.  Reaching EOF
         * closes this reader.
         *
         * @throws IOException if this reader is already closed
         */
        @Override
        public int read() throws IOException {
            int val;

            if (closed)
                throw new IOException("closed");
            val = in.read();
            if (val == -1)
                close();
            return val;
        }

        /**
         * Returns true iff the reader supports mark/reset.
         */
        @Override
        public boolean markSupported() {
            return in == null ? false : in.markSupported();
        }

        /**
         * Sets a mark allowing a limited number of characters to
         * be "peeked", by reading and then resetting.  Does nothing
         * once the reader is closed.
         *
         * @param value how many characters may be "peeked".
         */
        @Override
        public void mark(int value) throws IOException {
            if (in != null) in.mark(value);
        }

        /**
         * Resets the current position to the last marked position.
         * Does nothing once the reader is closed.
         */
        @Override
        public void reset() throws IOException {
            if (in != null) in.reset();
        }

        /**
         * Skips a specified number of characters; returns the number
         * actually skipped (zero once the reader is closed).
         */
        @Override
        public long skip(long value) throws IOException {
            return in == null ? 0 : in.skip(value);
        }

        /**
         * Returns true iff input characters are known to be ready.
         */
        @Override
        public boolean ready() throws IOException {
            return in == null ? false : in.ready();
        }

        /**
         * Closes the reader and its delegate.  Closing an already
         * closed reader has no effect.
         */
        @Override
        public void close() throws IOException {
            if (closed)
                return;
            in.close();
            in = null;
            closed = true;
        }

        //
        // Delegating to a converter module will always be slower than
        // direct conversion.  Use a similar approach for any other
        // readers that need to be particularly fast; only block I/O
        // speed matters to this package.  For UTF-16, separate readers
        // for big and little endian streams make a difference, too;
        // fewer conditionals in the critical path!
        //
        static abstract class BaseReader extends Reader {
            protected InputStream instream;
            protected byte buffer [];
            // buffer[start .. finish-1] holds bytes not yet converted
            protected int start, finish;

            BaseReader(InputStream stream) {
                super(stream);

                instream = stream;
                buffer = new byte[8192];
            }

            @Override
            public boolean ready() throws IOException {
                return instream == null
                        || (finish - start) > 0
                        || instream.available() != 0;
            }

            // caller shouldn't read again
            @Override
            public void close() throws IOException {
                if (instream != null) {
                    instream.close();
                    start = finish = 0;
                    buffer = null;
                    instream = null;
                }
            }
        }

        //
        // We want this reader, to make the default encoding be as fast
        // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
        // InputStreamReader works, but 20+% slower speed isn't OK for
        // the default/primary encoding.
        //
        static final class Utf8Reader extends BaseReader {
            // 2nd half of UTF-16 surrogate pair produced from a
            // four byte UTF-8 sequence; zero when none is pending
            private char nextChar;

            Utf8Reader(InputStream stream) {
                super(stream);
            }

            /**
             * Decodes UTF-8 bytes into the given buffer.  Returns the
             * number of chars stored, 0 if len is not positive, or -1
             * at end of input.
             *
             * @throws CharConversionException on malformed, truncated or
             *      out-of-range UTF-8 input
             */
            @Override
            public int read(char buf [], int offset, int len) throws IOException {
                int i = 0, c = 0;

                if (len <= 0)
                    return 0;

                // Consume remaining half of any surrogate pair immediately
                if (nextChar != 0) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }

                while (i < len) {
                    // stop or read data if needed
                    if (finish <= start) {
                        if (instream == null) {
                            c = -1;
                            break;
                        }
                        start = 0;
                        finish = instream.read(buffer, 0, buffer.length);
                        if (finish <= 0) {
                            this.close();
                            c = -1;
                            break;
                        }
                    }

                    //
                    // RFC 2279 describes UTF-8; there are six encodings.
                    // Each encoding takes a fixed number of characters
                    // (1-6 bytes) and is flagged by a bit pattern in the
                    // first byte.  The five and six byte-per-character
                    // encodings address characters which are disallowed
                    // in XML documents, as do some four byte ones.
                    //

                    //
                    // Single byte == ASCII.  Common; optimize.
                    //
                    c = buffer[start] & 0x0ff;
                    if ((c & 0x80) == 0x00) {
                        // 0x0000 <= c <= 0x007f
                        start++;
                        buf[offset + i++] = (char) c;
                        continue;
                    }

                    //
                    // Multibyte chars -- check offsets optimistically,
                    // ditto the "10xx xxxx" format for subsequent bytes
                    //
                    int off = start;

                    try {
                        // 2 bytes
                        if ((buffer[off] & 0x0E0) == 0x0C0) {
                            c = (buffer[off++] & 0x1f) << 6;
                            c += buffer[off++] & 0x3f;

                            // 0x0080 <= c <= 0x07ff

                            // 3 bytes
                        } else if ((buffer[off] & 0x0F0) == 0x0E0) {
                            c = (buffer[off++] & 0x0f) << 12;
                            c += (buffer[off++] & 0x3f) << 6;
                            c += buffer[off++] & 0x3f;

                            // 0x0800 <= c <= 0xffff

                            // 4 bytes
                        } else if ((buffer[off] & 0x0f8) == 0x0F0) {
                            c = (buffer[off++] & 0x07) << 18;
                            c += (buffer[off++] & 0x3f) << 12;
                            c += (buffer[off++] & 0x3f) << 6;
                            c += buffer[off++] & 0x3f;

                            // 0x0001 0000  <= c  <= 0x001f ffff

                            //
                            // Only judge the value once all four bytes are
                            // really in the valid part of the buffer.  After
                            // a short read the trailing positions still hold
                            // stale bytes, which must neither trigger a
                            // range error nor leave a bogus nextChar behind;
                            // the partial-character path below refills and
                            // decodes this character again.
                            //
                            if (off <= finish) {
                                // Unicode supports c <= 0x0010 ffff ...
                                if (c > 0x0010ffff)
                                    throw new CharConversionException("UTF-8 encoding of character 0x00"
                                            + Integer.toHexString(c)
                                            + " can't be converted to Unicode.");

                                // Convert UCS-4 char to surrogate pair (UTF-16)
                                c -= 0x10000;
                                nextChar = (char) (0xDC00 + (c & 0x03ff));
                                c = 0xD800 + (c >> 10);
                            }

                            // 5 and 6 byte versions are XML WF errors, but
                            // typically come from mislabeled encodings
                        } else
                            throw new CharConversionException("Unconvertible UTF-8 character"
                                    + " beginning with 0x"
                                    + Integer.toHexString(buffer[start] & 0xff));

                    } catch (ArrayIndexOutOfBoundsException e) {
                        // off > length && length >= buffer.length
                        c = 0;
                    }

                    //
                    // if the buffer held only a partial character,
                    // compact it and try to read the rest of the
                    // character.  worst case involves three
                    // single-byte reads -- quite rare.
                    //
                    if (off > finish) {
                        System.arraycopy(buffer, start,
                                buffer, 0, finish - start);
                        finish -= start;
                        start = 0;
                        off = instream.read(buffer, finish,
                                buffer.length - finish);
                        if (off < 0) {
                            this.close();
                            throw new CharConversionException("Partial UTF-8 char");
                        }
                        finish += off;
                        continue;
                    }

                    //
                    // check the format of the non-initial bytes
                    //
                    for (start++; start < off; start++) {
                        if ((buffer[start] & 0xC0) != 0x80) {
                            this.close();
                            throw new CharConversionException("Malformed UTF-8 char -- "
                                    + "is an XML encoding declaration missing?");
                        }
                    }

                    //
                    // If this needed a surrogate pair, consume ASAP
                    //
                    buf[offset + i++] = (char) c;
                    if (nextChar != 0 && i < len) {
                        buf[offset + i++] = nextChar;
                        nextChar = 0;
                    }
                }
                if (i > 0)
                    return i;
                return (c == -1) ? -1 : 0;
            }
        }

        //
        // We want ASCII and ISO-8859 Readers since they're the most common
        // encodings in the US and Europe, and we don't want performance
        // regressions for them.  They're also easy to implement efficiently,
        // since they're bitmask subsets of UNICODE.
        //
        // XXX haven't benchmarked these readers vs what we get out of JDK.
        //
        static final class AsciiReader extends BaseReader {
            AsciiReader(InputStream in) {
                super(in);
            }

            /**
             * Converts 7-bit ASCII bytes to chars.  Returns the number of
             * chars stored, 0 if len is not positive, or -1 at end of
             * input or after close.
             *
             * @throws CharConversionException if a byte has its high bit set
             */
            @Override
            public int read(char buf [], int offset, int len) throws IOException {
                int i, c;

                if (instream == null)
                    return -1;
                // an empty request is not an end-of-input condition
                if (len <= 0)
                    return 0;

                for (i = 0; i < len; i++) {
                    if (start >= finish) {
                        start = 0;
                        finish = instream.read(buffer, 0, buffer.length);
                        if (finish <= 0) {
                            this.close();
                            break;
                        }
                    }
                    c = buffer[start++];
                    if ((c & 0x80) != 0)
                        throw new CharConversionException("Illegal ASCII character, 0x"
                                + Integer.toHexString(c & 0xff));
                    buf[offset + i] = (char) c;
                }
                if (i == 0 && finish <= 0)
                    return -1;
                return i;
            }
        }

        static final class Iso8859_1Reader extends BaseReader {
            Iso8859_1Reader(InputStream in) {
                super(in);
            }

            /**
             * Converts ISO-8859-1 bytes to chars (each byte is the low
             * eight bits of the char).  Returns the number of chars stored,
             * 0 if len is not positive, or -1 at end of input or after close.
             */
            @Override
            public int read(char buf [], int offset, int len) throws IOException {
                int i;

                if (instream == null)
                    return -1;
                // an empty request is not an end-of-input condition
                if (len <= 0)
                    return 0;

                for (i = 0; i < len; i++) {
                    if (start >= finish) {
                        start = 0;
                        finish = instream.read(buffer, 0, buffer.length);
                        if (finish <= 0) {
                            this.close();
                            break;
                        }
                    }
                    buf[offset + i] = (char) (0x0ff & buffer[start++]);
                }
                if (i == 0 && finish <= 0)
                    return -1;
                return i;
            }
        }
    }

mercurial