/*
 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Locale;


// NOTE:  Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.


/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <P> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * <P> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
// package private
final class XmlReader extends Reader {
    private static final int MAXPUSHBACK = 512;

    private Reader in;
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got to do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering.  Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @return a reader decoding the stream with the detected encoding
     * @throws IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        return new XmlReader(in);
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in       the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *                 if null, autodetection is used.
     * @return a reader decoding the stream with the requested encoding
     * @throws IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
            throws IOException {
        if (encoding == null) {
            return new XmlReader(in);
        }
        if ("UTF-8".equalsIgnoreCase(encoding)
                || "UTF8".equalsIgnoreCase(encoding)) {
            return new Utf8Reader(in);
        }
        if ("US-ASCII".equalsIgnoreCase(encoding)
                || "ASCII".equalsIgnoreCase(encoding)) {
            return new AsciiReader(in);
        }
        if ("ISO-8859-1".equalsIgnoreCase(encoding)
            // plus numerous aliases ...
        ) {
            return new Iso8859_1Reader(in);
        }

        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example a property
        // file resource, "readers/mapping.props", holding and a set
        // of readers in that (sub)package... defaulting to this call
        // only if no better choice is available.
        //
        return new InputStreamReader(in, std2java(encoding));
    }

    //
    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).
    //
    private static final Hashtable<String, String> charsets =
            new Hashtable<String, String>(31);

    static {
        charsets.put("UTF-16", "Unicode");
        charsets.put("ISO-10646-UCS-2", "Unicode");

        // NOTE: no support for ISO-10646-UCS-4 yet.

        charsets.put("EBCDIC-CP-US", "cp037");
        charsets.put("EBCDIC-CP-CA", "cp037");
        charsets.put("EBCDIC-CP-NL", "cp037");
        charsets.put("EBCDIC-CP-WT", "cp037");

        charsets.put("EBCDIC-CP-DK", "cp277");
        charsets.put("EBCDIC-CP-NO", "cp277");
        charsets.put("EBCDIC-CP-FI", "cp278");
        charsets.put("EBCDIC-CP-SE", "cp278");

        charsets.put("EBCDIC-CP-IT", "cp280");
        charsets.put("EBCDIC-CP-ES", "cp284");
        charsets.put("EBCDIC-CP-GB", "cp285");
        charsets.put("EBCDIC-CP-FR", "cp297");

        charsets.put("EBCDIC-CP-AR1", "cp420");
        charsets.put("EBCDIC-CP-HE", "cp424");
        charsets.put("EBCDIC-CP-BE", "cp500");
        charsets.put("EBCDIC-CP-CH", "cp500");

        charsets.put("EBCDIC-CP-ROECE", "cp870");
        charsets.put("EBCDIC-CP-YU", "cp870");
        charsets.put("EBCDIC-CP-IS", "cp871");
        charsets.put("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR        --> CP423
        //  EBCDIC-CP-TR        --> CP905
    }

    // returns an encoding name supported by JDK >= 1.1.6
    // for some cases required by the XML spec
    private static String std2java(String encoding) {
        // The table keys are plain ASCII, so the case mapping must not depend
        // on the default locale (e.g. Turkish maps 'i' to a dotted capital I).
        String mapped = charsets.get(encoding.toUpperCase(Locale.ENGLISH));
        return mapped != null ? mapped : encoding;
    }

    /**
     * Returns the standard name of the encoding in use
     */
    public String getEncoding() {
        return assignedEncoding;
    }

    /**
     * Wraps the stream, peeks at its first four bytes to guess the
     * encoding family, and installs the matching delegate reader.
     *
     * @param stream the raw XML byte stream; used as-is when it already
     *               is a PushbackInputStream
     * @throws IOException on I/O error or unsupported encoding
     */
    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        PushbackInputStream pb;
        if (stream instanceof PushbackInputStream) {
            pb = (PushbackInputStream) stream;
        } else {
            pb = new PushbackInputStream(stream, MAXPUSHBACK);
        }

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.  A single
        // read() may legally return fewer bytes than requested, so
        // keep reading until we have all four or hit end of stream.
        //
        byte[] buf = new byte[4];
        int len = 0;
        while (len < buf.length) {
            int n = pb.read(buf, len, buf.length - len);
            if (n <= 0) {
                break;
            }
            len += n;
        }
        if (len > 0) {
            pb.unread(buf, 0, len);
        }

        if (len == 4) {
            switch (buf[0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == illegal UTF-16 big-endian
                if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                    setEncoding(pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<':      // 0x3c: the most common cases!
                switch (buf[1] & 0x0ff) {
                // First character is '<'; could be XML without
                // an XML directive such as "<hello>", "<!-- ...",
                // and so on.
                default:
                    break;

                // 3c 00 3f 00 == illegal UTF-16 little endian
                case 0x00:
                    if (buf[2] == 0x3f && buf[3] == 0x00) {
                        setEncoding(pb, "UnicodeLittle");
                        return;
                    }
                    // else probably UCS-4
                    break;

                // 3c 3f 78 6d == ASCII and supersets '<?xm'
                case '?':
                    if (buf[2] != 'x' || buf[3] != 'm') {
                        break;
                    }
                    //
                    // One of several encodings could be used:
                    // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                    //
                    useEncodingDecl(pb, "UTF8");
                    return;
                }
                break;

            // 4c 6f a7 94 ... some EBCDIC code page
            case 0x4c:
                if (buf[1] == 0x6f
                        && (0x0ff & buf[2]) == 0x0a7
                        && (0x0ff & buf[3]) == 0x094) {
                    useEncodingDecl(pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

            // UTF-16 big-endian
            case 0xfe:
                if ((buf[1] & 0x0ff) != 0xff) {
                    break;
                }
                setEncoding(pb, "UTF-16");
                return;

            // UTF-16 little-endian
            case 0xff:
                if ((buf[1] & 0x0ff) != 0xfe) {
                    break;
                }
                setEncoding(pb, "UTF-16");
                return;

            // default ... no XML declaration
            default:
                break;
            }
        }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     *
     * Falls back to UTF-8 whenever no legal encoding pseudo-attribute
     * is found within the bytes that were buffered.
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding)
            throws IOException {
        byte[] buffer = new byte[MAXPUSHBACK];

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        int len = pb.read(buffer, 0, buffer.length);
        if (len > 0) {
            pb.unread(buffer, 0, len);
        }
        if (len < 4) {
            // Not even the four sniffed bytes came back; nothing to parse.
            setEncoding(pb, "UTF-8");
            return;
        }
        // The third argument is a length, not an end offset: only len - 4
        // bytes of real data follow the four that are skipped.
        Reader r = new InputStreamReader(
                new ByteArrayInputStream(buffer, 4, len - 4), encoding);
        int c;

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        if ((c = r.read()) != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //  S version="..."     [or single quotes]
        // bit and get any subsequent
        //  S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuilder buf = new StringBuilder();
        StringBuilder keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        XmlDecl:
        for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read()) == -1) {
                break;
            }

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                continue;
            }

            // ... but require at least a little!
            if (i == 0) {
                break;
            }

            // terminate the loop ASAP
            if (c == '?') {
                sawQuestion = true;
            } else if (sawQuestion) {
                if (c == '>') {
                    break;
                }
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace((char) c)) {
                        continue;
                    }
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null) {
                        key = keyBuf.toString();
                    }
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else {
                    keyBuf.append((char) c);
                }
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c)) {
                continue;
            }
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    if ("encoding".equals(key)) {
                        String declared = buf.toString();

                        // map illegal names to UTF-8 default
                        if (!isLegalEncodingName(declared)) {
                            break XmlDecl;
                        }

                        setEncoding(pb, declared);
                        return;

                    } else {
                        key = null;
                        continue;
                    }
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }

    // Checks a declared name against the XML 1.0 production
    // [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
    private static boolean isLegalEncodingName(String name) {
        if (name.length() == 0) {
            return false;
        }
        for (int i = 0; i < name.length(); i++) {
            char ch = name.charAt(i);
            if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) {
                continue;
            }
            if (i > 0 && (ch == '-'
                    || (ch >= '0' && ch <= '9')
                    || ch == '.' || ch == '_')) {
                continue;
            }
            return false;
        }
        return true;
    }

    // Records the encoding name and installs the reader that decodes it.
    private void setEncoding(InputStream stream, String encoding)
            throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }

    /**
     * Reads characters into the buffer, returning the number of
     * characters read, or -1 on EOF (or if this reader is closed).
     * The reader closes itself when the delegate reports EOF.
     */
    @Override
    public int read(char[] buf, int off, int len) throws IOException {
        if (closed) {
            return -1;        // throw new IOException ("closed");
        }
        int val = in.read(buf, off, len);
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Reads a single character.
     *
     * @throws IOException if this reader has been closed, or on I/O error
     */
    @Override
    public int read() throws IOException {
        if (closed) {
            throw new IOException("closed");
        }
        int val = in.read();
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    @Override
    public boolean markSupported() {
        return in == null ? false : in.markSupported();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.
     *
     * @param value how many characters may be "peeked".
     */
    @Override
    public void mark(int value) throws IOException {
        if (in != null) {
            in.mark(value);
        }
    }

    /**
     * Resets the current position to the last marked position.
     */
    @Override
    public void reset() throws IOException {
        if (in != null) {
            in.reset();
        }
    }

    /**
     * Skips a specified number of characters.
     */
    @Override
    public long skip(long value) throws IOException {
        return in == null ? 0 : in.skip(value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    @Override
    public boolean ready() throws IOException {
        return in == null ? false : in.ready();
    }

    /**
     * Closes the reader.  Closing an already closed reader is a no-op.
     */
    @Override
    public void close() throws IOException {
        if (closed) {
            return;
        }
        if (in != null) {
            in.close();
            in = null;
        }
        closed = true;
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    static abstract class BaseReader extends Reader {
        protected InputStream instream;
        protected byte[] buffer;
        // buffer[start .. finish) holds bytes read but not yet decoded
        protected int start, finish;

        BaseReader(InputStream stream) {
            super(stream);

            instream = stream;
            buffer = new byte[8192];
        }

        @Override
        public boolean ready() throws IOException {
            return instream == null
                    || (finish - start) > 0
                    || instream.available() != 0;
        }

        // caller shouldn't read again
        @Override
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // 2nd half of UTF-8 surrogate pair; 0 means "none pending"
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super(stream);
        }

        @Override
        public int read(char[] buf, int offset, int len) throws IOException {
            int i = 0, c = 0;

            if (len <= 0) {
                return 0;
            }

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        c = -1;
                        break;
                    }
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        c = -1;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                c = buffer[start] & 0x0ff;
                if ((c & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) c;
                    continue;
                }

                //
                // Multibyte chars -- the lead byte tells how many bytes
                // the character occupies.
                //
                int need;
                if ((c & 0x0E0) == 0x0C0) {
                    need = 2;
                } else if ((c & 0x0F0) == 0x0E0) {
                    need = 3;
                } else if ((c & 0x0F8) == 0x0F0) {
                    need = 4;
                } else {
                    // 5 and 6 byte versions are XML WF errors, but
                    // typically come from mislabeled encodings
                    throw new CharConversionException("Unconvertible UTF-8 character"
                            + " beginning with 0x"
                            + Integer.toHexString(c));
                }

                //
                // if the buffer held only a partial character,
                // compact it and try to read the rest of the
                // character.  worst case involves three
                // single-byte reads -- quite rare.
                //
                // This must happen before decoding: bytes beyond
                // "finish" are stale and must never be interpreted.
                //
                if (start + need > finish) {
                    System.arraycopy(buffer, start,
                            buffer, 0, finish - start);
                    finish -= start;
                    start = 0;
                    int more = instream.read(buffer, finish,
                            buffer.length - finish);
                    if (more < 0) {
                        this.close();
                        throw new CharConversionException("Partial UTF-8 char");
                    }
                    finish += more;
                    continue;
                }

                //
                // check the format of the non-initial bytes
                //
                for (int k = start + 1; k < start + need; k++) {
                    if ((buffer[k] & 0xC0) != 0x80) {
                        this.close();
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }
                }

                switch (need) {
                case 2:
                    // 0x0080 <= c <= 0x07ff
                    c = ((c & 0x1f) << 6)
                            + (buffer[start + 1] & 0x3f);
                    break;

                case 3:
                    // 0x0800 <= c <= 0xffff
                    c = ((c & 0x0f) << 12)
                            + ((buffer[start + 1] & 0x3f) << 6)
                            + (buffer[start + 2] & 0x3f);
                    break;

                default:
                    // 0x0001 0000 <= c <= 0x001f ffff
                    c = ((c & 0x07) << 18)
                            + ((buffer[start + 1] & 0x3f) << 12)
                            + ((buffer[start + 2] & 0x3f) << 6)
                            + (buffer[start + 3] & 0x3f);

                    // Unicode supports c <= 0x0010 ffff ...
                    if (c > 0x0010ffff) {
                        throw new CharConversionException("UTF-8 encoding of character 0x00"
                                + Integer.toHexString(c)
                                + " can't be converted to Unicode.");
                    }
                    // ... and a four byte form below 0x10000 is overlong;
                    // it cannot be expressed as a surrogate pair.
                    if (c < 0x10000) {
                        throw new CharConversionException("Overlong UTF-8 encoding of character 0x"
                                + Integer.toHexString(c));
                    }

                    // Convert UCS-4 char to surrogate pair (UTF-16)
                    c -= 0x10000;
                    nextChar = (char) (0xDC00 + (c & 0x03ff));
                    c = 0xD800 + (c >> 10);
                    break;
                }
                start += need;

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf[offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0) {
                return i;
            }
            return (c == -1) ? -1 : 0;
        }
    }

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader {
        AsciiReader(InputStream in) {
            super(in);
        }

        @Override
        public int read(char[] buf, int offset, int len) throws IOException {
            // a zero-length request reads nothing and is not an EOF
            if (len <= 0) {
                return 0;
            }
            if (instream == null) {
                return -1;
            }

            int i;
            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                int c = buffer[start++];
                if ((c & 0x80) != 0) {
                    throw new CharConversionException("Illegal ASCII character, 0x"
                            + Integer.toHexString(c & 0xff));
                }
                buf[offset + i] = (char) c;
            }
            if (i == 0 && finish <= 0) {
                return -1;
            }
            return i;
        }
    }

    static final class Iso8859_1Reader extends BaseReader {
        Iso8859_1Reader(InputStream in) {
            super(in);
        }

        @Override
        public int read(char[] buf, int offset, int len) throws IOException {
            // a zero-length request reads nothing and is not an EOF
            if (len <= 0) {
                return 0;
            }
            if (instream == null) {
                return -1;
            }

            int i;
            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                // each byte maps directly to the same Unicode code point
                buf[offset + i] = (char) (0x0ff & buffer[start++]);
            }
            if (i == 0 && finish <= 0) {
                return -1;
            }
            return i;
        }
    }
}