src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlReader.java

Fri, 23 Aug 2013 09:57:21 +0100

author
mkos
date
Fri, 23 Aug 2013 09:57:21 +0100
changeset 397
b99d7e355d4b
parent 286
f50545b5e2f1
child 637
9c07ef4934dd
permissions
-rw-r--r--

8022885: Update JAX-WS RI integration to 2.2.9-b14140
8013016: Rebase 8009009 against the latest jdk8/jaxws
Reviewed-by: alanb, chegar

ohair@286 1 /*
mkos@397 2 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
ohair@286 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
ohair@286 4 *
ohair@286 5 * This code is free software; you can redistribute it and/or modify it
ohair@286 6 * under the terms of the GNU General Public License version 2 only, as
ohair@286 7 * published by the Free Software Foundation. Oracle designates this
ohair@286 8 * particular file as subject to the "Classpath" exception as provided
ohair@286 9 * by Oracle in the LICENSE file that accompanied this code.
ohair@286 10 *
ohair@286 11 * This code is distributed in the hope that it will be useful, but WITHOUT
ohair@286 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
ohair@286 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
ohair@286 14 * version 2 for more details (a copy is included in the LICENSE file that
ohair@286 15 * accompanied this code).
ohair@286 16 *
ohair@286 17 * You should have received a copy of the GNU General Public License version
ohair@286 18 * 2 along with this work; if not, write to the Free Software Foundation,
ohair@286 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
ohair@286 20 *
ohair@286 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
ohair@286 22 * or visit www.oracle.com if you need additional information or have any
ohair@286 23 * questions.
ohair@286 24 */
ohair@286 25
ohair@286 26 package com.sun.xml.internal.dtdparser;
ohair@286 27
ohair@286 28 import java.io.ByteArrayInputStream;
ohair@286 29 import java.io.CharConversionException;
ohair@286 30 import java.io.IOException;
ohair@286 31 import java.io.InputStream;
ohair@286 32 import java.io.InputStreamReader;
ohair@286 33 import java.io.PushbackInputStream;
ohair@286 34 import java.io.Reader;
ohair@286 35 import java.util.Hashtable;
ohair@286 36
ohair@286 37
ohair@286 38 // NOTE: Add I18N support to this class when JDK gets the ability to
ohair@286 39 // defer selection of locale for exception messages ... use the same
ohair@286 40 // technique for both.
ohair@286 41
ohair@286 42
/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <p> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * <p> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
// package private
final class XmlReader extends Reader {
    /** Maximum number of bytes that are peeked at (and pushed back) while autodetecting. */
    private static final int MAXPUSHBACK = 512;

    /** The reader all I/O is delegated to; set to null once this reader is closed. */
    private Reader in;
    /** Name of the encoding selected by autodetection. */
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got to do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering.  Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @return a reader decoding {@code in} with the detected encoding
     * @throws IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        return new XmlReader(in);
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in       the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *                 if null, autodetection is used.
     * @return a reader decoding {@code in} with the requested encoding
     * @throws IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
            throws IOException {
        if (encoding == null) {
            return new XmlReader(in);
        }
        if ("UTF-8".equalsIgnoreCase(encoding)
                || "UTF8".equalsIgnoreCase(encoding)) {
            return new Utf8Reader(in);
        }
        if ("US-ASCII".equalsIgnoreCase(encoding)
                || "ASCII".equalsIgnoreCase(encoding)) {
            return new AsciiReader(in);
        }
        // ISO-8859-1 has numerous aliases; only the standard name is
        // routed to the fast reader here.
        if ("ISO-8859-1".equalsIgnoreCase(encoding)) {
            return new Iso8859_1Reader(in);
        }

        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example a property
        // file resource, "readers/mapping.props", holding the mapping,
        // and a set of readers in that (sub)package... defaulting to
        // this call only if no better choice is available.
        //
        return new InputStreamReader(in, std2java(encoding));
    }

    //
    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).  Keys are upper-case standard names,
    // values are the names handed to InputStreamReader.
    //
    private static final Hashtable<String, String> charsets =
            new Hashtable<String, String>(31);

    static {
        charsets.put("UTF-16", "Unicode");
        charsets.put("ISO-10646-UCS-2", "Unicode");

        // NOTE: no support for ISO-10646-UCS-4 yet.

        charsets.put("EBCDIC-CP-US", "cp037");
        charsets.put("EBCDIC-CP-CA", "cp037");
        charsets.put("EBCDIC-CP-NL", "cp037");
        charsets.put("EBCDIC-CP-WT", "cp037");

        charsets.put("EBCDIC-CP-DK", "cp277");
        charsets.put("EBCDIC-CP-NO", "cp277");
        charsets.put("EBCDIC-CP-FI", "cp278");
        charsets.put("EBCDIC-CP-SE", "cp278");

        charsets.put("EBCDIC-CP-IT", "cp280");
        charsets.put("EBCDIC-CP-ES", "cp284");
        charsets.put("EBCDIC-CP-GB", "cp285");
        charsets.put("EBCDIC-CP-FR", "cp297");

        charsets.put("EBCDIC-CP-AR1", "cp420");
        charsets.put("EBCDIC-CP-HE", "cp424");
        charsets.put("EBCDIC-CP-BE", "cp500");
        charsets.put("EBCDIC-CP-CH", "cp500");

        charsets.put("EBCDIC-CP-ROECE", "cp870");
        charsets.put("EBCDIC-CP-YU", "cp870");
        charsets.put("EBCDIC-CP-IS", "cp871");
        charsets.put("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR        --> CP423
        //  EBCDIC-CP-TR        --> CP905
    }

    /**
     * Maps a standard encoding name to the name Java knows it by, for
     * the cases listed in {@link #charsets}; any other name is returned
     * unchanged.
     *
     * <p>The lookup key is upper-cased with a fixed locale: with the
     * default locale a Turkish environment would turn {@code 'i'} into
     * a dotted capital I and miss every table entry containing an "I".
     */
    private static String std2java(String encoding) {
        String mapped = charsets.get(encoding.toUpperCase(java.util.Locale.ENGLISH));
        return mapped != null ? mapped : encoding;
    }

    /**
     * Returns the standard name of the encoding in use.
     */
    public String getEncoding() {
        return assignedEncoding;
    }

    /**
     * Peeks at the first four bytes of the stream, picks an encoding
     * from them (possibly after also parsing the XML/text declaration),
     * and sets up the delegate reader.  All peeked bytes are pushed
     * back so the delegate sees the stream from its very beginning.
     */
    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        PushbackInputStream pb;
        if (stream instanceof PushbackInputStream) {
            // NOTE: a caller-supplied pushback stream must have room
            // for MAXPUSHBACK bytes, else unread() fails below.
            pb = (PushbackInputStream) stream;
        } else {
            pb = new PushbackInputStream(stream, MAXPUSHBACK);
        }

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.  Keep
        // reading until four bytes arrived or the stream ended, so a
        // short read does not silently disable the detection.
        //
        byte[] buf = new byte[4];
        int len = 0;
        while (len < buf.length) {
            int n = pb.read(buf, len, buf.length - len);
            if (n <= 0) {
                break;
            }
            len += n;
        }
        if (len > 0) {
            pb.unread(buf, 0, len);
        }

        if (len == 4) {
            switch (buf[0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == UTF-16 big-endian lacking a byte order mark
                if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                    setEncoding(pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<':      // 0x3c: the most common cases!
                switch (buf[1] & 0x0ff) {
                // 3c 00 3f 00 == UTF-16 little-endian lacking a byte order mark
                case 0x00:
                    if (buf[2] == 0x3f && buf[3] == 0x00) {
                        setEncoding(pb, "UnicodeLittle");
                        return;
                    }
                    // else probably UCS-4
                    break;

                // 3c 3f 78 6d == ASCII and supersets '<?xm'
                case '?':
                    if (buf[2] == 'x' && buf[3] == 'm') {
                        //
                        // One of several encodings could be used:
                        // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                        //
                        useEncodingDecl(pb, "UTF8");
                        return;
                    }
                    break;

                // First character is '<'; could be XML without
                // an XML directive such as "<hello>", "<!-- ...",
                // and so on.
                default:
                    break;
                }
                break;

            // 4c 6f a7 94 ... some EBCDIC code page
            case 0x4c:
                if (buf[1] == 0x6f
                        && (0x0ff & buf[2]) == 0x0a7
                        && (0x0ff & buf[3]) == 0x094) {
                    useEncodingDecl(pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

            // UTF-16 big-endian byte order mark
            case 0xfe:
                if ((buf[1] & 0x0ff) == 0xff) {
                    setEncoding(pb, "UTF-16");
                    return;
                }
                break;

            // UTF-16 little-endian byte order mark
            case 0xff:
                if ((buf[1] & 0x0ff) == 0xfe) {
                    setEncoding(pb, "UTF-16");
                    return;
                }
                break;

            // default ... no XML declaration
            default:
                break;
            }
        }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     *
     * Falls back to UTF-8 whenever no legal encoding pseudo-attribute
     * is found within the bytes that were peeked.
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding)
            throws IOException {
        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!  Only one bulk read is issued, so a
        // declaration that arrives in several small chunks may be
        // seen truncated (and then UTF-8 is assumed).
        //
        byte[] buffer = new byte[MAXPUSHBACK];
        int len = pb.read(buffer, 0, buffer.length);
        if (len <= 4) {
            // nothing follows the four detection bytes
            if (len > 0) {
                pb.unread(buffer, 0, len);
            }
            setEncoding(pb, "UTF-8");
            return;
        }
        pb.unread(buffer, 0, len);

        // The third argument is a byte COUNT: only len - 4 bytes follow
        // the skipped prefix; anything beyond is stale buffer content.
        Reader r = new InputStreamReader(
                new ByteArrayInputStream(buffer, 4, len - 4), encoding);

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        if (r.read() != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //  S version="..."     [or single quotes]
        // bit and get any subsequent
        //  S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuilder buf = new StringBuilder();
        StringBuilder keyBuf = null;    // non-null while a key is being collected
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;             // 0 until the opening quote was seen
        boolean sawQuestion = false;

        for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            int c = r.read();
            if (c == -1) {
                break;
            }

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                continue;
            }

            // ... but require at least a little!
            if (i == 0) {
                break;
            }

            // terminate the loop ASAP
            if (c == '?') {
                sawQuestion = true;
            } else if (sawQuestion) {
                if (c == '>') {
                    break;
                }
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace((char) c)) {
                        continue;
                    }
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null) {
                        key = keyBuf.toString();
                    }
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else {
                    keyBuf.append((char) c);
                }
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c)) {
                continue;
            }
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    // opening quote: start collecting the value
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    // closing quote: the value is complete
                    if ("encoding".equals(key)) {
                        String declared = buf.toString();
                        // map illegal names to UTF-8 default
                        setEncoding(pb, isValidEncName(declared) ? declared : "UTF-8");
                        return;
                    }
                    key = null;
                    continue;
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }

    /**
     * Tells whether {@code name} matches the XML production
     * {@code [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* }.
     * The empty string does not match.
     */
    private static boolean isValidEncName(String name) {
        if (name.length() == 0) {
            return false;
        }
        for (int i = 0; i < name.length(); i++) {
            char ch = name.charAt(i);
            if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) {
                continue;
            }
            if (i > 0 && (ch == '-'
                    || (ch >= '0' && ch <= '9')
                    || ch == '.' || ch == '_')) {
                continue;
            }
            return false;
        }
        return true;
    }

    /** Records the encoding name and creates the delegate reader for it. */
    private void setEncoding(InputStream stream, String encoding)
            throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }

    /**
     * Reads characters into a portion of an array and returns the number
     * of characters read, or -1 on EOF.  The reader closes itself when
     * EOF is reached; reading from a closed reader returns -1 here
     * (unlike {@link #read()}, which throws).
     */
    @Override
    public int read(char[] buf, int off, int len) throws IOException {
        if (closed) {
            return -1;        // throw new IOException ("closed");
        }
        int val = in.read(buf, off, len);
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Reads a single character, or returns -1 on EOF (closing the reader).
     *
     * @throws IOException if the reader is already closed, or on I/O error
     */
    @Override
    public int read() throws IOException {
        if (closed) {
            throw new IOException("closed");
        }
        int val = in.read();
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    @Override
    public boolean markSupported() {
        return in == null ? false : in.markSupported();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.
     *
     * @param value how many characters may be "peeked".
     */
    @Override
    public void mark(int value) throws IOException {
        if (in != null) {
            in.mark(value);
        }
    }

    /**
     * Resets the current position to the last marked position.
     */
    @Override
    public void reset() throws IOException {
        if (in != null) {
            in.reset();
        }
    }

    /**
     * Skips a specified number of characters.
     */
    @Override
    public long skip(long value) throws IOException {
        return in == null ? 0 : in.skip(value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    @Override
    public boolean ready() throws IOException {
        return in == null ? false : in.ready();
    }

    /**
     * Closes the reader.  Closing an already closed reader is a no-op.
     */
    @Override
    public void close() throws IOException {
        if (closed) {
            return;
        }
        in.close();
        in = null;
        closed = true;
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    static abstract class BaseReader extends Reader {
        protected InputStream instream;     // null once closed
        protected byte[] buffer;            // null once closed
        protected int start, finish;        // unread bytes are buffer[start..finish)

        BaseReader(InputStream stream) {
            super(stream);

            instream = stream;
            buffer = new byte[8192];
        }

        @Override
        public boolean ready() throws IOException {
            return instream == null
                    || (finish - start) > 0
                    || instream.available() != 0;
        }

        // caller shouldn't read again
        @Override
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // 2nd half of a surrogate pair whose 1st half was already
        // delivered; 0 when nothing is pending
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super(stream);
        }

        @Override
        public int read(char[] buf, int offset, int len) throws IOException {
            if (len <= 0) {
                return 0;
            }

            int i = 0;
            boolean eof = false;

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        eof = true;
                        break;
                    }
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        eof = true;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of bytes
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                int lead = buffer[start] & 0x0ff;
                if ((lead & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) lead;
                    continue;
                }

                //
                // Multibyte chars -- the lead byte tells how many
                // bytes the sequence occupies.
                //
                int need;
                if ((lead & 0x0E0) == 0x0C0) {
                    need = 2;
                } else if ((lead & 0x0F0) == 0x0E0) {
                    need = 3;
                } else if ((lead & 0x0F8) == 0x0F0) {
                    need = 4;
                } else {
                    // 5 and 6 byte versions are XML WF errors, but
                    // typically come from mislabeled encodings
                    throw new CharConversionException("Unconvertible UTF-8 character"
                            + " beginning with 0x"
                            + Integer.toHexString(lead));
                }

                //
                // if the buffer held only a partial character,
                // compact it and try to read the rest of the
                // character.  worst case involves three
                // single-byte reads -- quite rare.  Doing this BEFORE
                // decoding keeps stale buffer bytes out of the result.
                //
                if (finish - start < need) {
                    System.arraycopy(buffer, start,
                            buffer, 0, finish - start);
                    finish -= start;
                    start = 0;
                    int got = instream.read(buffer, finish,
                            buffer.length - finish);
                    if (got < 0) {
                        this.close();
                        throw new CharConversionException("Partial UTF-8 char");
                    }
                    finish += got;
                    continue;
                }

                //
                // check the "10xx xxxx" format of the non-initial bytes
                //
                for (int k = 1; k < need; k++) {
                    if ((buffer[start + k] & 0xC0) != 0x80) {
                        this.close();
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }
                }

                int c;
                if (need == 2) {
                    // 0x0080 <= c <= 0x07ff
                    c = ((lead & 0x1f) << 6)
                            + (buffer[start + 1] & 0x3f);
                } else if (need == 3) {
                    // 0x0800 <= c <= 0xffff
                    c = ((lead & 0x0f) << 12)
                            + ((buffer[start + 1] & 0x3f) << 6)
                            + (buffer[start + 2] & 0x3f);
                } else {
                    // 0x0001 0000 <= c <= 0x001f ffff
                    c = ((lead & 0x07) << 18)
                            + ((buffer[start + 1] & 0x3f) << 12)
                            + ((buffer[start + 2] & 0x3f) << 6)
                            + (buffer[start + 3] & 0x3f);

                    // Unicode supports c <= 0x0010 ffff ...
                    if (c > 0x0010ffff) {
                        throw new CharConversionException("UTF-8 encoding of character 0x00"
                                + Integer.toHexString(c)
                                + " can't be converted to Unicode.");
                    }
                    // ... and an overlong form cannot become a surrogate pair
                    if (c < 0x10000) {
                        this.close();
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }

                    // Convert UCS-4 char to surrogate pair (UTF-16)
                    c -= 0x10000;
                    nextChar = (char) (0xDC00 + (c & 0x03ff));
                    c = 0xD800 + (c >> 10);
                }
                start += need;

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf[offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0) {
                return i;
            }
            return eof ? -1 : 0;
        }
    }

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader {
        AsciiReader(InputStream in) {
            super(in);
        }

        @Override
        public int read(char[] buf, int offset, int len) throws IOException {
            if (instream == null) {
                return -1;
            }
            if (len <= 0) {
                return 0;
            }

            int i;
            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                int c = buffer[start++];
                if ((c & 0x80) != 0) {
                    throw new CharConversionException("Illegal ASCII character, 0x"
                            + Integer.toHexString(c & 0xff));
                }
                buf[offset + i] = (char) c;
            }
            // nothing delivered means the loop stopped at end of stream
            if (i == 0 && finish <= 0) {
                return -1;
            }
            return i;
        }
    }

    static final class Iso8859_1Reader extends BaseReader {
        Iso8859_1Reader(InputStream in) {
            super(in);
        }

        @Override
        public int read(char[] buf, int offset, int len) throws IOException {
            if (instream == null) {
                return -1;
            }
            if (len <= 0) {
                return 0;
            }

            int i;
            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                // every ISO-8859-1 byte is the Unicode code point itself
                buf[offset + i] = (char) (0x0ff & buffer[start++]);
            }
            // nothing delivered means the loop stopped at end of stream
            if (i == 0 && finish <= 0) {
                return -1;
            }
            return i;
        }
    }
}

mercurial