src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlReader.java

Thu, 12 Oct 2017 19:44:07 +0800

author
aoqi
date
Thu, 12 Oct 2017 19:44:07 +0800
changeset 760
e530533619ec
parent 637
9c07ef4934dd
permissions
-rw-r--r--

merge

aoqi@0 1 /*
aoqi@0 2 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
aoqi@0 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
aoqi@0 4 *
aoqi@0 5 * This code is free software; you can redistribute it and/or modify it
aoqi@0 6 * under the terms of the GNU General Public License version 2 only, as
aoqi@0 7 * published by the Free Software Foundation. Oracle designates this
aoqi@0 8 * particular file as subject to the "Classpath" exception as provided
aoqi@0 9 * by Oracle in the LICENSE file that accompanied this code.
aoqi@0 10 *
aoqi@0 11 * This code is distributed in the hope that it will be useful, but WITHOUT
aoqi@0 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
aoqi@0 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
aoqi@0 14 * version 2 for more details (a copy is included in the LICENSE file that
aoqi@0 15 * accompanied this code).
aoqi@0 16 *
aoqi@0 17 * You should have received a copy of the GNU General Public License version
aoqi@0 18 * 2 along with this work; if not, write to the Free Software Foundation,
aoqi@0 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
aoqi@0 20 *
aoqi@0 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
aoqi@0 22 * or visit www.oracle.com if you need additional information or have any
aoqi@0 23 * questions.
aoqi@0 24 */
aoqi@0 25
aoqi@0 26 package com.sun.xml.internal.dtdparser;
aoqi@0 27
aoqi@0 28 import java.io.ByteArrayInputStream;
aoqi@0 29 import java.io.CharConversionException;
aoqi@0 30 import java.io.IOException;
aoqi@0 31 import java.io.InputStream;
aoqi@0 32 import java.io.InputStreamReader;
aoqi@0 33 import java.io.PushbackInputStream;
aoqi@0 34 import java.io.Reader;
aoqi@0 35 import java.util.Hashtable;
aoqi@0 36
aoqi@0 37
aoqi@0 38 // NOTE: Add I18N support to this class when JDK gets the ability to
aoqi@0 39 // defer selection of locale for exception messages ... use the same
aoqi@0 40 // technique for both.
aoqi@0 41
aoqi@0 42
/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <p> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * <p> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
// package private
final class XmlReader extends Reader {
    /** Number of bytes that may be peeked at (and pushed back) while sniffing. */
    private static final int MAXPUSHBACK = 512;

    /** The reader all I/O is delegated to; null once this reader is closed. */
    private Reader in;
    /** Name of the encoding that was detected or declared. */
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got to do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering.  Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @throws IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        return new XmlReader(in);
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in       the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *                 if null, autodetection is used.
     * @throws IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
            throws IOException {
        if (encoding == null)
            return new XmlReader(in);
        if ("UTF-8".equalsIgnoreCase(encoding)
                || "UTF8".equalsIgnoreCase(encoding))
            return new Utf8Reader(in);
        if ("US-ASCII".equalsIgnoreCase(encoding)
                || "ASCII".equalsIgnoreCase(encoding))
            return new AsciiReader(in);
        if ("ISO-8859-1".equalsIgnoreCase(encoding)
            // plus numerous aliases ...
                )
            return new Iso8859_1Reader(in);

        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example a property
        // file resource, "readers/mapping.props", holding and a set
        // of readers in that (sub)package... defaulting to this call
        // only if no better choice is available.
        //
        return new InputStreamReader(in, std2java(encoding));
    }

    //
    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).  Keys are upper-case standard names,
    // values are the names handed to InputStreamReader.
    //
    private static final Hashtable charsets = new Hashtable(31);

    static {
        charsets.put("UTF-16", "Unicode");
        charsets.put("ISO-10646-UCS-2", "Unicode");

        // NOTE:  no support for ISO-10646-UCS-4 yet.

        charsets.put("EBCDIC-CP-US", "cp037");
        charsets.put("EBCDIC-CP-CA", "cp037");
        charsets.put("EBCDIC-CP-NL", "cp037");
        charsets.put("EBCDIC-CP-WT", "cp037");

        charsets.put("EBCDIC-CP-DK", "cp277");
        charsets.put("EBCDIC-CP-NO", "cp277");
        charsets.put("EBCDIC-CP-FI", "cp278");
        charsets.put("EBCDIC-CP-SE", "cp278");

        charsets.put("EBCDIC-CP-IT", "cp280");
        charsets.put("EBCDIC-CP-ES", "cp284");
        charsets.put("EBCDIC-CP-GB", "cp285");
        charsets.put("EBCDIC-CP-FR", "cp297");

        charsets.put("EBCDIC-CP-AR1", "cp420");
        charsets.put("EBCDIC-CP-HE", "cp424");
        charsets.put("EBCDIC-CP-BE", "cp500");
        charsets.put("EBCDIC-CP-CH", "cp500");

        charsets.put("EBCDIC-CP-ROECE", "cp870");
        charsets.put("EBCDIC-CP-YU", "cp870");
        charsets.put("EBCDIC-CP-IS", "cp871");
        charsets.put("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR        --> CP423
        //  EBCDIC-CP-TR        --> CP905
    }

    /**
     * Maps a standard encoding name onto the name registered in
     * {@link #charsets}, or returns the argument unchanged when the
     * table has no entry for it.
     */
    private static String std2java(String encoding) {
        // Upper-case with a fixed locale: with the default locale a Turkish
        // environment turns 'i' into a dotted capital I, and names such as
        // "iso-10646-ucs-2" would no longer match the table keys.
        String temp = encoding.toUpperCase(java.util.Locale.ENGLISH);
        temp = (String) charsets.get(temp);
        return temp != null ? temp : encoding;
    }

    /**
     * Returns the standard name of the encoding in use
     */
    public String getEncoding() {
        return assignedEncoding;
    }

    /**
     * Peeks at the first four bytes of the stream to choose an encoding
     * (falling back to UTF-8), pushes them back, and sets up the
     * delegate reader so that it sees the text from its very beginning.
     */
    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        PushbackInputStream pb;
        byte buf [];
        int len;

        if (stream instanceof PushbackInputStream)
            pb = (PushbackInputStream) stream;
        else
            pb = new PushbackInputStream(stream, MAXPUSHBACK);

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.
        //
        buf = new byte[4];
        len = pb.read(buf);
        if (len > 0)
            pb.unread(buf, 0, len);

        if (len == 4)
            switch (buf[0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == illegal UTF-16 big-endian
                if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                    setEncoding(pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<':      // 0x3c: the most common cases!
                switch (buf[1] & 0x0ff) {
                // First character is '<'; could be XML without
                // an XML directive such as "<hello>", "<!-- ...",
                // and so on.
                default:
                    break;

                // 3c 00 3f 00 == illegal UTF-16 little endian
                case 0x00:
                    if (buf[2] == 0x3f && buf[3] == 0x00) {
                        setEncoding(pb, "UnicodeLittle");
                        return;
                    }
                    // else probably UCS-4
                    break;

                // 3c 3f 78 6d == ASCII and supersets '<?xm'
                case '?':
                    if (buf[2] != 'x' || buf[3] != 'm')
                        break;
                    //
                    // One of several encodings could be used:
                    // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                    //
                    useEncodingDecl(pb, "UTF8");
                    return;
                }
                break;

            // 4c 6f a7 94 ... some EBCDIC code page
            case 0x4c:
                if (buf[1] == 0x6f
                        && (0x0ff & buf[2]) == 0x0a7
                        && (0x0ff & buf[3]) == 0x094) {
                    useEncodingDecl(pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

            // UTF-16 big-endian
            case 0xfe:
                if ((buf[1] & 0x0ff) != 0xff)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // UTF-16 little-endian
            case 0xff:
                if ((buf[1] & 0x0ff) != 0xfe)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // default ... no XML declaration
            default:
                break;
            }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     *
     * Whenever the declaration is missing, unreadable or names an
     * illegal encoding, UTF-8 is used.
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding)
            throws IOException {
        byte buffer [] = new byte[MAXPUSHBACK];
        int len;
        Reader r;
        int c;

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        len = pb.read(buffer, 0, buffer.length);
        if (len < 4) {
            // Defensive: the caller has just pushed four bytes back, so this
            // is not expected; never unread or skip more than was read.
            if (len > 0)
                pb.unread(buffer, 0, len);
            setEncoding(pb, "UTF-8");
            return;
        }
        pb.unread(buffer, 0, len);
        // The third argument is a length, not an end index: only len - 4
        // valid bytes follow the four that are skipped.
        r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len - 4),
                encoding);

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        if ((c = r.read()) != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //  S version="..."     [or single quotes]
        // bit and get any subsequent
        //  S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuffer buf = new StringBuffer();
        StringBuffer keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read()) == -1)
                break;

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                continue;

            // ... but require at least a little!
            if (i == 0)
                break;

            // terminate the loop ASAP
            if (c == '?')
                sawQuestion = true;
            else if (sawQuestion) {
                if (c == '>')
                    break;
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace((char) c))
                        continue;
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null)
                        key = keyBuf.toString();
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else
                    keyBuf.append((char) c);
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c))
                continue;
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    // opening quote: start collecting the value
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    // closing quote: the value is complete
                    if ("encoding".equals(key)) {
                        String declared = buf.toString();

                        // map illegal names to UTF-8 default
                        if (!isLegalEncName(declared))
                            break;

                        setEncoding(pb, declared);
                        return;

                    } else {
                        // some other pseudo-attribute; look for the next key
                        key = null;
                        continue;
                    }
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }

    /**
     * Checks a declared encoding name against XML production
     * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
     *
     * @return true iff the name is non-empty and matches the production
     */
    private static boolean isLegalEncName(String name) {
        if (name.length() == 0)
            return false;
        for (int i = 0; i < name.length(); i++) {
            char ch = name.charAt(i);
            if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
                continue;
            // anything but a letter is only allowed after the first char
            if (i > 0 && (ch == '-'
                    || (ch >= '0' && ch <= '9')
                    || ch == '.' || ch == '_'))
                continue;
            return false;
        }
        return true;
    }

    /** Records the encoding name and creates the delegate reader for it. */
    private void setEncoding(InputStream stream, String encoding)
            throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }

    /**
     * Reads characters into part of the buffer; this reader closes
     * itself when the delegate reports EOF.
     *
     * @return the number of characters read, or -1 on EOF or if this
     *         reader has already been closed
     */
    public int read(char buf [], int off, int len) throws IOException {
        int val;

        if (closed)
            return -1;  // throw new IOException ("closed");
        val = in.read(buf, off, len);
        if (val == -1)
            close();
        return val;
    }

    /**
     * Reads a single character; this reader closes itself on EOF.
     *
     * @throws IOException if the reader has already been closed
     */
    public int read() throws IOException {
        int val;

        if (closed)
            throw new IOException("closed");
        val = in.read();
        if (val == -1)
            close();
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    public boolean markSupported() {
        return in == null ? false : in.markSupported();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.
     *
     * @param value how many characters may be "peeked".
     */
    public void mark(int value) throws IOException {
        if (in != null) in.mark(value);
    }

    /**
     * Resets the current position to the last marked position.
     */
    public void reset() throws IOException {
        if (in != null) in.reset();
    }

    /**
     * Skips a specified number of characters.
     */
    public long skip(long value) throws IOException {
        return in == null ? 0 : in.skip(value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    public boolean ready() throws IOException {
        return in == null ? false : in.ready();
    }

    /**
     * Closes the reader.  Closing an already closed reader is a no-op.
     */
    public void close() throws IOException {
        if (closed)
            return;
        in.close();
        in = null;
        closed = true;
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    static abstract class BaseReader extends Reader {
        protected InputStream instream;     // null once closed
        protected byte buffer [];           // raw bytes; null once closed
        protected int start, finish;        // unread bytes are [start, finish)

        BaseReader(InputStream stream) {
            super(stream);

            instream = stream;
            buffer = new byte[8192];
        }

        // Also reports true once the reader is closed.
        public boolean ready() throws IOException {
            return instream == null
                    || (finish - start) > 0
                    || instream.available() != 0;
        }

        // caller shouldn't read again
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // 2nd half of UTF-8 surrogate pair
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super(stream);
        }

        /**
         * Decodes UTF-8 bytes into UTF-16 chars.
         *
         * @return the number of chars stored, 0 if len is not positive,
         *         or -1 on EOF
         * @throws CharConversionException on malformed, truncated or
         *         unconvertible input
         */
        public int read(char buf [], int offset, int len) throws IOException {
            int i = 0, c = 0;

            if (len <= 0)
                return 0;

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        c = -1;
                        break;
                    }
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        c = -1;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                c = buffer[start] & 0x0ff;
                if ((c & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) c;
                    continue;
                }

                //
                // Multibyte chars -- the lead byte tells how many bytes
                // the sequence needs.
                //
                int need;

                if ((c & 0x0E0) == 0x0C0)
                    need = 2;
                else if ((c & 0x0F0) == 0x0E0)
                    need = 3;
                else if ((c & 0x0F8) == 0x0F0)
                    need = 4;
                else
                    // 5 and 6 byte versions are XML WF errors, but
                    // typically come from mislabeled encodings
                    throw new CharConversionException("Unconvertible UTF-8 character"
                            + " beginning with 0x"
                            + Integer.toHexString(c));

                //
                // if the buffer held only a partial character,
                // compact it and try to read the rest of the
                // character.  worst case involves three
                // single-byte reads -- quite rare.  This must happen
                // before decoding, so that stale bytes left behind the
                // valid region by an earlier fill are never interpreted.
                //
                if (start + need > finish) {
                    System.arraycopy(buffer, start,
                            buffer, 0, finish - start);
                    finish -= start;
                    start = 0;
                    int more = instream.read(buffer, finish,
                            buffer.length - finish);
                    if (more < 0) {
                        this.close();
                        throw new CharConversionException("Partial UTF-8 char");
                    }
                    finish += more;
                    continue;
                }

                //
                // check the format of the non-initial bytes ("10xx xxxx")
                //
                for (int k = 1; k < need; k++) {
                    if ((buffer[start + k] & 0xC0) != 0x80) {
                        this.close();
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }
                }

                if (need == 2) {
                    // 0x0080 <= c <= 0x07ff
                    c = ((c & 0x1f) << 6)
                            + (buffer[start + 1] & 0x3f);
                } else if (need == 3) {
                    // 0x0800 <= c <= 0xffff
                    c = ((c & 0x0f) << 12)
                            + ((buffer[start + 1] & 0x3f) << 6)
                            + (buffer[start + 2] & 0x3f);
                } else {
                    // 0x0001 0000  <= c  <= 0x001f ffff
                    c = ((c & 0x07) << 18)
                            + ((buffer[start + 1] & 0x3f) << 12)
                            + ((buffer[start + 2] & 0x3f) << 6)
                            + (buffer[start + 3] & 0x3f);

                    // Unicode supports c <= 0x0010 ffff ...
                    if (c > 0x0010ffff)
                        throw new CharConversionException("UTF-8 encoding of character 0x00"
                                + Integer.toHexString(c)
                                + " can't be converted to Unicode.");

                    // Convert UCS-4 char to surrogate pair (UTF-16)
                    c -= 0x10000;
                    nextChar = (char) (0xDC00 + (c & 0x03ff));
                    c = 0xD800 + (c >> 10);
                }
                start += need;

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf[offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0)
                return i;
            return (c == -1) ? -1 : 0;
        }
    }

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader {
        AsciiReader(InputStream in) {
            super(in);
        }

        /**
         * Copies 7-bit bytes straight into chars.
         *
         * @return the number of chars stored, 0 if len is not positive,
         *         or -1 on EOF
         * @throws CharConversionException if a byte has its high bit set
         */
        public int read(char buf [], int offset, int len) throws IOException {
            int i, c;

            // A zero-length request is not EOF (same rule as Utf8Reader).
            if (len <= 0)
                return 0;
            if (instream == null)
                return -1;

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                c = buffer[start++];
                if ((c & 0x80) != 0)
                    throw new CharConversionException("Illegal ASCII character, 0x"
                            + Integer.toHexString(c & 0xff));
                buf[offset + i] = (char) c;
            }
            if (i == 0 && finish <= 0)
                return -1;
            return i;
        }
    }

    static final class Iso8859_1Reader extends BaseReader {
        Iso8859_1Reader(InputStream in) {
            super(in);
        }

        /**
         * Copies bytes into chars, treating each byte as an unsigned
         * value.
         *
         * @return the number of chars stored, 0 if len is not positive,
         *         or -1 on EOF
         */
        public int read(char buf [], int offset, int len) throws IOException {
            int i;

            // A zero-length request is not EOF (same rule as Utf8Reader).
            if (len <= 0)
                return 0;
            if (instream == null)
                return -1;

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                buf[offset + i] = (char) (0x0ff & buffer[start++]);
            }
            if (i == 0 && finish <= 0)
                return -1;
            return i;
        }
    }
}

mercurial