|
1 /* |
|
2 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved. |
|
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
4 * |
|
5 * This code is free software; you can redistribute it and/or modify it |
|
6 * under the terms of the GNU General Public License version 2 only, as |
|
7 * published by the Free Software Foundation. Oracle designates this |
|
8 * particular file as subject to the "Classpath" exception as provided |
|
9 * by Oracle in the LICENSE file that accompanied this code. |
|
10 * |
|
11 * This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 * version 2 for more details (a copy is included in the LICENSE file that |
|
15 * accompanied this code). |
|
16 * |
|
17 * You should have received a copy of the GNU General Public License version |
|
18 * 2 along with this work; if not, write to the Free Software Foundation, |
|
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 * |
|
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
22 * or visit www.oracle.com if you need additional information or have any |
|
23 * questions. |
|
24 */ |
|
25 |
|
26 package com.sun.xml.internal.dtdparser; |
|
27 |
|
28 import java.io.ByteArrayInputStream; |
|
29 import java.io.CharConversionException; |
|
30 import java.io.IOException; |
|
31 import java.io.InputStream; |
|
32 import java.io.InputStreamReader; |
|
33 import java.io.PushbackInputStream; |
|
34 import java.io.Reader; |
|
35 import java.util.Hashtable; |
|
36 |
|
37 |
|
38 // NOTE: Add I18N support to this class when JDK gets the ability to |
|
39 // defer selection of locale for exception messages ... use the same |
|
40 // technique for both. |
|
41 |
|
42 |
|
/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <p> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * <p> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
// package private
final class XmlReader extends Reader {
    /**
     * Maximum number of leading bytes that are examined (and pushed back)
     * while looking for an XML/text declaration.
     */
    private static final int MAXPUSHBACK = 512;

    /** Reader all I/O is delegated to; null once this reader is closed. */
    private Reader in;
    /** Encoding name chosen by autodetection or the encoding declaration. */
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It has got to do this efficiently: character I/O is solidly on
    // the critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering.  Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @return a reader decoding the stream with the detected encoding
     * @throws IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        return new XmlReader(in);
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in       the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *                 if null, autodetection is used.
     * @return a reader decoding the stream with the requested encoding
     * @throws IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
            throws IOException {
        if (encoding == null)
            return new XmlReader(in);

        // Fast paths: hand-written readers for the most common encodings.
        if ("UTF-8".equalsIgnoreCase(encoding)
                || "UTF8".equalsIgnoreCase(encoding))
            return new Utf8Reader(in);
        if ("US-ASCII".equalsIgnoreCase(encoding)
                || "ASCII".equalsIgnoreCase(encoding))
            return new AsciiReader(in);
        if ("ISO-8859-1".equalsIgnoreCase(encoding)
            // plus numerous aliases ...
                )
            return new Iso8859_1Reader(in);

        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example a property
        // file resource, "readers/mapping.props", holding and a set
        // of readers in that (sub)package... defaulting to this call
        // only if no better choice is available.
        //
        return new InputStreamReader(in, std2java(encoding));
    }

    //
    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).  Keys are upper-case IANA names,
    // values are the names handed to InputStreamReader.
    //
    private static final Hashtable charsets = new Hashtable(31);

    static {
        charsets.put("UTF-16", "Unicode");
        charsets.put("ISO-10646-UCS-2", "Unicode");

        // NOTE: no support for ISO-10646-UCS-4 yet.

        charsets.put("EBCDIC-CP-US", "cp037");
        charsets.put("EBCDIC-CP-CA", "cp037");
        charsets.put("EBCDIC-CP-NL", "cp037");
        charsets.put("EBCDIC-CP-WT", "cp037");

        charsets.put("EBCDIC-CP-DK", "cp277");
        charsets.put("EBCDIC-CP-NO", "cp277");
        charsets.put("EBCDIC-CP-FI", "cp278");
        charsets.put("EBCDIC-CP-SE", "cp278");

        charsets.put("EBCDIC-CP-IT", "cp280");
        charsets.put("EBCDIC-CP-ES", "cp284");
        charsets.put("EBCDIC-CP-GB", "cp285");
        charsets.put("EBCDIC-CP-FR", "cp297");

        charsets.put("EBCDIC-CP-AR1", "cp420");
        charsets.put("EBCDIC-CP-HE", "cp424");
        charsets.put("EBCDIC-CP-BE", "cp500");
        charsets.put("EBCDIC-CP-CH", "cp500");

        charsets.put("EBCDIC-CP-ROECE", "cp870");
        charsets.put("EBCDIC-CP-YU", "cp870");
        charsets.put("EBCDIC-CP-IS", "cp871");
        charsets.put("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR        --> CP423
        //  EBCDIC-CP-TR        --> CP905
    }

    // returns an encoding name supported by JDK >= 1.1.6
    // for some cases required by the XML spec; names without a
    // table entry are returned unchanged
    private static String std2java(String encoding) {
        // Encoding names are ASCII: upper-case them with a fixed locale so
        // the lookup does not depend on the default locale (e.g. Turkish
        // maps 'i' to a dotted capital I, which would miss every entry).
        String key = encoding.toUpperCase(java.util.Locale.ENGLISH);
        String mapped = (String) charsets.get(key);
        return mapped != null ? mapped : encoding;
    }

    /**
     * Returns the standard name of the encoding in use.
     *
     * @return the encoding name selected when this reader was constructed
     */
    public String getEncoding() {
        return assignedEncoding;
    }

    /**
     * Peeks at the first four bytes of the stream to pick an encoding
     * (or an encoding family, in which case the encoding declaration is
     * parsed), then sets up the delegate reader.  Defaults to UTF-8.
     */
    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        //
        // Always use our own pushback wrapper.  A caller-supplied
        // PushbackInputStream may have a pushback buffer that is too
        // small (the default is a single byte), in which case unread()
        // would fail with "Push back buffer is full".
        //
        PushbackInputStream pb = new PushbackInputStream(stream, MAXPUSHBACK);

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.
        //
        byte buf [] = new byte[4];
        int len = pb.read(buf);
        if (len > 0)
            pb.unread(buf, 0, len);

        if (len == 4) {
            switch (buf[0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == UTF-16 big-endian without a byte order mark
                if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                    setEncoding(pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<':      // 0x3c: the most common cases!
                switch (buf[1] & 0x0ff) {
                // 3c 00 3f 00 == UTF-16 little-endian without a byte order mark
                case 0x00:
                    if (buf[2] == 0x3f && buf[3] == 0x00) {
                        setEncoding(pb, "UnicodeLittle");
                        return;
                    }
                    // else probably UCS-4
                    break;

                // 3c 3f 78 6d == ASCII and supersets '<?xm'
                case '?':
                    if (buf[2] != 'x' || buf[3] != 'm')
                        break;
                    //
                    // One of several encodings could be used:
                    // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                    //
                    useEncodingDecl(pb, "UTF8");
                    return;

                // First character is '<'; could be XML without
                // an XML directive such as "<hello>", "<!-- ...",
                // and so on.
                default:
                    break;
                }
                break;

            // 4c 6f a7 94 ... some EBCDIC code page
            case 0x4c:
                if (buf[1] == 0x6f
                        && (0x0ff & buf[2]) == 0x0a7
                        && (0x0ff & buf[3]) == 0x094) {
                    useEncodingDecl(pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

            // UTF-16 big-endian (byte order mark fe ff)
            case 0xfe:
                if ((buf[1] & 0x0ff) != 0xff)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // UTF-16 little-endian (byte order mark ff fe)
            case 0xff:
                if ((buf[1] & 0x0ff) != 0xfe)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // default ... no XML declaration
            default:
                break;
            }
        }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     *
     * All bytes examined here are pushed back, so the reader that is
     * finally installed sees the stream from its very beginning.
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding)
            throws IOException {
        byte buffer [] = new byte[MAXPUSHBACK];
        int len;
        Reader r;
        int c;

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        len = pb.read(buffer, 0, buffer.length);
        if (len > 0)
            pb.unread(buffer, 0, len);
        if (len <= 4) {
            // nothing follows "<?xm": no declaration to parse
            setEncoding(pb, "UTF-8");
            return;
        }
        // The third argument is a length, not an end offset: only the
        // bytes actually read (minus the four skipped) may be decoded.
        r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len - 4),
                encoding);

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        if ((c = r.read()) != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //  S version="..."     [or single quotes]
        // bit and get any subsequent
        //  S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuffer buf = new StringBuffer();
        StringBuffer keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        XmlDecl:
        for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read()) == -1)
                break;

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                continue;

            // ... but require at least a little!
            if (i == 0)
                break;

            // terminate the loop ASAP (on "?>")
            if (c == '?')
                sawQuestion = true;
            else if (sawQuestion) {
                if (c == '>')
                    break;
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace((char) c))
                        continue;
                    // first character of a new key
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null)
                        key = keyBuf.toString();
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else
                    keyBuf.append((char) c);
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c))
                continue;
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    // opening quote: start collecting the value
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    // closing quote: value is complete
                    if ("encoding".equals(key)) {
                        assignedEncoding = buf.toString();

                        // map illegal names to UTF-8 default
                        if (!isLegalEncName(assignedEncoding))
                            break XmlDecl;

                        setEncoding(pb, assignedEncoding);
                        return;

                    } else {
                        key = null;
                        continue;
                    }
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }

    // Checks a name against XML production
    // [81] EncName ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
    // (an empty name is not legal).
    private static boolean isLegalEncName(String name) {
        if (name.length() == 0)
            return false;
        for (int k = 0; k < name.length(); k++) {
            char ch = name.charAt(k);
            if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
                continue;
            if (k > 0 && (ch == '-'
                    || (ch >= '0' && ch <= '9')
                    || ch == '.' || ch == '_'))
                continue;
            return false;
        }
        return true;
    }

    // Records the encoding name and installs the matching delegate reader.
    private void setEncoding(InputStream stream, String encoding)
            throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }

    /**
     * Reads characters into a portion of the buffer.  The reader is
     * closed automatically once the end of input is reached.
     *
     * @return the number of characters read, or -1 on EOF or if this
     *         reader has already been closed
     */
    public int read(char buf [], int off, int len) throws IOException {
        int val;

        if (closed)
            return -1;  // throw new IOException ("closed");
        val = in.read(buf, off, len);
        if (val == -1)
            close();
        return val;
    }

    /**
     * Reads a single character.  The reader is closed automatically
     * once the end of input is reached.
     *
     * @return the character read, or -1 on EOF
     * @throws IOException if this reader has already been closed
     *         (unlike the array form, which reports -1)
     */
    public int read() throws IOException {
        int val;

        if (closed)
            throw new IOException("closed");
        val = in.read();
        if (val == -1)
            close();
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    public boolean markSupported() {
        return in == null ? false : in.markSupported();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.  Does nothing
     * once the reader is closed.
     *
     * @param value how many characters may be "peeked".
     */
    public void mark(int value) throws IOException {
        if (in != null) in.mark(value);
    }

    /**
     * Resets the current position to the last marked position.
     * Does nothing once the reader is closed.
     */
    public void reset() throws IOException {
        if (in != null) in.reset();
    }

    /**
     * Skips a specified number of characters.
     *
     * @return the number of characters skipped; 0 once closed
     */
    public long skip(long value) throws IOException {
        return in == null ? 0 : in.skip(value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    public boolean ready() throws IOException {
        return in == null ? false : in.ready();
    }

    /**
     * Closes the reader and its delegate; closing again has no effect.
     */
    public void close() throws IOException {
        if (closed)
            return;
        in.close();
        in = null;
        closed = true;
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    static abstract class BaseReader extends Reader {
        // underlying byte source; null once closed
        protected InputStream instream;
        // raw bytes read from instream; null once closed
        protected byte buffer [];
        // buffer[start .. finish-1] holds the bytes not yet decoded
        protected int start, finish;

        BaseReader(InputStream stream) {
            super(stream);

            instream = stream;
            buffer = new byte[8192];
        }

        // true when closed (read reports EOF at once), when bytes are
        // buffered, or when the stream says some are available
        public boolean ready() throws IOException {
            return instream == null
                    || (finish - start) > 0
                    || instream.available() != 0;
        }

        // caller shouldn't read again
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // 2nd half of a surrogate pair that did not fit into the
        // caller's buffer; 0 when nothing is pending
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super(stream);
        }

        /**
         * Decodes UTF-8 bytes into UTF-16 chars.
         *
         * @return the number of chars stored, 0 if len is not positive,
         *         or -1 on EOF
         * @throws CharConversionException on malformed, truncated or
         *         unconvertible UTF-8 input
         */
        public int read(char buf [], int offset, int len) throws IOException {
            int i = 0, c = 0;

            if (len <= 0)
                return 0;

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        c = -1;
                        break;
                    }
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        c = -1;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                c = buffer[start] & 0x0ff;
                if ((c & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) c;
                    continue;
                }

                //
                // Multibyte chars -- the lead byte says how many
                // bytes the character occupies.
                //
                int need;
                if ((c & 0x0E0) == 0x0C0)
                    need = 2;
                else if ((c & 0x0F0) == 0x0E0)
                    need = 3;
                else if ((c & 0x0F8) == 0x0F0)
                    need = 4;
                else
                    // 5 and 6 byte versions are XML WF errors, but
                    // typically come from mislabeled encodings
                    throw new CharConversionException("Unconvertible UTF-8 character"
                            + " beginning with 0x"
                            + Integer.toHexString(c));

                //
                // if the buffer held only a partial character,
                // compact it and try to read the rest of the
                // character before decoding anything, so stale bytes
                // past "finish" are never interpreted.  worst case
                // involves three single-byte reads -- quite rare.
                //
                if (start + need > finish) {
                    System.arraycopy(buffer, start,
                            buffer, 0, finish - start);
                    finish -= start;
                    start = 0;
                    int more = instream.read(buffer, finish,
                            buffer.length - finish);
                    if (more < 0) {
                        this.close();
                        throw new CharConversionException("Partial UTF-8 char");
                    }
                    finish += more;
                    continue;
                }

                int off = start;

                if (need == 2) {
                    c = (buffer[off++] & 0x1f) << 6;
                    c += buffer[off++] & 0x3f;

                    // 0x0080 <= c <= 0x07ff

                } else if (need == 3) {
                    c = (buffer[off++] & 0x0f) << 12;
                    c += (buffer[off++] & 0x3f) << 6;
                    c += buffer[off++] & 0x3f;

                    // 0x0800 <= c <= 0xffff

                } else {
                    c = (buffer[off++] & 0x07) << 18;
                    c += (buffer[off++] & 0x3f) << 12;
                    c += (buffer[off++] & 0x3f) << 6;
                    c += buffer[off++] & 0x3f;

                    // 0x0001 0000  <= c  <= 0x001f ffff

                    // Unicode supports c <= 0x0010 ffff ...
                    if (c > 0x0010ffff)
                        throw new CharConversionException("UTF-8 encoding of character 0x00"
                                + Integer.toHexString(c)
                                + " can't be converted to Unicode.");
                }

                //
                // check the format of the non-initial bytes
                //
                for (start++; start < off; start++) {
                    if ((buffer[start] & 0xC0) != 0x80) {
                        this.close();
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }
                }

                if (need == 4) {
                    // An overlong four byte form would make the surrogate
                    // arithmetic below go negative and emit garbage.
                    if (c < 0x10000)
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "overlong encoding of character 0x"
                                + Integer.toHexString(c));

                    // Convert UCS-4 char to surrogate pair (UTF-16)
                    c -= 0x10000;
                    nextChar = (char) (0xDC00 + (c & 0x03ff));
                    c = 0xD800 + (c >> 10);
                }

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf[offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0)
                return i;
            return (c == -1) ? -1 : 0;
        }
    }

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader {
        AsciiReader(InputStream in) {
            super(in);
        }

        /**
         * Decodes 7-bit ASCII bytes into chars.
         *
         * @return the number of chars stored, 0 if len is not positive,
         *         or -1 on EOF or after close
         * @throws CharConversionException if a byte has its high bit set
         */
        public int read(char buf [], int offset, int len) throws IOException {
            int i, c;

            if (instream == null)
                return -1;
            // an empty request is not end-of-file
            if (len <= 0)
                return 0;

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                c = buffer[start++];
                if ((c & 0x80) != 0)
                    throw new CharConversionException("Illegal ASCII character, 0x"
                            + Integer.toHexString(c & 0xff));
                buf[offset + i] = (char) c;
            }
            if (i == 0 && finish <= 0)
                return -1;
            return i;
        }
    }

    static final class Iso8859_1Reader extends BaseReader {
        Iso8859_1Reader(InputStream in) {
            super(in);
        }

        /**
         * Decodes ISO-8859-1 bytes into chars; each byte maps directly
         * to the char with the same (unsigned) value.
         *
         * @return the number of chars stored, 0 if len is not positive,
         *         or -1 on EOF or after close
         */
        public int read(char buf [], int offset, int len) throws IOException {
            int i;

            if (instream == null)
                return -1;
            // an empty request is not end-of-file
            if (len <= 0)
                return 0;

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                buf[offset + i] = (char) (0x0ff & buffer[start++]);
            }
            if (i == 0 && finish <= 0)
                return -1;
            return i;
        }
    }
}