/*
 * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

/**
 * Methods in this class are used to determine whether characters may
 * appear in certain roles in XML documents.  Such methods are used
 * both to parse and to create such documents.
 *
 * <p>All members are static; the class cannot be instantiated.
 *
 * @author David Brownell
 * @version 1.1, 00/08/05
 */
public final class XmlChars {
    // can't construct instances
    private XmlChars() {
    }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The <code>char</code> datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <p>In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as {@code &#x12345678;}, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code matches the XML <em>Char</em> production;
     *         false otherwise, including for surrogate code points,
     *         0xFFFE, 0xFFFF, negative values and values above 0x10FFFF.
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //    ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
                || ucs4char == 0x000A || ucs4char == 0x0009
                || ucs4char == 0x000D
                || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
                || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff));
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if the character may appear after the first character
     *         of an XML name.
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '_' | ':'
        //                | CombiningChar | Extender

        // isLetter2 covers letters, digits and combining characters.
        if (isLetter2(c)) {
            return true;
        }
        // Fast path: '>' commonly terminates a name, so reject it before
        // walking the punctuation and extender comparisons below.
        if (c == '>') {
            return false;
        }
        return c == '.' || c == '-' || c == '_' || c == ':'
                || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if the character is not a colon and satisfies
     *         {@link #isNameChar(char)}.
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested.
     * @return true for space, tab, line feed and carriage return only.
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested.
     * @return true if the character is classified as a letter by this
     *         class; note that '_' and ':' are not letters here.
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl

                // OK, here we just have some exceptions to check...
                return !isCompatibilityChar(c)
                        // per "5.14 of Unicode", rule out some combiners
                        && !(c >= 0x20dd && c <= 0x20e0);

            default:
                // check for some exceptions:  these are "alphabetic"
                return ((c >= 0x02bb && c <= 0x02c1)
                        || c == 0x0559 || c == 0x06e5 || c == 0x06e6);
        }
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    // Returns true if the character is one of the compatibility
    // characters enumerated below.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch can reduce the number which must
        // actually be executed.

        // Dispatch on the high byte of the 16-bit character.
        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                        || (c >= 0x013f && c <= 0x0140)
                        || c == 0x0149
                        || c == 0x017f
                        || (c >= 0x01c4 && c <= 0x01cc)
                        || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                        || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                 // Greek

            case 0x05:
                return c == 0x0587;                 // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;  // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                        || c == 0x1104
                        || c == 0x1108
                        || c == 0x110a
                        || c == 0x110d
                        || (c >= 0x1113 && c <= 0x113b)
                        || c == 0x113d
                        || c == 0x113f
                        || (c >= 0x1141 && c <= 0x114b)
                        || c == 0x114d
                        || c == 0x114f
                        || (c >= 0x1151 && c <= 0x1153)
                        || (c >= 0x1156 && c <= 0x1158)
                        || c == 0x1162
                        || c == 0x1164
                        || c == 0x1166
                        || c == 0x1168
                        || (c >= 0x116a && c <= 0x116c)
                        || (c >= 0x116f && c <= 0x1171)
                        || c == 0x1174
                        || (c >= 0x1176 && c <= 0x119d)
                        || (c >= 0x119f && c <= 0x11a2)
                        || (c >= 0x11a9 && c <= 0x11aa)
                        || (c >= 0x11ac && c <= 0x11ad)
                        || (c >= 0x11b0 && c <= 0x11b6)
                        || c == 0x11b9
                        || c == 0x11bb
                        || (c >= 0x11c3 && c <= 0x11ea)
                        || (c >= 0x11ec && c <= 0x11ef)
                        || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                 // superscript

            case 0x21:
                return
                        // various letterlike symbols
                        c == 0x2102
                        || c == 0x2107
                        || (c >= 0x210a && c <= 0x2113)
                        || c == 0x2115
                        || (c >= 0x2118 && c <= 0x211d)
                        || c == 0x2124
                        || c == 0x2128
                        || (c >= 0x212c && c <= 0x212d)
                        || (c >= 0x212f && c <= 0x2138)

                        // most Roman numerals (less 1K, 5K, 10K)
                        || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar: accepts everything isLetter's
    // category list accepts, plus marks, modifier letters and decimal
    // digits, subject to the same compatibility exclusions.
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd

                // OK, here we just have some exceptions to check...
                return !isCompatibilityChar(c)
                        // per "5.14 of Unicode", rule out some combiners
                        && !(c >= 0x20dd && c <= 0x20e0);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // Returns true for decimal digits other than the "fullwidth" forms.
    // Not called from within this class: isLetter2 accepts the Nd category
    // directly, and isCompatibilityChar rejects the 0xff page that holds
    // the fullwidth digits.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
                && !((c >= 0xff10) && (c <= 0xff19));
    }

    // Returns true if the character is one of the code points listed for
    // the XML Extender production.
    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
                || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
                || (c >= 0x309d && c <= 0x309e)
                || (c >= 0x30fc && c <= 0x30fe);
    }
}