src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java

changeset 0
373ffda63c9a
child 637
9c07ef4934dd
equal deleted inserted replaced
-1:000000000000 0:373ffda63c9a
1 /*
2 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package com.sun.xml.internal.dtdparser;
27
28
/**
 * Static predicates that classify characters according to the roles they
 * may play in XML documents (legal document character, name character,
 * whitespace, letter). The methods are usable both when parsing and when
 * generating XML.
 *
 * <p>All members are static; the class cannot be instantiated.
 *
 * @author David Brownell
 * @version 1.1, 00/08/05
 */
public final class XmlChars {

    // can't construct instances
    private XmlChars() {
    }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents. Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode. (The <code>char</code> datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <p>In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code point matches production [2] Char,
     *         false otherwise (including for surrogate code points,
     *         0xFFFE, 0xFFFF, negative values and values above 0x10FFFF)
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //              ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
                || ucs4char == 0x000A
                || ucs4char == 0x0009
                || ucs4char == 0x000D
                || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
                || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is accepted as a name character
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        // Letters, digits and combining characters are handled together.
        if (isLetter2(c)) {
            return true;
        }
        // Cheap early exit for '>' before the remaining comparisons.
        if (c == '>') {
            return false;
        }
        return c == '.' || c == '-' || c == '_' || c == ':'
                || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation. Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is not a colon and satisfies
     *         {@link #isNameChar(char)}
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested
     * @return true for space, tab, line feed and carriage return only
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter". XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is classified as a letter by the
     *         general-category test below and is not a compatibility
     *         character, or is one of the listed "alphabetic" exceptions
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl

                // OK, here we just have some exceptions to check...
                return !isCompatibilityChar(c)
                        // per "5.14 of Unicode", rule out some combiners
                        && !(c >= 0x20dd && c <= 0x20e0);

            default:
                // check for some exceptions:  these are "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                        || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch can reduce the number which must
        // actually be executed.

        // Dispatch on the high byte of the character.
        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                        || (c >= 0x013f && c <= 0x0140)
                        || c == 0x0149
                        || c == 0x017f
                        || (c >= 0x01c4 && c <= 0x01cc)
                        || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                        || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;            // Greek

            case 0x05:
                return c == 0x0587;            // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;    // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                        || c == 0x1104
                        || c == 0x1108
                        || c == 0x110a
                        || c == 0x110d
                        || (c >= 0x1113 && c <= 0x113b)
                        || c == 0x113d
                        || c == 0x113f
                        || (c >= 0x1141 && c <= 0x114b)
                        || c == 0x114d
                        || c == 0x114f
                        || (c >= 0x1151 && c <= 0x1153)
                        || (c >= 0x1156 && c <= 0x1158)
                        || c == 0x1162
                        || c == 0x1164
                        || c == 0x1166
                        || c == 0x1168
                        || (c >= 0x116a && c <= 0x116c)
                        || (c >= 0x116f && c <= 0x1171)
                        || c == 0x1174
                        || (c >= 0x1176 && c <= 0x119d)
                        || (c >= 0x119f && c <= 0x11a2)
                        || (c >= 0x11a9 && c <= 0x11aa)
                        || (c >= 0x11ac && c <= 0x11ad)
                        || (c >= 0x11b0 && c <= 0x11b6)
                        || c == 0x11b9
                        || c == 0x11bb
                        || (c >= 0x11c3 && c <= 0x11ea)
                        || (c >= 0x11ec && c <= 0x11ef)
                        || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;            // superscript

            case 0x21:
                return
                        // various letterlike symbols
                        c == 0x2102
                        || c == 0x2107
                        || (c >= 0x210a && c <= 0x2113)
                        || c == 0x2115
                        || (c >= 0x2118 && c <= 0x211d)
                        || c == 0x2124
                        || c == 0x2128
                        || (c >= 0x212c && c <= 0x212d)
                        || (c >= 0x212f && c <= 0x2138)

                        // most Roman numerals (less 1K, 5K, 10K)
                        || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd

                // OK, here we just have some exceptions to check...
                return !isCompatibilityChar(c)
                        // per "5.14 of Unicode", rule out some combiners
                        && !(c >= 0x20dd && c <= 0x20e0);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // Not called from within this class; digits in names are accepted
    // through the DECIMAL_DIGIT_NUMBER case of isLetter2 instead.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
                && !((c >= 0xff10) && (c <= 0xff19));
    }

    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
                || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
                || (c >= 0x309d && c <= 0x309e)
                || (c >= 0x30fc && c <= 0x30fe);
    }
}

mercurial