src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java

changeset 0
373ffda63c9a
child 637
9c07ef4934dd
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java	Wed Apr 27 01:27:09 2016 +0800
     1.3 @@ -0,0 +1,387 @@
     1.4 +/*
     1.5 + * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
     1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.7 + *
     1.8 + * This code is free software; you can redistribute it and/or modify it
     1.9 + * under the terms of the GNU General Public License version 2 only, as
    1.10 + * published by the Free Software Foundation.  Oracle designates this
    1.11 + * particular file as subject to the "Classpath" exception as provided
    1.12 + * by Oracle in the LICENSE file that accompanied this code.
    1.13 + *
    1.14 + * This code is distributed in the hope that it will be useful, but WITHOUT
    1.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    1.16 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    1.17 + * version 2 for more details (a copy is included in the LICENSE file that
    1.18 + * accompanied this code).
    1.19 + *
    1.20 + * You should have received a copy of the GNU General Public License version
    1.21 + * 2 along with this work; if not, write to the Free Software Foundation,
    1.22 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    1.23 + *
    1.24 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    1.25 + * or visit www.oracle.com if you need additional information or have any
    1.26 + * questions.
    1.27 + */
    1.28 +
    1.29 +package com.sun.xml.internal.dtdparser;
    1.30 +
    1.31 +
    1.32 +/**
    1.33 + * Methods in this class are used to determine whether characters may
    1.34 + * appear in certain roles in XML documents.  Such methods are used
    1.35 + * both to parse and to create such documents.
    1.36 + *
    1.37 + * @author David Brownell
    1.38 + * @version 1.1, 00/08/05
    1.39 + */
    1.40 +public class XmlChars {
    1.41 +    // static utility methods only; can't construct instances
    1.42 +    private XmlChars() {
    1.43 +    }
    1.44 +
    1.45 +    /**
    1.46 +     * Returns true if the argument, a UCS-4 character code, is valid in
    1.47 +     * XML documents.  Unicode characters fit into the low sixteen
    1.48 +     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
    1.49 +     * characters</em> can be combined to encode UCS-4 characters in
    1.50 +     * documents containing only Unicode.  (The <code>char</code> datatype
    1.51 +     * in the Java Programming Language represents Unicode characters,
    1.52 +     * including unpaired surrogates.)
    1.53 +     *
    1.54 +     * <p>In XML, UCS-4 characters can also be encoded by the use of
    1.55 +     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
    1.56 +     * happens to refer to a character that is disallowed in XML documents.
    1.57 +     * UCS-4 characters allowed in XML documents can be expressed with
    1.58 +     * one or two Unicode characters.
    1.59 +     * @param ucs4char The 32-bit UCS-4 character being tested.
    1.60 +     * @return true if the value falls in one of the ranges of production [2] Char.
    1.61 +     */
    1.62 +    static public boolean isChar(int ucs4char) {
    1.63 +        // [2] Char ::= #x0009 | #x000A | #x000D
    1.64 +        //            | [#x0020-#xD7FF]
    1.65 +        //    ... surrogates excluded!
    1.66 +        //            | [#xE000-#xFFFD]
    1.67 +        //             | [#x10000-#x10ffff]
    1.68 +        return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
    1.69 +                || ucs4char == 0x000A || ucs4char == 0x0009
    1.70 +                || ucs4char == 0x000D
    1.71 +                || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
    1.72 +                || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff));
    1.73 +    }
    1.74 +
    1.75 +    /**
    1.76 +     * Returns true if the character is allowed to be a non-initial
    1.77 +     * character in names according to the XML recommendation.
    1.78 +     *
    1.79 +     * @see #isNCNameChar(char)
    1.80 +     * @see #isLetter(char)
    1.81 +     */
    1.82 +    public static boolean isNameChar(char c) {
    1.83 +        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
    1.84 +        //            | CombiningChar | Extender
    1.85 +
    1.86 +        if (isLetter2(c))
    1.87 +            return true;
    1.88 +        else if (c == '>')    // early reject; no later test accepts '>'
    1.89 +            return false;
    1.90 +        else if (c == '.' || c == '-' || c == '_' || c == ':'
    1.91 +                || isExtender(c))
    1.92 +            return true;
    1.93 +        else
    1.94 +            return false;
    1.95 +    }
    1.96 +
    1.97 +    /**
    1.98 +     * Returns true if the character is allowed to be a non-initial
    1.99 +     * character in unscoped names according to the rules of the XML
   1.100 +     * Namespaces proposed recommendation.  Except for precluding
   1.101 +     * the colon (used to separate names from their scopes) these
   1.102 +     * characters are just as allowed by the XML recommendation.
   1.103 +     *
   1.104 +     * @see #isNameChar(char)
   1.105 +     * @see #isLetter(char)
   1.106 +     */
   1.107 +    public static boolean isNCNameChar(char c) {
   1.108 +        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
   1.109 +        //            | CombiningChar | Extender
   1.110 +        return c != ':' && isNameChar(c);
   1.111 +    }
   1.112 +
   1.113 +    /**
   1.114 +     * Returns true if the character is allowed where XML supports
   1.115 +     * whitespace characters, false otherwise.
   1.116 +     */
   1.117 +    public static boolean isSpace(char c) {
   1.118 +        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
   1.119 +    }
   1.120 +
   1.121 +
   1.122 +    /*
   1.123 +     * NOTE:  java.lang.Character.getType() values are:
   1.124 +     *
   1.125 +     * UNASSIGNED                    = 0,
   1.126 +     *
   1.127 +     * UPPERCASE_LETTER            = 1,    // Lu
   1.128 +     * LOWERCASE_LETTER            = 2,    // Ll
   1.129 +     * TITLECASE_LETTER            = 3,    // Lt
   1.130 +     * MODIFIER_LETTER             = 4,    // Lm
   1.131 +     * OTHER_LETTER                = 5,    // Lo
   1.132 +     * NON_SPACING_MARK            = 6,    // Mn
   1.133 +     * ENCLOSING_MARK              = 7,    // Me
   1.134 +     * COMBINING_SPACING_MARK      = 8,    // Mc
   1.135 +     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
   1.136 +     * LETTER_NUMBER               = 10,   // Nl
   1.137 +     * OTHER_NUMBER                = 11,   // No
   1.138 +     * SPACE_SEPARATOR             = 12,   // Zs
   1.139 +     * LINE_SEPARATOR              = 13,   // Zl
   1.140 +     * PARAGRAPH_SEPARATOR         = 14,   // Zp
   1.141 +     * CONTROL                     = 15,   // Cc
   1.142 +     * FORMAT                      = 16,   // Cf
   1.143 +     *                         // 17 reserved for proposed Ci category
   1.144 +     * PRIVATE_USE                 = 18,   // Co
   1.145 +     * SURROGATE                   = 19,   // Cs
   1.146 +     * DASH_PUNCTUATION            = 20,   // Pd
   1.147 +     * START_PUNCTUATION           = 21,   // Ps
   1.148 +     * END_PUNCTUATION             = 22,   // Pe
   1.149 +     * CONNECTOR_PUNCTUATION       = 23,   // Pc
   1.150 +     * OTHER_PUNCTUATION           = 24,   // Po
   1.151 +     * MATH_SYMBOL                 = 25,   // Sm
   1.152 +     * CURRENCY_SYMBOL             = 26,   // Sc
   1.153 +     * MODIFIER_SYMBOL             = 27,   // Sk
   1.154 +     * OTHER_SYMBOL                = 28;   // So
   1.155 +     */
   1.156 +
   1.157 +    /**
   1.158 +     * Returns true if the character is an XML "letter".  XML Names must
   1.159 +     * start with Letters or a few other characters, but other characters
   1.160 +     * in names must only satisfy the <em>isNameChar</em> predicate.
   1.161 +     *
   1.162 +     * @see #isNameChar(char)
   1.163 +     * @see #isNCNameChar(char)
   1.164 +     */
   1.165 +    public static boolean isLetter(char c) {
   1.166 +        // [84] Letter ::= BaseChar | Ideographic
   1.167 +        // [85] BaseChar ::= ... too much to repeat
   1.168 +        // [86] Ideographic ::= ... too much to repeat
   1.169 +
   1.170 +        //
   1.171 +        // Optimize the typical case.
   1.172 +        //
   1.173 +        if (c >= 'a' && c <= 'z')
   1.174 +            return true;
   1.175 +        if (c == '/')    // early reject, before the Character.getType() lookup
   1.176 +            return false;
   1.177 +        if (c >= 'A' && c <= 'Z')
   1.178 +            return true;
   1.179 +
   1.180 +        //
   1.181 +        // Since the tables are too ridiculous to use in code,
   1.182 +        // we're using the footnotes here to drive this test.
   1.183 +        //
   1.184 +        switch (Character.getType(c)) {
   1.185 +        // app. B footnote says these are 'name start'
   1.186 +        // characters ...
   1.187 +        case Character.LOWERCASE_LETTER:        // Ll
   1.188 +        case Character.UPPERCASE_LETTER:        // Lu
   1.189 +        case Character.OTHER_LETTER:            // Lo
   1.190 +        case Character.TITLECASE_LETTER:        // Lt
   1.191 +        case Character.LETTER_NUMBER:            // Nl
   1.192 +
   1.193 +            // OK, here we just have some exceptions to check...
   1.194 +            return !isCompatibilityChar(c)
   1.195 +                    // per "5.14 of Unicode", rule out some combiners
   1.196 +                    && !(c >= 0x20dd && c <= 0x20e0);
   1.197 +
   1.198 +        default:
   1.199 +            // check for some exceptions:  these are "alphabetic"
   1.200 +            return ((c >= 0x02bb && c <= 0x02c1)
   1.201 +                    || c == 0x0559 || c == 0x06e5 || c == 0x06e6);
   1.202 +        }
   1.203 +    }
   1.204 +
   1.205 +    //
   1.206 +    // XML 1.0 discourages "compatibility" characters in names; these
   1.207 +    // were defined to permit passing through some information stored in
   1.208 +    // older non-Unicode character sets.  These always have alternative
   1.209 +    // representations in Unicode, e.g. using combining chars.
   1.210 +    //
   1.211 +    private static boolean isCompatibilityChar(char c) {
   1.212 +        // the numerous comparisons here seem unavoidable,
   1.213 +        // but the switch can reduce the number which must
   1.214 +        // actually be executed.
   1.215 +
   1.216 +        switch ((c >> 8) & 0x0ff) {
   1.217 +        case 0x00:
   1.218 +            // ISO Latin/1 has a few compatibility characters
   1.219 +            return c == 0x00aa || c == 0x00b5 || c == 0x00ba;
   1.220 +
   1.221 +        case 0x01:
   1.222 +            // as do Latin Extended A and (parts of) B
   1.223 +            return (c >= 0x0132 && c <= 0x0133)
   1.224 +                    || (c >= 0x013f && c <= 0x0140)
   1.225 +                    || c == 0x0149
   1.226 +                    || c == 0x017f
   1.227 +                    || (c >= 0x01c4 && c <= 0x01cc)
   1.228 +                    || (c >= 0x01f1 && c <= 0x01f3);
   1.229 +
   1.230 +        case 0x02:
   1.231 +            // some spacing modifiers
   1.232 +            return (c >= 0x02b0 && c <= 0x02b8)
   1.233 +                    || (c >= 0x02e0 && c <= 0x02e4);
   1.234 +
   1.235 +        case 0x03:
   1.236 +            return c == 0x037a;            // Greek
   1.237 +
   1.238 +        case 0x05:
   1.239 +            return c == 0x0587;            // Armenian
   1.240 +
   1.241 +        case 0x0e:
   1.242 +            return c >= 0x0edc && c <= 0x0edd;    // Laotian
   1.243 +
   1.244 +        case 0x11:
   1.245 +            // big chunks of Hangul Jamo are all "compatibility"
   1.246 +            return c == 0x1101
   1.247 +                    || c == 0x1104
   1.248 +                    || c == 0x1108
   1.249 +                    || c == 0x110a
   1.250 +                    || c == 0x110d
   1.251 +                    || (c >= 0x1113 && c <= 0x113b)
   1.252 +                    || c == 0x113d
   1.253 +                    || c == 0x113f
   1.254 +                    || (c >= 0x1141 && c <= 0x114b)
   1.255 +                    || c == 0x114d
   1.256 +                    || c == 0x114f
   1.257 +                    || (c >= 0x1151 && c <= 0x1153)
   1.258 +                    || (c >= 0x1156 && c <= 0x1158)
   1.259 +                    || c == 0x1162
   1.260 +                    || c == 0x1164
   1.261 +                    || c == 0x1166
   1.262 +                    || c == 0x1168
   1.263 +                    || (c >= 0x116a && c <= 0x116c)
   1.264 +                    || (c >= 0x116f && c <= 0x1171)
   1.265 +                    || c == 0x1174
   1.266 +                    || (c >= 0x1176 && c <= 0x119d)
   1.267 +                    || (c >= 0x119f && c <= 0x11a2)
   1.268 +                    || (c >= 0x11a9 && c <= 0x11aa)
   1.269 +                    || (c >= 0x11ac && c <= 0x11ad)
   1.270 +                    || (c >= 0x11b0 && c <= 0x11b6)
   1.271 +                    || c == 0x11b9
   1.272 +                    || c == 0x11bb
   1.273 +                    || (c >= 0x11c3 && c <= 0x11ea)
   1.274 +                    || (c >= 0x11ec && c <= 0x11ef)
   1.275 +                    || (c >= 0x11f1 && c <= 0x11f8)
   1.276 +                    ;
   1.277 +
   1.278 +        case 0x20:
   1.279 +            return c == 0x207f;            // superscript
   1.280 +
   1.281 +        case 0x21:
   1.282 +            return
   1.283 +                    // various letterlike symbols
   1.284 +                    c == 0x2102
   1.285 +                    || c == 0x2107
   1.286 +                    || (c >= 0x210a && c <= 0x2113)
   1.287 +                    || c == 0x2115
   1.288 +                    || (c >= 0x2118 && c <= 0x211d)
   1.289 +                    || c == 0x2124
   1.290 +                    || c == 0x2128
   1.291 +                    || (c >= 0x212c && c <= 0x212d)
   1.292 +                    || (c >= 0x212f && c <= 0x2138)
   1.293 +
   1.294 +                    // most Roman numerals (less 1K, 5K, 10K)
   1.295 +                    || (c >= 0x2160 && c <= 0x217f)
   1.296 +                    ;
   1.297 +
   1.298 +        case 0x30:
   1.299 +            // some Hiragana
   1.300 +            return c >= 0x309b && c <= 0x309c;
   1.301 +
   1.302 +        case 0x31:
   1.303 +            // all Hangul Compatibility Jamo
   1.304 +            return c >= 0x3131 && c <= 0x318e;
   1.305 +
   1.306 +        case 0xf9:
   1.307 +        case 0xfa:
   1.308 +        case 0xfb:
   1.309 +        case 0xfc:
   1.310 +        case 0xfd:
   1.311 +        case 0xfe:
   1.312 +        case 0xff:
   1.313 +            // the whole "compatibility" area is for that purpose!
   1.314 +            return true;
   1.315 +
   1.316 +        default:
   1.317 +            // most of Unicode isn't flagged as being for compatibility
   1.318 +            return false;
   1.319 +        }
   1.320 +    }
   1.321 +
   1.322 +    // guts of isNameChar/isNCNameChar: accepts letters plus marks, modifier letters and digits
   1.323 +    private static boolean isLetter2(char c) {
   1.324 +        // [84] Letter ::= BaseChar | Ideographic
   1.325 +        // [85] BaseChar ::= ... too much to repeat
   1.326 +        // [86] Ideographic ::= ... too much to repeat
   1.327 +        // [87] CombiningChar ::= ... too much to repeat
   1.328 +
   1.329 +        //
   1.330 +        // Optimize the typical case.
   1.331 +        //
   1.332 +        if (c >= 'a' && c <= 'z')
   1.333 +            return true;
   1.334 +        if (c == '>')    // early reject, before the Character.getType() lookup
   1.335 +            return false;
   1.336 +        if (c >= 'A' && c <= 'Z')
   1.337 +            return true;
   1.338 +
   1.339 +        //
   1.340 +        // Since the tables are too ridiculous to use in code,
   1.341 +        // we're using the footnotes here to drive this test.
   1.342 +        //
   1.343 +        switch (Character.getType(c)) {
   1.344 +        // app. B footnote says these are 'name start'
   1.345 +        // characters ...
   1.346 +        case Character.LOWERCASE_LETTER:        // Ll
   1.347 +        case Character.UPPERCASE_LETTER:        // Lu
   1.348 +        case Character.OTHER_LETTER:            // Lo
   1.349 +        case Character.TITLECASE_LETTER:        // Lt
   1.350 +        case Character.LETTER_NUMBER:            // Nl
   1.351 +            // ... and these are name characters 'other
   1.352 +            // than name start characters'
   1.353 +        case Character.COMBINING_SPACING_MARK:    // Mc
   1.354 +        case Character.ENCLOSING_MARK:        // Me
   1.355 +        case Character.NON_SPACING_MARK:        // Mn
   1.356 +        case Character.MODIFIER_LETTER:        // Lm
   1.357 +        case Character.DECIMAL_DIGIT_NUMBER:        // Nd
   1.358 +
   1.359 +            // OK, here we just have some exceptions to check...
   1.360 +            return !isCompatibilityChar(c)
   1.361 +                    // per "5.14 of Unicode", rule out some combiners
   1.362 +                    && !(c >= 0x20dd && c <= 0x20e0);
   1.363 +
   1.364 +        default:
   1.365 +            // added a character: 0x0387, which isExtender also lists
   1.366 +            return c == 0x0387;
   1.367 +        }
   1.368 +    }
   1.369 +
   1.370 +    private static boolean isDigit(char c) {
   1.371 +        // [88] Digit ::= ...   NOTE(review): no caller within this class; isLetter2 handles Nd
   1.372 +
   1.373 +        //
   1.374 +        // java.lang.Character.isDigit is correct from the XML point
   1.375 +        // of view except that it allows "fullwidth" digits.
   1.376 +        //
   1.377 +        return Character.isDigit(c)
   1.378 +                && !((c >= 0xff10) && (c <= 0xff19));
   1.379 +    }
   1.380 +
   1.381 +    private static boolean isExtender(char c) {
   1.382 +        // [89] Extender ::= ...
   1.383 +        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
   1.384 +                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
   1.385 +                || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
   1.386 +                || (c >= 0x309d && c <= 0x309e)
   1.387 +                || (c >= 0x30fc && c <= 0x30fe)
   1.388 +                ;
   1.389 +    }
   1.390 +}

mercurial