jdk8-mips64-public/jaxws: src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java@9c07ef4934dd

     1 /*

     2  * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.

     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.

     4  *

     5  * This code is free software; you can redistribute it and/or modify it

     6  * under the terms of the GNU General Public License version 2 only, as

     7  * published by the Free Software Foundation.  Oracle designates this

     8  * particular file as subject to the "Classpath" exception as provided

     9  * by Oracle in the LICENSE file that accompanied this code.

    10  *

    11  * This code is distributed in the hope that it will be useful, but WITHOUT

    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or

    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License

    14  * version 2 for more details (a copy is included in the LICENSE file that

    15  * accompanied this code).

    16  *

    17  * You should have received a copy of the GNU General Public License version

    18  * 2 along with this work; if not, write to the Free Software Foundation,

    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.

    20  *

    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA

    22  * or visit www.oracle.com if you need additional information or have any

    23  * questions.

    24  */

    26 package com.sun.xml.internal.dtdparser;

    29 /**

    30  * Methods in this class are used to determine whether characters may

    31  * appear in certain roles in XML documents.  Such methods are used

    32  * both to parse and to create such documents.

    33  *

    34  * @author David Brownell

    35  * @version 1.1, 00/08/05

    36  */

    public class XmlChars {

        /** Static-only holder of character predicates; never instantiated. */
        private XmlChars() {
        }

        /**
         * Tests whether a UCS-4 character code is allowed in XML documents.
         * Codes that fit in sixteen bits correspond to a single Java
         * <code>char</code>; larger codes are written in documents as a pair
         * of surrogate characters or as a character reference.
         *
         * <p>Accepted values are tab, line feed, carriage return, and the
         * ranges 0x0020-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.  The
         * surrogate block itself (0xD800-0xDFFF) is rejected, as is any
         * negative value.
         *
         * @param ucs4char The 32-bit UCS-4 character being tested.
         * @return true if the code matches XML production [2] Char.
         */
        public static boolean isChar(int ucs4char) {
            // [2] Char ::= #x0009 | #x000A | #x000D
            //            | [#x0020-#xD7FF]
            //            | [#xE000-#xFFFD]
            //            | [#x10000-#x10ffff]
            if (ucs4char < 0x0020) {
                // only three control characters are legal down here
                return ucs4char == 0x0009
                        || ucs4char == 0x000A
                        || ucs4char == 0x000D;
            }
            if (ucs4char <= 0xD7FF) {
                return true;
            }
            if (ucs4char < 0xE000) {
                // 0xD800-0xDFFF: surrogates excluded!
                return false;
            }
            if (ucs4char <= 0xFFFD) {
                return true;
            }
            return between(ucs4char, 0x10000, 0x10ffff);
        }

        /**
         * Tests whether the character may appear in a name at any position
         * after the first, according to the XML recommendation.
         *
         * @param c the character being tested.
         * @return true for letters, digits, combining characters, extenders
         *         and the punctuation <code>. - _ :</code>.
         * @see #isNCNameChar(char)
         * @see #isLetter(char)
         */
        public static boolean isNameChar(char c) {
            // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
            //            | CombiningChar | Extender
            if (isLetter2(c)) {
                return true;
            }
            switch (c) {
            case '.':
            case '-':
            case '_':
            case ':':
                return true;
            case '>':
                // common terminator of a name; answer without more work
                return false;
            default:
                return isExtender(c);
            }
        }

        /**
         * Tests whether the character may appear after the first position
         * of an unscoped name as defined by XML Namespaces.  The rule is the
         * one used by {@link #isNameChar(char)}, minus the colon that
         * separates a name from its scope.
         *
         * @param c the character being tested.
         * @return true if the character is a name character other than ':'.
         * @see #isNameChar(char)
         * @see #isLetter(char)
         */
        public static boolean isNCNameChar(char c) {
            // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
            //            | CombiningChar | Extender
            if (c == ':') {
                return false;
            }
            return isNameChar(c);
        }

        /**
         * Tests whether the character is one of the four XML whitespace
         * characters: space, tab, line feed or carriage return.
         *
         * @param c the character being tested.
         * @return true if the character is XML whitespace.
         */
        public static boolean isSpace(char c) {
            switch (c) {
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                return true;
            default:
                return false;
            }
        }

        /**
         * Tests whether the character is an XML "letter".  Names begin with
         * a letter or one of a few other characters; the characters that
         * follow only need to satisfy {@link #isNameChar(char)}.
         *
         * @param c the character being tested.
         * @return true if the character is treated as a letter.
         * @see #isNameChar(char)
         * @see #isNCNameChar(char)
         */
        public static boolean isLetter(char c) {
            // [84] Letter ::= BaseChar | Ideographic
            // [85] BaseChar ::= ... too much to repeat
            // [86] Ideographic ::= ... too much to repeat

            // Fast answers for the characters seen most often.
            if (between(c, 'a', 'z') || between(c, 'A', 'Z')) {
                return true;
            }
            if (c == '/') {
                return false;
            }

            // The spec's tables are impractical to embed, so the Unicode
            // general category stands in for them, as the footnotes of
            // appendix B suggest.
            if (isNameStartCategory(Character.getType(c))) {
                return survivesExclusions(c);
            }

            // A handful of characters outside those categories are still
            // "alphabetic".
            switch (c) {
            case 0x0559:
            case 0x06e5:
            case 0x06e6:
                return true;
            default:
                return between(c, 0x02bb, 0x02c1);
            }
        }

        /**
         * Reports whether a <code>Character.getType</code> value is one of
         * the categories used for name start characters: Ll, Lu, Lo, Lt, Nl.
         */
        private static boolean isNameStartCategory(int category) {
            return category == Character.LOWERCASE_LETTER        // Ll
                    || category == Character.UPPERCASE_LETTER    // Lu
                    || category == Character.OTHER_LETTER        // Lo
                    || category == Character.TITLECASE_LETTER    // Lt
                    || category == Character.LETTER_NUMBER;      // Nl
        }

        /**
         * Reports whether a <code>Character.getType</code> value is one of
         * the categories allowed in names only after the first character:
         * Mc, Me, Mn, Lm, Nd.
         */
        private static boolean isNameOnlyCategory(int category) {
            return category == Character.COMBINING_SPACING_MARK     // Mc
                    || category == Character.ENCLOSING_MARK         // Me
                    || category == Character.NON_SPACING_MARK       // Mn
                    || category == Character.MODIFIER_LETTER        // Lm
                    || category == Character.DECIMAL_DIGIT_NUMBER;  // Nd
        }

        /**
         * Applies the exceptions shared by the letter tests: the combiners
         * 0x20dd-0x20e0 (per "5.14 of Unicode") and every compatibility
         * character are refused.
         */
        private static boolean survivesExclusions(char c) {
            if (between(c, 0x20dd, 0x20e0)) {
                return false;
            }
            return !isCompatibilityChar(c);
        }

        /** Inclusive range test used throughout the tables below. */
        private static boolean between(int value, int low, int high) {
            return low <= value && value <= high;
        }

        //
        // XML 1.0 discourages "compatibility" characters in names.  They
        // exist so that data from older, non-Unicode character sets can be
        // carried through, and each has another Unicode representation,
        // e.g. one built from combining characters.
        //
        private static boolean isCompatibilityChar(char c) {
            // Dispatch on the high byte so that only the comparisons for
            // one 256-character page are ever evaluated.
            final int page = (c >> 8) & 0x0ff;

            if (page >= 0xf9) {
                // pages 0xf9-0xff: the whole "compatibility" area
                return true;
            }

            switch (page) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                switch (c) {
                case 0x00aa:
                case 0x00b5:
                case 0x00ba:
                    return true;
                default:
                    return false;
                }

            case 0x01:
                // Latin Extended A and (parts of) B
                return between(c, 0x0132, 0x0133)
                        || between(c, 0x013f, 0x0140)
                        || c == 0x0149
                        || c == 0x017f
                        || between(c, 0x01c4, 0x01cc)
                        || between(c, 0x01f1, 0x01f3);

            case 0x02:
                // some spacing modifiers
                return between(c, 0x02b0, 0x02b8)
                        || between(c, 0x02e0, 0x02e4);

            case 0x03:
                return c == 0x037a;                 // Greek

            case 0x05:
                return c == 0x0587;                 // Armenian

            case 0x0e:
                return between(c, 0x0edc, 0x0edd);  // Laotian

            case 0x11:
                return isCompatibilityJamo(c);

            case 0x20:
                return c == 0x207f;                 // superscript

            case 0x21:
                return isCompatibilityLetterlike(c);

            case 0x30:
                // some Hiragana
                return between(c, 0x309b, 0x309c);

            case 0x31:
                // all Hangul Compatibility Jamo
                return between(c, 0x3131, 0x318e);

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
            }
        }

        /** Page 0x11: the large parts of Hangul Jamo that are "compatibility". */
        private static boolean isCompatibilityJamo(char c) {
            switch (c) {
            case 0x1101:
            case 0x1104:
            case 0x1108:
            case 0x110a:
            case 0x110d:
            case 0x113d:
            case 0x113f:
            case 0x114d:
            case 0x114f:
            case 0x1162:
            case 0x1164:
            case 0x1166:
            case 0x1168:
            case 0x1174:
            case 0x11b9:
            case 0x11bb:
                return true;
            default:
                break;
            }
            return between(c, 0x1113, 0x113b)
                    || between(c, 0x1141, 0x114b)
                    || between(c, 0x1151, 0x1153)
                    || between(c, 0x1156, 0x1158)
                    || between(c, 0x116a, 0x116c)
                    || between(c, 0x116f, 0x1171)
                    || between(c, 0x1176, 0x119d)
                    || between(c, 0x119f, 0x11a2)
                    || between(c, 0x11a9, 0x11aa)
                    || between(c, 0x11ac, 0x11ad)
                    || between(c, 0x11b0, 0x11b6)
                    || between(c, 0x11c3, 0x11ea)
                    || between(c, 0x11ec, 0x11ef)
                    || between(c, 0x11f1, 0x11f8);
        }

        /** Page 0x21: letterlike symbols and Roman numerals. */
        private static boolean isCompatibilityLetterlike(char c) {
            // various letterlike symbols
            switch (c) {
            case 0x2102:
            case 0x2107:
            case 0x2115:
            case 0x2124:
            case 0x2128:
                return true;
            default:
                break;
            }
            return between(c, 0x210a, 0x2113)
                    || between(c, 0x2118, 0x211d)
                    || between(c, 0x212c, 0x212d)
                    || between(c, 0x212f, 0x2138)
                    // most Roman numerals (less 1K, 5K, 10K)
                    || between(c, 0x2160, 0x217f);
        }

        // guts of isNameChar/isNCNameChar
        private static boolean isLetter2(char c) {
            // [84] Letter ::= BaseChar | Ideographic
            // [85] BaseChar ::= ... too much to repeat
            // [86] Ideographic ::= ... too much to repeat
            // [87] CombiningChar ::= ... too much to repeat

            // Fast answers for the characters seen most often.
            if (between(c, 'a', 'z') || between(c, 'A', 'Z')) {
                return true;
            }
            if (c == '>') {
                return false;
            }

            // Same category-driven approach as isLetter, widened with the
            // categories that are name characters 'other than name start
            // characters'.
            final int category = Character.getType(c);
            if (isNameStartCategory(category) || isNameOnlyCategory(category)) {
                return survivesExclusions(c);
            }

            // one extra character is admitted regardless of category
            return c == 0x0387;
        }

        private static boolean isDigit(char c) {
            // [88] Digit ::= ...

            // java.lang.Character.isDigit matches the XML definition
            // except that it also accepts the "fullwidth" digits.
            if (between(c, 0xff10, 0xff19)) {
                return false;
            }
            return Character.isDigit(c);
        }

        private static boolean isExtender(char c) {
            // [89] Extender ::= ...
            switch (c) {
            case 0x00b7:
            case 0x02d0:
            case 0x02d1:
            case 0x0387:
            case 0x0640:
            case 0x0e46:
            case 0x0ec6:
            case 0x3005:
                return true;
            default:
                return between(c, 0x3031, 0x3035)
                        || between(c, 0x309d, 0x309e)
                        || between(c, 0x30fc, 0x30fe);
            }
        }
    }

Mercurial > jdk8-mips64-public > jaxws / file revision

src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java@9c07ef4934dd

src/share/jaxws_classes/com/sun/xml/internal/dtdparser/XmlChars.java