src/jdk/nashorn/internal/runtime/regexp/joni/EncodingHelper.java

Mon, 03 Nov 2014 11:47:41 +0100

author
lagergren
date
Mon, 03 Nov 2014 11:47:41 +0100
changeset 1082
e1e27c4262be
parent 962
ac62e33a99b0
child 1205
4112748288bb
child 1402
523767716eb3
permissions
-rw-r--r--

8060204: Fix warnings in Joni and tests
Reviewed-by: hannesw, sundar, attila

     1 /*
     2  * Permission is hereby granted, free of charge, to any person obtaining a copy of
     3  * this software and associated documentation files (the "Software"), to deal in
     4  * the Software without restriction, including without limitation the rights to
     5  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
     6  * of the Software, and to permit persons to whom the Software is furnished to do
     7  * so, subject to the following conditions:
     8  *
     9  * The above copyright notice and this permission notice shall be included in all
    10  * copies or substantial portions of the Software.
    11  *
    12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    17  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    18  * SOFTWARE.
    19  */
    20 package jdk.nashorn.internal.runtime.regexp.joni;
    22 import java.util.Arrays;
    23 import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
    24 import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;
    26 @SuppressWarnings("javadoc")
    27 public final class EncodingHelper {
    29     final static int NEW_LINE            = 0x000a;
    30     final static int RETURN              = 0x000d;
    31     final static int LINE_SEPARATOR      = 0x2028;
    32     final static int PARAGRAPH_SEPARATOR = 0x2029;
    34     final static char[] EMPTYCHARS = new char[0];
    35     final static int[][] codeRanges = new int[15][];
    37     public static int digitVal(final int code) {
    38         return code - '0';
    39     }
    41     public static int odigitVal(final int code) {
    42         return digitVal(code);
    43     }
    45     public static boolean isXDigit(final int code) {
    46         return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
    47     }
    49     public static int xdigitVal(final int code) {
    50         if (Character.isDigit(code)) {
    51             return code - '0';
    52         } else if (code >= 'a' && code <= 'f') {
    53             return code - 'a' + 10;
    54         } else {
    55             return code - 'A' + 10;
    56         }
    57     }
    59     public static boolean isDigit(final int code) {
    60         return code >= '0' && code <= '9';
    61     }
    63     public static boolean isWord(final int code) {
    64         // letter, digit, or '_'
    65         return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
    66     }
    68     public static boolean isNewLine(final int code) {
    69         return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
    70     }
    72     public static boolean isNewLine(final char[] chars, final int p, final int end) {
    73         return p < end && isNewLine(chars[p]);
    74     }
    76     // Encoding.prevCharHead
    77     public static int prevCharHead(final int p, final int s) {
    78         return s <= p ? -1 : s - 1;
    79     }
    81     /* onigenc_get_right_adjust_char_head_with_prev */
    82     public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) {
    83         if (prev != null) {
    84             prev.value = -1; /* Sorry */
    85         }
    86         return s;
    87     }
    89     // Encoding.stepBack
    90     public static int stepBack(final int p, final int sp, final int np) {
    91         int s = sp, n = np;
    92         while (s != -1 && n-- > 0) {
    93            if (s <= p) {
    94             return -1;
    95         }
    96            s--;
    97        }
    98        return s;
    99     }
   101     public static int mbcodeStartPosition() {
   102         return 0x80;
   103     }
   105     public static char[] caseFoldCodesByString(final int flag, final char c) {
   106         char[] codes = EMPTYCHARS;
   107         final char upper = toUpperCase(c);
   109         if (upper != toLowerCase(upper)) {
   110             int count = 0;
   111             char ch = 0;
   113             do {
   114                 final char u = toUpperCase(ch);
   115                 if (u == upper && ch != c) {
   116                     // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
   117                     codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
   118                     codes[count++] = ch;
   119                 }
   120             } while (ch++ < 0xffff);
   121         }
   122         return codes;
   123     }
   125     public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) {
   126         for (int c = 0; c < 0xffff; c++) {
   127             if (Character.isLowerCase(c)) {
   128                 final int upper = toUpperCase(c);
   130                 if (upper != c) {
   131                     ApplyCaseFold.apply(c, upper, arg);
   132                 }
   133             }
   134         }
   136         // Some characters have multiple lower case variants, hence we need to do a second run
   137         for (int c = 0; c < 0xffff; c++) {
   138             if (Character.isLowerCase(c)) {
   139                 final int upper = toUpperCase(c);
   141                 if (upper != c) {
   142                     ApplyCaseFold.apply(upper, c, arg);
   143                 }
   144             }
   145         }
   146     }
   148     public static char toLowerCase(final char c) {
   149         return (char)toLowerCase((int)c);
   150     }
   152     public static int toLowerCase(final int c) {
   153         if (c < 128) {
   154             return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
   155         }
   156         // Do not convert non-ASCII upper case character to ASCII lower case.
   157         final int lower = Character.toLowerCase(c);
   158         return (lower < 128) ? c : lower;
   160     }
   162     public static char toUpperCase(final char c) {
   163         return (char)toUpperCase((int)c);
   164     }
   166     public static int toUpperCase(final int c) {
   167         if (c < 128) {
   168             return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
   169         }
   170         // Do not convert non-ASCII lower case character to ASCII upper case.
   171         final int upper = Character.toUpperCase(c);
   172         return (upper < 128) ? c : upper;
   173     }
   175     public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) {
   176         sbOut.value = 0x100; // use bitset for codes smaller than 256
   177         int[] range = null;
   179         if (ctype < codeRanges.length) {
   180             range = codeRanges[ctype];
   182             if (range == null) {
   183                 // format: [numberOfRanges, rangeStart, rangeEnd, ...]
   184                 range = new int[16];
   185                 int rangeCount = 0;
   186                 int lastCode = -2;
   188                 for (int code = 0; code <= 0xffff; code++) {
   189                     if (isCodeCType(code, ctype)) {
   190                         if (lastCode < code -1) {
   191                             if (rangeCount * 2 + 2 >= range.length) {
   192                                 range = Arrays.copyOf(range, range.length * 2);
   193                             }
   194                             range[rangeCount * 2 + 1] = code;
   195                             rangeCount++;
   196                         }
   197                         range[rangeCount * 2] = lastCode = code;
   198                     }
   199                 }
   201                 if (rangeCount * 2 + 1 < range.length) {
   202                     range = Arrays.copyOf(range, rangeCount * 2 + 1);
   203                 }
   205                 range[0] = rangeCount;
   206                 codeRanges[ctype] = range;
   207             }
   208         }
   210         return range;
   211     }
   213     // CodeRange.isInCodeRange
   214     public static boolean isInCodeRange(final int[] p, final int offset, final int code) {
   215         int low = 0;
   216         final int n = p[offset];
   217         int high = n ;
   219         while (low < high) {
   220             final int x = (low + high) >> 1;
   221             if (code > p[(x << 1) + 2 + offset]) {
   222                 low = x + 1;
   223             } else {
   224                 high = x;
   225             }
   226         }
   227         return low < n && code >= p[(low << 1) + 1 + offset];
   228     }
   230     /**
   231      * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
   232      */
   233     public static boolean isCodeCType(final int code, final int ctype) {
   234         int type;
   235         switch (ctype) {
   236             case CharacterType.NEWLINE:
   237                 return isNewLine(code);
   238             case CharacterType.ALPHA:
   239                 return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
   240             case CharacterType.BLANK:
   241                 return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
   242             case CharacterType.CNTRL:
   243                 type = Character.getType(code);
   244                 return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
   245             case CharacterType.DIGIT:
   246                 return EncodingHelper.isDigit(code);
   247             case CharacterType.GRAPH:
   248                 switch (code) {
   249                     case 0x09:
   250                     case 0x0a:
   251                     case 0x0b:
   252                     case 0x0c:
   253                     case 0x0d:
   254                         return false;
   255                     default:
   256                         type = Character.getType(code);
   257                         return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
   258                 }
   259             case CharacterType.LOWER:
   260                 return Character.isLowerCase(code);
   261             case CharacterType.PRINT:
   262                 type = Character.getType(code);
   263                 return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
   264             case CharacterType.PUNCT:
   265                 return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
   266             case CharacterType.SPACE:
   267                 // ECMA 7.2 and 7.3
   268                 switch (code) {
   269                     case 0x09:
   270                     case 0x0a:
   271                     case 0x0b:
   272                     case 0x0c:
   273                     case 0x0d:
   274                         return true;
   275                     default:
   276                         // true if Unicode separator or BOM
   277                         return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0 || code == 0xfeff;
   278                 }
   279             case CharacterType.UPPER:
   280                 return Character.isUpperCase(code);
   281             case CharacterType.XDIGIT:
   282                 return EncodingHelper.isXDigit(code);
   283             case CharacterType.WORD:
   284                 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
   285             case CharacterType.ALNUM:
   286                 return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
   287             case CharacterType.ASCII:
   288                 return code < 0x80;
   289             default:
   290                 throw new RuntimeException("illegal character type: " + ctype);
   291         }
   292     }
   293 }

mercurial