Mon, 24 Oct 2011 13:00:20 +0100
7096014: Javac tokens should retain state
Summary: Refactor javac tokens from enum constants to stateful instances (to keep track of position, comments, etc.)
Reviewed-by: jjg
mcimadamore@1113 | 1 | /* |
mcimadamore@1113 | 2 | * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. |
mcimadamore@1113 | 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
mcimadamore@1113 | 4 | * |
mcimadamore@1113 | 5 | * This code is free software; you can redistribute it and/or modify it |
mcimadamore@1113 | 6 | * under the terms of the GNU General Public License version 2 only, as |
mcimadamore@1113 | 7 | * published by the Free Software Foundation. Oracle designates this |
mcimadamore@1113 | 8 | * particular file as subject to the "Classpath" exception as provided |
mcimadamore@1113 | 9 | * by Oracle in the LICENSE file that accompanied this code. |
mcimadamore@1113 | 10 | * |
mcimadamore@1113 | 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
mcimadamore@1113 | 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
mcimadamore@1113 | 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
mcimadamore@1113 | 14 | * version 2 for more details (a copy is included in the LICENSE file that |
mcimadamore@1113 | 15 | * accompanied this code). |
mcimadamore@1113 | 16 | * |
mcimadamore@1113 | 17 | * You should have received a copy of the GNU General Public License version |
mcimadamore@1113 | 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
mcimadamore@1113 | 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
mcimadamore@1113 | 20 | * |
mcimadamore@1113 | 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
mcimadamore@1113 | 22 | * or visit www.oracle.com if you need additional information or have any |
mcimadamore@1113 | 23 | * questions. |
mcimadamore@1113 | 24 | */ |
mcimadamore@1113 | 25 | |
mcimadamore@1113 | 26 | package com.sun.tools.javac.parser; |
mcimadamore@1113 | 27 | |
mcimadamore@1113 | 28 | import com.sun.tools.javac.file.JavacFileManager; |
mcimadamore@1113 | 29 | import java.nio.CharBuffer; |
mcimadamore@1113 | 30 | import com.sun.tools.javac.util.Log; |
mcimadamore@1113 | 31 | import static com.sun.tools.javac.util.LayoutCharacters.*; |
mcimadamore@1113 | 32 | |
mcimadamore@1113 | 33 | /** The char reader used by the javac lexer/tokenizer. Returns the sequence of |
mcimadamore@1113 | 34 | * characters contained in the input stream, handling unicode escape accordingly. |
mcimadamore@1113 | 35 | * Additionally, it provide features for saving chars into a buffer and to retrieve |
mcimadamore@1113 | 36 | * them at a later stage. |
mcimadamore@1113 | 37 | * |
mcimadamore@1113 | 38 | * <p><b>This is NOT part of any supported API. |
mcimadamore@1113 | 39 | * If you write code that depends on this, you do so at your own risk. |
mcimadamore@1113 | 40 | * This code and its internal interfaces are subject to change or |
mcimadamore@1113 | 41 | * deletion without notice.</b> |
mcimadamore@1113 | 42 | */ |
mcimadamore@1113 | 43 | public class UnicodeReader { |
mcimadamore@1113 | 44 | |
mcimadamore@1113 | 45 | /** The input buffer, index of next character to be read, |
mcimadamore@1113 | 46 | * index of one past last character in buffer. |
mcimadamore@1113 | 47 | */ |
mcimadamore@1113 | 48 | protected char[] buf; |
mcimadamore@1113 | 49 | protected int bp; |
mcimadamore@1113 | 50 | protected final int buflen; |
mcimadamore@1113 | 51 | |
mcimadamore@1113 | 52 | /** The current character. |
mcimadamore@1113 | 53 | */ |
mcimadamore@1113 | 54 | protected char ch; |
mcimadamore@1113 | 55 | |
mcimadamore@1113 | 56 | /** The buffer index of the last converted unicode character |
mcimadamore@1113 | 57 | */ |
mcimadamore@1113 | 58 | protected int unicodeConversionBp = -1; |
mcimadamore@1113 | 59 | |
mcimadamore@1113 | 60 | protected Log log; |
mcimadamore@1113 | 61 | |
mcimadamore@1113 | 62 | /** |
mcimadamore@1113 | 63 | * Create a scanner from the input array. This method might |
mcimadamore@1113 | 64 | * modify the array. To avoid copying the input array, ensure |
mcimadamore@1113 | 65 | * that {@code inputLength < input.length} or |
mcimadamore@1113 | 66 | * {@code input[input.length -1]} is a white space character. |
mcimadamore@1113 | 67 | * |
mcimadamore@1113 | 68 | * @param fac the factory which created this Scanner |
mcimadamore@1113 | 69 | * @param input the input, might be modified |
mcimadamore@1113 | 70 | * @param inputLength the size of the input. |
mcimadamore@1113 | 71 | * Must be positive and less than or equal to input.length. |
mcimadamore@1113 | 72 | */ |
mcimadamore@1113 | 73 | protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) { |
mcimadamore@1113 | 74 | this(sf, JavacFileManager.toArray(buffer), buffer.limit()); |
mcimadamore@1113 | 75 | } |
mcimadamore@1113 | 76 | |
mcimadamore@1113 | 77 | protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) { |
mcimadamore@1113 | 78 | log = sf.log; |
mcimadamore@1113 | 79 | if (inputLength == input.length) { |
mcimadamore@1113 | 80 | if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) { |
mcimadamore@1113 | 81 | inputLength--; |
mcimadamore@1113 | 82 | } else { |
mcimadamore@1113 | 83 | char[] newInput = new char[inputLength + 1]; |
mcimadamore@1113 | 84 | System.arraycopy(input, 0, newInput, 0, input.length); |
mcimadamore@1113 | 85 | input = newInput; |
mcimadamore@1113 | 86 | } |
mcimadamore@1113 | 87 | } |
mcimadamore@1113 | 88 | buf = input; |
mcimadamore@1113 | 89 | buflen = inputLength; |
mcimadamore@1113 | 90 | buf[buflen] = EOI; |
mcimadamore@1113 | 91 | bp = -1; |
mcimadamore@1113 | 92 | scanChar(); |
mcimadamore@1113 | 93 | } |
mcimadamore@1113 | 94 | |
mcimadamore@1113 | 95 | /** Read next character. |
mcimadamore@1113 | 96 | */ |
mcimadamore@1113 | 97 | protected void scanChar() { |
mcimadamore@1113 | 98 | if (bp < buflen) { |
mcimadamore@1113 | 99 | ch = buf[++bp]; |
mcimadamore@1113 | 100 | if (ch == '\\') { |
mcimadamore@1113 | 101 | convertUnicode(); |
mcimadamore@1113 | 102 | } |
mcimadamore@1113 | 103 | } |
mcimadamore@1113 | 104 | } |
mcimadamore@1113 | 105 | |
mcimadamore@1113 | 106 | /** Convert unicode escape; bp points to initial '\' character |
mcimadamore@1113 | 107 | * (Spec 3.3). |
mcimadamore@1113 | 108 | */ |
mcimadamore@1113 | 109 | protected void convertUnicode() { |
mcimadamore@1113 | 110 | if (ch == '\\' && unicodeConversionBp != bp) { |
mcimadamore@1113 | 111 | bp++; ch = buf[bp]; |
mcimadamore@1113 | 112 | if (ch == 'u') { |
mcimadamore@1113 | 113 | do { |
mcimadamore@1113 | 114 | bp++; ch = buf[bp]; |
mcimadamore@1113 | 115 | } while (ch == 'u'); |
mcimadamore@1113 | 116 | int limit = bp + 3; |
mcimadamore@1113 | 117 | if (limit < buflen) { |
mcimadamore@1113 | 118 | int d = digit(bp, 16); |
mcimadamore@1113 | 119 | int code = d; |
mcimadamore@1113 | 120 | while (bp < limit && d >= 0) { |
mcimadamore@1113 | 121 | bp++; ch = buf[bp]; |
mcimadamore@1113 | 122 | d = digit(bp, 16); |
mcimadamore@1113 | 123 | code = (code << 4) + d; |
mcimadamore@1113 | 124 | } |
mcimadamore@1113 | 125 | if (d >= 0) { |
mcimadamore@1113 | 126 | ch = (char)code; |
mcimadamore@1113 | 127 | unicodeConversionBp = bp; |
mcimadamore@1113 | 128 | return; |
mcimadamore@1113 | 129 | } |
mcimadamore@1113 | 130 | } |
mcimadamore@1113 | 131 | log.error(bp, "illegal.unicode.esc"); |
mcimadamore@1113 | 132 | } else { |
mcimadamore@1113 | 133 | bp--; |
mcimadamore@1113 | 134 | ch = '\\'; |
mcimadamore@1113 | 135 | } |
mcimadamore@1113 | 136 | } |
mcimadamore@1113 | 137 | } |
mcimadamore@1113 | 138 | |
mcimadamore@1113 | 139 | /** Are surrogates supported? |
mcimadamore@1113 | 140 | */ |
mcimadamore@1113 | 141 | final static boolean surrogatesSupported = surrogatesSupported(); |
mcimadamore@1113 | 142 | private static boolean surrogatesSupported() { |
mcimadamore@1113 | 143 | try { |
mcimadamore@1113 | 144 | Character.isHighSurrogate('a'); |
mcimadamore@1113 | 145 | return true; |
mcimadamore@1113 | 146 | } catch (NoSuchMethodError ex) { |
mcimadamore@1113 | 147 | return false; |
mcimadamore@1113 | 148 | } |
mcimadamore@1113 | 149 | } |
mcimadamore@1113 | 150 | |
mcimadamore@1113 | 151 | /** Scan surrogate pairs. If 'ch' is a high surrogate and |
mcimadamore@1113 | 152 | * the next character is a low surrogate, then put the low |
mcimadamore@1113 | 153 | * surrogate in 'ch', and return the high surrogate. |
mcimadamore@1113 | 154 | * otherwise, just return 0. |
mcimadamore@1113 | 155 | */ |
mcimadamore@1113 | 156 | protected char scanSurrogates() { |
mcimadamore@1113 | 157 | if (surrogatesSupported && Character.isHighSurrogate(ch)) { |
mcimadamore@1113 | 158 | char high = ch; |
mcimadamore@1113 | 159 | |
mcimadamore@1113 | 160 | scanChar(); |
mcimadamore@1113 | 161 | |
mcimadamore@1113 | 162 | if (Character.isLowSurrogate(ch)) { |
mcimadamore@1113 | 163 | return high; |
mcimadamore@1113 | 164 | } |
mcimadamore@1113 | 165 | |
mcimadamore@1113 | 166 | ch = high; |
mcimadamore@1113 | 167 | } |
mcimadamore@1113 | 168 | |
mcimadamore@1113 | 169 | return 0; |
mcimadamore@1113 | 170 | } |
mcimadamore@1113 | 171 | |
mcimadamore@1113 | 172 | /** Convert an ASCII digit from its base (8, 10, or 16) |
mcimadamore@1113 | 173 | * to its value. |
mcimadamore@1113 | 174 | */ |
mcimadamore@1113 | 175 | protected int digit(int pos, int base) { |
mcimadamore@1113 | 176 | char c = ch; |
mcimadamore@1113 | 177 | int result = Character.digit(c, base); |
mcimadamore@1113 | 178 | if (result >= 0 && c > 0x7f) { |
mcimadamore@1113 | 179 | log.error(pos + 1, "illegal.nonascii.digit"); |
mcimadamore@1113 | 180 | ch = "0123456789abcdef".charAt(result); |
mcimadamore@1113 | 181 | } |
mcimadamore@1113 | 182 | return result; |
mcimadamore@1113 | 183 | } |
mcimadamore@1113 | 184 | |
mcimadamore@1113 | 185 | protected boolean isUnicode() { |
mcimadamore@1113 | 186 | return unicodeConversionBp == bp; |
mcimadamore@1113 | 187 | } |
mcimadamore@1113 | 188 | |
mcimadamore@1113 | 189 | protected void skipChar() { |
mcimadamore@1113 | 190 | bp++; |
mcimadamore@1113 | 191 | } |
mcimadamore@1113 | 192 | |
mcimadamore@1113 | 193 | protected char peekChar() { |
mcimadamore@1113 | 194 | return buf[bp + 1]; |
mcimadamore@1113 | 195 | } |
mcimadamore@1113 | 196 | |
mcimadamore@1113 | 197 | /** |
mcimadamore@1113 | 198 | * Returns a copy of the input buffer, up to its inputLength. |
mcimadamore@1113 | 199 | * Unicode escape sequences are not translated. |
mcimadamore@1113 | 200 | */ |
mcimadamore@1113 | 201 | public char[] getRawCharacters() { |
mcimadamore@1113 | 202 | char[] chars = new char[buflen]; |
mcimadamore@1113 | 203 | System.arraycopy(buf, 0, chars, 0, buflen); |
mcimadamore@1113 | 204 | return chars; |
mcimadamore@1113 | 205 | } |
mcimadamore@1113 | 206 | |
mcimadamore@1113 | 207 | /** |
mcimadamore@1113 | 208 | * Returns a copy of a character array subset of the input buffer. |
mcimadamore@1113 | 209 | * The returned array begins at the <code>beginIndex</code> and |
mcimadamore@1113 | 210 | * extends to the character at index <code>endIndex - 1</code>. |
mcimadamore@1113 | 211 | * Thus the length of the substring is <code>endIndex-beginIndex</code>. |
mcimadamore@1113 | 212 | * This behavior is like |
mcimadamore@1113 | 213 | * <code>String.substring(beginIndex, endIndex)</code>. |
mcimadamore@1113 | 214 | * Unicode escape sequences are not translated. |
mcimadamore@1113 | 215 | * |
mcimadamore@1113 | 216 | * @param beginIndex the beginning index, inclusive. |
mcimadamore@1113 | 217 | * @param endIndex the ending index, exclusive. |
mcimadamore@1113 | 218 | * @throws IndexOutOfBounds if either offset is outside of the |
mcimadamore@1113 | 219 | * array bounds |
mcimadamore@1113 | 220 | */ |
mcimadamore@1113 | 221 | public char[] getRawCharacters(int beginIndex, int endIndex) { |
mcimadamore@1113 | 222 | int length = endIndex - beginIndex; |
mcimadamore@1113 | 223 | char[] chars = new char[length]; |
mcimadamore@1113 | 224 | System.arraycopy(buf, beginIndex, chars, 0, length); |
mcimadamore@1113 | 225 | return chars; |
mcimadamore@1113 | 226 | } |
mcimadamore@1113 | 227 | } |