1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/share/classes/com/sun/tools/javac/parser/UnicodeReader.java Mon Oct 24 13:00:20 2011 +0100 1.3 @@ -0,0 +1,227 @@ 1.4 +/* 1.5 + * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. 1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 1.7 + * 1.8 + * This code is free software; you can redistribute it and/or modify it 1.9 + * under the terms of the GNU General Public License version 2 only, as 1.10 + * published by the Free Software Foundation. Oracle designates this 1.11 + * particular file as subject to the "Classpath" exception as provided 1.12 + * by Oracle in the LICENSE file that accompanied this code. 1.13 + * 1.14 + * This code is distributed in the hope that it will be useful, but WITHOUT 1.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 1.16 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 1.17 + * version 2 for more details (a copy is included in the LICENSE file that 1.18 + * accompanied this code). 1.19 + * 1.20 + * You should have received a copy of the GNU General Public License version 1.21 + * 2 along with this work; if not, write to the Free Software Foundation, 1.22 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 1.23 + * 1.24 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 1.25 + * or visit www.oracle.com if you need additional information or have any 1.26 + * questions. 1.27 + */ 1.28 + 1.29 +package com.sun.tools.javac.parser; 1.30 + 1.31 +import com.sun.tools.javac.file.JavacFileManager; 1.32 +import java.nio.CharBuffer; 1.33 +import com.sun.tools.javac.util.Log; 1.34 +import static com.sun.tools.javac.util.LayoutCharacters.*; 1.35 + 1.36 +/** The char reader used by the javac lexer/tokenizer. Returns the sequence of 1.37 + * characters contained in the input stream, handling unicode escape accordingly. 1.38 + * Additionally, it provide features for saving chars into a buffer and to retrieve 1.39 + * them at a later stage. 1.40 + * 1.41 + * <p><b>This is NOT part of any supported API. 1.42 + * If you write code that depends on this, you do so at your own risk. 1.43 + * This code and its internal interfaces are subject to change or 1.44 + * deletion without notice.</b> 1.45 + */ 1.46 +public class UnicodeReader { 1.47 + 1.48 + /** The input buffer, index of next character to be read, 1.49 + * index of one past last character in buffer. 1.50 + */ 1.51 + protected char[] buf; 1.52 + protected int bp; 1.53 + protected final int buflen; 1.54 + 1.55 + /** The current character. 1.56 + */ 1.57 + protected char ch; 1.58 + 1.59 + /** The buffer index of the last converted unicode character 1.60 + */ 1.61 + protected int unicodeConversionBp = -1; 1.62 + 1.63 + protected Log log; 1.64 + 1.65 + /** 1.66 + * Create a scanner from the input array. This method might 1.67 + * modify the array. To avoid copying the input array, ensure 1.68 + * that {@code inputLength < input.length} or 1.69 + * {@code input[input.length -1]} is a white space character. 1.70 + * 1.71 + * @param fac the factory which created this Scanner 1.72 + * @param input the input, might be modified 1.73 + * @param inputLength the size of the input. 1.74 + * Must be positive and less than or equal to input.length. 1.75 + */ 1.76 + protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) { 1.77 + this(sf, JavacFileManager.toArray(buffer), buffer.limit()); 1.78 + } 1.79 + 1.80 + protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) { 1.81 + log = sf.log; 1.82 + if (inputLength == input.length) { 1.83 + if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) { 1.84 + inputLength--; 1.85 + } else { 1.86 + char[] newInput = new char[inputLength + 1]; 1.87 + System.arraycopy(input, 0, newInput, 0, input.length); 1.88 + input = newInput; 1.89 + } 1.90 + } 1.91 + buf = input; 1.92 + buflen = inputLength; 1.93 + buf[buflen] = EOI; 1.94 + bp = -1; 1.95 + scanChar(); 1.96 + } 1.97 + 1.98 + /** Read next character. 1.99 + */ 1.100 + protected void scanChar() { 1.101 + if (bp < buflen) { 1.102 + ch = buf[++bp]; 1.103 + if (ch == '\\') { 1.104 + convertUnicode(); 1.105 + } 1.106 + } 1.107 + } 1.108 + 1.109 + /** Convert unicode escape; bp points to initial '\' character 1.110 + * (Spec 3.3). 1.111 + */ 1.112 + protected void convertUnicode() { 1.113 + if (ch == '\\' && unicodeConversionBp != bp) { 1.114 + bp++; ch = buf[bp]; 1.115 + if (ch == 'u') { 1.116 + do { 1.117 + bp++; ch = buf[bp]; 1.118 + } while (ch == 'u'); 1.119 + int limit = bp + 3; 1.120 + if (limit < buflen) { 1.121 + int d = digit(bp, 16); 1.122 + int code = d; 1.123 + while (bp < limit && d >= 0) { 1.124 + bp++; ch = buf[bp]; 1.125 + d = digit(bp, 16); 1.126 + code = (code << 4) + d; 1.127 + } 1.128 + if (d >= 0) { 1.129 + ch = (char)code; 1.130 + unicodeConversionBp = bp; 1.131 + return; 1.132 + } 1.133 + } 1.134 + log.error(bp, "illegal.unicode.esc"); 1.135 + } else { 1.136 + bp--; 1.137 + ch = '\\'; 1.138 + } 1.139 + } 1.140 + } 1.141 + 1.142 + /** Are surrogates supported? 1.143 + */ 1.144 + final static boolean surrogatesSupported = surrogatesSupported(); 1.145 + private static boolean surrogatesSupported() { 1.146 + try { 1.147 + Character.isHighSurrogate('a'); 1.148 + return true; 1.149 + } catch (NoSuchMethodError ex) { 1.150 + return false; 1.151 + } 1.152 + } 1.153 + 1.154 + /** Scan surrogate pairs. If 'ch' is a high surrogate and 1.155 + * the next character is a low surrogate, then put the low 1.156 + * surrogate in 'ch', and return the high surrogate. 1.157 + * otherwise, just return 0. 1.158 + */ 1.159 + protected char scanSurrogates() { 1.160 + if (surrogatesSupported && Character.isHighSurrogate(ch)) { 1.161 + char high = ch; 1.162 + 1.163 + scanChar(); 1.164 + 1.165 + if (Character.isLowSurrogate(ch)) { 1.166 + return high; 1.167 + } 1.168 + 1.169 + ch = high; 1.170 + } 1.171 + 1.172 + return 0; 1.173 + } 1.174 + 1.175 + /** Convert an ASCII digit from its base (8, 10, or 16) 1.176 + * to its value. 1.177 + */ 1.178 + protected int digit(int pos, int base) { 1.179 + char c = ch; 1.180 + int result = Character.digit(c, base); 1.181 + if (result >= 0 && c > 0x7f) { 1.182 + log.error(pos + 1, "illegal.nonascii.digit"); 1.183 + ch = "0123456789abcdef".charAt(result); 1.184 + } 1.185 + return result; 1.186 + } 1.187 + 1.188 + protected boolean isUnicode() { 1.189 + return unicodeConversionBp == bp; 1.190 + } 1.191 + 1.192 + protected void skipChar() { 1.193 + bp++; 1.194 + } 1.195 + 1.196 + protected char peekChar() { 1.197 + return buf[bp + 1]; 1.198 + } 1.199 + 1.200 + /** 1.201 + * Returns a copy of the input buffer, up to its inputLength. 1.202 + * Unicode escape sequences are not translated. 1.203 + */ 1.204 + public char[] getRawCharacters() { 1.205 + char[] chars = new char[buflen]; 1.206 + System.arraycopy(buf, 0, chars, 0, buflen); 1.207 + return chars; 1.208 + } 1.209 + 1.210 + /** 1.211 + * Returns a copy of a character array subset of the input buffer. 1.212 + * The returned array begins at the <code>beginIndex</code> and 1.213 + * extends to the character at index <code>endIndex - 1</code>. 1.214 + * Thus the length of the substring is <code>endIndex-beginIndex</code>. 1.215 + * This behavior is like 1.216 + * <code>String.substring(beginIndex, endIndex)</code>. 1.217 + * Unicode escape sequences are not translated. 1.218 + * 1.219 + * @param beginIndex the beginning index, inclusive. 1.220 + * @param endIndex the ending index, exclusive. 1.221 + * @throws IndexOutOfBounds if either offset is outside of the 1.222 + * array bounds 1.223 + */ 1.224 + public char[] getRawCharacters(int beginIndex, int endIndex) { 1.225 + int length = endIndex - beginIndex; 1.226 + char[] chars = new char[length]; 1.227 + System.arraycopy(buf, beginIndex, chars, 0, length); 1.228 + return chars; 1.229 + } 1.230 +}