Thu, 28 Mar 2019 15:18:20 +0000
8207760: SAXException: Invalid UTF-16 surrogate detected: d83c ?
Summary: Properly handle UTF-16 surrogate pairs that are split across buffer chunks.
Reviewed-by: lancea, dfuchs
1.1 --- a/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java Mon Mar 18 09:00:19 2019 +0100 1.2 +++ b/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java Thu Mar 28 15:18:20 2019 +0000 1.3 @@ -1,6 +1,5 @@ 1.4 /* 1.5 - * reserved comment block 1.6 - * DO NOT REMOVE OR ALTER! 1.7 + * Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved. 1.8 */ 1.9 /* 1.10 * Copyright 2001-2004 The Apache Software Foundation. 1.11 @@ -43,6 +42,7 @@ 1.12 * because it is used from another package. 1.13 * 1.14 * @xsl.usage internal 1.15 + * @LastModified: Sept 2018 1.16 */ 1.17 public final class ToHTMLStream extends ToStream 1.18 { 1.19 @@ -1021,7 +1021,7 @@ 1.20 String name, 1.21 String value, 1.22 ElemDesc elemDesc) 1.23 - throws IOException 1.24 + throws IOException, SAXException 1.25 { 1.26 writer.write(' '); 1.27 1.28 @@ -1345,7 +1345,7 @@ 1.29 */ 1.30 public void writeAttrString( 1.31 final java.io.Writer writer, String string, String encoding) 1.32 - throws IOException 1.33 + throws IOException, SAXException 1.34 { 1.35 final int end = string.length(); 1.36 if (end > m_attrBuff.length) 1.37 @@ -1397,13 +1397,16 @@ 1.38 } 1.39 else 1.40 { 1.41 - if (Encodings.isHighUTF16Surrogate(ch)) 1.42 + if (Encodings.isHighUTF16Surrogate(ch) || 1.43 + Encodings.isLowUTF16Surrogate(ch)) 1.44 { 1.45 - 1.46 - writeUTF16Surrogate(ch, chars, i, end); 1.47 - i++; // two input characters processed 1.48 - // this increments by one and the for() 1.49 - // loop itself increments by another one. 1.50 + if (writeUTF16Surrogate(ch, chars, i, end) >= 0) { 1.51 + // move the index if the low surrogate is consumed 1.52 + // as writeUTF16Surrogate has written the pair 1.53 + if (Encodings.isHighUTF16Surrogate(ch)) { 1.54 + i++; 1.55 + } 1.56 + } 1.57 } 1.58 1.59 // The next is kind of a hack to keep from escaping in the case
2.1 --- a/src/com/sun/org/apache/xml/internal/serializer/ToStream.java Mon Mar 18 09:00:19 2019 +0100 2.2 +++ b/src/com/sun/org/apache/xml/internal/serializer/ToStream.java Thu Mar 28 15:18:20 2019 +0000 2.3 @@ -1,6 +1,5 @@ 2.4 /* 2.5 - * reserved comment block 2.6 - * DO NOT REMOVE OR ALTER! 2.7 + * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved. 2.8 */ 2.9 /* 2.10 * Copyright 2001-2004 The Apache Software Foundation. 2.11 @@ -51,6 +50,7 @@ 2.12 * serializers (xml, html, text ...) that write output to a stream. 2.13 * 2.14 * @xsl.usage internal 2.15 + * @LastModified: Sept 2018 2.16 */ 2.17 abstract public class ToStream extends SerializerBase 2.18 { 2.19 @@ -200,6 +200,7 @@ 2.20 */ 2.21 private boolean m_expandDTDEntities = true; 2.22 2.23 + private char m_highSurrogate = 0; 2.24 2.25 /** 2.26 * Default constructor 2.27 @@ -947,45 +948,46 @@ 2.28 * @param ch Character array. 2.29 * @param i position Where the surrogate was detected. 2.30 * @param end The end index of the significant characters. 2.31 - * @return 0 if the pair of characters was written out as-is, 2.32 - * the unicode code point of the character represented by 2.33 - * the surrogate pair if an entity reference with that value 2.34 - * was written out. 2.35 + * @return the status of writing a surrogate pair. 2.36 + * -1 -- nothing is written 2.37 + * 0 -- the pair is written as-is 2.38 + * code point -- the pair is written as an entity reference 2.39 * 2.40 * @throws IOException 2.41 * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected. 
2.42 */ 2.43 protected int writeUTF16Surrogate(char c, char ch[], int i, int end) 2.44 - throws IOException 2.45 + throws IOException, SAXException 2.46 { 2.47 - int codePoint = 0; 2.48 + int status = -1; 2.49 if (i + 1 >= end) 2.50 { 2.51 - throw new IOException( 2.52 - Utils.messages.createMessage( 2.53 - MsgKey.ER_INVALID_UTF16_SURROGATE, 2.54 - new Object[] { Integer.toHexString((int) c)})); 2.55 + m_highSurrogate = c; 2.56 + return status; 2.57 } 2.58 2.59 - final char high = c; 2.60 - final char low = ch[i+1]; 2.61 + char high, low; 2.62 + if (m_highSurrogate == 0) { 2.63 + high = c; 2.64 + low = ch[i+1]; 2.65 + status = 0; 2.66 + } else { 2.67 + high = m_highSurrogate; 2.68 + low = c; 2.69 + m_highSurrogate = 0; 2.70 + } 2.71 + 2.72 if (!Encodings.isLowUTF16Surrogate(low)) { 2.73 - throw new IOException( 2.74 - Utils.messages.createMessage( 2.75 - MsgKey.ER_INVALID_UTF16_SURROGATE, 2.76 - new Object[] { 2.77 - Integer.toHexString((int) c) 2.78 - + " " 2.79 - + Integer.toHexString(low)})); 2.80 + throwIOE(high, low); 2.81 } 2.82 2.83 final java.io.Writer writer = m_writer; 2.84 2.85 // If we make it to here we have a valid high, low surrogate pair 2.86 - if (m_encodingInfo.isInEncoding(c,low)) { 2.87 + if (m_encodingInfo.isInEncoding(high,low)) { 2.88 // If the character formed by the surrogate pair 2.89 // is in the encoding, so just write it out 2.90 - writer.write(ch,i,2); 2.91 + writer.write(new char[]{high, low}, 0, 2); 2.92 } 2.93 else { 2.94 // Don't know what to do with this char, it is 2.95 @@ -993,24 +995,16 @@ 2.96 // a surrogate pair, so write out as an entity ref 2.97 final String encoding = getEncoding(); 2.98 if (encoding != null) { 2.99 - /* The output encoding is known, 2.100 - * so somthing is wrong. 
2.101 - */ 2.102 - codePoint = Encodings.toCodePoint(high, low); 2.103 - // not in the encoding, so write out a character reference 2.104 - writer.write('&'); 2.105 - writer.write('#'); 2.106 - writer.write(Integer.toString(codePoint)); 2.107 - writer.write(';'); 2.108 + status = writeCharRef(writer, high, low); 2.109 } else { 2.110 /* The output encoding is not known, 2.111 * so just write it out as-is. 2.112 */ 2.113 - writer.write(ch, i, 2); 2.114 + writer.write(new char[]{high, low}, 0, 2); 2.115 } 2.116 } 2.117 // non-zero only if character reference was written out. 2.118 - return codePoint; 2.119 + return status; 2.120 } 2.121 2.122 /** 2.123 @@ -1100,32 +1094,7 @@ 2.124 } 2.125 else if (isCData && (!escapingNotNeeded(c))) 2.126 { 2.127 - // if (i != 0) 2.128 - if (m_cdataTagOpen) 2.129 - closeCDATA(); 2.130 - 2.131 - // This needs to go into a function... 2.132 - if (Encodings.isHighUTF16Surrogate(c)) 2.133 - { 2.134 - writeUTF16Surrogate(c, ch, i, end); 2.135 - i++ ; // process two input characters 2.136 - } 2.137 - else 2.138 - { 2.139 - writer.write("&#"); 2.140 - 2.141 - String intStr = Integer.toString((int) c); 2.142 - 2.143 - writer.write(intStr); 2.144 - writer.write(';'); 2.145 - } 2.146 - 2.147 - // if ((i != 0) && (i < (end - 1))) 2.148 - // if (!m_cdataTagOpen && (i < (end - 1))) 2.149 - // { 2.150 - // writer.write(CDATA_DELIMITER_OPEN); 2.151 - // m_cdataTagOpen = true; 2.152 - // } 2.153 + i = handleEscaping(writer, c, ch, i, end); 2.154 } 2.155 else if ( 2.156 isCData 2.157 @@ -1149,25 +1118,8 @@ 2.158 } 2.159 writer.write(c); 2.160 } 2.161 - 2.162 - // This needs to go into a function... 
2.163 - else if (Encodings.isHighUTF16Surrogate(c)) 2.164 - { 2.165 - if (m_cdataTagOpen) 2.166 - closeCDATA(); 2.167 - writeUTF16Surrogate(c, ch, i, end); 2.168 - i++; // process two input characters 2.169 - } 2.170 - else 2.171 - { 2.172 - if (m_cdataTagOpen) 2.173 - closeCDATA(); 2.174 - writer.write("&#"); 2.175 - 2.176 - String intStr = Integer.toString((int) c); 2.177 - 2.178 - writer.write(intStr); 2.179 - writer.write(';'); 2.180 + else { 2.181 + i = handleEscaping(writer, c, ch, i, end); 2.182 } 2.183 } 2.184 } 2.185 @@ -1175,6 +1127,38 @@ 2.186 } 2.187 2.188 /** 2.189 + * Handles escaping, writes either with a surrogate pair or a character 2.190 + * reference. 2.191 + * 2.192 + * @param c the current char 2.193 + * @param ch the character array 2.194 + * @param i the current position 2.195 + * @param end the end index of the array 2.196 + * @return the next index 2.197 + * 2.198 + * @throws IOException 2.199 + * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected. 2.200 + */ 2.201 + private int handleEscaping(Writer writer, char c, char ch[], int i, int end) 2.202 + throws IOException, SAXException { 2.203 + if (Encodings.isHighUTF16Surrogate(c) || Encodings.isLowUTF16Surrogate(c)) 2.204 + { 2.205 + if (writeUTF16Surrogate(c, ch, i, end) >= 0) { 2.206 + // move the index if the low surrogate is consumed 2.207 + // as writeUTF16Surrogate has written the pair 2.208 + if (Encodings.isHighUTF16Surrogate(c)) { 2.209 + i++ ; 2.210 + } 2.211 + } 2.212 + } 2.213 + else 2.214 + { 2.215 + writeCharRef(writer, c); 2.216 + } 2.217 + return i; 2.218 + } 2.219 + 2.220 + /** 2.221 * Ends an un-escaping section. 
2.222 * 2.223 * @see #startNonEscaping 2.224 @@ -1242,7 +1226,7 @@ 2.225 } 2.226 m_ispreserve = true; 2.227 2.228 - if (shouldIndent()) 2.229 + if (!m_cdataTagOpen && shouldIndent()) 2.230 indent(); 2.231 2.232 boolean writeCDataBrackets = 2.233 @@ -1564,7 +1548,7 @@ 2.234 int i, 2.235 char ch, 2.236 int lastDirty, 2.237 - boolean fromTextNode) throws IOException 2.238 + boolean fromTextNode) throws IOException, SAXException 2.239 { 2.240 int startClean = lastDirty + 1; 2.241 // if we have some clean characters accumulated 2.242 @@ -1643,54 +1627,41 @@ 2.243 int len, 2.244 boolean fromTextNode, 2.245 boolean escLF) 2.246 - throws IOException 2.247 + throws IOException, SAXException 2.248 { 2.249 2.250 int pos = accumDefaultEntity(writer, ch, i, chars, len, fromTextNode, escLF); 2.251 2.252 if (i == pos) 2.253 { 2.254 + if (m_highSurrogate != 0) { 2.255 + if (!(Encodings.isLowUTF16Surrogate(ch))) { 2.256 + throwIOE(m_highSurrogate, ch); 2.257 + } 2.258 + writeCharRef(writer, m_highSurrogate, ch); 2.259 + m_highSurrogate = 0; 2.260 + return ++pos; 2.261 + } 2.262 + 2.263 if (Encodings.isHighUTF16Surrogate(ch)) 2.264 { 2.265 - 2.266 - // Should be the UTF-16 low surrogate of the hig/low pair. 2.267 - char next; 2.268 - // Unicode code point formed from the high/low pair. 2.269 - int codePoint = 0; 2.270 - 2.271 if (i + 1 >= len) 2.272 { 2.273 - throw new IOException( 2.274 - Utils.messages.createMessage( 2.275 - MsgKey.ER_INVALID_UTF16_SURROGATE, 2.276 - new Object[] { Integer.toHexString(ch)})); 2.277 - //"Invalid UTF-16 surrogate detected: " 2.278 - 2.279 - //+Integer.toHexString(ch)+ " ?"); 2.280 + // save for the next read 2.281 + m_highSurrogate = ch; 2.282 + pos++; 2.283 } 2.284 else 2.285 { 2.286 - next = chars[++i]; 2.287 + // the next should be the UTF-16 low surrogate of the hig/low pair. 
2.288 + char next = chars[++i]; 2.289 2.290 if (!(Encodings.isLowUTF16Surrogate(next))) 2.291 - throw new IOException( 2.292 - Utils.messages.createMessage( 2.293 - MsgKey 2.294 - .ER_INVALID_UTF16_SURROGATE, 2.295 - new Object[] { 2.296 - Integer.toHexString(ch) 2.297 - + " " 2.298 - + Integer.toHexString(next)})); 2.299 - //"Invalid UTF-16 surrogate detected: " 2.300 - 2.301 - //+Integer.toHexString(ch)+" "+Integer.toHexString(next)); 2.302 - codePoint = Encodings.toCodePoint(ch,next); 2.303 + throwIOE(ch, next); 2.304 + 2.305 + writeCharRef(writer, ch, next); 2.306 + pos += 2; // count the two characters that went into writing out this entity 2.307 } 2.308 - 2.309 - writer.write("&#"); 2.310 - writer.write(Integer.toString(codePoint)); 2.311 - writer.write(';'); 2.312 - pos += 2; // count the two characters that went into writing out this entity 2.313 } 2.314 else 2.315 { 2.316 @@ -1702,18 +1673,14 @@ 2.317 if (isCharacterInC0orC1Range(ch) || 2.318 (XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch))) 2.319 { 2.320 - writer.write("&#"); 2.321 - writer.write(Integer.toString(ch)); 2.322 - writer.write(';'); 2.323 + writeCharRef(writer, ch); 2.324 } 2.325 else if ((!escapingNotNeeded(ch) || 2.326 ( (fromTextNode && m_charInfo.isSpecialTextChar(ch)) 2.327 || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch)))) 2.328 - && m_elemContext.m_currentElemDepth > 0) 2.329 + && m_elemContext.m_currentElemDepth > 0) 2.330 { 2.331 - writer.write("&#"); 2.332 - writer.write(Integer.toString(ch)); 2.333 - writer.write(';'); 2.334 + writeCharRef(writer, ch); 2.335 } 2.336 else 2.337 { 2.338 @@ -1727,6 +1694,45 @@ 2.339 } 2.340 2.341 /** 2.342 + * Writes out a character reference. 
2.343 + * @param writer the writer 2.344 + * @param c the character 2.345 + * @throws IOException 2.346 + */ 2.347 + private void writeCharRef(Writer writer, char c) throws IOException, SAXException { 2.348 + if (m_cdataTagOpen) 2.349 + closeCDATA(); 2.350 + writer.write("&#"); 2.351 + writer.write(Integer.toString(c)); 2.352 + writer.write(';'); 2.353 + } 2.354 + 2.355 + /** 2.356 + * Writes out a pair of surrogates as a character reference 2.357 + * @param writer the writer 2.358 + * @param high the high surrogate 2.359 + * @param low the low surrogate 2.360 + * @throws IOException 2.361 + */ 2.362 + private int writeCharRef(Writer writer, char high, char low) throws IOException, SAXException { 2.363 + if (m_cdataTagOpen) 2.364 + closeCDATA(); 2.365 + // Unicode code point formed from the high/low pair. 2.366 + int codePoint = Encodings.toCodePoint(high, low); 2.367 + writer.write("&#"); 2.368 + writer.write(Integer.toString(codePoint)); 2.369 + writer.write(';'); 2.370 + return codePoint; 2.371 + } 2.372 + 2.373 + private void throwIOE(char ch, char next) throws IOException { 2.374 + throw new IOException(Utils.messages.createMessage( 2.375 + MsgKey.ER_INVALID_UTF16_SURROGATE, 2.376 + new Object[] {Integer.toHexString(ch) + " " 2.377 + + Integer.toHexString(next)})); 2.378 + } 2.379 + 2.380 + /** 2.381 * Receive notification of the beginning of an element, although this is a 2.382 * SAX method additional namespace or attribute information can occur before 2.383 * or after this call, that is associated with this element. 2.384 @@ -1962,7 +1968,7 @@ 2.385 Writer writer, 2.386 String string, 2.387 String encoding) 2.388 - throws IOException 2.389 + throws IOException, SAXException 2.390 { 2.391 final int len = string.length(); 2.392 if (len > m_attrBuff.length)
3.1 --- a/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java Mon Mar 18 09:00:19 2019 +0100 3.2 +++ b/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java Thu Mar 28 15:18:20 2019 +0000 3.3 @@ -1,6 +1,5 @@ 3.4 /* 3.5 - * reserved comment block 3.6 - * DO NOT REMOVE OR ALTER! 3.7 + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. 3.8 */ 3.9 /* 3.10 * Copyright 2001-2004 The Apache Software Foundation. 3.11 @@ -35,6 +34,7 @@ 3.12 * This class converts SAX or SAX-like calls to a 3.13 * serialized document for xsl:output method of "text". 3.14 * @xsl.usage internal 3.15 + * @LastModified: Sept 2018 3.16 */ 3.17 public final class ToTextStream extends ToStream 3.18 { 3.19 @@ -296,23 +296,32 @@ 3.20 } else if (m_encodingInfo.isInEncoding(c)) { 3.21 writer.write(c); 3.22 // one input char processed 3.23 - } else if (Encodings.isHighUTF16Surrogate(c)) { 3.24 + } else if (Encodings.isHighUTF16Surrogate(c) || 3.25 + Encodings.isLowUTF16Surrogate(c)) { 3.26 final int codePoint = writeUTF16Surrogate(c, ch, i, end); 3.27 - if (codePoint != 0) { 3.28 - // I think we can just emit the message, 3.29 - // not crash and burn. 3.30 - final String integralValue = Integer.toString(codePoint); 3.31 - final String msg = Utils.messages.createMessage( 3.32 - MsgKey.ER_ILLEGAL_CHARACTER, 3.33 - new Object[] { integralValue, encoding }); 3.34 + if (codePoint >= 0) { 3.35 + // move the index if the low surrogate is consumed 3.36 + // as writeUTF16Surrogate has written the pair 3.37 + if (Encodings.isHighUTF16Surrogate(c)) { 3.38 + i++; 3.39 + } 3.40 3.41 - //Older behavior was to throw the message, 3.42 - //but newer gentler behavior is to write a message to System.err 3.43 - //throw new SAXException(msg); 3.44 - System.err.println(msg); 3.45 + // printing to the console is not appropriate, but will leave 3.46 + // it as is for compatibility. 
3.47 + if (codePoint >0) { 3.48 + // I think we can just emit the message, 3.49 + // not crash and burn. 3.50 + final String integralValue = Integer.toString(codePoint); 3.51 + final String msg = Utils.messages.createMessage( 3.52 + MsgKey.ER_ILLEGAL_CHARACTER, 3.53 + new Object[] { integralValue, encoding }); 3.54 3.55 + //Older behavior was to throw the message, 3.56 + //but newer gentler behavior is to write a message to System.err 3.57 + //throw new SAXException(msg); 3.58 + System.err.println(msg); 3.59 + } 3.60 } 3.61 - i++; // two input chars processed 3.62 } else { 3.63 // Don't know what to do with this char, it is 3.64 // not in the encoding and not a high char in