8207760: SAXException: Invalid UTF-16 surrogate detected: d83c ?

Thu, 28 Mar 2019 15:18:20 +0000

author
joehw
date
Thu, 28 Mar 2019 15:18:20 +0000
changeset 1996
ff33172d2522
parent 1995
35cdcb220a39
child 1997
9842fc4ed823

8207760: SAXException: Invalid UTF-16 surrogate detected: d83c ?
Summary: Properly handle UTF-16 surrogate pairs split across buffer chunks.
Reviewed-by: lancea, dfuchs

src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java file | annotate | diff | comparison | revisions
src/com/sun/org/apache/xml/internal/serializer/ToStream.java file | annotate | diff | comparison | revisions
src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java file | annotate | diff | comparison | revisions
     1.1 --- a/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java	Mon Mar 18 09:00:19 2019 +0100
     1.2 +++ b/src/com/sun/org/apache/xml/internal/serializer/ToHTMLStream.java	Thu Mar 28 15:18:20 2019 +0000
     1.3 @@ -1,6 +1,5 @@
     1.4  /*
     1.5 - * reserved comment block
     1.6 - * DO NOT REMOVE OR ALTER!
     1.7 + * Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved.
     1.8   */
     1.9  /*
    1.10   * Copyright 2001-2004 The Apache Software Foundation.
    1.11 @@ -43,6 +42,7 @@
    1.12   * because it is used from another package.
    1.13   *
    1.14   * @xsl.usage internal
    1.15 + * @LastModified: Sept 2018
    1.16   */
    1.17  public final class ToHTMLStream extends ToStream
    1.18  {
    1.19 @@ -1021,7 +1021,7 @@
    1.20          String name,
    1.21          String value,
    1.22          ElemDesc elemDesc)
    1.23 -        throws IOException
    1.24 +        throws IOException, SAXException
    1.25      {
    1.26          writer.write(' ');
    1.27  
    1.28 @@ -1345,7 +1345,7 @@
    1.29       */
    1.30      public void writeAttrString(
    1.31          final java.io.Writer writer, String string, String encoding)
    1.32 -        throws IOException
    1.33 +        throws IOException, SAXException
    1.34      {
    1.35          final int end = string.length();
    1.36          if (end > m_attrBuff.length)
    1.37 @@ -1397,13 +1397,16 @@
    1.38                  }
    1.39                  else
    1.40                  {
    1.41 -                    if (Encodings.isHighUTF16Surrogate(ch))
    1.42 +                    if (Encodings.isHighUTF16Surrogate(ch) ||
    1.43 +                            Encodings.isLowUTF16Surrogate(ch))
    1.44                      {
    1.45 -
    1.46 -                            writeUTF16Surrogate(ch, chars, i, end);
    1.47 -                            i++; // two input characters processed
    1.48 -                                 // this increments by one and the for()
    1.49 -                                 // loop itself increments by another one.
    1.50 +                        if (writeUTF16Surrogate(ch, chars, i, end) >= 0) {
    1.51 +                            // move the index if the low surrogate is consumed
    1.52 +                            // as writeUTF16Surrogate has written the pair
    1.53 +                            if (Encodings.isHighUTF16Surrogate(ch)) {
    1.54 +                                i++;
    1.55 +                            }
    1.56 +                        }
    1.57                      }
    1.58  
    1.59                      // The next is kind of a hack to keep from escaping in the case
     2.1 --- a/src/com/sun/org/apache/xml/internal/serializer/ToStream.java	Mon Mar 18 09:00:19 2019 +0100
     2.2 +++ b/src/com/sun/org/apache/xml/internal/serializer/ToStream.java	Thu Mar 28 15:18:20 2019 +0000
     2.3 @@ -1,6 +1,5 @@
     2.4  /*
     2.5 - * reserved comment block
     2.6 - * DO NOT REMOVE OR ALTER!
     2.7 + * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
     2.8   */
     2.9  /*
    2.10   * Copyright 2001-2004 The Apache Software Foundation.
    2.11 @@ -51,6 +50,7 @@
    2.12   * serializers (xml, html, text ...) that write output to a stream.
    2.13   *
    2.14   * @xsl.usage internal
    2.15 + * @LastModified: Sept 2018
    2.16   */
    2.17  abstract public class ToStream extends SerializerBase
    2.18  {
    2.19 @@ -200,6 +200,7 @@
    2.20       */
    2.21      private boolean m_expandDTDEntities = true;
    2.22  
    2.23 +    private char m_highSurrogate = 0;
    2.24  
    2.25      /**
    2.26       * Default constructor
    2.27 @@ -947,45 +948,46 @@
    2.28       * @param ch Character array.
    2.29       * @param i position Where the surrogate was detected.
    2.30       * @param end The end index of the significant characters.
    2.31 -     * @return 0 if the pair of characters was written out as-is,
    2.32 -     * the unicode code point of the character represented by
    2.33 -     * the surrogate pair if an entity reference with that value
    2.34 -     * was written out.
    2.35 +     * @return the status of writing a surrogate pair.
    2.36 +     *        -1 -- nothing is written
    2.37 +     *         0 -- the pair is written as-is
    2.38 +     *         code point -- the pair is written as an entity reference
    2.39       *
    2.40       * @throws IOException
    2.41       * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
    2.42       */
    2.43      protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
    2.44 -        throws IOException
    2.45 +        throws IOException, SAXException
    2.46      {
    2.47 -        int codePoint = 0;
    2.48 +        int status = -1;
    2.49          if (i + 1 >= end)
    2.50          {
    2.51 -            throw new IOException(
    2.52 -                Utils.messages.createMessage(
    2.53 -                    MsgKey.ER_INVALID_UTF16_SURROGATE,
    2.54 -                    new Object[] { Integer.toHexString((int) c)}));
    2.55 +            m_highSurrogate = c;
    2.56 +            return status;
    2.57          }
    2.58  
    2.59 -        final char high = c;
    2.60 -        final char low = ch[i+1];
    2.61 +        char high, low;
    2.62 +        if (m_highSurrogate == 0) {
    2.63 +            high = c;
    2.64 +            low = ch[i+1];
    2.65 +            status = 0;
    2.66 +        } else {
    2.67 +            high = m_highSurrogate;
    2.68 +            low = c;
    2.69 +            m_highSurrogate = 0;
    2.70 +        }
    2.71 +
    2.72          if (!Encodings.isLowUTF16Surrogate(low)) {
    2.73 -            throw new IOException(
    2.74 -                Utils.messages.createMessage(
    2.75 -                    MsgKey.ER_INVALID_UTF16_SURROGATE,
    2.76 -                    new Object[] {
    2.77 -                        Integer.toHexString((int) c)
    2.78 -                            + " "
    2.79 -                            + Integer.toHexString(low)}));
    2.80 +            throwIOE(high, low);
    2.81          }
    2.82  
    2.83          final java.io.Writer writer = m_writer;
    2.84  
    2.85          // If we make it to here we have a valid high, low surrogate pair
    2.86 -        if (m_encodingInfo.isInEncoding(c,low)) {
    2.87 +        if (m_encodingInfo.isInEncoding(high,low)) {
    2.88              // If the character formed by the surrogate pair
    2.89              // is in the encoding, so just write it out
    2.90 -            writer.write(ch,i,2);
    2.91 +            writer.write(new char[]{high, low}, 0, 2);
    2.92          }
    2.93          else {
    2.94              // Don't know what to do with this char, it is
    2.95 @@ -993,24 +995,16 @@
    2.96              // a surrogate pair, so write out as an entity ref
    2.97              final String encoding = getEncoding();
    2.98              if (encoding != null) {
    2.99 -                /* The output encoding is known,
   2.100 -                 * so somthing is wrong.
   2.101 -                  */
   2.102 -                codePoint = Encodings.toCodePoint(high, low);
   2.103 -                // not in the encoding, so write out a character reference
   2.104 -                writer.write('&');
   2.105 -                writer.write('#');
   2.106 -                writer.write(Integer.toString(codePoint));
   2.107 -                writer.write(';');
   2.108 +                status = writeCharRef(writer, high, low);
   2.109              } else {
   2.110                  /* The output encoding is not known,
   2.111                   * so just write it out as-is.
   2.112                   */
   2.113 -                writer.write(ch, i, 2);
   2.114 +                writer.write(new char[]{high, low}, 0, 2);
   2.115              }
   2.116          }
   2.117          // non-zero only if character reference was written out.
   2.118 -        return codePoint;
   2.119 +        return status;
   2.120      }
   2.121  
   2.122      /**
   2.123 @@ -1100,32 +1094,7 @@
   2.124              }
   2.125              else if (isCData && (!escapingNotNeeded(c)))
   2.126              {
   2.127 -                //                if (i != 0)
   2.128 -                if (m_cdataTagOpen)
   2.129 -                    closeCDATA();
   2.130 -
   2.131 -                // This needs to go into a function...
   2.132 -                if (Encodings.isHighUTF16Surrogate(c))
   2.133 -                {
   2.134 -                    writeUTF16Surrogate(c, ch, i, end);
   2.135 -                    i++ ; // process two input characters
   2.136 -                }
   2.137 -                else
   2.138 -                {
   2.139 -                    writer.write("&#");
   2.140 -
   2.141 -                    String intStr = Integer.toString((int) c);
   2.142 -
   2.143 -                    writer.write(intStr);
   2.144 -                    writer.write(';');
   2.145 -                }
   2.146 -
   2.147 -                //                if ((i != 0) && (i < (end - 1)))
   2.148 -                //                if (!m_cdataTagOpen && (i < (end - 1)))
   2.149 -                //                {
   2.150 -                //                    writer.write(CDATA_DELIMITER_OPEN);
   2.151 -                //                    m_cdataTagOpen = true;
   2.152 -                //                }
   2.153 +                i = handleEscaping(writer, c, ch, i, end);
   2.154              }
   2.155              else if (
   2.156                  isCData
   2.157 @@ -1149,25 +1118,8 @@
   2.158                      }
   2.159                      writer.write(c);
   2.160                  }
   2.161 -
   2.162 -                // This needs to go into a function...
   2.163 -                else if (Encodings.isHighUTF16Surrogate(c))
   2.164 -                {
   2.165 -                    if (m_cdataTagOpen)
   2.166 -                        closeCDATA();
   2.167 -                    writeUTF16Surrogate(c, ch, i, end);
   2.168 -                    i++; // process two input characters
   2.169 -                }
   2.170 -                else
   2.171 -                {
   2.172 -                    if (m_cdataTagOpen)
   2.173 -                        closeCDATA();
   2.174 -                    writer.write("&#");
   2.175 -
   2.176 -                    String intStr = Integer.toString((int) c);
   2.177 -
   2.178 -                    writer.write(intStr);
   2.179 -                    writer.write(';');
   2.180 +                else {
   2.181 +                    i = handleEscaping(writer, c, ch, i, end);
   2.182                  }
   2.183              }
   2.184          }
   2.185 @@ -1175,6 +1127,38 @@
   2.186      }
   2.187  
   2.188      /**
   2.189 +     * Handles escaping, writes either with a surrogate pair or a character
   2.190 +     * reference.
   2.191 +     *
   2.192 +     * @param c the current char
   2.193 +     * @param ch the character array
   2.194 +     * @param i the current position
   2.195 +     * @param end the end index of the array
   2.196 +     * @return the next index
   2.197 +     *
   2.198 +     * @throws IOException
   2.199 +     * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
   2.200 +     */
   2.201 +    private int handleEscaping(Writer writer, char c, char ch[], int i, int end)
   2.202 +            throws IOException, SAXException {
   2.203 +        if (Encodings.isHighUTF16Surrogate(c) || Encodings.isLowUTF16Surrogate(c))
   2.204 +        {
   2.205 +            if (writeUTF16Surrogate(c, ch, i, end) >= 0) {
   2.206 +                // move the index if the low surrogate is consumed
   2.207 +                // as writeUTF16Surrogate has written the pair
   2.208 +                if (Encodings.isHighUTF16Surrogate(c)) {
   2.209 +                    i++ ;
   2.210 +                }
   2.211 +            }
   2.212 +        }
   2.213 +        else
   2.214 +        {
   2.215 +            writeCharRef(writer, c);
   2.216 +        }
   2.217 +        return i;
   2.218 +    }
   2.219 +
   2.220 +    /**
   2.221       * Ends an un-escaping section.
   2.222       *
   2.223       * @see #startNonEscaping
   2.224 @@ -1242,7 +1226,7 @@
   2.225              }
   2.226              m_ispreserve = true;
   2.227  
   2.228 -            if (shouldIndent())
   2.229 +            if (!m_cdataTagOpen && shouldIndent())
   2.230                  indent();
   2.231  
   2.232              boolean writeCDataBrackets =
   2.233 @@ -1564,7 +1548,7 @@
   2.234          int i,
   2.235          char ch,
   2.236          int lastDirty,
   2.237 -        boolean fromTextNode) throws IOException
   2.238 +        boolean fromTextNode) throws IOException, SAXException
   2.239      {
   2.240          int startClean = lastDirty + 1;
   2.241          // if we have some clean characters accumulated
   2.242 @@ -1643,54 +1627,41 @@
   2.243          int len,
   2.244          boolean fromTextNode,
   2.245          boolean escLF)
   2.246 -        throws IOException
   2.247 +        throws IOException, SAXException
   2.248      {
   2.249  
   2.250          int pos = accumDefaultEntity(writer, ch, i, chars, len, fromTextNode, escLF);
   2.251  
   2.252          if (i == pos)
   2.253          {
   2.254 +            if (m_highSurrogate != 0) {
   2.255 +                if (!(Encodings.isLowUTF16Surrogate(ch))) {
   2.256 +                    throwIOE(m_highSurrogate, ch);
   2.257 +                }
   2.258 +                writeCharRef(writer, m_highSurrogate, ch);
   2.259 +                m_highSurrogate = 0;
   2.260 +                return ++pos;
   2.261 +            }
   2.262 +
   2.263              if (Encodings.isHighUTF16Surrogate(ch))
   2.264              {
   2.265 -
   2.266 -                // Should be the UTF-16 low surrogate of the hig/low pair.
   2.267 -                char next;
   2.268 -                // Unicode code point formed from the high/low pair.
   2.269 -                int codePoint = 0;
   2.270 -
   2.271                  if (i + 1 >= len)
   2.272                  {
   2.273 -                    throw new IOException(
   2.274 -                        Utils.messages.createMessage(
   2.275 -                            MsgKey.ER_INVALID_UTF16_SURROGATE,
   2.276 -                            new Object[] { Integer.toHexString(ch)}));
   2.277 -                    //"Invalid UTF-16 surrogate detected: "
   2.278 -
   2.279 -                    //+Integer.toHexString(ch)+ " ?");
   2.280 +                    // save for the next read
   2.281 +                    m_highSurrogate = ch;
   2.282 +                    pos++;
   2.283                  }
   2.284                  else
   2.285                  {
   2.286 -                    next = chars[++i];
    2.287 +                    // the next should be the UTF-16 low surrogate of the high/low pair.
   2.288 +                    char next = chars[++i];
   2.289  
   2.290                      if (!(Encodings.isLowUTF16Surrogate(next)))
   2.291 -                        throw new IOException(
   2.292 -                            Utils.messages.createMessage(
   2.293 -                                MsgKey
   2.294 -                                    .ER_INVALID_UTF16_SURROGATE,
   2.295 -                                new Object[] {
   2.296 -                                    Integer.toHexString(ch)
   2.297 -                                        + " "
   2.298 -                                        + Integer.toHexString(next)}));
   2.299 -                    //"Invalid UTF-16 surrogate detected: "
   2.300 -
   2.301 -                    //+Integer.toHexString(ch)+" "+Integer.toHexString(next));
   2.302 -                    codePoint = Encodings.toCodePoint(ch,next);
   2.303 +                        throwIOE(ch, next);
   2.304 +
   2.305 +                    writeCharRef(writer, ch, next);
   2.306 +                    pos += 2; // count the two characters that went into writing out this entity
   2.307                  }
   2.308 -
   2.309 -                writer.write("&#");
   2.310 -                writer.write(Integer.toString(codePoint));
   2.311 -                writer.write(';');
   2.312 -                pos += 2; // count the two characters that went into writing out this entity
   2.313              }
   2.314              else
   2.315              {
   2.316 @@ -1702,18 +1673,14 @@
   2.317                  if (isCharacterInC0orC1Range(ch) ||
   2.318                          (XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch)))
   2.319                  {
   2.320 -                    writer.write("&#");
   2.321 -                    writer.write(Integer.toString(ch));
   2.322 -                    writer.write(';');
   2.323 +                    writeCharRef(writer, ch);
   2.324                  }
   2.325                  else if ((!escapingNotNeeded(ch) ||
   2.326                      (  (fromTextNode && m_charInfo.isSpecialTextChar(ch))
   2.327                       || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch))))
   2.328 -                && m_elemContext.m_currentElemDepth > 0)
   2.329 +                     && m_elemContext.m_currentElemDepth > 0)
   2.330                  {
   2.331 -                    writer.write("&#");
   2.332 -                    writer.write(Integer.toString(ch));
   2.333 -                    writer.write(';');
   2.334 +                    writeCharRef(writer, ch);
   2.335                  }
   2.336                  else
   2.337                  {
   2.338 @@ -1727,6 +1694,45 @@
   2.339      }
   2.340  
   2.341      /**
   2.342 +     * Writes out a character reference.
   2.343 +     * @param writer the writer
   2.344 +     * @param c the character
   2.345 +     * @throws IOException
   2.346 +     */
   2.347 +    private void writeCharRef(Writer writer, char c) throws IOException, SAXException {
   2.348 +        if (m_cdataTagOpen)
   2.349 +            closeCDATA();
   2.350 +        writer.write("&#");
   2.351 +        writer.write(Integer.toString(c));
   2.352 +        writer.write(';');
   2.353 +    }
   2.354 +
   2.355 +    /**
   2.356 +     * Writes out a pair of surrogates as a character reference
   2.357 +     * @param writer the writer
   2.358 +     * @param high the high surrogate
   2.359 +     * @param low the low surrogate
   2.360 +     * @throws IOException
   2.361 +     */
   2.362 +    private int writeCharRef(Writer writer, char high, char low) throws IOException, SAXException {
   2.363 +        if (m_cdataTagOpen)
   2.364 +            closeCDATA();
   2.365 +        // Unicode code point formed from the high/low pair.
   2.366 +        int codePoint = Encodings.toCodePoint(high, low);
   2.367 +        writer.write("&#");
   2.368 +        writer.write(Integer.toString(codePoint));
   2.369 +        writer.write(';');
   2.370 +        return codePoint;
   2.371 +    }
   2.372 +
   2.373 +    private void throwIOE(char ch, char next) throws IOException {
   2.374 +        throw new IOException(Utils.messages.createMessage(
   2.375 +                MsgKey.ER_INVALID_UTF16_SURROGATE,
   2.376 +                new Object[] {Integer.toHexString(ch) + " "
   2.377 +                        + Integer.toHexString(next)}));
   2.378 +    }
   2.379 +
   2.380 +    /**
   2.381       * Receive notification of the beginning of an element, although this is a
   2.382       * SAX method additional namespace or attribute information can occur before
   2.383       * or after this call, that is associated with this element.
   2.384 @@ -1962,7 +1968,7 @@
   2.385          Writer writer,
   2.386          String string,
   2.387          String encoding)
   2.388 -        throws IOException
   2.389 +        throws IOException, SAXException
   2.390      {
   2.391          final int len = string.length();
   2.392          if (len > m_attrBuff.length)
     3.1 --- a/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java	Mon Mar 18 09:00:19 2019 +0100
     3.2 +++ b/src/com/sun/org/apache/xml/internal/serializer/ToTextStream.java	Thu Mar 28 15:18:20 2019 +0000
     3.3 @@ -1,6 +1,5 @@
     3.4  /*
     3.5 - * reserved comment block
     3.6 - * DO NOT REMOVE OR ALTER!
     3.7 + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
     3.8   */
     3.9  /*
    3.10   * Copyright 2001-2004 The Apache Software Foundation.
    3.11 @@ -35,6 +34,7 @@
    3.12   * This class converts SAX or SAX-like calls to a
    3.13   * serialized document for xsl:output method of "text".
    3.14   * @xsl.usage internal
    3.15 + * @LastModified: Sept 2018
    3.16   */
    3.17  public final class ToTextStream extends ToStream
    3.18  {
    3.19 @@ -296,23 +296,32 @@
    3.20              } else if (m_encodingInfo.isInEncoding(c)) {
    3.21                  writer.write(c);
    3.22                  // one input char processed
    3.23 -            } else if (Encodings.isHighUTF16Surrogate(c)) {
    3.24 +            } else if (Encodings.isHighUTF16Surrogate(c) ||
    3.25 +                       Encodings.isLowUTF16Surrogate(c)) {
    3.26                  final int codePoint = writeUTF16Surrogate(c, ch, i, end);
    3.27 -                if (codePoint != 0) {
    3.28 -                    // I think we can just emit the message,
    3.29 -                    // not crash and burn.
    3.30 -                    final String integralValue = Integer.toString(codePoint);
    3.31 -                    final String msg = Utils.messages.createMessage(
    3.32 -                        MsgKey.ER_ILLEGAL_CHARACTER,
    3.33 -                        new Object[] { integralValue, encoding });
    3.34 +                if (codePoint >= 0) {
    3.35 +                    // move the index if the low surrogate is consumed
    3.36 +                    // as writeUTF16Surrogate has written the pair
    3.37 +                    if (Encodings.isHighUTF16Surrogate(c)) {
    3.38 +                        i++;
    3.39 +                    }
    3.40  
    3.41 -                    //Older behavior was to throw the message,
    3.42 -                    //but newer gentler behavior is to write a message to System.err
    3.43 -                    //throw new SAXException(msg);
    3.44 -                    System.err.println(msg);
    3.45 +                    // printing to the console is not appropriate, but will leave
    3.46 +                    // it as is for compatibility.
    3.47 +                    if (codePoint >0) {
    3.48 +                        // I think we can just emit the message,
    3.49 +                        // not crash and burn.
    3.50 +                        final String integralValue = Integer.toString(codePoint);
    3.51 +                        final String msg = Utils.messages.createMessage(
    3.52 +                            MsgKey.ER_ILLEGAL_CHARACTER,
    3.53 +                            new Object[] { integralValue, encoding });
    3.54  
    3.55 +                        //Older behavior was to throw the message,
    3.56 +                        //but newer gentler behavior is to write a message to System.err
    3.57 +                        //throw new SAXException(msg);
    3.58 +                        System.err.println(msg);
    3.59 +                    }
    3.60                  }
    3.61 -                i++; // two input chars processed
    3.62              } else {
    3.63                  // Don't know what to do with this char, it is
    3.64                  // not in the encoding and not a high char in

mercurial