8078497: C2's superword optimization causes unaligned memory accesses

Fri, 08 May 2015 12:19:17 +0200

author
thartmann
date
Fri, 08 May 2015 12:19:17 +0200
changeset 7819
95dbbc0431d9
parent 7818
84d55f179e24
child 7820
f5800068c61d

8078497: C2's superword optimization causes unaligned memory accesses
Summary: Prevent vectorization of memory operations with different invariant offsets if unaligned memory accesses are not allowed.
Reviewed-by: kvn

src/share/vm/opto/superword.cpp file | annotate | diff | comparison | revisions
src/share/vm/opto/superword.hpp file | annotate | diff | comparison | revisions
test/compiler/loopopts/superword/TestVectorizationWithInvariant.java file | annotate | diff | comparison | revisions
     1.1 --- a/src/share/vm/opto/superword.cpp	Mon May 11 07:44:46 2015 +0200
     1.2 +++ b/src/share/vm/opto/superword.cpp	Fri May 08 12:19:17 2015 +0200
     1.3 @@ -232,6 +232,13 @@
     1.4            // if unaligned memory access is not allowed because number of
     1.5            // iterations in pre-loop will be not enough to align it.
     1.6            create_pack = false;
     1.7 +        } else {
     1.8 +          SWPointer p2(best_align_to_mem_ref, this);
     1.9 +          if (align_to_ref_p.invar() != p2.invar()) {
    1.10 +            // Do not vectorize memory accesses with different invariants
    1.11 +            // if unaligned memory accesses are not allowed.
    1.12 +            create_pack = false;
    1.13 +          }
    1.14          }
    1.15        }
    1.16      } else {
    1.17 @@ -452,24 +459,50 @@
    1.18    if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
    1.19      return true;
    1.20    }
    1.21 -  // If initial offset from start of object is computable,
    1.22 -  // compute alignment within the vector.
    1.23 +  // If the initial offset from start of the object is computable,
    1.24 +  // check if the pre-loop can align the final offset accordingly.
    1.25 +  //
    1.26 +  // In other words: Can we find an i such that the offset
    1.27 +  // after i pre-loop iterations is aligned to vw?
    1.28 +  //   (init_offset + pre_loop) % vw == 0              (1)
    1.29 +  // where
    1.30 +  //   pre_loop = i * span
    1.31 +  // is the number of bytes added to the offset by i pre-loop iterations.
    1.32 +  //
    1.33 +  // For this to hold we need pre_loop to increase init_offset by
    1.34 +  //   pre_loop = vw - (init_offset % vw)
    1.35 +  //
    1.36 +  // This is only possible if pre_loop is divisible by span because each
    1.37 +  // pre-loop iteration increases the initial offset by 'span' bytes:
    1.38 +  //   (vw - (init_offset % vw)) % span == 0
    1.39 +  //
    1.40    int vw = vector_width_in_bytes(p.mem());
    1.41    assert(vw > 1, "sanity");
    1.42 -  if (vw % span == 0) {
    1.43 -    Node* init_nd = pre_end->init_trip();
    1.44 -    if (init_nd->is_Con() && p.invar() == NULL) {
    1.45 -      int init = init_nd->bottom_type()->is_int()->get_con();
    1.46 -
    1.47 -      int init_offset = init * p.scale_in_bytes() + offset;
    1.48 -      assert(init_offset >= 0, "positive offset from object start");
    1.49 -
    1.50 +  Node* init_nd = pre_end->init_trip();
    1.51 +  if (init_nd->is_Con() && p.invar() == NULL) {
    1.52 +    int init = init_nd->bottom_type()->is_int()->get_con();
    1.53 +    int init_offset = init * p.scale_in_bytes() + offset;
    1.54 +    assert(init_offset >= 0, "positive offset from object start");
    1.55 +    if (vw % span == 0) {
    1.56 +      // If vm is a multiple of span, we use formula (1).
    1.57        if (span > 0) {
    1.58          return (vw - (init_offset % vw)) % span == 0;
    1.59        } else {
    1.60          assert(span < 0, "nonzero stride * scale");
    1.61          return (init_offset % vw) % -span == 0;
    1.62        }
    1.63 +    } else if (span % vw == 0) {
    1.64 +      // If span is a multiple of vw, we can simplify formula (1) to:
    1.65 +      //   (init_offset + i * span) % vw == 0
    1.66 +      //     =>
    1.67 +      //   (init_offset % vw) + ((i * span) % vw) == 0
    1.68 +      //     =>
    1.69 +      //   init_offset % vw == 0
    1.70 +      //
    1.71 +      // Because we add a multiple of vw to the initial offset, the final
    1.72 +      // offset is a multiple of vw if and only if init_offset is a multiple.
    1.73 +      //
    1.74 +      return (init_offset % vw) == 0;
    1.75      }
    1.76    }
    1.77    return false;
    1.78 @@ -481,17 +514,23 @@
    1.79    SWPointer align_to_ref_p(mem_ref, this);
    1.80    int offset = align_to_ref_p.offset_in_bytes();
    1.81    int scale  = align_to_ref_p.scale_in_bytes();
    1.82 +  int elt_size = align_to_ref_p.memory_size();
    1.83    int vw       = vector_width_in_bytes(mem_ref);
    1.84    assert(vw > 1, "sanity");
    1.85 -  int stride_sign   = (scale * iv_stride()) > 0 ? 1 : -1;
    1.86 -  // At least one iteration is executed in pre-loop by default. As result
    1.87 -  // several iterations are needed to align memory operations in main-loop even
    1.88 -  // if offset is 0.
    1.89 -  int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
    1.90 -  int elt_size = align_to_ref_p.memory_size();
    1.91 -  assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
    1.92 -         err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
    1.93 -  int iv_adjustment = iv_adjustment_in_bytes/elt_size;
    1.94 +  int iv_adjustment;
    1.95 +  if (scale != 0) {
    1.96 +    int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
    1.97 +    // At least one iteration is executed in pre-loop by default. As result
    1.98 +    // several iterations are needed to align memory operations in main-loop even
    1.99 +    // if offset is 0.
   1.100 +    int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
   1.101 +    assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
   1.102 +           err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
   1.103 +    iv_adjustment = iv_adjustment_in_bytes/elt_size;
   1.104 +  } else {
   1.105 +    // This memory op is not dependent on iv (scale == 0)
   1.106 +    iv_adjustment = 0;
   1.107 +  }
   1.108  
   1.109  #ifndef PRODUCT
   1.110    if (TraceSuperWord)
     2.1 --- a/src/share/vm/opto/superword.hpp	Mon May 11 07:44:46 2015 +0200
     2.2 +++ b/src/share/vm/opto/superword.hpp	Fri May 08 12:19:17 2015 +0200
     2.3 @@ -41,7 +41,7 @@
     2.4  // Exploiting SuperWord Level Parallelism with
     2.5  //   Multimedia Instruction Sets
     2.6  // by
     2.7 -//   Samuel Larsen and Saman Amarasighe
     2.8 +//   Samuel Larsen and Saman Amarasinghe
     2.9  //   MIT Laboratory for Computer Science
    2.10  // date
    2.11  //   May 2000
    2.12 @@ -432,7 +432,7 @@
    2.13  
    2.14    Node* _base;         // NULL if unsafe nonheap reference
    2.15    Node* _adr;          // address pointer
    2.16 -  jint  _scale;        // multipler for iv (in bytes), 0 if no loop iv
    2.17 +  jint  _scale;        // multiplier for iv (in bytes), 0 if no loop iv
    2.18    jint  _offset;       // constant offset (in bytes)
    2.19    Node* _invar;        // invariant offset (in bytes), NULL if none
    2.20    bool  _negate_invar; // if true then use: (0 - _invar)
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/test/compiler/loopopts/superword/TestVectorizationWithInvariant.java	Fri May 08 12:19:17 2015 +0200
     3.3 @@ -0,0 +1,144 @@
     3.4 +/*
     3.5 + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
     3.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     3.7 + *
     3.8 + * This code is free software; you can redistribute it and/or modify it
     3.9 + * under the terms of the GNU General Public License version 2 only, as
    3.10 + * published by the Free Software Foundation.
    3.11 + *
    3.12 + * This code is distributed in the hope that it will be useful, but WITHOUT
    3.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    3.14 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    3.15 + * version 2 for more details (a copy is included in the LICENSE file that
    3.16 + * accompanied this code).
    3.17 + *
    3.18 + * You should have received a copy of the GNU General Public License version
    3.19 + * 2 along with this work; if not, write to the Free Software Foundation,
    3.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    3.21 + *
    3.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    3.23 + * or visit www.oracle.com if you need additional information or have any
    3.24 + * questions.
    3.25 + *
    3.26 + */
    3.27 +
    3.28 +import com.oracle.java.testlibrary.*;
    3.29 +import sun.misc.Unsafe;
    3.30 +
    3.31 +/**
    3.32 + * @test
    3.33 + * @bug 8078497
    3.34 + * @summary Tests correct alignment of vectors with loop invariant offset.
    3.35 + * @library /testlibrary
    3.36 + * @run main TestVectorizationWithInvariant
    3.37 + */
    3.38 +public class TestVectorizationWithInvariant {
    3.39 +
    3.40 +    private static Unsafe unsafe;
    3.41 +    private static final long BYTE_ARRAY_OFFSET;
    3.42 +    private static final long CHAR_ARRAY_OFFSET;
    3.43 +
    3.44 +    static {
    3.45 +        unsafe = Utils.getUnsafe();
    3.46 +        BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class);
    3.47 +        CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class);
    3.48 +    }
    3.49 +
    3.50 +    public static void main(String[] args) throws Exception {
    3.51 +        byte[] byte_array1 = new byte[1000];
    3.52 +        byte[] byte_array2 = new byte[1000];
    3.53 +        char[] char_array = new char[1000];
    3.54 +
    3.55 +        for (int i = 0; i < 20_000; ++i) {
    3.56 +            copyByteToChar(byte_array1, byte_array2, char_array, 1);
    3.57 +            copyCharToByte(char_array, byte_array1, 1);
    3.58 +            copyCharToByteAligned(char_array, byte_array1);
    3.59 +            copyCharToByteUnaligned(char_array, byte_array1);
    3.60 +        }
    3.61 +    }
    3.62 +
    3.63 +    /*
    3.64 +     * Copy multiple consecutive chars from a byte array to a given offset in a char array
    3.65 +     * to trigger C2's superword optimization. The offset in the byte array is independent
    3.66 +     * of the loop induction variable and can be set to an arbitrary value. It may then not
    3.67 +     * be possible to both align the LoadUS and the StoreC operations. Therefore, vectorization
    3.68 +     * should only be done in this case if unaligned memory accesses are allowed.
    3.69 +     */
    3.70 +    public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) {
    3.71 +        off = (int) BYTE_ARRAY_OFFSET + (off << 1);
    3.72 +        byte[] src = src1;
    3.73 +        for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) {
    3.74 +            // Copy 8 chars from src to dst
    3.75 +            unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0));
    3.76 +            unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2));
    3.77 +            unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4));
    3.78 +            unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6));
    3.79 +            unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8));
    3.80 +            unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10));
    3.81 +            unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12));
    3.82 +            unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14));
    3.83 +
    3.84 +            // Prevent loop invariant code motion of char read.
    3.85 +            src = (src == src1) ? src2 : src1;
    3.86 +        }
    3.87 +    }
    3.88 +
    3.89 +    /*
    3.90 +     * Copy multiple consecutive chars from a char array to a given offset in a byte array
    3.91 +     * to trigger C2's superword optimization. Checks for similar problems as 'copyByteToChar'.
    3.92 +     */
    3.93 +    public static void copyCharToByte(char[] src, byte[] dst, int off) {
    3.94 +        off = (int) BYTE_ARRAY_OFFSET + (off << 1);
    3.95 +        for (int i = 0; i < 100; i = i + 8) {
    3.96 +            // Copy 8 chars from src to dst
    3.97 +            unsafe.putChar(dst, off + 0, src[i + 0]);
    3.98 +            unsafe.putChar(dst, off + 2, src[i + 1]);
    3.99 +            unsafe.putChar(dst, off + 4, src[i + 2]);
   3.100 +            unsafe.putChar(dst, off + 6, src[i + 3]);
   3.101 +            unsafe.putChar(dst, off + 8, src[i + 4]);
   3.102 +            unsafe.putChar(dst, off + 10, src[i + 5]);
   3.103 +            unsafe.putChar(dst, off + 12, src[i + 6]);
   3.104 +            unsafe.putChar(dst, off + 14, src[i + 7]);
   3.105 +        }
   3.106 +    }
   3.107 +
   3.108 +    /*
   3.109 +     * Variant of copyCharToByte with a constant destination array offset.
   3.110 +     * The loop should always be vectorized because both the LoadUS and StoreC
   3.111 +     * operations can be aligned.
   3.112 +     */
   3.113 +    public static void copyCharToByteAligned(char[] src, byte[] dst) {
   3.114 +        final int off = (int) BYTE_ARRAY_OFFSET;
   3.115 +        for (int i = 8; i < 100; i = i + 8) {
   3.116 +            // Copy 8 chars from src to dst
   3.117 +            unsafe.putChar(dst, off + 0, src[i + 0]);
   3.118 +            unsafe.putChar(dst, off + 2, src[i + 1]);
   3.119 +            unsafe.putChar(dst, off + 4, src[i + 2]);
   3.120 +            unsafe.putChar(dst, off + 6, src[i + 3]);
   3.121 +            unsafe.putChar(dst, off + 8, src[i + 4]);
   3.122 +            unsafe.putChar(dst, off + 10, src[i + 5]);
   3.123 +            unsafe.putChar(dst, off + 12, src[i + 6]);
   3.124 +            unsafe.putChar(dst, off + 14, src[i + 7]);
   3.125 +        }
   3.126 +    }
   3.127 +
   3.128 +    /*
   3.129 +     * Variant of copyCharToByte with a constant destination array offset. The
   3.130 +     * loop should only be vectorized if unaligned memory operations are allowed
   3.131 +     * because not both the LoadUS and the StoreC can be aligned.
   3.132 +     */
   3.133 +    public static void copyCharToByteUnaligned(char[] src, byte[] dst) {
   3.134 +        final int off = (int) BYTE_ARRAY_OFFSET + 2;
   3.135 +        for (int i = 0; i < 100; i = i + 8) {
   3.136 +            // Copy 8 chars from src to dst
   3.137 +            unsafe.putChar(dst, off + 0, src[i + 0]);
   3.138 +            unsafe.putChar(dst, off + 2, src[i + 1]);
   3.139 +            unsafe.putChar(dst, off + 4, src[i + 2]);
   3.140 +            unsafe.putChar(dst, off + 6, src[i + 3]);
   3.141 +            unsafe.putChar(dst, off + 8, src[i + 4]);
   3.142 +            unsafe.putChar(dst, off + 10, src[i + 5]);
   3.143 +            unsafe.putChar(dst, off + 12, src[i + 6]);
   3.144 +            unsafe.putChar(dst, off + 14, src[i + 7]);
   3.145 +        }
   3.146 +    }
   3.147 +}

mercurial