Fri, 08 May 2015 12:19:17 +0200
8078497: C2's superword optimization causes unaligned memory accesses
Summary: Prevent vectorization of memory operations with different invariant offsets if unaligned memory accesses are not allowed.
Reviewed-by: kvn
1.1 --- a/src/share/vm/opto/superword.cpp Mon May 11 07:44:46 2015 +0200 1.2 +++ b/src/share/vm/opto/superword.cpp Fri May 08 12:19:17 2015 +0200 1.3 @@ -232,6 +232,13 @@ 1.4 // if unaligned memory access is not allowed because number of 1.5 // iterations in pre-loop will be not enough to align it. 1.6 create_pack = false; 1.7 + } else { 1.8 + SWPointer p2(best_align_to_mem_ref, this); 1.9 + if (align_to_ref_p.invar() != p2.invar()) { 1.10 + // Do not vectorize memory accesses with different invariants 1.11 + // if unaligned memory accesses are not allowed. 1.12 + create_pack = false; 1.13 + } 1.14 } 1.15 } 1.16 } else { 1.17 @@ -452,24 +459,50 @@ 1.18 if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) { 1.19 return true; 1.20 } 1.21 - // If initial offset from start of object is computable, 1.22 - // compute alignment within the vector. 1.23 + // If the initial offset from start of the object is computable, 1.24 + // check if the pre-loop can align the final offset accordingly. 1.25 + // 1.26 + // In other words: Can we find an i such that the offset 1.27 + // after i pre-loop iterations is aligned to vw? 1.28 + // (init_offset + pre_loop) % vw == 0 (1) 1.29 + // where 1.30 + // pre_loop = i * span 1.31 + // is the number of bytes added to the offset by i pre-loop iterations. 1.32 + // 1.33 + // For this to hold we need pre_loop to increase init_offset by 1.34 + // pre_loop = vw - (init_offset % vw) 1.35 + // 1.36 + // This is only possible if pre_loop is divisible by span because each 1.37 + // pre-loop iteration increases the initial offset by 'span' bytes: 1.38 + // (vw - (init_offset % vw)) % span == 0 1.39 + // 1.40 int vw = vector_width_in_bytes(p.mem()); 1.41 assert(vw > 1, "sanity"); 1.42 - if (vw % span == 0) { 1.43 - Node* init_nd = pre_end->init_trip(); 1.44 - if (init_nd->is_Con() && p.invar() == NULL) { 1.45 - int init = init_nd->bottom_type()->is_int()->get_con(); 1.46 - 1.47 - int init_offset = init * p.scale_in_bytes() + offset; 1.48 - assert(init_offset >= 0, "positive offset from object start"); 1.49 - 1.50 + Node* init_nd = pre_end->init_trip(); 1.51 + if (init_nd->is_Con() && p.invar() == NULL) { 1.52 + int init = init_nd->bottom_type()->is_int()->get_con(); 1.53 + int init_offset = init * p.scale_in_bytes() + offset; 1.54 + assert(init_offset >= 0, "positive offset from object start"); 1.55 + if (vw % span == 0) { 1.56 + // If vm is a multiple of span, we use formula (1). 1.57 if (span > 0) { 1.58 return (vw - (init_offset % vw)) % span == 0; 1.59 } else { 1.60 assert(span < 0, "nonzero stride * scale"); 1.61 return (init_offset % vw) % -span == 0; 1.62 } 1.63 + } else if (span % vw == 0) { 1.64 + // If span is a multiple of vw, we can simplify formula (1) to: 1.65 + // (init_offset + i * span) % vw == 0 1.66 + // => 1.67 + // (init_offset % vw) + ((i * span) % vw) == 0 1.68 + // => 1.69 + // init_offset % vw == 0 1.70 + // 1.71 + // Because we add a multiple of vw to the initial offset, the final 1.72 + // offset is a multiple of vw if and only if init_offset is a multiple. 1.73 + // 1.74 + return (init_offset % vw) == 0; 1.75 } 1.76 } 1.77 return false; 1.78 @@ -481,17 +514,23 @@ 1.79 SWPointer align_to_ref_p(mem_ref, this); 1.80 int offset = align_to_ref_p.offset_in_bytes(); 1.81 int scale = align_to_ref_p.scale_in_bytes(); 1.82 + int elt_size = align_to_ref_p.memory_size(); 1.83 int vw = vector_width_in_bytes(mem_ref); 1.84 assert(vw > 1, "sanity"); 1.85 - int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; 1.86 - // At least one iteration is executed in pre-loop by default. As result 1.87 - // several iterations are needed to align memory operations in main-loop even 1.88 - // if offset is 0. 1.89 - int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); 1.90 - int elt_size = align_to_ref_p.memory_size(); 1.91 - assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), 1.92 - err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); 1.93 - int iv_adjustment = iv_adjustment_in_bytes/elt_size; 1.94 + int iv_adjustment; 1.95 + if (scale != 0) { 1.96 + int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; 1.97 + // At least one iteration is executed in pre-loop by default. As result 1.98 + // several iterations are needed to align memory operations in main-loop even 1.99 + // if offset is 0. 1.100 + int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); 1.101 + assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), 1.102 + err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); 1.103 + iv_adjustment = iv_adjustment_in_bytes/elt_size; 1.104 + } else { 1.105 + // This memory op is not dependent on iv (scale == 0) 1.106 + iv_adjustment = 0; 1.107 + } 1.108 1.109 #ifndef PRODUCT 1.110 if (TraceSuperWord)
2.1 --- a/src/share/vm/opto/superword.hpp Mon May 11 07:44:46 2015 +0200 2.2 +++ b/src/share/vm/opto/superword.hpp Fri May 08 12:19:17 2015 +0200 2.3 @@ -41,7 +41,7 @@ 2.4 // Exploiting SuperWord Level Parallelism with 2.5 // Multimedia Instruction Sets 2.6 // by 2.7 -// Samuel Larsen and Saman Amarasighe 2.8 +// Samuel Larsen and Saman Amarasinghe 2.9 // MIT Laboratory for Computer Science 2.10 // date 2.11 // May 2000 2.12 @@ -432,7 +432,7 @@ 2.13 2.14 Node* _base; // NULL if unsafe nonheap reference 2.15 Node* _adr; // address pointer 2.16 - jint _scale; // multipler for iv (in bytes), 0 if no loop iv 2.17 + jint _scale; // multiplier for iv (in bytes), 0 if no loop iv 2.18 jint _offset; // constant offset (in bytes) 2.19 Node* _invar; // invariant offset (in bytes), NULL if none 2.20 bool _negate_invar; // if true then use: (0 - _invar)
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/test/compiler/loopopts/superword/TestVectorizationWithInvariant.java Fri May 08 12:19:17 2015 +0200 3.3 @@ -0,0 +1,144 @@ 3.4 +/* 3.5 + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 3.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 3.7 + * 3.8 + * This code is free software; you can redistribute it and/or modify it 3.9 + * under the terms of the GNU General Public License version 2 only, as 3.10 + * published by the Free Software Foundation. 3.11 + * 3.12 + * This code is distributed in the hope that it will be useful, but WITHOUT 3.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 3.14 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 3.15 + * version 2 for more details (a copy is included in the LICENSE file that 3.16 + * accompanied this code). 3.17 + * 3.18 + * You should have received a copy of the GNU General Public License version 3.19 + * 2 along with this work; if not, write to the Free Software Foundation, 3.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 3.21 + * 3.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 3.23 + * or visit www.oracle.com if you need additional information or have any 3.24 + * questions. 3.25 + * 3.26 + */ 3.27 + 3.28 +import com.oracle.java.testlibrary.*; 3.29 +import sun.misc.Unsafe; 3.30 + 3.31 +/** 3.32 + * @test 3.33 + * @bug 8078497 3.34 + * @summary Tests correct alignment of vectors with loop invariant offset. 3.35 + * @library /testlibrary 3.36 + * @run main TestVectorizationWithInvariant 3.37 + */ 3.38 +public class TestVectorizationWithInvariant { 3.39 + 3.40 + private static Unsafe unsafe; 3.41 + private static final long BYTE_ARRAY_OFFSET; 3.42 + private static final long CHAR_ARRAY_OFFSET; 3.43 + 3.44 + static { 3.45 + unsafe = Utils.getUnsafe(); 3.46 + BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class); 3.47 + CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class); 3.48 + } 3.49 + 3.50 + public static void main(String[] args) throws Exception { 3.51 + byte[] byte_array1 = new byte[1000]; 3.52 + byte[] byte_array2 = new byte[1000]; 3.53 + char[] char_array = new char[1000]; 3.54 + 3.55 + for (int i = 0; i < 20_000; ++i) { 3.56 + copyByteToChar(byte_array1, byte_array2, char_array, 1); 3.57 + copyCharToByte(char_array, byte_array1, 1); 3.58 + copyCharToByteAligned(char_array, byte_array1); 3.59 + copyCharToByteUnaligned(char_array, byte_array1); 3.60 + } 3.61 + } 3.62 + 3.63 + /* 3.64 + * Copy multiple consecutive chars from a byte array to a given offset in a char array 3.65 + * to trigger C2's superword optimization. The offset in the byte array is independent 3.66 + * of the loop induction variable and can be set to an arbitrary value. It may then not 3.67 + * be possible to both align the LoadUS and the StoreC operations. Therefore, vectorization 3.68 + * should only be done in this case if unaligned memory accesses are allowed. 3.69 + */ 3.70 + public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) { 3.71 + off = (int) BYTE_ARRAY_OFFSET + (off << 1); 3.72 + byte[] src = src1; 3.73 + for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) { 3.74 + // Copy 8 chars from src to dst 3.75 + unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0)); 3.76 + unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2)); 3.77 + unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4)); 3.78 + unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6)); 3.79 + unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8)); 3.80 + unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10)); 3.81 + unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12)); 3.82 + unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14)); 3.83 + 3.84 + // Prevent loop invariant code motion of char read. 3.85 + src = (src == src1) ? src2 : src1; 3.86 + } 3.87 + } 3.88 + 3.89 + /* 3.90 + * Copy multiple consecutive chars from a char array to a given offset in a byte array 3.91 + * to trigger C2's superword optimization. Checks for similar problems as 'copyByteToChar'. 3.92 + */ 3.93 + public static void copyCharToByte(char[] src, byte[] dst, int off) { 3.94 + off = (int) BYTE_ARRAY_OFFSET + (off << 1); 3.95 + for (int i = 0; i < 100; i = i + 8) { 3.96 + // Copy 8 chars from src to dst 3.97 + unsafe.putChar(dst, off + 0, src[i + 0]); 3.98 + unsafe.putChar(dst, off + 2, src[i + 1]); 3.99 + unsafe.putChar(dst, off + 4, src[i + 2]); 3.100 + unsafe.putChar(dst, off + 6, src[i + 3]); 3.101 + unsafe.putChar(dst, off + 8, src[i + 4]); 3.102 + unsafe.putChar(dst, off + 10, src[i + 5]); 3.103 + unsafe.putChar(dst, off + 12, src[i + 6]); 3.104 + unsafe.putChar(dst, off + 14, src[i + 7]); 3.105 + } 3.106 + } 3.107 + 3.108 + /* 3.109 + * Variant of copyCharToByte with a constant destination array offset. 3.110 + * The loop should always be vectorized because both the LoadUS and StoreC 3.111 + * operations can be aligned. 3.112 + */ 3.113 + public static void copyCharToByteAligned(char[] src, byte[] dst) { 3.114 + final int off = (int) BYTE_ARRAY_OFFSET; 3.115 + for (int i = 8; i < 100; i = i + 8) { 3.116 + // Copy 8 chars from src to dst 3.117 + unsafe.putChar(dst, off + 0, src[i + 0]); 3.118 + unsafe.putChar(dst, off + 2, src[i + 1]); 3.119 + unsafe.putChar(dst, off + 4, src[i + 2]); 3.120 + unsafe.putChar(dst, off + 6, src[i + 3]); 3.121 + unsafe.putChar(dst, off + 8, src[i + 4]); 3.122 + unsafe.putChar(dst, off + 10, src[i + 5]); 3.123 + unsafe.putChar(dst, off + 12, src[i + 6]); 3.124 + unsafe.putChar(dst, off + 14, src[i + 7]); 3.125 + } 3.126 + } 3.127 + 3.128 + /* 3.129 + * Variant of copyCharToByte with a constant destination array offset. The 3.130 + * loop should only be vectorized if unaligned memory operations are allowed 3.131 + * because not both the LoadUS and the StoreC can be aligned. 3.132 + */ 3.133 + public static void copyCharToByteUnaligned(char[] src, byte[] dst) { 3.134 + final int off = (int) BYTE_ARRAY_OFFSET + 2; 3.135 + for (int i = 0; i < 100; i = i + 8) { 3.136 + // Copy 8 chars from src to dst 3.137 + unsafe.putChar(dst, off + 0, src[i + 0]); 3.138 + unsafe.putChar(dst, off + 2, src[i + 1]); 3.139 + unsafe.putChar(dst, off + 4, src[i + 2]); 3.140 + unsafe.putChar(dst, off + 6, src[i + 3]); 3.141 + unsafe.putChar(dst, off + 8, src[i + 4]); 3.142 + unsafe.putChar(dst, off + 10, src[i + 5]); 3.143 + unsafe.putChar(dst, off + 12, src[i + 6]); 3.144 + unsafe.putChar(dst, off + 14, src[i + 7]); 3.145 + } 3.146 + } 3.147 +}