1.1 --- a/src/share/vm/opto/superword.cpp Mon May 11 07:44:46 2015 +0200 1.2 +++ b/src/share/vm/opto/superword.cpp Fri May 08 12:19:17 2015 +0200 1.3 @@ -232,6 +232,13 @@ 1.4 // if unaligned memory access is not allowed because number of 1.5 // iterations in pre-loop will be not enough to align it. 1.6 create_pack = false; 1.7 + } else { 1.8 + SWPointer p2(best_align_to_mem_ref, this); 1.9 + if (align_to_ref_p.invar() != p2.invar()) { 1.10 + // Do not vectorize memory accesses with different invariants 1.11 + // if unaligned memory accesses are not allowed. 1.12 + create_pack = false; 1.13 + } 1.14 } 1.15 } 1.16 } else { 1.17 @@ -452,24 +459,50 @@ 1.18 if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) { 1.19 return true; 1.20 } 1.21 - // If initial offset from start of object is computable, 1.22 - // compute alignment within the vector. 1.23 + // If the initial offset from start of the object is computable, 1.24 + // check if the pre-loop can align the final offset accordingly. 1.25 + // 1.26 + // In other words: Can we find an i such that the offset 1.27 + // after i pre-loop iterations is aligned to vw? 1.28 + // (init_offset + pre_loop) % vw == 0 (1) 1.29 + // where 1.30 + // pre_loop = i * span 1.31 + // is the number of bytes added to the offset by i pre-loop iterations. 
1.32 + // 1.33 + // For this to hold we need pre_loop to increase init_offset by 1.34 + // pre_loop = vw - (init_offset % vw) 1.35 + // 1.36 + // This is only possible if pre_loop is divisible by span because each 1.37 + // pre-loop iteration increases the initial offset by 'span' bytes: 1.38 + // (vw - (init_offset % vw)) % span == 0 1.39 + // 1.40 int vw = vector_width_in_bytes(p.mem()); 1.41 assert(vw > 1, "sanity"); 1.42 - if (vw % span == 0) { 1.43 - Node* init_nd = pre_end->init_trip(); 1.44 - if (init_nd->is_Con() && p.invar() == NULL) { 1.45 - int init = init_nd->bottom_type()->is_int()->get_con(); 1.46 - 1.47 - int init_offset = init * p.scale_in_bytes() + offset; 1.48 - assert(init_offset >= 0, "positive offset from object start"); 1.49 - 1.50 + Node* init_nd = pre_end->init_trip(); 1.51 + if (init_nd->is_Con() && p.invar() == NULL) { 1.52 + int init = init_nd->bottom_type()->is_int()->get_con(); 1.53 + int init_offset = init * p.scale_in_bytes() + offset; 1.54 + assert(init_offset >= 0, "positive offset from object start"); 1.55 + if (vw % span == 0) { 1.56 + // If vw is a multiple of span, we use formula (1). 1.57 if (span > 0) { 1.58 return (vw - (init_offset % vw)) % span == 0; 1.59 } else { 1.60 assert(span < 0, "nonzero stride * scale"); 1.61 return (init_offset % vw) % -span == 0; 1.62 } 1.63 + } else if (span % vw == 0) { 1.64 + // If span is a multiple of vw, we can simplify formula (1) to: 1.65 + // (init_offset + i * span) % vw == 0 1.66 + // => 1.67 + // (init_offset % vw) + ((i * span) % vw) == 0 1.68 + // => 1.69 + // init_offset % vw == 0 1.70 + // 1.71 + // Because we add a multiple of vw to the initial offset, the final 1.72 + // offset is a multiple of vw if and only if init_offset is a multiple. 
1.73 + // 1.74 + return (init_offset % vw) == 0; 1.75 } 1.76 } 1.77 return false; 1.78 @@ -481,17 +514,23 @@ 1.79 SWPointer align_to_ref_p(mem_ref, this); 1.80 int offset = align_to_ref_p.offset_in_bytes(); 1.81 int scale = align_to_ref_p.scale_in_bytes(); 1.82 + int elt_size = align_to_ref_p.memory_size(); 1.83 int vw = vector_width_in_bytes(mem_ref); 1.84 assert(vw > 1, "sanity"); 1.85 - int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; 1.86 - // At least one iteration is executed in pre-loop by default. As result 1.87 - // several iterations are needed to align memory operations in main-loop even 1.88 - // if offset is 0. 1.89 - int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); 1.90 - int elt_size = align_to_ref_p.memory_size(); 1.91 - assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), 1.92 - err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); 1.93 - int iv_adjustment = iv_adjustment_in_bytes/elt_size; 1.94 + int iv_adjustment; 1.95 + if (scale != 0) { 1.96 + int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; 1.97 + // At least one iteration is executed in pre-loop by default. As a result 1.98 + // several iterations are needed to align memory operations in main-loop even 1.99 + // if offset is 0. 1.100 + int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); 1.101 + assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), 1.102 + err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); 1.103 + iv_adjustment = iv_adjustment_in_bytes/elt_size; 1.104 + } else { 1.105 + // This memory op is not dependent on iv (scale == 0) 1.106 + iv_adjustment = 0; 1.107 + } 1.108 1.109 #ifndef PRODUCT 1.110 if (TraceSuperWord)