Tue, 08 Jan 2013 11:30:51 -0800
8005419: Improve intrinsics code performance on x86 by using AVX2
Summary: use 256bit vpxor and vptest instructions in String.compareTo() and equals() intrinsics.
Reviewed-by: twisti
1.1 --- a/src/cpu/x86/vm/assembler_x86.cpp Mon Jan 07 14:08:28 2013 -0800 1.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp Tue Jan 08 11:30:51 2013 -0800 1.3 @@ -2468,6 +2468,26 @@ 1.4 emit_int8((unsigned char)(0xC0 | encode)); 1.5 } 1.6 1.7 +void Assembler::vptest(XMMRegister dst, Address src) { /* 256-bit logical compare of register dst with memory operand src; emits opcode byte 0x17 after a VEX 66/0F38 prefix */ 1.8 + assert(VM_Version::supports_avx(), ""); 1.9 + InstructionMark im(this); 1.10 + bool vector256 = true; 1.11 + assert(dst != xnoreg, "sanity"); 1.12 + int dst_enc = dst->encoding(); 1.13 + // vptest has no second source register, so nds_enc is 0 (the register form below passes xnoreg); passing dst_enc here would presumably put a non-reserved value in VEX.vvvv for any dst other than xmm0 -- confirm against vex_prefix 1.14 + vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256); 1.15 + emit_int8(0x17); 1.16 + emit_operand(dst, src); 1.17 +} 1.18 + 1.19 +void Assembler::vptest(XMMRegister dst, XMMRegister src) { /* register-register form of the 256-bit logical compare; no nds operand (xnoreg) */ 1.20 + assert(VM_Version::supports_avx(), ""); 1.21 + bool vector256 = true; 1.22 + int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); 1.23 + emit_int8(0x17); 1.24 + emit_int8((unsigned char)(0xC0 | encode)); 1.25 +} 1.26 + 1.27 void Assembler::punpcklbw(XMMRegister dst, Address src) { 1.28 NOT_LP64(assert(VM_Version::supports_sse2(), "")); 1.29 assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
2.1 --- a/src/cpu/x86/vm/assembler_x86.hpp Mon Jan 07 14:08:28 2013 -0800 2.2 +++ b/src/cpu/x86/vm/assembler_x86.hpp Tue Jan 08 11:30:51 2013 -0800 2.3 @@ -1444,9 +1444,12 @@ 2.4 // Shift Right by bytes Logical DoubleQuadword Immediate 2.5 void psrldq(XMMRegister dst, int shift); 2.6 2.7 - // Logical Compare Double Quadword 2.8 + // Logical Compare 128bit 2.9 void ptest(XMMRegister dst, XMMRegister src); 2.10 void ptest(XMMRegister dst, Address src); 2.11 + // Logical Compare 256bit (VEX-encoded; the definitions assert AVX support) 2.12 + void vptest(XMMRegister dst, XMMRegister src); 2.13 + void vptest(XMMRegister dst, Address src); 2.14 2.15 // Interleave Low Bytes 2.16 void punpcklbw(XMMRegister dst, XMMRegister src);
3.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp Mon Jan 07 14:08:28 2013 -0800 3.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Jan 08 11:30:51 2013 -0800 3.3 @@ -5675,42 +5675,114 @@ 3.4 testl(cnt2, cnt2); 3.5 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3.6 3.7 - // Load first characters 3.8 + // Compare first characters 3.9 load_unsigned_short(result, Address(str1, 0)); 3.10 load_unsigned_short(cnt1, Address(str2, 0)); 3.11 - 3.12 - // Compare first characters 3.13 subl(result, cnt1); 3.14 jcc(Assembler::notZero, POP_LABEL); 3.15 - decrementl(cnt2); 3.16 - jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3.17 - 3.18 - { 3.19 - // Check after comparing first character to see if strings are equivalent 3.20 - Label LSkip2; 3.21 - // Check if the strings start at same location 3.22 - cmpptr(str1, str2); 3.23 - jccb(Assembler::notEqual, LSkip2); 3.24 - 3.25 - // Check if the length difference is zero (from stack) 3.26 - cmpl(Address(rsp, 0), 0x0); 3.27 - jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3.28 - 3.29 - // Strings might not be equivalent 3.30 - bind(LSkip2); 3.31 - } 3.32 + cmpl(cnt2, 1); 3.33 + jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3.34 + 3.35 + // Check if the strings start at the same location. 3.36 + cmpptr(str1, str2); 3.37 + jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3.38 3.39 Address::ScaleFactor scale = Address::times_2; 3.40 int stride = 8; 3.41 3.42 - // Advance to next element 3.43 - addptr(str1, 16/stride); 3.44 - addptr(str2, 16/stride); 3.45 - 3.46 - if (UseSSE42Intrinsics) { 3.47 + if (UseAVX >= 2) { 3.48 + Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3.49 + Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3.50 + Label COMPARE_TAIL_LONG; 3.51 + int pcmpmask = 0x19; 3.52 + 3.53 + // Setup to compare 16-chars (32-bytes) vectors, 3.54 + // start from first character again because it has aligned address. 
3.55 + int stride2 = 16; 3.56 + int adr_stride = stride << scale; 3.57 + int adr_stride2 = stride2 << scale; 3.58 + 3.59 + assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3.60 + // rax and rdx are used by pcmpestri as elements counters 3.61 + movl(result, cnt2); 3.62 + andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3.63 + jcc(Assembler::zero, COMPARE_TAIL_LONG); 3.64 + 3.65 + // fast path : compare first 2 8-char vectors. 3.66 + bind(COMPARE_16_CHARS); 3.67 + movdqu(vec1, Address(str1, 0)); 3.68 + pcmpestri(vec1, Address(str2, 0), pcmpmask); 3.69 + jccb(Assembler::below, COMPARE_INDEX_CHAR); 3.70 + 3.71 + movdqu(vec1, Address(str1, adr_stride)); 3.72 + pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3.73 + jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3.74 + addl(cnt1, stride); 3.75 + 3.76 + // Compare the characters at index in cnt1 3.77 + bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character 3.78 + load_unsigned_short(result, Address(str1, cnt1, scale)); 3.79 + load_unsigned_short(cnt2, Address(str2, cnt1, scale)); 3.80 + subl(result, cnt2); 3.81 + jmp(POP_LABEL); 3.82 + 3.83 + // Setup the registers to start vector comparison loop 3.84 + bind(COMPARE_WIDE_VECTORS); 3.85 + lea(str1, Address(str1, result, scale)); 3.86 + lea(str2, Address(str2, result, scale)); 3.87 + subl(result, stride2); 3.88 + subl(cnt2, stride2); 3.89 + jccb(Assembler::zero, COMPARE_WIDE_TAIL); 3.90 + negptr(result); 3.91 + 3.92 + // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3.93 + bind(COMPARE_WIDE_VECTORS_LOOP); 3.94 + vmovdqu(vec1, Address(str1, result, scale)); 3.95 + vpxor(vec1, Address(str2, result, scale)); 3.96 + vptest(vec1, vec1); 3.97 + jccb(Assembler::notZero, VECTOR_NOT_EQUAL); 3.98 + addptr(result, stride2); 3.99 + subl(cnt2, stride2); 3.100 + jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3.101 + 3.102 + // compare wide vectors tail 3.103 + bind(COMPARE_WIDE_TAIL); 3.104 + 
testptr(result, result); 3.105 + jccb(Assembler::zero, LENGTH_DIFF_LABEL); 3.106 + 3.107 + movl(result, stride2); 3.108 + movl(cnt2, result); 3.109 + negptr(result); 3.110 + jmpb(COMPARE_WIDE_VECTORS_LOOP); 3.111 + 3.112 + // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3.113 + bind(VECTOR_NOT_EQUAL); 3.114 + lea(str1, Address(str1, result, scale)); 3.115 + lea(str2, Address(str2, result, scale)); 3.116 + jmp(COMPARE_16_CHARS); 3.117 + 3.118 + // Compare tail chars, length between 1 to 15 chars 3.119 + bind(COMPARE_TAIL_LONG); 3.120 + movl(cnt2, result); 3.121 + cmpl(cnt2, stride); 3.122 + jccb(Assembler::less, COMPARE_SMALL_STR); 3.123 + 3.124 + movdqu(vec1, Address(str1, 0)); 3.125 + pcmpestri(vec1, Address(str2, 0), pcmpmask); 3.126 + jcc(Assembler::below, COMPARE_INDEX_CHAR); 3.127 + subptr(cnt2, stride); 3.128 + jccb(Assembler::zero, LENGTH_DIFF_LABEL); 3.129 + lea(str1, Address(str1, result, scale)); 3.130 + lea(str2, Address(str2, result, scale)); 3.131 + negptr(cnt2); 3.132 + jmpb(WHILE_HEAD_LABEL); 3.133 + 3.134 + bind(COMPARE_SMALL_STR); 3.135 + } else if (UseSSE42Intrinsics) { 3.136 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3.137 int pcmpmask = 0x19; 3.138 - // Setup to compare 16-byte vectors 3.139 + // Setup to compare 8-char (16-byte) vectors, 3.140 + // start from first character again because it has aligned address. 
3.141 movl(result, cnt2); 3.142 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3.143 jccb(Assembler::zero, COMPARE_TAIL); 3.144 @@ -5742,7 +5814,7 @@ 3.145 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3.146 3.147 // compare wide vectors tail 3.148 - testl(result, result); 3.149 + testptr(result, result); 3.150 jccb(Assembler::zero, LENGTH_DIFF_LABEL); 3.151 3.152 movl(cnt2, stride); 3.153 @@ -5754,21 +5826,20 @@ 3.154 3.155 // Mismatched characters in the vectors 3.156 bind(VECTOR_NOT_EQUAL); 3.157 - addptr(result, cnt1); 3.158 - movptr(cnt2, result); 3.159 - load_unsigned_short(result, Address(str1, cnt2, scale)); 3.160 - load_unsigned_short(cnt1, Address(str2, cnt2, scale)); 3.161 - subl(result, cnt1); 3.162 + addptr(cnt1, result); 3.163 + load_unsigned_short(result, Address(str1, cnt1, scale)); 3.164 + load_unsigned_short(cnt2, Address(str2, cnt1, scale)); 3.165 + subl(result, cnt2); 3.166 jmpb(POP_LABEL); 3.167 3.168 bind(COMPARE_TAIL); // limit is zero 3.169 movl(cnt2, result); 3.170 // Fallthru to tail compare 3.171 } 3.172 - 3.173 // Shift str2 and str1 to the end of the arrays, negate min 3.174 - lea(str1, Address(str1, cnt2, scale, 0)); 3.175 - lea(str2, Address(str2, cnt2, scale, 0)); 3.176 + lea(str1, Address(str1, cnt2, scale)); 3.177 + lea(str2, Address(str2, cnt2, scale)); 3.178 + decrementl(cnt2); // first character was compared already 3.179 negptr(cnt2); 3.180 3.181 // Compare the rest of the elements 3.182 @@ -5833,7 +5904,44 @@ 3.183 shll(limit, 1); // byte count != 0 3.184 movl(result, limit); // copy 3.185 3.186 - if (UseSSE42Intrinsics) { 3.187 + if (UseAVX >= 2) { 3.188 + // With AVX2, use 32-byte vector compare 3.189 + Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3.190 + 3.191 + // Compare 32-byte vectors 3.192 + andl(result, 0x0000001e); // tail count (in bytes) 3.193 + andl(limit, 0xffffffe0); // vector count (in bytes) 3.194 + jccb(Assembler::zero, COMPARE_TAIL); 3.195 + 3.196 + lea(ary1, Address(ary1, limit, 
Address::times_1)); 3.197 + lea(ary2, Address(ary2, limit, Address::times_1)); 3.198 + negptr(limit); 3.199 + 3.200 + bind(COMPARE_WIDE_VECTORS); 3.201 + vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3.202 + vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3.203 + vpxor(vec1, vec2); 3.204 + 3.205 + vptest(vec1, vec1); 3.206 + jccb(Assembler::notZero, FALSE_LABEL); 3.207 + addptr(limit, 32); 3.208 + jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3.209 + 3.210 + testl(result, result); 3.211 + jccb(Assembler::zero, TRUE_LABEL); 3.212 + 3.213 + vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3.214 + vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 3.215 + vpxor(vec1, vec2); 3.216 + 3.217 + vptest(vec1, vec1); 3.218 + jccb(Assembler::notZero, FALSE_LABEL); 3.219 + jmpb(TRUE_LABEL); 3.220 + 3.221 + bind(COMPARE_TAIL); // limit is zero 3.222 + movl(limit, result); 3.223 + // Fallthru to tail compare 3.224 + } else if (UseSSE42Intrinsics) { 3.225 // With SSE4.2, use double quad vector compare 3.226 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3.227
4.1 --- a/src/cpu/x86/vm/macroAssembler_x86.hpp Mon Jan 07 14:08:28 2013 -0800 4.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.hpp Tue Jan 08 11:30:51 2013 -0800 4.3 @@ -1011,6 +1011,10 @@ 4.4 Assembler::vxorpd(dst, nds, src, vector256); 4.5 } 4.6 4.7 + // Simple two-operand version for AVX2 256bit vectors: dst is passed as both the destination and the first source, and the last argument (presumably vector256) is fixed to true 4.8 + void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); } 4.9 + void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); } 4.10 + 4.11 // Move packed integer values from low 128 bit to high 128 bit in 256 bit vector. 4.12 void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) { 4.13 if (UseAVX > 1) // vinserti128h is available only in AVX2
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/test/compiler/8005419/Test8005419.java Tue Jan 08 11:30:51 2013 -0800 5.3 @@ -0,0 +1,120 @@ 5.4 +/* 5.5 + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. 5.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5.7 + * 5.8 + * This code is free software; you can redistribute it and/or modify it 5.9 + * under the terms of the GNU General Public License version 2 only, as 5.10 + * published by the Free Software Foundation. 5.11 + * 5.12 + * This code is distributed in the hope that it will be useful, but WITHOUT 5.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 5.14 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 5.15 + * version 2 for more details (a copy is included in the LICENSE file that 5.16 + * accompanied this code). 5.17 + * 5.18 + * You should have received a copy of the GNU General Public License version 5.19 + * 2 along with this work; if not, write to the Free Software Foundation, 5.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 5.21 + * 5.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 5.23 + * or visit www.oracle.com if you need additional information or have any 5.24 + * questions. 
5.25 + */ 5.26 + 5.27 +/* 5.28 + * @test 5.29 + * @bug 8005419 5.30 + * @summary Improve intrinsics code performance on x86 by using AVX2 5.31 + * @run main/othervm -Xbatch -Xmx64m Test8005419 5.32 + * 5.33 + */ 5.34 + 5.35 +public class Test8005419 { /* Checks String.compareTo() results for equal strings, prefixes of different lengths, and single-character mismatches, for every length from 1 to SIZE */ 5.36 + public static int SIZE = 64; 5.37 + 5.38 + public static void main(String[] args) { 5.39 + char[] a = new char[SIZE]; 5.40 + char[] b = new char[SIZE]; 5.41 + 5.42 + for (int i = 16; i < SIZE; i++) { /* indices 0..15 keep the default char value 0 */ 5.43 + a[i] = (char)i; 5.44 + b[i] = (char)i; 5.45 + } 5.46 + String s1 = new String(a); 5.47 + String s2 = new String(b); 5.48 + 5.49 + // Warm up: 30000 calls on equal strings (the test runs with -Xbatch), presumably so that test() is JIT-compiled before the checks below; the accumulated result must stay 0 5.50 + boolean failed = false; 5.51 + int result = 0; 5.52 + for (int i = 0; i < 10000; i++) { 5.53 + result += test(s1, s2); 5.54 + } 5.55 + for (int i = 0; i < 10000; i++) { 5.56 + result += test(s1, s2); 5.57 + } 5.58 + for (int i = 0; i < 10000; i++) { 5.59 + result += test(s1, s2); 5.60 + } 5.61 + if (result != 0) failed = true; 5.62 + 5.63 + System.out.println("Start testing"); 5.64 + // Compare same string 5.65 + result = test(s1, s1); 5.66 + if (result != 0) { 5.67 + failed = true; 5.68 + System.out.println("Failed same: result = " + result + ", expected 0"); 5.69 + } 5.70 + // Compare equal strings of every length 1..SIZE 5.71 + for (int i = 1; i <= SIZE; i++) { 5.72 + s1 = new String(a, 0, i); 5.73 + s2 = new String(b, 0, i); 5.74 + result = test(s1, s2); 5.75 + if (result != 0) { 5.76 + failed = true; 5.77 + System.out.println("Failed equals s1[" + i + "], s2[" + i + "]: result = " + result + ", expected 0"); 5.78 + } 5.79 + } 5.80 + // Compare equal strings but different sizes: one string is a prefix of the other, so the expected result is the length difference i-j 5.81 + for (int i = 1; i <= SIZE; i++) { 5.82 + s1 = new String(a, 0, i); 5.83 + for (int j = 1; j <= SIZE; j++) { 5.84 + s2 = new String(b, 0, j); 5.85 + result = test(s1, s2); 5.86 + if (result != (i-j)) { 5.87 + failed = true; 5.88 + System.out.println("Failed diff size s1[" + i + "], s2[" + j + "]: result = " + result + ", expected " + (i-j)); 5.89 + } 5.90 + } 5.91 + } 5.92 + // NOTE(review): in the loop below s1 and s2 always have the same length i (only the char at index j differs), despite the wording that follows. Compare strings with one 
char different and different sizes 5.93 + for (int i = 1; i <= SIZE; i++) { 5.94 + s1 = new String(a, 0, i); 5.95 + for (int j = 0; j < i; j++) { 5.96 + b[j] -= 3; // change char at index j (wraps around to a large value when b[j] < 3, i.e. for the zero-filled first 16 chars) 5.97 + s2 = new String(b, 0, i); 5.98 + result = test(s1, s2); 5.99 + int chdiff = a[j] - b[j]; 5.100 + if (result != chdiff) { 5.101 + failed = true; 5.102 + System.out.println("Failed diff char s1[" + i + "], s2[" + i + "]: result = " + result + ", expected " + chdiff); 5.103 + } 5.104 + result = test(s2, s1); 5.105 + chdiff = b[j] - a[j]; 5.106 + if (result != chdiff) { 5.107 + failed = true; 5.108 + System.out.println("Failed diff char s2[" + i + "], s1[" + i + "]: result = " + result + ", expected " + chdiff); 5.109 + } 5.110 + b[j] += 3; // restore the original char so later iterations start from identical arrays 5.111 + } 5.112 + } 5.113 + if (failed) { 5.114 + System.out.println("FAILED"); 5.115 + System.exit(97); 5.116 + } 5.117 + System.out.println("PASSED"); 5.118 + } 5.119 + 5.120 + private static int test(String str1, String str2) { 5.121 + return str1.compareTo(str2); 5.122 + } 5.123 +}