8005419: Improve intrinsics code performance on x86 by using AVX2

Tue, 08 Jan 2013 11:30:51 -0800

author
kvn
date
Tue, 08 Jan 2013 11:30:51 -0800
changeset 4413
038dd2875b94
parent 4412
ffa87474d7a4
child 4414
5698813d45eb

8005419: Improve intrinsics code performance on x86 by using AVX2
Summary: use 256bit vpxor,vptest instructions in String.compareTo() and equals() intrinsics.
Reviewed-by: twisti

src/cpu/x86/vm/assembler_x86.cpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/assembler_x86.hpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/macroAssembler_x86.cpp file | annotate | diff | comparison | revisions
src/cpu/x86/vm/macroAssembler_x86.hpp file | annotate | diff | comparison | revisions
test/compiler/8005419/Test8005419.java file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/x86/vm/assembler_x86.cpp	Mon Jan 07 14:08:28 2013 -0800
     1.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp	Tue Jan 08 11:30:51 2013 -0800
     1.3 @@ -2468,6 +2468,26 @@
     1.4    emit_int8((unsigned char)(0xC0 | encode));
     1.5  }
     1.6  
     1.7 +void Assembler::vptest(XMMRegister dst, Address src) {  // 256-bit logical compare of register dst with memory operand src
     1.8 +  assert(VM_Version::supports_avx(), "");
     1.9 +  InstructionMark im(this);
    1.10 +  bool vector256 = true;  // this overload always emits the 256-bit form
    1.11 +  assert(dst != xnoreg, "sanity");
    1.12 +  int dst_enc = dst->encoding();
    1.13 +  // swap src<->dst for encoding; nds is 0 because vptest has no second source register (the reg-reg overload passes xnoreg there)
    1.14 +  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
    1.15 +  emit_int8(0x17);
    1.16 +  emit_operand(dst, src);
    1.17 +}
    1.18 +
    1.19 +void Assembler::vptest(XMMRegister dst, XMMRegister src) {  // 256-bit logical compare, register-register form
    1.20 +  assert(VM_Version::supports_avx(), "");
    1.21 +  bool vector256 = true;  // this overload always emits the 256-bit form
    1.22 +  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);  // xnoreg: no second source register
    1.23 +  emit_int8(0x17);  // opcode byte, same value the memory-operand overload emits
    1.24 +  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets the two top ModRM bits (register-direct operand)
    1.25 +}
    1.26 +
    1.27  void Assembler::punpcklbw(XMMRegister dst, Address src) {
    1.28    NOT_LP64(assert(VM_Version::supports_sse2(), ""));
    1.29    assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
     2.1 --- a/src/cpu/x86/vm/assembler_x86.hpp	Mon Jan 07 14:08:28 2013 -0800
     2.2 +++ b/src/cpu/x86/vm/assembler_x86.hpp	Tue Jan 08 11:30:51 2013 -0800
     2.3 @@ -1444,9 +1444,12 @@
     2.4    // Shift Right by bytes Logical DoubleQuadword Immediate
     2.5    void psrldq(XMMRegister dst, int shift);
     2.6  
     2.7 -  // Logical Compare Double Quadword
     2.8 +  // Logical Compare 128bit
     2.9    void ptest(XMMRegister dst, XMMRegister src);
    2.10    void ptest(XMMRegister dst, Address src);
    2.11 +  // Logical Compare 256bit
    2.12 +  void vptest(XMMRegister dst, XMMRegister src);
    2.13 +  void vptest(XMMRegister dst, Address src);
    2.14  
    2.15    // Interleave Low Bytes
    2.16    void punpcklbw(XMMRegister dst, XMMRegister src);
     3.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Jan 07 14:08:28 2013 -0800
     3.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Jan 08 11:30:51 2013 -0800
     3.3 @@ -5675,42 +5675,114 @@
     3.4    testl(cnt2, cnt2);
     3.5    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
     3.6  
     3.7 -  // Load first characters
     3.8 +  // Compare first characters
     3.9    load_unsigned_short(result, Address(str1, 0));
    3.10    load_unsigned_short(cnt1, Address(str2, 0));
    3.11 -
    3.12 -  // Compare first characters
    3.13    subl(result, cnt1);
    3.14    jcc(Assembler::notZero,  POP_LABEL);
    3.15 -  decrementl(cnt2);
    3.16 -  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    3.17 -
    3.18 -  {
    3.19 -    // Check after comparing first character to see if strings are equivalent
    3.20 -    Label LSkip2;
    3.21 -    // Check if the strings start at same location
    3.22 -    cmpptr(str1, str2);
    3.23 -    jccb(Assembler::notEqual, LSkip2);
    3.24 -
    3.25 -    // Check if the length difference is zero (from stack)
    3.26 -    cmpl(Address(rsp, 0), 0x0);
    3.27 -    jcc(Assembler::equal,  LENGTH_DIFF_LABEL);
    3.28 -
    3.29 -    // Strings might not be equivalent
    3.30 -    bind(LSkip2);
    3.31 -  }
    3.32 +  cmpl(cnt2, 1);
    3.33 +  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    3.34 +
    3.35 +  // Check if the strings start at the same location.
    3.36 +  cmpptr(str1, str2);
    3.37 +  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    3.38  
    3.39    Address::ScaleFactor scale = Address::times_2;
    3.40    int stride = 8;
    3.41  
    3.42 -  // Advance to next element
    3.43 -  addptr(str1, 16/stride);
    3.44 -  addptr(str2, 16/stride);
    3.45 -
    3.46 -  if (UseSSE42Intrinsics) {
    3.47 +  if (UseAVX >= 2) {
    3.48 +    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    3.49 +    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    3.50 +    Label COMPARE_TAIL_LONG;
    3.51 +    int pcmpmask = 0x19;
    3.52 +
    3.53 +    // Setup to compare 16-chars (32-bytes) vectors,
    3.54 +    // start from first character again because it has aligned address.
    3.55 +    int stride2 = 16;
    3.56 +    int adr_stride  = stride  << scale;
    3.57 +    int adr_stride2 = stride2 << scale;
    3.58 +
    3.59 +    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    3.60 +    // rax and rdx are used by pcmpestri as elements counters
    3.61 +    movl(result, cnt2);
    3.62 +    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    3.63 +    jcc(Assembler::zero, COMPARE_TAIL_LONG);
    3.64 +
    3.65 +    // fast path : compare first 2 8-char vectors.
    3.66 +    bind(COMPARE_16_CHARS);
    3.67 +    movdqu(vec1, Address(str1, 0));
    3.68 +    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    3.69 +    jccb(Assembler::below, COMPARE_INDEX_CHAR);
    3.70 +
    3.71 +    movdqu(vec1, Address(str1, adr_stride));
    3.72 +    pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    3.73 +    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    3.74 +    addl(cnt1, stride);
    3.75 +
    3.76 +    // Compare the characters at index in cnt1
    3.77 +    bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
    3.78 +    load_unsigned_short(result, Address(str1, cnt1, scale));
    3.79 +    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
    3.80 +    subl(result, cnt2);
    3.81 +    jmp(POP_LABEL);
    3.82 +
    3.83 +    // Setup the registers to start vector comparison loop
    3.84 +    bind(COMPARE_WIDE_VECTORS);
    3.85 +    lea(str1, Address(str1, result, scale));
    3.86 +    lea(str2, Address(str2, result, scale));
    3.87 +    subl(result, stride2);
    3.88 +    subl(cnt2, stride2);
    3.89 +    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
    3.90 +    negptr(result);
    3.91 +
    3.92 +    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    3.93 +    bind(COMPARE_WIDE_VECTORS_LOOP);
    3.94 +    vmovdqu(vec1, Address(str1, result, scale));
    3.95 +    vpxor(vec1, Address(str2, result, scale));
    3.96 +    vptest(vec1, vec1);
    3.97 +    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
    3.98 +    addptr(result, stride2);
    3.99 +    subl(cnt2, stride2);
   3.100 +    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
   3.101 +
   3.102 +    // compare wide vectors tail
   3.103 +    bind(COMPARE_WIDE_TAIL);
   3.104 +    testptr(result, result);
   3.105 +    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
   3.106 +
   3.107 +    movl(result, stride2);
   3.108 +    movl(cnt2, result);
   3.109 +    negptr(result);
   3.110 +    jmpb(COMPARE_WIDE_VECTORS_LOOP);
   3.111 +
   3.112 +    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
   3.113 +    bind(VECTOR_NOT_EQUAL);
   3.114 +    lea(str1, Address(str1, result, scale));
   3.115 +    lea(str2, Address(str2, result, scale));
   3.116 +    jmp(COMPARE_16_CHARS);
   3.117 +
   3.118 +    // Compare tail chars, length between 1 to 15 chars
   3.119 +    bind(COMPARE_TAIL_LONG);
   3.120 +    movl(cnt2, result);
   3.121 +    cmpl(cnt2, stride);
   3.122 +    jccb(Assembler::less, COMPARE_SMALL_STR);
   3.123 +
   3.124 +    movdqu(vec1, Address(str1, 0));
   3.125 +    pcmpestri(vec1, Address(str2, 0), pcmpmask);
   3.126 +    jcc(Assembler::below, COMPARE_INDEX_CHAR);
   3.127 +    subptr(cnt2, stride);
   3.128 +    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
   3.129 +    lea(str1, Address(str1, result, scale));
   3.130 +    lea(str2, Address(str2, result, scale));
   3.131 +    negptr(cnt2);
   3.132 +    jmpb(WHILE_HEAD_LABEL);
   3.133 +
   3.134 +    bind(COMPARE_SMALL_STR);
   3.135 +  } else if (UseSSE42Intrinsics) {
   3.136      Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
   3.137      int pcmpmask = 0x19;
   3.138 -    // Setup to compare 16-byte vectors
   3.139 +    // Setup to compare 8-char (16-byte) vectors,
   3.140 +    // start from first character again because it has aligned address.
   3.141      movl(result, cnt2);
   3.142      andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
   3.143      jccb(Assembler::zero, COMPARE_TAIL);
   3.144 @@ -5742,7 +5814,7 @@
   3.145      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
   3.146  
   3.147      // compare wide vectors tail
   3.148 -    testl(result, result);
   3.149 +    testptr(result, result);
   3.150      jccb(Assembler::zero, LENGTH_DIFF_LABEL);
   3.151  
   3.152      movl(cnt2, stride);
   3.153 @@ -5754,21 +5826,20 @@
   3.154  
   3.155      // Mismatched characters in the vectors
   3.156      bind(VECTOR_NOT_EQUAL);
   3.157 -    addptr(result, cnt1);
   3.158 -    movptr(cnt2, result);
   3.159 -    load_unsigned_short(result, Address(str1, cnt2, scale));
   3.160 -    load_unsigned_short(cnt1, Address(str2, cnt2, scale));
   3.161 -    subl(result, cnt1);
   3.162 +    addptr(cnt1, result);
   3.163 +    load_unsigned_short(result, Address(str1, cnt1, scale));
   3.164 +    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
   3.165 +    subl(result, cnt2);
   3.166      jmpb(POP_LABEL);
   3.167  
   3.168      bind(COMPARE_TAIL); // limit is zero
   3.169      movl(cnt2, result);
   3.170      // Fallthru to tail compare
   3.171    }
   3.172 -
   3.173    // Shift str2 and str1 to the end of the arrays, negate min
   3.174 -  lea(str1, Address(str1, cnt2, scale, 0));
   3.175 -  lea(str2, Address(str2, cnt2, scale, 0));
   3.176 +  lea(str1, Address(str1, cnt2, scale));
   3.177 +  lea(str2, Address(str2, cnt2, scale));
   3.178 +  decrementl(cnt2);  // first character was compared already
   3.179    negptr(cnt2);
   3.180  
   3.181    // Compare the rest of the elements
   3.182 @@ -5833,7 +5904,44 @@
   3.183    shll(limit, 1);      // byte count != 0
   3.184    movl(result, limit); // copy
   3.185  
   3.186 -  if (UseSSE42Intrinsics) {
   3.187 +  if (UseAVX >= 2) {
   3.188 +    // With AVX2, use 32-byte vector compare
   3.189 +    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
   3.190 +
   3.191 +    // Compare 32-byte vectors
   3.192 +    andl(result, 0x0000001e);  //   tail count (in bytes)
   3.193 +    andl(limit, 0xffffffe0);   // vector count (in bytes)
   3.194 +    jccb(Assembler::zero, COMPARE_TAIL);
   3.195 +
   3.196 +    lea(ary1, Address(ary1, limit, Address::times_1));
   3.197 +    lea(ary2, Address(ary2, limit, Address::times_1));
   3.198 +    negptr(limit);
   3.199 +
   3.200 +    bind(COMPARE_WIDE_VECTORS);
   3.201 +    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
   3.202 +    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
   3.203 +    vpxor(vec1, vec2);
   3.204 +
   3.205 +    vptest(vec1, vec1);
   3.206 +    jccb(Assembler::notZero, FALSE_LABEL);
   3.207 +    addptr(limit, 32);
   3.208 +    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
   3.209 +
   3.210 +    testl(result, result);
   3.211 +    jccb(Assembler::zero, TRUE_LABEL);
   3.212 +
   3.213 +    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
   3.214 +    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
   3.215 +    vpxor(vec1, vec2);
   3.216 +
   3.217 +    vptest(vec1, vec1);
   3.218 +    jccb(Assembler::notZero, FALSE_LABEL);
   3.219 +    jmpb(TRUE_LABEL);
   3.220 +
   3.221 +    bind(COMPARE_TAIL); // limit is zero
   3.222 +    movl(limit, result);
   3.223 +    // Fallthru to tail compare
   3.224 +  } else if (UseSSE42Intrinsics) {
   3.225      // With SSE4.2, use double quad vector compare
   3.226      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
   3.227  
     4.1 --- a/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Jan 07 14:08:28 2013 -0800
     4.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Jan 08 11:30:51 2013 -0800
     4.3 @@ -1011,6 +1011,10 @@
     4.4        Assembler::vxorpd(dst, nds, src, vector256);
     4.5    }
     4.6  
     4.7 +  // Two-operand convenience forms for AVX2 256bit vectors: dst is also passed as the first source, and the last argument is fixed to true
     4.8 +  void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); }
     4.9 +  void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); }
    4.10 +
    4.11    // Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector.
    4.12    void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    4.13      if (UseAVX > 1) // vinserti128h is available only in AVX2
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/test/compiler/8005419/Test8005419.java	Tue Jan 08 11:30:51 2013 -0800
     5.3 @@ -0,0 +1,120 @@
     5.4 +/*
     5.5 + * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
     5.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5.7 + *
     5.8 + * This code is free software; you can redistribute it and/or modify it
     5.9 + * under the terms of the GNU General Public License version 2 only, as
    5.10 + * published by the Free Software Foundation.
    5.11 + *
    5.12 + * This code is distributed in the hope that it will be useful, but WITHOUT
    5.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    5.14 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    5.15 + * version 2 for more details (a copy is included in the LICENSE file that
    5.16 + * accompanied this code).
    5.17 + *
    5.18 + * You should have received a copy of the GNU General Public License version
    5.19 + * 2 along with this work; if not, write to the Free Software Foundation,
    5.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    5.21 + *
    5.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    5.23 + * or visit www.oracle.com if you need additional information or have any
    5.24 + * questions.
    5.25 + */
    5.26 +
    5.27 +/*
    5.28 + * @test
    5.29 + * @bug 8005419
    5.30 + * @summary Improve intrinsics code performance on x86 by using AVX2
    5.31 + * @run main/othervm -Xbatch -Xmx64m Test8005419
    5.32 + *
    5.33 + */
    5.34 +
    5.35 +public class Test8005419 { // Checks String.compareTo() results for string lengths 1..SIZE
    5.36 +    public static int SIZE = 64; // longest string length exercised
    5.37 +
    5.38 +    public static void main(String[] args) {
    5.39 +        char[] a = new char[SIZE];
    5.40 +        char[] b = new char[SIZE];
    5.41 +
    5.42 +        for (int i = 16; i < SIZE; i++) { // indices 0..15 keep the default '\0' value
    5.43 +          a[i] = (char)i;
    5.44 +          b[i] = (char)i;
    5.45 +        }
    5.46 +        String s1 = new String(a);
    5.47 +        String s2 = new String(b);
    5.48 +
    5.49 +        // Warm up
    5.50 +        boolean failed = false;
    5.51 +        int result = 0;
    5.52 +        for (int i = 0; i < 10000; i++) {
    5.53 +          result += test(s1, s2);
    5.54 +        }
    5.55 +        for (int i = 0; i < 10000; i++) {
    5.56 +          result += test(s1, s2);
    5.57 +        }
    5.58 +        for (int i = 0; i < 10000; i++) {
    5.59 +          result += test(s1, s2);
    5.60 +        }
    5.61 +        if (result != 0) failed = true; // s1 and s2 have identical contents, so every warm-up call must return 0
    5.62 +
    5.63 +        System.out.println("Start testing");
    5.64 +        // Compare same string
    5.65 +        result = test(s1, s1);
    5.66 +        if (result != 0) {
    5.67 +          failed = true;
    5.68 +          System.out.println("Failed same: result = " + result + ", expected 0");
    5.69 +        }
    5.70 +        // Compare equal strings
    5.71 +        for (int i = 1; i <= SIZE; i++) {
    5.72 +          s1 = new String(a, 0, i);
    5.73 +          s2 = new String(b, 0, i);
    5.74 +          result = test(s1, s2);
    5.75 +          if (result != 0) {
    5.76 +            failed = true;
    5.77 +            System.out.println("Failed equals s1[" + i + "], s2[" + i + "]: result = " + result + ", expected 0");
    5.78 +          }
    5.79 +        }
    5.80 +        // Compare equal strings but different sizes
    5.81 +        for (int i = 1; i <= SIZE; i++) {
    5.82 +          s1 = new String(a, 0, i);
    5.83 +          for (int j = 1; j <= SIZE; j++) {
    5.84 +            s2 = new String(b, 0, j);
    5.85 +            result = test(s1, s2);
    5.86 +            if (result != (i-j)) { // one string is a prefix of the other, so the expected result is the length difference
    5.87 +              failed = true;
    5.88 +              System.out.println("Failed diff size s1[" + i + "], s2[" + j + "]: result = " + result + ", expected " + (i-j));
    5.89 +            }
    5.90 +          }
    5.91 +        }
    5.92 +        // Compare strings with one char different and different sizes
    5.93 +        for (int i = 1; i <= SIZE; i++) {
    5.94 +          s1 = new String(a, 0, i);
    5.95 +          for (int j = 0; j < i; j++) {
    5.96 +            b[j] -= 3; // change char
    5.97 +            s2 = new String(b, 0, i);
    5.98 +            result = test(s1, s2);
    5.99 +            int chdiff = a[j] - b[j]; // the only mismatch is at index j
   5.100 +            if (result != chdiff) {
   5.101 +              failed = true;
   5.102 +              System.out.println("Failed diff char s1[" + i + "], s2[" + i + "]: result = " + result + ", expected " + chdiff);
   5.103 +            }
   5.104 +            result = test(s2, s1); // same pair with the operands swapped
   5.105 +            chdiff = b[j] - a[j];
   5.106 +            if (result != chdiff) {
   5.107 +              failed = true;
   5.108 +              System.out.println("Failed diff char s2[" + i + "], s1[" + i + "]: result = " + result + ", expected " + chdiff);
   5.109 +            }
   5.110 +            b[j] += 3; // restore
   5.111 +          }
   5.112 +        }
   5.113 +        if (failed) {
   5.114 +          System.out.println("FAILED");
   5.115 +          System.exit(97); // non-zero exit status reports the failure
   5.116 +        }
   5.117 +        System.out.println("PASSED");
   5.118 +    }
   5.119 +
   5.120 +    private static int test(String str1, String str2) { // the single place where String.compareTo() is invoked
   5.121 +        return str1.compareTo(str2);
   5.122 +    }
   5.123 +}

mercurial