changeset:   9684:69f33959c27f
parent:      9682:9905a72841d7
child:       9685:9f0ea552da23
author:      gromero
date:        Sun, 26 May 2019 21:02:55 -0400

8166684: PPC64: implement intrinsic code with vector instructions for Unsafe.copyMemory()
Reviewed-by: simonis, mdoerr
Contributed-by: Michihiro Horie <horie@jp.ibm.com>

src/cpu/ppc/vm/assembler_ppc.hpp
src/cpu/ppc/vm/assembler_ppc.inline.hpp
src/cpu/ppc/vm/stubGenerator_ppc.cpp
src/cpu/ppc/vm/vm_version_ppc.cpp
     1.1 --- a/src/cpu/ppc/vm/assembler_ppc.hpp	Thu May 23 04:05:08 2019 +0100
     1.2 +++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Sun May 26 21:02:55 2019 -0400
     1.3 @@ -1963,7 +1963,9 @@
     1.4    inline void mfvscr(   VectorRegister d);
     1.5  
     1.6    // Vector-Scalar (VSX) instructions.
     1.7 +  inline void lxvd2x(   VectorSRegister d, Register a);
     1.8    inline void lxvd2x(   VectorSRegister d, Register a, Register b);
     1.9 +  inline void stxvd2x(  VectorSRegister d, Register a);
    1.10    inline void stxvd2x(  VectorSRegister d, Register a, Register b);
    1.11    inline void mtvrd(    VectorRegister  d, Register a);
    1.12    inline void mfvrd(    Register        a, VectorRegister d);
     2.1 --- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Thu May 23 04:05:08 2019 +0100
     2.2 +++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Sun May 26 21:02:55 2019 -0400
     2.3 @@ -627,8 +627,10 @@
     2.4  inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
     2.5  
     2.6  // Vector-Scalar (VSX) instructions.
     2.7 -inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(s1) | rb(s2)); }
     2.8 -inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); }
     2.9 +inline void Assembler::lxvd2x (VectorSRegister d, Register s1) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(0) | rb(s1)); }
    2.10 +inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra0mem(s1) | rb(s2)); }
    2.11 +inline void Assembler::stxvd2x(VectorSRegister d, Register s1) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(0) | rb(s1)); }
    2.12 +inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra0mem(s1) | rb(s2)); }
    2.13  inline void Assembler::mtvrd(  VectorRegister  d, Register a)               { emit_int32( MTVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
    2.14  inline void Assembler::mfvrd(  Register        a, VectorRegister d)         { emit_int32( MFVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
    2.15  
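The new single-register overloads above differ from the existing ones only in how the RA field is encoded: they emit RA = 0, so the 16-byte access goes directly to the address held in the one GPR operand, while the three-operand forms keep base-plus-index addressing (and now go through ra0mem for the base operand). A minimal sketch of the Power ISA effective-address rule this relies on; the function and parameter names are illustrative, not HotSpot code:

#include <cstdint>

static inline uint64_t vsx_xform_ea(unsigned ra_field, uint64_t gpr_ra, uint64_t gpr_rb) {
  // lxvd2x/stxvd2x (X-form): EA = ((RA == 0) ? 0 : GPR[RA]) + GPR[RB].
  // The new two-operand overloads emit RA = 0, so the access goes to GPR[RB];
  // the three-operand overloads emit RA = s1 and RB = s2 for base + index.
  return (ra_field == 0 ? 0 : gpr_ra) + gpr_rb;
}

This is why the stub code below can write __ lxvd2x(tmp_vsr1, R3_ARG1) for a load at src and __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1) for a load at src + 16, with tmp1 holding the constant 16.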
     3.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu May 23 04:05:08 2019 +0100
     3.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Sun May 26 21:02:55 2019 -0400
     3.3 @@ -1228,8 +1228,8 @@
     3.4        __ bind(l_10);
     3.5        // Use loop with VSX load/store instructions to
     3.6        // copy 32 elements a time.
     3.7 -      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
     3.8 -      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
     3.9 +      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
    3.10 +      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
    3.11        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
    3.12        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
    3.13        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
    3.14 @@ -1496,8 +1496,8 @@
    3.15          __ bind(l_9);
    3.16          // Use loop with VSX load/store instructions to
    3.17          // copy 16 elements a time.
    3.18 -        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
    3.19 -        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
    3.20 +        __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
    3.21 +        __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
    3.22          __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
    3.23          __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
    3.24          __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
    3.25 @@ -1690,8 +1690,8 @@
    3.26        __ bind(l_7);
    3.27        // Use loop with VSX load/store instructions to
    3.28        // copy 8 elements a time.
    3.29 -      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
    3.30 -      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
    3.31 +      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
    3.32 +      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
    3.33        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
    3.34        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
    3.35        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
    3.36 @@ -1756,13 +1756,16 @@
    3.37      // Do reverse copy.  We assume the case of actual overlap is rare enough
    3.38      // that we don't have to optimize it.
    3.39  
    3.40 -    Label l_1, l_2, l_3, l_4, l_5, l_6;
    3.41 +    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
    3.42  
    3.43      Register tmp1 = R6_ARG4;
    3.44      Register tmp2 = R7_ARG5;
    3.45      Register tmp3 = R8_ARG6;
    3.46      Register tmp4 = R0;
    3.47  
    3.48 +    VectorSRegister tmp_vsr1  = VSR1;
    3.49 +    VectorSRegister tmp_vsr2  = VSR2;
    3.50 +
    3.51      { // FasterArrayCopy
    3.52        __ cmpwi(CCR0, R5_ARG3, 0);
    3.53        __ beq(CCR0, l_6);
    3.54 @@ -1772,6 +1775,25 @@
    3.55        __ add(R4_ARG2, R4_ARG2, R5_ARG3);
    3.56        __ srdi(R5_ARG3, R5_ARG3, 2);
    3.57  
    3.58 +      if (!aligned) {
    3.59 +        // check if arrays have same alignment mod 8.
    3.60 +        __ xorr(tmp1, R3_ARG1, R4_ARG2);
    3.61 +        __ andi_(R0, tmp1, 7);
    3.62 +        // Not the same alignment, but ld and std just need to be 4 byte aligned.
    3.63 +        __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
    3.64 +
    3.65 +        // copy 1 element to align to and from on an 8 byte boundary
    3.66 +        __ andi_(R0, R3_ARG1, 7);
    3.67 +        __ beq(CCR0, l_7);
    3.68 +
    3.69 +        __ addi(R3_ARG1, R3_ARG1, -4);
    3.70 +        __ addi(R4_ARG2, R4_ARG2, -4);
    3.71 +        __ addi(R5_ARG3, R5_ARG3, -1);
    3.72 +        __ lwzx(tmp2, R3_ARG1);
    3.73 +        __ stwx(tmp2, R4_ARG2);
    3.74 +        __ bind(l_7);
    3.75 +      }
    3.76 +
    3.77        __ cmpwi(CCR0, R5_ARG3, 7);
    3.78        __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
    3.79  
    3.80 @@ -1779,6 +1801,7 @@
    3.81        __ andi(R5_ARG3, R5_ARG3, 7);
    3.82        __ mtctr(tmp1);
    3.83  
    3.84 +     if (!VM_Version::has_vsx()) {
    3.85        __ bind(l_4);
    3.86        // Use unrolled version for mass copying (copy 4 elements a time).
    3.87        // Load feeding store gets zero latency on Power6, however not on Power5.
    3.88 @@ -1794,6 +1817,40 @@
    3.89        __ std(tmp2, 8, R4_ARG2);
    3.90        __ std(tmp1, 0, R4_ARG2);
    3.91        __ bdnz(l_4);
    3.92 +     } else {  // Processor supports VSX, so use it to mass copy.
    3.93 +      // Prefetch the data into the L2 cache.
    3.94 +      __ dcbt(R3_ARG1, 0);
    3.95 +
    3.96 +      // If supported set DSCR pre-fetch to deepest.
    3.97 +      if (VM_Version::has_mfdscr()) {
    3.98 +        __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
    3.99 +        __ mtdscr(tmp2);
   3.100 +      }
   3.101 +
   3.102 +      __ li(tmp1, 16);
   3.103 +
    3.104 +      // Backbranch target aligned to 32 bytes rather than 16, so that the
    3.105 +      // loop, which contains < 8 instructions, fits inside a single
    3.106 +      // i-cache sector.
   3.107 +      __ align(32);
   3.108 +
   3.109 +      __ bind(l_4);
   3.110 +      // Use loop with VSX load/store instructions to
   3.111 +      // copy 8 elements a time.
   3.112 +      __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
    3.113 +      __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
   3.114 +      __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
   3.115 +      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
   3.116 +      __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
   3.117 +      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
   3.118 +      __ bdnz(l_4);
   3.119 +
   3.120 +      // Restore DSCR pre-fetch value.
   3.121 +      if (VM_Version::has_mfdscr()) {
   3.122 +        __ load_const_optimized(tmp2, VM_Version::_dscr_val);
   3.123 +        __ mtdscr(tmp2);
   3.124 +      }
   3.125 +     }
   3.126  
   3.127        __ cmpwi(CCR0, R5_ARG3, 0);
   3.128        __ beq(CCR0, l_6);
   3.129 @@ -1908,8 +1965,8 @@
   3.130        __ bind(l_5);
   3.131        // Use loop with VSX load/store instructions to
   3.132        // copy 4 elements a time.
   3.133 -      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
   3.134 -      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
   3.135 +      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
   3.136 +      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
   3.137        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
   3.138        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
   3.139        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
   3.140 @@ -1976,6 +2033,9 @@
   3.141      Register tmp3 = R8_ARG6;
   3.142      Register tmp4 = R0;
   3.143  
   3.144 +    VectorSRegister tmp_vsr1  = VSR1;
   3.145 +    VectorSRegister tmp_vsr2  = VSR2;
   3.146 +
   3.147      Label l_1, l_2, l_3, l_4, l_5;
   3.148  
   3.149      __ cmpwi(CCR0, R5_ARG3, 0);
   3.150 @@ -1994,6 +2054,7 @@
   3.151        __ andi(R5_ARG3, R5_ARG3, 3);
   3.152        __ mtctr(tmp1);
   3.153  
   3.154 +     if (!VM_Version::has_vsx()) {
   3.155        __ bind(l_4);
   3.156        // Use unrolled version for mass copying (copy 4 elements a time).
   3.157        // Load feeding store gets zero latency on Power6, however not on Power5.
   3.158 @@ -2009,6 +2070,40 @@
   3.159        __ std(tmp2, 8, R4_ARG2);
   3.160        __ std(tmp1, 0, R4_ARG2);
   3.161        __ bdnz(l_4);
   3.162 +     } else { // Processor supports VSX, so use it to mass copy.
   3.163 +      // Prefetch the data into the L2 cache.
   3.164 +      __ dcbt(R3_ARG1, 0);
   3.165 +
   3.166 +      // If supported set DSCR pre-fetch to deepest.
   3.167 +      if (VM_Version::has_mfdscr()) {
   3.168 +        __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
   3.169 +        __ mtdscr(tmp2);
   3.170 +      }
   3.171 +
   3.172 +      __ li(tmp1, 16);
   3.173 +
    3.174 +      // Backbranch target aligned to 32 bytes rather than 16, so that the
    3.175 +      // loop, which contains < 8 instructions, fits inside a single
    3.176 +      // i-cache sector.
   3.177 +      __ align(32);
   3.178 +
   3.179 +      __ bind(l_4);
   3.180 +      // Use loop with VSX load/store instructions to
   3.181 +      // copy 4 elements a time.
   3.182 +      __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
    3.183 +      __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
   3.184 +      __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
   3.185 +      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
   3.186 +      __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
   3.187 +      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
   3.188 +      __ bdnz(l_4);
   3.189 +
   3.190 +      // Restore DSCR pre-fetch value.
   3.191 +      if (VM_Version::has_mfdscr()) {
   3.192 +        __ load_const_optimized(tmp2, VM_Version::_dscr_val);
   3.193 +        __ mtdscr(tmp2);
   3.194 +      }
   3.195 +     }
   3.196  
   3.197        __ cmpwi(CCR0, R5_ARG3, 0);
   3.198        __ beq(CCR0, l_1);
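The two new VSX paths above (in the conjoint int and long reverse-copy stubs) move 32 bytes per iteration: both pointers are decremented first, then the two 16-byte loads are issued before the two 16-byte stores. The DSCR writes around the loop only raise the hardware prefetch depth and are restored afterwards, matching the existing forward-copy VSX loops. A rough, scalar C++ stand-in for one iteration of the backward loop follows; it is illustrative only (the real stub keeps everything in registers and drives the loop with CTR via bdnz):

#include <cstdint>
#include <cstring>

// One 32-byte backward-copy step, mirroring the emitted instruction order.
static void backward_copy_step(const uint8_t*& src, uint8_t*& dst) {
  uint8_t hi[16], lo[16];
  src -= 32; dst -= 32;              // addi src, -32 / addi dst, -32
  std::memcpy(hi, src + 16, 16);     // lxvd2x  tmp_vsr2, tmp1, src   (tmp1 == 16)
  std::memcpy(lo, src,      16);     // lxvd2x  tmp_vsr1, src         (RA = 0 form)
  std::memcpy(dst + 16, hi, 16);     // stxvd2x tmp_vsr2, tmp1, dst
  std::memcpy(dst,      lo, 16);     // stxvd2x tmp_vsr1, dst         (RA = 0 form)
}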
     4.1 --- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu May 23 04:05:08 2019 +0100
     4.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Sun May 26 21:02:55 2019 -0400
     4.3 @@ -502,7 +502,7 @@
     4.4    a->vcipher(VR0, VR1, VR2);                   // code[11] -> vcipher
     4.5    a->vpmsumb(VR0, VR1, VR2);                   // code[12] -> vpmsumb
     4.6    a->mfdscr(R0);                               // code[13] -> mfdscr
     4.7 -  a->lxvd2x(VSR0, 0, R3_ARG1);                 // code[14] -> vsx
     4.8 +  a->lxvd2x(VSR0, R3_ARG1);                    // code[14] -> vsx
     4.9    a->blr();
    4.10  
    4.11    // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
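Finally, the feature-detection stub in vm_version_ppc.cpp now probes VSX with the new single-register lxvd2x form, and the stub generator decides at generation time which loop to emit. A self-contained sketch of that gating pattern; all names below (vsx_supported, scalar_copy, vsx_copy, select_copy_stub) are invented for illustration:

#include <cstddef>

// Illustrative only: generation-time selection between the scalar and VSX copy
// loops, mirroring the 'if (!VM_Version::has_vsx()) { ... } else { ... }' blocks
// added to stubGenerator_ppc.cpp.
typedef void (*copy_stub_t)(const void* src, void* dst, size_t count);

static bool vsx_supported = false;  // would be filled in by the runtime feature probe

static void scalar_copy(const void*, void*, size_t) { /* unrolled ld/std loop */ }
static void vsx_copy(const void*, void*, size_t)    { /* lxvd2x/stxvd2x loop  */ }

static copy_stub_t select_copy_stub() {
  return vsx_supported ? vsx_copy : scalar_copy;
}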
