Sun, 26 May 2019 21:02:55 -0400
8166684: PPC64: implement intrinsic code with vector instructions for Unsafe.copyMemory()
Reviewed-by: simonis, mdoerr
Contributed-by: Michihiro Horie <horie@jp.ibm.com>
--- a/src/cpu/ppc/vm/assembler_ppc.hpp	Thu May 23 04:05:08 2019 +0100
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Sun May 26 21:02:55 2019 -0400
@@ -1963,7 +1963,9 @@
   inline void mfvscr(  VectorRegister d);
 
   // Vector-Scalar (VSX) instructions.
+  inline void lxvd2x(  VectorSRegister d, Register a);
   inline void lxvd2x(  VectorSRegister d, Register a, Register b);
+  inline void stxvd2x( VectorSRegister d, Register a);
   inline void stxvd2x( VectorSRegister d, Register a, Register b);
   inline void mtvrd(   VectorRegister d, Register a);
   inline void mfvrd(   Register a, VectorRegister d);
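Note: the new one-register overloads rely on the Power addressing rule that RA = 0 in an indexed memory form contributes a literal zero, so the effective address is simply the contents of the single register operand. For reference, a stand-alone sketch of the XX1-form encoding these declarations map to (illustrative only; xx1() is a hypothetical helper, not HotSpot's emitter):

#include <cstdint>
#include <cstdio>

// XX1-form layout used by lxvd2x/stxvd2x: primary opcode 31, T, RA, RB,
// a 10-bit extended opcode, and TX as the high bit of the VSX register.
static uint32_t xx1(uint32_t xo, uint32_t vsr, uint32_t ra, uint32_t rb) {
  return 31u << 26           // primary opcode
       | (vsr & 31u) << 21   // T: low five bits of the VSX register number
       | ra << 16            // RA: base register, 0 means literal zero
       | rb << 11            // RB: index register
       | xo << 1             // extended opcode (844 = lxvd2x, 972 = stxvd2x)
       | vsr >> 5;           // TX: high bit of the VSX register number
}

int main() {
  // One-register form, EA = 0 + (R3): prints 0x7c001e98 (lxvd2x vs0,0,r3).
  printf("lxvd2x  vs0,0,r3  -> 0x%08x\n", (unsigned)xx1(844, 0, 0, 3));
  // Two-register form, EA = (R3) + (R4): prints 0x7c032798.
  printf("stxvd2x vs0,r3,r4 -> 0x%08x\n", (unsigned)xx1(972, 0, 3, 4));
  return 0;
}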
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Thu May 23 04:05:08 2019 +0100
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Sun May 26 21:02:55 2019 -0400
@@ -627,8 +627,10 @@
 inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
 
 // Vector-Scalar (VSX) instructions.
-inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); }
-inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); }
+inline void Assembler::lxvd2x (VectorSRegister d, Register s1) { emit_int32( LXVD2X_OPCODE | vsrt(d) | ra(0) | rb(s1)); }
+inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE | vsrt(d) | ra0mem(s1) | rb(s2)); }
+inline void Assembler::stxvd2x(VectorSRegister d, Register s1) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(0) | rb(s1)); }
+inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra0mem(s1) | rb(s2)); }
 inline void Assembler::mtvrd( VectorRegister d, Register a) { emit_int32( MTVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
 inline void Assembler::mfvrd( Register a, VectorRegister d) { emit_int32( MFVSRD_OPCODE | vrt(d) | ra(a) | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
 
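The two-register forms also switch from ra() to ra0mem(), which (like the other memory-form emitters above, e.g. lvsr) rejects R0 as a base register: the hardware reads RA = 0 as the literal value zero, not as GPR0. The one-register overloads exploit exactly that rule on purpose. A minimal model of the addressing rule (illustrative; effective_address() is a hypothetical helper, not HotSpot code):

#include <cstdint>
#include <cstdio>

// Power EA rule behind ra0mem()/ra(0): in indexed memory forms,
// RA = 0 means "no base register", not GPR0.
static uint64_t effective_address(const uint64_t gpr[32], int ra, int rb) {
  uint64_t base = (ra == 0) ? 0 : gpr[ra]; // RA=0 contributes literal zero
  return base + gpr[rb];
}

int main() {
  uint64_t gpr[32] = {0};
  gpr[3] = 0x1000; gpr[4] = 0x20;
  printf("EA(ra=0,rb=3) = 0x%llx\n",  // one-register form: EA = (R3)
         (unsigned long long)effective_address(gpr, 0, 3));
  printf("EA(ra=3,rb=4) = 0x%llx\n",  // two-register form: EA = (R3)+(R4)
         (unsigned long long)effective_address(gpr, 3, 4));
  return 0;
}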
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu May 23 04:05:08 2019 +0100
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Sun May 26 21:02:55 2019 -0400
@@ -1228,8 +1228,8 @@
       __ bind(l_10);
       // Use loop with VSX load/store instructions to
       // copy 32 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
@@ -1496,8 +1496,8 @@
       __ bind(l_9);
       // Use loop with VSX load/store instructions to
       // copy 16 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
       __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
       __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
@@ -1690,8 +1690,8 @@
       __ bind(l_7);
       // Use loop with VSX load/store instructions to
       // copy 8 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
@@ -1756,13 +1756,16 @@
     // Do reverse copy. We assume the case of actual overlap is rare enough
     // that we don't have to optimize it.
 
-    Label l_1, l_2, l_3, l_4, l_5, l_6;
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
 
     Register tmp1 = R6_ARG4;
     Register tmp2 = R7_ARG5;
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
     { // FasterArrayCopy
       __ cmpwi(CCR0, R5_ARG3, 0);
       __ beq(CCR0, l_6);
@@ -1772,6 +1775,25 @@
       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
       __ srdi(R5_ARG3, R5_ARG3, 2);
 
+      if (!aligned) {
+        // check if arrays have same alignment mod 8.
+        __ xorr(tmp1, R3_ARG1, R4_ARG2);
+        __ andi_(R0, tmp1, 7);
+        // Not the same alignment, but ld and std just need to be 4 byte aligned.
+        __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
+
+        // copy 1 element to align to and from on an 8 byte boundary
+        __ andi_(R0, R3_ARG1, 7);
+        __ beq(CCR0, l_7);
+
+        __ addi(R3_ARG1, R3_ARG1, -4);
+        __ addi(R4_ARG2, R4_ARG2, -4);
+        __ addi(R5_ARG3, R5_ARG3, -1);
+        __ lwzx(tmp2, R3_ARG1);
+        __ stwx(tmp2, R4_ARG2);
+        __ bind(l_7);
+      }
+
       __ cmpwi(CCR0, R5_ARG3, 7);
       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
 
@@ -1779,6 +1801,7 @@
       __ andi(R5_ARG3, R5_ARG3, 7);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
       __ bind(l_4);
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
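In the reverse (conjoint) int copy above, the new pre-loop copies at most one 4-byte element so that the already-advanced src and dst pointers become 8-byte aligned before the mass-copy loop, but only when both share the same alignment mod 8. A rough C++ model of the pre-step (reverse_copy_int() is a hypothetical helper for illustration; the stub operates on end pointers and an element count exactly like this):

#include <cstddef>
#include <cstdint>

static void reverse_copy_int(const uint32_t* src, uint32_t* dst, size_t count) {
  const uint32_t* s = src + count; // stub advances past the end first,
  uint32_t* d = dst + count;       // then copies downward
  if (count != 0 &&
      (((uintptr_t)s ^ (uintptr_t)d) & 7) == 0 && // same alignment mod 8
      ((uintptr_t)s & 7) != 0) {                  // not yet 8-byte aligned
    *--d = *--s;                                  // one element aligns both
    --count;
  }
  while (count--) *--d = *--s; // remainder; the stub vectorizes 8 at a time
}

int main() {
  uint32_t a[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  reverse_copy_int(a, a + 2, 7); // overlapping regions, dst above src
  return 0;
}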
@@ -1794,6 +1817,40 @@
       __ std(tmp2, 8, R4_ARG2);
       __ std(tmp1, 0, R4_ARG2);
       __ bdnz(l_4);
+      } else { // Processor supports VSX, so use it to mass copy.
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. Not 16-byte align as
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_4);
+        // Use loop with VSX load/store instructions to
+        // copy 8 elements a time.
+        __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
+        __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
+        __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+        __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
+        __ bdnz(l_4);
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+      }
 
       __ cmpwi(CCR0, R5_ARG3, 0);
       __ beq(CCR0, l_6);
@@ -1908,8 +1965,8 @@
     __ bind(l_5);
     // Use loop with VSX load/store instructions to
     // copy 4 elements a time.
-    __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
-    __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+    __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+    __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
     __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
     __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
     __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
@@ -1976,6 +2033,9 @@
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
     Label l_1, l_2, l_3, l_4, l_5;
 
     __ cmpwi(CCR0, R5_ARG3, 0);
@@ -1994,6 +2054,7 @@
       __ andi(R5_ARG3, R5_ARG3, 3);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
       __ bind(l_4);
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
@@ -2009,6 +2070,40 @@
       __ std(tmp2, 8, R4_ARG2);
       __ std(tmp1, 0, R4_ARG2);
       __ bdnz(l_4);
+      } else { // Processor supports VSX, so use it to mass copy.
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. Not 16-byte align as
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_4);
+        // Use loop with VSX load/store instructions to
+        // copy 4 elements a time.
+        __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
+        __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
+        __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+        __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
+        __ bdnz(l_4);
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+      }
 
       __ cmpwi(CCR0, R5_ARG3, 0);
       __ beq(CCR0, l_1);
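Both VSX else-branches (8 ints or 4 longs per iteration, 32 bytes either way) run descending: the pointers step down by 32 first, then both 16-byte halves are loaded before either store, which is what keeps the overlapping dst-above-src case correct. Around the loop, DSCR is temporarily set to the deepest prefetch depth (VM_Version::_dscr_val | 7) and restored afterwards. A C++ model of the loop body (illustrative; vsx_reverse_loop() is a hypothetical helper, with iterations corresponding to the CTR value):

#include <cstddef>
#include <cstdint>
#include <cstring>

static void vsx_reverse_loop(const uint8_t* src_end, uint8_t* dst_end,
                             size_t iterations) { // iterations = CTR
  const uint8_t* s = src_end;
  uint8_t* d = dst_end;
  while (iterations--) {
    s -= 32; d -= 32;        // addi R3_ARG1,-32 / addi R4_ARG2,-32
    uint8_t v2[16], v1[16];
    memcpy(v2, s + 16, 16);  // lxvd2x  tmp_vsr2, tmp1, R3_ARG1 (src+16)
    memcpy(v1, s, 16);       // lxvd2x  tmp_vsr1, R3_ARG1       (src)
    memcpy(d + 16, v2, 16);  // stxvd2x tmp_vsr2, tmp1, R4_ARG2 (dst+16)
    memcpy(d, v1, 16);       // stxvd2x tmp_vsr1, R4_ARG2       (dst)
  }
}

int main() {
  uint8_t buf[96];
  for (int i = 0; i < 96; i++) buf[i] = (uint8_t)i;
  // Shift 64 bytes up by 32 within the same buffer (dst > src overlap).
  vsx_reverse_loop(buf + 64, buf + 96, 2);
  return 0;
}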
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu May 23 04:05:08 2019 +0100
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Sun May 26 21:02:55 2019 -0400
@@ -502,7 +502,7 @@
   a->vcipher(VR0, VR1, VR2);                   // code[11] -> vcipher
   a->vpmsumb(VR0, VR1, VR2);                   // code[12] -> vpmsumb
   a->mfdscr(R0);                               // code[13] -> mfdscr
-  a->lxvd2x(VSR0, 0, R3_ARG1);                 // code[14] -> vsx
+  a->lxvd2x(VSR0, R3_ARG1);                    // code[14] -> vsx
   a->blr();
 
   // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
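The VSX probe instruction in the feature-detection stub (code[14]) now uses the one-register form, which emits the same RA=0 encoding as before. For reference, a stand-alone probe in the same spirit (an illustrative sketch only, not HotSpot's determine_features(), which runs the whole generated stub under a SIGILL handler and records how far execution got):

#include <csetjmp>
#include <csignal>
#include <cstdint>
#include <cstdio>

static sigjmp_buf jmpbuf;
static void on_sigill(int) { siglongjmp(jmpbuf, 1); }

// Execute one lxvd2x and treat SIGILL as "no VSX".
static bool probe_vsx() {
  bool ok = false;
#if defined(__powerpc64__)
  uint64_t buf[2] = {0, 0};
  void (*old_handler)(int) = signal(SIGILL, on_sigill);
  if (sigsetjmp(jmpbuf, 1) == 0) {
    register uint64_t* p asm("r3") = buf;
    // 0x7c001e98 == lxvd2x vs0,0,r3: the RA=0 form code[14] now emits.
    // vs0 overlaps fr0, hence the clobber.
    asm volatile(".long 0x7c001e98" : : "r"(p) : "memory", "fr0");
    ok = true; // executed without trapping
  }
  signal(SIGILL, old_handler);
#else
  (void)on_sigill;
#endif
  return ok;
}

int main() { printf("vsx supported: %s\n", probe_vsx() ? "yes" : "no"); }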