Thu, 23 May 2019 03:59:28 +0100
Merge
1.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp Fri May 17 18:53:31 2019 +0100 1.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp Thu May 23 03:59:28 2019 +0100 1.3 @@ -1131,8 +1131,11 @@ 1.4 Register tmp3 = R8_ARG6; 1.5 Register tmp4 = R9_ARG7; 1.6 1.7 - 1.8 - Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9; 1.9 + VectorSRegister tmp_vsr1 = VSR1; 1.10 + VectorSRegister tmp_vsr2 = VSR2; 1.11 + 1.12 + Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10; 1.13 + 1.14 // Don't try anything fancy if arrays don't have many elements. 1.15 __ li(tmp3, 0); 1.16 __ cmpwi(CCR0, R5_ARG3, 17); 1.17 @@ -1186,6 +1189,8 @@ 1.18 __ andi_(R5_ARG3, R5_ARG3, 31); 1.19 __ mtctr(tmp1); 1.20 1.21 + if (!VM_Version::has_vsx()) { 1.22 + 1.23 __ bind(l_8); 1.24 // Use unrolled version for mass copying (copy 32 elements a time) 1.25 // Load feeding store gets zero latency on Power6, however not on Power5. 1.26 @@ -1201,7 +1206,44 @@ 1.27 __ addi(R3_ARG1, R3_ARG1, 32); 1.28 __ addi(R4_ARG2, R4_ARG2, 32); 1.29 __ bdnz(l_8); 1.30 - } 1.31 + 1.32 + } else { // Processor supports VSX, so use it to mass copy. 1.33 + 1.34 + // Prefetch the data into the L2 cache. 1.35 + __ dcbt(R3_ARG1, 0); 1.36 + 1.37 + // If supported set DSCR pre-fetch to deepest. 1.38 + if (VM_Version::has_mfdscr()) { 1.39 + __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7); 1.40 + __ mtdscr(tmp2); 1.41 + } 1.42 + 1.43 + __ li(tmp1, 16); 1.44 + 1.45 + // Backbranch target aligned to 32-byte. Not 16-byte align as 1.46 + // loop contains < 8 instructions that fit inside a single 1.47 + // i-cache sector. 1.48 + __ align(32); 1.49 + 1.50 + __ bind(l_10); 1.51 + // Use loop with VSX load/store instructions to 1.52 + // copy 32 elements a time. 
1.53 + __ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src 1.54 + __ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst 1.55 + __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16 1.56 + __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16 1.57 + __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32 1.58 + __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32 1.59 + __ bdnz(l_10); // Dec CTR and loop if not zero. 1.60 + 1.61 + // Restore DSCR pre-fetch value. 1.62 + if (VM_Version::has_mfdscr()) { 1.63 + __ load_const_optimized(tmp2, VM_Version::_dscr_val); 1.64 + __ mtdscr(tmp2); 1.65 + } 1.66 + 1.67 + } // VSX 1.68 + } // FasterArrayCopy 1.69 1.70 __ bind(l_6); 1.71 1.72 @@ -1570,7 +1612,11 @@ 1.73 Register tmp3 = R8_ARG6; 1.74 Register tmp4 = R0; 1.75 1.76 - Label l_1, l_2, l_3, l_4, l_5, l_6; 1.77 + VectorSRegister tmp_vsr1 = VSR1; 1.78 + VectorSRegister tmp_vsr2 = VSR2; 1.79 + 1.80 + Label l_1, l_2, l_3, l_4, l_5, l_6, l_7; 1.81 + 1.82 // for short arrays, just do single element copy 1.83 __ li(tmp3, 0); 1.84 __ cmpwi(CCR0, R5_ARG3, 5); 1.85 @@ -1605,6 +1651,8 @@ 1.86 __ andi_(R5_ARG3, R5_ARG3, 7); 1.87 __ mtctr(tmp1); 1.88 1.89 + if (!VM_Version::has_vsx()) { 1.90 + 1.91 __ bind(l_6); 1.92 // Use unrolled version for mass copying (copy 8 elements a time). 1.93 // Load feeding store gets zero latency on power6, however not on power 5. 1.94 @@ -1620,7 +1668,44 @@ 1.95 __ addi(R3_ARG1, R3_ARG1, 32); 1.96 __ addi(R4_ARG2, R4_ARG2, 32); 1.97 __ bdnz(l_6); 1.98 - } 1.99 + 1.100 + } else { // Processor supports VSX, so use it to mass copy. 1.101 + 1.102 + // Prefetch the data into the L2 cache. 1.103 + __ dcbt(R3_ARG1, 0); 1.104 + 1.105 + // If supported set DSCR pre-fetch to deepest. 1.106 + if (VM_Version::has_mfdscr()) { 1.107 + __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7); 1.108 + __ mtdscr(tmp2); 1.109 + } 1.110 + 1.111 + __ li(tmp1, 16); 1.112 + 1.113 + // Backbranch target aligned to 32-byte. 
Not 16-byte align as 1.114 + // loop contains < 8 instructions that fit inside a single 1.115 + // i-cache sector. 1.116 + __ align(32); 1.117 + 1.118 + __ bind(l_7); 1.119 + // Use loop with VSX load/store instructions to 1.120 + // copy 8 elements a time. 1.121 + __ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src 1.122 + __ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst 1.123 + __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16 1.124 + __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16 1.125 + __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32 1.126 + __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32 1.127 + __ bdnz(l_7); // Dec CTR and loop if not zero. 1.128 + 1.129 + // Restore DSCR pre-fetch value. 1.130 + if (VM_Version::has_mfdscr()) { 1.131 + __ load_const_optimized(tmp2, VM_Version::_dscr_val); 1.132 + __ mtdscr(tmp2); 1.133 + } 1.134 + 1.135 + } // VSX 1.136 + } // FasterArrayCopy 1.137 1.138 // copy 1 element at a time 1.139 __ bind(l_2); 1.140 @@ -1772,7 +1857,10 @@ 1.141 Register tmp3 = R8_ARG6; 1.142 Register tmp4 = R0; 1.143 1.144 - Label l_1, l_2, l_3, l_4; 1.145 + Label l_1, l_2, l_3, l_4, l_5; 1.146 + 1.147 + VectorSRegister tmp_vsr1 = VSR1; 1.148 + VectorSRegister tmp_vsr2 = VSR2; 1.149 1.150 { // FasterArrayCopy 1.151 __ cmpwi(CCR0, R5_ARG3, 3); 1.152 @@ -1782,6 +1870,7 @@ 1.153 __ andi_(R5_ARG3, R5_ARG3, 3); 1.154 __ mtctr(tmp1); 1.155 1.156 + if (!VM_Version::has_vsx()) { 1.157 __ bind(l_4); 1.158 // Use unrolled version for mass copying (copy 4 elements a time). 1.159 // Load feeding store gets zero latency on Power6, however not on Power5. 1.160 @@ -1797,7 +1886,44 @@ 1.161 __ addi(R3_ARG1, R3_ARG1, 32); 1.162 __ addi(R4_ARG2, R4_ARG2, 32); 1.163 __ bdnz(l_4); 1.164 - } 1.165 + 1.166 + } else { // Processor supports VSX, so use it to mass copy. 1.167 + 1.168 + // Prefetch the data into the L2 cache. 1.169 + __ dcbt(R3_ARG1, 0); 1.170 + 1.171 + // If supported set DSCR pre-fetch to deepest. 
1.172 + if (VM_Version::has_mfdscr()) { 1.173 + __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7); 1.174 + __ mtdscr(tmp2); 1.175 + } 1.176 + 1.177 + __ li(tmp1, 16); 1.178 + 1.179 + // Backbranch target aligned to 32-byte. Not 16-byte align as 1.180 + // loop contains < 8 instructions that fit inside a single 1.181 + // i-cache sector. 1.182 + __ align(32); 1.183 + 1.184 + __ bind(l_5); 1.185 + // Use loop with VSX load/store instructions to 1.186 + // copy 4 elements a time. 1.187 + __ lxvd2x(tmp_vsr1, 0, R3_ARG1); // Load src 1.188 + __ stxvd2x(tmp_vsr1, 0, R4_ARG2); // Store to dst 1.189 + __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16 1.190 + __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16 1.191 + __ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32 1.192 + __ addi(R4_ARG2, R4_ARG2, 32); // Update dst+=32 1.193 + __ bdnz(l_5); // Dec CTR and loop if not zero. 1.194 + 1.195 + // Restore DSCR pre-fetch value. 1.196 + if (VM_Version::has_mfdscr()) { 1.197 + __ load_const_optimized(tmp2, VM_Version::_dscr_val); 1.198 + __ mtdscr(tmp2); 1.199 + } 1.200 + 1.201 + } // VSX 1.202 + } // FasterArrayCopy 1.203 1.204 // copy 1 element at a time 1.205 __ bind(l_3);
2.1 --- a/src/share/vm/prims/jni.cpp Fri May 17 18:53:31 2019 +0100 2.2 +++ b/src/share/vm/prims/jni.cpp Thu May 23 03:59:28 2019 +0100 2.3 @@ -1,5 +1,5 @@ 2.4 /* 2.5 - * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved. 2.6 + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 2.7 * Copyright (c) 2012 Red Hat, Inc. 2.8 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 2.9 *