# HG changeset patch
# User andrew
# Date 1558580368 -3600
# Node ID d690709cc3398f8cfd6ffebb89a229105fb3e69a
# Parent 00837a7bc3494b75ebed173ca6c2df40076dca54
# Parent 39678a65a0e87fa19656ada4368213030e95dcec
Merge

diff -r 00837a7bc349 -r d690709cc339 src/cpu/ppc/vm/stubGenerator_ppc.cpp
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Fri May 17 18:53:31 2019 +0100
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu May 23 03:59:28 2019 +0100
@@ -1131,8 +1131,11 @@
     Register tmp3 = R8_ARG6;
     Register tmp4 = R9_ARG7;
 
-
-    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
+
     // Don't try anything fancy if arrays don't have many elements.
     __ li(tmp3, 0);
     __ cmpwi(CCR0, R5_ARG3, 17);
@@ -1186,6 +1189,8 @@
       __ andi_(R5_ARG3, R5_ARG3, 31);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
+
       __ bind(l_8);
       // Use unrolled version for mass copying (copy 32 elements a time)
       // Load feeding store gets zero latency on Power6, however not on Power5.
@@ -1201,7 +1206,44 @@
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_8);
-    }
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported, set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target is aligned to 32 bytes, not 16 bytes, because the
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_10);
+        // Use a loop with VSX load/store instructions to
+        // copy 32 elements at a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
+        __ bdnz(l_10);                       // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      } // VSX
+    } // FasterArrayCopy
 
     __ bind(l_6);
 
@@ -1570,7 +1612,11 @@
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
-    Label l_1, l_2, l_3, l_4, l_5, l_6;
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
+
     // for short arrays, just do single element copy
     __ li(tmp3, 0);
     __ cmpwi(CCR0, R5_ARG3, 5);
@@ -1605,6 +1651,8 @@
       __ andi_(R5_ARG3, R5_ARG3, 7);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
+
       __ bind(l_6);
       // Use unrolled version for mass copying (copy 8 elements a time).
       // Load feeding store gets zero latency on power6, however not on power 5.
@@ -1620,7 +1668,44 @@
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_6);
-    }
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported, set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target is aligned to 32 bytes, not 16 bytes, because the
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_7);
+        // Use a loop with VSX load/store instructions to
+        // copy 8 elements at a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
+        __ bdnz(l_7);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      } // VSX
+    } // FasterArrayCopy
 
     // copy 1 element at a time
     __ bind(l_2);
@@ -1772,7 +1857,10 @@
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
-    Label l_1, l_2, l_3, l_4;
+    Label l_1, l_2, l_3, l_4, l_5;
+
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
 
     { // FasterArrayCopy
       __ cmpwi(CCR0, R5_ARG3, 3);
@@ -1782,6 +1870,7 @@
       __ andi_(R5_ARG3, R5_ARG3, 3);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
       __ bind(l_4);
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
@@ -1797,7 +1886,44 @@
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_4);
-    }
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported, set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target is aligned to 32 bytes, not 16 bytes, because the
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_5);
+        // Use a loop with VSX load/store instructions to
+        // copy 4 elements at a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
+        __ bdnz(l_5);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      } // VSX
+    } // FasterArrayCopy
 
     // copy 1 element at a time
     __ bind(l_3);
diff -r 00837a7bc349 -r d690709cc339 src/share/vm/prims/jni.cpp
--- a/src/share/vm/prims/jni.cpp	Fri May 17 18:53:31 2019 +0100
+++ b/src/share/vm/prims/jni.cpp	Thu May 23 03:59:28 2019 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 Red Hat, Inc.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
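
Editor's note (not part of the changeset): each VSX path added above copies 32
bytes per loop iteration with two 16-byte lxvd2x/stxvd2x pairs, prefetches the
source into L2 with dcbt, and leaves the remaining elements to the existing
scalar tail. For readers who do not follow PPC assembly, below is a rough,
hypothetical C-level sketch of the same structure, not the stub itself. It
assumes GCC or clang on POWER with VSX enabled (e.g. -mvsx) and uses the
documented vec_vsx_ld/vec_vsx_st built-ins from altivec.h; the function name
copy_bytes_vsx_sketch is made up for illustration, the dcbt prefetch is
approximated with __builtin_prefetch, and the stub's DSCR programming
(mtdscr) has no portable C equivalent, so it is omitted.

    #include <altivec.h>
    #include <stddef.h>

    // Hypothetical sketch of the stub's VSX byte-copy loop (l_10 above):
    // 32 bytes per iteration via two 16-byte VSX load/store pairs,
    // followed by a scalar tail for the leftover elements.
    static void copy_bytes_vsx_sketch(const unsigned char *src,
                                      unsigned char *dst, size_t n) {
      __builtin_prefetch(src);                 // rough analogue of dcbt
      size_t i = 0;
      for (; i + 32 <= n; i += 32) {
        __vector unsigned char v0 = vec_vsx_ld(0,  src + i);  // lxvd2x src
        __vector unsigned char v1 = vec_vsx_ld(16, src + i);  // lxvd2x src + 16
        vec_vsx_st(v0, 0,  dst + i);                          // stxvd2x dst
        vec_vsx_st(v1, 16, dst + i);                          // stxvd2x dst + 16
      }
      // Scalar tail, like the stub's element-at-a-time fallback paths.
      for (; i < n; ++i) {
        dst[i] = src[i];
      }
    }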