# HG changeset patch
# User andrew
# Date 1558580368 -3600
# Node ID d690709cc3398f8cfd6ffebb89a229105fb3e69a
# Parent 00837a7bc3494b75ebed173ca6c2df40076dca54
# Parent 39678a65a0e87fa19656ada4368213030e95dcec
Merge

diff -r 00837a7bc349 -r d690709cc339 src/cpu/ppc/vm/stubGenerator_ppc.cpp
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Fri May 17 18:53:31 2019 +0100
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu May 23 03:59:28 2019 +0100
@@ -1131,8 +1131,11 @@
     Register tmp3 = R8_ARG6;
     Register tmp4 = R9_ARG7;
 
-
-    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
+
     // Don't try anything fancy if arrays don't have many elements.
     __ li(tmp3, 0);
     __ cmpwi(CCR0, R5_ARG3, 17);
@@ -1186,6 +1189,8 @@
       __ andi_(R5_ARG3, R5_ARG3, 31);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
+
       __ bind(l_8);
       // Use unrolled version for mass copying (copy 32 elements a time)
       // Load feeding store gets zero latency on Power6, however not on Power5.
@@ -1201,7 +1206,44 @@
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_8);
-    }
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported, set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target is aligned to 32 bytes, not 16 bytes, because the
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_10);
+        // Use a loop with VSX load/store instructions to
+        // copy 32 elements at a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
+        __ bdnz(l_10);                       // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      } // VSX
+    } // FasterArrayCopy
 
     __ bind(l_6);
 
@@ -1570,7 +1612,11 @@
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
-    Label l_1, l_2, l_3, l_4, l_5, l_6;
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
+
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
+
     // for short arrays, just do single element copy
     __ li(tmp3, 0);
     __ cmpwi(CCR0, R5_ARG3, 5);
@@ -1605,6 +1651,8 @@
       __ andi_(R5_ARG3, R5_ARG3, 7);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
+
       __ bind(l_6);
       // Use unrolled version for mass copying (copy 8 elements a time).
       // Load feeding store gets zero latency on power6, however not on power 5.
@@ -1620,7 +1668,44 @@
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_6);
-    }
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported, set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target is aligned to 32 bytes, not 16 bytes, because the
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_7);
+        // Use a loop with VSX load/store instructions to
+        // copy 8 elements at a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
+        __ bdnz(l_7);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      } // VSX
+    } // FasterArrayCopy
 
     // copy 1 element at a time
     __ bind(l_2);
@@ -1772,7 +1857,10 @@
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
-    Label l_1, l_2, l_3, l_4;
+    Label l_1, l_2, l_3, l_4, l_5;
+
+    VectorSRegister tmp_vsr1 = VSR1;
+    VectorSRegister tmp_vsr2 = VSR2;
 
     { // FasterArrayCopy
       __ cmpwi(CCR0, R5_ARG3, 3);
@@ -1782,6 +1870,7 @@
       __ andi_(R5_ARG3, R5_ARG3, 3);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
       __ bind(l_4);
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
@@ -1797,7 +1886,44 @@
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_4);
-    }
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch the data into the L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported, set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+
+        __ li(tmp1, 16);
+
+        // Backbranch target is aligned to 32 bytes, not 16 bytes, because the
+        // loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_5);
+        // Use a loop with VSX load/store instructions to
+        // copy 4 elements at a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
+        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
+        __ bdnz(l_5);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      } // VSX
+    } // FasterArrayCopy
 
     // copy 1 element at a time
     __ bind(l_3);
diff -r 00837a7bc349 -r d690709cc339 src/share/vm/prims/jni.cpp
--- a/src/share/vm/prims/jni.cpp	Fri May 17 18:53:31 2019 +0100
+++ b/src/share/vm/prims/jni.cpp	Thu May 23 03:59:28 2019 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 Red Hat, Inc.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
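
Editor's note (not part of the changeset): each VSX path added above copies 32
bytes per loop iteration with two 16-byte lxvd2x/stxvd2x pairs, prefetches the
source into L2 with dcbt, and leaves the remaining elements to the existing
scalar tail. For readers who do not follow PPC assembly, below is a rough,
hypothetical C-level sketch of the same structure, not the stub itself. It
assumes GCC or clang on POWER with VSX enabled (e.g. -mvsx) and uses the
documented vec_vsx_ld/vec_vsx_st built-ins from altivec.h; the function name
copy_bytes_vsx_sketch is made up for illustration, the dcbt prefetch is
approximated with __builtin_prefetch, and the stub's DSCR programming
(mtdscr) has no portable C equivalent, so it is omitted.

    #include <altivec.h>
    #include <stddef.h>

    // Hypothetical sketch of the stub's VSX byte-copy loop (l_10 above):
    // 32 bytes per iteration via two 16-byte VSX load/store pairs,
    // followed by a scalar tail for the leftover elements.
    static void copy_bytes_vsx_sketch(const unsigned char *src,
                                      unsigned char *dst, size_t n) {
      __builtin_prefetch(src);                 // rough analogue of dcbt
      size_t i = 0;
      for (; i + 32 <= n; i += 32) {
        __vector unsigned char v0 = vec_vsx_ld(0,  src + i);  // lxvd2x src
        __vector unsigned char v1 = vec_vsx_ld(16, src + i);  // lxvd2x src + 16
        vec_vsx_st(v0, 0,  dst + i);                          // stxvd2x dst
        vec_vsx_st(v1, 16, dst + i);                          // stxvd2x dst + 16
      }
      // Scalar tail, like the stub's element-at-a-time fallback paths.
      for (; i < n; ++i) {
        dst[i] = src[i];
      }
    }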