8154156: PPC64: improve array copy stubs by using vector instructions

Wed, 24 Apr 2019 11:48:37 -0400

author
gromero
date
Wed, 24 Apr 2019 11:48:37 -0400
changeset 9662
6eedcffa129d
parent 9661
379a59bf685d
child 9663
7fe2cda84af1

8154156: PPC64: improve array copy stubs by using vector instructions
Reviewed-by: goetz, mdoerr
Contributed-by: Kazunori Ogata <ogatak@jp.ibm.com>

src/cpu/ppc/vm/assembler_ppc.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/assembler_ppc.inline.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/globals_ppc.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/register_ppc.cpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/register_ppc.hpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/stubGenerator_ppc.cpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/vm_version_ppc.cpp file | annotate | diff | comparison | revisions
src/cpu/ppc/vm/vm_version_ppc.hpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/ppc/vm/assembler_ppc.hpp	Mon Apr 15 16:27:50 2019 +0000
     1.2 +++ b/src/cpu/ppc/vm/assembler_ppc.hpp	Wed Apr 24 11:48:37 2019 -0400
     1.3 @@ -469,6 +469,8 @@
     1.4      LVSR_OPCODE    = (31u << OPCODE_SHIFT |   38u << 1),
     1.5  
     1.6      // Vector-Scalar (VSX) instruction support.
     1.7 +    LXVD2X_OPCODE  = (31u << OPCODE_SHIFT |  844u << 1),
     1.8 +    STXVD2X_OPCODE = (31u << OPCODE_SHIFT |  972u << 1),
     1.9      MTVSRD_OPCODE  = (31u << OPCODE_SHIFT |  179u << 1),
    1.10      MFVSRD_OPCODE  = (31u << OPCODE_SHIFT |   51u << 1),
    1.11  
    1.12 @@ -670,8 +672,10 @@
    1.13      // Atomics.
    1.14      LWARX_OPCODE   = (31u << OPCODE_SHIFT |   20u << 1),
    1.15      LDARX_OPCODE   = (31u << OPCODE_SHIFT |   84u << 1),
    1.16 +    LQARX_OPCODE   = (31u << OPCODE_SHIFT |  276u << 1),
    1.17      STWCX_OPCODE   = (31u << OPCODE_SHIFT |  150u << 1),
    1.18 -    STDCX_OPCODE   = (31u << OPCODE_SHIFT |  214u << 1)
    1.19 +    STDCX_OPCODE   = (31u << OPCODE_SHIFT |  214u << 1),
    1.20 +    STQCX_OPCODE   = (31u << OPCODE_SHIFT |  182u << 1)
    1.21  
    1.22    };
    1.23  
    1.24 @@ -1052,6 +1056,19 @@
    1.25    static int vrs(   VectorRegister r)  { return  vrs(r->encoding());}
    1.26    static int vrt(   VectorRegister r)  { return  vrt(r->encoding());}
    1.27  
    1.28 +  // Support Vector-Scalar (VSX) instructions.
    1.29 +  static int vsra(      int         x)  { return  opp_u_field(x,            15, 11); }
    1.30 +  static int vsrb(      int         x)  { return  opp_u_field(x,            20, 16); }
    1.31 +  static int vsrc(      int         x)  { return  opp_u_field(x,            25, 21); }
    1.32 +  static int vsrs(      int         x)  { return  opp_u_field(x,            10,  6); }
    1.33 +  static int vsrt(      int         x)  { return  opp_u_field(x,            10,  6); }
    1.34 +
    1.35 +  static int vsra(   VectorSRegister r)  { return  vsra(r->encoding());}
    1.36 +  static int vsrb(   VectorSRegister r)  { return  vsrb(r->encoding());}
    1.37 +  static int vsrc(   VectorSRegister r)  { return  vsrc(r->encoding());}
    1.38 +  static int vsrs(   VectorSRegister r)  { return  vsrs(r->encoding());}
    1.39 +  static int vsrt(   VectorSRegister r)  { return  vsrt(r->encoding());}
    1.40 +
    1.41    static int vsplt_uim( int        x)  { return  opp_u_field(x,             15, 12); } // for vsplt* instructions
    1.42    static int vsplti_sim(int        x)  { return  opp_u_field(x,             15, 11); } // for vsplti* instructions
    1.43    static int vsldoi_shb(int        x)  { return  opp_u_field(x,             25, 22); } // for vsldoi instruction
    1.44 @@ -1663,11 +1680,14 @@
    1.45    // atomics
    1.46    inline void lwarx_unchecked(Register d, Register a, Register b, int eh1 = 0);
    1.47    inline void ldarx_unchecked(Register d, Register a, Register b, int eh1 = 0);
    1.48 +  inline void lqarx_unchecked(Register d, Register a, Register b, int eh1 = 0);
    1.49    inline bool lxarx_hint_exclusive_access();
    1.50    inline void lwarx(  Register d, Register a, Register b, bool hint_exclusive_access = false);
    1.51    inline void ldarx(  Register d, Register a, Register b, bool hint_exclusive_access = false);
    1.52 +  inline void lqarx(  Register d, Register a, Register b, bool hint_exclusive_access = false);
    1.53    inline void stwcx_( Register s, Register a, Register b);
    1.54    inline void stdcx_( Register s, Register a, Register b);
    1.55 +  inline void stqcx_( Register s, Register a, Register b);
    1.56  
    1.57    // Instructions for adjusting thread priority for simultaneous
    1.58    // multithreading (SMT) on Power5.
    1.59 @@ -1943,6 +1963,8 @@
    1.60    inline void mfvscr(   VectorRegister d);
    1.61  
    1.62    // Vector-Scalar (VSX) instructions.
    1.63 +  inline void lxvd2x(   VectorSRegister d, Register a, Register b);
    1.64 +  inline void stxvd2x(  VectorSRegister d, Register a, Register b);
    1.65    inline void mtvrd(    VectorRegister  d, Register a);
    1.66    inline void mfvrd(    Register        a, VectorRegister d);
    1.67  
    1.68 @@ -2022,10 +2044,13 @@
    1.69    // Atomics: use ra0mem to disallow R0 as base.
    1.70    inline void lwarx_unchecked(Register d, Register b, int eh1);
    1.71    inline void ldarx_unchecked(Register d, Register b, int eh1);
    1.72 +  inline void lqarx_unchecked(Register d, Register b, int eh1);
    1.73    inline void lwarx( Register d, Register b, bool hint_exclusive_access);
    1.74    inline void ldarx( Register d, Register b, bool hint_exclusive_access);
    1.75 +  inline void lqarx( Register d, Register b, bool hint_exclusive_access);
    1.76    inline void stwcx_(Register s, Register b);
    1.77    inline void stdcx_(Register s, Register b);
    1.78 +  inline void stqcx_(Register s, Register b);
    1.79    inline void lfs(   FloatRegister d, int si16);
    1.80    inline void lfsx(  FloatRegister d, Register b);
    1.81    inline void lfd(   FloatRegister d, int si16);
     2.1 --- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Mon Apr 15 16:27:50 2019 +0000
     2.2 +++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Wed Apr 24 11:48:37 2019 -0400
     2.3 @@ -1,6 +1,6 @@
     2.4  /*
     2.5 - * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
     2.6 - * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
     2.7 + * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
     2.8 + * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
     2.9   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    2.10   *
    2.11   * This code is free software; you can redistribute it and/or modify it
    2.12 @@ -504,11 +504,14 @@
    2.13  // Use ra0mem to disallow R0 as base.
    2.14  inline void Assembler::lwarx_unchecked(Register d, Register a, Register b, int eh1)           { emit_int32( LWARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); }
    2.15  inline void Assembler::ldarx_unchecked(Register d, Register a, Register b, int eh1)           { emit_int32( LDARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); }
    2.16 +inline void Assembler::lqarx_unchecked(Register d, Register a, Register b, int eh1)           { emit_int32( LQARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); }
    2.17  inline bool Assembler::lxarx_hint_exclusive_access()                                          { return VM_Version::has_lxarxeh(); }
    2.18  inline void Assembler::lwarx( Register d, Register a, Register b, bool hint_exclusive_access) { lwarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
    2.19  inline void Assembler::ldarx( Register d, Register a, Register b, bool hint_exclusive_access) { ldarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
    2.20 +inline void Assembler::lqarx( Register d, Register a, Register b, bool hint_exclusive_access) { lqarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
    2.21  inline void Assembler::stwcx_(Register s, Register a, Register b)                             { emit_int32( STWCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); }
    2.22  inline void Assembler::stdcx_(Register s, Register a, Register b)                             { emit_int32( STDCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); }
    2.23 +inline void Assembler::stqcx_(Register s, Register a, Register b)                             { emit_int32( STQCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); }
    2.24  
    2.25  // Instructions for adjusting thread priority
    2.26  // for simultaneous multithreading (SMT) on POWER5.
    2.27 @@ -624,6 +627,8 @@
    2.28  inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
    2.29  
    2.30  // Vector-Scalar (VSX) instructions.
    2.31 +inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(s1) | rb(s2)); }
    2.32 +inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); }
    2.33  inline void Assembler::mtvrd(  VectorRegister  d, Register a)               { emit_int32( MTVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
    2.34  inline void Assembler::mfvrd(  Register        a, VectorRegister d)         { emit_int32( MFVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
    2.35  
    2.36 @@ -833,10 +838,13 @@
    2.37  // ra0 version
    2.38  inline void Assembler::lwarx_unchecked(Register d, Register b, int eh1)          { emit_int32( LWARX_OPCODE | rt(d) | rb(b) | eh(eh1)); }
    2.39  inline void Assembler::ldarx_unchecked(Register d, Register b, int eh1)          { emit_int32( LDARX_OPCODE | rt(d) | rb(b) | eh(eh1)); }
    2.40 +inline void Assembler::lqarx_unchecked(Register d, Register b, int eh1)          { emit_int32( LQARX_OPCODE | rt(d) | rb(b) | eh(eh1)); }
    2.41  inline void Assembler::lwarx( Register d, Register b, bool hint_exclusive_access){ lwarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
    2.42  inline void Assembler::ldarx( Register d, Register b, bool hint_exclusive_access){ ldarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
    2.43 +inline void Assembler::lqarx( Register d, Register b, bool hint_exclusive_access){ lqarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); }
    2.44  inline void Assembler::stwcx_(Register s, Register b)                            { emit_int32( STWCX_OPCODE | rs(s) | rb(b) | rc(1)); }
    2.45  inline void Assembler::stdcx_(Register s, Register b)                            { emit_int32( STDCX_OPCODE | rs(s) | rb(b) | rc(1)); }
    2.46 +inline void Assembler::stqcx_(Register s, Register b)                            { emit_int32( STQCX_OPCODE | rs(s) | rb(b) | rc(1)); }
    2.47  
    2.48  // ra0 version
    2.49  inline void Assembler::lfs( FloatRegister d, int si16)   { emit_int32( LFS_OPCODE  | frt(d) | simm(si16,16)); }
     3.1 --- a/src/cpu/ppc/vm/globals_ppc.hpp	Mon Apr 15 16:27:50 2019 +0000
     3.2 +++ b/src/cpu/ppc/vm/globals_ppc.hpp	Wed Apr 24 11:48:37 2019 -0400
     3.3 @@ -1,6 +1,6 @@
     3.4  /*
     3.5 - * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved.
     3.6 - * Copyright 2012, 2013 SAP AG. All rights reserved.
     3.7 + * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
     3.8 + * Copyright 2012, 2018 SAP AG. All rights reserved.
     3.9   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    3.10   *
    3.11   * This code is free software; you can redistribute it and/or modify it
    3.12 @@ -81,6 +81,19 @@
    3.13    product(bool, ReoptimizeCallSequences, true,                              \
    3.14            "Reoptimize code-sequences of calls at runtime.")                 \
    3.15                                                                              \
    3.16 +  /* Power 8: Configure Data Stream Control Register. */                    \
    3.17 +  product(uint64_t,DSCR_PPC64, (uintx)-1,                                   \
    3.18 +          "Power8 or later: Specify encoded value for Data Stream Control " \
    3.19 +          "Register")                                                       \
    3.20 +  product(uint64_t,DSCR_DPFD_PPC64, 8,                                      \
    3.21 +          "Power8 or later: DPFD (default prefetch depth) value of the "    \
    3.22 +          "Data Stream Control Register."                                   \
    3.23 +          " 0: hardware default, 1: none, 2-7: min-max, 8: don't touch")    \
    3.24 +  product(uint64_t,DSCR_URG_PPC64, 8,                                       \
    3.25 +          "Power8 or later: URG (depth attainment urgency) value of the "   \
    3.26 +          "Data Stream Control Register."                                   \
    3.27 +          " 0: hardware default, 1: none, 2-7: min-max, 8: don't touch")    \
    3.28 +                                                                            \
    3.29    product(bool, UseLoadInstructionsForStackBangingPPC64, false,             \
    3.30            "Use load instructions for stack banging.")                       \
    3.31                                                                              \
     4.1 --- a/src/cpu/ppc/vm/register_ppc.cpp	Mon Apr 15 16:27:50 2019 +0000
     4.2 +++ b/src/cpu/ppc/vm/register_ppc.cpp	Wed Apr 24 11:48:37 2019 -0400
     4.3 @@ -1,6 +1,6 @@
     4.4  /*
     4.5 - * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
     4.6 - * Copyright 2012, 2013 SAP AG. All rights reserved.
     4.7 + * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
     4.8 + * Copyright 2012, 2018 SAP AG. All rights reserved.
     4.9   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    4.10   *
    4.11   * This code is free software; you can redistribute it and/or modify it
    4.12 @@ -75,3 +75,14 @@
    4.13    };
    4.14    return is_valid() ? names[encoding()] : "vnoreg";
    4.15  }
    4.16 +
    4.17 +const char* VectorSRegisterImpl::name() const {
    4.18 +  const char* names[number_of_registers] = {
    4.19 +    "VSR0",  "VSR1",  "VSR2",  "VSR3",  "VSR4",  "VSR5",  "VSR6",  "VSR7",
    4.20 +    "VSR8",  "VSR9",  "VSR10", "VSR11", "VSR12", "VSR13", "VSR14", "VSR15",
    4.21 +    "VSR16", "VSR17", "VSR18", "VSR19", "VSR20", "VSR21", "VSR22", "VSR23",
    4.22 +    "VSR24", "VSR25", "VSR26", "VSR27", "VSR28", "VSR29", "VSR30", "VSR31"
    4.23 +  };
    4.24 +  return is_valid() ? names[encoding()] : "vsnoreg";
    4.25 +}
    4.26 +
     5.1 --- a/src/cpu/ppc/vm/register_ppc.hpp	Mon Apr 15 16:27:50 2019 +0000
     5.2 +++ b/src/cpu/ppc/vm/register_ppc.hpp	Wed Apr 24 11:48:37 2019 -0400
     5.3 @@ -1,6 +1,6 @@
     5.4  /*
     5.5 - * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
     5.6 - * Copyright 2012, 2014 SAP AG. All rights reserved.
     5.7 + * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
     5.8 + * Copyright 2012, 2018 SAP AG. All rights reserved.
     5.9   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5.10   *
    5.11   * This code is free software; you can redistribute it and/or modify it
    5.12 @@ -492,6 +492,106 @@
    5.13  #endif // DONT_USE_REGISTER_DEFINES
    5.14  
    5.15  
    5.16 +// Use VectorSRegister as a shortcut.
    5.17 +class VectorSRegisterImpl;
    5.18 +typedef VectorSRegisterImpl* VectorSRegister;
    5.19 +
    5.20 +inline VectorSRegister as_VectorSRegister(int encoding) {
    5.21 +  return (VectorSRegister)(intptr_t)encoding;
    5.22 +}
    5.23 +
    5.24 +// The implementation of Vector-Scalar (VSX) registers on POWER architecture.
    5.25 +class VectorSRegisterImpl: public AbstractRegisterImpl {
    5.26 + public:
    5.27 +  enum {
    5.28 +    number_of_registers = 32
    5.29 +  };
    5.30 +
    5.31 +  // construction
    5.32 +  inline friend VectorSRegister as_VectorSRegister(int encoding);
    5.33 +
    5.34 +  // accessors
    5.35 +  int encoding() const { assert(is_valid(), "invalid register"); return value(); }
    5.36 +
    5.37 +  // testers
    5.38 +  bool is_valid() const { return 0 <=  value() &&  value() < number_of_registers; }
    5.39 +
    5.40 +  const char* name() const;
    5.41 +};
    5.42 +
    5.43 +// The Vector-Scalar (VSX) registers of the POWER architecture.
    5.44 +
    5.45 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, vsnoreg, (-1));
    5.46 +
    5.47 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR0,  ( 0));
    5.48 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR1,  ( 1));
    5.49 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR2,  ( 2));
    5.50 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR3,  ( 3));
    5.51 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR4,  ( 4));
    5.52 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR5,  ( 5));
    5.53 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR6,  ( 6));
    5.54 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR7,  ( 7));
    5.55 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR8,  ( 8));
    5.56 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR9,  ( 9));
    5.57 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR10, (10));
    5.58 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR11, (11));
    5.59 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR12, (12));
    5.60 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR13, (13));
    5.61 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR14, (14));
    5.62 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR15, (15));
    5.63 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR16, (16));
    5.64 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR17, (17));
    5.65 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR18, (18));
    5.66 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR19, (19));
    5.67 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR20, (20));
    5.68 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR21, (21));
    5.69 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR22, (22));
    5.70 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR23, (23));
    5.71 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR24, (24));
    5.72 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR25, (25));
    5.73 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR26, (26));
    5.74 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR27, (27));
    5.75 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR28, (28));
    5.76 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR29, (29));
    5.77 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR30, (30));
    5.78 +CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR31, (31));
    5.79 +
    5.80 +#ifndef DONT_USE_REGISTER_DEFINES
    5.81 +#define vsnoreg  ((VectorSRegister)(vsnoreg_VectorSRegisterEnumValue)) // Macro form of the vsnoreg constant declared above with value -1.
    5.82 +#define VSR0    ((VectorSRegister)(   VSR0_VectorSRegisterEnumValue))
    5.83 +#define VSR1    ((VectorSRegister)(   VSR1_VectorSRegisterEnumValue))
    5.84 +#define VSR2    ((VectorSRegister)(   VSR2_VectorSRegisterEnumValue))
    5.85 +#define VSR3    ((VectorSRegister)(   VSR3_VectorSRegisterEnumValue))
    5.86 +#define VSR4    ((VectorSRegister)(   VSR4_VectorSRegisterEnumValue))
    5.87 +#define VSR5    ((VectorSRegister)(   VSR5_VectorSRegisterEnumValue))
    5.88 +#define VSR6    ((VectorSRegister)(   VSR6_VectorSRegisterEnumValue))
    5.89 +#define VSR7    ((VectorSRegister)(   VSR7_VectorSRegisterEnumValue))
    5.90 +#define VSR8    ((VectorSRegister)(   VSR8_VectorSRegisterEnumValue))
    5.91 +#define VSR9    ((VectorSRegister)(   VSR9_VectorSRegisterEnumValue))
    5.92 +#define VSR10   ((VectorSRegister)(  VSR10_VectorSRegisterEnumValue))
    5.93 +#define VSR11   ((VectorSRegister)(  VSR11_VectorSRegisterEnumValue))
    5.94 +#define VSR12   ((VectorSRegister)(  VSR12_VectorSRegisterEnumValue))
    5.95 +#define VSR13   ((VectorSRegister)(  VSR13_VectorSRegisterEnumValue))
    5.96 +#define VSR14   ((VectorSRegister)(  VSR14_VectorSRegisterEnumValue))
    5.97 +#define VSR15   ((VectorSRegister)(  VSR15_VectorSRegisterEnumValue))
    5.98 +#define VSR16   ((VectorSRegister)(  VSR16_VectorSRegisterEnumValue))
    5.99 +#define VSR17   ((VectorSRegister)(  VSR17_VectorSRegisterEnumValue))
   5.100 +#define VSR18   ((VectorSRegister)(  VSR18_VectorSRegisterEnumValue))
   5.101 +#define VSR19   ((VectorSRegister)(  VSR19_VectorSRegisterEnumValue))
   5.102 +#define VSR20   ((VectorSRegister)(  VSR20_VectorSRegisterEnumValue))
   5.103 +#define VSR21   ((VectorSRegister)(  VSR21_VectorSRegisterEnumValue))
   5.104 +#define VSR22   ((VectorSRegister)(  VSR22_VectorSRegisterEnumValue))
   5.105 +#define VSR23   ((VectorSRegister)(  VSR23_VectorSRegisterEnumValue))
   5.106 +#define VSR24   ((VectorSRegister)(  VSR24_VectorSRegisterEnumValue))
   5.107 +#define VSR25   ((VectorSRegister)(  VSR25_VectorSRegisterEnumValue))
   5.108 +#define VSR26   ((VectorSRegister)(  VSR26_VectorSRegisterEnumValue))
   5.109 +#define VSR27   ((VectorSRegister)(  VSR27_VectorSRegisterEnumValue))
   5.110 +#define VSR28   ((VectorSRegister)(  VSR28_VectorSRegisterEnumValue))
   5.111 +#define VSR29   ((VectorSRegister)(  VSR29_VectorSRegisterEnumValue))
   5.112 +#define VSR30   ((VectorSRegister)(  VSR30_VectorSRegisterEnumValue))
   5.113 +#define VSR31   ((VectorSRegister)(  VSR31_VectorSRegisterEnumValue))
   5.114 +#endif // DONT_USE_REGISTER_DEFINES
   5.115 +
   5.116  // Maximum number of incoming arguments that can be passed in i registers.
   5.117  const int PPC_ARGS_IN_REGS_NUM = 8;
   5.118  
     6.1 --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon Apr 15 16:27:50 2019 +0000
     6.2 +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Wed Apr 24 11:48:37 2019 -0400
     6.3 @@ -1,6 +1,6 @@
     6.4  /*
     6.5 - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
     6.6 - * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
     6.7 + * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
     6.8 + * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
     6.9   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    6.10   *
    6.11   * This code is free software; you can redistribute it and/or modify it
    6.12 @@ -1352,9 +1352,13 @@
    6.13      Register tmp3 = R8_ARG6;
    6.14      Register tmp4 = R9_ARG7;
    6.15  
    6.16 +    VectorSRegister tmp_vsr1  = VSR1;
    6.17 +    VectorSRegister tmp_vsr2  = VSR2;
    6.18 +
    6.19      address start = __ function_entry();
    6.20  
    6.21 -      Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
    6.22 +    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
    6.23 +
    6.24      // don't try anything fancy if arrays don't have many elements
    6.25      __ li(tmp3, 0);
    6.26      __ cmpwi(CCR0, R5_ARG3, 9);
    6.27 @@ -1412,22 +1416,60 @@
    6.28        __ andi_(R5_ARG3, R5_ARG3, 15);
    6.29        __ mtctr(tmp1);
    6.30  
    6.31 -      __ bind(l_8);
    6.32 -      // Use unrolled version for mass copying (copy 16 elements a time).
    6.33 -      // Load feeding store gets zero latency on Power6, however not on Power5.
    6.34 -      // Therefore, the following sequence is made for the good of both.
    6.35 -      __ ld(tmp1, 0, R3_ARG1);
    6.36 -      __ ld(tmp2, 8, R3_ARG1);
    6.37 -      __ ld(tmp3, 16, R3_ARG1);
    6.38 -      __ ld(tmp4, 24, R3_ARG1);
    6.39 -      __ std(tmp1, 0, R4_ARG2);
    6.40 -      __ std(tmp2, 8, R4_ARG2);
    6.41 -      __ std(tmp3, 16, R4_ARG2);
    6.42 -      __ std(tmp4, 24, R4_ARG2);
    6.43 -      __ addi(R3_ARG1, R3_ARG1, 32);
    6.44 -      __ addi(R4_ARG2, R4_ARG2, 32);
    6.45 -      __ bdnz(l_8);
    6.46 -    }
    6.47 +      if (!VM_Version::has_vsx()) {
    6.48 +
    6.49 +        __ bind(l_8);
    6.50 +        // Use unrolled version for mass copying (copy 16 elements a time).
    6.51 +        // Load feeding store gets zero latency on Power6, however not on Power5.
    6.52 +        // Therefore, the following sequence is made for the good of both.
    6.53 +        __ ld(tmp1, 0, R3_ARG1);
    6.54 +        __ ld(tmp2, 8, R3_ARG1);
    6.55 +        __ ld(tmp3, 16, R3_ARG1);
    6.56 +        __ ld(tmp4, 24, R3_ARG1);
    6.57 +        __ std(tmp1, 0, R4_ARG2);
    6.58 +        __ std(tmp2, 8, R4_ARG2);
    6.59 +        __ std(tmp3, 16, R4_ARG2);
    6.60 +        __ std(tmp4, 24, R4_ARG2);
    6.61 +        __ addi(R3_ARG1, R3_ARG1, 32);
    6.62 +        __ addi(R4_ARG2, R4_ARG2, 32);
    6.63 +        __ bdnz(l_8);
    6.64 +
    6.65 +      } else { // Processor supports VSX, so use it to mass copy.
    6.66 +
    6.67 +        // Prefetch src data into L2 cache.
    6.68 +        __ dcbt(R3_ARG1, 0);
    6.69 +
    6.70 +        // If supported set DSCR pre-fetch to deepest.
    6.71 +        if (VM_Version::has_mfdscr()) {
    6.72 +          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
    6.73 +          __ mtdscr(tmp2);
    6.74 +        }
    6.75 +        __ li(tmp1, 16);
    6.76 +
    6.77 +        // Align the back-branch target to 32 bytes rather than 16 bytes:
    6.78 +        // the loop contains fewer than 8 instructions, so it fits inside
    6.79 +        // a single i-cache sector.
    6.80 +        __ align(32);
    6.81 +
    6.82 +        __ bind(l_9);
    6.83 +        // Use a loop with VSX load/store instructions to
    6.84 +        // copy 16 elements at a time.
    6.85 +        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
    6.86 +        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
    6.87 +        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
    6.88 +        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
    6.89 +        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
    6.90 +        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32.
    6.91 +        __ bdnz(l_9);                        // Dec CTR and loop if not zero.
    6.92 +
    6.93 +        // Restore DSCR pre-fetch value.
    6.94 +        if (VM_Version::has_mfdscr()) {
    6.95 +          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
    6.96 +          __ mtdscr(tmp2);
    6.97 +        }
    6.98 +
    6.99 +      }
   6.100 +    } // FasterArrayCopy
   6.101      __ bind(l_6);
   6.102  
   6.103      // copy 2 elements at a time
     7.1 --- a/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Apr 15 16:27:50 2019 +0000
     7.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp	Wed Apr 24 11:48:37 2019 -0400
     7.3 @@ -1,6 +1,6 @@
     7.4  /*
     7.5 - * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
     7.6 - * Copyright 2012, 2014 SAP AG. All rights reserved.
     7.7 + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
     7.8 + * Copyright 2012, 2018 SAP AG. All rights reserved.
     7.9   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    7.10   *
    7.11   * This code is free software; you can redistribute it and/or modify it
    7.12 @@ -45,7 +45,7 @@
    7.13  int VM_Version::_measured_cache_line_size = 128; // default value
    7.14  const char* VM_Version::_features_str = "";
    7.15  bool VM_Version::_is_determine_features_test_running = false;
    7.16 -
    7.17 +uint64_t VM_Version::_dscr_val = 0;
    7.18  
    7.19  #define MSG(flag)   \
    7.20    if (flag && !FLAG_IS_DEFAULT(flag))                                  \
    7.21 @@ -60,7 +60,9 @@
    7.22  
    7.23    // If PowerArchitecturePPC64 hasn't been specified explicitly determine from features.
    7.24    if (FLAG_IS_DEFAULT(PowerArchitecturePPC64)) {
    7.25 -    if (VM_Version::has_popcntw()) {
    7.26 +    if (VM_Version::has_lqarx()) {
    7.27 +      FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 8);
    7.28 +    } else if (VM_Version::has_popcntw()) {
    7.29        FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 7);
    7.30      } else if (VM_Version::has_cmpb()) {
    7.31        FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 6);
    7.32 @@ -71,8 +73,14 @@
    7.33      }
    7.34    }
    7.35    guarantee(PowerArchitecturePPC64 == 0 || PowerArchitecturePPC64 == 5 ||
    7.36 -            PowerArchitecturePPC64 == 6 || PowerArchitecturePPC64 == 7,
    7.37 -            "PowerArchitecturePPC64 should be 0, 5, 6 or 7");
    7.38 +            PowerArchitecturePPC64 == 6 || PowerArchitecturePPC64 == 7 ||
    7.39 +            PowerArchitecturePPC64 == 8,
    7.40 +            "PowerArchitecturePPC64 should be 0, 5, 6, 7, or 8");
    7.41 +
    7.42 +  // Power 8: Configure Data Stream Control Register.
    7.43 +  if (PowerArchitecturePPC64 >= 8) {
    7.44 +    config_dscr();
    7.45 +  }
    7.46  
    7.47    if (!UseSIGTRAP) {
    7.48      MSG(TrapBasedICMissChecks);
    7.49 @@ -102,7 +110,7 @@
    7.50    // Create and print feature-string.
    7.51    char buf[(num_features+1) * 16]; // Max 16 chars per feature.
    7.52    jio_snprintf(buf, sizeof(buf),
    7.53 -               "ppc64%s%s%s%s%s%s%s%s%s%s",
    7.54 +               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s",
    7.55                 (has_fsqrt()   ? " fsqrt"   : ""),
    7.56                 (has_isel()    ? " isel"    : ""),
    7.57                 (has_lxarxeh() ? " lxarxeh" : ""),
    7.58 @@ -112,12 +120,17 @@
    7.59                 (has_popcntw() ? " popcntw" : ""),
    7.60                 (has_fcfids()  ? " fcfids"  : ""),
    7.61                 (has_vand()    ? " vand"    : ""),
    7.62 +               (has_lqarx()   ? " lqarx"   : ""),
    7.63                 (has_vcipher() ? " aes"     : ""),
    7.64 -               (has_vpmsumb() ? " vpmsumb" : "")
    7.65 +               (has_vpmsumb() ? " vpmsumb" : ""),
    7.66 +               (has_mfdscr()  ? " mfdscr"  : ""),
    7.67 +               (has_vsx()     ? " vsx"     : "")
    7.68                 // Make sure number of %s matches num_features!
    7.69                );
    7.70    _features_str = strdup(buf);
    7.71 -  NOT_PRODUCT(if (Verbose) print_features(););
    7.72 +  if (Verbose) {
    7.73 +    print_features();
    7.74 +  }
    7.75  
    7.76    // PPC64 supports 8-byte compare-exchange operations (see
    7.77    // Atomic::cmpxchg and StubGenerator::generate_atomic_cmpxchg_ptr)
    7.78 @@ -485,8 +498,11 @@
    7.79    a->popcntw(R7, R5);                          // code[7] -> popcntw
    7.80    a->fcfids(F3, F4);                           // code[8] -> fcfids
    7.81    a->vand(VR0, VR0, VR0);                      // code[9] -> vand
    7.82 -  a->vcipher(VR0, VR1, VR2);                   // code[10] -> vcipher
    7.83 -  a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
    7.84 +  a->lqarx_unchecked(R7, R3_ARG1, R4_ARG2, 1); // code[10] -> lqarx_m
    7.85 +  a->vcipher(VR0, VR1, VR2);                   // code[11] -> vcipher
    7.86 +  a->vpmsumb(VR0, VR1, VR2);                   // code[12] -> vpmsumb
    7.87 +  a->mfdscr(R0);                               // code[13] -> mfdscr
    7.88 +  a->lxvd2x(VSR0, 0, R3_ARG1);                 // code[14] -> vsx
    7.89    a->blr();
    7.90  
    7.91    // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
    7.92 @@ -530,8 +546,11 @@
    7.93    if (code[feature_cntr++]) features |= popcntw_m;
    7.94    if (code[feature_cntr++]) features |= fcfids_m;
    7.95    if (code[feature_cntr++]) features |= vand_m;
    7.96 +  if (code[feature_cntr++]) features |= lqarx_m;
    7.97    if (code[feature_cntr++]) features |= vcipher_m;
    7.98    if (code[feature_cntr++]) features |= vpmsumb_m;
    7.99 +  if (code[feature_cntr++]) features |= mfdscr_m;
   7.100 +  if (code[feature_cntr++]) features |= vsx_m;
   7.101  
   7.102    // Print the detection code.
   7.103    if (PrintAssembly) {
   7.104 @@ -543,6 +562,69 @@
   7.105    _features = features;
   7.106  }
   7.107  
   7.108 +// Power 8: Configure Data Stream Control Register.
   7.109 +void VM_Version::config_dscr() {
   7.110 +  assert(has_lqarx(), "Only execute on Power 8 or later!");
   7.111 +
   7.112 +  // 7 InstWords for each call (function descriptor + blr instruction).
   7.113 +  const int code_size = (2+2*7)*BytesPerInstWord;
   7.114 +
   7.115 +  // Allocate space for the code.
   7.116 +  ResourceMark rm;
   7.117 +  CodeBuffer cb("config_dscr", code_size, 0);
   7.118 +  MacroAssembler* a = new MacroAssembler(&cb);
   7.119 +
   7.120 +  // Emit code.
   7.121 +  uint64_t (*get_dscr)() = (uint64_t(*)())(void *)a->function_entry();
   7.122 +  uint32_t *code = (uint32_t *)a->pc();
   7.123 +  a->mfdscr(R3);
   7.124 +  a->blr();
   7.125 +
   7.126 +  void (*set_dscr)(long) = (void(*)(long))(void *)a->function_entry();
   7.127 +  a->mtdscr(R3);
   7.128 +  a->blr();
   7.129 +
   7.130 +  uint32_t *code_end = (uint32_t *)a->pc();
   7.131 +  a->flush();
   7.132 +
   7.133 +  // Print the detection code.
   7.134 +  if (PrintAssembly) {
   7.135 +    ttyLocker ttyl;
   7.136 +    tty->print_cr("Decoding dscr configuration stub at " INTPTR_FORMAT " before execution:", p2i(code));
   7.137 +    Disassembler::decode((u_char*)code, (u_char*)code_end, tty);
   7.138 +  }
   7.139 +
   7.140 +  // Apply the configuration if needed.
   7.141 +  _dscr_val = (*get_dscr)();
   7.142 +  if (Verbose) {
   7.143 +    tty->print_cr("dscr value was 0x%lx" , _dscr_val);
   7.144 +  }
   7.145 +  bool change_requested = false;
   7.146 +  if (DSCR_PPC64 != (uintx)-1) {
   7.147 +    _dscr_val = DSCR_PPC64;
   7.148 +    change_requested = true;
   7.149 +  }
   7.150 +  if (DSCR_DPFD_PPC64 <= 7) {
   7.151 +    uint64_t mask = 0x7;
   7.152 +    if ((_dscr_val & mask) != DSCR_DPFD_PPC64) {
   7.153 +      _dscr_val = (_dscr_val & ~mask) | (DSCR_DPFD_PPC64);
   7.154 +      change_requested = true;
   7.155 +    }
   7.156 +  }
   7.157 +  if (DSCR_URG_PPC64 <= 7) {
   7.158 +    uint64_t mask = 0x7 << 6;
   7.159 +    if ((_dscr_val & mask) != DSCR_DPFD_PPC64 << 6) {
   7.160 +      _dscr_val = (_dscr_val & ~mask) | (DSCR_URG_PPC64 << 6);
   7.161 +      change_requested = true;
   7.162 +    }
   7.163 +  }
   7.164 +  if (change_requested) {
   7.165 +    (*set_dscr)(_dscr_val);
   7.166 +    if (Verbose) {
   7.167 +      tty->print_cr("dscr was set to 0x%lx" , (*get_dscr)());
   7.168 +    }
   7.169 +  }
   7.170 +}
   7.171  
   7.172  static int saved_features = 0;
   7.173  
     8.1 --- a/src/cpu/ppc/vm/vm_version_ppc.hpp	Mon Apr 15 16:27:50 2019 +0000
     8.2 +++ b/src/cpu/ppc/vm/vm_version_ppc.hpp	Wed Apr 24 11:48:37 2019 -0400
     8.3 @@ -1,6 +1,6 @@
     8.4  /*
     8.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     8.6 - * Copyright 2012, 2014 SAP AG. All rights reserved.
     8.7 + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
     8.8 + * Copyright 2012, 2018 SAP AG. All rights reserved.
     8.9   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    8.10   *
    8.11   * This code is free software; you can redistribute it and/or modify it
    8.12 @@ -42,8 +42,11 @@
    8.13      fcfids,
    8.14      vand,
    8.15      dcba,
    8.16 +    lqarx,
    8.17      vcipher,
    8.18      vpmsumb,
    8.19 +    mfdscr,
    8.20 +    vsx,
    8.21      num_features // last entry to count features
    8.22    };
    8.23    enum Feature_Flag_Set {
    8.24 @@ -58,8 +61,11 @@
    8.25      fcfids_m              = (1 << fcfids ),
    8.26      vand_m                = (1 << vand   ),
    8.27      dcba_m                = (1 << dcba   ),
    8.28 +    lqarx_m               = (1 << lqarx  ),
    8.29      vcipher_m             = (1 << vcipher),
    8.30      vpmsumb_m             = (1 << vpmsumb),
    8.31 +    mfdscr_m              = (1 << mfdscr ),
    8.32 +    vsx_m                 = (1 << vsx    ),
    8.33      all_features_m        = -1
    8.34    };
    8.35    static int  _features;
    8.36 @@ -69,6 +75,7 @@
    8.37  
    8.38    static void print_features();
    8.39    static void determine_features(); // also measures cache line size
    8.40 +  static void config_dscr(); // Power 8: Configure Data Stream Control Register.
    8.41    static void determine_section_size();
    8.42    static void power6_micro_bench();
    8.43  public:
    8.44 @@ -87,8 +94,11 @@
    8.45    static bool has_fcfids()  { return (_features & fcfids_m) != 0; }
    8.46    static bool has_vand()    { return (_features & vand_m) != 0; }
    8.47    static bool has_dcba()    { return (_features & dcba_m) != 0; }
    8.48 +  static bool has_lqarx()   { return (_features & lqarx_m) != 0; }
    8.49    static bool has_vcipher() { return (_features & vcipher_m) != 0; }
    8.50    static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
    8.51 +  static bool has_mfdscr()  { return (_features & mfdscr_m) != 0; }
    8.52 +  static bool has_vsx()     { return (_features & vsx_m) != 0; }
    8.53  
    8.54    static const char* cpu_features() { return _features_str; }
    8.55  
    8.56 @@ -97,6 +107,9 @@
    8.57    // Assembler testing
    8.58    static void allow_all();
    8.59    static void revert();
    8.60 +
    8.61 +  // POWER 8: DSCR current value.
    8.62 +  static uint64_t _dscr_val;
    8.63  };
    8.64  
    8.65  #endif // CPU_PPC_VM_VM_VERSION_PPC_HPP

mercurial