jdk8-mips64-public/hotspot: changeset 6661:28bbbecff5f0

     1.1 --- a/.hgtags	Wed May 07 10:58:47 2014 -0700
     1.2 +++ b/.hgtags	Thu May 08 23:07:11 2014 -0700
     1.3 @@ -462,3 +462,4 @@
     1.4  3c291bc2aa7c58efb1219701f38c41731609e595 hs25.20-b12
     1.5  18ae0dac7620474547aa1721bc3fd748af07b8b5 jdk8u20-b12
     1.6  47951595af60460a479b8574622375bfbf5c8ed2 jdk8u20-b13
     1.7 +798f5b02be897151fdad44d695446088b1cca6b1 hs25.20-b13

     2.1 --- a/make/hotspot_version	Wed May 07 10:58:47 2014 -0700
     2.2 +++ b/make/hotspot_version	Thu May 08 23:07:11 2014 -0700
     2.3 @@ -35,7 +35,7 @@
     2.4  
     2.5  HS_MAJOR_VER=25
     2.6  HS_MINOR_VER=20
     2.7 -HS_BUILD_NUMBER=12
     2.8 +HS_BUILD_NUMBER=14
     2.9  
    2.10  JDK_MAJOR_VER=1
    2.11  JDK_MINOR_VER=8

     3.1 --- a/src/cpu/ppc/vm/cppInterpreter_ppc.cpp	Wed May 07 10:58:47 2014 -0700
     3.2 +++ b/src/cpu/ppc/vm/cppInterpreter_ppc.cpp	Thu May 08 23:07:11 2014 -0700
     3.3 @@ -1,3 +1,4 @@
     3.4 +
     3.5  /*
     3.6   * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     3.7   * Copyright 2012, 2013 SAP AG. All rights reserved.
     3.8 @@ -403,7 +404,7 @@
     3.9    BLOCK_COMMENT("compute_interpreter_state {");
    3.10  
    3.11    // access_flags = method->access_flags();
    3.12 -  // TODO: PPC port: assert(4 == methodOopDesc::sz_access_flags(), "unexpected field size");
    3.13 +  // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size");
    3.14    __ lwa(access_flags, method_(access_flags));
    3.15  
    3.16    // parameter_count = method->constMethod->size_of_parameters();
    3.17 @@ -1055,7 +1056,7 @@
    3.18    assert(access_flags->is_nonvolatile(),
    3.19           "access_flags must be in a non-volatile register");
    3.20    // Type check.
    3.21 -  // TODO: PPC port: assert(4 == methodOopDesc::sz_access_flags(), "unexpected field size");
    3.22 +  // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size");
    3.23    __ lwz(access_flags, method_(access_flags));
    3.24  
    3.25    // We don't want to reload R19_method and access_flags after calls
    3.26 @@ -1838,7 +1839,7 @@
    3.27    // Interpreter state fields.
    3.28    const Register msg               = R24_tmp4;
    3.29  
    3.30 -  // MethodOop fields.
    3.31 +  // Method fields.
    3.32    const Register parameter_count   = R25_tmp5;
    3.33    const Register result_index      = R26_tmp6;
    3.34  
    3.35 @@ -2023,7 +2024,7 @@
    3.36    __ add(R17_tos, R17_tos, parameter_count);
    3.37  
    3.38    // Result stub address array index
    3.39 -  // TODO: PPC port: assert(4 == methodOopDesc::sz_result_index(), "unexpected field size");
    3.40 +  // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size");
    3.41    __ lwa(result_index, method_(result_index));
    3.42  
    3.43    __ li(msg, BytecodeInterpreter::method_resume);
    3.44 @@ -2709,7 +2710,7 @@
    3.45    __ ld(R3_ARG1, state_(_result._osr._osr_buf));
    3.46    __ mtctr(R12_scratch2);
    3.47  
    3.48 -  // Load method oop, gc may move it during execution of osr'd method.
    3.49 +  // Load method, gc may move it during execution of osr'd method.
    3.50    __ ld(R22_tmp2, state_(_method));
    3.51    // Load message 'call_method'.
    3.52    __ li(R23_tmp3, BytecodeInterpreter::call_method);

     4.1 --- a/src/cpu/ppc/vm/frame_ppc.inline.hpp	Wed May 07 10:58:47 2014 -0700
     4.2 +++ b/src/cpu/ppc/vm/frame_ppc.inline.hpp	Thu May 08 23:07:11 2014 -0700
     4.3 @@ -26,6 +26,8 @@
     4.4  #ifndef CPU_PPC_VM_FRAME_PPC_INLINE_HPP
     4.5  #define CPU_PPC_VM_FRAME_PPC_INLINE_HPP
     4.6  
     4.7 +#include "code/codeCache.hpp"
     4.8 +
     4.9  // Inline functions for ppc64 frames:
    4.10  
    4.11  // Find codeblob and set deopt_state.

     5.1 --- a/src/cpu/ppc/vm/interp_masm_ppc_64.hpp	Wed May 07 10:58:47 2014 -0700
     5.2 +++ b/src/cpu/ppc/vm/interp_masm_ppc_64.hpp	Thu May 08 23:07:11 2014 -0700
     5.3 @@ -26,7 +26,7 @@
     5.4  #ifndef CPU_PPC_VM_INTERP_MASM_PPC_64_HPP
     5.5  #define CPU_PPC_VM_INTERP_MASM_PPC_64_HPP
     5.6  
     5.7 -#include "assembler_ppc.inline.hpp"
     5.8 +#include "asm/macroAssembler.hpp"
     5.9  #include "interpreter/invocationCounter.hpp"
    5.10  
    5.11  // This file specializes the assembler with interpreter-specific macros.

     6.1 --- a/src/cpu/ppc/vm/interpreterRT_ppc.cpp	Wed May 07 10:58:47 2014 -0700
     6.2 +++ b/src/cpu/ppc/vm/interpreterRT_ppc.cpp	Thu May 08 23:07:11 2014 -0700
     6.3 @@ -24,6 +24,7 @@
     6.4   */
     6.5  
     6.6  #include "precompiled.hpp"
     6.7 +#include "asm/assembler.inline.hpp"
     6.8  #include "interpreter/interpreter.hpp"
     6.9  #include "interpreter/interpreterRuntime.hpp"
    6.10  #include "memory/allocation.inline.hpp"

     7.1 --- a/src/cpu/ppc/vm/interpreter_ppc.cpp	Wed May 07 10:58:47 2014 -0700
     7.2 +++ b/src/cpu/ppc/vm/interpreter_ppc.cpp	Thu May 08 23:07:11 2014 -0700
     7.3 @@ -139,32 +139,16 @@
     7.4    // Signature is in R3_RET. Signature is callee saved.
     7.5    __ mr(signature, R3_RET);
     7.6  
     7.7 -  // Reload method, it may have moved.
     7.8 -#ifdef CC_INTERP
     7.9 -  __ ld(R19_method, state_(_method));
    7.10 -#else
    7.11 -  __ ld(R19_method, 0, target_sp);
    7.12 -  __ ld(R19_method, _ijava_state_neg(method), R19_method);
    7.13 -#endif
    7.14 -
    7.15    // Get the result handler.
    7.16    __ call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::get_result_handler), R16_thread, R19_method);
    7.17  
    7.18 -  // Reload method, it may have moved.
    7.19 -#ifdef CC_INTERP
    7.20 -  __ ld(R19_method, state_(_method));
    7.21 -#else
    7.22 -  __ ld(R19_method, 0, target_sp);
    7.23 -  __ ld(R19_method, _ijava_state_neg(method), R19_method);
    7.24 -#endif
    7.25 -
    7.26    {
    7.27      Label L;
    7.28      // test if static
    7.29      // _access_flags._flags must be at offset 0.
    7.30      // TODO PPC port: requires change in shared code.
    7.31      //assert(in_bytes(AccessFlags::flags_offset()) == 0,
    7.32 -    //       "MethodOopDesc._access_flags == MethodOopDesc._access_flags._flags");
    7.33 +    //       "MethodDesc._access_flags == MethodDesc._access_flags._flags");
    7.34      // _access_flags must be a 32 bit value.
    7.35      assert(sizeof(AccessFlags) == 4, "wrong size");
    7.36      __ lwa(R11_scratch1/*access_flags*/, method_(access_flags));

     8.1 --- a/src/cpu/ppc/vm/jniFastGetField_ppc.cpp	Wed May 07 10:58:47 2014 -0700
     8.2 +++ b/src/cpu/ppc/vm/jniFastGetField_ppc.cpp	Thu May 08 23:07:11 2014 -0700
     8.3 @@ -32,7 +32,7 @@
     8.4  
     8.5  
     8.6  address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) {
     8.7 -  // we don't have fast jni accessors.
     8.8 +  // We don't have fast jni accessors.
     8.9    return (address) -1;
    8.10  }
    8.11  
    8.12 @@ -57,12 +57,12 @@
    8.13  }
    8.14  
    8.15  address JNI_FastGetField::generate_fast_get_long_field() {
    8.16 -  // we don't have fast jni accessors.
    8.17 +  // We don't have fast jni accessors.
    8.18    return (address) -1;
    8.19  }
    8.20  
    8.21  address JNI_FastGetField::generate_fast_get_float_field0(BasicType type) {
    8.22 -  // e don't have fast jni accessors.
    8.23 +  // We don't have fast jni accessors.
    8.24    return (address) -1;
    8.25  }
    8.26

     9.1 --- a/src/cpu/ppc/vm/ppc.ad	Wed May 07 10:58:47 2014 -0700
     9.2 +++ b/src/cpu/ppc/vm/ppc.ad	Thu May 08 23:07:11 2014 -0700
     9.3 @@ -898,7 +898,7 @@
     9.4    // To keep related declarations/definitions/uses close together,
     9.5    // we switch between source %{ }% and source_hpp %{ }% freely as needed.
     9.6  
     9.7 -  // Returns true if Node n is followed by a MemBar node that 
     9.8 +  // Returns true if Node n is followed by a MemBar node that
     9.9    // will do an acquire. If so, this node must not do the acquire
    9.10    // operation.
    9.11    bool followed_by_acquire(const Node *n);
    9.12 @@ -908,7 +908,7 @@
    9.13  
    9.14  // Optimize load-acquire.
    9.15  //
    9.16 -// Check if acquire is unnecessary due to following operation that does 
    9.17 +// Check if acquire is unnecessary due to following operation that does
    9.18  // acquire anyways.
    9.19  // Walk the pattern:
    9.20  //
    9.21 @@ -919,12 +919,12 @@
    9.22  //  Proj(ctrl)  Proj(mem)
    9.23  //       |         |
    9.24  //   MemBarRelease/Volatile
    9.25 -// 
    9.26 +//
    9.27  bool followed_by_acquire(const Node *load) {
    9.28    assert(load->is_Load(), "So far implemented only for loads.");
    9.29  
    9.30    // Find MemBarAcquire.
    9.31 -  const Node *mba = NULL;         
    9.32 +  const Node *mba = NULL;
    9.33    for (DUIterator_Fast imax, i = load->fast_outs(imax); i < imax; i++) {
    9.34      const Node *out = load->fast_out(i);
    9.35      if (out->Opcode() == Op_MemBarAcquire) {
    9.36 @@ -937,7 +937,7 @@
    9.37  
    9.38    // Find following MemBar node.
    9.39    //
    9.40 -  // The following node must be reachable by control AND memory 
    9.41 +  // The following node must be reachable by control AND memory
    9.42    // edge to assure no other operations are in between the two nodes.
    9.43    //
    9.44    // So first get the Proj node, mem_proj, to use it to iterate forward.
    9.45 @@ -1135,6 +1135,7 @@
    9.46  
    9.47   public:
    9.48  
    9.49 +  // Emit call stub, compiled java to interpreter.
    9.50    static void emit_trampoline_stub(MacroAssembler &_masm, int destination_toc_offset, int insts_call_instruction_offset);
    9.51  
    9.52    // Size of call trampoline stub.
    9.53 @@ -2752,7 +2753,7 @@
    9.54        // inputs for new nodes
    9.55        m1->add_req(NULL, n_toc);
    9.56        m2->add_req(NULL, m1);
    9.57 -      
    9.58 +
    9.59        // operands for new nodes
    9.60        m1->_opnds[0] = new (C) iRegPdstOper(); // dst
    9.61        m1->_opnds[1] = op_src;                 // src
    9.62 @@ -2760,29 +2761,29 @@
    9.63        m2->_opnds[0] = new (C) iRegPdstOper(); // dst
    9.64        m2->_opnds[1] = op_src;                 // src
    9.65        m2->_opnds[2] = new (C) iRegLdstOper(); // base
    9.66 -      
    9.67 +
    9.68        // Initialize ins_attrib TOC fields.
    9.69        m1->_const_toc_offset = -1;
    9.70        m2->_const_toc_offset_hi_node = m1;
    9.71 -      
    9.72 +
    9.73        // Register allocation for new nodes.
    9.74        ra_->set_pair(m1->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this));
    9.75        ra_->set_pair(m2->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this));
    9.76 -      
    9.77 +
    9.78        nodes->push(m1);
    9.79        nodes->push(m2);
    9.80        assert(m2->bottom_type()->isa_ptr(), "must be ptr");
    9.81      } else {
    9.82        loadConPNode *m2 = new (C) loadConPNode();
    9.83 -      
    9.84 +
    9.85        // inputs for new nodes
    9.86        m2->add_req(NULL, n_toc);
    9.87 -      
    9.88 +
    9.89        // operands for new nodes
    9.90        m2->_opnds[0] = new (C) iRegPdstOper(); // dst
    9.91        m2->_opnds[1] = op_src;                 // src
    9.92        m2->_opnds[2] = new (C) iRegPdstOper(); // toc
    9.93 -      
    9.94 +
    9.95        // Register allocation for new nodes.
    9.96        ra_->set_pair(m2->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this));
    9.97  
    9.98 @@ -2974,17 +2975,17 @@
    9.99        n_sub_base->_opnds[1] = op_crx;
   9.100        n_sub_base->_opnds[2] = op_src;
   9.101        n_sub_base->_bottom_type = _bottom_type;
   9.102 -   
   9.103 +
   9.104        n_shift->add_req(n_region, n_sub_base);
   9.105        n_shift->_opnds[0] = op_dst;
   9.106        n_shift->_opnds[1] = op_dst;
   9.107        n_shift->_bottom_type = _bottom_type;
   9.108 -   
   9.109 +
   9.110        ra_->set_pair(n_shift->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this));
   9.111        ra_->set_pair(n_compare->_idx, ra_->get_reg_second(n_crx), ra_->get_reg_first(n_crx));
   9.112        ra_->set_pair(n_sub_base->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this));
   9.113        ra_->set_pair(n_move->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this));
   9.114 -   
   9.115 +
   9.116        nodes->push(n_move);
   9.117        nodes->push(n_compare);
   9.118        nodes->push(n_sub_base);
   9.119 @@ -3061,20 +3062,20 @@
   9.120      } else {
   9.121        // before Power 7
   9.122        cond_add_baseNode *n_add_base = new (C) cond_add_baseNode();
   9.123 -     
   9.124 +
   9.125        n_add_base->add_req(n_region, n_compare, n_shift);
   9.126        n_add_base->_opnds[0] = op_dst;
   9.127        n_add_base->_opnds[1] = op_crx;
   9.128        n_add_base->_opnds[2] = op_dst;
   9.129        n_add_base->_bottom_type = _bottom_type;
   9.130 -     
   9.131 +
   9.132        assert(ra_->is_oop(this) == true, "A decodeN node must produce an oop!");
   9.133        ra_->set_oop(n_add_base, true);
   9.134 -     
   9.135 +
   9.136        ra_->set_pair(n_shift->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this));
   9.137        ra_->set_pair(n_compare->_idx, ra_->get_reg_second(n_crx), ra_->get_reg_first(n_crx));
   9.138        ra_->set_pair(n_add_base->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this));
   9.139 -     
   9.140 +
   9.141        nodes->push(n_compare);
   9.142        nodes->push(n_shift);
   9.143        nodes->push(n_add_base);
   9.144 @@ -3631,11 +3632,11 @@
   9.145      // Req...
   9.146      for (uint i = 0; i < req(); ++i) {
   9.147        // The expanded node does not need toc any more.
   9.148 -      // Add the inline cache constant here instead.  This expresses the 
   9.149 +      // Add the inline cache constant here instead. This expresses the
   9.150        // register of the inline cache must be live at the call.
   9.151        // Else we would have to adapt JVMState by -1.
   9.152        if (i == mach_constant_base_node_input()) {
   9.153 -        call->add_req(loadConLNodes_IC._last);        
   9.154 +        call->add_req(loadConLNodes_IC._last);
   9.155        } else {
   9.156          call->add_req(in(i));
   9.157        }
   9.158 @@ -3663,6 +3664,8 @@
   9.159    %}
   9.160  
   9.161    // Compound version of call dynamic
   9.162 +  // Toc is only passed so that it can be used in ins_encode statement.
   9.163 +  // In the code we have to use $constanttablebase.
   9.164    enc_class enc_java_dynamic_call(method meth, iRegLdst toc) %{
   9.165      // TODO: PPC port $archOpcode(ppc64Opcode_compound);
   9.166      MacroAssembler _masm(&cbuf);
   9.167 @@ -3670,14 +3673,17 @@
   9.168  
   9.169      Register Rtoc = (ra_) ? $constanttablebase : R2_TOC;
   9.170  #if 0
   9.171 +    int vtable_index = this->_vtable_index;
   9.172      if (_vtable_index < 0) {
   9.173        // Must be invalid_vtable_index, not nonvirtual_vtable_index.
   9.174        assert(_vtable_index == Method::invalid_vtable_index, "correct sentinel value");
   9.175        Register ic_reg = as_Register(Matcher::inline_cache_reg_encode());
   9.176 -      AddressLiteral meta = __ allocate_metadata_address((Metadata *)Universe::non_oop_word());
   9.177 -
   9.178 +
   9.179 +      // Virtual call relocation will point to ic load.
   9.180        address virtual_call_meta_addr = __ pc();
   9.181 -      __ load_const_from_method_toc(ic_reg, meta, Rtoc);
   9.182 +      // Load a clear inline cache.
   9.183 +      AddressLiteral empty_ic((address) Universe::non_oop_word());
   9.184 +      __ load_const_from_method_toc(ic_reg, empty_ic, Rtoc);
   9.185        // CALL to fixup routine.  Fixup routine uses ScopeDesc info
   9.186        // to determine who we intended to call.
   9.187        __ relocate(virtual_call_Relocation::spec(virtual_call_meta_addr));
   9.188 @@ -3710,7 +3716,6 @@
   9.189               "Fix constant in ret_addr_offset()");
   9.190      }
   9.191  #endif
   9.192 -    guarantee(0, "Fix handling of toc edge: messes up derived/base pairs.");
   9.193      Unimplemented();  // ret_addr_offset not yet fixed. Depends on compressed oops (load klass!).
   9.194    %}
   9.195  
   9.196 @@ -5436,7 +5441,7 @@
   9.197    ins_pipe(pipe_class_memory);
   9.198  %}
   9.199  
   9.200 -// Match loading integer and casting it to unsigned int in 
   9.201 +// Match loading integer and casting it to unsigned int in
   9.202  // long register.
   9.203  // LoadI + ConvI2L + AndL 0xffffffff.
   9.204  instruct loadUI2L(iRegLdst dst, memory mem, immL_32bits mask) %{
   9.205 @@ -6078,7 +6083,7 @@
   9.206    ins_pipe(pipe_class_default);
   9.207  %}
   9.208  
   9.209 -// This needs a match rule so that build_oop_map knows this is 
   9.210 +// This needs a match rule so that build_oop_map knows this is
   9.211  // not a narrow oop.
   9.212  instruct loadConNKlass_lo(iRegNdst dst, immNKlass_NM src1, iRegNsrc src2) %{
   9.213    match(Set dst src1);
   9.214 @@ -6702,7 +6707,7 @@
   9.215    size(4);
   9.216    ins_encode %{
   9.217      // This is a Power7 instruction for which no machine description exists.
   9.218 -    // TODO: PPC port $archOpcode(ppc64Opcode_compound); 
   9.219 +    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
   9.220      __ isel_0($dst$$Register, $crx$$CondRegister, Assembler::equal, $src1$$Register);
   9.221    %}
   9.222    ins_pipe(pipe_class_default);
   9.223 @@ -6847,7 +6852,7 @@
   9.224    size(4);
   9.225    ins_encode %{
   9.226      // This is a Power7 instruction for which no machine description exists.
   9.227 -    // TODO: PPC port $archOpcode(ppc64Opcode_compound); 
   9.228 +    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
   9.229      __ isel_0($dst$$Register, $crx$$CondRegister, Assembler::equal, $src1$$Register);
   9.230    %}
   9.231    ins_pipe(pipe_class_default);
   9.232 @@ -7064,7 +7069,7 @@
   9.233      n1->_bottom_type = _bottom_type;
   9.234  
   9.235      decodeNKlass_shiftNode *n2 = new (C) decodeNKlass_shiftNode();
   9.236 -    n2->add_req(n_region, n2);
   9.237 +    n2->add_req(n_region, n1);
   9.238      n2->_opnds[0] = op_dst;
   9.239      n2->_opnds[1] = op_dst;
   9.240      n2->_bottom_type = _bottom_type;
   9.241 @@ -7199,7 +7204,7 @@
   9.242  //  inline_unsafe_load_store).
   9.243  //
   9.244  // Add this node again if we found a good solution for inline_unsafe_load_store().
   9.245 -// Don't forget to look at the implementation of post_store_load_barrier again, 
   9.246 +// Don't forget to look at the implementation of post_store_load_barrier again,
   9.247  // we did other fixes in that method.
   9.248  //instruct unnecessary_membar_volatile() %{
   9.249  //  match(MemBarVolatile);
   9.250 @@ -7237,7 +7242,7 @@
   9.251      // exists. Anyways, the scheduler should be off on Power7.
   9.252      // TODO: PPC port $archOpcode(ppc64Opcode_compound);
   9.253      int cc        = $cmp$$cmpcode;
   9.254 -    __ isel($dst$$Register, $crx$$CondRegister, 
   9.255 +    __ isel($dst$$Register, $crx$$CondRegister,
   9.256              (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register);
   9.257    %}
   9.258    ins_pipe(pipe_class_default);
   9.259 @@ -7283,7 +7288,7 @@
   9.260      // exists. Anyways, the scheduler should be off on Power7.
   9.261      // TODO: PPC port $archOpcode(ppc64Opcode_compound);
   9.262      int cc        = $cmp$$cmpcode;
   9.263 -    __ isel($dst$$Register, $crx$$CondRegister, 
   9.264 +    __ isel($dst$$Register, $crx$$CondRegister,
   9.265              (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register);
   9.266    %}
   9.267    ins_pipe(pipe_class_default);
   9.268 @@ -7329,7 +7334,7 @@
   9.269      // exists. Anyways, the scheduler should be off on Power7.
   9.270      // TODO: PPC port $archOpcode(ppc64Opcode_compound);
   9.271      int cc        = $cmp$$cmpcode;
   9.272 -    __ isel($dst$$Register, $crx$$CondRegister, 
   9.273 +    __ isel($dst$$Register, $crx$$CondRegister,
   9.274              (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register);
   9.275    %}
   9.276    ins_pipe(pipe_class_default);
   9.277 @@ -7376,7 +7381,7 @@
   9.278      // exists. Anyways, the scheduler should be off on Power7.
   9.279      // TODO: PPC port $archOpcode(ppc64Opcode_compound);
   9.280      int cc        = $cmp$$cmpcode;
   9.281 -    __ isel($dst$$Register, $crx$$CondRegister, 
   9.282 +    __ isel($dst$$Register, $crx$$CondRegister,
   9.283              (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register);
   9.284    %}
   9.285    ins_pipe(pipe_class_default);
   9.286 @@ -7522,8 +7527,8 @@
   9.287    ins_encode %{
   9.288      // TODO: PPC port $archOpcode(ppc64Opcode_compound);
   9.289      // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
   9.290 -    __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register, 
   9.291 -                MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(), 
   9.292 +    __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
   9.293 +                MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(),
   9.294                  $res$$Register, true);
   9.295    %}
   9.296    ins_pipe(pipe_class_default);
   9.297 @@ -7929,7 +7934,23 @@
   9.298  
   9.299  // Turn the sign-bit of a long into a 64-bit mask, 0x0...0 for
   9.300  // positive longs and 0xF...F for negative ones.
   9.301 -instruct signmask64I_regI(iRegIdst dst, iRegIsrc src) %{
   9.302 +instruct signmask64I_regL(iRegIdst dst, iRegLsrc src) %{
   9.303 +  // no match-rule, false predicate
   9.304 +  effect(DEF dst, USE src);
   9.305 +  predicate(false);
   9.306 +
   9.307 +  format %{ "SRADI   $dst, $src, #63" %}
   9.308 +  size(4);
   9.309 +  ins_encode %{
   9.310 +    // TODO: PPC port $archOpcode(ppc64Opcode_sradi);
   9.311 +    __ sradi($dst$$Register, $src$$Register, 0x3f);
   9.312 +  %}
   9.313 +  ins_pipe(pipe_class_default);
   9.314 +%}
   9.315 +
   9.316 +// Turn the sign-bit of a long into a 64-bit mask, 0x0...0 for
   9.317 +// positive longs and 0xF...F for negative ones.
   9.318 +instruct signmask64L_regL(iRegLdst dst, iRegLsrc src) %{
   9.319    // no match-rule, false predicate
   9.320    effect(DEF dst, USE src);
   9.321    predicate(false);
   9.322 @@ -8893,7 +8914,7 @@
   9.323    size(4);
   9.324    ins_encode %{
   9.325      // TODO: PPC port $archOpcode(ppc64Opcode_rlwinm);
   9.326 -    __ rlwinm($dst$$Register, $src1$$Register, 0, 
   9.327 +    __ rlwinm($dst$$Register, $src1$$Register, 0,
   9.328                (31-log2_long((jlong) $src2$$constant)) & 0x1f, (31-log2_long((jlong) $src2$$constant)) & 0x1f);
   9.329    %}
   9.330    ins_pipe(pipe_class_default);
   9.331 @@ -9619,14 +9640,14 @@
   9.332    ins_cost(DEFAULT_COST*4);
   9.333  
   9.334    expand %{
   9.335 -    iRegIdst src1s;
   9.336 -    iRegIdst src2s;
   9.337 -    iRegIdst diff;
   9.338 -    sxtI_reg(src1s, src1); // ensure proper sign extention
   9.339 -    sxtI_reg(src2s, src2); // ensure proper sign extention
   9.340 -    subI_reg_reg(diff, src1s, src2s);
   9.341 +    iRegLdst src1s;
   9.342 +    iRegLdst src2s;
   9.343 +    iRegLdst diff;
   9.344 +    convI2L_reg(src1s, src1); // Ensure proper sign extension.
   9.345 +    convI2L_reg(src2s, src2); // Ensure proper sign extension.
   9.346 +    subL_reg_reg(diff, src1s, src2s);
   9.347      // Need to consider >=33 bit result, therefore we need signmaskL.
   9.348 -    signmask64I_regI(dst, diff);
   9.349 +    signmask64I_regL(dst, diff);
   9.350    %}
   9.351  %}
   9.352  
   9.353 @@ -10863,7 +10884,7 @@
   9.354    format %{ "PartialSubtypeCheck $result = ($subklass instanceOf $superklass) tmp: $tmp_klass, $tmp_arrayptr" %}
   9.355    ins_encode %{
   9.356      // TODO: PPC port $archOpcode(ppc64Opcode_compound);
   9.357 -    __ check_klass_subtype_slow_path($subklass$$Register, $superklass$$Register, $tmp_arrayptr$$Register, 
   9.358 +    __ check_klass_subtype_slow_path($subklass$$Register, $superklass$$Register, $tmp_arrayptr$$Register,
   9.359                                       $tmp_klass$$Register, NULL, $result$$Register);
   9.360    %}
   9.361    ins_pipe(pipe_class_default);
   9.362 @@ -11178,18 +11199,18 @@
   9.363    ins_cost(DEFAULT_COST*6);
   9.364  
   9.365    expand %{
   9.366 -    iRegIdst src1s;
   9.367 -    iRegIdst src2s;
   9.368 -    iRegIdst diff;
   9.369 -    iRegIdst sm;
   9.370 -    iRegIdst doz; // difference or zero
   9.371 -    sxtI_reg(src1s, src1); // Ensure proper sign extention.
   9.372 -    sxtI_reg(src2s, src2); // Ensure proper sign extention.
   9.373 -    subI_reg_reg(diff, src2s, src1s);
   9.374 +    iRegLdst src1s;
   9.375 +    iRegLdst src2s;
   9.376 +    iRegLdst diff;
   9.377 +    iRegLdst sm;
   9.378 +    iRegLdst doz; // difference or zero
   9.379 +    convI2L_reg(src1s, src1); // Ensure proper sign extension.
   9.380 +    convI2L_reg(src2s, src2); // Ensure proper sign extension.
   9.381 +    subL_reg_reg(diff, src2s, src1s);
   9.382      // Need to consider >=33 bit result, therefore we need signmaskL.
   9.383 -    signmask64I_regI(sm, diff);
   9.384 -    andI_reg_reg(doz, diff, sm); // <=0
   9.385 -    addI_reg_reg(dst, doz, src1s);
   9.386 +    signmask64L_regL(sm, diff);
   9.387 +    andL_reg_reg(doz, diff, sm); // <=0
   9.388 +    addI_regL_regL(dst, doz, src1s);
   9.389    %}
   9.390  %}
   9.391  
   9.392 @@ -11198,19 +11219,18 @@
   9.393    ins_cost(DEFAULT_COST*6);
   9.394  
   9.395    expand %{
   9.396 -    immI_minus1 m1 %{ -1 %}
   9.397 -    iRegIdst src1s;
   9.398 -    iRegIdst src2s;
   9.399 -    iRegIdst diff;
   9.400 -    iRegIdst sm;
   9.401 -    iRegIdst doz; // difference or zero
   9.402 -    sxtI_reg(src1s, src1); // Ensure proper sign extention.
   9.403 -    sxtI_reg(src2s, src2); // Ensure proper sign extention.
   9.404 -    subI_reg_reg(diff, src2s, src1s);
   9.405 +    iRegLdst src1s;
   9.406 +    iRegLdst src2s;
   9.407 +    iRegLdst diff;
   9.408 +    iRegLdst sm;
   9.409 +    iRegLdst doz; // difference or zero
   9.410 +    convI2L_reg(src1s, src1); // Ensure proper sign extension.
   9.411 +    convI2L_reg(src2s, src2); // Ensure proper sign extension.
   9.412 +    subL_reg_reg(diff, src2s, src1s);
   9.413      // Need to consider >=33 bit result, therefore we need signmaskL.
   9.414 -    signmask64I_regI(sm, diff);
   9.415 -    andcI_reg_reg(doz, sm, m1, diff); // >=0
   9.416 -    addI_reg_reg(dst, doz, src1s);
   9.417 +    signmask64L_regL(sm, diff);
   9.418 +    andcL_reg_reg(doz, diff, sm); // >=0
   9.419 +    addI_regL_regL(dst, doz, src1s);
   9.420    %}
   9.421  %}
   9.422

    10.1 --- a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp	Wed May 07 10:58:47 2014 -0700
    10.2 +++ b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp	Thu May 08 23:07:11 2014 -0700
    10.3 @@ -81,24 +81,18 @@
    10.4  #if 0
    10.5  // Call special ClassCastException constructor taking object to cast
    10.6  // and target class as arguments.
    10.7 -address TemplateInterpreterGenerator::generate_ClassCastException_verbose_handler(const char* name) {
    10.8 +address TemplateInterpreterGenerator::generate_ClassCastException_verbose_handler() {
    10.9    address entry = __ pc();
   10.10  
   10.11 -  // Target class oop is in register R6_ARG4 by convention!
   10.12 -
   10.13    // Expression stack must be empty before entering the VM if an
   10.14    // exception happened.
   10.15    __ empty_expression_stack();
   10.16 -  // Setup parameters.
   10.17 +
   10.18    // Thread will be loaded to R3_ARG1.
   10.19 -  __ load_const_optimized(R4_ARG2, (address) name);
   10.20 -  __ mr(R5_ARG3, R17_tos);
   10.21 -  // R6_ARG4 contains specified class.
   10.22 -  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException_verbose));
   10.23 -#ifdef ASSERT
   10.24 +  // Target class oop is in register R5_ARG3 by convention!
   10.25 +  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException_verbose, R17_tos, R5_ARG3));
   10.26    // Above call must not return here since exception pending.
   10.27 -  __ should_not_reach_here();
   10.28 -#endif
   10.29 +  DEBUG_ONLY(__ should_not_reach_here();)
   10.30    return entry;
   10.31  }
   10.32  #endif
   10.33 @@ -1535,14 +1529,32 @@
   10.34      __ stw(R0, in_bytes(JavaThread::popframe_condition_offset()), R16_thread);
   10.35  
   10.36      // Get out of the current method and re-execute the call that called us.
   10.37 -    __ merge_frames(/*top_frame_sp*/ R21_sender_SP, /*return_pc*/ return_pc, R11_scratch1, R12_scratch2);
   10.38 +    __ merge_frames(/*top_frame_sp*/ R21_sender_SP, /*return_pc*/ noreg, R11_scratch1, R12_scratch2);
   10.39      __ restore_interpreter_state(R11_scratch1);
   10.40      __ ld(R12_scratch2, _ijava_state_neg(top_frame_sp), R11_scratch1);
   10.41      __ resize_frame_absolute(R12_scratch2, R11_scratch1, R0);
   10.42 -    __ mtlr(return_pc);
   10.43      if (ProfileInterpreter) {
   10.44        __ set_method_data_pointer_for_bcp();
   10.45      }
   10.46 +#if INCLUDE_JVMTI
   10.47 +    Label L_done;
   10.48 +
   10.49 +    __ lbz(R11_scratch1, 0, R14_bcp);
   10.50 +    __ cmpwi(CCR0, R11_scratch1, Bytecodes::_invokestatic);
   10.51 +    __ bne(CCR0, L_done);
   10.52 +
   10.53 +    // The member name argument must be restored if _invokestatic is re-executed after a PopFrame call.
   10.54 +    // Detect such a case in the InterpreterRuntime function and return the member name argument, or NULL.
   10.55 +    __ ld(R4_ARG2, 0, R18_locals);
   10.56 +    __ call_VM(R11_scratch1, CAST_FROM_FN_PTR(address, InterpreterRuntime::member_name_arg_or_null),
   10.57 +               R4_ARG2, R19_method, R14_bcp);
   10.58 +
   10.59 +    __ cmpdi(CCR0, R11_scratch1, 0);
   10.60 +    __ beq(CCR0, L_done);
   10.61 +
   10.62 +    __ std(R11_scratch1, wordSize, R15_esp);
   10.63 +    __ bind(L_done);
   10.64 +#endif // INCLUDE_JVMTI
   10.65      __ dispatch_next(vtos);
   10.66    }
   10.67    // end of JVMTI PopFrame support

    11.1 --- a/src/cpu/ppc/vm/templateTable_ppc_64.cpp	Wed May 07 10:58:47 2014 -0700
    11.2 +++ b/src/cpu/ppc/vm/templateTable_ppc_64.cpp	Thu May 08 23:07:11 2014 -0700
    11.3 @@ -64,7 +64,7 @@
    11.4    assert_different_registers(Rtmp1, Rtmp2, Rtmp3, Rval, Rbase);
    11.5  
    11.6    switch (barrier) {
    11.7 -#ifndef SERIALGC
    11.8 +#if INCLUDE_ALL_GCS
    11.9      case BarrierSet::G1SATBCT:
   11.10      case BarrierSet::G1SATBCTLogging:
   11.11        {
   11.12 @@ -104,7 +104,7 @@
   11.13          __ bind(Ldone);
   11.14        }
   11.15        break;
   11.16 -#endif // SERIALGC
   11.17 +#endif // INCLUDE_ALL_GCS
   11.18      case BarrierSet::CardTableModRef:
   11.19      case BarrierSet::CardTableExtension:
   11.20        {
   11.21 @@ -259,17 +259,17 @@
   11.22    switch (value) {
   11.23      default: ShouldNotReachHere();
   11.24      case 0: {
   11.25 -      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0);
   11.26 +      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0, true);
   11.27        __ lfs(F15_ftos, simm16_offset, R11_scratch1);
   11.28        break;
   11.29      }
   11.30      case 1: {
   11.31 -      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0);
   11.32 +      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0, true);
   11.33        __ lfs(F15_ftos, simm16_offset, R11_scratch1);
   11.34        break;
   11.35      }
   11.36      case 2: {
   11.37 -      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&two, R0);
   11.38 +      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&two, R0, true);
   11.39        __ lfs(F15_ftos, simm16_offset, R11_scratch1);
   11.40        break;
   11.41      }
   11.42 @@ -282,12 +282,12 @@
   11.43    static double one  = 1.0;
   11.44    switch (value) {
   11.45      case 0: {
   11.46 -      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0);
   11.47 +      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0, true);
   11.48        __ lfd(F15_ftos, simm16_offset, R11_scratch1);
   11.49        break;
   11.50      }
   11.51      case 1: {
   11.52 -      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0);
   11.53 +      int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0, true);
   11.54        __ lfd(F15_ftos, simm16_offset, R11_scratch1);
   11.55        break;
   11.56      }
   11.57 @@ -3728,9 +3728,9 @@
   11.58    transition(atos, atos);
   11.59  
   11.60    Label Ldone, Lis_null, Lquicked, Lresolved;
   11.61 -  Register Roffset         = R5_ARG3,
   11.62 +  Register Roffset         = R6_ARG4,
   11.63             RobjKlass       = R4_ARG2,
   11.64 -           RspecifiedKlass = R6_ARG4, // Generate_ClassCastException_verbose_handler will expect this register.
   11.65 +           RspecifiedKlass = R5_ARG3, // Generate_ClassCastException_verbose_handler will read value from this register.
   11.66             Rcpool          = R11_scratch1,
   11.67             Rtags           = R12_scratch2;
   11.68

    12.1 --- a/src/cpu/sparc/vm/assembler_sparc.hpp	Wed May 07 10:58:47 2014 -0700
    12.2 +++ b/src/cpu/sparc/vm/assembler_sparc.hpp	Thu May 08 23:07:11 2014 -0700
    12.3 @@ -1,5 +1,5 @@
    12.4  /*
    12.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
    12.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
    12.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    12.8   *
    12.9   * This code is free software; you can redistribute it and/or modify it
   12.10 @@ -123,8 +123,13 @@
   12.11      fpop2_op3    = 0x35,
   12.12      impdep1_op3  = 0x36,
   12.13      aes3_op3     = 0x36,
   12.14 +    alignaddr_op3  = 0x36,
   12.15 +    faligndata_op3 = 0x36,
   12.16      flog3_op3    = 0x36,
   12.17 +    edge_op3     = 0x36,
   12.18 +    fsrc_op3     = 0x36,
   12.19      impdep2_op3  = 0x37,
   12.20 +    stpartialf_op3 = 0x37,
   12.21      jmpl_op3     = 0x38,
   12.22      rett_op3     = 0x39,
   12.23      trap_op3     = 0x3a,
   12.24 @@ -175,17 +180,23 @@
   12.25  
   12.26    enum opfs {
   12.27      // selected opfs
   12.28 +    edge8n_opf         = 0x01,
   12.29 +
   12.30      fmovs_opf          = 0x01,
   12.31      fmovd_opf          = 0x02,
   12.32  
   12.33      fnegs_opf          = 0x05,
   12.34      fnegd_opf          = 0x06,
   12.35  
   12.36 +    alignaddr_opf      = 0x18,
   12.37 +
   12.38      fadds_opf          = 0x41,
   12.39      faddd_opf          = 0x42,
   12.40      fsubs_opf          = 0x45,
   12.41      fsubd_opf          = 0x46,
   12.42  
   12.43 +    faligndata_opf     = 0x48,
   12.44 +
   12.45      fmuls_opf          = 0x49,
   12.46      fmuld_opf          = 0x4a,
   12.47      fdivs_opf          = 0x4d,
   12.48 @@ -348,6 +359,8 @@
   12.49      ASI_PRIMARY            = 0x80,
   12.50      ASI_PRIMARY_NOFAULT    = 0x82,
   12.51      ASI_PRIMARY_LITTLE     = 0x88,
   12.52 +    // 8x8-bit partial store
   12.53 +    ASI_PST8_PRIMARY       = 0xC0,
   12.54      // Block initializing store
   12.55      ASI_ST_BLKINIT_PRIMARY = 0xE2,
   12.56      // Most-Recently-Used (MRU) BIS variant
   12.57 @@ -585,6 +598,9 @@
   12.58    // instruction only in VIS1
   12.59    static void vis1_only() { assert( VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); }
   12.60  
   12.61 +  // instruction only in VIS2
   12.62 +  static void vis2_only() { assert( VM_Version::has_vis2(), "This instruction only works on SPARC with VIS2"); }
   12.63 +
   12.64    // instruction only in VIS3
   12.65    static void vis3_only() { assert( VM_Version::has_vis3(), "This instruction only works on SPARC with VIS3"); }
   12.66  
   12.67 @@ -1164,6 +1180,20 @@
   12.68    inline void wrfprs( Register d) { v9_only(); emit_int32( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); }
   12.69  
   12.70  
   12.71 +  //  VIS1 instructions
   12.72 +
   12.73 +  void alignaddr( Register s1, Register s2, Register d ) { vis1_only(); emit_int32( op(arith_op) | rd(d) | op3(alignaddr_op3) | rs1(s1) | opf(alignaddr_opf) | rs2(s2)); }
   12.74 +
   12.75 +  void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); }
   12.76 +
   12.77 +  void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); }
   12.78 +
   12.79 +  void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); }
   12.80 +
   12.81 +  //  VIS2 instructions
   12.82 +
   12.83 +  void edge8n( Register s1, Register s2, Register d ) { vis2_only(); emit_int32( op(arith_op) | rd(d) | op3(edge_op3) | rs1(s1) | opf(edge8n_opf) | rs2(s2)); }
   12.84 +
   12.85    // VIS3 instructions
   12.86  
   12.87    void movstosw( FloatRegister s, Register d ) { vis3_only();  emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstosw_opf) | fs2(s, FloatRegisterImpl::S)); }

    13.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Wed May 07 10:58:47 2014 -0700
    13.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu May 08 23:07:11 2014 -0700
    13.3 @@ -1,5 +1,5 @@
    13.4  /*
    13.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
    13.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
    13.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    13.8   *
    13.9   * This code is free software; you can redistribute it and/or modify it
   13.10 @@ -3305,9 +3305,12 @@
   13.11    }
   13.12  
   13.13    address generate_aescrypt_encryptBlock() {
   13.14 +    // required since we read expanded key 'int' array starting first element without alignment considerations
   13.15 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
   13.16 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
   13.17      __ align(CodeEntryAlignment);
   13.18 -    StubCodeMark mark(this, "StubRoutines", "aesencryptBlock");
   13.19 -    Label L_doLast128bit, L_storeOutput;
   13.20 +    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
   13.21 +    Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
   13.22      address start = __ pc();
   13.23      Register from = O0; // source byte array
   13.24      Register to = O1;   // destination byte array
   13.25 @@ -3317,15 +3320,33 @@
   13.26      // read expanded key length
   13.27      __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
   13.28  
   13.29 -    // load input into F54-F56; F30-F31 used as temp
   13.30 -    __ ldf(FloatRegisterImpl::S, from, 0, F30);
   13.31 -    __ ldf(FloatRegisterImpl::S, from, 4, F31);
   13.32 -    __ fmov(FloatRegisterImpl::D, F30, F54);
   13.33 -    __ ldf(FloatRegisterImpl::S, from, 8, F30);
   13.34 -    __ ldf(FloatRegisterImpl::S, from, 12, F31);
   13.35 -    __ fmov(FloatRegisterImpl::D, F30, F56);
   13.36 -
   13.37 -    // load expanded key
   13.38 +    // Method to address arbitrary alignment for load instructions:
   13.39 +    // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
   13.40 +    // If zero/aligned then continue with double FP load instructions
   13.41 +    // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
   13.42 +    // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
   13.43 +    // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
   13.44 +    // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
   13.45 +
   13.46 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   13.47 +    __ andcc(from, 7, G0);
   13.48 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
   13.49 +    __ delayed()->alignaddr(from, G0, from);
   13.50 +
   13.51 +    // aligned case: load input into F54-F56
   13.52 +    __ ldf(FloatRegisterImpl::D, from, 0, F54);
   13.53 +    __ ldf(FloatRegisterImpl::D, from, 8, F56);
   13.54 +    __ ba_short(L_load_expanded_key);
   13.55 +
   13.56 +    __ BIND(L_load_misaligned_input);
   13.57 +    __ ldf(FloatRegisterImpl::D, from, 0, F54);
   13.58 +    __ ldf(FloatRegisterImpl::D, from, 8, F56);
   13.59 +    __ ldf(FloatRegisterImpl::D, from, 16, F58);
   13.60 +    __ faligndata(F54, F56, F54);
   13.61 +    __ faligndata(F56, F58, F56);
   13.62 +
   13.63 +    __ BIND(L_load_expanded_key);
   13.64 +    // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed
   13.65      for ( int i = 0;  i <= 38; i += 2 ) {
   13.66        __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
   13.67      }
   13.68 @@ -3365,8 +3386,7 @@
   13.69      __ ldf(FloatRegisterImpl::D, key, 232, F50);
   13.70      __ aes_eround01(F52, F54, F56, F58); //round 13
   13.71      __ aes_eround23(F46, F54, F56, F60);
   13.72 -    __ br(Assembler::always, false, Assembler::pt, L_storeOutput);
   13.73 -    __ delayed()->nop();
   13.74 +    __ ba_short(L_storeOutput);
   13.75  
   13.76      __ BIND(L_doLast128bit);
   13.77      __ ldf(FloatRegisterImpl::D, key, 160, F48);
   13.78 @@ -3377,23 +3397,62 @@
   13.79      __ aes_eround01_l(F48, F58, F60, F54); //last round
   13.80      __ aes_eround23_l(F50, F58, F60, F56);
   13.81  
   13.82 -    // store output into the destination array, F0-F1 used as temp
   13.83 -    __ fmov(FloatRegisterImpl::D, F54, F0);
   13.84 -    __ stf(FloatRegisterImpl::S, F0, to, 0);
   13.85 -    __ stf(FloatRegisterImpl::S, F1, to, 4);
   13.86 -    __ fmov(FloatRegisterImpl::D, F56, F0);
   13.87 -    __ stf(FloatRegisterImpl::S, F0, to, 8);
   13.88 +    // Method to address arbitrary alignment for store instructions:
   13.89 +    // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
   13.90 +    // If zero/aligned then continue with double FP store instructions
   13.91 +    // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
   13.92 +    // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
   13.93 +    // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
   13.94 +    // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
   13.95 +    // Set GSR.align to (8-n) using alignaddr
   13.96 +    // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
   13.97 +    // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
   13.98 +    // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
   13.99 +    // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
  13.100 +    // We need to execute this process for both the 8-byte result values
  13.101 +
  13.102 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  13.103 +    __ andcc(to, 7, O5);
  13.104 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
  13.105 +    __ delayed()->edge8n(to, G0, O3);
  13.106 +
  13.107 +    // aligned case: store output into the destination array
  13.108 +    __ stf(FloatRegisterImpl::D, F54, to, 0);
  13.109      __ retl();
  13.110 -    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
  13.111 +    __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
  13.112 +
  13.113 +    __ BIND(L_store_misaligned_output);
  13.114 +    __ add(to, 8, O4);
  13.115 +    __ mov(8, O2);
  13.116 +    __ sub(O2, O5, O2);
  13.117 +    __ alignaddr(O2, G0, O2);
  13.118 +    __ faligndata(F54, F54, F54);
  13.119 +    __ faligndata(F56, F56, F56);
  13.120 +    __ and3(to, -8, to);
  13.121 +    __ and3(O4, -8, O4);
  13.122 +    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
  13.123 +    __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
  13.124 +    __ add(to, 8, to);
  13.125 +    __ add(O4, 8, O4);
  13.126 +    __ orn(G0, O3, O3);
  13.127 +    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
  13.128 +    __ retl();
  13.129 +    __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
  13.130  
  13.131      return start;
  13.132    }
  13.133  
  13.134    address generate_aescrypt_decryptBlock() {
  13.135 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
  13.136 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
  13.137 +    // required since we read original key 'byte' array as well in the decryption stubs
  13.138 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
  13.139 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
  13.140      __ align(CodeEntryAlignment);
  13.141 -    StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock");
  13.142 +    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
  13.143      address start = __ pc();
  13.144 -    Label L_expand192bit, L_expand256bit, L_common_transform;
  13.145 +    Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
  13.146 +    Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
  13.147      Register from = O0; // source byte array
  13.148      Register to = O1;   // destination byte array
  13.149      Register key = O2;  // expanded key array
  13.150 @@ -3403,15 +3462,29 @@
  13.151      // read expanded key array length
  13.152      __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
  13.153  
  13.154 -    // load input into F52-F54; F30,F31 used as temp
  13.155 -    __ ldf(FloatRegisterImpl::S, from, 0, F30);
  13.156 -    __ ldf(FloatRegisterImpl::S, from, 4, F31);
  13.157 -    __ fmov(FloatRegisterImpl::D, F30, F52);
  13.158 -    __ ldf(FloatRegisterImpl::S, from, 8, F30);
  13.159 -    __ ldf(FloatRegisterImpl::S, from, 12, F31);
  13.160 -    __ fmov(FloatRegisterImpl::D, F30, F54);
  13.161 -
  13.162 +    // save 'from' since we may need to recheck alignment in case of 256-bit decryption
  13.163 +    __ mov(from, G1);
  13.164 +
  13.165 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  13.166 +    __ andcc(from, 7, G0);
  13.167 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
  13.168 +    __ delayed()->alignaddr(from, G0, from);
  13.169 +
  13.170 +    // aligned case: load input into F52-F54
  13.171 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
  13.172 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
  13.173 +    __ ba_short(L_load_original_key);
  13.174 +
  13.175 +    __ BIND(L_load_misaligned_input);
  13.176 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
  13.177 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
  13.178 +    __ ldf(FloatRegisterImpl::D, from, 16, F56);
  13.179 +    __ faligndata(F52, F54, F52);
  13.180 +    __ faligndata(F54, F56, F54);
  13.181 +
  13.182 +    __ BIND(L_load_original_key);
  13.183      // load original key from SunJCE expanded decryption key
  13.184 +    // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
  13.185      for ( int i = 0;  i <= 3; i++ ) {
  13.186        __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
  13.187      }
  13.188 @@ -3432,8 +3505,7 @@
  13.189      // perform 128-bit key specific inverse cipher transformation
  13.190      __ fxor(FloatRegisterImpl::D, F42, F54, F54);
  13.191      __ fxor(FloatRegisterImpl::D, F40, F52, F52);
  13.192 -    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
  13.193 -    __ delayed()->nop();
  13.194 +    __ ba_short(L_common_transform);
  13.195  
  13.196      __ BIND(L_expand192bit);
  13.197  
  13.198 @@ -3457,8 +3529,7 @@
  13.199      __ aes_dround01(F44, F52, F54, F56);
  13.200      __ aes_dround23(F42, F56, F58, F54);
  13.201      __ aes_dround01(F40, F56, F58, F52);
  13.202 -    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
  13.203 -    __ delayed()->nop();
  13.204 +    __ ba_short(L_common_transform);
  13.205  
  13.206      __ BIND(L_expand256bit);
  13.207  
  13.208 @@ -3478,14 +3549,31 @@
  13.209      __ aes_kexpand2(F50, F56, F58);
  13.210  
  13.211      for ( int i = 0;  i <= 6; i += 2 ) {
  13.212 -      __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
  13.213 +      __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
  13.214      }
  13.215  
  13.216 -    // load input into F52-F54
  13.217 +    // reload original 'from' address
  13.218 +    __ mov(G1, from);
  13.219 +
  13.220 +    // re-check 8-byte alignment
  13.221 +    __ andcc(from, 7, G0);
  13.222 +    __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
  13.223 +    __ delayed()->alignaddr(from, G0, from);
  13.224 +
  13.225 +    // aligned case: load input into F52-F54
  13.226      __ ldf(FloatRegisterImpl::D, from, 0, F52);
  13.227      __ ldf(FloatRegisterImpl::D, from, 8, F54);
  13.228 +    __ ba_short(L_256bit_transform);
  13.229 +
  13.230 +    __ BIND(L_reload_misaligned_input);
  13.231 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
  13.232 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
  13.233 +    __ ldf(FloatRegisterImpl::D, from, 16, F56);
  13.234 +    __ faligndata(F52, F54, F52);
  13.235 +    __ faligndata(F54, F56, F54);
  13.236  
  13.237      // perform 256-bit key specific inverse cipher transformation
  13.238 +    __ BIND(L_256bit_transform);
  13.239      __ fxor(FloatRegisterImpl::D, F0, F54, F54);
  13.240      __ fxor(FloatRegisterImpl::D, F2, F52, F52);
  13.241      __ aes_dround23(F4, F52, F54, F58);
  13.242 @@ -3515,43 +3603,71 @@
  13.243        }
  13.244      }
  13.245  
  13.246 -    // store output to destination array, F0-F1 used as temp
  13.247 -    __ fmov(FloatRegisterImpl::D, F52, F0);
  13.248 -    __ stf(FloatRegisterImpl::S, F0, to, 0);
  13.249 -    __ stf(FloatRegisterImpl::S, F1, to, 4);
  13.250 -    __ fmov(FloatRegisterImpl::D, F54, F0);
  13.251 -    __ stf(FloatRegisterImpl::S, F0, to, 8);
  13.252 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  13.253 +    __ andcc(to, 7, O5);
  13.254 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
  13.255 +    __ delayed()->edge8n(to, G0, O3);
  13.256 +
  13.257 +    // aligned case: store output into the destination array
  13.258 +    __ stf(FloatRegisterImpl::D, F52, to, 0);
  13.259      __ retl();
  13.260 -    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
  13.261 +    __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
  13.262 +
  13.263 +    __ BIND(L_store_misaligned_output);
  13.264 +    __ add(to, 8, O4);
  13.265 +    __ mov(8, O2);
  13.266 +    __ sub(O2, O5, O2);
  13.267 +    __ alignaddr(O2, G0, O2);
  13.268 +    __ faligndata(F52, F52, F52);
  13.269 +    __ faligndata(F54, F54, F54);
  13.270 +    __ and3(to, -8, to);
  13.271 +    __ and3(O4, -8, O4);
  13.272 +    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
  13.273 +    __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
  13.274 +    __ add(to, 8, to);
  13.275 +    __ add(O4, 8, O4);
  13.276 +    __ orn(G0, O3, O3);
  13.277 +    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
  13.278 +    __ retl();
  13.279 +    __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
  13.280  
  13.281      return start;
  13.282    }
  13.283  
  13.284    address generate_cipherBlockChaining_encryptAESCrypt() {
  13.285 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
  13.286 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
  13.287 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
  13.288 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
  13.289      __ align(CodeEntryAlignment);
  13.290      StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
  13.291 -    Label L_cbcenc128, L_cbcenc192, L_cbcenc256;
  13.292 +    Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
  13.293 +    Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
  13.294 +    Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
  13.295 +    Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
  13.296      address start = __ pc();
  13.297 -    Register from = O0; // source byte array
  13.298 -    Register to = O1;   // destination byte array
  13.299 -    Register key = O2;  // expanded key array
  13.300 -    Register rvec = O3; // init vector
  13.301 -    const Register len_reg = O4; // cipher length
  13.302 -    const Register keylen = O5;  // reg for storing expanded key array length
  13.303 -
  13.304 -    // save cipher len to return in the end
  13.305 -    __ mov(len_reg, L1);
  13.306 +    Register from = I0; // source byte array
  13.307 +    Register to = I1;   // destination byte array
  13.308 +    Register key = I2;  // expanded key array
  13.309 +    Register rvec = I3; // init vector
  13.310 +    const Register len_reg = I4; // cipher length
  13.311 +    const Register keylen = I5;  // reg for storing expanded key array length
  13.312 +
  13.313 +    // save cipher len before save_frame, to return in the end
  13.314 +    __ mov(O4, L0);
  13.315 +    __ save_frame(0);
  13.316  
  13.317      // read expanded key length
  13.318      __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
  13.319  
  13.320 -    // load init vector
  13.321 +    // load initial vector, 8-byte alignment is guaranteed
  13.322      __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
  13.323      __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
  13.324 +    // load key, 8-byte alignment is guaranteed
  13.325      __ ldx(key,0,G1);
  13.326 -    __ ldx(key,8,G2);
  13.327 -
  13.328 -    // start loading expanded key
  13.329 +    __ ldx(key,8,G5);
  13.330 +
  13.331 +    // start loading expanded key, 8-byte alignment is guaranteed
  13.332      for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
  13.333        __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
  13.334      }
  13.335 @@ -3571,15 +3687,35 @@
  13.336      }
  13.337  
  13.338      // 256-bit original key size
  13.339 -    __ br(Assembler::always, false, Assembler::pt, L_cbcenc256);
  13.340 -    __ delayed()->nop();
  13.341 +    __ ba_short(L_cbcenc256);
  13.342  
  13.343      __ align(OptoLoopAlignment);
  13.344      __ BIND(L_cbcenc128);
  13.345 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  13.346 +    __ andcc(from, 7, G0);
  13.347 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
  13.348 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
  13.349 +
  13.350 +    // aligned case: load input into G3 and G4
  13.351      __ ldx(from,0,G3);
  13.352      __ ldx(from,8,G4);
  13.353 +    __ ba_short(L_128bit_transform);
  13.354 +
  13.355 +    __ BIND(L_load_misaligned_input_128bit);
  13.356 +    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
  13.357 +    __ alignaddr(from, G0, from);
  13.358 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
  13.359 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
  13.360 +    __ ldf(FloatRegisterImpl::D, from, 16, F52);
  13.361 +    __ faligndata(F48, F50, F48);
  13.362 +    __ faligndata(F50, F52, F50);
  13.363 +    __ movdtox(F48, G3);
  13.364 +    __ movdtox(F50, G4);
  13.365 +    __ mov(L1, from);
  13.366 +
  13.367 +    __ BIND(L_128bit_transform);
  13.368      __ xor3(G1,G3,G3);
  13.369 -    __ xor3(G2,G4,G4);
  13.370 +    __ xor3(G5,G4,G4);
  13.371      __ movxtod(G3,F56);
  13.372      __ movxtod(G4,F58);
  13.373      __ fxor(FloatRegisterImpl::D, F60, F56, F60);
  13.374 @@ -3598,24 +3734,81 @@
  13.375        }
  13.376      }
  13.377  
  13.378 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  13.379 +    __ andcc(to, 7, L1);
  13.380 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
  13.381 +    __ delayed()->edge8n(to, G0, L2);
  13.382 +
  13.383 +    // aligned case: store output into the destination array
  13.384      __ stf(FloatRegisterImpl::D, F60, to, 0);
  13.385      __ stf(FloatRegisterImpl::D, F62, to, 8);
  13.386 +    __ ba_short(L_check_loop_end_128bit);
  13.387 +
  13.388 +    __ BIND(L_store_misaligned_output_128bit);
  13.389 +    __ add(to, 8, L3);
  13.390 +    __ mov(8, L4);
  13.391 +    __ sub(L4, L1, L4);
  13.392 +    __ alignaddr(L4, G0, L4);
  13.393 +    // save cipher text before circular right shift
  13.394 +    // as it needs to be stored as iv for next block (see code before next retl)
  13.395 +    __ movdtox(F60, L6);
  13.396 +    __ movdtox(F62, L7);
  13.397 +    __ faligndata(F60, F60, F60);
  13.398 +    __ faligndata(F62, F62, F62);
  13.399 +    __ mov(to, L5);
  13.400 +    __ and3(to, -8, to);
  13.401 +    __ and3(L3, -8, L3);
  13.402 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  13.403 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  13.404 +    __ add(to, 8, to);
  13.405 +    __ add(L3, 8, L3);
  13.406 +    __ orn(G0, L2, L2);
  13.407 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  13.408 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  13.409 +    __ mov(L5, to);
  13.410 +    __ movxtod(L6, F60);
  13.411 +    __ movxtod(L7, F62);
  13.412 +
  13.413 +    __ BIND(L_check_loop_end_128bit);
  13.414      __ add(from, 16, from);
  13.415      __ add(to, 16, to);
  13.416      __ subcc(len_reg, 16, len_reg);
  13.417      __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
  13.418      __ delayed()->nop();
  13.419 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
  13.420      __ stf(FloatRegisterImpl::D, F60, rvec, 0);
  13.421      __ stf(FloatRegisterImpl::D, F62, rvec, 8);
  13.422 +    __ restore();
  13.423      __ retl();
  13.424 -    __ delayed()->mov(L1, O0);
  13.425 +    __ delayed()->mov(L0, O0);
  13.426  
  13.427      __ align(OptoLoopAlignment);
  13.428      __ BIND(L_cbcenc192);
  13.429 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  13.430 +    __ andcc(from, 7, G0);
  13.431 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
  13.432 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
  13.433 +
  13.434 +    // aligned case: load input into G3 and G4
  13.435      __ ldx(from,0,G3);
  13.436      __ ldx(from,8,G4);
  13.437 +    __ ba_short(L_192bit_transform);
  13.438 +
  13.439 +    __ BIND(L_load_misaligned_input_192bit);
  13.440 +    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
  13.441 +    __ alignaddr(from, G0, from);
  13.442 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
  13.443 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
  13.444 +    __ ldf(FloatRegisterImpl::D, from, 16, F52);
  13.445 +    __ faligndata(F48, F50, F48);
  13.446 +    __ faligndata(F50, F52, F50);
  13.447 +    __ movdtox(F48, G3);
  13.448 +    __ movdtox(F50, G4);
  13.449 +    __ mov(L1, from);
  13.450 +
  13.451 +    __ BIND(L_192bit_transform);
  13.452      __ xor3(G1,G3,G3);
  13.453 -    __ xor3(G2,G4,G4);
  13.454 +    __ xor3(G5,G4,G4);
  13.455      __ movxtod(G3,F56);
  13.456      __ movxtod(G4,F58);
  13.457      __ fxor(FloatRegisterImpl::D, F60, F56, F60);
  13.458 @@ -3634,24 +3827,81 @@
  13.459        }
  13.460      }
  13.461  
  13.462 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  13.463 +    __ andcc(to, 7, L1);
  13.464 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
  13.465 +    __ delayed()->edge8n(to, G0, L2);
  13.466 +
  13.467 +    // aligned case: store output into the destination array
  13.468      __ stf(FloatRegisterImpl::D, F60, to, 0);
  13.469      __ stf(FloatRegisterImpl::D, F62, to, 8);
  13.470 +    __ ba_short(L_check_loop_end_192bit);
  13.471 +
  13.472 +    __ BIND(L_store_misaligned_output_192bit);
  13.473 +    __ add(to, 8, L3);
  13.474 +    __ mov(8, L4);
  13.475 +    __ sub(L4, L1, L4);
  13.476 +    __ alignaddr(L4, G0, L4);
  13.477 +    __ movdtox(F60, L6);
  13.478 +    __ movdtox(F62, L7);
  13.479 +    __ faligndata(F60, F60, F60);
  13.480 +    __ faligndata(F62, F62, F62);
  13.481 +    __ mov(to, L5);
  13.482 +    __ and3(to, -8, to);
  13.483 +    __ and3(L3, -8, L3);
  13.484 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  13.485 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  13.486 +    __ add(to, 8, to);
  13.487 +    __ add(L3, 8, L3);
  13.488 +    __ orn(G0, L2, L2);
  13.489 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  13.490 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  13.491 +    __ mov(L5, to);
  13.492 +    __ movxtod(L6, F60);
  13.493 +    __ movxtod(L7, F62);
  13.494 +
  13.495 +    __ BIND(L_check_loop_end_192bit);
  13.496      __ add(from, 16, from);
  13.497      __ subcc(len_reg, 16, len_reg);
  13.498      __ add(to, 16, to);
  13.499      __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
  13.500      __ delayed()->nop();
  13.501 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
  13.502      __ stf(FloatRegisterImpl::D, F60, rvec, 0);
  13.503      __ stf(FloatRegisterImpl::D, F62, rvec, 8);
  13.504 +    __ restore();
  13.505      __ retl();
  13.506 -    __ delayed()->mov(L1, O0);
  13.507 +    __ delayed()->mov(L0, O0);
  13.508  
  13.509      __ align(OptoLoopAlignment);
  13.510      __ BIND(L_cbcenc256);
  13.511 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  13.512 +    __ andcc(from, 7, G0);
  13.513 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
  13.514 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
  13.515 +
  13.516 +    // aligned case: load input into G3 and G4
  13.517      __ ldx(from,0,G3);
  13.518      __ ldx(from,8,G4);
  13.519 +    __ ba_short(L_256bit_transform);
  13.520 +
  13.521 +    __ BIND(L_load_misaligned_input_256bit);
  13.522 +    // cannot clobber F48, F50 and F52. F56, F58 can be used though
  13.523 +    __ alignaddr(from, G0, from);
  13.524 +    __ movdtox(F60, L2); // save F60 before overwriting
  13.525 +    __ ldf(FloatRegisterImpl::D, from, 0, F56);
  13.526 +    __ ldf(FloatRegisterImpl::D, from, 8, F58);
  13.527 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
  13.528 +    __ faligndata(F56, F58, F56);
  13.529 +    __ faligndata(F58, F60, F58);
  13.530 +    __ movdtox(F56, G3);
  13.531 +    __ movdtox(F58, G4);
  13.532 +    __ mov(L1, from);
  13.533 +    __ movxtod(L2, F60);
  13.534 +
  13.535 +    __ BIND(L_256bit_transform);
  13.536      __ xor3(G1,G3,G3);
  13.537 -    __ xor3(G2,G4,G4);
  13.538 +    __ xor3(G5,G4,G4);
  13.539      __ movxtod(G3,F56);
  13.540      __ movxtod(G4,F58);
  13.541      __ fxor(FloatRegisterImpl::D, F60, F56, F60);
  13.542 @@ -3670,26 +3920,69 @@
  13.543        }
  13.544      }
  13.545  
  13.546 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  13.547 +    __ andcc(to, 7, L1);
  13.548 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
  13.549 +    __ delayed()->edge8n(to, G0, L2);
  13.550 +
  13.551 +    // aligned case: store output into the destination array
  13.552      __ stf(FloatRegisterImpl::D, F60, to, 0);
  13.553      __ stf(FloatRegisterImpl::D, F62, to, 8);
  13.554 +    __ ba_short(L_check_loop_end_256bit);
  13.555 +
  13.556 +    __ BIND(L_store_misaligned_output_256bit);
  13.557 +    __ add(to, 8, L3);
  13.558 +    __ mov(8, L4);
  13.559 +    __ sub(L4, L1, L4);
  13.560 +    __ alignaddr(L4, G0, L4);
  13.561 +    __ movdtox(F60, L6);
  13.562 +    __ movdtox(F62, L7);
  13.563 +    __ faligndata(F60, F60, F60);
  13.564 +    __ faligndata(F62, F62, F62);
  13.565 +    __ mov(to, L5);
  13.566 +    __ and3(to, -8, to);
  13.567 +    __ and3(L3, -8, L3);
  13.568 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  13.569 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  13.570 +    __ add(to, 8, to);
  13.571 +    __ add(L3, 8, L3);
  13.572 +    __ orn(G0, L2, L2);
  13.573 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
  13.574 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
  13.575 +    __ mov(L5, to);
  13.576 +    __ movxtod(L6, F60);
  13.577 +    __ movxtod(L7, F62);
  13.578 +
  13.579 +    __ BIND(L_check_loop_end_256bit);
  13.580      __ add(from, 16, from);
  13.581      __ subcc(len_reg, 16, len_reg);
  13.582      __ add(to, 16, to);
  13.583      __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
  13.584      __ delayed()->nop();
  13.585 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
  13.586      __ stf(FloatRegisterImpl::D, F60, rvec, 0);
  13.587      __ stf(FloatRegisterImpl::D, F62, rvec, 8);
  13.588 +    __ restore();
  13.589      __ retl();
  13.590 -    __ delayed()->mov(L1, O0);
  13.591 +    __ delayed()->mov(L0, O0);
  13.592  
  13.593      return start;
  13.594    }
  13.595  
  13.596    address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
  13.597 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
  13.598 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
  13.599 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
  13.600 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
  13.601      __ align(CodeEntryAlignment);
  13.602      StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
  13.603      Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
  13.604      Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
  13.605 +    Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
  13.606 +    Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
  13.607 +    Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
  13.608 +    Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
  13.609 +    Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
  13.610      address start = __ pc();
  13.611      Register from = I0; // source byte array
  13.612      Register to = I1;   // destination byte array
  13.613 @@ -3704,11 +3997,12 @@
  13.614      __ save_frame(0); //args are read from I* registers since we save the frame in the beginning
  13.615  
  13.616      // load original key from SunJCE expanded decryption key
  13.617 +    // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
  13.618      for ( int i = 0;  i <= 3; i++ ) {
  13.619        __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
  13.620      }
  13.621  
  13.622 -    // load initial vector
  13.623 +    // load initial vector, 8-byte alignment is guaranteed
  13.624      __ ldx(rvec,0,L0);
  13.625      __ ldx(rvec,8,L1);
  13.626  
  13.627 @@ -3733,11 +4027,10 @@
  13.628      __ movdtox(F42,L3);
  13.629  
  13.630      __ and3(len_reg, 16, L4);
  13.631 -    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128);
  13.632 -    __ delayed()->nop();
  13.633 -
  13.634 -    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
  13.635 -    __ delayed()->nop();
  13.636 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
  13.637 +    __ nop();
  13.638 +
  13.639 +    __ ba_short(L_dec_first_block_start);
  13.640  
  13.641      __ BIND(L_expand192bit);
  13.642      // load rest of the 192-bit key
  13.643 @@ -3758,11 +4051,10 @@
  13.644      __ movdtox(F50,L3);
  13.645  
  13.646      __ and3(len_reg, 16, L4);
  13.647 -    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192);
  13.648 -    __ delayed()->nop();
  13.649 -
  13.650 -    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
  13.651 -    __ delayed()->nop();
  13.652 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
  13.653 +    __ nop();
  13.654 +
  13.655 +    __ ba_short(L_dec_first_block_start);
  13.656  
  13.657      __ BIND(L_expand256bit);
  13.658      // load rest of the 256-bit key
  13.659 @@ -3785,12 +4077,32 @@
  13.660      __ movdtox(F58,L3);
  13.661  
  13.662      __ and3(len_reg, 16, L4);
  13.663 -    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256);
  13.664 -    __ delayed()->nop();
  13.665 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
  13.666  
  13.667      __ BIND(L_dec_first_block_start);
  13.668 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  13.669 +    __ andcc(from, 7, G0);
  13.670 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
  13.671 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
  13.672 +
  13.673 +    // aligned case: load input into L4 and L5
  13.674      __ ldx(from,0,L4);
  13.675      __ ldx(from,8,L5);
  13.676 +    __ ba_short(L_transform_first_block);
  13.677 +
  13.678 +    __ BIND(L_load_misaligned_input_first_block);
  13.679 +    __ alignaddr(from, G0, from);
  13.680 +    // F58, F60, F62 can be clobbered
  13.681 +    __ ldf(FloatRegisterImpl::D, from, 0, F58);
  13.682 +    __ ldf(FloatRegisterImpl::D, from, 8, F60);
  13.683 +    __ ldf(FloatRegisterImpl::D, from, 16, F62);
  13.684 +    __ faligndata(F58, F60, F58);
  13.685 +    __ faligndata(F60, F62, F60);
  13.686 +    __ movdtox(F58, L4);
  13.687 +    __ movdtox(F60, L5);
  13.688 +    __ mov(G1, from);
  13.689 +
  13.690 +    __ BIND(L_transform_first_block);
  13.691      __ xor3(L2,L4,G1);
  13.692      __ movxtod(G1,F60);
  13.693      __ xor3(L3,L5,G1);
  13.694 @@ -3833,9 +4145,36 @@
  13.695      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
  13.696      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
  13.697  
  13.698 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  13.699 +    __ andcc(to, 7, G1);
  13.700 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
  13.701 +    __ delayed()->edge8n(to, G0, G2);
  13.702 +
  13.703 +    // aligned case: store output into the destination array
  13.704      __ stf(FloatRegisterImpl::D, F60, to, 0);
  13.705      __ stf(FloatRegisterImpl::D, F62, to, 8);
  13.706 -
  13.707 +    __ ba_short(L_check_decrypt_end);
  13.708 +
  13.709 +    __ BIND(L_store_misaligned_output_first_block);
  13.710 +    __ add(to, 8, G3);
  13.711 +    __ mov(8, G4);
  13.712 +    __ sub(G4, G1, G4);
  13.713 +    __ alignaddr(G4, G0, G4);
  13.714 +    __ faligndata(F60, F60, F60);
  13.715 +    __ faligndata(F62, F62, F62);
  13.716 +    __ mov(to, G1);
  13.717 +    __ and3(to, -8, to);
  13.718 +    __ and3(G3, -8, G3);
  13.719 +    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
  13.720 +    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
  13.721 +    __ add(to, 8, to);
  13.722 +    __ add(G3, 8, G3);
  13.723 +    __ orn(G0, G2, G2);
  13.724 +    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
  13.725 +    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
  13.726 +    __ mov(G1, to);
  13.727 +
  13.728 +    __ BIND(L_check_decrypt_end);
  13.729      __ add(from, 16, from);
  13.730      __ add(to, 16, to);
  13.731      __ subcc(len_reg, 16, len_reg);
  13.732 @@ -3852,17 +4191,44 @@
  13.733      __ BIND(L_dec_next2_blocks128);
  13.734      __ nop();
  13.735  
  13.736 -    // F40:F42 used for first 16-bytes
  13.737 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  13.738 +    __ andcc(from, 7, G0);
  13.739 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
  13.740 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
  13.741 +
  13.742 +    // aligned case: load input into G4, G5, L4 and L5
  13.743      __ ldx(from,0,G4);
  13.744      __ ldx(from,8,G5);
  13.745 +    __ ldx(from,16,L4);
  13.746 +    __ ldx(from,24,L5);
  13.747 +    __ ba_short(L_transform_next2_blocks128);
  13.748 +
  13.749 +    __ BIND(L_load_misaligned_next2_blocks128);
  13.750 +    __ alignaddr(from, G0, from);
  13.751 +    // F40, F42, F58, F60, F62 can be clobbered
  13.752 +    __ ldf(FloatRegisterImpl::D, from, 0, F40);
  13.753 +    __ ldf(FloatRegisterImpl::D, from, 8, F42);
  13.754 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
  13.755 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
  13.756 +    __ ldf(FloatRegisterImpl::D, from, 32, F58);
  13.757 +    __ faligndata(F40, F42, F40);
  13.758 +    __ faligndata(F42, F60, F42);
  13.759 +    __ faligndata(F60, F62, F60);
  13.760 +    __ faligndata(F62, F58, F62);
  13.761 +    __ movdtox(F40, G4);
  13.762 +    __ movdtox(F42, G5);
  13.763 +    __ movdtox(F60, L4);
  13.764 +    __ movdtox(F62, L5);
  13.765 +    __ mov(G1, from);
  13.766 +
  13.767 +    __ BIND(L_transform_next2_blocks128);
  13.768 +    // F40:F42 used for first 16-bytes
  13.769      __ xor3(L2,G4,G1);
  13.770      __ movxtod(G1,F40);
  13.771      __ xor3(L3,G5,G1);
  13.772      __ movxtod(G1,F42);
  13.773  
  13.774      // F60:F62 used for next 16-bytes
  13.775 -    __ ldx(from,16,L4);
  13.776 -    __ ldx(from,24,L5);
  13.777      __ xor3(L2,L4,G1);
  13.778      __ movxtod(G1,F60);
  13.779      __ xor3(L3,L5,G1);
  13.780 @@ -3891,9 +4257,6 @@
  13.781      __ fxor(FloatRegisterImpl::D, F46, F40, F40);
  13.782      __ fxor(FloatRegisterImpl::D, F44, F42, F42);
  13.783  
  13.784 -    __ stf(FloatRegisterImpl::D, F40, to, 0);
  13.785 -    __ stf(FloatRegisterImpl::D, F42, to, 8);
  13.786 -
  13.787      __ movxtod(G4,F56);
  13.788      __ movxtod(G5,F58);
  13.789      __ mov(L4,L0);
  13.790 @@ -3901,32 +4264,93 @@
  13.791      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
  13.792      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
  13.793  
  13.794 +    // For mis-aligned store of 32 bytes of result we can do:
  13.795 +    // Circular right-shift all 4 FP registers so that 'head' and 'tail'
  13.796 +    // parts that need to be stored starting at mis-aligned address are in a FP reg
  13.797 +    // the other 3 FP regs can thus be stored using regular store
  13.798 +    // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts
  13.799 +
  13.800 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  13.801 +    __ andcc(to, 7, G1);
  13.802 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
  13.803 +    __ delayed()->edge8n(to, G0, G2);
  13.804 +
  13.805 +    // aligned case: store output into the destination array
  13.806 +    __ stf(FloatRegisterImpl::D, F40, to, 0);
  13.807 +    __ stf(FloatRegisterImpl::D, F42, to, 8);
  13.808      __ stf(FloatRegisterImpl::D, F60, to, 16);
  13.809      __ stf(FloatRegisterImpl::D, F62, to, 24);
  13.810 -
  13.811 +    __ ba_short(L_check_decrypt_loop_end128);
  13.812 +
  13.813 +    __ BIND(L_store_misaligned_output_next2_blocks128);
  13.814 +    __ mov(8, G4);
  13.815 +    __ sub(G4, G1, G4);
  13.816 +    __ alignaddr(G4, G0, G4);
  13.817 +    __ faligndata(F40, F42, F56); // F56 can be clobbered
  13.818 +    __ faligndata(F42, F60, F42);
  13.819 +    __ faligndata(F60, F62, F60);
  13.820 +    __ faligndata(F62, F40, F40);
  13.821 +    __ mov(to, G1);
  13.822 +    __ and3(to, -8, to);
  13.823 +    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
  13.824 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
  13.825 +    __ stf(FloatRegisterImpl::D, F42, to, 16);
  13.826 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
  13.827 +    __ add(to, 32, to);
  13.828 +    __ orn(G0, G2, G2);
  13.829 +    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
  13.830 +    __ mov(G1, to);
  13.831 +
  13.832 +    __ BIND(L_check_decrypt_loop_end128);
  13.833      __ add(from, 32, from);
  13.834      __ add(to, 32, to);
  13.835      __ subcc(len_reg, 32, len_reg);
  13.836      __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
  13.837      __ delayed()->nop();
  13.838 -    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
  13.839 -    __ delayed()->nop();
  13.840 +    __ ba_short(L_cbcdec_end);
  13.841  
  13.842      __ align(OptoLoopAlignment);
  13.843      __ BIND(L_dec_next2_blocks192);
  13.844      __ nop();
  13.845  
  13.846 -    // F48:F50 used for first 16-bytes
  13.847 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  13.848 +    __ andcc(from, 7, G0);
  13.849 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
  13.850 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
  13.851 +
  13.852 +    // aligned case: load input into G4, G5, L4 and L5
  13.853      __ ldx(from,0,G4);
  13.854      __ ldx(from,8,G5);
  13.855 +    __ ldx(from,16,L4);
  13.856 +    __ ldx(from,24,L5);
  13.857 +    __ ba_short(L_transform_next2_blocks192);
  13.858 +
  13.859 +    __ BIND(L_load_misaligned_next2_blocks192);
  13.860 +    __ alignaddr(from, G0, from);
  13.861 +    // F48, F50, F52, F60, F62 can be clobbered
  13.862 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
  13.863 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
  13.864 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
  13.865 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
  13.866 +    __ ldf(FloatRegisterImpl::D, from, 32, F52);
  13.867 +    __ faligndata(F48, F50, F48);
  13.868 +    __ faligndata(F50, F60, F50);
  13.869 +    __ faligndata(F60, F62, F60);
  13.870 +    __ faligndata(F62, F52, F62);
  13.871 +    __ movdtox(F48, G4);
  13.872 +    __ movdtox(F50, G5);
  13.873 +    __ movdtox(F60, L4);
  13.874 +    __ movdtox(F62, L5);
  13.875 +    __ mov(G1, from);
  13.876 +
  13.877 +    __ BIND(L_transform_next2_blocks192);
  13.878 +    // F48:F50 used for first 16-bytes
  13.879      __ xor3(L2,G4,G1);
  13.880      __ movxtod(G1,F48);
  13.881      __ xor3(L3,G5,G1);
  13.882      __ movxtod(G1,F50);
  13.883  
  13.884      // F60:F62 used for next 16-bytes
  13.885 -    __ ldx(from,16,L4);
  13.886 -    __ ldx(from,24,L5);
  13.887      __ xor3(L2,L4,G1);
  13.888      __ movxtod(G1,F60);
  13.889      __ xor3(L3,L5,G1);
  13.890 @@ -3955,9 +4379,6 @@
  13.891      __ fxor(FloatRegisterImpl::D, F54, F48, F48);
  13.892      __ fxor(FloatRegisterImpl::D, F52, F50, F50);
  13.893  
  13.894 -    __ stf(FloatRegisterImpl::D, F48, to, 0);
  13.895 -    __ stf(FloatRegisterImpl::D, F50, to, 8);
  13.896 -
  13.897      __ movxtod(G4,F56);
  13.898      __ movxtod(G5,F58);
  13.899      __ mov(L4,L0);
  13.900 @@ -3965,32 +4386,87 @@
  13.901      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
  13.902      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
  13.903  
  13.904 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  13.905 +    __ andcc(to, 7, G1);
  13.906 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
  13.907 +    __ delayed()->edge8n(to, G0, G2);
  13.908 +
  13.909 +    // aligned case: store output into the destination array
  13.910 +    __ stf(FloatRegisterImpl::D, F48, to, 0);
  13.911 +    __ stf(FloatRegisterImpl::D, F50, to, 8);
  13.912      __ stf(FloatRegisterImpl::D, F60, to, 16);
  13.913      __ stf(FloatRegisterImpl::D, F62, to, 24);
  13.914 -
  13.915 +    __ ba_short(L_check_decrypt_loop_end192);
  13.916 +
  13.917 +    __ BIND(L_store_misaligned_output_next2_blocks192);
  13.918 +    __ mov(8, G4);
  13.919 +    __ sub(G4, G1, G4);
  13.920 +    __ alignaddr(G4, G0, G4);
  13.921 +    __ faligndata(F48, F50, F56); // F56 can be clobbered
  13.922 +    __ faligndata(F50, F60, F50);
  13.923 +    __ faligndata(F60, F62, F60);
  13.924 +    __ faligndata(F62, F48, F48);
  13.925 +    __ mov(to, G1);
  13.926 +    __ and3(to, -8, to);
  13.927 +    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
  13.928 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
  13.929 +    __ stf(FloatRegisterImpl::D, F50, to, 16);
  13.930 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
  13.931 +    __ add(to, 32, to);
  13.932 +    __ orn(G0, G2, G2);
  13.933 +    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
  13.934 +    __ mov(G1, to);
  13.935 +
  13.936 +    __ BIND(L_check_decrypt_loop_end192);
  13.937      __ add(from, 32, from);
  13.938      __ add(to, 32, to);
  13.939      __ subcc(len_reg, 32, len_reg);
  13.940      __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
  13.941      __ delayed()->nop();
  13.942 -    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
  13.943 -    __ delayed()->nop();
  13.944 +    __ ba_short(L_cbcdec_end);
  13.945  
  13.946      __ align(OptoLoopAlignment);
  13.947      __ BIND(L_dec_next2_blocks256);
  13.948      __ nop();
  13.949  
  13.950 -    // F0:F2 used for first 16-bytes
  13.951 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
  13.952 +    __ andcc(from, 7, G0);
  13.953 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
  13.954 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
  13.955 +
  13.956 +    // aligned case: load input into G4, G5, L4 and L5
  13.957      __ ldx(from,0,G4);
  13.958      __ ldx(from,8,G5);
  13.959 +    __ ldx(from,16,L4);
  13.960 +    __ ldx(from,24,L5);
  13.961 +    __ ba_short(L_transform_next2_blocks256);
  13.962 +
  13.963 +    __ BIND(L_load_misaligned_next2_blocks256);
  13.964 +    __ alignaddr(from, G0, from);
  13.965 +    // F0, F2, F4, F60, F62 can be clobbered
  13.966 +    __ ldf(FloatRegisterImpl::D, from, 0, F0);
  13.967 +    __ ldf(FloatRegisterImpl::D, from, 8, F2);
  13.968 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
  13.969 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
  13.970 +    __ ldf(FloatRegisterImpl::D, from, 32, F4);
  13.971 +    __ faligndata(F0, F2, F0);
  13.972 +    __ faligndata(F2, F60, F2);
  13.973 +    __ faligndata(F60, F62, F60);
  13.974 +    __ faligndata(F62, F4, F62);
  13.975 +    __ movdtox(F0, G4);
  13.976 +    __ movdtox(F2, G5);
  13.977 +    __ movdtox(F60, L4);
  13.978 +    __ movdtox(F62, L5);
  13.979 +    __ mov(G1, from);
  13.980 +
  13.981 +    __ BIND(L_transform_next2_blocks256);
  13.982 +    // F0:F2 used for first 16-bytes
  13.983      __ xor3(L2,G4,G1);
  13.984      __ movxtod(G1,F0);
  13.985      __ xor3(L3,G5,G1);
  13.986      __ movxtod(G1,F2);
  13.987  
  13.988      // F60:F62 used for next 16-bytes
  13.989 -    __ ldx(from,16,L4);
  13.990 -    __ ldx(from,24,L5);
  13.991      __ xor3(L2,L4,G1);
  13.992      __ movxtod(G1,F60);
  13.993      __ xor3(L3,L5,G1);
  13.994 @@ -4043,9 +4519,6 @@
  13.995      __ fxor(FloatRegisterImpl::D, F6, F0, F0);
  13.996      __ fxor(FloatRegisterImpl::D, F4, F2, F2);
  13.997  
  13.998 -    __ stf(FloatRegisterImpl::D, F0, to, 0);
  13.999 -    __ stf(FloatRegisterImpl::D, F2, to, 8);
 13.1000 -
 13.1001      __ movxtod(G4,F56);
 13.1002      __ movxtod(G5,F58);
 13.1003      __ mov(L4,L0);
 13.1004 @@ -4053,9 +4526,38 @@
 13.1005      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
 13.1006      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
 13.1007  
 13.1008 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
 13.1009 +    __ andcc(to, 7, G1);
 13.1010 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
 13.1011 +    __ delayed()->edge8n(to, G0, G2);
 13.1012 +
 13.1013 +    // aligned case: store output into the destination array
 13.1014 +    __ stf(FloatRegisterImpl::D, F0, to, 0);
 13.1015 +    __ stf(FloatRegisterImpl::D, F2, to, 8);
 13.1016      __ stf(FloatRegisterImpl::D, F60, to, 16);
 13.1017      __ stf(FloatRegisterImpl::D, F62, to, 24);
 13.1018 -
 13.1019 +    __ ba_short(L_check_decrypt_loop_end256);
 13.1020 +
 13.1021 +    __ BIND(L_store_misaligned_output_next2_blocks256);
 13.1022 +    __ mov(8, G4);
 13.1023 +    __ sub(G4, G1, G4);
 13.1024 +    __ alignaddr(G4, G0, G4);
 13.1025 +    __ faligndata(F0, F2, F56); // F56 can be clobbered
 13.1026 +    __ faligndata(F2, F60, F2);
 13.1027 +    __ faligndata(F60, F62, F60);
 13.1028 +    __ faligndata(F62, F0, F0);
 13.1029 +    __ mov(to, G1);
 13.1030 +    __ and3(to, -8, to);
 13.1031 +    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
 13.1032 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
 13.1033 +    __ stf(FloatRegisterImpl::D, F2, to, 16);
 13.1034 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
 13.1035 +    __ add(to, 32, to);
 13.1036 +    __ orn(G0, G2, G2);
 13.1037 +    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
 13.1038 +    __ mov(G1, to);
 13.1039 +
 13.1040 +    __ BIND(L_check_decrypt_loop_end256);
 13.1041      __ add(from, 32, from);
 13.1042      __ add(to, 32, to);
 13.1043      __ subcc(len_reg, 32, len_reg);
 13.1044 @@ -4063,6 +4565,7 @@
 13.1045      __ delayed()->nop();
 13.1046  
 13.1047      __ BIND(L_cbcdec_end);
 13.1048 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
 13.1049      __ stx(L0, rvec, 0);
 13.1050      __ stx(L1, rvec, 8);
 13.1051      __ restore();

    14.1 --- a/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Wed May 07 10:58:47 2014 -0700
    14.2 +++ b/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Thu May 08 23:07:11 2014 -0700
    14.3 @@ -41,7 +41,7 @@
    14.4  enum /* platform_dependent_constants */ {
    14.5    // %%%%%%%% May be able to shrink this a lot
    14.6    code_size1 = 20000,           // simply increase if too small (assembler will crash if too small)
    14.7 -  code_size2 = 20000            // simply increase if too small (assembler will crash if too small)
    14.8 +  code_size2 = 22000            // simply increase if too small (assembler will crash if too small)
    14.9  };
   14.10  
   14.11  class Sparc {

    15.1 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Wed May 07 10:58:47 2014 -0700
    15.2 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu May 08 23:07:11 2014 -0700
    15.3 @@ -1,5 +1,5 @@
    15.4  /*
    15.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
    15.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
    15.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    15.8   *
    15.9   * This code is free software; you can redistribute it and/or modify it
   15.10 @@ -266,9 +266,9 @@
   15.11    if (!has_vis1()) // Drop to 0 if no VIS1 support
   15.12      UseVIS = 0;
   15.13  
   15.14 -  // T2 and above should have support for AES instructions
   15.15 +  // SPARC T4 and above should have support for AES instructions
   15.16    if (has_aes()) {
   15.17 -    if (UseVIS > 0) { // AES intrinsics use FXOR instruction which is VIS1
   15.18 +    if (UseVIS > 2) { // AES intrinsics use MOVxTOd/MOVdTOx which are VIS3
   15.19        if (FLAG_IS_DEFAULT(UseAES)) {
   15.20          FLAG_SET_DEFAULT(UseAES, true);
   15.21        }
   15.22 @@ -282,7 +282,7 @@
   15.23        }
   15.24      } else {
   15.25          if (UseAES || UseAESIntrinsics) {
   15.26 -          warning("SPARC AES intrinsics require VIS1 instruction support. Intrinsics will be disabled.");
   15.27 +          warning("SPARC AES intrinsics require VIS3 instruction support. Intrinsics will be disabled.");
   15.28            if (UseAES) {
   15.29              FLAG_SET_DEFAULT(UseAES, false);
   15.30            }

    16.1 --- a/src/cpu/x86/vm/assembler_x86.cpp	Wed May 07 10:58:47 2014 -0700
    16.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp	Thu May 08 23:07:11 2014 -0700
    16.3 @@ -1766,7 +1766,7 @@
    16.4  
    16.5  // Move Unaligned 256bit Vector
    16.6  void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
    16.7 -  assert(UseAVX, "");
    16.8 +  assert(UseAVX > 0, "");
    16.9    bool vector256 = true;
   16.10    int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);
   16.11    emit_int8(0x6F);
   16.12 @@ -1774,7 +1774,7 @@
   16.13  }
   16.14  
   16.15  void Assembler::vmovdqu(XMMRegister dst, Address src) {
   16.16 -  assert(UseAVX, "");
   16.17 +  assert(UseAVX > 0, "");
   16.18    InstructionMark im(this);
   16.19    bool vector256 = true;
   16.20    vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);
   16.21 @@ -1783,7 +1783,7 @@
   16.22  }
   16.23  
   16.24  void Assembler::vmovdqu(Address dst, XMMRegister src) {
   16.25 -  assert(UseAVX, "");
   16.26 +  assert(UseAVX > 0, "");
   16.27    InstructionMark im(this);
   16.28    bool vector256 = true;
   16.29    // swap src<->dst for encoding

    17.1 --- a/src/cpu/x86/vm/vm_version_x86.cpp	Wed May 07 10:58:47 2014 -0700
    17.2 +++ b/src/cpu/x86/vm/vm_version_x86.cpp	Thu May 08 23:07:11 2014 -0700
    17.3 @@ -263,6 +263,10 @@
    17.4      // and check upper YMM bits after it.
    17.5      //
    17.6      VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
    17.7 +    intx saved_useavx = UseAVX;
    17.8 +    intx saved_usesse = UseSSE;
    17.9 +    UseAVX = 1;
   17.10 +    UseSSE = 2;
   17.11  
   17.12      // load value into all 32 bytes of ymm7 register
   17.13      __ movl(rcx, VM_Version::ymm_test_value());
   17.14 @@ -292,6 +296,8 @@
   17.15  #endif
   17.16  
   17.17      VM_Version::clean_cpuFeatures();
   17.18 +    UseAVX = saved_useavx;
   17.19 +    UseSSE = saved_usesse;
   17.20  
   17.21      //
   17.22      // cpuid(0x7) Structured Extended Features

    18.1 --- a/src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp	Wed May 07 10:58:47 2014 -0700
    18.2 +++ b/src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp	Thu May 08 23:07:11 2014 -0700
    18.3 @@ -53,41 +53,41 @@
    18.4  
    18.5  inline jlong Atomic::load(volatile jlong* src) { return *src; }
    18.6  
    18.7 -/*
    18.8 -  machine barrier instructions:
    18.9 -
   18.10 -  - sync            two-way memory barrier, aka fence
   18.11 -  - lwsync          orders  Store|Store,
   18.12 -                             Load|Store,
   18.13 -                             Load|Load,
   18.14 -                    but not Store|Load
   18.15 -  - eieio           orders memory accesses for device memory (only)
   18.16 -  - isync           invalidates speculatively executed instructions
   18.17 -                    From the POWER ISA 2.06 documentation:
   18.18 -                     "[...] an isync instruction prevents the execution of
   18.19 -                    instructions following the isync until instructions
   18.20 -                    preceding the isync have completed, [...]"
   18.21 -                    From IBM's AIX assembler reference:
   18.22 -                     "The isync [...] instructions causes the processor to
   18.23 -                    refetch any instructions that might have been fetched
   18.24 -                    prior to the isync instruction. The instruction isync
   18.25 -                    causes the processor to wait for all previous instructions
   18.26 -                    to complete. Then any instructions already fetched are
   18.27 -                    discarded and instruction processing continues in the
   18.28 -                    environment established by the previous instructions."
   18.29 -
   18.30 -  semantic barrier instructions:
   18.31 -  (as defined in orderAccess.hpp)
   18.32 -
   18.33 -  - release         orders Store|Store,       (maps to lwsync)
   18.34 -                            Load|Store
   18.35 -  - acquire         orders  Load|Store,       (maps to lwsync)
   18.36 -                            Load|Load
   18.37 -  - fence           orders Store|Store,       (maps to sync)
   18.38 -                            Load|Store,
   18.39 -                            Load|Load,
   18.40 -                           Store|Load
   18.41 -*/
   18.42 +//
   18.43 +// machine barrier instructions:
   18.44 +//
   18.45 +// - sync            two-way memory barrier, aka fence
   18.46 +// - lwsync          orders  Store|Store,
   18.47 +//                            Load|Store,
   18.48 +//                            Load|Load,
   18.49 +//                   but not Store|Load
   18.50 +// - eieio           orders memory accesses for device memory (only)
   18.51 +// - isync           invalidates speculatively executed instructions
   18.52 +//                   From the POWER ISA 2.06 documentation:
   18.53 +//                    "[...] an isync instruction prevents the execution of
   18.54 +//                   instructions following the isync until instructions
   18.55 +//                   preceding the isync have completed, [...]"
   18.56 +//                   From IBM's AIX assembler reference:
   18.57 +//                    "The isync [...] instructions causes the processor to
   18.58 +//                   refetch any instructions that might have been fetched
   18.59 +//                   prior to the isync instruction. The instruction isync
   18.60 +//                   causes the processor to wait for all previous instructions
   18.61 +//                   to complete. Then any instructions already fetched are
   18.62 +//                   discarded and instruction processing continues in the
   18.63 +//                   environment established by the previous instructions."
   18.64 +//
   18.65 +// semantic barrier instructions:
   18.66 +// (as defined in orderAccess.hpp)
   18.67 +//
   18.68 +// - release         orders Store|Store,       (maps to lwsync)
   18.69 +//                           Load|Store
   18.70 +// - acquire         orders  Load|Store,       (maps to lwsync)
   18.71 +//                           Load|Load
   18.72 +// - fence           orders Store|Store,       (maps to sync)
   18.73 +//                           Load|Store,
   18.74 +//                           Load|Load,
   18.75 +//                          Store|Load
   18.76 +//
   18.77  
   18.78  #define strasm_sync                       "\n  sync    \n"
   18.79  #define strasm_lwsync                     "\n  lwsync  \n"

    19.1 --- a/src/share/vm/ci/ciReplay.cpp	Wed May 07 10:58:47 2014 -0700
    19.2 +++ b/src/share/vm/ci/ciReplay.cpp	Thu May 08 23:07:11 2014 -0700
    19.3 @@ -376,11 +376,15 @@
    19.4      int c = getc(_stream);
    19.5      while(c != EOF) {
    19.6        c = get_line(c);
    19.7 -      process_command(CHECK);
    19.8 +      process_command(THREAD);
    19.9        if (had_error()) {
   19.10          tty->print_cr("Error while parsing line %d: %s\n", line_no, _error_message);
   19.11 -        tty->print_cr("%s", _buffer);
   19.12 -        return;
   19.13 +        if (ReplayIgnoreInitErrors) {
   19.14 +          CLEAR_PENDING_EXCEPTION;
   19.15 +          _error_message = NULL;
   19.16 +        } else {
   19.17 +          return;
   19.18 +        }
   19.19        }
   19.20        line_no++;
   19.21      }
   19.22 @@ -565,10 +569,14 @@
   19.23    void process_ciMethodData(TRAPS) {
   19.24      Method* method = parse_method(CHECK);
   19.25      if (had_error()) return;
   19.26 -    /* jsut copied from Method, to build interpret data*/
   19.27 +    /* just copied from Method, to build interpret data*/
   19.28      if (InstanceRefKlass::owns_pending_list_lock((JavaThread*)THREAD)) {
   19.29        return;
   19.30      }
   19.31 +    // To be properly initialized, some profiling in the MDO needs the
   19.32 +    // method to be rewritten (the number of arguments at a call, for
   19.33 +    // instance).
   19.34 +    method->method_holder()->link_class(CHECK);
   19.35      // methodOopDesc::build_interpreter_method_data(method, CHECK);
   19.36      {
   19.37        // Grab a lock here to prevent multiple

    20.1 --- a/src/share/vm/classfile/vmSymbols.hpp	Wed May 07 10:58:47 2014 -0700
    20.2 +++ b/src/share/vm/classfile/vmSymbols.hpp	Thu May 08 23:07:11 2014 -0700
    20.3 @@ -774,7 +774,7 @@
    20.4    /* java/lang/ref/Reference */                                                                                         \
    20.5    do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
    20.6                                                                                                                          \
    20.7 -  /* support for com.sum.crypto.provider.AESCrypt and some of its callers */                                            \
    20.8 +  /* support for com.sun.crypto.provider.AESCrypt and some of its callers */                                            \
    20.9    do_class(com_sun_crypto_provider_aescrypt,      "com/sun/crypto/provider/AESCrypt")                                   \
   20.10    do_intrinsic(_aescrypt_encryptBlock, com_sun_crypto_provider_aescrypt, encryptBlock_name, byteArray_int_byteArray_int_signature, F_R)   \
   20.11    do_intrinsic(_aescrypt_decryptBlock, com_sun_crypto_provider_aescrypt, decryptBlock_name, byteArray_int_byteArray_int_signature, F_R)   \

    21.1 --- a/src/share/vm/code/nmethod.cpp	Wed May 07 10:58:47 2014 -0700
    21.2 +++ b/src/share/vm/code/nmethod.cpp	Thu May 08 23:07:11 2014 -0700
    21.3 @@ -771,7 +771,11 @@
    21.4      _hotness_counter         = NMethodSweeper::hotness_counter_reset_val();
    21.5  
    21.6      code_buffer->copy_values_to(this);
    21.7 -    debug_only(verify_scavenge_root_oops());
    21.8 +    if (ScavengeRootsInCode && detect_scavenge_root_oops()) {
    21.9 +      CodeCache::add_scavenge_root_nmethod(this);
   21.10 +      Universe::heap()->register_nmethod(this);
   21.11 +    }
   21.12 +    DEBUG_ONLY(verify_scavenge_root_oops();)
   21.13      CodeCache::commit(this);
   21.14    }
   21.15

    22.1 --- a/src/share/vm/oops/klass.cpp	Wed May 07 10:58:47 2014 -0700
    22.2 +++ b/src/share/vm/oops/klass.cpp	Thu May 08 23:07:11 2014 -0700
    22.3 @@ -496,6 +496,7 @@
    22.4  }
    22.5  
    22.6  void Klass::restore_unshareable_info(TRAPS) {
    22.7 +  TRACE_INIT_ID(this);
    22.8    // If an exception happened during CDS restore, some of these fields may already be
    22.9    // set.  We leave the class on the CLD list, even if incomplete so that we don't
   22.10    // modify the CLD list outside a safepoint.

    23.1 --- a/src/share/vm/opto/compile.cpp	Wed May 07 10:58:47 2014 -0700
    23.2 +++ b/src/share/vm/opto/compile.cpp	Thu May 08 23:07:11 2014 -0700
    23.3 @@ -693,6 +693,7 @@
    23.4  #endif
    23.5    set_print_inlining(PrintInlining || method()->has_option("PrintInlining") NOT_PRODUCT( || PrintOptoInlining));
    23.6    set_print_intrinsics(PrintIntrinsics || method()->has_option("PrintIntrinsics"));
    23.7 +  set_has_irreducible_loop(true); // conservative until build_loop_tree() resets it
    23.8  
    23.9    if (ProfileTraps RTM_OPT_ONLY( || UseRTMLocking )) {
   23.10      // Make sure the method being compiled gets its own MDO,
   23.11 @@ -977,6 +978,8 @@
   23.12    set_print_assembly(PrintFrameConverterAssembly);
   23.13    set_parsed_irreducible_loop(false);
   23.14  #endif
   23.15 +  set_has_irreducible_loop(false); // no loops
   23.16 +
   23.17    CompileWrapper cw(this);
   23.18    Init(/*AliasLevel=*/ 0);
   23.19    init_tf((*generator)());
   23.20 @@ -1147,7 +1150,7 @@
   23.21      if( start->is_Start() )
   23.22        return start->as_Start();
   23.23    }
   23.24 -  ShouldNotReachHere();
   23.25 +  fatal("Did not find Start node!");
   23.26    return NULL;
   23.27  }
   23.28

    24.1 --- a/src/share/vm/opto/compile.hpp	Wed May 07 10:58:47 2014 -0700
    24.2 +++ b/src/share/vm/opto/compile.hpp	Thu May 08 23:07:11 2014 -0700
    24.3 @@ -319,6 +319,7 @@
    24.4    bool                  _trace_opto_output;
    24.5    bool                  _parsed_irreducible_loop; // True if ciTypeFlow detected irreducible loops during parsing
    24.6  #endif
    24.7 +  bool                  _has_irreducible_loop;  // Found irreducible loops
    24.8    // JSR 292
    24.9    bool                  _has_method_handle_invokes; // True if this method has MethodHandle invokes.
   24.10    RTMState              _rtm_state;             // State of Restricted Transactional Memory usage
   24.11 @@ -605,6 +606,8 @@
   24.12    void          set_parsed_irreducible_loop(bool z) { _parsed_irreducible_loop = z; }
   24.13    int _in_dump_cnt;  // Required for dumping ir nodes.
   24.14  #endif
   24.15 +  bool              has_irreducible_loop() const { return _has_irreducible_loop; }
   24.16 +  void          set_has_irreducible_loop(bool z) { _has_irreducible_loop = z; }
   24.17  
   24.18    // JSR 292
   24.19    bool              has_method_handle_invokes() const { return _has_method_handle_invokes;     }

    25.1 --- a/src/share/vm/opto/loopnode.cpp	Wed May 07 10:58:47 2014 -0700
    25.2 +++ b/src/share/vm/opto/loopnode.cpp	Thu May 08 23:07:11 2014 -0700
    25.3 @@ -266,9 +266,9 @@
    25.4  
    25.5    // Counted loop head must be a good RegionNode with only 3 not NULL
    25.6    // control input edges: Self, Entry, LoopBack.
    25.7 -  if (x->in(LoopNode::Self) == NULL || x->req() != 3)
    25.8 +  if (x->in(LoopNode::Self) == NULL || x->req() != 3 || loop->_irreducible) {
    25.9      return false;
   25.10 -
   25.11 +  }
   25.12    Node *init_control = x->in(LoopNode::EntryControl);
   25.13    Node *back_control = x->in(LoopNode::LoopBackControl);
   25.14    if (init_control == NULL || back_control == NULL)    // Partially dead
   25.15 @@ -1522,11 +1522,11 @@
   25.16  
   25.17    // If I have one hot backedge, peel off myself loop.
   25.18    // I better be the outermost loop.
   25.19 -  if( _head->req() > 3 ) {
   25.20 +  if (_head->req() > 3 && !_irreducible) {
   25.21      split_outer_loop( phase );
   25.22      result = true;
   25.23  
   25.24 -  } else if( !_head->is_Loop() && !_irreducible ) {
   25.25 +  } else if (!_head->is_Loop() && !_irreducible) {
   25.26      // Make a new LoopNode to replace the old loop head
   25.27      Node *l = new (phase->C) LoopNode( _head->in(1), _head->in(2) );
   25.28      l = igvn.register_new_node_with_optimizer(l, _head);
   25.29 @@ -2938,6 +2938,7 @@
   25.30            return pre_order;
   25.31          }
   25.32        }
   25.33 +      C->set_has_irreducible_loop(_has_irreducible_loops);
   25.34      }
   25.35  
   25.36      // This Node might be a decision point for loops.  It is only if

    26.1 --- a/src/share/vm/opto/memnode.cpp	Wed May 07 10:58:47 2014 -0700
    26.2 +++ b/src/share/vm/opto/memnode.cpp	Thu May 08 23:07:11 2014 -0700
    26.3 @@ -306,33 +306,16 @@
    26.4      int alias_idx = phase->C->get_alias_index(t_adr->is_ptr());
    26.5    }
    26.6  
    26.7 -#ifdef ASSERT
    26.8    Node* base = NULL;
    26.9 -  if (address->is_AddP())
   26.10 +  if (address->is_AddP()) {
   26.11      base = address->in(AddPNode::Base);
   26.12 +  }
   26.13    if (base != NULL && phase->type(base)->higher_equal(TypePtr::NULL_PTR) &&
   26.14        !t_adr->isa_rawptr()) {
   26.15      // Note: raw address has TOP base and top->higher_equal(TypePtr::NULL_PTR) is true.
   26.16 -    Compile* C = phase->C;
   26.17 -    tty->cr();
   26.18 -    tty->print_cr("===== NULL+offs not RAW address =====");
   26.19 -    if (C->is_dead_node(this->_idx))    tty->print_cr("'this' is dead");
   26.20 -    if ((ctl != NULL) && C->is_dead_node(ctl->_idx)) tty->print_cr("'ctl' is dead");
   26.21 -    if (C->is_dead_node(mem->_idx))     tty->print_cr("'mem' is dead");
   26.22 -    if (C->is_dead_node(address->_idx)) tty->print_cr("'address' is dead");
   26.23 -    if (C->is_dead_node(base->_idx))    tty->print_cr("'base' is dead");
   26.24 -    tty->cr();
   26.25 -    base->dump(1);
   26.26 -    tty->cr();
   26.27 -    this->dump(2);
   26.28 -    tty->print("this->adr_type():     "); adr_type()->dump(); tty->cr();
   26.29 -    tty->print("phase->type(address): "); t_adr->dump(); tty->cr();
   26.30 -    tty->print("phase->type(base):    "); phase->type(address)->dump(); tty->cr();
   26.31 -    tty->cr();
   26.32 +    // Skip this node optimization if its address has TOP base.
   26.33 +    return NodeSentinel; // caller will return NULL
   26.34    }
   26.35 -  assert(base == NULL || t_adr->isa_rawptr() ||
   26.36 -        !phase->type(base)->higher_equal(TypePtr::NULL_PTR), "NULL+offs not RAW address?");
   26.37 -#endif
   26.38  
   26.39    // Avoid independent memory operations
   26.40    Node* old_mem = mem;

    27.1 --- a/src/share/vm/opto/node.cpp	Wed May 07 10:58:47 2014 -0700
    27.2 +++ b/src/share/vm/opto/node.cpp	Thu May 08 23:07:11 2014 -0700
    27.3 @@ -27,6 +27,7 @@
    27.4  #include "memory/allocation.inline.hpp"
    27.5  #include "opto/cfgnode.hpp"
    27.6  #include "opto/connode.hpp"
    27.7 +#include "opto/loopnode.hpp"
    27.8  #include "opto/machnode.hpp"
    27.9  #include "opto/matcher.hpp"
   27.10  #include "opto/node.hpp"
   27.11 @@ -1255,6 +1256,7 @@
   27.12  
   27.13    Node *top = igvn->C->top();
   27.14    nstack.push(dead);
   27.15 +  bool has_irreducible_loop = igvn->C->has_irreducible_loop();
   27.16  
   27.17    while (nstack.size() > 0) {
   27.18      dead = nstack.pop();
   27.19 @@ -1269,13 +1271,31 @@
   27.20            assert (!use->is_Con(), "Control for Con node should be Root node.");
   27.21            use->set_req(0, top);       // Cut dead edge to prevent processing
   27.22            nstack.push(use);           // the dead node again.
   27.23 +        } else if (!has_irreducible_loop && // Backedge could be alive in irreducible loop
   27.24 +                   use->is_Loop() && !use->is_Root() &&       // Don't kill Root (RootNode extends LoopNode)
   27.25 +                   use->in(LoopNode::EntryControl) == dead) { // Dead loop if its entry is dead
   27.26 +          use->set_req(LoopNode::EntryControl, top);          // Cut dead edge to prevent processing
   27.27 +          use->set_req(0, top);       // Cut self edge
   27.28 +          nstack.push(use);
   27.29          } else {                      // Else found a not-dead user
   27.30 +          // Dead if all inputs are top or null
   27.31 +          bool dead_use = !use->is_Root(); // Keep empty graph alive
   27.32            for (uint j = 1; j < use->req(); j++) {
   27.33 -            if (use->in(j) == dead) { // Turn all dead inputs into TOP
   27.34 +            Node* in = use->in(j);
   27.35 +            if (in == dead) {         // Turn all dead inputs into TOP
   27.36                use->set_req(j, top);
   27.37 +            } else if (in != NULL && !in->is_top()) {
   27.38 +              dead_use = false;
   27.39              }
   27.40            }
   27.41 -          igvn->_worklist.push(use);
   27.42 +          if (dead_use) {
   27.43 +            if (use->is_Region()) {
   27.44 +              use->set_req(0, top);   // Cut self edge
   27.45 +            }
   27.46 +            nstack.push(use);
   27.47 +          } else {
   27.48 +            igvn->_worklist.push(use);
   27.49 +          }
   27.50          }
   27.51          // Refresh the iterator, since any number of kills might have happened.
   27.52          k = dead->last_outs(kmin);

    28.1 --- a/src/share/vm/opto/runtime.cpp	Wed May 07 10:58:47 2014 -0700
    28.2 +++ b/src/share/vm/opto/runtime.cpp	Thu May 08 23:07:11 2014 -0700
    28.3 @@ -1,5 +1,5 @@
    28.4  /*
    28.5 - * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
    28.6 + * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
    28.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    28.8   *
    28.9   * This code is free software; you can redistribute it and/or modify it
   28.10 @@ -870,7 +870,7 @@
   28.11    return TypeFunc::make(domain, range);
   28.12  }
   28.13  
   28.14 -// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning void
   28.15 +// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning int
   28.16  const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() {
   28.17    // create input type (domain)
   28.18    int num_args      = 5;

    29.1 --- a/src/share/vm/runtime/advancedThresholdPolicy.cpp	Wed May 07 10:58:47 2014 -0700
    29.2 +++ b/src/share/vm/runtime/advancedThresholdPolicy.cpp	Thu May 08 23:07:11 2014 -0700
    29.3 @@ -53,7 +53,8 @@
    29.4    }
    29.5  
    29.6    set_c1_count(MAX2(count / 3, 1));
    29.7 -  set_c2_count(MAX2(count - count / 3, 1));
    29.8 +  set_c2_count(MAX2(count - c1_count(), 1));
    29.9 +  FLAG_SET_ERGO(intx, CICompilerCount, c1_count() + c2_count());
   29.10  
   29.11    // Some inlining tuning
   29.12  #ifdef X86

    30.1 --- a/src/share/vm/runtime/arguments.cpp	Wed May 07 10:58:47 2014 -0700
    30.2 +++ b/src/share/vm/runtime/arguments.cpp	Thu May 08 23:07:11 2014 -0700
    30.3 @@ -2383,6 +2383,10 @@
    30.4    status &= verify_interval(NmethodSweepFraction, 1, ReservedCodeCacheSize/K, "NmethodSweepFraction");
    30.5    status &= verify_interval(NmethodSweepActivity, 0, 2000, "NmethodSweepActivity");
    30.6  
    30.7 +  if (!FLAG_IS_DEFAULT(CICompilerCount) && !FLAG_IS_DEFAULT(CICompilerCountPerCPU) && CICompilerCountPerCPU) {
    30.8 +    warning("The VM option CICompilerCountPerCPU overrides CICompilerCount.");
    30.9 +  }
   30.10 +
   30.11    return status;
   30.12  }
   30.13

    31.1 --- a/src/share/vm/runtime/compilationPolicy.cpp	Wed May 07 10:58:47 2014 -0700
    31.2 +++ b/src/share/vm/runtime/compilationPolicy.cpp	Thu May 08 23:07:11 2014 -0700
    31.3 @@ -182,6 +182,7 @@
    31.4      // max(log2(8)-1,1) = 2 compiler threads on an 8-way machine.
    31.5      // May help big-app startup time.
    31.6      _compiler_count = MAX2(log2_intptr(os::active_processor_count())-1,1);
    31.7 +    FLAG_SET_ERGO(intx, CICompilerCount, _compiler_count);
    31.8    } else {
    31.9      _compiler_count = CICompilerCount;
   31.10    }

    32.1 --- a/src/share/vm/runtime/sharedRuntime.cpp	Wed May 07 10:58:47 2014 -0700
    32.2 +++ b/src/share/vm/runtime/sharedRuntime.cpp	Thu May 08 23:07:11 2014 -0700
    32.3 @@ -2690,19 +2690,20 @@
    32.4  JRT_END
    32.5  
    32.6  #ifdef HAVE_DTRACE_H
    32.7 -// Create a dtrace nmethod for this method.  The wrapper converts the
    32.8 -// java compiled calling convention to the native convention, makes a dummy call
    32.9 -// (actually nops for the size of the call instruction, which become a trap if
   32.10 -// probe is enabled). The returns to the caller. Since this all looks like a
   32.11 -// leaf no thread transition is needed.
   32.12 -
   32.13 +/**
   32.14 + * Create a dtrace nmethod for this method.  The wrapper converts the
   32.15 + * Java-compiled calling convention to the native convention, makes a dummy call
   32.16 + * (actually nops for the size of the call instruction, which become a trap if
   32.17 + * probe is enabled), and finally returns to the caller. Since this all looks like a
   32.18 + * leaf, no thread transition is needed.
   32.19 + */
   32.20  nmethod *AdapterHandlerLibrary::create_dtrace_nmethod(methodHandle method) {
   32.21    ResourceMark rm;
   32.22    nmethod* nm = NULL;
   32.23  
   32.24    if (PrintCompilation) {
   32.25      ttyLocker ttyl;
   32.26 -    tty->print("---   n%s  ");
   32.27 +    tty->print("---   n  ");
   32.28      method->print_short_name(tty);
   32.29      if (method->is_static()) {
   32.30        tty->print(" (static)");

    33.1 --- a/src/share/vm/runtime/simpleThresholdPolicy.cpp	Wed May 07 10:58:47 2014 -0700
    33.2 +++ b/src/share/vm/runtime/simpleThresholdPolicy.cpp	Thu May 08 23:07:11 2014 -0700
    33.3 @@ -142,7 +142,8 @@
    33.4      count = MAX2(log2_intptr(os::active_processor_count()), 1) * 3 / 2;
    33.5    }
    33.6    set_c1_count(MAX2(count / 3, 1));
    33.7 -  set_c2_count(MAX2(count - count / 3, 1));
    33.8 +  set_c2_count(MAX2(count - c1_count(), 1));
    33.9 +  FLAG_SET_ERGO(intx, CICompilerCount, c1_count() + c2_count());
   33.10  }
   33.11  
   33.12  void SimpleThresholdPolicy::set_carry_if_necessary(InvocationCounter *counter) {
   33.13 @@ -191,6 +192,10 @@
   33.14        thread->is_interp_only_mode()) {
   33.15      return NULL;
   33.16    }
   33.17 +  if (CompileTheWorld || ReplayCompiles) {
   33.18 +    // Don't trigger other compiles in testing mode
   33.19 +    return NULL;
   33.20 +  }
   33.21    nmethod *osr_nm = NULL;
   33.22  
   33.23    handle_counter_overflow(method());

    34.1 --- a/test/compiler/7184394/TestAESBase.java	Wed May 07 10:58:47 2014 -0700
    34.2 +++ b/test/compiler/7184394/TestAESBase.java	Thu May 08 23:07:11 2014 -0700
    34.3 @@ -1,5 +1,5 @@
    34.4  /*
    34.5 - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
    34.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
    34.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    34.8   *
    34.9   * This code is free software; you can redistribute it and/or modify it
   34.10 @@ -40,9 +40,20 @@
   34.11    int msgSize = Integer.getInteger("msgSize", 646);
   34.12    boolean checkOutput = Boolean.getBoolean("checkOutput");
   34.13    boolean noReinit = Boolean.getBoolean("noReinit");
   34.14 +  boolean testingMisalignment;
   34.15 +  private static final int ALIGN = 8;
   34.16 +  int encInputOffset = Integer.getInteger("encInputOffset", 0) % ALIGN;
   34.17 +  int encOutputOffset = Integer.getInteger("encOutputOffset", 0) % ALIGN;
   34.18 +  int decOutputOffset = Integer.getInteger("decOutputOffset", 0) % ALIGN;
   34.19 +  int lastChunkSize = Integer.getInteger("lastChunkSize", 32);
   34.20    int keySize = Integer.getInteger("keySize", 128);
   34.21 +  int inputLength;
   34.22 +  int encodeLength;
   34.23 +  int decodeLength;
   34.24 +  int decodeMsgSize;
   34.25    String algorithm = System.getProperty("algorithm", "AES");
   34.26    String mode = System.getProperty("mode", "CBC");
   34.27 +  String paddingStr = System.getProperty("paddingStr", "PKCS5Padding");
   34.28    byte[] input;
   34.29    byte[] encode;
   34.30    byte[] expectedEncode;
   34.31 @@ -51,7 +62,6 @@
   34.32    Random random = new Random(0);
   34.33    Cipher cipher;
   34.34    Cipher dCipher;
   34.35 -  String paddingStr = "PKCS5Padding";
   34.36    AlgorithmParameters algParams;
   34.37    SecretKey key;
   34.38  
   34.39 @@ -67,7 +77,10 @@
   34.40  
   34.41    public void prepare() {
   34.42      try {
   34.43 -    System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput);
   34.44 +    System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", paddingStr=" + paddingStr + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput + ", encInputOffset=" + encInputOffset + ", encOutputOffset=" + encOutputOffset + ", decOutputOffset=" + decOutputOffset + ", lastChunkSize=" +lastChunkSize );
   34.45 +
   34.46 +      if (encInputOffset % ALIGN != 0 || encOutputOffset % ALIGN != 0 || decOutputOffset % ALIGN !=0 )
   34.47 +        testingMisalignment = true;
   34.48  
   34.49        int keyLenBytes = (keySize == 0 ? 16 : keySize/8);
   34.50        byte keyBytes[] = new byte[keyLenBytes];
   34.51 @@ -81,10 +94,6 @@
   34.52          System.out.println("Algorithm: " + key.getAlgorithm() + "("
   34.53                             + key.getEncoded().length * 8 + "bit)");
   34.54        }
   34.55 -      input = new byte[msgSize];
   34.56 -      for (int i=0; i<input.length; i++) {
   34.57 -        input[i] = (byte) (i & 0xff);
   34.58 -      }
   34.59  
   34.60        cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
   34.61        dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
   34.62 @@ -103,10 +112,35 @@
   34.63          childShowCipher();
   34.64        }
   34.65  
   34.66 +      inputLength = msgSize + encInputOffset;
   34.67 +      if (testingMisalignment) {
   34.68 +        encodeLength = cipher.getOutputSize(msgSize - lastChunkSize) + encOutputOffset;
   34.69 +        encodeLength += cipher.getOutputSize(lastChunkSize);
   34.70 +        decodeLength = dCipher.getOutputSize(encodeLength - lastChunkSize) + decOutputOffset;
   34.71 +        decodeLength += dCipher.getOutputSize(lastChunkSize);
   34.72 +      } else {
   34.73 +        encodeLength = cipher.getOutputSize(msgSize) + encOutputOffset;
   34.74 +        decodeLength = dCipher.getOutputSize(encodeLength) + decOutputOffset;
   34.75 +      }
   34.76 +
   34.77 +      input = new byte[inputLength];
   34.78 +      for (int i=encInputOffset, j=0; i<inputLength; i++, j++) {
   34.79 +        input[i] = (byte) (j & 0xff);
   34.80 +      }
   34.81 +
   34.82        // do one encode and decode in preparation
   34.83 -      // this will also create the encode buffer and decode buffer
   34.84 -      encode = cipher.doFinal(input);
   34.85 -      decode = dCipher.doFinal(encode);
   34.86 +      encode = new byte[encodeLength];
   34.87 +      decode = new byte[decodeLength];
   34.88 +      if (testingMisalignment) {
   34.89 +        decodeMsgSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset);
   34.90 +        decodeMsgSize += cipher.doFinal(input, (encInputOffset + msgSize - lastChunkSize), lastChunkSize, encode, (encOutputOffset + decodeMsgSize));
   34.91 +
   34.92 +        int tempSize = dCipher.update(encode, encOutputOffset, (decodeMsgSize - lastChunkSize), decode, decOutputOffset);
   34.93 +        dCipher.doFinal(encode, (encOutputOffset + decodeMsgSize - lastChunkSize), lastChunkSize, decode, (decOutputOffset + tempSize));
   34.94 +      } else {
   34.95 +        decodeMsgSize = cipher.doFinal(input, encInputOffset, msgSize, encode, encOutputOffset);
   34.96 +        dCipher.doFinal(encode, encOutputOffset, decodeMsgSize, decode, decOutputOffset);
   34.97 +      }
   34.98        if (checkOutput) {
   34.99          expectedEncode = (byte[]) encode.clone();
  34.100          expectedDecode = (byte[]) decode.clone();

    35.1 --- a/test/compiler/7184394/TestAESDecode.java	Wed May 07 10:58:47 2014 -0700
    35.2 +++ b/test/compiler/7184394/TestAESDecode.java	Thu May 08 23:07:11 2014 -0700
    35.3 @@ -1,5 +1,5 @@
    35.4  /*
    35.5 - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
    35.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
    35.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    35.8   *
    35.9   * This code is free software; you can redistribute it and/or modify it
   35.10 @@ -33,14 +33,15 @@
   35.11    public void run() {
   35.12      try {
   35.13        if (!noReinit) dCipher.init(Cipher.DECRYPT_MODE, key, algParams);
   35.14 +      decode = new byte[decodeLength];
   35.15 +      if (testingMisalignment) {
   35.16 +        int tempSize = dCipher.update(encode, encOutputOffset, (decodeMsgSize - lastChunkSize), decode, decOutputOffset);
   35.17 +        dCipher.doFinal(encode, (encOutputOffset + decodeMsgSize - lastChunkSize), lastChunkSize, decode, (decOutputOffset + tempSize));
   35.18 +      } else {
   35.19 +        dCipher.doFinal(encode, encOutputOffset, decodeMsgSize, decode, decOutputOffset);
   35.20 +      }
   35.21        if (checkOutput) {
   35.22 -        // checked version creates new output buffer each time
   35.23 -        decode = dCipher.doFinal(encode, 0, encode.length);
   35.24          compareArrays(decode, expectedDecode);
   35.25 -      } else {
   35.26 -        // non-checked version outputs to existing encode buffer for maximum speed
   35.27 -        decode = new byte[dCipher.getOutputSize(encode.length)];
   35.28 -        dCipher.doFinal(encode, 0, encode.length, decode);
   35.29        }
   35.30      }
   35.31      catch (Exception e) {

    36.1 --- a/test/compiler/7184394/TestAESEncode.java	Wed May 07 10:58:47 2014 -0700
    36.2 +++ b/test/compiler/7184394/TestAESEncode.java	Thu May 08 23:07:11 2014 -0700
    36.3 @@ -1,5 +1,5 @@
    36.4  /*
    36.5 - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
    36.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
    36.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    36.8   *
    36.9   * This code is free software; you can redistribute it and/or modify it
   36.10 @@ -33,14 +33,15 @@
   36.11    public void run() {
   36.12      try {
   36.13        if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
   36.14 +      encode = new byte[encodeLength];
   36.15 +      if (testingMisalignment) {
   36.16 +        int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset);
   36.17 +        cipher.doFinal(input, (encInputOffset + msgSize - lastChunkSize), lastChunkSize, encode, (encOutputOffset + tempSize));
   36.18 +      } else {
   36.19 +        cipher.doFinal(input, encInputOffset, msgSize, encode, encOutputOffset);
   36.20 +      }
   36.21        if (checkOutput) {
   36.22 -        // checked version creates new output buffer each time
   36.23 -        encode = cipher.doFinal(input, 0, msgSize);
   36.24          compareArrays(encode, expectedEncode);
   36.25 -      } else {
   36.26 -        // non-checked version outputs to existing encode buffer for maximum speed
   36.27 -        encode = new byte[cipher.getOutputSize(msgSize)];
   36.28 -        cipher.doFinal(input, 0, msgSize, encode);
   36.29        }
   36.30      }
   36.31      catch (Exception e) {

    37.1 --- a/test/compiler/7184394/TestAESMain.java	Wed May 07 10:58:47 2014 -0700
    37.2 +++ b/test/compiler/7184394/TestAESMain.java	Thu May 08 23:07:11 2014 -0700
    37.3 @@ -1,5 +1,5 @@
    37.4  /*
    37.5 - * Copyright (c) 2012, 2014 Oracle and/or its affiliates. All rights reserved.
    37.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
    37.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    37.8   *
    37.9   * This code is free software; you can redistribute it and/or modify it
   37.10 @@ -28,7 +28,19 @@
   37.11   * @summary add intrinsics to use AES instructions
   37.12   *
   37.13   * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC TestAESMain
   37.14 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 TestAESMain
   37.15 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencOutputOffset=1 TestAESMain
   37.16 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DdecOutputOffset=1 TestAESMain
   37.17 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
   37.18 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
   37.19 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
   37.20   * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB TestAESMain
   37.21 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 TestAESMain
   37.22 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencOutputOffset=1 TestAESMain
   37.23 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DdecOutputOffset=1 TestAESMain
   37.24 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
   37.25 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
   37.26 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
   37.27   *
   37.28   * @author Tom Deneau
   37.29   */
   37.30 @@ -36,12 +48,13 @@
   37.31  public class TestAESMain {
   37.32    public static void main(String[] args) {
   37.33      int iters = (args.length > 0 ? Integer.valueOf(args[0]) : 1000000);
   37.34 +    int warmupIters = (args.length > 1 ? Integer.valueOf(args[1]) : 20000);
   37.35      System.out.println(iters + " iterations");
   37.36      TestAESEncode etest = new TestAESEncode();
   37.37      etest.prepare();
   37.38 -    // warm-up for 20K iterations
   37.39 +    // warm-up
   37.40      System.out.println("Starting encryption warm-up");
   37.41 -    for (int i=0; i<20000; i++) {
   37.42 +    for (int i=0; i<warmupIters; i++) {
   37.43        etest.run();
   37.44      }
   37.45      System.out.println("Finished encryption warm-up");
   37.46 @@ -54,9 +67,9 @@
   37.47  
   37.48      TestAESDecode dtest = new TestAESDecode();
   37.49      dtest.prepare();
   37.50 -    // warm-up for 20K iterations
   37.51 +    // warm-up
   37.52      System.out.println("Starting decryption warm-up");
   37.53 -    for (int i=0; i<20000; i++) {
   37.54 +    for (int i=0; i<warmupIters; i++) {
   37.55        dtest.run();
   37.56      }
   37.57      System.out.println("Finished decryption warm-up");

Mercurial > jdk8-mips64-public > hotspot / changeset

changeset

Merge hs25.20-b14