Thu, 08 May 2014 23:07:11 -0700
Merge
.hgtags | file | annotate | diff | comparison | revisions |
1.1 --- a/.hgtags Wed May 07 10:58:47 2014 -0700 1.2 +++ b/.hgtags Thu May 08 23:07:11 2014 -0700 1.3 @@ -462,3 +462,4 @@ 1.4 3c291bc2aa7c58efb1219701f38c41731609e595 hs25.20-b12 1.5 18ae0dac7620474547aa1721bc3fd748af07b8b5 jdk8u20-b12 1.6 47951595af60460a479b8574622375bfbf5c8ed2 jdk8u20-b13 1.7 +798f5b02be897151fdad44d695446088b1cca6b1 hs25.20-b13
2.1 --- a/make/hotspot_version Wed May 07 10:58:47 2014 -0700 2.2 +++ b/make/hotspot_version Thu May 08 23:07:11 2014 -0700 2.3 @@ -35,7 +35,7 @@ 2.4 2.5 HS_MAJOR_VER=25 2.6 HS_MINOR_VER=20 2.7 -HS_BUILD_NUMBER=12 2.8 +HS_BUILD_NUMBER=14 2.9 2.10 JDK_MAJOR_VER=1 2.11 JDK_MINOR_VER=8
3.1 --- a/src/cpu/ppc/vm/cppInterpreter_ppc.cpp Wed May 07 10:58:47 2014 -0700 3.2 +++ b/src/cpu/ppc/vm/cppInterpreter_ppc.cpp Thu May 08 23:07:11 2014 -0700 3.3 @@ -1,3 +1,4 @@ 3.4 + 3.5 /* 3.6 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. 3.7 * Copyright 2012, 2013 SAP AG. All rights reserved. 3.8 @@ -403,7 +404,7 @@ 3.9 BLOCK_COMMENT("compute_interpreter_state {"); 3.10 3.11 // access_flags = method->access_flags(); 3.12 - // TODO: PPC port: assert(4 == methodOopDesc::sz_access_flags(), "unexpected field size"); 3.13 + // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size"); 3.14 __ lwa(access_flags, method_(access_flags)); 3.15 3.16 // parameter_count = method->constMethod->size_of_parameters(); 3.17 @@ -1055,7 +1056,7 @@ 3.18 assert(access_flags->is_nonvolatile(), 3.19 "access_flags must be in a non-volatile register"); 3.20 // Type check. 3.21 - // TODO: PPC port: assert(4 == methodOopDesc::sz_access_flags(), "unexpected field size"); 3.22 + // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size"); 3.23 __ lwz(access_flags, method_(access_flags)); 3.24 3.25 // We don't want to reload R19_method and access_flags after calls 3.26 @@ -1838,7 +1839,7 @@ 3.27 // Interpreter state fields. 3.28 const Register msg = R24_tmp4; 3.29 3.30 - // MethodOop fields. 3.31 + // Method fields. 
3.32 const Register parameter_count = R25_tmp5; 3.33 const Register result_index = R26_tmp6; 3.34 3.35 @@ -2023,7 +2024,7 @@ 3.36 __ add(R17_tos, R17_tos, parameter_count); 3.37 3.38 // Result stub address array index 3.39 - // TODO: PPC port: assert(4 == methodOopDesc::sz_result_index(), "unexpected field size"); 3.40 + // TODO: PPC port: assert(4 == sizeof(AccessFlags), "unexpected field size"); 3.41 __ lwa(result_index, method_(result_index)); 3.42 3.43 __ li(msg, BytecodeInterpreter::method_resume); 3.44 @@ -2709,7 +2710,7 @@ 3.45 __ ld(R3_ARG1, state_(_result._osr._osr_buf)); 3.46 __ mtctr(R12_scratch2); 3.47 3.48 - // Load method oop, gc may move it during execution of osr'd method. 3.49 + // Load method, gc may move it during execution of osr'd method. 3.50 __ ld(R22_tmp2, state_(_method)); 3.51 // Load message 'call_method'. 3.52 __ li(R23_tmp3, BytecodeInterpreter::call_method);
4.1 --- a/src/cpu/ppc/vm/frame_ppc.inline.hpp Wed May 07 10:58:47 2014 -0700 4.2 +++ b/src/cpu/ppc/vm/frame_ppc.inline.hpp Thu May 08 23:07:11 2014 -0700 4.3 @@ -26,6 +26,8 @@ 4.4 #ifndef CPU_PPC_VM_FRAME_PPC_INLINE_HPP 4.5 #define CPU_PPC_VM_FRAME_PPC_INLINE_HPP 4.6 4.7 +#include "code/codeCache.hpp" 4.8 + 4.9 // Inline functions for ppc64 frames: 4.10 4.11 // Find codeblob and set deopt_state.
5.1 --- a/src/cpu/ppc/vm/interp_masm_ppc_64.hpp Wed May 07 10:58:47 2014 -0700 5.2 +++ b/src/cpu/ppc/vm/interp_masm_ppc_64.hpp Thu May 08 23:07:11 2014 -0700 5.3 @@ -26,7 +26,7 @@ 5.4 #ifndef CPU_PPC_VM_INTERP_MASM_PPC_64_HPP 5.5 #define CPU_PPC_VM_INTERP_MASM_PPC_64_HPP 5.6 5.7 -#include "assembler_ppc.inline.hpp" 5.8 +#include "asm/macroAssembler.hpp" 5.9 #include "interpreter/invocationCounter.hpp" 5.10 5.11 // This file specializes the assembler with interpreter-specific macros.
6.1 --- a/src/cpu/ppc/vm/interpreterRT_ppc.cpp Wed May 07 10:58:47 2014 -0700 6.2 +++ b/src/cpu/ppc/vm/interpreterRT_ppc.cpp Thu May 08 23:07:11 2014 -0700 6.3 @@ -24,6 +24,7 @@ 6.4 */ 6.5 6.6 #include "precompiled.hpp" 6.7 +#include "asm/assembler.inline.hpp" 6.8 #include "interpreter/interpreter.hpp" 6.9 #include "interpreter/interpreterRuntime.hpp" 6.10 #include "memory/allocation.inline.hpp"
7.1 --- a/src/cpu/ppc/vm/interpreter_ppc.cpp Wed May 07 10:58:47 2014 -0700 7.2 +++ b/src/cpu/ppc/vm/interpreter_ppc.cpp Thu May 08 23:07:11 2014 -0700 7.3 @@ -139,32 +139,16 @@ 7.4 // Signature is in R3_RET. Signature is callee saved. 7.5 __ mr(signature, R3_RET); 7.6 7.7 - // Reload method, it may have moved. 7.8 -#ifdef CC_INTERP 7.9 - __ ld(R19_method, state_(_method)); 7.10 -#else 7.11 - __ ld(R19_method, 0, target_sp); 7.12 - __ ld(R19_method, _ijava_state_neg(method), R19_method); 7.13 -#endif 7.14 - 7.15 // Get the result handler. 7.16 __ call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::get_result_handler), R16_thread, R19_method); 7.17 7.18 - // Reload method, it may have moved. 7.19 -#ifdef CC_INTERP 7.20 - __ ld(R19_method, state_(_method)); 7.21 -#else 7.22 - __ ld(R19_method, 0, target_sp); 7.23 - __ ld(R19_method, _ijava_state_neg(method), R19_method); 7.24 -#endif 7.25 - 7.26 { 7.27 Label L; 7.28 // test if static 7.29 // _access_flags._flags must be at offset 0. 7.30 // TODO PPC port: requires change in shared code. 7.31 //assert(in_bytes(AccessFlags::flags_offset()) == 0, 7.32 - // "MethodOopDesc._access_flags == MethodOopDesc._access_flags._flags"); 7.33 + // "MethodDesc._access_flags == MethodDesc._access_flags._flags"); 7.34 // _access_flags must be a 32 bit value. 7.35 assert(sizeof(AccessFlags) == 4, "wrong size"); 7.36 __ lwa(R11_scratch1/*access_flags*/, method_(access_flags));
8.1 --- a/src/cpu/ppc/vm/jniFastGetField_ppc.cpp Wed May 07 10:58:47 2014 -0700 8.2 +++ b/src/cpu/ppc/vm/jniFastGetField_ppc.cpp Thu May 08 23:07:11 2014 -0700 8.3 @@ -32,7 +32,7 @@ 8.4 8.5 8.6 address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) { 8.7 - // we don't have fast jni accessors. 8.8 + // We don't have fast jni accessors. 8.9 return (address) -1; 8.10 } 8.11 8.12 @@ -57,12 +57,12 @@ 8.13 } 8.14 8.15 address JNI_FastGetField::generate_fast_get_long_field() { 8.16 - // we don't have fast jni accessors. 8.17 + // We don't have fast jni accessors. 8.18 return (address) -1; 8.19 } 8.20 8.21 address JNI_FastGetField::generate_fast_get_float_field0(BasicType type) { 8.22 - // e don't have fast jni accessors. 8.23 + // We don't have fast jni accessors. 8.24 return (address) -1; 8.25 } 8.26
9.1 --- a/src/cpu/ppc/vm/ppc.ad Wed May 07 10:58:47 2014 -0700 9.2 +++ b/src/cpu/ppc/vm/ppc.ad Thu May 08 23:07:11 2014 -0700 9.3 @@ -898,7 +898,7 @@ 9.4 // To keep related declarations/definitions/uses close together, 9.5 // we switch between source %{ }% and source_hpp %{ }% freely as needed. 9.6 9.7 - // Returns true if Node n is followed by a MemBar node that 9.8 + // Returns true if Node n is followed by a MemBar node that 9.9 // will do an acquire. If so, this node must not do the acquire 9.10 // operation. 9.11 bool followed_by_acquire(const Node *n); 9.12 @@ -908,7 +908,7 @@ 9.13 9.14 // Optimize load-acquire. 9.15 // 9.16 -// Check if acquire is unnecessary due to following operation that does 9.17 +// Check if acquire is unnecessary due to following operation that does 9.18 // acquire anyways. 9.19 // Walk the pattern: 9.20 // 9.21 @@ -919,12 +919,12 @@ 9.22 // Proj(ctrl) Proj(mem) 9.23 // | | 9.24 // MemBarRelease/Volatile 9.25 -// 9.26 +// 9.27 bool followed_by_acquire(const Node *load) { 9.28 assert(load->is_Load(), "So far implemented only for loads."); 9.29 9.30 // Find MemBarAcquire. 9.31 - const Node *mba = NULL; 9.32 + const Node *mba = NULL; 9.33 for (DUIterator_Fast imax, i = load->fast_outs(imax); i < imax; i++) { 9.34 const Node *out = load->fast_out(i); 9.35 if (out->Opcode() == Op_MemBarAcquire) { 9.36 @@ -937,7 +937,7 @@ 9.37 9.38 // Find following MemBar node. 9.39 // 9.40 - // The following node must be reachable by control AND memory 9.41 + // The following node must be reachable by control AND memory 9.42 // edge to assure no other operations are in between the two nodes. 9.43 // 9.44 // So first get the Proj node, mem_proj, to use it to iterate forward. 9.45 @@ -1135,6 +1135,7 @@ 9.46 9.47 public: 9.48 9.49 + // Emit call stub, compiled java to interpreter. 9.50 static void emit_trampoline_stub(MacroAssembler &_masm, int destination_toc_offset, int insts_call_instruction_offset); 9.51 9.52 // Size of call trampoline stub. 
9.53 @@ -2752,7 +2753,7 @@ 9.54 // inputs for new nodes 9.55 m1->add_req(NULL, n_toc); 9.56 m2->add_req(NULL, m1); 9.57 - 9.58 + 9.59 // operands for new nodes 9.60 m1->_opnds[0] = new (C) iRegPdstOper(); // dst 9.61 m1->_opnds[1] = op_src; // src 9.62 @@ -2760,29 +2761,29 @@ 9.63 m2->_opnds[0] = new (C) iRegPdstOper(); // dst 9.64 m2->_opnds[1] = op_src; // src 9.65 m2->_opnds[2] = new (C) iRegLdstOper(); // base 9.66 - 9.67 + 9.68 // Initialize ins_attrib TOC fields. 9.69 m1->_const_toc_offset = -1; 9.70 m2->_const_toc_offset_hi_node = m1; 9.71 - 9.72 + 9.73 // Register allocation for new nodes. 9.74 ra_->set_pair(m1->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); 9.75 ra_->set_pair(m2->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); 9.76 - 9.77 + 9.78 nodes->push(m1); 9.79 nodes->push(m2); 9.80 assert(m2->bottom_type()->isa_ptr(), "must be ptr"); 9.81 } else { 9.82 loadConPNode *m2 = new (C) loadConPNode(); 9.83 - 9.84 + 9.85 // inputs for new nodes 9.86 m2->add_req(NULL, n_toc); 9.87 - 9.88 + 9.89 // operands for new nodes 9.90 m2->_opnds[0] = new (C) iRegPdstOper(); // dst 9.91 m2->_opnds[1] = op_src; // src 9.92 m2->_opnds[2] = new (C) iRegPdstOper(); // toc 9.93 - 9.94 + 9.95 // Register allocation for new nodes. 
9.96 ra_->set_pair(m2->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); 9.97 9.98 @@ -2974,17 +2975,17 @@ 9.99 n_sub_base->_opnds[1] = op_crx; 9.100 n_sub_base->_opnds[2] = op_src; 9.101 n_sub_base->_bottom_type = _bottom_type; 9.102 - 9.103 + 9.104 n_shift->add_req(n_region, n_sub_base); 9.105 n_shift->_opnds[0] = op_dst; 9.106 n_shift->_opnds[1] = op_dst; 9.107 n_shift->_bottom_type = _bottom_type; 9.108 - 9.109 + 9.110 ra_->set_pair(n_shift->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); 9.111 ra_->set_pair(n_compare->_idx, ra_->get_reg_second(n_crx), ra_->get_reg_first(n_crx)); 9.112 ra_->set_pair(n_sub_base->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); 9.113 ra_->set_pair(n_move->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); 9.114 - 9.115 + 9.116 nodes->push(n_move); 9.117 nodes->push(n_compare); 9.118 nodes->push(n_sub_base); 9.119 @@ -3061,20 +3062,20 @@ 9.120 } else { 9.121 // before Power 7 9.122 cond_add_baseNode *n_add_base = new (C) cond_add_baseNode(); 9.123 - 9.124 + 9.125 n_add_base->add_req(n_region, n_compare, n_shift); 9.126 n_add_base->_opnds[0] = op_dst; 9.127 n_add_base->_opnds[1] = op_crx; 9.128 n_add_base->_opnds[2] = op_dst; 9.129 n_add_base->_bottom_type = _bottom_type; 9.130 - 9.131 + 9.132 assert(ra_->is_oop(this) == true, "A decodeN node must produce an oop!"); 9.133 ra_->set_oop(n_add_base, true); 9.134 - 9.135 + 9.136 ra_->set_pair(n_shift->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); 9.137 ra_->set_pair(n_compare->_idx, ra_->get_reg_second(n_crx), ra_->get_reg_first(n_crx)); 9.138 ra_->set_pair(n_add_base->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); 9.139 - 9.140 + 9.141 nodes->push(n_compare); 9.142 nodes->push(n_shift); 9.143 nodes->push(n_add_base); 9.144 @@ -3631,11 +3632,11 @@ 9.145 // Req... 9.146 for (uint i = 0; i < req(); ++i) { 9.147 // The expanded node does not need toc any more. 9.148 - // Add the inline cache constant here instead. 
This expresses the 9.149 + // Add the inline cache constant here instead. This expresses the 9.150 // register of the inline cache must be live at the call. 9.151 // Else we would have to adapt JVMState by -1. 9.152 if (i == mach_constant_base_node_input()) { 9.153 - call->add_req(loadConLNodes_IC._last); 9.154 + call->add_req(loadConLNodes_IC._last); 9.155 } else { 9.156 call->add_req(in(i)); 9.157 } 9.158 @@ -3663,6 +3664,8 @@ 9.159 %} 9.160 9.161 // Compound version of call dynamic 9.162 + // Toc is only passed so that it can be used in ins_encode statement. 9.163 + // In the code we have to use $constanttablebase. 9.164 enc_class enc_java_dynamic_call(method meth, iRegLdst toc) %{ 9.165 // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.166 MacroAssembler _masm(&cbuf); 9.167 @@ -3670,14 +3673,17 @@ 9.168 9.169 Register Rtoc = (ra_) ? $constanttablebase : R2_TOC; 9.170 #if 0 9.171 + int vtable_index = this->_vtable_index; 9.172 if (_vtable_index < 0) { 9.173 // Must be invalid_vtable_index, not nonvirtual_vtable_index. 9.174 assert(_vtable_index == Method::invalid_vtable_index, "correct sentinel value"); 9.175 Register ic_reg = as_Register(Matcher::inline_cache_reg_encode()); 9.176 - AddressLiteral meta = __ allocate_metadata_address((Metadata *)Universe::non_oop_word()); 9.177 - 9.178 + 9.179 + // Virtual call relocation will point to ic load. 9.180 address virtual_call_meta_addr = __ pc(); 9.181 - __ load_const_from_method_toc(ic_reg, meta, Rtoc); 9.182 + // Load a clear inline cache. 9.183 + AddressLiteral empty_ic((address) Universe::non_oop_word()); 9.184 + __ load_const_from_method_toc(ic_reg, empty_ic, Rtoc); 9.185 // CALL to fixup routine. Fixup routine uses ScopeDesc info 9.186 // to determine who we intended to call. 
9.187 __ relocate(virtual_call_Relocation::spec(virtual_call_meta_addr)); 9.188 @@ -3710,7 +3716,6 @@ 9.189 "Fix constant in ret_addr_offset()"); 9.190 } 9.191 #endif 9.192 - guarantee(0, "Fix handling of toc edge: messes up derived/base pairs."); 9.193 Unimplemented(); // ret_addr_offset not yet fixed. Depends on compressed oops (load klass!). 9.194 %} 9.195 9.196 @@ -5436,7 +5441,7 @@ 9.197 ins_pipe(pipe_class_memory); 9.198 %} 9.199 9.200 -// Match loading integer and casting it to unsigned int in 9.201 +// Match loading integer and casting it to unsigned int in 9.202 // long register. 9.203 // LoadI + ConvI2L + AndL 0xffffffff. 9.204 instruct loadUI2L(iRegLdst dst, memory mem, immL_32bits mask) %{ 9.205 @@ -6078,7 +6083,7 @@ 9.206 ins_pipe(pipe_class_default); 9.207 %} 9.208 9.209 -// This needs a match rule so that build_oop_map knows this is 9.210 +// This needs a match rule so that build_oop_map knows this is 9.211 // not a narrow oop. 9.212 instruct loadConNKlass_lo(iRegNdst dst, immNKlass_NM src1, iRegNsrc src2) %{ 9.213 match(Set dst src1); 9.214 @@ -6702,7 +6707,7 @@ 9.215 size(4); 9.216 ins_encode %{ 9.217 // This is a Power7 instruction for which no machine description exists. 9.218 - // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.219 + // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.220 __ isel_0($dst$$Register, $crx$$CondRegister, Assembler::equal, $src1$$Register); 9.221 %} 9.222 ins_pipe(pipe_class_default); 9.223 @@ -6847,7 +6852,7 @@ 9.224 size(4); 9.225 ins_encode %{ 9.226 // This is a Power7 instruction for which no machine description exists. 
9.227 - // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.228 + // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.229 __ isel_0($dst$$Register, $crx$$CondRegister, Assembler::equal, $src1$$Register); 9.230 %} 9.231 ins_pipe(pipe_class_default); 9.232 @@ -7064,7 +7069,7 @@ 9.233 n1->_bottom_type = _bottom_type; 9.234 9.235 decodeNKlass_shiftNode *n2 = new (C) decodeNKlass_shiftNode(); 9.236 - n2->add_req(n_region, n2); 9.237 + n2->add_req(n_region, n1); 9.238 n2->_opnds[0] = op_dst; 9.239 n2->_opnds[1] = op_dst; 9.240 n2->_bottom_type = _bottom_type; 9.241 @@ -7199,7 +7204,7 @@ 9.242 // inline_unsafe_load_store). 9.243 // 9.244 // Add this node again if we found a good solution for inline_unsafe_load_store(). 9.245 -// Don't forget to look at the implementation of post_store_load_barrier again, 9.246 +// Don't forget to look at the implementation of post_store_load_barrier again, 9.247 // we did other fixes in that method. 9.248 //instruct unnecessary_membar_volatile() %{ 9.249 // match(MemBarVolatile); 9.250 @@ -7237,7 +7242,7 @@ 9.251 // exists. Anyways, the scheduler should be off on Power7. 9.252 // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.253 int cc = $cmp$$cmpcode; 9.254 - __ isel($dst$$Register, $crx$$CondRegister, 9.255 + __ isel($dst$$Register, $crx$$CondRegister, 9.256 (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); 9.257 %} 9.258 ins_pipe(pipe_class_default); 9.259 @@ -7283,7 +7288,7 @@ 9.260 // exists. Anyways, the scheduler should be off on Power7. 9.261 // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.262 int cc = $cmp$$cmpcode; 9.263 - __ isel($dst$$Register, $crx$$CondRegister, 9.264 + __ isel($dst$$Register, $crx$$CondRegister, 9.265 (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); 9.266 %} 9.267 ins_pipe(pipe_class_default); 9.268 @@ -7329,7 +7334,7 @@ 9.269 // exists. Anyways, the scheduler should be off on Power7. 
9.270 // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.271 int cc = $cmp$$cmpcode; 9.272 - __ isel($dst$$Register, $crx$$CondRegister, 9.273 + __ isel($dst$$Register, $crx$$CondRegister, 9.274 (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); 9.275 %} 9.276 ins_pipe(pipe_class_default); 9.277 @@ -7376,7 +7381,7 @@ 9.278 // exists. Anyways, the scheduler should be off on Power7. 9.279 // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.280 int cc = $cmp$$cmpcode; 9.281 - __ isel($dst$$Register, $crx$$CondRegister, 9.282 + __ isel($dst$$Register, $crx$$CondRegister, 9.283 (Assembler::Condition)(cc & 3), /*invert*/((~cc) & 8), $src$$Register); 9.284 %} 9.285 ins_pipe(pipe_class_default); 9.286 @@ -7522,8 +7527,8 @@ 9.287 ins_encode %{ 9.288 // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.289 // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'. 9.290 - __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register, 9.291 - MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(), 9.292 + __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register, 9.293 + MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(), 9.294 $res$$Register, true); 9.295 %} 9.296 ins_pipe(pipe_class_default); 9.297 @@ -7929,7 +7934,23 @@ 9.298 9.299 // Turn the sign-bit of a long into a 64-bit mask, 0x0...0 for 9.300 // positive longs and 0xF...F for negative ones. 
9.301 -instruct signmask64I_regI(iRegIdst dst, iRegIsrc src) %{ 9.302 +instruct signmask64I_regL(iRegIdst dst, iRegLsrc src) %{ 9.303 + // no match-rule, false predicate 9.304 + effect(DEF dst, USE src); 9.305 + predicate(false); 9.306 + 9.307 + format %{ "SRADI $dst, $src, #63" %} 9.308 + size(4); 9.309 + ins_encode %{ 9.310 + // TODO: PPC port $archOpcode(ppc64Opcode_sradi); 9.311 + __ sradi($dst$$Register, $src$$Register, 0x3f); 9.312 + %} 9.313 + ins_pipe(pipe_class_default); 9.314 +%} 9.315 + 9.316 +// Turn the sign-bit of a long into a 64-bit mask, 0x0...0 for 9.317 +// positive longs and 0xF...F for negative ones. 9.318 +instruct signmask64L_regL(iRegLdst dst, iRegLsrc src) %{ 9.319 // no match-rule, false predicate 9.320 effect(DEF dst, USE src); 9.321 predicate(false); 9.322 @@ -8893,7 +8914,7 @@ 9.323 size(4); 9.324 ins_encode %{ 9.325 // TODO: PPC port $archOpcode(ppc64Opcode_rlwinm); 9.326 - __ rlwinm($dst$$Register, $src1$$Register, 0, 9.327 + __ rlwinm($dst$$Register, $src1$$Register, 0, 9.328 (31-log2_long((jlong) $src2$$constant)) & 0x1f, (31-log2_long((jlong) $src2$$constant)) & 0x1f); 9.329 %} 9.330 ins_pipe(pipe_class_default); 9.331 @@ -9619,14 +9640,14 @@ 9.332 ins_cost(DEFAULT_COST*4); 9.333 9.334 expand %{ 9.335 - iRegIdst src1s; 9.336 - iRegIdst src2s; 9.337 - iRegIdst diff; 9.338 - sxtI_reg(src1s, src1); // ensure proper sign extention 9.339 - sxtI_reg(src2s, src2); // ensure proper sign extention 9.340 - subI_reg_reg(diff, src1s, src2s); 9.341 + iRegLdst src1s; 9.342 + iRegLdst src2s; 9.343 + iRegLdst diff; 9.344 + convI2L_reg(src1s, src1); // Ensure proper sign extension. 9.345 + convI2L_reg(src2s, src2); // Ensure proper sign extension. 9.346 + subL_reg_reg(diff, src1s, src2s); 9.347 // Need to consider >=33 bit result, therefore we need signmaskL. 
9.348 - signmask64I_regI(dst, diff); 9.349 + signmask64I_regL(dst, diff); 9.350 %} 9.351 %} 9.352 9.353 @@ -10863,7 +10884,7 @@ 9.354 format %{ "PartialSubtypeCheck $result = ($subklass instanceOf $superklass) tmp: $tmp_klass, $tmp_arrayptr" %} 9.355 ins_encode %{ 9.356 // TODO: PPC port $archOpcode(ppc64Opcode_compound); 9.357 - __ check_klass_subtype_slow_path($subklass$$Register, $superklass$$Register, $tmp_arrayptr$$Register, 9.358 + __ check_klass_subtype_slow_path($subklass$$Register, $superklass$$Register, $tmp_arrayptr$$Register, 9.359 $tmp_klass$$Register, NULL, $result$$Register); 9.360 %} 9.361 ins_pipe(pipe_class_default); 9.362 @@ -11178,18 +11199,18 @@ 9.363 ins_cost(DEFAULT_COST*6); 9.364 9.365 expand %{ 9.366 - iRegIdst src1s; 9.367 - iRegIdst src2s; 9.368 - iRegIdst diff; 9.369 - iRegIdst sm; 9.370 - iRegIdst doz; // difference or zero 9.371 - sxtI_reg(src1s, src1); // Ensure proper sign extention. 9.372 - sxtI_reg(src2s, src2); // Ensure proper sign extention. 9.373 - subI_reg_reg(diff, src2s, src1s); 9.374 + iRegLdst src1s; 9.375 + iRegLdst src2s; 9.376 + iRegLdst diff; 9.377 + iRegLdst sm; 9.378 + iRegLdst doz; // difference or zero 9.379 + convI2L_reg(src1s, src1); // Ensure proper sign extension. 9.380 + convI2L_reg(src2s, src2); // Ensure proper sign extension. 9.381 + subL_reg_reg(diff, src2s, src1s); 9.382 // Need to consider >=33 bit result, therefore we need signmaskL. 
9.383 - signmask64I_regI(sm, diff); 9.384 - andI_reg_reg(doz, diff, sm); // <=0 9.385 - addI_reg_reg(dst, doz, src1s); 9.386 + signmask64L_regL(sm, diff); 9.387 + andL_reg_reg(doz, diff, sm); // <=0 9.388 + addI_regL_regL(dst, doz, src1s); 9.389 %} 9.390 %} 9.391 9.392 @@ -11198,19 +11219,18 @@ 9.393 ins_cost(DEFAULT_COST*6); 9.394 9.395 expand %{ 9.396 - immI_minus1 m1 %{ -1 %} 9.397 - iRegIdst src1s; 9.398 - iRegIdst src2s; 9.399 - iRegIdst diff; 9.400 - iRegIdst sm; 9.401 - iRegIdst doz; // difference or zero 9.402 - sxtI_reg(src1s, src1); // Ensure proper sign extention. 9.403 - sxtI_reg(src2s, src2); // Ensure proper sign extention. 9.404 - subI_reg_reg(diff, src2s, src1s); 9.405 + iRegLdst src1s; 9.406 + iRegLdst src2s; 9.407 + iRegLdst diff; 9.408 + iRegLdst sm; 9.409 + iRegLdst doz; // difference or zero 9.410 + convI2L_reg(src1s, src1); // Ensure proper sign extension. 9.411 + convI2L_reg(src2s, src2); // Ensure proper sign extension. 9.412 + subL_reg_reg(diff, src2s, src1s); 9.413 // Need to consider >=33 bit result, therefore we need signmaskL. 9.414 - signmask64I_regI(sm, diff); 9.415 - andcI_reg_reg(doz, sm, m1, diff); // >=0 9.416 - addI_reg_reg(dst, doz, src1s); 9.417 + signmask64L_regL(sm, diff); 9.418 + andcL_reg_reg(doz, diff, sm); // >=0 9.419 + addI_regL_regL(dst, doz, src1s); 9.420 %} 9.421 %} 9.422
10.1 --- a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp Wed May 07 10:58:47 2014 -0700 10.2 +++ b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp Thu May 08 23:07:11 2014 -0700 10.3 @@ -81,24 +81,18 @@ 10.4 #if 0 10.5 // Call special ClassCastException constructor taking object to cast 10.6 // and target class as arguments. 10.7 -address TemplateInterpreterGenerator::generate_ClassCastException_verbose_handler(const char* name) { 10.8 +address TemplateInterpreterGenerator::generate_ClassCastException_verbose_handler() { 10.9 address entry = __ pc(); 10.10 10.11 - // Target class oop is in register R6_ARG4 by convention! 10.12 - 10.13 // Expression stack must be empty before entering the VM if an 10.14 // exception happened. 10.15 __ empty_expression_stack(); 10.16 - // Setup parameters. 10.17 + 10.18 // Thread will be loaded to R3_ARG1. 10.19 - __ load_const_optimized(R4_ARG2, (address) name); 10.20 - __ mr(R5_ARG3, R17_tos); 10.21 - // R6_ARG4 contains specified class. 10.22 - __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException_verbose)); 10.23 -#ifdef ASSERT 10.24 + // Target class oop is in register R5_ARG3 by convention! 10.25 + __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ClassCastException_verbose, R17_tos, R5_ARG3)); 10.26 // Above call must not return here since exception pending. 10.27 - __ should_not_reach_here(); 10.28 -#endif 10.29 + DEBUG_ONLY(__ should_not_reach_here();) 10.30 return entry; 10.31 } 10.32 #endif 10.33 @@ -1535,14 +1529,32 @@ 10.34 __ stw(R0, in_bytes(JavaThread::popframe_condition_offset()), R16_thread); 10.35 10.36 // Get out of the current method and re-execute the call that called us. 
10.37 - __ merge_frames(/*top_frame_sp*/ R21_sender_SP, /*return_pc*/ return_pc, R11_scratch1, R12_scratch2); 10.38 + __ merge_frames(/*top_frame_sp*/ R21_sender_SP, /*return_pc*/ noreg, R11_scratch1, R12_scratch2); 10.39 __ restore_interpreter_state(R11_scratch1); 10.40 __ ld(R12_scratch2, _ijava_state_neg(top_frame_sp), R11_scratch1); 10.41 __ resize_frame_absolute(R12_scratch2, R11_scratch1, R0); 10.42 - __ mtlr(return_pc); 10.43 if (ProfileInterpreter) { 10.44 __ set_method_data_pointer_for_bcp(); 10.45 } 10.46 +#if INCLUDE_JVMTI 10.47 + Label L_done; 10.48 + 10.49 + __ lbz(R11_scratch1, 0, R14_bcp); 10.50 + __ cmpwi(CCR0, R11_scratch1, Bytecodes::_invokestatic); 10.51 + __ bne(CCR0, L_done); 10.52 + 10.53 + // The member name argument must be restored if _invokestatic is re-executed after a PopFrame call. 10.54 + // Detect such a case in the InterpreterRuntime function and return the member name argument, or NULL. 10.55 + __ ld(R4_ARG2, 0, R18_locals); 10.56 + __ call_VM(R11_scratch1, CAST_FROM_FN_PTR(address, InterpreterRuntime::member_name_arg_or_null), 10.57 + R4_ARG2, R19_method, R14_bcp); 10.58 + 10.59 + __ cmpdi(CCR0, R11_scratch1, 0); 10.60 + __ beq(CCR0, L_done); 10.61 + 10.62 + __ std(R11_scratch1, wordSize, R15_esp); 10.63 + __ bind(L_done); 10.64 +#endif // INCLUDE_JVMTI 10.65 __ dispatch_next(vtos); 10.66 } 10.67 // end of JVMTI PopFrame support
11.1 --- a/src/cpu/ppc/vm/templateTable_ppc_64.cpp Wed May 07 10:58:47 2014 -0700 11.2 +++ b/src/cpu/ppc/vm/templateTable_ppc_64.cpp Thu May 08 23:07:11 2014 -0700 11.3 @@ -64,7 +64,7 @@ 11.4 assert_different_registers(Rtmp1, Rtmp2, Rtmp3, Rval, Rbase); 11.5 11.6 switch (barrier) { 11.7 -#ifndef SERIALGC 11.8 +#if INCLUDE_ALL_GCS 11.9 case BarrierSet::G1SATBCT: 11.10 case BarrierSet::G1SATBCTLogging: 11.11 { 11.12 @@ -104,7 +104,7 @@ 11.13 __ bind(Ldone); 11.14 } 11.15 break; 11.16 -#endif // SERIALGC 11.17 +#endif // INCLUDE_ALL_GCS 11.18 case BarrierSet::CardTableModRef: 11.19 case BarrierSet::CardTableExtension: 11.20 { 11.21 @@ -259,17 +259,17 @@ 11.22 switch (value) { 11.23 default: ShouldNotReachHere(); 11.24 case 0: { 11.25 - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0); 11.26 + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0, true); 11.27 __ lfs(F15_ftos, simm16_offset, R11_scratch1); 11.28 break; 11.29 } 11.30 case 1: { 11.31 - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0); 11.32 + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0, true); 11.33 __ lfs(F15_ftos, simm16_offset, R11_scratch1); 11.34 break; 11.35 } 11.36 case 2: { 11.37 - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&two, R0); 11.38 + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&two, R0, true); 11.39 __ lfs(F15_ftos, simm16_offset, R11_scratch1); 11.40 break; 11.41 } 11.42 @@ -282,12 +282,12 @@ 11.43 static double one = 1.0; 11.44 switch (value) { 11.45 case 0: { 11.46 - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0); 11.47 + int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&zero, R0, true); 11.48 __ lfd(F15_ftos, simm16_offset, R11_scratch1); 11.49 break; 11.50 } 11.51 case 1: { 11.52 - int simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0); 11.53 + int 
simm16_offset = __ load_const_optimized(R11_scratch1, (address*)&one, R0, true); 11.54 __ lfd(F15_ftos, simm16_offset, R11_scratch1); 11.55 break; 11.56 } 11.57 @@ -3728,9 +3728,9 @@ 11.58 transition(atos, atos); 11.59 11.60 Label Ldone, Lis_null, Lquicked, Lresolved; 11.61 - Register Roffset = R5_ARG3, 11.62 + Register Roffset = R6_ARG4, 11.63 RobjKlass = R4_ARG2, 11.64 - RspecifiedKlass = R6_ARG4, // Generate_ClassCastException_verbose_handler will expect this register. 11.65 + RspecifiedKlass = R5_ARG3, // Generate_ClassCastException_verbose_handler will read value from this register. 11.66 Rcpool = R11_scratch1, 11.67 Rtags = R12_scratch2; 11.68
12.1 --- a/src/cpu/sparc/vm/assembler_sparc.hpp Wed May 07 10:58:47 2014 -0700 12.2 +++ b/src/cpu/sparc/vm/assembler_sparc.hpp Thu May 08 23:07:11 2014 -0700 12.3 @@ -1,5 +1,5 @@ 12.4 /* 12.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. 12.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. 12.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 12.8 * 12.9 * This code is free software; you can redistribute it and/or modify it 12.10 @@ -123,8 +123,13 @@ 12.11 fpop2_op3 = 0x35, 12.12 impdep1_op3 = 0x36, 12.13 aes3_op3 = 0x36, 12.14 + alignaddr_op3 = 0x36, 12.15 + faligndata_op3 = 0x36, 12.16 flog3_op3 = 0x36, 12.17 + edge_op3 = 0x36, 12.18 + fsrc_op3 = 0x36, 12.19 impdep2_op3 = 0x37, 12.20 + stpartialf_op3 = 0x37, 12.21 jmpl_op3 = 0x38, 12.22 rett_op3 = 0x39, 12.23 trap_op3 = 0x3a, 12.24 @@ -175,17 +180,23 @@ 12.25 12.26 enum opfs { 12.27 // selected opfs 12.28 + edge8n_opf = 0x01, 12.29 + 12.30 fmovs_opf = 0x01, 12.31 fmovd_opf = 0x02, 12.32 12.33 fnegs_opf = 0x05, 12.34 fnegd_opf = 0x06, 12.35 12.36 + alignaddr_opf = 0x18, 12.37 + 12.38 fadds_opf = 0x41, 12.39 faddd_opf = 0x42, 12.40 fsubs_opf = 0x45, 12.41 fsubd_opf = 0x46, 12.42 12.43 + faligndata_opf = 0x48, 12.44 + 12.45 fmuls_opf = 0x49, 12.46 fmuld_opf = 0x4a, 12.47 fdivs_opf = 0x4d, 12.48 @@ -348,6 +359,8 @@ 12.49 ASI_PRIMARY = 0x80, 12.50 ASI_PRIMARY_NOFAULT = 0x82, 12.51 ASI_PRIMARY_LITTLE = 0x88, 12.52 + // 8x8-bit partial store 12.53 + ASI_PST8_PRIMARY = 0xC0, 12.54 // Block initializing store 12.55 ASI_ST_BLKINIT_PRIMARY = 0xE2, 12.56 // Most-Recently-Used (MRU) BIS variant 12.57 @@ -585,6 +598,9 @@ 12.58 // instruction only in VIS1 12.59 static void vis1_only() { assert( VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); } 12.60 12.61 + // instruction only in VIS2 12.62 + static void vis2_only() { assert( VM_Version::has_vis2(), "This instruction only works on SPARC with VIS2"); } 12.63 + 12.64 // 
instruction only in VIS3 12.65 static void vis3_only() { assert( VM_Version::has_vis3(), "This instruction only works on SPARC with VIS3"); } 12.66 12.67 @@ -1164,6 +1180,20 @@ 12.68 inline void wrfprs( Register d) { v9_only(); emit_int32( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); } 12.69 12.70 12.71 + // VIS1 instructions 12.72 + 12.73 + void alignaddr( Register s1, Register s2, Register d ) { vis1_only(); emit_int32( op(arith_op) | rd(d) | op3(alignaddr_op3) | rs1(s1) | opf(alignaddr_opf) | rs2(s2)); } 12.74 + 12.75 + void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); } 12.76 + 12.77 + void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); } 12.78 + 12.79 + void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); } 12.80 + 12.81 + // VIS2 instructions 12.82 + 12.83 + void edge8n( Register s1, Register s2, Register d ) { vis2_only(); emit_int32( op(arith_op) | rd(d) | op3(edge_op3) | rs1(s1) | opf(edge8n_opf) | rs2(s2)); } 12.84 + 12.85 // VIS3 instructions 12.86 12.87 void movstosw( FloatRegister s, Register d ) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstosw_opf) | fs2(s, FloatRegisterImpl::S)); }
13.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed May 07 10:58:47 2014 -0700 13.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp Thu May 08 23:07:11 2014 -0700 13.3 @@ -1,5 +1,5 @@ 13.4 /* 13.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. 13.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. 13.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 13.8 * 13.9 * This code is free software; you can redistribute it and/or modify it 13.10 @@ -3305,9 +3305,12 @@ 13.11 } 13.12 13.13 address generate_aescrypt_encryptBlock() { 13.14 + // required since we read expanded key 'int' array starting first element without alignment considerations 13.15 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 13.16 + "the following code assumes that first element of an int array is aligned to 8 bytes"); 13.17 __ align(CodeEntryAlignment); 13.18 - StubCodeMark mark(this, "StubRoutines", "aesencryptBlock"); 13.19 - Label L_doLast128bit, L_storeOutput; 13.20 + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 13.21 + Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output; 13.22 address start = __ pc(); 13.23 Register from = O0; // source byte array 13.24 Register to = O1; // destination byte array 13.25 @@ -3317,15 +3320,33 @@ 13.26 // read expanded key length 13.27 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 13.28 13.29 - // load input into F54-F56; F30-F31 used as temp 13.30 - __ ldf(FloatRegisterImpl::S, from, 0, F30); 13.31 - __ ldf(FloatRegisterImpl::S, from, 4, F31); 13.32 - __ fmov(FloatRegisterImpl::D, F30, F54); 13.33 - __ ldf(FloatRegisterImpl::S, from, 8, F30); 13.34 - __ ldf(FloatRegisterImpl::S, from, 12, F31); 13.35 - __ fmov(FloatRegisterImpl::D, F30, F56); 13.36 - 13.37 - // load expanded key 13.38 + // Method to address arbitrary 
alignment for load instructions: 13.39 + // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary 13.40 + // If zero/aligned then continue with double FP load instructions 13.41 + // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata 13.42 + // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address 13.43 + // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address 13.44 + // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs 13.45 + 13.46 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 13.47 + __ andcc(from, 7, G0); 13.48 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 13.49 + __ delayed()->alignaddr(from, G0, from); 13.50 + 13.51 + // aligned case: load input into F54-F56 13.52 + __ ldf(FloatRegisterImpl::D, from, 0, F54); 13.53 + __ ldf(FloatRegisterImpl::D, from, 8, F56); 13.54 + __ ba_short(L_load_expanded_key); 13.55 + 13.56 + __ BIND(L_load_misaligned_input); 13.57 + __ ldf(FloatRegisterImpl::D, from, 0, F54); 13.58 + __ ldf(FloatRegisterImpl::D, from, 8, F56); 13.59 + __ ldf(FloatRegisterImpl::D, from, 16, F58); 13.60 + __ faligndata(F54, F56, F54); 13.61 + __ faligndata(F56, F58, F56); 13.62 + 13.63 + __ BIND(L_load_expanded_key); 13.64 + // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed 13.65 for ( int i = 0; i <= 38; i += 2 ) { 13.66 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i)); 13.67 } 13.68 @@ -3365,8 +3386,7 @@ 13.69 __ ldf(FloatRegisterImpl::D, key, 232, F50); 13.70 __ aes_eround01(F52, F54, F56, F58); //round 13 13.71 __ aes_eround23(F46, F54, F56, F60); 13.72 - __ br(Assembler::always, false, Assembler::pt, L_storeOutput); 13.73 - __ delayed()->nop(); 13.74 + __ 
ba_short(L_storeOutput); 13.75 13.76 __ BIND(L_doLast128bit); 13.77 __ ldf(FloatRegisterImpl::D, key, 160, F48); 13.78 @@ -3377,23 +3397,62 @@ 13.79 __ aes_eround01_l(F48, F58, F60, F54); //last round 13.80 __ aes_eround23_l(F50, F58, F60, F56); 13.81 13.82 - // store output into the destination array, F0-F1 used as temp 13.83 - __ fmov(FloatRegisterImpl::D, F54, F0); 13.84 - __ stf(FloatRegisterImpl::S, F0, to, 0); 13.85 - __ stf(FloatRegisterImpl::S, F1, to, 4); 13.86 - __ fmov(FloatRegisterImpl::D, F56, F0); 13.87 - __ stf(FloatRegisterImpl::S, F0, to, 8); 13.88 + // Method to address arbitrary alignment for store instructions: 13.89 + // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary 13.90 + // If zero/aligned then continue with double FP store instructions 13.91 + // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case) 13.92 + // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001 13.93 + // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case 13.94 + // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case. 
13.95 + // Set GSR.align to (8-n) using alignaddr 13.96 + // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf 13.97 + // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address 13.98 + // Store (partial) the original first (8-n) bytes starting at the original 'dest' address 13.99 + // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address 13.100 + // We need to execute this process for both the 8-byte result values 13.101 + 13.102 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 13.103 + __ andcc(to, 7, O5); 13.104 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 13.105 + __ delayed()->edge8n(to, G0, O3); 13.106 + 13.107 + // aligned case: store output into the destination array 13.108 + __ stf(FloatRegisterImpl::D, F54, to, 0); 13.109 __ retl(); 13.110 - __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); 13.111 + __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8); 13.112 + 13.113 + __ BIND(L_store_misaligned_output); 13.114 + __ add(to, 8, O4); 13.115 + __ mov(8, O2); 13.116 + __ sub(O2, O5, O2); 13.117 + __ alignaddr(O2, G0, O2); 13.118 + __ faligndata(F54, F54, F54); 13.119 + __ faligndata(F56, F56, F56); 13.120 + __ and3(to, -8, to); 13.121 + __ and3(O4, -8, O4); 13.122 + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 13.123 + __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 13.124 + __ add(to, 8, to); 13.125 + __ add(O4, 8, O4); 13.126 + __ orn(G0, O3, O3); 13.127 + __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY); 13.128 + __ retl(); 13.129 + __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY); 13.130 13.131 return start; 13.132 } 13.133 13.134 address generate_aescrypt_decryptBlock() { 13.135 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 13.136 + "the 
following code assumes that first element of an int array is aligned to 8 bytes"); 13.137 + // required since we read original key 'byte' array as well in the decryption stubs 13.138 + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 13.139 + "the following code assumes that first element of a byte array is aligned to 8 bytes"); 13.140 __ align(CodeEntryAlignment); 13.141 - StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock"); 13.142 + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 13.143 address start = __ pc(); 13.144 - Label L_expand192bit, L_expand256bit, L_common_transform; 13.145 + Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input; 13.146 + Label L_256bit_transform, L_common_transform, L_store_misaligned_output; 13.147 Register from = O0; // source byte array 13.148 Register to = O1; // destination byte array 13.149 Register key = O2; // expanded key array 13.150 @@ -3403,15 +3462,29 @@ 13.151 // read expanded key array length 13.152 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 13.153 13.154 - // load input into F52-F54; F30,F31 used as temp 13.155 - __ ldf(FloatRegisterImpl::S, from, 0, F30); 13.156 - __ ldf(FloatRegisterImpl::S, from, 4, F31); 13.157 - __ fmov(FloatRegisterImpl::D, F30, F52); 13.158 - __ ldf(FloatRegisterImpl::S, from, 8, F30); 13.159 - __ ldf(FloatRegisterImpl::S, from, 12, F31); 13.160 - __ fmov(FloatRegisterImpl::D, F30, F54); 13.161 - 13.162 + // save 'from' since we may need to recheck alignment in case of 256-bit decryption 13.163 + __ mov(from, G1); 13.164 + 13.165 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 13.166 + __ andcc(from, 7, G0); 13.167 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input); 13.168 + __ delayed()->alignaddr(from, G0, from); 13.169 + 13.170 + // 
aligned case: load input into F52-F54 13.171 + __ ldf(FloatRegisterImpl::D, from, 0, F52); 13.172 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 13.173 + __ ba_short(L_load_original_key); 13.174 + 13.175 + __ BIND(L_load_misaligned_input); 13.176 + __ ldf(FloatRegisterImpl::D, from, 0, F52); 13.177 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 13.178 + __ ldf(FloatRegisterImpl::D, from, 16, F56); 13.179 + __ faligndata(F52, F54, F52); 13.180 + __ faligndata(F54, F56, F54); 13.181 + 13.182 + __ BIND(L_load_original_key); 13.183 // load original key from SunJCE expanded decryption key 13.184 + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 13.185 for ( int i = 0; i <= 3; i++ ) { 13.186 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 13.187 } 13.188 @@ -3432,8 +3505,7 @@ 13.189 // perform 128-bit key specific inverse cipher transformation 13.190 __ fxor(FloatRegisterImpl::D, F42, F54, F54); 13.191 __ fxor(FloatRegisterImpl::D, F40, F52, F52); 13.192 - __ br(Assembler::always, false, Assembler::pt, L_common_transform); 13.193 - __ delayed()->nop(); 13.194 + __ ba_short(L_common_transform); 13.195 13.196 __ BIND(L_expand192bit); 13.197 13.198 @@ -3457,8 +3529,7 @@ 13.199 __ aes_dround01(F44, F52, F54, F56); 13.200 __ aes_dround23(F42, F56, F58, F54); 13.201 __ aes_dround01(F40, F56, F58, F52); 13.202 - __ br(Assembler::always, false, Assembler::pt, L_common_transform); 13.203 - __ delayed()->nop(); 13.204 + __ ba_short(L_common_transform); 13.205 13.206 __ BIND(L_expand256bit); 13.207 13.208 @@ -3478,14 +3549,31 @@ 13.209 __ aes_kexpand2(F50, F56, F58); 13.210 13.211 for ( int i = 0; i <= 6; i += 2 ) { 13.212 - __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); 13.213 + __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i)); 13.214 } 13.215 13.216 - // load input into F52-F54 13.217 + // reload original 'from' address 13.218 + __ mov(G1, from); 13.219 + 13.220 
+ // re-check 8-byte alignment 13.221 + __ andcc(from, 7, G0); 13.222 + __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input); 13.223 + __ delayed()->alignaddr(from, G0, from); 13.224 + 13.225 + // aligned case: load input into F52-F54 13.226 __ ldf(FloatRegisterImpl::D, from, 0, F52); 13.227 __ ldf(FloatRegisterImpl::D, from, 8, F54); 13.228 + __ ba_short(L_256bit_transform); 13.229 + 13.230 + __ BIND(L_reload_misaligned_input); 13.231 + __ ldf(FloatRegisterImpl::D, from, 0, F52); 13.232 + __ ldf(FloatRegisterImpl::D, from, 8, F54); 13.233 + __ ldf(FloatRegisterImpl::D, from, 16, F56); 13.234 + __ faligndata(F52, F54, F52); 13.235 + __ faligndata(F54, F56, F54); 13.236 13.237 // perform 256-bit key specific inverse cipher transformation 13.238 + __ BIND(L_256bit_transform); 13.239 __ fxor(FloatRegisterImpl::D, F0, F54, F54); 13.240 __ fxor(FloatRegisterImpl::D, F2, F52, F52); 13.241 __ aes_dround23(F4, F52, F54, F58); 13.242 @@ -3515,43 +3603,71 @@ 13.243 } 13.244 } 13.245 13.246 - // store output to destination array, F0-F1 used as temp 13.247 - __ fmov(FloatRegisterImpl::D, F52, F0); 13.248 - __ stf(FloatRegisterImpl::S, F0, to, 0); 13.249 - __ stf(FloatRegisterImpl::S, F1, to, 4); 13.250 - __ fmov(FloatRegisterImpl::D, F54, F0); 13.251 - __ stf(FloatRegisterImpl::S, F0, to, 8); 13.252 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 13.253 + __ andcc(to, 7, O5); 13.254 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output); 13.255 + __ delayed()->edge8n(to, G0, O3); 13.256 + 13.257 + // aligned case: store output into the destination array 13.258 + __ stf(FloatRegisterImpl::D, F52, to, 0); 13.259 __ retl(); 13.260 - __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12); 13.261 + __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8); 13.262 + 13.263 + __ BIND(L_store_misaligned_output); 13.264 + __ add(to, 8, O4); 13.265 + __ mov(8, O2); 13.266 + __ sub(O2, O5, 
O2); 13.267 + __ alignaddr(O2, G0, O2); 13.268 + __ faligndata(F52, F52, F52); 13.269 + __ faligndata(F54, F54, F54); 13.270 + __ and3(to, -8, to); 13.271 + __ and3(O4, -8, O4); 13.272 + __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 13.273 + __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 13.274 + __ add(to, 8, to); 13.275 + __ add(O4, 8, O4); 13.276 + __ orn(G0, O3, O3); 13.277 + __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY); 13.278 + __ retl(); 13.279 + __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY); 13.280 13.281 return start; 13.282 } 13.283 13.284 address generate_cipherBlockChaining_encryptAESCrypt() { 13.285 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 13.286 + "the following code assumes that first element of an int array is aligned to 8 bytes"); 13.287 + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 13.288 + "the following code assumes that first element of a byte array is aligned to 8 bytes"); 13.289 __ align(CodeEntryAlignment); 13.290 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 13.291 - Label L_cbcenc128, L_cbcenc192, L_cbcenc256; 13.292 + Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit; 13.293 + Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform; 13.294 + Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit; 13.295 + Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit; 13.296 address start = __ pc(); 13.297 - Register from = O0; // source byte array 13.298 - Register to = O1; // destination byte array 13.299 - Register key = O2; // expanded key array 13.300 - Register rvec = O3; // init vector 13.301 - const Register len_reg = O4; // cipher length 13.302 - const Register keylen = O5; // reg for storing expanded key array length 13.303 - 13.304 - // 
save cipher len to return in the end 13.305 - __ mov(len_reg, L1); 13.306 + Register from = I0; // source byte array 13.307 + Register to = I1; // destination byte array 13.308 + Register key = I2; // expanded key array 13.309 + Register rvec = I3; // init vector 13.310 + const Register len_reg = I4; // cipher length 13.311 + const Register keylen = I5; // reg for storing expanded key array length 13.312 + 13.313 + // save cipher len before save_frame, to return in the end 13.314 + __ mov(O4, L0); 13.315 + __ save_frame(0); 13.316 13.317 // read expanded key length 13.318 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0); 13.319 13.320 - // load init vector 13.321 + // load initial vector, 8-byte alignment is guranteed 13.322 __ ldf(FloatRegisterImpl::D, rvec, 0, F60); 13.323 __ ldf(FloatRegisterImpl::D, rvec, 8, F62); 13.324 + // load key, 8-byte alignment is guranteed 13.325 __ ldx(key,0,G1); 13.326 - __ ldx(key,8,G2); 13.327 - 13.328 - // start loading expanded key 13.329 + __ ldx(key,8,G5); 13.330 + 13.331 + // start loading expanded key, 8-byte alignment is guranteed 13.332 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) { 13.333 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i)); 13.334 } 13.335 @@ -3571,15 +3687,35 @@ 13.336 } 13.337 13.338 // 256-bit original key size 13.339 - __ br(Assembler::always, false, Assembler::pt, L_cbcenc256); 13.340 - __ delayed()->nop(); 13.341 + __ ba_short(L_cbcenc256); 13.342 13.343 __ align(OptoLoopAlignment); 13.344 __ BIND(L_cbcenc128); 13.345 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 13.346 + __ andcc(from, 7, G0); 13.347 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit); 13.348 + __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 13.349 + 13.350 + // aligned case: load input into G3 and G4 13.351 __ ldx(from,0,G3); 
13.352 __ ldx(from,8,G4); 13.353 + __ ba_short(L_128bit_transform); 13.354 + 13.355 + __ BIND(L_load_misaligned_input_128bit); 13.356 + // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 13.357 + __ alignaddr(from, G0, from); 13.358 + __ ldf(FloatRegisterImpl::D, from, 0, F48); 13.359 + __ ldf(FloatRegisterImpl::D, from, 8, F50); 13.360 + __ ldf(FloatRegisterImpl::D, from, 16, F52); 13.361 + __ faligndata(F48, F50, F48); 13.362 + __ faligndata(F50, F52, F50); 13.363 + __ movdtox(F48, G3); 13.364 + __ movdtox(F50, G4); 13.365 + __ mov(L1, from); 13.366 + 13.367 + __ BIND(L_128bit_transform); 13.368 __ xor3(G1,G3,G3); 13.369 - __ xor3(G2,G4,G4); 13.370 + __ xor3(G5,G4,G4); 13.371 __ movxtod(G3,F56); 13.372 __ movxtod(G4,F58); 13.373 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 13.374 @@ -3598,24 +3734,81 @@ 13.375 } 13.376 } 13.377 13.378 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 13.379 + __ andcc(to, 7, L1); 13.380 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit); 13.381 + __ delayed()->edge8n(to, G0, L2); 13.382 + 13.383 + // aligned case: store output into the destination array 13.384 __ stf(FloatRegisterImpl::D, F60, to, 0); 13.385 __ stf(FloatRegisterImpl::D, F62, to, 8); 13.386 + __ ba_short(L_check_loop_end_128bit); 13.387 + 13.388 + __ BIND(L_store_misaligned_output_128bit); 13.389 + __ add(to, 8, L3); 13.390 + __ mov(8, L4); 13.391 + __ sub(L4, L1, L4); 13.392 + __ alignaddr(L4, G0, L4); 13.393 + // save cipher text before circular right shift 13.394 + // as it needs to be stored as iv for next block (see code before next retl) 13.395 + __ movdtox(F60, L6); 13.396 + __ movdtox(F62, L7); 13.397 + __ faligndata(F60, F60, F60); 13.398 + __ faligndata(F62, F62, F62); 13.399 + __ mov(to, L5); 13.400 + __ and3(to, -8, to); 13.401 + __ and3(L3, -8, L3); 13.402 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 13.403 + 
__ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 13.404 + __ add(to, 8, to); 13.405 + __ add(L3, 8, L3); 13.406 + __ orn(G0, L2, L2); 13.407 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 13.408 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 13.409 + __ mov(L5, to); 13.410 + __ movxtod(L6, F60); 13.411 + __ movxtod(L7, F62); 13.412 + 13.413 + __ BIND(L_check_loop_end_128bit); 13.414 __ add(from, 16, from); 13.415 __ add(to, 16, to); 13.416 __ subcc(len_reg, 16, len_reg); 13.417 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128); 13.418 __ delayed()->nop(); 13.419 + // re-init intial vector for next block, 8-byte alignment is guaranteed 13.420 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 13.421 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 13.422 + __ restore(); 13.423 __ retl(); 13.424 - __ delayed()->mov(L1, O0); 13.425 + __ delayed()->mov(L0, O0); 13.426 13.427 __ align(OptoLoopAlignment); 13.428 __ BIND(L_cbcenc192); 13.429 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 13.430 + __ andcc(from, 7, G0); 13.431 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit); 13.432 + __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 13.433 + 13.434 + // aligned case: load input into G3 and G4 13.435 __ ldx(from,0,G3); 13.436 __ ldx(from,8,G4); 13.437 + __ ba_short(L_192bit_transform); 13.438 + 13.439 + __ BIND(L_load_misaligned_input_192bit); 13.440 + // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption 13.441 + __ alignaddr(from, G0, from); 13.442 + __ ldf(FloatRegisterImpl::D, from, 0, F48); 13.443 + __ ldf(FloatRegisterImpl::D, from, 8, F50); 13.444 + __ ldf(FloatRegisterImpl::D, from, 16, F52); 13.445 + __ faligndata(F48, F50, F48); 13.446 + __ faligndata(F50, F52, F50); 13.447 + __ movdtox(F48, G3); 13.448 + __ movdtox(F50, G4); 13.449 + __ mov(L1, from); 13.450 + 13.451 + __ 
BIND(L_192bit_transform); 13.452 __ xor3(G1,G3,G3); 13.453 - __ xor3(G2,G4,G4); 13.454 + __ xor3(G5,G4,G4); 13.455 __ movxtod(G3,F56); 13.456 __ movxtod(G4,F58); 13.457 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 13.458 @@ -3634,24 +3827,81 @@ 13.459 } 13.460 } 13.461 13.462 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 13.463 + __ andcc(to, 7, L1); 13.464 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit); 13.465 + __ delayed()->edge8n(to, G0, L2); 13.466 + 13.467 + // aligned case: store output into the destination array 13.468 __ stf(FloatRegisterImpl::D, F60, to, 0); 13.469 __ stf(FloatRegisterImpl::D, F62, to, 8); 13.470 + __ ba_short(L_check_loop_end_192bit); 13.471 + 13.472 + __ BIND(L_store_misaligned_output_192bit); 13.473 + __ add(to, 8, L3); 13.474 + __ mov(8, L4); 13.475 + __ sub(L4, L1, L4); 13.476 + __ alignaddr(L4, G0, L4); 13.477 + __ movdtox(F60, L6); 13.478 + __ movdtox(F62, L7); 13.479 + __ faligndata(F60, F60, F60); 13.480 + __ faligndata(F62, F62, F62); 13.481 + __ mov(to, L5); 13.482 + __ and3(to, -8, to); 13.483 + __ and3(L3, -8, L3); 13.484 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 13.485 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 13.486 + __ add(to, 8, to); 13.487 + __ add(L3, 8, L3); 13.488 + __ orn(G0, L2, L2); 13.489 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 13.490 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 13.491 + __ mov(L5, to); 13.492 + __ movxtod(L6, F60); 13.493 + __ movxtod(L7, F62); 13.494 + 13.495 + __ BIND(L_check_loop_end_192bit); 13.496 __ add(from, 16, from); 13.497 __ subcc(len_reg, 16, len_reg); 13.498 __ add(to, 16, to); 13.499 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192); 13.500 __ delayed()->nop(); 13.501 + // re-init intial vector for next block, 8-byte alignment is guaranteed 13.502 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 13.503 __ 
stf(FloatRegisterImpl::D, F62, rvec, 8); 13.504 + __ restore(); 13.505 __ retl(); 13.506 - __ delayed()->mov(L1, O0); 13.507 + __ delayed()->mov(L0, O0); 13.508 13.509 __ align(OptoLoopAlignment); 13.510 __ BIND(L_cbcenc256); 13.511 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 13.512 + __ andcc(from, 7, G0); 13.513 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit); 13.514 + __ delayed()->mov(from, L1); // save original 'from' address before alignaddr 13.515 + 13.516 + // aligned case: load input into G3 and G4 13.517 __ ldx(from,0,G3); 13.518 __ ldx(from,8,G4); 13.519 + __ ba_short(L_256bit_transform); 13.520 + 13.521 + __ BIND(L_load_misaligned_input_256bit); 13.522 + // cannot clobber F48, F50 and F52. F56, F58 can be used though 13.523 + __ alignaddr(from, G0, from); 13.524 + __ movdtox(F60, L2); // save F60 before overwriting 13.525 + __ ldf(FloatRegisterImpl::D, from, 0, F56); 13.526 + __ ldf(FloatRegisterImpl::D, from, 8, F58); 13.527 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 13.528 + __ faligndata(F56, F58, F56); 13.529 + __ faligndata(F58, F60, F58); 13.530 + __ movdtox(F56, G3); 13.531 + __ movdtox(F58, G4); 13.532 + __ mov(L1, from); 13.533 + __ movxtod(L2, F60); 13.534 + 13.535 + __ BIND(L_256bit_transform); 13.536 __ xor3(G1,G3,G3); 13.537 - __ xor3(G2,G4,G4); 13.538 + __ xor3(G5,G4,G4); 13.539 __ movxtod(G3,F56); 13.540 __ movxtod(G4,F58); 13.541 __ fxor(FloatRegisterImpl::D, F60, F56, F60); 13.542 @@ -3670,26 +3920,69 @@ 13.543 } 13.544 } 13.545 13.546 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 13.547 + __ andcc(to, 7, L1); 13.548 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit); 13.549 + __ delayed()->edge8n(to, G0, L2); 13.550 + 13.551 + // aligned case: store output into the destination array 13.552 __ stf(FloatRegisterImpl::D, F60, to, 0); 
13.553 __ stf(FloatRegisterImpl::D, F62, to, 8); 13.554 + __ ba_short(L_check_loop_end_256bit); 13.555 + 13.556 + __ BIND(L_store_misaligned_output_256bit); 13.557 + __ add(to, 8, L3); 13.558 + __ mov(8, L4); 13.559 + __ sub(L4, L1, L4); 13.560 + __ alignaddr(L4, G0, L4); 13.561 + __ movdtox(F60, L6); 13.562 + __ movdtox(F62, L7); 13.563 + __ faligndata(F60, F60, F60); 13.564 + __ faligndata(F62, F62, F62); 13.565 + __ mov(to, L5); 13.566 + __ and3(to, -8, to); 13.567 + __ and3(L3, -8, L3); 13.568 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 13.569 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 13.570 + __ add(to, 8, to); 13.571 + __ add(L3, 8, L3); 13.572 + __ orn(G0, L2, L2); 13.573 + __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY); 13.574 + __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY); 13.575 + __ mov(L5, to); 13.576 + __ movxtod(L6, F60); 13.577 + __ movxtod(L7, F62); 13.578 + 13.579 + __ BIND(L_check_loop_end_256bit); 13.580 __ add(from, 16, from); 13.581 __ subcc(len_reg, 16, len_reg); 13.582 __ add(to, 16, to); 13.583 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256); 13.584 __ delayed()->nop(); 13.585 + // re-init intial vector for next block, 8-byte alignment is guaranteed 13.586 __ stf(FloatRegisterImpl::D, F60, rvec, 0); 13.587 __ stf(FloatRegisterImpl::D, F62, rvec, 8); 13.588 + __ restore(); 13.589 __ retl(); 13.590 - __ delayed()->mov(L1, O0); 13.591 + __ delayed()->mov(L0, O0); 13.592 13.593 return start; 13.594 } 13.595 13.596 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 13.597 + assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0, 13.598 + "the following code assumes that first element of an int array is aligned to 8 bytes"); 13.599 + assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0, 13.600 + "the following code assumes that first element of a byte array is aligned to 8 bytes"); 13.601 __ align(CodeEntryAlignment); 13.602 StubCodeMark mark(this, 
"StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 13.603 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start; 13.604 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256; 13.605 + Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128; 13.606 + Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256; 13.607 + Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128; 13.608 + Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192; 13.609 + Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256; 13.610 address start = __ pc(); 13.611 Register from = I0; // source byte array 13.612 Register to = I1; // destination byte array 13.613 @@ -3704,11 +3997,12 @@ 13.614 __ save_frame(0); //args are read from I* registers since we save the frame in the beginning 13.615 13.616 // load original key from SunJCE expanded decryption key 13.617 + // Since we load original key buffer starting first element, 8-byte alignment is guaranteed 13.618 for ( int i = 0; i <= 3; i++ ) { 13.619 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i)); 13.620 } 13.621 13.622 - // load initial vector 13.623 + // load initial vector, 8-byte alignment is guaranteed 13.624 __ ldx(rvec,0,L0); 13.625 __ ldx(rvec,8,L1); 13.626 13.627 @@ -3733,11 +4027,10 @@ 13.628 __ movdtox(F42,L3); 13.629 13.630 __ and3(len_reg, 16, L4); 13.631 - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128); 13.632 - __ delayed()->nop(); 13.633 - 13.634 - __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); 13.635 - __ delayed()->nop(); 13.636 + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128); 13.637 + __ nop(); 
13.638 + 13.639 + __ ba_short(L_dec_first_block_start); 13.640 13.641 __ BIND(L_expand192bit); 13.642 // load rest of the 192-bit key 13.643 @@ -3758,11 +4051,10 @@ 13.644 __ movdtox(F50,L3); 13.645 13.646 __ and3(len_reg, 16, L4); 13.647 - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192); 13.648 - __ delayed()->nop(); 13.649 - 13.650 - __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start); 13.651 - __ delayed()->nop(); 13.652 + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192); 13.653 + __ nop(); 13.654 + 13.655 + __ ba_short(L_dec_first_block_start); 13.656 13.657 __ BIND(L_expand256bit); 13.658 // load rest of the 256-bit key 13.659 @@ -3785,12 +4077,32 @@ 13.660 __ movdtox(F58,L3); 13.661 13.662 __ and3(len_reg, 16, L4); 13.663 - __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256); 13.664 - __ delayed()->nop(); 13.665 + __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256); 13.666 13.667 __ BIND(L_dec_first_block_start); 13.668 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 13.669 + __ andcc(from, 7, G0); 13.670 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block); 13.671 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 13.672 + 13.673 + // aligned case: load input into L4 and L5 13.674 __ ldx(from,0,L4); 13.675 __ ldx(from,8,L5); 13.676 + __ ba_short(L_transform_first_block); 13.677 + 13.678 + __ BIND(L_load_misaligned_input_first_block); 13.679 + __ alignaddr(from, G0, from); 13.680 + // F58, F60, F62 can be clobbered 13.681 + __ ldf(FloatRegisterImpl::D, from, 0, F58); 13.682 + __ ldf(FloatRegisterImpl::D, from, 8, F60); 13.683 + __ ldf(FloatRegisterImpl::D, from, 16, F62); 13.684 + __ faligndata(F58, F60, F58); 13.685 + __ faligndata(F60, F62, F60); 13.686 + __ movdtox(F58, L4); 13.687 + __ movdtox(F60, L5); 13.688 + __ mov(G1, from); 13.689 + 13.690 + __ 
BIND(L_transform_first_block); 13.691 __ xor3(L2,L4,G1); 13.692 __ movxtod(G1,F60); 13.693 __ xor3(L3,L5,G1); 13.694 @@ -3833,9 +4145,36 @@ 13.695 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 13.696 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 13.697 13.698 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 13.699 + __ andcc(to, 7, G1); 13.700 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block); 13.701 + __ delayed()->edge8n(to, G0, G2); 13.702 + 13.703 + // aligned case: store output into the destination array 13.704 __ stf(FloatRegisterImpl::D, F60, to, 0); 13.705 __ stf(FloatRegisterImpl::D, F62, to, 8); 13.706 - 13.707 + __ ba_short(L_check_decrypt_end); 13.708 + 13.709 + __ BIND(L_store_misaligned_output_first_block); 13.710 + __ add(to, 8, G3); 13.711 + __ mov(8, G4); 13.712 + __ sub(G4, G1, G4); 13.713 + __ alignaddr(G4, G0, G4); 13.714 + __ faligndata(F60, F60, F60); 13.715 + __ faligndata(F62, F62, F62); 13.716 + __ mov(to, G1); 13.717 + __ and3(to, -8, to); 13.718 + __ and3(G3, -8, G3); 13.719 + __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 13.720 + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 13.721 + __ add(to, 8, to); 13.722 + __ add(G3, 8, G3); 13.723 + __ orn(G0, G2, G2); 13.724 + __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY); 13.725 + __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY); 13.726 + __ mov(G1, to); 13.727 + 13.728 + __ BIND(L_check_decrypt_end); 13.729 __ add(from, 16, from); 13.730 __ add(to, 16, to); 13.731 __ subcc(len_reg, 16, len_reg); 13.732 @@ -3852,17 +4191,44 @@ 13.733 __ BIND(L_dec_next2_blocks128); 13.734 __ nop(); 13.735 13.736 - // F40:F42 used for first 16-bytes 13.737 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 13.738 + __ andcc(from, 7, G0); 13.739 + __ br(Assembler::notZero, true, Assembler::pn, 
L_load_misaligned_next2_blocks128); 13.740 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 13.741 + 13.742 + // aligned case: load input into G4, G5, L4 and L5 13.743 __ ldx(from,0,G4); 13.744 __ ldx(from,8,G5); 13.745 + __ ldx(from,16,L4); 13.746 + __ ldx(from,24,L5); 13.747 + __ ba_short(L_transform_next2_blocks128); 13.748 + 13.749 + __ BIND(L_load_misaligned_next2_blocks128); 13.750 + __ alignaddr(from, G0, from); 13.751 + // F40, F42, F58, F60, F62 can be clobbered 13.752 + __ ldf(FloatRegisterImpl::D, from, 0, F40); 13.753 + __ ldf(FloatRegisterImpl::D, from, 8, F42); 13.754 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 13.755 + __ ldf(FloatRegisterImpl::D, from, 24, F62); 13.756 + __ ldf(FloatRegisterImpl::D, from, 32, F58); 13.757 + __ faligndata(F40, F42, F40); 13.758 + __ faligndata(F42, F60, F42); 13.759 + __ faligndata(F60, F62, F60); 13.760 + __ faligndata(F62, F58, F62); 13.761 + __ movdtox(F40, G4); 13.762 + __ movdtox(F42, G5); 13.763 + __ movdtox(F60, L4); 13.764 + __ movdtox(F62, L5); 13.765 + __ mov(G1, from); 13.766 + 13.767 + __ BIND(L_transform_next2_blocks128); 13.768 + // F40:F42 used for first 16-bytes 13.769 __ xor3(L2,G4,G1); 13.770 __ movxtod(G1,F40); 13.771 __ xor3(L3,G5,G1); 13.772 __ movxtod(G1,F42); 13.773 13.774 // F60:F62 used for next 16-bytes 13.775 - __ ldx(from,16,L4); 13.776 - __ ldx(from,24,L5); 13.777 __ xor3(L2,L4,G1); 13.778 __ movxtod(G1,F60); 13.779 __ xor3(L3,L5,G1); 13.780 @@ -3891,9 +4257,6 @@ 13.781 __ fxor(FloatRegisterImpl::D, F46, F40, F40); 13.782 __ fxor(FloatRegisterImpl::D, F44, F42, F42); 13.783 13.784 - __ stf(FloatRegisterImpl::D, F40, to, 0); 13.785 - __ stf(FloatRegisterImpl::D, F42, to, 8); 13.786 - 13.787 __ movxtod(G4,F56); 13.788 __ movxtod(G5,F58); 13.789 __ mov(L4,L0); 13.790 @@ -3901,32 +4264,93 @@ 13.791 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 13.792 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 13.793 13.794 + // For mis-aligned store of 32 bytes of result 
we can do: 13.795 + // Circular right-shift all 4 FP registers so that 'head' and 'tail' 13.796 + // parts that need to be stored starting at mis-aligned address are in a FP reg 13.797 + // the other 3 FP regs can thus be stored using regular store 13.798 + // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts 13.799 + 13.800 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 13.801 + __ andcc(to, 7, G1); 13.802 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128); 13.803 + __ delayed()->edge8n(to, G0, G2); 13.804 + 13.805 + // aligned case: store output into the destination array 13.806 + __ stf(FloatRegisterImpl::D, F40, to, 0); 13.807 + __ stf(FloatRegisterImpl::D, F42, to, 8); 13.808 __ stf(FloatRegisterImpl::D, F60, to, 16); 13.809 __ stf(FloatRegisterImpl::D, F62, to, 24); 13.810 - 13.811 + __ ba_short(L_check_decrypt_loop_end128); 13.812 + 13.813 + __ BIND(L_store_misaligned_output_next2_blocks128); 13.814 + __ mov(8, G4); 13.815 + __ sub(G4, G1, G4); 13.816 + __ alignaddr(G4, G0, G4); 13.817 + __ faligndata(F40, F42, F56); // F56 can be clobbered 13.818 + __ faligndata(F42, F60, F42); 13.819 + __ faligndata(F60, F62, F60); 13.820 + __ faligndata(F62, F40, F40); 13.821 + __ mov(to, G1); 13.822 + __ and3(to, -8, to); 13.823 + __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 13.824 + __ stf(FloatRegisterImpl::D, F56, to, 8); 13.825 + __ stf(FloatRegisterImpl::D, F42, to, 16); 13.826 + __ stf(FloatRegisterImpl::D, F60, to, 24); 13.827 + __ add(to, 32, to); 13.828 + __ orn(G0, G2, G2); 13.829 + __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY); 13.830 + __ mov(G1, to); 13.831 + 13.832 + __ BIND(L_check_decrypt_loop_end128); 13.833 __ add(from, 32, from); 13.834 __ add(to, 32, to); 13.835 __ subcc(len_reg, 32, len_reg); 13.836 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128); 13.837 __ 
delayed()->nop(); 13.838 - __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); 13.839 - __ delayed()->nop(); 13.840 + __ ba_short(L_cbcdec_end); 13.841 13.842 __ align(OptoLoopAlignment); 13.843 __ BIND(L_dec_next2_blocks192); 13.844 __ nop(); 13.845 13.846 - // F48:F50 used for first 16-bytes 13.847 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 13.848 + __ andcc(from, 7, G0); 13.849 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192); 13.850 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 13.851 + 13.852 + // aligned case: load input into G4, G5, L4 and L5 13.853 __ ldx(from,0,G4); 13.854 __ ldx(from,8,G5); 13.855 + __ ldx(from,16,L4); 13.856 + __ ldx(from,24,L5); 13.857 + __ ba_short(L_transform_next2_blocks192); 13.858 + 13.859 + __ BIND(L_load_misaligned_next2_blocks192); 13.860 + __ alignaddr(from, G0, from); 13.861 + // F48, F50, F52, F60, F62 can be clobbered 13.862 + __ ldf(FloatRegisterImpl::D, from, 0, F48); 13.863 + __ ldf(FloatRegisterImpl::D, from, 8, F50); 13.864 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 13.865 + __ ldf(FloatRegisterImpl::D, from, 24, F62); 13.866 + __ ldf(FloatRegisterImpl::D, from, 32, F52); 13.867 + __ faligndata(F48, F50, F48); 13.868 + __ faligndata(F50, F60, F50); 13.869 + __ faligndata(F60, F62, F60); 13.870 + __ faligndata(F62, F52, F62); 13.871 + __ movdtox(F48, G4); 13.872 + __ movdtox(F50, G5); 13.873 + __ movdtox(F60, L4); 13.874 + __ movdtox(F62, L5); 13.875 + __ mov(G1, from); 13.876 + 13.877 + __ BIND(L_transform_next2_blocks192); 13.878 + // F48:F50 used for first 16-bytes 13.879 __ xor3(L2,G4,G1); 13.880 __ movxtod(G1,F48); 13.881 __ xor3(L3,G5,G1); 13.882 __ movxtod(G1,F50); 13.883 13.884 // F60:F62 used for next 16-bytes 13.885 - __ ldx(from,16,L4); 13.886 - __ ldx(from,24,L5); 13.887 __ xor3(L2,L4,G1); 13.888 __ movxtod(G1,F60); 13.889 __ xor3(L3,L5,G1); 13.890 @@ 
-3955,9 +4379,6 @@ 13.891 __ fxor(FloatRegisterImpl::D, F54, F48, F48); 13.892 __ fxor(FloatRegisterImpl::D, F52, F50, F50); 13.893 13.894 - __ stf(FloatRegisterImpl::D, F48, to, 0); 13.895 - __ stf(FloatRegisterImpl::D, F50, to, 8); 13.896 - 13.897 __ movxtod(G4,F56); 13.898 __ movxtod(G5,F58); 13.899 __ mov(L4,L0); 13.900 @@ -3965,32 +4386,87 @@ 13.901 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 13.902 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 13.903 13.904 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 13.905 + __ andcc(to, 7, G1); 13.906 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192); 13.907 + __ delayed()->edge8n(to, G0, G2); 13.908 + 13.909 + // aligned case: store output into the destination array 13.910 + __ stf(FloatRegisterImpl::D, F48, to, 0); 13.911 + __ stf(FloatRegisterImpl::D, F50, to, 8); 13.912 __ stf(FloatRegisterImpl::D, F60, to, 16); 13.913 __ stf(FloatRegisterImpl::D, F62, to, 24); 13.914 - 13.915 + __ ba_short(L_check_decrypt_loop_end192); 13.916 + 13.917 + __ BIND(L_store_misaligned_output_next2_blocks192); 13.918 + __ mov(8, G4); 13.919 + __ sub(G4, G1, G4); 13.920 + __ alignaddr(G4, G0, G4); 13.921 + __ faligndata(F48, F50, F56); // F56 can be clobbered 13.922 + __ faligndata(F50, F60, F50); 13.923 + __ faligndata(F60, F62, F60); 13.924 + __ faligndata(F62, F48, F48); 13.925 + __ mov(to, G1); 13.926 + __ and3(to, -8, to); 13.927 + __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 13.928 + __ stf(FloatRegisterImpl::D, F56, to, 8); 13.929 + __ stf(FloatRegisterImpl::D, F50, to, 16); 13.930 + __ stf(FloatRegisterImpl::D, F60, to, 24); 13.931 + __ add(to, 32, to); 13.932 + __ orn(G0, G2, G2); 13.933 + __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY); 13.934 + __ mov(G1, to); 13.935 + 13.936 + __ BIND(L_check_decrypt_loop_end192); 13.937 __ add(from, 32, from); 13.938 __ add(to, 32, to); 13.939 __ subcc(len_reg, 32, 
len_reg); 13.940 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192); 13.941 __ delayed()->nop(); 13.942 - __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end); 13.943 - __ delayed()->nop(); 13.944 + __ ba_short(L_cbcdec_end); 13.945 13.946 __ align(OptoLoopAlignment); 13.947 __ BIND(L_dec_next2_blocks256); 13.948 __ nop(); 13.949 13.950 - // F0:F2 used for first 16-bytes 13.951 + // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero 13.952 + __ andcc(from, 7, G0); 13.953 + __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256); 13.954 + __ delayed()->mov(from, G1); // save original 'from' address before alignaddr 13.955 + 13.956 + // aligned case: load input into G4, G5, L4 and L5 13.957 __ ldx(from,0,G4); 13.958 __ ldx(from,8,G5); 13.959 + __ ldx(from,16,L4); 13.960 + __ ldx(from,24,L5); 13.961 + __ ba_short(L_transform_next2_blocks256); 13.962 + 13.963 + __ BIND(L_load_misaligned_next2_blocks256); 13.964 + __ alignaddr(from, G0, from); 13.965 + // F0, F2, F4, F60, F62 can be clobbered 13.966 + __ ldf(FloatRegisterImpl::D, from, 0, F0); 13.967 + __ ldf(FloatRegisterImpl::D, from, 8, F2); 13.968 + __ ldf(FloatRegisterImpl::D, from, 16, F60); 13.969 + __ ldf(FloatRegisterImpl::D, from, 24, F62); 13.970 + __ ldf(FloatRegisterImpl::D, from, 32, F4); 13.971 + __ faligndata(F0, F2, F0); 13.972 + __ faligndata(F2, F60, F2); 13.973 + __ faligndata(F60, F62, F60); 13.974 + __ faligndata(F62, F4, F62); 13.975 + __ movdtox(F0, G4); 13.976 + __ movdtox(F2, G5); 13.977 + __ movdtox(F60, L4); 13.978 + __ movdtox(F62, L5); 13.979 + __ mov(G1, from); 13.980 + 13.981 + __ BIND(L_transform_next2_blocks256); 13.982 + // F0:F2 used for first 16-bytes 13.983 __ xor3(L2,G4,G1); 13.984 __ movxtod(G1,F0); 13.985 __ xor3(L3,G5,G1); 13.986 __ movxtod(G1,F2); 13.987 13.988 // F60:F62 used for next 16-bytes 13.989 - __ ldx(from,16,L4); 13.990 - __ ldx(from,24,L5); 13.991 __ 
xor3(L2,L4,G1); 13.992 __ movxtod(G1,F60); 13.993 __ xor3(L3,L5,G1); 13.994 @@ -4043,9 +4519,6 @@ 13.995 __ fxor(FloatRegisterImpl::D, F6, F0, F0); 13.996 __ fxor(FloatRegisterImpl::D, F4, F2, F2); 13.997 13.998 - __ stf(FloatRegisterImpl::D, F0, to, 0); 13.999 - __ stf(FloatRegisterImpl::D, F2, to, 8); 13.1000 - 13.1001 __ movxtod(G4,F56); 13.1002 __ movxtod(G5,F58); 13.1003 __ mov(L4,L0); 13.1004 @@ -4053,9 +4526,38 @@ 13.1005 __ fxor(FloatRegisterImpl::D, F56, F60, F60); 13.1006 __ fxor(FloatRegisterImpl::D, F58, F62, F62); 13.1007 13.1008 + // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero 13.1009 + __ andcc(to, 7, G1); 13.1010 + __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256); 13.1011 + __ delayed()->edge8n(to, G0, G2); 13.1012 + 13.1013 + // aligned case: store output into the destination array 13.1014 + __ stf(FloatRegisterImpl::D, F0, to, 0); 13.1015 + __ stf(FloatRegisterImpl::D, F2, to, 8); 13.1016 __ stf(FloatRegisterImpl::D, F60, to, 16); 13.1017 __ stf(FloatRegisterImpl::D, F62, to, 24); 13.1018 - 13.1019 + __ ba_short(L_check_decrypt_loop_end256); 13.1020 + 13.1021 + __ BIND(L_store_misaligned_output_next2_blocks256); 13.1022 + __ mov(8, G4); 13.1023 + __ sub(G4, G1, G4); 13.1024 + __ alignaddr(G4, G0, G4); 13.1025 + __ faligndata(F0, F2, F56); // F56 can be clobbered 13.1026 + __ faligndata(F2, F60, F2); 13.1027 + __ faligndata(F60, F62, F60); 13.1028 + __ faligndata(F62, F0, F0); 13.1029 + __ mov(to, G1); 13.1030 + __ and3(to, -8, to); 13.1031 + __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 13.1032 + __ stf(FloatRegisterImpl::D, F56, to, 8); 13.1033 + __ stf(FloatRegisterImpl::D, F2, to, 16); 13.1034 + __ stf(FloatRegisterImpl::D, F60, to, 24); 13.1035 + __ add(to, 32, to); 13.1036 + __ orn(G0, G2, G2); 13.1037 + __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY); 13.1038 + __ mov(G1, to); 13.1039 + 13.1040 + __ 
BIND(L_check_decrypt_loop_end256); 13.1041 __ add(from, 32, from); 13.1042 __ add(to, 32, to); 13.1043 __ subcc(len_reg, 32, len_reg); 13.1044 @@ -4063,6 +4565,7 @@ 13.1045 __ delayed()->nop(); 13.1046 13.1047 __ BIND(L_cbcdec_end); 13.1048 + // re-init initial vector for next block, 8-byte alignment is guaranteed 13.1049 __ stx(L0, rvec, 0); 13.1050 __ stx(L1, rvec, 8); 13.1051 __ restore();
14.1 --- a/src/cpu/sparc/vm/stubRoutines_sparc.hpp Wed May 07 10:58:47 2014 -0700 14.2 +++ b/src/cpu/sparc/vm/stubRoutines_sparc.hpp Thu May 08 23:07:11 2014 -0700 14.3 @@ -41,7 +41,7 @@ 14.4 enum /* platform_dependent_constants */ { 14.5 // %%%%%%%% May be able to shrink this a lot 14.6 code_size1 = 20000, // simply increase if too small (assembler will crash if too small) 14.7 - code_size2 = 20000 // simply increase if too small (assembler will crash if too small) 14.8 + code_size2 = 22000 // simply increase if too small (assembler will crash if too small) 14.9 }; 14.10 14.11 class Sparc {
15.1 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp Wed May 07 10:58:47 2014 -0700 15.2 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp Thu May 08 23:07:11 2014 -0700 15.3 @@ -1,5 +1,5 @@ 15.4 /* 15.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. 15.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. 15.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 15.8 * 15.9 * This code is free software; you can redistribute it and/or modify it 15.10 @@ -266,9 +266,9 @@ 15.11 if (!has_vis1()) // Drop to 0 if no VIS1 support 15.12 UseVIS = 0; 15.13 15.14 - // T2 and above should have support for AES instructions 15.15 + // SPARC T4 and above should have support for AES instructions 15.16 if (has_aes()) { 15.17 - if (UseVIS > 0) { // AES intrinsics use FXOR instruction which is VIS1 15.18 + if (UseVIS > 2) { // AES intrinsics use MOVxTOd/MOVdTOx which are VIS3 15.19 if (FLAG_IS_DEFAULT(UseAES)) { 15.20 FLAG_SET_DEFAULT(UseAES, true); 15.21 } 15.22 @@ -282,7 +282,7 @@ 15.23 } 15.24 } else { 15.25 if (UseAES || UseAESIntrinsics) { 15.26 - warning("SPARC AES intrinsics require VIS1 instruction support. Intrinsics will be disabled."); 15.27 + warning("SPARC AES intrinsics require VIS3 instruction support. Intrinsics will be disabled."); 15.28 if (UseAES) { 15.29 FLAG_SET_DEFAULT(UseAES, false); 15.30 }
16.1 --- a/src/cpu/x86/vm/assembler_x86.cpp Wed May 07 10:58:47 2014 -0700 16.2 +++ b/src/cpu/x86/vm/assembler_x86.cpp Thu May 08 23:07:11 2014 -0700 16.3 @@ -1766,7 +1766,7 @@ 16.4 16.5 // Move Unaligned 256bit Vector 16.6 void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) { 16.7 - assert(UseAVX, ""); 16.8 + assert(UseAVX > 0, ""); 16.9 bool vector256 = true; 16.10 int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256); 16.11 emit_int8(0x6F); 16.12 @@ -1774,7 +1774,7 @@ 16.13 } 16.14 16.15 void Assembler::vmovdqu(XMMRegister dst, Address src) { 16.16 - assert(UseAVX, ""); 16.17 + assert(UseAVX > 0, ""); 16.18 InstructionMark im(this); 16.19 bool vector256 = true; 16.20 vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256); 16.21 @@ -1783,7 +1783,7 @@ 16.22 } 16.23 16.24 void Assembler::vmovdqu(Address dst, XMMRegister src) { 16.25 - assert(UseAVX, ""); 16.26 + assert(UseAVX > 0, ""); 16.27 InstructionMark im(this); 16.28 bool vector256 = true; 16.29 // swap src<->dst for encoding
17.1 --- a/src/cpu/x86/vm/vm_version_x86.cpp Wed May 07 10:58:47 2014 -0700 17.2 +++ b/src/cpu/x86/vm/vm_version_x86.cpp Thu May 08 23:07:11 2014 -0700 17.3 @@ -263,6 +263,10 @@ 17.4 // and check upper YMM bits after it. 17.5 // 17.6 VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts 17.7 + intx saved_useavx = UseAVX; 17.8 + intx saved_usesse = UseSSE; 17.9 + UseAVX = 1; 17.10 + UseSSE = 2; 17.11 17.12 // load value into all 32 bytes of ymm7 register 17.13 __ movl(rcx, VM_Version::ymm_test_value()); 17.14 @@ -292,6 +296,8 @@ 17.15 #endif 17.16 17.17 VM_Version::clean_cpuFeatures(); 17.18 + UseAVX = saved_useavx; 17.19 + UseSSE = saved_usesse; 17.20 17.21 // 17.22 // cpuid(0x7) Structured Extended Features
18.1 --- a/src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp Wed May 07 10:58:47 2014 -0700 18.2 +++ b/src/os_cpu/linux_ppc/vm/atomic_linux_ppc.inline.hpp Thu May 08 23:07:11 2014 -0700 18.3 @@ -53,41 +53,41 @@ 18.4 18.5 inline jlong Atomic::load(volatile jlong* src) { return *src; } 18.6 18.7 -/* 18.8 - machine barrier instructions: 18.9 - 18.10 - - sync two-way memory barrier, aka fence 18.11 - - lwsync orders Store|Store, 18.12 - Load|Store, 18.13 - Load|Load, 18.14 - but not Store|Load 18.15 - - eieio orders memory accesses for device memory (only) 18.16 - - isync invalidates speculatively executed instructions 18.17 - From the POWER ISA 2.06 documentation: 18.18 - "[...] an isync instruction prevents the execution of 18.19 - instructions following the isync until instructions 18.20 - preceding the isync have completed, [...]" 18.21 - From IBM's AIX assembler reference: 18.22 - "The isync [...] instructions causes the processor to 18.23 - refetch any instructions that might have been fetched 18.24 - prior to the isync instruction. The instruction isync 18.25 - causes the processor to wait for all previous instructions 18.26 - to complete. Then any instructions already fetched are 18.27 - discarded and instruction processing continues in the 18.28 - environment established by the previous instructions." 
18.29 - 18.30 - semantic barrier instructions: 18.31 - (as defined in orderAccess.hpp) 18.32 - 18.33 - - release orders Store|Store, (maps to lwsync) 18.34 - Load|Store 18.35 - - acquire orders Load|Store, (maps to lwsync) 18.36 - Load|Load 18.37 - - fence orders Store|Store, (maps to sync) 18.38 - Load|Store, 18.39 - Load|Load, 18.40 - Store|Load 18.41 -*/ 18.42 +// 18.43 +// machine barrier instructions: 18.44 +// 18.45 +// - sync two-way memory barrier, aka fence 18.46 +// - lwsync orders Store|Store, 18.47 +// Load|Store, 18.48 +// Load|Load, 18.49 +// but not Store|Load 18.50 +// - eieio orders memory accesses for device memory (only) 18.51 +// - isync invalidates speculatively executed instructions 18.52 +// From the POWER ISA 2.06 documentation: 18.53 +// "[...] an isync instruction prevents the execution of 18.54 +// instructions following the isync until instructions 18.55 +// preceding the isync have completed, [...]" 18.56 +// From IBM's AIX assembler reference: 18.57 +// "The isync [...] instructions causes the processor to 18.58 +// refetch any instructions that might have been fetched 18.59 +// prior to the isync instruction. The instruction isync 18.60 +// causes the processor to wait for all previous instructions 18.61 +// to complete. Then any instructions already fetched are 18.62 +// discarded and instruction processing continues in the 18.63 +// environment established by the previous instructions." 18.64 +// 18.65 +// semantic barrier instructions: 18.66 +// (as defined in orderAccess.hpp) 18.67 +// 18.68 +// - release orders Store|Store, (maps to lwsync) 18.69 +// Load|Store 18.70 +// - acquire orders Load|Store, (maps to lwsync) 18.71 +// Load|Load 18.72 +// - fence orders Store|Store, (maps to sync) 18.73 +// Load|Store, 18.74 +// Load|Load, 18.75 +// Store|Load 18.76 +// 18.77 18.78 #define strasm_sync "\n sync \n" 18.79 #define strasm_lwsync "\n lwsync \n"
19.1 --- a/src/share/vm/ci/ciReplay.cpp Wed May 07 10:58:47 2014 -0700 19.2 +++ b/src/share/vm/ci/ciReplay.cpp Thu May 08 23:07:11 2014 -0700 19.3 @@ -376,11 +376,15 @@ 19.4 int c = getc(_stream); 19.5 while(c != EOF) { 19.6 c = get_line(c); 19.7 - process_command(CHECK); 19.8 + process_command(THREAD); 19.9 if (had_error()) { 19.10 tty->print_cr("Error while parsing line %d: %s\n", line_no, _error_message); 19.11 - tty->print_cr("%s", _buffer); 19.12 - return; 19.13 + if (ReplayIgnoreInitErrors) { 19.14 + CLEAR_PENDING_EXCEPTION; 19.15 + _error_message = NULL; 19.16 + } else { 19.17 + return; 19.18 + } 19.19 } 19.20 line_no++; 19.21 } 19.22 @@ -565,10 +569,14 @@ 19.23 void process_ciMethodData(TRAPS) { 19.24 Method* method = parse_method(CHECK); 19.25 if (had_error()) return; 19.26 - /* jsut copied from Method, to build interpret data*/ 19.27 + /* just copied from Method, to build interpret data*/ 19.28 if (InstanceRefKlass::owns_pending_list_lock((JavaThread*)THREAD)) { 19.29 return; 19.30 } 19.31 + // To be properly initialized, some profiling in the MDO needs the 19.32 + // method to be rewritten (number of arguments at a call for 19.33 + // instance) 19.34 + method->method_holder()->link_class(CHECK); 19.35 // methodOopDesc::build_interpreter_method_data(method, CHECK); 19.36 { 19.37 // Grab a lock here to prevent multiple
20.1 --- a/src/share/vm/classfile/vmSymbols.hpp Wed May 07 10:58:47 2014 -0700 20.2 +++ b/src/share/vm/classfile/vmSymbols.hpp Thu May 08 23:07:11 2014 -0700 20.3 @@ -774,7 +774,7 @@ 20.4 /* java/lang/ref/Reference */ \ 20.5 do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ 20.6 \ 20.7 - /* support for com.sum.crypto.provider.AESCrypt and some of its callers */ \ 20.8 + /* support for com.sun.crypto.provider.AESCrypt and some of its callers */ \ 20.9 do_class(com_sun_crypto_provider_aescrypt, "com/sun/crypto/provider/AESCrypt") \ 20.10 do_intrinsic(_aescrypt_encryptBlock, com_sun_crypto_provider_aescrypt, encryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \ 20.11 do_intrinsic(_aescrypt_decryptBlock, com_sun_crypto_provider_aescrypt, decryptBlock_name, byteArray_int_byteArray_int_signature, F_R) \
21.1 --- a/src/share/vm/code/nmethod.cpp Wed May 07 10:58:47 2014 -0700 21.2 +++ b/src/share/vm/code/nmethod.cpp Thu May 08 23:07:11 2014 -0700 21.3 @@ -771,7 +771,11 @@ 21.4 _hotness_counter = NMethodSweeper::hotness_counter_reset_val(); 21.5 21.6 code_buffer->copy_values_to(this); 21.7 - debug_only(verify_scavenge_root_oops()); 21.8 + if (ScavengeRootsInCode && detect_scavenge_root_oops()) { 21.9 + CodeCache::add_scavenge_root_nmethod(this); 21.10 + Universe::heap()->register_nmethod(this); 21.11 + } 21.12 + DEBUG_ONLY(verify_scavenge_root_oops();) 21.13 CodeCache::commit(this); 21.14 } 21.15
22.1 --- a/src/share/vm/oops/klass.cpp Wed May 07 10:58:47 2014 -0700 22.2 +++ b/src/share/vm/oops/klass.cpp Thu May 08 23:07:11 2014 -0700 22.3 @@ -496,6 +496,7 @@ 22.4 } 22.5 22.6 void Klass::restore_unshareable_info(TRAPS) { 22.7 + TRACE_INIT_ID(this); 22.8 // If an exception happened during CDS restore, some of these fields may already be 22.9 // set. We leave the class on the CLD list, even if incomplete so that we don't 22.10 // modify the CLD list outside a safepoint.
23.1 --- a/src/share/vm/opto/compile.cpp Wed May 07 10:58:47 2014 -0700 23.2 +++ b/src/share/vm/opto/compile.cpp Thu May 08 23:07:11 2014 -0700 23.3 @@ -693,6 +693,7 @@ 23.4 #endif 23.5 set_print_inlining(PrintInlining || method()->has_option("PrintInlining") NOT_PRODUCT( || PrintOptoInlining)); 23.6 set_print_intrinsics(PrintIntrinsics || method()->has_option("PrintIntrinsics")); 23.7 + set_has_irreducible_loop(true); // conservative until build_loop_tree() resets it 23.8 23.9 if (ProfileTraps RTM_OPT_ONLY( || UseRTMLocking )) { 23.10 // Make sure the method being compiled gets its own MDO, 23.11 @@ -977,6 +978,8 @@ 23.12 set_print_assembly(PrintFrameConverterAssembly); 23.13 set_parsed_irreducible_loop(false); 23.14 #endif 23.15 + set_has_irreducible_loop(false); // no loops 23.16 + 23.17 CompileWrapper cw(this); 23.18 Init(/*AliasLevel=*/ 0); 23.19 init_tf((*generator)()); 23.20 @@ -1147,7 +1150,7 @@ 23.21 if( start->is_Start() ) 23.22 return start->as_Start(); 23.23 } 23.24 - ShouldNotReachHere(); 23.25 + fatal("Did not find Start node!"); 23.26 return NULL; 23.27 } 23.28
24.1 --- a/src/share/vm/opto/compile.hpp Wed May 07 10:58:47 2014 -0700 24.2 +++ b/src/share/vm/opto/compile.hpp Thu May 08 23:07:11 2014 -0700 24.3 @@ -319,6 +319,7 @@ 24.4 bool _trace_opto_output; 24.5 bool _parsed_irreducible_loop; // True if ciTypeFlow detected irreducible loops during parsing 24.6 #endif 24.7 + bool _has_irreducible_loop; // Found irreducible loops 24.8 // JSR 292 24.9 bool _has_method_handle_invokes; // True if this method has MethodHandle invokes. 24.10 RTMState _rtm_state; // State of Restricted Transactional Memory usage 24.11 @@ -605,6 +606,8 @@ 24.12 void set_parsed_irreducible_loop(bool z) { _parsed_irreducible_loop = z; } 24.13 int _in_dump_cnt; // Required for dumping ir nodes. 24.14 #endif 24.15 + bool has_irreducible_loop() const { return _has_irreducible_loop; } 24.16 + void set_has_irreducible_loop(bool z) { _has_irreducible_loop = z; } 24.17 24.18 // JSR 292 24.19 bool has_method_handle_invokes() const { return _has_method_handle_invokes; }
25.1 --- a/src/share/vm/opto/loopnode.cpp Wed May 07 10:58:47 2014 -0700 25.2 +++ b/src/share/vm/opto/loopnode.cpp Thu May 08 23:07:11 2014 -0700 25.3 @@ -266,9 +266,9 @@ 25.4 25.5 // Counted loop head must be a good RegionNode with only 3 not NULL 25.6 // control input edges: Self, Entry, LoopBack. 25.7 - if (x->in(LoopNode::Self) == NULL || x->req() != 3) 25.8 + if (x->in(LoopNode::Self) == NULL || x->req() != 3 || loop->_irreducible) { 25.9 return false; 25.10 - 25.11 + } 25.12 Node *init_control = x->in(LoopNode::EntryControl); 25.13 Node *back_control = x->in(LoopNode::LoopBackControl); 25.14 if (init_control == NULL || back_control == NULL) // Partially dead 25.15 @@ -1522,11 +1522,11 @@ 25.16 25.17 // If I have one hot backedge, peel off myself loop. 25.18 // I better be the outermost loop. 25.19 - if( _head->req() > 3 ) { 25.20 + if (_head->req() > 3 && !_irreducible) { 25.21 split_outer_loop( phase ); 25.22 result = true; 25.23 25.24 - } else if( !_head->is_Loop() && !_irreducible ) { 25.25 + } else if (!_head->is_Loop() && !_irreducible) { 25.26 // Make a new LoopNode to replace the old loop head 25.27 Node *l = new (phase->C) LoopNode( _head->in(1), _head->in(2) ); 25.28 l = igvn.register_new_node_with_optimizer(l, _head); 25.29 @@ -2938,6 +2938,7 @@ 25.30 return pre_order; 25.31 } 25.32 } 25.33 + C->set_has_irreducible_loop(_has_irreducible_loops); 25.34 } 25.35 25.36 // This Node might be a decision point for loops. It is only if
26.1 --- a/src/share/vm/opto/memnode.cpp Wed May 07 10:58:47 2014 -0700 26.2 +++ b/src/share/vm/opto/memnode.cpp Thu May 08 23:07:11 2014 -0700 26.3 @@ -306,33 +306,16 @@ 26.4 int alias_idx = phase->C->get_alias_index(t_adr->is_ptr()); 26.5 } 26.6 26.7 -#ifdef ASSERT 26.8 Node* base = NULL; 26.9 - if (address->is_AddP()) 26.10 + if (address->is_AddP()) { 26.11 base = address->in(AddPNode::Base); 26.12 + } 26.13 if (base != NULL && phase->type(base)->higher_equal(TypePtr::NULL_PTR) && 26.14 !t_adr->isa_rawptr()) { 26.15 // Note: raw address has TOP base and top->higher_equal(TypePtr::NULL_PTR) is true. 26.16 - Compile* C = phase->C; 26.17 - tty->cr(); 26.18 - tty->print_cr("===== NULL+offs not RAW address ====="); 26.19 - if (C->is_dead_node(this->_idx)) tty->print_cr("'this' is dead"); 26.20 - if ((ctl != NULL) && C->is_dead_node(ctl->_idx)) tty->print_cr("'ctl' is dead"); 26.21 - if (C->is_dead_node(mem->_idx)) tty->print_cr("'mem' is dead"); 26.22 - if (C->is_dead_node(address->_idx)) tty->print_cr("'address' is dead"); 26.23 - if (C->is_dead_node(base->_idx)) tty->print_cr("'base' is dead"); 26.24 - tty->cr(); 26.25 - base->dump(1); 26.26 - tty->cr(); 26.27 - this->dump(2); 26.28 - tty->print("this->adr_type(): "); adr_type()->dump(); tty->cr(); 26.29 - tty->print("phase->type(address): "); t_adr->dump(); tty->cr(); 26.30 - tty->print("phase->type(base): "); phase->type(address)->dump(); tty->cr(); 26.31 - tty->cr(); 26.32 + // Skip this node optimization if its address has TOP base. 26.33 + return NodeSentinel; // caller will return NULL 26.34 } 26.35 - assert(base == NULL || t_adr->isa_rawptr() || 26.36 - !phase->type(base)->higher_equal(TypePtr::NULL_PTR), "NULL+offs not RAW address?"); 26.37 -#endif 26.38 26.39 // Avoid independent memory operations 26.40 Node* old_mem = mem;
27.1 --- a/src/share/vm/opto/node.cpp Wed May 07 10:58:47 2014 -0700 27.2 +++ b/src/share/vm/opto/node.cpp Thu May 08 23:07:11 2014 -0700 27.3 @@ -27,6 +27,7 @@ 27.4 #include "memory/allocation.inline.hpp" 27.5 #include "opto/cfgnode.hpp" 27.6 #include "opto/connode.hpp" 27.7 +#include "opto/loopnode.hpp" 27.8 #include "opto/machnode.hpp" 27.9 #include "opto/matcher.hpp" 27.10 #include "opto/node.hpp" 27.11 @@ -1255,6 +1256,7 @@ 27.12 27.13 Node *top = igvn->C->top(); 27.14 nstack.push(dead); 27.15 + bool has_irreducible_loop = igvn->C->has_irreducible_loop(); 27.16 27.17 while (nstack.size() > 0) { 27.18 dead = nstack.pop(); 27.19 @@ -1269,13 +1271,31 @@ 27.20 assert (!use->is_Con(), "Control for Con node should be Root node."); 27.21 use->set_req(0, top); // Cut dead edge to prevent processing 27.22 nstack.push(use); // the dead node again. 27.23 + } else if (!has_irreducible_loop && // Backedge could be alive in irreducible loop 27.24 + use->is_Loop() && !use->is_Root() && // Don't kill Root (RootNode extends LoopNode) 27.25 + use->in(LoopNode::EntryControl) == dead) { // Dead loop if its entry is dead 27.26 + use->set_req(LoopNode::EntryControl, top); // Cut dead edge to prevent processing 27.27 + use->set_req(0, top); // Cut self edge 27.28 + nstack.push(use); 27.29 } else { // Else found a not-dead user 27.30 + // Dead if all inputs are top or null 27.31 + bool dead_use = !use->is_Root(); // Keep empty graph alive 27.32 for (uint j = 1; j < use->req(); j++) { 27.33 - if (use->in(j) == dead) { // Turn all dead inputs into TOP 27.34 + Node* in = use->in(j); 27.35 + if (in == dead) { // Turn all dead inputs into TOP 27.36 use->set_req(j, top); 27.37 + } else if (in != NULL && !in->is_top()) { 27.38 + dead_use = false; 27.39 } 27.40 } 27.41 - igvn->_worklist.push(use); 27.42 + if (dead_use) { 27.43 + if (use->is_Region()) { 27.44 + use->set_req(0, top); // Cut self edge 27.45 + } 27.46 + nstack.push(use); 27.47 + } else { 27.48 + igvn->_worklist.push(use); 27.49 
+ } 27.50 } 27.51 // Refresh the iterator, since any number of kills might have happened. 27.52 k = dead->last_outs(kmin);
28.1 --- a/src/share/vm/opto/runtime.cpp Wed May 07 10:58:47 2014 -0700 28.2 +++ b/src/share/vm/opto/runtime.cpp Thu May 08 23:07:11 2014 -0700 28.3 @@ -1,5 +1,5 @@ 28.4 /* 28.5 - * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved. 28.6 + * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved. 28.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 28.8 * 28.9 * This code is free software; you can redistribute it and/or modify it 28.10 @@ -870,7 +870,7 @@ 28.11 return TypeFunc::make(domain, range); 28.12 } 28.13 28.14 -// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning void 28.15 +// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning int 28.16 const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { 28.17 // create input type (domain) 28.18 int num_args = 5;
29.1 --- a/src/share/vm/runtime/advancedThresholdPolicy.cpp Wed May 07 10:58:47 2014 -0700 29.2 +++ b/src/share/vm/runtime/advancedThresholdPolicy.cpp Thu May 08 23:07:11 2014 -0700 29.3 @@ -53,7 +53,8 @@ 29.4 } 29.5 29.6 set_c1_count(MAX2(count / 3, 1)); 29.7 - set_c2_count(MAX2(count - count / 3, 1)); 29.8 + set_c2_count(MAX2(count - c1_count(), 1)); 29.9 + FLAG_SET_ERGO(intx, CICompilerCount, c1_count() + c2_count()); 29.10 29.11 // Some inlining tuning 29.12 #ifdef X86
30.1 --- a/src/share/vm/runtime/arguments.cpp Wed May 07 10:58:47 2014 -0700 30.2 +++ b/src/share/vm/runtime/arguments.cpp Thu May 08 23:07:11 2014 -0700 30.3 @@ -2383,6 +2383,10 @@ 30.4 status &= verify_interval(NmethodSweepFraction, 1, ReservedCodeCacheSize/K, "NmethodSweepFraction"); 30.5 status &= verify_interval(NmethodSweepActivity, 0, 2000, "NmethodSweepActivity"); 30.6 30.7 + if (!FLAG_IS_DEFAULT(CICompilerCount) && !FLAG_IS_DEFAULT(CICompilerCountPerCPU) && CICompilerCountPerCPU) { 30.8 + warning("The VM option CICompilerCountPerCPU overrides CICompilerCount."); 30.9 + } 30.10 + 30.11 return status; 30.12 } 30.13
31.1 --- a/src/share/vm/runtime/compilationPolicy.cpp Wed May 07 10:58:47 2014 -0700 31.2 +++ b/src/share/vm/runtime/compilationPolicy.cpp Thu May 08 23:07:11 2014 -0700 31.3 @@ -182,6 +182,7 @@ 31.4 // max(log2(8)-1,1) = 2 compiler threads on an 8-way machine. 31.5 // May help big-app startup time. 31.6 _compiler_count = MAX2(log2_intptr(os::active_processor_count())-1,1); 31.7 + FLAG_SET_ERGO(intx, CICompilerCount, _compiler_count); 31.8 } else { 31.9 _compiler_count = CICompilerCount; 31.10 }
32.1 --- a/src/share/vm/runtime/sharedRuntime.cpp Wed May 07 10:58:47 2014 -0700 32.2 +++ b/src/share/vm/runtime/sharedRuntime.cpp Thu May 08 23:07:11 2014 -0700 32.3 @@ -2690,19 +2690,20 @@ 32.4 JRT_END 32.5 32.6 #ifdef HAVE_DTRACE_H 32.7 -// Create a dtrace nmethod for this method. The wrapper converts the 32.8 -// java compiled calling convention to the native convention, makes a dummy call 32.9 -// (actually nops for the size of the call instruction, which become a trap if 32.10 -// probe is enabled). The returns to the caller. Since this all looks like a 32.11 -// leaf no thread transition is needed. 32.12 - 32.13 +/** 32.14 + * Create a dtrace nmethod for this method. The wrapper converts the 32.15 + * Java-compiled calling convention to the native convention, makes a dummy call 32.16 + * (actually nops for the size of the call instruction, which become a trap if 32.17 + * probe is enabled), and finally returns to the caller. Since this all looks like a 32.18 + * leaf, no thread transition is needed. 32.19 + */ 32.20 nmethod *AdapterHandlerLibrary::create_dtrace_nmethod(methodHandle method) { 32.21 ResourceMark rm; 32.22 nmethod* nm = NULL; 32.23 32.24 if (PrintCompilation) { 32.25 ttyLocker ttyl; 32.26 - tty->print("--- n%s "); 32.27 + tty->print("--- n "); 32.28 method->print_short_name(tty); 32.29 if (method->is_static()) { 32.30 tty->print(" (static)");
33.1 --- a/src/share/vm/runtime/simpleThresholdPolicy.cpp Wed May 07 10:58:47 2014 -0700 33.2 +++ b/src/share/vm/runtime/simpleThresholdPolicy.cpp Thu May 08 23:07:11 2014 -0700 33.3 @@ -142,7 +142,8 @@ 33.4 count = MAX2(log2_intptr(os::active_processor_count()), 1) * 3 / 2; 33.5 } 33.6 set_c1_count(MAX2(count / 3, 1)); 33.7 - set_c2_count(MAX2(count - count / 3, 1)); 33.8 + set_c2_count(MAX2(count - c1_count(), 1)); 33.9 + FLAG_SET_ERGO(intx, CICompilerCount, c1_count() + c2_count()); 33.10 } 33.11 33.12 void SimpleThresholdPolicy::set_carry_if_necessary(InvocationCounter *counter) { 33.13 @@ -191,6 +192,10 @@ 33.14 thread->is_interp_only_mode()) { 33.15 return NULL; 33.16 } 33.17 + if (CompileTheWorld || ReplayCompiles) { 33.18 + // Don't trigger other compiles in testing mode 33.19 + return NULL; 33.20 + } 33.21 nmethod *osr_nm = NULL; 33.22 33.23 handle_counter_overflow(method());
34.1 --- a/test/compiler/7184394/TestAESBase.java Wed May 07 10:58:47 2014 -0700 34.2 +++ b/test/compiler/7184394/TestAESBase.java Thu May 08 23:07:11 2014 -0700 34.3 @@ -1,5 +1,5 @@ 34.4 /* 34.5 - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. 34.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved. 34.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 34.8 * 34.9 * This code is free software; you can redistribute it and/or modify it 34.10 @@ -40,9 +40,20 @@ 34.11 int msgSize = Integer.getInteger("msgSize", 646); 34.12 boolean checkOutput = Boolean.getBoolean("checkOutput"); 34.13 boolean noReinit = Boolean.getBoolean("noReinit"); 34.14 + boolean testingMisalignment; 34.15 + private static final int ALIGN = 8; 34.16 + int encInputOffset = Integer.getInteger("encInputOffset", 0) % ALIGN; 34.17 + int encOutputOffset = Integer.getInteger("encOutputOffset", 0) % ALIGN; 34.18 + int decOutputOffset = Integer.getInteger("decOutputOffset", 0) % ALIGN; 34.19 + int lastChunkSize = Integer.getInteger("lastChunkSize", 32); 34.20 int keySize = Integer.getInteger("keySize", 128); 34.21 + int inputLength; 34.22 + int encodeLength; 34.23 + int decodeLength; 34.24 + int decodeMsgSize; 34.25 String algorithm = System.getProperty("algorithm", "AES"); 34.26 String mode = System.getProperty("mode", "CBC"); 34.27 + String paddingStr = System.getProperty("paddingStr", "PKCS5Padding"); 34.28 byte[] input; 34.29 byte[] encode; 34.30 byte[] expectedEncode; 34.31 @@ -51,7 +62,6 @@ 34.32 Random random = new Random(0); 34.33 Cipher cipher; 34.34 Cipher dCipher; 34.35 - String paddingStr = "PKCS5Padding"; 34.36 AlgorithmParameters algParams; 34.37 SecretKey key; 34.38 34.39 @@ -67,7 +77,10 @@ 34.40 34.41 public void prepare() { 34.42 try { 34.43 - System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput); 
34.44 + System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", paddingStr=" + paddingStr + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput + ", encInputOffset=" + encInputOffset + ", encOutputOffset=" + encOutputOffset + ", decOutputOffset=" + decOutputOffset + ", lastChunkSize=" +lastChunkSize ); 34.45 + 34.46 + if (encInputOffset % ALIGN != 0 || encOutputOffset % ALIGN != 0 || decOutputOffset % ALIGN !=0 ) 34.47 + testingMisalignment = true; 34.48 34.49 int keyLenBytes = (keySize == 0 ? 16 : keySize/8); 34.50 byte keyBytes[] = new byte[keyLenBytes]; 34.51 @@ -81,10 +94,6 @@ 34.52 System.out.println("Algorithm: " + key.getAlgorithm() + "(" 34.53 + key.getEncoded().length * 8 + "bit)"); 34.54 } 34.55 - input = new byte[msgSize]; 34.56 - for (int i=0; i<input.length; i++) { 34.57 - input[i] = (byte) (i & 0xff); 34.58 - } 34.59 34.60 cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); 34.61 dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); 34.62 @@ -103,10 +112,35 @@ 34.63 childShowCipher(); 34.64 } 34.65 34.66 + inputLength = msgSize + encInputOffset; 34.67 + if (testingMisalignment) { 34.68 + encodeLength = cipher.getOutputSize(msgSize - lastChunkSize) + encOutputOffset; 34.69 + encodeLength += cipher.getOutputSize(lastChunkSize); 34.70 + decodeLength = dCipher.getOutputSize(encodeLength - lastChunkSize) + decOutputOffset; 34.71 + decodeLength += dCipher.getOutputSize(lastChunkSize); 34.72 + } else { 34.73 + encodeLength = cipher.getOutputSize(msgSize) + encOutputOffset; 34.74 + decodeLength = dCipher.getOutputSize(encodeLength) + decOutputOffset; 34.75 + } 34.76 + 34.77 + input = new byte[inputLength]; 34.78 + for (int i=encInputOffset, j=0; i<inputLength; i++, j++) { 34.79 + input[i] = (byte) (j & 0xff); 34.80 + } 34.81 + 34.82 // do one encode and decode in preparation 34.83 - // this will also create the encode buffer and 
decode buffer 34.84 - encode = cipher.doFinal(input); 34.85 - decode = dCipher.doFinal(encode); 34.86 + encode = new byte[encodeLength]; 34.87 + decode = new byte[decodeLength]; 34.88 + if (testingMisalignment) { 34.89 + decodeMsgSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset); 34.90 + decodeMsgSize += cipher.doFinal(input, (encInputOffset + msgSize - lastChunkSize), lastChunkSize, encode, (encOutputOffset + decodeMsgSize)); 34.91 + 34.92 + int tempSize = dCipher.update(encode, encOutputOffset, (decodeMsgSize - lastChunkSize), decode, decOutputOffset); 34.93 + dCipher.doFinal(encode, (encOutputOffset + decodeMsgSize - lastChunkSize), lastChunkSize, decode, (decOutputOffset + tempSize)); 34.94 + } else { 34.95 + decodeMsgSize = cipher.doFinal(input, encInputOffset, msgSize, encode, encOutputOffset); 34.96 + dCipher.doFinal(encode, encOutputOffset, decodeMsgSize, decode, decOutputOffset); 34.97 + } 34.98 if (checkOutput) { 34.99 expectedEncode = (byte[]) encode.clone(); 34.100 expectedDecode = (byte[]) decode.clone();
35.1 --- a/test/compiler/7184394/TestAESDecode.java Wed May 07 10:58:47 2014 -0700 35.2 +++ b/test/compiler/7184394/TestAESDecode.java Thu May 08 23:07:11 2014 -0700 35.3 @@ -1,5 +1,5 @@ 35.4 /* 35.5 - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. 35.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved. 35.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 35.8 * 35.9 * This code is free software; you can redistribute it and/or modify it 35.10 @@ -33,14 +33,15 @@ 35.11 public void run() { 35.12 try { 35.13 if (!noReinit) dCipher.init(Cipher.DECRYPT_MODE, key, algParams); 35.14 + decode = new byte[decodeLength]; 35.15 + if (testingMisalignment) { 35.16 + int tempSize = dCipher.update(encode, encOutputOffset, (decodeMsgSize - lastChunkSize), decode, decOutputOffset); 35.17 + dCipher.doFinal(encode, (encOutputOffset + decodeMsgSize - lastChunkSize), lastChunkSize, decode, (decOutputOffset + tempSize)); 35.18 + } else { 35.19 + dCipher.doFinal(encode, encOutputOffset, decodeMsgSize, decode, decOutputOffset); 35.20 + } 35.21 if (checkOutput) { 35.22 - // checked version creates new output buffer each time 35.23 - decode = dCipher.doFinal(encode, 0, encode.length); 35.24 compareArrays(decode, expectedDecode); 35.25 - } else { 35.26 - // non-checked version outputs to existing encode buffer for maximum speed 35.27 - decode = new byte[dCipher.getOutputSize(encode.length)]; 35.28 - dCipher.doFinal(encode, 0, encode.length, decode); 35.29 } 35.30 } 35.31 catch (Exception e) {
36.1 --- a/test/compiler/7184394/TestAESEncode.java Wed May 07 10:58:47 2014 -0700 36.2 +++ b/test/compiler/7184394/TestAESEncode.java Thu May 08 23:07:11 2014 -0700 36.3 @@ -1,5 +1,5 @@ 36.4 /* 36.5 - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. 36.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved. 36.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 36.8 * 36.9 * This code is free software; you can redistribute it and/or modify it 36.10 @@ -33,14 +33,15 @@ 36.11 public void run() { 36.12 try { 36.13 if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams); 36.14 + encode = new byte[encodeLength]; 36.15 + if (testingMisalignment) { 36.16 + int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset); 36.17 + cipher.doFinal(input, (encInputOffset + msgSize - lastChunkSize), lastChunkSize, encode, (encOutputOffset + tempSize)); 36.18 + } else { 36.19 + cipher.doFinal(input, encInputOffset, msgSize, encode, encOutputOffset); 36.20 + } 36.21 if (checkOutput) { 36.22 - // checked version creates new output buffer each time 36.23 - encode = cipher.doFinal(input, 0, msgSize); 36.24 compareArrays(encode, expectedEncode); 36.25 - } else { 36.26 - // non-checked version outputs to existing encode buffer for maximum speed 36.27 - encode = new byte[cipher.getOutputSize(msgSize)]; 36.28 - cipher.doFinal(input, 0, msgSize, encode); 36.29 } 36.30 } 36.31 catch (Exception e) {
37.1 --- a/test/compiler/7184394/TestAESMain.java Wed May 07 10:58:47 2014 -0700 37.2 +++ b/test/compiler/7184394/TestAESMain.java Thu May 08 23:07:11 2014 -0700 37.3 @@ -1,5 +1,5 @@ 37.4 /* 37.5 - * Copyright (c) 2012, 2014 Oracle and/or its affiliates. All rights reserved. 37.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved. 37.7 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 37.8 * 37.9 * This code is free software; you can redistribute it and/or modify it 37.10 @@ -28,7 +28,19 @@ 37.11 * @summary add intrinsics to use AES instructions 37.12 * 37.13 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC TestAESMain 37.14 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 TestAESMain 37.15 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencOutputOffset=1 TestAESMain 37.16 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DdecOutputOffset=1 TestAESMain 37.17 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 -DencOutputOffset=1 TestAESMain 37.18 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain 37.19 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain 37.20 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB TestAESMain 37.21 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 TestAESMain 37.22 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencOutputOffset=1 TestAESMain 37.23 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DdecOutputOffset=1 TestAESMain 37.24 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 
-DencOutputOffset=1 TestAESMain 37.25 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain 37.26 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain 37.27 * 37.28 * @author Tom Deneau 37.29 */ 37.30 @@ -36,12 +48,13 @@ 37.31 public class TestAESMain { 37.32 public static void main(String[] args) { 37.33 int iters = (args.length > 0 ? Integer.valueOf(args[0]) : 1000000); 37.34 + int warmupIters = (args.length > 1 ? Integer.valueOf(args[1]) : 20000); 37.35 System.out.println(iters + " iterations"); 37.36 TestAESEncode etest = new TestAESEncode(); 37.37 etest.prepare(); 37.38 - // warm-up for 20K iterations 37.39 + // warm-up 37.40 System.out.println("Starting encryption warm-up"); 37.41 - for (int i=0; i<20000; i++) { 37.42 + for (int i=0; i<warmupIters; i++) { 37.43 etest.run(); 37.44 } 37.45 System.out.println("Finished encryption warm-up"); 37.46 @@ -54,9 +67,9 @@ 37.47 37.48 TestAESDecode dtest = new TestAESDecode(); 37.49 dtest.prepare(); 37.50 - // warm-up for 20K iterations 37.51 + // warm-up 37.52 System.out.println("Starting decryption warm-up"); 37.53 - for (int i=0; i<20000; i++) { 37.54 + for (int i=0; i<warmupIters; i++) { 37.55 dtest.run(); 37.56 } 37.57 System.out.println("Finished decryption warm-up");