src/cpu/x86/vm/sharedRuntime_x86_64.cpp

changeset 0:f90c822e73f8
child 6876:710a3c8b516e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Wed Apr 27 01:25:04 2016 +0800
     1.3 @@ -0,0 +1,4105 @@
     1.4 +/*
     1.5 + * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
     1.6 + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.7 + *
     1.8 + * This code is free software; you can redistribute it and/or modify it
     1.9 + * under the terms of the GNU General Public License version 2 only, as
    1.10 + * published by the Free Software Foundation.
    1.11 + *
    1.12 + * This code is distributed in the hope that it will be useful, but WITHOUT
    1.13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    1.14 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    1.15 + * version 2 for more details (a copy is included in the LICENSE file that
    1.16 + * accompanied this code).
    1.17 + *
    1.18 + * You should have received a copy of the GNU General Public License version
    1.19 + * 2 along with this work; if not, write to the Free Software Foundation,
    1.20 + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    1.21 + *
    1.22 + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    1.23 + * or visit www.oracle.com if you need additional information or have any
    1.24 + * questions.
    1.25 + *
    1.26 + */
    1.27 +
    1.28 +#include "precompiled.hpp"
    1.29 +#include "asm/macroAssembler.hpp"
    1.30 +#include "asm/macroAssembler.inline.hpp"
    1.31 +#include "code/debugInfoRec.hpp"
    1.32 +#include "code/icBuffer.hpp"
    1.33 +#include "code/vtableStubs.hpp"
    1.34 +#include "interpreter/interpreter.hpp"
    1.35 +#include "oops/compiledICHolder.hpp"
    1.36 +#include "prims/jvmtiRedefineClassesTrace.hpp"
    1.37 +#include "runtime/sharedRuntime.hpp"
    1.38 +#include "runtime/vframeArray.hpp"
    1.39 +#include "vmreg_x86.inline.hpp"
    1.40 +#ifdef COMPILER1
    1.41 +#include "c1/c1_Runtime1.hpp"
    1.42 +#endif
    1.43 +#ifdef COMPILER2
    1.44 +#include "opto/runtime.hpp"
    1.45 +#endif
    1.46 +
    1.47 +#define __ masm->
    1.48 +
    1.49 +const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
    1.50 +
    1.51 +class SimpleRuntimeFrame {
    1.52 +
    1.53 +  public:
    1.54 +
    1.55 +  // Most of the runtime stubs have this simple frame layout.
    1.56 +  // This class exists to make the layout shared in one place.
    1.57 +  // Offsets are for compiler stack slots, which are jints.
    1.58 +  enum layout {
    1.59 +    // The frame sender code expects that rbp will be in the "natural" place and
    1.60 +    // will override any oopMap setting for it. We must therefore force the layout
    1.61 +    // so that it agrees with the frame sender code.
    1.62 +    rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
    1.63 +    rbp_off2,
    1.64 +    return_off, return_off2,
    1.65 +    framesize
    1.66 +  };
    1.67 +};
    1.68 +
    1.69 +class RegisterSaver {
    1.70 +  // Capture info about frame layout.  Layout offsets are in jint
    1.71 +  // units because compiler frame slots are jints.
    1.72 +#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
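         +// For instance, DEF_XMM_OFFS(1) expands to:
         +//   xmm1_off = xmm_off + (1)*16/BytesPerInt, xmm1H_off   (i.e. xmm_off + 4 with 4-byte slots)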
    1.73 +  enum layout {
    1.74 +    fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
    1.75 +    xmm_off       = fpu_state_off + 160/BytesPerInt,            // offset in fxsave save area
    1.76 +    DEF_XMM_OFFS(0),
    1.77 +    DEF_XMM_OFFS(1),
    1.78 +    DEF_XMM_OFFS(2),
    1.79 +    DEF_XMM_OFFS(3),
    1.80 +    DEF_XMM_OFFS(4),
    1.81 +    DEF_XMM_OFFS(5),
    1.82 +    DEF_XMM_OFFS(6),
    1.83 +    DEF_XMM_OFFS(7),
    1.84 +    DEF_XMM_OFFS(8),
    1.85 +    DEF_XMM_OFFS(9),
    1.86 +    DEF_XMM_OFFS(10),
    1.87 +    DEF_XMM_OFFS(11),
    1.88 +    DEF_XMM_OFFS(12),
    1.89 +    DEF_XMM_OFFS(13),
    1.90 +    DEF_XMM_OFFS(14),
    1.91 +    DEF_XMM_OFFS(15),
    1.92 +    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
    1.93 +    fpu_stateH_end,
    1.94 +    r15_off, r15H_off,
    1.95 +    r14_off, r14H_off,
    1.96 +    r13_off, r13H_off,
    1.97 +    r12_off, r12H_off,
    1.98 +    r11_off, r11H_off,
    1.99 +    r10_off, r10H_off,
   1.100 +    r9_off,  r9H_off,
   1.101 +    r8_off,  r8H_off,
   1.102 +    rdi_off, rdiH_off,
   1.103 +    rsi_off, rsiH_off,
   1.104 +    ignore_off, ignoreH_off,  // extra copy of rbp
   1.105 +    rsp_off, rspH_off,
   1.106 +    rbx_off, rbxH_off,
   1.107 +    rdx_off, rdxH_off,
   1.108 +    rcx_off, rcxH_off,
   1.109 +    rax_off, raxH_off,
   1.110 +    // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
   1.111 +    align_off, alignH_off,
   1.112 +    flags_off, flagsH_off,
   1.113 +    // The frame sender code expects that rbp will be in the "natural" place and
   1.114 +    // will override any oopMap setting for it. We must therefore force the layout
   1.115 +    // so that it agrees with the frame sender code.
   1.116 +    rbp_off, rbpH_off,        // copy of rbp we will restore
   1.117 +    return_off, returnH_off,  // slot for return address
   1.118 +    reg_save_size             // size in compiler stack slots
   1.119 +  };
   1.120 +
   1.121 + public:
   1.122 +  static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
   1.123 +  static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
   1.124 +
   1.125 +  // Offsets into the register save area
   1.126 +  // Used by deoptimization when it is managing result register
   1.127 +  // values on its own
   1.128 +
   1.129 +  static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
   1.130 +  static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
   1.131 +  static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
   1.132 +  static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
   1.133 +  static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
   1.134 +
   1.135 +  // During deoptimization only the result registers need to be restored,
   1.136 +  // all the other values have already been extracted.
   1.137 +  static void restore_result_registers(MacroAssembler* masm);
   1.138 +};
   1.139 +
   1.140 +OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
   1.141 +  int vect_words = 0;
   1.142 +#ifdef COMPILER2
   1.143 +  if (save_vectors) {
   1.144 +    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
   1.145 +    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
   1.146 +    // Save upper half of YMM registers
   1.147 +    vect_words = 16 * 16 / wordSize;
   1.148 +    additional_frame_words += vect_words;
   1.149 +  }
   1.150 +#else
   1.151 +  assert(!save_vectors, "vectors are generated only by C2");
   1.152 +#endif
   1.153 +
   1.154 +  // Always make the frame size 16-byte aligned
   1.155 +  int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
   1.156 +                                     reg_save_size*BytesPerInt, 16);
   1.157 +  // OopMap frame size is in compiler stack slots (jint's) not bytes or words
   1.158 +  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
   1.159 +  // The caller will allocate additional_frame_words
   1.160 +  int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
   1.161 +  // CodeBlob frame size is in words.
   1.162 +  int frame_size_in_words = frame_size_in_bytes / wordSize;
   1.163 +  *total_frame_words = frame_size_in_words;
   1.164 +
   1.165 +  // Save registers, fpu state, and flags.
   1.166 +  // We assume caller has already pushed the return address onto the
   1.167 +  // stack, so rsp is 8-byte aligned here.
   1.168 +  // We push rbp twice in this sequence because we want the real rbp
   1.169 +  // to be under the return address like a normal enter.
   1.170 +
   1.171 +  __ enter();          // rsp becomes 16-byte aligned here
   1.172 +  __ push_CPU_state(); // Push a multiple of 16 bytes
   1.173 +
   1.174 +  if (vect_words > 0) {
   1.175 +    assert(vect_words*wordSize == 256, "");
   1.176 +    __ subptr(rsp, 256); // Save upper half of YMM registers
   1.177 +    __ vextractf128h(Address(rsp,  0),xmm0);
   1.178 +    __ vextractf128h(Address(rsp, 16),xmm1);
   1.179 +    __ vextractf128h(Address(rsp, 32),xmm2);
   1.180 +    __ vextractf128h(Address(rsp, 48),xmm3);
   1.181 +    __ vextractf128h(Address(rsp, 64),xmm4);
   1.182 +    __ vextractf128h(Address(rsp, 80),xmm5);
   1.183 +    __ vextractf128h(Address(rsp, 96),xmm6);
   1.184 +    __ vextractf128h(Address(rsp,112),xmm7);
   1.185 +    __ vextractf128h(Address(rsp,128),xmm8);
   1.186 +    __ vextractf128h(Address(rsp,144),xmm9);
   1.187 +    __ vextractf128h(Address(rsp,160),xmm10);
   1.188 +    __ vextractf128h(Address(rsp,176),xmm11);
   1.189 +    __ vextractf128h(Address(rsp,192),xmm12);
   1.190 +    __ vextractf128h(Address(rsp,208),xmm13);
   1.191 +    __ vextractf128h(Address(rsp,224),xmm14);
   1.192 +    __ vextractf128h(Address(rsp,240),xmm15);
   1.193 +  }
   1.194 +  if (frame::arg_reg_save_area_bytes != 0) {
   1.195 +    // Allocate argument register save area
   1.196 +    __ subptr(rsp, frame::arg_reg_save_area_bytes);
   1.197 +  }
   1.198 +
   1.199 +  // Set an oopmap for the call site.  This oopmap will map all
   1.200 +  // oop-registers and debug-info registers as callee-saved.  This
   1.201 +  // will allow deoptimization at this safepoint to find all possible
   1.202 +  // debug-info recordings, as well as let GC find all oops.
   1.203 +
   1.204 +  OopMapSet *oop_maps = new OopMapSet();
   1.205 +  OopMap* map = new OopMap(frame_size_in_slots, 0);
   1.206 +
   1.207 +#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
   1.208 +
   1.209 +  map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
   1.210 +  map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
   1.211 +  map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
   1.212 +  map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
   1.213 +  // rbp location is known implicitly by the frame sender code, needs no oopmap
   1.214 +  // and the location where rbp was saved is ignored
   1.215 +  map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
   1.216 +  map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
   1.217 +  map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
   1.218 +  map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
   1.219 +  map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
   1.220 +  map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
   1.221 +  map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
   1.222 +  map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
   1.223 +  map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
   1.224 +  map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
   1.225 +  map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg());
   1.226 +  map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg());
   1.227 +  map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg());
   1.228 +  map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg());
   1.229 +  map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg());
   1.230 +  map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg());
   1.231 +  map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg());
   1.232 +  map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg());
   1.233 +  map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg());
   1.234 +  map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg());
   1.235 +  map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg());
   1.236 +  map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg());
   1.237 +  map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg());
   1.238 +  map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
   1.239 +  map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
   1.240 +  map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
   1.241 +
   1.242 +  // %%% These should all be a waste but we'll keep things as they were for now
   1.243 +  if (true) {
   1.244 +    map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
   1.245 +    map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
   1.246 +    map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
   1.247 +    map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
   1.248 +    // rbp location is known implicitly by the frame sender code, needs no oopmap
   1.249 +    map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
   1.250 +    map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
   1.251 +    map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
   1.252 +    map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
   1.253 +    map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
   1.254 +    map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
   1.255 +    map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
   1.256 +    map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
   1.257 +    map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
   1.258 +    map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
   1.259 +    map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next());
   1.260 +    map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next());
   1.261 +    map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next());
   1.262 +    map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next());
   1.263 +    map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next());
   1.264 +    map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next());
   1.265 +    map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next());
   1.266 +    map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next());
   1.267 +    map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next());
   1.268 +    map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next());
   1.269 +    map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next());
   1.270 +    map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next());
   1.271 +    map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next());
   1.272 +    map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
   1.273 +    map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
   1.274 +    map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
   1.275 +  }
   1.276 +
   1.277 +  return map;
   1.278 +}
   1.279 +
   1.280 +void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   1.281 +  if (frame::arg_reg_save_area_bytes != 0) {
   1.282 +    // Pop arg register save area
   1.283 +    __ addptr(rsp, frame::arg_reg_save_area_bytes);
   1.284 +  }
   1.285 +#ifdef COMPILER2
   1.286 +  if (restore_vectors) {
   1.287 +    // Restore upper half of YMM registers.
   1.288 +    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
   1.289 +    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
   1.290 +    __ vinsertf128h(xmm0, Address(rsp,  0));
   1.291 +    __ vinsertf128h(xmm1, Address(rsp, 16));
   1.292 +    __ vinsertf128h(xmm2, Address(rsp, 32));
   1.293 +    __ vinsertf128h(xmm3, Address(rsp, 48));
   1.294 +    __ vinsertf128h(xmm4, Address(rsp, 64));
   1.295 +    __ vinsertf128h(xmm5, Address(rsp, 80));
   1.296 +    __ vinsertf128h(xmm6, Address(rsp, 96));
   1.297 +    __ vinsertf128h(xmm7, Address(rsp,112));
   1.298 +    __ vinsertf128h(xmm8, Address(rsp,128));
   1.299 +    __ vinsertf128h(xmm9, Address(rsp,144));
   1.300 +    __ vinsertf128h(xmm10, Address(rsp,160));
   1.301 +    __ vinsertf128h(xmm11, Address(rsp,176));
   1.302 +    __ vinsertf128h(xmm12, Address(rsp,192));
   1.303 +    __ vinsertf128h(xmm13, Address(rsp,208));
   1.304 +    __ vinsertf128h(xmm14, Address(rsp,224));
   1.305 +    __ vinsertf128h(xmm15, Address(rsp,240));
   1.306 +    __ addptr(rsp, 256);
   1.307 +  }
   1.308 +#else
   1.309 +  assert(!restore_vectors, "vectors are generated only by C2");
   1.310 +#endif
   1.311 +  // Recover CPU state
   1.312 +  __ pop_CPU_state();
   1.313 +  // Get the rbp described implicitly by the calling convention (no oopMap)
   1.314 +  __ pop(rbp);
   1.315 +}
   1.316 +
   1.317 +void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
   1.318 +
   1.319 +  // Just restore result register. Only used by deoptimization. By
   1.320 +  // now any callee save register that needs to be restored to a c2
   1.321 +  // caller of the deoptee has been extracted into the vframeArray
   1.322 +  // and will be stuffed into the c2i adapter we create for later
   1.323 +  // restoration so only result registers need to be restored here.
   1.324 +
   1.325 +  // Restore fp result register
   1.326 +  __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
   1.327 +  // Restore integer result register
   1.328 +  __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
   1.329 +  __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
   1.330 +
   1.331 +  // Pop all of the register save area off the stack except the return address
   1.332 +  __ addptr(rsp, return_offset_in_bytes());
   1.333 +}
   1.334 +
   1.335 +// Is the vector's size (in bytes) bigger than the size saved by default?
   1.336 +// The 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
   1.337 +bool SharedRuntime::is_wide_vector(int size) {
   1.338 +  return size > 16;
   1.339 +}
   1.340 +
   1.341 +// The java_calling_convention describes stack locations as ideal slots on
   1.342 +// a frame with no abi restrictions. Since we must observe abi restrictions
   1.343 +// (like the placement of the register window) the slots must be biased by
   1.344 +// the following value.
   1.345 +static int reg2offset_in(VMReg r) {
   1.346 +  // Account for saved rbp and return address
   1.347 +  // This should really be in_preserve_stack_slots
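         +  // For example, incoming stack slot 0 maps to (0 + 4) * 4 == 16 bytes, i.e. just
         +  // above the 8-byte saved rbp and the 8-byte return address.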
   1.348 +  return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
   1.349 +}
   1.350 +
   1.351 +static int reg2offset_out(VMReg r) {
   1.352 +  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
   1.353 +}
   1.354 +
   1.355 +// ---------------------------------------------------------------------------
   1.356 +// Read the array of BasicTypes from a signature, and compute where the
   1.357 +// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
   1.358 +// quantities.  Values less than VMRegImpl::stack0 are registers, those above
   1.359 +// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
   1.360 +// as framesizes are fixed.
   1.361 +// VMRegImpl::stack0 refers to the first slot 0(sp).
   1.362 +// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
   1.363 +// up to RegisterImpl::number_of_registers are the 64-bit
   1.364 +// integer registers.
   1.365 +
   1.366 +// Note: the INPUTS in sig_bt are in units of Java argument words, which are
   1.367 +// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
   1.368 +// units regardless of build. Of course for i486 there is no 64 bit build
   1.369 +
   1.370 +// The Java calling convention is a "shifted" version of the C ABI.
   1.371 +// By skipping the first C ABI register we can call non-static jni methods
   1.372 +// with small numbers of arguments without having to shuffle the arguments
   1.373 +// at all. Since we control the java ABI we ought to at least get some
   1.374 +// advantage out of it.
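         +//
         +// For instance, a signature of (int, long, Object, double) maps via the tables
         +// below to j_rarg0, j_rarg1 (both halves), j_rarg2 and j_farg0, with no stack
         +// slots needed.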
   1.375 +
   1.376 +int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
   1.377 +                                           VMRegPair *regs,
   1.378 +                                           int total_args_passed,
   1.379 +                                           int is_outgoing) {
   1.380 +
   1.381 +  // Create the mapping between argument positions and
   1.382 +  // registers.
   1.383 +  static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
   1.384 +    j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
   1.385 +  };
   1.386 +  static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
   1.387 +    j_farg0, j_farg1, j_farg2, j_farg3,
   1.388 +    j_farg4, j_farg5, j_farg6, j_farg7
   1.389 +  };
   1.390 +
   1.391 +
   1.392 +  uint int_args = 0;
   1.393 +  uint fp_args = 0;
   1.394 +  uint stk_args = 0; // inc by 2 each time
   1.395 +
   1.396 +  for (int i = 0; i < total_args_passed; i++) {
   1.397 +    switch (sig_bt[i]) {
   1.398 +    case T_BOOLEAN:
   1.399 +    case T_CHAR:
   1.400 +    case T_BYTE:
   1.401 +    case T_SHORT:
   1.402 +    case T_INT:
   1.403 +      if (int_args < Argument::n_int_register_parameters_j) {
   1.404 +        regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
   1.405 +      } else {
   1.406 +        regs[i].set1(VMRegImpl::stack2reg(stk_args));
   1.407 +        stk_args += 2;
   1.408 +      }
   1.409 +      break;
   1.410 +    case T_VOID:
   1.411 +      // halves of T_LONG or T_DOUBLE
   1.412 +      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
   1.413 +      regs[i].set_bad();
   1.414 +      break;
   1.415 +    case T_LONG:
   1.416 +      assert(sig_bt[i + 1] == T_VOID, "expecting half");
   1.417 +      // fall through
   1.418 +    case T_OBJECT:
   1.419 +    case T_ARRAY:
   1.420 +    case T_ADDRESS:
   1.421 +      if (int_args < Argument::n_int_register_parameters_j) {
   1.422 +        regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
   1.423 +      } else {
   1.424 +        regs[i].set2(VMRegImpl::stack2reg(stk_args));
   1.425 +        stk_args += 2;
   1.426 +      }
   1.427 +      break;
   1.428 +    case T_FLOAT:
   1.429 +      if (fp_args < Argument::n_float_register_parameters_j) {
   1.430 +        regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
   1.431 +      } else {
   1.432 +        regs[i].set1(VMRegImpl::stack2reg(stk_args));
   1.433 +        stk_args += 2;
   1.434 +      }
   1.435 +      break;
   1.436 +    case T_DOUBLE:
   1.437 +      assert(sig_bt[i + 1] == T_VOID, "expecting half");
   1.438 +      if (fp_args < Argument::n_float_register_parameters_j) {
   1.439 +        regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
   1.440 +      } else {
   1.441 +        regs[i].set2(VMRegImpl::stack2reg(stk_args));
   1.442 +        stk_args += 2;
   1.443 +      }
   1.444 +      break;
   1.445 +    default:
   1.446 +      ShouldNotReachHere();
   1.447 +      break;
   1.448 +    }
   1.449 +  }
   1.450 +
   1.451 +  return round_to(stk_args, 2);
   1.452 +}
   1.453 +
   1.454 +// Patch the caller's callsite with the entry to compiled code if it exists.
   1.455 +static void patch_callers_callsite(MacroAssembler *masm) {
   1.456 +  Label L;
   1.457 +  __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
   1.458 +  __ jcc(Assembler::equal, L);
   1.459 +
   1.460 +  // Save the current stack pointer
   1.461 +  __ mov(r13, rsp);
   1.462 +  // Schedule the branch target address early.
   1.463 +  // Call into the VM to patch the caller, then jump to compiled callee
   1.464 +  // rax isn't live so capture return address while we easily can
   1.465 +  __ movptr(rax, Address(rsp, 0));
   1.466 +
   1.467 +  // align stack so push_CPU_state doesn't fault
   1.468 +  __ andptr(rsp, -(StackAlignmentInBytes));
   1.469 +  __ push_CPU_state();
   1.470 +
   1.471 +  // VM needs caller's callsite
   1.472 +  // VM needs target method
   1.473 +  // This needs to be a long call since we will relocate this adapter to
   1.474 +  // the codeBuffer and it may not reach
   1.475 +
   1.476 +  // Allocate argument register save area
   1.477 +  if (frame::arg_reg_save_area_bytes != 0) {
   1.478 +    __ subptr(rsp, frame::arg_reg_save_area_bytes);
   1.479 +  }
   1.480 +  __ mov(c_rarg0, rbx);
   1.481 +  __ mov(c_rarg1, rax);
   1.482 +  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
   1.483 +
   1.484 +  // De-allocate argument register save area
   1.485 +  if (frame::arg_reg_save_area_bytes != 0) {
   1.486 +    __ addptr(rsp, frame::arg_reg_save_area_bytes);
   1.487 +  }
   1.488 +
   1.489 +  __ pop_CPU_state();
   1.490 +  // restore sp
   1.491 +  __ mov(rsp, r13);
   1.492 +  __ bind(L);
   1.493 +}
   1.494 +
   1.495 +
   1.496 +static void gen_c2i_adapter(MacroAssembler *masm,
   1.497 +                            int total_args_passed,
   1.498 +                            int comp_args_on_stack,
   1.499 +                            const BasicType *sig_bt,
   1.500 +                            const VMRegPair *regs,
   1.501 +                            Label& skip_fixup) {
   1.502 +  // Before we get into the guts of the C2I adapter, see if we should be here
   1.503 +  // at all.  We've come from compiled code and are attempting to jump to the
   1.504 +  // interpreter, which means the caller made a static call to get here
   1.505 +  // (vcalls always get a compiled target if there is one).  Check for a
   1.506 +  // compiled target.  If there is one, we need to patch the caller's call.
   1.507 +  patch_callers_callsite(masm);
   1.508 +
   1.509 +  __ bind(skip_fixup);
   1.510 +
   1.511 +  // Since all args are passed on the stack, total_args_passed *
   1.512 +  // Interpreter::stackElementSize is the space we need. Plus one word because
   1.513 +  // we also account for the return address location since
   1.514 +  // we store it first rather than holding it in rax across all the shuffling
   1.515 +
   1.516 +  int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
   1.517 +
   1.518 +  // stack is aligned, keep it that way
   1.519 +  extraspace = round_to(extraspace, 2*wordSize);
   1.520 +
   1.521 +  // Get return address
   1.522 +  __ pop(rax);
   1.523 +
   1.524 +  // set senderSP value
   1.525 +  __ mov(r13, rsp);
   1.526 +
   1.527 +  __ subptr(rsp, extraspace);
   1.528 +
   1.529 +  // Store the return address in the expected location
   1.530 +  __ movptr(Address(rsp, 0), rax);
   1.531 +
   1.532 +  // Now write the args into the outgoing interpreter space
   1.533 +  for (int i = 0; i < total_args_passed; i++) {
   1.534 +    if (sig_bt[i] == T_VOID) {
   1.535 +      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   1.536 +      continue;
   1.537 +    }
   1.538 +
   1.539 +    // offset to start parameters
   1.540 +    int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
   1.541 +    int next_off = st_off - Interpreter::stackElementSize;
   1.542 +
   1.543 +    // Say 4 args:
   1.544 +    // i   st_off
   1.545 +    // 0   32 T_LONG
   1.546 +    // 1   24 T_VOID
   1.547 +    // 2   16 T_OBJECT
   1.548 +    // 3    8 T_BOOL
   1.549 +    // -    0 return address
   1.550 +    //
   1.551 +    // However, to make things extra confusing: because we can fit a long/double in
   1.552 +    // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
   1.553 +    // leaves one slot empty and only stores to a single slot. In this case the
   1.554 +    // slot that is occupied is the T_VOID slot. See, I said it was confusing.
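         +    //
         +    // So in the 4-arg example above, the T_LONG at i == 0 is stored at next_off (24)
         +    // while st_off (32) is left unused (filled with known junk under ASSERT).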
   1.555 +
   1.556 +    VMReg r_1 = regs[i].first();
   1.557 +    VMReg r_2 = regs[i].second();
   1.558 +    if (!r_1->is_valid()) {
   1.559 +      assert(!r_2->is_valid(), "");
   1.560 +      continue;
   1.561 +    }
   1.562 +    if (r_1->is_stack()) {
   1.563 +      // memory to memory use rax
   1.564 +      int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
   1.565 +      if (!r_2->is_valid()) {
   1.566 +        // sign extend??
   1.567 +        __ movl(rax, Address(rsp, ld_off));
   1.568 +        __ movptr(Address(rsp, st_off), rax);
   1.569 +
   1.570 +      } else {
   1.571 +
   1.572 +        __ movq(rax, Address(rsp, ld_off));
   1.573 +
   1.574 +        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
   1.575 +        // T_DOUBLE and T_LONG use two slots in the interpreter
   1.576 +        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
   1.577 +          // ld_off == LSW, ld_off+wordSize == MSW
   1.578 +          // st_off == MSW, next_off == LSW
   1.579 +          __ movq(Address(rsp, next_off), rax);
   1.580 +#ifdef ASSERT
   1.581 +          // Overwrite the unused slot with known junk
   1.582 +          __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
   1.583 +          __ movptr(Address(rsp, st_off), rax);
   1.584 +#endif /* ASSERT */
   1.585 +        } else {
   1.586 +          __ movq(Address(rsp, st_off), rax);
   1.587 +        }
   1.588 +      }
   1.589 +    } else if (r_1->is_Register()) {
   1.590 +      Register r = r_1->as_Register();
   1.591 +      if (!r_2->is_valid()) {
   1.592 +        // must be only an int (or smaller), so move only 32 bits to the slot
   1.593 +        // why not sign extend??
   1.594 +        __ movl(Address(rsp, st_off), r);
   1.595 +      } else {
   1.596 +        // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
   1.597 +        // T_DOUBLE and T_LONG use two slots in the interpreter
   1.598 +        if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
   1.599 +          // long/double in gpr
   1.600 +#ifdef ASSERT
   1.601 +          // Overwrite the unused slot with known junk
   1.602 +          __ mov64(rax, CONST64(0xdeadffffdeadaaab));
   1.603 +          __ movptr(Address(rsp, st_off), rax);
   1.604 +#endif /* ASSERT */
   1.605 +          __ movq(Address(rsp, next_off), r);
   1.606 +        } else {
   1.607 +          __ movptr(Address(rsp, st_off), r);
   1.608 +        }
   1.609 +      }
   1.610 +    } else {
   1.611 +      assert(r_1->is_XMMRegister(), "");
   1.612 +      if (!r_2->is_valid()) {
   1.613 +        // only a float, use just part of the slot
   1.614 +        __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
   1.615 +      } else {
   1.616 +#ifdef ASSERT
   1.617 +        // Overwrite the unused slot with known junk
   1.618 +        __ mov64(rax, CONST64(0xdeadffffdeadaaac));
   1.619 +        __ movptr(Address(rsp, st_off), rax);
   1.620 +#endif /* ASSERT */
   1.621 +        __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
   1.622 +      }
   1.623 +    }
   1.624 +  }
   1.625 +
   1.626 +  // Schedule the branch target address early.
   1.627 +  __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
   1.628 +  __ jmp(rcx);
   1.629 +}
   1.630 +
   1.631 +static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
   1.632 +                        address code_start, address code_end,
   1.633 +                        Label& L_ok) {
   1.634 +  Label L_fail;
   1.635 +  __ lea(temp_reg, ExternalAddress(code_start));
   1.636 +  __ cmpptr(pc_reg, temp_reg);
   1.637 +  __ jcc(Assembler::belowEqual, L_fail);
   1.638 +  __ lea(temp_reg, ExternalAddress(code_end));
   1.639 +  __ cmpptr(pc_reg, temp_reg);
   1.640 +  __ jcc(Assembler::below, L_ok);
   1.641 +  __ bind(L_fail);
   1.642 +}
   1.643 +
   1.644 +static void gen_i2c_adapter(MacroAssembler *masm,
   1.645 +                            int total_args_passed,
   1.646 +                            int comp_args_on_stack,
   1.647 +                            const BasicType *sig_bt,
   1.648 +                            const VMRegPair *regs) {
   1.649 +
   1.650 +  // Note: r13 contains the senderSP on entry. We must preserve it since
   1.651 +  // we may do an i2c -> c2i transition if we lose a race where compiled
   1.652 +  // code goes non-entrant while we get args ready.
   1.653 +  // In addition we use r13 to locate all the interpreter args because
   1.654 +  // we must align the stack to 16 bytes on an i2c entry, or else we
   1.655 +  // lose the alignment we expect in all compiled code and the register
   1.656 +  // save code can segv when fxsave instructions find an improperly
   1.657 +  // aligned stack pointer.
   1.658 +
   1.659 +  // Adapters can be frameless because they do not require the caller
   1.660 +  // to perform additional cleanup work, such as correcting the stack pointer.
   1.661 +  // An i2c adapter is frameless because the *caller* frame, which is interpreted,
   1.662 +  // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
   1.663 +  // even if a callee has modified the stack pointer.
   1.664 +  // A c2i adapter is frameless because the *callee* frame, which is interpreted,
   1.665 +  // routinely repairs its caller's stack pointer (from sender_sp, which is set
   1.666 +  // up via the senderSP register).
   1.667 +  // In other words, if *either* the caller or callee is interpreted, we can
   1.668 +  // get the stack pointer repaired after a call.
   1.669 +  // This is why c2i and i2c adapters cannot be indefinitely composed.
   1.670 +  // In particular, if a c2i adapter were to somehow call an i2c adapter,
   1.671 +  // both caller and callee would be compiled methods, and neither would
   1.672 +  // clean up the stack pointer changes performed by the two adapters.
   1.673 +  // If this happens, control eventually transfers back to the compiled
   1.674 +  // caller, but with an uncorrected stack, causing delayed havoc.
   1.675 +
   1.676 +  // Pick up the return address
   1.677 +  __ movptr(rax, Address(rsp, 0));
   1.678 +
   1.679 +  if (VerifyAdapterCalls &&
   1.680 +      (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
   1.681 +    // So, let's test for cascading c2i/i2c adapters right now.
   1.682 +    //  assert(Interpreter::contains($return_addr) ||
   1.683 +    //         StubRoutines::contains($return_addr),
   1.684 +    //         "i2c adapter must return to an interpreter frame");
   1.685 +    __ block_comment("verify_i2c { ");
   1.686 +    Label L_ok;
   1.687 +    if (Interpreter::code() != NULL)
   1.688 +      range_check(masm, rax, r11,
   1.689 +                  Interpreter::code()->code_start(), Interpreter::code()->code_end(),
   1.690 +                  L_ok);
   1.691 +    if (StubRoutines::code1() != NULL)
   1.692 +      range_check(masm, rax, r11,
   1.693 +                  StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
   1.694 +                  L_ok);
   1.695 +    if (StubRoutines::code2() != NULL)
   1.696 +      range_check(masm, rax, r11,
   1.697 +                  StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
   1.698 +                  L_ok);
   1.699 +    const char* msg = "i2c adapter must return to an interpreter frame";
   1.700 +    __ block_comment(msg);
   1.701 +    __ stop(msg);
   1.702 +    __ bind(L_ok);
   1.703 +    __ block_comment("} verify_i2c ");
   1.704 +  }
   1.705 +
   1.706 +  // Must preserve original SP for loading incoming arguments because
   1.707 +  // we need to align the outgoing SP for compiled code.
   1.708 +  __ movptr(r11, rsp);
   1.709 +
   1.710 +  // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
   1.711 +  // in registers, we will occasionally have no stack args.
   1.712 +  int comp_words_on_stack = 0;
   1.713 +  if (comp_args_on_stack) {
   1.714 +    // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
   1.715 +    // registers are below.  By subtracting stack0, we either get a negative
   1.716 +    // number (all values in registers) or the maximum stack slot accessed.
   1.717 +
   1.718 +    // Convert 4-byte c2 stack slots to words.
   1.719 +    comp_words_on_stack = round_to(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
   1.720 +    // Round up to minimum stack alignment, in wordSize
   1.721 +    comp_words_on_stack = round_to(comp_words_on_stack, 2);
   1.722 +    __ subptr(rsp, comp_words_on_stack * wordSize);
   1.723 +  }
   1.724 +
   1.725 +
   1.726 +  // Ensure compiled code always sees stack at proper alignment
   1.727 +  __ andptr(rsp, -16);
   1.728 +
   1.729 +  // push the return address and misalign the stack just as the youngest frame always
   1.730 +  // sees it, i.e. as if it had been pushed by a call instruction
   1.731 +  __ push(rax);
   1.732 +
   1.733 +  // Put saved SP in another register
   1.734 +  const Register saved_sp = rax;
   1.735 +  __ movptr(saved_sp, r11);
   1.736 +
   1.737 +  // Will jump to the compiled code just as if compiled code was doing it.
   1.738 +  // Pre-load the register-jump target early, to schedule it better.
   1.739 +  __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
   1.740 +
   1.741 +  // Now generate the shuffle code.  Pick up all register args and move the
   1.742 +  // rest through the floating point stack top.
   1.743 +  for (int i = 0; i < total_args_passed; i++) {
   1.744 +    if (sig_bt[i] == T_VOID) {
   1.745 +      // Longs and doubles are passed in native word order, but misaligned
   1.746 +      // in the 32-bit build.
   1.747 +      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   1.748 +      continue;
   1.749 +    }
   1.750 +
   1.751 +    // Pick up 0, 1 or 2 words from SP+offset.
   1.752 +
   1.753 +    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
   1.754 +            "scrambled load targets?");
   1.755 +    // Load in argument order going down.
   1.756 +    int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
   1.757 +    // Point to interpreter value (vs. tag)
   1.758 +    int next_off = ld_off - Interpreter::stackElementSize;
   1.759 +    //
   1.760 +    //
   1.761 +    //
   1.762 +    VMReg r_1 = regs[i].first();
   1.763 +    VMReg r_2 = regs[i].second();
   1.764 +    if (!r_1->is_valid()) {
   1.765 +      assert(!r_2->is_valid(), "");
   1.766 +      continue;
   1.767 +    }
   1.768 +    if (r_1->is_stack()) {
   1.769 +      // Convert stack slot to an SP offset (+ wordSize to account for return address )
   1.770 +      int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
   1.771 +
   1.772 +      // We can use r13 as a temp here because compiled code doesn't need r13 as an input
   1.773 +      // and if we end up going thru a c2i because of a miss a reasonable value of r13
   1.774 +      // will be generated.
   1.775 +      if (!r_2->is_valid()) {
   1.776 +        // sign extend???
   1.777 +        __ movl(r13, Address(saved_sp, ld_off));
   1.778 +        __ movptr(Address(rsp, st_off), r13);
   1.779 +      } else {
   1.780 +        //
   1.781 +        // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
   1.782 +        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
   1.783 +        // So we must adjust where to pick up the data to match the interpreter.
   1.784 +        //
   1.785 +        // Interpreter local[n] == MSW, local[n+1] == LSW however locals
   1.786 +        // are accessed as negative so LSW is at LOW address
   1.787 +
   1.788 +        // ld_off is MSW so get LSW
   1.789 +        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
   1.790 +                           next_off : ld_off;
   1.791 +        __ movq(r13, Address(saved_sp, offset));
   1.792 +        // st_off is LSW (i.e. reg.first())
   1.793 +        __ movq(Address(rsp, st_off), r13);
   1.794 +      }
   1.795 +    } else if (r_1->is_Register()) {  // Register argument
   1.796 +      Register r = r_1->as_Register();
   1.797 +      assert(r != rax, "must be different");
   1.798 +      if (r_2->is_valid()) {
   1.799 +        //
   1.800 +        // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
   1.801 +        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
   1.802 +        // So we must adjust where to pick up the data to match the interpreter.
   1.803 +
   1.804 +        const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
   1.805 +                           next_off : ld_off;
   1.806 +
   1.807 +        // this can be a misaligned move
   1.808 +        __ movq(r, Address(saved_sp, offset));
   1.809 +      } else {
   1.810 +        // sign extend and use a full word?
   1.811 +        __ movl(r, Address(saved_sp, ld_off));
   1.812 +      }
   1.813 +    } else {
   1.814 +      if (!r_2->is_valid()) {
   1.815 +        __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
   1.816 +      } else {
   1.817 +        __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
   1.818 +      }
   1.819 +    }
   1.820 +  }
   1.821 +
   1.822 +  // 6243940 We might end up in handle_wrong_method if
   1.823 +  // the callee is deoptimized as we race thru here. If that
   1.824 +  // happens we don't want to take a safepoint because the
   1.825 +  // caller frame will look interpreted and arguments are now
   1.826 +  // "compiled" so it is much better to make this transition
   1.827 +  // invisible to the stack walking code. Unfortunately if
   1.828 +  // we try and find the callee by normal means a safepoint
   1.829 +  // is possible. So we stash the desired callee in the thread
   1.830 +  // and the VM will find it there should this case occur.
   1.831 +
   1.832 +  __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
   1.833 +
   1.834 +  // put Method* where a c2i would expect it should we end up there
   1.835 +  // only needed because c2's resolve stubs return Method* as a result in
   1.836 +  // rax
   1.837 +  __ mov(rax, rbx);
   1.838 +  __ jmp(r11);
   1.839 +}
   1.840 +
   1.841 +// ---------------------------------------------------------------
   1.842 +AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
   1.843 +                                                            int total_args_passed,
   1.844 +                                                            int comp_args_on_stack,
   1.845 +                                                            const BasicType *sig_bt,
   1.846 +                                                            const VMRegPair *regs,
   1.847 +                                                            AdapterFingerPrint* fingerprint) {
   1.848 +  address i2c_entry = __ pc();
   1.849 +
   1.850 +  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
   1.851 +
   1.852 +  // -------------------------------------------------------------------------
   1.853 +  // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
   1.854 +  // to the interpreter.  The args start out packed in the compiled layout.  They
   1.855 +  // need to be unpacked into the interpreter layout.  This will almost always
   1.856 +  // require some stack space.  We grow the current (compiled) stack, then repack
   1.857 +  // the args.  We  finally end in a jump to the generic interpreter entry point.
   1.858 +  // On exit from the interpreter, the interpreter will restore our SP (lest the
   1.859 +  // compiled code, which relies solely on SP and not RBP, get sick).
   1.860 +
   1.861 +  address c2i_unverified_entry = __ pc();
   1.862 +  Label skip_fixup;
   1.863 +  Label ok;
   1.864 +
   1.865 +  Register holder = rax;
   1.866 +  Register receiver = j_rarg0;
   1.867 +  Register temp = rbx;
   1.868 +
   1.869 +  {
   1.870 +    __ load_klass(temp, receiver);
   1.871 +    __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
   1.872 +    __ movptr(rbx, Address(holder, CompiledICHolder::holder_method_offset()));
   1.873 +    __ jcc(Assembler::equal, ok);
   1.874 +    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
   1.875 +
   1.876 +    __ bind(ok);
   1.877 +    // Method might have been compiled since the call site was patched to
   1.878 +    // interpreted; if that is the case, treat it as a miss so we can get
   1.879 +    // the call site corrected.
   1.880 +    __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
   1.881 +    __ jcc(Assembler::equal, skip_fixup);
   1.882 +    __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
   1.883 +  }
   1.884 +
   1.885 +  address c2i_entry = __ pc();
   1.886 +
   1.887 +  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
   1.888 +
   1.889 +  __ flush();
   1.890 +  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
   1.891 +}
   1.892 +
   1.893 +int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
   1.894 +                                         VMRegPair *regs,
   1.895 +                                         VMRegPair *regs2,
   1.896 +                                         int total_args_passed) {
   1.897 +  assert(regs2 == NULL, "not needed on x86");
   1.898 +// We return the number of VMRegImpl stack slots we need to reserve for all
   1.899 +// the arguments NOT counting out_preserve_stack_slots.
   1.900 +
   1.901 +// NOTE: These arrays will have to change when c1 is ported
   1.902 +#ifdef _WIN64
   1.903 +    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
   1.904 +      c_rarg0, c_rarg1, c_rarg2, c_rarg3
   1.905 +    };
   1.906 +    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
   1.907 +      c_farg0, c_farg1, c_farg2, c_farg3
   1.908 +    };
   1.909 +#else
   1.910 +    static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
   1.911 +      c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
   1.912 +    };
   1.913 +    static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
   1.914 +      c_farg0, c_farg1, c_farg2, c_farg3,
   1.915 +      c_farg4, c_farg5, c_farg6, c_farg7
   1.916 +    };
   1.917 +#endif // _WIN64
   1.918 +
   1.919 +
   1.920 +    uint int_args = 0;
   1.921 +    uint fp_args = 0;
   1.922 +    uint stk_args = 0; // inc by 2 each time
   1.923 +
   1.924 +    for (int i = 0; i < total_args_passed; i++) {
   1.925 +      switch (sig_bt[i]) {
   1.926 +      case T_BOOLEAN:
   1.927 +      case T_CHAR:
   1.928 +      case T_BYTE:
   1.929 +      case T_SHORT:
   1.930 +      case T_INT:
   1.931 +        if (int_args < Argument::n_int_register_parameters_c) {
   1.932 +          regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
   1.933 +#ifdef _WIN64
   1.934 +          fp_args++;
   1.935 +          // Allocate slots for the callee to stuff register args on the stack.
   1.936 +          stk_args += 2;
   1.937 +#endif
   1.938 +        } else {
   1.939 +          regs[i].set1(VMRegImpl::stack2reg(stk_args));
   1.940 +          stk_args += 2;
   1.941 +        }
   1.942 +        break;
   1.943 +      case T_LONG:
   1.944 +        assert(sig_bt[i + 1] == T_VOID, "expecting half");
   1.945 +        // fall through
   1.946 +      case T_OBJECT:
   1.947 +      case T_ARRAY:
   1.948 +      case T_ADDRESS:
   1.949 +      case T_METADATA:
   1.950 +        if (int_args < Argument::n_int_register_parameters_c) {
   1.951 +          regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
   1.952 +#ifdef _WIN64
   1.953 +          fp_args++;
   1.954 +          stk_args += 2;
   1.955 +#endif
   1.956 +        } else {
   1.957 +          regs[i].set2(VMRegImpl::stack2reg(stk_args));
   1.958 +          stk_args += 2;
   1.959 +        }
   1.960 +        break;
   1.961 +      case T_FLOAT:
   1.962 +        if (fp_args < Argument::n_float_register_parameters_c) {
   1.963 +          regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
   1.964 +#ifdef _WIN64
   1.965 +          int_args++;
   1.966 +          // Allocate slots for the callee to stuff register args on the stack.
   1.967 +          stk_args += 2;
   1.968 +#endif
   1.969 +        } else {
   1.970 +          regs[i].set1(VMRegImpl::stack2reg(stk_args));
   1.971 +          stk_args += 2;
   1.972 +        }
   1.973 +        break;
   1.974 +      case T_DOUBLE:
   1.975 +        assert(sig_bt[i + 1] == T_VOID, "expecting half");
   1.976 +        if (fp_args < Argument::n_float_register_parameters_c) {
   1.977 +          regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
   1.978 +#ifdef _WIN64
   1.979 +          int_args++;
   1.980 +          // Allocate slots for the callee to stuff register args on the stack.
   1.981 +          stk_args += 2;
   1.982 +#endif
   1.983 +        } else {
   1.984 +          regs[i].set2(VMRegImpl::stack2reg(stk_args));
   1.985 +          stk_args += 2;
   1.986 +        }
   1.987 +        break;
   1.988 +      case T_VOID: // Halves of longs and doubles
   1.989 +        assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
   1.990 +        regs[i].set_bad();
   1.991 +        break;
   1.992 +      default:
   1.993 +        ShouldNotReachHere();
   1.994 +        break;
   1.995 +      }
   1.996 +    }
   1.997 +#ifdef _WIN64
   1.998 +  // The Windows ABI requires that we always allocate enough stack space
   1.999 +  // for 4 64-bit registers to be stored down (8 slots * 4 bytes == the 32-byte shadow space).
  1.1000 +  if (stk_args < 8) {
  1.1001 +    stk_args = 8;
  1.1002 +  }
  1.1003 +#endif // _WIN64
  1.1004 +
  1.1005 +  return stk_args;
  1.1006 +}
  1.1007 +
  1.1008 +// On 64-bit we will store integer-like items to the stack as
  1.1009 +// 64-bit items (sparc abi) even though java would only store
  1.1010 +// 32 bits for a parameter. On 32-bit it will simply be 32 bits,
  1.1011 +// so this routine will do 32->32 on 32-bit and 32->64 on 64-bit
  1.1012 +static void move32_64(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1.1013 +  if (src.first()->is_stack()) {
  1.1014 +    if (dst.first()->is_stack()) {
  1.1015 +      // stack to stack
  1.1016 +      __ movslq(rax, Address(rbp, reg2offset_in(src.first())));
  1.1017 +      __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  1.1018 +    } else {
  1.1019 +      // stack to reg
  1.1020 +      __ movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
  1.1021 +    }
  1.1022 +  } else if (dst.first()->is_stack()) {
  1.1023 +    // reg to stack
  1.1024 +    // Do we really have to sign extend???
  1.1025 +    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
  1.1026 +    __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  1.1027 +  } else {
  1.1028 +    // Do we really have to sign extend???
  1.1029 +    // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
  1.1030 +    if (dst.first() != src.first()) {
  1.1031 +      __ movq(dst.first()->as_Register(), src.first()->as_Register());
  1.1032 +    }
  1.1033 +  }
  1.1034 +}
  1.1035 +
  1.1036 +static void move_ptr(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1.1037 +  if (src.first()->is_stack()) {
  1.1038 +    if (dst.first()->is_stack()) {
  1.1039 +      // stack to stack
  1.1040 +      __ movq(rax, Address(rbp, reg2offset_in(src.first())));
  1.1041 +      __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  1.1042 +    } else {
  1.1043 +      // stack to reg
  1.1044 +      __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
  1.1045 +    }
  1.1046 +  } else if (dst.first()->is_stack()) {
  1.1047 +    // reg to stack
  1.1048 +    __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  1.1049 +  } else {
  1.1050 +    if (dst.first() != src.first()) {
  1.1051 +      __ movq(dst.first()->as_Register(), src.first()->as_Register());
  1.1052 +    }
  1.1053 +  }
  1.1054 +}
  1.1055 +
  1.1056 +// An oop arg. Must pass a handle, not the oop itself
  1.1057 +static void object_move(MacroAssembler* masm,
  1.1058 +                        OopMap* map,
  1.1059 +                        int oop_handle_offset,
  1.1060 +                        int framesize_in_slots,
  1.1061 +                        VMRegPair src,
  1.1062 +                        VMRegPair dst,
  1.1063 +                        bool is_receiver,
  1.1064 +                        int* receiver_offset) {
  1.1065 +
  1.1066 +  // must pass a handle. First figure out the location we use as a handle
  1.1067 +
  1.1068 +  Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
  1.1069 +
  1.1070 +  // See if the oop is NULL; if it is we need no handle
  1.1071 +
  1.1072 +  if (src.first()->is_stack()) {
  1.1073 +
  1.1074 +    // Oop is already on the stack as an argument
  1.1075 +    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
  1.1076 +    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
  1.1077 +    if (is_receiver) {
  1.1078 +      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
  1.1079 +    }
  1.1080 +
  1.1081 +    __ cmpptr(Address(rbp, reg2offset_in(src.first())), (int32_t)NULL_WORD);
  1.1082 +    __ lea(rHandle, Address(rbp, reg2offset_in(src.first())));
  1.1083 +    // conditionally move a NULL
  1.1084 +    __ cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
  1.1085 +  } else {
  1.1086 +
  1.1087 +    // Oop is in a register; we must store it to the space we reserve
  1.1088 +    // on the stack for oop_handles and pass a handle if the oop is non-NULL
  1.1089 +
  1.1090 +    const Register rOop = src.first()->as_Register();
  1.1091 +    int oop_slot;
  1.1092 +    if (rOop == j_rarg0)
  1.1093 +      oop_slot = 0;
  1.1094 +    else if (rOop == j_rarg1)
  1.1095 +      oop_slot = 1;
  1.1096 +    else if (rOop == j_rarg2)
  1.1097 +      oop_slot = 2;
  1.1098 +    else if (rOop == j_rarg3)
  1.1099 +      oop_slot = 3;
  1.1100 +    else if (rOop == j_rarg4)
  1.1101 +      oop_slot = 4;
  1.1102 +    else {
  1.1103 +      assert(rOop == j_rarg5, "wrong register");
  1.1104 +      oop_slot = 5;
  1.1105 +    }
  1.1106 +
  1.1107 +    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
  1.1108 +    int offset = oop_slot*VMRegImpl::stack_slot_size;
  1.1109 +
  1.1110 +    map->set_oop(VMRegImpl::stack2reg(oop_slot));
  1.1111 +    // Store oop in handle area, may be NULL
  1.1112 +    __ movptr(Address(rsp, offset), rOop);
  1.1113 +    if (is_receiver) {
  1.1114 +      *receiver_offset = offset;
  1.1115 +    }
  1.1116 +
  1.1117 +    __ cmpptr(rOop, (int32_t)NULL_WORD);
  1.1118 +    __ lea(rHandle, Address(rsp, offset));
  1.1119 +    // conditionally move a NULL from the handle area where it was just stored
  1.1120 +    __ cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
  1.1121 +  }
  1.1122 +
  1.1123 +  // If arg is on the stack then place it otherwise it is already in correct reg.
  1.1124 +  if (dst.first()->is_stack()) {
  1.1125 +    __ movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
  1.1126 +  }
  1.1127 +}
  1.1128 +
  1.1129 +// A float arg may have to do a float reg to int reg conversion
  1.1130 +static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1.1131 +  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
  1.1132 +
  1.1133 +  // The calling convention assures us that each VMRegPair is either
  1.1134 +  // entirely one physical register or a pair of adjacent stack slots.
  1.1135 +  // This greatly simplifies the cases here compared to sparc.
  1.1136 +
  1.1137 +  if (src.first()->is_stack()) {
  1.1138 +    if (dst.first()->is_stack()) {
  1.1139 +      __ movl(rax, Address(rbp, reg2offset_in(src.first())));
  1.1140 +      __ movptr(Address(rsp, reg2offset_out(dst.first())), rax);
  1.1141 +    } else {
  1.1142 +      // stack to reg
  1.1143 +      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
  1.1144 +      __ movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first())));
  1.1145 +    }
  1.1146 +  } else if (dst.first()->is_stack()) {
  1.1147 +    // reg to stack
  1.1148 +    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
  1.1149 +    __ movflt(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
  1.1150 +  } else {
  1.1151 +    // reg to reg
  1.1152 +    // In theory these overlap but the ordering is such that this is likely a nop
  1.1153 +    if ( src.first() != dst.first()) {
  1.1154 +      __ movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
  1.1155 +    }
  1.1156 +  }
  1.1157 +}
  1.1158 +
  1.1159 +// A long move
  1.1160 +static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1.1161 +
  1.1162 +  // The calling conventions assure us that each VMRegPair is either
  1.1163 +  // entirely one physical register or adjacent stack slots.
  1.1164 +  // This greatly simplifies the cases here compared to sparc.
  1.1165 +
  1.1166 +  if (src.is_single_phys_reg() ) {
  1.1167 +    if (dst.is_single_phys_reg()) {
  1.1168 +      if (dst.first() != src.first()) {
  1.1169 +        __ mov(dst.first()->as_Register(), src.first()->as_Register());
  1.1170 +      }
  1.1171 +    } else {
  1.1172 +      assert(dst.is_single_reg(), "not a stack pair");
  1.1173 +      __ movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  1.1174 +    }
  1.1175 +  } else if (dst.is_single_phys_reg()) {
  1.1176 +    assert(src.is_single_reg(),  "not a stack pair");
  1.1177 +    __ movq(dst.first()->as_Register(), Address(rbp, reg2offset_out(src.first())));
  1.1178 +  } else {
  1.1179 +    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
  1.1180 +    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
  1.1181 +    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  1.1182 +  }
  1.1183 +}
  1.1184 +
  1.1185 +// A double move
  1.1186 +static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1.1187 +
  1.1188 +  // The calling conventions assure us that each VMRegPair is either
  1.1189 +  // entirely one physical register or adjacent stack slots.
  1.1190 +  // This greatly simplifies the cases here compared to sparc.
  1.1191 +
  1.1192 +  if (src.is_single_phys_reg() ) {
  1.1193 +    if (dst.is_single_phys_reg()) {
  1.1194 +      // In theory these overlap but the ordering is such that this is likely a nop
  1.1195 +      if ( src.first() != dst.first()) {
  1.1196 +        __ movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
  1.1197 +      }
  1.1198 +    } else {
  1.1199 +      assert(dst.is_single_reg(), "not a stack pair");
  1.1200 +      __ movdbl(Address(rsp, reg2offset_out(dst.first())), src.first()->as_XMMRegister());
  1.1201 +    }
  1.1202 +  } else if (dst.is_single_phys_reg()) {
  1.1203 +    assert(src.is_single_reg(),  "not a stack pair");
  1.1204 +    __ movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_out(src.first())));
  1.1205 +  } else {
  1.1206 +    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
  1.1207 +    __ movq(rax, Address(rbp, reg2offset_in(src.first())));
  1.1208 +    __ movq(Address(rsp, reg2offset_out(dst.first())), rax);
  1.1209 +  }
  1.1210 +}
  1.1211 +
  1.1212 +
  1.1213 +void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1.1214 +  // We always ignore the frame_slots arg and just use the space just below frame pointer
  1.1215 +  // which by this time is free to use
  1.1216 +  switch (ret_type) {
  1.1217 +  case T_FLOAT:
  1.1218 +    __ movflt(Address(rbp, -wordSize), xmm0);
  1.1219 +    break;
  1.1220 +  case T_DOUBLE:
  1.1221 +    __ movdbl(Address(rbp, -wordSize), xmm0);
  1.1222 +    break;
  1.1223 +  case T_VOID:  break;
  1.1224 +  default: {
  1.1225 +    __ movptr(Address(rbp, -wordSize), rax);
  1.1226 +    }
  1.1227 +  }
  1.1228 +}
  1.1229 +
  1.1230 +void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1.1231 +  // We always ignore the frame_slots arg and just use the space just below frame pointer
  1.1232 +  // which by this time is free to use
  1.1233 +  switch (ret_type) {
  1.1234 +  case T_FLOAT:
  1.1235 +    __ movflt(xmm0, Address(rbp, -wordSize));
  1.1236 +    break;
  1.1237 +  case T_DOUBLE:
  1.1238 +    __ movdbl(xmm0, Address(rbp, -wordSize));
  1.1239 +    break;
  1.1240 +  case T_VOID:  break;
  1.1241 +  default: {
  1.1242 +    __ movptr(rax, Address(rbp, -wordSize));
  1.1243 +    }
  1.1244 +  }
  1.1245 +}
  1.1246 +
  1.1247 +static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1.1248 +    for ( int i = first_arg ; i < arg_count ; i++ ) {
  1.1249 +      if (args[i].first()->is_Register()) {
  1.1250 +        __ push(args[i].first()->as_Register());
  1.1251 +      } else if (args[i].first()->is_XMMRegister()) {
  1.1252 +        __ subptr(rsp, 2*wordSize);
  1.1253 +        __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
  1.1254 +      }
  1.1255 +    }
  1.1256 +}
  1.1257 +
  1.1258 +static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1.1259 +    for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
  1.1260 +      if (args[i].first()->is_Register()) {
  1.1261 +        __ pop(args[i].first()->as_Register());
  1.1262 +      } else if (args[i].first()->is_XMMRegister()) {
  1.1263 +        __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
  1.1264 +        __ addptr(rsp, 2*wordSize);
  1.1265 +      }
  1.1266 +    }
  1.1267 +}
  1.1268 +
  1.1269 +
  1.1270 +static void save_or_restore_arguments(MacroAssembler* masm,
  1.1271 +                                      const int stack_slots,
  1.1272 +                                      const int total_in_args,
  1.1273 +                                      const int arg_save_area,
  1.1274 +                                      OopMap* map,
  1.1275 +                                      VMRegPair* in_regs,
  1.1276 +                                      BasicType* in_sig_bt) {
  1.1277 +  // if map is non-NULL then the code should store the values,
  1.1278 +  // otherwise it should load them.
  1.1279 +  int slot = arg_save_area;
  1.1280 +  // Save down double word first
  1.1281 +  for ( int i = 0; i < total_in_args; i++) {
  1.1282 +    if (in_regs[i].first()->is_XMMRegister() && in_sig_bt[i] == T_DOUBLE) {
  1.1283 +      int offset = slot * VMRegImpl::stack_slot_size;
  1.1284 +      slot += VMRegImpl::slots_per_word;
  1.1285 +      assert(slot <= stack_slots, "overflow");
  1.1286 +      if (map != NULL) {
  1.1287 +        __ movdbl(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
  1.1288 +      } else {
  1.1289 +        __ movdbl(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
  1.1290 +      }
  1.1291 +    }
  1.1292 +    if (in_regs[i].first()->is_Register() &&
  1.1293 +        (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_ARRAY)) {
  1.1294 +      int offset = slot * VMRegImpl::stack_slot_size;
  1.1295 +      if (map != NULL) {
  1.1296 +        __ movq(Address(rsp, offset), in_regs[i].first()->as_Register());
  1.1297 +        if (in_sig_bt[i] == T_ARRAY) {
  1.1298 +          map->set_oop(VMRegImpl::stack2reg(slot));
  1.1299 +        }
  1.1300 +      } else {
  1.1301 +        __ movq(in_regs[i].first()->as_Register(), Address(rsp, offset));
  1.1302 +      }
  1.1303 +      slot += VMRegImpl::slots_per_word;
  1.1304 +    }
  1.1305 +  }
  1.1306 +  // Save or restore single word registers
  1.1307 +  for ( int i = 0; i < total_in_args; i++) {
  1.1308 +    if (in_regs[i].first()->is_Register()) {
  1.1309 +      int offset = slot * VMRegImpl::stack_slot_size;
  1.1310 +      slot++;
  1.1311 +      assert(slot <= stack_slots, "overflow");
  1.1312 +
  1.1313 +      // Value is in an input register; we must flush it to the stack
  1.1314 +      const Register reg = in_regs[i].first()->as_Register();
  1.1315 +      switch (in_sig_bt[i]) {
  1.1316 +        case T_BOOLEAN:
  1.1317 +        case T_CHAR:
  1.1318 +        case T_BYTE:
  1.1319 +        case T_SHORT:
  1.1320 +        case T_INT:
  1.1321 +          if (map != NULL) {
  1.1322 +            __ movl(Address(rsp, offset), reg);
  1.1323 +          } else {
  1.1324 +            __ movl(reg, Address(rsp, offset));
  1.1325 +          }
  1.1326 +          break;
  1.1327 +        case T_ARRAY:
  1.1328 +        case T_LONG:
  1.1329 +          // handled above
  1.1330 +          break;
  1.1331 +        case T_OBJECT:
  1.1332 +        default: ShouldNotReachHere();
  1.1333 +      }
  1.1334 +    } else if (in_regs[i].first()->is_XMMRegister()) {
  1.1335 +      if (in_sig_bt[i] == T_FLOAT) {
  1.1336 +        int offset = slot * VMRegImpl::stack_slot_size;
  1.1337 +        slot++;
  1.1338 +        assert(slot <= stack_slots, "overflow");
  1.1339 +        if (map != NULL) {
  1.1340 +          __ movflt(Address(rsp, offset), in_regs[i].first()->as_XMMRegister());
  1.1341 +        } else {
  1.1342 +          __ movflt(in_regs[i].first()->as_XMMRegister(), Address(rsp, offset));
  1.1343 +        }
  1.1344 +      }
  1.1345 +    } else if (in_regs[i].first()->is_stack()) {
  1.1346 +      if (in_sig_bt[i] == T_ARRAY && map != NULL) {
  1.1347 +        int offset_in_older_frame = in_regs[i].first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
  1.1348 +        map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + stack_slots));
  1.1349 +      }
  1.1350 +    }
  1.1351 +  }
  1.1352 +}
  1.1353 +
  1.1354 +
  1.1355 +// Check GC_locker::needs_gc and enter the runtime if it's true.  This
  1.1356 +// keeps a new JNI critical region from starting until a GC has been
  1.1357 +// forced.  Save down any oops in registers and describe them in an
  1.1358 +// OopMap.
  1.1359 +static void check_needs_gc_for_critical_native(MacroAssembler* masm,
  1.1360 +                                               int stack_slots,
  1.1361 +                                               int total_c_args,
  1.1362 +                                               int total_in_args,
  1.1363 +                                               int arg_save_area,
  1.1364 +                                               OopMapSet* oop_maps,
  1.1365 +                                               VMRegPair* in_regs,
  1.1366 +                                               BasicType* in_sig_bt) {
  1.1367 +  __ block_comment("check GC_locker::needs_gc");
  1.1368 +  Label cont;
  1.1369 +  __ cmp8(ExternalAddress((address)GC_locker::needs_gc_address()), false);
  1.1370 +  __ jcc(Assembler::equal, cont);
  1.1371 +
  1.1372 +  // Save down any incoming oops and call into the runtime to halt for a GC
  1.1373 +
  1.1374 +  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
  1.1375 +  save_or_restore_arguments(masm, stack_slots, total_in_args,
  1.1376 +                            arg_save_area, map, in_regs, in_sig_bt);
  1.1377 +
  1.1378 +  address the_pc = __ pc();
  1.1379 +  oop_maps->add_gc_map( __ offset(), map);
  1.1380 +  __ set_last_Java_frame(rsp, noreg, the_pc);
  1.1381 +
  1.1382 +  __ block_comment("block_for_jni_critical");
  1.1383 +  __ movptr(c_rarg0, r15_thread);
  1.1384 +  __ mov(r12, rsp); // remember sp
  1.1385 +  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  1.1386 +  __ andptr(rsp, -16); // align stack as required by ABI
  1.1387 +  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::block_for_jni_critical)));
  1.1388 +  __ mov(rsp, r12); // restore sp
  1.1389 +  __ reinit_heapbase();
  1.1390 +
  1.1391 +  __ reset_last_Java_frame(false, true);
  1.1392 +
  1.1393 +  save_or_restore_arguments(masm, stack_slots, total_in_args,
  1.1394 +                            arg_save_area, NULL, in_regs, in_sig_bt);
  1.1395 +
  1.1396 +  __ bind(cont);
  1.1397 +#ifdef ASSERT
  1.1398 +  if (StressCriticalJNINatives) {
  1.1399 +    // Stress register saving
  1.1400 +    OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
  1.1401 +    save_or_restore_arguments(masm, stack_slots, total_in_args,
  1.1402 +                              arg_save_area, map, in_regs, in_sig_bt);
  1.1403 +    // Destroy argument registers
  1.1404 +    for (int i = 0; i < total_in_args - 1; i++) {
  1.1405 +      if (in_regs[i].first()->is_Register()) {
  1.1406 +        const Register reg = in_regs[i].first()->as_Register();
  1.1407 +        __ xorptr(reg, reg);
  1.1408 +      } else if (in_regs[i].first()->is_XMMRegister()) {
  1.1409 +        __ xorpd(in_regs[i].first()->as_XMMRegister(), in_regs[i].first()->as_XMMRegister());
  1.1410 +      } else if (in_regs[i].first()->is_FloatRegister()) {
  1.1411 +        ShouldNotReachHere();
  1.1412 +      } else if (in_regs[i].first()->is_stack()) {
  1.1413 +        // Nothing to do
  1.1414 +      } else {
  1.1415 +        ShouldNotReachHere();
  1.1416 +      }
  1.1417 +      if (in_sig_bt[i] == T_LONG || in_sig_bt[i] == T_DOUBLE) {
  1.1418 +        i++;
  1.1419 +      }
  1.1420 +    }
  1.1421 +
  1.1422 +    save_or_restore_arguments(masm, stack_slots, total_in_args,
  1.1423 +                              arg_save_area, NULL, in_regs, in_sig_bt);
  1.1424 +  }
  1.1425 +#endif
  1.1426 +}
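// Roughly, the code emitted above has this shape (a sketch only, where
// 'needs_gc' stands for the byte at GC_locker::needs_gc_address()):
//
//   if (needs_gc) {
//     // spill argument registers and describe them in an OopMap
//     SharedRuntime::block_for_jni_critical(thread);
//     // reload argument registers
//   }
//   // fall through and run the critical native as usual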
  1.1427 +
  1.1428 +// Unpack an array argument into a pointer to the body and the length
  1.1429 +// if the array is non-null, otherwise pass 0 for both.
  1.1430 +static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
  1.1431 +  Register tmp_reg = rax;
  1.1432 +  assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
  1.1433 +         "possible collision");
  1.1434 +  assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
  1.1435 +         "possible collision");
  1.1436 +
  1.1437 +  __ block_comment("unpack_array_argument {");
  1.1438 +
  1.1439 +  // Pass the length, ptr pair
  1.1440 +  Label is_null, done;
  1.1441 +  VMRegPair tmp;
  1.1442 +  tmp.set_ptr(tmp_reg->as_VMReg());
  1.1443 +  if (reg.first()->is_stack()) {
  1.1444 +    // Load the arg up from the stack
  1.1445 +    move_ptr(masm, reg, tmp);
  1.1446 +    reg = tmp;
  1.1447 +  }
  1.1448 +  __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
  1.1449 +  __ jccb(Assembler::equal, is_null);
  1.1450 +  __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  1.1451 +  move_ptr(masm, tmp, body_arg);
  1.1452 +  // load the length relative to the body.
  1.1453 +  __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
  1.1454 +                           arrayOopDesc::base_offset_in_bytes(in_elem_type)));
  1.1455 +  move32_64(masm, tmp, length_arg);
  1.1456 +  __ jmpb(done);
  1.1457 +  __ bind(is_null);
  1.1458 +  // Pass zeros
  1.1459 +  __ xorptr(tmp_reg, tmp_reg);
  1.1460 +  move_ptr(masm, tmp, body_arg);
  1.1461 +  move32_64(masm, tmp, length_arg);
  1.1462 +  __ bind(done);
  1.1463 +
  1.1464 +  __ block_comment("} unpack_array_argument");
  1.1465 +}
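// Illustrative example (hypothetical signature): a critical native declared in
// Java as taking 'byte[] buf' receives the C pair (jint len, jbyte* body),
// computed roughly as
//
//   if (buf == NULL) { len = 0;  body = NULL; }
//   else { body = (jbyte*)buf + arrayOopDesc::base_offset_in_bytes(T_BYTE);
//          len  = length field of buf; }
//
// so the callee never sees the array oop itself.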
  1.1466 +
  1.1467 +
  1.1468 +// Different signatures may require very different orders for the move
  1.1469 +// to avoid clobbering other arguments.  There's no simple way to
  1.1470 +// order them safely.  Compute a safe order for issuing stores and
  1.1471 +// break any cycles in those stores.  This code is fairly general but
  1.1472 +// it's not necessary on the other platforms so we keep it in the
  1.1473 +// platform dependent code instead of moving it into a shared file.
  1.1474 +// (See bugs 7013347 & 7145024.)
  1.1475 +// Note that this code is specific to LP64.
  1.1476 +class ComputeMoveOrder: public StackObj {
  1.1477 +  class MoveOperation: public ResourceObj {
  1.1478 +    friend class ComputeMoveOrder;
  1.1479 +   private:
  1.1480 +    VMRegPair        _src;
  1.1481 +    VMRegPair        _dst;
  1.1482 +    int              _src_index;
  1.1483 +    int              _dst_index;
  1.1484 +    bool             _processed;
  1.1485 +    MoveOperation*  _next;
  1.1486 +    MoveOperation*  _prev;
  1.1487 +
  1.1488 +    static int get_id(VMRegPair r) {
  1.1489 +      return r.first()->value();
  1.1490 +    }
  1.1491 +
  1.1492 +   public:
  1.1493 +    MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
  1.1494 +      _src(src)
  1.1495 +    , _src_index(src_index)
  1.1496 +    , _dst(dst)
  1.1497 +    , _dst_index(dst_index)
  1.1498 +    , _next(NULL)
  1.1499 +    , _prev(NULL)
  1.1500 +    , _processed(false) {
  1.1501 +    }
  1.1502 +
  1.1503 +    VMRegPair src() const              { return _src; }
  1.1504 +    int src_id() const                 { return get_id(src()); }
  1.1505 +    int src_index() const              { return _src_index; }
  1.1506 +    VMRegPair dst() const              { return _dst; }
  1.1507 +    void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
  1.1508 +    int dst_index() const              { return _dst_index; }
  1.1509 +    int dst_id() const                 { return get_id(dst()); }
  1.1510 +    MoveOperation* next() const       { return _next; }
  1.1511 +    MoveOperation* prev() const       { return _prev; }
  1.1512 +    void set_processed()               { _processed = true; }
  1.1513 +    bool is_processed() const          { return _processed; }
  1.1514 +
  1.1515 +    // insert a new store to break a cycle through temp_register
  1.1516 +    void break_cycle(VMRegPair temp_register) {
  1.1517 +      // create a new store following the last store
  1.1518 +      // to move from the temp_register to the original
  1.1519 +      MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
  1.1520 +
  1.1521 +      // break the cycle of links and insert new_store at the end
  1.1522 +      // break the reverse link.
  1.1523 +      MoveOperation* p = prev();
  1.1524 +      assert(p->next() == this, "must be");
  1.1525 +      _prev = NULL;
  1.1526 +      p->_next = new_store;
  1.1527 +      new_store->_prev = p;
  1.1528 +
  1.1529 +      // change the original store to save its value in the temp.
  1.1530 +      set_dst(-1, temp_register);
  1.1531 +    }
  1.1532 +
  1.1533 +    void link(GrowableArray<MoveOperation*>& killer) {
  1.1534 +      // link this store in front of the store that it depends on
  1.1535 +      MoveOperation* n = killer.at_grow(src_id(), NULL);
  1.1536 +      if (n != NULL) {
  1.1537 +        assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
  1.1538 +        _next = n;
  1.1539 +        n->_prev = this;
  1.1540 +      }
  1.1541 +    }
  1.1542 +  };
  1.1543 +
  1.1544 + private:
  1.1545 +  GrowableArray<MoveOperation*> edges;
  1.1546 +
  1.1547 + public:
  1.1548 +  ComputeMoveOrder(int total_in_args, VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
  1.1549 +                    BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
  1.1550 +    // Move operations where the dest is the stack can all be
  1.1551 +    // scheduled first since they can't interfere with the other moves.
  1.1552 +    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
  1.1553 +      if (in_sig_bt[i] == T_ARRAY) {
  1.1554 +        c_arg--;
  1.1555 +        if (out_regs[c_arg].first()->is_stack() &&
  1.1556 +            out_regs[c_arg + 1].first()->is_stack()) {
  1.1557 +          arg_order.push(i);
  1.1558 +          arg_order.push(c_arg);
  1.1559 +        } else {
  1.1560 +          if (out_regs[c_arg].first()->is_stack() ||
  1.1561 +              in_regs[i].first() == out_regs[c_arg].first()) {
  1.1562 +            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
  1.1563 +          } else {
  1.1564 +            add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
  1.1565 +          }
  1.1566 +        }
  1.1567 +      } else if (in_sig_bt[i] == T_VOID) {
  1.1568 +        arg_order.push(i);
  1.1569 +        arg_order.push(c_arg);
  1.1570 +      } else {
  1.1571 +        if (out_regs[c_arg].first()->is_stack() ||
  1.1572 +            in_regs[i].first() == out_regs[c_arg].first()) {
  1.1573 +          arg_order.push(i);
  1.1574 +          arg_order.push(c_arg);
  1.1575 +        } else {
  1.1576 +          add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
  1.1577 +        }
  1.1578 +      }
  1.1579 +    }
  1.1580 +    // Break any cycles in the register moves and emit them in the
  1.1581 +    // proper order.
  1.1582 +    GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
  1.1583 +    for (int i = 0; i < stores->length(); i++) {
  1.1584 +      arg_order.push(stores->at(i)->src_index());
  1.1585 +      arg_order.push(stores->at(i)->dst_index());
  1.1586 +    }
  1.1587 + }
  1.1588 +
  1.1589 +  // Collect all the move operations
  1.1590 +  void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
  1.1591 +    if (src.first() == dst.first()) return;
  1.1592 +    edges.append(new MoveOperation(src_index, src, dst_index, dst));
  1.1593 +  }
  1.1594 +
  1.1595 +  // Walk the edges breaking cycles between moves.  The result list
  1.1596 +  // can be walked in order to produce the proper set of loads
  1.1597 +  GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
  1.1598 +    // Record which moves kill which values
  1.1599 +    GrowableArray<MoveOperation*> killer;
  1.1600 +    for (int i = 0; i < edges.length(); i++) {
  1.1601 +      MoveOperation* s = edges.at(i);
  1.1602 +      assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
  1.1603 +      killer.at_put_grow(s->dst_id(), s, NULL);
  1.1604 +    }
  1.1605 +    assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
  1.1606 +           "make sure temp isn't in the registers that are killed");
  1.1607 +
  1.1608 +    // create links between loads and stores
  1.1609 +    for (int i = 0; i < edges.length(); i++) {
  1.1610 +      edges.at(i)->link(killer);
  1.1611 +    }
  1.1612 +
  1.1613 +    // at this point, all the move operations are chained together
  1.1614 +    // in a doubly linked list.  Processing it backwards finds
  1.1615 +    // the beginning of the chain, forwards finds the end.  If there's
  1.1616 +    // a cycle it can be broken at any point,  so pick an edge and walk
  1.1617 +    // backward until the list ends or we end where we started.
  1.1618 +    GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
  1.1619 +    for (int e = 0; e < edges.length(); e++) {
  1.1620 +      MoveOperation* s = edges.at(e);
  1.1621 +      if (!s->is_processed()) {
  1.1622 +        MoveOperation* start = s;
  1.1623 +        // search for the beginning of the chain or cycle
  1.1624 +        while (start->prev() != NULL && start->prev() != s) {
  1.1625 +          start = start->prev();
  1.1626 +        }
  1.1627 +        if (start->prev() == s) {
  1.1628 +          start->break_cycle(temp_register);
  1.1629 +        }
  1.1630 +        // walk the chain forward inserting to store list
  1.1631 +        while (start != NULL) {
  1.1632 +          stores->append(start);
  1.1633 +          start->set_processed();
  1.1634 +          start = start->next();
  1.1635 +        }
  1.1636 +      }
  1.1637 +    }
  1.1638 +    return stores;
  1.1639 +  }
  1.1640 +};
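// Illustrative example (not emitted code): if the required moves are
// rdi -> rsi and rsi -> rdi, they form a cycle.  get_store_order() breaks it
// with the temp register (rbx here), producing e.g.
//
//   rsi -> rbx,  rdi -> rsi,  rbx -> rdi
//
// so no source register is overwritten before it has been read.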
  1.1641 +
  1.1642 +static void verify_oop_args(MacroAssembler* masm,
  1.1643 +                            methodHandle method,
  1.1644 +                            const BasicType* sig_bt,
  1.1645 +                            const VMRegPair* regs) {
  1.1646 +  Register temp_reg = rbx;  // not part of any compiled calling seq
  1.1647 +  if (VerifyOops) {
  1.1648 +    for (int i = 0; i < method->size_of_parameters(); i++) {
  1.1649 +      if (sig_bt[i] == T_OBJECT ||
  1.1650 +          sig_bt[i] == T_ARRAY) {
  1.1651 +        VMReg r = regs[i].first();
  1.1652 +        assert(r->is_valid(), "bad oop arg");
  1.1653 +        if (r->is_stack()) {
  1.1654 +          __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
  1.1655 +          __ verify_oop(temp_reg);
  1.1656 +        } else {
  1.1657 +          __ verify_oop(r->as_Register());
  1.1658 +        }
  1.1659 +      }
  1.1660 +    }
  1.1661 +  }
  1.1662 +}
  1.1663 +
  1.1664 +static void gen_special_dispatch(MacroAssembler* masm,
  1.1665 +                                 methodHandle method,
  1.1666 +                                 const BasicType* sig_bt,
  1.1667 +                                 const VMRegPair* regs) {
  1.1668 +  verify_oop_args(masm, method, sig_bt, regs);
  1.1669 +  vmIntrinsics::ID iid = method->intrinsic_id();
  1.1670 +
  1.1671 +  // Now write the args into the outgoing interpreter space
  1.1672 +  bool     has_receiver   = false;
  1.1673 +  Register receiver_reg   = noreg;
  1.1674 +  int      member_arg_pos = -1;
  1.1675 +  Register member_reg     = noreg;
  1.1676 +  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  1.1677 +  if (ref_kind != 0) {
  1.1678 +    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
  1.1679 +    member_reg = rbx;  // known to be free at this point
  1.1680 +    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  1.1681 +  } else if (iid == vmIntrinsics::_invokeBasic) {
  1.1682 +    has_receiver = true;
  1.1683 +  } else {
  1.1684 +    fatal(err_msg_res("unexpected intrinsic id %d", iid));
  1.1685 +  }
  1.1686 +
  1.1687 +  if (member_reg != noreg) {
  1.1688 +    // Load the member_arg into register, if necessary.
  1.1689 +    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
  1.1690 +    VMReg r = regs[member_arg_pos].first();
  1.1691 +    if (r->is_stack()) {
  1.1692 +      __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
  1.1693 +    } else {
  1.1694 +      // no data motion is needed
  1.1695 +      member_reg = r->as_Register();
  1.1696 +    }
  1.1697 +  }
  1.1698 +
  1.1699 +  if (has_receiver) {
  1.1700 +    // Make sure the receiver is loaded into a register.
  1.1701 +    assert(method->size_of_parameters() > 0, "oob");
  1.1702 +    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
  1.1703 +    VMReg r = regs[0].first();
  1.1704 +    assert(r->is_valid(), "bad receiver arg");
  1.1705 +    if (r->is_stack()) {
  1.1706 +      // Porting note:  This assumes that compiled calling conventions always
  1.1707 +      // pass the receiver oop in a register.  If this is not true on some
  1.1708 +      // platform, pick a temp and load the receiver from stack.
  1.1709 +      fatal("receiver always in a register");
  1.1710 +      receiver_reg = j_rarg0;  // known to be free at this point
  1.1711 +      __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
  1.1712 +    } else {
  1.1713 +      // no data motion is needed
  1.1714 +      receiver_reg = r->as_Register();
  1.1715 +    }
  1.1716 +  }
  1.1717 +
  1.1718 +  // Figure out which address we are really jumping to:
  1.1719 +  MethodHandles::generate_method_handle_dispatch(masm, iid,
  1.1720 +                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);
  1.1721 +}
  1.1722 +
  1.1723 +// ---------------------------------------------------------------------------
  1.1724 +// Generate a native wrapper for a given method.  The method takes arguments
  1.1725 +// in the Java compiled code convention, marshals them to the native
  1.1726 +// convention (handlizes oops, etc), transitions to native, makes the call,
  1.1727 +// returns to java state (possibly blocking), unhandlizes any result and
  1.1728 +// returns.
  1.1729 +//
  1.1730 +// Critical native functions are a shorthand for the use of
  1.1731 +// GetPrimitiveArrayCritical and disallow the use of any other JNI
  1.1732 +// functions.  The wrapper is expected to unpack the arguments before
  1.1733 +// passing them to the callee and perform checks before and after the
  1.1734 +// native call to ensure that the GC_locker
  1.1735 +// lock_critical/unlock_critical semantics are followed.  Some other
  1.1736 +// parts of JNI setup are skipped, like the tear down of the JNI handle
  1.1737 +// block and the check for pending exceptions, since it's impossible for them
  1.1738 +// to be thrown.
  1.1739 +//
  1.1740 +// They are roughly structured like this:
  1.1741 +//    if (GC_locker::needs_gc())
  1.1742 +//      SharedRuntime::block_for_jni_critical();
  1.1743 +//    transition to thread_in_native
  1.1744 +//    unpack array arguments and call native entry point
  1.1745 +//    check for safepoint in progress
  1.1746 +//    check if any thread suspend flags are set
  1.1747 +//      call into JVM and possibly unlock the JNI critical
  1.1748 +//      if a GC was suppressed while in the critical native.
  1.1749 +//    transition back to thread_in_Java
  1.1750 +//    return to caller
  1.1751 +//
  1.1752 +nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
  1.1753 +                                                methodHandle method,
  1.1754 +                                                int compile_id,
  1.1755 +                                                BasicType* in_sig_bt,
  1.1756 +                                                VMRegPair* in_regs,
  1.1757 +                                                BasicType ret_type) {
  1.1758 +  if (method->is_method_handle_intrinsic()) {
  1.1759 +    vmIntrinsics::ID iid = method->intrinsic_id();
  1.1760 +    intptr_t start = (intptr_t)__ pc();
  1.1761 +    int vep_offset = ((intptr_t)__ pc()) - start;
  1.1762 +    gen_special_dispatch(masm,
  1.1763 +                         method,
  1.1764 +                         in_sig_bt,
  1.1765 +                         in_regs);
  1.1766 +    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
  1.1767 +    __ flush();
  1.1768 +    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
  1.1769 +    return nmethod::new_native_nmethod(method,
  1.1770 +                                       compile_id,
  1.1771 +                                       masm->code(),
  1.1772 +                                       vep_offset,
  1.1773 +                                       frame_complete,
  1.1774 +                                       stack_slots / VMRegImpl::slots_per_word,
  1.1775 +                                       in_ByteSize(-1),
  1.1776 +                                       in_ByteSize(-1),
  1.1777 +                                       (OopMapSet*)NULL);
  1.1778 +  }
  1.1779 +  bool is_critical_native = true;
  1.1780 +  address native_func = method->critical_native_function();
  1.1781 +  if (native_func == NULL) {
  1.1782 +    native_func = method->native_function();
  1.1783 +    is_critical_native = false;
  1.1784 +  }
  1.1785 +  assert(native_func != NULL, "must have function");
  1.1786 +
  1.1787 +  // An OopMap for lock (and class if static)
  1.1788 +  OopMapSet *oop_maps = new OopMapSet();
  1.1789 +  intptr_t start = (intptr_t)__ pc();
  1.1790 +
  1.1791 +  // We have received a description of where all the java args are located
  1.1792 +  // on entry to the wrapper. We need to convert these args to where
  1.1793 +  // the jni function will expect them. To figure out where they go
  1.1794 +  // we convert the java signature to a C signature by inserting
  1.1795 +  // the hidden arguments as arg[0] and possibly arg[1] (static method)
  1.1796 +
  1.1797 +  const int total_in_args = method->size_of_parameters();
  1.1798 +  int total_c_args = total_in_args;
  1.1799 +  if (!is_critical_native) {
  1.1800 +    total_c_args += 1;
  1.1801 +    if (method->is_static()) {
  1.1802 +      total_c_args++;
  1.1803 +    }
  1.1804 +  } else {
  1.1805 +    for (int i = 0; i < total_in_args; i++) {
  1.1806 +      if (in_sig_bt[i] == T_ARRAY) {
  1.1807 +        total_c_args++;
  1.1808 +      }
  1.1809 +    }
  1.1810 +  }
  1.1811 +
  1.1812 +  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  1.1813 +  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
  1.1814 +  BasicType* in_elem_bt = NULL;
  1.1815 +
  1.1816 +  int argc = 0;
  1.1817 +  if (!is_critical_native) {
  1.1818 +    out_sig_bt[argc++] = T_ADDRESS;
  1.1819 +    if (method->is_static()) {
  1.1820 +      out_sig_bt[argc++] = T_OBJECT;
  1.1821 +    }
  1.1822 +
  1.1823 +    for (int i = 0; i < total_in_args ; i++ ) {
  1.1824 +      out_sig_bt[argc++] = in_sig_bt[i];
  1.1825 +    }
  1.1826 +  } else {
  1.1827 +    Thread* THREAD = Thread::current();
  1.1828 +    in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
  1.1829 +    SignatureStream ss(method->signature());
  1.1830 +    for (int i = 0; i < total_in_args ; i++ ) {
  1.1831 +      if (in_sig_bt[i] == T_ARRAY) {
  1.1832 +        // Arrays are passed as int, elem* pair
  1.1833 +        out_sig_bt[argc++] = T_INT;
  1.1834 +        out_sig_bt[argc++] = T_ADDRESS;
  1.1835 +        Symbol* atype = ss.as_symbol(CHECK_NULL);
  1.1836 +        const char* at = atype->as_C_string();
  1.1837 +        if (strlen(at) == 2) {
  1.1838 +          assert(at[0] == '[', "must be");
  1.1839 +          switch (at[1]) {
  1.1840 +            case 'B': in_elem_bt[i]  = T_BYTE; break;
  1.1841 +            case 'C': in_elem_bt[i]  = T_CHAR; break;
  1.1842 +            case 'D': in_elem_bt[i]  = T_DOUBLE; break;
  1.1843 +            case 'F': in_elem_bt[i]  = T_FLOAT; break;
  1.1844 +            case 'I': in_elem_bt[i]  = T_INT; break;
  1.1845 +            case 'J': in_elem_bt[i]  = T_LONG; break;
  1.1846 +            case 'S': in_elem_bt[i]  = T_SHORT; break;
  1.1847 +            case 'Z': in_elem_bt[i]  = T_BOOLEAN; break;
  1.1848 +            default: ShouldNotReachHere();
  1.1849 +          }
  1.1850 +        }
  1.1851 +      } else {
  1.1852 +        out_sig_bt[argc++] = in_sig_bt[i];
  1.1853 +        in_elem_bt[i] = T_VOID;
  1.1854 +      }
  1.1855 +      if (in_sig_bt[i] != T_VOID) {
  1.1856 +        assert(in_sig_bt[i] == ss.type(), "must match");
  1.1857 +        ss.next();
  1.1858 +      }
  1.1859 +    }
  1.1860 +  }
  1.1861 +
  1.1862 +  // Now figure out where the args must be stored and how much stack space
  1.1863 +  // they require.
  1.1864 +  int out_arg_slots;
  1.1865 +  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  1.1866 +
  1.1867 +  // Compute framesize for the wrapper.  We need to handlize all oops in
  1.1868 +  // incoming registers
  1.1869 +
  1.1870 +  // Calculate the total number of stack slots we will need.
  1.1871 +
  1.1872 +  // First count the abi requirement plus all of the outgoing args
  1.1873 +  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  1.1874 +
  1.1875 +  // Now the space for the inbound oop handle area
  1.1876 +  int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
  1.1877 +  if (is_critical_native) {
  1.1878 +    // Critical natives may have to call out so they need a save area
  1.1879 +    // for register arguments.
  1.1880 +    int double_slots = 0;
  1.1881 +    int single_slots = 0;
  1.1882 +    for ( int i = 0; i < total_in_args; i++) {
  1.1883 +      if (in_regs[i].first()->is_Register()) {
  1.1884 +        const Register reg = in_regs[i].first()->as_Register();
  1.1885 +        switch (in_sig_bt[i]) {
  1.1886 +          case T_BOOLEAN:
  1.1887 +          case T_BYTE:
  1.1888 +          case T_SHORT:
  1.1889 +          case T_CHAR:
  1.1890 +          case T_INT:  single_slots++; break;
  1.1891 +          case T_ARRAY:  // specific to LP64 (7145024)
  1.1892 +          case T_LONG: double_slots++; break;
  1.1893 +          default:  ShouldNotReachHere();
  1.1894 +        }
  1.1895 +      } else if (in_regs[i].first()->is_XMMRegister()) {
  1.1896 +        switch (in_sig_bt[i]) {
  1.1897 +          case T_FLOAT:  single_slots++; break;
  1.1898 +          case T_DOUBLE: double_slots++; break;
  1.1899 +          default:  ShouldNotReachHere();
  1.1900 +        }
  1.1901 +      } else if (in_regs[i].first()->is_FloatRegister()) {
  1.1902 +        ShouldNotReachHere();
  1.1903 +      }
  1.1904 +    }
  1.1905 +    total_save_slots = double_slots * 2 + single_slots;
  1.1906 +    // align the save area
  1.1907 +    if (double_slots != 0) {
  1.1908 +      stack_slots = round_to(stack_slots, 2);
  1.1909 +    }
  1.1910 +  }
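  // Worked example (illustrative): a critical native taking (int, long, float,
  // double) entirely in registers yields single_slots = 2 and double_slots = 2,
  // so total_save_slots = 2*2 + 2 = 6 32-bit slots, and stack_slots is rounded
  // to an even value so the 8-byte saves stay aligned.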
  1.1911 +
  1.1912 +  int oop_handle_offset = stack_slots;
  1.1913 +  stack_slots += total_save_slots;
  1.1914 +
  1.1915 +  // Now any space we need for handlizing a klass if static method
  1.1916 +
  1.1917 +  int klass_slot_offset = 0;
  1.1918 +  int klass_offset = -1;
  1.1919 +  int lock_slot_offset = 0;
  1.1920 +  bool is_static = false;
  1.1921 +
  1.1922 +  if (method->is_static()) {
  1.1923 +    klass_slot_offset = stack_slots;
  1.1924 +    stack_slots += VMRegImpl::slots_per_word;
  1.1925 +    klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
  1.1926 +    is_static = true;
  1.1927 +  }
  1.1928 +
  1.1929 +  // Plus a lock if needed
  1.1930 +
  1.1931 +  if (method->is_synchronized()) {
  1.1932 +    lock_slot_offset = stack_slots;
  1.1933 +    stack_slots += VMRegImpl::slots_per_word;
  1.1934 +  }
  1.1935 +
  1.1936 +  // Now a place (+2 slots) to save return values or temps during shuffling
  1.1937 +  // + 4 slots for the return address (which we own) and saved rbp
  1.1938 +  stack_slots += 6;
  1.1939 +
  1.1940 +  // Ok The space we have allocated will look like:
  1.1941 +  //
  1.1942 +  //
  1.1943 +  // FP-> |                     |
  1.1944 +  //      |---------------------|
  1.1945 +  //      | 2 slots for moves   |
  1.1946 +  //      |---------------------|
  1.1947 +  //      | lock box (if sync)  |
  1.1948 +  //      |---------------------| <- lock_slot_offset
  1.1949 +  //      | klass (if static)   |
  1.1950 +  //      |---------------------| <- klass_slot_offset
  1.1951 +  //      | oopHandle area      |
  1.1952 +  //      |---------------------| <- oop_handle_offset (6 java arg registers)
  1.1953 +  //      | outbound memory     |
  1.1954 +  //      | based arguments     |
  1.1955 +  //      |                     |
  1.1956 +  //      |---------------------|
  1.1957 +  //      |                     |
  1.1958 +  // SP-> | out_preserved_slots |
  1.1959 +  //
  1.1960 +  //
  1.1961 +
  1.1962 +
  1.1963 +  // Now compute actual number of stack words we need rounding to make
  1.1964 +  // stack properly aligned.
  1.1965 +  stack_slots = round_to(stack_slots, StackAlignmentInSlots);
  1.1966 +
  1.1967 +  int stack_size = stack_slots * VMRegImpl::stack_slot_size;
  1.1968 +
  1.1969 +  // First thing make an ic check to see if we should even be here
  1.1970 +
  1.1971 +  // We are free to use all registers as temps without saving them and
  1.1972 +  // restoring them except rbp. rbp is the only callee save register
  1.1973 +  // as far as the interpreter and the compiler(s) are concerned.
  1.1974 +
  1.1975 +
  1.1976 +  const Register ic_reg = rax;
  1.1977 +  const Register receiver = j_rarg0;
  1.1978 +
  1.1979 +  Label hit;
  1.1980 +  Label exception_pending;
  1.1981 +
  1.1982 +  assert_different_registers(ic_reg, receiver, rscratch1);
  1.1983 +  __ verify_oop(receiver);
  1.1984 +  __ load_klass(rscratch1, receiver);
  1.1985 +  __ cmpq(ic_reg, rscratch1);
  1.1986 +  __ jcc(Assembler::equal, hit);
  1.1987 +
  1.1988 +  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  1.1989 +
  1.1990 +  // Verified entry point must be aligned
  1.1991 +  __ align(8);
  1.1992 +
  1.1993 +  __ bind(hit);
  1.1994 +
  1.1995 +  int vep_offset = ((intptr_t)__ pc()) - start;
  1.1996 +
  1.1997 +  // The instruction at the verified entry point must be 5 bytes or longer
  1.1998 +  // because it can be patched on the fly by make_non_entrant. The stack bang
  1.1999 +  // instruction fits that requirement.
  1.2000 +
  1.2001 +  // Generate stack overflow check
  1.2002 +
  1.2003 +  if (UseStackBanging) {
  1.2004 +    __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
  1.2005 +  } else {
  1.2006 +    // need a 5 byte instruction to allow MT safe patching to non-entrant
  1.2007 +    __ fat_nop();
  1.2008 +  }
  1.2009 +
  1.2010 +  // Generate a new frame for the wrapper.
  1.2011 +  __ enter();
  1.2012 +  // -2 because return address is already present and so is saved rbp
  1.2013 +  __ subptr(rsp, stack_size - 2*wordSize);
  1.2014 +
  1.2015 +  // Frame is now completed as far as size and linkage.
  1.2016 +  int frame_complete = ((intptr_t)__ pc()) - start;
  1.2017 +
  1.2018 +    if (UseRTMLocking) {
  1.2019 +      // Abort RTM transaction before calling JNI
  1.2020 +      // because critical section will be large and will be
  1.2021 +      // aborted anyway. Also nmethod could be deoptimized.
  1.2022 +      __ xabort(0);
  1.2023 +    }
  1.2024 +
  1.2025 +#ifdef ASSERT
  1.2026 +    {
  1.2027 +      Label L;
  1.2028 +      __ mov(rax, rsp);
  1.2029 +      __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
  1.2030 +      __ cmpptr(rax, rsp);
  1.2031 +      __ jcc(Assembler::equal, L);
  1.2032 +      __ stop("improperly aligned stack");
  1.2033 +      __ bind(L);
  1.2034 +    }
  1.2035 +#endif /* ASSERT */
  1.2036 +
  1.2037 +
  1.2038 +  // We use r14 as the oop handle for the receiver/klass
  1.2039 +  // It is callee save so it survives the call to native
  1.2040 +
  1.2041 +  const Register oop_handle_reg = r14;
  1.2042 +
  1.2043 +  if (is_critical_native) {
  1.2044 +    check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
  1.2045 +                                       oop_handle_offset, oop_maps, in_regs, in_sig_bt);
  1.2046 +  }
  1.2047 +
  1.2048 +  //
  1.2049 +  // We immediately shuffle the arguments so that any vm call we have to
  1.2050 +  // make from here on out (sync slow path, jvmti, etc.) we will have
  1.2051 +  // captured the oops from our caller and have a valid oopMap for
  1.2052 +  // them.
  1.2053 +
  1.2054 +  // -----------------
  1.2055 +  // The Grand Shuffle
  1.2056 +
  1.2057 +  // The Java calling convention is either equal (linux) or denser (win64) than the
  1.2058 +  // c calling convention. However, because of the jni_env argument, the c calling
  1.2059 +  // convention always has at least one more (and two for static) arguments than Java.
  1.2060 +  // Therefore if we move the args from java -> c backwards then we will never have
  1.2061 +  // a register->register conflict and we don't have to build a dependency graph
  1.2062 +  // and figure out how to break any cycles.
  1.2063 +  //
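  // Illustrative mapping (linux-x86_64 assumed): for a non-static native
  // taking (int, long) the Java args sit in j_rarg0..j_rarg2 = rsi, rdx, rcx
  // and the C args land in c_rarg1..c_rarg3 = rsi, rdx, rcx, with c_rarg0
  // (rdi) taking the JNIEnv*, so every move is a register-to-register nop.
  // On win64 the registers differ, but walking the args from last to first
  // still means no destination is read later as a source.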
  1.2064 +
  1.2065 +  // Record esp-based slot for receiver on stack for non-static methods
  1.2066 +  int receiver_offset = -1;
  1.2067 +
  1.2068 +  // This is a trick. We double the stack slots so we can claim
  1.2069 +  // the oops in the caller's frame. Since we are sure to have
  1.2070 +  // more args than the caller doubling is enough to make
  1.2071 +  // sure we can capture all the incoming oop args from the
  1.2072 +  // caller.
  1.2073 +  //
  1.2074 +  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
  1.2075 +
  1.2076 +  // Mark location of rbp (someday)
  1.2077 +  // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
  1.2078 +
  1.2079 +  // Use eax, ebx as temporaries during any memory-memory moves we have to do
  1.2080 +  // All inbound args are referenced based on rbp and all outbound args via rsp.
  1.2081 +
  1.2082 +
  1.2083 +#ifdef ASSERT
  1.2084 +  bool reg_destroyed[RegisterImpl::number_of_registers];
  1.2085 +  bool freg_destroyed[XMMRegisterImpl::number_of_registers];
  1.2086 +  for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
  1.2087 +    reg_destroyed[r] = false;
  1.2088 +  }
  1.2089 +  for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
  1.2090 +    freg_destroyed[f] = false;
  1.2091 +  }
  1.2092 +
  1.2093 +#endif /* ASSERT */
  1.2094 +
  1.2095 +  // This may iterate in two different directions depending on the
  1.2096 +  // kind of native it is.  The reason is that for regular JNI natives
  1.2097 +  // the incoming and outgoing registers are offset upwards and for
  1.2098 +  // critical natives they are offset down.
  1.2099 +  GrowableArray<int> arg_order(2 * total_in_args);
  1.2100 +  VMRegPair tmp_vmreg;
  1.2101 +  tmp_vmreg.set1(rbx->as_VMReg());
  1.2102 +
  1.2103 +  if (!is_critical_native) {
  1.2104 +    for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
  1.2105 +      arg_order.push(i);
  1.2106 +      arg_order.push(c_arg);
  1.2107 +    }
  1.2108 +  } else {
  1.2109 +    // Compute a valid move order, using tmp_vmreg to break any cycles
  1.2110 +    ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
  1.2111 +  }
  1.2112 +
  1.2113 +  int temploc = -1;
  1.2114 +  for (int ai = 0; ai < arg_order.length(); ai += 2) {
  1.2115 +    int i = arg_order.at(ai);
  1.2116 +    int c_arg = arg_order.at(ai + 1);
  1.2117 +    __ block_comment(err_msg("move %d -> %d", i, c_arg));
  1.2118 +    if (c_arg == -1) {
  1.2119 +      assert(is_critical_native, "should only be required for critical natives");
  1.2120 +      // This arg needs to be moved to a temporary
  1.2121 +      __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
  1.2122 +      in_regs[i] = tmp_vmreg;
  1.2123 +      temploc = i;
  1.2124 +      continue;
  1.2125 +    } else if (i == -1) {
  1.2126 +      assert(is_critical_native, "should only be required for critical natives");
  1.2127 +      // Read from the temporary location
  1.2128 +      assert(temploc != -1, "must be valid");
  1.2129 +      i = temploc;
  1.2130 +      temploc = -1;
  1.2131 +    }
  1.2132 +#ifdef ASSERT
  1.2133 +    if (in_regs[i].first()->is_Register()) {
  1.2134 +      assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
  1.2135 +    } else if (in_regs[i].first()->is_XMMRegister()) {
  1.2136 +      assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
  1.2137 +    }
  1.2138 +    if (out_regs[c_arg].first()->is_Register()) {
  1.2139 +      reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1.2140 +    } else if (out_regs[c_arg].first()->is_XMMRegister()) {
  1.2141 +      freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
  1.2142 +    }
  1.2143 +#endif /* ASSERT */
  1.2144 +    switch (in_sig_bt[i]) {
  1.2145 +      case T_ARRAY:
  1.2146 +        if (is_critical_native) {
  1.2147 +          unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
  1.2148 +          c_arg++;
  1.2149 +#ifdef ASSERT
  1.2150 +          if (out_regs[c_arg].first()->is_Register()) {
  1.2151 +            reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1.2152 +          } else if (out_regs[c_arg].first()->is_XMMRegister()) {
  1.2153 +            freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
  1.2154 +          }
  1.2155 +#endif
  1.2156 +          break;
  1.2157 +        }
  1.2158 +      case T_OBJECT:
  1.2159 +        assert(!is_critical_native, "no oop arguments");
  1.2160 +        object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
  1.2161 +                    ((i == 0) && (!is_static)),
  1.2162 +                    &receiver_offset);
  1.2163 +        break;
  1.2164 +      case T_VOID:
  1.2165 +        break;
  1.2166 +
  1.2167 +      case T_FLOAT:
  1.2168 +        float_move(masm, in_regs[i], out_regs[c_arg]);
  1.2169 +        break;
  1.2170 +
  1.2171 +      case T_DOUBLE:
  1.2172 +        assert( i + 1 < total_in_args &&
  1.2173 +                in_sig_bt[i + 1] == T_VOID &&
  1.2174 +                out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
  1.2175 +        double_move(masm, in_regs[i], out_regs[c_arg]);
  1.2176 +        break;
  1.2177 +
  1.2178 +      case T_LONG :
  1.2179 +        long_move(masm, in_regs[i], out_regs[c_arg]);
  1.2180 +        break;
  1.2181 +
  1.2182 +      case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  1.2183 +
  1.2184 +      default:
  1.2185 +        move32_64(masm, in_regs[i], out_regs[c_arg]);
  1.2186 +    }
  1.2187 +  }
  1.2188 +
  1.2189 +  int c_arg;
  1.2190 +
  1.2191 +  // Pre-load a static method's oop into r14.  Used both by locking code and
  1.2192 +  // the normal JNI call code.
  1.2193 +  if (!is_critical_native) {
  1.2194 +    // point c_arg at the first arg that is already loaded in case we
  1.2195 +    // need to spill before we call out
  1.2196 +    c_arg = total_c_args - total_in_args;
  1.2197 +
  1.2198 +    if (method->is_static()) {
  1.2199 +
  1.2200 +      //  load oop into a register
  1.2201 +      __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
  1.2202 +
  1.2203 +      // Now handlize the static class mirror; it's known not-null.
  1.2204 +      __ movptr(Address(rsp, klass_offset), oop_handle_reg);
  1.2205 +      map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
  1.2206 +
  1.2207 +      // Now get the handle
  1.2208 +      __ lea(oop_handle_reg, Address(rsp, klass_offset));
  1.2209 +      // store the klass handle as second argument
  1.2210 +      __ movptr(c_rarg1, oop_handle_reg);
  1.2211 +      // and protect the arg if we must spill
  1.2212 +      c_arg--;
  1.2213 +    }
  1.2214 +  } else {
  1.2215 +    // For JNI critical methods we need to save all registers in save_args.
  1.2216 +    c_arg = 0;
  1.2217 +  }
  1.2218 +
  1.2219 +  // Change state to native (we save the return address in the thread, since it might not
  1.2220 +  // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  1.2221 +  // points into the right code segment. It does not have to be the correct return pc.
  1.2222 +  // We use the same pc/oopMap repeatedly when we call out
  1.2223 +
  1.2224 +  intptr_t the_pc = (intptr_t) __ pc();
  1.2225 +  oop_maps->add_gc_map(the_pc - start, map);
  1.2226 +
  1.2227 +  __ set_last_Java_frame(rsp, noreg, (address)the_pc);
  1.2228 +
  1.2229 +
  1.2230 +  // We have all of the arguments set up at this point. We must not touch any register
  1.2231 +  // argument registers at this point (if we were to save/restore them, there would be no oopMap entries for them).
  1.2232 +
  1.2233 +  {
  1.2234 +    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
  1.2235 +    // protect the args we've loaded
  1.2236 +    save_args(masm, total_c_args, c_arg, out_regs);
  1.2237 +    __ mov_metadata(c_rarg1, method());
  1.2238 +    __ call_VM_leaf(
  1.2239 +      CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
  1.2240 +      r15_thread, c_rarg1);
  1.2241 +    restore_args(masm, total_c_args, c_arg, out_regs);
  1.2242 +  }
  1.2243 +
  1.2244 +  // RedefineClasses() tracing support for obsolete method entry
  1.2245 +  if (RC_TRACE_IN_RANGE(0x00001000, 0x00002000)) {
  1.2246 +    // protect the args we've loaded
  1.2247 +    save_args(masm, total_c_args, c_arg, out_regs);
  1.2248 +    __ mov_metadata(c_rarg1, method());
  1.2249 +    __ call_VM_leaf(
  1.2250 +      CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
  1.2251 +      r15_thread, c_rarg1);
  1.2252 +    restore_args(masm, total_c_args, c_arg, out_regs);
  1.2253 +  }
  1.2254 +
  1.2255 +  // Lock a synchronized method
  1.2256 +
  1.2257 +  // Register definitions used by locking and unlocking
  1.2258 +
  1.2259 +  const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
  1.2260 +  const Register obj_reg  = rbx;  // Will contain the oop
  1.2261 +  const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
  1.2262 +  const Register old_hdr  = r13;  // value of old header at unlock time
  1.2263 +
  1.2264 +  Label slow_path_lock;
  1.2265 +  Label lock_done;
  1.2266 +
  1.2267 +  if (method->is_synchronized()) {
  1.2268 +    assert(!is_critical_native, "unhandled");
  1.2269 +
  1.2270 +
  1.2271 +    const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
  1.2272 +
  1.2273 +    // Get the handle (the 2nd argument)
  1.2274 +    __ mov(oop_handle_reg, c_rarg1);
  1.2275 +
  1.2276 +    // Get address of the box
  1.2277 +
  1.2278 +    __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
  1.2279 +
  1.2280 +    // Load the oop from the handle
  1.2281 +    __ movptr(obj_reg, Address(oop_handle_reg, 0));
  1.2282 +
  1.2283 +    if (UseBiasedLocking) {
  1.2284 +      __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch1, false, lock_done, &slow_path_lock);
  1.2285 +    }
  1.2286 +
  1.2287 +    // Load immediate 1 into swap_reg %rax
  1.2288 +    __ movl(swap_reg, 1);
  1.2289 +
  1.2290 +    // Load (object->mark() | 1) into swap_reg %rax
  1.2291 +    __ orptr(swap_reg, Address(obj_reg, 0));
  1.2292 +
  1.2293 +    // Save (object->mark() | 1) into BasicLock's displaced header
  1.2294 +    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
  1.2295 +
  1.2296 +    if (os::is_MP()) {
  1.2297 +      __ lock();
  1.2298 +    }
  1.2299 +
  1.2300 +    // src -> dest iff dest == rax else rax <- dest
  1.2301 +    __ cmpxchgptr(lock_reg, Address(obj_reg, 0));
  1.2302 +    __ jcc(Assembler::equal, lock_done);
  1.2303 +
  1.2304 +    // Hmm should this move to the slow path code area???
  1.2305 +
  1.2306 +    // Test if the oopMark is an obvious stack pointer, i.e.,
  1.2307 +    //  1) (mark & 3) == 0, and
  1.2308 +    //  2) rsp <= mark < rsp + os::pagesize()
  1.2309 +    // These 3 tests can be done by evaluating the following
  1.2310 +    // expression: ((mark - rsp) & (3 - os::vm_page_size())),
  1.2311 +    // assuming both stack pointer and pagesize have their
  1.2312 +    // least significant 2 bits clear.
  1.2313 +    // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
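    // Worked example (illustrative, assuming a 4K page): 3 - 4096 masks off
    // everything except bits 0-1 and bits 12 and up, so the result below is
    // zero exactly when (mark - rsp) has its low two bits clear and
    // 0 <= mark - rsp < 4096, i.e. the mark points into our own stack page:
    // the recursive (already locked by this thread) case.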
  1.2314 +
  1.2315 +    __ subptr(swap_reg, rsp);
  1.2316 +    __ andptr(swap_reg, 3 - os::vm_page_size());
  1.2317 +
  1.2318 +    // Save the test result, for recursive case, the result is zero
  1.2319 +    __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
  1.2320 +    __ jcc(Assembler::notEqual, slow_path_lock);
  1.2321 +
  1.2322 +    // Slow path will re-enter here
  1.2323 +
  1.2324 +    __ bind(lock_done);
  1.2325 +  }
  1.2326 +
  1.2327 +
  1.2328 +  // Finally just about ready to make the JNI call
  1.2329 +
  1.2330 +
  1.2331 +  // get JNIEnv* which is first argument to native
  1.2332 +  if (!is_critical_native) {
  1.2333 +    __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
  1.2334 +  }
  1.2335 +
  1.2336 +  // Now set thread in native
  1.2337 +  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
  1.2338 +
  1.2339 +  __ call(RuntimeAddress(native_func));
  1.2340 +
  1.2341 +  // Verify or restore cpu control state after JNI call
  1.2342 +  __ restore_cpu_control_state_after_jni();
  1.2343 +
  1.2344 +  // Unpack native results.
  1.2345 +  switch (ret_type) {
  1.2346 +  case T_BOOLEAN: __ c2bool(rax);            break;
  1.2347 +  case T_CHAR   : __ movzwl(rax, rax);      break;
  1.2348 +  case T_BYTE   : __ sign_extend_byte (rax); break;
  1.2349 +  case T_SHORT  : __ sign_extend_short(rax); break;
  1.2350 +  case T_INT    : /* nothing to do */        break;
  1.2351 +  case T_DOUBLE :
  1.2352 +  case T_FLOAT  :
  1.2353 +    // Result is in xmm0 we'll save as needed
  1.2354 +    break;
  1.2355 +  case T_ARRAY:                 // Really a handle
  1.2356 +  case T_OBJECT:                // Really a handle
  1.2357 +      break; // can't de-handlize until after safepoint check
  1.2358 +  case T_VOID: break;
  1.2359 +  case T_LONG: break;
  1.2360 +  default       : ShouldNotReachHere();
  1.2361 +  }
  1.2362 +
  1.2363 +  // Switch thread to "native transition" state before reading the synchronization state.
  1.2364 +  // This additional state is necessary because reading and testing the synchronization
  1.2365 +  // state is not atomic w.r.t. GC, as this scenario demonstrates:
  1.2366 +  //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  1.2367 +  //     VM thread changes sync state to synchronizing and suspends threads for GC.
  1.2368 +  //     Thread A is resumed to finish this native method, but doesn't block here since it
   1.2369 +  //     didn't see any synchronization in progress, and escapes.
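          +  // In _thread_in_native_trans the safepoint/suspend check below (made after
          +  // the fence or serialization-page write) is guaranteed to observe the VM
          +  // thread's state change, so the thread calls
          +  // check_special_condition_for_native_trans() and blocks there instead of
          +  // escaping.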
  1.2370 +  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
  1.2371 +
   1.2372 +  if (os::is_MP()) {
  1.2373 +    if (UseMembar) {
  1.2374 +      // Force this write out before the read below
  1.2375 +      __ membar(Assembler::Membar_mask_bits(
  1.2376 +           Assembler::LoadLoad | Assembler::LoadStore |
  1.2377 +           Assembler::StoreLoad | Assembler::StoreStore));
  1.2378 +    } else {
  1.2379 +      // Write serialization page so VM thread can do a pseudo remote membar.
  1.2380 +      // We use the current thread pointer to calculate a thread specific
  1.2381 +      // offset to write to within the page. This minimizes bus traffic
  1.2382 +      // due to cache line collision.
  1.2383 +      __ serialize_memory(r15_thread, rcx);
  1.2384 +    }
  1.2385 +  }
  1.2386 +
  1.2387 +  Label after_transition;
  1.2388 +
  1.2389 +  // check for safepoint operation in progress and/or pending suspend requests
  1.2390 +  {
  1.2391 +    Label Continue;
  1.2392 +
  1.2393 +    __ cmp32(ExternalAddress((address)SafepointSynchronize::address_of_state()),
  1.2394 +             SafepointSynchronize::_not_synchronized);
  1.2395 +
  1.2396 +    Label L;
  1.2397 +    __ jcc(Assembler::notEqual, L);
  1.2398 +    __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
  1.2399 +    __ jcc(Assembler::equal, Continue);
  1.2400 +    __ bind(L);
  1.2401 +
  1.2402 +    // Don't use call_VM as it will see a possible pending exception and forward it
  1.2403 +    // and never return here preventing us from clearing _last_native_pc down below.
  1.2404 +    // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
  1.2405 +    // preserved and correspond to the bcp/locals pointers. So we do a runtime call
  1.2406 +    // by hand.
  1.2407 +    //
  1.2408 +    save_native_result(masm, ret_type, stack_slots);
  1.2409 +    __ mov(c_rarg0, r15_thread);
  1.2410 +    __ mov(r12, rsp); // remember sp
  1.2411 +    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  1.2412 +    __ andptr(rsp, -16); // align stack as required by ABI
  1.2413 +    if (!is_critical_native) {
  1.2414 +      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
  1.2415 +    } else {
  1.2416 +      __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
  1.2417 +    }
  1.2418 +    __ mov(rsp, r12); // restore sp
  1.2419 +    __ reinit_heapbase();
  1.2420 +    // Restore any method result value
  1.2421 +    restore_native_result(masm, ret_type, stack_slots);
  1.2422 +
  1.2423 +    if (is_critical_native) {
  1.2424 +      // The call above performed the transition to thread_in_Java so
  1.2425 +      // skip the transition logic below.
  1.2426 +      __ jmpb(after_transition);
  1.2427 +    }
  1.2428 +
  1.2429 +    __ bind(Continue);
  1.2430 +  }
  1.2431 +
  1.2432 +  // change thread state
  1.2433 +  __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
  1.2434 +  __ bind(after_transition);
  1.2435 +
  1.2436 +  Label reguard;
  1.2437 +  Label reguard_done;
  1.2438 +  __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), JavaThread::stack_guard_yellow_disabled);
  1.2439 +  __ jcc(Assembler::equal, reguard);
  1.2440 +  __ bind(reguard_done);
  1.2441 +
  1.2442 +  // native result if any is live
  1.2443 +
  1.2444 +  // Unlock
  1.2445 +  Label unlock_done;
  1.2446 +  Label slow_path_unlock;
  1.2447 +  if (method->is_synchronized()) {
  1.2448 +
  1.2449 +    // Get locked oop from the handle we passed to jni
  1.2450 +    __ movptr(obj_reg, Address(oop_handle_reg, 0));
  1.2451 +
  1.2452 +    Label done;
  1.2453 +
  1.2454 +    if (UseBiasedLocking) {
  1.2455 +      __ biased_locking_exit(obj_reg, old_hdr, done);
  1.2456 +    }
  1.2457 +
  1.2458 +    // Simple recursive lock?
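          +    // (The locking fast path above stored a zero displaced header for the
          +    // recursive case, so a NULL slot here means there is nothing to undo.)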
  1.2459 +
  1.2460 +    __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
  1.2461 +    __ jcc(Assembler::equal, done);
  1.2462 +
   1.2463 +    // Must save rax if it is live now because cmpxchg must use it
  1.2464 +    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  1.2465 +      save_native_result(masm, ret_type, stack_slots);
  1.2466 +    }
  1.2467 +
  1.2468 +
  1.2469 +    // get address of the stack lock
  1.2470 +    __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
  1.2471 +    //  get old displaced header
  1.2472 +    __ movptr(old_hdr, Address(rax, 0));
  1.2473 +
  1.2474 +    // Atomic swap old header if oop still contains the stack lock
  1.2475 +    if (os::is_MP()) {
  1.2476 +      __ lock();
  1.2477 +    }
  1.2478 +    __ cmpxchgptr(old_hdr, Address(obj_reg, 0));
  1.2479 +    __ jcc(Assembler::notEqual, slow_path_unlock);
  1.2480 +
  1.2481 +    // slow path re-enters here
  1.2482 +    __ bind(unlock_done);
  1.2483 +    if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  1.2484 +      restore_native_result(masm, ret_type, stack_slots);
  1.2485 +    }
  1.2486 +
  1.2487 +    __ bind(done);
  1.2488 +
  1.2489 +  }
  1.2490 +  {
  1.2491 +    SkipIfEqual skip(masm, &DTraceMethodProbes, false);
  1.2492 +    save_native_result(masm, ret_type, stack_slots);
  1.2493 +    __ mov_metadata(c_rarg1, method());
  1.2494 +    __ call_VM_leaf(
  1.2495 +         CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
  1.2496 +         r15_thread, c_rarg1);
  1.2497 +    restore_native_result(masm, ret_type, stack_slots);
  1.2498 +  }
  1.2499 +
  1.2500 +  __ reset_last_Java_frame(false, true);
  1.2501 +
  1.2502 +  // Unpack oop result
  1.2503 +  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
  1.2504 +      Label L;
  1.2505 +      __ testptr(rax, rax);
  1.2506 +      __ jcc(Assembler::zero, L);
  1.2507 +      __ movptr(rax, Address(rax, 0));
  1.2508 +      __ bind(L);
  1.2509 +      __ verify_oop(rax);
  1.2510 +  }
  1.2511 +
  1.2512 +  if (!is_critical_native) {
  1.2513 +    // reset handle block
  1.2514 +    __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
  1.2515 +    __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
  1.2516 +  }
  1.2517 +
  1.2518 +  // pop our frame
  1.2519 +
  1.2520 +  __ leave();
  1.2521 +
  1.2522 +  if (!is_critical_native) {
  1.2523 +    // Any exception pending?
  1.2524 +    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
  1.2525 +    __ jcc(Assembler::notEqual, exception_pending);
  1.2526 +  }
  1.2527 +
  1.2528 +  // Return
  1.2529 +
  1.2530 +  __ ret(0);
  1.2531 +
  1.2532 +  // Unexpected paths are out of line and go here
  1.2533 +
  1.2534 +  if (!is_critical_native) {
  1.2535 +    // forward the exception
  1.2536 +    __ bind(exception_pending);
  1.2537 +
  1.2538 +    // and forward the exception
  1.2539 +    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  1.2540 +  }
  1.2541 +
  1.2542 +  // Slow path locking & unlocking
  1.2543 +  if (method->is_synchronized()) {
  1.2544 +
  1.2545 +    // BEGIN Slow path lock
  1.2546 +    __ bind(slow_path_lock);
  1.2547 +
  1.2548 +    // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
  1.2549 +    // args are (oop obj, BasicLock* lock, JavaThread* thread)
  1.2550 +
  1.2551 +    // protect the args we've loaded
  1.2552 +    save_args(masm, total_c_args, c_arg, out_regs);
  1.2553 +
  1.2554 +    __ mov(c_rarg0, obj_reg);
  1.2555 +    __ mov(c_rarg1, lock_reg);
  1.2556 +    __ mov(c_rarg2, r15_thread);
  1.2557 +
  1.2558 +    // Not a leaf but we have last_Java_frame setup as we want
  1.2559 +    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
  1.2560 +    restore_args(masm, total_c_args, c_arg, out_regs);
  1.2561 +
  1.2562 +#ifdef ASSERT
  1.2563 +    { Label L;
  1.2564 +    __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
  1.2565 +    __ jcc(Assembler::equal, L);
  1.2566 +    __ stop("no pending exception allowed on exit from monitorenter");
  1.2567 +    __ bind(L);
  1.2568 +    }
  1.2569 +#endif
  1.2570 +    __ jmp(lock_done);
  1.2571 +
  1.2572 +    // END Slow path lock
  1.2573 +
  1.2574 +    // BEGIN Slow path unlock
  1.2575 +    __ bind(slow_path_unlock);
  1.2576 +
  1.2577 +    // If we haven't already saved the native result we must save it now as xmm registers
  1.2578 +    // are still exposed.
  1.2579 +
  1.2580 +    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  1.2581 +      save_native_result(masm, ret_type, stack_slots);
  1.2582 +    }
  1.2583 +
  1.2584 +    __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
  1.2585 +
  1.2586 +    __ mov(c_rarg0, obj_reg);
  1.2587 +    __ mov(r12, rsp); // remember sp
  1.2588 +    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  1.2589 +    __ andptr(rsp, -16); // align stack as required by ABI
  1.2590 +
  1.2591 +    // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
  1.2592 +    // NOTE that obj_reg == rbx currently
  1.2593 +    __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
  1.2594 +    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
  1.2595 +
  1.2596 +    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
  1.2597 +    __ mov(rsp, r12); // restore sp
  1.2598 +    __ reinit_heapbase();
  1.2599 +#ifdef ASSERT
  1.2600 +    {
  1.2601 +      Label L;
  1.2602 +      __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
  1.2603 +      __ jcc(Assembler::equal, L);
  1.2604 +      __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
  1.2605 +      __ bind(L);
  1.2606 +    }
  1.2607 +#endif /* ASSERT */
  1.2608 +
  1.2609 +    __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
  1.2610 +
  1.2611 +    if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  1.2612 +      restore_native_result(masm, ret_type, stack_slots);
  1.2613 +    }
  1.2614 +    __ jmp(unlock_done);
  1.2615 +
  1.2616 +    // END Slow path unlock
  1.2617 +
  1.2618 +  } // synchronized
  1.2619 +
  1.2620 +  // SLOW PATH Reguard the stack if needed
  1.2621 +
  1.2622 +  __ bind(reguard);
  1.2623 +  save_native_result(masm, ret_type, stack_slots);
  1.2624 +  __ mov(r12, rsp); // remember sp
  1.2625 +  __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
  1.2626 +  __ andptr(rsp, -16); // align stack as required by ABI
  1.2627 +  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
  1.2628 +  __ mov(rsp, r12); // restore sp
  1.2629 +  __ reinit_heapbase();
  1.2630 +  restore_native_result(masm, ret_type, stack_slots);
  1.2631 +  // and continue
  1.2632 +  __ jmp(reguard_done);
  1.2633 +
  1.2634 +
  1.2635 +
  1.2636 +  __ flush();
  1.2637 +
  1.2638 +  nmethod *nm = nmethod::new_native_nmethod(method,
  1.2639 +                                            compile_id,
  1.2640 +                                            masm->code(),
  1.2641 +                                            vep_offset,
  1.2642 +                                            frame_complete,
  1.2643 +                                            stack_slots / VMRegImpl::slots_per_word,
  1.2644 +                                            (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
  1.2645 +                                            in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
  1.2646 +                                            oop_maps);
  1.2647 +
  1.2648 +  if (is_critical_native) {
  1.2649 +    nm->set_lazy_critical_native(true);
  1.2650 +  }
  1.2651 +
  1.2652 +  return nm;
  1.2653 +
  1.2654 +}
  1.2655 +
  1.2656 +#ifdef HAVE_DTRACE_H
  1.2657 +// ---------------------------------------------------------------------------
  1.2658 +// Generate a dtrace nmethod for a given signature.  The method takes arguments
  1.2659 +// in the Java compiled code convention, marshals them to the native
  1.2660 +// abi and then leaves nops at the position you would expect to call a native
  1.2661 +// function. When the probe is enabled the nops are replaced with a trap
   1.2662 +// instruction that dtrace inserts and the trap will cause a notification
  1.2663 +// to dtrace.
  1.2664 +//
  1.2665 +// The probes are only able to take primitive types and java/lang/String as
  1.2666 +// arguments.  No other java types are allowed. Strings are converted to utf8
   1.2667 +// strings so that from dtrace's point of view java strings are converted to C
   1.2668 +// strings. There is an arbitrary fixed limit on the total space that a method
   1.2669 +// can use for converting the strings (256 chars per string in the signature),
   1.2670 +// so any java string larger than this is truncated.
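          +//
          +// For example (illustrative only): a probe generated for
          +// void m(String s, int i) receives (utf8 char* for s, int i), while an
          +// argument of an unsupported object type is simply passed as NULL.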
  1.2671 +
  1.2672 +static int  fp_offset[ConcreteRegisterImpl::number_of_registers] = { 0 };
  1.2673 +static bool offsets_initialized = false;
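          +// fp_offset[] gives each C argument register a dedicated rbp-relative spill
          +// slot; register args are parked there while the utf8 conversion calls below
          +// clobber the argument registers.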
  1.2674 +
  1.2675 +
  1.2676 +nmethod *SharedRuntime::generate_dtrace_nmethod(MacroAssembler *masm,
  1.2677 +                                                methodHandle method) {
  1.2678 +
  1.2679 +
  1.2680 +  // generate_dtrace_nmethod is guarded by a mutex so we are sure to
  1.2681 +  // be single threaded in this method.
  1.2682 +  assert(AdapterHandlerLibrary_lock->owned_by_self(), "must be");
  1.2683 +
  1.2684 +  if (!offsets_initialized) {
  1.2685 +    fp_offset[c_rarg0->as_VMReg()->value()] = -1 * wordSize;
  1.2686 +    fp_offset[c_rarg1->as_VMReg()->value()] = -2 * wordSize;
  1.2687 +    fp_offset[c_rarg2->as_VMReg()->value()] = -3 * wordSize;
  1.2688 +    fp_offset[c_rarg3->as_VMReg()->value()] = -4 * wordSize;
  1.2689 +    fp_offset[c_rarg4->as_VMReg()->value()] = -5 * wordSize;
  1.2690 +    fp_offset[c_rarg5->as_VMReg()->value()] = -6 * wordSize;
  1.2691 +
  1.2692 +    fp_offset[c_farg0->as_VMReg()->value()] = -7 * wordSize;
  1.2693 +    fp_offset[c_farg1->as_VMReg()->value()] = -8 * wordSize;
  1.2694 +    fp_offset[c_farg2->as_VMReg()->value()] = -9 * wordSize;
  1.2695 +    fp_offset[c_farg3->as_VMReg()->value()] = -10 * wordSize;
  1.2696 +    fp_offset[c_farg4->as_VMReg()->value()] = -11 * wordSize;
  1.2697 +    fp_offset[c_farg5->as_VMReg()->value()] = -12 * wordSize;
  1.2698 +    fp_offset[c_farg6->as_VMReg()->value()] = -13 * wordSize;
  1.2699 +    fp_offset[c_farg7->as_VMReg()->value()] = -14 * wordSize;
  1.2700 +
  1.2701 +    offsets_initialized = true;
  1.2702 +  }
  1.2703 +  // Fill in the signature array, for the calling-convention call.
  1.2704 +  int total_args_passed = method->size_of_parameters();
  1.2705 +
  1.2706 +  BasicType* in_sig_bt  = NEW_RESOURCE_ARRAY(BasicType, total_args_passed);
  1.2707 +  VMRegPair  *in_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed);
  1.2708 +
   1.2709 +  // The signature we are going to use for the trap that dtrace will see:
   1.2710 +  // java/lang/String is converted, "this" is dropped, and any other object
   1.2711 +  // is converted to NULL.  (A one-slot java/lang/Long object reference
   1.2712 +  // is converted to a two-slot long, which is why we double the allocation).
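          +  //
          +  // Illustrative example: for java args (String, long, Integer) the outgoing
          +  // signature becomes T_ADDRESS, T_LONG, T_VOID, T_INT -- strings become utf8
          +  // pointers, longs/doubles get a T_VOID filler slot, and small boxes unbox
          +  // to T_INT.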
  1.2713 +  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_args_passed * 2);
  1.2714 +  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed * 2);
  1.2715 +
  1.2716 +  int i=0;
  1.2717 +  int total_strings = 0;
  1.2718 +  int first_arg_to_pass = 0;
  1.2719 +  int total_c_args = 0;
  1.2720 +
  1.2721 +  // Skip the receiver as dtrace doesn't want to see it
  1.2722 +  if( !method->is_static() ) {
  1.2723 +    in_sig_bt[i++] = T_OBJECT;
  1.2724 +    first_arg_to_pass = 1;
  1.2725 +  }
  1.2726 +
  1.2727 +  // We need to convert the java args to where a native (non-jni) function
  1.2728 +  // would expect them. To figure out where they go we convert the java
  1.2729 +  // signature to a C signature.
  1.2730 +
  1.2731 +  SignatureStream ss(method->signature());
  1.2732 +  for ( ; !ss.at_return_type(); ss.next()) {
  1.2733 +    BasicType bt = ss.type();
  1.2734 +    in_sig_bt[i++] = bt;  // Collect remaining bits of signature
  1.2735 +    out_sig_bt[total_c_args++] = bt;
  1.2736 +    if( bt == T_OBJECT) {
  1.2737 +      Symbol* s = ss.as_symbol_or_null();   // symbol is created
  1.2738 +      if (s == vmSymbols::java_lang_String()) {
  1.2739 +        total_strings++;
  1.2740 +        out_sig_bt[total_c_args-1] = T_ADDRESS;
  1.2741 +      } else if (s == vmSymbols::java_lang_Boolean() ||
  1.2742 +                 s == vmSymbols::java_lang_Character() ||
  1.2743 +                 s == vmSymbols::java_lang_Byte() ||
  1.2744 +                 s == vmSymbols::java_lang_Short() ||
  1.2745 +                 s == vmSymbols::java_lang_Integer() ||
  1.2746 +                 s == vmSymbols::java_lang_Float()) {
  1.2747 +        out_sig_bt[total_c_args-1] = T_INT;
  1.2748 +      } else if (s == vmSymbols::java_lang_Long() ||
  1.2749 +                 s == vmSymbols::java_lang_Double()) {
  1.2750 +        out_sig_bt[total_c_args-1] = T_LONG;
  1.2751 +        out_sig_bt[total_c_args++] = T_VOID;
  1.2752 +      }
  1.2753 +    } else if ( bt == T_LONG || bt == T_DOUBLE ) {
  1.2754 +      in_sig_bt[i++] = T_VOID;   // Longs & doubles take 2 Java slots
  1.2755 +      // We convert double to long
  1.2756 +      out_sig_bt[total_c_args-1] = T_LONG;
  1.2757 +      out_sig_bt[total_c_args++] = T_VOID;
  1.2758 +    } else if ( bt == T_FLOAT) {
  1.2759 +      // We convert float to int
  1.2760 +      out_sig_bt[total_c_args-1] = T_INT;
  1.2761 +    }
  1.2762 +  }
  1.2763 +
  1.2764 +  assert(i==total_args_passed, "validly parsed signature");
  1.2765 +
  1.2766 +  // Now get the compiled-Java layout as input arguments
  1.2767 +  int comp_args_on_stack;
  1.2768 +  comp_args_on_stack = SharedRuntime::java_calling_convention(
  1.2769 +      in_sig_bt, in_regs, total_args_passed, false);
  1.2770 +
   1.2771 +  // Now figure out where the args must be stored and how much stack space
   1.2772 +  // they require (neglecting out_preserve_stack_slots but including space for
   1.2773 +  // storing the 1st six register arguments). It's weird; see int_stk_helper.
  1.2774 +
  1.2775 +  int out_arg_slots;
  1.2776 +  out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  1.2777 +
  1.2778 +  // Calculate the total number of stack slots we will need.
  1.2779 +
  1.2780 +  // First count the abi requirement plus all of the outgoing args
  1.2781 +  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  1.2782 +
  1.2783 +  // Now space for the string(s) we must convert
  1.2784 +  int* string_locs   = NEW_RESOURCE_ARRAY(int, total_strings + 1);
  1.2785 +  for (i = 0; i < total_strings ; i++) {
  1.2786 +    string_locs[i] = stack_slots;
  1.2787 +    stack_slots += max_dtrace_string_size / VMRegImpl::stack_slot_size;
  1.2788 +  }
  1.2789 +
  1.2790 +  // Plus the temps we might need to juggle register args
  1.2791 +  // regs take two slots each
  1.2792 +  stack_slots += (Argument::n_int_register_parameters_c +
  1.2793 +                  Argument::n_float_register_parameters_c) * 2;
  1.2794 +
  1.2795 +
   1.2796 +  // + 4 for return address (which we own) and saved rbp
  1.2797 +
  1.2798 +  stack_slots += 4;
  1.2799 +
  1.2800 +  // Ok The space we have allocated will look like:
  1.2801 +  //
  1.2802 +  //
  1.2803 +  // FP-> |                     |
  1.2804 +  //      |---------------------|
  1.2805 +  //      | string[n]           |
  1.2806 +  //      |---------------------| <- string_locs[n]
  1.2807 +  //      | string[n-1]         |
  1.2808 +  //      |---------------------| <- string_locs[n-1]
  1.2809 +  //      | ...                 |
  1.2810 +  //      | ...                 |
  1.2811 +  //      |---------------------| <- string_locs[1]
  1.2812 +  //      | string[0]           |
  1.2813 +  //      |---------------------| <- string_locs[0]
  1.2814 +  //      | outbound memory     |
  1.2815 +  //      | based arguments     |
  1.2816 +  //      |                     |
  1.2817 +  //      |---------------------|
  1.2818 +  //      |                     |
  1.2819 +  // SP-> | out_preserved_slots |
  1.2820 +  //
  1.2821 +  //
  1.2822 +
   1.2823 +  // Now compute the actual number of stack words we need, rounding to keep
   1.2824 +  // the stack properly aligned.
  1.2825 +  stack_slots = round_to(stack_slots, 4 * VMRegImpl::slots_per_word);
  1.2826 +
  1.2827 +  int stack_size = stack_slots * VMRegImpl::stack_slot_size;
  1.2828 +
  1.2829 +  intptr_t start = (intptr_t)__ pc();
  1.2830 +
  1.2831 +  // First thing make an ic check to see if we should even be here
  1.2832 +
  1.2833 +  // We are free to use all registers as temps without saving them and
   1.2834 +  // restoring them except rbp. rbp is the only callee-save register
  1.2835 +  // as far as the interpreter and the compiler(s) are concerned.
  1.2836 +
  1.2837 +  const Register ic_reg = rax;
  1.2838 +  const Register receiver = rcx;
  1.2839 +  Label hit;
  1.2840 +  Label exception_pending;
  1.2841 +
  1.2842 +
  1.2843 +  __ verify_oop(receiver);
  1.2844 +  __ cmpl(ic_reg, Address(receiver, oopDesc::klass_offset_in_bytes()));
  1.2845 +  __ jcc(Assembler::equal, hit);
  1.2846 +
  1.2847 +  __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  1.2848 +
   1.2849 +  // The verified entry must be aligned for code patching,
   1.2850 +  // and the first 5 bytes must be in the same cache line.
   1.2851 +  // If we align at 8 then we will be sure 5 bytes are in the same line.
  1.2852 +  __ align(8);
  1.2853 +
  1.2854 +  __ bind(hit);
  1.2855 +
  1.2856 +  int vep_offset = ((intptr_t)__ pc()) - start;
  1.2857 +
  1.2858 +
  1.2859 +  // The instruction at the verified entry point must be 5 bytes or longer
  1.2860 +  // because it can be patched on the fly by make_non_entrant. The stack bang
  1.2861 +  // instruction fits that requirement.
  1.2862 +
  1.2863 +  // Generate stack overflow check
  1.2864 +
  1.2865 +  if (UseStackBanging) {
  1.2866 +    if (stack_size <= StackShadowPages*os::vm_page_size()) {
  1.2867 +      __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
  1.2868 +    } else {
  1.2869 +      __ movl(rax, stack_size);
  1.2870 +      __ bang_stack_size(rax, rbx);
  1.2871 +    }
  1.2872 +  } else {
  1.2873 +    // need a 5 byte instruction to allow MT safe patching to non-entrant
  1.2874 +    __ fat_nop();
  1.2875 +  }
  1.2876 +
  1.2877 +  assert(((uintptr_t)__ pc() - start - vep_offset) >= 5,
  1.2878 +         "valid size for make_non_entrant");
  1.2879 +
  1.2880 +  // Generate a new frame for the wrapper.
  1.2881 +  __ enter();
  1.2882 +
   1.2883 +  // - 2*wordSize because the return address and saved rbp are already present
  1.2884 +  if (stack_size - 2*wordSize != 0) {
  1.2885 +    __ subq(rsp, stack_size - 2*wordSize);
  1.2886 +  }
  1.2887 +
   1.2888 +  // Frame is now completed as far as size and linkage.
  1.2889 +
  1.2890 +  int frame_complete = ((intptr_t)__ pc()) - start;
  1.2891 +
  1.2892 +  int c_arg, j_arg;
  1.2893 +
  1.2894 +  // State of input register args
  1.2895 +
  1.2896 +  bool  live[ConcreteRegisterImpl::number_of_registers];
  1.2897 +
  1.2898 +  live[j_rarg0->as_VMReg()->value()] = false;
  1.2899 +  live[j_rarg1->as_VMReg()->value()] = false;
  1.2900 +  live[j_rarg2->as_VMReg()->value()] = false;
  1.2901 +  live[j_rarg3->as_VMReg()->value()] = false;
  1.2902 +  live[j_rarg4->as_VMReg()->value()] = false;
  1.2903 +  live[j_rarg5->as_VMReg()->value()] = false;
  1.2904 +
  1.2905 +  live[j_farg0->as_VMReg()->value()] = false;
  1.2906 +  live[j_farg1->as_VMReg()->value()] = false;
  1.2907 +  live[j_farg2->as_VMReg()->value()] = false;
  1.2908 +  live[j_farg3->as_VMReg()->value()] = false;
  1.2909 +  live[j_farg4->as_VMReg()->value()] = false;
  1.2910 +  live[j_farg5->as_VMReg()->value()] = false;
  1.2911 +  live[j_farg6->as_VMReg()->value()] = false;
  1.2912 +  live[j_farg7->as_VMReg()->value()] = false;
  1.2913 +
  1.2914 +
  1.2915 +  bool rax_is_zero = false;
  1.2916 +
  1.2917 +  // All args (except strings) destined for the stack are moved first
  1.2918 +  for (j_arg = first_arg_to_pass, c_arg = 0 ;
  1.2919 +       j_arg < total_args_passed ; j_arg++, c_arg++ ) {
  1.2920 +    VMRegPair src = in_regs[j_arg];
  1.2921 +    VMRegPair dst = out_regs[c_arg];
  1.2922 +
  1.2923 +    // Get the real reg value or a dummy (rsp)
  1.2924 +
  1.2925 +    int src_reg = src.first()->is_reg() ?
  1.2926 +                  src.first()->value() :
  1.2927 +                  rsp->as_VMReg()->value();
  1.2928 +
  1.2929 +    bool useless =  in_sig_bt[j_arg] == T_ARRAY ||
  1.2930 +                    (in_sig_bt[j_arg] == T_OBJECT &&
  1.2931 +                     out_sig_bt[c_arg] != T_INT &&
  1.2932 +                     out_sig_bt[c_arg] != T_ADDRESS &&
  1.2933 +                     out_sig_bt[c_arg] != T_LONG);
  1.2934 +
  1.2935 +    live[src_reg] = !useless;
  1.2936 +
  1.2937 +    if (dst.first()->is_stack()) {
  1.2938 +
   1.2939 +      // Even though a string arg in a register is still live after this loop,
   1.2940 +      // it will be dead after the string conversion loop (next), so we take
   1.2941 +      // advantage of that now for simpler code to manage liveness.
  1.2942 +
  1.2943 +      live[src_reg] = false;
  1.2944 +      switch (in_sig_bt[j_arg]) {
  1.2945 +
  1.2946 +        case T_ARRAY:
  1.2947 +        case T_OBJECT:
  1.2948 +          {
  1.2949 +            Address stack_dst(rsp, reg2offset_out(dst.first()));
  1.2950 +
  1.2951 +            if (out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
  1.2952 +              // need to unbox a one-word value
  1.2953 +              Register in_reg = rax;
  1.2954 +              if ( src.first()->is_reg() ) {
  1.2955 +                in_reg = src.first()->as_Register();
  1.2956 +              } else {
  1.2957 +                __ movq(rax, Address(rbp, reg2offset_in(src.first())));
  1.2958 +                rax_is_zero = false;
  1.2959 +              }
  1.2960 +              Label skipUnbox;
  1.2961 +              __ movptr(Address(rsp, reg2offset_out(dst.first())),
  1.2962 +                        (int32_t)NULL_WORD);
  1.2963 +              __ testq(in_reg, in_reg);
  1.2964 +              __ jcc(Assembler::zero, skipUnbox);
  1.2965 +
  1.2966 +              BasicType bt = out_sig_bt[c_arg];
  1.2967 +              int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
  1.2968 +              Address src1(in_reg, box_offset);
  1.2969 +              if ( bt == T_LONG ) {
  1.2970 +                __ movq(in_reg,  src1);
  1.2971 +                __ movq(stack_dst, in_reg);
  1.2972 +                assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
  1.2973 +                ++c_arg; // skip over T_VOID to keep the loop indices in sync
  1.2974 +              } else {
  1.2975 +                __ movl(in_reg,  src1);
  1.2976 +                __ movl(stack_dst, in_reg);
  1.2977 +              }
  1.2978 +
  1.2979 +              __ bind(skipUnbox);
  1.2980 +            } else if (out_sig_bt[c_arg] != T_ADDRESS) {
  1.2981 +              // Convert the arg to NULL
  1.2982 +              if (!rax_is_zero) {
  1.2983 +                __ xorq(rax, rax);
  1.2984 +                rax_is_zero = true;
  1.2985 +              }
  1.2986 +              __ movq(stack_dst, rax);
  1.2987 +            }
  1.2988 +          }
  1.2989 +          break;
  1.2990 +
  1.2991 +        case T_VOID:
  1.2992 +          break;
  1.2993 +
  1.2994 +        case T_FLOAT:
  1.2995 +          // This does the right thing since we know it is destined for the
  1.2996 +          // stack
  1.2997 +          float_move(masm, src, dst);
  1.2998 +          break;
  1.2999 +
  1.3000 +        case T_DOUBLE:
  1.3001 +          // This does the right thing since we know it is destined for the
  1.3002 +          // stack
  1.3003 +          double_move(masm, src, dst);
  1.3004 +          break;
  1.3005 +
  1.3006 +        case T_LONG :
  1.3007 +          long_move(masm, src, dst);
  1.3008 +          break;
  1.3009 +
  1.3010 +        case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  1.3011 +
  1.3012 +        default:
  1.3013 +          move32_64(masm, src, dst);
  1.3014 +      }
  1.3015 +    }
  1.3016 +
  1.3017 +  }
  1.3018 +
  1.3019 +  // If we have any strings we must store any register based arg to the stack
  1.3020 +  // This includes any still live xmm registers too.
  1.3021 +
  1.3022 +  int sid = 0;
  1.3023 +
  1.3024 +  if (total_strings > 0 ) {
  1.3025 +    for (j_arg = first_arg_to_pass, c_arg = 0 ;
  1.3026 +         j_arg < total_args_passed ; j_arg++, c_arg++ ) {
  1.3027 +      VMRegPair src = in_regs[j_arg];
  1.3028 +      VMRegPair dst = out_regs[c_arg];
  1.3029 +
  1.3030 +      if (src.first()->is_reg()) {
  1.3031 +        Address src_tmp(rbp, fp_offset[src.first()->value()]);
  1.3032 +
   1.3033 +        // String oops were left untouched by the previous loop even if the
   1.3034 +        // eventual (converted) arg is destined for the stack, so park them
   1.3035 +        // away now (except for the first).
  1.3036 +
  1.3037 +        if (out_sig_bt[c_arg] == T_ADDRESS) {
  1.3038 +          Address utf8_addr = Address(
  1.3039 +              rsp, string_locs[sid++] * VMRegImpl::stack_slot_size);
  1.3040 +          if (sid != 1) {
  1.3041 +            // The first string arg won't be killed until after the utf8
  1.3042 +            // conversion
  1.3043 +            __ movq(utf8_addr, src.first()->as_Register());
  1.3044 +          }
  1.3045 +        } else if (dst.first()->is_reg()) {
  1.3046 +          if (in_sig_bt[j_arg] == T_FLOAT || in_sig_bt[j_arg] == T_DOUBLE) {
  1.3047 +
  1.3048 +            // Convert the xmm register to an int and store it in the reserved
  1.3049 +            // location for the eventual c register arg
  1.3050 +            XMMRegister f = src.first()->as_XMMRegister();
  1.3051 +            if (in_sig_bt[j_arg] == T_FLOAT) {
  1.3052 +              __ movflt(src_tmp, f);
  1.3053 +            } else {
  1.3054 +              __ movdbl(src_tmp, f);
  1.3055 +            }
  1.3056 +          } else {
   1.3057 +            // If the arg is an oop type we don't support, don't bother to store
   1.3058 +            // it; remember, strings were handled above.
  1.3059 +            bool useless =  in_sig_bt[j_arg] == T_ARRAY ||
  1.3060 +                            (in_sig_bt[j_arg] == T_OBJECT &&
  1.3061 +                             out_sig_bt[c_arg] != T_INT &&
  1.3062 +                             out_sig_bt[c_arg] != T_LONG);
  1.3063 +
  1.3064 +            if (!useless) {
  1.3065 +              __ movq(src_tmp, src.first()->as_Register());
  1.3066 +            }
  1.3067 +          }
  1.3068 +        }
  1.3069 +      }
  1.3070 +      if (in_sig_bt[j_arg] == T_OBJECT && out_sig_bt[c_arg] == T_LONG) {
  1.3071 +        assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
  1.3072 +        ++c_arg; // skip over T_VOID to keep the loop indices in sync
  1.3073 +      }
  1.3074 +    }
  1.3075 +
  1.3076 +    // Now that the volatile registers are safe, convert all the strings
  1.3077 +    sid = 0;
  1.3078 +
  1.3079 +    for (j_arg = first_arg_to_pass, c_arg = 0 ;
  1.3080 +         j_arg < total_args_passed ; j_arg++, c_arg++ ) {
  1.3081 +      if (out_sig_bt[c_arg] == T_ADDRESS) {
  1.3082 +        // It's a string
  1.3083 +        Address utf8_addr = Address(
  1.3084 +            rsp, string_locs[sid++] * VMRegImpl::stack_slot_size);
  1.3085 +        // The first string we find might still be in the original java arg
  1.3086 +        // register
  1.3087 +
  1.3088 +        VMReg src = in_regs[j_arg].first();
  1.3089 +
  1.3090 +        // We will need to eventually save the final argument to the trap
   1.3091 +        // in the non-volatile location dedicated to src. This is the offset
  1.3092 +        // from fp we will use.
  1.3093 +        int src_off = src->is_reg() ?
  1.3094 +            fp_offset[src->value()] : reg2offset_in(src);
  1.3095 +
  1.3096 +        // This is where the argument will eventually reside
  1.3097 +        VMRegPair dst = out_regs[c_arg];
  1.3098 +
  1.3099 +        if (src->is_reg()) {
  1.3100 +          if (sid == 1) {
  1.3101 +            __ movq(c_rarg0, src->as_Register());
  1.3102 +          } else {
  1.3103 +            __ movq(c_rarg0, utf8_addr);
  1.3104 +          }
  1.3105 +        } else {
  1.3106 +          // arg is still in the original location
  1.3107 +          __ movq(c_rarg0, Address(rbp, reg2offset_in(src)));
  1.3108 +        }
  1.3109 +        Label done, convert;
  1.3110 +
  1.3111 +        // see if the oop is NULL
  1.3112 +        __ testq(c_rarg0, c_rarg0);
  1.3113 +        __ jcc(Assembler::notEqual, convert);
  1.3114 +
  1.3115 +        if (dst.first()->is_reg()) {
   1.3116 +          // Save the ptr to the utf8 string in the original src loc or the tmp
  1.3117 +          // dedicated to it
  1.3118 +          __ movq(Address(rbp, src_off), c_rarg0);
  1.3119 +        } else {
  1.3120 +          __ movq(Address(rsp, reg2offset_out(dst.first())), c_rarg0);
  1.3121 +        }
  1.3122 +        __ jmp(done);
  1.3123 +
  1.3124 +        __ bind(convert);
  1.3125 +
  1.3126 +        __ lea(c_rarg1, utf8_addr);
  1.3127 +        if (dst.first()->is_reg()) {
  1.3128 +          __ movq(Address(rbp, src_off), c_rarg1);
  1.3129 +        } else {
  1.3130 +          __ movq(Address(rsp, reg2offset_out(dst.first())), c_rarg1);
  1.3131 +        }
  1.3132 +        // And do the conversion
  1.3133 +        __ call(RuntimeAddress(
  1.3134 +                CAST_FROM_FN_PTR(address, SharedRuntime::get_utf)));
  1.3135 +
  1.3136 +        __ bind(done);
  1.3137 +      }
  1.3138 +      if (in_sig_bt[j_arg] == T_OBJECT && out_sig_bt[c_arg] == T_LONG) {
  1.3139 +        assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
  1.3140 +        ++c_arg; // skip over T_VOID to keep the loop indices in sync
  1.3141 +      }
  1.3142 +    }
  1.3143 +    // The get_utf call killed all the c_arg registers
  1.3144 +    live[c_rarg0->as_VMReg()->value()] = false;
  1.3145 +    live[c_rarg1->as_VMReg()->value()] = false;
  1.3146 +    live[c_rarg2->as_VMReg()->value()] = false;
  1.3147 +    live[c_rarg3->as_VMReg()->value()] = false;
  1.3148 +    live[c_rarg4->as_VMReg()->value()] = false;
  1.3149 +    live[c_rarg5->as_VMReg()->value()] = false;
  1.3150 +
  1.3151 +    live[c_farg0->as_VMReg()->value()] = false;
  1.3152 +    live[c_farg1->as_VMReg()->value()] = false;
  1.3153 +    live[c_farg2->as_VMReg()->value()] = false;
  1.3154 +    live[c_farg3->as_VMReg()->value()] = false;
  1.3155 +    live[c_farg4->as_VMReg()->value()] = false;
  1.3156 +    live[c_farg5->as_VMReg()->value()] = false;
  1.3157 +    live[c_farg6->as_VMReg()->value()] = false;
  1.3158 +    live[c_farg7->as_VMReg()->value()] = false;
  1.3159 +  }
  1.3160 +
  1.3161 +  // Now we can finally move the register args to their desired locations
  1.3162 +
  1.3163 +  rax_is_zero = false;
  1.3164 +
  1.3165 +  for (j_arg = first_arg_to_pass, c_arg = 0 ;
  1.3166 +       j_arg < total_args_passed ; j_arg++, c_arg++ ) {
  1.3167 +
  1.3168 +    VMRegPair src = in_regs[j_arg];
  1.3169 +    VMRegPair dst = out_regs[c_arg];
  1.3170 +
   1.3171 +    // Only need to look for args destined for the integer registers (since we
  1.3172 +    // convert float/double args to look like int/long outbound)
  1.3173 +    if (dst.first()->is_reg()) {
  1.3174 +      Register r =  dst.first()->as_Register();
  1.3175 +
   1.3176 +      // Check if the java arg is unsupported and therefore useless
  1.3177 +      bool useless =  in_sig_bt[j_arg] == T_ARRAY ||
  1.3178 +                      (in_sig_bt[j_arg] == T_OBJECT &&
  1.3179 +                       out_sig_bt[c_arg] != T_INT &&
  1.3180 +                       out_sig_bt[c_arg] != T_ADDRESS &&
  1.3181 +                       out_sig_bt[c_arg] != T_LONG);
  1.3182 +
  1.3183 +
  1.3184 +      // If we're going to kill an existing arg save it first
  1.3185 +      if (live[dst.first()->value()]) {
  1.3186 +        // you can't kill yourself
  1.3187 +        if (src.first() != dst.first()) {
  1.3188 +          __ movq(Address(rbp, fp_offset[dst.first()->value()]), r);
  1.3189 +        }
  1.3190 +      }
  1.3191 +      if (src.first()->is_reg()) {
  1.3192 +        if (live[src.first()->value()] ) {
  1.3193 +          if (in_sig_bt[j_arg] == T_FLOAT) {
  1.3194 +            __ movdl(r, src.first()->as_XMMRegister());
  1.3195 +          } else if (in_sig_bt[j_arg] == T_DOUBLE) {
  1.3196 +            __ movdq(r, src.first()->as_XMMRegister());
  1.3197 +          } else if (r != src.first()->as_Register()) {
  1.3198 +            if (!useless) {
  1.3199 +              __ movq(r, src.first()->as_Register());
  1.3200 +            }
  1.3201 +          }
  1.3202 +        } else {
   1.3203 +          // If the arg is an oop type we don't support, don't bother to store
  1.3204 +          // it
  1.3205 +          if (!useless) {
  1.3206 +            if (in_sig_bt[j_arg] == T_DOUBLE ||
  1.3207 +                in_sig_bt[j_arg] == T_LONG  ||
  1.3208 +                in_sig_bt[j_arg] == T_OBJECT ) {
  1.3209 +              __ movq(r, Address(rbp, fp_offset[src.first()->value()]));
  1.3210 +            } else {
  1.3211 +              __ movl(r, Address(rbp, fp_offset[src.first()->value()]));
  1.3212 +            }
  1.3213 +          }
  1.3214 +        }
  1.3215 +        live[src.first()->value()] = false;
  1.3216 +      } else if (!useless) {
  1.3217 +        // full sized move even for int should be ok
  1.3218 +        __ movq(r, Address(rbp, reg2offset_in(src.first())));
  1.3219 +      }
  1.3220 +
  1.3221 +      // At this point r has the original java arg in the final location
  1.3222 +      // (assuming it wasn't useless). If the java arg was an oop
  1.3223 +      // we have a bit more to do
  1.3224 +
  1.3225 +      if (in_sig_bt[j_arg] == T_ARRAY || in_sig_bt[j_arg] == T_OBJECT ) {
  1.3226 +        if (out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
  1.3227 +          // need to unbox a one-word value
  1.3228 +          Label skip;
  1.3229 +          __ testq(r, r);
  1.3230 +          __ jcc(Assembler::equal, skip);
  1.3231 +          BasicType bt = out_sig_bt[c_arg];
  1.3232 +          int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
  1.3233 +          Address src1(r, box_offset);
  1.3234 +          if ( bt == T_LONG ) {
  1.3235 +            __ movq(r, src1);
  1.3236 +          } else {
  1.3237 +            __ movl(r, src1);
  1.3238 +          }
  1.3239 +          __ bind(skip);
  1.3240 +
  1.3241 +        } else if (out_sig_bt[c_arg] != T_ADDRESS) {
  1.3242 +          // Convert the arg to NULL
  1.3243 +          __ xorq(r, r);
  1.3244 +        }
  1.3245 +      }
  1.3246 +
   1.3247 +      // dst can no longer be holding an input value
  1.3248 +      live[dst.first()->value()] = false;
  1.3249 +    }
  1.3250 +    if (in_sig_bt[j_arg] == T_OBJECT && out_sig_bt[c_arg] == T_LONG) {
  1.3251 +      assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
  1.3252 +      ++c_arg; // skip over T_VOID to keep the loop indices in sync
  1.3253 +    }
  1.3254 +  }
  1.3255 +
  1.3256 +
  1.3257 +  // Ok now we are done. Need to place the nop that dtrace wants in order to
  1.3258 +  // patch in the trap
  1.3259 +  int patch_offset = ((intptr_t)__ pc()) - start;
  1.3260 +
  1.3261 +  __ nop();
  1.3262 +
  1.3263 +
  1.3264 +  // Return
  1.3265 +
  1.3266 +  __ leave();
  1.3267 +  __ ret(0);
  1.3268 +
  1.3269 +  __ flush();
  1.3270 +
  1.3271 +  nmethod *nm = nmethod::new_dtrace_nmethod(
  1.3272 +      method, masm->code(), vep_offset, patch_offset, frame_complete,
  1.3273 +      stack_slots / VMRegImpl::slots_per_word);
  1.3274 +  return nm;
  1.3275 +
  1.3276 +}
  1.3277 +
  1.3278 +#endif // HAVE_DTRACE_H
  1.3279 +
   1.3280 +// This function returns the adjustment (in number of words) to a c2i adapter
   1.3281 +// activation, for use during deoptimization.
  1.3282 +int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
  1.3283 +  return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
  1.3284 +}
  1.3285 +
  1.3286 +
  1.3287 +uint SharedRuntime::out_preserve_stack_slots() {
  1.3288 +  return 0;
  1.3289 +}
  1.3290 +
  1.3291 +//------------------------------generate_deopt_blob----------------------------
  1.3292 +void SharedRuntime::generate_deopt_blob() {
  1.3293 +  // Allocate space for the code
  1.3294 +  ResourceMark rm;
  1.3295 +  // Setup code generation tools
  1.3296 +  CodeBuffer buffer("deopt_blob", 2048, 1024);
  1.3297 +  MacroAssembler* masm = new MacroAssembler(&buffer);
  1.3298 +  int frame_size_in_words;
  1.3299 +  OopMap* map = NULL;
  1.3300 +  OopMapSet *oop_maps = new OopMapSet();
  1.3301 +
  1.3302 +  // -------------
  1.3303 +  // This code enters when returning to a de-optimized nmethod.  A return
   1.3304 +  // address has been pushed on the stack, and return values are in
  1.3305 +  // registers.
  1.3306 +  // If we are doing a normal deopt then we were called from the patched
  1.3307 +  // nmethod from the point we returned to the nmethod. So the return
   1.3308 +  // address on the stack is wrong by NativeCall::instruction_size.
  1.3309 +  // We will adjust the value so it looks like we have the original return
  1.3310 +  // address on the stack (like when we eagerly deoptimized).
  1.3311 +  // In the case of an exception pending when deoptimizing, we enter
  1.3312 +  // with a return address on the stack that points after the call we patched
  1.3313 +  // into the exception handler. We have the following register state from,
  1.3314 +  // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
  1.3315 +  //    rax: exception oop
  1.3316 +  //    rbx: exception handler
  1.3317 +  //    rdx: throwing pc
  1.3318 +  // So in this case we simply jam rdx into the useless return address and
  1.3319 +  // the stack looks just like we want.
  1.3320 +  //
  1.3321 +  // At this point we need to de-opt.  We save the argument return
  1.3322 +  // registers.  We call the first C routine, fetch_unroll_info().  This
  1.3323 +  // routine captures the return values and returns a structure which
  1.3324 +  // describes the current frame size and the sizes of all replacement frames.
  1.3325 +  // The current frame is compiled code and may contain many inlined
  1.3326 +  // functions, each with their own JVM state.  We pop the current frame, then
  1.3327 +  // push all the new frames.  Then we call the C routine unpack_frames() to
  1.3328 +  // populate these frames.  Finally unpack_frames() returns us the new target
  1.3329 +  // address.  Notice that callee-save registers are BLOWN here; they have
  1.3330 +  // already been captured in the vframeArray at the time the return PC was
  1.3331 +  // patched.
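          +  //
          +  // The blob has several entry points -- the normal deopt entry below,
          +  // reexecute_offset, and the exception entries (exception_offset /
          +  // exception_in_tls_offset). Each path saves the live registers, records its
          +  // Unpack_* mode in r14 (callee-saved), and converges at the 'cont' label.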
  1.3332 +  address start = __ pc();
  1.3333 +  Label cont;
  1.3334 +
   1.3335 +  // Prolog for the non-exception case!
  1.3336 +
  1.3337 +  // Save everything in sight.
  1.3338 +  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
  1.3339 +
  1.3340 +  // Normal deoptimization.  Save exec mode for unpack_frames.
  1.3341 +  __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
  1.3342 +  __ jmp(cont);
  1.3343 +
  1.3344 +  int reexecute_offset = __ pc() - start;
  1.3345 +
  1.3346 +  // Reexecute case
   1.3347 +  // return address is the pc that describes which bci to re-execute at
  1.3348 +
  1.3349 +  // No need to update map as each call to save_live_registers will produce identical oopmap
  1.3350 +  (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
  1.3351 +
  1.3352 +  __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
  1.3353 +  __ jmp(cont);
  1.3354 +
  1.3355 +  int exception_offset = __ pc() - start;
  1.3356 +
  1.3357 +  // Prolog for exception case
  1.3358 +
  1.3359 +  // all registers are dead at this entry point, except for rax, and
  1.3360 +  // rdx which contain the exception oop and exception pc
  1.3361 +  // respectively.  Set them in TLS and fall thru to the
  1.3362 +  // unpack_with_exception_in_tls entry point.
  1.3363 +
  1.3364 +  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  1.3365 +  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
  1.3366 +
  1.3367 +  int exception_in_tls_offset = __ pc() - start;
  1.3368 +
  1.3369 +  // new implementation because exception oop is now passed in JavaThread
  1.3370 +
  1.3371 +  // Prolog for exception case
  1.3372 +  // All registers must be preserved because they might be used by LinearScan
   1.3373 +  // Exception oop and throwing PC are passed in JavaThread
  1.3374 +  // tos: stack at point of call to method that threw the exception (i.e. only
  1.3375 +  // args are on the stack, no return address)
  1.3376 +
  1.3377 +  // make room on stack for the return address
  1.3378 +  // It will be patched later with the throwing pc. The correct value is not
  1.3379 +  // available now because loading it from memory would destroy registers.
  1.3380 +  __ push(0);
  1.3381 +
  1.3382 +  // Save everything in sight.
  1.3383 +  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
  1.3384 +
  1.3385 +  // Now it is safe to overwrite any register
  1.3386 +
  1.3387 +  // Deopt during an exception.  Save exec mode for unpack_frames.
  1.3388 +  __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
  1.3389 +
  1.3390 +  // load throwing pc from JavaThread and patch it as the return address
  1.3391 +  // of the current frame. Then clear the field in JavaThread
  1.3392 +
  1.3393 +  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  1.3394 +  __ movptr(Address(rbp, wordSize), rdx);
  1.3395 +  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
  1.3396 +
  1.3397 +#ifdef ASSERT
  1.3398 +  // verify that there is really an exception oop in JavaThread
  1.3399 +  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  1.3400 +  __ verify_oop(rax);
  1.3401 +
  1.3402 +  // verify that there is no pending exception
  1.3403 +  Label no_pending_exception;
  1.3404 +  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  1.3405 +  __ testptr(rax, rax);
  1.3406 +  __ jcc(Assembler::zero, no_pending_exception);
  1.3407 +  __ stop("must not have pending exception here");
  1.3408 +  __ bind(no_pending_exception);
  1.3409 +#endif
  1.3410 +
  1.3411 +  __ bind(cont);
  1.3412 +
  1.3413 +  // Call C code.  Need thread and this frame, but NOT official VM entry
  1.3414 +  // crud.  We cannot block on this call, no GC can happen.
  1.3415 +  //
  1.3416 +  // UnrollBlock* fetch_unroll_info(JavaThread* thread)
  1.3417 +
  1.3418 +  // fetch_unroll_info needs to call last_java_frame().
  1.3419 +
  1.3420 +  __ set_last_Java_frame(noreg, noreg, NULL);
  1.3421 +#ifdef ASSERT
  1.3422 +  { Label L;
  1.3423 +    __ cmpptr(Address(r15_thread,
  1.3424 +                    JavaThread::last_Java_fp_offset()),
  1.3425 +            (int32_t)0);
  1.3426 +    __ jcc(Assembler::equal, L);
  1.3427 +    __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
  1.3428 +    __ bind(L);
  1.3429 +  }
  1.3430 +#endif // ASSERT
  1.3431 +  __ mov(c_rarg0, r15_thread);
  1.3432 +  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
  1.3433 +
  1.3434 +  // Need to have an oopmap that tells fetch_unroll_info where to
  1.3435 +  // find any register it might need.
  1.3436 +  oop_maps->add_gc_map(__ pc() - start, map);
  1.3437 +
  1.3438 +  __ reset_last_Java_frame(false, false);
  1.3439 +
  1.3440 +  // Load UnrollBlock* into rdi
  1.3441 +  __ mov(rdi, rax);
  1.3442 +
   1.3443 +  Label noException;
  1.3444 +  __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
  1.3445 +  __ jcc(Assembler::notEqual, noException);
  1.3446 +  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  1.3447 +  // QQQ this is useless it was NULL above
  1.3448 +  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  1.3449 +  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
  1.3450 +  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
  1.3451 +
  1.3452 +  __ verify_oop(rax);
  1.3453 +
  1.3454 +  // Overwrite the result registers with the exception results.
  1.3455 +  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  1.3456 +  // I think this is useless
  1.3457 +  __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
  1.3458 +
  1.3459 +  __ bind(noException);
  1.3460 +
  1.3461 +  // Only register save data is on the stack.
  1.3462 +  // Now restore the result registers.  Everything else is either dead
  1.3463 +  // or captured in the vframeArray.
  1.3464 +  RegisterSaver::restore_result_registers(masm);
  1.3465 +
   1.3466 +  // All of the register save area has been popped off the stack. Only the
  1.3467 +  // return address remains.
  1.3468 +
  1.3469 +  // Pop all the frames we must move/replace.
  1.3470 +  //
  1.3471 +  // Frame picture (youngest to oldest)
  1.3472 +  // 1: self-frame (no frame link)
  1.3473 +  // 2: deopting frame  (no frame link)
  1.3474 +  // 3: caller of deopting frame (could be compiled/interpreted).
  1.3475 +  //
  1.3476 +  // Note: by leaving the return address of self-frame on the stack
  1.3477 +  // and using the size of frame 2 to adjust the stack
  1.3478 +  // when we are done the return to frame 3 will still be on the stack.
  1.3479 +
  1.3480 +  // Pop deoptimized frame
  1.3481 +  __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
  1.3482 +  __ addptr(rsp, rcx);
  1.3483 +
  1.3484 +  // rsp should be pointing at the return address to the caller (3)
  1.3485 +
  1.3486 +  // Pick up the initial fp we should save
  1.3487 +  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  1.3488 +  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
  1.3489 +
  1.3490 +#ifdef ASSERT
  1.3491 +  // Compilers generate code that bang the stack by as much as the
  1.3492 +  // interpreter would need. So this stack banging should never
  1.3493 +  // trigger a fault. Verify that it does not on non product builds.
  1.3494 +  if (UseStackBanging) {
  1.3495 +    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
  1.3496 +    __ bang_stack_size(rbx, rcx);
  1.3497 +  }
  1.3498 +#endif
  1.3499 +
  1.3500 +  // Load address of array of frame pcs into rcx
  1.3501 +  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
  1.3502 +
  1.3503 +  // Trash the old pc
  1.3504 +  __ addptr(rsp, wordSize);
  1.3505 +
  1.3506 +  // Load address of array of frame sizes into rsi
  1.3507 +  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
  1.3508 +
  1.3509 +  // Load counter into rdx
  1.3510 +  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
  1.3511 +
  1.3512 +  // Now adjust the caller's stack to make up for the extra locals
  1.3513 +  // but record the original sp so that we can save it in the skeletal interpreter
  1.3514 +  // frame and the stack walking of interpreter_sender will get the unextended sp
  1.3515 +  // value and not the "real" sp value.
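          +  // (caller_adjustment was computed by the deoptimization code, cf.
          +  // last_frame_adjust() above: the extra words needed for the callee's locals
          +  // beyond its parameters.)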
  1.3516 +
  1.3517 +  const Register sender_sp = r8;
  1.3518 +
  1.3519 +  __ mov(sender_sp, rsp);
  1.3520 +  __ movl(rbx, Address(rdi,
  1.3521 +                       Deoptimization::UnrollBlock::
  1.3522 +                       caller_adjustment_offset_in_bytes()));
  1.3523 +  __ subptr(rsp, rbx);
  1.3524 +
  1.3525 +  // Push interpreter frames in a loop
  1.3526 +  Label loop;
  1.3527 +  __ bind(loop);
  1.3528 +  __ movptr(rbx, Address(rsi, 0));      // Load frame size
  1.3529 +#ifdef CC_INTERP
  1.3530 +  __ subptr(rbx, 4*wordSize);           // we'll push pc and ebp by hand and
  1.3531 +#ifdef ASSERT
  1.3532 +  __ push(0xDEADDEAD);                  // Make a recognizable pattern
  1.3533 +  __ push(0xDEADDEAD);
  1.3534 +#else /* ASSERT */
  1.3535 +  __ subptr(rsp, 2*wordSize);           // skip the "static long no_param"
  1.3536 +#endif /* ASSERT */
  1.3537 +#else
  1.3538 +  __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
  1.3539 +#endif // CC_INTERP
  1.3540 +  __ pushptr(Address(rcx, 0));          // Save return address
  1.3541 +  __ enter();                           // Save old & set new ebp
  1.3542 +  __ subptr(rsp, rbx);                  // Prolog
  1.3543 +#ifdef CC_INTERP
  1.3544 +  __ movptr(Address(rbp,
  1.3545 +                  -(sizeof(BytecodeInterpreter)) + in_bytes(byte_offset_of(BytecodeInterpreter, _sender_sp))),
  1.3546 +            sender_sp); // Make it walkable
  1.3547 +#else /* CC_INTERP */
  1.3548 +  // This value is corrected by layout_activation_impl
  1.3549 +  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
  1.3550 +  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
  1.3551 +#endif /* CC_INTERP */
  1.3552 +  __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
  1.3553 +  __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
  1.3554 +  __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
  1.3555 +  __ decrementl(rdx);                   // Decrement counter
  1.3556 +  __ jcc(Assembler::notZero, loop);
  1.3557 +  __ pushptr(Address(rcx, 0));          // Save final return address
  1.3558 +
  1.3559 +  // Re-push self-frame
  1.3560 +  __ enter();                           // Save old & set new ebp
  1.3561 +
  1.3562 +  // Allocate a full sized register save area.
   1.3563 +  // Return address and rbp are in place, so we allocate two fewer words.
  1.3564 +  __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
  1.3565 +
  1.3566 +  // Restore frame locals after moving the frame
  1.3567 +  __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
  1.3568 +  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
  1.3569 +
  1.3570 +  // Call C code.  Need thread but NOT official VM entry
  1.3571 +  // crud.  We cannot block on this call, no GC can happen.  Call should
  1.3572 +  // restore return values to their stack-slots with the new SP.
  1.3573 +  //
  1.3574 +  // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
  1.3575 +
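         +  // exec_mode is one of the Deoptimization::Unpack_* constants (Unpack_deopt,
         +  // Unpack_exception, ...); it is presumed to have been stashed in r14 earlier
         +  // in this blob and is simply passed along here.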
  1.3576 +  // Use rbp because the frames look interpreted now
  1.3577 +  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  1.3578 +  // Don't need the precise return PC here, just precise enough to point into this code blob.
  1.3579 +  address the_pc = __ pc();
  1.3580 +  __ set_last_Java_frame(noreg, rbp, the_pc);
  1.3581 +
  1.3582 +  __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
  1.3583 +  __ mov(c_rarg0, r15_thread);
  1.3584 +  __ movl(c_rarg1, r14); // second arg: exec_mode
  1.3585 +  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  1.3586 +  // Revert SP alignment after call since we're going to do some SP relative addressing below
  1.3587 +  __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
  1.3588 +
  1.3589 +  // Set an oopmap for the call site
  1.3590 +  // Use the same PC we used for the last java frame
  1.3591 +  oop_maps->add_gc_map(the_pc - start,
  1.3592 +                       new OopMap( frame_size_in_words, 0 ));
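         +  // An empty map is sufficient: as noted above, no GC can happen across this
         +  // call, so there are no oops to record for this frame.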
  1.3593 +
  1.3594 +  // Clear fp AND pc
  1.3595 +  __ reset_last_Java_frame(true, true);
  1.3596 +
  1.3597 +  // Collect return values
  1.3598 +  __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
  1.3599 +  __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
  1.3600 +  // This is probably unnecessary (rdx would only hold a throwing pc)
  1.3601 +  __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
  1.3602 +
  1.3603 +  // Pop self-frame.
  1.3604 +  __ leave();                           // Epilog
  1.3605 +
  1.3606 +  // Jump to interpreter
  1.3607 +  __ ret(0);
  1.3608 +
  1.3609 +  // Make sure all code is generated
  1.3610 +  masm->flush();
  1.3611 +
  1.3612 +  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  1.3613 +  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
  1.3614 +}
  1.3615 +
  1.3616 +#ifdef COMPILER2
  1.3617 +//------------------------------generate_uncommon_trap_blob--------------------
  1.3618 +void SharedRuntime::generate_uncommon_trap_blob() {
  1.3619 +  // Allocate space for the code
  1.3620 +  ResourceMark rm;
  1.3621 +  // Setup code generation tools
  1.3622 +  CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
  1.3623 +  MacroAssembler* masm = new MacroAssembler(&buffer);
  1.3624 +
  1.3625 +  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
  1.3626 +
  1.3627 +  address start = __ pc();
  1.3628 +
  1.3629 +  if (UseRTMLocking) {
  1.3630 +    // Abort RTM transaction before possible nmethod deoptimization.
  1.3631 +    __ xabort(0);
  1.3632 +  }
  1.3633 +
  1.3634 +  // Push self-frame.  We get here with a return address on the
  1.3635 +  // stack, so rsp is 8-byte aligned until we allocate our frame.
  1.3636 +  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
  1.3637 +
  1.3638 +  // No callee saved registers. rbp is assumed implicitly saved
  1.3639 +  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
  1.3640 +
  1.3641 +  // The compiler left unloaded_class_index in j_rarg0; move it to where the
  1.3642 +  // runtime expects it.
  1.3643 +  __ movl(c_rarg1, j_rarg0);
  1.3644 +
  1.3645 +  __ set_last_Java_frame(noreg, noreg, NULL);
  1.3646 +
  1.3647 +  // Call C code.  Need thread but NOT official VM entry
  1.3648 +  // crud.  We cannot block on this call, no GC can happen.  Call should
  1.3649 +  // capture callee-saved registers as well as return values.
  1.3650 +  // Thread is in rdi already.
  1.3651 +  //
  1.3652 +  // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
  1.3653 +
  1.3654 +  __ mov(c_rarg0, r15_thread);
  1.3655 +  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
  1.3656 +
  1.3657 +  // Set an oopmap for the call site
  1.3658 +  OopMapSet* oop_maps = new OopMapSet();
  1.3659 +  OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
  1.3660 +
  1.3661 +  // location of rbp is known implicitly by the frame sender code
  1.3662 +
  1.3663 +  oop_maps->add_gc_map(__ pc() - start, map);
  1.3664 +
  1.3665 +  __ reset_last_Java_frame(false, false);
  1.3666 +
  1.3667 +  // Load UnrollBlock* into rdi
  1.3668 +  __ mov(rdi, rax);
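         +  // The UnrollBlock returned by uncommon_trap() has the same shape as the one
         +  // used by the deopt blob above: per-frame sizes and pcs, a frame count and a
         +  // caller adjustment. It is consumed below in the same way.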
  1.3669 +
  1.3670 +  // Pop all the frames we must move/replace.
  1.3671 +  //
  1.3672 +  // Frame picture (youngest to oldest)
  1.3673 +  // 1: self-frame (no frame link)
  1.3674 +  // 2: deopting frame  (no frame link)
  1.3675 +  // 3: caller of deopting frame (could be compiled/interpreted).
  1.3676 +
  1.3677 +  // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
  1.3678 +  __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
  1.3679 +
  1.3680 +  // Pop deoptimized frame (int)
  1.3681 +  __ movl(rcx, Address(rdi,
  1.3682 +                       Deoptimization::UnrollBlock::
  1.3683 +                       size_of_deoptimized_frame_offset_in_bytes()));
  1.3684 +  __ addptr(rsp, rcx);
  1.3685 +
  1.3686 +  // rsp should be pointing at the return address to the caller (3)
  1.3687 +
  1.3688 +  // Pick up the initial fp we should save
  1.3689 +  // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
  1.3690 +  __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
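         +  // initial_info is platform-dependent data about the sender frame; on x86 it
         +  // is the caller's fp, which is why it can be installed directly into rbp.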
  1.3691 +
  1.3692 +#ifdef ASSERT
  1.3693 +  // Compilers generate code that bangs the stack by as much as the
  1.3694 +  // interpreter would need, so this stack banging should never
  1.3695 +  // trigger a fault. Verify that it does not on non-product builds.
  1.3696 +  if (UseStackBanging) {
  1.3697 +    __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
  1.3698 +    __ bang_stack_size(rbx, rcx);
  1.3699 +  }
  1.3700 +#endif
  1.3701 +
  1.3702 +  // Load address of array of frame pcs into rcx (address*)
  1.3703 +  __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
  1.3704 +
  1.3705 +  // Trash the return pc
  1.3706 +  __ addptr(rsp, wordSize);
  1.3707 +
  1.3708 +  // Load address of array of frame sizes into rsi (intptr_t*)
  1.3709 +  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
  1.3710 +
  1.3711 +  // Counter
  1.3712 +  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
  1.3713 +
  1.3714 +  // Now adjust the caller's stack to make up for the extra locals but
  1.3715 +  // record the original sp so that we can save it in the skeletal
  1.3716 +  // interpreter frame and the stack walking of interpreter_sender
  1.3717 +  // will get the unextended sp value and not the "real" sp value.
  1.3718 +
  1.3719 +  const Register sender_sp = r8;
  1.3720 +
  1.3721 +  __ mov(sender_sp, rsp);
  1.3722 +  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
  1.3723 +  __ subptr(rsp, rbx);
  1.3724 +
  1.3725 +  // Push interpreter frames in a loop
  1.3726 +  Label loop;
  1.3727 +  __ bind(loop);
  1.3728 +  __ movptr(rbx, Address(rsi, 0)); // Load frame size
  1.3729 +  __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
  1.3730 +  __ pushptr(Address(rcx, 0));     // Save return address
  1.3731 +  __ enter();                      // Save old & set new rbp
  1.3732 +  __ subptr(rsp, rbx);             // Prolog
  1.3733 +#ifdef CC_INTERP
  1.3734 +  __ movptr(Address(rbp,
  1.3735 +                  -(sizeof(BytecodeInterpreter)) + in_bytes(byte_offset_of(BytecodeInterpreter, _sender_sp))),
  1.3736 +            sender_sp); // Make it walkable
  1.3737 +#else // CC_INTERP
  1.3738 +  __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
  1.3739 +            sender_sp);            // Make it walkable
  1.3740 +  // This value is corrected by layout_activation_impl
  1.3741 +  __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
  1.3742 +#endif // CC_INTERP
  1.3743 +  __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
  1.3744 +  __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
  1.3745 +  __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
  1.3746 +  __ decrementl(rdx);              // Decrement counter
  1.3747 +  __ jcc(Assembler::notZero, loop);
  1.3748 +  __ pushptr(Address(rcx, 0));     // Save final return address
  1.3749 +
  1.3750 +  // Re-push self-frame
  1.3751 +  __ enter();                 // Save old & set new rbp
  1.3752 +  __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
  1.3753 +                              // Prolog
  1.3754 +
  1.3755 +  // Use rbp because the frames look interpreted now
  1.3756 +  // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
  1.3757 +  // Don't need the precise return PC here, just precise enough to point into this code blob.
  1.3758 +  address the_pc = __ pc();
  1.3759 +  __ set_last_Java_frame(noreg, rbp, the_pc);
  1.3760 +
  1.3761 +  // Call C code.  Need thread but NOT official VM entry
  1.3762 +  // crud.  We cannot block on this call, no GC can happen.  Call should
  1.3763 +  // restore return values to their stack-slots with the new SP.
  1.3764 +  // Thread is in rdi already.
  1.3765 +  //
  1.3766 +  // BasicType unpack_frames(JavaThread* thread, int exec_mode);
  1.3767 +
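         +  // Unlike the deopt blob above, where exec_mode arrives in r14, here it is
         +  // always the constant Unpack_uncommon_trap.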
  1.3768 +  __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
  1.3769 +  __ mov(c_rarg0, r15_thread);
  1.3770 +  __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
  1.3771 +  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
  1.3772 +
  1.3773 +  // Set an oopmap for the call site
  1.3774 +  // Use the same PC we used for the last java frame
  1.3775 +  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
  1.3776 +
  1.3777 +  // Clear fp AND pc
  1.3778 +  __ reset_last_Java_frame(true, true);
  1.3779 +
  1.3780 +  // Pop self-frame.
  1.3781 +  __ leave();                 // Epilog
  1.3782 +
  1.3783 +  // Jump to interpreter
  1.3784 +  __ ret(0);
  1.3785 +
  1.3786 +  // Make sure all code is generated
  1.3787 +  masm->flush();
  1.3788 +
  1.3789 +  _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
  1.3790 +                                                 SimpleRuntimeFrame::framesize >> 1);
  1.3791 +}
  1.3792 +#endif // COMPILER2
  1.3793 +
  1.3794 +
  1.3795 +//------------------------------generate_handler_blob------
  1.3796 +//
  1.3797 +// Generate a special Compile2Runtime blob that saves all registers
  1.3798 +// and sets up an oopmap.
  1.3799 +//
  1.3800 +SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
  1.3801 +  assert(StubRoutines::forward_exception_entry() != NULL,
  1.3802 +         "must be generated before");
  1.3803 +
  1.3804 +  ResourceMark rm;
  1.3805 +  OopMapSet *oop_maps = new OopMapSet();
  1.3806 +  OopMap* map;
  1.3807 +
  1.3808 +  // Allocate space for the code.  Setup code generation tools.
  1.3809 +  CodeBuffer buffer("handler_blob", 2048, 1024);
  1.3810 +  MacroAssembler* masm = new MacroAssembler(&buffer);
  1.3811 +
  1.3812 +  address start   = __ pc();
  1.3813 +  address call_pc = NULL;
  1.3814 +  int frame_size_in_words;
  1.3815 +  bool cause_return = (poll_type == POLL_AT_RETURN);
  1.3816 +  bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
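         +  // cause_return means the safepoint poll was taken at a return site, so the
         +  // return address already on the stack is the correct one; otherwise a dummy
         +  // word is pushed below and later patched with the saved exception pc.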
  1.3817 +
  1.3818 +  if (UseRTMLocking) {
  1.3819 +    // Abort RTM transaction before calling runtime
  1.3820 +    // because critical section will be large and will be
  1.3821 +    // aborted anyway. Also nmethod could be deoptimized.
  1.3822 +    __ xabort(0);
  1.3823 +  }
  1.3824 +
  1.3825 +  // Make room for return address (or push it again)
  1.3826 +  if (!cause_return) {
  1.3827 +    __ push(rbx);
  1.3828 +  }
  1.3829 +
  1.3830 +  // Save registers, fpu state, and flags
  1.3831 +  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
  1.3832 +
  1.3833 +  // The following is basically a call_VM.  However, we need the precise
  1.3834 +  // address of the call in order to generate an oopmap. Hence, we do all the
  1.3835 +  // work ourselves.
  1.3836 +
  1.3837 +  __ set_last_Java_frame(noreg, noreg, NULL);
  1.3838 +
  1.3839 +  // The return address must always be correct so that the frame constructor never
  1.3840 +  // sees an invalid pc.
  1.3841 +
  1.3842 +  if (!cause_return) {
  1.3843 +    // overwrite the dummy value we pushed on entry
  1.3844 +    __ movptr(c_rarg0, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
  1.3845 +    __ movptr(Address(rbp, wordSize), c_rarg0);
  1.3846 +  }
  1.3847 +
  1.3848 +  // Do the call
  1.3849 +  __ mov(c_rarg0, r15_thread);
  1.3850 +  __ call(RuntimeAddress(call_ptr));
  1.3851 +
  1.3852 +  // Set an oopmap for the call site.  This oopmap will map all
  1.3853 +  // oop-registers and debug-info registers as callee-saved.  This
  1.3854 +  // will allow deoptimization at this safepoint to find all possible
  1.3855 +  // debug-info recordings, as well as let GC find all oops.
  1.3856 +
  1.3857 +  oop_maps->add_gc_map( __ pc() - start, map);
  1.3858 +
  1.3859 +  Label noException;
  1.3860 +
  1.3861 +  __ reset_last_Java_frame(false, false);
  1.3862 +
  1.3863 +  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
  1.3864 +  __ jcc(Assembler::equal, noException);
  1.3865 +
  1.3866 +  // Exception pending
  1.3867 +
  1.3868 +  RegisterSaver::restore_live_registers(masm, save_vectors);
  1.3869 +
  1.3870 +  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  1.3871 +
  1.3872 +  // No exception case
  1.3873 +  __ bind(noException);
  1.3874 +
  1.3875 +  // Normal exit, restore registers and exit.
  1.3876 +  RegisterSaver::restore_live_registers(masm, save_vectors);
  1.3877 +
  1.3878 +  __ ret(0);
  1.3879 +
  1.3880 +  // Make sure all code is generated
  1.3881 +  masm->flush();
  1.3882 +
  1.3883 +  // Fill-out other meta info
  1.3884 +  return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
  1.3885 +}
  1.3886 +
  1.3887 +//
  1.3888 +// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
  1.3889 +//
  1.3890 +// Generate a stub that calls into the VM to find out the proper destination
  1.3891 +// of a Java call. All the argument registers are live at this point,
  1.3892 +// but since this is generic code we don't know what they are, so the caller
  1.3893 +// must do any GC of the args.
  1.3894 +//
  1.3895 +RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  1.3896 +  assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  1.3897 +
  1.3898 +  // allocate space for the code
  1.3899 +  ResourceMark rm;
  1.3900 +
  1.3901 +  CodeBuffer buffer(name, 1000, 512);
  1.3902 +  MacroAssembler* masm                = new MacroAssembler(&buffer);
  1.3903 +
  1.3904 +  int frame_size_in_words;
  1.3905 +
  1.3906 +  OopMapSet *oop_maps = new OopMapSet();
  1.3907 +  OopMap* map = NULL;
  1.3908 +
  1.3909 +  int start = __ offset();
  1.3910 +
  1.3911 +  map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
  1.3912 +
  1.3913 +  int frame_complete = __ offset();
  1.3914 +
  1.3915 +  __ set_last_Java_frame(noreg, noreg, NULL);
  1.3916 +
  1.3917 +  __ mov(c_rarg0, r15_thread);
  1.3918 +
  1.3919 +  __ call(RuntimeAddress(destination));
  1.3920 +
  1.3921 +
  1.3922 +  // Set an oopmap for the call site.
  1.3923 +  // We need this not only for callee-saved registers, but also for volatile
  1.3924 +  // registers that the compiler might be keeping live across a safepoint.
  1.3925 +
  1.3926 +  oop_maps->add_gc_map( __ offset() - start, map);
  1.3927 +
  1.3928 +  // rax contains the address we are going to jump to, assuming no exception was installed
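         +  // For the resolve stubs this is the callee entry point chosen by the resolver;
         +  // the resolved Method* (if any) comes back separately via vm_result_2 and is
         +  // picked up just below.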
  1.3929 +
  1.3930 +  // clear last_Java_sp
  1.3931 +  __ reset_last_Java_frame(false, false);
  1.3932 +  // check for pending exceptions
  1.3933 +  Label pending;
  1.3934 +  __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
  1.3935 +  __ jcc(Assembler::notEqual, pending);
  1.3936 +
  1.3937 +  // get the returned Method*
  1.3938 +  __ get_vm_result_2(rbx, r15_thread);
  1.3939 +  __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
  1.3940 +
  1.3941 +  __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
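         +  // rbx and rax are written into their save-area slots so that
         +  // restore_live_registers() below reloads them with the Method* and the
         +  // destination address rather than with their values on entry.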
  1.3942 +
  1.3943 +  RegisterSaver::restore_live_registers(masm);
  1.3944 +
  1.3945 +  // We are back to the original state on entry and ready to go.
  1.3946 +
  1.3947 +  __ jmp(rax);
  1.3948 +
  1.3949 +  // Pending exception after the safepoint
  1.3950 +
  1.3951 +  __ bind(pending);
  1.3952 +
  1.3953 +  RegisterSaver::restore_live_registers(masm);
  1.3954 +
  1.3955 +  // exception pending => remove activation and forward to exception handler
  1.3956 +
  1.3957 +  __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
  1.3958 +
  1.3959 +  __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
  1.3960 +  __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
  1.3961 +
  1.3962 +  // -------------
  1.3963 +  // make sure all code is generated
  1.3964 +  masm->flush();
  1.3965 +
  1.3966 +  // return the blob
  1.3967 +  // note: the frame size passed here is in words
  1.3968 +  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
  1.3969 +}
  1.3970 +
  1.3971 +
  1.3972 +#ifdef COMPILER2
  1.3973 +// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
  1.3974 +//
  1.3975 +//------------------------------generate_exception_blob---------------------------
  1.3976 +// creates exception blob at the end
  1.3977 +// Using exception blob, this code is jumped from a compiled method.
  1.3978 +// (see emit_exception_handler in x86_64.ad file)
  1.3979 +//
  1.3980 +// Given an exception pc at a call we call into the runtime for the
  1.3981 +// handler in this method. This handler might merely restore state
  1.3982 +// (i.e. callee save registers) unwind the frame and jump to the
  1.3983 +// exception handler for the nmethod if there is no Java level handler
  1.3984 +// for the nmethod.
  1.3985 +//
  1.3986 +// This code is entered with a jmp.
  1.3987 +//
  1.3988 +// Arguments:
  1.3989 +//   rax: exception oop
  1.3990 +//   rdx: exception pc
  1.3991 +//
  1.3992 +// Results:
  1.3993 +//   rax: exception oop
  1.3994 +//   rdx: exception pc in caller or ???
  1.3995 +//   destination: exception handler of caller
  1.3996 +//
  1.3997 +// Note: the exception pc MUST be at a call (precise debug information)
  1.3998 +//       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
  1.3999 +//
  1.4000 +
  1.4001 +void OptoRuntime::generate_exception_blob() {
  1.4002 +  assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
  1.4003 +  assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
  1.4004 +  assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
  1.4005 +
  1.4006 +  assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
  1.4007 +
  1.4008 +  // Allocate space for the code
  1.4009 +  ResourceMark rm;
  1.4010 +  // Setup code generation tools
  1.4011 +  CodeBuffer buffer("exception_blob", 2048, 1024);
  1.4012 +  MacroAssembler* masm = new MacroAssembler(&buffer);
  1.4013 +
  1.4014 +
  1.4015 +  address start = __ pc();
  1.4016 +
  1.4017 +  // Exception pc is 'return address' for stack walker
  1.4018 +  __ push(rdx);
  1.4019 +  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
  1.4020 +
  1.4021 +  // Save callee-saved registers.  See x86_64.ad.
  1.4022 +
  1.4023 +  // rbp is an implicitly saved callee-saved register (i.e. the calling
  1.4024 +  // convention will save/restore it in prolog/epilog). Other than that
  1.4025 +  // there are no callee-saved registers now that adapter frames are gone.
  1.4026 +
  1.4027 +  __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
  1.4028 +
  1.4029 +  // Store exception in Thread object. We cannot pass any arguments to the
  1.4030 +  // handle_exception call, since we do not want to make any assumption
  1.4031 +  // about the size of the frame where the exception happened in.
  1.4032 +  // c_rarg0 is either rdi (Linux) or rcx (Windows).
  1.4033 +  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
  1.4034 +  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
  1.4035 +
  1.4036 +  // This call does all the hard work.  It checks if an exception handler
  1.4037 +  // exists in the method.
  1.4038 +  // If so, it returns the handler address.
  1.4039 +  // If not, it prepares for stack-unwinding, restoring the callee-save
  1.4040 +  // registers of the frame being removed.
  1.4041 +  //
  1.4042 +  // address OptoRuntime::handle_exception_C(JavaThread* thread)
  1.4043 +
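         +  // The exception oop and pc travel via the thread-local fields stored above
         +  // (nothing but the thread itself is passed as an argument); the handler
         +  // address comes back in rax and is consumed below.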
  1.4044 +  // At a method handle call, the stack may not be properly aligned
  1.4045 +  // when returning with an exception.
  1.4046 +  address the_pc = __ pc();
  1.4047 +  __ set_last_Java_frame(noreg, noreg, the_pc);
  1.4048 +  __ mov(c_rarg0, r15_thread);
  1.4049 +  __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
  1.4050 +  __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
  1.4051 +
  1.4052 +  // Set an oopmap for the call site.  This oopmap will only be used if we
  1.4053 +  // are unwinding the stack.  Hence, all locations will be dead.
  1.4054 +  // Callee-saved registers will be the same as the frame above (i.e.,
  1.4055 +  // handle_exception_stub), since they were restored when we got the
  1.4056 +  // exception.
  1.4057 +
  1.4058 +  OopMapSet* oop_maps = new OopMapSet();
  1.4059 +
  1.4060 +  oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
  1.4061 +
  1.4062 +  __ reset_last_Java_frame(false, true);
  1.4063 +
  1.4064 +  // Restore callee-saved registers
  1.4065 +
  1.4066 +  // rbp is an implicitly saved callee-saved register (i.e. the calling
  1.4067 +  // convention will save/restore it in prolog/epilog). Other than that
  1.4068 +  // there are no callee-saved registers now that adapter frames are gone.
  1.4069 +
  1.4070 +  __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
  1.4071 +
  1.4072 +  __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
  1.4073 +  __ pop(rdx);                  // No need for exception pc anymore
  1.4074 +
  1.4075 +  // rax: exception handler
  1.4076 +
  1.4077 +  // Restore SP from BP if the exception PC is a MethodHandle call site.
  1.4078 +  __ cmpl(Address(r15_thread, JavaThread::is_method_handle_return_offset()), 0);
  1.4079 +  __ cmovptr(Assembler::notEqual, rsp, rbp_mh_SP_save);
  1.4080 +
  1.4081 +  // We have a handler in rax (could be deopt blob).
  1.4082 +  __ mov(r8, rax);
  1.4083 +
  1.4084 +  // Get the exception oop
  1.4085 +  __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  1.4086 +  // Get the exception pc in case we are deoptimized
  1.4087 +  __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
  1.4088 +#ifdef ASSERT
  1.4089 +  __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
  1.4090 +  __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
  1.4091 +#endif
  1.4092 +  // Clear the exception oop so GC no longer processes it as a root.
  1.4093 +  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
  1.4094 +
  1.4095 +  // rax: exception oop
  1.4096 +  // r8:  exception handler
  1.4097 +  // rdx: exception pc
  1.4098 +  // Jump to handler
  1.4099 +
  1.4100 +  __ jmp(r8);
  1.4101 +
  1.4102 +  // Make sure all code is generated
  1.4103 +  masm->flush();
  1.4104 +
  1.4105 +  // Set exception blob
  1.4106 +  _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
  1.4107 +}
  1.4108 +#endif // COMPILER2
