src/cpu/mips/vm/sharedRuntime_mips_64.cpp

author:      huangjia
date:        Fri, 27 Sep 2019 11:31:13 +0800
changeset:   9705:0b27fc8adf1b
parent:      9645:ac996ba07f9d
child:       9759:8c71022cf5f3
permissions: -rw-r--r--

#10071 MIPS Port of 8176100: [REDO][REDO] G1 Needs pre barrier on dereference of weak JNI handles
Summary: fixes crashes in runtime/jni/CallWithJNIWeak/test.sh and runtime/jni/ReturnJNIWeak/test.sh
Reviewed-by: aoqi

     1 /*
     2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2015, 2019, Loongson Technology. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/macroAssembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "code/debugInfoRec.hpp"
    30 #include "code/icBuffer.hpp"
    31 #include "code/vtableStubs.hpp"
    32 #include "interpreter/interpreter.hpp"
    33 #include "oops/compiledICHolder.hpp"
    34 #include "prims/jvmtiRedefineClassesTrace.hpp"
    35 #include "runtime/sharedRuntime.hpp"
    36 #include "runtime/vframeArray.hpp"
    37 #include "vmreg_mips.inline.hpp"
    38 #ifdef COMPILER1
    39 #include "c1/c1_Runtime1.hpp"
    40 #endif
    41 #ifdef COMPILER2
    42 #include "opto/runtime.hpp"
    43 #endif
    45 #include <alloca.h>
    47 #define __ masm->
    49 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
    51 class RegisterSaver {
    52   enum { FPU_regs_live = 32 };
    53   // Capture info about frame layout
    54   enum layout {
    55 #define DEF_LAYOUT_OFFS(regname)  regname ## _off,  regname ## H_off,
    56     DEF_LAYOUT_OFFS(for_16_bytes_aligned)
    57     DEF_LAYOUT_OFFS(fpr0)
    58     DEF_LAYOUT_OFFS(fpr1)
    59     DEF_LAYOUT_OFFS(fpr2)
    60     DEF_LAYOUT_OFFS(fpr3)
    61     DEF_LAYOUT_OFFS(fpr4)
    62     DEF_LAYOUT_OFFS(fpr5)
    63     DEF_LAYOUT_OFFS(fpr6)
    64     DEF_LAYOUT_OFFS(fpr7)
    65     DEF_LAYOUT_OFFS(fpr8)
    66     DEF_LAYOUT_OFFS(fpr9)
    67     DEF_LAYOUT_OFFS(fpr10)
    68     DEF_LAYOUT_OFFS(fpr11)
    69     DEF_LAYOUT_OFFS(fpr12)
    70     DEF_LAYOUT_OFFS(fpr13)
    71     DEF_LAYOUT_OFFS(fpr14)
    72     DEF_LAYOUT_OFFS(fpr15)
    73     DEF_LAYOUT_OFFS(fpr16)
    74     DEF_LAYOUT_OFFS(fpr17)
    75     DEF_LAYOUT_OFFS(fpr18)
    76     DEF_LAYOUT_OFFS(fpr19)
    77     DEF_LAYOUT_OFFS(fpr20)
    78     DEF_LAYOUT_OFFS(fpr21)
    79     DEF_LAYOUT_OFFS(fpr22)
    80     DEF_LAYOUT_OFFS(fpr23)
    81     DEF_LAYOUT_OFFS(fpr24)
    82     DEF_LAYOUT_OFFS(fpr25)
    83     DEF_LAYOUT_OFFS(fpr26)
    84     DEF_LAYOUT_OFFS(fpr27)
    85     DEF_LAYOUT_OFFS(fpr28)
    86     DEF_LAYOUT_OFFS(fpr29)
    87     DEF_LAYOUT_OFFS(fpr30)
    88     DEF_LAYOUT_OFFS(fpr31)
    90     DEF_LAYOUT_OFFS(v0)
    91     DEF_LAYOUT_OFFS(v1)
    92     DEF_LAYOUT_OFFS(a0)
    93     DEF_LAYOUT_OFFS(a1)
    94     DEF_LAYOUT_OFFS(a2)
    95     DEF_LAYOUT_OFFS(a3)
    96     DEF_LAYOUT_OFFS(a4)
    97     DEF_LAYOUT_OFFS(a5)
    98     DEF_LAYOUT_OFFS(a6)
    99     DEF_LAYOUT_OFFS(a7)
   100     DEF_LAYOUT_OFFS(t0)
   101     DEF_LAYOUT_OFFS(t1)
   102     DEF_LAYOUT_OFFS(t2)
   103     DEF_LAYOUT_OFFS(t3)
   104     DEF_LAYOUT_OFFS(s0)
   105     DEF_LAYOUT_OFFS(s1)
   106     DEF_LAYOUT_OFFS(s2)
   107     DEF_LAYOUT_OFFS(s3)
   108     DEF_LAYOUT_OFFS(s4)
   109     DEF_LAYOUT_OFFS(s5)
   110     DEF_LAYOUT_OFFS(s6)
   111     DEF_LAYOUT_OFFS(s7)
   112     DEF_LAYOUT_OFFS(t8)
   113     DEF_LAYOUT_OFFS(t9)
   115     DEF_LAYOUT_OFFS(gp)
   116     DEF_LAYOUT_OFFS(fp)
   117     DEF_LAYOUT_OFFS(return)
   118     reg_save_size
   119   };
   121   public:
    123   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
   124   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
   125   static int raOffset(void) { return return_off / 2; }
   126   //Rmethod
   127   static int methodOffset(void) { return s3_off / 2; }
   129   static int v0Offset(void) { return v0_off / 2; }
   130   static int v1Offset(void) { return v1_off / 2; }
   132   static int fpResultOffset(void) { return fpr0_off / 2; }
    134   // During deoptimization only the result registers need to be restored;
    135   // all the other values have already been extracted.
   136   static void restore_result_registers(MacroAssembler* masm);
   137 };
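        // Note on the layout above: each DEF_LAYOUT_OFFS(regname) defines two
        // consecutive 4-byte (jint) slot indices, regname_off and regnameH_off,
        // i.e. one 64-bit save slot per register.  The byte offset of a register
        // within the save area is therefore regname_off * jintSize, and the
        // *Offset() accessors divide a slot index by 2 to yield an offset in
        // 64-bit words.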
    139 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
   141   // Always make the frame size 16-byte aligned
   142   int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
   143                                      reg_save_size*BytesPerInt, 16);
   144   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
   145   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
   146   // The caller will allocate additional_frame_words
   147   int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
   148   // CodeBlob frame size is in words.
   149   int frame_size_in_words = frame_size_in_bytes / wordSize;
   150   *total_frame_words = frame_size_in_words;
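        // Illustrative numbers (a sketch, assuming BytesPerInt == 4 and wordSize == 8
        // on this 64-bit port): the 60 layout entries above make reg_save_size == 120
        // jint slots, so with additional_frame_words == 0 the frame is
        // round_to(120 * 4, 16) == 480 bytes == 120 slots == 60 words.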
   152   // save registers
   154   __ daddiu(SP, SP, - reg_save_size * jintSize);
   156   __ sdc1(F0, SP, fpr0_off * jintSize); __ sdc1(F1, SP, fpr1_off * jintSize);
   157   __ sdc1(F2, SP, fpr2_off * jintSize); __ sdc1(F3, SP, fpr3_off * jintSize);
   158   __ sdc1(F4, SP, fpr4_off * jintSize); __ sdc1(F5, SP, fpr5_off * jintSize);
   159   __ sdc1(F6, SP, fpr6_off * jintSize);  __ sdc1(F7, SP, fpr7_off * jintSize);
   160   __ sdc1(F8, SP, fpr8_off * jintSize);  __ sdc1(F9, SP, fpr9_off * jintSize);
   161   __ sdc1(F10, SP, fpr10_off * jintSize);  __ sdc1(F11, SP, fpr11_off * jintSize);
   162   __ sdc1(F12, SP, fpr12_off * jintSize);  __ sdc1(F13, SP, fpr13_off * jintSize);
   163   __ sdc1(F14, SP, fpr14_off * jintSize);  __ sdc1(F15, SP, fpr15_off * jintSize);
   164   __ sdc1(F16, SP, fpr16_off * jintSize);  __ sdc1(F17, SP, fpr17_off * jintSize);
   165   __ sdc1(F18, SP, fpr18_off * jintSize);  __ sdc1(F19, SP, fpr19_off * jintSize);
   166   __ sdc1(F20, SP, fpr20_off * jintSize);  __ sdc1(F21, SP, fpr21_off * jintSize);
   167   __ sdc1(F22, SP, fpr22_off * jintSize);  __ sdc1(F23, SP, fpr23_off * jintSize);
   168   __ sdc1(F24, SP, fpr24_off * jintSize);  __ sdc1(F25, SP, fpr25_off * jintSize);
   169   __ sdc1(F26, SP, fpr26_off * jintSize);  __ sdc1(F27, SP, fpr27_off * jintSize);
   170   __ sdc1(F28, SP, fpr28_off * jintSize);  __ sdc1(F29, SP, fpr29_off * jintSize);
   171   __ sdc1(F30, SP, fpr30_off * jintSize);  __ sdc1(F31, SP, fpr31_off * jintSize);
   172   __ sd(V0, SP, v0_off * jintSize);  __ sd(V1, SP, v1_off * jintSize);
   173   __ sd(A0, SP, a0_off * jintSize);  __ sd(A1, SP, a1_off * jintSize);
   174   __ sd(A2, SP, a2_off * jintSize);  __ sd(A3, SP, a3_off * jintSize);
   175   __ sd(A4, SP, a4_off * jintSize);  __ sd(A5, SP, a5_off * jintSize);
   176   __ sd(A6, SP, a6_off * jintSize);  __ sd(A7, SP, a7_off * jintSize);
   177   __ sd(T0, SP, t0_off * jintSize);
   178   __ sd(T1, SP, t1_off * jintSize);
   179   __ sd(T2, SP, t2_off * jintSize);
   180   __ sd(T3, SP, t3_off * jintSize);
   181   __ sd(S0, SP, s0_off * jintSize);
   182   __ sd(S1, SP, s1_off * jintSize);
   183   __ sd(S2, SP, s2_off * jintSize);
   184   __ sd(S3, SP, s3_off * jintSize);
   185   __ sd(S4, SP, s4_off * jintSize);
   186   __ sd(S5, SP, s5_off * jintSize);
   187   __ sd(S6, SP, s6_off * jintSize);
   188   __ sd(S7, SP, s7_off * jintSize);
   190   __ sd(T8, SP, t8_off * jintSize);
   191   __ sd(T9, SP, t9_off * jintSize);
   193   __ sd(GP, SP, gp_off * jintSize);
   194   __ sd(FP, SP, fp_off * jintSize);
   195   __ sd(RA, SP, return_off * jintSize);
   196   __ daddi(FP, SP, fp_off * jintSize);
   198   OopMapSet *oop_maps = new OopMapSet();
   199   //OopMap* map =  new OopMap( frame_words, 0 );
   200   OopMap* map =  new OopMap( frame_size_in_slots, 0 );
   203 //#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_words)
   204 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
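        // STACK_OFFSET biases a save-area slot index by additional_frame_slots so that
        // the OopMap entries are expressed relative to the SP of the completed frame,
        // i.e. after the caller has pushed its additional_frame_words.  The
        // commented-out variant above used a word count where a slot count is needed.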
   205   map->set_callee_saved(STACK_OFFSET( v0_off), V0->as_VMReg());
   206   map->set_callee_saved(STACK_OFFSET( v1_off), V1->as_VMReg());
   207   map->set_callee_saved(STACK_OFFSET( a0_off), A0->as_VMReg());
   208   map->set_callee_saved(STACK_OFFSET( a1_off), A1->as_VMReg());
   209   map->set_callee_saved(STACK_OFFSET( a2_off), A2->as_VMReg());
   210   map->set_callee_saved(STACK_OFFSET( a3_off), A3->as_VMReg());
   211   map->set_callee_saved(STACK_OFFSET( a4_off), A4->as_VMReg());
   212   map->set_callee_saved(STACK_OFFSET( a5_off), A5->as_VMReg());
   213   map->set_callee_saved(STACK_OFFSET( a6_off), A6->as_VMReg());
   214   map->set_callee_saved(STACK_OFFSET( a7_off), A7->as_VMReg());
   215   map->set_callee_saved(STACK_OFFSET( t0_off), T0->as_VMReg());
   216   map->set_callee_saved(STACK_OFFSET( t1_off), T1->as_VMReg());
   217   map->set_callee_saved(STACK_OFFSET( t2_off), T2->as_VMReg());
   218   map->set_callee_saved(STACK_OFFSET( t3_off), T3->as_VMReg());
   219   map->set_callee_saved(STACK_OFFSET( s0_off), S0->as_VMReg());
   220   map->set_callee_saved(STACK_OFFSET( s1_off), S1->as_VMReg());
   221   map->set_callee_saved(STACK_OFFSET( s2_off), S2->as_VMReg());
   222   map->set_callee_saved(STACK_OFFSET( s3_off), S3->as_VMReg());
   223   map->set_callee_saved(STACK_OFFSET( s4_off), S4->as_VMReg());
   224   map->set_callee_saved(STACK_OFFSET( s5_off), S5->as_VMReg());
   225   map->set_callee_saved(STACK_OFFSET( s6_off), S6->as_VMReg());
   226   map->set_callee_saved(STACK_OFFSET( s7_off), S7->as_VMReg());
   227   map->set_callee_saved(STACK_OFFSET( t8_off), T8->as_VMReg());
   228   map->set_callee_saved(STACK_OFFSET( t9_off), T9->as_VMReg());
   229   map->set_callee_saved(STACK_OFFSET( gp_off), GP->as_VMReg());
   230   map->set_callee_saved(STACK_OFFSET( fp_off), FP->as_VMReg());
   231   map->set_callee_saved(STACK_OFFSET( return_off), RA->as_VMReg());
   233   map->set_callee_saved(STACK_OFFSET( fpr0_off), F0->as_VMReg());
   234   map->set_callee_saved(STACK_OFFSET( fpr1_off), F1->as_VMReg());
   235   map->set_callee_saved(STACK_OFFSET( fpr2_off), F2->as_VMReg());
   236   map->set_callee_saved(STACK_OFFSET( fpr3_off), F3->as_VMReg());
   237   map->set_callee_saved(STACK_OFFSET( fpr4_off), F4->as_VMReg());
   238   map->set_callee_saved(STACK_OFFSET( fpr5_off), F5->as_VMReg());
   239   map->set_callee_saved(STACK_OFFSET( fpr6_off), F6->as_VMReg());
   240   map->set_callee_saved(STACK_OFFSET( fpr7_off), F7->as_VMReg());
   241   map->set_callee_saved(STACK_OFFSET( fpr8_off), F8->as_VMReg());
   242   map->set_callee_saved(STACK_OFFSET( fpr9_off), F9->as_VMReg());
   243   map->set_callee_saved(STACK_OFFSET( fpr10_off), F10->as_VMReg());
   244   map->set_callee_saved(STACK_OFFSET( fpr11_off), F11->as_VMReg());
   245   map->set_callee_saved(STACK_OFFSET( fpr12_off), F12->as_VMReg());
   246   map->set_callee_saved(STACK_OFFSET( fpr13_off), F13->as_VMReg());
   247   map->set_callee_saved(STACK_OFFSET( fpr14_off), F14->as_VMReg());
   248   map->set_callee_saved(STACK_OFFSET( fpr15_off), F15->as_VMReg());
   249   map->set_callee_saved(STACK_OFFSET( fpr16_off), F16->as_VMReg());
   250   map->set_callee_saved(STACK_OFFSET( fpr17_off), F17->as_VMReg());
   251   map->set_callee_saved(STACK_OFFSET( fpr18_off), F18->as_VMReg());
   252   map->set_callee_saved(STACK_OFFSET( fpr19_off), F19->as_VMReg());
   253   map->set_callee_saved(STACK_OFFSET( fpr20_off), F20->as_VMReg());
   254   map->set_callee_saved(STACK_OFFSET( fpr21_off), F21->as_VMReg());
   255   map->set_callee_saved(STACK_OFFSET( fpr22_off), F22->as_VMReg());
   256   map->set_callee_saved(STACK_OFFSET( fpr23_off), F23->as_VMReg());
   257   map->set_callee_saved(STACK_OFFSET( fpr24_off), F24->as_VMReg());
   258   map->set_callee_saved(STACK_OFFSET( fpr25_off), F25->as_VMReg());
   259   map->set_callee_saved(STACK_OFFSET( fpr26_off), F26->as_VMReg());
   260   map->set_callee_saved(STACK_OFFSET( fpr27_off), F27->as_VMReg());
   261   map->set_callee_saved(STACK_OFFSET( fpr28_off), F28->as_VMReg());
   262   map->set_callee_saved(STACK_OFFSET( fpr29_off), F29->as_VMReg());
   263   map->set_callee_saved(STACK_OFFSET( fpr30_off), F30->as_VMReg());
   264   map->set_callee_saved(STACK_OFFSET( fpr31_off), F31->as_VMReg());
   266 #undef STACK_OFFSET
   267   return map;
   268 }
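        // Typical pairing (a sketch, not taken from a particular caller): a runtime
        // stub brackets its call into the VM with this pair, e.g.
        //   int frame_size_in_words;
        //   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
        //   ... set up arguments, call into the VM, record 'map' at the call pc ...
        //   RegisterSaver::restore_live_registers(masm);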
   271 // Pop the current frame and restore all the registers that we
   272 // saved.
   273 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   274   __ ldc1(F0, SP, fpr0_off * jintSize); __ ldc1(F1, SP, fpr1_off * jintSize);
   275   __ ldc1(F2, SP, fpr2_off * jintSize); __ ldc1(F3, SP, fpr3_off * jintSize);
   276   __ ldc1(F4, SP, fpr4_off * jintSize); __ ldc1(F5, SP, fpr5_off * jintSize);
   277   __ ldc1(F6, SP, fpr6_off * jintSize);  __ ldc1(F7, SP, fpr7_off * jintSize);
   278   __ ldc1(F8, SP, fpr8_off * jintSize);  __ ldc1(F9, SP, fpr9_off * jintSize);
   279   __ ldc1(F10, SP, fpr10_off * jintSize);  __ ldc1(F11, SP, fpr11_off * jintSize);
   280   __ ldc1(F12, SP, fpr12_off * jintSize);  __ ldc1(F13, SP, fpr13_off * jintSize);
   281   __ ldc1(F14, SP, fpr14_off * jintSize);  __ ldc1(F15, SP, fpr15_off * jintSize);
   282   __ ldc1(F16, SP, fpr16_off * jintSize);  __ ldc1(F17, SP, fpr17_off * jintSize);
   283   __ ldc1(F18, SP, fpr18_off * jintSize);  __ ldc1(F19, SP, fpr19_off * jintSize);
   284   __ ldc1(F20, SP, fpr20_off * jintSize);  __ ldc1(F21, SP, fpr21_off * jintSize);
   285   __ ldc1(F22, SP, fpr22_off * jintSize);  __ ldc1(F23, SP, fpr23_off * jintSize);
   286   __ ldc1(F24, SP, fpr24_off * jintSize);  __ ldc1(F25, SP, fpr25_off * jintSize);
   287   __ ldc1(F26, SP, fpr26_off * jintSize);  __ ldc1(F27, SP, fpr27_off * jintSize);
   288   __ ldc1(F28, SP, fpr28_off * jintSize);  __ ldc1(F29, SP, fpr29_off * jintSize);
   289   __ ldc1(F30, SP, fpr30_off * jintSize);  __ ldc1(F31, SP, fpr31_off * jintSize);
   291   __ ld(V0, SP, v0_off * jintSize);  __ ld(V1, SP, v1_off * jintSize);
   292   __ ld(A0, SP, a0_off * jintSize);  __ ld(A1, SP, a1_off * jintSize);
   293   __ ld(A2, SP, a2_off * jintSize);  __ ld(A3, SP, a3_off * jintSize);
   294   __ ld(A4, SP, a4_off * jintSize);  __ ld(A5, SP, a5_off * jintSize);
   295   __ ld(A6, SP, a6_off * jintSize);  __ ld(A7, SP, a7_off * jintSize);
   296   __ ld(T0, SP, t0_off * jintSize);
   297   __ ld(T1, SP, t1_off * jintSize);
   298   __ ld(T2, SP, t2_off * jintSize);
   299   __ ld(T3, SP, t3_off * jintSize);
   300   __ ld(S0, SP, s0_off * jintSize);
   301   __ ld(S1, SP, s1_off * jintSize);
   302   __ ld(S2, SP, s2_off * jintSize);
   303   __ ld(S3, SP, s3_off * jintSize);
   304   __ ld(S4, SP, s4_off * jintSize);
   305   __ ld(S5, SP, s5_off * jintSize);
   306   __ ld(S6, SP, s6_off * jintSize);
   307   __ ld(S7, SP, s7_off * jintSize);
   309   __ ld(T8, SP, t8_off * jintSize);
   310   __ ld(T9, SP, t9_off * jintSize);
   312   __ ld(GP, SP, gp_off * jintSize);
   313   __ ld(FP, SP, fp_off * jintSize);
   314   __ ld(RA, SP, return_off * jintSize);
   316   __ addiu(SP, SP, reg_save_size * jintSize);
   317 }
   319 // Pop the current frame and restore the registers that might be holding
   320 // a result.
   321 // FIXME, if the result is float?
   322 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
   324   // Just restore result register. Only used by deoptimization. By
    325   // now any callee save register that needs to be restored to a c2
   326   // caller of the deoptee has been extracted into the vframeArray
   327   // and will be stuffed into the c2i adapter we create for later
   328   // restoration so only result registers need to be restored here.
   330   __ ld(V0, SP, v0_off * jintSize);
   331   __ ld(V1, SP, v1_off * jintSize);
   332   __ addiu(SP, SP, return_off * jintSize);
   333 }
    335 // Is the vector's size (in bytes) bigger than the size saved by default?
    336 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
   337 bool SharedRuntime::is_wide_vector(int size) {
   338   return size > 16;
   339 }
   341 // The java_calling_convention describes stack locations as ideal slots on
   342 // a frame with no abi restrictions. Since we must observe abi restrictions
   343 // (like the placement of the register window) the slots must be biased by
   344 // the following value.
   346 static int reg2offset_in(VMReg r) {
   347   // Account for saved fp and return address
   348   // This should really be in_preserve_stack_slots
   349   return (r->reg2stack() + 2 * VMRegImpl::slots_per_word) * VMRegImpl::stack_slot_size;  // + 2 * VMRegImpl::stack_slot_size);
   350 }
   352 static int reg2offset_out(VMReg r) {
   353   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
   354 }
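        // Worked example (a sketch, assuming VMRegImpl::slots_per_word == 2 and
        // VMRegImpl::stack_slot_size == 4 on this 64-bit port): an incoming argument
        // in stack slot 0 maps to byte offset (0 + 2*2) * 4 == 16 off FP, just above
        // the saved fp and return address; slot 2 maps to offset 24, and so on.
        // reg2offset_out() only adds out_preserve_stack_slots() because outgoing
        // arguments are addressed off SP.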
   356 // ---------------------------------------------------------------------------
   357 // Read the array of BasicTypes from a signature, and compute where the
   358 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
   359 // quantities.  Values less than SharedInfo::stack0 are registers, those above
   360 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
   361 // as framesizes are fixed.
   362 // VMRegImpl::stack0 refers to the first slot 0(sp).
    363 // and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher.  Register
    364 // values up to RegisterImpl::number_of_registers are the 32-bit
    365 // integer registers.
    367 // Pass the first nine oop/int/long args in registers T0, A0 - A7.
    368 // Pass the first eight float/double args in registers F12 - F19.
    369 // A single positional counter is shared between the integer and the
    370 // floating-point argument registers; any remaining args go on the stack.
   372 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
   373 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
   374 // units regardless of build.
   377 // ---------------------------------------------------------------------------
   378 // The compiled Java calling convention.
    379 // Pass the first nine oop/int/long args in registers T0, A0 - A7.
    380 // Pass the first eight float/double args in registers F12 - F19.
    381 // A single positional counter is shared between the integer and the
    382 // floating-point argument registers; any remaining args go on the stack.
   384 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
   385                                            VMRegPair *regs,
   386                                            int total_args_passed,
   387                                            int is_outgoing) {
   389   // Create the mapping between argument positions and
   390   // registers.
   391   //static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
   392   static const Register INT_ArgReg[Argument::n_register_parameters + 1] = {
   393     T0, A0, A1, A2, A3, A4, A5, A6, A7
   394   };
   395   //static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
   396   static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters] = {
   397     F12, F13, F14, F15, F16, F17, F18, F19
   398   };
   401   uint args = 0;
   402   uint stk_args = 0; // inc by 2 each time
   404   for (int i = 0; i < total_args_passed; i++) {
   405     switch (sig_bt[i]) {
   406     case T_VOID:
   407       // halves of T_LONG or T_DOUBLE
   408       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
   409       regs[i].set_bad();
   410       break;
   411     case T_BOOLEAN:
   412     case T_CHAR:
   413     case T_BYTE:
   414     case T_SHORT:
   415     case T_INT:
   416       if (args < Argument::n_register_parameters) {
   417         regs[i].set1(INT_ArgReg[args++]->as_VMReg());
   418       } else {
   419         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   420         stk_args += 2;
   421       }
   422       break;
   423     case T_LONG:
   424       assert(sig_bt[i + 1] == T_VOID, "expecting half");
   425       // fall through
   426     case T_OBJECT:
   427     case T_ARRAY:
   428     case T_ADDRESS:
   429       if (args < Argument::n_register_parameters) {
   430         regs[i].set2(INT_ArgReg[args++]->as_VMReg());
   431       } else {
   432         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   433         stk_args += 2;
   434       }
   435       break;
   436     case T_FLOAT:
   437       if (args < Argument::n_float_register_parameters) {
   438         regs[i].set1(FP_ArgReg[args++]->as_VMReg());
   439       } else {
   440         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   441         stk_args += 2;
   442       }
   443       break;
   444     case T_DOUBLE:
   445       assert(sig_bt[i + 1] == T_VOID, "expecting half");
   446       if (args < Argument::n_float_register_parameters) {
   447         regs[i].set2(FP_ArgReg[args++]->as_VMReg());
   448       } else {
   449         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   450         stk_args += 2;
   451       }
   452       break;
   453     default:
   454       ShouldNotReachHere();
   455       break;
   456     }
   457   }
   459   return round_to(stk_args, 2);
   460 }
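        // Illustrative walk-through (a sketch, assuming the register arrays above,
        // i.e. nine int and eight float argument registers): for a signature
        // (int, long, float) the loop assigns
        //   int   -> T0   (args: 0 -> 1)
        //   long  -> A0   (args: 1 -> 2; the trailing T_VOID half is set_bad)
        //   float -> F14  (args: 2 -> 3)
        // Because one positional counter is shared between INT_ArgReg and FP_ArgReg,
        // a float's register index depends on how many arguments precede it overall.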
   462 // Helper class mostly to avoid passing masm everywhere, and handle store
   463 // displacement overflow logic for LP64
   464 class AdapterGenerator {
   465   MacroAssembler *masm;
   466 #ifdef _LP64
   467   Register Rdisp;
   468   void set_Rdisp(Register r)  { Rdisp = r; }
   469 #endif // _LP64
   471   void patch_callers_callsite();
   473   // base+st_off points to top of argument
   474   int arg_offset(const int st_off) { return st_off; }
   475   int next_arg_offset(const int st_off) {
   476     return st_off - Interpreter::stackElementSize;
   477   }
   479 #ifdef _LP64
   480   // On _LP64 argument slot values are loaded first into a register
   481   // because they might not fit into displacement.
   482   Register arg_slot(const int st_off);
   483   Register next_arg_slot(const int st_off);
   484 #else
   485   int arg_slot(const int st_off)      { return arg_offset(st_off); }
   486   int next_arg_slot(const int st_off) { return next_arg_offset(st_off); }
   487 #endif // _LP64
   489   // Stores long into offset pointed to by base
   490   void store_c2i_long(Register r, Register base,
   491                       const int st_off, bool is_stack);
   492   void store_c2i_object(Register r, Register base,
   493                         const int st_off);
   494   void store_c2i_int(Register r, Register base,
   495                      const int st_off);
   496   void store_c2i_double(VMReg r_2,
   497                         VMReg r_1, Register base, const int st_off);
   498   void store_c2i_float(FloatRegister f, Register base,
   499                        const int st_off);
   501  public:
   502   //void tag_stack(const BasicType sig, int st_off);
   503   void gen_c2i_adapter(int total_args_passed,
   504                               // VMReg max_arg,
   505                               int comp_args_on_stack, // VMRegStackSlots
   506                               const BasicType *sig_bt,
   507                               const VMRegPair *regs,
   508                               Label& skip_fixup);
   509   void gen_i2c_adapter(int total_args_passed,
   510                               // VMReg max_arg,
   511                               int comp_args_on_stack, // VMRegStackSlots
   512                               const BasicType *sig_bt,
   513                               const VMRegPair *regs);
   515   AdapterGenerator(MacroAssembler *_masm) : masm(_masm) {}
   516 };
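        // In short: gen_c2i_adapter() takes arguments laid out in the compiled
        // (register-based) convention and spills them into the interpreter's
        // stack-element layout before jumping to the method's interpreter entry;
        // gen_i2c_adapter() does the reverse, loading interpreter stack elements
        // into the compiled convention and jumping to the entry loaded from
        // Method::from_compiled_offset().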
   519 // Patch the callers callsite with entry to compiled code if it exists.
   520 void AdapterGenerator::patch_callers_callsite() {
   521   Label L;
   522   __ verify_oop(Rmethod);
   523   __ ld_ptr(AT, Rmethod, in_bytes(Method::code_offset()));
   524   __ beq(AT, R0, L);
   525   __ delayed()->nop();
   526   // Schedule the branch target address early.
   527   // Call into the VM to patch the caller, then jump to compiled callee
   528   // V0 isn't live so capture return address while we easily can
   529   __ move(V0, RA);
   531   __ pushad();
   532 #ifdef COMPILER2
   533   // C2 may leave the stack dirty if not in SSE2+ mode
   534   __ empty_FPU_stack();
   535 #endif
   537   // VM needs caller's callsite
   538   // VM needs target method
   540   __ move(A0, Rmethod);
   541   __ move(A1, V0);
   542   // we should preserve the return address
   543   __ verify_oop(Rmethod);
   544   __ move(S0, SP);
   545   __ move(AT, -(StackAlignmentInBytes));   // align the stack
   546   __ andr(SP, SP, AT);
   547   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite),
   548           relocInfo::runtime_call_type);
   550   __ delayed()->nop();
   551   __ move(SP, S0);
   552   __ popad();
   553   __ bind(L);
   554 }
   556 #ifdef _LP64
   557 Register AdapterGenerator::arg_slot(const int st_off) {
   558   Unimplemented();
   559 }
   561 Register AdapterGenerator::next_arg_slot(const int st_off){
   562   Unimplemented();
   563 }
   564 #endif // _LP64
   566 // Stores long into offset pointed to by base
   567 void AdapterGenerator::store_c2i_long(Register r, Register base,
   568                                       const int st_off, bool is_stack) {
   569   Unimplemented();
   570 }
   572 void AdapterGenerator::store_c2i_object(Register r, Register base,
   573                                         const int st_off) {
   574   Unimplemented();
   575 }
   577 void AdapterGenerator::store_c2i_int(Register r, Register base,
   578                                      const int st_off) {
   579   Unimplemented();
   580 }
   582 // Stores into offset pointed to by base
   583 void AdapterGenerator::store_c2i_double(VMReg r_2,
   584                       VMReg r_1, Register base, const int st_off) {
   585   Unimplemented();
   586 }
   588 void AdapterGenerator::store_c2i_float(FloatRegister f, Register base,
   589                                        const int st_off) {
   590   Unimplemented();
   591 }
   593 void AdapterGenerator::gen_c2i_adapter(
   594                             int total_args_passed,
   595                             // VMReg max_arg,
   596                             int comp_args_on_stack, // VMRegStackSlots
   597                             const BasicType *sig_bt,
   598                             const VMRegPair *regs,
   599                             Label& skip_fixup) {
   601   // Before we get into the guts of the C2I adapter, see if we should be here
   602   // at all.  We've come from compiled code and are attempting to jump to the
   603   // interpreter, which means the caller made a static call to get here
   604   // (vcalls always get a compiled target if there is one).  Check for a
   605   // compiled target.  If there is one, we need to patch the caller's call.
   606   // However we will run interpreted if we come thru here. The next pass
   607   // thru the call site will run compiled. If we ran compiled here then
    608   // we can (theoretically) do endless i2c->c2i->i2c transitions during
   609   // deopt/uncommon trap cycles. If we always go interpreted here then
   610   // we can have at most one and don't need to play any tricks to keep
   611   // from endlessly growing the stack.
   612   //
   613   // Actually if we detected that we had an i2c->c2i transition here we
   614   // ought to be able to reset the world back to the state of the interpreted
   615   // call and not bother building another interpreter arg area. We don't
   616   // do that at this point.
   618   patch_callers_callsite();
   620   __ bind(skip_fixup);
   622 #ifdef COMPILER2
   623   __ empty_FPU_stack();
   624 #endif
    625   // This is for native?
    626   // Since all args are passed on the stack,
    627   // total_args_passed * Interpreter::stackElementSize is the
    628   // space we need.
   629   int extraspace = total_args_passed * Interpreter::stackElementSize;
   631   // stack is aligned, keep it that way
   632   extraspace = round_to(extraspace, 2*wordSize);
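        // Example (a sketch, assuming Interpreter::stackElementSize == 8 on this
        // 64-bit port): 4 arguments reserve 4 * 8 == 32 bytes, already a multiple of
        // 2*wordSize == 16, so the round_to above leaves it unchanged.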
   634   // Get return address
   635   __ move(V0, RA);
   636   // set senderSP value
   637   //refer to interpreter_mips.cpp:generate_asm_entry
   638   __ move(Rsender, SP);
   639   __ addi(SP, SP, -extraspace);
   641   // Now write the args into the outgoing interpreter space
   642   for (int i = 0; i < total_args_passed; i++) {
   643     if (sig_bt[i] == T_VOID) {
   644       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   645       continue;
   646     }
   648     // st_off points to lowest address on stack.
   649     int st_off = ((total_args_passed - 1) - i) * Interpreter::stackElementSize;
   650     // Say 4 args:
   651     // i   st_off
   652     // 0   12 T_LONG
   653     // 1    8 T_VOID
   654     // 2    4 T_OBJECT
   655     // 3    0 T_BOOL
   656     VMReg r_1 = regs[i].first();
   657     VMReg r_2 = regs[i].second();
   658     if (!r_1->is_valid()) {
   659       assert(!r_2->is_valid(), "");
   660       continue;
   661     }
   662     if (r_1->is_stack()) {
    663       // memory to memory: copy via the AT scratch register
   664       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
   665       if (!r_2->is_valid()) {
   666         __ ld_ptr(AT, SP, ld_off);
   667         __ st_ptr(AT, SP, st_off);
   669       } else {
   672         int next_off = st_off - Interpreter::stackElementSize;
   673         __ ld_ptr(AT, SP, ld_off);
   674         __ st_ptr(AT, SP, st_off);
   676         // Ref to is_Register condition
   677         if(sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE)
   678           __ st_ptr(AT, SP, st_off - 8);
   679       }
   680     } else if (r_1->is_Register()) {
   681       Register r = r_1->as_Register();
   682       if (!r_2->is_valid()) {
   683           __ sd(r, SP, st_off);
   684       } else {
   685         //FIXME, mips will not enter here
   686         // long/double in gpr
   687         __ sd(r, SP, st_off);
   688         // In [java/util/zip/ZipFile.java]
   689         //
   690         //    private static native long open(String name, int mode, long lastModified);
   691         //    private static native int getTotal(long jzfile);
   692         //
    693         // We need to transfer T_LONG parameters from a compiled method to a native method.
   694         // It's a complex process:
   695         //
   696         // Caller -> lir_static_call -> gen_resolve_stub
   697         //      -> -- resolve_static_call_C
   698         //         `- gen_c2i_adapter()  [*]
   699         //             |
    700         //       `- AdapterHandlerLibrary::get_create_adapter_index
   701         //      -> generate_native_entry
   702         //      -> InterpreterRuntime::SignatureHandlerGenerator::pass_long [**]
   703         //
   704         // In [**], T_Long parameter is stored in stack as:
   705         //
   706         //   (high)
   707         //    |         |
   708         //    -----------
   709         //    | 8 bytes |
   710         //    | (void)  |
   711         //    -----------
   712         //    | 8 bytes |
   713         //    | (long)  |
   714         //    -----------
   715         //    |         |
   716         //   (low)
   717         //
   718         // However, the sequence is reversed here:
   719         //
   720         //   (high)
   721         //    |         |
   722         //    -----------
   723         //    | 8 bytes |
   724         //    | (long)  |
   725         //    -----------
   726         //    | 8 bytes |
   727         //    | (void)  |
   728         //    -----------
   729         //    |         |
   730         //   (low)
   731         //
   732         // So I stored another 8 bytes in the T_VOID slot. It then can be accessed from generate_native_entry().
   733         //
   734         if (sig_bt[i] == T_LONG)
   735           __ sd(r, SP, st_off - 8);
   736       }
   737     } else if (r_1->is_FloatRegister()) {
   738       assert(sig_bt[i] == T_FLOAT || sig_bt[i] == T_DOUBLE, "Must be a float register");
   740       FloatRegister fr = r_1->as_FloatRegister();
   741       if (sig_bt[i] == T_FLOAT)
   742         __ swc1(fr, SP, st_off);
   743       else {
   744         __ sdc1(fr, SP, st_off);
   745         __ sdc1(fr, SP, st_off - 8);  // T_DOUBLE needs two slots
   746       }
   747     }
   748   }
   750   // Schedule the branch target address early.
   751   __ ld_ptr(AT, Rmethod, in_bytes(Method::interpreter_entry_offset()) );
   752   // And repush original return address
   753   __ move(RA, V0);
   754   __ jr (AT);
   755   __ delayed()->nop();
   756 }
   758 void AdapterGenerator::gen_i2c_adapter(
   759                                        int total_args_passed,
   760                                        // VMReg max_arg,
   761                                        int comp_args_on_stack, // VMRegStackSlots
   762                                        const BasicType *sig_bt,
   763                                        const VMRegPair *regs) {
   765   // Generate an I2C adapter: adjust the I-frame to make space for the C-frame
   766   // layout.  Lesp was saved by the calling I-frame and will be restored on
   767   // return.  Meanwhile, outgoing arg space is all owned by the callee
   768   // C-frame, so we can mangle it at will.  After adjusting the frame size,
   769   // hoist register arguments and repack other args according to the compiled
   770   // code convention.  Finally, end in a jump to the compiled code.  The entry
   771   // point address is the start of the buffer.
   773   // We will only enter here from an interpreted frame and never from after
   774   // passing thru a c2i. Azul allowed this but we do not. If we lose the
   775   // race and use a c2i we will remain interpreted for the race loser(s).
   776   // This removes all sorts of headaches on the mips side and also eliminates
   777   // the possibility of having c2i -> i2c -> c2i -> ... endless transitions.
   780   __ move(T9, SP);
    782   // Cut-out for having no stack args.  Since up to nine int/oop args are passed
    783   // in registers, we will occasionally have no stack args.
   784   int comp_words_on_stack = 0;
   785   if (comp_args_on_stack) {
   786     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
   787     // registers are below.  By subtracting stack0, we either get a negative
   788     // number (all values in registers) or the maximum stack slot accessed.
   789     // int comp_args_on_stack = VMRegImpl::reg2stack(max_arg);
   790     // Convert 4-byte stack slots to words.
   791     // did mips need round? FIXME  aoqi
   792     comp_words_on_stack = round_to(comp_args_on_stack*4, wordSize)>>LogBytesPerWord;
    793     // Round up to minimum stack alignment, in wordSize
   794     comp_words_on_stack = round_to(comp_words_on_stack, 2);
   795     __ daddi(SP, SP, -comp_words_on_stack * wordSize);
   796   }
   798   // Align the outgoing SP
   799   __ move(AT, -(StackAlignmentInBytes));
   800   __ andr(SP, SP, AT);
   801   // push the return address on the stack (note that pushing, rather
   802   // than storing it, yields the correct frame alignment for the callee)
   803   // Put saved SP in another register
   804   const Register saved_sp = V0;
   805   __ move(saved_sp, T9);
   808   // Will jump to the compiled code just as if compiled code was doing it.
   809   // Pre-load the register-jump target early, to schedule it better.
   810   __ ld(T9, Rmethod, in_bytes(Method::from_compiled_offset()));
    812   // Now generate the shuffle code.  Pick up all register args and move the
    813   // rest through the AT scratch register.
   814   for (int i = 0; i < total_args_passed; i++) {
   815     if (sig_bt[i] == T_VOID) {
   816       // Longs and doubles are passed in native word order, but misaligned
   817       // in the 32-bit build.
   818       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   819       continue;
   820     }
   822     // Pick up 0, 1 or 2 words from SP+offset.
   824     //FIXME. aoqi. just delete the assert
   825     //assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), "scrambled load targets?");
   826     // Load in argument order going down.
   827     int ld_off = (total_args_passed -1 - i)*Interpreter::stackElementSize;
   828     // Point to interpreter value (vs. tag)
   829     int next_off = ld_off - Interpreter::stackElementSize;
   830     VMReg r_1 = regs[i].first();
   831     VMReg r_2 = regs[i].second();
   832     if (!r_1->is_valid()) {
   833       assert(!r_2->is_valid(), "");
   834       continue;
   835     }
   836     if (r_1->is_stack()) {
   837       // Convert stack slot to an SP offset (+ wordSize to
   838       // account for return address )
   839       // NOTICE HERE!!!! I sub a wordSize here
   840       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size;
   841       //+ wordSize;
   843       if (!r_2->is_valid()) {
   844         __ ld(AT, saved_sp, ld_off);
   845         __ sd(AT, SP, st_off);
   846       } else {
   847         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
   848         // are accessed as negative so LSW is at LOW address
   850         // ld_off is MSW so get LSW
   851         // st_off is LSW (i.e. reg.first())
   853         // [./org/eclipse/swt/graphics/GC.java]
   854         // void drawImageXRender(Image srcImage, int srcX, int srcY, int srcWidth, int srcHeight,
   855         //  int destX, int destY, int destWidth, int destHeight,
   856         //  boolean simple,
   857         //  int imgWidth, int imgHeight,
   858         //  long maskPixmap,  <-- Pass T_LONG in stack
   859         //  int maskType);
   860         // Before this modification, Eclipse displays icons with solid black background.
   861         //
   862         __ ld(AT, saved_sp, ld_off);
   863         if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE)
   864           __ ld(AT, saved_sp, ld_off - 8);
   865         __ sd(AT, SP, st_off);
   866       }
   867     } else if (r_1->is_Register()) {  // Register argument
   868       Register r = r_1->as_Register();
   869       if (r_2->is_valid()) {
   870         // Remember r_1 is low address (and LSB on mips)
   871         // So r_2 gets loaded from high address regardless of the platform
   872         assert(r_2->as_Register() == r_1->as_Register(), "");
   873         __ ld(r, saved_sp, ld_off);
   875         //
   876         // For T_LONG type, the real layout is as below:
   877         //
   878         //   (high)
   879         //    |         |
   880         //    -----------
   881         //    | 8 bytes |
   882         //    | (void)  |
   883         //    -----------
   884         //    | 8 bytes |
   885         //    | (long)  |
   886         //    -----------
   887         //    |         |
   888         //   (low)
   889         //
   890         // We should load the low-8 bytes.
   891         //
   892         if (sig_bt[i] == T_LONG)
   893           __ ld(r, saved_sp, ld_off - 8);
   894       } else {
   895         __ lw(r, saved_sp, ld_off);
   896       }
   897     } else if (r_1->is_FloatRegister()) { // Float Register
   898       assert(sig_bt[i] == T_FLOAT || sig_bt[i] == T_DOUBLE, "Must be a float register");
   900       FloatRegister fr = r_1->as_FloatRegister();
   901       if (sig_bt[i] == T_FLOAT)
   902           __ lwc1(fr, saved_sp, ld_off);
   903       else {
   904           __ ldc1(fr, saved_sp, ld_off);
   905           __ ldc1(fr, saved_sp, ld_off - 8);
   906       }
   907     }
   908   }
   910   // 6243940 We might end up in handle_wrong_method if
   911   // the callee is deoptimized as we race thru here. If that
   912   // happens we don't want to take a safepoint because the
   913   // caller frame will look interpreted and arguments are now
   914   // "compiled" so it is much better to make this transition
   915   // invisible to the stack walking code. Unfortunately if
   916   // we try and find the callee by normal means a safepoint
   917   // is possible. So we stash the desired callee in the thread
   918   // and the vm will find there should this case occur.
   919   __ get_thread(T8);
   920   __ sd(Rmethod, T8, in_bytes(JavaThread::callee_target_offset()));
    922   // move methodOop to V0 in case we end up in a c2i adapter.
   923   // the c2i adapters expect methodOop in V0 (c2) because c2's
   924   // resolve stubs return the result (the method) in V0.
   925   // I'd love to fix this.
   926   __ move(V0, Rmethod);
   927   __ jr(T9);
   928   __ delayed()->nop();
   929 }
   931 // ---------------------------------------------------------------
   932 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
   933                                                             int total_args_passed,
   934                                                             // VMReg max_arg,
   935                                                             int comp_args_on_stack, // VMRegStackSlots
   936                                                             const BasicType *sig_bt,
   937                                                             const VMRegPair *regs,
   938                                                             AdapterFingerPrint* fingerprint) {
   939   address i2c_entry = __ pc();
   941   AdapterGenerator agen(masm);
   943   agen.gen_i2c_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs);
   946   // -------------------------------------------------------------------------
    947   // Generate a C2I adapter.  On entry we know Rmethod holds the methodOop.  The
   948   // args start out packed in the compiled layout.  They need to be unpacked
   949   // into the interpreter layout.  This will almost always require some stack
   950   // space.  We grow the current (compiled) stack, then repack the args.  We
   951   // finally end in a jump to the generic interpreter entry point.  On exit
   952   // from the interpreter, the interpreter will restore our SP (lest the
   953   // compiled code, which relys solely on SP and not FP, get sick).
   955   address c2i_unverified_entry = __ pc();
   956   Label skip_fixup;
   957   {
   958     Register holder = T1;
   959     Register receiver = T0;
   960     Register temp = T8;
   961     address ic_miss = SharedRuntime::get_ic_miss_stub();
   963     Label missed;
   965     __ verify_oop(holder);
   966     //add for compressedoops
   967     __ load_klass(temp, receiver);
   968     __ verify_oop(temp);
   970     __ ld_ptr(AT, holder, CompiledICHolder::holder_klass_offset());
   971     __ ld_ptr(Rmethod, holder, CompiledICHolder::holder_metadata_offset());
   972     __ bne(AT, temp, missed);
   973     __ delayed()->nop();
    974     // Method might have been compiled since the call site was patched to
    975     // interpreted; if that is the case, treat it as a miss so we can get
    976     // the call site corrected.
   977     __ ld_ptr(AT, Rmethod, in_bytes(Method::code_offset()));
   978     __ beq(AT, R0, skip_fixup);
   979     __ delayed()->nop();
   980     __ bind(missed);
   982     __ jmp(ic_miss, relocInfo::runtime_call_type);
   983     __ delayed()->nop();
   984   }
   986   address c2i_entry = __ pc();
   988   agen.gen_c2i_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
   990   __ flush();
   991   return  AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
   992 }
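        // The three entry points bundled into the returned AdapterHandlerEntry are
        // used as follows: i2c_entry when interpreted code calls a now-compiled
        // method, c2i_unverified_entry from compiled inline-cache call sites (it
        // performs the receiver klass check above before falling into the C2I code),
        // and c2i_entry once the call site has been verified or is a static call.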
   994 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
   995                                          VMRegPair *regs,
   996                                          VMRegPair *regs2,
   997                                          int total_args_passed) {
   998   assert(regs2 == NULL, "not needed on MIPS");
   999   // Return the number of VMReg stack_slots needed for the args.
  1000   // This value does not include an abi space (like register window
  1001   // save area).
   1003   // This describes the LP64 native calling convention used by this 64-bit port.
   1006   // We return the amount of VMRegImpl stack slots we need to reserve for all
   1007   // the arguments NOT counting out_preserve_stack_slots.
  1012   static const Register INT_ArgReg[Argument::n_register_parameters] = {
  1013     A0, A1, A2, A3, A4, A5, A6, A7
  1014   };
  1015   static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters] = {
  1016     F12, F13, F14, F15, F16, F17, F18, F19
  1017   };
  1018   uint args = 0;
  1019   uint stk_args = 0; // inc by 2 each time
  1021 // Example:
  1022 //    n   java.lang.UNIXProcess::forkAndExec
  1023 //     private native int forkAndExec(byte[] prog,
  1024 //                                    byte[] argBlock, int argc,
  1025 //                                    byte[] envBlock, int envc,
  1026 //                                    byte[] dir,
  1027 //                                    boolean redirectErrorStream,
  1028 //                                    FileDescriptor stdin_fd,
  1029 //                                    FileDescriptor stdout_fd,
  1030 //                                    FileDescriptor stderr_fd)
  1031 // JNIEXPORT jint JNICALL
  1032 // Java_java_lang_UNIXProcess_forkAndExec(JNIEnv *env,
  1033 //                                        jobject process,
  1034 //                                        jbyteArray prog,
  1035 //                                        jbyteArray argBlock, jint argc,
  1036 //                                        jbyteArray envBlock, jint envc,
  1037 //                                        jbyteArray dir,
  1038 //                                        jboolean redirectErrorStream,
  1039 //                                        jobject stdin_fd,
  1040 //                                        jobject stdout_fd,
  1041 //                                        jobject stderr_fd)
  1042 //
  1043 // ::c_calling_convention
  1044 // 0:     // env    <-- a0
  1045 // 1: L    // klass/obj  <-- t0 => a1
  1046 // 2: [    // prog[]  <-- a0 => a2
  1047 // 3: [    // argBlock[]  <-- a1 => a3
  1048 // 4: I    // argc
  1049 // 5: [    // envBlock[]  <-- a3 => a5
  1050 // 6: I    // envc
  1051 // 7: [    // dir[]  <-- a5 => a7
  1052 // 8: Z    // redirectErrorStream  a6 => sp[0]
  1053 // 9: L    // stdin    a7 => sp[8]
  1054 // 10: L    // stdout    fp[16] => sp[16]
  1055 // 11: L    // stderr    fp[24] => sp[24]
  1056 //
  1057   for (int i = 0; i < total_args_passed; i++) {
  1058     switch (sig_bt[i]) {
  1059     case T_VOID: // Halves of longs and doubles
  1060       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
  1061       regs[i].set_bad();
  1062       break;
  1063     case T_BOOLEAN:
  1064     case T_CHAR:
  1065     case T_BYTE:
  1066     case T_SHORT:
  1067     case T_INT:
  1068       if (args < Argument::n_register_parameters) {
  1069         regs[i].set1(INT_ArgReg[args++]->as_VMReg());
  1070       } else {
  1071         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   1072         stk_args += 2;
   1073       }
   1074       break;
  1075     case T_LONG:
  1076       assert(sig_bt[i + 1] == T_VOID, "expecting half");
  1077       // fall through
  1078     case T_OBJECT:
  1079     case T_ARRAY:
  1080     case T_ADDRESS:
  1081     case T_METADATA:
  1082       if (args < Argument::n_register_parameters) {
  1083         regs[i].set2(INT_ArgReg[args++]->as_VMReg());
  1084       } else {
  1085         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   1086         stk_args += 2;
   1087       }
   1088       break;
  1089     case T_FLOAT:
  1090       if (args < Argument::n_float_register_parameters) {
  1091         regs[i].set1(FP_ArgReg[args++]->as_VMReg());
  1092       } else {
  1093         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   1094         stk_args += 2;
   1095       }
   1096       break;
  1097     case T_DOUBLE:
  1098       assert(sig_bt[i + 1] == T_VOID, "expecting half");
  1099       if (args < Argument::n_float_register_parameters) {
  1100         regs[i].set2(FP_ArgReg[args++]->as_VMReg());
  1101       } else {
  1102         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   1103         stk_args += 2;
   1104       }
   1105       break;
  1106     default:
  1107       ShouldNotReachHere();
   1108       break;
   1109     }
   1110   }
   1112   return round_to(stk_args, 2);
   1113 }
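        // Compared with java_calling_convention() above: the native convention starts
        // its integer arguments at A0 (there is no T0 slot), additionally maps
        // T_METADATA, and, like the Java convention, shares a single positional
        // counter between the integer and floating-point argument registers.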
  1115 // ---------------------------------------------------------------------------
  1116 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1117   // We always ignore the frame_slots arg and just use the space just below frame pointer
  1118   // which by this time is free to use
  1119   switch (ret_type) {
  1120     case T_FLOAT:
  1121       __ swc1(FSF, FP, -wordSize);
  1122       break;
  1123     case T_DOUBLE:
  1124       __ sdc1(FSF, FP, -wordSize );
  1125       break;
  1126     case T_VOID:  break;
  1127     case T_LONG:
  1128       __ sd(V0, FP, -wordSize);
  1129       break;
  1130     case T_OBJECT:
  1131     case T_ARRAY:
  1132       __ sd(V0, FP, -wordSize);
  1133       break;
   1134     default: {
   1135       __ sw(V0, FP, -wordSize);
   1136     }
   1137   }
   1138 }
  1140 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1141   // We always ignore the frame_slots arg and just use the space just below frame pointer
  1142   // which by this time is free to use
  1143   switch (ret_type) {
  1144     case T_FLOAT:
  1145       __ lwc1(FSF, FP, -wordSize);
  1146       break;
  1147     case T_DOUBLE:
  1148       __ ldc1(FSF, FP, -wordSize );
  1149       break;
  1150     case T_LONG:
  1151       __ ld(V0, FP, -wordSize);
  1152       break;
  1153     case T_VOID:  break;
  1154     case T_OBJECT:
  1155     case T_ARRAY:
  1156       __ ld(V0, FP, -wordSize);
  1157       break;
   1158     default: {
   1159       __ lw(V0, FP, -wordSize);
   1160     }
   1161   }
   1162 }
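        // save_native_result()/restore_native_result() are intended to be used as a
        // pair (typically by the native wrapper): the return value is spilled to the
        // word just below FP around calls back into the VM, which may clobber the
        // result registers, and reloaded afterwards.  FSF is the float/double result
        // register, V0 the integer/oop result register.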
  1164 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1165   for ( int i = first_arg ; i < arg_count ; i++ ) {
  1166     if (args[i].first()->is_Register()) {
  1167       __ push(args[i].first()->as_Register());
  1168     } else if (args[i].first()->is_FloatRegister()) {
   1169       __ push(args[i].first()->as_FloatRegister());
   1170     }
   1171   }
   1172 }
  1174 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1175   for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
  1176     if (args[i].first()->is_Register()) {
  1177       __ pop(args[i].first()->as_Register());
  1178     } else if (args[i].first()->is_FloatRegister()) {
   1179       __ pop(args[i].first()->as_FloatRegister());
   1180     }
   1181   }
   1182 }
  1184 // A simple move of integer like type
  1185 static void simple_move32(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1186   if (src.first()->is_stack()) {
  1187     if (dst.first()->is_stack()) {
  1188       // stack to stack
  1189       __ lw(AT, FP, reg2offset_in(src.first()));
  1190       __ sd(AT, SP, reg2offset_out(dst.first()));
  1191     } else {
  1192       // stack to reg
   1193       __ lw(dst.first()->as_Register(),  FP, reg2offset_in(src.first()));
   1194     }
  1195   } else if (dst.first()->is_stack()) {
  1196     // reg to stack
  1197     __ sd(src.first()->as_Register(), SP, reg2offset_out(dst.first()));
  1198   } else {
  1199     if (dst.first() != src.first()){
   1200       __ move(dst.first()->as_Register(), src.first()->as_Register()); // fujie error:dst.first()
   1201     }
   1202   }
   1203 }
  1205 // An oop arg. Must pass a handle not the oop itself
  1206 static void object_move(MacroAssembler* masm,
  1207                         OopMap* map,
  1208                         int oop_handle_offset,
  1209                         int framesize_in_slots,
  1210                         VMRegPair src,
  1211                         VMRegPair dst,
  1212                         bool is_receiver,
  1213                         int* receiver_offset) {
  1215   // must pass a handle. First figure out the location we use as a handle
  1217   //FIXME, for mips, dst can be register
  1218   if (src.first()->is_stack()) {
  1219     // Oop is already on the stack as an argument
  1220     Register rHandle = V0;
  1221     Label nil;
  1222     __ xorr(rHandle, rHandle, rHandle);
  1223     __ ld(AT, FP, reg2offset_in(src.first()));
  1224     __ beq(AT, R0, nil);
  1225     __ delayed()->nop();
  1226     __ lea(rHandle, Address(FP, reg2offset_in(src.first())));
  1227     __ bind(nil);
  1228     if(dst.first()->is_stack())__ sd( rHandle, SP, reg2offset_out(dst.first()));
  1229     else                       __ move( (dst.first())->as_Register(), rHandle);
  1230     //if dst is register
  1231     //FIXME, do mips need out preserve stack slots?
  1232     int offset_in_older_frame = src.first()->reg2stack()
  1233       + SharedRuntime::out_preserve_stack_slots();
  1234     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
  1235     if (is_receiver) {
  1236       *receiver_offset = (offset_in_older_frame
   1237           + framesize_in_slots) * VMRegImpl::stack_slot_size;
   1238     }
  1239   } else {
   1240     // Oop is in a register; we must store it to the space we reserve
  1241     // on the stack for oop_handles
  1242     const Register rOop = src.first()->as_Register();
  1243     assert( (rOop->encoding() >= A0->encoding()) && (rOop->encoding() <= T0->encoding()),"wrong register");
  1244     const Register rHandle = V0;
   1245     //Important: refer to java_calling_convention
  1246     int oop_slot = (rOop->encoding() - A0->encoding()) * VMRegImpl::slots_per_word + oop_handle_offset;
  1247     int offset = oop_slot*VMRegImpl::stack_slot_size;
  1248     Label skip;
  1249     __ sd( rOop , SP, offset );
  1250     map->set_oop(VMRegImpl::stack2reg(oop_slot));
  1251     __ xorr( rHandle, rHandle, rHandle);
  1252     __ beq(rOop, R0, skip);
  1253     __ delayed()->nop();
  1254     __ lea(rHandle, Address(SP, offset));
  1255     __ bind(skip);
  1256     // Store the handle parameter
  1257     if(dst.first()->is_stack())__ sd( rHandle, SP, reg2offset_out(dst.first()));
  1258     else                       __ move((dst.first())->as_Register(), rHandle);
  1259     //if dst is register
  1261     if (is_receiver) {
   1262       *receiver_offset = offset;
   1263     }
   1264   }
   1265 }
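        // In both paths above the callee receives a JNI-handle-style indirection:
        // rHandle is either NULL (when the oop itself is NULL) or the address of a
        // stack slot holding the oop, and that slot is recorded in the OopMap so a
        // GC at a safepoint can relocate the oop it contains.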
  1267 // A float arg may have to do float reg int reg conversion
  1268 static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1269   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
  1271   if (src.first()->is_stack()) {
  1272     if (dst.first()->is_stack()) {
  1273       __ lwc1(F12, FP, reg2offset_in(src.first()));
  1274       __ swc1(F12, SP, reg2offset_out(dst.first()));
  1276     else
  1277       __ lwc1(dst.first()->as_FloatRegister(), FP, reg2offset_in(src.first()));
  1278   } else {
  1279     // reg to stack
  1280     if(dst.first()->is_stack())
  1281       __ swc1(src.first()->as_FloatRegister(), SP, reg2offset_out(dst.first()));
  1282     else
  1283       __ mov_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
  1287 // A long move
  1288 static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1290   // The only legal possibility for a long_move VMRegPair is:
  1291   // 1: two stack slots (possibly unaligned)
  1292   // as neither the Java nor the C calling convention will use registers
  1293   // for longs.
  1295   if (src.first()->is_stack()) {
  1296     assert(src.second()->is_stack() && dst.second()->is_stack(), "must be all stack");
  1297     if( dst.first()->is_stack()){
  1298       __ ld(AT, FP, reg2offset_in(src.first()));
  1299       __ sd(AT, SP, reg2offset_out(dst.first()));
  1300     } else {
  1301       __ ld( (dst.first())->as_Register() , FP, reg2offset_in(src.first()));
  1303   } else {
  1304     if( dst.first()->is_stack()){
  1305       __ sd( (src.first())->as_Register(), SP, reg2offset_out(dst.first()));
  1306     } else {
  1307       __ move( (dst.first())->as_Register() , (src.first())->as_Register());
  1312 // A double move
  1313 static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1315   // The only legal possibilities for a double_move VMRegPair are the same painful
  1316   // ones as for long_move: a single FPU register or a pair of stack slots.
  1318   // Because of the calling convention we know that src is either
  1319   //   1: a single physical register (an FPU register), or
  1320   //   2: two stack slots (possibly unaligned);
  1321   // dst can be an FPU register or a pair of stack slots.
  1324   if (src.first()->is_stack()) {
  1325     // source is all stack
  1326     if( dst.first()->is_stack()){
  1327       __ ldc1(F12, FP, reg2offset_in(src.first()));
  1329       __ sdc1(F12, SP, reg2offset_out(dst.first()));
  1330     } else {
  1331       __ ldc1( (dst.first())->as_FloatRegister(), FP, reg2offset_in(src.first()));
  1334   } else {
  1335     // reg to stack
  1336     // No worries about stack alignment
  1337     if( dst.first()->is_stack()){
  1338       __ sdc1(src.first()->as_FloatRegister(), SP, reg2offset_out(dst.first()));
  1340     else
  1341       __ mov_d( dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
  1346 static void verify_oop_args(MacroAssembler* masm,
  1347                             methodHandle method,
  1348                             const BasicType* sig_bt,
  1349                             const VMRegPair* regs) {
  1350   Register temp_reg = T9;  // not part of any compiled calling seq
  1351   if (VerifyOops) {
  1352     for (int i = 0; i < method->size_of_parameters(); i++) {
  1353       if (sig_bt[i] == T_OBJECT ||
  1354           sig_bt[i] == T_ARRAY) {
  1355         VMReg r = regs[i].first();
  1356         assert(r->is_valid(), "bad oop arg");
  1357         if (r->is_stack()) {
  1358           __ ld(temp_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
  1359           __ verify_oop(temp_reg);
  1360         } else {
  1361           __ verify_oop(r->as_Register());
  1368 static void gen_special_dispatch(MacroAssembler* masm,
  1369                                  methodHandle method,
  1370                                  const BasicType* sig_bt,
  1371                                  const VMRegPair* regs) {
  1372   verify_oop_args(masm, method, sig_bt, regs);
  1373   vmIntrinsics::ID iid = method->intrinsic_id();
  1375   // Now write the args into the outgoing interpreter space
  1376   bool     has_receiver   = false;
  1377   Register receiver_reg   = noreg;
  1378   int      member_arg_pos = -1;
  1379   Register member_reg     = noreg;
  1380   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  1381   if (ref_kind != 0) {
  1382     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
  1383     member_reg = S3;  // known to be free at this point
  1384     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  1385   } else if (iid == vmIntrinsics::_invokeBasic) {
  1386     has_receiver = true;
  1387   } else {
  1388     fatal(err_msg_res("unexpected intrinsic id %d", iid));
  1391   if (member_reg != noreg) {
  1392     // Load the member_arg into register, if necessary.
  1393     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
  1394     VMReg r = regs[member_arg_pos].first();
  1395     if (r->is_stack()) {
  1396       __ ld(member_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
  1397     } else {
  1398       // no data motion is needed
  1399       member_reg = r->as_Register();
  1403   if (has_receiver) {
  1404     // Make sure the receiver is loaded into a register.
  1405     assert(method->size_of_parameters() > 0, "oob");
  1406     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
  1407     VMReg r = regs[0].first();
  1408     assert(r->is_valid(), "bad receiver arg");
  1409     if (r->is_stack()) {
  1410       // Porting note:  This assumes that compiled calling conventions always
  1411       // pass the receiver oop in a register.  If this is not true on some
  1412       // platform, pick a temp and load the receiver from stack.
  1413       fatal("receiver always in a register");
  1414       receiver_reg = SSR;  // known to be free at this point
  1415       __ ld(receiver_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
  1416     } else {
  1417       // no data motion is needed
  1418       receiver_reg = r->as_Register();
  1422   // Figure out which address we are really jumping to:
  1423   MethodHandles::generate_method_handle_dispatch(masm, iid,
  1424                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
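         // Sketch of what generate_method_handle_dispatch is expected to emit here: for the
         // linkTo* intrinsics the jump target comes from the trailing MemberName in member_reg,
         // while _invokeBasic jumps through the receiver MethodHandle; the exact sequence
         // lives in the platform MethodHandles code.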
  1427 // ---------------------------------------------------------------------------
  1428 // Generate a native wrapper for a given method.  The method takes arguments
  1429 // in the Java compiled code convention, marshals them to the native
  1430 // convention (handlizes oops, etc), transitions to native, makes the call,
  1431 // returns to java state (possibly blocking), unhandlizes any result and
  1432 // returns.
  1433 nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
  1434                                                 methodHandle method,
  1435                                                 int compile_id,
  1436                                                 BasicType* in_sig_bt,
  1437                                                 VMRegPair* in_regs,
  1438                                                 BasicType ret_type) {
  1439   if (method->is_method_handle_intrinsic()) {
  1440     vmIntrinsics::ID iid = method->intrinsic_id();
  1441     intptr_t start = (intptr_t)__ pc();
  1442     int vep_offset = ((intptr_t)__ pc()) - start;
  1443     gen_special_dispatch(masm,
  1444                          method,
  1445                          in_sig_bt,
  1446                          in_regs);
  1447     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
  1448     __ flush();
  1449     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
  1450     return nmethod::new_native_nmethod(method,
  1451                                        compile_id,
  1452                                        masm->code(),
  1453                                        vep_offset,
  1454                                        frame_complete,
  1455                                        stack_slots / VMRegImpl::slots_per_word,
  1456                                        in_ByteSize(-1),
  1457                                        in_ByteSize(-1),
  1458                                        (OopMapSet*)NULL);
  1460   bool is_critical_native = true;
  1461   address native_func = method->critical_native_function();
  1462   if (native_func == NULL) {
  1463     native_func = method->native_function();
  1464     is_critical_native = false;
  1466   assert(native_func != NULL, "must have function");
  1468   // Native nmethod wrappers never take possession of the oop arguments.
  1469   // So the caller will gc the arguments. The only thing we need an
  1470   // oopMap for is if the call is static
  1471   //
  1472   // An OopMap for lock (and class if static), and one for the VM call itself
  1473   OopMapSet *oop_maps = new OopMapSet();
  1475   // We have received a description of where all the java args are located
  1476   // on entry to the wrapper. We need to convert these args to where
  1477   // the jni function will expect them. To figure out where they go
  1478   // we convert the java signature to a C signature by inserting
  1479   // the hidden arguments as arg[0] and possibly arg[1] (static method)
  1481   const int total_in_args = method->size_of_parameters();
  1482   int total_c_args = total_in_args;
  1483   if (!is_critical_native) {
  1484     total_c_args += 1;
  1485     if (method->is_static()) {
  1486       total_c_args++;
  1488   } else {
  1489     for (int i = 0; i < total_in_args; i++) {
  1490       if (in_sig_bt[i] == T_ARRAY) {
  1491         total_c_args++;
  1496   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  1497   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
  1498   BasicType* in_elem_bt = NULL;
  1500   int argc = 0;
  1501   if (!is_critical_native) {
  1502     out_sig_bt[argc++] = T_ADDRESS;
  1503     if (method->is_static()) {
  1504       out_sig_bt[argc++] = T_OBJECT;
  1507     for (int i = 0; i < total_in_args ; i++ ) {
  1508       out_sig_bt[argc++] = in_sig_bt[i];
  1510   } else {
  1511     Thread* THREAD = Thread::current();
  1512     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
  1513     SignatureStream ss(method->signature());
  1514     for (int i = 0; i < total_in_args ; i++ ) {
  1515       if (in_sig_bt[i] == T_ARRAY) {
  1516         // Arrays are passed as int, elem* pair
  1517         out_sig_bt[argc++] = T_INT;
  1518         out_sig_bt[argc++] = T_ADDRESS;
  1519         Symbol* atype = ss.as_symbol(CHECK_NULL);
  1520         const char* at = atype->as_C_string();
  1521         if (strlen(at) == 2) {
  1522           assert(at[0] == '[', "must be");
  1523           switch (at[1]) {
  1524             case 'B': in_elem_bt[i]  = T_BYTE; break;
  1525             case 'C': in_elem_bt[i]  = T_CHAR; break;
  1526             case 'D': in_elem_bt[i]  = T_DOUBLE; break;
  1527             case 'F': in_elem_bt[i]  = T_FLOAT; break;
  1528             case 'I': in_elem_bt[i]  = T_INT; break;
  1529             case 'J': in_elem_bt[i]  = T_LONG; break;
  1530             case 'S': in_elem_bt[i]  = T_SHORT; break;
  1531             case 'Z': in_elem_bt[i]  = T_BOOLEAN; break;
  1532             default: ShouldNotReachHere();
  1535       } else {
  1536         out_sig_bt[argc++] = in_sig_bt[i];
  1537         in_elem_bt[i] = T_VOID;
  1539       if (in_sig_bt[i] != T_VOID) {
  1540         assert(in_sig_bt[i] == ss.type(), "must match");
  1541         ss.next();
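             // Illustrative example only: a critical native declared as
             //   static native int sum(int[] a);
             // is called from here as  jint sum(jint a_length, jint* a_elems),
             // hence the T_INT / T_ADDRESS pair recorded above for every T_ARRAY argument.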
  1546   // Now figure out where the args must be stored and how much stack space
  1547   // they require (neglecting out_preserve_stack_slots but including space for storing
  1548   // the 1st six register arguments). It's weird; see int_stk_helper.
  1549   //
  1550   int out_arg_slots;
  1551   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  1553   // Compute framesize for the wrapper.  We need to handlize all oops in
  1554   // registers. We must create space for them here that is disjoint from
  1555   // the windowed save area because we have no control over when we might
  1556   // flush the window again and overwrite values that gc has since modified.
  1557   // (The live window race)
  1558   //
  1559   // We always just allocate 6 words for storing down these objects. This allows
  1560   // us to simply record the base and use the Ireg number to decide which
  1561   // slot to use. (Note that the reg number is the inbound number not the
  1562   // outbound number).
  1563   // We must shuffle args to match the native convention, and include var-args space.
  1565   // Calculate the total number of stack slots we will need.
  1567   // First count the abi requirement plus all of the outgoing args
  1568   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  1570   // Now the space for the inbound oop handle area
  1571   int total_save_slots = 9 * VMRegImpl::slots_per_word;  // 9 arguments passed in registers
  1572   if (is_critical_native) {
  1573     // Critical natives may have to call out so they need a save area
  1574     // for register arguments.
  1575     int double_slots = 0;
  1576     int single_slots = 0;
  1577     for ( int i = 0; i < total_in_args; i++) {
  1578       if (in_regs[i].first()->is_Register()) {
  1579         const Register reg = in_regs[i].first()->as_Register();
  1580         switch (in_sig_bt[i]) {
  1581           case T_BOOLEAN:
  1582           case T_BYTE:
  1583           case T_SHORT:
  1584           case T_CHAR:
  1585           case T_INT:  single_slots++; break;
  1586           case T_ARRAY:  // specific to LP64 (7145024)
  1587           case T_LONG: double_slots++; break;
  1588           default:  ShouldNotReachHere();
  1590       } else if (in_regs[i].first()->is_FloatRegister()) {
  1591         switch (in_sig_bt[i]) {
  1592           case T_FLOAT:  single_slots++; break;
  1593           case T_DOUBLE: double_slots++; break;
  1594           default:  ShouldNotReachHere();
  1598     total_save_slots = double_slots * 2 + single_slots;
  1599     // align the save area
  1600     if (double_slots != 0) {
  1601       stack_slots = round_to(stack_slots, 2);
  1605   int oop_handle_offset = stack_slots;
  1606   stack_slots += total_save_slots;
  1608   // Now any space we need for handlizing a klass if static method
  1610   int klass_slot_offset = 0;
  1611   int klass_offset = -1;
  1612   int lock_slot_offset = 0;
  1613   bool is_static = false;
  1615   if (method->is_static()) {
  1616     klass_slot_offset = stack_slots;
  1617     stack_slots += VMRegImpl::slots_per_word;
  1618     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
  1619     is_static = true;
  1622   // Plus a lock if needed
  1624   if (method->is_synchronized()) {
  1625     lock_slot_offset = stack_slots;
  1626     stack_slots += VMRegImpl::slots_per_word;
  1629   // Now a place to save return value or as a temporary for any gpr -> fpr moves
  1630   // + 2 for return address (which we own) and saved fp
  1631   stack_slots += 2 + 9 * VMRegImpl::slots_per_word;  // (T0, A0, A1, A2, A3, A4, A5, A6, A7)
  1633   // Ok The space we have allocated will look like:
  1634   //
  1635   //
  1636   // FP-> |                     |
  1637   //      |---------------------|
  1638   //      | 2 slots for moves   |
  1639   //      |---------------------|
  1640   //      | lock box (if sync)  |
  1641   //      |---------------------| <- lock_slot_offset
  1642   //      | klass (if static)   |
  1643   //      |---------------------| <- klass_slot_offset
  1644   //      | oopHandle area      |
  1645   //      |---------------------| <- oop_handle_offset
  1646   //      | outbound memory     |
  1647   //      | based arguments     |
  1648   //      |                     |
  1649   //      |---------------------|
  1650   //      | vararg area         |
  1651   //      |---------------------|
  1652   //      |                     |
  1653   // SP-> | out_preserved_slots |
  1654   //
  1655   //
  1658   // Now compute actual number of stack words we need rounding to make
  1659   // stack properly aligned.
  1660   stack_slots = round_to(stack_slots, StackAlignmentInSlots);
  1662   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
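         // Worked example (assuming StackAlignmentInBytes == 16, i.e. 4 slots of 4 bytes each):
         // 57 raw slots round up to 60, giving stack_size = 60 * 4 = 240 bytes, so SP keeps
         // its 16-byte alignment once the frame is set up.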
  1664   intptr_t start = (intptr_t)__ pc();
  1668   // First thing make an ic check to see if we should even be here
  1669   address ic_miss = SharedRuntime::get_ic_miss_stub();
  1671   // We are free to use all registers as temps without saving them and
  1672   // restoring them except fp. fp is the only callee save register
  1673   // as far as the interpreter and the compiler(s) are concerned.
  1675   //refer to register_mips.hpp:IC_Klass
  1676   const Register ic_reg = T1;
  1677   const Register receiver = T0;
  1679   Label hit;
  1680   Label exception_pending;
  1682   __ verify_oop(receiver);
  1683   //add for compressedoops
  1684   __ load_klass(T9, receiver);
  1685   __ beq(T9, ic_reg, hit);
  1686   __ delayed()->nop();
  1687   __ jmp(ic_miss, relocInfo::runtime_call_type);
  1688   __ delayed()->nop();
  1689   // The verified entry must be aligned for code patching,
  1690   // and the first 5 bytes must be in the same cache line;
  1691   // if we align at 8 we can be sure the 5 bytes are in the same line.
  1692   __ align(8);
  1694   __ bind(hit);
  1697   int vep_offset = ((intptr_t)__ pc()) - start;
  1698 #ifdef COMPILER1
  1699   if (InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) {
  1700     // Object.hashCode can pull the hashCode from the header word
  1701     // instead of doing a full VM transition once it's been computed.
  1702     // Since hashCode is usually polymorphic at call sites we can't do
  1703     // this optimization at the call site without a lot of work.
  1704     Label slowCase;
  1705     Register receiver = T0;
  1706     Register result = V0;
  1707     __ ld ( result, receiver, oopDesc::mark_offset_in_bytes());
  1708     // check if locked
  1709     __ andi(AT, result, markOopDesc::unlocked_value);
  1710     __ beq(AT, R0, slowCase);
  1711     __ delayed()->nop();
  1712     if (UseBiasedLocking) {
  1713       // Check if biased and fall through to runtime if so
  1714       __ andi (AT, result, markOopDesc::biased_lock_bit_in_place);
  1715       __ bne(AT, R0, slowCase);
  1716       __ delayed()->nop();
  1718     // get hash
  1719     __ li(AT, markOopDesc::hash_mask_in_place);
  1720     __ andr (AT, result, AT);
  1721     // test if hashCode exists
  1722     __ beq (AT, R0, slowCase);
  1723     __ delayed()->nop();
  1724     __ shr(result, markOopDesc::hash_shift);
  1725     __ jr(RA);
  1726     __ delayed()->nop();
  1727     __ bind (slowCase);
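           // Pseudo-C sketch of the fast path above (assuming the usual markOop layout):
           //   mark = obj->mark();
           //   if ((mark & unlocked_value) == 0)                 goto slowCase;  // object is locked
           //   if (UseBiasedLocking && (mark & biased_lock_bit)) goto slowCase;  // bias in place
           //   if ((mark & hash_mask_in_place) == 0)             goto slowCase;  // hash not computed yet
           //   return mark >> hash_shift;                        // identity hash in V0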
  1729 #endif // COMPILER1
  1731   // The instruction at the verified entry point must be 5 bytes or longer
  1732   // because it can be patched on the fly by make_non_entrant. The stack bang
  1733   // instruction fits that requirement.
  1735   // Generate stack overflow check
  1737   if (UseStackBanging) {
  1738     __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
  1739   } else {
  1740     // need a 5 byte instruction to allow MT safe patching to non-entrant
  1741     __ nop();
  1742     __ nop();
  1743     __ nop();
  1744     __ nop();
  1745     __ nop();
  1747   // Generate a new frame for the wrapper.
  1748   // does MIPS need this?
  1749 #ifndef OPT_THREAD
  1750   __ get_thread(TREG);
  1751 #endif
  1752   __ st_ptr(SP, TREG, in_bytes(JavaThread::last_Java_sp_offset()));
  1753   __ move(AT, -(StackAlignmentInBytes));
  1754   __ andr(SP, SP, AT);
  1756   __ enter();
  1757   // -2 because return address is already present and so is saved fp
  1758   __ addiu(SP, SP, -1 * (stack_size - 2*wordSize));
  1760   // Frame is now completed as far a size and linkage.
  1762   int frame_complete = ((intptr_t)__ pc()) - start;
  1764   // Calculate the difference between sp and fp. We need to know it
  1765   // after the native call because on windows Java Natives will pop
  1766   // the arguments and it is painful to do sp relative addressing
  1767   // in a platform independent way. So after the call we switch to
  1768   // fp relative addressing.
  1769   // FIXME: actually, the fp_adjustment may not be right, because andr(SP, SP, AT)
  1770   // may change the SP
  1771   int fp_adjustment = stack_size - 2*wordSize;
  1773 #ifdef COMPILER2
  1774   // C2 may leave the stack dirty if not in SSE2+ mode
  1775   __ empty_FPU_stack();
  1776 #endif
  1778   // Compute the fp offset for any slots used after the jni call
  1780   int lock_slot_fp_offset = (lock_slot_offset*VMRegImpl::stack_slot_size) - fp_adjustment;
  1781   // We use TREG as a thread pointer because it is callee save and
  1782   // if we load it once it is usable through the entire wrapper
  1783   const Register thread = TREG;
  1785   // We use S4 as the oop handle for the receiver/klass
  1786   // It is callee save so it survives the call to native
  1788   const Register oop_handle_reg = S4;
  1789   if (is_critical_native) {
  1790      __ stop("generate_native_wrapper in sharedRuntime <2>");
  1791     //TODO:Fu
  1792     // check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
  1793     //                                   oop_handle_offset, oop_maps, in_regs, in_sig_bt);
  1796 #ifndef OPT_THREAD
  1797   __ get_thread(thread);
  1798 #endif
  1800   //
  1801   // We immediately shuffle the arguments so that any vm call we have to
  1802   // make from here on out (sync slow path, jvmpi, etc.) we will have
  1803   // captured the oops from our caller and have a valid oopMap for
  1804   // them.
  1806   // -----------------
  1807   // The Grand Shuffle
  1808   //
  1809   // Natives require 1 or 2 extra arguments over the normal ones: the JNIEnv*
  1810   // and, if static, the class mirror instead of a receiver.  This pretty much
  1811   // guarantees that register layout will not match (and mips doesn't use reg
  1812   // parms though amd does).  Since the native abi doesn't use register args
  1813   // and the java convention does, we don't have to worry about collisions.
  1814   // All of our moves are reg->stack or stack->stack.
  1815   // We ignore the extra arguments during the shuffle and handle them at the
  1816   // last moment. The shuffle is described by the two calling convention
  1817   // vectors we have in our possession. We simply walk the java vector to
  1818   // get the source locations and the c vector to get the destinations.
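         // Illustrative example (not generated code): for an instance  native int m(Object o, int i)
         // the Java vector describes {this, o, i} while the C vector describes
         // {JNIEnv*, jobject(this), jobject(o), jint(i)}; the loop below only shuffles the Java
         // args into their C slots, and the JNIEnv* slot is filled in just before the call.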
  1820   int c_arg = method->is_static() ? 2 : 1 ;
  1822   // Record sp-based slot for receiver on stack for non-static methods
  1823   int receiver_offset = -1;
  1825   // This is a trick. We double the stack slots so we can claim
  1826   // the oops in the caller's frame. Since we are sure to have
  1827   // more args than the caller doubling is enough to make
  1828   // sure we can capture all the incoming oop args from the
  1829   // caller.
  1830   //
  1831   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
  1833   // Mark location of fp (someday)
  1834   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(fp));
  1836 #ifdef ASSERT
  1837   bool reg_destroyed[RegisterImpl::number_of_registers];
  1838   bool freg_destroyed[FloatRegisterImpl::number_of_registers];
  1839   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
  1840     reg_destroyed[r] = false;
  1842   for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) {
  1843     freg_destroyed[f] = false;
  1846 #endif /* ASSERT */
  1848   // This may iterate in two different directions depending on the
  1849   // kind of native it is.  The reason is that for regular JNI natives
  1850   // the incoming and outgoing registers are offset upwards and for
  1851   // critical natives they are offset down.
  1852   GrowableArray<int> arg_order(2 * total_in_args);
  1853   VMRegPair tmp_vmreg;
  1854   tmp_vmreg.set1(T8->as_VMReg());
  1856   if (!is_critical_native) {
  1857     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
  1858       arg_order.push(i);
  1859       arg_order.push(c_arg);
  1861   } else {
  1862     // Compute a valid move order, using tmp_vmreg to break any cycles
  1863      __ stop("generate_native_wrapper in sharedRuntime <2>");
  1864     //TODO:Fu
  1865     // ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
  1868   int temploc = -1;
  1869   for (int ai = 0; ai < arg_order.length(); ai += 2) {
  1870     int i = arg_order.at(ai);
  1871     int c_arg = arg_order.at(ai + 1);
  1872     __ block_comment(err_msg("move %d -> %d", i, c_arg));
  1873     if (c_arg == -1) {
  1874       assert(is_critical_native, "should only be required for critical natives");
  1875       // This arg needs to be moved to a temporary
  1876       __ move(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
  1877       in_regs[i] = tmp_vmreg;
  1878       temploc = i;
  1879       continue;
  1880     } else if (i == -1) {
  1881       assert(is_critical_native, "should only be required for critical natives");
  1882       // Read from the temporary location
  1883       assert(temploc != -1, "must be valid");
  1884       i = temploc;
  1885       temploc = -1;
  1887 #ifdef ASSERT
  1888     if (in_regs[i].first()->is_Register()) {
  1889       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
  1890     } else if (in_regs[i].first()->is_FloatRegister()) {
  1891       assert(!freg_destroyed[in_regs[i].first()->as_FloatRegister()->encoding()], "destroyed reg!");
  1893     if (out_regs[c_arg].first()->is_Register()) {
  1894       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1895     } else if (out_regs[c_arg].first()->is_FloatRegister()) {
  1896       freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
  1898 #endif /* ASSERT */
  1899     switch (in_sig_bt[i]) {
  1900       case T_ARRAY:
  1901         if (is_critical_native) {
  1902           __ stop("generate_native_wrapper in sharedRuntime <2>");
  1903           //TODO:Fu
  1904           // unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
  1905           c_arg++;
  1906 #ifdef ASSERT
  1907           if (out_regs[c_arg].first()->is_Register()) {
  1908             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1909           } else if (out_regs[c_arg].first()->is_FloatRegister()) {
  1910             freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
  1912 #endif
  1913           break;
  1915       case T_OBJECT:
  1916         assert(!is_critical_native, "no oop arguments");
  1917         object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
  1918                     ((i == 0) && (!is_static)),
  1919                     &receiver_offset);
  1920         break;
  1921       case T_VOID:
  1922         break;
  1924       case T_FLOAT:
  1925         float_move(masm, in_regs[i], out_regs[c_arg]);
  1926           break;
  1928       case T_DOUBLE:
  1929         assert( i + 1 < total_in_args &&
  1930                 in_sig_bt[i + 1] == T_VOID &&
  1931                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
  1932         double_move(masm, in_regs[i], out_regs[c_arg]);
  1933         break;
  1935       case T_LONG :
  1936         long_move(masm, in_regs[i], out_regs[c_arg]);
  1937         break;
  1939       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  1941       default:
  1942         simple_move32(masm, in_regs[i], out_regs[c_arg]);
  1946   // point c_arg at the first arg that is already loaded in case we
  1947   // need to spill before we call out
  1948   c_arg = total_c_args - total_in_args;
  1949   // Pre-load a static method's oop.  Used both by locking code and
  1950   // the normal JNI call code.
  1952   __ move(oop_handle_reg, A1);
  1954   if (method->is_static() && !is_critical_native) {
  1956     // load oop into a register
  1957     int oop_index = __ oop_recorder()->find_index(JNIHandles::make_local(
  1958           (method->method_holder())->java_mirror()));
  1961     RelocationHolder rspec = oop_Relocation::spec(oop_index);
  1962     __ relocate(rspec);
  1963     __ patchable_set48(oop_handle_reg, (long)JNIHandles::make_local((method->method_holder())->java_mirror()));
  1964     // Now handlize the static class mirror it's known not-null.
  1965     __ sd( oop_handle_reg, SP, klass_offset);
  1966     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
  1968     // Now get the handle
  1969     __ lea(oop_handle_reg, Address(SP, klass_offset));
  1970     // store the klass handle as second argument
  1971     __ move(A1, oop_handle_reg);
  1972     // and protect the arg if we must spill
  1973     c_arg--;
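           // In other words (a sketch of the convention assumed here): a static native receives
           // (JNIEnv*, jclass, ...) and the jclass passed in A1 is a handle to the mirror spilled
           // at SP + klass_offset, which the oop map above exposes to the GC.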
  1976   // Change state to native (we save the return address in the thread, since it might not
  1977   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  1978   // points into the right code segment. It does not have to be the correct return pc.
  1979   // We use the same pc/oopMap repeatedly when we call out
  1981   intptr_t the_pc = (intptr_t) __ pc();
  1982   oop_maps->add_gc_map(the_pc - start, map);
  1984   __ set_last_Java_frame(SP, noreg, NULL);
  1985   __ relocate(relocInfo::internal_pc_type);
  1987     intptr_t save_pc = (intptr_t)the_pc ;
  1988     __ patchable_set48(AT, save_pc);
  1990   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  1993   // We have all of the arguments set up at this point. We must not touch any register
  1994   // argument registers from here on (if we save/restore them there is no oop map covering them).
  1996     SkipIfEqual skip_if(masm, &DTraceMethodProbes, 0);
  1997     int metadata_index = __ oop_recorder()->find_index(method());
  1998     RelocationHolder rspec = metadata_Relocation::spec(metadata_index);
  1999     __ relocate(rspec);
  2000     __ patchable_set48(AT, (long)(method()));
  2002     __ call_VM_leaf(
  2003       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
  2004       thread, AT);
  2008   // These are register definitions we need for locking/unlocking
  2009   const Register swap_reg = T8;  // Must use T8 for cmpxchg instruction
  2010   const Register obj_reg  = T9;  // Will contain the oop
  2011   //const Register lock_reg = T6;  // Address of compiler lock object (BasicLock)
  2012   const Register lock_reg = c_rarg0;  // Address of compiler lock object (BasicLock)
  2016   Label slow_path_lock;
  2017   Label lock_done;
  2019   // Lock a synchronized method
  2020   if (method->is_synchronized()) {
  2021     assert(!is_critical_native, "unhandled");
  2023     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
  2025     // Get the handle (the 2nd argument)
  2026     __ move(oop_handle_reg, A1);
  2028     // Get address of the box
  2029     __ lea(lock_reg, Address(FP, lock_slot_fp_offset));
  2031     // Load the oop from the handle
  2032     __ ld(obj_reg, oop_handle_reg, 0);
  2034     if (UseBiasedLocking) {
  2035       // Note that oop_handle_reg is trashed during this call
  2036       __ biased_locking_enter(lock_reg, obj_reg, swap_reg, A1, false, lock_done, &slow_path_lock);
  2039     // Load immediate 1 into swap_reg %T8
  2040     __ move(swap_reg, 1);
  2042     __ ld(AT, obj_reg, 0);
  2043     __ orr(swap_reg, swap_reg, AT);
  2045     __ sd( swap_reg, lock_reg, mark_word_offset);
  2046     __ cmpxchg(lock_reg, Address(obj_reg, 0), swap_reg);
  2047     __ bne(AT, R0, lock_done);
  2048     __ delayed()->nop();
  2049     // Test if the oopMark is an obvious stack pointer, i.e.,
  2050     //  1) (mark & 3) == 0, and
  2051     //  2) sp <= mark < mark + os::pagesize()
  2052     // These 3 tests can be done by evaluating the following
  2053     // expression: ((mark - sp) & (3 - os::vm_page_size())),
  2054     // assuming both stack pointer and pagesize have their
  2055     // least significant 2 bits clear.
  2056     // NOTE: the oopMark is in swap_reg %T8 as the result of cmpxchg
  2058     __ dsub(swap_reg, swap_reg, SP);
  2059     __ move(AT, 3 - os::vm_page_size());
  2060     __ andr(swap_reg , swap_reg, AT);
  2061     // Save the test result, for recursive case, the result is zero
  2062     __ sd(swap_reg, lock_reg, mark_word_offset);
  2063     __ bne(swap_reg, R0, slow_path_lock);
  2064     __ delayed()->nop();
  2065     // Slow path will re-enter here
  2066     __ bind(lock_done);
  2068     if (UseBiasedLocking) {
  2069       // Re-fetch oop_handle_reg as we trashed it above
  2070       __ move(A1, oop_handle_reg);
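           // Pseudo-C sketch of the fast-path locking above (stack-lock / displaced-header scheme):
           //   box->displaced_header = obj->mark() | unlocked_value;              // speculate unlocked
           //   if (CAS(&obj->mark(), box->displaced_header, box)) goto lock_done; // we own the lock
           //   tmp = (old_mark - SP) & (3 - page_size);  // zero iff the mark points into our own stack
           //   box->displaced_header = tmp;              // zero marks a recursive (re-entrant) lock
           //   if (tmp != 0) goto slow_path_lock;        // otherwise fall through to lock_done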
  2075   // Finally just about ready to make the JNI call
  2078   // get JNIEnv* which is first argument to native
  2079   if (!is_critical_native) {
  2080     __ addi(A0, thread, in_bytes(JavaThread::jni_environment_offset()));
  2083   // Example: Java_java_lang_ref_Finalizer_invokeFinalizeMethod(JNIEnv *env, jclass clazz, jobject ob)
  2084   // Load the second arguments into A1
  2085   //__ ld(A1, SP , wordSize );   // klass
  2087   // Now set thread in native
  2088   __ addi(AT, R0, _thread_in_native);
  2089   __ sw(AT, thread, in_bytes(JavaThread::thread_state_offset()));
  2090   // do the call
  2091   __ call(method->native_function(), relocInfo::runtime_call_type);
  2092   __ delayed()->nop();
  2093   // WARNING - on Windows Java Natives use pascal calling convention and pop the
  2094   // arguments off of the stack. We could just re-adjust the stack pointer here
  2095   // and continue to do SP relative addressing but we instead switch to FP
  2096   // relative addressing.
  2098   // Unpack native results.
  2099   switch (ret_type) {
  2100   case T_BOOLEAN: __ c2bool(V0);            break;
  2101   case T_CHAR   : __ andi(V0, V0, 0xFFFF);      break;
  2102   case T_BYTE   : __ sign_extend_byte (V0); break;
  2103   case T_SHORT  : __ sign_extend_short(V0); break;
  2104   case T_INT    : // nothing to do         break;
  2105   case T_DOUBLE :
  2106   case T_FLOAT  :
  2107   // Result is in the FP return register (F0); we'll save it as needed
  2108   break;
  2109   case T_ARRAY:                 // Really a handle
  2110   case T_OBJECT:                // Really a handle
  2111   break; // can't de-handlize until after safepoint check
  2112   case T_VOID: break;
  2113   case T_LONG: break;
  2114   default       : ShouldNotReachHere();
  2116   // Switch thread to "native transition" state before reading the synchronization state.
  2117   // This additional state is necessary because reading and testing the synchronization
  2118   // state is not atomic w.r.t. GC, as this scenario demonstrates:
  2119   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  2120   //     VM thread changes sync state to synchronizing and suspends threads for GC.
  2121   //     Thread A is resumed to finish this native method, but doesn't block here since it
  2122   //     didn't see any synchronization in progress, and escapes.
  2123   __ addi(AT, R0, _thread_in_native_trans);
  2124   __ sw(AT, thread, in_bytes(JavaThread::thread_state_offset()));
  2126   //if(os::is_MP()) {}
  2128   Label after_transition;
  2130   // check for safepoint operation in progress and/or pending suspend requests
  2132     Label Continue;
  2133     __ li(AT, SafepointSynchronize::address_of_state());
  2134     __ lw(A0, AT, 0);
  2135     __ addi(AT, A0, -SafepointSynchronize::_not_synchronized);
  2136     Label L;
  2137     __ bne(AT, R0, L);
  2138     __ delayed()->nop();
  2139     __ lw(AT, thread, in_bytes(JavaThread::suspend_flags_offset()));
  2140     __ beq(AT, R0, Continue);
  2141     __ delayed()->nop();
  2142     __ bind(L);
  2144     // Don't use call_VM as it will see a possible pending exception and forward it
  2145     // and never return here preventing us from clearing _last_native_pc down below.
  2146     //
  2147     save_native_result(masm, ret_type, stack_slots);
  2148     __ move(A0, thread);
  2149     __ addi(SP, SP, -wordSize);
  2150     __ push(S2);
  2151     __ move(AT, -(StackAlignmentInBytes));
  2152     __ move(S2, SP);     // use S2 as a sender SP holder
  2153     __ andr(SP, SP, AT); // align stack as required by ABI
  2154     if (!is_critical_native) {
  2155       __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans), relocInfo::runtime_call_type);
  2156       __ delayed()->nop();
  2157     } else {
  2158       __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition), relocInfo::runtime_call_type);
  2159       __ delayed()->nop();
  2161     __ move(SP, S2);     // use S2 as a sender SP holder
  2162     __ pop(S2);
  2163     __ addi(SP, SP, wordSize);
  2164     //add for compressedoops
  2165     __ reinit_heapbase();
  2166     // Restore any method result value
  2167     restore_native_result(masm, ret_type, stack_slots);
  2169     if (is_critical_native) {
  2170       // The call above performed the transition to thread_in_Java so
  2171       // skip the transition logic below.
  2172       __ beq(R0, R0, after_transition);
  2173       __ delayed()->nop();
  2176     __ bind(Continue);
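           // Equivalent pseudo-C of the transition check above:
           //   if (SafepointSynchronize::state != _not_synchronized || thread->suspend_flags != 0)
           //     check_special_condition_for_native_trans(thread);  // may block for GC or suspension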
  2179   // change thread state
  2180   __ addi(AT, R0, _thread_in_Java);
  2181   __ sw(AT,  thread, in_bytes(JavaThread::thread_state_offset()));
  2182   __ bind(after_transition);
  2183   Label reguard;
  2184   Label reguard_done;
  2185   __ lw(AT, thread, in_bytes(JavaThread::stack_guard_state_offset()));
  2186   __ addi(AT, AT, -JavaThread::stack_guard_yellow_disabled);
  2187   __ beq(AT, R0, reguard);
  2188   __ delayed()->nop();
  2189   // slow path reguard  re-enters here
  2190   __ bind(reguard_done);
  2192   // Handle possible exception (will unlock if necessary)
  2194   // native result if any is live
  2196   // Unlock
  2197   Label slow_path_unlock;
  2198   Label unlock_done;
  2199   if (method->is_synchronized()) {
  2201     Label done;
  2203     // Get locked oop from the handle we passed to jni
  2204     __ ld( obj_reg, oop_handle_reg, 0);
  2205     if (UseBiasedLocking) {
  2206       __ biased_locking_exit(obj_reg, T8, done);
  2210     // Simple recursive lock?
  2212     __ ld(AT, FP, lock_slot_fp_offset);
  2213     __ beq(AT, R0, done);
  2214     __ delayed()->nop();
  2215     // Must save FSF if it is live now because cmpxchg must use it
  2216     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  2217       save_native_result(masm, ret_type, stack_slots);
  2220     //  get old displaced header
  2221     __ ld (T8, FP, lock_slot_fp_offset);
  2222     // get address of the stack lock
  2223     __ addi (c_rarg0, FP, lock_slot_fp_offset);
  2224     // Atomic swap old header if oop still contains the stack lock
  2225     __ cmpxchg(T8, Address(obj_reg, 0), c_rarg0);
  2227     __ beq(AT, R0, slow_path_unlock);
  2228     __ delayed()->nop();
  2229     // slow path re-enters here
  2230     __ bind(unlock_done);
  2231     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  2232       restore_native_result(masm, ret_type, stack_slots);
  2235     __ bind(done);
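           // Pseudo-C sketch of the fast-path unlocking above:
           //   if (box->displaced_header == 0) goto done;           // recursive lock, nothing to undo
           //   if (!CAS(&obj->mark(), box, box->displaced_header))  // restore the saved header
           //     goto slow_path_unlock;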
  2239     SkipIfEqual skip_if(masm, &DTraceMethodProbes, 0);
  2240     // Tell dtrace about this method exit
  2241     save_native_result(masm, ret_type, stack_slots);
  2242     int metadata_index = __ oop_recorder()->find_index( (method()));
  2243     RelocationHolder rspec = metadata_Relocation::spec(metadata_index);
  2244     __ relocate(rspec);
  2245     __ patchable_set48(AT, (long)(method()));
  2247     __ call_VM_leaf(
  2248          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
  2249          thread, AT);
  2250     restore_native_result(masm, ret_type, stack_slots);
  2253   // We can finally stop using that last_Java_frame we setup ages ago
  2255   __ reset_last_Java_frame(false);
  2257   // Unpack oop result, e.g. JNIHandles::resolve value.
  2258   if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
  2259     __ resolve_jobject(V0, thread, T9);
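           // resolve_jobject is expected to unpack the returned jobject here: NULL stays NULL,
           // weak handles are recognized by their tag and go through the GC pre-barrier before
           // the oop is used, and ordinary handles are simply dereferenced (T9 is a temp).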
  2262   if (!is_critical_native) {
  2263     // reset handle block
  2264     __ ld(AT, thread, in_bytes(JavaThread::active_handles_offset()));
  2265     __ sw(R0, AT, JNIHandleBlock::top_offset_in_bytes());
  2268   if (!is_critical_native) {
  2269     // Any exception pending?
  2270     __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2271     __ bne(AT, R0, exception_pending);
  2272     __ delayed()->nop();
  2274   // no exception, we're almost done
  2276   // check that only result value is on FPU stack
  2277   __ verify_FPU(ret_type == T_FLOAT || ret_type == T_DOUBLE ? 1 : 0, "native_wrapper normal exit");
  2279   // Return
  2280 #ifndef OPT_THREAD
  2281   __ get_thread(TREG);
  2282 #endif
  2283   //__ ld_ptr(SP, TREG, in_bytes(JavaThread::last_Java_sp_offset()));
  2284   __ leave();
  2286   __ jr(RA);
  2287   __ delayed()->nop();
  2288   // Unexpected paths are out of line and go here
  2289   // Slow path locking & unlocking
  2290   if (method->is_synchronized()) {
  2292     // BEGIN Slow path lock
  2293     __ bind(slow_path_lock);
  2295     // protect the args we've loaded
  2296     save_args(masm, total_c_args, c_arg, out_regs);
  2298     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
  2299     // args are (oop obj, BasicLock* lock, JavaThread* thread)
  2301     __ move(A0, obj_reg);
  2302     __ move(A1, lock_reg);
  2303     __ move(A2, thread);
  2304     __ addi(SP, SP, - 3*wordSize);
  2306     __ move(AT, -(StackAlignmentInBytes));
  2307     __ move(S2, SP);     // use S2 as a sender SP holder
  2308     __ andr(SP, SP, AT); // align stack as required by ABI
  2310     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), relocInfo::runtime_call_type);
  2311     __ delayed()->nop();
  2312     __ move(SP, S2);
  2313     __ addi(SP, SP, 3*wordSize);
  2315     restore_args(masm, total_c_args, c_arg, out_regs);
  2317 #ifdef ASSERT
  2318     { Label L;
  2319       __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2320       __ beq(AT, R0, L);
  2321       __ delayed()->nop();
  2322       __ stop("no pending exception allowed on exit from monitorenter");
  2323       __ bind(L);
  2325 #endif
  2326     __ b(lock_done);
  2327     __ delayed()->nop();
  2328     // END Slow path lock
  2330     // BEGIN Slow path unlock
  2331     __ bind(slow_path_unlock);
  2333     // Slow path unlock
  2335     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  2336       save_native_result(masm, ret_type, stack_slots);
  2338     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
  2340     __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2341     __ push(AT);
  2342     __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));
  2344     __ move(AT, -(StackAlignmentInBytes));
  2345     __ move(S2, SP);     // use S2 as a sender SP holder
  2346     __ andr(SP, SP, AT); // align stack as required by ABI
  2348     // should be a peal
  2349     // +wordSize because of the push above
  2350     __ addi(A1, FP, lock_slot_fp_offset);
  2352     __ move(A0, obj_reg);
  2353     __ addi(SP,SP, -2*wordSize);
  2354     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C),
  2355         relocInfo::runtime_call_type);
  2356     __ delayed()->nop();
  2357     __ addi(SP, SP, 2*wordSize);
  2358     __ move(SP, S2);
  2359     //add for compressedoops
  2360     __ reinit_heapbase();
  2361 #ifdef ASSERT
  2363       Label L;
  2364       __ lw( AT, thread, in_bytes(Thread::pending_exception_offset()));
  2365       __ beq(AT, R0, L);
  2366       __ delayed()->nop();
  2367       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
  2368       __ bind(L);
  2370 #endif /* ASSERT */
  2372     __ pop(AT);
  2373     __ sd(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2374     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  2375       restore_native_result(masm, ret_type, stack_slots);
  2377     __ b(unlock_done);
  2378     __ delayed()->nop();
  2379     // END Slow path unlock
  2383   // SLOW PATH Reguard the stack if needed
  2385   __ bind(reguard);
  2386   save_native_result(masm, ret_type, stack_slots);
  2387   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages),
  2388       relocInfo::runtime_call_type);
  2389   __ delayed()->nop();
  2390   //add for compressedoops
  2391   __ reinit_heapbase();
  2392   restore_native_result(masm, ret_type, stack_slots);
  2393   __ b(reguard_done);
  2394   __ delayed()->nop();
  2396   // BEGIN EXCEPTION PROCESSING
  2397   if (!is_critical_native) {
  2398     // Forward  the exception
  2399     __ bind(exception_pending);
  2401     // remove possible return value from FPU register stack
  2402     __ empty_FPU_stack();
  2404     // pop our frame
  2405     // forward_exception_entry needs the return address on the stack
  2406     __ addiu(SP, FP, wordSize);
  2407     __ ld(FP, SP, (-1) * wordSize);
  2409     // and forward the exception
  2410     __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2411     __ delayed()->nop();
  2413   __ flush();
  2415   nmethod *nm = nmethod::new_native_nmethod(method,
  2416                                             compile_id,
  2417                                             masm->code(),
  2418                                             vep_offset,
  2419                                             frame_complete,
  2420                                             stack_slots / VMRegImpl::slots_per_word,
  2421                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
  2422                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
  2423                                             oop_maps);
  2425   if (is_critical_native) {
  2426     nm->set_lazy_critical_native(true);
  2429   return nm;
  2433 #ifdef HAVE_DTRACE_H
  2434 // ---------------------------------------------------------------------------
  2435 // Generate a dtrace nmethod for a given signature.  The method takes arguments
  2436 // in the Java compiled code convention, marshals them to the native
  2437 // abi and then leaves nops at the position you would expect to call a native
  2438 // function. When the probe is enabled the nops are replaced with a trap
  2439 // instruction that dtrace inserts and the trace will cause a notification
  2440 // to dtrace.
  2441 //
  2442 // The probes are only able to take primitive types and java/lang/String as
  2443 // arguments.  No other java types are allowed. Strings are converted to utf8
  2444 // strings so that from dtrace point of view java strings are converted to C
  2445 // strings. There is an arbitrary fixed limit on the total space that a method
  2446 // can use for converting the strings. (256 chars per string in the signature).
  2447   // So any java string larger than this is truncated.
  2449 static int  fp_offset[ConcreteRegisterImpl::number_of_registers] = { 0 };
  2450 static bool offsets_initialized = false;
  2452 static VMRegPair reg64_to_VMRegPair(Register r) {
  2453   VMRegPair ret;
  2454   if (wordSize == 8) {
  2455     ret.set2(r->as_VMReg());
  2456   } else {
  2457     ret.set_pair(r->successor()->as_VMReg(), r->as_VMReg());
  2459   return ret;
  2463 nmethod *SharedRuntime::generate_dtrace_nmethod(MacroAssembler *masm,
  2464                                                 methodHandle method) {
  2467   // generate_dtrace_nmethod is guarded by a mutex so we are sure to
  2468   // be single threaded in this method.
  2469   assert(AdapterHandlerLibrary_lock->owned_by_self(), "must be");
  2471   // Fill in the signature array, for the calling-convention call.
  2472   int total_args_passed = method->size_of_parameters();
  2474   BasicType* in_sig_bt  = NEW_RESOURCE_ARRAY(BasicType, total_args_passed);
  2475   VMRegPair  *in_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed);
  2477   // The signature we are going to use for the trap that dtrace will see
  2478   // java/lang/String is converted. We drop "this" and any other object
  2479   // is converted to NULL.  (A one-slot java/lang/Long object reference
  2480   // is converted to a two-slot long, which is why we double the allocation).
  2481   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_args_passed * 2);
  2482   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed * 2);
  2484   int i=0;
  2485   int total_strings = 0;
  2486   int first_arg_to_pass = 0;
  2487   int total_c_args = 0;
  2489   // Skip the receiver as dtrace doesn't want to see it
  2490   if( !method->is_static() ) {
  2491     in_sig_bt[i++] = T_OBJECT;
  2492     first_arg_to_pass = 1;
  2495   SignatureStream ss(method->signature());
  2496   for ( ; !ss.at_return_type(); ss.next()) {
  2497     BasicType bt = ss.type();
  2498     in_sig_bt[i++] = bt;  // Collect remaining bits of signature
  2499     out_sig_bt[total_c_args++] = bt;
  2500     if( bt == T_OBJECT) {
  2501       symbolOop s = ss.as_symbol_or_null();
  2502       if (s == vmSymbols::java_lang_String()) {
  2503         total_strings++;
  2504         out_sig_bt[total_c_args-1] = T_ADDRESS;
  2505       } else if (s == vmSymbols::java_lang_Boolean() ||
  2506                  s == vmSymbols::java_lang_Byte()) {
  2507         out_sig_bt[total_c_args-1] = T_BYTE;
  2508       } else if (s == vmSymbols::java_lang_Character() ||
  2509                  s == vmSymbols::java_lang_Short()) {
  2510         out_sig_bt[total_c_args-1] = T_SHORT;
  2511       } else if (s == vmSymbols::java_lang_Integer() ||
  2512                  s == vmSymbols::java_lang_Float()) {
  2513         out_sig_bt[total_c_args-1] = T_INT;
  2514       } else if (s == vmSymbols::java_lang_Long() ||
  2515                  s == vmSymbols::java_lang_Double()) {
  2516         out_sig_bt[total_c_args-1] = T_LONG;
  2517         out_sig_bt[total_c_args++] = T_VOID;
  2519     } else if ( bt == T_LONG || bt == T_DOUBLE ) {
  2520       in_sig_bt[i++] = T_VOID;   // Longs & doubles take 2 Java slots
  2521       // We convert double to long
  2522       out_sig_bt[total_c_args-1] = T_LONG;
  2523       out_sig_bt[total_c_args++] = T_VOID;
  2524     } else if ( bt == T_FLOAT) {
  2525       // We convert float to int
  2526       out_sig_bt[total_c_args-1] = T_INT;
  2530   assert(i==total_args_passed, "validly parsed signature");
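         // Illustrative example only: for a probe  void log(String s, Long v)  the trap
         // signature built above becomes  (char* utf8_of_s, jlong value_of_v)  -- the String is
         // copied as bounded utf8 into the stack area reserved below and the box is unwrapped
         // to its primitive value.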
  2532   // Now get the compiled-Java layout as input arguments
  2533   int comp_args_on_stack;
  2534   comp_args_on_stack = SharedRuntime::java_calling_convention(
  2535       in_sig_bt, in_regs, total_args_passed, false);
  2537   // We have received a description of where all the java arg are located
  2538   // on entry to the wrapper. We need to convert these args to where
  2539   // a native (non-jni) function would expect them. To figure out
  2540   // where they go we convert the java signature to a C signature and remove
  2541   // T_VOID for any long/double we might have received.
  2544   // Now figure out where the args must be stored and how much stack space
  2545   // they require (neglecting out_preserve_stack_slots but including space for storing
  2546   // the 1st six register arguments). It's weird; see int_stk_helper.
  2548   int out_arg_slots;
  2549   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  2551   // Calculate the total number of stack slots we will need.
  2553   // First count the abi requirement plus all of the outgoing args
  2554   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  2556   // Plus a temp for possible conversion of float/double/long register args
  2558   int conversion_temp = stack_slots;
  2559   stack_slots += 2;
  2562   // Now space for the string(s) we must convert
  2564   int string_locs = stack_slots;
  2565   stack_slots += total_strings *
  2566                    (max_dtrace_string_size / VMRegImpl::stack_slot_size);
  2568   // Ok The space we have allocated will look like:
  2569   //
  2570   //
  2571   // FP-> |                     |
  2572   //      |---------------------|
  2573   //      | string[n]           |
  2574   //      |---------------------| <- string_locs[n]
  2575   //      | string[n-1]         |
  2576   //      |---------------------| <- string_locs[n-1]
  2577   //      | ...                 |
  2578   //      | ...                 |
  2579   //      |---------------------| <- string_locs[1]
  2580   //      | string[0]           |
  2581   //      |---------------------| <- string_locs[0]
  2582   //      | temp                |
  2583   //      |---------------------| <- conversion_temp
  2584   //      | outbound memory     |
  2585   //      | based arguments     |
  2586   //      |                     |
  2587   //      |---------------------|
  2588   //      |                     |
  2589   // SP-> | out_preserved_slots |
  2590   //
  2591   //
  2593   // Now compute actual number of stack words we need rounding to make
  2594   // stack properly aligned.
  2595   stack_slots = round_to(stack_slots, 4 * VMRegImpl::slots_per_word);
  2597   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
  2599   intptr_t start = (intptr_t)__ pc();
  2601   // First thing make an ic check to see if we should even be here
  2604     Label L;
  2605     const Register temp_reg = G3_scratch;
  2606     Address ic_miss(temp_reg, SharedRuntime::get_ic_miss_stub());
  2607     __ verify_oop(O0);
  2608     __ ld_ptr(O0, oopDesc::klass_offset_in_bytes(), temp_reg);
  2609     __ cmp(temp_reg, G5_inline_cache_reg);
  2610     __ brx(Assembler::equal, true, Assembler::pt, L);
  2611     __ delayed()->nop();
  2613     __ jump_to(ic_miss, 0);
  2614     __ delayed()->nop();
  2615     __ align(CodeEntryAlignment);
  2616     __ bind(L);
  2619   int vep_offset = ((intptr_t)__ pc()) - start;
  2622   // The instruction at the verified entry point must be 5 bytes or longer
  2623   // because it can be patched on the fly by make_non_entrant. The stack bang
  2624   // instruction fits that requirement.
  2626   // Generate stack overflow check before creating frame
  2627   __ generate_stack_overflow_check(stack_size);
  2629   assert(((intptr_t)__ pc() - start - vep_offset) >= 5,
  2630          "valid size for make_non_entrant");
  2632   // Generate a new frame for the wrapper.
  2633   __ save(SP, -stack_size, SP);
  2635   // Frame is now completed as far a size and linkage.
  2637   int frame_complete = ((intptr_t)__ pc()) - start;
  2639 #ifdef ASSERT
  2640   bool reg_destroyed[RegisterImpl::number_of_registers];
  2641   bool freg_destroyed[FloatRegisterImpl::number_of_registers];
  2642   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
  2643     reg_destroyed[r] = false;
  2645   for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) {
  2646     freg_destroyed[f] = false;
  2649 #endif /* ASSERT */
  2651   VMRegPair zero;
  2652   const Register g0 = G0; // without this we get a compiler warning (why??)
  2653   zero.set2(g0->as_VMReg());
  2655   int c_arg, j_arg;
  2657   Register conversion_off = noreg;
  2659   for (j_arg = first_arg_to_pass, c_arg = 0 ;
  2660        j_arg < total_args_passed ; j_arg++, c_arg++ ) {
  2662     VMRegPair src = in_regs[j_arg];
  2663     VMRegPair dst = out_regs[c_arg];
  2665 #ifdef ASSERT
  2666     if (src.first()->is_Register()) {
  2667       assert(!reg_destroyed[src.first()->as_Register()->encoding()], "ack!");
  2668     } else if (src.first()->is_FloatRegister()) {
  2669       assert(!freg_destroyed[src.first()->as_FloatRegister()->encoding(
  2670                                                FloatRegisterImpl::S)], "ack!");
  2672     if (dst.first()->is_Register()) {
  2673       reg_destroyed[dst.first()->as_Register()->encoding()] = true;
  2674     } else if (dst.first()->is_FloatRegister()) {
  2675       freg_destroyed[dst.first()->as_FloatRegister()->encoding(
  2676                                                  FloatRegisterImpl::S)] = true;
  2678 #endif /* ASSERT */
  2680     switch (in_sig_bt[j_arg]) {
  2681       case T_ARRAY:
  2682       case T_OBJECT:
  2684           if (out_sig_bt[c_arg] == T_BYTE  || out_sig_bt[c_arg] == T_SHORT ||
  2685               out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
  2686             // need to unbox a one-slot value
  2687             Register in_reg = L0;
  2688             Register tmp = L2;
  2689             if ( src.first()->is_reg() ) {
  2690               in_reg = src.first()->as_Register();
  2691             } else {
  2692               assert(Assembler::is_simm13(reg2offset(src.first()) + STACK_BIAS),
  2693                      "must be");
  2694               __ ld_ptr(FP, reg2offset(src.first()) + STACK_BIAS, in_reg);
  2696             // If the final destination is an acceptable register
  2697             if ( dst.first()->is_reg() ) {
  2698               if ( dst.is_single_phys_reg() || out_sig_bt[c_arg] != T_LONG ) {
  2699                 tmp = dst.first()->as_Register();
  2703             Label skipUnbox;
  2704             if ( wordSize == 4 && out_sig_bt[c_arg] == T_LONG ) {
  2705               __ mov(G0, tmp->successor());
  2707             __ br_null(in_reg, true, Assembler::pn, skipUnbox);
  2708             __ delayed()->mov(G0, tmp);
  2710             BasicType bt = out_sig_bt[c_arg];
  2711             int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
  2712             switch (bt) {
  2713                 case T_BYTE:
  2714                   __ ldub(in_reg, box_offset, tmp); break;
  2715                 case T_SHORT:
  2716                   __ lduh(in_reg, box_offset, tmp); break;
  2717                 case T_INT:
  2718                   __ ld(in_reg, box_offset, tmp); break;
  2719                 case T_LONG:
  2720                   __ ld_long(in_reg, box_offset, tmp); break;
  2721                 default: ShouldNotReachHere();
  2724             __ bind(skipUnbox);
  2725             // If tmp wasn't final destination copy to final destination
  2726             if (tmp == L2) {
  2727               VMRegPair tmp_as_VM = reg64_to_VMRegPair(L2);
  2728               if (out_sig_bt[c_arg] == T_LONG) {
  2729                 long_move(masm, tmp_as_VM, dst);
  2730               } else {
  2731                 move32_64(masm, tmp_as_VM, out_regs[c_arg]);
  2734             if (out_sig_bt[c_arg] == T_LONG) {
  2735               assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
  2736               ++c_arg; // move over the T_VOID to keep the loop indices in sync
  2738           } else if (out_sig_bt[c_arg] == T_ADDRESS) {
  2739             Register s =
  2740                 src.first()->is_reg() ? src.first()->as_Register() : L2;
  2741             Register d =
  2742                 dst.first()->is_reg() ? dst.first()->as_Register() : L2;
  2744             // We store the oop now so that the conversion pass can reach it
  2745             // while in the inner frame. This will be the only store if
  2746             // the oop is NULL.
  2747             if (s != L2) {
  2748               // src is register
  2749               if (d != L2) {
  2750                 // dst is register
  2751                 __ mov(s, d);
  2752               } else {
  2753                 assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2754                           STACK_BIAS), "must be");
  2755                 __ st_ptr(s, SP, reg2offset(dst.first()) + STACK_BIAS);
  2757             } else {
  2758                 // src not a register
  2759                 assert(Assembler::is_simm13(reg2offset(src.first()) +
  2760                            STACK_BIAS), "must be");
  2761                 __ ld_ptr(FP, reg2offset(src.first()) + STACK_BIAS, d);
  2762                 if (d == L2) {
  2763                   assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2764                              STACK_BIAS), "must be");
  2765                   __ st_ptr(d, SP, reg2offset(dst.first()) + STACK_BIAS);
  2768           } else if (out_sig_bt[c_arg] != T_VOID) {
  2769             // Convert the arg to NULL
  2770             if (dst.first()->is_reg()) {
  2771               __ mov(G0, dst.first()->as_Register());
  2772             } else {
  2773               assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2774                          STACK_BIAS), "must be");
  2775               __ st_ptr(G0, SP, reg2offset(dst.first()) + STACK_BIAS);
  2779         break;
  2780       case T_VOID:
  2781         break;
  2783       case T_FLOAT:
  2784         if (src.first()->is_stack()) {
  2785           // Stack to stack/reg is simple
  2786           move32_64(masm, src, dst);
  2787         } else {
  2788           if (dst.first()->is_reg()) {
  2789             // freg -> reg
  2790             int off =
  2791               STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2792             Register d = dst.first()->as_Register();
  2793             if (Assembler::is_simm13(off)) {
  2794               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2795                      SP, off);
  2796               __ ld(SP, off, d);
  2797             } else {
  2798               if (conversion_off == noreg) {
  2799                 __ set(off, L6);
  2800                 conversion_off = L6;
  2802               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2803                      SP, conversion_off);
  2804               __ ld(SP, conversion_off , d);
  2806           } else {
  2807             // freg -> mem
  2808             int off = STACK_BIAS + reg2offset(dst.first());
  2809             if (Assembler::is_simm13(off)) {
  2810               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2811                      SP, off);
  2812             } else {
  2813               if (conversion_off == noreg) {
  2814                 __ set(off, L6);
  2815                 conversion_off = L6;
  2817               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2818                      SP, conversion_off);
  2822         break;
  2824       case T_DOUBLE:
  2825         assert( j_arg + 1 < total_args_passed &&
  2826                 in_sig_bt[j_arg + 1] == T_VOID &&
  2827                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
  2828         if (src.first()->is_stack()) {
  2829           // Stack to stack/reg is simple
  2830           long_move(masm, src, dst);
  2831         } else {
  2832           Register d = dst.first()->is_reg() ? dst.first()->as_Register() : L2;
  2834           // Destination could be an odd reg on 32bit in which case
  2835           // we can't load directly into the destination.
  2837           if (!d->is_even() && wordSize == 4) {
  2838             d = L2;
  2840           int off = STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2841           if (Assembler::is_simm13(off)) {
  2842             __ stf(FloatRegisterImpl::D, src.first()->as_FloatRegister(),
  2843                    SP, off);
  2844             __ ld_long(SP, off, d);
  2845           } else {
  2846             if (conversion_off == noreg) {
  2847               __ set(off, L6);
  2848               conversion_off = L6;
  2850             __ stf(FloatRegisterImpl::D, src.first()->as_FloatRegister(),
  2851                    SP, conversion_off);
  2852             __ ld_long(SP, conversion_off, d);
  2854           if (d == L2) {
  2855             long_move(masm, reg64_to_VMRegPair(L2), dst);
  2858         break;
  2860       case T_LONG :
  2861         // 32bit can't do a split move of something like g1 -> O0, O1
  2862         // so use a memory temp
  2863         if (src.is_single_phys_reg() && wordSize == 4) {
  2864           Register tmp = L2;
  2865           if (dst.first()->is_reg() &&
  2866               (wordSize == 8 || dst.first()->as_Register()->is_even())) {
  2867             tmp = dst.first()->as_Register();
  2870           int off = STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2871           if (Assembler::is_simm13(off)) {
  2872             __ stx(src.first()->as_Register(), SP, off);
  2873             __ ld_long(SP, off, tmp);
  2874           } else {
  2875             if (conversion_off == noreg) {
  2876               __ set(off, L6);
  2877               conversion_off = L6;
  2879             __ stx(src.first()->as_Register(), SP, conversion_off);
  2880             __ ld_long(SP, conversion_off, tmp);
  2883           if (tmp == L2) {
  2884             long_move(masm, reg64_to_VMRegPair(L2), dst);
  2886         } else {
  2887           long_move(masm, src, dst);
  2889         break;
  2891       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  2893       default:
  2894         move32_64(masm, src, dst);
  2899   // If we have any strings we must store any register-based arg to the stack.
  2900   // This includes any still-live xmm registers too.
  2902   if (total_strings > 0 ) {
  2904     // protect all the arg registers
  2905     __ save_frame(0);
  2906     __ mov(G2_thread, L7_thread_cache);
  2907     const Register L2_string_off = L2;
  2909     // Get first string offset
  2910     __ set(string_locs * VMRegImpl::stack_slot_size, L2_string_off);
  2912     for (c_arg = 0 ; c_arg < total_c_args ; c_arg++ ) {
  2913       if (out_sig_bt[c_arg] == T_ADDRESS) {
  2915         VMRegPair dst = out_regs[c_arg];
  2916         const Register d = dst.first()->is_reg() ?
  2917             dst.first()->as_Register()->after_save() : noreg;
  2919         // It's a string; the oop was already copied to the out arg
  2920         // position
  2921         if (d != noreg) {
  2922           __ mov(d, O0);
  2923         } else {
  2924           assert(Assembler::is_simm13(reg2offset(dst.first()) + STACK_BIAS),
  2925                  "must be");
  2926           __ ld_ptr(FP,  reg2offset(dst.first()) + STACK_BIAS, O0);
  2928         Label skip;
  2930         __ br_null(O0, false, Assembler::pn, skip);
  2931         __ delayed()->add(FP, L2_string_off, O1);
  2933         if (d != noreg) {
  2934           __ mov(O1, d);
  2935         } else {
  2936           assert(Assembler::is_simm13(reg2offset(dst.first()) + STACK_BIAS),
  2937                  "must be");
  2938           __ st_ptr(O1, FP,  reg2offset(dst.first()) + STACK_BIAS);
  2941         __ call(CAST_FROM_FN_PTR(address, SharedRuntime::get_utf),
  2942                 relocInfo::runtime_call_type);
  2943         __ delayed()->add(L2_string_off, max_dtrace_string_size, L2_string_off);
  2945         __ bind(skip);
  2950     __ mov(L7_thread_cache, G2_thread);
  2951     __ restore();
  2956   // Ok now we are done. Need to place the nop that dtrace wants in order to
  2957   // patch in the trap
  2959   int patch_offset = ((intptr_t)__ pc()) - start;
  2961   __ nop();
  2964   // Return
  2966   __ ret();
  2967   __ delayed()->restore();
  2969   __ flush();
  2971   nmethod *nm = nmethod::new_dtrace_nmethod(
  2972       method, masm->code(), vep_offset, patch_offset, frame_complete,
  2973       stack_slots / VMRegImpl::slots_per_word);
  2974   return nm;
  2978 #endif // HAVE_DTRACE_H
  2980 // This function returns the adjustment (in number of words) to the size of a
  2981 // c2i adapter activation, for use during deoptimization.
  2982 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) {
  2983   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
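// Illustrative example (hypothetical numbers): for a callee with 2 parameters
// and 5 locals, with Interpreter::stackElementWords == 1, the interpreter
// activation needs (5 - 2) * 1 = 3 extra words relative to the c2i adapter.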
  2986 // "Top of Stack" slots that may be unused by the calling convention but must
  2987 // otherwise be preserved.
  2988 // On Intel these are not necessary and the value can be zero.
  2989 // On Sparc this describes the words reserved for storing a register window
  2990 // when an interrupt occurs.
  2991 uint SharedRuntime::out_preserve_stack_slots() {
  2992    return 0;
  2995 //------------------------------generate_deopt_blob----------------------------
  2996 // Ought to generate an ideal graph & compile, but here's some MIPS assembly
  2997 // instead.
  2998 void SharedRuntime::generate_deopt_blob() {
  2999   // allocate space for the code
  3000   ResourceMark rm;
  3001   // setup code generation tools
  3002   //CodeBuffer     buffer ("deopt_blob", 4000, 2048);
  3003   CodeBuffer     buffer ("deopt_blob", 8000, 2048);//aoqi FIXME for debug
  3004   MacroAssembler* masm  = new MacroAssembler( & buffer);
  3005   int frame_size_in_words;
  3006   OopMap* map = NULL;
  3007   // Account for the extra args we place on the stack
  3008   // by the time we call fetch_unroll_info
  3009   const int additional_words = 2; // deopt kind, thread
  3011   OopMapSet *oop_maps = new OopMapSet();
  3013   address start = __ pc();
  3014   Label cont;
  3015   // we use S3 for DeOpt reason register
  3016   Register reason = S3;
  3017   // use S6 for thread register
  3018   Register thread = TREG;
  3019   // use S7 for fetch_unroll_info returned UnrollBlock
  3020   Register unroll = S7;
  3021   // Prolog for non exception case!
  3022   // Correct the return address we were given.
  3023   //FIXME: is the return address on the tos or in RA?
  3024   __ addi(RA, RA, - (NativeCall::return_address_offset_long));
  3025   // Save everything in sight.
  3026   map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3027   // Normal deoptimization
  3028   __ move(reason, Deoptimization::Unpack_deopt);
  3029   __ b(cont);
  3030   __ delayed()->nop();
  3032   int reexecute_offset = __ pc() - start;
  3034   // Reexecute case
  3035   // the return address is the pc that describes what bci to re-execute at
  3037   // No need to update map as each call to save_live_registers will produce identical oopmap
  3038   (void) RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3039   __ move(reason, Deoptimization::Unpack_reexecute);
  3040   __ b(cont);
  3041   __ delayed()->nop();
  3043   int   exception_offset = __ pc() - start;
  3044   // Prolog for exception case
  3046   // all registers are dead at this entry point, except for V0 and
  3047   // V1 which contain the exception oop and exception pc
  3048   // respectively.  Set them in TLS and fall thru to the
  3049   // unpack_with_exception_in_tls entry point.
  3051   __ get_thread(thread);
  3052   __ st_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3053   __ st_ptr(V0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3054   int exception_in_tls_offset = __ pc() - start;
  3055   // new implementation because exception oop is now passed in JavaThread
  3057   // Prolog for exception case
  3058   // All registers must be preserved because they might be used by LinearScan
  3059   // Exception oop and throwing PC are passed in JavaThread
  3060   // tos: stack at point of call to method that threw the exception (i.e. only
  3061   // args are on the stack, no return address)
  3063   // Return address will be patched later with the throwing pc. The correct value is not
  3064   // available now because loading it from memory would destroy registers.
  3065   // Save everything in sight.
  3066   // No need to update map as each call to save_live_registers will produce identical oopmap
  3067   __ addi(RA, RA, - (NativeCall::return_address_offset_long));
  3068   (void) RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3070   // Now it is safe to overwrite any register
  3071   // store the correct deoptimization type
  3072   __ move(reason, Deoptimization::Unpack_exception);
  3073   // load throwing pc from JavaThread and patch it as the return address
  3074   // of the current frame. Then clear the field in JavaThread
  3075   __ get_thread(thread);
  3076   __ ld_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3077   __ st_ptr(V1, SP, RegisterSaver::raOffset() * wordSize); //save ra
  3078   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_pc_offset()));
  3081 #ifdef ASSERT
  3082   // verify that there is really an exception oop in JavaThread
  3083   __ ld_ptr(AT, thread, in_bytes(JavaThread::exception_oop_offset()));
  3084   __ verify_oop(AT);
  3085   // verify that there is no pending exception
  3086   Label no_pending_exception;
  3087   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3088   __ beq(AT, R0, no_pending_exception);
  3089   __ delayed()->nop();
  3090   __ stop("must not have pending exception here");
  3091   __ bind(no_pending_exception);
  3092 #endif
  3093   __ bind(cont);
  3094   // Compiled code leaves the floating point stack dirty, empty it.
  3095   __ empty_FPU_stack();
  3098   // Call C code.  Need thread and this frame, but NOT official VM entry
  3099   // crud.  We cannot block on this call, no GC can happen.
  3100 #ifndef OPT_THREAD
  3101   __ get_thread(thread);
  3102 #endif
  3104   __ move(A0, thread);
  3105   __ addi(SP, SP, -additional_words  * wordSize);
  3107   __ set_last_Java_frame(NOREG, NOREG, NULL);
  3109   // Call fetch_unroll_info().  Need thread and this frame, but NOT official VM entry - cannot block on
  3110   // this call, no GC can happen.  Call should capture return values.
  3112   __ relocate(relocInfo::internal_pc_type);
  3114     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 28;
  3115     __ patchable_set48(AT, save_pc);
  3117   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3119   __ call((address)Deoptimization::fetch_unroll_info);
  3120   //__ call(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info), relocInfo::runtime_call_type);
  3121   __ delayed()->nop();
  3122   oop_maps->add_gc_map(__ pc() - start, map);
  3123   __ addiu(SP, SP, additional_words * wordSize);
  3124   __ get_thread(thread);
  3125   __ reset_last_Java_frame(false);
  3127   // Load UnrollBlock into S7
  3128   __ move(unroll, V0);
  3131   // Move the unpack kind to a safe place in the UnrollBlock because
  3132   // we are very short of registers
  3134   Address unpack_kind(unroll, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes());
  3135   __ sw(reason, unpack_kind);
  3136   // save the unpack_kind value
  3137   // Retrieve the possible live values (return values)
  3138   // All callee save registers representing jvm state
  3139   // are now in the vframeArray.
  3141   Label noException;
  3142   __ move(AT, Deoptimization::Unpack_exception);
  3143   __ bne(AT, reason, noException);// Was exception pending?
  3144   __ delayed()->nop();
  3145   __ ld_ptr(V0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3146   __ ld_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3147   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_pc_offset()));
  3148   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3150   __ verify_oop(V0);
  3152   // Overwrite the result registers with the exception results.
  3153   __ st_ptr(V0, SP, RegisterSaver::v0Offset()*wordSize);
  3154   __ st_ptr(V1, SP, RegisterSaver::v1Offset()*wordSize);
  3156   __ bind(noException);
  3159   // Stack is back to only having register save data on the stack.
  3160   // Now restore the result registers. Everything else is either dead or captured
  3161   // in the vframeArray.
  3163   RegisterSaver::restore_result_registers(masm);
  3164   // All of the register save area has been popped off the stack. Only the
  3165   // return address remains.
  3166   // Pop all the frames we must move/replace.
  3167   // Frame picture (youngest to oldest)
  3168   // 1: self-frame (no frame link)
  3169   // 2: deopting frame  (no frame link)
  3170   // 3: caller of deopting frame (could be compiled/interpreted).
  3171   //
  3172   // Note: by leaving the return address of self-frame on the stack
  3173   // and using the size of frame 2 to adjust the stack
  3174   // when we are done the return to frame 3 will still be on the stack.
  3176   // register for the sender's sp
  3177   Register sender_sp = Rsender;
  3178   // register for frame pcs
  3179   Register pcs = T0;
  3180   // register for frame sizes
  3181   Register sizes = T1;
  3182   // register for frame count
  3183   Register count = T3;
  3185   // Pop deoptimized frame
  3186   __ lw(AT, unroll, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes());
  3187   __ add(SP, SP, AT);
  3188   // sp should be pointing at the return address to the caller (3)
  3190   // Load array of frame pcs into pcs
  3191   __ ld_ptr(pcs, unroll, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes());
  3192   __ addi(SP, SP, wordSize);  // trash the old pc
  3193   // Load array of frame sizes into sizes
  3194   __ ld_ptr(sizes, unroll, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes());
  3198   // Load count of frames into T3
  3199   __ lw(count, unroll, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes());
  3200   // Pick up the initial fp we should save
  3201   __ ld(FP, unroll,  Deoptimization::UnrollBlock::initial_info_offset_in_bytes());
  3202    // Now adjust the caller's stack to make up for the extra locals
  3203   // but record the original sp so that we can save it in the skeletal interpreter
  3204   // frame and the stack walking of interpreter_sender will get the unextended sp
  3205   // value and not the "real" sp value.
  3206   __ move(sender_sp, SP);
  3207   __ lw(AT, unroll, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes());
  3208   __ sub(SP, SP, AT);
  3210   // Push interpreter frames in a loop
  3211   //
  3212   //Loop:
  3213   //   0x000000555bd82d18: lw t2, 0x0(t1)           ; lw sizes[i]  <--- error lw->ld
  3214   //   0x000000555bd82d1c: ld at, 0x0(t0)           ; ld pcs[i]
  3215   //   0x000000555bd82d20: daddi t2, t2, 0xfffffff0 ; t2 -= 16
  3216   //   0x000000555bd82d24: daddi sp, sp, 0xfffffff0
  3217   //   0x000000555bd82d28: sd fp, 0x0(sp)           ; push fp
  3218   //   0x000000555bd82d2c: sd at, 0x8(sp)           ; push at
  3219   //   0x000000555bd82d30: dadd fp, sp, zero        ; fp <- sp
  3220   //   0x000000555bd82d34: dsub sp, sp, t2          ; sp -= t2
  3221   //   0x000000555bd82d38: sd zero, 0xfffffff0(fp)  ; __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3222   //   0x000000555bd82d3c: sd s4, 0xfffffff8(fp)    ; __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);
  3223   //   0x000000555bd82d40: dadd s4, sp, zero        ; move(sender_sp, SP);
  3224   //   0x000000555bd82d44: daddi t3, t3, 0xffffffff ; count --
  3225   //   0x000000555bd82d48: daddi t1, t1, 0x4        ; sizes += 4
  3226   //   0x000000555bd82d4c: bne t3, zero, 0x000000555bd82d18
  3227   //   0x000000555bd82d50: daddi t0, t0, 0x4        ; <--- error    t0 += 8
  3228   //
  3229   // pcs[0] = frame_pcs[0] = deopt_sender.raw_pc();
  3230   Label loop;
  3231   __ bind(loop);
  3232   __ ld(T2, sizes, 0);    // Load frame size
  3233   __ ld_ptr(AT, pcs, 0);           // save return address
  3234   __ addi(T2, T2, -2*wordSize);           // we'll push pc and fp, by hand
  3235   __ push2(AT, FP);
  3236   __ move(FP, SP);
  3237   __ sub(SP, SP, T2);       // Prolog!
  3238   // This value is corrected by layout_activation_impl
  3239   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3240   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);// Make it walkable
  3241   __ move(sender_sp, SP);  // pass to next frame
  3242   __ addi(count, count, -1);   // decrement counter
  3243   __ addi(sizes, sizes, wordSize);   // Bump array pointer (sizes)
  3244   __ bne(count, R0, loop);
  3245   __ delayed()->addi(pcs, pcs, wordSize);   // Bump array pointer (pcs)
  3246   __ ld(AT, pcs, 0);      // frame_pcs[number_of_frames] = Interpreter::deopt_entry(vtos, 0);
  3247   // Re-push self-frame
  3248   __ push2(AT, FP);
  3249   __ move(FP, SP);
  3250   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3251   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);
  3252   __ addi(SP, SP, -(frame_size_in_words - 2 - additional_words) * wordSize);
  3254   // Restore frame locals after moving the frame
  3255   __ sd(V0, SP, RegisterSaver::v0Offset() * wordSize);
  3256   __ sd(V1, SP, RegisterSaver::v1Offset() * wordSize);
  3257   __ sdc1(F0, SP, RegisterSaver::fpResultOffset()* wordSize);// Pop float stack and store in local
  3258   __ sdc1(F1, SP, (RegisterSaver::fpResultOffset() + 1) * wordSize);
  3261   // Call unpack_frames().  Need thread and this frame, but NOT official VM entry - cannot block on
  3262   // this call, no GC can happen.
  3263   __ move(A1, reason);  // exec_mode
  3264   __ get_thread(thread);
  3265   __ move(A0, thread);  // thread
  3266   __ addi(SP, SP, (-additional_words) *wordSize);
  3268   // set last_Java_sp, last_Java_fp
  3269   __ set_last_Java_frame(NOREG, FP, NULL);
  3271   __ move(AT, -(StackAlignmentInBytes));
  3272   __ andr(SP, SP, AT);   // Fix stack alignment as required by ABI
  3274   __ relocate(relocInfo::internal_pc_type);
  3276     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 28;
  3277     __ patchable_set48(AT, save_pc);
  3279   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3281   __ call(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames), relocInfo::runtime_call_type);
  3282   __ delayed()->nop();
  3283   // Revert SP alignment after call since we're going to do some SP relative addressing below
  3284   __ ld(SP, thread, in_bytes(JavaThread::last_Java_sp_offset()));
  3285   // Set an oopmap for the call site
  3286   oop_maps->add_gc_map(__ offset(), new OopMap( frame_size_in_words , 0));
  3288   __ push(V0);
  3290   __ get_thread(thread);
  3291   __ reset_last_Java_frame(true);
  3293   // Collect return values
  3294   __ ld(V0, SP, (RegisterSaver::v0Offset() + additional_words +1) * wordSize);
  3295   __ ld(V1, SP, (RegisterSaver::v1Offset() + additional_words +1) * wordSize);
  3296   __ ldc1(F0, SP, RegisterSaver::fpResultOffset()* wordSize); // Restore the float result
  3297   __ ldc1(F1, SP, (RegisterSaver::fpResultOffset() + 1) * wordSize);
  3298   //FIXME,
  3299   // Clear floating point stack before returning to interpreter
  3300   __ empty_FPU_stack();
  3301   //FIXME: we should consider float and double here
  3302   // Push a float or double return value if necessary.
  3303   __ leave();
  3305   // Jump to interpreter
  3306   __ jr(RA);
  3307   __ delayed()->nop();
  3309   masm->flush();
  3310   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  3311   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
  3314 #ifdef COMPILER2
  3316 //------------------------------generate_uncommon_trap_blob--------------------
  3317 // Ought to generate an ideal graph & compile, but here's some MIPS assembly
  3318 // instead.
  3319 void SharedRuntime::generate_uncommon_trap_blob() {
  3320   // allocate space for the code
  3321   ResourceMark rm;
  3322   // setup code generation tools
  3323   CodeBuffer  buffer ("uncommon_trap_blob", 512*80 , 512*40 );
  3324   MacroAssembler* masm = new MacroAssembler(&buffer);
  3326   enum frame_layout {
  3327     s0_off, s0_off2,
  3328     s1_off, s1_off2,
  3329     s2_off, s2_off2,
  3330     s3_off, s3_off2,
  3331     s4_off, s4_off2,
  3332     s5_off, s5_off2,
  3333     s6_off, s6_off2,
  3334     s7_off, s7_off2,
  3335     fp_off, fp_off2,
  3336     return_off, return_off2,    // slot for return address    sp + 9
  3337     framesize
  3338   };
  3339   assert(framesize % 4 == 0, "sp not 16-byte aligned");
  3341   address start = __ pc();
  3343   // Push self-frame.
  3344   __ daddiu(SP, SP, -framesize * BytesPerInt);
  3346   __ sd(RA, SP, return_off * BytesPerInt);
  3347   __ sd(FP, SP, fp_off * BytesPerInt);
  3349   // Save callee-saved registers.
  3351   __ sd(S0, SP, s0_off * BytesPerInt);
  3352   __ sd(S1, SP, s1_off * BytesPerInt);
  3353   __ sd(S2, SP, s2_off * BytesPerInt);
  3354   __ sd(S3, SP, s3_off * BytesPerInt);
  3355   __ sd(S4, SP, s4_off * BytesPerInt);
  3356   __ sd(S5, SP, s5_off * BytesPerInt);
  3357   __ sd(S6, SP, s6_off * BytesPerInt);
  3358   __ sd(S7, SP, s7_off * BytesPerInt);
  3360   __ daddi(FP, SP, fp_off * BytesPerInt);
  3362   // Clear the floating point exception stack
  3363   __ empty_FPU_stack();
  3365   Register thread = TREG;
  3367 #ifndef OPT_THREAD
  3368   __ get_thread(thread);
  3369 #endif
  3370   // set last_Java_sp
  3371   __ set_last_Java_frame(NOREG, FP, NULL);
  3372   __ relocate(relocInfo::internal_pc_type);
  3374     long save_pc = (long)__ pc() + 52;
  3375     __ patchable_set48(AT, (long)save_pc);
  3376     __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3378   // Call C code.  Need thread but NOT official VM entry
  3379   // crud.  We cannot block on this call, no GC can happen.  Call should
  3380   // capture callee-saved registers as well as return values.
  3381   __ move(A0, thread);
  3382   // argument already in T0
  3383   __ move(A1, T0);
  3384   __ patchable_call((address)Deoptimization::uncommon_trap);
  3386   // Set an oopmap for the call site
  3387   OopMapSet *oop_maps = new OopMapSet();
  3388   OopMap* map =  new OopMap( framesize, 0 );
  3390   map->set_callee_saved( VMRegImpl::stack2reg(s0_off    ),  S0->as_VMReg() );
  3391   map->set_callee_saved( VMRegImpl::stack2reg(s1_off    ),  S1->as_VMReg() );
  3392   map->set_callee_saved( VMRegImpl::stack2reg(s2_off    ),  S2->as_VMReg() );
  3393   map->set_callee_saved( VMRegImpl::stack2reg(s3_off    ),  S3->as_VMReg() );
  3394   map->set_callee_saved( VMRegImpl::stack2reg(s4_off    ),  S4->as_VMReg() );
  3395   map->set_callee_saved( VMRegImpl::stack2reg(s5_off    ),  S5->as_VMReg() );
  3396   map->set_callee_saved( VMRegImpl::stack2reg(s6_off    ),  S6->as_VMReg() );
  3397   map->set_callee_saved( VMRegImpl::stack2reg(s7_off    ),  S7->as_VMReg() );
  3399   //oop_maps->add_gc_map( __ offset(), true, map);
  3400   oop_maps->add_gc_map( __ offset(),  map);
  3402 #ifndef OPT_THREAD
  3403   __ get_thread(thread);
  3404 #endif
  3405   __ reset_last_Java_frame(false);
  3407   // Load UnrollBlock into S7
  3408   Register unroll = S7;
  3409   __ move(unroll, V0);
  3411   // Pop all the frames we must move/replace.
  3412   //
  3413   // Frame picture (youngest to oldest)
  3414   // 1: self-frame (no frame link)
  3415   // 2: deopting frame  (no frame link)
  3416   // 3: possible-i2c-adapter-frame
  3417   // 4: caller of deopting frame (could be compiled/interpreted; if interpreted
  3418   //    we will create a c2i adapter here)
  3420   __ daddiu(SP, SP, framesize * BytesPerInt);
  3422   // Pop deoptimized frame
  3423   __ lw(AT, unroll, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes());
  3424   __ dadd(SP, SP, AT);
  3426   // register for frame pcs
  3427   Register pcs = T8;
  3428   // register for frame sizes
  3429   Register sizes = T9;
  3430   // register for frame count
  3431   Register count = T3;
  3432   // register for the sender's sp
  3433   Register sender_sp = T1;
  3435   // sp should be pointing at the return address to the caller (4)
  3436   // Load array of frame pcs
  3437   __ ld(pcs, unroll, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes());
  3439   // Load array of frame sizes
  3440   __ ld(sizes, unroll, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes());
  3441   __ lwu(count, unroll, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes());
  3443   // Pick up the initial fp we should save
  3444   __ ld(FP, unroll, Deoptimization::UnrollBlock::initial_info_offset_in_bytes());
  3445   // Now adjust the caller's stack to make up for the extra locals
  3446   // but record the original sp so that we can save it in the skeletal interpreter
  3447   // frame and the stack walking of interpreter_sender will get the unextended sp
  3448   // value and not the "real" sp value.
  3450   __ move(sender_sp, SP);
  3451   __ lw(AT, unroll, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes());
  3452   __ dsub(SP, SP, AT);
  3453   // Push interpreter frames in a loop
  3454   Label loop;
  3455   __ bind(loop);
  3456   __ ld(T2, sizes, 0);          // Load frame size
  3457   __ ld(AT, pcs, 0);           // save return address
  3458   __ daddi(T2, T2, -2*wordSize);           // we'll push pc and fp, by hand
  3459   __ push2(AT, FP);
  3460   __ move(FP, SP);
  3461   __ dsub(SP, SP, T2);                   // Prolog!
  3462   // This value is corrected by layout_activation_impl
  3463   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3464   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);// Make it walkable
  3465   __ move(sender_sp, SP);       // pass to next frame
  3466   __ daddi(count, count, -1);    // decrement counter
  3467   __ daddi(sizes, sizes, wordSize);     // Bump array pointer (sizes)
  3468   __ addi(pcs, pcs, wordSize);      // Bump array pointer (pcs)
  3469   __ bne(count, R0, loop);
  3470   __ delayed()->nop();      // branch delay slot (pcs was already bumped above)
  3472   __ ld(RA, pcs, 0);
  3474   // Re-push self-frame
  3475   __ daddi(SP, SP, - 2 * wordSize);      // save old & set new FP
  3476   __ sd(FP, SP, 0 * wordSize);          // save final return address
  3477   __ sd(RA, SP, 1 * wordSize);
  3478   __ move(FP, SP);
  3479   __ daddi(SP, SP, -(framesize / 2 - 2) * wordSize);
  3481   // set last_Java_sp, last_Java_fp
  3482   __ set_last_Java_frame(NOREG, FP, NULL);
  3484   __ move(AT, -(StackAlignmentInBytes));
  3485   __ andr(SP, SP, AT);   // Fix stack alignment as required by ABI
  3487   __ relocate(relocInfo::internal_pc_type);
  3489     long save_pc = (long)__ pc() + 52;
  3490     __ patchable_set48(AT, (long)save_pc);
  3492   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3494   // Call C code.  Need thread but NOT official VM entry
  3495   // crud.  We cannot block on this call, no GC can happen.  Call should
  3496   // restore return values to their stack-slots with the new SP.
  3497   __ move(A0, thread);
  3498   __ move(A1, Deoptimization::Unpack_uncommon_trap);
  3499   __ patchable_call((address)Deoptimization::unpack_frames);
  3500   // Set an oopmap for the call site
  3501   oop_maps->add_gc_map( __ offset(),  new OopMap( framesize, 0 ) );
  3503   __ reset_last_Java_frame(true);
  3505   // Pop self-frame.
  3506   __ leave();     // Epilog!
  3508   // Jump to interpreter
  3509   __ jr(RA);
  3510   __ delayed()->nop();
  3511   // -------------
  3512   // make sure all code is generated
  3513   masm->flush();
  3515   _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, framesize / 2);
  3518 #endif // COMPILER2
  3520 //------------------------------generate_handler_blob-------------------
  3521 //
  3522 // Generate a special Compile2Runtime blob that saves all registers, and sets
  3523 // up an OopMap and calls safepoint code to stop the compiled code for
  3524 // a safepoint.
  3525 //
  3526 // This blob is jumped to (via a breakpoint and the signal handler) from a
  3527 // safepoint in compiled code.
  3529 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int pool_type) {
  3531   // Account for thread arg in our frame
  3532   const int additional_words = 0;
  3533   int frame_size_in_words;
  3535   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  3537   ResourceMark rm;
  3538   OopMapSet *oop_maps = new OopMapSet();
  3539   OopMap* map;
  3541   // allocate space for the code
  3542   // setup code generation tools
  3543   CodeBuffer  buffer ("handler_blob", 2048, 512);
  3544   MacroAssembler* masm = new MacroAssembler( &buffer);
  3546   const Register thread = TREG;
  3547   address start   = __ pc();
  3548   address call_pc = NULL;
  3549   bool cause_return = (pool_type == POLL_AT_RETURN);
  3550   bool save_vectors = (pool_type == POLL_AT_VECTOR_LOOP);
  3552   // If cause_return is true we are at a poll_return, and RA holds the
  3553   // return address to the caller of the nmethod that is at the
  3554   // safepoint. We can leave this return address in RA and
  3555   // effectively complete the return and safepoint in the caller.
  3556   // Otherwise we load the exception pc into RA.
  3557   __ push(thread);
  3558 #ifndef OPT_THREAD
  3559   __ get_thread(thread);
  3560 #endif
  3562   if(!cause_return) {
  3563     __ ld_ptr(RA, Address(thread, JavaThread::saved_exception_pc_offset()));
  3566   __ pop(thread);
  3567   map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, save_vectors);
  3569 #ifndef OPT_THREAD
  3570   __ get_thread(thread);
  3571 #endif
  3572   // The following is basically a call_VM. However, we need the precise
  3573   // address of the call in order to generate an oopmap. Hence, we do all the
  3574   // work ourselves.
  3576   __ move(A0, thread);
  3577   __ set_last_Java_frame(NOREG, NOREG, NULL);
  3580   // do the call
  3581   __ call(call_ptr);
  3582   __ delayed()->nop();
  3584   // Set an oopmap for the call site.  This oopmap will map all
  3585   // oop-registers and debug-info registers as callee-saved.  This
  3586   // will allow deoptimization at this safepoint to find all possible
  3587   // debug-info recordings, as well as let GC find all oops.
  3588   oop_maps->add_gc_map(__ offset(),  map);
  3590   Label noException;
  3592   // Clear last_Java_sp again
  3593   __ reset_last_Java_frame(false);
  3595   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3596   __ beq(AT, R0, noException);
  3597   __ delayed()->nop();
  3599   // Exception pending
  3601   RegisterSaver::restore_live_registers(masm, save_vectors);
  3602   // forward_exception_entry needs the return address on the stack
  3603   __ push(RA);
  3604   __ patchable_jump((address)StubRoutines::forward_exception_entry());
  3606   // No exception case
  3607   __ bind(noException);
  3608   // Normal exit, register restoring and exit
  3609   RegisterSaver::restore_live_registers(masm, save_vectors);
  3610   __ jr(RA);
  3611   __ delayed()->nop();
  3613   masm->flush();
  3615   // Fill-out other meta info
  3616   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
  3619 //
  3620 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
  3621 //
  3622 // Generate a stub that calls into vm to find out the proper destination
  3623 // of a java call. All the argument registers are live at this point
  3624 // but since this is generic code we don't know what they are and the caller
  3625 // must do any gc of the args.
  3626 //
  3627 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  3628   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  3630   // allocate space for the code
  3631   ResourceMark rm;
  3633   //CodeBuffer buffer(name, 1000, 512);
  3634   //FIXME. aoqi. code_size
  3635   CodeBuffer buffer(name, 2000, 2048);
  3636   MacroAssembler* masm  = new MacroAssembler(&buffer);
  3638   int frame_size_words;
  3639   //we put the thread in A0
  3641   OopMapSet *oop_maps = new OopMapSet();
  3642   OopMap* map = NULL;
  3644   int start = __ offset();
  3645   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_words);
  3648   int frame_complete = __ offset();
  3650   const Register thread = T8;
  3651   __ get_thread(thread);
  3653   __ move(A0, thread);
  3654   __ set_last_Java_frame(noreg, FP, NULL);
  3655   //align the stack before invoke native
  3656   __ move(AT, -(StackAlignmentInBytes));
  3657   __ andr(SP, SP, AT);
  3658   __ relocate(relocInfo::internal_pc_type);
  3660     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 24 + 1 * BytesPerInstWord;
  3661     __ patchable_set48(AT, save_pc);
  3663   __ sd(AT, thread, in_bytes(JavaThread::last_Java_pc_offset()));
  3665   __ call(destination);
  3666   __ delayed()->nop();
  3668   // Set an oopmap for the call site.
  3669   // We need this not only for callee-saved registers, but also for volatile
  3670   // registers that the compiler might be keeping live across a safepoint.
  3671   oop_maps->add_gc_map( __ offset() - start, map);
  3672   // V0 contains the address we are going to jump to assuming no exception got installed
  3673   __ get_thread(thread);
  3674   __ ld_ptr(SP, thread, in_bytes(JavaThread::last_Java_sp_offset()));
  3675   // clear last_Java_sp
  3676   __ reset_last_Java_frame(true);
  3677   // check for pending exceptions
  3678   Label pending;
  3679   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3680   __ bne(AT, R0, pending);
  3681   __ delayed()->nop();
  3682   // get the returned Method*
  3683   //FIXME: does MIPS need this?
  3684   __ get_vm_result_2(Rmethod, thread);  // Refer to OpenJDK8
  3685   __ st_ptr(Rmethod, SP, RegisterSaver::methodOffset() * wordSize);
  3686   __ st_ptr(V0, SP, RegisterSaver::v0Offset() * wordSize);
  3687   RegisterSaver::restore_live_registers(masm);
  3689   // We are back to the original state on entry and ready to go to the callee method.
  3690   __ jr(V0);
  3691   __ delayed()->nop();
  3692   // Pending exception after the safepoint
  3694   __ bind(pending);
  3696   RegisterSaver::restore_live_registers(masm);
  3698   // exception pending => remove activation and forward to exception handler
  3699   // forward_exception_entry needs the return address on the stack
  3700   __ push(RA);
  3701   __ get_thread(thread);
  3702   __ st_ptr(R0, thread, in_bytes(JavaThread::vm_result_offset()));
  3703   __ ld_ptr(V0, thread, in_bytes(Thread::pending_exception_offset()));
  3704   __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  3705   __ delayed()->nop();
  3706   //
  3707   // make sure all code is generated
  3708   masm->flush();
  3710   RuntimeStub* tmp= RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_words, oop_maps, true);
  3711   return tmp;
  3714 extern "C" int SpinPause() {return 0;}
  3717 //------------------------------Montgomery multiplication------------------------
  3718 //
  3720 // Subtract 0:b from carry:a.  Return carry.
  3721 static unsigned long
  3722 sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  3723   long borrow = 0, t = 0;
  3724   unsigned long tmp0, tmp1;
  3725   __asm__ __volatile__ (
  3726     "0:                                            \n"
  3727     "ld      %[tmp0],     0(%[a])                  \n"
  3728     "ld      %[tmp1],     0(%[b])                  \n"
  3729     "sltu    %[t],        %[tmp0],     %[borrow]   \n"
  3730     "dsubu   %[tmp0],     %[tmp0],     %[borrow]   \n"
  3731     "sltu    %[borrow],   %[tmp0],     %[tmp1]     \n"
  3732     "or      %[borrow],   %[borrow],   %[t]        \n"
  3733     "dsubu   %[tmp0],     %[tmp0],     %[tmp1]     \n"
  3734     "sd      %[tmp0],     0(%[a])                  \n"
  3735     "daddiu  %[a],        %[a],         8          \n"
  3736     "daddiu  %[b],        %[b],         8          \n"
  3737     "daddiu  %[len],      %[len],      -1          \n"
  3738     "bgtz    %[len],      0b                       \n"
  3739     "dsubu   %[tmp0],     %[carry],    %[borrow]   \n"
  3740     : [len]"+r"(len), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [borrow]"+r"(borrow), [a]"+r"(a), [b]"+r"(b), [t]"+r"(t)
  3741     : [carry]"r"(carry)
  3742     : "memory"
  3743   );
  3744   return tmp0;
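// For reference only: a portable C sketch (hypothetical helper, not used by the
// VM) of what the inline assembly above computes; for len > 0 it performs the
// same multi-word subtraction with borrow propagation.
static inline unsigned long
sub_portable(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  unsigned long borrow = 0;
  for (long i = 0; i < len; i++) {
    unsigned long t  = (a[i] < borrow);   // borrow out of a[i] - borrow
    unsigned long d  = a[i] - borrow;
    unsigned long b2 = (d < b[i]);        // borrow out of d - b[i]
    a[i] = d - b[i];
    borrow = t | b2;
  }
  return carry - borrow;                  // matches the delay-slot dsubu above
}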
  3747 // Multiply (unsigned) Long A by Long B, accumulating the double-
  3748 // length result into the accumulator formed of t0, t1, and t2.
  3749 inline void MACC(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  3750   unsigned long hi, lo, carry = 0, t = 0;
  3751   __asm__ __volatile__(
  3752     "dmultu  %[A],        %[B]                     \n"
  3753     "mfhi    %[hi]                                 \n"
  3754     "mflo    %[lo]                                 \n"
  3755     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3756     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3757     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3758     "sltu    %[t],        %[t1],       %[carry]    \n"
  3759     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3760     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3761     "or      %[carry],    %[carry],    %[t]        \n"
  3762     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3763     : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
  3764     : [A]"r"(A), [B]"r"(B)
  3766   );
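// For reference only: a portable sketch (hypothetical helper, not used by the
// VM) of the same accumulation, assuming the GCC/Clang __int128 extension that
// is available on 64-bit targets.
inline void MACC_portable(unsigned long A, unsigned long B,
                          unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  unsigned __int128 p = (unsigned __int128)A * B;
  unsigned long lo = (unsigned long)p;
  unsigned long hi = (unsigned long)(p >> 64);
  t0 += lo;
  unsigned long c = (t0 < lo);   // carry out of t0 += lo
  t1 += c;
  unsigned long c2 = (t1 < c);   // carry out of t1 += c
  t1 += hi;
  c2 |= (t1 < hi);               // carry out of t1 += hi
  t2 += c2;                      // t2 absorbs the final carry
}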
  3769 // As above, but add twice the double-length result into the
  3770 // accumulator.
  3771 inline void MACC2(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  3772   unsigned long hi, lo, carry = 0, t = 0;
  3773   __asm__ __volatile__(
  3774     "dmultu  %[A],        %[B]                     \n"
  3775     "mfhi    %[hi]                                 \n"
  3776     "mflo    %[lo]                                 \n"
  3777     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3778     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3779     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3780     "sltu    %[t],        %[t1],       %[carry]    \n"
  3781     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3782     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3783     "or      %[carry],    %[carry],    %[t]        \n"
  3784     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3785     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3786     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3787     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3788     "sltu    %[t],        %[t1],       %[carry]    \n"
  3789     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3790     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3791     "or      %[carry],    %[carry],    %[t]        \n"
  3792     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3793     : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
  3794     : [A]"r"(A), [B]"r"(B)
  3796   );
  3799 // Fast Montgomery multiplication.  The derivation of the algorithm is
  3800 // in  A Cryptographic Library for the Motorola DSP56000,
  3801 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
  3803 static void __attribute__((noinline))
  3804 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
  3805                     unsigned long m[], unsigned long inv, int len) {
  3806   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  3807   int i;
  3809   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  3811   for (i = 0; i < len; i++) {
  3812     int j;
  3813     for (j = 0; j < i; j++) {
  3814       MACC(a[j], b[i-j], t0, t1, t2);
  3815       MACC(m[j], n[i-j], t0, t1, t2);
  3817     MACC(a[i], b[0], t0, t1, t2);
  3818     m[i] = t0 * inv;
  3819     MACC(m[i], n[0], t0, t1, t2);
  3821     assert(t0 == 0, "broken Montgomery multiply");
  3823     t0 = t1; t1 = t2; t2 = 0;
  3826   for (i = len; i < 2*len; i++) {
  3827     int j;
  3828     for (j = i-len+1; j < len; j++) {
  3829       MACC(a[j], b[i-j], t0, t1, t2);
  3830       MACC(m[j], n[i-j], t0, t1, t2);
  3832     m[i-len] = t0;
  3833     t0 = t1; t1 = t2; t2 = 0;
  3836   while (t0)
  3837     t0 = sub(m, n, t0, len);
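// Orientation note for the routine above (the standard Montgomery/REDC result
// from the Dusse-Kaliski paper cited earlier; stated here for readability, not
// separately verified): on return m is congruent to a * b * R^-1 (mod n) with
// R = 2^(64*len), relying on inv == -n[0]^-1 mod 2^64 as the assert checks.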
  3840 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
  3841 // multiplies so it should be up to 25% faster than Montgomery
  3842 // multiplication.  However, its loop control is more complex and it
  3843 // may actually run slower on some machines.
  3845 static void __attribute__((noinline))
  3846 montgomery_square(unsigned long a[], unsigned long n[],
  3847                   unsigned long m[], unsigned long inv, int len) {
  3848   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  3849   int i;
  3851   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  3853   for (i = 0; i < len; i++) {
  3854     int j;
  3855     int end = (i+1)/2;
  3856     for (j = 0; j < end; j++) {
  3857       MACC2(a[j], a[i-j], t0, t1, t2);
  3858       MACC(m[j], n[i-j], t0, t1, t2);
  3860     if ((i & 1) == 0) {
  3861       MACC(a[j], a[j], t0, t1, t2);
  3863     for (; j < i; j++) {
  3864       MACC(m[j], n[i-j], t0, t1, t2);
  3866     m[i] = t0 * inv;
  3867     MACC(m[i], n[0], t0, t1, t2);
  3869     assert(t0 == 0, "broken Montgomery square");
  3871     t0 = t1; t1 = t2; t2 = 0;
  3874   for (i = len; i < 2*len; i++) {
  3875     int start = i-len+1;
  3876     int end = start + (len - start)/2;
  3877     int j;
  3878     for (j = start; j < end; j++) {
  3879       MACC2(a[j], a[i-j], t0, t1, t2);
  3880       MACC(m[j], n[i-j], t0, t1, t2);
  3882     if ((i & 1) == 0) {
  3883       MACC(a[j], a[j], t0, t1, t2);
  3885     for (; j < len; j++) {
  3886       MACC(m[j], n[i-j], t0, t1, t2);
  3888     m[i-len] = t0;
  3889     t0 = t1; t1 = t2; t2 = 0;
  3892   while (t0)
  3893     t0 = sub(m, n, t0, len);
  3896 // Swap words in a longword.
  3897 static unsigned long swap(unsigned long x) {
  3898   return (x << 32) | (x >> 32);
  3901 // Copy len longwords from s to d, word-swapping as we go.  The
  3902 // destination array is reversed.
  3903 static void reverse_words(unsigned long *s, unsigned long *d, int len) {
  3904   d += len;
  3905   while(len-- > 0) {
  3906     d--;
  3907     *d = swap(*s);
  3908     s++;
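// Illustrative example: with len == 2 and
//   s = { 0x0000000100000002, 0x0000000300000004 }
// the routine above produces
//   d = { 0x0000000400000003, 0x0000000200000001 }
// i.e. each longword has its 32-bit halves swapped and the array order is
// reversed; the callers below use this to adapt the jint arrays they receive
// to the longword order the multiply routines expect.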
  3912 // The threshold at which squaring is advantageous was determined
  3913 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
  3914 // Doesn't seem to be relevant for MIPS64 so we use the same value.
  3915 #define MONTGOMERY_SQUARING_THRESHOLD 64
  3917 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
  3918                                         jint len, jlong inv,
  3919                                         jint *m_ints) {
  3920   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  3921   int longwords = len/2;
  3923   // Make very sure we don't use so much space that the stack might
  3924   // overflow.  512 jints corresponds to a 16384-bit integer and
  3925   // will use here a total of 8k bytes of stack space.
  3926   int total_allocation = longwords * sizeof (unsigned long) * 4;
  3927   guarantee(total_allocation <= 8192, "must be");
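  // Worked arithmetic for the limit above: 512 jints -> longwords == 256, and
  // 4 scratch arrays * 256 longwords * 8 bytes == 8192 bytes, exactly the
  // guaranteed maximum.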
  3928   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  3930   // Local scratch arrays
  3931   unsigned long
  3932     *a = scratch + 0 * longwords,
  3933     *b = scratch + 1 * longwords,
  3934     *n = scratch + 2 * longwords,
  3935     *m = scratch + 3 * longwords;
  3937   reverse_words((unsigned long *)a_ints, a, longwords);
  3938   reverse_words((unsigned long *)b_ints, b, longwords);
  3939   reverse_words((unsigned long *)n_ints, n, longwords);
  3941   ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
  3943   reverse_words(m, (unsigned long *)m_ints, longwords);
  3946 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
  3947                                       jint len, jlong inv,
  3948                                       jint *m_ints) {
  3949   assert(len % 2 == 0, "array length in montgomery_square must be even");
  3950   int longwords = len/2;
  3952   // Make very sure we don't use so much space that the stack might
  3953   // overflow.  512 jints corresponds to a 16384-bit integer and
  3954   // will use here a total of 6k bytes of stack space.
  3955   int total_allocation = longwords * sizeof (unsigned long) * 3;
  3956   guarantee(total_allocation <= 8192, "must be");
  3957   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  3959   // Local scratch arrays
  3960   unsigned long
  3961     *a = scratch + 0 * longwords,
  3962     *n = scratch + 1 * longwords,
  3963     *m = scratch + 2 * longwords;
  3965   reverse_words((unsigned long *)a_ints, a, longwords);
  3966   reverse_words((unsigned long *)n_ints, n, longwords);
  3968   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
  3969     ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
  3970   } else {
  3971     ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
  3974   reverse_words(m, (unsigned long *)m_ints, longwords);
