src/cpu/mips/vm/sharedRuntime_mips_64.cpp

author       huangjia
date         Mon, 18 Nov 2019 10:41:48 +0800
changeset    9759:8c71022cf5f3
parent       9705:0b27fc8adf1b
child        9932:86ea9a02a717
permissions  -rw-r--r--

#10052 Backport of #9904 compiler/floatingpoint/TestFloatSyncJNIArgs.java failed
Reviewed-by: aoqi

     1 /*
     2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2015, 2019, Loongson Technology. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/macroAssembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "code/debugInfoRec.hpp"
    30 #include "code/icBuffer.hpp"
    31 #include "code/vtableStubs.hpp"
    32 #include "interpreter/interpreter.hpp"
    33 #include "oops/compiledICHolder.hpp"
    34 #include "prims/jvmtiRedefineClassesTrace.hpp"
    35 #include "runtime/sharedRuntime.hpp"
    36 #include "runtime/vframeArray.hpp"
    37 #include "vmreg_mips.inline.hpp"
    38 #ifdef COMPILER1
    39 #include "c1/c1_Runtime1.hpp"
    40 #endif
    41 #ifdef COMPILER2
    42 #include "opto/runtime.hpp"
    43 #endif
    45 #include <alloca.h>
    47 #define __ masm->
    49 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
    51 class RegisterSaver {
    52   enum { FPU_regs_live = 32 };
    53   // Capture info about frame layout
    54   enum layout {
    55 #define DEF_LAYOUT_OFFS(regname)  regname ## _off,  regname ## H_off,
    56     DEF_LAYOUT_OFFS(for_16_bytes_aligned)
    57     DEF_LAYOUT_OFFS(fpr0)
    58     DEF_LAYOUT_OFFS(fpr1)
    59     DEF_LAYOUT_OFFS(fpr2)
    60     DEF_LAYOUT_OFFS(fpr3)
    61     DEF_LAYOUT_OFFS(fpr4)
    62     DEF_LAYOUT_OFFS(fpr5)
    63     DEF_LAYOUT_OFFS(fpr6)
    64     DEF_LAYOUT_OFFS(fpr7)
    65     DEF_LAYOUT_OFFS(fpr8)
    66     DEF_LAYOUT_OFFS(fpr9)
    67     DEF_LAYOUT_OFFS(fpr10)
    68     DEF_LAYOUT_OFFS(fpr11)
    69     DEF_LAYOUT_OFFS(fpr12)
    70     DEF_LAYOUT_OFFS(fpr13)
    71     DEF_LAYOUT_OFFS(fpr14)
    72     DEF_LAYOUT_OFFS(fpr15)
    73     DEF_LAYOUT_OFFS(fpr16)
    74     DEF_LAYOUT_OFFS(fpr17)
    75     DEF_LAYOUT_OFFS(fpr18)
    76     DEF_LAYOUT_OFFS(fpr19)
    77     DEF_LAYOUT_OFFS(fpr20)
    78     DEF_LAYOUT_OFFS(fpr21)
    79     DEF_LAYOUT_OFFS(fpr22)
    80     DEF_LAYOUT_OFFS(fpr23)
    81     DEF_LAYOUT_OFFS(fpr24)
    82     DEF_LAYOUT_OFFS(fpr25)
    83     DEF_LAYOUT_OFFS(fpr26)
    84     DEF_LAYOUT_OFFS(fpr27)
    85     DEF_LAYOUT_OFFS(fpr28)
    86     DEF_LAYOUT_OFFS(fpr29)
    87     DEF_LAYOUT_OFFS(fpr30)
    88     DEF_LAYOUT_OFFS(fpr31)
    90     DEF_LAYOUT_OFFS(v0)
    91     DEF_LAYOUT_OFFS(v1)
    92     DEF_LAYOUT_OFFS(a0)
    93     DEF_LAYOUT_OFFS(a1)
    94     DEF_LAYOUT_OFFS(a2)
    95     DEF_LAYOUT_OFFS(a3)
    96     DEF_LAYOUT_OFFS(a4)
    97     DEF_LAYOUT_OFFS(a5)
    98     DEF_LAYOUT_OFFS(a6)
    99     DEF_LAYOUT_OFFS(a7)
   100     DEF_LAYOUT_OFFS(t0)
   101     DEF_LAYOUT_OFFS(t1)
   102     DEF_LAYOUT_OFFS(t2)
   103     DEF_LAYOUT_OFFS(t3)
   104     DEF_LAYOUT_OFFS(s0)
   105     DEF_LAYOUT_OFFS(s1)
   106     DEF_LAYOUT_OFFS(s2)
   107     DEF_LAYOUT_OFFS(s3)
   108     DEF_LAYOUT_OFFS(s4)
   109     DEF_LAYOUT_OFFS(s5)
   110     DEF_LAYOUT_OFFS(s6)
   111     DEF_LAYOUT_OFFS(s7)
   112     DEF_LAYOUT_OFFS(t8)
   113     DEF_LAYOUT_OFFS(t9)
   115     DEF_LAYOUT_OFFS(gp)
   116     DEF_LAYOUT_OFFS(fp)
   117     DEF_LAYOUT_OFFS(return)
   118     reg_save_size
   119   };
   121   public:
    123   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
   124   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
   125   static int raOffset(void) { return return_off / 2; }
   126   //Rmethod
   127   static int methodOffset(void) { return s3_off / 2; }
   129   static int v0Offset(void) { return v0_off / 2; }
   130   static int v1Offset(void) { return v1_off / 2; }
   132   static int fpResultOffset(void) { return fpr0_off / 2; }
    134   // During deoptimization only the result registers need to be restored;
   135   // all the other values have already been extracted.
   136   static void restore_result_registers(MacroAssembler* masm);
   137 };
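// A hedged illustration of the layout above (mechanics only, not meant as an
// exhaustive map): each DEF_LAYOUT_OFFS(regname) expands to two consecutive
// 32-bit slot indices, regname_off and regnameH_off, so every saved register
// owns one 64-bit (two-slot) cell in the save area; that is why the accessors
// above divide by two to turn a slot index into a word index, e.g.
//
//   DEF_LAYOUT_OFFS(v0)  =>  v0_off, v0H_off,
//   raOffset()           ==  return_off / 2   // word offset of the saved RA
//   methodOffset()       ==  s3_off / 2       // S3 serves as Rmethod here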
   139 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors ) {
   141   // Always make the frame size 16-byte aligned
   142   int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
   143                                      reg_save_size*BytesPerInt, 16);
   144   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
   145   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
   146   // The caller will allocate additional_frame_words
   147   int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
   148   // CodeBlob frame size is in words.
   149   int frame_size_in_words = frame_size_in_bytes / wordSize;
   150   *total_frame_words = frame_size_in_words;
   152   // save registers
   154   __ daddiu(SP, SP, - reg_save_size * jintSize);
   156   __ sdc1(F0, SP, fpr0_off * jintSize); __ sdc1(F1, SP, fpr1_off * jintSize);
   157   __ sdc1(F2, SP, fpr2_off * jintSize); __ sdc1(F3, SP, fpr3_off * jintSize);
   158   __ sdc1(F4, SP, fpr4_off * jintSize); __ sdc1(F5, SP, fpr5_off * jintSize);
   159   __ sdc1(F6, SP, fpr6_off * jintSize);  __ sdc1(F7, SP, fpr7_off * jintSize);
   160   __ sdc1(F8, SP, fpr8_off * jintSize);  __ sdc1(F9, SP, fpr9_off * jintSize);
   161   __ sdc1(F10, SP, fpr10_off * jintSize);  __ sdc1(F11, SP, fpr11_off * jintSize);
   162   __ sdc1(F12, SP, fpr12_off * jintSize);  __ sdc1(F13, SP, fpr13_off * jintSize);
   163   __ sdc1(F14, SP, fpr14_off * jintSize);  __ sdc1(F15, SP, fpr15_off * jintSize);
   164   __ sdc1(F16, SP, fpr16_off * jintSize);  __ sdc1(F17, SP, fpr17_off * jintSize);
   165   __ sdc1(F18, SP, fpr18_off * jintSize);  __ sdc1(F19, SP, fpr19_off * jintSize);
   166   __ sdc1(F20, SP, fpr20_off * jintSize);  __ sdc1(F21, SP, fpr21_off * jintSize);
   167   __ sdc1(F22, SP, fpr22_off * jintSize);  __ sdc1(F23, SP, fpr23_off * jintSize);
   168   __ sdc1(F24, SP, fpr24_off * jintSize);  __ sdc1(F25, SP, fpr25_off * jintSize);
   169   __ sdc1(F26, SP, fpr26_off * jintSize);  __ sdc1(F27, SP, fpr27_off * jintSize);
   170   __ sdc1(F28, SP, fpr28_off * jintSize);  __ sdc1(F29, SP, fpr29_off * jintSize);
   171   __ sdc1(F30, SP, fpr30_off * jintSize);  __ sdc1(F31, SP, fpr31_off * jintSize);
   172   __ sd(V0, SP, v0_off * jintSize);  __ sd(V1, SP, v1_off * jintSize);
   173   __ sd(A0, SP, a0_off * jintSize);  __ sd(A1, SP, a1_off * jintSize);
   174   __ sd(A2, SP, a2_off * jintSize);  __ sd(A3, SP, a3_off * jintSize);
   175   __ sd(A4, SP, a4_off * jintSize);  __ sd(A5, SP, a5_off * jintSize);
   176   __ sd(A6, SP, a6_off * jintSize);  __ sd(A7, SP, a7_off * jintSize);
   177   __ sd(T0, SP, t0_off * jintSize);
   178   __ sd(T1, SP, t1_off * jintSize);
   179   __ sd(T2, SP, t2_off * jintSize);
   180   __ sd(T3, SP, t3_off * jintSize);
   181   __ sd(S0, SP, s0_off * jintSize);
   182   __ sd(S1, SP, s1_off * jintSize);
   183   __ sd(S2, SP, s2_off * jintSize);
   184   __ sd(S3, SP, s3_off * jintSize);
   185   __ sd(S4, SP, s4_off * jintSize);
   186   __ sd(S5, SP, s5_off * jintSize);
   187   __ sd(S6, SP, s6_off * jintSize);
   188   __ sd(S7, SP, s7_off * jintSize);
   190   __ sd(T8, SP, t8_off * jintSize);
   191   __ sd(T9, SP, t9_off * jintSize);
   193   __ sd(GP, SP, gp_off * jintSize);
   194   __ sd(FP, SP, fp_off * jintSize);
   195   __ sd(RA, SP, return_off * jintSize);
   196   __ daddi(FP, SP, fp_off * jintSize);
   198   OopMapSet *oop_maps = new OopMapSet();
   199   //OopMap* map =  new OopMap( frame_words, 0 );
   200   OopMap* map =  new OopMap( frame_size_in_slots, 0 );
   203 //#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_words)
   204 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
   205   map->set_callee_saved(STACK_OFFSET( v0_off), V0->as_VMReg());
   206   map->set_callee_saved(STACK_OFFSET( v1_off), V1->as_VMReg());
   207   map->set_callee_saved(STACK_OFFSET( a0_off), A0->as_VMReg());
   208   map->set_callee_saved(STACK_OFFSET( a1_off), A1->as_VMReg());
   209   map->set_callee_saved(STACK_OFFSET( a2_off), A2->as_VMReg());
   210   map->set_callee_saved(STACK_OFFSET( a3_off), A3->as_VMReg());
   211   map->set_callee_saved(STACK_OFFSET( a4_off), A4->as_VMReg());
   212   map->set_callee_saved(STACK_OFFSET( a5_off), A5->as_VMReg());
   213   map->set_callee_saved(STACK_OFFSET( a6_off), A6->as_VMReg());
   214   map->set_callee_saved(STACK_OFFSET( a7_off), A7->as_VMReg());
   215   map->set_callee_saved(STACK_OFFSET( t0_off), T0->as_VMReg());
   216   map->set_callee_saved(STACK_OFFSET( t1_off), T1->as_VMReg());
   217   map->set_callee_saved(STACK_OFFSET( t2_off), T2->as_VMReg());
   218   map->set_callee_saved(STACK_OFFSET( t3_off), T3->as_VMReg());
   219   map->set_callee_saved(STACK_OFFSET( s0_off), S0->as_VMReg());
   220   map->set_callee_saved(STACK_OFFSET( s1_off), S1->as_VMReg());
   221   map->set_callee_saved(STACK_OFFSET( s2_off), S2->as_VMReg());
   222   map->set_callee_saved(STACK_OFFSET( s3_off), S3->as_VMReg());
   223   map->set_callee_saved(STACK_OFFSET( s4_off), S4->as_VMReg());
   224   map->set_callee_saved(STACK_OFFSET( s5_off), S5->as_VMReg());
   225   map->set_callee_saved(STACK_OFFSET( s6_off), S6->as_VMReg());
   226   map->set_callee_saved(STACK_OFFSET( s7_off), S7->as_VMReg());
   227   map->set_callee_saved(STACK_OFFSET( t8_off), T8->as_VMReg());
   228   map->set_callee_saved(STACK_OFFSET( t9_off), T9->as_VMReg());
   229   map->set_callee_saved(STACK_OFFSET( gp_off), GP->as_VMReg());
   230   map->set_callee_saved(STACK_OFFSET( fp_off), FP->as_VMReg());
   231   map->set_callee_saved(STACK_OFFSET( return_off), RA->as_VMReg());
   233   map->set_callee_saved(STACK_OFFSET( fpr0_off), F0->as_VMReg());
   234   map->set_callee_saved(STACK_OFFSET( fpr1_off), F1->as_VMReg());
   235   map->set_callee_saved(STACK_OFFSET( fpr2_off), F2->as_VMReg());
   236   map->set_callee_saved(STACK_OFFSET( fpr3_off), F3->as_VMReg());
   237   map->set_callee_saved(STACK_OFFSET( fpr4_off), F4->as_VMReg());
   238   map->set_callee_saved(STACK_OFFSET( fpr5_off), F5->as_VMReg());
   239   map->set_callee_saved(STACK_OFFSET( fpr6_off), F6->as_VMReg());
   240   map->set_callee_saved(STACK_OFFSET( fpr7_off), F7->as_VMReg());
   241   map->set_callee_saved(STACK_OFFSET( fpr8_off), F8->as_VMReg());
   242   map->set_callee_saved(STACK_OFFSET( fpr9_off), F9->as_VMReg());
   243   map->set_callee_saved(STACK_OFFSET( fpr10_off), F10->as_VMReg());
   244   map->set_callee_saved(STACK_OFFSET( fpr11_off), F11->as_VMReg());
   245   map->set_callee_saved(STACK_OFFSET( fpr12_off), F12->as_VMReg());
   246   map->set_callee_saved(STACK_OFFSET( fpr13_off), F13->as_VMReg());
   247   map->set_callee_saved(STACK_OFFSET( fpr14_off), F14->as_VMReg());
   248   map->set_callee_saved(STACK_OFFSET( fpr15_off), F15->as_VMReg());
   249   map->set_callee_saved(STACK_OFFSET( fpr16_off), F16->as_VMReg());
   250   map->set_callee_saved(STACK_OFFSET( fpr17_off), F17->as_VMReg());
   251   map->set_callee_saved(STACK_OFFSET( fpr18_off), F18->as_VMReg());
   252   map->set_callee_saved(STACK_OFFSET( fpr19_off), F19->as_VMReg());
   253   map->set_callee_saved(STACK_OFFSET( fpr20_off), F20->as_VMReg());
   254   map->set_callee_saved(STACK_OFFSET( fpr21_off), F21->as_VMReg());
   255   map->set_callee_saved(STACK_OFFSET( fpr22_off), F22->as_VMReg());
   256   map->set_callee_saved(STACK_OFFSET( fpr23_off), F23->as_VMReg());
   257   map->set_callee_saved(STACK_OFFSET( fpr24_off), F24->as_VMReg());
   258   map->set_callee_saved(STACK_OFFSET( fpr25_off), F25->as_VMReg());
   259   map->set_callee_saved(STACK_OFFSET( fpr26_off), F26->as_VMReg());
   260   map->set_callee_saved(STACK_OFFSET( fpr27_off), F27->as_VMReg());
   261   map->set_callee_saved(STACK_OFFSET( fpr28_off), F28->as_VMReg());
   262   map->set_callee_saved(STACK_OFFSET( fpr29_off), F29->as_VMReg());
   263   map->set_callee_saved(STACK_OFFSET( fpr30_off), F30->as_VMReg());
   264   map->set_callee_saved(STACK_OFFSET( fpr31_off), F31->as_VMReg());
   266 #undef STACK_OFFSET
   267   return map;
   268 }
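// A small worked example of the STACK_OFFSET mapping above, assuming
// additional_frame_words == 0 (illustrative only): V0 is stored at
// SP + v0_off * jintSize, and the matching OopMap entry is
//
//   STACK_OFFSET(v0_off) == VMRegImpl::stack2reg(v0_off + additional_frame_slots)
//                        == VMRegImpl::stack2reg(v0_off)
//
// If the caller allocates extra words on top of this frame, every recorded
// slot is shifted by additional_frame_slots so the map still points at the
// saved value.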
   271 // Pop the current frame and restore all the registers that we
   272 // saved.
   273 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   274   __ ldc1(F0, SP, fpr0_off * jintSize); __ ldc1(F1, SP, fpr1_off * jintSize);
   275   __ ldc1(F2, SP, fpr2_off * jintSize); __ ldc1(F3, SP, fpr3_off * jintSize);
   276   __ ldc1(F4, SP, fpr4_off * jintSize); __ ldc1(F5, SP, fpr5_off * jintSize);
   277   __ ldc1(F6, SP, fpr6_off * jintSize);  __ ldc1(F7, SP, fpr7_off * jintSize);
   278   __ ldc1(F8, SP, fpr8_off * jintSize);  __ ldc1(F9, SP, fpr9_off * jintSize);
   279   __ ldc1(F10, SP, fpr10_off * jintSize);  __ ldc1(F11, SP, fpr11_off * jintSize);
   280   __ ldc1(F12, SP, fpr12_off * jintSize);  __ ldc1(F13, SP, fpr13_off * jintSize);
   281   __ ldc1(F14, SP, fpr14_off * jintSize);  __ ldc1(F15, SP, fpr15_off * jintSize);
   282   __ ldc1(F16, SP, fpr16_off * jintSize);  __ ldc1(F17, SP, fpr17_off * jintSize);
   283   __ ldc1(F18, SP, fpr18_off * jintSize);  __ ldc1(F19, SP, fpr19_off * jintSize);
   284   __ ldc1(F20, SP, fpr20_off * jintSize);  __ ldc1(F21, SP, fpr21_off * jintSize);
   285   __ ldc1(F22, SP, fpr22_off * jintSize);  __ ldc1(F23, SP, fpr23_off * jintSize);
   286   __ ldc1(F24, SP, fpr24_off * jintSize);  __ ldc1(F25, SP, fpr25_off * jintSize);
   287   __ ldc1(F26, SP, fpr26_off * jintSize);  __ ldc1(F27, SP, fpr27_off * jintSize);
   288   __ ldc1(F28, SP, fpr28_off * jintSize);  __ ldc1(F29, SP, fpr29_off * jintSize);
   289   __ ldc1(F30, SP, fpr30_off * jintSize);  __ ldc1(F31, SP, fpr31_off * jintSize);
   291   __ ld(V0, SP, v0_off * jintSize);  __ ld(V1, SP, v1_off * jintSize);
   292   __ ld(A0, SP, a0_off * jintSize);  __ ld(A1, SP, a1_off * jintSize);
   293   __ ld(A2, SP, a2_off * jintSize);  __ ld(A3, SP, a3_off * jintSize);
   294   __ ld(A4, SP, a4_off * jintSize);  __ ld(A5, SP, a5_off * jintSize);
   295   __ ld(A6, SP, a6_off * jintSize);  __ ld(A7, SP, a7_off * jintSize);
   296   __ ld(T0, SP, t0_off * jintSize);
   297   __ ld(T1, SP, t1_off * jintSize);
   298   __ ld(T2, SP, t2_off * jintSize);
   299   __ ld(T3, SP, t3_off * jintSize);
   300   __ ld(S0, SP, s0_off * jintSize);
   301   __ ld(S1, SP, s1_off * jintSize);
   302   __ ld(S2, SP, s2_off * jintSize);
   303   __ ld(S3, SP, s3_off * jintSize);
   304   __ ld(S4, SP, s4_off * jintSize);
   305   __ ld(S5, SP, s5_off * jintSize);
   306   __ ld(S6, SP, s6_off * jintSize);
   307   __ ld(S7, SP, s7_off * jintSize);
   309   __ ld(T8, SP, t8_off * jintSize);
   310   __ ld(T9, SP, t9_off * jintSize);
   312   __ ld(GP, SP, gp_off * jintSize);
   313   __ ld(FP, SP, fp_off * jintSize);
   314   __ ld(RA, SP, return_off * jintSize);
   316   __ addiu(SP, SP, reg_save_size * jintSize);
   317 }
   319 // Pop the current frame and restore the registers that might be holding
   320 // a result.
    321 // FIXME: what if the result is a float?
   322 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
   324   // Just restore result register. Only used by deoptimization. By
    325   // now any callee save register that needs to be restored to a c2
   326   // caller of the deoptee has been extracted into the vframeArray
   327   // and will be stuffed into the c2i adapter we create for later
   328   // restoration so only result registers need to be restored here.
   330   __ ld(V0, SP, v0_off * jintSize);
   331   __ ld(V1, SP, v1_off * jintSize);
   332   __ addiu(SP, SP, return_off * jintSize);
   333 }
    335 // Is the vector's size (in bytes) bigger than the size saved by default?
    336 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
   337 bool SharedRuntime::is_wide_vector(int size) {
   338   return size > 16;
   339 }
   341 // The java_calling_convention describes stack locations as ideal slots on
   342 // a frame with no abi restrictions. Since we must observe abi restrictions
   343 // (like the placement of the register window) the slots must be biased by
   344 // the following value.
   346 static int reg2offset_in(VMReg r) {
   347   // Account for saved fp and return address
   348   // This should really be in_preserve_stack_slots
   349   return (r->reg2stack() + 2 * VMRegImpl::slots_per_word) * VMRegImpl::stack_slot_size;  // + 2 * VMRegImpl::stack_slot_size);
   350 }
   352 static int reg2offset_out(VMReg r) {
   353   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
   354 }
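// Hedged example for the two helpers above: for a VMReg naming the first
// 32-bit stack slot (reg2stack() == 0), reg2offset_in() returns
// (0 + 2 * VMRegImpl::slots_per_word) * VMRegImpl::stack_slot_size, i.e. it
// skips the two words holding the saved FP and return address in the incoming
// (FP-relative) frame, while reg2offset_out() only adds
// out_preserve_stack_slots() and addresses the slot relative to our own
// outgoing SP area.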
   356 // ---------------------------------------------------------------------------
   357 // Read the array of BasicTypes from a signature, and compute where the
   358 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
   359 // quantities.  Values less than SharedInfo::stack0 are registers, those above
   360 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
   361 // as framesizes are fixed.
   362 // VMRegImpl::stack0 refers to the first slot 0(sp).
    363 // and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher.  Registers
    364 // up to RegisterImpl::number_of_registers are the 32-bit
   365 // integer registers.
   367 // Pass first five oop/int args in registers T0, A0 - A3.
   368 // Pass float/double/long args in stack.
   369 // Doubles have precedence, so if you pass a mix of floats and doubles
   370 // the doubles will grab the registers before the floats will.
   372 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
   373 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
   374 // units regardless of build.
   377 // ---------------------------------------------------------------------------
   378 // The compiled Java calling convention.
   379 // Pass first five oop/int args in registers T0, A0 - A3.
   380 // Pass float/double/long args in stack.
   381 // Doubles have precedence, so if you pass a mix of floats and doubles
   382 // the doubles will grab the registers before the floats will.
   384 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
   385                                            VMRegPair *regs,
   386                                            int total_args_passed,
   387                                            int is_outgoing) {
   389   // Create the mapping between argument positions and
   390   // registers.
   391   //static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
   392   static const Register INT_ArgReg[Argument::n_register_parameters + 1] = {
   393     T0, A0, A1, A2, A3, A4, A5, A6, A7
   394   };
   395   //static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
   396   static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters] = {
   397     F12, F13, F14, F15, F16, F17, F18, F19
   398   };
   401   uint args = 0;
   402   uint stk_args = 0; // inc by 2 each time
   404   for (int i = 0; i < total_args_passed; i++) {
   405     switch (sig_bt[i]) {
   406     case T_VOID:
   407       // halves of T_LONG or T_DOUBLE
   408       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
   409       regs[i].set_bad();
   410       break;
   411     case T_BOOLEAN:
   412     case T_CHAR:
   413     case T_BYTE:
   414     case T_SHORT:
   415     case T_INT:
   416       if (args < Argument::n_register_parameters) {
   417         regs[i].set1(INT_ArgReg[args++]->as_VMReg());
   418       } else {
   419         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   420         stk_args += 2;
   421       }
   422       break;
   423     case T_LONG:
   424       assert(sig_bt[i + 1] == T_VOID, "expecting half");
   425       // fall through
   426     case T_OBJECT:
   427     case T_ARRAY:
   428     case T_ADDRESS:
   429       if (args < Argument::n_register_parameters) {
   430         regs[i].set2(INT_ArgReg[args++]->as_VMReg());
   431       } else {
   432         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   433         stk_args += 2;
   434       }
   435       break;
   436     case T_FLOAT:
   437       if (args < Argument::n_float_register_parameters) {
   438         regs[i].set1(FP_ArgReg[args++]->as_VMReg());
   439       } else {
   440         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   441         stk_args += 2;
   442       }
   443       break;
   444     case T_DOUBLE:
   445       assert(sig_bt[i + 1] == T_VOID, "expecting half");
   446       if (args < Argument::n_float_register_parameters) {
   447         regs[i].set2(FP_ArgReg[args++]->as_VMReg());
   448       } else {
   449         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   450         stk_args += 2;
   451       }
   452       break;
   453     default:
   454       ShouldNotReachHere();
   455       break;
   456     }
   457   }
   459   return round_to(stk_args, 2);
   460 }
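// Illustrative trace of the convention above for a hypothetical signature
// sig_bt = { T_OBJECT, T_INT, T_INT, T_LONG, T_VOID } (a receiver plus three
// Java arguments):
//
//   T_OBJECT -> regs[0].set2(T0)   // INT_ArgReg[0]
//   T_INT    -> regs[1].set1(A0)   // INT_ArgReg[1]
//   T_INT    -> regs[2].set1(A1)   // INT_ArgReg[2]
//   T_LONG   -> regs[3].set2(A2)   // INT_ArgReg[3]
//   T_VOID   -> regs[4].set_bad()  // second half of the long
//
// Everything fits in registers, so stk_args stays 0 and the function returns 0.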
   462 // Helper class mostly to avoid passing masm everywhere, and handle store
   463 // displacement overflow logic for LP64
   464 class AdapterGenerator {
   465   MacroAssembler *masm;
   466 #ifdef _LP64
   467   Register Rdisp;
   468   void set_Rdisp(Register r)  { Rdisp = r; }
   469 #endif // _LP64
   471   void patch_callers_callsite();
   473   // base+st_off points to top of argument
   474   int arg_offset(const int st_off) { return st_off; }
   475   int next_arg_offset(const int st_off) {
   476     return st_off - Interpreter::stackElementSize;
   477   }
   479 #ifdef _LP64
   480   // On _LP64 argument slot values are loaded first into a register
   481   // because they might not fit into displacement.
   482   Register arg_slot(const int st_off);
   483   Register next_arg_slot(const int st_off);
   484 #else
   485   int arg_slot(const int st_off)      { return arg_offset(st_off); }
   486   int next_arg_slot(const int st_off) { return next_arg_offset(st_off); }
   487 #endif // _LP64
   489   // Stores long into offset pointed to by base
   490   void store_c2i_long(Register r, Register base,
   491                       const int st_off, bool is_stack);
   492   void store_c2i_object(Register r, Register base,
   493                         const int st_off);
   494   void store_c2i_int(Register r, Register base,
   495                      const int st_off);
   496   void store_c2i_double(VMReg r_2,
   497                         VMReg r_1, Register base, const int st_off);
   498   void store_c2i_float(FloatRegister f, Register base,
   499                        const int st_off);
   501  public:
   502   //void tag_stack(const BasicType sig, int st_off);
   503   void gen_c2i_adapter(int total_args_passed,
   504                               // VMReg max_arg,
   505                               int comp_args_on_stack, // VMRegStackSlots
   506                               const BasicType *sig_bt,
   507                               const VMRegPair *regs,
   508                               Label& skip_fixup);
   509   void gen_i2c_adapter(int total_args_passed,
   510                               // VMReg max_arg,
   511                               int comp_args_on_stack, // VMRegStackSlots
   512                               const BasicType *sig_bt,
   513                               const VMRegPair *regs);
   515   AdapterGenerator(MacroAssembler *_masm) : masm(_masm) {}
   516 };
    519 // Patch the caller's call site with the entry to compiled code, if it exists.
   520 void AdapterGenerator::patch_callers_callsite() {
   521   Label L;
   522   __ verify_oop(Rmethod);
   523   __ ld_ptr(AT, Rmethod, in_bytes(Method::code_offset()));
   524   __ beq(AT, R0, L);
   525   __ delayed()->nop();
   526   // Schedule the branch target address early.
    527   // Call into the VM to patch the caller, then jump to the compiled callee.
    528   // V0 isn't live, so capture the return address while we easily can.
   529   __ move(V0, RA);
   531   __ pushad();
   532 #ifdef COMPILER2
   533   // C2 may leave the stack dirty if not in SSE2+ mode
   534   __ empty_FPU_stack();
   535 #endif
   537   // VM needs caller's callsite
   538   // VM needs target method
   540   __ move(A0, Rmethod);
   541   __ move(A1, V0);
   542   // we should preserve the return address
   543   __ verify_oop(Rmethod);
   544   __ move(S0, SP);
   545   __ move(AT, -(StackAlignmentInBytes));   // align the stack
   546   __ andr(SP, SP, AT);
   547   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite),
   548           relocInfo::runtime_call_type);
   550   __ delayed()->nop();
   551   __ move(SP, S0);
   552   __ popad();
   553   __ bind(L);
   554 }
   556 #ifdef _LP64
   557 Register AdapterGenerator::arg_slot(const int st_off) {
   558   Unimplemented();
   559 }
   561 Register AdapterGenerator::next_arg_slot(const int st_off){
   562   Unimplemented();
   563 }
   564 #endif // _LP64
   566 // Stores long into offset pointed to by base
   567 void AdapterGenerator::store_c2i_long(Register r, Register base,
   568                                       const int st_off, bool is_stack) {
   569   Unimplemented();
   570 }
   572 void AdapterGenerator::store_c2i_object(Register r, Register base,
   573                                         const int st_off) {
   574   Unimplemented();
   575 }
   577 void AdapterGenerator::store_c2i_int(Register r, Register base,
   578                                      const int st_off) {
   579   Unimplemented();
   580 }
   582 // Stores into offset pointed to by base
   583 void AdapterGenerator::store_c2i_double(VMReg r_2,
   584                       VMReg r_1, Register base, const int st_off) {
   585   Unimplemented();
   586 }
   588 void AdapterGenerator::store_c2i_float(FloatRegister f, Register base,
   589                                        const int st_off) {
   590   Unimplemented();
   591 }
   593 void AdapterGenerator::gen_c2i_adapter(
   594                             int total_args_passed,
   595                             // VMReg max_arg,
   596                             int comp_args_on_stack, // VMRegStackSlots
   597                             const BasicType *sig_bt,
   598                             const VMRegPair *regs,
   599                             Label& skip_fixup) {
   601   // Before we get into the guts of the C2I adapter, see if we should be here
   602   // at all.  We've come from compiled code and are attempting to jump to the
   603   // interpreter, which means the caller made a static call to get here
   604   // (vcalls always get a compiled target if there is one).  Check for a
   605   // compiled target.  If there is one, we need to patch the caller's call.
   606   // However we will run interpreted if we come thru here. The next pass
   607   // thru the call site will run compiled. If we ran compiled here then
    608   // we can (theoretically) do endless i2c->c2i->i2c transitions during
   609   // deopt/uncommon trap cycles. If we always go interpreted here then
   610   // we can have at most one and don't need to play any tricks to keep
   611   // from endlessly growing the stack.
   612   //
   613   // Actually if we detected that we had an i2c->c2i transition here we
   614   // ought to be able to reset the world back to the state of the interpreted
   615   // call and not bother building another interpreter arg area. We don't
   616   // do that at this point.
   618   patch_callers_callsite();
   620   __ bind(skip_fixup);
   622 #ifdef COMPILER2
   623   __ empty_FPU_stack();
   624 #endif
   625   //this is for native ?
    626   // Since all args are passed on the stack,
    627   // total_args_passed * Interpreter::stackElementSize is the
    628   // space we need.
   629   int extraspace = total_args_passed * Interpreter::stackElementSize;
   631   // stack is aligned, keep it that way
   632   extraspace = round_to(extraspace, 2*wordSize);
   634   // Get return address
   635   __ move(V0, RA);
   636   // set senderSP value
   637   //refer to interpreter_mips.cpp:generate_asm_entry
   638   __ move(Rsender, SP);
   639   __ addi(SP, SP, -extraspace);
   641   // Now write the args into the outgoing interpreter space
   642   for (int i = 0; i < total_args_passed; i++) {
   643     if (sig_bt[i] == T_VOID) {
   644       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   645       continue;
   646     }
   648     // st_off points to lowest address on stack.
   649     int st_off = ((total_args_passed - 1) - i) * Interpreter::stackElementSize;
   650     // Say 4 args:
   651     // i   st_off
   652     // 0   12 T_LONG
   653     // 1    8 T_VOID
   654     // 2    4 T_OBJECT
   655     // 3    0 T_BOOL
   656     VMReg r_1 = regs[i].first();
   657     VMReg r_2 = regs[i].second();
   658     if (!r_1->is_valid()) {
   659       assert(!r_2->is_valid(), "");
   660       continue;
   661     }
   662     if (r_1->is_stack()) {
   663       // memory to memory use fpu stack top
   664       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
   665       if (!r_2->is_valid()) {
   666         __ ld_ptr(AT, SP, ld_off);
   667         __ st_ptr(AT, SP, st_off);
   669       } else {
   672         int next_off = st_off - Interpreter::stackElementSize;
   673         __ ld_ptr(AT, SP, ld_off);
   674         __ st_ptr(AT, SP, st_off);
   676         // Ref to is_Register condition
   677         if(sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE)
   678           __ st_ptr(AT, SP, st_off - 8);
   679       }
   680     } else if (r_1->is_Register()) {
   681       Register r = r_1->as_Register();
   682       if (!r_2->is_valid()) {
   683           __ sd(r, SP, st_off);
   684       } else {
   685         //FIXME, mips will not enter here
   686         // long/double in gpr
   687         __ sd(r, SP, st_off);
   688         // In [java/util/zip/ZipFile.java]
   689         //
   690         //    private static native long open(String name, int mode, long lastModified);
   691         //    private static native int getTotal(long jzfile);
   692         //
    693         // We need to transfer T_LONG parameters from a compiled method to a native method.
   694         // It's a complex process:
   695         //
   696         // Caller -> lir_static_call -> gen_resolve_stub
   697         //      -> -- resolve_static_call_C
   698         //         `- gen_c2i_adapter()  [*]
   699         //             |
    700         //       `- AdapterHandlerLibrary::get_create_adapter_index
   701         //      -> generate_native_entry
   702         //      -> InterpreterRuntime::SignatureHandlerGenerator::pass_long [**]
   703         //
   704         // In [**], T_Long parameter is stored in stack as:
   705         //
   706         //   (high)
   707         //    |         |
   708         //    -----------
   709         //    | 8 bytes |
   710         //    | (void)  |
   711         //    -----------
   712         //    | 8 bytes |
   713         //    | (long)  |
   714         //    -----------
   715         //    |         |
   716         //   (low)
   717         //
   718         // However, the sequence is reversed here:
   719         //
   720         //   (high)
   721         //    |         |
   722         //    -----------
   723         //    | 8 bytes |
   724         //    | (long)  |
   725         //    -----------
   726         //    | 8 bytes |
   727         //    | (void)  |
   728         //    -----------
   729         //    |         |
   730         //   (low)
   731         //
   732         // So I stored another 8 bytes in the T_VOID slot. It then can be accessed from generate_native_entry().
   733         //
   734         if (sig_bt[i] == T_LONG)
   735           __ sd(r, SP, st_off - 8);
   736       }
   737     } else if (r_1->is_FloatRegister()) {
   738       assert(sig_bt[i] == T_FLOAT || sig_bt[i] == T_DOUBLE, "Must be a float register");
   740       FloatRegister fr = r_1->as_FloatRegister();
   741       if (sig_bt[i] == T_FLOAT)
   742         __ swc1(fr, SP, st_off);
   743       else {
   744         __ sdc1(fr, SP, st_off);
   745         __ sdc1(fr, SP, st_off - 8);  // T_DOUBLE needs two slots
   746       }
   747     }
   748   }
   750   // Schedule the branch target address early.
   751   __ ld_ptr(AT, Rmethod, in_bytes(Method::interpreter_entry_offset()) );
   752   // And repush original return address
   753   __ move(RA, V0);
   754   __ jr (AT);
   755   __ delayed()->nop();
   756 }
   758 void AdapterGenerator::gen_i2c_adapter(
   759                                        int total_args_passed,
   760                                        // VMReg max_arg,
   761                                        int comp_args_on_stack, // VMRegStackSlots
   762                                        const BasicType *sig_bt,
   763                                        const VMRegPair *regs) {
   765   // Generate an I2C adapter: adjust the I-frame to make space for the C-frame
   766   // layout.  Lesp was saved by the calling I-frame and will be restored on
   767   // return.  Meanwhile, outgoing arg space is all owned by the callee
   768   // C-frame, so we can mangle it at will.  After adjusting the frame size,
   769   // hoist register arguments and repack other args according to the compiled
   770   // code convention.  Finally, end in a jump to the compiled code.  The entry
   771   // point address is the start of the buffer.
   773   // We will only enter here from an interpreted frame and never from after
   774   // passing thru a c2i. Azul allowed this but we do not. If we lose the
   775   // race and use a c2i we will remain interpreted for the race loser(s).
   776   // This removes all sorts of headaches on the mips side and also eliminates
   777   // the possibility of having c2i -> i2c -> c2i -> ... endless transitions.
   780   __ move(T9, SP);
   782   // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
   783   // in registers, we will occasionally have no stack args.
   784   int comp_words_on_stack = 0;
   785   if (comp_args_on_stack) {
   786     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
   787     // registers are below.  By subtracting stack0, we either get a negative
   788     // number (all values in registers) or the maximum stack slot accessed.
   789     // int comp_args_on_stack = VMRegImpl::reg2stack(max_arg);
   790     // Convert 4-byte stack slots to words.
   791     // did mips need round? FIXME  aoqi
   792     comp_words_on_stack = round_to(comp_args_on_stack*4, wordSize)>>LogBytesPerWord;
    793     // Round up to minimum stack alignment, in wordSize
   794     comp_words_on_stack = round_to(comp_words_on_stack, 2);
   795     __ daddi(SP, SP, -comp_words_on_stack * wordSize);
   796   }
   798   // Align the outgoing SP
   799   __ move(AT, -(StackAlignmentInBytes));
   800   __ andr(SP, SP, AT);
   801   // push the return address on the stack (note that pushing, rather
   802   // than storing it, yields the correct frame alignment for the callee)
   803   // Put saved SP in another register
   804   const Register saved_sp = V0;
   805   __ move(saved_sp, T9);
   808   // Will jump to the compiled code just as if compiled code was doing it.
   809   // Pre-load the register-jump target early, to schedule it better.
   810   __ ld(T9, Rmethod, in_bytes(Method::from_compiled_offset()));
   812   // Now generate the shuffle code.  Pick up all register args and move the
   813   // rest through the floating point stack top.
   814   for (int i = 0; i < total_args_passed; i++) {
   815     if (sig_bt[i] == T_VOID) {
   816       // Longs and doubles are passed in native word order, but misaligned
   817       // in the 32-bit build.
   818       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   819       continue;
   820     }
   822     // Pick up 0, 1 or 2 words from SP+offset.
   824     //FIXME. aoqi. just delete the assert
   825     //assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), "scrambled load targets?");
   826     // Load in argument order going down.
   827     int ld_off = (total_args_passed -1 - i)*Interpreter::stackElementSize;
   828     // Point to interpreter value (vs. tag)
   829     int next_off = ld_off - Interpreter::stackElementSize;
   830     VMReg r_1 = regs[i].first();
   831     VMReg r_2 = regs[i].second();
   832     if (!r_1->is_valid()) {
   833       assert(!r_2->is_valid(), "");
   834       continue;
   835     }
   836     if (r_1->is_stack()) {
   837       // Convert stack slot to an SP offset (+ wordSize to
   838       // account for return address )
   839       // NOTICE HERE!!!! I sub a wordSize here
   840       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size;
   841       //+ wordSize;
   843       if (!r_2->is_valid()) {
   844         __ ld(AT, saved_sp, ld_off);
   845         __ sd(AT, SP, st_off);
   846       } else {
   847         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
   848         // are accessed as negative so LSW is at LOW address
   850         // ld_off is MSW so get LSW
   851         // st_off is LSW (i.e. reg.first())
   853         // [./org/eclipse/swt/graphics/GC.java]
   854         // void drawImageXRender(Image srcImage, int srcX, int srcY, int srcWidth, int srcHeight,
   855         //  int destX, int destY, int destWidth, int destHeight,
   856         //  boolean simple,
   857         //  int imgWidth, int imgHeight,
   858         //  long maskPixmap,  <-- Pass T_LONG in stack
   859         //  int maskType);
   860         // Before this modification, Eclipse displays icons with solid black background.
   861         //
   862         __ ld(AT, saved_sp, ld_off);
   863         if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE)
   864           __ ld(AT, saved_sp, ld_off - 8);
   865         __ sd(AT, SP, st_off);
   866       }
   867     } else if (r_1->is_Register()) {  // Register argument
   868       Register r = r_1->as_Register();
   869       if (r_2->is_valid()) {
   870         // Remember r_1 is low address (and LSB on mips)
   871         // So r_2 gets loaded from high address regardless of the platform
   872         assert(r_2->as_Register() == r_1->as_Register(), "");
   873         __ ld(r, saved_sp, ld_off);
   875         //
   876         // For T_LONG type, the real layout is as below:
   877         //
   878         //   (high)
   879         //    |         |
   880         //    -----------
   881         //    | 8 bytes |
   882         //    | (void)  |
   883         //    -----------
   884         //    | 8 bytes |
   885         //    | (long)  |
   886         //    -----------
   887         //    |         |
   888         //   (low)
   889         //
   890         // We should load the low-8 bytes.
   891         //
   892         if (sig_bt[i] == T_LONG)
   893           __ ld(r, saved_sp, ld_off - 8);
   894       } else {
   895         __ lw(r, saved_sp, ld_off);
   896       }
   897     } else if (r_1->is_FloatRegister()) { // Float Register
   898       assert(sig_bt[i] == T_FLOAT || sig_bt[i] == T_DOUBLE, "Must be a float register");
   900       FloatRegister fr = r_1->as_FloatRegister();
   901       if (sig_bt[i] == T_FLOAT)
   902           __ lwc1(fr, saved_sp, ld_off);
   903       else {
   904           __ ldc1(fr, saved_sp, ld_off);
   905           __ ldc1(fr, saved_sp, ld_off - 8);
   906       }
   907     }
   908   }
   910   // 6243940 We might end up in handle_wrong_method if
   911   // the callee is deoptimized as we race thru here. If that
   912   // happens we don't want to take a safepoint because the
   913   // caller frame will look interpreted and arguments are now
   914   // "compiled" so it is much better to make this transition
   915   // invisible to the stack walking code. Unfortunately if
   916   // we try and find the callee by normal means a safepoint
   917   // is possible. So we stash the desired callee in the thread
    918   // and the vm will find it there should this case occur.
   919   __ get_thread(T8);
   920   __ sd(Rmethod, T8, in_bytes(JavaThread::callee_target_offset()));
    922   // move methodOop to V0 in case we end up in a c2i adapter.
   923   // the c2i adapters expect methodOop in V0 (c2) because c2's
   924   // resolve stubs return the result (the method) in V0.
   925   // I'd love to fix this.
   926   __ move(V0, Rmethod);
   927   __ jr(T9);
   928   __ delayed()->nop();
   929 }
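// Illustrative offsets for the shuffle above, assuming a 64-bit build where
// Interpreter::stackElementSize is 8 and three interpreter arguments are
// passed: argument 0 is read at saved_sp + 16, argument 1 at saved_sp + 8 and
// argument 2 at saved_sp + 0, i.e. ld_off = (total_args_passed - 1 - i) * 8
// walks from the oldest argument down towards the top of the expression stack.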
   931 // ---------------------------------------------------------------
   932 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
   933                                                             int total_args_passed,
   934                                                             // VMReg max_arg,
   935                                                             int comp_args_on_stack, // VMRegStackSlots
   936                                                             const BasicType *sig_bt,
   937                                                             const VMRegPair *regs,
   938                                                             AdapterFingerPrint* fingerprint) {
   939   address i2c_entry = __ pc();
   941   AdapterGenerator agen(masm);
   943   agen.gen_i2c_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs);
   946   // -------------------------------------------------------------------------
    947   // Generate a C2I adapter.  On entry we know Rmethod holds the methodOop.  The
   948   // args start out packed in the compiled layout.  They need to be unpacked
   949   // into the interpreter layout.  This will almost always require some stack
   950   // space.  We grow the current (compiled) stack, then repack the args.  We
   951   // finally end in a jump to the generic interpreter entry point.  On exit
   952   // from the interpreter, the interpreter will restore our SP (lest the
    953   // compiled code, which relies solely on SP and not FP, get sick).
   955   address c2i_unverified_entry = __ pc();
   956   Label skip_fixup;
   957   {
   958     Register holder = T1;
   959     Register receiver = T0;
   960     Register temp = T8;
   961     address ic_miss = SharedRuntime::get_ic_miss_stub();
   963     Label missed;
   965     __ verify_oop(holder);
   966     //add for compressedoops
   967     __ load_klass(temp, receiver);
   968     __ verify_oop(temp);
   970     __ ld_ptr(AT, holder, CompiledICHolder::holder_klass_offset());
   971     __ ld_ptr(Rmethod, holder, CompiledICHolder::holder_metadata_offset());
   972     __ bne(AT, temp, missed);
   973     __ delayed()->nop();
   974     // Method might have been compiled since the call site was patched to
    975     // interpreted; if that is the case, treat it as a miss so we can get
   976     // the call site corrected.
   977     __ ld_ptr(AT, Rmethod, in_bytes(Method::code_offset()));
   978     __ beq(AT, R0, skip_fixup);
   979     __ delayed()->nop();
   980     __ bind(missed);
   982     __ jmp(ic_miss, relocInfo::runtime_call_type);
   983     __ delayed()->nop();
   984   }
   986   address c2i_entry = __ pc();
   988   agen.gen_c2i_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
   990   __ flush();
   991   return  AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
   992 }
   994 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
   995                                          VMRegPair *regs,
   996                                          VMRegPair *regs2,
   997                                          int total_args_passed) {
   998   assert(regs2 == NULL, "not needed on MIPS");
   999   // Return the number of VMReg stack_slots needed for the args.
  1000   // This value does not include an abi space (like register window
  1001   // save area).
  1003   // The native convention is V8 if !LP64
  1004   // The LP64 convention is the V9 convention which is slightly more sane.
  1006   // We return the amount of VMReg stack slots we need to reserve for all
  1007   // the arguments NOT counting out_preserve_stack_slots. Since we always
  1008   // have space for storing at least 6 registers to memory we start with that.
  1009   // See int_stk_helper for a further discussion.
  1010   // We return the amount of VMRegImpl stack slots we need to reserve for all
  1011   // the arguments NOT counting out_preserve_stack_slots.
  1012   static const Register INT_ArgReg[Argument::n_register_parameters] = {
  1013     A0, A1, A2, A3, A4, A5, A6, A7
  1014   };
  1015   static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters] = {
  1016     F12, F13, F14, F15, F16, F17, F18, F19
  1017   };
  1018   uint args = 0;
  1019   uint stk_args = 0; // inc by 2 each time
  1021 // Example:
  1022 //    n   java.lang.UNIXProcess::forkAndExec
  1023 //     private native int forkAndExec(byte[] prog,
  1024 //                                    byte[] argBlock, int argc,
  1025 //                                    byte[] envBlock, int envc,
  1026 //                                    byte[] dir,
  1027 //                                    boolean redirectErrorStream,
  1028 //                                    FileDescriptor stdin_fd,
  1029 //                                    FileDescriptor stdout_fd,
  1030 //                                    FileDescriptor stderr_fd)
  1031 // JNIEXPORT jint JNICALL
  1032 // Java_java_lang_UNIXProcess_forkAndExec(JNIEnv *env,
  1033 //                                        jobject process,
  1034 //                                        jbyteArray prog,
  1035 //                                        jbyteArray argBlock, jint argc,
  1036 //                                        jbyteArray envBlock, jint envc,
  1037 //                                        jbyteArray dir,
  1038 //                                        jboolean redirectErrorStream,
  1039 //                                        jobject stdin_fd,
  1040 //                                        jobject stdout_fd,
  1041 //                                        jobject stderr_fd)
  1042 //
  1043 // ::c_calling_convention
  1044 // 0:     // env    <-- a0
  1045 // 1: L    // klass/obj  <-- t0 => a1
  1046 // 2: [    // prog[]  <-- a0 => a2
  1047 // 3: [    // argBlock[]  <-- a1 => a3
  1048 // 4: I    // argc
  1049 // 5: [    // envBlock[]  <-- a3 => a5
  1050 // 6: I    // envc
  1051 // 7: [    // dir[]  <-- a5 => a7
  1052 // 8: Z    // redirectErrorStream  a6 => sp[0]
  1053 // 9: L    // stdin    a7 => sp[8]
  1054 // 10: L    // stdout    fp[16] => sp[16]
  1055 // 11: L    // stderr    fp[24] => sp[24]
  1056 //
  1057   for (int i = 0; i < total_args_passed; i++) {
  1058     switch (sig_bt[i]) {
  1059     case T_VOID: // Halves of longs and doubles
  1060       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
  1061       regs[i].set_bad();
  1062       break;
  1063     case T_BOOLEAN:
  1064     case T_CHAR:
  1065     case T_BYTE:
  1066     case T_SHORT:
  1067     case T_INT:
  1068       if (args < Argument::n_register_parameters) {
  1069         regs[i].set1(INT_ArgReg[args++]->as_VMReg());
  1070       } else {
  1071         regs[i].set1(VMRegImpl::stack2reg(stk_args));
  1072         stk_args += 2;
   1073       }
   1074       break;
  1075     case T_LONG:
  1076       assert(sig_bt[i + 1] == T_VOID, "expecting half");
  1077       // fall through
  1078     case T_OBJECT:
  1079     case T_ARRAY:
  1080     case T_ADDRESS:
  1081     case T_METADATA:
  1082       if (args < Argument::n_register_parameters) {
  1083         regs[i].set2(INT_ArgReg[args++]->as_VMReg());
  1084       } else {
  1085         regs[i].set2(VMRegImpl::stack2reg(stk_args));
  1086         stk_args += 2;
   1087       }
   1088       break;
  1089     case T_FLOAT:
  1090       if (args < Argument::n_float_register_parameters) {
  1091         regs[i].set1(FP_ArgReg[args++]->as_VMReg());
  1092       } else {
  1093         regs[i].set1(VMRegImpl::stack2reg(stk_args));
  1094         stk_args += 2;
   1095       }
   1096       break;
  1097     case T_DOUBLE:
  1098       assert(sig_bt[i + 1] == T_VOID, "expecting half");
  1099       if (args < Argument::n_float_register_parameters) {
  1100         regs[i].set2(FP_ArgReg[args++]->as_VMReg());
  1101       } else {
  1102         regs[i].set2(VMRegImpl::stack2reg(stk_args));
  1103         stk_args += 2;
   1104       }
   1105       break;
  1106     default:
  1107       ShouldNotReachHere();
  1108       break;
   1109     }
   1110   }
   1112   return round_to(stk_args, 2);
   1113 }
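// Illustrative slot count for the native convention above (hypothetical
// signature with ten integer/pointer arguments and no floats): the first
// eight go in A0..A7, the remaining two each consume two 32-bit stack slots,
// so stk_args ends up at 4 and the function returns round_to(4, 2) == 4.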
  1115 // ---------------------------------------------------------------------------
  1116 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1117   // We always ignore the frame_slots arg and just use the space just below frame pointer
  1118   // which by this time is free to use
  1119   switch (ret_type) {
  1120     case T_FLOAT:
  1121       __ swc1(FSF, FP, -wordSize);
  1122       break;
  1123     case T_DOUBLE:
  1124       __ sdc1(FSF, FP, -wordSize );
  1125       break;
  1126     case T_VOID:  break;
  1127     case T_LONG:
  1128       __ sd(V0, FP, -wordSize);
  1129       break;
  1130     case T_OBJECT:
  1131     case T_ARRAY:
  1132       __ sd(V0, FP, -wordSize);
  1133       break;
  1134     default: {
   1135       __ sw(V0, FP, -wordSize);
   1136     }
   1137   }
   1138 }
  1140 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1141   // We always ignore the frame_slots arg and just use the space just below frame pointer
  1142   // which by this time is free to use
  1143   switch (ret_type) {
  1144     case T_FLOAT:
  1145       __ lwc1(FSF, FP, -wordSize);
  1146       break;
  1147     case T_DOUBLE:
  1148       __ ldc1(FSF, FP, -wordSize );
  1149       break;
  1150     case T_LONG:
  1151       __ ld(V0, FP, -wordSize);
  1152       break;
  1153     case T_VOID:  break;
  1154     case T_OBJECT:
  1155     case T_ARRAY:
  1156       __ ld(V0, FP, -wordSize);
  1157       break;
  1158     default: {
   1159       __ lw(V0, FP, -wordSize);
   1160     }
   1161   }
   1162 }
  1164 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1165   for ( int i = first_arg ; i < arg_count ; i++ ) {
  1166     if (args[i].first()->is_Register()) {
  1167       __ push(args[i].first()->as_Register());
  1168     } else if (args[i].first()->is_FloatRegister()) {
   1169       __ push(args[i].first()->as_FloatRegister());
   1170     }
   1171   }
   1172 }
  1174 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1175   for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
  1176     if (args[i].first()->is_Register()) {
  1177       __ pop(args[i].first()->as_Register());
  1178     } else if (args[i].first()->is_FloatRegister()) {
   1179       __ pop(args[i].first()->as_FloatRegister());
   1180     }
   1181   }
   1182 }
  1184 // A simple move of integer like type
  1185 static void simple_move32(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1186   if (src.first()->is_stack()) {
  1187     if (dst.first()->is_stack()) {
  1188       // stack to stack
  1189       __ lw(AT, FP, reg2offset_in(src.first()));
  1190       __ sd(AT, SP, reg2offset_out(dst.first()));
  1191     } else {
  1192       // stack to reg
   1193       __ lw(dst.first()->as_Register(),  FP, reg2offset_in(src.first()));
   1194     }
  1195   } else if (dst.first()->is_stack()) {
  1196     // reg to stack
  1197     __ sd(src.first()->as_Register(), SP, reg2offset_out(dst.first()));
  1198   } else {
  1199     if (dst.first() != src.first()){
   1200       __ move(dst.first()->as_Register(), src.first()->as_Register()); // fujie error:dst.first()
   1201     }
   1202   }
   1203 }
  1205 // An oop arg. Must pass a handle not the oop itself
  1206 static void object_move(MacroAssembler* masm,
  1207                         OopMap* map,
  1208                         int oop_handle_offset,
  1209                         int framesize_in_slots,
  1210                         VMRegPair src,
  1211                         VMRegPair dst,
  1212                         bool is_receiver,
  1213                         int* receiver_offset) {
  1215   // must pass a handle. First figure out the location we use as a handle
  1217   //FIXME, for mips, dst can be register
  1218   if (src.first()->is_stack()) {
  1219     // Oop is already on the stack as an argument
  1220     Register rHandle = V0;
  1221     Label nil;
  1222     __ xorr(rHandle, rHandle, rHandle);
  1223     __ ld(AT, FP, reg2offset_in(src.first()));
  1224     __ beq(AT, R0, nil);
  1225     __ delayed()->nop();
  1226     __ lea(rHandle, Address(FP, reg2offset_in(src.first())));
  1227     __ bind(nil);
  1228     if(dst.first()->is_stack())__ sd( rHandle, SP, reg2offset_out(dst.first()));
  1229     else                       __ move( (dst.first())->as_Register(), rHandle);
  1230     //if dst is register
  1231     //FIXME, do mips need out preserve stack slots?
  1232     int offset_in_older_frame = src.first()->reg2stack()
  1233       + SharedRuntime::out_preserve_stack_slots();
  1234     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
  1235     if (is_receiver) {
  1236       *receiver_offset = (offset_in_older_frame
   1237           + framesize_in_slots) * VMRegImpl::stack_slot_size;
   1238     }
  1239   } else {
   1240     // Oop is in a register; we must store it to the space we reserve
  1241     // on the stack for oop_handles
  1242     const Register rOop = src.first()->as_Register();
  1243     assert( (rOop->encoding() >= A0->encoding()) && (rOop->encoding() <= T0->encoding()),"wrong register");
  1244     const Register rHandle = V0;
   1245     //Important: refer to java_calling_convention
  1246     int oop_slot = (rOop->encoding() - A0->encoding()) * VMRegImpl::slots_per_word + oop_handle_offset;
  1247     int offset = oop_slot*VMRegImpl::stack_slot_size;
  1248     Label skip;
  1249     __ sd( rOop , SP, offset );
  1250     map->set_oop(VMRegImpl::stack2reg(oop_slot));
  1251     __ xorr( rHandle, rHandle, rHandle);
  1252     __ beq(rOop, R0, skip);
  1253     __ delayed()->nop();
  1254     __ lea(rHandle, Address(SP, offset));
  1255     __ bind(skip);
  1256     // Store the handle parameter
  1257     if(dst.first()->is_stack())__ sd( rHandle, SP, reg2offset_out(dst.first()));
  1258     else                       __ move((dst.first())->as_Register(), rHandle);
  1259     //if dst is register
  1261     if (is_receiver) {
   1262       *receiver_offset = offset;
   1263     }
   1264   }
   1265 }
  1267 // A float arg may have to do float reg int reg conversion
  1268 static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1269   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
  1271   if (src.first()->is_stack()) {
  1272     if (dst.first()->is_stack()) {
  1273       __ lw(AT, FP, reg2offset_in(src.first()));
  1274       __ sw(AT, SP, reg2offset_out(dst.first()));
  1276     else
  1277       __ lwc1(dst.first()->as_FloatRegister(), FP, reg2offset_in(src.first()));
  1278   } else {
  1279     // reg to stack
  1280     if(dst.first()->is_stack())
  1281       __ swc1(src.first()->as_FloatRegister(), SP, reg2offset_out(dst.first()));
  1282     else
  1283       __ mov_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
  1287 // A long move
  1288 static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1290   // The only legal possibility for a long_move VMRegPair is:
  1291   // 1: two stack slots (possibly unaligned)
  1292   // as neither the java nor the C calling convention will use registers
  1293   // for longs.
  1295   if (src.first()->is_stack()) {
  1296     assert(src.second()->is_stack() && dst.second()->is_stack(), "must be all stack");
  1297     if( dst.first()->is_stack()){
  1298       __ ld(AT, FP, reg2offset_in(src.first()));
  1299       __ sd(AT, SP, reg2offset_out(dst.first()));
  1300     } else {
  1301       __ ld( (dst.first())->as_Register() , FP, reg2offset_in(src.first()));
  1303   } else {
  1304     if( dst.first()->is_stack()){
  1305       __ sd( (src.first())->as_Register(), SP, reg2offset_out(dst.first()));
  1306     } else {
  1307       __ move( (dst.first())->as_Register() , (src.first())->as_Register());
  1312 // A double move
  1313 static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1315   // The only legal possibilities for a double_move VMRegPair are listed below.
  1316   // The painful thing here is that, as with long_move, a VMRegPair might be unaligned.
  1318   // Because of the calling convention we know that src is either
  1319   //   1: a single physical register (a floating-point register here, not xmm as on x86)
  1320   //   2: two stack slots (possibly unaligned)
  1321   // dst can be a pair of stack slots or a floating-point register.
  1324   if (src.first()->is_stack()) {
  1325     // source is all stack
  1326     if( dst.first()->is_stack()){
  1327       __ ld(AT, FP, reg2offset_in(src.first()));
  1328       __ sd(AT, SP, reg2offset_out(dst.first()));
  1329     } else {
  1330       __ ldc1( (dst.first())->as_FloatRegister(), FP, reg2offset_in(src.first()));
  1333   } else {
  1334     // reg to stack
  1335     // No worries about stack alignment
  1336     if( dst.first()->is_stack()){
  1337       __ sdc1(src.first()->as_FloatRegister(), SP, reg2offset_out(dst.first()));
  1339     else
  1340       __ mov_d( dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
  1345 static void verify_oop_args(MacroAssembler* masm,
  1346                             methodHandle method,
  1347                             const BasicType* sig_bt,
  1348                             const VMRegPair* regs) {
  1349   Register temp_reg = T9;  // not part of any compiled calling seq
  1350   if (VerifyOops) {
  1351     for (int i = 0; i < method->size_of_parameters(); i++) {
  1352       if (sig_bt[i] == T_OBJECT ||
  1353           sig_bt[i] == T_ARRAY) {
  1354         VMReg r = regs[i].first();
  1355         assert(r->is_valid(), "bad oop arg");
  1356         if (r->is_stack()) {
  1357           __ ld(temp_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
  1358           __ verify_oop(temp_reg);
  1359         } else {
  1360           __ verify_oop(r->as_Register());
  1367 static void gen_special_dispatch(MacroAssembler* masm,
  1368                                  methodHandle method,
  1369                                  const BasicType* sig_bt,
  1370                                  const VMRegPair* regs) {
  1371   verify_oop_args(masm, method, sig_bt, regs);
  1372   vmIntrinsics::ID iid = method->intrinsic_id();
  1374   // Now write the args into the outgoing interpreter space
  1375   bool     has_receiver   = false;
  1376   Register receiver_reg   = noreg;
  1377   int      member_arg_pos = -1;
  1378   Register member_reg     = noreg;
  1379   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  1380   if (ref_kind != 0) {
  1381     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
  1382     member_reg = S3;  // known to be free at this point
  1383     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  1384   } else if (iid == vmIntrinsics::_invokeBasic) {
  1385     has_receiver = true;
  1386   } else {
  1387     fatal(err_msg_res("unexpected intrinsic id %d", iid));
  1390   if (member_reg != noreg) {
  1391     // Load the member_arg into register, if necessary.
  1392     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
  1393     VMReg r = regs[member_arg_pos].first();
  1394     if (r->is_stack()) {
  1395       __ ld(member_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
  1396     } else {
  1397       // no data motion is needed
  1398       member_reg = r->as_Register();
  1402   if (has_receiver) {
  1403     // Make sure the receiver is loaded into a register.
  1404     assert(method->size_of_parameters() > 0, "oob");
  1405     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
  1406     VMReg r = regs[0].first();
  1407     assert(r->is_valid(), "bad receiver arg");
  1408     if (r->is_stack()) {
  1409       // Porting note:  This assumes that compiled calling conventions always
  1410       // pass the receiver oop in a register.  If this is not true on some
  1411       // platform, pick a temp and load the receiver from stack.
  1412       fatal("receiver always in a register");
  1413       receiver_reg = SSR;  // known to be free at this point
  1414       __ ld(receiver_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
  1415     } else {
  1416       // no data motion is needed
  1417       receiver_reg = r->as_Register();
  1421   // Figure out which address we are really jumping to:
  1422   MethodHandles::generate_method_handle_dispatch(masm, iid,
  1423                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
  1426 // ---------------------------------------------------------------------------
  1427 // Generate a native wrapper for a given method.  The method takes arguments
  1428 // in the Java compiled code convention, marshals them to the native
  1429 // convention (handlizes oops, etc), transitions to native, makes the call,
  1430 // returns to java state (possibly blocking), unhandlizes any result and
  1431 // returns.
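       //
       // In outline, the wrapper emitted below does roughly the following:
       //   1. inline-cache check against the receiver's klass (jump to the ic_miss stub on mismatch)
       //   2. optional Object.hashCode fast path (COMPILER1 only)
       //   3. stack-overflow bang, new frame, and the "Grand Shuffle" of Java args into the C layout
       //   4. handlize oops (receiver / class mirror), lock if the method is synchronized
       //   5. transition to _thread_in_native and call the native function
       //   6. transition back with a safepoint/suspend check, unlock, unhandlize the result
       //   7. return, with out-of-line slow paths for locking, stack reguarding and pending exceptions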
  1432 nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
  1433                                                 methodHandle method,
  1434                                                 int compile_id,
  1435                                                 BasicType* in_sig_bt,
  1436                                                 VMRegPair* in_regs,
  1437                                                 BasicType ret_type) {
  1438   if (method->is_method_handle_intrinsic()) {
  1439     vmIntrinsics::ID iid = method->intrinsic_id();
  1440     intptr_t start = (intptr_t)__ pc();
  1441     int vep_offset = ((intptr_t)__ pc()) - start;
  1442     gen_special_dispatch(masm,
  1443                          method,
  1444                          in_sig_bt,
  1445                          in_regs);
  1446     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
  1447     __ flush();
  1448     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
  1449     return nmethod::new_native_nmethod(method,
  1450                                        compile_id,
  1451                                        masm->code(),
  1452                                        vep_offset,
  1453                                        frame_complete,
  1454                                        stack_slots / VMRegImpl::slots_per_word,
  1455                                        in_ByteSize(-1),
  1456                                        in_ByteSize(-1),
  1457                                        (OopMapSet*)NULL);
  1459   bool is_critical_native = true;
  1460   address native_func = method->critical_native_function();
  1461   if (native_func == NULL) {
  1462     native_func = method->native_function();
  1463     is_critical_native = false;
  1465   assert(native_func != NULL, "must have function");
  1467   // Native nmethod wrappers never take possession of the oop arguments.
  1468   // So the caller will gc the arguments. The only thing we need an
  1469   // oopMap for is if the call is static
  1470   //
  1471   // An OopMap for lock (and class if static), and one for the VM call itself
  1472   OopMapSet *oop_maps = new OopMapSet();
  1474   // We have received a description of where all the java args are located
  1475   // on entry to the wrapper. We need to convert these args to where
  1476   // the jni function will expect them. To figure out where they go
  1477   // we convert the java signature to a C signature by inserting
  1478   // the hidden arguments as arg[0] and possibly arg[1] (static method)
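         // For example (illustrative only): for a static native int m(Object a, int b)
         // the Java signature { T_OBJECT, T_INT } becomes the C signature
         // { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class mirror */, T_OBJECT, T_INT },
         // i.e. total_c_args == total_in_args + 2.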
  1480   const int total_in_args = method->size_of_parameters();
  1481   int total_c_args = total_in_args;
  1482   if (!is_critical_native) {
  1483     total_c_args += 1;
  1484     if (method->is_static()) {
  1485       total_c_args++;
  1487   } else {
  1488     for (int i = 0; i < total_in_args; i++) {
  1489       if (in_sig_bt[i] == T_ARRAY) {
  1490         total_c_args++;
  1495   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  1496   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
  1497   BasicType* in_elem_bt = NULL;
  1499   int argc = 0;
  1500   if (!is_critical_native) {
  1501     out_sig_bt[argc++] = T_ADDRESS;
  1502     if (method->is_static()) {
  1503       out_sig_bt[argc++] = T_OBJECT;
  1506     for (int i = 0; i < total_in_args ; i++ ) {
  1507       out_sig_bt[argc++] = in_sig_bt[i];
  1509   } else {
  1510     Thread* THREAD = Thread::current();
  1511     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
  1512     SignatureStream ss(method->signature());
  1513     for (int i = 0; i < total_in_args ; i++ ) {
  1514       if (in_sig_bt[i] == T_ARRAY) {
  1515         // Arrays are passed as int, elem* pair
  1516         out_sig_bt[argc++] = T_INT;
  1517         out_sig_bt[argc++] = T_ADDRESS;
  1518         Symbol* atype = ss.as_symbol(CHECK_NULL);
  1519         const char* at = atype->as_C_string();
  1520         if (strlen(at) == 2) {
  1521           assert(at[0] == '[', "must be");
  1522           switch (at[1]) {
  1523             case 'B': in_elem_bt[i]  = T_BYTE; break;
  1524             case 'C': in_elem_bt[i]  = T_CHAR; break;
  1525             case 'D': in_elem_bt[i]  = T_DOUBLE; break;
  1526             case 'F': in_elem_bt[i]  = T_FLOAT; break;
  1527             case 'I': in_elem_bt[i]  = T_INT; break;
  1528             case 'J': in_elem_bt[i]  = T_LONG; break;
  1529             case 'S': in_elem_bt[i]  = T_SHORT; break;
  1530             case 'Z': in_elem_bt[i]  = T_BOOLEAN; break;
  1531             default: ShouldNotReachHere();
  1534       } else {
  1535         out_sig_bt[argc++] = in_sig_bt[i];
  1536         in_elem_bt[i] = T_VOID;
  1538       if (in_sig_bt[i] != T_VOID) {
  1539         assert(in_sig_bt[i] == ss.type(), "must match");
  1540         ss.next();
  1545   // Now figure out where the args must be stored and how much stack space
  1546   // they require (neglecting out_preserve_stack_slots but including space for
  1547   // storing the first six register arguments). It's a bit odd; see int_stk_helper.
  1548   //
  1549   int out_arg_slots;
  1550   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  1552   // Compute framesize for the wrapper.  We need to handlize all oops in
  1553   // registers. We must create space for them here that is disjoint from
  1554   // the windowed save area because we have no control over when we might
  1555   // flush the window again and overwrite values that gc has since modified.
  1556   // (The live window race)
  1557   //
  1558   // We always just allocate 6 words for storing down these objects. This allows
  1559   // us to simply record the base and use the Ireg number to decide which
  1560   // slot to use. (Note that the reg number is the inbound number not the
  1561   // outbound number).
  1562   // We must shuffle args to match the native convention, and include var-args space.
  1564   // Calculate the total number of stack slots we will need.
  1566   // First count the abi requirement plus all of the outgoing args
  1567   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  1569   // Now the space for the inbound oop handle area
  1570   int total_save_slots = 9 * VMRegImpl::slots_per_word;  // 9 arguments passed in registers
  1571   if (is_critical_native) {
  1572     // Critical natives may have to call out so they need a save area
  1573     // for register arguments.
  1574     int double_slots = 0;
  1575     int single_slots = 0;
  1576     for ( int i = 0; i < total_in_args; i++) {
  1577       if (in_regs[i].first()->is_Register()) {
  1578         const Register reg = in_regs[i].first()->as_Register();
  1579         switch (in_sig_bt[i]) {
  1580           case T_BOOLEAN:
  1581           case T_BYTE:
  1582           case T_SHORT:
  1583           case T_CHAR:
  1584           case T_INT:  single_slots++; break;
  1585           case T_ARRAY:  // specific to LP64 (7145024)
  1586           case T_LONG: double_slots++; break;
  1587           default:  ShouldNotReachHere();
  1589       } else if (in_regs[i].first()->is_FloatRegister()) {
  1590         switch (in_sig_bt[i]) {
  1591           case T_FLOAT:  single_slots++; break;
  1592           case T_DOUBLE: double_slots++; break;
  1593           default:  ShouldNotReachHere();
  1597     total_save_slots = double_slots * 2 + single_slots;
  1598     // align the save area
  1599     if (double_slots != 0) {
  1600       stack_slots = round_to(stack_slots, 2);
  1604   int oop_handle_offset = stack_slots;
  1605   stack_slots += total_save_slots;
  1607   // Now any space we need for handlizing a klass if static method
  1609   int klass_slot_offset = 0;
  1610   int klass_offset = -1;
  1611   int lock_slot_offset = 0;
  1612   bool is_static = false;
  1614   if (method->is_static()) {
  1615     klass_slot_offset = stack_slots;
  1616     stack_slots += VMRegImpl::slots_per_word;
  1617     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
  1618     is_static = true;
  1621   // Plus a lock if needed
  1623   if (method->is_synchronized()) {
  1624     lock_slot_offset = stack_slots;
  1625     stack_slots += VMRegImpl::slots_per_word;
  1628   // Now a place to save return value or as a temporary for any gpr -> fpr moves
  1629   // + 2 for return address (which we own) and saved fp
  1630   stack_slots += 2 + 9 * VMRegImpl::slots_per_word;  // (T0, A0, A1, A2, A3, A4, A5, A6, A7)
  1632   // Ok The space we have allocated will look like:
  1633   //
  1634   //
  1635   // FP-> |                     |
  1636   //      |---------------------|
  1637   //      | 2 slots for moves   |
  1638   //      |---------------------|
  1639   //      | lock box (if sync)  |
  1640   //      |---------------------| <- lock_slot_offset
  1641   //      | klass (if static)   |
  1642   //      |---------------------| <- klass_slot_offset
  1643   //      | oopHandle area      |
  1644   //      |---------------------| <- oop_handle_offset
  1645   //      | outbound memory     |
  1646   //      | based arguments     |
  1647   //      |                     |
  1648   //      |---------------------|
  1649   //      | vararg area         |
  1650   //      |---------------------|
  1651   //      |                     |
  1652   // SP-> | out_preserved_slots |
  1653   //
  1654   //
  1657   // Now compute the actual number of stack words we need, rounding to make
  1658   // the stack properly aligned.
  1659   stack_slots = round_to(stack_slots, StackAlignmentInSlots);
  1661   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
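         // Illustrative numbers only: if stack_slots came out to 61 and
         // StackAlignmentInSlots is 4 (16-byte alignment with 4-byte slots),
         // round_to yields 64 slots, i.e. stack_size == 256 bytes.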
  1663   intptr_t start = (intptr_t)__ pc();
  1667   // First thing make an ic check to see if we should even be here
  1668   address ic_miss = SharedRuntime::get_ic_miss_stub();
  1670   // We are free to use all registers as temps without saving them and
  1671   // restoring them except fp. fp is the only callee save register
  1672   // as far as the interpreter and the compiler(s) are concerned.
  1674   //refer to register_mips.hpp:IC_Klass
  1675   const Register ic_reg = T1;
  1676   const Register receiver = T0;
  1678   Label hit;
  1679   Label exception_pending;
  1681   __ verify_oop(receiver);
  1682   //add for compressedoops
  1683   __ load_klass(T9, receiver);
  1684   __ beq(T9, ic_reg, hit);
  1685   __ delayed()->nop();
  1686   __ jmp(ic_miss, relocInfo::runtime_call_type);
  1687   __ delayed()->nop();
  1688   // The verified entry must be aligned for code patching,
  1689   // and the first 5 bytes must be in the same cache line.
  1690   // If we align at 8 then we can be sure the 5 bytes are in the same line.
  1691   __ align(8);
  1693   __ bind(hit);
  1696   int vep_offset = ((intptr_t)__ pc()) - start;
  1697 #ifdef COMPILER1
  1698   if (InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) {
  1699     // Object.hashCode can pull the hashCode from the header word
  1700     // instead of doing a full VM transition once it's been computed.
  1701     // Since hashCode is usually polymorphic at call sites we can't do
  1702     // this optimization at the call site without a lot of work.
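           // In outline, the fast path below is roughly:
           //   mark = obj->mark();
           //   if (!(mark & unlocked_value)) goto slowCase;                  // locked
           //   if (UseBiasedLocking && (mark & biased_lock_bit)) goto slowCase;
           //   if ((mark & hash_mask_in_place) == 0) goto slowCase;          // hash not yet computed
           //   return mark >> hash_shift;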
  1703     Label slowCase;
  1704     Register receiver = T0;
  1705     Register result = V0;
  1706     __ ld ( result, receiver, oopDesc::mark_offset_in_bytes());
  1707     // check if locked
  1708     __ andi(AT, result, markOopDesc::unlocked_value);
  1709     __ beq(AT, R0, slowCase);
  1710     __ delayed()->nop();
  1711     if (UseBiasedLocking) {
  1712       // Check if biased and fall through to runtime if so
  1713       __ andi (AT, result, markOopDesc::biased_lock_bit_in_place);
  1714       __ bne(AT, R0, slowCase);
  1715       __ delayed()->nop();
  1717     // get hash
  1718     __ li(AT, markOopDesc::hash_mask_in_place);
  1719     __ andr (AT, result, AT);
  1720     // test if hashCode exists
  1721     __ beq (AT, R0, slowCase);
  1722     __ delayed()->nop();
  1723     __ shr(result, markOopDesc::hash_shift);
  1724     __ jr(RA);
  1725     __ delayed()->nop();
  1726     __ bind (slowCase);
  1728 #endif // COMPILER1
  1730   // The instruction at the verified entry point must be 5 bytes or longer
  1731   // because it can be patched on the fly by make_non_entrant. The stack bang
  1732   // instruction fits that requirement.
  1734   // Generate stack overflow check
  1736   if (UseStackBanging) {
  1737     __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
  1738   } else {
  1739     // need a 5 byte instruction to allow MT safe patching to non-entrant
  1740     __ nop();
  1741     __ nop();
  1742     __ nop();
  1743     __ nop();
  1744     __ nop();
  1746   // Generate a new frame for the wrapper.
  1747   // does mips need this?
  1748 #ifndef OPT_THREAD
  1749   __ get_thread(TREG);
  1750 #endif
  1751   __ st_ptr(SP, TREG, in_bytes(JavaThread::last_Java_sp_offset()));
  1752   __ move(AT, -(StackAlignmentInBytes));
  1753   __ andr(SP, SP, AT);
  1755   __ enter();
  1756   // -2 because return address is already present and so is saved fp
  1757   __ addiu(SP, SP, -1 * (stack_size - 2*wordSize));
  1759   // Frame is now completed as far a size and linkage.
  1761   int frame_complete = ((intptr_t)__ pc()) - start;
  1763   // Calculate the difference between sp and fp. We need to know it
  1764   // after the native call because on windows Java Natives will pop
  1765   // the arguments and it is painful to do sp relative addressing
  1766   // in a platform independent way. So after the call we switch to
  1767   // fp relative addressing.
  1768   // FIXME: actually, the fp_adjustment may not be right, because andr(sp, sp, at) may change
  1769   // the SP
  1770   int fp_adjustment = stack_size - 2*wordSize;
  1772 #ifdef COMPILER2
  1773   // C2 may leave the stack dirty if not in SSE2+ mode
  1774   __ empty_FPU_stack();
  1775 #endif
  1777   // Compute the fp offset for any slots used after the jni call
  1779   int lock_slot_fp_offset = (lock_slot_offset*VMRegImpl::stack_slot_size) - fp_adjustment;
  1780   // We use TREG as a thread pointer because it is callee save and
  1781   // if we load it once it is usable through the entire wrapper
  1782   const Register thread = TREG;
  1784   // We use S4 as the oop handle for the receiver/klass
  1785   // It is callee save so it survives the call to native
  1787   const Register oop_handle_reg = S4;
  1788   if (is_critical_native) {
  1789      __ stop("generate_native_wrapper in sharedRuntime <2>");
  1790     //TODO:Fu
  1791     // check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
  1792     //                                   oop_handle_offset, oop_maps, in_regs, in_sig_bt);
  1795 #ifndef OPT_THREAD
  1796   __ get_thread(thread);
  1797 #endif
  1799   //
  1800   // We immediately shuffle the arguments so that, for any vm call we have to
  1801   // make from here on out (sync slow path, jvmpi, etc.), we will have
  1802   // captured the oops from our caller and have a valid oopMap for
  1803   // them.
  1805   // -----------------
  1806   // The Grand Shuffle
  1807   //
  1808   // Natives require 1 or 2 extra arguments over the normal ones: the JNIEnv*
  1809   // and, if static, the class mirror instead of a receiver.  This pretty much
  1810   // guarantees that register layout will not match (and mips doesn't use reg
  1811   // parms though amd does).  Since the native abi doesn't use register args
  1812   // and the java convention does, we don't have to worry about collisions.
  1813   // All of our moves are reg->stack or stack->stack.
  1814   // We ignore the extra arguments during the shuffle and handle them at the
  1815   // last moment. The shuffle is described by the two calling convention
  1816   // vectors we have in our possession. We simply walk the java vector to
  1817   // get the source locations and the c vector to get the destinations.
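         // For a plain (non-critical) JNI call this means java arg i is moved to
         // c_arg == i + 1 for an instance method, or i + 2 for a static method,
         // since slot 0 (and slot 1 for a static method) is reserved for the hidden
         // arguments; the loop below simply walks the argument list from last to first.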
  1819   int c_arg = method->is_static() ? 2 : 1 ;
  1821   // Record sp-based slot for receiver on stack for non-static methods
  1822   int receiver_offset = -1;
  1824   // This is a trick. We double the stack slots so we can claim
  1825   // the oops in the caller's frame. Since we are sure to have
  1826   // more args than the caller, doubling is enough to make
  1827   // sure we can capture all the incoming oop args from the
  1828   // caller.
  1829   //
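         // (Note: object_move() records a caller-frame oop at map slot
         //  offset_in_older_frame + framesize_in_slots, i.e. at an index at or above
         //  stack_slots, which is why the map is created with a doubled frame size.)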
  1830   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
  1832   // Mark location of fp (someday)
  1833   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(fp));
  1835 #ifdef ASSERT
  1836   bool reg_destroyed[RegisterImpl::number_of_registers];
  1837   bool freg_destroyed[FloatRegisterImpl::number_of_registers];
  1838   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
  1839     reg_destroyed[r] = false;
  1841   for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) {
  1842     freg_destroyed[f] = false;
  1845 #endif /* ASSERT */
  1847   // This may iterate in two different directions depending on the
  1848   // kind of native it is.  The reason is that for regular JNI natives
  1849   // the incoming and outgoing registers are offset upwards and for
  1850   // critical natives they are offset down.
  1851   GrowableArray<int> arg_order(2 * total_in_args);
  1852   VMRegPair tmp_vmreg;
  1853   tmp_vmreg.set1(T8->as_VMReg());
  1855   if (!is_critical_native) {
  1856     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
  1857       arg_order.push(i);
  1858       arg_order.push(c_arg);
  1860   } else {
  1861     // Compute a valid move order, using tmp_vmreg to break any cycles
  1862      __ stop("generate_native_wrapper in sharedRuntime <2>");
  1863     //TODO:Fu
  1864     // ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
  1867   int temploc = -1;
  1868   for (int ai = 0; ai < arg_order.length(); ai += 2) {
  1869     int i = arg_order.at(ai);
  1870     int c_arg = arg_order.at(ai + 1);
  1871     __ block_comment(err_msg("move %d -> %d", i, c_arg));
  1872     if (c_arg == -1) {
  1873       assert(is_critical_native, "should only be required for critical natives");
  1874       // This arg needs to be moved to a temporary
  1875       __ move(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
  1876       in_regs[i] = tmp_vmreg;
  1877       temploc = i;
  1878       continue;
  1879     } else if (i == -1) {
  1880       assert(is_critical_native, "should only be required for critical natives");
  1881       // Read from the temporary location
  1882       assert(temploc != -1, "must be valid");
  1883       i = temploc;
  1884       temploc = -1;
  1886 #ifdef ASSERT
  1887     if (in_regs[i].first()->is_Register()) {
  1888       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
  1889     } else if (in_regs[i].first()->is_FloatRegister()) {
  1890       assert(!freg_destroyed[in_regs[i].first()->as_FloatRegister()->encoding()], "destroyed reg!");
  1892     if (out_regs[c_arg].first()->is_Register()) {
  1893       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1894     } else if (out_regs[c_arg].first()->is_FloatRegister()) {
  1895       freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
  1897 #endif /* ASSERT */
  1898     switch (in_sig_bt[i]) {
  1899       case T_ARRAY:
  1900         if (is_critical_native) {
  1901           __ stop("generate_native_wrapper in sharedRuntime <2>");
  1902           //TODO:Fu
  1903           // unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
  1904           c_arg++;
  1905 #ifdef ASSERT
  1906           if (out_regs[c_arg].first()->is_Register()) {
  1907             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1908           } else if (out_regs[c_arg].first()->is_FloatRegister()) {
  1909             freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
  1911 #endif
  1912           break;
  1914       case T_OBJECT:
  1915         assert(!is_critical_native, "no oop arguments");
  1916         object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
  1917                     ((i == 0) && (!is_static)),
  1918                     &receiver_offset);
  1919         break;
  1920       case T_VOID:
  1921         break;
  1923       case T_FLOAT:
  1924         float_move(masm, in_regs[i], out_regs[c_arg]);
  1925           break;
  1927       case T_DOUBLE:
  1928         assert( i + 1 < total_in_args &&
  1929                 in_sig_bt[i + 1] == T_VOID &&
  1930                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
  1931         double_move(masm, in_regs[i], out_regs[c_arg]);
  1932         break;
  1934       case T_LONG :
  1935         long_move(masm, in_regs[i], out_regs[c_arg]);
  1936         break;
  1938       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  1940       default:
  1941         simple_move32(masm, in_regs[i], out_regs[c_arg]);
  1945   // point c_arg at the first arg that is already loaded in case we
  1946   // need to spill before we call out
  1947   c_arg = total_c_args - total_in_args;
  1948   // Pre-load a static method's oop.  Used both by locking code and
  1949   // the normal JNI call code.
  1951   __ move(oop_handle_reg, A1);
  1953   if (method->is_static() && !is_critical_native) {
  1955     //  load oop into a register
  1956     int oop_index = __ oop_recorder()->find_index(JNIHandles::make_local(
  1957           (method->method_holder())->java_mirror()));
  1960     RelocationHolder rspec = oop_Relocation::spec(oop_index);
  1961     __ relocate(rspec);
  1962     __ patchable_set48(oop_handle_reg, (long)JNIHandles::make_local((method->method_holder())->java_mirror()));
  1963     // Now handlize the static class mirror; it's known not-null.
  1964     __ sd( oop_handle_reg, SP, klass_offset);
  1965     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
  1967     // Now get the handle
  1968     __ lea(oop_handle_reg, Address(SP, klass_offset));
  1969     // store the klass handle as second argument
  1970     __ move(A1, oop_handle_reg);
  1971     // and protect the arg if we must spill
  1972     c_arg--;
  1975   // Change state to native (we save the return address in the thread, since it might not
  1976   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  1977   // points into the right code segment. It does not have to be the correct return pc.
  1978   // We use the same pc/oopMap repeatedly when we call out
  1980   intptr_t the_pc = (intptr_t) __ pc();
  1981   oop_maps->add_gc_map(the_pc - start, map);
  1983   __ set_last_Java_frame(SP, noreg, NULL);
  1984   __ relocate(relocInfo::internal_pc_type);
  1986     intptr_t save_pc = (intptr_t)the_pc ;
  1987     __ patchable_set48(AT, save_pc);
  1989   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  1992   // We have all of the arguments set up at this point. We must not touch any of the
  1993   // argument registers from here on (what if we save/restore them? there is no oop map covering them).
  1995     SkipIfEqual skip_if(masm, &DTraceMethodProbes, 0);
  1996     int metadata_index = __ oop_recorder()->find_index(method());
  1997     RelocationHolder rspec = metadata_Relocation::spec(metadata_index);
  1998     __ relocate(rspec);
  1999     __ patchable_set48(AT, (long)(method()));
  2001     __ call_VM_leaf(
  2002       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
  2003       thread, AT);
  2007   // These are register definitions we need for locking/unlocking
  2008   const Register swap_reg = T8;  // Must use T8 for cmpxchg instruction
  2009   const Register obj_reg  = T9;  // Will contain the oop
  2010   //const Register lock_reg = T6;  // Address of compiler lock object (BasicLock)
  2011   const Register lock_reg = c_rarg0;  // Address of compiler lock object (BasicLock)
  2015   Label slow_path_lock;
  2016   Label lock_done;
  2018   // Lock a synchronized method
  2019   if (method->is_synchronized()) {
  2020     assert(!is_critical_native, "unhandled");
  2022     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
  2024     // Get the handle (the 2nd argument)
  2025     __ move(oop_handle_reg, A1);
  2027     // Get address of the box
  2028     __ lea(lock_reg, Address(FP, lock_slot_fp_offset));
  2030     // Load the oop from the handle
  2031     __ ld(obj_reg, oop_handle_reg, 0);
  2033     if (UseBiasedLocking) {
  2034       // Note that oop_handle_reg is trashed during this call
  2035       __ biased_locking_enter(lock_reg, obj_reg, swap_reg, A1, false, lock_done, &slow_path_lock);
  2038     // Load immediate 1 into swap_reg %T8
  2039     __ move(swap_reg, 1);
  2041     __ ld(AT, obj_reg, 0);
  2042     __ orr(swap_reg, swap_reg, AT);
  2044     __ sd( swap_reg, lock_reg, mark_word_offset);
  2045     __ cmpxchg(lock_reg, Address(obj_reg, 0), swap_reg);
  2046     __ bne(AT, R0, lock_done);
  2047     __ delayed()->nop();
  2048     // Test if the oopMark is an obvious stack pointer, i.e.,
  2049     //  1) (mark & 3) == 0, and
  2050     //  2) sp <= mark < mark + os::pagesize()
  2051     // These 3 tests can be done by evaluating the following
  2052     // expression: ((mark - sp) & (3 - os::vm_page_size())),
  2053     // assuming both stack pointer and pagesize have their
  2054     // least significant 2 bits clear.
  2055     // NOTE: the oopMark is in swap_reg %T8 as the result of cmpxchg
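           // For illustration, assuming a 4096-byte page: 3 - 4096 == -4093, whose
           // two's-complement bit pattern has only bits 0-1 and bits 12 and up set,
           // so the AND below is zero exactly when (mark - sp) is 4-byte aligned
           // and 0 <= mark - sp < 4096, i.e. the displaced mark is an address within
           // one page above sp (a recursive stack lock).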
  2057     __ dsub(swap_reg, swap_reg, SP);
  2058     __ move(AT, 3 - os::vm_page_size());
  2059     __ andr(swap_reg , swap_reg, AT);
  2060     // Save the test result, for recursive case, the result is zero
  2061     __ sd(swap_reg, lock_reg, mark_word_offset);
  2062     __ bne(swap_reg, R0, slow_path_lock);
  2063     __ delayed()->nop();
  2064     // Slow path will re-enter here
  2065     __ bind(lock_done);
  2067     if (UseBiasedLocking) {
  2068       // Re-fetch oop_handle_reg as we trashed it above
  2069       __ move(A1, oop_handle_reg);
  2074   // Finally just about ready to make the JNI call
  2077   // get JNIEnv* which is first argument to native
  2078   if (!is_critical_native) {
  2079     __ addi(A0, thread, in_bytes(JavaThread::jni_environment_offset()));
  2082   // Example: Java_java_lang_ref_Finalizer_invokeFinalizeMethod(JNIEnv *env, jclass clazz, jobject ob)
  2083   // Load the second arguments into A1
  2084   //__ ld(A1, SP , wordSize );   // klass
  2086   // Now set thread in native
  2087   __ addi(AT, R0, _thread_in_native);
  2088   __ sw(AT, thread, in_bytes(JavaThread::thread_state_offset()));
  2089   // do the call
  2090   __ call(method->native_function(), relocInfo::runtime_call_type);
  2091   __ delayed()->nop();
  2092   // WARNING - on Windows Java Natives use pascal calling convention and pop the
  2093   // arguments off of the stack. We could just re-adjust the stack pointer here
  2094   // and continue to do SP relative addressing but we instead switch to FP
  2095   // relative addressing.
  2097   // Unpack native results.
  2098   switch (ret_type) {
  2099   case T_BOOLEAN: __ c2bool(V0);            break;
  2100   case T_CHAR   : __ andi(V0, V0, 0xFFFF);      break;
  2101   case T_BYTE   : __ sign_extend_byte (V0); break;
  2102   case T_SHORT  : __ sign_extend_short(V0); break;
  2103   case T_INT    : break;                    // nothing to do
  2104   case T_DOUBLE :
  2105   case T_FLOAT  :
  2106   // Result is in the FPU return register; we'll save as needed
  2107   break;
  2108   case T_ARRAY:                 // Really a handle
  2109   case T_OBJECT:                // Really a handle
  2110   break; // can't de-handlize until after safepoint check
  2111   case T_VOID: break;
  2112   case T_LONG: break;
  2113   default       : ShouldNotReachHere();
  2115   // Switch thread to "native transition" state before reading the synchronization state.
  2116   // This additional state is necessary because reading and testing the synchronization
  2117   // state is not atomic w.r.t. GC, as this scenario demonstrates:
  2118   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  2119   //     VM thread changes sync state to synchronizing and suspends threads for GC.
  2120   //     Thread A is resumed to finish this native method, but doesn't block here since it
  2121   //     didn't see any synchronization in progress, and escapes.
  2122   __ addi(AT, R0, _thread_in_native_trans);
  2123   __ sw(AT, thread, in_bytes(JavaThread::thread_state_offset()));
  2125   //if(os::is_MP()) {}
  2127   Label after_transition;
  2129   // check for safepoint operation in progress and/or pending suspend requests
  2131     Label Continue;
  2132     __ li(AT, SafepointSynchronize::address_of_state());
  2133     __ lw(A0, AT, 0);
  2134     __ addi(AT, A0, -SafepointSynchronize::_not_synchronized);
  2135     Label L;
  2136     __ bne(AT, R0, L);
  2137     __ delayed()->nop();
  2138     __ lw(AT, thread, in_bytes(JavaThread::suspend_flags_offset()));
  2139     __ beq(AT, R0, Continue);
  2140     __ delayed()->nop();
  2141     __ bind(L);
  2143     // Don't use call_VM as it will see a possible pending exception and forward it
  2144     // and never return here preventing us from clearing _last_native_pc down below.
  2145     //
  2146     save_native_result(masm, ret_type, stack_slots);
  2147     __ move(A0, thread);
  2148     __ addi(SP, SP, -wordSize);
  2149     __ push(S2);
  2150     __ move(AT, -(StackAlignmentInBytes));
  2151     __ move(S2, SP);     // use S2 as a sender SP holder
  2152     __ andr(SP, SP, AT); // align stack as required by ABI
  2153     if (!is_critical_native) {
  2154       __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans), relocInfo::runtime_call_type);
  2155       __ delayed()->nop();
  2156     } else {
  2157       __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition), relocInfo::runtime_call_type);
  2158       __ delayed()->nop();
  2160     __ move(SP, S2);     // use S2 as a sender SP holder
  2161     __ pop(S2);
  2162     __ addi(SP, SP, wordSize);
  2163     //add for compressedoops
  2164     __ reinit_heapbase();
  2165     // Restore any method result value
  2166     restore_native_result(masm, ret_type, stack_slots);
  2168     if (is_critical_native) {
  2169       // The call above performed the transition to thread_in_Java so
  2170       // skip the transition logic below.
  2171       __ beq(R0, R0, after_transition);
  2172       __ delayed()->nop();
  2175     __ bind(Continue);
  2178   // change thread state
  2179   __ addi(AT, R0, _thread_in_Java);
  2180   __ sw(AT,  thread, in_bytes(JavaThread::thread_state_offset()));
  2181   __ bind(after_transition);
  2182   Label reguard;
  2183   Label reguard_done;
  2184   __ lw(AT, thread, in_bytes(JavaThread::stack_guard_state_offset()));
  2185   __ addi(AT, AT, -JavaThread::stack_guard_yellow_disabled);
  2186   __ beq(AT, R0, reguard);
  2187   __ delayed()->nop();
  2188   // slow path reguard  re-enters here
  2189   __ bind(reguard_done);
  2191   // Handle possible exception (will unlock if necessary)
  2193   // native result if any is live
  2195   // Unlock
  2196   Label slow_path_unlock;
  2197   Label unlock_done;
  2198   if (method->is_synchronized()) {
  2200     Label done;
  2202     // Get locked oop from the handle we passed to jni
  2203     __ ld( obj_reg, oop_handle_reg, 0);
  2204     if (UseBiasedLocking) {
  2205       __ biased_locking_exit(obj_reg, T8, done);
  2209     // Simple recursive lock?
  2211     __ ld(AT, FP, lock_slot_fp_offset);
  2212     __ beq(AT, R0, done);
  2213     __ delayed()->nop();
  2214     // Must save FSF if it is live now because cmpxchg must use it
  2215     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  2216       save_native_result(masm, ret_type, stack_slots);
  2219     //  get old displaced header
  2220     __ ld (T8, FP, lock_slot_fp_offset);
  2221     // get address of the stack lock
  2222     __ addi (c_rarg0, FP, lock_slot_fp_offset);
  2223     // Atomic swap old header if oop still contains the stack lock
  2224     __ cmpxchg(T8, Address(obj_reg, 0), c_rarg0);
  2226     __ beq(AT, R0, slow_path_unlock);
  2227     __ delayed()->nop();
  2228     // slow path re-enters here
  2229     __ bind(unlock_done);
  2230     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  2231       restore_native_result(masm, ret_type, stack_slots);
  2234     __ bind(done);
  2238     SkipIfEqual skip_if(masm, &DTraceMethodProbes, 0);
  2239     // Tell dtrace about this method exit
  2240     save_native_result(masm, ret_type, stack_slots);
  2241     int metadata_index = __ oop_recorder()->find_index( (method()));
  2242     RelocationHolder rspec = metadata_Relocation::spec(metadata_index);
  2243     __ relocate(rspec);
  2244     __ patchable_set48(AT, (long)(method()));
  2246     __ call_VM_leaf(
  2247          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
  2248          thread, AT);
  2249     restore_native_result(masm, ret_type, stack_slots);
  2252   // We can finally stop using that last_Java_frame we setup ages ago
  2254   __ reset_last_Java_frame(false);
  2256   // Unpack oop result, e.g. JNIHandles::resolve value.
  2257   if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
  2258     __ resolve_jobject(V0, thread, T9);
  2261   if (!is_critical_native) {
  2262     // reset handle block
  2263     __ ld(AT, thread, in_bytes(JavaThread::active_handles_offset()));
  2264     __ sw(R0, AT, JNIHandleBlock::top_offset_in_bytes());
  2267   if (!is_critical_native) {
  2268     // Any exception pending?
  2269     __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2270     __ bne(AT, R0, exception_pending);
  2271     __ delayed()->nop();
  2273   // no exception, we're almost done
  2275   // check that only result value is on FPU stack
  2276   __ verify_FPU(ret_type == T_FLOAT || ret_type == T_DOUBLE ? 1 : 0, "native_wrapper normal exit");
  2278   // Return
  2279 #ifndef OPT_THREAD
  2280   __ get_thread(TREG);
  2281 #endif
  2282   //__ ld_ptr(SP, TREG, in_bytes(JavaThread::last_Java_sp_offset()));
  2283   __ leave();
  2285   __ jr(RA);
  2286   __ delayed()->nop();
  2287   // Unexpected paths are out of line and go here
  2288   // Slow path locking & unlocking
  2289   if (method->is_synchronized()) {
  2291     // BEGIN Slow path lock
  2292     __ bind(slow_path_lock);
  2294     // protect the args we've loaded
  2295     save_args(masm, total_c_args, c_arg, out_regs);
  2297     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
  2298     // args are (oop obj, BasicLock* lock, JavaThread* thread)
  2300     __ move(A0, obj_reg);
  2301     __ move(A1, lock_reg);
  2302     __ move(A2, thread);
  2303     __ addi(SP, SP, - 3*wordSize);
  2305     __ move(AT, -(StackAlignmentInBytes));
  2306     __ move(S2, SP);     // use S2 as a sender SP holder
  2307     __ andr(SP, SP, AT); // align stack as required by ABI
  2309     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), relocInfo::runtime_call_type);
  2310     __ delayed()->nop();
  2311     __ move(SP, S2);
  2312     __ addi(SP, SP, 3*wordSize);
  2314     restore_args(masm, total_c_args, c_arg, out_regs);
  2316 #ifdef ASSERT
  2317     { Label L;
  2318       __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2319       __ beq(AT, R0, L);
  2320       __ delayed()->nop();
  2321       __ stop("no pending exception allowed on exit from monitorenter");
  2322       __ bind(L);
  2324 #endif
  2325     __ b(lock_done);
  2326     __ delayed()->nop();
  2327     // END Slow path lock
  2329     // BEGIN Slow path unlock
  2330     __ bind(slow_path_unlock);
  2332     // Slow path unlock
  2334     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  2335       save_native_result(masm, ret_type, stack_slots);
  2337     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
  2339     __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2340     __ push(AT);
  2341     __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));
  2343     __ move(AT, -(StackAlignmentInBytes));
  2344     __ move(S2, SP);     // use S2 as a sender SP holder
  2345     __ andr(SP, SP, AT); // align stack as required by ABI
  2347     // should be a peal
  2348     // +wordSize because of the push above
  2349     __ addi(A1, FP, lock_slot_fp_offset);
  2351     __ move(A0, obj_reg);
  2352     __ addi(SP,SP, -2*wordSize);
  2353     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C),
  2354         relocInfo::runtime_call_type);
  2355     __ delayed()->nop();
  2356     __ addi(SP, SP, 2*wordSize);
  2357     __ move(SP, S2);
  2358     //add for compressedoops
  2359     __ reinit_heapbase();
  2360 #ifdef ASSERT
  2362       Label L;
  2363       __ lw( AT, thread, in_bytes(Thread::pending_exception_offset()));
  2364       __ beq(AT, R0, L);
  2365       __ delayed()->nop();
  2366       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
  2367       __ bind(L);
  2369 #endif /* ASSERT */
  2371     __ pop(AT);
  2372     __ sd(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2373     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  2374       restore_native_result(masm, ret_type, stack_slots);
  2376     __ b(unlock_done);
  2377     __ delayed()->nop();
  2378     // END Slow path unlock
  2382   // SLOW PATH Reguard the stack if needed
  2384   __ bind(reguard);
  2385   save_native_result(masm, ret_type, stack_slots);
  2386   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages),
  2387       relocInfo::runtime_call_type);
  2388   __ delayed()->nop();
  2389   //add for compressedoops
  2390   __ reinit_heapbase();
  2391   restore_native_result(masm, ret_type, stack_slots);
  2392   __ b(reguard_done);
  2393   __ delayed()->nop();
  2395   // BEGIN EXCEPTION PROCESSING
  2396   if (!is_critical_native) {
  2397     // Forward  the exception
  2398     __ bind(exception_pending);
  2400     // remove possible return value from FPU register stack
  2401     __ empty_FPU_stack();
  2403     // pop our frame
  2404     // forward_exception_entry needs the return address on the stack
  2405     __ addiu(SP, FP, wordSize);
  2406     __ ld(FP, SP, (-1) * wordSize);
  2408     // and forward the exception
  2409     __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2410     __ delayed()->nop();
  2412   __ flush();
  2414   nmethod *nm = nmethod::new_native_nmethod(method,
  2415                                             compile_id,
  2416                                             masm->code(),
  2417                                             vep_offset,
  2418                                             frame_complete,
  2419                                             stack_slots / VMRegImpl::slots_per_word,
  2420                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
  2421                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
  2422                                             oop_maps);
  2424   if (is_critical_native) {
  2425     nm->set_lazy_critical_native(true);
  2428   return nm;
  2432 #ifdef HAVE_DTRACE_H
  2433 // ---------------------------------------------------------------------------
  2434 // Generate a dtrace nmethod for a given signature.  The method takes arguments
  2435 // in the Java compiled code convention, marshals them to the native
  2436 // abi and then leaves nops at the position you would expect to call a native
  2437 // function. When the probe is enabled the nops are replaced with a trap
  2438 // instruction that dtrace inserts and the trace will cause a notification
  2439 // to dtrace.
  2440 //
  2441 // The probes are only able to take primitive types and java/lang/String as
  2442 // arguments.  No other java types are allowed. Strings are converted to utf8
  2443 // strings so that from dtrace point of view java strings are converted to C
  2444 // strings. There is an arbitrary fixed limit on the total space that a method
  2445 // can use for converting the strings. (256 chars per string in the signature).
  2446 // So any java string larger than this is truncated.
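       //
       // For example (illustrative only): a probe for
       //   void log(String msg, long id, double v)
       // would be presented to dtrace roughly as (char* /* utf8 msg */, long id, long /* v */),
       // since strings become utf8 C strings and, as the signature conversion below
       // shows, doubles are passed as longs and floats as ints.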
  2448 static int  fp_offset[ConcreteRegisterImpl::number_of_registers] = { 0 };
  2449 static bool offsets_initialized = false;
  2451 static VMRegPair reg64_to_VMRegPair(Register r) {
  2452   VMRegPair ret;
  2453   if (wordSize == 8) {
  2454     ret.set2(r->as_VMReg());
  2455   } else {
  2456     ret.set_pair(r->successor()->as_VMReg(), r->as_VMReg());
  2458   return ret;
  2462 nmethod *SharedRuntime::generate_dtrace_nmethod(MacroAssembler *masm,
  2463                                                 methodHandle method) {
  2466   // generate_dtrace_nmethod is guarded by a mutex so we are sure to
  2467   // be single threaded in this method.
  2468   assert(AdapterHandlerLibrary_lock->owned_by_self(), "must be");
  2470   // Fill in the signature array, for the calling-convention call.
  2471   int total_args_passed = method->size_of_parameters();
  2473   BasicType* in_sig_bt  = NEW_RESOURCE_ARRAY(BasicType, total_args_passed);
  2474   VMRegPair  *in_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed);
  2476   // The signature we are going to use for the trap that dtrace will see
  2477   // java/lang/String is converted. We drop "this" and any other object
  2478   // is converted to NULL.  (A one-slot java/lang/Long object reference
  2479   // is converted to a two-slot long, which is why we double the allocation).
  2480   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_args_passed * 2);
  2481   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed * 2);
  2483   int i=0;
  2484   int total_strings = 0;
  2485   int first_arg_to_pass = 0;
  2486   int total_c_args = 0;
  2488   // Skip the receiver as dtrace doesn't want to see it
  2489   if( !method->is_static() ) {
  2490     in_sig_bt[i++] = T_OBJECT;
  2491     first_arg_to_pass = 1;
  2494   SignatureStream ss(method->signature());
  2495   for ( ; !ss.at_return_type(); ss.next()) {
  2496     BasicType bt = ss.type();
  2497     in_sig_bt[i++] = bt;  // Collect remaining bits of signature
  2498     out_sig_bt[total_c_args++] = bt;
  2499     if( bt == T_OBJECT) {
  2500       symbolOop s = ss.as_symbol_or_null();
  2501       if (s == vmSymbols::java_lang_String()) {
  2502         total_strings++;
  2503         out_sig_bt[total_c_args-1] = T_ADDRESS;
  2504       } else if (s == vmSymbols::java_lang_Boolean() ||
  2505                  s == vmSymbols::java_lang_Byte()) {
  2506         out_sig_bt[total_c_args-1] = T_BYTE;
  2507       } else if (s == vmSymbols::java_lang_Character() ||
  2508                  s == vmSymbols::java_lang_Short()) {
  2509         out_sig_bt[total_c_args-1] = T_SHORT;
  2510       } else if (s == vmSymbols::java_lang_Integer() ||
  2511                  s == vmSymbols::java_lang_Float()) {
  2512         out_sig_bt[total_c_args-1] = T_INT;
  2513       } else if (s == vmSymbols::java_lang_Long() ||
  2514                  s == vmSymbols::java_lang_Double()) {
  2515         out_sig_bt[total_c_args-1] = T_LONG;
  2516         out_sig_bt[total_c_args++] = T_VOID;
  2518     } else if ( bt == T_LONG || bt == T_DOUBLE ) {
  2519       in_sig_bt[i++] = T_VOID;   // Longs & doubles take 2 Java slots
  2520       // We convert double to long
  2521       out_sig_bt[total_c_args-1] = T_LONG;
  2522       out_sig_bt[total_c_args++] = T_VOID;
  2523     } else if ( bt == T_FLOAT) {
  2524       // We convert float to int
  2525       out_sig_bt[total_c_args-1] = T_INT;
  2529   assert(i==total_args_passed, "validly parsed signature");
  2531   // Now get the compiled-Java layout as input arguments
  2532   int comp_args_on_stack;
  2533   comp_args_on_stack = SharedRuntime::java_calling_convention(
  2534       in_sig_bt, in_regs, total_args_passed, false);
  2536   // We have received a description of where all the java args are located
  2537   // on entry to the wrapper. We need to convert these args to where
  2538   // a native (non-jni) function would expect them. To figure out
  2539   // where they go we convert the java signature to a C signature and remove
  2540   // T_VOID for any long/double we might have received.
  2543   // Now figure out where the args must be stored and how much stack space
  2544   // they require (neglecting out_preserve_stack_slots but including space for
  2545   // storing the first six register arguments). It's a bit odd; see int_stk_helper.
  2547   int out_arg_slots;
  2548   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  2550   // Calculate the total number of stack slots we will need.
  2552   // First count the abi requirement plus all of the outgoing args
  2553   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  2555   // Plus a temp for possible conversion of float/double/long register args
  2557   int conversion_temp = stack_slots;
  2558   stack_slots += 2;
  2561   // Now space for the string(s) we must convert
  2563   int string_locs = stack_slots;
  2564   stack_slots += total_strings *
  2565                    (max_dtrace_string_size / VMRegImpl::stack_slot_size);
  2567   // Ok The space we have allocated will look like:
  2568   //
  2569   //
  2570   // FP-> |                     |
  2571   //      |---------------------|
  2572   //      | string[n]           |
  2573   //      |---------------------| <- string_locs[n]
  2574   //      | string[n-1]         |
  2575   //      |---------------------| <- string_locs[n-1]
  2576   //      | ...                 |
  2577   //      | ...                 |
  2578   //      |---------------------| <- string_locs[1]
  2579   //      | string[0]           |
  2580   //      |---------------------| <- string_locs[0]
  2581   //      | temp                |
  2582   //      |---------------------| <- conversion_temp
  2583   //      | outbound memory     |
  2584   //      | based arguments     |
  2585   //      |                     |
  2586   //      |---------------------|
  2587   //      |                     |
  2588   // SP-> | out_preserved_slots |
  2589   //
  2590   //
  2592   // Now compute the actual number of stack words we need, rounding to make
  2593   // the stack properly aligned.
  2594   stack_slots = round_to(stack_slots, 4 * VMRegImpl::slots_per_word);
  2596   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
  2598   intptr_t start = (intptr_t)__ pc();
  2600   // First thing make an ic check to see if we should even be here
  2603     Label L;
  2604     const Register temp_reg = G3_scratch;
  2605     Address ic_miss(temp_reg, SharedRuntime::get_ic_miss_stub());
  2606     __ verify_oop(O0);
  2607     __ ld_ptr(O0, oopDesc::klass_offset_in_bytes(), temp_reg);
  2608     __ cmp(temp_reg, G5_inline_cache_reg);
  2609     __ brx(Assembler::equal, true, Assembler::pt, L);
  2610     __ delayed()->nop();
  2612     __ jump_to(ic_miss, 0);
  2613     __ delayed()->nop();
  2614     __ align(CodeEntryAlignment);
  2615     __ bind(L);
  2618   int vep_offset = ((intptr_t)__ pc()) - start;
  2621   // The instruction at the verified entry point must be 5 bytes or longer
  2622   // because it can be patched on the fly by make_non_entrant. The stack bang
  2623   // instruction fits that requirement.
  2625   // Generate stack overflow check before creating frame
  2626   __ generate_stack_overflow_check(stack_size);
  2628   assert(((intptr_t)__ pc() - start - vep_offset) >= 5,
  2629          "valid size for make_non_entrant");
  2631   // Generate a new frame for the wrapper.
  2632   __ save(SP, -stack_size, SP);
  2634   // Frame is now completed as far a size and linkage.
  2636   int frame_complete = ((intptr_t)__ pc()) - start;
  2638 #ifdef ASSERT
  2639   bool reg_destroyed[RegisterImpl::number_of_registers];
  2640   bool freg_destroyed[FloatRegisterImpl::number_of_registers];
  2641   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
  2642     reg_destroyed[r] = false;
  2644   for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) {
  2645     freg_destroyed[f] = false;
  2648 #endif /* ASSERT */
  2650   VMRegPair zero;
  2651   const Register g0 = G0; // without this we get a compiler warning (why??)
  2652   zero.set2(g0->as_VMReg());
  2654   int c_arg, j_arg;
  2656   Register conversion_off = noreg;
  2658   for (j_arg = first_arg_to_pass, c_arg = 0 ;
  2659        j_arg < total_args_passed ; j_arg++, c_arg++ ) {
  2661     VMRegPair src = in_regs[j_arg];
  2662     VMRegPair dst = out_regs[c_arg];
  2664 #ifdef ASSERT
  2665     if (src.first()->is_Register()) {
  2666       assert(!reg_destroyed[src.first()->as_Register()->encoding()], "ack!");
  2667     } else if (src.first()->is_FloatRegister()) {
  2668       assert(!freg_destroyed[src.first()->as_FloatRegister()->encoding(
  2669                                                FloatRegisterImpl::S)], "ack!");
  2671     if (dst.first()->is_Register()) {
  2672       reg_destroyed[dst.first()->as_Register()->encoding()] = true;
  2673     } else if (dst.first()->is_FloatRegister()) {
  2674       freg_destroyed[dst.first()->as_FloatRegister()->encoding(
  2675                                                  FloatRegisterImpl::S)] = true;
  2677 #endif /* ASSERT */
  2679     switch (in_sig_bt[j_arg]) {
  2680       case T_ARRAY:
  2681       case T_OBJECT:
  2683           if (out_sig_bt[c_arg] == T_BYTE  || out_sig_bt[c_arg] == T_SHORT ||
  2684               out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
  2685             // need to unbox a one-slot value
  2686             Register in_reg = L0;
  2687             Register tmp = L2;
  2688             if ( src.first()->is_reg() ) {
  2689               in_reg = src.first()->as_Register();
  2690             } else {
  2691               assert(Assembler::is_simm13(reg2offset(src.first()) + STACK_BIAS),
  2692                      "must be");
  2693               __ ld_ptr(FP, reg2offset(src.first()) + STACK_BIAS, in_reg);
  2695             // If the final destination is an acceptable register
  2696             if ( dst.first()->is_reg() ) {
  2697               if ( dst.is_single_phys_reg() || out_sig_bt[c_arg] != T_LONG ) {
  2698                 tmp = dst.first()->as_Register();
  2702             Label skipUnbox;
  2703             if ( wordSize == 4 && out_sig_bt[c_arg] == T_LONG ) {
  2704               __ mov(G0, tmp->successor());
  2706             __ br_null(in_reg, true, Assembler::pn, skipUnbox);
  2707             __ delayed()->mov(G0, tmp);
  2709             BasicType bt = out_sig_bt[c_arg];
  2710             int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
  2711             switch (bt) {
  2712                 case T_BYTE:
  2713                   __ ldub(in_reg, box_offset, tmp); break;
  2714                 case T_SHORT:
  2715                   __ lduh(in_reg, box_offset, tmp); break;
  2716                 case T_INT:
  2717                   __ ld(in_reg, box_offset, tmp); break;
  2718                 case T_LONG:
  2719                   __ ld_long(in_reg, box_offset, tmp); break;
  2720                 default: ShouldNotReachHere();
  2723             __ bind(skipUnbox);
  2724             // If tmp wasn't final destination copy to final destination
  2725             if (tmp == L2) {
  2726               VMRegPair tmp_as_VM = reg64_to_VMRegPair(L2);
  2727               if (out_sig_bt[c_arg] == T_LONG) {
  2728                 long_move(masm, tmp_as_VM, dst);
  2729               } else {
  2730                 move32_64(masm, tmp_as_VM, out_regs[c_arg]);
  2733             if (out_sig_bt[c_arg] == T_LONG) {
  2734               assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
  2735               ++c_arg; // move over the T_VOID to keep the loop indices in sync
  2737           } else if (out_sig_bt[c_arg] == T_ADDRESS) {
  2738             Register s =
  2739                 src.first()->is_reg() ? src.first()->as_Register() : L2;
  2740             Register d =
  2741                 dst.first()->is_reg() ? dst.first()->as_Register() : L2;
  2743             // We store the oop now so that the conversion pass can reach it
  2744             // while in the inner frame. This will be the only store if
  2745             // the oop is NULL.
  2746             if (s != L2) {
  2747               // src is register
  2748               if (d != L2) {
  2749                 // dst is register
  2750                 __ mov(s, d);
  2751               } else {
  2752                 assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2753                           STACK_BIAS), "must be");
  2754                 __ st_ptr(s, SP, reg2offset(dst.first()) + STACK_BIAS);
  2756             } else {
  2757                 // src not a register
  2758                 assert(Assembler::is_simm13(reg2offset(src.first()) +
  2759                            STACK_BIAS), "must be");
  2760                 __ ld_ptr(FP, reg2offset(src.first()) + STACK_BIAS, d);
  2761                 if (d == L2) {
  2762                   assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2763                              STACK_BIAS), "must be");
  2764                   __ st_ptr(d, SP, reg2offset(dst.first()) + STACK_BIAS);
  2767           } else if (out_sig_bt[c_arg] != T_VOID) {
  2768             // Convert the arg to NULL
  2769             if (dst.first()->is_reg()) {
  2770               __ mov(G0, dst.first()->as_Register());
  2771             } else {
  2772               assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2773                          STACK_BIAS), "must be");
  2774               __ st_ptr(G0, SP, reg2offset(dst.first()) + STACK_BIAS);
  2778         break;
  2779       case T_VOID:
  2780         break;
  2782       case T_FLOAT:
  2783         if (src.first()->is_stack()) {
  2784           // Stack to stack/reg is simple
  2785           move32_64(masm, src, dst);
  2786         } else {
  2787           if (dst.first()->is_reg()) {
  2788             // freg -> reg
  2789             int off =
  2790               STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2791             Register d = dst.first()->as_Register();
  2792             if (Assembler::is_simm13(off)) {
  2793               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2794                      SP, off);
  2795               __ ld(SP, off, d);
  2796             } else {
  2797               if (conversion_off == noreg) {
  2798                 __ set(off, L6);
  2799                 conversion_off = L6;
  2801               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2802                      SP, conversion_off);
  2803               __ ld(SP, conversion_off , d);
  2805           } else {
  2806             // freg -> mem
  2807             int off = STACK_BIAS + reg2offset(dst.first());
  2808             if (Assembler::is_simm13(off)) {
  2809               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2810                      SP, off);
  2811             } else {
  2812               if (conversion_off == noreg) {
  2813                 __ set(off, L6);
  2814                 conversion_off = L6;
  2816               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2817                      SP, conversion_off);
  2821         break;
  2823       case T_DOUBLE:
  2824         assert( j_arg + 1 < total_args_passed &&
  2825                 in_sig_bt[j_arg + 1] == T_VOID &&
  2826                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
  2827         if (src.first()->is_stack()) {
  2828           // Stack to stack/reg is simple
  2829           long_move(masm, src, dst);
  2830         } else {
  2831           Register d = dst.first()->is_reg() ? dst.first()->as_Register() : L2;
  2833           // Destination could be an odd reg on 32bit in which case
  2834           // we can't load direct to the destination.
  2836           if (!d->is_even() && wordSize == 4) {
  2837             d = L2;
  2839           int off = STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2840           if (Assembler::is_simm13(off)) {
  2841             __ stf(FloatRegisterImpl::D, src.first()->as_FloatRegister(),
  2842                    SP, off);
  2843             __ ld_long(SP, off, d);
  2844           } else {
  2845             if (conversion_off == noreg) {
  2846               __ set(off, L6);
  2847               conversion_off = L6;
  2849             __ stf(FloatRegisterImpl::D, src.first()->as_FloatRegister(),
  2850                    SP, conversion_off);
  2851             __ ld_long(SP, conversion_off, d);
  2853           if (d == L2) {
  2854             long_move(masm, reg64_to_VMRegPair(L2), dst);
  2857         break;
  2859       case T_LONG :
  2860         // 32bit can't do a split move of something like g1 -> O0, O1
  2861         // so use a memory temp
  2862         if (src.is_single_phys_reg() && wordSize == 4) {
  2863           Register tmp = L2;
  2864           if (dst.first()->is_reg() &&
  2865               (wordSize == 8 || dst.first()->as_Register()->is_even())) {
  2866             tmp = dst.first()->as_Register();
  2869           int off = STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2870           if (Assembler::is_simm13(off)) {
  2871             __ stx(src.first()->as_Register(), SP, off);
  2872             __ ld_long(SP, off, tmp);
  2873           } else {
  2874             if (conversion_off == noreg) {
  2875               __ set(off, L6);
  2876               conversion_off = L6;
  2878             __ stx(src.first()->as_Register(), SP, conversion_off);
  2879             __ ld_long(SP, conversion_off, tmp);
  2882           if (tmp == L2) {
  2883             long_move(masm, reg64_to_VMRegPair(L2), dst);
  2885         } else {
  2886           long_move(masm, src, dst);
  2888         break;
  2890       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  2892       default:
  2893         move32_64(masm, src, dst);
  2898   // If we have any strings we must store any register based arg to the stack
  2899   // This includes any still live xmm registers too.
  2901   if (total_strings > 0 ) {
  2903     // protect all the arg registers
  2904     __ save_frame(0);
  2905     __ mov(G2_thread, L7_thread_cache);
  2906     const Register L2_string_off = L2;
  2908     // Get first string offset
  2909     __ set(string_locs * VMRegImpl::stack_slot_size, L2_string_off);
  2911     for (c_arg = 0 ; c_arg < total_c_args ; c_arg++ ) {
  2912       if (out_sig_bt[c_arg] == T_ADDRESS) {
  2914         VMRegPair dst = out_regs[c_arg];
  2915         const Register d = dst.first()->is_reg() ?
  2916             dst.first()->as_Register()->after_save() : noreg;
  2918         // It's a string; the oop was already copied to the out arg
  2919         // position
  2920         if (d != noreg) {
  2921           __ mov(d, O0);
  2922         } else {
  2923           assert(Assembler::is_simm13(reg2offset(dst.first()) + STACK_BIAS),
  2924                  "must be");
  2925           __ ld_ptr(FP,  reg2offset(dst.first()) + STACK_BIAS, O0);
  2927         Label skip;
  2929         __ br_null(O0, false, Assembler::pn, skip);
  2930         __ delayed()->add(FP, L2_string_off, O1);
  2932         if (d != noreg) {
  2933           __ mov(O1, d);
  2934         } else {
  2935           assert(Assembler::is_simm13(reg2offset(dst.first()) + STACK_BIAS),
  2936                  "must be");
  2937           __ st_ptr(O1, FP,  reg2offset(dst.first()) + STACK_BIAS);
  2940         __ call(CAST_FROM_FN_PTR(address, SharedRuntime::get_utf),
  2941                 relocInfo::runtime_call_type);
  2942         __ delayed()->add(L2_string_off, max_dtrace_string_size, L2_string_off);
  2944         __ bind(skip);
  2949     __ mov(L7_thread_cache, G2_thread);
  2950     __ restore();
  2955   // Ok now we are done. Need to place the nop that dtrace wants in order to
  2956   // patch in the trap
  2958   int patch_offset = ((intptr_t)__ pc()) - start;
  2960   __ nop();
  2963   // Return
  2965   __ ret();
  2966   __ delayed()->restore();
  2968   __ flush();
  2970   nmethod *nm = nmethod::new_dtrace_nmethod(
  2971       method, masm->code(), vep_offset, patch_offset, frame_complete,
  2972       stack_slots / VMRegImpl::slots_per_word);
  2973   return nm;
  2977 #endif // HAVE_DTRACE_H
  2979 // this function returns the adjust size (in number of words) to a c2i adapter
  2980 // activation for use during deoptimization
  2981 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) {
  2982   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
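// Note (illustrative, added for clarity): with, say, 2 callee parameters,
// 5 callee locals and Interpreter::stackElementWords == 1 (the usual value
// on 64-bit ports), the adjustment would be (5 - 2) * 1 = 3 words.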
  2985 // "Top of Stack" slots that may be unused by the calling convention but must
  2986 // otherwise be preserved.
  2987 // On Intel these are not necessary and the value can be zero.
  2988 // On Sparc this describes the words reserved for storing a register window
  2989 // when an interrupt occurs.
  2990 uint SharedRuntime::out_preserve_stack_slots() {
  2991    return 0;
  2994 //------------------------------generate_deopt_blob----------------------------
  2995 // Ought to generate an ideal graph & compile, but here's some hand-written
  2996 // MIPS assembly instead.
  2997 void SharedRuntime::generate_deopt_blob() {
  2998   // allocate space for the code
  2999   ResourceMark rm;
  3000   // setup code generation tools
  3001   //CodeBuffer     buffer ("deopt_blob", 4000, 2048);
  3002   CodeBuffer     buffer ("deopt_blob", 8000, 2048);//aoqi FIXME for debug
  3003   MacroAssembler* masm  = new MacroAssembler( & buffer);
  3004   int frame_size_in_words;
  3005   OopMap* map = NULL;
  3006   // Account for the extra args we place on the stack
  3007   // by the time we call fetch_unroll_info
  3008   const int additional_words = 2; // deopt kind, thread
  3010   OopMapSet *oop_maps = new OopMapSet();
  3012   address start = __ pc();
  3013   Label cont;
  3014   // we use S3 for DeOpt reason register
  3015   Register reason = S3;
  3016   // use S6 for thread register
  3017   Register thread = TREG;
  3018   // use S7 for fetch_unroll_info returned UnrollBlock
  3019   Register unroll = S7;
  3020   // Prolog for non exception case!
  3021   // Correct the return address we were given.
  3022   //FIXME, return address is on the tos or Ra?
  3023   __ addi(RA, RA, - (NativeCall::return_address_offset_long));
  3024   // Save everything in sight.
  3025   map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3026   // Normal deoptimization
  3027   __ move(reason, Deoptimization::Unpack_deopt);
  3028   __ b(cont);
  3029   __ delayed()->nop();
  3031   int reexecute_offset = __ pc() - start;
  3033   // Reexecute case
  3034   // return address is the pc that describes what bci to re-execute at
  3036   // No need to update map as each call to save_live_registers will produce identical oopmap
  3037   (void) RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3038   __ move(reason, Deoptimization::Unpack_reexecute);
  3039   __ b(cont);
  3040   __ delayed()->nop();
  3042   int   exception_offset = __ pc() - start;
  3043   // Prolog for exception case
  3045   // all registers are dead at this entry point, except for V0 and
  3046   // V1 which contain the exception oop and exception pc
  3047   // respectively.  Set them in TLS and fall thru to the
  3048   // unpack_with_exception_in_tls entry point.
  3050   __ get_thread(thread);
  3051   __ st_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3052   __ st_ptr(V0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3053   int exception_in_tls_offset = __ pc() - start;
  3054   // new implementation because exception oop is now passed in JavaThread
  3056   // Prolog for exception case
  3057   // All registers must be preserved because they might be used by LinearScan
  3058   // Exception oop and throwing PC are passed in JavaThread
  3059   // tos: stack at point of call to method that threw the exception (i.e. only
  3060   // args are on the stack, no return address)
  3062   // Return address will be patched later with the throwing pc. The correct value is not
  3063   // available now because loading it from memory would destroy registers.
  3064   // Save everything in sight.
  3065   // No need to update map as each call to save_live_registers will produce identical oopmap
  3066   __ addi(RA, RA, - (NativeCall::return_address_offset_long));
  3067   (void) RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3069   // Now it is safe to overwrite any register
  3070   // store the correct deoptimization type
  3071   __ move(reason, Deoptimization::Unpack_exception);
  3072   // load throwing pc from JavaThread and patch it as the return address
  3073   // of the current frame. Then clear the field in JavaThread
  3074   __ get_thread(thread);
  3075   __ ld_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3076   __ st_ptr(V1, SP, RegisterSaver::raOffset() * wordSize); //save ra
  3077   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_pc_offset()));
  3080 #ifdef ASSERT
  3081   // verify that there is really an exception oop in JavaThread
  3082   __ ld_ptr(AT, thread, in_bytes(JavaThread::exception_oop_offset()));
  3083   __ verify_oop(AT);
  3084   // verify that there is no pending exception
  3085   Label no_pending_exception;
  3086   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3087   __ beq(AT, R0, no_pending_exception);
  3088   __ delayed()->nop();
  3089   __ stop("must not have pending exception here");
  3090   __ bind(no_pending_exception);
  3091 #endif
  3092   __ bind(cont);
  3093   // Compiled code leaves the floating point stack dirty, empty it.
  3094   __ empty_FPU_stack();
  3097   // Call C code.  Need thread and this frame, but NOT official VM entry
  3098   // crud.  We cannot block on this call, no GC can happen.
  3099 #ifndef OPT_THREAD
  3100   __ get_thread(thread);
  3101 #endif
  3103   __ move(A0, thread);
  3104   __ addi(SP, SP, -additional_words  * wordSize);
  3106   __ set_last_Java_frame(NOREG, NOREG, NULL);
  3108   // Call fetch_unroll_info().  Need thread and this frame, but NOT official VM entry - cannot block on
  3109   // this call, no GC can happen.  Call should capture return values.
  3111   __ relocate(relocInfo::internal_pc_type);
  3113     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 28;
  3114     __ patchable_set48(AT, save_pc);
  3116   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3118   __ call((address)Deoptimization::fetch_unroll_info);
  3119   //__ call(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info), relocInfo::runtime_call_type);
  3120   __ delayed()->nop();
  3121   oop_maps->add_gc_map(__ pc() - start, map);
  3122   __ addiu(SP, SP, additional_words * wordSize);
  3123   __ get_thread(thread);
  3124   __ reset_last_Java_frame(false);
  3126   // Load UnrollBlock into S7
  3127   __ move(unroll, V0);
  3130   // Move the unpack kind to a safe place in the UnrollBlock because
  3131   // we are very short of registers
  3133   Address unpack_kind(unroll, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes());
  3134   __ sw(reason, unpack_kind);
  3135   // save the unpack_kind value
  3136   // Retrieve the possible live values (return values)
  3137   // All callee save registers representing jvm state
  3138   // are now in the vframeArray.
  3140   Label noException;
  3141   __ move(AT, Deoptimization::Unpack_exception);
  3142   __ bne(AT, reason, noException);// Was exception pending?
  3143   __ delayed()->nop();
  3144   __ ld_ptr(V0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3145   __ ld_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3146   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_pc_offset()));
  3147   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3149   __ verify_oop(V0);
  3151   // Overwrite the result registers with the exception results.
  3152   __ st_ptr(V0, SP, RegisterSaver::v0Offset()*wordSize);
  3153   __ st_ptr(V1, SP, RegisterSaver::v1Offset()*wordSize);
  3155   __ bind(noException);
  3158   // Stack is back to only having register save data on the stack.
  3159   // Now restore the result registers. Everything else is either dead or captured
  3160   // in the vframeArray.
  3162   RegisterSaver::restore_result_registers(masm);
  3163   // All of the register save area has been popped off the stack. Only the
  3164   // return address remains.
  3165   // Pop all the frames we must move/replace.
  3166   // Frame picture (youngest to oldest)
  3167   // 1: self-frame (no frame link)
  3168   // 2: deopting frame  (no frame link)
  3169   // 3: caller of deopting frame (could be compiled/interpreted).
  3170   //
  3171   // Note: by leaving the return address of self-frame on the stack
  3172   // and using the size of frame 2 to adjust the stack
  3173   // when we are done the return to frame 3 will still be on the stack.
  3175   // register for the sender's sp
  3176   Register sender_sp = Rsender;
  3177   // register for frame pcs
  3178   Register pcs = T0;
  3179   // register for frame sizes
  3180   Register sizes = T1;
  3181   // register for frame count
  3182   Register count = T3;
  3184   // Pop deoptimized frame
  3185   __ lw(AT, unroll, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes());
  3186   __ add(SP, SP, AT);
  3187   // sp should be pointing at the return address to the caller (3)
  3189   // Load array of frame pcs into pcs
  3190   __ ld_ptr(pcs, unroll, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes());
  3191   __ addi(SP, SP, wordSize);  // trash the old pc
  3192   // Load array of frame sizes into sizes (T1)
  3193   __ ld_ptr(sizes, unroll, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes());
  3197   // Load count of frames into T3
  3198   __ lw(count, unroll, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes());
  3199   // Pick up the initial fp we should save
  3200   __ ld(FP, unroll,  Deoptimization::UnrollBlock::initial_info_offset_in_bytes());
  3201    // Now adjust the caller's stack to make up for the extra locals
  3202   // but record the original sp so that we can save it in the skeletal interpreter
  3203   // frame and the stack walking of interpreter_sender will get the unextended sp
  3204   // value and not the "real" sp value.
  3205   __ move(sender_sp, SP);
  3206   __ lw(AT, unroll, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes());
  3207   __ sub(SP, SP, AT);
  3209   // Push interpreter frames in a loop
  3210   //
  3211   //Loop:
  3212   //   0x000000555bd82d18: lw t2, 0x0(t1)           ; lw sizes[i]  <--- error lw->ld
  3213   //   0x000000555bd82d1c: ld at, 0x0(t0)           ; ld pcs[i]
  3214   //   0x000000555bd82d20: daddi t2, t2, 0xfffffff0 ; t2 -= 16
  3215   //   0x000000555bd82d24: daddi sp, sp, 0xfffffff0
  3216   //   0x000000555bd82d28: sd fp, 0x0(sp)           ; push fp
  3217   //   0x000000555bd82d2c: sd at, 0x8(sp)           ; push at
  3218   //   0x000000555bd82d30: dadd fp, sp, zero        ; fp <- sp
  3219   //   0x000000555bd82d34: dsub sp, sp, t2          ; sp -= t2
  3220   //   0x000000555bd82d38: sd zero, 0xfffffff0(fp)  ; __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3221   //   0x000000555bd82d3c: sd s4, 0xfffffff8(fp)    ; __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);
  3222   //   0x000000555bd82d40: dadd s4, sp, zero        ; move(sender_sp, SP);
  3223   //   0x000000555bd82d44: daddi t3, t3, 0xffffffff ; count --
  3224   //   0x000000555bd82d48: daddi t1, t1, 0x4        ; sizes += 4
  3225   //   0x000000555bd82d4c: bne t3, zero, 0x000000555bd82d18
  3226   //   0x000000555bd82d50: daddi t0, t0, 0x4        ; <--- error    t0 += 8
  3227   //
  3228   // pcs[0] = frame_pcs[0] = deopt_sender.raw_pc();
  3229   Label loop;
  3230   __ bind(loop);
  3231   __ ld(T2, sizes, 0);    // Load frame size
  3232   __ ld_ptr(AT, pcs, 0);           // save return address
  3233   __ addi(T2, T2, -2*wordSize);           // we'll push pc and fp, by hand
  3234   __ push2(AT, FP);
  3235   __ move(FP, SP);
  3236   __ sub(SP, SP, T2);       // Prolog!
  3237   // This value is corrected by layout_activation_impl
  3238   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3239   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);// Make it walkable
  3240   __ move(sender_sp, SP);  // pass to next frame
  3241   __ addi(count, count, -1);   // decrement counter
  3242   __ addi(sizes, sizes, wordSize);   // Bump array pointer (sizes)
  3243   __ bne(count, R0, loop);
  3244   __ delayed()->addi(pcs, pcs, wordSize);   // Bump array pointer (pcs)
  3245   __ ld(AT, pcs, 0);      // frame_pcs[number_of_frames] = Interpreter::deopt_entry(vtos, 0);
  3246   // Re-push self-frame
  3247   __ push2(AT, FP);
  3248   __ move(FP, SP);
  3249   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3250   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);
  3251   __ addi(SP, SP, -(frame_size_in_words - 2 - additional_words) * wordSize);
  3253   // Restore frame locals after moving the frame
  3254   __ sd(V0, SP, RegisterSaver::v0Offset() * wordSize);
  3255   __ sd(V1, SP, RegisterSaver::v1Offset() * wordSize);
  3256   __ sdc1(F0, SP, RegisterSaver::fpResultOffset()* wordSize);// Pop float stack and store in local
  3257   __ sdc1(F1, SP, (RegisterSaver::fpResultOffset() + 1) * wordSize);
  3260   // Call unpack_frames().  Need thread and this frame, but NOT official VM entry - cannot block on
  3261   // this call, no GC can happen.
  3262   __ move(A1, reason);  // exec_mode
  3263   __ get_thread(thread);
  3264   __ move(A0, thread);  // thread
  3265   __ addi(SP, SP, (-additional_words) *wordSize);
  3267   // set last_Java_sp, last_Java_fp
  3268   __ set_last_Java_frame(NOREG, FP, NULL);
  3270   __ move(AT, -(StackAlignmentInBytes));
  3271   __ andr(SP, SP, AT);   // Fix stack alignment as required by ABI
  3273   __ relocate(relocInfo::internal_pc_type);
  3275     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 28;
  3276     __ patchable_set48(AT, save_pc);
  3278   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3280   __ call(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames), relocInfo::runtime_call_type);
  3281   __ delayed()->nop();
  3282   // Revert SP alignment after call since we're going to do some SP relative addressing below
  3283   __ ld(SP, thread, in_bytes(JavaThread::last_Java_sp_offset()));
  3284   // Set an oopmap for the call site
  3285   oop_maps->add_gc_map(__ offset(), new OopMap( frame_size_in_words , 0));
  3287   __ push(V0);
  3289   __ get_thread(thread);
  3290   __ reset_last_Java_frame(true);
  3292   // Collect return values
  3293   __ ld(V0, SP, (RegisterSaver::v0Offset() + additional_words +1) * wordSize);
  3294   __ ld(V1, SP, (RegisterSaver::v1Offset() + additional_words +1) * wordSize);
  3295   __ ldc1(F0, SP, RegisterSaver::fpResultOffset()* wordSize);// Pop float stack and store in local
  3296   __ ldc1(F1, SP, (RegisterSaver::fpResultOffset() + 1) * wordSize);
  3297   //FIXME,
  3298   // Clear floating point stack before returning to interpreter
  3299   __ empty_FPU_stack();
  3300   //FIXME, we should consider about float and double
  3301   // Push a float or double return value if necessary.
  3302   __ leave();
  3304   // Jump to interpreter
  3305   __ jr(RA);
  3306   __ delayed()->nop();
  3308   masm->flush();
  3309   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  3310   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
  3313 #ifdef COMPILER2
  3315 //------------------------------generate_uncommon_trap_blob--------------------
  3316 // Ought to generate an ideal graph & compile, but here's some hand-written
  3317 // MIPS assembly instead.
  3318 void SharedRuntime::generate_uncommon_trap_blob() {
  3319   // allocate space for the code
  3320   ResourceMark rm;
  3321   // setup code generation tools
  3322   CodeBuffer  buffer ("uncommon_trap_blob", 512*80 , 512*40 );
  3323   MacroAssembler* masm = new MacroAssembler(&buffer);
  3325   enum frame_layout {
  3326     s0_off, s0_off2,
  3327     s1_off, s1_off2,
  3328     s2_off, s2_off2,
  3329     s3_off, s3_off2,
  3330     s4_off, s4_off2,
  3331     s5_off, s5_off2,
  3332     s6_off, s6_off2,
  3333     s7_off, s7_off2,
  3334     fp_off, fp_off2,
  3335     return_off, return_off2,    // slot for return address    sp + 9
  3336     framesize
  3337   };
  3338   assert(framesize % 4 == 0, "sp not 16-byte aligned");
  3340   address start = __ pc();
  3342   // Push self-frame.
  3343   __ daddiu(SP, SP, -framesize * BytesPerInt);
  3345   __ sd(RA, SP, return_off * BytesPerInt);
  3346   __ sd(FP, SP, fp_off * BytesPerInt);
  3348   // Save callee saved registers.  None for UseSSE=0,
  3349   // floats-only for UseSSE=1, and doubles for UseSSE=2.
  3350   __ sd(S0, SP, s0_off * BytesPerInt);
  3351   __ sd(S1, SP, s1_off * BytesPerInt);
  3352   __ sd(S2, SP, s2_off * BytesPerInt);
  3353   __ sd(S3, SP, s3_off * BytesPerInt);
  3354   __ sd(S4, SP, s4_off * BytesPerInt);
  3355   __ sd(S5, SP, s5_off * BytesPerInt);
  3356   __ sd(S6, SP, s6_off * BytesPerInt);
  3357   __ sd(S7, SP, s7_off * BytesPerInt);
  3359   __ daddi(FP, SP, fp_off * BytesPerInt);
  3361   // Clear the floating point exception stack
  3362   __ empty_FPU_stack();
  3364   Register thread = TREG;
  3366 #ifndef OPT_THREAD
  3367   __ get_thread(thread);
  3368 #endif
  3369   // set last_Java_sp
  3370   __ set_last_Java_frame(NOREG, FP, NULL);
  3371   __ relocate(relocInfo::internal_pc_type);
  3373     long save_pc = (long)__ pc() + 52;
  3374     __ patchable_set48(AT, (long)save_pc);
  3375     __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3377   // Call C code.  Need thread but NOT official VM entry
  3378   // crud.  We cannot block on this call, no GC can happen.  Call should
  3379   // capture callee-saved registers as well as return values.
  3380   __ move(A0, thread);
  3381   // argument already in T0
  3382   __ move(A1, T0);
  3383   __ patchable_call((address)Deoptimization::uncommon_trap);
  3385   // Set an oopmap for the call site
  3386   OopMapSet *oop_maps = new OopMapSet();
  3387   OopMap* map =  new OopMap( framesize, 0 );
  3389   map->set_callee_saved( VMRegImpl::stack2reg(s0_off    ),  S0->as_VMReg() );
  3390   map->set_callee_saved( VMRegImpl::stack2reg(s1_off    ),  S1->as_VMReg() );
  3391   map->set_callee_saved( VMRegImpl::stack2reg(s2_off    ),  S2->as_VMReg() );
  3392   map->set_callee_saved( VMRegImpl::stack2reg(s3_off    ),  S3->as_VMReg() );
  3393   map->set_callee_saved( VMRegImpl::stack2reg(s4_off    ),  S4->as_VMReg() );
  3394   map->set_callee_saved( VMRegImpl::stack2reg(s5_off    ),  S5->as_VMReg() );
  3395   map->set_callee_saved( VMRegImpl::stack2reg(s6_off    ),  S6->as_VMReg() );
  3396   map->set_callee_saved( VMRegImpl::stack2reg(s7_off    ),  S7->as_VMReg() );
  3398   //oop_maps->add_gc_map( __ offset(), true, map);
  3399   oop_maps->add_gc_map( __ offset(),  map);
  3401 #ifndef OPT_THREAD
  3402   __ get_thread(thread);
  3403 #endif
  3404   __ reset_last_Java_frame(false);
  3406   // Load UnrollBlock into S7
  3407   Register unroll = S7;
  3408   __ move(unroll, V0);
  3410   // Pop all the frames we must move/replace.
  3411   //
  3412   // Frame picture (youngest to oldest)
  3413   // 1: self-frame (no frame link)
  3414   // 2: deopting frame  (no frame link)
  3415   // 3: possible-i2c-adapter-frame
  3416   // 4: caller of deopting frame (could be compiled/interpreted; if interpreted
  3417   //    we will create a c2i adapter here)
  3419   __ daddiu(SP, SP, framesize * BytesPerInt);
  3421   // Pop deoptimized frame
  3422   __ lw(AT, unroll, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes());
  3423   __ dadd(SP, SP, AT);
  3425   // register for frame pcs
  3426   Register pcs = T8;
  3427   // register for frame sizes
  3428   Register sizes = T9;
  3429   // register for frame count
  3430   Register count = T3;
  3431   // register for the sender's sp
  3432   Register sender_sp = T1;
  3434   // sp should be pointing at the return address to the caller (4)
  3435   // Load array of frame pcs
  3436   __ ld(pcs, unroll, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes());
  3438   // Load array of frame sizes
  3439   __ ld(sizes, unroll, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes());
  3440   __ lwu(count, unroll, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes());
  3442   // Pick up the initial fp we should save
  3443   __ ld(FP, unroll, Deoptimization::UnrollBlock::initial_info_offset_in_bytes());
  3444   // Now adjust the caller's stack to make up for the extra locals
  3445   // but record the original sp so that we can save it in the skeletal interpreter
  3446   // frame and the stack walking of interpreter_sender will get the unextended sp
  3447   // value and not the "real" sp value.
  3449   __ move(sender_sp, SP);
  3450   __ lw(AT, unroll, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes());
  3451   __ dsub(SP, SP, AT);
  3452   // Push interpreter frames in a loop
  3453   Label loop;
  3454   __ bind(loop);
  3455   __ ld(T2, sizes, 0);          // Load frame size
  3456   __ ld(AT, pcs, 0);           // save return address
  3457   __ daddi(T2, T2, -2*wordSize);           // we'll push pc and fp, by hand
  3458   __ push2(AT, FP);
  3459   __ move(FP, SP);
  3460   __ dsub(SP, SP, T2);                   // Prolog!
  3461   // This value is corrected by layout_activation_impl
  3462   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3463   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);// Make it walkable
  3464   __ move(sender_sp, SP);       // pass to next frame
  3465   __ daddi(count, count, -1);    // decrement counter
  3466   __ daddi(sizes, sizes, wordSize);     // Bump array pointer (sizes)
  3467   __ addi(pcs, pcs, wordSize);      // Bump array pointer (pcs)
  3468   __ bne(count, R0, loop);
  3469   __ delayed()->nop();      // branch delay slot (pcs already bumped above)
  3471   __ ld(RA, pcs, 0);
  3473   // Re-push self-frame
  3474   __ daddi(SP, SP, - 2 * wordSize);      // save old & set new FP
  3475   __ sd(FP, SP, 0 * wordSize);          // save final return address
  3476   __ sd(RA, SP, 1 * wordSize);
  3477   __ move(FP, SP);
  3478   __ daddi(SP, SP, -(framesize / 2 - 2) * wordSize);
  3480   // set last_Java_sp, last_Java_fp
  3481   __ set_last_Java_frame(NOREG, FP, NULL);
  3483   __ move(AT, -(StackAlignmentInBytes));
  3484   __ andr(SP, SP, AT);   // Fix stack alignment as required by ABI
  3486   __ relocate(relocInfo::internal_pc_type);
  3488     long save_pc = (long)__ pc() + 52;
  3489     __ patchable_set48(AT, (long)save_pc);
  3491   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3493   // Call C code.  Need thread but NOT official VM entry
  3494   // crud.  We cannot block on this call, no GC can happen.  Call should
  3495   // restore return values to their stack-slots with the new SP.
  3496   __ move(A0, thread);
  3497   __ move(A1, Deoptimization::Unpack_uncommon_trap);
  3498   __ patchable_call((address)Deoptimization::unpack_frames);
  3499   // Set an oopmap for the call site
  3500   oop_maps->add_gc_map( __ offset(),  new OopMap( framesize, 0 ) );
  3502   __ reset_last_Java_frame(true);
  3504   // Pop self-frame.
  3505   __ leave();     // Epilog!
  3507   // Jump to interpreter
  3508   __ jr(RA);
  3509   __ delayed()->nop();
  3510   // -------------
  3511   // make sure all code is generated
  3512   masm->flush();
  3514   _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, framesize / 2);
  3517 #endif // COMPILER2
  3519 //------------------------------generate_handler_blob-------------------
  3520 //
  3521 // Generate a special Compile2Runtime blob that saves all registers, and sets
  3522 // up an OopMap and calls safepoint code to stop the compiled code for
  3523 // a safepoint.
  3524 //
  3525 // This blob is jumped to (via a breakpoint and the signal handler) from a
  3526 // safepoint in compiled code.
  3528 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int pool_type) {
  3530   // Account for thread arg in our frame
  3531   const int additional_words = 0;
  3532   int frame_size_in_words;
  3534   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  3536   ResourceMark rm;
  3537   OopMapSet *oop_maps = new OopMapSet();
  3538   OopMap* map;
  3540   // allocate space for the code
  3541   // setup code generation tools
  3542   CodeBuffer  buffer ("handler_blob", 2048, 512);
  3543   MacroAssembler* masm = new MacroAssembler( &buffer);
  3545   const Register thread = TREG;
  3546   address start   = __ pc();
  3547   address call_pc = NULL;
  3548   bool cause_return = (pool_type == POLL_AT_RETURN);
  3549   bool save_vectors = (pool_type == POLL_AT_VECTOR_LOOP);
  3551   // If cause_return is true we are at a poll_return, and RA holds the return
  3552   // address back into the nmethod that is at the safepoint. We can leave this
  3553   // return address in RA and effectively complete the return and safepoint in
  3554   // the caller.
  3555   // Otherwise we load the exception pc into RA.
  3556   __ push(thread);
  3557 #ifndef OPT_THREAD
  3558   __ get_thread(thread);
  3559 #endif
  3561   if(!cause_return) {
  3562     __ ld_ptr(RA, Address(thread, JavaThread::saved_exception_pc_offset()));
  3565   __ pop(thread);
  3566   map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, save_vectors);
  3568 #ifndef OPT_THREAD
  3569   __ get_thread(thread);
  3570 #endif
  3571   // The following is basically a call_VM. However, we need the precise
  3572   // address of the call in order to generate an oopmap. Hence, we do all the
  3573   // work ourselves.
  3575   __ move(A0, thread);
  3576   __ set_last_Java_frame(NOREG, NOREG, NULL);
  3579   // do the call
  3580   __ call(call_ptr);
  3581   __ delayed()->nop();
  3583   // Set an oopmap for the call site.  This oopmap will map all
  3584   // oop-registers and debug-info registers as callee-saved.  This
  3585   // will allow deoptimization at this safepoint to find all possible
  3586   // debug-info recordings, as well as let GC find all oops.
  3587   oop_maps->add_gc_map(__ offset(),  map);
  3589   Label noException;
  3591   // Clear last_Java_sp again
  3592   __ reset_last_Java_frame(false);
  3594   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3595   __ beq(AT, R0, noException);
  3596   __ delayed()->nop();
  3598   // Exception pending
  3600   RegisterSaver::restore_live_registers(masm, save_vectors);
  3601   // forward_exception_entry needs the return address on the stack
  3602   __ push(RA);
  3603   __ patchable_jump((address)StubRoutines::forward_exception_entry());
  3605   // No exception case
  3606   __ bind(noException);
  3607   // Normal exit, register restoring and exit
  3608   RegisterSaver::restore_live_registers(masm, save_vectors);
  3609   __ jr(RA);
  3610   __ delayed()->nop();
  3612   masm->flush();
  3614   // Fill-out other meta info
  3615   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
  3618 //
  3619 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
  3620 //
  3621 // Generate a stub that calls into vm to find out the proper destination
  3622 // of a java call. All the argument registers are live at this point
  3623 // but since this is generic code we don't know what they are and the caller
  3624 // must do any gc of the args.
  3625 //
  3626 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  3627   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  3629   // allocate space for the code
  3630   ResourceMark rm;
  3632   //CodeBuffer buffer(name, 1000, 512);
  3633   //FIXME. aoqi. code_size
  3634   CodeBuffer buffer(name, 2000, 2048);
  3635   MacroAssembler* masm  = new MacroAssembler(&buffer);
  3637   int frame_size_words;
  3638   //we put the thread in A0
  3640   OopMapSet *oop_maps = new OopMapSet();
  3641   OopMap* map = NULL;
  3643   int start = __ offset();
  3644   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_words);
  3647   int frame_complete = __ offset();
  3649   const Register thread = T8;
  3650   __ get_thread(thread);
  3652   __ move(A0, thread);
  3653   __ set_last_Java_frame(noreg, FP, NULL);
  3654   //align the stack before invoke native
  3655   __ move(AT, -(StackAlignmentInBytes));
  3656   __ andr(SP, SP, AT);
  3657   __ relocate(relocInfo::internal_pc_type);
  3659     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 24 + 1 * BytesPerInstWord;
  3660     __ patchable_set48(AT, save_pc);
  3662   __ sd(AT, thread, in_bytes(JavaThread::last_Java_pc_offset()));
  3664   __ call(destination);
  3665   __ delayed()->nop();
  3667   // Set an oopmap for the call site.
  3668   // We need this not only for callee-saved registers, but also for volatile
  3669   // registers that the compiler might be keeping live across a safepoint.
  3670   oop_maps->add_gc_map( __ offset() - start, map);
  3671   // V0 contains the address we are going to jump to assuming no exception got installed
  3672   __ get_thread(thread);
  3673   __ ld_ptr(SP, thread, in_bytes(JavaThread::last_Java_sp_offset()));
  3674   // clear last_Java_sp
  3675   __ reset_last_Java_frame(true);
  3676   // check for pending exceptions
  3677   Label pending;
  3678   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3679   __ bne(AT, R0, pending);
  3680   __ delayed()->nop();
  3681   // get the returned Method*
  3682   //FIXME, does MIPS need this?
  3683   __ get_vm_result_2(Rmethod, thread);  // Refer to OpenJDK8
  3684   __ st_ptr(Rmethod, SP, RegisterSaver::methodOffset() * wordSize);
  3685   __ st_ptr(V0, SP, RegisterSaver::v0Offset() * wordSize);
  3686   RegisterSaver::restore_live_registers(masm);
  3688   // We are back to the original state on entry and ready to go to the callee method.
  3689   __ jr(V0);
  3690   __ delayed()->nop();
  3691   // Pending exception after the safepoint
  3693   __ bind(pending);
  3695   RegisterSaver::restore_live_registers(masm);
  3697   // exception pending => remove activation and forward to exception handler
  3698   //forward_exception_entry need return address on the stack
  3699   __ push(RA);
  3700   __ get_thread(thread);
  3701   __ st_ptr(R0, thread, in_bytes(JavaThread::vm_result_offset()));
  3702   __ ld_ptr(V0, thread, in_bytes(Thread::pending_exception_offset()));
  3703   __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  3704   __ delayed()->nop();
  3705   //
  3706   // make sure all code is generated
  3707   masm->flush();
  3709   RuntimeStub* tmp= RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_words, oop_maps, true);
  3710   return tmp;
  3713 extern "C" int SpinPause() {return 0;}
  3716 //------------------------------Montgomery multiplication------------------------
  3717 //
  3719 // Subtract 0:b from carry:a.  Return carry.
  3720 static unsigned long
  3721 sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  3722   long borrow = 0, t = 0;
  3723   unsigned long tmp0, tmp1;
  3724   __asm__ __volatile__ (
  3725     "0:                                            \n"
  3726     "ld      %[tmp0],     0(%[a])                  \n"
  3727     "ld      %[tmp1],     0(%[b])                  \n"
  3728     "sltu    %[t],        %[tmp0],     %[borrow]   \n"
  3729     "dsubu   %[tmp0],     %[tmp0],     %[borrow]   \n"
  3730     "sltu    %[borrow],   %[tmp0],     %[tmp1]     \n"
  3731     "or      %[borrow],   %[borrow],   %[t]        \n"
  3732     "dsubu   %[tmp0],     %[tmp0],     %[tmp1]     \n"
  3733     "sd      %[tmp0],     0(%[a])                  \n"
  3734     "daddiu  %[a],        %[a],         8          \n"
  3735     "daddiu  %[b],        %[b],         8          \n"
  3736     "daddiu  %[len],      %[len],      -1          \n"
  3737     "bgtz    %[len],      0b                       \n"
  3738     "dsubu   %[tmp0],     %[carry],    %[borrow]   \n"
  3739     : [len]"+r"(len), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [borrow]"+r"(borrow), [a]"+r"(a), [b]"+r"(b), [t]"+r"(t)
  3740     : [carry]"r"(carry)
  3741     : "memory"
  3742   );
  3743   return tmp0;
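// Note (illustrative, added for clarity): a portable C++ sketch of what the
// inline assembly above computes; the name sub_reference is hypothetical and
// the block is not compiled or used.
#if 0
static unsigned long sub_reference(unsigned long a[], unsigned long b[],
                                   unsigned long carry, long len) {
  unsigned long borrow = 0;
  for (long i = 0; i < len; i++) {
    unsigned long ai = a[i];
    unsigned long underflow = (ai < borrow);   // borrow out of (ai - borrow)
    unsigned long t = ai - borrow;
    borrow = underflow | (t < b[i]);           // borrow out of (t - b[i])
    a[i] = t - b[i];
  }
  return carry - borrow;                       // top word of the result
}
#endif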
  3746 // Multiply (unsigned) Long A by Long B, accumulating the double-
  3747 // length result into the accumulator formed of t0, t1, and t2.
  3748 inline void MACC(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  3749   unsigned long hi, lo, carry = 0, t = 0;
  3750   __asm__ __volatile__(
  3751     "dmultu  %[A],        %[B]                     \n"
  3752     "mfhi    %[hi]                                 \n"
  3753     "mflo    %[lo]                                 \n"
  3754     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3755     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3756     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3757     "sltu    %[t],        %[t1],       %[carry]    \n"
  3758     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3759     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3760     "or      %[carry],    %[carry],    %[t]        \n"
  3761     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3762     : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
  3763     : [A]"r"(A), [B]"r"(B)
  3765   );
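// Note (illustrative, added for clarity): a portable sketch of the MACC
// accumulation above, assuming GCC's unsigned __int128 extension; the name
// MACC_reference is hypothetical and the block is not compiled or used.
// It computes t2:t1:t0 += (128-bit product of A and B).
#if 0
static inline void MACC_reference(unsigned long A, unsigned long B,
                                  unsigned long &t0, unsigned long &t1,
                                  unsigned long &t2) {
  unsigned __int128 p = (unsigned __int128)A * B;
  unsigned long lo = (unsigned long)p;
  unsigned long hi = (unsigned long)(p >> 64);
  unsigned long c = ((t0 += lo) < lo);   // carry out of the low word
  unsigned long d = ((t1 += c)  < c);    // propagate it into the middle word
  d |= ((t1 += hi) < hi);                // add the high word, collect the carry
  t2 += d;                               // final carry into the top word
}
#endif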
  3768 // As above, but add twice the double-length result into the
  3769 // accumulator.
  3770 inline void MACC2(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  3771   unsigned long hi, lo, carry = 0, t = 0;
  3772   __asm__ __volatile__(
  3773     "dmultu  %[A],        %[B]                     \n"
  3774     "mfhi    %[hi]                                 \n"
  3775     "mflo    %[lo]                                 \n"
  3776     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3777     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3778     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3779     "sltu    %[t],        %[t1],       %[carry]    \n"
  3780     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3781     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3782     "or      %[carry],    %[carry],    %[t]        \n"
  3783     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3784     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3785     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3786     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3787     "sltu    %[t],        %[t1],       %[carry]    \n"
  3788     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3789     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3790     "or      %[carry],    %[carry],    %[t]        \n"
  3791     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3792     : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
  3793     : [A]"r"(A), [B]"r"(B)
  3795   );
  3798 // Fast Montgomery multiplication.  The derivation of the algorithm is
  3799 // in  A Cryptographic Library for the Motorola DSP56000,
  3800 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
  3802 static void __attribute__((noinline))
  3803 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
  3804                     unsigned long m[], unsigned long inv, int len) {
  3805   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  3806   int i;
  3808   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  3810   for (i = 0; i < len; i++) {
  3811     int j;
  3812     for (j = 0; j < i; j++) {
  3813       MACC(a[j], b[i-j], t0, t1, t2);
  3814       MACC(m[j], n[i-j], t0, t1, t2);
  3816     MACC(a[i], b[0], t0, t1, t2);
  3817     m[i] = t0 * inv;
  3818     MACC(m[i], n[0], t0, t1, t2);
  3820     assert(t0 == 0, "broken Montgomery multiply");
  3822     t0 = t1; t1 = t2; t2 = 0;
  3825   for (i = len; i < 2*len; i++) {
  3826     int j;
  3827     for (j = i-len+1; j < len; j++) {
  3828       MACC(a[j], b[i-j], t0, t1, t2);
  3829       MACC(m[j], n[i-j], t0, t1, t2);
  3831     m[i-len] = t0;
  3832     t0 = t1; t1 = t2; t2 = 0;
  3835   while (t0)
  3836     t0 = sub(m, n, t0, len);
  3839 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
  3840 // multiplies so it should be up to 25% faster than Montgomery
  3841 // multiplication.  However, its loop control is more complex and it
  3842 // may actually run slower on some machines.
  3844 static void __attribute__((noinline))
  3845 montgomery_square(unsigned long a[], unsigned long n[],
  3846                   unsigned long m[], unsigned long inv, int len) {
  3847   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  3848   int i;
  3850   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  3852   for (i = 0; i < len; i++) {
  3853     int j;
  3854     int end = (i+1)/2;
  3855     for (j = 0; j < end; j++) {
  3856       MACC2(a[j], a[i-j], t0, t1, t2);
  3857       MACC(m[j], n[i-j], t0, t1, t2);
  3859     if ((i & 1) == 0) {
  3860       MACC(a[j], a[j], t0, t1, t2);
  3862     for (; j < i; j++) {
  3863       MACC(m[j], n[i-j], t0, t1, t2);
  3865     m[i] = t0 * inv;
  3866     MACC(m[i], n[0], t0, t1, t2);
  3868     assert(t0 == 0, "broken Montgomery square");
  3870     t0 = t1; t1 = t2; t2 = 0;
  3873   for (i = len; i < 2*len; i++) {
  3874     int start = i-len+1;
  3875     int end = start + (len - start)/2;
  3876     int j;
  3877     for (j = start; j < end; j++) {
  3878       MACC2(a[j], a[i-j], t0, t1, t2);
  3879       MACC(m[j], n[i-j], t0, t1, t2);
  3881     if ((i & 1) == 0) {
  3882       MACC(a[j], a[j], t0, t1, t2);
  3884     for (; j < len; j++) {
  3885       MACC(m[j], n[i-j], t0, t1, t2);
  3887     m[i-len] = t0;
  3888     t0 = t1; t1 = t2; t2 = 0;
  3891   while (t0)
  3892     t0 = sub(m, n, t0, len);
  3895 // Swap words in a longword.
  3896 static unsigned long swap(unsigned long x) {
  3897   return (x << 32) | (x >> 32);
  3900 // Copy len longwords from s to d, word-swapping as we go.  The
  3901 // destination array is reversed.
  3902 static void reverse_words(unsigned long *s, unsigned long *d, int len) {
  3903   d += len;
  3904   while(len-- > 0) {
  3905     d--;
  3906     *d = swap(*s);
  3907     s++;
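// Note (illustrative, added for clarity): reverse_words both reverses the
// array order and swaps the 32-bit halves of each word. For example, with
// len == 2 and s == { 0x0000000200000001, 0x0000000400000003 }, d becomes
// { 0x0000000300000004, 0x0000000100000002 }.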
  3911 // The threshold at which squaring is advantageous was determined
  3912 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
  3913 // Doesn't seem to be relevant for MIPS64 so we use the same value.
  3914 #define MONTGOMERY_SQUARING_THRESHOLD 64
  3916 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
  3917                                         jint len, jlong inv,
  3918                                         jint *m_ints) {
  3919   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  3920   int longwords = len/2;
  3922   // Make very sure we don't use so much space that the stack might
  3923   // overflow.  512 jints corresponds to a 16384-bit integer and
  3924   // will use here a total of 8k bytes of stack space.
  3925   int total_allocation = longwords * sizeof (unsigned long) * 4;
  3926   guarantee(total_allocation <= 8192, "must be");
  3927   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  3929   // Local scratch arrays
  3930   unsigned long
  3931     *a = scratch + 0 * longwords,
  3932     *b = scratch + 1 * longwords,
  3933     *n = scratch + 2 * longwords,
  3934     *m = scratch + 3 * longwords;
  3936   reverse_words((unsigned long *)a_ints, a, longwords);
  3937   reverse_words((unsigned long *)b_ints, b, longwords);
  3938   reverse_words((unsigned long *)n_ints, n, longwords);
  3940   ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
  3942   reverse_words(m, (unsigned long *)m_ints, longwords);
  3945 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
  3946                                       jint len, jlong inv,
  3947                                       jint *m_ints) {
  3948   assert(len % 2 == 0, "array length in montgomery_square must be even");
  3949   int longwords = len/2;
  3951   // Make very sure we don't use so much space that the stack might
  3952   // overflow.  512 jints corresponds to a 16384-bit integer and
  3953   // will use here a total of 6k bytes of stack space.
  3954   int total_allocation = longwords * sizeof (unsigned long) * 3;
  3955   guarantee(total_allocation <= 8192, "must be");
  3956   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  3958   // Local scratch arrays
  3959   unsigned long
  3960     *a = scratch + 0 * longwords,
  3961     *n = scratch + 1 * longwords,
  3962     *m = scratch + 2 * longwords;
  3964   reverse_words((unsigned long *)a_ints, a, longwords);
  3965   reverse_words((unsigned long *)n_ints, n, longwords);
  3967   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
  3968     ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
  3969   } else {
  3970     ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
  3973   reverse_words(m, (unsigned long *)m_ints, longwords);
