src/cpu/mips/vm/sharedRuntime_mips_64.cpp

author:      huangjia
date:        Fri, 27 Sep 2019 11:31:13 +0800
changeset:   9705:0b27fc8adf1b
parent:      9645:ac996ba07f9d
child:       9759:8c71022cf5f3
permissions: -rw-r--r--

#10071 MIPS Port of 8176100: [REDO][REDO] G1 Needs pre barrier on dereference of weak JNI handles
Summary: fixes crashes in runtime/jni/CallWithJNIWeak/test.sh and runtime/jni/ReturnJNIWeak/test.sh
Reviewed-by: aoqi

     1 /*
     2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2015, 2019, Loongson Technology. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/macroAssembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "code/debugInfoRec.hpp"
    30 #include "code/icBuffer.hpp"
    31 #include "code/vtableStubs.hpp"
    32 #include "interpreter/interpreter.hpp"
    33 #include "oops/compiledICHolder.hpp"
    34 #include "prims/jvmtiRedefineClassesTrace.hpp"
    35 #include "runtime/sharedRuntime.hpp"
    36 #include "runtime/vframeArray.hpp"
    37 #include "vmreg_mips.inline.hpp"
    38 #ifdef COMPILER1
    39 #include "c1/c1_Runtime1.hpp"
    40 #endif
    41 #ifdef COMPILER2
    42 #include "opto/runtime.hpp"
    43 #endif
    45 #include <alloca.h>
    47 #define __ masm->
    49 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
    51 class RegisterSaver {
    52   enum { FPU_regs_live = 32 };
    53   // Capture info about frame layout
    54   enum layout {
    55 #define DEF_LAYOUT_OFFS(regname)  regname ## _off,  regname ## H_off,
    56     DEF_LAYOUT_OFFS(for_16_bytes_aligned)
    57     DEF_LAYOUT_OFFS(fpr0)
    58     DEF_LAYOUT_OFFS(fpr1)
    59     DEF_LAYOUT_OFFS(fpr2)
    60     DEF_LAYOUT_OFFS(fpr3)
    61     DEF_LAYOUT_OFFS(fpr4)
    62     DEF_LAYOUT_OFFS(fpr5)
    63     DEF_LAYOUT_OFFS(fpr6)
    64     DEF_LAYOUT_OFFS(fpr7)
    65     DEF_LAYOUT_OFFS(fpr8)
    66     DEF_LAYOUT_OFFS(fpr9)
    67     DEF_LAYOUT_OFFS(fpr10)
    68     DEF_LAYOUT_OFFS(fpr11)
    69     DEF_LAYOUT_OFFS(fpr12)
    70     DEF_LAYOUT_OFFS(fpr13)
    71     DEF_LAYOUT_OFFS(fpr14)
    72     DEF_LAYOUT_OFFS(fpr15)
    73     DEF_LAYOUT_OFFS(fpr16)
    74     DEF_LAYOUT_OFFS(fpr17)
    75     DEF_LAYOUT_OFFS(fpr18)
    76     DEF_LAYOUT_OFFS(fpr19)
    77     DEF_LAYOUT_OFFS(fpr20)
    78     DEF_LAYOUT_OFFS(fpr21)
    79     DEF_LAYOUT_OFFS(fpr22)
    80     DEF_LAYOUT_OFFS(fpr23)
    81     DEF_LAYOUT_OFFS(fpr24)
    82     DEF_LAYOUT_OFFS(fpr25)
    83     DEF_LAYOUT_OFFS(fpr26)
    84     DEF_LAYOUT_OFFS(fpr27)
    85     DEF_LAYOUT_OFFS(fpr28)
    86     DEF_LAYOUT_OFFS(fpr29)
    87     DEF_LAYOUT_OFFS(fpr30)
    88     DEF_LAYOUT_OFFS(fpr31)
    90     DEF_LAYOUT_OFFS(v0)
    91     DEF_LAYOUT_OFFS(v1)
    92     DEF_LAYOUT_OFFS(a0)
    93     DEF_LAYOUT_OFFS(a1)
    94     DEF_LAYOUT_OFFS(a2)
    95     DEF_LAYOUT_OFFS(a3)
    96     DEF_LAYOUT_OFFS(a4)
    97     DEF_LAYOUT_OFFS(a5)
    98     DEF_LAYOUT_OFFS(a6)
    99     DEF_LAYOUT_OFFS(a7)
   100     DEF_LAYOUT_OFFS(t0)
   101     DEF_LAYOUT_OFFS(t1)
   102     DEF_LAYOUT_OFFS(t2)
   103     DEF_LAYOUT_OFFS(t3)
   104     DEF_LAYOUT_OFFS(s0)
   105     DEF_LAYOUT_OFFS(s1)
   106     DEF_LAYOUT_OFFS(s2)
   107     DEF_LAYOUT_OFFS(s3)
   108     DEF_LAYOUT_OFFS(s4)
   109     DEF_LAYOUT_OFFS(s5)
   110     DEF_LAYOUT_OFFS(s6)
   111     DEF_LAYOUT_OFFS(s7)
   112     DEF_LAYOUT_OFFS(t8)
   113     DEF_LAYOUT_OFFS(t9)
   115     DEF_LAYOUT_OFFS(gp)
   116     DEF_LAYOUT_OFFS(fp)
   117     DEF_LAYOUT_OFFS(return)
   118     reg_save_size
   119   };
   121   public:
    123   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
   124   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
   125   static int raOffset(void) { return return_off / 2; }
   126   //Rmethod
   127   static int methodOffset(void) { return s3_off / 2; }
   129   static int v0Offset(void) { return v0_off / 2; }
   130   static int v1Offset(void) { return v1_off / 2; }
   132   static int fpResultOffset(void) { return fpr0_off / 2; }
    134   // During deoptimization only the result registers need to be restored;
    135   // all the other values have already been extracted.
   136   static void restore_result_registers(MacroAssembler* masm);
   137 };
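        // Note on the layout above: each DEF_LAYOUT_OFFS(regname) defines two
        // consecutive 4-byte (jint) slot indices, regname_off and regnameH_off,
        // i.e. one 64-bit save slot per register.  The byte offset of a register
        // within the save area is therefore regname_off * jintSize, and the
        // *Offset() accessors divide a slot index by 2 to yield an offset in
        // 64-bit words.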
    139 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
   141   // Always make the frame size 16-byte aligned
   142   int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
   143                                      reg_save_size*BytesPerInt, 16);
   144   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
   145   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
   146   // The caller will allocate additional_frame_words
   147   int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
   148   // CodeBlob frame size is in words.
   149   int frame_size_in_words = frame_size_in_bytes / wordSize;
   150   *total_frame_words = frame_size_in_words;
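        // Illustrative numbers (a sketch, assuming BytesPerInt == 4 and wordSize == 8
        // on this 64-bit port): the 60 layout entries above make reg_save_size == 120
        // jint slots, so with additional_frame_words == 0 the frame is
        // round_to(120 * 4, 16) == 480 bytes == 120 slots == 60 words.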
   152   // save registers
   154   __ daddiu(SP, SP, - reg_save_size * jintSize);
   156   __ sdc1(F0, SP, fpr0_off * jintSize); __ sdc1(F1, SP, fpr1_off * jintSize);
   157   __ sdc1(F2, SP, fpr2_off * jintSize); __ sdc1(F3, SP, fpr3_off * jintSize);
   158   __ sdc1(F4, SP, fpr4_off * jintSize); __ sdc1(F5, SP, fpr5_off * jintSize);
   159   __ sdc1(F6, SP, fpr6_off * jintSize);  __ sdc1(F7, SP, fpr7_off * jintSize);
   160   __ sdc1(F8, SP, fpr8_off * jintSize);  __ sdc1(F9, SP, fpr9_off * jintSize);
   161   __ sdc1(F10, SP, fpr10_off * jintSize);  __ sdc1(F11, SP, fpr11_off * jintSize);
   162   __ sdc1(F12, SP, fpr12_off * jintSize);  __ sdc1(F13, SP, fpr13_off * jintSize);
   163   __ sdc1(F14, SP, fpr14_off * jintSize);  __ sdc1(F15, SP, fpr15_off * jintSize);
   164   __ sdc1(F16, SP, fpr16_off * jintSize);  __ sdc1(F17, SP, fpr17_off * jintSize);
   165   __ sdc1(F18, SP, fpr18_off * jintSize);  __ sdc1(F19, SP, fpr19_off * jintSize);
   166   __ sdc1(F20, SP, fpr20_off * jintSize);  __ sdc1(F21, SP, fpr21_off * jintSize);
   167   __ sdc1(F22, SP, fpr22_off * jintSize);  __ sdc1(F23, SP, fpr23_off * jintSize);
   168   __ sdc1(F24, SP, fpr24_off * jintSize);  __ sdc1(F25, SP, fpr25_off * jintSize);
   169   __ sdc1(F26, SP, fpr26_off * jintSize);  __ sdc1(F27, SP, fpr27_off * jintSize);
   170   __ sdc1(F28, SP, fpr28_off * jintSize);  __ sdc1(F29, SP, fpr29_off * jintSize);
   171   __ sdc1(F30, SP, fpr30_off * jintSize);  __ sdc1(F31, SP, fpr31_off * jintSize);
   172   __ sd(V0, SP, v0_off * jintSize);  __ sd(V1, SP, v1_off * jintSize);
   173   __ sd(A0, SP, a0_off * jintSize);  __ sd(A1, SP, a1_off * jintSize);
   174   __ sd(A2, SP, a2_off * jintSize);  __ sd(A3, SP, a3_off * jintSize);
   175   __ sd(A4, SP, a4_off * jintSize);  __ sd(A5, SP, a5_off * jintSize);
   176   __ sd(A6, SP, a6_off * jintSize);  __ sd(A7, SP, a7_off * jintSize);
   177   __ sd(T0, SP, t0_off * jintSize);
   178   __ sd(T1, SP, t1_off * jintSize);
   179   __ sd(T2, SP, t2_off * jintSize);
   180   __ sd(T3, SP, t3_off * jintSize);
   181   __ sd(S0, SP, s0_off * jintSize);
   182   __ sd(S1, SP, s1_off * jintSize);
   183   __ sd(S2, SP, s2_off * jintSize);
   184   __ sd(S3, SP, s3_off * jintSize);
   185   __ sd(S4, SP, s4_off * jintSize);
   186   __ sd(S5, SP, s5_off * jintSize);
   187   __ sd(S6, SP, s6_off * jintSize);
   188   __ sd(S7, SP, s7_off * jintSize);
   190   __ sd(T8, SP, t8_off * jintSize);
   191   __ sd(T9, SP, t9_off * jintSize);
   193   __ sd(GP, SP, gp_off * jintSize);
   194   __ sd(FP, SP, fp_off * jintSize);
   195   __ sd(RA, SP, return_off * jintSize);
   196   __ daddi(FP, SP, fp_off * jintSize);
   198   OopMapSet *oop_maps = new OopMapSet();
   199   //OopMap* map =  new OopMap( frame_words, 0 );
   200   OopMap* map =  new OopMap( frame_size_in_slots, 0 );
   203 //#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_words)
   204 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
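        // STACK_OFFSET biases a save-area slot index by additional_frame_slots so that
        // the OopMap entries are expressed relative to the SP of the completed frame,
        // i.e. after the caller has pushed its additional_frame_words.  The
        // commented-out variant above used a word count where a slot count is needed.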
   205   map->set_callee_saved(STACK_OFFSET( v0_off), V0->as_VMReg());
   206   map->set_callee_saved(STACK_OFFSET( v1_off), V1->as_VMReg());
   207   map->set_callee_saved(STACK_OFFSET( a0_off), A0->as_VMReg());
   208   map->set_callee_saved(STACK_OFFSET( a1_off), A1->as_VMReg());
   209   map->set_callee_saved(STACK_OFFSET( a2_off), A2->as_VMReg());
   210   map->set_callee_saved(STACK_OFFSET( a3_off), A3->as_VMReg());
   211   map->set_callee_saved(STACK_OFFSET( a4_off), A4->as_VMReg());
   212   map->set_callee_saved(STACK_OFFSET( a5_off), A5->as_VMReg());
   213   map->set_callee_saved(STACK_OFFSET( a6_off), A6->as_VMReg());
   214   map->set_callee_saved(STACK_OFFSET( a7_off), A7->as_VMReg());
   215   map->set_callee_saved(STACK_OFFSET( t0_off), T0->as_VMReg());
   216   map->set_callee_saved(STACK_OFFSET( t1_off), T1->as_VMReg());
   217   map->set_callee_saved(STACK_OFFSET( t2_off), T2->as_VMReg());
   218   map->set_callee_saved(STACK_OFFSET( t3_off), T3->as_VMReg());
   219   map->set_callee_saved(STACK_OFFSET( s0_off), S0->as_VMReg());
   220   map->set_callee_saved(STACK_OFFSET( s1_off), S1->as_VMReg());
   221   map->set_callee_saved(STACK_OFFSET( s2_off), S2->as_VMReg());
   222   map->set_callee_saved(STACK_OFFSET( s3_off), S3->as_VMReg());
   223   map->set_callee_saved(STACK_OFFSET( s4_off), S4->as_VMReg());
   224   map->set_callee_saved(STACK_OFFSET( s5_off), S5->as_VMReg());
   225   map->set_callee_saved(STACK_OFFSET( s6_off), S6->as_VMReg());
   226   map->set_callee_saved(STACK_OFFSET( s7_off), S7->as_VMReg());
   227   map->set_callee_saved(STACK_OFFSET( t8_off), T8->as_VMReg());
   228   map->set_callee_saved(STACK_OFFSET( t9_off), T9->as_VMReg());
   229   map->set_callee_saved(STACK_OFFSET( gp_off), GP->as_VMReg());
   230   map->set_callee_saved(STACK_OFFSET( fp_off), FP->as_VMReg());
   231   map->set_callee_saved(STACK_OFFSET( return_off), RA->as_VMReg());
   233   map->set_callee_saved(STACK_OFFSET( fpr0_off), F0->as_VMReg());
   234   map->set_callee_saved(STACK_OFFSET( fpr1_off), F1->as_VMReg());
   235   map->set_callee_saved(STACK_OFFSET( fpr2_off), F2->as_VMReg());
   236   map->set_callee_saved(STACK_OFFSET( fpr3_off), F3->as_VMReg());
   237   map->set_callee_saved(STACK_OFFSET( fpr4_off), F4->as_VMReg());
   238   map->set_callee_saved(STACK_OFFSET( fpr5_off), F5->as_VMReg());
   239   map->set_callee_saved(STACK_OFFSET( fpr6_off), F6->as_VMReg());
   240   map->set_callee_saved(STACK_OFFSET( fpr7_off), F7->as_VMReg());
   241   map->set_callee_saved(STACK_OFFSET( fpr8_off), F8->as_VMReg());
   242   map->set_callee_saved(STACK_OFFSET( fpr9_off), F9->as_VMReg());
   243   map->set_callee_saved(STACK_OFFSET( fpr10_off), F10->as_VMReg());
   244   map->set_callee_saved(STACK_OFFSET( fpr11_off), F11->as_VMReg());
   245   map->set_callee_saved(STACK_OFFSET( fpr12_off), F12->as_VMReg());
   246   map->set_callee_saved(STACK_OFFSET( fpr13_off), F13->as_VMReg());
   247   map->set_callee_saved(STACK_OFFSET( fpr14_off), F14->as_VMReg());
   248   map->set_callee_saved(STACK_OFFSET( fpr15_off), F15->as_VMReg());
   249   map->set_callee_saved(STACK_OFFSET( fpr16_off), F16->as_VMReg());
   250   map->set_callee_saved(STACK_OFFSET( fpr17_off), F17->as_VMReg());
   251   map->set_callee_saved(STACK_OFFSET( fpr18_off), F18->as_VMReg());
   252   map->set_callee_saved(STACK_OFFSET( fpr19_off), F19->as_VMReg());
   253   map->set_callee_saved(STACK_OFFSET( fpr20_off), F20->as_VMReg());
   254   map->set_callee_saved(STACK_OFFSET( fpr21_off), F21->as_VMReg());
   255   map->set_callee_saved(STACK_OFFSET( fpr22_off), F22->as_VMReg());
   256   map->set_callee_saved(STACK_OFFSET( fpr23_off), F23->as_VMReg());
   257   map->set_callee_saved(STACK_OFFSET( fpr24_off), F24->as_VMReg());
   258   map->set_callee_saved(STACK_OFFSET( fpr25_off), F25->as_VMReg());
   259   map->set_callee_saved(STACK_OFFSET( fpr26_off), F26->as_VMReg());
   260   map->set_callee_saved(STACK_OFFSET( fpr27_off), F27->as_VMReg());
   261   map->set_callee_saved(STACK_OFFSET( fpr28_off), F28->as_VMReg());
   262   map->set_callee_saved(STACK_OFFSET( fpr29_off), F29->as_VMReg());
   263   map->set_callee_saved(STACK_OFFSET( fpr30_off), F30->as_VMReg());
   264   map->set_callee_saved(STACK_OFFSET( fpr31_off), F31->as_VMReg());
   266 #undef STACK_OFFSET
   267   return map;
   268 }
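        // Typical pairing (a sketch, not taken from a particular caller): a runtime
        // stub brackets its call into the VM with this pair, e.g.
        //   int frame_size_in_words;
        //   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
        //   ... set up arguments, call into the VM, record 'map' at the call pc ...
        //   RegisterSaver::restore_live_registers(masm);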
   271 // Pop the current frame and restore all the registers that we
   272 // saved.
   273 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   274   __ ldc1(F0, SP, fpr0_off * jintSize); __ ldc1(F1, SP, fpr1_off * jintSize);
   275   __ ldc1(F2, SP, fpr2_off * jintSize); __ ldc1(F3, SP, fpr3_off * jintSize);
   276   __ ldc1(F4, SP, fpr4_off * jintSize); __ ldc1(F5, SP, fpr5_off * jintSize);
   277   __ ldc1(F6, SP, fpr6_off * jintSize);  __ ldc1(F7, SP, fpr7_off * jintSize);
   278   __ ldc1(F8, SP, fpr8_off * jintSize);  __ ldc1(F9, SP, fpr9_off * jintSize);
   279   __ ldc1(F10, SP, fpr10_off * jintSize);  __ ldc1(F11, SP, fpr11_off * jintSize);
   280   __ ldc1(F12, SP, fpr12_off * jintSize);  __ ldc1(F13, SP, fpr13_off * jintSize);
   281   __ ldc1(F14, SP, fpr14_off * jintSize);  __ ldc1(F15, SP, fpr15_off * jintSize);
   282   __ ldc1(F16, SP, fpr16_off * jintSize);  __ ldc1(F17, SP, fpr17_off * jintSize);
   283   __ ldc1(F18, SP, fpr18_off * jintSize);  __ ldc1(F19, SP, fpr19_off * jintSize);
   284   __ ldc1(F20, SP, fpr20_off * jintSize);  __ ldc1(F21, SP, fpr21_off * jintSize);
   285   __ ldc1(F22, SP, fpr22_off * jintSize);  __ ldc1(F23, SP, fpr23_off * jintSize);
   286   __ ldc1(F24, SP, fpr24_off * jintSize);  __ ldc1(F25, SP, fpr25_off * jintSize);
   287   __ ldc1(F26, SP, fpr26_off * jintSize);  __ ldc1(F27, SP, fpr27_off * jintSize);
   288   __ ldc1(F28, SP, fpr28_off * jintSize);  __ ldc1(F29, SP, fpr29_off * jintSize);
   289   __ ldc1(F30, SP, fpr30_off * jintSize);  __ ldc1(F31, SP, fpr31_off * jintSize);
   291   __ ld(V0, SP, v0_off * jintSize);  __ ld(V1, SP, v1_off * jintSize);
   292   __ ld(A0, SP, a0_off * jintSize);  __ ld(A1, SP, a1_off * jintSize);
   293   __ ld(A2, SP, a2_off * jintSize);  __ ld(A3, SP, a3_off * jintSize);
   294   __ ld(A4, SP, a4_off * jintSize);  __ ld(A5, SP, a5_off * jintSize);
   295   __ ld(A6, SP, a6_off * jintSize);  __ ld(A7, SP, a7_off * jintSize);
   296   __ ld(T0, SP, t0_off * jintSize);
   297   __ ld(T1, SP, t1_off * jintSize);
   298   __ ld(T2, SP, t2_off * jintSize);
   299   __ ld(T3, SP, t3_off * jintSize);
   300   __ ld(S0, SP, s0_off * jintSize);
   301   __ ld(S1, SP, s1_off * jintSize);
   302   __ ld(S2, SP, s2_off * jintSize);
   303   __ ld(S3, SP, s3_off * jintSize);
   304   __ ld(S4, SP, s4_off * jintSize);
   305   __ ld(S5, SP, s5_off * jintSize);
   306   __ ld(S6, SP, s6_off * jintSize);
   307   __ ld(S7, SP, s7_off * jintSize);
   309   __ ld(T8, SP, t8_off * jintSize);
   310   __ ld(T9, SP, t9_off * jintSize);
   312   __ ld(GP, SP, gp_off * jintSize);
   313   __ ld(FP, SP, fp_off * jintSize);
   314   __ ld(RA, SP, return_off * jintSize);
   316   __ addiu(SP, SP, reg_save_size * jintSize);
   317 }
   319 // Pop the current frame and restore the registers that might be holding
   320 // a result.
   321 // FIXME, if the result is float?
   322 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
   324   // Just restore result register. Only used by deoptimization. By
    325   // now any callee save register that needs to be restored to a c2
   326   // caller of the deoptee has been extracted into the vframeArray
   327   // and will be stuffed into the c2i adapter we create for later
   328   // restoration so only result registers need to be restored here.
   330   __ ld(V0, SP, v0_off * jintSize);
   331   __ ld(V1, SP, v1_off * jintSize);
   332   __ addiu(SP, SP, return_off * jintSize);
   333 }
    335 // Is the vector's size (in bytes) bigger than the size saved by default?
    336 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
   337 bool SharedRuntime::is_wide_vector(int size) {
   338   return size > 16;
   339 }
   341 // The java_calling_convention describes stack locations as ideal slots on
   342 // a frame with no abi restrictions. Since we must observe abi restrictions
   343 // (like the placement of the register window) the slots must be biased by
   344 // the following value.
   346 static int reg2offset_in(VMReg r) {
   347   // Account for saved fp and return address
   348   // This should really be in_preserve_stack_slots
   349   return (r->reg2stack() + 2 * VMRegImpl::slots_per_word) * VMRegImpl::stack_slot_size;  // + 2 * VMRegImpl::stack_slot_size);
   350 }
   352 static int reg2offset_out(VMReg r) {
   353   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
   354 }
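        // Worked example (a sketch, assuming VMRegImpl::slots_per_word == 2 and
        // VMRegImpl::stack_slot_size == 4 on this 64-bit port): an incoming argument
        // in stack slot 0 maps to byte offset (0 + 2*2) * 4 == 16 off FP, just above
        // the saved fp and return address; slot 2 maps to offset 24, and so on.
        // reg2offset_out() only adds out_preserve_stack_slots() because outgoing
        // arguments are addressed off SP.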
   356 // ---------------------------------------------------------------------------
   357 // Read the array of BasicTypes from a signature, and compute where the
   358 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
   359 // quantities.  Values less than SharedInfo::stack0 are registers, those above
   360 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
   361 // as framesizes are fixed.
   362 // VMRegImpl::stack0 refers to the first slot 0(sp).
    363 // and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher.  Register
    364 // values up to RegisterImpl::number_of_registers are the 32-bit
    365 // integer registers.
    367 // Pass the first nine oop/int/long args in registers T0, A0 - A7.
    368 // Pass the first eight float/double args in registers F12 - F19.
    369 // A single positional counter is shared between the integer and the
    370 // floating-point argument registers; any remaining args go on the stack.
   372 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
   373 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
   374 // units regardless of build.
   377 // ---------------------------------------------------------------------------
   378 // The compiled Java calling convention.
    379 // Pass the first nine oop/int/long args in registers T0, A0 - A7.
    380 // Pass the first eight float/double args in registers F12 - F19.
    381 // A single positional counter is shared between the integer and the
    382 // floating-point argument registers; any remaining args go on the stack.
   384 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
   385                                            VMRegPair *regs,
   386                                            int total_args_passed,
   387                                            int is_outgoing) {
   389   // Create the mapping between argument positions and
   390   // registers.
   391   //static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
   392   static const Register INT_ArgReg[Argument::n_register_parameters + 1] = {
   393     T0, A0, A1, A2, A3, A4, A5, A6, A7
   394   };
   395   //static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
   396   static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters] = {
   397     F12, F13, F14, F15, F16, F17, F18, F19
   398   };
   401   uint args = 0;
   402   uint stk_args = 0; // inc by 2 each time
   404   for (int i = 0; i < total_args_passed; i++) {
   405     switch (sig_bt[i]) {
   406     case T_VOID:
   407       // halves of T_LONG or T_DOUBLE
   408       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
   409       regs[i].set_bad();
   410       break;
   411     case T_BOOLEAN:
   412     case T_CHAR:
   413     case T_BYTE:
   414     case T_SHORT:
   415     case T_INT:
   416       if (args < Argument::n_register_parameters) {
   417         regs[i].set1(INT_ArgReg[args++]->as_VMReg());
   418       } else {
   419         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   420         stk_args += 2;
   421       }
   422       break;
   423     case T_LONG:
   424       assert(sig_bt[i + 1] == T_VOID, "expecting half");
   425       // fall through
   426     case T_OBJECT:
   427     case T_ARRAY:
   428     case T_ADDRESS:
   429       if (args < Argument::n_register_parameters) {
   430         regs[i].set2(INT_ArgReg[args++]->as_VMReg());
   431       } else {
   432         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   433         stk_args += 2;
   434       }
   435       break;
   436     case T_FLOAT:
   437       if (args < Argument::n_float_register_parameters) {
   438         regs[i].set1(FP_ArgReg[args++]->as_VMReg());
   439       } else {
   440         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   441         stk_args += 2;
   442       }
   443       break;
   444     case T_DOUBLE:
   445       assert(sig_bt[i + 1] == T_VOID, "expecting half");
   446       if (args < Argument::n_float_register_parameters) {
   447         regs[i].set2(FP_ArgReg[args++]->as_VMReg());
   448       } else {
   449         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   450         stk_args += 2;
   451       }
   452       break;
   453     default:
   454       ShouldNotReachHere();
   455       break;
   456     }
   457   }
   459   return round_to(stk_args, 2);
   460 }
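        // Illustrative walk-through (a sketch, assuming the register arrays above,
        // i.e. nine int and eight float argument registers): for a signature
        // (int, long, float) the loop assigns
        //   int   -> T0   (args: 0 -> 1)
        //   long  -> A0   (args: 1 -> 2; the trailing T_VOID half is set_bad)
        //   float -> F14  (args: 2 -> 3)
        // Because one positional counter is shared between INT_ArgReg and FP_ArgReg,
        // a float's register index depends on how many arguments precede it overall.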
   462 // Helper class mostly to avoid passing masm everywhere, and handle store
   463 // displacement overflow logic for LP64
   464 class AdapterGenerator {
   465   MacroAssembler *masm;
   466 #ifdef _LP64
   467   Register Rdisp;
   468   void set_Rdisp(Register r)  { Rdisp = r; }
   469 #endif // _LP64
   471   void patch_callers_callsite();
   473   // base+st_off points to top of argument
   474   int arg_offset(const int st_off) { return st_off; }
   475   int next_arg_offset(const int st_off) {
   476     return st_off - Interpreter::stackElementSize;
   477   }
   479 #ifdef _LP64
   480   // On _LP64 argument slot values are loaded first into a register
   481   // because they might not fit into displacement.
   482   Register arg_slot(const int st_off);
   483   Register next_arg_slot(const int st_off);
   484 #else
   485   int arg_slot(const int st_off)      { return arg_offset(st_off); }
   486   int next_arg_slot(const int st_off) { return next_arg_offset(st_off); }
   487 #endif // _LP64
   489   // Stores long into offset pointed to by base
   490   void store_c2i_long(Register r, Register base,
   491                       const int st_off, bool is_stack);
   492   void store_c2i_object(Register r, Register base,
   493                         const int st_off);
   494   void store_c2i_int(Register r, Register base,
   495                      const int st_off);
   496   void store_c2i_double(VMReg r_2,
   497                         VMReg r_1, Register base, const int st_off);
   498   void store_c2i_float(FloatRegister f, Register base,
   499                        const int st_off);
   501  public:
   502   //void tag_stack(const BasicType sig, int st_off);
   503   void gen_c2i_adapter(int total_args_passed,
   504                               // VMReg max_arg,
   505                               int comp_args_on_stack, // VMRegStackSlots
   506                               const BasicType *sig_bt,
   507                               const VMRegPair *regs,
   508                               Label& skip_fixup);
   509   void gen_i2c_adapter(int total_args_passed,
   510                               // VMReg max_arg,
   511                               int comp_args_on_stack, // VMRegStackSlots
   512                               const BasicType *sig_bt,
   513                               const VMRegPair *regs);
   515   AdapterGenerator(MacroAssembler *_masm) : masm(_masm) {}
   516 };
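        // In short: gen_c2i_adapter() takes arguments laid out in the compiled
        // (register-based) convention and spills them into the interpreter's
        // stack-element layout before jumping to the method's interpreter entry;
        // gen_i2c_adapter() does the reverse, loading interpreter stack elements
        // into the compiled convention and jumping to the entry loaded from
        // Method::from_compiled_offset().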
   519 // Patch the callers callsite with entry to compiled code if it exists.
   520 void AdapterGenerator::patch_callers_callsite() {
   521   Label L;
   522   __ verify_oop(Rmethod);
   523   __ ld_ptr(AT, Rmethod, in_bytes(Method::code_offset()));
   524   __ beq(AT, R0, L);
   525   __ delayed()->nop();
   526   // Schedule the branch target address early.
   527   // Call into the VM to patch the caller, then jump to compiled callee
   528   // V0 isn't live so capture return address while we easily can
   529   __ move(V0, RA);
   531   __ pushad();
   532 #ifdef COMPILER2
   533   // C2 may leave the stack dirty if not in SSE2+ mode
   534   __ empty_FPU_stack();
   535 #endif
   537   // VM needs caller's callsite
   538   // VM needs target method
   540   __ move(A0, Rmethod);
   541   __ move(A1, V0);
   542   // we should preserve the return address
   543   __ verify_oop(Rmethod);
   544   __ move(S0, SP);
   545   __ move(AT, -(StackAlignmentInBytes));   // align the stack
   546   __ andr(SP, SP, AT);
   547   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite),
   548           relocInfo::runtime_call_type);
   550   __ delayed()->nop();
   551   __ move(SP, S0);
   552   __ popad();
   553   __ bind(L);
   554 }
   556 #ifdef _LP64
   557 Register AdapterGenerator::arg_slot(const int st_off) {
   558   Unimplemented();
   559 }
   561 Register AdapterGenerator::next_arg_slot(const int st_off){
   562   Unimplemented();
   563 }
   564 #endif // _LP64
   566 // Stores long into offset pointed to by base
   567 void AdapterGenerator::store_c2i_long(Register r, Register base,
   568                                       const int st_off, bool is_stack) {
   569   Unimplemented();
   570 }
   572 void AdapterGenerator::store_c2i_object(Register r, Register base,
   573                                         const int st_off) {
   574   Unimplemented();
   575 }
   577 void AdapterGenerator::store_c2i_int(Register r, Register base,
   578                                      const int st_off) {
   579   Unimplemented();
   580 }
   582 // Stores into offset pointed to by base
   583 void AdapterGenerator::store_c2i_double(VMReg r_2,
   584                       VMReg r_1, Register base, const int st_off) {
   585   Unimplemented();
   586 }
   588 void AdapterGenerator::store_c2i_float(FloatRegister f, Register base,
   589                                        const int st_off) {
   590   Unimplemented();
   591 }
   593 void AdapterGenerator::gen_c2i_adapter(
   594                             int total_args_passed,
   595                             // VMReg max_arg,
   596                             int comp_args_on_stack, // VMRegStackSlots
   597                             const BasicType *sig_bt,
   598                             const VMRegPair *regs,
   599                             Label& skip_fixup) {
   601   // Before we get into the guts of the C2I adapter, see if we should be here
   602   // at all.  We've come from compiled code and are attempting to jump to the
   603   // interpreter, which means the caller made a static call to get here
   604   // (vcalls always get a compiled target if there is one).  Check for a
   605   // compiled target.  If there is one, we need to patch the caller's call.
   606   // However we will run interpreted if we come thru here. The next pass
   607   // thru the call site will run compiled. If we ran compiled here then
    608   // we can (theoretically) do endless i2c->c2i->i2c transitions during
   609   // deopt/uncommon trap cycles. If we always go interpreted here then
   610   // we can have at most one and don't need to play any tricks to keep
   611   // from endlessly growing the stack.
   612   //
   613   // Actually if we detected that we had an i2c->c2i transition here we
   614   // ought to be able to reset the world back to the state of the interpreted
   615   // call and not bother building another interpreter arg area. We don't
   616   // do that at this point.
   618   patch_callers_callsite();
   620   __ bind(skip_fixup);
   622 #ifdef COMPILER2
   623   __ empty_FPU_stack();
   624 #endif
    625   // This is for native?
    626   // Since all args are passed on the stack,
    627   // total_args_passed * Interpreter::stackElementSize is the
    628   // space we need.
   629   int extraspace = total_args_passed * Interpreter::stackElementSize;
   631   // stack is aligned, keep it that way
   632   extraspace = round_to(extraspace, 2*wordSize);
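        // Example (a sketch, assuming Interpreter::stackElementSize == 8 on this
        // 64-bit port): 4 arguments reserve 4 * 8 == 32 bytes, already a multiple of
        // 2*wordSize == 16, so the round_to above leaves it unchanged.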
   634   // Get return address
   635   __ move(V0, RA);
   636   // set senderSP value
   637   //refer to interpreter_mips.cpp:generate_asm_entry
   638   __ move(Rsender, SP);
   639   __ addi(SP, SP, -extraspace);
   641   // Now write the args into the outgoing interpreter space
   642   for (int i = 0; i < total_args_passed; i++) {
   643     if (sig_bt[i] == T_VOID) {
   644       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   645       continue;
   646     }
   648     // st_off points to lowest address on stack.
   649     int st_off = ((total_args_passed - 1) - i) * Interpreter::stackElementSize;
   650     // Say 4 args:
   651     // i   st_off
   652     // 0   12 T_LONG
   653     // 1    8 T_VOID
   654     // 2    4 T_OBJECT
   655     // 3    0 T_BOOL
   656     VMReg r_1 = regs[i].first();
   657     VMReg r_2 = regs[i].second();
   658     if (!r_1->is_valid()) {
   659       assert(!r_2->is_valid(), "");
   660       continue;
   661     }
   662     if (r_1->is_stack()) {
    663       // memory to memory: copy via the AT scratch register
   664       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
   665       if (!r_2->is_valid()) {
   666         __ ld_ptr(AT, SP, ld_off);
   667         __ st_ptr(AT, SP, st_off);
   669       } else {
   672         int next_off = st_off - Interpreter::stackElementSize;
   673         __ ld_ptr(AT, SP, ld_off);
   674         __ st_ptr(AT, SP, st_off);
   676         // Ref to is_Register condition
   677         if(sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE)
   678           __ st_ptr(AT, SP, st_off - 8);
   679       }
   680     } else if (r_1->is_Register()) {
   681       Register r = r_1->as_Register();
   682       if (!r_2->is_valid()) {
   683           __ sd(r, SP, st_off);
   684       } else {
   685         //FIXME, mips will not enter here
   686         // long/double in gpr
   687         __ sd(r, SP, st_off);
   688         // In [java/util/zip/ZipFile.java]
   689         //
   690         //    private static native long open(String name, int mode, long lastModified);
   691         //    private static native int getTotal(long jzfile);
   692         //
    693         // We need to transfer T_LONG parameters from a compiled method to a native method.
   694         // It's a complex process:
   695         //
   696         // Caller -> lir_static_call -> gen_resolve_stub
   697         //      -> -- resolve_static_call_C
   698         //         `- gen_c2i_adapter()  [*]
   699         //             |
    700         //       `- AdapterHandlerLibrary::get_create_adapter_index
   701         //      -> generate_native_entry
   702         //      -> InterpreterRuntime::SignatureHandlerGenerator::pass_long [**]
   703         //
   704         // In [**], T_Long parameter is stored in stack as:
   705         //
   706         //   (high)
   707         //    |         |
   708         //    -----------
   709         //    | 8 bytes |
   710         //    | (void)  |
   711         //    -----------
   712         //    | 8 bytes |
   713         //    | (long)  |
   714         //    -----------
   715         //    |         |
   716         //   (low)
   717         //
   718         // However, the sequence is reversed here:
   719         //
   720         //   (high)
   721         //    |         |
   722         //    -----------
   723         //    | 8 bytes |
   724         //    | (long)  |
   725         //    -----------
   726         //    | 8 bytes |
   727         //    | (void)  |
   728         //    -----------
   729         //    |         |
   730         //   (low)
   731         //
   732         // So I stored another 8 bytes in the T_VOID slot. It then can be accessed from generate_native_entry().
   733         //
   734         if (sig_bt[i] == T_LONG)
   735           __ sd(r, SP, st_off - 8);
   736       }
   737     } else if (r_1->is_FloatRegister()) {
   738       assert(sig_bt[i] == T_FLOAT || sig_bt[i] == T_DOUBLE, "Must be a float register");
   740       FloatRegister fr = r_1->as_FloatRegister();
   741       if (sig_bt[i] == T_FLOAT)
   742         __ swc1(fr, SP, st_off);
   743       else {
   744         __ sdc1(fr, SP, st_off);
   745         __ sdc1(fr, SP, st_off - 8);  // T_DOUBLE needs two slots
   746       }
   747     }
   748   }
   750   // Schedule the branch target address early.
   751   __ ld_ptr(AT, Rmethod, in_bytes(Method::interpreter_entry_offset()) );
   752   // And repush original return address
   753   __ move(RA, V0);
   754   __ jr (AT);
   755   __ delayed()->nop();
   756 }
   758 void AdapterGenerator::gen_i2c_adapter(
   759                                        int total_args_passed,
   760                                        // VMReg max_arg,
   761                                        int comp_args_on_stack, // VMRegStackSlots
   762                                        const BasicType *sig_bt,
   763                                        const VMRegPair *regs) {
   765   // Generate an I2C adapter: adjust the I-frame to make space for the C-frame
   766   // layout.  Lesp was saved by the calling I-frame and will be restored on
   767   // return.  Meanwhile, outgoing arg space is all owned by the callee
   768   // C-frame, so we can mangle it at will.  After adjusting the frame size,
   769   // hoist register arguments and repack other args according to the compiled
   770   // code convention.  Finally, end in a jump to the compiled code.  The entry
   771   // point address is the start of the buffer.
   773   // We will only enter here from an interpreted frame and never from after
   774   // passing thru a c2i. Azul allowed this but we do not. If we lose the
   775   // race and use a c2i we will remain interpreted for the race loser(s).
   776   // This removes all sorts of headaches on the mips side and also eliminates
   777   // the possibility of having c2i -> i2c -> c2i -> ... endless transitions.
   780   __ move(T9, SP);
    782   // Cut-out for having no stack args.  Since up to nine int/oop args are passed
    783   // in registers, we will occasionally have no stack args.
   784   int comp_words_on_stack = 0;
   785   if (comp_args_on_stack) {
   786     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
   787     // registers are below.  By subtracting stack0, we either get a negative
   788     // number (all values in registers) or the maximum stack slot accessed.
   789     // int comp_args_on_stack = VMRegImpl::reg2stack(max_arg);
   790     // Convert 4-byte stack slots to words.
   791     // did mips need round? FIXME  aoqi
   792     comp_words_on_stack = round_to(comp_args_on_stack*4, wordSize)>>LogBytesPerWord;
    793     // Round up to minimum stack alignment, in wordSize
   794     comp_words_on_stack = round_to(comp_words_on_stack, 2);
   795     __ daddi(SP, SP, -comp_words_on_stack * wordSize);
   796   }
   798   // Align the outgoing SP
   799   __ move(AT, -(StackAlignmentInBytes));
   800   __ andr(SP, SP, AT);
   801   // push the return address on the stack (note that pushing, rather
   802   // than storing it, yields the correct frame alignment for the callee)
   803   // Put saved SP in another register
   804   const Register saved_sp = V0;
   805   __ move(saved_sp, T9);
   808   // Will jump to the compiled code just as if compiled code was doing it.
   809   // Pre-load the register-jump target early, to schedule it better.
   810   __ ld(T9, Rmethod, in_bytes(Method::from_compiled_offset()));
    812   // Now generate the shuffle code.  Pick up all register args and move the
    813   // rest through the AT scratch register.
   814   for (int i = 0; i < total_args_passed; i++) {
   815     if (sig_bt[i] == T_VOID) {
   816       // Longs and doubles are passed in native word order, but misaligned
   817       // in the 32-bit build.
   818       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   819       continue;
   820     }
   822     // Pick up 0, 1 or 2 words from SP+offset.
   824     //FIXME. aoqi. just delete the assert
   825     //assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), "scrambled load targets?");
   826     // Load in argument order going down.
   827     int ld_off = (total_args_passed -1 - i)*Interpreter::stackElementSize;
   828     // Point to interpreter value (vs. tag)
   829     int next_off = ld_off - Interpreter::stackElementSize;
   830     VMReg r_1 = regs[i].first();
   831     VMReg r_2 = regs[i].second();
   832     if (!r_1->is_valid()) {
   833       assert(!r_2->is_valid(), "");
   834       continue;
   835     }
   836     if (r_1->is_stack()) {
   837       // Convert stack slot to an SP offset (+ wordSize to
   838       // account for return address )
   839       // NOTICE HERE!!!! I sub a wordSize here
   840       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size;
   841       //+ wordSize;
   843       if (!r_2->is_valid()) {
   844         __ ld(AT, saved_sp, ld_off);
   845         __ sd(AT, SP, st_off);
   846       } else {
   847         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
   848         // are accessed as negative so LSW is at LOW address
   850         // ld_off is MSW so get LSW
   851         // st_off is LSW (i.e. reg.first())
   853         // [./org/eclipse/swt/graphics/GC.java]
   854         // void drawImageXRender(Image srcImage, int srcX, int srcY, int srcWidth, int srcHeight,
   855         //  int destX, int destY, int destWidth, int destHeight,
   856         //  boolean simple,
   857         //  int imgWidth, int imgHeight,
   858         //  long maskPixmap,  <-- Pass T_LONG in stack
   859         //  int maskType);
   860         // Before this modification, Eclipse displays icons with solid black background.
   861         //
   862         __ ld(AT, saved_sp, ld_off);
   863         if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE)
   864           __ ld(AT, saved_sp, ld_off - 8);
   865         __ sd(AT, SP, st_off);
   866       }
   867     } else if (r_1->is_Register()) {  // Register argument
   868       Register r = r_1->as_Register();
   869       if (r_2->is_valid()) {
   870         // Remember r_1 is low address (and LSB on mips)
   871         // So r_2 gets loaded from high address regardless of the platform
   872         assert(r_2->as_Register() == r_1->as_Register(), "");
   873         __ ld(r, saved_sp, ld_off);
   875         //
   876         // For T_LONG type, the real layout is as below:
   877         //
   878         //   (high)
   879         //    |         |
   880         //    -----------
   881         //    | 8 bytes |
   882         //    | (void)  |
   883         //    -----------
   884         //    | 8 bytes |
   885         //    | (long)  |
   886         //    -----------
   887         //    |         |
   888         //   (low)
   889         //
   890         // We should load the low-8 bytes.
   891         //
   892         if (sig_bt[i] == T_LONG)
   893           __ ld(r, saved_sp, ld_off - 8);
   894       } else {
   895         __ lw(r, saved_sp, ld_off);
   896       }
   897     } else if (r_1->is_FloatRegister()) { // Float Register
   898       assert(sig_bt[i] == T_FLOAT || sig_bt[i] == T_DOUBLE, "Must be a float register");
   900       FloatRegister fr = r_1->as_FloatRegister();
   901       if (sig_bt[i] == T_FLOAT)
   902           __ lwc1(fr, saved_sp, ld_off);
   903       else {
   904           __ ldc1(fr, saved_sp, ld_off);
   905           __ ldc1(fr, saved_sp, ld_off - 8);
   906       }
   907     }
   908   }
   910   // 6243940 We might end up in handle_wrong_method if
   911   // the callee is deoptimized as we race thru here. If that
   912   // happens we don't want to take a safepoint because the
   913   // caller frame will look interpreted and arguments are now
   914   // "compiled" so it is much better to make this transition
   915   // invisible to the stack walking code. Unfortunately if
   916   // we try and find the callee by normal means a safepoint
   917   // is possible. So we stash the desired callee in the thread
   918   // and the vm will find there should this case occur.
   919   __ get_thread(T8);
   920   __ sd(Rmethod, T8, in_bytes(JavaThread::callee_target_offset()));
    922   // move methodOop to V0 in case we end up in a c2i adapter.
   923   // the c2i adapters expect methodOop in V0 (c2) because c2's
   924   // resolve stubs return the result (the method) in V0.
   925   // I'd love to fix this.
   926   __ move(V0, Rmethod);
   927   __ jr(T9);
   928   __ delayed()->nop();
   929 }
   931 // ---------------------------------------------------------------
   932 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
   933                                                             int total_args_passed,
   934                                                             // VMReg max_arg,
   935                                                             int comp_args_on_stack, // VMRegStackSlots
   936                                                             const BasicType *sig_bt,
   937                                                             const VMRegPair *regs,
   938                                                             AdapterFingerPrint* fingerprint) {
   939   address i2c_entry = __ pc();
   941   AdapterGenerator agen(masm);
   943   agen.gen_i2c_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs);
   946   // -------------------------------------------------------------------------
    947   // Generate a C2I adapter.  On entry we know Rmethod holds the methodOop.  The
   948   // args start out packed in the compiled layout.  They need to be unpacked
   949   // into the interpreter layout.  This will almost always require some stack
   950   // space.  We grow the current (compiled) stack, then repack the args.  We
   951   // finally end in a jump to the generic interpreter entry point.  On exit
   952   // from the interpreter, the interpreter will restore our SP (lest the
   953   // compiled code, which relys solely on SP and not FP, get sick).
   955   address c2i_unverified_entry = __ pc();
   956   Label skip_fixup;
   957   {
   958     Register holder = T1;
   959     Register receiver = T0;
   960     Register temp = T8;
   961     address ic_miss = SharedRuntime::get_ic_miss_stub();
   963     Label missed;
   965     __ verify_oop(holder);
   966     //add for compressedoops
   967     __ load_klass(temp, receiver);
   968     __ verify_oop(temp);
   970     __ ld_ptr(AT, holder, CompiledICHolder::holder_klass_offset());
   971     __ ld_ptr(Rmethod, holder, CompiledICHolder::holder_metadata_offset());
   972     __ bne(AT, temp, missed);
   973     __ delayed()->nop();
    974     // Method might have been compiled since the call site was patched to
    975     // interpreted; if that is the case, treat it as a miss so we can get
    976     // the call site corrected.
   977     __ ld_ptr(AT, Rmethod, in_bytes(Method::code_offset()));
   978     __ beq(AT, R0, skip_fixup);
   979     __ delayed()->nop();
   980     __ bind(missed);
   982     __ jmp(ic_miss, relocInfo::runtime_call_type);
   983     __ delayed()->nop();
   984   }
   986   address c2i_entry = __ pc();
   988   agen.gen_c2i_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
   990   __ flush();
   991   return  AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
   992 }
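        // The three entry points bundled into the returned AdapterHandlerEntry are
        // used as follows: i2c_entry when interpreted code calls a now-compiled
        // method, c2i_unverified_entry from compiled inline-cache call sites (it
        // performs the receiver klass check above before falling into the C2I code),
        // and c2i_entry once the call site has been verified or is a static call.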
   994 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
   995                                          VMRegPair *regs,
   996                                          VMRegPair *regs2,
   997                                          int total_args_passed) {
   998   assert(regs2 == NULL, "not needed on MIPS");
   999   // Return the number of VMReg stack_slots needed for the args.
  1000   // This value does not include an abi space (like register window
  1001   // save area).
   1003   // This describes the LP64 native calling convention used by this 64-bit port.
   1006   // We return the amount of VMRegImpl stack slots we need to reserve for all
   1007   // the arguments NOT counting out_preserve_stack_slots.
  1012   static const Register INT_ArgReg[Argument::n_register_parameters] = {
  1013     A0, A1, A2, A3, A4, A5, A6, A7
  1014   };
  1015   static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters] = {
  1016     F12, F13, F14, F15, F16, F17, F18, F19
  1017   };
  1018   uint args = 0;
  1019   uint stk_args = 0; // inc by 2 each time
  1021 // Example:
  1022 //    n   java.lang.UNIXProcess::forkAndExec
  1023 //     private native int forkAndExec(byte[] prog,
  1024 //                                    byte[] argBlock, int argc,
  1025 //                                    byte[] envBlock, int envc,
  1026 //                                    byte[] dir,
  1027 //                                    boolean redirectErrorStream,
  1028 //                                    FileDescriptor stdin_fd,
  1029 //                                    FileDescriptor stdout_fd,
  1030 //                                    FileDescriptor stderr_fd)
  1031 // JNIEXPORT jint JNICALL
  1032 // Java_java_lang_UNIXProcess_forkAndExec(JNIEnv *env,
  1033 //                                        jobject process,
  1034 //                                        jbyteArray prog,
  1035 //                                        jbyteArray argBlock, jint argc,
  1036 //                                        jbyteArray envBlock, jint envc,
  1037 //                                        jbyteArray dir,
  1038 //                                        jboolean redirectErrorStream,
  1039 //                                        jobject stdin_fd,
  1040 //                                        jobject stdout_fd,
  1041 //                                        jobject stderr_fd)
  1042 //
  1043 // ::c_calling_convention
  1044 // 0:     // env    <-- a0
  1045 // 1: L    // klass/obj  <-- t0 => a1
  1046 // 2: [    // prog[]  <-- a0 => a2
  1047 // 3: [    // argBlock[]  <-- a1 => a3
  1048 // 4: I    // argc
  1049 // 5: [    // envBlock[]  <-- a3 => a5
  1050 // 6: I    // envc
  1051 // 7: [    // dir[]  <-- a5 => a7
  1052 // 8: Z    // redirectErrorStream  a6 => sp[0]
  1053 // 9: L    // stdin    a7 => sp[8]
  1054 // 10: L    // stdout    fp[16] => sp[16]
  1055 // 11: L    // stderr    fp[24] => sp[24]
  1056 //
  1057   for (int i = 0; i < total_args_passed; i++) {
  1058     switch (sig_bt[i]) {
  1059     case T_VOID: // Halves of longs and doubles
  1060       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
  1061       regs[i].set_bad();
  1062       break;
  1063     case T_BOOLEAN:
  1064     case T_CHAR:
  1065     case T_BYTE:
  1066     case T_SHORT:
  1067     case T_INT:
  1068       if (args < Argument::n_register_parameters) {
  1069         regs[i].set1(INT_ArgReg[args++]->as_VMReg());
  1070       } else {
  1071         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   1072         stk_args += 2;
   1073       }
   1074       break;
  1075     case T_LONG:
  1076       assert(sig_bt[i + 1] == T_VOID, "expecting half");
  1077       // fall through
  1078     case T_OBJECT:
  1079     case T_ARRAY:
  1080     case T_ADDRESS:
  1081     case T_METADATA:
  1082       if (args < Argument::n_register_parameters) {
  1083         regs[i].set2(INT_ArgReg[args++]->as_VMReg());
  1084       } else {
  1085         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   1086         stk_args += 2;
   1087       }
   1088       break;
  1089     case T_FLOAT:
  1090       if (args < Argument::n_float_register_parameters) {
  1091         regs[i].set1(FP_ArgReg[args++]->as_VMReg());
  1092       } else {
  1093         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   1094         stk_args += 2;
   1095       }
   1096       break;
  1097     case T_DOUBLE:
  1098       assert(sig_bt[i + 1] == T_VOID, "expecting half");
  1099       if (args < Argument::n_float_register_parameters) {
  1100         regs[i].set2(FP_ArgReg[args++]->as_VMReg());
  1101       } else {
  1102         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   1103         stk_args += 2;
   1104       }
   1105       break;
  1106     default:
  1107       ShouldNotReachHere();
   1108       break;
   1109     }
   1110   }
   1112   return round_to(stk_args, 2);
   1113 }
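        // Compared with java_calling_convention() above: the native convention starts
        // its integer arguments at A0 (there is no T0 slot), additionally maps
        // T_METADATA, and, like the Java convention, shares a single positional
        // counter between the integer and floating-point argument registers.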
  1115 // ---------------------------------------------------------------------------
  1116 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1117   // We always ignore the frame_slots arg and just use the space just below frame pointer
  1118   // which by this time is free to use
  1119   switch (ret_type) {
  1120     case T_FLOAT:
  1121       __ swc1(FSF, FP, -wordSize);
  1122       break;
  1123     case T_DOUBLE:
  1124       __ sdc1(FSF, FP, -wordSize );
  1125       break;
  1126     case T_VOID:  break;
  1127     case T_LONG:
  1128       __ sd(V0, FP, -wordSize);
  1129       break;
  1130     case T_OBJECT:
  1131     case T_ARRAY:
  1132       __ sd(V0, FP, -wordSize);
  1133       break;
   1134     default: {
   1135       __ sw(V0, FP, -wordSize);
   1136     }
   1137   }
   1138 }
  1140 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1141   // We always ignore the frame_slots arg and just use the space just below frame pointer
  1142   // which by this time is free to use
  1143   switch (ret_type) {
  1144     case T_FLOAT:
  1145       __ lwc1(FSF, FP, -wordSize);
  1146       break;
  1147     case T_DOUBLE:
  1148       __ ldc1(FSF, FP, -wordSize );
  1149       break;
  1150     case T_LONG:
  1151       __ ld(V0, FP, -wordSize);
  1152       break;
  1153     case T_VOID:  break;
  1154     case T_OBJECT:
  1155     case T_ARRAY:
  1156       __ ld(V0, FP, -wordSize);
  1157       break;
   1158     default: {
   1159       __ lw(V0, FP, -wordSize);
   1160     }
   1161   }
   1162 }
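        // save_native_result()/restore_native_result() are intended to be used as a
        // pair (typically by the native wrapper): the return value is spilled to the
        // word just below FP around calls back into the VM, which may clobber the
        // result registers, and reloaded afterwards.  FSF is the float/double result
        // register, V0 the integer/oop result register.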
  1164 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1165   for ( int i = first_arg ; i < arg_count ; i++ ) {
  1166     if (args[i].first()->is_Register()) {
  1167       __ push(args[i].first()->as_Register());
  1168     } else if (args[i].first()->is_FloatRegister()) {
   1169       __ push(args[i].first()->as_FloatRegister());
   1170     }
   1171   }
   1172 }
  1174 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1175   for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
  1176     if (args[i].first()->is_Register()) {
  1177       __ pop(args[i].first()->as_Register());
  1178     } else if (args[i].first()->is_FloatRegister()) {
   1179       __ pop(args[i].first()->as_FloatRegister());
   1180     }
   1181   }
   1182 }
  1184 // A simple move of integer like type
  1185 static void simple_move32(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1186   if (src.first()->is_stack()) {
  1187     if (dst.first()->is_stack()) {
  1188       // stack to stack
  1189       __ lw(AT, FP, reg2offset_in(src.first()));
  1190       __ sd(AT, SP, reg2offset_out(dst.first()));
  1191     } else {
  1192       // stack to reg
   1193       __ lw(dst.first()->as_Register(),  FP, reg2offset_in(src.first()));
   1194     }
  1195   } else if (dst.first()->is_stack()) {
  1196     // reg to stack
  1197     __ sd(src.first()->as_Register(), SP, reg2offset_out(dst.first()));
  1198   } else {
  1199     if (dst.first() != src.first()){
   1200       __ move(dst.first()->as_Register(), src.first()->as_Register()); // fujie error:dst.first()
   1201     }
   1202   }
   1203 }
  1205 // An oop arg. Must pass a handle not the oop itself
  1206 static void object_move(MacroAssembler* masm,
  1207                         OopMap* map,
  1208                         int oop_handle_offset,
  1209                         int framesize_in_slots,
  1210                         VMRegPair src,
  1211                         VMRegPair dst,
  1212                         bool is_receiver,
  1213                         int* receiver_offset) {
  1215   // must pass a handle. First figure out the location we use as a handle
  1217   //FIXME, for mips, dst can be register
  1218   if (src.first()->is_stack()) {
  1219     // Oop is already on the stack as an argument
  1220     Register rHandle = V0;
  1221     Label nil;
  1222     __ xorr(rHandle, rHandle, rHandle);
  1223     __ ld(AT, FP, reg2offset_in(src.first()));
  1224     __ beq(AT, R0, nil);
  1225     __ delayed()->nop();
  1226     __ lea(rHandle, Address(FP, reg2offset_in(src.first())));
  1227     __ bind(nil);
  1228     if(dst.first()->is_stack())__ sd( rHandle, SP, reg2offset_out(dst.first()));
  1229     else                       __ move( (dst.first())->as_Register(), rHandle);
  1230     //if dst is register
  1231     //FIXME, do mips need out preserve stack slots?
  1232     int offset_in_older_frame = src.first()->reg2stack()
  1233       + SharedRuntime::out_preserve_stack_slots();
  1234     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
  1235     if (is_receiver) {
  1236       *receiver_offset = (offset_in_older_frame
   1237           + framesize_in_slots) * VMRegImpl::stack_slot_size;
   1238     }
  1239   } else {
   1240     // Oop is in a register; we must store it to the space we reserve
  1241     // on the stack for oop_handles
  1242     const Register rOop = src.first()->as_Register();
  1243     assert( (rOop->encoding() >= A0->encoding()) && (rOop->encoding() <= T0->encoding()),"wrong register");
  1244     const Register rHandle = V0;
   1245     //Important: refer to java_calling_convention
  1246     int oop_slot = (rOop->encoding() - A0->encoding()) * VMRegImpl::slots_per_word + oop_handle_offset;
  1247     int offset = oop_slot*VMRegImpl::stack_slot_size;
  1248     Label skip;
  1249     __ sd( rOop , SP, offset );
  1250     map->set_oop(VMRegImpl::stack2reg(oop_slot));
  1251     __ xorr( rHandle, rHandle, rHandle);
  1252     __ beq(rOop, R0, skip);
  1253     __ delayed()->nop();
  1254     __ lea(rHandle, Address(SP, offset));
  1255     __ bind(skip);
  1256     // Store the handle parameter
  1257     if(dst.first()->is_stack())__ sd( rHandle, SP, reg2offset_out(dst.first()));
  1258     else                       __ move((dst.first())->as_Register(), rHandle);
  1259     //if dst is register
  1261     if (is_receiver) {
   1262       *receiver_offset = offset;
   1263     }
   1264   }
   1265 }
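        // In both paths above the callee receives a JNI-handle-style indirection:
        // rHandle is either NULL (when the oop itself is NULL) or the address of a
        // stack slot holding the oop, and that slot is recorded in the OopMap so a
        // GC at a safepoint can relocate the oop it contains.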
  1267 // A float arg may have to do float reg int reg conversion
  1268 static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1269   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
  1271   if (src.first()->is_stack()) {
  1272     if (dst.first()->is_stack()) {
  1273       __ lwc1(F12, FP, reg2offset_in(src.first()));
  1274       __ swc1(F12, SP, reg2offset_out(dst.first()));
  1276     else
  1277       __ lwc1(dst.first()->as_FloatRegister(), FP, reg2offset_in(src.first()));
  1278   } else {
  1279     // reg to stack
  1280     if(dst.first()->is_stack())
  1281       __ swc1(src.first()->as_FloatRegister(), SP, reg2offset_out(dst.first()));
  1282     else
  1283       __ mov_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
  1287 // A long move
  1288 static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1290   // The only legal possibility for a long_move VMRegPair is:
  1291   // 1: two stack slots (possibly unaligned)
  1292   // as neither the Java nor the C calling convention will use registers
  1293   // for longs.
  1295   if (src.first()->is_stack()) {
  1296     assert(src.second()->is_stack() && dst.second()->is_stack(), "must be all stack");
  1297     if( dst.first()->is_stack()){
  1298       __ ld(AT, FP, reg2offset_in(src.first()));
  1299       __ sd(AT, SP, reg2offset_out(dst.first()));
  1300     } else {
  1301       __ ld( (dst.first())->as_Register() , FP, reg2offset_in(src.first()));
  1303   } else {
  1304     if( dst.first()->is_stack()){
  1305       __ sd( (src.first())->as_Register(), SP, reg2offset_out(dst.first()));
  1306     } else {
  1307       __ move( (dst.first())->as_Register() , (src.first())->as_Register());
  1312 // A double move
  1313 static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1315   // The only legal possibilities for a double_move VMRegPair are the same painful
  1316   // ones as for long_move: a single FPU register or a pair of stack slots.
  1318   // Because of the calling convention we know that src is either
  1319   //   1: a single physical register (an FPU register), or
  1320   //   2: two stack slots (possibly unaligned);
  1321   // dst can be an FPU register or a pair of stack slots.
  1324   if (src.first()->is_stack()) {
  1325     // source is all stack
  1326     if( dst.first()->is_stack()){
  1327       __ ldc1(F12, FP, reg2offset_in(src.first()));
  1329       __ sdc1(F12, SP, reg2offset_out(dst.first()));
  1330     } else {
  1331       __ ldc1( (dst.first())->as_FloatRegister(), FP, reg2offset_in(src.first()));
  1334   } else {
  1335     // reg to stack
  1336     // No worries about stack alignment
  1337     if( dst.first()->is_stack()){
  1338       __ sdc1(src.first()->as_FloatRegister(), SP, reg2offset_out(dst.first()));
  1340     else
  1341       __ mov_d( dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
  1346 static void verify_oop_args(MacroAssembler* masm,
  1347                             methodHandle method,
  1348                             const BasicType* sig_bt,
  1349                             const VMRegPair* regs) {
  1350   Register temp_reg = T9;  // not part of any compiled calling seq
  1351   if (VerifyOops) {
  1352     for (int i = 0; i < method->size_of_parameters(); i++) {
  1353       if (sig_bt[i] == T_OBJECT ||
  1354           sig_bt[i] == T_ARRAY) {
  1355         VMReg r = regs[i].first();
  1356         assert(r->is_valid(), "bad oop arg");
  1357         if (r->is_stack()) {
  1358           __ ld(temp_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
  1359           __ verify_oop(temp_reg);
  1360         } else {
  1361           __ verify_oop(r->as_Register());
  1368 static void gen_special_dispatch(MacroAssembler* masm,
  1369                                  methodHandle method,
  1370                                  const BasicType* sig_bt,
  1371                                  const VMRegPair* regs) {
  1372   verify_oop_args(masm, method, sig_bt, regs);
  1373   vmIntrinsics::ID iid = method->intrinsic_id();
  1375   // Now write the args into the outgoing interpreter space
  1376   bool     has_receiver   = false;
  1377   Register receiver_reg   = noreg;
  1378   int      member_arg_pos = -1;
  1379   Register member_reg     = noreg;
  1380   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  1381   if (ref_kind != 0) {
  1382     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
  1383     member_reg = S3;  // known to be free at this point
  1384     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  1385   } else if (iid == vmIntrinsics::_invokeBasic) {
  1386     has_receiver = true;
  1387   } else {
  1388     fatal(err_msg_res("unexpected intrinsic id %d", iid));
  1391   if (member_reg != noreg) {
  1392     // Load the member_arg into register, if necessary.
  1393     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
  1394     VMReg r = regs[member_arg_pos].first();
  1395     if (r->is_stack()) {
  1396       __ ld(member_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
  1397     } else {
  1398       // no data motion is needed
  1399       member_reg = r->as_Register();
  1403   if (has_receiver) {
  1404     // Make sure the receiver is loaded into a register.
  1405     assert(method->size_of_parameters() > 0, "oob");
  1406     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
  1407     VMReg r = regs[0].first();
  1408     assert(r->is_valid(), "bad receiver arg");
  1409     if (r->is_stack()) {
  1410       // Porting note:  This assumes that compiled calling conventions always
  1411       // pass the receiver oop in a register.  If this is not true on some
  1412       // platform, pick a temp and load the receiver from stack.
  1413       fatal("receiver always in a register");
  1414       receiver_reg = SSR;  // known to be free at this point
  1415       __ ld(receiver_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
  1416     } else {
  1417       // no data motion is needed
  1418       receiver_reg = r->as_Register();
  1422   // Figure out which address we are really jumping to:
  1423   MethodHandles::generate_method_handle_dispatch(masm, iid,
  1424                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
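         // Sketch of what generate_method_handle_dispatch is expected to emit here: for the
         // linkTo* intrinsics the jump target comes from the trailing MemberName in member_reg,
         // while _invokeBasic jumps through the receiver MethodHandle; the exact sequence
         // lives in the platform MethodHandles code.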
  1427 // ---------------------------------------------------------------------------
  1428 // Generate a native wrapper for a given method.  The method takes arguments
  1429 // in the Java compiled code convention, marshals them to the native
  1430 // convention (handlizes oops, etc), transitions to native, makes the call,
  1431 // returns to java state (possibly blocking), unhandlizes any result and
  1432 // returns.
  1433 nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
  1434                                                 methodHandle method,
  1435                                                 int compile_id,
  1436                                                 BasicType* in_sig_bt,
  1437                                                 VMRegPair* in_regs,
  1438                                                 BasicType ret_type) {
  1439   if (method->is_method_handle_intrinsic()) {
  1440     vmIntrinsics::ID iid = method->intrinsic_id();
  1441     intptr_t start = (intptr_t)__ pc();
  1442     int vep_offset = ((intptr_t)__ pc()) - start;
  1443     gen_special_dispatch(masm,
  1444                          method,
  1445                          in_sig_bt,
  1446                          in_regs);
  1447     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
  1448     __ flush();
  1449     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
  1450     return nmethod::new_native_nmethod(method,
  1451                                        compile_id,
  1452                                        masm->code(),
  1453                                        vep_offset,
  1454                                        frame_complete,
  1455                                        stack_slots / VMRegImpl::slots_per_word,
  1456                                        in_ByteSize(-1),
  1457                                        in_ByteSize(-1),
  1458                                        (OopMapSet*)NULL);
  1460   bool is_critical_native = true;
  1461   address native_func = method->critical_native_function();
  1462   if (native_func == NULL) {
  1463     native_func = method->native_function();
  1464     is_critical_native = false;
  1466   assert(native_func != NULL, "must have function");
  1468   // Native nmethod wrappers never take possession of the oop arguments.
  1469   // So the caller will gc the arguments. The only thing we need an
  1470   // oopMap for is if the call is static
  1471   //
  1472   // An OopMap for lock (and class if static), and one for the VM call itself
  1473   OopMapSet *oop_maps = new OopMapSet();
  1475   // We have received a description of where all the java args are located
  1476   // on entry to the wrapper. We need to convert these args to where
  1477   // the jni function will expect them. To figure out where they go
  1478   // we convert the java signature to a C signature by inserting
  1479   // the hidden arguments as arg[0] and possibly arg[1] (static method)
  1481   const int total_in_args = method->size_of_parameters();
  1482   int total_c_args = total_in_args;
  1483   if (!is_critical_native) {
  1484     total_c_args += 1;
  1485     if (method->is_static()) {
  1486       total_c_args++;
  1488   } else {
  1489     for (int i = 0; i < total_in_args; i++) {
  1490       if (in_sig_bt[i] == T_ARRAY) {
  1491         total_c_args++;
  1496   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  1497   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
  1498   BasicType* in_elem_bt = NULL;
  1500   int argc = 0;
  1501   if (!is_critical_native) {
  1502     out_sig_bt[argc++] = T_ADDRESS;
  1503     if (method->is_static()) {
  1504       out_sig_bt[argc++] = T_OBJECT;
  1507     for (int i = 0; i < total_in_args ; i++ ) {
  1508       out_sig_bt[argc++] = in_sig_bt[i];
  1510   } else {
  1511     Thread* THREAD = Thread::current();
  1512     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
  1513     SignatureStream ss(method->signature());
  1514     for (int i = 0; i < total_in_args ; i++ ) {
  1515       if (in_sig_bt[i] == T_ARRAY) {
  1516         // Arrays are passed as int, elem* pair
  1517         out_sig_bt[argc++] = T_INT;
  1518         out_sig_bt[argc++] = T_ADDRESS;
  1519         Symbol* atype = ss.as_symbol(CHECK_NULL);
  1520         const char* at = atype->as_C_string();
  1521         if (strlen(at) == 2) {
  1522           assert(at[0] == '[', "must be");
  1523           switch (at[1]) {
  1524             case 'B': in_elem_bt[i]  = T_BYTE; break;
  1525             case 'C': in_elem_bt[i]  = T_CHAR; break;
  1526             case 'D': in_elem_bt[i]  = T_DOUBLE; break;
  1527             case 'F': in_elem_bt[i]  = T_FLOAT; break;
  1528             case 'I': in_elem_bt[i]  = T_INT; break;
  1529             case 'J': in_elem_bt[i]  = T_LONG; break;
  1530             case 'S': in_elem_bt[i]  = T_SHORT; break;
  1531             case 'Z': in_elem_bt[i]  = T_BOOLEAN; break;
  1532             default: ShouldNotReachHere();
  1535       } else {
  1536         out_sig_bt[argc++] = in_sig_bt[i];
  1537         in_elem_bt[i] = T_VOID;
  1539       if (in_sig_bt[i] != T_VOID) {
  1540         assert(in_sig_bt[i] == ss.type(), "must match");
  1541         ss.next();
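             // Illustrative example only: a critical native declared as
             //   static native int sum(int[] a);
             // is called from here as  jint sum(jint a_length, jint* a_elems),
             // hence the T_INT / T_ADDRESS pair recorded above for every T_ARRAY argument.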
  1546   // Now figure out where the args must be stored and how much stack space
  1547   // they require (neglecting out_preserve_stack_slots but including space for storing
  1548   // the 1st six register arguments). It's weird; see int_stk_helper.
  1549   //
  1550   int out_arg_slots;
  1551   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  1553   // Compute framesize for the wrapper.  We need to handlize all oops in
  1554   // registers. We must create space for them here that is disjoint from
  1555   // the windowed save area because we have no control over when we might
  1556   // flush the window again and overwrite values that gc has since modified.
  1557   // (The live window race)
  1558   //
  1559   // We always just allocate 6 words for storing down these objects. This allows
  1560   // us to simply record the base and use the Ireg number to decide which
  1561   // slot to use. (Note that the reg number is the inbound number not the
  1562   // outbound number).
  1563   // We must shuffle args to match the native convention, and include var-args space.
  1565   // Calculate the total number of stack slots we will need.
  1567   // First count the abi requirement plus all of the outgoing args
  1568   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  1570   // Now the space for the inbound oop handle area
  1571   int total_save_slots = 9 * VMRegImpl::slots_per_word;  // 9 arguments passed in registers
  1572   if (is_critical_native) {
  1573     // Critical natives may have to call out so they need a save area
  1574     // for register arguments.
  1575     int double_slots = 0;
  1576     int single_slots = 0;
  1577     for ( int i = 0; i < total_in_args; i++) {
  1578       if (in_regs[i].first()->is_Register()) {
  1579         const Register reg = in_regs[i].first()->as_Register();
  1580         switch (in_sig_bt[i]) {
  1581           case T_BOOLEAN:
  1582           case T_BYTE:
  1583           case T_SHORT:
  1584           case T_CHAR:
  1585           case T_INT:  single_slots++; break;
  1586           case T_ARRAY:  // specific to LP64 (7145024)
  1587           case T_LONG: double_slots++; break;
  1588           default:  ShouldNotReachHere();
  1590       } else if (in_regs[i].first()->is_FloatRegister()) {
  1591         switch (in_sig_bt[i]) {
  1592           case T_FLOAT:  single_slots++; break;
  1593           case T_DOUBLE: double_slots++; break;
  1594           default:  ShouldNotReachHere();
  1598     total_save_slots = double_slots * 2 + single_slots;
  1599     // align the save area
  1600     if (double_slots != 0) {
  1601       stack_slots = round_to(stack_slots, 2);
  1605   int oop_handle_offset = stack_slots;
  1606   stack_slots += total_save_slots;
  1608   // Now any space we need for handlizing a klass if static method
  1610   int klass_slot_offset = 0;
  1611   int klass_offset = -1;
  1612   int lock_slot_offset = 0;
  1613   bool is_static = false;
  1615   if (method->is_static()) {
  1616     klass_slot_offset = stack_slots;
  1617     stack_slots += VMRegImpl::slots_per_word;
  1618     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
  1619     is_static = true;
  1622   // Plus a lock if needed
  1624   if (method->is_synchronized()) {
  1625     lock_slot_offset = stack_slots;
  1626     stack_slots += VMRegImpl::slots_per_word;
  1629   // Now a place to save return value or as a temporary for any gpr -> fpr moves
  1630   // + 2 for return address (which we own) and saved fp
  1631   stack_slots += 2 + 9 * VMRegImpl::slots_per_word;  // (T0, A0, A1, A2, A3, A4, A5, A6, A7)
  1633   // Ok The space we have allocated will look like:
  1634   //
  1635   //
  1636   // FP-> |                     |
  1637   //      |---------------------|
  1638   //      | 2 slots for moves   |
  1639   //      |---------------------|
  1640   //      | lock box (if sync)  |
  1641   //      |---------------------| <- lock_slot_offset
  1642   //      | klass (if static)   |
  1643   //      |---------------------| <- klass_slot_offset
  1644   //      | oopHandle area      |
  1645   //      |---------------------| <- oop_handle_offset
  1646   //      | outbound memory     |
  1647   //      | based arguments     |
  1648   //      |                     |
  1649   //      |---------------------|
  1650   //      | vararg area         |
  1651   //      |---------------------|
  1652   //      |                     |
  1653   // SP-> | out_preserved_slots |
  1654   //
  1655   //
  1658   // Now compute actual number of stack words we need rounding to make
  1659   // stack properly aligned.
  1660   stack_slots = round_to(stack_slots, StackAlignmentInSlots);
  1662   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
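         // Worked example (assuming StackAlignmentInBytes == 16, i.e. 4 slots of 4 bytes each):
         // 57 raw slots round up to 60, giving stack_size = 60 * 4 = 240 bytes, so SP keeps
         // its 16-byte alignment once the frame is set up.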
  1664   intptr_t start = (intptr_t)__ pc();
  1668   // First thing make an ic check to see if we should even be here
  1669   address ic_miss = SharedRuntime::get_ic_miss_stub();
  1671   // We are free to use all registers as temps without saving them and
  1672   // restoring them except fp. fp is the only callee save register
  1673   // as far as the interpreter and the compiler(s) are concerned.
  1675   //refer to register_mips.hpp:IC_Klass
  1676   const Register ic_reg = T1;
  1677   const Register receiver = T0;
  1679   Label hit;
  1680   Label exception_pending;
  1682   __ verify_oop(receiver);
  1683   //add for compressedoops
  1684   __ load_klass(T9, receiver);
  1685   __ beq(T9, ic_reg, hit);
  1686   __ delayed()->nop();
  1687   __ jmp(ic_miss, relocInfo::runtime_call_type);
  1688   __ delayed()->nop();
  1689   // The verified entry must be aligned for code patching,
  1690   // and the first 5 bytes must be in the same cache line;
  1691   // if we align at 8 we can be sure the 5 bytes are in the same line.
  1692   __ align(8);
  1694   __ bind(hit);
  1697   int vep_offset = ((intptr_t)__ pc()) - start;
  1698 #ifdef COMPILER1
  1699   if (InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) {
  1700     // Object.hashCode can pull the hashCode from the header word
  1701     // instead of doing a full VM transition once it's been computed.
  1702     // Since hashCode is usually polymorphic at call sites we can't do
  1703     // this optimization at the call site without a lot of work.
  1704     Label slowCase;
  1705     Register receiver = T0;
  1706     Register result = V0;
  1707     __ ld ( result, receiver, oopDesc::mark_offset_in_bytes());
  1708     // check if locked
  1709     __ andi(AT, result, markOopDesc::unlocked_value);
  1710     __ beq(AT, R0, slowCase);
  1711     __ delayed()->nop();
  1712     if (UseBiasedLocking) {
  1713       // Check if biased and fall through to runtime if so
  1714       __ andi (AT, result, markOopDesc::biased_lock_bit_in_place);
  1715       __ bne(AT, R0, slowCase);
  1716       __ delayed()->nop();
  1718     // get hash
  1719     __ li(AT, markOopDesc::hash_mask_in_place);
  1720     __ andr (AT, result, AT);
  1721     // test if hashCode exists
  1722     __ beq (AT, R0, slowCase);
  1723     __ delayed()->nop();
  1724     __ shr(result, markOopDesc::hash_shift);
  1725     __ jr(RA);
  1726     __ delayed()->nop();
  1727     __ bind (slowCase);
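           // Pseudo-C sketch of the fast path above (assuming the usual markOop layout):
           //   mark = obj->mark();
           //   if ((mark & unlocked_value) == 0)                 goto slowCase;  // object is locked
           //   if (UseBiasedLocking && (mark & biased_lock_bit)) goto slowCase;  // bias in place
           //   if ((mark & hash_mask_in_place) == 0)             goto slowCase;  // hash not computed yet
           //   return mark >> hash_shift;                        // identity hash in V0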
  1729 #endif // COMPILER1
  1731   // The instruction at the verified entry point must be 5 bytes or longer
  1732   // because it can be patched on the fly by make_non_entrant. The stack bang
  1733   // instruction fits that requirement.
  1735   // Generate stack overflow check
  1737   if (UseStackBanging) {
  1738     __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
  1739   } else {
  1740     // need a 5 byte instruction to allow MT safe patching to non-entrant
  1741     __ nop();
  1742     __ nop();
  1743     __ nop();
  1744     __ nop();
  1745     __ nop();
  1747   // Generate a new frame for the wrapper.
  1748   // does MIPS need this?
  1749 #ifndef OPT_THREAD
  1750   __ get_thread(TREG);
  1751 #endif
  1752   __ st_ptr(SP, TREG, in_bytes(JavaThread::last_Java_sp_offset()));
  1753   __ move(AT, -(StackAlignmentInBytes));
  1754   __ andr(SP, SP, AT);
  1756   __ enter();
  1757   // -2 because return address is already present and so is saved fp
  1758   __ addiu(SP, SP, -1 * (stack_size - 2*wordSize));
  1760   // Frame is now completed as far a size and linkage.
  1762   int frame_complete = ((intptr_t)__ pc()) - start;
  1764   // Calculate the difference between sp and fp. We need to know it
  1765   // after the native call because on windows Java Natives will pop
  1766   // the arguments and it is painful to do sp relative addressing
  1767   // in a platform independent way. So after the call we switch to
  1768   // fp relative addressing.
  1769   // FIXME: actually, the fp_adjustment may not be right, because andr(SP, SP, AT)
  1770   // may change the SP
  1771   int fp_adjustment = stack_size - 2*wordSize;
  1773 #ifdef COMPILER2
  1774   // C2 may leave the stack dirty if not in SSE2+ mode
  1775   __ empty_FPU_stack();
  1776 #endif
  1778   // Compute the fp offset for any slots used after the jni call
  1780   int lock_slot_fp_offset = (lock_slot_offset*VMRegImpl::stack_slot_size) - fp_adjustment;
  1781   // We use TREG as a thread pointer because it is callee save and
  1782   // if we load it once it is usable through the entire wrapper
  1783   const Register thread = TREG;
  1785   // We use S4 as the oop handle for the receiver/klass
  1786   // It is callee save so it survives the call to native
  1788   const Register oop_handle_reg = S4;
  1789   if (is_critical_native) {
  1790      __ stop("generate_native_wrapper in sharedRuntime <2>");
  1791     //TODO:Fu
  1792     // check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
  1793     //                                   oop_handle_offset, oop_maps, in_regs, in_sig_bt);
  1796 #ifndef OPT_THREAD
  1797   __ get_thread(thread);
  1798 #endif
  1800   //
  1801   // We immediately shuffle the arguments so that any vm call we have to
  1802   // make from here on out (sync slow path, jvmpi, etc.) we will have
  1803   // captured the oops from our caller and have a valid oopMap for
  1804   // them.
  1806   // -----------------
  1807   // The Grand Shuffle
  1808   //
  1809   // Natives require 1 or 2 extra arguments over the normal ones: the JNIEnv*
  1810   // and, if static, the class mirror instead of a receiver.  This pretty much
  1811   // guarantees that register layout will not match (and mips doesn't use reg
  1812   // parms though amd does).  Since the native abi doesn't use register args
  1813   // and the java convention does, we don't have to worry about collisions.
  1814   // All of our moves are reg->stack or stack->stack.
  1815   // We ignore the extra arguments during the shuffle and handle them at the
  1816   // last moment. The shuffle is described by the two calling convention
  1817   // vectors we have in our possession. We simply walk the java vector to
  1818   // get the source locations and the c vector to get the destinations.
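         // Illustrative example (not generated code): for an instance  native int m(Object o, int i)
         // the Java vector describes {this, o, i} while the C vector describes
         // {JNIEnv*, jobject(this), jobject(o), jint(i)}; the loop below only shuffles the Java
         // args into their C slots, and the JNIEnv* slot is filled in just before the call.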
  1820   int c_arg = method->is_static() ? 2 : 1 ;
  1822   // Record sp-based slot for receiver on stack for non-static methods
  1823   int receiver_offset = -1;
  1825   // This is a trick. We double the stack slots so we can claim
  1826   // the oops in the caller's frame. Since we are sure to have
  1827   // more args than the caller doubling is enough to make
  1828   // sure we can capture all the incoming oop args from the
  1829   // caller.
  1830   //
  1831   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
  1833   // Mark location of fp (someday)
  1834   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(fp));
  1836 #ifdef ASSERT
  1837   bool reg_destroyed[RegisterImpl::number_of_registers];
  1838   bool freg_destroyed[FloatRegisterImpl::number_of_registers];
  1839   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
  1840     reg_destroyed[r] = false;
  1842   for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) {
  1843     freg_destroyed[f] = false;
  1846 #endif /* ASSERT */
  1848   // This may iterate in two different directions depending on the
  1849   // kind of native it is.  The reason is that for regular JNI natives
  1850   // the incoming and outgoing registers are offset upwards and for
  1851   // critical natives they are offset down.
  1852   GrowableArray<int> arg_order(2 * total_in_args);
  1853   VMRegPair tmp_vmreg;
  1854   tmp_vmreg.set1(T8->as_VMReg());
  1856   if (!is_critical_native) {
  1857     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
  1858       arg_order.push(i);
  1859       arg_order.push(c_arg);
  1861   } else {
  1862     // Compute a valid move order, using tmp_vmreg to break any cycles
  1863      __ stop("generate_native_wrapper in sharedRuntime <2>");
  1864     //TODO:Fu
  1865     // ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
  1868   int temploc = -1;
  1869   for (int ai = 0; ai < arg_order.length(); ai += 2) {
  1870     int i = arg_order.at(ai);
  1871     int c_arg = arg_order.at(ai + 1);
  1872     __ block_comment(err_msg("move %d -> %d", i, c_arg));
  1873     if (c_arg == -1) {
  1874       assert(is_critical_native, "should only be required for critical natives");
  1875       // This arg needs to be moved to a temporary
  1876       __ move(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
  1877       in_regs[i] = tmp_vmreg;
  1878       temploc = i;
  1879       continue;
  1880     } else if (i == -1) {
  1881       assert(is_critical_native, "should only be required for critical natives");
  1882       // Read from the temporary location
  1883       assert(temploc != -1, "must be valid");
  1884       i = temploc;
  1885       temploc = -1;
  1887 #ifdef ASSERT
  1888     if (in_regs[i].first()->is_Register()) {
  1889       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
  1890     } else if (in_regs[i].first()->is_FloatRegister()) {
  1891       assert(!freg_destroyed[in_regs[i].first()->as_FloatRegister()->encoding()], "destroyed reg!");
  1893     if (out_regs[c_arg].first()->is_Register()) {
  1894       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1895     } else if (out_regs[c_arg].first()->is_FloatRegister()) {
  1896       freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
  1898 #endif /* ASSERT */
  1899     switch (in_sig_bt[i]) {
  1900       case T_ARRAY:
  1901         if (is_critical_native) {
  1902           __ stop("generate_native_wrapper in sharedRuntime <2>");
  1903           //TODO:Fu
  1904           // unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
  1905           c_arg++;
  1906 #ifdef ASSERT
  1907           if (out_regs[c_arg].first()->is_Register()) {
  1908             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1909           } else if (out_regs[c_arg].first()->is_FloatRegister()) {
  1910             freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
  1912 #endif
  1913           break;
  1915       case T_OBJECT:
  1916         assert(!is_critical_native, "no oop arguments");
  1917         object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
  1918                     ((i == 0) && (!is_static)),
  1919                     &receiver_offset);
  1920         break;
  1921       case T_VOID:
  1922         break;
  1924       case T_FLOAT:
  1925         float_move(masm, in_regs[i], out_regs[c_arg]);
  1926           break;
  1928       case T_DOUBLE:
  1929         assert( i + 1 < total_in_args &&
  1930                 in_sig_bt[i + 1] == T_VOID &&
  1931                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
  1932         double_move(masm, in_regs[i], out_regs[c_arg]);
  1933         break;
  1935       case T_LONG :
  1936         long_move(masm, in_regs[i], out_regs[c_arg]);
  1937         break;
  1939       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  1941       default:
  1942         simple_move32(masm, in_regs[i], out_regs[c_arg]);
  1946   // point c_arg at the first arg that is already loaded in case we
  1947   // need to spill before we call out
  1948   c_arg = total_c_args - total_in_args;
  1949   // Pre-load a static method's oop.  Used both by locking code and
  1950   // the normal JNI call code.
  1952   __ move(oop_handle_reg, A1);
  1954   if (method->is_static() && !is_critical_native) {
  1956     // load oop into a register
  1957     int oop_index = __ oop_recorder()->find_index(JNIHandles::make_local(
  1958           (method->method_holder())->java_mirror()));
  1961     RelocationHolder rspec = oop_Relocation::spec(oop_index);
  1962     __ relocate(rspec);
  1963     __ patchable_set48(oop_handle_reg, (long)JNIHandles::make_local((method->method_holder())->java_mirror()));
  1964     // Now handlize the static class mirror it's known not-null.
  1965     __ sd( oop_handle_reg, SP, klass_offset);
  1966     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
  1968     // Now get the handle
  1969     __ lea(oop_handle_reg, Address(SP, klass_offset));
  1970     // store the klass handle as second argument
  1971     __ move(A1, oop_handle_reg);
  1972     // and protect the arg if we must spill
  1973     c_arg--;
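           // In other words (a sketch of the convention assumed here): a static native receives
           // (JNIEnv*, jclass, ...) and the jclass passed in A1 is a handle to the mirror spilled
           // at SP + klass_offset, which the oop map above exposes to the GC.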
  1976   // Change state to native (we save the return address in the thread, since it might not
  1977   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  1978   // points into the right code segment. It does not have to be the correct return pc.
  1979   // We use the same pc/oopMap repeatedly when we call out
  1981   intptr_t the_pc = (intptr_t) __ pc();
  1982   oop_maps->add_gc_map(the_pc - start, map);
  1984   __ set_last_Java_frame(SP, noreg, NULL);
  1985   __ relocate(relocInfo::internal_pc_type);
  1987     intptr_t save_pc = (intptr_t)the_pc ;
  1988     __ patchable_set48(AT, save_pc);
  1990   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  1993   // We have all of the arguments set up at this point. We must not touch any register
  1994   // argument registers from here on (if we save/restore them there is no oop map covering them).
  1996     SkipIfEqual skip_if(masm, &DTraceMethodProbes, 0);
  1997     int metadata_index = __ oop_recorder()->find_index(method());
  1998     RelocationHolder rspec = metadata_Relocation::spec(metadata_index);
  1999     __ relocate(rspec);
  2000     __ patchable_set48(AT, (long)(method()));
  2002     __ call_VM_leaf(
  2003       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
  2004       thread, AT);
  2008   // These are register definitions we need for locking/unlocking
  2009   const Register swap_reg = T8;  // Must use T8 for cmpxchg instruction
  2010   const Register obj_reg  = T9;  // Will contain the oop
  2011   //const Register lock_reg = T6;  // Address of compiler lock object (BasicLock)
  2012   const Register lock_reg = c_rarg0;  // Address of compiler lock object (BasicLock)
  2016   Label slow_path_lock;
  2017   Label lock_done;
  2019   // Lock a synchronized method
  2020   if (method->is_synchronized()) {
  2021     assert(!is_critical_native, "unhandled");
  2023     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
  2025     // Get the handle (the 2nd argument)
  2026     __ move(oop_handle_reg, A1);
  2028     // Get address of the box
  2029     __ lea(lock_reg, Address(FP, lock_slot_fp_offset));
  2031     // Load the oop from the handle
  2032     __ ld(obj_reg, oop_handle_reg, 0);
  2034     if (UseBiasedLocking) {
  2035       // Note that oop_handle_reg is trashed during this call
  2036       __ biased_locking_enter(lock_reg, obj_reg, swap_reg, A1, false, lock_done, &slow_path_lock);
  2039     // Load immediate 1 into swap_reg %T8
  2040     __ move(swap_reg, 1);
  2042     __ ld(AT, obj_reg, 0);
  2043     __ orr(swap_reg, swap_reg, AT);
  2045     __ sd( swap_reg, lock_reg, mark_word_offset);
  2046     __ cmpxchg(lock_reg, Address(obj_reg, 0), swap_reg);
  2047     __ bne(AT, R0, lock_done);
  2048     __ delayed()->nop();
  2049     // Test if the oopMark is an obvious stack pointer, i.e.,
  2050     //  1) (mark & 3) == 0, and
  2051     //  2) sp <= mark < mark + os::pagesize()
  2052     // These 3 tests can be done by evaluating the following
  2053     // expression: ((mark - sp) & (3 - os::vm_page_size())),
  2054     // assuming both stack pointer and pagesize have their
  2055     // least significant 2 bits clear.
  2056     // NOTE: the oopMark is in swap_reg %T8 as the result of cmpxchg
  2058     __ dsub(swap_reg, swap_reg, SP);
  2059     __ move(AT, 3 - os::vm_page_size());
  2060     __ andr(swap_reg , swap_reg, AT);
  2061     // Save the test result, for recursive case, the result is zero
  2062     __ sd(swap_reg, lock_reg, mark_word_offset);
  2063     __ bne(swap_reg, R0, slow_path_lock);
  2064     __ delayed()->nop();
  2065     // Slow path will re-enter here
  2066     __ bind(lock_done);
  2068     if (UseBiasedLocking) {
  2069       // Re-fetch oop_handle_reg as we trashed it above
  2070       __ move(A1, oop_handle_reg);
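           // Pseudo-C sketch of the fast-path locking above (stack-lock / displaced-header scheme):
           //   box->displaced_header = obj->mark() | unlocked_value;              // speculate unlocked
           //   if (CAS(&obj->mark(), box->displaced_header, box)) goto lock_done; // we own the lock
           //   tmp = (old_mark - SP) & (3 - page_size);  // zero iff the mark points into our own stack
           //   box->displaced_header = tmp;              // zero marks a recursive (re-entrant) lock
           //   if (tmp != 0) goto slow_path_lock;        // otherwise fall through to lock_done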
  2075   // Finally just about ready to make the JNI call
  2078   // get JNIEnv* which is first argument to native
  2079   if (!is_critical_native) {
  2080     __ addi(A0, thread, in_bytes(JavaThread::jni_environment_offset()));
  2083   // Example: Java_java_lang_ref_Finalizer_invokeFinalizeMethod(JNIEnv *env, jclass clazz, jobject ob)
  2084   // Load the second arguments into A1
  2085   //__ ld(A1, SP , wordSize );   // klass
  2087   // Now set thread in native
  2088   __ addi(AT, R0, _thread_in_native);
  2089   __ sw(AT, thread, in_bytes(JavaThread::thread_state_offset()));
  2090   // do the call
  2091   __ call(method->native_function(), relocInfo::runtime_call_type);
  2092   __ delayed()->nop();
  2093   // WARNING - on Windows Java Natives use pascal calling convention and pop the
  2094   // arguments off of the stack. We could just re-adjust the stack pointer here
  2095   // and continue to do SP relative addressing but we instead switch to FP
  2096   // relative addressing.
  2098   // Unpack native results.
  2099   switch (ret_type) {
  2100   case T_BOOLEAN: __ c2bool(V0);            break;
  2101   case T_CHAR   : __ andi(V0, V0, 0xFFFF);      break;
  2102   case T_BYTE   : __ sign_extend_byte (V0); break;
  2103   case T_SHORT  : __ sign_extend_short(V0); break;
  2104   case T_INT    : // nothing to do         break;
  2105   case T_DOUBLE :
  2106   case T_FLOAT  :
  2107   // Result is in the FP return register (F0); we'll save it as needed
  2108   break;
  2109   case T_ARRAY:                 // Really a handle
  2110   case T_OBJECT:                // Really a handle
  2111   break; // can't de-handlize until after safepoint check
  2112   case T_VOID: break;
  2113   case T_LONG: break;
  2114   default       : ShouldNotReachHere();
  2116   // Switch thread to "native transition" state before reading the synchronization state.
  2117   // This additional state is necessary because reading and testing the synchronization
  2118   // state is not atomic w.r.t. GC, as this scenario demonstrates:
  2119   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  2120   //     VM thread changes sync state to synchronizing and suspends threads for GC.
  2121   //     Thread A is resumed to finish this native method, but doesn't block here since it
  2122   //     didn't see any synchronization in progress, and escapes.
  2123   __ addi(AT, R0, _thread_in_native_trans);
  2124   __ sw(AT, thread, in_bytes(JavaThread::thread_state_offset()));
  2126   //if(os::is_MP()) {}
  2128   Label after_transition;
  2130   // check for safepoint operation in progress and/or pending suspend requests
  2132     Label Continue;
  2133     __ li(AT, SafepointSynchronize::address_of_state());
  2134     __ lw(A0, AT, 0);
  2135     __ addi(AT, A0, -SafepointSynchronize::_not_synchronized);
  2136     Label L;
  2137     __ bne(AT, R0, L);
  2138     __ delayed()->nop();
  2139     __ lw(AT, thread, in_bytes(JavaThread::suspend_flags_offset()));
  2140     __ beq(AT, R0, Continue);
  2141     __ delayed()->nop();
  2142     __ bind(L);
  2144     // Don't use call_VM as it will see a possible pending exception and forward it
  2145     // and never return here preventing us from clearing _last_native_pc down below.
  2146     //
  2147     save_native_result(masm, ret_type, stack_slots);
  2148     __ move(A0, thread);
  2149     __ addi(SP, SP, -wordSize);
  2150     __ push(S2);
  2151     __ move(AT, -(StackAlignmentInBytes));
  2152     __ move(S2, SP);     // use S2 as a sender SP holder
  2153     __ andr(SP, SP, AT); // align stack as required by ABI
  2154     if (!is_critical_native) {
  2155       __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans), relocInfo::runtime_call_type);
  2156       __ delayed()->nop();
  2157     } else {
  2158       __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition), relocInfo::runtime_call_type);
  2159       __ delayed()->nop();
  2161     __ move(SP, S2);     // use S2 as a sender SP holder
  2162     __ pop(S2);
  2163     __ addi(SP, SP, wordSize);
  2164     //add for compressedoops
  2165     __ reinit_heapbase();
  2166     // Restore any method result value
  2167     restore_native_result(masm, ret_type, stack_slots);
  2169     if (is_critical_native) {
  2170       // The call above performed the transition to thread_in_Java so
  2171       // skip the transition logic below.
  2172       __ beq(R0, R0, after_transition);
  2173       __ delayed()->nop();
  2176     __ bind(Continue);
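           // Equivalent pseudo-C of the transition check above:
           //   if (SafepointSynchronize::state != _not_synchronized || thread->suspend_flags != 0)
           //     check_special_condition_for_native_trans(thread);  // may block for GC or suspension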
  2179   // change thread state
  2180   __ addi(AT, R0, _thread_in_Java);
  2181   __ sw(AT,  thread, in_bytes(JavaThread::thread_state_offset()));
  2182   __ bind(after_transition);
  2183   Label reguard;
  2184   Label reguard_done;
  2185   __ lw(AT, thread, in_bytes(JavaThread::stack_guard_state_offset()));
  2186   __ addi(AT, AT, -JavaThread::stack_guard_yellow_disabled);
  2187   __ beq(AT, R0, reguard);
  2188   __ delayed()->nop();
  2189   // slow path reguard  re-enters here
  2190   __ bind(reguard_done);
  2192   // Handle possible exception (will unlock if necessary)
  2194   // native result if any is live
  2196   // Unlock
  2197   Label slow_path_unlock;
  2198   Label unlock_done;
  2199   if (method->is_synchronized()) {
  2201     Label done;
  2203     // Get locked oop from the handle we passed to jni
  2204     __ ld( obj_reg, oop_handle_reg, 0);
  2205     if (UseBiasedLocking) {
  2206       __ biased_locking_exit(obj_reg, T8, done);
  2210     // Simple recursive lock?
  2212     __ ld(AT, FP, lock_slot_fp_offset);
  2213     __ beq(AT, R0, done);
  2214     __ delayed()->nop();
  2215     // Must save FSF if it is live now because cmpxchg must use it
  2216     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  2217       save_native_result(masm, ret_type, stack_slots);
  2220     //  get old displaced header
  2221     __ ld (T8, FP, lock_slot_fp_offset);
  2222     // get address of the stack lock
  2223     __ addi (c_rarg0, FP, lock_slot_fp_offset);
  2224     // Atomic swap old header if oop still contains the stack lock
  2225     __ cmpxchg(T8, Address(obj_reg, 0), c_rarg0);
  2227     __ beq(AT, R0, slow_path_unlock);
  2228     __ delayed()->nop();
  2229     // slow path re-enters here
  2230     __ bind(unlock_done);
  2231     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  2232       restore_native_result(masm, ret_type, stack_slots);
  2235     __ bind(done);
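           // Pseudo-C sketch of the fast-path unlocking above:
           //   if (box->displaced_header == 0) goto done;           // recursive lock, nothing to undo
           //   if (!CAS(&obj->mark(), box, box->displaced_header))  // restore the saved header
           //     goto slow_path_unlock;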
  2239     SkipIfEqual skip_if(masm, &DTraceMethodProbes, 0);
  2240     // Tell dtrace about this method exit
  2241     save_native_result(masm, ret_type, stack_slots);
  2242     int metadata_index = __ oop_recorder()->find_index( (method()));
  2243     RelocationHolder rspec = metadata_Relocation::spec(metadata_index);
  2244     __ relocate(rspec);
  2245     __ patchable_set48(AT, (long)(method()));
  2247     __ call_VM_leaf(
  2248          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
  2249          thread, AT);
  2250     restore_native_result(masm, ret_type, stack_slots);
  2253   // We can finally stop using that last_Java_frame we setup ages ago
  2255   __ reset_last_Java_frame(false);
  2257   // Unpack oop result, e.g. JNIHandles::resolve value.
  2258   if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
  2259     __ resolve_jobject(V0, thread, T9);
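           // resolve_jobject is expected to unpack the returned jobject here: NULL stays NULL,
           // weak handles are recognized by their tag and go through the GC pre-barrier before
           // the oop is used, and ordinary handles are simply dereferenced (T9 is a temp).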
  2262   if (!is_critical_native) {
  2263     // reset handle block
  2264     __ ld(AT, thread, in_bytes(JavaThread::active_handles_offset()));
  2265     __ sw(R0, AT, JNIHandleBlock::top_offset_in_bytes());
  2268   if (!is_critical_native) {
  2269     // Any exception pending?
  2270     __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2271     __ bne(AT, R0, exception_pending);
  2272     __ delayed()->nop();
  2274   // no exception, we're almost done
  2276   // check that only result value is on FPU stack
  2277   __ verify_FPU(ret_type == T_FLOAT || ret_type == T_DOUBLE ? 1 : 0, "native_wrapper normal exit");
  2279   // Return
  2280 #ifndef OPT_THREAD
  2281   __ get_thread(TREG);
  2282 #endif
  2283   //__ ld_ptr(SP, TREG, in_bytes(JavaThread::last_Java_sp_offset()));
  2284   __ leave();
  2286   __ jr(RA);
  2287   __ delayed()->nop();
  2288   // Unexpected paths are out of line and go here
  2289   // Slow path locking & unlocking
  2290   if (method->is_synchronized()) {
  2292     // BEGIN Slow path lock
  2293     __ bind(slow_path_lock);
  2295     // protect the args we've loaded
  2296     save_args(masm, total_c_args, c_arg, out_regs);
  2298     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
  2299     // args are (oop obj, BasicLock* lock, JavaThread* thread)
  2301     __ move(A0, obj_reg);
  2302     __ move(A1, lock_reg);
  2303     __ move(A2, thread);
  2304     __ addi(SP, SP, - 3*wordSize);
  2306     __ move(AT, -(StackAlignmentInBytes));
  2307     __ move(S2, SP);     // use S2 as a sender SP holder
  2308     __ andr(SP, SP, AT); // align stack as required by ABI
  2310     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), relocInfo::runtime_call_type);
  2311     __ delayed()->nop();
  2312     __ move(SP, S2);
  2313     __ addi(SP, SP, 3*wordSize);
  2315     restore_args(masm, total_c_args, c_arg, out_regs);
  2317 #ifdef ASSERT
  2318     { Label L;
  2319       __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2320       __ beq(AT, R0, L);
  2321       __ delayed()->nop();
  2322       __ stop("no pending exception allowed on exit from monitorenter");
  2323       __ bind(L);
  2325 #endif
  2326     __ b(lock_done);
  2327     __ delayed()->nop();
  2328     // END Slow path lock
  2330     // BEGIN Slow path unlock
  2331     __ bind(slow_path_unlock);
  2333     // Slow path unlock
  2335     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  2336       save_native_result(masm, ret_type, stack_slots);
  2338     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
  2340     __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2341     __ push(AT);
  2342     __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));
  2344     __ move(AT, -(StackAlignmentInBytes));
  2345     __ move(S2, SP);     // use S2 as a sender SP holder
  2346     __ andr(SP, SP, AT); // align stack as required by ABI
  2348     // should be a peal
  2349     // +wordSize because of the push above
  2350     __ addi(A1, FP, lock_slot_fp_offset);
  2352     __ move(A0, obj_reg);
  2353     __ addi(SP,SP, -2*wordSize);
  2354     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C),
  2355         relocInfo::runtime_call_type);
  2356     __ delayed()->nop();
  2357     __ addi(SP, SP, 2*wordSize);
  2358     __ move(SP, S2);
  2359     //add for compressedoops
  2360     __ reinit_heapbase();
  2361 #ifdef ASSERT
  2363       Label L;
  2364       __ lw( AT, thread, in_bytes(Thread::pending_exception_offset()));
  2365       __ beq(AT, R0, L);
  2366       __ delayed()->nop();
  2367       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
  2368       __ bind(L);
  2370 #endif /* ASSERT */
  2372     __ pop(AT);
  2373     __ sd(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2374     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  2375       restore_native_result(masm, ret_type, stack_slots);
  2377     __ b(unlock_done);
  2378     __ delayed()->nop();
  2379     // END Slow path unlock
  2383   // SLOW PATH Reguard the stack if needed
  2385   __ bind(reguard);
  2386   save_native_result(masm, ret_type, stack_slots);
  2387   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages),
  2388       relocInfo::runtime_call_type);
  2389   __ delayed()->nop();
  2390   //add for compressedoops
  2391   __ reinit_heapbase();
  2392   restore_native_result(masm, ret_type, stack_slots);
  2393   __ b(reguard_done);
  2394   __ delayed()->nop();
  2396   // BEGIN EXCEPTION PROCESSING
  2397   if (!is_critical_native) {
  2398     // Forward  the exception
  2399     __ bind(exception_pending);
  2401     // remove possible return value from FPU register stack
  2402     __ empty_FPU_stack();
  2404     // pop our frame
  2405     // forward_exception_entry needs the return address on the stack
  2406     __ addiu(SP, FP, wordSize);
  2407     __ ld(FP, SP, (-1) * wordSize);
  2409     // and forward the exception
  2410     __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2411     __ delayed()->nop();
  2413   __ flush();
  2415   nmethod *nm = nmethod::new_native_nmethod(method,
  2416                                             compile_id,
  2417                                             masm->code(),
  2418                                             vep_offset,
  2419                                             frame_complete,
  2420                                             stack_slots / VMRegImpl::slots_per_word,
  2421                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
  2422                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
  2423                                             oop_maps);
  2425   if (is_critical_native) {
  2426     nm->set_lazy_critical_native(true);
  2429   return nm;
  2433 #ifdef HAVE_DTRACE_H
  2434 // ---------------------------------------------------------------------------
  2435 // Generate a dtrace nmethod for a given signature.  The method takes arguments
  2436 // in the Java compiled code convention, marshals them to the native
  2437 // abi and then leaves nops at the position you would expect to call a native
  2438 // function. When the probe is enabled the nops are replaced with a trap
  2439 // instruction that dtrace inserts and the trace will cause a notification
  2440 // to dtrace.
  2441 //
  2442 // The probes are only able to take primitive types and java/lang/String as
  2443 // arguments.  No other java types are allowed. Strings are converted to utf8
  2444 // strings so that from dtrace point of view java strings are converted to C
  2445 // strings. There is an arbitrary fixed limit on the total space that a method
  2446 // can use for converting the strings. (256 chars per string in the signature).
  2447   // So any java string larger than this is truncated.
  2449 static int  fp_offset[ConcreteRegisterImpl::number_of_registers] = { 0 };
  2450 static bool offsets_initialized = false;
  2452 static VMRegPair reg64_to_VMRegPair(Register r) {
  2453   VMRegPair ret;
  2454   if (wordSize == 8) {
  2455     ret.set2(r->as_VMReg());
  2456   } else {
  2457     ret.set_pair(r->successor()->as_VMReg(), r->as_VMReg());
  2459   return ret;
  2463 nmethod *SharedRuntime::generate_dtrace_nmethod(MacroAssembler *masm,
  2464                                                 methodHandle method) {
  2467   // generate_dtrace_nmethod is guarded by a mutex so we are sure to
  2468   // be single threaded in this method.
  2469   assert(AdapterHandlerLibrary_lock->owned_by_self(), "must be");
  2471   // Fill in the signature array, for the calling-convention call.
  2472   int total_args_passed = method->size_of_parameters();
  2474   BasicType* in_sig_bt  = NEW_RESOURCE_ARRAY(BasicType, total_args_passed);
  2475   VMRegPair  *in_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed);
  2477   // The signature we are going to use for the trap that dtrace will see
  2478   // java/lang/String is converted. We drop "this" and any other object
  2479   // is converted to NULL.  (A one-slot java/lang/Long object reference
  2480   // is converted to a two-slot long, which is why we double the allocation).
  2481   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_args_passed * 2);
  2482   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed * 2);
  2484   int i=0;
  2485   int total_strings = 0;
  2486   int first_arg_to_pass = 0;
  2487   int total_c_args = 0;
  2489   // Skip the receiver as dtrace doesn't want to see it
  2490   if( !method->is_static() ) {
  2491     in_sig_bt[i++] = T_OBJECT;
  2492     first_arg_to_pass = 1;
  2495   SignatureStream ss(method->signature());
  2496   for ( ; !ss.at_return_type(); ss.next()) {
  2497     BasicType bt = ss.type();
  2498     in_sig_bt[i++] = bt;  // Collect remaining bits of signature
  2499     out_sig_bt[total_c_args++] = bt;
  2500     if( bt == T_OBJECT) {
  2501       symbolOop s = ss.as_symbol_or_null();
  2502       if (s == vmSymbols::java_lang_String()) {
  2503         total_strings++;
  2504         out_sig_bt[total_c_args-1] = T_ADDRESS;
  2505       } else if (s == vmSymbols::java_lang_Boolean() ||
  2506                  s == vmSymbols::java_lang_Byte()) {
  2507         out_sig_bt[total_c_args-1] = T_BYTE;
  2508       } else if (s == vmSymbols::java_lang_Character() ||
  2509                  s == vmSymbols::java_lang_Short()) {
  2510         out_sig_bt[total_c_args-1] = T_SHORT;
  2511       } else if (s == vmSymbols::java_lang_Integer() ||
  2512                  s == vmSymbols::java_lang_Float()) {
  2513         out_sig_bt[total_c_args-1] = T_INT;
  2514       } else if (s == vmSymbols::java_lang_Long() ||
  2515                  s == vmSymbols::java_lang_Double()) {
  2516         out_sig_bt[total_c_args-1] = T_LONG;
  2517         out_sig_bt[total_c_args++] = T_VOID;
  2519     } else if ( bt == T_LONG || bt == T_DOUBLE ) {
  2520       in_sig_bt[i++] = T_VOID;   // Longs & doubles take 2 Java slots
  2521       // We convert double to long
  2522       out_sig_bt[total_c_args-1] = T_LONG;
  2523       out_sig_bt[total_c_args++] = T_VOID;
  2524     } else if ( bt == T_FLOAT) {
  2525       // We convert float to int
  2526       out_sig_bt[total_c_args-1] = T_INT;
  2530   assert(i==total_args_passed, "validly parsed signature");
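         // Illustrative example only: for a probe  void log(String s, Long v)  the trap
         // signature built above becomes  (char* utf8_of_s, jlong value_of_v)  -- the String is
         // copied as bounded utf8 into the stack area reserved below and the box is unwrapped
         // to its primitive value.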
  2532   // Now get the compiled-Java layout as input arguments
  2533   int comp_args_on_stack;
  2534   comp_args_on_stack = SharedRuntime::java_calling_convention(
  2535       in_sig_bt, in_regs, total_args_passed, false);
  2537   // We have received a description of where all the java arg are located
  2538   // on entry to the wrapper. We need to convert these args to where
  2539   // a native (non-jni) function would expect them. To figure out
  2540   // where they go we convert the java signature to a C signature and remove
  2541   // T_VOID for any long/double we might have received.
  2544   // Now figure out where the args must be stored and how much stack space
  2545   // they require (neglecting out_preserve_stack_slots but including space for storing
  2546   // the 1st six register arguments). It's weird; see int_stk_helper.
  2548   int out_arg_slots;
  2549   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  2551   // Calculate the total number of stack slots we will need.
  2553   // First count the abi requirement plus all of the outgoing args
  2554   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  2556   // Plus a temp for possible conversion of float/double/long register args
  2558   int conversion_temp = stack_slots;
  2559   stack_slots += 2;
  2562   // Now space for the string(s) we must convert
  2564   int string_locs = stack_slots;
  2565   stack_slots += total_strings *
  2566                    (max_dtrace_string_size / VMRegImpl::stack_slot_size);
  2568   // Ok The space we have allocated will look like:
  2569   //
  2570   //
  2571   // FP-> |                     |
  2572   //      |---------------------|
  2573   //      | string[n]           |
  2574   //      |---------------------| <- string_locs[n]
  2575   //      | string[n-1]         |
  2576   //      |---------------------| <- string_locs[n-1]
  2577   //      | ...                 |
  2578   //      | ...                 |
  2579   //      |---------------------| <- string_locs[1]
  2580   //      | string[0]           |
  2581   //      |---------------------| <- string_locs[0]
  2582   //      | temp                |
  2583   //      |---------------------| <- conversion_temp
  2584   //      | outbound memory     |
  2585   //      | based arguments     |
  2586   //      |                     |
  2587   //      |---------------------|
  2588   //      |                     |
  2589   // SP-> | out_preserved_slots |
  2590   //
  2591   //
  2593   // Now compute actual number of stack words we need rounding to make
  2594   // stack properly aligned.
  2595   stack_slots = round_to(stack_slots, 4 * VMRegImpl::slots_per_word);
  2597   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
  2599   intptr_t start = (intptr_t)__ pc();
  2601   // First thing make an ic check to see if we should even be here
  2604     Label L;
  2605     const Register temp_reg = G3_scratch;
  2606     Address ic_miss(temp_reg, SharedRuntime::get_ic_miss_stub());
  2607     __ verify_oop(O0);
  2608     __ ld_ptr(O0, oopDesc::klass_offset_in_bytes(), temp_reg);
  2609     __ cmp(temp_reg, G5_inline_cache_reg);
  2610     __ brx(Assembler::equal, true, Assembler::pt, L);
  2611     __ delayed()->nop();
  2613     __ jump_to(ic_miss, 0);
  2614     __ delayed()->nop();
  2615     __ align(CodeEntryAlignment);
  2616     __ bind(L);
  2619   int vep_offset = ((intptr_t)__ pc()) - start;
  2622   // The instruction at the verified entry point must be 5 bytes or longer
  2623   // because it can be patched on the fly by make_non_entrant. The stack bang
  2624   // instruction fits that requirement.
  2626   // Generate stack overflow check before creating frame
  2627   __ generate_stack_overflow_check(stack_size);
  2629   assert(((intptr_t)__ pc() - start - vep_offset) >= 5,
  2630          "valid size for make_non_entrant");
  2632   // Generate a new frame for the wrapper.
  2633   __ save(SP, -stack_size, SP);
  2635   // Frame is now completed as far a size and linkage.
  2637   int frame_complete = ((intptr_t)__ pc()) - start;
  2639 #ifdef ASSERT
  2640   bool reg_destroyed[RegisterImpl::number_of_registers];
  2641   bool freg_destroyed[FloatRegisterImpl::number_of_registers];
  2642   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
  2643     reg_destroyed[r] = false;
  2645   for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) {
  2646     freg_destroyed[f] = false;
  2649 #endif /* ASSERT */
  2651   VMRegPair zero;
  2652   const Register g0 = G0; // without this we get a compiler warning (why??)
  2653   zero.set2(g0->as_VMReg());
  2655   int c_arg, j_arg;
  2657   Register conversion_off = noreg;
  2659   for (j_arg = first_arg_to_pass, c_arg = 0 ;
  2660        j_arg < total_args_passed ; j_arg++, c_arg++ ) {
  2662     VMRegPair src = in_regs[j_arg];
  2663     VMRegPair dst = out_regs[c_arg];
  2665 #ifdef ASSERT
  2666     if (src.first()->is_Register()) {
  2667       assert(!reg_destroyed[src.first()->as_Register()->encoding()], "ack!");
  2668     } else if (src.first()->is_FloatRegister()) {
  2669       assert(!freg_destroyed[src.first()->as_FloatRegister()->encoding(
  2670                                                FloatRegisterImpl::S)], "ack!");
  2672     if (dst.first()->is_Register()) {
  2673       reg_destroyed[dst.first()->as_Register()->encoding()] = true;
  2674     } else if (dst.first()->is_FloatRegister()) {
  2675       freg_destroyed[dst.first()->as_FloatRegister()->encoding(
  2676                                                  FloatRegisterImpl::S)] = true;
  2678 #endif /* ASSERT */
  2680     switch (in_sig_bt[j_arg]) {
  2681       case T_ARRAY:
  2682       case T_OBJECT:
  2684           if (out_sig_bt[c_arg] == T_BYTE  || out_sig_bt[c_arg] == T_SHORT ||
  2685               out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
  2686             // need to unbox a one-slot value
  2687             Register in_reg = L0;
  2688             Register tmp = L2;
  2689             if ( src.first()->is_reg() ) {
  2690               in_reg = src.first()->as_Register();
  2691             } else {
  2692               assert(Assembler::is_simm13(reg2offset(src.first()) + STACK_BIAS),
  2693                      "must be");
  2694               __ ld_ptr(FP, reg2offset(src.first()) + STACK_BIAS, in_reg);
  2696             // If the final destination is an acceptable register
  2697             if ( dst.first()->is_reg() ) {
  2698               if ( dst.is_single_phys_reg() || out_sig_bt[c_arg] != T_LONG ) {
  2699                 tmp = dst.first()->as_Register();
  2703             Label skipUnbox;
  2704             if ( wordSize == 4 && out_sig_bt[c_arg] == T_LONG ) {
  2705               __ mov(G0, tmp->successor());
  2707             __ br_null(in_reg, true, Assembler::pn, skipUnbox);
  2708             __ delayed()->mov(G0, tmp);
  2710             BasicType bt = out_sig_bt[c_arg];
  2711             int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
  2712             switch (bt) {
  2713                 case T_BYTE:
  2714                   __ ldub(in_reg, box_offset, tmp); break;
  2715                 case T_SHORT:
  2716                   __ lduh(in_reg, box_offset, tmp); break;
  2717                 case T_INT:
  2718                   __ ld(in_reg, box_offset, tmp); break;
  2719                 case T_LONG:
  2720                   __ ld_long(in_reg, box_offset, tmp); break;
  2721                 default: ShouldNotReachHere();
  2724             __ bind(skipUnbox);
  2725             // If tmp wasn't final destination copy to final destination
  2726             if (tmp == L2) {
  2727               VMRegPair tmp_as_VM = reg64_to_VMRegPair(L2);
  2728               if (out_sig_bt[c_arg] == T_LONG) {
  2729                 long_move(masm, tmp_as_VM, dst);
  2730               } else {
  2731                 move32_64(masm, tmp_as_VM, out_regs[c_arg]);
  2734             if (out_sig_bt[c_arg] == T_LONG) {
  2735               assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
  2736               ++c_arg; // move over the T_VOID to keep the loop indices in sync
  2738           } else if (out_sig_bt[c_arg] == T_ADDRESS) {
  2739             Register s =
  2740                 src.first()->is_reg() ? src.first()->as_Register() : L2;
  2741             Register d =
  2742                 dst.first()->is_reg() ? dst.first()->as_Register() : L2;
  2744             // We store the oop now so that the conversion pass can reach it
  2745             // while in the inner frame. This will be the only store if
  2746             // the oop is NULL.
  2747             if (s != L2) {
  2748               // src is register
  2749               if (d != L2) {
  2750                 // dst is register
  2751                 __ mov(s, d);
  2752               } else {
  2753                 assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2754                           STACK_BIAS), "must be");
  2755                 __ st_ptr(s, SP, reg2offset(dst.first()) + STACK_BIAS);
  2757             } else {
  2758                 // src not a register
  2759                 assert(Assembler::is_simm13(reg2offset(src.first()) +
  2760                            STACK_BIAS), "must be");
  2761                 __ ld_ptr(FP, reg2offset(src.first()) + STACK_BIAS, d);
  2762                 if (d == L2) {
  2763                   assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2764                              STACK_BIAS), "must be");
  2765                   __ st_ptr(d, SP, reg2offset(dst.first()) + STACK_BIAS);
  2768           } else if (out_sig_bt[c_arg] != T_VOID) {
  2769             // Convert the arg to NULL
  2770             if (dst.first()->is_reg()) {
  2771               __ mov(G0, dst.first()->as_Register());
  2772             } else {
  2773               assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2774                          STACK_BIAS), "must be");
  2775               __ st_ptr(G0, SP, reg2offset(dst.first()) + STACK_BIAS);
  2779         break;
  2780       case T_VOID:
  2781         break;
  2783       case T_FLOAT:
  2784         if (src.first()->is_stack()) {
  2785           // Stack to stack/reg is simple
  2786           move32_64(masm, src, dst);
  2787         } else {
  2788           if (dst.first()->is_reg()) {
  2789             // freg -> reg
  2790             int off =
  2791               STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2792             Register d = dst.first()->as_Register();
  2793             if (Assembler::is_simm13(off)) {
  2794               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2795                      SP, off);
  2796               __ ld(SP, off, d);
  2797             } else {
  2798               if (conversion_off == noreg) {
  2799                 __ set(off, L6);
  2800                 conversion_off = L6;
  2802               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2803                      SP, conversion_off);
  2804               __ ld(SP, conversion_off , d);
  2806           } else {
  2807             // freg -> mem
  2808             int off = STACK_BIAS + reg2offset(dst.first());
  2809             if (Assembler::is_simm13(off)) {
  2810               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2811                      SP, off);
  2812             } else {
  2813               if (conversion_off == noreg) {
  2814                 __ set(off, L6);
  2815                 conversion_off = L6;
  2817               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2818                      SP, conversion_off);
  2822         break;
  2824       case T_DOUBLE:
  2825         assert( j_arg + 1 < total_args_passed &&
  2826                 in_sig_bt[j_arg + 1] == T_VOID &&
  2827                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
  2828         if (src.first()->is_stack()) {
  2829           // Stack to stack/reg is simple
  2830           long_move(masm, src, dst);
  2831         } else {
  2832           Register d = dst.first()->is_reg() ? dst.first()->as_Register() : L2;
  2834           // Destination could be an odd reg on 32bit in which case
  2835           // we can't load directly into the destination.
  2837           if (!d->is_even() && wordSize == 4) {
  2838             d = L2;
  2840           int off = STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2841           if (Assembler::is_simm13(off)) {
  2842             __ stf(FloatRegisterImpl::D, src.first()->as_FloatRegister(),
  2843                    SP, off);
  2844             __ ld_long(SP, off, d);
  2845           } else {
  2846             if (conversion_off == noreg) {
  2847               __ set(off, L6);
  2848               conversion_off = L6;
  2850             __ stf(FloatRegisterImpl::D, src.first()->as_FloatRegister(),
  2851                    SP, conversion_off);
  2852             __ ld_long(SP, conversion_off, d);
  2854           if (d == L2) {
  2855             long_move(masm, reg64_to_VMRegPair(L2), dst);
  2858         break;
  2860       case T_LONG :
  2861         // 32bit can't do a split move of something like g1 -> O0, O1
  2862         // so use a memory temp
  2863         if (src.is_single_phys_reg() && wordSize == 4) {
  2864           Register tmp = L2;
  2865           if (dst.first()->is_reg() &&
  2866               (wordSize == 8 || dst.first()->as_Register()->is_even())) {
  2867             tmp = dst.first()->as_Register();
  2870           int off = STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2871           if (Assembler::is_simm13(off)) {
  2872             __ stx(src.first()->as_Register(), SP, off);
  2873             __ ld_long(SP, off, tmp);
  2874           } else {
  2875             if (conversion_off == noreg) {
  2876               __ set(off, L6);
  2877               conversion_off = L6;
  2879             __ stx(src.first()->as_Register(), SP, conversion_off);
  2880             __ ld_long(SP, conversion_off, tmp);
  2883           if (tmp == L2) {
  2884             long_move(masm, reg64_to_VMRegPair(L2), dst);
  2886         } else {
  2887           long_move(masm, src, dst);
  2889         break;
  2891       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  2893       default:
  2894         move32_64(masm, src, dst);
  2899   // If we have any strings we must store any register-based arg to the stack.
  2900   // This includes any still-live xmm registers too.
  2902   if (total_strings > 0 ) {
  2904     // protect all the arg registers
  2905     __ save_frame(0);
  2906     __ mov(G2_thread, L7_thread_cache);
  2907     const Register L2_string_off = L2;
  2909     // Get first string offset
  2910     __ set(string_locs * VMRegImpl::stack_slot_size, L2_string_off);
  2912     for (c_arg = 0 ; c_arg < total_c_args ; c_arg++ ) {
  2913       if (out_sig_bt[c_arg] == T_ADDRESS) {
  2915         VMRegPair dst = out_regs[c_arg];
  2916         const Register d = dst.first()->is_reg() ?
  2917             dst.first()->as_Register()->after_save() : noreg;
  2919         // It's a string; the oop was already copied to the out arg
  2920         // position
  2921         if (d != noreg) {
  2922           __ mov(d, O0);
  2923         } else {
  2924           assert(Assembler::is_simm13(reg2offset(dst.first()) + STACK_BIAS),
  2925                  "must be");
  2926           __ ld_ptr(FP,  reg2offset(dst.first()) + STACK_BIAS, O0);
  2928         Label skip;
  2930         __ br_null(O0, false, Assembler::pn, skip);
  2931         __ delayed()->add(FP, L2_string_off, O1);
  2933         if (d != noreg) {
  2934           __ mov(O1, d);
  2935         } else {
  2936           assert(Assembler::is_simm13(reg2offset(dst.first()) + STACK_BIAS),
  2937                  "must be");
  2938           __ st_ptr(O1, FP,  reg2offset(dst.first()) + STACK_BIAS);
  2941         __ call(CAST_FROM_FN_PTR(address, SharedRuntime::get_utf),
  2942                 relocInfo::runtime_call_type);
  2943         __ delayed()->add(L2_string_off, max_dtrace_string_size, L2_string_off);
  2945         __ bind(skip);
  2950     __ mov(L7_thread_cache, G2_thread);
  2951     __ restore();
  2956   // Ok now we are done. Need to place the nop that dtrace wants in order to
  2957   // patch in the trap
  2959   int patch_offset = ((intptr_t)__ pc()) - start;
  2961   __ nop();
  2964   // Return
  2966   __ ret();
  2967   __ delayed()->restore();
  2969   __ flush();
  2971   nmethod *nm = nmethod::new_dtrace_nmethod(
  2972       method, masm->code(), vep_offset, patch_offset, frame_complete,
  2973       stack_slots / VMRegImpl::slots_per_word);
  2974   return nm;
  2978 #endif // HAVE_DTRACE_H
  2980 // This function returns the adjustment (in number of words) to the size of a
  2981 // c2i adapter activation, for use during deoptimization.
  2982 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) {
  2983   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
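// Illustrative example (hypothetical numbers): for a callee with 2 parameters
// and 5 locals, with Interpreter::stackElementWords == 1, the interpreter
// activation needs (5 - 2) * 1 = 3 extra words relative to the c2i adapter.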
  2986 // "Top of Stack" slots that may be unused by the calling convention but must
  2987 // otherwise be preserved.
  2988 // On Intel these are not necessary and the value can be zero.
  2989 // On Sparc this describes the words reserved for storing a register window
  2990 // when an interrupt occurs.
  2991 uint SharedRuntime::out_preserve_stack_slots() {
  2992    return 0;
  2995 //------------------------------generate_deopt_blob----------------------------
  2996 // Ought to generate an ideal graph & compile, but here's some MIPS assembly
  2997 // instead.
  2998 void SharedRuntime::generate_deopt_blob() {
  2999   // allocate space for the code
  3000   ResourceMark rm;
  3001   // setup code generation tools
  3002   //CodeBuffer     buffer ("deopt_blob", 4000, 2048);
  3003   CodeBuffer     buffer ("deopt_blob", 8000, 2048);//aoqi FIXME for debug
  3004   MacroAssembler* masm  = new MacroAssembler( & buffer);
  3005   int frame_size_in_words;
  3006   OopMap* map = NULL;
  3007   // Account for the extra args we place on the stack
  3008   // by the time we call fetch_unroll_info
  3009   const int additional_words = 2; // deopt kind, thread
  3011   OopMapSet *oop_maps = new OopMapSet();
  3013   address start = __ pc();
  3014   Label cont;
  3015   // we use S3 for DeOpt reason register
  3016   Register reason = S3;
  3017   // use S6 for thread register
  3018   Register thread = TREG;
  3019   // use S7 for fetch_unroll_info returned UnrollBlock
  3020   Register unroll = S7;
  3021   // Prolog for non exception case!
  3022   // Correct the return address we were given.
  3023   //FIXME: is the return address on the tos or in RA?
  3024   __ addi(RA, RA, - (NativeCall::return_address_offset_long));
  3025   // Save everything in sight.
  3026   map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3027   // Normal deoptimization
  3028   __ move(reason, Deoptimization::Unpack_deopt);
  3029   __ b(cont);
  3030   __ delayed()->nop();
  3032   int reexecute_offset = __ pc() - start;
  3034   // Reexecute case
  3035   // the return address is the pc that describes what bci to re-execute at
  3037   // No need to update map as each call to save_live_registers will produce identical oopmap
  3038   (void) RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3039   __ move(reason, Deoptimization::Unpack_reexecute);
  3040   __ b(cont);
  3041   __ delayed()->nop();
  3043   int   exception_offset = __ pc() - start;
  3044   // Prolog for exception case
  3046   // all registers are dead at this entry point, except for V0 and
  3047   // V1 which contain the exception oop and exception pc
  3048   // respectively.  Set them in TLS and fall thru to the
  3049   // unpack_with_exception_in_tls entry point.
  3051   __ get_thread(thread);
  3052   __ st_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3053   __ st_ptr(V0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3054   int exception_in_tls_offset = __ pc() - start;
  3055   // new implementation because exception oop is now passed in JavaThread
  3057   // Prolog for exception case
  3058   // All registers must be preserved because they might be used by LinearScan
  3059   // Exception oop and throwing PC are passed in JavaThread
  3060   // tos: stack at point of call to method that threw the exception (i.e. only
  3061   // args are on the stack, no return address)
  3063   // Return address will be patched later with the throwing pc. The correct value is not
  3064   // available now because loading it from memory would destroy registers.
  3065   // Save everything in sight.
  3066   // No need to update map as each call to save_live_registers will produce identical oopmap
  3067   __ addi(RA, RA, - (NativeCall::return_address_offset_long));
  3068   (void) RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3070   // Now it is safe to overwrite any register
  3071   // store the correct deoptimization type
  3072   __ move(reason, Deoptimization::Unpack_exception);
  3073   // load throwing pc from JavaThread and patch it as the return address
  3074   // of the current frame. Then clear the field in JavaThread
  3075   __ get_thread(thread);
  3076   __ ld_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3077   __ st_ptr(V1, SP, RegisterSaver::raOffset() * wordSize); //save ra
  3078   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_pc_offset()));
  3081 #ifdef ASSERT
  3082   // verify that there is really an exception oop in JavaThread
  3083   __ ld_ptr(AT, thread, in_bytes(JavaThread::exception_oop_offset()));
  3084   __ verify_oop(AT);
  3085   // verify that there is no pending exception
  3086   Label no_pending_exception;
  3087   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3088   __ beq(AT, R0, no_pending_exception);
  3089   __ delayed()->nop();
  3090   __ stop("must not have pending exception here");
  3091   __ bind(no_pending_exception);
  3092 #endif
  3093   __ bind(cont);
  3094   // Compiled code leaves the floating point stack dirty, empty it.
  3095   __ empty_FPU_stack();
  3098   // Call C code.  Need thread and this frame, but NOT official VM entry
  3099   // crud.  We cannot block on this call, no GC can happen.
  3100 #ifndef OPT_THREAD
  3101   __ get_thread(thread);
  3102 #endif
  3104   __ move(A0, thread);
  3105   __ addi(SP, SP, -additional_words  * wordSize);
  3107   __ set_last_Java_frame(NOREG, NOREG, NULL);
  3109   // Call fetch_unroll_info().  Need thread and this frame, but NOT official VM entry - cannot block on
  3110   // this call, no GC can happen.  Call should capture return values.
  3112   __ relocate(relocInfo::internal_pc_type);
  3114     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 28;
  3115     __ patchable_set48(AT, save_pc);
  3117   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3119   __ call((address)Deoptimization::fetch_unroll_info);
  3120   //__ call(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info), relocInfo::runtime_call_type);
  3121   __ delayed()->nop();
  3122   oop_maps->add_gc_map(__ pc() - start, map);
  3123   __ addiu(SP, SP, additional_words * wordSize);
  3124   __ get_thread(thread);
  3125   __ reset_last_Java_frame(false);
  3127   // Load UnrollBlock into S7
  3128   __ move(unroll, V0);
  3131   // Move the unpack kind to a safe place in the UnrollBlock because
  3132   // we are very short of registers
  3134   Address unpack_kind(unroll, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes());
  3135   __ sw(reason, unpack_kind);
  3136   // save the unpack_kind value
  3137   // Retrieve the possible live values (return values)
  3138   // All callee save registers representing jvm state
  3139   // are now in the vframeArray.
  3141   Label noException;
  3142   __ move(AT, Deoptimization::Unpack_exception);
  3143   __ bne(AT, reason, noException);// Was exception pending?
  3144   __ delayed()->nop();
  3145   __ ld_ptr(V0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3146   __ ld_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3147   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_pc_offset()));
  3148   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3150   __ verify_oop(V0);
  3152   // Overwrite the result registers with the exception results.
  3153   __ st_ptr(V0, SP, RegisterSaver::v0Offset()*wordSize);
  3154   __ st_ptr(V1, SP, RegisterSaver::v1Offset()*wordSize);
  3156   __ bind(noException);
  3159   // Stack is back to only having register save data on the stack.
  3160   // Now restore the result registers. Everything else is either dead or captured
  3161   // in the vframeArray.
  3163   RegisterSaver::restore_result_registers(masm);
  3164   // All of the register save area has been popped off the stack. Only the
  3165   // return address remains.
  3166   // Pop all the frames we must move/replace.
  3167   // Frame picture (youngest to oldest)
  3168   // 1: self-frame (no frame link)
  3169   // 2: deopting frame  (no frame link)
  3170   // 3: caller of deopting frame (could be compiled/interpreted).
  3171   //
  3172   // Note: by leaving the return address of self-frame on the stack
  3173   // and using the size of frame 2 to adjust the stack
  3174   // when we are done the return to frame 3 will still be on the stack.
  3176   // register for the sender's sp
  3177   Register sender_sp = Rsender;
  3178   // register for frame pcs
  3179   Register pcs = T0;
  3180   // register for frame sizes
  3181   Register sizes = T1;
  3182   // register for frame count
  3183   Register count = T3;
  3185   // Pop deoptimized frame
  3186   __ lw(AT, unroll, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes());
  3187   __ add(SP, SP, AT);
  3188   // sp should be pointing at the return address to the caller (3)
  3190   // Load array of frame pcs into pcs
  3191   __ ld_ptr(pcs, unroll, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes());
  3192   __ addi(SP, SP, wordSize);  // trash the old pc
  3193   // Load array of frame sizes into sizes
  3194   __ ld_ptr(sizes, unroll, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes());
  3198   // Load count of frames into T3
  3199   __ lw(count, unroll, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes());
  3200   // Pick up the initial fp we should save
  3201   __ ld(FP, unroll,  Deoptimization::UnrollBlock::initial_info_offset_in_bytes());
  3202    // Now adjust the caller's stack to make up for the extra locals
  3203   // but record the original sp so that we can save it in the skeletal interpreter
  3204   // frame and the stack walking of interpreter_sender will get the unextended sp
  3205   // value and not the "real" sp value.
  3206   __ move(sender_sp, SP);
  3207   __ lw(AT, unroll, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes());
  3208   __ sub(SP, SP, AT);
  3210   // Push interpreter frames in a loop
  3211   //
  3212   //Loop:
  3213   //   0x000000555bd82d18: lw t2, 0x0(t1)           ; lw sizes[i]  <--- error lw->ld
  3214   //   0x000000555bd82d1c: ld at, 0x0(t0)           ; ld pcs[i]
  3215   //   0x000000555bd82d20: daddi t2, t2, 0xfffffff0 ; t2 -= 16
  3216   //   0x000000555bd82d24: daddi sp, sp, 0xfffffff0
  3217   //   0x000000555bd82d28: sd fp, 0x0(sp)           ; push fp
  3218   //   0x000000555bd82d2c: sd at, 0x8(sp)           ; push at
  3219   //   0x000000555bd82d30: dadd fp, sp, zero        ; fp <- sp
  3220   //   0x000000555bd82d34: dsub sp, sp, t2          ; sp -= t2
  3221   //   0x000000555bd82d38: sd zero, 0xfffffff0(fp)  ; __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3222   //   0x000000555bd82d3c: sd s4, 0xfffffff8(fp)    ; __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);
  3223   //   0x000000555bd82d40: dadd s4, sp, zero        ; move(sender_sp, SP);
  3224   //   0x000000555bd82d44: daddi t3, t3, 0xffffffff ; count --
  3225   //   0x000000555bd82d48: daddi t1, t1, 0x4        ; sizes += 4
  3226   //   0x000000555bd82d4c: bne t3, zero, 0x000000555bd82d18
  3227   //   0x000000555bd82d50: daddi t0, t0, 0x4        ; <--- error    t0 += 8
  3228   //
  3229   // pcs[0] = frame_pcs[0] = deopt_sender.raw_pc();
  3230   Label loop;
  3231   __ bind(loop);
  3232   __ ld(T2, sizes, 0);    // Load frame size
  3233   __ ld_ptr(AT, pcs, 0);           // save return address
  3234   __ addi(T2, T2, -2*wordSize);           // we'll push pc and fp, by hand
  3235   __ push2(AT, FP);
  3236   __ move(FP, SP);
  3237   __ sub(SP, SP, T2);       // Prolog!
  3238   // This value is corrected by layout_activation_impl
  3239   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3240   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);// Make it walkable
  3241   __ move(sender_sp, SP);  // pass to next frame
  3242   __ addi(count, count, -1);   // decrement counter
  3243   __ addi(sizes, sizes, wordSize);   // Bump array pointer (sizes)
  3244   __ bne(count, R0, loop);
  3245   __ delayed()->addi(pcs, pcs, wordSize);   // Bump array pointer (pcs)
  3246   __ ld(AT, pcs, 0);      // frame_pcs[number_of_frames] = Interpreter::deopt_entry(vtos, 0);
  3247   // Re-push self-frame
  3248   __ push2(AT, FP);
  3249   __ move(FP, SP);
  3250   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3251   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);
  3252   __ addi(SP, SP, -(frame_size_in_words - 2 - additional_words) * wordSize);
  3254   // Restore frame locals after moving the frame
  3255   __ sd(V0, SP, RegisterSaver::v0Offset() * wordSize);
  3256   __ sd(V1, SP, RegisterSaver::v1Offset() * wordSize);
  3257   __ sdc1(F0, SP, RegisterSaver::fpResultOffset()* wordSize);// Pop float stack and store in local
  3258   __ sdc1(F1, SP, (RegisterSaver::fpResultOffset() + 1) * wordSize);
  3261   // Call unpack_frames().  Need thread and this frame, but NOT official VM entry - cannot block on
  3262   // this call, no GC can happen.
  3263   __ move(A1, reason);  // exec_mode
  3264   __ get_thread(thread);
  3265   __ move(A0, thread);  // thread
  3266   __ addi(SP, SP, (-additional_words) *wordSize);
  3268   // set last_Java_sp, last_Java_fp
  3269   __ set_last_Java_frame(NOREG, FP, NULL);
  3271   __ move(AT, -(StackAlignmentInBytes));
  3272   __ andr(SP, SP, AT);   // Fix stack alignment as required by ABI
  3274   __ relocate(relocInfo::internal_pc_type);
  3276     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 28;
  3277     __ patchable_set48(AT, save_pc);
  3279   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3281   __ call(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames), relocInfo::runtime_call_type);
  3282   __ delayed()->nop();
  3283   // Revert SP alignment after call since we're going to do some SP relative addressing below
  3284   __ ld(SP, thread, in_bytes(JavaThread::last_Java_sp_offset()));
  3285   // Set an oopmap for the call site
  3286   oop_maps->add_gc_map(__ offset(), new OopMap( frame_size_in_words , 0));
  3288   __ push(V0);
  3290   __ get_thread(thread);
  3291   __ reset_last_Java_frame(true);
  3293   // Collect return values
  3294   __ ld(V0, SP, (RegisterSaver::v0Offset() + additional_words +1) * wordSize);
  3295   __ ld(V1, SP, (RegisterSaver::v1Offset() + additional_words +1) * wordSize);
  3296   __ ldc1(F0, SP, RegisterSaver::fpResultOffset()* wordSize); // Restore the float result
  3297   __ ldc1(F1, SP, (RegisterSaver::fpResultOffset() + 1) * wordSize);
  3298   //FIXME,
  3299   // Clear floating point stack before returning to interpreter
  3300   __ empty_FPU_stack();
  3301   //FIXME: we should consider float and double here
  3302   // Push a float or double return value if necessary.
  3303   __ leave();
  3305   // Jump to interpreter
  3306   __ jr(RA);
  3307   __ delayed()->nop();
  3309   masm->flush();
  3310   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  3311   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
  3314 #ifdef COMPILER2
  3316 //------------------------------generate_uncommon_trap_blob--------------------
  3317 // Ought to generate an ideal graph & compile, but here's some MIPS assembly
  3318 // instead.
  3319 void SharedRuntime::generate_uncommon_trap_blob() {
  3320   // allocate space for the code
  3321   ResourceMark rm;
  3322   // setup code generation tools
  3323   CodeBuffer  buffer ("uncommon_trap_blob", 512*80 , 512*40 );
  3324   MacroAssembler* masm = new MacroAssembler(&buffer);
  3326   enum frame_layout {
  3327     s0_off, s0_off2,
  3328     s1_off, s1_off2,
  3329     s2_off, s2_off2,
  3330     s3_off, s3_off2,
  3331     s4_off, s4_off2,
  3332     s5_off, s5_off2,
  3333     s6_off, s6_off2,
  3334     s7_off, s7_off2,
  3335     fp_off, fp_off2,
  3336     return_off, return_off2,    // slot for return address    sp + 9
  3337     framesize
  3338   };
  3339   assert(framesize % 4 == 0, "sp not 16-byte aligned");
  3341   address start = __ pc();
  3343   // Push self-frame.
  3344   __ daddiu(SP, SP, -framesize * BytesPerInt);
  3346   __ sd(RA, SP, return_off * BytesPerInt);
  3347   __ sd(FP, SP, fp_off * BytesPerInt);
  3349   // Save callee-saved registers.
  3351   __ sd(S0, SP, s0_off * BytesPerInt);
  3352   __ sd(S1, SP, s1_off * BytesPerInt);
  3353   __ sd(S2, SP, s2_off * BytesPerInt);
  3354   __ sd(S3, SP, s3_off * BytesPerInt);
  3355   __ sd(S4, SP, s4_off * BytesPerInt);
  3356   __ sd(S5, SP, s5_off * BytesPerInt);
  3357   __ sd(S6, SP, s6_off * BytesPerInt);
  3358   __ sd(S7, SP, s7_off * BytesPerInt);
  3360   __ daddi(FP, SP, fp_off * BytesPerInt);
  3362   // Clear the floating point exception stack
  3363   __ empty_FPU_stack();
  3365   Register thread = TREG;
  3367 #ifndef OPT_THREAD
  3368   __ get_thread(thread);
  3369 #endif
  3370   // set last_Java_sp
  3371   __ set_last_Java_frame(NOREG, FP, NULL);
  3372   __ relocate(relocInfo::internal_pc_type);
  3374     long save_pc = (long)__ pc() + 52;
  3375     __ patchable_set48(AT, (long)save_pc);
  3376     __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3378   // Call C code.  Need thread but NOT official VM entry
  3379   // crud.  We cannot block on this call, no GC can happen.  Call should
  3380   // capture callee-saved registers as well as return values.
  3381   __ move(A0, thread);
  3382   // argument already in T0
  3383   __ move(A1, T0);
  3384   __ patchable_call((address)Deoptimization::uncommon_trap);
  3386   // Set an oopmap for the call site
  3387   OopMapSet *oop_maps = new OopMapSet();
  3388   OopMap* map =  new OopMap( framesize, 0 );
  3390   map->set_callee_saved( VMRegImpl::stack2reg(s0_off    ),  S0->as_VMReg() );
  3391   map->set_callee_saved( VMRegImpl::stack2reg(s1_off    ),  S1->as_VMReg() );
  3392   map->set_callee_saved( VMRegImpl::stack2reg(s2_off    ),  S2->as_VMReg() );
  3393   map->set_callee_saved( VMRegImpl::stack2reg(s3_off    ),  S3->as_VMReg() );
  3394   map->set_callee_saved( VMRegImpl::stack2reg(s4_off    ),  S4->as_VMReg() );
  3395   map->set_callee_saved( VMRegImpl::stack2reg(s5_off    ),  S5->as_VMReg() );
  3396   map->set_callee_saved( VMRegImpl::stack2reg(s6_off    ),  S6->as_VMReg() );
  3397   map->set_callee_saved( VMRegImpl::stack2reg(s7_off    ),  S7->as_VMReg() );
  3399   //oop_maps->add_gc_map( __ offset(), true, map);
  3400   oop_maps->add_gc_map( __ offset(),  map);
  3402 #ifndef OPT_THREAD
  3403   __ get_thread(thread);
  3404 #endif
  3405   __ reset_last_Java_frame(false);
  3407   // Load UnrollBlock into S7
  3408   Register unroll = S7;
  3409   __ move(unroll, V0);
  3411   // Pop all the frames we must move/replace.
  3412   //
  3413   // Frame picture (youngest to oldest)
  3414   // 1: self-frame (no frame link)
  3415   // 2: deopting frame  (no frame link)
  3416   // 3: possible-i2c-adapter-frame
  3417   // 4: caller of deopting frame (could be compiled/interpreted; if interpreted
  3418   //    we will create a c2i adapter here)
  3420   __ daddiu(SP, SP, framesize * BytesPerInt);
  3422   // Pop deoptimized frame
  3423   __ lw(AT, unroll, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes());
  3424   __ dadd(SP, SP, AT);
  3426   // register for frame pcs
  3427   Register pcs = T8;
  3428   // register for frame sizes
  3429   Register sizes = T9;
  3430   // register for frame count
  3431   Register count = T3;
  3432   // register for the sender's sp
  3433   Register sender_sp = T1;
  3435   // sp should be pointing at the return address to the caller (4)
  3436   // Load array of frame pcs
  3437   __ ld(pcs, unroll, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes());
  3439   // Load array of frame sizes
  3440   __ ld(sizes, unroll, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes());
  3441   __ lwu(count, unroll, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes());
  3443   // Pick up the initial fp we should save
  3444   __ ld(FP, unroll, Deoptimization::UnrollBlock::initial_info_offset_in_bytes());
  3445   // Now adjust the caller's stack to make up for the extra locals
  3446   // but record the original sp so that we can save it in the skeletal interpreter
  3447   // frame and the stack walking of interpreter_sender will get the unextended sp
  3448   // value and not the "real" sp value.
  3450   __ move(sender_sp, SP);
  3451   __ lw(AT, unroll, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes());
  3452   __ dsub(SP, SP, AT);
  3453   // Push interpreter frames in a loop
  3454   Label loop;
  3455   __ bind(loop);
  3456   __ ld(T2, sizes, 0);          // Load frame size
  3457   __ ld(AT, pcs, 0);           // save return address
  3458   __ daddi(T2, T2, -2*wordSize);           // we'll push pc and fp, by hand
  3459   __ push2(AT, FP);
  3460   __ move(FP, SP);
  3461   __ dsub(SP, SP, T2);                   // Prolog!
  3462   // This value is corrected by layout_activation_impl
  3463   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3464   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);// Make it walkable
  3465   __ move(sender_sp, SP);       // pass to next frame
  3466   __ daddi(count, count, -1);    // decrement counter
  3467   __ daddi(sizes, sizes, wordSize);     // Bump array pointer (sizes)
  3468   __ addi(pcs, pcs, wordSize);      // Bump array pointer (pcs)
  3469   __ bne(count, R0, loop);
  3470   __ delayed()->nop();      // branch delay slot (pcs was already bumped above)
  3472   __ ld(RA, pcs, 0);
  3474   // Re-push self-frame
  3475   __ daddi(SP, SP, - 2 * wordSize);      // save old & set new FP
  3476   __ sd(FP, SP, 0 * wordSize);          // save final return address
  3477   __ sd(RA, SP, 1 * wordSize);
  3478   __ move(FP, SP);
  3479   __ daddi(SP, SP, -(framesize / 2 - 2) * wordSize);
  3481   // set last_Java_sp, last_Java_fp
  3482   __ set_last_Java_frame(NOREG, FP, NULL);
  3484   __ move(AT, -(StackAlignmentInBytes));
  3485   __ andr(SP, SP, AT);   // Fix stack alignment as required by ABI
  3487   __ relocate(relocInfo::internal_pc_type);
  3489     long save_pc = (long)__ pc() + 52;
  3490     __ patchable_set48(AT, (long)save_pc);
  3492   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3494   // Call C code.  Need thread but NOT official VM entry
  3495   // crud.  We cannot block on this call, no GC can happen.  Call should
  3496   // restore return values to their stack-slots with the new SP.
  3497   __ move(A0, thread);
  3498   __ move(A1, Deoptimization::Unpack_uncommon_trap);
  3499   __ patchable_call((address)Deoptimization::unpack_frames);
  3500   // Set an oopmap for the call site
  3501   oop_maps->add_gc_map( __ offset(),  new OopMap( framesize, 0 ) );
  3503   __ reset_last_Java_frame(true);
  3505   // Pop self-frame.
  3506   __ leave();     // Epilog!
  3508   // Jump to interpreter
  3509   __ jr(RA);
  3510   __ delayed()->nop();
  3511   // -------------
  3512   // make sure all code is generated
  3513   masm->flush();
  3515   _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, framesize / 2);
  3518 #endif // COMPILER2
  3520 //------------------------------generate_handler_blob-------------------
  3521 //
  3522 // Generate a special Compile2Runtime blob that saves all registers, and sets
  3523 // up an OopMap and calls safepoint code to stop the compiled code for
  3524 // a safepoint.
  3525 //
  3526 // This blob is jumped to (via a breakpoint and the signal handler) from a
  3527 // safepoint in compiled code.
  3529 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int pool_type) {
  3531   // Account for thread arg in our frame
  3532   const int additional_words = 0;
  3533   int frame_size_in_words;
  3535   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  3537   ResourceMark rm;
  3538   OopMapSet *oop_maps = new OopMapSet();
  3539   OopMap* map;
  3541   // allocate space for the code
  3542   // setup code generation tools
  3543   CodeBuffer  buffer ("handler_blob", 2048, 512);
  3544   MacroAssembler* masm = new MacroAssembler( &buffer);
  3546   const Register thread = TREG;
  3547   address start   = __ pc();
  3548   address call_pc = NULL;
  3549   bool cause_return = (pool_type == POLL_AT_RETURN);
  3550   bool save_vectors = (pool_type == POLL_AT_VECTOR_LOOP);
  3552   // If cause_return is true we are at a poll_return, and RA holds the
  3553   // return address to the caller of the nmethod that is at the
  3554   // safepoint. We can leave this return address in RA and
  3555   // effectively complete the return and safepoint in the caller.
  3556   // Otherwise we load the exception pc into RA.
  3557   __ push(thread);
  3558 #ifndef OPT_THREAD
  3559   __ get_thread(thread);
  3560 #endif
  3562   if(!cause_return) {
  3563     __ ld_ptr(RA, Address(thread, JavaThread::saved_exception_pc_offset()));
  3566   __ pop(thread);
  3567   map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, save_vectors);
  3569 #ifndef OPT_THREAD
  3570   __ get_thread(thread);
  3571 #endif
  3572   // The following is basically a call_VM. However, we need the precise
  3573   // address of the call in order to generate an oopmap. Hence, we do all the
  3574   // work ourselves.
  3576   __ move(A0, thread);
  3577   __ set_last_Java_frame(NOREG, NOREG, NULL);
  3580   // do the call
  3581   __ call(call_ptr);
  3582   __ delayed()->nop();
  3584   // Set an oopmap for the call site.  This oopmap will map all
  3585   // oop-registers and debug-info registers as callee-saved.  This
  3586   // will allow deoptimization at this safepoint to find all possible
  3587   // debug-info recordings, as well as let GC find all oops.
  3588   oop_maps->add_gc_map(__ offset(),  map);
  3590   Label noException;
  3592   // Clear last_Java_sp again
  3593   __ reset_last_Java_frame(false);
  3595   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3596   __ beq(AT, R0, noException);
  3597   __ delayed()->nop();
  3599   // Exception pending
  3601   RegisterSaver::restore_live_registers(masm, save_vectors);
  3602   // forward_exception_entry needs the return address on the stack
  3603   __ push(RA);
  3604   __ patchable_jump((address)StubRoutines::forward_exception_entry());
  3606   // No exception case
  3607   __ bind(noException);
  3608   // Normal exit, register restoring and exit
  3609   RegisterSaver::restore_live_registers(masm, save_vectors);
  3610   __ jr(RA);
  3611   __ delayed()->nop();
  3613   masm->flush();
  3615   // Fill-out other meta info
  3616   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
  3619 //
  3620 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
  3621 //
  3622 // Generate a stub that calls into vm to find out the proper destination
  3623 // of a java call. All the argument registers are live at this point
  3624 // but since this is generic code we don't know what they are and the caller
  3625 // must do any gc of the args.
  3626 //
  3627 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  3628   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  3630   // allocate space for the code
  3631   ResourceMark rm;
  3633   //CodeBuffer buffer(name, 1000, 512);
  3634   //FIXME. aoqi. code_size
  3635   CodeBuffer buffer(name, 2000, 2048);
  3636   MacroAssembler* masm  = new MacroAssembler(&buffer);
  3638   int frame_size_words;
  3639   //we put the thread in A0
  3641   OopMapSet *oop_maps = new OopMapSet();
  3642   OopMap* map = NULL;
  3644   int start = __ offset();
  3645   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_words);
  3648   int frame_complete = __ offset();
  3650   const Register thread = T8;
  3651   __ get_thread(thread);
  3653   __ move(A0, thread);
  3654   __ set_last_Java_frame(noreg, FP, NULL);
  3655   //align the stack before invoke native
  3656   __ move(AT, -(StackAlignmentInBytes));
  3657   __ andr(SP, SP, AT);
  3658   __ relocate(relocInfo::internal_pc_type);
  3660     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 24 + 1 * BytesPerInstWord;
  3661     __ patchable_set48(AT, save_pc);
  3663   __ sd(AT, thread, in_bytes(JavaThread::last_Java_pc_offset()));
  3665   __ call(destination);
  3666   __ delayed()->nop();
  3668   // Set an oopmap for the call site.
  3669   // We need this not only for callee-saved registers, but also for volatile
  3670   // registers that the compiler might be keeping live across a safepoint.
  3671   oop_maps->add_gc_map( __ offset() - start, map);
  3672   // V0 contains the address we are going to jump to assuming no exception got installed
  3673   __ get_thread(thread);
  3674   __ ld_ptr(SP, thread, in_bytes(JavaThread::last_Java_sp_offset()));
  3675   // clear last_Java_sp
  3676   __ reset_last_Java_frame(true);
  3677   // check for pending exceptions
  3678   Label pending;
  3679   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3680   __ bne(AT, R0, pending);
  3681   __ delayed()->nop();
  3682   // get the returned Method*
  3683   //FIXME: does MIPS need this?
  3684   __ get_vm_result_2(Rmethod, thread);  // Refer to OpenJDK8
  3685   __ st_ptr(Rmethod, SP, RegisterSaver::methodOffset() * wordSize);
  3686   __ st_ptr(V0, SP, RegisterSaver::v0Offset() * wordSize);
  3687   RegisterSaver::restore_live_registers(masm);
  3689   // We are back to the original state on entry and ready to go to the callee method.
  3690   __ jr(V0);
  3691   __ delayed()->nop();
  3692   // Pending exception after the safepoint
  3694   __ bind(pending);
  3696   RegisterSaver::restore_live_registers(masm);
  3698   // exception pending => remove activation and forward to exception handler
  3699   // forward_exception_entry needs the return address on the stack
  3700   __ push(RA);
  3701   __ get_thread(thread);
  3702   __ st_ptr(R0, thread, in_bytes(JavaThread::vm_result_offset()));
  3703   __ ld_ptr(V0, thread, in_bytes(Thread::pending_exception_offset()));
  3704   __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  3705   __ delayed()->nop();
  3706   //
  3707   // make sure all code is generated
  3708   masm->flush();
  3710   RuntimeStub* tmp= RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_words, oop_maps, true);
  3711   return tmp;
  3714 extern "C" int SpinPause() {return 0;}
  3717 //------------------------------Montgomery multiplication------------------------
  3718 //
  3720 // Subtract 0:b from carry:a.  Return carry.
  3721 static unsigned long
  3722 sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  3723   long borrow = 0, t = 0;
  3724   unsigned long tmp0, tmp1;
  3725   __asm__ __volatile__ (
  3726     "0:                                            \n"
  3727     "ld      %[tmp0],     0(%[a])                  \n"
  3728     "ld      %[tmp1],     0(%[b])                  \n"
  3729     "sltu    %[t],        %[tmp0],     %[borrow]   \n"
  3730     "dsubu   %[tmp0],     %[tmp0],     %[borrow]   \n"
  3731     "sltu    %[borrow],   %[tmp0],     %[tmp1]     \n"
  3732     "or      %[borrow],   %[borrow],   %[t]        \n"
  3733     "dsubu   %[tmp0],     %[tmp0],     %[tmp1]     \n"
  3734     "sd      %[tmp0],     0(%[a])                  \n"
  3735     "daddiu  %[a],        %[a],         8          \n"
  3736     "daddiu  %[b],        %[b],         8          \n"
  3737     "daddiu  %[len],      %[len],      -1          \n"
  3738     "bgtz    %[len],      0b                       \n"
  3739     "dsubu   %[tmp0],     %[carry],    %[borrow]   \n"
  3740     : [len]"+r"(len), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [borrow]"+r"(borrow), [a]"+r"(a), [b]"+r"(b), [t]"+r"(t)
  3741     : [carry]"r"(carry)
  3742     : "memory"
  3743   );
  3744   return tmp0;
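// For reference only: a portable C sketch (hypothetical helper, not used by the
// VM) of what the inline assembly above computes; for len > 0 it performs the
// same multi-word subtraction with borrow propagation.
static inline unsigned long
sub_portable(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  unsigned long borrow = 0;
  for (long i = 0; i < len; i++) {
    unsigned long t  = (a[i] < borrow);   // borrow out of a[i] - borrow
    unsigned long d  = a[i] - borrow;
    unsigned long b2 = (d < b[i]);        // borrow out of d - b[i]
    a[i] = d - b[i];
    borrow = t | b2;
  }
  return carry - borrow;                  // matches the delay-slot dsubu above
}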
  3747 // Multiply (unsigned) Long A by Long B, accumulating the double-
  3748 // length result into the accumulator formed of t0, t1, and t2.
  3749 inline void MACC(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  3750   unsigned long hi, lo, carry = 0, t = 0;
  3751   __asm__ __volatile__(
  3752     "dmultu  %[A],        %[B]                     \n"
  3753     "mfhi    %[hi]                                 \n"
  3754     "mflo    %[lo]                                 \n"
  3755     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3756     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3757     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3758     "sltu    %[t],        %[t1],       %[carry]    \n"
  3759     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3760     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3761     "or      %[carry],    %[carry],    %[t]        \n"
  3762     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3763     : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
  3764     : [A]"r"(A), [B]"r"(B)
  3766   );
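// For reference only: a portable sketch (hypothetical helper, not used by the
// VM) of the same accumulation, assuming the GCC/Clang __int128 extension that
// is available on 64-bit targets.
inline void MACC_portable(unsigned long A, unsigned long B,
                          unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  unsigned __int128 p = (unsigned __int128)A * B;
  unsigned long lo = (unsigned long)p;
  unsigned long hi = (unsigned long)(p >> 64);
  t0 += lo;
  unsigned long c = (t0 < lo);   // carry out of t0 += lo
  t1 += c;
  unsigned long c2 = (t1 < c);   // carry out of t1 += c
  t1 += hi;
  c2 |= (t1 < hi);               // carry out of t1 += hi
  t2 += c2;                      // t2 absorbs the final carry
}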
  3769 // As above, but add twice the double-length result into the
  3770 // accumulator.
  3771 inline void MACC2(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  3772   unsigned long hi, lo, carry = 0, t = 0;
  3773   __asm__ __volatile__(
  3774     "dmultu  %[A],        %[B]                     \n"
  3775     "mfhi    %[hi]                                 \n"
  3776     "mflo    %[lo]                                 \n"
  3777     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3778     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3779     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3780     "sltu    %[t],        %[t1],       %[carry]    \n"
  3781     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3782     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3783     "or      %[carry],    %[carry],    %[t]        \n"
  3784     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3785     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3786     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3787     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3788     "sltu    %[t],        %[t1],       %[carry]    \n"
  3789     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3790     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3791     "or      %[carry],    %[carry],    %[t]        \n"
  3792     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3793     : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
  3794     : [A]"r"(A), [B]"r"(B)
  3796   );
  3799 // Fast Montgomery multiplication.  The derivation of the algorithm is
  3800 // in  A Cryptographic Library for the Motorola DSP56000,
  3801 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
  3803 static void __attribute__((noinline))
  3804 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
  3805                     unsigned long m[], unsigned long inv, int len) {
  3806   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  3807   int i;
  3809   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  3811   for (i = 0; i < len; i++) {
  3812     int j;
  3813     for (j = 0; j < i; j++) {
  3814       MACC(a[j], b[i-j], t0, t1, t2);
  3815       MACC(m[j], n[i-j], t0, t1, t2);
  3817     MACC(a[i], b[0], t0, t1, t2);
  3818     m[i] = t0 * inv;
  3819     MACC(m[i], n[0], t0, t1, t2);
  3821     assert(t0 == 0, "broken Montgomery multiply");
  3823     t0 = t1; t1 = t2; t2 = 0;
  3826   for (i = len; i < 2*len; i++) {
  3827     int j;
  3828     for (j = i-len+1; j < len; j++) {
  3829       MACC(a[j], b[i-j], t0, t1, t2);
  3830       MACC(m[j], n[i-j], t0, t1, t2);
  3832     m[i-len] = t0;
  3833     t0 = t1; t1 = t2; t2 = 0;
  3836   while (t0)
  3837     t0 = sub(m, n, t0, len);
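// Orientation note for the routine above (the standard Montgomery/REDC result
// from the Dusse-Kaliski paper cited earlier; stated here for readability, not
// separately verified): on return m is congruent to a * b * R^-1 (mod n) with
// R = 2^(64*len), relying on inv == -n[0]^-1 mod 2^64 as the assert checks.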
  3840 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
  3841 // multiplies so it should be up to 25% faster than Montgomery
  3842 // multiplication.  However, its loop control is more complex and it
  3843 // may actually run slower on some machines.
  3845 static void __attribute__((noinline))
  3846 montgomery_square(unsigned long a[], unsigned long n[],
  3847                   unsigned long m[], unsigned long inv, int len) {
  3848   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  3849   int i;
  3851   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  3853   for (i = 0; i < len; i++) {
  3854     int j;
  3855     int end = (i+1)/2;
  3856     for (j = 0; j < end; j++) {
  3857       MACC2(a[j], a[i-j], t0, t1, t2);
  3858       MACC(m[j], n[i-j], t0, t1, t2);
  3860     if ((i & 1) == 0) {
  3861       MACC(a[j], a[j], t0, t1, t2);
  3863     for (; j < i; j++) {
  3864       MACC(m[j], n[i-j], t0, t1, t2);
  3866     m[i] = t0 * inv;
  3867     MACC(m[i], n[0], t0, t1, t2);
  3869     assert(t0 == 0, "broken Montgomery square");
  3871     t0 = t1; t1 = t2; t2 = 0;
  3874   for (i = len; i < 2*len; i++) {
  3875     int start = i-len+1;
  3876     int end = start + (len - start)/2;
  3877     int j;
  3878     for (j = start; j < end; j++) {
  3879       MACC2(a[j], a[i-j], t0, t1, t2);
  3880       MACC(m[j], n[i-j], t0, t1, t2);
  3882     if ((i & 1) == 0) {
  3883       MACC(a[j], a[j], t0, t1, t2);
  3885     for (; j < len; j++) {
  3886       MACC(m[j], n[i-j], t0, t1, t2);
  3888     m[i-len] = t0;
  3889     t0 = t1; t1 = t2; t2 = 0;
  3892   while (t0)
  3893     t0 = sub(m, n, t0, len);
  3896 // Swap words in a longword.
  3897 static unsigned long swap(unsigned long x) {
  3898   return (x << 32) | (x >> 32);
  3901 // Copy len longwords from s to d, word-swapping as we go.  The
  3902 // destination array is reversed.
  3903 static void reverse_words(unsigned long *s, unsigned long *d, int len) {
  3904   d += len;
  3905   while(len-- > 0) {
  3906     d--;
  3907     *d = swap(*s);
  3908     s++;
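// Illustrative example: with len == 2 and
//   s = { 0x0000000100000002, 0x0000000300000004 }
// the routine above produces
//   d = { 0x0000000400000003, 0x0000000200000001 }
// i.e. each longword has its 32-bit halves swapped and the array order is
// reversed; the callers below use this to adapt the jint arrays they receive
// to the longword order the multiply routines expect.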
  3912 // The threshold at which squaring is advantageous was determined
  3913 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
  3914 // Doesn't seem to be relevant for MIPS64 so we use the same value.
  3915 #define MONTGOMERY_SQUARING_THRESHOLD 64
  3917 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
  3918                                         jint len, jlong inv,
  3919                                         jint *m_ints) {
  3920   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  3921   int longwords = len/2;
  3923   // Make very sure we don't use so much space that the stack might
  3924   // overflow.  512 jints corresponds to a 16384-bit integer and
  3925   // will use here a total of 8k bytes of stack space.
  3926   int total_allocation = longwords * sizeof (unsigned long) * 4;
  3927   guarantee(total_allocation <= 8192, "must be");
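  // Worked arithmetic for the limit above: 512 jints -> longwords == 256, and
  // 4 scratch arrays * 256 longwords * 8 bytes == 8192 bytes, exactly the
  // guaranteed maximum.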
  3928   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  3930   // Local scratch arrays
  3931   unsigned long
  3932     *a = scratch + 0 * longwords,
  3933     *b = scratch + 1 * longwords,
  3934     *n = scratch + 2 * longwords,
  3935     *m = scratch + 3 * longwords;
  3937   reverse_words((unsigned long *)a_ints, a, longwords);
  3938   reverse_words((unsigned long *)b_ints, b, longwords);
  3939   reverse_words((unsigned long *)n_ints, n, longwords);
  3941   ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
  3943   reverse_words(m, (unsigned long *)m_ints, longwords);
  3946 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
  3947                                       jint len, jlong inv,
  3948                                       jint *m_ints) {
  3949   assert(len % 2 == 0, "array length in montgomery_square must be even");
  3950   int longwords = len/2;
  3952   // Make very sure we don't use so much space that the stack might
  3953   // overflow.  512 jints corresponds to a 16384-bit integer and
  3954   // will use here a total of 6k bytes of stack space.
  3955   int total_allocation = longwords * sizeof (unsigned long) * 3;
  3956   guarantee(total_allocation <= 8192, "must be");
  3957   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  3959   // Local scratch arrays
  3960   unsigned long
  3961     *a = scratch + 0 * longwords,
  3962     *n = scratch + 1 * longwords,
  3963     *m = scratch + 2 * longwords;
  3965   reverse_words((unsigned long *)a_ints, a, longwords);
  3966   reverse_words((unsigned long *)n_ints, n, longwords);
  3968   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
  3969     ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
  3970   } else {
  3971     ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
  3974   reverse_words(m, (unsigned long *)m_ints, longwords);
