src/cpu/mips/vm/sharedRuntime_mips_64.cpp

author       huangjia
date         Mon, 18 Nov 2019 10:41:48 +0800
changeset    9759:8c71022cf5f3
parent       9705:0b27fc8adf1b
child        9932:86ea9a02a717
permissions  -rw-r--r--

#10052 Backport of #9904 compiler/floatingpoint/TestFloatSyncJNIArgs.java failed
Reviewed-by: aoqi

     1 /*
     2  * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2015, 2019, Loongson Technology. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/macroAssembler.hpp"
    28 #include "asm/macroAssembler.inline.hpp"
    29 #include "code/debugInfoRec.hpp"
    30 #include "code/icBuffer.hpp"
    31 #include "code/vtableStubs.hpp"
    32 #include "interpreter/interpreter.hpp"
    33 #include "oops/compiledICHolder.hpp"
    34 #include "prims/jvmtiRedefineClassesTrace.hpp"
    35 #include "runtime/sharedRuntime.hpp"
    36 #include "runtime/vframeArray.hpp"
    37 #include "vmreg_mips.inline.hpp"
    38 #ifdef COMPILER1
    39 #include "c1/c1_Runtime1.hpp"
    40 #endif
    41 #ifdef COMPILER2
    42 #include "opto/runtime.hpp"
    43 #endif
    45 #include <alloca.h>
    47 #define __ masm->
    49 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
    51 class RegisterSaver {
    52   enum { FPU_regs_live = 32 };
    53   // Capture info about frame layout
    54   enum layout {
    55 #define DEF_LAYOUT_OFFS(regname)  regname ## _off,  regname ## H_off,
    56     DEF_LAYOUT_OFFS(for_16_bytes_aligned)
    57     DEF_LAYOUT_OFFS(fpr0)
    58     DEF_LAYOUT_OFFS(fpr1)
    59     DEF_LAYOUT_OFFS(fpr2)
    60     DEF_LAYOUT_OFFS(fpr3)
    61     DEF_LAYOUT_OFFS(fpr4)
    62     DEF_LAYOUT_OFFS(fpr5)
    63     DEF_LAYOUT_OFFS(fpr6)
    64     DEF_LAYOUT_OFFS(fpr7)
    65     DEF_LAYOUT_OFFS(fpr8)
    66     DEF_LAYOUT_OFFS(fpr9)
    67     DEF_LAYOUT_OFFS(fpr10)
    68     DEF_LAYOUT_OFFS(fpr11)
    69     DEF_LAYOUT_OFFS(fpr12)
    70     DEF_LAYOUT_OFFS(fpr13)
    71     DEF_LAYOUT_OFFS(fpr14)
    72     DEF_LAYOUT_OFFS(fpr15)
    73     DEF_LAYOUT_OFFS(fpr16)
    74     DEF_LAYOUT_OFFS(fpr17)
    75     DEF_LAYOUT_OFFS(fpr18)
    76     DEF_LAYOUT_OFFS(fpr19)
    77     DEF_LAYOUT_OFFS(fpr20)
    78     DEF_LAYOUT_OFFS(fpr21)
    79     DEF_LAYOUT_OFFS(fpr22)
    80     DEF_LAYOUT_OFFS(fpr23)
    81     DEF_LAYOUT_OFFS(fpr24)
    82     DEF_LAYOUT_OFFS(fpr25)
    83     DEF_LAYOUT_OFFS(fpr26)
    84     DEF_LAYOUT_OFFS(fpr27)
    85     DEF_LAYOUT_OFFS(fpr28)
    86     DEF_LAYOUT_OFFS(fpr29)
    87     DEF_LAYOUT_OFFS(fpr30)
    88     DEF_LAYOUT_OFFS(fpr31)
    90     DEF_LAYOUT_OFFS(v0)
    91     DEF_LAYOUT_OFFS(v1)
    92     DEF_LAYOUT_OFFS(a0)
    93     DEF_LAYOUT_OFFS(a1)
    94     DEF_LAYOUT_OFFS(a2)
    95     DEF_LAYOUT_OFFS(a3)
    96     DEF_LAYOUT_OFFS(a4)
    97     DEF_LAYOUT_OFFS(a5)
    98     DEF_LAYOUT_OFFS(a6)
    99     DEF_LAYOUT_OFFS(a7)
   100     DEF_LAYOUT_OFFS(t0)
   101     DEF_LAYOUT_OFFS(t1)
   102     DEF_LAYOUT_OFFS(t2)
   103     DEF_LAYOUT_OFFS(t3)
   104     DEF_LAYOUT_OFFS(s0)
   105     DEF_LAYOUT_OFFS(s1)
   106     DEF_LAYOUT_OFFS(s2)
   107     DEF_LAYOUT_OFFS(s3)
   108     DEF_LAYOUT_OFFS(s4)
   109     DEF_LAYOUT_OFFS(s5)
   110     DEF_LAYOUT_OFFS(s6)
   111     DEF_LAYOUT_OFFS(s7)
   112     DEF_LAYOUT_OFFS(t8)
   113     DEF_LAYOUT_OFFS(t9)
   115     DEF_LAYOUT_OFFS(gp)
   116     DEF_LAYOUT_OFFS(fp)
   117     DEF_LAYOUT_OFFS(return)
   118     reg_save_size
   119   };
   121   public:
    123   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
   124   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
   125   static int raOffset(void) { return return_off / 2; }
   126   //Rmethod
   127   static int methodOffset(void) { return s3_off / 2; }
   129   static int v0Offset(void) { return v0_off / 2; }
   130   static int v1Offset(void) { return v1_off / 2; }
   132   static int fpResultOffset(void) { return fpr0_off / 2; }
    134   // During deoptimization only the result registers need to be restored;
   135   // all the other values have already been extracted.
   136   static void restore_result_registers(MacroAssembler* masm);
   137 };
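// A hedged illustration of the layout above (mechanics only, not meant as an
// exhaustive map): each DEF_LAYOUT_OFFS(regname) expands to two consecutive
// 32-bit slot indices, regname_off and regnameH_off, so every saved register
// owns one 64-bit (two-slot) cell in the save area; that is why the accessors
// above divide by two to turn a slot index into a word index, e.g.
//
//   DEF_LAYOUT_OFFS(v0)  =>  v0_off, v0H_off,
//   raOffset()           ==  return_off / 2   // word offset of the saved RA
//   methodOffset()       ==  s3_off / 2       // S3 serves as Rmethod here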
   139 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors ) {
   141   // Always make the frame size 16-byte aligned
   142   int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
   143                                      reg_save_size*BytesPerInt, 16);
   144   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
   145   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
   146   // The caller will allocate additional_frame_words
   147   int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
   148   // CodeBlob frame size is in words.
   149   int frame_size_in_words = frame_size_in_bytes / wordSize;
   150   *total_frame_words = frame_size_in_words;
   152   // save registers
   154   __ daddiu(SP, SP, - reg_save_size * jintSize);
   156   __ sdc1(F0, SP, fpr0_off * jintSize); __ sdc1(F1, SP, fpr1_off * jintSize);
   157   __ sdc1(F2, SP, fpr2_off * jintSize); __ sdc1(F3, SP, fpr3_off * jintSize);
   158   __ sdc1(F4, SP, fpr4_off * jintSize); __ sdc1(F5, SP, fpr5_off * jintSize);
   159   __ sdc1(F6, SP, fpr6_off * jintSize);  __ sdc1(F7, SP, fpr7_off * jintSize);
   160   __ sdc1(F8, SP, fpr8_off * jintSize);  __ sdc1(F9, SP, fpr9_off * jintSize);
   161   __ sdc1(F10, SP, fpr10_off * jintSize);  __ sdc1(F11, SP, fpr11_off * jintSize);
   162   __ sdc1(F12, SP, fpr12_off * jintSize);  __ sdc1(F13, SP, fpr13_off * jintSize);
   163   __ sdc1(F14, SP, fpr14_off * jintSize);  __ sdc1(F15, SP, fpr15_off * jintSize);
   164   __ sdc1(F16, SP, fpr16_off * jintSize);  __ sdc1(F17, SP, fpr17_off * jintSize);
   165   __ sdc1(F18, SP, fpr18_off * jintSize);  __ sdc1(F19, SP, fpr19_off * jintSize);
   166   __ sdc1(F20, SP, fpr20_off * jintSize);  __ sdc1(F21, SP, fpr21_off * jintSize);
   167   __ sdc1(F22, SP, fpr22_off * jintSize);  __ sdc1(F23, SP, fpr23_off * jintSize);
   168   __ sdc1(F24, SP, fpr24_off * jintSize);  __ sdc1(F25, SP, fpr25_off * jintSize);
   169   __ sdc1(F26, SP, fpr26_off * jintSize);  __ sdc1(F27, SP, fpr27_off * jintSize);
   170   __ sdc1(F28, SP, fpr28_off * jintSize);  __ sdc1(F29, SP, fpr29_off * jintSize);
   171   __ sdc1(F30, SP, fpr30_off * jintSize);  __ sdc1(F31, SP, fpr31_off * jintSize);
   172   __ sd(V0, SP, v0_off * jintSize);  __ sd(V1, SP, v1_off * jintSize);
   173   __ sd(A0, SP, a0_off * jintSize);  __ sd(A1, SP, a1_off * jintSize);
   174   __ sd(A2, SP, a2_off * jintSize);  __ sd(A3, SP, a3_off * jintSize);
   175   __ sd(A4, SP, a4_off * jintSize);  __ sd(A5, SP, a5_off * jintSize);
   176   __ sd(A6, SP, a6_off * jintSize);  __ sd(A7, SP, a7_off * jintSize);
   177   __ sd(T0, SP, t0_off * jintSize);
   178   __ sd(T1, SP, t1_off * jintSize);
   179   __ sd(T2, SP, t2_off * jintSize);
   180   __ sd(T3, SP, t3_off * jintSize);
   181   __ sd(S0, SP, s0_off * jintSize);
   182   __ sd(S1, SP, s1_off * jintSize);
   183   __ sd(S2, SP, s2_off * jintSize);
   184   __ sd(S3, SP, s3_off * jintSize);
   185   __ sd(S4, SP, s4_off * jintSize);
   186   __ sd(S5, SP, s5_off * jintSize);
   187   __ sd(S6, SP, s6_off * jintSize);
   188   __ sd(S7, SP, s7_off * jintSize);
   190   __ sd(T8, SP, t8_off * jintSize);
   191   __ sd(T9, SP, t9_off * jintSize);
   193   __ sd(GP, SP, gp_off * jintSize);
   194   __ sd(FP, SP, fp_off * jintSize);
   195   __ sd(RA, SP, return_off * jintSize);
   196   __ daddi(FP, SP, fp_off * jintSize);
   198   OopMapSet *oop_maps = new OopMapSet();
   199   //OopMap* map =  new OopMap( frame_words, 0 );
   200   OopMap* map =  new OopMap( frame_size_in_slots, 0 );
   203 //#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_words)
   204 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
   205   map->set_callee_saved(STACK_OFFSET( v0_off), V0->as_VMReg());
   206   map->set_callee_saved(STACK_OFFSET( v1_off), V1->as_VMReg());
   207   map->set_callee_saved(STACK_OFFSET( a0_off), A0->as_VMReg());
   208   map->set_callee_saved(STACK_OFFSET( a1_off), A1->as_VMReg());
   209   map->set_callee_saved(STACK_OFFSET( a2_off), A2->as_VMReg());
   210   map->set_callee_saved(STACK_OFFSET( a3_off), A3->as_VMReg());
   211   map->set_callee_saved(STACK_OFFSET( a4_off), A4->as_VMReg());
   212   map->set_callee_saved(STACK_OFFSET( a5_off), A5->as_VMReg());
   213   map->set_callee_saved(STACK_OFFSET( a6_off), A6->as_VMReg());
   214   map->set_callee_saved(STACK_OFFSET( a7_off), A7->as_VMReg());
   215   map->set_callee_saved(STACK_OFFSET( t0_off), T0->as_VMReg());
   216   map->set_callee_saved(STACK_OFFSET( t1_off), T1->as_VMReg());
   217   map->set_callee_saved(STACK_OFFSET( t2_off), T2->as_VMReg());
   218   map->set_callee_saved(STACK_OFFSET( t3_off), T3->as_VMReg());
   219   map->set_callee_saved(STACK_OFFSET( s0_off), S0->as_VMReg());
   220   map->set_callee_saved(STACK_OFFSET( s1_off), S1->as_VMReg());
   221   map->set_callee_saved(STACK_OFFSET( s2_off), S2->as_VMReg());
   222   map->set_callee_saved(STACK_OFFSET( s3_off), S3->as_VMReg());
   223   map->set_callee_saved(STACK_OFFSET( s4_off), S4->as_VMReg());
   224   map->set_callee_saved(STACK_OFFSET( s5_off), S5->as_VMReg());
   225   map->set_callee_saved(STACK_OFFSET( s6_off), S6->as_VMReg());
   226   map->set_callee_saved(STACK_OFFSET( s7_off), S7->as_VMReg());
   227   map->set_callee_saved(STACK_OFFSET( t8_off), T8->as_VMReg());
   228   map->set_callee_saved(STACK_OFFSET( t9_off), T9->as_VMReg());
   229   map->set_callee_saved(STACK_OFFSET( gp_off), GP->as_VMReg());
   230   map->set_callee_saved(STACK_OFFSET( fp_off), FP->as_VMReg());
   231   map->set_callee_saved(STACK_OFFSET( return_off), RA->as_VMReg());
   233   map->set_callee_saved(STACK_OFFSET( fpr0_off), F0->as_VMReg());
   234   map->set_callee_saved(STACK_OFFSET( fpr1_off), F1->as_VMReg());
   235   map->set_callee_saved(STACK_OFFSET( fpr2_off), F2->as_VMReg());
   236   map->set_callee_saved(STACK_OFFSET( fpr3_off), F3->as_VMReg());
   237   map->set_callee_saved(STACK_OFFSET( fpr4_off), F4->as_VMReg());
   238   map->set_callee_saved(STACK_OFFSET( fpr5_off), F5->as_VMReg());
   239   map->set_callee_saved(STACK_OFFSET( fpr6_off), F6->as_VMReg());
   240   map->set_callee_saved(STACK_OFFSET( fpr7_off), F7->as_VMReg());
   241   map->set_callee_saved(STACK_OFFSET( fpr8_off), F8->as_VMReg());
   242   map->set_callee_saved(STACK_OFFSET( fpr9_off), F9->as_VMReg());
   243   map->set_callee_saved(STACK_OFFSET( fpr10_off), F10->as_VMReg());
   244   map->set_callee_saved(STACK_OFFSET( fpr11_off), F11->as_VMReg());
   245   map->set_callee_saved(STACK_OFFSET( fpr12_off), F12->as_VMReg());
   246   map->set_callee_saved(STACK_OFFSET( fpr13_off), F13->as_VMReg());
   247   map->set_callee_saved(STACK_OFFSET( fpr14_off), F14->as_VMReg());
   248   map->set_callee_saved(STACK_OFFSET( fpr15_off), F15->as_VMReg());
   249   map->set_callee_saved(STACK_OFFSET( fpr16_off), F16->as_VMReg());
   250   map->set_callee_saved(STACK_OFFSET( fpr17_off), F17->as_VMReg());
   251   map->set_callee_saved(STACK_OFFSET( fpr18_off), F18->as_VMReg());
   252   map->set_callee_saved(STACK_OFFSET( fpr19_off), F19->as_VMReg());
   253   map->set_callee_saved(STACK_OFFSET( fpr20_off), F20->as_VMReg());
   254   map->set_callee_saved(STACK_OFFSET( fpr21_off), F21->as_VMReg());
   255   map->set_callee_saved(STACK_OFFSET( fpr22_off), F22->as_VMReg());
   256   map->set_callee_saved(STACK_OFFSET( fpr23_off), F23->as_VMReg());
   257   map->set_callee_saved(STACK_OFFSET( fpr24_off), F24->as_VMReg());
   258   map->set_callee_saved(STACK_OFFSET( fpr25_off), F25->as_VMReg());
   259   map->set_callee_saved(STACK_OFFSET( fpr26_off), F26->as_VMReg());
   260   map->set_callee_saved(STACK_OFFSET( fpr27_off), F27->as_VMReg());
   261   map->set_callee_saved(STACK_OFFSET( fpr28_off), F28->as_VMReg());
   262   map->set_callee_saved(STACK_OFFSET( fpr29_off), F29->as_VMReg());
   263   map->set_callee_saved(STACK_OFFSET( fpr30_off), F30->as_VMReg());
   264   map->set_callee_saved(STACK_OFFSET( fpr31_off), F31->as_VMReg());
   266 #undef STACK_OFFSET
   267   return map;
   268 }
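// A small worked example of the STACK_OFFSET mapping above, assuming
// additional_frame_words == 0 (illustrative only): V0 is stored at
// SP + v0_off * jintSize, and the matching OopMap entry is
//
//   STACK_OFFSET(v0_off) == VMRegImpl::stack2reg(v0_off + additional_frame_slots)
//                        == VMRegImpl::stack2reg(v0_off)
//
// If the caller allocates extra words on top of this frame, every recorded
// slot is shifted by additional_frame_slots so the map still points at the
// saved value.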
   271 // Pop the current frame and restore all the registers that we
   272 // saved.
   273 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   274   __ ldc1(F0, SP, fpr0_off * jintSize); __ ldc1(F1, SP, fpr1_off * jintSize);
   275   __ ldc1(F2, SP, fpr2_off * jintSize); __ ldc1(F3, SP, fpr3_off * jintSize);
   276   __ ldc1(F4, SP, fpr4_off * jintSize); __ ldc1(F5, SP, fpr5_off * jintSize);
   277   __ ldc1(F6, SP, fpr6_off * jintSize);  __ ldc1(F7, SP, fpr7_off * jintSize);
   278   __ ldc1(F8, SP, fpr8_off * jintSize);  __ ldc1(F9, SP, fpr9_off * jintSize);
   279   __ ldc1(F10, SP, fpr10_off * jintSize);  __ ldc1(F11, SP, fpr11_off * jintSize);
   280   __ ldc1(F12, SP, fpr12_off * jintSize);  __ ldc1(F13, SP, fpr13_off * jintSize);
   281   __ ldc1(F14, SP, fpr14_off * jintSize);  __ ldc1(F15, SP, fpr15_off * jintSize);
   282   __ ldc1(F16, SP, fpr16_off * jintSize);  __ ldc1(F17, SP, fpr17_off * jintSize);
   283   __ ldc1(F18, SP, fpr18_off * jintSize);  __ ldc1(F19, SP, fpr19_off * jintSize);
   284   __ ldc1(F20, SP, fpr20_off * jintSize);  __ ldc1(F21, SP, fpr21_off * jintSize);
   285   __ ldc1(F22, SP, fpr22_off * jintSize);  __ ldc1(F23, SP, fpr23_off * jintSize);
   286   __ ldc1(F24, SP, fpr24_off * jintSize);  __ ldc1(F25, SP, fpr25_off * jintSize);
   287   __ ldc1(F26, SP, fpr26_off * jintSize);  __ ldc1(F27, SP, fpr27_off * jintSize);
   288   __ ldc1(F28, SP, fpr28_off * jintSize);  __ ldc1(F29, SP, fpr29_off * jintSize);
   289   __ ldc1(F30, SP, fpr30_off * jintSize);  __ ldc1(F31, SP, fpr31_off * jintSize);
   291   __ ld(V0, SP, v0_off * jintSize);  __ ld(V1, SP, v1_off * jintSize);
   292   __ ld(A0, SP, a0_off * jintSize);  __ ld(A1, SP, a1_off * jintSize);
   293   __ ld(A2, SP, a2_off * jintSize);  __ ld(A3, SP, a3_off * jintSize);
   294   __ ld(A4, SP, a4_off * jintSize);  __ ld(A5, SP, a5_off * jintSize);
   295   __ ld(A6, SP, a6_off * jintSize);  __ ld(A7, SP, a7_off * jintSize);
   296   __ ld(T0, SP, t0_off * jintSize);
   297   __ ld(T1, SP, t1_off * jintSize);
   298   __ ld(T2, SP, t2_off * jintSize);
   299   __ ld(T3, SP, t3_off * jintSize);
   300   __ ld(S0, SP, s0_off * jintSize);
   301   __ ld(S1, SP, s1_off * jintSize);
   302   __ ld(S2, SP, s2_off * jintSize);
   303   __ ld(S3, SP, s3_off * jintSize);
   304   __ ld(S4, SP, s4_off * jintSize);
   305   __ ld(S5, SP, s5_off * jintSize);
   306   __ ld(S6, SP, s6_off * jintSize);
   307   __ ld(S7, SP, s7_off * jintSize);
   309   __ ld(T8, SP, t8_off * jintSize);
   310   __ ld(T9, SP, t9_off * jintSize);
   312   __ ld(GP, SP, gp_off * jintSize);
   313   __ ld(FP, SP, fp_off * jintSize);
   314   __ ld(RA, SP, return_off * jintSize);
   316   __ addiu(SP, SP, reg_save_size * jintSize);
   317 }
   319 // Pop the current frame and restore the registers that might be holding
   320 // a result.
    321 // FIXME: what if the result is a float?
   322 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
   324   // Just restore result register. Only used by deoptimization. By
    325   // now any callee save register that needs to be restored to a c2
   326   // caller of the deoptee has been extracted into the vframeArray
   327   // and will be stuffed into the c2i adapter we create for later
   328   // restoration so only result registers need to be restored here.
   330   __ ld(V0, SP, v0_off * jintSize);
   331   __ ld(V1, SP, v1_off * jintSize);
   332   __ addiu(SP, SP, return_off * jintSize);
   333 }
    335 // Is the vector's size (in bytes) bigger than the size saved by default?
    336 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
   337 bool SharedRuntime::is_wide_vector(int size) {
   338   return size > 16;
   339 }
   341 // The java_calling_convention describes stack locations as ideal slots on
   342 // a frame with no abi restrictions. Since we must observe abi restrictions
   343 // (like the placement of the register window) the slots must be biased by
   344 // the following value.
   346 static int reg2offset_in(VMReg r) {
   347   // Account for saved fp and return address
   348   // This should really be in_preserve_stack_slots
   349   return (r->reg2stack() + 2 * VMRegImpl::slots_per_word) * VMRegImpl::stack_slot_size;  // + 2 * VMRegImpl::stack_slot_size);
   350 }
   352 static int reg2offset_out(VMReg r) {
   353   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
   354 }
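// Hedged example for the two helpers above: for a VMReg naming the first
// 32-bit stack slot (reg2stack() == 0), reg2offset_in() returns
// (0 + 2 * VMRegImpl::slots_per_word) * VMRegImpl::stack_slot_size, i.e. it
// skips the two words holding the saved FP and return address in the incoming
// (FP-relative) frame, while reg2offset_out() only adds
// out_preserve_stack_slots() and addresses the slot relative to our own
// outgoing SP area.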
   356 // ---------------------------------------------------------------------------
   357 // Read the array of BasicTypes from a signature, and compute where the
   358 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
   359 // quantities.  Values less than SharedInfo::stack0 are registers, those above
   360 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
   361 // as framesizes are fixed.
   362 // VMRegImpl::stack0 refers to the first slot 0(sp).
    363 // and VMRegImpl::stack0+1 refers to the memory word 4-bytes higher.  Registers
    364 // up to RegisterImpl::number_of_registers are the 32-bit
   365 // integer registers.
   367 // Pass first five oop/int args in registers T0, A0 - A3.
   368 // Pass float/double/long args in stack.
   369 // Doubles have precedence, so if you pass a mix of floats and doubles
   370 // the doubles will grab the registers before the floats will.
   372 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
   373 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
   374 // units regardless of build.
   377 // ---------------------------------------------------------------------------
   378 // The compiled Java calling convention.
   379 // Pass first five oop/int args in registers T0, A0 - A3.
   380 // Pass float/double/long args in stack.
   381 // Doubles have precedence, so if you pass a mix of floats and doubles
   382 // the doubles will grab the registers before the floats will.
   384 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
   385                                            VMRegPair *regs,
   386                                            int total_args_passed,
   387                                            int is_outgoing) {
   389   // Create the mapping between argument positions and
   390   // registers.
   391   //static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
   392   static const Register INT_ArgReg[Argument::n_register_parameters + 1] = {
   393     T0, A0, A1, A2, A3, A4, A5, A6, A7
   394   };
   395   //static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
   396   static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters] = {
   397     F12, F13, F14, F15, F16, F17, F18, F19
   398   };
   401   uint args = 0;
   402   uint stk_args = 0; // inc by 2 each time
   404   for (int i = 0; i < total_args_passed; i++) {
   405     switch (sig_bt[i]) {
   406     case T_VOID:
   407       // halves of T_LONG or T_DOUBLE
   408       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
   409       regs[i].set_bad();
   410       break;
   411     case T_BOOLEAN:
   412     case T_CHAR:
   413     case T_BYTE:
   414     case T_SHORT:
   415     case T_INT:
   416       if (args < Argument::n_register_parameters) {
   417         regs[i].set1(INT_ArgReg[args++]->as_VMReg());
   418       } else {
   419         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   420         stk_args += 2;
   421       }
   422       break;
   423     case T_LONG:
   424       assert(sig_bt[i + 1] == T_VOID, "expecting half");
   425       // fall through
   426     case T_OBJECT:
   427     case T_ARRAY:
   428     case T_ADDRESS:
   429       if (args < Argument::n_register_parameters) {
   430         regs[i].set2(INT_ArgReg[args++]->as_VMReg());
   431       } else {
   432         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   433         stk_args += 2;
   434       }
   435       break;
   436     case T_FLOAT:
   437       if (args < Argument::n_float_register_parameters) {
   438         regs[i].set1(FP_ArgReg[args++]->as_VMReg());
   439       } else {
   440         regs[i].set1(VMRegImpl::stack2reg(stk_args));
   441         stk_args += 2;
   442       }
   443       break;
   444     case T_DOUBLE:
   445       assert(sig_bt[i + 1] == T_VOID, "expecting half");
   446       if (args < Argument::n_float_register_parameters) {
   447         regs[i].set2(FP_ArgReg[args++]->as_VMReg());
   448       } else {
   449         regs[i].set2(VMRegImpl::stack2reg(stk_args));
   450         stk_args += 2;
   451       }
   452       break;
   453     default:
   454       ShouldNotReachHere();
   455       break;
   456     }
   457   }
   459   return round_to(stk_args, 2);
   460 }
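// Illustrative trace of the convention above for a hypothetical signature
// sig_bt = { T_OBJECT, T_INT, T_INT, T_LONG, T_VOID } (a receiver plus three
// Java arguments):
//
//   T_OBJECT -> regs[0].set2(T0)   // INT_ArgReg[0]
//   T_INT    -> regs[1].set1(A0)   // INT_ArgReg[1]
//   T_INT    -> regs[2].set1(A1)   // INT_ArgReg[2]
//   T_LONG   -> regs[3].set2(A2)   // INT_ArgReg[3]
//   T_VOID   -> regs[4].set_bad()  // second half of the long
//
// Everything fits in registers, so stk_args stays 0 and the function returns 0.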
   462 // Helper class mostly to avoid passing masm everywhere, and handle store
   463 // displacement overflow logic for LP64
   464 class AdapterGenerator {
   465   MacroAssembler *masm;
   466 #ifdef _LP64
   467   Register Rdisp;
   468   void set_Rdisp(Register r)  { Rdisp = r; }
   469 #endif // _LP64
   471   void patch_callers_callsite();
   473   // base+st_off points to top of argument
   474   int arg_offset(const int st_off) { return st_off; }
   475   int next_arg_offset(const int st_off) {
   476     return st_off - Interpreter::stackElementSize;
   477   }
   479 #ifdef _LP64
   480   // On _LP64 argument slot values are loaded first into a register
   481   // because they might not fit into displacement.
   482   Register arg_slot(const int st_off);
   483   Register next_arg_slot(const int st_off);
   484 #else
   485   int arg_slot(const int st_off)      { return arg_offset(st_off); }
   486   int next_arg_slot(const int st_off) { return next_arg_offset(st_off); }
   487 #endif // _LP64
   489   // Stores long into offset pointed to by base
   490   void store_c2i_long(Register r, Register base,
   491                       const int st_off, bool is_stack);
   492   void store_c2i_object(Register r, Register base,
   493                         const int st_off);
   494   void store_c2i_int(Register r, Register base,
   495                      const int st_off);
   496   void store_c2i_double(VMReg r_2,
   497                         VMReg r_1, Register base, const int st_off);
   498   void store_c2i_float(FloatRegister f, Register base,
   499                        const int st_off);
   501  public:
   502   //void tag_stack(const BasicType sig, int st_off);
   503   void gen_c2i_adapter(int total_args_passed,
   504                               // VMReg max_arg,
   505                               int comp_args_on_stack, // VMRegStackSlots
   506                               const BasicType *sig_bt,
   507                               const VMRegPair *regs,
   508                               Label& skip_fixup);
   509   void gen_i2c_adapter(int total_args_passed,
   510                               // VMReg max_arg,
   511                               int comp_args_on_stack, // VMRegStackSlots
   512                               const BasicType *sig_bt,
   513                               const VMRegPair *regs);
   515   AdapterGenerator(MacroAssembler *_masm) : masm(_masm) {}
   516 };
    519 // Patch the caller's call site with the entry to compiled code, if it exists.
   520 void AdapterGenerator::patch_callers_callsite() {
   521   Label L;
   522   __ verify_oop(Rmethod);
   523   __ ld_ptr(AT, Rmethod, in_bytes(Method::code_offset()));
   524   __ beq(AT, R0, L);
   525   __ delayed()->nop();
   526   // Schedule the branch target address early.
    527   // Call into the VM to patch the caller, then jump to the compiled callee.
    528   // V0 isn't live, so capture the return address while we easily can.
   529   __ move(V0, RA);
   531   __ pushad();
   532 #ifdef COMPILER2
   533   // C2 may leave the stack dirty if not in SSE2+ mode
   534   __ empty_FPU_stack();
   535 #endif
   537   // VM needs caller's callsite
   538   // VM needs target method
   540   __ move(A0, Rmethod);
   541   __ move(A1, V0);
   542   // we should preserve the return address
   543   __ verify_oop(Rmethod);
   544   __ move(S0, SP);
   545   __ move(AT, -(StackAlignmentInBytes));   // align the stack
   546   __ andr(SP, SP, AT);
   547   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite),
   548           relocInfo::runtime_call_type);
   550   __ delayed()->nop();
   551   __ move(SP, S0);
   552   __ popad();
   553   __ bind(L);
   554 }
   556 #ifdef _LP64
   557 Register AdapterGenerator::arg_slot(const int st_off) {
   558   Unimplemented();
   559 }
   561 Register AdapterGenerator::next_arg_slot(const int st_off){
   562   Unimplemented();
   563 }
   564 #endif // _LP64
   566 // Stores long into offset pointed to by base
   567 void AdapterGenerator::store_c2i_long(Register r, Register base,
   568                                       const int st_off, bool is_stack) {
   569   Unimplemented();
   570 }
   572 void AdapterGenerator::store_c2i_object(Register r, Register base,
   573                                         const int st_off) {
   574   Unimplemented();
   575 }
   577 void AdapterGenerator::store_c2i_int(Register r, Register base,
   578                                      const int st_off) {
   579   Unimplemented();
   580 }
   582 // Stores into offset pointed to by base
   583 void AdapterGenerator::store_c2i_double(VMReg r_2,
   584                       VMReg r_1, Register base, const int st_off) {
   585   Unimplemented();
   586 }
   588 void AdapterGenerator::store_c2i_float(FloatRegister f, Register base,
   589                                        const int st_off) {
   590   Unimplemented();
   591 }
   593 void AdapterGenerator::gen_c2i_adapter(
   594                             int total_args_passed,
   595                             // VMReg max_arg,
   596                             int comp_args_on_stack, // VMRegStackSlots
   597                             const BasicType *sig_bt,
   598                             const VMRegPair *regs,
   599                             Label& skip_fixup) {
   601   // Before we get into the guts of the C2I adapter, see if we should be here
   602   // at all.  We've come from compiled code and are attempting to jump to the
   603   // interpreter, which means the caller made a static call to get here
   604   // (vcalls always get a compiled target if there is one).  Check for a
   605   // compiled target.  If there is one, we need to patch the caller's call.
   606   // However we will run interpreted if we come thru here. The next pass
   607   // thru the call site will run compiled. If we ran compiled here then
    608   // we can (theoretically) do endless i2c->c2i->i2c transitions during
   609   // deopt/uncommon trap cycles. If we always go interpreted here then
   610   // we can have at most one and don't need to play any tricks to keep
   611   // from endlessly growing the stack.
   612   //
   613   // Actually if we detected that we had an i2c->c2i transition here we
   614   // ought to be able to reset the world back to the state of the interpreted
   615   // call and not bother building another interpreter arg area. We don't
   616   // do that at this point.
   618   patch_callers_callsite();
   620   __ bind(skip_fixup);
   622 #ifdef COMPILER2
   623   __ empty_FPU_stack();
   624 #endif
   625   //this is for native ?
    626   // Since all args are passed on the stack,
    627   // total_args_passed * Interpreter::stackElementSize is the
    628   // space we need.
   629   int extraspace = total_args_passed * Interpreter::stackElementSize;
   631   // stack is aligned, keep it that way
   632   extraspace = round_to(extraspace, 2*wordSize);
   634   // Get return address
   635   __ move(V0, RA);
   636   // set senderSP value
   637   //refer to interpreter_mips.cpp:generate_asm_entry
   638   __ move(Rsender, SP);
   639   __ addi(SP, SP, -extraspace);
   641   // Now write the args into the outgoing interpreter space
   642   for (int i = 0; i < total_args_passed; i++) {
   643     if (sig_bt[i] == T_VOID) {
   644       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   645       continue;
   646     }
   648     // st_off points to lowest address on stack.
   649     int st_off = ((total_args_passed - 1) - i) * Interpreter::stackElementSize;
   650     // Say 4 args:
   651     // i   st_off
   652     // 0   12 T_LONG
   653     // 1    8 T_VOID
   654     // 2    4 T_OBJECT
   655     // 3    0 T_BOOL
   656     VMReg r_1 = regs[i].first();
   657     VMReg r_2 = regs[i].second();
   658     if (!r_1->is_valid()) {
   659       assert(!r_2->is_valid(), "");
   660       continue;
   661     }
   662     if (r_1->is_stack()) {
   663       // memory to memory use fpu stack top
   664       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
   665       if (!r_2->is_valid()) {
   666         __ ld_ptr(AT, SP, ld_off);
   667         __ st_ptr(AT, SP, st_off);
   669       } else {
   672         int next_off = st_off - Interpreter::stackElementSize;
   673         __ ld_ptr(AT, SP, ld_off);
   674         __ st_ptr(AT, SP, st_off);
   676         // Ref to is_Register condition
   677         if(sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE)
   678           __ st_ptr(AT, SP, st_off - 8);
   679       }
   680     } else if (r_1->is_Register()) {
   681       Register r = r_1->as_Register();
   682       if (!r_2->is_valid()) {
   683           __ sd(r, SP, st_off);
   684       } else {
   685         //FIXME, mips will not enter here
   686         // long/double in gpr
   687         __ sd(r, SP, st_off);
   688         // In [java/util/zip/ZipFile.java]
   689         //
   690         //    private static native long open(String name, int mode, long lastModified);
   691         //    private static native int getTotal(long jzfile);
   692         //
    693         // We need to transfer T_LONG parameters from a compiled method to a native method.
   694         // It's a complex process:
   695         //
   696         // Caller -> lir_static_call -> gen_resolve_stub
   697         //      -> -- resolve_static_call_C
   698         //         `- gen_c2i_adapter()  [*]
   699         //             |
    700         //       `- AdapterHandlerLibrary::get_create_adapter_index
   701         //      -> generate_native_entry
   702         //      -> InterpreterRuntime::SignatureHandlerGenerator::pass_long [**]
   703         //
   704         // In [**], T_Long parameter is stored in stack as:
   705         //
   706         //   (high)
   707         //    |         |
   708         //    -----------
   709         //    | 8 bytes |
   710         //    | (void)  |
   711         //    -----------
   712         //    | 8 bytes |
   713         //    | (long)  |
   714         //    -----------
   715         //    |         |
   716         //   (low)
   717         //
   718         // However, the sequence is reversed here:
   719         //
   720         //   (high)
   721         //    |         |
   722         //    -----------
   723         //    | 8 bytes |
   724         //    | (long)  |
   725         //    -----------
   726         //    | 8 bytes |
   727         //    | (void)  |
   728         //    -----------
   729         //    |         |
   730         //   (low)
   731         //
   732         // So I stored another 8 bytes in the T_VOID slot. It then can be accessed from generate_native_entry().
   733         //
   734         if (sig_bt[i] == T_LONG)
   735           __ sd(r, SP, st_off - 8);
   736       }
   737     } else if (r_1->is_FloatRegister()) {
   738       assert(sig_bt[i] == T_FLOAT || sig_bt[i] == T_DOUBLE, "Must be a float register");
   740       FloatRegister fr = r_1->as_FloatRegister();
   741       if (sig_bt[i] == T_FLOAT)
   742         __ swc1(fr, SP, st_off);
   743       else {
   744         __ sdc1(fr, SP, st_off);
   745         __ sdc1(fr, SP, st_off - 8);  // T_DOUBLE needs two slots
   746       }
   747     }
   748   }
   750   // Schedule the branch target address early.
   751   __ ld_ptr(AT, Rmethod, in_bytes(Method::interpreter_entry_offset()) );
   752   // And repush original return address
   753   __ move(RA, V0);
   754   __ jr (AT);
   755   __ delayed()->nop();
   756 }
   758 void AdapterGenerator::gen_i2c_adapter(
   759                                        int total_args_passed,
   760                                        // VMReg max_arg,
   761                                        int comp_args_on_stack, // VMRegStackSlots
   762                                        const BasicType *sig_bt,
   763                                        const VMRegPair *regs) {
   765   // Generate an I2C adapter: adjust the I-frame to make space for the C-frame
   766   // layout.  Lesp was saved by the calling I-frame and will be restored on
   767   // return.  Meanwhile, outgoing arg space is all owned by the callee
   768   // C-frame, so we can mangle it at will.  After adjusting the frame size,
   769   // hoist register arguments and repack other args according to the compiled
   770   // code convention.  Finally, end in a jump to the compiled code.  The entry
   771   // point address is the start of the buffer.
   773   // We will only enter here from an interpreted frame and never from after
   774   // passing thru a c2i. Azul allowed this but we do not. If we lose the
   775   // race and use a c2i we will remain interpreted for the race loser(s).
   776   // This removes all sorts of headaches on the mips side and also eliminates
   777   // the possibility of having c2i -> i2c -> c2i -> ... endless transitions.
   780   __ move(T9, SP);
   782   // Cut-out for having no stack args.  Since up to 2 int/oop args are passed
   783   // in registers, we will occasionally have no stack args.
   784   int comp_words_on_stack = 0;
   785   if (comp_args_on_stack) {
   786     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
   787     // registers are below.  By subtracting stack0, we either get a negative
   788     // number (all values in registers) or the maximum stack slot accessed.
   789     // int comp_args_on_stack = VMRegImpl::reg2stack(max_arg);
   790     // Convert 4-byte stack slots to words.
   791     // did mips need round? FIXME  aoqi
   792     comp_words_on_stack = round_to(comp_args_on_stack*4, wordSize)>>LogBytesPerWord;
    793     // Round up to minimum stack alignment, in wordSize
   794     comp_words_on_stack = round_to(comp_words_on_stack, 2);
   795     __ daddi(SP, SP, -comp_words_on_stack * wordSize);
   796   }
   798   // Align the outgoing SP
   799   __ move(AT, -(StackAlignmentInBytes));
   800   __ andr(SP, SP, AT);
   801   // push the return address on the stack (note that pushing, rather
   802   // than storing it, yields the correct frame alignment for the callee)
   803   // Put saved SP in another register
   804   const Register saved_sp = V0;
   805   __ move(saved_sp, T9);
   808   // Will jump to the compiled code just as if compiled code was doing it.
   809   // Pre-load the register-jump target early, to schedule it better.
   810   __ ld(T9, Rmethod, in_bytes(Method::from_compiled_offset()));
   812   // Now generate the shuffle code.  Pick up all register args and move the
   813   // rest through the floating point stack top.
   814   for (int i = 0; i < total_args_passed; i++) {
   815     if (sig_bt[i] == T_VOID) {
   816       // Longs and doubles are passed in native word order, but misaligned
   817       // in the 32-bit build.
   818       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
   819       continue;
   820     }
   822     // Pick up 0, 1 or 2 words from SP+offset.
   824     //FIXME. aoqi. just delete the assert
   825     //assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), "scrambled load targets?");
   826     // Load in argument order going down.
   827     int ld_off = (total_args_passed -1 - i)*Interpreter::stackElementSize;
   828     // Point to interpreter value (vs. tag)
   829     int next_off = ld_off - Interpreter::stackElementSize;
   830     VMReg r_1 = regs[i].first();
   831     VMReg r_2 = regs[i].second();
   832     if (!r_1->is_valid()) {
   833       assert(!r_2->is_valid(), "");
   834       continue;
   835     }
   836     if (r_1->is_stack()) {
   837       // Convert stack slot to an SP offset (+ wordSize to
   838       // account for return address )
   839       // NOTICE HERE!!!! I sub a wordSize here
   840       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size;
   841       //+ wordSize;
   843       if (!r_2->is_valid()) {
   844         __ ld(AT, saved_sp, ld_off);
   845         __ sd(AT, SP, st_off);
   846       } else {
   847         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
   848         // are accessed as negative so LSW is at LOW address
   850         // ld_off is MSW so get LSW
   851         // st_off is LSW (i.e. reg.first())
   853         // [./org/eclipse/swt/graphics/GC.java]
   854         // void drawImageXRender(Image srcImage, int srcX, int srcY, int srcWidth, int srcHeight,
   855         //  int destX, int destY, int destWidth, int destHeight,
   856         //  boolean simple,
   857         //  int imgWidth, int imgHeight,
   858         //  long maskPixmap,  <-- Pass T_LONG in stack
   859         //  int maskType);
   860         // Before this modification, Eclipse displays icons with solid black background.
   861         //
   862         __ ld(AT, saved_sp, ld_off);
   863         if (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE)
   864           __ ld(AT, saved_sp, ld_off - 8);
   865         __ sd(AT, SP, st_off);
   866       }
   867     } else if (r_1->is_Register()) {  // Register argument
   868       Register r = r_1->as_Register();
   869       if (r_2->is_valid()) {
   870         // Remember r_1 is low address (and LSB on mips)
   871         // So r_2 gets loaded from high address regardless of the platform
   872         assert(r_2->as_Register() == r_1->as_Register(), "");
   873         __ ld(r, saved_sp, ld_off);
   875         //
   876         // For T_LONG type, the real layout is as below:
   877         //
   878         //   (high)
   879         //    |         |
   880         //    -----------
   881         //    | 8 bytes |
   882         //    | (void)  |
   883         //    -----------
   884         //    | 8 bytes |
   885         //    | (long)  |
   886         //    -----------
   887         //    |         |
   888         //   (low)
   889         //
   890         // We should load the low-8 bytes.
   891         //
   892         if (sig_bt[i] == T_LONG)
   893           __ ld(r, saved_sp, ld_off - 8);
   894       } else {
   895         __ lw(r, saved_sp, ld_off);
   896       }
   897     } else if (r_1->is_FloatRegister()) { // Float Register
   898       assert(sig_bt[i] == T_FLOAT || sig_bt[i] == T_DOUBLE, "Must be a float register");
   900       FloatRegister fr = r_1->as_FloatRegister();
   901       if (sig_bt[i] == T_FLOAT)
   902           __ lwc1(fr, saved_sp, ld_off);
   903       else {
   904           __ ldc1(fr, saved_sp, ld_off);
   905           __ ldc1(fr, saved_sp, ld_off - 8);
   906       }
   907     }
   908   }
   910   // 6243940 We might end up in handle_wrong_method if
   911   // the callee is deoptimized as we race thru here. If that
   912   // happens we don't want to take a safepoint because the
   913   // caller frame will look interpreted and arguments are now
   914   // "compiled" so it is much better to make this transition
   915   // invisible to the stack walking code. Unfortunately if
   916   // we try and find the callee by normal means a safepoint
   917   // is possible. So we stash the desired callee in the thread
    918   // and the vm will find it there should this case occur.
   919   __ get_thread(T8);
   920   __ sd(Rmethod, T8, in_bytes(JavaThread::callee_target_offset()));
    922   // move methodOop to V0 in case we end up in a c2i adapter.
   923   // the c2i adapters expect methodOop in V0 (c2) because c2's
   924   // resolve stubs return the result (the method) in V0.
   925   // I'd love to fix this.
   926   __ move(V0, Rmethod);
   927   __ jr(T9);
   928   __ delayed()->nop();
   929 }
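// Illustrative offsets for the shuffle above, assuming a 64-bit build where
// Interpreter::stackElementSize is 8 and three interpreter arguments are
// passed: argument 0 is read at saved_sp + 16, argument 1 at saved_sp + 8 and
// argument 2 at saved_sp + 0, i.e. ld_off = (total_args_passed - 1 - i) * 8
// walks from the oldest argument down towards the top of the expression stack.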
   931 // ---------------------------------------------------------------
   932 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
   933                                                             int total_args_passed,
   934                                                             // VMReg max_arg,
   935                                                             int comp_args_on_stack, // VMRegStackSlots
   936                                                             const BasicType *sig_bt,
   937                                                             const VMRegPair *regs,
   938                                                             AdapterFingerPrint* fingerprint) {
   939   address i2c_entry = __ pc();
   941   AdapterGenerator agen(masm);
   943   agen.gen_i2c_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs);
   946   // -------------------------------------------------------------------------
    947   // Generate a C2I adapter.  On entry we know Rmethod holds the methodOop.  The
   948   // args start out packed in the compiled layout.  They need to be unpacked
   949   // into the interpreter layout.  This will almost always require some stack
   950   // space.  We grow the current (compiled) stack, then repack the args.  We
   951   // finally end in a jump to the generic interpreter entry point.  On exit
   952   // from the interpreter, the interpreter will restore our SP (lest the
    953   // compiled code, which relies solely on SP and not FP, get sick).
   955   address c2i_unverified_entry = __ pc();
   956   Label skip_fixup;
   957   {
   958     Register holder = T1;
   959     Register receiver = T0;
   960     Register temp = T8;
   961     address ic_miss = SharedRuntime::get_ic_miss_stub();
   963     Label missed;
   965     __ verify_oop(holder);
   966     //add for compressedoops
   967     __ load_klass(temp, receiver);
   968     __ verify_oop(temp);
   970     __ ld_ptr(AT, holder, CompiledICHolder::holder_klass_offset());
   971     __ ld_ptr(Rmethod, holder, CompiledICHolder::holder_metadata_offset());
   972     __ bne(AT, temp, missed);
   973     __ delayed()->nop();
   974     // Method might have been compiled since the call site was patched to
    975     // interpreted; if that is the case, treat it as a miss so we can get
   976     // the call site corrected.
   977     __ ld_ptr(AT, Rmethod, in_bytes(Method::code_offset()));
   978     __ beq(AT, R0, skip_fixup);
   979     __ delayed()->nop();
   980     __ bind(missed);
   982     __ jmp(ic_miss, relocInfo::runtime_call_type);
   983     __ delayed()->nop();
   984   }
   986   address c2i_entry = __ pc();
   988   agen.gen_c2i_adapter(total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
   990   __ flush();
   991   return  AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
   992 }
   994 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
   995                                          VMRegPair *regs,
   996                                          VMRegPair *regs2,
   997                                          int total_args_passed) {
   998   assert(regs2 == NULL, "not needed on MIPS");
   999   // Return the number of VMReg stack_slots needed for the args.
  1000   // This value does not include an abi space (like register window
  1001   // save area).
  1003   // The native convention is V8 if !LP64
  1004   // The LP64 convention is the V9 convention which is slightly more sane.
  1006   // We return the amount of VMReg stack slots we need to reserve for all
  1007   // the arguments NOT counting out_preserve_stack_slots. Since we always
  1008   // have space for storing at least 6 registers to memory we start with that.
  1009   // See int_stk_helper for a further discussion.
  1010   // We return the amount of VMRegImpl stack slots we need to reserve for all
  1011   // the arguments NOT counting out_preserve_stack_slots.
  1012   static const Register INT_ArgReg[Argument::n_register_parameters] = {
  1013     A0, A1, A2, A3, A4, A5, A6, A7
  1014   };
  1015   static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters] = {
  1016     F12, F13, F14, F15, F16, F17, F18, F19
  1017   };
  1018   uint args = 0;
  1019   uint stk_args = 0; // inc by 2 each time
  1021 // Example:
  1022 //    n   java.lang.UNIXProcess::forkAndExec
  1023 //     private native int forkAndExec(byte[] prog,
  1024 //                                    byte[] argBlock, int argc,
  1025 //                                    byte[] envBlock, int envc,
  1026 //                                    byte[] dir,
  1027 //                                    boolean redirectErrorStream,
  1028 //                                    FileDescriptor stdin_fd,
  1029 //                                    FileDescriptor stdout_fd,
  1030 //                                    FileDescriptor stderr_fd)
  1031 // JNIEXPORT jint JNICALL
  1032 // Java_java_lang_UNIXProcess_forkAndExec(JNIEnv *env,
  1033 //                                        jobject process,
  1034 //                                        jbyteArray prog,
  1035 //                                        jbyteArray argBlock, jint argc,
  1036 //                                        jbyteArray envBlock, jint envc,
  1037 //                                        jbyteArray dir,
  1038 //                                        jboolean redirectErrorStream,
  1039 //                                        jobject stdin_fd,
  1040 //                                        jobject stdout_fd,
  1041 //                                        jobject stderr_fd)
  1042 //
  1043 // ::c_calling_convention
  1044 // 0:     // env    <-- a0
  1045 // 1: L    // klass/obj  <-- t0 => a1
  1046 // 2: [    // prog[]  <-- a0 => a2
  1047 // 3: [    // argBlock[]  <-- a1 => a3
  1048 // 4: I    // argc
  1049 // 5: [    // envBlock[]  <-- a3 => a5
  1050 // 6: I    // envc
  1051 // 7: [    // dir[]  <-- a5 => a7
  1052 // 8: Z    // redirectErrorStream  a6 => sp[0]
  1053 // 9: L    // stdin    a7 => sp[8]
  1054 // 10: L    // stdout    fp[16] => sp[16]
  1055 // 11: L    // stderr    fp[24] => sp[24]
  1056 //
  1057   for (int i = 0; i < total_args_passed; i++) {
  1058     switch (sig_bt[i]) {
  1059     case T_VOID: // Halves of longs and doubles
  1060       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
  1061       regs[i].set_bad();
  1062       break;
  1063     case T_BOOLEAN:
  1064     case T_CHAR:
  1065     case T_BYTE:
  1066     case T_SHORT:
  1067     case T_INT:
  1068       if (args < Argument::n_register_parameters) {
  1069         regs[i].set1(INT_ArgReg[args++]->as_VMReg());
  1070       } else {
  1071         regs[i].set1(VMRegImpl::stack2reg(stk_args));
  1072         stk_args += 2;
   1073       }
   1074       break;
  1075     case T_LONG:
  1076       assert(sig_bt[i + 1] == T_VOID, "expecting half");
  1077       // fall through
  1078     case T_OBJECT:
  1079     case T_ARRAY:
  1080     case T_ADDRESS:
  1081     case T_METADATA:
  1082       if (args < Argument::n_register_parameters) {
  1083         regs[i].set2(INT_ArgReg[args++]->as_VMReg());
  1084       } else {
  1085         regs[i].set2(VMRegImpl::stack2reg(stk_args));
  1086         stk_args += 2;
   1087       }
   1088       break;
  1089     case T_FLOAT:
  1090       if (args < Argument::n_float_register_parameters) {
  1091         regs[i].set1(FP_ArgReg[args++]->as_VMReg());
  1092       } else {
  1093         regs[i].set1(VMRegImpl::stack2reg(stk_args));
  1094         stk_args += 2;
   1095       }
   1096       break;
  1097     case T_DOUBLE:
  1098       assert(sig_bt[i + 1] == T_VOID, "expecting half");
  1099       if (args < Argument::n_float_register_parameters) {
  1100         regs[i].set2(FP_ArgReg[args++]->as_VMReg());
  1101       } else {
  1102         regs[i].set2(VMRegImpl::stack2reg(stk_args));
  1103         stk_args += 2;
   1104       }
   1105       break;
  1106     default:
  1107       ShouldNotReachHere();
  1108       break;
   1109     }
   1110   }
   1112   return round_to(stk_args, 2);
   1113 }
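// Illustrative slot count for the native convention above (hypothetical
// signature with ten integer/pointer arguments and no floats): the first
// eight go in A0..A7, the remaining two each consume two 32-bit stack slots,
// so stk_args ends up at 4 and the function returns round_to(4, 2) == 4.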
  1115 // ---------------------------------------------------------------------------
  1116 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1117   // We always ignore the frame_slots arg and just use the space just below frame pointer
  1118   // which by this time is free to use
  1119   switch (ret_type) {
  1120     case T_FLOAT:
  1121       __ swc1(FSF, FP, -wordSize);
  1122       break;
  1123     case T_DOUBLE:
  1124       __ sdc1(FSF, FP, -wordSize );
  1125       break;
  1126     case T_VOID:  break;
  1127     case T_LONG:
  1128       __ sd(V0, FP, -wordSize);
  1129       break;
  1130     case T_OBJECT:
  1131     case T_ARRAY:
  1132       __ sd(V0, FP, -wordSize);
  1133       break;
  1134     default: {
   1135       __ sw(V0, FP, -wordSize);
   1136     }
   1137   }
   1138 }
  1140 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  1141   // We always ignore the frame_slots arg and just use the space just below frame pointer
  1142   // which by this time is free to use
  1143   switch (ret_type) {
  1144     case T_FLOAT:
  1145       __ lwc1(FSF, FP, -wordSize);
  1146       break;
  1147     case T_DOUBLE:
  1148       __ ldc1(FSF, FP, -wordSize );
  1149       break;
  1150     case T_LONG:
  1151       __ ld(V0, FP, -wordSize);
  1152       break;
  1153     case T_VOID:  break;
  1154     case T_OBJECT:
  1155     case T_ARRAY:
  1156       __ ld(V0, FP, -wordSize);
  1157       break;
  1158     default: {
   1159       __ lw(V0, FP, -wordSize);
   1160     }
   1161   }
   1162 }
  1164 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1165   for ( int i = first_arg ; i < arg_count ; i++ ) {
  1166     if (args[i].first()->is_Register()) {
  1167       __ push(args[i].first()->as_Register());
  1168     } else if (args[i].first()->is_FloatRegister()) {
   1169       __ push(args[i].first()->as_FloatRegister());
   1170     }
   1171   }
   1172 }
  1174 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
  1175   for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
  1176     if (args[i].first()->is_Register()) {
  1177       __ pop(args[i].first()->as_Register());
  1178     } else if (args[i].first()->is_FloatRegister()) {
   1179       __ pop(args[i].first()->as_FloatRegister());
   1180     }
   1181   }
   1182 }
  1184 // A simple move of integer like type
  1185 static void simple_move32(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1186   if (src.first()->is_stack()) {
  1187     if (dst.first()->is_stack()) {
  1188       // stack to stack
  1189       __ lw(AT, FP, reg2offset_in(src.first()));
  1190       __ sd(AT, SP, reg2offset_out(dst.first()));
  1191     } else {
  1192       // stack to reg
   1193       __ lw(dst.first()->as_Register(),  FP, reg2offset_in(src.first()));
   1194     }
  1195   } else if (dst.first()->is_stack()) {
  1196     // reg to stack
  1197     __ sd(src.first()->as_Register(), SP, reg2offset_out(dst.first()));
  1198   } else {
  1199     if (dst.first() != src.first()){
   1200       __ move(dst.first()->as_Register(), src.first()->as_Register()); // fujie error:dst.first()
   1201     }
   1202   }
   1203 }
  1205 // An oop arg. Must pass a handle not the oop itself
  1206 static void object_move(MacroAssembler* masm,
  1207                         OopMap* map,
  1208                         int oop_handle_offset,
  1209                         int framesize_in_slots,
  1210                         VMRegPair src,
  1211                         VMRegPair dst,
  1212                         bool is_receiver,
  1213                         int* receiver_offset) {
  1215   // must pass a handle. First figure out the location we use as a handle
  1217   //FIXME, for mips, dst can be register
  1218   if (src.first()->is_stack()) {
  1219     // Oop is already on the stack as an argument
  1220     Register rHandle = V0;
  1221     Label nil;
  1222     __ xorr(rHandle, rHandle, rHandle);
  1223     __ ld(AT, FP, reg2offset_in(src.first()));
  1224     __ beq(AT, R0, nil);
  1225     __ delayed()->nop();
  1226     __ lea(rHandle, Address(FP, reg2offset_in(src.first())));
  1227     __ bind(nil);
  1228     if(dst.first()->is_stack())__ sd( rHandle, SP, reg2offset_out(dst.first()));
  1229     else                       __ move( (dst.first())->as_Register(), rHandle);
  1230     //if dst is register
  1231     //FIXME, do mips need out preserve stack slots?
  1232     int offset_in_older_frame = src.first()->reg2stack()
  1233       + SharedRuntime::out_preserve_stack_slots();
  1234     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
  1235     if (is_receiver) {
  1236       *receiver_offset = (offset_in_older_frame
   1237           + framesize_in_slots) * VMRegImpl::stack_slot_size;
   1238     }
  1239   } else {
   1240     // Oop is in a register; we must store it to the space we reserve
  1241     // on the stack for oop_handles
  1242     const Register rOop = src.first()->as_Register();
  1243     assert( (rOop->encoding() >= A0->encoding()) && (rOop->encoding() <= T0->encoding()),"wrong register");
  1244     const Register rHandle = V0;
   1245     //Important: refer to java_calling_convention
  1246     int oop_slot = (rOop->encoding() - A0->encoding()) * VMRegImpl::slots_per_word + oop_handle_offset;
  1247     int offset = oop_slot*VMRegImpl::stack_slot_size;
  1248     Label skip;
  1249     __ sd( rOop , SP, offset );
  1250     map->set_oop(VMRegImpl::stack2reg(oop_slot));
  1251     __ xorr( rHandle, rHandle, rHandle);
  1252     __ beq(rOop, R0, skip);
  1253     __ delayed()->nop();
  1254     __ lea(rHandle, Address(SP, offset));
  1255     __ bind(skip);
  1256     // Store the handle parameter
  1257     if(dst.first()->is_stack())__ sd( rHandle, SP, reg2offset_out(dst.first()));
  1258     else                       __ move((dst.first())->as_Register(), rHandle);
  1259     //if dst is register
  1261     if (is_receiver) {
   1262       *receiver_offset = offset;
   1263     }
   1264   }
   1265 }
  1267 // A float arg may have to do float reg int reg conversion
  1268 static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1269   assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
  1271   if (src.first()->is_stack()) {
  1272     if (dst.first()->is_stack()) {
  1273       __ lw(AT, FP, reg2offset_in(src.first()));
  1274       __ sw(AT, SP, reg2offset_out(dst.first()));
  1276     else
  1277       __ lwc1(dst.first()->as_FloatRegister(), FP, reg2offset_in(src.first()));
  1278   } else {
  1279     // reg to stack
  1280     if(dst.first()->is_stack())
  1281       __ swc1(src.first()->as_FloatRegister(), SP, reg2offset_out(dst.first()));
  1282     else
  1283       __ mov_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
  1287 // A long move
  1288 static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1290   // The only legal possibility for a long_move VMRegPair is:
  1291   // 1: two stack slots (possibly unaligned)
  1292   // as neither the java nor the C calling convention will use registers
  1293   // for longs.
  1295   if (src.first()->is_stack()) {
  1296     assert(src.second()->is_stack() && dst.second()->is_stack(), "must be all stack");
  1297     if( dst.first()->is_stack()){
  1298       __ ld(AT, FP, reg2offset_in(src.first()));
  1299       __ sd(AT, SP, reg2offset_out(dst.first()));
  1300     } else {
  1301       __ ld( (dst.first())->as_Register() , FP, reg2offset_in(src.first()));
  1303   } else {
  1304     if( dst.first()->is_stack()){
  1305       __ sd( (src.first())->as_Register(), SP, reg2offset_out(dst.first()));
  1306     } else {
  1307       __ move( (dst.first())->as_Register() , (src.first())->as_Register());
  1312 // A double move
  1313 static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
  1315   // The only legal possibilities for a double_move VMRegPair are listed below.
  1316   // The painful thing here is that, as with long_move, a VMRegPair might be unaligned.
  1318   // Because of the calling convention we know that src is either
  1319   //   1: a single physical register (a floating-point register here, not xmm as on x86)
  1320   //   2: two stack slots (possibly unaligned)
  1321   // dst can be a pair of stack slots or a floating-point register.
  1324   if (src.first()->is_stack()) {
  1325     // source is all stack
  1326     if( dst.first()->is_stack()){
  1327       __ ld(AT, FP, reg2offset_in(src.first()));
  1328       __ sd(AT, SP, reg2offset_out(dst.first()));
  1329     } else {
  1330       __ ldc1( (dst.first())->as_FloatRegister(), FP, reg2offset_in(src.first()));
  1333   } else {
  1334     // reg to stack
  1335     // No worries about stack alignment
  1336     if( dst.first()->is_stack()){
  1337       __ sdc1(src.first()->as_FloatRegister(), SP, reg2offset_out(dst.first()));
  1339     else
  1340       __ mov_d( dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
  1345 static void verify_oop_args(MacroAssembler* masm,
  1346                             methodHandle method,
  1347                             const BasicType* sig_bt,
  1348                             const VMRegPair* regs) {
  1349   Register temp_reg = T9;  // not part of any compiled calling seq
  1350   if (VerifyOops) {
  1351     for (int i = 0; i < method->size_of_parameters(); i++) {
  1352       if (sig_bt[i] == T_OBJECT ||
  1353           sig_bt[i] == T_ARRAY) {
  1354         VMReg r = regs[i].first();
  1355         assert(r->is_valid(), "bad oop arg");
  1356         if (r->is_stack()) {
  1357           __ ld(temp_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
  1358           __ verify_oop(temp_reg);
  1359         } else {
  1360           __ verify_oop(r->as_Register());
  1367 static void gen_special_dispatch(MacroAssembler* masm,
  1368                                  methodHandle method,
  1369                                  const BasicType* sig_bt,
  1370                                  const VMRegPair* regs) {
  1371   verify_oop_args(masm, method, sig_bt, regs);
  1372   vmIntrinsics::ID iid = method->intrinsic_id();
  1374   // Now write the args into the outgoing interpreter space
  1375   bool     has_receiver   = false;
  1376   Register receiver_reg   = noreg;
  1377   int      member_arg_pos = -1;
  1378   Register member_reg     = noreg;
  1379   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
  1380   if (ref_kind != 0) {
  1381     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
  1382     member_reg = S3;  // known to be free at this point
  1383     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
  1384   } else if (iid == vmIntrinsics::_invokeBasic) {
  1385     has_receiver = true;
  1386   } else {
  1387     fatal(err_msg_res("unexpected intrinsic id %d", iid));
  1390   if (member_reg != noreg) {
  1391     // Load the member_arg into register, if necessary.
  1392     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
  1393     VMReg r = regs[member_arg_pos].first();
  1394     if (r->is_stack()) {
  1395       __ ld(member_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
  1396     } else {
  1397       // no data motion is needed
  1398       member_reg = r->as_Register();
  1402   if (has_receiver) {
  1403     // Make sure the receiver is loaded into a register.
  1404     assert(method->size_of_parameters() > 0, "oob");
  1405     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
  1406     VMReg r = regs[0].first();
  1407     assert(r->is_valid(), "bad receiver arg");
  1408     if (r->is_stack()) {
  1409       // Porting note:  This assumes that compiled calling conventions always
  1410       // pass the receiver oop in a register.  If this is not true on some
  1411       // platform, pick a temp and load the receiver from stack.
  1412       fatal("receiver always in a register");
  1413       receiver_reg = SSR;  // known to be free at this point
  1414       __ ld(receiver_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
  1415     } else {
  1416       // no data motion is needed
  1417       receiver_reg = r->as_Register();
  1421   // Figure out which address we are really jumping to:
  1422   MethodHandles::generate_method_handle_dispatch(masm, iid,
  1423                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
  1426 // ---------------------------------------------------------------------------
  1427 // Generate a native wrapper for a given method.  The method takes arguments
  1428 // in the Java compiled code convention, marshals them to the native
  1429 // convention (handlizes oops, etc), transitions to native, makes the call,
  1430 // returns to java state (possibly blocking), unhandlizes any result and
  1431 // returns.
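       //
       // In outline, the wrapper emitted below does roughly the following:
       //   1. inline-cache check against the receiver's klass (jump to the ic_miss stub on mismatch)
       //   2. optional Object.hashCode fast path (COMPILER1 only)
       //   3. stack-overflow bang, new frame, and the "Grand Shuffle" of Java args into the C layout
       //   4. handlize oops (receiver / class mirror), lock if the method is synchronized
       //   5. transition to _thread_in_native and call the native function
       //   6. transition back with a safepoint/suspend check, unlock, unhandlize the result
       //   7. return, with out-of-line slow paths for locking, stack reguarding and pending exceptions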
  1432 nmethod *SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
  1433                                                 methodHandle method,
  1434                                                 int compile_id,
  1435                                                 BasicType* in_sig_bt,
  1436                                                 VMRegPair* in_regs,
  1437                                                 BasicType ret_type) {
  1438   if (method->is_method_handle_intrinsic()) {
  1439     vmIntrinsics::ID iid = method->intrinsic_id();
  1440     intptr_t start = (intptr_t)__ pc();
  1441     int vep_offset = ((intptr_t)__ pc()) - start;
  1442     gen_special_dispatch(masm,
  1443                          method,
  1444                          in_sig_bt,
  1445                          in_regs);
  1446     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
  1447     __ flush();
  1448     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
  1449     return nmethod::new_native_nmethod(method,
  1450                                        compile_id,
  1451                                        masm->code(),
  1452                                        vep_offset,
  1453                                        frame_complete,
  1454                                        stack_slots / VMRegImpl::slots_per_word,
  1455                                        in_ByteSize(-1),
  1456                                        in_ByteSize(-1),
  1457                                        (OopMapSet*)NULL);
  1459   bool is_critical_native = true;
  1460   address native_func = method->critical_native_function();
  1461   if (native_func == NULL) {
  1462     native_func = method->native_function();
  1463     is_critical_native = false;
  1465   assert(native_func != NULL, "must have function");
  1467   // Native nmethod wrappers never take possession of the oop arguments.
  1468   // So the caller will gc the arguments. The only thing we need an
  1469   // oopMap for is if the call is static
  1470   //
  1471   // An OopMap for lock (and class if static), and one for the VM call itself
  1472   OopMapSet *oop_maps = new OopMapSet();
  1474   // We have received a description of where all the java args are located
  1475   // on entry to the wrapper. We need to convert these args to where
  1476   // the jni function will expect them. To figure out where they go
  1477   // we convert the java signature to a C signature by inserting
  1478   // the hidden arguments as arg[0] and possibly arg[1] (static method)
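         // For example (illustrative only): for a static native int m(Object a, int b)
         // the Java signature { T_OBJECT, T_INT } becomes the C signature
         // { T_ADDRESS /* JNIEnv* */, T_OBJECT /* class mirror */, T_OBJECT, T_INT },
         // i.e. total_c_args == total_in_args + 2.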
  1480   const int total_in_args = method->size_of_parameters();
  1481   int total_c_args = total_in_args;
  1482   if (!is_critical_native) {
  1483     total_c_args += 1;
  1484     if (method->is_static()) {
  1485       total_c_args++;
  1487   } else {
  1488     for (int i = 0; i < total_in_args; i++) {
  1489       if (in_sig_bt[i] == T_ARRAY) {
  1490         total_c_args++;
  1495   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
  1496   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
  1497   BasicType* in_elem_bt = NULL;
  1499   int argc = 0;
  1500   if (!is_critical_native) {
  1501     out_sig_bt[argc++] = T_ADDRESS;
  1502     if (method->is_static()) {
  1503       out_sig_bt[argc++] = T_OBJECT;
  1506     for (int i = 0; i < total_in_args ; i++ ) {
  1507       out_sig_bt[argc++] = in_sig_bt[i];
  1509   } else {
  1510     Thread* THREAD = Thread::current();
  1511     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
  1512     SignatureStream ss(method->signature());
  1513     for (int i = 0; i < total_in_args ; i++ ) {
  1514       if (in_sig_bt[i] == T_ARRAY) {
  1515         // Arrays are passed as int, elem* pair
  1516         out_sig_bt[argc++] = T_INT;
  1517         out_sig_bt[argc++] = T_ADDRESS;
  1518         Symbol* atype = ss.as_symbol(CHECK_NULL);
  1519         const char* at = atype->as_C_string();
  1520         if (strlen(at) == 2) {
  1521           assert(at[0] == '[', "must be");
  1522           switch (at[1]) {
  1523             case 'B': in_elem_bt[i]  = T_BYTE; break;
  1524             case 'C': in_elem_bt[i]  = T_CHAR; break;
  1525             case 'D': in_elem_bt[i]  = T_DOUBLE; break;
  1526             case 'F': in_elem_bt[i]  = T_FLOAT; break;
  1527             case 'I': in_elem_bt[i]  = T_INT; break;
  1528             case 'J': in_elem_bt[i]  = T_LONG; break;
  1529             case 'S': in_elem_bt[i]  = T_SHORT; break;
  1530             case 'Z': in_elem_bt[i]  = T_BOOLEAN; break;
  1531             default: ShouldNotReachHere();
  1534       } else {
  1535         out_sig_bt[argc++] = in_sig_bt[i];
  1536         in_elem_bt[i] = T_VOID;
  1538       if (in_sig_bt[i] != T_VOID) {
  1539         assert(in_sig_bt[i] == ss.type(), "must match");
  1540         ss.next();
  1545   // Now figure out where the args must be stored and how much stack space
  1546   // they require (neglecting out_preserve_stack_slots but including space for
  1547   // storing the first six register arguments). It's a bit odd; see int_stk_helper.
  1548   //
  1549   int out_arg_slots;
  1550   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  1552   // Compute framesize for the wrapper.  We need to handlize all oops in
  1553   // registers. We must create space for them here that is disjoint from
  1554   // the windowed save area because we have no control over when we might
  1555   // flush the window again and overwrite values that gc has since modified.
  1556   // (The live window race)
  1557   //
  1558   // We always just allocate 6 words for storing down these objects. This allows
  1559   // us to simply record the base and use the Ireg number to decide which
  1560   // slot to use. (Note that the reg number is the inbound number not the
  1561   // outbound number).
  1562   // We must shuffle args to match the native convention, and include var-args space.
  1564   // Calculate the total number of stack slots we will need.
  1566   // First count the abi requirement plus all of the outgoing args
  1567   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  1569   // Now the space for the inbound oop handle area
  1570   int total_save_slots = 9 * VMRegImpl::slots_per_word;  // 9 arguments passed in registers
  1571   if (is_critical_native) {
  1572     // Critical natives may have to call out so they need a save area
  1573     // for register arguments.
  1574     int double_slots = 0;
  1575     int single_slots = 0;
  1576     for ( int i = 0; i < total_in_args; i++) {
  1577       if (in_regs[i].first()->is_Register()) {
  1578         const Register reg = in_regs[i].first()->as_Register();
  1579         switch (in_sig_bt[i]) {
  1580           case T_BOOLEAN:
  1581           case T_BYTE:
  1582           case T_SHORT:
  1583           case T_CHAR:
  1584           case T_INT:  single_slots++; break;
  1585           case T_ARRAY:  // specific to LP64 (7145024)
  1586           case T_LONG: double_slots++; break;
  1587           default:  ShouldNotReachHere();
  1589       } else if (in_regs[i].first()->is_FloatRegister()) {
  1590         switch (in_sig_bt[i]) {
  1591           case T_FLOAT:  single_slots++; break;
  1592           case T_DOUBLE: double_slots++; break;
  1593           default:  ShouldNotReachHere();
  1597     total_save_slots = double_slots * 2 + single_slots;
  1598     // align the save area
  1599     if (double_slots != 0) {
  1600       stack_slots = round_to(stack_slots, 2);
  1604   int oop_handle_offset = stack_slots;
  1605   stack_slots += total_save_slots;
  1607   // Now any space we need for handlizing a klass if static method
  1609   int klass_slot_offset = 0;
  1610   int klass_offset = -1;
  1611   int lock_slot_offset = 0;
  1612   bool is_static = false;
  1614   if (method->is_static()) {
  1615     klass_slot_offset = stack_slots;
  1616     stack_slots += VMRegImpl::slots_per_word;
  1617     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
  1618     is_static = true;
  1621   // Plus a lock if needed
  1623   if (method->is_synchronized()) {
  1624     lock_slot_offset = stack_slots;
  1625     stack_slots += VMRegImpl::slots_per_word;
  1628   // Now a place to save return value or as a temporary for any gpr -> fpr moves
  1629   // + 2 for return address (which we own) and saved fp
  1630   stack_slots += 2 + 9 * VMRegImpl::slots_per_word;  // (T0, A0, A1, A2, A3, A4, A5, A6, A7)
  1632   // Ok The space we have allocated will look like:
  1633   //
  1634   //
  1635   // FP-> |                     |
  1636   //      |---------------------|
  1637   //      | 2 slots for moves   |
  1638   //      |---------------------|
  1639   //      | lock box (if sync)  |
  1640   //      |---------------------| <- lock_slot_offset
  1641   //      | klass (if static)   |
  1642   //      |---------------------| <- klass_slot_offset
  1643   //      | oopHandle area      |
  1644   //      |---------------------| <- oop_handle_offset
  1645   //      | outbound memory     |
  1646   //      | based arguments     |
  1647   //      |                     |
  1648   //      |---------------------|
  1649   //      | vararg area         |
  1650   //      |---------------------|
  1651   //      |                     |
  1652   // SP-> | out_preserved_slots |
  1653   //
  1654   //
  1657   // Now compute the actual number of stack words we need, rounding to make
  1658   // the stack properly aligned.
  1659   stack_slots = round_to(stack_slots, StackAlignmentInSlots);
  1661   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
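         // Illustrative numbers only: if stack_slots came out to 61 and
         // StackAlignmentInSlots is 4 (16-byte alignment with 4-byte slots),
         // round_to yields 64 slots, i.e. stack_size == 256 bytes.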
  1663   intptr_t start = (intptr_t)__ pc();
  1667   // First thing make an ic check to see if we should even be here
  1668   address ic_miss = SharedRuntime::get_ic_miss_stub();
  1670   // We are free to use all registers as temps without saving them and
  1671   // restoring them except fp. fp is the only callee save register
  1672   // as far as the interpreter and the compiler(s) are concerned.
  1674   //refer to register_mips.hpp:IC_Klass
  1675   const Register ic_reg = T1;
  1676   const Register receiver = T0;
  1678   Label hit;
  1679   Label exception_pending;
  1681   __ verify_oop(receiver);
  1682   //add for compressedoops
  1683   __ load_klass(T9, receiver);
  1684   __ beq(T9, ic_reg, hit);
  1685   __ delayed()->nop();
  1686   __ jmp(ic_miss, relocInfo::runtime_call_type);
  1687   __ delayed()->nop();
  1688   // The verified entry must be aligned for code patching,
  1689   // and the first 5 bytes must be in the same cache line.
  1690   // If we align at 8 then we can be sure the 5 bytes are in the same line.
  1691   __ align(8);
  1693   __ bind(hit);
  1696   int vep_offset = ((intptr_t)__ pc()) - start;
  1697 #ifdef COMPILER1
  1698   if (InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) {
  1699     // Object.hashCode can pull the hashCode from the header word
  1700     // instead of doing a full VM transition once it's been computed.
  1701     // Since hashCode is usually polymorphic at call sites we can't do
  1702     // this optimization at the call site without a lot of work.
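           // In outline, the fast path below is roughly:
           //   mark = obj->mark();
           //   if (!(mark & unlocked_value)) goto slowCase;                  // locked
           //   if (UseBiasedLocking && (mark & biased_lock_bit)) goto slowCase;
           //   if ((mark & hash_mask_in_place) == 0) goto slowCase;          // hash not yet computed
           //   return mark >> hash_shift;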
  1703     Label slowCase;
  1704     Register receiver = T0;
  1705     Register result = V0;
  1706     __ ld ( result, receiver, oopDesc::mark_offset_in_bytes());
  1707     // check if locked
  1708     __ andi(AT, result, markOopDesc::unlocked_value);
  1709     __ beq(AT, R0, slowCase);
  1710     __ delayed()->nop();
  1711     if (UseBiasedLocking) {
  1712       // Check if biased and fall through to runtime if so
  1713       __ andi (AT, result, markOopDesc::biased_lock_bit_in_place);
  1714       __ bne(AT, R0, slowCase);
  1715       __ delayed()->nop();
  1717     // get hash
  1718     __ li(AT, markOopDesc::hash_mask_in_place);
  1719     __ andr (AT, result, AT);
  1720     // test if hashCode exists
  1721     __ beq (AT, R0, slowCase);
  1722     __ delayed()->nop();
  1723     __ shr(result, markOopDesc::hash_shift);
  1724     __ jr(RA);
  1725     __ delayed()->nop();
  1726     __ bind (slowCase);
  1728 #endif // COMPILER1
  1730   // The instruction at the verified entry point must be 5 bytes or longer
  1731   // because it can be patched on the fly by make_non_entrant. The stack bang
  1732   // instruction fits that requirement.
  1734   // Generate stack overflow check
  1736   if (UseStackBanging) {
  1737     __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
  1738   } else {
  1739     // need a 5 byte instruction to allow MT safe patching to non-entrant
  1740     __ nop();
  1741     __ nop();
  1742     __ nop();
  1743     __ nop();
  1744     __ nop();
  1746   // Generate a new frame for the wrapper.
  1747   // does mips need this?
  1748 #ifndef OPT_THREAD
  1749   __ get_thread(TREG);
  1750 #endif
  1751   __ st_ptr(SP, TREG, in_bytes(JavaThread::last_Java_sp_offset()));
  1752   __ move(AT, -(StackAlignmentInBytes));
  1753   __ andr(SP, SP, AT);
  1755   __ enter();
  1756   // -2 because return address is already present and so is saved fp
  1757   __ addiu(SP, SP, -1 * (stack_size - 2*wordSize));
  1759   // Frame is now completed as far a size and linkage.
  1761   int frame_complete = ((intptr_t)__ pc()) - start;
  1763   // Calculate the difference between sp and fp. We need to know it
  1764   // after the native call because on windows Java Natives will pop
  1765   // the arguments and it is painful to do sp relative addressing
  1766   // in a platform independent way. So after the call we switch to
  1767   // fp relative addressing.
  1768   // FIXME: actually, the fp_adjustment may not be right, because andr(sp, sp, at) may change
  1769   // the SP
  1770   int fp_adjustment = stack_size - 2*wordSize;
  1772 #ifdef COMPILER2
  1773   // C2 may leave the stack dirty if not in SSE2+ mode
  1774   __ empty_FPU_stack();
  1775 #endif
  1777   // Compute the fp offset for any slots used after the jni call
  1779   int lock_slot_fp_offset = (lock_slot_offset*VMRegImpl::stack_slot_size) - fp_adjustment;
  1780   // We use TREG as a thread pointer because it is callee save and
  1781   // if we load it once it is usable through the entire wrapper
  1782   const Register thread = TREG;
  1784   // We use S4 as the oop handle for the receiver/klass
  1785   // It is callee save so it survives the call to native
  1787   const Register oop_handle_reg = S4;
  1788   if (is_critical_native) {
  1789      __ stop("generate_native_wrapper in sharedRuntime <2>");
  1790     //TODO:Fu
  1791     // check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
  1792     //                                   oop_handle_offset, oop_maps, in_regs, in_sig_bt);
  1795 #ifndef OPT_THREAD
  1796   __ get_thread(thread);
  1797 #endif
  1799   //
  1800   // We immediately shuffle the arguments so that, for any vm call we have to
  1801   // make from here on out (sync slow path, jvmpi, etc.), we will have
  1802   // captured the oops from our caller and have a valid oopMap for
  1803   // them.
  1805   // -----------------
  1806   // The Grand Shuffle
  1807   //
  1808   // Natives require 1 or 2 extra arguments over the normal ones: the JNIEnv*
  1809   // and, if static, the class mirror instead of a receiver.  This pretty much
  1810   // guarantees that register layout will not match (and mips doesn't use reg
  1811   // parms though amd does).  Since the native abi doesn't use register args
  1812   // and the java convention does, we don't have to worry about collisions.
  1813   // All of our moves are reg->stack or stack->stack.
  1814   // We ignore the extra arguments during the shuffle and handle them at the
  1815   // last moment. The shuffle is described by the two calling convention
  1816   // vectors we have in our possession. We simply walk the java vector to
  1817   // get the source locations and the c vector to get the destinations.
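         // For a plain (non-critical) JNI call this means java arg i is moved to
         // c_arg == i + 1 for an instance method, or i + 2 for a static method,
         // since slot 0 (and slot 1 for a static method) is reserved for the hidden
         // arguments; the loop below simply walks the argument list from last to first.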
  1819   int c_arg = method->is_static() ? 2 : 1 ;
  1821   // Record sp-based slot for receiver on stack for non-static methods
  1822   int receiver_offset = -1;
  1824   // This is a trick. We double the stack slots so we can claim
  1825   // the oops in the caller's frame. Since we are sure to have
  1826   // more args than the caller, doubling is enough to make
  1827   // sure we can capture all the incoming oop args from the
  1828   // caller.
  1829   //
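         // (Note: object_move() records a caller-frame oop at map slot
         //  offset_in_older_frame + framesize_in_slots, i.e. at an index at or above
         //  stack_slots, which is why the map is created with a doubled frame size.)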
  1830   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
  1832   // Mark location of fp (someday)
  1833   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(fp));
  1835 #ifdef ASSERT
  1836   bool reg_destroyed[RegisterImpl::number_of_registers];
  1837   bool freg_destroyed[FloatRegisterImpl::number_of_registers];
  1838   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
  1839     reg_destroyed[r] = false;
  1841   for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) {
  1842     freg_destroyed[f] = false;
  1845 #endif /* ASSERT */
  1847   // This may iterate in two different directions depending on the
  1848   // kind of native it is.  The reason is that for regular JNI natives
  1849   // the incoming and outgoing registers are offset upwards and for
  1850   // critical natives they are offset down.
  1851   GrowableArray<int> arg_order(2 * total_in_args);
  1852   VMRegPair tmp_vmreg;
  1853   tmp_vmreg.set1(T8->as_VMReg());
  1855   if (!is_critical_native) {
  1856     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
  1857       arg_order.push(i);
  1858       arg_order.push(c_arg);
  1860   } else {
  1861     // Compute a valid move order, using tmp_vmreg to break any cycles
  1862      __ stop("generate_native_wrapper in sharedRuntime <2>");
  1863     //TODO:Fu
  1864     // ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
  1867   int temploc = -1;
  1868   for (int ai = 0; ai < arg_order.length(); ai += 2) {
  1869     int i = arg_order.at(ai);
  1870     int c_arg = arg_order.at(ai + 1);
  1871     __ block_comment(err_msg("move %d -> %d", i, c_arg));
  1872     if (c_arg == -1) {
  1873       assert(is_critical_native, "should only be required for critical natives");
  1874       // This arg needs to be moved to a temporary
  1875       __ move(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
  1876       in_regs[i] = tmp_vmreg;
  1877       temploc = i;
  1878       continue;
  1879     } else if (i == -1) {
  1880       assert(is_critical_native, "should only be required for critical natives");
  1881       // Read from the temporary location
  1882       assert(temploc != -1, "must be valid");
  1883       i = temploc;
  1884       temploc = -1;
  1886 #ifdef ASSERT
  1887     if (in_regs[i].first()->is_Register()) {
  1888       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
  1889     } else if (in_regs[i].first()->is_FloatRegister()) {
  1890       assert(!freg_destroyed[in_regs[i].first()->as_FloatRegister()->encoding()], "destroyed reg!");
  1892     if (out_regs[c_arg].first()->is_Register()) {
  1893       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1894     } else if (out_regs[c_arg].first()->is_FloatRegister()) {
  1895       freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
  1897 #endif /* ASSERT */
  1898     switch (in_sig_bt[i]) {
  1899       case T_ARRAY:
  1900         if (is_critical_native) {
  1901           __ stop("generate_native_wrapper in sharedRuntime <2>");
  1902           //TODO:Fu
  1903           // unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
  1904           c_arg++;
  1905 #ifdef ASSERT
  1906           if (out_regs[c_arg].first()->is_Register()) {
  1907             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
  1908           } else if (out_regs[c_arg].first()->is_FloatRegister()) {
  1909             freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
  1911 #endif
  1912           break;
  1914       case T_OBJECT:
  1915         assert(!is_critical_native, "no oop arguments");
  1916         object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
  1917                     ((i == 0) && (!is_static)),
  1918                     &receiver_offset);
  1919         break;
  1920       case T_VOID:
  1921         break;
  1923       case T_FLOAT:
  1924         float_move(masm, in_regs[i], out_regs[c_arg]);
  1925           break;
  1927       case T_DOUBLE:
  1928         assert( i + 1 < total_in_args &&
  1929                 in_sig_bt[i + 1] == T_VOID &&
  1930                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
  1931         double_move(masm, in_regs[i], out_regs[c_arg]);
  1932         break;
  1934       case T_LONG :
  1935         long_move(masm, in_regs[i], out_regs[c_arg]);
  1936         break;
  1938       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  1940       default:
  1941         simple_move32(masm, in_regs[i], out_regs[c_arg]);
  1945   // point c_arg at the first arg that is already loaded in case we
  1946   // need to spill before we call out
  1947   c_arg = total_c_args - total_in_args;
  1948   // Pre-load a static method's oop.  Used both by locking code and
  1949   // the normal JNI call code.
  1951   __ move(oop_handle_reg, A1);
  1953   if (method->is_static() && !is_critical_native) {
  1955     //  load oop into a register
  1956     int oop_index = __ oop_recorder()->find_index(JNIHandles::make_local(
  1957           (method->method_holder())->java_mirror()));
  1960     RelocationHolder rspec = oop_Relocation::spec(oop_index);
  1961     __ relocate(rspec);
  1962     __ patchable_set48(oop_handle_reg, (long)JNIHandles::make_local((method->method_holder())->java_mirror()));
  1963     // Now handlize the static class mirror; it's known not-null.
  1964     __ sd( oop_handle_reg, SP, klass_offset);
  1965     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
  1967     // Now get the handle
  1968     __ lea(oop_handle_reg, Address(SP, klass_offset));
  1969     // store the klass handle as second argument
  1970     __ move(A1, oop_handle_reg);
  1971     // and protect the arg if we must spill
  1972     c_arg--;
  1975   // Change state to native (we save the return address in the thread, since it might not
  1976   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
  1977   // points into the right code segment. It does not have to be the correct return pc.
  1978   // We use the same pc/oopMap repeatedly when we call out
  1980   intptr_t the_pc = (intptr_t) __ pc();
  1981   oop_maps->add_gc_map(the_pc - start, map);
  1983   __ set_last_Java_frame(SP, noreg, NULL);
  1984   __ relocate(relocInfo::internal_pc_type);
  1986     intptr_t save_pc = (intptr_t)the_pc ;
  1987     __ patchable_set48(AT, save_pc);
  1989   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  1992   // We have all of the arguments set up at this point. We must not touch any of the
  1993   // argument registers from here on (what if we save/restore them? there is no oop map covering them).
  1995     SkipIfEqual skip_if(masm, &DTraceMethodProbes, 0);
  1996     int metadata_index = __ oop_recorder()->find_index(method());
  1997     RelocationHolder rspec = metadata_Relocation::spec(metadata_index);
  1998     __ relocate(rspec);
  1999     __ patchable_set48(AT, (long)(method()));
  2001     __ call_VM_leaf(
  2002       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
  2003       thread, AT);
  2007   // These are register definitions we need for locking/unlocking
  2008   const Register swap_reg = T8;  // Must use T8 for cmpxchg instruction
  2009   const Register obj_reg  = T9;  // Will contain the oop
  2010   //const Register lock_reg = T6;  // Address of compiler lock object (BasicLock)
  2011   const Register lock_reg = c_rarg0;  // Address of compiler lock object (BasicLock)
  2015   Label slow_path_lock;
  2016   Label lock_done;
  2018   // Lock a synchronized method
  2019   if (method->is_synchronized()) {
  2020     assert(!is_critical_native, "unhandled");
  2022     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
  2024     // Get the handle (the 2nd argument)
  2025     __ move(oop_handle_reg, A1);
  2027     // Get address of the box
  2028     __ lea(lock_reg, Address(FP, lock_slot_fp_offset));
  2030     // Load the oop from the handle
  2031     __ ld(obj_reg, oop_handle_reg, 0);
  2033     if (UseBiasedLocking) {
  2034       // Note that oop_handle_reg is trashed during this call
  2035       __ biased_locking_enter(lock_reg, obj_reg, swap_reg, A1, false, lock_done, &slow_path_lock);
  2038     // Load immediate 1 into swap_reg %T8
  2039     __ move(swap_reg, 1);
  2041     __ ld(AT, obj_reg, 0);
  2042     __ orr(swap_reg, swap_reg, AT);
  2044     __ sd( swap_reg, lock_reg, mark_word_offset);
  2045     __ cmpxchg(lock_reg, Address(obj_reg, 0), swap_reg);
  2046     __ bne(AT, R0, lock_done);
  2047     __ delayed()->nop();
  2048     // Test if the oopMark is an obvious stack pointer, i.e.,
  2049     //  1) (mark & 3) == 0, and
  2050     //  2) sp <= mark < mark + os::pagesize()
  2051     // These 3 tests can be done by evaluating the following
  2052     // expression: ((mark - sp) & (3 - os::vm_page_size())),
  2053     // assuming both stack pointer and pagesize have their
  2054     // least significant 2 bits clear.
  2055     // NOTE: the oopMark is in swap_reg %T8 as the result of cmpxchg
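           // For illustration, assuming a 4096-byte page: 3 - 4096 == -4093, whose
           // two's-complement bit pattern has only bits 0-1 and bits 12 and up set,
           // so the AND below is zero exactly when (mark - sp) is 4-byte aligned
           // and 0 <= mark - sp < 4096, i.e. the displaced mark is an address within
           // one page above sp (a recursive stack lock).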
  2057     __ dsub(swap_reg, swap_reg, SP);
  2058     __ move(AT, 3 - os::vm_page_size());
  2059     __ andr(swap_reg , swap_reg, AT);
  2060     // Save the test result, for recursive case, the result is zero
  2061     __ sd(swap_reg, lock_reg, mark_word_offset);
  2062     __ bne(swap_reg, R0, slow_path_lock);
  2063     __ delayed()->nop();
  2064     // Slow path will re-enter here
  2065     __ bind(lock_done);
  2067     if (UseBiasedLocking) {
  2068       // Re-fetch oop_handle_reg as we trashed it above
  2069       __ move(A1, oop_handle_reg);
  2074   // Finally just about ready to make the JNI call
  2077   // get JNIEnv* which is first argument to native
  2078   if (!is_critical_native) {
  2079     __ addi(A0, thread, in_bytes(JavaThread::jni_environment_offset()));
  2082   // Example: Java_java_lang_ref_Finalizer_invokeFinalizeMethod(JNIEnv *env, jclass clazz, jobject ob)
  2083   // Load the second arguments into A1
  2084   //__ ld(A1, SP , wordSize );   // klass
  2086   // Now set thread in native
  2087   __ addi(AT, R0, _thread_in_native);
  2088   __ sw(AT, thread, in_bytes(JavaThread::thread_state_offset()));
  2089   // do the call
  2090   __ call(method->native_function(), relocInfo::runtime_call_type);
  2091   __ delayed()->nop();
  2092   // WARNING - on Windows Java Natives use pascal calling convention and pop the
  2093   // arguments off of the stack. We could just re-adjust the stack pointer here
  2094   // and continue to do SP relative addressing but we instead switch to FP
  2095   // relative addressing.
  2097   // Unpack native results.
  2098   switch (ret_type) {
  2099   case T_BOOLEAN: __ c2bool(V0);            break;
  2100   case T_CHAR   : __ andi(V0, V0, 0xFFFF);      break;
  2101   case T_BYTE   : __ sign_extend_byte (V0); break;
  2102   case T_SHORT  : __ sign_extend_short(V0); break;
  2103   case T_INT    : break;                    // nothing to do
  2104   case T_DOUBLE :
  2105   case T_FLOAT  :
  2106   // Result is in the FPU return register; we'll save as needed
  2107   break;
  2108   case T_ARRAY:                 // Really a handle
  2109   case T_OBJECT:                // Really a handle
  2110   break; // can't de-handlize until after safepoint check
  2111   case T_VOID: break;
  2112   case T_LONG: break;
  2113   default       : ShouldNotReachHere();
  2115   // Switch thread to "native transition" state before reading the synchronization state.
  2116   // This additional state is necessary because reading and testing the synchronization
  2117   // state is not atomic w.r.t. GC, as this scenario demonstrates:
  2118   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
  2119   //     VM thread changes sync state to synchronizing and suspends threads for GC.
  2120   //     Thread A is resumed to finish this native method, but doesn't block here since it
  2121   //     didn't see any synchronization in progress, and escapes.
  2122   __ addi(AT, R0, _thread_in_native_trans);
  2123   __ sw(AT, thread, in_bytes(JavaThread::thread_state_offset()));
  2125   //if(os::is_MP()) {}
  2127   Label after_transition;
  2129   // check for safepoint operation in progress and/or pending suspend requests
  2131     Label Continue;
  2132     __ li(AT, SafepointSynchronize::address_of_state());
  2133     __ lw(A0, AT, 0);
  2134     __ addi(AT, A0, -SafepointSynchronize::_not_synchronized);
  2135     Label L;
  2136     __ bne(AT, R0, L);
  2137     __ delayed()->nop();
  2138     __ lw(AT, thread, in_bytes(JavaThread::suspend_flags_offset()));
  2139     __ beq(AT, R0, Continue);
  2140     __ delayed()->nop();
  2141     __ bind(L);
  2143     // Don't use call_VM as it will see a possible pending exception and forward it
  2144     // and never return here preventing us from clearing _last_native_pc down below.
  2145     //
  2146     save_native_result(masm, ret_type, stack_slots);
  2147     __ move(A0, thread);
  2148     __ addi(SP, SP, -wordSize);
  2149     __ push(S2);
  2150     __ move(AT, -(StackAlignmentInBytes));
  2151     __ move(S2, SP);     // use S2 as a sender SP holder
  2152     __ andr(SP, SP, AT); // align stack as required by ABI
  2153     if (!is_critical_native) {
  2154       __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans), relocInfo::runtime_call_type);
  2155       __ delayed()->nop();
  2156     } else {
  2157       __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition), relocInfo::runtime_call_type);
  2158       __ delayed()->nop();
  2160     __ move(SP, S2);     // use S2 as a sender SP holder
  2161     __ pop(S2);
  2162     __ addi(SP, SP, wordSize);
  2163     //add for compressedoops
  2164     __ reinit_heapbase();
  2165     // Restore any method result value
  2166     restore_native_result(masm, ret_type, stack_slots);
  2168     if (is_critical_native) {
  2169       // The call above performed the transition to thread_in_Java so
  2170       // skip the transition logic below.
  2171       __ beq(R0, R0, after_transition);
  2172       __ delayed()->nop();
  2175     __ bind(Continue);
  2178   // change thread state
  2179   __ addi(AT, R0, _thread_in_Java);
  2180   __ sw(AT,  thread, in_bytes(JavaThread::thread_state_offset()));
  2181   __ bind(after_transition);
  2182   Label reguard;
  2183   Label reguard_done;
  2184   __ lw(AT, thread, in_bytes(JavaThread::stack_guard_state_offset()));
  2185   __ addi(AT, AT, -JavaThread::stack_guard_yellow_disabled);
  2186   __ beq(AT, R0, reguard);
  2187   __ delayed()->nop();
  2188   // slow path reguard  re-enters here
  2189   __ bind(reguard_done);
  2191   // Handle possible exception (will unlock if necessary)
  2193   // native result if any is live
  2195   // Unlock
  2196   Label slow_path_unlock;
  2197   Label unlock_done;
  2198   if (method->is_synchronized()) {
  2200     Label done;
  2202     // Get locked oop from the handle we passed to jni
  2203     __ ld( obj_reg, oop_handle_reg, 0);
  2204     if (UseBiasedLocking) {
  2205       __ biased_locking_exit(obj_reg, T8, done);
  2209     // Simple recursive lock?
  2211     __ ld(AT, FP, lock_slot_fp_offset);
  2212     __ beq(AT, R0, done);
  2213     __ delayed()->nop();
  2214     // Must save FSF if it is live now because cmpxchg must use it
  2215     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  2216       save_native_result(masm, ret_type, stack_slots);
  2219     //  get old displaced header
  2220     __ ld (T8, FP, lock_slot_fp_offset);
  2221     // get address of the stack lock
  2222     __ addi (c_rarg0, FP, lock_slot_fp_offset);
  2223     // Atomic swap old header if oop still contains the stack lock
  2224     __ cmpxchg(T8, Address(obj_reg, 0), c_rarg0);
  2226     __ beq(AT, R0, slow_path_unlock);
  2227     __ delayed()->nop();
  2228     // slow path re-enters here
  2229     __ bind(unlock_done);
  2230     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
  2231       restore_native_result(masm, ret_type, stack_slots);
  2234     __ bind(done);
  2238     SkipIfEqual skip_if(masm, &DTraceMethodProbes, 0);
  2239     // Tell dtrace about this method exit
  2240     save_native_result(masm, ret_type, stack_slots);
  2241     int metadata_index = __ oop_recorder()->find_index( (method()));
  2242     RelocationHolder rspec = metadata_Relocation::spec(metadata_index);
  2243     __ relocate(rspec);
  2244     __ patchable_set48(AT, (long)(method()));
  2246     __ call_VM_leaf(
  2247          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
  2248          thread, AT);
  2249     restore_native_result(masm, ret_type, stack_slots);
  2252   // We can finally stop using that last_Java_frame we setup ages ago
  2254   __ reset_last_Java_frame(false);
  2256   // Unpack oop result, e.g. JNIHandles::resolve value.
  2257   if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
  2258     __ resolve_jobject(V0, thread, T9);
  2261   if (!is_critical_native) {
  2262     // reset handle block
  2263     __ ld(AT, thread, in_bytes(JavaThread::active_handles_offset()));
  2264     __ sw(R0, AT, JNIHandleBlock::top_offset_in_bytes());
  2267   if (!is_critical_native) {
  2268     // Any exception pending?
  2269     __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2270     __ bne(AT, R0, exception_pending);
  2271     __ delayed()->nop();
  2273   // no exception, we're almost done
  2275   // check that only result value is on FPU stack
  2276   __ verify_FPU(ret_type == T_FLOAT || ret_type == T_DOUBLE ? 1 : 0, "native_wrapper normal exit");
  2278   // Return
  2279 #ifndef OPT_THREAD
  2280   __ get_thread(TREG);
  2281 #endif
  2282   //__ ld_ptr(SP, TREG, in_bytes(JavaThread::last_Java_sp_offset()));
  2283   __ leave();
  2285   __ jr(RA);
  2286   __ delayed()->nop();
  2287   // Unexpected paths are out of line and go here
  2288   // Slow path locking & unlocking
  2289   if (method->is_synchronized()) {
  2291     // BEGIN Slow path lock
  2292     __ bind(slow_path_lock);
  2294     // protect the args we've loaded
  2295     save_args(masm, total_c_args, c_arg, out_regs);
  2297     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
  2298     // args are (oop obj, BasicLock* lock, JavaThread* thread)
  2300     __ move(A0, obj_reg);
  2301     __ move(A1, lock_reg);
  2302     __ move(A2, thread);
  2303     __ addi(SP, SP, - 3*wordSize);
  2305     __ move(AT, -(StackAlignmentInBytes));
  2306     __ move(S2, SP);     // use S2 as a sender SP holder
  2307     __ andr(SP, SP, AT); // align stack as required by ABI
  2309     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), relocInfo::runtime_call_type);
  2310     __ delayed()->nop();
  2311     __ move(SP, S2);
  2312     __ addi(SP, SP, 3*wordSize);
  2314     restore_args(masm, total_c_args, c_arg, out_regs);
  2316 #ifdef ASSERT
  2317     { Label L;
  2318       __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2319       __ beq(AT, R0, L);
  2320       __ delayed()->nop();
  2321       __ stop("no pending exception allowed on exit from monitorenter");
  2322       __ bind(L);
  2324 #endif
  2325     __ b(lock_done);
  2326     __ delayed()->nop();
  2327     // END Slow path lock
  2329     // BEGIN Slow path unlock
  2330     __ bind(slow_path_unlock);
  2332     // Slow path unlock
  2334     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  2335       save_native_result(masm, ret_type, stack_slots);
  2337     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
  2339     __ ld(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2340     __ push(AT);
  2341     __ sd(R0, thread, in_bytes(Thread::pending_exception_offset()));
  2343     __ move(AT, -(StackAlignmentInBytes));
  2344     __ move(S2, SP);     // use S2 as a sender SP holder
  2345     __ andr(SP, SP, AT); // align stack as required by ABI
  2347     // should be a peal
  2348     // +wordSize because of the push above
  2349     __ addi(A1, FP, lock_slot_fp_offset);
  2351     __ move(A0, obj_reg);
  2352     __ addi(SP,SP, -2*wordSize);
  2353     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C),
  2354         relocInfo::runtime_call_type);
  2355     __ delayed()->nop();
  2356     __ addi(SP, SP, 2*wordSize);
  2357     __ move(SP, S2);
  2358     //add for compressedoops
  2359     __ reinit_heapbase();
  2360 #ifdef ASSERT
  2362       Label L;
  2363       __ lw( AT, thread, in_bytes(Thread::pending_exception_offset()));
  2364       __ beq(AT, R0, L);
  2365       __ delayed()->nop();
  2366       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
  2367       __ bind(L);
  2369 #endif /* ASSERT */
  2371     __ pop(AT);
  2372     __ sd(AT, thread, in_bytes(Thread::pending_exception_offset()));
  2373     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
  2374       restore_native_result(masm, ret_type, stack_slots);
  2376     __ b(unlock_done);
  2377     __ delayed()->nop();
  2378     // END Slow path unlock
  2382   // SLOW PATH Reguard the stack if needed
  2384   __ bind(reguard);
  2385   save_native_result(masm, ret_type, stack_slots);
  2386   __ call(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages),
  2387       relocInfo::runtime_call_type);
  2388   __ delayed()->nop();
  2389   //add for compressedoops
  2390   __ reinit_heapbase();
  2391   restore_native_result(masm, ret_type, stack_slots);
  2392   __ b(reguard_done);
  2393   __ delayed()->nop();
  2395   // BEGIN EXCEPTION PROCESSING
  2396   if (!is_critical_native) {
  2397     // Forward  the exception
  2398     __ bind(exception_pending);
  2400     // remove possible return value from FPU register stack
  2401     __ empty_FPU_stack();
  2403     // pop our frame
  2404     // forward_exception_entry needs the return address on the stack
  2405     __ addiu(SP, FP, wordSize);
  2406     __ ld(FP, SP, (-1) * wordSize);
  2408     // and forward the exception
  2409     __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  2410     __ delayed()->nop();
  2412   __ flush();
  2414   nmethod *nm = nmethod::new_native_nmethod(method,
  2415                                             compile_id,
  2416                                             masm->code(),
  2417                                             vep_offset,
  2418                                             frame_complete,
  2419                                             stack_slots / VMRegImpl::slots_per_word,
  2420                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
  2421                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
  2422                                             oop_maps);
  2424   if (is_critical_native) {
  2425     nm->set_lazy_critical_native(true);
  2428   return nm;
  2432 #ifdef HAVE_DTRACE_H
  2433 // ---------------------------------------------------------------------------
  2434 // Generate a dtrace nmethod for a given signature.  The method takes arguments
  2435 // in the Java compiled code convention, marshals them to the native
  2436 // abi and then leaves nops at the position you would expect to call a native
  2437 // function. When the probe is enabled the nops are replaced with a trap
  2438 // instruction that dtrace inserts and the trace will cause a notification
  2439 // to dtrace.
  2440 //
  2441 // The probes are only able to take primitive types and java/lang/String as
  2442 // arguments.  No other java types are allowed. Strings are converted to utf8
  2443 // strings so that from dtrace point of view java strings are converted to C
  2444 // strings. There is an arbitrary fixed limit on the total space that a method
  2445 // can use for converting the strings. (256 chars per string in the signature).
  2446 // So any java string larger than this is truncated.
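       //
       // For example (illustrative only): a probe for
       //   void log(String msg, long id, double v)
       // would be presented to dtrace roughly as (char* /* utf8 msg */, long id, long /* v */),
       // since strings become utf8 C strings and, as the signature conversion below
       // shows, doubles are passed as longs and floats as ints.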
  2448 static int  fp_offset[ConcreteRegisterImpl::number_of_registers] = { 0 };
  2449 static bool offsets_initialized = false;
  2451 static VMRegPair reg64_to_VMRegPair(Register r) {
  2452   VMRegPair ret;
  2453   if (wordSize == 8) {
  2454     ret.set2(r->as_VMReg());
  2455   } else {
  2456     ret.set_pair(r->successor()->as_VMReg(), r->as_VMReg());
  2458   return ret;
  2462 nmethod *SharedRuntime::generate_dtrace_nmethod(MacroAssembler *masm,
  2463                                                 methodHandle method) {
  2466   // generate_dtrace_nmethod is guarded by a mutex so we are sure to
  2467   // be single threaded in this method.
  2468   assert(AdapterHandlerLibrary_lock->owned_by_self(), "must be");
  2470   // Fill in the signature array, for the calling-convention call.
  2471   int total_args_passed = method->size_of_parameters();
  2473   BasicType* in_sig_bt  = NEW_RESOURCE_ARRAY(BasicType, total_args_passed);
  2474   VMRegPair  *in_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed);
  2476   // The signature we are going to use for the trap that dtrace will see
  2477   // java/lang/String is converted. We drop "this" and any other object
  2478   // is converted to NULL.  (A one-slot java/lang/Long object reference
  2479   // is converted to a two-slot long, which is why we double the allocation).
  2480   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_args_passed * 2);
  2481   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_args_passed * 2);
  2483   int i=0;
  2484   int total_strings = 0;
  2485   int first_arg_to_pass = 0;
  2486   int total_c_args = 0;
  2488   // Skip the receiver as dtrace doesn't want to see it
  2489   if( !method->is_static() ) {
  2490     in_sig_bt[i++] = T_OBJECT;
  2491     first_arg_to_pass = 1;
  2494   SignatureStream ss(method->signature());
  2495   for ( ; !ss.at_return_type(); ss.next()) {
  2496     BasicType bt = ss.type();
  2497     in_sig_bt[i++] = bt;  // Collect remaining bits of signature
  2498     out_sig_bt[total_c_args++] = bt;
  2499     if( bt == T_OBJECT) {
  2500       symbolOop s = ss.as_symbol_or_null();
  2501       if (s == vmSymbols::java_lang_String()) {
  2502         total_strings++;
  2503         out_sig_bt[total_c_args-1] = T_ADDRESS;
  2504       } else if (s == vmSymbols::java_lang_Boolean() ||
  2505                  s == vmSymbols::java_lang_Byte()) {
  2506         out_sig_bt[total_c_args-1] = T_BYTE;
  2507       } else if (s == vmSymbols::java_lang_Character() ||
  2508                  s == vmSymbols::java_lang_Short()) {
  2509         out_sig_bt[total_c_args-1] = T_SHORT;
  2510       } else if (s == vmSymbols::java_lang_Integer() ||
  2511                  s == vmSymbols::java_lang_Float()) {
  2512         out_sig_bt[total_c_args-1] = T_INT;
  2513       } else if (s == vmSymbols::java_lang_Long() ||
  2514                  s == vmSymbols::java_lang_Double()) {
  2515         out_sig_bt[total_c_args-1] = T_LONG;
  2516         out_sig_bt[total_c_args++] = T_VOID;
  2518     } else if ( bt == T_LONG || bt == T_DOUBLE ) {
  2519       in_sig_bt[i++] = T_VOID;   // Longs & doubles take 2 Java slots
  2520       // We convert double to long
  2521       out_sig_bt[total_c_args-1] = T_LONG;
  2522       out_sig_bt[total_c_args++] = T_VOID;
  2523     } else if ( bt == T_FLOAT) {
  2524       // We convert float to int
  2525       out_sig_bt[total_c_args-1] = T_INT;
  2529   assert(i==total_args_passed, "validly parsed signature");
  2531   // Now get the compiled-Java layout as input arguments
  2532   int comp_args_on_stack;
  2533   comp_args_on_stack = SharedRuntime::java_calling_convention(
  2534       in_sig_bt, in_regs, total_args_passed, false);
  2536   // We have received a description of where all the java args are located
  2537   // on entry to the wrapper. We need to convert these args to where
  2538   // a native (non-jni) function would expect them. To figure out
  2539   // where they go we convert the java signature to a C signature and remove
  2540   // T_VOID for any long/double we might have received.
  2543   // Now figure out where the args must be stored and how much stack space
  2544   // they require (neglecting out_preserve_stack_slots but including space for
  2545   // storing the first six register arguments). It's a bit odd; see int_stk_helper.
  2547   int out_arg_slots;
  2548   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
  2550   // Calculate the total number of stack slots we will need.
  2552   // First count the abi requirement plus all of the outgoing args
  2553   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
  2555   // Plus a temp for possible conversion of float/double/long register args
  2557   int conversion_temp = stack_slots;
  2558   stack_slots += 2;
  2561   // Now space for the string(s) we must convert
  2563   int string_locs = stack_slots;
  2564   stack_slots += total_strings *
  2565                    (max_dtrace_string_size / VMRegImpl::stack_slot_size);
  2567   // Ok The space we have allocated will look like:
  2568   //
  2569   //
  2570   // FP-> |                     |
  2571   //      |---------------------|
  2572   //      | string[n]           |
  2573   //      |---------------------| <- string_locs[n]
  2574   //      | string[n-1]         |
  2575   //      |---------------------| <- string_locs[n-1]
  2576   //      | ...                 |
  2577   //      | ...                 |
  2578   //      |---------------------| <- string_locs[1]
  2579   //      | string[0]           |
  2580   //      |---------------------| <- string_locs[0]
  2581   //      | temp                |
  2582   //      |---------------------| <- conversion_temp
  2583   //      | outbound memory     |
  2584   //      | based arguments     |
  2585   //      |                     |
  2586   //      |---------------------|
  2587   //      |                     |
  2588   // SP-> | out_preserved_slots |
  2589   //
  2590   //
  2592   // Now compute the actual number of stack words we need, rounding to make
  2593   // the stack properly aligned.
  2594   stack_slots = round_to(stack_slots, 4 * VMRegImpl::slots_per_word);
  2596   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
  2598   intptr_t start = (intptr_t)__ pc();
  2600   // First thing make an ic check to see if we should even be here
  2603     Label L;
  2604     const Register temp_reg = G3_scratch;
  2605     Address ic_miss(temp_reg, SharedRuntime::get_ic_miss_stub());
  2606     __ verify_oop(O0);
  2607     __ ld_ptr(O0, oopDesc::klass_offset_in_bytes(), temp_reg);
  2608     __ cmp(temp_reg, G5_inline_cache_reg);
  2609     __ brx(Assembler::equal, true, Assembler::pt, L);
  2610     __ delayed()->nop();
  2612     __ jump_to(ic_miss, 0);
  2613     __ delayed()->nop();
  2614     __ align(CodeEntryAlignment);
  2615     __ bind(L);
  2618   int vep_offset = ((intptr_t)__ pc()) - start;
  2621   // The instruction at the verified entry point must be 5 bytes or longer
  2622   // because it can be patched on the fly by make_non_entrant. The stack bang
  2623   // instruction fits that requirement.
  2625   // Generate stack overflow check before creating frame
  2626   __ generate_stack_overflow_check(stack_size);
  2628   assert(((intptr_t)__ pc() - start - vep_offset) >= 5,
  2629          "valid size for make_non_entrant");
  2631   // Generate a new frame for the wrapper.
  2632   __ save(SP, -stack_size, SP);
  2634   // Frame is now completed as far a size and linkage.
  2636   int frame_complete = ((intptr_t)__ pc()) - start;
  2638 #ifdef ASSERT
  2639   bool reg_destroyed[RegisterImpl::number_of_registers];
  2640   bool freg_destroyed[FloatRegisterImpl::number_of_registers];
  2641   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
  2642     reg_destroyed[r] = false;
  2644   for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) {
  2645     freg_destroyed[f] = false;
  2648 #endif /* ASSERT */
  2650   VMRegPair zero;
  2651   const Register g0 = G0; // without this we get a compiler warning (why??)
  2652   zero.set2(g0->as_VMReg());
  2654   int c_arg, j_arg;
  2656   Register conversion_off = noreg;
  2658   for (j_arg = first_arg_to_pass, c_arg = 0 ;
  2659        j_arg < total_args_passed ; j_arg++, c_arg++ ) {
  2661     VMRegPair src = in_regs[j_arg];
  2662     VMRegPair dst = out_regs[c_arg];
  2664 #ifdef ASSERT
  2665     if (src.first()->is_Register()) {
  2666       assert(!reg_destroyed[src.first()->as_Register()->encoding()], "ack!");
  2667     } else if (src.first()->is_FloatRegister()) {
  2668       assert(!freg_destroyed[src.first()->as_FloatRegister()->encoding(
  2669                                                FloatRegisterImpl::S)], "ack!");
  2671     if (dst.first()->is_Register()) {
  2672       reg_destroyed[dst.first()->as_Register()->encoding()] = true;
  2673     } else if (dst.first()->is_FloatRegister()) {
  2674       freg_destroyed[dst.first()->as_FloatRegister()->encoding(
  2675                                                  FloatRegisterImpl::S)] = true;
  2677 #endif /* ASSERT */
  2679     switch (in_sig_bt[j_arg]) {
  2680       case T_ARRAY:
  2681       case T_OBJECT:
  2683           if (out_sig_bt[c_arg] == T_BYTE  || out_sig_bt[c_arg] == T_SHORT ||
  2684               out_sig_bt[c_arg] == T_INT || out_sig_bt[c_arg] == T_LONG) {
  2685             // need to unbox a one-slot value
  2686             Register in_reg = L0;
  2687             Register tmp = L2;
  2688             if ( src.first()->is_reg() ) {
  2689               in_reg = src.first()->as_Register();
  2690             } else {
  2691               assert(Assembler::is_simm13(reg2offset(src.first()) + STACK_BIAS),
  2692                      "must be");
  2693               __ ld_ptr(FP, reg2offset(src.first()) + STACK_BIAS, in_reg);
  2695             // If the final destination is an acceptable register
  2696             if ( dst.first()->is_reg() ) {
  2697               if ( dst.is_single_phys_reg() || out_sig_bt[c_arg] != T_LONG ) {
  2698                 tmp = dst.first()->as_Register();
  2702             Label skipUnbox;
  2703             if ( wordSize == 4 && out_sig_bt[c_arg] == T_LONG ) {
  2704               __ mov(G0, tmp->successor());
  2706             __ br_null(in_reg, true, Assembler::pn, skipUnbox);
  2707             __ delayed()->mov(G0, tmp);
  2709             BasicType bt = out_sig_bt[c_arg];
  2710             int box_offset = java_lang_boxing_object::value_offset_in_bytes(bt);
  2711             switch (bt) {
  2712                 case T_BYTE:
  2713                   __ ldub(in_reg, box_offset, tmp); break;
  2714                 case T_SHORT:
  2715                   __ lduh(in_reg, box_offset, tmp); break;
  2716                 case T_INT:
  2717                   __ ld(in_reg, box_offset, tmp); break;
  2718                 case T_LONG:
  2719                   __ ld_long(in_reg, box_offset, tmp); break;
  2720                 default: ShouldNotReachHere();
  2723             __ bind(skipUnbox);
  2724             // If tmp wasn't final destination copy to final destination
  2725             if (tmp == L2) {
  2726               VMRegPair tmp_as_VM = reg64_to_VMRegPair(L2);
  2727               if (out_sig_bt[c_arg] == T_LONG) {
  2728                 long_move(masm, tmp_as_VM, dst);
  2729               } else {
  2730                 move32_64(masm, tmp_as_VM, out_regs[c_arg]);
  2733             if (out_sig_bt[c_arg] == T_LONG) {
  2734               assert(out_sig_bt[c_arg+1] == T_VOID, "must be");
  2735               ++c_arg; // move over the T_VOID to keep the loop indices in sync
  2737           } else if (out_sig_bt[c_arg] == T_ADDRESS) {
  2738             Register s =
  2739                 src.first()->is_reg() ? src.first()->as_Register() : L2;
  2740             Register d =
  2741                 dst.first()->is_reg() ? dst.first()->as_Register() : L2;
  2743             // We store the oop now so that the conversion pass can reach it
  2744             // while in the inner frame. This will be the only store if
  2745             // the oop is NULL.
  2746             if (s != L2) {
  2747               // src is register
  2748               if (d != L2) {
  2749                 // dst is register
  2750                 __ mov(s, d);
  2751               } else {
  2752                 assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2753                           STACK_BIAS), "must be");
  2754                 __ st_ptr(s, SP, reg2offset(dst.first()) + STACK_BIAS);
  2756             } else {
  2757                 // src not a register
  2758                 assert(Assembler::is_simm13(reg2offset(src.first()) +
  2759                            STACK_BIAS), "must be");
  2760                 __ ld_ptr(FP, reg2offset(src.first()) + STACK_BIAS, d);
  2761                 if (d == L2) {
  2762                   assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2763                              STACK_BIAS), "must be");
  2764                   __ st_ptr(d, SP, reg2offset(dst.first()) + STACK_BIAS);
  2767           } else if (out_sig_bt[c_arg] != T_VOID) {
  2768             // Convert the arg to NULL
  2769             if (dst.first()->is_reg()) {
  2770               __ mov(G0, dst.first()->as_Register());
  2771             } else {
  2772               assert(Assembler::is_simm13(reg2offset(dst.first()) +
  2773                          STACK_BIAS), "must be");
  2774               __ st_ptr(G0, SP, reg2offset(dst.first()) + STACK_BIAS);
  2778         break;
  2779       case T_VOID:
  2780         break;
  2782       case T_FLOAT:
  2783         if (src.first()->is_stack()) {
  2784           // Stack to stack/reg is simple
  2785           move32_64(masm, src, dst);
  2786         } else {
  2787           if (dst.first()->is_reg()) {
  2788             // freg -> reg
  2789             int off =
  2790               STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2791             Register d = dst.first()->as_Register();
  2792             if (Assembler::is_simm13(off)) {
  2793               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2794                      SP, off);
  2795               __ ld(SP, off, d);
  2796             } else {
  2797               if (conversion_off == noreg) {
  2798                 __ set(off, L6);
  2799                 conversion_off = L6;
  2801               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2802                      SP, conversion_off);
  2803               __ ld(SP, conversion_off , d);
  2805           } else {
  2806             // freg -> mem
  2807             int off = STACK_BIAS + reg2offset(dst.first());
  2808             if (Assembler::is_simm13(off)) {
  2809               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2810                      SP, off);
  2811             } else {
  2812               if (conversion_off == noreg) {
  2813                 __ set(off, L6);
  2814                 conversion_off = L6;
  2816               __ stf(FloatRegisterImpl::S, src.first()->as_FloatRegister(),
  2817                      SP, conversion_off);
  2821         break;
  2823       case T_DOUBLE:
  2824         assert( j_arg + 1 < total_args_passed &&
  2825                 in_sig_bt[j_arg + 1] == T_VOID &&
  2826                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
  2827         if (src.first()->is_stack()) {
  2828           // Stack to stack/reg is simple
  2829           long_move(masm, src, dst);
  2830         } else {
  2831           Register d = dst.first()->is_reg() ? dst.first()->as_Register() : L2;
  2833           // Destination could be an odd reg on 32bit in which case
  2834           // we can't load direct to the destination.
  2836           if (!d->is_even() && wordSize == 4) {
  2837             d = L2;
  2839           int off = STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2840           if (Assembler::is_simm13(off)) {
  2841             __ stf(FloatRegisterImpl::D, src.first()->as_FloatRegister(),
  2842                    SP, off);
  2843             __ ld_long(SP, off, d);
  2844           } else {
  2845             if (conversion_off == noreg) {
  2846               __ set(off, L6);
  2847               conversion_off = L6;
  2849             __ stf(FloatRegisterImpl::D, src.first()->as_FloatRegister(),
  2850                    SP, conversion_off);
  2851             __ ld_long(SP, conversion_off, d);
  2853           if (d == L2) {
  2854             long_move(masm, reg64_to_VMRegPair(L2), dst);
  2857         break;
  2859       case T_LONG :
  2860         // 32bit can't do a split move of something like g1 -> O0, O1
  2861         // so use a memory temp
  2862         if (src.is_single_phys_reg() && wordSize == 4) {
  2863           Register tmp = L2;
  2864           if (dst.first()->is_reg() &&
  2865               (wordSize == 8 || dst.first()->as_Register()->is_even())) {
  2866             tmp = dst.first()->as_Register();
  2869           int off = STACK_BIAS + conversion_temp * VMRegImpl::stack_slot_size;
  2870           if (Assembler::is_simm13(off)) {
  2871             __ stx(src.first()->as_Register(), SP, off);
  2872             __ ld_long(SP, off, tmp);
  2873           } else {
  2874             if (conversion_off == noreg) {
  2875               __ set(off, L6);
  2876               conversion_off = L6;
  2878             __ stx(src.first()->as_Register(), SP, conversion_off);
  2879             __ ld_long(SP, conversion_off, tmp);
  2882           if (tmp == L2) {
  2883             long_move(masm, reg64_to_VMRegPair(L2), dst);
  2885         } else {
  2886           long_move(masm, src, dst);
  2888         break;
  2890       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
  2892       default:
  2893         move32_64(masm, src, dst);
  2898   // If we have any strings we must store any register based arg to the stack
  2899   // This includes any still live xmm registers too.
  2901   if (total_strings > 0 ) {
  2903     // protect all the arg registers
  2904     __ save_frame(0);
  2905     __ mov(G2_thread, L7_thread_cache);
  2906     const Register L2_string_off = L2;
  2908     // Get first string offset
  2909     __ set(string_locs * VMRegImpl::stack_slot_size, L2_string_off);
  2911     for (c_arg = 0 ; c_arg < total_c_args ; c_arg++ ) {
  2912       if (out_sig_bt[c_arg] == T_ADDRESS) {
  2914         VMRegPair dst = out_regs[c_arg];
  2915         const Register d = dst.first()->is_reg() ?
  2916             dst.first()->as_Register()->after_save() : noreg;
  2918         // It's a string; the oop was already copied to the out arg
  2919         // position
  2920         if (d != noreg) {
  2921           __ mov(d, O0);
  2922         } else {
  2923           assert(Assembler::is_simm13(reg2offset(dst.first()) + STACK_BIAS),
  2924                  "must be");
  2925           __ ld_ptr(FP,  reg2offset(dst.first()) + STACK_BIAS, O0);
  2927         Label skip;
  2929         __ br_null(O0, false, Assembler::pn, skip);
  2930         __ delayed()->add(FP, L2_string_off, O1);
  2932         if (d != noreg) {
  2933           __ mov(O1, d);
  2934         } else {
  2935           assert(Assembler::is_simm13(reg2offset(dst.first()) + STACK_BIAS),
  2936                  "must be");
  2937           __ st_ptr(O1, FP,  reg2offset(dst.first()) + STACK_BIAS);
  2940         __ call(CAST_FROM_FN_PTR(address, SharedRuntime::get_utf),
  2941                 relocInfo::runtime_call_type);
  2942         __ delayed()->add(L2_string_off, max_dtrace_string_size, L2_string_off);
  2944         __ bind(skip);
  2949     __ mov(L7_thread_cache, G2_thread);
  2950     __ restore();
  2955   // Ok now we are done. Need to place the nop that dtrace wants in order to
  2956   // patch in the trap
  2958   int patch_offset = ((intptr_t)__ pc()) - start;
  2960   __ nop();
  2963   // Return
  2965   __ ret();
  2966   __ delayed()->restore();
  2968   __ flush();
  2970   nmethod *nm = nmethod::new_dtrace_nmethod(
  2971       method, masm->code(), vep_offset, patch_offset, frame_complete,
  2972       stack_slots / VMRegImpl::slots_per_word);
  2973   return nm;
  2977 #endif // HAVE_DTRACE_H
  2979 // this function returns the adjust size (in number of words) to a c2i adapter
  2980 // activation for use during deoptimization
  2981 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) {
  2982   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
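// Note (illustrative, added for clarity): with, say, 2 callee parameters,
// 5 callee locals and Interpreter::stackElementWords == 1 (the usual value
// on 64-bit ports), the adjustment would be (5 - 2) * 1 = 3 words.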
  2985 // "Top of Stack" slots that may be unused by the calling convention but must
  2986 // otherwise be preserved.
  2987 // On Intel these are not necessary and the value can be zero.
  2988 // On Sparc this describes the words reserved for storing a register window
  2989 // when an interrupt occurs.
  2990 uint SharedRuntime::out_preserve_stack_slots() {
  2991    return 0;
  2994 //------------------------------generate_deopt_blob----------------------------
  2995 // Ought to generate an ideal graph & compile, but here's some hand-written
  2996 // MIPS assembly instead.
  2997 void SharedRuntime::generate_deopt_blob() {
  2998   // allocate space for the code
  2999   ResourceMark rm;
  3000   // setup code generation tools
  3001   //CodeBuffer     buffer ("deopt_blob", 4000, 2048);
  3002   CodeBuffer     buffer ("deopt_blob", 8000, 2048);//aoqi FIXME for debug
  3003   MacroAssembler* masm  = new MacroAssembler( & buffer);
  3004   int frame_size_in_words;
  3005   OopMap* map = NULL;
  3006   // Account for the extra args we place on the stack
  3007   // by the time we call fetch_unroll_info
  3008   const int additional_words = 2; // deopt kind, thread
  3010   OopMapSet *oop_maps = new OopMapSet();
  3012   address start = __ pc();
  3013   Label cont;
  3014   // we use S3 for DeOpt reason register
  3015   Register reason = S3;
  3016   // use S6 for thread register
  3017   Register thread = TREG;
  3018   // use S7 for fetch_unroll_info returned UnrollBlock
  3019   Register unroll = S7;
  3020   // Prolog for non exception case!
  3021   // Correct the return address we were given.
  3022   //FIXME, return address is on the tos or Ra?
  3023   __ addi(RA, RA, - (NativeCall::return_address_offset_long));
  3024   // Save everything in sight.
  3025   map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3026   // Normal deoptimization
  3027   __ move(reason, Deoptimization::Unpack_deopt);
  3028   __ b(cont);
  3029   __ delayed()->nop();
  3031   int reexecute_offset = __ pc() - start;
  3033   // Reexecute case
  3034   // return address is the pc that describes what bci to re-execute at
  3036   // No need to update map as each call to save_live_registers will produce identical oopmap
  3037   (void) RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3038   __ move(reason, Deoptimization::Unpack_reexecute);
  3039   __ b(cont);
  3040   __ delayed()->nop();
  3042   int   exception_offset = __ pc() - start;
  3043   // Prolog for exception case
  3045   // all registers are dead at this entry point, except for V0 and
  3046   // V1 which contain the exception oop and exception pc
  3047   // respectively.  Set them in TLS and fall thru to the
  3048   // unpack_with_exception_in_tls entry point.
  3050   __ get_thread(thread);
  3051   __ st_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3052   __ st_ptr(V0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3053   int exception_in_tls_offset = __ pc() - start;
  3054   // new implementation because exception oop is now passed in JavaThread
  3056   // Prolog for exception case
  3057   // All registers must be preserved because they might be used by LinearScan
  3058   // Exception oop and throwing PC are passed in JavaThread
  3059   // tos: stack at point of call to method that threw the exception (i.e. only
  3060   // args are on the stack, no return address)
  3062   // Return address will be patched later with the throwing pc. The correct value is not
  3063   // available now because loading it from memory would destroy registers.
  3064   // Save everything in sight.
  3065   // No need to update map as each call to save_live_registers will produce identical oopmap
  3066   __ addi(RA, RA, - (NativeCall::return_address_offset_long));
  3067   (void) RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words);
  3069   // Now it is safe to overwrite any register
  3070   // store the correct deoptimization type
  3071   __ move(reason, Deoptimization::Unpack_exception);
  3072   // load throwing pc from JavaThread and patch it as the return address
  3073   // of the current frame. Then clear the field in JavaThread
  3074   __ get_thread(thread);
  3075   __ ld_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3076   __ st_ptr(V1, SP, RegisterSaver::raOffset() * wordSize); //save ra
  3077   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_pc_offset()));
  3080 #ifdef ASSERT
  3081   // verify that there is really an exception oop in JavaThread
  3082   __ ld_ptr(AT, thread, in_bytes(JavaThread::exception_oop_offset()));
  3083   __ verify_oop(AT);
  3084   // verify that there is no pending exception
  3085   Label no_pending_exception;
  3086   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3087   __ beq(AT, R0, no_pending_exception);
  3088   __ delayed()->nop();
  3089   __ stop("must not have pending exception here");
  3090   __ bind(no_pending_exception);
  3091 #endif
  3092   __ bind(cont);
  3093   // Compiled code leaves the floating point stack dirty, empty it.
  3094   __ empty_FPU_stack();
  3097   // Call C code.  Need thread and this frame, but NOT official VM entry
  3098   // crud.  We cannot block on this call, no GC can happen.
  3099 #ifndef OPT_THREAD
  3100   __ get_thread(thread);
  3101 #endif
  3103   __ move(A0, thread);
  3104   __ addi(SP, SP, -additional_words  * wordSize);
  3106   __ set_last_Java_frame(NOREG, NOREG, NULL);
  3108   // Call fetch_unroll_info().  Need thread and this frame, but NOT official VM entry - cannot block on
  3109   // this call, no GC can happen.  Call should capture return values.
  3111   __ relocate(relocInfo::internal_pc_type);
  3113     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 28;
  3114     __ patchable_set48(AT, save_pc);
  3116   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3118   __ call((address)Deoptimization::fetch_unroll_info);
  3119   //__ call(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info), relocInfo::runtime_call_type);
  3120   __ delayed()->nop();
  3121   oop_maps->add_gc_map(__ pc() - start, map);
  3122   __ addiu(SP, SP, additional_words * wordSize);
  3123   __ get_thread(thread);
  3124   __ reset_last_Java_frame(false);
  3126   // Load UnrollBlock into S7
  3127   __ move(unroll, V0);
  3130   // Move the unpack kind to a safe place in the UnrollBlock because
  3131   // we are very short of registers
  3133   Address unpack_kind(unroll, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes());
  3134   __ sw(reason, unpack_kind);
  3135   // save the unpack_kind value
  3136   // Retrieve the possible live values (return values)
  3137   // All callee save registers representing jvm state
  3138   // are now in the vframeArray.
  3140   Label noException;
  3141   __ move(AT, Deoptimization::Unpack_exception);
  3142   __ bne(AT, reason, noException);// Was exception pending?
  3143   __ delayed()->nop();
  3144   __ ld_ptr(V0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3145   __ ld_ptr(V1, thread, in_bytes(JavaThread::exception_pc_offset()));
  3146   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_pc_offset()));
  3147   __ st_ptr(R0, thread, in_bytes(JavaThread::exception_oop_offset()));
  3149   __ verify_oop(V0);
  3151   // Overwrite the result registers with the exception results.
  3152   __ st_ptr(V0, SP, RegisterSaver::v0Offset()*wordSize);
  3153   __ st_ptr(V1, SP, RegisterSaver::v1Offset()*wordSize);
  3155   __ bind(noException);
  3158   // Stack is back to only having register save data on the stack.
  3159   // Now restore the result registers. Everything else is either dead or captured
  3160   // in the vframeArray.
  3162   RegisterSaver::restore_result_registers(masm);
  3163   // All of the register save area has been popped off the stack. Only the
  3164   // return address remains.
  3165   // Pop all the frames we must move/replace.
  3166   // Frame picture (youngest to oldest)
  3167   // 1: self-frame (no frame link)
  3168   // 2: deopting frame  (no frame link)
  3169   // 3: caller of deopting frame (could be compiled/interpreted).
  3170   //
  3171   // Note: by leaving the return address of self-frame on the stack
  3172   // and using the size of frame 2 to adjust the stack
  3173   // when we are done the return to frame 3 will still be on the stack.
  3175   // register for the sender's sp
  3176   Register sender_sp = Rsender;
  3177   // register for frame pcs
  3178   Register pcs = T0;
  3179   // register for frame sizes
  3180   Register sizes = T1;
  3181   // register for frame count
  3182   Register count = T3;
  3184   // Pop deoptimized frame
  3185   __ lw(AT, unroll, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes());
  3186   __ add(SP, SP, AT);
  3187   // sp should be pointing at the return address to the caller (3)
  3189   // Load array of frame pcs into pcs
  3190   __ ld_ptr(pcs, unroll, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes());
  3191   __ addi(SP, SP, wordSize);  // trash the old pc
  3192   // Load array of frame sizes into sizes (T1)
  3193   __ ld_ptr(sizes, unroll, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes());
  3197   // Load count of frames into T3
  3198   __ lw(count, unroll, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes());
  3199   // Pick up the initial fp we should save
  3200   __ ld(FP, unroll,  Deoptimization::UnrollBlock::initial_info_offset_in_bytes());
  3201    // Now adjust the caller's stack to make up for the extra locals
  3202   // but record the original sp so that we can save it in the skeletal interpreter
  3203   // frame and the stack walking of interpreter_sender will get the unextended sp
  3204   // value and not the "real" sp value.
  3205   __ move(sender_sp, SP);
  3206   __ lw(AT, unroll, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes());
  3207   __ sub(SP, SP, AT);
  3209   // Push interpreter frames in a loop
  3210   //
  3211   //Loop:
  3212   //   0x000000555bd82d18: lw t2, 0x0(t1)           ; lw sizes[i]  <--- error lw->ld
  3213   //   0x000000555bd82d1c: ld at, 0x0(t0)           ; ld pcs[i]
  3214   //   0x000000555bd82d20: daddi t2, t2, 0xfffffff0 ; t2 -= 16
  3215   //   0x000000555bd82d24: daddi sp, sp, 0xfffffff0
  3216   //   0x000000555bd82d28: sd fp, 0x0(sp)           ; push fp
  3217   //   0x000000555bd82d2c: sd at, 0x8(sp)           ; push at
  3218   //   0x000000555bd82d30: dadd fp, sp, zero        ; fp <- sp
  3219   //   0x000000555bd82d34: dsub sp, sp, t2          ; sp -= t2
  3220   //   0x000000555bd82d38: sd zero, 0xfffffff0(fp)  ; __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3221   //   0x000000555bd82d3c: sd s4, 0xfffffff8(fp)    ; __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);
  3222   //   0x000000555bd82d40: dadd s4, sp, zero        ; move(sender_sp, SP);
  3223   //   0x000000555bd82d44: daddi t3, t3, 0xffffffff ; count --
  3224   //   0x000000555bd82d48: daddi t1, t1, 0x4        ; sizes += 4
  3225   //   0x000000555bd82d4c: bne t3, zero, 0x000000555bd82d18
  3226   //   0x000000555bd82d50: daddi t0, t0, 0x4        ; <--- error    t0 += 8
  3227   //
  3228   // pcs[0] = frame_pcs[0] = deopt_sender.raw_pc();
  3229   Label loop;
  3230   __ bind(loop);
  3231   __ ld(T2, sizes, 0);    // Load frame size
  3232   __ ld_ptr(AT, pcs, 0);           // save return address
  3233   __ addi(T2, T2, -2*wordSize);           // we'll push pc and fp, by hand
  3234   __ push2(AT, FP);
  3235   __ move(FP, SP);
  3236   __ sub(SP, SP, T2);       // Prolog!
  3237   // This value is corrected by layout_activation_impl
  3238   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3239   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);// Make it walkable
  3240   __ move(sender_sp, SP);  // pass to next frame
  3241   __ addi(count, count, -1);   // decrement counter
  3242   __ addi(sizes, sizes, wordSize);   // Bump array pointer (sizes)
  3243   __ bne(count, R0, loop);
  3244   __ delayed()->addi(pcs, pcs, wordSize);   // Bump array pointer (pcs)
  3245   __ ld(AT, pcs, 0);      // frame_pcs[number_of_frames] = Interpreter::deopt_entry(vtos, 0);
  3246   // Re-push self-frame
  3247   __ push2(AT, FP);
  3248   __ move(FP, SP);
  3249   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3250   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);
  3251   __ addi(SP, SP, -(frame_size_in_words - 2 - additional_words) * wordSize);
  3253   // Restore frame locals after moving the frame
  3254   __ sd(V0, SP, RegisterSaver::v0Offset() * wordSize);
  3255   __ sd(V1, SP, RegisterSaver::v1Offset() * wordSize);
  3256   __ sdc1(F0, SP, RegisterSaver::fpResultOffset()* wordSize);// Pop float stack and store in local
  3257   __ sdc1(F1, SP, (RegisterSaver::fpResultOffset() + 1) * wordSize);
  3260   // Call unpack_frames().  Need thread and this frame, but NOT official VM entry - cannot block on
  3261   // this call, no GC can happen.
  3262   __ move(A1, reason);  // exec_mode
  3263   __ get_thread(thread);
  3264   __ move(A0, thread);  // thread
  3265   __ addi(SP, SP, (-additional_words) *wordSize);
  3267   // set last_Java_sp, last_Java_fp
  3268   __ set_last_Java_frame(NOREG, FP, NULL);
  3270   __ move(AT, -(StackAlignmentInBytes));
  3271   __ andr(SP, SP, AT);   // Fix stack alignment as required by ABI
  3273   __ relocate(relocInfo::internal_pc_type);
  3275     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 28;
  3276     __ patchable_set48(AT, save_pc);
  3278   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3280   __ call(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames), relocInfo::runtime_call_type);
  3281   __ delayed()->nop();
  3282   // Revert SP alignment after call since we're going to do some SP relative addressing below
  3283   __ ld(SP, thread, in_bytes(JavaThread::last_Java_sp_offset()));
  3284   // Set an oopmap for the call site
  3285   oop_maps->add_gc_map(__ offset(), new OopMap( frame_size_in_words , 0));
  3287   __ push(V0);
  3289   __ get_thread(thread);
  3290   __ reset_last_Java_frame(true);
  3292   // Collect return values
  3293   __ ld(V0, SP, (RegisterSaver::v0Offset() + additional_words +1) * wordSize);
  3294   __ ld(V1, SP, (RegisterSaver::v1Offset() + additional_words +1) * wordSize);
  3295   __ ldc1(F0, SP, RegisterSaver::fpResultOffset()* wordSize);// Pop float stack and store in local
  3296   __ ldc1(F1, SP, (RegisterSaver::fpResultOffset() + 1) * wordSize);
  3297   //FIXME,
  3298   // Clear floating point stack before returning to interpreter
  3299   __ empty_FPU_stack();
  3300   //FIXME, we should consider about float and double
  3301   // Push a float or double return value if necessary.
  3302   __ leave();
  3304   // Jump to interpreter
  3305   __ jr(RA);
  3306   __ delayed()->nop();
  3308   masm->flush();
  3309   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
  3310   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
  3313 #ifdef COMPILER2
  3315 //------------------------------generate_uncommon_trap_blob--------------------
  3316 // Ought to generate an ideal graph & compile, but here's some hand-written
  3317 // MIPS assembly instead.
  3318 void SharedRuntime::generate_uncommon_trap_blob() {
  3319   // allocate space for the code
  3320   ResourceMark rm;
  3321   // setup code generation tools
  3322   CodeBuffer  buffer ("uncommon_trap_blob", 512*80 , 512*40 );
  3323   MacroAssembler* masm = new MacroAssembler(&buffer);
  3325   enum frame_layout {
  3326     s0_off, s0_off2,
  3327     s1_off, s1_off2,
  3328     s2_off, s2_off2,
  3329     s3_off, s3_off2,
  3330     s4_off, s4_off2,
  3331     s5_off, s5_off2,
  3332     s6_off, s6_off2,
  3333     s7_off, s7_off2,
  3334     fp_off, fp_off2,
  3335     return_off, return_off2,    // slot for return address    sp + 9
  3336     framesize
  3337   };
  3338   assert(framesize % 4 == 0, "sp not 16-byte aligned");
  3340   address start = __ pc();
  3342   // Push self-frame.
  3343   __ daddiu(SP, SP, -framesize * BytesPerInt);
  3345   __ sd(RA, SP, return_off * BytesPerInt);
  3346   __ sd(FP, SP, fp_off * BytesPerInt);
  3348   // Save callee saved registers.  None for UseSSE=0,
  3349   // floats-only for UseSSE=1, and doubles for UseSSE=2.
  3350   __ sd(S0, SP, s0_off * BytesPerInt);
  3351   __ sd(S1, SP, s1_off * BytesPerInt);
  3352   __ sd(S2, SP, s2_off * BytesPerInt);
  3353   __ sd(S3, SP, s3_off * BytesPerInt);
  3354   __ sd(S4, SP, s4_off * BytesPerInt);
  3355   __ sd(S5, SP, s5_off * BytesPerInt);
  3356   __ sd(S6, SP, s6_off * BytesPerInt);
  3357   __ sd(S7, SP, s7_off * BytesPerInt);
  3359   __ daddi(FP, SP, fp_off * BytesPerInt);
  3361   // Clear the floating point exception stack
  3362   __ empty_FPU_stack();
  3364   Register thread = TREG;
  3366 #ifndef OPT_THREAD
  3367   __ get_thread(thread);
  3368 #endif
  3369   // set last_Java_sp
  3370   __ set_last_Java_frame(NOREG, FP, NULL);
  3371   __ relocate(relocInfo::internal_pc_type);
  3373     long save_pc = (long)__ pc() + 52;
  3374     __ patchable_set48(AT, (long)save_pc);
  3375     __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3377   // Call C code.  Need thread but NOT official VM entry
  3378   // crud.  We cannot block on this call, no GC can happen.  Call should
  3379   // capture callee-saved registers as well as return values.
  3380   __ move(A0, thread);
  3381   // argument already in T0
  3382   __ move(A1, T0);
  3383   __ patchable_call((address)Deoptimization::uncommon_trap);
  3385   // Set an oopmap for the call site
  3386   OopMapSet *oop_maps = new OopMapSet();
  3387   OopMap* map =  new OopMap( framesize, 0 );
  3389   map->set_callee_saved( VMRegImpl::stack2reg(s0_off    ),  S0->as_VMReg() );
  3390   map->set_callee_saved( VMRegImpl::stack2reg(s1_off    ),  S1->as_VMReg() );
  3391   map->set_callee_saved( VMRegImpl::stack2reg(s2_off    ),  S2->as_VMReg() );
  3392   map->set_callee_saved( VMRegImpl::stack2reg(s3_off    ),  S3->as_VMReg() );
  3393   map->set_callee_saved( VMRegImpl::stack2reg(s4_off    ),  S4->as_VMReg() );
  3394   map->set_callee_saved( VMRegImpl::stack2reg(s5_off    ),  S5->as_VMReg() );
  3395   map->set_callee_saved( VMRegImpl::stack2reg(s6_off    ),  S6->as_VMReg() );
  3396   map->set_callee_saved( VMRegImpl::stack2reg(s7_off    ),  S7->as_VMReg() );
  3398   //oop_maps->add_gc_map( __ offset(), true, map);
  3399   oop_maps->add_gc_map( __ offset(),  map);
  3401 #ifndef OPT_THREAD
  3402   __ get_thread(thread);
  3403 #endif
  3404   __ reset_last_Java_frame(false);
  3406   // Load UnrollBlock into S7
  3407   Register unroll = S7;
  3408   __ move(unroll, V0);
  3410   // Pop all the frames we must move/replace.
  3411   //
  3412   // Frame picture (youngest to oldest)
  3413   // 1: self-frame (no frame link)
  3414   // 2: deopting frame  (no frame link)
  3415   // 3: possible-i2c-adapter-frame
  3416   // 4: caller of deopting frame (could be compiled/interpreted; if interpreted
  3417   //    we will create a c2i adapter here)
  3419   __ daddiu(SP, SP, framesize * BytesPerInt);
  3421   // Pop deoptimized frame
  3422   __ lw(AT, unroll, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes());
  3423   __ dadd(SP, SP, AT);
  3425   // register for frame pcs
  3426   Register pcs = T8;
  3427   // register for frame sizes
  3428   Register sizes = T9;
  3429   // register for frame count
  3430   Register count = T3;
  3431   // register for the sender's sp
  3432   Register sender_sp = T1;
  3434   // sp should be pointing at the return address to the caller (4)
  3435   // Load array of frame pcs
  3436   __ ld(pcs, unroll, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes());
  3438   // Load array of frame sizes
  3439   __ ld(sizes, unroll, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes());
  3440   __ lwu(count, unroll, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes());
  3442   // Pick up the initial fp we should save
  3443   __ ld(FP, unroll, Deoptimization::UnrollBlock::initial_info_offset_in_bytes());
  3444   // Now adjust the caller's stack to make up for the extra locals
  3445   // but record the original sp so that we can save it in the skeletal interpreter
  3446   // frame and the stack walking of interpreter_sender will get the unextended sp
  3447   // value and not the "real" sp value.
  3449   __ move(sender_sp, SP);
  3450   __ lw(AT, unroll, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes());
  3451   __ dsub(SP, SP, AT);
  3452   // Push interpreter frames in a loop
  3453   Label loop;
  3454   __ bind(loop);
  3455   __ ld(T2, sizes, 0);          // Load frame size
  3456   __ ld(AT, pcs, 0);           // save return address
  3457   __ daddi(T2, T2, -2*wordSize);           // we'll push pc and fp, by hand
  3458   __ push2(AT, FP);
  3459   __ move(FP, SP);
  3460   __ dsub(SP, SP, T2);                   // Prolog!
  3461   // This value is corrected by layout_activation_impl
  3462   __ sd(R0, FP, frame::interpreter_frame_last_sp_offset * wordSize);
  3463   __ sd(sender_sp, FP, frame::interpreter_frame_sender_sp_offset * wordSize);// Make it walkable
  3464   __ move(sender_sp, SP);       // pass to next frame
  3465   __ daddi(count, count, -1);    // decrement counter
  3466   __ daddi(sizes, sizes, wordSize);     // Bump array pointer (sizes)
  3467   __ addi(pcs, pcs, wordSize);      // Bump array pointer (pcs)
  3468   __ bne(count, R0, loop);
  3469   __ delayed()->nop();      // branch delay slot (pcs already bumped above)
  3471   __ ld(RA, pcs, 0);
  3473   // Re-push self-frame
  3474   __ daddi(SP, SP, - 2 * wordSize);      // save old & set new FP
  3475   __ sd(FP, SP, 0 * wordSize);          // save final return address
  3476   __ sd(RA, SP, 1 * wordSize);
  3477   __ move(FP, SP);
  3478   __ daddi(SP, SP, -(framesize / 2 - 2) * wordSize);
  3480   // set last_Java_sp, last_Java_fp
  3481   __ set_last_Java_frame(NOREG, FP, NULL);
  3483   __ move(AT, -(StackAlignmentInBytes));
  3484   __ andr(SP, SP, AT);   // Fix stack alignment as required by ABI
  3486   __ relocate(relocInfo::internal_pc_type);
  3488     long save_pc = (long)__ pc() + 52;
  3489     __ patchable_set48(AT, (long)save_pc);
  3491   __ sd(AT, thread, in_bytes(JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
  3493   // Call C code.  Need thread but NOT official VM entry
  3494   // crud.  We cannot block on this call, no GC can happen.  Call should
  3495   // restore return values to their stack-slots with the new SP.
  3496   __ move(A0, thread);
  3497   __ move(A1, Deoptimization::Unpack_uncommon_trap);
  3498   __ patchable_call((address)Deoptimization::unpack_frames);
  3499   // Set an oopmap for the call site
  3500   oop_maps->add_gc_map( __ offset(),  new OopMap( framesize, 0 ) );
  3502   __ reset_last_Java_frame(true);
  3504   // Pop self-frame.
  3505   __ leave();     // Epilog!
  3507   // Jump to interpreter
  3508   __ jr(RA);
  3509   __ delayed()->nop();
  3510   // -------------
  3511   // make sure all code is generated
  3512   masm->flush();
  3514   _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, framesize / 2);
  3517 #endif // COMPILER2
  3519 //------------------------------generate_handler_blob-------------------
  3520 //
  3521 // Generate a special Compile2Runtime blob that saves all registers, and sets
  3522 // up an OopMap and calls safepoint code to stop the compiled code for
  3523 // a safepoint.
  3524 //
  3525 // This blob is jumped to (via a breakpoint and the signal handler) from a
  3526 // safepoint in compiled code.
  3528 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int pool_type) {
  3530   // Account for thread arg in our frame
  3531   const int additional_words = 0;
  3532   int frame_size_in_words;
  3534   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  3536   ResourceMark rm;
  3537   OopMapSet *oop_maps = new OopMapSet();
  3538   OopMap* map;
  3540   // allocate space for the code
  3541   // setup code generation tools
  3542   CodeBuffer  buffer ("handler_blob", 2048, 512);
  3543   MacroAssembler* masm = new MacroAssembler( &buffer);
  3545   const Register thread = TREG;
  3546   address start   = __ pc();
  3547   address call_pc = NULL;
  3548   bool cause_return = (pool_type == POLL_AT_RETURN);
  3549   bool save_vectors = (pool_type == POLL_AT_VECTOR_LOOP);
  3551   // If cause_return is true we are at a poll_return, and RA holds the return
  3552   // address back into the nmethod that is at the safepoint. We can leave this
  3553   // return address in RA and effectively complete the return and safepoint in
  3554   // the caller.
  3555   // Otherwise we load the exception pc into RA.
  3556   __ push(thread);
  3557 #ifndef OPT_THREAD
  3558   __ get_thread(thread);
  3559 #endif
  3561   if(!cause_return) {
  3562     __ ld_ptr(RA, Address(thread, JavaThread::saved_exception_pc_offset()));
  3565   __ pop(thread);
  3566   map = RegisterSaver::save_live_registers(masm, additional_words, &frame_size_in_words, save_vectors);
  3568 #ifndef OPT_THREAD
  3569   __ get_thread(thread);
  3570 #endif
  3571   // The following is basically a call_VM. However, we need the precise
  3572   // address of the call in order to generate an oopmap. Hence, we do all the
  3573   // work ourselves.
  3575   __ move(A0, thread);
  3576   __ set_last_Java_frame(NOREG, NOREG, NULL);
  3579   // do the call
  3580   __ call(call_ptr);
  3581   __ delayed()->nop();
  3583   // Set an oopmap for the call site.  This oopmap will map all
  3584   // oop-registers and debug-info registers as callee-saved.  This
  3585   // will allow deoptimization at this safepoint to find all possible
  3586   // debug-info recordings, as well as let GC find all oops.
  3587   oop_maps->add_gc_map(__ offset(),  map);
  3589   Label noException;
  3591   // Clear last_Java_sp again
  3592   __ reset_last_Java_frame(false);
  3594   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3595   __ beq(AT, R0, noException);
  3596   __ delayed()->nop();
  3598   // Exception pending
  3600   RegisterSaver::restore_live_registers(masm, save_vectors);
  3601   // forward_exception_entry needs the return address on the stack
  3602   __ push(RA);
  3603   __ patchable_jump((address)StubRoutines::forward_exception_entry());
  3605   // No exception case
  3606   __ bind(noException);
  3607   // Normal exit, register restoring and exit
  3608   RegisterSaver::restore_live_registers(masm, save_vectors);
  3609   __ jr(RA);
  3610   __ delayed()->nop();
  3612   masm->flush();
  3614   // Fill-out other meta info
  3615   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
  3618 //
  3619 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
  3620 //
  3621 // Generate a stub that calls into vm to find out the proper destination
  3622 // of a java call. All the argument registers are live at this point
  3623 // but since this is generic code we don't know what they are and the caller
  3624 // must do any gc of the args.
  3625 //
  3626 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
  3627   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
  3629   // allocate space for the code
  3630   ResourceMark rm;
  3632   //CodeBuffer buffer(name, 1000, 512);
  3633   //FIXME. aoqi. code_size
  3634   CodeBuffer buffer(name, 2000, 2048);
  3635   MacroAssembler* masm  = new MacroAssembler(&buffer);
  3637   int frame_size_words;
  3638   //we put the thread in A0
  3640   OopMapSet *oop_maps = new OopMapSet();
  3641   OopMap* map = NULL;
  3643   int start = __ offset();
  3644   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_words);
  3647   int frame_complete = __ offset();
  3649   const Register thread = T8;
  3650   __ get_thread(thread);
  3652   __ move(A0, thread);
  3653   __ set_last_Java_frame(noreg, FP, NULL);
  3654   //align the stack before invoke native
  3655   __ move(AT, -(StackAlignmentInBytes));
  3656   __ andr(SP, SP, AT);
  3657   __ relocate(relocInfo::internal_pc_type);
  3659     intptr_t save_pc = (intptr_t)__ pc() +  NativeMovConstReg::instruction_size + 24 + 1 * BytesPerInstWord;
  3660     __ patchable_set48(AT, save_pc);
  3662   __ sd(AT, thread, in_bytes(JavaThread::last_Java_pc_offset()));
  3664   __ call(destination);
  3665   __ delayed()->nop();
  3667   // Set an oopmap for the call site.
  3668   // We need this not only for callee-saved registers, but also for volatile
  3669   // registers that the compiler might be keeping live across a safepoint.
  3670   oop_maps->add_gc_map( __ offset() - start, map);
  3671   // V0 contains the address we are going to jump to assuming no exception got installed
  3672   __ get_thread(thread);
  3673   __ ld_ptr(SP, thread, in_bytes(JavaThread::last_Java_sp_offset()));
  3674   // clear last_Java_sp
  3675   __ reset_last_Java_frame(true);
  3676   // check for pending exceptions
  3677   Label pending;
  3678   __ ld_ptr(AT, thread, in_bytes(Thread::pending_exception_offset()));
  3679   __ bne(AT, R0, pending);
  3680   __ delayed()->nop();
  3681   // get the returned Method*
  3682   //FIXME, does MIPS need this?
  3683   __ get_vm_result_2(Rmethod, thread);  // Refer to OpenJDK8
  3684   __ st_ptr(Rmethod, SP, RegisterSaver::methodOffset() * wordSize);
  3685   __ st_ptr(V0, SP, RegisterSaver::v0Offset() * wordSize);
  3686   RegisterSaver::restore_live_registers(masm);
  3688   // We are back to the original state on entry and ready to go to the callee method.
  3689   __ jr(V0);
  3690   __ delayed()->nop();
  3691   // Pending exception after the safepoint
  3693   __ bind(pending);
  3695   RegisterSaver::restore_live_registers(masm);
  3697   // exception pending => remove activation and forward to exception handler
  3698   //forward_exception_entry need return address on the stack
  3699   __ push(RA);
  3700   __ get_thread(thread);
  3701   __ st_ptr(R0, thread, in_bytes(JavaThread::vm_result_offset()));
  3702   __ ld_ptr(V0, thread, in_bytes(Thread::pending_exception_offset()));
  3703   __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
  3704   __ delayed()->nop();
  3705   //
  3706   // make sure all code is generated
  3707   masm->flush();
  3709   RuntimeStub* tmp= RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_words, oop_maps, true);
  3710   return tmp;
  3713 extern "C" int SpinPause() {return 0;}
  3716 //------------------------------Montgomery multiplication------------------------
  3717 //
  3719 // Subtract 0:b from carry:a.  Return carry.
  3720 static unsigned long
  3721 sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  3722   long borrow = 0, t = 0;
  3723   unsigned long tmp0, tmp1;
  3724   __asm__ __volatile__ (
  3725     "0:                                            \n"
  3726     "ld      %[tmp0],     0(%[a])                  \n"
  3727     "ld      %[tmp1],     0(%[b])                  \n"
  3728     "sltu    %[t],        %[tmp0],     %[borrow]   \n"
  3729     "dsubu   %[tmp0],     %[tmp0],     %[borrow]   \n"
  3730     "sltu    %[borrow],   %[tmp0],     %[tmp1]     \n"
  3731     "or      %[borrow],   %[borrow],   %[t]        \n"
  3732     "dsubu   %[tmp0],     %[tmp0],     %[tmp1]     \n"
  3733     "sd      %[tmp0],     0(%[a])                  \n"
  3734     "daddiu  %[a],        %[a],         8          \n"
  3735     "daddiu  %[b],        %[b],         8          \n"
  3736     "daddiu  %[len],      %[len],      -1          \n"
  3737     "bgtz    %[len],      0b                       \n"
  3738     "dsubu   %[tmp0],     %[carry],    %[borrow]   \n"
  3739     : [len]"+r"(len), [tmp0]"=&r"(tmp0), [tmp1]"=&r"(tmp1), [borrow]"+r"(borrow), [a]"+r"(a), [b]"+r"(b), [t]"+r"(t)
  3740     : [carry]"r"(carry)
  3741     : "memory"
  3742   );
  3743   return tmp0;
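// Note (illustrative, added for clarity): a portable C++ sketch of what the
// inline assembly above computes; the name sub_reference is hypothetical and
// the block is not compiled or used.
#if 0
static unsigned long sub_reference(unsigned long a[], unsigned long b[],
                                   unsigned long carry, long len) {
  unsigned long borrow = 0;
  for (long i = 0; i < len; i++) {
    unsigned long ai = a[i];
    unsigned long underflow = (ai < borrow);   // borrow out of (ai - borrow)
    unsigned long t = ai - borrow;
    borrow = underflow | (t < b[i]);           // borrow out of (t - b[i])
    a[i] = t - b[i];
  }
  return carry - borrow;                       // top word of the result
}
#endif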
  3746 // Multiply (unsigned) Long A by Long B, accumulating the double-
  3747 // length result into the accumulator formed of t0, t1, and t2.
  3748 inline void MACC(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  3749   unsigned long hi, lo, carry = 0, t = 0;
  3750   __asm__ __volatile__(
  3751     "dmultu  %[A],        %[B]                     \n"
  3752     "mfhi    %[hi]                                 \n"
  3753     "mflo    %[lo]                                 \n"
  3754     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3755     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3756     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3757     "sltu    %[t],        %[t1],       %[carry]    \n"
  3758     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3759     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3760     "or      %[carry],    %[carry],    %[t]        \n"
  3761     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3762     : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
  3763     : [A]"r"(A), [B]"r"(B)
  3765   );
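// Note (illustrative, added for clarity): a portable sketch of the MACC
// accumulation above, assuming GCC's unsigned __int128 extension; the name
// MACC_reference is hypothetical and the block is not compiled or used.
// It computes t2:t1:t0 += (128-bit product of A and B).
#if 0
static inline void MACC_reference(unsigned long A, unsigned long B,
                                  unsigned long &t0, unsigned long &t1,
                                  unsigned long &t2) {
  unsigned __int128 p = (unsigned __int128)A * B;
  unsigned long lo = (unsigned long)p;
  unsigned long hi = (unsigned long)(p >> 64);
  unsigned long c = ((t0 += lo) < lo);   // carry out of the low word
  unsigned long d = ((t1 += c)  < c);    // propagate it into the middle word
  d |= ((t1 += hi) < hi);                // add the high word, collect the carry
  t2 += d;                               // final carry into the top word
}
#endif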
  3768 // As above, but add twice the double-length result into the
  3769 // accumulator.
  3770 inline void MACC2(unsigned long A, unsigned long B, unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  3771   unsigned long hi, lo, carry = 0, t = 0;
  3772   __asm__ __volatile__(
  3773     "dmultu  %[A],        %[B]                     \n"
  3774     "mfhi    %[hi]                                 \n"
  3775     "mflo    %[lo]                                 \n"
  3776     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3777     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3778     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3779     "sltu    %[t],        %[t1],       %[carry]    \n"
  3780     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3781     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3782     "or      %[carry],    %[carry],    %[t]        \n"
  3783     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3784     "daddu   %[t0],       %[t0],       %[lo]       \n"
  3785     "sltu    %[carry],    %[t0],       %[lo]       \n"
  3786     "daddu   %[t1],       %[t1],       %[carry]    \n"
  3787     "sltu    %[t],        %[t1],       %[carry]    \n"
  3788     "daddu   %[t1],       %[t1],       %[hi]       \n"
  3789     "sltu    %[carry],    %[t1],       %[hi]       \n"
  3790     "or      %[carry],    %[carry],    %[t]        \n"
  3791     "daddu   %[t2],       %[t2],       %[carry]    \n"
  3792     : [hi]"=&r"(hi), [lo]"=&r"(lo), [t0]"+r"(t0), [t1]"+r"(t1), [t2]"+r"(t2), [carry]"+r"(carry), [t]"+r"(t)
  3793     : [A]"r"(A), [B]"r"(B)
  3795   );
  3798 // Fast Montgomery multiplication.  The derivation of the algorithm is
  3799 // in  A Cryptographic Library for the Motorola DSP56000,
  3800 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
  3802 static void __attribute__((noinline))
  3803 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
  3804                     unsigned long m[], unsigned long inv, int len) {
  3805   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  3806   int i;
  3808   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  3810   for (i = 0; i < len; i++) {
  3811     int j;
  3812     for (j = 0; j < i; j++) {
  3813       MACC(a[j], b[i-j], t0, t1, t2);
  3814       MACC(m[j], n[i-j], t0, t1, t2);
  3816     MACC(a[i], b[0], t0, t1, t2);
  3817     m[i] = t0 * inv;
  3818     MACC(m[i], n[0], t0, t1, t2);
  3820     assert(t0 == 0, "broken Montgomery multiply");
  3822     t0 = t1; t1 = t2; t2 = 0;
  3825   for (i = len; i < 2*len; i++) {
  3826     int j;
  3827     for (j = i-len+1; j < len; j++) {
  3828       MACC(a[j], b[i-j], t0, t1, t2);
  3829       MACC(m[j], n[i-j], t0, t1, t2);
  3831     m[i-len] = t0;
  3832     t0 = t1; t1 = t2; t2 = 0;
  3835   while (t0)
  3836     t0 = sub(m, n, t0, len);
  3839 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
  3840 // multiplies so it should be up to 25% faster than Montgomery
  3841 // multiplication.  However, its loop control is more complex and it
  3842 // may actually run slower on some machines.
  3844 static void __attribute__((noinline))
  3845 montgomery_square(unsigned long a[], unsigned long n[],
  3846                   unsigned long m[], unsigned long inv, int len) {
  3847   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  3848   int i;
  3850   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  3852   for (i = 0; i < len; i++) {
  3853     int j;
  3854     int end = (i+1)/2;
  3855     for (j = 0; j < end; j++) {
  3856       MACC2(a[j], a[i-j], t0, t1, t2);
  3857       MACC(m[j], n[i-j], t0, t1, t2);
  3859     if ((i & 1) == 0) {
  3860       MACC(a[j], a[j], t0, t1, t2);
  3862     for (; j < i; j++) {
  3863       MACC(m[j], n[i-j], t0, t1, t2);
  3865     m[i] = t0 * inv;
  3866     MACC(m[i], n[0], t0, t1, t2);
  3868     assert(t0 == 0, "broken Montgomery square");
  3870     t0 = t1; t1 = t2; t2 = 0;
  3873   for (i = len; i < 2*len; i++) {
  3874     int start = i-len+1;
  3875     int end = start + (len - start)/2;
  3876     int j;
  3877     for (j = start; j < end; j++) {
  3878       MACC2(a[j], a[i-j], t0, t1, t2);
  3879       MACC(m[j], n[i-j], t0, t1, t2);
  3881     if ((i & 1) == 0) {
  3882       MACC(a[j], a[j], t0, t1, t2);
  3884     for (; j < len; j++) {
  3885       MACC(m[j], n[i-j], t0, t1, t2);
  3887     m[i-len] = t0;
  3888     t0 = t1; t1 = t2; t2 = 0;
  3891   while (t0)
  3892     t0 = sub(m, n, t0, len);
  3895 // Swap words in a longword.
  3896 static unsigned long swap(unsigned long x) {
  3897   return (x << 32) | (x >> 32);
  3900 // Copy len longwords from s to d, word-swapping as we go.  The
  3901 // destination array is reversed.
  3902 static void reverse_words(unsigned long *s, unsigned long *d, int len) {
  3903   d += len;
  3904   while(len-- > 0) {
  3905     d--;
  3906     *d = swap(*s);
  3907     s++;
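// Note (illustrative, added for clarity): reverse_words both reverses the
// array order and swaps the 32-bit halves of each word. For example, with
// len == 2 and s == { 0x0000000200000001, 0x0000000400000003 }, d becomes
// { 0x0000000300000004, 0x0000000100000002 }.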
  3911 // The threshold at which squaring is advantageous was determined
  3912 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
  3913 // Doesn't seem to be relevant for MIPS64 so we use the same value.
  3914 #define MONTGOMERY_SQUARING_THRESHOLD 64
  3916 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
  3917                                         jint len, jlong inv,
  3918                                         jint *m_ints) {
  3919   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
  3920   int longwords = len/2;
  3922   // Make very sure we don't use so much space that the stack might
  3923   // overflow.  512 jints corresponds to a 16384-bit integer and
  3924   // will use here a total of 8k bytes of stack space.
  3925   int total_allocation = longwords * sizeof (unsigned long) * 4;
  3926   guarantee(total_allocation <= 8192, "must be");
  3927   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  3929   // Local scratch arrays
  3930   unsigned long
  3931     *a = scratch + 0 * longwords,
  3932     *b = scratch + 1 * longwords,
  3933     *n = scratch + 2 * longwords,
  3934     *m = scratch + 3 * longwords;
  3936   reverse_words((unsigned long *)a_ints, a, longwords);
  3937   reverse_words((unsigned long *)b_ints, b, longwords);
  3938   reverse_words((unsigned long *)n_ints, n, longwords);
  3940   ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
  3942   reverse_words(m, (unsigned long *)m_ints, longwords);
  3945 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
  3946                                       jint len, jlong inv,
  3947                                       jint *m_ints) {
  3948   assert(len % 2 == 0, "array length in montgomery_square must be even");
  3949   int longwords = len/2;
  3951   // Make very sure we don't use so much space that the stack might
  3952   // overflow.  512 jints corresponds to a 16384-bit integer and
  3953   // will use here a total of 6k bytes of stack space.
  3954   int total_allocation = longwords * sizeof (unsigned long) * 3;
  3955   guarantee(total_allocation <= 8192, "must be");
  3956   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
  3958   // Local scratch arrays
  3959   unsigned long
  3960     *a = scratch + 0 * longwords,
  3961     *n = scratch + 1 * longwords,
  3962     *m = scratch + 2 * longwords;
  3964   reverse_words((unsigned long *)a_ints, a, longwords);
  3965   reverse_words((unsigned long *)n_ints, n, longwords);
  3967   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
  3968     ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
  3969   } else {
  3970     ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
  3973   reverse_words(m, (unsigned long *)m_ints, longwords);
