src/cpu/ppc/vm/macroAssembler_ppc.cpp

author:      phh
date:        Tue, 07 May 2019 20:38:26 +0000
changeset:   9669:32bc598624bd
parent:      9603:6ce4101edc7a
child:       9703:2fdf635bcf28
permissions: -rw-r--r--

8176100: [REDO][REDO] G1 Needs pre barrier on dereference of weak JNI handles
Summary: Add tag bit to all JNI weak handles
Reviewed-by: kbarrett, coleenp, tschatzl

     1 /*
     2  * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
     3  * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     5  *
     6  * This code is free software; you can redistribute it and/or modify it
     7  * under the terms of the GNU General Public License version 2 only, as
     8  * published by the Free Software Foundation.
     9  *
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    13  * version 2 for more details (a copy is included in the LICENSE file that
    14  * accompanied this code).
    15  *
    16  * You should have received a copy of the GNU General Public License version
    17  * 2 along with this work; if not, write to the Free Software Foundation,
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
    19  *
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
    21  * or visit www.oracle.com if you need additional information or have any
    22  * questions.
    23  *
    24  */
    26 #include "precompiled.hpp"
    27 #include "asm/macroAssembler.inline.hpp"
    28 #include "compiler/disassembler.hpp"
    29 #include "gc_interface/collectedHeap.inline.hpp"
    30 #include "interpreter/interpreter.hpp"
    31 #include "memory/cardTableModRefBS.hpp"
    32 #include "memory/resourceArea.hpp"
    33 #include "prims/methodHandles.hpp"
    34 #include "runtime/biasedLocking.hpp"
    35 #include "runtime/interfaceSupport.hpp"
    36 #include "runtime/objectMonitor.hpp"
    37 #include "runtime/os.hpp"
    38 #include "runtime/sharedRuntime.hpp"
    39 #include "runtime/stubRoutines.hpp"
    40 #include "utilities/macros.hpp"
    41 #if INCLUDE_ALL_GCS
    42 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
    43 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
    44 #include "gc_implementation/g1/heapRegion.hpp"
    45 #endif // INCLUDE_ALL_GCS
    47 #ifdef PRODUCT
    48 #define BLOCK_COMMENT(str) // nothing
    49 #else
    50 #define BLOCK_COMMENT(str) block_comment(str)
    51 #endif
    52 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
    54 #ifdef ASSERT
    55 // On RISC, there's no benefit to verifying instruction boundaries.
    56 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
    57 #endif
    59 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
    60   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
    61   if (Assembler::is_simm(si31, 16)) {
    62     ld(d, si31, a);
    63     if (emit_filler_nop) nop();
    64   } else {
    65     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    66     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    67     addis(d, a, hi);
    68     ld(d, lo, d);
    69   }
    70 }
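       // Illustrative example (assuming the usual @ha/@l style split performed
       // by largeoffset_si16_si16_hi/_lo): for si31 = 0x12345678, which does not
       // fit into a signed 16-bit displacement, the sequence above becomes
       //   addis d, a, 0x1234
       //   ld    d, 0x5678(d)
       // where (hi << 16) + (signed)lo == si31; hi is bumped by one whenever the
       // low half would be negative as a signed 16-bit value.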
    72 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
    73   assert_different_registers(d, a);
    74   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
    75 }
    77 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
    78                                       size_t size_in_bytes, bool is_signed) {
    79   switch (size_in_bytes) {
    80   case  8:              ld(dst, offs, base);                         break;
    81   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
    82   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
    83   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
    84   default:  ShouldNotReachHere();
    85   }
    86 }
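       // For example (illustrative, arbitrary register and offset): a signed
       // 2-byte load is emitted as lha, the unsigned variant as lhz:
       //   load_sized_value(R5, offs, base, 2, true);  // -> lha R5, offs(base)
       //   load_sized_value(R5, offs, base, 2, false); // -> lhz R5, offs(base)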
    88 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
    89                                        size_t size_in_bytes) {
    90   switch (size_in_bytes) {
    91   case  8:  std(dst, offs, base); break;
    92   case  4:  stw(dst, offs, base); break;
    93   case  2:  sth(dst, offs, base); break;
    94   case  1:  stb(dst, offs, base); break;
    95   default:  ShouldNotReachHere();
    96   }
    97 }
    99 void MacroAssembler::align(int modulus, int max, int rem) {
   100   int padding = (rem + modulus - (offset() % modulus)) % modulus;
   101   if (padding > max) return;
   102   for (int c = (padding >> 2); c > 0; --c) { nop(); }
   103 }
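       // Example (illustrative): with modulus 16, rem 0 and offset() == 8 mod 16,
       // the padding works out to 8 bytes, i.e. two nops are emitted; if the
       // required padding exceeded max, nothing would be emitted at all.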
   105 // Issue instructions that calculate given TOC from global TOC.
   106 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
   107                                                        bool add_relocation, bool emit_dummy_addr) {
   108   int offset = -1;
   109   if (emit_dummy_addr) {
   110     offset = -128; // dummy address
   111   } else if (addr != (address)(intptr_t)-1) {
   112     offset = MacroAssembler::offset_to_global_toc(addr);
   113   }
   115   if (hi16) {
   116     addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset));
   117   }
   118   if (lo16) {
   119     if (add_relocation) {
   120       // Relocate at the addi to avoid confusion with a load from the method's TOC.
   121       relocate(internal_word_Relocation::spec(addr));
   122     }
   123     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
   124   }
   125 }
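       // With both hi16 and lo16 requested, the emitted sequence is (sketch):
       //   addis dst, R29, offset@ha   // R29 holds the global TOC
       //   addi  dst, dst, offset@l
       // where offset is the distance of addr from the global TOC; the optional
       // relocation is attached to the addi.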
   127 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
   128   const int offset = MacroAssembler::offset_to_global_toc(addr);
   130   const address inst2_addr = a;
   131   const int inst2 = *(int *)inst2_addr;
   133   // The relocation points to the second instruction, the addi,
   134   // and the addi reads and writes the same register dst.
   135   const int dst = inv_rt_field(inst2);
   136   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
   138   // Now, find the preceding addis which writes to dst.
   139   int inst1 = 0;
   140   address inst1_addr = inst2_addr - BytesPerInstWord;
   141   while (inst1_addr >= bound) {
   142     inst1 = *(int *) inst1_addr;
   143     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
   144       // Stop, found the addis which writes dst.
   145       break;
   146     }
   147     inst1_addr -= BytesPerInstWord;
   148   }
   150   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
   151   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
   152   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
   153   return (int)((intptr_t)addr - (intptr_t)inst1_addr);
   154 }
   156 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
   157   const address inst2_addr = a;
   158   const int inst2 = *(int *)inst2_addr;
   160   // The relocation points to the second instruction, the addi,
   161   // and the addi reads and writes the same register dst.
   162   const int dst = inv_rt_field(inst2);
   163   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
   165   // Now, find the preceding addis which writes to dst.
   166   int inst1 = 0;
   167   address inst1_addr = inst2_addr - BytesPerInstWord;
   168   while (inst1_addr >= bound) {
   169     inst1 = *(int *) inst1_addr;
   170     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
   171       // stop, found the addis which writes dst
   172       break;
   173     }
   174     inst1_addr -= BytesPerInstWord;
   175   }
   177   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
   179   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
   180   // -1 is a special case
   181   if (offset == -1) {
   182     return (address)(intptr_t)-1;
   183   } else {
   184     return global_toc() + offset;
   185   }
   186 }
   188 #ifdef _LP64
   189 // Patch compressed oops or klass constants.
   190 // Assembler sequence is
   191 // 1) compressed oops:
   192 //    lis  rx = const.hi
   193 //    ori rx = rx | const.lo
   194 // 2) compressed klass:
   195 //    lis  rx = const.hi
   196 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
   197 //    ori rx = rx | const.lo
    198 // A clrldi, if present, is simply skipped over.
   199 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
   200   assert(UseCompressedOops, "Should only patch compressed oops");
   202   const address inst2_addr = a;
   203   const int inst2 = *(int *)inst2_addr;
   205   // The relocation points to the second instruction, the ori,
   206   // and the ori reads and writes the same register dst.
   207   const int dst = inv_rta_field(inst2);
   208   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
   209   // Now, find the preceding addis which writes to dst.
   210   int inst1 = 0;
   211   address inst1_addr = inst2_addr - BytesPerInstWord;
   212   bool inst1_found = false;
   213   while (inst1_addr >= bound) {
   214     inst1 = *(int *)inst1_addr;
   215     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
   216     inst1_addr -= BytesPerInstWord;
   217   }
   218   assert(inst1_found, "inst is not lis");
   220   int xc = (data >> 16) & 0xffff;
   221   int xd = (data >>  0) & 0xffff;
   223   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
   224   set_imm((int *)inst2_addr,        (xd)); // unsigned int
   225   return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
   226 }
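       // Illustrative patch: for data == 0x12345678 the two halves are
       //   xc = 0x1234  -> written into the lis immediate
       //   xd = 0x5678  -> written into the ori immediate
       // so the patched sequence loads the compressed value:
       //   lis rx, 0x1234
       //   ori rx, rx, 0x5678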
   228 // Get compressed oop or klass constant.
   229 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
   230   assert(UseCompressedOops, "Should only patch compressed oops");
   232   const address inst2_addr = a;
   233   const int inst2 = *(int *)inst2_addr;
   235   // The relocation points to the second instruction, the ori,
   236   // and the ori reads and writes the same register dst.
   237   const int dst = inv_rta_field(inst2);
   238   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
   239   // Now, find the preceding lis which writes to dst.
   240   int inst1 = 0;
   241   address inst1_addr = inst2_addr - BytesPerInstWord;
   242   bool inst1_found = false;
   244   while (inst1_addr >= bound) {
   245     inst1 = *(int *) inst1_addr;
   246     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
   247     inst1_addr -= BytesPerInstWord;
   248   }
   249   assert(inst1_found, "inst is not lis");
   251   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
   252   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
   254   return (int) (xl | xh);
   255 }
   256 #endif // _LP64
   258 void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) {
   259   int toc_offset = 0;
   260   // Use RelocationHolder::none for the constant pool entry, otherwise
   261   // we will end up with a failing NativeCall::verify(x) where x is
   262   // the address of the constant pool entry.
   263   // FIXME: We should insert relocation information for oops at the constant
   264   // pool entries instead of inserting it at the loads; patching of a constant
   265   // pool entry should be less expensive.
   266   address oop_address = address_constant((address)a.value(), RelocationHolder::none);
   267   // Relocate at the pc of the load.
   268   relocate(a.rspec());
   269   toc_offset = (int)(oop_address - code()->consts()->start());
   270   ld_largeoffset_unchecked(dst, toc_offset, toc, true);
   271 }
   273 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
   274   const address inst1_addr = a;
   275   const int inst1 = *(int *)inst1_addr;
   277    // The relocation points to the ld or the addis.
   278    return (is_ld(inst1)) ||
   279           (is_addis(inst1) && inv_ra_field(inst1) != 0);
   280 }
   282 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
   283   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
   285   const address inst1_addr = a;
   286   const int inst1 = *(int *)inst1_addr;
   288   if (is_ld(inst1)) {
   289     return inv_d1_field(inst1);
   290   } else if (is_addis(inst1)) {
   291     const int dst = inv_rt_field(inst1);
   293     // Now, find the succeeding ld which reads and writes to dst.
   294     address inst2_addr = inst1_addr + BytesPerInstWord;
   295     int inst2 = 0;
   296     while (true) {
   297       inst2 = *(int *) inst2_addr;
   298       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
   299         // Stop, found the ld which reads and writes dst.
   300         break;
   301       }
   302       inst2_addr += BytesPerInstWord;
   303     }
   304     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
   305   }
   306   ShouldNotReachHere();
   307   return 0;
   308 }
   310 // Get the constant from a `load_const' sequence.
   311 long MacroAssembler::get_const(address a) {
   312   assert(is_load_const_at(a), "not a load of a constant");
   313   const int *p = (const int*) a;
   314   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
   315   if (is_ori(*(p+1))) {
   316     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
   317     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
   318     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
   319   } else if (is_lis(*(p+1))) {
   320     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
   321     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
   322     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
   323   } else {
   324     ShouldNotReachHere();
   325     return (long) 0;
   326   }
   327   return (long) x;
   328 }
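       // The two accepted shapes (sketch, matching the immediate slots read
       // above): the single-register form
       //   lis d, hi16(hi32); ori d, d, lo16(hi32); <shift left 32>;
       //   oris d, d, hi16(lo32); ori d, d, lo16(lo32)
       // keeps its immediates in slots 0, 1, 3 and 4, while the variant whose
       // second instruction is another lis spreads them over slots 0, 2, 1 and 3.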
   330 // Patch the 64 bit constant of a `load_const' sequence. This is a low
   331 // level procedure. It neither flushes the instruction cache nor is it
   332 // mt safe.
   333 void MacroAssembler::patch_const(address a, long x) {
   334   assert(is_load_const_at(a), "not a load of a constant");
   335   int *p = (int*) a;
   336   if (is_ori(*(p+1))) {
   337     set_imm(0 + p, (x >> 48) & 0xffff);
   338     set_imm(1 + p, (x >> 32) & 0xffff);
   339     set_imm(3 + p, (x >> 16) & 0xffff);
   340     set_imm(4 + p, x & 0xffff);
   341   } else if (is_lis(*(p+1))) {
   342     set_imm(0 + p, (x >> 48) & 0xffff);
   343     set_imm(2 + p, (x >> 32) & 0xffff);
   344     set_imm(1 + p, (x >> 16) & 0xffff);
   345     set_imm(3 + p, x & 0xffff);
   346   } else {
   347     ShouldNotReachHere();
   348   }
   349 }
   351 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
   352   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
   353   int index = oop_recorder()->allocate_metadata_index(obj);
   354   RelocationHolder rspec = metadata_Relocation::spec(index);
   355   return AddressLiteral((address)obj, rspec);
   356 }
   358 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
   359   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
   360   int index = oop_recorder()->find_index(obj);
   361   RelocationHolder rspec = metadata_Relocation::spec(index);
   362   return AddressLiteral((address)obj, rspec);
   363 }
   365 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
   366   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
   367   int oop_index = oop_recorder()->allocate_oop_index(obj);
   368   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
   369 }
   371 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
   372   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
   373   int oop_index = oop_recorder()->find_index(obj);
   374   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
   375 }
   377 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
   378                                                       Register tmp, int offset) {
   379   intptr_t value = *delayed_value_addr;
   380   if (value != 0) {
   381     return RegisterOrConstant(value + offset);
   382   }
   384   // Load indirectly to solve generation ordering problem.
   385   // static address, no relocation
   386   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
   387   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
   389   if (offset != 0) {
   390     addi(tmp, tmp, offset);
   391   }
   393   return RegisterOrConstant(tmp);
   394 }
   396 #ifndef PRODUCT
   397 void MacroAssembler::pd_print_patched_instruction(address branch) {
   398   Unimplemented(); // TODO: PPC port
   399 }
   400 #endif // ndef PRODUCT
   402 // Conditional far branch for destinations encodable in 24+2 bits.
   403 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
   405   // If requested by flag optimize, relocate the bc_far as a
   406   // runtime_call and prepare for optimizing it when the code gets
   407   // relocated.
   408   if (optimize == bc_far_optimize_on_relocate) {
   409     relocate(relocInfo::runtime_call_type);
   410   }
   412   // variant 2:
   413   //
   414   //    b!cxx SKIP
   415   //    bxx   DEST
   416   //  SKIP:
   417   //
   419   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
   420                                                 opposite_bcond(inv_boint_bcond(boint)));
   422   // We emit two branches.
   423   // First, a conditional branch which jumps around the far branch.
   424   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
   425   const address bc_pc        = pc();
   426   bc(opposite_boint, biint, not_taken_pc);
   428   const int bc_instr = *(int*)bc_pc;
   429   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
   430   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
   431   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
   432                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
   433          "postcondition");
   434   assert(biint == inv_bi_field(bc_instr), "postcondition");
   436   // Second, an unconditional far branch which jumps to dest.
   437   // Note: target(dest) remembers the current pc (see CodeSection::target)
   438   //       and returns the current pc if the label is not bound yet; when
   439   //       the label gets bound, the unconditional far branch will be patched.
   440   const address target_pc = target(dest);
   441   const address b_pc  = pc();
   442   b(target_pc);
   444   assert(not_taken_pc == pc(),                     "postcondition");
   445   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
   446 }
   448 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
   449   return is_bc_far_variant1_at(instruction_addr) ||
   450          is_bc_far_variant2_at(instruction_addr) ||
   451          is_bc_far_variant3_at(instruction_addr);
   452 }
   454 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
   455   if (is_bc_far_variant1_at(instruction_addr)) {
   456     const address instruction_1_addr = instruction_addr;
   457     const int instruction_1 = *(int*)instruction_1_addr;
   458     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
   459   } else if (is_bc_far_variant2_at(instruction_addr)) {
   460     const address instruction_2_addr = instruction_addr + 4;
   461     return bxx_destination(instruction_2_addr);
   462   } else if (is_bc_far_variant3_at(instruction_addr)) {
   463     return instruction_addr + 8;
   464   }
   465   // variant 4 ???
   466   ShouldNotReachHere();
   467   return NULL;
   468 }
   469 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
   471   if (is_bc_far_variant3_at(instruction_addr)) {
   472     // variant 3, far cond branch to the next instruction, already patched to nops:
   473     //
   474     //    nop
   475     //    endgroup
   476     //  SKIP/DEST:
   477     //
   478     return;
   479   }
   481   // first, extract boint and biint from the current branch
   482   int boint = 0;
   483   int biint = 0;
   485   ResourceMark rm;
   486   const int code_size = 2 * BytesPerInstWord;
   487   CodeBuffer buf(instruction_addr, code_size);
   488   MacroAssembler masm(&buf);
   489   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
   490     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
   491     masm.nop();
   492     masm.endgroup();
   493   } else {
   494     if (is_bc_far_variant1_at(instruction_addr)) {
   495       // variant 1, the 1st instruction contains the destination address:
   496       //
   497       //    bcxx  DEST
   498       //    endgroup
   499       //
   500       const int instruction_1 = *(int*)(instruction_addr);
   501       boint = inv_bo_field(instruction_1);
   502       biint = inv_bi_field(instruction_1);
   503     } else if (is_bc_far_variant2_at(instruction_addr)) {
   504       // variant 2, the 2nd instruction contains the destination address:
   505       //
   506       //    b!cxx SKIP
   507       //    bxx   DEST
   508       //  SKIP:
   509       //
   510       const int instruction_1 = *(int*)(instruction_addr);
   511       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
   512           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
   513       biint = inv_bi_field(instruction_1);
   514     } else {
   515       // variant 4???
   516       ShouldNotReachHere();
   517     }
   519     // second, set the new branch destination and optimize the code
   520     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
   521         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
   522       // variant 1:
   523       //
   524       //    bcxx  DEST
   525       //    endgroup
   526       //
   527       masm.bc(boint, biint, dest);
   528       masm.endgroup();
   529     } else {
   530       // variant 2:
   531       //
   532       //    b!cxx SKIP
   533       //    bxx   DEST
   534       //  SKIP:
   535       //
   536       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
   537                                                     opposite_bcond(inv_boint_bcond(boint)));
   538       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
   539       masm.bc(opposite_boint, biint, not_taken_pc);
   540       masm.b(dest);
   541     }
   542   }
   543   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
   544 }
   546 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
   547 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
   548   // get current pc
   549   uint64_t start_pc = (uint64_t) pc();
   551   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
   552   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
   554   // relocate here
   555   if (rt != relocInfo::none) {
   556     relocate(rt);
   557   }
   559   if ( ReoptimizeCallSequences &&
   560        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
   561         (!link && is_within_range_of_b(dest, pc_of_b)))) {
   562     // variant 2:
   563     // Emit an optimized, pc-relative call/jump.
   565     if (link) {
   566       // some padding
   567       nop();
   568       nop();
   569       nop();
   570       nop();
   571       nop();
   572       nop();
   574       // do the call
   575       assert(pc() == pc_of_bl, "just checking");
   576       bl(dest, relocInfo::none);
   577     } else {
   578       // do the jump
   579       assert(pc() == pc_of_b, "just checking");
   580       b(dest, relocInfo::none);
   582       // some padding
   583       nop();
   584       nop();
   585       nop();
   586       nop();
   587       nop();
   588       nop();
   589     }
   591     // Assert that we can identify the emitted call/jump.
   592     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
   593            "can't identify emitted call");
   594   } else {
   595     // variant 1:
   596     mr(R0, R11);  // spill R11 -> R0.
   598     // Load the destination address into CTR,
   599     // calculate destination relative to global toc.
   600     calculate_address_from_global_toc(R11, dest, true, true, false);
   602     mtctr(R11);
   603     mr(R11, R0);  // spill R11 <- R0.
   604     nop();
   606     // do the call/jump
   607     if (link) {
   608       bctrl();
    609     } else {
   610       bctr();
   611     }
   612     // Assert that we can identify the emitted call/jump.
   613     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
   614            "can't identify emitted call");
   615   }
   617   // Assert that we can identify the emitted call/jump.
   618   assert(is_bxx64_patchable_at((address)start_pc, link),
   619          "can't identify emitted call");
   620   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
   621          "wrong encoding of dest address");
   622 }
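       // Resulting layouts, each 7 instructions long (sketch):
       //   variant 1b: mr R0,R11; addis R11,R29,..; addi R11,R11,..;
       //               mtctr R11; mr R11,R0; nop; bctr[l]
       //   variant 2:  bl dest preceded by six nops (link), or
       //               b dest followed by six nops (no link)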
   624 // Identify a bxx64_patchable instruction.
   625 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
   626   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
   627     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
   628       || is_bxx64_patchable_variant2_at(instruction_addr, link);
   629 }
   631 // Does the call64_patchable instruction use a pc-relative encoding of
   632 // the call destination?
   633 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
   634   // variant 2 is pc-relative
   635   return is_bxx64_patchable_variant2_at(instruction_addr, link);
   636 }
   638 // Identify variant 1.
   639 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
   640   unsigned int* instr = (unsigned int*) instruction_addr;
   641   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
   642       && is_mtctr(instr[5]) // mtctr
   643     && is_load_const_at(instruction_addr);
   644 }
   646 // Identify variant 1b: load destination relative to global toc.
   647 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
   648   unsigned int* instr = (unsigned int*) instruction_addr;
   649   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
   650     && is_mtctr(instr[3]) // mtctr
   651     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
   652 }
   654 // Identify variant 2.
   655 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
   656   unsigned int* instr = (unsigned int*) instruction_addr;
   657   if (link) {
   658     return is_bl (instr[6])  // bl dest is last
   659       && is_nop(instr[0])  // nop
   660       && is_nop(instr[1])  // nop
   661       && is_nop(instr[2])  // nop
   662       && is_nop(instr[3])  // nop
   663       && is_nop(instr[4])  // nop
   664       && is_nop(instr[5]); // nop
   665   } else {
   666     return is_b  (instr[0])  // b  dest is first
   667       && is_nop(instr[1])  // nop
   668       && is_nop(instr[2])  // nop
   669       && is_nop(instr[3])  // nop
   670       && is_nop(instr[4])  // nop
   671       && is_nop(instr[5])  // nop
   672       && is_nop(instr[6]); // nop
   673   }
   674 }
   676 // Set dest address of a bxx64_patchable instruction.
   677 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
   678   ResourceMark rm;
   679   int code_size = MacroAssembler::bxx64_patchable_size;
   680   CodeBuffer buf(instruction_addr, code_size);
   681   MacroAssembler masm(&buf);
   682   masm.bxx64_patchable(dest, relocInfo::none, link);
   683   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
   684 }
   686 // Get dest address of a bxx64_patchable instruction.
   687 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
   688   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
   689     return (address) (unsigned long) get_const(instruction_addr);
   690   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
   691     unsigned int* instr = (unsigned int*) instruction_addr;
   692     if (link) {
   693       const int instr_idx = 6; // bl is last
   694       int branchoffset = branch_destination(instr[instr_idx], 0);
   695       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
   696     } else {
   697       const int instr_idx = 0; // b is first
   698       int branchoffset = branch_destination(instr[instr_idx], 0);
   699       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
   700     }
   701   // Load dest relative to global toc.
   702   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
   703     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
   704                                                                instruction_addr);
   705   } else {
   706     ShouldNotReachHere();
   707     return NULL;
   708   }
   709 }
   711 // Uses ordering which corresponds to ABI:
   712 //    _savegpr0_14:  std  r14,-144(r1)
   713 //    _savegpr0_15:  std  r15,-136(r1)
   714 //    _savegpr0_16:  std  r16,-128(r1)
   715 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
   716   std(R14, offset, dst);   offset += 8;
   717   std(R15, offset, dst);   offset += 8;
   718   std(R16, offset, dst);   offset += 8;
   719   std(R17, offset, dst);   offset += 8;
   720   std(R18, offset, dst);   offset += 8;
   721   std(R19, offset, dst);   offset += 8;
   722   std(R20, offset, dst);   offset += 8;
   723   std(R21, offset, dst);   offset += 8;
   724   std(R22, offset, dst);   offset += 8;
   725   std(R23, offset, dst);   offset += 8;
   726   std(R24, offset, dst);   offset += 8;
   727   std(R25, offset, dst);   offset += 8;
   728   std(R26, offset, dst);   offset += 8;
   729   std(R27, offset, dst);   offset += 8;
   730   std(R28, offset, dst);   offset += 8;
   731   std(R29, offset, dst);   offset += 8;
   732   std(R30, offset, dst);   offset += 8;
   733   std(R31, offset, dst);   offset += 8;
   735   stfd(F14, offset, dst);   offset += 8;
   736   stfd(F15, offset, dst);   offset += 8;
   737   stfd(F16, offset, dst);   offset += 8;
   738   stfd(F17, offset, dst);   offset += 8;
   739   stfd(F18, offset, dst);   offset += 8;
   740   stfd(F19, offset, dst);   offset += 8;
   741   stfd(F20, offset, dst);   offset += 8;
   742   stfd(F21, offset, dst);   offset += 8;
   743   stfd(F22, offset, dst);   offset += 8;
   744   stfd(F23, offset, dst);   offset += 8;
   745   stfd(F24, offset, dst);   offset += 8;
   746   stfd(F25, offset, dst);   offset += 8;
   747   stfd(F26, offset, dst);   offset += 8;
   748   stfd(F27, offset, dst);   offset += 8;
   749   stfd(F28, offset, dst);   offset += 8;
   750   stfd(F29, offset, dst);   offset += 8;
   751   stfd(F30, offset, dst);   offset += 8;
   752   stfd(F31, offset, dst);
   753 }
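       // The save area written above covers R14-R31 and F14-F31, i.e.
       // 2 * 18 * 8 = 288 bytes starting at `offset' (arithmetic derived from
       // the stores above; restore_nonvolatile_gprs below reads the same layout).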
   755 // Uses ordering which corresponds to ABI:
   756 //    _restgpr0_14:  ld   r14,-144(r1)
   757 //    _restgpr0_15:  ld   r15,-136(r1)
   758 //    _restgpr0_16:  ld   r16,-128(r1)
   759 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
   760   ld(R14, offset, src);   offset += 8;
   761   ld(R15, offset, src);   offset += 8;
   762   ld(R16, offset, src);   offset += 8;
   763   ld(R17, offset, src);   offset += 8;
   764   ld(R18, offset, src);   offset += 8;
   765   ld(R19, offset, src);   offset += 8;
   766   ld(R20, offset, src);   offset += 8;
   767   ld(R21, offset, src);   offset += 8;
   768   ld(R22, offset, src);   offset += 8;
   769   ld(R23, offset, src);   offset += 8;
   770   ld(R24, offset, src);   offset += 8;
   771   ld(R25, offset, src);   offset += 8;
   772   ld(R26, offset, src);   offset += 8;
   773   ld(R27, offset, src);   offset += 8;
   774   ld(R28, offset, src);   offset += 8;
   775   ld(R29, offset, src);   offset += 8;
   776   ld(R30, offset, src);   offset += 8;
   777   ld(R31, offset, src);   offset += 8;
   779   // FP registers
   780   lfd(F14, offset, src);   offset += 8;
   781   lfd(F15, offset, src);   offset += 8;
   782   lfd(F16, offset, src);   offset += 8;
   783   lfd(F17, offset, src);   offset += 8;
   784   lfd(F18, offset, src);   offset += 8;
   785   lfd(F19, offset, src);   offset += 8;
   786   lfd(F20, offset, src);   offset += 8;
   787   lfd(F21, offset, src);   offset += 8;
   788   lfd(F22, offset, src);   offset += 8;
   789   lfd(F23, offset, src);   offset += 8;
   790   lfd(F24, offset, src);   offset += 8;
   791   lfd(F25, offset, src);   offset += 8;
   792   lfd(F26, offset, src);   offset += 8;
   793   lfd(F27, offset, src);   offset += 8;
   794   lfd(F28, offset, src);   offset += 8;
   795   lfd(F29, offset, src);   offset += 8;
   796   lfd(F30, offset, src);   offset += 8;
   797   lfd(F31, offset, src);
   798 }
   800 // For verify_oops.
   801 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
   802   std(R2,  offset, dst);   offset += 8;
   803   std(R3,  offset, dst);   offset += 8;
   804   std(R4,  offset, dst);   offset += 8;
   805   std(R5,  offset, dst);   offset += 8;
   806   std(R6,  offset, dst);   offset += 8;
   807   std(R7,  offset, dst);   offset += 8;
   808   std(R8,  offset, dst);   offset += 8;
   809   std(R9,  offset, dst);   offset += 8;
   810   std(R10, offset, dst);   offset += 8;
   811   std(R11, offset, dst);   offset += 8;
   812   std(R12, offset, dst);
   813 }
   815 // For verify_oops.
   816 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
   817   ld(R2,  offset, src);   offset += 8;
   818   ld(R3,  offset, src);   offset += 8;
   819   ld(R4,  offset, src);   offset += 8;
   820   ld(R5,  offset, src);   offset += 8;
   821   ld(R6,  offset, src);   offset += 8;
   822   ld(R7,  offset, src);   offset += 8;
   823   ld(R8,  offset, src);   offset += 8;
   824   ld(R9,  offset, src);   offset += 8;
   825   ld(R10, offset, src);   offset += 8;
   826   ld(R11, offset, src);   offset += 8;
   827   ld(R12, offset, src);
   828 }
   830 void MacroAssembler::save_LR_CR(Register tmp) {
   831   mfcr(tmp);
   832   std(tmp, _abi(cr), R1_SP);
   833   mflr(tmp);
   834   std(tmp, _abi(lr), R1_SP);
   835   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
   836 }
   838 void MacroAssembler::restore_LR_CR(Register tmp) {
   839   assert(tmp != R1_SP, "must be distinct");
   840   ld(tmp, _abi(lr), R1_SP);
   841   mtlr(tmp);
   842   ld(tmp, _abi(cr), R1_SP);
   843   mtcr(tmp);
   844 }
   846 address MacroAssembler::get_PC_trash_LR(Register result) {
   847   Label L;
   848   bl(L);
   849   bind(L);
   850   address lr_pc = pc();
   851   mflr(result);
   852   return lr_pc;
   853 }
   855 void MacroAssembler::resize_frame(Register offset, Register tmp) {
   856 #ifdef ASSERT
   857   assert_different_registers(offset, tmp, R1_SP);
   858   andi_(tmp, offset, frame::alignment_in_bytes-1);
   859   asm_assert_eq("resize_frame: unaligned", 0x204);
   860 #endif
   862   // tmp <- *(SP)
   863   ld(tmp, _abi(callers_sp), R1_SP);
   864   // addr <- SP + offset;
   865   // *(addr) <- tmp;
   866   // SP <- addr
   867   stdux(tmp, R1_SP, offset);
   868 }
   870 void MacroAssembler::resize_frame(int offset, Register tmp) {
   871   assert(is_simm(offset, 16), "too big an offset");
   872   assert_different_registers(tmp, R1_SP);
   873   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
   874   // tmp <- *(SP)
   875   ld(tmp, _abi(callers_sp), R1_SP);
   876   // addr <- SP + offset;
   877   // *(addr) <- tmp;
   878   // SP <- addr
   879   stdu(tmp, offset, R1_SP);
   880 }
   882 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
   883   // (addr == tmp1) || (addr == tmp2) is allowed here!
   884   assert(tmp1 != tmp2, "must be distinct");
   886   // compute offset w.r.t. current stack pointer
   887   // tmp_1 <- addr - SP (!)
   888   subf(tmp1, R1_SP, addr);
   890   // atomically update SP keeping back link.
   891   resize_frame(tmp1/* offset */, tmp2/* tmp */);
   892 }
   894 void MacroAssembler::push_frame(Register bytes, Register tmp) {
   895 #ifdef ASSERT
   896   assert(bytes != R0, "r0 not allowed here");
   897   andi_(R0, bytes, frame::alignment_in_bytes-1);
   898   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
   899 #endif
   900   neg(tmp, bytes);
   901   stdux(R1_SP, R1_SP, tmp);
   902 }
   904 // Push a frame of size `bytes'.
   905 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
   906   long offset = align_addr(bytes, frame::alignment_in_bytes);
   907   if (is_simm(-offset, 16)) {
   908     stdu(R1_SP, -offset, R1_SP);
   909   } else {
   910     load_const(tmp, -offset);
   911     stdux(R1_SP, R1_SP, tmp);
   912   }
   913 }
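       // Example (assuming frame::alignment_in_bytes == 16): push_frame(104, tmp)
       // rounds the size up to 112 and emits
       //   stdu R1_SP, -112(R1_SP)
       // which allocates the frame and stores the back link in a single step.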
   915 // Push a frame of size `bytes' plus abi_reg_args on top.
   916 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
   917   push_frame(bytes + frame::abi_reg_args_size, tmp);
   918 }
    920 // Set up a new C frame with a spill area for non-volatile GPRs and
   921 // additional space for local variables.
   922 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
   923                                                       Register tmp) {
   924   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
   925 }
   927 // Pop current C frame.
   928 void MacroAssembler::pop_frame() {
   929   ld(R1_SP, _abi(callers_sp), R1_SP);
   930 }
   932 #if defined(ABI_ELFv2)
   933 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
   934   // TODO(asmundak): make sure the caller uses R12 as function descriptor
    935   // most of the time.
   936   if (R12 != r_function_entry) {
   937     mr(R12, r_function_entry);
   938   }
   939   mtctr(R12);
   940   // Do a call or a branch.
   941   if (and_link) {
   942     bctrl();
   943   } else {
   944     bctr();
   945   }
   946   _last_calls_return_pc = pc();
   948   return _last_calls_return_pc;
   949 }
   951 // Call a C function via a function descriptor and use full C
   952 // calling conventions. Updates and returns _last_calls_return_pc.
   953 address MacroAssembler::call_c(Register r_function_entry) {
   954   return branch_to(r_function_entry, /*and_link=*/true);
   955 }
   957 // For tail calls: only branch, don't link, so callee returns to caller of this function.
   958 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
   959   return branch_to(r_function_entry, /*and_link=*/false);
   960 }
   962 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
   963   load_const(R12, function_entry, R0);
   964   return branch_to(R12,  /*and_link=*/true);
   965 }
   967 #else
   968 // Generic version of a call to C function via a function descriptor
   969 // with variable support for C calling conventions (TOC, ENV, etc.).
   970 // Updates and returns _last_calls_return_pc.
   971 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
   972                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
   973   // we emit standard ptrgl glue code here
   974   assert((function_descriptor != R0), "function_descriptor cannot be R0");
   976   // retrieve necessary entries from the function descriptor
   977   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
   978   mtctr(R0);
   980   if (load_toc_of_callee) {
   981     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
   982   }
   983   if (load_env_of_callee) {
   984     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
   985   } else if (load_toc_of_callee) {
   986     li(R11, 0);
   987   }
   989   // do a call or a branch
   990   if (and_link) {
   991     bctrl();
   992   } else {
   993     bctr();
   994   }
   995   _last_calls_return_pc = pc();
   997   return _last_calls_return_pc;
   998 }
  1000 // Call a C function via a function descriptor and use full C calling
  1001 // conventions.
  1002 // We don't use the TOC in generated code, so there is no need to save
  1003 // and restore its value.
  1004 address MacroAssembler::call_c(Register fd) {
  1005   return branch_to(fd, /*and_link=*/true,
  1006                        /*save toc=*/false,
  1007                        /*restore toc=*/false,
  1008                        /*load toc=*/true,
   1009                        /*load env=*/true);
   1010 }
  1012 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  1013   return branch_to(fd, /*and_link=*/false,
  1014                        /*save toc=*/false,
  1015                        /*restore toc=*/false,
  1016                        /*load toc=*/true,
   1017                        /*load env=*/true);
   1018 }
  1020 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  1021   if (rt != relocInfo::none) {
  1022     // this call needs to be relocatable
  1023     if (!ReoptimizeCallSequences
  1024         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
  1025         || fd == NULL   // support code-size estimation
  1026         || !fd->is_friend_function()
  1027         || fd->entry() == NULL) {
  1028       // it's not a friend function as defined by class FunctionDescriptor,
  1029       // so do a full call-c here.
  1030       load_const(R11, (address)fd, R0);
  1032       bool has_env = (fd != NULL && fd->env() != NULL);
  1033       return branch_to(R11, /*and_link=*/true,
  1034                             /*save toc=*/false,
  1035                             /*restore toc=*/false,
  1036                             /*load toc=*/true,
  1037                             /*load env=*/has_env);
  1038     } else {
  1039       // It's a friend function. Load the entry point and don't care about
  1040       // toc and env. Use an optimizable call instruction, but ensure the
  1041       // same code-size as in the case of a non-friend function.
  1042       nop();
  1043       nop();
  1044       nop();
  1045       bl64_patchable(fd->entry(), rt);
  1046       _last_calls_return_pc = pc();
   1047       return _last_calls_return_pc;
   1048     }
  1049   } else {
  1050     // This call does not need to be relocatable, do more aggressive
  1051     // optimizations.
  1052     if (!ReoptimizeCallSequences
  1053       || !fd->is_friend_function()) {
  1054       // It's not a friend function as defined by class FunctionDescriptor,
  1055       // so do a full call-c here.
  1056       load_const(R11, (address)fd, R0);
  1057       return branch_to(R11, /*and_link=*/true,
  1058                             /*save toc=*/false,
  1059                             /*restore toc=*/false,
  1060                             /*load toc=*/true,
  1061                             /*load env=*/true);
  1062     } else {
  1063       // it's a friend function, load the entry point and don't care about
  1064       // toc and env.
  1065       address dest = fd->entry();
  1066       if (is_within_range_of_b(dest, pc())) {
  1067         bl(dest);
  1068       } else {
   1069         bl64_patchable(dest, rt);
   1070       }
   1071       _last_calls_return_pc = pc();
   1072       return _last_calls_return_pc;
   1073     }
   1074   }
   1075 }
  1077 // Call a C function.  All constants needed reside in TOC.
  1078 //
  1079 // Read the address to call from the TOC.
  1080 // Read env from TOC, if fd specifies an env.
  1081 // Read new TOC from TOC.
  1082 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
  1083                                          relocInfo::relocType rt, Register toc) {
  1084   if (!ReoptimizeCallSequences
  1085     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
  1086     || !fd->is_friend_function()) {
  1087     // It's not a friend function as defined by class FunctionDescriptor,
  1088     // so do a full call-c here.
  1089     assert(fd->entry() != NULL, "function must be linked");
  1091     AddressLiteral fd_entry(fd->entry());
  1092     load_const_from_method_toc(R11, fd_entry, toc);
  1093     mtctr(R11);
  1094     if (fd->env() == NULL) {
  1095       li(R11, 0);
  1096       nop();
  1097     } else {
  1098       AddressLiteral fd_env(fd->env());
   1099       load_const_from_method_toc(R11, fd_env, toc);
   1100     }
  1101     AddressLiteral fd_toc(fd->toc());
  1102     load_toc_from_toc(R2_TOC, fd_toc, toc);
  1103     // R2_TOC is killed.
  1104     bctrl();
  1105     _last_calls_return_pc = pc();
  1106   } else {
  1107     // It's a friend function, load the entry point and don't care about
  1108     // toc and env. Use an optimizable call instruction, but ensure the
  1109     // same code-size as in the case of a non-friend function.
  1110     nop();
  1111     bl64_patchable(fd->entry(), rt);
   1112     _last_calls_return_pc = pc();
   1113   }
   1114   return _last_calls_return_pc;
   1115 }
  1116 #endif // ABI_ELFv2
  1118 void MacroAssembler::call_VM_base(Register oop_result,
  1119                                   Register last_java_sp,
  1120                                   address  entry_point,
  1121                                   bool     check_exceptions) {
  1122   BLOCK_COMMENT("call_VM {");
  1123   // Determine last_java_sp register.
  1124   if (!last_java_sp->is_valid()) {
   1125     last_java_sp = R1_SP;
   1126   }
  1127   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
  1129   // ARG1 must hold thread address.
  1130   mr(R3_ARG1, R16_thread);
  1131 #if defined(ABI_ELFv2)
  1132   address return_pc = call_c(entry_point, relocInfo::none);
  1133 #else
  1134   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
  1135 #endif
  1137   reset_last_Java_frame();
  1139   // Check for pending exceptions.
  1140   if (check_exceptions) {
  1141     // We don't check for exceptions here.
   1142     ShouldNotReachHere();
   1143   }
   1145   // Get oop result if there is one and reset the value in the thread.
   1146   if (oop_result->is_valid()) {
   1147     get_vm_result(oop_result);
   1148   }
   1150   _last_calls_return_pc = return_pc;
   1151   BLOCK_COMMENT("} call_VM");
   1152 }
  1154 void MacroAssembler::call_VM_leaf_base(address entry_point) {
  1155   BLOCK_COMMENT("call_VM_leaf {");
  1156 #if defined(ABI_ELFv2)
  1157   call_c(entry_point, relocInfo::none);
  1158 #else
  1159   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
  1160 #endif
   1161   BLOCK_COMMENT("} call_VM_leaf");
   1162 }
  1164 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
   1165   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
   1166 }
  1168 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
  1169                              bool check_exceptions) {
  1170   // R3_ARG1 is reserved for the thread.
  1171   mr_if_needed(R4_ARG2, arg_1);
   1172   call_VM(oop_result, entry_point, check_exceptions);
   1173 }
  1175 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
  1176                              bool check_exceptions) {
  1177   // R3_ARG1 is reserved for the thread
  1178   mr_if_needed(R4_ARG2, arg_1);
  1179   assert(arg_2 != R4_ARG2, "smashed argument");
  1180   mr_if_needed(R5_ARG3, arg_2);
   1181   call_VM(oop_result, entry_point, check_exceptions);
   1182 }
  1184 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
  1185                              bool check_exceptions) {
  1186   // R3_ARG1 is reserved for the thread
  1187   mr_if_needed(R4_ARG2, arg_1);
  1188   assert(arg_2 != R4_ARG2, "smashed argument");
  1189   mr_if_needed(R5_ARG3, arg_2);
  1190   mr_if_needed(R6_ARG4, arg_3);
   1191   call_VM(oop_result, entry_point, check_exceptions);
   1192 }
  1194 void MacroAssembler::call_VM_leaf(address entry_point) {
   1195   call_VM_leaf_base(entry_point);
   1196 }
  1198 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  1199   mr_if_needed(R3_ARG1, arg_1);
   1200   call_VM_leaf(entry_point);
   1201 }
  1203 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  1204   mr_if_needed(R3_ARG1, arg_1);
  1205   assert(arg_2 != R3_ARG1, "smashed argument");
  1206   mr_if_needed(R4_ARG2, arg_2);
   1207   call_VM_leaf(entry_point);
   1208 }
  1210 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  1211   mr_if_needed(R3_ARG1, arg_1);
  1212   assert(arg_2 != R3_ARG1, "smashed argument");
  1213   mr_if_needed(R4_ARG2, arg_2);
  1214   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  1215   mr_if_needed(R5_ARG3, arg_3);
   1216   call_VM_leaf(entry_point);
   1217 }
  1219 // Check whether instruction is a read access to the polling page
  1220 // which was emitted by load_from_polling_page(..).
  1221 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
  1222                                                address* polling_address_ptr) {
  1223   if (!is_ld(instruction))
  1224     return false; // It's not a ld. Fail.
  1226   int rt = inv_rt_field(instruction);
  1227   int ra = inv_ra_field(instruction);
  1228   int ds = inv_ds_field(instruction);
  1229   if (!(ds == 0 && ra != 0 && rt == 0)) {
   1230     return false; // It's not a ld(r0, X, ra). Fail.
   1231   }
   1233   if (!ucontext) {
   1234     // Set polling address.
   1235     if (polling_address_ptr != NULL) {
   1236       *polling_address_ptr = NULL;
   1237     }
   1238     return true; // No ucontext given. Can't check value of ra. Assume true.
   1239   }
  1241 #ifdef LINUX
  1242   // Ucontext given. Check that register ra contains the address of
   1243   // the safepoint polling page.
  1244   ucontext_t* uc = (ucontext_t*) ucontext;
  1245   // Set polling address.
  1246   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  1247   if (polling_address_ptr != NULL) {
   1248     *polling_address_ptr = addr;
   1249   }
  1250   return os::is_poll_address(addr);
  1251 #else
  1252   // Not on Linux, ucontext must be NULL.
  1253   ShouldNotReachHere();
  1254   return false;
   1255 #endif
   1256 }
  1258 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
  1259 #ifdef LINUX
  1260   ucontext_t* uc = (ucontext_t*) ucontext;
  1262   if (is_stwx(instruction) || is_stwux(instruction)) {
  1263     int ra = inv_ra_field(instruction);
  1264     int rb = inv_rb_field(instruction);
  1266     // look up content of ra and rb in ucontext
  1267     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
  1268     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
  1269     return os::is_memory_serialize_page(thread, ra_val+rb_val);
  1270   } else if (is_stw(instruction) || is_stwu(instruction)) {
  1271     int ra = inv_ra_field(instruction);
  1272     int d1 = inv_d1_field(instruction);
  1274     // look up content of ra in ucontext
  1275     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
  1276     return os::is_memory_serialize_page(thread, ra_val+d1);
  1277   } else {
   1278     return false;
   1279   }
  1280 #else
  1281   // workaround not needed on !LINUX :-)
  1282   ShouldNotCallThis();
  1283   return false;
   1284 #endif
   1285 }
  1287 void MacroAssembler::bang_stack_with_offset(int offset) {
  1288   // When increasing the stack, the old stack pointer will be written
  1289   // to the new top of stack according to the PPC64 abi.
  1290   // Therefore, stack banging is not necessary when increasing
  1291   // the stack by <= os::vm_page_size() bytes.
  1292   // When increasing the stack by a larger amount, this method is
  1293   // called repeatedly to bang the intermediate pages.
  1295   // Stack grows down, caller passes positive offset.
  1296   assert(offset > 0, "must bang with positive offset");
  1298   long stdoffset = -offset;
  1300   if (is_simm(stdoffset, 16)) {
  1301     // Signed 16 bit offset, a simple std is ok.
  1302     if (UseLoadInstructionsForStackBangingPPC64) {
  1303       ld(R0, (int)(signed short)stdoffset, R1_SP);
  1304     } else {
   1305       std(R0,(int)(signed short)stdoffset, R1_SP);
   1306     }
  1307   } else if (is_simm(stdoffset, 31)) {
  1308     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
  1309     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
  1311     Register tmp = R11;
  1312     addis(tmp, R1_SP, hi);
  1313     if (UseLoadInstructionsForStackBangingPPC64) {
  1314       ld(R0,  lo, tmp);
  1315     } else {
   1316       std(R0, lo, tmp);
   1317     }
   1318   } else {
   1319     ShouldNotReachHere();
   1320   }
   1321 }
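       // Example (illustrative): bang_stack_with_offset(0x12000) takes the
       // large-offset path; with UseLoadInstructionsForStackBangingPPC64 off it
       // emits roughly
       //   addis R11, R1_SP, -1       // hi part of -0x12000
       //   std   R0, -0x2000(R11)     // lo part as displacement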
  1323 // If instruction is a stack bang of the form
  1324 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
  1325 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
  1326 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
  1327 // return the banged address. Otherwise, return 0.
  1328 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
  1329 #ifdef LINUX
  1330   ucontext_t* uc = (ucontext_t*) ucontext;
  1331   int rs = inv_rs_field(instruction);
  1332   int ra = inv_ra_field(instruction);
  1333   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
  1334       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
  1335       || (is_stdu(instruction) && rs == 1)) {
  1336     int ds = inv_ds_field(instruction);
  1337     // return banged address
  1338     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  1339   } else if (is_stdux(instruction) && rs == 1) {
  1340     int rb = inv_rb_field(instruction);
  1341     address sp = (address)uc->uc_mcontext.regs->gpr[1];
  1342     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
  1343     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
   1344                                   : sp + rb_val; // banged address
   1345   }
  1346   return NULL; // not a stack bang
  1347 #else
  1348   // workaround not needed on !LINUX :-)
  1349   ShouldNotCallThis();
  1350   return NULL;
   1351 #endif
   1352 }
  1354 // CmpxchgX sets condition register to cmpX(current, compare).
  1355 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
  1356                               Register compare_value, Register exchange_value,
  1357                               Register addr_base, int semantics, bool cmpxchgx_hint,
  1358                               Register int_flag_success, bool contention_hint) {
  1359   Label retry;
  1360   Label failed;
  1361   Label done;
  1363   // Save one branch if result is returned via register and
  1364   // result register is different from the other ones.
  1365   bool use_result_reg    = (int_flag_success != noreg);
  1366   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
  1367                             int_flag_success != exchange_value && int_flag_success != addr_base);
  1369   // release/fence semantics
  1370   if (semantics & MemBarRel) {
  1371     release();
  1374   if (use_result_reg && preset_result_reg) {
  1375     li(int_flag_success, 0); // preset (assume cas failed)
  1378   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  1379   if (contention_hint) { // Don't try to reserve if cmp fails.
  1380     lwz(dest_current_value, 0, addr_base);
  1381     cmpw(flag, dest_current_value, compare_value);
  1382     bne(flag, failed);
  1385   // atomic emulation loop
  1386   bind(retry);
  1388   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
  1389   cmpw(flag, dest_current_value, compare_value);
  1390   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
  1391     bne_predict_not_taken(flag, failed);
  1392   } else {
  1393     bne(                  flag, failed);
  1395   // branch to failed => (flag == ne), (dest_current_value != compare_value)
  1396   // fall through    => (flag == eq), (dest_current_value == compare_value)
  1398   stwcx_(exchange_value, addr_base);
  1399   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
  1400     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  1401   } else {
  1402     bne(                  CCR0, retry); // StXcx_ sets CCR0.
  1404   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
  1406   // Result in register (must do this at the end because int_flag_success can be the
  1407   // same register as one above).
  1408   if (use_result_reg) {
  1409     li(int_flag_success, 1);
  1412   if (semantics & MemBarFenceAfter) {
  1413     fence();
  1414   } else if (semantics & MemBarAcq) {
  1415     isync();
  1418   if (use_result_reg && !preset_result_reg) {
  1419     b(done);
  1422   bind(failed);
  1423   if (use_result_reg && !preset_result_reg) {
  1424     li(int_flag_success, 0);
  1427   bind(done);
  1428   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  1429   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
  1432 // Performs atomic compare exchange:
  1433 //   if (compare_value == *addr_base)
  1434 //     *addr_base = exchange_value
  1435 //     int_flag_success = 1;
  1436 //   else
  1437 //     int_flag_success = 0;
  1438 //
  1439 // ConditionRegister flag       = cmp(compare_value, *addr_base)
  1440 // Register dest_current_value  = *addr_base
  1441 // Register compare_value       Used to compare with value in memory
  1442 // Register exchange_value      Written to memory if compare_value == *addr_base
  1443 // Register addr_base           The memory location to compareXChange
  1444 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
  1445 //
  1446 // To avoid the costly compare exchange, the value is tested beforehand.
  1447 // Several special cases exist to avoid generating unnecessary code.
  1448 //
  1449 void MacroAssembler::cmpxchgd(ConditionRegister flag,
  1450                               Register dest_current_value, Register compare_value, Register exchange_value,
  1451                               Register addr_base, int semantics, bool cmpxchgx_hint,
  1452                               Register int_flag_success, Label* failed_ext, bool contention_hint) {
  1453   Label retry;
  1454   Label failed_int;
  1455   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  1456   Label done;
  1458   // Save one branch if result is returned via register and result register is different from the other ones.
  1459   bool use_result_reg    = (int_flag_success!=noreg);
  1460   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value &&
  1461                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
  1462   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
  1464   // release/fence semantics
  1465   if (semantics & MemBarRel) {
  1466     release();
  1469   if (use_result_reg && preset_result_reg) {
  1470     li(int_flag_success, 0); // preset (assume cas failed)
  1473   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  1474   if (contention_hint) { // Don't try to reserve if cmp fails.
  1475     ld(dest_current_value, 0, addr_base);
  1476     cmpd(flag, dest_current_value, compare_value);
  1477     bne(flag, failed);
  1480   // atomic emulation loop
  1481   bind(retry);
  1483   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  1484   cmpd(flag, dest_current_value, compare_value);
  1485   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
  1486     bne_predict_not_taken(flag, failed);
  1487   } else {
  1488     bne(                  flag, failed);
  1491   stdcx_(exchange_value, addr_base);
  1492   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
  1493     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  1494   } else {
  1495     bne(                  CCR0, retry); // stXcx_ sets CCR0
  1498   // result in register (must do this at the end because int_flag_success can be the same register as one above)
  1499   if (use_result_reg) {
  1500     li(int_flag_success, 1);
  1503   // POWER6 doesn't strictly need an isync after CAS,
  1504   // but always emit one to be on the safe side.
  1505   if (semantics & MemBarFenceAfter) {
  1506     fence();
  1507   } else if (semantics & MemBarAcq) {
  1508     isync();
  1511   if (use_result_reg && !preset_result_reg) {
  1512     b(done);
  1515   bind(failed_int);
  1516   if (use_result_reg && !preset_result_reg) {
  1517     li(int_flag_success, 0);
  1520   bind(done);
  1521   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  1522   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
  1525 // Look up the method for a megamorphic invokeinterface call.
  1526 // The target method is determined by <intf_klass, itable_index>.
  1527 // The receiver klass is in recv_klass.
  1528 // On success, the result will be in method_result, and execution falls through.
  1529 // On failure, execution transfers to the given label.
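       // Data layout assumed by the scan below (illustrative sketch):
       //   the itableOffsetEntry array starts at
       //     recv_klass + vtable_start + vtable_length * vtableEntry::size() * wordSize;
       //   each itableOffsetEntry holds { interface Klass*, offset } and is visited with stride scan_step;
       //   on a match, method_result = *(recv_klass + entry_offset
       //                                 + itable_index * itableMethodEntry::size() * wordSize
       //                                 + itableMethodEntry::method_offset_in_bytes()).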
  1530 void MacroAssembler::lookup_interface_method(Register recv_klass,
  1531                                              Register intf_klass,
  1532                                              RegisterOrConstant itable_index,
  1533                                              Register method_result,
  1534                                              Register scan_temp,
  1535                                              Register temp2,
  1536                                              Label& L_no_such_interface,
  1537                                              bool return_method) {
  1538   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  1540   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
  1541   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  1542   int itentry_off = itableMethodEntry::method_offset_in_bytes();
  1543   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
  1544   int scan_step   = itableOffsetEntry::size() * wordSize;
  1545   int log_vte_size= exact_log2(vtableEntry::size() * wordSize);
  1547   lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
  1548   // %%% We should store the aligned, prescaled offset in the klassoop.
  1549   // Then the next several instructions would fold away.
  1551   sldi(scan_temp, scan_temp, log_vte_size);
  1552   addi(scan_temp, scan_temp, vtable_base);
  1553   add(scan_temp, recv_klass, scan_temp);
  1555   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  1556   if (return_method) {
  1557     if (itable_index.is_register()) {
  1558       Register itable_offset = itable_index.as_register();
  1559       sldi(method_result, itable_offset, logMEsize);
  1560       if (itentry_off) { addi(method_result, method_result, itentry_off); }
  1561       add(method_result, method_result, recv_klass);
  1562     } else {
  1563       long itable_offset = (long)itable_index.as_constant();
  1564       // static address, no relocation
  1565       load_const_optimized(temp2, (itable_offset << logMEsize) + itentry_off); // static address, no relocation
  1566       add(method_result, temp2, recv_klass);
  1570   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  1571   //   if (scan->interface() == intf) {
  1572   //     result = (klass + scan->offset() + itable_index);
  1573   //   }
  1574   // }
  1575   Label search, found_method;
  1577   for (int peel = 1; peel >= 0; peel--) {
  1578     // %%%% Could load both offset and interface in one ldx, if they were
  1579     // in the opposite order. This would save a load.
  1580     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
  1582     // Check that this entry is non-null. A null entry means that
  1583     // the receiver class doesn't implement the interface, and wasn't the
  1584     // same as when the caller was compiled.
  1585     cmpd(CCR0, temp2, intf_klass);
  1587     if (peel) {
  1588       beq(CCR0, found_method);
  1589     } else {
  1590       bne(CCR0, search);
  1591       // (invert the test to fall through to found_method...)
  1594     if (!peel) break;
  1596     bind(search);
  1598     cmpdi(CCR0, temp2, 0);
  1599     beq(CCR0, L_no_such_interface);
  1600     addi(scan_temp, scan_temp, scan_step);
  1603   bind(found_method);
  1605   // Got a hit.
  1606   if (return_method) {
  1607     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
  1608     lwz(scan_temp, ito_offset, scan_temp);
  1609     ldx(method_result, scan_temp, method_result);
  1613 // virtual method calling
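       // Conceptually (sketch):
       //   R19_method = *(recv_klass + vtable_start_offset
       //                  + vtable_index * wordSize + vtableEntry::method_offset_in_bytes());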
  1614 void MacroAssembler::lookup_virtual_method(Register recv_klass,
  1615                                            RegisterOrConstant vtable_index,
  1616                                            Register method_result) {
  1618   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
  1620   const int base = InstanceKlass::vtable_start_offset() * wordSize;
  1621   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  1623   if (vtable_index.is_register()) {
  1624     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
  1625     add(recv_klass, vtable_index.as_register(), recv_klass);
  1626   } else {
  1627     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
  1629   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
  1632 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
  1634 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
  1635                                                    Register super_klass,
  1636                                                    Register temp1_reg,
  1637                                                    Register temp2_reg,
  1638                                                    Label& L_success,
  1639                                                    Label& L_failure) {
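         // Fast path in pseudocode (sketch of what is emitted below):
         //   if (sub_klass == super_klass)                              goto L_success;
         //   sco = super_klass->super_check_offset();
         //   if (*(Klass**)((address)sub_klass + sco) == super_klass)   goto L_success;
         //   if (sco != secondary_super_cache_offset)                   goto L_failure;
         //   // otherwise fall through to the slow path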
  1641   const Register check_cache_offset = temp1_reg;
  1642   const Register cached_super       = temp2_reg;
  1644   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
  1646   int sco_offset = in_bytes(Klass::super_check_offset_offset());
  1647   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
  1649   // If the pointers are equal, we are done (e.g., String[] elements).
  1650   // This self-check enables sharing of secondary supertype arrays among
  1651   // non-primary types such as array-of-interface. Otherwise, each such
  1652   // type would need its own customized secondary supers array (SSA).
  1653   // We move this check to the front of the fast path because many
  1654   // type checks are in fact trivially successful in this manner,
  1655   // so we get a nicely predicted branch right at the start of the check.
  1656   cmpd(CCR0, sub_klass, super_klass);
  1657   beq(CCR0, L_success);
  1659   // Check the supertype display:
  1660   lwz(check_cache_offset, sco_offset, super_klass);
  1661   // The loaded value is the offset from KlassOopDesc.
  1663   ldx(cached_super, check_cache_offset, sub_klass);
  1664   cmpd(CCR0, cached_super, super_klass);
  1665   beq(CCR0, L_success);
  1667   // This check has worked decisively for primary supers.
  1668   // Secondary supers are sought in the super_cache ('super_cache_addr').
  1669   // (Secondary supers are interfaces and very deeply nested subtypes.)
  1670   // The same check above works here because of a tricky aliasing
  1671   // between the super_cache and the primary super display elements.
  1672   // (The 'super_check_addr' can address either, as the case requires.)
  1673   // Note that the cache is updated below if it does not help us find
  1674   // what we need immediately.
  1675   // So if it was a primary super, we can just fail immediately.
  1676   // Otherwise, it's the slow path for us (no success at this point).
  1678   cmpwi(CCR0, check_cache_offset, sc_offset);
  1679   bne(CCR0, L_failure);
  1680   // bind(slow_path); // fallthru
  1683 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
  1684                                                    Register super_klass,
  1685                                                    Register temp1_reg,
  1686                                                    Register temp2_reg,
  1687                                                    Label* L_success,
  1688                                                    Register result_reg) {
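         // Slow path in pseudocode (sketch of the emitted loop):
         //   Array<Klass*>* ss = sub_klass->secondary_supers();
         //   for (int i = 0; i < ss->length(); i++) {
         //     if (ss->at(i) == super_klass) {
         //       sub_klass->set_secondary_super_cache(super_klass);  // remember the hit
         //       if (result_reg != noreg) result_reg = 0;            // zero result == hit
         //       goto L_success (if given), else fall through;
         //     }
         //   }
         //   if (result_reg != noreg) result_reg = 1;                // non-zero result == miss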
  1689   const Register array_ptr = temp1_reg; // current value from cache array
  1690   const Register temp      = temp2_reg;
  1692   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
  1694   int source_offset = in_bytes(Klass::secondary_supers_offset());
  1695   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
  1697   int length_offset = Array<Klass*>::length_offset_in_bytes();
  1698   int base_offset   = Array<Klass*>::base_offset_in_bytes();
  1700   Label hit, loop, failure, fallthru;
  1702   ld(array_ptr, source_offset, sub_klass);
  1704   //assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
  1705   lwz(temp, length_offset, array_ptr);
  1706   cmpwi(CCR0, temp, 0);
  1707   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
  1709   mtctr(temp); // load ctr
  1711   bind(loop);
  1712   // Klass pointers in the table are no longer compressed.
  1713   ld(temp, base_offset, array_ptr);
  1714   cmpd(CCR0, temp, super_klass);
  1715   beq(CCR0, hit);
  1716   addi(array_ptr, array_ptr, BytesPerWord);
  1717   bdnz(loop);
  1719   bind(failure);
  1720   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
  1721   b(fallthru);
  1723   bind(hit);
  1724   std(super_klass, target_offset, sub_klass); // save result to cache
  1725   if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit)
  1726   if (L_success != NULL) b(*L_success);
  1728   bind(fallthru);
  1731 // Try fast path, then go to slow one if not successful
  1732 void MacroAssembler::check_klass_subtype(Register sub_klass,
  1733                          Register super_klass,
  1734                          Register temp1_reg,
  1735                          Register temp2_reg,
  1736                          Label& L_success) {
  1737   Label L_failure;
  1738   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure);
  1739   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
  1740   bind(L_failure); // Fallthru if not successful.
  1743 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
  1744                                               Register temp_reg,
  1745                                               Label& wrong_method_type) {
  1746   assert_different_registers(mtype_reg, mh_reg, temp_reg);
  1747   // Compare method type against that of the receiver.
  1748   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
  1749   cmpd(CCR0, temp_reg, mtype_reg);
  1750   bne(CCR0, wrong_method_type);
  1753 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
  1754                                                    Register temp_reg,
  1755                                                    int extra_slot_offset) {
  1756   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
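         // Computes (sketch): (arg_slot + extra_slot_offset) * Interpreter::stackElementSize,
         // returned as a constant if arg_slot is a constant, otherwise materialized in temp_reg.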
  1757   int stackElementSize = Interpreter::stackElementSize;
  1758   int offset = extra_slot_offset * stackElementSize;
  1759   if (arg_slot.is_constant()) {
  1760     offset += arg_slot.as_constant() * stackElementSize;
  1761     return offset;
  1762   } else {
  1763     assert(temp_reg != noreg, "must specify");
  1764     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
  1765     if (offset != 0)
  1766       addi(temp_reg, temp_reg, offset);
  1767     return temp_reg;
  1771 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
  1772                                           Register mark_reg, Register temp_reg,
  1773                                           Register temp2_reg, Label& done, Label* slow_case) {
  1774   assert(UseBiasedLocking, "why call this otherwise?");
  1776 #ifdef ASSERT
  1777   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
  1778 #endif
  1780   Label cas_label;
  1782   // Branch to done if fast path fails and no slow_case provided.
  1783   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
  1785   // Biased locking
  1786   // See whether the lock is currently biased toward our thread and
  1787   // whether the epoch is still valid
  1788   // Note that the runtime guarantees sufficient alignment of JavaThread
  1789   // pointers to allow age to be placed into low bits
  1790   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
  1791          "biased locking makes assumptions about bit layout");
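         // Mark word bit layout assumed here (sketch, low bits on the right):
         //   [ JavaThread* bias owner | epoch | age | biased_lock | lock ]
         // An object is biased iff (mark & biased_lock_mask_in_place) == biased_lock_pattern.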
  1793   if (PrintBiasedLockingStatistics) {
  1794     load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg);
  1795     lwz(temp2_reg, 0, temp_reg);
  1796     addi(temp2_reg, temp2_reg, 1);
  1797     stw(temp2_reg, 0, temp_reg);
  1800   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
  1801   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
  1802   bne(cr_reg, cas_label);
  1804   load_klass(temp_reg, obj_reg);
  1806   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
  1807   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  1808   orr(temp_reg, R16_thread, temp_reg);
  1809   xorr(temp_reg, mark_reg, temp_reg);
  1810   andr(temp_reg, temp_reg, temp2_reg);
  1811   cmpdi(cr_reg, temp_reg, 0);
  1812   if (PrintBiasedLockingStatistics) {
  1813     Label l;
  1814     bne(cr_reg, l);
  1815     load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
  1816     lwz(temp2_reg, 0, mark_reg);
  1817     addi(temp2_reg, temp2_reg, 1);
  1818     stw(temp2_reg, 0, mark_reg);
  1819     // restore mark_reg
  1820     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
  1821     bind(l);
  1823   beq(cr_reg, done);
  1825   Label try_revoke_bias;
  1826   Label try_rebias;
  1828   // At this point we know that the header has the bias pattern and
  1829   // that we are not the bias owner in the current epoch. We need to
  1830   // figure out more details about the state of the header in order to
  1831   // know what operations can be legally performed on the object's
  1832   // header.
  1834   // If the low three bits in the xor result aren't clear, that means
  1835   // the prototype header is no longer biased and we have to revoke
  1836   // the bias on this object.
  1837   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  1838   cmpwi(cr_reg, temp2_reg, 0);
  1839   bne(cr_reg, try_revoke_bias);
  1841   // Biasing is still enabled for this data type. See whether the
  1842   // epoch of the current bias is still valid, meaning that the epoch
  1843   // bits of the mark word are equal to the epoch bits of the
  1844   // prototype header. (Note that the prototype header's epoch bits
  1845   // only change at a safepoint.) If not, attempt to rebias the object
  1846   // toward the current thread. Note that we must be absolutely sure
  1847   // that the current epoch is invalid in order to do this because
  1848   // otherwise the manipulations it performs on the mark word are
  1849   // illegal.
  1851   int shift_amount = 64 - markOopDesc::epoch_shift;
  1852   // rotate epoch bits to right (little) end and set other bits to 0
  1853   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
  1854   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
  1855   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
  1856   bne(CCR0, try_rebias);
  1858   // The epoch of the current bias is still valid but we know nothing
  1859   // about the owner; it might be set or it might be clear. Try to
  1860   // acquire the bias of the object using an atomic operation. If this
  1861   // fails we will go in to the runtime to revoke the object's bias.
  1862   // Note that we first construct the presumed unbiased header so we
  1863   // don't accidentally blow away another thread's valid bias.
  1864   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
  1865                                 markOopDesc::age_mask_in_place |
  1866                                 markOopDesc::epoch_mask_in_place));
  1867   orr(temp_reg, R16_thread, mark_reg);
  1869   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  1871   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  1872   fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ?
  1873   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
  1874            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
  1875            /*where=*/obj_reg,
  1876            MacroAssembler::MemBarAcq,
  1877            MacroAssembler::cmpxchgx_hint_acquire_lock(),
  1878            noreg, slow_case_int); // bail out if failed
  1880   // If the biasing toward our thread failed, this means that
  1881   // another thread succeeded in biasing it toward itself and we
  1882   // need to revoke that bias. The revocation will occur in the
  1883   // interpreter runtime in the slow case.
  1884   if (PrintBiasedLockingStatistics) {
  1885     load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg);
  1886     lwz(temp2_reg, 0, temp_reg);
  1887     addi(temp2_reg, temp2_reg, 1);
  1888     stw(temp2_reg, 0, temp_reg);
  1890   b(done);
  1892   bind(try_rebias);
  1893   // At this point we know the epoch has expired, meaning that the
  1894   // current "bias owner", if any, is actually invalid. Under these
  1895   // circumstances _only_, we are allowed to use the current header's
  1896   // value as the comparison value when doing the cas to acquire the
  1897   // bias in the current epoch. In other words, we allow transfer of
  1898   // the bias from one thread to another directly in this situation.
  1899   andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place);
  1900   orr(temp_reg, R16_thread, temp_reg);
  1901   load_klass(temp2_reg, obj_reg);
  1902   ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg);
  1903   orr(temp_reg, temp_reg, temp2_reg);
  1905   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  1907   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  1908   fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ?
  1909   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
  1910                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
  1911                  /*where=*/obj_reg,
  1912                  MacroAssembler::MemBarAcq,
  1913                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
  1914                  noreg, slow_case_int); // bail out if failed
  1916   // If the biasing toward our thread failed, this means that
  1917   // another thread succeeded in biasing it toward itself and we
  1918   // need to revoke that bias. The revocation will occur in the
  1919   // interpreter runtime in the slow case.
  1920   if (PrintBiasedLockingStatistics) {
  1921     load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg);
  1922     lwz(temp2_reg, 0, temp_reg);
  1923     addi(temp2_reg, temp2_reg, 1);
  1924     stw(temp2_reg, 0, temp_reg);
  1926   b(done);
  1928   bind(try_revoke_bias);
  1929   // The prototype mark in the klass doesn't have the bias bit set any
  1930   // more, indicating that objects of this data type are not supposed
  1931   // to be biased any more. We are going to try to reset the mark of
  1932   // this object to the prototype value and fall through to the
  1933   // CAS-based locking scheme. Note that if our CAS fails, it means
  1934   // that another thread raced us for the privilege of revoking the
  1935   // bias of this particular object, so it's okay to continue in the
  1936   // normal locking code.
  1937   load_klass(temp_reg, obj_reg);
  1938   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  1939   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
  1940   orr(temp_reg, temp_reg, temp2_reg);
  1942   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  1944   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  1945   fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ?
  1946   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
  1947                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
  1948                  /*where=*/obj_reg,
  1949                  MacroAssembler::MemBarAcq,
  1950                  MacroAssembler::cmpxchgx_hint_acquire_lock());
  1952   // reload markOop in mark_reg before continuing with lightweight locking
  1953   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
  1955   // Fall through to the normal CAS-based lock, because no matter what
  1956   // the result of the above CAS, some thread must have succeeded in
  1957   // removing the bias bit from the object's header.
  1958   if (PrintBiasedLockingStatistics) {
  1959     Label l;
  1960     bne(cr_reg, l);
  1961     load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg);
  1962     lwz(temp2_reg, 0, temp_reg);
  1963     addi(temp2_reg, temp2_reg, 1);
  1964     stw(temp2_reg, 0, temp_reg);
  1965     bind(l);
  1968   bind(cas_label);
  1971 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
  1972   // Check for biased locking unlock case, which is a no-op
  1973   // Note: we do not have to check the thread ID for two reasons.
  1974   // First, the interpreter checks for IllegalMonitorStateException at
  1975   // a higher level. Second, if the bias was revoked while we held the
  1976   // lock, the object could not be rebiased toward another thread, so
  1977   // the bias bit would be clear.
  1979   ld(temp_reg, 0, mark_addr);
  1980   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  1982   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
  1983   beq(cr_reg, done);
  1986 // "The box" is the space on the stack where we copy the object mark.
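       // Outline of the fast-locking protocol emitted below (sketch):
       //   1. With UseBiasedLocking, first try to acquire or keep the bias (biased_locking_enter).
       //   2. If the mark word has the monitor bit set, CAS the current thread into monitor->owner.
       //   3. Otherwise store (mark | unlocked_value) into the box and CAS the box address into the
       //      object's mark word; if that fails, check for a recursive stack lock and, if so, store 0
       //      into the box as the displaced header.
       // On exit, flag == EQ indicates success and flag == NE indicates failure.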
  1987 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
  1988                                                Register temp, Register displaced_header, Register current_header) {
  1989   assert_different_registers(oop, box, temp, displaced_header, current_header);
  1990   assert(flag != CCR0, "bad condition register");
  1991   Label cont;
  1992   Label object_has_monitor;
  1993   Label cas_failed;
  1995   // Load markOop from object into displaced_header.
  1996   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
  1999   // Always do locking in runtime.
  2000   if (EmitSync & 0x01) {
  2001     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
  2002     return;
  2005   if (UseBiasedLocking) {
  2006     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
  2009   // Handle existing monitor.
  2010   if ((EmitSync & 0x02) == 0) {
  2011     // The object has an existing monitor iff (mark & monitor_value) != 0.
  2012     andi_(temp, displaced_header, markOopDesc::monitor_value);
  2013     bne(CCR0, object_has_monitor);
  2016   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
  2017   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
  2019   // Load Compare Value application register.
  2021   // Initialize the box. (Must happen before we update the object mark!)
  2022   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
  2024   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
  2025   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
  2026   // CmpxchgX sets cr_reg to cmpX(current, displaced).
  2027   membar(Assembler::StoreStore);
  2028   cmpxchgd(/*flag=*/flag,
  2029            /*current_value=*/current_header,
  2030            /*compare_value=*/displaced_header,
  2031            /*exchange_value=*/box,
  2032            /*where=*/oop,
  2033            MacroAssembler::MemBarAcq,
  2034            MacroAssembler::cmpxchgx_hint_acquire_lock(),
  2035            noreg,
  2036            &cas_failed);
  2037   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  2039   // If the compare-and-exchange succeeded, then we found an unlocked
  2040   // object and we have now locked it.
  2041   b(cont);
  2043   bind(cas_failed);
  2044   // We did not see an unlocked object so try the fast recursive case.
  2046   // Check if the owner is self by comparing the value in the markOop of object
  2047   // (current_header) with the stack pointer.
  2048   sub(current_header, current_header, R1_SP);
  2049   load_const_optimized(temp, (address) (~(os::vm_page_size()-1) |
  2050                                         markOopDesc::lock_mask_in_place));
  2052   and_(R0/*==0?*/, current_header, temp);
  2053   // If the condition is true we take the cont path and hence can store 0 as the
  2054   // displaced header in the box, which indicates that it is a recursive lock.
  2055   mcrf(flag,CCR0);
  2056   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
  2058   // Handle existing monitor.
  2059   if ((EmitSync & 0x02) == 0) {
  2060     b(cont);
  2062     bind(object_has_monitor);
  2063     // The object's monitor m is unlocked iff m->owner == NULL,
  2064     // otherwise m->owner may contain a thread or a stack address.
  2065     //
  2066     // Try to CAS m->owner from NULL to current thread.
  2067     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
  2068     li(displaced_header, 0);
  2069     // CmpxchgX sets flag to cmpX(current, displaced).
  2070     cmpxchgd(/*flag=*/flag,
  2071              /*current_value=*/current_header,
  2072              /*compare_value=*/displaced_header,
  2073              /*exchange_value=*/R16_thread,
  2074              /*where=*/temp,
  2075              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
  2076              MacroAssembler::cmpxchgx_hint_acquire_lock());
  2078     // Store a non-null value into the box.
  2079     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
  2081 #   ifdef ASSERT
  2082     bne(flag, cont);
  2083     // We have acquired the monitor, check some invariants.
  2084     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
  2085     // Invariant 1: _recursions should be 0.
  2086     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
  2087     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
  2088                             "monitor->_recursions should be 0", -1);
  2089     // Invariant 2: OwnerIsThread shouldn't be 0.
  2090     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
  2091     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
  2092     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
  2093 #   endif
  2096   bind(cont);
  2097   // flag == EQ indicates success
  2098   // flag == NE indicates failure
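       // Outline of the fast-unlocking protocol emitted below (sketch):
       //   1. With UseBiasedLocking, a still-biased header means there is nothing to unlock (biased_locking_exit).
       //   2. A zero displaced header in the box means a recursive stack lock: nothing to do.
       //   3. If the mark word has the monitor bit set, take the inflated path: the fast exit requires
       //      owner == self, recursions == 0 and an empty EntryList/cxq before clearing the owner field.
       //   4. Otherwise CAS the displaced header from the box back into the object's mark word.
       // On exit, flag == EQ indicates success and flag == NE indicates failure.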
  2101 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
  2102                                                  Register temp, Register displaced_header, Register current_header) {
  2103   assert_different_registers(oop, box, temp, displaced_header, current_header);
  2104   assert(flag != CCR0, "bad condition register");
  2105   Label cont;
  2106   Label object_has_monitor;
  2108   // Always do locking in runtime.
  2109   if (EmitSync & 0x01) {
  2110     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
  2111     return;
  2114   if (UseBiasedLocking) {
  2115     biased_locking_exit(flag, oop, current_header, cont);
  2118   // Find the lock address and load the displaced header from the stack.
  2119   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
  2121   // If the displaced header is 0, we have a recursive unlock.
  2122   cmpdi(flag, displaced_header, 0);
  2123   beq(flag, cont);
  2125   // Handle existing monitor.
  2126   if ((EmitSync & 0x02) == 0) {
  2127     // The object has an existing monitor iff (mark & monitor_value) != 0.
  2128     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
  2129     andi(temp, current_header, markOopDesc::monitor_value);
  2130     cmpdi(flag, temp, 0);
  2131     bne(flag, object_has_monitor);
  2135   // Check if it is still a lightweight lock; this is true if we see
  2136   // the stack address of the basicLock in the markOop of the object.
  2137   // Cmpxchg sets flag to cmpd(current_header, box).
  2138   cmpxchgd(/*flag=*/flag,
  2139            /*current_value=*/current_header,
  2140            /*compare_value=*/box,
  2141            /*exchange_value=*/displaced_header,
  2142            /*where=*/oop,
  2143            MacroAssembler::MemBarRel,
  2144            MacroAssembler::cmpxchgx_hint_release_lock(),
  2145            noreg,
  2146            &cont);
  2148   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  2150   // Handle existing monitor.
  2151   if ((EmitSync & 0x02) == 0) {
  2152     b(cont);
  2154     bind(object_has_monitor);
  2155     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
  2156     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
  2157     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
  2158     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
  2159     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
  2160     cmpdi(flag, temp, 0);
  2161     bne(flag, cont);
  2163     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
  2164     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
  2165     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
  2166     cmpdi(flag, temp, 0);
  2167     bne(flag, cont);
  2168     release();
  2169     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
  2172   bind(cont);
  2173   // flag == EQ indicates success
  2174   // flag == NE indicates failure
  2177 // Write serialization page so VM thread can do a pseudo remote membar.
  2178 // We use the current thread pointer to calculate a thread specific
  2179 // offset to write to within the page. This minimizes bus traffic
  2180 // due to cache line collision.
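       // Offset computed below (sketch):
       //   offset = (thread >> os::get_serialize_page_shift_count()) & (os::vm_page_size() - sizeof(int));
       // followed by a release and a store of R0 to os::get_memory_serialize_page() + offset.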
  2181 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
  2182   srdi(tmp2, thread, os::get_serialize_page_shift_count());
  2184   int mask = os::vm_page_size() - sizeof(int);
  2185   if (Assembler::is_simm(mask, 16)) {
  2186     andi(tmp2, tmp2, mask);
  2187   } else {
  2188     lis(tmp1, (int)((signed short) (mask >> 16)));
  2189     ori(tmp1, tmp1, mask & 0x0000ffff);
  2190     andr(tmp2, tmp2, tmp1);
  2193   load_const(tmp1, (long) os::get_memory_serialize_page());
  2194   release();
  2195   stwx(R0, tmp1, tmp2);
  2199 // GC barrier helper macros
  2201 // Write the card table byte if needed.
  2202 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
  2203   CardTableModRefBS* bs = (CardTableModRefBS*) Universe::heap()->barrier_set();
  2204   assert(bs->kind() == BarrierSet::CardTableModRef ||
  2205          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
  2206 #ifdef ASSERT
  2207   cmpdi(CCR0, Rnew_val, 0);
  2208   asm_assert_ne("null oop not allowed", 0x321);
  2209 #endif
  2210   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
  2213 // Write the card table byte.
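       // Conceptually (sketch): byte_map_base[(uintptr_t)obj >> card_shift] = 0 /* dirty_card_val */.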
  2214 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
  2215   assert_different_registers(Robj, Rtmp, R0);
  2216   load_const_optimized(Rtmp, (address)byte_map_base, R0);
  2217   srdi(Robj, Robj, CardTableModRefBS::card_shift);
  2218   li(R0, 0); // dirty
  2219   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
  2220   stbx(R0, Rtmp, Robj);
  2223 // Kills R31 if value is a volatile register.
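       // Semantics (sketch, cf. JNIHandles::resolve):
       //   if (value == NULL) return NULL;
       //   oop obj = *(oop*)(value & ~weak_tag_mask);                     // strip the weak tag bit
       //   if (UseG1GC && (value & weak_tag_mask) != 0) g1_write_barrier_pre(/*pre_val=*/obj);
       //   return obj;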
  2224 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
  2225   Label done;
  2226   cmpdi(CCR0, value, 0);
  2227   beq(CCR0, done);         // Use NULL as-is.
  2229   clrrdi(tmp1, value, JNIHandles::weak_tag_size);
  2230 #if INCLUDE_ALL_GCS
  2231   if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }
  2232 #endif
  2233   ld(value, 0, tmp1);      // Resolve (untagged) jobject.
  2235 #if INCLUDE_ALL_GCS
  2236   if (UseG1GC) {
  2237     Label not_weak;
  2238     beq(CCR0, not_weak);   // Test for jweak tag.
  2239     verify_oop(value);
  2240     g1_write_barrier_pre(noreg, // obj
  2241                          noreg, // offset
  2242                          value, // pre_val
  2243                          tmp1, tmp2, needs_frame);
  2244     bind(not_weak);
  2246 #endif // INCLUDE_ALL_GCS
  2247   verify_oop(value);
  2248   bind(done);
  2251 #if INCLUDE_ALL_GCS
  2252 // General G1 pre-barrier generator.
  2253 // Goal: record the previous value if it is not null.
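       // SATB pre-barrier, roughly (illustrative sketch; queue field names are shorthand
       // for the satb_mark_queue offsets used below):
       //   if (!marking_active) return;                                   // satb queue 'active' byte
       //   pre_val = (Robj != noreg) ? *(Robj + offset) : Rpre_val;       // load old value if needed
       //   if (pre_val == NULL) return;
       //   if (queue.index != 0) { queue.index -= wordSize; *(queue.buf + queue.index) = pre_val; }
       //   else                  { call SharedRuntime::g1_wb_pre(pre_val, thread); }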
  2254 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
  2255                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
  2256   Label runtime, filtered;
  2258   // Is marking active?
  2259   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
  2260     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
  2261   } else {
  2262     guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
  2263     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
  2265   cmpdi(CCR0, Rtmp1, 0);
  2266   beq(CCR0, filtered);
  2268   // Do we need to load the previous value?
  2269   if (Robj != noreg) {
  2270     // Load the previous value...
  2271     if (UseCompressedOops) {
  2272       lwz(Rpre_val, offset, Robj);
  2273     } else {
  2274       ld(Rpre_val, offset, Robj);
  2276     // Previous value has been loaded into Rpre_val.
  2278   assert(Rpre_val != noreg, "must have a real register");
  2280   // Is the previous value null?
  2281   cmpdi(CCR0, Rpre_val, 0);
  2282   beq(CCR0, filtered);
  2284   if (Robj != noreg && UseCompressedOops) {
  2285     decode_heap_oop_not_null(Rpre_val);
  2288   // OK, it's not filtered, so we'll need to call enqueue. In the normal
  2289   // case, pre_val will be a scratch G-reg, but there are some cases in
  2290   // which it's an O-reg. In the first case, do a normal call. In the
  2291   // latter, do a save here and call the frameless version.
  2293   // Can we store original value in the thread's buffer?
  2294   // Is index == 0?
  2295   // (The index field is typed as size_t.)
  2296   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
  2298   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
  2299   cmpdi(CCR0, Rindex, 0);
  2300   beq(CCR0, runtime); // If index == 0, goto runtime.
  2301   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
  2303   addi(Rindex, Rindex, -wordSize); // Decrement index.
  2304   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
  2306   // Record the previous value.
  2307   stdx(Rpre_val, Rbuffer, Rindex);
  2308   b(filtered);
  2310   bind(runtime);
  2312   // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
  2313   if (needs_frame) {
  2314     save_LR_CR(Rtmp1);
  2315     push_frame_reg_args(0, Rtmp2);
  2318   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
  2319   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
  2320   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
  2322   if (needs_frame) {
  2323     pop_frame();
  2324     restore_LR_CR(Rtmp1);
  2327   bind(filtered);
  2330 // General G1 post-barrier generator
  2331 // Store cross-region card.
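       // G1 post-barrier, roughly (illustrative sketch; queue field names are shorthand
       // for the dirty_card_queue offsets used below):
       //   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) return;   // same region
       //   card = byte_map_base + (store_addr >> card_shift);
       //   if (*card == g1_young_card_val) return;
       //   StoreLoad; if (*card == dirty_card_val) return;                   // re-check after the membar
       //   *card = dirty_card_val;
       //   if (queue.index != 0) { queue.index -= wordSize; *(queue.buf + queue.index) = card; }
       //   else                  { call SharedRuntime::g1_wb_post(card, thread); }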
  2332 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
  2333   Label runtime, filtered_int;
  2334   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
  2335   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
  2337   G1SATBCardTableModRefBS* bs = (G1SATBCardTableModRefBS*) Universe::heap()->barrier_set();
  2338   assert(bs->kind() == BarrierSet::G1SATBCT ||
  2339          bs->kind() == BarrierSet::G1SATBCTLogging, "wrong barrier");
  2341   // Does store cross heap regions?
  2342   if (G1RSBarrierRegionFilter) {
  2343     xorr(Rtmp1, Rstore_addr, Rnew_val);
  2344     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
  2345     beq(CCR0, filtered);
  2348   // Crosses regions, storing NULL?
  2349 #ifdef ASSERT
  2350   cmpdi(CCR0, Rnew_val, 0);
  2351   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
  2352   //beq(CCR0, filtered);
  2353 #endif
  2355   // Storing region crossing non-NULL, is card already dirty?
  2356   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
  2357   const Register Rcard_addr = Rtmp1;
  2358   Register Rbase = Rtmp2;
  2359   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
  2361   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
  2363   // Get the address of the card.
  2364   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
  2365   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
  2366   beq(CCR0, filtered);
  2368   membar(Assembler::StoreLoad);
  2369   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
  2370   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
  2371   beq(CCR0, filtered);
  2373   // Storing a region crossing, non-NULL oop, card is clean.
  2374   // Dirty card and log.
  2375   li(Rtmp3, CardTableModRefBS::dirty_card_val());
  2376   //release(); // G1: oops are allowed to get visible after dirty marking.
  2377   stbx(Rtmp3, Rbase, Rcard_addr);
  2379   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
  2380   Rbase = noreg; // end of lifetime
  2382   const Register Rqueue_index = Rtmp2,
  2383                  Rqueue_buf   = Rtmp3;
  2384   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
  2385   cmpdi(CCR0, Rqueue_index, 0);
  2386   beq(CCR0, runtime); // index == 0 then jump to runtime
  2387   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
  2389   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
  2390   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
  2392   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
  2393   b(filtered);
  2395   bind(runtime);
  2397   // Save the live input values.
  2398   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
  2400   bind(filtered_int);
  2402 #endif // INCLUDE_ALL_GCS
  2404 // Values for last_Java_pc and last_Java_sp must comply with the rules
  2405 // in frame_ppc.hpp.
  2406 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  2407   // Always set last_Java_pc and flags first because once last_Java_sp
  2408   // is visible, has_last_Java_frame is true and users will look at the
  2409   // rest of the fields. (Note: flags should always be zero before we
  2410   // get here, so it doesn't need to be set.)
  2412   // Verify that last_Java_pc was zeroed on return to Java
  2413   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
  2414                           "last_Java_pc not zeroed before leaving Java", 0x200);
  2416   // When returning from calling out from Java mode the frame anchor's
  2417   // last_Java_pc will always be set to NULL. It is set here so that
  2418   // if we are doing a call to native (not VM) that we capture the
  2419   // known pc and don't have to rely on the native call having a
  2420   // standard frame linkage where we can find the pc.
  2421   if (last_Java_pc != noreg)
  2422     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
  2424   // Set last_Java_sp last.
  2425   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
  2428 void MacroAssembler::reset_last_Java_frame(void) {
  2429   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
  2430                              R16_thread, "SP was not set, still zero", 0x202);
  2432   BLOCK_COMMENT("reset_last_Java_frame {");
  2433   li(R0, 0);
  2435   // _last_Java_sp = 0
  2436   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
  2438   // _last_Java_pc = 0
  2439   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
  2440   BLOCK_COMMENT("} reset_last_Java_frame");
  2443 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
  2444   assert_different_registers(sp, tmp1);
  2446   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
  2447   // TOP_IJAVA_FRAME_ABI.
  2448   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
  2449 #ifdef CC_INTERP
  2450   ld(tmp1/*pc*/, _top_ijava_frame_abi(frame_manager_lr), sp);
  2451 #else
  2452   address entry = pc();
  2453   load_const_optimized(tmp1, entry);
  2454 #endif
  2456   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
  2459 void MacroAssembler::get_vm_result(Register oop_result) {
  2460   // Read:
  2461   //   R16_thread
  2462   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
  2463   //
  2464   // Updated:
  2465   //   oop_result
  2466   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
  2468   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
  2469   li(R0, 0);
  2470   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
  2472   verify_oop(oop_result);
  2475 void MacroAssembler::get_vm_result_2(Register metadata_result) {
  2476   // Read:
  2477   //   R16_thread
  2478   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
  2479   //
  2480   // Updated:
  2481   //   metadata_result
  2482   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
  2484   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
  2485   li(R0, 0);
  2486   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
  2490 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
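         // Encoding performed here (sketch): narrow_klass = (klass - narrow_klass_base()) >> narrow_klass_shift().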
  2491   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
  2492   if (Universe::narrow_klass_base() != 0) {
  2493     // Use dst as temp if it is free.
  2494     load_const(R0, Universe::narrow_klass_base(), (dst != current && dst != R0) ? dst : noreg);
  2495     sub(dst, current, R0);
  2496     current = dst;
  2498   if (Universe::narrow_klass_shift() != 0) {
  2499     srdi(dst, current, Universe::narrow_klass_shift());
  2500     current = dst;
  2502   mr_if_needed(dst, current); // Move may be required.
  2505 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
  2506   if (UseCompressedClassPointers) {
  2507     encode_klass_not_null(ck, klass);
  2508     stw(ck, oopDesc::klass_offset_in_bytes(), dst_oop);
  2509   } else {
  2510     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
  2514 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
  2515   if (UseCompressedClassPointers) {
  2516     if (val == noreg) {
  2517       val = R0;
  2518       li(val, 0);
  2520     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
  2524 int MacroAssembler::instr_size_for_decode_klass_not_null() {
  2525   if (!UseCompressedClassPointers) return 0;
  2526   int num_instrs = 1;  // shift or move
  2527   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
  2528   return num_instrs * BytesPerInstWord;
  2531 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
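         // Decoding performed here (sketch): klass = (narrow_klass << narrow_klass_shift()) + narrow_klass_base().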
  2532   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
  2533   if (src == noreg) src = dst;
  2534   Register shifted_src = src;
  2535   if (Universe::narrow_klass_shift() != 0 ||
  2536       Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
  2537     shifted_src = dst;
  2538     sldi(shifted_src, src, Universe::narrow_klass_shift());
  2540   if (Universe::narrow_klass_base() != 0) {
  2541     load_const(R0, Universe::narrow_klass_base());
  2542     add(dst, shifted_src, R0);
  2546 void MacroAssembler::load_klass(Register dst, Register src) {
  2547   if (UseCompressedClassPointers) {
  2548     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
  2549     // Attention: no null check here!
  2550     decode_klass_not_null(dst, dst);
  2551   } else {
  2552     ld(dst, oopDesc::klass_offset_in_bytes(), src);
  2556 void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) {
  2557   if (!os::zero_page_read_protected()) {
  2558     if (TrapBasedNullChecks) {
  2559       trap_null_check(src);
  2562   load_klass(dst, src);
  2565 void MacroAssembler::reinit_heapbase(Register d, Register tmp) {
  2566   if (Universe::heap() != NULL) {
  2567     load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp);
  2568   } else {
  2569     // Heap not yet allocated. Load indirectly.
  2570     int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true);
  2571     ld(R30, simm16_offset, R30);
  2575 // Clear Array
  2576 // Kills both input registers. tmp == R0 is allowed.
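       // Strategy (sketch): clear doublewords up to the next cache-line boundary, then clear whole
       // cache lines with dcbz (data cache block zero), then clear the remaining doublewords.
       // Requests smaller than roughly two cache lines go straight to the doubleword loop.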
  2577 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
  2578   // Procedure for large arrays (uses data cache block zero instruction).
  2579     Label startloop, fast, fastloop, small_rest, restloop, done;
  2580     const int cl_size         = VM_Version::get_cache_line_size(),
  2581               cl_dwords       = cl_size>>3,
  2582               cl_dw_addr_bits = exact_log2(cl_dwords),
  2583               dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
  2585 //2:
  2586     cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
  2587     blt(CCR1, small_rest);                                      // Too small.
  2588     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
  2589     beq(CCR0, fast);                                            // Already 128byte aligned.
  2591     subfic(tmp, tmp, cl_dwords);
  2592     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
  2593     subf(cnt_dwords, tmp, cnt_dwords); // rest.
  2594     li(tmp, 0);
  2595 //10:
  2596   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
  2597     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  2598     addi(base_ptr, base_ptr, 8);
  2599     bdnz(startloop);
  2600 //13:
  2601   bind(fast);                                  // Clear 128byte blocks.
  2602     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
  2603     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
  2604     mtctr(tmp);                                // Load counter.
  2605 //16:
  2606   bind(fastloop);
  2607     dcbz(base_ptr);                    // Clear 128byte aligned block.
  2608     addi(base_ptr, base_ptr, cl_size);
  2609     bdnz(fastloop);
  2610     if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
  2611 //20:
  2612   bind(small_rest);
  2613     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
  2614     beq(CCR0, done);                   // rest == 0
  2615     li(tmp, 0);
  2616     mtctr(cnt_dwords);                 // Load counter.
  2617 //24:
  2618   bind(restloop);                      // Clear rest.
  2619     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  2620     addi(base_ptr, base_ptr, 8);
  2621     bdnz(restloop);
  2622 //27:
  2623   bind(done);
  2626 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
  2628 // Search for a single jchar in a jchar[].
  2629 //
  2630 // Assumes that result differs from all other registers.
  2631 //
  2632 // Haystack, needle are the addresses of jchar-arrays.
  2633 // NeedleChar is needle[0] if it is known at compile time.
  2634 // Haycnt is the length of the haystack. We assume haycnt >=1.
  2635 //
  2636 // Preserves haystack, haycnt, kills all other registers.
  2637 //
  2638 // If needle == R0, we search for the constant needleChar.
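       // Equivalent Java-level loop (illustrative sketch; the emitted loop is 2x unrolled):
       //   char ch = (needle != R0) ? needle[0] : needleChar;
       //   for (int i = 0; i < haycnt; i++) { if (haystack[i] == ch) return i; }   // result = i
       //   return -1;                                                              // result = -1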
  2639 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
  2640                                       Register needle, jchar needleChar,
  2641                                       Register tmp1, Register tmp2) {
  2643   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
  2645   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
  2646   Register needle0 = needle, // Contains needle[0].
  2647            addr = tmp1,
  2648            ch1 = tmp2,
  2649            ch2 = R0;
  2651 //2 (variable) or 3 (const):
  2652    if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
  2653    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
  2655    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
  2656    mr(addr, haystack);
  2657    beq(CCR0, L_FinalCheck);
  2658    mtctr(tmp2);              // Move to count register.
  2659 //8:
  2660   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
  2661    lhz(ch1, 0, addr);        // Load characters from haystack.
  2662    lhz(ch2, 2, addr);
  2663    (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
  2664    (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
  2665    beq(CCR0, L_Found1);   // Did we find the needle?
  2666    beq(CCR1, L_Found2);
  2667    addi(addr, addr, 4);
  2668    bdnz(L_InnerLoop);
  2669 //16:
  2670   bind(L_FinalCheck);
  2671    andi_(R0, haycnt, 1);
  2672    beq(CCR0, L_NotFound);
  2673    lhz(ch1, 0, addr);        // One position left at which we have to compare.
  2674    (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
  2675    beq(CCR1, L_Found3);
  2676 //21:
  2677   bind(L_NotFound);
  2678    li(result, -1);           // Not found.
  2679    b(L_End);
  2681   bind(L_Found2);
  2682    addi(addr, addr, 2);
  2683 //24:
  2684   bind(L_Found1);
  2685   bind(L_Found3);                  // Return index ...
  2686    subf(addr, haystack, addr); // relative to haystack,
  2687    srdi(result, addr, 1);      // in characters.
  2688   bind(L_End);
  2692 // Implementation of IndexOf for jchar arrays.
  2693 //
  2694 // The lengths of haystack and needle are not constant, i.e. they are passed in registers.
  2695 //
  2696 // Preserves registers haystack, needle.
  2697 // Kills registers haycnt, needlecnt.
  2698 // Assumes that result differs from all other registers.
  2699 // Haystack, needle are the addresses of jchar-arrays.
  2700 // Haycnt, needlecnt are the lengths of them, respectively.
  2701 //
  2702 // Needlecntval must be zero or a 15-bit unsigned immediate greater than 1.
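       // Roughly equivalent C sketch (an illustrative assumption, not the emitted instruction sequence):
       //   jint string_indexof(const jchar* haystack, jint haycnt,
       //                       const jchar* needle, jint needlecnt) {
       //     for (jint i = 0; i + needlecnt <= haycnt; i++) {
       //       jint j = 0;
       //       while (j < needlecnt && haystack[i + j] == needle[j]) j++;
       //       if (j == needlecnt) return i;
       //     }
       //     return -1;
       //   }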
  2703 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
  2704                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
  2705                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
  2707   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
  2708   Label L_TooShort, L_Found, L_NotFound, L_End;
  2709   Register last_addr = haycnt, // Kill haycnt at the beginning.
  2710            addr      = tmp1,
  2711            n_start   = tmp2,
  2712            ch1       = tmp3,
  2713            ch2       = R0;
  2715   // **************************************************************************************************
  2716   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
  2717   // **************************************************************************************************
  2719 //1 (variable) or 3 (const):
  2720    dcbtct(needle, 0x00);    // Indicate R/O access to needle.
  2721    dcbtct(haystack, 0x00);  // Indicate R/O access to haystack.
  2723   // Compute last haystack addr to use if no match gets found.
  2724   if (needlecntval == 0) { // variable needlecnt
  2725 //3:
  2726    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
  2727    addi(addr, haystack, -2);          // Accesses use pre-increment.
  2728    cmpwi(CCR6, needlecnt, 2);
  2729    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
  2730    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
  2731    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
  2732    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
  2733    addi(needlecnt, needlecnt, -2);    // Rest of needle.
  2734   } else { // constant needlecnt
  2735   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
  2736   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
  2737 //5:
  2738    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
  2739    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
  2740    addi(addr, haystack, -2);          // Accesses use pre-increment.
  2741    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
  2742    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
  2743    li(needlecnt, needlecntval-2);     // Rest of needle.
  2746   // Main Loop (now we have at least 3 characters).
  2747 //11:
  2748   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
  2749   bind(L_OuterLoop); // Search for 1st 2 characters.
  2750   Register addr_diff = tmp4;
  2751    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
  2752    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
  2753    srdi_(ch2, addr_diff, 2);
  2754    beq(CCR0, L_FinalCheck);       // 2 characters left?
  2755    mtctr(ch2);                       // addr_diff/4
  2756 //16:
  2757   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
  2758    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
  2759    lwz(ch2, 2, addr);
  2760    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
  2761    cmpw(CCR1, ch2, n_start);
  2762    beq(CCR0, L_Comp1);       // Did we find the needle start?
  2763    beq(CCR1, L_Comp2);
  2764    addi(addr, addr, 4);
  2765    bdnz(L_InnerLoop);
  2766 //24:
  2767   bind(L_FinalCheck);
  2768    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
  2769    beq(CCR0, L_NotFound);
  2770    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
  2771    cmpw(CCR1, ch1, n_start);
  2772    beq(CCR1, L_Comp3);
  2773 //29:
  2774   bind(L_NotFound);
  2775    li(result, -1); // not found
  2776    b(L_End);
  2779    // **************************************************************************************************
  2780    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
  2781    // **************************************************************************************************
  2782 //31:
  2783  if ((needlecntval>>1) !=1 ) { // Skip this block if const needlecnt is 2 or 3 (reduces code size).
  2784   int nopcnt = 5;
  2785   if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below).
  2786   if (needlecntval == 0) {         // We have to handle these cases separately.
  2787   Label L_OneCharLoop;
  2788   bind(L_TooShort);
  2789    mtctr(haycnt);
  2790    lhz(n_start, 0, needle);    // First character of needle
  2791   bind(L_OneCharLoop);
  2792    lhzu(ch1, 2, addr);
  2793    cmpw(CCR1, ch1, n_start);
  2794    beq(CCR1, L_Found);      // Did we find the one character needle?
  2795    bdnz(L_OneCharLoop);
  2796    li(result, -1);             // Not found.
  2797    b(L_End);
  2798   } // 8 instructions, so no impact on alignment.
  2799   for (int x = 0; x < nopcnt; ++x) nop();
  2802   // **************************************************************************************************
  2803   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
  2804   // **************************************************************************************************
  2806   // Compare the rest
  2807 //36 if needlecntval==0, else 37:
  2808   bind(L_Comp2);
  2809    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
  2810   bind(L_Comp1);            // Addr points to possible needle start.
  2811   bind(L_Comp3);            // Could have created a copy and used a different return address, but we save code size here.
  2812   if (needlecntval != 2) {  // Const needlecnt==2?
  2813    if (needlecntval != 3) {
  2814     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
  2815     Register ind_reg = tmp4;
  2816     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
  2817     mtctr(needlecnt);   // Decremented by 2, still > 0.
  2818 //40:
  2819    Label L_CompLoop;
  2820    bind(L_CompLoop);
  2821     lhzx(ch2, needle, ind_reg);
  2822     lhzx(ch1, addr, ind_reg);
  2823     cmpw(CCR1, ch1, ch2);
  2824     bne(CCR1, L_OuterLoop);
  2825     addi(ind_reg, ind_reg, 2);
  2826     bdnz(L_CompLoop);
  2827    } else { // No loop required if there's only one needle character left.
  2828     lhz(ch2, 2*2, needle);
  2829     lhz(ch1, 2*2, addr);
  2830     cmpw(CCR1, ch1, ch2);
  2831     bne(CCR1, L_OuterLoop);
  2834   // Return index ...
  2835 //46:
  2836   bind(L_Found);
  2837    subf(addr, haystack, addr); // relative to haystack, ...
  2838    srdi(result, addr, 1);      // in characters.
  2839 //48:
  2840   bind(L_End);
  2843 // Implementation of Compare for jchar arrays.
  2844 //
  2845 // Kills the registers str1, str2, cnt1, cnt2.
  2846 // Kills cr0, ctr.
  2847 // Assumes that result differs from the input registers.
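       // Roughly equivalent C sketch (an illustrative assumption, not the emitted instruction sequence):
       //   jint string_compare(const jchar* str1, jint cnt1, const jchar* str2, jint cnt2) {
       //     jint min_len = (cnt1 < cnt2) ? cnt1 : cnt2;
       //     for (jint i = 0; i < min_len; i++) {
       //       if (str1[i] != str2[i]) return (jint)str1[i] - (jint)str2[i];
       //     }
       //     return cnt1 - cnt2;
       //   }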
  2848 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
  2849                                     Register result_reg, Register tmp_reg) {
  2850    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
  2852    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
  2853    Register cnt_diff = R0,
  2854             limit_reg = cnt1_reg,
  2855             chr1_reg = result_reg,
  2856             chr2_reg = cnt2_reg,
  2857             addr_diff = str2_reg;
  2859    // Offset 0 should be 32 byte aligned.
  2860 //-4:
  2861     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
  2862     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
  2863 //-2:
  2864    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
  2865     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
  2866     subf_(addr_diff, str1_reg, str2_reg);  // alias?
  2867     beq(CCR0, Ldone);                   // return cnt difference if both strings alias (identical addresses)
  2868     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
  2869     mr(cnt_diff, result_reg);
  2870     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
  2871     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
  2872     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
  2874     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
  2875     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
  2876     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
  2877     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
  2878     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
  2880    // Set loop counter by scaling down tmp_reg
  2881     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
  2882     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
  2883     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
  2885    // Adapt str1_reg str2_reg for the first loop iteration
  2886     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
  2887     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
  2888 //16:
  2889    // Compare the rest of the characters
  2890    bind(Lfast_loop);
  2891     ld(chr1_reg, 0, str1_reg);
  2892     ldx(chr2_reg, str1_reg, addr_diff);
  2893     cmpd(CCR0, chr2_reg, chr1_reg);
  2894     bne(CCR0, Lslow_case); // return chr1_reg
  2895     addi(str1_reg, str1_reg, 4*2);
  2896     bdnz(Lfast_loop);
  2897     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
  2898 //23:
  2899    bind(Lslow_case);
  2900     mtctr(limit_reg);
  2901 //24:
  2902    bind(Lslow_loop);
  2903     lhz(chr1_reg, 0, str1_reg);
  2904     lhzx(chr2_reg, str1_reg, addr_diff);
  2905     subf_(result_reg, chr2_reg, chr1_reg);
  2906     bne(CCR0, Ldone); // return chr1_reg
  2907     addi(str1_reg, str1_reg, 1*2);
  2908     bdnz(Lslow_loop);
  2909 //30:
  2910    // If strings are equal up to min length, return the length difference.
  2911     mr(result_reg, cnt_diff);
  2912     nop(); // alignment
  2913 //32:
  2914    // Otherwise, return the difference between the first mismatched chars.
  2915    bind(Ldone);
  2919 // Compare char[] arrays.
  2920 //
  2921 // str1_reg   USE only
  2922 // str2_reg   USE only
  2923 // cnt_reg    USE_DEF, due to tmp reg shortage
  2924 // result_reg DEF only, might compromise USE only registers
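       // Roughly equivalent C sketch (an illustrative assumption, not the emitted instruction sequence):
       //   jint char_arrays_equals(const jchar* str1, const jchar* str2, jint cnt) {
       //     for (jint i = 0; i < cnt; i++) {
       //       if (str1[i] != str2[i]) return 0;
       //     }
       //     return 1;  // result_reg: 1 == equal, 0 == not equal
       //   }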
  2925 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
  2926                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
  2927                                         Register tmp5_reg) {
  2929   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
  2930   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
  2931   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
  2933   // Offset 0 should be 32 byte aligned.
  2934   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
  2935   Register index_reg = tmp5_reg;
  2936   Register cbc_iter  = tmp4_reg;
  2938 //-1:
  2939   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
  2940   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
  2941 //1:
  2942   andi(cbc_iter, cnt_reg, 4-1);            // Remaining iterations after 4 java characters per iteration loop.
  2943   li(index_reg, 0); // init
  2944   li(result_reg, 0); // assume false
  2945   srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).
  2947   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
  2948   beq(CCR0, Linit_cbc);                 // too short
  2949     mtctr(tmp2_reg);
  2950 //8:
  2951     bind(Lloop);
  2952       ldx(tmp1_reg, str1_reg, index_reg);
  2953       ldx(tmp2_reg, str2_reg, index_reg);
  2954       cmpd(CCR0, tmp1_reg, tmp2_reg);
  2955       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
  2956       addi(index_reg, index_reg, 4*sizeof(jchar));
  2957       bdnz(Lloop);
  2958 //14:
  2959   bind(Linit_cbc);
  2960   beq(CCR1, Ldone_true);
  2961     mtctr(cbc_iter);
  2962 //16:
  2963     bind(Lcbc);
  2964       lhzx(tmp1_reg, str1_reg, index_reg);
  2965       lhzx(tmp2_reg, str2_reg, index_reg);
  2966       cmpw(CCR0, tmp1_reg, tmp2_reg);
  2967       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
  2968       addi(index_reg, index_reg, 1*sizeof(jchar));
  2969       bdnz(Lcbc);
  2970     nop();
  2971   bind(Ldone_true);
  2972   li(result_reg, 1);
  2973 //24:
  2974   bind(Ldone_false);
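       // Compare char[] arrays whose length is the compile-time constant cntval.
       // For cntval < 16 the comparison is fully unrolled using 8/4/2-byte loads;
       // otherwise a loop compares 4 java characters per iteration, followed by an
       // unrolled tail. Sets result_reg to 1 if the arrays are equal, 0 otherwise.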
  2978 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
  2979                                            Register tmp1_reg, Register tmp2_reg) {
  2980   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
  2981   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
  2982   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
  2983   assert(sizeof(jchar) == 2, "must be");
  2984   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
  2986   Label Ldone_false;
  2988   if (cntval < 16) { // short case
  2989     if (cntval != 0) li(result_reg, 0); // assume false
  2991     const int num_bytes = cntval*sizeof(jchar);
  2992     int index = 0;
  2993     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
  2994       ld(tmp1_reg, index, str1_reg);
  2995       ld(tmp2_reg, index, str2_reg);
  2996       cmpd(CCR0, tmp1_reg, tmp2_reg);
  2997       bne(CCR0, Ldone_false);
  2999     if (cntval & 2) {
  3000       lwz(tmp1_reg, index, str1_reg);
  3001       lwz(tmp2_reg, index, str2_reg);
  3002       cmpw(CCR0, tmp1_reg, tmp2_reg);
  3003       bne(CCR0, Ldone_false);
  3004       index += 4;
  3006     if (cntval & 1) {
  3007       lhz(tmp1_reg, index, str1_reg);
  3008       lhz(tmp2_reg, index, str2_reg);
  3009       cmpw(CCR0, tmp1_reg, tmp2_reg);
  3010       bne(CCR0, Ldone_false);
  3012     // fallthrough: true
  3013   } else {
  3014     Label Lloop;
  3015     Register index_reg = tmp1_reg;
  3016     const int loopcnt = cntval/4;
  3017     assert(loopcnt > 0, "must be");
  3018     // Offset 0 should be 32 byte aligned.
  3019     //2:
  3020     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
  3021     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
  3022     li(tmp2_reg, loopcnt);
  3023     li(index_reg, 0); // init
  3024     li(result_reg, 0); // assume false
  3025     mtctr(tmp2_reg);
  3026     //8:
  3027     bind(Lloop);
  3028     ldx(R0, str1_reg, index_reg);
  3029     ldx(tmp2_reg, str2_reg, index_reg);
  3030     cmpd(CCR0, R0, tmp2_reg);
  3031     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
  3032     addi(index_reg, index_reg, 4*sizeof(jchar));
  3033     bdnz(Lloop);
  3034     //14:
  3035     if (cntval & 2) {
  3036       lwzx(R0, str1_reg, index_reg);
  3037       lwzx(tmp2_reg, str2_reg, index_reg);
  3038       cmpw(CCR0, R0, tmp2_reg);
  3039       bne(CCR0, Ldone_false);
  3040       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
  3042     if (cntval & 1) {
  3043       lhzx(R0, str1_reg, index_reg);
  3044       lhzx(tmp2_reg, str2_reg, index_reg);
  3045       cmpw(CCR0, R0, tmp2_reg);
  3046       bne(CCR0, Ldone_false);
  3048     // fallthru: true
  3050   li(result_reg, 1);
  3051   bind(Ldone_false);
  3054 // Helpers for Intrinsic Emitters
  3055 //
  3056 // Revert the byte order of a 32bit value in a register
  3057 //   src: 0x44556677
  3058 //   dst: 0x77665544
  3059 // Three steps to obtain the result:
  3060 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
  3061 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
  3062 //     This value initializes dst.
  3063 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
  3064 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
  3065 //     This value is mask inserted into dst with a [0..23] mask of 1s.
  3066 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
  3067 //     This value is mask inserted into dst with a [8..15] mask of 1s.
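       // Worked example (intermediate values for the three steps above, shown for illustration):
       //   src = 0x44556677
       //   after step 1: dst = 0x00000044
       //   after step 2: dst = 0x77445544
       //   after step 3: dst = 0x77665544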
  3068 void MacroAssembler::load_reverse_32(Register dst, Register src) {
  3069   assert_different_registers(dst, src);
  3071   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  3072   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
  3073   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
  3076 // Calculate the column addresses of the crc32 lookup table into distinct registers.
  3077 // This loop-invariant calculation is moved out of the loop body, reducing the loop
  3078 // body size from 20 to 16 instructions.
  3079 // Returns the offset that was used to calculate the address of column tc3.
  3080 // Due to register shortage, setting tc3 may overwrite table. With the return offset
  3081 // at hand, the original table address can be easily reconstructed.
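       // Table layout sketch (an assumption for illustration: 8 columns of CRC32_COLUMN_SIZE
       // 32-bit entries each, so the byte offset of column 'col' is col*(4*CRC32_COLUMN_SIZE)):
       //   uint32_t* column_base = (uint32_t*)((char*)table + col * 4 * CRC32_COLUMN_SIZE);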
  3082 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
  3084 #ifdef VM_LITTLE_ENDIAN
  3085   // This is what we implement (the DOLIT4 part):
  3086   // ========================================================================= */
  3087   // #define DOLIT4 c ^= *buf4++; \
  3088   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
  3089   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
  3090   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
  3091   // ========================================================================= */
  3092   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
  3093   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
  3094   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
  3095   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
  3096 #else
  3097   // This is what we implement (the DOBIG4 part):
  3098   // =========================================================================
  3099   // #define DOBIG4 c ^= *++buf4; \
  3100   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
  3101   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
  3102   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
  3103   // =========================================================================
  3104   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
  3105   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
  3106   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
  3107   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
  3108 #endif
  3109   assert_different_registers(table, tc0, tc1, tc2);
  3110   assert(table == tc3, "must be!");
  3112   if (ix0 != 0) addi(tc0, table, ix0);
  3113   if (ix1 != 0) addi(tc1, table, ix1);
  3114   if (ix2 != 0) addi(tc2, table, ix2);
  3115   if (ix3 != 0) addi(tc3, table, ix3);
  3117   return ix3;
  3120 /**
  3121  * uint32_t crc;
  3122  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
  3123  */
  3124 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
  3125   assert_different_registers(crc, table, tmp);
  3126   assert_different_registers(val, table);
  3128   if (crc == val) {                   // Must rotate first to use the unmodified value.
  3129     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
  3130                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
  3131     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
  3132   } else {
  3133     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
  3134     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
  3136   lwzx(tmp, table, tmp);
  3137   xorr(crc, crc, tmp);
  3140 /**
  3141  * uint32_t crc;
  3142  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
  3143  */
  3144 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  3145   fold_byte_crc32(crc, crc, table, tmp);
  3148 /**
  3149  * Emits code to update CRC-32 with a byte value according to constants in table.
  3151  * @param [in,out]crc   Register containing the crc.
  3152  * @param [in]val       Register containing the byte to fold into the CRC.
  3153  * @param [in]table     Register containing the table of crc constants.
  3155  * uint32_t crc;
  3156  * val = crc_table[(val ^ crc) & 0xFF];
  3157  * crc = val ^ (crc >> 8);
  3158  */
  3159 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  3160   BLOCK_COMMENT("update_byte_crc32:");
  3161   xorr(val, val, crc);
  3162   fold_byte_crc32(crc, val, table, val);
  3165 /**
  3166  * @param crc   register containing existing CRC (32-bit)
  3167  * @param buf   register pointing to input byte buffer (byte*)
  3168  * @param len   register containing number of bytes
  3169  * @param table register pointing to CRC table
  3170  */
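       // C sketch of the emitted byte loop (illustrative; the optional ~crc pre/post
       // inversion is controlled by invertCRC):
       //   while (len-- > 0) {
       //     crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
       //   }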
  3171 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
  3172                                            Register data, bool loopAlignment, bool invertCRC) {
  3173   assert_different_registers(crc, buf, len, table, data);
  3175   Label L_mainLoop, L_done;
  3176   const int mainLoop_stepping  = 1;
  3177   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
  3179   // Process all bytes in a single-byte loop.
  3180   cmpdi(CCR0, len, 0);                           // Anything to do?
  3181   mtctr(len);
  3182   beq(CCR0, L_done);
  3184   if (invertCRC) {
  3185     nand(crc, crc, crc);                         // ~c
  3188   align(mainLoop_alignment);
  3189   BIND(L_mainLoop);
  3190     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
  3191     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
  3192     update_byte_crc32(crc, data, table);
  3193     bdnz(L_mainLoop);                            // Iterate.
  3195   if (invertCRC) {
  3196     nand(crc, crc, crc);                         // ~c
  3199   bind(L_done);
  3202 /**
  3203  * Emits code to update CRC-32 with a 4-byte value according to constants in table
  3204  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
  3205  */
  3206 // A note on the lookup table address(es):
  3207 // The lookup table consists of two sets of four columns each.
  3208 // The columns {0..3} are used for little-endian machines.
  3209 // The columns {4..7} are used for big-endian machines.
  3210 // To save the effort of adding the column offset to the table address each time
  3211 // a table element is looked up, it is possible to pass the pre-calculated
  3212 // column addresses.
  3213 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
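       // C sketch of one 4-byte step on little-endian (this mirrors the DOLIT4 macro quoted
       // in crc32_table_columns; shown here only for illustration):
       //   c ^= *buf32++;
       //   c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^
       //       crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24];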
  3214 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
  3215                                         Register t0,  Register t1,  Register t2,  Register t3,
  3216                                         Register tc0, Register tc1, Register tc2, Register tc3) {
  3217   assert_different_registers(crc, t3);
  3219   // XOR crc with next four bytes of buffer.
  3220   lwz(t3, bufDisp, buf);
  3221   if (bufInc != 0) {
  3222     addi(buf, buf, bufInc);
  3224   xorr(t3, t3, crc);
  3226   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  3227   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
  3228   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
  3229   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
  3230   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
  3232   // Use the pre-calculated column addresses.
  3233   // Load pre-calculated table values.
  3234   lwzx(t0, tc0, t0);
  3235   lwzx(t1, tc1, t1);
  3236   lwzx(t2, tc2, t2);
  3237   lwzx(t3, tc3, t3);
  3239   // Calculate new crc from table values.
  3240   xorr(t0,  t0, t1);
  3241   xorr(t2,  t2, t3);
  3242   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
  3245 /**
  3246  * @param crc   register containing existing CRC (32-bit)
  3247  * @param buf   register pointing to input byte buffer (byte*)
  3248  * @param len   register containing number of bytes
  3249  * @param table register pointing to CRC table
  3251  * Uses R9..R12 as work registers. Must be saved/restored by caller!
  3252  */
  3253 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
  3254                                         Register t0,  Register t1,  Register t2,  Register t3,
  3255                                         Register tc0, Register tc1, Register tc2, Register tc3) {
  3256   assert_different_registers(crc, buf, len, table);
  3258   Label L_mainLoop, L_tail;
  3259   Register  tmp  = t0;
  3260   Register  data = t0;
  3261   Register  tmp2 = t1;
  3262   const int mainLoop_stepping  = 8;
  3263   const int tailLoop_stepping  = 1;
  3264   const int log_stepping       = exact_log2(mainLoop_stepping);
  3265   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  3266   const int complexThreshold   = 2*mainLoop_stepping;
  3268   // Don't test for len <= 0 here. This pathological case should not occur anyway.
  3269   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  3270   // The situation itself is detected and handled correctly by the conditional branches
  3271   // following  aghi(len, -stepping) and aghi(len, +stepping).
  3272   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
  3274   BLOCK_COMMENT("kernel_crc32_2word {");
  3276   nand(crc, crc, crc);                           // ~c
  3278   // Check for short (<mainLoop_stepping) buffer.
  3279   cmpdi(CCR0, len, complexThreshold);
  3280   blt(CCR0, L_tail);
  3282   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  3283   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  3285     // Align buf addr to mainLoop_stepping boundary.
  3286     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
  3287     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
  3289     if (complexThreshold > mainLoop_stepping) {
  3290       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
  3291     } else {
  3292       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
  3293       cmpdi(CCR0, tmp, mainLoop_stepping);
  3294       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
  3295       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
  3297     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
  3300   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
  3301   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
  3302   mtctr(tmp2);
  3304 #ifdef VM_LITTLE_ENDIAN
  3305   Register crc_rv = crc;
  3306 #else
  3307   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
  3308                                                  // Occupies tmp, but frees up crc.
  3309   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
  3310   tmp = crc;
  3311 #endif
  3313   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
  3315   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
  3316   BIND(L_mainLoop);
  3317     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  3318     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  3319     bdnz(L_mainLoop);
  3321 #ifndef VM_LITTLE_ENDIAN
  3322   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
  3323   tmp = crc_rv;                                  // Tmp uses its original register again.
  3324 #endif
  3326   // Restore original table address for tailLoop.
  3327   if (reconstructTableOffset != 0) {
  3328     addi(table, table, -reconstructTableOffset);
  3331   // Process last few (<complexThreshold) bytes of buffer.
  3332   BIND(L_tail);
  3333   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
  3335   nand(crc, crc, crc);                           // ~c
  3336   BLOCK_COMMENT("} kernel_crc32_2word");
  3339 /**
  3340  * @param crc   register containing existing CRC (32-bit)
  3341  * @param buf   register pointing to input byte buffer (byte*)
  3342  * @param len   register containing number of bytes
  3343  * @param table register pointing to CRC table
  3345  * Uses R9..R12 as work registers. Must be saved/restored by caller!
  3346  */
  3347 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
  3348                                         Register t0,  Register t1,  Register t2,  Register t3,
  3349                                         Register tc0, Register tc1, Register tc2, Register tc3) {
  3350   assert_different_registers(crc, buf, len, table);
  3352   Label L_mainLoop, L_tail;
  3353   Register  tmp          = t0;
  3354   Register  data         = t0;
  3355   Register  tmp2         = t1;
  3356   const int mainLoop_stepping  = 4;
  3357   const int tailLoop_stepping  = 1;
  3358   const int log_stepping       = exact_log2(mainLoop_stepping);
  3359   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  3360   const int complexThreshold   = 2*mainLoop_stepping;
  3362   // Don't test for len <= 0 here. This pathological case should not occur anyway.
  3363   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  3364   // The situation itself is detected and handled correctly by the conditional branches
  3365   // following  aghi(len, -stepping) and aghi(len, +stepping).
  3366   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
  3368   BLOCK_COMMENT("kernel_crc32_1word {");
  3370   nand(crc, crc, crc);                           // ~c
  3372   // Check for short (<mainLoop_stepping) buffer.
  3373   cmpdi(CCR0, len, complexThreshold);
  3374   blt(CCR0, L_tail);
  3376   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  3377   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  3379     // Align buf addr to mainLoop_stepping boundary.
  3380     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
  3381     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
  3383     if (complexThreshold > mainLoop_stepping) {
  3384       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
  3385     } else {
  3386       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
  3387       cmpdi(CCR0, tmp, mainLoop_stepping);
  3388       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
  3389       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
  3391     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
  3394   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
  3395   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
  3396   mtctr(tmp2);
  3398 #ifdef VM_LITTLE_ENDIAN
  3399   Register crc_rv = crc;
  3400 #else
  3401   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
  3402                                                  // Occupies tmp, but frees up crc.
  3403   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
  3404   tmp = crc;
  3405 #endif
  3407   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
  3409   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
  3410   BIND(L_mainLoop);
  3411     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  3412     bdnz(L_mainLoop);
  3414 #ifndef VM_LITTLE_ENDIAN
  3415   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
  3416   tmp = crc_rv;                                  // Tmp uses its original register again.
  3417 #endif
  3419   // Restore original table address for tailLoop.
  3420   if (reconstructTableOffset != 0) {
  3421     addi(table, table, -reconstructTableOffset);
  3424   // Process last few (<complexThreshold) bytes of buffer.
  3425   BIND(L_tail);
  3426   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
  3428   nand(crc, crc, crc);                           // ~c
  3429   BLOCK_COMMENT("} kernel_crc32_1word");
  3432 /**
  3433  * @param crc   register containing existing CRC (32-bit)
  3434  * @param buf   register pointing to input byte buffer (byte*)
  3435  * @param len   register containing number of bytes
  3436  * @param table register pointing to CRC table
  3438  * Uses R7_ARG5, R8_ARG6 as work registers.
  3439  */
  3440 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
  3441                                         Register t0,  Register t1,  Register t2,  Register t3) {
  3442   assert_different_registers(crc, buf, len, table);
  3444   Register  data = t0;                   // Holds the current byte to be folded into crc.
  3446   BLOCK_COMMENT("kernel_crc32_1byte {");
  3448   // Process all bytes in a single-byte loop.
  3449   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
  3451   BLOCK_COMMENT("} kernel_crc32_1byte");
  3454 /**
  3455  * @param crc             register containing existing CRC (32-bit)
  3456  * @param buf             register pointing to input byte buffer (byte*)
  3457  * @param len             register containing number of bytes
  3458  * @param table           register pointing to CRC table
  3459  * @param constants       register pointing to CRC table for 128-bit aligned memory
  3460  * @param barretConstants register pointing to table for barrett reduction
  3461  * @param t0              volatile register
  3462  * @param t1              volatile register
  3463  * @param t2              volatile register
  3464  * @param t3              volatile register
  3465  */
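       // Outline of the steps below (descriptive summary):
       //   1. Buffers shorter than 384 bits fall back to kernel_crc32_1word.
       //   2. Bytes up to the first 128-bit aligned address are folded in byte-wise.
       //   3. The 128-bit aligned middle section is processed with vpmsumd
       //      (kernel_crc32_1word_aligned).
       //   4. The remaining tail bytes are folded in byte-wise again.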
  3466 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
  3467                         Register constants,  Register barretConstants,
  3468                         Register t0,  Register t1, Register t2, Register t3, Register t4) {
  3469   assert_different_registers(crc, buf, len, table);
  3471   Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
  3473   Register  prealign     = t0;
  3474   Register  postalign    = t0;
  3476   BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");
  3478   // 1. use kernel_crc32_1word for shorter than 384bit
  3479   clrldi(len, len, 32);
  3480   cmpdi(CCR0, len, 384);
  3481   bge(CCR0, L_start);
  3483     Register tc0 = t4;
  3484     Register tc1 = constants;
  3485     Register tc2 = barretConstants;
  3486     kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table);
  3487     b(L_end);
  3489   BIND(L_start);
  3491     // 2. ~c
  3492     nand(crc, crc, crc);
  3494     // 3. calculate from 0 to first 128bit-aligned address
  3495     clrldi_(prealign, buf, 57);
  3496     beq(CCR0, L_alignedHead);
  3498     subfic(prealign, prealign, 128);
  3500     subf(len, prealign, len);
  3501     update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
  3503     // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
  3504     BIND(L_alignedHead);
  3506     clrldi(postalign, len, 57);
  3507     subf(len, postalign, len);
  3509     // len must be more than 256bit
  3510     kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
  3512     // 5. calculate remaining
  3513     cmpdi(CCR0, postalign, 0);
  3514     beq(CCR0, L_tail);
  3516     update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
  3518     BIND(L_tail);
  3520     // 6. ~c
  3521     nand(crc, crc, crc);
  3523   BIND(L_end);
  3525   BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
  3528 /**
  3529  * @param crc             register containing existing CRC (32-bit)
  3530  * @param buf             register pointing to input byte buffer (byte*)
  3531  * @param len             register containing number of bytes
  3532  * @param constants       register pointing to CRC table for 128-bit aligned memory
  3533  * @param barretConstants register pointing to table for barrett reduction
  3534  * @param t0              volatile register
  3535  * @param t1              volatile register
  3536  * @param t2              volatile register
  3537  */
  3538 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
  3539     Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
  3540   Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
  3541   Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
  3542   Label L_1, L_2, L_3, L_4;
  3544   Register  rLoaded      = t0;
  3545   Register  rTmp1        = t1;
  3546   Register  rTmp2        = t2;
  3547   Register  off16        = R22;
  3548   Register  off32        = R23;
  3549   Register  off48        = R24;
  3550   Register  off64        = R25;
  3551   Register  off80        = R26;
  3552   Register  off96        = R27;
  3553   Register  off112       = R28;
  3554   Register  rIdx         = R29;
  3555   Register  rMax         = R30;
  3556   Register  constantsPos = R31;
  3558   VectorRegister mask_32bit = VR24;
  3559   VectorRegister mask_64bit = VR25;
  3560   VectorRegister zeroes     = VR26;
  3561   VectorRegister const1     = VR27;
  3562   VectorRegister const2     = VR28;
  3564   // Save non-volatile vector registers (frameless).
  3565   Register offset = t1;   int offsetInt = 0;
  3566   offsetInt -= 16; li(offset, -16);           stvx(VR20, offset, R1_SP);
  3567   offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
  3568   offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
  3569   offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
  3570   offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
  3571   offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
  3572   offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
  3573   offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
  3574   offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
  3575   offsetInt -= 8; std(R22, offsetInt, R1_SP);
  3576   offsetInt -= 8; std(R23, offsetInt, R1_SP);
  3577   offsetInt -= 8; std(R24, offsetInt, R1_SP);
  3578   offsetInt -= 8; std(R25, offsetInt, R1_SP);
  3579   offsetInt -= 8; std(R26, offsetInt, R1_SP);
  3580   offsetInt -= 8; std(R27, offsetInt, R1_SP);
  3581   offsetInt -= 8; std(R28, offsetInt, R1_SP);
  3582   offsetInt -= 8; std(R29, offsetInt, R1_SP);
  3583   offsetInt -= 8; std(R30, offsetInt, R1_SP);
  3584   offsetInt -= 8; std(R31, offsetInt, R1_SP);
  3586   // Set constants
  3587   li(off16, 16);
  3588   li(off32, 32);
  3589   li(off48, 48);
  3590   li(off64, 64);
  3591   li(off80, 80);
  3592   li(off96, 96);
  3593   li(off112, 112);
  3595   clrldi(crc, crc, 32);
  3597   vxor(zeroes, zeroes, zeroes);
  3598   vspltisw(VR0, -1);
  3600   vsldoi(mask_32bit, zeroes, VR0, 4);
  3601   vsldoi(mask_64bit, zeroes, VR0, 8);
  3603   // Get the initial value into VR8
  3604   vxor(VR8, VR8, VR8);
  3605   mtvrd(VR8, crc);
  3606   vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits
  3608   li (rLoaded, 0);
  3610   rldicr(rIdx, len, 0, 56);
  3613     BIND(L_1);
  3614     // Checksum in blocks of MAX_SIZE (32768)
  3615     lis(rMax, 0);
  3616     ori(rMax, rMax, 32768);
  3617     mr(rTmp2, rMax);
  3618     cmpd(CCR0, rIdx, rMax);
  3619     bgt(CCR0, L_2);
  3620     mr(rMax, rIdx);
  3622     BIND(L_2);
  3623     subf(rIdx, rMax, rIdx);
  3625     // our main loop does 128 bytes at a time
  3626     srdi(rMax, rMax, 7);
  3628     /*
  3629      * Work out the offset into the constants table to start at. Each
  3630      * constant is 16 bytes, and it is used against 128 bytes of input
  3631      * data - 128 / 16 = 8
  3632      */
  3633     sldi(rTmp1, rMax, 4);
  3634     srdi(rTmp2, rTmp2, 3);
  3635     subf(rTmp1, rTmp1, rTmp2);
  3637     // We reduce our final 128 bytes in a separate step
  3638     addi(rMax, rMax, -1);
  3639     mtctr(rMax);
  3641     // Find the start of our constants
  3642     add(constantsPos, constants, rTmp1);
  3644     // zero VR0-VR7 which will contain our checksums
  3645     vxor(VR0, VR0, VR0);
  3646     vxor(VR1, VR1, VR1);
  3647     vxor(VR2, VR2, VR2);
  3648     vxor(VR3, VR3, VR3);
  3649     vxor(VR4, VR4, VR4);
  3650     vxor(VR5, VR5, VR5);
  3651     vxor(VR6, VR6, VR6);
  3652     vxor(VR7, VR7, VR7);
  3654     lvx(const1, constantsPos);
  3656     /*
  3657      * If we are looping back to consume more data we use the values
  3658      * already in VR16-VR23.
  3659      */
  3660     cmpdi(CCR0, rLoaded, 1);
  3661     beq(CCR0, L_3);
  3664       // First warm up pass
  3665       lvx(VR16, buf);
  3666       lvx(VR17, off16, buf);
  3667       lvx(VR18, off32, buf);
  3668       lvx(VR19, off48, buf);
  3669       lvx(VR20, off64, buf);
  3670       lvx(VR21, off80, buf);
  3671       lvx(VR22, off96, buf);
  3672       lvx(VR23, off112, buf);
  3673       addi(buf, buf, 8*16);
  3675       // xor in initial value
  3676       vxor(VR16, VR16, VR8);
  3679     BIND(L_3);
  3680     bdz(L_first_warm_up_done);
  3682     addi(constantsPos, constantsPos, 16);
  3683     lvx(const2, constantsPos);
  3685     // Second warm up pass
  3686     vpmsumd(VR8, VR16, const1);
  3687     lvx(VR16, buf);
  3689     vpmsumd(VR9, VR17, const1);
  3690     lvx(VR17, off16, buf);
  3692     vpmsumd(VR10, VR18, const1);
  3693     lvx(VR18, off32, buf);
  3695     vpmsumd(VR11, VR19, const1);
  3696     lvx(VR19, off48, buf);
  3698     vpmsumd(VR12, VR20, const1);
  3699     lvx(VR20, off64, buf);
  3701     vpmsumd(VR13, VR21, const1);
  3702     lvx(VR21, off80, buf);
  3704     vpmsumd(VR14, VR22, const1);
  3705     lvx(VR22, off96, buf);
  3707     vpmsumd(VR15, VR23, const1);
  3708     lvx(VR23, off112, buf);
  3710     addi(buf, buf, 8 * 16);
  3712     bdz(L_first_cool_down);
  3714     /*
  3715      * main loop. We modulo schedule it such that it takes three iterations
  3716      * to complete - first iteration load, second iteration vpmsum, third
  3717      * iteration xor.
  3718      */
  3720       BIND(L_4);
  3721       lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
  3723       vxor(VR0, VR0, VR8);
  3724       vpmsumd(VR8, VR16, const2);
  3725       lvx(VR16, buf);
  3727       vxor(VR1, VR1, VR9);
  3728       vpmsumd(VR9, VR17, const2);
  3729       lvx(VR17, off16, buf);
  3731       vxor(VR2, VR2, VR10);
  3732       vpmsumd(VR10, VR18, const2);
  3733       lvx(VR18, off32, buf);
  3735       vxor(VR3, VR3, VR11);
  3736       vpmsumd(VR11, VR19, const2);
  3737       lvx(VR19, off48, buf);
  3738       lvx(const2, constantsPos);
  3740       vxor(VR4, VR4, VR12);
  3741       vpmsumd(VR12, VR20, const1);
  3742       lvx(VR20, off64, buf);
  3744       vxor(VR5, VR5, VR13);
  3745       vpmsumd(VR13, VR21, const1);
  3746       lvx(VR21, off80, buf);
  3748       vxor(VR6, VR6, VR14);
  3749       vpmsumd(VR14, VR22, const1);
  3750       lvx(VR22, off96, buf);
  3752       vxor(VR7, VR7, VR15);
  3753       vpmsumd(VR15, VR23, const1);
  3754       lvx(VR23, off112, buf);
  3756       addi(buf, buf, 8 * 16);
  3758       bdnz(L_4);
  3761     BIND(L_first_cool_down);
  3763     // First cool down pass
  3764     lvx(const1, constantsPos);
  3765     addi(constantsPos, constantsPos, 16);
  3767     vxor(VR0, VR0, VR8);
  3768     vpmsumd(VR8, VR16, const1);
  3770     vxor(VR1, VR1, VR9);
  3771     vpmsumd(VR9, VR17, const1);
  3773     vxor(VR2, VR2, VR10);
  3774     vpmsumd(VR10, VR18, const1);
  3776     vxor(VR3, VR3, VR11);
  3777     vpmsumd(VR11, VR19, const1);
  3779     vxor(VR4, VR4, VR12);
  3780     vpmsumd(VR12, VR20, const1);
  3782     vxor(VR5, VR5, VR13);
  3783     vpmsumd(VR13, VR21, const1);
  3785     vxor(VR6, VR6, VR14);
  3786     vpmsumd(VR14, VR22, const1);
  3788     vxor(VR7, VR7, VR15);
  3789     vpmsumd(VR15, VR23, const1);
  3791     BIND(L_second_cool_down);
  3792     // Second cool down pass
  3793     vxor(VR0, VR0, VR8);
  3794     vxor(VR1, VR1, VR9);
  3795     vxor(VR2, VR2, VR10);
  3796     vxor(VR3, VR3, VR11);
  3797     vxor(VR4, VR4, VR12);
  3798     vxor(VR5, VR5, VR13);
  3799     vxor(VR6, VR6, VR14);
  3800     vxor(VR7, VR7, VR15);
  3802     /*
  3803      * vpmsumd produces a 96 bit result in the least significant bits
  3804      * of the register. Since we are bit reflected we have to shift it
  3805      * left 32 bits so it occupies the least significant bits in the
  3806      * bit reflected domain.
  3807      */
  3808     vsldoi(VR0, VR0, zeroes, 4);
  3809     vsldoi(VR1, VR1, zeroes, 4);
  3810     vsldoi(VR2, VR2, zeroes, 4);
  3811     vsldoi(VR3, VR3, zeroes, 4);
  3812     vsldoi(VR4, VR4, zeroes, 4);
  3813     vsldoi(VR5, VR5, zeroes, 4);
  3814     vsldoi(VR6, VR6, zeroes, 4);
  3815     vsldoi(VR7, VR7, zeroes, 4);
  3817     // xor with last 1024 bits
  3818     lvx(VR8, buf);
  3819     lvx(VR9, off16, buf);
  3820     lvx(VR10, off32, buf);
  3821     lvx(VR11, off48, buf);
  3822     lvx(VR12, off64, buf);
  3823     lvx(VR13, off80, buf);
  3824     lvx(VR14, off96, buf);
  3825     lvx(VR15, off112, buf);
  3826     addi(buf, buf, 8 * 16);
  3828     vxor(VR16, VR0, VR8);
  3829     vxor(VR17, VR1, VR9);
  3830     vxor(VR18, VR2, VR10);
  3831     vxor(VR19, VR3, VR11);
  3832     vxor(VR20, VR4, VR12);
  3833     vxor(VR21, VR5, VR13);
  3834     vxor(VR22, VR6, VR14);
  3835     vxor(VR23, VR7, VR15);
  3837     li(rLoaded, 1);
  3838     cmpdi(CCR0, rIdx, 0);
  3839     addi(rIdx, rIdx, 128);
  3840     bne(CCR0, L_1);
  3843   // Work out how many bytes we have left
  3844   andi_(len, len, 127);
  3846   // Calculate where in the constant table we need to start
  3847   subfic(rTmp1, len, 128);
  3848   add(constantsPos, constantsPos, rTmp1);
  3850   // How many 16 byte chunks are in the tail
  3851   srdi(rIdx, len, 4);
  3852   mtctr(rIdx);
  3854   /*
  3855    * Reduce the previously calculated 1024 bits to 64 bits, shifting
  3856    * 32 bits to include the trailing 32 bits of zeros
  3857    */
  3858   lvx(VR0, constantsPos);
  3859   lvx(VR1, off16, constantsPos);
  3860   lvx(VR2, off32, constantsPos);
  3861   lvx(VR3, off48, constantsPos);
  3862   lvx(VR4, off64, constantsPos);
  3863   lvx(VR5, off80, constantsPos);
  3864   lvx(VR6, off96, constantsPos);
  3865   lvx(VR7, off112, constantsPos);
  3866   addi(constantsPos, constantsPos, 8 * 16);
  3868   vpmsumw(VR0, VR16, VR0);
  3869   vpmsumw(VR1, VR17, VR1);
  3870   vpmsumw(VR2, VR18, VR2);
  3871   vpmsumw(VR3, VR19, VR3);
  3872   vpmsumw(VR4, VR20, VR4);
  3873   vpmsumw(VR5, VR21, VR5);
  3874   vpmsumw(VR6, VR22, VR6);
  3875   vpmsumw(VR7, VR23, VR7);
  3877   // Now reduce the tail (0 - 112 bytes)
  3878   cmpdi(CCR0, rIdx, 0);
  3879   beq(CCR0, L_XOR);
  3881   lvx(VR16, buf); addi(buf, buf, 16);
  3882   lvx(VR17, constantsPos);
  3883   vpmsumw(VR16, VR16, VR17);
  3884   vxor(VR0, VR0, VR16);
  3885   beq(CCR0, L_XOR);
  3887   lvx(VR16, buf); addi(buf, buf, 16);
  3888   lvx(VR17, off16, constantsPos);
  3889   vpmsumw(VR16, VR16, VR17);
  3890   vxor(VR0, VR0, VR16);
  3891   beq(CCR0, L_XOR);
  3893   lvx(VR16, buf); addi(buf, buf, 16);
  3894   lvx(VR17, off32, constantsPos);
  3895   vpmsumw(VR16, VR16, VR17);
  3896   vxor(VR0, VR0, VR16);
  3897   beq(CCR0, L_XOR);
  3899   lvx(VR16, buf); addi(buf, buf, 16);
  3900   lvx(VR17, off48,constantsPos);
  3901   vpmsumw(VR16, VR16, VR17);
  3902   vxor(VR0, VR0, VR16);
  3903   beq(CCR0, L_XOR);
  3905   lvx(VR16, buf); addi(buf, buf, 16);
  3906   lvx(VR17, off64, constantsPos);
  3907   vpmsumw(VR16, VR16, VR17);
  3908   vxor(VR0, VR0, VR16);
  3909   beq(CCR0, L_XOR);
  3911   lvx(VR16, buf); addi(buf, buf, 16);
  3912   lvx(VR17, off80, constantsPos);
  3913   vpmsumw(VR16, VR16, VR17);
  3914   vxor(VR0, VR0, VR16);
  3915   beq(CCR0, L_XOR);
  3917   lvx(VR16, buf); addi(buf, buf, 16);
  3918   lvx(VR17, off96, constantsPos);
  3919   vpmsumw(VR16, VR16, VR17);
  3920   vxor(VR0, VR0, VR16);
  3922   // Now xor all the parallel chunks together
  3923   BIND(L_XOR);
  3924   vxor(VR0, VR0, VR1);
  3925   vxor(VR2, VR2, VR3);
  3926   vxor(VR4, VR4, VR5);
  3927   vxor(VR6, VR6, VR7);
  3929   vxor(VR0, VR0, VR2);
  3930   vxor(VR4, VR4, VR6);
  3932   vxor(VR0, VR0, VR4);
  3934   b(L_barrett_reduction);
  3936   BIND(L_first_warm_up_done);
  3937   lvx(const1, constantsPos);
  3938   addi(constantsPos, constantsPos, 16);
  3939   vpmsumd(VR8,  VR16, const1);
  3940   vpmsumd(VR9,  VR17, const1);
  3941   vpmsumd(VR10, VR18, const1);
  3942   vpmsumd(VR11, VR19, const1);
  3943   vpmsumd(VR12, VR20, const1);
  3944   vpmsumd(VR13, VR21, const1);
  3945   vpmsumd(VR14, VR22, const1);
  3946   vpmsumd(VR15, VR23, const1);
  3947   b(L_second_cool_down);
  3949   BIND(L_barrett_reduction);
  3951   lvx(const1, barretConstants);
  3952   addi(barretConstants, barretConstants, 16);
  3953   lvx(const2, barretConstants);
  3955   vsldoi(VR1, VR0, VR0, 8);
  3956   vxor(VR0, VR0, VR1);    // xor two 64 bit results together
  3958   // shift left one bit
  3959   vspltisb(VR1, 1);
  3960   vsl(VR0, VR0, VR1);
  3962   vand(VR0, VR0, mask_64bit);
  3964   /*
  3965    * The reflected version of Barrett reduction. Instead of bit
  3966    * reflecting our data (which is expensive to do), we bit reflect our
  3967    * constants and our algorithm, which means the intermediate data in
  3968    * our vector registers goes from 0-63 instead of 63-0. We can reflect
  3969    * the algorithm because we don't carry in mod 2 arithmetic.
  3970    */
  3971   vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
  3972   vpmsumd(VR1, VR1, const1);   // ma
  3973   vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
  3974   vpmsumd(VR1, VR1, const2);   // qn */
  3975   vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)
  3977   /*
  3978    * Since we are bit reflected, the result (ie the low 32 bits) is in
  3979    * the high 32 bits. We just need to shift it left 4 bytes
  3980    * V0 [ 0 1 X 3 ]
  3981    * V0 [ 0 X 2 3 ]
  3982    */
  3983   vsldoi(VR0, VR0, zeroes, 4);    // shift result into top 64 bits of the register
  3985   // Get the result into the crc register
  3986   mfvrd(crc, VR0);
  3988   BIND(L_end);
  3990   offsetInt = 0;
  3991   // Restore non-volatile Vector registers (frameless).
  3992   offsetInt -= 16; li(offset, -16);           lvx(VR20, offset, R1_SP);
  3993   offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
  3994   offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
  3995   offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
  3996   offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
  3997   offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
  3998   offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
  3999   offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
  4000   offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
  4001   offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
  4002   offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
  4003   offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
  4004   offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
  4005   offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
  4006   offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
  4007   offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
  4008   offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
  4009   offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
  4010   offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
  4011 }
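       // Single-byte CRC-32 update. Apart from the pre/post inversion of the crc
       // register this is the classic table-driven byte step, in effect
       //   crc = (crc >> 8) ^ table[(crc ^ byte) & 0xff]
       // (scalar sketch only; the table lookup itself is done by update_byte_crc32).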
  4013 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
  4014   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
  4016   BLOCK_COMMENT("kernel_crc32_singleByte:");
  4017   nand(crc, crc, crc);       // ~c
  4019   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
  4020   update_byte_crc32(crc, tmp, table);
  4022   nand(crc, crc, crc);       // ~c
  4023 }
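       // Assertion support. The asm_assert variants expect CCR0 to have been set
       // by a preceding compare; in ASSERT builds they stop with the given message
       // and id when the condition fails, in product builds they emit no code.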
  4026 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
  4027 #ifdef ASSERT
  4028   Label ok;
  4029   if (check_equal) {
  4030     beq(CCR0, ok);
  4031   } else {
  4032     bne(CCR0, ok);
  4033   }
  4034   stop(msg, id);
  4035   bind(ok);
  4036 #endif
  4037 }
  4039 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
  4040                                           Register mem_base, const char* msg, int id) {
  4041 #ifdef ASSERT
  4042   switch (size) {
  4043     case 4:
  4044       lwz(R0, mem_offset, mem_base);
  4045       cmpwi(CCR0, R0, 0);
  4046       break;
  4047     case 8:
  4048       ld(R0, mem_offset, mem_base);
  4049       cmpdi(CCR0, R0, 0);
  4050       break;
  4051     default:
  4052       ShouldNotReachHere();
  4053   }
  4054   asm_assert(check_equal, msg, id);
  4055 #endif // ASSERT
  4056 }
  4058 void MacroAssembler::verify_thread() {
  4059   if (VerifyThread) {
  4060     unimplemented("'VerifyThread' currently not implemented on PPC");
  4061   }
  4062 }
  4064 // READ: oop. KILL: R0. May also kill volatile floating-point registers.
  4065 void MacroAssembler::verify_oop(Register oop, const char* msg) {
  4066   if (!VerifyOops) {
  4067     return;
  4068   }
  4070   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  4071   const Register tmp = R11; // Will be preserved.
  4072   const int nbytes_save = 11*8; // Volatile gprs except R0.
  4073   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
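         // If the oop register happens to be tmp (R11), copy it into the argument
         // register now, before tmp is clobbered by save_LR_CR and the frame push.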
  4075   if (oop == tmp) mr(R4_ARG2, oop);
  4076   save_LR_CR(tmp); // save in old frame
  4077   push_frame_reg_args(nbytes_save, tmp);
  4078   // load FunctionDescriptor** / entry_address *
  4079   load_const_optimized(tmp, fd, R0);
  4080   // load FunctionDescriptor* / entry_address
  4081   ld(tmp, 0, tmp);
  4082   if (oop != tmp) mr_if_needed(R4_ARG2, oop);
  4083   load_const_optimized(R3_ARG1, (address)msg, R0);
  4084   // Call destination for its side effect.
  4085   call_c(tmp);
  4087   pop_frame();
  4088   restore_LR_CR(tmp);
  4089   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
  4090 }
  4092 const char* stop_types[] = {
  4093   "stop",
  4094   "untested",
  4095   "unimplemented",
  4096   "shouldnotreachhere"
  4097 };
  4099 static void stop_on_request(int tp, const char* msg) {
  4100   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  4101   guarantee(false, err_msg("PPC assembly code requires stop: %s", msg));
  4102 }
  4104 // Call a C-function that prints output.
  4105 void MacroAssembler::stop(int type, const char* msg, int id) {
  4106 #ifndef PRODUCT
  4107   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
  4108 #else
  4109   block_comment("stop {");
  4110 #endif
  4112   // setup arguments
  4113   load_const_optimized(R3_ARG1, type);
  4114   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  4115   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  4116   illtrap();
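         // Keep the id in the code stream right behind the trap so the stop site
         // can be identified when the trap is hit.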
  4117   emit_int32(id);
  4118   block_comment("} stop;");
  4119 }
  4121 #ifndef PRODUCT
  4122 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
  4123 // Val, addr are temp registers.
  4124 // If low == addr, addr is killed.
  4125 // High is preserved.
  4126 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  4127   if (!ZapMemory) return;
  4129   assert_different_registers(low, val);
  4131   BLOCK_COMMENT("zap memory region {");
  4132   load_const_optimized(val, 0x0101010101010101);
  4133   int size = before + after;
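         // Small regions anchored at a single base register are zapped with unrolled
         // stores; everything else uses a compare-and-branch loop over doublewords.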
  4134   if (low == high && size < 5 && size > 0) {
  4135     int offset = -before*BytesPerWord;
  4136     for (int i = 0; i < size; ++i) {
  4137       std(val, offset, low);
  4138       offset += (1*BytesPerWord);
  4139     }
  4140   } else {
  4141     addi(addr, low, -before*BytesPerWord);
  4142     assert_different_registers(high, val);
  4143     if (after) addi(high, high, after * BytesPerWord);
  4144     Label loop;
  4145     bind(loop);
  4146     std(val, 0, addr);
  4147     addi(addr, addr, 8);
  4148     cmpd(CCR6, addr, high);
  4149     ble(CCR6, loop);
  4150     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
  4151   }
  4152   BLOCK_COMMENT("} zap memory region");
  4153 }
  4155 #endif // !PRODUCT
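       // SkipIfEqualZero brackets a code region that is only executed when the given
       // bool flag is set: the constructor loads the flag and branches past the region
       // when it is zero, and the destructor binds the branch target.
       // Hypothetical usage sketch (Rtmp and SomeFlag are placeholder names):
       //   {
       //     SkipIfEqualZero skip_if_unset(this, Rtmp, &SomeFlag);
       //     // ... code emitted here runs only when SomeFlag is true ...
       //   }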
  4157 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  4158   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  4159   assert(sizeof(bool) == 1, "PowerPC ABI");
  4160   masm->lbz(temp, simm16_offset, temp);
  4161   masm->cmpwi(CCR0, temp, 0);
  4162   masm->beq(CCR0, _label);
  4163 }
  4165 SkipIfEqualZero::~SkipIfEqualZero() {
  4166   _masm->bind(_label);
  4167 }
