Sun, 09 Oct 2016 15:36:29 +0800
#4536 Added 128-bit memory access (gslq/gssq) to generate_disjoint_short_copy.
Reviewed-by: aoqi
Contributed-by: lifangyuan, aoqi
The performance improvement in SPECjvm2008 is not significant.
1.1 --- a/src/cpu/mips/vm/assembler_mips.cpp Sat Oct 08 09:31:28 2016 -0400 1.2 +++ b/src/cpu/mips/vm/assembler_mips.cpp Sun Oct 09 15:36:29 2016 +0800 1.3 @@ -233,10 +233,25 @@ 1.4 "gslbx", "gslhx", "gslwx", "gsldx", "", "", "gslwxc1", "gsldxc1" 1.5 }; 1.6 1.7 + 1.8 +const char* Assembler::gs_lwc2_name[] = { 1.9 + "", "", "", "", "", "", "", "", "", "", 1.10 + "", "", "", "", "", "", "", "", "", "", 1.11 + "", "", "", "", "", /*"", "", "",*/ "", "", //LWDIR, LWPTE, LDDIR and LDPTE have the same low 6 bits. 1.12 + "", "", "", "", "", "gslq", "" 1.13 +}; 1.14 + 1.15 const char* Assembler::gs_sdc2_name[] = { 1.16 "gssbx", "gsshx", "gsswx", "gssdx", "", "", "gsswxc1", "gssdxc1" 1.17 }; 1.18 1.19 +const char* Assembler::gs_swc2_name[] = { 1.20 + "", "", "", "", "", "", "", "", "", "", 1.21 + "", "", "", "", "", "", "", "", "", "", 1.22 + "", "", "", "", "", "", "", "", "", "", 1.23 + "", "", "gssq", "" 1.24 +}; 1.25 + 1.26 //misleading name, print only branch/jump instruction 1.27 void Assembler::print_instruction(int inst) { 1.28 const char *s;
2.1 --- a/src/cpu/mips/vm/assembler_mips.hpp Sat Oct 08 09:31:28 2016 -0400 2.2 +++ b/src/cpu/mips/vm/assembler_mips.hpp Sun Oct 09 15:36:29 2016 +0800 2.3 @@ -413,11 +413,13 @@ 2.4 cache_op = 0x2f, 2.5 ll_op = 0x30, 2.6 lwc1_op = 0x31, 2.7 + lwc2_op = 0x32, 2.8 lld_op = 0x34, 2.9 ldc1_op = 0x35, 2.10 ld_op = 0x37, 2.11 sc_op = 0x38, 2.12 swc1_op = 0x39, 2.13 + swc2_op = 0x3a, 2.14 scd_op = 0x3c, 2.15 sdc1_op = 0x3d, 2.16 sd_op = 0x3f 2.17 @@ -608,11 +610,18 @@ 2.18 2.19 /* Godson3 extension */ 2.20 enum godson3_ops { 2.21 - gs_ldc2_op = 0x36, 2.22 - gs_sdc2_op = 0x3e, 2.23 + gs_lwc2_op = 0x32, 2.24 + gs_ldc2_op = 0x36, 2.25 + gs_swc2_op = 0x3a, 2.26 + gs_sdc2_op = 0x3e 2.27 }; 2.28 2.29 - 2.30 + enum gs_lwc2_ops { 2.31 + gslq_op = 0x20 2.32 + }; 2.33 + 2.34 + static const char* gs_lwc2_name[]; 2.35 + 2.36 enum gs_ldc2_ops { 2.37 gslbx_op = 0x0, 2.38 gslhx_op = 0x1, 2.39 @@ -622,7 +631,13 @@ 2.40 gsldxc1_op = 0x7 2.41 }; 2.42 2.43 - static const char* gs_ldc2_name[]; 2.44 + static const char* gs_ldc2_name[]; 2.45 + 2.46 + enum gs_swc2_ops { 2.47 + gssq_op = 0x20 2.48 + }; 2.49 + 2.50 + static const char* gs_swc2_name[]; 2.51 2.52 enum gs_sdc2_ops { 2.53 gssbx_op = 0x0, 2.54 @@ -633,7 +648,7 @@ 2.55 gssdxc1_op = 0x7 2.56 }; 2.57 2.58 - static const char* gs_sdc2_name[]; 2.59 + static const char* gs_sdc2_name[]; 2.60 2.61 static int opcode(int insn) { return (insn>>26)&0x3f; } 2.62 static int rs(int insn) { return (insn>>21)&0x1f; } 2.63 @@ -1203,6 +1218,20 @@ 2.64 int branch_destination(int inst, int pos); 2.65 2.66 /* Godson3 extension */ 2.67 + 2.68 +// gssq/gslq/gssqc1/gslqc1: vAddr = sign_extend(offset << 4 ) + GPR[base]. Therefore, the off should be ">> 4". 
2.69 + void gslq(Register rq, Register rt, Register base, int off) { 2.70 + off = off >> 4; 2.71 + assert(is_simm(off, 9),"gslq: off exceeds 9 bits"); 2.72 + emit_long((gs_lwc2_op << 26) | ((int)base->encoding() << 21) | ((int)rt->encoding() << 16) | 0 << 15 | (low(off, 9) << 6) | gslq_op | (int)rq->encoding() ); 2.73 + } 2.74 + 2.75 + void gssq(Register rq, Register rt, Register base, int off) { 2.76 + off = off >> 4; 2.77 + assert(is_simm(off, 9),"gssq: off exceeds 9 bits"); 2.78 + emit_long((gs_swc2_op << 26) | ((int)base->encoding() << 21) | ((int)rt->encoding() << 16) | 0 << 15 | (low(off, 9) << 6) | gssq_op | (int)rq->encoding() ); 2.79 + } 2.80 + 2.81 void gsldxc1(FloatRegister rt, Register base, Register index, int off) { 2.82 assert(is_simm(off, 8), "gsldxc1: off exceeds 8 bits"); 2.83 emit_long((gs_ldc2_op << 26) | ((int)base->encoding() << 21) | ((int)rt->encoding() << 16) | ((int)index->encoding() << 11) | (off << 3) | gsldxc1_op);
3.1 --- a/src/cpu/mips/vm/disassembler_mips.cpp Sat Oct 08 09:31:28 2016 -0400 3.2 +++ b/src/cpu/mips/vm/disassembler_mips.cpp Sun Oct 09 15:36:29 2016 +0800 3.3 @@ -165,6 +165,17 @@ 3.4 as_Register(Assembler::rd(insn))->name(), \ 3.5 ((short)Assembler::low(insn, 11) >> 3) ) 3.6 3.7 +/* 3.8 + * "<< 17 >> 23 << 4" is short for "<< 17 >> 17 >> 6 << 4". vAddr = sign_extend(offset << 4 ) + GPR[base] 3.9 + * "<< 17 >> 17": sign-extending 3.10 + * ">> 6": offset pos 3.11 + * "<< 4": offset << 4 3.12 + */ 3.13 +#define PRINT_ORRRI_GSLQ(OP) \ 3.14 + env->print("%s %s, %s, %d(%s)", OP, as_Register((insn)&0x1f)->name(), \ 3.15 + as_Register(Assembler::rt(insn))->name(), (Assembler::low(insn, 15) << 17 >> 23 << 4), \ 3.16 + as_Register(Assembler::rs(insn))->name()) 3.17 + 3.18 #define PRINT_ORRR_2(OP) \ 3.19 env->print("%s %s, %s, %s", OP, as_Register(Assembler::rd(insn))->name(), \ 3.20 as_Register(Assembler::rt(insn))->name(), \ 3.21 @@ -538,11 +549,43 @@ 3.22 PRINT_OFOB(Assembler::ops_name[opcode]); 3.23 break; 3.24 3.25 + case Assembler::gs_lwc2_op: 3.26 + if ((Assembler::special(insn) & 0x20) != 0 ) { 3.27 + //gslq rq, rt, offset(base) 3.28 + if ( (insn & (1 << 15)) == 0) { 3.29 + //gsLQ 3.30 + special = Assembler::gslq_op; 3.31 + PRINT_ORRRI_GSLQ(Assembler::gs_lwc2_name[special]); 3.32 + } else { 3.33 + //gsLQC1 3.34 + env->print("0x%x\n", insn); 3.35 + } 3.36 + } else { 3.37 + env->print("0x%x\n", insn); 3.38 + } 3.39 + break; 3.40 + 3.41 case Assembler::gs_ldc2_op: 3.42 special = Assembler::special(insn) & 0x7; 3.43 PRINT_ORRRI_GSLDC2(Assembler::gs_ldc2_name[special]); 3.44 break; 3.45 3.46 + case Assembler::gs_swc2_op: 3.47 + if ((Assembler::special(insn) & 0x20) != 0 ) { 3.48 + //gssq rq, rt, offset(base) 3.49 + if ( (insn & (1 << 15)) == 0) { 3.50 + //gsSQ 3.51 + special = Assembler::gssq_op; 3.52 + PRINT_ORRRI_GSLQ(Assembler::gs_swc2_name[special]); 3.53 + } else { 3.54 + //gsSQC1 3.55 + env->print("0x%x\n", insn); 3.56 + } 3.57 + } else { 3.58 + 
env->print("0x%x\n", insn); 3.59 + } 3.60 + break; 3.61 + 3.62 case Assembler::gs_sdc2_op: 3.63 special = Assembler::special(insn) & 0x7; 3.64 PRINT_ORRRI_GSLDC2(Assembler::gs_sdc2_name[special]);
4.1 --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp Sat Oct 08 09:31:28 2016 -0400 4.2 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp Sun Oct 09 15:36:29 2016 +0800 4.3 @@ -999,17 +999,19 @@ 4.4 Register tmp1 = T0; 4.5 Register tmp2 = T1; 4.6 Register tmp3 = T3; 4.7 + Register tmp4 = T8; 4.8 4.9 address start = __ pc(); 4.10 4.11 __ push(tmp1); 4.12 __ push(tmp2); 4.13 __ push(tmp3); 4.14 + __ push(tmp4); 4.15 __ move(tmp1, A0); 4.16 __ move(tmp2, A1); 4.17 __ move(tmp3, A2); 4.18 4.19 - Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8; 4.20 + Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11; 4.21 Label l_debug; 4.22 // don't try anything fancy if arrays don't have many elements 4.23 __ daddi(AT, tmp3, -9); 4.24 @@ -1062,20 +1064,63 @@ 4.25 __ daddi(tmp1, tmp1, 4); 4.26 __ daddi(tmp2, tmp2, 4); 4.27 } 4.28 + }//end of if 4.29 + 4.30 + __ bind(l_7); 4.31 + 4.32 + // At this time the position of both, from and to, are at least 8 byte aligned. 4.33 + 4.34 + // Copy 8 elemnets at a time. 4.35 + // Align to 16 bytes, but only if both from and to have same alignment mod 8. 
4.36 + __ xorr(AT, tmp1, tmp2); 4.37 + __ andi(AT, AT, 15); 4.38 + __ bne(AT, R0, l_9); 4.39 + __ delayed()->nop(); 4.40 + 4.41 + // Copy 4-element word if necessary to align to 16 bytes, 4.42 + __ andi(AT, tmp1, 15); 4.43 + __ beq(AT, R0, l_10); 4.44 + __ delayed()->nop(); 4.45 + 4.46 + __ ld(AT, tmp1, 0); 4.47 + __ daddi(tmp3, tmp3, -4); 4.48 + __ sd(AT, tmp2, 0); 4.49 + { // FasterArrayCopy 4.50 + __ daddi(tmp1, tmp1, 8); 4.51 + __ daddi(tmp2, tmp2, 8); 4.52 + } 4.53 + 4.54 + __ bind(l_10); 4.55 + 4.56 + // Copy 8 elements at a time; either the loads or the stores can 4.57 + // be unalligned if aligned == false 4.58 + 4.59 + { // FasterArrayCopy 4.60 + __ daddi(AT, tmp3, -15); 4.61 + __ blez(AT, l_9); 4.62 + __ delayed()->nop(); 4.63 + 4.64 + __ bind(l_11); 4.65 + // For loongson the 128-bit memory access instruction is gslq/gssq 4.66 + __ gslq(AT, tmp4, tmp1, 0); 4.67 + __ daddi(tmp1, tmp1, 16); 4.68 + __ daddi(tmp3, tmp3, -8); 4.69 + __ daddi(tmp2, tmp2, 16); 4.70 + __ gssq(AT, tmp4, tmp2, -16); 4.71 + __ daddi(AT, tmp3, -8); 4.72 + __ bgez(AT, l_11); 4.73 + __ delayed()->nop(); 4.74 } 4.75 - 4.76 - __ bind(l_7); 4.77 - 4.78 + __ bind(l_9); 4.79 // Copy 4 elements at a time; either the loads or the stores can 4.80 // be unaligned if aligned == false. 4.81 4.82 { // FasterArrayCopy 4.83 - __ daddi(AT, tmp3, -15); 4.84 + __ daddi(AT, tmp3, -3); 4.85 __ blez(AT, l_6); // copy 2 at a time if less than 16 elements remain 4.86 __ delayed()->nop(); 4.87 4.88 __ bind(l_8); 4.89 - // For Loongson, there is 128-bit memory access. TODO 4.90 __ ld(AT, tmp1, 0); 4.91 __ sd(AT, tmp2, 0); 4.92 __ daddi(tmp1, tmp1, 8); 4.93 @@ -1123,6 +1168,7 @@ 4.94 __ delayed()->nop(); 4.95 } 4.96 __ bind(l_4); 4.97 + __ pop(tmp4); 4.98 __ pop(tmp3); 4.99 __ pop(tmp2); 4.100 __ pop(tmp1);