#4536 Added 128-bit memory access to generate_disjoint_short_copy.

Sun, 09 Oct 2016 15:36:29 +0800

author
lifangyuan
date
Sun, 09 Oct 2016 15:36:29 +0800
changeset 125
14abbb45a7dd
parent 124
9d7e35a93fad
child 126
23a51da62a65

#4536 Added 128-bit memory access to generate_disjoint_short_copy.
Reviewed-by: aoqi
Contributed-by: lifangyuan, aoqi

Performance improvement in SPECjvm2008 is not obvious.

src/cpu/mips/vm/assembler_mips.cpp file | annotate | diff | comparison | revisions
src/cpu/mips/vm/assembler_mips.hpp file | annotate | diff | comparison | revisions
src/cpu/mips/vm/disassembler_mips.cpp file | annotate | diff | comparison | revisions
src/cpu/mips/vm/stubGenerator_mips_64.cpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/mips/vm/assembler_mips.cpp	Sat Oct 08 09:31:28 2016 -0400
     1.2 +++ b/src/cpu/mips/vm/assembler_mips.cpp	Sun Oct 09 15:36:29 2016 +0800
     1.3 @@ -233,10 +233,25 @@
     1.4  	"gslbx",    "gslhx",    "gslwx",    "gsldx",    "",         "",         "gslwxc1",  "gsldxc1"
     1.5  };
     1.6  
     1.7 +
     1.8 +const char* Assembler::gs_lwc2_name[] = {
     1.9 +        "",   "",   "",   "",   "",  "",   "",   "",   "",   "",
    1.10 +        "",   "",   "",   "",   "",  "",   "",   "",   "",   "",
    1.11 +        "",   "",   "",   "",   "",  /*"",   "",   "",*/   "",   "",  //LWDIR, LWPTE, LDDIR and LDPTE have the same low 6 bits.
    1.12 +        "",   "",   "",   "",   "",  "gslq",     ""
    1.13 +};
    1.14 +
    1.15  const char* Assembler::gs_sdc2_name[] = {
    1.16  	"gssbx",    "gsshx",    "gsswx",    "gssdx",    "",         "",         "gsswxc1",  "gssdxc1"
    1.17  };
    1.18  
    1.19 +const char* Assembler::gs_swc2_name[] = {
    1.20 +        "",    "",    "",    "",    "",    "",    "",    "",    "",    "",
    1.21 +        "",    "",    "",    "",    "",    "",    "",    "",    "",    "",
    1.22 +        "",    "",    "",    "",    "",    "",    "",    "",    "",    "",
    1.23 +        "",    "",    "gssq",       "" 
    1.24 +};
    1.25 +
    1.26  //misleading name, print only branch/jump instruction 
    1.27  void Assembler::print_instruction(int inst) {
    1.28  	const char *s;
     2.1 --- a/src/cpu/mips/vm/assembler_mips.hpp	Sat Oct 08 09:31:28 2016 -0400
     2.2 +++ b/src/cpu/mips/vm/assembler_mips.hpp	Sun Oct 09 15:36:29 2016 +0800
     2.3 @@ -413,11 +413,13 @@
     2.4  	  cache_op    = 0x2f,
     2.5  	  ll_op       = 0x30,
     2.6  	  lwc1_op     = 0x31,
     2.7 +	  lwc2_op     = 0x32,
     2.8  	  lld_op      = 0x34,
     2.9  	  ldc1_op     = 0x35,
    2.10  	  ld_op       = 0x37,
    2.11  	  sc_op       = 0x38,
    2.12  	  swc1_op     = 0x39,
    2.13 +	  swc2_op     = 0x3a,
    2.14  	  scd_op      = 0x3c,
    2.15  	  sdc1_op     = 0x3d,
    2.16  	  sd_op       = 0x3f
    2.17 @@ -608,11 +610,18 @@
    2.18  
    2.19          /* Godson3 extension */
    2.20          enum godson3_ops {
    2.21 -                gs_ldc2_op      = 0x36, 
    2.22 -                gs_sdc2_op      = 0x3e, 
    2.23 +                gs_lwc2_op      = 0x32,
    2.24 +                gs_ldc2_op      = 0x36,
    2.25 +                gs_swc2_op      = 0x3a,
    2.26 +                gs_sdc2_op      = 0x3e
    2.27          };
    2.28   
    2.29 - 
    2.30 +        enum gs_lwc2_ops {
    2.31 +                gslq_op         = 0x20
    2.32 +        };
    2.33 +
    2.34 +        static const char* gs_lwc2_name[];
    2.35 +
    2.36          enum gs_ldc2_ops {
    2.37                  gslbx_op        =  0x0,
    2.38                  gslhx_op        =  0x1,
    2.39 @@ -622,7 +631,13 @@
    2.40                  gsldxc1_op      =  0x7
    2.41          };
    2.42  
    2.43 -	static const char* gs_ldc2_name[]; 
    2.44 +        static const char* gs_ldc2_name[];
    2.45 +
    2.46 +        enum gs_swc2_ops {
    2.47 +                gssq_op         = 0x20
    2.48 +        };
    2.49 +
    2.50 +        static const char* gs_swc2_name[];
    2.51  
    2.52          enum gs_sdc2_ops {
    2.53                  gssbx_op        =  0x0,
    2.54 @@ -633,7 +648,7 @@
    2.55                  gssdxc1_op      =  0x7
    2.56          };
    2.57  
    2.58 -	static const char* gs_sdc2_name[]; 
    2.59 +        static const char* gs_sdc2_name[];
    2.60  
    2.61  	static int opcode(int insn) { return (insn>>26)&0x3f; }
    2.62  	static int rs(int insn) { return (insn>>21)&0x1f; }
    2.63 @@ -1203,6 +1218,20 @@
    2.64  	int branch_destination(int inst, int pos);
    2.65  
    2.66  	/* Godson3 extension */
    2.67 +
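    2.68 +// gssq/gslq/gssqc1/gslqc1: vAddr = sign_extend(offset << 4) + GPR[base]. The instruction encodes the offset in 16-byte units, so the byte offset "off" is shifted right by 4 before it is range-checked and encoded.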
    2.69 +  void gslq(Register rq, Register rt, Register base, int off) {
    2.70 +    off = off >> 4;
    2.71 +    assert(is_simm(off, 9),"gslq: off exceeds 9 bits");
    2.72 +    emit_long((gs_lwc2_op << 26) | ((int)base->encoding() << 21) | ((int)rt->encoding() << 16) | 0 << 15 | (low(off, 9) << 6) | gslq_op | (int)rq->encoding() );
    2.73 +  }
    2.74 +
    2.75 +  void gssq(Register rq, Register rt, Register base, int off) {
    2.76 +    off = off >> 4;
    2.77 +    assert(is_simm(off, 9),"gssq: off exceeds 9 bits");
    2.78 +    emit_long((gs_swc2_op << 26) | ((int)base->encoding() << 21) | ((int)rt->encoding() << 16) | 0 << 15 | (low(off, 9) << 6) | gssq_op | (int)rq->encoding() );
    2.79 +  }
    2.80 +
    2.81  	void gsldxc1(FloatRegister rt, Register base, Register index, int off) {
    2.82  		assert(is_simm(off, 8), "gsldxc1: off exceeds 8 bits");
    2.83  		emit_long((gs_ldc2_op << 26) | ((int)base->encoding() << 21) | ((int)rt->encoding() << 16) | ((int)index->encoding() << 11) | (off << 3) | gsldxc1_op);
     3.1 --- a/src/cpu/mips/vm/disassembler_mips.cpp	Sat Oct 08 09:31:28 2016 -0400
     3.2 +++ b/src/cpu/mips/vm/disassembler_mips.cpp	Sun Oct 09 15:36:29 2016 +0800
     3.3 @@ -165,6 +165,17 @@
     3.4                          as_Register(Assembler::rd(insn))->name(), \
     3.5                          ((short)Assembler::low(insn, 11) >> 3) )
     3.6  
     3.7 +/*
     3.8 + * "<< 17 >> 23 << 4" is short for "<< 17 >> 17 >> 6 << 4". vAddr = sign_extend(offset << 4 ) + GPR[base]
     3.9 + * "<< 17 >> 17": sign-extending
    3.10 + * ">> 6": offset pos
    3.11 + * "<< 4": offset << 4
    3.12 + */
    3.13 +#define PRINT_ORRRI_GSLQ(OP) \
    3.14 +        env->print("%s %s, %s, %d(%s)", OP, as_Register((insn)&0x1f)->name(), \
    3.15 +            as_Register(Assembler::rt(insn))->name(), (Assembler::low(insn, 15) << 17 >> 23 << 4), \
    3.16 +            as_Register(Assembler::rs(insn))->name())
    3.17 +
    3.18  #define PRINT_ORRR_2(OP) \
    3.19  	env->print("%s %s, %s, %s", OP, as_Register(Assembler::rd(insn))->name(), \
    3.20  			as_Register(Assembler::rt(insn))->name(), \
    3.21 @@ -538,11 +549,43 @@
    3.22  		PRINT_OFOB(Assembler::ops_name[opcode]);
    3.23  		break;
    3.24  
    3.25 +  case Assembler::gs_lwc2_op:
    3.26 +    if ((Assembler::special(insn) & 0x20) != 0 ) {
    3.27 +      //gslq rq, rt, offset(base)
    3.28 +      if ( (insn & (1 << 15)) == 0) {
    3.29 +        //gsLQ
    3.30 +        special = Assembler::gslq_op;
    3.31 +        PRINT_ORRRI_GSLQ(Assembler::gs_lwc2_name[special]);
    3.32 +      } else {
    3.33 +        //gsLQC1
    3.34 +        env->print("0x%x\n", insn);
    3.35 +      }
    3.36 +    } else {
    3.37 +      env->print("0x%x\n", insn);
    3.38 +    }
    3.39 +    break;
    3.40 +
    3.41  	case Assembler::gs_ldc2_op:
    3.42  		special = Assembler::special(insn) & 0x7;
    3.43                  PRINT_ORRRI_GSLDC2(Assembler::gs_ldc2_name[special]);
    3.44  		break;
    3.45  
    3.46 +  case Assembler::gs_swc2_op:
    3.47 +    if ((Assembler::special(insn) & 0x20) != 0 ) {
    3.48 +      //gssq rq, rt, offset(base)
    3.49 +      if ( (insn & (1 << 15)) == 0) {
    3.50 +        //gsSQ
    3.51 +        special = Assembler::gssq_op;
    3.52 +        PRINT_ORRRI_GSLQ(Assembler::gs_swc2_name[special]);
    3.53 +      } else {
    3.54 +        //gsSQC1
    3.55 +        env->print("0x%x\n", insn);
    3.56 +      }
    3.57 +    } else {
    3.58 +      env->print("0x%x\n", insn);
    3.59 +    }
    3.60 +    break;
    3.61 +
    3.62  	case Assembler::gs_sdc2_op:
    3.63  		special = Assembler::special(insn) & 0x7;
    3.64                  PRINT_ORRRI_GSLDC2(Assembler::gs_sdc2_name[special]);
     4.1 --- a/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Sat Oct 08 09:31:28 2016 -0400
     4.2 +++ b/src/cpu/mips/vm/stubGenerator_mips_64.cpp	Sun Oct 09 15:36:29 2016 +0800
     4.3 @@ -999,17 +999,19 @@
     4.4      Register tmp1 = T0;
     4.5      Register tmp2 = T1;
     4.6      Register tmp3 = T3;
     4.7 +    Register tmp4 = T8;
     4.8  
     4.9      address start = __ pc();
    4.10  
    4.11      __ push(tmp1);
    4.12      __ push(tmp2);
    4.13      __ push(tmp3);
    4.14 +    __ push(tmp4);
    4.15      __ move(tmp1, A0);
    4.16      __ move(tmp2, A1);
    4.17      __ move(tmp3, A2);
    4.18  
    4.19 -    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
    4.20 +    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10, l_11;
    4.21      Label l_debug;
    4.22      // don't try anything fancy if arrays don't have many elements
    4.23      __ daddi(AT, tmp3, -9);
    4.24 @@ -1062,20 +1064,63 @@
    4.25          __ daddi(tmp1, tmp1, 4);
    4.26          __ daddi(tmp2, tmp2, 4);
    4.27        }
    4.28 +    }//end of if
    4.29 +
    4.30 +      __ bind(l_7);
    4.31 +
    4.32 +      // At this point both from and to are at least 8-byte aligned.
    4.33 +
    4.34 +      // Copy 8 elements at a time.
    4.35 +      // Align to 16 bytes, but only if both from and to have the same alignment mod 16.
    4.36 +      __ xorr(AT, tmp1, tmp2);
    4.37 +      __ andi(AT, AT, 15);
    4.38 +      __ bne(AT, R0, l_9);
    4.39 +      __ delayed()->nop();
    4.40 +
    4.41 +      // Copy one 8-byte doubleword (4 elements) if necessary to align to 16 bytes.
    4.42 +      __ andi(AT, tmp1, 15);
    4.43 +      __ beq(AT, R0, l_10);
    4.44 +      __ delayed()->nop();
    4.45 +
    4.46 +      __ ld(AT, tmp1, 0);
    4.47 +      __ daddi(tmp3, tmp3, -4);
    4.48 +      __ sd(AT, tmp2, 0);
    4.49 +      { // FasterArrayCopy
    4.50 +        __ daddi(tmp1, tmp1, 8);
    4.51 +        __ daddi(tmp2, tmp2, 8);
    4.52 +      }
    4.53 +
    4.54 +      __ bind(l_10);
    4.55 +
    4.56 +    // Copy 8 elements at a time; either the loads or the stores can 
    4.57 +    // be unaligned if aligned == false
    4.58 +
    4.59 +    { // FasterArrayCopy
    4.60 +      __ daddi(AT, tmp3, -15);
    4.61 +      __ blez(AT, l_9);
    4.62 +      __ delayed()->nop();
    4.63 +
    4.64 +      __ bind(l_11);
    4.65 +      // For Loongson, the 128-bit memory access instructions are gslq/gssq.
    4.66 +      __ gslq(AT, tmp4, tmp1, 0);
    4.67 +      __ daddi(tmp1, tmp1, 16);
    4.68 +      __ daddi(tmp3, tmp3, -8);
    4.69 +      __ daddi(tmp2, tmp2, 16);
    4.70 +      __ gssq(AT, tmp4, tmp2, -16);
    4.71 +      __ daddi(AT, tmp3, -8);
    4.72 +      __ bgez(AT, l_11);
    4.73 +      __ delayed()->nop();
    4.74      }
    4.75 -
    4.76 -    __ bind(l_7);
    4.77 -
    4.78 +    __ bind(l_9);
    4.79      // Copy 4 elements at a time; either the loads or the stores can
    4.80      // be unaligned if aligned == false.
    4.81  
    4.82      { // FasterArrayCopy
    4.83 -      __ daddi(AT, tmp3, -15);
    4.84 +      __ daddi(AT, tmp3, -3);
    4.85        __ blez(AT, l_6); // copy 2 at a time if fewer than 4 elements remain
    4.86        __ delayed()->nop();
    4.87  
    4.88        __ bind(l_8);
    4.89 -      // For Loongson, there is 128-bit memory access. TODO
    4.90        __ ld(AT, tmp1, 0);
    4.91        __ sd(AT, tmp2, 0);
    4.92        __ daddi(tmp1, tmp1, 8);
    4.93 @@ -1123,6 +1168,7 @@
    4.94        __ delayed()->nop();
    4.95      }
    4.96      __ bind(l_4);
    4.97 +    __ pop(tmp4);
    4.98      __ pop(tmp3);
    4.99      __ pop(tmp2);
   4.100      __ pop(tmp1);

mercurial