src/cpu/x86/vm/macroAssembler_x86.cpp

changeset 5353
b800986664f4
parent 4873
e961c11b85fe
child 5528
740e263c80c6
     1.1 --- a/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Jul 02 07:51:31 2013 +0200
     1.2 +++ b/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Jul 02 20:42:12 2013 -0400
     1.3 @@ -1,5 +1,5 @@
     1.4  /*
     1.5 - * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
     1.6 + * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     1.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.8   *
     1.9   * This code is free software; you can redistribute it and/or modify it
    1.10 @@ -2794,6 +2794,15 @@
    1.11    }
    1.12  }
    1.13  
    1.14 +void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) { // emit movdqa loading dst from the literal address src
    1.15 +  if (reachable(src)) { // NOTE(review): presumably "directly encodable in the instruction" - confirm against reachable()
    1.16 +    Assembler::movdqa(dst, as_Address(src)); // use src as the memory operand directly
    1.17 +  } else {
    1.18 +    lea(rscratch1, src); // otherwise put the address into rscratch1 (its previous value is lost) ...
    1.19 +    Assembler::movdqa(dst, Address(rscratch1, 0)); // ... and load through [rscratch1 + 0]
    1.20 +  }
    1.21 +}
    1.22 +
    1.23  void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
    1.24    if (reachable(src)) {
    1.25      Assembler::movsd(dst, as_Address(src));
    1.26 @@ -6388,6 +6397,193 @@
    1.27    bind(L_done);
    1.28  }
    1.29  
    1.30 +/**
    1.31 + * Emits code to update CRC-32 with a byte value according to constants in table.
    1.32 + *
    1.33 + * @param [in,out]crc   Register containing the crc; receives the updated crc.
    1.34 + * @param [in]val       Register containing the byte to fold into the CRC; overwritten with the table index.
    1.35 + * @param [in]table     Register containing the table of crc constants (indexed with a scale of 4 bytes per entry).
    1.36 + *
    1.37 + * uint32_t crc;
    1.38 + * val = (val ^ crc) & 0xFF;
    1.39 + * crc = crc_table[val] ^ (crc >> 8);
    1.40 + *
    1.41 + */
    1.42 +void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
    1.43 +  xorl(val, crc); // val ^= crc
    1.44 +  andl(val, 0xFF); // keep only the low byte: this is the table index
    1.45 +  shrl(crc, 8); // unsigned shift
    1.46 +  xorl(crc, Address(table, val, Address::times_4, 0)); // crc ^= entry at table + val * 4
    1.47 +}
    1.48 +
    1.49 +/**
    1.50 + * Fold 128-bit data chunk: xcrc = vpclmulhdq(xK, xcrc) ^ vpclmulldq(xK, xcrc) ^ data at [buf + offset]; xtmp is scratch.
    1.51 + */
    1.52 +void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
    1.53 +  vpclmulhdq(xtmp, xK, xcrc); // [127:64]
    1.54 +  vpclmulldq(xcrc, xK, xcrc); // [63:0]
    1.55 +  vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */); // xor in the next data chunk from memory
    1.56 +  pxor(xcrc, xtmp); // combine with the product saved in xtmp
    1.57 +}
    1.58 +
    1.59 +void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { // same fold, with the data chunk already in register xbuf
    1.60 +  vpclmulhdq(xtmp, xK, xcrc); // first product goes to scratch xtmp
    1.61 +  vpclmulldq(xcrc, xK, xcrc); // second product overwrites xcrc
    1.62 +  pxor(xcrc, xbuf); // xor in the data chunk
    1.63 +  pxor(xcrc, xtmp); // combine with the product saved in xtmp
    1.64 +}
    1.65 +
    1.66 +/**
    1.67 + * 8-bit folds to compute 32-bit CRC (emits one fold step on an XMM value; xtmp and tmp are scratch)
    1.68 + *
    1.69 + * uint64_t xcrc;
    1.70 + * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
    1.71 + */
    1.72 +void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
    1.73 +  movdl(tmp, xcrc); // copy the low 32 bits of xcrc to tmp
    1.74 +  andl(tmp, 0xFF); // table index = low byte
    1.75 +  movdl(xtmp, Address(table, tmp, Address::times_4, 0)); // load the entry at table + tmp * 4
    1.76 +  psrldq(xcrc, 1); // unsigned shift one byte
    1.77 +  pxor(xcrc, xtmp); // xcrc = table entry ^ shifted xcrc
    1.78 +}
    1.79 +
    1.80 +/**
    1.81 + * uint32_t crc;   (general-register variant of the 8-bit fold; tmp is scratch, crc is updated in place)
    1.82 + * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
    1.83 + */
    1.84 +void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
    1.85 +  movl(tmp, crc); // work on a copy so crc can be shifted below
    1.86 +  andl(tmp, 0xFF); // table index = low byte of crc
    1.87 +  shrl(crc, 8); // unsigned shift one byte
    1.88 +  xorl(crc, Address(table, tmp, Address::times_4, 0)); // crc ^= entry at table + tmp * 4
    1.89 +}
    1.90 +
    1.91 +/**
    1.92 + * @param crc   register containing existing CRC (32-bit); updated in place with the new CRC
    1.93 + * @param buf   register pointing to input byte buffer (byte*); advanced as bytes are consumed
    1.94 + * @param len   register containing number of bytes; overwritten while counting down
    1.95 + * @param table register that will contain address of CRC table
    1.96 + * @param tmp   scratch register (rax and xmm0-xmm5 are also overwritten)
    1.97 + */
    1.98 +void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
    1.99 +  assert_different_registers(crc, buf, len, table, tmp, rax); // rax is used as a scratch register below
   1.100 +
   1.101 +  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
   1.102 +  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
   1.103 +
   1.104 +  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
   1.105 +  notl(crc); // ~crc
   1.106 +  cmpl(len, 16);
   1.107 +  jcc(Assembler::less, L_tail); // fewer than 16 bytes: byte-at-a-time loop only
   1.108 +
   1.109 +  // Align buffer to 16 bytes
   1.110 +  movl(tmp, buf);
   1.111 +  andl(tmp, 0xF); // tmp = buf mod 16
   1.112 +  jccb(Assembler::zero, L_aligned);
   1.113 +  subl(tmp,  16); // tmp = -(number of bytes up to the next 16-byte boundary)
   1.114 +  addl(len, tmp); // take those bytes off len up front
   1.115 +
   1.116 +  align(4);
   1.117 +  BIND(L_align_loop);
   1.118 +  movsbl(rax, Address(buf, 0)); // load byte with sign extension (update_byte_crc32 masks the index to 8 bits)
   1.119 +  update_byte_crc32(crc, rax, table);
   1.120 +  increment(buf);
   1.121 +  incrementl(tmp); // counts up towards zero
   1.122 +  jccb(Assembler::less, L_align_loop); // loop while tmp is still negative
   1.123 +
   1.124 +  BIND(L_aligned);
   1.125 +  movl(tmp, len); // save
   1.126 +  shrl(len, 4); // len = number of whole 16-byte chunks
   1.127 +  jcc(Assembler::zero, L_tail_restore); // no whole chunk left: restore the byte count and finish bytewise
   1.128 +
   1.129 +  // Fold crc into first bytes of vector
   1.130 +  movdqa(xmm1, Address(buf, 0));
   1.131 +  movdl(rax, xmm1);
   1.132 +  xorl(crc, rax);
   1.133 +  pinsrd(xmm1, crc, 0);
   1.134 +  addptr(buf, 16);
   1.135 +  subl(len, 4); // len > 0 before this; negative afterwards means fewer than 4 chunks in total
   1.136 +  jcc(Assembler::less, L_fold_tail);
   1.137 +
   1.138 +  movdqa(xmm2, Address(buf,  0));
   1.139 +  movdqa(xmm3, Address(buf, 16));
   1.140 +  movdqa(xmm4, Address(buf, 32));
   1.141 +  addptr(buf, 48);
   1.142 +  subl(len, 3); // len = (chunks not yet loaded) - 3
   1.143 +  jcc(Assembler::lessEqual, L_fold_512b); // fewer than 4 chunks remain: skip the 4-stream loop
   1.144 +
   1.145 +  // Fold total 512 bits of polynomial on each iteration,
   1.146 +  // 128 bits per each of 4 parallel streams.
   1.147 +  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); // folding constant for this loop
   1.148 +
   1.149 +  align(32);
   1.150 +  BIND(L_fold_512b_loop);
   1.151 +  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
   1.152 +  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
   1.153 +  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
   1.154 +  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
   1.155 +  addptr(buf, 64);
   1.156 +  subl(len, 4); // four chunks consumed per iteration
   1.157 +  jcc(Assembler::greater, L_fold_512b_loop);
   1.158 +
   1.159 +  // Fold 512 bits to 128 bits.
   1.160 +  BIND(L_fold_512b);
   1.161 +  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); // folding constant for single-chunk folds
   1.162 +  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
   1.163 +  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
   1.164 +  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
   1.165 +
   1.166 +  // Fold the rest of 128 bits data chunks
   1.167 +  BIND(L_fold_tail);
   1.168 +  addl(len, 3); // remove the bias: len = number of 16-byte chunks still unread
   1.169 +  jccb(Assembler::lessEqual, L_fold_128b);
   1.170 +  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); // reloaded: L_fold_tail is also reached directly from above
   1.171 +
   1.172 +  BIND(L_fold_tail_loop);
   1.173 +  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
   1.174 +  addptr(buf, 16);
   1.175 +  decrementl(len);
   1.176 +  jccb(Assembler::greater, L_fold_tail_loop);
   1.177 +
   1.178 +  // Fold 128 bits in xmm1 down into 32 bits in crc register.
   1.179 +  BIND(L_fold_128b);
   1.180 +  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr())); // constants for the final reduction
   1.181 +  vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
   1.182 +  vpand(xmm3, xmm0, xmm2, false /* vector256 */);
   1.183 +  vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
   1.184 +  psrldq(xmm1, 8);
   1.185 +  psrldq(xmm2, 4);
   1.186 +  pxor(xmm0, xmm1);
   1.187 +  pxor(xmm0, xmm2);
   1.188 +
   1.189 +  // 8 8-bit folds to compute 32-bit CRC (4 on xmm0, then 4 on the general register).
   1.190 +  for (int j = 0; j < 4; j++) {
   1.191 +    fold_8bit_crc32(xmm0, table, xmm1, rax);
   1.192 +  }
   1.193 +  movdl(crc, xmm0); // mov 32 bits to general register
   1.194 +  for (int j = 0; j < 4; j++) {
   1.195 +    fold_8bit_crc32(crc, table, rax);
   1.196 +  }
   1.197 +
   1.198 +  BIND(L_tail_restore);
   1.199 +  movl(len, tmp); // restore
   1.200 +  BIND(L_tail);
   1.201 +  andl(len, 0xf); // bytes left over after the 16-byte chunks
   1.202 +  jccb(Assembler::zero, L_exit);
   1.203 +
   1.204 +  // Fold the rest of bytes
   1.205 +  align(4);
   1.206 +  BIND(L_tail_loop);
   1.207 +  movsbl(rax, Address(buf, 0)); // load byte with sign extension (update_byte_crc32 masks the index to 8 bits)
   1.208 +  update_byte_crc32(crc, rax, table);
   1.209 +  increment(buf);
   1.210 +  decrementl(len);
   1.211 +  jccb(Assembler::greater, L_tail_loop);
   1.212 +
   1.213 +  BIND(L_exit);
   1.214 +  notl(crc); // ~crc
   1.215 +}
   1.216 +
   1.217  #undef BIND
   1.218  #undef BLOCK_COMMENT
   1.219  

mercurial