--- a/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Jul 02 07:51:31 2013 +0200
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Jul 02 20:42:12 2013 -0400
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2794,6 +2794,15 @@
   }
 }
 
+void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {  // emit Assembler::movdqa of literal address src into dst
+  if (reachable(src)) {                             // src can be addressed directly via as_Address()
+    Assembler::movdqa(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);                            // otherwise form the address in rscratch1 first (rscratch1 is overwritten)
+    Assembler::movdqa(dst, Address(rscratch1, 0));
+  }
+}
+
 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
     Assembler::movsd(dst, as_Address(src));
@@ -6388,6 +6397,193 @@
   bind(L_done);
 }
 
+/**
+ * Emits code to update CRC-32 with a byte value according to constants in table
+ *
+ * @param [in,out]crc Register containing the crc.
+ * @param [in]val Register containing the byte to fold into the CRC.
+ * @param [in]table Register containing the table of crc constants.
+ *
+ *   uint32_t crc;
+ *   val = (val ^ crc) & 0xFF;              // val is overwritten with the table index
+ *   crc = crc_table[val] ^ (crc >> 8);     // crc_table entries are 4 bytes wide
+ *
+ */
+void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
+  xorl(val, crc);                                       // val ^= crc
+  andl(val, 0xFF);                                      // keep only the low byte: the table index
+  shrl(crc, 8); // unsigned shift
+  xorl(crc, Address(table, val, Address::times_4, 0));  // crc ^= 32-bit entry at table + val * 4
+}
+
+/**
+ * Fold 128-bit data chunk: xcrc = vpclmulhdq(xK, xcrc) ^ vpclmulldq(xK, xcrc) ^ 16 bytes at [buf + offset]; xtmp is overwritten.
+ */
+void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
+  vpclmulhdq(xtmp, xK, xcrc); // [127:64] (presumably the high quadwords; helper is defined elsewhere)
+  vpclmulldq(xcrc, xK, xcrc); // [63:0]
+  vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);  // mix in the next data chunk from memory
+  pxor(xcrc, xtmp);           // combine the two partial products
+}
+
+void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {  // same fold, data chunk already in xbuf
+  vpclmulhdq(xtmp, xK, xcrc);
+  vpclmulldq(xcrc, xK, xcrc);
+  pxor(xcrc, xbuf);           // mix in the data chunk held in a register
+  pxor(xcrc, xtmp);
+}
+
+/**
+ * One 8-bit fold step on a value held in an XMM register (xtmp and tmp are overwritten):
+ *
+ *   uint64_t xcrc;
+ *   xcrc = timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);   // timesXtoThe32 is the table argument
+ */
+void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
+  movdl(tmp, xcrc);                                       // low 32 bits of xcrc -> tmp
+  andl(tmp, 0xFF);                                        // table index = low byte
+  movdl(xtmp, Address(table, tmp, Address::times_4, 0));  // xtmp = 32-bit entry at table + tmp * 4
+  psrldq(xcrc, 1); // unsigned shift one byte
+  pxor(xcrc, xtmp);
+}
+
+/**
+ * General-register variant of the 8-bit fold step (tmp is overwritten): uint32_t crc;
+ *   crc = timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
+ */
+void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
+  movl(tmp, crc);
+  andl(tmp, 0xFF);                                        // table index = low byte of crc
+  shrl(crc, 8);
+  xorl(crc, Address(table, tmp, Address::times_4, 0));    // crc ^= 32-bit entry at table + tmp * 4
+}
+
+/** Emits code that updates the CRC-32 held in crc with len bytes read from buf.
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table register that will contain address of CRC table (loaded here from StubRoutines::crc_table_addr())
+ * @param tmp scratch register; rax and xmm0-xmm5 are overwritten as well, and buf/len are advanced/consumed
+ */
+void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
+  assert_different_registers(crc, buf, len, table, tmp, rax);  // rax is used below as an extra scratch register
+
+  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
+  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
+
+  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
+  notl(crc); // ~crc
+  cmpl(len, 16);
+  jcc(Assembler::less, L_tail);          // fewer than 16 bytes: byte-at-a-time loop only
+
+  // Align buffer to 16 bytes
+  movl(tmp, buf);                        // only the low 4 bits are used
+  andl(tmp, 0xF);                        // tmp = buf & 0xF
+  jccb(Assembler::zero, L_aligned);
+  subl(tmp, 16);                         // tmp = -(bytes up to the next 16-byte boundary)
+  addl(len, tmp);                        // take those bytes off len
+
+  align(4);
+  BIND(L_align_loop);
+  movsbl(rax, Address(buf, 0)); // load byte with sign extension (update_byte_crc32 masks the index to 8 bits)
+  update_byte_crc32(crc, rax, table);
+  increment(buf);
+  incrementl(tmp);                       // counts up towards zero
+  jccb(Assembler::less, L_align_loop);   // loop while tmp < 0
+
+  BIND(L_aligned);
+  movl(tmp, len); // save remaining byte count (restored at L_tail_restore)
+  shrl(len, 4);                          // len = number of complete 16-byte chunks
+  jcc(Assembler::zero, L_tail_restore);  // no complete chunk left
+
+  // Fold crc into first bytes of vector
+  movdqa(xmm1, Address(buf, 0));
+  movdl(rax, xmm1);                      // low 32 bits of the first chunk
+  xorl(crc, rax);
+  pinsrd(xmm1, crc, 0);                  // write (crc ^ first dword) back into dword 0 of xmm1
+  addptr(buf, 16);
+  subl(len, 4); // len > 0 before this; afterwards len = (chunks not yet loaded) - 3
+  jcc(Assembler::less, L_fold_tail);     // fewer than 4 chunks in total
+
+  movdqa(xmm2, Address(buf, 0));         // load three more chunks: four parallel streams xmm1..xmm4
+  movdqa(xmm3, Address(buf, 16));
+  movdqa(xmm4, Address(buf, 32));
+  addptr(buf, 48);
+  subl(len, 3);                          // len = (chunks not yet loaded) - 3 again
+  jcc(Assembler::lessEqual, L_fold_512b); // fewer than 4 further chunks: skip the 64-byte loop
+
+  // Fold total 512 bits of polynomial on each iteration,
+  // 128 bits per each of 4 parallel streams.
+  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));  // xK operand for the loop
+
+  align(32);
+  BIND(L_fold_512b_loop);
+  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
+  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
+  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
+  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
+  addptr(buf, 64);
+  subl(len, 4);                          // four chunks consumed per iteration
+  jcc(Assembler::greater, L_fold_512b_loop);
+
+  // Fold 512 bits to 128 bits.
+  BIND(L_fold_512b);
+  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));  // xK operand for single-chunk folds
+  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
+  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
+  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
+
+  // Fold the rest of 128 bits data chunks
+  BIND(L_fold_tail);
+  addl(len, 3);                          // undo the -3 bias: len = chunks still unread
+  jccb(Assembler::lessEqual, L_fold_128b);
+  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
+
+  BIND(L_fold_tail_loop);
+  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
+  addptr(buf, 16);
+  decrementl(len);
+  jccb(Assembler::greater, L_fold_tail_loop);
+
+  // Fold 128 bits in xmm1 down into 32 bits in crc register.
+  BIND(L_fold_128b);
+  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
+  vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
+  vpand(xmm3, xmm0, xmm2, false /* vector256 */);
+  vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
+  psrldq(xmm1, 8);
+  psrldq(xmm2, 4);
+  pxor(xmm0, xmm1);
+  pxor(xmm0, xmm2);
+
+  // 8 8-bit folds to compute 32-bit CRC (four on xmm0, then four on crc).
+  for (int j = 0; j < 4; j++) {
+    fold_8bit_crc32(xmm0, table, xmm1, rax);
+  }
+  movdl(crc, xmm0); // mov 32 bits to general register
+  for (int j = 0; j < 4; j++) {
+    fold_8bit_crc32(crc, table, rax);
+  }
+
+  BIND(L_tail_restore);
+  movl(len, tmp); // restore byte count saved at L_aligned
+  BIND(L_tail);
+  andl(len, 0xf);                        // bytes left over after the 16-byte chunks
+  jccb(Assembler::zero, L_exit);
+
+  // Fold the rest of bytes
+  align(4);
+  BIND(L_tail_loop);
+  movsbl(rax, Address(buf, 0)); // load byte with sign extension (update_byte_crc32 masks the index to 8 bits)
+  update_byte_crc32(crc, rax, table);
+  increment(buf);
+  decrementl(len);
+  jccb(Assembler::greater, L_tail_loop);
+
+  BIND(L_exit);
+  notl(crc); // ~crc
+}
+
 #undef BIND
 #undef BLOCK_COMMENT
 