src/cpu/x86/vm/stubGenerator_x86_64.cpp

changeset 4363:2c7f594145dc
parent    4325:d2f8c38e543d
child     4411:e2e6bf86682c

--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -2951,34 +2951,19 @@
     } else {
       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     }
   }
 
-  // aesenc using specified key+offset
-  // can optionally specify that the shuffle mask is already in an xmmregister
-  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
-    load_key(xmmtmp, key, offset, xmm_shuf_mask);
-    __ aesenc(xmmdst, xmmtmp);
-  }
-
-  // aesdec using specified key+offset
-  // can optionally specify that the shuffle mask is already in an xmmregister
-  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
-    load_key(xmmtmp, key, offset, xmm_shuf_mask);
-    __ aesdec(xmmdst, xmmtmp);
-  }
-
-
   // Arguments:
   //
   // Inputs:
   //   c_rarg0   - source byte array address
   //   c_rarg1   - destination byte array address
   //   c_rarg2   - K (key) in little endian int array
   //
   address generate_aescrypt_encryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
     Label L_doLast;
     address start = __ pc();
 
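A note on the keylen dispatch visible in the hunks below: the Java-side expanded key is an int array of 4*(Nr+1) words, where Nr is 10, 12, or 14 AES rounds, so its length can only be 44, 52, or 60. The old code normalized that to 0/2/4 via shrl/subl before branching; the new code compares against 44 and 52 directly. A minimal sketch of the arithmetic (hypothetical helper, not part of the stub):

    // Hypothetical helper illustrating the keylen values the stub dispatches on.
    // AES-128 -> Nr = 10 rounds -> (10 + 1) * 4 = 44 ints
    // AES-192 -> Nr = 12 rounds -> (12 + 1) * 4 = 52 ints
    // AES-256 -> Nr = 14 rounds -> (14 + 1) * 4 = 60 ints
    static int rounds_for_expanded_key_ints(int keylen_ints) {
      // keylen_ints is the length of the little-endian int array in c_rarg2
      return keylen_ints / 4 - 1;   // 44 -> 10, 52 -> 12, 60 -> 14
    }

This is why the rewritten stubs can branch with a plain cmpl(keylen, 44) / cmpl(keylen, 52).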
@@ -2986,45 +2971,75 @@
     const Register to          = c_rarg1;  // destination array address
     const Register key         = c_rarg2;  // key array address
     const Register keylen      = rax;
 
     const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+    // On win64 xmm6-xmm15 must be preserved so don't use them.
+    const XMMRegister xmm_temp1  = xmm2;
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
 
     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
 
     // For encryption, the java expanded key ordering is just what we need
     // we don't know if the key is aligned, hence not using load-execute form
 
-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ pxor(xmm_result, xmm_temp);
-    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
-      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
-    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
+    __ pxor(xmm_result, xmm_temp1);
+
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+    __ aesenc(xmm_result, xmm_temp3);
+    __ aesenc(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 44);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenc(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
 
     __ BIND(L_doLast);
-    __ aesenclast(xmm_result, xmm_temp);
+    __ aesenc(xmm_result, xmm_temp1);
+    __ aesenclast(xmm_result, xmm_temp2);
     __ movdqu(Address(to, 0), xmm_result);        // store the result
     __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
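For orientation, the rewritten encrypt stub emits the moral equivalent of the following AES-NI sketch (illustrative only: the real stub is generated machine code that preloads round keys into xmm registers in groups of four and byte-shuffles each key via load_key, which is assumed already done here):

    #include <wmmintrin.h>  // AES-NI intrinsics (compile with -maes)

    // Illustrative sketch of one-block AES encryption as emitted by the stub.
    // 'rk' is assumed to hold the expanded round keys already in the byte
    // order that aesenc expects; nrounds is 10, 12, or 14.
    static __m128i aes_encrypt_block(__m128i block, const __m128i* rk, int nrounds) {
      block = _mm_xor_si128(block, rk[0]);             // whitening, key + 0x00
      for (int i = 1; i < nrounds; i++) {
        block = _mm_aesenc_si128(block, rk[i]);        // middle rounds
      }
      return _mm_aesenclast_si128(block, rk[nrounds]); // final round
    }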
@@ -3038,11 +3053,11 @@
   //   c_rarg0   - source byte array address
   //   c_rarg1   - destination byte array address
   //   c_rarg2   - K (key) in little endian int array
   //
   address generate_aescrypt_decryptBlock() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
     Label L_doLast;
     address start = __ pc();
 
@@ -3050,49 +3065,77 @@
     const Register to          = c_rarg1;  // destination array address
     const Register key         = c_rarg2;  // key array address
     const Register keylen      = rax;
 
     const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    const XMMRegister xmm_key_shuf_mask = xmm2;
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+    // On win64 xmm6-xmm15 must be preserved so don't use them.
+    const XMMRegister xmm_temp1  = xmm2;
+    const XMMRegister xmm_temp2  = xmm3;
+    const XMMRegister xmm_temp3  = xmm4;
+    const XMMRegister xmm_temp4  = xmm5;
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
+    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    // keylen = # of 32-bit words, convert to 128-bit words
-    __ shrl(keylen, 2);
-    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
 
     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     __ movdqu(xmm_result, Address(from, 0));
 
     // for decryption java expanded key ordering is rotated one position from what we want
     // so we start from 0x10 here and hit 0x00 last
     // we don't know if the key is aligned, hence not using load-execute form
-    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
-    __ pxor  (xmm_result, xmm_temp);
-    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
-      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
-    }
-    __ cmpl(keylen, 0);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 192 and 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
-    __ subl(keylen, 2);
-    __ jcc(Assembler::equal, L_doLast);
-    // only in 256 bit keys
-    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+    __ pxor  (xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+    __ aesdec(xmm_result, xmm_temp3);
+    __ aesdec(xmm_result, xmm_temp4);
+
+    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 44);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+    __ cmpl(keylen, 52);
+    __ jccb(Assembler::equal, L_doLast);
+
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
+    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
 
     __ BIND(L_doLast);
+    __ aesdec(xmm_result, xmm_temp1);
+    __ aesdec(xmm_result, xmm_temp2);
+
     // for decryption the aesdeclast operation is always on key+0x00
-    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
-    __ aesdeclast(xmm_result, xmm_temp);
-
+    __ aesdeclast(xmm_result, xmm_temp3);
     __ movdqu(Address(to, 0), xmm_result);  // store the result
-
     __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
     return start;
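The decrypt stub mirrors the encrypt one, with the wrinkle the comments call out: the Java expanded decryption key is rotated one position from aesdec's preferred order, so the rounds consume offsets 0x10 through 0xe0 and aesdeclast always takes the round key at offset 0x00. A sketch under the stated assumptions (rk_rot is already byte-shuffled and in aesdec's equivalent-inverse-cipher form):

    // Illustrative sketch of the decrypt flow; rk_rot is the rotated schedule
    // as laid out by the Java AESCrypt class (an assumption for this sketch).
    static __m128i aes_decrypt_block(__m128i block, const __m128i* rk_rot, int nrounds) {
      block = _mm_xor_si128(block, rk_rot[1]);        // key + 0x10 first
      for (int i = 2; i <= nrounds; i++) {
        block = _mm_aesdec_si128(block, rk_rot[i]);   // key + 0x20 onward
      }
      return _mm_aesdeclast_si128(block, rk_rot[0]);  // key + 0x00 last
    }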
@@ -3107,11 +3150,11 @@
   //   c_rarg2   - K (key) in little endian int array
   //   c_rarg3   - r vector byte array address
   //   c_rarg4   - input length
   //
   address generate_cipherBlockChaining_encryptAESCrypt() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
     address start = __ pc();
 
     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
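The argument list documented above corresponds to a stub entry point of roughly this shape (hypothetical C prototype for illustration only; on Win64 the fifth argument arrives on the stack, which is why the code below loads len_reg from len_mem):

    // Hypothetical C-level view of the stub entry point; names are illustrative.
    // from/to are byte arrays, key is the expanded key as a little-endian int
    // array, rvec is the 16-byte chaining vector, len is the input length in bytes.
    typedef int (*cipherBlockChaining_fn)(unsigned char* from, unsigned char* to,
                                          unsigned int* key, unsigned char* rvec,
                                          int len);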
@@ -3131,54 +3174,55 @@
     // xmm register assignments for the loops below
     const XMMRegister xmm_result = xmm0;
     const XMMRegister xmm_temp   = xmm1;
     // keys 0-10 preloaded into xmm2-xmm12
     const int XMM_REG_NUM_KEY_FIRST = 2;
-    const int XMM_REG_NUM_KEY_LAST  = 12;
+    const int XMM_REG_NUM_KEY_LAST  = 15;
     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
-    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
+    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
+    const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
+    const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
+    const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
 #ifdef _WIN64
     // on win64, fill len_reg from stack position
     __ movl(len_reg, len_mem);
-    // save the xmm registers which must be preserved 6-12
+    // save the xmm registers which must be preserved 6-15
     __ subptr(rsp, -rsp_after_call_off * wordSize);
     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
       __ movdqu(xmm_save(i), as_XMMRegister(i));
     }
 #endif
 
     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
-    // load up xmm regs 2 thru 12 with key 0x00 - 0xa0
-    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+    // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
       offset += 0x10;
     }
-
     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
 
     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
     __ cmpl(rax, 44);
     __ jcc(Assembler::notEqual, L_key_192_256);
 
     // 128 bit code follows here
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_loopTop_128);
     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
-
     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
       __ aesenc(xmm_result, as_XMMRegister(rnum));
     }
     __ aesenclast(xmm_result, xmm_key10);
-
     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
     // no need to store r to memory until we exit
     __ addptr(pos, AESBlockSize);
     __ subptr(len_reg, AESBlockSize);
     __ jcc(Assembler::notEqual, L_loopTop_128);
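CBC encryption is inherently serial, C_i = E_K(P_i ^ C_{i-1}), so this stub keeps one running xmm_result chained across iterations instead of parallelizing. A sketch of the 128-bit loop body, under the same key-preloading assumption as the earlier sketches:

    // Illustrative CBC-encrypt loop (AES-128). 'r' starts as the IV from rvec
    // and carries the previous ciphertext block between iterations, playing
    // the role of xmm_result above.
    static void cbc_encrypt128(const __m128i* in, __m128i* out, size_t nblocks,
                               const __m128i* rk, __m128i r) {
      for (size_t b = 0; b < nblocks; b++) {
        r = _mm_xor_si128(r, _mm_loadu_si128(in + b)); // xor with current r vector
        r = _mm_xor_si128(r, rk[0]);
        for (int i = 1; i < 10; i++) {
          r = _mm_aesenc_si128(r, rk[i]);
        }
        r = _mm_aesenclast_si128(r, rk[10]);
        _mm_storeu_si128(out + b, r);                  // ciphertext feeds next block
      }
    }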
@@ -3196,53 +3240,49 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
     __ BIND(L_key_192_256);
     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
+    load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
     __ cmpl(rax, 52);
     __ jcc(Assembler::notEqual, L_key_256);
 
     // 192-bit code follows here (could be changed to use more xmm registers)
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_loopTop_192);
     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
-
     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
       __ aesenc(xmm_result, as_XMMRegister(rnum));
     }
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
-    load_key(xmm_temp, key, 0xc0);
-    __ aesenclast(xmm_result, xmm_temp);
-
+    __ aesenclast(xmm_result, xmm_key12);
     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
     // no need to store r to memory until we exit
     __ addptr(pos, AESBlockSize);
     __ subptr(len_reg, AESBlockSize);
     __ jcc(Assembler::notEqual, L_loopTop_192);
     __ jmp(L_exit);
 
     __ BIND(L_key_256);
     // 256-bit code follows here (could be changed to use more xmm registers)
+    load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_loopTop_256);
     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
-
     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
+    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
       __ aesenc(xmm_result, as_XMMRegister(rnum));
     }
-    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
-    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
-    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
     load_key(xmm_temp, key, 0xe0);
     __ aesenclast(xmm_result, xmm_temp);
-
     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
     // no need to store r to memory until we exit
     __ addptr(pos, AESBlockSize);
     __ subptr(len_reg, AESBlockSize);
     __ jcc(Assembler::notEqual, L_loopTop_256);
@@ -3265,11 +3305,11 @@
   //   c_rarg3   - r vector byte array address
   //   c_rarg4   - input length
   //
 
   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
-    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+    assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
     address start = __ pc();
 
     Label L_exit, L_key_192_256, L_key_256;
@@ -3286,16 +3326,14 @@
     const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
     const Register len_reg     = r10;      // pick the first volatile windows register
 #endif
     const Register pos         = rax;
 
-    // xmm register assignments for the loops below
-    const XMMRegister xmm_result = xmm0;
     // keys 0-10 preloaded into xmm2-xmm12
     const int XMM_REG_NUM_KEY_FIRST = 5;
     const int XMM_REG_NUM_KEY_LAST  = 15;
     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
 #ifdef _WIN64
@@ -3310,17 +3348,18 @@
     // the java expanded key ordering is rotated one position from what we want
     // so we start from 0x10 here and hit 0x00 last
     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
-    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
-      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
+    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
       offset += 0x10;
     }
+    load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
 
     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
+
     // registers holding the four results in the parallelized loop
     const XMMRegister xmm_result0 = xmm0;
     const XMMRegister xmm_result1 = xmm2;
     const XMMRegister xmm_result2 = xmm3;
     const XMMRegister xmm_result3 = xmm4;
@@ -3374,12 +3413,16 @@
     __ addptr(pos, 4*AESBlockSize);
     __ subptr(len_reg, 4*AESBlockSize);
     __ jmp(L_multiBlock_loopTop_128);
 
     // registers used in the non-parallelized loops
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_result = xmm0;
     const XMMRegister xmm_prev_block_cipher_save = xmm2;
-    const XMMRegister xmm_temp   = xmm3;
+    const XMMRegister xmm_key11  = xmm3;
+    const XMMRegister xmm_key12  = xmm4;
+    const XMMRegister xmm_temp   = xmm4;
 
     __ align(OptoLoopAlignment);
     __ BIND(L_singleBlock_loopTop_128);
     __ cmpptr(len_reg, 0); // any blocks left??
     __ jcc(Assembler::equal, L_exit);
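CBC decryption, by contrast, has no cross-block dependence in its AES work, since P_i = D_K(C_i) ^ C_{i-1} reads only ciphertext. That is what the _Parallel variant's multi-block loop (shown only in part above) exploits: four independent aesdec chains overlap in the pipeline. A sketch of that shape, under the same rotated-schedule assumptions as the earlier decrypt sketch:

    // Illustrative 4-way CBC-decrypt (AES-128), mirroring the shape of
    // L_multiBlock_loopTop_128. rk_rot is the rotated schedule (round key at
    // offset 0x00 used last); prev is the IV or the last ciphertext block of
    // the previous group.
    static void cbc_decrypt128_x4(const __m128i* in, __m128i* out, __m128i prev,
                                  const __m128i* rk_rot) {
      __m128i r0 = _mm_loadu_si128(in + 0), r1 = _mm_loadu_si128(in + 1);
      __m128i r2 = _mm_loadu_si128(in + 2), r3 = _mm_loadu_si128(in + 3);
      __m128i c0 = r0, c1 = r1, c2 = r2, c3 = r3;    // keep ciphertext for chaining
      r0 = _mm_xor_si128(r0, rk_rot[1]); r1 = _mm_xor_si128(r1, rk_rot[1]);
      r2 = _mm_xor_si128(r2, rk_rot[1]); r3 = _mm_xor_si128(r3, rk_rot[1]);
      for (int i = 2; i <= 10; i++) {                // independent chains overlap
        r0 = _mm_aesdec_si128(r0, rk_rot[i]); r1 = _mm_aesdec_si128(r1, rk_rot[i]);
        r2 = _mm_aesdec_si128(r2, rk_rot[i]); r3 = _mm_aesdec_si128(r3, rk_rot[i]);
      }
      r0 = _mm_aesdeclast_si128(r0, rk_rot[0]); r1 = _mm_aesdeclast_si128(r1, rk_rot[0]);
      r2 = _mm_aesdeclast_si128(r2, rk_rot[0]); r3 = _mm_aesdeclast_si128(r3, rk_rot[0]);
      _mm_storeu_si128(out + 0, _mm_xor_si128(r0, prev));
      _mm_storeu_si128(out + 1, _mm_xor_si128(r1, c0));
      _mm_storeu_si128(out + 2, _mm_xor_si128(r2, c1));
      _mm_storeu_si128(out + 3, _mm_xor_si128(r3, c2));
      // the caller continues with prev = c3
    }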
@@ -3413,57 +3456,62 @@
     __ ret(0);
 
 
     __ BIND(L_key_192_256);
     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
+    load_key(xmm_key11, key, 0xb0);
     __ cmpl(rax, 52);
     __ jcc(Assembler::notEqual, L_key_256);
 
     // 192-bit code follows here (could be optimized to use parallelism)
+    load_key(xmm_key12, key, 0xc0);     // 192-bit key goes up to c0
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_singleBlock_loopTop_192);
     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
     __ movdqa(xmm_prev_block_cipher_save, xmm_result);                // save for next r vector
     __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
       __ aesdec(xmm_result, as_XMMRegister(rnum));
     }
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 192-bit key goes up to c0
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
+    __ aesdec(xmm_result, xmm_key11);
+    __ aesdec(xmm_result, xmm_key12);
    __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
     // no need to store r to memory until we exit
     __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
-
     __ addptr(pos, AESBlockSize);
     __ subptr(len_reg, AESBlockSize);
     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
     __ jmp(L_exit);
 
     __ BIND(L_key_256);
     // 256-bit code follows here (could be optimized to use parallelism)
     __ movptr(pos, 0);
     __ align(OptoLoopAlignment);
+
     __ BIND(L_singleBlock_loopTop_256);
     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
     __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
     __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
       __ aesdec(xmm_result, as_XMMRegister(rnum));
     }
-    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 256-bit key goes up to e0
-    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
-    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
-    __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
+    __ aesdec(xmm_result, xmm_key11);
+    load_key(xmm_temp, key, 0xc0);
+    __ aesdec(xmm_result, xmm_temp);
+    load_key(xmm_temp, key, 0xd0);
+    __ aesdec(xmm_result, xmm_temp);
+    load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
+    __ aesdec(xmm_result, xmm_temp);
+    __ aesdeclast(xmm_result, xmm_key_last);  // xmm15 came from key+0
     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
     // no need to store r to memory until we exit
     __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
-
     __ addptr(pos, AESBlockSize);
     __ subptr(len_reg, AESBlockSize);
     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
     __ jmp(L_exit);
 
