      __ addq(rsp, 8);
      __ ret(0);
    }
  }

  // AES intrinsic stubs
  enum {AESBlockSize = 16};

  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data64( 0x0405060700010203, relocInfo::none );
    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
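    // note: used with pshufb, this mask reverses the four bytes within each 32-bit
    // lane (byte i of the result = byte mask[i] of the source).  For example the int
    // 0x00010203 sits in little-endian memory as 03 02 01 00, and the shuffle turns
    // that lane back into 00 01 02 03.  load_key() below uses it to turn the
    // int-ordered Java expanded key into the byte order the AES instructions operate on.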
    return start;
  }

  // Utility routine for loading a 128-bit key word in little endian format;
  // the caller can optionally specify that the shuffle mask is already in an xmm register
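  // (passing the mask register in saves re-reading the shuffle constant from memory on
  // every key load; the block-chaining stubs below preload it once and reuse it)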
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
  }

  // aesenc using specified key+offset
  // can optionally specify that the shuffle mask is already in an xmm register
  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesenc(xmmdst, xmmtmp);
  }

  // aesdec using specified key+offset
  // can optionally specify that the shuffle mask is already in an xmm register
  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesdec(xmmdst, xmmtmp);
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rax;

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
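    // keylen is now 0 for a 128-bit key (44 ints = 11 words), 2 for a 192-bit key
    // (52 ints = 13 words) and 4 for a 256-bit key (60 ints = 15 words); the two
    // conditional branches below use it to pick 10, 12 or 14 rounds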

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input

    // For encryption, the Java expanded key ordering is just what we need;
    // we don't know if the key is aligned, hence not using the load-execute form

    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp);
    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
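    // nine aesenc rounds done (keys 0x10-0x90); a 128-bit key needs only the final
    // aesenclast with key 0xa0, while 192/256-bit keys take two/four extra rounds below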
    load_key(xmm_temp, key, 0xa0, xmm_key_shuf_mask);
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);
    __ aesenc(xmm_result, xmm_temp);               // only in 192- and 256-bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);
    __ aesenc(xmm_result, xmm_temp);               // only in 256-bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rax;

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));

    // for decryption the Java expanded key ordering is rotated one position from what we want,
    // so we start from 0x10 here and hit 0x00 last;
    // we don't know if the key is aligned, hence not using the load-execute form
    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
    __ pxor  (xmm_result, xmm_temp);
    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
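    // nine aesdec rounds done (keys 0x20-0xa0, after the initial xor with key 0x10);
    // a 128-bit key needs only the final aesdeclast, 192/256-bit keys take two/four more rounds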
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);
    // only in 192- and 256-bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);
    // only in 256-bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    // for decryption the aesdeclast operation is always on key+0x00
    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ aesdeclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, 0), xmm_result);  // store the result

    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r10;      // pick the first volatile Windows register
#endif
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // keys 0-10 preloaded into xmm2-xmm12
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 12;
    const XMMRegister xmm_key0  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
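    // (the Win64 ABI passes only the first four arguments in registers, so the length
    // arrives on the stack, and xmm6-xmm15 are callee-saved, hence the save/restore
    // of xmm6-xmm12 below)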
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved (xmm6-xmm12)
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 2 thru 12 with keys 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    __ movdqu(xmm_result, Address(rvec, 0x00));  // initialize xmm_result with r vec
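    // CBC encryption is inherently serial: each output block is
    //   C[i] = AES_encrypt(P[i] ^ C[i-1], key)
    // so every path below processes one 16-byte block per loop iteration, keeping the
    // running r vector (the previous ciphertext block) in xmm_result between iterations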

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128-bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);       // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0);  // return 0 (why?)
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
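    // (only round keys 0x00-0xa0 were preloaded into xmm2-xmm12, so the 192/256-bit
    // loops fetch the remaining round keys 0xb0-0xe0 from memory on every iteration)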
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    load_key(xmm_temp, key, 0xc0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }



  // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //

  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256;
    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r10;      // pick the first volatile Windows register
#endif
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    // keys 0-10 preloaded into xmm5-xmm15
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved (xmm6-xmm15)
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif
    // the Java expanded key ordering is rotated one position from what we want,
    // so we start from 0x10 here and hit 0x00 last
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with key offsets 0x10 - 0xa0, then 0x00
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
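    // at this point xmm5..xmm14 hold round keys 0x10..0xa0 and xmm15 (xmm_key_last)
    // holds key+0x00, which is what the final aesdeclast of every path below uses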

    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
    const XMMRegister xmm_result2 = xmm3;
    const XMMRegister xmm_result3 = xmm4;

    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // initialize with initial rvec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (44=128, 52=192, 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);


    // 128-bit code follows here, parallelized
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_multiBlock_loopTop_128);
    __ cmpptr(len_reg, 4*AESBlockSize);  // see if at least 4 blocks left
    __ jcc(Assembler::less, L_singleBlock_loopTop_128);

    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));  // get next 4 blocks into xmmresult registers
    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));

#define DoFour(opc, src_reg)           \
    __ opc(xmm_result0, src_reg);      \
    __ opc(xmm_result1, src_reg);      \
    __ opc(xmm_result2, src_reg);      \
    __ opc(xmm_result3, src_reg);
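    // DoFour issues the same instruction for four independent blocks back to back; since
    // the aesdec of one block does not depend on the others, their latencies overlap.
    // CBC decryption permits this because P[i] = AES_decrypt(C[i]) ^ C[i-1] and all
    // ciphertext blocks are already available, unlike CBC encryption above.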

    DoFour(pxor, xmm_key_first);
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      DoFour(aesdec, as_XMMRegister(rnum));
    }
    DoFour(aesdeclast, xmm_key_last);
    // for each result, xor with the r vector of previous cipher block
    __ pxor(xmm_result0, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
    __ pxor(xmm_result1, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
    __ pxor(xmm_result2, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
    __ pxor(xmm_result3, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));  // this will carry over to next set of blocks

    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);  // store 4 results into the next 64 bytes of output
    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);

    __ addptr(pos, 4*AESBlockSize);
    __ subptr(len_reg, 4*AESBlockSize);
    __ jmp(L_multiBlock_loopTop_128);

    // registers used in the non-parallelized loops
    const XMMRegister xmm_prev_block_cipher_save = xmm2;
    const XMMRegister xmm_temp = xmm3;

    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_128);
    __ cmpptr(len_reg, 0);  // any blocks left??
    __ jcc(Assembler::equal, L_exit);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);               // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);          // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    __ aesdeclast(xmm_result, xmm_key_last);
    __ pxor  (xmm_result, xmm_prev_block_cipher);  // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);    // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);    // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jmp(L_singleBlock_loopTop_128);


    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);  // final value of r stored in rvec of CipherBlockChaining object
#ifdef _WIN64
    // restore regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0);  // return 0 (why?)
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);


    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_192);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);               // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);          // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);  // 192-bit key goes up to c0
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
    __ aesdeclast(xmm_result, xmm_key_last);       // xmm15 always came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);  // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);    // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);    // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_singleBlock_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_256);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));  // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);               // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);          // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0);  // 256-bit key goes up to e0
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
    aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
    aes_dec_key(xmm_result, xmm_temp, key, 0xe0);
    __ aesdeclast(xmm_result, xmm_key_last);       // xmm15 came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);  // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);    // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);    // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_singleBlock_loopTop_256);
    __ jmp(L_exit);

    return start;
  }



#undef __
#define __ masm->

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception