2951 } else { |
2951 } else { |
2952 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
2952 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
2953 } |
2953 } |
2954 } |
2954 } |
2955 |
2955 |
2956 // aesenc using specified key+offset |
|
2957 // can optionally specify that the shuffle mask is already in an xmmregister |
|
2958 void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { |
|
2959 load_key(xmmtmp, key, offset, xmm_shuf_mask); |
|
2960 __ aesenc(xmmdst, xmmtmp); |
|
2961 } |
|
2962 |
|
2963 // aesdec using specified key+offset |
|
2964 // can optionally specify that the shuffle mask is already in an xmmregister |
|
2965 void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { |
|
2966 load_key(xmmtmp, key, offset, xmm_shuf_mask); |
|
2967 __ aesdec(xmmdst, xmmtmp); |
|
2968 } |
|
2969 |
|
2970 |
|
2971 // Arguments: |
2956 // Arguments: |
2972 // |
2957 // |
2973 // Inputs: |
2958 // Inputs: |
2974 // c_rarg0 - source byte array address |
2959 // c_rarg0 - source byte array address |
2975 // c_rarg1 - destination byte array address |
2960 // c_rarg1 - destination byte array address |
2976 // c_rarg2 - K (key) in little endian int array |
2961 // c_rarg2 - K (key) in little endian int array |
2977 // |
2962 // |
2978 address generate_aescrypt_encryptBlock() { |
2963 address generate_aescrypt_encryptBlock() { |
2979 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); |
2964 assert(UseAES, "need AES instructions and misaligned SSE support"); |
2980 __ align(CodeEntryAlignment); |
2965 __ align(CodeEntryAlignment); |
2981 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); |
2966 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); |
2982 Label L_doLast; |
2967 Label L_doLast; |
2983 address start = __ pc(); |
2968 address start = __ pc(); |
2984 |
2969 |
2986 const Register to = c_rarg1; // destination array address |
2971 const Register to = c_rarg1; // destination array address |
2987 const Register key = c_rarg2; // key array address |
2972 const Register key = c_rarg2; // key array address |
2988 const Register keylen = rax; |
2973 const Register keylen = rax; |
2989 |
2974 |
2990 const XMMRegister xmm_result = xmm0; |
2975 const XMMRegister xmm_result = xmm0; |
2991 const XMMRegister xmm_temp = xmm1; |
2976 const XMMRegister xmm_key_shuf_mask = xmm1; |
2992 const XMMRegister xmm_key_shuf_mask = xmm2; |
2977 // On win64 xmm6-xmm15 must be preserved so don't use them. |
|
2978 const XMMRegister xmm_temp1 = xmm2; |
|
2979 const XMMRegister xmm_temp2 = xmm3; |
|
2980 const XMMRegister xmm_temp3 = xmm4; |
|
2981 const XMMRegister xmm_temp4 = xmm5; |
2993 |
2982 |
2994 __ enter(); // required for proper stackwalking of RuntimeStub frame |
2983 __ enter(); // required for proper stackwalking of RuntimeStub frame |
2995 |
2984 |
|
2985 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} |
2996 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
2986 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
2997 // keylen = # of 32-bit words, convert to 128-bit words |
|
2998 __ shrl(keylen, 2); |
|
2999 __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more |
|
3000 |
2987 |
3001 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
2988 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3002 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input |
2989 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input |
3003 |
2990 |
3004 // For encryption, the java expanded key ordering is just what we need |
2991 // For encryption, the java expanded key ordering is just what we need |
3005 // we don't know if the key is aligned, hence not using load-execute form |
2992 // we don't know if the key is aligned, hence not using load-execute form |
3006 |
2993 |
3007 load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); |
2994 load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask); |
3008 __ pxor(xmm_result, xmm_temp); |
2995 __ pxor(xmm_result, xmm_temp1); |
3009 for (int offset = 0x10; offset <= 0x90; offset += 0x10) { |
2996 |
3010 aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); |
2997 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); |
3011 } |
2998 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); |
3012 load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask); |
2999 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); |
3013 __ cmpl(keylen, 0); |
3000 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); |
3014 __ jcc(Assembler::equal, L_doLast); |
3001 |
3015 __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys |
3002 __ aesenc(xmm_result, xmm_temp1); |
3016 aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); |
3003 __ aesenc(xmm_result, xmm_temp2); |
3017 load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask); |
3004 __ aesenc(xmm_result, xmm_temp3); |
3018 __ subl(keylen, 2); |
3005 __ aesenc(xmm_result, xmm_temp4); |
3019 __ jcc(Assembler::equal, L_doLast); |
3006 |
3020 __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys |
3007 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); |
3021 aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); |
3008 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); |
3022 load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask); |
3009 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); |
|
3010 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); |
|
3011 |
|
3012 __ aesenc(xmm_result, xmm_temp1); |
|
3013 __ aesenc(xmm_result, xmm_temp2); |
|
3014 __ aesenc(xmm_result, xmm_temp3); |
|
3015 __ aesenc(xmm_result, xmm_temp4); |
|
3016 |
|
3017 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); |
|
3018 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); |
|
3019 |
|
3020 __ cmpl(keylen, 44); |
|
3021 __ jccb(Assembler::equal, L_doLast); |
|
3022 |
|
3023 __ aesenc(xmm_result, xmm_temp1); |
|
3024 __ aesenc(xmm_result, xmm_temp2); |
|
3025 |
|
3026 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); |
|
3027 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); |
|
3028 |
|
3029 __ cmpl(keylen, 52); |
|
3030 __ jccb(Assembler::equal, L_doLast); |
|
3031 |
|
3032 __ aesenc(xmm_result, xmm_temp1); |
|
3033 __ aesenc(xmm_result, xmm_temp2); |
|
3034 |
|
3035 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); |
|
3036 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); |
3023 |
3037 |
3024 __ BIND(L_doLast); |
3038 __ BIND(L_doLast); |
3025 __ aesenclast(xmm_result, xmm_temp); |
3039 __ aesenc(xmm_result, xmm_temp1); |
|
3040 __ aesenclast(xmm_result, xmm_temp2); |
3026 __ movdqu(Address(to, 0), xmm_result); // store the result |
3041 __ movdqu(Address(to, 0), xmm_result); // store the result |
3027 __ xorptr(rax, rax); // return 0 |
3042 __ xorptr(rax, rax); // return 0 |
3028 __ leave(); // required for proper stackwalking of RuntimeStub frame |
3043 __ leave(); // required for proper stackwalking of RuntimeStub frame |
3029 __ ret(0); |
3044 __ ret(0); |
3030 |
3045 |
3050 const Register to = c_rarg1; // destination array address |
3065 const Register to = c_rarg1; // destination array address |
3051 const Register key = c_rarg2; // key array address |
3066 const Register key = c_rarg2; // key array address |
3052 const Register keylen = rax; |
3067 const Register keylen = rax; |
3053 |
3068 |
3054 const XMMRegister xmm_result = xmm0; |
3069 const XMMRegister xmm_result = xmm0; |
3055 const XMMRegister xmm_temp = xmm1; |
3070 const XMMRegister xmm_key_shuf_mask = xmm1; |
3056 const XMMRegister xmm_key_shuf_mask = xmm2; |
3071 // On win64 xmm6-xmm15 must be preserved so don't use them. |
|
3072 const XMMRegister xmm_temp1 = xmm2; |
|
3073 const XMMRegister xmm_temp2 = xmm3; |
|
3074 const XMMRegister xmm_temp3 = xmm4; |
|
3075 const XMMRegister xmm_temp4 = xmm5; |
3057 |
3076 |
3058 __ enter(); // required for proper stackwalking of RuntimeStub frame |
3077 __ enter(); // required for proper stackwalking of RuntimeStub frame |
3059 |
3078 |
|
3079 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} |
3060 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
3080 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
3061 // keylen = # of 32-bit words, convert to 128-bit words |
|
3062 __ shrl(keylen, 2); |
|
3063 __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more |
|
3064 |
3081 |
3065 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3082 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3066 __ movdqu(xmm_result, Address(from, 0)); |
3083 __ movdqu(xmm_result, Address(from, 0)); |
3067 |
3084 |
3068 // for decryption java expanded key ordering is rotated one position from what we want |
3085 // for decryption java expanded key ordering is rotated one position from what we want |
3069 // so we start from 0x10 here and hit 0x00 last |
3086 // so we start from 0x10 here and hit 0x00 last |
3070 // we don't know if the key is aligned, hence not using load-execute form |
3087 // we don't know if the key is aligned, hence not using load-execute form |
3071 load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask); |
3088 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); |
3072 __ pxor (xmm_result, xmm_temp); |
3089 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); |
3073 for (int offset = 0x20; offset <= 0xa0; offset += 0x10) { |
3090 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); |
3074 aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask); |
3091 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); |
3075 } |
3092 |
3076 __ cmpl(keylen, 0); |
3093 __ pxor (xmm_result, xmm_temp1); |
3077 __ jcc(Assembler::equal, L_doLast); |
3094 __ aesdec(xmm_result, xmm_temp2); |
3078 // only in 192 and 256 bit keys |
3095 __ aesdec(xmm_result, xmm_temp3); |
3079 aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask); |
3096 __ aesdec(xmm_result, xmm_temp4); |
3080 aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask); |
3097 |
3081 __ subl(keylen, 2); |
3098 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); |
3082 __ jcc(Assembler::equal, L_doLast); |
3099 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); |
3083 // only in 256 bit keys |
3100 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); |
3084 aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask); |
3101 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); |
3085 aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask); |
3102 |
|
3103 __ aesdec(xmm_result, xmm_temp1); |
|
3104 __ aesdec(xmm_result, xmm_temp2); |
|
3105 __ aesdec(xmm_result, xmm_temp3); |
|
3106 __ aesdec(xmm_result, xmm_temp4); |
|
3107 |
|
3108 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); |
|
3109 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); |
|
3110 load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask); |
|
3111 |
|
3112 __ cmpl(keylen, 44); |
|
3113 __ jccb(Assembler::equal, L_doLast); |
|
3114 |
|
3115 __ aesdec(xmm_result, xmm_temp1); |
|
3116 __ aesdec(xmm_result, xmm_temp2); |
|
3117 |
|
3118 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); |
|
3119 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); |
|
3120 |
|
3121 __ cmpl(keylen, 52); |
|
3122 __ jccb(Assembler::equal, L_doLast); |
|
3123 |
|
3124 __ aesdec(xmm_result, xmm_temp1); |
|
3125 __ aesdec(xmm_result, xmm_temp2); |
|
3126 |
|
3127 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); |
|
3128 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); |
3086 |
3129 |
3087 __ BIND(L_doLast); |
3130 __ BIND(L_doLast); |
|
3131 __ aesdec(xmm_result, xmm_temp1); |
|
3132 __ aesdec(xmm_result, xmm_temp2); |
|
3133 |
3088 // for decryption the aesdeclast operation is always on key+0x00 |
3134 // for decryption the aesdeclast operation is always on key+0x00 |
3089 load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask); |
3135 __ aesdeclast(xmm_result, xmm_temp3); |
3090 __ aesdeclast(xmm_result, xmm_temp); |
|
3091 |
|
3092 __ movdqu(Address(to, 0), xmm_result); // store the result |
3136 __ movdqu(Address(to, 0), xmm_result); // store the result |
3093 |
|
3094 __ xorptr(rax, rax); // return 0 |
3137 __ xorptr(rax, rax); // return 0 |
3095 __ leave(); // required for proper stackwalking of RuntimeStub frame |
3138 __ leave(); // required for proper stackwalking of RuntimeStub frame |
3096 __ ret(0); |
3139 __ ret(0); |
3097 |
3140 |
3098 return start; |
3141 return start; |
3131 // xmm register assignments for the loops below |
3174 // xmm register assignments for the loops below |
3132 const XMMRegister xmm_result = xmm0; |
3175 const XMMRegister xmm_result = xmm0; |
3133 const XMMRegister xmm_temp = xmm1; |
3176 const XMMRegister xmm_temp = xmm1; |
3134 // keys 0-10 preloaded into xmm2-xmm12 |
3177 // keys 0-10 preloaded into xmm2-xmm12 |
3135 const int XMM_REG_NUM_KEY_FIRST = 2; |
3178 const int XMM_REG_NUM_KEY_FIRST = 2; |
3136 const int XMM_REG_NUM_KEY_LAST = 12; |
3179 const int XMM_REG_NUM_KEY_LAST = 15; |
3137 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); |
3180 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); |
3138 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST); |
3181 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10); |
|
3182 const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11); |
|
3183 const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12); |
|
3184 const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13); |
3139 |
3185 |
3140 __ enter(); // required for proper stackwalking of RuntimeStub frame |
3186 __ enter(); // required for proper stackwalking of RuntimeStub frame |
3141 |
3187 |
3142 #ifdef _WIN64 |
3188 #ifdef _WIN64 |
3143 // on win64, fill len_reg from stack position |
3189 // on win64, fill len_reg from stack position |
3144 __ movl(len_reg, len_mem); |
3190 __ movl(len_reg, len_mem); |
3145 // save the xmm registers which must be preserved 6-12 |
3191 // save the xmm registers which must be preserved 6-15 |
3146 __ subptr(rsp, -rsp_after_call_off * wordSize); |
3192 __ subptr(rsp, -rsp_after_call_off * wordSize); |
3147 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { |
3193 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { |
3148 __ movdqu(xmm_save(i), as_XMMRegister(i)); |
3194 __ movdqu(xmm_save(i), as_XMMRegister(i)); |
3149 } |
3195 } |
3150 #endif |
3196 #endif |
3151 |
3197 |
3152 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front |
3198 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front |
3153 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3199 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3154 // load up xmm regs 2 thru 12 with key 0x00 - 0xa0 |
3200 // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0 |
3155 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { |
3201 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) { |
3156 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); |
3202 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); |
3157 offset += 0x10; |
3203 offset += 0x10; |
3158 } |
3204 } |
3159 |
|
3160 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec |
3205 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec |
3161 |
3206 |
3162 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) |
3207 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) |
3163 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
3208 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
3164 __ cmpl(rax, 44); |
3209 __ cmpl(rax, 44); |
3165 __ jcc(Assembler::notEqual, L_key_192_256); |
3210 __ jcc(Assembler::notEqual, L_key_192_256); |
3166 |
3211 |
3167 // 128 bit code follows here |
3212 // 128 bit code follows here |
3168 __ movptr(pos, 0); |
3213 __ movptr(pos, 0); |
3169 __ align(OptoLoopAlignment); |
3214 __ align(OptoLoopAlignment); |
|
3215 |
3170 __ BIND(L_loopTop_128); |
3216 __ BIND(L_loopTop_128); |
3171 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input |
3217 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input |
3172 __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
3218 __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
3173 |
|
3174 __ pxor (xmm_result, xmm_key0); // do the aes rounds |
3219 __ pxor (xmm_result, xmm_key0); // do the aes rounds |
3175 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { |
3220 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) { |
3176 __ aesenc(xmm_result, as_XMMRegister(rnum)); |
3221 __ aesenc(xmm_result, as_XMMRegister(rnum)); |
3177 } |
3222 } |
3178 __ aesenclast(xmm_result, xmm_key10); |
3223 __ aesenclast(xmm_result, xmm_key10); |
3179 |
|
3180 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3224 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3181 // no need to store r to memory until we exit |
3225 // no need to store r to memory until we exit |
3182 __ addptr(pos, AESBlockSize); |
3226 __ addptr(pos, AESBlockSize); |
3183 __ subptr(len_reg, AESBlockSize); |
3227 __ subptr(len_reg, AESBlockSize); |
3184 __ jcc(Assembler::notEqual, L_loopTop_128); |
3228 __ jcc(Assembler::notEqual, L_loopTop_128); |
3196 __ leave(); // required for proper stackwalking of RuntimeStub frame |
3240 __ leave(); // required for proper stackwalking of RuntimeStub frame |
3197 __ ret(0); |
3241 __ ret(0); |
3198 |
3242 |
3199 __ BIND(L_key_192_256); |
3243 __ BIND(L_key_192_256); |
3200 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) |
3244 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) |
|
3245 load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask); |
|
3246 load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask); |
3201 __ cmpl(rax, 52); |
3247 __ cmpl(rax, 52); |
3202 __ jcc(Assembler::notEqual, L_key_256); |
3248 __ jcc(Assembler::notEqual, L_key_256); |
3203 |
3249 |
3204 // 192-bit code follows here (could be changed to use more xmm registers) |
3250 // 192-bit code follows here (could be changed to use more xmm registers) |
3205 __ movptr(pos, 0); |
3251 __ movptr(pos, 0); |
3206 __ align(OptoLoopAlignment); |
3252 __ align(OptoLoopAlignment); |
|
3253 |
3207 __ BIND(L_loopTop_192); |
3254 __ BIND(L_loopTop_192); |
3208 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input |
3255 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input |
3209 __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
3256 __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
3210 |
|
3211 __ pxor (xmm_result, xmm_key0); // do the aes rounds |
3257 __ pxor (xmm_result, xmm_key0); // do the aes rounds |
3212 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { |
3258 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) { |
3213 __ aesenc(xmm_result, as_XMMRegister(rnum)); |
3259 __ aesenc(xmm_result, as_XMMRegister(rnum)); |
3214 } |
3260 } |
3215 aes_enc_key(xmm_result, xmm_temp, key, 0xb0); |
3261 __ aesenclast(xmm_result, xmm_key12); |
3216 load_key(xmm_temp, key, 0xc0); |
|
3217 __ aesenclast(xmm_result, xmm_temp); |
|
3218 |
|
3219 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3262 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3220 // no need to store r to memory until we exit |
3263 // no need to store r to memory until we exit |
3221 __ addptr(pos, AESBlockSize); |
3264 __ addptr(pos, AESBlockSize); |
3222 __ subptr(len_reg, AESBlockSize); |
3265 __ subptr(len_reg, AESBlockSize); |
3223 __ jcc(Assembler::notEqual, L_loopTop_192); |
3266 __ jcc(Assembler::notEqual, L_loopTop_192); |
3224 __ jmp(L_exit); |
3267 __ jmp(L_exit); |
3225 |
3268 |
3226 __ BIND(L_key_256); |
3269 __ BIND(L_key_256); |
3227 // 256-bit code follows here (could be changed to use more xmm registers) |
3270 // 256-bit code follows here (could be changed to use more xmm registers) |
|
3271 load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask); |
3228 __ movptr(pos, 0); |
3272 __ movptr(pos, 0); |
3229 __ align(OptoLoopAlignment); |
3273 __ align(OptoLoopAlignment); |
|
3274 |
3230 __ BIND(L_loopTop_256); |
3275 __ BIND(L_loopTop_256); |
3231 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input |
3276 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input |
3232 __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
3277 __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
3233 |
|
3234 __ pxor (xmm_result, xmm_key0); // do the aes rounds |
3278 __ pxor (xmm_result, xmm_key0); // do the aes rounds |
3235 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { |
3279 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) { |
3236 __ aesenc(xmm_result, as_XMMRegister(rnum)); |
3280 __ aesenc(xmm_result, as_XMMRegister(rnum)); |
3237 } |
3281 } |
3238 aes_enc_key(xmm_result, xmm_temp, key, 0xb0); |
|
3239 aes_enc_key(xmm_result, xmm_temp, key, 0xc0); |
|
3240 aes_enc_key(xmm_result, xmm_temp, key, 0xd0); |
|
3241 load_key(xmm_temp, key, 0xe0); |
3282 load_key(xmm_temp, key, 0xe0); |
3242 __ aesenclast(xmm_result, xmm_temp); |
3283 __ aesenclast(xmm_result, xmm_temp); |
3243 |
|
3244 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3284 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3245 // no need to store r to memory until we exit |
3285 // no need to store r to memory until we exit |
3246 __ addptr(pos, AESBlockSize); |
3286 __ addptr(pos, AESBlockSize); |
3247 __ subptr(len_reg, AESBlockSize); |
3287 __ subptr(len_reg, AESBlockSize); |
3248 __ jcc(Assembler::notEqual, L_loopTop_256); |
3288 __ jcc(Assembler::notEqual, L_loopTop_256); |
3310 // the java expanded key ordering is rotated one position from what we want |
3348 // the java expanded key ordering is rotated one position from what we want |
3311 // so we start from 0x10 here and hit 0x00 last |
3349 // so we start from 0x10 here and hit 0x00 last |
3312 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front |
3350 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front |
3313 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3351 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
3314 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 |
3352 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 |
3315 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { |
3353 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) { |
3316 if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00; |
|
3317 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); |
3354 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); |
3318 offset += 0x10; |
3355 offset += 0x10; |
3319 } |
3356 } |
|
3357 load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask); |
3320 |
3358 |
3321 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block |
3359 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block |
|
3360 |
3322 // registers holding the four results in the parallelized loop |
3361 // registers holding the four results in the parallelized loop |
3323 const XMMRegister xmm_result0 = xmm0; |
3362 const XMMRegister xmm_result0 = xmm0; |
3324 const XMMRegister xmm_result1 = xmm2; |
3363 const XMMRegister xmm_result1 = xmm2; |
3325 const XMMRegister xmm_result2 = xmm3; |
3364 const XMMRegister xmm_result2 = xmm3; |
3326 const XMMRegister xmm_result3 = xmm4; |
3365 const XMMRegister xmm_result3 = xmm4; |
3413 __ ret(0); |
3456 __ ret(0); |
3414 |
3457 |
3415 |
3458 |
3416 __ BIND(L_key_192_256); |
3459 __ BIND(L_key_192_256); |
3417 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) |
3460 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) |
|
3461 load_key(xmm_key11, key, 0xb0); |
3418 __ cmpl(rax, 52); |
3462 __ cmpl(rax, 52); |
3419 __ jcc(Assembler::notEqual, L_key_256); |
3463 __ jcc(Assembler::notEqual, L_key_256); |
3420 |
3464 |
3421 // 192-bit code follows here (could be optimized to use parallelism) |
3465 // 192-bit code follows here (could be optimized to use parallelism) |
|
3466 load_key(xmm_key12, key, 0xc0); // 192-bit key goes up to c0 |
3422 __ movptr(pos, 0); |
3467 __ movptr(pos, 0); |
3423 __ align(OptoLoopAlignment); |
3468 __ align(OptoLoopAlignment); |
|
3469 |
3424 __ BIND(L_singleBlock_loopTop_192); |
3470 __ BIND(L_singleBlock_loopTop_192); |
3425 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input |
3471 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input |
3426 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector |
3472 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector |
3427 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds |
3473 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds |
3428 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { |
3474 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { |
3429 __ aesdec(xmm_result, as_XMMRegister(rnum)); |
3475 __ aesdec(xmm_result, as_XMMRegister(rnum)); |
3430 } |
3476 } |
3431 aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 192-bit key goes up to c0 |
3477 __ aesdec(xmm_result, xmm_key11); |
3432 aes_dec_key(xmm_result, xmm_temp, key, 0xc0); |
3478 __ aesdec(xmm_result, xmm_key12); |
3433 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 |
3479 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 |
3434 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector |
3480 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector |
3435 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3481 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3436 // no need to store r to memory until we exit |
3482 // no need to store r to memory until we exit |
3437 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block |
3483 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block |
3438 |
|
3439 __ addptr(pos, AESBlockSize); |
3484 __ addptr(pos, AESBlockSize); |
3440 __ subptr(len_reg, AESBlockSize); |
3485 __ subptr(len_reg, AESBlockSize); |
3441 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); |
3486 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); |
3442 __ jmp(L_exit); |
3487 __ jmp(L_exit); |
3443 |
3488 |
3444 __ BIND(L_key_256); |
3489 __ BIND(L_key_256); |
3445 // 256-bit code follows here (could be optimized to use parallelism) |
3490 // 256-bit code follows here (could be optimized to use parallelism) |
3446 __ movptr(pos, 0); |
3491 __ movptr(pos, 0); |
3447 __ align(OptoLoopAlignment); |
3492 __ align(OptoLoopAlignment); |
|
3493 |
3448 __ BIND(L_singleBlock_loopTop_256); |
3494 __ BIND(L_singleBlock_loopTop_256); |
3449 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input |
3495 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input |
3450 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector |
3496 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector |
3451 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds |
3497 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds |
3452 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { |
3498 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { |
3453 __ aesdec(xmm_result, as_XMMRegister(rnum)); |
3499 __ aesdec(xmm_result, as_XMMRegister(rnum)); |
3454 } |
3500 } |
3455 aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 256-bit key goes up to e0 |
3501 __ aesdec(xmm_result, xmm_key11); |
3456 aes_dec_key(xmm_result, xmm_temp, key, 0xc0); |
3502 load_key(xmm_temp, key, 0xc0); |
3457 aes_dec_key(xmm_result, xmm_temp, key, 0xd0); |
3503 __ aesdec(xmm_result, xmm_temp); |
3458 aes_dec_key(xmm_result, xmm_temp, key, 0xe0); |
3504 load_key(xmm_temp, key, 0xd0); |
3459 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 |
3505 __ aesdec(xmm_result, xmm_temp); |
|
3506 load_key(xmm_temp, key, 0xe0); // 256-bit key goes up to e0 |
|
3507 __ aesdec(xmm_result, xmm_temp); |
|
3508 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 |
3460 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector |
3509 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector |
3461 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3510 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
3462 // no need to store r to memory until we exit |
3511 // no need to store r to memory until we exit |
3463 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block |
3512 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block |
3464 |
|
3465 __ addptr(pos, AESBlockSize); |
3513 __ addptr(pos, AESBlockSize); |
3466 __ subptr(len_reg, AESBlockSize); |
3514 __ subptr(len_reg, AESBlockSize); |
3467 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); |
3515 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); |
3468 __ jmp(L_exit); |
3516 __ jmp(L_exit); |
3469 |
3517 |