8035936: SIGBUS in StubRoutines::aesencryptBlock, solaris-sparc

Wed, 30 Apr 2014 14:14:01 -0700

author
kvn
date
Wed, 30 Apr 2014 14:14:01 -0700
changeset 6653
03214612e77e
parent 6652
85d6efcb1fa3
child 6654
f48e481abef0

8035936: SIGBUS in StubRoutines::aesencryptBlock, solaris-sparc
Summary: Fix the arbitrary alignment issue in SPARC AES crypto stub routines.
Reviewed-by: kvn, iveresov
Contributed-by: shrinivas.joshi@oracle.com

src/cpu/sparc/vm/assembler_sparc.hpp file | annotate | diff | comparison | revisions
src/cpu/sparc/vm/stubGenerator_sparc.cpp file | annotate | diff | comparison | revisions
src/cpu/sparc/vm/stubRoutines_sparc.hpp file | annotate | diff | comparison | revisions
src/cpu/sparc/vm/vm_version_sparc.cpp file | annotate | diff | comparison | revisions
src/share/vm/classfile/vmSymbols.hpp file | annotate | diff | comparison | revisions
src/share/vm/opto/runtime.cpp file | annotate | diff | comparison | revisions
test/compiler/7184394/TestAESBase.java file | annotate | diff | comparison | revisions
test/compiler/7184394/TestAESDecode.java file | annotate | diff | comparison | revisions
test/compiler/7184394/TestAESEncode.java file | annotate | diff | comparison | revisions
test/compiler/7184394/TestAESMain.java file | annotate | diff | comparison | revisions
     1.1 --- a/src/cpu/sparc/vm/assembler_sparc.hpp	Thu May 01 15:02:46 2014 -0700
     1.2 +++ b/src/cpu/sparc/vm/assembler_sparc.hpp	Wed Apr 30 14:14:01 2014 -0700
     1.3 @@ -1,5 +1,5 @@
     1.4  /*
     1.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     1.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
     1.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     1.8   *
     1.9   * This code is free software; you can redistribute it and/or modify it
    1.10 @@ -123,8 +123,13 @@
    1.11      fpop2_op3    = 0x35,
    1.12      impdep1_op3  = 0x36,
    1.13      aes3_op3     = 0x36,
    1.14 +    alignaddr_op3  = 0x36,
    1.15 +    faligndata_op3 = 0x36,
    1.16      flog3_op3    = 0x36,
    1.17 +    edge_op3     = 0x36,
    1.18 +    fsrc_op3     = 0x36,
    1.19      impdep2_op3  = 0x37,
    1.20 +    stpartialf_op3 = 0x37,
    1.21      jmpl_op3     = 0x38,
    1.22      rett_op3     = 0x39,
    1.23      trap_op3     = 0x3a,
    1.24 @@ -175,17 +180,23 @@
    1.25  
    1.26    enum opfs {
    1.27      // selected opfs
    1.28 +    edge8n_opf         = 0x01,
    1.29 +
    1.30      fmovs_opf          = 0x01,
    1.31      fmovd_opf          = 0x02,
    1.32  
    1.33      fnegs_opf          = 0x05,
    1.34      fnegd_opf          = 0x06,
    1.35  
    1.36 +    alignaddr_opf      = 0x18,
    1.37 +
    1.38      fadds_opf          = 0x41,
    1.39      faddd_opf          = 0x42,
    1.40      fsubs_opf          = 0x45,
    1.41      fsubd_opf          = 0x46,
    1.42  
    1.43 +    faligndata_opf     = 0x48,
    1.44 +
    1.45      fmuls_opf          = 0x49,
    1.46      fmuld_opf          = 0x4a,
    1.47      fdivs_opf          = 0x4d,
    1.48 @@ -348,6 +359,8 @@
    1.49      ASI_PRIMARY            = 0x80,
    1.50      ASI_PRIMARY_NOFAULT    = 0x82,
    1.51      ASI_PRIMARY_LITTLE     = 0x88,
    1.52 +    // 8x8-bit partial store
    1.53 +    ASI_PST8_PRIMARY       = 0xC0,
    1.54      // Block initializing store
    1.55      ASI_ST_BLKINIT_PRIMARY = 0xE2,
    1.56      // Most-Recently-Used (MRU) BIS variant
    1.57 @@ -585,6 +598,9 @@
    1.58    // instruction only in VIS1
    1.59    static void vis1_only() { assert( VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); }
    1.60  
    1.61 +  // instruction only in VIS2
    1.62 +  static void vis2_only() { assert( VM_Version::has_vis2(), "This instruction only works on SPARC with VIS2"); }
    1.63 +
    1.64    // instruction only in VIS3
    1.65    static void vis3_only() { assert( VM_Version::has_vis3(), "This instruction only works on SPARC with VIS3"); }
    1.66  
    1.67 @@ -1164,6 +1180,20 @@
    1.68    inline void wrfprs( Register d) { v9_only(); emit_int32( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); }
    1.69  
    1.70  
    1.71 +  //  VIS1 instructions
    1.72 +
    1.73 +  void alignaddr( Register s1, Register s2, Register d ) { vis1_only(); emit_int32( op(arith_op) | rd(d) | op3(alignaddr_op3) | rs1(s1) | opf(alignaddr_opf) | rs2(s2)); }
    1.74 +
    1.75 +  void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); }
    1.76 +
    1.77 +  void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); }
    1.78 +
    1.79 +  void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); }
    1.80 +
    1.81 +  //  VIS2 instructions
    1.82 +
    1.83 +  void edge8n( Register s1, Register s2, Register d ) { vis2_only(); emit_int32( op(arith_op) | rd(d) | op3(edge_op3) | rs1(s1) | opf(edge8n_opf) | rs2(s2)); }
    1.84 +
    1.85    // VIS3 instructions
    1.86  
    1.87    void movstosw( FloatRegister s, Register d ) { vis3_only();  emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstosw_opf) | fs2(s, FloatRegisterImpl::S)); }
     2.1 --- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Thu May 01 15:02:46 2014 -0700
     2.2 +++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Wed Apr 30 14:14:01 2014 -0700
     2.3 @@ -1,5 +1,5 @@
     2.4  /*
     2.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     2.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
     2.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     2.8   *
     2.9   * This code is free software; you can redistribute it and/or modify it
    2.10 @@ -3305,9 +3305,12 @@
    2.11    }
    2.12  
    2.13    address generate_aescrypt_encryptBlock() {
    2.14 +    // required since we read expanded key 'int' array starting first element without alignment considerations
    2.15 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
    2.16 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
    2.17      __ align(CodeEntryAlignment);
    2.18 -    StubCodeMark mark(this, "StubRoutines", "aesencryptBlock");
    2.19 -    Label L_doLast128bit, L_storeOutput;
    2.20 +    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    2.21 +    Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
    2.22      address start = __ pc();
    2.23      Register from = O0; // source byte array
    2.24      Register to = O1;   // destination byte array
    2.25 @@ -3317,15 +3320,33 @@
    2.26      // read expanded key length
    2.27      __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
    2.28  
    2.29 -    // load input into F54-F56; F30-F31 used as temp
    2.30 -    __ ldf(FloatRegisterImpl::S, from, 0, F30);
    2.31 -    __ ldf(FloatRegisterImpl::S, from, 4, F31);
    2.32 -    __ fmov(FloatRegisterImpl::D, F30, F54);
    2.33 -    __ ldf(FloatRegisterImpl::S, from, 8, F30);
    2.34 -    __ ldf(FloatRegisterImpl::S, from, 12, F31);
    2.35 -    __ fmov(FloatRegisterImpl::D, F30, F56);
    2.36 -
    2.37 -    // load expanded key
    2.38 +    // Method to address arbitrary alignment for load instructions:
    2.39 +    // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
    2.40 +    // If zero/aligned then continue with double FP load instructions
    2.41 +    // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
    2.42 +    // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
    2.43 +    // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
    2.44 +    // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
    2.45 +
    2.46 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
    2.47 +    __ andcc(from, 7, G0);
    2.48 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
    2.49 +    __ delayed()->alignaddr(from, G0, from);
    2.50 +
    2.51 +    // aligned case: load input into F54-F56
    2.52 +    __ ldf(FloatRegisterImpl::D, from, 0, F54);
    2.53 +    __ ldf(FloatRegisterImpl::D, from, 8, F56);
    2.54 +    __ ba_short(L_load_expanded_key);
    2.55 +
    2.56 +    __ BIND(L_load_misaligned_input);
    2.57 +    __ ldf(FloatRegisterImpl::D, from, 0, F54);
    2.58 +    __ ldf(FloatRegisterImpl::D, from, 8, F56);
    2.59 +    __ ldf(FloatRegisterImpl::D, from, 16, F58);
    2.60 +    __ faligndata(F54, F56, F54);
    2.61 +    __ faligndata(F56, F58, F56);
    2.62 +
    2.63 +    __ BIND(L_load_expanded_key);
    2.64 +    // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed
    2.65      for ( int i = 0;  i <= 38; i += 2 ) {
    2.66        __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
    2.67      }
    2.68 @@ -3365,8 +3386,7 @@
    2.69      __ ldf(FloatRegisterImpl::D, key, 232, F50);
    2.70      __ aes_eround01(F52, F54, F56, F58); //round 13
    2.71      __ aes_eround23(F46, F54, F56, F60);
    2.72 -    __ br(Assembler::always, false, Assembler::pt, L_storeOutput);
    2.73 -    __ delayed()->nop();
    2.74 +    __ ba_short(L_storeOutput);
    2.75  
    2.76      __ BIND(L_doLast128bit);
    2.77      __ ldf(FloatRegisterImpl::D, key, 160, F48);
    2.78 @@ -3377,23 +3397,62 @@
    2.79      __ aes_eround01_l(F48, F58, F60, F54); //last round
    2.80      __ aes_eround23_l(F50, F58, F60, F56);
    2.81  
    2.82 -    // store output into the destination array, F0-F1 used as temp
    2.83 -    __ fmov(FloatRegisterImpl::D, F54, F0);
    2.84 -    __ stf(FloatRegisterImpl::S, F0, to, 0);
    2.85 -    __ stf(FloatRegisterImpl::S, F1, to, 4);
    2.86 -    __ fmov(FloatRegisterImpl::D, F56, F0);
    2.87 -    __ stf(FloatRegisterImpl::S, F0, to, 8);
    2.88 +    // Method to address arbitrary alignment for store instructions:
    2.89 +    // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
    2.90 +    // If zero/aligned then continue with double FP store instructions
    2.91 +    // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
    2.92 +    // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
    2.93 +    // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
    2.94 +    // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
    2.95 +    // Set GSR.align to (8-n) using alignaddr
    2.96 +    // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
    2.97 +    // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
    2.98 +    // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
    2.99 +    // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
   2.100 +    // We need to execute this process for both the 8-byte result values
   2.101 +
   2.102 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   2.103 +    __ andcc(to, 7, O5);
   2.104 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
   2.105 +    __ delayed()->edge8n(to, G0, O3);
   2.106 +
   2.107 +    // aligned case: store output into the destination array
   2.108 +    __ stf(FloatRegisterImpl::D, F54, to, 0);
   2.109      __ retl();
   2.110 -    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
   2.111 +    __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
   2.112 +
   2.113 +    __ BIND(L_store_misaligned_output);
   2.114 +    __ add(to, 8, O4);
   2.115 +    __ mov(8, O2);
   2.116 +    __ sub(O2, O5, O2);
   2.117 +    __ alignaddr(O2, G0, O2);
   2.118 +    __ faligndata(F54, F54, F54);
   2.119 +    __ faligndata(F56, F56, F56);
   2.120 +    __ and3(to, -8, to);
   2.121 +    __ and3(O4, -8, O4);
   2.122 +    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
   2.123 +    __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
   2.124 +    __ add(to, 8, to);
   2.125 +    __ add(O4, 8, O4);
   2.126 +    __ orn(G0, O3, O3);
   2.127 +    __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
   2.128 +    __ retl();
   2.129 +    __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
   2.130  
   2.131      return start;
   2.132    }
   2.133  
   2.134    address generate_aescrypt_decryptBlock() {
   2.135 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
   2.136 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
   2.137 +    // required since we read original key 'byte' array as well in the decryption stubs
   2.138 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
   2.139 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
   2.140      __ align(CodeEntryAlignment);
   2.141 -    StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock");
   2.142 +    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
   2.143      address start = __ pc();
   2.144 -    Label L_expand192bit, L_expand256bit, L_common_transform;
   2.145 +    Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
   2.146 +    Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
   2.147      Register from = O0; // source byte array
   2.148      Register to = O1;   // destination byte array
   2.149      Register key = O2;  // expanded key array
   2.150 @@ -3403,15 +3462,29 @@
   2.151      // read expanded key array length
   2.152      __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
   2.153  
   2.154 -    // load input into F52-F54; F30,F31 used as temp
   2.155 -    __ ldf(FloatRegisterImpl::S, from, 0, F30);
   2.156 -    __ ldf(FloatRegisterImpl::S, from, 4, F31);
   2.157 -    __ fmov(FloatRegisterImpl::D, F30, F52);
   2.158 -    __ ldf(FloatRegisterImpl::S, from, 8, F30);
   2.159 -    __ ldf(FloatRegisterImpl::S, from, 12, F31);
   2.160 -    __ fmov(FloatRegisterImpl::D, F30, F54);
   2.161 -
   2.162 +    // save 'from' since we may need to recheck alignment in case of 256-bit decryption
   2.163 +    __ mov(from, G1);
   2.164 +
   2.165 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   2.166 +    __ andcc(from, 7, G0);
   2.167 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
   2.168 +    __ delayed()->alignaddr(from, G0, from);
   2.169 +
   2.170 +    // aligned case: load input into F52-F54
   2.171 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
   2.172 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
   2.173 +    __ ba_short(L_load_original_key);
   2.174 +
   2.175 +    __ BIND(L_load_misaligned_input);
   2.176 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
   2.177 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
   2.178 +    __ ldf(FloatRegisterImpl::D, from, 16, F56);
   2.179 +    __ faligndata(F52, F54, F52);
   2.180 +    __ faligndata(F54, F56, F54);
   2.181 +
   2.182 +    __ BIND(L_load_original_key);
   2.183      // load original key from SunJCE expanded decryption key
   2.184 +    // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
   2.185      for ( int i = 0;  i <= 3; i++ ) {
   2.186        __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
   2.187      }
   2.188 @@ -3432,8 +3505,7 @@
   2.189      // perform 128-bit key specific inverse cipher transformation
   2.190      __ fxor(FloatRegisterImpl::D, F42, F54, F54);
   2.191      __ fxor(FloatRegisterImpl::D, F40, F52, F52);
   2.192 -    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
   2.193 -    __ delayed()->nop();
   2.194 +    __ ba_short(L_common_transform);
   2.195  
   2.196      __ BIND(L_expand192bit);
   2.197  
   2.198 @@ -3457,8 +3529,7 @@
   2.199      __ aes_dround01(F44, F52, F54, F56);
   2.200      __ aes_dround23(F42, F56, F58, F54);
   2.201      __ aes_dround01(F40, F56, F58, F52);
   2.202 -    __ br(Assembler::always, false, Assembler::pt, L_common_transform);
   2.203 -    __ delayed()->nop();
   2.204 +    __ ba_short(L_common_transform);
   2.205  
   2.206      __ BIND(L_expand256bit);
   2.207  
   2.208 @@ -3478,14 +3549,31 @@
   2.209      __ aes_kexpand2(F50, F56, F58);
   2.210  
   2.211      for ( int i = 0;  i <= 6; i += 2 ) {
   2.212 -      __ fmov(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
   2.213 +      __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
   2.214      }
   2.215  
   2.216 -    // load input into F52-F54
   2.217 +    // reload original 'from' address
   2.218 +    __ mov(G1, from);
   2.219 +
   2.220 +    // re-check 8-byte alignment
   2.221 +    __ andcc(from, 7, G0);
   2.222 +    __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
   2.223 +    __ delayed()->alignaddr(from, G0, from);
   2.224 +
   2.225 +    // aligned case: load input into F52-F54
   2.226      __ ldf(FloatRegisterImpl::D, from, 0, F52);
   2.227      __ ldf(FloatRegisterImpl::D, from, 8, F54);
   2.228 +    __ ba_short(L_256bit_transform);
   2.229 +
   2.230 +    __ BIND(L_reload_misaligned_input);
   2.231 +    __ ldf(FloatRegisterImpl::D, from, 0, F52);
   2.232 +    __ ldf(FloatRegisterImpl::D, from, 8, F54);
   2.233 +    __ ldf(FloatRegisterImpl::D, from, 16, F56);
   2.234 +    __ faligndata(F52, F54, F52);
   2.235 +    __ faligndata(F54, F56, F54);
   2.236  
   2.237      // perform 256-bit key specific inverse cipher transformation
   2.238 +    __ BIND(L_256bit_transform);
   2.239      __ fxor(FloatRegisterImpl::D, F0, F54, F54);
   2.240      __ fxor(FloatRegisterImpl::D, F2, F52, F52);
   2.241      __ aes_dround23(F4, F52, F54, F58);
   2.242 @@ -3515,43 +3603,71 @@
   2.243        }
   2.244      }
   2.245  
   2.246 -    // store output to destination array, F0-F1 used as temp
   2.247 -    __ fmov(FloatRegisterImpl::D, F52, F0);
   2.248 -    __ stf(FloatRegisterImpl::S, F0, to, 0);
   2.249 -    __ stf(FloatRegisterImpl::S, F1, to, 4);
   2.250 -    __ fmov(FloatRegisterImpl::D, F54, F0);
   2.251 -    __ stf(FloatRegisterImpl::S, F0, to, 8);
   2.252 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   2.253 +    __ andcc(to, 7, O5);
   2.254 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
   2.255 +    __ delayed()->edge8n(to, G0, O3);
   2.256 +
   2.257 +    // aligned case: store output into the destination array
   2.258 +    __ stf(FloatRegisterImpl::D, F52, to, 0);
   2.259      __ retl();
   2.260 -    __ delayed()->stf(FloatRegisterImpl::S, F1, to, 12);
   2.261 +    __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
   2.262 +
   2.263 +    __ BIND(L_store_misaligned_output);
   2.264 +    __ add(to, 8, O4);
   2.265 +    __ mov(8, O2);
   2.266 +    __ sub(O2, O5, O2);
   2.267 +    __ alignaddr(O2, G0, O2);
   2.268 +    __ faligndata(F52, F52, F52);
   2.269 +    __ faligndata(F54, F54, F54);
   2.270 +    __ and3(to, -8, to);
   2.271 +    __ and3(O4, -8, O4);
   2.272 +    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
   2.273 +    __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
   2.274 +    __ add(to, 8, to);
   2.275 +    __ add(O4, 8, O4);
   2.276 +    __ orn(G0, O3, O3);
   2.277 +    __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
   2.278 +    __ retl();
   2.279 +    __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
   2.280  
   2.281      return start;
   2.282    }
   2.283  
   2.284    address generate_cipherBlockChaining_encryptAESCrypt() {
   2.285 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
   2.286 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
   2.287 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
   2.288 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
   2.289      __ align(CodeEntryAlignment);
   2.290      StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
   2.291 -    Label L_cbcenc128, L_cbcenc192, L_cbcenc256;
   2.292 +    Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
   2.293 +    Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
   2.294 +    Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
   2.295 +    Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
   2.296      address start = __ pc();
   2.297 -    Register from = O0; // source byte array
   2.298 -    Register to = O1;   // destination byte array
   2.299 -    Register key = O2;  // expanded key array
   2.300 -    Register rvec = O3; // init vector
   2.301 -    const Register len_reg = O4; // cipher length
   2.302 -    const Register keylen = O5;  // reg for storing expanded key array length
   2.303 -
   2.304 -    // save cipher len to return in the end
   2.305 -    __ mov(len_reg, L1);
   2.306 +    Register from = I0; // source byte array
   2.307 +    Register to = I1;   // destination byte array
   2.308 +    Register key = I2;  // expanded key array
   2.309 +    Register rvec = I3; // init vector
   2.310 +    const Register len_reg = I4; // cipher length
   2.311 +    const Register keylen = I5;  // reg for storing expanded key array length
   2.312 +
   2.313 +    // save cipher len before save_frame, to return in the end
   2.314 +    __ mov(O4, L0);
   2.315 +    __ save_frame(0);
   2.316  
   2.317      // read expanded key length
   2.318      __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
   2.319  
   2.320 -    // load init vector
   2.321 +    // load initial vector, 8-byte alignment is guaranteed
   2.322      __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
   2.323      __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
   2.324 +    // load key, 8-byte alignment is guaranteed
   2.325      __ ldx(key,0,G1);
   2.326 -    __ ldx(key,8,G2);
   2.327 -
   2.328 -    // start loading expanded key
   2.329 +    __ ldx(key,8,G5);
   2.330 +
   2.331 +    // start loading expanded key, 8-byte alignment is guaranteed
   2.332      for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
   2.333        __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
   2.334      }
   2.335 @@ -3571,15 +3687,35 @@
   2.336      }
   2.337  
   2.338      // 256-bit original key size
   2.339 -    __ br(Assembler::always, false, Assembler::pt, L_cbcenc256);
   2.340 -    __ delayed()->nop();
   2.341 +    __ ba_short(L_cbcenc256);
   2.342  
   2.343      __ align(OptoLoopAlignment);
   2.344      __ BIND(L_cbcenc128);
   2.345 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   2.346 +    __ andcc(from, 7, G0);
   2.347 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
   2.348 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
   2.349 +
   2.350 +    // aligned case: load input into G3 and G4
   2.351      __ ldx(from,0,G3);
   2.352      __ ldx(from,8,G4);
   2.353 +    __ ba_short(L_128bit_transform);
   2.354 +
   2.355 +    __ BIND(L_load_misaligned_input_128bit);
   2.356 +    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
   2.357 +    __ alignaddr(from, G0, from);
   2.358 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
   2.359 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
   2.360 +    __ ldf(FloatRegisterImpl::D, from, 16, F52);
   2.361 +    __ faligndata(F48, F50, F48);
   2.362 +    __ faligndata(F50, F52, F50);
   2.363 +    __ movdtox(F48, G3);
   2.364 +    __ movdtox(F50, G4);
   2.365 +    __ mov(L1, from);
   2.366 +
   2.367 +    __ BIND(L_128bit_transform);
   2.368      __ xor3(G1,G3,G3);
   2.369 -    __ xor3(G2,G4,G4);
   2.370 +    __ xor3(G5,G4,G4);
   2.371      __ movxtod(G3,F56);
   2.372      __ movxtod(G4,F58);
   2.373      __ fxor(FloatRegisterImpl::D, F60, F56, F60);
   2.374 @@ -3598,24 +3734,81 @@
   2.375        }
   2.376      }
   2.377  
   2.378 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   2.379 +    __ andcc(to, 7, L1);
   2.380 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
   2.381 +    __ delayed()->edge8n(to, G0, L2);
   2.382 +
   2.383 +    // aligned case: store output into the destination array
   2.384      __ stf(FloatRegisterImpl::D, F60, to, 0);
   2.385      __ stf(FloatRegisterImpl::D, F62, to, 8);
   2.386 +    __ ba_short(L_check_loop_end_128bit);
   2.387 +
   2.388 +    __ BIND(L_store_misaligned_output_128bit);
   2.389 +    __ add(to, 8, L3);
   2.390 +    __ mov(8, L4);
   2.391 +    __ sub(L4, L1, L4);
   2.392 +    __ alignaddr(L4, G0, L4);
   2.393 +    // save cipher text before circular right shift
   2.394 +    // as it needs to be stored as iv for next block (see code before next retl)
   2.395 +    __ movdtox(F60, L6);
   2.396 +    __ movdtox(F62, L7);
   2.397 +    __ faligndata(F60, F60, F60);
   2.398 +    __ faligndata(F62, F62, F62);
   2.399 +    __ mov(to, L5);
   2.400 +    __ and3(to, -8, to);
   2.401 +    __ and3(L3, -8, L3);
   2.402 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   2.403 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   2.404 +    __ add(to, 8, to);
   2.405 +    __ add(L3, 8, L3);
   2.406 +    __ orn(G0, L2, L2);
   2.407 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   2.408 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   2.409 +    __ mov(L5, to);
   2.410 +    __ movxtod(L6, F60);
   2.411 +    __ movxtod(L7, F62);
   2.412 +
   2.413 +    __ BIND(L_check_loop_end_128bit);
   2.414      __ add(from, 16, from);
   2.415      __ add(to, 16, to);
   2.416      __ subcc(len_reg, 16, len_reg);
   2.417      __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
   2.418      __ delayed()->nop();
   2.419 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
   2.420      __ stf(FloatRegisterImpl::D, F60, rvec, 0);
   2.421      __ stf(FloatRegisterImpl::D, F62, rvec, 8);
   2.422 +    __ restore();
   2.423      __ retl();
   2.424 -    __ delayed()->mov(L1, O0);
   2.425 +    __ delayed()->mov(L0, O0);
   2.426  
   2.427      __ align(OptoLoopAlignment);
   2.428      __ BIND(L_cbcenc192);
   2.429 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   2.430 +    __ andcc(from, 7, G0);
   2.431 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
   2.432 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
   2.433 +
   2.434 +    // aligned case: load input into G3 and G4
   2.435      __ ldx(from,0,G3);
   2.436      __ ldx(from,8,G4);
   2.437 +    __ ba_short(L_192bit_transform);
   2.438 +
   2.439 +    __ BIND(L_load_misaligned_input_192bit);
   2.440 +    // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
   2.441 +    __ alignaddr(from, G0, from);
   2.442 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
   2.443 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
   2.444 +    __ ldf(FloatRegisterImpl::D, from, 16, F52);
   2.445 +    __ faligndata(F48, F50, F48);
   2.446 +    __ faligndata(F50, F52, F50);
   2.447 +    __ movdtox(F48, G3);
   2.448 +    __ movdtox(F50, G4);
   2.449 +    __ mov(L1, from);
   2.450 +
   2.451 +    __ BIND(L_192bit_transform);
   2.452      __ xor3(G1,G3,G3);
   2.453 -    __ xor3(G2,G4,G4);
   2.454 +    __ xor3(G5,G4,G4);
   2.455      __ movxtod(G3,F56);
   2.456      __ movxtod(G4,F58);
   2.457      __ fxor(FloatRegisterImpl::D, F60, F56, F60);
   2.458 @@ -3634,24 +3827,81 @@
   2.459        }
   2.460      }
   2.461  
   2.462 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   2.463 +    __ andcc(to, 7, L1);
   2.464 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
   2.465 +    __ delayed()->edge8n(to, G0, L2);
   2.466 +
   2.467 +    // aligned case: store output into the destination array
   2.468      __ stf(FloatRegisterImpl::D, F60, to, 0);
   2.469      __ stf(FloatRegisterImpl::D, F62, to, 8);
   2.470 +    __ ba_short(L_check_loop_end_192bit);
   2.471 +
   2.472 +    __ BIND(L_store_misaligned_output_192bit);
   2.473 +    __ add(to, 8, L3);
   2.474 +    __ mov(8, L4);
   2.475 +    __ sub(L4, L1, L4);
   2.476 +    __ alignaddr(L4, G0, L4);
   2.477 +    __ movdtox(F60, L6);
   2.478 +    __ movdtox(F62, L7);
   2.479 +    __ faligndata(F60, F60, F60);
   2.480 +    __ faligndata(F62, F62, F62);
   2.481 +    __ mov(to, L5);
   2.482 +    __ and3(to, -8, to);
   2.483 +    __ and3(L3, -8, L3);
   2.484 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   2.485 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   2.486 +    __ add(to, 8, to);
   2.487 +    __ add(L3, 8, L3);
   2.488 +    __ orn(G0, L2, L2);
   2.489 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   2.490 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   2.491 +    __ mov(L5, to);
   2.492 +    __ movxtod(L6, F60);
   2.493 +    __ movxtod(L7, F62);
   2.494 +
   2.495 +    __ BIND(L_check_loop_end_192bit);
   2.496      __ add(from, 16, from);
   2.497      __ subcc(len_reg, 16, len_reg);
   2.498      __ add(to, 16, to);
   2.499      __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
   2.500      __ delayed()->nop();
   2.501 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
   2.502      __ stf(FloatRegisterImpl::D, F60, rvec, 0);
   2.503      __ stf(FloatRegisterImpl::D, F62, rvec, 8);
   2.504 +    __ restore();
   2.505      __ retl();
   2.506 -    __ delayed()->mov(L1, O0);
   2.507 +    __ delayed()->mov(L0, O0);
   2.508  
   2.509      __ align(OptoLoopAlignment);
   2.510      __ BIND(L_cbcenc256);
   2.511 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   2.512 +    __ andcc(from, 7, G0);
   2.513 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
   2.514 +    __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
   2.515 +
   2.516 +    // aligned case: load input into G3 and G4
   2.517      __ ldx(from,0,G3);
   2.518      __ ldx(from,8,G4);
   2.519 +    __ ba_short(L_256bit_transform);
   2.520 +
   2.521 +    __ BIND(L_load_misaligned_input_256bit);
   2.522 +    // cannot clobber F48, F50 and F52. F56, F58 can be used though
   2.523 +    __ alignaddr(from, G0, from);
   2.524 +    __ movdtox(F60, L2); // save F60 before overwriting
   2.525 +    __ ldf(FloatRegisterImpl::D, from, 0, F56);
   2.526 +    __ ldf(FloatRegisterImpl::D, from, 8, F58);
   2.527 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
   2.528 +    __ faligndata(F56, F58, F56);
   2.529 +    __ faligndata(F58, F60, F58);
   2.530 +    __ movdtox(F56, G3);
   2.531 +    __ movdtox(F58, G4);
   2.532 +    __ mov(L1, from);
   2.533 +    __ movxtod(L2, F60);
   2.534 +
   2.535 +    __ BIND(L_256bit_transform);
   2.536      __ xor3(G1,G3,G3);
   2.537 -    __ xor3(G2,G4,G4);
   2.538 +    __ xor3(G5,G4,G4);
   2.539      __ movxtod(G3,F56);
   2.540      __ movxtod(G4,F58);
   2.541      __ fxor(FloatRegisterImpl::D, F60, F56, F60);
   2.542 @@ -3670,26 +3920,69 @@
   2.543        }
   2.544      }
   2.545  
   2.546 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   2.547 +    __ andcc(to, 7, L1);
   2.548 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
   2.549 +    __ delayed()->edge8n(to, G0, L2);
   2.550 +
   2.551 +    // aligned case: store output into the destination array
   2.552      __ stf(FloatRegisterImpl::D, F60, to, 0);
   2.553      __ stf(FloatRegisterImpl::D, F62, to, 8);
   2.554 +    __ ba_short(L_check_loop_end_256bit);
   2.555 +
   2.556 +    __ BIND(L_store_misaligned_output_256bit);
   2.557 +    __ add(to, 8, L3);
   2.558 +    __ mov(8, L4);
   2.559 +    __ sub(L4, L1, L4);
   2.560 +    __ alignaddr(L4, G0, L4);
   2.561 +    __ movdtox(F60, L6);
   2.562 +    __ movdtox(F62, L7);
   2.563 +    __ faligndata(F60, F60, F60);
   2.564 +    __ faligndata(F62, F62, F62);
   2.565 +    __ mov(to, L5);
   2.566 +    __ and3(to, -8, to);
   2.567 +    __ and3(L3, -8, L3);
   2.568 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   2.569 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   2.570 +    __ add(to, 8, to);
   2.571 +    __ add(L3, 8, L3);
   2.572 +    __ orn(G0, L2, L2);
   2.573 +    __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
   2.574 +    __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
   2.575 +    __ mov(L5, to);
   2.576 +    __ movxtod(L6, F60);
   2.577 +    __ movxtod(L7, F62);
   2.578 +
   2.579 +    __ BIND(L_check_loop_end_256bit);
   2.580      __ add(from, 16, from);
   2.581      __ subcc(len_reg, 16, len_reg);
   2.582      __ add(to, 16, to);
   2.583      __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
   2.584      __ delayed()->nop();
    2.585 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
   2.586      __ stf(FloatRegisterImpl::D, F60, rvec, 0);
   2.587      __ stf(FloatRegisterImpl::D, F62, rvec, 8);
   2.588 +    __ restore();
   2.589      __ retl();
   2.590 -    __ delayed()->mov(L1, O0);
   2.591 +    __ delayed()->mov(L0, O0);
   2.592  
   2.593      return start;
   2.594    }
   2.595  
   2.596    address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
   2.597 +    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
   2.598 +           "the following code assumes that first element of an int array is aligned to 8 bytes");
   2.599 +    assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
   2.600 +           "the following code assumes that first element of a byte array is aligned to 8 bytes");
   2.601      __ align(CodeEntryAlignment);
   2.602      StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
   2.603      Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
   2.604      Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
   2.605 +    Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
   2.606 +    Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
   2.607 +    Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
   2.608 +    Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
   2.609 +    Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
   2.610      address start = __ pc();
   2.611      Register from = I0; // source byte array
   2.612      Register to = I1;   // destination byte array
   2.613 @@ -3704,11 +3997,12 @@
   2.614      __ save_frame(0); //args are read from I* registers since we save the frame in the beginning
   2.615  
   2.616      // load original key from SunJCE expanded decryption key
    2.617 +    // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
   2.618      for ( int i = 0;  i <= 3; i++ ) {
   2.619        __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
   2.620      }
   2.621  
   2.622 -    // load initial vector
   2.623 +    // load initial vector, 8-byte alignment is guaranteed
   2.624      __ ldx(rvec,0,L0);
   2.625      __ ldx(rvec,8,L1);
   2.626  
   2.627 @@ -3733,11 +4027,10 @@
   2.628      __ movdtox(F42,L3);
   2.629  
   2.630      __ and3(len_reg, 16, L4);
   2.631 -    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks128);
   2.632 -    __ delayed()->nop();
   2.633 -
   2.634 -    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
   2.635 -    __ delayed()->nop();
   2.636 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
   2.637 +    __ nop();
   2.638 +
   2.639 +    __ ba_short(L_dec_first_block_start);
   2.640  
   2.641      __ BIND(L_expand192bit);
   2.642      // load rest of the 192-bit key
   2.643 @@ -3758,11 +4051,10 @@
   2.644      __ movdtox(F50,L3);
   2.645  
   2.646      __ and3(len_reg, 16, L4);
   2.647 -    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks192);
   2.648 -    __ delayed()->nop();
   2.649 -
   2.650 -    __ br(Assembler::always, false, Assembler::pt, L_dec_first_block_start);
   2.651 -    __ delayed()->nop();
   2.652 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
   2.653 +    __ nop();
   2.654 +
   2.655 +    __ ba_short(L_dec_first_block_start);
   2.656  
   2.657      __ BIND(L_expand256bit);
   2.658      // load rest of the 256-bit key
   2.659 @@ -3785,12 +4077,32 @@
   2.660      __ movdtox(F58,L3);
   2.661  
   2.662      __ and3(len_reg, 16, L4);
   2.663 -    __ br_null(L4, false, Assembler::pt, L_dec_next2_blocks256);
   2.664 -    __ delayed()->nop();
   2.665 +    __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
   2.666  
   2.667      __ BIND(L_dec_first_block_start);
   2.668 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   2.669 +    __ andcc(from, 7, G0);
   2.670 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
   2.671 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
   2.672 +
   2.673 +    // aligned case: load input into L4 and L5
   2.674      __ ldx(from,0,L4);
   2.675      __ ldx(from,8,L5);
   2.676 +    __ ba_short(L_transform_first_block);
   2.677 +
   2.678 +    __ BIND(L_load_misaligned_input_first_block);
   2.679 +    __ alignaddr(from, G0, from);
   2.680 +    // F58, F60, F62 can be clobbered
   2.681 +    __ ldf(FloatRegisterImpl::D, from, 0, F58);
   2.682 +    __ ldf(FloatRegisterImpl::D, from, 8, F60);
   2.683 +    __ ldf(FloatRegisterImpl::D, from, 16, F62);
   2.684 +    __ faligndata(F58, F60, F58);
   2.685 +    __ faligndata(F60, F62, F60);
   2.686 +    __ movdtox(F58, L4);
   2.687 +    __ movdtox(F60, L5);
   2.688 +    __ mov(G1, from);
   2.689 +
   2.690 +    __ BIND(L_transform_first_block);
   2.691      __ xor3(L2,L4,G1);
   2.692      __ movxtod(G1,F60);
   2.693      __ xor3(L3,L5,G1);
   2.694 @@ -3833,9 +4145,36 @@
   2.695      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   2.696      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   2.697  
   2.698 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   2.699 +    __ andcc(to, 7, G1);
   2.700 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
   2.701 +    __ delayed()->edge8n(to, G0, G2);
   2.702 +
   2.703 +    // aligned case: store output into the destination array
   2.704      __ stf(FloatRegisterImpl::D, F60, to, 0);
   2.705      __ stf(FloatRegisterImpl::D, F62, to, 8);
   2.706 -
   2.707 +    __ ba_short(L_check_decrypt_end);
   2.708 +
   2.709 +    __ BIND(L_store_misaligned_output_first_block);
   2.710 +    __ add(to, 8, G3);
   2.711 +    __ mov(8, G4);
   2.712 +    __ sub(G4, G1, G4);
   2.713 +    __ alignaddr(G4, G0, G4);
   2.714 +    __ faligndata(F60, F60, F60);
   2.715 +    __ faligndata(F62, F62, F62);
   2.716 +    __ mov(to, G1);
   2.717 +    __ and3(to, -8, to);
   2.718 +    __ and3(G3, -8, G3);
   2.719 +    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
   2.720 +    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
   2.721 +    __ add(to, 8, to);
   2.722 +    __ add(G3, 8, G3);
   2.723 +    __ orn(G0, G2, G2);
   2.724 +    __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
   2.725 +    __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
   2.726 +    __ mov(G1, to);
   2.727 +
   2.728 +    __ BIND(L_check_decrypt_end);
   2.729      __ add(from, 16, from);
   2.730      __ add(to, 16, to);
   2.731      __ subcc(len_reg, 16, len_reg);
   2.732 @@ -3852,17 +4191,44 @@
   2.733      __ BIND(L_dec_next2_blocks128);
   2.734      __ nop();
   2.735  
   2.736 -    // F40:F42 used for first 16-bytes
   2.737 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   2.738 +    __ andcc(from, 7, G0);
   2.739 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
   2.740 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
   2.741 +
   2.742 +    // aligned case: load input into G4, G5, L4 and L5
   2.743      __ ldx(from,0,G4);
   2.744      __ ldx(from,8,G5);
   2.745 +    __ ldx(from,16,L4);
   2.746 +    __ ldx(from,24,L5);
   2.747 +    __ ba_short(L_transform_next2_blocks128);
   2.748 +
   2.749 +    __ BIND(L_load_misaligned_next2_blocks128);
   2.750 +    __ alignaddr(from, G0, from);
   2.751 +    // F40, F42, F58, F60, F62 can be clobbered
   2.752 +    __ ldf(FloatRegisterImpl::D, from, 0, F40);
   2.753 +    __ ldf(FloatRegisterImpl::D, from, 8, F42);
   2.754 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
   2.755 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
   2.756 +    __ ldf(FloatRegisterImpl::D, from, 32, F58);
   2.757 +    __ faligndata(F40, F42, F40);
   2.758 +    __ faligndata(F42, F60, F42);
   2.759 +    __ faligndata(F60, F62, F60);
   2.760 +    __ faligndata(F62, F58, F62);
   2.761 +    __ movdtox(F40, G4);
   2.762 +    __ movdtox(F42, G5);
   2.763 +    __ movdtox(F60, L4);
   2.764 +    __ movdtox(F62, L5);
   2.765 +    __ mov(G1, from);
   2.766 +
   2.767 +    __ BIND(L_transform_next2_blocks128);
   2.768 +    // F40:F42 used for first 16-bytes
   2.769      __ xor3(L2,G4,G1);
   2.770      __ movxtod(G1,F40);
   2.771      __ xor3(L3,G5,G1);
   2.772      __ movxtod(G1,F42);
   2.773  
   2.774      // F60:F62 used for next 16-bytes
   2.775 -    __ ldx(from,16,L4);
   2.776 -    __ ldx(from,24,L5);
   2.777      __ xor3(L2,L4,G1);
   2.778      __ movxtod(G1,F60);
   2.779      __ xor3(L3,L5,G1);
   2.780 @@ -3891,9 +4257,6 @@
   2.781      __ fxor(FloatRegisterImpl::D, F46, F40, F40);
   2.782      __ fxor(FloatRegisterImpl::D, F44, F42, F42);
   2.783  
   2.784 -    __ stf(FloatRegisterImpl::D, F40, to, 0);
   2.785 -    __ stf(FloatRegisterImpl::D, F42, to, 8);
   2.786 -
   2.787      __ movxtod(G4,F56);
   2.788      __ movxtod(G5,F58);
   2.789      __ mov(L4,L0);
   2.790 @@ -3901,32 +4264,93 @@
   2.791      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   2.792      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   2.793  
   2.794 +    // For mis-aligned store of 32 bytes of result we can do:
   2.795 +    // Circular right-shift all 4 FP registers so that 'head' and 'tail'
   2.796 +    // parts that need to be stored starting at mis-aligned address are in a FP reg
   2.797 +    // the other 3 FP regs can thus be stored using regular store
   2.798 +    // we then use the edge + partial-store mechanism to store the 'head' and 'tail' parts
   2.799 +
   2.800 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   2.801 +    __ andcc(to, 7, G1);
   2.802 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
   2.803 +    __ delayed()->edge8n(to, G0, G2);
   2.804 +
   2.805 +    // aligned case: store output into the destination array
   2.806 +    __ stf(FloatRegisterImpl::D, F40, to, 0);
   2.807 +    __ stf(FloatRegisterImpl::D, F42, to, 8);
   2.808      __ stf(FloatRegisterImpl::D, F60, to, 16);
   2.809      __ stf(FloatRegisterImpl::D, F62, to, 24);
   2.810 -
   2.811 +    __ ba_short(L_check_decrypt_loop_end128);
   2.812 +
   2.813 +    __ BIND(L_store_misaligned_output_next2_blocks128);
   2.814 +    __ mov(8, G4);
   2.815 +    __ sub(G4, G1, G4);
   2.816 +    __ alignaddr(G4, G0, G4);
   2.817 +    __ faligndata(F40, F42, F56); // F56 can be clobbered
   2.818 +    __ faligndata(F42, F60, F42);
   2.819 +    __ faligndata(F60, F62, F60);
   2.820 +    __ faligndata(F62, F40, F40);
   2.821 +    __ mov(to, G1);
   2.822 +    __ and3(to, -8, to);
   2.823 +    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
   2.824 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
   2.825 +    __ stf(FloatRegisterImpl::D, F42, to, 16);
   2.826 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
   2.827 +    __ add(to, 32, to);
   2.828 +    __ orn(G0, G2, G2);
   2.829 +    __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
   2.830 +    __ mov(G1, to);
   2.831 +
   2.832 +    __ BIND(L_check_decrypt_loop_end128);
   2.833      __ add(from, 32, from);
   2.834      __ add(to, 32, to);
   2.835      __ subcc(len_reg, 32, len_reg);
   2.836      __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
   2.837      __ delayed()->nop();
   2.838 -    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
   2.839 -    __ delayed()->nop();
   2.840 +    __ ba_short(L_cbcdec_end);
   2.841  
   2.842      __ align(OptoLoopAlignment);
   2.843      __ BIND(L_dec_next2_blocks192);
   2.844      __ nop();
   2.845  
   2.846 -    // F48:F50 used for first 16-bytes
   2.847 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   2.848 +    __ andcc(from, 7, G0);
   2.849 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
   2.850 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
   2.851 +
   2.852 +    // aligned case: load input into G4, G5, L4 and L5
   2.853      __ ldx(from,0,G4);
   2.854      __ ldx(from,8,G5);
   2.855 +    __ ldx(from,16,L4);
   2.856 +    __ ldx(from,24,L5);
   2.857 +    __ ba_short(L_transform_next2_blocks192);
   2.858 +
   2.859 +    __ BIND(L_load_misaligned_next2_blocks192);
   2.860 +    __ alignaddr(from, G0, from);
   2.861 +    // F48, F50, F52, F60, F62 can be clobbered
   2.862 +    __ ldf(FloatRegisterImpl::D, from, 0, F48);
   2.863 +    __ ldf(FloatRegisterImpl::D, from, 8, F50);
   2.864 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
   2.865 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
   2.866 +    __ ldf(FloatRegisterImpl::D, from, 32, F52);
   2.867 +    __ faligndata(F48, F50, F48);
   2.868 +    __ faligndata(F50, F60, F50);
   2.869 +    __ faligndata(F60, F62, F60);
   2.870 +    __ faligndata(F62, F52, F62);
   2.871 +    __ movdtox(F48, G4);
   2.872 +    __ movdtox(F50, G5);
   2.873 +    __ movdtox(F60, L4);
   2.874 +    __ movdtox(F62, L5);
   2.875 +    __ mov(G1, from);
   2.876 +
   2.877 +    __ BIND(L_transform_next2_blocks192);
   2.878 +    // F48:F50 used for first 16-bytes
   2.879      __ xor3(L2,G4,G1);
   2.880      __ movxtod(G1,F48);
   2.881      __ xor3(L3,G5,G1);
   2.882      __ movxtod(G1,F50);
   2.883  
   2.884      // F60:F62 used for next 16-bytes
   2.885 -    __ ldx(from,16,L4);
   2.886 -    __ ldx(from,24,L5);
   2.887      __ xor3(L2,L4,G1);
   2.888      __ movxtod(G1,F60);
   2.889      __ xor3(L3,L5,G1);
   2.890 @@ -3955,9 +4379,6 @@
   2.891      __ fxor(FloatRegisterImpl::D, F54, F48, F48);
   2.892      __ fxor(FloatRegisterImpl::D, F52, F50, F50);
   2.893  
   2.894 -    __ stf(FloatRegisterImpl::D, F48, to, 0);
   2.895 -    __ stf(FloatRegisterImpl::D, F50, to, 8);
   2.896 -
   2.897      __ movxtod(G4,F56);
   2.898      __ movxtod(G5,F58);
   2.899      __ mov(L4,L0);
   2.900 @@ -3965,32 +4386,87 @@
   2.901      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
   2.902      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
   2.903  
   2.904 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
   2.905 +    __ andcc(to, 7, G1);
   2.906 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
   2.907 +    __ delayed()->edge8n(to, G0, G2);
   2.908 +
   2.909 +    // aligned case: store output into the destination array
   2.910 +    __ stf(FloatRegisterImpl::D, F48, to, 0);
   2.911 +    __ stf(FloatRegisterImpl::D, F50, to, 8);
   2.912      __ stf(FloatRegisterImpl::D, F60, to, 16);
   2.913      __ stf(FloatRegisterImpl::D, F62, to, 24);
   2.914 -
   2.915 +    __ ba_short(L_check_decrypt_loop_end192);
   2.916 +
   2.917 +    __ BIND(L_store_misaligned_output_next2_blocks192);
   2.918 +    __ mov(8, G4);
   2.919 +    __ sub(G4, G1, G4);
   2.920 +    __ alignaddr(G4, G0, G4);
   2.921 +    __ faligndata(F48, F50, F56); // F56 can be clobbered
   2.922 +    __ faligndata(F50, F60, F50);
   2.923 +    __ faligndata(F60, F62, F60);
   2.924 +    __ faligndata(F62, F48, F48);
   2.925 +    __ mov(to, G1);
   2.926 +    __ and3(to, -8, to);
   2.927 +    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
   2.928 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
   2.929 +    __ stf(FloatRegisterImpl::D, F50, to, 16);
   2.930 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
   2.931 +    __ add(to, 32, to);
   2.932 +    __ orn(G0, G2, G2);
   2.933 +    __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
   2.934 +    __ mov(G1, to);
   2.935 +
   2.936 +    __ BIND(L_check_decrypt_loop_end192);
   2.937      __ add(from, 32, from);
   2.938      __ add(to, 32, to);
   2.939      __ subcc(len_reg, 32, len_reg);
   2.940      __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
   2.941      __ delayed()->nop();
   2.942 -    __ br(Assembler::always, false, Assembler::pt, L_cbcdec_end);
   2.943 -    __ delayed()->nop();
   2.944 +    __ ba_short(L_cbcdec_end);
   2.945  
   2.946      __ align(OptoLoopAlignment);
   2.947      __ BIND(L_dec_next2_blocks256);
   2.948      __ nop();
   2.949  
   2.950 -    // F0:F2 used for first 16-bytes
   2.951 +    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
   2.952 +    __ andcc(from, 7, G0);
   2.953 +    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
   2.954 +    __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
   2.955 +
   2.956 +    // aligned case: load input into G4, G5, L4 and L5
   2.957      __ ldx(from,0,G4);
   2.958      __ ldx(from,8,G5);
   2.959 +    __ ldx(from,16,L4);
   2.960 +    __ ldx(from,24,L5);
   2.961 +    __ ba_short(L_transform_next2_blocks256);
   2.962 +
   2.963 +    __ BIND(L_load_misaligned_next2_blocks256);
   2.964 +    __ alignaddr(from, G0, from);
   2.965 +    // F0, F2, F4, F60, F62 can be clobbered
   2.966 +    __ ldf(FloatRegisterImpl::D, from, 0, F0);
   2.967 +    __ ldf(FloatRegisterImpl::D, from, 8, F2);
   2.968 +    __ ldf(FloatRegisterImpl::D, from, 16, F60);
   2.969 +    __ ldf(FloatRegisterImpl::D, from, 24, F62);
   2.970 +    __ ldf(FloatRegisterImpl::D, from, 32, F4);
   2.971 +    __ faligndata(F0, F2, F0);
   2.972 +    __ faligndata(F2, F60, F2);
   2.973 +    __ faligndata(F60, F62, F60);
   2.974 +    __ faligndata(F62, F4, F62);
   2.975 +    __ movdtox(F0, G4);
   2.976 +    __ movdtox(F2, G5);
   2.977 +    __ movdtox(F60, L4);
   2.978 +    __ movdtox(F62, L5);
   2.979 +    __ mov(G1, from);
   2.980 +
   2.981 +    __ BIND(L_transform_next2_blocks256);
   2.982 +    // F0:F2 used for first 16-bytes
   2.983      __ xor3(L2,G4,G1);
   2.984      __ movxtod(G1,F0);
   2.985      __ xor3(L3,G5,G1);
   2.986      __ movxtod(G1,F2);
   2.987  
   2.988      // F60:F62 used for next 16-bytes
   2.989 -    __ ldx(from,16,L4);
   2.990 -    __ ldx(from,24,L5);
   2.991      __ xor3(L2,L4,G1);
   2.992      __ movxtod(G1,F60);
   2.993      __ xor3(L3,L5,G1);
   2.994 @@ -4043,9 +4519,6 @@
   2.995      __ fxor(FloatRegisterImpl::D, F6, F0, F0);
   2.996      __ fxor(FloatRegisterImpl::D, F4, F2, F2);
   2.997  
   2.998 -    __ stf(FloatRegisterImpl::D, F0, to, 0);
   2.999 -    __ stf(FloatRegisterImpl::D, F2, to, 8);
  2.1000 -
  2.1001      __ movxtod(G4,F56);
  2.1002      __ movxtod(G5,F58);
  2.1003      __ mov(L4,L0);
  2.1004 @@ -4053,9 +4526,38 @@
  2.1005      __ fxor(FloatRegisterImpl::D, F56, F60, F60);
  2.1006      __ fxor(FloatRegisterImpl::D, F58, F62, F62);
  2.1007  
  2.1008 +    // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
  2.1009 +    __ andcc(to, 7, G1);
  2.1010 +    __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
  2.1011 +    __ delayed()->edge8n(to, G0, G2);
  2.1012 +
  2.1013 +    // aligned case: store output into the destination array
  2.1014 +    __ stf(FloatRegisterImpl::D, F0, to, 0);
  2.1015 +    __ stf(FloatRegisterImpl::D, F2, to, 8);
  2.1016      __ stf(FloatRegisterImpl::D, F60, to, 16);
  2.1017      __ stf(FloatRegisterImpl::D, F62, to, 24);
  2.1018 -
  2.1019 +    __ ba_short(L_check_decrypt_loop_end256);
  2.1020 +
  2.1021 +    __ BIND(L_store_misaligned_output_next2_blocks256);
  2.1022 +    __ mov(8, G4);
  2.1023 +    __ sub(G4, G1, G4);
  2.1024 +    __ alignaddr(G4, G0, G4);
  2.1025 +    __ faligndata(F0, F2, F56); // F56 can be clobbered
  2.1026 +    __ faligndata(F2, F60, F2);
  2.1027 +    __ faligndata(F60, F62, F60);
  2.1028 +    __ faligndata(F62, F0, F0);
  2.1029 +    __ mov(to, G1);
  2.1030 +    __ and3(to, -8, to);
  2.1031 +    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
  2.1032 +    __ stf(FloatRegisterImpl::D, F56, to, 8);
  2.1033 +    __ stf(FloatRegisterImpl::D, F2, to, 16);
  2.1034 +    __ stf(FloatRegisterImpl::D, F60, to, 24);
  2.1035 +    __ add(to, 32, to);
  2.1036 +    __ orn(G0, G2, G2);
  2.1037 +    __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
  2.1038 +    __ mov(G1, to);
  2.1039 +
  2.1040 +    __ BIND(L_check_decrypt_loop_end256);
  2.1041      __ add(from, 32, from);
  2.1042      __ add(to, 32, to);
  2.1043      __ subcc(len_reg, 32, len_reg);
  2.1044 @@ -4063,6 +4565,7 @@
  2.1045      __ delayed()->nop();
  2.1046  
  2.1047      __ BIND(L_cbcdec_end);
   2.1048 +    // re-init initial vector for next block, 8-byte alignment is guaranteed
  2.1049      __ stx(L0, rvec, 0);
  2.1050      __ stx(L1, rvec, 8);
  2.1051      __ restore();
     3.1 --- a/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Thu May 01 15:02:46 2014 -0700
     3.2 +++ b/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Wed Apr 30 14:14:01 2014 -0700
     3.3 @@ -41,7 +41,7 @@
     3.4  enum /* platform_dependent_constants */ {
     3.5    // %%%%%%%% May be able to shrink this a lot
     3.6    code_size1 = 20000,           // simply increase if too small (assembler will crash if too small)
     3.7 -  code_size2 = 20000            // simply increase if too small (assembler will crash if too small)
     3.8 +  code_size2 = 22000            // simply increase if too small (assembler will crash if too small)
     3.9  };
    3.10  
    3.11  class Sparc {
     4.1 --- a/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu May 01 15:02:46 2014 -0700
     4.2 +++ b/src/cpu/sparc/vm/vm_version_sparc.cpp	Wed Apr 30 14:14:01 2014 -0700
     4.3 @@ -1,5 +1,5 @@
     4.4  /*
     4.5 - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
     4.6 + * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
     4.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4.8   *
     4.9   * This code is free software; you can redistribute it and/or modify it
    4.10 @@ -266,9 +266,9 @@
    4.11    if (!has_vis1()) // Drop to 0 if no VIS1 support
    4.12      UseVIS = 0;
    4.13  
    4.14 -  // T2 and above should have support for AES instructions
    4.15 +  // SPARC T4 and above should have support for AES instructions
    4.16    if (has_aes()) {
    4.17 -    if (UseVIS > 0) { // AES intrinsics use FXOR instruction which is VIS1
    4.18 +    if (UseVIS > 2) { // AES intrinsics use MOVxTOd/MOVdTOx which are VIS3
    4.19        if (FLAG_IS_DEFAULT(UseAES)) {
    4.20          FLAG_SET_DEFAULT(UseAES, true);
    4.21        }
    4.22 @@ -282,7 +282,7 @@
    4.23        }
    4.24      } else {
    4.25          if (UseAES || UseAESIntrinsics) {
    4.26 -          warning("SPARC AES intrinsics require VIS1 instruction support. Intrinsics will be disabled.");
    4.27 +          warning("SPARC AES intrinsics require VIS3 instruction support. Intrinsics will be disabled.");
    4.28            if (UseAES) {
    4.29              FLAG_SET_DEFAULT(UseAES, false);
    4.30            }
     5.1 --- a/src/share/vm/classfile/vmSymbols.hpp	Thu May 01 15:02:46 2014 -0700
     5.2 +++ b/src/share/vm/classfile/vmSymbols.hpp	Wed Apr 30 14:14:01 2014 -0700
     5.3 @@ -774,7 +774,7 @@
     5.4    /* java/lang/ref/Reference */                                                                                         \
     5.5    do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
     5.6                                                                                                                          \
     5.7 -  /* support for com.sum.crypto.provider.AESCrypt and some of its callers */                                            \
     5.8 +  /* support for com.sun.crypto.provider.AESCrypt and some of its callers */                                            \
     5.9    do_class(com_sun_crypto_provider_aescrypt,      "com/sun/crypto/provider/AESCrypt")                                   \
    5.10    do_intrinsic(_aescrypt_encryptBlock, com_sun_crypto_provider_aescrypt, encryptBlock_name, byteArray_int_byteArray_int_signature, F_R)   \
    5.11    do_intrinsic(_aescrypt_decryptBlock, com_sun_crypto_provider_aescrypt, decryptBlock_name, byteArray_int_byteArray_int_signature, F_R)   \
     6.1 --- a/src/share/vm/opto/runtime.cpp	Thu May 01 15:02:46 2014 -0700
     6.2 +++ b/src/share/vm/opto/runtime.cpp	Wed Apr 30 14:14:01 2014 -0700
     6.3 @@ -1,5 +1,5 @@
     6.4  /*
     6.5 - * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
     6.6 + * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
     6.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     6.8   *
     6.9   * This code is free software; you can redistribute it and/or modify it
    6.10 @@ -870,7 +870,7 @@
    6.11    return TypeFunc::make(domain, range);
    6.12  }
    6.13  
    6.14 -// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning void
    6.15 +// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning int
    6.16  const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() {
    6.17    // create input type (domain)
    6.18    int num_args      = 5;
     7.1 --- a/test/compiler/7184394/TestAESBase.java	Thu May 01 15:02:46 2014 -0700
     7.2 +++ b/test/compiler/7184394/TestAESBase.java	Wed Apr 30 14:14:01 2014 -0700
     7.3 @@ -1,5 +1,5 @@
     7.4  /*
     7.5 - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
     7.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
     7.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     7.8   *
     7.9   * This code is free software; you can redistribute it and/or modify it
    7.10 @@ -40,9 +40,20 @@
    7.11    int msgSize = Integer.getInteger("msgSize", 646);
    7.12    boolean checkOutput = Boolean.getBoolean("checkOutput");
    7.13    boolean noReinit = Boolean.getBoolean("noReinit");
    7.14 +  boolean testingMisalignment;
    7.15 +  private static final int ALIGN = 8;
    7.16 +  int encInputOffset = Integer.getInteger("encInputOffset", 0) % ALIGN;
    7.17 +  int encOutputOffset = Integer.getInteger("encOutputOffset", 0) % ALIGN;
    7.18 +  int decOutputOffset = Integer.getInteger("decOutputOffset", 0) % ALIGN;
    7.19 +  int lastChunkSize = Integer.getInteger("lastChunkSize", 32);
    7.20    int keySize = Integer.getInteger("keySize", 128);
    7.21 +  int inputLength;
    7.22 +  int encodeLength;
    7.23 +  int decodeLength;
    7.24 +  int decodeMsgSize;
    7.25    String algorithm = System.getProperty("algorithm", "AES");
    7.26    String mode = System.getProperty("mode", "CBC");
    7.27 +  String paddingStr = System.getProperty("paddingStr", "PKCS5Padding");
    7.28    byte[] input;
    7.29    byte[] encode;
    7.30    byte[] expectedEncode;
    7.31 @@ -51,7 +62,6 @@
    7.32    Random random = new Random(0);
    7.33    Cipher cipher;
    7.34    Cipher dCipher;
    7.35 -  String paddingStr = "PKCS5Padding";
    7.36    AlgorithmParameters algParams;
    7.37    SecretKey key;
    7.38  
    7.39 @@ -67,7 +77,10 @@
    7.40  
    7.41    public void prepare() {
    7.42      try {
    7.43 -    System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput);
    7.44 +    System.out.println("\nalgorithm=" + algorithm + ", mode=" + mode + ", paddingStr=" + paddingStr + ", msgSize=" + msgSize + ", keySize=" + keySize + ", noReinit=" + noReinit + ", checkOutput=" + checkOutput + ", encInputOffset=" + encInputOffset + ", encOutputOffset=" + encOutputOffset + ", decOutputOffset=" + decOutputOffset + ", lastChunkSize=" +lastChunkSize );
    7.45 +
    7.46 +      if (encInputOffset % ALIGN != 0 || encOutputOffset % ALIGN != 0 || decOutputOffset % ALIGN !=0 )
    7.47 +        testingMisalignment = true;
    7.48  
    7.49        int keyLenBytes = (keySize == 0 ? 16 : keySize/8);
    7.50        byte keyBytes[] = new byte[keyLenBytes];
    7.51 @@ -81,10 +94,6 @@
    7.52          System.out.println("Algorithm: " + key.getAlgorithm() + "("
    7.53                             + key.getEncoded().length * 8 + "bit)");
    7.54        }
    7.55 -      input = new byte[msgSize];
    7.56 -      for (int i=0; i<input.length; i++) {
    7.57 -        input[i] = (byte) (i & 0xff);
    7.58 -      }
    7.59  
    7.60        cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
    7.61        dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
    7.62 @@ -103,10 +112,35 @@
    7.63          childShowCipher();
    7.64        }
    7.65  
    7.66 +      inputLength = msgSize + encInputOffset;
    7.67 +      if (testingMisalignment) {
    7.68 +        encodeLength = cipher.getOutputSize(msgSize - lastChunkSize) + encOutputOffset;
    7.69 +        encodeLength += cipher.getOutputSize(lastChunkSize);
    7.70 +        decodeLength = dCipher.getOutputSize(encodeLength - lastChunkSize) + decOutputOffset;
    7.71 +        decodeLength += dCipher.getOutputSize(lastChunkSize);
    7.72 +      } else {
    7.73 +        encodeLength = cipher.getOutputSize(msgSize) + encOutputOffset;
    7.74 +        decodeLength = dCipher.getOutputSize(encodeLength) + decOutputOffset;
    7.75 +      }
    7.76 +
    7.77 +      input = new byte[inputLength];
    7.78 +      for (int i=encInputOffset, j=0; i<inputLength; i++, j++) {
    7.79 +        input[i] = (byte) (j & 0xff);
    7.80 +      }
    7.81 +
    7.82        // do one encode and decode in preparation
    7.83 -      // this will also create the encode buffer and decode buffer
    7.84 -      encode = cipher.doFinal(input);
    7.85 -      decode = dCipher.doFinal(encode);
    7.86 +      encode = new byte[encodeLength];
    7.87 +      decode = new byte[decodeLength];
    7.88 +      if (testingMisalignment) {
    7.89 +        decodeMsgSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset);
    7.90 +        decodeMsgSize += cipher.doFinal(input, (encInputOffset + msgSize - lastChunkSize), lastChunkSize, encode, (encOutputOffset + decodeMsgSize));
    7.91 +
    7.92 +        int tempSize = dCipher.update(encode, encOutputOffset, (decodeMsgSize - lastChunkSize), decode, decOutputOffset);
    7.93 +        dCipher.doFinal(encode, (encOutputOffset + decodeMsgSize - lastChunkSize), lastChunkSize, decode, (decOutputOffset + tempSize));
    7.94 +      } else {
    7.95 +        decodeMsgSize = cipher.doFinal(input, encInputOffset, msgSize, encode, encOutputOffset);
    7.96 +        dCipher.doFinal(encode, encOutputOffset, decodeMsgSize, decode, decOutputOffset);
    7.97 +      }
    7.98        if (checkOutput) {
    7.99          expectedEncode = (byte[]) encode.clone();
   7.100          expectedDecode = (byte[]) decode.clone();
     8.1 --- a/test/compiler/7184394/TestAESDecode.java	Thu May 01 15:02:46 2014 -0700
     8.2 +++ b/test/compiler/7184394/TestAESDecode.java	Wed Apr 30 14:14:01 2014 -0700
     8.3 @@ -1,5 +1,5 @@
     8.4  /*
     8.5 - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
     8.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
     8.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     8.8   *
     8.9   * This code is free software; you can redistribute it and/or modify it
    8.10 @@ -33,14 +33,15 @@
    8.11    public void run() {
    8.12      try {
    8.13        if (!noReinit) dCipher.init(Cipher.DECRYPT_MODE, key, algParams);
    8.14 +      decode = new byte[decodeLength];
    8.15 +      if (testingMisalignment) {
    8.16 +        int tempSize = dCipher.update(encode, encOutputOffset, (decodeMsgSize - lastChunkSize), decode, decOutputOffset);
    8.17 +        dCipher.doFinal(encode, (encOutputOffset + decodeMsgSize - lastChunkSize), lastChunkSize, decode, (decOutputOffset + tempSize));
    8.18 +      } else {
    8.19 +        dCipher.doFinal(encode, encOutputOffset, decodeMsgSize, decode, decOutputOffset);
    8.20 +      }
    8.21        if (checkOutput) {
    8.22 -        // checked version creates new output buffer each time
    8.23 -        decode = dCipher.doFinal(encode, 0, encode.length);
    8.24          compareArrays(decode, expectedDecode);
    8.25 -      } else {
    8.26 -        // non-checked version outputs to existing encode buffer for maximum speed
    8.27 -        decode = new byte[dCipher.getOutputSize(encode.length)];
    8.28 -        dCipher.doFinal(encode, 0, encode.length, decode);
    8.29        }
    8.30      }
    8.31      catch (Exception e) {
     9.1 --- a/test/compiler/7184394/TestAESEncode.java	Thu May 01 15:02:46 2014 -0700
     9.2 +++ b/test/compiler/7184394/TestAESEncode.java	Wed Apr 30 14:14:01 2014 -0700
     9.3 @@ -1,5 +1,5 @@
     9.4  /*
     9.5 - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
     9.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
     9.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     9.8   *
     9.9   * This code is free software; you can redistribute it and/or modify it
    9.10 @@ -33,14 +33,15 @@
    9.11    public void run() {
    9.12      try {
    9.13        if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
    9.14 +      encode = new byte[encodeLength];
    9.15 +      if (testingMisalignment) {
    9.16 +        int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset);
    9.17 +        cipher.doFinal(input, (encInputOffset + msgSize - lastChunkSize), lastChunkSize, encode, (encOutputOffset + tempSize));
    9.18 +      } else {
    9.19 +        cipher.doFinal(input, encInputOffset, msgSize, encode, encOutputOffset);
    9.20 +      }
    9.21        if (checkOutput) {
    9.22 -        // checked version creates new output buffer each time
    9.23 -        encode = cipher.doFinal(input, 0, msgSize);
    9.24          compareArrays(encode, expectedEncode);
    9.25 -      } else {
    9.26 -        // non-checked version outputs to existing encode buffer for maximum speed
    9.27 -        encode = new byte[cipher.getOutputSize(msgSize)];
    9.28 -        cipher.doFinal(input, 0, msgSize, encode);
    9.29        }
    9.30      }
    9.31      catch (Exception e) {
    10.1 --- a/test/compiler/7184394/TestAESMain.java	Thu May 01 15:02:46 2014 -0700
    10.2 +++ b/test/compiler/7184394/TestAESMain.java	Wed Apr 30 14:14:01 2014 -0700
    10.3 @@ -1,5 +1,5 @@
    10.4  /*
    10.5 - * Copyright (c) 2012, 2014 Oracle and/or its affiliates. All rights reserved.
    10.6 + * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
    10.7   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    10.8   *
    10.9   * This code is free software; you can redistribute it and/or modify it
   10.10 @@ -28,7 +28,19 @@
   10.11   * @summary add intrinsics to use AES instructions
   10.12   *
   10.13   * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC TestAESMain
   10.14 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 TestAESMain
   10.15 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencOutputOffset=1 TestAESMain
   10.16 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DdecOutputOffset=1 TestAESMain
   10.17 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
   10.18 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
   10.19 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CBC -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding TestAESMain
   10.20   * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB TestAESMain
   10.21 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 TestAESMain
   10.22 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencOutputOffset=1 TestAESMain
   10.23 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DdecOutputOffset=1 TestAESMain
   10.24 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
   10.25 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
   10.26 + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding TestAESMain
   10.27   *
   10.28   * @author Tom Deneau
   10.29   */
   10.30 @@ -36,12 +48,13 @@
   10.31  public class TestAESMain {
   10.32    public static void main(String[] args) {
   10.33      int iters = (args.length > 0 ? Integer.valueOf(args[0]) : 1000000);
   10.34 +    int warmupIters = (args.length > 1 ? Integer.valueOf(args[1]) : 20000);
   10.35      System.out.println(iters + " iterations");
   10.36      TestAESEncode etest = new TestAESEncode();
   10.37      etest.prepare();
   10.38 -    // warm-up for 20K iterations
   10.39 +    // warm-up
   10.40      System.out.println("Starting encryption warm-up");
   10.41 -    for (int i=0; i<20000; i++) {
   10.42 +    for (int i=0; i<warmupIters; i++) {
   10.43        etest.run();
   10.44      }
   10.45      System.out.println("Finished encryption warm-up");
   10.46 @@ -54,9 +67,9 @@
   10.47  
   10.48      TestAESDecode dtest = new TestAESDecode();
   10.49      dtest.prepare();
   10.50 -    // warm-up for 20K iterations
   10.51 +    // warm-up
   10.52      System.out.println("Starting decryption warm-up");
   10.53 -    for (int i=0; i<20000; i++) {
   10.54 +    for (int i=0; i<warmupIters; i++) {
   10.55        dtest.run();
   10.56      }
   10.57      System.out.println("Finished decryption warm-up");

mercurial