#include "pch.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "misc.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
#else
static word64 Te[256];
#endif
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
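// Annotation (added, not from the upstream sources): with unaligned access
// allowed, each 64-bit Te/Td entry packs the same 4-byte table column twice
// at a one-byte stagger, so all four rotations of a classic T-table row can
// be fetched with unaligned word32 loads from a single 2KB table. Without
// unaligned access, four explicitly rotated word32 tables are used instead.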
static volatile bool s_TeFilled = false, s_TdFilled = false;

#define QUARTER_ROUND(L, T, t, a, b, c, d) \
    a ^= L(T, 3, byte(t)); t >>= 8;\
    b ^= L(T, 2, byte(t)); t >>= 8;\
    c ^= L(T, 1, byte(t)); t >>= 8;\
    d ^= L(T, 0, t);
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
    tempBlock[d] = ((byte *)(Te+t))[1];
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
    tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
// Without unaligned access, Te/Td are word32[4*256] rather than packed
// 8-byte entries, so byte-offset tricks do not apply; use the plain S-boxes.
#define QUARTER_ROUND_LE(t, a, b, c, d) \
    tempBlock[a] = Se[byte(t)]; t >>= 8;\
    tempBlock[b] = Se[byte(t)]; t >>= 8;\
    tempBlock[c] = Se[byte(t)]; t >>= 8;\
    tempBlock[d] = Se[t];
#define QUARTER_ROUND_LD(t, a, b, c, d) \
    tempBlock[a] = Sd[byte(t)]; t >>= 8;\
    tempBlock[b] = Sd[byte(t)]; t >>= 8;\
    tempBlock[c] = Sd[byte(t)]; t >>= 8;\
    tempBlock[d] = Sd[t];
#endif
00118
00119 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
00120 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
00121
00122 #ifdef IS_LITTLE_ENDIAN
00123 #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
00124 #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
00125 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
00126 #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
00127 #define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
00128 #else
00129 #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
00130 #define TL_M(T, i, x) T[i*256 + x]
00131 #endif
00132 #else
00133 #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
00134 #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
00135 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
00136 #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
00137 #define TL_M TL_F
00138 #else
00139 #define TL_F(T, i, x) rotrFixed(T[x], i*8)
00140 #define TL_M(T, i, x) T[i*256 + x]
00141 #endif
00142 #endif
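// Indexing sketch for TL_M above (added note, little-endian unaligned case):
// TL_M(T, i, x) loads a word32 at byte offset x*8 + (i+3)%4 + 1, i.e. i=0
// reads offset x*8+4 and i=3 reads offset x*8+3 of the packed 8-byte entry,
// each a different rotation of the same table column.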

#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
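// Worked example for the GF(2^8) helpers above (added): f2 multiplies by x
// modulo the AES polynomial 0x11b, so f2(0x57) == 0xae (high bit clear) and
// f2(0x83) == 0x106 ^ 0x11b == 0x1d (high bit set forces the reduction).
// f3/f9/fb/fd/fe assemble the MixColumns and InvMixColumns coefficients
// {03, 09, 0b, 0d, 0e} from the f2/f4/f8 powers by XOR.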

void Rijndael::Base::FillEncTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Se[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        Te[i] = word64(y | f3(x))<<32 | y;
#else
        word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
        for (int j=0; j<4; j++)
        {
            Te[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    Te[256] = Te[257] = 0;
#endif
    s_TeFilled = true;
}
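// Entry layout sketch for FillEncTable (annotation, assuming little-endian
// with CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS): for x = Se[i] the bytes of
// Te[i] are { 0, x, x, f2(x), f3(x), x, x, f2(x) }, so word32 loads at byte
// offsets 1..4 yield the four rotations of the column, and offset 1 alone
// exposes the raw S-box byte that QUARTER_ROUND_LE wants for the last round.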

void Rijndael::Base::FillDecTable()
{
    for (int i=0; i<256; i++)
    {
        byte x = Sd[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
        word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        Td[i] = word64(y | fb(x))<<32 | y | x;
#else
        word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
        for (int j=0; j<4; j++)
        {
            Td[i+j*256] = y;
            y = rotrFixed(y, 8);
        }
#endif
    }
    s_TdFilled = true;
}
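// The decryption table is packed the same way, but from the InvMixColumns
// coefficients fe/f9/fd/fb (0e, 09, 0d, 0b), with the plain inverse S-box
// byte parked at the GetNativeByteOrder()*7 offset that QUARTER_ROUND_LD
// reads in the final round. (Added note.)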

void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
    AssertValidKeyLength(keylen);

    m_rounds = keylen/4 + 6;
    m_key.New(4*(m_rounds+1));

    word32 *rk = m_key;

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)
    if (HasAESNI())
    {
        static const word32 rcLE[] = {
            0x01, 0x02, 0x04, 0x08,
            0x10, 0x20, 0x40, 0x80,
            0x1B, 0x36,
        };
        const word32 *rc = rcLE;

        __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
        memcpy(rk, userKey, keylen);

        while (true)
        {
            rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
            rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
            rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
            rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

            if (rk + keylen/4 + 4 == m_key.end())
                break;

            if (keylen == 24)
            {
                rk[10] = rk[ 4] ^ rk[ 9];
                rk[11] = rk[ 5] ^ rk[10];
                temp = _mm_insert_epi32(temp, rk[11], 3);
            }
            else if (keylen == 32)
            {
                temp = _mm_insert_epi32(temp, rk[11], 3);
                rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
                rk[13] = rk[ 5] ^ rk[12];
                rk[14] = rk[ 6] ^ rk[13];
                rk[15] = rk[ 7] ^ rk[14];
                temp = _mm_insert_epi32(temp, rk[15], 3);
            }
            else
                temp = _mm_insert_epi32(temp, rk[7], 3);

            rk += keylen/4;
        }

        if (!IsForwardTransformation())
        {
            rk = m_key;
            unsigned int i, j;

            std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));

            for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
            {
                temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
                *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
                *(__m128i *)(rk+j) = temp;
            }

            *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
        }

        return;
    }
#endif
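
    // The portable schedule below follows FIPS-197: W[i] = W[i-Nk] ^ g(W[i-1])
    // on Nk-word boundaries, where g is SubWord(RotWord(w)) ^ Rcon, plus a
    // plain SubWord step halfway through each AES-256 iteration. A condensed
    // sketch of g, using a hypothetical helper name (added, not part of the
    // library):
    //
    //   word32 g(word32 w, word32 rcon) {
    //       w = rotlFixed(w, 8);                          // RotWord
    //       word32 s = 0;
    //       for (int b = 0; b < 4; b++)                   // SubWord
    //           s |= word32(Se[GETBYTE(w, b)]) << (8*b);
    //       return s ^ rcon;
    //   }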

    GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
    const word32 *rc = rcon;
    word32 temp;

    while (true)
    {
        temp = rk[keylen/4-1];
        word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
        rk[keylen/4] = rk[0] ^ x ^ *(rc++);
        rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
        rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
        rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

        if (rk + keylen/4 + 4 == m_key.end())
            break;

        if (keylen == 24)
        {
            rk[10] = rk[ 4] ^ rk[ 9];
            rk[11] = rk[ 5] ^ rk[10];
        }
        else if (keylen == 32)
        {
            temp = rk[11];
            rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
            rk[13] = rk[ 5] ^ rk[12];
            rk[14] = rk[ 6] ^ rk[13];
            rk[15] = rk[ 7] ^ rk[14];
        }
        rk += keylen/4;
    }

    rk = m_key;

    if (IsForwardTransformation())
    {
        if (!s_TeFilled)
            FillEncTable();

        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
    }
    else
    {
        if (!s_TdFilled)
            FillDecTable();

        unsigned int i, j;

#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

        for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
        {
            temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
            temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
            temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
            temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
        }

        rk[i+0] = InverseMixColumn(rk[i+0]);
        rk[i+1] = InverseMixColumn(rk[i+1]);
        rk[i+2] = InverseMixColumn(rk[i+2]);
        rk[i+3] = InverseMixColumn(rk[i+3]);

        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
        temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
    }

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
}
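
// Callers normally reach the key schedule above through the public
// BlockCipher interface rather than UncheckedSetKey; an illustrative
// (untested) fragment:
//
//   CryptoPP::AES::Encryption enc;
//   enc.SetKey(key, 16);          // runs UncheckedSetKey with keylen = 16
//   enc.ProcessBlock(in, out);    // one 16-byte block, ECB-style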

void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    if (HasSSE2())
#else
    if (HasAESNI())
#endif
    {
        Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

    // timing attack countermeasure: touch every cache line of Te up front
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Te)+i);
    u &= Te[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;
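    // Why the loop above helps (added note): u is provably zero, so OR-ing it
    // into the state changes nothing, but the load chain drags every cache
    // line of Te into L1 before the key-dependent lookups begin, blunting
    // Bernstein-style cache-timing attacks.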

    QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
    QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
    QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
    QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_E(t3, s0, s1, s2, s3)
        QUARTER_ROUND_E(t2, s3, s0, s1, s2)
        QUARTER_ROUND_E(t1, s2, s3, s0, s1)
        QUARTER_ROUND_E(t0, s1, s2, s3, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_E(s3, t0, t1, t2, t3)
        QUARTER_ROUND_E(s2, t3, t0, t1, t2)
        QUARTER_ROUND_E(s1, t2, t3, t0, t1)
        QUARTER_ROUND_E(s0, t1, t2, t3, t0)

        rk += 8;
    } while (--r);

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
    QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
    QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
    QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
    {
        Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
        return;
    }
#endif

    typedef BlockGetAndPut<word32, NativeByteOrder> Block;

    word32 s0, s1, s2, s3, t0, t1, t2, t3;
    Block::Get(inBlock)(s0)(s1)(s2)(s3);

    const word32 *rk = m_key;
    s0 ^= rk[0];
    s1 ^= rk[1];
    s2 ^= rk[2];
    s3 ^= rk[3];
    t0 = rk[4];
    t1 = rk[5];
    t2 = rk[6];
    t3 = rk[7];
    rk += 8;

    // timing attack countermeasure: see the note after the analogous loop
    // in Enc::ProcessAndXorBlock
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    for (i=0; i<2048; i+=cacheLineSize)
#else
    for (i=0; i<1024; i+=cacheLineSize)
#endif
        u &= *(const word32 *)(((const byte *)Td)+i);
    u &= Td[255];
    s0 |= u; s1 |= u; s2 |= u; s3 |= u;

    QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
    QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
    QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
    QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

    unsigned int r = m_rounds/2 - 1;
    do
    {
        s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

        QUARTER_ROUND_D(t3, s2, s1, s0, s3)
        QUARTER_ROUND_D(t2, s1, s0, s3, s2)
        QUARTER_ROUND_D(t1, s0, s3, s2, s1)
        QUARTER_ROUND_D(t0, s3, s2, s1, s0)

        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

        QUARTER_ROUND_D(s3, t2, t1, t0, t3)
        QUARTER_ROUND_D(s2, t1, t0, t3, t2)
        QUARTER_ROUND_D(s1, t0, t3, t2, t1)
        QUARTER_ROUND_D(s0, t3, t2, t1, t0)

        rk += 8;
    } while (--r);

#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
    // QUARTER_ROUND_LD uses the plain Sd table in this configuration, so
    // preload its cache lines as well before the final round
    u = 0;
    for (i=0; i<256; i+=cacheLineSize)
        u &= *(const word32 *)(Sd+i);
    u &= *(const word32 *)(Sd+252);
    t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

    word32 tbw[4];
    byte *const tempBlock = (byte *)tbw;

    QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
    QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
    QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
    QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

    Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code

#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE

CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
#if CRYPTOPP_BOOL_X86

#define L_REG          esp
#define L_INDEX(i)     (L_REG+768+i)
#define L_INXORBLOCKS  L_INBLOCKS+4
#define L_OUTXORBLOCKS L_INBLOCKS+8
#define L_OUTBLOCKS    L_INBLOCKS+12
#define L_INCREMENTS   L_INDEX(16*15)
#define L_SP           L_INDEX(16*16)
#define L_LENGTH       L_INDEX(16*16+4)
#define L_KEYS_BEGIN   L_INDEX(16*16+8)

#define MOVD movd
#define MM(i) mm##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
    AS2( pxor MM(a), mm7)\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG          r8
#define L_INDEX(i)     (L_REG+i)
#define L_INXORBLOCKS  L_INBLOCKS+8
#define L_OUTXORBLOCKS L_INBLOCKS+16
#define L_OUTBLOCKS    L_INBLOCKS+24
#define L_INCREMENTS   L_INDEX(16*16)
#define L_LENGTH       L_INDEX(16*18+8)
#define L_KEYS_BEGIN   L_INDEX(16*19)

#define MOVD mov
#define MM_0 r9d
#define MM_1 r12d
#ifdef __GNUC__
#define MM_2 r11d
#else
#define MM_2 r10d
#endif
#define MM(i) MM_##i

#define MXOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MMOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#endif

#define L_SUBKEYS   L_INDEX(0)
#define L_SAVED_X   L_SUBKEYS
#define L_KEY12     L_INDEX(16*12)
#define L_LASTROUND L_INDEX(16*13)
#define L_INBLOCKS  L_INDEX(16*14)
#define MAP0TO4(i)  (ASM_MOD(i+3,4)+1)
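// MAP0TO4(i) = (i+3)%4 + 1 remaps column index 0..3 to byte offsets 4,1,2,3
// within the packed 8-byte rdtable::Te entries, mirroring the TL_M
// addressing used by the portable C++ rounds. (Added note for readers of
// the assembly.)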

#define XOR(a,b,c) \
    AS2( movzx esi, b)\
    AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define MOV(a,b,c) \
    AS2( movzx esi, b)\
    AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#ifdef CRYPTOPP_GENERATE_X64_MASM
    ALIGN 8
    Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
    rex_push_reg rsi
    push_reg rdi
    push_reg rbx
    push_reg r12
    .endprolog
    mov L_REG, rcx
    mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
    mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
#elif defined(__GNUC__)
    __asm__ __volatile__
    (
    ".intel_syntax noprefix;"
#if CRYPTOPP_BOOL_X64
    AS2( mov L_REG, rcx)
#endif
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( mov AS_REG_7, WORD_REG(si))
#else
    AS_PUSH_IF86(si)
    AS_PUSH_IF86(di)
    AS_PUSH_IF86(bx)
    AS_PUSH_IF86(bp)
    AS2( lea AS_REG_7, [Te])
    AS2( mov edi, [g_cacheLineSize])
#endif

#if CRYPTOPP_BOOL_X86
    AS2( mov [ecx+16*12+16*4], esp)   // save esp to L_SP
    AS2( lea esp, [ecx-768])          // point esp so L_INDEX() addresses the locals
#endif

    // copy subkeys to the stack workspace
    AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
    AS2( mov WORD_REG(ax), 16)
    AS2( and WORD_REG(ax), WORD_REG(si))
    AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])
    AS2( movdqa [L_KEY12], xmm3)
    AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
    AS2( sub WORD_REG(ax), WORD_REG(si))
    ASL(0)
    AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
    AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
    AS2( add WORD_REG(si), 16)
    AS2( cmp WORD_REG(si), 16*12)
    ASJ( jl, 0, b)

    AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)])  // last subkey
    AS2( movdqa xmm1, [WORD_REG(dx)])               // subkey 0
    AS2( MOVD MM(1), [WORD_REG(dx)+4*4])            // subkey 1
    AS2( mov ebx, [WORD_REG(dx)+5*4])
    AS2( mov ecx, [WORD_REG(dx)+6*4])
    AS2( mov edx, [WORD_REG(dx)+7*4])

    // timing attack countermeasure: read the whole table into cache
    AS2( xor WORD_REG(ax), WORD_REG(ax))
    ASL(9)
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
    AS2( add WORD_REG(ax), WORD_REG(di))
    AS2( cmp WORD_REG(ax), 2048)
    ASJ( jl, 9, b)
    AS1( lfence)

    AS2( test DWORD PTR [L_LENGTH], 1)  // odd lengthAndCounterFlag marks counter mode
    ASJ( jz, 8, f)

    // counter mode one-time setup
    AS2( mov WORD_REG(si), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(si)])   // counter block
    AS2( pxor xmm2, xmm1)
    AS2( psrldq xmm1, 14)
    AS2( movd eax, xmm1)
    AS2( mov al, BYTE PTR [WORD_REG(si)+15])
    AS2( MOVD MM(2), eax)
#if CRYPTOPP_BOOL_X86
    AS2( mov eax, 1)
    AS2( movd mm3, eax)
#endif

    // partial first round
    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0)
    XOR( edx, ah, 1)
    AS2( shr eax, 16)
    XOR( ecx, al, 2)
    XOR( ebx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0)
    MXOR( 1, ah, 1)
    AS2( shr eax, 16)
    XOR( edx, al, 2)
    XOR( ecx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0)
    XOR( ebx, ah, 1)
    AS2( shr eax, 16)
    MXOR( 1, al, 2)
    XOR( edx, ah, 3)
    AS2( mov eax, edi)
    XOR( edx, al, 0)
    XOR( ecx, ah, 1)
    AS2( shr eax, 16)
    XOR( ebx, al, 2)
    AS2( psrldq xmm2, 3)

    // partial second round
    AS2( mov eax, [L_KEY12+0*4])
    AS2( mov edi, [L_KEY12+2*4])
    AS2( MOVD MM(0), [L_KEY12+3*4])
    MXOR( 0, cl, 3)
    XOR( edi, bl, 3)
    MXOR( 0, bh, 2)
    AS2( shr ebx, 16)
    XOR( eax, bl, 1)
    MOV( ebx, bh, 0)
    AS2( xor ebx, [L_KEY12+1*4])
    XOR( eax, ch, 2)
    AS2( shr ecx, 16)
    XOR( eax, dl, 3)
    XOR( ebx, dh, 2)
    AS2( shr edx, 16)
    XOR( edi, ch, 0)
    XOR( ebx, cl, 1)
    XOR( edi, dl, 1)
    MXOR( 0, dh, 0)

    // save the partial result for reuse across counter blocks
    AS2( movd ecx, xmm2)
    AS2( MOVD edx, MM(1))
    AS2( MOVD [L_SAVED_X+3*4], MM(0))
    AS2( mov [L_SAVED_X+0*4], eax)
    AS2( mov [L_SAVED_X+1*4], ebx)
    AS2( mov [L_SAVED_X+2*4], edi)
    ASJ( jmp, 5, f)

    ASL(3)
    // non-counter mode: reload the round-key words for each block
    AS2( MOVD MM(1), [L_KEY12+0*4])
    AS2( mov ebx, [L_KEY12+1*4])
    AS2( mov ecx, [L_KEY12+2*4])
    AS2( mov edx, [L_KEY12+3*4])
    ASL(8)
    AS2( mov WORD_REG(ax), [L_INBLOCKS])
    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( mov WORD_REG(si), [L_INXORBLOCKS])
    AS2( movdqu xmm5, [WORD_REG(si)])
    AS2( pxor xmm2, xmm1)
    AS2( pxor xmm2, xmm5)

    // first round
    AS2( movd eax, xmm2)
    AS2( psrldq xmm2, 4)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    MXOR( 1, al, 0)
    XOR( edx, ah, 1)
    AS2( shr eax, 16)
    XOR( ecx, al, 2)
    XOR( ebx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    AS2( psrldq xmm2, 4)
    XOR( ebx, al, 0)
    MXOR( 1, ah, 1)
    AS2( shr eax, 16)
    XOR( edx, al, 2)
    XOR( ecx, ah, 3)
    AS2( mov eax, edi)
    AS2( movd edi, xmm2)
    XOR( ecx, al, 0)
    XOR( ebx, ah, 1)
    AS2( shr eax, 16)
    MXOR( 1, al, 2)
    XOR( edx, ah, 3)
    AS2( mov eax, edi)
    XOR( edx, al, 0)
    XOR( ecx, ah, 1)
    AS2( shr eax, 16)
    XOR( ebx, al, 2)
    MXOR( 1, ah, 3)
    AS2( MOVD eax, MM(1))

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 4*16)
    ASJ( jmp, 2, f)

    ASL(1)
    // counter mode: reuse the saved partial rounds, patching in the
    // contribution of the changed last counter byte
    AS2( MOVD ecx, MM(2))
    AS2( MOVD edx, MM(1))
    AS2( mov eax, [L_SAVED_X+0*4])
    AS2( mov ebx, [L_SAVED_X+1*4])
    AS2( xor cl, ch)
    AS2( and WORD_REG(cx), 255)
    ASL(5)
#if CRYPTOPP_BOOL_X86
    AS2( paddb MM(2), mm3)
#else
    AS2( add MM(2), 1)
#endif

    AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
    XOR( ebx, dl, 3)
    MOV( ecx, dh, 2)
    AS2( shr edx, 16)
    AS2( xor ecx, [L_SAVED_X+2*4])
    XOR( eax, dh, 0)
    MOV( edx, dl, 1)
    AS2( xor edx, [L_SAVED_X+3*4])

    AS2( add L_REG, [L_KEYS_BEGIN])
    AS2( add L_REG, 3*16)
    ASJ( jmp, 4, f)

#define ROUND() \
    MXOR( 0, cl, 3) \
    AS2( mov cl, al) \
    XOR( edi, ah, 2) \
    AS2( shr eax, 16) \
    XOR( edi, bl, 3) \
    MXOR( 0, bh, 2) \
    AS2( shr ebx, 16) \
    MXOR( 0, al, 1) \
    MOV( eax, ah, 0) \
    XOR( eax, bl, 1) \
    MOV( ebx, bh, 0) \
    XOR( eax, ch, 2) \
    XOR( ebx, cl, 3) \
    AS2( shr ecx, 16) \
    XOR( eax, dl, 3) \
    XOR( ebx, dh, 2) \
    AS2( shr edx, 16) \
    XOR( edi, ch, 0) \
    XOR( ebx, cl, 1) \
    XOR( edi, dl, 1) \
    MXOR( 0, dh, 0) \

    ASL(2)  // inner loop: two table-lookup rounds per iteration
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
    AS2( mov edi, [L_SUBKEYS-4*16+2*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+0*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
    AS2( MOVD edx, MM(0))

    ASL(4)
    AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
    AS2( mov edi, [L_SUBKEYS-4*16+6*4])
    ROUND()
    AS2( mov ecx, edi)
    AS2( xor eax, [L_SUBKEYS-4*16+4*4])
    AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
    AS2( MOVD edx, MM(0))

    AS2( add L_REG, 32)
    AS2( test L_REG, 255)
    ASJ( jnz, 2, b)
    AS2( sub L_REG, 16*16)

#define LAST(a, b, c) \
    AS2( movzx esi, a )\
    AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
    AS2( movzx esi, b )\
    AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
    AS2( mov WORD PTR [L_LASTROUND+c], di )\

    // last round
    LAST(ch, dl, 2)
    LAST(dh, al, 6)
    AS2( shr edx, 16)
    LAST(ah, bl, 10)
    AS2( shr eax, 16)
    LAST(bh, cl, 14)
    AS2( shr ebx, 16)
    LAST(dh, al, 12)
    AS2( shr ecx, 16)
    LAST(ah, bl, 0)
    LAST(bh, cl, 4)
    LAST(ch, dl, 8)

    // xor with the output xor block and store the result
    AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
    AS2( mov WORD_REG(bx), [L_OUTBLOCKS])

    AS2( mov WORD_REG(cx), [L_LENGTH])
    AS2( sub WORD_REG(cx), 16)

    AS2( movdqu xmm2, [WORD_REG(ax)])
    AS2( pxor xmm2, xmm4)

#if CRYPTOPP_BOOL_X86
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddd xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#else
    AS2( movdqa xmm0, [L_INCREMENTS+16])
    AS2( paddq xmm0, [L_INBLOCKS+16])
    AS2( movdqa [L_INBLOCKS+16], xmm0)
#endif

    AS2( pxor xmm2, [L_LASTROUND])
    AS2( movdqu [WORD_REG(bx)], xmm2)

    ASJ( jle, 7, f)
    AS2( mov [L_LENGTH], WORD_REG(cx))
    AS2( test WORD_REG(cx), 1)
    ASJ( jnz, 1, b)
#if CRYPTOPP_BOOL_X64
    AS2( movdqa xmm0, [L_INCREMENTS])
    AS2( paddq xmm0, [L_INBLOCKS])
    AS2( movdqa [L_INBLOCKS], xmm0)
#endif
    ASJ( jmp, 3, b)

    ASL(7)
    // erase the subkey copies from the stack before returning
    AS2( xorps xmm0, xmm0)
    AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
    AS2( movaps [WORD_REG(ax)-7*16], xmm0)
    AS2( movaps [WORD_REG(ax)-6*16], xmm0)
    AS2( movaps [WORD_REG(ax)-5*16], xmm0)
    AS2( movaps [WORD_REG(ax)-4*16], xmm0)
    AS2( movaps [WORD_REG(ax)-3*16], xmm0)
    AS2( movaps [WORD_REG(ax)-2*16], xmm0)
    AS2( movaps [WORD_REG(ax)-1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+0*16], xmm0)
    AS2( movaps [WORD_REG(ax)+1*16], xmm0)
    AS2( movaps [WORD_REG(ax)+2*16], xmm0)
    AS2( movaps [WORD_REG(ax)+3*16], xmm0)
    AS2( movaps [WORD_REG(ax)+4*16], xmm0)
    AS2( movaps [WORD_REG(ax)+5*16], xmm0)
    AS2( movaps [WORD_REG(ax)+6*16], xmm0)
#if CRYPTOPP_BOOL_X86
    AS2( mov esp, [L_SP])
    AS1( emms)
#endif
    AS_POP_IF86(bp)
    AS_POP_IF86(bx)
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
    AS_POP_IF86(di)
    AS_POP_IF86(si)
    AS1(ret)
#endif
#ifdef CRYPTOPP_GENERATE_X64_MASM
    pop r12
    pop rbx
    pop rdi
    pop rsi
    ret
    Rijndael_Enc_AdvancedProcessBlocks ENDP
#endif
#ifdef __GNUC__
    ".att_syntax prefix;"
    :
    : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
    : "memory", "cc", "%eax"
#if CRYPTOPP_BOOL_X64
    , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
#endif
    );
#endif
}

#endif

#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
}
#endif

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86

static inline bool AliasedWithTable(const byte *begin, const byte *end)
{
    size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
    size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
    if (t1 > t0)
        return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
    else
        return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
}
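// AliasedWithTable reports whether a byte range shares 4KB-page offsets
// with Te, i.e. whether it could contend for the same cache sets. The
// SSE2 path below keeps re-aligning its alloca'd workspace until this
// returns false, so the stack locals cannot evict table lines mid-block.
// (Added note.)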

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesenc_si128(block, subkeys[i]);
        block = _mm_aesenc_si128(block, subkeys[i+1]);
    }
    block = _mm_aesenc_si128(block, subkeys[rounds-1]);
    block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
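// rounds is even here (10/12/14), so the 2x-unrolled loop plus the two tail
// statements perform exactly rounds-1 aesenc steps and one aesenclast.
// Illustrative call, with a hypothetical subkeys pointer holding the 11
// round keys of AES-128: AESNI_Enc_Block(block, subkeys, 10); (Added note.)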

inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesenc_si128(block0, rk);
        block1 = _mm_aesenc_si128(block1, rk);
        block2 = _mm_aesenc_si128(block2, rk);
        block3 = _mm_aesenc_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesenclast_si128(block0, rk);
    block1 = _mm_aesenclast_si128(block1, rk);
    block2 = _mm_aesenclast_si128(block2, rk);
    block3 = _mm_aesenclast_si128(block3, rk);
}

inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
{
    block = _mm_xor_si128(block, subkeys[0]);
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        block = _mm_aesdec_si128(block, subkeys[i]);
        block = _mm_aesdec_si128(block, subkeys[i+1]);
    }
    block = _mm_aesdec_si128(block, subkeys[rounds-1]);
    block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}

inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
{
    __m128i rk = subkeys[0];
    block0 = _mm_xor_si128(block0, rk);
    block1 = _mm_xor_si128(block1, rk);
    block2 = _mm_xor_si128(block2, rk);
    block3 = _mm_xor_si128(block3, rk);
    for (unsigned int i=1; i<rounds; i++)
    {
        rk = subkeys[i];
        block0 = _mm_aesdec_si128(block0, rk);
        block1 = _mm_aesdec_si128(block1, rk);
        block2 = _mm_aesdec_si128(block2, rk);
        block3 = _mm_aesdec_si128(block3, rk);
    }
    rk = subkeys[rounds];
    block0 = _mm_aesdeclast_si128(block0, rk);
    block1 = _mm_aesdeclast_si128(block1, rk);
    block2 = _mm_aesdeclast_si128(block2, rk);
    block3 = _mm_aesdeclast_si128(block3, rk);
}

static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
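// s_one viewed as bytes is fifteen zeros then 0x01, so _mm_add_epi32 with it
// increments the last byte of a big-endian counter block; carries out of
// that byte are dropped, matching the bare inBlocks[15]++ in the
// single-block loop below. (Added note.)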

template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    size_t blockSize = 16;
    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = xorBlocks ? blockSize : 0;
    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

    if (flags & BlockTransformation::BT_ReverseDirection)
    {
        assert(length % blockSize == 0);
        inBlocks += length - blockSize;
        xorBlocks += length - blockSize;
        outBlocks += length - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BlockTransformation::BT_AllowParallel)
    {
        while (length >= 4*blockSize)
        {
            __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
            if (flags & BlockTransformation::BT_InBlockIsCounter)
            {
                const __m128i be1 = *(const __m128i *)s_one;
                block1 = _mm_add_epi32(block0, be1);
                block2 = _mm_add_epi32(block1, be1);
                block3 = _mm_add_epi32(block2, be1);
                _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
            }
            else
            {
                inBlocks += inIncrement;
                block1 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block2 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
                block3 = _mm_loadu_si128((const __m128i *)inBlocks);
                inBlocks += inIncrement;
            }

            if (flags & BlockTransformation::BT_XorInput)
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            func4(block0, block1, block2, block3, subkeys, rounds);

            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            {
                block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
                block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
                xorBlocks += xorIncrement;
            }

            _mm_storeu_si128((__m128i *)outBlocks, block0);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block1);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block2);
            outBlocks += outIncrement;
            _mm_storeu_si128((__m128i *)outBlocks, block3);
            outBlocks += outIncrement;

            length -= 4*blockSize;
        }
    }

    while (length >= blockSize)
    {
        __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);

        if (flags & BlockTransformation::BT_XorInput)
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        if (flags & BlockTransformation::BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subkeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

        _mm_storeu_si128((__m128i *)outBlocks, block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
#endif

size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
#endif

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
    if (HasSSE2())
    {
        if (length < BLOCKSIZE)
            return length;

        struct Locals
        {
            word32 subkeys[4*12], workspace[8];
            const byte *inBlocks, *inXorBlocks, *outXorBlocks;
            byte *outBlocks;
            size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
            size_t regSpill, lengthAndCounterFlag, keysBegin;
        };
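        // Locals mirrors the L_* stack offsets hard-coded in the assembly:
        // subkeys occupies bytes 0..191 (L_SUBKEYS), workspace the next 32
        // bytes (L_KEY12, L_LASTROUND), then the block pointers at L_INBLOCKS.
        // Any change here must be matched in the L_INDEX defines. (Added note.)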

        size_t increment = BLOCKSIZE;
        const byte* zeros = (byte *)(Te+256);
        byte *space;

        do {
            space = (byte *)alloca(255+sizeof(Locals));
            space += (256-(size_t)space%256)%256;
        }
        while (AliasedWithTable(space, space+sizeof(Locals)));

        if (flags & BT_ReverseDirection)
        {
            assert(length % BLOCKSIZE == 0);
            inBlocks += length - BLOCKSIZE;
            xorBlocks += length - BLOCKSIZE;
            outBlocks += length - BLOCKSIZE;
            increment = 0-increment;
        }

        Locals &locals = *(Locals *)space;

        locals.inBlocks = inBlocks;
        locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
        locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
        locals.outBlocks = outBlocks;

        locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
        locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
        locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
        locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

        locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
        int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
        locals.keysBegin = (12-keysToCopy)*16;

        Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
        return length % BLOCKSIZE;
    }
#endif

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
{
    if (HasAESNI())
        return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}

#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

NAMESPACE_END

#endif
#endif