Work around compilers in which _mm_loadu_si64 is missing.

The X86 AESNI feature probe now exercises _mm_loadu_si64 and, as a last
resort, is retried with -DBROKEN_MM_LOADU_SI64.  When that macro is defined,
crypto_aesctr_aesni.c loads 64-bit values via _mm_load_sd (through the new
load_si64() helper) instead of _mm_loadu_si64.

diff --git a/libcperciva/cpusupport/Build/cpusupport-X86-AESNI.c b/libcperciva/cpusupport/Build/cpusupport-X86-AESNI.c
index 5ddda7fe..53ac5fe1 100644
--- a/libcperciva/cpusupport/Build/cpusupport-X86-AESNI.c
+++ b/libcperciva/cpusupport/Build/cpusupport-X86-AESNI.c
@@ -23,7 +23,12 @@ main(void)
 	uint8_t a[16];
 
 	x = load_128(a);
-	y = _mm_aesenc_si128(x, x);
+#ifdef BROKEN_MM_LOADU_SI64
+	y = _mm_loadu_si128((const __m128i *)a);
+#else
+	y = _mm_loadu_si64(a);
+#endif
+	y = _mm_aesenc_si128(x, y);
 	_mm_storeu_si128((__m128i *)&a[0], y);
 	return (a[0]);
 }
diff --git a/libcperciva/cpusupport/Build/cpusupport.sh b/libcperciva/cpusupport/Build/cpusupport.sh
index 43754dfe..4c9a3f00 100755
--- a/libcperciva/cpusupport/Build/cpusupport.sh
+++ b/libcperciva/cpusupport/Build/cpusupport.sh
@@ -60,7 +60,9 @@ feature X86 CPUID_COUNT ""
 feature X86 AESNI "" "-maes" \
     "-maes -Wno-cast-align" \
     "-maes -Wno-missing-prototypes -Wno-cast-qual" \
-    "-maes -Wno-missing-prototypes -Wno-cast-qual -Wno-cast-align"
+    "-maes -Wno-missing-prototypes -Wno-cast-qual -Wno-cast-align" \
+    "-maes -Wno-missing-prototypes -Wno-cast-qual -Wno-cast-align \
+    -DBROKEN_MM_LOADU_SI64"
 feature X86 CRC32_64 "" "-msse4.2" \
     "-msse4.2 -Wno-cast-align" \
     "-msse4.2 -Wno-cast-align -fno-strict-aliasing" \
diff --git a/libcperciva/crypto/crypto_aesctr_aesni.c b/libcperciva/crypto/crypto_aesctr_aesni.c
index de80f414..5967edbb 100644
--- a/libcperciva/crypto/crypto_aesctr_aesni.c
+++ b/libcperciva/crypto/crypto_aesctr_aesni.c
@@ -30,6 +30,27 @@
  */
 #include "crypto_aesctr_shared.c"
 
+#ifdef BROKEN_MM_LOADU_SI64
+#warning Working around compiler bug: _mm_loadu_si64 is missing
+#warning Updating to a newer compiler may improve performance
+#endif
+
+/**
+ * load_si64(mem):
+ * Load an unaligned 64-bit integer from memory into the lowest 64 bits of the
+ * returned value.  The contents of the upper 64 bits are not defined.
+ */
+static inline __m128i
+load_si64(const void * mem)
+{
+
+#ifdef BROKEN_MM_LOADU_SI64
+	return (_mm_castpd_si128(_mm_load_sd(mem)));
+#else
+	return (_mm_loadu_si64(mem));
+#endif
+}
+
 /* Process multiple whole blocks by generating & using a cipherblock. */
 static void
 crypto_aesctr_aesni_stream_wholeblocks(struct crypto_aesctr * stream,
@@ -44,7 +65,7 @@ crypto_aesctr_aesni_stream_wholeblocks(struct crypto_aesctr * stream,
 	size_t i;
 
 	/* Load local variables from stream. */
-	nonce_be = _mm_loadu_si64(stream->pblk);
+	nonce_be = load_si64(stream->pblk);
 	block_counter = stream->bytectr / 16;
 
 	/* How many blocks should we process? */
@@ -60,7 +81,7 @@ crypto_aesctr_aesni_stream_wholeblocks(struct crypto_aesctr * stream,
 		be64enc(block_counter_be_arr, block_counter);
 
 		/* Encrypt the cipherblock. */
-		bufsse = _mm_loadu_si64(block_counter_be_arr);
+		bufsse = load_si64(block_counter_be_arr);
 		bufsse = _mm_unpacklo_epi64(nonce_be, bufsse);
 		bufsse = crypto_aes_encrypt_block_aesni_m128i(bufsse,
 		    stream->key);