From d23ec1dbcd4af427bcf007192067755e4c726d2f Mon Sep 17 00:00:00 2001
From: Ger Hobbelt
Date: Tue, 13 Jul 2021 09:20:39 +0200
Subject: [PATCH] extracted from 3490: implements DotProductSSE() for FAST_FLOAT

---
 src/arch/dotproductsse.cpp | 64 ++++++++++++++++++++++++++++++++++----
 src/arch/intsimdmatrix.h   |  2 +-
 2 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/src/arch/dotproductsse.cpp b/src/arch/dotproductsse.cpp
index ec94f50341..9122e9d1b1 100644
--- a/src/arch/dotproductsse.cpp
+++ b/src/arch/dotproductsse.cpp
@@ -31,12 +31,63 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel SSE intrinsics to access the SIMD instruction set.
 #if defined(FAST_FLOAT)
-TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
-  TFloat total = 0.0;
-  for (int k = 0; k < n; ++k) {
-    total += u[k] * v[k];
+float DotProductSSE(const float *u, const float *v, int n) {
+  int max_offset = n - 4;
+  int offset = 0;
+  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
+  // v, and multiplying them together in parallel.
+  __m128 sum = _mm_setzero_ps();
+  if (offset <= max_offset) {
+    offset = 4;
+    // Aligned load is reputedly faster but requires 16 byte aligned input.
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
+        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+      // Use aligned load.
+      sum = _mm_load_ps(u);
+      __m128 floats2 = _mm_load_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_load_ps(u + offset);
+        floats2 = _mm_load_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
+      }
+    } else {
+      // Use unaligned load.
+      sum = _mm_loadu_ps(u);
+      __m128 floats2 = _mm_loadu_ps(v);
+      // Multiply.
+      sum = _mm_mul_ps(sum, floats2);
+      while (offset <= max_offset) {
+        __m128 floats1 = _mm_loadu_ps(u + offset);
+        floats2 = _mm_loadu_ps(v + offset);
+        floats1 = _mm_mul_ps(floats1, floats2);
+        sum = _mm_add_ps(sum, floats1);
+        offset += 4;
+      }
+    }
   }
-  return total;
+  // Add the 4 sums in sum horizontally.
+#if 0
+  alignas(32) float tmp[4];
+  _mm_store_ps(tmp, sum);
+  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+#else
+  __m128 zero = _mm_setzero_ps();
+  // https://www.felixcloutier.com/x86/haddps
+  sum = _mm_hadd_ps(sum, zero);
+  sum = _mm_hadd_ps(sum, zero);
+  // Extract the low result.
+  float result = _mm_cvtss_f32(sum);
+#endif
+  // Add on any left-over products.
+  while (offset < n) {
+    result += u[offset] * v[offset];
+    ++offset;
+  }
+  return result;
 }
 #else
 double DotProductSSE(const double *u, const double *v, int n) {
@@ -48,7 +99,8 @@ double DotProductSSE(const double *u, const double *v, int n) {
   if (offset <= max_offset) {
     offset = 2;
     // Aligned load is reputedly faster but requires 16 byte aligned input.
-    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 && (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
+    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
+        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
       // Use aligned load.
       sum = _mm_load_pd(u);
       __m128d floats2 = _mm_load_pd(v);
diff --git a/src/arch/intsimdmatrix.h b/src/arch/intsimdmatrix.h
index aa05a450ee..98a894f0ca 100644
--- a/src/arch/intsimdmatrix.h
+++ b/src/arch/intsimdmatrix.h
@@ -115,7 +115,7 @@ struct TESS_API IntSimdMatrix {
   static const IntSimdMatrix *intSimdMatrix;
   // Only available with NEON.
   static const IntSimdMatrix *intSimdMatrixNEON;
-  // Only available with AVX2 / SSE.
+  // Only available with AVX2 / AVX / FMA / SSE.
   static const IntSimdMatrix *intSimdMatrixAVX2;
   static const IntSimdMatrix *intSimdMatrixSSE;
 };