Bugfix of FMA port to FAST_FLOAT: 8 float FPs fit in a single 256-bit vector (8x32), in contrast to 4 double FPs (4x64)
stweil · stweil · Jul 13, 2021 · Jul 13, 2021 · Jul 13, 2021 · 30bf2635fd6a31e534e354608052db415ea12ba9
commit 30bf2635fd6a31e534e354608052db415ea12ba9
diff --git a/src/arch/dotproductfma.cpp b/src/arch/dotproductfma.cpp
@@ -31,26 +31,26 @@ namespace tesseract {
 // Uses Intel FMA intrinsics to access the SIMD instruction set.
 #if defined(FAST_FLOAT)
 TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
-  const unsigned quot = n / 8;
-  const unsigned rem = n % 8;
+  const unsigned quot = n / 16;
+  const unsigned rem = n % 16;
   __m256 t0 = _mm256_setzero_ps();
   __m256 t1 = _mm256_setzero_ps();
   for (unsigned k = 0; k < quot; k++) {
     __m256 f0 = _mm256_loadu_ps(u);
     __m256 f1 = _mm256_loadu_ps(v);
     t0 = _mm256_fmadd_ps(f0, f1, t0);
-    u += 4;
-    v += 4;
+    u += 8;
+    v += 8;
     __m256 f2 = _mm256_loadu_ps(u);
     __m256 f3 = _mm256_loadu_ps(v);
     t1 = _mm256_fmadd_ps(f2, f3, t1);
-    u += 4;
-    v += 4;
+    u += 8;
+    v += 8;
   }
   t0 = _mm256_hadd_ps(t0, t1);
-  alignas(32) float tmp[4];
+  alignas(32) TFloat tmp[8];
   _mm256_store_ps(tmp, t0);
-  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
   for (unsigned k = 0; k < rem; k++) {
     result += *u++ * *v++;
   }