Skip to content

Commit

Permalink
- added tfloat float+double DotProduct benchmark for the various inca…
Browse files Browse the repository at this point in the history
…ntations: `unittest/tfloat_benchmark.cc`

- working towards float+double co-existence as desired in stweil#2 (comment) using function templates for DRY as per query in stweil#2 (comment)
- fix a typo in the OpenMP code. (Probably me earlier this morning, too hurried.)
  • Loading branch information
GerHobbelt committed Jul 13, 2021
1 parent 6d01734 commit 6b59323
Show file tree
Hide file tree
Showing 8 changed files with 358 additions and 46 deletions.
7 changes: 6 additions & 1 deletion src/arch/dotproduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,18 @@
namespace tesseract {

// Computes and returns the dot product of the two n-vectors u and v.
template <class TFloat>
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
TFloat total = 0;
#pragma omp simdi reduction(+:total)
#pragma omp simd reduction(+:total)
for (int k = 0; k < n; k++) {
total += u[k] * v[k];
}
return total;
}

// two instantiations: float & double.
template float DotProductNative<float>(const float *u, const float *v, int n);
template double DotProductNative<double>(const double *u, const double *v, int n);

} // namespace tesseract
37 changes: 29 additions & 8 deletions src/arch/dotproduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,43 @@
namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Portable reference implementation; instantiated for float and double.
template <class TFloat>
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);

// ------------ FAST FLOAT specializations -----------------

// Uses Intel AVX intrinsics to access the SIMD instruction set.
float DotProductAVX(const float *u, const float *v, int n);
float DotProductAVX1(const float *u, const float *v, int n);
float DotProductAVX2(const float *u, const float *v, int n);
float DotProductAVX3(const float *u, const float *v, int n);
float DotProductAVX4(const float *u, const float *v, int n);

// Use Intel FMA.
float DotProductFMA(const float *u, const float *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
float DotProductSSE(const float *u, const float *v, int n);

float DotProductAccelerate(const float *u, const float *v, int n);

// ------------ HIGH PRECISION DOUBLE specializations -----------------

// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double *u, const double *v, int n);
double DotProductAVX1(const double *u, const double *v, int n);
double DotProductAVX2(const double *u, const double *v, int n);
double DotProductAVX3(const double *u, const double *v, int n);
double DotProductAVX4(const double *u, const double *v, int n);

// Use Intel FMA.
double DotProductFMA(const double *u, const double *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double *u, const double *v, int n);

double DotProductAccelerate(const double *u, const double *v, int n);

} // namespace tesseract.

#endif // TESSERACT_ARCH_DOTPRODUCT_H_
8 changes: 5 additions & 3 deletions src/arch/dotproductavx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
float DotProductAVX(const float *u, const float *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand Down Expand Up @@ -76,7 +77,7 @@ float DotProductAVX1(const float *u, const float *v, int n) {
return result;
}

#else
// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductAVX1(const double *u, const double *v, int n) {
__m256d t0 = _mm256_setzero_pd();
Expand Down Expand Up @@ -131,7 +132,8 @@ double DotProductAVX(const double *u, const double *v, int n) {
}
return result;
}
#endif

// ---------------------------- END FLOAT/DOUBLE sections ------------------------

} // namespace tesseract.

Expand Down
29 changes: 18 additions & 11 deletions src/arch/dotproductfma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
float DotProductFMA(const float *u, const float *v, int n) {
const unsigned quot = n / 16;
const unsigned rem = n % 16;
__m256 t0 = _mm256_setzero_ps();
Expand All @@ -44,15 +45,17 @@ TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
v += 8;
}
t0 = _mm256_hadd_ps(t0, t1);
alignas(32) TFloat tmp[8];
alignas(32) float tmp[8];
_mm256_store_ps(tmp, t0);
TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
#else

// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductFMA(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand All @@ -79,19 +82,23 @@ double DotProductFMA(const double *u, const double *v, int n) {
}
return result;
}
#endif

// ---------------------------- END section ------------------------

} // namespace tesseract.

#else

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// NOTE(review): this appears to be the non-FMA fallback branch (guarded by the
// preceding #else) — both overloads simply delegate to the SSE implementation.
inline float DotProductFMA(const float *u, const float *v, int n) {
  return DotProductSSE(u, v, n);
}
inline double DotProductFMA(const double *u, const double *v, int n) {
  return DotProductSSE(u, v, n);
}

} // namespace tesseract

Expand Down
14 changes: 9 additions & 5 deletions src/arch/dotproductsse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
float DotProductSSE(const float *u, const float *v, int n) {
int max_offset = n - 4;
int offset = 0;
// Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
Expand Down Expand Up @@ -85,8 +86,10 @@ TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
}
return result;
}
#else
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {

// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductSSE(const double *u, const double *v, int n) {
int max_offset = n - 2;
int offset = 0;
// Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
Expand Down Expand Up @@ -135,7 +138,8 @@ TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
}
return result;
}
#endif

// ---------------------------- END section ------------------------

} // namespace tesseract.

Expand Down
19 changes: 2 additions & 17 deletions src/lstm/weightmatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,26 +65,11 @@ static void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, GENERIC_2D_ARRAY<d
}

// Reads a TFloat matrix from fp. The on-disk representation is always double
// precision (the pre-existing FAST_FLOAT path deserialized doubles and then
// converted); DeSerialize<double> performs any needed conversion to the
// runtime TFloat type. Returns false on read failure.
static bool DeSerialize(TFile *fp, GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
  return tfloat_array.DeSerialize<double>(fp);
}

// Writes a TFloat matrix to fp in the canonical on-disk format, which is
// always double precision regardless of the runtime TFloat type.
// Returns false on write failure.
static bool Serialize(TFile *fp, const GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
  return tfloat_array.Serialize<double>(fp);
}


Expand Down
1 change: 0 additions & 1 deletion unittest/dotproduct_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ void DotProductTest::RunTest(TFloat (*f)(const TFloat *u, const TFloat *v, int n
}
}

TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n);
TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
TFloat total = 0;
#pragma omp simd reduction(+:total)
Expand Down
Loading

0 comments on commit 6b59323

Please sign in to comment.