Skip to content

Commit

Permalink
- added tfloat float+double DotProduct benchmark for the various inca…
Browse files Browse the repository at this point in the history
…ntations: `unittest/tfloat_benchmark.cc`

- working towards float+double co-existence as desired in stweil#2 (comment) using function templates for DRY as per query in stweil#2 (comment)
- fix a typo in the OpenMP code. (Probably me earlier this morning, too hurried.)
  • Loading branch information
GerHobbelt committed Jul 13, 2021
1 parent 6d01734 commit 6b59323
Show file tree
Hide file tree
Showing 8 changed files with 358 additions and 46 deletions.
7 changes: 6 additions & 1 deletion src/arch/dotproduct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,18 @@
namespace tesseract {

// Computes and returns the dot product of the two n-vectors u and v.
template <class TFloat>
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
TFloat total = 0;
#pragma omp simdi reduction(+:total)
#pragma omp simd reduction(+:total)
for (int k = 0; k < n; k++) {
total += u[k] * v[k];
}
return total;
}

// two instantiations: float & double.
template float DotProductNative<float>(const float *u, const float *v, int n);
template double DotProductNative<double>(const double *u, const double *v, int n);

} // namespace tesseract
37 changes: 29 additions & 8 deletions src/arch/dotproduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,43 @@
namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Portable reference implementation; instantiated for float and double.
template <class TFloat>
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);

// ------------ FAST FLOAT specializations -----------------

// Uses Intel AVX intrinsics to access the SIMD instruction set.
float DotProductAVX(const float *u, const float *v, int n);
float DotProductAVX1(const float *u, const float *v, int n);
float DotProductAVX2(const float *u, const float *v, int n);
float DotProductAVX3(const float *u, const float *v, int n);
float DotProductAVX4(const float *u, const float *v, int n);

// Use Intel FMA.
float DotProductFMA(const float *u, const float *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
float DotProductSSE(const float *u, const float *v, int n);

float DotProductAccelerate(const float *u, const float *v, int n);

// ------------ HIGH PRECISION DOUBLE specializations -----------------

// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double *u, const double *v, int n);
double DotProductAVX1(const double *u, const double *v, int n);
double DotProductAVX2(const double *u, const double *v, int n);
double DotProductAVX3(const double *u, const double *v, int n);
double DotProductAVX4(const double *u, const double *v, int n);

// Use Intel FMA.
double DotProductFMA(const double *u, const double *v, int n);

// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double *u, const double *v, int n);

double DotProductAccelerate(const double *u, const double *v, int n);

} // namespace tesseract.

#endif // TESSERACT_ARCH_DOTPRODUCT_H_
8 changes: 5 additions & 3 deletions src/arch/dotproductavx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel AVX intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
float DotProductAVX(const float *u, const float *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand Down Expand Up @@ -76,7 +77,7 @@ float DotProductAVX1(const float *u, const float *v, int n) {
return result;
}

#else
// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductAVX1(const double *u, const double *v, int n) {
__m256d t0 = _mm256_setzero_pd();
Expand Down Expand Up @@ -131,7 +132,8 @@ double DotProductAVX(const double *u, const double *v, int n) {
}
return result;
}
#endif

// ---------------------------- END FLOAT/DOUBLE sections ------------------------

} // namespace tesseract.

Expand Down
29 changes: 18 additions & 11 deletions src/arch/dotproductfma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel FMA intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
float DotProductFMA(const float *u, const float *v, int n) {
const unsigned quot = n / 16;
const unsigned rem = n % 16;
__m256 t0 = _mm256_setzero_ps();
Expand All @@ -44,15 +45,17 @@ TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
v += 8;
}
t0 = _mm256_hadd_ps(t0, t1);
alignas(32) TFloat tmp[8];
alignas(32) float tmp[8];
_mm256_store_ps(tmp, t0);
TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
return result;
}
#else

// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductFMA(const double *u, const double *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
Expand All @@ -79,19 +82,23 @@ double DotProductFMA(const double *u, const double *v, int n) {
}
return result;
}
#endif

// ---------------------------- END section ------------------------

} // namespace tesseract.

#else

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// NOTE(review): this appears to be the non-FMA fallback branch (guarded by the
// preceding #else) — both overloads simply delegate to the SSE implementation.
inline float DotProductFMA(const float *u, const float *v, int n) {
  return DotProductSSE(u, v, n);
}
inline double DotProductFMA(const double *u, const double *v, int n) {
  return DotProductSSE(u, v, n);
}

} // namespace tesseract

Expand Down
14 changes: 9 additions & 5 deletions src/arch/dotproductsse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@

namespace tesseract {

// ---------------------------- FAST FLOAT section ------------------------

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
float DotProductSSE(const float *u, const float *v, int n) {
int max_offset = n - 4;
int offset = 0;
// Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
Expand Down Expand Up @@ -85,8 +86,10 @@ TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
}
return result;
}
#else
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {

// ---------------------------- HIGH-PRECISION DOUBLE section ------------------------

double DotProductSSE(const double *u, const double *v, int n) {
int max_offset = n - 2;
int offset = 0;
// Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
Expand Down Expand Up @@ -135,7 +138,8 @@ TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n) {
}
return result;
}
#endif

// ---------------------------- END section ------------------------

} // namespace tesseract.

Expand Down
19 changes: 2 additions & 17 deletions src/lstm/weightmatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,26 +65,11 @@ static void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, GENERIC_2D_ARRAY<d
}

// Reads a TFloat matrix from fp. The on-disk representation is always double
// precision (the pre-existing FAST_FLOAT path deserialized doubles and then
// converted); DeSerialize<double> performs any needed conversion to the
// runtime TFloat type. Returns false on read failure.
static bool DeSerialize(TFile *fp, GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
  return tfloat_array.DeSerialize<double>(fp);
}

// Writes a TFloat matrix to fp in the canonical on-disk format, which is
// always double precision regardless of the runtime TFloat type.
// Returns false on write failure.
static bool Serialize(TFile *fp, const GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
  return tfloat_array.Serialize<double>(fp);
}


Expand Down
1 change: 0 additions & 1 deletion unittest/dotproduct_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ void DotProductTest::RunTest(TFloat (*f)(const TFloat *u, const TFloat *v, int n
}
}

TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n);
TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
TFloat total = 0;
#pragma omp simd reduction(+:total)
Expand Down
Loading

0 comments on commit 6b59323

Please sign in to comment.