Skip to content

Commit

Permalink
Merge pull request #1 from GerHobbelt/tfloat-patch-2
Browse files Browse the repository at this point in the history
bugfix of FMA port to FAST_FLOAT: 8 float FPs fit in a single 256bit
  • Loading branch information
stweil authored Jul 13, 2021
2 parents 1a59b6f + 30bf263 commit b3adfdd
Showing 1 changed file with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions src/arch/dotproductfma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,26 +31,26 @@ namespace tesseract {
// Uses Intel FMA intrinsics to access the SIMD instruction set.
#if defined(FAST_FLOAT)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n) {
const unsigned quot = n / 8;
const unsigned rem = n % 8;
const unsigned quot = n / 16;
const unsigned rem = n % 16;
__m256 t0 = _mm256_setzero_ps();
__m256 t1 = _mm256_setzero_ps();
for (unsigned k = 0; k < quot; k++) {
__m256 f0 = _mm256_loadu_ps(u);
__m256 f1 = _mm256_loadu_ps(v);
t0 = _mm256_fmadd_ps(f0, f1, t0);
u += 4;
v += 4;
u += 8;
v += 8;
__m256 f2 = _mm256_loadu_ps(u);
__m256 f3 = _mm256_loadu_ps(v);
t1 = _mm256_fmadd_ps(f2, f3, t1);
u += 4;
v += 4;
u += 8;
v += 8;
}
t0 = _mm256_hadd_ps(t0, t1);
alignas(32) float tmp[4];
alignas(32) TFloat tmp[8];
_mm256_store_ps(tmp, t0);
float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
TFloat result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
for (unsigned k = 0; k < rem; k++) {
result += *u++ * *v++;
}
Expand Down

0 comments on commit b3adfdd

Please sign in to comment.