-
Notifications
You must be signed in to change notification settings - Fork 67
/
Copy pathbenchmarkflops.jl
484 lines (470 loc) · 16.4 KB
/
benchmarkflops.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
include(joinpath(LOOPVECBENCHDIR, "loadsharedlibs.jl"))
using BenchmarkTools, SharedArrays
# Table of benchmark measurements that presents itself as an `AbstractMatrix{String}`.
# The presented table is transposed relative to `results`: the `Base.size` and
# `Base.getindex` methods for this type map presented entry (row, col) with col > 1 to
# `results[col-1, row]`, and presented column 1 to `sizes[row]`.
struct SizedResults{V<:AbstractVector} <: AbstractMatrix{String}
# Raw measurements, read as results[test, size_index] by the accessors in this file.
results::Matrix{Float64}
# Problem sizes; presumably one entry per column of `results` (not enforced here).
sizes::V
end
# Shape of the presented table: one row per column of `results`, and one column per row
# of `results` plus a leading column for the problem size.
function Base.size(sr::SizedResults)
    stored_rows, stored_cols = size(sr.results)
    return (stored_cols, stored_rows + 1)
end
# Pairs the column headers with the sized table of measurements.
struct BenchmarkResult{V}
# Column labels; the 3-argument constructor in this file puts "Size" in front of the
# test names.
tests::Vector{String}
# Measurements together with the problem sizes they belong to.
sizedresults::SizedResults{V}
end
# Convenience constructor.
#   results – measurement matrix, forwarded to `SizedResults` unchanged
#   tests   – names of the benchmarked implementations
#   sizes   – problem sizes, forwarded to `SizedResults`
# The stored header vector is "Size" followed by `tests`.
function BenchmarkResult(results, tests, sizes)
    # `append!` grows the freshly created ["Size"] vector, so `tests` is not mutated.
    headers = append!(["Size"], tests)
    return BenchmarkResult(headers, SizedResults(results, sizes))
end
# Entry of the presented string table: column 1 is the problem size, every other column
# `col` is the measurement stored at `results[col-1, row]`.
function Base.getindex(br::SizedResults, row, col)
    if col == 1
        return string(br.sizes[row])
    end
    return string(br.results[col-1, row])
end
# Write `v` straight into the underlying measurement matrix at indices `i...`
# (no transposition and no header/size column offset is applied here).
function Base.setindex!(br::BenchmarkResult, v, i...)
    measurements = br.sizedresults.results
    measurements[i...] = v
    return v
end
# Stack two result sets. Measurements are joined column-wise (`hcat`) and the size
# vectors are joined end to end, so the presented table gains rows. The headers are
# taken from `br1`; `br2.tests` is not consulted.
function Base.vcat(br1::BenchmarkResult, br2::BenchmarkResult)
    upper = br1.sizedresults
    lower = br2.sizedresults
    joined_results = hcat(upper.results, lower.results)
    joined_sizes = vcat(upper.sizes, lower.sizes)
    return BenchmarkResult(br1.tests, SizedResults(joined_results, joined_sizes))
end
# Normalise a problem-size specification to three dimensions: a single `Int` is
# repeated three times, a 3-tuple of `Int`s is passed through untouched.
function tothreetuple(i::Int)
    return ntuple(_ -> i, Val(3))
end
function tothreetuple(i::NTuple{3,Int})
    return i
end
# Time every available matrix-multiply kernel on the same operands and store the
# throughput of each one in column `i` of `br`.
#   br   – result container, written as br[kernel_row, i]
#   C    – output buffer, overwritten by each kernel in turn
#   A, B – operands; `A * B` is the reference each kernel's output is compared with
#   i    – column of `br` to fill (presumably the index of the current problem size)
function matmul_bench!(br, C, A, B, i)
M, N = size(C)
K = size(B, 1)
# 2*M*K*N operations scaled by 1e-9, so dividing by the elapsed seconds gives GFLOPS.
n_gflop = M * K * N * 2e-9
Cblas = A * B
# Pattern repeated below: time the kernel, compare `C` with the reference, then fill `C`
# with NaN (presumably so a later kernel cannot pass its check on stale values).
br[1, i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
@assert C ≈ Cblas "LoopVec gemm wrong?"
fill!(C, NaN)
br[2, i] = n_gflop / @belapsed jgemm!($C, $A, $B)
@assert C ≈ Cblas "Julia gemm wrong?"
fill!(C, NaN)
br[3, i] = n_gflop / @belapsed cgemm!($C, $A, $B)
@assert C ≈ Cblas "Clang gemm wrong?"
fill!(C, NaN)
br[4, i] = n_gflop / @belapsed fgemm!($C, $A, $B)
@assert C ≈ Cblas "Fort gemm wrong?"
fill!(C, NaN)
# Rows 5 and 6 are only used when the Intel compiler kernels are benchmarked.
if INTEL_BENCH
br[5, i] = n_gflop / @belapsed icgemm!($C, $A, $B)
@assert C ≈ Cblas "icc gemm wrong?"
fill!(C, NaN)
br[6, i] = n_gflop / @belapsed ifgemm!($C, $A, $B)
@assert C ≈ Cblas "ifort gemm wrong?"
fill!(C, NaN)
end
# From here on the row index is shifted by 2 when the two Intel rows above were filled;
# the arithmetic uses INTEL_BENCH (a Bool, since it is also used as an `if` condition).
br[5+2INTEL_BENCH, i] = n_gflop / @belapsed egemm!($C, $A, $B)
@assert C ≈ Cblas "eigen gemm wrong?"
fill!(C, NaN)
br[6+2INTEL_BENCH, i] = n_gflop / @belapsed iegemm!($C, $A, $B)
@assert C ≈ Cblas "i-eigen gemm wrong?"
fill!(C, NaN)
br[7+2INTEL_BENCH, i] = n_gflop / @belapsed fgemm_builtin!($C, $A, $B)
@assert C ≈ Cblas "Fort builtin gemm wrong?"
fill!(C, NaN)
if INTEL_BENCH
br[8+2INTEL_BENCH, i] = n_gflop / @belapsed ifgemm_builtin!($C, $A, $B)
@assert C ≈ Cblas "ifort builtin gemm wrong?"
fill!(C, NaN)
end
# The ifort-builtin row adds a third Intel-only row, hence the 3INTEL_BENCH shift.
br[8+3INTEL_BENCH, i] = n_gflop / @belapsed gemmopenblas!($C, $A, $B)
@assert C ≈ Cblas "OpenBLAS gemm wrong?"
# NOTE(review): `C` is not reset to NaN between the OpenBLAS and MKL runs, so the MKL
# check below would also pass on the OpenBLAS output.
if MKL_BENCH
br[9+3INTEL_BENCH, i] = n_gflop / @belapsed gemmmkl!($C, $A, $B)
@assert C ≈ Cblas "MKL gemm wrong?"
end
# br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
end
# Benchmark the matrix product with both operands stored as plain column-major matrices.
#   br – result container handed on to `matmul_bench!`
#   s  – problem size; an `Int` (cubic problem) or an `(M, K, N)` tuple
#   i  – column of `br` to fill
# Defined exactly once: a second, identical definition would only overwrite this method.
function A_mul_B_bench!(br, s, i)
    M, K, N = tothreetuple(s)
    C = Matrix{Float64}(undef, M, N)
    A = rand(M, K)
    B = rand(K, N)
    return matmul_bench!(br, C, A, B, i)
end
# Benchmark the matrix product where the right operand is a lazily adjointed matrix.
# `s` is an `Int` or an `(M, K, N)` tuple; results go to column `i` of `br`.
function A_mul_Bt_bench!(br, s, i)
    nrow, ninner, ncol = tothreetuple(s)
    out = Matrix{Float64}(undef, nrow, ncol)
    lhs = rand(nrow, ninner)
    # Generated as ncol × ninner and wrapped with `'`, giving a ninner × ncol operand.
    rhs = rand(ncol, ninner)'
    return matmul_bench!(br, out, lhs, rhs, i)
end
# Benchmark the matrix product where the left operand is a lazily adjointed matrix.
# `s` is an `Int` or an `(M, K, N)` tuple; results go to column `i` of `br`.
function At_mul_B_bench!(br, s, i)
    nrow, ninner, ncol = tothreetuple(s)
    out = Matrix{Float64}(undef, nrow, ncol)
    # Generated as ninner × nrow and wrapped with `'`, giving a nrow × ninner operand.
    lhs = rand(ninner, nrow)'
    rhs = rand(ninner, ncol)
    return matmul_bench!(br, out, lhs, rhs, i)
end
# Benchmark the matrix product where both operands are lazily adjointed matrices.
# `s` is an `Int` or an `(M, K, N)` tuple; results go to column `i` of `br`.
function At_mul_Bt_bench!(br, s, i)
    nrow, ninner, ncol = tothreetuple(s)
    out = Matrix{Float64}(undef, nrow, ncol)
    lhs = rand(ninner, nrow)'
    rhs = rand(ncol, ninner)'
    return matmul_bench!(br, out, lhs, rhs, i)
end
# Time every available dot-product kernel on two random vectors of length `s` and store
# the throughput (2*s operations, scaled to GFLOPS) in column `i` of `br`.
# Each kernel is re-run once outside the timer and compared with `dot(x, y)`.
function dot_bench!(br, s, i)
    x = rand(s)
    y = rand(s)
    expected = dot(x, y)
    gflop = s * 2e-9
    br[1, i] = gflop / @belapsed jdotavx($x, $y)
    @assert jdotavx(x, y) ≈ expected "LoopVec dot wrong?"
    br[2, i] = gflop / @belapsed jdot($x, $y)
    @assert jdot(x, y) ≈ expected "Julia dot wrong?"
    br[3, i] = gflop / @belapsed cdot($x, $y)
    @assert cdot(x, y) ≈ expected "Clang dot wrong?"
    br[4, i] = gflop / @belapsed fdot($x, $y)
    @assert fdot(x, y) ≈ expected "Fort dot wrong?"
    # Rows 5 and 6 are reserved for the Intel compilers and only filled when enabled.
    if INTEL_BENCH
        br[5, i] = gflop / @belapsed icdot($x, $y)
        @assert icdot(x, y) ≈ expected "icc dot wrong?"
        br[6, i] = gflop / @belapsed ifdot($x, $y)
        @assert ifdot(x, y) ≈ expected "ifort dot wrong?"
    end
    # First row after the (optional) Intel rows.
    eigen_row = 5 + 2INTEL_BENCH
    br[eigen_row, i] = gflop / @belapsed edot($x, $y)
    @assert edot(x, y) ≈ expected "eigen dot wrong?"
    br[eigen_row+1, i] = gflop / @belapsed iedot($x, $y)
    @assert iedot(x, y) ≈ expected "i-eigen dot wrong?"
    # br[9,i] = gflop / @belapsed dot($x, $y)
end
# Time every available self-dot-product kernel (sum of squares of one vector) on a
# random vector of length `s` and store the throughput in column `i` of `br`.
#   br – result container, written as br[kernel_row, i]
#   s  – vector length
#   i  – column of `br` to fill
# Each kernel is re-run once outside the timer and compared with `dot(a, a)`.
function selfdot_bench!(br, s, i)
    a = rand(s)
    dotblas = dot(a, a)
    # 2*s operations scaled by 1e-9, so dividing by seconds gives GFLOPS.
    n_gflop = s * 2e-9
    br[1, i] = n_gflop / @belapsed jselfdotavx($a)
    @assert jselfdotavx(a) ≈ dotblas "LoopVec dot wrong?"
    br[2, i] = n_gflop / @belapsed jselfdot($a)
    @assert jselfdot(a) ≈ dotblas "Julia dot wrong?"
    br[3, i] = n_gflop / @belapsed cselfdot($a)
    @assert cselfdot(a) ≈ dotblas "Clang dot wrong?"
    br[4, i] = n_gflop / @belapsed fselfdot($a)
    @assert fselfdot(a) ≈ dotblas "Fort dot wrong?"
    if INTEL_BENCH
        br[5, i] = n_gflop / @belapsed icselfdot($a)
        # Validate the kernel that was just timed (the icc build, not the Clang one).
        @assert icselfdot(a) ≈ dotblas "icc dot wrong?"
        br[6, i] = n_gflop / @belapsed ifselfdot($a)
        # Likewise: check the ifort build rather than the gfortran one.
        @assert ifselfdot(a) ≈ dotblas "ifort dot wrong?"
    end
    # Rows after the optional Intel pair are shifted by 2 when that pair is present.
    br[5+2INTEL_BENCH, i] = n_gflop / @belapsed eselfdot($a)
    @assert eselfdot(a) ≈ dotblas "eigen dot wrong?"
    br[6+2INTEL_BENCH, i] = n_gflop / @belapsed ieselfdot($a)
    @assert ieselfdot(a) ≈ dotblas "i-eigen dot wrong?"
    # br[9,i] = n_gflop / @belapsed dot($a, $a)
end
# Normalise a problem-size specification to two dimensions: a single `Int` is repeated,
# a pair of `Int`s is passed through untouched.
function totwotuple(i::Int)
    return ntuple(_ -> i, Val(2))
end
function totwotuple(i::Tuple{Int,Int})
    return i
end
# Time every available matrix-vector-product kernel on the same operands and store the
# throughput of each one in column `i` of `br`.
#   br – result container, written as br[kernel_row, i]
#   x  – output buffer, overwritten by each kernel in turn
#   A  – matrix operand
#   y  – vector operand; `A * y` is the reference each kernel is compared with
#   i  – column of `br` to fill
# Returns `x`.
#
# The row layout follows `matmul_bench!`: the icc/ifort rows and the ifort-builtin row
# exist only when INTEL_BENCH is set, and the MKL row only when MKL_BENCH is set, so the
# Intel/MKL kernels are never called in configurations that do not benchmark them.
# NOTE(review): the header list built by the caller is not visible here; confirm it uses
# the same conditional layout.
function gemv_bench!(br, x, A, y, i)
    M, N = size(A)
    # 2*M*N operations scaled by 1e-9, so dividing by seconds gives GFLOPS.
    n_gflop = M * N * 2e-9
    xblas = A * y
    # Pattern repeated below: time the kernel, compare `x` with the reference, then fill
    # `x` with NaN so the next kernel cannot pass its check on stale values.
    br[1, i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
    @assert x ≈ xblas "LoopVec wrong?"
    fill!(x, NaN)
    br[2, i] = n_gflop / @belapsed jgemv!($x, $A, $y)
    @assert x ≈ xblas "Julia wrong?"
    fill!(x, NaN)
    br[3, i] = n_gflop / @belapsed cgemv!($x, $A, $y)
    @assert x ≈ xblas "Clang wrong?"
    fill!(x, NaN)
    br[4, i] = n_gflop / @belapsed fgemv!($x, $A, $y)
    @assert x ≈ xblas "Fort wrong?"
    fill!(x, NaN)
    if INTEL_BENCH
        br[5, i] = n_gflop / @belapsed icgemv!($x, $A, $y)
        @assert x ≈ xblas "icc wrong?"
        fill!(x, NaN)
        br[6, i] = n_gflop / @belapsed ifgemv!($x, $A, $y)
        @assert x ≈ xblas "ifort wrong?"
        fill!(x, NaN)
    end
    br[5+2INTEL_BENCH, i] = n_gflop / @belapsed egemv!($x, $A, $y)
    @assert x ≈ xblas "eigen wrong?"
    fill!(x, NaN)
    br[6+2INTEL_BENCH, i] = n_gflop / @belapsed iegemv!($x, $A, $y)
    @assert x ≈ xblas "i-eigen wrong?"
    fill!(x, NaN)
    br[7+2INTEL_BENCH, i] = n_gflop / @belapsed fgemv_builtin!($x, $A, $y)
    # Distinct message so a failure is not mistaken for the hand-written Fortran kernel.
    @assert x ≈ xblas "Fort builtin wrong?"
    fill!(x, NaN)
    if INTEL_BENCH
        br[8+2INTEL_BENCH, i] = n_gflop / @belapsed ifgemv_builtin!($x, $A, $y)
        @assert x ≈ xblas "ifort builtin wrong?"
        fill!(x, NaN)
    end
    # Three Intel-only rows precede this one when INTEL_BENCH is set.
    br[8+3INTEL_BENCH, i] = n_gflop / @belapsed dgemvopenblas!($x, $A, $y)
    @assert x ≈ xblas "gemvopenblas wrong?"
    fill!(x, NaN)
    if MKL_BENCH
        br[9+3INTEL_BENCH, i] = n_gflop / @belapsed dgemvmkl!($x, $A, $y)
        @assert x ≈ xblas "gemvmkl wrong?"
        fill!(x, NaN)
    end
    return x
end
# Benchmark the matrix-vector product with a plain column-major matrix.
# `s` is an `Int` or an `(M, N)` tuple; results go to column `i` of `br`.
function A_mul_vb_bench!(br, s, i)
    nrow, ncol = totwotuple(s)
    out = Vector{Float64}(undef, nrow)
    mat = rand(nrow, ncol)
    vec_in = rand(ncol)
    return gemv_bench!(br, out, mat, vec_in, i)
end
# Benchmark the matrix-vector product with a lazily adjointed matrix.
# `s` is an `Int` or an `(M, N)` tuple; results go to column `i` of `br`.
function At_mul_vb_bench!(br, s, i)
    nrow, ncol = totwotuple(s)
    out = Vector{Float64}(undef, nrow)
    # Generated as ncol × nrow and wrapped with `'`, giving a nrow × ncol operand.
    mat = rand(ncol, nrow)'
    vec_in = rand(ncol)
    return gemv_bench!(br, out, mat, vec_in, i)
end
# Time every available three-argument dot-product kernel, checked against
# `dot(x, A, y)`, and store the throughput in column `i` of `br`.
#   br – result container, written as br[kernel_row, i]
#   s  – problem size; an `Int` or an `(M, N)` tuple
#   i  – column of `br` to fill
# The last row times LinearAlgebra's own `dot(x, A, y)`; it is the reference and is
# therefore not checked.
function dot3_bench!(br, s, i)
    M, N = totwotuple(s)
    x = rand(M)
    A = rand(M, N)
    y = rand(N)
    dotblas = dot(x, A, y)
    # Counted as 3 operations per matrix element, scaled so value / seconds is GFLOPS.
    n_gflop = M * N * 3e-9
    br[1, i] = n_gflop / @belapsed jdot3avx($x, $A, $y)
    @assert jdot3avx(x, A, y) ≈ dotblas "LoopVec dot wrong?"
    br[2, i] = n_gflop / @belapsed jdot3($x, $A, $y)
    @assert jdot3(x, A, y) ≈ dotblas "Julia dot wrong?"
    br[3, i] = n_gflop / @belapsed cdot3($x, $A, $y)
    @assert cdot3(x, A, y) ≈ dotblas "Clang dot wrong?"
    br[4, i] = n_gflop / @belapsed fdot3($x, $A, $y)
    @assert fdot3(x, A, y) ≈ dotblas "Fort dot wrong?"
    if INTEL_BENCH
        br[5, i] = n_gflop / @belapsed icdot3($x, $A, $y)
        @assert icdot3(x, A, y) ≈ dotblas "icc dot wrong?"
        br[6, i] = n_gflop / @belapsed ifdot3($x, $A, $y)
        @assert ifdot3(x, A, y) ≈ dotblas "ifort dot wrong?"
    end
    # Rows after the optional Intel pair are shifted by 2 when that pair is present.
    br[5+2INTEL_BENCH, i] = n_gflop / @belapsed edot3($x, $A, $y)
    @assert edot3(x, A, y) ≈ dotblas "eigen dot wrong?"
    br[6+2INTEL_BENCH, i] = n_gflop / @belapsed iedot3($x, $A, $y)
    # Message uses the same "i-eigen" label as the other `ie*` kernels in this file.
    @assert iedot3(x, A, y) ≈ dotblas "i-eigen dot wrong?"
    br[7+2INTEL_BENCH, i] = n_gflop / @belapsed dot($x, $A, $y)
end
# BLAS.set_num_threads(1)
# Reference computation for the OLS benchmark: copy `y` into the work buffer `Xβ`, update
# the buffer with `dgemvmkl!` (scalars 1.0 and -1.0), and return the buffer's dot product
# with itself. `Xβ` is overwritten.
# NOTE(review): presumably the buffer ends up holding X*β - y, making the result the sum
# of squared errors; confirm against the `dgemvmkl!` wrapper's argument order.
function sse!(Xβ, y, X, β)
    residual = copyto!(Xβ, y)
    dgemvmkl!(residual, X, β, 1.0, -1.0)
    return jdot(Xβ, Xβ)
end
# Problem-size normalisation for the OLS benchmark: a pair is used as given, a single
# integer `s` becomes (3s/2, s/2), with both halves rounded down by the right shift.
function sse_totwotuple(s::NTuple{2})
    return s
end
function sse_totwotuple(s::Integer)
    first_dim = (3s) >> 1
    second_dim = s >> 1
    return (first_dim, second_dim)
end
# Time every available OLS kernel (`*OLSlp*`) on random data and store the throughput in
# column `i` of `br`. `s` is a pair, or an integer expanded by `sse_totwotuple`.
# Each kernel is re-run once outside the timer and compared with the value of `sse!`.
# NOTE(review): the reference value comes from `sse!`, which calls `dgemvmkl!` even when
# MKL_BENCH is false; confirm that wrapper is always available.
function sse_bench!(br, s, i)
    nobs, npred = sse_totwotuple(s)
    y = rand(nobs)
    β = rand(npred)
    X = randn(nobs, npred)
    Xβ = similar(y)
    expected = sse!(Xβ, y, X, β)
    gflop = 2e-9 * (npred * nobs + 2nobs)
    br[1, i] = gflop / @belapsed jOLSlp_avx($y, $X, $β)
    @assert jOLSlp_avx(y, X, β) ≈ expected "LoopVec wrong?"
    br[2, i] = gflop / @belapsed jOLSlp($y, $X, $β)
    @assert jOLSlp(y, X, β) ≈ expected "Julia wrong?"
    br[3, i] = gflop / @belapsed cOLSlp($y, $X, $β)
    @assert cOLSlp(y, X, β) ≈ expected "Clang wrong?"
    br[4, i] = gflop / @belapsed fOLSlp($y, $X, $β)
    @assert fOLSlp(y, X, β) ≈ expected "Fort wrong?"
    # Rows 5 and 6 are reserved for the Intel compilers and only filled when enabled.
    if INTEL_BENCH
        br[5, i] = gflop / @belapsed icOLSlp($y, $X, $β)
        @assert icOLSlp(y, X, β) ≈ expected "icc wrong?"
        br[6, i] = gflop / @belapsed ifOLSlp($y, $X, $β)
        @assert ifOLSlp(y, X, β) ≈ expected "ifort wrong?"
    end
    # First row after the (optional) Intel rows.
    eigen_row = 5 + 2INTEL_BENCH
    br[eigen_row, i] = gflop / @belapsed eOLSlp($y, $X, $β)
    @assert eOLSlp(y, X, β) ≈ expected "eigen wrong?"
    br[eigen_row+1, i] = gflop / @belapsed ieOLSlp($y, $X, $β)
    @assert ieOLSlp(y, X, β) ≈ expected "i-eigen wrong?"
    if MKL_BENCH
        br[eigen_row+2, i] = gflop / @belapsed sse!($Xβ, $y, $X, $β)
        @assert sse!(Xβ, y, X, β) ≈ expected "MKL wrong?"
    end
end
# Time elementwise `exp` implementations on a random vector of length `s` and store the
# result in column `i` of `br`. The stored value is elements (scaled by 1e-9) per second.
function exp_bench!(br, s, i)
a = rand(s)
b = similar(a)
n_gflop = 1e-9 * s # not really gflops
br[1, i] = n_gflop / @belapsed @turbo @. $b = exp($a)
# `b` was written by the @turbo run above; its copy is the reference for all later checks.
baseb = copy(b)
br[2, i] = n_gflop / @belapsed @. $b = exp($a)
# Compares Base's broadcast output with the @turbo reference, hence the message.
@assert b ≈ baseb "LoopVec wrong?"
# NOTE(review): `b` is never reset between kernels, so a kernel that leaves `b` untouched
# would still pass the checks below.
br[3, i] = n_gflop / @belapsed cvexp!($b, $a)
@assert b ≈ baseb "Clang wrong?"
br[4, i] = n_gflop / @belapsed fvexp!($b, $a)
@assert b ≈ baseb "Fort wrong?"
# Rows 5 and 6 are only filled when the Intel compiler kernels are benchmarked.
if INTEL_BENCH
br[5, i] = n_gflop / @belapsed icvexp!($b, $a)
@assert b ≈ baseb "icc wrong?"
br[6, i] = n_gflop / @belapsed ifvexp!($b, $a)
@assert b ≈ baseb "ifort wrong?"
end
end
# Time every available kernel for the broadcast `D = a + B * c'` (column vector plus
# matrix scaled column-wise by a row vector) and store the throughput in column `i` of
# `br`.
#   br – result container, written as br[kernel_row, i]
#   s  – problem size; an `Int` or an `(M, N)` tuple
#   i  – column of `br` to fill
# Returns `D` (the NaN-filled work buffer).
function aplusBc_bench!(br, s, i)
    M, N = totwotuple(s)
    a = rand(M)
    B = rand(M, N)
    c = rand(N)
    c′ = c'
    D = similar(B)
    # 2 operations per output element, scaled so value / seconds is GFLOPS.
    n_gflop = 2e-9 * M * N
    br[1, i] = n_gflop / @belapsed @turbo @. $D = $a + $B * $c′
    # The @turbo output is the reference for every later check.
    Dcopy = copy(D)
    fill!(D, NaN)
    br[2, i] = n_gflop / @belapsed @. $D = $a + $B * $c′
    # Compares Base's broadcast output with the @turbo reference, hence the message.
    @assert D ≈ Dcopy "LoopVec wrong?"
    # Poison the buffer before every kernel so no check can pass on stale values.
    fill!(D, NaN)
    br[3, i] = n_gflop / @belapsed caplusBc!($D, $a, $B, $c)
    @assert D ≈ Dcopy "Clang wrong?"
    fill!(D, NaN)
    br[4, i] = n_gflop / @belapsed faplusBc!($D, $a, $B, $c)
    @assert D ≈ Dcopy "Fort wrong?"
    fill!(D, NaN)
    if INTEL_BENCH
        br[5, i] = n_gflop / @belapsed icaplusBc!($D, $a, $B, $c)
        @assert D ≈ Dcopy "icc wrong?"
        fill!(D, NaN)
        br[6, i] = n_gflop / @belapsed ifaplusBc!($D, $a, $B, $c)
        @assert D ≈ Dcopy "ifort wrong?"
        fill!(D, NaN)
    end
    # Eigen rows directly follow the optional Intel pair (rows 7/8 with Intel, 5/6
    # without), matching the row arithmetic of the other benchmarks in this file.
    br[5+2INTEL_BENCH, i] = n_gflop / @belapsed eaplusBc!($D, $a, $B, $c)
    @assert D ≈ Dcopy "eigen wrong?"
    fill!(D, NaN)
    br[6+2INTEL_BENCH, i] = n_gflop / @belapsed ieaplusBc!($D, $a, $B, $c)
    @assert D ≈ Dcopy "i-eigen wrong?"
    fill!(D, NaN)
end
# Time every available kernel for `B = A + A'` on a random s × s matrix and store the
# throughput in column `i` of `br`.
#   br – result container, written as br[kernel_row, i]
#   s  – matrix dimension
#   i  – column of `br` to fill
function AplusAt_bench!(br, s, i)
    A = rand(s, s)
    B = similar(A)
    # One addition per output element, scaled so value / seconds is GFLOPS.
    n_gflop = 1e-9 * s^2
    br[1, i] = n_gflop / @belapsed @turbo @. $B = $A + $A'
    # The @turbo output is the reference for every later check.
    baseB = copy(B)
    fill!(B, NaN)
    br[2, i] = n_gflop / @belapsed @. $B = $A + $A'
    # Compares Base's broadcast output with the @turbo reference, hence the message.
    @assert B ≈ baseB "LoopVec wrong?"
    # Poison the buffer before every kernel so no check can pass on stale values.
    fill!(B, NaN)
    br[3, i] = n_gflop / @belapsed cAplusAt!($B, $A)
    @assert B ≈ baseB "Clang wrong?"
    fill!(B, NaN)
    br[4, i] = n_gflop / @belapsed fAplusAt!($B, $A)
    @assert B ≈ baseB "Fort wrong?"
    fill!(B, NaN)
    if INTEL_BENCH
        br[5, i] = n_gflop / @belapsed icAplusAt!($B, $A)
        @assert B ≈ baseB "icc wrong?"
        fill!(B, NaN)
        br[6, i] = n_gflop / @belapsed ifAplusAt!($B, $A)
        @assert B ≈ baseB "ifort wrong?"
        fill!(B, NaN)
    end
    # Rows after the optional Intel pair are shifted by 2 when that pair is present.
    br[5+2INTEL_BENCH, i] = n_gflop / @belapsed eAplusAt!($B, $A)
    @assert B ≈ baseB "eigen wrong?"
    fill!(B, NaN)
    br[6+2INTEL_BENCH, i] = n_gflop / @belapsed ieAplusAt!($B, $A)
    @assert B ≈ baseB "i-eigen wrong?"
    fill!(B, NaN)
    br[7+2INTEL_BENCH, i] = n_gflop / @belapsed fAplusAt_builtin!($B, $A)
    @assert B ≈ baseB "Fort-builtin wrong?"
    fill!(B, NaN)
    if INTEL_BENCH
        br[8+2INTEL_BENCH, i] = n_gflop / @belapsed ifAplusAt_builtin!($B, $A)
        @assert B ≈ baseB "ifort-builtin wrong?"
        fill!(B, NaN)
    end
end
# Time every available `randomaccess` kernel and store the throughput in column `i` of
# `br`. `s` is an `Int` or a pair. The plain Julia `randomaccess` result is the reference,
# so its own row (2) is timed but not checked.
function randomaccess_bench!(br, s, i)
    nrow, ncol = totwotuple(s)
    # Uniform samples shifted by 0.5, i.e. values in [0.5, 1.5).
    P = rand(nrow, ncol)
    P .+= 0.5
    # Random integers in 1:ncol, the same shape as `P`.
    basis = rand(1:ncol, nrow, ncol)
    coefs = randn(ncol)
    gflop = 1e-9 * (nrow * ncol + ncol)
    expected = randomaccess(P, basis, coefs)
    br[1, i] = gflop / @belapsed randomaccessavx($P, $basis, $coefs)
    @assert expected ≈ randomaccessavx(P, basis, coefs) "LoopVec wrong?"
    br[2, i] = gflop / @belapsed randomaccess($P, $basis, $coefs)
    br[3, i] = gflop / @belapsed crandomaccess($P, $basis, $coefs)
    @assert expected ≈ crandomaccess(P, basis, coefs) "Clang wrong?"
    br[4, i] = gflop / @belapsed frandomaccess($P, $basis, $coefs)
    @assert expected ≈ frandomaccess(P, basis, coefs) "Fort wrong?"
    # Rows 5 and 6 are only filled when the Intel compiler kernels are benchmarked.
    if INTEL_BENCH
        br[5, i] = gflop / @belapsed icrandomaccess($P, $basis, $coefs)
        @assert expected ≈ icrandomaccess(P, basis, coefs) "icc wrong?"
        br[6, i] = gflop / @belapsed ifrandomaccess($P, $basis, $coefs)
        @assert expected ≈ ifrandomaccess(P, basis, coefs) "ifort wrong?"
    end
end
# Time every available log-determinant kernel on an s × s upper-triangular Cholesky
# factor and store the throughput in column `i` of `br`. Kernels are checked against
# LinearAlgebra's `logdet(U)`, which is itself timed in the last row.
function logdettriangle_bench!(br, s, i)
    S = randn(s, 2s)
    # S*S' wrapped in `Symmetric` is factorised; `U` is the upper Cholesky factor.
    gram = Symmetric(S * S')
    U = cholesky(gram).U
    gflop = 1e-9 * s
    expected = logdet(U)
    br[1, i] = gflop / @belapsed jlogdettriangleavx($U)
    @assert expected ≈ jlogdettriangleavx(U) "LoopVec wrong?"
    br[2, i] = gflop / @belapsed jlogdettriangle($U)
    @assert expected ≈ jlogdettriangle(U) "Julia wrong?"
    br[3, i] = gflop / @belapsed clogdettriangle($U)
    @assert expected ≈ clogdettriangle(U) "Clang wrong?"
    br[4, i] = gflop / @belapsed flogdettriangle($U)
    @assert expected ≈ flogdettriangle(U) "Fort wrong?"
    # Rows 5 and 6 are only filled when the Intel compiler kernels are benchmarked.
    if INTEL_BENCH
        br[5, i] = gflop / @belapsed iclogdettriangle($U)
        @assert expected ≈ iclogdettriangle(U) "icc wrong?"
        br[6, i] = gflop / @belapsed iflogdettriangle($U)
        @assert expected ≈ iflogdettriangle(U) "ifort wrong?"
    end
    # The Eigen variants (elogdettriangle / ielogdettriangle) are not benchmarked here.
    br[5+2INTEL_BENCH, i] = gflop / @belapsed logdet($U)
end
# Time the 2-D filter kernels with kernel matrix `K` on a random (s+2) × (s+2) input and
# store the throughput in column `i` of `br`.
function filter2d_bench_run!(br, s, i, K)
A = rand(s + 2, s + 2)
# s × s output wrapped in an OffsetArray with offsets (1, 1).
B = OffsetArray(similar(A, (s, s)), 1, 1)
Mk, Nk = size(K)
# Counts 2*Mk*Nk - 1 operations for each of the s^2 outputs, scaled to GFLOPS.
n_gflop = 1e-9 * (2Mk * Nk - 1) * s^2
br[1, i] = n_gflop / @belapsed filter2davx!($B, $A, $K)
# The LoopVectorization output is the reference for every later check.
Bcopy = copy(B)
fill!(B, NaN)
br[2, i] = n_gflop / @belapsed filter2d!($B, $A, $K)
# Compares the plain Julia output with the LoopVectorization reference.
@assert B ≈ Bcopy "LoopVec wrong?"
# NOTE(review): `B` is not reset to NaN from here on, so the remaining checks would also
# pass if a kernel left `B` untouched.
br[3, i] = n_gflop / @belapsed cfilter2d!($B, $A, $K)
@assert B ≈ Bcopy "Clang wrong?"
br[4, i] = n_gflop / @belapsed ffilter2d!($B, $A, $K)
@assert B ≈ Bcopy "Fort wrong?"
# Rows 5 and 6 are only filled when the Intel compiler kernels are benchmarked.
if INTEL_BENCH
br[5, i] = n_gflop / @belapsed icfilter2d!($B, $A, $K)
@assert B ≈ Bcopy "icc wrong?"
br[6, i] = n_gflop / @belapsed iffilter2d!($B, $A, $K)
@assert B ≈ Bcopy "ifort wrong?"
end
end
# Time the `*filter2dunrolled*` kernels with kernel matrix `K` on a random
# (s+2) × (s+2) input and store the throughput in column `i` of `br`.
function filter2dunrolled_bench_run!(br, s, i, K)
A = rand(s + 2, s + 2)
# s × s output wrapped in an OffsetArray with offsets (1, 1).
B = OffsetArray(similar(A, (s, s)), 1, 1)
Mk, Nk = size(K)
# Counts 2*Mk*Nk - 1 operations for each of the s^2 outputs, scaled to GFLOPS.
n_gflop = 1e-9 * (2Mk * Nk - 1) * s^2
br[1, i] = n_gflop / @belapsed filter2dunrolledavx!($B, $A, $K)
# The LoopVectorization output is the reference for every later check.
Bcopy = copy(B)
fill!(B, NaN)
br[2, i] = n_gflop / @belapsed filter2dunrolled!($B, $A, $K)
# Compares the plain Julia output with the LoopVectorization reference.
@assert B ≈ Bcopy "LoopVec wrong?"
# NOTE(review): `B` is not reset to NaN from here on, so the remaining checks would also
# pass if a kernel left `B` untouched.
br[3, i] = n_gflop / @belapsed cfilter2dunrolled!($B, $A, $K)
@assert B ≈ Bcopy "Clang wrong?"
br[4, i] = n_gflop / @belapsed ffilter2dunrolled!($B, $A, $K)
@assert B ≈ Bcopy "Fort wrong?"
# Rows 5 and 6 are only filled when the Intel compiler kernels are benchmarked.
if INTEL_BENCH
br[5, i] = n_gflop / @belapsed icfilter2dunrolled!($B, $A, $K)
@assert B ≈ Bcopy "icc wrong?"
br[6, i] = n_gflop / @belapsed iffilter2dunrolled!($B, $A, $K)
@assert B ≈ Bcopy "ifort wrong?"
end
end