Commit

* improved performance of convolution on ARM. Now Resnet-50 runs in 26.6ms on Apple M1 (the previous result was ~42ms).

* added a preliminary implementation of Winograd-based convolution; for now it is disabled because, instead of accelerating execution, it slows it down.
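
For context: the quoted timings amount to roughly a 1.6x end-to-end speedup, and the Winograd remark refers to the standard minimal-filtering scheme for small convolutions, which trades multiplications for extra transforms (general background, not specific to this commit's implementation):

$$
\text{speedup} = \frac{42\ \text{ms}}{26.6\ \text{ms}} \approx 1.58,
\qquad
\frac{\text{direct multiplies per }2\times 2\text{ tile}}{\text{Winograd }F(2\times 2,\,3\times 3)\text{ multiplies}}
= \frac{2\cdot 2\cdot 3\cdot 3}{4\cdot 4} = \frac{36}{16} = 2.25
$$

The input/output transforms and the extra memory traffic can eat up the theoretical 2.25x reduction in multiplies, which is consistent with the second bullet: the preliminary Winograd path is disabled because it currently runs slower than the direct convolution.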
vpisarev committed May 24, 2022
1 parent da008cd commit 616e3ad
Showing 8 changed files with 1,182 additions and 43 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -12,7 +12,7 @@ The code is distributed under Apache 2 license, see the [LICENSE](LICENSE)

The compiler is written in Ficus itself and needs C/C++ compiler and make utility.

- ### **Unix (Linux, macOS, BSD, ...)**
+ ### **Unix (Linux, macOS, BSD, WSL, ...)**

```
cd <ficus_root>
@@ -22,7 +22,7 @@ bin/ficus -run -O3 examples/fst.fx # run some examples, e.g. fst.fx,
# optionally specify optimization level
```

- ### **Windows**
+ ### **Windows (native)**

Install Visual Studio, for example Visual Studio 2019 Community Edition, open "Developer PowerShell for VS2019" from the Windows menu and type:

11 changes: 7 additions & 4 deletions examples/classify_img.fx
@@ -1,7 +1,7 @@
import Json, Sys, LexerUtils as Lxu
import OpenCV as cv
//import Image.Decoder
- import NN.Ast, NN.Inference, NN.FromOnnx, NN.FuseBasic, NN.BufferAllocator
+ import NN.Ast, NN.Inference, NN.FromOnnx, NN.FuseBasic, NN.BufferAllocator, NN.OpConv

var mname = "", lname = ""
var images: string list = []
@@ -99,7 +99,7 @@ for imgname@i <- images {
val inp = cv.blobFromImage(img, size=(224, 224),
scaleFactor=0.017,
mean=(103., 116., 123.),
- swapRB=true, crop=false)
+ swapRB=false, crop=false)
println(inp.size())
val out = net.forward(inp)
//println(f"out[1]={out[1][:]}")
@@ -110,15 +110,18 @@ for imgname@i <- images {
sort(tprobs, (>))
val inp_ = NN.Ast.make_tensor(inp)
var outputs: nn_output_t [] = []
+ NN.OpConv.reset_min_total_time_1x1()
+ val niters = 30
val (gmean, mintime) = Sys.timeit(
fun () {
outputs =
try NN.Inference.run(model, [("", inp_)], outputs=temp_outputs) catch {
| NN.Ast.NNError msg => println(f"exception NNError('{msg}') occured"); []
| Fail msg => println(f"failure: '{msg}'"); []
}
- }, iterations=15, batch=1)
- println(f"execution time: gmean={gmean*1000.}, mintime={mintime*1000.}")
+ }, iterations=niters, batch=1)
+ val total_time = NN.OpConv.get_total_time_1x1()*1000/Sys.tick_frequency()
+ println(f"execution time: gmean={gmean*1000.}, mintime={mintime*1000.}, 1x1 total={total_time} ms")
/*for t_out@i <- temp_outputs {
println(f"temp output #{i}: name='{t_out.0}', shape={t_out.1.shape}")
}
5 changes: 3 additions & 2 deletions lib/NN/Ast.fx
@@ -701,7 +701,8 @@ fun graph2str(net: nnet_t, graph: nngraph_t, indent: string)
val prog_indent = new_indent + " "
val inpstrs = [for a <- inpargs {net.args[a].name}]
val outstrs = [for a <- outargs {net.args[a].name}]
- val prog = [for op <- prog {op2str(net, op, prog_indent)}]
+ val prog = [for op@i <- prog {
+     f"{indent}// op #{i}\n{prog_indent}" + op2str(net, op, prog_indent)}]
join_embrace(f"graph {{\n{new_indent}inputs={inpstrs},\n\
{new_indent}outputs={outstrs},\n{new_indent}prog={{\n{prog_indent}",
f"\n{new_indent}}}\n{indent}}}",
@@ -807,7 +808,7 @@ fun nnop_t.get_inputs_outputs(): (int [], int []) = match self
| NN_Unsqueeze {t_inp, t_axes, t_out} => ([t_inp, t_axes], [t_out])
}

- fun op2str(net: nnet_t, op: nnop_t, indent: string)
+ fun op2str(net: nnet_t, op: nnop_t, indent: string): string
{
val sub_indent = indent + " "
//println(f"dumping op={op.name()}")
2 changes: 2 additions & 0 deletions lib/NN/BufferAllocator.fx
@@ -95,6 +95,8 @@ fun assign_buffers(net: Ast.nnet_t)
| Some(argidx) => (true, argidx)
| _ => (false, -1)
}
+ | Ast.NN_Conv {t_passby} when t_passby > 0 && usecounts[t_passby] == 1 =>
+     (true, t_passby)
| _ => (false, -1)
}
//println(f"name={op.name()}, inplace={inplace_op}, inps={[::for i<-inps {net.args[i].name}]}, outs={[::for i<-outs {net.args[i].name}]}")
2 changes: 2 additions & 0 deletions lib/NN/Inference.fx
@@ -14,6 +14,7 @@ fun run(net: Ast.nnet_t, inputs: (string, Ast.nntensor_t) []/*,
(string, Ast.nntensor_t) []
{
var empty_names = true
+ OpConv.reset_total_time_1x1()

// assign input tensors
for (inpname, t)@i <- inputs {
@@ -40,6 +41,7 @@ fun run(net: Ast.nnet_t, inputs: (string, Ast.nntensor_t) []/*,

//println("running main graph")
run_graph(net, net.graph, outputs)
+ OpConv.update_total_time_1x1()

// collect outputs
[for argidx <- net.graph.outargs {
(Diffs for the remaining changed files did not load on this page.)
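
The NN.OpConv timing helpers called in classify_img.fx and Inference.fx above (reset_total_time_1x1, reset_min_total_time_1x1, update_total_time_1x1, get_total_time_1x1) are defined in the part of the diff that did not load, so their implementation is not shown. Below is a minimal sketch of one plausible reading of those calls; every name and detail beyond the four functions above is an assumption (in particular, Sys.tick_count() is assumed to return the counter that Sys.tick_frequency() scales), not the actual OpConv code:

```
// Hypothetical sketch, not the actual lib/NN/OpConv.fx implementation.
var total_time_1x1 = 0        // ticks spent in 1x1 convolutions during one run()
var min_total_time_1x1 = 0    // smallest per-run total observed so far

fun reset_total_time_1x1() { total_time_1x1 = 0 }           // called at the start of Inference.run()
fun reset_min_total_time_1x1() { min_total_time_1x1 = 0 }   // called once before the benchmark loop
fun update_total_time_1x1() {                                // called at the end of Inference.run()
    if min_total_time_1x1 == 0 || total_time_1x1 < min_total_time_1x1 {
        min_total_time_1x1 = total_time_1x1
    }
}
fun get_total_time_1x1() = min_total_time_1x1                // read after the benchmark loop

// Inside the 1x1 convolution kernel, the accumulation would then look like:
//   val t0 = Sys.tick_count()          // assumed API, see note above
//   ... the actual convolution ...
//   total_time_1x1 += Sys.tick_count() - t0
```

Under this reading, the "1x1 total" printed by classify_img.fx is the accumulated 1x1-convolution time of the fastest of the 30 benchmark iterations, converted to milliseconds via Sys.tick_frequency().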
