From bee96a19ff1eabbafeb41770a5e3563cef28caec Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Tue, 9 Jul 2024 15:24:56 +0300
Subject: [PATCH] fuzz uop schedules (#5345)

* basic blocks + cleanups

* fixups

* elif is better for future me

* fuzz_schedule_max_paths

* fix linter
---
 test/external/fuzz_schedule.py |  7 +--
 test/external/fuzz_uops.py     | 78 ++++++++++++++--------------------
 tinygrad/codegen/uops.py       | 18 ++++----
 tinygrad/engine/realize.py     |  2 +-
 4 files changed, 47 insertions(+), 58 deletions(-)

diff --git a/test/external/fuzz_schedule.py b/test/external/fuzz_schedule.py
index 05dc2a8b88b4c..bd419797e4d87 100644
--- a/test/external/fuzz_schedule.py
+++ b/test/external/fuzz_schedule.py
@@ -1,6 +1,6 @@
 import itertools
 import numpy as np
-from typing import DefaultDict, Dict, List, Set, Tuple, TypeVar
+from typing import DefaultDict, Dict, List, Set, Tuple, TypeVar, Union
 from tinygrad.device import Buffer
 from tinygrad.engine.realize import CustomOp, capturing, lower_schedule_item
 from tinygrad.helpers import DEBUG, MULTIOUTPUT, colored, getenv
@@ -10,6 +10,7 @@
 from tinygrad.tensor import Tensor, _to_np_dtype
 
 ctx_vars = { MULTIOUTPUT: (0, 1) }
+FUZZ_SCHEDULE_MAX_PATHS = getenv("FUZZ_SCHEDULE_MAX_PATHS", 10)
 
 def fuzz_schedule(outs:List[LazyBuffer]):
   # find toposorts across all tunable params
@@ -73,7 +74,7 @@ def _exec_si(si:ScheduleItem, seed:int):
   ei.run()
 
 T = TypeVar("T")
-def find_all_toposorts(graph:DefaultDict[T, List[T]], in_degree:DefaultDict[T, int]) -> List[Tuple[T, ...]]:
+def find_all_toposorts(graph:DefaultDict[T, List[T]], in_degree:Union[DefaultDict[T, int], Dict[T, int]]) -> List[Tuple[T, ...]]:
   visited: Set[T] = set()
   ret: List[Tuple[T, ...]] = []
   path: List[T] = []
@@ -85,7 +86,7 @@ def recurse_paths(path:List[T]):
       path.append(v)
       visited.add(v)
       recurse_paths(path)
-      if len(ret) >= getenv("FUZZ_SCHEDULE_MAX_PATHS", 10): return
+      if len(ret) >= FUZZ_SCHEDULE_MAX_PATHS: return
       # backtrack
       for u in graph[v]: in_degree[u] += 1
       path.pop()
diff --git a/test/external/fuzz_uops.py b/test/external/fuzz_uops.py
index 21293790076bf..a34cf7295a4ef 100644
--- a/test/external/fuzz_uops.py
+++ b/test/external/fuzz_uops.py
@@ -1,40 +1,55 @@
+import itertools
+from collections import defaultdict
 import numpy as np
 from dataclasses import replace
-from typing import Dict, List, Set, Tuple
-from tinygrad.codegen.uops import UOp, UOpGraph, UOps
+from typing import DefaultDict, Dict, List, Tuple
+from tinygrad.codegen.uops import END_FOR_UOP, UOp, UOpGraph
 from tinygrad.device import Buffer, Device
 from tinygrad.engine.realize import CompiledRunner
-from tinygrad.helpers import DEBUG, colored, getenv
+from tinygrad.helpers import DEBUG, colored
 from tinygrad.shape.symbolic import Variable
 from tinygrad.tensor import _to_np_dtype
+from test.external.fuzz_schedule import FUZZ_SCHEDULE_MAX_PATHS, find_all_toposorts
 
-def fuzz_uops(graph:Dict[UOp, List[UOp]], in_degree:Dict[UOp, int], loops_children:Dict[UOp, Set[UOp]]):
-  paths: List[List[UOp]] = []
-  # TODO: express DEFINE_ACC and loop children conditions in the graph, builtin.
-  for p in find_all_toposorts(graph, in_degree):
-    assert p[-1].op is UOps.SINK, f"didn't end with SINK, ended with {p[-1]}"
-    paths.append(path:=list(p[:-1]))
-    for u in path:
-      if u.op is UOps.IF: path.append(UOp(UOps.ENDIF, None, (u,)))
-      if u.op is UOps.RANGE:
-        path.insert(max(path.index(x) for x in loops_children[u] if x in path)+1, UOp(UOps.ENDRANGE, None, (u,)))
-  return paths
+def fuzz_uops(uops:UOpGraph) -> List[Tuple[UOp, ...]]:
+  blocks: List[List[UOp]] = [[]]
+  for u in uops:
+    if u.op in END_FOR_UOP: blocks.append([u])
+    elif u.op in {x[1] for x in END_FOR_UOP.values()}: blocks.extend([[u], []])
+    else: blocks[-1].append(u)
+
+  paths_for_block: Dict[int, List[Tuple[UOp, ...]]] = {}
+  for bi, bb in enumerate(blocks):
+    children: DefaultDict[UOp, List[UOp]] = defaultdict(list)
+    in_degree: Dict[UOp, int] = {}
+    for u in bb:
+      in_degree[u] = 0
+      for x in u.src:
+        if x in bb:
+          children[x].append(u)
+          in_degree[u] += 1
+    paths_for_block[bi] = find_all_toposorts(children, in_degree)
+  paths: Dict[Tuple[UOp, ...], None] = {}
+  for up in itertools.product(*paths_for_block.values()):
+    paths[tuple(uop for path in up for uop in path)] = None
+    if len(paths) >= FUZZ_SCHEDULE_MAX_PATHS: break
+  return list(paths)
 
 class UOpsFuzzerRunner(CompiledRunner):
   def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=False):
-    assert self.p.uops is not None and len(self.p.uops.fuzz_paths) >= 1
+    assert self.p.uops is not None and len(self.p.uops._fuzz_paths) >= 1
     init_rawbufs, init_name = {x:x.as_buffer() for x in rawbufs}, self.p.function_name
     init_globals = {i[0]:buf for i, buf in zip(self.p.globals, rawbufs)}
-    if DEBUG >= 1: print(colored(f"fuzzing {len(self.p.uops.fuzz_paths)} UOps permutations for {init_name}", "yellow"))
+    if DEBUG >= 1: print(colored(f"fuzzing {len(self.p.uops._fuzz_paths)} uop permutations for {init_name}", "yellow"))
 
     super().__call__(rawbufs, var_vals, wait)
     ground_truth = {x:np.frombuffer(x.as_buffer(), _to_np_dtype(x.dtype)) for x in rawbufs}
 
-    for i, path in enumerate(self.p.uops.fuzz_paths):
+    for i, path in enumerate(self.p.uops._fuzz_paths):
       # setup prg
       uops = UOpGraph([])
       uops._uops = list(path)
-      if DEBUG >= 6: uops.print()
+      if DEBUG >= 5: uops.print()
       self.p = replace(self.p, name=(name:=f"{init_name}fuzz{i}"), src=Device[self.p.dname].renderer.render(name, uops), uops=uops)
       if DEBUG >= 4: print(self.p.src)
       self.lib = Device[self.p.dname].compiler.compile_cached(self.p.src)
@@ -49,30 +64,3 @@ def __call__(self, rawbufs:List[Buffer], var_vals:Dict[Variable, int], wait=Fals
         except AssertionError as e:
           print(colored(name, "red"))
           raise e
-
-def find_all_toposorts(graph:Dict[UOp, List[UOp]], in_degree:Dict[UOp, int]) -> List[Tuple[UOp, ...]]:
-  visited: Set[UOp] = set()
-  ret: List[Tuple[UOp, ...]] = []
-  path: List[UOp] = []
-
-  def recurse_paths(path:List[UOp]):
-    for v, d in in_degree.items():
-      if d != 0 or v in visited: continue
-      if v.op is UOps.DEFINE_ACC and any(l not in path for l in v.src): continue
-      for u in graph[v]: in_degree[u] -= 1
-      if v.op is UOps.DEFINE_ACC: path.insert(min(path.index(l) for l in v.src), v)
-      else: path.append(v)
-      visited.add(v)
-      recurse_paths(path)
-      if len(ret) >= getenv("FUZZ_UOPS_MAX_PATHS", 10): return
-      # backtrack
-      for u in graph[v]: in_degree[u] += 1
-      path.pop()
-      visited.remove(v)
-    if len(path) == len(in_degree): ret.append(tuple(path))
-  recurse_paths(path)
-
-  if len(ret) == 0: raise RuntimeError("detected cycle in the graph")
-  # verify all paths are unique
-  assert len(ret) == len(set(ret))
-  return ret
diff --git a/tinygrad/codegen/uops.py b/tinygrad/codegen/uops.py
index 3f9de86a916a5..a7ecebf6c31fd 100644
--- a/tinygrad/codegen/uops.py
+++ b/tinygrad/codegen/uops.py
@@ -26,6 +26,8 @@ class UOps(Enum):
   # these two are not graph nodes
   ENDRANGE = auto(); ENDIF = auto() # noqa: E702
 
+END_FOR_UOP = {UOps.IF:(UOps.STORE, UOps.ENDIF), UOps.RANGE:(UOps.PHI, UOps.ENDRANGE)}
+
 def ufix(dtype: Optional[DType], x): return UOp.const(dtype, x) if not isinstance(x, UOp) else x
 @dataclass(frozen=True, eq=False)
 class UOp:
@@ -368,9 +370,9 @@ def vars(self) -> List[Variable]: return sorted([x.arg for x in self.uops if x.o
   def globals(self) -> List[Tuple[int, bool]]: return [x.arg for x in self.uops if x.op is UOps.DEFINE_GLOBAL]
 
   @property
-  def uops(self):
+  def uops(self) -> List[UOp]:
     if self._uops is None: self.linearize()
-    return self._uops
+    return cast(List[UOp], self._uops)
 
   def graph(self):
     from tinygrad.engine.graph import graph_uops
@@ -412,8 +414,7 @@ def get_recursive_children(x:UOp, end:UOps, include_self=False) -> Set[UOp]:
       return set.union(set((x,)) if include_self else set(), *([get_recursive_children(u, end, True) for u in children[x] if x.op is not end]))
 
     # scope children impact the toposort and END* insertion
-    end_for_uop = {UOps.IF:(UOps.STORE, UOps.ENDIF), UOps.RANGE:(UOps.PHI, UOps.ENDRANGE)}
-    scope_children = {p:get_recursive_children(p, end_for_uop[p.op][0]) for p in reversed(in_degree) if p.op in end_for_uop}
+    scope_children = {p:get_recursive_children(p, END_FOR_UOP[p.op][0]) for p in reversed(in_degree) if p.op in END_FOR_UOP}
 
     queue:List[Tuple[int, UOp]] = []
     def push(u:UOp):
@@ -426,10 +427,6 @@ def push(u:UOp):
     for u in children:
       if in_degree[u] == 0: push(u)
 
-    if getenv("FUZZ_UOPS", 0):
-      from test.external.fuzz_uops import fuzz_uops
-      self.fuzz_paths = fuzz_uops(children, in_degree.copy(), scope_children)
-
     self._uops = []
     while queue:
       p,x = heapq.heappop(queue)
@@ -443,11 +440,14 @@ def push(u:UOp):
         if in_degree[u] == 0: push(u)
 
     for u in (self._uops):
-      if u.op in end_for_uop: self._uops.insert(max([self._uops.index(l) for l in scope_children[u]])+1, UOp(end_for_uop[u.op][1], None, (u,)))
+      if u.op in END_FOR_UOP: self._uops.insert(max([self._uops.index(l) for l in scope_children[u]])+1, UOp(END_FOR_UOP[u.op][1], None, (u,)))
 
     assert self._uops[-1].op is UOps.SINK, f"didn't end with SINK, ended with {self._uops[-1]}"
     self._uops = self._uops[:-1]
 
+    if getenv("FUZZ_UOPS"):
+      from test.external.fuzz_uops import fuzz_uops
+      self._fuzz_paths = fuzz_uops(self)
     if do_type_verify: type_verify(self.uops)
 
   # *** checker functions ***
diff --git a/tinygrad/engine/realize.py b/tinygrad/engine/realize.py
index 68670bcb292f1..829d368092e63 100644
--- a/tinygrad/engine/realize.py
+++ b/tinygrad/engine/realize.py
@@ -136,7 +136,7 @@ def get_runner(dname:str, ast:Tuple[LazyOp, ...]) -> CompiledRunner:
     method_cache[ckey] = ret = CompiledRunner(replace(bret.p, dname=dname), bret.lib)
   else:
     prg: Program = get_linearizer(Device[dname].renderer, ast).to_program()
-    if hasattr(prg.uops, "fuzz_paths"):
+    if hasattr(prg.uops, "_fuzz_paths"):
       from test.external.fuzz_uops import UOpsFuzzerRunner
       return UOpsFuzzerRunner(replace(prg, dname=dname))
     method_cache[ckey] = method_cache[bkey] = ret = CompiledRunner(replace(prg, dname=dname))