
Commit e815c57
use hcq_profile in nv/amd program (tinygrad#5344)
nimlgen authored Jul 9, 2024
1 parent bee96a1 commit e815c57
Showing 3 changed files with 28 additions and 29 deletions.
19 changes: 12 additions & 7 deletions tinygrad/device.py
@@ -187,12 +187,17 @@ def synchronize(self): pass # override this in your device
# **************** for HCQ Compatible Devices ****************

@contextlib.contextmanager
def hcq_profile(dev, queue_type, enabled, desc):
def hcq_profile(dev, enabled, desc, queue_type=None, queue=None):
st, en = (dev._alloc_signal(), dev._alloc_signal()) if enabled else (None, None)
if enabled: queue_type().timestamp(st).submit(dev)

if enabled and queue is not None: queue.timestamp(st)
elif enabled: queue_type().timestamp(st).submit(dev)

try: yield (st, en)
finally:
if enabled: queue_type().timestamp(en).submit(dev)
if enabled and queue is not None: queue.timestamp(en)
elif enabled: queue_type().timestamp(en).submit(dev)

if enabled and PROFILE: dev.sig_prof_records.append((st, en, desc, queue_type is dev.hw_copy_queue_t))

class HCQCompatCompiled(Compiled):
@@ -268,7 +273,7 @@ def __init__(self, device, batch_size=(2 << 20), batch_cnt=32):
def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes: raise NotImplementedError("need hcq compat alloc")

def copyin(self, dest: HCQCompatAllocRes, src: memoryview):
with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"CPU -> {self.device.dname}", enabled=PROFILE):
for i in range(0, src.nbytes, self.b[0].size):
self.b_next = (self.b_next + 1) % len(self.b)
self.device._wait_signal(self.device.timeline_signal, self.b_timeline[self.b_next])
@@ -287,7 +292,7 @@ def _get_temp_buf():
return (self.b[self.b_next].va_addr, self.b_next)
return None

with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE):
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"DISK -> {self.device.dname}", enabled=PROFILE):
for (batch_info, dst_off, src_off, copy_size) in src.device.allocator._copyout_sharded(src, size, _get_temp_buf, seg_len=self.b[0].size):
self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
.copy(dest.va_addr + dst_off, batch_info[0] + src_off, copy_size) \
@@ -298,7 +303,7 @@ def _get_temp_buf():
def copyout(self, dest:memoryview, src: HCQCompatAllocRes):
self.device.synchronize()

with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"{self.device.dname} -> CPU", enabled=PROFILE):
for i in range(0, dest.nbytes, self.b[0].size):
self.device.hw_copy_queue_t().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
.copy(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i)) \
@@ -311,7 +316,7 @@ def copyout(self, dest:memoryview, src: HCQCompatAllocRes):
def transfer(self, dest: HCQCompatAllocRes, src: HCQCompatAllocRes, sz: int, src_dev, dest_dev):
src_dev._gpu_map(dest)

with hcq_profile(self.device, self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
with hcq_profile(self.device, queue_type=self.device.hw_copy_queue_t, desc=f"{src_dev.dname} -> {dest_dev.dname}", enabled=PROFILE):
src_dev.hw_copy_queue_t().wait(src_dev.timeline_signal, src_dev.timeline_value - 1) \
.wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \
.copy(dest.va_addr, src.va_addr, sz) \
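Below is a minimal, self-contained sketch of the dual-mode context manager introduced in this diff. The FakeSignal/FakeQueue/FakeDev classes are stand-ins, not real tinygrad objects: pass queue_type and the profiler builds and submits its own timestamp queues around the block, or pass an existing queue and it only records start/end timestamps into it, leaving submission to the caller.

    import contextlib

    class FakeSignal:
        """Stand-in for a device timestamp signal."""
        pass

    class FakeQueue:
        """Stand-in for a hardware command queue; records the commands pushed into it."""
        def __init__(self): self.cmds = []
        def timestamp(self, sig): self.cmds.append(("timestamp", sig)); return self
        def submit(self, dev): dev.submitted.append(self); return self

    class FakeDev:
        """Stand-in for an HCQ-compatible device."""
        def __init__(self): self.submitted = []
        def _alloc_signal(self): return FakeSignal()

    @contextlib.contextmanager
    def profile(dev, enabled, desc, queue_type=None, queue=None):
        st, en = (dev._alloc_signal(), dev._alloc_signal()) if enabled else (None, None)
        if enabled and queue is not None: queue.timestamp(st)   # inline mode: record into the caller's queue
        elif enabled: queue_type().timestamp(st).submit(dev)    # standalone mode: build and submit our own queue
        try: yield (st, en)
        finally:
            if enabled and queue is not None: queue.timestamp(en)
            elif enabled: queue_type().timestamp(en).submit(dev)

    # Standalone mode: the profiler submits its own timestamp-only queues around the block,
    # as the allocator's copyin/copyout/transfer paths do.
    dev = FakeDev()
    with profile(dev, enabled=True, desc="copy", queue_type=FakeQueue):
        pass  # copy work would be enqueued and submitted here

    # Inline mode: timestamps land in the caller's queue and the caller submits once,
    # as the AMD/NV kernel-launch paths do.
    q = FakeQueue()
    with profile(dev, enabled=True, desc="kernel", queue=q) as (st, en):
        pass  # q.exec(...) would go here
    q.submit(dev)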
17 changes: 7 additions & 10 deletions tinygrad/runtime/ops_amd.py
@@ -2,7 +2,7 @@
from typing import Tuple, List, Any
import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time, array
from dataclasses import dataclass
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions, hcq_profile
from tinygrad.helpers import getenv, init_c_struct_t, to_mv, round_up, DEBUG, PROFILE, mv_address
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.runtime.driver.hip_comgr import compile_hip
@@ -365,20 +365,19 @@ def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tup
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])

sig_st, sig_en = (self.device._alloc_signal(), self.device._alloc_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
q = HWPM4Queue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()

with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
q.exec(self, self.device.kernargs_ptr, global_size, local_size)

q = HWPM4Queue()
q.wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
if wait or PROFILE: q.timestamp(sig_st)
q.exec(self, self.device.kernargs_ptr, global_size, local_size)
if wait or PROFILE: q.timestamp(sig_en)
q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)

self.device.timeline_value += 1
self.device.kernargs_ptr += self.kernargs_alloc_size

if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
if wait:
self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
if not PROFILE: self.device.signals_pool += [sig_st, sig_en]
return (sig_en.start_ts - sig_st.start_ts) / 1e8

class AMDAllocator(HCQCompatAllocator):
@@ -499,8 +498,6 @@ def __init__(self, device:str=""):
self._gpu_map(AMDDevice.event_page)
sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)

self.time_event_st, self.time_event_en = AMDDevice._alloc_signal(), AMDDevice._alloc_signal()

self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.kernargs_ptr = self.kernargs.va_addr

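For reference, a condensed sketch of the new AMD launch flow from the hunk above, using dev and prg as placeholders for self.device and the program object; it is a restatement of the diff, not runnable code. A single HWPM4Queue carries the wait, barrier, profiled exec, and timeline signal, while hcq_profile records the start/end timestamps inline into that queue.

    q = HWPM4Queue().wait(dev.timeline_signal, dev.timeline_value - 1).memory_barrier()
    with hcq_profile(dev, queue=q, desc=prg.name, enabled=wait or PROFILE) as (sig_st, sig_en):
        q.exec(prg, dev.kernargs_ptr, global_size, local_size)   # the profiled region
    q.signal(dev.timeline_signal, dev.timeline_value).submit(dev)
    dev.timeline_value += 1
    if wait:
        dev._wait_signal(dev.timeline_signal, dev.timeline_value - 1)
        elapsed = (sig_en.start_ts - sig_st.start_ts) / 1e8      # elapsed time from the two recorded timestamps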
21 changes: 9 additions & 12 deletions tinygrad/runtime/ops_nv.py
@@ -2,7 +2,7 @@
import os, ctypes, contextlib, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
from typing import Tuple, List, Any
from dataclasses import dataclass
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, Compiler, CompileError, BufferOptions, hcq_profile
from tinygrad.helpers import getenv, from_mv, mv_address, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod, PROFILE
from tinygrad.renderer.cstyle import NVRenderer
from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler, PTXCompiler, PTX
@@ -341,21 +341,20 @@ def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tup
if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.va_addr)] + list(vals)

sig_st, sig_en = (self.device._alloc_signal(), self.device._alloc_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)
q = HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1) \
.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)

with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
q.exec(self, self.device.kernargs_ptr, global_size, local_size)

q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)

queue = HWComputeQueue()
queue.wait(self.device.timeline_signal, self.device.timeline_value - 1)
if wait or PROFILE: queue.timestamp(sig_st)
queue.copy_from_cpu(self.device.kernargs_ptr, self.constbuffer_0 + kernargs)
queue.exec(self, self.device.kernargs_ptr, global_size, local_size)
if wait or PROFILE: queue.timestamp(sig_en)
queue.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
self.device.timeline_value += 1
self.device.kernargs_ptr += self.kernargs_alloc_size

if PROFILE: self.device.sig_prof_records.append((sig_st, sig_en, self.name, False))
if wait:
self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
if not PROFILE: self.device.signals_pool += [sig_st, sig_en]
return (sig_en[1] - sig_st[1]) / 1e9

class NVAllocator(HCQCompatAllocator):
@@ -544,8 +543,6 @@ def __init__(self, device:str=""):

rmctrl.gpfifo_schedule(self.fd_ctl, self.root, channel_group, bEnable=1)

self.time_event_st, self.time_event_en = NVDevice._alloc_signal(), NVDevice._alloc_signal()

self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
self.cmdq_wptr: int = 0 # in bytes