add update_copy to hcq spec (tinygrad#5348)
* add update_copy to hcq spec

* fix amd
nimlgen authored Jul 9, 2024
1 parent 9504db1 commit 1678199
Showing 5 changed files with 68 additions and 20 deletions.
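
The new call mirrors the existing update_wait/update_signal/update_exec spec: record a copy command once (placeholder addresses are fine), then patch its dest/src in place and resubmit without rebuilding the queue. A minimal sketch of the pattern, assuming an HCQ device dev, a byte count nbytes, and a hypothetical list transfers of (dest, src) va_addr pairs, following the tests below:

    q = dev.hw_copy_queue_t().wait(dev.timeline_signal, dev.timeline_value - 1) \
                             .copy(0x0, 0x0, nbytes) \
                             .signal(dev.timeline_signal, dev.timeline_value)

    for dest_addr, src_addr in transfers:
      # cmd_idx 1 is the copy: the second command recorded on the queue
      q.update_wait(0, value=dev.timeline_value - 1) \
       .update_copy(1, dest=dest_addr, src=src_addr) \
       .update_signal(2, value=dev.timeline_value) \
       .submit(dev)
      dev._wait_signal(dev.timeline_signal, dev.timeline_value)
      dev.timeline_value += 1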
55 changes: 47 additions & 8 deletions test/test_hcq.py
@@ -144,22 +144,61 @@ def test_exec_update(self):
# Test copy
def test_copy(self):
TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8) \
.copy(TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr, 8) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)

TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1

assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1]) == 1.0, f"got val {val}"

def test_copy_long(self):
sz = 64 << 20
buf1 = Buffer(Device.DEFAULT, sz, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated()
buf2 = Buffer(Device.DEFAULT, sz, dtypes.int8, options=BufferOptions(host=True, nolru=True)).ensure_allocated()
ctypes.memset(buf2._buf.va_addr, 1, sz)

TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(buf1._buf.va_addr, buf2._buf.va_addr, sz) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)

TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1

assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1]) == 0.0, f"got val {val}"
mv_buf1 = buf1.as_buffer().cast('Q')
for i in range(sz//8): assert mv_buf1[i] == 0x0101010101010101, f"offset {i*8} differs, not all copied, got {hex(mv_buf1[i])}"

def test_copy_100_times(self):
def test_update_copy(self):
q = TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8) \
.copy(0x0, 0x0, 8) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)

for _ in range(100):
q.update_wait(0, value=TestHCQ.d0.timeline_value - 1).update_signal(2, value=TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
q.submit(TestHCQ.d0)
TestHCQ.d0.timeline_value += 1
q.update_copy(1, dest=TestHCQ.b.lazydata.buffer._buf.va_addr, src=TestHCQ.a.lazydata.buffer._buf.va_addr) \
.submit(TestHCQ.d0)

TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1

assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1]) == 1.0, f"got val {val}"

def test_update_copy_long(self):
sz = 64 << 20
buf1 = Buffer(Device.DEFAULT, sz, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated()
buf2 = Buffer(Device.DEFAULT, sz, dtypes.int8, options=BufferOptions(host=True, nolru=True)).ensure_allocated()
ctypes.memset(buf2._buf.va_addr, 1, sz)

q = TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
.copy(0x0, 0x0, sz) \
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)

q.update_copy(1, buf1._buf.va_addr, buf2._buf.va_addr) \
.submit(TestHCQ.d0)

TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1

mv_buf1 = buf1.as_buffer().cast('Q')
for i in range(sz//8): assert mv_buf1[i] == 0x0101010101010101, f"offset {i*8} differs, not all copied, got {hex(mv_buf1[i])}"

# Test bind api
def test_bind(self):
3 changes: 0 additions & 3 deletions test/test_multitensor.py
@@ -165,7 +165,6 @@ def test_allreduce_ring(self):
a,b = _test_allreduce(Tensor.rand(256, 256))
np.testing.assert_almost_equal(a.numpy(), b.numpy(), decimal=5)

@unittest.skipIf(Device.DEFAULT in ("NV", "AMD"), "not supported in HCQ #4817")
def test_copy_jit(self):
@TinyJit
def copy_tensor(x:Tensor): return (x.to(f"{x.device.split(':')[0]}:1") + 1)
@@ -174,15 +173,13 @@ def copy_tensor(x:Tensor): return (x.to(f"{x.device.split(':')[0]}:1") + 1)
x = copy_tensor(t)
np.testing.assert_equal((t+1).numpy(), x.numpy())

@unittest.skipIf(Device.DEFAULT in ("NV", "AMD"), "not supported in HCQ #4817")
def test_allreduce_naive_jit(self):
with Context(RING=0):
jit_allreduce = TinyJit(_test_allreduce)
for _ in range(5):
a,b = jit_allreduce(Tensor.rand(256, 256))
np.testing.assert_almost_equal(a.numpy(), b.numpy(), decimal=5)

@unittest.skipIf(Device.DEFAULT in ("NV", "AMD"), "not supported in HCQ #4817")
def test_allreduce_ring_jit(self):
with Context(RING=2):
jit_allreduce = TinyJit(_test_allreduce)
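
These three JIT tests were skipped on NV and AMD because HCQGraph could not update a recorded copy; with update_copy in the spec the skips come off. A sketch of the jitted cross-device copy they exercise (assumes a second device is present; Tensor and TinyJit come from tinygrad):

    import numpy as np
    from tinygrad import Tensor, TinyJit

    @TinyJit
    def copy_tensor(x:Tensor): return (x.to(f"{x.device.split(':')[0]}:1") + 1)

    for _ in range(5):
      t = Tensor.rand(256, 256).realize()
      x = copy_tensor(t)
      np.testing.assert_equal((t+1).numpy(), x.numpy())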
13 changes: 7 additions & 6 deletions tinygrad/runtime/graph/hcq.py
@@ -82,7 +82,7 @@ def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], va
self.last_ji[enqueue_queue] = j

# Build hardware queues.
self.exec_ptrs: Dict[int, Tuple[Any, int]] = {}
self.op_cmd_idx: Dict[int, Tuple[Any, int]] = {}
self.copy_to_devs: Dict[Compiled, Set[Compiled]] = {dev: set() for dev in self.devices}
self.kickoff_wait_cmds: Dict[Any, List] = {q: list() for q in list(self.comp_queues.values()) + list(self.copy_queues.values())}

@@ -101,14 +101,13 @@ def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], va
if prof_info: enqueue_queue.timestamp(prof_info[0])

# Encode main commands based on ji type.
if isinstance(ji.prg, CompiledRunner):
enqueue_queue.exec(ji.prg.clprg, self.kargs_addrs[j], *ji.prg.p.launch_dims(var_vals))
self.exec_ptrs[j] = (enqueue_queue, len(enqueue_queue) - 1)
if isinstance(ji.prg, CompiledRunner): enqueue_queue.exec(ji.prg.clprg, self.kargs_addrs[j], *ji.prg.p.launch_dims(var_vals))
elif isinstance(ji.prg, BufferXfer):
dest, src = [cast(Buffer, x) for x in ji.bufs[0:2]]
Device[src.device]._gpu_map(dest._buf) #type: ignore
enqueue_queue.copy(dest._buf.va_addr, src._buf.va_addr, dest.nbytes)
self.copy_to_devs[Device[dest.device]].add(Device[src.device])
self.op_cmd_idx[j] = (enqueue_queue, len(enqueue_queue) - 1)

if signal_val is not None: enqueue_queue.signal(signal, signal_val)

@@ -137,14 +136,16 @@ def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]
dev.raw_prof_records += [(dev._read_timestamp(st), dev._read_timestamp(en), desc, is_cp)]

# Update rawbuffers
for (j,i),input_idx in self.input_replace.items(): self.ji_args_bufs[j][i] = input_rawbuffers[input_idx]._buf.va_addr
for (j,i),input_idx in self.input_replace.items():
if j in self.ji_args_bufs: self.ji_args_bufs[j][i] = input_rawbuffers[input_idx]._buf.va_addr
else: self.op_cmd_idx[j][0].update_copy(self.op_cmd_idx[j][1], **{('dest' if i == 0 else 'src'): input_rawbuffers[input_idx]._buf.va_addr})

# Update var_vals
for j in self.jc_idx_with_updatable_var_vals:
for i,v in enumerate(cast(CompiledRunner, self.jit_cache[j].prg).p.vars): self.ji_args_vars[j][i] = var_vals[v]

for j in self.jc_idx_with_updatable_launch_dims:
queue, cmd_ptr = self.exec_ptrs[j]
queue, cmd_ptr = self.op_cmd_idx[j]
queue.update_exec(cmd_ptr, *cast(CompiledRunner, self.jit_cache[j].prg).p.launch_dims(var_vals))

for dev in self.devices:
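
At call time each input_replace entry now dispatches on what kind of command j recorded: a kernel patches its argument table, while a BufferXfer patches the recorded copy command. Buffer index i selects the keyword because ji.bufs[0] is the dest and ji.bufs[1] the src (see the __init__ hunk above). The dispatch, restated with comments (names as in the diff):

    for (j,i), input_idx in self.input_replace.items():
      new_addr = input_rawbuffers[input_idx]._buf.va_addr
      if j in self.ji_args_bufs:
        self.ji_args_bufs[j][i] = new_addr   # kernel: rewrite the pointer in the kargs table
      else:
        queue, cmd_idx = self.op_cmd_idx[j]  # copy: patch the command in the hardware queue
        queue.update_copy(cmd_idx, **{('dest' if i == 0 else 'src'): new_addr})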
12 changes: 9 additions & 3 deletions tinygrad/runtime/ops_amd.py
@@ -216,7 +216,7 @@ def submit(self, device: AMDDevice):
SDMA_MAX_COPY_SIZE = 0x400000
class HWCopyQueue(HWQueue):
def __init__(self):
self.internal_cmd_sizes = []
self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
super().__init__()

def _q(self, arr):
@@ -228,8 +228,8 @@ def copy(self, dest, src, copy_size):
self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])

copied = 0
copy_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
self.copy_cmds_per_copy[len(self)] = copy_commands
for _ in range(copy_commands):
step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)

@@ -243,6 +243,12 @@ def copy(self, dest, src, copy_size):

return self._mark_command_end()

def update_copy(self, cmd_idx, dest=None, src=None):
for i in range(self.copy_cmds_per_copy[cmd_idx]):
if src is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+8+i*7):sigoff+2] = array.array('I', [*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
if dest is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+10+i*7):sigoff+2] = array.array('I', [*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
return self

def signal(self, signal: hsa.amd_signal_t, value=0):
self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET), value])

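
One AMD copy() can emit several SDMA copy packets (SDMA_MAX_COPY_SIZE bytes each), which is why copy_cmds_per_copy records the packet count per cmd_idx. Reading the offsets above: the first copy packet starts 8 dwords past cmd_offsets[cmd_idx] (after the GCR flush packet), each packet is 7 dwords, and packet i carries its 64-bit src at dwords +8+i*7 and dest at +10+i*7. A small sketch restating that arithmetic under those layout assumptions:

    SDMA_MAX_COPY_SIZE = 0x400000

    def patched_dword_offsets(base, copy_size):
      # (src_off, dest_off) rewritten for each chunk of a recorded copy
      chunks = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
      return [(base + 8 + i*7, base + 10 + i*7) for i in range(chunks)]

    # the 64 MiB copy in test_update_copy_long splits into 16 chunks,
    # so update_copy rewrites 16 (src, dest) pairs in place
    assert len(patched_dword_offsets(0, 64 << 20)) == 16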
5 changes: 5 additions & 0 deletions tinygrad/runtime/ops_nv.py
@@ -212,6 +212,11 @@ def copy(self, dest, src, copy_size):
self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
return self._mark_command_end()

def update_copy(self, cmd_idx, dest=None, src=None):
if dest is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+3):sigoff+2] = array.array('I', [*nvdata64(dest)])
if src is not None: self.q[(sigoff:=self.cmd_offsets[cmd_idx]+1):sigoff+2] = array.array('I', [*nvdata64(src)])
return self

def signal(self, signal, value=0):
self.q += [nvmethod(4, nv_gpu.NVC6B5_SET_SEMAPHORE_A, 4), *nvdata64(ctypes.addressof(from_mv(signal))), value, 4]
self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x14]
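
On NV a copy is a single DMA launch regardless of size, so update_copy rewrites just one pair of 64-bit operands: with base = cmd_offsets[cmd_idx], src occupies dwords base+1..2 and dest base+3..4. That matches copy() opening with a 4-dword method write of src followed by dest; the opening lines of copy() sit outside this hunk, so the layout below is inferred from the offsets, not shown:

    # assumed dword layout of a recorded NV copy command (inferred):
    #   q[base+0]          nvmethod header covering the next 4 dwords
    #   q[base+1:base+3]   nvdata64(src)  <- patched when src is given
    #   q[base+3:base+5]   nvdata64(dest) <- patched when dest is given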