Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nv follows HCQCompatAllocRes protocol #5275

Merged
merged 2 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions tinygrad/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ def _wrap_timeline_signal(self):
self._set_signal(self.timeline_signal, 0)
cast(HCQCompatAllocator, self.allocator).b_timeline = [0] * len(cast(HCQCompatAllocator, self.allocator).b)

# used for copying and transferring, allocator implementations must have this set up.
class HCQCompatAllocRes(Protocol): va_addr: int; base: int; size: int; length: int # noqa: E702
# Protocol for hcq compatible allocators for allocated buffers to contain VA address and its size.
class HCQCompatAllocRes(Protocol): va_addr: int; size: int # noqa: E702

class HCQCompatAllocator(LRUAllocator): # pylint: disable=abstract-method
def __init__(self, device, batch_size=(2 << 20), batch_cnt=32):
Expand Down Expand Up @@ -325,4 +325,5 @@ def transfer(self, dest: HCQCompatAllocRes, src: HCQCompatAllocRes, sz: int, src
dest_dev.timeline_value += 1

def offset(self, buf, size:int, offset:int) -> HCQCompatAllocRes:
return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size, size=size)
return type(buf)(va_addr=buf.va_addr + offset, size=size, **{k:v for k,v in buf.__dict__.items() if k not in ['va_addr', 'size']},
**{x[0]:getattr(buf, x[0]) for x in getattr(buf, '_fields_', []) if x[0] not in ['va_addr', 'size']})
54 changes: 27 additions & 27 deletions tinygrad/runtime/ops_nv.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def update_wait(self, cmd_idx, signal=None, value=None):
def bind(self, device: NVDevice):
self.binded_device = device
self.hw_page = device._gpu_alloc(len(self.q) * 4, map_to_cpu=True)
hw_view = to_mv(self.hw_page.base, self.hw_page.length).cast("I")
hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
for i, value in enumerate(self.q): hw_view[i] = value

# From now on, the queue is on the device for faster submission.
Expand All @@ -128,15 +128,15 @@ def bind(self, device: NVDevice):
def _submit(self, dev, gpfifo:GPFifo):
if len(self.q) == 0: return

if dev == self.binded_device: cmdq_addr = self.hw_page.base
if dev == self.binded_device: cmdq_addr = self.hw_page.va_addr
else:
if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.length:
assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.base + len(self.q) * 4 or \
if dev.cmdq_wptr + len(self.q) * 4 > dev.cmdq_page.size:
assert (gpfifo.ring[gpfifo.controls.GPGet] & 0xFFFFFFFFFC) >= dev.cmdq_page.va_addr + len(self.q) * 4 or \
gpfifo.controls.GPGet == gpfifo.controls.GPPut, "cmdq overrun"
dev.cmdq_wptr = 0

dev.cmdq[dev.cmdq_wptr//4:dev.cmdq_wptr//4+len(self.q)] = array.array('I', self.q)
cmdq_addr = dev.cmdq_page.base+dev.cmdq_wptr
cmdq_addr = dev.cmdq_page.va_addr+dev.cmdq_wptr
dev.cmdq_wptr += len(self.q) * 4

gpfifo.ring[gpfifo.put_value % gpfifo.entries_count] = (cmdq_addr//4 << 2) | (len(self.q) << 42) | (1 << 41)
Expand Down Expand Up @@ -266,8 +266,8 @@ def __init__(self, device:NVDevice, name:str, lib:bytes):
shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program.nbytes>>8,
program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32, sass_version=0x89,
program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
program_address_lower=self.lib_gpu.va_addr&0xffffffff, program_address_upper=self.lib_gpu.va_addr>>32, sass_version=0x89,
program_prefetch_addr_lower_shifted=self.lib_gpu.va_addr>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.va_addr>>40,
constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)

# NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel.
Expand All @@ -282,31 +282,31 @@ def __init__(self, device:NVDevice, name:str, lib:bytes):

if self.rel_info is not None:
assert self.global_init is not None
global_init_addr = self.lib_gpu.base + off
global_init_addr = self.lib_gpu.va_addr + off
for rel_i in range(0, len(self.rel_info), 4):
if self.rel_info[rel_i+2] == 0x39: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr >> 32) # R_CUDA_ABS32_HI_32
elif self.rel_info[rel_i+2] == 0x38: self.program[self.rel_info[rel_i]//4 + 1] = (global_init_addr & 0xffffffff) # R_CUDA_ABS32_LO_32
else: raise RuntimeError(f"unknown reloc: {self.rel_info[rel_i+2]}")

HWComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).submit(self.device)
for st in range(0, len(self.program), 4095):
HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4095]).submit(self.device)
HWComputeQueue().copy_from_cpu(self.lib_gpu.va_addr+st*4, self.program[st:st+4095]).submit(self.device)

if self.global_init is not None:
HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.base + off), self.global_init).submit(self.device)
HWComputeQueue().copy_from_cpu(load_addr:=(self.lib_gpu.va_addr + off), self.global_init).submit(self.device)
off += round_up(self.global_init.nbytes, 128)
if 4 in constant_buffers_data: # >= 12.4
# Constbuffer 4 contains a pointer to nv.global.init, load section and set up the pointer.
assert constant_buffers_data[4].nbytes == 8
constant_buffers_data[4][0:2] = memoryview(struct.pack('Q', load_addr)).cast('I')

for i,data in constant_buffers_data.items():
self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.base + off) & 0xffffffff)
self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.va_addr + off) >> 32)
self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.va_addr + off) & 0xffffffff)
self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', data.nbytes)
self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)

HWComputeQueue().copy_from_cpu(self.lib_gpu.base + off, data).submit(self.device)
HWComputeQueue().copy_from_cpu(self.lib_gpu.va_addr + off, data).submit(self.device)
off += round_up(data.nbytes, 128)

HWComputeQueue().signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
Expand All @@ -321,12 +321,12 @@ def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tup
if any(cur > mx for cur,mx in zip(global_size, [2147483647, 65535, 65535])) or any(cur > mx for cur,mx in zip(local_size, [1024, 1024, 64])):
raise RuntimeError("Invalid global/local dims")

if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - self.kernargs_alloc_size):
self.device.kernargs_ptr = self.device.kernargs_page.base
if self.device.kernargs_ptr >= (self.device.kernargs_page.va_addr + self.device.kernargs_page.size - self.kernargs_alloc_size):
self.device.kernargs_ptr = self.device.kernargs_page.va_addr

# HACK: Save counts of args and vars to "unused" constbuffer for later extraction in mockgpu to pass into gpuocelot.
if MOCKGPU: self.constbuffer_0[0:2] = [len(args), len(vals)]
kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + list(vals)
kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.va_addr)] + list(vals)

sig_st, sig_en = (self.device._alloc_signal(), self.device._alloc_signal()) if PROFILE else (self.device.time_event_st, self.device.time_event_en)

Expand Down Expand Up @@ -429,11 +429,11 @@ def _gpu_free(self, mem):
made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
if made.status != 0: raise RuntimeError(f"_gpu_free returned {made.status}")
uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)

def _gpu_host_free(self, mem):
uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
libc.munmap(mem.base, mem.length)
uvm.free(self.fd_uvm, base=mem.va_addr, length=mem.size)
libc.munmap(mem.va_addr, mem.size)

def _map_to_gpu(self, va_base, size):
NVDevice.host_object_enumerator += 1
Expand All @@ -457,7 +457,7 @@ def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True) -> nv_gpu.U
def _gpu_map(self, mem):
if self.gpu_uuid in getattr(mem, "mapped_gpu_ids", []): return
mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_uuid])
return self._gpu_uvm_map(mem.base, mem.length, mem.hMemory, create_range=False)
return self._gpu_uvm_map(mem.va_addr, mem.size, mem.hMemory, create_range=False)

def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
Expand Down Expand Up @@ -513,7 +513,7 @@ def __init__(self, device:str=""):

if NVDevice.signals_page is None:
NVDevice.signals_page = self._gpu_system_alloc(16 * 65536, map_to_cpu=True)
NVDevice.signals_pool = [to_mv(self.signals_page.base + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.length, 16)]
NVDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, NVDevice.signals_page.size, 16)]
else: self._gpu_map(NVDevice.signals_page)

channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
Expand All @@ -533,11 +533,11 @@ def __init__(self, device:str=""):
self.time_event_st, self.time_event_en = NVDevice._alloc_signal(), NVDevice._alloc_signal()

self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
self.cmdq: memoryview = to_mv(self.cmdq_page.base, 0x200000).cast("I")
self.cmdq: memoryview = to_mv(self.cmdq_page.va_addr, 0x200000).cast("I")
self.cmdq_wptr: int = 0 # in bytes

self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True)
self.kernargs_ptr: int = self.kernargs_page.base
self.kernargs_ptr: int = self.kernargs_page.va_addr

self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix

Expand Down Expand Up @@ -586,7 +586,7 @@ def synchronize(self):
def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400) -> GPFifo:
notifier = self._gpu_system_alloc(48 << 20)
params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo_area.hMemory,
gpFifoOffset=gpfifo_area.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
rm_alloc(self.fd_ctl, self.compute_type, self.root, gpfifo, None)
Expand All @@ -600,14 +600,14 @@ def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=
uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
hChannel=gpfifo, base=channel_base, length=0x4000000)

return GPFifo(ring=to_mv(gpfifo_area.base + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.base + offset + entries * 8))
return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken,
controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.va_addr + offset + entries * 8))

def _cmdq_setup_compute_gpfifo(self):
self.slm_per_thread = 0x900
bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True).base
self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True).va_addr

# Set windows addresses to not collide with other allocated buffers.
self.shared_mem_window, self.local_mem_window = 0xfe000000, 0xff000000
Expand Down