Skip to content

Commit

Permalink
nv get classes based on device (tinygrad#5325)
Browse files Browse the repository at this point in the history
* nv get classes

* support in mockgpu

* choose sm based on gpu

* fix

* fix

* fix arch
  • Loading branch information
nimlgen authored Jul 8, 2024
1 parent 7d049fc commit 51d6f37
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 8 deletions.
15 changes: 14 additions & 1 deletion extra/mockgpu/nv/nvdriver.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,22 @@ def rm_alloc(self, argp):
def rm_control(self, argp):
struct = nv_gpu.NVOS54_PARAMETERS.from_address(argp)
params_ptr = struct.params if struct.params else None
if struct.cmd == nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2:
if struct.cmd == nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2:
params:Any = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS.from_address(params_ptr)
params.deviceInstance = params.gpuId # emulate them to be the same
elif struct.cmd == nv_gpu.NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2:
params = nv_gpu.NV0080_CTRL_GPU_GET_CLASSLIST_V2_PARAMS.from_address(params_ptr)
classes = [50021, 51607, 51648, 50543, 51125, 51125, 51125, 51125, 50529, 36967, 36909, 37105, 33868, 36978, 37095, 37094, 36980, 37014, 49270,
41068, 41088, 41280, 50025, 96, 112, 115, 125, 20608, 20640, 20539, 20540, 41089, 41092, 50034, 50810, 50811, 50814, 51056, 51057,
51059, 51069, 51071, 51632, 51639, 51639, 51706, 52019, 222, 50287, 50273, 50031, 50017] # from ada102
params.numClasses = len(classes)
for i,c in enumerate(classes): params.classList[i] = c
elif struct.cmd == nv_gpu.NV2080_CTRL_CMD_GR_GET_INFO:
info = {nv_gpu.NV2080_CTRL_GR_INFO_INDEX_SM_VERSION: nv_gpu.NV2080_CTRL_GR_INFO_SM_VERSION_3_5}

params = nv_gpu.NV2080_CTRL_GR_GET_INFO_PARAMS.from_address(params_ptr)
reqlist = (nv_gpu.NV2080_CTRL_GR_INFO * params.grInfoListSize).from_address(params.grInfoList)
for i in range(params.grInfoListSize): reqlist[i].data = info[reqlist[i].index]
elif struct.cmd == nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO:
assert struct.hObject in self.object_by_handle and isinstance(self.object_by_handle[struct.hObject], NVSubDevice)
gpu = self.object_by_handle[struct.hObject].device
Expand Down
19 changes: 12 additions & 7 deletions tinygrad/runtime/ops_nv.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,11 @@ def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
return res_va

def _setup_nvclasses(self):
  """Query the driver for the classes this GPU supports and pick the compute class.

  Populates self.nvclasses with the class ids reported by the
  NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2 control call, then selects
  self.compute_class as the first supported entry of the preference
  order (Ada first, then Ampere).
  """
  cls_params = rmctrl.gpu_get_classlist_v2(self.fd_ctl, self.root, self.device)
  self.nvclasses = set()
  for idx in range(cls_params.numClasses): self.nvclasses.add(cls_params.classList[idx])
  # Prefer the newer Ada compute class, fall back to Ampere; raises StopIteration
  # (as the original did) if the GPU reports neither.
  self.compute_class = next(cc for cc in (nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B) if cc in self.nvclasses)

def __init__(self, device:str=""):
if NVDevice.root is None:
NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
Expand All @@ -486,23 +491,21 @@ def __init__(self, device:str=""):
NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*64)()
nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)

# TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
self.device_id = int(device.split(":")[1]) if ":" in device else 0
self.fd_dev = self._new_gpu_fd()

assert NVDevice.gpus_info[self.device_id].valid, f"No valid device found for NV:{self.device_id}. Requesting more devices than the system has?"
gpu_info = rmctrl.gpu_get_id_info_v2(self.fd_ctl, self.root, self.root, gpuId=NVDevice.gpus_info[self.device_id].gpu_id)

device_id = NVDevice.gpus_info[self.device_id].pci_info.device_id
self.compute_type = nv_gpu.AMPERE_COMPUTE_B if device_id in [0x2204, 0x2206] else nv_gpu.ADA_COMPUTE_A

device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=gpu_info.deviceInstance, hClientShare=self.root,
vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I")

self._setup_nvclasses()

rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
(nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))

Expand Down Expand Up @@ -547,7 +550,9 @@ def __init__(self, device:str=""):
self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True)
self.kernargs_ptr: int = self.kernargs_page.va_addr

self.arch: str = "sm_89" if not MOCKGPU else "sm_35" # TODO: fix
sm_info = nv_gpu.NV2080_CTRL_GR_INFO(index=nv_gpu.NV2080_CTRL_GR_INFO_INDEX_SM_VERSION)
rmctrl.gr_get_info(self.fd_ctl, self.root, self.subdevice, grInfoListSize=1, grInfoList=ctypes.addressof(sm_info))
self.arch: str = f"sm_{(sm_info.data>>8)&0xff}{(val>>4) if (val:=sm_info.data&0xff) > 0xf else val}"

compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
Expand Down Expand Up @@ -597,7 +602,7 @@ def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=
gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
rm_alloc(self.fd_ctl, self.compute_type, self.root, gpfifo, None)
rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None)
rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)

ws_token_params = rmctrl.gpfifo_get_work_submit_token(self.fd_ctl, self.root, gpfifo, workSubmitToken=-1)
Expand All @@ -615,7 +620,7 @@ def _cmdq_setup_compute_gpfifo(self):
self.shared_mem_window, self.local_mem_window, self.slm_per_thread = 0xfe000000, 0xff000000, 0

queue = HWComputeQueue()
queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), self.compute_type]
queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), self.compute_class]
queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *nvdata64(self.local_mem_window)]
queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *nvdata64(self.shared_mem_window)]
queue.signal(self.timeline_signal, self.timeline_value).submit(self)
Expand Down

0 comments on commit 51d6f37

Please sign in to comment.