Map LevelZero GPUs to CPUs by PCI address
- Previous implementation assumed a fixed pattern for mapping GPUs to
  CPUs
- This change queries sysfs to get the CPUs local to each GPU
dannosliwcd authored and cmcantalupo committed Jan 4, 2025
1 parent 7451bb2 commit 11e85fe
Showing 14 changed files with 187 additions and 118 deletions.
4 changes: 4 additions & 0 deletions libgeopmd/include/geopm/Helper.hpp
@@ -219,6 +219,10 @@ namespace geopm
void GEOPM_PUBLIC
enable_fixed_counters(PlatformIO &pio);

/// Convert a Linux cpumask string (e.g., from a sysfs local_cpus file) to
/// an integer set of CPU indices that are present in the mask.
std::set<int> GEOPM_PUBLIC
linux_cpumask_buf_to_int_set(const std::string &cpumask_buf);
}

#endif
27 changes: 0 additions & 27 deletions libgeopmd/src/DrmGpuTopo.cpp
@@ -23,36 +23,9 @@
#include "geopm/Helper.hpp"
#include "geopm_topo.h"

static const int MAX_CPUS_PER_CPUMASK_SEGMENT = 32;
static const std::regex GPU_CARD_REGEX("^card(\\d+)$");
static const std::regex GPU_TILE_REGEX("^gt(\\d+)$");

static std::set<int> linux_cpumask_buf_to_int_set(const std::string &cpumask_buf)
{
// The expected bitmask format is "HEX,HEX,...,HEX", where commas separate
// 32-bit segments. Higher-ordered bits indicate higher CPU indices (i.e.
// LSB is CPU 0).
std::set<int> mapped_cpus;
int cpu_offset = 0;
auto hex_segments = geopm::string_split(cpumask_buf, ",");
for (auto it = hex_segments.rbegin(); it != hex_segments.rend(); ++it) {
auto bitmask_segment = std::stoull(*it, nullptr, 16);
if (bitmask_segment >> MAX_CPUS_PER_CPUMASK_SEGMENT) {
throw geopm::Exception("linux_cpumask_buf_to_int_set: malformed cpumask: " + cpumask_buf,
GEOPM_ERROR_RUNTIME, __FILE__, __LINE__);
}
int next_segment_cpu_offset = cpu_offset + MAX_CPUS_PER_CPUMASK_SEGMENT;
while (cpu_offset < next_segment_cpu_offset) {
if (bitmask_segment & 1) {
mapped_cpus.insert(cpu_offset);
}
bitmask_segment >>= 1;
cpu_offset += 1;
}
}
return mapped_cpus;
}

// Return the name of the driver that provides the given /sys/class/drm/card*/ device
static std::string drm_driver_name_from_card_path(const std::string &card_path)
{
26 changes: 26 additions & 0 deletions libgeopmd/src/Helper.cpp
@@ -37,6 +37,7 @@
namespace geopm
{
static const size_t NUMERIC_STRING_MAX = 255;
static const int MAX_CPUS_PER_CPUMASK_SEGMENT = 32;

std::string read_file(const std::string &path)
{
@@ -454,4 +455,29 @@ namespace geopm
pio.write_control("MSR::PERF_GLOBAL_OVF_CTRL:CLEAR_OVF_FIXED_CTR2", GEOPM_DOMAIN_BOARD, 0, 0);
}

std::set<int> linux_cpumask_buf_to_int_set(const std::string &cpumask_buf)
{
// The expected bitmask format is "HEX,HEX,...,HEX", where commas separate
// 32-bit segments. Higher-ordered bits indicate higher CPU indices (i.e.
// LSB is CPU 0).
std::set<int> mapped_cpus;
int cpu_offset = 0;
auto hex_segments = geopm::string_split(cpumask_buf, ",");
for (auto it = hex_segments.rbegin(); it != hex_segments.rend(); ++it) {
auto bitmask_segment = std::stoull(*it, nullptr, 16);
if (bitmask_segment >> MAX_CPUS_PER_CPUMASK_SEGMENT) {
throw geopm::Exception("linux_cpumask_buf_to_int_set: malformed cpumask: " + cpumask_buf,
GEOPM_ERROR_RUNTIME, __FILE__, __LINE__);
}
int next_segment_cpu_offset = cpu_offset + MAX_CPUS_PER_CPUMASK_SEGMENT;
while (cpu_offset < next_segment_cpu_offset) {
if (bitmask_segment & 1) {
mapped_cpus.insert(cpu_offset);
}
bitmask_segment >>= 1;
cpu_offset += 1;
}
}
return mapped_cpus;
}
}
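
A minimal usage sketch of the new helper (not part of this commit; the mask value is made up, and the libgeopmd headers are assumed to be available):

    #include <iostream>
    #include "geopm/Helper.hpp"

    int main()
    {
        // Two 32-bit segments: the low segment sets bits 0-7, the high
        // segment sets bit 0, so the mask decodes to CPUs 0-7 plus CPU 32.
        auto cpus = geopm::linux_cpumask_buf_to_int_set("00000001,000000ff");
        for (int cpu : cpus) {
            std::cout << cpu << " ";  // prints: 0 1 2 3 4 5 6 7 32
        }
        std::cout << "\n";
        return 0;
    }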
18 changes: 18 additions & 0 deletions libgeopmd/src/LevelZero.cpp
@@ -7,6 +7,7 @@
#include <unistd.h>
#include <string>
#include <iostream>
#include <sstream>
#include <map>
#include <cstdlib>

@@ -16,6 +17,10 @@

#include "LevelZeroImp.hpp"

// DBDF buffer: domain:bus:device.function plus NUL (e.g., "0000:00:00.0\0")
#define DBDF_BUFFER_SIZE (4 + 1 + 2 + 1 + 2 + 1 + 1 + 1)
#define DBDF_BUFFER_FORMAT ("%04x:%02x:%02x.%01x")

namespace geopm
{
static double convert_nan(double result)
@@ -1117,4 +1122,17 @@ namespace geopm
error, __FILE__, line);
}
}

std::string LevelZeroImp::pci_dbdf_address(unsigned int l0_device_idx) const
{
ze_pci_ext_properties_t pci_properties;
char dbdf_buffer[DBDF_BUFFER_SIZE];
check_ze_result(zeDevicePciGetPropertiesExt(m_devices.at(l0_device_idx).device_handle, &pci_properties),
GEOPM_ERROR_RUNTIME, "LevelZero::" + std::string(__func__) +
": failed to get PCI device properties.", __LINE__);
snprintf(dbdf_buffer, DBDF_BUFFER_SIZE, DBDF_BUFFER_FORMAT,
pci_properties.address.domain, pci_properties.address.bus,
pci_properties.address.device, pci_properties.address.function);
return dbdf_buffer;
}
}
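
As a sanity check on the buffer arithmetic above, a standalone sketch (not part of this commit; the bus number is made up) showing that the format string fills exactly DBDF_BUFFER_SIZE bytes including the terminator:

    #include <cstdio>

    int main()
    {
        // 4 + 1 + 2 + 1 + 2 + 1 + 1 + 1 = 13 bytes: "0000:00:00.0" is
        // 12 characters plus the terminating NUL.
        char dbdf_buffer[4 + 1 + 2 + 1 + 2 + 1 + 1 + 1];
        std::snprintf(dbdf_buffer, sizeof dbdf_buffer, "%04x:%02x:%02x.%01x",
                      0x0000u, 0x3au, 0x00u, 0x0u);  // hypothetical device on bus 0x3a
        std::printf("%s\n", dbdf_buffer);  // prints: 0000:3a:00.0
        return 0;
    }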
3 changes: 3 additions & 0 deletions libgeopmd/src/LevelZero.hpp
@@ -352,6 +352,9 @@ namespace geopm
/// @return Display Error Count
virtual double ras_display_errcount_uncorrectable(unsigned int l0_device_idx,
int l0_domain, int l0_domain_idx) const = 0;

// Return the formatted domain:bus:device.function PCI address of the given LevelZero device
virtual std::string pci_dbdf_address(unsigned int l0_device_idx) const = 0;
};

const LevelZero &levelzero();
21 changes: 19 additions & 2 deletions libgeopmd/src/LevelZeroDevicePool.cpp
@@ -509,7 +509,6 @@ namespace geopm
l0_domain, dev_subdev_idx_pair.second, setting);
}


// RAS Correctable Counters
double LevelZeroDevicePoolImp::ras_reset_count_correctable(int domain, unsigned int domain_idx,
int l0_domain) const
@@ -765,5 +764,23 @@ namespace geopm
dev_subdev_idx_pair.second);
}


std::string LevelZeroDevicePoolImp::pci_dbdf_address(int geopm_domain, unsigned int geopm_domain_idx) const
{
check_idx_range(geopm_domain, geopm_domain_idx);
if (geopm_domain == GEOPM_DOMAIN_GPU) {
// Get the address of the requested device
return m_levelzero.pci_dbdf_address(geopm_domain_idx);
}
else if (geopm_domain == GEOPM_DOMAIN_GPU_CHIP) {
// Get the address of the device containing the requested subdevice
auto dev_subdev_idx_pair = subdevice_device_conversion(geopm_domain_idx);
return m_levelzero.pci_dbdf_address(dev_subdev_idx_pair.first);
}
else {
throw Exception("LevelZeroDevicePool::" + std::string(__func__) +
": domain " + std::to_string(geopm_domain) +
" is not supported",
GEOPM_ERROR_INVALID, __FILE__, __LINE__);
}
}
}
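
A hypothetical caller (illustrative only; the helper function below is not part of this commit) showing how both supported domains resolve to a device address:

    #include <iostream>
    #include "LevelZeroDevicePool.hpp"
    #include "geopm_topo.h"

    // Print the PCI address backing each GPU and each GPU chip. Chips
    // report the address of their parent device, which the pool finds
    // through subdevice_device_conversion().
    void print_pci_addresses(const geopm::LevelZeroDevicePool &pool,
                             int num_gpu, int num_gpu_chip)
    {
        for (int gpu_idx = 0; gpu_idx < num_gpu; ++gpu_idx) {
            std::cout << "GPU " << gpu_idx << ": "
                      << pool.pci_dbdf_address(GEOPM_DOMAIN_GPU, gpu_idx) << "\n";
        }
        for (int chip_idx = 0; chip_idx < num_gpu_chip; ++chip_idx) {
            std::cout << "CHIP " << chip_idx << ": "
                      << pool.pci_dbdf_address(GEOPM_DOMAIN_GPU_CHIP, chip_idx) << "\n";
        }
    }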
4 changes: 4 additions & 0 deletions libgeopmd/src/LevelZeroDevicePool.hpp
@@ -312,6 +312,10 @@ namespace geopm
/// @return Display Error Count
virtual double ras_display_errcount_uncorrectable(int domain, unsigned int domain_idx,
int l0_domain) const = 0;

// Return the formatted domain:bus:device.function PCI address of the given
// GEOPM GPU or GPU chip.
virtual std::string pci_dbdf_address(int geopm_domain, unsigned int geopm_domain_idx) const = 0;
private:
};

1 change: 1 addition & 0 deletions libgeopmd/src/LevelZeroDevicePoolImp.hpp
@@ -96,6 +96,7 @@ namespace geopm
int l0_domain) const override;
double ras_display_errcount_uncorrectable(int domain, unsigned int domain_idx,
int l0_domain) const override;
std::string pci_dbdf_address(int geopm_domain, unsigned int geopm_domain_idx) const override;
private:
const LevelZero &m_levelzero;

47 changes: 17 additions & 30 deletions libgeopmd/src/LevelZeroGPUTopo.cpp
@@ -8,19 +8,23 @@
 #include <map>
 
 #include "geopm/Exception.hpp"
+#include "geopm/Helper.hpp"
 #include "LevelZeroDevicePool.hpp"
 #include "LevelZeroGPUTopo.hpp"
 
 namespace geopm
 {
+    static const std::string PCI_DEVICES_PATH = "/sys/bus/pci/devices";
+
     LevelZeroGPUTopo::LevelZeroGPUTopo()
-        : LevelZeroGPUTopo(levelzero_device_pool(), geopm_sched_num_cpu())
+        : LevelZeroGPUTopo(levelzero_device_pool(), PCI_DEVICES_PATH)
     {
     }
 
     LevelZeroGPUTopo::LevelZeroGPUTopo(const LevelZeroDevicePool &device_pool,
-                                       const int num_cpu)
+                                       const std::string &pci_devices_path)
         : m_levelzero_device_pool(device_pool)
+        , m_pci_devices_path(pci_devices_path)
     {
         if (getenv("ZE_AFFINITY_MASK") != nullptr) {
             throw Exception("LevelZeroGPUTopo: Refusing to create a topology cache file while ZE_AFFINITY_MASK environment variable is set",
@@ -35,37 +39,20 @@ namespace geopm
 #endif
         }
         else {
-            m_cpu_affinity_ideal.resize(num_gpu);
-            int num_cpu_per_gpu = num_cpu / num_gpu;
-
-            m_cpu_affinity_ideal_chip.resize(num_gpu_chip);
+            m_cpu_affinity_ideal_chip.reserve(num_gpu_chip);
             int num_chip_per_gpu = num_gpu_chip / num_gpu;
 
-            // TODO: Add ideal CPU to GPU affinitization that isn't a simple split if needed.
-            //       This may come from a call to oneAPI, LevelZero, etc
-            for (int gpu_idx = 0; gpu_idx < num_gpu; ++gpu_idx) {
-                size_t gpu_chip_index = gpu_idx * static_cast<size_t>(num_chip_per_gpu);
-                int end_cpu_idx = (gpu_idx + 1) * num_cpu_per_gpu;
-                for (int cpu_idx = gpu_idx * num_cpu_per_gpu, chip_idx = 0;
-                     cpu_idx < end_cpu_idx;
-                     ++cpu_idx) {
-                    m_cpu_affinity_ideal.at(gpu_idx).insert(cpu_idx);
+            for (int gpu_idx = 0; gpu_idx < num_gpu; ++gpu_idx) {
+                std::string pci_address = m_levelzero_device_pool.pci_dbdf_address(GEOPM_DOMAIN_GPU, gpu_idx);
+                std::string cpu_mask_path = m_pci_devices_path + "/" + pci_address + "/local_cpus";
+                auto cpu_mask_buf = geopm::read_file(cpu_mask_path);
+                auto cpu_set = linux_cpumask_buf_to_int_set(cpu_mask_buf);
 
-                    // CHIP to CPU association is currently only used to associate CHIPS to
-                    // GPUS. This logic just distributes the CPUs associated with
-                    // an GPU to its CHIPS in a round robin fashion.
-                    m_cpu_affinity_ideal_chip.at(gpu_chip_index +
-                                                 (chip_idx % num_chip_per_gpu)).insert(cpu_idx);
-                    ++chip_idx;
-                }
-            }
-            if ((num_cpu % num_gpu) != 0) {
-                for (int cpu_idx = num_cpu_per_gpu * num_gpu, gpu_idx = 0;
-                     cpu_idx < num_cpu; ++cpu_idx) {
-                    m_cpu_affinity_ideal.at(gpu_idx % num_gpu).insert(cpu_idx);
-                    size_t gpu_chip_index = gpu_idx * static_cast<size_t>(num_chip_per_gpu);
-                    m_cpu_affinity_ideal_chip.at(gpu_chip_index).insert(cpu_idx);
-                    ++gpu_idx;
+                // This CPU set is local to the current iterated GPU and each
+                // of the GPU's subdevices.
+                m_cpu_affinity_ideal.push_back(cpu_set);
+                for (int gpu_subdevice = 0; gpu_subdevice < num_chip_per_gpu; ++gpu_subdevice) {
+                    m_cpu_affinity_ideal_chip.push_back(cpu_set);
                 }
             }
         }
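
Putting the pieces together, a standalone sketch of the lookup the constructor now performs per GPU (not part of this commit; the PCI address is a placeholder, where a real caller would get it from pci_dbdf_address()):

    #include <iostream>
    #include <string>
    #include "geopm/Helper.hpp"

    int main()
    {
        std::string pci_address = "0000:3a:00.0";  // hypothetical GPU address
        std::string cpu_mask_path = "/sys/bus/pci/devices/" + pci_address + "/local_cpus";
        // read_file() and linux_cpumask_buf_to_int_set() are the helpers
        // this commit exposes through geopm/Helper.hpp
        auto cpu_set = geopm::linux_cpumask_buf_to_int_set(geopm::read_file(cpu_mask_path));
        std::cout << cpu_mask_path << " lists " << cpu_set.size() << " local CPUs\n";
        return 0;
    }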
4 changes: 3 additions & 1 deletion libgeopmd/src/LevelZeroGPUTopo.hpp
@@ -9,6 +9,7 @@
 #include <cstdint>
 #include <vector>
 #include <set>
+#include <string>
 
 #include "GPUTopo.hpp"
 
@@ -21,7 +22,7 @@ namespace geopm
     public:
         LevelZeroGPUTopo();
         LevelZeroGPUTopo(const LevelZeroDevicePool &device_pool,
-                         const int num_cpu);
+                         const std::string &pci_devices_path);
         virtual ~LevelZeroGPUTopo() = default;
         int num_gpu(void) const override;
         int num_gpu(int domain) const override;
@@ -31,6 +32,7 @@ namespace geopm
         const LevelZeroDevicePool &m_levelzero_device_pool;
         std::vector<std::set<int> > m_cpu_affinity_ideal;
         std::vector<std::set<int> > m_cpu_affinity_ideal_chip;
+        std::string m_pci_devices_path;
     };
 }
 #endif
1 change: 1 addition & 0 deletions libgeopmd/src/LevelZeroImp.hpp
@@ -126,6 +126,7 @@ namespace geopm
double ras_display_errcount_uncorrectable(unsigned int l0_device_idx,
int l0_domain,
int l0_domain_idx) const override;
std::string pci_dbdf_address(unsigned int l0_device_idx) const override;

private:
enum m_error_type {