diff --git a/libgeopmd/include/geopm/Helper.hpp b/libgeopmd/include/geopm/Helper.hpp index 589ab5a61a..95ba0b7c80 100644 --- a/libgeopmd/include/geopm/Helper.hpp +++ b/libgeopmd/include/geopm/Helper.hpp @@ -219,6 +219,10 @@ namespace geopm void GEOPM_PUBLIC enable_fixed_counters(PlatformIO &pio); + /// Convert a Linux cpumask string (e.g., from a sysfs local_cpus file) to + /// an integer set of CPU indices that are present in the mask. + std::set GEOPM_PUBLIC + linux_cpumask_buf_to_int_set(const std::string &cpumask_buf); } #endif diff --git a/libgeopmd/src/DrmGpuTopo.cpp b/libgeopmd/src/DrmGpuTopo.cpp index f3300330aa..e986269ca0 100644 --- a/libgeopmd/src/DrmGpuTopo.cpp +++ b/libgeopmd/src/DrmGpuTopo.cpp @@ -23,36 +23,9 @@ #include "geopm/Helper.hpp" #include "geopm_topo.h" -static const int MAX_CPUS_PER_CPUMASK_SEGMENT = 32; static const std::regex GPU_CARD_REGEX("^card(\\d+)$"); static const std::regex GPU_TILE_REGEX("^gt(\\d+)$"); -static std::set linux_cpumask_buf_to_int_set(const std::string &cpumask_buf) -{ - // The expected bitmask format is "HEX,HEX,...,HEX", where commas separate - // 32-bit segments. Higher-ordered bits indicate higher CPU indices (i.e. - // LSB is CPU 0). 
- std::set mapped_cpus; - int cpu_offset = 0; - auto hex_segments = geopm::string_split(cpumask_buf, ","); - for (auto it = hex_segments.rbegin(); it != hex_segments.rend(); ++it) { - auto bitmask_segment = std::stoull(*it, nullptr, 16); - if (bitmask_segment >> MAX_CPUS_PER_CPUMASK_SEGMENT) { - throw geopm::Exception("linux_cpumask_buf_to_int_set: malformed cpumask: " + cpumask_buf, - GEOPM_ERROR_RUNTIME, __FILE__, __LINE__); - } - int next_segment_cpu_offset = cpu_offset + MAX_CPUS_PER_CPUMASK_SEGMENT; - while (cpu_offset < next_segment_cpu_offset) { - if (bitmask_segment & 1) { - mapped_cpus.insert(cpu_offset); - } - bitmask_segment >>= 1; - cpu_offset += 1; - } - } - return mapped_cpus; -} - // Return the name of the driver that provides the given /sys/class/drm/card*/ device static std::string drm_driver_name_from_card_path(const std::string &card_path) { diff --git a/libgeopmd/src/Helper.cpp b/libgeopmd/src/Helper.cpp index ee04c59a08..685ab09d01 100644 --- a/libgeopmd/src/Helper.cpp +++ b/libgeopmd/src/Helper.cpp @@ -37,6 +37,7 @@ namespace geopm { static const size_t NUMERIC_STRING_MAX = 255; + static const int MAX_CPUS_PER_CPUMASK_SEGMENT = 32; std::string read_file(const std::string &path) { @@ -454,4 +455,29 @@ namespace geopm pio.write_control("MSR::PERF_GLOBAL_OVF_CTRL:CLEAR_OVF_FIXED_CTR2", GEOPM_DOMAIN_BOARD, 0, 0); } + std::set linux_cpumask_buf_to_int_set(const std::string &cpumask_buf) + { + // The expected bitmask format is "HEX,HEX,...,HEX", where commas separate + // 32-bit segments. Higher-ordered bits indicate higher CPU indices (i.e. + // LSB is CPU 0). 
+ std::set mapped_cpus; + int cpu_offset = 0; + auto hex_segments = geopm::string_split(cpumask_buf, ","); + for (auto it = hex_segments.rbegin(); it != hex_segments.rend(); ++it) { + auto bitmask_segment = std::stoull(*it, nullptr, 16); + if (bitmask_segment >> MAX_CPUS_PER_CPUMASK_SEGMENT) { + throw geopm::Exception("linux_cpumask_buf_to_int_set: malformed cpumask: " + cpumask_buf, + GEOPM_ERROR_RUNTIME, __FILE__, __LINE__); + } + int next_segment_cpu_offset = cpu_offset + MAX_CPUS_PER_CPUMASK_SEGMENT; + while (cpu_offset < next_segment_cpu_offset) { + if (bitmask_segment & 1) { + mapped_cpus.insert(cpu_offset); + } + bitmask_segment >>= 1; + cpu_offset += 1; + } + } + return mapped_cpus; + } } diff --git a/libgeopmd/src/LevelZero.cpp b/libgeopmd/src/LevelZero.cpp index fd0a81fca9..54eda954d3 100644 --- a/libgeopmd/src/LevelZero.cpp +++ b/libgeopmd/src/LevelZero.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,10 @@ #include "LevelZeroImp.hpp" +// DBDF buffer: domain:bus:device.function plus NUL (e.g., 0000:00:00.0\0) +#define DBDF_BUFFER_SIZE (4 + 1 + 2 + 1 + 2 + 1 + 1 + 1) +#define DBDF_BUFFER_FORMAT ("%04x:%02x:%02x.%01x") + namespace geopm { static double convert_nan(double result) @@ -1117,4 +1122,17 @@ namespace geopm error, __FILE__, line); } } + + std::string LevelZeroImp::pci_dbdf_address(unsigned int l0_device_idx) const + { + ze_pci_ext_properties_t pci_properties; + char dbdf_buffer[DBDF_BUFFER_SIZE]; + check_ze_result(zeDevicePciGetPropertiesExt(m_devices.at(l0_device_idx).device_handle, &pci_properties), + GEOPM_ERROR_RUNTIME, "LevelZero::" + std::string(__func__) + + ": failed to get PCI device properties.", __LINE__); + snprintf(dbdf_buffer, DBDF_BUFFER_SIZE, DBDF_BUFFER_FORMAT, + pci_properties.address.domain, pci_properties.address.bus, + pci_properties.address.device, pci_properties.address.function); + return dbdf_buffer; + } } diff --git a/libgeopmd/src/LevelZero.hpp b/libgeopmd/src/LevelZero.hpp index 
e89dde485e..4135d7c5e5 100644 --- a/libgeopmd/src/LevelZero.hpp +++ b/libgeopmd/src/LevelZero.hpp @@ -352,6 +352,9 @@ namespace geopm /// @return Display Error Count virtual double ras_display_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const = 0; + + // Return the formatted domain:bus:device.function PCI address of the given LevelZero device + virtual std::string pci_dbdf_address(unsigned int l0_device_idx) const = 0; }; const LevelZero &levelzero(); diff --git a/libgeopmd/src/LevelZeroDevicePool.cpp b/libgeopmd/src/LevelZeroDevicePool.cpp index 2f68837fc8..9965d3db3c 100644 --- a/libgeopmd/src/LevelZeroDevicePool.cpp +++ b/libgeopmd/src/LevelZeroDevicePool.cpp @@ -509,7 +509,6 @@ namespace geopm l0_domain, dev_subdev_idx_pair.second, setting); } - // RAS Correctable Counters double LevelZeroDevicePoolImp::ras_reset_count_correctable(int domain, unsigned int domain_idx, int l0_domain) const @@ -765,5 +764,23 @@ namespace geopm dev_subdev_idx_pair.second); } - + std::string LevelZeroDevicePoolImp::pci_dbdf_address(int geopm_domain, unsigned int geopm_domain_idx) const + { + check_idx_range(geopm_domain, geopm_domain_idx); + if (geopm_domain == GEOPM_DOMAIN_GPU) { + // Get the address of the requested device + return m_levelzero.pci_dbdf_address(geopm_domain_idx); + } + else if (geopm_domain == GEOPM_DOMAIN_GPU_CHIP) { + // Get the address of the device containing the requested subdevice + auto dev_subdev_idx_pair = subdevice_device_conversion(geopm_domain_idx); + return m_levelzero.pci_dbdf_address(dev_subdev_idx_pair.first); + } + else { + throw Exception("LevelZeroDevicePool::" + std::string(__func__) + + ": domain " + std::to_string(geopm_domain) + + " is not supported", + GEOPM_ERROR_INVALID, __FILE__, __LINE__); + } + } } diff --git a/libgeopmd/src/LevelZeroDevicePool.hpp b/libgeopmd/src/LevelZeroDevicePool.hpp index bde168d939..4d1d9725e1 100644 --- a/libgeopmd/src/LevelZeroDevicePool.hpp +++ 
b/libgeopmd/src/LevelZeroDevicePool.hpp @@ -312,6 +312,10 @@ namespace geopm /// @return Display Error Count virtual double ras_display_errcount_uncorrectable(int domain, unsigned int domain_idx, int l0_domain) const = 0; + + // Return the formatted domain:bus:device.function PCI address of the given + // GEOPM GPU or GPU chip. + virtual std::string pci_dbdf_address(int geopm_domain, unsigned int geopm_domain_idx) const = 0; private: }; diff --git a/libgeopmd/src/LevelZeroDevicePoolImp.hpp b/libgeopmd/src/LevelZeroDevicePoolImp.hpp index 814294d18a..d53f6a6899 100644 --- a/libgeopmd/src/LevelZeroDevicePoolImp.hpp +++ b/libgeopmd/src/LevelZeroDevicePoolImp.hpp @@ -96,6 +96,7 @@ namespace geopm int l0_domain) const override; double ras_display_errcount_uncorrectable(int domain, unsigned int domain_idx, int l0_domain) const override; + std::string pci_dbdf_address(int geopm_domain, unsigned int geopm_domain_idx) const override; private: const LevelZero &m_levelzero; diff --git a/libgeopmd/src/LevelZeroGPUTopo.cpp b/libgeopmd/src/LevelZeroGPUTopo.cpp index 15ebe6e8b4..73719a0e91 100644 --- a/libgeopmd/src/LevelZeroGPUTopo.cpp +++ b/libgeopmd/src/LevelZeroGPUTopo.cpp @@ -8,19 +8,23 @@ #include #include "geopm/Exception.hpp" +#include "geopm/Helper.hpp" #include "LevelZeroDevicePool.hpp" #include "LevelZeroGPUTopo.hpp" namespace geopm { + static const std::string PCI_DEVICES_PATH = "/sys/bus/pci/devices"; + LevelZeroGPUTopo::LevelZeroGPUTopo() - : LevelZeroGPUTopo(levelzero_device_pool(), geopm_sched_num_cpu()) + : LevelZeroGPUTopo(levelzero_device_pool(), PCI_DEVICES_PATH) { } LevelZeroGPUTopo::LevelZeroGPUTopo(const LevelZeroDevicePool &device_pool, - const int num_cpu) + const std::string &pci_devices_path) : m_levelzero_device_pool(device_pool) + , m_pci_devices_path(pci_devices_path) { if (getenv("ZE_AFFINITY_MASK") != nullptr) { throw Exception("LevelZeroGPUTopo: Refusing to create a topology cache file while ZE_AFFINITY_MASK environment variable is set", @@ -35,37 
+39,20 @@ namespace geopm #endif } else { - m_cpu_affinity_ideal.resize(num_gpu); - int num_cpu_per_gpu = num_cpu / num_gpu; - - m_cpu_affinity_ideal_chip.resize(num_gpu_chip); + m_cpu_affinity_ideal_chip.reserve(num_gpu_chip); int num_chip_per_gpu = num_gpu_chip / num_gpu; - // TODO: Add ideal CPU to GPU affinitization that isn't a simple split if needed. - // This may come from a call to oneAPI, LevelZero, etc - for (int gpu_idx = 0; gpu_idx < num_gpu; ++gpu_idx) { - size_t gpu_chip_index = gpu_idx * static_cast(num_chip_per_gpu); - int end_cpu_idx = (gpu_idx + 1) * num_cpu_per_gpu; - for (int cpu_idx = gpu_idx * num_cpu_per_gpu, chip_idx = 0; - cpu_idx < end_cpu_idx; - ++cpu_idx) { - m_cpu_affinity_ideal.at(gpu_idx).insert(cpu_idx); + for (int gpu_idx = 0; gpu_idx < num_gpu; ++gpu_idx) { + std::string pci_address = m_levelzero_device_pool.pci_dbdf_address(GEOPM_DOMAIN_GPU, gpu_idx); + std::string cpu_mask_path = m_pci_devices_path + "/" + pci_address + "/local_cpus"; + auto cpu_mask_buf = geopm::read_file(cpu_mask_path); + auto cpu_set = linux_cpumask_buf_to_int_set(cpu_mask_buf); - // CHIP to CPU association is currently only used to associate CHIPS to - // GPUS. This logic just distributes the CPUs associated with - // an GPU to its CHIPS in a round robin fashion. - m_cpu_affinity_ideal_chip.at(gpu_chip_index + - (chip_idx % num_chip_per_gpu)).insert(cpu_idx); - ++chip_idx; - } - } - if ((num_cpu % num_gpu) != 0) { - for (int cpu_idx = num_cpu_per_gpu * num_gpu, gpu_idx = 0; - cpu_idx < num_cpu; ++cpu_idx) { - m_cpu_affinity_ideal.at(gpu_idx % num_gpu).insert(cpu_idx); - size_t gpu_chip_index = gpu_idx * static_cast(num_chip_per_gpu); - m_cpu_affinity_ideal_chip.at(gpu_chip_index).insert(cpu_idx); - ++gpu_idx; + // This CPU set is local to the current iterated GPU and each + // of the GPU's subdevices. 
+ m_cpu_affinity_ideal.push_back(cpu_set); + for (int gpu_subdevice = 0; gpu_subdevice < num_chip_per_gpu; ++gpu_subdevice) { + m_cpu_affinity_ideal_chip.push_back(cpu_set); } } } diff --git a/libgeopmd/src/LevelZeroGPUTopo.hpp b/libgeopmd/src/LevelZeroGPUTopo.hpp index 3b83558b6b..7c2019b645 100644 --- a/libgeopmd/src/LevelZeroGPUTopo.hpp +++ b/libgeopmd/src/LevelZeroGPUTopo.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include "GPUTopo.hpp" @@ -21,7 +22,7 @@ namespace geopm public: LevelZeroGPUTopo(); LevelZeroGPUTopo(const LevelZeroDevicePool &device_pool, - const int num_cpu); + const std::string &pci_devices_path); virtual ~LevelZeroGPUTopo() = default; int num_gpu(void) const override; int num_gpu(int domain) const override; @@ -31,6 +32,7 @@ namespace geopm const LevelZeroDevicePool &m_levelzero_device_pool; std::vector > m_cpu_affinity_ideal; std::vector > m_cpu_affinity_ideal_chip; + std::string m_pci_devices_path; }; } #endif diff --git a/libgeopmd/src/LevelZeroImp.hpp b/libgeopmd/src/LevelZeroImp.hpp index 9c1274eec8..50a8df2f29 100644 --- a/libgeopmd/src/LevelZeroImp.hpp +++ b/libgeopmd/src/LevelZeroImp.hpp @@ -126,6 +126,7 @@ namespace geopm double ras_display_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override; + std::string pci_dbdf_address(unsigned int l0_device_idx) const override; private: enum m_error_type { diff --git a/libgeopmd/test/LevelZeroGPUTopoTest.cpp b/libgeopmd/test/LevelZeroGPUTopoTest.cpp index 04c4294fd6..8a15632960 100644 --- a/libgeopmd/test/LevelZeroGPUTopoTest.cpp +++ b/libgeopmd/test/LevelZeroGPUTopoTest.cpp @@ -24,35 +24,69 @@ using geopm::LevelZeroDevicePoolImp; using geopm::Exception; using testing::Return; + class LevelZeroGPUTopoTest : public :: testing :: Test { protected: void SetUp(); void TearDown(); + void add_device(const std::string& device_address, const std::string& local_cpus); std::shared_ptr m_levelzero; + std::string m_test_dir; + std::string 
m_test_devices_dir; + std::vector m_test_device_addresses; }; void LevelZeroGPUTopoTest::SetUp() { + char test_dir_template[] = "/tmp/LevelZeroGPUTopoTest_XXXXXX"; + if (mkdtemp(test_dir_template) == nullptr) { + throw std::runtime_error(std::string("Could not create a temporary directory at ") + + test_dir_template); + } + m_test_dir = test_dir_template; + m_test_devices_dir = m_test_dir + "/sys_bus_devices"; + if (mkdir(m_test_devices_dir.c_str(), 0755) == -1) { + rmdir(m_test_dir.c_str()); + throw std::runtime_error("Could not create directory at " + m_test_devices_dir); + } m_levelzero = std::make_shared(); } void LevelZeroGPUTopoTest::TearDown() { + for (const auto& device_address : m_test_device_addresses) { + std::string device_path = m_test_devices_dir + "/" + device_address; + std::string cpumask_path = device_path + "/local_cpus"; + unlink(cpumask_path.c_str()); + rmdir(device_path.c_str()); + } + rmdir(m_test_devices_dir.c_str()); + rmdir(m_test_dir.c_str()); +} + +void LevelZeroGPUTopoTest::add_device(const std::string& device_address, const std::string& local_cpus) +{ + std::string device_path = m_test_devices_dir + "/" + device_address; + if (mkdir(device_path.c_str(), 0755) == -1) { + throw std::runtime_error("Could not create directory at " + device_path); + } + geopm::write_file(device_path + "/local_cpus", local_cpus); + m_test_device_addresses.push_back(device_address); + ON_CALL(*m_levelzero, pci_dbdf_address(m_test_device_addresses.size() - 1)).WillByDefault(Return(device_address)); } //Test case: Mock num_gpu = 0 so we hit the appropriate warning and throw on affinitization requests. 
TEST_F(LevelZeroGPUTopoTest, no_gpu_config) { const int num_gpu = 0; - const int num_cpu = 40; LevelZeroDevicePoolImp m_device_pool(*m_levelzero); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillOnce(Return(num_gpu)); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillOnce(Return(num_gpu)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillRepeatedly(Return(num_gpu)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillRepeatedly(Return(num_gpu)); - LevelZeroGPUTopo topo(m_device_pool, num_cpu); + LevelZeroGPUTopo topo(m_device_pool, m_test_devices_dir); EXPECT_EQ(num_gpu, topo.num_gpu()); EXPECT_EQ(num_gpu, topo.num_gpu(GEOPM_DOMAIN_GPU_CHIP)); @@ -63,13 +97,16 @@ TEST_F(LevelZeroGPUTopoTest, four_forty_config) { const int num_gpu = 4; int num_gpu_subdevice = 4; - const int num_cpu = 40; + add_device("gpu0", "000003ff"); + add_device("gpu1", "000ffc00"); + add_device("gpu2", "3ff00000"); + add_device("gpu3", "ff,c0000000"); LevelZeroDevicePoolImp m_device_pool(*m_levelzero); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillOnce(Return(num_gpu)); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillOnce(Return(num_gpu_subdevice)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillRepeatedly(Return(num_gpu)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillRepeatedly(Return(num_gpu_subdevice)); - LevelZeroGPUTopo topo(m_device_pool, num_cpu); + LevelZeroGPUTopo topo(m_device_pool, m_test_devices_dir); EXPECT_EQ(num_gpu, topo.num_gpu()); EXPECT_EQ(num_gpu_subdevice, topo.num_gpu(GEOPM_DOMAIN_GPU_CHIP)); @@ -85,22 +122,22 @@ TEST_F(LevelZeroGPUTopoTest, four_forty_config) } num_gpu_subdevice = 8; - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillOnce(Return(num_gpu)); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillOnce(Return(num_gpu_subdevice)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillRepeatedly(Return(num_gpu)); + 
EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillRepeatedly(Return(num_gpu_subdevice)); - LevelZeroGPUTopo topo_sub(m_device_pool, num_cpu); + LevelZeroGPUTopo topo_sub(m_device_pool, m_test_devices_dir); EXPECT_EQ(num_gpu, topo_sub.num_gpu()); EXPECT_EQ(num_gpu_subdevice, topo_sub.num_gpu(GEOPM_DOMAIN_GPU_CHIP)); std::vector> cpus_allowed_set_subdevice = { - {0,2,4,6,8}, - {1,3,5,7,9}, - {10,12,14,16,18}, - {11,13,15,17,19}, - {20,22,24,26,28}, - {21,23,25,27,29}, - {30,32,34,36,38}, - {31,33,35,37,39} + {0,1,2,3,4,5,6,7,8,9}, + {0,1,2,3,4,5,6,7,8,9}, + {10,11,12,13,14,15,16,17,18,19}, + {10,11,12,13,14,15,16,17,18,19}, + {20,21,22,23,24,25,26,27,28,29}, + {20,21,22,23,24,25,26,27,28,29}, + {30,31,32,33,34,35,36,37,38,39}, + {30,31,32,33,34,35,36,37,38,39}, }; for (int gpu_idx = 0; gpu_idx < num_gpu; ++gpu_idx) { @@ -116,13 +153,20 @@ TEST_F(LevelZeroGPUTopoTest, eight_fiftysix_affinitization_config) { const int num_gpu = 8; const int num_gpu_subdevice = 8; - const int num_cpu = 56; + add_device("gpu0", "0000007f"); + add_device("gpu1", "00003f80"); + add_device("gpu2", "001fc000"); + add_device("gpu3", "0fe00000"); + add_device("gpu4", "7,f0000000"); + add_device("gpu5", "3f8,00000000"); + add_device("gpu6", "1fc00,00000000"); + add_device("gpu7", "fe0000,00000000"); LevelZeroDevicePoolImp m_device_pool(*m_levelzero); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillOnce(Return(num_gpu)); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillOnce(Return(num_gpu_subdevice)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillRepeatedly(Return(num_gpu)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillRepeatedly(Return(num_gpu_subdevice)); - LevelZeroGPUTopo topo(m_device_pool, num_cpu); + LevelZeroGPUTopo topo(m_device_pool, m_test_devices_dir); EXPECT_EQ(num_gpu, topo.num_gpu()); EXPECT_EQ(num_gpu_subdevice, topo.num_gpu(GEOPM_DOMAIN_GPU_CHIP)); @@ -147,13 +191,15 @@ TEST_F(LevelZeroGPUTopoTest, 
uneven_affinitization_config) { const int num_gpu = 3; const int num_gpu_subdevice = 6; - const int num_cpu =20; + add_device("gpu0", "0004003f"); + add_device("gpu1", "00080fc0"); + add_device("gpu2", "0003f000"); LevelZeroDevicePoolImp m_device_pool(*m_levelzero); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillOnce(Return(num_gpu)); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillOnce(Return(num_gpu_subdevice)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillRepeatedly(Return(num_gpu)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillRepeatedly(Return(num_gpu_subdevice)); - LevelZeroGPUTopo topo(m_device_pool, num_cpu); + LevelZeroGPUTopo topo(m_device_pool, m_test_devices_dir); EXPECT_EQ(num_gpu, topo.num_gpu()); std::set cpus_allowed_set[num_gpu]; @@ -166,14 +212,14 @@ TEST_F(LevelZeroGPUTopoTest, uneven_affinitization_config) } std::set cpus_allowed_set_subdevice[num_gpu_subdevice]; - cpus_allowed_set_subdevice[0] = {0, 2, 4,18}; - cpus_allowed_set_subdevice[1] = {1, 3, 5}; + cpus_allowed_set_subdevice[0] = {0 ,1 ,2 ,3 ,4 ,5 ,18}; + cpus_allowed_set_subdevice[1] = {0 ,1 ,2 ,3 ,4 ,5 ,18}; - cpus_allowed_set_subdevice[2] = {6, 8,10,19}; - cpus_allowed_set_subdevice[3] = {7, 9,11}; + cpus_allowed_set_subdevice[2] = {6 ,7 ,8 ,9 ,10,11,19}; + cpus_allowed_set_subdevice[3] = {6 ,7 ,8 ,9 ,10,11,19}; - cpus_allowed_set_subdevice[4] = {12,14,16}; - cpus_allowed_set_subdevice[5] = {13,15,17}; + cpus_allowed_set_subdevice[4] = {12,13,14,15,16,17}; + cpus_allowed_set_subdevice[5] = {12,13,14,15,16,17}; for (int sub_idx = 0; sub_idx < num_gpu; ++sub_idx) { ASSERT_THAT(topo.cpu_affinity_ideal(GEOPM_DOMAIN_GPU_CHIP, sub_idx), cpus_allowed_set_subdevice[sub_idx]); @@ -181,35 +227,17 @@ TEST_F(LevelZeroGPUTopoTest, uneven_affinitization_config) } //Test case: High Core count, theoretical system to test large CPU SETS. 
-// This represents a system with 64 cores and 8 GPUs TEST_F(LevelZeroGPUTopoTest, high_cpu_count_config) { - const int num_gpu = 8; - const int num_gpu_subdevice = 32; - const int num_cpu = 128; LevelZeroDevicePoolImp m_device_pool(*m_levelzero); + add_device("gpu0", "80000000,00000000,00000000,00000000"); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillOnce(Return(num_gpu)); - EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillOnce(Return(num_gpu_subdevice)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU)).WillRepeatedly(Return(1)); + EXPECT_CALL(*m_levelzero, num_gpu(GEOPM_DOMAIN_GPU_CHIP)).WillRepeatedly(Return(1)); - LevelZeroGPUTopo topo(m_device_pool, num_cpu); + LevelZeroGPUTopo topo(m_device_pool, m_test_devices_dir); - EXPECT_EQ(num_gpu, topo.num_gpu()); - std::set cpus_allowed_set[num_gpu]; - - for (int gpu_idx = 0; gpu_idx < num_gpu; ++gpu_idx) { - for (int cpu_idx = 0; cpu_idx < num_cpu/num_gpu; ++cpu_idx) { - cpus_allowed_set[gpu_idx].insert(cpu_idx+(gpu_idx*16)); - } - ASSERT_THAT(topo.cpu_affinity_ideal(gpu_idx), cpus_allowed_set[gpu_idx]); - } - - std::set cpus_allowed_set_subdevice[num_gpu_subdevice]; - for (int sub_idx = 0; sub_idx < num_gpu_subdevice; ++sub_idx) { - for (int cpu_idx = 0; cpu_idx < num_cpu/num_gpu_subdevice; ++cpu_idx) { - int gpu_idx = sub_idx/(num_gpu_subdevice/num_gpu); - cpus_allowed_set_subdevice[sub_idx].insert((cpu_idx)*4 + sub_idx + (gpu_idx)*12); - } - ASSERT_THAT(topo.cpu_affinity_ideal(GEOPM_DOMAIN_GPU_CHIP, sub_idx), cpus_allowed_set_subdevice[sub_idx]); - } + EXPECT_EQ(1, topo.num_gpu()); + ASSERT_EQ(topo.cpu_affinity_ideal(0), std::set{127}); + ASSERT_EQ(topo.cpu_affinity_ideal(GEOPM_DOMAIN_GPU_CHIP, 0), std::set{127}); } diff --git a/libgeopmd/test/MockLevelZero.hpp b/libgeopmd/test/MockLevelZero.hpp index c583417081..70188f0ea2 100644 --- a/libgeopmd/test/MockLevelZero.hpp +++ b/libgeopmd/test/MockLevelZero.hpp @@ -105,6 +105,9 @@ class MockLevelZero : public geopm::LevelZero (unsigned 
int, int, int, double, double), (const, override)); MOCK_METHOD(void, performance_factor_control, (unsigned int, int, int, double), (const, override)); + + MOCK_METHOD(std::string, pci_dbdf_address, + (unsigned int l0_device_idx), (const, override)); }; #endif diff --git a/libgeopmd/test/MockLevelZeroDevicePool.hpp b/libgeopmd/test/MockLevelZeroDevicePool.hpp index 6840bb6859..104d709058 100644 --- a/libgeopmd/test/MockLevelZeroDevicePool.hpp +++ b/libgeopmd/test/MockLevelZeroDevicePool.hpp @@ -91,6 +91,8 @@ class MockLevelZeroDevicePool : public geopm::LevelZeroDevicePool (int, unsigned int, int, double, double),(const, override)); MOCK_METHOD(void, performance_factor_control, (int, unsigned int, int, double),(const, override)); + MOCK_METHOD(std::string, pci_dbdf_address, + (int, unsigned int), (const, override)); }; #endif