Share allocator between CUDA EP & TRT EP. (microsoft#6332)
* Share allocator between CUDA EP & TRT EP.
Limitations:
1. Does not cover the per-thread allocator created by the CUDA EP; we still need to figure out how to remove it.
2. Needs more identifiers before the CPU allocator can be shared across all EPs.
HectorSVC authored Jan 27, 2021
1 parent 9835b46 commit b5d1a49
Showing 22 changed files with 281 additions and 111 deletions.
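
Reviewer summary: the commit moves allocator creation out of the CUDA EP constructor into a new virtual IExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager>), so allocators live in a shared AllocatorManager instead of per-EP maps. The session-level wiring that hands the manager to each EP is not part of this diff, so the driver sketch below is an assumption; only the AllocatorManager and RegisterAllocator interfaces come from the changes shown.

// Hypothetical driver (not in this diff): one AllocatorManager shared by all
// EPs; each EP either reuses an existing allocator or creates and publishes one.
#include <memory>
#include <vector>
#include "core/framework/allocatormgr.h"
#include "core/framework/execution_provider.h"

void RegisterEpAllocators(std::vector<std::unique_ptr<onnxruntime::IExecutionProvider>>& eps) {
  auto allocator_manager = std::make_shared<onnxruntime::AllocatorManager>();
  for (auto& ep : eps) {
    // If the CUDA EP runs first it creates and inserts its CUDA, pinned, and
    // CPU allocators; when the TRT EP runs, GetAllocator finds them and the
    // TRT EP reuses the same instances.
    ep->RegisterAllocator(allocator_manager);
  }
}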
3 changes: 0 additions & 3 deletions include/onnxruntime/core/framework/allocator.h
@@ -22,9 +22,6 @@ namespace onnxruntime {
constexpr const char* CPU = "Cpu";
constexpr const char* CUDA = "Cuda";
constexpr const char* CUDA_PINNED = "CudaPinned";
// TODO: Unify the allocator for CUDA and Tensorrt
constexpr const char* TRT = "Tensorrt";
constexpr const char* TRT_PINNED = "TensorrtPinned";
constexpr const char* MIGRAPHX = "MIGraphX";
constexpr const char* MIGRAPHX_PINNED = "MIGraphXPinned";

15 changes: 12 additions & 3 deletions include/onnxruntime/core/framework/execution_provider.h
@@ -4,7 +4,6 @@
#pragma once

#ifndef PROVIDER_BRIDGE_PROVIDER
#include <map>
#include <unordered_map>
#include <unordered_set>

@@ -25,13 +24,14 @@ class KernelRegistryManager;

#include "core/framework/provider_options.h"
#include "core/framework/func_api.h"
#include "core/framework/allocatormgr.h"

namespace onnxruntime {

/**
Logical device representation.
*/
using AllocatorMap = std::map<int, AllocatorPtr>;
using AllocatorMap = std::unordered_map<int, AllocatorPtr>;
using MemoryInfoSet = std::set<OrtMemoryInfo>;

// if we export the fused function to a dll, the function will still be in the same binary as onnxruntime
@@ -167,6 +167,8 @@ class IExecutionProvider {

void InsertAllocator(AllocatorPtr allocator);
void ReplaceAllocator(AllocatorPtr allocator);
// TODO: temporary solution, need to unify the interface in EP and AllocatorManager
void TryInsertAllocator(AllocatorPtr allocator);

// creation of a fused node is not supported in a minimal build, so any EP enabled in that scenario must support
// compilation via GraphViewer instances.
@@ -248,10 +250,17 @@ class IExecutionProvider {
@remarks e.g. the TensorRT Execution Provider is used in multiple sessions and the underlying infrastructure caches
compiled kernels, so the name must be unique and deterministic across models and sessions.
NOTE: Ideally this would be a protected method, but to work across the EP bridge it has to be public and
virtual, and ModelMetadefIdGenerator must be defined in the header as well.
*/
virtual int GenerateMetaDefId(const onnxruntime::GraphViewer& graph_viewer, uint64_t& model_hash) const;

/**
Register allocators used by the EP.
TODO: Used for CUDA & TRT only for now; one more PR will apply this to all EPs.
EPs will hold a shared pointer to allocator_manager, which will be the single place where allocators are kept.
*/
virtual void RegisterAllocator(std::shared_ptr<AllocatorManager> allocator_manager);

private:
const std::string type_;
AllocatorMap allocators_;
8 changes: 0 additions & 8 deletions onnxruntime/core/framework/allocator.cc
@@ -69,14 +69,6 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
*out = new OrtMemoryInfo(
onnxruntime::CUDA_PINNED, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast<OrtDevice::DeviceId>(id1)),
id1, mem_type1);
} else if (strcmp(name1, onnxruntime::TRT) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::TRT, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1,
mem_type1);
} else if (strcmp(name1, onnxruntime::TRT_PINNED) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::TRT_PINNED, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast<OrtDevice::DeviceId>(id1)),
id1, mem_type1);
} else {
return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported.");
}
37 changes: 37 additions & 0 deletions onnxruntime/core/framework/allocatormgr.cc
@@ -13,6 +13,13 @@
namespace onnxruntime {
using namespace common;

namespace {
// Assumes max(OrtMemType) <= 1 and min(OrtMemType) == -2
inline int MakeKey(int id, OrtMemType mem_type) {
return id << 2 | (mem_type + 2);
}
} // namespace

AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) {
auto device_allocator = std::unique_ptr<IAllocator>(info.device_alloc_factory(info.device_id));

@@ -54,4 +61,34 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) {
return AllocatorPtr(std::move(device_allocator));
}


// Update the allocator in the manager if already present; ignore if not.
void AllocatorManager::ReplaceAllocator(AllocatorPtr allocator) {
const auto& info = allocator->Info();
auto ite = mem_info_set_.find(info);
if (ite != mem_info_set_.end()) {
const int key = MakeKey(info.id, info.mem_type);
allocators_[key] = allocator;
}
}

void AllocatorManager::InsertAllocator(AllocatorPtr allocator) {
const OrtMemoryInfo& info = allocator->Info();
auto ite = mem_info_set_.find(info);
if (ite != mem_info_set_.end()) {
ORT_THROW("duplicated allocator");
}
const int key = MakeKey(info.id, info.mem_type);
allocators_.insert({key, allocator});
mem_info_set_.insert(ite, info);
allocator_list_.push_back(allocator);
}

AllocatorPtr AllocatorManager::GetAllocator(int id, OrtMemType mem_type) const {
auto iter = allocators_.find(MakeKey(id, mem_type));
if (iter != allocators_.end()) {
return iter->second;
}
return nullptr;
}
} // namespace onnxruntime
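
Note on the key encoding above: MakeKey packs (device id, OrtMemType) into a single int by shifting the id left two bits and biasing the mem type into the low two bits, which is collision-free as long as the mem type stays in [-2, 1] as the comment assumes. A standalone sketch; the OrtMemType constants quoted in the comments are taken from onnxruntime_c_api.h and should be treated as an assumption of this note.

#include <cassert>

// Mirrors the anonymous-namespace helper in allocatormgr.cc above.
// Assumed OrtMemType values: OrtMemTypeCPUInput = -2, OrtMemTypeCPUOutput = -1,
// OrtMemTypeCPU = -1, OrtMemTypeDefault = 0.
inline int MakeKey(int id, int mem_type) {
  return id << 2 | (mem_type + 2);
}

int main() {
  assert(MakeKey(0, 0) == 2);   // device 0, OrtMemTypeDefault
  assert(MakeKey(0, -2) == 0);  // device 0, OrtMemTypeCPUInput
  assert(MakeKey(1, 0) == 6);   // device 1, OrtMemTypeDefault
  // mem_type + 2 occupies the low two bits, so distinct (id, mem_type)
  // pairs map to distinct keys within the assumed range.
  return 0;
}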
27 changes: 27 additions & 0 deletions onnxruntime/core/framework/allocatormgr.h
@@ -6,11 +6,18 @@
#include "core/common/common.h"
#include "core/framework/allocator.h"
#include "core/session/onnxruntime_c_api.h"
#include <unordered_map>

namespace onnxruntime {

using AllocatorFactory = std::function<std::unique_ptr<IAllocator>(OrtDevice::DeviceId)>;

using AllocatorMap = std::unordered_map<int, AllocatorPtr>;
// TODO: update OrtMemoryInfo, use unordered_set instead
using MemoryInfoSet = std::set<OrtMemoryInfo>;

const int DEFAULT_CPU_ALLOCATOR_DEVICE_ID = 0;

struct AllocatorCreationInfo {
AllocatorCreationInfo(AllocatorFactory device_alloc_factory0,
OrtDevice::DeviceId device_id0 = 0,
@@ -33,4 +40,24 @@ struct AllocatorCreationInfo {
// Valid values can be found in onnxruntime_c_api.h.
AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info);

// TODO: Only used for TRT and CUDA EP currently, need to add more identifiers to use it across all EPs
class AllocatorManager {
//
public:
AllocatorManager() = default;
void InsertAllocator(AllocatorPtr allocator);
void ReplaceAllocator(AllocatorPtr allocator);
// Get an allocator with the specified device id and MemType. Returns nullptr if it doesn't exist.
AllocatorPtr GetAllocator(int id, OrtMemType mem_type) const;

private:
AllocatorMap allocators_;
// to ensure only allocators with unique OrtMemoryInfo are registered in the manager.
MemoryInfoSet mem_info_set_;

// convenience list of the allocators; contains the same instances as allocators_
std::vector<AllocatorPtr> allocator_list_;
};

} // namespace onnxruntime
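
A minimal sketch of the get-or-create idiom this class enables, which the cuda_execution_provider.cc diff below applies once per memory type. MyDeviceAllocator and its constructor are hypothetical stand-ins for a concrete IAllocator; everything else comes from the headers in this commit.

// Hypothetical EP-side usage of AllocatorManager: reuse an allocator another
// EP already registered for (device_id, OrtMemTypeDefault), otherwise create
// one and publish it for EPs registered later.
AllocatorPtr GetOrCreateDeviceAllocator(std::shared_ptr<AllocatorManager> allocator_manager,
                                        OrtDevice::DeviceId device_id) {
  AllocatorPtr alloc = allocator_manager->GetAllocator(device_id, OrtMemTypeDefault);
  if (nullptr == alloc) {
    AllocatorCreationInfo info(
        [](OrtDevice::DeviceId id) { return onnxruntime::make_unique<MyDeviceAllocator>(id); },
        device_id);
    alloc = CreateAllocator(info);
    allocator_manager->InsertAllocator(alloc);  // now visible to other EPs
  }
  return alloc;
}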
14 changes: 14 additions & 0 deletions onnxruntime/core/framework/execution_provider.cc
@@ -74,6 +74,20 @@ void IExecutionProvider::InsertAllocator(AllocatorPtr allocator) {
allocator_list_.push_back(allocator);
}

void IExecutionProvider::TryInsertAllocator(AllocatorPtr allocator) {
const OrtMemoryInfo& info = allocator->Info();
auto ite = mem_info_set_.find(info);
if (ite != mem_info_set_.end()) {
LOGS_DEFAULT(WARNING) << "duplicated allocator: " << info.ToString();
return;
}
InsertAllocator(allocator);
}

void IExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager> /*allocator_manager*/) {
}

#if !defined(ORT_MINIMAL_BUILD)
common::Status IExecutionProvider::Compile(const std::vector<onnxruntime::Node*>& /*fused_node*/,
std::vector<NodeComputeInfo>& /*node_compute_funcs*/) {
11 changes: 10 additions & 1 deletion onnxruntime/core/framework/provider_bridge_ort.cc
@@ -203,6 +203,7 @@ struct ProviderHostImpl : ProviderHost {
// IExecutionProvider
AllocatorPtr IExecutionProvider__GetAllocator(const IExecutionProvider* p, int id, OrtMemType mem_type) override { return p->IExecutionProvider::GetAllocator(id, mem_type); }
void IExecutionProvider__InsertAllocator(IExecutionProvider* p, AllocatorPtr allocator) override { return p->IExecutionProvider::InsertAllocator(allocator); }
void IExecutionProvider__TryInsertAllocator(IExecutionProvider* p, AllocatorPtr allocator) override { return p->IExecutionProvider::TryInsertAllocator(allocator); }
std::vector<std::unique_ptr<ComputeCapability>> IExecutionProvider__GetCapability(const IExecutionProvider* p, const onnxruntime::GraphViewer& graph_viewer,
const std::vector<const KernelRegistry*>& kernel_registries) override { return p->IExecutionProvider::GetCapability(graph_viewer, kernel_registries); }
common::Status IExecutionProvider__Compile(IExecutionProvider* p, const std::vector<onnxruntime::Node*>& fused_nodes, std::vector<NodeComputeInfo>& node_compute_funcs) override {
@@ -221,6 +222,10 @@ struct ProviderHostImpl : ProviderHost {
return p->IExecutionProvider::GenerateMetaDefId(graph_viewer, model_hash);
}

void IExecutionProvider__RegisterAllocator(IExecutionProvider* p, std::shared_ptr<AllocatorManager> allocator_manager) override {
return p->IExecutionProvider::RegisterAllocator(allocator_manager);
}

// Status
std::string Status__ToString(const Status* p) override { return p->ToString(); }

@@ -563,7 +568,11 @@ struct ProviderHostImpl : ProviderHost {

const TensorShape& Tensor__Shape(const Tensor* p) override { return p->Shape(); }
size_t Tensor__SizeInBytes(const Tensor* p) override { return p->SizeInBytes(); }
const OrtMemoryInfo& Tensor__Location(const Tensor* p) override { return p->Location(); }

// AllocatorManager
void AllocatorManager__InsertAllocator(AllocatorManager* p, AllocatorPtr allocator) override { p->InsertAllocator(allocator); }
AllocatorPtr AllocatorManager__GetAllocator(AllocatorManager* p, int id, OrtMemType mem_type) override { return p->GetAllocator(id, mem_type); }

} provider_host_;

90 changes: 57 additions & 33 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -107,42 +107,10 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in
size_t free = 0;
size_t total = 0;
CUDA_CALL_THROW(cudaMemGetInfo(&free, &total));

AllocatorCreationInfo default_memory_info(
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<CUDAAllocator>(device_id, CUDA);
},
info_.device_id,
true,
{info_.cuda_mem_limit,
static_cast<int>(info_.arena_extend_strategy),
-1, -1});

InsertAllocator(CreateAllocator(default_memory_info));

AllocatorCreationInfo pinned_memory_info(
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, CUDA_PINNED);
},
CPU_ALLOCATOR_DEVICE_ID);

InsertAllocator(CreateAllocator(pinned_memory_info));

// TODO: this is actually used for the cuda kernels which explicitly ask for inputs from CPU.
// This will be refactored/removed when allocator and execution provider are decoupled.
AllocatorCreationInfo cpu_memory_info(
[](int device_id) {
return onnxruntime::make_unique<CPUAllocator>(
OrtMemoryInfo("CUDA_CPU", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), device_id,
OrtMemTypeCPUInput));
},
CPU_ALLOCATOR_DEVICE_ID);

InsertAllocator(CreateAllocator(cpu_memory_info));
}

CUDAExecutionProvider::~CUDAExecutionProvider() {
auto cpu_alloc = GetAllocator(CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
auto cpu_alloc = GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
{
std::lock_guard<OrtMutex> lock(deferred_release_cpu_ptr_mutex_);
auto it = deferred_release_cpu_ptr_.begin();
@@ -1991,4 +1959,60 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
return result;
}

void CUDAExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager> allocator_manager) {
// Try to get a CUDA allocator from allocator manager first
// Used to allocate CUDA device memory
auto cuda_alloc = allocator_manager->GetAllocator(info_.device_id, OrtMemTypeDefault);
if (nullptr == cuda_alloc) {
AllocatorCreationInfo default_memory_info(
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<CUDAAllocator>(device_id, CUDA);
},
info_.device_id,
true,
{info_.cuda_mem_limit,
static_cast<int>(info_.arena_extend_strategy),
-1, -1});
cuda_alloc = CreateAllocator(default_memory_info);
allocator_manager->InsertAllocator(cuda_alloc);
}
TryInsertAllocator(cuda_alloc);

// OrtMemTypeCPUOutput -- allocated by cudaMallocHost, used to copy CUDA device memory to CPU
// Using pinned memory instead of pageable memory makes the data transfer faster
// Used by node MemcpyToHost only
auto cuda_pinned_alloc = allocator_manager->GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPUOutput);
if (nullptr == cuda_pinned_alloc) {
AllocatorCreationInfo pinned_memory_info(
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, CUDA_PINNED);
},
DEFAULT_CPU_ALLOCATOR_DEVICE_ID);

cuda_pinned_alloc = CreateAllocator(pinned_memory_info);
allocator_manager->InsertAllocator(cuda_pinned_alloc);
}
TryInsertAllocator(cuda_pinned_alloc);

// OrtMemTypeCPUInput -- the CUDA op places the input on CPU, and it will not be accessed by the CUDA kernel, so there is no sync issue
auto cuda_cpu_alloc = allocator_manager->GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPUInput);
if (nullptr == cuda_cpu_alloc) {
// TODO: this is actually used for the cuda kernels which explicitly ask for inputs from CPU.
// This will be refactored/removed when allocator and execution provider are decoupled.
// Need to move the OrtMemoryType out of Allocator; that's one thing blocking us from sharing it with the CPU EP
// CPUAllocator is OrtMemTypeDefault for CPU EP
AllocatorCreationInfo cpu_memory_info(
[](int device_id) {
return onnxruntime::make_unique<CPUAllocator>(
OrtMemoryInfo("CUDA_CPU", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), device_id,
OrtMemTypeCPUInput));
},
DEFAULT_CPU_ALLOCATOR_DEVICE_ID);

cuda_cpu_alloc = CreateAllocator(cpu_memory_info);
allocator_manager->InsertAllocator(cuda_cpu_alloc);
}
TryInsertAllocator(cuda_cpu_alloc);
}

} // namespace onnxruntime
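
The TRT EP's matching override is among the changed files not shown in this excerpt. By symmetry with the CUDA EP code above, it presumably follows the same pattern, sketched below: querying the shared manager first means that when the CUDA EP has already registered a CUDA allocator for the device, the TRT EP reuses it rather than creating a second arena (device_id_ is an assumed member name).

// Assumed shape of the TensorRT EP counterpart (not shown in this excerpt).
void TensorrtExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager> allocator_manager) {
  auto cuda_alloc = allocator_manager->GetAllocator(device_id_, OrtMemTypeDefault);
  if (nullptr == cuda_alloc) {
    AllocatorCreationInfo default_memory_info(
        [](OrtDevice::DeviceId id) { return onnxruntime::make_unique<CUDAAllocator>(id, CUDA); },
        device_id_);
    cuda_alloc = CreateAllocator(default_memory_info);
    allocator_manager->InsertAllocator(cuda_alloc);
  }
  TryInsertAllocator(cuda_alloc);  // register with this EP without throwing on duplicates
}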
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.h
@@ -18,8 +18,6 @@

namespace onnxruntime {

const int CPU_ALLOCATOR_DEVICE_ID = 0;

// Logical device representation.
class CUDAExecutionProvider : public IExecutionProvider {
public:
@@ -77,6 +75,8 @@ class CUDAExecutionProvider : public IExecutionProvider {
return CUDAExecutionProviderInfo::ToProviderOptions(info_);
}

void RegisterAllocator(std::shared_ptr<AllocatorManager> allocator_manager) override;

private:
CUDAExecutionProviderInfo info_;
cudaDeviceProp device_prop_;
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cuda/cuda_kernel.h
@@ -44,7 +44,7 @@ class CudaKernel : public OpKernel {

template <typename T>
inline IAllocatorUniquePtr<T> AllocateBufferOnCPUPinned(size_t count_or_bytes) const {
AllocatorPtr allocator = provider_->GetAllocator(CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
AllocatorPtr allocator = provider_->GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
if (!allocator)
return nullptr;
return IAllocator::MakeUniquePtr<T>(allocator, count_or_bytes);
6 changes: 3 additions & 3 deletions onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -124,7 +124,7 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<ROCMPinnedAllocator>(device_id, CUDA_PINNED);
},
CPU_ALLOCATOR_DEVICE_ID);
DEFAULT_CPU_ALLOCATOR_DEVICE_ID);

InsertAllocator(CreateAllocator(pinned_memory_info));

@@ -136,13 +136,13 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in
OrtMemoryInfo("HIP_CPU", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), device_id,
OrtMemTypeCPUInput));
},
CPU_ALLOCATOR_DEVICE_ID);
DEFAULT_CPU_ALLOCATOR_DEVICE_ID);

InsertAllocator(CreateAllocator(cpu_memory_info));
}

ROCMExecutionProvider::~ROCMExecutionProvider() {
auto cpu_alloc = GetAllocator(CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
auto cpu_alloc = GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
{
std::lock_guard<OrtMutex> lock(deferred_release_cpu_ptr_mutex_);
auto it = deferred_release_cpu_ptr_.begin();
2 changes: 0 additions & 2 deletions onnxruntime/core/providers/rocm/rocm_execution_provider.h
@@ -18,8 +18,6 @@

namespace onnxruntime {

const int CPU_ALLOCATOR_DEVICE_ID = 0;

// Logical device representation.
class ROCMExecutionProvider : public IExecutionProvider {
public:
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/rocm/rocm_kernel.h
@@ -41,7 +41,7 @@ class RocmKernel : public OpKernel {

template <typename T>
inline IAllocatorUniquePtr<T> AllocateBufferOnCPUPinned(size_t count_or_bytes) const {
AllocatorPtr allocator = provider_->GetAllocator(CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
AllocatorPtr allocator = provider_->GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
if (!allocator)
return nullptr;
return IAllocator::MakeUniquePtr<T>(allocator, count_or_bytes);
(Diffs for the remaining changed files are not shown in this excerpt.)
