Share allocator between CUDA EP & TRT EP. (microsoft#6332)
* Share allocator between CUDA EP & TRT EP.
Limitations:
1. Does not cover the per-thread allocator created by the CUDA EP; we still need to figure out how to remove it.
2. Needs more identifiers before the CPU allocator can be shared across all EPs.
HectorSVC authored Jan 27, 2021
1 parent 9835b46 commit b5d1a49
Showing 22 changed files with 281 additions and 111 deletions.
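
Reviewer summary: the commit moves allocator creation out of the CUDA EP constructor into a new virtual IExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager>), so allocators live in a shared AllocatorManager instead of per-EP maps. The session-level wiring that hands the manager to each EP is not part of this diff, so the driver sketch below is an assumption; only the AllocatorManager and RegisterAllocator interfaces come from the changes shown.

// Hypothetical driver (not in this diff): one AllocatorManager shared by all
// EPs; each EP either reuses an existing allocator or creates and publishes one.
#include <memory>
#include <vector>
#include "core/framework/allocatormgr.h"
#include "core/framework/execution_provider.h"

void RegisterEpAllocators(std::vector<std::unique_ptr<onnxruntime::IExecutionProvider>>& eps) {
  auto allocator_manager = std::make_shared<onnxruntime::AllocatorManager>();
  for (auto& ep : eps) {
    // If the CUDA EP runs first it creates and inserts its CUDA, pinned, and
    // CPU allocators; when the TRT EP runs, GetAllocator finds them and the
    // TRT EP reuses the same instances.
    ep->RegisterAllocator(allocator_manager);
  }
}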
3 changes: 0 additions & 3 deletions include/onnxruntime/core/framework/allocator.h
@@ -22,9 +22,6 @@ namespace onnxruntime {
constexpr const char* CPU = "Cpu";
constexpr const char* CUDA = "Cuda";
constexpr const char* CUDA_PINNED = "CudaPinned";
// TODO: Unify the allocator for CUDA and Tensorrt
constexpr const char* TRT = "Tensorrt";
constexpr const char* TRT_PINNED = "TensorrtPinned";
constexpr const char* MIGRAPHX = "MIGraphX";
constexpr const char* MIGRAPHX_PINNED = "MIGraphXPinned";

15 changes: 12 additions & 3 deletions include/onnxruntime/core/framework/execution_provider.h
@@ -4,7 +4,6 @@
#pragma once

#ifndef PROVIDER_BRIDGE_PROVIDER
#include <map>
#include <unordered_map>
#include <unordered_set>

@@ -25,13 +24,14 @@ class KernelRegistryManager;

#include "core/framework/provider_options.h"
#include "core/framework/func_api.h"
#include "core/framework/allocatormgr.h"

namespace onnxruntime {

/**
Logical device representation.
*/
using AllocatorMap = std::map<int, AllocatorPtr>;
using AllocatorMap = std::unordered_map<int, AllocatorPtr>;
using MemoryInfoSet = std::set<OrtMemoryInfo>;

// if we export the fused function to a dll, the function will still be in the same binary as onnxruntime
@@ -167,6 +167,8 @@ class IExecutionProvider {

void InsertAllocator(AllocatorPtr allocator);
void ReplaceAllocator(AllocatorPtr allocator);
// TODO: temporary solution, need to unify the interface in EP and AllocatorManager
void TryInsertAllocator(AllocatorPtr allocator);

// creation of a fused node is not supported in a minimal build, so any EP enabled in that scenario must support
// compilation via GraphViewer instances.
@@ -248,10 +250,17 @@ class IExecutionProvider {
@remarks e.g. the TensorRT Execution Provider is used in multiple sessions and the underlying infrastructure caches
compiled kernels, so the name must be unique and deterministic across models and sessions.
NOTE: Ideally this would be a protected method, but to work across the EP bridge it has to be public and
virtual, and ModelMetadefIdGenerator must be defined in the header as well.
*/
virtual int GenerateMetaDefId(const onnxruntime::GraphViewer& graph_viewer, uint64_t& model_hash) const;

/**
Register allocators used by the EP.
TODO: Used for CUDA & TRT only for now; one more PR will apply this to all EPs.
EPs will hold a shared pointer to allocator_manager, which will be the single place where allocators are kept.
*/
virtual void RegisterAllocator(std::shared_ptr<AllocatorManager> allocator_manager);

private:
const std::string type_;
AllocatorMap allocators_;
8 changes: 0 additions & 8 deletions onnxruntime/core/framework/allocator.cc
@@ -69,14 +69,6 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
*out = new OrtMemoryInfo(
onnxruntime::CUDA_PINNED, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast<OrtDevice::DeviceId>(id1)),
id1, mem_type1);
} else if (strcmp(name1, onnxruntime::TRT) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::TRT, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1,
mem_type1);
} else if (strcmp(name1, onnxruntime::TRT_PINNED) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::TRT_PINNED, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast<OrtDevice::DeviceId>(id1)),
id1, mem_type1);
} else {
return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported.");
}
37 changes: 37 additions & 0 deletions onnxruntime/core/framework/allocatormgr.cc
@@ -13,6 +13,13 @@
namespace onnxruntime {
using namespace common;

namespace {
// Assumes max(OrtMemType) <= 1 and min(OrtMemType) == -2
inline int MakeKey(int id, OrtMemType mem_type) {
return id << 2 | (mem_type + 2);
}
} // namespace

AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) {
auto device_allocator = std::unique_ptr<IAllocator>(info.device_alloc_factory(info.device_id));

@@ -54,4 +61,34 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) {
return AllocatorPtr(std::move(device_allocator));
}


// Update the allocator in the manager if already present; ignore if not.
void AllocatorManager::ReplaceAllocator(AllocatorPtr allocator) {
const auto& info = allocator->Info();
auto ite = mem_info_set_.find(info);
if (ite != mem_info_set_.end()) {
const int key = MakeKey(info.id, info.mem_type);
allocators_[key] = allocator;
}
}

void AllocatorManager::InsertAllocator(AllocatorPtr allocator) {
const OrtMemoryInfo& info = allocator->Info();
auto ite = mem_info_set_.find(info);
if (ite != mem_info_set_.end()) {
ORT_THROW("duplicated allocator");
}
const int key = MakeKey(info.id, info.mem_type);
allocators_.insert({key, allocator});
mem_info_set_.insert(ite, info);
allocator_list_.push_back(allocator);
}

AllocatorPtr AllocatorManager::GetAllocator(int id, OrtMemType mem_type) const {
auto iter = allocators_.find(MakeKey(id, mem_type));
if (iter != allocators_.end()) {
return iter->second;
}
return nullptr;
}
} // namespace onnxruntime
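
Note on the key encoding above: MakeKey packs (device id, OrtMemType) into a single int by shifting the id left two bits and biasing the mem type into the low two bits, which is collision-free as long as the mem type stays in [-2, 1] as the comment assumes. A standalone sketch; the OrtMemType constants quoted in the comments are taken from onnxruntime_c_api.h and should be treated as an assumption of this note.

#include <cassert>

// Mirrors the anonymous-namespace helper in allocatormgr.cc above.
// Assumed OrtMemType values: OrtMemTypeCPUInput = -2, OrtMemTypeCPUOutput = -1,
// OrtMemTypeCPU = -1, OrtMemTypeDefault = 0.
inline int MakeKey(int id, int mem_type) {
  return id << 2 | (mem_type + 2);
}

int main() {
  assert(MakeKey(0, 0) == 2);   // device 0, OrtMemTypeDefault
  assert(MakeKey(0, -2) == 0);  // device 0, OrtMemTypeCPUInput
  assert(MakeKey(1, 0) == 6);   // device 1, OrtMemTypeDefault
  // mem_type + 2 occupies the low two bits, so distinct (id, mem_type)
  // pairs map to distinct keys within the assumed range.
  return 0;
}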
27 changes: 27 additions & 0 deletions onnxruntime/core/framework/allocatormgr.h
@@ -6,11 +6,18 @@
#include "core/common/common.h"
#include "core/framework/allocator.h"
#include "core/session/onnxruntime_c_api.h"
#include <unordered_map>

namespace onnxruntime {

using AllocatorFactory = std::function<std::unique_ptr<IAllocator>(OrtDevice::DeviceId)>;

using AllocatorMap = std::unordered_map<int, AllocatorPtr>;
// TODO: update OrtMemoryInfo, use unordered_set instead
using MemoryInfoSet = std::set<OrtMemoryInfo>;

const int DEFAULT_CPU_ALLOCATOR_DEVICE_ID = 0;

struct AllocatorCreationInfo {
AllocatorCreationInfo(AllocatorFactory device_alloc_factory0,
OrtDevice::DeviceId device_id0 = 0,
@@ -33,4 +40,24 @@ struct AllocatorCreationInfo {
// Valid values can be found in onnxruntime_c_api.h.
AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info);

// TODO: Only used for TRT and CUDA EP currently, need to add more identifiers to use it across all EPs
class AllocatorManager {
//
public:
AllocatorManager() = default;
void InsertAllocator(AllocatorPtr allocator);
void ReplaceAllocator(AllocatorPtr allocator);
// Get an allocator with the specified device id and MemType. Returns nullptr if it doesn't exist.
AllocatorPtr GetAllocator(int id, OrtMemType mem_type) const;

private:
AllocatorMap allocators_;
// to ensure only allocators with unique OrtMemoryInfo are registered in the manager.
MemoryInfoSet mem_info_set_;

// convenience list of the allocators; contains the same instances as allocators_
std::vector<AllocatorPtr> allocator_list_;
};

} // namespace onnxruntime
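
A minimal sketch of the get-or-create idiom this class enables, which the cuda_execution_provider.cc diff below applies once per memory type. MyDeviceAllocator and its constructor are hypothetical stand-ins for a concrete IAllocator; everything else comes from the headers in this commit.

// Hypothetical EP-side usage of AllocatorManager: reuse an allocator another
// EP already registered for (device_id, OrtMemTypeDefault), otherwise create
// one and publish it for EPs registered later.
AllocatorPtr GetOrCreateDeviceAllocator(std::shared_ptr<AllocatorManager> allocator_manager,
                                        OrtDevice::DeviceId device_id) {
  AllocatorPtr alloc = allocator_manager->GetAllocator(device_id, OrtMemTypeDefault);
  if (nullptr == alloc) {
    AllocatorCreationInfo info(
        [](OrtDevice::DeviceId id) { return onnxruntime::make_unique<MyDeviceAllocator>(id); },
        device_id);
    alloc = CreateAllocator(info);
    allocator_manager->InsertAllocator(alloc);  // now visible to other EPs
  }
  return alloc;
}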
14 changes: 14 additions & 0 deletions onnxruntime/core/framework/execution_provider.cc
@@ -74,6 +74,20 @@ void IExecutionProvider::InsertAllocator(AllocatorPtr allocator) {
allocator_list_.push_back(allocator);
}

void IExecutionProvider::TryInsertAllocator(AllocatorPtr allocator) {
const OrtMemoryInfo& info = allocator->Info();
auto ite = mem_info_set_.find(info);
if (ite != mem_info_set_.end()) {
LOGS_DEFAULT(WARNING) << "duplicated allocator: " << info.ToString();
return;
}
InsertAllocator(allocator);
}

void IExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager> /*allocator_manager*/) {
}

#if !defined(ORT_MINIMAL_BUILD)
common::Status IExecutionProvider::Compile(const std::vector<onnxruntime::Node*>& /*fused_node*/,
std::vector<NodeComputeInfo>& /*node_compute_funcs*/) {
11 changes: 10 additions & 1 deletion onnxruntime/core/framework/provider_bridge_ort.cc
@@ -203,6 +203,7 @@ struct ProviderHostImpl : ProviderHost {
// IExecutionProvider
AllocatorPtr IExecutionProvider__GetAllocator(const IExecutionProvider* p, int id, OrtMemType mem_type) override { return p->IExecutionProvider::GetAllocator(id, mem_type); }
void IExecutionProvider__InsertAllocator(IExecutionProvider* p, AllocatorPtr allocator) override { return p->IExecutionProvider::InsertAllocator(allocator); }
void IExecutionProvider__TryInsertAllocator(IExecutionProvider* p, AllocatorPtr allocator) override { return p->IExecutionProvider::TryInsertAllocator(allocator); }
std::vector<std::unique_ptr<ComputeCapability>> IExecutionProvider__GetCapability(const IExecutionProvider* p, const onnxruntime::GraphViewer& graph_viewer,
const std::vector<const KernelRegistry*>& kernel_registries) override { return p->IExecutionProvider::GetCapability(graph_viewer, kernel_registries); }
common::Status IExecutionProvider__Compile(IExecutionProvider* p, const std::vector<onnxruntime::Node*>& fused_nodes, std::vector<NodeComputeInfo>& node_compute_funcs) override {
@@ -221,6 +222,10 @@ struct ProviderHostImpl : ProviderHost {
return p->IExecutionProvider::GenerateMetaDefId(graph_viewer, model_hash);
}

void IExecutionProvider__RegisterAllocator(IExecutionProvider* p, std::shared_ptr<AllocatorManager> allocator_manager) override {
return p->IExecutionProvider::RegisterAllocator(allocator_manager);
}

// Status
std::string Status__ToString(const Status* p) override { return p->ToString(); }

@@ -563,7 +568,11 @@ struct ProviderHostImpl : ProviderHost {

const TensorShape& Tensor__Shape(const Tensor* p) override { return p->Shape(); }
size_t Tensor__SizeInBytes(const Tensor* p) override { return p->SizeInBytes(); }
const OrtMemoryInfo& Tensor__Location(const Tensor* p) override { return p->Location(); }

// AllocatorManager
void AllocatorManager__InsertAllocator(AllocatorManager* p, AllocatorPtr allocator) override { p->InsertAllocator(allocator); }
AllocatorPtr AllocatorManager__GetAllocator(AllocatorManager* p, int id, OrtMemType mem_type) override { return p->GetAllocator(id, mem_type); }

} provider_host_;

90 changes: 57 additions & 33 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -107,42 +107,10 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in
size_t free = 0;
size_t total = 0;
CUDA_CALL_THROW(cudaMemGetInfo(&free, &total));

AllocatorCreationInfo default_memory_info(
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<CUDAAllocator>(device_id, CUDA);
},
info_.device_id,
true,
{info_.cuda_mem_limit,
static_cast<int>(info_.arena_extend_strategy),
-1, -1});

InsertAllocator(CreateAllocator(default_memory_info));

AllocatorCreationInfo pinned_memory_info(
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, CUDA_PINNED);
},
CPU_ALLOCATOR_DEVICE_ID);

InsertAllocator(CreateAllocator(pinned_memory_info));

// TODO: this is actually used for the cuda kernels which explicitly ask for inputs from CPU.
// This will be refactored/removed when allocator and execution provider are decoupled.
AllocatorCreationInfo cpu_memory_info(
[](int device_id) {
return onnxruntime::make_unique<CPUAllocator>(
OrtMemoryInfo("CUDA_CPU", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), device_id,
OrtMemTypeCPUInput));
},
CPU_ALLOCATOR_DEVICE_ID);

InsertAllocator(CreateAllocator(cpu_memory_info));
}

CUDAExecutionProvider::~CUDAExecutionProvider() {
auto cpu_alloc = GetAllocator(CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
auto cpu_alloc = GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
{
std::lock_guard<OrtMutex> lock(deferred_release_cpu_ptr_mutex_);
auto it = deferred_release_cpu_ptr_.begin();
@@ -1991,4 +1959,60 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
return result;
}

void CUDAExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager> allocator_manager) {
// Try to get a CUDA allocator from allocator manager first
// Used to allocate CUDA device memory
auto cuda_alloc = allocator_manager->GetAllocator(info_.device_id, OrtMemTypeDefault);
if (nullptr == cuda_alloc) {
AllocatorCreationInfo default_memory_info(
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<CUDAAllocator>(device_id, CUDA);
},
info_.device_id,
true,
{info_.cuda_mem_limit,
static_cast<int>(info_.arena_extend_strategy),
-1, -1});
cuda_alloc = CreateAllocator(default_memory_info);
allocator_manager->InsertAllocator(cuda_alloc);
}
TryInsertAllocator(cuda_alloc);

// OrtMemTypeCPUOutput -- allocated by cudaMallocHost, used to copy CUDA device memory to CPU
// Using pinned memory instead of pageable memory makes the data transfer faster
// Used by node MemcpyToHost only
auto cuda_pinned_alloc = allocator_manager->GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPUOutput);
if (nullptr == cuda_pinned_alloc) {
AllocatorCreationInfo pinned_memory_info(
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<CUDAPinnedAllocator>(device_id, CUDA_PINNED);
},
DEFAULT_CPU_ALLOCATOR_DEVICE_ID);

cuda_pinned_alloc = CreateAllocator(pinned_memory_info);
allocator_manager->InsertAllocator(cuda_pinned_alloc);
}
TryInsertAllocator(cuda_pinned_alloc);

// OrtMemTypeCPUInput -- the CUDA op places the input on CPU, and it will not be accessed by the CUDA kernel, so there is no sync issue
auto cuda_cpu_alloc = allocator_manager->GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPUInput);
if (nullptr == cuda_cpu_alloc) {
// TODO: this is actually used for the cuda kernels which explicitly ask for inputs from CPU.
// This will be refactored/removed when allocator and execution provider are decoupled.
// Need to move the OrtMemoryType out of Allocator; that's one thing blocking us from sharing it with the CPU EP
// CPUAllocator is OrtMemTypeDefault for CPU EP
AllocatorCreationInfo cpu_memory_info(
[](int device_id) {
return onnxruntime::make_unique<CPUAllocator>(
OrtMemoryInfo("CUDA_CPU", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), device_id,
OrtMemTypeCPUInput));
},
DEFAULT_CPU_ALLOCATOR_DEVICE_ID);

cuda_cpu_alloc = CreateAllocator(cpu_memory_info);
allocator_manager->InsertAllocator(cuda_cpu_alloc);
}
TryInsertAllocator(cuda_cpu_alloc);
}

} // namespace onnxruntime
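
The TRT EP's matching override is among the changed files not shown in this excerpt. By symmetry with the CUDA EP code above, it presumably follows the same pattern, sketched below: querying the shared manager first means that when the CUDA EP has already registered a CUDA allocator for the device, the TRT EP reuses it rather than creating a second arena (device_id_ is an assumed member name).

// Assumed shape of the TensorRT EP counterpart (not shown in this excerpt).
void TensorrtExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager> allocator_manager) {
  auto cuda_alloc = allocator_manager->GetAllocator(device_id_, OrtMemTypeDefault);
  if (nullptr == cuda_alloc) {
    AllocatorCreationInfo default_memory_info(
        [](OrtDevice::DeviceId id) { return onnxruntime::make_unique<CUDAAllocator>(id, CUDA); },
        device_id_);
    cuda_alloc = CreateAllocator(default_memory_info);
    allocator_manager->InsertAllocator(cuda_alloc);
  }
  TryInsertAllocator(cuda_alloc);  // register with this EP without throwing on duplicates
}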
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.h
@@ -18,8 +18,6 @@

namespace onnxruntime {

const int CPU_ALLOCATOR_DEVICE_ID = 0;

// Logical device representation.
class CUDAExecutionProvider : public IExecutionProvider {
public:
@@ -77,6 +75,8 @@ class CUDAExecutionProvider : public IExecutionProvider {
return CUDAExecutionProviderInfo::ToProviderOptions(info_);
}

void RegisterAllocator(std::shared_ptr<AllocatorManager> allocator_manager) override;

private:
CUDAExecutionProviderInfo info_;
cudaDeviceProp device_prop_;
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cuda/cuda_kernel.h
@@ -44,7 +44,7 @@ class CudaKernel : public OpKernel {

template <typename T>
inline IAllocatorUniquePtr<T> AllocateBufferOnCPUPinned(size_t count_or_bytes) const {
AllocatorPtr allocator = provider_->GetAllocator(CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
AllocatorPtr allocator = provider_->GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
if (!allocator)
return nullptr;
return IAllocator::MakeUniquePtr<T>(allocator, count_or_bytes);
6 changes: 3 additions & 3 deletions onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -124,7 +124,7 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in
[](OrtDevice::DeviceId device_id) {
return onnxruntime::make_unique<ROCMPinnedAllocator>(device_id, CUDA_PINNED);
},
CPU_ALLOCATOR_DEVICE_ID);
DEFAULT_CPU_ALLOCATOR_DEVICE_ID);

InsertAllocator(CreateAllocator(pinned_memory_info));

@@ -136,13 +136,13 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in
OrtMemoryInfo("HIP_CPU", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(), device_id,
OrtMemTypeCPUInput));
},
CPU_ALLOCATOR_DEVICE_ID);
DEFAULT_CPU_ALLOCATOR_DEVICE_ID);

InsertAllocator(CreateAllocator(cpu_memory_info));
}

ROCMExecutionProvider::~ROCMExecutionProvider() {
auto cpu_alloc = GetAllocator(CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
auto cpu_alloc = GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
{
std::lock_guard<OrtMutex> lock(deferred_release_cpu_ptr_mutex_);
auto it = deferred_release_cpu_ptr_.begin();
2 changes: 0 additions & 2 deletions onnxruntime/core/providers/rocm/rocm_execution_provider.h
@@ -18,8 +18,6 @@

namespace onnxruntime {

const int CPU_ALLOCATOR_DEVICE_ID = 0;

// Logical device representation.
class ROCMExecutionProvider : public IExecutionProvider {
public:
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/rocm/rocm_kernel.h
@@ -41,7 +41,7 @@ class RocmKernel : public OpKernel {

template <typename T>
inline IAllocatorUniquePtr<T> AllocateBufferOnCPUPinned(size_t count_or_bytes) const {
AllocatorPtr allocator = provider_->GetAllocator(CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
AllocatorPtr allocator = provider_->GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
if (!allocator)
return nullptr;
return IAllocator::MakeUniquePtr<T>(allocator, count_or_bytes);
(Diffs for the remaining changed files are not shown in this excerpt.)
