
Commit

Revert "Let mlas use the session threadpool for gemm functions (microsoft#1196)"

This reverts commit 280ab9a.
snnn committed Jun 18, 2019
1 parent 13d8558 commit 40ec691
Showing 27 changed files with 234 additions and 231 deletions.
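In effect, the revert swaps the GEMM execution context in these contrib ops from the session-owned concurrency::ThreadPool back to the CPUMathUtil singleton, and restores the per-kernel thread pools those ops used before microsoft#1196. A minimal before/after sketch of the call shape follows; the names M, N, K, A, B, C, lda, ldb, ldc and tp are placeholders standing in for the operator-specific arguments visible in the hunks below, not identifiers from this commit.

// Before the revert (microsoft#1196): GEMM ran on the session thread pool,
// obtained inside the kernel via OpKernelContextInternal::GetOperatorThreadPool().
math::GemmEx<T, concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
                                         M, N, K, T{1.0},
                                         A, lda,
                                         B, ldb, T{0.0},
                                         C, ldc, tp);

// After the revert: GEMM goes back to the CPUMathUtil singleton.
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
                             M, N, K, T{1.0},
                             A, lda,
                             B, ldb, T{0.0},
                             C, ldc, &CPUMathUtil::Instance());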
3 changes: 0 additions & 3 deletions onnxruntime/contrib_ops/cpu/attnlstm/attention_mechanism.h
@@ -6,9 +6,6 @@
#include <gsl/span>

namespace onnxruntime {
namespace concurrency {
class ThreadPool;
}
namespace contrib {

template <typename T>
23 changes: 11 additions & 12 deletions onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
@@ -2,7 +2,6 @@
// Licensed under the MIT License.

#include "attention_wrapper.h"
#include "core/framework/op_kernel_context_internal.h"
#include "core/providers/cpu/rnn/rnn_helpers.h"

#include <stdexcept>
@@ -35,14 +34,14 @@ AttentionWrapper<T>::AttentionWrapper(AllocatorPtr alloc, const logging::Logger&

// rnn_cell_output is of [batch_size, rnn_cell_hidden_size]
template <typename T>
void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_output, concurrency::ThreadPool* tp) {
void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_output) {
if (has_attn_layer_) {
// rnn_cell_output * cell_weights, (part of the attention layer above the attention mechanism).
math::GemmEx<T, concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_layer_depth_, inner_cell_hidden_size_, T{1.0},
rnn_cell_output.data(), inner_cell_hidden_size_,
attn_layer_cell_weights_.data(), attn_layer_depth_, T{0.0},
attn_states_.data(), attn_layer_depth_, tp);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_layer_depth_, inner_cell_hidden_size_, T{1.0},
rnn_cell_output.data(), inner_cell_hidden_size_,
attn_layer_cell_weights_.data(), attn_layer_depth_, T{0.0},
attn_states_.data(), attn_layer_depth_, &CPUMathUtil::Instance());
}

// Get the context which is calculated within attention mechanism.
@@ -55,11 +54,11 @@ void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_outpu
//concat([p_cell_output, context]) * stack([attn_layer_cell_weights_, attn_layer_attn_weights_]) =
// p_cell_output * attn_layer_cell_weights_ + context * attn_layer_attn_weights_
// The first part is calculated above. Here just add the latter.
math::GemmEx<T, concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0},
attn_context_.data(), attn_context_depth_,
attn_layer_attn_weights_.data(), attn_layer_depth_, T{1.0},
attn_states_.data(), attn_layer_depth_, tp);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0},
attn_context_.data(), attn_context_depth_,
attn_layer_attn_weights_.data(), attn_layer_depth_, T{1.0},
attn_states_.data(), attn_layer_depth_, &CPUMathUtil::Instance());
}
}

5 changes: 1 addition & 4 deletions onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h
@@ -10,9 +10,6 @@
#include "core/framework/allocator.h"

namespace onnxruntime {
namespace concurrency {
class ThreadPool;
}
namespace contrib {

template <typename T>
@@ -30,7 +27,7 @@ class AttentionWrapper {
virtual ~AttentionWrapper() = default;

// Calculation based on output of the inner wrapped rnn_cell.
void ProcessOutput(const gsl::span<const T>& rnn_cell_state, onnxruntime::concurrency::ThreadPool* tp);
void ProcessOutput(const gsl::span<const T>& rnn_cell_state);

gsl::span<const T> GetAttnStates() const;

34 changes: 17 additions & 17 deletions onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
@@ -15,8 +15,8 @@ namespace contrib {
template <typename T>
BahdanauAttention<T>::BahdanauAttention(AllocatorPtr allocator, const logging::Logger& logger,
int batch_size, int max_memory_step, int memory_depth,
int query_depth, int attn_depth, bool normalize, concurrency::ThreadPool* tp)
: allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize), tp_(tp) {
int query_depth, int attn_depth, bool normalize)
: allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize) {
values_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * memory_depth_, values_ptr_, true);
keys_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * attn_depth_, keys_ptr_, true);
processed_query_ = Allocate(allocator_, batch_size_ * attn_depth_, processed_query_ptr_, true);
@@ -72,11 +72,11 @@ void BahdanauAttention<T>::PrepareMemory(
"Real memory steps ", mem_steps, " is not in (0, ", max_memory_steps_, "]");
}

math::GemmEx<T, concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
batch_size_ * max_memory_steps_, attn_depth_, memory_depth_, T{1.0},
memory.data(), memory_depth_,
memory_layer_weights_.data(), attn_depth_, T{0.0},
keys_.data(), attn_depth_, tp_);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
batch_size_ * max_memory_steps_, attn_depth_, memory_depth_, T{1.0},
memory.data(), memory_depth_,
memory_layer_weights_.data(), attn_depth_, T{0.0},
keys_.data(), attn_depth_, &CPUMathUtil::Instance());
}

template <typename T>
@@ -115,11 +115,11 @@ void BahdanauAttention<T>::Compute(
const gsl::span<T>& output,
const gsl::span<T>& aligns) const {
//process query in dense query layer without bias
math::GemmEx<T, onnxruntime::concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_depth_, query_depth_, T{1.0},
queries.data(), query_depth_,
query_layer_weights_.data(), attn_depth_, T{0.0},
processed_query_.data(), attn_depth_, tp_);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_depth_, query_depth_, T{1.0},
queries.data(), query_depth_,
query_layer_weights_.data(), attn_depth_, T{0.0},
processed_query_.data(), attn_depth_, &CPUMathUtil::Instance());

std::fill(aligns.begin(), aligns.end(), T{});

@@ -146,11 +146,11 @@ void BahdanauAttention<T>::Compute(
// Calculate the context
auto outspan = output.subspan(b * memory_depth_);
auto values = values_.subspan(b * max_memory_steps_ * memory_depth_);
math::GemmEx<T, onnxruntime::concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
1, memory_depth_, max_memory_steps_, T{1.0},
alignments, max_memory_steps_,
values.data(), memory_depth_, T{0.0},
outspan.data(), memory_depth_, tp_);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
1, memory_depth_, max_memory_steps_, T{1.0},
alignments, max_memory_steps_,
values.data(), memory_depth_, T{0.0},
outspan.data(), memory_depth_, &CPUMathUtil::Instance());
}
}

4 changes: 2 additions & 2 deletions onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h
@@ -23,7 +23,7 @@ class BahdanauAttention : public IAttentionMechanism<T> {
int memory_depth,
int query_depth,
int attn_depth,
bool normalize, concurrency::ThreadPool* tp);
bool normalize);

void SetWeights(
const gsl::span<const T>& attn_weights,
@@ -53,6 +53,7 @@ class BahdanauAttention : public IAttentionMechanism<T> {
private:
AllocatorPtr allocator_;
const logging::Logger& logger_;

int batch_size_;
int max_memory_steps_;
int memory_depth_;
@@ -76,7 +77,6 @@ class BahdanauAttention : public IAttentionMechanism<T> {
gsl::span<int> mem_seq_lengths_;

bool normalize_;
concurrency::ThreadPool* const tp_;
};

} // namespace contrib
15 changes: 6 additions & 9 deletions onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
@@ -9,7 +9,6 @@
#include "core/common/common.h"
#include "core/common/logging/logging.h"
#include "core/framework/allocator.h"
#include "core/framework/op_kernel_context_internal.h"

namespace onnxruntime {
namespace contrib {
@@ -71,8 +70,6 @@ static gsl::span<const T> SecondHalfSpan(const gsl::span<const T>& dspan) {

template <typename T>
Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
auto ctx_internal = static_cast<OpKernelContextInternal*>(&context);
auto tp = ctx_internal->GetOperatorThreadPool();
auto& logger = context.Logger();

// original lstm processing
@@ -232,7 +229,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
last_cell_size_per_direction);

auto fam = std::make_unique<BahdanauAttention<T>>(
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false, tp);
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false);
fam->SetWeights(
FirstHalfSpan(am_v_weights.DataAsSpan<T>()),
FirstHalfSpan(am_query_layer_weights.DataAsSpan<T>()),
@@ -251,10 +248,10 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
activation_funcs_.Entries()[0],
activation_funcs_.Entries()[1],
activation_funcs_.Entries()[2],
clip_, *tp);
clip_, ttp_);

auto bam = std::make_unique<BahdanauAttention<T>>(
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false, tp);
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false);
bam->SetWeights(
SecondHalfSpan(am_v_weights.DataAsSpan<T>()),
SecondHalfSpan(am_query_layer_weights.DataAsSpan<T>()),
@@ -273,14 +270,14 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
activation_funcs_.Entries()[3],
activation_funcs_.Entries()[4],
activation_funcs_.Entries()[5],
clip_, *tp);
clip_, ttp_);

fw->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1);
bw->Compute(input, sequence_lens_span, num_directions_, input_weights_2, hidden_weights_2, output_2, hidden_output_2, last_cell_2);

} else {
auto fam = std::make_unique<BahdanauAttention<T>>(
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false, tp);
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false);
fam->SetWeights(
am_v_weights.DataAsSpan<T>(),
am_query_layer_weights.DataAsSpan<T>(),
@@ -299,7 +296,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
activation_funcs_.Entries()[0],
activation_funcs_.Entries()[1],
activation_funcs_.Entries()[2],
clip_, *tp);
clip_, ttp_);

fw->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1);
}
6 changes: 6 additions & 0 deletions onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
@@ -91,6 +91,12 @@ class DeepCpuAttnLstmOp final : public OpKernel {
bool input_forget_ = false;

ActivationFuncs activation_funcs_;

// Threadpool for operator. If concurrent Compute calls are possible, it will be shared
// across them. mutable due to this.
// The alternative would be to create a threadpool in each call to Compute but that would incur thread creation
// cost on every call.
mutable onnxruntime::concurrency::ThreadPool ttp_{"DEEPCPU_ATTN_LSTM", (int)std::thread::hardware_concurrency()};
};

} // namespace contrib
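The comment added in the hunk above explains the trade-off behind the restored ttp_ member: one dedicated thread pool per kernel instance, shared across any concurrent Compute calls, rather than creating a pool on every call or borrowing the session pool (the approach this commit reverts). A rough sketch of that pattern follows, assuming the usual onnxruntime headers; ExampleKernel and ComputeImpl are hypothetical names, and only the ThreadPool member mirrors the diff.

#include <thread>

#include "core/framework/op_kernel.h"   // OpKernel, OpKernelContext (assumed paths)
#include "core/platform/threadpool.h"   // onnxruntime::concurrency::ThreadPool

namespace onnxruntime {

class ExampleKernel final : public OpKernel {
 public:
  explicit ExampleKernel(const OpKernelInfo& info) : OpKernel(info) {}

  Status Compute(OpKernelContext* context) const override {
    // ttp_ is shared by concurrent Compute calls on this kernel instance,
    // so no thread-creation cost is paid per call; hence the mutable member.
    return ComputeImpl(*context, ttp_);
  }

 private:
  Status ComputeImpl(OpKernelContext& context, concurrency::ThreadPool& tp) const;

  mutable concurrency::ThreadPool ttp_{"EXAMPLE_KERNEL",
                                       static_cast<int>(std::thread::hardware_concurrency())};
};

}  // namespace onnxruntime

The cost of this pattern is one pool per kernel instance; the reverted change had instead routed the work to the session-wide pool exposed through OpKernelContextInternal::GetOperatorThreadPool(), as the op_kernel_context_internal.h hunk below shows.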
9 changes: 4 additions & 5 deletions onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
@@ -200,7 +200,6 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
gsl::span<T>& outputs,
gsl::span<T>& final_hidden_state,
gsl::span<T>& final_cell_state) {
onnxruntime::concurrency::ThreadPool* tp = &ttp_;
// copy spans (just T* and size, not data in span) as we may change them
gsl::span<const T> inputs = inputs_arg;
gsl::span<const int> sequence_lengths = sequence_lengths_arg;
@@ -255,7 +254,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
input_weights.cbegin(), input_weights.cend(), // W[iofc]^T
input_size_ + attention_size_, T{0.0},
output_iofc_.begin(), output_iofc_.end(),
hidden_size_x4, tp);
hidden_size_x4);

DumpMatrix("Xt*(W[iofc]^T)", output_iofc_.data(), total_rows, hidden_size_x4);

@@ -297,7 +296,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
input_weights.cbegin() + input_size_, input_weights.cend(), // WA[iofc]
input_size_ + attention_size_, T{1.0},
step_out_IOFC, output_iofc_.end(), // input contains Xt*(W[iofc]^T)
hidden_size_x4, tp);
hidden_size_x4);

// calculate Xt*(W[iofc]^T) + Ht-1*R[iofc]
ComputeGemm(batch_size_, hidden_size_x4, hidden_size_, T{1.0},
Expand All @@ -306,7 +305,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
recurrent_weights.cbegin(), recurrent_weights.cend(), // R[iofc]
hidden_size_, T{1.0},
step_out_IOFC, output_iofc_.end(), // input contains Xt*(W[iofc]^T)
hidden_size_x4, tp);
hidden_size_x4);

span_T_iter batched_output, batched_output_end;
if (output_sequence) {
@@ -346,7 +345,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
previous_state = batched_output;
previous_state_end = batched_output_end;

attention_wrapper_.ProcessOutput(outputs.subspan(step * output_step_length, batch_size_ * hidden_size_), tp);
attention_wrapper_.ProcessOutput(outputs.subspan(step * output_step_length, batch_size_ * hidden_size_));
}
}

12 changes: 4 additions & 8 deletions onnxruntime/contrib_ops/cpu/word_conv_embedding.cc
@@ -6,7 +6,6 @@
#include "core/util/math.h"
#include "core/util/math_cpuonly.h"
#include "core/mlas/inc/mlas.h"
#include "core/framework/op_kernel_context_internal.h"

namespace onnxruntime {
namespace contrib {
@@ -46,7 +45,7 @@ void WordConvEmbedding::ComputeConvMaxPoolWithActivation(
int64_t char_embedding_size,
int64_t filter_width,
int64_t num_filters,
float* output, concurrency::ThreadPool* tp) const {
float* output) const {
int64_t input_word_size = word_len * char_embedding_size;
int64_t unfolded_width = word_len - filter_width + 1;
int64_t unfolded_kernal_size = filter_width * char_embedding_size;
@@ -84,12 +83,12 @@ void WordConvEmbedding::ComputeConvMaxPoolWithActivation(
tmp_word_inx++;
}

math::GemmEx<float, concurrency::ThreadPool>(
math::GemmEx<float, CPUMathUtil>(
CblasNoTrans, CblasTrans,
static_cast<int>(words_unfolded_width), static_cast<int>(num_filters), static_cast<int>(unfolded_kernal_size), 1.0f,
unfolded_buffer_p.get(), static_cast<int>(unfolded_kernal_size),
weights, static_cast<int>(unfolded_kernal_size), 0.0f,
conv_buf_p, static_cast<int>(num_filters), tp);
conv_buf_p, static_cast<int>(num_filters), &CPUMathUtil::Instance());

for (int64_t unfolded_inx = 0; unfolded_inx < words_unfolded_width; unfolded_inx++)
for (int64_t filter_inx = 0; filter_inx < num_filters; filter_inx++) {
Expand Down Expand Up @@ -161,9 +160,6 @@ Status WordConvEmbedding::ValidateInputShape(const TensorShape& w_conv_shape, co
}

Status WordConvEmbedding::Compute(OpKernelContext* ctx) const {
auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
auto tp = ctx_internal->GetOperatorThreadPool();

// original lstm processing
const Tensor& sequence = *(ctx->Input<Tensor>(0)); // sequence: [sequence_length, word_length]
const Tensor& w_conv = *(ctx->Input<Tensor>(1)); // conv weight: [M, C/group, kH, kW]
@@ -220,7 +216,7 @@ Status WordConvEmbedding::Compute(OpKernelContext* ctx) const {
char_embedding_size,
filter_width,
filter_size,
Y->MutableData<float>(), tp);
Y->MutableData<float>());

return Status::OK();
}
5 changes: 1 addition & 4 deletions onnxruntime/contrib_ops/cpu/word_conv_embedding.h
@@ -8,9 +8,6 @@
#include "core/framework/tensor.h"

namespace onnxruntime {
namespace concurrency {
class ThreadPool;
}
namespace contrib {

class WordConvEmbedding final : public OpKernel {
@@ -41,7 +38,7 @@ class WordConvEmbedding final : public OpKernel {
int64_t char_embedding_size,
int64_t filter_width,
int64_t num_filters,
float* output, onnxruntime::concurrency::ThreadPool* tp) const;
float* output) const;
void CalculateLengthOfEachWordInSequence(
const int* seq_ptr,
int* words_len_ptr,
1 change: 0 additions & 1 deletion onnxruntime/core/framework/op_kernel_context_internal.h
@@ -58,7 +58,6 @@ class OpKernelContextInternal : public OpKernelContext {
const bool& GetTerminateFlag() const noexcept { return terminate_flag_; }

const onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() const { return session_state_.GetThreadPool(); }
onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() { return session_state_.GetThreadPool(); }

private:
const SessionState& session_state_;
