
Commit

Revert "Let mlas use the session threadpool for gemm functions (microsoft#1196)"

This reverts commit 280ab9a.
snnn committed Jun 18, 2019
1 parent 13d8558 commit 40ec691
Showing 27 changed files with 234 additions and 231 deletions.
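In effect, the revert swaps the GEMM execution context in these contrib ops from the session-owned concurrency::ThreadPool back to the CPUMathUtil singleton, and restores the per-kernel thread pools those ops used before microsoft#1196. A minimal before/after sketch of the call shape follows; the names M, N, K, A, B, C, lda, ldb, ldc and tp are placeholders standing in for the operator-specific arguments visible in the hunks below, not identifiers from this commit.

// Before the revert (microsoft#1196): GEMM ran on the session thread pool,
// obtained inside the kernel via OpKernelContextInternal::GetOperatorThreadPool().
math::GemmEx<T, concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
                                         M, N, K, T{1.0},
                                         A, lda,
                                         B, ldb, T{0.0},
                                         C, ldc, tp);

// After the revert: GEMM goes back to the CPUMathUtil singleton.
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
                             M, N, K, T{1.0},
                             A, lda,
                             B, ldb, T{0.0},
                             C, ldc, &CPUMathUtil::Instance());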
3 changes: 0 additions & 3 deletions onnxruntime/contrib_ops/cpu/attnlstm/attention_mechanism.h
@@ -6,9 +6,6 @@
#include <gsl/span>

namespace onnxruntime {
namespace concurrency {
class ThreadPool;
}
namespace contrib {

template <typename T>
23 changes: 11 additions & 12 deletions onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
@@ -2,7 +2,6 @@
// Licensed under the MIT License.

#include "attention_wrapper.h"
#include "core/framework/op_kernel_context_internal.h"
#include "core/providers/cpu/rnn/rnn_helpers.h"

#include <stdexcept>
@@ -35,14 +34,14 @@ AttentionWrapper<T>::AttentionWrapper(AllocatorPtr alloc, const logging::Logger&

// rnn_cell_output is of [batch_size, rnn_cell_hidden_size]
template <typename T>
void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_output, concurrency::ThreadPool* tp) {
void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_output) {
if (has_attn_layer_) {
// rnn_cell_output * cell_weights, (part of the attention layer above the attention mechanism).
math::GemmEx<T, concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_layer_depth_, inner_cell_hidden_size_, T{1.0},
rnn_cell_output.data(), inner_cell_hidden_size_,
attn_layer_cell_weights_.data(), attn_layer_depth_, T{0.0},
attn_states_.data(), attn_layer_depth_, tp);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_layer_depth_, inner_cell_hidden_size_, T{1.0},
rnn_cell_output.data(), inner_cell_hidden_size_,
attn_layer_cell_weights_.data(), attn_layer_depth_, T{0.0},
attn_states_.data(), attn_layer_depth_, &CPUMathUtil::Instance());
}

// Get the context which is calculated within attention mechanism.
@@ -55,11 +54,11 @@ void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_outpu
//concat([p_cell_output, context]) * stack([attn_layer_cell_weights_, attn_layer_attn_weights_]) =
// p_cell_output * attn_layer_cell_weights_ + context * attn_layer_attn_weights_
// The first part is calculated above. Here just add the latter.
math::GemmEx<T, concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0},
attn_context_.data(), attn_context_depth_,
attn_layer_attn_weights_.data(), attn_layer_depth_, T{1.0},
attn_states_.data(), attn_layer_depth_, tp);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0},
attn_context_.data(), attn_context_depth_,
attn_layer_attn_weights_.data(), attn_layer_depth_, T{1.0},
attn_states_.data(), attn_layer_depth_, &CPUMathUtil::Instance());
}
}

5 changes: 1 addition & 4 deletions onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h
@@ -10,9 +10,6 @@
#include "core/framework/allocator.h"

namespace onnxruntime {
namespace concurrency {
class ThreadPool;
}
namespace contrib {

template <typename T>
@@ -30,7 +27,7 @@ class AttentionWrapper {
virtual ~AttentionWrapper() = default;

// Calculation based on output of the inner wrapped rnn_cell.
void ProcessOutput(const gsl::span<const T>& rnn_cell_state, onnxruntime::concurrency::ThreadPool* tp);
void ProcessOutput(const gsl::span<const T>& rnn_cell_state);

gsl::span<const T> GetAttnStates() const;

34 changes: 17 additions & 17 deletions onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
@@ -15,8 +15,8 @@ namespace contrib {
template <typename T>
BahdanauAttention<T>::BahdanauAttention(AllocatorPtr allocator, const logging::Logger& logger,
int batch_size, int max_memory_step, int memory_depth,
int query_depth, int attn_depth, bool normalize, concurrency::ThreadPool* tp)
: allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize), tp_(tp) {
int query_depth, int attn_depth, bool normalize)
: allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize) {
values_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * memory_depth_, values_ptr_, true);
keys_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * attn_depth_, keys_ptr_, true);
processed_query_ = Allocate(allocator_, batch_size_ * attn_depth_, processed_query_ptr_, true);
@@ -72,11 +72,11 @@ void BahdanauAttention<T>::PrepareMemory(
"Real memory steps ", mem_steps, " is not in (0, ", max_memory_steps_, "]");
}

math::GemmEx<T, concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
batch_size_ * max_memory_steps_, attn_depth_, memory_depth_, T{1.0},
memory.data(), memory_depth_,
memory_layer_weights_.data(), attn_depth_, T{0.0},
keys_.data(), attn_depth_, tp_);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
batch_size_ * max_memory_steps_, attn_depth_, memory_depth_, T{1.0},
memory.data(), memory_depth_,
memory_layer_weights_.data(), attn_depth_, T{0.0},
keys_.data(), attn_depth_, &CPUMathUtil::Instance());
}

template <typename T>
@@ -115,11 +115,11 @@ void BahdanauAttention<T>::Compute(
const gsl::span<T>& output,
const gsl::span<T>& aligns) const {
//process query in dense query layer without bias
math::GemmEx<T, onnxruntime::concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_depth_, query_depth_, T{1.0},
queries.data(), query_depth_,
query_layer_weights_.data(), attn_depth_, T{0.0},
processed_query_.data(), attn_depth_, tp_);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
batch_size_, attn_depth_, query_depth_, T{1.0},
queries.data(), query_depth_,
query_layer_weights_.data(), attn_depth_, T{0.0},
processed_query_.data(), attn_depth_, &CPUMathUtil::Instance());

std::fill(aligns.begin(), aligns.end(), T{});

@@ -146,11 +146,11 @@ void BahdanauAttention<T>::Compute(
// Calculate the context
auto outspan = output.subspan(b * memory_depth_);
auto values = values_.subspan(b * max_memory_steps_ * memory_depth_);
math::GemmEx<T, onnxruntime::concurrency::ThreadPool>(CblasNoTrans, CblasNoTrans,
1, memory_depth_, max_memory_steps_, T{1.0},
alignments, max_memory_steps_,
values.data(), memory_depth_, T{0.0},
outspan.data(), memory_depth_, tp_);
math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
1, memory_depth_, max_memory_steps_, T{1.0},
alignments, max_memory_steps_,
values.data(), memory_depth_, T{0.0},
outspan.data(), memory_depth_, &CPUMathUtil::Instance());
}
}

4 changes: 2 additions & 2 deletions onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h
@@ -23,7 +23,7 @@ class BahdanauAttention : public IAttentionMechanism<T> {
int memory_depth,
int query_depth,
int attn_depth,
bool normalize, concurrency::ThreadPool* tp);
bool normalize);

void SetWeights(
const gsl::span<const T>& attn_weights,
@@ -53,6 +53,7 @@ class BahdanauAttention : public IAttentionMechanism<T> {
private:
AllocatorPtr allocator_;
const logging::Logger& logger_;

int batch_size_;
int max_memory_steps_;
int memory_depth_;
@@ -76,7 +77,6 @@ class BahdanauAttention : public IAttentionMechanism<T> {
gsl::span<int> mem_seq_lengths_;

bool normalize_;
concurrency::ThreadPool* const tp_;
};

} // namespace contrib
15 changes: 6 additions & 9 deletions onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
@@ -9,7 +9,6 @@
#include "core/common/common.h"
#include "core/common/logging/logging.h"
#include "core/framework/allocator.h"
#include "core/framework/op_kernel_context_internal.h"

namespace onnxruntime {
namespace contrib {
@@ -71,8 +70,6 @@ static gsl::span<const T> SecondHalfSpan(const gsl::span<const T>& dspan) {

template <typename T>
Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
auto ctx_internal = static_cast<OpKernelContextInternal*>(&context);
auto tp = ctx_internal->GetOperatorThreadPool();
auto& logger = context.Logger();

// original lstm processing
@@ -232,7 +229,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
last_cell_size_per_direction);

auto fam = std::make_unique<BahdanauAttention<T>>(
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false, tp);
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false);
fam->SetWeights(
FirstHalfSpan(am_v_weights.DataAsSpan<T>()),
FirstHalfSpan(am_query_layer_weights.DataAsSpan<T>()),
@@ -251,10 +248,10 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
activation_funcs_.Entries()[0],
activation_funcs_.Entries()[1],
activation_funcs_.Entries()[2],
clip_, *tp);
clip_, ttp_);

auto bam = std::make_unique<BahdanauAttention<T>>(
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false, tp);
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false);
bam->SetWeights(
SecondHalfSpan(am_v_weights.DataAsSpan<T>()),
SecondHalfSpan(am_query_layer_weights.DataAsSpan<T>()),
@@ -273,14 +270,14 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
activation_funcs_.Entries()[3],
activation_funcs_.Entries()[4],
activation_funcs_.Entries()[5],
clip_, *tp);
clip_, ttp_);

fw->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1);
bw->Compute(input, sequence_lens_span, num_directions_, input_weights_2, hidden_weights_2, output_2, hidden_output_2, last_cell_2);

} else {
auto fam = std::make_unique<BahdanauAttention<T>>(
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false, tp);
alloc, logger, batch_size, max_memory_step, memory_depth, query_depth, am_attn_size, false);
fam->SetWeights(
am_v_weights.DataAsSpan<T>(),
am_query_layer_weights.DataAsSpan<T>(),
@@ -299,7 +296,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
activation_funcs_.Entries()[0],
activation_funcs_.Entries()[1],
activation_funcs_.Entries()[2],
clip_, *tp);
clip_, ttp_);

fw->Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1);
}
6 changes: 6 additions & 0 deletions onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
@@ -91,6 +91,12 @@ class DeepCpuAttnLstmOp final : public OpKernel {
bool input_forget_ = false;

ActivationFuncs activation_funcs_;

// Threadpool for operator. If concurrent Compute calls are possible, it will be shared
// across them. mutable due to this.
// The alternative would be to create a threadpool in each call to Compute but that would incur thread creation
// cost on every call.
mutable onnxruntime::concurrency::ThreadPool ttp_{"DEEPCPU_ATTN_LSTM", (int)std::thread::hardware_concurrency()};
};

} // namespace contrib
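The comment added in the hunk above explains the trade-off behind the restored ttp_ member: one dedicated thread pool per kernel instance, shared across any concurrent Compute calls, rather than creating a pool on every call or borrowing the session pool (the approach this commit reverts). A rough sketch of that pattern follows, assuming the usual onnxruntime headers; ExampleKernel and ComputeImpl are hypothetical names, and only the ThreadPool member mirrors the diff.

#include <thread>

#include "core/framework/op_kernel.h"   // OpKernel, OpKernelContext (assumed paths)
#include "core/platform/threadpool.h"   // onnxruntime::concurrency::ThreadPool

namespace onnxruntime {

class ExampleKernel final : public OpKernel {
 public:
  explicit ExampleKernel(const OpKernelInfo& info) : OpKernel(info) {}

  Status Compute(OpKernelContext* context) const override {
    // ttp_ is shared by concurrent Compute calls on this kernel instance,
    // so no thread-creation cost is paid per call; hence the mutable member.
    return ComputeImpl(*context, ttp_);
  }

 private:
  Status ComputeImpl(OpKernelContext& context, concurrency::ThreadPool& tp) const;

  mutable concurrency::ThreadPool ttp_{"EXAMPLE_KERNEL",
                                       static_cast<int>(std::thread::hardware_concurrency())};
};

}  // namespace onnxruntime

The cost of this pattern is one pool per kernel instance; the reverted change had instead routed the work to the session-wide pool exposed through OpKernelContextInternal::GetOperatorThreadPool(), as the op_kernel_context_internal.h hunk below shows.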
9 changes: 4 additions & 5 deletions onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
@@ -200,7 +200,6 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
gsl::span<T>& outputs,
gsl::span<T>& final_hidden_state,
gsl::span<T>& final_cell_state) {
onnxruntime::concurrency::ThreadPool* tp = &ttp_;
// copy spans (just T* and size, not data in span) as we may change them
gsl::span<const T> inputs = inputs_arg;
gsl::span<const int> sequence_lengths = sequence_lengths_arg;
@@ -255,7 +254,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
input_weights.cbegin(), input_weights.cend(), // W[iofc]^T
input_size_ + attention_size_, T{0.0},
output_iofc_.begin(), output_iofc_.end(),
hidden_size_x4, tp);
hidden_size_x4);

DumpMatrix("Xt*(W[iofc]^T)", output_iofc_.data(), total_rows, hidden_size_x4);

@@ -297,7 +296,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
input_weights.cbegin() + input_size_, input_weights.cend(), // WA[iofc]
input_size_ + attention_size_, T{1.0},
step_out_IOFC, output_iofc_.end(), // input contains Xt*(W[iofc]^T)
hidden_size_x4, tp);
hidden_size_x4);

// calculate Xt*(W[iofc]^T) + Ht-1*R[iofc]
ComputeGemm(batch_size_, hidden_size_x4, hidden_size_, T{1.0},
Expand All @@ -306,7 +305,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
recurrent_weights.cbegin(), recurrent_weights.cend(), // R[iofc]
hidden_size_, T{1.0},
step_out_IOFC, output_iofc_.end(), // input contains Xt*(W[iofc]^T)
hidden_size_x4, tp);
hidden_size_x4);

span_T_iter batched_output, batched_output_end;
if (output_sequence) {
@@ -346,7 +345,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
previous_state = batched_output;
previous_state_end = batched_output_end;

attention_wrapper_.ProcessOutput(outputs.subspan(step * output_step_length, batch_size_ * hidden_size_), tp);
attention_wrapper_.ProcessOutput(outputs.subspan(step * output_step_length, batch_size_ * hidden_size_));
}
}

12 changes: 4 additions & 8 deletions onnxruntime/contrib_ops/cpu/word_conv_embedding.cc
@@ -6,7 +6,6 @@
#include "core/util/math.h"
#include "core/util/math_cpuonly.h"
#include "core/mlas/inc/mlas.h"
#include "core/framework/op_kernel_context_internal.h"

namespace onnxruntime {
namespace contrib {
@@ -46,7 +45,7 @@ void WordConvEmbedding::ComputeConvMaxPoolWithActivation(
int64_t char_embedding_size,
int64_t filter_width,
int64_t num_filters,
float* output, concurrency::ThreadPool* tp) const {
float* output) const {
int64_t input_word_size = word_len * char_embedding_size;
int64_t unfolded_width = word_len - filter_width + 1;
int64_t unfolded_kernal_size = filter_width * char_embedding_size;
@@ -84,12 +83,12 @@ void WordConvEmbedding::ComputeConvMaxPoolWithActivation(
tmp_word_inx++;
}

math::GemmEx<float, concurrency::ThreadPool>(
math::GemmEx<float, CPUMathUtil>(
CblasNoTrans, CblasTrans,
static_cast<int>(words_unfolded_width), static_cast<int>(num_filters), static_cast<int>(unfolded_kernal_size), 1.0f,
unfolded_buffer_p.get(), static_cast<int>(unfolded_kernal_size),
weights, static_cast<int>(unfolded_kernal_size), 0.0f,
conv_buf_p, static_cast<int>(num_filters), tp);
conv_buf_p, static_cast<int>(num_filters), &CPUMathUtil::Instance());

for (int64_t unfolded_inx = 0; unfolded_inx < words_unfolded_width; unfolded_inx++)
for (int64_t filter_inx = 0; filter_inx < num_filters; filter_inx++) {
Expand Down Expand Up @@ -161,9 +160,6 @@ Status WordConvEmbedding::ValidateInputShape(const TensorShape& w_conv_shape, co
}

Status WordConvEmbedding::Compute(OpKernelContext* ctx) const {
auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
auto tp = ctx_internal->GetOperatorThreadPool();

// original lstm processing
const Tensor& sequence = *(ctx->Input<Tensor>(0)); // sequence: [sequence_length, word_length]
const Tensor& w_conv = *(ctx->Input<Tensor>(1)); // conv weight: [M, C/group, kH, kW]
@@ -220,7 +216,7 @@ Status WordConvEmbedding::Compute(OpKernelContext* ctx) const {
char_embedding_size,
filter_width,
filter_size,
Y->MutableData<float>(), tp);
Y->MutableData<float>());

return Status::OK();
}
5 changes: 1 addition & 4 deletions onnxruntime/contrib_ops/cpu/word_conv_embedding.h
@@ -8,9 +8,6 @@
#include "core/framework/tensor.h"

namespace onnxruntime {
namespace concurrency {
class ThreadPool;
}
namespace contrib {

class WordConvEmbedding final : public OpKernel {
@@ -41,7 +38,7 @@ class WordConvEmbedding final : public OpKernel {
int64_t char_embedding_size,
int64_t filter_width,
int64_t num_filters,
float* output, onnxruntime::concurrency::ThreadPool* tp) const;
float* output) const;
void CalculateLengthOfEachWordInSequence(
const int* seq_ptr,
int* words_len_ptr,
1 change: 0 additions & 1 deletion onnxruntime/core/framework/op_kernel_context_internal.h
@@ -58,7 +58,6 @@ class OpKernelContextInternal : public OpKernelContext {
const bool& GetTerminateFlag() const noexcept { return terminate_flag_; }

const onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() const { return session_state_.GetThreadPool(); }
onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() { return session_state_.GetThreadPool(); }

private:
const SessionState& session_state_;
