Introduce VariadicAlias, remove hardcoded alias limits (microsoft#6106)
* Introduce VariadicAlias, remove hardcoded alias limits

* Include optional-lite in winml build

Co-authored-by: Sherlock Huang <bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
SherlockNoMad and Sherlock Huang authored Dec 11, 2020
1 parent 38c49c2 commit a53f4dd
Showing 13 changed files with 54 additions and 54 deletions.
4 changes: 4 additions & 0 deletions cmake/winml.cmake
@@ -392,6 +392,7 @@ target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/o
target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/protobuf/src)
target_include_directories(winml_lib_image PRIVATE ${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows)
target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
target_include_directories(winml_lib_image PRIVATE ${REPO_ROOT}/cmake/external/optional-lite/include)

# Properties
set_target_properties(winml_lib_image
@@ -507,6 +508,7 @@ target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/pro
target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/SafeInt)
target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
target_include_directories(winml_lib_api PRIVATE ${REPO_ROOT}/cmake/external/optional-lite/include)

# Properties
set_target_properties(winml_lib_api
@@ -583,6 +585,7 @@ target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake
target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/gsl/include)
target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/SafeInt)
target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
target_include_directories(winml_lib_api_experimental PRIVATE ${REPO_ROOT}/cmake/external/optional-lite/include)

# Properties
set_target_properties(winml_lib_api_experimental
@@ -730,6 +733,7 @@ target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/gsl/inc
target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/eigen)
target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/SafeInt)
target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/flatbuffers/include)
target_include_directories(winml_dll PRIVATE ${REPO_ROOT}/cmake/external/optional-lite/include)

# Properties
set_target_properties(winml_dll
19 changes: 17 additions & 2 deletions include/onnxruntime/core/framework/kernel_def_builder.h
@@ -10,6 +10,7 @@
#include <limits.h>

#include "core/common/common.h"
#include "core/common/optional.h"
#include "core/graph/basic_types.h"
#include "core/framework/data_types.h"
#include "core/framework/allocator.h"
@@ -64,6 +65,10 @@ class KernelDef {
return alias_map_;
}

const optional<std::pair<int, int>>& VariadicAlias() const {
return variadic_alias_offsets_;
}

OrtMemType InputMemoryType(size_t input_index) const {
auto it = input_memory_type_args_.find(input_index);
if (it == input_memory_type_args_.end())
@@ -130,7 +135,11 @@

// An element <i, j> means that output j is an alias of input i.
std::vector<std::pair<int, int>> alias_map_;


// This variable stores <input_offset, output_offset> for the variadic alias mapping
// output 'i + output_offset' is an alias of input 'i + input_offset' for all i >= 0
optional<std::pair<int, int>> variadic_alias_offsets_;

// Require input tensors to be allocated contiguously.
bool allocate_inputs_contiguously_ = false;

@@ -220,6 +229,12 @@ class KernelDefBuilder {
KernelDefBuilder& Alias(const std::vector<std::pair<int, int>>& aliases);
KernelDefBuilder& Alias(int input_index, int output_index);

/**
Apply a variadic alias mapping from inputs to outputs.
This is effectively applying Alias(i + input_offset, i + output_offset) for all i >= 0.
*/
KernelDefBuilder& VariadicAlias(int input_offset, int output_offset);

/**
Specify that this kernel requires input tensors to be allocated
contiguously. This allows kernels to execute as a single large
@@ -229,7 +244,7 @@
kernel_def_->allocate_inputs_contiguously_ = true;
return *this;
}

/**
Specify that this kernel requires an input arg
in certain memory type (instead of the default, device memory).
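The new call chains like the existing Alias overloads but covers an unbounded input count. A minimal sketch of a kernel-def fragment using it (the kernel name and the offset-by-one mapping below are illustrative placeholders, not part of this commit):

// Hypothetical kernel: input 0 carries a control value (e.g. an event id),
// inputs 1..n pass through, so output i aliases input i + 1 for all i >= 0.
KernelDefBuilder()
    .SetName("MyVariadicOp")  // placeholder name
    .TypeConstraint("T", DataTypeImpl::AllTensorTypes())
    .VariadicAlias(/*input_offset*/ 1, /*output_offset*/ 0);

In practice the builder result is handed to a registration macro such as ONNX_OPERATOR_KERNEL_EX, as in the kernels updated below.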
16 changes: 15 additions & 1 deletion onnxruntime/core/framework/allocation_planner.cc
@@ -253,6 +253,21 @@ class PlannerImpl {
}
}

const optional<std::pair<int, int>>& variadic_alias_offsets = ci.kernel_def->VariadicAlias();
if (variadic_alias_offsets.has_value()) {
int input_offset = variadic_alias_offsets.value().first;
int output_offset = variadic_alias_offsets.value().second;
// we _must_ reuse this input to satisfy the aliasing requirement (e.g., for AllReduce)
int alias_input_index = output_arg_num - output_offset + input_offset;
if (alias_input_index >= 0 && static_cast<size_t>(alias_input_index) < input_args.size()) {
auto p_input_arg = input_args[alias_input_index];
if (p_input_arg->Exists()) {
*reusable_input = Index(p_input_arg->Name());
return true;
}
}
}

const std::vector<std::pair<int, int>>& inplace_map = ci.kernel_def->MayInplace();
for (auto pair : inplace_map) {
if (pair.second == output_arg_num) {
@@ -770,7 +785,6 @@ class PlannerImpl {
}
}


// Whether a given NodeArg has fence or not.
// If the buffer is reused, need to check whether original OrtValue has fence or not.
bool HasFence(const onnxruntime::NodeArg* arg) {
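To see the arithmetic: the mapping declares output (i + output_offset) as an alias of input (i + input_offset), so for a given output_arg_num the planner solves for the matching input index. With RecordEvent's VariadicAlias(1, 0) below, output 0 resolves to 0 - 0 + 1 = input 1, output 1 to input 2, and so on. A self-contained sketch of the same lookup, with PlannerImpl internals (Index(), NodeArg) stripped out and std::optional standing in for onnxruntime::optional:

#include <cstddef>
#include <optional>
#include <utility>

// Returns the input index whose buffer output `output_arg_num` must reuse,
// or -1 when the variadic alias mapping does not cover this output.
int VariadicAliasInput(const std::optional<std::pair<int, int>>& offsets,
                       int output_arg_num, std::size_t input_count) {
  if (!offsets.has_value()) return -1;
  const int input_offset = offsets->first;
  const int output_offset = offsets->second;
  // output (i + output_offset) aliases input (i + input_offset);
  // solve for the input index given the output index.
  const int alias_input_index = output_arg_num - output_offset + input_offset;
  if (alias_input_index < 0 ||
      static_cast<std::size_t>(alias_input_index) >= input_count) {
    return -1;
  }
  return alias_input_index;
}

// e.g. VariadicAliasInput(std::make_pair(1, 0), 0, 3) == 1
// (RecordEvent below: output 0 reuses input 1's buffer)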
6 changes: 6 additions & 0 deletions onnxruntime/core/framework/kernel_def_builder.cc
@@ -207,4 +207,10 @@ KernelDefBuilder& KernelDefBuilder::Alias(int input_index, int output_index) {
return *this;
}

KernelDefBuilder& KernelDefBuilder::VariadicAlias(int input_offset, int output_offset) {
ORT_ENFORCE(input_offset >= 0 && output_offset >= 0);
kernel_def_->variadic_alias_offsets_ = std::make_pair(input_offset, output_offset);
return *this;
}

} // namespace onnxruntime
19 changes: 0 additions & 19 deletions orttraining/orttraining/training_ops/cpu/controlflow/common.h

This file was deleted.

@@ -2,7 +2,6 @@
// Licensed under the MIT License.

#include "orttraining/training_ops/cpu/controlflow/group.h"
#include "orttraining/training_ops/cpu/controlflow/common.h"

namespace onnxruntime {
namespace contrib {
@@ -26,8 +25,6 @@ ONNX_OPERATOR_KERNEL_EX(
Group);

Status PassThrough::Compute(OpKernelContext* context) const {
ORT_ENFORCE(context->InputCount() <= passthrough_input_count_limit, "Number of inputs for PassThrough node exceeds the limit.");

for (int i = 0; i < context->InputCount(); ++i) {
const auto* X = context->Input<Tensor>(i);
ORT_ENFORCE(X != nullptr);
@@ -44,7 +41,7 @@ ONNX_OPERATOR_KERNEL_EX(
kCpuExecutionProvider,
KernelDefBuilder()
.TypeConstraint("T", DataTypeImpl::AllTensorTypes())
.Alias(AliasRange<0, 0>(0, passthrough_input_count_limit)), // outputs and inputs are mapped one to one
.VariadicAlias(0, 0), // outputs and inputs are mapped one to one
PassThrough);

} // namespace contrib
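With the 10000-pair ceiling gone, PassThrough forwards however many inputs it receives. A sketch of the loop consistent with the visible hunk (the copy helper named below is an assumption; the hunk does not show how each output is produced):

for (int i = 0; i < context->InputCount(); ++i) {
  const auto* X = context->Input<Tensor>(i);
  ORT_ENFORCE(X != nullptr);
  // Produce output i with input i's shape; VariadicAlias(0, 0) lets the
  // planner reuse input i's buffer for output i, so forwarding stays cheap.
  Tensor* Y = context->Output(i, X->Shape());
  CopyCpuTensor(X, Y);  // assumed helper from core/providers/cpu/tensor/utils.h
}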
2 changes: 0 additions & 2 deletions orttraining/orttraining/training_ops/cpu/controlflow/group.h
@@ -15,8 +15,6 @@ class Group final : public OpKernel {
Status Compute(OpKernelContext* context) const override;
};

constexpr int passthrough_input_count_limit = 10000;  // limit of pair count

class PassThrough : public OpKernel {
public:
PassThrough(const OpKernelInfo& info) : OpKernel(info) {}
@@ -3,7 +3,6 @@

#include "orttraining/training_ops/cpu/controlflow/record.h"
#include "core/providers/cpu/tensor/utils.h"
#include "common.h"

namespace onnxruntime {
namespace contrib {
@@ -25,7 +24,7 @@ ONNX_OPERATOR_KERNEL_EX(
KernelDefBuilder()
.TypeConstraint("TInt64", DataTypeImpl::GetTensorType<int64_t>())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.Alias(AliasRange<1, 0>(0, 1024)),
.VariadicAlias(1, 0), // outputs and inputs are mapped one to one, with input offset by 1
RecordEvent);

Status RecordEvent::Compute(OpKernelContext* ctx) const {
3 changes: 1 addition & 2 deletions orttraining/orttraining/training_ops/cpu/controlflow/wait.cc
@@ -3,7 +3,6 @@

#include "orttraining/training_ops/cpu/controlflow/wait.h"
#include "core/providers/cpu/tensor/utils.h"
#include "common.h"

namespace onnxruntime {
namespace contrib {
@@ -25,7 +24,7 @@ ONNX_OPERATOR_KERNEL_EX(
KernelDefBuilder()
.TypeConstraint("TInt64", DataTypeImpl::GetTensorType<int64_t>())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.Alias(AliasRange<1, 0>(0, 1024)),
.VariadicAlias(1, 0), // outputs and inputs are mapped one to one, with input offset by 1
WaitEvent);

Status WaitEvent::Compute(OpKernelContext* ctx) const {
@@ -17,9 +17,9 @@ Status NcclAllReduce::ComputeInternal(OpKernelContext* context) const {
void* output_data = context->Output(0, context->Input<Tensor>(0)->Shape())->MutableDataRaw();
MLDataType onnx_type = context->Input<Tensor>(0)->DataType();

// Although we assume the input memory addresses are contiguous, ORT pads activation tensors to
// 64-byte alignment and initializers to 256-byte alignment, so there are tiny padding gaps inside
// the contiguous buffer. We have to AllReduce over the entire buffer, including the padding space.
const Tensor* last_tensor = context->Input<Tensor>(context->InputCount() - 1);
int8_t* end_address = (int8_t*)last_tensor->DataRaw() + last_tensor->SizeInBytes();
size_t num_bytes = end_address - (int8_t*)input_data;
@@ -210,21 +210,13 @@ Status NcclReduceScatter::ComputeInternal(OpKernelContext* context) const {
return Status::OK();
}

static std::vector<std::pair<int, int>> AliasRange(int start, int end) {
std::vector<std::pair<int, int>> aliases;
for (int i = start; i < end; i++) {
aliases.push_back(std::pair<int, int>(i, i));
}
return aliases;
}

ONNX_OPERATOR_KERNEL_EX(
NcclAllReduce,
kMSDomain,
1,
kCudaExecutionProvider,
KernelDefBuilder()
.Alias(AliasRange(0, 1024))
.VariadicAlias(0, 0) // outputs and inputs are mapped one to one
.AllocateInputsContiguously()
.TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes()),
NcclAllReduce);
@@ -235,7 +227,7 @@
1,
kCudaExecutionProvider,
KernelDefBuilder()
.Alias(AliasRange(0, 1024))
.VariadicAlias(0, 0) // outputs and inputs are mapped one to one
.AllocateInputsContiguously()
.TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes()),
NcclAllGather);
@@ -246,7 +238,7 @@
1,
kCudaExecutionProvider,
KernelDefBuilder()
.Alias(AliasRange(0, 1024))
.VariadicAlias(0, 0) // outputs and inputs are mapped one to one
.AllocateInputsContiguously()
.TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes()),
NcclReduceScatter);
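Combined with AllocateInputsContiguously(), each collective runs over one byte span from the first input's start address to the end of the last input, which is why the padding bytes ride along. A standalone sketch of that span computation, with raw pointers standing in for ORT tensors:

#include <cstddef>
#include <cstdint>

// Total bytes from the start of the first buffer to the end of the last,
// including any alignment padding between contiguously allocated buffers.
std::size_t ContiguousSpanBytes(const void* first_data, const void* last_data,
                                std::size_t last_size_bytes) {
  const auto* begin = static_cast<const int8_t*>(first_data);
  const auto* end = static_cast<const int8_t*>(last_data) + last_size_bytes;
  return static_cast<std::size_t>(end - begin);
}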
@@ -2,7 +2,6 @@
// Licensed under the MIT License.

#include "orttraining/training_ops/cpu/controlflow/group.h"
#include "orttraining/training_ops/cpu/controlflow/common.h"
#include "core/providers/cuda/cuda_fwd.h"

namespace onnxruntime {
@@ -26,7 +25,7 @@
kCudaExecutionProvider,
KernelDefBuilder()
.TypeConstraint("T", DataTypeImpl::AllTensorTypes())
.Alias(onnxruntime::contrib::AliasRange<0, 0>(0, onnxruntime::contrib::passthrough_input_count_limit)), // outputs and inputs are mapped one to one
.VariadicAlias(0, 0), // outputs and inputs are mapped one to one
onnxruntime::contrib::PassThrough);

} // namespace cuda
@@ -3,8 +3,6 @@

#include "orttraining/training_ops/cuda/controlflow/record.h"
#include "core/providers/cpu/tensor/utils.h"
// Include RecordEvent's utility functions shared by CPU and GPU implementations.
#include "orttraining/training_ops/cpu/controlflow/common.h"
// Include event mechanism shared by CPU and GPU implementations.
#include "orttraining/training_ops/cpu/controlflow/event_pool.h"
#include "orttraining/training_ops/cpu/controlflow/record.h"
@@ -23,7 +21,7 @@
.InputMemoryType<OrtMemTypeCPUInput>(0) /* Keep EventIdentifier in CPU */
.TypeConstraint("TInt64", DataTypeImpl::GetTensorType<int64_t>())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.Alias(onnxruntime::contrib::AliasRange<1, 0>(0, 1024)),
.VariadicAlias(1, 0), // outputs and inputs are mapped one to one, with input offset by 1
RecordEvent);

Status RecordEvent::ComputeInternal(OpKernelContext* ctx) const {
@@ -3,8 +3,6 @@

#include "orttraining/training_ops/cuda/controlflow/wait.h"
#include "core/providers/cpu/tensor/utils.h"
// Include RecordEvent's utility functions shared by CPU and GPU implementations.
#include "orttraining/training_ops/cpu/controlflow/common.h"
// Include event mechanism shared by CPU and GPU implementations.
#include "orttraining/training_ops/cpu/controlflow/event_pool.h"
#include "orttraining/training_ops/cpu/controlflow/wait.h"
@@ -23,7 +21,7 @@ ONNX_OPERATOR_KERNEL_EX(
.InputMemoryType<OrtMemTypeCPUInput>(0) /* CPU variable */
.TypeConstraint("TInt64", DataTypeImpl::GetTensorType<int64_t>())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.Alias(onnxruntime::contrib::AliasRange<1, 0>(0, 1024)),
.VariadicAlias(1, 0), // outputs and inputs are mapped one to one, with input offset by 1
WaitEvent);

Status WaitEvent::ComputeInternal(OpKernelContext* ctx) const {
