diff --git a/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp b/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp
index e59208bff0..b6ec195840 100644
--- a/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp
+++ b/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp
@@ -56,19 +56,19 @@ ApiRegistry RegisterAllApis(MLIRContext *context) {
     ApiSpec(API::ZDNN_TRANSFORM_ZTENSOR, "zdnn_transform_ztensor", int32Ty, {opaquePtrTy}, true),
     ApiSpec(API::ZDNN_TRANSFORM_ORIGTENSOR, "zdnn_transform_origtensor", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
     // Elementwise operations
-    ApiSpec(API::ZDNN_ADD, "zdnn_add", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_SUB, "zdnn_sub", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_MUL, "zdnn_mul", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_DIV, "zdnn_div", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_MIN, "zdnn_min", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_MAX, "zdnn_max", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_LOG, "zdnn_log", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_EXP, "zdnn_exp", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_ADD, "zdnn_add_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_SUB, "zdnn_sub_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_MUL, "zdnn_mul_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_DIV, "zdnn_div_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_MIN, "zdnn_min_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_MAX, "zdnn_max_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_LOG, "zdnn_log_ext", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_EXP, "zdnn_exp_ext", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
     // Activation operations
-    ApiSpec(API::ZDNN_RELU, "zdnn_relu", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_TANH, "zdnn_tanh", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_SIGMOID, "zdnn_sigmoid", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_SOFTMAX, "zdnn_softmax", int32Ty, {opaquePtrTy, opaquePtrTy, int64Ty, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_RELU, "zdnn_relu_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_TANH, "zdnn_tanh_ext", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_SIGMOID, "zdnn_sigmoid_ext", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_SOFTMAX, "zdnn_softmax_ext", int32Ty, {opaquePtrTy, opaquePtrTy, int64Ty, opaquePtrTy}, false),
     // RNN operations
     ApiSpec(API::ZDNN_LSTM, "zdnn_lstm", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, int64Ty, opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
     ApiSpec(API::ZDNN_GRU, "zdnn_gru", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, int64Ty, opaquePtrTy, opaquePtrTy}, false),
diff --git a/src/Accelerators/NNPA/Runtime/CMakeLists.txt b/src/Accelerators/NNPA/Runtime/CMakeLists.txt
index 083e616ca7..e6b3e80caa 100644
--- a/src/Accelerators/NNPA/Runtime/CMakeLists.txt
+++ b/src/Accelerators/NNPA/Runtime/CMakeLists.txt
@@ -5,7 +5,9 @@ add_onnx_mlir_library(RuntimeNNPA STATIC
   OMRuntimeNNPA.c
   zDNNExtension/zDNNExtension.c
+  zDNNExtension/Elementwise.c
   zDNNExtension/MatMul.c
+  zDNNExtension/Softmax.c
 
   EXCLUDE_FROM_OM_LIBS
 
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/Elementwise.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/Elementwise.c
new file mode 100644
index 0000000000..cb65ae84d4
--- /dev/null
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/Elementwise.c
@@ -0,0 +1,342 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//===------------------------ Elementwise.c -------------------------------===//
+//
+// Copyright 2024 The IBM Research Authors.
+//
+// =============================================================================
+//
+// A wrapper of zdnn elementwise ops for ztensor partition and parallelism.
+//
+//===----------------------------------------------------------------------===//
+
+// Include pthreads (need special treatment on z/OS).
+#ifdef __MVS__
+#define _OPEN_THREADS
+#endif
+#include <pthread.h>
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "zDNNExtension.h"
+#include "zdnn.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum ElementwiseOp {
+  // Binary
+  ZDNN_ADD_EXT,
+  ZDNN_DIV_EXT,
+  ZDNN_MAX_EXT,
+  ZDNN_MIN_EXT,
+  ZDNN_MUL_EXT,
+  ZDNN_SUB_EXT,
+  // Unary
+  ZDNN_EXP_EXT,
+  ZDNN_LOG_EXT,
+  ZDNN_RELU_EXT,
+  ZDNN_TANH_EXT,
+  ZDNN_SIGMOID_EXT,
+} ElementwiseOp;
+
+static zdnn_status zdnn_unary_elementwise_common(const zdnn_ztensor *input,
+    const void *clippingValue, zdnn_ztensor *output, ElementwiseOp opType) {
+  // Verify that e4, e3, and e1 do not exceed the maximum dimension size, so
+  // that e2 can be split safely.
+  OrigShape origShapeOfX;
+  getOrigShape(input, &origShapeOfX);
+  uint32_t maxDimSize = zdnn_get_nnpa_max_dim_idx_size();
+  if ((origShapeOfX.e4 > maxDimSize) || (origShapeOfX.e3 > maxDimSize) ||
+      (origShapeOfX.e1 > maxDimSize)) {
+    printf("[UnaryElementwise] The input tensor dimension exceeds maximum "
+           "dimension index "
+           "size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
+        maxDimSize, origShapeOfX.e4, origShapeOfX.e3, origShapeOfX.e1);
+    return ZDNN_EXCEEDS_MDIS;
+  }
+
+  // We split e2 in (e4, e3, e2, e1).
+  SplitInfo splitInfoX = {
+      .origZTensor = input, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoY = {
+      .origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};
+
+  // Dim is small or ztensor split is disabled.
+  if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoX) ||
+      !initSplitInfo(&splitInfoY)) {
+    if (OMZTensorSplitDebug)
+      printf("[UnaryElementwise] Not split zTensor ...\n");
+    if (opType == ZDNN_EXP_EXT)
+      return zdnn_exp(input, output);
+    else if (opType == ZDNN_LOG_EXT)
+      return zdnn_log(input, output);
+    else if (opType == ZDNN_RELU_EXT)
+      return zdnn_relu(input, clippingValue, output);
+    else if (opType == ZDNN_SIGMOID_EXT)
+      return zdnn_sigmoid(input, output);
+    else if (opType == ZDNN_TANH_EXT)
+      return zdnn_tanh(input, output);
+    else
+      return ZDNN_UNAVAILABLE_FUNCTION;
+  }
+
+  // Split input.
+  if (OMZTensorSplitDebug)
+    printf("[UnaryElementwise] Split the input ztensor along e2 into %d chunks "
+           "of %d elements \n",
+        splitInfoX.numOfChunks, splitInfoX.chunkSize);
+
+  double splitTime = 0.;
+  double mmTime = 0.;
+  double mergeTime = 0.;
+  clock_t start_time, end_time;
+
+  // Split input into chunks.
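+  // Worked example of the chunk arithmetic done in initSplitInfo (the numbers
+  // are illustrative, not defaults): with e2 = 4096 and a chunk size of 1024,
+  // numOfChunks = CEIL(4096, 1024) = 4 chunks of dimSize 1024; with e2 = 4100
+  // a 5th chunk would carry the remainder, dimSize = 4. Chunk i then starts
+  // at offsetInStick = i * CEIL(chunkSize, AIU_STICKS_PER_PAGE) pages along
+  // the stickified e2 dimension.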
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  splitZTensor(&splitInfoX, /*copyData=*/true);
+  splitZTensor(&splitInfoY, /*copyData=*/false);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Call zdnn op on each chunk.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  for (uint32_t i = 0; i < splitInfoX.numOfChunks; ++i) {
+    zdnn_ztensor *zxTensor = (splitInfoX.chunks + i)->ztensor;
+    zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
+    zdnn_status status;
+    if (opType == ZDNN_EXP_EXT)
+      status = zdnn_exp(zxTensor, zyTensor);
+    else if (opType == ZDNN_LOG_EXT)
+      status = zdnn_log(zxTensor, zyTensor);
+    else if (opType == ZDNN_RELU_EXT)
+      status = zdnn_relu(zxTensor, clippingValue, zyTensor);
+    else if (opType == ZDNN_SIGMOID_EXT)
+      status = zdnn_sigmoid(zxTensor, zyTensor);
+    else if (opType == ZDNN_TANH_EXT)
+      status = zdnn_tanh(zxTensor, zyTensor);
+    else
+      status = ZDNN_UNAVAILABLE_FUNCTION;
+    assert(status == ZDNN_OK);
+  }
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mmTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Merging the chunks into the output.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  mergeZTensors(&splitInfoY);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  freeSplitInfoBuffer(&splitInfoX);
+  freeSplitInfoBuffer(&splitInfoY);
+
+  if (OMZTensorSplitDebug)
+    printf("[UnaryElementwise] split, %f, mm, %f, merge, %f (milliseconds)\n",
+        splitTime, mmTime, mergeTime);
+
+  return ZDNN_OK;
+}
+
+static zdnn_status zdnn_binary_elementwise_common(const zdnn_ztensor *inputA,
+    const zdnn_ztensor *inputB, zdnn_ztensor *output, ElementwiseOp opType) {
+  // Verify that e4, e3, and e1 do not exceed the maximum dimension size, so
+  // that e2 can be split safely.
+  OrigShape origShapeOfA, origShapeOfB;
+  getOrigShape(inputA, &origShapeOfA);
+  getOrigShape(inputB, &origShapeOfB);
+  uint32_t maxDimSize = zdnn_get_nnpa_max_dim_idx_size();
+  if ((origShapeOfA.e4 > maxDimSize) || (origShapeOfA.e3 > maxDimSize) ||
+      (origShapeOfA.e1 > maxDimSize)) {
+    printf("[BinaryElementwise] The 1st tensor dimension exceeds maximum "
+           "dimension index "
+           "size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
+        maxDimSize, origShapeOfA.e4, origShapeOfA.e3, origShapeOfA.e1);
+    return ZDNN_EXCEEDS_MDIS;
+  }
+  if ((origShapeOfB.e4 > maxDimSize) || (origShapeOfB.e3 > maxDimSize) ||
+      (origShapeOfB.e1 > maxDimSize)) {
+    printf("[BinaryElementwise] The 2nd tensor dimension exceeds maximum "
+           "dimension index "
+           "size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
+        maxDimSize, origShapeOfB.e4, origShapeOfB.e3, origShapeOfB.e1);
+    return ZDNN_EXCEEDS_MDIS;
+  }
+
+  // We split e2 in (e4, e3, e2, e1).
+  SplitInfo splitInfoA = {
+      .origZTensor = inputA, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoB = {
+      .origZTensor = inputB, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoY = {
+      .origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};
+
+  // Dim is small or ztensor split is disabled.
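+  // Note: a single non-splittable operand sends the whole op down the unsplit
+  // path below. A, B, and Y are split along the same axis with the same chunk
+  // size, so chunk i of A, B, and Y covers the same e2 range and can be
+  // computed independently of the other chunks.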
+  if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoA) ||
+      !initSplitInfo(&splitInfoB) || !initSplitInfo(&splitInfoY)) {
+    if (OMZTensorSplitDebug)
+      printf("[BinaryElementwise] Not split zTensor ...\n");
+    if (opType == ZDNN_ADD_EXT)
+      return zdnn_add(inputA, inputB, output);
+    else if (opType == ZDNN_SUB_EXT)
+      return zdnn_sub(inputA, inputB, output);
+    else if (opType == ZDNN_MUL_EXT)
+      return zdnn_mul(inputA, inputB, output);
+    else if (opType == ZDNN_DIV_EXT)
+      return zdnn_div(inputA, inputB, output);
+    else if (opType == ZDNN_MAX_EXT)
+      return zdnn_max(inputA, inputB, output);
+    else if (opType == ZDNN_MIN_EXT)
+      return zdnn_min(inputA, inputB, output);
+    else
+      return ZDNN_UNAVAILABLE_FUNCTION;
+  }
+
+  // Split input.
+  if (OMZTensorSplitDebug)
+    printf(
+        "[BinaryElementwise] Split the input ztensors along e2 into %d chunks "
+        "of %d elements \n",
+        splitInfoA.numOfChunks, splitInfoA.chunkSize);
+
+  double splitTime = 0.;
+  double mmTime = 0.;
+  double mergeTime = 0.;
+  clock_t start_time, end_time;
+
+  // Split input into chunks.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  splitZTensor(&splitInfoA, /*copyData=*/true);
+  splitZTensor(&splitInfoB, /*copyData=*/true);
+  splitZTensor(&splitInfoY, /*copyData=*/false);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Call zdnn op on each chunk.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  for (uint32_t i = 0; i < splitInfoA.numOfChunks; ++i) {
+    zdnn_ztensor *zaTensor = (splitInfoA.chunks + i)->ztensor;
+    zdnn_ztensor *zbTensor = (splitInfoB.chunks + i)->ztensor;
+    zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
+    zdnn_status status;
+    if (opType == ZDNN_ADD_EXT)
+      status = zdnn_add(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_SUB_EXT)
+      status = zdnn_sub(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_MUL_EXT)
+      status = zdnn_mul(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_DIV_EXT)
+      status = zdnn_div(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_MAX_EXT)
+      status = zdnn_max(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_MIN_EXT)
+      status = zdnn_min(zaTensor, zbTensor, zyTensor);
+    else
+      status = ZDNN_UNAVAILABLE_FUNCTION;
+    assert(status == ZDNN_OK);
+  }
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mmTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Merging the chunks into the output.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  mergeZTensors(&splitInfoY);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  freeSplitInfoBuffer(&splitInfoA);
+  freeSplitInfoBuffer(&splitInfoB);
+  freeSplitInfoBuffer(&splitInfoY);
+
+  if (OMZTensorSplitDebug)
+    printf("[BinaryElementwise] split, %f, mm, %f, merge, %f (milliseconds)\n",
+        splitTime, mmTime, mergeTime);
+
+  return ZDNN_OK;
+}
+
+// -----------------------------------------------------------------------------
+// Extension Functions
+// Same name as zdnn functions but with the `_ext` postfix.
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_add_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_ADD_EXT);
+}
+
+zdnn_status zdnn_sub_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_SUB_EXT);
+}
+
+zdnn_status zdnn_mul_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_MUL_EXT);
+}
+
+zdnn_status zdnn_div_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_DIV_EXT);
+}
+
+zdnn_status zdnn_min_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_MIN_EXT);
+}
+
+zdnn_status zdnn_max_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_MAX_EXT);
+}
+
+zdnn_status zdnn_exp_ext(const zdnn_ztensor *input, zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(input, NULL, output, ZDNN_EXP_EXT);
+}
+
+zdnn_status zdnn_log_ext(const zdnn_ztensor *input, zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(input, NULL, output, ZDNN_LOG_EXT);
+}
+
+zdnn_status zdnn_relu_ext(const zdnn_ztensor *input, const void *clippingValue,
+    zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(
+      input, clippingValue, output, ZDNN_RELU_EXT);
+}
+
+zdnn_status zdnn_sigmoid_ext(const zdnn_ztensor *input, zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(input, NULL, output, ZDNN_SIGMOID_EXT);
+}
+
+zdnn_status zdnn_tanh_ext(const zdnn_ztensor *input, zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(input, NULL, output, ZDNN_TANH_EXT);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c
index 9131479f08..3bfd8374f7 100644
--- a/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c
@@ -59,14 +59,14 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
 
   // For a MatMul of (M,N)*(N,P),
   // We split M that is e2 in (e4, e3, e2, e1).
-  SplitInfo splitInfoA, splitInfoY;
-  splitInfoA.axis = 2;
-  splitInfoY.axis = 2;
-  splitInfoA.chunkSize = OMZTensorSplitSize;
-  splitInfoY.chunkSize = OMZTensorSplitSize;
+  SplitInfo splitInfoA = {
+      .origZTensor = inputA, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoY = {
+      .origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};
 
   // Dim is small or ztensor split is disabled.
-  if (!OMZTensorSplitEnabled || !initSplitInfo(inputA, &splitInfoA)) {
+  if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoA) ||
+      !initSplitInfo(&splitInfoY)) {
     if (OMZTensorSplitDebug)
       printf("[MatMul] Not split zTensor ...\n");
     return call_zdnn_matmul_op(inputA, inputB, inputC, opType, output, isBcast);
@@ -77,7 +77,6 @@
     printf("[MatMul] Split the 1st ztensor along e2 into %d chunks of %d "
            "elements \n",
         splitInfoA.numOfChunks, splitInfoA.chunkSize);
-  initSplitInfo(output, &splitInfoY);
 
   double splitTime = 0.;
   double mmTime = 0.;
@@ -87,8 +86,8 @@
   // Split input A into chunks.
   if (OMZTensorSplitDebug)
     start_time = clock();
-  splitZTensor(inputA, &splitInfoA, /*copyData=*/true);
-  splitZTensor(output, &splitInfoY, /*copyData=*/false);
+  splitZTensor(&splitInfoA, /*copyData=*/true);
+  splitZTensor(&splitInfoY, /*copyData=*/false);
   if (OMZTensorSplitDebug) {
     end_time = clock();
     splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
@@ -98,8 +97,10 @@
   if (OMZTensorSplitDebug)
     start_time = clock();
   for (uint32_t i = 0; i < splitInfoA.numOfChunks; ++i) {
-    zdnn_status status = call_zdnn_matmul_op(splitInfoA.tensors + i, inputB,
-        inputC, opType, splitInfoY.tensors + i, isBcast);
+    zdnn_ztensor *zaTensor = (splitInfoA.chunks + i)->ztensor;
+    zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
+    zdnn_status status = call_zdnn_matmul_op(
+        zaTensor, inputB, inputC, opType, zyTensor, isBcast);
     assert(status == ZDNN_OK);
   }
   if (OMZTensorSplitDebug) {
@@ -110,7 +111,7 @@
   // Merging the chunks into the output.
   if (OMZTensorSplitDebug)
     start_time = clock();
-  mergeZTensors(&splitInfoY, output);
+  mergeZTensors(&splitInfoY);
   if (OMZTensorSplitDebug) {
     end_time = clock();
     mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
@@ -126,6 +127,11 @@
   return ZDNN_OK;
 }
 
+// -----------------------------------------------------------------------------
+// Extension Functions
+// Same name as zdnn functions but with the `_ext` postfix.
+// -----------------------------------------------------------------------------
+
 zdnn_status zdnn_matmul_op_ext(const zdnn_ztensor *inputA,
     const zdnn_ztensor *inputB, const zdnn_ztensor *inputC, int opType,
     zdnn_ztensor *output) {
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/Softmax.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/Softmax.c
new file mode 100644
index 0000000000..53147f50c2
--- /dev/null
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/Softmax.c
@@ -0,0 +1,126 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//===-------------------------- Softmax.c ---------------------------------===//
+//
+// Copyright 2024 The IBM Research Authors.
+//
+// =============================================================================
+//
+// A wrapper of zdnn_softmax for ztensor partition and parallelism.
+//
+//===----------------------------------------------------------------------===//
+
+// Include pthreads (need special treatment on z/OS).
+#ifdef __MVS__
+#define _OPEN_THREADS
+#endif
+#include <pthread.h>
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "zDNNExtension.h"
+#include "zdnn.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// -----------------------------------------------------------------------------
+// Extension Functions
+// Same name as zdnn functions but with the `_ext` postfix.
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_softmax_ext(const zdnn_ztensor *input, void *save_area,
+    zdnn_softmax_act act_func, zdnn_ztensor *output) {
+  // Verify that e4, e3, and e1 do not exceed the maximum dimension size, so
+  // that e2 can be split safely.
+  OrigShape origShapeOfX;
+  getOrigShape(input, &origShapeOfX);
+  uint32_t maxDimSize = zdnn_get_nnpa_max_dim_idx_size();
+  if ((origShapeOfX.e4 > maxDimSize) || (origShapeOfX.e3 > maxDimSize) ||
+      (origShapeOfX.e1 > maxDimSize)) {
+    printf(
+        "[Softmax] The input tensor dimension exceeds maximum dimension index "
+        "size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
+        maxDimSize, origShapeOfX.e4, origShapeOfX.e3, origShapeOfX.e1);
+    return ZDNN_EXCEEDS_MDIS;
+  }
+
+  // We split e2 in (e4, e3, e2, e1).
+  SplitInfo splitInfoX = {
+      .origZTensor = input, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoY = {
+      .origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};
+
+  // Dim is small or ztensor split is disabled.
+  if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoX) ||
+      !initSplitInfo(&splitInfoY)) {
+    if (OMZTensorSplitDebug)
+      printf("[Softmax] Not split zTensor ...\n");
+    return zdnn_softmax(input, save_area, act_func, output);
+  }
+
+  // Split input.
+  if (OMZTensorSplitDebug)
+    printf("[Softmax] Split the input ztensor along e2 into %d chunks of %d "
+           "elements \n",
+        splitInfoX.numOfChunks, splitInfoX.chunkSize);
+
+  double splitTime = 0.;
+  double mmTime = 0.;
+  double mergeTime = 0.;
+  clock_t start_time, end_time;
+
+  // Split input into chunks.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  splitZTensor(&splitInfoX, /*copyData=*/true);
+  splitZTensor(&splitInfoY, /*copyData=*/false);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Call zdnn_softmax on each chunk. Do not use save_area.
+  // TODO: could we reuse save_area, in particular in the parallel scenario?
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  for (uint32_t i = 0; i < splitInfoX.numOfChunks; ++i) {
+    zdnn_ztensor *zxTensor = (splitInfoX.chunks + i)->ztensor;
+    zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
+    zdnn_status status = zdnn_softmax(zxTensor, NULL, act_func, zyTensor);
+    assert(status == ZDNN_OK);
+  }
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mmTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Merging the chunks into the output.
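+  // The merge mirrors the split: each chunk's stickified buffer is copied
+  // back into the original output buffer at its page offset (offsetInStick),
+  // so no extra buffer is allocated for the merged result.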
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  mergeZTensors(&splitInfoY);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  freeSplitInfoBuffer(&splitInfoX);
+  freeSplitInfoBuffer(&splitInfoY);
+
+  if (OMZTensorSplitDebug)
+    printf("[Softmax] split, %f, mm, %f, merge, %f (milliseconds)\n", splitTime,
+        mmTime, mergeTime);
+
+  return ZDNN_OK;
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c
index 8a945a796d..4093a885c9 100644
--- a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c
@@ -94,31 +94,47 @@ static void getZTensorShape(const zdnn_ztensor *t, zTensorShape *shape) {
   assert(sizeFromDim == sizeFromBuffer && "buffer size mismatched");
 }
 
-static zdnn_status allocZTensorChunk(const zdnn_ztensor *input, uint32_t axis,
-    uint32_t chunkSize, zdnn_ztensor *output) {
+static zdnn_status allocZTensorChunk(
+    const SplitInfo *splitInfo, uint32_t chunkID) {
+  const zdnn_ztensor *origZTensor = splitInfo->origZTensor;
+
+  uint32_t axis = splitInfo->axis;
+  ChunkInfo *chunk = splitInfo->chunks + chunkID;
+  uint32_t chunkSize = chunk->dimSize;
+
+  // Allocate one ztensor struct.
+  chunk->ztensor = malloc(sizeof(zdnn_ztensor));
+  if (!chunk->ztensor)
+    return ZDNN_ALLOCATION_FAILURE;
+  zdnn_ztensor *chunkZTensor = chunk->ztensor;
+
+  // Allocate one buffer for two descriptors.
   zdnn_tensor_desc *descriptors = malloc(2 * sizeof(zdnn_tensor_desc));
   if (!descriptors)
     return ZDNN_ALLOCATION_FAILURE;
   zdnn_tensor_desc *preTransDesc = descriptors;
   zdnn_tensor_desc *transDesc = descriptors + 1;
-  // Copy pre_transform_desc from the input.
-  preTransDesc->layout = input->pre_transformed_desc->layout;
-  preTransDesc->format = input->pre_transformed_desc->format;
-  preTransDesc->type = input->pre_transformed_desc->type;
+
+  // Copy pre_transform_desc from the origZTensor.
+  preTransDesc->layout = origZTensor->pre_transformed_desc->layout;
+  preTransDesc->format = origZTensor->pre_transformed_desc->format;
+  preTransDesc->type = origZTensor->pre_transformed_desc->type;
   preTransDesc->dim4 =
-      (axis == 0) ? chunkSize : input->pre_transformed_desc->dim4;
+      (axis == 0) ? chunkSize : origZTensor->pre_transformed_desc->dim4;
   preTransDesc->dim3 =
-      (axis == 1) ? chunkSize : input->pre_transformed_desc->dim3;
+      (axis == 1) ? chunkSize : origZTensor->pre_transformed_desc->dim3;
   preTransDesc->dim2 =
-      (axis == 2) ? chunkSize : input->pre_transformed_desc->dim2;
+      (axis == 2) ? chunkSize : origZTensor->pre_transformed_desc->dim2;
   preTransDesc->dim1 =
-      (axis == 3) ? chunkSize : input->pre_transformed_desc->dim1;
+      (axis == 3) ? chunkSize : origZTensor->pre_transformed_desc->dim1;
+
+  // Generate the transformed desc from the pre-transformed desc.
   zdnn_status status = zdnn_generate_transformed_desc(preTransDesc, transDesc);
   if (status != ZDNN_OK)
     return status;
+
+  // Init a zTensor with malloc.
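+  // The chunk reuses the original layout, format, and type; only the
+  // dimension along the split axis shrinks to this chunk's dimSize, so
+  // zdnn_init_ztensor_with_malloc below sizes the chunk's stickified buffer
+  // from the transformed descriptor.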
-  return zdnn_init_ztensor_with_malloc(preTransDesc, transDesc, output);
+  return zdnn_init_ztensor_with_malloc(preTransDesc, transDesc, chunkZTensor);
 }
 
 static void freeZTensorChunk(zdnn_ztensor *t) {
@@ -129,27 +145,42 @@ static void freeZTensorChunk(zdnn_ztensor *t) {
   free(t->pre_transformed_desc);
 }
 
-static void copyZTensorChunk(zdnn_ztensor *output, const zdnn_ztensor *input,
-    uint32_t axis, uint32_t offset, bool fromChunk) {
+static void copyZTensorChunk(
+    const SplitInfo *splitInfo, uint32_t chunkID, bool fromChunk) {
   // Only support the second innermost axis in the CPU tensor at this moment.
   // axis = 2 in the CPU tensor corresponds to dim3 in zTensor.
-  if (axis != 2) {
+  if (splitInfo->axis != 2) {
     printf("Only support the second innermost dimension at this moment.");
     return;
   }
-  zTensorShape inShape, outShape;
-  getZTensorShape(input, &inShape);
-  getZTensorShape(output, &outShape);
-  zTensorShape origShape = fromChunk ? outShape : inShape;
-  zTensorShape chunkShape = fromChunk ? inShape : outShape;
+  ChunkInfo *chunk = splitInfo->chunks + chunkID;
+  uint32_t offset = chunk->offsetInStick;
+
+  // Buffer pointers.
+  void *src, *dst;
+  if (fromChunk) {
+    src = chunk->ztensor->buffer;
+    dst = splitInfo->origZTensor->buffer;
+  } else {
+    src = splitInfo->origZTensor->buffer;
+    dst = chunk->ztensor->buffer;
+  }
+  assert(src && "Source buffer is NULL");
+  assert(dst && "Destination buffer is NULL");
+
+  // Shape information.
+  zTensorShape origShape;
+  getZTensorShape(splitInfo->origZTensor, &origShape);
+  zTensorShape chunkShape;
+  getZTensorShape(chunk->ztensor, &chunkShape);
   assert(origShape.dim6 == chunkShape.dim6);
   assert(origShape.dim5 == chunkShape.dim5);
   assert(origShape.dim4 == chunkShape.dim4);
   assert(origShape.dim2 == chunkShape.dim2);
   assert(origShape.dim1 == chunkShape.dim1);
   // Ensure that each element is 2 bytes.
-  assert(input->transformed_desc->type == ZDNN_DLFLOAT16);
+  assert(splitInfo->origZTensor->transformed_desc->type == ZDNN_DLFLOAT16);
 
   uint64_t D6 = chunkShape.dim6;
   uint64_t D5 = chunkShape.dim5;
@@ -170,9 +201,8 @@ static void copyZTensorChunk(zdnn_ztensor *output, const zdnn_ztensor *input,
         uint64_t TD3Offset = td3 + TD3 * TD4Offset;
         // Copy one page at a time.
         uint64_t offsetSrc = AIU_PAGESIZE_IN_BYTES * SD3Offset;
-        uint64_t offsetDest = AIU_PAGESIZE_IN_BYTES * TD3Offset;
-        memcpy(output->buffer + offsetDest, input->buffer + offsetSrc,
-            AIU_PAGESIZE_IN_BYTES);
+        uint64_t offsetDst = AIU_PAGESIZE_IN_BYTES * TD3Offset;
+        memcpy(dst + offsetDst, src + offsetSrc, AIU_PAGESIZE_IN_BYTES);
       }
     }
   }
@@ -180,25 +210,42 @@ static void copyZTensorChunk(zdnn_ztensor *output, const zdnn_ztensor *input,
   return;
 }
 
-static void copyZTensorChunkScalar(zdnn_ztensor *output,
-    const zdnn_ztensor *input, uint32_t axis, uint32_t offset, bool fromChunk) {
+static void copyZTensorChunkScalar(
+    const SplitInfo *splitInfo, uint32_t chunkID, bool fromChunk) {
   // Only support the second innermost axis in the CPU tensor at this moment.
   // axis = 2 in the CPU tensor corresponds to dim3 in zTensor.
-  if (axis != 2) {
+  if (splitInfo->axis != 2) {
     printf("Only support the second innermost dimension at this moment.");
     return;
   }
-  zTensorShape inShape, outShape;
-  getZTensorShape(input, &inShape);
-  getZTensorShape(output, &outShape);
-  zTensorShape origShape = fromChunk ? outShape : inShape;
-  zTensorShape chunkShape = fromChunk ? inShape : outShape;
+  ChunkInfo *chunk = splitInfo->chunks + chunkID;
+  uint32_t offset = chunk->offsetInStick;
+
+  // Buffer pointers.
+  uint16_t *src, *dst;
+  if (fromChunk) {
+    src = (uint16_t *)chunk->ztensor->buffer;
+    dst = (uint16_t *)splitInfo->origZTensor->buffer;
+  } else {
+    src = (uint16_t *)splitInfo->origZTensor->buffer;
+    dst = (uint16_t *)chunk->ztensor->buffer;
+  }
+  assert(src && "Source buffer is NULL");
+  assert(dst && "Destination buffer is NULL");
+
+  // Shape information.
+  zTensorShape origShape;
+  getZTensorShape(splitInfo->origZTensor, &origShape);
+  zTensorShape chunkShape;
+  getZTensorShape(chunk->ztensor, &chunkShape);
   assert(origShape.dim6 == chunkShape.dim6);
   assert(origShape.dim5 == chunkShape.dim5);
   assert(origShape.dim4 == chunkShape.dim4);
   assert(origShape.dim2 == chunkShape.dim2);
   assert(origShape.dim1 == chunkShape.dim1);
+  // Ensure that each element is 2 bytes.
+  assert(splitInfo->origZTensor->transformed_desc->type == ZDNN_DLFLOAT16);
 
   uint64_t D6 = chunkShape.dim6;
   uint64_t D5 = chunkShape.dim5;
@@ -220,13 +267,9 @@ static void copyZTensorChunkScalar(zdnn_ztensor *output,
         uint64_t TD3Offset = td3 + TD3 * (d4 + D4 * (d5 + D5 * d6));
         for (uint64_t d2 = 0; d2 < D2; ++d2) {
           for (uint64_t d1 = 0; d1 < D1; ++d1) {
-            // Copy 2 bytes at a time.
-            uint64_t offsetSrc =
-                AIU_2BYTE_CELL_SIZE * (d1 + D1 * (d2 + D2 * SD3Offset));
-            uint64_t offsetDest =
-                AIU_2BYTE_CELL_SIZE * (d1 + D1 * (d2 + D2 * TD3Offset));
-            memcpy(output->buffer + offsetDest, input->buffer + offsetSrc,
-                AIU_2BYTE_CELL_SIZE);
+            uint64_t offsetSrc = d1 + D1 * (d2 + D2 * SD3Offset);
+            uint64_t offsetDst = d1 + D1 * (d2 + D2 * TD3Offset);
+            *(dst + offsetDst) = *(src + offsetSrc);
           }
         }
       }
@@ -236,66 +279,77 @@ static void copyZTensorChunkScalar(zdnn_ztensor *output,
   return;
 }
 
-bool initSplitInfo(const zdnn_ztensor *input, SplitInfo *splitInfo) {
+bool initSplitInfo(SplitInfo *splitInfo) {
   // Only support the second innermost dimension at this moment.
   if (splitInfo->axis != 2)
     return false;
-  splitInfo->totalSize = input->transformed_desc->dim2;
-  splitInfo->chunkSizeInStick = CEIL(splitInfo->chunkSize, AIU_STICKS_PER_PAGE);
+
+  // Init general split information.
+  const zdnn_ztensor *origZTensor = splitInfo->origZTensor;
+  splitInfo->totalSize = origZTensor->transformed_desc->dim2;
   splitInfo->numOfChunks = CEIL(splitInfo->totalSize, splitInfo->chunkSize);
+
+  // No split benefit.
   if (splitInfo->numOfChunks == 1)
     return false;
+
+  // Stickification: (e4, e3, e2, e1) -> (e4, e1/64, e3, e2/32, 32, 64)
+  uint32_t chunkSizeInStick;
+  if (splitInfo->axis == 0) // e4
+    chunkSizeInStick = splitInfo->chunkSize;
+  else if (splitInfo->axis == 1) // e3
+    chunkSizeInStick = splitInfo->chunkSize;
+  else if (splitInfo->axis == 2) // e2
+    chunkSizeInStick = CEIL(splitInfo->chunkSize, AIU_STICKS_PER_PAGE);
+  else if (splitInfo->axis == 3) // e1
+    chunkSizeInStick = CEIL(splitInfo->chunkSize, AIU_2BYTE_CELLS_PER_STICK);
+  else
+    return false;
+
+  // Init chunk information.
   splitInfo->chunks = malloc(splitInfo->numOfChunks * sizeof(ChunkInfo));
+  assert(splitInfo->chunks && "Failed to allocate ChunkInfo struct");
   for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i) {
     ChunkInfo *chunkInfo = splitInfo->chunks + i;
     if (i == splitInfo->numOfChunks - 1)
-      chunkInfo->size = splitInfo->totalSize - i * splitInfo->chunkSize;
+      chunkInfo->dimSize = splitInfo->totalSize - i * splitInfo->chunkSize;
     else
-      chunkInfo->size = splitInfo->chunkSize;
+      chunkInfo->dimSize = splitInfo->chunkSize;
+    chunkInfo->offsetInStick = i * chunkSizeInStick;
   }
   return true;
 }
 
 void freeSplitInfoBuffer(SplitInfo *splitInfo) {
+  // Free the sub tensors.
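+  // Free each chunk's ztensor before the chunks array itself: the ChunkInfo
+  // entries hold the only pointers to these ztensor structs.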
+  for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i) {
+    zdnn_ztensor *t = (splitInfo->chunks + i)->ztensor;
+    // Free the ztensor buffer and descriptors.
+    freeZTensorChunk(t);
+    // Free ztensor struct.
+    free(t);
+  }
   // Free chunk info.
   if (splitInfo->chunks)
     free(splitInfo->chunks);
-  // Free the sub tensors.
-  for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i)
-    freeZTensorChunk(splitInfo->tensors + i);
-  if (splitInfo->tensors)
-    free(splitInfo->tensors);
 }
 
-void splitZTensor(
-    const zdnn_ztensor *input, SplitInfo *splitInfo, bool copyData) {
-  splitInfo->tensors =
-      malloc(splitInfo->numOfChunks * sizeof(struct zdnn_ztensor));
-  assert(splitInfo->tensors && "Failed to allocate a buffer");
-  uint32_t axis = splitInfo->axis;
+void splitZTensor(const SplitInfo *splitInfo, bool copyData) {
   for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i) {
-    zdnn_ztensor *chunk = splitInfo->tensors + i;
-    ChunkInfo *chunkInfo = splitInfo->chunks + i;
-    // Allocate ztensor struct for the chunk.
-    zdnn_status status =
-        allocZTensorChunk(input, /*axis=*/axis, chunkInfo->size, chunk);
+    // Allocate a chunk ztensor.
+    zdnn_status status = allocZTensorChunk(splitInfo, i);
     assert(status == ZDNN_OK && "Failed to allocate zTensor chunk");
     if (copyData) {
-      // Copy data from the input to the chunk.
-      uint32_t offset = i * splitInfo->chunkSizeInStick;
-      copyZTensorChunk(chunk, input, axis, offset, /*fromChunk=*/false);
+      // Copy data from the original ztensor to the chunk ztensor.
+      copyZTensorChunk(splitInfo, i, /*fromChunk=*/false);
     }
   }
 }
 
-void mergeZTensors(const SplitInfo *splitInfo, zdnn_ztensor *output) {
+void mergeZTensors(const SplitInfo *splitInfo) {
   for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i) {
-    uint32_t offset = i * splitInfo->chunkSizeInStick;
-    copyZTensorChunk(output, splitInfo->tensors + i, splitInfo->axis, offset,
-        /*fromChunk=*/true);
+    // Copy data from the chunk ztensor back to the original ztensor.
+    copyZTensorChunk(splitInfo, i, /*fromChunk=*/true);
   }
 }
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h
index affc1f4837..223a7f1a94 100644
--- a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h
@@ -66,25 +66,27 @@
 } zTensorShape;
 
 typedef struct ChunkInfo {
-  uint32_t size;
+  // Dim size for this chunk along the original axis.
+  uint32_t dimSize;
+  // Offset of the split point of this chunk in the stickified axis.
+  uint32_t offsetInStick;
+  // ztensor of this chunk.
+  zdnn_ztensor *ztensor;
 } ChunkInfo;
 
 typedef struct SplitInfo {
-  // Axis to split the tensor. Used to refer to an axis in (e4, e3, e2, e1)
+  // Original ztensor.
+  const zdnn_ztensor *origZTensor;
+  // Axis to split the tensor. Used to refer to an axis in (e4, e3, e2, e1).
   uint32_t axis;
-  // Size of the dimension at axis
+  // Size of the dimension at axis.
   uint32_t totalSize;
-  // Size of each chunk. The last chunk may be smaller
+  // Size of each chunk. The last chunk may be smaller.
   uint32_t chunkSize;
-  // Size of each chunk in the stickifified tensor. The last chunk may be
-  // smaller
-  uint32_t chunkSizeInStick;
-  // The number of chunks
+  // The number of chunks.
   uint32_t numOfChunks;
-  // Information for each chunk
+  // Information for each chunk.
   ChunkInfo *chunks;
-  // Sub zTensors
-  zdnn_ztensor *tensors;
 } SplitInfo;
 
 // -----------------------------------------------------------------------------
@@ -111,11 +113,10 @@ void getOrigShape(const zdnn_ztensor *t, OrigShape *shape);
 /**
  * \brief Initialize a SplitInfo struct.
  *
- * @param input input ztensor to split
  * @param splitInfo information for splitting
 * @return true if the ztensor is splitable. Otherwise, false
 */
-bool initSplitInfo(const zdnn_ztensor *input, SplitInfo *splitInfo);
+bool initSplitInfo(SplitInfo *splitInfo);
 
 /**
  * \brief Free buffers related to a SplitInfo struct.
@@ -129,19 +130,17 @@ void freeSplitInfoBuffer(SplitInfo *splitInfo);
 /**
  * \brief Split a ztensor into multiple chunks.
  *
- * @param input a ztensor to split
- * @param splitInfo information of all chunks
+ * @param splitInfo information for splitting
 * @param copyData whether or not copy data from ztensor to each chunk
 */
-void splitZTensor(
-    const zdnn_ztensor *input, SplitInfo *splitInfo, bool copyData);
+void splitZTensor(const SplitInfo *splitInfo, bool copyData);
 
 /**
  * \brief Merge chunks into a ztensor.
  *
- * @param splitInfo information of all chunks
- * @param output a ztensor obtained by merging the chunks
+ * @param splitInfo information for splitting; its origZTensor receives the
+ * merged chunks
 */
-void mergeZTensors(const SplitInfo *splitInfo, zdnn_ztensor *output);
+void mergeZTensors(const SplitInfo *splitInfo);
 
 // -----------------------------------------------------------------------------
 // Extension Functions
@@ -156,6 +155,28 @@ zdnn_status zdnn_matmul_bcast_op_ext(const zdnn_ztensor *inputA,
     const zdnn_ztensor *inputB, const zdnn_ztensor *inputC, int opType,
     zdnn_ztensor *output);
 
+// Elementwise Operations
+zdnn_status zdnn_add_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_sub_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_mul_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_div_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_min_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_max_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_exp_ext(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_log_ext(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_relu_ext(
+    const zdnn_ztensor *input, const void *clippingValue, zdnn_ztensor *output);
+zdnn_status zdnn_sigmoid_ext(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_softmax_ext(const zdnn_ztensor *input, void *save_area,
+    zdnn_softmax_act act_func, zdnn_ztensor *output);
+zdnn_status zdnn_tanh_ext(const zdnn_ztensor *input, zdnn_ztensor *output);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm-typed-pointer.mlir b/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm-typed-pointer.mlir
index 3f7a33fcf9..fb357ea4a4 100644
--- a/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm-typed-pointer.mlir
+++ b/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm-typed-pointer.mlir
@@ -132,7 +132,7 @@ func.func @test_call_zdnn_relu() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_relu
-// CHECK: {{.*}} = llvm.call @zdnn_relu({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_relu_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -146,7 +146,7 @@ func.func @test_call_zdnn_tanh() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_tanh
-// CHECK: {{.*}} = llvm.call @zdnn_tanh({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_tanh_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -160,7 +160,7 @@ func.func @test_call_zdnn_sigmoid() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_sigmoid
-// CHECK: {{.*}} = llvm.call @zdnn_sigmoid({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_sigmoid_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -175,7 +175,7 @@ func.func @test_call_zdnn_add() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_add
-// CHECK: {{.*}} = llvm.call @zdnn_add({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_add_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -190,7 +190,7 @@ func.func @test_call_zdnn_sub() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_sub
-// CHECK: {{.*}} = llvm.call @zdnn_sub({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_sub_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -205,7 +205,7 @@ func.func @test_call_zdnn_mul() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_mul
-// CHECK: {{.*}} = llvm.call @zdnn_mul({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_mul_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -220,7 +220,7 @@ func.func @test_call_zdnn_div() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_div
-// CHECK: {{.*}} = llvm.call @zdnn_div({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_div_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -235,7 +235,7 @@ func.func @test_call_zdnn_softmax() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_softmax
-// CHECK: {{.*}} = llvm.call @zdnn_softmax({{.*}}, {{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_softmax_ext({{.*}}, {{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> i32
 }
 
 // -----
@@ -283,7 +283,7 @@ func.func @test_call_zdnn_min() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_min
-// CHECK: {{.*}} = llvm.call @zdnn_min({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_min_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -298,7 +298,7 @@ func.func @test_call_zdnn_max() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_max
-// CHECK: {{.*}} = llvm.call @zdnn_max({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_max_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -312,7 +312,7 @@ func.func @test_call_zdnn_exp() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_exp
-// CHECK: {{.*}} = llvm.call @zdnn_exp({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_exp_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -326,7 +326,7 @@ func.func @test_call_zdnn_log() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_log
-// CHECK: {{.*}} = llvm.call @zdnn_log({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_log_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
diff --git a/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm.mlir b/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm.mlir
index 3c907d2e23..9f928ffc31 100644
--- a/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm.mlir
+++ b/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm.mlir
@@ -131,7 +131,7 @@ func.func @test_call_zdnn_relu() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_relu
-// CHECK: {{.*}} = llvm.call @zdnn_relu({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_relu_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -145,7 +145,7 @@ func.func @test_call_zdnn_tanh() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_tanh
-// CHECK: {{.*}} = llvm.call @zdnn_tanh({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_tanh_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -159,7 +159,7 @@ func.func @test_call_zdnn_sigmoid() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_sigmoid
-// CHECK: {{.*}} = llvm.call @zdnn_sigmoid({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_sigmoid_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -174,7 +174,7 @@ func.func @test_call_zdnn_add() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_add
-// CHECK: {{.*}} = llvm.call @zdnn_add({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_add_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -189,7 +189,7 @@ func.func @test_call_zdnn_sub() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_sub
-// CHECK: {{.*}} = llvm.call @zdnn_sub({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_sub_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -204,7 +204,7 @@ func.func @test_call_zdnn_mul() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_mul
-// CHECK: {{.*}} = llvm.call @zdnn_mul({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_mul_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -219,7 +219,7 @@ func.func @test_call_zdnn_div() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_div
-// CHECK: {{.*}} = llvm.call @zdnn_div({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_div_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -234,7 +234,7 @@ func.func @test_call_zdnn_softmax() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_softmax
-// CHECK: {{.*}} = llvm.call @zdnn_softmax({{.*}}, {{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_softmax_ext({{.*}}, {{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> i32
 }
 
 // -----
@@ -282,7 +282,7 @@ func.func @test_call_zdnn_min() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_min
-// CHECK: {{.*}} = llvm.call @zdnn_min({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_min_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -297,7 +297,7 @@ func.func @test_call_zdnn_max() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_max
-// CHECK: {{.*}} = llvm.call @zdnn_max({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_max_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -311,7 +311,7 @@ func.func @test_call_zdnn_exp() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_exp
-// CHECK: {{.*}} = llvm.call @zdnn_exp({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_exp_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -325,7 +325,7 @@ func.func @test_call_zdnn_log() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_log
-// CHECK: {{.*}} = llvm.call @zdnn_log({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_log_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
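For context, a minimal calling sketch follows. It is not part of the diff: the setup calls follow the public zdnn API, the 2D/FP32 shape is an arbitrary choice for illustration, and error handling is collapsed into asserts. Since each `_ext` wrapper keeps the signature of its zdnn counterpart, only the call site changes:

#include <assert.h>
#include <stdint.h>

#include "zDNNExtension.h"
#include "zdnn.h"

// Element-wise add of two dim2 x dim1 FP32 matrices through the splitting
// wrapper. zdnn_add_ext splits along e2 when OMZTensorSplitEnabled is set
// and otherwise falls through to plain zdnn_add.
static void add_example(
    const float *a, const float *b, float *out, uint32_t dim2, uint32_t dim1) {
  zdnn_tensor_desc preDesc, transDesc;
  zdnn_ztensor za, zb, zy;
  zdnn_init_pre_transformed_desc(ZDNN_2D, FP32, &preDesc, dim2, dim1);
  assert(zdnn_generate_transformed_desc(&preDesc, &transDesc) == ZDNN_OK);
  assert(zdnn_init_ztensor_with_malloc(&preDesc, &transDesc, &za) == ZDNN_OK);
  assert(zdnn_init_ztensor_with_malloc(&preDesc, &transDesc, &zb) == ZDNN_OK);
  assert(zdnn_init_ztensor_with_malloc(&preDesc, &transDesc, &zy) == ZDNN_OK);
  // Stickify the inputs, run the wrapped op, and unstickify the result.
  assert(zdnn_transform_ztensor(&za, (void *)a) == ZDNN_OK);
  assert(zdnn_transform_ztensor(&zb, (void *)b) == ZDNN_OK);
  assert(zdnn_add_ext(&za, &zb, &zy) == ZDNN_OK);
  assert(zdnn_transform_origtensor(&zy, out) == ZDNN_OK);
}

When splitting is disabled, or e2 fits in a single chunk, the wrapper degenerates to one zdnn_add call, so the sketch behaves identically on both paths.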