diff --git a/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp b/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp
index e59208bff0..b6ec195840 100644
--- a/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp
+++ b/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp
@@ -56,19 +56,19 @@ ApiRegistry RegisterAllApis(MLIRContext *context) {
     ApiSpec(API::ZDNN_TRANSFORM_ZTENSOR, "zdnn_transform_ztensor", int32Ty, {opaquePtrTy}, true),
     ApiSpec(API::ZDNN_TRANSFORM_ORIGTENSOR, "zdnn_transform_origtensor", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
     // Elementwise operations
-    ApiSpec(API::ZDNN_ADD, "zdnn_add", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_SUB, "zdnn_sub", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_MUL, "zdnn_mul", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_DIV, "zdnn_div", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_MIN, "zdnn_min", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_MAX, "zdnn_max", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_LOG, "zdnn_log", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_EXP, "zdnn_exp", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_ADD, "zdnn_add_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_SUB, "zdnn_sub_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_MUL, "zdnn_mul_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_DIV, "zdnn_div_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_MIN, "zdnn_min_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_MAX, "zdnn_max_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_LOG, "zdnn_log_ext", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_EXP, "zdnn_exp_ext", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
     // Activation operations
-    ApiSpec(API::ZDNN_RELU, "zdnn_relu", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_TANH, "zdnn_tanh", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_SIGMOID, "zdnn_sigmoid", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
-    ApiSpec(API::ZDNN_SOFTMAX, "zdnn_softmax", int32Ty, {opaquePtrTy, opaquePtrTy, int64Ty, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_RELU, "zdnn_relu_ext", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_TANH, "zdnn_tanh_ext", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_SIGMOID, "zdnn_sigmoid_ext", int32Ty, {opaquePtrTy, opaquePtrTy}, false),
+    ApiSpec(API::ZDNN_SOFTMAX, "zdnn_softmax_ext", int32Ty, {opaquePtrTy, opaquePtrTy, int64Ty, opaquePtrTy}, false),
     // RNN operations
     ApiSpec(API::ZDNN_LSTM, "zdnn_lstm", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, int64Ty, opaquePtrTy, opaquePtrTy, opaquePtrTy}, false),
     ApiSpec(API::ZDNN_GRU, "zdnn_gru", int32Ty, {opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, opaquePtrTy, int64Ty, opaquePtrTy, opaquePtrTy}, false),
diff --git a/src/Accelerators/NNPA/Runtime/CMakeLists.txt b/src/Accelerators/NNPA/Runtime/CMakeLists.txt
index 083e616ca7..e6b3e80caa 100644
--- a/src/Accelerators/NNPA/Runtime/CMakeLists.txt
+++ b/src/Accelerators/NNPA/Runtime/CMakeLists.txt
@@ -5,7 +5,9 @@ add_onnx_mlir_library(RuntimeNNPA STATIC
   OMRuntimeNNPA.c
   zDNNExtension/zDNNExtension.c
+  zDNNExtension/Elementwise.c
   zDNNExtension/MatMul.c
+  zDNNExtension/Softmax.c
 
   EXCLUDE_FROM_OM_LIBS
 
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/Elementwise.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/Elementwise.c
new file mode 100644
index 0000000000..cb65ae84d4
--- /dev/null
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/Elementwise.c
@@ -0,0 +1,342 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//===------------------------ Elementwise.c -------------------------------===//
+//
+// Copyright 2024 The IBM Research Authors.
+//
+// =============================================================================
+//
+// A wrapper of zdnn elementwise ops for ztensor partition and parallelism.
+//
+//===----------------------------------------------------------------------===//
+
+// Include pthreads (need special treatment on z/OS).
+#ifdef __MVS__
+#define _OPEN_THREADS
+#endif
+#include <pthread.h>
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "zDNNExtension.h"
+#include "zdnn.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum ElementwiseOp {
+  // Binary
+  ZDNN_ADD_EXT,
+  ZDNN_DIV_EXT,
+  ZDNN_MAX_EXT,
+  ZDNN_MIN_EXT,
+  ZDNN_MUL_EXT,
+  ZDNN_SUB_EXT,
+  // Unary
+  ZDNN_EXP_EXT,
+  ZDNN_LOG_EXT,
+  ZDNN_RELU_EXT,
+  ZDNN_TANH_EXT,
+  ZDNN_SIGMOID_EXT,
+} ElementwiseOp;
+
+static zdnn_status zdnn_unary_elementwise_common(const zdnn_ztensor *input,
+    const void *clippingValue, zdnn_ztensor *output, ElementwiseOp opType) {
+  // Verify that e4, e3, and e1 do not exceed the maximum dimension size, so
+  // that e2 can be split safely.
+  OrigShape origShapeOfX;
+  getOrigShape(input, &origShapeOfX);
+  uint32_t maxDimSize = zdnn_get_nnpa_max_dim_idx_size();
+  if ((origShapeOfX.e4 > maxDimSize) || (origShapeOfX.e3 > maxDimSize) ||
+      (origShapeOfX.e1 > maxDimSize)) {
+    printf("[UnaryElementwise] The input tensor dimension exceeds maximum "
+           "dimension index "
+           "size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
+        maxDimSize, origShapeOfX.e4, origShapeOfX.e3, origShapeOfX.e1);
+    return ZDNN_EXCEEDS_MDIS;
+  }
+
+  // We split e2 in (e4, e3, e2, e1).
+  SplitInfo splitInfoX = {
+      .origZTensor = input, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoY = {
+      .origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};
+
+  // Dim is small or ztensor split is disabled.
+  if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoX) ||
+      !initSplitInfo(&splitInfoY)) {
+    if (OMZTensorSplitDebug)
+      printf("[UnaryElementwise] Not split zTensor ...\n");
+    if (opType == ZDNN_EXP_EXT)
+      return zdnn_exp(input, output);
+    else if (opType == ZDNN_LOG_EXT)
+      return zdnn_log(input, output);
+    else if (opType == ZDNN_RELU_EXT)
+      return zdnn_relu(input, clippingValue, output);
+    else if (opType == ZDNN_SIGMOID_EXT)
+      return zdnn_sigmoid(input, output);
+    else if (opType == ZDNN_TANH_EXT)
+      return zdnn_tanh(input, output);
+    else
+      return ZDNN_UNAVAILABLE_FUNCTION;
+  }
+
+  // Split input.
+  if (OMZTensorSplitDebug)
+    printf("[UnaryElementwise] Split the input ztensor along e2 into %d chunks "
+           "of %d elements \n",
+        splitInfoX.numOfChunks, splitInfoX.chunkSize);
+
+  double splitTime = 0.;
+  double mmTime = 0.;
+  double mergeTime = 0.;
+  clock_t start_time, end_time;
+
+  // Split input into chunks.
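+  // Worked example of the chunk arithmetic done in initSplitInfo (the numbers
+  // are illustrative, not defaults): with e2 = 4096 and a chunk size of 1024,
+  // numOfChunks = CEIL(4096, 1024) = 4 chunks of dimSize 1024; with e2 = 4100
+  // a 5th chunk would carry the remainder, dimSize = 4. Chunk i then starts
+  // at offsetInStick = i * CEIL(chunkSize, AIU_STICKS_PER_PAGE) pages along
+  // the stickified e2 dimension.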
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  splitZTensor(&splitInfoX, /*copyData=*/true);
+  splitZTensor(&splitInfoY, /*copyData=*/false);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Call zdnn op on each chunk.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  for (uint32_t i = 0; i < splitInfoX.numOfChunks; ++i) {
+    zdnn_ztensor *zxTensor = (splitInfoX.chunks + i)->ztensor;
+    zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
+    zdnn_status status;
+    if (opType == ZDNN_EXP_EXT)
+      status = zdnn_exp(zxTensor, zyTensor);
+    else if (opType == ZDNN_LOG_EXT)
+      status = zdnn_log(zxTensor, zyTensor);
+    else if (opType == ZDNN_RELU_EXT)
+      status = zdnn_relu(zxTensor, clippingValue, zyTensor);
+    else if (opType == ZDNN_SIGMOID_EXT)
+      status = zdnn_sigmoid(zxTensor, zyTensor);
+    else if (opType == ZDNN_TANH_EXT)
+      status = zdnn_tanh(zxTensor, zyTensor);
+    else
+      status = ZDNN_UNAVAILABLE_FUNCTION;
+    assert(status == ZDNN_OK);
+  }
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mmTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Merging the chunks into the output.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  mergeZTensors(&splitInfoY);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  freeSplitInfoBuffer(&splitInfoX);
+  freeSplitInfoBuffer(&splitInfoY);
+
+  if (OMZTensorSplitDebug)
+    printf("[UnaryElementwise] split, %f, mm, %f, merge, %f (milliseconds)\n",
+        splitTime, mmTime, mergeTime);
+
+  return ZDNN_OK;
+}
+
+static zdnn_status zdnn_binary_elementwise_common(const zdnn_ztensor *inputA,
+    const zdnn_ztensor *inputB, zdnn_ztensor *output, ElementwiseOp opType) {
+  // Verify that e4, e3, and e1 do not exceed the maximum dimension size, so
+  // that e2 can be split safely.
+  OrigShape origShapeOfA, origShapeOfB;
+  getOrigShape(inputA, &origShapeOfA);
+  getOrigShape(inputB, &origShapeOfB);
+  uint32_t maxDimSize = zdnn_get_nnpa_max_dim_idx_size();
+  if ((origShapeOfA.e4 > maxDimSize) || (origShapeOfA.e3 > maxDimSize) ||
+      (origShapeOfA.e1 > maxDimSize)) {
+    printf("[BinaryElementwise] The 1st tensor dimension exceeds maximum "
+           "dimension index "
+           "size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
+        maxDimSize, origShapeOfA.e4, origShapeOfA.e3, origShapeOfA.e1);
+    return ZDNN_EXCEEDS_MDIS;
+  }
+  if ((origShapeOfB.e4 > maxDimSize) || (origShapeOfB.e3 > maxDimSize) ||
+      (origShapeOfB.e1 > maxDimSize)) {
+    printf("[BinaryElementwise] The 2nd tensor dimension exceeds maximum "
+           "dimension index "
+           "size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
+        maxDimSize, origShapeOfB.e4, origShapeOfB.e3, origShapeOfB.e1);
+    return ZDNN_EXCEEDS_MDIS;
+  }
+
+  // We split e2 in (e4, e3, e2, e1).
+  SplitInfo splitInfoA = {
+      .origZTensor = inputA, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoB = {
+      .origZTensor = inputB, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoY = {
+      .origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};
+
+  // Dim is small or ztensor split is disabled.
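+  // Note: a single non-splittable operand sends the whole op down the unsplit
+  // path below. A, B, and Y are split along the same axis with the same chunk
+  // size, so chunk i of A, B, and Y covers the same e2 range and can be
+  // computed independently of the other chunks.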
+  if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoA) ||
+      !initSplitInfo(&splitInfoB) || !initSplitInfo(&splitInfoY)) {
+    if (OMZTensorSplitDebug)
+      printf("[BinaryElementwise] Not split zTensor ...\n");
+    if (opType == ZDNN_ADD_EXT)
+      return zdnn_add(inputA, inputB, output);
+    else if (opType == ZDNN_SUB_EXT)
+      return zdnn_sub(inputA, inputB, output);
+    else if (opType == ZDNN_MUL_EXT)
+      return zdnn_mul(inputA, inputB, output);
+    else if (opType == ZDNN_DIV_EXT)
+      return zdnn_div(inputA, inputB, output);
+    else if (opType == ZDNN_MAX_EXT)
+      return zdnn_max(inputA, inputB, output);
+    else if (opType == ZDNN_MIN_EXT)
+      return zdnn_min(inputA, inputB, output);
+    else
+      return ZDNN_UNAVAILABLE_FUNCTION;
+  }
+
+  // Split input.
+  if (OMZTensorSplitDebug)
+    printf(
+        "[BinaryElementwise] Split the input ztensors along e2 into %d chunks "
+        "of %d elements \n",
+        splitInfoA.numOfChunks, splitInfoA.chunkSize);
+
+  double splitTime = 0.;
+  double mmTime = 0.;
+  double mergeTime = 0.;
+  clock_t start_time, end_time;
+
+  // Split input into chunks.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  splitZTensor(&splitInfoA, /*copyData=*/true);
+  splitZTensor(&splitInfoB, /*copyData=*/true);
+  splitZTensor(&splitInfoY, /*copyData=*/false);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Call zdnn op on each chunk.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  for (uint32_t i = 0; i < splitInfoA.numOfChunks; ++i) {
+    zdnn_ztensor *zaTensor = (splitInfoA.chunks + i)->ztensor;
+    zdnn_ztensor *zbTensor = (splitInfoB.chunks + i)->ztensor;
+    zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
+    zdnn_status status;
+    if (opType == ZDNN_ADD_EXT)
+      status = zdnn_add(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_SUB_EXT)
+      status = zdnn_sub(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_MUL_EXT)
+      status = zdnn_mul(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_DIV_EXT)
+      status = zdnn_div(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_MAX_EXT)
+      status = zdnn_max(zaTensor, zbTensor, zyTensor);
+    else if (opType == ZDNN_MIN_EXT)
+      status = zdnn_min(zaTensor, zbTensor, zyTensor);
+    else
+      status = ZDNN_UNAVAILABLE_FUNCTION;
+    assert(status == ZDNN_OK);
+  }
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mmTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Merging the chunks into the output.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  mergeZTensors(&splitInfoY);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  freeSplitInfoBuffer(&splitInfoA);
+  freeSplitInfoBuffer(&splitInfoB);
+  freeSplitInfoBuffer(&splitInfoY);
+
+  if (OMZTensorSplitDebug)
+    printf("[BinaryElementwise] split, %f, mm, %f, merge, %f (milliseconds)\n",
+        splitTime, mmTime, mergeTime);
+
+  return ZDNN_OK;
+}
+
+// -----------------------------------------------------------------------------
+// Extension Functions
+// Same name as zdnn functions but with the `_ext` postfix.
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_add_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_ADD_EXT);
+}
+
+zdnn_status zdnn_sub_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_SUB_EXT);
+}
+
+zdnn_status zdnn_mul_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_MUL_EXT);
+}
+
+zdnn_status zdnn_div_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_DIV_EXT);
+}
+
+zdnn_status zdnn_min_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_MIN_EXT);
+}
+
+zdnn_status zdnn_max_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output) {
+  return zdnn_binary_elementwise_common(inputA, inputB, output, ZDNN_MAX_EXT);
+}
+
+zdnn_status zdnn_exp_ext(const zdnn_ztensor *input, zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(input, NULL, output, ZDNN_EXP_EXT);
+}
+
+zdnn_status zdnn_log_ext(const zdnn_ztensor *input, zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(input, NULL, output, ZDNN_LOG_EXT);
+}
+
+zdnn_status zdnn_relu_ext(const zdnn_ztensor *input, const void *clippingValue,
+    zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(
+      input, clippingValue, output, ZDNN_RELU_EXT);
+}
+
+zdnn_status zdnn_sigmoid_ext(const zdnn_ztensor *input, zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(input, NULL, output, ZDNN_SIGMOID_EXT);
+}
+
+zdnn_status zdnn_tanh_ext(const zdnn_ztensor *input, zdnn_ztensor *output) {
+  return zdnn_unary_elementwise_common(input, NULL, output, ZDNN_TANH_EXT);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c
index 9131479f08..3bfd8374f7 100644
--- a/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/MatMul.c
@@ -59,14 +59,14 @@ static zdnn_status zdnn_matmul_op_common(const zdnn_ztensor *inputA,
 
   // For a MatMul of (M,N)*(N,P),
   // We split M that is e2 in (e4, e3, e2, e1).
-  SplitInfo splitInfoA, splitInfoY;
-  splitInfoA.axis = 2;
-  splitInfoY.axis = 2;
-  splitInfoA.chunkSize = OMZTensorSplitSize;
-  splitInfoY.chunkSize = OMZTensorSplitSize;
+  SplitInfo splitInfoA = {
+      .origZTensor = inputA, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoY = {
+      .origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};
 
   // Dim is small or ztensor split is disabled.
-  if (!OMZTensorSplitEnabled || !initSplitInfo(inputA, &splitInfoA)) {
+  if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoA) ||
+      !initSplitInfo(&splitInfoY)) {
     if (OMZTensorSplitDebug)
       printf("[MatMul] Not split zTensor ...\n");
     return call_zdnn_matmul_op(inputA, inputB, inputC, opType, output, isBcast);
@@ -77,7 +77,6 @@
     printf("[MatMul] Split the 1st ztensor along e2 into %d chunks of %d "
            "elements \n",
         splitInfoA.numOfChunks, splitInfoA.chunkSize);
-  initSplitInfo(output, &splitInfoY);
 
   double splitTime = 0.;
   double mmTime = 0.;
@@ -87,8 +86,8 @@
   // Split input A into chunks.
   if (OMZTensorSplitDebug)
     start_time = clock();
-  splitZTensor(inputA, &splitInfoA, /*copyData=*/true);
-  splitZTensor(output, &splitInfoY, /*copyData=*/false);
+  splitZTensor(&splitInfoA, /*copyData=*/true);
+  splitZTensor(&splitInfoY, /*copyData=*/false);
   if (OMZTensorSplitDebug) {
     end_time = clock();
     splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
@@ -98,8 +97,10 @@
   if (OMZTensorSplitDebug)
     start_time = clock();
   for (uint32_t i = 0; i < splitInfoA.numOfChunks; ++i) {
-    zdnn_status status = call_zdnn_matmul_op(splitInfoA.tensors + i, inputB,
-        inputC, opType, splitInfoY.tensors + i, isBcast);
+    zdnn_ztensor *zaTensor = (splitInfoA.chunks + i)->ztensor;
+    zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
+    zdnn_status status = call_zdnn_matmul_op(
+        zaTensor, inputB, inputC, opType, zyTensor, isBcast);
     assert(status == ZDNN_OK);
   }
   if (OMZTensorSplitDebug) {
@@ -110,7 +111,7 @@
   // Merging the chunks into the output.
   if (OMZTensorSplitDebug)
     start_time = clock();
-  mergeZTensors(&splitInfoY, output);
+  mergeZTensors(&splitInfoY);
   if (OMZTensorSplitDebug) {
     end_time = clock();
     mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
@@ -126,6 +127,11 @@
   return ZDNN_OK;
 }
 
+// -----------------------------------------------------------------------------
+// Extension Functions
+// Same name as zdnn functions but with the `_ext` postfix.
+// -----------------------------------------------------------------------------
+
 zdnn_status zdnn_matmul_op_ext(const zdnn_ztensor *inputA,
     const zdnn_ztensor *inputB, const zdnn_ztensor *inputC, int opType,
     zdnn_ztensor *output) {
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/Softmax.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/Softmax.c
new file mode 100644
index 0000000000..53147f50c2
--- /dev/null
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/Softmax.c
@@ -0,0 +1,126 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//===-------------------------- Softmax.c ---------------------------------===//
+//
+// Copyright 2024 The IBM Research Authors.
+//
+// =============================================================================
+//
+// A wrapper of zdnn_softmax for ztensor partition and parallelism.
+//
+//===----------------------------------------------------------------------===//
+
+// Include pthreads (need special treatment on z/OS).
+#ifdef __MVS__
+#define _OPEN_THREADS
+#endif
+#include <pthread.h>
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "zDNNExtension.h"
+#include "zdnn.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// -----------------------------------------------------------------------------
+// Extension Functions
+// Same name as zdnn functions but with the `_ext` postfix.
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_softmax_ext(const zdnn_ztensor *input, void *save_area,
+    zdnn_softmax_act act_func, zdnn_ztensor *output) {
+  // Verify that e4, e3, and e1 do not exceed the maximum dimension size, so
+  // that e2 can be split safely.
+  OrigShape origShapeOfX;
+  getOrigShape(input, &origShapeOfX);
+  uint32_t maxDimSize = zdnn_get_nnpa_max_dim_idx_size();
+  if ((origShapeOfX.e4 > maxDimSize) || (origShapeOfX.e3 > maxDimSize) ||
+      (origShapeOfX.e1 > maxDimSize)) {
+    printf(
+        "[Softmax] The input tensor dimension exceeds maximum dimension index "
+        "size (MDIS) of %d: e4 = %d, e3 = %d, e1 = %d.\n",
+        maxDimSize, origShapeOfX.e4, origShapeOfX.e3, origShapeOfX.e1);
+    return ZDNN_EXCEEDS_MDIS;
+  }
+
+  // We split e2 in (e4, e3, e2, e1).
+  SplitInfo splitInfoX = {
+      .origZTensor = input, .axis = 2, .chunkSize = OMZTensorSplitSize};
+  SplitInfo splitInfoY = {
+      .origZTensor = output, .axis = 2, .chunkSize = OMZTensorSplitSize};
+
+  // Dim is small or ztensor split is disabled.
+  if (!OMZTensorSplitEnabled || !initSplitInfo(&splitInfoX) ||
+      !initSplitInfo(&splitInfoY)) {
+    if (OMZTensorSplitDebug)
+      printf("[Softmax] Not split zTensor ...\n");
+    return zdnn_softmax(input, save_area, act_func, output);
+  }
+
+  // Split input.
+  if (OMZTensorSplitDebug)
+    printf("[Softmax] Split the input ztensor along e2 into %d chunks of %d "
+           "elements \n",
+        splitInfoX.numOfChunks, splitInfoX.chunkSize);
+
+  double splitTime = 0.;
+  double mmTime = 0.;
+  double mergeTime = 0.;
+  clock_t start_time, end_time;
+
+  // Split input into chunks.
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  splitZTensor(&splitInfoX, /*copyData=*/true);
+  splitZTensor(&splitInfoY, /*copyData=*/false);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    splitTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Call zdnn_softmax on each chunk. Do not use save_area.
+  // TODO: could we reuse save_area, in particular in the parallel scenario?
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  for (uint32_t i = 0; i < splitInfoX.numOfChunks; ++i) {
+    zdnn_ztensor *zxTensor = (splitInfoX.chunks + i)->ztensor;
+    zdnn_ztensor *zyTensor = (splitInfoY.chunks + i)->ztensor;
+    zdnn_status status = zdnn_softmax(zxTensor, NULL, act_func, zyTensor);
+    assert(status == ZDNN_OK);
+  }
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mmTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  // Merging the chunks into the output.
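+  // The merge mirrors the split: each chunk's stickified buffer is copied
+  // back into the original output buffer at its page offset (offsetInStick),
+  // so no extra buffer is allocated for the merged result.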
+  if (OMZTensorSplitDebug)
+    start_time = clock();
+  mergeZTensors(&splitInfoY);
+  if (OMZTensorSplitDebug) {
+    end_time = clock();
+    mergeTime = ((float)(end_time - start_time) / (float)CLOCKS_PER_SEC) * 1000;
+  }
+
+  freeSplitInfoBuffer(&splitInfoX);
+  freeSplitInfoBuffer(&splitInfoY);
+
+  if (OMZTensorSplitDebug)
+    printf("[Softmax] split, %f, mm, %f, merge, %f (milliseconds)\n", splitTime,
+        mmTime, mergeTime);
+
+  return ZDNN_OK;
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c
index 8a945a796d..4093a885c9 100644
--- a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.c
@@ -94,31 +94,47 @@ static void getZTensorShape(const zdnn_ztensor *t, zTensorShape *shape) {
   assert(sizeFromDim == sizeFromBuffer && "buffer size mismatched");
 }
 
-static zdnn_status allocZTensorChunk(const zdnn_ztensor *input, uint32_t axis,
-    uint32_t chunkSize, zdnn_ztensor *output) {
+static zdnn_status allocZTensorChunk(
+    const SplitInfo *splitInfo, uint32_t chunkID) {
+  const zdnn_ztensor *origZTensor = splitInfo->origZTensor;
+
+  uint32_t axis = splitInfo->axis;
+  ChunkInfo *chunk = splitInfo->chunks + chunkID;
+  uint32_t chunkSize = chunk->dimSize;
+
+  // Allocate one ztensor struct.
+  chunk->ztensor = malloc(sizeof(zdnn_ztensor));
+  if (!chunk->ztensor)
+    return ZDNN_ALLOCATION_FAILURE;
+  zdnn_ztensor *chunkZTensor = chunk->ztensor;
+
+  // Allocate one buffer for two descriptors.
   zdnn_tensor_desc *descriptors = malloc(2 * sizeof(zdnn_tensor_desc));
   if (!descriptors)
     return ZDNN_ALLOCATION_FAILURE;
   zdnn_tensor_desc *preTransDesc = descriptors;
   zdnn_tensor_desc *transDesc = descriptors + 1;
-  // Copy pre_transform_desc from the input.
-  preTransDesc->layout = input->pre_transformed_desc->layout;
-  preTransDesc->format = input->pre_transformed_desc->format;
-  preTransDesc->type = input->pre_transformed_desc->type;
+
+  // Copy pre_transform_desc from the origZTensor.
+  preTransDesc->layout = origZTensor->pre_transformed_desc->layout;
+  preTransDesc->format = origZTensor->pre_transformed_desc->format;
+  preTransDesc->type = origZTensor->pre_transformed_desc->type;
   preTransDesc->dim4 =
-      (axis == 0) ? chunkSize : input->pre_transformed_desc->dim4;
+      (axis == 0) ? chunkSize : origZTensor->pre_transformed_desc->dim4;
   preTransDesc->dim3 =
-      (axis == 1) ? chunkSize : input->pre_transformed_desc->dim3;
+      (axis == 1) ? chunkSize : origZTensor->pre_transformed_desc->dim3;
   preTransDesc->dim2 =
-      (axis == 2) ? chunkSize : input->pre_transformed_desc->dim2;
+      (axis == 2) ? chunkSize : origZTensor->pre_transformed_desc->dim2;
   preTransDesc->dim1 =
-      (axis == 3) ? chunkSize : input->pre_transformed_desc->dim1;
+      (axis == 3) ? chunkSize : origZTensor->pre_transformed_desc->dim1;
+
+  // Generate the transformed desc from the pre-transformed desc.
   zdnn_status status = zdnn_generate_transformed_desc(preTransDesc, transDesc);
   if (status != ZDNN_OK)
     return status;
+
+  // Init a zTensor with malloc.
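+  // The chunk reuses the original layout, format, and type; only the
+  // dimension along the split axis shrinks to this chunk's dimSize, so
+  // zdnn_init_ztensor_with_malloc below sizes the chunk's stickified buffer
+  // from the transformed descriptor.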
-  return zdnn_init_ztensor_with_malloc(preTransDesc, transDesc, output);
+  return zdnn_init_ztensor_with_malloc(preTransDesc, transDesc, chunkZTensor);
 }
 
 static void freeZTensorChunk(zdnn_ztensor *t) {
@@ -129,27 +145,42 @@ static void freeZTensorChunk(zdnn_ztensor *t) {
   free(t->pre_transformed_desc);
 }
 
-static void copyZTensorChunk(zdnn_ztensor *output, const zdnn_ztensor *input,
-    uint32_t axis, uint32_t offset, bool fromChunk) {
+static void copyZTensorChunk(
+    const SplitInfo *splitInfo, uint32_t chunkID, bool fromChunk) {
   // Only support the second innermost axis in the CPU tensor at this moment.
   // axis = 2 in the CPU tensor corresponds to dim3 in zTensor.
-  if (axis != 2) {
+  if (splitInfo->axis != 2) {
     printf("Only support the second innermost dimension at this moment.");
     return;
   }
-  zTensorShape inShape, outShape;
-  getZTensorShape(input, &inShape);
-  getZTensorShape(output, &outShape);
-  zTensorShape origShape = fromChunk ? outShape : inShape;
-  zTensorShape chunkShape = fromChunk ? inShape : outShape;
+  ChunkInfo *chunk = splitInfo->chunks + chunkID;
+  uint32_t offset = chunk->offsetInStick;
+
+  // Buffer pointers.
+  void *src, *dst;
+  if (fromChunk) {
+    src = chunk->ztensor->buffer;
+    dst = splitInfo->origZTensor->buffer;
+  } else {
+    src = splitInfo->origZTensor->buffer;
+    dst = chunk->ztensor->buffer;
+  }
+  assert(src && "Source buffer is NULL");
+  assert(dst && "Destination buffer is NULL");
+
+  // Shape information.
+  zTensorShape origShape;
+  getZTensorShape(splitInfo->origZTensor, &origShape);
+  zTensorShape chunkShape;
+  getZTensorShape(chunk->ztensor, &chunkShape);
   assert(origShape.dim6 == chunkShape.dim6);
   assert(origShape.dim5 == chunkShape.dim5);
   assert(origShape.dim4 == chunkShape.dim4);
   assert(origShape.dim2 == chunkShape.dim2);
   assert(origShape.dim1 == chunkShape.dim1);
   // Ensure that each element is 2 bytes.
-  assert(input->transformed_desc->type == ZDNN_DLFLOAT16);
+  assert(splitInfo->origZTensor->transformed_desc->type == ZDNN_DLFLOAT16);
 
   uint64_t D6 = chunkShape.dim6;
   uint64_t D5 = chunkShape.dim5;
@@ -170,9 +201,8 @@ static void copyZTensorChunk(zdnn_ztensor *output, const zdnn_ztensor *input,
         uint64_t TD3Offset = td3 + TD3 * TD4Offset;
         // Copy one page at a time.
         uint64_t offsetSrc = AIU_PAGESIZE_IN_BYTES * SD3Offset;
-        uint64_t offsetDest = AIU_PAGESIZE_IN_BYTES * TD3Offset;
-        memcpy(output->buffer + offsetDest, input->buffer + offsetSrc,
-            AIU_PAGESIZE_IN_BYTES);
+        uint64_t offsetDst = AIU_PAGESIZE_IN_BYTES * TD3Offset;
+        memcpy(dst + offsetDst, src + offsetSrc, AIU_PAGESIZE_IN_BYTES);
       }
     }
   }
@@ -180,25 +210,42 @@ static void copyZTensorChunk(zdnn_ztensor *output, const zdnn_ztensor *input,
   return;
 }
 
-static void copyZTensorChunkScalar(zdnn_ztensor *output,
-    const zdnn_ztensor *input, uint32_t axis, uint32_t offset, bool fromChunk) {
+static void copyZTensorChunkScalar(
+    const SplitInfo *splitInfo, uint32_t chunkID, bool fromChunk) {
   // Only support the second innermost axis in the CPU tensor at this moment.
   // axis = 2 in the CPU tensor corresponds to dim3 in zTensor.
-  if (axis != 2) {
+  if (splitInfo->axis != 2) {
     printf("Only support the second innermost dimension at this moment.");
     return;
   }
-  zTensorShape inShape, outShape;
-  getZTensorShape(input, &inShape);
-  getZTensorShape(output, &outShape);
-  zTensorShape origShape = fromChunk ? outShape : inShape;
-  zTensorShape chunkShape = fromChunk ? inShape : outShape;
+  ChunkInfo *chunk = splitInfo->chunks + chunkID;
+  uint32_t offset = chunk->offsetInStick;
+
+  // Buffer pointers.
+  uint16_t *src, *dst;
+  if (fromChunk) {
+    src = (uint16_t *)chunk->ztensor->buffer;
+    dst = (uint16_t *)splitInfo->origZTensor->buffer;
+  } else {
+    src = (uint16_t *)splitInfo->origZTensor->buffer;
+    dst = (uint16_t *)chunk->ztensor->buffer;
+  }
+  assert(src && "Source buffer is NULL");
+  assert(dst && "Destination buffer is NULL");
+
+  // Shape information.
+  zTensorShape origShape;
+  getZTensorShape(splitInfo->origZTensor, &origShape);
+  zTensorShape chunkShape;
+  getZTensorShape(chunk->ztensor, &chunkShape);
   assert(origShape.dim6 == chunkShape.dim6);
   assert(origShape.dim5 == chunkShape.dim5);
   assert(origShape.dim4 == chunkShape.dim4);
   assert(origShape.dim2 == chunkShape.dim2);
   assert(origShape.dim1 == chunkShape.dim1);
+  // Ensure that each element is 2 bytes.
+  assert(splitInfo->origZTensor->transformed_desc->type == ZDNN_DLFLOAT16);
 
   uint64_t D6 = chunkShape.dim6;
   uint64_t D5 = chunkShape.dim5;
@@ -220,13 +267,9 @@ static void copyZTensorChunkScalar(zdnn_ztensor *output,
         uint64_t TD3Offset = td3 + TD3 * (d4 + D4 * (d5 + D5 * d6));
         for (uint64_t d2 = 0; d2 < D2; ++d2) {
           for (uint64_t d1 = 0; d1 < D1; ++d1) {
-            // Copy 2 bytes at a time.
-            uint64_t offsetSrc =
-                AIU_2BYTE_CELL_SIZE * (d1 + D1 * (d2 + D2 * SD3Offset));
-            uint64_t offsetDest =
-                AIU_2BYTE_CELL_SIZE * (d1 + D1 * (d2 + D2 * TD3Offset));
-            memcpy(output->buffer + offsetDest, input->buffer + offsetSrc,
-                AIU_2BYTE_CELL_SIZE);
+            uint64_t offsetSrc = d1 + D1 * (d2 + D2 * SD3Offset);
+            uint64_t offsetDst = d1 + D1 * (d2 + D2 * TD3Offset);
+            *(dst + offsetDst) = *(src + offsetSrc);
           }
         }
       }
@@ -236,66 +279,77 @@ static void copyZTensorChunkScalar(zdnn_ztensor *output,
   return;
 }
 
-bool initSplitInfo(const zdnn_ztensor *input, SplitInfo *splitInfo) {
+bool initSplitInfo(SplitInfo *splitInfo) {
   // Only support the second innermost dimension at this moment.
   if (splitInfo->axis != 2)
     return false;
-  splitInfo->totalSize = input->transformed_desc->dim2;
-  splitInfo->chunkSizeInStick = CEIL(splitInfo->chunkSize, AIU_STICKS_PER_PAGE);
+
+  // Init general split information.
+  const zdnn_ztensor *origZTensor = splitInfo->origZTensor;
+  splitInfo->totalSize = origZTensor->transformed_desc->dim2;
   splitInfo->numOfChunks = CEIL(splitInfo->totalSize, splitInfo->chunkSize);
+
+  // No split benefit.
   if (splitInfo->numOfChunks == 1)
     return false;
+
+  // Stickification: (e4, e3, e2, e1) -> (e4, e1/64, e3, e2/32, 32, 64)
+  uint32_t chunkSizeInStick;
+  if (splitInfo->axis == 0) // e4
+    chunkSizeInStick = splitInfo->chunkSize;
+  else if (splitInfo->axis == 1) // e3
+    chunkSizeInStick = splitInfo->chunkSize;
+  else if (splitInfo->axis == 2) // e2
+    chunkSizeInStick = CEIL(splitInfo->chunkSize, AIU_STICKS_PER_PAGE);
+  else if (splitInfo->axis == 3) // e1
+    chunkSizeInStick = CEIL(splitInfo->chunkSize, AIU_2BYTE_CELLS_PER_STICK);
+  else
+    return false;
+
+  // Init chunk information.
   splitInfo->chunks = malloc(splitInfo->numOfChunks * sizeof(ChunkInfo));
+  assert(splitInfo->chunks && "Failed to allocate ChunkInfo struct");
   for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i) {
     ChunkInfo *chunkInfo = splitInfo->chunks + i;
     if (i == splitInfo->numOfChunks - 1)
-      chunkInfo->size = splitInfo->totalSize - i * splitInfo->chunkSize;
+      chunkInfo->dimSize = splitInfo->totalSize - i * splitInfo->chunkSize;
     else
-      chunkInfo->size = splitInfo->chunkSize;
+      chunkInfo->dimSize = splitInfo->chunkSize;
+    chunkInfo->offsetInStick = i * chunkSizeInStick;
   }
   return true;
 }
 
 void freeSplitInfoBuffer(SplitInfo *splitInfo) {
+  // Free the sub tensors.
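+  // Free each chunk's ztensor before the chunks array itself: the ChunkInfo
+  // entries hold the only pointers to these ztensor structs.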
+  for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i) {
+    zdnn_ztensor *t = (splitInfo->chunks + i)->ztensor;
+    // Free the ztensor buffer and descriptors.
+    freeZTensorChunk(t);
+    // Free ztensor struct.
+    free(t);
+  }
   // Free chunk info.
   if (splitInfo->chunks)
     free(splitInfo->chunks);
-  // Free the sub tensors.
-  for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i)
-    freeZTensorChunk(splitInfo->tensors + i);
-  if (splitInfo->tensors)
-    free(splitInfo->tensors);
 }
 
-void splitZTensor(
-    const zdnn_ztensor *input, SplitInfo *splitInfo, bool copyData) {
-  splitInfo->tensors =
-      malloc(splitInfo->numOfChunks * sizeof(struct zdnn_ztensor));
-  assert(splitInfo->tensors && "Failed to allocate a buffer");
-  uint32_t axis = splitInfo->axis;
+void splitZTensor(const SplitInfo *splitInfo, bool copyData) {
   for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i) {
-    zdnn_ztensor *chunk = splitInfo->tensors + i;
-    ChunkInfo *chunkInfo = splitInfo->chunks + i;
-    // Allocate ztensor struct for the chunk.
-    zdnn_status status =
-        allocZTensorChunk(input, /*axis=*/axis, chunkInfo->size, chunk);
+    // Allocate a chunk ztensor.
+    zdnn_status status = allocZTensorChunk(splitInfo, i);
     assert(status == ZDNN_OK && "Failed to allocate zTensor chunk");
     if (copyData) {
-      // Copy data from the input to the chunk.
-      uint32_t offset = i * splitInfo->chunkSizeInStick;
-      copyZTensorChunk(chunk, input, axis, offset, /*fromChunk=*/false);
+      // Copy data from the original ztensor to the chunk ztensor.
+      copyZTensorChunk(splitInfo, i, /*fromChunk=*/false);
     }
   }
 }
 
-void mergeZTensors(const SplitInfo *splitInfo, zdnn_ztensor *output) {
+void mergeZTensors(const SplitInfo *splitInfo) {
   for (uint32_t i = 0; i < splitInfo->numOfChunks; ++i) {
-    uint32_t offset = i * splitInfo->chunkSizeInStick;
-    copyZTensorChunk(output, splitInfo->tensors + i, splitInfo->axis, offset,
-        /*fromChunk=*/true);
+    // Copy data from the chunk ztensor back to the original ztensor.
+    copyZTensorChunk(splitInfo, i, /*fromChunk=*/true);
   }
 }
diff --git a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h
index affc1f4837..223a7f1a94 100644
--- a/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h
+++ b/src/Accelerators/NNPA/Runtime/zDNNExtension/zDNNExtension.h
@@ -66,25 +66,27 @@
 } zTensorShape;
 
 typedef struct ChunkInfo {
-  uint32_t size;
+  // Dim size for this chunk along the original axis.
+  uint32_t dimSize;
+  // Offset of the split point of this chunk in the stickified axis.
+  uint32_t offsetInStick;
+  // ztensor of this chunk.
+  zdnn_ztensor *ztensor;
 } ChunkInfo;
 
 typedef struct SplitInfo {
-  // Axis to split the tensor. Used to refer to an axis in (e4, e3, e2, e1)
+  // Original ztensor.
+  const zdnn_ztensor *origZTensor;
+  // Axis to split the tensor. Used to refer to an axis in (e4, e3, e2, e1).
   uint32_t axis;
-  // Size of the dimension at axis
+  // Size of the dimension at axis.
   uint32_t totalSize;
-  // Size of each chunk. The last chunk may be smaller
+  // Size of each chunk. The last chunk may be smaller.
   uint32_t chunkSize;
-  // Size of each chunk in the stickifified tensor. The last chunk may be
-  // smaller
-  uint32_t chunkSizeInStick;
-  // The number of chunks
+  // The number of chunks.
   uint32_t numOfChunks;
-  // Information for each chunk
+  // Information for each chunk.
   ChunkInfo *chunks;
-  // Sub zTensors
-  zdnn_ztensor *tensors;
 } SplitInfo;
 
 // -----------------------------------------------------------------------------
@@ -111,11 +113,10 @@ void getOrigShape(const zdnn_ztensor *t, OrigShape *shape);
 /**
  * \brief Initialize a SplitInfo struct.
  *
- * @param input input ztensor to split
  * @param splitInfo information for splitting
 * @return true if the ztensor is splitable. Otherwise, false
 */
-bool initSplitInfo(const zdnn_ztensor *input, SplitInfo *splitInfo);
+bool initSplitInfo(SplitInfo *splitInfo);
 
 /**
  * \brief Free buffers related to a SplitInfo struct.
@@ -129,19 +130,17 @@ void freeSplitInfoBuffer(SplitInfo *splitInfo);
 /**
  * \brief Split a ztensor into multiple chunks.
  *
- * @param input a ztensor to split
- * @param splitInfo information of all chunks
+ * @param splitInfo information for splitting
 * @param copyData whether or not copy data from ztensor to each chunk
 */
-void splitZTensor(
-    const zdnn_ztensor *input, SplitInfo *splitInfo, bool copyData);
+void splitZTensor(const SplitInfo *splitInfo, bool copyData);
 
 /**
  * \brief Merge chunks into a ztensor.
  *
- * @param splitInfo information of all chunks
- * @param output a ztensor obtained by merging the chunks
+ * @param splitInfo information for splitting; its origZTensor receives the
+ * merged chunks
 */
-void mergeZTensors(const SplitInfo *splitInfo, zdnn_ztensor *output);
+void mergeZTensors(const SplitInfo *splitInfo);
 
 // -----------------------------------------------------------------------------
 // Extension Functions
@@ -156,6 +155,28 @@ zdnn_status zdnn_matmul_bcast_op_ext(const zdnn_ztensor *inputA,
     const zdnn_ztensor *inputB, const zdnn_ztensor *inputC, int opType,
     zdnn_ztensor *output);
 
+// Elementwise Operations
+zdnn_status zdnn_add_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_sub_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_mul_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_div_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_min_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_max_ext(const zdnn_ztensor *inputA, const zdnn_ztensor *inputB,
+    zdnn_ztensor *output);
+zdnn_status zdnn_exp_ext(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_log_ext(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_relu_ext(
+    const zdnn_ztensor *input, const void *clippingValue, zdnn_ztensor *output);
+zdnn_status zdnn_sigmoid_ext(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_softmax_ext(const zdnn_ztensor *input, void *save_area,
+    zdnn_softmax_act act_func, zdnn_ztensor *output);
+zdnn_status zdnn_tanh_ext(const zdnn_ztensor *input, zdnn_ztensor *output);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm-typed-pointer.mlir b/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm-typed-pointer.mlir
index 3f7a33fcf9..fb357ea4a4 100644
--- a/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm-typed-pointer.mlir
+++ b/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm-typed-pointer.mlir
@@ -132,7 +132,7 @@ func.func @test_call_zdnn_relu() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_relu
-// CHECK: {{.*}} = llvm.call @zdnn_relu({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_relu_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -146,7 +146,7 @@ func.func @test_call_zdnn_tanh() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_tanh
-// CHECK: {{.*}} = llvm.call @zdnn_tanh({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_tanh_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -160,7 +160,7 @@ func.func @test_call_zdnn_sigmoid() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_sigmoid
-// CHECK: {{.*}} = llvm.call @zdnn_sigmoid({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_sigmoid_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -175,7 +175,7 @@ func.func @test_call_zdnn_add() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_add
-// CHECK: {{.*}} = llvm.call @zdnn_add({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_add_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -190,7 +190,7 @@ func.func @test_call_zdnn_sub() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_sub
-// CHECK: {{.*}} = llvm.call @zdnn_sub({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_sub_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -205,7 +205,7 @@ func.func @test_call_zdnn_mul() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_mul
-// CHECK: {{.*}} = llvm.call @zdnn_mul({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_mul_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -220,7 +220,7 @@ func.func @test_call_zdnn_div() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_div
-// CHECK: {{.*}} = llvm.call @zdnn_div({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_div_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -235,7 +235,7 @@ func.func @test_call_zdnn_softmax() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_softmax
-// CHECK: {{.*}} = llvm.call @zdnn_softmax({{.*}}, {{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_softmax_ext({{.*}}, {{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> i32
 }
 
 // -----
@@ -283,7 +283,7 @@ func.func @test_call_zdnn_min() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_min
-// CHECK: {{.*}} = llvm.call @zdnn_min({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_min_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -298,7 +298,7 @@ func.func @test_call_zdnn_max() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_max
-// CHECK: {{.*}} = llvm.call @zdnn_max({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_max_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -312,7 +312,7 @@ func.func @test_call_zdnn_exp() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_exp
-// CHECK: {{.*}} = llvm.call @zdnn_exp({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_exp_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -326,7 +326,7 @@ func.func @test_call_zdnn_log() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_log
-// CHECK: {{.*}} = llvm.call @zdnn_log({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_log_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
diff --git a/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm.mlir b/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm.mlir
index 3c907d2e23..9f928ffc31 100644
--- a/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm.mlir
+++ b/test/mlir/accelerators/nnpa/conversion/lower-all-to-llvm.mlir
@@ -131,7 +131,7 @@ func.func @test_call_zdnn_relu() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_relu
-// CHECK: {{.*}} = llvm.call @zdnn_relu({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_relu_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -145,7 +145,7 @@ func.func @test_call_zdnn_tanh() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_tanh
-// CHECK: {{.*}} = llvm.call @zdnn_tanh({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_tanh_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -159,7 +159,7 @@ func.func @test_call_zdnn_sigmoid() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_sigmoid
-// CHECK: {{.*}} = llvm.call @zdnn_sigmoid({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_sigmoid_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -174,7 +174,7 @@ func.func @test_call_zdnn_add() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_add
-// CHECK: {{.*}} = llvm.call @zdnn_add({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_add_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -189,7 +189,7 @@ func.func @test_call_zdnn_sub() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_sub
-// CHECK: {{.*}} = llvm.call @zdnn_sub({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_sub_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -204,7 +204,7 @@ func.func @test_call_zdnn_mul() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_mul
-// CHECK: {{.*}} = llvm.call @zdnn_mul({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_mul_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -219,7 +219,7 @@ func.func @test_call_zdnn_div() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_div
-// CHECK: {{.*}} = llvm.call @zdnn_div({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_div_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -234,7 +234,7 @@ func.func @test_call_zdnn_softmax() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_softmax
-// CHECK: {{.*}} = llvm.call @zdnn_softmax({{.*}}, {{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_softmax_ext({{.*}}, {{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> i32
 }
 
 // -----
@@ -282,7 +282,7 @@ func.func @test_call_zdnn_min() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_min
-// CHECK: {{.*}} = llvm.call @zdnn_min({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_min_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -297,7 +297,7 @@ func.func @test_call_zdnn_max() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_max
-// CHECK: {{.*}} = llvm.call @zdnn_max({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_max_ext({{.*}}, {{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -311,7 +311,7 @@ func.func @test_call_zdnn_exp() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_exp
-// CHECK: {{.*}} = llvm.call @zdnn_exp({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_exp_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
@@ -325,7 +325,7 @@ func.func @test_call_zdnn_log() -> () {
   return
 
 // CHECK-LABEL: test_call_zdnn_log
-// CHECK: {{.*}} = llvm.call @zdnn_log({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
+// CHECK: {{.*}} = llvm.call @zdnn_log_ext({{.*}}, {{.*}}) : (!llvm.ptr, !llvm.ptr) -> i32
 }
 
 // -----
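For context, a minimal calling sketch follows. It is not part of the diff: the setup calls follow the public zdnn API, the 2D/FP32 shape is an arbitrary choice for illustration, and error handling is collapsed into asserts. Since each `_ext` wrapper keeps the signature of its zdnn counterpart, only the call site changes:

#include <assert.h>
#include <stdint.h>

#include "zDNNExtension.h"
#include "zdnn.h"

// Element-wise add of two dim2 x dim1 FP32 matrices through the splitting
// wrapper. zdnn_add_ext splits along e2 when OMZTensorSplitEnabled is set
// and otherwise falls through to plain zdnn_add.
static void add_example(
    const float *a, const float *b, float *out, uint32_t dim2, uint32_t dim1) {
  zdnn_tensor_desc preDesc, transDesc;
  zdnn_ztensor za, zb, zy;
  zdnn_init_pre_transformed_desc(ZDNN_2D, FP32, &preDesc, dim2, dim1);
  assert(zdnn_generate_transformed_desc(&preDesc, &transDesc) == ZDNN_OK);
  assert(zdnn_init_ztensor_with_malloc(&preDesc, &transDesc, &za) == ZDNN_OK);
  assert(zdnn_init_ztensor_with_malloc(&preDesc, &transDesc, &zb) == ZDNN_OK);
  assert(zdnn_init_ztensor_with_malloc(&preDesc, &transDesc, &zy) == ZDNN_OK);
  // Stickify the inputs, run the wrapped op, and unstickify the result.
  assert(zdnn_transform_ztensor(&za, (void *)a) == ZDNN_OK);
  assert(zdnn_transform_ztensor(&zb, (void *)b) == ZDNN_OK);
  assert(zdnn_add_ext(&za, &zb, &zy) == ZDNN_OK);
  assert(zdnn_transform_origtensor(&zy, out) == ZDNN_OK);
}

When splitting is disabled, or e2 fits in a single chunk, the wrapper degenerates to one zdnn_add call, so the sketch behaves identically on both paths.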