Skip to content

Commit

Permalink
AMDGPU: Add support for v_ashr_pk_i8/u8_i32 instructions for gfx950 (…
Browse files Browse the repository at this point in the history
…#117596)

This patch adds assembly and builtin support for v_ashr_pk_i8/u8_i32
instructions.

Co-authored-by: Sirish Pande <Sirish.Pande@amd.com>
  • Loading branch information
arsenm and srpande authored Nov 26, 2024
1 parent a87d484 commit 5d650a6
Show file tree
Hide file tree
Showing 13 changed files with 194 additions and 1 deletion.
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")

// v_ashr_pk_{i8,u8}_i32 builtins. Both share the prototype string "UsUiUiUi"
// and are gated on the "ashr-pk-insts" target feature rather than directly on
// "gfx950-insts".
// NOTE(review): "UsUiUiUi" presumably decodes to an unsigned short result with
// three unsigned int operands; confirm against the Builtins type encoding.
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")

TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts")

Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
Expand Down
46 changes: 46 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,49 @@ void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, f
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale);
*out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale);
}

// CHECK-LABEL: @test_ashr_pk_i8_i32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
// Calls __builtin_amdgcn_ashr_pk_i8_i32 on three uint operands; the i16 result
// is zero-extended to i32 and stored through out.
void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) {
*out = __builtin_amdgcn_ashr_pk_i8_i32(src0, src1, src2);
}

// CHECK-LABEL: @test_ashr_pk_u8_i32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.u8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
// Calls __builtin_amdgcn_ashr_pk_u8_i32 on three uint operands; the i16 result
// is zero-extended to i32 and stored through out.
void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
*out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
}
10 changes: 10 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3199,6 +3199,16 @@ def int_amdgcn_permlane32_swap :
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.ashr.pk.i8.i32: i16 vdst, i32 src0, i32 src1, i32 src2
// Exposed to Clang as __builtin_amdgcn_ashr_pk_i8_i32. Declared IntrNoMem and
// IntrSpeculatable.
def int_amdgcn_ashr_pk_i8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_i8_i32">,
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.ashr.pk.u8.i32: i16 vdst, i32 src0, i32 src1, i32 src2
// Exposed to Clang as __builtin_amdgcn_ashr_pk_u8_i32. Declared IntrNoMem and
// IntrSpeculatable.
def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -414,12 +414,19 @@ def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp
"Has f16bf16 to fp6bf6 conversion scale instructions"
>;

// Subtarget feature gating the v_ashr_pk_i8_i32 / v_ashr_pk_u8_i32
// instructions; sets the HasAshrPkInsts subtarget field to true.
def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
"HasAshrPkInsts",
"true",
"Has Arithmetic Shift Pack instructions"
>;

def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
"GFX950Insts",
"true",
"Additional instructions for GFX950+",
[FeaturePermlane16Swap,
FeaturePermlane32Swap,
FeatureAshrPkInsts,
FeatureFP8ConversionScaleInsts,
FeatureBF8ConversionScaleInsts,
FeatureFP4ConversionScaleInsts,
Expand Down Expand Up @@ -2474,6 +2481,9 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
AssemblerPredicate<(all_of FeatureXF32Insts)>;

// True when the subtarget reports hasAshrPkInsts(); the assembler form
// requires FeatureAshrPkInsts.
def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;

// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4546,6 +4546,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16:
case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16:
case Intrinsic::amdgcn_ashr_pk_i8_i32:
case Intrinsic::amdgcn_ashr_pk_u8_i32:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasForceStoreSC0SC1 = false;
bool HasRequiredExportPriority = false;
bool HasVmemWriteVgprInOrder = false;
bool HasAshrPkInsts = false;
bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;

bool RequiresCOV6 = false;

// Dummy feature to use for assembler in tablegen.
Expand Down Expand Up @@ -1326,6 +1328,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

bool hasPermlane16Swap() const { return HasPermlane16Swap; }
bool hasPermlane32Swap() const { return HasPermlane32Swap; }
bool hasAshrPkInsts() const { return HasAshrPkInsts; }

bool hasMinimum3Maximum3F32() const {
return HasMinimum3Maximum3F32;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2856,6 +2856,7 @@ def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
def VOP_I16_I32_I32_I32 : VOPProfile <[i16, i32, i32, i32]>;

def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>;
def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>;
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1189,6 +1189,11 @@ let SubtargetPredicate = HasPseudoScalarTrans in {
def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
}

// v_ashr_pk_{i8,u8}_i32: VOP3 instructions with an i16 result and three i32
// sources (VOP_I16_I32_I32_I32), using the VOP3_OPSEL_ONLY feature set. Each is
// selected from its matching llvm.amdgcn.ashr.pk.* intrinsic.
let SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 in {
defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_i8_i32>;
defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_u8_i32>;
} // End SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1

//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -1978,5 +1983,8 @@ defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_b
defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">;
}

// Real (encoded) op_sel forms of v_ashr_pk_{i8,u8}_i32, opcodes 0x265 / 0x266.
defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>;
defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>;

defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3_Real_gfx9<0x252, "v_cvt_scalef32_2xpk16_fp6_f32">;
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3_Real_gfx9<0x253, "v_cvt_scalef32_2xpk16_bf6_f32">;
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1455,6 +1455,7 @@ def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>;
def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>;
def VOP3_PACKED : VOP3Features<1, 1, 1, 0>;
def VOP3_MAI : VOP3Features<0, 0, 0, 1>;
// Like VOP3_OPSEL but with the first field cleared, so only the second
// (op_sel) field is set.
// NOTE(review): field meanings are inferred from the neighbouring
// VOP3_CLAMP / VOP3_OPSEL defs; confirm against the VOP3Features class.
def VOP3_OPSEL_ONLY : VOP3Features<0, 1, 0, 0>;

// Packed is misleading, but it enables the appropriate op_sel
// modifiers.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["prng-inst"] = true;
Features["permlane16-swap"] = true;
Features["permlane32-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["gfx950-insts"] = true;
[[fallthrough]];
case GK_GFX942:
Expand Down
72 changes: 72 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_asm_vop3.s
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,75 @@ v_bitop3_b16 v5, v1, v2, s3 bitop3:161
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_bitop3_b16 v5, v1, v2, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x33,0xd2,0x01,0x05,0x0e,0x30]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, s4, v7, v8
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, v4, 0, 1
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, v4, 3, s2
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, s4, 4, v2
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v2, v4, v7, 0.5
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1]
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, s4, v7, v8
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, v4, 0, 1
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, v4, 3, s2
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, s4, 4, v2
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v2, v4, v7, -2.0
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03]
// GFX12-ERR: error: instruction not supported on this GPU

v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1]
// GFX906-ERR: error: instruction not supported on this GPU
// GFX940-ERR: error: instruction not supported on this GPU
// GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
// GFX12-ERR: error: instruction not supported on this GPU
36 changes: 36 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,42 @@
# GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20]
0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20

# GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04]
0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04

# GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04]
0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04

# GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02]
0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02

# GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00]
0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00

# GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03]
0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03

# GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04]
0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04

# GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04]
0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04

# GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04]
0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04

# GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02]
0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02

# GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00]
0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00

# GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03]
0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03

# GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04

# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04

Expand Down

0 comments on commit 5d650a6

Please sign in to comment.