diff options
author | Brox Chen <guochen2@amd.com> | 2025-04-02 16:08:26 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-04-02 16:08:26 -0400 |
commit | fb0e7b5f161118a24eeef39b05882f6950be43c0 (patch) | |
tree | 81d127f59c7838bb8e4540c76a6fabe2a4dd1d19 | |
parent | 6f1347d57bdaed75b73b2013a96a4a69c8969ebe (diff) | |
download | llvm-fb0e7b5f161118a24eeef39b05882f6950be43c0.zip llvm-fb0e7b5f161118a24eeef39b05882f6950be43c0.tar.gz llvm-fb0e7b5f161118a24eeef39b05882f6950be43c0.tar.bz2 |
[AMDGPU][True16][CodeGen] Implement sgpr folding in true16 (#128929)
We haven't implemented 16-bit SGPRs yet. For now, allow 32-bit SGPRs to be
folded into True16 instructions taking 16-bit values. Also use
sgpr_32 when Imm is copied to sgpr_lo16 so it can be further folded.
This improves generated code quality.
27 files changed, 764 insertions, 1055 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 46bd5d8..2bfc37b 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -12,8 +12,11 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineOperand.h" @@ -576,6 +579,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { } MachineOperand *New = Fold.OpToFold; + // Rework once the VS_16 register class is updated to include proper + // 16-bit SGPRs instead of 32-bit ones. + if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg())) + Old.setSubReg(AMDGPU::NoSubRegister); Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI); Old.setIsUndef(New->isUndef()); return true; @@ -947,9 +954,15 @@ void SIFoldOperandsImpl::foldOperand( return; // FIXME: Fold operands with subregs. - if (UseOp->isReg() && OpToFold.isReg() && - (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister)) - return; + if (UseOp->isReg() && OpToFold.isReg()) { + if (UseOp->isImplicit()) + return; + // Allow folding from SGPRs to 16-bit VGPRs. 
+ if (UseOp->getSubReg() != AMDGPU::NoSubRegister && + (UseOp->getSubReg() != AMDGPU::lo16 || + !TRI->isSGPRReg(*MRI, OpToFold.getReg()))) + return; + } // Special case for REG_SEQUENCE: We can't fold literals into // REG_SEQUENCE instructions, so we have to fold them into the @@ -1040,6 +1053,14 @@ void SIFoldOperandsImpl::foldOperand( } } + // Allow immediates COPYd into sgpr_lo16 to be further folded while + // still being legal if not further folded + if (DestRC == &AMDGPU::SGPR_LO16RegClass) { + assert(ST->useRealTrue16Insts()); + MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass); + DestRC = &AMDGPU::SGPR_32RegClass; + } + // In order to fold immediates into copies, we need to change the // copy to a MOV. @@ -1073,9 +1094,43 @@ void SIFoldOperandsImpl::foldOperand( UseMI->getOperand(0).getReg().isVirtual() && !UseMI->getOperand(1).getSubReg()) { LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI); + unsigned Size = TII->getOpSize(*UseMI, 1); Register UseReg = OpToFold.getReg(); UseMI->getOperand(1).setReg(UseReg); - UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); + unsigned SubRegIdx = OpToFold.getSubReg(); + // Hack to allow 32-bit SGPRs to be folded into True16 instructions + // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the + // VS_16RegClass + // + // Excerpt from AMDGPUGenRegisterInfo.inc + // NoSubRegister, //0 + // hi16, // 1 + // lo16, // 2 + // sub0, // 3 + // ... + // sub1, // 11 + // sub1_hi16, // 12 + // sub1_lo16, // 13 + static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed"); + if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isSGPRReg(*MRI, UseReg)) { + // Produce the 32 bit subregister index to which the 16-bit subregister + // is aligned. 
+ if (SubRegIdx > AMDGPU::sub1) { + LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx); + M |= M.getLane(M.getHighestLane() - 1); + SmallVector<unsigned, 4> Indexes; + TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M, + Indexes); + assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover"); + SubRegIdx = Indexes[0]; + // 32-bit registers do not have a sub0 index + } else if (TII->getOpSize(*UseMI, 1) == 4) + SubRegIdx = 0; + else + SubRegIdx = AMDGPU::sub0; + } + UseMI->getOperand(1).setSubReg(SubRegIdx); UseMI->getOperand(1).setIsKill(false); CopiesToReplace.push_back(UseMI); OpToFold.setIsKill(false); @@ -1713,6 +1768,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy( if (OpToFold.isReg() && !OpToFold.getReg().isVirtual()) return false; + // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt + // Can remove this code if proper 16-bit SGPRs are implemented + // Example: Pre-peephole-opt + // %29:sgpr_lo16 = COPY %16.lo16:sreg_32 + // %32:sreg_32 = COPY %29:sgpr_lo16 + // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32 + // Post-peephole-opt and DCE + // %32:sreg_32 = COPY %16.lo16:sreg_32 + // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32 + // After this transform + // %32:sreg_32 = COPY %16:sreg_32 + // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32 + // After the fold operands pass + // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32 + if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() && + OpToFold.getSubReg()) { + const TargetRegisterClass *DstRC = + MRI->getRegClass(MI.getOperand(0).getReg()); + if (DstRC == &AMDGPU::SReg_32RegClass && + DstRC == MRI->getRegClass(OpToFold.getReg())) { + assert(OpToFold.getSubReg() == AMDGPU::lo16); + OpToFold.setSubReg(0); + } + } + // Prevent folding operands backwards in the function. 
For example, // the COPY opcode must not be replaced by 1 in this example: // diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 170e794..071f55c 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -776,6 +776,7 @@ let SubtargetPredicate = isGFX11Plus in { // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, [], /*VOP1Only=*/ 1>; + let isAsCheapAsAMove = 1 in defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 375ae0d..8582b61 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -38819,16 +38819,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11TRUE16-LABEL: s_select_v2bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; @@ -38936,19 +38934,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX11TRUE16-LABEL: s_vselect_v2bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; ; GFX11FAKE16-LABEL: s_vselect_v2bf16: @@ -40655,30 +40651,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX11TRUE16-LABEL: s_vselect_v4bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 -; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16 -; 
GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1 -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6 -; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1 +; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4 -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; ; GFX11FAKE16-LABEL: s_vselect_v4bf16: diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 4787f21..5730d75 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -303,18 +303,32 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr 
addrspace(1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: test_bswap_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203 -; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_endpgm +; GFX11-REAL16-LABEL: test_bswap_i64: +; GFX11-REAL16: ; %bb.0: +; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s2, 0x10203 +; GFX11-REAL16-NEXT: s_mov_b32 s2, -1 +; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-REAL16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-REAL16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_bswap_i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %in, align 8 %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone store i64 %bswap, ptr addrspace(1) %out, align 8 @@ -364,20 +378,36 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; 
VI-NEXT: s_endpgm ; -; GFX11-LABEL: test_bswap_v2i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_perm_b32 v3, 0, s6, 0x10203 -; GFX11-NEXT: v_perm_b32 v2, 0, s7, 0x10203 -; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203 -; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: s_endpgm +; GFX11-REAL16-LABEL: test_bswap_v2i64: +; GFX11-REAL16: ; %bb.0: +; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-REAL16-NEXT: s_mov_b32 s2, -1 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203 +; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203 +; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-REAL16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_bswap_v2i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s6, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s7, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_endpgm %val = load <2 x i64>, ptr addrspace(1) %in, 
align 16 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone store <2 x i64> %bswap, ptr addrspace(1) %out, align 16 @@ -445,26 +475,49 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: test_bswap_v4i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_perm_b32 v7, 0, s6, 0x10203 -; GFX11-NEXT: v_perm_b32 v6, 0, s7, 0x10203 -; GFX11-NEXT: v_perm_b32 v5, 0, s4, 0x10203 -; GFX11-NEXT: v_perm_b32 v4, 0, s5, 0x10203 -; GFX11-NEXT: v_perm_b32 v3, 0, s2, 0x10203 -; GFX11-NEXT: v_perm_b32 v2, 0, s3, 0x10203 -; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203 -; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-REAL16-LABEL: test_bswap_v4i64: +; GFX11-REAL16: ; %bb.0: +; GFX11-REAL16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX11-REAL16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-REAL16-NEXT: s_mov_b32 s10, -1 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203 +; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203 +; GFX11-REAL16-NEXT: v_perm_b32 v4, 0, s0, 0x10203 +; GFX11-REAL16-NEXT: v_perm_b32 v6, 0, s2, 0x10203 +; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX11-REAL16-NEXT: v_mov_b32_e32 v5, v4 +; GFX11-REAL16-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-REAL16-NEXT: s_clause 0x1 +; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 offset:16 +; GFX11-REAL16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 +; GFX11-REAL16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_bswap_v4i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_perm_b32 v7, 0, s6, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, 0, s7, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, 0, s4, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, 0, s5, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s2, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s3, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s0, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s1, 0x10203 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm %val = load <4 x i64>, ptr addrspace(1) %in, align 32 %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone store <4 x i64> %bswap, ptr addrspace(1) %out, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index fdbe59c..8aab9ec 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -216,34 +216,19 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: extract_vector_elt_v3f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: extract_vector_elt_v3f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: extract_vector_elt_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 +; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 @@ -284,35 +269,20 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: 
dynamic_extract_vector_elt_v3f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s6, 4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: dynamic_extract_vector_elt_v3f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s6, 4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s4, s6, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 %idx %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 store half %p0, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll 
b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 93e2d07..32f75f3 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -297,10 +297,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |v0.l|, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |s2|, s3 ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 9063af4..9ef4858 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -175,9 +175,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, s2, s2 ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 7e4b125..4f774867 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -60,34 +60,19 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_copysign_f16: -; GFX11-TRUE16: ; %bb.0: 
-; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_copysign_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 -; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_copysign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %out = call half @llvm.copysign.f16(half %mag, half %sign) store half %out, ptr addrspace(1) %arg_out ret void @@ -1928,122 +1913,63 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: 
s_copysign_out_f16_mag_f64_sign_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s6, 0xffe -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-TRUE16-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX11-TRUE16-NEXT: s_sub_i32 s3, 0x3f1, s2 -; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0 -; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13 -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s2, 12 -; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x1000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, s3, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 7, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v3 :: v_dual_add_nc_u32 v1, v1, v2 -; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s6, 0xffe -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-FAKE16-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX11-FAKE16-NEXT: s_sub_i32 s3, 0x3f1, s2 -; GFX11-FAKE16-NEXT: 
s_addk_i32 s2, 0xfc10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0 -; GFX11-FAKE16-NEXT: v_med3_i32 v1, s3, 0, 13 -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s2, 12 -; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x1000, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, s3, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 2, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0x7e00 :: v_dual_add_nc_u32 v1, v1, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e32 v0, 0x7c00, v2, vcc_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_mov_b32 v1, 0 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 -; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX11-NEXT: s_lshr_b32 s6, s3, 8 +; GFX11-NEXT: s_or_b32 s2, s5, s2 +; GFX11-NEXT: s_and_b32 s5, s6, 0xffe +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014 +; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2 +; GFX11-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 +; GFX11-NEXT: s_lshl_b32 s3, s2, 12 +; GFX11-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_lt_i32 s2, 31 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 7, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0x7e00 :: v_dual_add_nc_u32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %mag.trunc = fptrunc double %mag to half %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) store half %result, ptr addrspace(1) %arg_out @@ -2114,44 +2040,24 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_copysign_v2f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 
0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_copysign_v2f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_copysign_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_endpgm %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign) store <2 x half> %out, ptr addrspace(1) %arg_out ret void @@ -2244,24 +2150,23 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v3, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b16 v4, v0, s[4:5] offset:4 ; GFX11-TRUE16-NEXT: global_store_b32 v4, v1, s[4:5] @@ -2391,62 +2296,31 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_copysign_v4f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v6, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; 
GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4 -; GFX11-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_copysign_v4f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 -; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2 -; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4 -; GFX11-FAKE16-NEXT: global_store_b64 v5, v[0:1], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_copysign_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX11-NEXT: 
s_lshr_b32 s6, s1, 16 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign) store <4 x half> %out, ptr addrspace(1) %arg_out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 9642b36ec..67bec43 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -55,10 +55,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v0.h| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e64 v0.l, s3, |s2| ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -132,10 +130,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -|v0.h| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, 
s3, -|s2| ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 6a0d529..498df8a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -620,32 +620,18 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: test_isinf_pattern_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x204 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: test_isinf_pattern_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: test_isinf_pattern_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %fabs = tail call half @llvm.fabs.f16(half %x) #1 %cmp = fcmp oeq half %fabs, 0xH7C00 %ext = zext i1 %cmp to i32 @@ -684,32 +670,18 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: test_isfinite_pattern_0_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x1f8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: test_isfinite_pattern_0_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: test_isfinite_pattern_0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 %ninf = fcmp une half %x.fabs, 0xH7C00 @@ -747,32 +719,18 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: test_isfinite_pattern_4_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x1f8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: test_isfinite_pattern_4_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: test_isfinite_pattern_4_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %ord 
= fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 %ninf = fcmp one half %x.fabs, 0xH7C00 @@ -786,3 +744,6 @@ declare half @llvm.fabs.f16(half) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 59ba9b7..fa358c9 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -393,10 +393,8 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index f84e14e..97a94ed 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -616,11 +616,10 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-TRUE16-NEXT: 
s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index bba3a23..72ddc32 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -614,11 +614,10 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll index 77575c7..6a4ae7f 100644 --- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll @@ -1,8 +1,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s -; FIXME-TRUE16. 
In true16 flow, the codegen introduces addtional s2v copy and mov, and revert the operand order thus picking different cmp instructions -; This should be corrected after addtional mov/copy is removed ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s ;;;==========================================================================;;; @@ -215,7 +213,7 @@ entry: ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_eq_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_eq_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -233,7 +231,7 @@ entry: ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ne_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ne_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -251,7 +249,7 @@ entry: ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_lt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ugt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -269,7 +267,7 @@ entry: ; VI: 
v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_le_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_uge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -287,7 +285,7 @@ entry: ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_gt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ult_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -305,7 +303,7 @@ entry: ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ge_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ule_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -323,7 +321,7 @@ entry: ; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_lt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sgt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 
@llvm.amdgcn.workitem.id.x() @@ -341,7 +339,7 @@ entry: ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_le_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -359,7 +357,7 @@ entry: ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_gt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_slt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -377,7 +375,7 @@ entry: ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ge_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sle_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll index bc4a863..8ca8767 100644 --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -647,10 +647,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: 
s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l ; encoding: [0x80,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -715,10 +713,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0.5, v0.l ; encoding: [0xf0,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -783,10 +779,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 
0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -0.5, v0.l ; encoding: [0xf1,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -851,10 +845,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; encoding: [0xf2,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -919,10 +911,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; 
encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -1.0, v0.l ; encoding: [0xf3,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -987,10 +977,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; encoding: [0xf4,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1055,10 +1043,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: 
[0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -2.0, v0.l ; encoding: [0xf5,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1123,10 +1109,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 4.0, v0.l ; encoding: [0xf6,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1191,10 +1175,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: 
[0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -4.0, v0.l ; encoding: [0xf7,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1455,10 +1437,8 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1, v0.l ; encoding: [0x81,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1523,10 +1503,8 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: 
[0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2, v0.l ; encoding: [0x82,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1591,10 +1569,8 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 16, v0.l ; encoding: [0x90,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1896,10 +1872,8 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: 
[0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 63, v0.l ; encoding: [0xbf,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1964,10 +1938,8 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 64, v0.l ; encoding: [0xc0,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 93052fe..97c97ac8 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -199,9 +199,7 @@ 
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-TRUE16-NEXT: ;;#ASMSTART @@ -356,42 +354,23 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s3 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, s4, s2 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use s3 -; GFX11-TRUE16-NEXT: ;;#ASMEND -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16 -; GFX11-FAKE16-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, s3, s2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: ;;#ASMSTART -; GFX11-FAKE16-NEXT: ; use s3 -; GFX11-FAKE16-NEXT: ;;#ASMEND -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s3, s4, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_lh_b32_b16 s2, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s3 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -468,52 +447,27 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s3 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 -; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use s3 -; GFX11-TRUE16-NEXT: ;;#ASMEND -; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use s2 -; GFX11-TRUE16-NEXT: ;;#ASMEND -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s3, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: ;;#ASMSTART -; GFX11-FAKE16-NEXT: ; use s3 -; GFX11-FAKE16-NEXT: ;;#ASMEND -; GFX11-FAKE16-NEXT: ;;#ASMSTART -; GFX11-FAKE16-NEXT: ; use s2 -; GFX11-FAKE16-NEXT: ;;#ASMEND -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s3, s4, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s3, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: global_store_b32 v0, 
v1, s[0:1] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s3 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s2 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -1792,34 +1746,19 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v4f16_0: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0 -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v4f16_0: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, s4, v0 -; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: v_insertelement_v4f16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s4, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -1978,34 +1917,19 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v4f16_2: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1 -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v4f16_2: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 -; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: 
v_insertelement_v4f16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -2164,34 +2088,19 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v4i16_2: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1 -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v4i16_2: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 -; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: v_insertelement_v4i16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -2583,34 +2492,19 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v8i16_6: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v5, v3 -; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v8i16_6: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 -; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: v_insertelement_v8i16_6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -2799,11 +2693,10 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v5, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s4 ; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 7 ; GFX11-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 @@ -2816,19 +2709,19 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 3 ; GFX11-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 0 -; GFX11-TRUE16-NEXT: 
s_cselect_b32 s4, -1, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 1 ; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v4.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.l, s6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v4.l, s7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s5 -; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, s4, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, s4, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, s4, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, s4, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, s4, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, s4, s9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s4, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, s4, s5 +; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_insertelement_v8f16_dynamic: @@ -3078,45 +2971,24 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v16i16_6: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] -; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] 
offset:16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v9, v3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX11-TRUE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v16i16_6: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] -; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX11-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: v_insertelement_v16i16_6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 
v8, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3443,13 +3315,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 5, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v9, s[2:3] -; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v9, s[2:3] offset:16 +; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 ; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 7 ; GFX11-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 @@ -3478,30 +3349,30 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 11 ; GFX11-TRUE16-NEXT: s_cselect_b32 s17, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 8 -; GFX11-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s18, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 9 ; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v8.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, s4, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v8.l, s12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v8.l, s13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v8.l, s14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v8.l, s15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v8.l, s16 
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v8.l, s17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v8.l, s4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v8.l, s6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v8.l, s7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v8.l, s8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v8.l, s9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v8.l, s11 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, s4, s12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, s4, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, s4, s14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, s4, s15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, s4, s16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, s4, s17 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, s4, s18 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, s4, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, s4, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, s4, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, s4, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, s4, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, s4, s9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s4, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, s4, s11 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b128 v9, v[4:7], s[0:1] offset:16 -; GFX11-TRUE16-NEXT: global_store_b128 v9, v[0:3], s[0:1] +; GFX11-TRUE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-TRUE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll index 07421af..b77b2f7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll @@ -78,20 +78,19 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v2, v0, s[6:7] glc dlc +; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x3c ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbyte_b32 v0, v1, v2, v0.l -; GFX11-TRUE16-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_alignbyte_b32 v0, v1, v0, s2 +; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_alignbyte_b32_2: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll index 839892e..d8e2ce3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll @@ -35,25 +35,15 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; -; GFX11-PACKED-TRUE16-LABEL: 
tbuffer_store_d16_x: -; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body -; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1 -; GFX11-PACKED-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-TRUE16-NEXT: s_endpgm -; -; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body -; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1 -; GFX11-PACKED-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-FAKE16-NEXT: s_endpgm +; GFX11-PACKED-LABEL: tbuffer_store_d16_x: +; GFX11-PACKED: ; %bb.0: ; %main_body +; GFX11-PACKED-NEXT: s_clause 0x1 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) ret void @@ -217,3 +207,6 @@ declare void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half, ptr addrspace(8), i32, declare void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32) +;; NOTE: These prefixes 
are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-PACKED-FAKE16: {{.*}} +; GFX11-PACKED-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index c53c491..052f7f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -39,55 +39,25 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; -; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body -; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1 -; GFX11-PACKED-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-TRUE16-NEXT: s_endpgm -; -; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body -; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1 -; GFX11-PACKED-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-FAKE16-NEXT: s_endpgm -; -; GFX12-PACKED-SDAG-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-SDAG-TRUE16: ; %bb.0: ; %main_body -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; 
GFX12-PACKED-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX12-PACKED-SDAG-TRUE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_endpgm -; -; GFX12-PACKED-SDAG-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-SDAG-FAKE16: ; %bb.0: ; %main_body -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-PACKED-SDAG-FAKE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_endpgm +; GFX11-PACKED-LABEL: tbuffer_store_d16_x: +; GFX11-PACKED: ; %bb.0: ; %main_body +; GFX11-PACKED-NEXT: s_clause 0x1 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_endpgm ; -; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body -; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] -; GFX12-PACKED-GISEL-NEXT: s_endpgm +; GFX12-PACKED-LABEL: tbuffer_store_d16_x: +; GFX12-PACKED: ; %bb.0: ; %main_body +; GFX12-PACKED-NEXT: s_clause 0x1 +; GFX12-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; 
GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] +; GFX12-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) ret void @@ -298,5 +268,9 @@ declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i3 declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32) declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-PACKED-FAKE16: {{.*}} +; GFX11-PACKED-TRUE16: {{.*}} ; GFX12-PACKED-GISEL-FAKE16: {{.*}} ; GFX12-PACKED-GISEL-TRUE16: {{.*}} +; GFX12-PACKED-SDAG-FAKE16: {{.*}} +; GFX12-PACKED-SDAG-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll index 530ace7..fc8f8af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll @@ -38,27 +38,16 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; -; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body -; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1 -; GFX11-PACKED-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX11-PACKED-TRUE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 
format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-TRUE16-NEXT: s_endpgm -; -; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body -; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1 -; GFX11-PACKED-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-FAKE16-NEXT: s_endpgm +; GFX11-PACKED-LABEL: tbuffer_store_d16_x: +; GFX11-PACKED: ; %bb.0: ; %main_body +; GFX11-PACKED-NEXT: s_clause 0x1 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) ret void @@ -242,3 +231,6 @@ declare void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half, ptr addrspace(8), i declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32, i32) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-PACKED-FAKE16: {{.*}} +; GFX11-PACKED-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index bdb8299..d025e7a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -44,60 +44,27 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; -; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body -; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1 -; GFX11-PACKED-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX11-PACKED-TRUE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-TRUE16-NEXT: s_endpgm -; -; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body -; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1 -; GFX11-PACKED-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-FAKE16-NEXT: s_endpgm -; -; GFX12-PACKED-SDAG-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-SDAG-TRUE16: ; %bb.0: ; %main_body -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 
0x10 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX12-PACKED-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX12-PACKED-SDAG-TRUE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_endpgm -; -; GFX12-PACKED-SDAG-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-SDAG-FAKE16: ; %bb.0: ; %main_body -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-PACKED-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX12-PACKED-SDAG-FAKE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_endpgm +; GFX11-PACKED-LABEL: tbuffer_store_d16_x: +; GFX11-PACKED: ; %bb.0: ; %main_body +; GFX11-PACKED-NEXT: s_clause 0x1 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_endpgm ; -; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body -; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null 
format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-GISEL-NEXT: s_endpgm +; GFX12-PACKED-LABEL: tbuffer_store_d16_x: +; GFX12-PACKED: ; %bb.0: ; %main_body +; GFX12-PACKED-NEXT: s_clause 0x1 +; GFX12-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX12-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) ret void @@ -331,5 +298,9 @@ declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32) ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-PACKED-FAKE16: {{.*}} +; GFX11-PACKED-TRUE16: {{.*}} ; GFX12-PACKED-GISEL-FAKE16: {{.*}} ; GFX12-PACKED-GISEL-TRUE16: {{.*}} +; GFX12-PACKED-SDAG-FAKE16: {{.*}} +; GFX12-PACKED-SDAG-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index 26bcd61..18c462f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -77,42 +77,17 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm ; -; GFX11SELDAG-TRUE16-LABEL: sgpr_isnan_f16: -; GFX11SELDAG-TRUE16: ; %bb.0: -; GFX11SELDAG-TRUE16-NEXT: s_clause 0x1 -; GFX11SELDAG-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11SELDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11SELDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2 -; GFX11SELDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11SELDAG-TRUE16-NEXT: s_endpgm -; -; GFX11SELDAG-FAKE16-LABEL: sgpr_isnan_f16: -; GFX11SELDAG-FAKE16: ; %bb.0: -; GFX11SELDAG-FAKE16-NEXT: s_clause 0x1 -; GFX11SELDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11SELDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11SELDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11SELDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3 -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 -; GFX11SELDAG-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11SELDAG-FAKE16-NEXT: s_endpgm -; -; GFX11GLISEL-LABEL: sgpr_isnan_f16: -; GFX11GLISEL: ; %bb.0: -; GFX11GLISEL-NEXT: s_clause 0x1 -; GFX11GLISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11GLISEL-NEXT: s_load_b64 
s[0:1], s[4:5], 0x24 -; GFX11GLISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s2, s2, 3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 -; GFX11GLISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11GLISEL-NEXT: s_endpgm +; GFX11CHECK-LABEL: sgpr_isnan_f16: +; GFX11CHECK: ; %bb.0: +; GFX11CHECK-NEXT: s_clause 0x1 +; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 +; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) +; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 +; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 +; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11CHECK-NEXT: s_endpgm %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3) %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -4311,4 +4286,5 @@ attributes #0 = { "denormal-fp-math"="ieee,preserve-sign" } ; Maybe daz attributes #1 = { "denormal-fp-math"="ieee,dynamic" } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11GLISEL: {{.*}} ; GFX11SELDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 297e4f0..362b9da 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -1083,23 +1083,19 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-TRUE16-LABEL: s_maximum_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 -; GFX11-TRUE16-NEXT: v_pk_max_f16 v2, s0, s1 -; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, s0, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-TRUE16-NEXT: ;;#ASMSTART ; GFX11-TRUE16-NEXT: ; use v0 ; GFX11-TRUE16-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index ffbb9fd..f6d37b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -896,23 +896,19 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-TRUE16-LABEL: s_minimum_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 -; GFX11-TRUE16-NEXT: v_pk_min_f16 v2, s0, s1 -; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v0, s0, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-TRUE16-NEXT: ;;#ASMSTART ; GFX11-TRUE16-NEXT: ; use v0 ; GFX11-TRUE16-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index 452acbc..2f1dfa1 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -790,14 +790,10 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; ; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX11-SDAG-TRUE16: ; %bb.0: -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s0, s2, 16 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s1, s3, 16 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, s1 -; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.l, s2, s3 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s0, s3, 16 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s1, s2, 16 +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v1.l, s1, s0 ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir new file mode 100644 index 0000000..ef6e400 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir @@ -0,0 +1,60 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass si-fold-operands -mattr=+real-true16 -o - %s | FileCheck %s + +--- +name: fold_16bit_subreg_1 +tracksRegLiveness: true +registers: +body: | + 
bb.0.entry: + ; CHECK-LABEL: name: fold_16bit_subreg_1 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed [[DEF1]], 2, [[DEF]].sub1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:sgpr_lo16 = COPY %0.sub1_lo16:sreg_64_xexec + %2:vgpr_16 = COPY %1:sgpr_lo16 + %3:vgpr_16 = IMPLICIT_DEF + %4:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed %3:vgpr_16, 2, killed %2:vgpr_16, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %4 +... + +--- +name: fold_16bit_subreg_0 +tracksRegLiveness: true +registers: +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_16bit_subreg_0 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed [[DEF1]], 2, [[DEF]].sub0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:sgpr_lo16 = COPY %0.lo16:sreg_64_xexec + %2:vgpr_16 = COPY %1:sgpr_lo16 + %3:vgpr_16 = IMPLICIT_DEF + %4:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed %3:vgpr_16, 2, killed %2:vgpr_16, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: sgpr_lo16 +tracksRegLiveness: true +registers: +body: | + bb.0.entry: + ; CHECK-LABEL: name: sgpr_lo16 + ; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, 30, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_t16_e64_]] + %0:sreg_32 = IMPLICIT_DEF + %1:sreg_32 = IMPLICIT_DEF + %2:sreg_32 = S_MOV_B32 30 + %3:sgpr_lo16 = COPY %2.lo16:sreg_32 + %4:vgpr_16 = COPY %3:sgpr_lo16 + %5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec + S_ENDPGM 0, implicit %5 +... diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 12137bd..40a4d4a 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -187,11 +187,8 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; SDAG-GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 -; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff -; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, v0.h, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, s2, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, s3, 0, 0xff ; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SDAG-GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 |