diff options
| author | padivedi <padivedi@amd.com> | 2026-04-28 13:00:46 +0530 |
|---|---|---|
| committer | padivedi <padivedi@amd.com> | 2026-04-28 13:00:46 +0530 |
| commit | cad0ef5a3fae7d37a1a00de822f14aa342596178 (patch) | |
| tree | 5425f44f61b026b4f542419493dad696e0a3c76c | |
| parent | 60e9465ae09fee21bf11805b120519946e99860b (diff) | |
| download | llvm-users/Pankajdwivedi-25/LCOMPILER-1697-dpp-permlane-lowering.tar.gz llvm-users/Pankajdwivedi-25/LCOMPILER-1697-dpp-permlane-lowering.tar.bz2 llvm-users/Pankajdwivedi-25/LCOMPILER-1697-dpp-permlane-lowering.zip | |
[AMDGPU] Convert ds_bpermute/wave_shuffle XOR patterns to DPP row_xmask and permlanex16 (branch: users/Pankajdwivedi-25/LCOMPILER-1697-dpp-permlane-lowering)
4 files changed, 163 insertions, 203 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 463d63a88f69..ea70bd0d7e26 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -572,19 +572,69 @@ static bool isThreadID(const GCNSubtarget &ST, Value *V) { return false; } +// Emit a DPP row_xmask operation: each lane reads from lane (self ^ XorMask) +// within a 16-lane row. Replaces II with llvm.amdgcn.update.dpp. +static Instruction *emitDPPRowXmask(InstCombiner &IC, IntrinsicInst &II, + Value *Val, uint64_t XorMask) { + IRBuilderBase &B = IC.Builder; + CallInst *UpdateDPP = + B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Val->getType(), + {PoisonValue::get(Val->getType()), Val, + B.getInt32(AMDGPU::DPP::ROW_XMASK0 | XorMask), + B.getInt32(0xF), B.getInt32(0xF), B.getTrue()}); + UpdateDPP->takeName(&II); + UpdateDPP->copyMetadata(II); + return IC.replaceInstUsesWith(II, UpdateDPP); +} + +// Emit a v_permlanex16 with identity lane select: each lane reads from the +// same-numbered lane in the other 16-lane row, equivalent to XOR with 16. +// Replaces II with llvm.amdgcn.permlanex16. +static Instruction *emitPermLaneX16(InstCombiner &IC, IntrinsicInst &II, + Value *Val) { + IRBuilderBase &B = IC.Builder; + CallInst *PermLane = B.CreateIntrinsic( + Intrinsic::amdgcn_permlanex16, Val->getType(), + {PoisonValue::get(Val->getType()), Val, B.getInt32(0x76543210), + B.getInt32(0xFEDCBA98), B.getFalse(), B.getFalse()}); + PermLane->takeName(&II); + PermLane->copyMetadata(II); + return IC.replaceInstUsesWith(II, PermLane); +} + // Attempt to capture situations where the index argument matches -// a DPP pattern, and convert to a DPP-based mov +// a DPP or cross-lane VALU pattern, and convert to the faster instruction. 
static std::optional<Instruction *> tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) { Value *Val = II.getArgOperand(0); Value *Idx = II.getArgOperand(1); - auto &B = IC.Builder; - // DPP16 Row Share requires known wave size, architecture support - if (!ST.isWaveSizeKnown() || !ST.hasDPPRowShare()) + if (!ST.isWaveSizeKnown() || !ST.hasDPP()) return std::nullopt; Value *Tid; + uint64_t XorMask; + + // DPP16 Row Xmask: Idx = Tid ^ ConstMask (where ConstMask is 1-15) + // row_xmask:N performs XOR within each 16-lane row, which is equivalent to + // __shfl_xor(val, N) for N < 16 since XOR only affects the low 4 bits. + if (match(Idx, m_Xor(m_Value(Tid), m_ConstantInt(XorMask))) && + isThreadID(ST, Tid) && XorMask >= 1 && XorMask <= 15 && + ST.getGeneration() >= AMDGPUSubtarget::GFX10) + return emitDPPRowXmask(IC, II, Val, XorMask); + + // Permlanex16: Idx = Tid ^ 16 + // v_permlanex16 performs a gather from the other 16-lane row within each + // 32-lane half, which is equivalent to __shfl_xor(val, 16). + if (match(Idx, m_Xor(m_Value(Tid), m_SpecificInt(16))) && + isThreadID(ST, Tid) && ST.hasPermLaneX16()) + return emitPermLaneX16(IC, II, Val); + + // DPP16 Row Share requires architecture support + if (!ST.hasDPPRowShare()) + return std::nullopt; + + IRBuilderBase &B = IC.Builder; uint64_t Mask; uint64_t RowIdx; bool CanDPP16RowShare = false; @@ -636,6 +686,57 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) { return std::nullopt; } +// Match ds_bpermute((tid ^ const) << 2, val) patterns and convert to VALU +// cross-lane operations: DPP row_xmask (masks 1-15) or v_permlanex16 +// (mask 16). InstCombine may canonicalize shl(xor(tid, N), 2) into +// xor(shl(tid, 2), N*4), so both forms are matched. 
+static std::optional<Instruction *> +tryBPermuteToVALU(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) { + Value *Index = II.getArgOperand(0); + Value *Val = II.getArgOperand(1); + + if (!ST.hasDPP() || !ST.isWaveSizeKnown()) + return std::nullopt; + + Value *Tid; + uint64_t XorMask = 0; + + // Form A (canonical): xor(shl(tid, 2), Const) where Const = mask * 4 + Value *ShiftedTid; + uint64_t XorConst; + if (match(Index, m_Xor(m_Value(ShiftedTid), m_ConstantInt(XorConst)))) { + uint64_t ShiftAmt; + if (match(ShiftedTid, m_Shl(m_Value(Tid), m_ConstantInt(ShiftAmt))) && + ShiftAmt == 2 && isThreadID(ST, Tid) && (XorConst & 3) == 0) + XorMask = XorConst >> 2; + } + + // Form B (pre-canonical): shl(xor(tid, mask), 2) + if (!XorMask) { + Value *XorResult; + uint64_t ShiftAmt; + if (match(Index, m_Shl(m_Value(XorResult), m_ConstantInt(ShiftAmt))) && + ShiftAmt == 2 && + match(XorResult, m_Xor(m_Value(Tid), m_ConstantInt(XorMask))) && + isThreadID(ST, Tid)) { + // XorMask is set by the match + } else { + return std::nullopt; + } + } + + // DPP row_xmask for masks 1-15 + if (XorMask >= 1 && XorMask <= 15 && + ST.getGeneration() >= AMDGPUSubtarget::GFX10) + return emitDPPRowXmask(IC, II, Val, XorMask); + + // v_permlanex16 for mask 16 + if (XorMask == 16 && ST.hasPermLaneX16()) + return emitPermLaneX16(IC, II, Val); + + return std::nullopt; +} + Instruction * GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC, IntrinsicInst &II) const { @@ -1572,10 +1673,17 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { simplifyDemandedLaneMaskArg(IC, II, 1)) return &II; - // If the lane argument of bpermute is uniform, change it to readlane. This - // generates better code and can enable further optimizations because - // readlane is AlwaysUniform. 
if (IID == Intrinsic::amdgcn_ds_bpermute) { + // Try to convert bpermute with XOR-based lane index into VALU + // cross-lane ops: DPP row_xmask (masks 1-15) or v_permlanex16 + // (mask 16). + if (std::optional<Instruction *> VALUResult = + tryBPermuteToVALU(*ST, IC, II)) + return *VALUResult; + + // If the lane argument of bpermute is uniform, change it to readlane. + // This generates better code and can enable further optimizations because + // readlane is AlwaysUniform. const Use &Lane = II.getArgOperandUse(0); if (isTriviallyUniform(Lane)) { Value *NewLane = IC.Builder.CreateLShr(Lane, 2); diff --git a/llvm/test/CodeGen/AMDGPU/bpermute-xor-dpp-combine.ll b/llvm/test/CodeGen/AMDGPU/bpermute-xor-dpp-combine.ll index c3df0b7c8e14..55fbd8d327c1 100644 --- a/llvm/test/CodeGen/AMDGPU/bpermute-xor-dpp-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/bpermute-xor-dpp-combine.ll @@ -15,25 +15,13 @@ define float @wave_shuffle_xor1_fadd(float %val, float %other) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_xor_b32_e32 v2, 1, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX12-NEXT: ds_bpermute_b32 v0, v2, v0 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: wave_shuffle_xor1_fadd: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_f32_dpp v0, v0, 
v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %xor = xor i32 %tid, 1 @@ -51,25 +39,13 @@ define float @wave_shuffle_xor4_fmul(float %val, float %other) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_xor_b32_e32 v2, 4, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX12-NEXT: ds_bpermute_b32 v0, v2, v0 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX12-NEXT: v_mul_f32_dpp v0, v0, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: wave_shuffle_xor4_fmul: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v2, 4, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_dpp v0, v0, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %xor = xor i32 %tid, 4 @@ -87,25 +63,13 @@ define float @bpermute_xor2_fadd(float %val, float %other) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX12-NEXT: v_xor_b32_e32 v2, 8, v2 -; GFX12-NEXT: ds_bpermute_b32 v0, v2, v0 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX12-NEXT: v_add_f32_dpp v0, v0, v1 
row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: bpermute_xor2_fadd: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: v_xor_b32_e32 v2, 8, v2 -; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %xor = xor i32 %tid, 2 @@ -126,25 +90,13 @@ define float @bpermute_xor8_fsub(float %val, float %other) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX12-NEXT: v_xor_b32_e32 v2, 32, v2 -; GFX12-NEXT: ds_bpermute_b32 v0, v2, v0 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX12-NEXT: v_subrev_f32_dpp v0, v0, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: bpermute_xor8_fsub: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: v_xor_b32_e32 v2, 32, v2 -; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_sub_f32_e32 v0, v1, v0 +; GFX11-NEXT: v_subrev_f32_dpp v0, v0, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %xor = 
xor i32 %tid, 8 @@ -165,29 +117,17 @@ define float @wave_shuffle_xor1_fma_like(float %val, float %sign) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_xor_b32_e32 v2, 1, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX12-NEXT: ds_bpermute_b32 v2, v2, v0 -; GFX12-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_mul_f32_e32 v1, v1, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX12-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: wave_shuffle_xor1_fma_like: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v2, 1, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: ds_bpermute_b32 v2, v2, v0 -; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %xor = xor i32 %tid, 1 @@ -206,24 +146,19 @@ define float @wave_shuffle_xor16_fadd(float %val, float %other) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_xor_b32_e32 v2, 16, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX12-NEXT: ds_bpermute_b32 v0, 
v2, v0 -; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_mov_b32 s0, 0x76543210 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, 0xfedcba98 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: wave_shuffle_xor16_fadd: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, 0x76543210 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, 0xfedcba98 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-bpermute-xor-to-dpp.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-bpermute-xor-to-dpp.ll index 0205d082f498..6bb3b8dcfc4a 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-bpermute-xor-to-dpp.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-bpermute-xor-to-dpp.ll @@ -8,18 +8,12 @@ define i32 @test_bpermute_xor1(i32 %val) { ; GFX1200-W32-LABEL: define i32 @test_bpermute_xor1( ; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { -; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 4 -; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, 
i32 15, i1 true) ; GFX1200-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W32-LABEL: define i32 @test_bpermute_xor1( ; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { -; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 4 -; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, i32 15, i1 true) ; GFX11-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W64-LABEL: define i32 @test_bpermute_xor1( @@ -49,18 +43,12 @@ define i32 @test_bpermute_xor1(i32 %val) { define i32 @test_bpermute_xor2(i32 %val) { ; GFX1200-W32-LABEL: define i32 @test_bpermute_xor2( ; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 8 -; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 354, i32 15, i32 15, i1 true) ; GFX1200-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W32-LABEL: define i32 @test_bpermute_xor2( ; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 8 -; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 354, 
i32 15, i32 15, i1 true) ; GFX11-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W64-LABEL: define i32 @test_bpermute_xor2( @@ -90,18 +78,12 @@ define i32 @test_bpermute_xor2(i32 %val) { define i32 @test_bpermute_xor4(i32 %val) { ; GFX1200-W32-LABEL: define i32 @test_bpermute_xor4( ; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16 -; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true) ; GFX1200-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W32-LABEL: define i32 @test_bpermute_xor4( ; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16 -; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true) ; GFX11-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W64-LABEL: define i32 @test_bpermute_xor4( @@ -131,18 +113,12 @@ define i32 @test_bpermute_xor4(i32 %val) { define i32 @test_bpermute_xor8(i32 %val) { ; GFX1200-W32-LABEL: define i32 @test_bpermute_xor8( ; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 32 -; GFX1200-W32-NEXT: [[RESULT:%.*]] = 
call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 360, i32 15, i32 15, i1 true) ; GFX1200-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W32-LABEL: define i32 @test_bpermute_xor8( ; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 32 -; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 360, i32 15, i32 15, i1 true) ; GFX11-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W64-LABEL: define i32 @test_bpermute_xor8( @@ -172,18 +148,12 @@ define i32 @test_bpermute_xor8(i32 %val) { define i32 @test_bpermute_xor15(i32 %val) { ; GFX1200-W32-LABEL: define i32 @test_bpermute_xor15( ; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 60 -; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 367, i32 15, i32 15, i1 true) ; GFX1200-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W32-LABEL: define i32 @test_bpermute_xor15( ; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 60 -; GFX11-W32-NEXT: 
[[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 367, i32 15, i32 15, i1 true) ; GFX11-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W64-LABEL: define i32 @test_bpermute_xor15( @@ -213,18 +183,12 @@ define i32 @test_bpermute_xor15(i32 %val) { define i32 @test_bpermute_xor16(i32 %val) { ; GFX1200-W32-LABEL: define i32 @test_bpermute_xor16( ; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64 -; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false) ; GFX1200-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W32-LABEL: define i32 @test_bpermute_xor16( ; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64 -; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false) ; GFX11-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W64-LABEL: define i32 @test_bpermute_xor16( @@ -254,27 +218,17 @@ define i32 @test_bpermute_xor16(i32 %val) { define i32 @test_bpermute_xor4_wave64(i32 %val) { ; GFX1200-W32-LABEL: define i32 @test_bpermute_xor4_wave64( ; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX1200-W32-NEXT: [[LO:%.*]] 
= call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LO]], 2 -; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16 -; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true) ; GFX1200-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W32-LABEL: define i32 @test_bpermute_xor4_wave64( ; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX11-W32-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LO]], 2 -; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16 -; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true) ; GFX11-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W64-LABEL: define i32 @test_bpermute_xor4_wave64( ; GFX11-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX11-W64-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W64-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) -; GFX11-W64-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX11-W64-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16 -; GFX11-W64-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W64-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true) ; GFX11-W64-NEXT: ret i32 [[RESULT]] ; ; GFX9-LABEL: define i32 @test_bpermute_xor4_wave64( @@ -298,27 +252,17 @@ define i32 @test_bpermute_xor4_wave64(i32 %val) { define i32 @test_bpermute_xor16_wave64(i32 %val) { ; GFX1200-W32-LABEL: 
define i32 @test_bpermute_xor16_wave64( ; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX1200-W32-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LO]], 2 -; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64 -; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false) ; GFX1200-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W32-LABEL: define i32 @test_bpermute_xor16_wave64( ; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX11-W32-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LO]], 2 -; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64 -; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false) ; GFX11-W32-NEXT: ret i32 [[RESULT]] ; ; GFX11-W64-LABEL: define i32 @test_bpermute_xor16_wave64( ; GFX11-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; GFX11-W64-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; GFX11-W64-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) -; GFX11-W64-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2 -; GFX11-W64-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64 -; GFX11-W64-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]]) +; GFX11-W64-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false) ; GFX11-W64-NEXT: ret i32 [[RESULT]] ; ; GFX9-LABEL: 
define i32 @test_bpermute_xor16_wave64( diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll index 79362b8856a9..908b04955de8 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll @@ -242,17 +242,12 @@ define i32 @test_wave_shuffle_not_quite_row_share(i32 %val) { define i32 @test_wave_shuffle_xor1(i32 %val) { ; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor1( ; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 1 -; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]]) +; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, i32 15, i1 true) ; CHECK-W32-NEXT: ret i32 [[RES]] ; ; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor1( ; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) -; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 1 -; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]]) +; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, i32 15, i1 true) ; CHECK-W64-NEXT: ret i32 [[RES]] ; ; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_xor1( @@ -274,17 +269,12 @@ define i32 @test_wave_shuffle_xor1(i32 %val) { define i32 @test_wave_shuffle_xor4(i32 %val) { ; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor4( ; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 
33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 4 -; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]]) +; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true) ; CHECK-W32-NEXT: ret i32 [[RES]] ; ; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor4( ; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) -; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 4 -; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]]) +; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true) ; CHECK-W64-NEXT: ret i32 [[RES]] ; ; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_xor4( @@ -306,17 +296,12 @@ define i32 @test_wave_shuffle_xor4(i32 %val) { define i32 @test_wave_shuffle_xor15(i32 %val) { ; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor15( ; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 15 -; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]]) +; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 367, i32 15, i32 15, i1 true) ; CHECK-W32-NEXT: ret i32 [[RES]] ; ; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor15( ; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 
[[LO]]) -; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 15 -; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]]) +; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 367, i32 15, i32 15, i1 true) ; CHECK-W64-NEXT: ret i32 [[RES]] ; ; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_xor15( @@ -338,17 +323,12 @@ define i32 @test_wave_shuffle_xor15(i32 %val) { define i32 @test_wave_shuffle_xor16(i32 %val) { ; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor16( ; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 16 -; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]]) +; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false) ; CHECK-W32-NEXT: ret i32 [[RES]] ; ; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor16( ; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) -; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 16 -; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]]) +; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false) ; CHECK-W64-NEXT: ret i32 [[RES]] ; ; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_xor16( @@ -370,9 +350,7 @@ define i32 @test_wave_shuffle_xor16(i32 %val) { define i32 @test_wave_shuffle_xor1_lo_only(i32 %val) { ; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor1_lo_only( ; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] 
{ -; CHECK-W32-NEXT: [[TID:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 1 -; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]]) +; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, i32 15, i1 true) ; CHECK-W32-NEXT: ret i32 [[RES]] ; ; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor1_lo_only( @@ -399,17 +377,12 @@ define i32 @test_wave_shuffle_xor1_lo_only(i32 %val) { define float @test_wave_shuffle_xor2_float(float %val) { ; CHECK-W32-LABEL: define float @test_wave_shuffle_xor2_float( ; CHECK-W32-SAME: float [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 2 -; CHECK-W32-NEXT: [[RES:%.*]] = tail call float @llvm.amdgcn.wave.shuffle.f32(float [[VAL]], i32 [[XOR]]) +; CHECK-W32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float poison, float [[VAL]], i32 354, i32 15, i32 15, i1 true) ; CHECK-W32-NEXT: ret float [[RES]] ; ; CHECK-W64-LABEL: define float @test_wave_shuffle_xor2_float( ; CHECK-W64-SAME: float [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) -; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 2 -; CHECK-W64-NEXT: [[RES:%.*]] = tail call float @llvm.amdgcn.wave.shuffle.f32(float [[VAL]], i32 [[XOR]]) +; CHECK-W64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float poison, float [[VAL]], i32 354, i32 15, i32 15, i1 true) ; CHECK-W64-NEXT: ret float [[RES]] ; ; CHECK-NO-WAVE-SIZE-LABEL: define float @test_wave_shuffle_xor2_float( |
