aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author padivedi <padivedi@amd.com> 2026-04-28 13:00:46 +0530
committer padivedi <padivedi@amd.com> 2026-04-28 13:00:46 +0530
commit cad0ef5a3fae7d37a1a00de822f14aa342596178 (patch)
tree 5425f44f61b026b4f542419493dad696e0a3c76c
parent 60e9465ae09fee21bf11805b120519946e99860b (diff)
download llvm-users/Pankajdwivedi-25/LCOMPILER-1697-dpp-permlane-lowering.tar.gz
llvm-users/Pankajdwivedi-25/LCOMPILER-1697-dpp-permlane-lowering.tar.bz2
llvm-users/Pankajdwivedi-25/LCOMPILER-1697-dpp-permlane-lowering.zip
[AMDGPU] Convert ds_bpermute/wave_shuffle XOR patterns to DPP row_xmask and permlanex16
users/Pankajdwivedi-25/LCOMPILER-1697-dpp-permlane-lowering
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 122
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bpermute-xor-dpp-combine.ll | 103
-rw-r--r-- llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-bpermute-xor-to-dpp.ll | 92
-rw-r--r-- llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll | 49
4 files changed, 163 insertions, 203 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 463d63a88f69..ea70bd0d7e26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -572,19 +572,69 @@ static bool isThreadID(const GCNSubtarget &ST, Value *V) {
return false;
}
+// Emit a DPP row_xmask operation: each lane reads from lane (self ^ XorMask)
+// within a 16-lane row. Replaces II with llvm.amdgcn.update.dpp.
+static Instruction *emitDPPRowXmask(InstCombiner &IC, IntrinsicInst &II,
+ Value *Val, uint64_t XorMask) {
+ IRBuilderBase &B = IC.Builder;
+ CallInst *UpdateDPP =
+ B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Val->getType(),
+ {PoisonValue::get(Val->getType()), Val,
+ B.getInt32(AMDGPU::DPP::ROW_XMASK0 | XorMask),
+ B.getInt32(0xF), B.getInt32(0xF), B.getTrue()});
+ UpdateDPP->takeName(&II);
+ UpdateDPP->copyMetadata(II);
+ return IC.replaceInstUsesWith(II, UpdateDPP);
+}
+
+// Emit a v_permlanex16 with identity lane select: each lane reads from the
+// same-numbered lane in the other 16-lane row, equivalent to XOR with 16.
+// Replaces II with llvm.amdgcn.permlanex16.
+static Instruction *emitPermLaneX16(InstCombiner &IC, IntrinsicInst &II,
+ Value *Val) {
+ IRBuilderBase &B = IC.Builder;
+ CallInst *PermLane = B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, Val->getType(),
+ {PoisonValue::get(Val->getType()), Val, B.getInt32(0x76543210),
+ B.getInt32(0xFEDCBA98), B.getFalse(), B.getFalse()});
+ PermLane->takeName(&II);
+ PermLane->copyMetadata(II);
+ return IC.replaceInstUsesWith(II, PermLane);
+}
+
// Attempt to capture situations where the index argument matches
-// a DPP pattern, and convert to a DPP-based mov
+// a DPP or cross-lane VALU pattern, and convert to the faster instruction.
static std::optional<Instruction *>
tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
Value *Val = II.getArgOperand(0);
Value *Idx = II.getArgOperand(1);
- auto &B = IC.Builder;
- // DPP16 Row Share requires known wave size, architecture support
- if (!ST.isWaveSizeKnown() || !ST.hasDPPRowShare())
+ if (!ST.isWaveSizeKnown() || !ST.hasDPP())
return std::nullopt;
Value *Tid;
+ uint64_t XorMask;
+
+ // DPP16 Row Xmask: Idx = Tid ^ ConstMask (where ConstMask is 1-15)
+ // row_xmask:N performs XOR within each 16-lane row, which is equivalent to
+ // __shfl_xor(val, N) for N < 16 since XOR only affects the low 4 bits.
+ if (match(Idx, m_Xor(m_Value(Tid), m_ConstantInt(XorMask))) &&
+ isThreadID(ST, Tid) && XorMask >= 1 && XorMask <= 15 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX10)
+ return emitDPPRowXmask(IC, II, Val, XorMask);
+
+ // Permlanex16: Idx = Tid ^ 16
+ // With identity lane selects, v_permlanex16 reads the same-numbered lane in
+ // the other 16-lane row of each 32-lane half, i.e. __shfl_xor(val, 16).
+ if (match(Idx, m_Xor(m_Value(Tid), m_SpecificInt(16))) &&
+ isThreadID(ST, Tid) && ST.hasPermLaneX16())
+ return emitPermLaneX16(IC, II, Val);
+
+ // DPP16 Row Share requires architecture support
+ if (!ST.hasDPPRowShare())
+ return std::nullopt;
+
+ IRBuilderBase &B = IC.Builder;
uint64_t Mask;
uint64_t RowIdx;
bool CanDPP16RowShare = false;
@@ -636,6 +686,57 @@ tryWaveShuffleDPP(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
return std::nullopt;
}
+// Match ds_bpermute((tid ^ const) << 2, val) patterns and convert to VALU
+// cross-lane operations: DPP row_xmask (masks 1-15) or v_permlanex16
+// (mask 16). InstCombine may canonicalize shl(xor(tid, N), 2) into
+// xor(shl(tid, 2), N*4), so both forms are matched.
+static std::optional<Instruction *>
+tryBPermuteToVALU(const GCNSubtarget &ST, InstCombiner &IC, IntrinsicInst &II) {
+ Value *Index = II.getArgOperand(0);
+ Value *Val = II.getArgOperand(1);
+
+ if (!ST.hasDPP() || !ST.isWaveSizeKnown())
+ return std::nullopt;
+
+ Value *Tid;
+ uint64_t XorMask = 0;
+
+ // Form A (canonical): xor(shl(tid, 2), Const) where Const = mask * 4
+ Value *ShiftedTid;
+ uint64_t XorConst;
+ if (match(Index, m_Xor(m_Value(ShiftedTid), m_ConstantInt(XorConst)))) {
+ uint64_t ShiftAmt;
+ if (match(ShiftedTid, m_Shl(m_Value(Tid), m_ConstantInt(ShiftAmt))) &&
+ ShiftAmt == 2 && isThreadID(ST, Tid) && (XorConst & 3) == 0)
+ XorMask = XorConst >> 2;
+ }
+
+ // Form B (pre-canonical): shl(xor(tid, mask), 2)
+ if (!XorMask) {
+ Value *XorResult;
+ uint64_t ShiftAmt;
+ if (match(Index, m_Shl(m_Value(XorResult), m_ConstantInt(ShiftAmt))) &&
+ ShiftAmt == 2 &&
+ match(XorResult, m_Xor(m_Value(Tid), m_ConstantInt(XorMask))) &&
+ isThreadID(ST, Tid)) {
+ // XorMask is set by the match
+ } else {
+ return std::nullopt;
+ }
+ }
+
+ // DPP row_xmask for masks 1-15
+ if (XorMask >= 1 && XorMask <= 15 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX10)
+ return emitDPPRowXmask(IC, II, Val, XorMask);
+
+ // v_permlanex16 for mask 16
+ if (XorMask == 16 && ST.hasPermLaneX16())
+ return emitPermLaneX16(IC, II, Val);
+
+ return std::nullopt;
+}
+
Instruction *
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
IntrinsicInst &II) const {
@@ -1572,10 +1673,17 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
simplifyDemandedLaneMaskArg(IC, II, 1))
return &II;
- // If the lane argument of bpermute is uniform, change it to readlane. This
- // generates better code and can enable further optimizations because
- // readlane is AlwaysUniform.
if (IID == Intrinsic::amdgcn_ds_bpermute) {
+ // Try to convert bpermute with XOR-based lane index into VALU
+ // cross-lane ops: DPP row_xmask (masks 1-15) or v_permlanex16
+ // (mask 16).
+ if (std::optional<Instruction *> VALUResult =
+ tryBPermuteToVALU(*ST, IC, II))
+ return *VALUResult;
+
+ // If the lane argument of bpermute is uniform, change it to readlane.
+ // This generates better code and can enable further optimizations because
+ // readlane is AlwaysUniform.
const Use &Lane = II.getArgOperandUse(0);
if (isTriviallyUniform(Lane)) {
Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
diff --git a/llvm/test/CodeGen/AMDGPU/bpermute-xor-dpp-combine.ll b/llvm/test/CodeGen/AMDGPU/bpermute-xor-dpp-combine.ll
index c3df0b7c8e14..55fbd8d327c1 100644
--- a/llvm/test/CodeGen/AMDGPU/bpermute-xor-dpp-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/bpermute-xor-dpp-combine.ll
@@ -15,25 +15,13 @@ define float @wave_shuffle_xor1_fadd(float %val, float %other) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_xor_b32_e32 v2, 1, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX12-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX12-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: wave_shuffle_xor1_fadd:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v2, 1, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%xor = xor i32 %tid, 1
@@ -51,25 +39,13 @@ define float @wave_shuffle_xor4_fmul(float %val, float %other) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_xor_b32_e32 v2, 4, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX12-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX12-NEXT: v_mul_f32_dpp v0, v0, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: wave_shuffle_xor4_fmul:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v2, 4, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_dpp v0, v0, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%xor = xor i32 %tid, 4
@@ -87,25 +63,13 @@ define float @bpermute_xor2_fadd(float %val, float %other) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX12-NEXT: v_xor_b32_e32 v2, 8, v2
-; GFX12-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX12-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: bpermute_xor2_fadd:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: v_xor_b32_e32 v2, 8, v2
-; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%xor = xor i32 %tid, 2
@@ -126,25 +90,13 @@ define float @bpermute_xor8_fsub(float %val, float %other) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX12-NEXT: v_xor_b32_e32 v2, 32, v2
-; GFX12-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX12-NEXT: v_subrev_f32_dpp v0, v0, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: bpermute_xor8_fsub:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: v_xor_b32_e32 v2, 32, v2
-; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f32_e32 v0, v1, v0
+; GFX11-NEXT: v_subrev_f32_dpp v0, v0, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%xor = xor i32 %tid, 8
@@ -165,29 +117,17 @@ define float @wave_shuffle_xor1_fma_like(float %val, float %sign) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_xor_b32_e32 v2, 1, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX12-NEXT: ds_bpermute_b32 v2, v2, v0
-; GFX12-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: v_mul_f32_e32 v1, v1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX12-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: wave_shuffle_xor1_fma_like:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v2, 1, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: ds_bpermute_b32 v2, v2, v0
-; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_add_f32_dpp v0, v0, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%xor = xor i32 %tid, 1
@@ -206,24 +146,19 @@ define float @wave_shuffle_xor16_fadd(float %val, float %other) #0 {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_xor_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX12-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX12-NEXT: s_wait_dscnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0x76543210
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, 0xfedcba98
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f32_e32 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: wave_shuffle_xor16_fadd:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: ds_bpermute_b32 v0, v2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, 0x76543210
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, 0xfedcba98
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-bpermute-xor-to-dpp.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-bpermute-xor-to-dpp.ll
index 0205d082f498..6bb3b8dcfc4a 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-bpermute-xor-to-dpp.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-bpermute-xor-to-dpp.ll
@@ -8,18 +8,12 @@
define i32 @test_bpermute_xor1(i32 %val) {
; GFX1200-W32-LABEL: define i32 @test_bpermute_xor1(
; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
-; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 4
-; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, i32 15, i1 true)
; GFX1200-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W32-LABEL: define i32 @test_bpermute_xor1(
; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
-; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 4
-; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, i32 15, i1 true)
; GFX11-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W64-LABEL: define i32 @test_bpermute_xor1(
@@ -49,18 +43,12 @@ define i32 @test_bpermute_xor1(i32 %val) {
define i32 @test_bpermute_xor2(i32 %val) {
; GFX1200-W32-LABEL: define i32 @test_bpermute_xor2(
; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 8
-; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 354, i32 15, i32 15, i1 true)
; GFX1200-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W32-LABEL: define i32 @test_bpermute_xor2(
; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 8
-; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 354, i32 15, i32 15, i1 true)
; GFX11-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W64-LABEL: define i32 @test_bpermute_xor2(
@@ -90,18 +78,12 @@ define i32 @test_bpermute_xor2(i32 %val) {
define i32 @test_bpermute_xor4(i32 %val) {
; GFX1200-W32-LABEL: define i32 @test_bpermute_xor4(
; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16
-; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true)
; GFX1200-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W32-LABEL: define i32 @test_bpermute_xor4(
; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16
-; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true)
; GFX11-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W64-LABEL: define i32 @test_bpermute_xor4(
@@ -131,18 +113,12 @@ define i32 @test_bpermute_xor4(i32 %val) {
define i32 @test_bpermute_xor8(i32 %val) {
; GFX1200-W32-LABEL: define i32 @test_bpermute_xor8(
; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 32
-; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 360, i32 15, i32 15, i1 true)
; GFX1200-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W32-LABEL: define i32 @test_bpermute_xor8(
; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 32
-; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 360, i32 15, i32 15, i1 true)
; GFX11-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W64-LABEL: define i32 @test_bpermute_xor8(
@@ -172,18 +148,12 @@ define i32 @test_bpermute_xor8(i32 %val) {
define i32 @test_bpermute_xor15(i32 %val) {
; GFX1200-W32-LABEL: define i32 @test_bpermute_xor15(
; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 60
-; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 367, i32 15, i32 15, i1 true)
; GFX1200-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W32-LABEL: define i32 @test_bpermute_xor15(
; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 60
-; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 367, i32 15, i32 15, i1 true)
; GFX11-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W64-LABEL: define i32 @test_bpermute_xor15(
@@ -213,18 +183,12 @@ define i32 @test_bpermute_xor15(i32 %val) {
define i32 @test_bpermute_xor16(i32 %val) {
; GFX1200-W32-LABEL: define i32 @test_bpermute_xor16(
; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX1200-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64
-; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false)
; GFX1200-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W32-LABEL: define i32 @test_bpermute_xor16(
; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX11-W32-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64
-; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false)
; GFX11-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W64-LABEL: define i32 @test_bpermute_xor16(
@@ -254,27 +218,17 @@ define i32 @test_bpermute_xor16(i32 %val) {
define i32 @test_bpermute_xor4_wave64(i32 %val) {
; GFX1200-W32-LABEL: define i32 @test_bpermute_xor4_wave64(
; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX1200-W32-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LO]], 2
-; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16
-; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true)
; GFX1200-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W32-LABEL: define i32 @test_bpermute_xor4_wave64(
; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX11-W32-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LO]], 2
-; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16
-; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true)
; GFX11-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W64-LABEL: define i32 @test_bpermute_xor4_wave64(
; GFX11-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX11-W64-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W64-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; GFX11-W64-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX11-W64-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 16
-; GFX11-W64-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W64-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true)
; GFX11-W64-NEXT: ret i32 [[RESULT]]
;
; GFX9-LABEL: define i32 @test_bpermute_xor4_wave64(
@@ -298,27 +252,17 @@ define i32 @test_bpermute_xor4_wave64(i32 %val) {
define i32 @test_bpermute_xor16_wave64(i32 %val) {
; GFX1200-W32-LABEL: define i32 @test_bpermute_xor16_wave64(
; GFX1200-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX1200-W32-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX1200-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LO]], 2
-; GFX1200-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64
-; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX1200-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false)
; GFX1200-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W32-LABEL: define i32 @test_bpermute_xor16_wave64(
; GFX11-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX11-W32-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W32-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LO]], 2
-; GFX11-W32-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64
-; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W32-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false)
; GFX11-W32-NEXT: ret i32 [[RESULT]]
;
; GFX11-W64-LABEL: define i32 @test_bpermute_xor16_wave64(
; GFX11-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; GFX11-W64-NEXT: [[LO:%.*]] = call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; GFX11-W64-NEXT: [[LANE_ID:%.*]] = call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; GFX11-W64-NEXT: [[XOR_LANE:%.*]] = shl nuw nsw i32 [[LANE_ID]], 2
-; GFX11-W64-NEXT: [[BYTE_OFFSET:%.*]] = xor i32 [[XOR_LANE]], 64
-; GFX11-W64-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[BYTE_OFFSET]], i32 [[VAL]])
+; GFX11-W64-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false)
; GFX11-W64-NEXT: ret i32 [[RESULT]]
;
; GFX9-LABEL: define i32 @test_bpermute_xor16_wave64(
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
index 79362b8856a9..908b04955de8 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll
@@ -242,17 +242,12 @@ define i32 @test_wave_shuffle_not_quite_row_share(i32 %val) {
define i32 @test_wave_shuffle_xor1(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor1(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 1
-; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]])
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, i32 15, i1 true)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor1(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 1
-; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]])
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, i32 15, i1 true)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_xor1(
@@ -274,17 +269,12 @@ define i32 @test_wave_shuffle_xor1(i32 %val) {
define i32 @test_wave_shuffle_xor4(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor4(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 4
-; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]])
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor4(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 4
-; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]])
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 356, i32 15, i32 15, i1 true)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_xor4(
@@ -306,17 +296,12 @@ define i32 @test_wave_shuffle_xor4(i32 %val) {
define i32 @test_wave_shuffle_xor15(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor15(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 15
-; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]])
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 367, i32 15, i32 15, i1 true)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor15(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 15
-; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]])
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 367, i32 15, i32 15, i1 true)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_xor15(
@@ -338,17 +323,12 @@ define i32 @test_wave_shuffle_xor15(i32 %val) {
define i32 @test_wave_shuffle_xor16(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor16(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 16
-; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]])
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor16(
; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 16
-; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]])
+; CHECK-W64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.permlanex16.i32(i32 poison, i32 [[VAL]], i32 1985229328, i32 -19088744, i1 false, i1 false)
; CHECK-W64-NEXT: ret i32 [[RES]]
;
; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_xor16(
@@ -370,9 +350,7 @@ define i32 @test_wave_shuffle_xor16(i32 %val) {
define i32 @test_wave_shuffle_xor1_lo_only(i32 %val) {
; CHECK-W32-LABEL: define i32 @test_wave_shuffle_xor1_lo_only(
; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[TID:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 1
-; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[XOR]])
+; CHECK-W32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 [[VAL]], i32 353, i32 15, i32 15, i1 true)
; CHECK-W32-NEXT: ret i32 [[RES]]
;
; CHECK-W64-LABEL: define i32 @test_wave_shuffle_xor1_lo_only(
@@ -399,17 +377,12 @@ define i32 @test_wave_shuffle_xor1_lo_only(i32 %val) {
define float @test_wave_shuffle_xor2_float(float %val) {
; CHECK-W32-LABEL: define float @test_wave_shuffle_xor2_float(
; CHECK-W32-SAME: float [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W32-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W32-NEXT: [[XOR:%.*]] = xor i32 [[LO]], 2
-; CHECK-W32-NEXT: [[RES:%.*]] = tail call float @llvm.amdgcn.wave.shuffle.f32(float [[VAL]], i32 [[XOR]])
+; CHECK-W32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float poison, float [[VAL]], i32 354, i32 15, i32 15, i1 true)
; CHECK-W32-NEXT: ret float [[RES]]
;
; CHECK-W64-LABEL: define float @test_wave_shuffle_xor2_float(
; CHECK-W64-SAME: float [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-W64-NEXT: [[LO:%.*]] = tail call range(i32 0, 33) i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-; CHECK-W64-NEXT: [[TID:%.*]] = tail call range(i32 0, 65) i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]])
-; CHECK-W64-NEXT: [[XOR:%.*]] = xor i32 [[TID]], 2
-; CHECK-W64-NEXT: [[RES:%.*]] = tail call float @llvm.amdgcn.wave.shuffle.f32(float [[VAL]], i32 [[XOR]])
+; CHECK-W64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float poison, float [[VAL]], i32 354, i32 15, i32 15, i1 true)
; CHECK-W64-NEXT: ret float [[RES]]
;
; CHECK-NO-WAVE-SIZE-LABEL: define float @test_wave_shuffle_xor2_float(