aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author pvanhout <pierre.vanhoutryve@amd.com> 2025-07-07 11:59:25 +0200
committer Pierre van Houtryve <29600849+Pierre-vh@users.noreply.github.com> 2025-07-16 07:59:20 +0000
commit 93f930842d3b24938dfe4e7e692d19a418a5b62a (patch)
tree 6b61e90aefc457cbe353d84c58b81e369f734aee
parent 4b9be18d9e33eaeda303a008f6f09b86e61617c9 (diff)
download llvm-users/pierre-vh/lower-sbfe-in-rbcomb.zip
llvm-users/pierre-vh/lower-sbfe-in-rbcomb.tar.gz
llvm-users/pierre-vh/lower-sbfe-in-rbcomb.tar.bz2
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp 13
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll 9
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll 14
3 files changed, 12 insertions, 24 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 8d0c1b6..257acfd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -419,19 +419,18 @@ bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const {
// Pack the offset and width of a BFE into
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
// source, bits [5:0] contain the offset and bits [22:16] the width.
-
- // Ensure the high bits are clear to insert the offset.
- auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
- auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+ // The 64-bit variants use bits [6:0] for the offset.
+ //
+ // If the value takes more than 5/6 bits, the G_U/SBFX is ill-formed.
+ // Thus, we do not clamp the values. We assume they are in range,
+ // and if they aren't, it is UB anyway.
// Zeros out the low bits, so don't bother clamping the input value.
auto ShiftAmt = B.buildConstant(S32, 16);
auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt);
- auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+ auto MergedInputs = B.buildOr(S32, OffsetReg, ShiftWidth);
- MRI.setRegBank(OffsetMask.getReg(0), *RB);
- MRI.setRegBank(ClampOffset.getReg(0), *RB);
MRI.setRegBank(ShiftAmt.getReg(0), *RB);
MRI.setRegBank(ShiftWidth.getReg(0), *RB);
MRI.setRegBank(MergedInputs.getReg(0), *RB);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index 45bade2..0e65e1a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -14,7 +14,6 @@ define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i32_arg_arg_arg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s1, s1, 63
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_bfe_i32 s0, s0, s1
@@ -32,7 +31,6 @@ define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i3
define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i64_arg_arg_arg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s2, s2, 63
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], s2
@@ -46,7 +44,6 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s3, s3, 63
; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000
; GFX6-NEXT: s_bfe_i32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
@@ -65,7 +62,7 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0,
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, 59, s3
+; GFX6-NEXT: s_or_b32 s3, 0x7b, s3
; GFX6-NEXT: s_bfe_i32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -82,9 +79,8 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s4, s2, 63
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, s4, s3
+; GFX6-NEXT: s_or_b32 s3, s2, s3
; GFX6-NEXT: s_bfe_i32 s3, 0x7b, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -120,7 +116,6 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s3, s3, 63
; GFX6-NEXT: s_bfe_i32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
index d327c15..6d435ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -14,7 +14,6 @@ define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i32_arg_arg_arg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s1, s1, 63
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_bfe_u32 s0, s0, s1
@@ -32,7 +31,6 @@ define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i3
define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i64_arg_arg_arg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s2, s2, 63
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_bfe_u64 s[0:1], s[0:1], s2
@@ -46,9 +44,8 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s4, s3, 63
-; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, s4, s3
+; GFX6-NEXT: s_lshl_b32 s4, s3, 16
+; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_bfe_u32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -65,7 +62,6 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s3, s3, 63
; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000
; GFX6-NEXT: s_bfe_u32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
@@ -84,7 +80,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0,
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, 59, s3
+; GFX6-NEXT: s_or_b32 s3, 0x7b, s3
; GFX6-NEXT: s_bfe_u32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -101,9 +97,8 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s4, s2, 63
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_or_b32 s3, s4, s3
+; GFX6-NEXT: s_or_b32 s3, s2, s3
; GFX6-NEXT: s_bfe_u32 s3, 0x7b, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3
@@ -120,7 +115,6 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out,
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s3, s3, 63
; GFX6-NEXT: s_bfe_u32 s3, s2, s3
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s3