diff options
author | pvanhout <pierre.vanhoutryve@amd.com> | 2025-07-07 11:59:25 +0200 |
---|---|---|
committer | Pierre van Houtryve <29600849+Pierre-vh@users.noreply.github.com> | 2025-07-16 07:59:20 +0000 |
commit | 93f930842d3b24938dfe4e7e692d19a418a5b62a (patch) | |
tree | 6b61e90aefc457cbe353d84c58b81e369f734aee | |
parent | 4b9be18d9e33eaeda303a008f6f09b86e61617c9 (diff) | |
download | llvm-users/pierre-vh/lower-sbfe-in-rbcomb.zip llvm-users/pierre-vh/lower-sbfe-in-rbcomb.tar.gz llvm-users/pierre-vh/lower-sbfe-in-rbcomb.tar.bz2 |
remove clamp (branch: users/pierre-vh/lower-sbfe-in-rbcomb)
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 13 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 9 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll | 14 |
3 files changed, 12 insertions, 24 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 8d0c1b6..257acfd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -419,19 +419,18 @@ bool AMDGPURegBankCombinerImpl::lowerUniformBFX(MachineInstr &MI) const { // Pack the offset and width of a BFE into // the format expected by the S_BFE_I32 / S_BFE_U32. In the second // source, bits [5:0] contain the offset and bits [22:16] the width. - - // Ensure the high bits are clear to insert the offset. - auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6)); - auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); + // The 64 bit variants use bits [6:0] + // + // If the value takes more than 5/6 bits, the G_U/SBFX is ill-formed. + // Thus, we do not clamp the values. We assume they are in range, + // and if they aren't, it is UB anyway. // Zeros out the low bits, so don't bother clamping the input value. 
auto ShiftAmt = B.buildConstant(S32, 16); auto ShiftWidth = B.buildShl(S32, WidthReg, ShiftAmt); - auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); + auto MergedInputs = B.buildOr(S32, OffsetReg, ShiftWidth); - MRI.setRegBank(OffsetMask.getReg(0), *RB); - MRI.setRegBank(ClampOffset.getReg(0), *RB); MRI.setRegBank(ShiftAmt.getReg(0), *RB); MRI.setRegBank(ShiftWidth.getReg(0), *RB); MRI.setRegBank(MergedInputs.getReg(0), *RB); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 45bade2..0e65e1a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -14,7 +14,6 @@ define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 { define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 { ; GFX6-LABEL: s_bfe_i32_arg_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 63 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_bfe_i32 s0, s0, s1 @@ -32,7 +31,6 @@ define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i3 define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 { ; GFX6-LABEL: s_bfe_i64_arg_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s2, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], s2 @@ -46,7 +44,6 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 ; GFX6-NEXT: s_bfe_i32 s3, s2, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -65,7 +62,7 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s3, 59, s3 +; GFX6-NEXT: s_or_b32 s3, 0x7b, s3 ; GFX6-NEXT: s_bfe_i32 s3, s2, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 @@ -82,9 +79,8 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_or_b32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, 0x7b, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 @@ -120,7 +116,6 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_i32 s3, s2, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll index d327c15..6d435ac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -14,7 +14,6 @@ define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 { define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 { ; GFX6-LABEL: s_bfe_i32_arg_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 63 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_bfe_u32 s0, s0, s1 @@ -32,7 +31,6 @@ define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i3 define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 { ; GFX6-LABEL: s_bfe_i64_arg_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s2, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 
s2, s2, s3 ; GFX6-NEXT: s_bfe_u64 s[0:1], s[0:1], s2 @@ -46,9 +44,8 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s4, s3, 63 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_lshl_b32 s4, s3, 16 +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_bfe_u32 s3, s2, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 @@ -65,7 +62,6 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 ; GFX6-NEXT: s_bfe_u32 s3, s2, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -84,7 +80,7 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s3, 59, s3 +; GFX6-NEXT: s_or_b32 s3, 0x7b, s3 ; GFX6-NEXT: s_bfe_u32 s3, s2, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 @@ -101,9 +97,8 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s4, s2, 63 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_or_b32 s3, s2, s3 ; GFX6-NEXT: s_bfe_u32 s3, 0x7b, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 @@ -120,7 +115,6 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_bfe_u32 s3, s2, s3 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: 
v_mov_b32_e32 v0, s3 |