aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChangpeng Fang <changpeng.fang@amd.com>2025-10-30 09:44:25 -0700
committerGitHub <noreply@github.com>2025-10-30 09:44:25 -0700
commit6b5afdc3ab3e2791baa1946acb4ee3f0b6db8ce3 (patch)
treeac42d658a2273f46f7340a931d89eb7fcb026535
parent88cee4c73787c977b03b89f22309c2e52769e0ec (diff)
downloadllvm-6b5afdc3ab3e2791baa1946acb4ee3f0b6db8ce3.zip
llvm-6b5afdc3ab3e2791baa1946acb4ee3f0b6db8ce3.tar.gz
llvm-6b5afdc3ab3e2791baa1946acb4ee3f0b6db8ce3.tar.bz2
[AMDGPU] Support bfloat comparison for ballot intrinsic (#165495)
We do not have native instructions for direct bfloat comparisons. However, we can expand bfloat to float, and do float comparison instead. TODO: handle bfloat comparison for ballot intrinsic on global isel path. Fixes: SWDEV-563403
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll21
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll12
3 files changed, 41 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b34ab2a..8bb2808 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7035,9 +7035,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
SDLoc SL(N);
if (Src.getOpcode() == ISD::SETCC) {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ // Need to expand bfloat to float for comparison (setcc).
+ if (Op0.getValueType() == MVT::bf16) {
+ Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
+ Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
+ }
// (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
- return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
- Src.getOperand(1), Src.getOperand(2));
+ return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
}
if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
// (ballot 0) -> 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index aa591d2..c1f3a12 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -591,3 +591,24 @@ exit:
store i32 %ballot, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_cs i32 @compare_bfloats(bfloat %x, bfloat %y) {
+; GFX10-LABEL: compare_bfloats:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: compare_bfloats:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v1, v2
+; GFX11-NEXT: ; return to shader part epilog
+ %cmp = fcmp ogt bfloat %x, %y
+ %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
+ ret i32 %ballot
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index 30c2c26..827a01f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -557,3 +557,15 @@ exit:
store i64 %ballot, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_cs i64 @compare_bfloats(bfloat %x, bfloat %y) {
+; CHECK-LABEL: compare_bfloats:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; CHECK-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
+; CHECK-NEXT: ; return to shader part epilog
+ %cmp = fcmp ogt bfloat %x, %y
+ %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+ ret i64 %ballot
+}