aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2025-05-19 21:44:44 +0200
committerGitHub <noreply@github.com>2025-05-19 21:44:44 +0200
commit36018494fdb9e92e0f61f6937e5ecd3a4472677f (patch)
tree405e1015b7c5bdbde9354dcd25f76d38df31d145
parent2b7cc2b03ea858633016cd16a1630be7fc0db837 (diff)
downloadllvm-36018494fdb9e92e0f61f6937e5ecd3a4472677f.zip
llvm-36018494fdb9e92e0f61f6937e5ecd3a4472677f.tar.gz
llvm-36018494fdb9e92e0f61f6937e5ecd3a4472677f.tar.bz2
AMDGPU: Check for subreg match when folding through reg_sequence (#140582)
We need to consider the use instruction's interpretation of the bits, not the defined immediate without use context. This will regress some cases where we previously could match f64 inline constants. We can restore them by either using pseudo instructions to materialize f64 constants, or recognizing reg_sequence decomposed into 32-bit pieces for them (which essentially means recognizing every other input is a 0). Fixes #139908
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp48
-rw-r--r--llvm/test/CodeGen/AMDGPU/constrained-shift.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-saddr-load.ll5
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/operand-folding.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-fp32.ll198
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir2
7 files changed, 242 insertions, 35 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 92937e3..d81f25c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -895,6 +895,8 @@ SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
if (!SrcRC)
return {};
+ // TODO: Recognize 64-bit splats broken into 32-bit pieces (i.e. recognize
+ // every other element is 0 for 64-bit immediates)
int64_t Imm;
for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
const MachineOperand *Op = Defs[I].first;
@@ -924,10 +926,41 @@ MachineOperand *SIFoldOperandsImpl::tryFoldRegSeqSplat(
if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
return nullptr;
- // FIXME: Verify SplatRC is compatible with the use operand
- uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
- if (!TII->isInlineConstant(*SplatVal, OpTy) ||
- !TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
+ int16_t RCID = Desc.operands()[UseOpIdx].RegClass;
+ if (RCID == -1)
+ return nullptr;
+
+ // Special case 0/-1, since when interpreted as a 64-bit element both halves
+ // have the same bits. Effectively this code does not handle 64-bit element
+ // operands correctly, as the incoming 64-bit constants are already split into
+ // 32-bit sequence elements.
+ //
+ // TODO: We should try to figure out how to interpret the reg_sequence as a
+ // split 64-bit splat constant, or use 64-bit pseudos for materializing f64
+ // constants.
+ if (SplatVal->getImm() != 0 && SplatVal->getImm() != -1) {
+ const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
+ // We need to figure out the scalar type read by the operand. e.g. the MFMA
+ // operand will be AReg_128, and we want to check if it's compatible with an
+ // AReg_32 constant.
+ uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
+ switch (OpTy) {
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
+ break;
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
+ break;
+ default:
+ return nullptr;
+ }
+
+ if (!TRI->getCommonSubClass(OpRC, SplatRC))
+ return nullptr;
+ }
+
+ if (!TII->isOperandLegal(*UseMI, UseOpIdx, SplatVal))
return nullptr;
return SplatVal;
@@ -1039,14 +1072,13 @@ void SIFoldOperandsImpl::foldOperand(
}
}
- if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
+ if (RSUse->getSubReg() != RegSeqDstSubReg)
continue;
- if (RSUse->getSubReg() != RegSeqDstSubReg)
+ if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, OpNo, FoldList))
continue;
- foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
- CopiesToReplace);
+ foldOperand(OpToFold, RSUseMI, OpNo, FoldList, CopiesToReplace);
}
return;
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index af4ca2a..fb53e88 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -192,8 +192,10 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b)
;
; GISEL-LABEL: s_csh_v4i32:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], 31
-; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], 31
+; GISEL-NEXT: s_mov_b32 s8, 31
+; GISEL-NEXT: s_mov_b32 s9, s8
+; GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
+; GISEL-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
; GISEL-NEXT: s_lshl_b32 s8, s0, s4
; GISEL-NEXT: s_lshl_b32 s9, s1, s5
; GISEL-NEXT: s_lshl_b32 s10, s2, s6
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 28245c5..d588f0e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -745,7 +745,10 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0x100000001(ptr addrspace(1)
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000001:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], 1
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_mov_b32 s1, s0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 86bfb694a..5d5dc01 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -262,11 +262,19 @@ bb:
ret void
}
-; FIXME: This should not be foldable as an inline immediate
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
+; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
+; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
+
+; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
+; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
; GCN: global_store_dwordx4
; GCN: global_store_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
index ebfc5d0..778d73f 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -155,7 +155,9 @@ define i32 @issue139908(i64 %in) {
; CHECK-LABEL: issue139908:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, 42, v[0:1]
+; CHECK-NEXT: s_mov_b32 s4, 42
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v0, 2, 1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%eq = icmp eq i64 %in, 180388626474
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index ddc3e77..bef38c1 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX90A-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s
define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_vv:
@@ -411,10 +411,12 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0
+; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0
+; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1186,10 +1188,12 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT: s_mov_b32 s2, 4.0
+; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], 4.0
+; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1594,6 +1598,40 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3] op_sel_hi:[1,0,0]
; PACKED-SDAG-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
+;
+; GFX90A-GISEL-LABEL: fma_v2_v_imm:
+; GFX90A-GISEL: ; %bb.0:
+; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX90A-GISEL-NEXT: s_mov_b32 s4, 0x43480000
+; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX90A-GISEL-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: fma_v2_v_imm:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x43480000
+; GFX942-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX942-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1675,19 +1713,39 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
-; PACKED-GISEL-LABEL: fma_v2_v_lit_splat:
-; PACKED-GISEL: ; %bb.0:
-; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; PACKED-GISEL-NEXT: s_mov_b32 s2, 1.0
-; PACKED-GISEL-NEXT: s_mov_b32 s3, s2
-; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
-; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0)
-; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], 4.0, s[2:3]
-; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; PACKED-GISEL-NEXT: s_endpgm
+; GFX90A-GISEL-LABEL: fma_v2_v_lit_splat:
+; GFX90A-GISEL: ; %bb.0:
+; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0
+; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0
+; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX90A-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX90A-GISEL-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: fma_v2_v_lit_splat:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
+; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX942-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1725,6 +1783,40 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
+;
+; GFX90A-GISEL-LABEL: fma_v2_v_unfoldable_lit:
+; GFX90A-GISEL: ; %bb.0:
+; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX90A-GISEL-NEXT: s_mov_b32 s4, 1.0
+; GFX90A-GISEL-NEXT: s_mov_b32 s2, 4.0
+; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX90A-GISEL-NEXT: s_mov_b32 s5, 2.0
+; GFX90A-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX90A-GISEL-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: fma_v2_v_unfoldable_lit:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
+; GFX942-GISEL-NEXT: s_mov_b32 s2, 4.0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
+; GFX942-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -2059,6 +2151,37 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s0
; PACKED-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; PACKED-SDAG-NEXT: s_endpgm
+;
+; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0:
+; GFX90A-GISEL: ; %bb.0: ; %bb
+; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
+; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX90A-GISEL-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: fadd_fadd_fsub_0:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX942-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -2099,6 +2222,40 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; PACKED-SDAG-NEXT: s_endpgm
+;
+; GFX90A-GISEL-LABEL: fadd_fadd_fsub:
+; GFX90A-GISEL: ; %bb.0: ; %bb
+; GFX90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX90A-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2
+; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3]
+; GFX90A-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2
+; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90A-GISEL-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: fadd_fadd_fsub:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1]
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3]
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> %arg, %arg1
%shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
@@ -2251,3 +2408,6 @@ declare i32 @llvm.amdgcn.workitem.id.x()
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX90A-SDAG: {{.*}}
+; GFX942-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
index aa1a744..8d6c3ef 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
@@ -140,7 +140,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 42
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
- ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY]], 42, implicit-def $scc
+ ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY]], [[REG_SEQUENCE]], implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0, implicit $scc
%0:sgpr_64 = COPY $sgpr8_sgpr9
%1:sreg_32 = S_MOV_B32 42