author     Matt Arsenault <Matthew.Arsenault@amd.com>   2024-11-12 21:35:51 -0800
committer  GitHub <noreply@github.com>                  2024-11-12 21:35:51 -0800
commit     5911fbb39d615b39f1bf6fd732503ab433de5f27 (patch)
tree       b034d98c79abe889dc5b68f211209c88ac8ab8c2
parent     2baead09b2eea3b7b76afa193e35b93a236d948d (diff)
AMDGPU: Do not fold copy to physreg from operation on frame index (#115977)
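With this change, tryFoldFoldableCopy attempts the frame-index fold only after the `if (!DstReg.isVirtual()) return false;` guard, so a copy whose destination is a physical register (for example, a call argument in $vgpr0) is never a fold candidate and cannot be erased out from under the call. A minimal sketch of the pattern the pass must now leave intact, abridged from the new MIR regression test below (virtual register numbers are illustrative; %6 is the waterfalled call target in the full test):

    ; %2 is SALU arithmetic on a frame index; its only user is a copy
    ; into the physical register $vgpr0, which feeds a call argument.
    %1:sreg_32 = S_MOV_B32 %stack.0
    %2:sreg_32 = S_ADD_I32 killed %1, 4, implicit-def dead $scc
    ...
    $vgpr0 = COPY %2    ; must survive SIFoldOperands
    $sgpr30_sgpr31 = SI_CALL %6, 0, csr_amdgpu, implicit $vgpr0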
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp                       8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/captured-frame-index.ll               15
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir  80
3 files changed, 88 insertions, 15 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 7383477..4fb5cb0 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1599,10 +1599,6 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
   if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
     return false;
 
-  if (OpToFold.isReg() &&
-      foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
-    return true;
-
   // Prevent folding operands backwards in the function. For example,
   // the COPY opcode must not be replaced by 1 in this example:
   //
@@ -1612,6 +1608,10 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
   if (!DstReg.isVirtual())
     return false;
 
+  if (OpToFold.isReg() &&
+      foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
+    return true;
+
   bool Changed = foldInstOperand(MI, OpToFold);
 
   // If we managed to fold all uses of this copy then we might as well
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index 2ec4c07..03f0609 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -325,8 +325,23 @@ define amdgpu_kernel void @kernel_alloca_offset_use_asm_vgpr() {
   ret void
 }
 
+; GCN-LABEL: {{^}}live_out_physreg_copy_add_fi:
+; GCN: s_or_b32 [[FI:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN: v_mov_b32_e32 v0, [[FI]]
+; GCN: v_mov_b32_e32 v1
+; GCN: s_swappc_b64
+define void @live_out_physreg_copy_add_fi(ptr %fptr) #2 {
+bb:
+  %alloca = alloca [4 x i32], align 16, addrspace(5)
+  %addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr
+  %getelementptr = getelementptr i8, ptr %addrspacecast, i64 4
+  call void %fptr(ptr %getelementptr) #2
+  ret void
+}
+
 declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #1
 declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { argmemonly nounwind }
+attributes #2 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
index 8c88c7a..4b4324b 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
@@ -120,17 +120,10 @@ stack:
   - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
 body: |
   bb.0:
-    ; GFX8-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
-    ; GFX8: $vgpr0 = V_ADD_CO_U32_e32 128, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit $vgpr0
-    ;
-    ; GFX9-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
-    ; GFX9: $vgpr0 = V_ADD_U32_e32 128, %stack.0, implicit $exec
-    ; GFX9-NEXT: SI_RETURN implicit $vgpr0
-    ;
-    ; GFX10-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
-    ; GFX10: $vgpr0 = V_ADD_U32_e32 128, %stack.0, implicit $exec
-    ; GFX10-NEXT: SI_RETURN implicit $vgpr0
+    ; CHECK-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
+    ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def dead $scc
+    ; CHECK-NEXT: $vgpr0 = COPY [[S_ADD_I32_]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
     %0:sreg_32 = S_MOV_B32 %stack.0
     %1:sreg_32 = S_ADD_I32 %0, 128, implicit-def dead $scc
     $vgpr0 = COPY %1
@@ -535,3 +528,68 @@ body: |
     %2:vgpr_32 = COPY %1
     SI_RETURN implicit %2
 ...
+
+# Physreg copy of %2 to $vgpr0 should not be erased
+---
+name: fold_fi_into_s_or_b32_user_is_physreg_copy
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16, alignment: 16 }
+machineFunctionInfo:
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  ; CHECK-LABEL: name: fold_fi_into_s_or_b32_user_is_physreg_copy
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+  ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 4, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]].sub1, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE]], [[COPY]], implicit $exec
+  ; CHECK-NEXT:   [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $vgpr0 = COPY [[S_ADD_I32_]]
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = SI_CALL [[REG_SEQUENCE]], 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+  ; CHECK-NEXT:   SI_WATERFALL_LOOP %bb.1, implicit $exec
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $exec = S_MOV_B64 [[S_MOV_B64_]]
+  ; CHECK-NEXT:   SI_RETURN
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %1:sreg_32 = S_MOV_B32 %stack.0
+    %2:sreg_32 = S_ADD_I32 killed %1, 4, implicit-def dead $scc
+    %3:sreg_64_xexec = S_MOV_B64 $exec
+
+  bb.1:
+    %4:sgpr_32 = V_READFIRSTLANE_B32 %0.sub0, implicit $exec
+    %5:sgpr_32 = V_READFIRSTLANE_B32 %0.sub1, implicit $exec
+    %6:sgpr_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1
+    %7:sreg_64_xexec = V_CMP_EQ_U64_e64 %6, %0, implicit $exec
+    %8:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed %7, implicit-def $exec, implicit-def $scc, implicit $exec
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $vgpr0 = COPY %2
+    $sgpr30_sgpr31 = SI_CALL %6, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $exec = S_XOR_B64_term $exec, %8, implicit-def $scc
+    SI_WATERFALL_LOOP %bb.1, implicit $exec
+
+  bb.2:
+    $exec = S_MOV_B64 %3
+    SI_RETURN
+
+...
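For reference, the MIR test drives the pass standalone. The file's RUN lines fall outside this hunk; an invocation along the following lines, with check prefixes assumed from the GFX8/GFX9/GFX10 and CHECK labels above, would exercise it:

    # Assumed RUN line: the real one sits at the top of
    # fold-operands-s-add-copy-to-vgpr.mir and is not part of this diff.
    # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-fold-operands \
    # RUN:   -o - %s | FileCheck -check-prefixes=CHECK,GFX9 %s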