author     anikelal <anikelal@amd.com>  2025-06-02 10:01:05 +0530
committer  anikelal <anikelal@amd.com>  2025-06-02 10:01:05 +0530
commit     2b8b07248d2f7976f977875e06999074162c2bf8
tree       db8f59e7c7a6da3b49f40b45a8d62ae16d25081f
parent     4425b513f3f6d1abb82635309d4eafd2aac6ded5
promoteMUBUFLoadStoreScalarOffset (branch users/lalaniket8/mubuf-load-store-partition-scalar-offset)
 llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp                                  | 73
 llvm/test/CodeGen/AMDGPU/amdgcn-partition-load-store-into-vector-scalar-parts.ll | 65
 2 files changed, 120 insertions(+), 18 deletions(-)
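What this branch is building toward, in MIR terms: peel the scalar (SGPR) half of a MUBUF address computation out of vaddr and carry it in soffset instead, so only the truly divergent part occupies the vector address. A rough before/after sketch, assuming a no-carry V_ADD_U32_e64 feeding an OFFEN access whose soffset is a literal zero (register names are illustrative, not taken from the patch):

  ; before: the scalar offset rides inside the vector address
  %va:vgpr_32 = V_ADD_U32_e64 %off:sgpr_32, %idx:vgpr_32, 0, implicit $exec
  %d:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %va, %rsrc, 0, 0, 0, 0, implicit $exec

  ; after: vector and scalar parts partitioned into vaddr and soffset
  %d:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %idx, %rsrc, %off, 0, 0, 0, implicit $exec

The new promoteMUBUFLoadStoreScalarOffset hook in the diff below currently implements only the detection half of this rewrite.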
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b0d6fd9..b466796 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -282,6 +282,7 @@ private:
 
   bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited,
                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
+  bool promoteMUBUFLoadStoreScalarOffset(MachineInstr &CI) const;
 
   void addInstToMergeableList(const CombineInfo &CI,
                               std::list<std::list<CombineInfo> > &MergeableInsts) const;
@@ -427,16 +428,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
   case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
   case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
-  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
-  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
+  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN://
+  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact://
   case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
   case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
   case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
   case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
   case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
   case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
-  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
-  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
+  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN://
+  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact://
   case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
   case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
     return BUFFER_LOAD;
@@ -2092,25 +2093,25 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
   if (!Base.isReg())
     return;
 
-  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
+  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());//REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
       || Def->getNumOperands() != 5)
     return;
 
-  MachineOperand BaseLo = Def->getOperand(1);
-  MachineOperand BaseHi = Def->getOperand(3);
+  MachineOperand BaseLo = Def->getOperand(1);//%LO:vgpr_32
+  MachineOperand BaseHi = Def->getOperand(3);//%HI:vgpr_32
   if (!BaseLo.isReg() || !BaseHi.isReg())
     return;
 
-  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
-  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
+  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());//%LO:vgpr_32, %c:sreg_64_xexec = V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
+  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());//%HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
 
   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
     return;
 
-  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
-  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
+  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);//%BASE_LO:vgpr_32
+  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);//%103:sgpr_32
 
   auto Offset0P = extractConstOffset(*Src0);
   if (Offset0P)
@@ -2120,12 +2121,12 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
       return;
     BaseLo = *Src0;
   }
-
+//BaseLo = %103:sgpr_32
   if (!BaseLo.isReg())
     return;
 
-  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
-  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
+  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);// %BASE_HI:vgpr_32
+  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);// 0
 
   if (Src0->isImm())
     std::swap(Src0, Src1);
@@ -2133,14 +2134,14 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
   if (!Src1->isImm() || Src0->isImm())
     return;
 
-  uint64_t Offset1 = Src1->getImm();
-  BaseHi = *Src0;
+  uint64_t Offset1 = Src1->getImm(); //0
+  BaseHi = *Src0;//%BASE_HI:vgpr_32
 
   if (!BaseHi.isReg())
     return;
 
-  Addr.Base.LoReg = BaseLo.getReg();
-  Addr.Base.HiReg = BaseHi.getReg();
+  Addr.Base.LoReg = BaseLo.getReg();//%103:sgpr_32
+  Addr.Base.HiReg = BaseHi.getReg();//%BASE_HI:vgpr_32
   Addr.Base.LoSubReg = BaseLo.getSubReg();
   Addr.Base.HiSubReg = BaseHi.getSubReg();
   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
@@ -2298,6 +2299,39 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   return false;
 }
 
+bool SILoadStoreOptimizer::promoteMUBUFLoadStoreScalarOffset(
+    MachineInstr &MI) const {
+  if (!SIInstrInfo::isMUBUF(MI))
+    return false;
+  LLVM_DEBUG(dbgs() << "tryToPromoteMUBUFLoadStoreScalarOffset:"; MI.dump());
+  auto vaddr = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  if (!vaddr) return false;
+  LLVM_DEBUG(dbgs() << "\n vaddr:"; vaddr->dump());
+  MachineInstr *Def = MRI->getUniqueVRegDef(vaddr->getReg());
+  if (!Def) return false;
+  LLVM_DEBUG(dbgs() << "\n def:"; Def->dump());
+  auto opsrc0 = TII->getNamedOperand(*Def, AMDGPU::OpName::src0);
+  if (!opsrc0) return false;
+  auto opsrc1 = TII->getNamedOperand(*Def, AMDGPU::OpName::src1);
+  if (!opsrc1) return false;
+  LLVM_DEBUG(dbgs() << "\n opsrc0:"; opsrc0->dump());
+  LLVM_DEBUG(dbgs() << "\n opsrc1:"; opsrc1->dump());
+  auto isopsrc0scalarreg = TII->getRegisterInfo().isSGPRClass(MRI->getRegClass(opsrc0->getReg()));
+  auto isopsrc1scalarreg = TII->getRegisterInfo().isSGPRClass(MRI->getRegClass(opsrc1->getReg()));
+  LLVM_DEBUG(dbgs() << "\n isopsrc0scalarreg:" << isopsrc0scalarreg << " isopsrc1scalarreg:" << isopsrc1scalarreg;);
+  if (!(isopsrc0scalarreg ^ isopsrc1scalarreg)) return false;
+  auto scalarOp = isopsrc0scalarreg ? opsrc0 : opsrc1;
+
+  // if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
+  //     !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
+  //   return;
+
+  // const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);//%BASE_LO:vgpr_32
+  // const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);//%103:sgpr_32
+
+  return false;
+}
+
 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
@@ -2331,6 +2365,9 @@ SILoadStoreOptimizer::collectMergeableInsts(
     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
       Modified = true;
 
+    if (promoteMUBUFLoadStoreScalarOffset(MI))
+      Modified = true;
+
     // Treat volatile accesses, ordered accesses and unmodeled side effects as
     // barriers. We can look after this barrier for separate merges.
     if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-partition-load-store-into-vector-scalar-parts.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-partition-load-store-into-vector-scalar-parts.ll
new file mode 100644
index 0000000..71994b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-partition-load-store-into-vector-scalar-parts.ll
@@ -0,0 +1,65 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs -stop-after=finalize-isel -o - %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+@0 = external dso_local addrspace(4) constant [4 x <2 x float>]
+@1 = external dso_local addrspace(4) constant i32
+
+; GCN-LABEL: name: test_buffer_load_sgpr_plus_imm_offset
+; SDAG-DAG: %[[BASE0:.*]]:sgpr_32 = COPY $sgpr0
+; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
+; SDAG-DAG: %[[BASE2:.*]]:sgpr_32 = COPY $sgpr2
+; SDAG-DAG: %[[BASE3:.*]]:sgpr_32 = COPY $sgpr3
+; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr4
+; SDAG-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
+; SDAG: S_BUFFER_LOAD_DWORD_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 77,
+; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
+; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
+; GISEL-DAG: %[[BASE2:.*]]:sreg_32 = COPY $sgpr2
+; GISEL-DAG: %[[BASE3:.*]]:sreg_32 = COPY $sgpr3
+; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4
+; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
+; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 77,
+define void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> %base, i32 %i, i32 inreg %j, ptr addrspace(1) inreg %out) {
+  %off = add i32 %i, %j
+  %v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0)
+  store i32 %v, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32>, ptr addrspace(8) nocapture, i32, i32, i32 immarg) #1
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg) nounwind readnone willreturn
+
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.reloc.constant(metadata) #3
+
+; Function Attrs: nounwind readnone speculatable
+declare i64 @llvm.amdgcn.s.getpc() #3
+
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) #1
+
+attributes #0 = { argmemonly nounwind willreturn }
+attributes #1 = { nounwind memory(argmem: write) }
+attributes #2 = { nounwind "amdgpu-unroll-threshold"="700" }
+attributes #3 = { nounwind readnone speculatable }
+attributes #4 = { nounwind writeonly }
+
+!llpc.compute.mode = !{!0}
+!llpc.options = !{!1}
+!llpc.options.CS = !{!2}
+!llpc.user.data.nodes = !{!3, !4, !5, !6}
+!amdgpu.pal.metadata.msgpack = !{!7}
+
+!0 = !{i32 2, i32 3, i32 1}
+!1 = !{i32 245227952, i32 996822128, i32 2024708198, i32 497230408}
+!2 = !{i32 1381820427, i32 1742110173, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64}
+!3 = !{!"DescriptorTableVaPtr", i32 0, i32 1, i32 1}
+!4 = !{!"DescriptorBuffer", i32 4, i32 8, i32 0, i32 0}
+!5 = !{!"DescriptorTableVaPtr", i32 1, i32 1, i32 1}
+!6 = !{!"DescriptorBuffer", i32 4, i32 8, i32 1, i32 0}
+!7 = !{!"\82\B0amdpal.pipelines\91\88\A4.api\A6Vulkan\B0.hardware_stages\81\A3.cs\82\AB.sgpr_limith\AB.vgpr_limit\CD\01\00\B7.internal_pipeline_hash\92\CF;jLp\0E\9D\E1\B0\CF\1D\A3\22Hx\AE\98f\AA.registers\88\CD.\07\02\CD.\08\03\CD.\09\01\CD.\12\CE\00,\00\00\CD.\13\CD\0F\88\CD.@\CE\10\00\00\00\CD.B\00\CD.C\01\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CFg\D6}\DDR\\\E8\0B\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\02\AEamdpal.version\92\02\03"}
+!8 = !{i32 5}
+!9 = !{!"doff_0_0_b"}
+!10 = !{}
+!11 = !{!"doff_1_0_b"}