diff options
author | SingleAccretion <62474226+SingleAccretion@users.noreply.github.com> | 2025-06-24 21:40:47 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-06-24 11:40:47 -0700 |
commit | cd46354dbd10820158edabe14dbd49d9f9010722 (patch) | |
tree | 9642fadb5bae891dcec19be887530dfcb935354c | |
parent | de569ad6b845310335c37e19105e41f201c45dd9 (diff) | |
download | llvm-cd46354dbd10820158edabe14dbd49d9f9010722.zip llvm-cd46354dbd10820158edabe14dbd49d9f9010722.tar.gz llvm-cd46354dbd10820158edabe14dbd49d9f9010722.tar.bz2 |
[WebAssembly] Enable a limited amount of stackification for debug code (#136510)
This change is a step towards fixing one long-standing problem with
LLVM's debug WASM codegen: excessive use of locals (roughly speaking, one
local for each temporary value in IR).
This has a lot of problems:
1) It makes it easy to hit engine limitations of 50K locals with certain
code patterns and large functions.
2) It makes for larger binaries that are slower to load and slower to
compile to native code.
3) It makes certain compilation strategies (spilling all WASM locals to the
stack, for example) excessively expensive for debug code, and makes debug
WASM code either run very slowly or be less debuggable.
4) It slows down LLVM itself.
This change addresses these partially by running a limited version of
the stackification pass for unoptimized code, one that gets rid of the
most 'obviously' unnecessary locals.
Care needs to be taken so that this pass does not impact LLVM's ability to
produce high-quality debug variable locations. To that end:
1) We only allow stackification when it doesn't require moving any
instructions.
2) We disable stackification of any locals that are used in
DBG_VALUEs, or as a frame base.
I have verified on a moderately large example that the baseline and the
diff produce the same kinds (local/global/stack) of locations, and the
only differences are due to the shifting of instruction offsets, with
many local.[get|set]s not being present anymore.
Even with this quite conservative approach, the results are pretty good:
1) 30% reduction in raw code size, up to 10x reduction in the number of
locals for select large methods (~1000 => ~100).
2) ~10% reduction in instructions retired for an "llc -O0" run on a
moderately sized input.
-rw-r--r-- | llvm/lib/Target/WebAssembly/WebAssembly.h | 2 | ||||
-rw-r--r-- | llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 138 | ||||
-rw-r--r-- | llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp | 14 | ||||
-rw-r--r-- | llvm/test/CodeGen/WebAssembly/PR40172.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/WebAssembly/PR41841.ll | 16 | ||||
-rw-r--r-- | llvm/test/CodeGen/WebAssembly/debug-code-stackification.ll | 89 | ||||
-rw-r--r-- | llvm/test/CodeGen/WebAssembly/pr51651.ll | 14 | ||||
-rw-r--r-- | llvm/test/CodeGen/WebAssembly/signext-zeroext-callsite.ll | 60 | ||||
-rw-r--r-- | llvm/test/CodeGen/WebAssembly/suboptimal-compare.ll | 25 |
9 files changed, 221 insertions, 143 deletions
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h index 17481d7..2dbd597 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.h +++ b/llvm/lib/Target/WebAssembly/WebAssembly.h @@ -44,7 +44,7 @@ FunctionPass *createWebAssemblyReplacePhysRegs(); FunctionPass *createWebAssemblyNullifyDebugValueLists(); FunctionPass *createWebAssemblyOptimizeLiveIntervals(); FunctionPass *createWebAssemblyMemIntrinsicResults(); -FunctionPass *createWebAssemblyRegStackify(); +FunctionPass *createWebAssemblyRegStackify(CodeGenOptLevel OptLevel); FunctionPass *createWebAssemblyRegColoring(); FunctionPass *createWebAssemblyFixBrTableDefaults(); FunctionPass *createWebAssemblyFixIrreducibleControlFlow(); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 428d573..bc91c64 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -41,14 +41,18 @@ using namespace llvm; namespace { class WebAssemblyRegStackify final : public MachineFunctionPass { + bool Optimize; + StringRef getPassName() const override { return "WebAssembly Register Stackify"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTreeWrapperPass>(); - AU.addRequired<LiveIntervalsWrapperPass>(); + if (Optimize) { + AU.addRequired<LiveIntervalsWrapperPass>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + } AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>(); AU.addPreserved<SlotIndexesWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); @@ -61,7 +65,9 @@ class WebAssemblyRegStackify final : public MachineFunctionPass { public: static char ID; // Pass identification, replacement for typeid - WebAssemblyRegStackify() : MachineFunctionPass(ID) {} + WebAssemblyRegStackify(CodeGenOptLevel OptLevel) + : MachineFunctionPass(ID), Optimize(OptLevel != 
CodeGenOptLevel::None) {} + WebAssemblyRegStackify() : WebAssemblyRegStackify(CodeGenOptLevel::Default) {} }; } // end anonymous namespace @@ -70,8 +76,8 @@ INITIALIZE_PASS(WebAssemblyRegStackify, DEBUG_TYPE, "Reorder instructions to use the WebAssembly value stack", false, false) -FunctionPass *llvm::createWebAssemblyRegStackify() { - return new WebAssemblyRegStackify(); +FunctionPass *llvm::createWebAssemblyRegStackify(CodeGenOptLevel OptLevel) { + return new WebAssemblyRegStackify(OptLevel); } // Decorate the given instruction with implicit operands that enforce the @@ -96,8 +102,7 @@ static void imposeStackOrdering(MachineInstr *MI) { static void convertImplicitDefToConstZero(MachineInstr *MI, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, - MachineFunction &MF, - LiveIntervals &LIS) { + MachineFunction &MF) { assert(MI->getOpcode() == TargetOpcode::IMPLICIT_DEF); const auto *RegClass = MRI.getRegClass(MI->getOperand(0).getReg()); @@ -262,36 +267,53 @@ static bool shouldRematerialize(const MachineInstr &Def, // LiveIntervals to handle complex cases. static MachineInstr *getVRegDef(unsigned Reg, const MachineInstr *Insert, const MachineRegisterInfo &MRI, - const LiveIntervals &LIS) { + const LiveIntervals *LIS) { // Most registers are in SSA form here so we try a quick MRI query first. if (MachineInstr *Def = MRI.getUniqueVRegDef(Reg)) return Def; // MRI doesn't know what the Def is. Try asking LIS. - if (const VNInfo *ValNo = LIS.getInterval(Reg).getVNInfoBefore( - LIS.getInstructionIndex(*Insert))) - return LIS.getInstructionFromIndex(ValNo->def); + if (LIS != nullptr) { + SlotIndex InstIndex = LIS->getInstructionIndex(*Insert); + if (const VNInfo *ValNo = LIS->getInterval(Reg).getVNInfoBefore(InstIndex)) + return LIS->getInstructionFromIndex(ValNo->def); + } return nullptr; } // Test whether Reg, as defined at Def, has exactly one use. 
This is a // generalization of MachineRegisterInfo::hasOneNonDBGUse that uses -// LiveIntervals to handle complex cases. -static bool hasOneNonDBGUse(unsigned Reg, MachineInstr *Def, - MachineRegisterInfo &MRI, MachineDominatorTree &MDT, - LiveIntervals &LIS) { +// LiveIntervals to handle complex cases in optimized code. +static bool hasSingleUse(unsigned Reg, MachineRegisterInfo &MRI, + WebAssemblyFunctionInfo &MFI, bool Optimize, + MachineInstr *Def, LiveIntervals *LIS) { + if (!Optimize) { + // Using "hasOneUse" instead of "hasOneNonDBGUse" here because we don't + // want to stackify DBG_VALUE operands - WASM stack locations are less + // useful and less widely supported than WASM local locations. + if (!MRI.hasOneUse(Reg)) + return false; + // The frame base always has an implicit DBG use as DW_AT_frame_base. + if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg) + return false; + return true; + } + // Most registers are in SSA form here so we try a quick MRI query first. if (MRI.hasOneNonDBGUse(Reg)) return true; + if (LIS == nullptr) + return false; + bool HasOne = false; - const LiveInterval &LI = LIS.getInterval(Reg); + const LiveInterval &LI = LIS->getInterval(Reg); const VNInfo *DefVNI = - LI.getVNInfoAt(LIS.getInstructionIndex(*Def).getRegSlot()); + LI.getVNInfoAt(LIS->getInstructionIndex(*Def).getRegSlot()); assert(DefVNI); for (auto &I : MRI.use_nodbg_operands(Reg)) { - const auto &Result = LI.Query(LIS.getInstructionIndex(*I.getParent())); + const auto &Result = LI.Query(LIS->getInstructionIndex(*I.getParent())); if (Result.valueIn() == DefVNI) { if (!Result.isKill()) return false; @@ -311,7 +333,7 @@ static bool hasOneNonDBGUse(unsigned Reg, MachineInstr *Def, static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use, const MachineInstr *Insert, const WebAssemblyFunctionInfo &MFI, - const MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI, bool Optimize) { const MachineInstr *DefI = Def->getParent(); const 
MachineInstr *UseI = Use->getParent(); assert(DefI->getParent() == Insert->getParent()); @@ -357,6 +379,12 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use, if (NextI == Insert) return true; + // When not optimizing, we only handle the trivial case above + // to guarantee no impact to debugging and to avoid spending + // compile time. + if (!Optimize) + return false; + // 'catch' and 'catch_all' should be the first instruction of a BB and cannot // move. if (WebAssembly::isCatch(DefI->getOpcode())) @@ -520,14 +548,15 @@ static void shrinkToUses(LiveInterval &LI, LiveIntervals &LIS) { /// dependencies; move the def down and nest it with the current instruction. static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB, - MachineInstr *Insert, LiveIntervals &LIS, + MachineInstr *Insert, LiveIntervals *LIS, WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI) { LLVM_DEBUG(dbgs() << "Move for single use: "; Def->dump()); WebAssemblyDebugValueManager DefDIs(Def); DefDIs.sink(Insert); - LIS.handleMove(*Def); + if (LIS != nullptr) + LIS->handleMove(*Def); if (MRI.hasOneDef(Reg) && MRI.hasOneNonDBGUse(Reg)) { // No one else is using this register for anything so we can just stackify @@ -540,17 +569,18 @@ static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op, Op.setReg(NewReg); DefDIs.updateReg(NewReg); - // Tell LiveIntervals about the new register. - LIS.createAndComputeVirtRegInterval(NewReg); + if (LIS != nullptr) { + // Tell LiveIntervals about the new register. + LIS->createAndComputeVirtRegInterval(NewReg); - // Tell LiveIntervals about the changes to the old register. - LiveInterval &LI = LIS.getInterval(Reg); - LI.removeSegment(LIS.getInstructionIndex(*Def).getRegSlot(), - LIS.getInstructionIndex(*Op.getParent()).getRegSlot(), - /*RemoveDeadValNo=*/true); + // Tell LiveIntervals about the changes to the old register. 
+ LiveInterval &LI = LIS->getInterval(Reg); + LI.removeSegment(LIS->getInstructionIndex(*Def).getRegSlot(), + LIS->getInstructionIndex(*Op.getParent()).getRegSlot(), + /*RemoveDeadValNo=*/true); + } MFI.stackifyVReg(MRI, NewReg); - LLVM_DEBUG(dbgs() << " - Replaced register: "; Def->dump()); } @@ -567,11 +597,12 @@ static MachineInstr *getPrevNonDebugInst(MachineInstr *MI) { /// A trivially cloneable instruction; clone it and nest the new copy with the /// current instruction. -static MachineInstr *rematerializeCheapDef( - unsigned Reg, MachineOperand &Op, MachineInstr &Def, MachineBasicBlock &MBB, - MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS, - WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI, - const WebAssemblyInstrInfo *TII, const WebAssemblyRegisterInfo *TRI) { +static MachineInstr * +rematerializeCheapDef(unsigned Reg, MachineOperand &Op, MachineInstr &Def, + MachineBasicBlock::instr_iterator Insert, + LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI, + MachineRegisterInfo &MRI, + const WebAssemblyInstrInfo *TII) { LLVM_DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump()); LLVM_DEBUG(dbgs() << " - for use in "; Op.getParent()->dump()); @@ -811,9 +842,12 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); - const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo(); - auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); - auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); + MachineDominatorTree *MDT = nullptr; + LiveIntervals *LIS = nullptr; + if (Optimize) { + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); + LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS(); + } // Walk the instructions from the bottom up. 
Currently we don't look past // block boundaries, and the blocks aren't ordered so the block visitation @@ -876,23 +910,28 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { // supports intra-block moves) and it's MachineSink's job to catch all // the sinking opportunities anyway. bool SameBlock = DefI->getParent() == &MBB; - bool CanMove = SameBlock && isSafeToMove(Def, &Use, Insert, MFI, MRI) && + bool CanMove = SameBlock && + isSafeToMove(Def, &Use, Insert, MFI, MRI, Optimize) && !TreeWalker.isOnStack(Reg); - if (CanMove && hasOneNonDBGUse(Reg, DefI, MRI, MDT, LIS)) { + if (CanMove && hasSingleUse(Reg, MRI, MFI, Optimize, DefI, LIS)) { Insert = moveForSingleUse(Reg, Use, DefI, MBB, Insert, LIS, MFI, MRI); // If we are removing the frame base reg completely, remove the debug // info as well. // TODO: Encode this properly as a stackified value. - if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg) + if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg) { + assert( + Optimize && + "Stackifying away frame base in unoptimized code not expected"); MFI.clearFrameBaseVreg(); - } else if (shouldRematerialize(*DefI, TII)) { - Insert = - rematerializeCheapDef(Reg, Use, *DefI, MBB, Insert->getIterator(), - LIS, MFI, MRI, TII, TRI); - } else if (CanMove && oneUseDominatesOtherUses(Reg, Use, MBB, MRI, MDT, - LIS, MFI)) { - Insert = moveAndTeeForMultiUse(Reg, Use, DefI, MBB, Insert, LIS, MFI, + } + } else if (Optimize && shouldRematerialize(*DefI, TII)) { + Insert = rematerializeCheapDef(Reg, Use, *DefI, Insert->getIterator(), + *LIS, MFI, MRI, TII); + } else if (Optimize && CanMove && + oneUseDominatesOtherUses(Reg, Use, MBB, MRI, *MDT, *LIS, + MFI)) { + Insert = moveAndTeeForMultiUse(Reg, Use, DefI, MBB, Insert, *LIS, MFI, MRI, TII); } else { // We failed to stackify the operand. 
If the problem was ordering @@ -915,7 +954,8 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { Register DefReg = SubsequentDef->getReg(); Register UseReg = SubsequentUse->getReg(); // TODO: This single-use restriction could be relaxed by using tees - if (DefReg != UseReg || !MRI.hasOneNonDBGUse(DefReg)) + if (DefReg != UseReg || + !hasSingleUse(DefReg, MRI, MFI, Optimize, nullptr, nullptr)) break; MFI.stackifyVReg(MRI, DefReg); ++SubsequentDef; @@ -926,7 +966,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { // to a constant 0 so that the def is explicit, and the push/pop // correspondence is maintained. if (Insert->getOpcode() == TargetOpcode::IMPLICIT_DEF) - convertImplicitDefToConstZero(Insert, MRI, TII, MF, LIS); + convertImplicitDefToConstZero(Insert, MRI, TII, MF); // We stackified an operand. Add the defining instruction's operands to // the worklist stack now to continue to build an ever deeper tree. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 6e551e5c..378af22 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -602,14 +602,16 @@ void WebAssemblyPassConfig::addPreEmitPass() { // Prepare memory intrinsic calls for register stackifying. addPass(createWebAssemblyMemIntrinsicResults()); + } - // Mark registers as representing wasm's value stack. This is a key - // code-compression technique in WebAssembly. We run this pass (and - // MemIntrinsicResults above) very late, so that it sees as much code as - // possible, including code emitted by PEI and expanded by late tail - // duplication. - addPass(createWebAssemblyRegStackify()); + // Mark registers as representing wasm's value stack. This is a key + // code-compression technique in WebAssembly. 
We run this pass (and + // MemIntrinsicResults above) very late, so that it sees as much code as + // possible, including code emitted by PEI and expanded by late tail + // duplication. + addPass(createWebAssemblyRegStackify(getOptLevel())); + if (getOptLevel() != CodeGenOptLevel::None) { // Run the register coloring pass to reduce the total number of registers. // This runs after stackification so that it doesn't consider registers // that become stackified. diff --git a/llvm/test/CodeGen/WebAssembly/PR40172.ll b/llvm/test/CodeGen/WebAssembly/PR40172.ll index ed1630c..e70b115 100644 --- a/llvm/test/CodeGen/WebAssembly/PR40172.ll +++ b/llvm/test/CodeGen/WebAssembly/PR40172.ll @@ -10,9 +10,9 @@ target triple = "wasm32-unknown-unknown" ; CHECK: i32.sub $[[BASE:[0-9]+]]=, ; CHECK: local.copy $[[ARG:[0-9]+]]=, $0{{$}} -; CHECK: i32.const $[[A0:[0-9]+]]=, 1{{$}} -; CHECK: i32.and $[[A1:[0-9]+]]=, $[[ARG]], $[[A0]]{{$}} -; CHECK: i32.store8 8($[[BASE]]), $[[A1]]{{$}} +; CHECK: i32.const $push[[A0:[0-9]+]]=, 1{{$}} +; CHECK: i32.and $push[[A1:[0-9]+]]=, $[[ARG]], $pop[[A0]]{{$}} +; CHECK: i32.store8 8($[[BASE]]), $pop[[A1]]{{$}} define void @test(i8 %byte) { %t = alloca { i8, i8 }, align 8 diff --git a/llvm/test/CodeGen/WebAssembly/PR41841.ll b/llvm/test/CodeGen/WebAssembly/PR41841.ll index 4e3166a..76ec83e 100644 --- a/llvm/test/CodeGen/WebAssembly/PR41841.ll +++ b/llvm/test/CodeGen/WebAssembly/PR41841.ll @@ -6,9 +6,9 @@ declare void @foo(i128) ; CHECK-LABEL: test_zext: ; CHECK-NEXT: .functype test_zext (i32) -> (){{$}} -; CHECK-NEXT: i64.extend_i32_u $[[TMP3:[0-9]+]]=, $0{{$}} -; CHECK-NEXT: i64.const $[[TMP4:[0-9]+]]=, 1{{$}} -; CHECK-NEXT: i64.and $[[TMP1:[0-9]+]]=, $[[TMP3]], $[[TMP4]]{{$}} +; CHECK-NEXT: i64.extend_i32_u $push[[TMP3:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: i64.const $push[[TMP4:[0-9]+]]=, 1{{$}} +; CHECK-NEXT: i64.and $[[TMP1:[0-9]+]]=, $pop[[TMP3]], $pop[[TMP4]]{{$}} ; CHECK-NEXT: i64.const $[[TMP2:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: call foo, $[[TMP1]], 
$[[TMP2]]{{$}} ; CHECK-NEXT: return{{$}} @@ -23,11 +23,11 @@ next: ; preds = %start ; CHECK-LABEL: test_sext: ; CHECK-NEXT:.functype test_sext (i32) -> (){{$}} -; CHECK-NEXT: i64.extend_i32_u $[[TMP3:[0-9]+]]=, $0{{$}} -; CHECK-NEXT: i64.const $[[TMP4:[0-9]+]]=, 1{{$}} -; CHECK-NEXT: i64.and $[[TMP5:[0-9]+]]=, $[[TMP3]], $[[TMP4]]{{$}} -; CHECK-NEXT: i64.const $[[TMP6:[0-9]+]]=, 0{{$}} -; CHECK-NEXT: i64.sub $[[TMP1:[0-9]+]]=, $[[TMP6]], $[[TMP5]]{{$}} +; CHECK-NEXT: i64.extend_i32_u $push[[TMP3:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: i64.const $push[[TMP4:[0-9]+]]=, 1{{$}} +; CHECK-NEXT: i64.and $[[TMP5:[0-9]+]]=, $pop[[TMP3]], $pop[[TMP4]]{{$}} +; CHECK-NEXT: i64.const $push[[TMP6:[0-9]+]]=, 0{{$}} +; CHECK-NEXT: i64.sub $[[TMP1:[0-9]+]]=, $pop[[TMP6]], $[[TMP5]]{{$}} ; CHECK-NEXT: local.copy $[[TMP2:[0-9]+]]=, $[[TMP1]]{{$}} ; CHECK-NEXT: call foo, $[[TMP1]], $[[TMP2]]{{$}} ; CHECK-NEXT: return{{$}} diff --git a/llvm/test/CodeGen/WebAssembly/debug-code-stackification.ll b/llvm/test/CodeGen/WebAssembly/debug-code-stackification.ll new file mode 100644 index 0000000..98d0f02 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/debug-code-stackification.ll @@ -0,0 +1,89 @@ +; RUN: llc < %s -O0 --filetype=obj -o - | llvm-dwarfdump - | FileCheck %s --check-prefixes DBG_USE + +target triple = "wasm32-unknown-unknown" + +declare i32 @extern_func(i32, i32) + +; We want to produce WASM local "DW_OP_WASM_location 0x00 <local number>" locations +; in debug code instead of operand stack ("DW_OP_WASM_location 0x02 <depth>") locations +; since local locations are more widely supported and can cover the entirety of the method. 
+; DBG_USE: DW_TAG_subprogram +; DBG_USE: DW_AT_name ("single_non_dbg_use") +; DBG_USE: DW_TAG_variable +; DBG_USE: DW_AT_location +; DBG_USE: DW_OP_WASM_location 0x0 +; DBG_USE: DW_AT_name ("call_value") +; DBG_USE: DW_TAG_variable +; DBG_USE: DW_AT_location +; DBG_USE: DW_OP_WASM_location 0x0 +; DBG_USE: DW_AT_name ("sub_value") +define i32 @single_non_dbg_use(i32 %0, i32 %1) !dbg !6 { + %call_value = call i32 @extern_func(i32 1, i32 2), !dbg !20 + call void @llvm.dbg.value(metadata i32 %call_value, metadata !11, metadata !DIExpression()), !dbg !20 + %div = udiv i32 %0, %1, !dbg !21 + %sub = sub i32 %call_value, %div, !dbg !22 + call void @llvm.dbg.value(metadata i32 %sub, metadata !12, metadata !DIExpression()), !dbg !22 + ret i32 %sub, !dbg !23 +} + +!6 = distinct !DISubprogram(name: "single_non_dbg_use", scope: !1, file: !1, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0) +!7 = !DISubroutineType(types: !8) +!8 = !{!9, !9, !9} +!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DILocalVariable(name: "call_value", scope: !6, type: !9) +!12 = !DILocalVariable(name: "sub_value", scope: !6, type: !9) +!20 = !DILocation(line: 20, scope: !6) +!21 = !DILocation(line: 21, scope: !6) +!22 = !DILocation(line: 22, scope: !6) +!23 = !DILocation(line: 23, scope: !6) + +; Similarly for a singly-used frame base. 
+; DBG_USE: DW_TAG_subprogram +; DBG_USE: DW_AT_frame_base (DW_OP_WASM_location 0x0 +; DBG_USE: DW_AT_name ("single_use_frame_base") +; DBG_USE: DW_TAG_variable +; DBG_USE: DW_AT_location (DW_OP_fbreg +12) +; DBG_USE: DW_AT_name ("arg_value") +define i32 @single_use_frame_base(i32 %0, i32 %1) !dbg !13 { + %arg_loc = alloca i32, !dbg !24 + store i32 %1, ptr %arg_loc, !dbg !25 + call void @llvm.dbg.declare(metadata ptr %arg_loc, metadata !14, metadata !DIExpression()), !dbg !25 + ret i32 %0, !dbg !26 +} + +!13 = distinct !DISubprogram(name: "single_use_frame_base", scope: !1, file: !1, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0) +!14 = !DILocalVariable(name: "arg_value", scope: !13, type: !9) +!24 = !DILocation(line: 24, scope: !13) +!25 = !DILocation(line: 25, scope: !13) +!26 = !DILocation(line: 26, scope: !13) + +; Similarly for multivalue defs... But we can't really test +; it due to https://github.com/llvm/llvm-project/issues/136506. +; declare {i32, i32} @extern_func_multivalue(i32, i32) +; +; define i32 @single_non_dbg_use_multivalue(i32 %0, i32 %1) !dbg !15 { +; %full_value = call {i32, i32} @extern_func_multivalue(i32 1, i32 2), !dbg !27 +; %full_value_one = extractvalue {i32, i32} %full_value, 0, !dbg !27 +; %full_value_two = extractvalue {i32, i32} %full_value, 1, !dbg !27 +; %partial_value = call {i32, i32} @extern_func_multivalue(i32 %full_value_one, i32 %full_value_two), !dbg !28 +; call void @llvm.dbg.value(metadata i32 %full_value_two, metadata !16, metadata !DIExpression()), !dbg !28 +; %partial_value_one = extractvalue {i32, i32} %partial_value, 0, !dbg !28 +; %partial_value_two = extractvalue {i32, i32} %partial_value, 1, !dbg !28 +; call void @llvm.dbg.value(metadata i32 %partial_value_two, metadata !17, metadata !DIExpression()), !dbg !28 +; ret i32 %partial_value_one, !dbg !29 +; } +; +; !15 = distinct !DISubprogram(name: "single_non_dbg_use_multivalue", scope: !1, file: !1, type: !7, flags: DIFlagPrototyped, 
spFlags: DISPFlagDefinition, unit: !0) +; !16 = !DILocalVariable(name: "value_used", scope: !15, type: !9) +; !17 = !DILocalVariable(name: "value_unused", scope: !15, type: !9) +; !27 = !DILocation(line: 27, scope: !15) +; !28 = !DILocation(line: 28, scope: !15) +; !29 = !DILocation(line: 29, scope: !15) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "LLC", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.ll", directory: "") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/llvm/test/CodeGen/WebAssembly/pr51651.ll b/llvm/test/CodeGen/WebAssembly/pr51651.ll index 56aa99b..7328574 100644 --- a/llvm/test/CodeGen/WebAssembly/pr51651.ll +++ b/llvm/test/CodeGen/WebAssembly/pr51651.ll @@ -9,17 +9,17 @@ define i32 @test(ptr %p, ptr %p2) { ; CHECK-NEXT: i32.eqz $2=, $3 ; CHECK-NEXT: i32.store8 0($1), $3 ; CHECK-NEXT: # %bb.1: # %bb2 -; CHECK-NEXT: i32.const $4=, 1 -; CHECK-NEXT: i32.and $5=, $2, $4 ; CHECK-NEXT: block -; CHECK-NEXT: br_if 0, $5 # 0: down to label0 +; CHECK-NEXT: i32.const $push0=, 1 +; CHECK-NEXT: i32.and $push1=, $2, $pop0 +; CHECK-NEXT: br_if 0, $pop1 # 0: down to label0 ; CHECK-NEXT: # %bb.2: # %bb4 -; CHECK-NEXT: i32.const $6=, 0 -; CHECK-NEXT: return $6 +; CHECK-NEXT: i32.const $push2=, 0 +; CHECK-NEXT: return $pop2 ; CHECK-NEXT: .LBB0_3: # %bb3 ; CHECK-NEXT: end_block # label0: -; CHECK-NEXT: i32.const $7=, 1 -; CHECK-NEXT: return $7 +; CHECK-NEXT: i32.const $push3=, 1 +; CHECK-NEXT: return $pop3 %v = load i8, ptr %p %v.ext = zext i8 %v to i32 %cond = icmp eq i32 %v.ext, 0 diff --git a/llvm/test/CodeGen/WebAssembly/signext-zeroext-callsite.ll b/llvm/test/CodeGen/WebAssembly/signext-zeroext-callsite.ll index e33337f..f7f0b68 100644 --- a/llvm/test/CodeGen/WebAssembly/signext-zeroext-callsite.ll +++ 
b/llvm/test/CodeGen/WebAssembly/signext-zeroext-callsite.ll @@ -12,7 +12,7 @@ declare i32 @foo(i1 signext noundef, i32 noundef) define i32 @callsite_nosignext() { ; CHECK-LABEL: callsite_nosignext: ; CHECK: .functype callsite_nosignext () -> (i32) -; CHECK-NEXT: .local i32, i32, i32, i32, i32, i32 +; CHECK-NEXT: .local i32, i32, i32 ; CHECK-NEXT: # %bb.0: # %start ; CHECK-NEXT: i32.const 1 ; CHECK-NEXT: local.set 0 @@ -23,31 +23,21 @@ define i32 @callsite_nosignext() { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.shl -; CHECK-NEXT: local.set 3 -; CHECK-NEXT: local.get 3 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.shr_s -; CHECK-NEXT: local.set 4 -; CHECK-NEXT: local.get 4 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: call foo -; CHECK-NEXT: local.set 5 -; CHECK-NEXT: local.get 5 ; CHECK-NEXT: return ; ; NO-FAST-ISEL-LABEL: callsite_nosignext: ; NO-FAST-ISEL: .functype callsite_nosignext () -> (i32) -; NO-FAST-ISEL-NEXT: .local i32, i32, i32 +; NO-FAST-ISEL-NEXT: .local i32 ; NO-FAST-ISEL-NEXT: # %bb.0: # %start ; NO-FAST-ISEL-NEXT: i32.const 0 ; NO-FAST-ISEL-NEXT: local.set 0 ; NO-FAST-ISEL-NEXT: i32.const -1 -; NO-FAST-ISEL-NEXT: local.set 1 -; NO-FAST-ISEL-NEXT: local.get 1 ; NO-FAST-ISEL-NEXT: local.get 0 ; NO-FAST-ISEL-NEXT: call foo -; NO-FAST-ISEL-NEXT: local.set 2 -; NO-FAST-ISEL-NEXT: local.get 2 ; NO-FAST-ISEL-NEXT: return start: %0 = call i32 @foo(i1 1, i32 0) @@ -57,7 +47,7 @@ start: define i32 @callsite_signext() { ; CHECK-LABEL: callsite_signext: ; CHECK: .functype callsite_signext () -> (i32) -; CHECK-NEXT: .local i32, i32, i32, i32, i32, i32 +; CHECK-NEXT: .local i32, i32, i32 ; CHECK-NEXT: # %bb.0: # %start ; CHECK-NEXT: i32.const 1 ; CHECK-NEXT: local.set 0 @@ -68,31 +58,21 @@ define i32 @callsite_signext() { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.shl -; CHECK-NEXT: local.set 3 -; CHECK-NEXT: local.get 3 ; CHECK-NEXT: local.get 2 ; CHECK-NEXT: i32.shr_s -; CHECK-NEXT: local.set 4 -; CHECK-NEXT: 
local.get 4 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: call foo -; CHECK-NEXT: local.set 5 -; CHECK-NEXT: local.get 5 ; CHECK-NEXT: return ; ; NO-FAST-ISEL-LABEL: callsite_signext: ; NO-FAST-ISEL: .functype callsite_signext () -> (i32) -; NO-FAST-ISEL-NEXT: .local i32, i32, i32 +; NO-FAST-ISEL-NEXT: .local i32 ; NO-FAST-ISEL-NEXT: # %bb.0: # %start ; NO-FAST-ISEL-NEXT: i32.const 0 ; NO-FAST-ISEL-NEXT: local.set 0 ; NO-FAST-ISEL-NEXT: i32.const -1 -; NO-FAST-ISEL-NEXT: local.set 1 -; NO-FAST-ISEL-NEXT: local.get 1 ; NO-FAST-ISEL-NEXT: local.get 0 ; NO-FAST-ISEL-NEXT: call foo -; NO-FAST-ISEL-NEXT: local.set 2 -; NO-FAST-ISEL-NEXT: local.get 2 ; NO-FAST-ISEL-NEXT: return start: %0 = call i32 @foo(i1 signext 1, i32 0) @@ -106,38 +86,28 @@ declare i32 @foo2(i1 zeroext noundef, i32 noundef) define i32 @callsite_nozeroext() { ; CHECK-LABEL: callsite_nozeroext: ; CHECK: .functype callsite_nozeroext () -> (i32) -; CHECK-NEXT: .local i32, i32, i32, i32, i32 +; CHECK-NEXT: .local i32, i32 ; CHECK-NEXT: # %bb.0: # %start ; CHECK-NEXT: i32.const 1 ; CHECK-NEXT: local.set 0 ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: local.set 1 -; CHECK-NEXT: i32.const 1 -; CHECK-NEXT: local.set 2 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.const 1 ; CHECK-NEXT: i32.and -; CHECK-NEXT: local.set 3 -; CHECK-NEXT: local.get 3 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: call foo2 -; CHECK-NEXT: local.set 4 -; CHECK-NEXT: local.get 4 ; CHECK-NEXT: return ; ; NO-FAST-ISEL-LABEL: callsite_nozeroext: ; NO-FAST-ISEL: .functype callsite_nozeroext () -> (i32) -; NO-FAST-ISEL-NEXT: .local i32, i32, i32 +; NO-FAST-ISEL-NEXT: .local i32 ; NO-FAST-ISEL-NEXT: # %bb.0: # %start ; NO-FAST-ISEL-NEXT: i32.const 0 ; NO-FAST-ISEL-NEXT: local.set 0 ; NO-FAST-ISEL-NEXT: i32.const 1 -; NO-FAST-ISEL-NEXT: local.set 1 -; NO-FAST-ISEL-NEXT: local.get 1 ; NO-FAST-ISEL-NEXT: local.get 0 ; NO-FAST-ISEL-NEXT: call foo2 -; NO-FAST-ISEL-NEXT: local.set 2 -; NO-FAST-ISEL-NEXT: local.get 2 ; NO-FAST-ISEL-NEXT: 
return start: %0 = call i32 @foo2(i1 1, i32 0) @@ -147,38 +117,28 @@ start: define i32 @callsite_zeroext() { ; CHECK-LABEL: callsite_zeroext: ; CHECK: .functype callsite_zeroext () -> (i32) -; CHECK-NEXT: .local i32, i32, i32, i32, i32 +; CHECK-NEXT: .local i32, i32 ; CHECK-NEXT: # %bb.0: # %start ; CHECK-NEXT: i32.const 1 ; CHECK-NEXT: local.set 0 ; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: local.set 1 -; CHECK-NEXT: i32.const 1 -; CHECK-NEXT: local.set 2 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.const 1 ; CHECK-NEXT: i32.and -; CHECK-NEXT: local.set 3 -; CHECK-NEXT: local.get 3 ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: call foo2 -; CHECK-NEXT: local.set 4 -; CHECK-NEXT: local.get 4 ; CHECK-NEXT: return ; ; NO-FAST-ISEL-LABEL: callsite_zeroext: ; NO-FAST-ISEL: .functype callsite_zeroext () -> (i32) -; NO-FAST-ISEL-NEXT: .local i32, i32, i32 +; NO-FAST-ISEL-NEXT: .local i32 ; NO-FAST-ISEL-NEXT: # %bb.0: # %start ; NO-FAST-ISEL-NEXT: i32.const 0 ; NO-FAST-ISEL-NEXT: local.set 0 ; NO-FAST-ISEL-NEXT: i32.const 1 -; NO-FAST-ISEL-NEXT: local.set 1 -; NO-FAST-ISEL-NEXT: local.get 1 ; NO-FAST-ISEL-NEXT: local.get 0 ; NO-FAST-ISEL-NEXT: call foo2 -; NO-FAST-ISEL-NEXT: local.set 2 -; NO-FAST-ISEL-NEXT: local.get 2 ; NO-FAST-ISEL-NEXT: return start: %0 = call i32 @foo2(i1 zeroext 1, i32 0) diff --git a/llvm/test/CodeGen/WebAssembly/suboptimal-compare.ll b/llvm/test/CodeGen/WebAssembly/suboptimal-compare.ll index fadeb39..9a2716b 100644 --- a/llvm/test/CodeGen/WebAssembly/suboptimal-compare.ll +++ b/llvm/test/CodeGen/WebAssembly/suboptimal-compare.ll @@ -4,32 +4,19 @@ target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: gh_80053: # @gh_80053 ; CHECK-NEXT: .functype gh_80053 (i32) -> (i32) -; CHECK-NEXT: .local i32, i32, i32, i32, i32, i32 -; CHECK: i32.const 0 -; CHECK-NEXT: local.set 1 +; CHECK: block ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 0 ; CHECK-NEXT: i32.eq -; CHECK-NEXT: local.set 2 ; CHECK-NEXT: 
i32.const 1 -; CHECK-NEXT: local.set 3 -; CHECK-NEXT: local.get 2 -; CHECK-NEXT: local.get 3 ; CHECK-NEXT: i32.and -; CHECK-NEXT: local.set 4 -; CHECK-NEXT: block -; CHECK-NEXT: local.get 4 -; CHECK-NEXT: i32.eqz -; CHECK-NEXT: br_if 0 # 0: down to label0 -; CHECK: i32.const 0 -; CHECK-NEXT: local.set 5 -; CHECK-NEXT: local.get 5 -; CHECK-NEXT: return +; CHECK-NEXT: i32.eqz +; CHECK-NEXT: br_if 0 # 0: down to label0 +; CHECK: i32.const 0 +; CHECK-NEXT: return ; CHECK-NEXT: .LBB0_2: # %BB03 ; CHECK-NEXT: end_block # label0: ; CHECK-NEXT: i32.const 1 -; CHECK-NEXT: local.set 6 -; CHECK-NEXT: local.get 6 ; CHECK-NEXT: return ; CHECK-NEXT: end_function define i1 @gh_80053(ptr) { |