From 63f98290d09e1da05fb1217d1b760cbe24b76db9 Mon Sep 17 00:00:00 2001 From: Koakuma Date: Mon, 15 Jan 2024 04:28:51 +0700 Subject: [SPARC] Prefer RDPC over CALL to implement GETPCX for 64-bit target (#77196) On 64-bit target, prefer using RDPC over CALL to get the value of %pc. This is faster on modern processors (Niagara T1 and newer) and avoids polluting the processor's predictor state. The old behavior of using a fake CALL is still done when tuning for classic UltraSPARC processors, since RDPC is much slower there. A quick pgbench test on a SPARC T4 shows about 2% speedup on SELECT loads, and about 7% speedup on INSERT/UPDATE loads. --- llvm/lib/Target/Sparc/Sparc.td | 18 ++++++++--- llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 25 +++++++++++++-- llvm/test/CodeGen/SPARC/getpcx-call.ll | 51 +++++++++++++++++++++++++++++++ llvm/test/CodeGen/SPARC/getpcx-rdpc.ll | 51 +++++++++++++++++++++++++++++++ 4 files changed, 138 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/SPARC/getpcx-call.ll create mode 100644 llvm/test/CodeGen/SPARC/getpcx-rdpc.ll diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index 1a71cfe..7b10339 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -62,6 +62,13 @@ def UsePopc : SubtargetFeature<"popc", "UsePopc", "true", def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", "Use software emulation for floating point">; +//===----------------------------------------------------------------------===// +// SPARC Subtarget tuning features. +// + +def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true", + "rd %pc, %XX is slow", [FeatureV9]>; + //==== Features added predmoninantly for LEON subtarget support include "LeonFeatures.td" @@ -89,8 +96,9 @@ def SparcAsmParserVariant : AsmParserVariant { // SPARC processors supported. 
//===----------------------------------------------------------------------===// -class Proc Features> - : Processor; +class Proc Features, + list TuneFeatures = []> + : Processor; def : Proc<"generic", []>; def : Proc<"v7", [FeatureSoftMulDiv, FeatureNoFSMULD]>; @@ -118,9 +126,11 @@ def : Proc<"ma2480", [FeatureLeon, LeonCASA]>; def : Proc<"ma2485", [FeatureLeon, LeonCASA]>; def : Proc<"ma2x8x", [FeatureLeon, LeonCASA]>; def : Proc<"v9", [FeatureV9]>; -def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>; +def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated, FeatureVIS], + [TuneSlowRDPC]>; def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS, - FeatureVIS2]>; + FeatureVIS2], + [TuneSlowRDPC]>; def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS, FeatureVIS2]>; def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc, diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index cca624e..215a8ea 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -13,6 +13,7 @@ #include "MCTargetDesc/SparcInstPrinter.h" #include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCTargetDesc.h" #include "MCTargetDesc/SparcTargetStreamer.h" #include "Sparc.h" #include "SparcInstrInfo.h" @@ -111,6 +112,15 @@ static void EmitCall(MCStreamer &OutStreamer, OutStreamer.emitInstruction(CallInst, STI); } +static void EmitRDPC(MCStreamer &OutStreamer, MCOperand &RD, + const MCSubtargetInfo &STI) { + MCInst RDPCInst; + RDPCInst.setOpcode(SP::RDASR); + RDPCInst.addOperand(RD); + RDPCInst.addOperand(MCOperand::createReg(SP::ASR5)); + OutStreamer.emitInstruction(RDPCInst, STI); +} + static void EmitSETHI(MCStreamer &OutStreamer, MCOperand &Imm, MCOperand &RD, const MCSubtargetInfo &STI) @@ -226,7 +236,7 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI, MCOperand RegO7 = MCOperand::createReg(SP::O7); // : - 
// call + // // This will be either `call ` or `rd %pc, %o7`. // : // sethi %hi(_GLOBAL_OFFSET_TABLE_+(-)), // : @@ -234,8 +244,17 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI, // add , %o7, OutStreamer->emitLabel(StartLabel); - MCOperand Callee = createPCXCallOP(EndLabel, OutContext); - EmitCall(*OutStreamer, Callee, STI); + if (!STI.getTargetTriple().isSPARC64() || + STI.hasFeature(Sparc::TuneSlowRDPC)) { + MCOperand Callee = createPCXCallOP(EndLabel, OutContext); + EmitCall(*OutStreamer, Callee, STI); + } else { + // TODO find out whether it is possible to store PC + // in other registers, to enable leaf function optimization. + // (On the other hand, approx. over 97.8% of GETPCXes happen + // in non-leaf functions, so would this be worth the effort?) + EmitRDPC(*OutStreamer, RegO7, STI); + } OutStreamer->emitLabel(SethiLabel); MCOperand hiImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC22, GOTLabel, StartLabel, SethiLabel, diff --git a/llvm/test/CodeGen/SPARC/getpcx-call.ll b/llvm/test/CodeGen/SPARC/getpcx-call.ll new file mode 100644 index 0000000..72d7b5a --- /dev/null +++ b/llvm/test/CodeGen/SPARC/getpcx-call.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -relocation-model=pic -mtriple=sparc | FileCheck --check-prefix=SPARC %s +; RUN: llc < %s -relocation-model=pic -mtriple=sparcv9 | FileCheck --check-prefix=SPARC64 %s + +;; SPARC32 and SPARC64 for classic UltraSPARCs implement GETPCX +;; with a fake `call`. +;; All other SPARC64 targets implement it with `rd %pc, %o7`. +;; Need to do the tests in separate files because apparently `tune-cpu` +;; attribute applies to the entire file at once. + +@value = external global i32 + +define i32 @testCall() nounwind #0 { +; SPARC-LABEL: testCall: +; SPARC: ! 
%bb.0: +; SPARC-NEXT: save %sp, -96, %sp +; SPARC-NEXT: .Ltmp0: +; SPARC-NEXT: call .Ltmp1 +; SPARC-NEXT: .Ltmp2: +; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0 +; SPARC-NEXT: .Ltmp1: +; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0 +; SPARC-NEXT: add %i0, %o7, %i0 +; SPARC-NEXT: sethi %hi(value), %i1 +; SPARC-NEXT: add %i1, %lo(value), %i1 +; SPARC-NEXT: ld [%i0+%i1], %i0 +; SPARC-NEXT: ld [%i0], %i0 +; SPARC-NEXT: ret +; SPARC-NEXT: restore +; +; SPARC64-LABEL: testCall: +; SPARC64: ! %bb.0: +; SPARC64-NEXT: save %sp, -128, %sp +; SPARC64-NEXT: .Ltmp0: +; SPARC64-NEXT: call .Ltmp1 +; SPARC64-NEXT: .Ltmp2: +; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0 +; SPARC64-NEXT: .Ltmp1: +; SPARC64-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0 +; SPARC64-NEXT: add %i0, %o7, %i0 +; SPARC64-NEXT: sethi %hi(value), %i1 +; SPARC64-NEXT: add %i1, %lo(value), %i1 +; SPARC64-NEXT: ldx [%i0+%i1], %i0 +; SPARC64-NEXT: ld [%i0], %i0 +; SPARC64-NEXT: ret +; SPARC64-NEXT: restore + %1 = load i32, ptr @value + ret i32 %1 +} + +attributes #0 = { "tune-cpu"="ultrasparc" } diff --git a/llvm/test/CodeGen/SPARC/getpcx-rdpc.ll b/llvm/test/CodeGen/SPARC/getpcx-rdpc.ll new file mode 100644 index 0000000..286750a --- /dev/null +++ b/llvm/test/CodeGen/SPARC/getpcx-rdpc.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -relocation-model=pic -mtriple=sparc | FileCheck --check-prefix=SPARC %s +; RUN: llc < %s -relocation-model=pic -mtriple=sparcv9 | FileCheck --check-prefix=SPARC64 %s + +;; SPARC32 and SPARC64 for classic UltraSPARCs implement GETPCX +;; with a fake `call`. +;; All other SPARC64 targets implement it with `rd %pc, %o7`. +;; Need to do the tests in separate files because apparently `tune-cpu` +;; attribute applies to the entire file at once. 
+ +@value = external global i32 + +define i32 @testRdpc() nounwind #0 { +; SPARC-LABEL: testRdpc: +; SPARC: ! %bb.0: +; SPARC-NEXT: save %sp, -96, %sp +; SPARC-NEXT: .Ltmp0: +; SPARC-NEXT: call .Ltmp1 +; SPARC-NEXT: .Ltmp2: +; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0 +; SPARC-NEXT: .Ltmp1: +; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0 +; SPARC-NEXT: add %i0, %o7, %i0 +; SPARC-NEXT: sethi %hi(value), %i1 +; SPARC-NEXT: add %i1, %lo(value), %i1 +; SPARC-NEXT: ld [%i0+%i1], %i0 +; SPARC-NEXT: ld [%i0], %i0 +; SPARC-NEXT: ret +; SPARC-NEXT: restore +; +; SPARC64-LABEL: testRdpc: +; SPARC64: ! %bb.0: +; SPARC64-NEXT: save %sp, -128, %sp +; SPARC64-NEXT: .Ltmp0: +; SPARC64-NEXT: rd %pc, %o7 +; SPARC64-NEXT: .Ltmp2: +; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0 +; SPARC64-NEXT: .Ltmp1: +; SPARC64-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0 +; SPARC64-NEXT: add %i0, %o7, %i0 +; SPARC64-NEXT: sethi %hi(value), %i1 +; SPARC64-NEXT: add %i1, %lo(value), %i1 +; SPARC64-NEXT: ldx [%i0+%i1], %i0 +; SPARC64-NEXT: ld [%i0], %i0 +; SPARC64-NEXT: ret +; SPARC64-NEXT: restore + %1 = load i32, ptr @value + ret i32 %1 +} + +attributes #0 = { "tune-cpu"="niagara" } -- cgit v1.1