Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  10
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  144
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp  2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp  73
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h  21
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp  16
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp  4
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt  1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h  3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  13
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp  140
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h  5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td  4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  114
-rw-r--r--  llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp  2
-rw-r--r--  llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp  2
-rw-r--r--  llvm/lib/Target/BPF/BTFDebug.cpp  112
-rw-r--r--  llvm/lib/Target/Hexagon/CMakeLists.txt  1
-rw-r--r--  llvm/lib/Target/Hexagon/Hexagon.h  3
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonGenInsert.cpp  8
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp  334
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp  2
-rw-r--r--  llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp  2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp  18
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.td  24
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZb.td  38
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp  11
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp  14
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp  14
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h  14
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp  21
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp  6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h  4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.cpp  15
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.h  3
-rw-r--r--  llvm/lib/Target/X86/X86.td  1
36 files changed, 902 insertions, 297 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 662d84b..a81de5c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27602,6 +27602,15 @@ static SDValue performPTestFirstCombine(SDNode *N,
static SDValue
performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ // If a DUP(Op0) already exists, reuse it for the scalar_to_vector.
+ if (DCI.isAfterLegalizeDAG()) {
+ if (SDNode *LN = DCI.DAG.getNodeIfExists(AArch64ISD::DUP, N->getVTList(),
+ N->getOperand(0)))
+ return SDValue(LN, 0);
+ }
+
// Let's do below transform.
//
// t34: v4i32 = AArch64ISD::UADDLV t2
@@ -27638,7 +27647,6 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
// Let's generate new sequence with AArch64ISD::NVCAST.
- SDLoc DL(N);
SDValue EXTRACT_SUBVEC =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
DAG.getConstant(0, DL, MVT::i64));
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 12c600f..d5117da 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -700,7 +700,7 @@ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
- unsigned *NewVReg = nullptr) {
+ unsigned *NewReg = nullptr) {
VReg = removeCopies(MRI, VReg);
if (!Register::isVirtualRegister(VReg))
return 0;
@@ -708,8 +708,37 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
unsigned Opc = 0;
- unsigned SrcOpNum = 0;
+ unsigned SrcReg = 0;
switch (DefMI->getOpcode()) {
+ case AArch64::SUBREG_TO_REG:
+ // Check for the following way to define a 64-bit immediate:
+ // %0:gpr32 = MOVi32imm 1
+ // %1:gpr64 = SUBREG_TO_REG 0, %0:gpr32, %subreg.sub_32
+ if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 0)
+ return 0;
+ if (!DefMI->getOperand(2).isReg())
+ return 0;
+ if (!DefMI->getOperand(3).isImm() ||
+ DefMI->getOperand(3).getImm() != AArch64::sub_32)
+ return 0;
+ DefMI = MRI.getVRegDef(DefMI->getOperand(2).getReg());
+ if (DefMI->getOpcode() != AArch64::MOVi32imm)
+ return 0;
+ if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
+ return 0;
+ assert(Is64Bit);
+ SrcReg = AArch64::XZR;
+ Opc = AArch64::CSINCXr;
+ break;
+
+ case AArch64::MOVi32imm:
+ case AArch64::MOVi64imm:
+ if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
+ return 0;
+ SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+ Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
+ break;
+
case AArch64::ADDSXri:
case AArch64::ADDSWri:
// if NZCV is used, do not fold.
@@ -724,7 +753,7 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
DefMI->getOperand(3).getImm() != 0)
return 0;
- SrcOpNum = 1;
+ SrcReg = DefMI->getOperand(1).getReg();
Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
break;
@@ -734,7 +763,7 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
return 0;
- SrcOpNum = 2;
+ SrcReg = DefMI->getOperand(2).getReg();
Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
break;
}
@@ -753,17 +782,17 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
return 0;
- SrcOpNum = 2;
+ SrcReg = DefMI->getOperand(2).getReg();
Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
break;
}
default:
return 0;
}
- assert(Opc && SrcOpNum && "Missing parameters");
+ assert(Opc && SrcReg && "Missing parameters");
- if (NewVReg)
- *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
+ if (NewReg)
+ *NewReg = SrcReg;
return Opc;
}
@@ -964,28 +993,34 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
// Try folding simple instructions into the csel.
if (TryFold) {
- unsigned NewVReg = 0;
- unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
+ unsigned NewReg = 0;
+ unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
if (FoldedOpc) {
// The folded opcodes csinc, csinc and csneg apply the operation to
// FalseReg, so we need to invert the condition.
CC = AArch64CC::getInvertedCondCode(CC);
TrueReg = FalseReg;
} else
- FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
+ FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
// Fold the operation. Leave any dead instructions for DCE to clean up.
if (FoldedOpc) {
- FalseReg = NewVReg;
+ FalseReg = NewReg;
Opc = FoldedOpc;
- // The extends the live range of NewVReg.
- MRI.clearKillFlags(NewVReg);
+ // Extend the live range of NewReg.
+ MRI.clearKillFlags(NewReg);
}
}
// Pull all virtual register into the appropriate class.
MRI.constrainRegClass(TrueReg, RC);
- MRI.constrainRegClass(FalseReg, RC);
+ // FalseReg might be WZR or XZR if the folded operand is a literal 1.
+ assert(
+ (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
+ FalseReg == AArch64::XZR) &&
+ "FalseReg was folded into a non-virtual register other than WZR or XZR");
+ if (FalseReg.isVirtual())
+ MRI.constrainRegClass(FalseReg, RC);
// Insert the csel.
BuildMI(MBB, I, DL, get(Opc), DstReg)
@@ -5063,7 +5098,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
bool RenamableDest,
bool RenamableSrc) const {
if (AArch64::GPR32spRegClass.contains(DestReg) &&
- (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
+ AArch64::GPR32spRegClass.contains(SrcReg)) {
if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
// If either operand is WSP, expand to ADD #0.
if (Subtarget.hasZeroCycleRegMoveGPR64() &&
@@ -5088,21 +5123,14 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
}
- } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) {
- BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
- .addImm(0)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
!Subtarget.hasZeroCycleRegMoveGPR32()) {
// Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
&AArch64::GPR64spRegClass);
assert(DestRegX.isValid() && "Destination super-reg not valid");
- MCRegister SrcRegX =
- SrcReg == AArch64::WZR
- ? AArch64::XZR
- : RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
- &AArch64::GPR64spRegClass);
+ MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
+ &AArch64::GPR64spRegClass);
assert(SrcRegX.isValid() && "Source super-reg not valid");
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
@@ -5121,6 +5149,51 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ // GPR32 zeroing
+ if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
+ if (Subtarget.hasZeroCycleZeroingGPR32()) {
+ BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
+ .addImm(0)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
+ .addReg(AArch64::WZR)
+ .addReg(AArch64::WZR);
+ }
+ return;
+ }
+
+ if (AArch64::GPR64spRegClass.contains(DestReg) &&
+ AArch64::GPR64spRegClass.contains(SrcReg)) {
+ if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
+ // If either operand is SP, expand to ADD #0.
+ BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ } else {
+ // Otherwise, expand to ORR XZR.
+ BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
+ .addReg(AArch64::XZR)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
+ return;
+ }
+
+ // GPR64 zeroing
+ if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
+ if (Subtarget.hasZeroCycleZeroingGPR64()) {
+ BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
+ .addImm(0)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::XZR);
+ }
+ return;
+ }
+
// Copy a Predicate register by ORRing with itself.
if (AArch64::PPRRegClass.contains(DestReg) &&
AArch64::PPRRegClass.contains(SrcReg)) {
@@ -5205,27 +5278,6 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- if (AArch64::GPR64spRegClass.contains(DestReg) &&
- (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
- if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
- // If either operand is SP, expand to ADD #0.
- BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc))
- .addImm(0)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
- } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) {
- BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
- .addImm(0)
- .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
- } else {
- // Otherwise, expand to ORR XZR.
- BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
- .addReg(AArch64::XZR)
- .addReg(SrcReg, getKillRegState(KillSrc));
- }
- return;
- }
-
// Copy a DDDD register quad by copying the individual sub-registers.
if (AArch64::DDDDRegClass.contains(DestReg) &&
AArch64::DDDDRegClass.contains(SrcReg)) {
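A rough source-level illustration of what the new MOVi32imm / SUBREG_TO_REG cases in canFoldIntoCSel enable (my own example, not part of the patch; the exact registers and condition code depend on how the surrounding compare is lowered and on which path materializes the select): a select whose taken value is the constant 1 can now fold into a single csinc against the zero register instead of keeping a separate move of #1.

// Hypothetical example compiled for AArch64.
long pick(bool c, long x) {
  // Conceptually: without the fold this keeps a "mov wN, #1" feeding a csel;
  // with the fold the select can become "csinc x0, <x>, xzr, <inverted cond>",
  // since xzr + 1 produces the constant 1.
  return c ? 1 : x;
}
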
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 479e345..e3370d3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5722,7 +5722,7 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
}
// Add additional cost for the extends that would need to be inserted.
- return Cost + 4;
+ return Cost + 2;
}
InstructionCost
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
new file mode 100644
index 0000000..30a1f05
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -0,0 +1,73 @@
+//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to add latency to
+/// barrier edges between ATOMIC_FENCE instructions and preceding
+/// memory accesses potentially affected by the fence.
+/// This encourages the scheduling of more instructions before
+/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
+/// introduce wait counting or indicate an impending S_BARRIER
+/// wait. Having more instructions in-flight across these
+/// constructs improves latency hiding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUBarrierLatency.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+using namespace llvm;
+
+namespace {
+
+class BarrierLatency : public ScheduleDAGMutation {
+public:
+ BarrierLatency() = default;
+ void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
+ constexpr unsigned SyntheticLatency = 2000;
+ for (SUnit &SU : DAG->SUnits) {
+ const MachineInstr *MI = SU.getInstr();
+ if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
+ continue;
+
+ // Update latency on barrier edges of ATOMIC_FENCE.
+ // We don't consider the scope of the fence or type of instruction
+ // involved in the barrier edge.
+ for (SDep &PredDep : SU.Preds) {
+ if (!PredDep.isBarrier())
+ continue;
+ SUnit *PredSU = PredDep.getSUnit();
+ MachineInstr *MI = PredSU->getInstr();
+ // Only consider memory loads
+ if (!MI->mayLoad() || MI->mayStore())
+ continue;
+ SDep ForwardD = PredDep;
+ ForwardD.setSUnit(&SU);
+ for (SDep &SuccDep : PredSU->Succs) {
+ if (SuccDep == ForwardD) {
+ SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency);
+ break;
+ }
+ }
+ PredDep.setLatency(PredDep.getLatency() + SyntheticLatency);
+ PredSU->setDepthDirty();
+ SU.setDepthDirty();
+ }
+ }
+}
+
+} // end namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUBarrierLatencyDAGMutation() {
+ return std::make_unique<BarrierLatency>();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
new file mode 100644
index 0000000..c23f0b9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.h
@@ -0,0 +1,21 @@
+//===- AMDGPUBarrierLatency.h - AMDGPU Barrier Latency ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 12915c73..97c2c9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3446,10 +3446,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
: 0); // swz
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+ // Don't set the offset value here because the pointer points to the base of
+ // the buffer.
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
- LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
+
MachinePointerInfo StorePtrI = LoadPtrI;
- StorePtrI.V = nullptr;
+ LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+ AMDGPUAS::BUFFER_RESOURCE));
+ LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
@@ -3627,13 +3631,17 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
if (isSGPR(Addr))
MIB.addReg(VOffset);
- MIB.add(MI.getOperand(4)) // offset
- .add(MI.getOperand(5)); // cpol
+ MIB.add(MI.getOperand(4)); // offset
+
+ unsigned Aux = MI.getOperand(5).getImm();
+ MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = MI.getOperand(4).getImm();
MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
+ AMDGPUAS::GLOBAL_ADDRESS));
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4958a20..996b55f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
+#include "AMDGPUBarrierLatency.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
@@ -639,6 +640,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
return DAG;
}
@@ -659,6 +661,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
return DAG;
}
@@ -1197,6 +1200,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
EnableVOPD)
DAG->addMutation(createVOPDPairingMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
+ DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
return DAG;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 13f727b68..a1e0e52 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -52,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUAsmPrinter.cpp
AMDGPUAtomicOptimizer.cpp
AMDGPUAttributor.cpp
+ AMDGPUBarrierLatency.cpp
AMDGPUCallLowering.cpp
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index ecc2824..b7a92a0 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -423,6 +423,9 @@ enum CPol {
// Volatile (used to preserve/signal operation volatility for buffer
// operations not a real instruction bit)
VOLATILE = 1 << 31,
+ // The set of "cache policy" bits used for compiler features that
+ // do not correspond to hardware features.
+ VIRTUAL_BITS = VOLATILE,
};
} // namespace CPol
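A small, self-contained sketch of the usage pattern behind VIRTUAL_BITS (the constants below are stand-ins; only the VOLATILE value mirrors the enum above, the rest is hypothetical): a compiler-only cache-policy bit is read to set MachineMemOperand flags and then masked out, so only real hardware bits reach the encoded instruction, as the SIISelLowering and AMDGPUInstructionSelector hunks do.

#include <cstdint>
#include <cstdio>

// Stand-in constants for AMDGPU::CPol; GLC here is just an illustrative
// hardware bit, VOLATILE matches the value defined in the enum above.
constexpr uint32_t CPOL_GLC = 1u << 0;
constexpr uint32_t CPOL_VOLATILE = 1u << 31;          // compiler-only marker
constexpr uint32_t CPOL_VIRTUAL_BITS = CPOL_VOLATILE; // all compiler-only bits

int main() {
  uint32_t Aux = CPOL_GLC | CPOL_VOLATILE;    // cpol immediate from the intrinsic
  bool Volatile = (Aux & CPOL_VOLATILE) != 0; // would set MOVolatile on the MMO
  uint32_t HwCPol = Aux & ~CPOL_VIRTUAL_BITS; // only hardware bits get encoded
  std::printf("volatile=%d hw_cpol=0x%x\n", Volatile ? 1 : 0, HwCPol);
  return 0;
}
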
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a2841c11..a757421 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1651,6 +1651,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
Info.ptrVal = CI.getArgOperand(1);
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
+ if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+ Info.flags |= MachineMemOperand::MOVolatile;
return true;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
@@ -11219,8 +11222,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
MachinePointerInfo StorePtrI = LoadPtrI;
LoadPtrI.V = PoisonValue::get(
- PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
- LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+ PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
+ LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
@@ -11307,7 +11310,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
Ops.push_back(Op.getOperand(5)); // Offset
- Ops.push_back(Op.getOperand(6)); // CPol
+
+ unsigned Aux = Op.getConstantOperandVal(6);
+ Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
+ MVT::i32)); // CPol
+
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 50447f4..2ff2d2f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4032,28 +4032,31 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
}
}
+/// Helper struct for the implementation of 3-address conversion to communicate
+/// updates made to instruction operands.
+struct SIInstrInfo::ThreeAddressUpdates {
+ /// Other instruction whose def is no longer used by the converted
+ /// instruction.
+ MachineInstr *RemoveMIUse = nullptr;
+};
+
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
MachineBasicBlock &MBB = *MI.getParent();
- unsigned Opc = MI.getOpcode();
+ ThreeAddressUpdates U;
+ MachineInstr *NewMI = convertToThreeAddressImpl(MI, U);
- // Handle MFMA.
- int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
- if (NewMFMAOpc != -1) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
- MIB.add(MI.getOperand(I));
- updateLiveVariables(LV, MI, *MIB);
+ if (NewMI) {
+ updateLiveVariables(LV, MI, *NewMI);
if (LIS) {
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+ LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
// SlotIndex of defs needs to be updated when converting to early-clobber
- MachineOperand &Def = MIB->getOperand(0);
+ MachineOperand &Def = NewMI->getOperand(0);
if (Def.isEarlyClobber() && Def.isReg() &&
LIS->hasInterval(Def.getReg())) {
- SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
- SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
+ SlotIndex OldIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(false);
+ SlotIndex NewIndex = LIS->getInstructionIndex(*NewMI).getRegSlot(true);
auto &LI = LIS->getInterval(Def.getReg());
auto UpdateDefIndex = [&](LiveRange &LR) {
auto *S = LR.find(OldIndex);
@@ -4068,6 +4071,58 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
UpdateDefIndex(SR);
}
}
+ }
+
+ if (U.RemoveMIUse) {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ // The only user is the instruction which will be killed.
+ Register DefReg = U.RemoveMIUse->getOperand(0).getReg();
+
+ if (MRI.hasOneNonDBGUse(DefReg)) {
+ // We cannot just remove the DefMI here, calling pass will crash.
+ U.RemoveMIUse->setDesc(get(AMDGPU::IMPLICIT_DEF));
+ U.RemoveMIUse->getOperand(0).setIsDead(true);
+ for (unsigned I = U.RemoveMIUse->getNumOperands() - 1; I != 0; --I)
+ U.RemoveMIUse->removeOperand(I);
+ if (LV)
+ LV->getVarInfo(DefReg).AliveBlocks.clear();
+ }
+
+ if (LIS) {
+ LiveInterval &DefLI = LIS->getInterval(DefReg);
+
+ // We cannot delete the original instruction here, so hack out the use
+ // in the original instruction with a dummy register so we can use
+ // shrinkToUses to deal with any multi-use edge cases. Other targets do
+ // not have the complexity of deleting a use to consider here.
+ Register DummyReg = MRI.cloneVirtualRegister(DefReg);
+ for (MachineOperand &MIOp : MI.uses()) {
+ if (MIOp.isReg() && MIOp.getReg() == DefReg) {
+ MIOp.setIsUndef(true);
+ MIOp.setReg(DummyReg);
+ }
+ }
+
+ LIS->shrinkToUses(&DefLI);
+ }
+ }
+
+ return NewMI;
+}
+
+MachineInstr *
+SIInstrInfo::convertToThreeAddressImpl(MachineInstr &MI,
+ ThreeAddressUpdates &U) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ unsigned Opc = MI.getOpcode();
+
+ // Handle MFMA.
+ int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
+ if (NewMFMAOpc != -1) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
return MIB;
}
@@ -4077,11 +4132,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.setMIFlags(MI.getFlags());
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
MIB->addOperand(MI.getOperand(I));
-
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
-
return MIB;
}
@@ -4152,39 +4202,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
MachineInstr *DefMI;
- const auto killDef = [&]() -> void {
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- // The only user is the instruction which will be killed.
- Register DefReg = DefMI->getOperand(0).getReg();
-
- if (MRI.hasOneNonDBGUse(DefReg)) {
- // We cannot just remove the DefMI here, calling pass will crash.
- DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
- DefMI->getOperand(0).setIsDead(true);
- for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
- DefMI->removeOperand(I);
- if (LV)
- LV->getVarInfo(DefReg).AliveBlocks.clear();
- }
-
- if (LIS) {
- LiveInterval &DefLI = LIS->getInterval(DefReg);
-
- // We cannot delete the original instruction here, so hack out the use
- // in the original instruction with a dummy register so we can use
- // shrinkToUses to deal with any multi-use edge cases. Other targets do
- // not have the complexity of deleting a use to consider here.
- Register DummyReg = MRI.cloneVirtualRegister(DefReg);
- for (MachineOperand &MIOp : MI.uses()) {
- if (MIOp.isReg() && MIOp.getReg() == DefReg) {
- MIOp.setIsUndef(true);
- MIOp.setReg(DummyReg);
- }
- }
-
- LIS->shrinkToUses(&DefLI);
- }
- };
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
@@ -4196,10 +4213,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.add(*Src1)
.addImm(Imm)
.setMIFlags(MI.getFlags());
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- killDef();
+ U.RemoveMIUse = DefMI;
return MIB;
}
}
@@ -4212,11 +4226,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.addImm(Imm)
.add(*Src2)
.setMIFlags(MI.getFlags());
- updateLiveVariables(LV, MI, *MIB);
-
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- killDef();
+ U.RemoveMIUse = DefMI;
return MIB;
}
}
@@ -4235,12 +4245,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.addImm(Imm)
.add(*Src2)
.setMIFlags(MI.getFlags());
- updateLiveVariables(LV, MI, *MIB);
-
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
- if (DefMI)
- killDef();
+ U.RemoveMIUse = DefMI;
return MIB;
}
}
@@ -4269,9 +4274,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
.setMIFlags(MI.getFlags());
if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
MIB.addImm(OpSel ? OpSel->getImm() : 0);
- updateLiveVariables(LV, MI, *MIB);
- if (LIS)
- LIS->ReplaceMachineInstrInMaps(MI, *MIB);
return MIB;
}
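
The SIInstrInfo change above follows a "collect updates, apply once" refactor: convertToThreeAddressImpl only builds the replacement instruction and records follow-up work in ThreeAddressUpdates, while the single wrapper updates LiveVariables/LiveIntervals and handles the now-dead definition. A generic, self-contained sketch of that shape (names and types here are illustrative, not the LLVM API):

#include <optional>

// Illustrative only: the impl reports side effects through a small struct
// instead of updating the analyses itself, so the wrapper does the
// bookkeeping exactly once on every path that builds a new instruction.
struct Updates {
  int RemovedUseId = -1; // instruction whose def became unused, if any
};

struct Analyses {
  void replaceInMaps(int OldId, int NewId) { (void)OldId; (void)NewId; }
  void shrinkToUses(int DefId) { (void)DefId; }
};

static std::optional<int> convertImpl(int InstId, Updates &U) {
  // ... build the three-address replacement and record side effects in U ...
  (void)U;
  return InstId + 1; // id of the newly built instruction
}

std::optional<int> convert(int InstId, Analyses *A) {
  Updates U;
  std::optional<int> NewId = convertImpl(InstId, U);
  if (NewId && A)
    A->replaceInMaps(InstId, *NewId); // liveness updates centralized here
  if (U.RemovedUseId != -1 && A)
    A->shrinkToUses(U.RemovedUseId);  // dead-def cleanup also centralized
  return NewId;
}
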
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index df27ec1..e1d7a07 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -88,6 +88,8 @@ private:
};
class SIInstrInfo final : public AMDGPUGenInstrInfo {
+ struct ThreeAddressUpdates;
+
private:
const SIRegisterInfo RI;
const GCNSubtarget &ST;
@@ -190,6 +192,9 @@ private:
bool resultDependsOnExec(const MachineInstr &MI) const;
+ MachineInstr *convertToThreeAddressImpl(MachineInstr &MI,
+ ThreeAddressUpdates &Updates) const;
+
protected:
/// If the specific machine instruction is a instruction that moves/copies
/// value from one register to another register return destination and source
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 74d4153..6f1feb1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2223,8 +2223,8 @@ def : GCNPat <
def : GCNPat <
(DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)),
- (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src,
- 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, (i64 0),
+ (V_PK_ADD_F32 !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), VReg_64:$src,
+ !or(SRCMODS.OP_SEL_1, SRCMODS.NEG, SRCMODS.NEG_HI), (i64 0),
0, 0, 0, 0, 0)
> {
let SubtargetPredicate = HasPackedFP32Ops;
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 362ef14..07264d9 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -277,6 +278,12 @@ public:
/// rmw operation, "std::nullopt" otherwise.
std::optional<SIMemOpInfo>
getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
+
+ /// \returns DMA to LDS info if \p MI is a direct-to/from-LDS load/store,
+ /// along with an indication of whether this is a load or store. If it is not
+ /// a direct-to-LDS operation, returns std::nullopt.
+ std::optional<SIMemOpInfo>
+ getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
};
class SICacheControl {
@@ -390,12 +397,6 @@ public:
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
- /// Inserts any necessary instructions before the barrier start instruction
- /// \p MI in order to support pairing of barriers and fences.
- virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
- return false;
- };
-
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
};
@@ -576,12 +577,8 @@ public:
bool IsCrossAddrSpaceOrdering, Position Pos,
AtomicOrdering Order, bool AtomicsOnly) const override;
- bool insertAcquire(MachineBasicBlock::iterator &MI,
- SIAtomicScope Scope,
- SIAtomicAddrSpace AddrSpace,
- Position Pos) const override;
-
- bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos) const override;
};
class SIGfx11CacheControl : public SIGfx10CacheControl {
@@ -703,6 +700,9 @@ private:
/// instructions are added/deleted or \p MI is modified, false otherwise.
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
+ /// Expands LDS DMA operation \p MI. Returns true if instructions are
+ /// added/deleted or \p MI is modified, false otherwise.
+ bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
public:
SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
@@ -832,6 +832,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
return SIAtomicAddrSpace::SCRATCH;
if (AS == AMDGPUAS::REGION_ADDRESS)
return SIAtomicAddrSpace::GDS;
+ if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
+ AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
+ return SIAtomicAddrSpace::GLOBAL;
return SIAtomicAddrSpace::OTHER;
}
@@ -987,6 +990,16 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
return constructFromMIWithMMO(MI);
}
+std::optional<SIMemOpInfo>
+SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
+ assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
+
+ if (!SIInstrInfo::isLDSDMA(*MI))
+ return std::nullopt;
+
+ return constructFromMIWithMMO(MI);
+}
+
SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
@@ -1099,7 +1112,7 @@ bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write insructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1429,7 +1442,7 @@ bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write insructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1733,7 +1746,7 @@ bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write insructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -1968,7 +1981,7 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write insructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2046,8 +2059,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// the WGP. Therefore need to wait for operations to complete to ensure
// they are visible to waves in the other CU as the L0 is per CU.
// Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
- if (!ST.isCuModeEnabled()) {
+ // which shares the same L0. Note that we still need to wait when
+ // performing a release in this mode to respect the transitivity of
+ // happens-before, e.g. other waves of the workgroup must be able to
+ // release the memory from another wave at a wider scope.
+ if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
VMCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2202,22 +2218,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return Changed;
}
-bool SIGfx10CacheControl::insertBarrierStart(
- MachineBasicBlock::iterator &MI) const {
- // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
- // mode. This is because a CU mode release fence does not emit any wait, which
- // is fine when only dealing with vmem, but isn't sufficient in the presence
- // of barriers which do not go through vmem.
- // GFX12.5 does not require this additional wait.
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
- return false;
-
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
- return true;
-}
-
bool SIGfx11CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {
@@ -2266,7 +2266,7 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
// Only handle load and store, not atomic read-modify-write insructions. The
// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2396,15 +2396,20 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
// In WGP mode the waves of a work-group can be executing on either CU
// of the WGP. Therefore need to wait for operations to complete to
// ensure they are visible to waves in the other CU as the L0 is per CU.
+ //
// Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
+ // which shares the same L0. Note that we still need to wait when
+ // performing a release in this mode to respect the transitivity of
+ // happens-before, e.g. other waves of the workgroup must be able to
+ // release the memory from another wave at a wider scope.
//
// GFX12.5:
// CU$ has two ports. To ensure operations are visible at the workgroup
// level, we need to ensure all operations in this port have completed
// so the other SIMDs in the WG can see them. There is no ordering
// guarantee between the ports.
- if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
+ isReleaseOrStronger(Order)) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2611,7 +2616,7 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
// Only handle load and store, not atomic read-modify-write instructions.
- assert(MI->mayLoad() ^ MI->mayStore());
+ assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly
@@ -2934,6 +2939,23 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
return Changed;
}
+bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoad() && MI->mayStore());
+
+ // The volatility or nontemporal-ness of the operation is a
+ // function of the global memory, not the LDS.
+ SIMemOp OpKind =
+ SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
+
+ // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
+ // stores. The operation is treated as a volatile/nontemporal store
+ // to its second argument.
+ return CC->enableVolatileAndOrNonTemporal(
+ MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
+ MOI.isNonTemporal(), MOI.isLastUse());
+}
+
bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
const MachineModuleInfo &MMI =
getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
@@ -2977,22 +2999,20 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
MI = II->getIterator();
}
- if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
- Changed |= CC->insertBarrierStart(MI);
- continue;
- }
-
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
- if (const auto &MOI = MOA.getLoadInfo(MI))
+ if (const auto &MOI = MOA.getLoadInfo(MI)) {
Changed |= expandLoad(*MOI, MI);
- else if (const auto &MOI = MOA.getStoreInfo(MI)) {
+ } else if (const auto &MOI = MOA.getStoreInfo(MI)) {
Changed |= expandStore(*MOI, MI);
- } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
+ } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
+ Changed |= expandLDSDMA(*MOI, MI);
+ } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
Changed |= expandAtomicFence(*MOI, MI);
- else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
+ } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
+ }
}
}
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 96ee69c..406f4c1 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -882,7 +882,7 @@ static bool producesFalseLanesZero(MachineInstr &MI,
continue;
// Skip the lr predicate reg
int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
- if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2)
+ if (PIdx != -1 && MO.getOperandNo() == PIdx + ARM::SUBOP_vpred_n_tp_reg)
continue;
// Check that this instruction will produce zeros in its false lanes:
diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index 5eeb4fe..413e844 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -534,7 +534,7 @@ bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
Register LR = LoopPhi->getOperand(0).getReg();
for (MachineInstr *MI : MVEInstrs) {
int Idx = findFirstVPTPredOperandIdx(*MI);
- MI->getOperand(Idx + 2).setReg(LR);
+ MI->getOperand(Idx + ARM::SUBOP_vpred_n_tp_reg).setReg(LR);
}
}
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index ba4b489..9b5fc9d 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -14,6 +14,7 @@
#include "BPF.h"
#include "BPFCORE.h"
#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
@@ -23,6 +24,7 @@
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -301,21 +303,59 @@ void BTFTypeStruct::completeType(BTFDebug &BDebug) {
BTFType.NameOff = BDebug.addString(STy->getName());
+ if (STy->getTag() == dwarf::DW_TAG_variant_part) {
+ // Variant parts might have a discriminator, which has its own memory
+ // location, and variants, which share the memory location afterwards. LLVM
+ // DI doesn't consider the discriminator an element and instead keeps it
+ // as a separate reference.
+ // To keep BTF simple, represent the structure as a union with the
+ // discriminator as the first element.
+ // The offsets inside variant types are already handled correctly in the
+ // DI.
+ const auto *DTy = STy->getDiscriminator();
+ if (DTy) {
+ struct BTF::BTFMember Discriminator;
+
+ Discriminator.NameOff = BDebug.addString(DTy->getName());
+ Discriminator.Offset = DTy->getOffsetInBits();
+ const auto *BaseTy = DTy->getBaseType();
+ Discriminator.Type = BDebug.getTypeId(BaseTy);
+
+ Members.push_back(Discriminator);
+ }
+ }
+
// Add struct/union members.
const DINodeArray Elements = STy->getElements();
for (const auto *Element : Elements) {
struct BTF::BTFMember BTFMember;
- const auto *DDTy = cast<DIDerivedType>(Element);
- BTFMember.NameOff = BDebug.addString(DDTy->getName());
- if (HasBitField) {
- uint8_t BitFieldSize = DDTy->isBitField() ? DDTy->getSizeInBits() : 0;
- BTFMember.Offset = BitFieldSize << 24 | DDTy->getOffsetInBits();
- } else {
- BTFMember.Offset = DDTy->getOffsetInBits();
+ switch (Element->getTag()) {
+ case dwarf::DW_TAG_member: {
+ const auto *DDTy = cast<DIDerivedType>(Element);
+
+ BTFMember.NameOff = BDebug.addString(DDTy->getName());
+ if (HasBitField) {
+ uint8_t BitFieldSize = DDTy->isBitField() ? DDTy->getSizeInBits() : 0;
+ BTFMember.Offset = BitFieldSize << 24 | DDTy->getOffsetInBits();
+ } else {
+ BTFMember.Offset = DDTy->getOffsetInBits();
+ }
+ const auto *BaseTy = tryRemoveAtomicType(DDTy->getBaseType());
+ BTFMember.Type = BDebug.getTypeId(BaseTy);
+ break;
+ }
+ case dwarf::DW_TAG_variant_part: {
+ const auto *DCTy = dyn_cast<DICompositeType>(Element);
+
+ BTFMember.NameOff = BDebug.addString(DCTy->getName());
+ BTFMember.Offset = DCTy->getOffsetInBits();
+ BTFMember.Type = BDebug.getTypeId(DCTy);
+ break;
+ }
+ default:
+ llvm_unreachable("Unexpected DI tag of a struct/union element");
}
- const auto *BaseTy = tryRemoveAtomicType(DDTy->getBaseType());
- BTFMember.Type = BDebug.getTypeId(BaseTy);
Members.push_back(BTFMember);
}
}
@@ -672,16 +712,28 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
uint32_t &TypeId) {
const DINodeArray Elements = CTy->getElements();
uint32_t VLen = Elements.size();
+ // Variant parts might have a discriminator. LLVM DI doesn't consider it as
+ // an element and instead keeps it as a separate reference. But we represent
+ // it as an element in BTF.
+ if (CTy->getTag() == dwarf::DW_TAG_variant_part) {
+ const auto *DTy = CTy->getDiscriminator();
+ if (DTy) {
+ visitTypeEntry(DTy);
+ VLen++;
+ }
+ }
if (VLen > BTF::MAX_VLEN)
return;
// Check whether we have any bitfield members or not
bool HasBitField = false;
for (const auto *Element : Elements) {
- auto E = cast<DIDerivedType>(Element);
- if (E->isBitField()) {
- HasBitField = true;
- break;
+ if (Element->getTag() == dwarf::DW_TAG_member) {
+ auto E = cast<DIDerivedType>(Element);
+ if (E->isBitField()) {
+ HasBitField = true;
+ break;
+ }
}
}
@@ -696,9 +748,22 @@ void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
// Visit all struct members.
int FieldNo = 0;
for (const auto *Element : Elements) {
- const auto Elem = cast<DIDerivedType>(Element);
- visitTypeEntry(Elem);
- processDeclAnnotations(Elem->getAnnotations(), TypeId, FieldNo);
+ switch (Element->getTag()) {
+ case dwarf::DW_TAG_member: {
+ const auto Elem = cast<DIDerivedType>(Element);
+ visitTypeEntry(Elem);
+ processDeclAnnotations(Elem->getAnnotations(), TypeId, FieldNo);
+ break;
+ }
+ case dwarf::DW_TAG_variant_part: {
+ const auto Elem = cast<DICompositeType>(Element);
+ visitTypeEntry(Elem);
+ processDeclAnnotations(Elem->getAnnotations(), TypeId, FieldNo);
+ break;
+ }
+ default:
+ llvm_unreachable("Unexpected DI tag of a struct/union element");
+ }
FieldNo++;
}
}
@@ -781,16 +846,25 @@ void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion,
void BTFDebug::visitCompositeType(const DICompositeType *CTy,
uint32_t &TypeId) {
auto Tag = CTy->getTag();
- if (Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) {
+ switch (Tag) {
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_variant_part:
// Handle forward declaration differently as it does not have members.
if (CTy->isForwardDecl())
visitFwdDeclType(CTy, Tag == dwarf::DW_TAG_union_type, TypeId);
else
visitStructType(CTy, Tag == dwarf::DW_TAG_structure_type, TypeId);
- } else if (Tag == dwarf::DW_TAG_array_type)
+ break;
+ case dwarf::DW_TAG_array_type:
visitArrayType(CTy, TypeId);
- else if (Tag == dwarf::DW_TAG_enumeration_type)
+ break;
+ case dwarf::DW_TAG_enumeration_type:
visitEnumType(CTy, TypeId);
+ break;
+ default:
+ llvm_unreachable("Unexpected DI tag of a composite type");
+ }
}
bool BTFDebug::IsForwardDeclCandidate(const DIType *Base) {
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index d758260..1a5f096 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_target(HexagonCodeGen
HexagonOptAddrMode.cpp
HexagonOptimizeSZextends.cpp
HexagonPeephole.cpp
+ HexagonQFPOptimizer.cpp
HexagonRDFOpt.cpp
HexagonRegisterInfo.cpp
HexagonSelectionDAGInfo.cpp
diff --git a/llvm/lib/Target/Hexagon/Hexagon.h b/llvm/lib/Target/Hexagon/Hexagon.h
index 109aba5..422ab20 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.h
+++ b/llvm/lib/Target/Hexagon/Hexagon.h
@@ -67,6 +67,8 @@ void initializeHexagonPeepholePass(PassRegistry &);
void initializeHexagonSplitConst32AndConst64Pass(PassRegistry &);
void initializeHexagonVectorPrintPass(PassRegistry &);
+void initializeHexagonQFPOptimizerPass(PassRegistry &);
+
Pass *createHexagonLoopIdiomPass();
Pass *createHexagonVectorLoopCarriedReuseLegacyPass();
@@ -112,6 +114,7 @@ FunctionPass *createHexagonVectorCombineLegacyPass();
FunctionPass *createHexagonVectorPrint();
FunctionPass *createHexagonVExtract();
FunctionPass *createHexagonExpandCondsets();
+FunctionPass *createHexagonQFPOptimizer();
} // end namespace llvm;
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 4ddbe7a..ff876f6 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -920,6 +920,10 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
// successors have been processed.
RegisterSet BlockDefs, InsDefs;
for (MachineInstr &MI : *B) {
+ // Stop if the map size is too large.
+ if (IFMap.size() >= MaxIFMSize)
+ break;
+
InsDefs.clear();
getInstrDefs(&MI, InsDefs);
// Leave those alone. They are more transparent than "insert".
@@ -942,8 +946,8 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
findRecordInsertForms(VR, AVs);
// Stop if the map size is too large.
- if (IFMap.size() > MaxIFMSize)
- return;
+ if (IFMap.size() >= MaxIFMSize)
+ break;
}
}
diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
new file mode 100644
index 0000000..479ac90
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
@@ -0,0 +1,334 @@
+//===----- HexagonQFPOptimizer.cpp - Qualcomm-FP to IEEE-FP conversions
+// optimizer ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Basic infrastructure for optimizing intermediate conversion instructions
+// generated while performing vector floating point operations.
+// Currently run at the start of code generation for Hexagon, it cleans up
+// redundant conversion instructions and replaces the uses of a conversion
+// with the appropriate machine operand. Liveness is preserved after this pass.
+//
+// @note: This pass does not eliminate the redundant conversion
+// instructions themselves. It only replaces the uses of a conversion
+// instruction with the corresponding QFP instruction; removing the
+// now-dead conversion instructions is left to the dead instruction
+// elimination pass.
+//
+// Brief overview of how this QFP optimizer works:
+// The pass iterates over each instruction and checks whether it belongs
+// to the Hexagon floating-point HVX arithmetic category (add, sub, mul).
+// It then finds the unique definition of each machine operand of the
+// instruction.
+//
+// Example:
+// MachineInstruction *MI be the HVX vadd instruction
+// MI -> $v0 = V6_vadd_sf $v1, $v2
+// MachineOperand *DefMI1 = MRI->getVRegDef(MI->getOperand(1).getReg());
+// MachineOperand *DefMI2 = MRI->getVRegDef(MI->getOperand(2).getReg());
+//
+// In the above example, DefMI1 and DefMI2 give the unique definitions
+// corresponding to the operands ($v1 and $v2, respectively) of instruction MI.
+//
+// If neither definition is a conversion instruction (V6_vconv_sf_qf32,
+// V6_vconv_hf_qf16), the pass skips optimizing the current instruction and
+// moves on to the next one.
+//
+// If one of the definitions is a conversion instruction, the pass replaces
+// the arithmetic instruction with its corresponding mix variant.
+// In the above example, if $v1 is defined by a conversion instruction:
+// DefMI1 -> $v1 = V6_vconv_sf_qf32 $v3
+// After Transformation:
+// MI -> $v0 = V6_vadd_qf32_mix $v3, $v2 ($v1 is replaced with $v3)
+//
+// If both definitions are conversion instructions, the instruction is
+// replaced with its qf variant.
+// In the above example, if $v1 and $v2 are defined by conversion instructions:
+// DefMI1 -> $v1 = V6_vconv_sf_qf32 $v3
+// DefMI2 -> $v2 = V6_vconv_sf_qf32 $v4
+// After Transformation:
+// MI -> $v0 = V6_vadd_qf32 $v3, $v4 ($v1 is replaced with $v3, $v2 is replaced
+// with $v4)
+//
+// Currently, this pass does not handle the case where the definitions are
+// PHI instructions.
+//
+//===----------------------------------------------------------------------===//
+#include <unordered_set>
+#define HEXAGON_QFP_OPTIMIZER "QFP optimizer pass"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <vector>
+
+#define DEBUG_TYPE "hexagon-qfp-optimizer"
+
+using namespace llvm;
+
+cl::opt<bool>
+ DisableQFOptimizer("disable-qfp-opt", cl::init(false),
+ cl::desc("Disable optimization of Qfloat operations."));
+
+namespace {
+const std::map<unsigned short, unsigned short> QFPInstMap{
+ {Hexagon::V6_vadd_hf, Hexagon::V6_vadd_qf16_mix},
+ {Hexagon::V6_vadd_qf16_mix, Hexagon::V6_vadd_qf16},
+ {Hexagon::V6_vadd_sf, Hexagon::V6_vadd_qf32_mix},
+ {Hexagon::V6_vadd_qf32_mix, Hexagon::V6_vadd_qf32},
+ {Hexagon::V6_vsub_hf, Hexagon::V6_vsub_qf16_mix},
+ {Hexagon::V6_vsub_qf16_mix, Hexagon::V6_vsub_qf16},
+ {Hexagon::V6_vsub_sf, Hexagon::V6_vsub_qf32_mix},
+ {Hexagon::V6_vsub_qf32_mix, Hexagon::V6_vsub_qf32},
+ {Hexagon::V6_vmpy_qf16_hf, Hexagon::V6_vmpy_qf16_mix_hf},
+ {Hexagon::V6_vmpy_qf16_mix_hf, Hexagon::V6_vmpy_qf16},
+ {Hexagon::V6_vmpy_qf32_hf, Hexagon::V6_vmpy_qf32_mix_hf},
+ {Hexagon::V6_vmpy_qf32_mix_hf, Hexagon::V6_vmpy_qf32_qf16},
+ {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}};
+} // namespace
+
+namespace llvm {
+
+FunctionPass *createHexagonQFPOptimizer();
+void initializeHexagonQFPOptimizerPass(PassRegistry &);
+
+} // namespace llvm
+
+namespace {
+
+struct HexagonQFPOptimizer : public MachineFunctionPass {
+public:
+ static char ID;
+
+ HexagonQFPOptimizer() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB);
+
+ StringRef getPassName() const override { return HEXAGON_QFP_OPTIMIZER; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ const HexagonSubtarget *HST = nullptr;
+ const HexagonInstrInfo *HII = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+};
+
+char HexagonQFPOptimizer::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(HexagonQFPOptimizer, "hexagon-qfp-optimizer",
+ HEXAGON_QFP_OPTIMIZER, false, false)
+
+FunctionPass *llvm::createHexagonQFPOptimizer() {
+ return new HexagonQFPOptimizer();
+}
+
+bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
+
+  // Early exit if the instruction has too few operands (QFP ops need two
+  // sources and one destination) or has no mapping to a qf/mix variant.
+ if (MI->getNumOperands() < 3)
+ return false;
+ auto It = QFPInstMap.find(MI->getOpcode());
+ if (It == QFPInstMap.end())
+ return false;
+ unsigned short InstTy = It->second;
+
+ unsigned Op0F = 0;
+ unsigned Op1F = 0;
+ // Get the reaching defs of MI, DefMI1 and DefMI2
+ MachineInstr *DefMI1 = nullptr;
+ MachineInstr *DefMI2 = nullptr;
+
+ if (MI->getOperand(1).isReg())
+ DefMI1 = MRI->getVRegDef(MI->getOperand(1).getReg());
+ if (MI->getOperand(2).isReg())
+ DefMI2 = MRI->getVRegDef(MI->getOperand(2).getReg());
+ if (!DefMI1 || !DefMI2)
+ return false;
+
+ MachineOperand &Res = MI->getOperand(0);
+ MachineInstr *Inst1 = nullptr;
+ MachineInstr *Inst2 = nullptr;
+ LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI1->dump();
+ DefMI2->dump());
+
+  // Get the instructions defining the sources of DefMI1 and DefMI2.
+ if (DefMI1->getNumOperands() > 1 && DefMI1->getOperand(1).isReg() &&
+ DefMI1->getOperand(1).getReg().isVirtual())
+ Inst1 = MRI->getVRegDef(DefMI1->getOperand(1).getReg());
+
+ if (DefMI2->getNumOperands() > 1 && DefMI2->getOperand(1).isReg() &&
+ DefMI2->getOperand(1).getReg().isVirtual())
+ Inst2 = MRI->getVRegDef(DefMI2->getOperand(1).getReg());
+
+ unsigned Def1OP = DefMI1->getOpcode();
+ unsigned Def2OP = DefMI2->getOpcode();
+
+ MachineInstrBuilder MIB;
+ // Case 1: Both reaching defs of MI are qf to sf/hf conversions
+ if ((Def1OP == Hexagon::V6_vconv_sf_qf32 &&
+ Def2OP == Hexagon::V6_vconv_sf_qf32) ||
+ (Def1OP == Hexagon::V6_vconv_hf_qf16 &&
+ Def2OP == Hexagon::V6_vconv_hf_qf16)) {
+
+    // Bail out if either source is ultimately defined by an instruction that
+    // produces an HVX vector-pair (W) register.
+ if ((Inst1 && Inst1->getNumOperands() > 0 && Inst1->getOperand(0).isReg() &&
+ MRI->getRegClass(Inst1->getOperand(0).getReg()) ==
+ &Hexagon::HvxWRRegClass) ||
+ (Inst2 && Inst2->getNumOperands() > 0 && Inst2->getOperand(0).isReg() &&
+ MRI->getRegClass(Inst2->getOperand(0).getReg()) ==
+ &Hexagon::HvxWRRegClass))
+ return false;
+
+    // Capture the kill state of the conversions' source operands and clear it,
+    // as the new instruction becomes another user of those registers.
+ MachineOperand &Src1 = DefMI1->getOperand(1);
+ MachineOperand &Src2 = DefMI2->getOperand(1);
+
+ Op0F = getKillRegState(Src1.isKill());
+ Src1.setIsKill(false);
+
+ Op1F = getKillRegState(Src2.isKill());
+ Src2.setIsKill(false);
+
+    // Both operands are qf values, so advance InstTy one more step in the map
+    // to the full qf variant (e.g. V6_vadd_qf32_mix -> V6_vadd_qf32).
+    // V6_vmpy_qf32_sf is excluded since it already maps directly to
+    // V6_vmpy_qf32.
+    if (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf) {
+      auto InnerIt = QFPInstMap.find(InstTy);
+      if (InnerIt == QFPInstMap.end())
+        return false;
+      InstTy = InnerIt->second;
+    }
+
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+ .addReg(Src1.getReg(), Op0F, Src1.getSubReg())
+ .addReg(Src2.getReg(), Op1F, Src2.getSubReg());
+ LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
+ return true;
+
+  // Case 2: Only the left operand is a conversion to sf/hf
+ } else if (((Def1OP == Hexagon::V6_vconv_sf_qf32 &&
+ Def2OP != Hexagon::V6_vconv_sf_qf32) ||
+ (Def1OP == Hexagon::V6_vconv_hf_qf16 &&
+ Def2OP != Hexagon::V6_vconv_hf_qf16)) &&
+ !DefMI2->isPHI() &&
+ (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) {
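+    // Use the "mix" variant: the conversion's qf source becomes the first
+    // operand and MI's second operand is used unchanged.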
+
+ if (Inst1 && MRI->getRegClass(Inst1->getOperand(0).getReg()) ==
+ &Hexagon::HvxWRRegClass)
+ return false;
+
+ MachineOperand &Src1 = DefMI1->getOperand(1);
+ MachineOperand &Src2 = MI->getOperand(2);
+
+ Op0F = getKillRegState(Src1.isKill());
+ Src1.setIsKill(false);
+ Op1F = getKillRegState(Src2.isKill());
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+ .addReg(Src1.getReg(), Op0F, Src1.getSubReg())
+ .addReg(Src2.getReg(), Op1F, Src2.getSubReg());
+ LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
+ return true;
+
+    // Case 3: Only the right operand is a conversion to sf/hf
+ } else if (((Def1OP != Hexagon::V6_vconv_sf_qf32 &&
+ Def2OP == Hexagon::V6_vconv_sf_qf32) ||
+ (Def1OP != Hexagon::V6_vconv_hf_qf16 &&
+ Def2OP == Hexagon::V6_vconv_hf_qf16)) &&
+ !DefMI1->isPHI() &&
+ (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) {
+    // The second operand of the original instruction is the converted one,
+    // but in the "mix" instructions the qf operand must come first, so the
+    // operands have to be swapped.
+
+    // Caveat: vsub is not commutative, so its operands cannot be swapped.
+ if (InstTy == Hexagon::V6_vsub_qf16_mix ||
+ InstTy == Hexagon::V6_vsub_qf32_mix)
+ return false;
+
+ if (Inst2 && MRI->getRegClass(Inst2->getOperand(0).getReg()) ==
+ &Hexagon::HvxWRRegClass)
+ return false;
+
+ MachineOperand &Src1 = MI->getOperand(1);
+ MachineOperand &Src2 = DefMI2->getOperand(1);
+
+ Op1F = getKillRegState(Src2.isKill());
+ Src2.setIsKill(false);
+ Op0F = getKillRegState(Src1.isKill());
+ MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+ .addReg(Src2.getReg(), Op1F,
+ Src2.getSubReg()) // Notice the operands are flipped.
+ .addReg(Src1.getReg(), Op0F, Src1.getSubReg());
+ LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
+ return true;
+ }
+
+ return false;
+}
+
+bool HexagonQFPOptimizer::runOnMachineFunction(MachineFunction &MF) {
+
+ bool Changed = false;
+
+ if (DisableQFOptimizer)
+ return Changed;
+
+ HST = &MF.getSubtarget<HexagonSubtarget>();
+ if (!HST->useHVXV68Ops() || !HST->usePackets() ||
+ skipFunction(MF.getFunction()))
+ return false;
+ HII = HST->getInstrInfo();
+ MRI = &MF.getRegInfo();
+
+ MachineFunction::iterator MBBI = MF.begin();
+  LLVM_DEBUG(dbgs() << "\n=== Running QFP Optimizer pass on " << MF.getName()
+                    << ": optimize intermediate conversions ===\n");
+ while (MBBI != MF.end()) {
+ MachineBasicBlock *MBB = &*MBBI;
+ MachineBasicBlock::iterator MII = MBBI->instr_begin();
+ while (MII != MBBI->instr_end()) {
+ MachineInstr *MI = &*MII;
+ ++MII; // As MI might be removed.
+
+ if (QFPInstMap.count(MI->getOpcode()) &&
+ MI->getOpcode() != Hexagon::V6_vconv_sf_qf32 &&
+ MI->getOpcode() != Hexagon::V6_vconv_hf_qf16) {
+        LLVM_DEBUG(dbgs() << "\n### Analyzing for removal: "; MI->dump());
+ if (optimizeQfp(MI, MBB)) {
+ MI->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "\t....Removing....");
+ Changed = true;
+ }
+ }
+ }
+ ++MBBI;
+ }
+ return Changed;
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index f5d8b69..d9824a31 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -220,6 +220,7 @@ LLVMInitializeHexagonTarget() {
initializeHexagonPeepholePass(PR);
initializeHexagonSplitConst32AndConst64Pass(PR);
initializeHexagonVectorPrintPass(PR);
+ initializeHexagonQFPOptimizerPass(PR);
}
HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
@@ -386,6 +387,7 @@ bool HexagonPassConfig::addInstSelector() {
addPass(createHexagonGenInsert());
if (EnableEarlyIf)
addPass(createHexagonEarlyIfConversion());
+ addPass(createHexagonQFPOptimizer());
}
return false;
diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
index ab93bba..b00589a 100644
--- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
+++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
@@ -68,7 +68,7 @@ const llvm::StringRef RISCVSEWInstrument::DESC_NAME = "RISCV-SEW";
bool RISCVSEWInstrument::isDataValid(llvm::StringRef Data) {
  // Return true only if Data is one of the valid SEW strings
return StringSwitch<bool>(Data)
- .Cases("E8", "E16", "E32", "E64", true)
+ .Cases({"E8", "E16", "E32", "E64"}, true)
.Default(false);
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 169465e..a77d765 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12649,10 +12649,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, LoVT, Lo);
Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, HiVT, Hi);
// Reassemble the low and high pieces reversed.
- // FIXME: This is a CONCAT_VECTORS.
- SDValue Res = DAG.getInsertSubvector(DL, DAG.getUNDEF(VecVT), Hi, 0);
- return DAG.getInsertSubvector(DL, Res, Lo,
- LoVT.getVectorMinNumElements());
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Hi, Lo);
}
// Just promote the int type to i16 which will double the LMUL.
@@ -24047,18 +24044,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
}
- std::pair<Register, const TargetRegisterClass *> Res =
- TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
-
- // If we picked one of the Zfinx register classes, remap it to the GPR class.
- // FIXME: When Zfinx is supported in CodeGen this will need to take the
- // Subtarget into account.
- if (Res.second == &RISCV::GPRF16RegClass ||
- Res.second == &RISCV::GPRF32RegClass ||
- Res.second == &RISCV::GPRPairRegClass)
- return std::make_pair(Res.first, &RISCV::GPRRegClass);
-
- return Res;
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
InlineAsm::ConstraintCode
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 66717b9..7c89686 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1511,16 +1511,16 @@ def GIShiftMask32 :
GIComplexOperandMatcher<s64, "selectShiftMask32">,
GIComplexPatternEquiv<shiftMask32>;
-class shiftop<SDPatternOperator operator>
- : PatFrag<(ops node:$val, node:$count),
- (operator node:$val, (XLenVT (shiftMaskXLen node:$count)))>;
-class shiftopw<SDPatternOperator operator>
- : PatFrag<(ops node:$val, node:$count),
- (operator node:$val, (i64 (shiftMask32 node:$count)))>;
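+// Select a shift-like operation whose amount operand is matched by the
+// shiftMaskXLen / shiftMask32 complex patterns (which strip redundant masking
+// of the shift amount).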
+class PatGprShiftMaskXLen<SDPatternOperator OpNode, RVInst Inst>
+ : Pat<(OpNode GPR:$rs1, shiftMaskXLen:$rs2),
+ (Inst GPR:$rs1, shiftMaskXLen:$rs2)>;
+class PatGprShiftMask32<SDPatternOperator OpNode, RVInst Inst>
+ : Pat<(OpNode GPR:$rs1, shiftMask32:$rs2),
+ (Inst GPR:$rs1, shiftMask32:$rs2)>;
-def : PatGprGpr<shiftop<shl>, SLL>;
-def : PatGprGpr<shiftop<srl>, SRL>;
-def : PatGprGpr<shiftop<sra>, SRA>;
+def : PatGprShiftMaskXLen<shl, SLL>;
+def : PatGprShiftMaskXLen<srl, SRL>;
+def : PatGprShiftMaskXLen<sra, SRA>;
// This is a special case of the ADD instruction used to facilitate the use of a
// fourth operand to emit a relocation on a symbol relating to this instruction.
@@ -2203,9 +2203,9 @@ def : Pat<(sra (sext_inreg GPR:$rs1, i32), uimm5:$shamt),
def : Pat<(i64 (sra (shl GPR:$rs1, (i64 32)), uimm6gt32:$shamt)),
(SRAIW GPR:$rs1, (ImmSub32 uimm6gt32:$shamt))>;
-def : PatGprGpr<shiftopw<riscv_sllw>, SLLW>;
-def : PatGprGpr<shiftopw<riscv_srlw>, SRLW>;
-def : PatGprGpr<shiftopw<riscv_sraw>, SRAW>;
+def : PatGprShiftMask32<riscv_sllw, SLLW>;
+def : PatGprShiftMask32<riscv_srlw, SRLW>;
+def : PatGprShiftMask32<riscv_sraw, SRAW>;
// Select W instructions if only the lower 32 bits of the result are used.
def : PatGprGpr<binop_allwusers<add>, ADDW>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 57fbaa0..62b7bcd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -506,8 +506,8 @@ def : Pat<(XLenVT (xor GPR:$rs1, invLogicImm:$rs2)), (XNOR GPR:$rs1, invLogicImm
} // Predicates = [HasStdExtZbbOrZbkb]
let Predicates = [HasStdExtZbbOrZbkb] in {
-def : PatGprGpr<shiftop<rotl>, ROL>;
-def : PatGprGpr<shiftop<rotr>, ROR>;
+def : PatGprShiftMaskXLen<rotl, ROL>;
+def : PatGprShiftMaskXLen<rotr, ROR>;
def : PatGprImm<rotr, RORI, uimmlog2xlen>;
// There's no encoding for roli in the 'B' extension as it can be
@@ -517,29 +517,29 @@ def : Pat<(XLenVT (rotl GPR:$rs1, uimmlog2xlen:$shamt)),
} // Predicates = [HasStdExtZbbOrZbkb]
let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in {
-def : PatGprGpr<shiftopw<riscv_rolw>, ROLW>;
-def : PatGprGpr<shiftopw<riscv_rorw>, RORW>;
+def : PatGprShiftMask32<riscv_rolw, ROLW>;
+def : PatGprShiftMask32<riscv_rorw, RORW>;
def : PatGprImm<riscv_rorw, RORIW, uimm5>;
def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2),
(RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>;
} // Predicates = [HasStdExtZbbOrZbkb, IsRV64]
let Predicates = [HasStdExtZbs] in {
-def : Pat<(XLenVT (and (not (shiftop<shl> 1, (XLenVT GPR:$rs2))), GPR:$rs1)),
- (BCLR GPR:$rs1, GPR:$rs2)>;
-def : Pat<(XLenVT (and (rotl -2, (XLenVT GPR:$rs2)), GPR:$rs1)),
- (BCLR GPR:$rs1, GPR:$rs2)>;
-def : Pat<(XLenVT (or (shiftop<shl> 1, (XLenVT GPR:$rs2)), GPR:$rs1)),
- (BSET GPR:$rs1, GPR:$rs2)>;
-def : Pat<(XLenVT (xor (shiftop<shl> 1, (XLenVT GPR:$rs2)), GPR:$rs1)),
- (BINV GPR:$rs1, GPR:$rs2)>;
-def : Pat<(XLenVT (and (shiftop<srl> GPR:$rs1, (XLenVT GPR:$rs2)), 1)),
- (BEXT GPR:$rs1, GPR:$rs2)>;
-
-def : Pat<(XLenVT (shiftop<shl> 1, (XLenVT GPR:$rs2))),
- (BSET (XLenVT X0), GPR:$rs2)>;
-def : Pat<(XLenVT (not (shiftop<shl> -1, (XLenVT GPR:$rs2)))),
- (ADDI (XLenVT (BSET (XLenVT X0), GPR:$rs2)), -1)>;
+def : Pat<(XLenVT (and (not (shl 1, shiftMaskXLen:$rs2)), GPR:$rs1)),
+ (BCLR GPR:$rs1, shiftMaskXLen:$rs2)>;
+def : Pat<(XLenVT (and (rotl -2, shiftMaskXLen:$rs2), GPR:$rs1)),
+ (BCLR GPR:$rs1, shiftMaskXLen:$rs2)>;
+def : Pat<(XLenVT (or (shl 1, shiftMaskXLen:$rs2), GPR:$rs1)),
+ (BSET GPR:$rs1, shiftMaskXLen:$rs2)>;
+def : Pat<(XLenVT (xor (shl 1, shiftMaskXLen:$rs2), GPR:$rs1)),
+ (BINV GPR:$rs1, shiftMaskXLen:$rs2)>;
+def : Pat<(XLenVT (and (srl GPR:$rs1, shiftMaskXLen:$rs2), 1)),
+ (BEXT GPR:$rs1, shiftMaskXLen:$rs2)>;
+
+def : Pat<(XLenVT (shl 1, shiftMaskXLen:$rs2)),
+ (BSET (XLenVT X0), shiftMaskXLen:$rs2)>;
+def : Pat<(XLenVT (not (shl -1, shiftMaskXLen:$rs2))),
+ (ADDI (XLenVT (BSET (XLenVT X0), shiftMaskXLen:$rs2)), -1)>;
def : Pat<(XLenVT (and GPR:$rs1, BCLRMask:$mask)),
(BCLRI GPR:$rs1, BCLRMask:$mask)>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index dbe8e18..d91923b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -507,7 +507,9 @@ static Register buildLoadInst(SPIRVType *BaseType, Register PtrRegister,
static Register buildBuiltinVariableLoad(
MachineIRBuilder &MIRBuilder, SPIRVType *VariableType,
SPIRVGlobalRegistry *GR, SPIRV::BuiltIn::BuiltIn BuiltinValue, LLT LLType,
- Register Reg = Register(0), bool isConst = true, bool hasLinkageTy = true) {
+ Register Reg = Register(0), bool isConst = true,
+ const std::optional<SPIRV::LinkageType::LinkageType> &LinkageTy = {
+ SPIRV::LinkageType::Import}) {
Register NewRegister =
MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::pIDRegClass);
MIRBuilder.getMRI()->setType(
@@ -521,9 +523,8 @@ static Register buildBuiltinVariableLoad(
// Set up the global OpVariable with the necessary builtin decorations.
Register Variable = GR->buildGlobalVariable(
NewRegister, PtrType, getLinkStringForBuiltIn(BuiltinValue), nullptr,
- SPIRV::StorageClass::Input, nullptr, /* isConst= */ isConst,
- /* HasLinkageTy */ hasLinkageTy, SPIRV::LinkageType::Import, MIRBuilder,
- false);
+ SPIRV::StorageClass::Input, nullptr, /* isConst= */ isConst, LinkageTy,
+ MIRBuilder, false);
// Load the value from the global variable.
Register LoadedRegister =
@@ -1851,7 +1852,7 @@ static bool generateWaveInst(const SPIRV::IncomingCall *Call,
return buildBuiltinVariableLoad(
MIRBuilder, Call->ReturnType, GR, Value, LLType, Call->ReturnRegister,
- /* isConst= */ false, /* hasLinkageTy= */ false);
+ /* isConst= */ false, /* LinkageType= */ std::nullopt);
}
// We expect a builtin
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 1a7c02c..9e11c3a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -479,19 +479,9 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
.addImm(static_cast<uint32_t>(getExecutionModel(*ST, F)))
.addUse(FuncVReg);
addStringImm(F.getName(), MIB);
- } else if (F.getLinkage() != GlobalValue::InternalLinkage &&
- F.getLinkage() != GlobalValue::PrivateLinkage &&
- F.getVisibility() != GlobalValue::HiddenVisibility) {
- SPIRV::LinkageType::LinkageType LnkTy =
- F.isDeclaration()
- ? SPIRV::LinkageType::Import
- : (F.getLinkage() == GlobalValue::LinkOnceODRLinkage &&
- ST->canUseExtension(
- SPIRV::Extension::SPV_KHR_linkonce_odr)
- ? SPIRV::LinkageType::LinkOnceODR
- : SPIRV::LinkageType::Export);
+ } else if (const auto LnkTy = getSpirvLinkageTypeFor(*ST, F)) {
buildOpDecorate(FuncVReg, MIRBuilder, SPIRV::Decoration::LinkageAttributes,
- {static_cast<uint32_t>(LnkTy)}, F.getName());
+ {static_cast<uint32_t>(*LnkTy)}, F.getName());
}
// Handle function pointers decoration
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 6fd1c7e..6181abb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -712,9 +712,9 @@ SPIRVGlobalRegistry::buildConstantSampler(Register ResReg, unsigned AddrMode,
Register SPIRVGlobalRegistry::buildGlobalVariable(
Register ResVReg, SPIRVType *BaseType, StringRef Name,
const GlobalValue *GV, SPIRV::StorageClass::StorageClass Storage,
- const MachineInstr *Init, bool IsConst, bool HasLinkageTy,
- SPIRV::LinkageType::LinkageType LinkageType, MachineIRBuilder &MIRBuilder,
- bool IsInstSelector) {
+ const MachineInstr *Init, bool IsConst,
+ const std::optional<SPIRV::LinkageType::LinkageType> &LinkageType,
+ MachineIRBuilder &MIRBuilder, bool IsInstSelector) {
const GlobalVariable *GVar = nullptr;
if (GV) {
GVar = cast<const GlobalVariable>(GV);
@@ -792,9 +792,9 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
buildOpDecorate(Reg, MIRBuilder, SPIRV::Decoration::Alignment, {Alignment});
}
- if (HasLinkageTy)
+ if (LinkageType)
buildOpDecorate(Reg, MIRBuilder, SPIRV::Decoration::LinkageAttributes,
- {static_cast<uint32_t>(LinkageType)}, Name);
+ {static_cast<uint32_t>(*LinkageType)}, Name);
SPIRV::BuiltIn::BuiltIn BuiltInId;
if (getSpirvBuiltInIdByName(Name, BuiltInId))
@@ -821,8 +821,8 @@ Register SPIRVGlobalRegistry::getOrCreateGlobalVariableWithBinding(
MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass);
buildGlobalVariable(VarReg, VarType, Name, nullptr,
- getPointerStorageClass(VarType), nullptr, false, false,
- SPIRV::LinkageType::Import, MIRBuilder, false);
+ getPointerStorageClass(VarType), nullptr, false,
+ std::nullopt, MIRBuilder, false);
buildOpDecorate(VarReg, MIRBuilder, SPIRV::Decoration::DescriptorSet, {Set});
buildOpDecorate(VarReg, MIRBuilder, SPIRV::Decoration::Binding, {Binding});
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index a648def..c230e62 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -548,14 +548,12 @@ public:
MachineIRBuilder &MIRBuilder);
Register getOrCreateUndef(MachineInstr &I, SPIRVType *SpvType,
const SPIRVInstrInfo &TII);
- Register buildGlobalVariable(Register Reg, SPIRVType *BaseType,
- StringRef Name, const GlobalValue *GV,
- SPIRV::StorageClass::StorageClass Storage,
- const MachineInstr *Init, bool IsConst,
- bool HasLinkageTy,
- SPIRV::LinkageType::LinkageType LinkageType,
- MachineIRBuilder &MIRBuilder,
- bool IsInstSelector);
+ Register buildGlobalVariable(
+ Register Reg, SPIRVType *BaseType, StringRef Name, const GlobalValue *GV,
+ SPIRV::StorageClass::StorageClass Storage, const MachineInstr *Init,
+ bool IsConst,
+ const std::optional<SPIRV::LinkageType::LinkageType> &LinkageType,
+ MachineIRBuilder &MIRBuilder, bool IsInstSelector);
Register getOrCreateGlobalVariableWithBinding(const SPIRVType *VarType,
uint32_t Set, uint32_t Binding,
StringRef Name,
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index a0cff4d..5591d9f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -4350,15 +4350,8 @@ bool SPIRVInstructionSelector::selectGlobalValue(
if (hasInitializer(GlobalVar) && !Init)
return true;
- bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage() &&
- !GV->hasHiddenVisibility();
- SPIRV::LinkageType::LinkageType LnkType =
- GV->isDeclarationForLinker()
- ? SPIRV::LinkageType::Import
- : (GV->hasLinkOnceODRLinkage() &&
- STI.canUseExtension(SPIRV::Extension::SPV_KHR_linkonce_odr)
- ? SPIRV::LinkageType::LinkOnceODR
- : SPIRV::LinkageType::Export);
+ const std::optional<SPIRV::LinkageType::LinkageType> LnkType =
+ getSpirvLinkageTypeFor(STI, *GV);
const unsigned AddrSpace = GV->getAddressSpace();
SPIRV::StorageClass::StorageClass StorageClass =
@@ -4366,7 +4359,7 @@ bool SPIRVInstructionSelector::selectGlobalValue(
SPIRVType *ResType = GR.getOrCreateSPIRVPointerType(GVType, I, StorageClass);
Register Reg = GR.buildGlobalVariable(
ResVReg, ResType, GlobalIdent, GV, StorageClass, Init,
- GlobalVar->isConstant(), HasLnkTy, LnkType, MIRBuilder, true);
+ GlobalVar->isConstant(), LnkType, MIRBuilder, true);
return Reg.isValid();
}
@@ -4517,8 +4510,8 @@ bool SPIRVInstructionSelector::loadVec3BuiltinInputID(
// builtin variable.
Register Variable = GR.buildGlobalVariable(
NewRegister, PtrType, getLinkStringForBuiltIn(BuiltInValue), nullptr,
- SPIRV::StorageClass::Input, nullptr, true, false,
- SPIRV::LinkageType::Import, MIRBuilder, false);
+ SPIRV::StorageClass::Input, nullptr, true, std::nullopt, MIRBuilder,
+ false);
// Create new register for loading value.
MachineRegisterInfo *MRI = MIRBuilder.getMRI();
@@ -4570,8 +4563,8 @@ bool SPIRVInstructionSelector::loadBuiltinInputID(
// builtin variable.
Register Variable = GR.buildGlobalVariable(
NewRegister, PtrType, getLinkStringForBuiltIn(BuiltInValue), nullptr,
- SPIRV::StorageClass::Input, nullptr, true, false,
- SPIRV::LinkageType::Import, MIRBuilder, false);
+ SPIRV::StorageClass::Input, nullptr, true, std::nullopt, MIRBuilder,
+ false);
// Load uint value from the global variable.
auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpLoad))
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 61a0bbe..f7cdfcb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -547,9 +547,9 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI,
if (MI.getOpcode() == SPIRV::OpDecorate) {
// If it's got Import linkage.
auto Dec = MI.getOperand(1).getImm();
- if (Dec == static_cast<unsigned>(SPIRV::Decoration::LinkageAttributes)) {
+ if (Dec == SPIRV::Decoration::LinkageAttributes) {
auto Lnk = MI.getOperand(MI.getNumOperands() - 1).getImm();
- if (Lnk == static_cast<unsigned>(SPIRV::LinkageType::Import)) {
+ if (Lnk == SPIRV::LinkageType::Import) {
// Map imported function name to function ID register.
const Function *ImportedFunc =
F->getParent()->getFunction(getStringImm(MI, 2));
@@ -635,7 +635,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
InstrTraces IS;
for (auto F = M.begin(), E = M.end(); F != E; ++F) {
- if ((*F).isDeclaration())
+ if (F->isDeclaration())
continue;
MachineFunction *MF = MMI->getMachineFunction(*F);
assert(MF);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index d8376cd..2d19f6de 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -169,9 +169,7 @@ struct ModuleAnalysisInfo {
MCRegister getFuncReg(const Function *F) {
assert(F && "Function is null");
- auto FuncPtrRegPair = FuncMap.find(F);
- return FuncPtrRegPair == FuncMap.end() ? MCRegister()
- : FuncPtrRegPair->second;
+ return FuncMap.lookup(F);
}
MCRegister getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; }
InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; }
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 1d47c89..4e2cc88 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -1040,4 +1040,19 @@ getFirstValidInstructionInsertPoint(MachineBasicBlock &BB) {
: VarPos;
}
+std::optional<SPIRV::LinkageType::LinkageType>
+getSpirvLinkageTypeFor(const SPIRVSubtarget &ST, const GlobalValue &GV) {
+ if (GV.hasLocalLinkage() || GV.hasHiddenVisibility())
+ return std::nullopt;
+
+ if (GV.isDeclarationForLinker())
+ return SPIRV::LinkageType::Import;
+
+ if (GV.hasLinkOnceODRLinkage() &&
+ ST.canUseExtension(SPIRV::Extension::SPV_KHR_linkonce_odr))
+ return SPIRV::LinkageType::LinkOnceODR;
+
+ return SPIRV::LinkageType::Export;
+}
+
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index 5777a24..99d9d40 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -559,5 +559,8 @@ unsigned getArrayComponentCount(const MachineRegisterInfo *MRI,
const MachineInstr *ResType);
MachineBasicBlock::iterator
getFirstValidInstructionInsertPoint(MachineBasicBlock &BB);
+
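+// Returns the SPIR-V linkage type to use for GV, or std::nullopt when GV has
+// local linkage or hidden visibility and should not receive a
+// LinkageAttributes decoration.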
+std::optional<SPIRV::LinkageType::LinkageType>
+getSpirvLinkageTypeFor(const SPIRVSubtarget &ST, const GlobalValue &GV);
} // namespace llvm
#endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8e08d16..a1fd366 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1164,7 +1164,6 @@ def ProcessorFeatures {
FeatureAVXNECONVERT,
FeatureAVXVNNIINT8,
FeatureAVXVNNIINT16,
- FeatureUSERMSR,
FeatureSHA512,
FeatureSM3,
FeatureEGPR,